From 6859a8402945cf1d74af75a2e1aa4e327a506ab4 Mon Sep 17 00:00:00 2001
From: Alan Mayer
Date: Wed, 26 Mar 2008 16:11:31 -0500
Subject: x86: resize NR_IRQS for large machines

On machines with very large numbers of CPUs, tables that are
dimensioned by NR_IRQS get very large, especially the irq_desc table.
They are also very sparsely used. When the cpu count is > MAX_IO_APICS,
use MAX_IO_APICS to set NR_IRQS, otherwise use NR_CPUS.

Signed-off-by: Alan Mayer
Reviewed-by: Christoph Lameter
Signed-off-by: Ingo Molnar
Signed-off-by: Thomas Gleixner
---
 include/linux/kernel_stat.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index e8ffce898bf9..cf9f40a91c9c 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -1,11 +1,11 @@
 #ifndef _LINUX_KERNEL_STAT_H
 #define _LINUX_KERNEL_STAT_H
 
-#include <asm/irq.h>
 #include <linux/smp.h>
 #include <linux/threads.h>
 #include <linux/percpu.h>
 #include <linux/cpumask.h>
+#include <asm/irq.h>
 #include <asm/cputime.h>
 
 /*
--
cgit v1.2.3
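The NR_IRQS definition itself lives in the x86 irq headers, outside this
include/linux-limited view (which is why the only visible change is the
include reorder: asm/irq.h must now see NR_CPUS and MAX_IO_APICS before
it sizes NR_IRQS). A sketch of the sizing rule the commit message
describes; the NR_VECTORS base and the 32x multiplier are assumptions
modeled on the x86 headers of that era, not part of this diff:

	/* Illustrative only; the real constants differ per subarch. */
	#if NR_CPUS < MAX_IO_APICS
	# define NR_IRQS (NR_VECTORS + (32 * NR_CPUS))
	#else
	# define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS))
	#endif

Either way, irq_desc and the other NR_IRQS-dimensioned tables stop
growing with the raw CPU count on very large systems.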
From 988f7b5789ccf5cfed14c72e28573a49f0cb4809 Mon Sep 17 00:00:00 2001
From: "venkatesh.pallipadi@intel.com"
Date: Tue, 18 Mar 2008 17:00:22 -0700
Subject: x86: PAT export resource_wc in pci sysfs

For the ranges with IORESOURCE_PREFETCH, export a new resource_wc
interface in pci /sysfs along with resource (which is uncached).

Signed-off-by: Venkatesh Pallipadi
Signed-off-by: Suresh Siddha
Acked-by: Jesse Barnes
Signed-off-by: Ingo Molnar
Signed-off-by: Thomas Gleixner
---
 Documentation/filesystems/sysfs-pci.txt |  1 +
 drivers/pci/pci-sysfs.c                 | 84 ++++++++++++++++++++++++---------
 include/linux/pci.h                     |  1 +
 3 files changed, 64 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/sysfs-pci.txt b/Documentation/filesystems/sysfs-pci.txt
index 5daa2aaec2c5..68ef48839c04 100644
--- a/Documentation/filesystems/sysfs-pci.txt
+++ b/Documentation/filesystems/sysfs-pci.txt
@@ -36,6 +36,7 @@ files, each with their own function.
        local_cpus		nearby CPU mask (cpumask, ro)
        resource		PCI resource host addresses (ascii, ro)
        resource0..N		PCI resource N, if present (binary, mmap)
+       resource0_wc..N_wc	PCI WC map resource N, if prefetchable (binary, mmap)
        rom			PCI ROM resource, if present (binary, ro)
        subsystem_device	PCI subsystem device (ascii, ro)
        subsystem_vendor	PCI subsystem vendor (ascii, ro)
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index 271d41cc05ab..9ec7d3977a82 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -489,13 +489,14 @@ pci_mmap_legacy_mem(struct kobject *kobj, struct bin_attribute *attr,
  * @kobj: kobject for mapping
  * @attr: struct bin_attribute for the file being mapped
  * @vma: struct vm_area_struct passed into the mmap
+ * @write_combine: 1 for write_combine mapping
  *
  * Use the regular PCI mapping routines to map a PCI resource into userspace.
  * FIXME: write combining? maybe automatic for prefetchable regions?
  */
 static int
 pci_mmap_resource(struct kobject *kobj, struct bin_attribute *attr,
-		  struct vm_area_struct *vma)
+		  struct vm_area_struct *vma, int write_combine)
 {
 	struct pci_dev *pdev = to_pci_dev(container_of(kobj,
						       struct device, kobj));
@@ -518,7 +519,21 @@ pci_mmap_resource(struct kobject *kobj, struct bin_attribute *attr,
 	vma->vm_pgoff += start >> PAGE_SHIFT;
 	mmap_type = res->flags & IORESOURCE_MEM ?
 		pci_mmap_mem : pci_mmap_io;
-	return pci_mmap_page_range(pdev, vma, mmap_type, 0);
+	return pci_mmap_page_range(pdev, vma, mmap_type, write_combine);
+}
+
+static int
+pci_mmap_resource_uc(struct kobject *kobj, struct bin_attribute *attr,
+		     struct vm_area_struct *vma)
+{
+	return pci_mmap_resource(kobj, attr, vma, 0);
+}
+
+static int
+pci_mmap_resource_wc(struct kobject *kobj, struct bin_attribute *attr,
+		     struct vm_area_struct *vma)
+{
+	return pci_mmap_resource(kobj, attr, vma, 1);
 }
 
 /**
@@ -541,9 +556,46 @@ pci_remove_resource_files(struct pci_dev *pdev)
 			sysfs_remove_bin_file(&pdev->dev.kobj, res_attr);
 			kfree(res_attr);
 		}
+
+		res_attr = pdev->res_attr_wc[i];
+		if (res_attr) {
+			sysfs_remove_bin_file(&pdev->dev.kobj, res_attr);
+			kfree(res_attr);
+		}
 	}
 }
 
+static int pci_create_attr(struct pci_dev *pdev, int num, int write_combine)
+{
+	/* allocate attribute structure, piggyback attribute name */
+	int name_len = write_combine ? 13 : 10;
+	struct bin_attribute *res_attr;
+	int retval;
+
+	res_attr = kzalloc(sizeof(*res_attr) + name_len, GFP_ATOMIC);
+	if (res_attr) {
+		char *res_attr_name = (char *)(res_attr + 1);
+
+		if (write_combine) {
+			pdev->res_attr_wc[num] = res_attr;
+			sprintf(res_attr_name, "resource%d_wc", num);
+			res_attr->mmap = pci_mmap_resource_wc;
+		} else {
+			pdev->res_attr[num] = res_attr;
+			sprintf(res_attr_name, "resource%d", num);
+			res_attr->mmap = pci_mmap_resource_uc;
+		}
+		res_attr->attr.name = res_attr_name;
+		res_attr->attr.mode = S_IRUSR | S_IWUSR;
+		res_attr->size = pci_resource_len(pdev, num);
+		res_attr->private = &pdev->resource[num];
+		retval = sysfs_create_bin_file(&pdev->dev.kobj, res_attr);
+	} else
+		retval = -ENOMEM;
+
+	return retval;
+}
+
 /**
  * pci_create_resource_files - create resource files in sysfs for @dev
  * @dev: dev in question
  *
@@ -557,31 +609,19 @@ static int pci_create_resource_files(struct pci_dev *pdev)
 
 	/* Expose the PCI resources from this device as files */
 	for (i = 0; i < PCI_ROM_RESOURCE; i++) {
-		struct bin_attribute *res_attr;
 
 		/* skip empty resources */
 		if (!pci_resource_len(pdev, i))
 			continue;
 
-		/* allocate attribute structure, piggyback attribute name */
-		res_attr = kzalloc(sizeof(*res_attr) + 10, GFP_ATOMIC);
-		if (res_attr) {
-			char *res_attr_name = (char *)(res_attr + 1);
-
-			pdev->res_attr[i] = res_attr;
-			sprintf(res_attr_name, "resource%d", i);
-			res_attr->attr.name = res_attr_name;
-			res_attr->attr.mode = S_IRUSR | S_IWUSR;
-			res_attr->size = pci_resource_len(pdev, i);
-			res_attr->mmap = pci_mmap_resource;
-			res_attr->private = &pdev->resource[i];
-			retval = sysfs_create_bin_file(&pdev->dev.kobj, res_attr);
-			if (retval) {
-				pci_remove_resource_files(pdev);
-				return retval;
-			}
-		} else {
-			return -ENOMEM;
+		retval = pci_create_attr(pdev, i, 0);
+		/* for prefetchable resources, create a WC mappable file */
+		if (!retval && pdev->resource[i].flags & IORESOURCE_PREFETCH)
+			retval = pci_create_attr(pdev, i, 1);
+
+		if (retval) {
+			pci_remove_resource_files(pdev);
+			return retval;
 		}
 	}
 	return 0;
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 509159bcd4e7..d18b1dd49fab 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -206,6 +206,7 @@ struct pci_dev {
 	struct bin_attribute *rom_attr; /* attribute descriptor for sysfs ROM entry */
 	int rom_attr_enabled;		/* has display of the rom attribute been enabled? */
 	struct bin_attribute *res_attr[DEVICE_COUNT_RESOURCE]; /* sysfs file for resources */
+	struct bin_attribute *res_attr_wc[DEVICE_COUNT_RESOURCE]; /* sysfs file for WC mapping of resources */
 #ifdef CONFIG_PCI_MSI
 	struct list_head msi_list;
 #endif
--
cgit v1.2.3
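From userspace, the new file is mapped just like the existing resourceN
files, only with write-combining semantics. A minimal sketch (the device
address, BAR number, and 4 KiB size are hypothetical; a real consumer
would take the length from the ascii resource file first):

	#include <fcntl.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		/* BAR 0 of a hypothetical device, assumed prefetchable. */
		int fd = open("/sys/bus/pci/devices/0000:01:00.0/resource0_wc",
			      O_RDWR);
		if (fd < 0)
			return 1;

		/* Map the first 4 KiB with write-combining. */
		void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			       MAP_SHARED, fd, 0);
		close(fd);	/* the mapping keeps its own reference */
		if (p == MAP_FAILED)
			return 1;

		/* ... streaming stores to p can now be combined ... */

		munmap(p, 4096);
		return 0;
	}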
From 8b09dee67f484e9b42114b1a1f068e080fd7aa56 Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Mon, 12 May 2008 21:21:05 +0200
Subject: rcupreempt: remove duplicate prototypes

rcu_batches_completed and rcu_batches_completed_bh are both declared
in rcuclassic.h and rcupreempt.h. This patch removes the extra
prototypes for them from rcupdate.h.

rcu_batches_completed_bh is defined as a static inline in the
rcupreempt.h header file. Trying to export this as EXPORT_SYMBOL_GPL
causes linking problems with the powerpc linker. There's no need to
export a static inlined function. Modules must be compiled with the
same type of RCU implementation as the kernel they are for.

Signed-off-by: Steven Rostedt
Signed-off-by: Ingo Molnar
---
 include/linux/rcupdate.h | 2 --
 kernel/rcupreempt.c      | 2 --
 2 files changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index d42dbec06083..ec2fc5b32646 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -224,8 +224,6 @@ extern void call_rcu_bh(struct rcu_head *head,
 /* Exported common interfaces */
 extern void synchronize_rcu(void);
 extern void rcu_barrier(void);
-extern long rcu_batches_completed(void);
-extern long rcu_batches_completed_bh(void);
 
 /* Internal to kernel */
 extern void rcu_init(void);
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index e1cdf196a515..5e02b7740702 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -217,8 +217,6 @@ long rcu_batches_completed(void)
 }
 EXPORT_SYMBOL_GPL(rcu_batches_completed);
 
-EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
-
 void __rcu_read_lock(void)
 {
 	int idx;
--
cgit v1.2.3

From 4446a36ff8c74ac3b32feb009b651048e129c6af Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Mon, 12 May 2008 21:21:05 +0200
Subject: rcu: add call_rcu_sched()

Fourth cut of the patch to provide call_rcu_sched(), which is to
synchronize_sched() as call_rcu() is to synchronize_rcu(). Should be
fine for experimental and -rt use, but not ready for inclusion. With
some luck, I will be able to tell Andrew to come out of hiding on the
next round.

Passes multi-day rcutorture sessions with concurrent CPU hotplugging.

Fixes since the first version include a bug that could result in
indefinite blocking (spotted by Gautham Shenoy), better resiliency
against CPU-hotplug operations, and other minor fixes.

Fixes since the second version include reworking grace-period detection
to avoid deadlocks that could happen when running concurrently with
CPU hotplug, adding Mathieu's fix to avoid the softlockup messages,
as well as Mathieu's fix to allow use earlier in boot.

Fixes since the third version include a wrong-CPU bug spotted by
Andrew, getting rid of the obsolete synchronize_kernel API that somehow
snuck back in, merging spin_unlock() and local_irq_restore() in a few
places, commenting the code that checks for quiescent states based on
interrupting from user-mode execution or the idle loop, removing some
inline attributes, and some code-style changes.

Known/suspected shortcomings:

o	I still do not entirely trust the sleep/wakeup logic. Next step
	will be to use a private snapshot of the CPU online mask in
	rcu_sched_grace_period() -- if the CPU wasn't there at the start
	of the grace period, we don't need to hear from it. And the bit
	about accounting for changes in online CPUs inside of
	rcu_sched_grace_period() is ugly anyway.

o	It might be good for rcu_sched_grace_period() to invoke
	resched_cpu() when a given CPU wasn't responding quickly, but
	resched_cpu() is declared static...

This patch also fixes a long-standing bug in the earlier
preemptable-RCU implementation of synchronize_rcu() that could result
in loss of concurrent external changes to a task's CPU affinity mask.
I still cannot remember who reported this...

Signed-off-by: Paul E. McKenney
Signed-off-by: Mathieu Desnoyers
Signed-off-by: Ingo Molnar
Signed-off-by: Thomas Gleixner
---
 include/linux/rcuclassic.h |   3 +
 include/linux/rcupdate.h   |  22 +++
 include/linux/rcupreempt.h |  42 ++++-
 init/main.c                |   1 +
 kernel/rcupdate.c          |  20 +--
 kernel/rcupreempt.c        | 414 ++++++++++++++++++++++++++++++++++++++++-----
 6 files changed, 434 insertions(+), 68 deletions(-)

(limited to 'include/linux')
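A usage sketch (struct foo, foo_reclaim(), and the unlinking step are
hypothetical): call_rcu_sched() is invoked exactly like call_rcu(), and
the callback runs only after every CPU has passed through a quiescent
state, so all preempt-disabled readers that might still see the old
element are done with it:

	struct foo {
		int data;
		struct rcu_head rcu;
	};

	static void foo_reclaim(struct rcu_head *head)
	{
		struct foo *fp = container_of(head, struct foo, rcu);

		kfree(fp);
	}

	static void foo_remove(struct foo *fp)
	{
		/*
		 * Caller has already unlinked fp from all reader-visible
		 * structures; defer the kfree() past a sched grace period.
		 */
		call_rcu_sched(&fp->rcu, foo_reclaim);
	}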
diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h
index b3aa05baab8a..8c774905dcfe 100644
--- a/include/linux/rcuclassic.h
+++ b/include/linux/rcuclassic.h
@@ -151,7 +151,10 @@ extern struct lockdep_map rcu_lock_map;
 
 #define __synchronize_sched() synchronize_rcu()
 
+#define call_rcu_sched(head, func) call_rcu(head, func)
+
 extern void __rcu_init(void);
+#define rcu_init_sched()	do { } while (0)
 extern void rcu_check_callbacks(int cpu, int user);
 extern void rcu_restart_cpu(int cpu);
 
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index ec2fc5b32646..411969cb5243 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -40,6 +40,7 @@
 #include <linux/cpumask.h>
 #include <linux/seqlock.h>
 #include <linux/lockdep.h>
+#include <linux/completion.h>
 
 /**
  * struct rcu_head - callback structure for use with RCU
@@ -168,6 +169,27 @@ struct rcu_head {
 		(p) = (v); \
 	})
 
+/* Infrastructure to implement the synchronize_() primitives. */
+
+struct rcu_synchronize {
+	struct rcu_head head;
+	struct completion completion;
+};
+
+extern void wakeme_after_rcu(struct rcu_head *head);
+
+#define synchronize_rcu_xxx(name, func) \
+void name(void) \
+{ \
+	struct rcu_synchronize rcu; \
+\
+	init_completion(&rcu.completion); \
+	/* Will wake me after RCU finished. */ \
+	func(&rcu.head, wakeme_after_rcu); \
+	/* Wait for it. */ \
+	wait_for_completion(&rcu.completion); \
+}
+
 /**
  * synchronize_sched - block until all CPUs have exited any non-preemptive
  * kernel code sequences.
diff --git a/include/linux/rcupreempt.h b/include/linux/rcupreempt.h
index 8a05c7e20bc4..f04b64eca636 100644
--- a/include/linux/rcupreempt.h
+++ b/include/linux/rcupreempt.h
@@ -40,10 +40,39 @@
 #include <linux/cpumask.h>
 #include <linux/seqlock.h>
 
-#define rcu_qsctr_inc(cpu)
+struct rcu_dyntick_sched {
+	int dynticks;
+	int dynticks_snap;
+	int sched_qs;
+	int sched_qs_snap;
+	int sched_dynticks_snap;
+};
+
+DECLARE_PER_CPU(struct rcu_dyntick_sched, rcu_dyntick_sched);
+
+static inline void rcu_qsctr_inc(int cpu)
+{
+	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
+
+	rdssp->sched_qs++;
+}
 #define rcu_bh_qsctr_inc(cpu)
 #define call_rcu_bh(head, rcu) call_rcu(head, rcu)
 
+/**
+ * call_rcu_sched - Queue RCU callback for invocation after sched grace period.
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual update function to be invoked after the grace period + * + * The update function will be invoked some time after a full + * synchronize_sched()-style grace period elapses, in other words after + * all currently executing preempt-disabled sections of code (including + * hardirq handlers, NMI handlers, and local_irq_save() blocks) have + * completed. + */ +extern void call_rcu_sched(struct rcu_head *head, + void (*func)(struct rcu_head *head)); + extern void __rcu_read_lock(void) __acquires(RCU); extern void __rcu_read_unlock(void) __releases(RCU); extern int rcu_pending(int cpu); @@ -55,6 +84,7 @@ extern int rcu_needs_cpu(int cpu); extern void __synchronize_sched(void); extern void __rcu_init(void); +extern void rcu_init_sched(void); extern void rcu_check_callbacks(int cpu, int user); extern void rcu_restart_cpu(int cpu); extern long rcu_batches_completed(void); @@ -81,20 +111,20 @@ extern struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu); struct softirq_action; #ifdef CONFIG_NO_HZ -DECLARE_PER_CPU(long, dynticks_progress_counter); +DECLARE_PER_CPU(struct rcu_dyntick_sched, rcu_dyntick_sched); static inline void rcu_enter_nohz(void) { smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ - __get_cpu_var(dynticks_progress_counter)++; - WARN_ON(__get_cpu_var(dynticks_progress_counter) & 0x1); + __get_cpu_var(rcu_dyntick_sched).dynticks++; + WARN_ON(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1); } static inline void rcu_exit_nohz(void) { - __get_cpu_var(dynticks_progress_counter)++; smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ - WARN_ON(!(__get_cpu_var(dynticks_progress_counter) & 0x1)); + __get_cpu_var(rcu_dyntick_sched).dynticks++; + WARN_ON(!(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1)); } #else /* CONFIG_NO_HZ */ diff --git a/init/main.c b/init/main.c index f7fb20021d48..a9cc3e0803de 100644 --- a/init/main.c +++ b/init/main.c @@ -758,6 +758,7 @@ static void __init do_initcalls(void) */ static void __init do_basic_setup(void) { + rcu_init_sched(); /* needed by module_init stage. */ /* drivers will send hotplug events */ init_workqueues(); usermodehelper_init(); diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index c09605f8d16c..a4e329d92883 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -39,18 +39,12 @@ #include #include #include -#include #include #include #include #include #include -struct rcu_synchronize { - struct rcu_head head; - struct completion completion; -}; - static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; static atomic_t rcu_barrier_cpu_count; static DEFINE_MUTEX(rcu_barrier_mutex); @@ -60,7 +54,7 @@ static struct completion rcu_barrier_completion; * Awaken the corresponding synchronize_rcu() instance now that a * grace period has elapsed. */ -static void wakeme_after_rcu(struct rcu_head *head) +void wakeme_after_rcu(struct rcu_head *head) { struct rcu_synchronize *rcu; @@ -77,17 +71,7 @@ static void wakeme_after_rcu(struct rcu_head *head) * sections are delimited by rcu_read_lock() and rcu_read_unlock(), * and may be nested. 
*/ -void synchronize_rcu(void) -{ - struct rcu_synchronize rcu; - - init_completion(&rcu.completion); - /* Will wake me after RCU finished */ - call_rcu(&rcu.head, wakeme_after_rcu); - - /* Wait for it */ - wait_for_completion(&rcu.completion); -} +synchronize_rcu_xxx(synchronize_rcu, call_rcu) EXPORT_SYMBOL_GPL(synchronize_rcu); static void rcu_barrier_callback(struct rcu_head *notused) diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index 5e02b7740702..aaa7976bd85f 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include @@ -87,9 +88,14 @@ struct rcu_data { struct rcu_head **nexttail; struct rcu_head *waitlist[GP_STAGES]; struct rcu_head **waittail[GP_STAGES]; - struct rcu_head *donelist; + struct rcu_head *donelist; /* from waitlist & waitschedlist */ struct rcu_head **donetail; long rcu_flipctr[2]; + struct rcu_head *nextschedlist; + struct rcu_head **nextschedtail; + struct rcu_head *waitschedlist; + struct rcu_head **waitschedtail; + int rcu_sched_sleeping; #ifdef CONFIG_RCU_TRACE struct rcupreempt_trace trace; #endif /* #ifdef CONFIG_RCU_TRACE */ @@ -131,11 +137,24 @@ enum rcu_try_flip_states { rcu_try_flip_waitmb_state, }; +/* + * States for rcu_ctrlblk.rcu_sched_sleep. + */ + +enum rcu_sched_sleep_states { + rcu_sched_not_sleeping, /* Not sleeping, callbacks need GP. */ + rcu_sched_sleep_prep, /* Thinking of sleeping, rechecking. */ + rcu_sched_sleeping, /* Sleeping, awaken if GP needed. */ +}; + struct rcu_ctrlblk { spinlock_t fliplock; /* Protect state-machine transitions. */ long completed; /* Number of last completed batch. */ enum rcu_try_flip_states rcu_try_flip_state; /* The current state of the rcu state machine */ + spinlock_t schedlock; /* Protect rcu_sched sleep state. */ + enum rcu_sched_sleep_states sched_sleep; /* rcu_sched state. */ + wait_queue_head_t sched_wq; /* Place for rcu_sched to sleep. */ }; static DEFINE_PER_CPU(struct rcu_data, rcu_data); @@ -143,8 +162,12 @@ static struct rcu_ctrlblk rcu_ctrlblk = { .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock), .completed = 0, .rcu_try_flip_state = rcu_try_flip_idle_state, + .schedlock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.schedlock), + .sched_sleep = rcu_sched_not_sleeping, + .sched_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rcu_ctrlblk.sched_wq), }; +static struct task_struct *rcu_sched_grace_period_task; #ifdef CONFIG_RCU_TRACE static char *rcu_try_flip_state_names[] = @@ -207,6 +230,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag) */ #define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace)); +#define RCU_SCHED_BATCH_TIME (HZ / 50) + /* * Return the number of RCU batches processed thus far. Useful * for debug and statistics. @@ -411,32 +436,34 @@ static void __rcu_advance_callbacks(struct rcu_data *rdp) } } -#ifdef CONFIG_NO_HZ +DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = { + .dynticks = 1, +}; -DEFINE_PER_CPU(long, dynticks_progress_counter) = 1; -static DEFINE_PER_CPU(long, rcu_dyntick_snapshot); +#ifdef CONFIG_NO_HZ static DEFINE_PER_CPU(int, rcu_update_flag); /** * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI. * * If the CPU was idle with dynamic ticks active, this updates the - * dynticks_progress_counter to let the RCU handling know that the + * rcu_dyntick_sched.dynticks to let the RCU handling know that the * CPU is active. 
*/ void rcu_irq_enter(void) { int cpu = smp_processor_id(); + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); if (per_cpu(rcu_update_flag, cpu)) per_cpu(rcu_update_flag, cpu)++; /* * Only update if we are coming from a stopped ticks mode - * (dynticks_progress_counter is even). + * (rcu_dyntick_sched.dynticks is even). */ if (!in_interrupt() && - (per_cpu(dynticks_progress_counter, cpu) & 0x1) == 0) { + (rdssp->dynticks & 0x1) == 0) { /* * The following might seem like we could have a race * with NMI/SMIs. But this really isn't a problem. @@ -459,12 +486,12 @@ void rcu_irq_enter(void) * RCU read-side critical sections on this CPU would * have already completed. */ - per_cpu(dynticks_progress_counter, cpu)++; + rdssp->dynticks++; /* * The following memory barrier ensures that any * rcu_read_lock() primitives in the irq handler * are seen by other CPUs to follow the above - * increment to dynticks_progress_counter. This is + * increment to rcu_dyntick_sched.dynticks. This is * required in order for other CPUs to correctly * determine when it is safe to advance the RCU * grace-period state machine. @@ -472,7 +499,7 @@ void rcu_irq_enter(void) smp_mb(); /* see above block comment. */ /* * Since we can't determine the dynamic tick mode from - * the dynticks_progress_counter after this routine, + * the rcu_dyntick_sched.dynticks after this routine, * we use a second flag to acknowledge that we came * from an idle state with ticks stopped. */ @@ -480,7 +507,7 @@ void rcu_irq_enter(void) /* * If we take an NMI/SMI now, they will also increment * the rcu_update_flag, and will not update the - * dynticks_progress_counter on exit. That is for + * rcu_dyntick_sched.dynticks on exit. That is for * this IRQ to do. */ } @@ -490,12 +517,13 @@ void rcu_irq_enter(void) * rcu_irq_exit - Called from exiting Hard irq context. * * If the CPU was idle with dynamic ticks active, update the - * dynticks_progress_counter to put let the RCU handling be + * rcu_dyntick_sched.dynticks to put let the RCU handling be * aware that the CPU is going back to idle with no ticks. */ void rcu_irq_exit(void) { int cpu = smp_processor_id(); + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); /* * rcu_update_flag is set if we interrupted the CPU @@ -503,7 +531,7 @@ void rcu_irq_exit(void) * Once this occurs, we keep track of interrupt nesting * because a NMI/SMI could also come in, and we still * only want the IRQ that started the increment of the - * dynticks_progress_counter to be the one that modifies + * rcu_dyntick_sched.dynticks to be the one that modifies * it on exit. */ if (per_cpu(rcu_update_flag, cpu)) { @@ -515,28 +543,29 @@ void rcu_irq_exit(void) /* * If an NMI/SMI happens now we are still - * protected by the dynticks_progress_counter being odd. + * protected by the rcu_dyntick_sched.dynticks being odd. */ /* * The following memory barrier ensures that any * rcu_read_unlock() primitives in the irq handler * are seen by other CPUs to preceed the following - * increment to dynticks_progress_counter. This + * increment to rcu_dyntick_sched.dynticks. This * is required in order for other CPUs to determine * when it is safe to advance the RCU grace-period * state machine. */ smp_mb(); /* see above block comment. 
*/ - per_cpu(dynticks_progress_counter, cpu)++; - WARN_ON(per_cpu(dynticks_progress_counter, cpu) & 0x1); + rdssp->dynticks++; + WARN_ON(rdssp->dynticks & 0x1); } } static void dyntick_save_progress_counter(int cpu) { - per_cpu(rcu_dyntick_snapshot, cpu) = - per_cpu(dynticks_progress_counter, cpu); + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); + + rdssp->dynticks_snap = rdssp->dynticks; } static inline int @@ -544,9 +573,10 @@ rcu_try_flip_waitack_needed(int cpu) { long curr; long snap; + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); - curr = per_cpu(dynticks_progress_counter, cpu); - snap = per_cpu(rcu_dyntick_snapshot, cpu); + curr = rdssp->dynticks; + snap = rdssp->dynticks_snap; smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ /* @@ -580,9 +610,10 @@ rcu_try_flip_waitmb_needed(int cpu) { long curr; long snap; + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); - curr = per_cpu(dynticks_progress_counter, cpu); - snap = per_cpu(rcu_dyntick_snapshot, cpu); + curr = rdssp->dynticks; + snap = rdssp->dynticks_snap; smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ /* @@ -609,14 +640,86 @@ rcu_try_flip_waitmb_needed(int cpu) return 1; } +static void dyntick_save_progress_counter_sched(int cpu) +{ + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); + + rdssp->sched_dynticks_snap = rdssp->dynticks; +} + +static int rcu_qsctr_inc_needed_dyntick(int cpu) +{ + long curr; + long snap; + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); + + curr = rdssp->dynticks; + snap = rdssp->sched_dynticks_snap; + smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ + + /* + * If the CPU remained in dynticks mode for the entire time + * and didn't take any interrupts, NMIs, SMIs, or whatever, + * then it cannot be in the middle of an rcu_read_lock(), so + * the next rcu_read_lock() it executes must use the new value + * of the counter. Therefore, this CPU has been in a quiescent + * state the entire time, and we don't need to wait for it. + */ + + if ((curr == snap) && ((curr & 0x1) == 0)) + return 0; + + /* + * If the CPU passed through or entered a dynticks idle phase with + * no active irq handlers, then, as above, this CPU has already + * passed through a quiescent state. + */ + + if ((curr - snap) > 2 || (snap & 0x1) == 0) + return 0; + + /* We need this CPU to go through a quiescent state. */ + + return 1; +} + #else /* !CONFIG_NO_HZ */ -# define dyntick_save_progress_counter(cpu) do { } while (0) -# define rcu_try_flip_waitack_needed(cpu) (1) -# define rcu_try_flip_waitmb_needed(cpu) (1) +# define dyntick_save_progress_counter(cpu) do { } while (0) +# define rcu_try_flip_waitack_needed(cpu) (1) +# define rcu_try_flip_waitmb_needed(cpu) (1) + +# define dyntick_save_progress_counter_sched(cpu) do { } while (0) +# define rcu_qsctr_inc_needed_dyntick(cpu) (1) #endif /* CONFIG_NO_HZ */ +static void save_qsctr_sched(int cpu) +{ + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); + + rdssp->sched_qs_snap = rdssp->sched_qs; +} + +static inline int rcu_qsctr_inc_needed(int cpu) +{ + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); + + /* + * If there has been a quiescent state, no more need to wait + * on this CPU. + */ + + if (rdssp->sched_qs != rdssp->sched_qs_snap) { + smp_mb(); /* force ordering with cpu entering schedule(). */ + return 0; + } + + /* We need this CPU to go through a quiescent state. 
*/ + + return 1; +} + /* * Get here when RCU is idle. Decide whether we need to * move out of idle state, and return non-zero if so. @@ -819,6 +922,26 @@ void rcu_check_callbacks(int cpu, int user) unsigned long flags; struct rcu_data *rdp = RCU_DATA_CPU(cpu); + /* + * If this CPU took its interrupt from user mode or from the + * idle loop, and this is not a nested interrupt, then + * this CPU has to have exited all prior preept-disable + * sections of code. So increment the counter to note this. + * + * The memory barrier is needed to handle the case where + * writes from a preempt-disable section of code get reordered + * into schedule() by this CPU's write buffer. So the memory + * barrier makes sure that the rcu_qsctr_inc() is seen by other + * CPUs to happen after any such write. + */ + + if (user || + (idle_cpu(cpu) && !in_softirq() && + hardirq_count() <= (1 << HARDIRQ_SHIFT))) { + smp_mb(); /* Guard against aggressive schedule(). */ + rcu_qsctr_inc(cpu); + } + rcu_check_mb(cpu); if (rcu_ctrlblk.completed == rdp->completed) rcu_try_flip(); @@ -869,6 +992,8 @@ void rcu_offline_cpu(int cpu) struct rcu_head *list = NULL; unsigned long flags; struct rcu_data *rdp = RCU_DATA_CPU(cpu); + struct rcu_head *schedlist = NULL; + struct rcu_head **schedtail = &schedlist; struct rcu_head **tail = &list; /* @@ -882,6 +1007,11 @@ void rcu_offline_cpu(int cpu) rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i], list, tail); rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail); + rcu_offline_cpu_enqueue(rdp->waitschedlist, rdp->waitschedtail, + schedlist, schedtail); + rcu_offline_cpu_enqueue(rdp->nextschedlist, rdp->nextschedtail, + schedlist, schedtail); + rdp->rcu_sched_sleeping = 0; spin_unlock_irqrestore(&rdp->lock, flags); rdp->waitlistcount = 0; @@ -916,22 +1046,40 @@ void rcu_offline_cpu(int cpu) * fix. */ - local_irq_save(flags); + local_irq_save(flags); /* disable preempt till we know what lock. */ rdp = RCU_DATA_ME(); spin_lock(&rdp->lock); *rdp->nexttail = list; if (list) rdp->nexttail = tail; + *rdp->nextschedtail = schedlist; + if (schedlist) + rdp->nextschedtail = schedtail; spin_unlock_irqrestore(&rdp->lock, flags); } void __devinit rcu_online_cpu(int cpu) { unsigned long flags; + struct rcu_data *rdp; spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags); cpu_set(cpu, rcu_cpu_online_map); spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); + + /* + * The rcu_sched grace-period processing might have bypassed + * this CPU, given that it was not in the rcu_cpu_online_map + * when the grace-period scan started. This means that the + * grace-period task might sleep. So make sure that if this + * should happen, the first callback posted to this CPU will + * wake up the grace-period task if need be. 
+ */ + + rdp = RCU_DATA_CPU(cpu); + spin_lock_irqsave(&rdp->lock, flags); + rdp->rcu_sched_sleeping = 1; + spin_unlock_irqrestore(&rdp->lock, flags); } #else /* #ifdef CONFIG_HOTPLUG_CPU */ @@ -986,31 +1134,196 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) *rdp->nexttail = head; rdp->nexttail = &head->next; RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp); - spin_unlock(&rdp->lock); - local_irq_restore(flags); + spin_unlock_irqrestore(&rdp->lock, flags); } EXPORT_SYMBOL_GPL(call_rcu); +void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +{ + unsigned long flags; + struct rcu_data *rdp; + int wake_gp = 0; + + head->func = func; + head->next = NULL; + local_irq_save(flags); + rdp = RCU_DATA_ME(); + spin_lock(&rdp->lock); + *rdp->nextschedtail = head; + rdp->nextschedtail = &head->next; + if (rdp->rcu_sched_sleeping) { + + /* Grace-period processing might be sleeping... */ + + rdp->rcu_sched_sleeping = 0; + wake_gp = 1; + } + spin_unlock_irqrestore(&rdp->lock, flags); + if (wake_gp) { + + /* Wake up grace-period processing, unless someone beat us. */ + + spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags); + if (rcu_ctrlblk.sched_sleep != rcu_sched_sleeping) + wake_gp = 0; + rcu_ctrlblk.sched_sleep = rcu_sched_not_sleeping; + spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); + if (wake_gp) + wake_up_interruptible(&rcu_ctrlblk.sched_wq); + } +} +EXPORT_SYMBOL_GPL(call_rcu_sched); + /* * Wait until all currently running preempt_disable() code segments * (including hardware-irq-disable segments) complete. Note that * in -rt this does -not- necessarily result in all currently executing * interrupt -handlers- having completed. */ -void __synchronize_sched(void) +synchronize_rcu_xxx(__synchronize_sched, call_rcu_sched) +EXPORT_SYMBOL_GPL(__synchronize_sched); + +/* + * kthread function that manages call_rcu_sched grace periods. + */ +static int rcu_sched_grace_period(void *arg) { - cpumask_t oldmask; + int couldsleep; /* might sleep after current pass. */ + int couldsleepnext = 0; /* might sleep after next pass. */ int cpu; + unsigned long flags; + struct rcu_data *rdp; + int ret; - if (sched_getaffinity(0, &oldmask) < 0) - oldmask = cpu_possible_map; - for_each_online_cpu(cpu) { - sched_setaffinity(0, &cpumask_of_cpu(cpu)); - schedule(); - } - sched_setaffinity(0, &oldmask); + /* + * Each pass through the following loop handles one + * rcu_sched grace period cycle. + */ + do { + /* Save each CPU's current state. */ + + for_each_online_cpu(cpu) { + dyntick_save_progress_counter_sched(cpu); + save_qsctr_sched(cpu); + } + + /* + * Sleep for about an RCU grace-period's worth to + * allow better batching and to consume less CPU. + */ + schedule_timeout_interruptible(RCU_SCHED_BATCH_TIME); + + /* + * If there was nothing to do last time, prepare to + * sleep at the end of the current grace period cycle. + */ + couldsleep = couldsleepnext; + couldsleepnext = 1; + if (couldsleep) { + spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags); + rcu_ctrlblk.sched_sleep = rcu_sched_sleep_prep; + spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); + } + + /* + * Wait on each CPU in turn to have either visited + * a quiescent state or been in dynticks-idle mode. + */ + for_each_online_cpu(cpu) { + while (rcu_qsctr_inc_needed(cpu) && + rcu_qsctr_inc_needed_dyntick(cpu)) { + /* resched_cpu(cpu); @@@ */ + schedule_timeout_interruptible(1); + } + } + + /* Advance callbacks for each CPU. 
*/ + + for_each_online_cpu(cpu) { + + rdp = RCU_DATA_CPU(cpu); + spin_lock_irqsave(&rdp->lock, flags); + + /* + * We are running on this CPU irq-disabled, so no + * CPU can go offline until we re-enable irqs. + * The current CPU might have already gone + * offline (between the for_each_offline_cpu and + * the spin_lock_irqsave), but in that case all its + * callback lists will be empty, so no harm done. + * + * Advance the callbacks! We share normal RCU's + * donelist, since callbacks are invoked the + * same way in either case. + */ + if (rdp->waitschedlist != NULL) { + *rdp->donetail = rdp->waitschedlist; + rdp->donetail = rdp->waitschedtail; + + /* + * Next rcu_check_callbacks() will + * do the required raise_softirq(). + */ + } + if (rdp->nextschedlist != NULL) { + rdp->waitschedlist = rdp->nextschedlist; + rdp->waitschedtail = rdp->nextschedtail; + couldsleep = 0; + couldsleepnext = 0; + } else { + rdp->waitschedlist = NULL; + rdp->waitschedtail = &rdp->waitschedlist; + } + rdp->nextschedlist = NULL; + rdp->nextschedtail = &rdp->nextschedlist; + + /* Mark sleep intention. */ + + rdp->rcu_sched_sleeping = couldsleep; + + spin_unlock_irqrestore(&rdp->lock, flags); + } + + /* If we saw callbacks on the last scan, go deal with them. */ + + if (!couldsleep) + continue; + + /* Attempt to block... */ + + spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags); + if (rcu_ctrlblk.sched_sleep != rcu_sched_sleep_prep) { + + /* + * Someone posted a callback after we scanned. + * Go take care of it. + */ + spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); + couldsleepnext = 0; + continue; + } + + /* Block until the next person posts a callback. */ + + rcu_ctrlblk.sched_sleep = rcu_sched_sleeping; + spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); + ret = 0; + __wait_event_interruptible(rcu_ctrlblk.sched_wq, + rcu_ctrlblk.sched_sleep != rcu_sched_sleeping, + ret); + + /* + * Signals would prevent us from sleeping, and we cannot + * do much with them in any case. So flush them. + */ + if (ret) + flush_signals(current); + couldsleepnext = 0; + + } while (!kthread_should_stop()); + + return (0); } -EXPORT_SYMBOL_GPL(__synchronize_sched); /* * Check to see if any future RCU-related work will need to be done @@ -1027,7 +1340,9 @@ int rcu_needs_cpu(int cpu) return (rdp->donelist != NULL || !!rdp->waitlistcount || - rdp->nextlist != NULL); + rdp->nextlist != NULL || + rdp->nextschedlist != NULL || + rdp->waitschedlist != NULL); } int rcu_pending(int cpu) @@ -1038,7 +1353,9 @@ int rcu_pending(int cpu) if (rdp->donelist != NULL || !!rdp->waitlistcount || - rdp->nextlist != NULL) + rdp->nextlist != NULL || + rdp->nextschedlist != NULL || + rdp->waitschedlist != NULL) return 1; /* The RCU core needs an acknowledgement from this CPU. */ @@ -1105,6 +1422,11 @@ void __init __rcu_init(void) rdp->donetail = &rdp->donelist; rdp->rcu_flipctr[0] = 0; rdp->rcu_flipctr[1] = 0; + rdp->nextschedlist = NULL; + rdp->nextschedtail = &rdp->nextschedlist; + rdp->waitschedlist = NULL; + rdp->waitschedtail = &rdp->waitschedlist; + rdp->rcu_sched_sleeping = 0; } register_cpu_notifier(&rcu_nb); @@ -1127,11 +1449,15 @@ void __init __rcu_init(void) } /* - * Deprecated, use synchronize_rcu() or synchronize_sched() instead. + * Late-boot-time RCU initialization that must wait until after scheduler + * has been initialized. 
*/ -void synchronize_kernel(void) +void __init rcu_init_sched(void) { - synchronize_rcu(); + rcu_sched_grace_period_task = kthread_run(rcu_sched_grace_period, + NULL, + "rcu_sched_grace_period"); + WARN_ON(IS_ERR(rcu_sched_grace_period_task)); } #ifdef CONFIG_RCU_TRACE -- cgit v1.2.3 From 70f12f848d3e981479b4f6f751e73c14f7c13e5b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 12 May 2008 21:21:05 +0200 Subject: rcu: add rcu_barrier_sched() and rcu_barrier_bh() Add rcu_barrier_sched() and rcu_barrier_bh(). With these in place, rcutorture no longer gives the occasional oops when repeatedly starting and stopping torturing rcu_bh. Also adds the API needed to flush out pre-existing call_rcu_sched() callbacks. Signed-off-by: Paul E. McKenney Signed-off-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/rcupdate.h | 2 ++ kernel/rcupdate.c | 55 ++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 51 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 411969cb5243..e8b4039cfb2f 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -246,6 +246,8 @@ extern void call_rcu_bh(struct rcu_head *head, /* Exported common interfaces */ extern void synchronize_rcu(void); extern void rcu_barrier(void); +extern void rcu_barrier_bh(void); +extern void rcu_barrier_sched(void); /* Internal to kernel */ extern void rcu_init(void); diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index a4e329d92883..4a74b8d48d90 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -45,6 +45,12 @@ #include #include +enum rcu_barrier { + RCU_BARRIER_STD, + RCU_BARRIER_BH, + RCU_BARRIER_SCHED, +}; + static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; static atomic_t rcu_barrier_cpu_count; static DEFINE_MUTEX(rcu_barrier_mutex); @@ -83,19 +89,30 @@ static void rcu_barrier_callback(struct rcu_head *notused) /* * Called with preemption disabled, and from cross-cpu IRQ context. */ -static void rcu_barrier_func(void *notused) +static void rcu_barrier_func(void *type) { int cpu = smp_processor_id(); struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); atomic_inc(&rcu_barrier_cpu_count); - call_rcu(head, rcu_barrier_callback); + switch ((enum rcu_barrier)type) { + case RCU_BARRIER_STD: + call_rcu(head, rcu_barrier_callback); + break; + case RCU_BARRIER_BH: + call_rcu_bh(head, rcu_barrier_callback); + break; + case RCU_BARRIER_SCHED: + call_rcu_sched(head, rcu_barrier_callback); + break; + } } -/** - * rcu_barrier - Wait until all the in-flight RCUs are complete. +/* + * Orchestrate the specified type of RCU barrier, waiting for all + * RCU callbacks of the specified type to complete. */ -void rcu_barrier(void) +static void _rcu_barrier(enum rcu_barrier type) { BUG_ON(in_interrupt()); /* Take cpucontrol mutex to protect against CPU hotplug */ @@ -111,13 +128,39 @@ void rcu_barrier(void) * until all the callbacks are queued. */ rcu_read_lock(); - on_each_cpu(rcu_barrier_func, NULL, 0, 1); + on_each_cpu(rcu_barrier_func, (void *)type, 0, 1); rcu_read_unlock(); wait_for_completion(&rcu_barrier_completion); mutex_unlock(&rcu_barrier_mutex); } + +/** + * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. + */ +void rcu_barrier(void) +{ + _rcu_barrier(RCU_BARRIER_STD); +} EXPORT_SYMBOL_GPL(rcu_barrier); +/** + * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete. 
+ */ +void rcu_barrier_bh(void) +{ + _rcu_barrier(RCU_BARRIER_BH); +} +EXPORT_SYMBOL_GPL(rcu_barrier_bh); + +/** + * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks. + */ +void rcu_barrier_sched(void) +{ + _rcu_barrier(RCU_BARRIER_SCHED); +} +EXPORT_SYMBOL_GPL(rcu_barrier_sched); + void __init rcu_init(void) { __rcu_init(); -- cgit v1.2.3 From 82524746c27fa418c250a56dd7606b9d3fc79826 Mon Sep 17 00:00:00 2001 From: Franck Bui-Huu Date: Mon, 12 May 2008 21:21:05 +0200 Subject: rcu: split list.h and move rcu-protected lists into rculist.h Move rcu-protected lists from list.h into a new header file rculist.h. This is done because list are a very used primitive structure all over the kernel and it's currently impossible to include other header files in this list.h without creating some circular dependencies. For example, list.h implements rcu-protected list and uses rcu_dereference() without including rcupdate.h. It actually compiles because users of rcu_dereference() are macros. Others RCU functions could be used too but aren't probably because of this. Therefore this patch creates rculist.h which includes rcupdates without to many changes/troubles. Signed-off-by: Franck Bui-Huu Acked-by: Paul E. McKenney Acked-by: Josh Triplett Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- arch/ia64/sn/kernel/irq.c | 1 + crypto/async_tx/async_tx.c | 1 + drivers/infiniband/hw/ipath/ipath_verbs.c | 1 + drivers/infiniband/hw/ipath/ipath_verbs_mcast.c | 3 +- drivers/net/macvlan.c | 2 +- include/linux/dcache.h | 1 + include/linux/list.h | 367 ---------------------- include/linux/rculist.h | 396 ++++++++++++++++++++++++ kernel/pid.c | 1 + lib/textsearch.c | 1 + net/802/psnap.c | 1 + net/8021q/vlan.c | 1 + net/bridge/br_fdb.c | 1 + net/bridge/br_stp.c | 1 + net/netlabel/netlabel_domainhash.c | 3 +- 15 files changed, 409 insertions(+), 372 deletions(-) create mode 100644 include/linux/rculist.h (limited to 'include/linux') diff --git a/arch/ia64/sn/kernel/irq.c b/arch/ia64/sn/kernel/irq.c index 53351c3cd7b1..96c31b4180c3 100644 --- a/arch/ia64/sn/kernel/irq.c +++ b/arch/ia64/sn/kernel/irq.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/crypto/async_tx/async_tx.c b/crypto/async_tx/async_tx.c index c6e772fc5ccd..095c798d3170 100644 --- a/crypto/async_tx/async_tx.c +++ b/crypto/async_tx/async_tx.c @@ -23,6 +23,7 @@ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. * */ +#include #include #include diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.c b/drivers/infiniband/hw/ipath/ipath_verbs.c index e0ec540042bf..5d830d87ebca 100644 --- a/drivers/infiniband/hw/ipath/ipath_verbs.c +++ b/drivers/infiniband/hw/ipath/ipath_verbs.c @@ -35,6 +35,7 @@ #include #include #include +#include #include "ipath_kernel.h" #include "ipath_verbs.h" diff --git a/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c b/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c index 9e5abf9c309d..d73e32232879 100644 --- a/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c +++ b/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c @@ -31,8 +31,7 @@ * SOFTWARE. 
*/ -#include -#include +#include #include "ipath_verbs.h" diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index c36a03ae9bfb..860d75d81f82 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 2a6639407c80..1f5cebf10a23 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -3,6 +3,7 @@ #include #include +#include #include #include #include diff --git a/include/linux/list.h b/include/linux/list.h index 08cf4f651889..139ec41d9c2e 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -84,65 +84,6 @@ static inline void list_add_tail(struct list_head *new, struct list_head *head) __list_add(new, head->prev, head); } -/* - * Insert a new entry between two known consecutive entries. - * - * This is only for internal list manipulation where we know - * the prev/next entries already! - */ -static inline void __list_add_rcu(struct list_head * new, - struct list_head * prev, struct list_head * next) -{ - new->next = next; - new->prev = prev; - smp_wmb(); - next->prev = new; - prev->next = new; -} - -/** - * list_add_rcu - add a new entry to rcu-protected list - * @new: new entry to be added - * @head: list head to add it after - * - * Insert a new entry after the specified head. - * This is good for implementing stacks. - * - * The caller must take whatever precautions are necessary - * (such as holding appropriate locks) to avoid racing - * with another list-mutation primitive, such as list_add_rcu() - * or list_del_rcu(), running on this same list. - * However, it is perfectly legal to run concurrently with - * the _rcu list-traversal primitives, such as - * list_for_each_entry_rcu(). - */ -static inline void list_add_rcu(struct list_head *new, struct list_head *head) -{ - __list_add_rcu(new, head, head->next); -} - -/** - * list_add_tail_rcu - add a new entry to rcu-protected list - * @new: new entry to be added - * @head: list head to add it before - * - * Insert a new entry before the specified head. - * This is useful for implementing queues. - * - * The caller must take whatever precautions are necessary - * (such as holding appropriate locks) to avoid racing - * with another list-mutation primitive, such as list_add_tail_rcu() - * or list_del_rcu(), running on this same list. - * However, it is perfectly legal to run concurrently with - * the _rcu list-traversal primitives, such as - * list_for_each_entry_rcu(). - */ -static inline void list_add_tail_rcu(struct list_head *new, - struct list_head *head) -{ - __list_add_rcu(new, head->prev, head); -} - /* * Delete a list entry by making the prev/next entries * point to each other. @@ -173,36 +114,6 @@ static inline void list_del(struct list_head *entry) extern void list_del(struct list_head *entry); #endif -/** - * list_del_rcu - deletes entry from list without re-initialization - * @entry: the element to delete from the list. - * - * Note: list_empty() on entry does not return true after this, - * the entry is in an undefined state. It is useful for RCU based - * lockfree traversal. - * - * In particular, it means that we can not poison the forward - * pointers that may still be used for walking the list. - * - * The caller must take whatever precautions are necessary - * (such as holding appropriate locks) to avoid racing - * with another list-mutation primitive, such as list_del_rcu() - * or list_add_rcu(), running on this same list. 
- * However, it is perfectly legal to run concurrently with - * the _rcu list-traversal primitives, such as - * list_for_each_entry_rcu(). - * - * Note that the caller is not permitted to immediately free - * the newly deleted entry. Instead, either synchronize_rcu() - * or call_rcu() must be used to defer freeing until an RCU - * grace period has elapsed. - */ -static inline void list_del_rcu(struct list_head *entry) -{ - __list_del(entry->prev, entry->next); - entry->prev = LIST_POISON2; -} - /** * list_replace - replace old entry by new one * @old : the element to be replaced @@ -226,25 +137,6 @@ static inline void list_replace_init(struct list_head *old, INIT_LIST_HEAD(old); } -/** - * list_replace_rcu - replace old entry by new one - * @old : the element to be replaced - * @new : the new element to insert - * - * The @old entry will be replaced with the @new entry atomically. - * Note: @old should not be empty. - */ -static inline void list_replace_rcu(struct list_head *old, - struct list_head *new) -{ - new->next = old->next; - new->prev = old->prev; - smp_wmb(); - new->next->prev = new; - new->prev->next = new; - old->prev = LIST_POISON2; -} - /** * list_del_init - deletes entry from list and reinitialize it. * @entry: the element to delete from the list. @@ -368,62 +260,6 @@ static inline void list_splice_init(struct list_head *list, } } -/** - * list_splice_init_rcu - splice an RCU-protected list into an existing list. - * @list: the RCU-protected list to splice - * @head: the place in the list to splice the first list into - * @sync: function to sync: synchronize_rcu(), synchronize_sched(), ... - * - * @head can be RCU-read traversed concurrently with this function. - * - * Note that this function blocks. - * - * Important note: the caller must take whatever action is necessary to - * prevent any other updates to @head. In principle, it is possible - * to modify the list as soon as sync() begins execution. - * If this sort of thing becomes necessary, an alternative version - * based on call_rcu() could be created. But only if -really- - * needed -- there is no shortage of RCU API members. - */ -static inline void list_splice_init_rcu(struct list_head *list, - struct list_head *head, - void (*sync)(void)) -{ - struct list_head *first = list->next; - struct list_head *last = list->prev; - struct list_head *at = head->next; - - if (list_empty(head)) - return; - - /* "first" and "last" tracking list, so initialize it. */ - - INIT_LIST_HEAD(list); - - /* - * At this point, the list body still points to the source list. - * Wait for any readers to finish using the list before splicing - * the list body into the new list. Any new readers will see - * an empty list. - */ - - sync(); - - /* - * Readers are finished with the source list, so perform splice. - * The order is important if the new list is global and accessible - * to concurrent RCU readers. Note that RCU readers are not - * permitted to traverse the prev pointers without excluding - * this function. - */ - - last->next = at; - smp_wmb(); - head->next = first; - first->prev = head; - at->prev = last; -} - /** * list_entry - get the struct for this entry * @ptr: the &struct list_head pointer. @@ -629,57 +465,6 @@ static inline void list_splice_init_rcu(struct list_head *list, &pos->member != (head); \ pos = n, n = list_entry(n->member.prev, typeof(*n), member)) -/** - * list_for_each_rcu - iterate over an rcu-protected list - * @pos: the &struct list_head to use as a loop cursor. - * @head: the head for your list. 
- * - * This list-traversal primitive may safely run concurrently with - * the _rcu list-mutation primitives such as list_add_rcu() - * as long as the traversal is guarded by rcu_read_lock(). - */ -#define list_for_each_rcu(pos, head) \ - for (pos = rcu_dereference((head)->next); \ - prefetch(pos->next), pos != (head); \ - pos = rcu_dereference(pos->next)) - -#define __list_for_each_rcu(pos, head) \ - for (pos = rcu_dereference((head)->next); \ - pos != (head); \ - pos = rcu_dereference(pos->next)) - -/** - * list_for_each_entry_rcu - iterate over rcu list of given type - * @pos: the type * to use as a loop cursor. - * @head: the head for your list. - * @member: the name of the list_struct within the struct. - * - * This list-traversal primitive may safely run concurrently with - * the _rcu list-mutation primitives such as list_add_rcu() - * as long as the traversal is guarded by rcu_read_lock(). - */ -#define list_for_each_entry_rcu(pos, head, member) \ - for (pos = list_entry(rcu_dereference((head)->next), typeof(*pos), member); \ - prefetch(pos->member.next), &pos->member != (head); \ - pos = list_entry(rcu_dereference(pos->member.next), typeof(*pos), member)) - - -/** - * list_for_each_continue_rcu - * @pos: the &struct list_head to use as a loop cursor. - * @head: the head for your list. - * - * Iterate over an rcu-protected list, continuing after current point. - * - * This list-traversal primitive may safely run concurrently with - * the _rcu list-mutation primitives such as list_add_rcu() - * as long as the traversal is guarded by rcu_read_lock(). - */ -#define list_for_each_continue_rcu(pos, head) \ - for ((pos) = rcu_dereference((pos)->next); \ - prefetch((pos)->next), (pos) != (head); \ - (pos) = rcu_dereference((pos)->next)) - /* * Double linked lists with a single pointer list head. * Mostly useful for hash tables where the two pointer list head is @@ -730,31 +515,6 @@ static inline void hlist_del(struct hlist_node *n) n->pprev = LIST_POISON2; } -/** - * hlist_del_rcu - deletes entry from hash list without re-initialization - * @n: the element to delete from the hash list. - * - * Note: list_unhashed() on entry does not return true after this, - * the entry is in an undefined state. It is useful for RCU based - * lockfree traversal. - * - * In particular, it means that we can not poison the forward - * pointers that may still be used for walking the hash list. - * - * The caller must take whatever precautions are necessary - * (such as holding appropriate locks) to avoid racing - * with another list-mutation primitive, such as hlist_add_head_rcu() - * or hlist_del_rcu(), running on this same list. - * However, it is perfectly legal to run concurrently with - * the _rcu list-traversal primitives, such as - * hlist_for_each_entry(). - */ -static inline void hlist_del_rcu(struct hlist_node *n) -{ - __hlist_del(n); - n->pprev = LIST_POISON2; -} - static inline void hlist_del_init(struct hlist_node *n) { if (!hlist_unhashed(n)) { @@ -763,27 +523,6 @@ static inline void hlist_del_init(struct hlist_node *n) } } -/** - * hlist_replace_rcu - replace old entry by new one - * @old : the element to be replaced - * @new : the new element to insert - * - * The @old entry will be replaced with the @new entry atomically. 
- */ -static inline void hlist_replace_rcu(struct hlist_node *old, - struct hlist_node *new) -{ - struct hlist_node *next = old->next; - - new->next = next; - new->pprev = old->pprev; - smp_wmb(); - if (next) - new->next->pprev = &new->next; - *new->pprev = new; - old->pprev = LIST_POISON2; -} - static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h) { struct hlist_node *first = h->first; @@ -794,38 +533,6 @@ static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h) n->pprev = &h->first; } - -/** - * hlist_add_head_rcu - * @n: the element to add to the hash list. - * @h: the list to add to. - * - * Description: - * Adds the specified element to the specified hlist, - * while permitting racing traversals. - * - * The caller must take whatever precautions are necessary - * (such as holding appropriate locks) to avoid racing - * with another list-mutation primitive, such as hlist_add_head_rcu() - * or hlist_del_rcu(), running on this same list. - * However, it is perfectly legal to run concurrently with - * the _rcu list-traversal primitives, such as - * hlist_for_each_entry_rcu(), used to prevent memory-consistency - * problems on Alpha CPUs. Regardless of the type of CPU, the - * list-traversal primitive must be guarded by rcu_read_lock(). - */ -static inline void hlist_add_head_rcu(struct hlist_node *n, - struct hlist_head *h) -{ - struct hlist_node *first = h->first; - n->next = first; - n->pprev = &h->first; - smp_wmb(); - if (first) - first->pprev = &n->next; - h->first = n; -} - /* next must be != NULL */ static inline void hlist_add_before(struct hlist_node *n, struct hlist_node *next) @@ -847,63 +554,6 @@ static inline void hlist_add_after(struct hlist_node *n, next->next->pprev = &next->next; } -/** - * hlist_add_before_rcu - * @n: the new element to add to the hash list. - * @next: the existing element to add the new element before. - * - * Description: - * Adds the specified element to the specified hlist - * before the specified node while permitting racing traversals. - * - * The caller must take whatever precautions are necessary - * (such as holding appropriate locks) to avoid racing - * with another list-mutation primitive, such as hlist_add_head_rcu() - * or hlist_del_rcu(), running on this same list. - * However, it is perfectly legal to run concurrently with - * the _rcu list-traversal primitives, such as - * hlist_for_each_entry_rcu(), used to prevent memory-consistency - * problems on Alpha CPUs. - */ -static inline void hlist_add_before_rcu(struct hlist_node *n, - struct hlist_node *next) -{ - n->pprev = next->pprev; - n->next = next; - smp_wmb(); - next->pprev = &n->next; - *(n->pprev) = n; -} - -/** - * hlist_add_after_rcu - * @prev: the existing element to add the new element after. - * @n: the new element to add to the hash list. - * - * Description: - * Adds the specified element to the specified hlist - * after the specified node while permitting racing traversals. - * - * The caller must take whatever precautions are necessary - * (such as holding appropriate locks) to avoid racing - * with another list-mutation primitive, such as hlist_add_head_rcu() - * or hlist_del_rcu(), running on this same list. - * However, it is perfectly legal to run concurrently with - * the _rcu list-traversal primitives, such as - * hlist_for_each_entry_rcu(), used to prevent memory-consistency - * problems on Alpha CPUs. 
- */ -static inline void hlist_add_after_rcu(struct hlist_node *prev, - struct hlist_node *n) -{ - n->next = prev->next; - n->pprev = &prev->next; - smp_wmb(); - prev->next = n; - if (n->next) - n->next->pprev = &n->next; -} - #define hlist_entry(ptr, type, member) container_of(ptr,type,member) #define hlist_for_each(pos, head) \ @@ -964,21 +614,4 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev, ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ pos = n) -/** - * hlist_for_each_entry_rcu - iterate over rcu list of given type - * @tpos: the type * to use as a loop cursor. - * @pos: the &struct hlist_node to use as a loop cursor. - * @head: the head for your list. - * @member: the name of the hlist_node within the struct. - * - * This list-traversal primitive may safely run concurrently with - * the _rcu list-mutation primitives such as hlist_add_head_rcu() - * as long as the traversal is guarded by rcu_read_lock(). - */ -#define hlist_for_each_entry_rcu(tpos, pos, head, member) \ - for (pos = rcu_dereference((head)->first); \ - pos && ({ prefetch(pos->next); 1;}) && \ - ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ - pos = rcu_dereference(pos->next)) - #endif diff --git a/include/linux/rculist.h b/include/linux/rculist.h new file mode 100644 index 000000000000..aa9b3eb15683 --- /dev/null +++ b/include/linux/rculist.h @@ -0,0 +1,396 @@ +#ifndef _LINUX_RCULIST_H +#define _LINUX_RCULIST_H + +#ifdef __KERNEL__ + +/* + * RCU-protected list version + */ +#include + +/* + * Insert a new entry between two known consecutive entries. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static inline void __list_add_rcu(struct list_head *new, + struct list_head *prev, struct list_head *next) +{ + new->next = next; + new->prev = prev; + smp_wmb(); + next->prev = new; + prev->next = new; +} + +/** + * list_add_rcu - add a new entry to rcu-protected list + * @new: new entry to be added + * @head: list head to add it after + * + * Insert a new entry after the specified head. + * This is good for implementing stacks. + * + * The caller must take whatever precautions are necessary + * (such as holding appropriate locks) to avoid racing + * with another list-mutation primitive, such as list_add_rcu() + * or list_del_rcu(), running on this same list. + * However, it is perfectly legal to run concurrently with + * the _rcu list-traversal primitives, such as + * list_for_each_entry_rcu(). + */ +static inline void list_add_rcu(struct list_head *new, struct list_head *head) +{ + __list_add_rcu(new, head, head->next); +} + +/** + * list_add_tail_rcu - add a new entry to rcu-protected list + * @new: new entry to be added + * @head: list head to add it before + * + * Insert a new entry before the specified head. + * This is useful for implementing queues. + * + * The caller must take whatever precautions are necessary + * (such as holding appropriate locks) to avoid racing + * with another list-mutation primitive, such as list_add_tail_rcu() + * or list_del_rcu(), running on this same list. + * However, it is perfectly legal to run concurrently with + * the _rcu list-traversal primitives, such as + * list_for_each_entry_rcu(). + */ +static inline void list_add_tail_rcu(struct list_head *new, + struct list_head *head) +{ + __list_add_rcu(new, head->prev, head); +} + +/** + * list_del_rcu - deletes entry from list without re-initialization + * @entry: the element to delete from the list. 
+ *
+ * Note: list_empty() on entry does not return true after this,
+ * the entry is in an undefined state. It is useful for RCU-based
+ * lock-free traversal.
+ *
+ * In particular, it means that we can not poison the forward
+ * pointers that may still be used for walking the list.
+ *
+ * The caller must take whatever precautions are necessary
+ * (such as holding appropriate locks) to avoid racing
+ * with another list-mutation primitive, such as list_del_rcu()
+ * or list_add_rcu(), running on this same list.
+ * However, it is perfectly legal to run concurrently with
+ * the _rcu list-traversal primitives, such as
+ * list_for_each_entry_rcu().
+ *
+ * Note that the caller is not permitted to immediately free
+ * the newly deleted entry. Instead, either synchronize_rcu()
+ * or call_rcu() must be used to defer freeing until an RCU
+ * grace period has elapsed.
+ */
+static inline void list_del_rcu(struct list_head *entry)
+{
+	__list_del(entry->prev, entry->next);
+	entry->prev = LIST_POISON2;
+}
+
+/**
+ * list_replace_rcu - replace old entry by new one
+ * @old : the element to be replaced
+ * @new : the new element to insert
+ *
+ * The @old entry will be replaced with the @new entry atomically.
+ * Note: @old should not be empty.
+ */
+static inline void list_replace_rcu(struct list_head *old,
+				struct list_head *new)
+{
+	new->next = old->next;
+	new->prev = old->prev;
+	smp_wmb();
+	new->next->prev = new;
+	new->prev->next = new;
+	old->prev = LIST_POISON2;
+}
+
+/**
+ * list_splice_init_rcu - splice an RCU-protected list into an existing list.
+ * @list:	the RCU-protected list to splice
+ * @head:	the place in the list to splice the first list into
+ * @sync:	function to sync: synchronize_rcu(), synchronize_sched(), ...
+ *
+ * @head can be RCU-read traversed concurrently with this function.
+ *
+ * Note that this function blocks.
+ *
+ * Important note: the caller must take whatever action is necessary to
+ *	prevent any other updates to @head. In principle, it is possible
+ *	to modify the list as soon as sync() begins execution.
+ *	If this sort of thing becomes necessary, an alternative version
+ *	based on call_rcu() could be created. But only if -really-
+ *	needed -- there is no shortage of RCU API members.
+ */
+static inline void list_splice_init_rcu(struct list_head *list,
+					struct list_head *head,
+					void (*sync)(void))
+{
+	struct list_head *first = list->next;
+	struct list_head *last = list->prev;
+	struct list_head *at = head->next;
+
+	if (list_empty(list))
+		return;
+
+	/* "first" and "last" still track the source list, so reinitialize it. */
+
+	INIT_LIST_HEAD(list);
+
+	/*
+	 * At this point, the list body still points to the source list.
+	 * Wait for any readers to finish using the list before splicing
+	 * the list body into the new list. Any new readers will see
+	 * an empty list.
+	 */
+
+	sync();
+
+	/*
+	 * Readers are finished with the source list, so perform splice.
+	 * The order is important if the new list is global and accessible
+	 * to concurrent RCU readers. Note that RCU readers are not
+	 * permitted to traverse the prev pointers without excluding
+	 * this function.
+	 */
+
+	last->next = at;
+	smp_wmb();
+	head->next = first;
+	first->prev = head;
+	at->prev = last;
+}
+
+/**
+ * list_for_each_rcu - iterate over an rcu-protected list
+ * @pos:	the &struct list_head to use as a loop cursor.
+ * @head:	the head for your list.
+ * + * This list-traversal primitive may safely run concurrently with + * the _rcu list-mutation primitives such as list_add_rcu() + * as long as the traversal is guarded by rcu_read_lock(). + */ +#define list_for_each_rcu(pos, head) \ + for (pos = (head)->next; \ + prefetch(rcu_dereference(pos)->next), pos != (head); \ + pos = pos->next) + +#define __list_for_each_rcu(pos, head) \ + for (pos = (head)->next; \ + rcu_dereference(pos) != (head); \ + pos = pos->next) + +/** + * list_for_each_safe_rcu + * @pos: the &struct list_head to use as a loop cursor. + * @n: another &struct list_head to use as temporary storage + * @head: the head for your list. + * + * Iterate over an rcu-protected list, safe against removal of list entry. + * + * This list-traversal primitive may safely run concurrently with + * the _rcu list-mutation primitives such as list_add_rcu() + * as long as the traversal is guarded by rcu_read_lock(). + */ +#define list_for_each_safe_rcu(pos, n, head) \ + for (pos = (head)->next; \ + n = rcu_dereference(pos)->next, pos != (head); \ + pos = n) + +/** + * list_for_each_entry_rcu - iterate over rcu list of given type + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + * + * This list-traversal primitive may safely run concurrently with + * the _rcu list-mutation primitives such as list_add_rcu() + * as long as the traversal is guarded by rcu_read_lock(). + */ +#define list_for_each_entry_rcu(pos, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member); \ + prefetch(rcu_dereference(pos)->member.next), \ + &pos->member != (head); \ + pos = list_entry(pos->member.next, typeof(*pos), member)) + + +/** + * list_for_each_continue_rcu + * @pos: the &struct list_head to use as a loop cursor. + * @head: the head for your list. + * + * Iterate over an rcu-protected list, continuing after current point. + * + * This list-traversal primitive may safely run concurrently with + * the _rcu list-mutation primitives such as list_add_rcu() + * as long as the traversal is guarded by rcu_read_lock(). + */ +#define list_for_each_continue_rcu(pos, head) \ + for ((pos) = (pos)->next; \ + prefetch(rcu_dereference((pos))->next), (pos) != (head); \ + (pos) = (pos)->next) + +/** + * hlist_del_rcu - deletes entry from hash list without re-initialization + * @n: the element to delete from the hash list. + * + * Note: list_unhashed() on entry does not return true after this, + * the entry is in an undefined state. It is useful for RCU based + * lockfree traversal. + * + * In particular, it means that we can not poison the forward + * pointers that may still be used for walking the hash list. + * + * The caller must take whatever precautions are necessary + * (such as holding appropriate locks) to avoid racing + * with another list-mutation primitive, such as hlist_add_head_rcu() + * or hlist_del_rcu(), running on this same list. + * However, it is perfectly legal to run concurrently with + * the _rcu list-traversal primitives, such as + * hlist_for_each_entry(). + */ +static inline void hlist_del_rcu(struct hlist_node *n) +{ + __hlist_del(n); + n->pprev = LIST_POISON2; +} + +/** + * hlist_replace_rcu - replace old entry by new one + * @old : the element to be replaced + * @new : the new element to insert + * + * The @old entry will be replaced with the @new entry atomically. 
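+ * Concurrent RCU readers traversing the list will observe either the
+ * @old or the @new element, never a list containing neither one.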
+ */ +static inline void hlist_replace_rcu(struct hlist_node *old, + struct hlist_node *new) +{ + struct hlist_node *next = old->next; + + new->next = next; + new->pprev = old->pprev; + smp_wmb(); + if (next) + new->next->pprev = &new->next; + *new->pprev = new; + old->pprev = LIST_POISON2; +} + +/** + * hlist_add_head_rcu + * @n: the element to add to the hash list. + * @h: the list to add to. + * + * Description: + * Adds the specified element to the specified hlist, + * while permitting racing traversals. + * + * The caller must take whatever precautions are necessary + * (such as holding appropriate locks) to avoid racing + * with another list-mutation primitive, such as hlist_add_head_rcu() + * or hlist_del_rcu(), running on this same list. + * However, it is perfectly legal to run concurrently with + * the _rcu list-traversal primitives, such as + * hlist_for_each_entry_rcu(), used to prevent memory-consistency + * problems on Alpha CPUs. Regardless of the type of CPU, the + * list-traversal primitive must be guarded by rcu_read_lock(). + */ +static inline void hlist_add_head_rcu(struct hlist_node *n, + struct hlist_head *h) +{ + struct hlist_node *first = h->first; + n->next = first; + n->pprev = &h->first; + smp_wmb(); + if (first) + first->pprev = &n->next; + h->first = n; +} + +/** + * hlist_add_before_rcu + * @n: the new element to add to the hash list. + * @next: the existing element to add the new element before. + * + * Description: + * Adds the specified element to the specified hlist + * before the specified node while permitting racing traversals. + * + * The caller must take whatever precautions are necessary + * (such as holding appropriate locks) to avoid racing + * with another list-mutation primitive, such as hlist_add_head_rcu() + * or hlist_del_rcu(), running on this same list. + * However, it is perfectly legal to run concurrently with + * the _rcu list-traversal primitives, such as + * hlist_for_each_entry_rcu(), used to prevent memory-consistency + * problems on Alpha CPUs. + */ +static inline void hlist_add_before_rcu(struct hlist_node *n, + struct hlist_node *next) +{ + n->pprev = next->pprev; + n->next = next; + smp_wmb(); + next->pprev = &n->next; + *(n->pprev) = n; +} + +/** + * hlist_add_after_rcu + * @prev: the existing element to add the new element after. + * @n: the new element to add to the hash list. + * + * Description: + * Adds the specified element to the specified hlist + * after the specified node while permitting racing traversals. + * + * The caller must take whatever precautions are necessary + * (such as holding appropriate locks) to avoid racing + * with another list-mutation primitive, such as hlist_add_head_rcu() + * or hlist_del_rcu(), running on this same list. + * However, it is perfectly legal to run concurrently with + * the _rcu list-traversal primitives, such as + * hlist_for_each_entry_rcu(), used to prevent memory-consistency + * problems on Alpha CPUs. + */ +static inline void hlist_add_after_rcu(struct hlist_node *prev, + struct hlist_node *n) +{ + n->next = prev->next; + n->pprev = &prev->next; + smp_wmb(); + prev->next = n; + if (n->next) + n->next->pprev = &n->next; +} + +/** + * hlist_for_each_entry_rcu - iterate over rcu list of given type + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct hlist_node to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the hlist_node within the struct. 
+ * + * This list-traversal primitive may safely run concurrently with + * the _rcu list-mutation primitives such as hlist_add_head_rcu() + * as long as the traversal is guarded by rcu_read_lock(). + */ +#define hlist_for_each_entry_rcu(tpos, pos, head, member) \ + for (pos = (head)->first; \ + rcu_dereference(pos) && ({ prefetch(pos->next); 1; }) && \ + ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \ + pos = pos->next) + +#endif /* __KERNEL__ */ +#endif diff --git a/kernel/pid.c b/kernel/pid.c index 20d59fa2d493..30bd5d4b2ac7 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include diff --git a/lib/textsearch.c b/lib/textsearch.c index be8bda3862f5..a3e500ad51d7 100644 --- a/lib/textsearch.c +++ b/lib/textsearch.c @@ -97,6 +97,7 @@ #include #include #include +#include #include #include #include diff --git a/net/802/psnap.c b/net/802/psnap.c index 31128cb92a23..ea4643931446 100644 --- a/net/802/psnap.c +++ b/net/802/psnap.c @@ -20,6 +20,7 @@ #include #include #include +#include static LIST_HEAD(snap_list); static DEFINE_SPINLOCK(snap_lock); diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 2a739adaa92b..e7ddbfa0e02f 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 72c5976a5ce3..142060f02054 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -15,6 +15,7 @@ #include #include +#include #include #include #include diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index e38034aa56f5..9e96ffcd29a3 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -13,6 +13,7 @@ * 2 of the License, or (at your option) any later version. */ #include +#include #include "br_private.h" #include "br_private_stp.h" diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c index 02c2f7c0b255..643c032a3a57 100644 --- a/net/netlabel/netlabel_domainhash.c +++ b/net/netlabel/netlabel_domainhash.c @@ -30,8 +30,7 @@ */ #include -#include -#include +#include #include #include #include -- cgit v1.2.3 From 10aa9d2cf9878757b003023d33ff90a37aa3044b Mon Sep 17 00:00:00 2001 From: Franck Bui-Huu Date: Mon, 12 May 2008 21:21:06 +0200 Subject: rculist.h: use the rcu API Make almost all list mutation primitives use rcu_assign_pointer(). The main point of this being readability improvement. Signed-off-by: Franck Bui-Huu Cc: "Paul E. McKenney" Cc: Josh Triplett Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- include/linux/rculist.h | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rculist.h b/include/linux/rculist.h index aa9b3eb15683..8d2c81fccfe5 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -7,6 +7,7 @@ * RCU-protected list version */ #include +#include /* * Insert a new entry between two known consecutive entries. 
@@ -19,9 +20,8 @@ static inline void __list_add_rcu(struct list_head *new, { new->next = next; new->prev = prev; - smp_wmb(); + rcu_assign_pointer(prev->next, new); next->prev = new; - prev->next = new; } /** @@ -110,9 +110,8 @@ static inline void list_replace_rcu(struct list_head *old, { new->next = old->next; new->prev = old->prev; - smp_wmb(); + rcu_assign_pointer(new->prev->next, new); new->next->prev = new; - new->prev->next = new; old->prev = LIST_POISON2; } @@ -166,8 +165,7 @@ static inline void list_splice_init_rcu(struct list_head *list, */ last->next = at; - smp_wmb(); - head->next = first; + rcu_assign_pointer(head->next, first); first->prev = head; at->prev = last; } @@ -280,10 +278,9 @@ static inline void hlist_replace_rcu(struct hlist_node *old, new->next = next; new->pprev = old->pprev; - smp_wmb(); + rcu_assign_pointer(*new->pprev, new); if (next) new->next->pprev = &new->next; - *new->pprev = new; old->pprev = LIST_POISON2; } @@ -310,12 +307,12 @@ static inline void hlist_add_head_rcu(struct hlist_node *n, struct hlist_head *h) { struct hlist_node *first = h->first; + n->next = first; n->pprev = &h->first; - smp_wmb(); + rcu_assign_pointer(h->first, n); if (first) first->pprev = &n->next; - h->first = n; } /** @@ -341,9 +338,8 @@ static inline void hlist_add_before_rcu(struct hlist_node *n, { n->pprev = next->pprev; n->next = next; - smp_wmb(); + rcu_assign_pointer(*(n->pprev), n); next->pprev = &n->next; - *(n->pprev) = n; } /** @@ -369,8 +365,7 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev, { n->next = prev->next; n->pprev = &prev->next; - smp_wmb(); - prev->next = n; + rcu_assign_pointer(prev->next, n); if (n->next) n->next->pprev = &n->next; } -- cgit v1.2.3 From 78b0e0e9b27b62c4b22f05a147f7a80fa58b1ae3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 12 May 2008 21:21:06 +0200 Subject: RCU, rculist.h: fix list iterators RCU list iterators: should prefetch ever be optimised out with no side-effects, the current version will lose the barrier completely. Pointed-out-by: Linus Torvalds Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- include/linux/rculist.h | 48 +++++++++++++++--------------------------------- 1 file changed, 15 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 8d2c81fccfe5..b0f39be08b6c 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -180,31 +180,14 @@ static inline void list_splice_init_rcu(struct list_head *list, * as long as the traversal is guarded by rcu_read_lock(). */ #define list_for_each_rcu(pos, head) \ - for (pos = (head)->next; \ - prefetch(rcu_dereference(pos)->next), pos != (head); \ - pos = pos->next) + for (pos = rcu_dereference((head)->next); \ + prefetch(pos->next), pos != (head); \ + pos = rcu_dereference(pos->next)) #define __list_for_each_rcu(pos, head) \ - for (pos = (head)->next; \ - rcu_dereference(pos) != (head); \ - pos = pos->next) - -/** - * list_for_each_safe_rcu - * @pos: the &struct list_head to use as a loop cursor. - * @n: another &struct list_head to use as temporary storage - * @head: the head for your list. - * - * Iterate over an rcu-protected list, safe against removal of list entry. - * - * This list-traversal primitive may safely run concurrently with - * the _rcu list-mutation primitives such as list_add_rcu() - * as long as the traversal is guarded by rcu_read_lock(). 
- */ -#define list_for_each_safe_rcu(pos, n, head) \ - for (pos = (head)->next; \ - n = rcu_dereference(pos)->next, pos != (head); \ - pos = n) + for (pos = rcu_dereference((head)->next); \ + pos != (head); \ + pos = rcu_dereference(pos->next)) /** * list_for_each_entry_rcu - iterate over rcu list of given type @@ -217,10 +200,9 @@ static inline void list_splice_init_rcu(struct list_head *list, * as long as the traversal is guarded by rcu_read_lock(). */ #define list_for_each_entry_rcu(pos, head, member) \ - for (pos = list_entry((head)->next, typeof(*pos), member); \ - prefetch(rcu_dereference(pos)->member.next), \ - &pos->member != (head); \ - pos = list_entry(pos->member.next, typeof(*pos), member)) + for (pos = list_entry(rcu_dereference((head)->next), typeof(*pos), member); \ + prefetch(pos->member.next), &pos->member != (head); \ + pos = list_entry(rcu_dereference(pos->member.next), typeof(*pos), member)) /** @@ -235,9 +217,9 @@ static inline void list_splice_init_rcu(struct list_head *list, * as long as the traversal is guarded by rcu_read_lock(). */ #define list_for_each_continue_rcu(pos, head) \ - for ((pos) = (pos)->next; \ - prefetch(rcu_dereference((pos))->next), (pos) != (head); \ - (pos) = (pos)->next) + for ((pos) = rcu_dereference((pos)->next); \ + prefetch((pos)->next), (pos) != (head); \ + (pos) = rcu_dereference((pos)->next)) /** * hlist_del_rcu - deletes entry from hash list without re-initialization @@ -382,10 +364,10 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev, * as long as the traversal is guarded by rcu_read_lock(). */ #define hlist_for_each_entry_rcu(tpos, pos, head, member) \ - for (pos = (head)->first; \ - rcu_dereference(pos) && ({ prefetch(pos->next); 1; }) && \ + for (pos = rcu_dereference((head)->first); \ + pos && ({ prefetch(pos->next); 1; }) && \ ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \ - pos = pos->next) + pos = rcu_dereference(pos->next)) #endif /* __KERNEL__ */ #endif -- cgit v1.2.3 From 1a189b97190d3f0f8cf0379a799d3555b2d648bb Mon Sep 17 00:00:00 2001 From: Russell King Date: Sun, 13 Apr 2008 21:41:55 +0100 Subject: [ARM] pxa: Add bare bones PWM API Signed-off-by: Russell King --- arch/arm/Kconfig | 3 +++ include/linux/pwm.h | 31 +++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 include/linux/pwm.h (limited to 'include/linux') diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index b786e68914d4..c274dbb89a83 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -22,6 +22,9 @@ config ARM Europe. There is an ARM Linux project with a web page at . 
+config HAVE_PWM
+	bool
+
 config SYS_SUPPORTS_APM_EMULATION
 	bool
 
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
new file mode 100644
index 000000000000..3945f803d514
--- /dev/null
+++ b/include/linux/pwm.h
@@ -0,0 +1,31 @@
+#ifndef __LINUX_PWM_H
+#define __LINUX_PWM_H
+
+struct pwm_device;
+
+/*
+ * pwm_request - request a PWM device
+ */
+struct pwm_device *pwm_request(int pwm_id, const char *label);
+
+/*
+ * pwm_free - free a PWM device
+ */
+void pwm_free(struct pwm_device *pwm);
+
+/*
+ * pwm_config - change a PWM device configuration
+ */
+int pwm_config(struct pwm_device *pwm, int duty_ns, int period_ns);
+
+/*
+ * pwm_enable - start a PWM output toggling
+ */
+int pwm_enable(struct pwm_device *pwm);
+
+/*
+ * pwm_disable - stop a PWM output toggling
+ */
+void pwm_disable(struct pwm_device *pwm);
+
+#endif /* __LINUX_PWM_H */
-- 
cgit v1.2.3


From bd3bff9e20f454b242d979ec2f9a4dca0d5fa06f Mon Sep 17 00:00:00 2001
From: Ingo Molnar 
Date: Mon, 12 May 2008 21:20:41 +0200
Subject: sched: add latency tracer callbacks to the scheduler

add 3 lightweight callbacks to the tracer backend.

zero impact if tracing is turned off.

Signed-off-by: Ingo Molnar 
Signed-off-by: Thomas Gleixner 
---
 include/linux/sched.h | 26 ++++++++++++++++++++++++++
 kernel/sched.c        |  3 +++
 2 files changed, 29 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5395a6176f4b..717cab8a0c83 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2117,6 +2117,32 @@ static inline void arch_pick_mmap_layout(struct mm_struct *mm)
 }
 #endif
 
+#ifdef CONFIG_CONTEXT_SWITCH_TRACER
+extern void
+ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next);
+#else
+static inline void
+ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next)
+{
+}
+#endif
+
+#ifdef CONFIG_SCHED_TRACER
+extern void
+ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr);
+extern void
+ftrace_wake_up_new_task(struct task_struct *wakee, struct task_struct *curr);
+#else
+static inline void
+ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr)
+{
+}
+static inline void
+ftrace_wake_up_new_task(struct task_struct *wakee, struct task_struct *curr)
+{
+}
+#endif
+
 extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask);
 extern long sched_getaffinity(pid_t pid, cpumask_t *mask);
 
diff --git a/kernel/sched.c b/kernel/sched.c
index cfa222a91539..463dcdb36ef8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2467,6 +2467,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 
 out_activate:
 #endif /* CONFIG_SMP */
+	ftrace_wake_up_task(p, rq->curr);
 	schedstat_inc(p, se.nr_wakeups);
 	if (sync)
 		schedstat_inc(p, se.nr_wakeups_sync);
@@ -2611,6 +2612,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 		p->sched_class->task_new(rq, p);
 		inc_nr_running(rq);
 	}
+	ftrace_wake_up_new_task(p, rq->curr);
 	check_preempt_curr(rq, p);
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_wake_up)
@@ -2783,6 +2785,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	struct mm_struct *mm, *oldmm;
 
 	prepare_task_switch(rq, prev, next);
+	ftrace_ctx_switch(prev, next);
 	mm = next->mm;
 	oldmm = prev->active_mm;
 	/*
-- 
cgit v1.2.3


From 7c731e0a495e25e79dc1e9e68772a67a55721a65 Mon Sep 17 00:00:00 2001
From: Steven Rostedt 
Date: Mon, 12 May 2008 21:20:41 +0200
Subject: ftrace: make the task state char-string visible to all

The tracer wants to be able to convert the state number
into a user visible
character. This patch pulls that conversion string out of the
scheduler into the header. This way, if it ever changes, other
parts of the kernel will know.

Signed-off-by: Steven Rostedt 
Signed-off-by: Ingo Molnar 
Signed-off-by: Thomas Gleixner 
---
 include/linux/sched.h | 2 ++
 kernel/sched.c        | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 717cab8a0c83..6e26f1fdbfe2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2237,6 +2237,8 @@ static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
 }
 #endif /* CONFIG_MM_OWNER */
 
+#define TASK_STATE_TO_CHAR_STR "RSDTtZX"
+
 #endif /* __KERNEL__ */
 
 #endif
diff --git a/kernel/sched.c b/kernel/sched.c
index 463dcdb36ef8..73e600852365 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5729,7 +5729,7 @@ out_unlock:
 	return retval;
 }
 
-static const char stat_nam[] = "RSDTtZX";
+static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
 
 void sched_show_task(struct task_struct *p)
 {
-- 
cgit v1.2.3


From 502825282e6f79c975a644afc124432ec1744de4 Mon Sep 17 00:00:00 2001
From: Steven Rostedt 
Date: Mon, 12 May 2008 21:20:41 +0200
Subject: ftrace: add preempt_enable/disable notrace macros

The tracer may need to call preempt_enable and disable functions
for time keeping and such. The trace gets ugly when we see these
functions show up for all traces. To make the output cleaner
this patch adds preempt_enable_notrace and preempt_disable_notrace
to be used by tracer (and debugging) functions.

Signed-off-by: Steven Rostedt 
Signed-off-by: Ingo Molnar 
Signed-off-by: Thomas Gleixner 
---
 include/linux/preempt.h | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 23f0c54175cd..36b03d50bf40 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -52,6 +52,34 @@ do { \
 	preempt_check_resched(); \
 } while (0)
 
+/* For debugging and tracer internals only! */
+#define add_preempt_count_notrace(val) \
+	do { preempt_count() += (val); } while (0)
+#define sub_preempt_count_notrace(val) \
+	do { preempt_count() -= (val); } while (0)
+#define inc_preempt_count_notrace() add_preempt_count_notrace(1)
+#define dec_preempt_count_notrace() sub_preempt_count_notrace(1)
+
+#define preempt_disable_notrace() \
+do { \
+	inc_preempt_count_notrace(); \
+	barrier(); \
+} while (0)
+
+#define preempt_enable_no_resched_notrace() \
+do { \
+	barrier(); \
+	dec_preempt_count_notrace(); \
+} while (0)
+
+/* preempt_check_resched is OK to trace */
+#define preempt_enable_notrace() \
+do { \
+	preempt_enable_no_resched_notrace(); \
+	barrier(); \
+	preempt_check_resched(); \
+} while (0)
+
 #else
 
 #define preempt_disable()		do { } while (0)
@@ -59,6 +87,10 @@ do { \
 #define preempt_enable()		do { } while (0)
 #define preempt_check_resched()		do { } while (0)
 
+#define preempt_disable_notrace()	do { } while (0)
+#define preempt_enable_no_resched_notrace()	do { } while (0)
+#define preempt_enable_notrace()	do { } while (0)
+
 #endif
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
-- 
cgit v1.2.3


From ffdc1a09ae7e2cbd714a446ee38a27f625b5f1c8 Mon Sep 17 00:00:00 2001
From: Ingo Molnar 
Date: Mon, 12 May 2008 21:20:41 +0200
Subject: tracing: add notrace to linkage.h

notrace signals that a function should not be traced.
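A quick illustration of the annotation (the function name here is
hypothetical):

	static notrace void vdso_helper(void)
	{
		/* this function will not be instrumented */
	}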
Most of the time this is used by tracers
to annotate code that cannot be traced - it's in a volatile
state (such as in user vdso context or NMI context) or it's
in the tracer internals.

Signed-off-by: Ingo Molnar 
Signed-off-by: Thomas Gleixner 
---
 include/linux/linkage.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/linkage.h b/include/linux/linkage.h
index 2119610b24f8..14f329c64ba8 100644
--- a/include/linux/linkage.h
+++ b/include/linux/linkage.h
@@ -3,6 +3,8 @@
 
 #include 
 
+#define notrace __attribute__((no_instrument_function))
+
 #ifdef __cplusplus
 #define CPP_ASMLINKAGE extern "C"
 #else
-- 
cgit v1.2.3


From 16444a8a40d4c7b4f6de34af0cae1f76a4f6c901 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo 
Date: Mon, 12 May 2008 21:20:42 +0200
Subject: ftrace: add basic support for gcc profiler instrumentation

If CONFIG_FTRACE is selected and /proc/sys/kernel/ftrace_enabled is
set to a non-zero value the ftrace routine will be called every time
we enter a kernel function that is not marked with the "notrace"
attribute.

The ftrace routine will then call a registered function if one
happens to be registered.

[ This code has been highly hacked by Steven Rostedt and Ingo Molnar,
  so don't blame Arnaldo for all of this ;-) ]

Update:
  It is now possible to register more than one ftrace function.
  If only one ftrace function is registered, that will be the
  function that ftrace calls directly. If more than one function
  is registered, then ftrace will call a function that will loop
  through the functions to call.

Signed-off-by: Arnaldo Carvalho de Melo 
Signed-off-by: Steven Rostedt 
Signed-off-by: Ingo Molnar 
Signed-off-by: Thomas Gleixner 
---
 Makefile                   |   4 ++
 arch/x86/Kconfig           |   1 +
 arch/x86/kernel/entry_32.S |  27 +++++++++
 arch/x86/kernel/entry_64.S |  37 ++++++++++++
 include/linux/ftrace.h     |  38 +++++++++++++
 kernel/Makefile            |   1 +
 kernel/trace/Kconfig       |   5 ++
 kernel/trace/Makefile      |   3 +
 kernel/trace/ftrace.c      | 138 +++++++++++++++++++++++++++++++++++++++++++++
 lib/Kconfig.debug          |   2 +
 10 files changed, 256 insertions(+)
 create mode 100644 include/linux/ftrace.h
 create mode 100644 kernel/trace/Kconfig
 create mode 100644 kernel/trace/Makefile
 create mode 100644 kernel/trace/ftrace.c

(limited to 'include/linux')

diff --git a/Makefile b/Makefile
index 20b32351906b..b4a273f19b52 100644
--- a/Makefile
+++ b/Makefile
@@ -528,6 +528,10 @@ KBUILD_CFLAGS	+= -g
 KBUILD_AFLAGS	+= -gdwarf-2
 endif
 
+ifdef CONFIG_FTRACE
+KBUILD_CFLAGS	+= -pg
+endif
+
 # We trigger additional mismatches with less inlining
 ifdef CONFIG_DEBUG_SECTION_MISMATCH
 KBUILD_CFLAGS += $(call cc-option, -fno-inline-functions-called-once)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fe361ae7ef2f..c742dfeb0dbe 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -23,6 +23,7 @@ config X86
 	select HAVE_OPROFILE
 	select HAVE_KPROBES
 	select HAVE_KRETPROBES
+	select HAVE_FTRACE
 	select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
 	select HAVE_ARCH_KGDB if !X86_VOYAGER
 
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 2a609dc3271c..f47b9b5440d2 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1109,6 +1109,33 @@ ENDPROC(xen_failsafe_callback)
 
 #endif	/* CONFIG_XEN */
 
+#ifdef CONFIG_FTRACE
+ENTRY(mcount)
+	cmpl $ftrace_stub, ftrace_trace_function
+	jnz trace
+
+.globl ftrace_stub
+ftrace_stub:
+	ret
+
+	/* taken from glibc */
+trace:
+	pushl %eax
+	pushl %ecx
+	pushl %edx
+	movl 0xc(%esp), %eax
+	movl 0x4(%ebp), %edx
+
+	call *ftrace_trace_function
+
+	popl %edx
+	popl %ecx
+	popl %eax
+
+	jmp ftrace_stub
+END(mcount)
+#endif
+
 .section .rodata,"a"
 #include "syscall_table_32.S"
 
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 556a8df522a7..f046e0c64883 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -54,6 +54,43 @@
 
 	.code64
 
+#ifdef CONFIG_FTRACE
+ENTRY(mcount)
+	cmpq $ftrace_stub, ftrace_trace_function
+	jnz trace
+.globl ftrace_stub
+ftrace_stub:
+	retq
+
+trace:
+	/* taken from glibc */
+	subq $0x38, %rsp
+	movq %rax, (%rsp)
+	movq %rcx, 8(%rsp)
+	movq %rdx, 16(%rsp)
+	movq %rsi, 24(%rsp)
+	movq %rdi, 32(%rsp)
+	movq %r8, 40(%rsp)
+	movq %r9, 48(%rsp)
+
+	movq 0x38(%rsp), %rdi
+	movq 8(%rbp), %rsi
+
+	call *ftrace_trace_function
+
+	movq 48(%rsp), %r9
+	movq 40(%rsp), %r8
+	movq 32(%rsp), %rdi
+	movq 24(%rsp), %rsi
+	movq 16(%rsp), %rdx
+	movq 8(%rsp), %rcx
+	movq (%rsp), %rax
+	addq $0x38, %rsp
+
+	jmp ftrace_stub
+END(mcount)
+#endif
+
 #ifndef CONFIG_PREEMPT
 #define retint_kernel retint_restore_args
 #endif
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
new file mode 100644
index 000000000000..b96ef14c249a
--- /dev/null
+++ b/include/linux/ftrace.h
@@ -0,0 +1,38 @@
+#ifndef _LINUX_FTRACE_H
+#define _LINUX_FTRACE_H
+
+#ifdef CONFIG_FTRACE
+
+#include 
+
+#define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
+#define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1))
+#define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2))
+
+typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip);
+
+struct ftrace_ops {
+	ftrace_func_t	  func;
+	struct ftrace_ops *next;
+};
+
+/*
+ * The ftrace_ops must be static and should also
+ * be read_mostly. These functions do modify read_mostly variables
+ * so use them sparingly. Never free an ftrace_ops or modify the
+ * next pointer after it has been registered. Even after unregistering
+ * it, the next pointer may still be used internally.
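+ *
+ * A minimal usage sketch (all names below are hypothetical):
+ *
+ *	static void notrace my_trace_func(unsigned long ip,
+ *					  unsigned long parent_ip)
+ *	{
+ *		(record ip and parent_ip; everything called from
+ *		 here must itself be notrace)
+ *	}
+ *
+ *	static struct ftrace_ops my_ops __read_mostly = {
+ *		.func	= my_trace_func,
+ *	};
+ *
+ *	register_ftrace_function(&my_ops);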
+ */ +int register_ftrace_function(struct ftrace_ops *ops); +int unregister_ftrace_function(struct ftrace_ops *ops); +void clear_ftrace_function(void); + +extern void ftrace_stub(unsigned long a0, unsigned long a1); +extern void mcount(void); + +#else /* !CONFIG_FTRACE */ +# define register_ftrace_function(ops) do { } while (0) +# define unregister_ftrace_function(ops) do { } while (0) +# define clear_ftrace_function(ops) do { } while (0) +#endif /* CONFIG_FTRACE */ +#endif /* _LINUX_FTRACE_H */ diff --git a/kernel/Makefile b/kernel/Makefile index 1c9938addb9d..fa05f6d8bdbf 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -69,6 +69,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_MARKERS) += marker.o obj-$(CONFIG_LATENCYTOP) += latencytop.o +obj-$(CONFIG_FTRACE) += trace/ ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig new file mode 100644 index 000000000000..8185c91417bc --- /dev/null +++ b/kernel/trace/Kconfig @@ -0,0 +1,5 @@ +# +# Architectures that offer an FTRACE implementation should select HAVE_FTRACE: +# +config HAVE_FTRACE + bool diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile new file mode 100644 index 000000000000..bf4fd215a6a9 --- /dev/null +++ b/kernel/trace/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_FTRACE) += libftrace.o + +libftrace-y := ftrace.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c new file mode 100644 index 000000000000..b6a80b98a3fb --- /dev/null +++ b/kernel/trace/ftrace.c @@ -0,0 +1,138 @@ +/* + * Infrastructure for profiling code inserted by 'gcc -pg'. + * + * Copyright (C) 2007-2008 Steven Rostedt + * Copyright (C) 2004-2008 Ingo Molnar + * + * Originally ported from the -rt patch by: + * Copyright (C) 2007 Arnaldo Carvalho de Melo + * + * Based on code in the latency_tracer, that is: + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ + +#include +#include + +static DEFINE_SPINLOCK(ftrace_func_lock); +static struct ftrace_ops ftrace_list_end __read_mostly = +{ + .func = ftrace_stub, +}; + +static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; +ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; + +/* mcount is defined per arch in assembly */ +EXPORT_SYMBOL(mcount); + +notrace void ftrace_list_func(unsigned long ip, unsigned long parent_ip) +{ + struct ftrace_ops *op = ftrace_list; + + /* in case someone actually ports this to alpha! */ + read_barrier_depends(); + + while (op != &ftrace_list_end) { + /* silly alpha */ + read_barrier_depends(); + op->func(ip, parent_ip); + op = op->next; + }; +} + +/** + * register_ftrace_function - register a function for profiling + * @ops - ops structure that holds the function for profiling. + * + * Register a function to be called by all functions in the + * kernel. + * + * Note: @ops->func and all the functions it calls must be labeled + * with "notrace", otherwise it will go into a + * recursive loop. + */ +int register_ftrace_function(struct ftrace_ops *ops) +{ + unsigned long flags; + + spin_lock_irqsave(&ftrace_func_lock, flags); + ops->next = ftrace_list; + /* + * We are entering ops into the ftrace_list but another + * CPU might be walking that list. We need to make sure + * the ops->next pointer is valid before another CPU sees + * the ops pointer included into the ftrace_list. 
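+	 * The read side of this barrier is the pair of
+	 * read_barrier_depends() calls in ftrace_list_func() above.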
+	 */
+	smp_wmb();
+	ftrace_list = ops;
+	/*
+	 * For one func, simply call it directly.
+	 * For more than one func, call the chain.
+	 */
+	if (ops->next == &ftrace_list_end)
+		ftrace_trace_function = ops->func;
+	else
+		ftrace_trace_function = ftrace_list_func;
+	spin_unlock_irqrestore(&ftrace_func_lock, flags);
+
+	return 0;
+}
+
+/**
+ * unregister_ftrace_function - unregister a function for profiling.
+ * @ops - ops structure that holds the function to unregister
+ *
+ * Unregister a function that was added to be called by ftrace profiling.
+ */
+int unregister_ftrace_function(struct ftrace_ops *ops)
+{
+	unsigned long flags;
+	struct ftrace_ops **p;
+	int ret = 0;
+
+	spin_lock_irqsave(&ftrace_func_lock, flags);
+
+	/*
+	 * If we are the only function, then the ftrace pointer is
+	 * pointing directly to that function.
+	 */
+	if (ftrace_list == ops && ops->next == &ftrace_list_end) {
+		ftrace_trace_function = ftrace_stub;
+		ftrace_list = &ftrace_list_end;
+		goto out;
+	}
+
+	for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next)
+		if (*p == ops)
+			break;
+
+	if (*p != ops) {
+		ret = -1;
+		goto out;
+	}
+
+	*p = (*p)->next;
+
+	/* If we only have one func left, then call that directly */
+	if (ftrace_list->next == &ftrace_list_end)
+		ftrace_trace_function = ftrace_list->func;
+
+ out:
+	spin_unlock_irqrestore(&ftrace_func_lock, flags);
+
+	return ret;
+}
+
+/**
+ * clear_ftrace_function - reset the ftrace function
+ *
+ * This NULLs the ftrace function and in essence stops
+ * tracing. There may be a lag between the unregistering
+ * and the callback actually ceasing to be invoked.
+ */
+void clear_ftrace_function(void)
+{
+	ftrace_trace_function = ftrace_stub;
+}
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index d2099f41aa1e..d8b6279a9b42 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -634,6 +634,8 @@ config LATENCYTOP
 	  Enable this option if you want to use the LatencyTOP tool
 	  to find out which userspace is blocking on what kernel operations.
 
+source kernel/trace/Kconfig
+
 config PROVIDE_OHCI1394_DMA_INIT
 	bool "Remote debugging over FireWire early on boot"
 	depends on PCI && X86
-- 
cgit v1.2.3


From 352ad25aa4a189c667cb2af333948d34692a2d27 Mon Sep 17 00:00:00 2001
From: Steven Rostedt 
Date: Mon, 12 May 2008 21:20:42 +0200
Subject: ftrace: tracer for scheduler wakeup latency

This patch adds the tracer that tracks the wakeup latency of the
highest priority waking task.
"wakeup" is added to /debugfs/tracing/available_tracers Also added to /debugfs/tracing tracing_max_latency holds the current max latency for the wakeup wakeup_thresh if set to other than zero, a log will be recorded for every wakeup that takes longer than the number entered in here (usecs for all counters) (deletes previous trace) Examples: (with ftrace_enabled = 0) ============ preemption latency trace v1.1.5 on 2.6.24-rc8 Signed-off-by: Ingo Molnar -------------------------------------------------------------------- latency: 26 us, #2/2, CPU#1 | (M:rt VP:0, KP:0, SP:0 HP:0 #P:2) ----------------- | task: migration/0-3 (uid:0 nice:-5 policy:1 rt_prio:99) ----------------- _------=> CPU# / _-----=> irqs-off | / _----=> need-resched || / _---=> hardirq/softirq ||| / _--=> preempt-depth |||| / ||||| delay cmd pid ||||| time | caller \ / ||||| \ | / quilt-8551 0d..3 0us+: wake_up_process+0x15/0x17 (sched_exec+0xc9/0x100 ) quilt-8551 0d..4 26us : sched_switch_callback+0x73/0x81 (schedule+0x483/0x6d5 ) vim:ft=help ============ (with ftrace_enabled = 1) ============ preemption latency trace v1.1.5 on 2.6.24-rc8 -------------------------------------------------------------------- latency: 36 us, #45/45, CPU#0 | (M:rt VP:0, KP:0, SP:0 HP:0 #P:2) ----------------- | task: migration/1-5 (uid:0 nice:-5 policy:1 rt_prio:99) ----------------- _------=> CPU# / _-----=> irqs-off | / _----=> need-resched || / _---=> hardirq/softirq ||| / _--=> preempt-depth |||| / ||||| delay cmd pid ||||| time | caller \ / ||||| \ | / bash-10653 1d..3 0us : wake_up_process+0x15/0x17 (sched_exec+0xc9/0x100 ) bash-10653 1d..3 1us : try_to_wake_up+0x271/0x2e7 (sub_preempt_count+0xc/0x7a ) bash-10653 1d..2 2us : try_to_wake_up+0x296/0x2e7 (update_rq_clock+0x9/0x20 ) bash-10653 1d..2 2us : update_rq_clock+0x1e/0x20 (__update_rq_clock+0xc/0x90 ) bash-10653 1d..2 3us : __update_rq_clock+0x1b/0x90 (sched_clock+0x9/0x29 ) bash-10653 1d..2 4us : try_to_wake_up+0x2a6/0x2e7 (activate_task+0xc/0x3f ) bash-10653 1d..2 4us : activate_task+0x2d/0x3f (enqueue_task+0xe/0x66 ) bash-10653 1d..2 5us : enqueue_task+0x5b/0x66 (enqueue_task_rt+0x9/0x3c ) bash-10653 1d..2 6us : try_to_wake_up+0x2ba/0x2e7 (check_preempt_wakeup+0x12/0x99 ) [...] bash-10653 1d..5 33us : tracing_record_cmdline+0xcf/0xd4 (_spin_unlock+0x9/0x33 ) bash-10653 1d..5 34us : _spin_unlock+0x19/0x33 (sub_preempt_count+0xc/0x7a ) bash-10653 1d..4 35us : wakeup_sched_switch+0x65/0x2ff (_spin_lock_irqsave+0xc/0xa9 ) bash-10653 1d..4 35us : _spin_lock_irqsave+0x19/0xa9 (add_preempt_count+0xe/0x77 ) bash-10653 1d..4 36us : sched_switch_callback+0x73/0x81 (schedule+0x483/0x6d5 ) vim:ft=help ============ The [...] was added here to not waste your email box space. 
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/ftrace.h | 23 ++- kernel/trace/Kconfig | 13 ++ kernel/trace/Makefile | 1 + kernel/trace/trace_sched_wakeup.c | 310 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 343 insertions(+), 4 deletions(-) create mode 100644 kernel/trace/trace_sched_wakeup.c (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index b96ef14c249a..db8a5e7abe41 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -5,10 +5,6 @@ #include -#define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) -#define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1)) -#define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2)) - typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip); struct ftrace_ops { @@ -35,4 +31,23 @@ extern void mcount(void); # define unregister_ftrace_function(ops) do { } while (0) # define clear_ftrace_function(ops) do { } while (0) #endif /* CONFIG_FTRACE */ + + +#ifdef CONFIG_FRAME_POINTER +/* TODO: need to fix this for ARM */ +# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) +# define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1)) +# define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2)) +# define CALLER_ADDR3 ((unsigned long)__builtin_return_address(3)) +# define CALLER_ADDR4 ((unsigned long)__builtin_return_address(4)) +# define CALLER_ADDR5 ((unsigned long)__builtin_return_address(5)) +#else +# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) +# define CALLER_ADDR1 0UL +# define CALLER_ADDR2 0UL +# define CALLER_ADDR3 0UL +# define CALLER_ADDR4 0UL +# define CALLER_ADDR5 0UL +#endif + #endif /* _LINUX_FTRACE_H */ diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 5d6aa92866cd..892ecc94a82b 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -4,6 +4,9 @@ config HAVE_FTRACE bool +config TRACER_MAX_TRACE + bool + config TRACING bool select DEBUG_FS @@ -23,6 +26,16 @@ config FTRACE (the bootup default), then the overhead of the instructions is very small and not measurable even in micro-benchmarks. +config SCHED_TRACER + bool "Scheduling Latency Tracer" + depends on DEBUG_KERNEL + select TRACING + select CONTEXT_SWITCH_TRACER + select TRACER_MAX_TRACE + help + This tracer tracks the latency of the highest priority task + to be scheduled in, starting from the point it has woken up. 
+ config CONTEXT_SWITCH_TRACER bool "Trace process context switches" depends on DEBUG_KERNEL diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 6b54ceb7f16e..5508cdb19aea 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -3,5 +3,6 @@ obj-$(CONFIG_FTRACE) += libftrace.o obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_FTRACE) += trace_functions.o +obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c new file mode 100644 index 000000000000..7c3ccefcf4c3 --- /dev/null +++ b/kernel/trace/trace_sched_wakeup.c @@ -0,0 +1,310 @@ +/* + * trace task wakeup timings + * + * Copyright (C) 2007-2008 Steven Rostedt + * Copyright (C) 2008 Ingo Molnar + * + * Based on code from the latency_tracer, that is: + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ +#include +#include +#include +#include +#include +#include + +#include "trace.h" + +static struct trace_array *wakeup_trace; +static int __read_mostly tracer_enabled; + +static struct task_struct *wakeup_task; +static int wakeup_cpu; +static unsigned wakeup_prio = -1; + +static DEFINE_SPINLOCK(wakeup_lock); + +static void notrace __wakeup_reset(struct trace_array *tr); + +/* + * Should this new latency be reported/recorded? + */ +static int notrace report_latency(cycle_t delta) +{ + if (tracing_thresh) { + if (delta < tracing_thresh) + return 0; + } else { + if (delta <= tracing_max_latency) + return 0; + } + return 1; +} + +void notrace +wakeup_sched_switch(struct task_struct *prev, struct task_struct *next) +{ + unsigned long latency = 0, t0 = 0, t1 = 0; + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; + cycle_t T0, T1, delta; + unsigned long flags; + long disabled; + int cpu; + + if (unlikely(!tracer_enabled)) + return; + + /* + * When we start a new trace, we set wakeup_task to NULL + * and then set tracer_enabled = 1. We want to make sure + * that another CPU does not see the tracer_enabled = 1 + * and the wakeup_task with an older task, that might + * actually be the same as next. 
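+	 * The smp_rmb() below pairs with the smp_wmb() in
+	 * start_wakeup_tracer().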
+	 */
+	smp_rmb();
+
+	if (next != wakeup_task)
+		return;
+
+	/* The task we are waiting for is waking up */
+	data = tr->data[wakeup_cpu];
+
+	/* disable local data, not wakeup_cpu data */
+	cpu = raw_smp_processor_id();
+	disabled = atomic_inc_return(&tr->data[cpu]->disabled);
+	if (likely(disabled != 1))
+		goto out;
+
+	spin_lock_irqsave(&wakeup_lock, flags);
+
+	/* We could race with grabbing wakeup_lock */
+	if (unlikely(!tracer_enabled || next != wakeup_task))
+		goto out_unlock;
+
+	ftrace(tr, data, CALLER_ADDR1, CALLER_ADDR2, flags);
+
+	/*
+	 * usecs conversion is slow so we try to delay the conversion
+	 * as long as possible:
+	 */
+	T0 = data->preempt_timestamp;
+	T1 = now(cpu);
+	delta = T1-T0;
+
+	if (!report_latency(delta))
+		goto out_unlock;
+
+	latency = nsecs_to_usecs(delta);
+
+	tracing_max_latency = delta;
+	t0 = nsecs_to_usecs(T0);
+	t1 = nsecs_to_usecs(T1);
+
+	update_max_tr(tr, wakeup_task, wakeup_cpu);
+
+	if (tracing_thresh) {
+		printk(KERN_INFO "(%16s-%-5d|#%d): %lu us wakeup latency "
+		       "violates %lu us threshold.\n"
+		       " => started at timestamp %lu: ",
+				wakeup_task->comm, wakeup_task->pid,
+				raw_smp_processor_id(),
+				latency, nsecs_to_usecs(tracing_thresh), t0);
+	} else {
+		printk(KERN_INFO "(%16s-%-5d|#%d): new %lu us maximum "
+		       "wakeup latency.\n => started at timestamp %lu: ",
+				wakeup_task->comm, wakeup_task->pid,
+				cpu, latency, t0);
+	}
+
+	printk(KERN_CONT "   ended at timestamp %lu: ", t1);
+	dump_stack();
+	t1 = nsecs_to_usecs(now(cpu));
+	printk(KERN_CONT " dump-end timestamp %lu\n\n", t1);
+
+out_unlock:
+	__wakeup_reset(tr);
+	spin_unlock_irqrestore(&wakeup_lock, flags);
+out:
+	atomic_dec(&tr->data[cpu]->disabled);
+}
+
+static void notrace __wakeup_reset(struct trace_array *tr)
+{
+	struct trace_array_cpu *data;
+	int cpu;
+
+	assert_spin_locked(&wakeup_lock);
+
+	for_each_possible_cpu(cpu) {
+		data = tr->data[cpu];
+		tracing_reset(data);
+	}
+
+	wakeup_cpu = -1;
+	wakeup_prio = -1;
+
+	if (wakeup_task)
+		put_task_struct(wakeup_task);
+
+	wakeup_task = NULL;
+}
+
+static void notrace wakeup_reset(struct trace_array *tr)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&wakeup_lock, flags);
+	__wakeup_reset(tr);
+	spin_unlock_irqrestore(&wakeup_lock, flags);
+}
+
+static notrace void
+wakeup_check_start(struct trace_array *tr, struct task_struct *p,
+		   struct task_struct *curr)
+{
+	int cpu = smp_processor_id();
+	unsigned long flags;
+	long disabled;
+
+	if (likely(!rt_task(p)) ||
+			p->prio >= wakeup_prio ||
+			p->prio >= curr->prio)
+		return;
+
+	disabled = atomic_inc_return(&tr->data[cpu]->disabled);
+	if (unlikely(disabled != 1))
+		goto out;
+
+	/* interrupts should be off from try_to_wake_up */
+	spin_lock(&wakeup_lock);
+
+	/* check for races with a concurrent higher-priority wakeup */
+	if (!tracer_enabled || p->prio >= wakeup_prio)
+		goto out_locked;
+
+	/* reset the trace */
+	__wakeup_reset(tr);
+
+	wakeup_cpu = task_cpu(p);
+	wakeup_prio = p->prio;
+
+	wakeup_task = p;
+	get_task_struct(wakeup_task);
+
+	local_save_flags(flags);
+
+	tr->data[wakeup_cpu]->preempt_timestamp = now(cpu);
+	ftrace(tr, tr->data[wakeup_cpu], CALLER_ADDR1, CALLER_ADDR2, flags);
+
+out_locked:
+	spin_unlock(&wakeup_lock);
+out:
+	atomic_dec(&tr->data[cpu]->disabled);
+}
+
+notrace void
+ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr)
+{
+	if (likely(!tracer_enabled))
+		return;
+
+	wakeup_check_start(wakeup_trace, wakee, curr);
+}
+
+notrace void
+ftrace_wake_up_new_task(struct task_struct *wakee, struct task_struct *curr)
+{
+	if (likely(!tracer_enabled))
+		return;
+
+	wakeup_check_start(wakeup_trace, wakee, curr);
+}
+
+static notrace void start_wakeup_tracer(struct trace_array *tr)
+{
+	wakeup_reset(tr);
+
+	/*
+	 * Don't let the tracer_enabled = 1 show up before
+	 * the wakeup_task is reset. This may be overkill since
+	 * wakeup_reset does a spin_unlock after setting the
+	 * wakeup_task to NULL, but I want to be safe.
+	 * This is a slow path anyway.
+	 */
+	smp_wmb();
+
+	tracer_enabled = 1;
+
+	return;
+}
+
+static notrace void stop_wakeup_tracer(struct trace_array *tr)
+{
+	tracer_enabled = 0;
+}
+
+static notrace void wakeup_tracer_init(struct trace_array *tr)
+{
+	wakeup_trace = tr;
+
+	if (tr->ctrl)
+		start_wakeup_tracer(tr);
+}
+
+static notrace void wakeup_tracer_reset(struct trace_array *tr)
+{
+	if (tr->ctrl) {
+		stop_wakeup_tracer(tr);
+		/* make sure we put back any tasks we are tracing */
+		wakeup_reset(tr);
+	}
+}
+
+static void wakeup_tracer_ctrl_update(struct trace_array *tr)
+{
+	if (tr->ctrl)
+		start_wakeup_tracer(tr);
+	else
+		stop_wakeup_tracer(tr);
+}
+
+static void notrace wakeup_tracer_open(struct trace_iterator *iter)
+{
+	/* stop the trace while dumping */
+	if (iter->tr->ctrl)
+		stop_wakeup_tracer(iter->tr);
+}
+
+static void notrace wakeup_tracer_close(struct trace_iterator *iter)
+{
+	/* forget about any processes we were recording */
+	if (iter->tr->ctrl)
+		start_wakeup_tracer(iter->tr);
+}
+
+static struct tracer wakeup_tracer __read_mostly =
+{
+	.name		= "wakeup",
+	.init		= wakeup_tracer_init,
+	.reset		= wakeup_tracer_reset,
+	.open		= wakeup_tracer_open,
+	.close		= wakeup_tracer_close,
+	.ctrl_update	= wakeup_tracer_ctrl_update,
+	.print_max	= 1,
+};
+
+__init static int init_wakeup_tracer(void)
+{
+	int ret;
+
+	ret = register_tracer(&wakeup_tracer);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+device_initcall(init_wakeup_tracer);
-- 
cgit v1.2.3


From 81d68a96a39844853b37f20cc8282d9b65b78ef3 Mon Sep 17 00:00:00 2001
From: Steven Rostedt 
Date: Mon, 12 May 2008 21:20:42 +0200
Subject: ftrace: trace irq disabled critical timings

This patch adds latency tracing for critical timings
(how long interrupts are disabled for).

 "irqsoff" is added to /debugfs/tracing/available_tracers

Note:
  tracing_max_latency
    also holds the max latency for irqsoff (in usecs).
    (defaults to a large number, so one must start latency tracing)

  tracing_thresh
    threshold (in usecs) to always print out if irqs off
    is detected to be longer than stated here.
    If tracing_thresh is non-zero, then tracing_max_latency
    is ignored.
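Roughly, the reporting decision mirrors report_latency() in
trace_sched_wakeup.c from earlier in this series; a sketch (not
necessarily this tracer's exact helper):

	/* report if over the threshold, or if a new maximum was hit */
	static int notrace report_latency(cycle_t delta)
	{
		if (tracing_thresh)
			return delta >= tracing_thresh;
		return delta > tracing_max_latency;
	}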
Here's an example of a trace with ftrace_enabled = 0 ======= preemption latency trace v1.1.5 on 2.6.24-rc7 Signed-off-by: Ingo Molnar -------------------------------------------------------------------- latency: 100 us, #3/3, CPU#1 | (M:rt VP:0, KP:0, SP:0 HP:0 #P:2) ----------------- | task: swapper-0 (uid:0 nice:0 policy:0 rt_prio:0) ----------------- => started at: _spin_lock_irqsave+0x2a/0xb7 => ended at: _spin_unlock_irqrestore+0x32/0x5f _------=> CPU# / _-----=> irqs-off | / _----=> need-resched || / _---=> hardirq/softirq ||| / _--=> preempt-depth |||| / ||||| delay cmd pid ||||| time | caller \ / ||||| \ | / swapper-0 1d.s3 0us+: _spin_lock_irqsave+0x2a/0xb7 (e1000_update_stats+0x47/0x64c [e1000]) swapper-0 1d.s3 100us : _spin_unlock_irqrestore+0x32/0x5f (e1000_update_stats+0x641/0x64c [e1000]) swapper-0 1d.s3 100us : trace_hardirqs_on_caller+0x75/0x89 (_spin_unlock_irqrestore+0x32/0x5f) vim:ft=help ======= And this is a trace with ftrace_enabled == 1 ======= preemption latency trace v1.1.5 on 2.6.24-rc7 -------------------------------------------------------------------- latency: 102 us, #12/12, CPU#1 | (M:rt VP:0, KP:0, SP:0 HP:0 #P:2) ----------------- | task: swapper-0 (uid:0 nice:0 policy:0 rt_prio:0) ----------------- => started at: _spin_lock_irqsave+0x2a/0xb7 => ended at: _spin_unlock_irqrestore+0x32/0x5f _------=> CPU# / _-----=> irqs-off | / _----=> need-resched || / _---=> hardirq/softirq ||| / _--=> preempt-depth |||| / ||||| delay cmd pid ||||| time | caller \ / ||||| \ | / swapper-0 1dNs3 0us+: _spin_lock_irqsave+0x2a/0xb7 (e1000_update_stats+0x47/0x64c [e1000]) swapper-0 1dNs3 46us : e1000_read_phy_reg+0x16/0x225 [e1000] (e1000_update_stats+0x5e2/0x64c [e1000]) swapper-0 1dNs3 46us : e1000_swfw_sync_acquire+0x10/0x99 [e1000] (e1000_read_phy_reg+0x49/0x225 [e1000]) swapper-0 1dNs3 46us : e1000_get_hw_eeprom_semaphore+0x12/0xa6 [e1000] (e1000_swfw_sync_acquire+0x36/0x99 [e1000]) swapper-0 1dNs3 47us : __const_udelay+0x9/0x47 (e1000_read_phy_reg+0x116/0x225 [e1000]) swapper-0 1dNs3 47us+: __delay+0x9/0x50 (__const_udelay+0x45/0x47) swapper-0 1dNs3 97us : preempt_schedule+0xc/0x84 (__delay+0x4e/0x50) swapper-0 1dNs3 98us : e1000_swfw_sync_release+0xc/0x55 [e1000] (e1000_read_phy_reg+0x211/0x225 [e1000]) swapper-0 1dNs3 99us+: e1000_put_hw_eeprom_semaphore+0x9/0x35 [e1000] (e1000_swfw_sync_release+0x50/0x55 [e1000]) swapper-0 1dNs3 101us : _spin_unlock_irqrestore+0xe/0x5f (e1000_update_stats+0x641/0x64c [e1000]) swapper-0 1dNs3 102us : _spin_unlock_irqrestore+0x32/0x5f (e1000_update_stats+0x641/0x64c [e1000]) swapper-0 1dNs3 102us : trace_hardirqs_on_caller+0x75/0x89 (_spin_unlock_irqrestore+0x32/0x5f) vim:ft=help ======= Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/process_64.c | 3 + arch/x86/lib/Makefile | 1 + arch/x86/lib/thunk_32.S | 47 +++++ arch/x86/lib/thunk_64.S | 19 +- include/asm-x86/irqflags.h | 24 +-- include/linux/ftrace.h | 8 + include/linux/irqflags.h | 12 +- kernel/fork.c | 2 +- kernel/lockdep.c | 23 ++- kernel/printk.c | 2 + kernel/trace/Kconfig | 18 ++ kernel/trace/Makefile | 1 + kernel/trace/trace_irqsoff.c | 402 +++++++++++++++++++++++++++++++++++++++++++ 13 files changed, 531 insertions(+), 31 deletions(-) create mode 100644 arch/x86/lib/thunk_32.S create mode 100644 kernel/trace/trace_irqsoff.c (limited to 'include/linux') diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index e2319f39988b..dd349c92f051 100644 --- a/arch/x86/kernel/process_64.c +++ 
b/arch/x86/kernel/process_64.c
@@ -165,7 +165,10 @@ void cpu_idle(void)
 			 */
 			local_irq_disable();
 			enter_idle();
+			/* Don't trace irqs off for idle */
+			stop_critical_timings();
 			idle();
+			start_critical_timings();
 			/* In many cases the interrupt that ended idle
 			   has already called exit_idle. But some idle
 			   loops can be woken up without interrupt. */
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 76f60f52a885..84aa2883fe15 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -5,6 +5,7 @@
 obj-$(CONFIG_SMP) := msr-on-cpu.o
 
 lib-y := delay_$(BITS).o
+lib-y += thunk_$(BITS).o
 lib-y += usercopy_$(BITS).o getuser_$(BITS).o putuser_$(BITS).o
 lib-y += memcpy_$(BITS).o
diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S
new file mode 100644
index 000000000000..650b11e00ecc
--- /dev/null
+++ b/arch/x86/lib/thunk_32.S
@@ -0,0 +1,47 @@
+/*
+ * Trampoline to trace irqs off. (otherwise CALLER_ADDR1 might crash)
+ * Copyright 2008 by Steven Rostedt, Red Hat, Inc
+ *  (inspired by Andi Kleen's thunk_64.S)
+ * Subject to the GNU public license, v.2. No warranty of any kind.
+ */
+
+	#include 
+
+#define ARCH_TRACE_IRQS_ON			\
+	pushl %eax;				\
+	pushl %ecx;				\
+	pushl %edx;				\
+	call trace_hardirqs_on;			\
+	popl %edx;				\
+	popl %ecx;				\
+	popl %eax;
+
+#define ARCH_TRACE_IRQS_OFF			\
+	pushl %eax;				\
+	pushl %ecx;				\
+	pushl %edx;				\
+	call trace_hardirqs_off;		\
+	popl %edx;				\
+	popl %ecx;				\
+	popl %eax;
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+	/* put return address in eax (arg1) */
+	.macro thunk_ra name,func
+	.globl \name
+\name:
+	pushl %eax
+	pushl %ecx
+	pushl %edx
+	/* Place EIP in the arg1 */
+	movl 3*4(%esp), %eax
+	call \func
+	popl %edx
+	popl %ecx
+	popl %eax
+	ret
+	.endm
+
+	thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller
+	thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller
+#endif
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S
index e009251d4e9f..bf9a7d5a5428 100644
--- a/arch/x86/lib/thunk_64.S
+++ b/arch/x86/lib/thunk_64.S
@@ -2,6 +2,7 @@
  * Save registers before calling assembly functions. This avoids
  * disturbance of register allocation in some inline assembly constructs.
  * Copyright 2001,2002 by Andi Kleen, SuSE Labs.
+ * Added trace_hardirqs callers - Copyright 2007 Steven Rostedt, Red Hat, Inc.
  * Subject to the GNU public license, v.2. No warranty of any kind.
  */
 
@@ -42,8 +43,22 @@
 #endif
 
 #ifdef CONFIG_TRACE_IRQFLAGS
-	thunk trace_hardirqs_on_thunk,trace_hardirqs_on
-	thunk trace_hardirqs_off_thunk,trace_hardirqs_off
+	/* put return address in rdi (arg1) */
+	.macro thunk_ra name,func
+	.globl \name
+\name:
+	CFI_STARTPROC
+	SAVE_ARGS
+	/* SAVE_ARGS pushes 9 elements */
+	/* the next element would be the rip */
+	movq 9*8(%rsp), %rdi
+	call \func
+	jmp  restore
+	CFI_ENDPROC
+	.endm
+
+	thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller
+	thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller
 #endif
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
diff --git a/include/asm-x86/irqflags.h b/include/asm-x86/irqflags.h
index c242527f970e..24d71b1eb189 100644
--- a/include/asm-x86/irqflags.h
+++ b/include/asm-x86/irqflags.h
@@ -179,8 +179,6 @@ static inline void trace_hardirqs_fixup(void)
  * have a reliable stack. x86_64 only.
*/ #define SWAPGS_UNSAFE_STACK swapgs -#define ARCH_TRACE_IRQS_ON call trace_hardirqs_on_thunk -#define ARCH_TRACE_IRQS_OFF call trace_hardirqs_off_thunk #define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk #define ARCH_LOCKDEP_SYS_EXIT_IRQ \ TRACE_IRQS_ON; \ @@ -192,24 +190,6 @@ static inline void trace_hardirqs_fixup(void) TRACE_IRQS_OFF; #else -#define ARCH_TRACE_IRQS_ON \ - pushl %eax; \ - pushl %ecx; \ - pushl %edx; \ - call trace_hardirqs_on; \ - popl %edx; \ - popl %ecx; \ - popl %eax; - -#define ARCH_TRACE_IRQS_OFF \ - pushl %eax; \ - pushl %ecx; \ - pushl %edx; \ - call trace_hardirqs_off; \ - popl %edx; \ - popl %ecx; \ - popl %eax; - #define ARCH_LOCKDEP_SYS_EXIT \ pushl %eax; \ pushl %ecx; \ @@ -223,8 +203,8 @@ static inline void trace_hardirqs_fixup(void) #endif #ifdef CONFIG_TRACE_IRQFLAGS -# define TRACE_IRQS_ON ARCH_TRACE_IRQS_ON -# define TRACE_IRQS_OFF ARCH_TRACE_IRQS_OFF +# define TRACE_IRQS_ON call trace_hardirqs_on_thunk; +# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk; #else # define TRACE_IRQS_ON # define TRACE_IRQS_OFF diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index db8a5e7abe41..0a20445dcbcc 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -50,4 +50,12 @@ extern void mcount(void); # define CALLER_ADDR5 0UL #endif +#ifdef CONFIG_IRQSOFF_TRACER + extern void notrace time_hardirqs_on(unsigned long a0, unsigned long a1); + extern void notrace time_hardirqs_off(unsigned long a0, unsigned long a1); +#else +# define time_hardirqs_on(a0, a1) do { } while (0) +# define time_hardirqs_off(a0, a1) do { } while (0) +#endif + #endif /* _LINUX_FTRACE_H */ diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index e600c4e9b8c5..5b711d4e9fd9 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -12,10 +12,10 @@ #define _LINUX_TRACE_IRQFLAGS_H #ifdef CONFIG_TRACE_IRQFLAGS - extern void trace_hardirqs_on(void); - extern void trace_hardirqs_off(void); extern void trace_softirqs_on(unsigned long ip); extern void trace_softirqs_off(unsigned long ip); + extern void trace_hardirqs_on(void); + extern void trace_hardirqs_off(void); # define trace_hardirq_context(p) ((p)->hardirq_context) # define trace_softirq_context(p) ((p)->softirq_context) # define trace_hardirqs_enabled(p) ((p)->hardirqs_enabled) @@ -41,6 +41,14 @@ # define INIT_TRACE_IRQFLAGS #endif +#ifdef CONFIG_IRQSOFF_TRACER + extern void stop_critical_timings(void); + extern void start_critical_timings(void); +#else +# define stop_critical_timings() do { } while (0) +# define start_critical_timings() do { } while (0) +#endif + #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT #include diff --git a/kernel/fork.c b/kernel/fork.c index 19908b26cf80..d66d676dc362 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -909,7 +909,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, rt_mutex_init_task(p); -#ifdef CONFIG_TRACE_IRQFLAGS +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_LOCKDEP) DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 81a4e4a3f087..e21924365ea3 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -39,6 +39,7 @@ #include #include #include +#include #include @@ -982,7 +983,7 @@ check_noncircular(struct lock_class *source, unsigned int depth) return 1; } -#ifdef CONFIG_TRACE_IRQFLAGS +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) /* * Forwards and backwards subgraph searching, for the purposes of * proving that 
two subgraphs can be connected by a new dependency @@ -1680,7 +1681,7 @@ valid_state(struct task_struct *curr, struct held_lock *this, static int mark_lock(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit); -#ifdef CONFIG_TRACE_IRQFLAGS +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) /* * print irq inversion bug: @@ -2013,11 +2014,13 @@ void early_boot_irqs_on(void) /* * Hardirqs will be enabled: */ -void trace_hardirqs_on(void) +void notrace trace_hardirqs_on_caller(unsigned long a0) { struct task_struct *curr = current; unsigned long ip; + time_hardirqs_on(CALLER_ADDR0, a0); + if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -2055,16 +2058,23 @@ void trace_hardirqs_on(void) curr->hardirq_enable_event = ++curr->irq_events; debug_atomic_inc(&hardirqs_on_events); } +EXPORT_SYMBOL(trace_hardirqs_on_caller); +void notrace trace_hardirqs_on(void) +{ + trace_hardirqs_on_caller(CALLER_ADDR0); +} EXPORT_SYMBOL(trace_hardirqs_on); /* * Hardirqs were disabled: */ -void trace_hardirqs_off(void) +void notrace trace_hardirqs_off_caller(unsigned long a0) { struct task_struct *curr = current; + time_hardirqs_off(CALLER_ADDR0, a0); + if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -2082,7 +2092,12 @@ void trace_hardirqs_off(void) } else debug_atomic_inc(&redundant_hardirqs_off); } +EXPORT_SYMBOL(trace_hardirqs_off_caller); +void notrace trace_hardirqs_off(void) +{ + trace_hardirqs_off_caller(CALLER_ADDR0); +} EXPORT_SYMBOL(trace_hardirqs_off); /* diff --git a/kernel/printk.c b/kernel/printk.c index 8fb01c32aa3b..ae7d5b9e535d 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1041,7 +1041,9 @@ void release_console_sem(void) _log_end = log_end; con_start = log_end; /* Flush */ spin_unlock(&logbuf_lock); + stop_critical_timings(); /* don't trace print latency */ call_console_drivers(_con_start, _log_end); + start_critical_timings(); local_irq_restore(flags); } console_locked = 0; diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 892ecc94a82b..896df1cf6adc 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -26,6 +26,24 @@ config FTRACE (the bootup default), then the overhead of the instructions is very small and not measurable even in micro-benchmarks. +config IRQSOFF_TRACER + bool "Interrupts-off Latency Tracer" + default n + depends on TRACE_IRQFLAGS_SUPPORT + depends on GENERIC_TIME + select TRACE_IRQFLAGS + select TRACING + select TRACER_MAX_TRACE + help + This option measures the time spent in irqs-off critical + sections, with microsecond accuracy. 
+ + The default measurement method is a maximum search, which is + disabled by default and can be runtime (re-)started + via: + + echo 0 > /debugfs/tracing/tracing_max_latency + config SCHED_TRACER bool "Scheduling Latency Tracer" depends on DEBUG_KERNEL diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 5508cdb19aea..46be8647fb65 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -3,6 +3,7 @@ obj-$(CONFIG_FTRACE) += libftrace.o obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_FTRACE) += trace_functions.o +obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c new file mode 100644 index 000000000000..a9131b0cf1a5 --- /dev/null +++ b/kernel/trace/trace_irqsoff.c @@ -0,0 +1,402 @@ +/* + * trace irqs off critical timings + * + * Copyright (C) 2007-2008 Steven Rostedt + * Copyright (C) 2008 Ingo Molnar + * + * From code in the latency_tracer, that is: + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ +#include +#include +#include +#include +#include +#include + +#include "trace.h" + +static struct trace_array *irqsoff_trace __read_mostly; +static int tracer_enabled __read_mostly; + +/* + * Sequence count - we record it when starting a measurement and + * skip the latency if the sequence has changed - some other section + * did a maximum and could disturb our measurement with serial console + * printouts, etc. Truly coinciding maximum latencies should be rare + * and what happens together happens separately as well, so this doesn't + * decrease the validity of the maximum found: + */ +static __cacheline_aligned_in_smp unsigned long max_sequence; + +#ifdef CONFIG_FTRACE +/* + * irqsoff uses its own tracer function to keep the overhead down: + */ +static void notrace +irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + + if (likely(!tracer_enabled)) + return; + + local_save_flags(flags); + + if (!irqs_disabled_flags(flags)) + return; + + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) + ftrace(tr, data, ip, parent_ip, flags); + + atomic_dec(&data->disabled); +} + +static struct ftrace_ops trace_ops __read_mostly = +{ + .func = irqsoff_tracer_call, +}; +#endif /* CONFIG_FTRACE */ + +/* + * Should this new latency be reported/recorded?
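+ * With a threshold set, report every section longer than the
+ * threshold; otherwise report only sections that beat the
+ * current maximum latency.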
+ */ +static int notrace report_latency(cycle_t delta) +{ + if (tracing_thresh) { + if (delta < tracing_thresh) + return 0; + } else { + if (delta <= tracing_max_latency) + return 0; + } + return 1; +} + +static void notrace +check_critical_timing(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long parent_ip, + int cpu) +{ + unsigned long latency, t0, t1; + cycle_t T0, T1, T2, delta; + unsigned long flags; + + /* + * usecs conversion is slow so we try to delay the conversion + * as long as possible: + */ + T0 = data->preempt_timestamp; + T1 = now(cpu); + delta = T1-T0; + + local_save_flags(flags); + + if (!report_latency(delta)) + goto out; + + ftrace(tr, data, CALLER_ADDR0, parent_ip, flags); + /* + * Update the timestamp, because the trace entry above + * might change it (it can only get larger so the latency + * is fair to be reported): + */ + T2 = now(cpu); + + delta = T2-T0; + + latency = nsecs_to_usecs(delta); + + if (data->critical_sequence != max_sequence) + goto out; + + tracing_max_latency = delta; + t0 = nsecs_to_usecs(T0); + t1 = nsecs_to_usecs(T1); + + data->critical_end = parent_ip; + + update_max_tr_single(tr, current, cpu); + + if (tracing_thresh) + printk(KERN_INFO "(%16s-%-5d|#%d): %lu us critical section " + "violates %lu us threshold.\n" + " => started at timestamp %lu: ", + current->comm, current->pid, + raw_smp_processor_id(), + latency, nsecs_to_usecs(tracing_thresh), t0); + else + printk(KERN_INFO "(%16s-%-5d|#%d):" + " new %lu us maximum-latency " + "critical section.\n => started at timestamp %lu: ", + current->comm, current->pid, + raw_smp_processor_id(), + latency, t0); + + print_symbol(KERN_CONT "<%s>\n", data->critical_start); + printk(KERN_CONT " => ended at timestamp %lu: ", t1); + print_symbol(KERN_CONT "<%s>\n", data->critical_end); + dump_stack(); + t1 = nsecs_to_usecs(now(cpu)); + printk(KERN_CONT " => dump-end timestamp %lu\n\n", t1); + + max_sequence++; + +out: + data->critical_sequence = max_sequence; + data->preempt_timestamp = now(cpu); + tracing_reset(data); + ftrace(tr, data, CALLER_ADDR0, parent_ip, flags); +} + +static inline void notrace +start_critical_timing(unsigned long ip, unsigned long parent_ip) +{ + int cpu; + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + + if (likely(!tracer_enabled)) + return; + + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + + if (unlikely(!data) || unlikely(!data->trace) || + data->critical_start || atomic_read(&data->disabled)) + return; + + atomic_inc(&data->disabled); + + data->critical_sequence = max_sequence; + data->preempt_timestamp = now(cpu); + data->critical_start = parent_ip; + tracing_reset(data); + + local_save_flags(flags); + ftrace(tr, data, ip, parent_ip, flags); + + atomic_dec(&data->disabled); +} + +static inline void notrace +stop_critical_timing(unsigned long ip, unsigned long parent_ip) +{ + int cpu; + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + + if (likely(!tracer_enabled)) + return; + + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + + if (unlikely(!data) || unlikely(!data->trace) || + !data->critical_start || atomic_read(&data->disabled)) + return; + + atomic_inc(&data->disabled); + local_save_flags(flags); + ftrace(tr, data, ip, parent_ip, flags); + check_critical_timing(tr, data, parent_ip, cpu); + data->critical_start = 0; + atomic_dec(&data->disabled); +} + +void notrace start_critical_timings(void) +{ + unsigned long flags; + + 
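+	/* time only sections that actually run with irqs disabled */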
local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); +} + +void notrace stop_critical_timings(void) +{ + unsigned long flags; + + local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); +} + +#ifdef CONFIG_PROVE_LOCKING +void notrace time_hardirqs_on(unsigned long a0, unsigned long a1) +{ + unsigned long flags; + + local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + stop_critical_timing(a0, a1); +} + +void notrace time_hardirqs_off(unsigned long a0, unsigned long a1) +{ + unsigned long flags; + + local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + start_critical_timing(a0, a1); +} + +#else /* !CONFIG_PROVE_LOCKING */ + +/* + * Stubs: + */ + +void early_boot_irqs_off(void) +{ +} + +void early_boot_irqs_on(void) +{ +} + +void trace_softirqs_on(unsigned long ip) +{ +} + +void trace_softirqs_off(unsigned long ip) +{ +} + +inline void print_irqtrace_events(struct task_struct *curr) +{ +} + +/* + * We are only interested in hardirq on/off events: + */ +void notrace trace_hardirqs_on(void) +{ + unsigned long flags; + + local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); +} +EXPORT_SYMBOL(trace_hardirqs_on); + +void notrace trace_hardirqs_off(void) +{ + unsigned long flags; + + local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); +} +EXPORT_SYMBOL(trace_hardirqs_off); + +void notrace trace_hardirqs_on_caller(unsigned long caller_addr) +{ + unsigned long flags; + + local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + stop_critical_timing(CALLER_ADDR0, caller_addr); +} +EXPORT_SYMBOL(trace_hardirqs_on_caller); + +void notrace trace_hardirqs_off_caller(unsigned long caller_addr) +{ + unsigned long flags; + + local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + start_critical_timing(CALLER_ADDR0, caller_addr); +} +EXPORT_SYMBOL(trace_hardirqs_off_caller); + +#endif /* CONFIG_PROVE_LOCKING */ + +static void start_irqsoff_tracer(struct trace_array *tr) +{ + tracer_enabled = 1; + register_ftrace_function(&trace_ops); +} + +static void stop_irqsoff_tracer(struct trace_array *tr) +{ + unregister_ftrace_function(&trace_ops); + tracer_enabled = 0; +} + +static void irqsoff_tracer_init(struct trace_array *tr) +{ + irqsoff_trace = tr; + /* make sure that the tracer is visibel */ + smp_wmb(); + + if (tr->ctrl) + start_irqsoff_tracer(tr); +} + +static void irqsoff_tracer_reset(struct trace_array *tr) +{ + if (tr->ctrl) + stop_irqsoff_tracer(tr); +} + +static void irqsoff_tracer_ctrl_update(struct trace_array *tr) +{ + if (tr->ctrl) + start_irqsoff_tracer(tr); + else + stop_irqsoff_tracer(tr); +} + +static void notrace irqsoff_tracer_open(struct trace_iterator *iter) +{ + /* stop the trace while dumping */ + if (iter->tr->ctrl) + stop_irqsoff_tracer(iter->tr); +} + +static void notrace irqsoff_tracer_close(struct trace_iterator *iter) +{ + if (iter->tr->ctrl) + start_irqsoff_tracer(iter->tr); +} + +static struct tracer irqsoff_tracer __read_mostly = +{ + .name = "irqsoff", + .init = irqsoff_tracer_init, + .reset = irqsoff_tracer_reset, + .open = irqsoff_tracer_open, + .close = irqsoff_tracer_close, + .ctrl_update = irqsoff_tracer_ctrl_update, + .print_max = 1, +}; + +__init static int init_irqsoff_tracer(void) +{ + register_tracer(&irqsoff_tracer); + + return 0; +} +device_initcall(init_irqsoff_tracer); -- cgit v1.2.3 From 
6cd8a4bb2f97527a9ceb30bc77ea4e959c6a95e3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:42 +0200 Subject: ftrace: trace preempt off critical timings Add preempt off timings. A lot of kernel core code is taken from the RT patch latency trace that was written by Ingo Molnar. This adds "preemptoff" and "preemptirqsoff" to /debugfs/tracing/available_tracers. Now, instead of just tracing irqs off, preemption off can be selected to be recorded. When this is selected, it shares the same files as the irqs-off timings. One can trace just preemption off, just irqs off, or the combined time that either is disabled. By echoing "preemptoff" into /debugfs/tracing/current_tracer, recording of preempt off only is performed. "irqsoff" will only record the time irqs are disabled, but "preemptirqsoff" will take the total time irqs or preemption are disabled. Runtime switching of these options is now supported by simply echoing the appropriate tracer name into /debugfs/tracing/current_tracer. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/process_32.c | 3 + include/linux/ftrace.h | 8 ++ include/linux/irqflags.h | 3 +- include/linux/preempt.h | 2 +- kernel/sched.c | 24 +++++- kernel/trace/Kconfig | 25 ++++++ kernel/trace/Makefile | 1 + kernel/trace/trace_irqsoff.c | 184 +++++++++++++++++++++++++++++++------------ 8 files changed, 197 insertions(+), 53 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index f8476dfbb60d..a30aa1f2607a 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -185,7 +185,10 @@ void cpu_idle(void) local_irq_disable(); __get_cpu_var(irq_stat).idle_timestamp = jiffies; + /* Don't trace irqs off for idle */ + stop_critical_timings(); idle(); + start_critical_timings(); } tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 0a20445dcbcc..740c97dcf9cb 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -58,4 +58,12 @@ extern void mcount(void); # define time_hardirqs_off(a0, a1) do { } while (0) #endif +#ifdef CONFIG_PREEMPT_TRACER + extern void notrace trace_preempt_on(unsigned long a0, unsigned long a1); + extern void notrace trace_preempt_off(unsigned long a0, unsigned long a1); +#else +# define trace_preempt_on(a0, a1) do { } while (0) +# define trace_preempt_off(a0, a1) do { } while (0) +#endif + #endif /* _LINUX_FTRACE_H */ diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 5b711d4e9fd9..2b1c2e58566e 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -41,7 +41,8 @@ # define INIT_TRACE_IRQFLAGS #endif -#ifdef CONFIG_IRQSOFF_TRACER +#if defined(CONFIG_IRQSOFF_TRACER) || \ + defined(CONFIG_PREEMPT_TRACER) extern void stop_critical_timings(void); extern void start_critical_timings(void); #else diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 36b03d50bf40..72b1a10a59b6 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -10,7 +10,7 @@ #include #include -#ifdef CONFIG_DEBUG_PREEMPT +#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) extern void add_preempt_count(int val); extern void sub_preempt_count(int val); #else diff --git a/kernel/sched.c b/kernel/sched.c index 73e600852365..328494e28df2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -70,6 +70,7 @@ #include #include #include +#include #include #include @@ -4365,26 +4366,44 @@ void
scheduler_tick(void) #endif } -#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) +#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ + defined(CONFIG_PREEMPT_TRACER)) + +static inline unsigned long get_parent_ip(unsigned long addr) +{ + if (in_lock_functions(addr)) { + addr = CALLER_ADDR2; + if (in_lock_functions(addr)) + addr = CALLER_ADDR3; + } + return addr; +} void __kprobes add_preempt_count(int val) { +#ifdef CONFIG_DEBUG_PREEMPT /* * Underflow? */ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) return; +#endif preempt_count() += val; +#ifdef CONFIG_DEBUG_PREEMPT /* * Spinlock count overflowing soon? */ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK - 10); +#endif + if (preempt_count() == val) + trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); } EXPORT_SYMBOL(add_preempt_count); void __kprobes sub_preempt_count(int val) { +#ifdef CONFIG_DEBUG_PREEMPT /* * Underflow? */ @@ -4396,7 +4415,10 @@ void __kprobes sub_preempt_count(int val) if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK))) return; +#endif + if (preempt_count() == val) + trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); preempt_count() -= val; } EXPORT_SYMBOL(sub_preempt_count); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 896df1cf6adc..6430016b98e8 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -44,6 +44,31 @@ config IRQSOFF_TRACER echo 0 > /debugfs/tracing/tracing_max_latency + (Note that kernel size and overhead increases with this option + enabled. This option and the preempt-off timing option can be + used together or separately.) + +config PREEMPT_TRACER + bool "Preemption-off Latency Tracer" + default n + depends on GENERIC_TIME + depends on PREEMPT + select TRACING + select TRACER_MAX_TRACE + help + This option measures the time spent in preemption off critical + sections, with microsecond accuracy. + + The default measurement method is a maximum search, which is + disabled by default and can be runtime (re-)started + via: + + echo 0 > /debugfs/tracing/tracing_max_latency + + (Note that kernel size and overhead increases with this option + enabled. This option and the irqs-off timing option can be + used together or separately.) 
+ config SCHED_TRACER bool "Scheduling Latency Tracer" depends on DEBUG_KERNEL diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 46be8647fb65..3fec653d6533 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -4,6 +4,7 @@ obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_FTRACE) += trace_functions.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o +obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index a9131b0cf1a5..8b1231633dc5 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -21,6 +21,36 @@ static struct trace_array *irqsoff_trace __read_mostly; static int tracer_enabled __read_mostly; +static DEFINE_PER_CPU(int, tracing_cpu); + +enum { + TRACER_IRQS_OFF = (1 << 1), + TRACER_PREEMPT_OFF = (1 << 2), +}; + +static int trace_type __read_mostly; + +#ifdef CONFIG_PREEMPT_TRACER +static inline int notrace +preempt_trace(void) +{ + return ((trace_type & TRACER_PREEMPT_OFF) && preempt_count()); +} +#else +# define preempt_trace() (0) +#endif + +#ifdef CONFIG_IRQSOFF_TRACER +static inline int notrace +irq_trace(void) +{ + return ((trace_type & TRACER_IRQS_OFF) && + irqs_disabled()); +} +#else +# define irq_trace() (0) +#endif + /* * Sequence count - we record it when starting a measurement and * skip the latency if the sequence has changed - some other section @@ -44,14 +74,11 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) long disabled; int cpu; - if (likely(!tracer_enabled)) + if (likely(!__get_cpu_var(tracing_cpu))) return; local_save_flags(flags); - if (!irqs_disabled_flags(flags)) - return; - cpu = raw_smp_processor_id(); data = tr->data[cpu]; disabled = atomic_inc_return(&data->disabled); @@ -171,23 +198,29 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) if (likely(!tracer_enabled)) return; + if (__get_cpu_var(tracing_cpu)) + return; + cpu = raw_smp_processor_id(); data = tr->data[cpu]; if (unlikely(!data) || unlikely(!data->trace) || - data->critical_start || atomic_read(&data->disabled)) + atomic_read(&data->disabled)) return; atomic_inc(&data->disabled); data->critical_sequence = max_sequence; data->preempt_timestamp = now(cpu); - data->critical_start = parent_ip; + data->critical_start = parent_ip ? : ip; tracing_reset(data); local_save_flags(flags); + ftrace(tr, data, ip, parent_ip, flags); + __get_cpu_var(tracing_cpu) = 1; + atomic_dec(&data->disabled); } @@ -199,7 +232,13 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) struct trace_array_cpu *data; unsigned long flags; - if (likely(!tracer_enabled)) + /* Always clear the tracing cpu on stopping the trace */ + if (unlikely(__get_cpu_var(tracing_cpu))) + __get_cpu_var(tracing_cpu) = 0; + else + return; + + if (!tracer_enabled) return; cpu = raw_smp_processor_id(); @@ -212,49 +251,35 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) atomic_inc(&data->disabled); local_save_flags(flags); ftrace(tr, data, ip, parent_ip, flags); - check_critical_timing(tr, data, parent_ip, cpu); + check_critical_timing(tr, data, parent_ip ? 
: ip, cpu); data->critical_start = 0; atomic_dec(&data->disabled); } +/* start and stop critical timings used to for stoppage (in idle) */ void notrace start_critical_timings(void) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (preempt_trace() || irq_trace()) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } void notrace stop_critical_timings(void) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (preempt_trace() || irq_trace()) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } +#ifdef CONFIG_IRQSOFF_TRACER #ifdef CONFIG_PROVE_LOCKING void notrace time_hardirqs_on(unsigned long a0, unsigned long a1) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (!preempt_trace() && irq_trace()) stop_critical_timing(a0, a1); } void notrace time_hardirqs_off(unsigned long a0, unsigned long a1) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (!preempt_trace() && irq_trace()) start_critical_timing(a0, a1); } @@ -289,49 +314,46 @@ inline void print_irqtrace_events(struct task_struct *curr) */ void notrace trace_hardirqs_on(void) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } EXPORT_SYMBOL(trace_hardirqs_on); void notrace trace_hardirqs_off(void) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } EXPORT_SYMBOL(trace_hardirqs_off); void notrace trace_hardirqs_on_caller(unsigned long caller_addr) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, caller_addr); } EXPORT_SYMBOL(trace_hardirqs_on_caller); void notrace trace_hardirqs_off_caller(unsigned long caller_addr) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, caller_addr); } EXPORT_SYMBOL(trace_hardirqs_off_caller); #endif /* CONFIG_PROVE_LOCKING */ +#endif /* CONFIG_IRQSOFF_TRACER */ + +#ifdef CONFIG_PREEMPT_TRACER +void notrace trace_preempt_on(unsigned long a0, unsigned long a1) +{ + stop_critical_timing(a0, a1); +} + +void notrace trace_preempt_off(unsigned long a0, unsigned long a1) +{ + start_critical_timing(a0, a1); +} +#endif /* CONFIG_PREEMPT_TRACER */ static void start_irqsoff_tracer(struct trace_array *tr) { @@ -345,7 +367,7 @@ static void stop_irqsoff_tracer(struct trace_array *tr) tracer_enabled = 0; } -static void irqsoff_tracer_init(struct trace_array *tr) +static void __irqsoff_tracer_init(struct trace_array *tr) { irqsoff_trace = tr; /* make sure that the tracer is visibel */ @@ -382,6 +404,13 @@ static void notrace irqsoff_tracer_close(struct trace_iterator *iter) start_irqsoff_tracer(iter->tr); } +#ifdef CONFIG_IRQSOFF_TRACER +static void irqsoff_tracer_init(struct trace_array *tr) +{ + trace_type = TRACER_IRQS_OFF; + + __irqsoff_tracer_init(tr); +} static struct tracer irqsoff_tracer __read_mostly = { .name = "irqsoff", @@ -392,10 +421,65 @@ static struct tracer irqsoff_tracer __read_mostly = .ctrl_update = irqsoff_tracer_ctrl_update, .print_max = 1, }; +# define register_irqsoff(trace) register_tracer(&trace) +#else +# define register_irqsoff(trace) do { } 
while (0) +#endif + +#ifdef CONFIG_PREEMPT_TRACER +static void preemptoff_tracer_init(struct trace_array *tr) +{ + trace_type = TRACER_PREEMPT_OFF; + + __irqsoff_tracer_init(tr); +} + +static struct tracer preemptoff_tracer __read_mostly = +{ + .name = "preemptoff", + .init = preemptoff_tracer_init, + .reset = irqsoff_tracer_reset, + .open = irqsoff_tracer_open, + .close = irqsoff_tracer_close, + .ctrl_update = irqsoff_tracer_ctrl_update, + .print_max = 1, +}; +# define register_preemptoff(trace) register_tracer(&trace) +#else +# define register_preemptoff(trace) do { } while (0) +#endif + +#if defined(CONFIG_IRQSOFF_TRACER) && \ + defined(CONFIG_PREEMPT_TRACER) + +static void preemptirqsoff_tracer_init(struct trace_array *tr) +{ + trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF; + + __irqsoff_tracer_init(tr); +} + +static struct tracer preemptirqsoff_tracer __read_mostly = +{ + .name = "preemptirqsoff", + .init = preemptirqsoff_tracer_init, + .reset = irqsoff_tracer_reset, + .open = irqsoff_tracer_open, + .close = irqsoff_tracer_close, + .ctrl_update = irqsoff_tracer_ctrl_update, + .print_max = 1, +}; + +# define register_preemptirqsoff(trace) register_tracer(&trace) +#else +# define register_preemptirqsoff(trace) do { } while (0) +#endif __init static int init_irqsoff_tracer(void) { - register_tracer(&irqsoff_tracer); + register_irqsoff(irqsoff_tracer); + register_preemptoff(preemptoff_tracer); + register_preemptirqsoff(preemptirqsoff_tracer); return 0; } -- cgit v1.2.3 From 3d0833953e1b98b79ddf491dd49229eef9baeac1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:42 +0200 Subject: ftrace: dynamic enabling/disabling of function calls This patch adds a feature to dynamically replace the ftrace code with the jmps to allow a kernel with ftrace configured to run as fast as it can without it configured. The way this works, is on bootup (if ftrace is enabled), a ftrace function is registered to record the instruction pointer of all places that call the function. Later, if there's still any code to patch, a kthread is awoken (rate limited to at most once a second) that performs a stop_machine, and replaces all the code that was called with a jmp over the call to ftrace. It only replaces what was found the previous time. Typically the system reaches equilibrium quickly after bootup and there's no code patching needed at all. e.g. call ftrace /* 5 bytes */ is replaced with jmp 3f /* jmp is 2 bytes and we jump 3 forward */ 3: When we want to enable ftrace for function tracing, the IP recording is removed, and stop_machine is called again to replace all the locations of that were recorded back to the call of ftrace. When it is disabled, we replace the code back to the jmp. Allocation is done by the kthread. If the ftrace recording function is called, and we don't have any record slots available, then we simply skip that call. Once a second a new page (if needed) is allocated for recording new ftrace function calls. A large batch is allocated at boot up to get most of the calls there. Because we do this via stop_machine, we don't have to worry about another CPU executing a ftrace call as we modify it. But we do need to worry about NMI's so all functions that might be called via nmi must be annotated with notrace_nmi. When this code is configured in, the NMI code will not call notrace. 
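
To make the patching concrete, here is a minimal user-space sketch of the
byte-level idea described above. It is illustrative only, not the kernel
implementation: the 0xe8 rel32 call and the 0xeb 0x03 short jmp are real
x86 encodings, but the buffer and the addresses are invented:

=======
#include <stdio.h>
#include <string.h>

/*
 * x86: "call rel32" is 0xe8 plus a 4-byte displacement relative
 * to the address of the next instruction (site + 5).
 */
static void make_call(unsigned char insn[5], long site, long target)
{
	int rel = (int)(target - (site + 5));

	insn[0] = 0xe8;
	memcpy(&insn[1], &rel, 4);
}

int main(void)
{
	/* hypothetical text addresses of the call site and of mcount */
	long site = 0x1000, mcount = 0x2000;
	unsigned char text[5];

	make_call(text, site, mcount);	/* traced: call mcount */

	/* "jmp 3f": 0xeb 0x03 hops over the three leftover call bytes */
	text[0] = 0xeb;
	text[1] = 0x03;			/* untraced: jmp over the call */

	printf("first bytes now: %02x %02x\n", text[0], text[1]);
	return 0;
}
=======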
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/Makefile | 1 + arch/x86/kernel/ftrace.c | 237 +++++++++++++++++++++++++++++++ include/linux/ftrace.h | 18 +++ kernel/trace/Kconfig | 17 +++ kernel/trace/ftrace.c | 356 ++++++++++++++++++++++++++++++++++++++++++----- 5 files changed, 597 insertions(+), 32 deletions(-) create mode 100644 arch/x86/kernel/ftrace.c (limited to 'include/linux') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5e618c3b4720..e142091524b0 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -56,6 +56,7 @@ obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi_$(BITS).o obj-$(CONFIG_X86_IO_APIC) += io_apic_$(BITS).o obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o +obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c new file mode 100644 index 000000000000..5dd58136ef02 --- /dev/null +++ b/arch/x86/kernel/ftrace.c @@ -0,0 +1,237 @@ +/* + * Code for replacing ftrace calls with jumps. + * + * Copyright (C) 2007-2008 Steven Rostedt + * + * Thanks goes to Ingo Molnar, for suggesting the idea. + * Mathieu Desnoyers, for suggesting postponing the modifications. + * Arjan van de Ven, for keeping me straight, and explaining to me + * the dangers of modifying code on the run. + */ + +#include +#include +#include +#include +#include +#include + +#define CALL_BACK 5 + +#define JMPFWD 0x03eb + +static unsigned short ftrace_jmp = JMPFWD; + +struct ftrace_record { + struct dyn_ftrace rec; + int failed; +} __attribute__((packed)); + +struct ftrace_page { + struct ftrace_page *next; + int index; + struct ftrace_record records[]; +} __attribute__((packed)); + +#define ENTRIES_PER_PAGE \ + ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct ftrace_record)) + +/* estimate from running different kernels */ +#define NR_TO_INIT 10000 + +#define MCOUNT_ADDR ((long)(&mcount)) + +union ftrace_code_union { + char code[5]; + struct { + char e8; + int offset; + } __attribute__((packed)); +}; + +static struct ftrace_page *ftrace_pages_start; +static struct ftrace_page *ftrace_pages; + +notrace struct dyn_ftrace *ftrace_alloc_shutdown_node(unsigned long ip) +{ + struct ftrace_record *rec; + unsigned short save; + + ip -= CALL_BACK; + save = *(short *)ip; + + /* If this was already converted, skip it */ + if (save == JMPFWD) + return NULL; + + if (ftrace_pages->index == ENTRIES_PER_PAGE) { + if (!ftrace_pages->next) + return NULL; + ftrace_pages = ftrace_pages->next; + } + + rec = &ftrace_pages->records[ftrace_pages->index++]; + + return &rec->rec; +} + +static int notrace +ftrace_modify_code(unsigned long ip, unsigned char *old_code, + unsigned char *new_code) +{ + unsigned short old = *(unsigned short *)old_code; + unsigned short new = *(unsigned short *)new_code; + unsigned short replaced; + int faulted = 0; + + /* + * Note: Due to modules and __init, code can + * disappear and change, we need to protect against faulting + * as well as code changing. + * + * No real locking needed, this code is run through + * kstop_machine. 
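+	 * The cmpxchg below only writes the new bytes if the old bytes
+	 * are still present, and the fixup section converts a fault on
+	 * vanished code into a nonzero 'faulted' return instead of an oops.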
+ */ + asm volatile ( + "1: lock\n" + " cmpxchg %w3, (%2)\n" + "2:\n" + ".section .fixup, \"ax\"\n" + " movl $1, %0\n" + "3: jmp 2b\n" + ".previous\n" + _ASM_EXTABLE(1b, 3b) + : "=r"(faulted), "=a"(replaced) + : "r"(ip), "r"(new), "0"(faulted), "a"(old) + : "memory"); + sync_core(); + + if (replaced != old) + faulted = 2; + + return faulted; +} + +static int notrace ftrace_calc_offset(long ip) +{ + return (int)(MCOUNT_ADDR - ip); +} + +notrace void ftrace_code_disable(struct dyn_ftrace *rec) +{ + unsigned long ip; + union ftrace_code_union save; + struct ftrace_record *r = + container_of(rec, struct ftrace_record, rec); + + ip = rec->ip; + + save.e8 = 0xe8; + save.offset = ftrace_calc_offset(ip); + + /* move the IP back to the start of the call */ + ip -= CALL_BACK; + + r->failed = ftrace_modify_code(ip, save.code, (char *)&ftrace_jmp); +} + +static void notrace ftrace_replace_code(int saved) +{ + unsigned char *new = NULL, *old = NULL; + struct ftrace_record *rec; + struct ftrace_page *pg; + unsigned long ip; + int i; + + if (saved) + old = (char *)&ftrace_jmp; + else + new = (char *)&ftrace_jmp; + + for (pg = ftrace_pages_start; pg; pg = pg->next) { + for (i = 0; i < pg->index; i++) { + union ftrace_code_union calc; + rec = &pg->records[i]; + + /* don't modify code that has already faulted */ + if (rec->failed) + continue; + + ip = rec->rec.ip; + + calc.e8 = 0xe8; + calc.offset = ftrace_calc_offset(ip); + + if (saved) + new = calc.code; + else + old = calc.code; + + ip -= CALL_BACK; + + rec->failed = ftrace_modify_code(ip, old, new); + } + } + +} + +notrace void ftrace_startup_code(void) +{ + ftrace_replace_code(1); +} + +notrace void ftrace_shutdown_code(void) +{ + ftrace_replace_code(0); +} + +notrace void ftrace_shutdown_replenish(void) +{ + if (ftrace_pages->next) + return; + + /* allocate another page */ + ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); +} + +notrace int ftrace_shutdown_arch_init(void) +{ + struct ftrace_page *pg; + int cnt; + int i; + + /* allocate a few pages */ + ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL); + if (!ftrace_pages_start) + return -1; + + /* + * Allocate a few more pages. + * + * TODO: have some parser search vmlinux before + * final linking to find all calls to ftrace. + * Then we can: + * a) know how many pages to allocate. + * and/or + * b) set up the table then. + * + * The dynamic code is still necessary for + * modules. 
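+	 * (Module text is only known at load time, so those mcount
+	 * call sites can never be found by a link-time scan.)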
+ */ + + pg = ftrace_pages = ftrace_pages_start; + + cnt = NR_TO_INIT / ENTRIES_PER_PAGE; + + for (i = 0; i < cnt; i++) { + pg->next = (void *)get_zeroed_page(GFP_KERNEL); + + /* If we fail, we'll try later anyway */ + if (!pg->next) + break; + + pg = pg->next; + } + + return 0; +} diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 740c97dcf9cb..90dbc0ee2046 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -32,6 +32,24 @@ extern void mcount(void); # define clear_ftrace_function(ops) do { } while (0) #endif /* CONFIG_FTRACE */ +#ifdef CONFIG_DYNAMIC_FTRACE +# define FTRACE_HASHBITS 10 +# define FTRACE_HASHSIZE (1< +#include +#include +#include +#include +#include #include +#include +#include +#include + +#include "trace.h" -static DEFINE_SPINLOCK(ftrace_func_lock); +static DEFINE_SPINLOCK(ftrace_lock); static struct ftrace_ops ftrace_list_end __read_mostly = { .func = ftrace_stub, @@ -44,21 +53,21 @@ notrace void ftrace_list_func(unsigned long ip, unsigned long parent_ip) } /** - * register_ftrace_function - register a function for profiling - * @ops - ops structure that holds the function for profiling. - * - * Register a function to be called by all functions in the - * kernel. + * clear_ftrace_function - reset the ftrace function * - * Note: @ops->func and all the functions it calls must be labeled - * with "notrace", otherwise it will go into a - * recursive loop. + * This NULLs the ftrace function and in essence stops + * tracing. There may be lag */ -int register_ftrace_function(struct ftrace_ops *ops) +void clear_ftrace_function(void) { - unsigned long flags; + ftrace_trace_function = ftrace_stub; +} + +static int notrace __register_ftrace_function(struct ftrace_ops *ops) +{ + /* Should never be called by interrupts */ + spin_lock(&ftrace_lock); - spin_lock_irqsave(&ftrace_func_lock, flags); ops->next = ftrace_list; /* * We are entering ops into the ftrace_list but another @@ -68,6 +77,7 @@ int register_ftrace_function(struct ftrace_ops *ops) */ smp_wmb(); ftrace_list = ops; + /* * For one func, simply call it directly. * For more than one func, call the chain. @@ -76,28 +86,22 @@ int register_ftrace_function(struct ftrace_ops *ops) ftrace_trace_function = ops->func; else ftrace_trace_function = ftrace_list_func; - spin_unlock_irqrestore(&ftrace_func_lock, flags); + + spin_unlock(&ftrace_lock); return 0; } -/** - * unregister_ftrace_function - unresgister a function for profiling. - * @ops - ops structure that holds the function to unregister - * - * Unregister a function that was added to be called by ftrace profiling. - */ -int unregister_ftrace_function(struct ftrace_ops *ops) +static int notrace __unregister_ftrace_function(struct ftrace_ops *ops) { - unsigned long flags; struct ftrace_ops **p; int ret = 0; - spin_lock_irqsave(&ftrace_func_lock, flags); + spin_lock(&ftrace_lock); /* - * If we are the only function, then the ftrace pointer is - * pointing directly to that function. + * If we are removing the last function, then simply point + * to the ftrace_stub. 
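+	 * (ftrace_stub is an empty function, so installing it
+	 * effectively turns tracing off for all callers.)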
*/ if (ftrace_list == ops && ops->next == &ftrace_list_end) { ftrace_trace_function = ftrace_stub; @@ -117,22 +121,310 @@ int unregister_ftrace_function(struct ftrace_ops *ops) *p = (*p)->next; /* If we only have one func left, then call that directly */ - if (ftrace_list->next == &ftrace_list_end) + if (ftrace_list == &ftrace_list_end || + ftrace_list->next == &ftrace_list_end) ftrace_trace_function = ftrace_list->func; out: - spin_unlock_irqrestore(&ftrace_func_lock, flags); + spin_unlock(&ftrace_lock); + + return ret; +} + +#ifdef CONFIG_DYNAMIC_FTRACE + +static struct hlist_head ftrace_hash[FTRACE_HASHSIZE]; + +static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu); + +static DEFINE_SPINLOCK(ftrace_shutdown_lock); +static DEFINE_MUTEX(ftraced_lock); + +static int ftraced_trigger; +static int ftraced_suspend; + +static int ftrace_record_suspend; + +static inline int +notrace ftrace_ip_in_hash(unsigned long ip, unsigned long key) +{ + struct dyn_ftrace *p; + struct hlist_node *t; + int found = 0; + + hlist_for_each_entry(p, t, &ftrace_hash[key], node) { + if (p->ip == ip) { + found = 1; + break; + } + } + + return found; +} + +static inline void notrace +ftrace_add_hash(struct dyn_ftrace *node, unsigned long key) +{ + hlist_add_head(&node->node, &ftrace_hash[key]); +} + +static void notrace +ftrace_record_ip(unsigned long ip, unsigned long parent_ip) +{ + struct dyn_ftrace *node; + unsigned long flags; + unsigned long key; + int resched; + int atomic; + + resched = need_resched(); + preempt_disable_notrace(); + + /* We simply need to protect against recursion */ + __get_cpu_var(ftrace_shutdown_disable_cpu)++; + if (__get_cpu_var(ftrace_shutdown_disable_cpu) != 1) + goto out; + + if (unlikely(ftrace_record_suspend)) + goto out; + + key = hash_long(ip, FTRACE_HASHBITS); + + WARN_ON_ONCE(key >= FTRACE_HASHSIZE); + + if (ftrace_ip_in_hash(ip, key)) + goto out; + + atomic = irqs_disabled(); + + spin_lock_irqsave(&ftrace_shutdown_lock, flags); + + /* This ip may have hit the hash before the lock */ + if (ftrace_ip_in_hash(ip, key)) + goto out_unlock; + + /* + * There's a slight race that the ftraced will update the + * hash and reset here. The arch alloc is responsible + * for seeing if the IP has already changed, and if + * it has, the alloc will fail. 
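+	 * Failing the alloc here is harmless: the call site simply is
+	 * not recorded on this pass and will be hashed again later.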
+ */ + node = ftrace_alloc_shutdown_node(ip); + if (!node) + goto out_unlock; + + node->ip = ip; + + ftrace_add_hash(node, key); + + ftraced_trigger = 1; + + out_unlock: + spin_unlock_irqrestore(&ftrace_shutdown_lock, flags); + out: + __get_cpu_var(ftrace_shutdown_disable_cpu)--; + + /* prevent recursion with scheduler */ + if (resched) + preempt_enable_no_resched_notrace(); + else + preempt_enable_notrace(); +} + +static struct ftrace_ops ftrace_shutdown_ops __read_mostly = +{ + .func = ftrace_record_ip, +}; + + +static int notrace __ftrace_modify_code(void *data) +{ + void (*func)(void) = data; + + func(); + return 0; +} + +static void notrace ftrace_run_startup_code(void) +{ + stop_machine_run(__ftrace_modify_code, ftrace_startup_code, NR_CPUS); +} + +static void notrace ftrace_run_shutdown_code(void) +{ + stop_machine_run(__ftrace_modify_code, ftrace_shutdown_code, NR_CPUS); +} + +static void notrace ftrace_startup(void) +{ + mutex_lock(&ftraced_lock); + ftraced_suspend++; + if (ftraced_suspend != 1) + goto out; + __unregister_ftrace_function(&ftrace_shutdown_ops); + + ftrace_run_startup_code(); + out: + mutex_unlock(&ftraced_lock); +} + +static void notrace ftrace_shutdown(void) +{ + mutex_lock(&ftraced_lock); + ftraced_suspend--; + if (ftraced_suspend) + goto out; + + ftrace_run_shutdown_code(); + + __register_ftrace_function(&ftrace_shutdown_ops); + out: + mutex_unlock(&ftraced_lock); +} + +static cycle_t ftrace_update_time; +static unsigned long ftrace_update_cnt; +unsigned long ftrace_update_tot_cnt; + +static int notrace __ftrace_update_code(void *ignore) +{ + struct dyn_ftrace *p; + struct hlist_head head; + struct hlist_node *t; + cycle_t start, stop; + int i; + + /* Don't be calling ftrace ops now */ + __unregister_ftrace_function(&ftrace_shutdown_ops); + + start = now(raw_smp_processor_id()); + ftrace_update_cnt = 0; + + /* No locks needed, the machine is stopped! */ + for (i = 0; i < FTRACE_HASHSIZE; i++) { + if (hlist_empty(&ftrace_hash[i])) + continue; + + head = ftrace_hash[i]; + INIT_HLIST_HEAD(&ftrace_hash[i]); + + /* all CPUS are stopped, we are safe to modify code */ + hlist_for_each_entry(p, t, &head, node) { + ftrace_code_disable(p); + ftrace_update_cnt++; + } + + } + + stop = now(raw_smp_processor_id()); + ftrace_update_time = stop - start; + ftrace_update_tot_cnt += ftrace_update_cnt; + + __register_ftrace_function(&ftrace_shutdown_ops); return 0; } +static void notrace ftrace_update_code(void) +{ + stop_machine_run(__ftrace_update_code, NULL, NR_CPUS); +} + +static int notrace ftraced(void *ignore) +{ + unsigned long usecs; + + set_current_state(TASK_INTERRUPTIBLE); + + while (!kthread_should_stop()) { + + /* check once a second */ + schedule_timeout(HZ); + + mutex_lock(&ftraced_lock); + if (ftraced_trigger && !ftraced_suspend) { + ftrace_record_suspend++; + ftrace_update_code(); + usecs = nsecs_to_usecs(ftrace_update_time); + if (ftrace_update_tot_cnt > 100000) { + ftrace_update_tot_cnt = 0; + pr_info("hm, dftrace overflow: %lu change%s" + " (%lu total) in %lu usec%s\n", + ftrace_update_cnt, + ftrace_update_cnt != 1 ? "s" : "", + ftrace_update_tot_cnt, + usecs, usecs != 1 ? 
"s" : ""); + WARN_ON_ONCE(1); + } + ftraced_trigger = 0; + ftrace_record_suspend--; + } + mutex_unlock(&ftraced_lock); + + ftrace_shutdown_replenish(); + + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + return 0; +} + +static int __init notrace ftrace_shutdown_init(void) +{ + struct task_struct *p; + int ret; + + ret = ftrace_shutdown_arch_init(); + if (ret) + return ret; + + p = kthread_run(ftraced, NULL, "ftraced"); + if (IS_ERR(p)) + return -1; + + __register_ftrace_function(&ftrace_shutdown_ops); + + return 0; +} + +core_initcall(ftrace_shutdown_init); +#else +# define ftrace_startup() do { } while (0) +# define ftrace_shutdown() do { } while (0) +#endif /* CONFIG_DYNAMIC_FTRACE */ + /** - * clear_ftrace_function - reset the ftrace function + * register_ftrace_function - register a function for profiling + * @ops - ops structure that holds the function for profiling. * - * This NULLs the ftrace function and in essence stops - * tracing. There may be lag + * Register a function to be called by all functions in the + * kernel. + * + * Note: @ops->func and all the functions it calls must be labeled + * with "notrace", otherwise it will go into a + * recursive loop. */ -void clear_ftrace_function(void) +int register_ftrace_function(struct ftrace_ops *ops) { - ftrace_trace_function = ftrace_stub; + ftrace_startup(); + + return __register_ftrace_function(ops); +} + +/** + * unregister_ftrace_function - unresgister a function for profiling. + * @ops - ops structure that holds the function to unregister + * + * Unregister a function that was added to be called by ftrace profiling. + */ +int unregister_ftrace_function(struct ftrace_ops *ops) +{ + int ret; + + ret = __unregister_ftrace_function(ops); + + if (ftrace_list == &ftrace_list_end) + ftrace_shutdown(); + + return ret; } -- cgit v1.2.3 From b0fc494fae96a7089f3651cb451f461c7291244c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:43 +0200 Subject: ftrace: add ftrace_enabled sysctl to disable mcount function This patch adds back the sysctl ftrace_enabled. This time it is defaulted to on, if DYNAMIC_FTRACE is configured. When ftrace_enabled is disabled, the ftrace function is set to the stub return. If DYNAMIC_FTRACE is also configured, on ftrace_enabled = 0, the registered ftrace functions will all be set to jmps, but no more new calls to ftrace recording (used to find the ftrace calling sites) will be called. 
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/ftrace.h | 6 +++ kernel/sysctl.c | 11 +++++ kernel/trace/ftrace.c | 125 ++++++++++++++++++++++++++++++++++++++++++------- 3 files changed, 124 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 90dbc0ee2046..ccd8537dbdb7 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -5,6 +5,12 @@ #include +extern int ftrace_enabled; +extern int +ftrace_enable_sysctl(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos); + typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip); struct ftrace_ops { diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 29116652dca8..efaf7c5500e9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -455,6 +456,16 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#ifdef CONFIG_FTRACE + { + .ctl_name = CTL_UNNUMBERED, + .procname = "ftrace_enabled", + .data = &ftrace_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &ftrace_enable_sysctl, + }, +#endif #ifdef CONFIG_KMOD { .ctl_name = KERN_MODPROBE, diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d1ae2ba25274..d3de37299ba4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -20,12 +20,24 @@ #include #include #include +#include #include #include #include "trace.h" +#ifdef CONFIG_DYNAMIC_FTRACE +# define FTRACE_ENABLED_INIT 1 +#else +# define FTRACE_ENABLED_INIT 0 +#endif + +int ftrace_enabled = FTRACE_ENABLED_INIT; +static int last_ftrace_enabled = FTRACE_ENABLED_INIT; + static DEFINE_SPINLOCK(ftrace_lock); +static DEFINE_MUTEX(ftrace_sysctl_lock); + static struct ftrace_ops ftrace_list_end __read_mostly = { .func = ftrace_stub, @@ -78,14 +90,16 @@ static int notrace __register_ftrace_function(struct ftrace_ops *ops) smp_wmb(); ftrace_list = ops; - /* - * For one func, simply call it directly. - * For more than one func, call the chain. - */ - if (ops->next == &ftrace_list_end) - ftrace_trace_function = ops->func; - else - ftrace_trace_function = ftrace_list_func; + if (ftrace_enabled) { + /* + * For one func, simply call it directly. + * For more than one func, call the chain. 
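+		 * (ftrace_list_func walks the whole ops list; with a
+		 * single entry that indirection can be skipped.)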
+ */ + if (ops->next == &ftrace_list_end) + ftrace_trace_function = ops->func; + else + ftrace_trace_function = ftrace_list_func; + } spin_unlock(&ftrace_lock); @@ -120,10 +134,12 @@ static int notrace __unregister_ftrace_function(struct ftrace_ops *ops) *p = (*p)->next; - /* If we only have one func left, then call that directly */ - if (ftrace_list == &ftrace_list_end || - ftrace_list->next == &ftrace_list_end) - ftrace_trace_function = ftrace_list->func; + if (ftrace_enabled) { + /* If we only have one func left, then call that directly */ + if (ftrace_list == &ftrace_list_end || + ftrace_list->next == &ftrace_list_end) + ftrace_trace_function = ftrace_list->func; + } out: spin_unlock(&ftrace_lock); @@ -263,7 +279,8 @@ static void notrace ftrace_startup(void) goto out; __unregister_ftrace_function(&ftrace_shutdown_ops); - ftrace_run_startup_code(); + if (ftrace_enabled) + ftrace_run_startup_code(); out: mutex_unlock(&ftraced_lock); } @@ -275,13 +292,32 @@ static void notrace ftrace_shutdown(void) if (ftraced_suspend) goto out; - ftrace_run_shutdown_code(); + if (ftrace_enabled) + ftrace_run_shutdown_code(); __register_ftrace_function(&ftrace_shutdown_ops); out: mutex_unlock(&ftraced_lock); } +static void notrace ftrace_startup_sysctl(void) +{ + mutex_lock(&ftraced_lock); + /* ftraced_suspend is true if we want ftrace running */ + if (ftraced_suspend) + ftrace_run_startup_code(); + mutex_unlock(&ftraced_lock); +} + +static void notrace ftrace_shutdown_sysctl(void) +{ + mutex_lock(&ftraced_lock); + /* ftraced_suspend is true if ftrace is running */ + if (ftraced_suspend) + ftrace_run_shutdown_code(); + mutex_unlock(&ftraced_lock); +} + static cycle_t ftrace_update_time; static unsigned long ftrace_update_cnt; unsigned long ftrace_update_tot_cnt; @@ -341,8 +377,9 @@ static int notrace ftraced(void *ignore) /* check once a second */ schedule_timeout(HZ); + mutex_lock(&ftrace_sysctl_lock); mutex_lock(&ftraced_lock); - if (ftraced_trigger && !ftraced_suspend) { + if (ftrace_enabled && ftraced_trigger && !ftraced_suspend) { ftrace_record_suspend++; ftrace_update_code(); usecs = nsecs_to_usecs(ftrace_update_time); @@ -360,6 +397,7 @@ static int notrace ftraced(void *ignore) ftrace_record_suspend--; } mutex_unlock(&ftraced_lock); + mutex_unlock(&ftrace_sysctl_lock); ftrace_shutdown_replenish(); @@ -389,8 +427,10 @@ static int __init notrace ftrace_shutdown_init(void) core_initcall(ftrace_shutdown_init); #else -# define ftrace_startup() do { } while (0) -# define ftrace_shutdown() do { } while (0) +# define ftrace_startup() do { } while (0) +# define ftrace_shutdown() do { } while (0) +# define ftrace_startup_sysctl() do { } while (0) +# define ftrace_shutdown_sysctl() do { } while (0) #endif /* CONFIG_DYNAMIC_FTRACE */ /** @@ -406,9 +446,15 @@ core_initcall(ftrace_shutdown_init); */ int register_ftrace_function(struct ftrace_ops *ops) { + int ret; + + mutex_lock(&ftrace_sysctl_lock); ftrace_startup(); - return __register_ftrace_function(ops); + ret = __register_ftrace_function(ops); + mutex_unlock(&ftrace_sysctl_lock); + + return ret; } /** @@ -421,10 +467,53 @@ int unregister_ftrace_function(struct ftrace_ops *ops) { int ret; + mutex_lock(&ftrace_sysctl_lock); ret = __unregister_ftrace_function(ops); if (ftrace_list == &ftrace_list_end) ftrace_shutdown(); + mutex_unlock(&ftrace_sysctl_lock); + + return ret; +} + +notrace int +ftrace_enable_sysctl(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + 
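+	/* serialize with register/unregister while the knob is flipped */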
mutex_lock(&ftrace_sysctl_lock); + + ret = proc_dointvec(table, write, filp, buffer, lenp, ppos); + + if (ret || !write || (last_ftrace_enabled == ftrace_enabled)) + goto out; + + last_ftrace_enabled = ftrace_enabled; + + if (ftrace_enabled) { + + ftrace_startup_sysctl(); + + /* we are starting ftrace again */ + if (ftrace_list != &ftrace_list_end) { + if (ftrace_list->next == &ftrace_list_end) + ftrace_trace_function = ftrace_list->func; + else + ftrace_trace_function = ftrace_list_func; + } + + } else { + /* stopping ftrace calls (just send to ftrace_stub) */ + ftrace_trace_function = ftrace_stub; + + ftrace_shutdown_sysctl(); + } + + out: + mutex_unlock(&ftrace_sysctl_lock); return ret; } -- cgit v1.2.3 From 3c1720f00bb619302ba19d55986ab565e74d06db Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:43 +0200 Subject: ftrace: move memory management out of arch code This patch moves the memory management of the ftrace records out of the arch code and into the generic code making the arch code simpler. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/ftrace.c | 183 ++++++++--------------------------------------- include/linux/ftrace.h | 18 +++-- kernel/trace/ftrace.c | 154 ++++++++++++++++++++++++++++++++++++++- 3 files changed, 192 insertions(+), 163 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 2e060c58b860..b69795efa226 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -23,25 +23,6 @@ /* Long is fine, even if it is only 4 bytes ;-) */ static long *ftrace_nop; -struct ftrace_record { - struct dyn_ftrace rec; - int failed; -} __attribute__((packed)); - -struct ftrace_page { - struct ftrace_page *next; - int index; - struct ftrace_record records[]; -} __attribute__((packed)); - -#define ENTRIES_PER_PAGE \ - ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct ftrace_record)) - -/* estimate from running different kernels */ -#define NR_TO_INIT 10000 - -#define MCOUNT_ADDR ((long)(&mcount)) - union ftrace_code_union { char code[5]; struct { @@ -50,33 +31,41 @@ union ftrace_code_union { } __attribute__((packed)); }; -static struct ftrace_page *ftrace_pages_start; -static struct ftrace_page *ftrace_pages; - -notrace struct dyn_ftrace *ftrace_alloc_shutdown_node(unsigned long ip) +notrace int ftrace_ip_converted(unsigned long ip) { - struct ftrace_record *rec; unsigned long save; ip -= CALL_BACK; save = *(long *)ip; - /* If this was already converted, skip it */ - if (save == *ftrace_nop) - return NULL; + return save == *ftrace_nop; +} - if (ftrace_pages->index == ENTRIES_PER_PAGE) { - if (!ftrace_pages->next) - return NULL; - ftrace_pages = ftrace_pages->next; - } +static int notrace ftrace_calc_offset(long ip, long addr) +{ + return (int)(addr - ip); +} - rec = &ftrace_pages->records[ftrace_pages->index++]; +notrace unsigned char *ftrace_nop_replace(void) +{ + return (char *)ftrace_nop; +} + +notrace unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) +{ + static union ftrace_code_union calc; - return &rec->rec; + calc.e8 = 0xe8; + calc.offset = ftrace_calc_offset(ip, addr); + + /* + * No locking needed, this must be called via kstop_machine + * which in essence is like running on a uniprocessor machine. 
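+	 * (That is also why the single static calc buffer above can be
+	 * handed back safely: no other CPU is executing concurrently.)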
+ */ + return calc.code; } -static int notrace +notrace int ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char *new_code) { @@ -86,6 +75,9 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char newch = new_code[4]; int faulted = 0; + /* move the IP back to the start of the call */ + ip -= CALL_BACK; + /* * Note: Due to modules and __init, code can * disappear and change, we need to protect against faulting @@ -117,129 +109,12 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, return faulted; } -static int notrace ftrace_calc_offset(long ip) -{ - return (int)(MCOUNT_ADDR - ip); -} - -notrace void ftrace_code_disable(struct dyn_ftrace *rec) -{ - unsigned long ip; - union ftrace_code_union save; - struct ftrace_record *r = - container_of(rec, struct ftrace_record, rec); - - ip = rec->ip; - - save.e8 = 0xe8; - save.offset = ftrace_calc_offset(ip); - - /* move the IP back to the start of the call */ - ip -= CALL_BACK; - - r->failed = ftrace_modify_code(ip, save.code, (char *)ftrace_nop); -} - -static void notrace ftrace_replace_code(int saved) -{ - unsigned char *new = NULL, *old = NULL; - struct ftrace_record *rec; - struct ftrace_page *pg; - unsigned long ip; - int i; - - if (saved) - old = (char *)ftrace_nop; - else - new = (char *)ftrace_nop; - - for (pg = ftrace_pages_start; pg; pg = pg->next) { - for (i = 0; i < pg->index; i++) { - union ftrace_code_union calc; - rec = &pg->records[i]; - - /* don't modify code that has already faulted */ - if (rec->failed) - continue; - - ip = rec->rec.ip; - - calc.e8 = 0xe8; - calc.offset = ftrace_calc_offset(ip); - - if (saved) - new = calc.code; - else - old = calc.code; - - ip -= CALL_BACK; - - rec->failed = ftrace_modify_code(ip, old, new); - } - } - -} - -notrace void ftrace_startup_code(void) -{ - ftrace_replace_code(1); -} - -notrace void ftrace_shutdown_code(void) -{ - ftrace_replace_code(0); -} - -notrace void ftrace_shutdown_replenish(void) -{ - if (ftrace_pages->next) - return; - - /* allocate another page */ - ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); -} - -notrace int __init ftrace_shutdown_arch_init(void) +int __init ftrace_dyn_arch_init(void) { const unsigned char *const *noptable = find_nop_table(); - struct ftrace_page *pg; - int cnt; - int i; ftrace_nop = (unsigned long *)noptable[CALL_BACK]; - /* allocate a few pages */ - ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL); - if (!ftrace_pages_start) - return -1; - - /* - * Allocate a few more pages. - * - * TODO: have some parser search vmlinux before - * final linking to find all calls to ftrace. - * Then we can: - * a) know how many pages to allocate. - * and/or - * b) set up the table then. - * - * The dynamic code is still necessary for - * modules. 
- */ - - pg = ftrace_pages = ftrace_pages_start; - - cnt = NR_TO_INIT / ENTRIES_PER_PAGE; - - for (i = 0; i < cnt; i++) { - pg->next = (void *)get_zeroed_page(GFP_KERNEL); - - /* If we fail, we'll try later anyway */ - if (!pg->next) - break; - - pg = pg->next; - } - return 0; } + diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index ccd8537dbdb7..d509ad6c9cb8 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -42,19 +42,23 @@ extern void mcount(void); # define FTRACE_HASHBITS 10 # define FTRACE_HASHSIZE (1<node, &ftrace_hash[key]); } +static notrace struct dyn_ftrace *ftrace_alloc_shutdown_node(unsigned long ip) +{ + /* If this was already converted, skip it */ + if (ftrace_ip_converted(ip)) + return NULL; + + if (ftrace_pages->index == ENTRIES_PER_PAGE) { + if (!ftrace_pages->next) + return NULL; + ftrace_pages = ftrace_pages->next; + } + + return &ftrace_pages->records[ftrace_pages->index++]; +} + static void notrace ftrace_record_ip(unsigned long ip, unsigned long parent_ip) { @@ -252,6 +282,62 @@ static struct ftrace_ops ftrace_shutdown_ops __read_mostly = .func = ftrace_record_ip, }; +#define MCOUNT_ADDR ((long)(&mcount)) + +static void notrace ftrace_replace_code(int saved) +{ + unsigned char *new = NULL, *old = NULL; + struct dyn_ftrace *rec; + struct ftrace_page *pg; + unsigned long ip; + int failed; + int i; + + if (saved) + old = ftrace_nop_replace(); + else + new = ftrace_nop_replace(); + + for (pg = ftrace_pages_start; pg; pg = pg->next) { + for (i = 0; i < pg->index; i++) { + rec = &pg->records[i]; + + /* don't modify code that has already faulted */ + if (rec->flags & FTRACE_FL_FAILED) + continue; + + ip = rec->ip; + + if (saved) + new = ftrace_call_replace(ip, MCOUNT_ADDR); + else + old = ftrace_call_replace(ip, MCOUNT_ADDR); + + failed = ftrace_modify_code(ip, old, new); + if (failed) + rec->flags |= FTRACE_FL_FAILED; + } + } +} + +static notrace void ftrace_startup_code(void) +{ + ftrace_replace_code(1); +} + +static notrace void ftrace_shutdown_code(void) +{ + ftrace_replace_code(0); +} + +static notrace void ftrace_shutdown_replenish(void) +{ + if (ftrace_pages->next) + return; + + /* allocate another page */ + ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); +} static int notrace __ftrace_modify_code(void *data) { @@ -261,6 +347,23 @@ static int notrace __ftrace_modify_code(void *data) return 0; } +static notrace void +ftrace_code_disable(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned long ip; + unsigned char *nop, *call; + int failed; + + ip = rec->ip; + + nop = ftrace_nop_replace(); + call = ftrace_call_replace(ip, addr); + + failed = ftrace_modify_code(ip, call, nop); + if (failed) + rec->flags |= FTRACE_FL_FAILED; +} + static void notrace ftrace_run_startup_code(void) { stop_machine_run(__ftrace_modify_code, ftrace_startup_code, NR_CPUS); @@ -346,7 +449,7 @@ static int notrace __ftrace_update_code(void *ignore) /* all CPUS are stopped, we are safe to modify code */ hlist_for_each_entry(p, t, &head, node) { - ftrace_code_disable(p); + ftrace_code_disable(p, MCOUNT_ADDR); ftrace_update_cnt++; } @@ -407,12 +510,59 @@ static int notrace ftraced(void *ignore) return 0; } +static int __init ftrace_dyn_table_alloc(void) +{ + struct ftrace_page *pg; + int cnt; + int i; + int ret; + + ret = ftrace_dyn_arch_init(); + if (ret) + return ret; + + /* allocate a few pages */ + ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL); + if (!ftrace_pages_start) + return -1; + + /* + * Allocate a few more pages. 
+ * + * TODO: have some parser search vmlinux before + * final linking to find all calls to ftrace. + * Then we can: + * a) know how many pages to allocate. + * and/or + * b) set up the table then. + * + * The dynamic code is still necessary for + * modules. + */ + + pg = ftrace_pages = ftrace_pages_start; + + cnt = NR_TO_INIT / ENTRIES_PER_PAGE; + + for (i = 0; i < cnt; i++) { + pg->next = (void *)get_zeroed_page(GFP_KERNEL); + + /* If we fail, we'll try later anyway */ + if (!pg->next) + break; + + pg = pg->next; + } + + return 0; +} + static int __init notrace ftrace_shutdown_init(void) { struct task_struct *p; int ret; - ret = ftrace_shutdown_arch_init(); + ret = ftrace_dyn_table_alloc(); if (ret) return ret; -- cgit v1.2.3 From d61f82d06672f57fca410da6f7fffd15867db622 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:43 +0200 Subject: ftrace: use dynamic patching for updating mcount calls This patch replaces the indirect call to the mcount function pointer with a direct call that will be patched by the dynamic ftrace routines. On boot up, the mcount function calls the ftrace_stub function. When the dynamic ftrace code is initialized, ftrace_stub is replaced with a call to ftrace_record_ip, which records the instruction pointers of the locations that call it. Later, the ftraced daemon will call kstop_machine and patch all the locations to nops. When ftrace is enabled, the original calls to mcount will be set to call ftrace_caller, which makes a direct call to the registered ftrace function. This direct call is also patched whenever the registered function is updated. All patching is performed by a kstop_machine routine to prevent the race conditions associated with modifying code on the fly. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/entry_32.S | 47 +++++++++++- arch/x86/kernel/entry_64.S | 67 ++++++++++++++++- arch/x86/kernel/ftrace.c | 41 +++++++++- include/linux/ftrace.h | 7 +- kernel/trace/ftrace.c | 183 ++++++++++++++++++++++++++------------------- 5 files changed, 261 insertions(+), 84 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index f47b9b5440d2..e6517ce0b824 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1110,10 +1110,50 @@ ENDPROC(xen_failsafe_callback) #endif /* CONFIG_XEN */ #ifdef CONFIG_FTRACE +#ifdef CONFIG_DYNAMIC_FTRACE + +ENTRY(mcount) + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + +.globl mcount_call +mcount_call: + call ftrace_stub + + popl %edx + popl %ecx + popl %eax + + ret +END(mcount) + +ENTRY(ftrace_caller) + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + movl 0x4(%ebp), %edx + +.globl ftrace_call +ftrace_call: + call ftrace_stub + + popl %edx + popl %ecx + popl %eax + +.globl ftrace_stub +ftrace_stub: + ret +END(ftrace_caller) + +#else /* !
CONFIG_DYNAMIC_FTRACE */ + ENTRY(mcount) cmpl $ftrace_stub, ftrace_trace_function jnz trace - .globl ftrace_stub ftrace_stub: ret @@ -1126,7 +1166,7 @@ trace: movl 0xc(%esp), %eax movl 0x4(%ebp), %edx - call *ftrace_trace_function + call *ftrace_trace_function popl %edx popl %ecx @@ -1134,7 +1174,8 @@ trace: jmp ftrace_stub END(mcount) -#endif +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_FTRACE */ .section .rodata,"a" #include "syscall_table_32.S" diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index f046e0c64883..fe25e5febca3 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -55,6 +55,70 @@ .code64 #ifdef CONFIG_FTRACE +#ifdef CONFIG_DYNAMIC_FTRACE +ENTRY(mcount) + + subq $0x38, %rsp + movq %rax, (%rsp) + movq %rcx, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rsi, 24(%rsp) + movq %rdi, 32(%rsp) + movq %r8, 40(%rsp) + movq %r9, 48(%rsp) + + movq 0x38(%rsp), %rdi + +.globl mcount_call +mcount_call: + call ftrace_stub + + movq 48(%rsp), %r9 + movq 40(%rsp), %r8 + movq 32(%rsp), %rdi + movq 24(%rsp), %rsi + movq 16(%rsp), %rdx + movq 8(%rsp), %rcx + movq (%rsp), %rax + addq $0x38, %rsp + + retq +END(mcount) + +ENTRY(ftrace_caller) + + /* taken from glibc */ + subq $0x38, %rsp + movq %rax, (%rsp) + movq %rcx, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rsi, 24(%rsp) + movq %rdi, 32(%rsp) + movq %r8, 40(%rsp) + movq %r9, 48(%rsp) + + movq 0x38(%rsp), %rdi + movq 8(%rbp), %rsi + +.globl ftrace_call +ftrace_call: + call ftrace_stub + + movq 48(%rsp), %r9 + movq 40(%rsp), %r8 + movq 32(%rsp), %rdi + movq 24(%rsp), %rsi + movq 16(%rsp), %rdx + movq 8(%rsp), %rcx + movq (%rsp), %rax + addq $0x38, %rsp + +.globl ftrace_stub +ftrace_stub: + retq +END(ftrace_caller) + +#else /* ! CONFIG_DYNAMIC_FTRACE */ ENTRY(mcount) cmpq $ftrace_stub, ftrace_trace_function jnz trace @@ -89,7 +153,8 @@ trace: jmp ftrace_stub END(mcount) -#endif +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_FTRACE */ #ifndef CONFIG_PREEMPT #define retint_kernel retint_restore_args diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index b69795efa226..9f44623e0072 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -109,10 +109,49 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, return faulted; } -int __init ftrace_dyn_arch_init(void) +notrace int ftrace_update_ftrace_func(ftrace_func_t func) +{ + unsigned long ip = (unsigned long)(&ftrace_call); + unsigned char old[5], *new; + int ret; + + ip += CALL_BACK; + + memcpy(old, &ftrace_call, 5); + new = ftrace_call_replace(ip, (unsigned long)func); + ret = ftrace_modify_code(ip, old, new); + + return ret; +} + +notrace int ftrace_mcount_set(unsigned long *data) +{ + unsigned long ip = (long)(&mcount_call); + unsigned long *addr = data; + unsigned char old[5], *new; + + /* ip is at the location, but modify code will subtract this */ + ip += CALL_BACK; + + /* + * Replace the mcount stub with a pointer to the + * ip recorder function.
+ */ + memcpy(old, &mcount_call, 5); + new = ftrace_call_replace(ip, *addr); + *addr = ftrace_modify_code(ip, old, new); + + return 0; +} + +int __init ftrace_dyn_arch_init(void *data) { const unsigned char *const *noptable = find_nop_table(); + /* This is running in kstop_machine */ + + ftrace_mcount_set(data); + ftrace_nop = (unsigned long *)noptable[CALL_BACK]; return 0; diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index d509ad6c9cb8..b0dd0093058f 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -56,9 +56,14 @@ struct dyn_ftrace { extern int ftrace_ip_converted(unsigned long ip); extern unsigned char *ftrace_nop_replace(void); extern unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr); -extern int ftrace_dyn_arch_init(void); +extern int ftrace_dyn_arch_init(void *data); +extern int ftrace_mcount_set(unsigned long *data); extern int ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char *new_code); +extern int ftrace_update_ftrace_func(ftrace_func_t func); +extern void ftrace_caller(void); +extern void ftrace_call(void); +extern void mcount_call(void); #endif #ifdef CONFIG_FRAME_POINTER diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f6d9af3bf66b..88544f9bc0ed 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -26,14 +26,8 @@ #include "trace.h" -#ifdef CONFIG_DYNAMIC_FTRACE -# define FTRACE_ENABLED_INIT 1 -#else -# define FTRACE_ENABLED_INIT 0 -#endif - -int ftrace_enabled = FTRACE_ENABLED_INIT; -static int last_ftrace_enabled = FTRACE_ENABLED_INIT; +int ftrace_enabled; +static int last_ftrace_enabled; static DEFINE_SPINLOCK(ftrace_lock); static DEFINE_MUTEX(ftrace_sysctl_lock); @@ -149,6 +143,14 @@ static int notrace __unregister_ftrace_function(struct ftrace_ops *ops) #ifdef CONFIG_DYNAMIC_FTRACE +enum { + FTRACE_ENABLE_CALLS = (1 << 0), + FTRACE_DISABLE_CALLS = (1 << 1), + FTRACE_UPDATE_TRACE_FUNC = (1 << 2), + FTRACE_ENABLE_MCOUNT = (1 << 3), + FTRACE_DISABLE_MCOUNT = (1 << 4), +}; + static struct hlist_head ftrace_hash[FTRACE_HASHSIZE]; static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu); @@ -199,12 +201,8 @@ ftrace_add_hash(struct dyn_ftrace *node, unsigned long key) hlist_add_head(&node->node, &ftrace_hash[key]); } -static notrace struct dyn_ftrace *ftrace_alloc_shutdown_node(unsigned long ip) +static notrace struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) { - /* If this was already converted, skip it */ - if (ftrace_ip_converted(ip)) - return NULL; - if (ftrace_pages->index == ENTRIES_PER_PAGE) { if (!ftrace_pages->next) return NULL; @@ -215,7 +213,7 @@ static notrace struct dyn_ftrace *ftrace_alloc_shutdown_node(unsigned long ip) } static void notrace -ftrace_record_ip(unsigned long ip, unsigned long parent_ip) +ftrace_record_ip(unsigned long ip) { struct dyn_ftrace *node; unsigned long flags; @@ -223,6 +221,9 @@ ftrace_record_ip(unsigned long ip, unsigned long parent_ip) int resched; int atomic; + if (!ftrace_enabled) + return; + resched = need_resched(); preempt_disable_notrace(); @@ -251,11 +252,12 @@ ftrace_record_ip(unsigned long ip, unsigned long parent_ip) /* * There's a slight race that the ftraced will update the - * hash and reset here. The arch alloc is responsible - * for seeing if the IP has already changed, and if - * it has, the alloc will fail. + * hash and reset here. If it is already converted, skip it. 
*/ - node = ftrace_alloc_shutdown_node(ip); + if (ftrace_ip_converted(ip)) + goto out_unlock; + + node = ftrace_alloc_dyn_node(ip); if (!node) goto out_unlock; @@ -277,11 +279,7 @@ ftrace_record_ip(unsigned long ip, unsigned long parent_ip) preempt_enable_notrace(); } -static struct ftrace_ops ftrace_shutdown_ops __read_mostly = -{ - .func = ftrace_record_ip, -}; - +#define FTRACE_ADDR ((long)(&ftrace_caller)) #define MCOUNT_ADDR ((long)(&mcount)) static void notrace ftrace_replace_code(int saved) @@ -309,9 +307,9 @@ static void notrace ftrace_replace_code(int saved) ip = rec->ip; if (saved) - new = ftrace_call_replace(ip, MCOUNT_ADDR); + new = ftrace_call_replace(ip, FTRACE_ADDR); else - old = ftrace_call_replace(ip, MCOUNT_ADDR); + old = ftrace_call_replace(ip, FTRACE_ADDR); failed = ftrace_modify_code(ip, old, new); if (failed) @@ -320,16 +318,6 @@ static void notrace ftrace_replace_code(int saved) } } -static notrace void ftrace_startup_code(void) -{ - ftrace_replace_code(1); -} - -static notrace void ftrace_shutdown_code(void) -{ - ftrace_replace_code(0); -} - static notrace void ftrace_shutdown_replenish(void) { if (ftrace_pages->next) @@ -339,16 +327,8 @@ static notrace void ftrace_shutdown_replenish(void) ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); } -static int notrace __ftrace_modify_code(void *data) -{ - void (*func)(void) = data; - - func(); - return 0; -} - static notrace void -ftrace_code_disable(struct dyn_ftrace *rec, unsigned long addr) +ftrace_code_disable(struct dyn_ftrace *rec) { unsigned long ip; unsigned char *nop, *call; @@ -357,67 +337,113 @@ ftrace_code_disable(struct dyn_ftrace *rec, unsigned long addr) ip = rec->ip; nop = ftrace_nop_replace(); - call = ftrace_call_replace(ip, addr); + call = ftrace_call_replace(ip, MCOUNT_ADDR); failed = ftrace_modify_code(ip, call, nop); if (failed) rec->flags |= FTRACE_FL_FAILED; } -static void notrace ftrace_run_startup_code(void) +static int notrace __ftrace_modify_code(void *data) { - stop_machine_run(__ftrace_modify_code, ftrace_startup_code, NR_CPUS); + unsigned long addr; + int *command = data; + + if (*command & FTRACE_ENABLE_CALLS) + ftrace_replace_code(1); + else if (*command & FTRACE_DISABLE_CALLS) + ftrace_replace_code(0); + + if (*command & FTRACE_UPDATE_TRACE_FUNC) + ftrace_update_ftrace_func(ftrace_trace_function); + + if (*command & FTRACE_ENABLE_MCOUNT) { + addr = (unsigned long)ftrace_record_ip; + ftrace_mcount_set(&addr); + } else if (*command & FTRACE_DISABLE_MCOUNT) { + addr = (unsigned long)ftrace_stub; + ftrace_mcount_set(&addr); + } + + return 0; } -static void notrace ftrace_run_shutdown_code(void) +static void notrace ftrace_run_update_code(int command) { - stop_machine_run(__ftrace_modify_code, ftrace_shutdown_code, NR_CPUS); + stop_machine_run(__ftrace_modify_code, &command, NR_CPUS); } +static ftrace_func_t saved_ftrace_func; + static void notrace ftrace_startup(void) { + int command = 0; + mutex_lock(&ftraced_lock); ftraced_suspend++; - if (ftraced_suspend != 1) + if (ftraced_suspend == 1) + command |= FTRACE_ENABLE_CALLS; + + if (saved_ftrace_func != ftrace_trace_function) { + saved_ftrace_func = ftrace_trace_function; + command |= FTRACE_UPDATE_TRACE_FUNC; + } + + if (!command || !ftrace_enabled) goto out; - __unregister_ftrace_function(&ftrace_shutdown_ops); - if (ftrace_enabled) - ftrace_run_startup_code(); + ftrace_run_update_code(command); out: mutex_unlock(&ftraced_lock); } static void notrace ftrace_shutdown(void) { + int command = 0; + mutex_lock(&ftraced_lock); 
ftraced_suspend--; - if (ftraced_suspend) - goto out; + if (!ftraced_suspend) + command |= FTRACE_DISABLE_CALLS; - if (ftrace_enabled) - ftrace_run_shutdown_code(); + if (saved_ftrace_func != ftrace_trace_function) { + saved_ftrace_func = ftrace_trace_function; + command |= FTRACE_UPDATE_TRACE_FUNC; + } - __register_ftrace_function(&ftrace_shutdown_ops); + if (!command || !ftrace_enabled) + goto out; + + ftrace_run_update_code(command); out: mutex_unlock(&ftraced_lock); } static void notrace ftrace_startup_sysctl(void) { + int command = FTRACE_ENABLE_MCOUNT; + mutex_lock(&ftraced_lock); + /* Force update next time */ + saved_ftrace_func = NULL; /* ftraced_suspend is true if we want ftrace running */ if (ftraced_suspend) - ftrace_run_startup_code(); + command |= FTRACE_ENABLE_CALLS; + + ftrace_run_update_code(command); mutex_unlock(&ftraced_lock); } static void notrace ftrace_shutdown_sysctl(void) { + int command = FTRACE_DISABLE_MCOUNT; + mutex_lock(&ftraced_lock); /* ftraced_suspend is true if ftrace is running */ if (ftraced_suspend) - ftrace_run_shutdown_code(); + command |= FTRACE_DISABLE_CALLS; + + ftrace_run_update_code(command); mutex_unlock(&ftraced_lock); } @@ -430,11 +456,13 @@ static int notrace __ftrace_update_code(void *ignore) struct dyn_ftrace *p; struct hlist_head head; struct hlist_node *t; + int save_ftrace_enabled; cycle_t start, stop; int i; - /* Don't be calling ftrace ops now */ - __unregister_ftrace_function(&ftrace_shutdown_ops); + /* Don't be recording funcs now */ + save_ftrace_enabled = ftrace_enabled; + ftrace_enabled = 0; start = now(raw_smp_processor_id()); ftrace_update_cnt = 0; @@ -449,7 +477,7 @@ static int notrace __ftrace_update_code(void *ignore) /* all CPUS are stopped, we are safe to modify code */ hlist_for_each_entry(p, t, &head, node) { - ftrace_code_disable(p, MCOUNT_ADDR); + ftrace_code_disable(p); ftrace_update_cnt++; } @@ -459,7 +487,7 @@ static int notrace __ftrace_update_code(void *ignore) ftrace_update_time = stop - start; ftrace_update_tot_cnt += ftrace_update_cnt; - __register_ftrace_function(&ftrace_shutdown_ops); + ftrace_enabled = save_ftrace_enabled; return 0; } @@ -515,11 +543,6 @@ static int __init ftrace_dyn_table_alloc(void) struct ftrace_page *pg; int cnt; int i; - int ret; - - ret = ftrace_dyn_arch_init(); - if (ret) - return ret; /* allocate a few pages */ ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL); @@ -557,11 +580,19 @@ static int __init ftrace_dyn_table_alloc(void) return 0; } -static int __init notrace ftrace_shutdown_init(void) +static int __init notrace ftrace_dynamic_init(void) { struct task_struct *p; + unsigned long addr; int ret; + addr = (unsigned long)ftrace_record_ip; + stop_machine_run(ftrace_dyn_arch_init, &addr, NR_CPUS); + + /* ftrace_dyn_arch_init places the return code in addr */ + if (addr) + return addr; + ret = ftrace_dyn_table_alloc(); if (ret) return ret; @@ -570,12 +601,12 @@ static int __init notrace ftrace_shutdown_init(void) if (IS_ERR(p)) return -1; - __register_ftrace_function(&ftrace_shutdown_ops); + last_ftrace_enabled = ftrace_enabled = 1; return 0; } -core_initcall(ftrace_shutdown_init); +core_initcall(ftrace_dynamic_init); #else # define ftrace_startup() do { } while (0) # define ftrace_shutdown() do { } while (0) @@ -599,9 +630,8 @@ int register_ftrace_function(struct ftrace_ops *ops) int ret; mutex_lock(&ftrace_sysctl_lock); - ftrace_startup(); - ret = __register_ftrace_function(ops); + ftrace_startup(); mutex_unlock(&ftrace_sysctl_lock); return ret; @@ -619,10 +649,7 @@ int 
unregister_ftrace_function(struct ftrace_ops *ops) mutex_lock(&ftrace_sysctl_lock); ret = __unregister_ftrace_function(ops); - - if (ftrace_list == &ftrace_list_end) - ftrace_shutdown(); - + ftrace_shutdown(); mutex_unlock(&ftrace_sysctl_lock); return ret; -- cgit v1.2.3 From 5072c59fd45e9976d02ee6f18c7336ef97623cbc Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:43 +0200 Subject: ftrace: add filter select functions to trace This patch adds two files to the debugfs system: /debugfs/tracing/available_filter_functions and /debugfs/tracing/set_ftrace_filter The available_filter_functions file lists all functions that have been recorded by ftraced, that is, all functions that have called ftrace_record_ip. This is to allow users to see which functions have been converted to nops and can be enabled for tracing. To enable functions, simply echo the names (whitespace delimited) into set_ftrace_filter. Simple wildcards are also allowed. echo 'scheduler' > /debugfs/tracing/set_ftrace_filter Will have only the function 'scheduler' be activated when tracing is enabled. echo 'sched_*' > /debugfs/tracing/set_ftrace_filter Will have only the functions starting with 'sched_' be activated. echo '*lock' > /debugfs/tracing/set_ftrace_filter Will have only functions ending with 'lock' be activated. echo '*lock*' > /debugfs/tracing/set_ftrace_filter Will have only functions with 'lock' in their names be activated. Note: 'sched*lock' will not work. The only wildcard allowed is an asterisk, at the beginning and/or end of the string passed in. Multiple names can be passed in, whitespace delimited: echo 'scheduler *lock *acpi*' > /debugfs/tracing/set_ftrace_filter is the same as: echo 'scheduler' > /debugfs/tracing/set_ftrace_filter echo '*lock' >> /debugfs/tracing/set_ftrace_filter echo '*acpi*' >> /debugfs/tracing/set_ftrace_filter Appending does just that. It appends to the list.
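As an illustration of the matching rules above, here is a minimal user-space sketch of the wildcard classification that the patch's ftrace_match() performs. The MATCH_* names come from the patch itself; the match_type() and sym_matches() helpers are hypothetical, simplified from the kernel code, and the sketch is not part of the patch:

#include <stdio.h>
#include <string.h>

enum { MATCH_FULL, MATCH_FRONT_ONLY, MATCH_MIDDLE_ONLY, MATCH_END_ONLY };

/*
 * Classify a filter pattern: a leading asterisk means "ends with",
 * a trailing asterisk means "starts with", both mean "contains".
 * Embedded asterisks (e.g. 'sched*lock') are not supported.
 * Simplified sketch of the patch's ftrace_match() parsing.
 */
static int match_type(char *buff, int len, char **search, int *slen)
{
	int type = MATCH_FULL;
	int i;

	*search = buff;
	*slen = len;
	for (i = 0; i < len; i++) {
		if (buff[i] != '*')
			continue;
		if (i == 0) {
			/* leading '*': "*lock" or "*lock*" */
			*search = buff + 1;
			*slen = len - 1;
			type = MATCH_END_ONLY;
		} else {
			if (type == MATCH_END_ONLY) {
				type = MATCH_MIDDLE_ONLY;	/* "*lock*" */
				*slen = i - 1;
			} else {
				type = MATCH_FRONT_ONLY;	/* "sched_*" */
				*slen = i;
			}
			buff[i] = '\0';	/* terminate at the trailing '*' */
			break;
		}
	}
	return type;
}

/* Return 1 if symbol name str matches the classified pattern. */
static int sym_matches(const char *str, const char *search, int type, int slen)
{
	const char *ptr;

	switch (type) {
	case MATCH_FULL:
		return strcmp(str, search) == 0;
	case MATCH_FRONT_ONLY:
		return strncmp(str, search, slen) == 0;
	case MATCH_MIDDLE_ONLY:
		return strstr(str, search) != NULL;
	case MATCH_END_ONLY:
		/* same check as the patch: first occurrence must be terminal */
		ptr = strstr(str, search);
		return ptr && ptr[slen] == '\0';
	}
	return 0;
}

int main(void)
{
	char pattern[] = "*lock";
	char *search;
	int slen;
	int type = match_type(pattern, (int)strlen(pattern), &search, &slen);

	/* matches: symbol ends with "lock" */
	printf("spin_lock: %d\n", sym_matches("spin_lock", search, type, slen));
	/* no match: "lock" is not at the end */
	printf("lock_page: %d\n", sym_matches("lock_page", search, type, slen));
	return 0;
}

Run with the pattern '*lock', this reports a match for spin_lock but not for lock_page, mirroring the "functions ending with 'lock'" rule described above.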
To disable all filters simply echo an empty line in: echo > /debugfs/tracing/set_ftrace_filter Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/ftrace.h | 4 +- kernel/trace/ftrace.c | 527 +++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 513 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index b0dd0093058f..f5911d2d42c3 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -43,7 +43,9 @@ extern void mcount(void); # define FTRACE_HASHSIZE (1< #include #include +#include +#include #include #include #include -#include +#include #include #include +#include #include #include "trace.h" @@ -151,12 +154,15 @@ enum { FTRACE_DISABLE_MCOUNT = (1 << 4), }; +static int ftrace_filtered; + static struct hlist_head ftrace_hash[FTRACE_HASHSIZE]; static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu); static DEFINE_SPINLOCK(ftrace_shutdown_lock); static DEFINE_MUTEX(ftraced_lock); +static DEFINE_MUTEX(ftrace_filter_lock); struct ftrace_page { struct ftrace_page *next; @@ -282,16 +288,82 @@ ftrace_record_ip(unsigned long ip) #define FTRACE_ADDR ((long)(&ftrace_caller)) #define MCOUNT_ADDR ((long)(&mcount)) -static void notrace ftrace_replace_code(int saved) +static void notrace +__ftrace_replace_code(struct dyn_ftrace *rec, + unsigned char *old, unsigned char *new, int enable) +{ + unsigned long ip; + int failed; + + ip = rec->ip; + + if (ftrace_filtered && enable) { + unsigned long fl; + /* + * If filtering is on: + * + * If this record is set to be filtered and + * is enabled then do nothing. + * + * If this record is set to be filtered and + * it is not enabled, enable it. + * + * If this record is not set to be filtered + * and it is not enabled do nothing. + * + * If this record is not set to be filtered and + * it is enabled, disable it. + */ + fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED); + + if ((fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) || + (fl == 0)) + return; + + /* + * If it is enabled disable it, + * otherwise enable it! 
+ */ + if (fl == FTRACE_FL_ENABLED) { + /* swap new and old */ + new = old; + old = ftrace_call_replace(ip, FTRACE_ADDR); + rec->flags &= ~FTRACE_FL_ENABLED; + } else { + new = ftrace_call_replace(ip, FTRACE_ADDR); + rec->flags |= FTRACE_FL_ENABLED; + } + } else { + + if (enable) + new = ftrace_call_replace(ip, FTRACE_ADDR); + else + old = ftrace_call_replace(ip, FTRACE_ADDR); + + if (enable) { + if (rec->flags & FTRACE_FL_ENABLED) + return; + rec->flags |= FTRACE_FL_ENABLED; + } else { + if (!(rec->flags & FTRACE_FL_ENABLED)) + return; + rec->flags &= ~FTRACE_FL_ENABLED; + } + } + + failed = ftrace_modify_code(ip, old, new); + if (failed) + rec->flags |= FTRACE_FL_FAILED; +} + +static void notrace ftrace_replace_code(int enable) { unsigned char *new = NULL, *old = NULL; struct dyn_ftrace *rec; struct ftrace_page *pg; - unsigned long ip; - int failed; int i; - if (saved) + if (enable) old = ftrace_nop_replace(); else new = ftrace_nop_replace(); @@ -304,16 +376,7 @@ static void notrace ftrace_replace_code(int saved) if (rec->flags & FTRACE_FL_FAILED) continue; - ip = rec->ip; - - if (saved) - new = ftrace_call_replace(ip, FTRACE_ADDR); - else - old = ftrace_call_replace(ip, FTRACE_ADDR); - - failed = ftrace_modify_code(ip, old, new); - if (failed) - rec->flags |= FTRACE_FL_FAILED; + __ftrace_replace_code(rec, old, new, enable); } } } @@ -580,6 +643,436 @@ static int __init ftrace_dyn_table_alloc(void) return 0; } +enum { + FTRACE_ITER_FILTER = (1 << 0), + FTRACE_ITER_CONT = (1 << 1), +}; + +#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ + +struct ftrace_iterator { + loff_t pos; + struct ftrace_page *pg; + unsigned idx; + unsigned flags; + unsigned char buffer[FTRACE_BUFF_MAX+1]; + unsigned buffer_idx; + unsigned filtered; +}; + +static void notrace * +t_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct ftrace_iterator *iter = m->private; + struct dyn_ftrace *rec = NULL; + + (*pos)++; + + retry: + if (iter->idx >= iter->pg->index) { + if (iter->pg->next) { + iter->pg = iter->pg->next; + iter->idx = 0; + goto retry; + } + } else { + rec = &iter->pg->records[iter->idx++]; + if ((rec->flags & FTRACE_FL_FAILED) || + ((iter->flags & FTRACE_ITER_FILTER) && + !(rec->flags & FTRACE_FL_FILTER))) { + rec = NULL; + goto retry; + } + } + + iter->pos = *pos; + + return rec; +} + +static void *t_start(struct seq_file *m, loff_t *pos) +{ + struct ftrace_iterator *iter = m->private; + void *p = NULL; + loff_t l = -1; + + if (*pos != iter->pos) { + for (p = t_next(m, p, &l); p && l < *pos; p = t_next(m, p, &l)) + ; + } else { + l = *pos; + p = t_next(m, p, &l); + } + + return p; +} + +static void t_stop(struct seq_file *m, void *p) +{ +} + +static int t_show(struct seq_file *m, void *v) +{ + struct dyn_ftrace *rec = v; + char str[KSYM_SYMBOL_LEN]; + + if (!rec) + return 0; + + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + + seq_printf(m, "%s\n", str); + + return 0; +} + +static struct seq_operations show_ftrace_seq_ops = { + .start = t_start, + .next = t_next, + .stop = t_stop, + .show = t_show, +}; + +static int notrace +ftrace_avail_open(struct inode *inode, struct file *file) +{ + struct ftrace_iterator *iter; + int ret; + + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; + + iter->pg = ftrace_pages_start; + iter->pos = -1; + + ret = seq_open(file, &show_ftrace_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = iter; + } else + kfree(iter); + + return ret; +} + +int ftrace_avail_release(struct inode *inode, struct 
file *file) +{ + struct seq_file *m = (struct seq_file *)file->private_data; + struct ftrace_iterator *iter = m->private; + + seq_release(inode, file); + kfree(iter); + return 0; +} + +static void notrace ftrace_filter_reset(void) +{ + struct ftrace_page *pg; + struct dyn_ftrace *rec; + unsigned i; + + /* keep kstop machine from running */ + preempt_disable(); + ftrace_filtered = 0; + pg = ftrace_pages_start; + while (pg) { + for (i = 0; i < pg->index; i++) { + rec = &pg->records[i]; + if (rec->flags & FTRACE_FL_FAILED) + continue; + rec->flags &= ~FTRACE_FL_FILTER; + } + pg = pg->next; + } + preempt_enable(); +} + +static int notrace +ftrace_filter_open(struct inode *inode, struct file *file) +{ + struct ftrace_iterator *iter; + int ret = 0; + + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; + + mutex_lock(&ftrace_filter_lock); + if ((file->f_mode & FMODE_WRITE) && + !(file->f_flags & O_APPEND)) + ftrace_filter_reset(); + + if (file->f_mode & FMODE_READ) { + iter->pg = ftrace_pages_start; + iter->pos = -1; + iter->flags = FTRACE_ITER_FILTER; + + ret = seq_open(file, &show_ftrace_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = iter; + } else + kfree(iter); + } else + file->private_data = iter; + mutex_unlock(&ftrace_filter_lock); + + return ret; +} + +static ssize_t notrace +ftrace_filter_read(struct file *file, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + if (file->f_mode & FMODE_READ) + return seq_read(file, ubuf, cnt, ppos); + else + return -EPERM; +} + +static loff_t notrace +ftrace_filter_lseek(struct file *file, loff_t offset, int origin) +{ + loff_t ret; + + if (file->f_mode & FMODE_READ) + ret = seq_lseek(file, offset, origin); + else + file->f_pos = ret = 1; + + return ret; +} + +enum { + MATCH_FULL, + MATCH_FRONT_ONLY, + MATCH_MIDDLE_ONLY, + MATCH_END_ONLY, +}; + +static void notrace +ftrace_match(unsigned char *buff, int len) +{ + char str[KSYM_SYMBOL_LEN]; + char *search = NULL; + struct ftrace_page *pg; + struct dyn_ftrace *rec; + int type = MATCH_FULL; + unsigned i, match = 0, search_len = 0; + + for (i = 0; i < len; i++) { + if (buff[i] == '*') { + if (!i) { + search = buff + i + 1; + type = MATCH_END_ONLY; + search_len = len - (i + 1); + } else { + if (type == MATCH_END_ONLY) { + type = MATCH_MIDDLE_ONLY; + } else { + match = i; + type = MATCH_FRONT_ONLY; + } + buff[i] = 0; + break; + } + } + } + + /* keep kstop machine from running */ + preempt_disable(); + ftrace_filtered = 1; + pg = ftrace_pages_start; + while (pg) { + for (i = 0; i < pg->index; i++) { + int matched = 0; + char *ptr; + + rec = &pg->records[i]; + if (rec->flags & FTRACE_FL_FAILED) + continue; + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + switch (type) { + case MATCH_FULL: + if (strcmp(str, buff) == 0) + matched = 1; + break; + case MATCH_FRONT_ONLY: + if (memcmp(str, buff, match) == 0) + matched = 1; + break; + case MATCH_MIDDLE_ONLY: + if (strstr(str, search)) + matched = 1; + break; + case MATCH_END_ONLY: + ptr = strstr(str, search); + if (ptr && (ptr[search_len] == 0)) + matched = 1; + break; + } + if (matched) + rec->flags |= FTRACE_FL_FILTER; + } + pg = pg->next; + } + preempt_enable(); +} + +static ssize_t notrace +ftrace_filter_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct ftrace_iterator *iter; + char ch; + size_t read = 0; + ssize_t ret; + + if (!cnt || cnt < 0) + return 0; + + mutex_lock(&ftrace_filter_lock); + + if (file->f_mode & FMODE_READ) { + struct seq_file *m = 
file->private_data; + iter = m->private; + } else + iter = file->private_data; + + if (!*ppos) { + iter->flags &= ~FTRACE_ITER_CONT; + iter->buffer_idx = 0; + } + + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + + if (!(iter->flags & ~FTRACE_ITER_CONT)) { + /* skip white space */ + while (cnt && isspace(ch)) { + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + } + + + if (isspace(ch)) { + file->f_pos += read; + ret = read; + goto out; + } + + iter->buffer_idx = 0; + } + + while (cnt && !isspace(ch)) { + if (iter->buffer_idx < FTRACE_BUFF_MAX) + iter->buffer[iter->buffer_idx++] = ch; + else { + ret = -EINVAL; + goto out; + } + ret = get_user(ch, ubuf++); + if (ret) + goto out; + read++; + cnt--; + } + + if (isspace(ch)) { + iter->filtered++; + iter->buffer[iter->buffer_idx] = 0; + ftrace_match(iter->buffer, iter->buffer_idx); + iter->buffer_idx = 0; + } else + iter->flags |= FTRACE_ITER_CONT; + + + file->f_pos += read; + + ret = read; + out: + mutex_unlock(&ftrace_filter_lock); + + return ret; +} + +static int notrace +ftrace_filter_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = (struct seq_file *)file->private_data; + struct ftrace_iterator *iter; + + mutex_lock(&ftrace_filter_lock); + if (file->f_mode & FMODE_READ) { + iter = m->private; + + seq_release(inode, file); + } else + iter = file->private_data; + + if (iter->buffer_idx) { + iter->filtered++; + iter->buffer[iter->buffer_idx] = 0; + ftrace_match(iter->buffer, iter->buffer_idx); + } + + mutex_lock(&ftrace_sysctl_lock); + mutex_lock(&ftraced_lock); + if (iter->filtered && ftraced_suspend && ftrace_enabled) + ftrace_run_update_code(FTRACE_ENABLE_CALLS); + mutex_unlock(&ftraced_lock); + mutex_unlock(&ftrace_sysctl_lock); + + kfree(iter); + mutex_unlock(&ftrace_filter_lock); + return 0; +} + +static struct file_operations ftrace_avail_fops = { + .open = ftrace_avail_open, + .read = seq_read, + .llseek = seq_lseek, + .release = ftrace_avail_release, +}; + +static struct file_operations ftrace_filter_fops = { + .open = ftrace_filter_open, + .read = ftrace_filter_read, + .write = ftrace_filter_write, + .llseek = ftrace_filter_lseek, + .release = ftrace_filter_release, +}; + +static __init int ftrace_init_debugfs(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + + entry = debugfs_create_file("available_filter_functions", 0444, + d_tracer, NULL, &ftrace_avail_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'available_filter_functions' entry\n"); + + entry = debugfs_create_file("set_ftrace_filter", 0644, d_tracer, + NULL, &ftrace_filter_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'set_ftrace_filter' entry\n"); + return 0; +} + +fs_initcall(ftrace_init_debugfs); + static int __init notrace ftrace_dynamic_init(void) { struct task_struct *p; @@ -657,14 +1150,14 @@ int unregister_ftrace_function(struct ftrace_ops *ops) notrace int ftrace_enable_sysctl(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + struct file *file, void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; mutex_lock(&ftrace_sysctl_lock); - ret = proc_dointvec(table, write, filp, buffer, lenp, ppos); + ret = proc_dointvec(table, write, file, buffer, lenp, ppos); if (ret || !write || (last_ftrace_enabled == ftrace_enabled)) goto out; -- cgit v1.2.3 From f43fdad8627fec2d21df92799b254dceb66c9c3c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:43 
+0200 Subject: ftrace: fix kexec disable the tracer while kexec pulls the rug from under the old kernel. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/machine_kexec_32.c | 4 ++++ arch/x86/kernel/machine_kexec_64.c | 4 ++++ include/linux/ftrace.h | 7 +++++++ 3 files changed, 15 insertions(+) (limited to 'include/linux') diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index d0b234c9fc31..88923fd7a6fc 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -11,6 +11,8 @@ #include #include #include +#include + #include #include #include @@ -107,6 +109,8 @@ NORET_TYPE void machine_kexec(struct kimage *image) unsigned long page_list[PAGES_NR]; void *control_page; + tracer_disable(); + /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 576a03db4511..1558fdc174f9 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -11,6 +11,8 @@ #include #include #include +#include + #include #include #include @@ -184,6 +186,8 @@ NORET_TYPE void machine_kexec(struct kimage *image) unsigned long page_list[PAGES_NR]; void *control_page; + tracer_disable(); + /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index f5911d2d42c3..a42390c1d6e1 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -68,6 +68,13 @@ extern void ftrace_call(void); extern void mcount_call(void); #endif +static inline void tracer_disable(void) +{ +#ifdef CONFIG_FTRACE + ftrace_enabled = 0; +#endif +} + #ifdef CONFIG_FRAME_POINTER /* TODO: need to fix this for ARM */ # define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) -- cgit v1.2.3 From e1c08bdd9fa73e44096e5a82c0d5928b04ab02c8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:44 +0200 Subject: ftrace: force recording Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/ftrace.h | 4 ++++ kernel/trace/ftrace.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index a42390c1d6e1..2c1670c65236 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -54,6 +54,8 @@ struct dyn_ftrace { unsigned long flags; }; +int ftrace_force_update(void); + /* defined in arch */ extern int ftrace_ip_converted(unsigned long ip); extern unsigned char *ftrace_nop_replace(void); @@ -66,6 +68,8 @@ extern int ftrace_update_ftrace_func(ftrace_func_t func); extern void ftrace_caller(void); extern void ftrace_call(void); extern void mcount_call(void); +#else +# define ftrace_force_update() do { } while (0) #endif static inline void tracer_disable(void) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 97d5cb7b7e75..4facf5ceeb86 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -146,6 +146,10 @@ static int notrace __unregister_ftrace_function(struct ftrace_ops *ops) #ifdef CONFIG_DYNAMIC_FTRACE +static struct task_struct *ftraced_task; +static DECLARE_WAIT_QUEUE_HEAD(ftraced_waiters); +static unsigned long ftraced_iteration_counter; + enum { FTRACE_ENABLE_CALLS = (1 << 0), FTRACE_DISABLE_CALLS = (1 << 1), @@ -590,9 +594,12 @@ static int notrace ftraced(void *ignore) ftraced_trigger = 0; ftrace_record_suspend--; } + ftraced_iteration_counter++; 
mutex_unlock(&ftraced_lock); mutex_unlock(&ftrace_sysctl_lock); + wake_up_interruptible(&ftraced_waiters); + ftrace_shutdown_replenish(); set_current_state(TASK_INTERRUPTIBLE); @@ -1050,6 +1057,49 @@ static struct file_operations ftrace_filter_fops = { .release = ftrace_filter_release, }; +/** + * ftrace_force_update - force an update to all recording ftrace functions + * + * The ftrace dynamic update daemon only wakes up once a second. + * There may be cases where an update needs to be done immediately + * for tests or internal kernel tracing to begin. This function + * wakes the daemon to do an update and will not return until the + * update is complete. + */ +int ftrace_force_update(void) +{ + unsigned long last_counter; + DECLARE_WAITQUEUE(wait, current); + int ret = 0; + + if (!ftraced_task) + return -ENODEV; + + mutex_lock(&ftraced_lock); + last_counter = ftraced_iteration_counter; + + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&ftraced_waiters, &wait); + + do { + mutex_unlock(&ftraced_lock); + wake_up_process(ftraced_task); + schedule(); + mutex_lock(&ftraced_lock); + if (signal_pending(current)) { + ret = -EINTR; + break; + } + set_current_state(TASK_INTERRUPTIBLE); + } while (last_counter == ftraced_iteration_counter); + + mutex_unlock(&ftraced_lock); + remove_wait_queue(&ftraced_waiters, &wait); + set_current_state(TASK_RUNNING); + + return ret; +} + static __init int ftrace_init_debugfs(void) { struct dentry *d_tracer; @@ -1095,6 +1145,7 @@ static int __init notrace ftrace_dynamic_init(void) return -1; last_ftrace_enabled = ftrace_enabled = 1; + ftraced_task = p; return 0; } -- cgit v1.2.3 From c7aafc549766b87819285d3480648fc652a47bc4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:45 +0200 Subject: ftrace: cleanups factor out code and clean it up. 
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/ftrace.h | 2 +- kernel/trace/ftrace.c | 8 +-- kernel/trace/trace.c | 144 ++++++++++++++++++++++++-------------- kernel/trace/trace.h | 8 ++- kernel/trace/trace_irqsoff.c | 32 ++++----- kernel/trace/trace_sched_wakeup.c | 18 ++--- kernel/trace/trace_selftest.c | 25 ++++--- 7 files changed, 134 insertions(+), 103 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 2c1670c65236..953a36d6a199 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -69,7 +69,7 @@ extern void ftrace_caller(void); extern void ftrace_call(void); extern void mcount_call(void); #else -# define ftrace_force_update() do { } while (0) +# define ftrace_force_update() ({ 0; }) #endif static inline void tracer_disable(void) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4facf5ceeb86..6d4d2e86debc 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1152,10 +1152,10 @@ static int __init notrace ftrace_dynamic_init(void) core_initcall(ftrace_dynamic_init); #else -# define ftrace_startup() do { } while (0) -# define ftrace_shutdown() do { } while (0) -# define ftrace_startup_sysctl() do { } while (0) -# define ftrace_shutdown_sysctl() do { } while (0) +# define ftrace_startup() do { } while (0) +# define ftrace_shutdown() do { } while (0) +# define ftrace_startup_sysctl() do { } while (0) +# define ftrace_shutdown_sysctl() do { } while (0) #endif /* CONFIG_DYNAMIC_FTRACE */ /** diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f6d026f17dbb..61d2f0228866 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -142,12 +142,59 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) tracing_record_cmdline(current); } +void check_pages(struct trace_array_cpu *data) +{ + struct page *page, *tmp; + + BUG_ON(data->trace_pages.next->prev != &data->trace_pages); + BUG_ON(data->trace_pages.prev->next != &data->trace_pages); + + list_for_each_entry_safe(page, tmp, &data->trace_pages, lru) { + BUG_ON(page->lru.next->prev != &page->lru); + BUG_ON(page->lru.prev->next != &page->lru); + } +} + +void *head_page(struct trace_array_cpu *data) +{ + struct page *page; + + check_pages(data); + if (list_empty(&data->trace_pages)) + return NULL; + + page = list_entry(data->trace_pages.next, struct page, lru); + BUG_ON(&page->lru == &data->trace_pages); + + return page_address(page); +} + +notrace static void +flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2) +{ + struct list_head flip_pages; + + INIT_LIST_HEAD(&flip_pages); + + tr1->trace_current = NULL; + memcpy(&tr1->trace_current_idx, &tr2->trace_current_idx, + sizeof(struct trace_array_cpu) - + offsetof(struct trace_array_cpu, trace_current_idx)); + + check_pages(tr1); + check_pages(tr2); + list_splice_init(&tr1->trace_pages, &flip_pages); + list_splice_init(&tr2->trace_pages, &tr1->trace_pages); + list_splice_init(&flip_pages, &tr2->trace_pages); + BUG_ON(!list_empty(&flip_pages)); + check_pages(tr1); + check_pages(tr2); +} + notrace void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { struct trace_array_cpu *data; - void *save_trace; - struct list_head save_pages; int i; WARN_ON_ONCE(!irqs_disabled()); @@ -155,11 +202,7 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) /* clear out all the previous traces */ for_each_possible_cpu(i) { data = tr->data[i]; - save_trace = max_tr.data[i]->trace; - save_pages = 
max_tr.data[i]->trace_pages; - memcpy(max_tr.data[i], data, sizeof(*data)); - data->trace = save_trace; - data->trace_pages = save_pages; + flip_trace(max_tr.data[i], data); tracing_reset(data); } @@ -177,8 +220,6 @@ notrace void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) { struct trace_array_cpu *data = tr->data[cpu]; - void *save_trace; - struct list_head save_pages; int i; WARN_ON_ONCE(!irqs_disabled()); @@ -186,11 +227,8 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) for_each_possible_cpu(i) tracing_reset(max_tr.data[i]); - save_trace = max_tr.data[cpu]->trace; - save_pages = max_tr.data[cpu]->trace_pages; - memcpy(max_tr.data[cpu], data, sizeof(*data)); - data->trace = save_trace; - data->trace_pages = save_pages; + flip_trace(max_tr.data[cpu], data); + tracing_reset(data); __update_max_tr(tr, tsk, cpu); @@ -234,9 +272,9 @@ int register_tracer(struct tracer *type) * If we fail, we do not register this tracer. */ for_each_possible_cpu(i) { - if (!data->trace) - continue; data = tr->data[i]; + if (!head_page(data)) + continue; tracing_reset(data); } current_trace = type; @@ -298,7 +336,7 @@ void unregister_tracer(struct tracer *type) void notrace tracing_reset(struct trace_array_cpu *data) { data->trace_idx = 0; - data->trace_current = data->trace; + data->trace_current = head_page(data); data->trace_current_idx = 0; } @@ -425,26 +463,31 @@ notrace void tracing_record_cmdline(struct task_struct *tsk) } static inline notrace struct trace_entry * -tracing_get_trace_entry(struct trace_array *tr, - struct trace_array_cpu *data) +tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data) { unsigned long idx, idx_next; struct trace_entry *entry; - struct page *page; struct list_head *next; + struct page *page; data->trace_idx++; idx = data->trace_current_idx; idx_next = idx + 1; + BUG_ON(idx * TRACE_ENTRY_SIZE >= PAGE_SIZE); + entry = data->trace_current + idx * TRACE_ENTRY_SIZE; if (unlikely(idx_next >= ENTRIES_PER_PAGE)) { page = virt_to_page(data->trace_current); - if (unlikely(&page->lru == data->trace_pages.prev)) - next = data->trace_pages.next; - else - next = page->lru.next; + /* + * Roundrobin - but skip the head (which is not a real page): + */ + next = page->lru.next; + if (unlikely(next == &data->trace_pages)) + next = next->next; + BUG_ON(next == &data->trace_pages); + page = list_entry(next, struct page, lru); data->trace_current = page_address(page); idx_next = 0; @@ -456,18 +499,17 @@ tracing_get_trace_entry(struct trace_array *tr, } static inline notrace void -tracing_generic_entry_update(struct trace_entry *entry, - unsigned long flags) +tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags) { struct task_struct *tsk = current; unsigned long pc; pc = preempt_count(); - entry->idx = atomic_inc_return(&tracer_counter); - entry->preempt_count = pc & 0xff; - entry->pid = tsk->pid; - entry->t = now(raw_smp_processor_id()); + entry->idx = atomic_inc_return(&tracer_counter); + entry->preempt_count = pc & 0xff; + entry->pid = tsk->pid; + entry->t = now(raw_smp_processor_id()); entry->flags = (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | ((pc & SOFTIRQ_MASK) ? 
TRACE_FLAG_SOFTIRQ : 0) | @@ -476,16 +518,15 @@ tracing_generic_entry_update(struct trace_entry *entry, notrace void ftrace(struct trace_array *tr, struct trace_array_cpu *data, - unsigned long ip, unsigned long parent_ip, - unsigned long flags) + unsigned long ip, unsigned long parent_ip, unsigned long flags) { struct trace_entry *entry; - entry = tracing_get_trace_entry(tr, data); + entry = tracing_get_trace_entry(tr, data); tracing_generic_entry_update(entry, flags); - entry->type = TRACE_FN; - entry->fn.ip = ip; - entry->fn.parent_ip = parent_ip; + entry->type = TRACE_FN; + entry->fn.ip = ip; + entry->fn.parent_ip = parent_ip; } notrace void @@ -496,7 +537,7 @@ tracing_sched_switch_trace(struct trace_array *tr, { struct trace_entry *entry; - entry = tracing_get_trace_entry(tr, data); + entry = tracing_get_trace_entry(tr, data); tracing_generic_entry_update(entry, flags); entry->type = TRACE_CTX; entry->ctx.prev_pid = prev->pid; @@ -540,6 +581,8 @@ trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data, } page = list_entry(iter->next_page[cpu], struct page, lru); + BUG_ON(&data->trace_pages == &page->lru); + array = page_address(page); return &array[iter->next_page_idx[cpu]]; @@ -554,7 +597,7 @@ find_next_entry(struct trace_iterator *iter, int *ent_cpu) int cpu; for_each_possible_cpu(cpu) { - if (!tr->data[cpu]->trace) + if (!head_page(tr->data[cpu])) continue; ent = trace_entry_idx(tr, tr->data[cpu], iter, cpu); if (ent && @@ -762,7 +805,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) name = type->name; for_each_possible_cpu(cpu) { - if (tr->data[cpu]->trace) { + if (head_page(tr->data[cpu])) { total += tr->data[cpu]->trace_idx; if (tr->data[cpu]->trace_idx > tr->entries) entries += tr->entries; @@ -975,8 +1018,7 @@ static int trace_empty(struct trace_iterator *iter) for_each_possible_cpu(cpu) { data = iter->tr->data[cpu]; - if (data->trace && - data->trace_idx) + if (head_page(data) && data->trace_idx) return 0; } return 1; @@ -1576,9 +1618,9 @@ static struct tracer no_tracer __read_mostly = static int trace_alloc_page(void) { struct trace_array_cpu *data; - void *array; struct page *page, *tmp; LIST_HEAD(pages); + void *array; int i; /* first allocate a page for each CPU */ @@ -1610,14 +1652,14 @@ static int trace_alloc_page(void) for_each_possible_cpu(i) { data = global_trace.data[i]; page = list_entry(pages.next, struct page, lru); - list_del(&page->lru); + list_del_init(&page->lru); list_add_tail(&page->lru, &data->trace_pages); ClearPageLRU(page); #ifdef CONFIG_TRACER_MAX_TRACE data = max_tr.data[i]; page = list_entry(pages.next, struct page, lru); - list_del(&page->lru); + list_del_init(&page->lru); list_add_tail(&page->lru, &data->trace_pages); SetPageLRU(page); #endif @@ -1628,7 +1670,7 @@ static int trace_alloc_page(void) free_pages: list_for_each_entry_safe(page, tmp, &pages, lru) { - list_del(&page->lru); + list_del_init(&page->lru); __free_page(page); } return -ENOMEM; @@ -1654,7 +1696,6 @@ __init static int tracer_alloc_buffers(void) "for trace buffer!\n"); goto free_buffers; } - data->trace = array; /* set the array to the list */ INIT_LIST_HEAD(&data->trace_pages); @@ -1671,7 +1712,6 @@ __init static int tracer_alloc_buffers(void) "for trace buffer!\n"); goto free_buffers; } - max_tr.data[i]->trace = array; INIT_LIST_HEAD(&max_tr.data[i]->trace_pages); page = virt_to_page(array); @@ -1716,24 +1756,22 @@ __init static int tracer_alloc_buffers(void) struct page *page, *tmp; struct trace_array_cpu *data = global_trace.data[i]; - if 
(data && data->trace) { + if (data) { list_for_each_entry_safe(page, tmp, &data->trace_pages, lru) { - list_del(&page->lru); + list_del_init(&page->lru); __free_page(page); } - data->trace = NULL; } #ifdef CONFIG_TRACER_MAX_TRACE data = max_tr.data[i]; - if (data && data->trace) { + if (data) { list_for_each_entry_safe(page, tmp, &data->trace_pages, lru) { - list_del(&page->lru); + list_del_init(&page->lru); __free_page(page); } - data->trace = NULL; } #endif } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 88edbf1f6788..cc1d34b8b771 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -53,12 +53,12 @@ struct trace_entry { * the trace, etc.) */ struct trace_array_cpu { - void *trace; void *trace_current; - unsigned trace_current_idx; struct list_head trace_pages; - unsigned long trace_idx; atomic_t disabled; + /* these fields get copied into max-trace: */ + unsigned trace_current_idx; + unsigned long trace_idx; unsigned long saved_latency; unsigned long critical_start; unsigned long critical_end; @@ -216,4 +216,6 @@ extern int trace_selftest_startup_sched_switch(struct tracer *trace, #endif #endif /* CONFIG_FTRACE_STARTUP_TEST */ +extern void *head_page(struct trace_array_cpu *data); + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 14183b8f79c5..2dfebb67fdfb 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -144,7 +144,7 @@ check_critical_timing(struct trace_array *tr, if (!report_latency(delta)) goto out; - spin_lock(&max_trace_lock); + spin_lock_irqsave(&max_trace_lock, flags); /* check if we are still the max latency */ if (!report_latency(delta)) @@ -165,32 +165,24 @@ check_critical_timing(struct trace_array *tr, update_max_tr_single(tr, current, cpu); - if (tracing_thresh) - printk(KERN_INFO "(%16s-%-5d|#%d): %lu us critical section " - "violates %lu us threshold.\n" - " => started at timestamp %lu: ", + if (tracing_thresh) { + printk(KERN_INFO "(%16s-%-5d|#%d):" + " %lu us critical section violates %lu us threshold.\n", current->comm, current->pid, raw_smp_processor_id(), - latency, nsecs_to_usecs(tracing_thresh), t0); - else + latency, nsecs_to_usecs(tracing_thresh)); + } else { printk(KERN_INFO "(%16s-%-5d|#%d):" - " new %lu us maximum-latency " - "critical section.\n => started at timestamp %lu: ", + " new %lu us maximum-latency critical section.\n", current->comm, current->pid, raw_smp_processor_id(), - latency, t0); - - print_symbol(KERN_CONT "<%s>\n", data->critical_start); - printk(KERN_CONT " => ended at timestamp %lu: ", t1); - print_symbol(KERN_CONT "<%s>\n", data->critical_end); - dump_stack(); - t1 = nsecs_to_usecs(now(cpu)); - printk(KERN_CONT " => dump-end timestamp %lu\n\n", t1); + latency); + } max_sequence++; out_unlock: - spin_unlock(&max_trace_lock); + spin_unlock_irqrestore(&max_trace_lock, flags); out: data->critical_sequence = max_sequence; @@ -216,7 +208,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) cpu = raw_smp_processor_id(); data = tr->data[cpu]; - if (unlikely(!data) || unlikely(!data->trace) || + if (unlikely(!data) || unlikely(!head_page(data)) || atomic_read(&data->disabled)) return; @@ -256,7 +248,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) cpu = raw_smp_processor_id(); data = tr->data[cpu]; - if (unlikely(!data) || unlikely(!data->trace) || + if (unlikely(!data) || unlikely(!head_page(data)) || !data->critical_start || atomic_read(&data->disabled)) return; diff --git 
a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 3d10ff01f805..688df965f3f2 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -107,24 +107,18 @@ wakeup_sched_switch(struct task_struct *prev, struct task_struct *next) update_max_tr(tr, wakeup_task, wakeup_cpu); if (tracing_thresh) { - printk(KERN_INFO "(%16s-%-5d|#%d): %lu us wakeup latency " - "violates %lu us threshold.\n" - " => started at timestamp %lu: ", + printk(KERN_INFO "(%16s-%-5d|#%d):" + " %lu us wakeup latency violates %lu us threshold.\n", wakeup_task->comm, wakeup_task->pid, raw_smp_processor_id(), - latency, nsecs_to_usecs(tracing_thresh), t0); + latency, nsecs_to_usecs(tracing_thresh)); } else { - printk(KERN_INFO "(%16s-%-5d|#%d): new %lu us maximum " - "wakeup latency.\n => started at timestamp %lu: ", + printk(KERN_INFO "(%16s-%-5d|#%d):" + " new %lu us maximum wakeup latency.\n", wakeup_task->comm, wakeup_task->pid, - cpu, latency, t0); + cpu, latency); } - printk(KERN_CONT " ended at timestamp %lu: ", t1); - dump_stack(); - t1 = nsecs_to_usecs(now(cpu)); - printk(KERN_CONT " dump-end timestamp %lu\n\n", t1); - out_unlock: __wakeup_reset(tr); spin_unlock_irqrestore(&wakeup_lock, flags); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index ef4d3cc009f5..c01874c3b1f9 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -1,6 +1,7 @@ /* Include in trace.c */ #include +#include static inline int trace_valid_entry(struct trace_entry *entry) { @@ -15,28 +16,29 @@ static inline int trace_valid_entry(struct trace_entry *entry) static int trace_test_buffer_cpu(struct trace_array *tr, struct trace_array_cpu *data) { - struct page *page; struct trace_entry *entries; + struct page *page; int idx = 0; int i; + BUG_ON(list_empty(&data->trace_pages)); page = list_entry(data->trace_pages.next, struct page, lru); entries = page_address(page); - if (data->trace != entries) + if (head_page(data) != entries) goto failed; /* * The starting trace buffer always has valid elements, - * if any element exits. + * if any element exists. */ - entries = data->trace; + entries = head_page(data); for (i = 0; i < tr->entries; i++) { - if (i < data->trace_idx && - !trace_valid_entry(&entries[idx])) { - printk(KERN_CONT ".. invalid entry %d ", entries[idx].type); + if (i < data->trace_idx && !trace_valid_entry(&entries[idx])) { + printk(KERN_CONT ".. invalid entry %d ", + entries[idx].type); goto failed; } @@ -80,11 +82,10 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) int ret = 0; for_each_possible_cpu(cpu) { - if (!tr->data[cpu]->trace) + if (!head_page(tr->data[cpu])) continue; cnt += tr->data[cpu]->trace_idx; - printk("%d: count = %ld\n", cpu, cnt); ret = trace_test_buffer_cpu(tr, tr->data[cpu]); if (ret) @@ -117,6 +118,8 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) } /* start the tracing */ + ftrace_enabled = 1; + tr->ctrl = 1; trace->init(tr); /* Sleep for a 1/10 of a second */ @@ -124,6 +127,8 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) /* stop the tracing. 
*/ tr->ctrl = 0; trace->ctrl_update(tr); + ftrace_enabled = 0; + /* check the trace buffer */ ret = trace_test_buffer(tr, &count); trace->reset(tr); @@ -328,7 +333,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) /* create a high prio thread */ p = kthread_run(trace_wakeup_test_thread, &isrt, "ftrace-test"); - if (!IS_ERR(p)) { + if (IS_ERR(p)) { printk(KERN_CONT "Failed to create ftrace wakeup test thread "); return -1; } -- cgit v1.2.3 From 77a2b37d227483fe52aead242652aee406c25bf0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:45 +0200 Subject: ftrace: startup tester on dynamic tracing. This patch adds a startup self test on dynamic code modification and filters. The test filters on a specific function, makes sure that no other function is traced, executes the function, then makes sure that the function is traced. This patch also fixes a slight bug with the ftrace selftest, where tracer_enabled was not being set. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/ftrace.h | 2 + kernel/trace/ftrace.c | 19 +++++++ kernel/trace/trace_selftest.c | 113 ++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 130 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 953a36d6a199..a842d96c6343 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -55,6 +55,7 @@ struct dyn_ftrace { }; int ftrace_force_update(void); +void ftrace_set_filter(unsigned char *buf, int len, int reset); /* defined in arch */ extern int ftrace_ip_converted(unsigned long ip); @@ -70,6 +71,7 @@ extern void ftrace_call(void); extern void mcount_call(void); #else # define ftrace_force_update() ({ 0; }) +# define ftrace_set_filter(buf, len, reset) do { } while (0) #endif static inline void tracer_disable(void) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6d4d2e86debc..5e9389faaf75 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1010,6 +1010,25 @@ ftrace_filter_write(struct file *file, const char __user *ubuf, return ret; } +/** + * ftrace_set_filter - set a function to filter on in ftrace + * @buf - the string that holds the function filter text. + * @len - the length of the string. + * @reset - non zero to reset all filters before applying this filter. + * + * Filters denote which functions should be enabled when tracing is enabled. + * If @buf is NULL and reset is set, all functions will be enabled for tracing. 
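+ *
+ * For example, the dynamic tracing self test below restricts tracing
+ * to a single function with:
+ *	ftrace_set_filter(STR(DYN_FTRACE_TEST_NAME),
+ *		sizeof(STR(DYN_FTRACE_TEST_NAME)), 1);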
+ */ +notrace void ftrace_set_filter(unsigned char *buf, int len, int reset) +{ + mutex_lock(&ftrace_filter_lock); + if (reset) + ftrace_filter_reset(); + if (buf) + ftrace_match(buf, len); + mutex_unlock(&ftrace_filter_lock); +} + static int notrace ftrace_filter_release(struct inode *inode, struct file *file) { diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index c01874c3b1f9..4c8a1b2d8231 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -99,6 +99,100 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) } #ifdef CONFIG_FTRACE + +#ifdef CONFIG_DYNAMIC_FTRACE + +#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func +#define __STR(x) #x +#define STR(x) __STR(x) +static int DYN_FTRACE_TEST_NAME(void) +{ + /* used to call mcount */ + return 0; +} + +/* Test dynamic code modification and ftrace filters */ +int trace_selftest_startup_dynamic_tracing(struct tracer *trace, + struct trace_array *tr, + int (*func)(void)) +{ + unsigned long count; + int ret; + int save_ftrace_enabled = ftrace_enabled; + int save_tracer_enabled = tracer_enabled; + + /* The ftrace test PASSED */ + printk(KERN_CONT "PASSED\n"); + pr_info("Testing dynamic ftrace: "); + + /* enable tracing, and record the filter function */ + ftrace_enabled = 1; + tracer_enabled = 1; + + /* passed in by parameter to fool gcc from optimizing */ + func(); + + /* update the records */ + ret = ftrace_force_update(); + if (ret) { + printk(KERN_CONT ".. ftraced failed .. "); + return ret; + } + + /* filter only on our function */ + ftrace_set_filter(STR(DYN_FTRACE_TEST_NAME), + sizeof(STR(DYN_FTRACE_TEST_NAME)), 1); + + /* enable tracing */ + tr->ctrl = 1; + trace->init(tr); + /* Sleep for a 1/10 of a second */ + msleep(100); + + /* we should have nothing in the buffer */ + ret = trace_test_buffer(tr, &count); + if (ret) + goto out; + + if (count) { + ret = -1; + printk(KERN_CONT ".. filter did not filter .. "); + goto out; + } + + /* call our function again */ + func(); + + /* sleep again */ + msleep(100); + + /* stop the tracing. */ + tr->ctrl = 0; + trace->ctrl_update(tr); + ftrace_enabled = 0; + + /* check the trace buffer */ + ret = trace_test_buffer(tr, &count); + trace->reset(tr); + + /* we should only have one item */ + if (!ret && count != 1) { + printk(KERN_CONT ".. filter failed .."); + ret = -1; + goto out; + } + out: + ftrace_enabled = save_ftrace_enabled; + tracer_enabled = save_tracer_enabled; + + /* Enable tracing on all functions again */ + ftrace_set_filter(NULL, 0, 1); + + return ret; +} +#else +# define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; }) +#endif /* CONFIG_DYNAMIC_FTRACE */ /* * Simple verification test of ftrace function tracer. * Enable ftrace, sleep 1/10 second, and then read the trace @@ -109,8 +203,13 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) { unsigned long count; int ret; + int save_ftrace_enabled = ftrace_enabled; + int save_tracer_enabled = tracer_enabled; - /* make sure functions have been recorded */ + /* make sure msleep has been recorded */ + msleep(1); + + /* force the recorded functions to be traced */ ret = ftrace_force_update(); if (ret) { printk(KERN_CONT ".. ftraced failed .. 
"); @@ -119,6 +218,7 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) /* start the tracing */ ftrace_enabled = 1; + tracer_enabled = 1; tr->ctrl = 1; trace->init(tr); @@ -136,8 +236,16 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) if (!ret && !count) { printk(KERN_CONT ".. no entries found .."); ret = -1; + goto out; } + ret = trace_selftest_startup_dynamic_tracing(trace, tr, + DYN_FTRACE_TEST_NAME); + + out: + ftrace_enabled = save_ftrace_enabled; + tracer_enabled = save_tracer_enabled; + return ret; } #endif /* CONFIG_FTRACE */ @@ -415,6 +523,3 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr return ret; } #endif /* CONFIG_CONTEXT_SWITCH_TRACER */ - -#ifdef CONFIG_DYNAMIC_FTRACE -#endif /* CONFIG_DYNAMIC_FTRACE */ -- cgit v1.2.3 From 37ad508419f0fdfda7b378756eb1f35cfd26d96d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:48 +0200 Subject: ftrace - fix dynamic ftrace memory leak The ftrace dynamic function update allocates a record to store the instruction pointers that are being modified. If the modified instruction pointer fails to update, then the record is marked as failed and nothing more is done. Worse, if the modification fails, but the record ip function is still called, it will allocate a new record and try again. In just a matter of time, will this cause a serious memory leak and crash the system. This patch plugs this memory leak. When a record fails, it is included back into the pool of records to be used. Now a record may fail over and over again, but the number of allocated records will not increase. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/ftrace.h | 7 ++++--- kernel/trace/ftrace.c | 45 ++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 46 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index a842d96c6343..61e757bd2350 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -43,9 +43,10 @@ extern void mcount(void); # define FTRACE_HASHSIZE (1<node, &ftrace_hash[key]); } +static notrace void ftrace_free_rec(struct dyn_ftrace *rec) +{ + /* no locking, only called from kstop_machine */ + + rec->ip = (unsigned long)ftrace_free_records; + ftrace_free_records = rec; + rec->flags |= FTRACE_FL_FREE; +} + static notrace struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) { + struct dyn_ftrace *rec; + + /* First check for freed records */ + if (ftrace_free_records) { + rec = ftrace_free_records; + + /* todo, disable tracing altogether on this warning */ + if (unlikely(!(rec->flags & FTRACE_FL_FREE))) { + WARN_ON_ONCE(1); + ftrace_free_records = NULL; + return NULL; + } + + ftrace_free_records = (void *)rec->ip; + memset(rec, 0, sizeof(*rec)); + return rec; + } + if (ftrace_pages->index == ENTRIES_PER_PAGE) { if (!ftrace_pages->next) return NULL; @@ -356,8 +385,16 @@ __ftrace_replace_code(struct dyn_ftrace *rec, } failed = ftrace_modify_code(ip, old, new); - if (failed) - rec->flags |= FTRACE_FL_FAILED; + if (failed) { + unsigned long key; + /* It is possible that the function hasn't been converted yet */ + key = hash_long(ip, FTRACE_HASHBITS); + if (!ftrace_ip_in_hash(ip, key)) { + rec->flags |= FTRACE_FL_FAILED; + ftrace_free_rec(rec); + } + + } } static void notrace ftrace_replace_code(int enable) @@ -407,8 +444,10 @@ ftrace_code_disable(struct dyn_ftrace *rec) call = 
ftrace_call_replace(ip, MCOUNT_ADDR); failed = ftrace_modify_code(ip, call, nop); - if (failed) + if (failed) { rec->flags |= FTRACE_FL_FAILED; + ftrace_free_rec(rec); + } } static int notrace __ftrace_modify_code(void *data) -- cgit v1.2.3 From 4eebcc81a33fbc45e28542b50197ed7b3c486d90 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:48 +0200 Subject: ftrace: disable tracing on failure Since ftrace touches practically every function, if we detect any anomaly we want to fully disable ftrace. This patch adds code to try to shut down ftrace as much as possible without doing any more harm if something is detected to be not quite correct. This only kills ftrace; this patch does not add checks for other parts of the tracer (irqsoff, wakeup, etc.). Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/ftrace.h | 3 ++ kernel/trace/ftrace.c | 112 ++++++++++++++++++++++++++++++++++++++---- kernel/trace/trace_selftest.c | 4 ++ 3 files changed, 110 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 61e757bd2350..4650a3160b7f 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -58,6 +58,9 @@ struct dyn_ftrace { int ftrace_force_update(void); void ftrace_set_filter(unsigned char *buf, int len, int reset); +/* totally disable ftrace - can not re-enable after this */ +void ftrace_kill(void); + /* defined in arch */ extern int ftrace_ip_converted(unsigned long ip); extern unsigned char *ftrace_nop_replace(void); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8e02aa690b2b..ff42345dd78e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -29,9 +29,16 @@ #include "trace.h" -int ftrace_enabled; +/* ftrace_enabled is a method to turn ftrace on or off */ +int ftrace_enabled __read_mostly; static int last_ftrace_enabled; +/* + * ftrace_disabled is set when an anomaly is discovered. + * ftrace_disabled is much stronger than ftrace_enabled. 
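+ * Unlike ftrace_enabled, ftrace_disabled is never cleared once set:
+ * the entry points below all check it and bail out early.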
+ */ +static int ftrace_disabled __read_mostly; + static DEFINE_SPINLOCK(ftrace_lock); static DEFINE_MUTEX(ftrace_sysctl_lock); @@ -230,10 +237,11 @@ static notrace struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) if (ftrace_free_records) { rec = ftrace_free_records; - /* todo, disable tracing altogether on this warning */ if (unlikely(!(rec->flags & FTRACE_FL_FREE))) { WARN_ON_ONCE(1); ftrace_free_records = NULL; + ftrace_disabled = 1; + ftrace_enabled = 0; return NULL; } @@ -260,7 +268,7 @@ ftrace_record_ip(unsigned long ip) int resched; int atomic; - if (!ftrace_enabled) + if (!ftrace_enabled || ftrace_disabled) return; resched = need_resched(); @@ -485,6 +493,9 @@ static void notrace ftrace_startup(void) { int command = 0; + if (unlikely(ftrace_disabled)) + return; + mutex_lock(&ftraced_lock); ftraced_suspend++; if (ftraced_suspend == 1) @@ -507,6 +518,9 @@ static void notrace ftrace_shutdown(void) { int command = 0; + if (unlikely(ftrace_disabled)) + return; + mutex_lock(&ftraced_lock); ftraced_suspend--; if (!ftraced_suspend) @@ -529,6 +543,9 @@ static void notrace ftrace_startup_sysctl(void) { int command = FTRACE_ENABLE_MCOUNT; + if (unlikely(ftrace_disabled)) + return; + mutex_lock(&ftraced_lock); /* Force update next time */ saved_ftrace_func = NULL; @@ -544,6 +561,9 @@ static void notrace ftrace_shutdown_sysctl(void) { int command = FTRACE_DISABLE_MCOUNT; + if (unlikely(ftrace_disabled)) + return; + mutex_lock(&ftraced_lock); /* ftraced_suspend is true if ftrace is running */ if (ftraced_suspend) @@ -600,6 +620,9 @@ static int notrace __ftrace_update_code(void *ignore) static void notrace ftrace_update_code(void) { + if (unlikely(ftrace_disabled)) + return; + stop_machine_run(__ftrace_update_code, NULL, NR_CPUS); } @@ -614,6 +637,9 @@ static int notrace ftraced(void *ignore) /* check once a second */ schedule_timeout(HZ); + if (unlikely(ftrace_disabled)) + continue; + mutex_lock(&ftrace_sysctl_lock); mutex_lock(&ftraced_lock); if (ftrace_enabled && ftraced_trigger && !ftraced_suspend) { @@ -628,6 +654,7 @@ static int notrace ftraced(void *ignore) ftrace_update_cnt != 1 ? "s" : "", ftrace_update_tot_cnt, usecs, usecs != 1 ? 
"s" : ""); + ftrace_disabled = 1; WARN_ON_ONCE(1); } ftraced_trigger = 0; @@ -785,6 +812,9 @@ ftrace_avail_open(struct inode *inode, struct file *file) struct ftrace_iterator *iter; int ret; + if (unlikely(ftrace_disabled)) + return -ENODEV; + iter = kzalloc(sizeof(*iter), GFP_KERNEL); if (!iter) return -ENOMEM; @@ -843,6 +873,9 @@ ftrace_filter_open(struct inode *inode, struct file *file) struct ftrace_iterator *iter; int ret = 0; + if (unlikely(ftrace_disabled)) + return -ENODEV; + iter = kzalloc(sizeof(*iter), GFP_KERNEL); if (!iter) return -ENOMEM; @@ -1063,6 +1096,9 @@ ftrace_filter_write(struct file *file, const char __user *ubuf, */ notrace void ftrace_set_filter(unsigned char *buf, int len, int reset) { + if (unlikely(ftrace_disabled)) + return; + mutex_lock(&ftrace_filter_lock); if (reset) ftrace_filter_reset(); @@ -1133,7 +1169,7 @@ int ftrace_force_update(void) DECLARE_WAITQUEUE(wait, current); int ret = 0; - if (!ftraced_task) + if (unlikely(ftrace_disabled)) return -ENODEV; mutex_lock(&ftraced_lock); @@ -1142,6 +1178,11 @@ int ftrace_force_update(void) set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(&ftraced_waiters, &wait); + if (unlikely(!ftraced_task)) { + ret = -ENODEV; + goto out; + } + do { mutex_unlock(&ftraced_lock); wake_up_process(ftraced_task); @@ -1154,6 +1195,7 @@ int ftrace_force_update(void) set_current_state(TASK_INTERRUPTIBLE); } while (last_counter == ftraced_iteration_counter); + out: mutex_unlock(&ftraced_lock); remove_wait_queue(&ftraced_waiters, &wait); set_current_state(TASK_RUNNING); @@ -1161,6 +1203,22 @@ int ftrace_force_update(void) return ret; } +static void ftrace_force_shutdown(void) +{ + struct task_struct *task; + int command = FTRACE_DISABLE_CALLS | FTRACE_UPDATE_TRACE_FUNC; + + mutex_lock(&ftraced_lock); + task = ftraced_task; + ftraced_task = NULL; + ftraced_suspend = -1; + ftrace_run_update_code(command); + mutex_unlock(&ftraced_lock); + + if (task) + kthread_stop(task); +} + static __init int ftrace_init_debugfs(void) { struct dentry *d_tracer; @@ -1194,21 +1252,29 @@ static int __init notrace ftrace_dynamic_init(void) stop_machine_run(ftrace_dyn_arch_init, &addr, NR_CPUS); /* ftrace_dyn_arch_init places the return code in addr */ - if (addr) - return addr; + if (addr) { + ret = (int)addr; + goto failed; + } ret = ftrace_dyn_table_alloc(); if (ret) - return ret; + goto failed; p = kthread_run(ftraced, NULL, "ftraced"); - if (IS_ERR(p)) - return -1; + if (IS_ERR(p)) { + ret = -1; + goto failed; + } last_ftrace_enabled = ftrace_enabled = 1; ftraced_task = p; return 0; + + failed: + ftrace_disabled = 1; + return ret; } core_initcall(ftrace_dynamic_init); @@ -1217,8 +1283,30 @@ core_initcall(ftrace_dynamic_init); # define ftrace_shutdown() do { } while (0) # define ftrace_startup_sysctl() do { } while (0) # define ftrace_shutdown_sysctl() do { } while (0) +# define ftrace_force_shutdown() do { } while (0) #endif /* CONFIG_DYNAMIC_FTRACE */ +/** + * ftrace_kill - totally shutdown ftrace + * + * This is a safety measure. If something was detected that seems + * wrong, calling this function will keep ftrace from doing + * any more modifications, and updates. + * used when something went wrong. 
+ */ +void ftrace_kill(void) +{ + mutex_lock(&ftrace_sysctl_lock); + ftrace_disabled = 1; + ftrace_enabled = 0; + + clear_ftrace_function(); + mutex_unlock(&ftrace_sysctl_lock); + + /* Try to totally disable ftrace */ + ftrace_force_shutdown(); +} + /** * register_ftrace_function - register a function for profiling * @ops - ops structure that holds the function for profiling. @@ -1234,6 +1322,9 @@ int register_ftrace_function(struct ftrace_ops *ops) { int ret; + if (unlikely(ftrace_disabled)) + return -1; + mutex_lock(&ftrace_sysctl_lock); ret = __register_ftrace_function(ops); ftrace_startup(); @@ -1267,6 +1358,9 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, { int ret; + if (unlikely(ftrace_disabled)) + return -ENODEV; + mutex_lock(&ftrace_sysctl_lock); ret = proc_dointvec(table, write, file, buffer, lenp, ppos); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index a6f1ed75f836..85715b86a342 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -248,6 +248,10 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) ftrace_enabled = save_ftrace_enabled; tracer_enabled = save_tracer_enabled; + /* kill ftrace totally if we failed */ + if (ret) + ftrace_kill(); + return ret; } #endif /* CONFIG_FTRACE */ -- cgit v1.2.3 From aeaee8a2c9cb4489f166ca0e39c568e8254faaa6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:49 +0200 Subject: ftrace: build fix Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/ftrace.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 4650a3160b7f..08fbef1744cc 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -58,9 +58,6 @@ struct dyn_ftrace { int ftrace_force_update(void); void ftrace_set_filter(unsigned char *buf, int len, int reset); -/* totally disable ftrace - can not re-enable after this */ -void ftrace_kill(void); - /* defined in arch */ extern int ftrace_ip_converted(unsigned long ip); extern unsigned char *ftrace_nop_replace(void); @@ -74,10 +71,13 @@ extern void ftrace_caller(void); extern void ftrace_call(void); extern void mcount_call(void); #else -# define ftrace_force_update() ({ 0; }) -# define ftrace_set_filter(buf, len, reset) do { } while (0) +# define ftrace_force_update() ({ 0; }) +# define ftrace_set_filter(buf, len, reset) do { } while (0) #endif +/* totally disable ftrace - can not re-enable after this */ +void ftrace_kill(void); + static inline void tracer_disable(void) { #ifdef CONFIG_FTRACE -- cgit v1.2.3 From 86387f7ee5d3273ff4859e2c64ce656639b6ca65 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:51 +0200 Subject: ftrace: add stack tracing Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/ftrace.h | 2 + kernel/trace/Kconfig | 1 + kernel/trace/trace.c | 103 ++++++++++++++++++++++++++++++++++++++++--------- kernel/trace/trace.h | 11 ++++++ 4 files changed, 99 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 08fbef1744cc..0d3714e7110b 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -93,6 +93,7 @@ static inline void tracer_disable(void) # define CALLER_ADDR3 ((unsigned long)__builtin_return_address(3)) # define CALLER_ADDR4 ((unsigned long)__builtin_return_address(4)) # define CALLER_ADDR5 ((unsigned long)__builtin_return_address(5)) +# 
define CALLER_ADDR6 ((unsigned long)__builtin_return_address(6)) #else # define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) # define CALLER_ADDR1 0UL @@ -100,6 +101,7 @@ static inline void tracer_disable(void) # define CALLER_ADDR3 0UL # define CALLER_ADDR4 0UL # define CALLER_ADDR5 0UL +# define CALLER_ADDR6 0UL #endif #ifdef CONFIG_IRQSOFF_TRACER diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 3f73a1710242..eb1988ed84b7 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -10,6 +10,7 @@ config TRACER_MAX_TRACE config TRACING bool select DEBUG_FS + select STACKTRACE config FTRACE bool "Kernel Function Tracer" diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 192c1354a7e0..b4b1b4fe99fd 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -28,6 +28,8 @@ #include #include +#include + #include "trace.h" unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; @@ -88,6 +90,7 @@ enum trace_type { TRACE_FN, TRACE_CTX, TRACE_WAKE, + TRACE_STACK, TRACE_SPECIAL, __TRACE_LAST_TYPE @@ -109,6 +112,7 @@ enum trace_iterator_flags { TRACE_ITER_HEX = 0x20, TRACE_ITER_BIN = 0x40, TRACE_ITER_BLOCK = 0x80, + TRACE_ITER_STACKTRACE = 0x100, }; #define TRACE_ITER_SYM_MASK \ @@ -124,10 +128,11 @@ static const char *trace_options[] = { "hex", "bin", "block", + "stacktrace", NULL }; -static unsigned trace_flags; +static unsigned trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_STACKTRACE; static DEFINE_SPINLOCK(ftrace_max_lock); @@ -657,7 +662,7 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data, spin_unlock_irqrestore(&data->lock, irq_flags); if (!(trace_flags & TRACE_ITER_BLOCK)) - wake_up (&trace_wait); + wake_up(&trace_wait); } void @@ -685,13 +690,39 @@ trace_special(struct trace_array *tr, struct trace_array_cpu *data, spin_unlock_irqrestore(&data->lock, irq_flags); if (!(trace_flags & TRACE_ITER_BLOCK)) - wake_up (&trace_wait); + wake_up(&trace_wait); +} + +void __trace_stack(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + int skip) +{ + struct trace_entry *entry; + struct stack_trace trace; + + if (!(trace_flags & TRACE_ITER_STACKTRACE)) + return; + + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, flags); + entry->type = TRACE_STACK; + + memset(&entry->stack, 0, sizeof(entry->stack)); + + trace.nr_entries = 0; + trace.max_entries = FTRACE_STACK_ENTRIES; + trace.skip = skip; + trace.entries = entry->stack.caller; + + save_stack_trace(&trace); } void tracing_sched_switch_trace(struct trace_array *tr, struct trace_array_cpu *data, - struct task_struct *prev, struct task_struct *next, + struct task_struct *prev, + struct task_struct *next, unsigned long flags) { struct trace_entry *entry; @@ -706,16 +737,18 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->ctx.prev_state = prev->state; entry->ctx.next_pid = next->pid; entry->ctx.next_prio = next->prio; + __trace_stack(tr, data, flags, 4); spin_unlock_irqrestore(&data->lock, irq_flags); if (!(trace_flags & TRACE_ITER_BLOCK)) - wake_up (&trace_wait); + wake_up(&trace_wait); } void tracing_sched_wakeup_trace(struct trace_array *tr, struct trace_array_cpu *data, - struct task_struct *wakee, struct task_struct *curr, + struct task_struct *wakee, + struct task_struct *curr, unsigned long flags) { struct trace_entry *entry; @@ -730,6 +763,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->ctx.prev_state = curr->state; entry->ctx.next_pid = wakee->pid; entry->ctx.next_prio = 
wakee->prio; + __trace_stack(tr, data, flags, 5); spin_unlock_irqrestore(&data->lock, irq_flags); if (!(trace_flags & TRACE_ITER_BLOCK)) @@ -1179,6 +1213,7 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) unsigned long rel_usecs; char *comm; int S; + int i; if (!next_entry) next_entry = entry; @@ -1197,8 +1232,10 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) abs_usecs % 1000, rel_usecs/1000, rel_usecs % 1000); } else { - lat_print_generic(s, entry, cpu); - lat_print_timestamp(s, abs_usecs, rel_usecs); + if (entry->type != TRACE_STACK) { + lat_print_generic(s, entry, cpu); + lat_print_timestamp(s, abs_usecs, rel_usecs); + } } switch (entry->type) { case TRACE_FN: @@ -1226,6 +1263,14 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) entry->special.arg2, entry->special.arg3); break; + case TRACE_STACK: + for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { + if (i) + trace_seq_puts(s, " <= "); + seq_print_ip_sym(s, entry->stack.caller[i], sym_flags); + } + trace_seq_puts(s, "\n"); + break; default: trace_seq_printf(s, "Unknown type %d\n", entry->type); } @@ -1241,8 +1286,9 @@ static int print_trace_fmt(struct trace_iterator *iter) unsigned long long t; unsigned long secs; char *comm; - int S; int ret; + int S; + int i; entry = iter->ent; @@ -1252,15 +1298,17 @@ static int print_trace_fmt(struct trace_iterator *iter) usec_rem = do_div(t, 1000000ULL); secs = (unsigned long)t; - ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid); - if (!ret) - return 0; - ret = trace_seq_printf(s, "[%02d] ", iter->cpu); - if (!ret) - return 0; - ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem); - if (!ret) - return 0; + if (entry->type != TRACE_STACK) { + ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid); + if (!ret) + return 0; + ret = trace_seq_printf(s, "[%02d] ", iter->cpu); + if (!ret) + return 0; + ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem); + if (!ret) + return 0; + } switch (entry->type) { case TRACE_FN: @@ -1303,6 +1351,22 @@ static int print_trace_fmt(struct trace_iterator *iter) if (!ret) return 0; break; + case TRACE_STACK: + for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { + if (i) { + ret = trace_seq_puts(s, " <= "); + if (!ret) + return 0; + } + ret = seq_print_ip_sym(s, entry->stack.caller[i], + sym_flags); + if (!ret) + return 0; + } + ret = trace_seq_puts(s, "\n"); + if (!ret) + return 0; + break; } return 1; } @@ -1344,6 +1408,7 @@ static int print_raw_fmt(struct trace_iterator *iter) return 0; break; case TRACE_SPECIAL: + case TRACE_STACK: ret = trace_seq_printf(s, " %lx %lx %lx\n", entry->special.arg1, entry->special.arg2, @@ -1399,6 +1464,7 @@ static int print_hex_fmt(struct trace_iterator *iter) SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip); break; case TRACE_SPECIAL: + case TRACE_STACK: SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg1); SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg2); SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg3); @@ -1433,6 +1499,7 @@ static int print_bin_fmt(struct trace_iterator *iter) SEQ_PUT_FIELD_RET(s, entry->ctx.next_prio); break; case TRACE_SPECIAL: + case TRACE_STACK: SEQ_PUT_FIELD_RET(s, entry->special.arg1); SEQ_PUT_FIELD_RET(s, entry->special.arg2); SEQ_PUT_FIELD_RET(s, entry->special.arg3); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 90e0ba0f6eba..387bdcf45e28 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -34,6 +34,16 @@ struct special_entry { unsigned long arg3; }; +/* + * Stack-trace entry: + */ + +#define 
FTRACE_STACK_ENTRIES 5 + +struct stack_entry { + unsigned long caller[FTRACE_STACK_ENTRIES]; +}; + /* * The trace entry - the most basic unit of tracing. This is what * is printed in the end as a single line in the trace output, such as: @@ -51,6 +61,7 @@ struct trace_entry { struct ftrace_entry fn; struct ctx_switch_entry ctx; struct special_entry special; + struct stack_entry stack; }; }; -- cgit v1.2.3 From 8ac0fca4ccb355ce50471d7aa3f10f5900b28b95 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:51 +0200 Subject: ftrace: sched tracer fix Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 6 ------ kernel/sched.c | 2 +- kernel/trace/trace_sched_wakeup.c | 13 +++---------- 3 files changed, 4 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6e26f1fdbfe2..05744f9cb096 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2130,17 +2130,11 @@ ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next) #ifdef CONFIG_SCHED_TRACER extern void ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr); -extern void -ftrace_wake_up_new_task(struct task_struct *wakee, struct task_struct *curr); #else static inline void ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr) { } -static inline void -ftrace_wake_up_new_task(struct task_struct *wakee, struct task_struct *curr) -{ -} #endif extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask); diff --git a/kernel/sched.c b/kernel/sched.c index 328494e28df2..53ab1174664f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2613,7 +2613,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) p->sched_class->task_new(rq, p); inc_nr_running(rq); } - ftrace_wake_up_new_task(p, rq->curr); + ftrace_wake_up_task(p, rq->curr); check_preempt_curr(rq, p); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 87fa7b253b57..2a012423f9d0 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -201,20 +201,13 @@ out: atomic_dec(&tr->data[cpu]->disabled); } -void -wakeup_sched_wakeup(struct task_struct *wakee, struct task_struct *curr) +void wakeup_sched_wakeup(struct task_struct *wakee, struct task_struct *curr) { if (likely(!tracer_enabled)) return; - wakeup_check_start(wakeup_trace, wakee, curr); -} - -void -ftrace_wake_up_new_task(struct task_struct *wakee, struct task_struct *curr) -{ - if (likely(!tracer_enabled)) - return; + tracing_record_cmdline(curr); + tracing_record_cmdline(wakee); wakeup_check_start(wakeup_trace, wakee, curr); } -- cgit v1.2.3 From 4e65551905fb0300ae7e667cbaa41ee2e3f29a13 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:52 +0200 Subject: ftrace: sched tracer, trace full rbtree Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 32 ++++++++++++++++------- kernel/sched.c | 35 ++++++++++++++++++++++--- kernel/trace/trace.c | 55 ++++++++++++++++----------------------- kernel/trace/trace.h | 14 ++++++++++ kernel/trace/trace_sched_switch.c | 24 +++++++++++------ 5 files changed, 108 insertions(+), 52 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 05744f9cb096..652d380ae563 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2119,20 +2119,34 @@ static inline void 
arch_pick_mmap_layout(struct mm_struct *mm) #ifdef CONFIG_CONTEXT_SWITCH_TRACER extern void -ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next); +ftrace_ctx_switch(void *rq, struct task_struct *prev, struct task_struct *next); +extern void +ftrace_wake_up_task(void *rq, struct task_struct *wakee, + struct task_struct *curr); +extern void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data); +extern void +__trace_special(void *__tr, void *__data, + unsigned long arg1, unsigned long arg2, unsigned long arg3); #else static inline void -ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next) +ftrace_ctx_switch(void *rq, struct task_struct *prev, struct task_struct *next) +{ +} +static inline void +sched_trace_special(unsigned long p1, unsigned long p2, unsigned long p3) +{ +} +static inline void +ftrace_wake_up_task(void *rq, struct task_struct *wakee, + struct task_struct *curr) +{ +} +static inline void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data) { } -#endif - -#ifdef CONFIG_SCHED_TRACER -extern void -ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr); -#else static inline void -ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr) +__trace_special(void *__tr, void *__data, + unsigned long arg1, unsigned long arg2, unsigned long arg3) { } #endif diff --git a/kernel/sched.c b/kernel/sched.c index 53ab1174664f..b9208a0e33a0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2394,6 +2394,35 @@ static int sched_balance_self(int cpu, int flag) #endif /* CONFIG_SMP */ +#ifdef CONFIG_CONTEXT_SWITCH_TRACER + +void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data) +{ + struct sched_entity *se; + struct task_struct *p; + struct rb_node *curr; + struct rq *rq = __rq; + + curr = first_fair(&rq->cfs); + if (!curr) + return; + + while (curr) { + se = rb_entry(curr, struct sched_entity, run_node); + if (!entity_is_task(se)) + continue; + + p = task_of(se); + + __trace_special(__tr, __data, + p->pid, p->se.vruntime, p->se.sum_exec_runtime); + + curr = rb_next(curr); + } +} + +#endif + /*** * try_to_wake_up - wake up a thread * @p: the to-be-woken-up thread @@ -2468,7 +2497,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) out_activate: #endif /* CONFIG_SMP */ - ftrace_wake_up_task(p, rq->curr); + ftrace_wake_up_task(rq, p, rq->curr); schedstat_inc(p, se.nr_wakeups); if (sync) schedstat_inc(p, se.nr_wakeups_sync); @@ -2613,7 +2642,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) p->sched_class->task_new(rq, p); inc_nr_running(rq); } - ftrace_wake_up_task(p, rq->curr); + ftrace_wake_up_task(rq, p, rq->curr); check_preempt_curr(rq, p); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) @@ -2786,7 +2815,7 @@ context_switch(struct rq *rq, struct task_struct *prev, struct mm_struct *mm, *oldmm; prepare_task_switch(rq, prev, next); - ftrace_ctx_switch(prev, next); + ftrace_ctx_switch(rq, prev, next); mm = next->mm; oldmm = prev->active_mm; /* diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0e4b7119e263..65173b14b914 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -66,7 +66,18 @@ static struct tracer *current_trace __read_mostly; static int max_tracer_type_len; static DEFINE_MUTEX(trace_types_lock); -static DECLARE_WAIT_QUEUE_HEAD (trace_wait); +static DECLARE_WAIT_QUEUE_HEAD(trace_wait); + +unsigned long trace_flags = TRACE_ITER_PRINT_PARENT; + +/* + * FIXME: where should this be called? 
+ */ +void trace_wake_up(void) +{ + if (!(trace_flags & TRACE_ITER_BLOCK)) + wake_up(&trace_wait); +} #define ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(struct trace_entry)) @@ -103,18 +114,6 @@ enum trace_flag_type { TRACE_FLAG_SOFTIRQ = 0x08, }; -enum trace_iterator_flags { - TRACE_ITER_PRINT_PARENT = 0x01, - TRACE_ITER_SYM_OFFSET = 0x02, - TRACE_ITER_SYM_ADDR = 0x04, - TRACE_ITER_VERBOSE = 0x08, - TRACE_ITER_RAW = 0x10, - TRACE_ITER_HEX = 0x20, - TRACE_ITER_BIN = 0x40, - TRACE_ITER_BLOCK = 0x80, - TRACE_ITER_STACKTRACE = 0x100, -}; - #define TRACE_ITER_SYM_MASK \ (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR) @@ -132,8 +131,6 @@ static const char *trace_options[] = { NULL }; -static unsigned trace_flags = TRACE_ITER_PRINT_PARENT; - static DEFINE_SPINLOCK(ftrace_max_lock); /* @@ -660,9 +657,6 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data, entry->fn.ip = ip; entry->fn.parent_ip = parent_ip; spin_unlock_irqrestore(&data->lock, irq_flags); - - if (!(trace_flags & TRACE_ITER_BLOCK)) - wake_up(&trace_wait); } void @@ -673,10 +667,14 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, trace_function(tr, data, ip, parent_ip, flags); } +#ifdef CONFIG_CONTEXT_SWITCH_TRACER + void -trace_special(struct trace_array *tr, struct trace_array_cpu *data, - unsigned long arg1, unsigned long arg2, unsigned long arg3) +__trace_special(void *__tr, void *__data, + unsigned long arg1, unsigned long arg2, unsigned long arg3) { + struct trace_array_cpu *data = __data; + struct trace_array *tr = __tr; struct trace_entry *entry; unsigned long irq_flags; @@ -688,11 +686,10 @@ trace_special(struct trace_array *tr, struct trace_array_cpu *data, entry->special.arg2 = arg2; entry->special.arg3 = arg3; spin_unlock_irqrestore(&data->lock, irq_flags); - - if (!(trace_flags & TRACE_ITER_BLOCK)) - wake_up(&trace_wait); } +#endif + void __trace_stack(struct trace_array *tr, struct trace_array_cpu *data, unsigned long flags, @@ -739,9 +736,6 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->ctx.next_prio = next->prio; __trace_stack(tr, data, flags, 4); spin_unlock_irqrestore(&data->lock, irq_flags); - - if (!(trace_flags & TRACE_ITER_BLOCK)) - wake_up(&trace_wait); } void @@ -765,9 +759,6 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->ctx.next_prio = wakee->prio; __trace_stack(tr, data, flags, 5); spin_unlock_irqrestore(&data->lock, irq_flags); - - if (!(trace_flags & TRACE_ITER_BLOCK)) - wake_up(&trace_wait); } #ifdef CONFIG_FTRACE @@ -1258,7 +1249,7 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) comm); break; case TRACE_SPECIAL: - trace_seq_printf(s, " %lx %lx %lx\n", + trace_seq_printf(s, " %ld %ld %ld\n", entry->special.arg1, entry->special.arg2, entry->special.arg3); @@ -1344,7 +1335,7 @@ static int print_trace_fmt(struct trace_iterator *iter) return 0; break; case TRACE_SPECIAL: - ret = trace_seq_printf(s, " %lx %lx %lx\n", + ret = trace_seq_printf(s, " %ld %ld %ld\n", entry->special.arg1, entry->special.arg2, entry->special.arg3); @@ -1409,7 +1400,7 @@ static int print_raw_fmt(struct trace_iterator *iter) break; case TRACE_SPECIAL: case TRACE_STACK: - ret = trace_seq_printf(s, " %lx %lx %lx\n", + ret = trace_seq_printf(s, " %ld %ld %ld\n", entry->special.arg1, entry->special.arg2, entry->special.arg3); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 387bdcf45e28..75e237475674 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -274,4 +274,18 @@ extern int 
trace_selftest_startup_sched_switch(struct tracer *trace, extern void *head_page(struct trace_array_cpu *data); +extern unsigned long trace_flags; + +enum trace_iterator_flags { + TRACE_ITER_PRINT_PARENT = 0x01, + TRACE_ITER_SYM_OFFSET = 0x02, + TRACE_ITER_SYM_ADDR = 0x04, + TRACE_ITER_VERBOSE = 0x08, + TRACE_ITER_RAW = 0x10, + TRACE_ITER_HEX = 0x20, + TRACE_ITER_BIN = 0x40, + TRACE_ITER_BLOCK = 0x80, + TRACE_ITER_STACKTRACE = 0x100, +}; + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 8b1cf1a3aee0..12658b3f2b28 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -18,7 +18,7 @@ static struct trace_array *ctx_trace; static int __read_mostly tracer_enabled; static void -ctx_switch_func(struct task_struct *prev, struct task_struct *next) +ctx_switch_func(void *__rq, struct task_struct *prev, struct task_struct *next) { struct trace_array *tr = ctx_trace; struct trace_array_cpu *data; @@ -34,14 +34,17 @@ ctx_switch_func(struct task_struct *prev, struct task_struct *next) data = tr->data[cpu]; disabled = atomic_inc_return(&data->disabled); - if (likely(disabled == 1)) + if (likely(disabled == 1)) { tracing_sched_switch_trace(tr, data, prev, next, flags); + ftrace_all_fair_tasks(__rq, tr, data); + } atomic_dec(&data->disabled); local_irq_restore(flags); } -static void wakeup_func(struct task_struct *wakee, struct task_struct *curr) +static void +wakeup_func(void *__rq, struct task_struct *wakee, struct task_struct *curr) { struct trace_array *tr = ctx_trace; struct trace_array_cpu *data; @@ -57,14 +60,18 @@ static void wakeup_func(struct task_struct *wakee, struct task_struct *curr) data = tr->data[cpu]; disabled = atomic_inc_return(&data->disabled); - if (likely(disabled == 1)) + if (likely(disabled == 1)) { tracing_sched_wakeup_trace(tr, data, wakee, curr, flags); + ftrace_all_fair_tasks(__rq, tr, data); + } atomic_dec(&data->disabled); local_irq_restore(flags); } -void ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next) +void +ftrace_ctx_switch(void *__rq, struct task_struct *prev, + struct task_struct *next) { tracing_record_cmdline(prev); @@ -72,7 +79,7 @@ void ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next) * If tracer_switch_func only points to the local * switch func, it still needs the ptr passed to it. 
*/ - ctx_switch_func(prev, next); + ctx_switch_func(__rq, prev, next); /* * Chain to the wakeup tracer (this is a NOP if disabled): @@ -81,11 +88,12 @@ void ftrace_ctx_switch(struct task_struct *prev, struct task_struct *next) } void -ftrace_wake_up_task(struct task_struct *wakee, struct task_struct *curr) +ftrace_wake_up_task(void *__rq, struct task_struct *wakee, + struct task_struct *curr) { tracing_record_cmdline(curr); - wakeup_func(wakee, curr); + wakeup_func(__rq, wakee, curr); /* * Chain to the wakeup tracer (this is a NOP if disabled): -- cgit v1.2.3 From 017730c11241e26577673eb9d957cfc66172ea91 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:52 +0200 Subject: ftrace: fix wakeups Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 2 ++ kernel/sched.c | 18 ++++++++++++++++++ kernel/trace/trace.c | 15 +++++++++++---- 3 files changed, 31 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 652d380ae563..a3970b563757 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -246,6 +246,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); extern void init_idle_bootup_task(struct task_struct *idle); +extern int runqueue_is_locked(void); + extern cpumask_t nohz_cpu_mask; #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) extern int select_nohz_load_balancer(int cpu); diff --git a/kernel/sched.c b/kernel/sched.c index 673b588b713b..9ca4a2e6a236 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -642,6 +642,24 @@ static inline void update_rq_clock(struct rq *rq) # define const_debug static const #endif +/** + * runqueue_is_locked + * + * Returns true if the current cpu runqueue is locked. + * This interface allows printk to be called with the runqueue lock + * held and know whether or not it is OK to wake up the klogd. + */ +int runqueue_is_locked(void) +{ + int cpu = get_cpu(); + struct rq *rq = cpu_rq(cpu); + int ret; + + ret = spin_is_locked(&rq->lock); + put_cpu(); + return ret; +} + /* * Debugging: various feature bits */ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 65173b14b914..2ca9d66aa74e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -70,12 +70,13 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); unsigned long trace_flags = TRACE_ITER_PRINT_PARENT; -/* - * FIXME: where should this be called? 
- */ void trace_wake_up(void) { - if (!(trace_flags & TRACE_ITER_BLOCK)) + /* + * The runqueue_is_locked() can fail, but this is the best we + * have for now: + */ + if (!(trace_flags & TRACE_ITER_BLOCK) && !runqueue_is_locked()) wake_up(&trace_wait); } @@ -657,6 +658,8 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data, entry->fn.ip = ip; entry->fn.parent_ip = parent_ip; spin_unlock_irqrestore(&data->lock, irq_flags); + + trace_wake_up(); } void @@ -686,6 +689,8 @@ __trace_special(void *__tr, void *__data, entry->special.arg2 = arg2; entry->special.arg3 = arg3; spin_unlock_irqrestore(&data->lock, irq_flags); + + trace_wake_up(); } #endif @@ -759,6 +764,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->ctx.next_prio = wakee->prio; __trace_stack(tr, data, flags, 5); spin_unlock_irqrestore(&data->lock, irq_flags); + + trace_wake_up(); } #ifdef CONFIG_FTRACE -- cgit v1.2.3 From 1a3c3034336320554a3342572dae98d69e054fc7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:52 +0200 Subject: ftrace: fix __trace_special() Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 20 ++++++++++++-------- kernel/trace/trace.c | 4 ---- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index a3970b563757..5b186bed54bc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2119,6 +2119,18 @@ static inline void arch_pick_mmap_layout(struct mm_struct *mm) } #endif +#ifdef CONFIG_TRACING +extern void +__trace_special(void *__tr, void *__data, + unsigned long arg1, unsigned long arg2, unsigned long arg3); +#else +static inline void +__trace_special(void *__tr, void *__data, + unsigned long arg1, unsigned long arg2, unsigned long arg3) +{ +} +#endif + #ifdef CONFIG_CONTEXT_SWITCH_TRACER extern void ftrace_ctx_switch(void *rq, struct task_struct *prev, struct task_struct *next); @@ -2126,9 +2138,6 @@ extern void ftrace_wake_up_task(void *rq, struct task_struct *wakee, struct task_struct *curr); extern void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data); -extern void -__trace_special(void *__tr, void *__data, - unsigned long arg1, unsigned long arg2, unsigned long arg3); #else static inline void ftrace_ctx_switch(void *rq, struct task_struct *prev, struct task_struct *next) @@ -2146,11 +2155,6 @@ ftrace_wake_up_task(void *rq, struct task_struct *wakee, static inline void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data) { } -static inline void -__trace_special(void *__tr, void *__data, - unsigned long arg1, unsigned long arg2, unsigned long arg3) -{ -} #endif extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2ca9d66aa74e..65d2c0a61ed4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -670,8 +670,6 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, trace_function(tr, data, ip, parent_ip, flags); } -#ifdef CONFIG_CONTEXT_SWITCH_TRACER - void __trace_special(void *__tr, void *__data, unsigned long arg1, unsigned long arg2, unsigned long arg3) @@ -693,8 +691,6 @@ __trace_special(void *__tr, void *__data, trace_wake_up(); } -#endif - void __trace_stack(struct trace_array *tr, struct trace_array_cpu *data, unsigned long flags, -- cgit v1.2.3 From 88a4216c3ec4281fc7e6725cc3a3ccd01fb1aa14 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:53 +0200 Subject: ftrace: sched special 
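Add ftrace_special(), which injects three arbitrary values into the trace from any context. The sched_fair.c hunks below use it for ad-hoc instrumentation, e.g.:

	ftrace_special(__LINE__, curr->se.avg_overlap, sync);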
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 6 ++++++ kernel/sched_fair.c | 3 +++ kernel/trace/trace.c | 6 +++--- kernel/trace/trace_sched_switch.c | 24 ++++++++++++++++++++++++ 4 files changed, 36 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 5b186bed54bc..360ca99033d2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2138,6 +2138,8 @@ extern void ftrace_wake_up_task(void *rq, struct task_struct *wakee, struct task_struct *curr); extern void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data); +extern void +ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); #else static inline void ftrace_ctx_switch(void *rq, struct task_struct *prev, struct task_struct *next) @@ -2155,6 +2157,10 @@ ftrace_wake_up_task(void *rq, struct task_struct *wakee, static inline void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data) { } +static inline void +ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) +{ +} #endif extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index e24ecd39c4b8..dc1856f10795 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1061,6 +1061,8 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, if (!(this_sd->flags & SD_WAKE_AFFINE)) return 0; + ftrace_special(__LINE__, curr->se.avg_overlap, sync); + ftrace_special(__LINE__, p->se.avg_overlap, -1); /* * If the currently running task will sleep within * a reasonable amount of time then attract this newly @@ -1238,6 +1240,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) if (unlikely(se == pse)) return; + ftrace_special(__LINE__, p->pid, se->last_wakeup); cfs_rq_of(pse)->next = pse; /* diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3a4032492fcb..b87a26414892 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1251,7 +1251,7 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) comm); break; case TRACE_SPECIAL: - trace_seq_printf(s, " %ld %ld %ld\n", + trace_seq_printf(s, "# %ld %ld %ld\n", entry->special.arg1, entry->special.arg2, entry->special.arg3); @@ -1335,7 +1335,7 @@ static int print_trace_fmt(struct trace_iterator *iter) return 0; break; case TRACE_SPECIAL: - ret = trace_seq_printf(s, " %ld %ld %ld\n", + ret = trace_seq_printf(s, "# %ld %ld %ld\n", entry->special.arg1, entry->special.arg2, entry->special.arg3); @@ -1400,7 +1400,7 @@ static int print_raw_fmt(struct trace_iterator *iter) break; case TRACE_SPECIAL: case TRACE_STACK: - ret = trace_seq_printf(s, " %ld %ld %ld\n", + ret = trace_seq_printf(s, "# %ld %ld %ld\n", entry->special.arg1, entry->special.arg2, entry->special.arg3); diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 5a217e863586..bddf676914ed 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -103,6 +103,30 @@ ftrace_wake_up_task(void *__rq, struct task_struct *wakee, wakeup_sched_wakeup(wakee, curr); } +void +ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) +{ + struct trace_array *tr = ctx_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + + if (!tracer_enabled) + return; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = 
atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) + __trace_special(tr, data, arg1, arg2, arg3); + + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + static void sched_switch_reset(struct trace_array *tr) { int cpu; -- cgit v1.2.3 From 3eefae994d9224fb7771a3ddb683868363c23510 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:21:04 +0200 Subject: ftrace: limit trace entries Currently there is no protection from the root user to use up all of memory for trace buffers. If the root user allocates too many entries, the OOM killer might start killing off all tasks. This patch adds an algorithm to check the following condition: pages_requested > (freeable_memory + current_trace_buffer_pages) / 4 If the above is met then the allocation fails. The above prevents more than 1/4th of freeable memory from being used by trace buffers. To determine the freeable_memory, I made determine_dirtyable_memory in mm/page-writeback.c global. Special thanks goes to Peter Zijlstra for suggesting the above calculation. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/writeback.h | 2 ++ kernel/trace/trace.c | 38 ++++++++++++++++++++++++++++++++++++++ mm/page-writeback.c | 10 +++++++--- 3 files changed, 47 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index f462439cc288..bd91987c065f 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -105,6 +105,8 @@ extern int vm_highmem_is_dirtyable; extern int block_dump; extern int laptop_mode; +extern unsigned long determine_dirtyable_memory(void); + extern int dirty_ratio_handler(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 82ced406aacf..2824cf48cdca 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -27,6 +27,7 @@ #include #include #include +#include #include @@ -51,6 +52,8 @@ static int trace_free_page(void); static int tracing_disabled = 1; +static unsigned long tracing_pages_allocated; + long ns2usecs(cycle_t nsec) { @@ -2591,12 +2594,41 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, } if (val > global_trace.entries) { + long pages_requested; + unsigned long freeable_pages; + + /* make sure we have enough memory before mapping */ + pages_requested = + (val + (ENTRIES_PER_PAGE-1)) / ENTRIES_PER_PAGE; + + /* account for each buffer (and max_tr) */ + pages_requested *= tracing_nr_buffers * 2; + + /* Check for overflow */ + if (pages_requested < 0) { + cnt = -ENOMEM; + goto out; + } + + freeable_pages = determine_dirtyable_memory(); + + /* we only allow to request 1/4 of useable memory */ + if (pages_requested > + ((freeable_pages + tracing_pages_allocated) / 4)) { + cnt = -ENOMEM; + goto out; + } + + while (global_trace.entries < val) { if (trace_alloc_page()) { cnt = -ENOMEM; goto out; } + /* double check that we don't go over the known pages */ + if (tracing_pages_allocated > pages_requested) + break; } + } else { /* include the number of entries in val (inc of page entries) */ while (global_trace.entries > val + (ENTRIES_PER_PAGE - 1)) @@ -2776,6 +2808,7 @@ static int trace_alloc_page(void) struct page *page, *tmp; LIST_HEAD(pages); void *array; + unsigned pages_allocated = 0; int i; /* first allocate a page for each CPU */ @@ -2787,6 +2820,7 @@ static int trace_alloc_page(void) goto free_pages; } + pages_allocated++; 
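/*
 * (pages_allocated counts every page handed to the trace buffers in
 * this call; the running total, tracing_pages_allocated, is what the
 * 1/4-of-freeable-memory check in tracing_entries_write() above uses.)
 */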
page = virt_to_page(array); list_add(&page->lru, &pages); @@ -2798,6 +2832,7 @@ static int trace_alloc_page(void) "for trace buffer!\n"); goto free_pages; } + pages_allocated++; page = virt_to_page(array); list_add(&page->lru, &pages); #endif @@ -2819,6 +2854,7 @@ static int trace_alloc_page(void) SetPageLRU(page); #endif } + tracing_pages_allocated += pages_allocated; global_trace.entries += ENTRIES_PER_PAGE; return 0; @@ -2853,6 +2889,8 @@ static int trace_free_page(void) page = list_entry(p, struct page, lru); ClearPageLRU(page); list_del(&page->lru); + tracing_pages_allocated--; + tracing_pages_allocated--; __free_page(page); tracing_reset(data); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 789b6adbef37..b38f700825fc 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -126,8 +126,6 @@ static void background_writeout(unsigned long _min_pages); static struct prop_descriptor vm_completions; static struct prop_descriptor vm_dirties; -static unsigned long determine_dirtyable_memory(void); - /* * couple the period to the dirty_ratio: * @@ -347,7 +345,13 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) #endif } -static unsigned long determine_dirtyable_memory(void) +/** + * determine_dirtyable_memory - amount of memory that may be used + * + * Returns the number of pages that can currently be freed and used + * by the kernel for direct mappings. + */ +unsigned long determine_dirtyable_memory(void) { unsigned long x; -- cgit v1.2.3 From dc102a8fae2d0d6bf5223fc549247f2e23959ae6 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 12 May 2008 21:21:09 +0200 Subject: Markers - remove extra format argument Denys Vlasenko : > Not in this patch, but I noticed: > > #define __trace_mark(name, call_private, format, args...) \ > do { \ > static const char __mstrtab_##name[] \ > __attribute__((section("__markers_strings"))) \ > = #name "\0" format; \ > static struct marker __mark_##name \ > __attribute__((section("__markers"), aligned(8))) = \ > { __mstrtab_##name, &__mstrtab_##name[sizeof(#name)], \ > 0, 0, marker_probe_cb, \ > { __mark_empty_function, NULL}, NULL }; \ > __mark_check_format(format, ## args); \ > if (unlikely(__mark_##name.state)) { \ > (*__mark_##name.call) \ > (&__mark_##name, call_private, \ > format, ## args); \ > } \ > } while (0) > > In this call: > > (*__mark_##name.call) \ > (&__mark_##name, call_private, \ > format, ## args); \ > > you make gcc allocate duplicate format string. You can use > &__mstrtab_##name[sizeof(#name)] instead since it holds the same string, > or drop ", format," above and "const char *fmt" from here: > > void (*call)(const struct marker *mdata, /* Probe wrapper */ > void *call_private, const char *fmt, ...); > > since mdata->format is the same and all callees which need it can take it there. Very good point. I actually thought about dropping it, since it would remove an unnecessary argument from the stack. And actually, since I now have the marker_probe_cb sitting between the marker site and the callbacks, there is no API change required. 
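The call site then shrinks to

	(*__mark_##name.call)(&__mark_##name, call_private, ## args);

and the probe wrapper picks the format string up from mdata->format
when invoking the probes:

	func(mdata->single.probe_private, call_private, mdata->format,
	     &args);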
Thanks :) Mathieu Signed-off-by: Mathieu Desnoyers CC: Denys Vlasenko Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/marker.h | 11 +++++------ kernel/marker.c | 30 ++++++++++++++---------------- 2 files changed, 19 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/marker.h b/include/linux/marker.h index 430f6adf9762..338533abb47b 100644 --- a/include/linux/marker.h +++ b/include/linux/marker.h @@ -44,8 +44,8 @@ struct marker { */ char state; /* Marker state. */ char ptype; /* probe type : 0 : single, 1 : multi */ - void (*call)(const struct marker *mdata, /* Probe wrapper */ - void *call_private, const char *fmt, ...); + /* Probe wrapper */ + void (*call)(const struct marker *mdata, void *call_private, ...); struct marker_probe_closure single; struct marker_probe_closure *multi; } __attribute__((aligned(8))); @@ -72,8 +72,7 @@ struct marker { __mark_check_format(format, ## args); \ if (unlikely(__mark_##name.state)) { \ (*__mark_##name.call) \ - (&__mark_##name, call_private, \ - format, ## args); \ + (&__mark_##name, call_private, ## args);\ } \ } while (0) @@ -117,9 +116,9 @@ static inline void __printf(1, 2) ___mark_check_format(const char *fmt, ...) extern marker_probe_func __mark_empty_function; extern void marker_probe_cb(const struct marker *mdata, - void *call_private, const char *fmt, ...); + void *call_private, ...); extern void marker_probe_cb_noarg(const struct marker *mdata, - void *call_private, const char *fmt, ...); + void *call_private, ...); /* * Connect a probe to a marker. diff --git a/kernel/marker.c b/kernel/marker.c index b5a9fe1d50d5..1abfb923b761 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -55,8 +55,8 @@ static DEFINE_MUTEX(markers_mutex); struct marker_entry { struct hlist_node hlist; char *format; - void (*call)(const struct marker *mdata, /* Probe wrapper */ - void *call_private, const char *fmt, ...); + /* Probe wrapper */ + void (*call)(const struct marker *mdata, void *call_private, ...); struct marker_probe_closure single; struct marker_probe_closure *multi; int refcount; /* Number of times armed. 0 if disarmed. */ @@ -91,15 +91,13 @@ EXPORT_SYMBOL_GPL(__mark_empty_function); * marker_probe_cb Callback that prepares the variable argument list for probes. * @mdata: pointer of type struct marker * @call_private: caller site private data - * @fmt: format string * @...: Variable argument list. * * Since we do not use "typical" pointer based RCU in the 1 argument case, we * need to put a full smp_rmb() in this branch. This is why we do not use * rcu_dereference() for the pointer read. */ -void marker_probe_cb(const struct marker *mdata, void *call_private, - const char *fmt, ...) +void marker_probe_cb(const struct marker *mdata, void *call_private, ...) { va_list args; char ptype; @@ -120,8 +118,9 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, /* Must read the ptr before private data. They are not data * dependant, so we put an explicit smp_rmb() here. 
*/ smp_rmb(); - va_start(args, fmt); - func(mdata->single.probe_private, call_private, fmt, &args); + va_start(args, call_private); + func(mdata->single.probe_private, call_private, mdata->format, + &args); va_end(args); } else { struct marker_probe_closure *multi; @@ -136,9 +135,9 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, smp_read_barrier_depends(); multi = mdata->multi; for (i = 0; multi[i].func; i++) { - va_start(args, fmt); - multi[i].func(multi[i].probe_private, call_private, fmt, - &args); + va_start(args, call_private); + multi[i].func(multi[i].probe_private, call_private, + mdata->format, &args); va_end(args); } } @@ -150,13 +149,11 @@ EXPORT_SYMBOL_GPL(marker_probe_cb); * marker_probe_cb_noarg Callback that does not prepare the variable argument list. * @mdata: pointer of type struct marker * @call_private: caller site private data - * @fmt: format string * @...: Variable argument list. * * Should be connected to markers "MARK_NOARGS". */ -void marker_probe_cb_noarg(const struct marker *mdata, - void *call_private, const char *fmt, ...) +void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) { va_list args; /* not initialized */ char ptype; @@ -172,7 +169,8 @@ void marker_probe_cb_noarg(const struct marker *mdata, /* Must read the ptr before private data. They are not data * dependent, so we put an explicit smp_rmb() here. */ smp_rmb(); - func(mdata->single.probe_private, call_private, fmt, &args); + func(mdata->single.probe_private, call_private, mdata->format, + &args); } else { struct marker_probe_closure *multi; int i; @@ -186,8 +184,8 @@ void marker_probe_cb_noarg(const struct marker *mdata, smp_read_barrier_depends(); multi = mdata->multi; for (i = 0; multi[i].func; i++) - multi[i].func(multi[i].probe_private, call_private, fmt, - &args); + multi[i].func(multi[i].probe_private, call_private, + mdata->format, &args); } preempt_enable(); } -- cgit v1.2.3 From 0aa977f592f17004f9d1d545f2e1bb9ea71896c3 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 12 May 2008 21:21:10 +0200 Subject: Markers - define non optimized marker To support the forthcoming "immediate values" marker optimization, we must have a way to declare markers in a few code paths that do not use instruction-modification-based enabling. This will be the case for printk(), some traps and eventually lockdep instrumentation. Changelog : - Fix reversed boolean logic of "generic". Signed-off-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/marker.h | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/marker.h b/include/linux/marker.h index 338533abb47b..1290653f9241 100644 --- a/include/linux/marker.h +++ b/include/linux/marker.h @@ -58,8 +58,12 @@ struct marker { * Make sure the alignment of the structure in the __markers section will * not add unwanted padding between the beginning of the section and the * structure. Force alignment to the same alignment as the section start. + * + * The "generic" argument controls which marker enabling mechanism must be used. + * If generic is true, a variable read is used. + * If generic is false, immediate values are used. */ -#define __trace_mark(name, call_private, format, args...) \ +#define __trace_mark(generic, name, call_private, format, args...)
\ do { \ static const char __mstrtab_##name[] \ __attribute__((section("__markers_strings"))) \ @@ -79,7 +83,7 @@ struct marker { extern void marker_update_probe_range(struct marker *begin, struct marker *end); #else /* !CONFIG_MARKERS */ -#define __trace_mark(name, call_private, format, args...) \ +#define __trace_mark(generic, name, call_private, format, args...) \ __mark_check_format(format, ## args) static inline void marker_update_probe_range(struct marker *begin, struct marker *end) @@ -87,15 +91,30 @@ static inline void marker_update_probe_range(struct marker *begin, #endif /* CONFIG_MARKERS */ /** - * trace_mark - Marker + * trace_mark - Marker using code patching * @name: marker name, not quoted. * @format: format string * @args...: variable argument list * - * Places a marker. + * Places a marker using optimized code patching technique (imv_read()) + * to be enabled when immediate values are present. */ #define trace_mark(name, format, args...) \ - __trace_mark(name, NULL, format, ## args) + __trace_mark(0, name, NULL, format, ## args) + +/** + * _trace_mark - Marker using variable read + * @name: marker name, not quoted. + * @format: format string + * @args...: variable argument list + * + * Places a marker using a standard memory read (_imv_read()) to be + * enabled. Should be used for markers in code paths where instruction + * modification based enabling is not welcome. (__init and __exit functions, + * lockdep, some traps, printk). + */ +#define _trace_mark(name, format, args...) \ + __trace_mark(1, name, NULL, format, ## args) /** * MARK_NOARGS - Format string for a marker with no argument. -- cgit v1.2.3 From 5b82a1b08a00b2adca3d9dd9777efff40b7aaaa1 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 12 May 2008 21:21:10 +0200 Subject: Port ftrace to markers Porting ftrace to the marker infrastructure. Don't need to chain to the wakeup tracer from the sched tracer, because markers support multiple probes connected. 
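
The port below hinges on two marker facilities: marker_probe_register(name, format, callback, private) and callbacks that walk the variadic arguments with va_arg in the order the format string lists them. A minimal user-space reduction of that callback convention (the fire_marker dispatcher and all names are invented for this sketch, not kernel API):

#include <stdarg.h>
#include <stdio.h>

/* The probe receives the format string plus a va_list pointer and walks
 * it with va_arg in the exact order the marker site listed the
 * arguments, skipping fields it does not care about.
 */
typedef void (*probe_func)(void *probe_data, void *call_data,
			   const char *format, va_list *args);

static void wakeup_probe(void *probe_data, void *call_data,
			 const char *format, va_list *args)
{
	/* skip "pid %d state %ld", then take the three pointers */
	(void)va_arg(*args, int);
	(void)va_arg(*args, long);
	void *rq   = va_arg(*args, void *);
	void *task = va_arg(*args, void *);
	void *curr = va_arg(*args, void *);

	printf("probe: fmt=\"%s\" rq=%p task=%p curr=%p\n",
	       format, rq, task, curr);
}

/* Stand-in for the marker call site expanding its arguments. */
static void fire_marker(probe_func f, const char *format, ...)
{
	va_list args;

	va_start(args, format);
	f(NULL, NULL, format, &args);
	va_end(args);
}

int main(void)
{
	int dummy_rq, dummy_task, dummy_curr;

	fire_marker(wakeup_probe,
		    "pid %d state %ld ## rq %p task %p rq->curr %p",
		    42, 1L, (void *)&dummy_rq, (void *)&dummy_task,
		    (void *)&dummy_curr);
	return 0;
}

Because several probes can attach to one marker, the wakeup tracer no longer needs the sched tracer to chain to it by hand; both simply register against the same marker names.
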
Signed-off-by: Mathieu Desnoyers CC: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 32 ------- kernel/sched.c | 14 +++- kernel/trace/trace.h | 20 +---- kernel/trace/trace_sched_switch.c | 171 +++++++++++++++++++++++++++++++------- kernel/trace/trace_sched_wakeup.c | 106 +++++++++++++++++++++-- 5 files changed, 255 insertions(+), 88 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 360ca99033d2..c0b1c69b55ce 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2131,38 +2131,6 @@ __trace_special(void *__tr, void *__data, } #endif -#ifdef CONFIG_CONTEXT_SWITCH_TRACER -extern void -ftrace_ctx_switch(void *rq, struct task_struct *prev, struct task_struct *next); -extern void -ftrace_wake_up_task(void *rq, struct task_struct *wakee, - struct task_struct *curr); -extern void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data); -extern void -ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); -#else -static inline void -ftrace_ctx_switch(void *rq, struct task_struct *prev, struct task_struct *next) -{ -} -static inline void -sched_trace_special(unsigned long p1, unsigned long p2, unsigned long p3) -{ -} -static inline void -ftrace_wake_up_task(void *rq, struct task_struct *wakee, - struct task_struct *curr) -{ -} -static inline void ftrace_all_fair_tasks(void *__rq, void *__tr, void *__data) -{ -} -static inline void -ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) -{ -} -#endif - extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask); extern long sched_getaffinity(pid_t pid, cpumask_t *mask); diff --git a/kernel/sched.c b/kernel/sched.c index ad95cca4e42e..e2e985eeee78 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2500,7 +2500,9 @@ out_activate: success = 1; out_running: - ftrace_wake_up_task(rq, p, rq->curr); + trace_mark(kernel_sched_wakeup, + "pid %d state %ld ## rq %p task %p rq->curr %p", + p->pid, p->state, rq, p, rq->curr); check_preempt_curr(rq, p); p->state = TASK_RUNNING; @@ -2631,7 +2633,9 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) p->sched_class->task_new(rq, p); inc_nr_running(rq); } - ftrace_wake_up_task(rq, p, rq->curr); + trace_mark(kernel_sched_wakeup_new, + "pid %d state %ld ## rq %p task %p rq->curr %p", + p->pid, p->state, rq, p, rq->curr); check_preempt_curr(rq, p); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) @@ -2804,7 +2808,11 @@ context_switch(struct rq *rq, struct task_struct *prev, struct mm_struct *mm, *oldmm; prepare_task_switch(rq, prev, next); - ftrace_ctx_switch(rq, prev, next); + trace_mark(kernel_sched_schedule, + "prev_pid %d next_pid %d prev_state %ld " + "## rq %p prev %p next %p", + prev->pid, next->pid, prev->state, + rq, prev, next); mm = next->mm; oldmm = prev->active_mm; /* diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8845033ab49d..f5de0601b408 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -234,25 +234,10 @@ void update_max_tr_single(struct trace_array *tr, extern cycle_t ftrace_now(int cpu); -#ifdef CONFIG_SCHED_TRACER -extern void -wakeup_sched_switch(struct task_struct *prev, struct task_struct *next); -extern void -wakeup_sched_wakeup(struct task_struct *wakee, struct task_struct *curr); -#else -static inline void -wakeup_sched_switch(struct task_struct *prev, struct task_struct *next) -{ -} -static inline void -wakeup_sched_wakeup(struct task_struct *wakee, struct task_struct 
*curr) -{ -} -#endif - #ifdef CONFIG_CONTEXT_SWITCH_TRACER typedef void (*tracer_switch_func_t)(void *private, + void *__rq, struct task_struct *prev, struct task_struct *next); @@ -262,9 +247,6 @@ struct tracer_switch_ops { struct tracer_switch_ops *next; }; -extern int register_tracer_switch(struct tracer_switch_ops *ops); -extern int unregister_tracer_switch(struct tracer_switch_ops *ops); - #endif /* CONFIG_CONTEXT_SWITCH_TRACER */ #ifdef CONFIG_DYNAMIC_FTRACE diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index a3376478fc2c..d25ffa5eaf2b 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -16,11 +16,14 @@ static struct trace_array *ctx_trace; static int __read_mostly tracer_enabled; +static atomic_t sched_ref; static void -ctx_switch_func(void *__rq, struct task_struct *prev, struct task_struct *next) +sched_switch_func(void *private, void *__rq, struct task_struct *prev, + struct task_struct *next) { - struct trace_array *tr = ctx_trace; + struct trace_array **ptr = private; + struct trace_array *tr = *ptr; struct trace_array_cpu *data; unsigned long flags; long disabled; @@ -41,10 +44,40 @@ ctx_switch_func(void *__rq, struct task_struct *prev, struct task_struct *next) local_irq_restore(flags); } +static notrace void +sched_switch_callback(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + struct task_struct *prev; + struct task_struct *next; + struct rq *__rq; + + if (!atomic_read(&sched_ref)) + return; + + /* skip prev_pid %d next_pid %d prev_state %ld */ + (void)va_arg(*args, int); + (void)va_arg(*args, int); + (void)va_arg(*args, long); + __rq = va_arg(*args, typeof(__rq)); + prev = va_arg(*args, typeof(prev)); + next = va_arg(*args, typeof(next)); + + tracing_record_cmdline(prev); + + /* + * If tracer_switch_func only points to the local + * switch func, it still needs the ptr passed to it. + */ + sched_switch_func(probe_data, __rq, prev, next); +} + static void -wakeup_func(void *__rq, struct task_struct *wakee, struct task_struct *curr) +wakeup_func(void *private, void *__rq, struct task_struct *wakee, struct + task_struct *curr) { - struct trace_array *tr = ctx_trace; + struct trace_array **ptr = private; + struct trace_array *tr = *ptr; struct trace_array_cpu *data; unsigned long flags; long disabled; @@ -67,35 +100,29 @@ wakeup_func(void *__rq, struct task_struct *wakee, struct task_struct *curr) local_irq_restore(flags); } -void -ftrace_ctx_switch(void *__rq, struct task_struct *prev, - struct task_struct *next) +static notrace void +wake_up_callback(void *probe_data, void *call_data, + const char *format, va_list *args) { - if (unlikely(atomic_read(&trace_record_cmdline_enabled))) - tracing_record_cmdline(prev); + struct task_struct *curr; + struct task_struct *task; + struct rq *__rq; - /* - * If tracer_switch_func only points to the local - * switch func, it still needs the ptr passed to it. 
- */ - ctx_switch_func(__rq, prev, next); + if (likely(!tracer_enabled)) + return; - /* - * Chain to the wakeup tracer (this is a NOP if disabled): - */ - wakeup_sched_switch(prev, next); -} + /* Skip pid %d state %ld */ + (void)va_arg(*args, int); + (void)va_arg(*args, long); + /* now get the meat: "rq %p task %p rq->curr %p" */ + __rq = va_arg(*args, typeof(__rq)); + task = va_arg(*args, typeof(task)); + curr = va_arg(*args, typeof(curr)); -void -ftrace_wake_up_task(void *__rq, struct task_struct *wakee, - struct task_struct *curr) -{ - wakeup_func(__rq, wakee, curr); + tracing_record_cmdline(task); + tracing_record_cmdline(curr); - /* - * Chain to the wakeup tracer (this is a NOP if disabled): - */ - wakeup_sched_wakeup(wakee, curr); + wakeup_func(probe_data, __rq, task, curr); } void @@ -132,15 +159,95 @@ static void sched_switch_reset(struct trace_array *tr) tracing_reset(tr->data[cpu]); } +static int tracing_sched_register(void) +{ + int ret; + + ret = marker_probe_register("kernel_sched_wakeup", + "pid %d state %ld ## rq %p task %p rq->curr %p", + wake_up_callback, + &ctx_trace); + if (ret) { + pr_info("wakeup trace: Couldn't add marker" + " probe to kernel_sched_wakeup\n"); + return ret; + } + + ret = marker_probe_register("kernel_sched_wakeup_new", + "pid %d state %ld ## rq %p task %p rq->curr %p", + wake_up_callback, + &ctx_trace); + if (ret) { + pr_info("wakeup trace: Couldn't add marker" + " probe to kernel_sched_wakeup_new\n"); + goto fail_deprobe; + } + + ret = marker_probe_register("kernel_sched_schedule", + "prev_pid %d next_pid %d prev_state %ld " + "## rq %p prev %p next %p", + sched_switch_callback, + &ctx_trace); + if (ret) { + pr_info("sched trace: Couldn't add marker" + " probe to kernel_sched_schedule\n"); + goto fail_deprobe_wake_new; + } + + return ret; +fail_deprobe_wake_new: + marker_probe_unregister("kernel_sched_wakeup_new", + wake_up_callback, + &ctx_trace); +fail_deprobe: + marker_probe_unregister("kernel_sched_wakeup", + wake_up_callback, + &ctx_trace); + return ret; +} + +static void tracing_sched_unregister(void) +{ + marker_probe_unregister("kernel_sched_schedule", + sched_switch_callback, + &ctx_trace); + marker_probe_unregister("kernel_sched_wakeup_new", + wake_up_callback, + &ctx_trace); + marker_probe_unregister("kernel_sched_wakeup", + wake_up_callback, + &ctx_trace); +} + +void tracing_start_sched_switch(void) +{ + long ref; + + ref = atomic_inc_return(&sched_ref); + if (ref == 1) + tracing_sched_register(); +} + +void tracing_stop_sched_switch(void) +{ + long ref; + + ref = atomic_dec_and_test(&sched_ref); + if (ref) + tracing_sched_unregister(); +} + static void start_sched_trace(struct trace_array *tr) { sched_switch_reset(tr); atomic_inc(&trace_record_cmdline_enabled); tracer_enabled = 1; + tracing_start_sched_switch(); } static void stop_sched_trace(struct trace_array *tr) { + tracing_stop_sched_switch(); atomic_dec(&trace_record_cmdline_enabled); tracer_enabled = 0; } @@ -181,6 +288,14 @@ static struct tracer sched_switch_trace __read_mostly = __init static int init_sched_switch_trace(void) { + int ret = 0; + + if (atomic_read(&sched_ref)) + ret = tracing_sched_register(); + if (ret) { + pr_info("error registering scheduler trace\n"); + return ret; + } return register_tracer(&sched_switch_trace); } device_initcall(init_sched_switch_trace); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 5948011006bc..5d2fb48e47f8 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ 
-15,6 +15,7 @@ #include #include #include +#include #include "trace.h" @@ -44,11 +45,13 @@ static int report_latency(cycle_t delta) return 1; } -void -wakeup_sched_switch(struct task_struct *prev, struct task_struct *next) +static void notrace +wakeup_sched_switch(void *private, void *rq, struct task_struct *prev, + struct task_struct *next) { unsigned long latency = 0, t0 = 0, t1 = 0; - struct trace_array *tr = wakeup_trace; + struct trace_array **ptr = private; + struct trace_array *tr = *ptr; struct trace_array_cpu *data; cycle_t T0, T1, delta; unsigned long flags; @@ -113,6 +116,31 @@ out: atomic_dec(&tr->data[cpu]->disabled); } +static notrace void +sched_switch_callback(void *probe_data, void *call_data, + const char *format, va_list *args) +{ + struct task_struct *prev; + struct task_struct *next; + struct rq *__rq; + + /* skip prev_pid %d next_pid %d prev_state %ld */ + (void)va_arg(*args, int); + (void)va_arg(*args, int); + (void)va_arg(*args, long); + __rq = va_arg(*args, typeof(__rq)); + prev = va_arg(*args, typeof(prev)); + next = va_arg(*args, typeof(next)); + + tracing_record_cmdline(prev); + + /* + * If tracer_switch_func only points to the local + * switch func, it still needs the ptr passed to it. + */ + wakeup_sched_switch(probe_data, __rq, prev, next); +} + static void __wakeup_reset(struct trace_array *tr) { struct trace_array_cpu *data; @@ -188,19 +216,68 @@ out: atomic_dec(&tr->data[cpu]->disabled); } -void wakeup_sched_wakeup(struct task_struct *wakee, struct task_struct *curr) +static notrace void +wake_up_callback(void *probe_data, void *call_data, + const char *format, va_list *args) { + struct trace_array **ptr = probe_data; + struct trace_array *tr = *ptr; + struct task_struct *curr; + struct task_struct *task; + struct rq *__rq; + if (likely(!tracer_enabled)) return; + /* Skip pid %d state %ld */ + (void)va_arg(*args, int); + (void)va_arg(*args, long); + /* now get the meat: "rq %p task %p rq->curr %p" */ + __rq = va_arg(*args, typeof(__rq)); + task = va_arg(*args, typeof(task)); + curr = va_arg(*args, typeof(curr)); + + tracing_record_cmdline(task); tracing_record_cmdline(curr); - tracing_record_cmdline(wakee); - wakeup_check_start(wakeup_trace, wakee, curr); + wakeup_check_start(tr, task, curr); } static void start_wakeup_tracer(struct trace_array *tr) { + int ret; + + ret = marker_probe_register("kernel_sched_wakeup", + "pid %d state %ld ## rq %p task %p rq->curr %p", + wake_up_callback, + &wakeup_trace); + if (ret) { + pr_info("wakeup trace: Couldn't add marker" + " probe to kernel_sched_wakeup\n"); + return; + } + + ret = marker_probe_register("kernel_sched_wakeup_new", + "pid %d state %ld ## rq %p task %p rq->curr %p", + wake_up_callback, + &wakeup_trace); + if (ret) { + pr_info("wakeup trace: Couldn't add marker" + " probe to kernel_sched_wakeup_new\n"); + goto fail_deprobe; + } + + ret = marker_probe_register("kernel_sched_schedule", + "prev_pid %d next_pid %d prev_state %ld " + "## rq %p prev %p next %p", + sched_switch_callback, + &wakeup_trace); + if (ret) { + pr_info("sched trace: Couldn't add marker" + " probe to kernel_sched_schedule\n"); + goto fail_deprobe_wake_new; + } + wakeup_reset(tr); /* @@ -215,11 +292,28 @@ static void start_wakeup_tracer(struct trace_array *tr) tracer_enabled = 1; return; +fail_deprobe_wake_new: + marker_probe_unregister("kernel_sched_wakeup_new", + wake_up_callback, + &wakeup_trace); +fail_deprobe: + marker_probe_unregister("kernel_sched_wakeup", + wake_up_callback, + &wakeup_trace); } static void 
stop_wakeup_tracer(struct trace_array *tr) { tracer_enabled = 0; + marker_probe_unregister("kernel_sched_schedule", + sched_switch_callback, + &wakeup_trace); + marker_probe_unregister("kernel_sched_wakeup_new", + wake_up_callback, + &wakeup_trace); + marker_probe_unregister("kernel_sched_wakeup", + wake_up_callback, + &wakeup_trace); } static void wakeup_tracer_init(struct trace_array *tr) -- cgit v1.2.3 From 74f4e369fc5b52433ad824cef32d3bf1304549be Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:21:15 +0200 Subject: ftrace: stacktrace fix Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/ftrace.h | 8 ++++++++ kernel/semaphore.c | 2 ++ kernel/trace/trace.c | 4 ++-- kernel/trace/trace.h | 2 +- 4 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 0d3714e7110b..017ab44d572a 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -120,4 +120,12 @@ static inline void tracer_disable(void) # define trace_preempt_off(a0, a1) do { } while (0) #endif +#ifdef CONFIG_CONTEXT_SWITCH_TRACER +extern void +ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); +#else +static inline void +ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { } +#endif + #endif /* _LINUX_FTRACE_H */ diff --git a/kernel/semaphore.c b/kernel/semaphore.c index 5c2942e768cd..1a064adab658 100644 --- a/kernel/semaphore.c +++ b/kernel/semaphore.c @@ -31,6 +31,7 @@ #include #include #include +#include static noinline void __down(struct semaphore *sem); static noinline int __down_interruptible(struct semaphore *sem); @@ -53,6 +54,7 @@ void down(struct semaphore *sem) { unsigned long flags; + ftrace_special(sem->count, 0, __LINE__); spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2824cf48cdca..3271916ff033 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -901,7 +901,7 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->ctx.next_pid = next->pid; entry->ctx.next_prio = next->prio; entry->ctx.next_state = next->state; - __trace_stack(tr, data, flags, 4); + __trace_stack(tr, data, flags, 5); __raw_spin_unlock(&data->lock); raw_local_irq_restore(irq_flags); } @@ -927,7 +927,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->ctx.next_pid = wakee->pid; entry->ctx.next_prio = wakee->prio; entry->ctx.next_state = wakee->state; - __trace_stack(tr, data, flags, 5); + __trace_stack(tr, data, flags, 6); __raw_spin_unlock(&data->lock); raw_local_irq_restore(irq_flags); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f5de0601b408..c460e85e94ed 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -51,7 +51,7 @@ struct special_entry { * Stack-trace entry: */ -#define FTRACE_STACK_ENTRIES 5 +#define FTRACE_STACK_ENTRIES 8 struct stack_entry { unsigned long caller[FTRACE_STACK_ENTRIES]; -- cgit v1.2.3 From d49dbf33f0bf8748ee3662b973eb57e60525d622 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 16 May 2008 10:41:53 +0200 Subject: ftrace: fix include file dependency Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/ftrace.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 017ab44d572a..911d5d80b49f 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -4,6 +4,7 @@ #ifdef 
CONFIG_FTRACE #include +#include extern int ftrace_enabled; extern int -- cgit v1.2.3 From 489f139614596cbc956a06f5e4bb41288e276fe3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 25 Feb 2008 13:38:05 +0100 Subject: ftrace: fix build bug Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/ftrace.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 911d5d80b49f..922e23d0196f 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -106,16 +106,16 @@ static inline void tracer_disable(void) #endif #ifdef CONFIG_IRQSOFF_TRACER - extern void notrace time_hardirqs_on(unsigned long a0, unsigned long a1); - extern void notrace time_hardirqs_off(unsigned long a0, unsigned long a1); + extern void time_hardirqs_on(unsigned long a0, unsigned long a1); + extern void time_hardirqs_off(unsigned long a0, unsigned long a1); #else # define time_hardirqs_on(a0, a1) do { } while (0) # define time_hardirqs_off(a0, a1) do { } while (0) #endif #ifdef CONFIG_PREEMPT_TRACER - extern void notrace trace_preempt_on(unsigned long a0, unsigned long a1); - extern void notrace trace_preempt_off(unsigned long a0, unsigned long a1); + extern void trace_preempt_on(unsigned long a0, unsigned long a1); + extern void trace_preempt_off(unsigned long a0, unsigned long a1); #else # define trace_preempt_on(a0, a1) do { } while (0) # define trace_preempt_off(a0, a1) do { } while (0) -- cgit v1.2.3 From 8b7d89d02ef3c6a7c73d6596f28cea7632850af4 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:20:56 +0200 Subject: x86: mmiotrace - trace memory mapped IO Mmiotrace is a tool for trapping memory mapped IO (MMIO) accesses within the kernel. It is used for debugging and especially for reverse engineering evil binary drivers. Mmiotrace works by wrapping the ioremap family of kernel functions and marking the returned pages as not present. Access to the IO memory triggers a page fault, which will be handled by mmiotrace's custom page fault handler. This will single-step the faulted instruction with the MMIO page marked as present. Access logs are directed to user space via relay and debug_fs. This page fault approach is necessary, because binary drivers have readl/writel etc. calls inlined and therefore extremely difficult to trap with e.g. kprobes. This patch depends on the custom page fault handlers patch.
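
For intuition, the same trap-and-log idea can be reduced to user space with mprotect() and a SIGSEGV handler. This analogue only disarms the page once instead of single-stepping and re-arming as mmiotrace does, and fprintf() in a signal handler is tolerated here purely for demonstration:

#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

static char *watched;	/* page whose accesses we want to trap */

static void segv_handler(int sig, siginfo_t *si, void *ctx)
{
	(void)sig;
	(void)ctx;
	/* Log the faulting address, then make the page accessible so the
	 * faulting instruction can simply be restarted. mmiotrace instead
	 * keeps the page non-present and single-steps one instruction.
	 */
	fprintf(stderr, "trapped access at %p\n", si->si_addr);
	mprotect(watched, getpagesize(), PROT_READ | PROT_WRITE);
}

int main(void)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = segv_handler;
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGSEGV, &sa, NULL);

	watched = mmap(NULL, getpagesize(), PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (watched == MAP_FAILED)
		return 1;

	/* "arm" the page: revoke access, like clearing _PAGE_PRESENT */
	mprotect(watched, getpagesize(), PROT_NONE);

	watched[0] = 42;	/* faults once, is logged, then retried */
	printf("after trap: %d\n", watched[0]);
	return 0;
}
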
Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig.debug | 27 ++ arch/x86/kernel/Makefile | 2 + arch/x86/kernel/init_task.c | 1 + arch/x86/kernel/mmiotrace/Makefile | 4 + arch/x86/kernel/mmiotrace/kmmio.c | 391 ++++++++++++++++++++++ arch/x86/kernel/mmiotrace/kmmio.h | 58 ++++ arch/x86/kernel/mmiotrace/mmio-mod.c | 527 ++++++++++++++++++++++++++++++ arch/x86/kernel/mmiotrace/pf_in.c | 489 +++++++++++++++++++++++++++ arch/x86/kernel/mmiotrace/pf_in.h | 39 +++ arch/x86/kernel/mmiotrace/testmmiotrace.c | 77 +++++ include/linux/mmiotrace.h | 62 ++++ 11 files changed, 1677 insertions(+) create mode 100644 arch/x86/kernel/mmiotrace/Makefile create mode 100644 arch/x86/kernel/mmiotrace/kmmio.c create mode 100644 arch/x86/kernel/mmiotrace/kmmio.h create mode 100644 arch/x86/kernel/mmiotrace/mmio-mod.c create mode 100644 arch/x86/kernel/mmiotrace/pf_in.c create mode 100644 arch/x86/kernel/mmiotrace/pf_in.h create mode 100644 arch/x86/kernel/mmiotrace/testmmiotrace.c create mode 100644 include/linux/mmiotrace.h (limited to 'include/linux') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 9431a8399844..7c6496e2225e 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -176,6 +176,33 @@ config PAGE_FAULT_HANDLERS register a function that is called on every page fault. Custom handlers are used by some debugging and reverse engineering tools. +config MMIOTRACE + tristate "Memory mapped IO tracing" + depends on DEBUG_KERNEL && PAGE_FAULT_HANDLERS && RELAY && DEBUG_FS + default n + help + This will build a kernel module called mmiotrace. + + Mmiotrace traces Memory Mapped I/O access and is meant for debugging + and reverse engineering. The kernel module offers wrapped + versions of the ioremap family of functions. The driver to be traced + must be modified to call these wrappers. A user space program is + required to collect the MMIO data. + + See http://nouveau.freedesktop.org/wiki/MmioTrace + If you are not helping to develop drivers, say N. + +config MMIOTRACE_TEST + tristate "Test module for mmiotrace" + depends on MMIOTRACE && m + default n + help + This is a dumb module for testing mmiotrace. It is very dangerous + as it will write garbage to IO memory starting at a given address. + However, it should be safe to use on e.g. unused portion of VRAM. + + Say N, unless you absolutely know what you are doing. + # # IO delay types: # diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 739d49acd2f1..a51ac153685e 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -79,6 +79,8 @@ obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_VM86) += vm86_32.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o +obj-$(CONFIG_MMIOTRACE) += mmiotrace/ + obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_K8_NB) += k8.o diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c index a4f93b4120c1..027a5b6a12b2 100644 --- a/arch/x86/kernel/init_task.c +++ b/arch/x86/kernel/init_task.c @@ -15,6 +15,7 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); EXPORT_UNUSED_SYMBOL(init_mm); /* will be removed in 2.6.26 */ +EXPORT_SYMBOL_GPL(init_mm); /* * Initial thread structure. 
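
Regarding the Kconfig note above that a traced driver "must be modified to call these wrappers": such a driver would substitute the traced variants exported below for its usual ioremap/iounmap calls, roughly as in this hypothetical fragment (the prototypes are restated here for self-containment, and bar_phys/bar_len stand in for a real PCI BAR):

#include <linux/io.h>
#include <linux/errno.h>

/* Wrappers exported by mmio-mod.c below; a real tree would declare
 * these in a shared header. */
void __iomem *ioremap_nocache_trace(unsigned long offset, unsigned long size);
void iounmap_trace(volatile void __iomem *addr);

static void __iomem *regs;

static int example_attach(unsigned long bar_phys, unsigned long bar_len)
{
	regs = ioremap_nocache_trace(bar_phys, bar_len); /* was ioremap_nocache() */
	if (!regs)
		return -ENOMEM;
	/* every access through this mapping now faults into mmiotrace */
	writel(0x1, regs + 0x04);
	return 0;
}

static void example_detach(void)
{
	iounmap_trace(regs);	/* was iounmap() */
}
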
diff --git a/arch/x86/kernel/mmiotrace/Makefile b/arch/x86/kernel/mmiotrace/Makefile new file mode 100644 index 000000000000..d6905f7f981b --- /dev/null +++ b/arch/x86/kernel/mmiotrace/Makefile @@ -0,0 +1,4 @@ +obj-$(CONFIG_MMIOTRACE) += mmiotrace.o +mmiotrace-objs := pf_in.o kmmio.o mmio-mod.o + +obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o diff --git a/arch/x86/kernel/mmiotrace/kmmio.c b/arch/x86/kernel/mmiotrace/kmmio.c new file mode 100644 index 000000000000..8ba48f9c91b4 --- /dev/null +++ b/arch/x86/kernel/mmiotrace/kmmio.c @@ -0,0 +1,391 @@ +/* Support for MMIO probes. + * Borrows much code from kprobes + * (C) 2002 Louis Zhuang . + * 2007 Alexander Eichner + * 2008 Pekka Paalanen + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kmmio.h" + +#define KMMIO_HASH_BITS 6 +#define KMMIO_TABLE_SIZE (1 << KMMIO_HASH_BITS) +#define KMMIO_PAGE_HASH_BITS 4 +#define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS) + +struct kmmio_context { + struct kmmio_fault_page *fpage; + struct kmmio_probe *probe; + unsigned long saved_flags; + int active; +}; + +static int kmmio_page_fault(struct pt_regs *regs, unsigned long error_code, + unsigned long address); +static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val, + void *args); + +static DEFINE_SPINLOCK(kmmio_lock); + +/* These are protected by kmmio_lock */ +unsigned int kmmio_count; +static unsigned int handler_registered; +static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE]; +static LIST_HEAD(kmmio_probes); + +static struct kmmio_context kmmio_ctx[NR_CPUS]; + +static struct pf_handler kmmio_pf_hook = { + .handler = kmmio_page_fault +}; + +static struct notifier_block nb_die = { + .notifier_call = kmmio_die_notifier +}; + +int init_kmmio(void) +{ + int i; + for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) + INIT_LIST_HEAD(&kmmio_page_table[i]); + + register_die_notifier(&nb_die); + return 0; +} + +void cleanup_kmmio(void) +{ + /* + * Assume the following have been already cleaned by calling + * unregister_kmmio_probe() appropriately: + * kmmio_page_table, kmmio_probes + */ + if (handler_registered) { + unregister_page_fault_handler(&kmmio_pf_hook); + synchronize_rcu(); + } + unregister_die_notifier(&nb_die); +} + +/* + * this is basically a dynamic stabbing problem: + * Could use the existing prio tree code or + * Possible better implementations: + * The Interval Skip List: A Data Structure for Finding All Intervals That + * Overlap a Point (might be simple) + * Space Efficient Dynamic Stabbing with Fast Queries - Mikkel Thorup + */ +/* Get the kmmio at this addr (if any). You must be holding kmmio_lock.
*/ +static struct kmmio_probe *get_kmmio_probe(unsigned long addr) +{ + struct kmmio_probe *p; + list_for_each_entry(p, &kmmio_probes, list) { + if (addr >= p->addr && addr <= (p->addr + p->len)) + return p; + } + return NULL; +} + +static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) +{ + struct list_head *head, *tmp; + + page &= PAGE_MASK; + head = &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)]; + list_for_each(tmp, head) { + struct kmmio_fault_page *p + = list_entry(tmp, struct kmmio_fault_page, list); + if (p->page == page) + return p; + } + + return NULL; +} + +static void arm_kmmio_fault_page(unsigned long page, int *large) +{ + unsigned long address = page & PAGE_MASK; + pgd_t *pgd = pgd_offset_k(address); + pud_t *pud = pud_offset(pgd, address); + pmd_t *pmd = pmd_offset(pud, address); + pte_t *pte = pte_offset_kernel(pmd, address); + + if (pmd_large(*pmd)) { + set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_PRESENT)); + if (large) + *large = 1; + } else { + set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT)); + } + + __flush_tlb_one(page); +} + +static void disarm_kmmio_fault_page(unsigned long page, int *large) +{ + unsigned long address = page & PAGE_MASK; + pgd_t *pgd = pgd_offset_k(address); + pud_t *pud = pud_offset(pgd, address); + pmd_t *pmd = pmd_offset(pud, address); + pte_t *pte = pte_offset_kernel(pmd, address); + + if (large && *large) { + set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_PRESENT)); + *large = 0; + } else { + set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT)); + } + + __flush_tlb_one(page); +} + +/* + * Interrupts are disabled on entry as trap3 is an interrupt gate + * and they remain disabled throughout this function. + */ +static int kmmio_handler(struct pt_regs *regs, unsigned long addr) +{ + struct kmmio_context *ctx; + int cpu; + + /* + * Preemption is now disabled to prevent process switch during + * single stepping. We can only handle one active kmmio trace + * per cpu, so ensure that we finish it before something else + * gets to run. + * + * XXX what if an interrupt occurs between returning from + * do_page_fault() and entering the single-step exception handler? + * And that interrupt triggers a kmmio trap? + */ + preempt_disable(); + cpu = smp_processor_id(); + ctx = &kmmio_ctx[cpu]; + + /* interrupts disabled and CPU-local data => atomicity guaranteed. */ + if (ctx->active) { + /* + * This avoids a deadlock with kmmio_lock. + * If this page fault really was due to kmmio trap, + * all hell breaks loose. + */ + printk(KERN_EMERG "mmiotrace: recursive probe hit on CPU %d, " + "for address %lu. Ignoring.\n", + cpu, addr); + goto no_kmmio; + } + ctx->active++; + + /* + * Acquire the kmmio lock to prevent changes affecting + * get_kmmio_fault_page() and get_kmmio_probe(), since we save their + * returned pointers. + * The lock is released in post_kmmio_handler(). + * XXX: could/should get_kmmio_*() be using RCU instead of spinlock? + */ + spin_lock(&kmmio_lock); + + ctx->fpage = get_kmmio_fault_page(addr); + if (!ctx->fpage) { + /* this page fault is not caused by kmmio */ + goto no_kmmio_locked; + } + + ctx->probe = get_kmmio_probe(addr); + ctx->saved_flags = (regs->flags & (TF_MASK|IF_MASK)); + + if (ctx->probe && ctx->probe->pre_handler) + ctx->probe->pre_handler(ctx->probe, regs, addr); + + regs->flags |= TF_MASK; + regs->flags &= ~IF_MASK; + + /* We hold lock, now we set present bit in PTE and single step.
*/ + disarm_kmmio_fault_page(ctx->fpage->page, NULL); + + return 1; + +no_kmmio_locked: + spin_unlock(&kmmio_lock); + ctx->active--; +no_kmmio: + preempt_enable_no_resched(); + /* page fault not handled by kmmio */ + return 0; +} + +/* + * Interrupts are disabled on entry as trap1 is an interrupt gate + * and they remain disabled throughout this function. + * And we hold kmmio lock. + */ +static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) +{ + int cpu = smp_processor_id(); + struct kmmio_context *ctx = &kmmio_ctx[cpu]; + + if (!ctx->active) + return 0; + + if (ctx->probe && ctx->probe->post_handler) + ctx->probe->post_handler(ctx->probe, condition, regs); + + arm_kmmio_fault_page(ctx->fpage->page, NULL); + + regs->flags &= ~TF_MASK; + regs->flags |= ctx->saved_flags; + + /* These were acquired in kmmio_handler(). */ + ctx->active--; + spin_unlock(&kmmio_lock); + preempt_enable_no_resched(); + + /* + * if somebody else is singlestepping across a probe point, flags + * will have TF set, in which case, continue the remaining processing + * of do_debug, as if this is not a probe hit. + */ + if (regs->flags & TF_MASK) + return 0; + + return 1; +} + +static int add_kmmio_fault_page(unsigned long page) +{ + struct kmmio_fault_page *f; + + page &= PAGE_MASK; + f = get_kmmio_fault_page(page); + if (f) { + f->count++; + return 0; + } + + f = kmalloc(sizeof(*f), GFP_ATOMIC); + if (!f) + return -1; + + f->count = 1; + f->page = page; + list_add(&f->list, + &kmmio_page_table[hash_long(f->page, KMMIO_PAGE_HASH_BITS)]); + + arm_kmmio_fault_page(f->page, NULL); + + return 0; +} + +static void release_kmmio_fault_page(unsigned long page) +{ + struct kmmio_fault_page *f; + + page &= PAGE_MASK; + f = get_kmmio_fault_page(page); + if (!f) + return; + + f->count--; + if (!f->count) { + disarm_kmmio_fault_page(f->page, NULL); + list_del(&f->list); + } +} + +int register_kmmio_probe(struct kmmio_probe *p) +{ + int ret = 0; + unsigned long size = 0; + + spin_lock_irq(&kmmio_lock); + kmmio_count++; + if (get_kmmio_probe(p->addr)) { + ret = -EEXIST; + goto out; + } + list_add(&p->list, &kmmio_probes); + /*printk("adding fault pages...\n");*/ + while (size < p->len) { + if (add_kmmio_fault_page(p->addr + size)) + printk(KERN_ERR "mmio: Unable to set page fault.\n"); + size += PAGE_SIZE; + } + + if (!handler_registered) { + register_page_fault_handler(&kmmio_pf_hook); + handler_registered++; + } + +out: + spin_unlock_irq(&kmmio_lock); + /* + * XXX: What should I do here? + * Here was a call to global_flush_tlb(), but it does not exist + * anymore. + */ + return ret; +} + +void unregister_kmmio_probe(struct kmmio_probe *p) +{ + unsigned long size = 0; + + spin_lock_irq(&kmmio_lock); + while (size < p->len) { + release_kmmio_fault_page(p->addr + size); + size += PAGE_SIZE; + } + list_del(&p->list); + kmmio_count--; + spin_unlock_irq(&kmmio_lock); +} + +/* + * According to 2.6.20, mainly x86_64 arch: + * This is being called from do_page_fault(), via the page fault notifier + * chain. The chain is called for both user space faults and kernel space + * faults (address >= TASK_SIZE64), except not on faults serviced by + * vmalloc_fault(). + * + * We may be in an interrupt or a critical section. Also prefetching may + * trigger a page fault. We may be in the middle of process switch. + * The page fault hook functionality has put us inside RCU read lock. + * + * Local interrupts are disabled, so preemption cannot happen. + * Do not enable interrupts, do not sleep, and watch out for other CPUs.
+ */ +static int kmmio_page_fault(struct pt_regs *regs, unsigned long error_code, + unsigned long address) +{ + if (is_kmmio_active()) + if (kmmio_handler(regs, address) == 1) + return -1; + return 0; +} + +static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val, + void *args) +{ + struct die_args *arg = args; + + if (val == DIE_DEBUG) + if (post_kmmio_handler(arg->err, arg->regs) == 1) + return NOTIFY_STOP; + + return NOTIFY_DONE; +} diff --git a/arch/x86/kernel/mmiotrace/kmmio.h b/arch/x86/kernel/mmiotrace/kmmio.h new file mode 100644 index 000000000000..85b7f68a3b8a --- /dev/null +++ b/arch/x86/kernel/mmiotrace/kmmio.h @@ -0,0 +1,58 @@ +#ifndef _LINUX_KMMIO_H +#define _LINUX_KMMIO_H + +#include +#include +#include +#include +#include +#include +#include + +struct kmmio_probe; +struct kmmio_fault_page; +struct pt_regs; + +typedef void (*kmmio_pre_handler_t)(struct kmmio_probe *, + struct pt_regs *, unsigned long addr); +typedef void (*kmmio_post_handler_t)(struct kmmio_probe *, + unsigned long condition, struct pt_regs *); + +struct kmmio_probe { + struct list_head list; + + /* start location of the probe point */ + unsigned long addr; + + /* length of the probe region */ + unsigned long len; + + /* Called before addr is executed. */ + kmmio_pre_handler_t pre_handler; + + /* Called after addr is executed, unless... */ + kmmio_post_handler_t post_handler; +}; + +struct kmmio_fault_page { + struct list_head list; + + /* location of the fault page */ + unsigned long page; + + int count; +}; + +/* kmmio is active by some kmmio_probes? */ +static inline int is_kmmio_active(void) +{ + extern unsigned int kmmio_count; + return kmmio_count; +} + +int init_kmmio(void); +void cleanup_kmmio(void); +int register_kmmio_probe(struct kmmio_probe *p); +void unregister_kmmio_probe(struct kmmio_probe *p); + +#endif /* _LINUX_KMMIO_H */ diff --git a/arch/x86/kernel/mmiotrace/mmio-mod.c b/arch/x86/kernel/mmiotrace/mmio-mod.c new file mode 100644 index 000000000000..73561fe85f03 --- /dev/null +++ b/arch/x86/kernel/mmiotrace/mmio-mod.c @@ -0,0 +1,527 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2005 + * Jeff Muizelaar, 2006, 2007 + * Pekka Paalanen, 2008 + * + * Derived from the read-mod example from relay-examples by Tom Zanussi. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for ISA_START_ADDRESS */ + +#include "kmmio.h" +#include "pf_in.h" + +/* This app's relay channel files will appear in /debug/mmio-trace */ +#define APP_DIR "mmio-trace" +/* the marker injection file in /proc */ +#define MARKER_FILE "mmio-marker" + +#define MODULE_NAME "mmiotrace" + +struct trap_reason { + unsigned long addr; + unsigned long ip; + enum reason_type type; + int active_traces; +}; + +static struct trap_reason pf_reason[NR_CPUS]; +static struct mm_io_header_rw cpu_trace[NR_CPUS]; + +static struct file_operations mmio_fops = { + .owner = THIS_MODULE, +}; + +static const size_t subbuf_size = 256*1024; +static struct rchan *chan; +static struct dentry *dir; +static int suspended; /* XXX should this be per cpu? */ +static struct proc_dir_entry *proc_marker_file; + +/* module parameters */ +static unsigned int n_subbufs = 32*4; +static unsigned long filter_offset; +static int nommiotrace; +static int ISA_trace; +static int trace_pc; + +module_param(n_subbufs, uint, 0); +module_param(filter_offset, ulong, 0); +module_param(nommiotrace, bool, 0); +module_param(ISA_trace, bool, 0); +module_param(trace_pc, bool, 0); + +MODULE_PARM_DESC(n_subbufs, "Number of 256kB buffers, default 128."); +MODULE_PARM_DESC(filter_offset, "Start address of traced mappings."); +MODULE_PARM_DESC(nommiotrace, "Disable actual MMIO tracing."); +MODULE_PARM_DESC(ISA_trace, "Do not exclude the low ISA range."); +MODULE_PARM_DESC(trace_pc, "Record address of faulting instructions."); + +static void record_timestamp(struct mm_io_header *header) +{ + struct timespec now; + + getnstimeofday(&now); + header->sec = now.tv_sec; + header->nsec = now.tv_nsec; +} + +/* + * Write callback for the /proc entry: + * Read a marker and write it to the mmio trace log + */ +static int write_marker(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + char *event = NULL; + struct mm_io_header *headp; + int len = (count > 65535) ? 65535 : count; + + event = kzalloc(sizeof(*headp) + len, GFP_KERNEL); + if (!event) + return -ENOMEM; + + headp = (struct mm_io_header *)event; + headp->type = MMIO_MAGIC | (MMIO_MARKER << MMIO_OPCODE_SHIFT); + headp->data_len = len; + record_timestamp(headp); + + if (copy_from_user(event + sizeof(*headp), buffer, len)) { + kfree(event); + return -EFAULT; + } + + relay_write(chan, event, sizeof(*headp) + len); + kfree(event); + return len; +} + +static void print_pte(unsigned long address) +{ + pgd_t *pgd = pgd_offset_k(address); + pud_t *pud = pud_offset(pgd, address); + pmd_t *pmd = pmd_offset(pud, address); + if (pmd_large(*pmd)) { + printk(KERN_EMERG MODULE_NAME ": 4MB pages are not " + "currently supported: %lx\n", + address); + BUG(); + } + printk(KERN_DEBUG MODULE_NAME ": pte for 0x%lx: 0x%lx 0x%lx\n", + address, + pte_val(*pte_offset_kernel(pmd, address)), + pte_val(*pte_offset_kernel(pmd, address)) & _PAGE_PRESENT); +} + +/* + * For some reason the pre/post pairs have been called in an + * unmatched order. Report and die. 
+ */ +static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr) +{ + const unsigned long cpu = smp_processor_id(); + printk(KERN_EMERG MODULE_NAME ": unexpected fault for address: %lx, " + "last fault for address: %lx\n", + addr, pf_reason[cpu].addr); + print_pte(addr); +#ifdef __i386__ + print_symbol(KERN_EMERG "faulting EIP is at %s\n", regs->ip); + print_symbol(KERN_EMERG "last faulting EIP was at %s\n", + pf_reason[cpu].ip); + printk(KERN_EMERG + "eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", + regs->ax, regs->bx, regs->cx, regs->dx); + printk(KERN_EMERG + "esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", + regs->si, regs->di, regs->bp, regs->sp); +#else + print_symbol(KERN_EMERG "faulting RIP is at %s\n", regs->ip); + print_symbol(KERN_EMERG "last faulting RIP was at %s\n", + pf_reason[cpu].ip); + printk(KERN_EMERG "rax: %016lx rcx: %016lx rdx: %016lx\n", + regs->ax, regs->cx, regs->dx); + printk(KERN_EMERG "rsi: %016lx rdi: %016lx " + "rbp: %016lx rsp: %016lx\n", + regs->si, regs->di, regs->bp, regs->sp); +#endif + BUG(); +} + +static void pre(struct kmmio_probe *p, struct pt_regs *regs, + unsigned long addr) +{ + const unsigned long cpu = smp_processor_id(); + const unsigned long instptr = instruction_pointer(regs); + const enum reason_type type = get_ins_type(instptr); + + /* it doesn't make sense to have more than one active trace per cpu */ + if (pf_reason[cpu].active_traces) + die_kmmio_nesting_error(regs, addr); + else + pf_reason[cpu].active_traces++; + + pf_reason[cpu].type = type; + pf_reason[cpu].addr = addr; + pf_reason[cpu].ip = instptr; + + cpu_trace[cpu].header.type = MMIO_MAGIC; + cpu_trace[cpu].header.pid = 0; + cpu_trace[cpu].header.data_len = sizeof(struct mm_io_rw); + cpu_trace[cpu].rw.address = addr; + + /* + * Only record the program counter when requested. + * It may taint clean-room reverse engineering. + */ + if (trace_pc) + cpu_trace[cpu].rw.pc = instptr; + else + cpu_trace[cpu].rw.pc = 0; + + record_timestamp(&cpu_trace[cpu].header); + + switch (type) { + case REG_READ: + cpu_trace[cpu].header.type |= + (MMIO_READ << MMIO_OPCODE_SHIFT) | + (get_ins_mem_width(instptr) << MMIO_WIDTH_SHIFT); + break; + case REG_WRITE: + cpu_trace[cpu].header.type |= + (MMIO_WRITE << MMIO_OPCODE_SHIFT) | + (get_ins_mem_width(instptr) << MMIO_WIDTH_SHIFT); + cpu_trace[cpu].rw.value = get_ins_reg_val(instptr, regs); + break; + case IMM_WRITE: + cpu_trace[cpu].header.type |= + (MMIO_WRITE << MMIO_OPCODE_SHIFT) | + (get_ins_mem_width(instptr) << MMIO_WIDTH_SHIFT); + cpu_trace[cpu].rw.value = get_ins_imm_val(instptr); + break; + default: + { + unsigned char *ip = (unsigned char *)instptr; + cpu_trace[cpu].header.type |= + (MMIO_UNKNOWN_OP << MMIO_OPCODE_SHIFT); + cpu_trace[cpu].rw.value = (*ip) << 16 | + *(ip + 1) << 8 | + *(ip + 2); + } + } +} + +static void post(struct kmmio_probe *p, unsigned long condition, + struct pt_regs *regs) +{ + const unsigned long cpu = smp_processor_id(); + + /* this should always return the active_trace count to 0 */ + pf_reason[cpu].active_traces--; + if (pf_reason[cpu].active_traces) { + printk(KERN_EMERG MODULE_NAME ": unexpected post handler"); + BUG(); + } + + switch (pf_reason[cpu].type) { + case REG_READ: + cpu_trace[cpu].rw.value = get_ins_reg_val(pf_reason[cpu].ip, + regs); + break; + default: + break; + } + relay_write(chan, &cpu_trace[cpu], sizeof(struct mm_io_header_rw)); +} + +/* + * subbuf_start() relay callback. + * + * Defined so that we know when events are dropped due to the buffer-full + * condition. 
+ */ +static int subbuf_start_handler(struct rchan_buf *buf, void *subbuf, + void *prev_subbuf, size_t prev_padding) +{ + if (relay_buf_full(buf)) { + if (!suspended) { + suspended = 1; + printk(KERN_ERR MODULE_NAME + ": cpu %d buffer full!!!\n", + smp_processor_id()); + } + return 0; + } else if (suspended) { + suspended = 0; + printk(KERN_ERR MODULE_NAME + ": cpu %d buffer no longer full.\n", + smp_processor_id()); + } + + return 1; +} + +/* file_create() callback. Creates relay file in debugfs. */ +static struct dentry *create_buf_file_handler(const char *filename, + struct dentry *parent, + int mode, + struct rchan_buf *buf, + int *is_global) +{ + struct dentry *buf_file; + + mmio_fops.read = relay_file_operations.read; + mmio_fops.open = relay_file_operations.open; + mmio_fops.poll = relay_file_operations.poll; + mmio_fops.mmap = relay_file_operations.mmap; + mmio_fops.release = relay_file_operations.release; + mmio_fops.splice_read = relay_file_operations.splice_read; + + buf_file = debugfs_create_file(filename, mode, parent, buf, + &mmio_fops); + + return buf_file; +} + +/* file_remove() default callback. Removes relay file in debugfs. */ +static int remove_buf_file_handler(struct dentry *dentry) +{ + debugfs_remove(dentry); + return 0; +} + +static struct rchan_callbacks relay_callbacks = { + .subbuf_start = subbuf_start_handler, + .create_buf_file = create_buf_file_handler, + .remove_buf_file = remove_buf_file_handler, +}; + +/* + * create_channel - creates channel /debug/APP_DIR/cpuXXX + * Returns channel on success, NULL otherwise + */ +static struct rchan *create_channel(unsigned size, unsigned n) +{ + return relay_open("cpu", dir, size, n, &relay_callbacks, NULL); +} + +/* destroy_channel - destroys channel /debug/APP_DIR/cpuXXX */ +static void destroy_channel(void) +{ + if (chan) { + relay_close(chan); + chan = NULL; + } +} + +struct remap_trace { + struct list_head list; + struct kmmio_probe probe; +}; +static LIST_HEAD(trace_list); +static DEFINE_SPINLOCK(trace_list_lock); + +static void do_ioremap_trace_core(unsigned long offset, unsigned long size, + void __iomem *addr) +{ + struct remap_trace *trace = kmalloc(sizeof(*trace), GFP_KERNEL); + struct mm_io_header_map event = { + .header = { + .type = MMIO_MAGIC | + (MMIO_PROBE << MMIO_OPCODE_SHIFT), + .sec = 0, + .nsec = 0, + .pid = 0, + .data_len = sizeof(struct mm_io_map) + }, + .map = { + .phys = offset, + .addr = (unsigned long)addr, + .len = size, + .pc = 0 + } + }; + record_timestamp(&event.header); + + *trace = (struct remap_trace) { + .probe = { + .addr = (unsigned long)addr, + .len = size, + .pre_handler = pre, + .post_handler = post, + } + }; + + relay_write(chan, &event, sizeof(event)); + spin_lock(&trace_list_lock); + list_add_tail(&trace->list, &trace_list); + spin_unlock(&trace_list_lock); + if (!nommiotrace) + register_kmmio_probe(&trace->probe); +} + +static void ioremap_trace_core(unsigned long offset, unsigned long size, + void __iomem *addr) +{ + if ((filter_offset) && (offset != filter_offset)) + return; + + /* Don't trace the low PCI/ISA area, it's always mapped.. 
*/ + if (!ISA_trace && (offset < ISA_END_ADDRESS) && + (offset + size > ISA_START_ADDRESS)) { + printk(KERN_NOTICE MODULE_NAME ": Ignoring map of low " + "PCI/ISA area (0x%lx-0x%lx)\n", + offset, offset + size); + return; + } + do_ioremap_trace_core(offset, size, addr); +} + +void __iomem *ioremap_cache_trace(unsigned long offset, unsigned long size) +{ + void __iomem *p = ioremap_cache(offset, size); + printk(KERN_DEBUG MODULE_NAME ": ioremap_cache(0x%lx, 0x%lx) = %p\n", + offset, size, p); + ioremap_trace_core(offset, size, p); + return p; +} +EXPORT_SYMBOL(ioremap_cache_trace); + +void __iomem *ioremap_nocache_trace(unsigned long offset, unsigned long size) +{ + void __iomem *p = ioremap_nocache(offset, size); + printk(KERN_DEBUG MODULE_NAME ": ioremap_nocache(0x%lx, 0x%lx) = %p\n", + offset, size, p); + ioremap_trace_core(offset, size, p); + return p; +} +EXPORT_SYMBOL(ioremap_nocache_trace); + +void iounmap_trace(volatile void __iomem *addr) +{ + struct mm_io_header_map event = { + .header = { + .type = MMIO_MAGIC | + (MMIO_UNPROBE << MMIO_OPCODE_SHIFT), + .sec = 0, + .nsec = 0, + .pid = 0, + .data_len = sizeof(struct mm_io_map) + }, + .map = { + .phys = 0, + .addr = (unsigned long)addr, + .len = 0, + .pc = 0 + } + }; + struct remap_trace *trace; + struct remap_trace *tmp; + printk(KERN_DEBUG MODULE_NAME ": Unmapping %p.\n", addr); + record_timestamp(&event.header); + + spin_lock(&trace_list_lock); + list_for_each_entry_safe(trace, tmp, &trace_list, list) { + if ((unsigned long)addr == trace->probe.addr) { + if (!nommiotrace) + unregister_kmmio_probe(&trace->probe); + list_del(&trace->list); + kfree(trace); + break; + } + } + spin_unlock(&trace_list_lock); + relay_write(chan, &event, sizeof(event)); + iounmap(addr); +} +EXPORT_SYMBOL(iounmap_trace); + +static void clear_trace_list(void) +{ + struct remap_trace *trace; + struct remap_trace *tmp; + + spin_lock(&trace_list_lock); + list_for_each_entry_safe(trace, tmp, &trace_list, list) { + printk(KERN_WARNING MODULE_NAME ": purging non-iounmapped " + "trace @0x%08lx, size 0x%lx.\n", + trace->probe.addr, trace->probe.len); + if (!nommiotrace) + unregister_kmmio_probe(&trace->probe); + list_del(&trace->list); + kfree(trace); + break; + } + spin_unlock(&trace_list_lock); +} + +static int __init init(void) +{ + if (n_subbufs < 2) + return -EINVAL; + + dir = debugfs_create_dir(APP_DIR, NULL); + if (!dir) { + printk(KERN_ERR MODULE_NAME + ": Couldn't create relay app directory.\n"); + return -ENOMEM; + } + + chan = create_channel(subbuf_size, n_subbufs); + if (!chan) { + debugfs_remove(dir); + printk(KERN_ERR MODULE_NAME + ": relay app channel creation failed\n"); + return -ENOMEM; + } + + init_kmmio(); + + proc_marker_file = create_proc_entry(MARKER_FILE, 0, NULL); + if (proc_marker_file) + proc_marker_file->write_proc = write_marker; + + printk(KERN_DEBUG MODULE_NAME ": loaded.\n"); + if (nommiotrace) + printk(KERN_DEBUG MODULE_NAME ": MMIO tracing disabled.\n"); + if (ISA_trace) + printk(KERN_WARNING MODULE_NAME + ": Warning! 
low ISA range will be traced.\n"); + return 0; +} + +static void __exit cleanup(void) +{ + printk(KERN_DEBUG MODULE_NAME ": unload...\n"); + clear_trace_list(); + cleanup_kmmio(); + remove_proc_entry(MARKER_FILE, NULL); + destroy_channel(); + if (dir) + debugfs_remove(dir); +} + +module_init(init); +module_exit(cleanup); +MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/mmiotrace/pf_in.c b/arch/x86/kernel/mmiotrace/pf_in.c new file mode 100644 index 000000000000..67ea520dde62 --- /dev/null +++ b/arch/x86/kernel/mmiotrace/pf_in.c @@ -0,0 +1,489 @@ +/* + * Fault Injection Test harness (FI) + * Copyright (C) Intel Crop. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, + * USA. + * + */ + +/* $Id: pf_in.c,v 1.1.1.1 2002/11/12 05:56:32 brlock Exp $ + * Copyright by Intel Crop., 2002 + * Louis Zhuang (louis.zhuang@intel.com) + * + * Bjorn Steinbrink (B.Steinbrink@gmx.de), 2007 + */ + +#include +#include /* struct pt_regs */ +#include "pf_in.h" + +#ifdef __i386__ +/* IA32 Manual 3, 2-1 */ +static unsigned char prefix_codes[] = { + 0xF0, 0xF2, 0xF3, 0x2E, 0x36, 0x3E, 0x26, 0x64, + 0x65, 0x2E, 0x3E, 0x66, 0x67 +}; +/* IA32 Manual 3, 3-432*/ +static unsigned int reg_rop[] = { + 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F +}; +static unsigned int reg_wop[] = { 0x88, 0x89 }; +static unsigned int imm_wop[] = { 0xC6, 0xC7 }; +/* IA32 Manual 3, 3-432*/ +static unsigned int rw8[] = { 0x88, 0x8A, 0xC6 }; +static unsigned int rw32[] = { + 0x89, 0x8B, 0xC7, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F +}; +static unsigned int mw8[] = { 0x88, 0x8A, 0xC6, 0xB60F, 0xBE0F }; +static unsigned int mw16[] = { 0xB70F, 0xBF0F }; +static unsigned int mw32[] = { 0x89, 0x8B, 0xC7 }; +static unsigned int mw64[] = {}; +#else /* not __i386__ */ +static unsigned char prefix_codes[] = { + 0x66, 0x67, 0x2E, 0x3E, 0x26, 0x64, 0x65, 0x36, + 0xF0, 0xF3, 0xF2, + /* REX Prefixes */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f +}; +/* AMD64 Manual 3, Appendix A*/ +static unsigned int reg_rop[] = { + 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F +}; +static unsigned int reg_wop[] = { 0x88, 0x89 }; +static unsigned int imm_wop[] = { 0xC6, 0xC7 }; +static unsigned int rw8[] = { 0xC6, 0x88, 0x8A }; +static unsigned int rw32[] = { + 0xC7, 0x89, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F +}; +/* 8 bit only */ +static unsigned int mw8[] = { 0xC6, 0x88, 0x8A, 0xB60F, 0xBE0F }; +/* 16 bit only */ +static unsigned int mw16[] = { 0xB70F, 0xBF0F }; +/* 16 or 32 bit */ +static unsigned int mw32[] = { 0xC7 }; +/* 16, 32 or 64 bit */ +static unsigned int mw64[] = { 0x89, 0x8B }; +#endif /* not __i386__ */ + +static int skip_prefix(unsigned char *addr, int *shorted, int *enlarged, + int *rexr) +{ + int i; + unsigned char *p = addr; + *shorted = 0; + *enlarged = 0; + *rexr = 0; + +restart: + for (i = 0; i < ARRAY_SIZE(prefix_codes); i++) { + if (*p == 
prefix_codes[i]) { + if (*p == 0x66) + *shorted = 1; +#ifdef __amd64__ + if ((*p & 0xf8) == 0x48) + *enlarged = 1; + if ((*p & 0xf4) == 0x44) + *rexr = 1; +#endif + p++; + goto restart; + } + } + + return (p - addr); +} + +static int get_opcode(unsigned char *addr, unsigned int *opcode) +{ + int len; + + if (*addr == 0x0F) { + /* 0x0F is extension instruction */ + *opcode = *(unsigned short *)addr; + len = 2; + } else { + *opcode = *addr; + len = 1; + } + + return len; +} + +#define CHECK_OP_TYPE(opcode, array, type) \ + for (i = 0; i < ARRAY_SIZE(array); i++) { \ + if (array[i] == opcode) { \ + rv = type; \ + goto exit; \ + } \ + } + +enum reason_type get_ins_type(unsigned long ins_addr) +{ + unsigned int opcode; + unsigned char *p; + int shorted, enlarged, rexr; + int i; + enum reason_type rv = OTHERS; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += get_opcode(p, &opcode); + + CHECK_OP_TYPE(opcode, reg_rop, REG_READ); + CHECK_OP_TYPE(opcode, reg_wop, REG_WRITE); + CHECK_OP_TYPE(opcode, imm_wop, IMM_WRITE); + +exit: + return rv; +} +#undef CHECK_OP_TYPE + +static unsigned int get_ins_reg_width(unsigned long ins_addr) +{ + unsigned int opcode; + unsigned char *p; + int i, shorted, enlarged, rexr; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += get_opcode(p, &opcode); + + for (i = 0; i < ARRAY_SIZE(rw8); i++) + if (rw8[i] == opcode) + return 1; + + for (i = 0; i < ARRAY_SIZE(rw32); i++) + if (rw32[i] == opcode) + return (shorted ? 2 : (enlarged ? 8 : 4)); + + printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode); + return 0; +} + +unsigned int get_ins_mem_width(unsigned long ins_addr) +{ + unsigned int opcode; + unsigned char *p; + int i, shorted, enlarged, rexr; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += get_opcode(p, &opcode); + + for (i = 0; i < ARRAY_SIZE(mw8); i++) + if (mw8[i] == opcode) + return 1; + + for (i = 0; i < ARRAY_SIZE(mw16); i++) + if (mw16[i] == opcode) + return 2; + + for (i = 0; i < ARRAY_SIZE(mw32); i++) + if (mw32[i] == opcode) + return shorted ? 2 : 4; + + for (i = 0; i < ARRAY_SIZE(mw64); i++) + if (mw64[i] == opcode) + return shorted ? 2 : (enlarged ? 8 : 4); + + printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode); + return 0; +} + +/* + * Define register ident in mod/rm byte. + * Note: these are NOT the same as in ptrace-abi.h. 
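+ * For example, reg field value 0 selects AL, AX or EAX depending on + * operand width, while ptrace-abi.h numbers registers by their slot + * in struct pt_regs (on i386: EBX = 0, ECX = 1, ..., EAX = 6).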
+ */ +enum { + arg_AL = 0, + arg_CL = 1, + arg_DL = 2, + arg_BL = 3, + arg_AH = 4, + arg_CH = 5, + arg_DH = 6, + arg_BH = 7, + + arg_AX = 0, + arg_CX = 1, + arg_DX = 2, + arg_BX = 3, + arg_SP = 4, + arg_BP = 5, + arg_SI = 6, + arg_DI = 7, +#ifdef __amd64__ + arg_R8 = 8, + arg_R9 = 9, + arg_R10 = 10, + arg_R11 = 11, + arg_R12 = 12, + arg_R13 = 13, + arg_R14 = 14, + arg_R15 = 15 +#endif +}; + +static unsigned char *get_reg_w8(int no, struct pt_regs *regs) +{ + unsigned char *rv = NULL; + + switch (no) { + case arg_AL: + rv = (unsigned char *)®s->ax; + break; + case arg_BL: + rv = (unsigned char *)®s->bx; + break; + case arg_CL: + rv = (unsigned char *)®s->cx; + break; + case arg_DL: + rv = (unsigned char *)®s->dx; + break; + case arg_AH: + rv = 1 + (unsigned char *)®s->ax; + break; + case arg_BH: + rv = 1 + (unsigned char *)®s->bx; + break; + case arg_CH: + rv = 1 + (unsigned char *)®s->cx; + break; + case arg_DH: + rv = 1 + (unsigned char *)®s->dx; + break; +#ifdef __amd64__ + case arg_R8: + rv = (unsigned char *)®s->r8; + break; + case arg_R9: + rv = (unsigned char *)®s->r9; + break; + case arg_R10: + rv = (unsigned char *)®s->r10; + break; + case arg_R11: + rv = (unsigned char *)®s->r11; + break; + case arg_R12: + rv = (unsigned char *)®s->r12; + break; + case arg_R13: + rv = (unsigned char *)®s->r13; + break; + case arg_R14: + rv = (unsigned char *)®s->r14; + break; + case arg_R15: + rv = (unsigned char *)®s->r15; + break; +#endif + default: + printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no); + break; + } + return rv; +} + +static unsigned long *get_reg_w32(int no, struct pt_regs *regs) +{ + unsigned long *rv = NULL; + + switch (no) { + case arg_AX: + rv = ®s->ax; + break; + case arg_BX: + rv = ®s->bx; + break; + case arg_CX: + rv = ®s->cx; + break; + case arg_DX: + rv = ®s->dx; + break; + case arg_SP: + rv = ®s->sp; + break; + case arg_BP: + rv = ®s->bp; + break; + case arg_SI: + rv = ®s->si; + break; + case arg_DI: + rv = ®s->di; + break; +#ifdef __amd64__ + case arg_R8: + rv = ®s->r8; + break; + case arg_R9: + rv = ®s->r9; + break; + case arg_R10: + rv = ®s->r10; + break; + case arg_R11: + rv = ®s->r11; + break; + case arg_R12: + rv = ®s->r12; + break; + case arg_R13: + rv = ®s->r13; + break; + case arg_R14: + rv = ®s->r14; + break; + case arg_R15: + rv = ®s->r15; + break; +#endif + default: + printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no); + } + + return rv; +} + +unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs) +{ + unsigned int opcode; + unsigned char mod_rm; + int reg; + unsigned char *p; + int i, shorted, enlarged, rexr; + unsigned long rv; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += get_opcode(p, &opcode); + for (i = 0; i < ARRAY_SIZE(reg_rop); i++) + if (reg_rop[i] == opcode) { + rv = REG_READ; + goto do_work; + } + + for (i = 0; i < ARRAY_SIZE(reg_wop); i++) + if (reg_wop[i] == opcode) { + rv = REG_WRITE; + goto do_work; + } + + printk(KERN_ERR "mmiotrace: Not a register instruction, opcode " + "0x%02x\n", opcode); + goto err; + +do_work: + mod_rm = *p; + reg = ((mod_rm >> 3) & 0x7) | (rexr << 3); + switch (get_ins_reg_width(ins_addr)) { + case 1: + return *get_reg_w8(reg, regs); + + case 2: + return *(unsigned short *)get_reg_w32(reg, regs); + + case 4: + return *(unsigned int *)get_reg_w32(reg, regs); + +#ifdef __amd64__ + case 8: + return *(unsigned long *)get_reg_w32(reg, regs); +#endif + + default: + printk(KERN_ERR "mmiotrace: Error width# %d\n", reg); + } + +err: + return 0; +} + 
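+/* + * Worked example for the decoding above: "mov %eax,(%edi)" encodes as + * 89 07. Opcode 0x89 is in reg_wop, and the Mod/RM byte 0x07 decodes + * as mod = 00 (memory, no displacement), reg = 000 (EAX), rm = 111 + * (EDI), so get_ins_reg_val() returns the value of regs->ax. + */ +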
+unsigned long get_ins_imm_val(unsigned long ins_addr) +{ + unsigned int opcode; + unsigned char mod_rm; + unsigned char mod; + unsigned char *p; + int i, shorted, enlarged, rexr; + unsigned long rv; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += get_opcode(p, &opcode); + for (i = 0; i < ARRAY_SIZE(imm_wop); i++) + if (imm_wop[i] == opcode) { + rv = IMM_WRITE; + goto do_work; + } + + printk(KERN_ERR "mmiotrace: Not an immediate instruction, opcode " + "0x%02x\n", opcode); + goto err; + +do_work: + mod_rm = *p; + mod = mod_rm >> 6; + p++; + switch (mod) { + case 0: + /* if r/m is 5 we have a 32 disp (IA32 Manual 3, Table 2-2) */ + /* AMD64: XXX Check for address size prefix? */ + if ((mod_rm & 0x7) == 0x5) + p += 4; + break; + + case 1: + p += 1; + break; + + case 2: + p += 4; + break; + + case 3: + default: + printk(KERN_ERR "mmiotrace: not a memory access instruction " + "at 0x%lx, rm_mod=0x%02x\n", + ins_addr, mod_rm); + } + + switch (get_ins_reg_width(ins_addr)) { + case 1: + return *(unsigned char *)p; + + case 2: + return *(unsigned short *)p; + + case 4: + return *(unsigned int *)p; + +#ifdef __amd64__ + case 8: + return *(unsigned long *)p; +#endif + + default: + printk(KERN_ERR "mmiotrace: Error: width.\n"); + } + +err: + return 0; +} diff --git a/arch/x86/kernel/mmiotrace/pf_in.h b/arch/x86/kernel/mmiotrace/pf_in.h new file mode 100644 index 000000000000..e05341a51a27 --- /dev/null +++ b/arch/x86/kernel/mmiotrace/pf_in.h @@ -0,0 +1,39 @@ +/* + * Fault Injection Test harness (FI) + * Copyright (C) Intel Crop. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, + * USA. 
+ * + */ + +#ifndef __PF_H_ +#define __PF_H_ + +enum reason_type { + NOT_ME, /* page fault is not in regions */ + NOTHING, /* access others point in regions */ + REG_READ, /* read from addr to reg */ + REG_WRITE, /* write from reg to addr */ + IMM_WRITE, /* write from imm to addr */ + OTHERS /* Other instructions can not intercept */ +}; + +enum reason_type get_ins_type(unsigned long ins_addr); +unsigned int get_ins_mem_width(unsigned long ins_addr); +unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs); +unsigned long get_ins_imm_val(unsigned long ins_addr); + +#endif /* __PF_H_ */ diff --git a/arch/x86/kernel/mmiotrace/testmmiotrace.c b/arch/x86/kernel/mmiotrace/testmmiotrace.c new file mode 100644 index 000000000000..40e66b0e6480 --- /dev/null +++ b/arch/x86/kernel/mmiotrace/testmmiotrace.c @@ -0,0 +1,77 @@ +/* + * Written by Pekka Paalanen, 2008 + */ +#include +#include + +extern void __iomem *ioremap_nocache_trace(unsigned long offset, + unsigned long size); +extern void iounmap_trace(volatile void __iomem *addr); + +#define MODULE_NAME "testmmiotrace" + +static unsigned long mmio_address; +module_param(mmio_address, ulong, 0); +MODULE_PARM_DESC(mmio_address, "Start address of the mapping of 16 kB."); + +static void do_write_test(void __iomem *p) +{ + unsigned int i; + for (i = 0; i < 256; i++) + iowrite8(i, p + i); + for (i = 1024; i < (5 * 1024); i += 2) + iowrite16(i * 12 + 7, p + i); + for (i = (5 * 1024); i < (16 * 1024); i += 4) + iowrite32(i * 212371 + 13, p + i); +} + +static void do_read_test(void __iomem *p) +{ + unsigned int i; + volatile unsigned int v; + for (i = 0; i < 256; i++) + v = ioread8(p + i); + for (i = 1024; i < (5 * 1024); i += 2) + v = ioread16(p + i); + for (i = (5 * 1024); i < (16 * 1024); i += 4) + v = ioread32(p + i); +} + +static void do_test(void) +{ + void __iomem *p = ioremap_nocache_trace(mmio_address, 0x4000); + if (!p) { + printk(KERN_ERR MODULE_NAME ": could not ioremap IO memory, " + "aborting.\n"); + return; + } + do_write_test(p); + do_read_test(p); + iounmap_trace(p); +} + +static int __init init(void) +{ + if (mmio_address == 0) { + printk(KERN_ERR MODULE_NAME ": you have to use the module " + "argument mmio_address.\n"); + printk(KERN_ERR MODULE_NAME ": DO NOT LOAD THIS MODULE UNLESS" + " YOU REALLY KNOW WHAT YOU ARE DOING!\n"); + return -ENXIO; + } + + printk(KERN_WARNING MODULE_NAME ": WARNING: mapping 16 kB @ 0x%08lx " + "in PCI address space, and writing " + "rubbish in there.\n", mmio_address); + do_test(); + return 0; +} + +static void __exit cleanup(void) +{ + printk(KERN_DEBUG MODULE_NAME ": unloaded.\n"); +} + +module_init(init); +module_exit(cleanup); +MODULE_LICENSE("GPL"); diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h new file mode 100644 index 000000000000..cb247825f3ec --- /dev/null +++ b/include/linux/mmiotrace.h @@ -0,0 +1,62 @@ +#ifndef MMIOTRACE_H +#define MMIOTRACE_H + +#include + +#define MMIO_VERSION 0x04 + +/* mm_io_header.type */ +#define MMIO_OPCODE_MASK 0xff +#define MMIO_OPCODE_SHIFT 0 +#define MMIO_WIDTH_MASK 0xff00 +#define MMIO_WIDTH_SHIFT 8 +#define MMIO_MAGIC (0x6f000000 | (MMIO_VERSION<<16)) +#define MMIO_MAGIC_MASK 0xffff0000 + +enum mm_io_opcode { /* payload type: */ + MMIO_READ = 0x1, /* struct mm_io_rw */ + MMIO_WRITE = 0x2, /* struct mm_io_rw */ + MMIO_PROBE = 0x3, /* struct mm_io_map */ + MMIO_UNPROBE = 0x4, /* struct mm_io_map */ + MMIO_MARKER = 0x5, /* raw char data */ + MMIO_UNKNOWN_OP = 0x6, /* struct mm_io_rw */ +}; + +struct mm_io_header { + __u32 type; + 
__u32 sec; /* timestamp */ + __u32 nsec; + __u32 pid; /* PID of the process, or 0 for kernel core */ + __u16 data_len; /* length of the following payload */ +}; + +struct mm_io_rw { + __u64 address; /* virtual address of register */ + __u64 value; + __u64 pc; /* optional program counter */ +}; + +struct mm_io_map { + __u64 phys; /* base address in PCI space */ + __u64 addr; /* base virtual address */ + __u64 len; /* mapping size */ + __u64 pc; /* optional program counter */ +}; + + +/* + * These structures are used to allow a single relay_write() + * call to write a full packet. + */ + +struct mm_io_header_rw { + struct mm_io_header header; + struct mm_io_rw rw; +} __attribute__((packed)); + +struct mm_io_header_map { + struct mm_io_header header; + struct mm_io_map map; +} __attribute__((packed)); + +#endif /* MMIOTRACE_H */ -- cgit v1.2.3 From 63ffa3e456c1a9884a3ebac997d91e3fdae18d78 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:20:57 +0200 Subject: x86 mmiotrace: comment about user space ABI Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/mmiotrace.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h index cb247825f3ec..6ec288f1fe24 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -3,6 +3,10 @@ #include +/* + * If you change anything here, you must bump MMIO_VERSION. + * This is the relay data format for user space. + */ #define MMIO_VERSION 0x04 /* mm_io_header.type */ @@ -23,7 +27,7 @@ enum mm_io_opcode { /* payload type: */ }; struct mm_io_header { - __u32 type; + __u32 type; /* see MMIO_* macros above */ __u32 sec; /* timestamp */ __u32 nsec; __u32 pid; /* PID of the process, or 0 for kernel core */ -- cgit v1.2.3 From 0fd0e3da4557c479b820b9a4a7afa25b4637ddf2 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:20:57 +0200 Subject: x86: mmiotrace full patch, preview 1 kmmio.c handles the list of mmio probes with callbacks, list of traced pages, and attaching into the page fault handler and die notifier. It arms, traps and disarms the given pages; this is the core of mmiotrace. mmio-mod.c is a user interface, hooking into ioremap functions and registering the mmio probes. It also decodes the required information from trapped mmio accesses via the pre and post callbacks in each probe. Currently, hooking into ioremap functions works by redefining the symbols of the target (binary) kernel module, so that it calls the traced versions of the functions. The most notable changes done since the last discussion are: - kmmio.c is a built-in, not part of the module - direct call from fault.c to kmmio.c, removing all dynamic hooks - prepare for unregistering probes at any time - make kmmio re-initializable and accessible to more than one user - rewrite kmmio locking to remove all spinlocks from page fault path Can I abuse call_rcu() like I do in kmmio.c:unregister_kmmio_probe() or is there a better way? The function called via call_rcu() itself calls call_rcu() again; will this work or break? There I need a second grace period for RCU after the first grace period for page faults. Mmiotrace itself (mmio-mod.c) is still a module; I am going to attack that next. At some point I will start looking into how to make mmiotrace a tracer component of ftrace (thanks for the hint, Ingo).
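On the call_rcu() question above: chaining works. An RCU callback may itself call call_rcu(), and it may even re-use the same rcu_head, since the head is no longer queued once the callback has started running. A minimal sketch of the resulting two-stage release (the helper names are stand-ins, mirroring the kmmio release-list code in the diff below):

	struct two_stage_release {
		struct rcu_head rcu;
		struct kmmio_fault_page *release_list;
	};

	/* Runs after the second grace period: no reader holds a pointer. */
	static void stage2_free(struct rcu_head *head)
	{
		struct two_stage_release *dr =
			container_of(head, struct two_stage_release, rcu);

		free_fault_pages(dr->release_list);	/* stand-in helper */
		kfree(dr);
	}

	/* Runs after the first grace period: in-flight faults are done. */
	static void stage1_unlink(struct rcu_head *head)
	{
		struct two_stage_release *dr =
			container_of(head, struct two_stage_release, rcu);

		unlink_fault_pages(dr->release_list);	/* list_del_rcu() under lock */
		/* The rcu_head is idle again; queue the second grace period. */
		call_rcu(&dr->rcu, stage2_free);
	}

	/* Caller, after disarming the pages: */
	call_rcu(&dr->rcu, stage1_unlink);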
Ftrace should make the user space part of mmiotracing as simple as 'cat /debug/trace/mmio > dump.txt'. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/init_task.c | 1 - arch/x86/kernel/mmiotrace/Makefile | 8 +- arch/x86/kernel/mmiotrace/kmmio.c | 349 ++++++++++++++++++++---------- arch/x86/kernel/mmiotrace/kmmio.h | 58 ----- arch/x86/kernel/mmiotrace/mmio-mod.c | 81 ++++--- arch/x86/kernel/mmiotrace/pf_in.c | 2 +- arch/x86/kernel/mmiotrace/testmmiotrace.c | 13 +- arch/x86/mm/fault.c | 59 +---- include/asm-x86/kdebug.h | 7 - include/linux/mmiotrace.h | 38 ++++ 10 files changed, 335 insertions(+), 281 deletions(-) delete mode 100644 arch/x86/kernel/mmiotrace/kmmio.h (limited to 'include/linux') diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c index 027a5b6a12b2..a4f93b4120c1 100644 --- a/arch/x86/kernel/init_task.c +++ b/arch/x86/kernel/init_task.c @@ -15,7 +15,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); EXPORT_UNUSED_SYMBOL(init_mm); /* will be removed in 2.6.26 */ -EXPORT_SYMBOL_GPL(init_mm); /* * Initial thread structure. diff --git a/arch/x86/kernel/mmiotrace/Makefile b/arch/x86/kernel/mmiotrace/Makefile index d6905f7f981b..cf1e747b463e 100644 --- a/arch/x86/kernel/mmiotrace/Makefile +++ b/arch/x86/kernel/mmiotrace/Makefile @@ -1,4 +1,4 @@ -obj-$(CONFIG_MMIOTRACE) += mmiotrace.o -mmiotrace-objs := pf_in.o kmmio.o mmio-mod.o - -obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o +obj-$(CONFIG_MMIOTRACE_HOOKS) += kmmio.o +obj-$(CONFIG_MMIOTRACE) += mmiotrace.o +mmiotrace-objs := pf_in.o mmio-mod.o +obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o diff --git a/arch/x86/kernel/mmiotrace/kmmio.c b/arch/x86/kernel/mmiotrace/kmmio.c index 5e239d0b8467..539a9b19588f 100644 --- a/arch/x86/kernel/mmiotrace/kmmio.c +++ b/arch/x86/kernel/mmiotrace/kmmio.c @@ -6,6 +6,7 @@ */ #include +#include #include #include #include @@ -17,70 +18,119 @@ #include #include #include +#include #include #include #include #include #include -#include "kmmio.h" +#include -#define KMMIO_HASH_BITS 6 -#define KMMIO_TABLE_SIZE (1 << KMMIO_HASH_BITS) #define KMMIO_PAGE_HASH_BITS 4 #define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS) +struct kmmio_fault_page { + struct list_head list; + struct kmmio_fault_page *release_next; + unsigned long page; /* location of the fault page */ + + /* + * Number of times this page has been registered as a part + * of a probe. If zero, page is disarmed and this may be freed. + * Used only by writers (RCU). + */ + int count; +}; + +struct kmmio_delayed_release { + struct rcu_head rcu; + struct kmmio_fault_page *release_list; +}; + struct kmmio_context { struct kmmio_fault_page *fpage; struct kmmio_probe *probe; unsigned long saved_flags; + unsigned long addr; int active; }; -static int kmmio_page_fault(struct pt_regs *regs, unsigned long error_code, - unsigned long address); static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args); +static DECLARE_MUTEX(kmmio_init_mutex); static DEFINE_SPINLOCK(kmmio_lock); /* These are protected by kmmio_lock */ +static int kmmio_initialized; unsigned int kmmio_count; -static unsigned int handler_registered; + +/* Read-protected by RCU, write-protected by kmmio_lock. 
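+ * Readers walk the page list under rcu_read_lock(); writers take + * kmmio_lock and use the list_*_rcu() operations, as the hunks + * below show.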
*/ static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE]; static LIST_HEAD(kmmio_probes); +static struct list_head *kmmio_page_list(unsigned long page) +{ + return &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)]; +} + /* Accessed per-cpu */ static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx); +/* protected by kmmio_init_mutex */ static struct notifier_block nb_die = { .notifier_call = kmmio_die_notifier }; -int init_kmmio(void) +/** + * Makes sure kmmio is initialized and usable. + * This must be called before any other kmmio function defined here. + * May sleep. + */ +void reference_kmmio(void) { - int i; - for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) - INIT_LIST_HEAD(&kmmio_page_table[i]); - - register_die_notifier(&nb_die); - return 0; + down(&kmmio_init_mutex); + spin_lock_irq(&kmmio_lock); + if (!kmmio_initialized) { + int i; + for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) + INIT_LIST_HEAD(&kmmio_page_table[i]); + if (register_die_notifier(&nb_die)) + BUG(); + } + kmmio_initialized++; + spin_unlock_irq(&kmmio_lock); + up(&kmmio_init_mutex); } +EXPORT_SYMBOL_GPL(reference_kmmio); -void cleanup_kmmio(void) +/** + * Clean up kmmio after use. This must be called for every call to + * reference_kmmio(). All probes registered after the corresponding + * reference_kmmio() must have been unregistered when calling this. + * May sleep. + */ +void unreference_kmmio(void) { - /* - * Assume the following have been already cleaned by calling - * unregister_kmmio_probe() appropriately: - * kmmio_page_table, kmmio_probes - */ - if (handler_registered) { - if (mmiotrace_unregister_pf(&kmmio_page_fault)) - BUG(); - synchronize_rcu(); + bool unreg = false; + + down(&kmmio_init_mutex); + spin_lock_irq(&kmmio_lock); + + if (kmmio_initialized == 1) { + BUG_ON(is_kmmio_active()); + unreg = true; } - unregister_die_notifier(&nb_die); + kmmio_initialized--; + BUG_ON(kmmio_initialized < 0); + spin_unlock_irq(&kmmio_lock); + + if (unreg) + unregister_die_notifier(&nb_die); /* calls sync_rcu() */ + up(&kmmio_init_mutex); } +EXPORT_SYMBOL(unreference_kmmio); /* * this is basically a dynamic stabbing problem: @@ -90,33 +140,33 @@ void cleanup_kmmio(void) * Overlap a Point (might be simple) * Space Efficient Dynamic Stabbing with Fast Queries - Mikkel Thorup */ -/* Get the kmmio at this addr (if any). You must be holding kmmio_lock. */ +/* Get the kmmio at this addr (if any). You must be holding RCU read lock. */ static struct kmmio_probe *get_kmmio_probe(unsigned long addr) { struct kmmio_probe *p; - list_for_each_entry(p, &kmmio_probes, list) { + list_for_each_entry_rcu(p, &kmmio_probes, list) { if (addr >= p->addr && addr <= (p->addr + p->len)) return p; } return NULL; } +/* You must be holding RCU read lock. */ static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) { - struct list_head *head, *tmp; + struct list_head *head; + struct kmmio_fault_page *p; page &= PAGE_MASK; - head = &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)]; - list_for_each(tmp, head) { - struct kmmio_fault_page *p - = list_entry(tmp, struct kmmio_fault_page, list); + head = kmmio_page_list(page); + list_for_each_entry_rcu(p, head, list) { if (p->page == page) return p; } - return NULL; } +/** Mark the given page as not present. Access to it will trigger a fault. 
*/ static void arm_kmmio_fault_page(unsigned long page, int *page_level) { unsigned long address = page & PAGE_MASK; @@ -124,8 +174,8 @@ static void arm_kmmio_fault_page(unsigned long page, int *page_level) pte_t *pte = lookup_address(address, &level); if (!pte) { - printk(KERN_ERR "Error in %s: no pte for page 0x%08lx\n", - __FUNCTION__, page); + pr_err("kmmio: Error in %s: no pte for page 0x%08lx\n", + __func__, page); return; } @@ -143,6 +193,7 @@ static void arm_kmmio_fault_page(unsigned long page, int *page_level) __flush_tlb_one(page); } +/** Mark the given page as present. */ static void disarm_kmmio_fault_page(unsigned long page, int *page_level) { unsigned long address = page & PAGE_MASK; @@ -150,8 +201,8 @@ static void disarm_kmmio_fault_page(unsigned long page, int *page_level) pte_t *pte = lookup_address(address, &level); if (!pte) { - printk(KERN_ERR "Error in %s: no pte for page 0x%08lx\n", - __FUNCTION__, page); + pr_err("kmmio: Error in %s: no pte for page 0x%08lx\n", + __func__, page); return; } @@ -169,13 +220,25 @@ static void disarm_kmmio_fault_page(unsigned long page, int *page_level) __flush_tlb_one(page); } +/* + * This is being called from do_page_fault(). + * + * We may be in an interrupt or a critical section. Also prefetching may + * trigger a page fault. We may be in the middle of a process switch. + * We cannot take any locks, because we could be executing especially + * within a kmmio critical section. + * + * Local interrupts are disabled, so preemption cannot happen. + * Do not enable interrupts, do not sleep, and watch out for other CPUs. + */ /* * Interrupts are disabled on entry as trap3 is an interrupt gate * and they remain disabled thorough out this function. */ -static int kmmio_handler(struct pt_regs *regs, unsigned long addr) +int kmmio_handler(struct pt_regs *regs, unsigned long addr) { - struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); + struct kmmio_context *ctx; + struct kmmio_fault_page *faultpage; /* * Preemption is now disabled to prevent process switch during @@ -186,40 +249,40 @@ static int kmmio_handler(struct pt_regs *regs, unsigned long addr) * XXX what if an interrupt occurs between returning from * do_page_fault() and entering the single-step exception handler? * And that interrupt triggers a kmmio trap? + * XXX If we are tracing an interrupt service routine or whatever, is + * this enough to keep it on the current cpu? */ preempt_disable(); - /* interrupts disabled and CPU-local data => atomicity guaranteed. */ + rcu_read_lock(); + faultpage = get_kmmio_fault_page(addr); + if (!faultpage) { + /* + * Either this page fault is not caused by kmmio, or + * another CPU just pulled the kmmio probe from under + * our feet. In the latter case all hell breaks loose. + */ + goto no_kmmio; + } + + ctx = &get_cpu_var(kmmio_ctx); if (ctx->active) { /* - * This avoids a deadlock with kmmio_lock. + * Prevent overwriting already in-flight context. * If this page fault really was due to kmmio trap, * all hell breaks loose. */ - printk(KERN_EMERG "mmiotrace: recursive probe hit on CPU %d, " - "for address %lu. Ignoring.\n", + pr_emerg("kmmio: recursive probe hit on CPU %d, " + "for address 0x%08lx. Ignoring.\n", smp_processor_id(), addr); - goto no_kmmio; + goto no_kmmio_ctx; } ctx->active++; - /* - * Acquire the kmmio lock to prevent changes affecting - * get_kmmio_fault_page() and get_kmmio_probe(), since we save their - * returned pointers. - * The lock is released in post_kmmio_handler(). 
- * XXX: could/should get_kmmio_*() be using RCU instead of spinlock? - */ - spin_lock(&kmmio_lock); - - ctx->fpage = get_kmmio_fault_page(addr); - if (!ctx->fpage) { - /* this page fault is not caused by kmmio */ - goto no_kmmio_locked; - } - + ctx->fpage = faultpage; ctx->probe = get_kmmio_probe(addr); ctx->saved_flags = (regs->flags & (TF_MASK|IF_MASK)); + ctx->addr = addr; if (ctx->probe && ctx->probe->pre_handler) ctx->probe->pre_handler(ctx->probe, regs, addr); @@ -227,46 +290,62 @@ static int kmmio_handler(struct pt_regs *regs, unsigned long addr) regs->flags |= TF_MASK; regs->flags &= ~IF_MASK; - /* We hold lock, now we set present bit in PTE and single step. */ + /* Now we set present bit in PTE and single step. */ disarm_kmmio_fault_page(ctx->fpage->page, NULL); put_cpu_var(kmmio_ctx); + rcu_read_unlock(); return 1; -no_kmmio_locked: - spin_unlock(&kmmio_lock); - ctx->active--; +no_kmmio_ctx: + put_cpu_var(kmmio_ctx); no_kmmio: + rcu_read_unlock(); preempt_enable_no_resched(); - put_cpu_var(kmmio_ctx); - /* page fault not handled by kmmio */ - return 0; + return 0; /* page fault not handled by kmmio */ } /* * Interrupts are disabled on entry as trap1 is an interrupt gate * and they remain disabled thorough out this function. - * And we hold kmmio lock. + * This must always get called as the pair to kmmio_handler(). */ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) { int ret = 0; + struct kmmio_probe *probe; + struct kmmio_fault_page *faultpage; struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); if (!ctx->active) goto out; + rcu_read_lock(); + + faultpage = get_kmmio_fault_page(ctx->addr); + probe = get_kmmio_probe(ctx->addr); + if (faultpage != ctx->fpage || probe != ctx->probe) { + /* + * The trace setup changed after kmmio_handler() and before + * running this respective post handler. User does not want + * the result anymore. + */ + ctx->probe = NULL; + ctx->fpage = NULL; + } + if (ctx->probe && ctx->probe->post_handler) ctx->probe->post_handler(ctx->probe, condition, regs); - arm_kmmio_fault_page(ctx->fpage->page, NULL); + if (ctx->fpage) + arm_kmmio_fault_page(ctx->fpage->page, NULL); regs->flags &= ~TF_MASK; regs->flags |= ctx->saved_flags; /* These were acquired in kmmio_handler(). */ ctx->active--; - spin_unlock(&kmmio_lock); + BUG_ON(ctx->active); preempt_enable_no_resched(); /* @@ -277,11 +356,13 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) if (!(regs->flags & TF_MASK)) ret = 1; + rcu_read_unlock(); out: put_cpu_var(kmmio_ctx); return ret; } +/* You must be holding kmmio_lock. */ static int add_kmmio_fault_page(unsigned long page) { struct kmmio_fault_page *f; @@ -289,6 +370,8 @@ static int add_kmmio_fault_page(unsigned long page) page &= PAGE_MASK; f = get_kmmio_fault_page(page); if (f) { + if (!f->count) + arm_kmmio_fault_page(f->page, NULL); f->count++; return 0; } @@ -299,15 +382,16 @@ static int add_kmmio_fault_page(unsigned long page) f->count = 1; f->page = page; - list_add(&f->list, - &kmmio_page_table[hash_long(f->page, KMMIO_PAGE_HASH_BITS)]); + list_add_rcu(&f->list, kmmio_page_list(f->page)); arm_kmmio_fault_page(f->page, NULL); return 0; } -static void release_kmmio_fault_page(unsigned long page) +/* You must be holding kmmio_lock. 
*/ +static void release_kmmio_fault_page(unsigned long page, + struct kmmio_fault_page **release_list) { struct kmmio_fault_page *f; @@ -317,9 +401,11 @@ static void release_kmmio_fault_page(unsigned long page, return; f->count--; + BUG_ON(f->count < 0); if (!f->count) { disarm_kmmio_fault_page(f->page, NULL); - list_del(&f->list); + f->release_next = *release_list; + *release_list = f; } } @@ -334,68 +420,113 @@ int register_kmmio_probe(struct kmmio_probe *p) ret = -EEXIST; goto out; } - list_add(&p->list, &kmmio_probes); - /*printk("adding fault pages...\n");*/ + list_add_rcu(&p->list, &kmmio_probes); while (size < p->len) { if (add_kmmio_fault_page(p->addr + size)) - printk(KERN_ERR "mmio: Unable to set page fault.\n"); + pr_err("kmmio: Unable to set page fault.\n"); size += PAGE_SIZE; } - - if (!handler_registered) { - if (mmiotrace_register_pf(&kmmio_page_fault)) - printk(KERN_ERR "mmiotrace: Cannot register page " - "fault handler.\n"); - else - handler_registered++; - } - out: spin_unlock_irq(&kmmio_lock); /* * XXX: What should I do here? * Here was a call to global_flush_tlb(), but it does not exist - * anymore. + * anymore. It seems it's not needed after all. */ return ret; } +EXPORT_SYMBOL(register_kmmio_probe); +static void rcu_free_kmmio_fault_pages(struct rcu_head *head) +{ + struct kmmio_delayed_release *dr = container_of( + head, + struct kmmio_delayed_release, + rcu); + struct kmmio_fault_page *p = dr->release_list; + while (p) { + struct kmmio_fault_page *next = p->release_next; + BUG_ON(p->count); + kfree(p); + p = next; + } + kfree(dr); +} + +static void remove_kmmio_fault_pages(struct rcu_head *head) +{ + struct kmmio_delayed_release *dr = container_of( + head, + struct kmmio_delayed_release, + rcu); + struct kmmio_fault_page *p = dr->release_list; + struct kmmio_fault_page **prevp = &dr->release_list; + unsigned long flags; + spin_lock_irqsave(&kmmio_lock, flags); + while (p) { + if (!p->count) + list_del_rcu(&p->list); + else + *prevp = p->release_next; + prevp = &p->release_next; + p = p->release_next; + } + spin_unlock_irqrestore(&kmmio_lock, flags); + /* This is the real RCU destroy call. */ + call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages); +} + +/* + * Remove a kmmio probe. You have to synchronize_rcu() before you can be + * sure that the callbacks will not be called anymore. + * + * Unregistering a kmmio fault page has three steps: + * 1. release_kmmio_fault_page() + * Disarm the page, wait a grace period to let all faults finish. + * 2. remove_kmmio_fault_pages() + * Remove the pages from kmmio_page_table. + * 3. rcu_free_kmmio_fault_pages() + * Actually free the kmmio_fault_page structs via RCU. + */ void unregister_kmmio_probe(struct kmmio_probe *p) { unsigned long size = 0; + struct kmmio_fault_page *release_list = NULL; + struct kmmio_delayed_release *drelease; spin_lock_irq(&kmmio_lock); while (size < p->len) { - release_kmmio_fault_page(p->addr + size); + release_kmmio_fault_page(p->addr + size, &release_list); size += PAGE_SIZE; } - list_del(&p->list); + list_del_rcu(&p->list); kmmio_count--; spin_unlock_irq(&kmmio_lock); -} -/* - * According to 2.6.20, mainly x86_64 arch: - * This is being called from do_page_fault(), via the page fault notifier - * chain. The chain is called for both user space faults and kernel space - * faults (address >= TASK_SIZE64), except not on faults serviced by - * vmalloc_fault(). - * - * We may be in an interrupt or a critical section. Also prefecthing may - * trigger a page fault. 
We may be in the middle of process switch. - * The page fault hook functionality has put us inside RCU read lock. - * - * Local interrupts are disabled, so preemption cannot happen. - * Do not enable interrupts, do not sleep, and watch out for other CPUs. - */ -static int kmmio_page_fault(struct pt_regs *regs, unsigned long error_code, - unsigned long address) -{ - if (is_kmmio_active()) - if (kmmio_handler(regs, address) == 1) - return -1; - return 0; + drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC); + if (!drelease) { + pr_crit("kmmio: leaking kmmio_fault_page objects.\n"); + return; + } + drelease->release_list = release_list; + + /* + * This is not really RCU here. We have just disarmed a set of + * pages so that they cannot trigger page faults anymore. However, + * we cannot remove the pages from kmmio_page_table, + * because a probe hit might be in flight on another CPU. The + * pages are collected into a list, and they will be removed from + * kmmio_page_table when it is certain that no probe hit related to + * these pages can be in flight. RCU grace period sounds like a + * good choice. + * + * If we removed the pages too early, kmmio page fault handler might + * not find the respective kmmio_fault_page and determine it's not + * a kmmio fault, when it actually is. This would lead to madness. + */ + call_rcu(&drelease->rcu, remove_kmmio_fault_pages); } +EXPORT_SYMBOL(unregister_kmmio_probe); static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args) diff --git a/arch/x86/kernel/mmiotrace/kmmio.h b/arch/x86/kernel/mmiotrace/kmmio.h deleted file mode 100644 index 85b7f68a3b8a..000000000000 --- a/arch/x86/kernel/mmiotrace/kmmio.h +++ /dev/null @@ -1,58 +0,0 @@ -#ifndef _LINUX_KMMIO_H -#define _LINUX_KMMIO_H - -#include -#include -#include -#include -#include -#include -#include - -struct kmmio_probe; -struct kmmio_fault_page; -struct pt_regs; - -typedef void (*kmmio_pre_handler_t)(struct kmmio_probe *, - struct pt_regs *, unsigned long addr); -typedef void (*kmmio_post_handler_t)(struct kmmio_probe *, - unsigned long condition, struct pt_regs *); - -struct kmmio_probe { - struct list_head list; - - /* start location of the probe point */ - unsigned long addr; - - /* length of the probe region */ - unsigned long len; - - /* Called before addr is executed. */ - kmmio_pre_handler_t pre_handler; - - /* Called after addr is executed, unless... */ - kmmio_post_handler_t post_handler; -}; - -struct kmmio_fault_page { - struct list_head list; - - /* location of the fault page */ - unsigned long page; - - int count; -}; - -/* kmmio is active by some kmmio_probes? 
*/ -static inline int is_kmmio_active(void) -{ - extern unsigned int kmmio_count; - return kmmio_count; -} - -int init_kmmio(void); -void cleanup_kmmio(void); -int register_kmmio_probe(struct kmmio_probe *p); -void unregister_kmmio_probe(struct kmmio_probe *p); - -#endif /* _LINUX_KMMIO_H */ diff --git a/arch/x86/kernel/mmiotrace/mmio-mod.c b/arch/x86/kernel/mmiotrace/mmio-mod.c index f9c609266d83..e1a508588f03 100644 --- a/arch/x86/kernel/mmiotrace/mmio-mod.c +++ b/arch/x86/kernel/mmiotrace/mmio-mod.c @@ -32,7 +32,6 @@ #include #include -#include "kmmio.h" #include "pf_in.h" /* This app's relay channel files will appear in /debug/mmio-trace */ @@ -129,18 +128,17 @@ static void print_pte(unsigned long address) pte_t *pte = lookup_address(address, &level); if (!pte) { - printk(KERN_ERR "Error in %s: no pte for page 0x%08lx\n", - __FUNCTION__, address); + pr_err(MODULE_NAME ": Error in %s: no pte for page 0x%08lx\n", + __func__, address); return; } if (level == PG_LEVEL_2M) { - printk(KERN_EMERG MODULE_NAME ": 4MB pages are not " - "currently supported: %lx\n", - address); + pr_emerg(MODULE_NAME ": 4MB pages are not currently " + "supported: %lx\n", address); BUG(); } - printk(KERN_DEBUG MODULE_NAME ": pte for 0x%lx: 0x%lx 0x%lx\n", + pr_info(MODULE_NAME ": pte for 0x%lx: 0x%lx 0x%lx\n", address, pte_val(*pte), pte_val(*pte) & _PAGE_PRESENT); } @@ -152,7 +150,7 @@ static void print_pte(unsigned long address) static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr) { const struct trap_reason *my_reason = &get_cpu_var(pf_reason); - printk(KERN_EMERG MODULE_NAME ": unexpected fault for address: %lx, " + pr_emerg(MODULE_NAME ": unexpected fault for address: %lx, " "last fault for address: %lx\n", addr, my_reason->addr); print_pte(addr); @@ -160,20 +158,17 @@ static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr) print_symbol(KERN_EMERG "faulting EIP is at %s\n", regs->ip); print_symbol(KERN_EMERG "last faulting EIP was at %s\n", my_reason->ip); - printk(KERN_EMERG - "eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", + pr_emerg("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", regs->ax, regs->bx, regs->cx, regs->dx); - printk(KERN_EMERG - "esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", + pr_emerg("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", regs->si, regs->di, regs->bp, regs->sp); #else print_symbol(KERN_EMERG "faulting RIP is at %s\n", regs->ip); print_symbol(KERN_EMERG "last faulting RIP was at %s\n", my_reason->ip); - printk(KERN_EMERG "rax: %016lx rcx: %016lx rdx: %016lx\n", + pr_emerg("rax: %016lx rcx: %016lx rdx: %016lx\n", regs->ax, regs->cx, regs->dx); - printk(KERN_EMERG "rsi: %016lx rdi: %016lx " - "rbp: %016lx rsp: %016lx\n", + pr_emerg("rsi: %016lx rdi: %016lx rbp: %016lx rsp: %016lx\n", regs->si, regs->di, regs->bp, regs->sp); #endif put_cpu_var(pf_reason); @@ -251,10 +246,15 @@ static void post(struct kmmio_probe *p, unsigned long condition, struct trap_reason *my_reason = &get_cpu_var(pf_reason); struct mm_io_header_rw *my_trace = &get_cpu_var(cpu_trace); + /* + * XXX: This might not get called if the probe is removed while + * a trace hit is in flight. 
+ */ + /* this should always return the active_trace count to 0 */ my_reason->active_traces--; if (my_reason->active_traces) { - printk(KERN_EMERG MODULE_NAME ": unexpected post handler"); + pr_emerg(MODULE_NAME ": unexpected post handler"); BUG(); } @@ -283,16 +283,15 @@ static int subbuf_start_handler(struct rchan_buf *buf, void *subbuf, atomic_t *drop = &per_cpu(dropped, cpu); int count; if (relay_buf_full(buf)) { - if (atomic_inc_return(drop) == 1) { - printk(KERN_ERR MODULE_NAME ": cpu %d buffer full!\n", - cpu); - } + if (atomic_inc_return(drop) == 1) + pr_err(MODULE_NAME ": cpu %d buffer full!\n", cpu); return 0; - } else if ((count = atomic_read(drop))) { - printk(KERN_ERR MODULE_NAME - ": cpu %d buffer no longer full, " - "missed %d events.\n", - cpu, count); + } + count = atomic_read(drop); + if (count) { + pr_err(MODULE_NAME ": cpu %d buffer no longer full, " + "missed %d events.\n", + cpu, count); atomic_sub(count, drop); } @@ -407,8 +406,8 @@ static void ioremap_trace_core(unsigned long offset, unsigned long size, /* Don't trace the low PCI/ISA area, it's always mapped.. */ if (!ISA_trace && (offset < ISA_END_ADDRESS) && (offset + size > ISA_START_ADDRESS)) { - printk(KERN_NOTICE MODULE_NAME ": Ignoring map of low " - "PCI/ISA area (0x%lx-0x%lx)\n", + pr_notice(MODULE_NAME ": Ignoring map of low PCI/ISA area " + "(0x%lx-0x%lx)\n", offset, offset + size); return; } @@ -418,7 +417,7 @@ static void ioremap_trace_core(unsigned long offset, unsigned long size, void __iomem *ioremap_cache_trace(unsigned long offset, unsigned long size) { void __iomem *p = ioremap_cache(offset, size); - printk(KERN_DEBUG MODULE_NAME ": ioremap_cache(0x%lx, 0x%lx) = %p\n", + pr_debug(MODULE_NAME ": ioremap_cache(0x%lx, 0x%lx) = %p\n", offset, size, p); ioremap_trace_core(offset, size, p); return p; @@ -428,7 +427,7 @@ EXPORT_SYMBOL(ioremap_cache_trace); void __iomem *ioremap_nocache_trace(unsigned long offset, unsigned long size) { void __iomem *p = ioremap_nocache(offset, size); - printk(KERN_DEBUG MODULE_NAME ": ioremap_nocache(0x%lx, 0x%lx) = %p\n", + pr_debug(MODULE_NAME ": ioremap_nocache(0x%lx, 0x%lx) = %p\n", offset, size, p); ioremap_trace_core(offset, size, p); return p; @@ -455,7 +454,7 @@ void iounmap_trace(volatile void __iomem *addr) }; struct remap_trace *trace; struct remap_trace *tmp; - printk(KERN_DEBUG MODULE_NAME ": Unmapping %p.\n", addr); + pr_debug(MODULE_NAME ": Unmapping %p.\n", addr); record_timestamp(&event.header); spin_lock(&trace_list_lock); @@ -481,7 +480,7 @@ static void clear_trace_list(void) spin_lock(&trace_list_lock); list_for_each_entry_safe(trace, tmp, &trace_list, list) { - printk(KERN_WARNING MODULE_NAME ": purging non-iounmapped " + pr_warning(MODULE_NAME ": purging non-iounmapped " "trace @0x%08lx, size 0x%lx.\n", trace->probe.addr, trace->probe.len); if (!nommiotrace) @@ -500,39 +499,37 @@ static int __init init(void) dir = debugfs_create_dir(APP_DIR, NULL); if (!dir) { - printk(KERN_ERR MODULE_NAME - ": Couldn't create relay app directory.\n"); + pr_err(MODULE_NAME ": Couldn't create relay app directory.\n"); return -ENOMEM; } chan = create_channel(subbuf_size, n_subbufs); if (!chan) { debugfs_remove(dir); - printk(KERN_ERR MODULE_NAME - ": relay app channel creation failed\n"); + pr_err(MODULE_NAME ": relay app channel creation failed\n"); return -ENOMEM; } - init_kmmio(); + reference_kmmio(); proc_marker_file = create_proc_entry(MARKER_FILE, 0, NULL); if (proc_marker_file) proc_marker_file->write_proc = write_marker; - printk(KERN_DEBUG MODULE_NAME ": 
loaded.\n"); + pr_debug(MODULE_NAME ": loaded.\n"); if (nommiotrace) - printk(KERN_DEBUG MODULE_NAME ": MMIO tracing disabled.\n"); + pr_info(MODULE_NAME ": MMIO tracing disabled.\n"); if (ISA_trace) - printk(KERN_WARNING MODULE_NAME - ": Warning! low ISA range will be traced.\n"); + pr_warning(MODULE_NAME ": Warning! low ISA range will be " + "traced.\n"); return 0; } static void __exit cleanup(void) { - printk(KERN_DEBUG MODULE_NAME ": unload...\n"); + pr_debug(MODULE_NAME ": unload...\n"); clear_trace_list(); - cleanup_kmmio(); + unreference_kmmio(); remove_proc_entry(MARKER_FILE, NULL); destroy_channel(); if (dir) diff --git a/arch/x86/kernel/mmiotrace/pf_in.c b/arch/x86/kernel/mmiotrace/pf_in.c index 67ea520dde62..efa1911e20ca 100644 --- a/arch/x86/kernel/mmiotrace/pf_in.c +++ b/arch/x86/kernel/mmiotrace/pf_in.c @@ -19,7 +19,7 @@ * */ -/* $Id: pf_in.c,v 1.1.1.1 2002/11/12 05:56:32 brlock Exp $ +/* Id: pf_in.c,v 1.1.1.1 2002/11/12 05:56:32 brlock Exp * Copyright by Intel Crop., 2002 * Louis Zhuang (louis.zhuang@intel.com) * diff --git a/arch/x86/kernel/mmiotrace/testmmiotrace.c b/arch/x86/kernel/mmiotrace/testmmiotrace.c index 40e66b0e6480..5ecff578672b 100644 --- a/arch/x86/kernel/mmiotrace/testmmiotrace.c +++ b/arch/x86/kernel/mmiotrace/testmmiotrace.c @@ -41,8 +41,7 @@ static void do_test(void) { void __iomem *p = ioremap_nocache_trace(mmio_address, 0x4000); if (!p) { - printk(KERN_ERR MODULE_NAME ": could not ioremap IO memory, " - "aborting.\n"); + pr_err(MODULE_NAME ": could not ioremap, aborting.\n"); return; } do_write_test(p); @@ -53,14 +52,14 @@ static void do_test(void) static int __init init(void) { if (mmio_address == 0) { - printk(KERN_ERR MODULE_NAME ": you have to use the module " - "argument mmio_address.\n"); - printk(KERN_ERR MODULE_NAME ": DO NOT LOAD THIS MODULE UNLESS" + pr_err(MODULE_NAME ": you have to use the module argument " + "mmio_address.\n"); + pr_err(MODULE_NAME ": DO NOT LOAD THIS MODULE UNLESS" " YOU REALLY KNOW WHAT YOU ARE DOING!\n"); return -ENXIO; } - printk(KERN_WARNING MODULE_NAME ": WARNING: mapping 16 kB @ 0x%08lx " + pr_warning(MODULE_NAME ": WARNING: mapping 16 kB @ 0x%08lx " "in PCI address space, and writing " "rubbish in there.\n", mmio_address); do_test(); @@ -69,7 +68,7 @@ static int __init init(void) static void __exit cleanup(void) { - printk(KERN_DEBUG MODULE_NAME ": unloaded.\n"); + pr_debug(MODULE_NAME ": unloaded.\n"); } module_init(init); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index e9a086a1a9ff..8c828a68d3b6 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -49,60 +50,14 @@ #define PF_RSVD (1<<3) #define PF_INSTR (1<<4) -#ifdef CONFIG_MMIOTRACE_HOOKS -static pf_handler_func mmiotrace_pf_handler; /* protected by RCU */ -static DEFINE_SPINLOCK(mmiotrace_handler_lock); - -int mmiotrace_register_pf(pf_handler_func new_pfh) -{ - int ret = 0; - unsigned long flags; - spin_lock_irqsave(&mmiotrace_handler_lock, flags); - if (mmiotrace_pf_handler) - ret = -EBUSY; - else - mmiotrace_pf_handler = new_pfh; - spin_unlock_irqrestore(&mmiotrace_handler_lock, flags); - return ret; -} -EXPORT_SYMBOL_GPL(mmiotrace_register_pf); - -/** - * mmiotrace_unregister_pf: - * The caller must ensure @old_pfh is not in use anymore before freeing it. - * This function does not guarantee it. The handler function pointer is - * protected by RCU, so you can do this by e.g. calling synchronize_rcu(). 
- */ -int mmiotrace_unregister_pf(pf_handler_func old_pfh) -{ - int ret = 0; - unsigned long flags; - spin_lock_irqsave(&mmiotrace_handler_lock, flags); - if (mmiotrace_pf_handler != old_pfh) - ret = -EPERM; - else - mmiotrace_pf_handler = NULL; - spin_unlock_irqrestore(&mmiotrace_handler_lock, flags); - return ret; -} -EXPORT_SYMBOL_GPL(mmiotrace_unregister_pf); -#endif /* CONFIG_MMIOTRACE_HOOKS */ - -/* returns non-zero if do_page_fault() should return */ -static inline int call_mmiotrace(struct pt_regs *regs, - unsigned long error_code, - unsigned long address) +static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) { #ifdef CONFIG_MMIOTRACE_HOOKS - int ret = 0; - rcu_read_lock(); - if (mmiotrace_pf_handler) - ret = mmiotrace_pf_handler(regs, error_code, address); - rcu_read_unlock(); - return ret; -#else - return 0; + if (unlikely(is_kmmio_active())) + if (kmmio_handler(regs, addr) == 1) + return -1; #endif + return 0; } static inline int notify_page_fault(struct pt_regs *regs) @@ -657,7 +612,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) if (notify_page_fault(regs)) return; - if (call_mmiotrace(regs, error_code, address)) + if (unlikely(kmmio_fault(regs, address))) return; /* diff --git a/include/asm-x86/kdebug.h b/include/asm-x86/kdebug.h index 7063281040da..96651bb59ba1 100644 --- a/include/asm-x86/kdebug.h +++ b/include/asm-x86/kdebug.h @@ -35,11 +35,4 @@ extern void show_regs(struct pt_regs *regs); extern unsigned long oops_begin(void); extern void oops_end(unsigned long, struct pt_regs *, int signr); -typedef int (*pf_handler_func)(struct pt_regs *regs, - unsigned long error_code, - unsigned long address); - -extern int mmiotrace_register_pf(pf_handler_func new_pfh); -extern int mmiotrace_unregister_pf(pf_handler_func old_pfh); - #endif diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h index 6ec288f1fe24..d87a6cd8b686 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -3,6 +3,44 @@ #include +#ifdef __KERNEL__ + +#include + +struct kmmio_probe; +struct pt_regs; + +typedef void (*kmmio_pre_handler_t)(struct kmmio_probe *, + struct pt_regs *, unsigned long addr); +typedef void (*kmmio_post_handler_t)(struct kmmio_probe *, + unsigned long condition, struct pt_regs *); + +struct kmmio_probe { + struct list_head list; + unsigned long addr; /* start location of the probe point */ + unsigned long len; /* length of the probe region */ + kmmio_pre_handler_t pre_handler; /* Called before addr is executed. */ + kmmio_post_handler_t post_handler; /* Called after addr is executed */ +}; + +/* Is kmmio active, i.e. have any kmmio_probes been registered? */ +static inline int is_kmmio_active(void) +{ + extern unsigned int kmmio_count; + return kmmio_count; +} + +extern void reference_kmmio(void); +extern void unreference_kmmio(void); +extern int register_kmmio_probe(struct kmmio_probe *p); +extern void unregister_kmmio_probe(struct kmmio_probe *p); + +/* Called from page fault handler. */ +extern int kmmio_handler(struct pt_regs *regs, unsigned long addr); + +#endif /* __KERNEL__ */ + + /* * If you change anything here, you must bump MMIO_VERSION. * This is the relay data format for user space. -- cgit v1.2.3 From d61fc44853f46fb002228b18aa5f30db21fcd4ac Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:20:57 +0200 Subject: x86: mmiotrace, preview 2 Kconfig.debug, Makefile and testmmiotrace.c style fixes. Use a real mutex instead of the semaphore-based DECLARE_MUTEX (a before/after sketch follows the Kconfig hunk below). Fix failure path in register probe func. 
kmmio: RCU read-locked over single stepping. Generate mapping id's. Make mmio-mod.c built-in and rewrite its locking. Add debugfs file to enable/disable mmiotracing. kmmio: use irqsave spinlocks. Lots of cleanups in mmio-mod.c Marker file moved from /proc into debugfs. Call mmiotrace entrypoints directly from ioremap.c. Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig.debug | 20 +- arch/x86/kernel/mmiotrace/Makefile | 2 +- arch/x86/kernel/mmiotrace/kmmio.c | 72 +++--- arch/x86/kernel/mmiotrace/mmio-mod.c | 397 ++++++++++++++++++++---------- arch/x86/kernel/mmiotrace/testmmiotrace.c | 15 +- arch/x86/mm/ioremap.c | 9 +- include/linux/mmiotrace.h | 18 +- 7 files changed, 332 insertions(+), 201 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 9491c0ae03a3..aa0d6462b1fc 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -170,22 +170,19 @@ config IOMMU_LEAK config MMIOTRACE_HOOKS bool - default n config MMIOTRACE - tristate "Memory mapped IO tracing" + bool "Memory mapped IO tracing" depends on DEBUG_KERNEL && RELAY && DEBUG_FS select MMIOTRACE_HOOKS - default n + default y help - This will build a kernel module called mmiotrace. - Making this a built-in is heavily discouraged. - - Mmiotrace traces Memory Mapped I/O access and is meant for debugging - and reverse engineering. The kernel module offers wrapped - versions of the ioremap family of functions. The driver to be traced - must be modified to call these wrappers. A user space program is - required to collect the MMIO data. + Mmiotrace traces Memory Mapped I/O access and is meant for + debugging and reverse engineering. It is called from the ioremap + implementation and works via page faults. A user space program is + required to collect the MMIO data from debugfs files. + Tracing is disabled by default and can be enabled from a debugfs + file. See http://nouveau.freedesktop.org/wiki/MmioTrace If you are not helping to develop drivers, say N. @@ -193,7 +190,6 @@ config MMIOTRACE config MMIOTRACE_TEST tristate "Test module for mmiotrace" depends on MMIOTRACE && m - default n help This is a dumb module for testing mmiotrace. It is very dangerous as it will write garbage to IO memory starting at a given address. 
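The "real mutex" item in the change list above refers to the DECLARE_MUTEX/DEFINE_MUTEX switch visible in the kmmio.c hunk below; in kernels of this era DECLARE_MUTEX still declared a semaphore. A minimal before/after sketch of the pattern being replaced:

	/* before: a semaphore posing as a mutex */
	static DECLARE_MUTEX(kmmio_init_mutex);

	down(&kmmio_init_mutex);
	/* ... critical section ... */
	up(&kmmio_init_mutex);

	/* after: a real struct mutex (lockdep-aware, with owner debugging) */
	static DEFINE_MUTEX(kmmio_init_mutex);

	mutex_lock(&kmmio_init_mutex);
	/* ... critical section ... */
	mutex_unlock(&kmmio_init_mutex);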
diff --git a/arch/x86/kernel/mmiotrace/Makefile b/arch/x86/kernel/mmiotrace/Makefile index cf1e747b463e..dbcd8d50fb8d 100644 --- a/arch/x86/kernel/mmiotrace/Makefile +++ b/arch/x86/kernel/mmiotrace/Makefile @@ -1,4 +1,4 @@ obj-$(CONFIG_MMIOTRACE_HOOKS) += kmmio.o obj-$(CONFIG_MMIOTRACE) += mmiotrace.o -mmiotrace-objs := pf_in.o mmio-mod.o +mmiotrace-y := pf_in.o mmio-mod.o obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o diff --git a/arch/x86/kernel/mmiotrace/kmmio.c b/arch/x86/kernel/mmiotrace/kmmio.c index 539a9b19588f..efb467933087 100644 --- a/arch/x86/kernel/mmiotrace/kmmio.c +++ b/arch/x86/kernel/mmiotrace/kmmio.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -59,7 +60,7 @@ struct kmmio_context { static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args); -static DECLARE_MUTEX(kmmio_init_mutex); +static DEFINE_MUTEX(kmmio_init_mutex); static DEFINE_SPINLOCK(kmmio_lock); /* These are protected by kmmio_lock */ @@ -90,7 +91,7 @@ static struct notifier_block nb_die = { */ void reference_kmmio(void) { - down(&kmmio_init_mutex); + mutex_lock(&kmmio_init_mutex); spin_lock_irq(&kmmio_lock); if (!kmmio_initialized) { int i; @@ -101,7 +102,7 @@ void reference_kmmio(void) } kmmio_initialized++; spin_unlock_irq(&kmmio_lock); - up(&kmmio_init_mutex); + mutex_unlock(&kmmio_init_mutex); } EXPORT_SYMBOL_GPL(reference_kmmio); @@ -115,7 +116,7 @@ void unreference_kmmio(void) { bool unreg = false; - down(&kmmio_init_mutex); + mutex_lock(&kmmio_init_mutex); spin_lock_irq(&kmmio_lock); if (kmmio_initialized == 1) { @@ -128,7 +129,7 @@ void unreference_kmmio(void) if (unreg) unregister_die_notifier(&nb_die); /* calls sync_rcu() */ - up(&kmmio_init_mutex); + mutex_unlock(&kmmio_init_mutex); } EXPORT_SYMBOL(unreference_kmmio); @@ -244,17 +245,13 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) * Preemption is now disabled to prevent process switch during * single stepping. We can only handle one active kmmio trace * per cpu, so ensure that we finish it before something else - * gets to run. - * - * XXX what if an interrupt occurs between returning from - * do_page_fault() and entering the single-step exception handler? - * And that interrupt triggers a kmmio trap? - * XXX If we tracing an interrupt service routine or whatever, is - * this enough to keep it on the current cpu? + * gets to run. We also hold the RCU read lock over single + * stepping to avoid looking up the probe and kmmio_fault_page + * again. */ preempt_disable(); - rcu_read_lock(); + faultpage = get_kmmio_fault_page(addr); if (!faultpage) { /* @@ -287,14 +284,24 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) if (ctx->probe && ctx->probe->pre_handler) ctx->probe->pre_handler(ctx->probe, regs, addr); + /* + * Enable single-stepping and disable interrupts for the faulting + * context. Local interrupts must not get enabled during stepping. + */ regs->flags |= TF_MASK; regs->flags &= ~IF_MASK; /* Now we set present bit in PTE and single step. */ disarm_kmmio_fault_page(ctx->fpage->page, NULL); + /* + * If another cpu accesses the same page while we are stepping, + * the access will not be caught. It will simply succeed and the + * only downside is we lose the event. If this becomes a problem, + * the user should drop to single cpu before tracing. 
+ */ + put_cpu_var(kmmio_ctx); - rcu_read_unlock(); return 1; no_kmmio_ctx: @@ -313,32 +320,15 @@ no_kmmio: static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) { int ret = 0; - struct kmmio_probe *probe; - struct kmmio_fault_page *faultpage; struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); if (!ctx->active) goto out; - rcu_read_lock(); - - faultpage = get_kmmio_fault_page(ctx->addr); - probe = get_kmmio_probe(ctx->addr); - if (faultpage != ctx->fpage || probe != ctx->probe) { - /* - * The trace setup changed after kmmio_handler() and before - * running this respective post handler. User does not want - * the result anymore. - */ - ctx->probe = NULL; - ctx->fpage = NULL; - } - if (ctx->probe && ctx->probe->post_handler) ctx->probe->post_handler(ctx->probe, condition, regs); - if (ctx->fpage) - arm_kmmio_fault_page(ctx->fpage->page, NULL); + arm_kmmio_fault_page(ctx->fpage->page, NULL); regs->flags &= ~TF_MASK; regs->flags |= ctx->saved_flags; @@ -346,6 +336,7 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) /* These were acquired in kmmio_handler(). */ ctx->active--; BUG_ON(ctx->active); + rcu_read_unlock(); preempt_enable_no_resched(); /* @@ -355,8 +346,6 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) */ if (!(regs->flags & TF_MASK)) ret = 1; - - rcu_read_unlock(); out: put_cpu_var(kmmio_ctx); return ret; @@ -411,15 +400,16 @@ static void release_kmmio_fault_page(unsigned long page, int register_kmmio_probe(struct kmmio_probe *p) { + unsigned long flags; int ret = 0; unsigned long size = 0; - spin_lock_irq(&kmmio_lock); - kmmio_count++; + spin_lock_irqsave(&kmmio_lock, flags); if (get_kmmio_probe(p->addr)) { ret = -EEXIST; goto out; } + kmmio_count++; list_add_rcu(&p->list, &kmmio_probes); while (size < p->len) { if (add_kmmio_fault_page(p->addr + size)) @@ -427,7 +417,7 @@ int register_kmmio_probe(struct kmmio_probe *p) size += PAGE_SIZE; } out: - spin_unlock_irq(&kmmio_lock); + spin_unlock_irqrestore(&kmmio_lock, flags); /* * XXX: What should I do here? * Here was a call to global_flush_tlb(), but it does not exist @@ -478,7 +468,8 @@ static void remove_kmmio_fault_pages(struct rcu_head *head) /* * Remove a kmmio probe. You have to synchronize_rcu() before you can be - * sure that the callbacks will not be called anymore. + * sure that the callbacks will not be called anymore. Only after that + * you may actually release your struct kmmio_probe. * * Unregistering a kmmio fault page has three steps: * 1. release_kmmio_fault_page() @@ -490,18 +481,19 @@ static void remove_kmmio_fault_pages(struct rcu_head *head) */ void unregister_kmmio_probe(struct kmmio_probe *p) { + unsigned long flags; unsigned long size = 0; struct kmmio_fault_page *release_list = NULL; struct kmmio_delayed_release *drelease; - spin_lock_irq(&kmmio_lock); + spin_lock_irqsave(&kmmio_lock, flags); while (size < p->len) { release_kmmio_fault_page(p->addr + size, &release_list); size += PAGE_SIZE; } list_del_rcu(&p->list); kmmio_count--; - spin_unlock_irq(&kmmio_lock); + spin_unlock_irqrestore(&kmmio_lock, flags); drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC); if (!drelease) { diff --git a/arch/x86/kernel/mmiotrace/mmio-mod.c b/arch/x86/kernel/mmiotrace/mmio-mod.c index e1a508588f03..738644061e4e 100644 --- a/arch/x86/kernel/mmiotrace/mmio-mod.c +++ b/arch/x86/kernel/mmiotrace/mmio-mod.c @@ -19,6 +19,8 @@ * * Derived from the read-mod example from relay-examples by Tom Zanussi. 
*/ +#define DEBUG 1 + #include #include #include @@ -34,12 +36,12 @@ #include "pf_in.h" -/* This app's relay channel files will appear in /debug/mmio-trace */ -#define APP_DIR "mmio-trace" -/* the marker injection file in /proc */ -#define MARKER_FILE "mmio-marker" +#define NAME "mmiotrace: " -#define MODULE_NAME "mmiotrace" +/* This app's relay channel files will appear in /debug/mmio-trace */ +static const char APP_DIR[] = "mmio-trace"; +/* the marker injection file in /debug/APP_DIR */ +static const char MARKER_FILE[] = "mmio-marker"; struct trap_reason { unsigned long addr; @@ -48,6 +50,15 @@ struct trap_reason { int active_traces; }; +struct remap_trace { + struct list_head list; + struct kmmio_probe probe; + unsigned long phys; + unsigned long id; +}; + +static const size_t subbuf_size = 256*1024; + /* Accessed per-cpu. */ static DEFINE_PER_CPU(struct trap_reason, pf_reason); static DEFINE_PER_CPU(struct mm_io_header_rw, cpu_trace); @@ -55,33 +66,53 @@ static DEFINE_PER_CPU(struct mm_io_header_rw, cpu_trace); /* Access to this is not per-cpu. */ static DEFINE_PER_CPU(atomic_t, dropped); -static struct file_operations mmio_fops = { - .owner = THIS_MODULE, -}; +static struct dentry *dir; +static struct dentry *enabled_file; +static struct dentry *marker_file; -static const size_t subbuf_size = 256*1024; +static DEFINE_MUTEX(mmiotrace_mutex); +static DEFINE_SPINLOCK(trace_lock); +static atomic_t mmiotrace_enabled; +static LIST_HEAD(trace_list); /* struct remap_trace */ static struct rchan *chan; -static struct dentry *dir; -static struct proc_dir_entry *proc_marker_file; + +/* + * Locking in this file: + * - mmiotrace_mutex enforces enable/disable_mmiotrace() critical sections. + * - mmiotrace_enabled may be modified only when holding mmiotrace_mutex + * and trace_lock. + * - Routines depending on is_enabled() must take trace_lock. + * - trace_list users must hold trace_lock. + * - is_enabled() guarantees that chan is valid. + * - pre/post callbacks assume the effect of is_enabled() being true. 
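+ *   (That is, pre() and post() may write to the relay channel without
+ *   re-checking: probes are registered only while mmiotrace is enabled,
+ *   and disable_mmiotrace() unregisters them all before closing chan.)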
+ */ /* module parameters */ -static unsigned int n_subbufs = 32*4; -static unsigned long filter_offset; -static int nommiotrace; -static int ISA_trace; -static int trace_pc; +static unsigned int n_subbufs = 32*4; +static unsigned long filter_offset; +static int nommiotrace; +static int ISA_trace; +static int trace_pc; +static int enable_now; module_param(n_subbufs, uint, 0); module_param(filter_offset, ulong, 0); module_param(nommiotrace, bool, 0); module_param(ISA_trace, bool, 0); module_param(trace_pc, bool, 0); +module_param(enable_now, bool, 0); MODULE_PARM_DESC(n_subbufs, "Number of 256kB buffers, default 128."); MODULE_PARM_DESC(filter_offset, "Start address of traced mappings."); MODULE_PARM_DESC(nommiotrace, "Disable actual MMIO tracing."); MODULE_PARM_DESC(ISA_trace, "Do not exclude the low ISA range."); MODULE_PARM_DESC(trace_pc, "Record address of faulting instructions."); +MODULE_PARM_DESC(enable_now, "Start mmiotracing immediately on module load."); + +static bool is_enabled(void) +{ + return atomic_read(&mmiotrace_enabled); +} static void record_timestamp(struct mm_io_header *header) { @@ -93,15 +124,15 @@ static void record_timestamp(struct mm_io_header *header) } /* - * Write callback for the /proc entry: + * Write callback for the debugfs entry: * Read a marker and write it to the mmio trace log */ -static int write_marker(struct file *file, const char __user *buffer, - unsigned long count, void *data) +static ssize_t write_marker(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) { char *event = NULL; struct mm_io_header *headp; - int len = (count > 65535) ? 65535 : count; + ssize_t len = (count > 65535) ? 65535 : count; event = kzalloc(sizeof(*headp) + len, GFP_KERNEL); if (!event) @@ -117,7 +148,12 @@ static int write_marker(struct file *file, const char __user *buffer, return -EFAULT; } - relay_write(chan, event, sizeof(*headp) + len); + spin_lock_irq(&trace_lock); + if (is_enabled()) + relay_write(chan, event, sizeof(*headp) + len); + else + len = -EINVAL; + spin_unlock_irq(&trace_lock); kfree(event); return len; } @@ -128,19 +164,18 @@ static void print_pte(unsigned long address) pte_t *pte = lookup_address(address, &level); if (!pte) { - pr_err(MODULE_NAME ": Error in %s: no pte for page 0x%08lx\n", + pr_err(NAME "Error in %s: no pte for page 0x%08lx\n", __func__, address); return; } if (level == PG_LEVEL_2M) { - pr_emerg(MODULE_NAME ": 4MB pages are not currently " - "supported: %lx\n", address); + pr_emerg(NAME "4MB pages are not currently supported: " + "0x%08lx\n", address); BUG(); } - pr_info(MODULE_NAME ": pte for 0x%lx: 0x%lx 0x%lx\n", - address, pte_val(*pte), - pte_val(*pte) & _PAGE_PRESENT); + pr_info(NAME "pte for 0x%lx: 0x%lx 0x%lx\n", address, pte_val(*pte), + pte_val(*pte) & _PAGE_PRESENT); } /* @@ -150,22 +185,18 @@ static void print_pte(unsigned long address) static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr) { const struct trap_reason *my_reason = &get_cpu_var(pf_reason); - pr_emerg(MODULE_NAME ": unexpected fault for address: %lx, " - "last fault for address: %lx\n", + pr_emerg(NAME "unexpected fault for address: 0x%08lx, " + "last fault for address: 0x%08lx\n", addr, my_reason->addr); print_pte(addr); + print_symbol(KERN_EMERG "faulting IP is at %s\n", regs->ip); + print_symbol(KERN_EMERG "last faulting IP was at %s\n", my_reason->ip); #ifdef __i386__ - print_symbol(KERN_EMERG "faulting EIP is at %s\n", regs->ip); - print_symbol(KERN_EMERG "last faulting EIP was at %s\n", - my_reason->ip); 
pr_emerg("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", regs->ax, regs->bx, regs->cx, regs->dx); pr_emerg("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", regs->si, regs->di, regs->bp, regs->sp); #else - print_symbol(KERN_EMERG "faulting RIP is at %s\n", regs->ip); - print_symbol(KERN_EMERG "last faulting RIP was at %s\n", - my_reason->ip); pr_emerg("rax: %016lx rcx: %016lx rdx: %016lx\n", regs->ax, regs->cx, regs->dx); pr_emerg("rsi: %016lx rdi: %016lx rbp: %016lx rsp: %016lx\n", @@ -197,6 +228,10 @@ static void pre(struct kmmio_probe *p, struct pt_regs *regs, my_trace->header.pid = 0; my_trace->header.data_len = sizeof(struct mm_io_rw); my_trace->rw.address = addr; + /* + * struct remap_trace *trace = p->user_data; + * phys = addr - trace->probe.addr + trace->phys; + */ /* * Only record the program counter when requested. @@ -246,15 +281,10 @@ static void post(struct kmmio_probe *p, unsigned long condition, struct trap_reason *my_reason = &get_cpu_var(pf_reason); struct mm_io_header_rw *my_trace = &get_cpu_var(cpu_trace); - /* - * XXX: This might not get called, if the probe is removed while - * trace hit is on flight. - */ - /* this should always return the active_trace count to 0 */ my_reason->active_traces--; if (my_reason->active_traces) { - pr_emerg(MODULE_NAME ": unexpected post handler"); + pr_emerg(NAME "unexpected post handler"); BUG(); } @@ -284,20 +314,23 @@ static int subbuf_start_handler(struct rchan_buf *buf, void *subbuf, int count; if (relay_buf_full(buf)) { if (atomic_inc_return(drop) == 1) - pr_err(MODULE_NAME ": cpu %d buffer full!\n", cpu); + pr_err(NAME "cpu %d buffer full!\n", cpu); return 0; } count = atomic_read(drop); if (count) { - pr_err(MODULE_NAME ": cpu %d buffer no longer full, " - "missed %d events.\n", - cpu, count); + pr_err(NAME "cpu %d buffer no longer full, missed %d events.\n", + cpu, count); atomic_sub(count, drop); } return 1; } +static struct file_operations mmio_fops = { + .owner = THIS_MODULE, +}; + /* file_create() callback. Creates relay file in debugfs. 
*/ static struct dentry *create_buf_file_handler(const char *filename, struct dentry *parent, @@ -333,34 +366,10 @@ static struct rchan_callbacks relay_callbacks = { .remove_buf_file = remove_buf_file_handler, }; -/* - * create_channel - creates channel /debug/APP_DIR/cpuXXX - * Returns channel on success, NULL otherwise - */ -static struct rchan *create_channel(unsigned size, unsigned n) -{ - return relay_open("cpu", dir, size, n, &relay_callbacks, NULL); -} - -/* destroy_channel - destroys channel /debug/APP_DIR/cpuXXX */ -static void destroy_channel(void) -{ - if (chan) { - relay_close(chan); - chan = NULL; - } -} - -struct remap_trace { - struct list_head list; - struct kmmio_probe probe; -}; -static LIST_HEAD(trace_list); -static DEFINE_SPINLOCK(trace_list_lock); - -static void do_ioremap_trace_core(unsigned long offset, unsigned long size, +static void ioremap_trace_core(unsigned long offset, unsigned long size, void __iomem *addr) { + static atomic_t next_id; struct remap_trace *trace = kmalloc(sizeof(*trace), GFP_KERNEL); struct mm_io_header_map event = { .header = { @@ -380,61 +389,49 @@ static void do_ioremap_trace_core(unsigned long offset, unsigned long size, }; record_timestamp(&event.header); + if (!trace) { + pr_err(NAME "kmalloc failed in ioremap\n"); + return; + } + *trace = (struct remap_trace) { .probe = { .addr = (unsigned long)addr, .len = size, .pre_handler = pre, .post_handler = post, - } + .user_data = trace + }, + .phys = offset, + .id = atomic_inc_return(&next_id) }; + spin_lock_irq(&trace_lock); + if (!is_enabled()) + goto not_enabled; + relay_write(chan, &event, sizeof(event)); - spin_lock(&trace_list_lock); list_add_tail(&trace->list, &trace_list); - spin_unlock(&trace_list_lock); if (!nommiotrace) register_kmmio_probe(&trace->probe); + +not_enabled: + spin_unlock_irq(&trace_lock); } -static void ioremap_trace_core(unsigned long offset, unsigned long size, - void __iomem *addr) +void +mmiotrace_ioremap(unsigned long offset, unsigned long size, void __iomem *addr) { - if ((filter_offset) && (offset != filter_offset)) + if (!is_enabled()) /* recheck and proper locking in *_core() */ return; - /* Don't trace the low PCI/ISA area, it's always mapped.. 
*/ - if (!ISA_trace && (offset < ISA_END_ADDRESS) && - (offset + size > ISA_START_ADDRESS)) { - pr_notice(MODULE_NAME ": Ignoring map of low PCI/ISA area " - "(0x%lx-0x%lx)\n", - offset, offset + size); + pr_debug(NAME "ioremap_*(0x%lx, 0x%lx) = %p\n", offset, size, addr); + if ((filter_offset) && (offset != filter_offset)) return; - } - do_ioremap_trace_core(offset, size, addr); -} - -void __iomem *ioremap_cache_trace(unsigned long offset, unsigned long size) -{ - void __iomem *p = ioremap_cache(offset, size); - pr_debug(MODULE_NAME ": ioremap_cache(0x%lx, 0x%lx) = %p\n", - offset, size, p); - ioremap_trace_core(offset, size, p); - return p; + ioremap_trace_core(offset, size, addr); } -EXPORT_SYMBOL(ioremap_cache_trace); -void __iomem *ioremap_nocache_trace(unsigned long offset, unsigned long size) -{ - void __iomem *p = ioremap_nocache(offset, size); - pr_debug(MODULE_NAME ": ioremap_nocache(0x%lx, 0x%lx) = %p\n", - offset, size, p); - ioremap_trace_core(offset, size, p); - return p; -} -EXPORT_SYMBOL(ioremap_nocache_trace); - -void iounmap_trace(volatile void __iomem *addr) +static void iounmap_trace_core(volatile void __iomem *addr) { struct mm_io_header_map event = { .header = { @@ -454,84 +451,212 @@ void iounmap_trace(volatile void __iomem *addr) }; struct remap_trace *trace; struct remap_trace *tmp; - pr_debug(MODULE_NAME ": Unmapping %p.\n", addr); + struct remap_trace *found_trace = NULL; + + pr_debug(NAME "Unmapping %p.\n", addr); record_timestamp(&event.header); - spin_lock(&trace_list_lock); + spin_lock_irq(&trace_lock); + if (!is_enabled()) + goto not_enabled; + list_for_each_entry_safe(trace, tmp, &trace_list, list) { if ((unsigned long)addr == trace->probe.addr) { if (!nommiotrace) unregister_kmmio_probe(&trace->probe); list_del(&trace->list); - kfree(trace); + found_trace = trace; break; } } - spin_unlock(&trace_list_lock); relay_write(chan, &event, sizeof(event)); - iounmap(addr); + +not_enabled: + spin_unlock_irq(&trace_lock); + if (found_trace) { + synchronize_rcu(); /* unregister_kmmio_probe() requirement */ + kfree(found_trace); + } +} + +void mmiotrace_iounmap(volatile void __iomem *addr) +{ + might_sleep(); + if (is_enabled()) /* recheck and proper locking in *_core() */ + iounmap_trace_core(addr); } -EXPORT_SYMBOL(iounmap_trace); static void clear_trace_list(void) { struct remap_trace *trace; struct remap_trace *tmp; - spin_lock(&trace_list_lock); - list_for_each_entry_safe(trace, tmp, &trace_list, list) { - pr_warning(MODULE_NAME ": purging non-iounmapped " + /* + * No locking required, because the caller ensures we are in a + * critical section via mutex, and is_enabled() is false, + * i.e. nothing can traverse or modify this list. + * Caller also ensures is_enabled() cannot change. 
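+	 * (Concretely: disable_mmiotrace() takes mmiotrace_mutex and
+	 * clears mmiotrace_enabled under trace_lock before calling us.)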
+ */ + list_for_each_entry(trace, &trace_list, list) { + pr_notice(NAME "purging non-iounmapped " "trace @0x%08lx, size 0x%lx.\n", trace->probe.addr, trace->probe.len); if (!nommiotrace) unregister_kmmio_probe(&trace->probe); + } + synchronize_rcu(); /* unregister_kmmio_probe() requirement */ + + list_for_each_entry_safe(trace, tmp, &trace_list, list) { list_del(&trace->list); kfree(trace); + } +} + +static ssize_t read_enabled_file_bool(struct file *file, + char __user *user_buf, size_t count, loff_t *ppos) +{ + char buf[3]; + + if (is_enabled()) + buf[0] = '1'; + else + buf[0] = '0'; + buf[1] = '\n'; + buf[2] = '\0'; + return simple_read_from_buffer(user_buf, count, ppos, buf, 2); +} + +static void enable_mmiotrace(void); +static void disable_mmiotrace(void); + +static ssize_t write_enabled_file_bool(struct file *file, + const char __user *user_buf, size_t count, loff_t *ppos) +{ + char buf[32]; + int buf_size = min(count, (sizeof(buf)-1)); + + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + + switch (buf[0]) { + case 'y': + case 'Y': + case '1': + enable_mmiotrace(); + break; + case 'n': + case 'N': + case '0': + disable_mmiotrace(); break; } - spin_unlock(&trace_list_lock); + + return count; +} + +/* this ripped from kernel/kprobes.c */ +static struct file_operations fops_enabled = { + .owner = THIS_MODULE, + .read = read_enabled_file_bool, + .write = write_enabled_file_bool +}; + +static struct file_operations fops_marker = { + .owner = THIS_MODULE, + .write = write_marker +}; + +static void enable_mmiotrace(void) +{ + mutex_lock(&mmiotrace_mutex); + if (is_enabled()) + goto out; + + chan = relay_open("cpu", dir, subbuf_size, n_subbufs, + &relay_callbacks, NULL); + if (!chan) { + pr_err(NAME "relay app channel creation failed.\n"); + goto out; + } + + reference_kmmio(); + + marker_file = debugfs_create_file("marker", 0660, dir, NULL, + &fops_marker); + if (!marker_file) + pr_err(NAME "marker file creation failed.\n"); + + if (nommiotrace) + pr_info(NAME "MMIO tracing disabled.\n"); + if (ISA_trace) + pr_warning(NAME "Warning! 
low ISA range will be traced.\n"); + spin_lock_irq(&trace_lock); + atomic_inc(&mmiotrace_enabled); + spin_unlock_irq(&trace_lock); + pr_info(NAME "enabled.\n"); +out: + mutex_unlock(&mmiotrace_mutex); +} + +static void disable_mmiotrace(void) +{ + mutex_lock(&mmiotrace_mutex); + if (!is_enabled()) + goto out; + + spin_lock_irq(&trace_lock); + atomic_dec(&mmiotrace_enabled); + BUG_ON(is_enabled()); + spin_unlock_irq(&trace_lock); + + clear_trace_list(); /* guarantees: no more kmmio callbacks */ + unreference_kmmio(); + if (marker_file) { + debugfs_remove(marker_file); + marker_file = NULL; + } + if (chan) { + relay_close(chan); + chan = NULL; + } + + pr_info(NAME "disabled.\n"); +out: + mutex_unlock(&mmiotrace_mutex); } static int __init init(void) { + pr_debug(NAME "load...\n"); if (n_subbufs < 2) return -EINVAL; dir = debugfs_create_dir(APP_DIR, NULL); if (!dir) { - pr_err(MODULE_NAME ": Couldn't create relay app directory.\n"); + pr_err(NAME "Couldn't create relay app directory.\n"); return -ENOMEM; } - chan = create_channel(subbuf_size, n_subbufs); - if (!chan) { + enabled_file = debugfs_create_file("enabled", 0600, dir, NULL, + &fops_enabled); + if (!enabled_file) { + pr_err(NAME "Couldn't create enabled file.\n"); debugfs_remove(dir); - pr_err(MODULE_NAME ": relay app channel creation failed\n"); return -ENOMEM; } - reference_kmmio(); - - proc_marker_file = create_proc_entry(MARKER_FILE, 0, NULL); - if (proc_marker_file) - proc_marker_file->write_proc = write_marker; + if (enable_now) + enable_mmiotrace(); - pr_debug(MODULE_NAME ": loaded.\n"); - if (nommiotrace) - pr_info(MODULE_NAME ": MMIO tracing disabled.\n"); - if (ISA_trace) - pr_warning(MODULE_NAME ": Warning! low ISA range will be " - "traced.\n"); return 0; } static void __exit cleanup(void) { - pr_debug(MODULE_NAME ": unload...\n"); - clear_trace_list(); - unreference_kmmio(); - remove_proc_entry(MARKER_FILE, NULL); - destroy_channel(); + pr_debug(NAME "unload...\n"); + if (enabled_file) + debugfs_remove(enabled_file); + disable_mmiotrace(); if (dir) debugfs_remove(dir); } diff --git a/arch/x86/kernel/mmiotrace/testmmiotrace.c b/arch/x86/kernel/mmiotrace/testmmiotrace.c index 5ecff578672b..cfa60b227c8d 100644 --- a/arch/x86/kernel/mmiotrace/testmmiotrace.c +++ b/arch/x86/kernel/mmiotrace/testmmiotrace.c @@ -4,10 +4,6 @@ #include #include -extern void __iomem *ioremap_nocache_trace(unsigned long offset, - unsigned long size); -extern void iounmap_trace(volatile void __iomem *addr); - #define MODULE_NAME "testmmiotrace" static unsigned long mmio_address; @@ -28,25 +24,24 @@ static void do_write_test(void __iomem *p) static void do_read_test(void __iomem *p) { unsigned int i; - volatile unsigned int v; for (i = 0; i < 256; i++) - v = ioread8(p + i); + ioread8(p + i); for (i = 1024; i < (5 * 1024); i += 2) - v = ioread16(p + i); + ioread16(p + i); for (i = (5 * 1024); i < (16 * 1024); i += 4) - v = ioread32(p + i); + ioread32(p + i); } static void do_test(void) { - void __iomem *p = ioremap_nocache_trace(mmio_address, 0x4000); + void __iomem *p = ioremap_nocache(mmio_address, 0x4000); if (!p) { pr_err(MODULE_NAME ": could not ioremap, aborting.\n"); return; } do_write_test(p); do_read_test(p); - iounmap_trace(p); + iounmap(p); } static int __init init(void) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 71bb3159031a..8927c878544d 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -126,6 +127,7 @@ static void __iomem 
*__ioremap_caller(resource_size_t phys_addr, unsigned long new_prot_val; pgprot_t prot; int retval; + void __iomem *ret_addr; /* Don't allow wraparound or zero size */ last_addr = phys_addr + size - 1; @@ -233,7 +235,10 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, return NULL; } - return (void __iomem *) (vaddr + offset); + ret_addr = (void __iomem *) (vaddr + offset); + mmiotrace_ioremap(phys_addr, size, ret_addr); + + return ret_addr; } /** @@ -325,6 +330,8 @@ void iounmap(volatile void __iomem *addr) addr = (volatile void __iomem *) (PAGE_MASK & (unsigned long __force)addr); + mmiotrace_iounmap(addr); + /* Use the vm area unlocked, assuming the caller ensures there isn't another iounmap for the same address in parallel. Reuse of the virtual address is prevented by diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h index d87a6cd8b686..cb5efd0c7f51 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -16,11 +16,12 @@ typedef void (*kmmio_post_handler_t)(struct kmmio_probe *, unsigned long condition, struct pt_regs *); struct kmmio_probe { - struct list_head list; + struct list_head list; /* kmmio internal list */ unsigned long addr; /* start location of the probe point */ unsigned long len; /* length of the probe region */ kmmio_pre_handler_t pre_handler; /* Called before addr is executed. */ kmmio_post_handler_t post_handler; /* Called after addr is executed */ + void *user_data; }; /* kmmio is active by some kmmio_probes? */ @@ -38,6 +39,21 @@ extern void unregister_kmmio_probe(struct kmmio_probe *p); /* Called from page fault handler. */ extern int kmmio_handler(struct pt_regs *regs, unsigned long addr); +/* Called from ioremap.c */ +#ifdef CONFIG_MMIOTRACE +extern void +mmiotrace_ioremap(unsigned long offset, unsigned long size, void __iomem *addr); +extern void mmiotrace_iounmap(volatile void __iomem *addr); +#else +static inline void +mmiotrace_ioremap(unsigned long offset, unsigned long size, void __iomem *addr) +{ +} +static inline void mmiotrace_iounmap(volatile void __iomem *addr) +{ +} +#endif /* CONFIG_MMIOTRACE_HOOKS */ + #endif /* __KERNEL__ */ -- cgit v1.2.3 From f984b51e0779a6dd30feedc41404013ca54e5d05 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:20:57 +0200 Subject: ftrace: add mmiotrace plugin On Sat, 22 Mar 2008 13:07:47 +0100 Ingo Molnar wrote: > > > i'd suggest the following: pull x86.git and sched-devel.git into a > > > single tree [the two will combine without rejects]. Then try to add a > > > kernel/tracing/trace_mmiotrace.c ftrace plugin. The trace_sysprof.c > > > plugin might be a good example. > > > > I did this and now I have mmiotrace enabled/disabled via the tracing > > framework (what do we call this, since ftrace is one of the tracers?). > > cool! could you send the patches for that? (even if they are not fully > functional yet) Patch attached in the end. Nice to see how much code disappeared. I tried to mark all the features I had to break with XXX-comments. 
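For reference, the plugin interface itself is small: a tracer fills in a
struct tracer and registers it. Roughly, as a minimal sketch modeled on the
trace_mmiotrace.c code added below (enable_example() and disable_example()
are placeholders for whatever starts and stops the event source):

    static struct trace_array *example_array;

    /* called when this tracer is selected */
    static void example_init(struct trace_array *tr)
    {
            example_array = tr;
            if (tr->ctrl)           /* tracing is switched on */
                    enable_example();
    }

    /* called when this tracer is deselected */
    static void example_reset(struct trace_array *tr)
    {
            if (tr->ctrl)
                    disable_example();
    }

    static struct tracer example_tracer __read_mostly =
    {
            .name           = "example",
            .init           = example_init,
            .reset          = example_reset,
    };

    __init static int init_example_trace(void)
    {
            return register_tracer(&example_tracer);
    }
    device_initcall(init_example_trace);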
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig.debug | 3 +- arch/x86/kernel/mmiotrace/mmio-mod.c | 208 +++++------------------------------ include/linux/mmiotrace.h | 6 + kernel/trace/Makefile | 1 + kernel/trace/trace_mmiotrace.c | 84 ++++++++++++++ 5 files changed, 123 insertions(+), 179 deletions(-) create mode 100644 kernel/trace/trace_mmiotrace.c (limited to 'include/linux') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index aa0d6462b1fc..7e4b8494078e 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -173,7 +173,8 @@ config MMIOTRACE_HOOKS config MMIOTRACE bool "Memory mapped IO tracing" - depends on DEBUG_KERNEL && RELAY && DEBUG_FS + depends on DEBUG_KERNEL && RELAY + select TRACING select MMIOTRACE_HOOKS default y help diff --git a/arch/x86/kernel/mmiotrace/mmio-mod.c b/arch/x86/kernel/mmiotrace/mmio-mod.c index 738644061e4e..c7a67d7e482b 100644 --- a/arch/x86/kernel/mmiotrace/mmio-mod.c +++ b/arch/x86/kernel/mmiotrace/mmio-mod.c @@ -22,9 +22,8 @@ #define DEBUG 1 #include -#include #include -#include +#include #include #include #include @@ -63,18 +62,18 @@ static const size_t subbuf_size = 256*1024; static DEFINE_PER_CPU(struct trap_reason, pf_reason); static DEFINE_PER_CPU(struct mm_io_header_rw, cpu_trace); +#if 0 /* XXX: no way gather this info anymore */ /* Access to this is not per-cpu. */ static DEFINE_PER_CPU(atomic_t, dropped); +#endif static struct dentry *dir; -static struct dentry *enabled_file; static struct dentry *marker_file; static DEFINE_MUTEX(mmiotrace_mutex); static DEFINE_SPINLOCK(trace_lock); static atomic_t mmiotrace_enabled; static LIST_HEAD(trace_list); /* struct remap_trace */ -static struct rchan *chan; /* * Locking in this file: @@ -93,36 +92,24 @@ static unsigned long filter_offset; static int nommiotrace; static int ISA_trace; static int trace_pc; -static int enable_now; module_param(n_subbufs, uint, 0); module_param(filter_offset, ulong, 0); module_param(nommiotrace, bool, 0); module_param(ISA_trace, bool, 0); module_param(trace_pc, bool, 0); -module_param(enable_now, bool, 0); MODULE_PARM_DESC(n_subbufs, "Number of 256kB buffers, default 128."); MODULE_PARM_DESC(filter_offset, "Start address of traced mappings."); MODULE_PARM_DESC(nommiotrace, "Disable actual MMIO tracing."); MODULE_PARM_DESC(ISA_trace, "Do not exclude the low ISA range."); MODULE_PARM_DESC(trace_pc, "Record address of faulting instructions."); -MODULE_PARM_DESC(enable_now, "Start mmiotracing immediately on module load."); static bool is_enabled(void) { return atomic_read(&mmiotrace_enabled); } -static void record_timestamp(struct mm_io_header *header) -{ - struct timespec now; - - getnstimeofday(&now); - header->sec = now.tv_sec; - header->nsec = now.tv_nsec; -} - /* * Write callback for the debugfs entry: * Read a marker and write it to the mmio trace log @@ -141,7 +128,6 @@ static ssize_t write_marker(struct file *file, const char __user *buffer, headp = (struct mm_io_header *)event; headp->type = MMIO_MAGIC | (MMIO_MARKER << MMIO_OPCODE_SHIFT); headp->data_len = len; - record_timestamp(headp); if (copy_from_user(event + sizeof(*headp), buffer, len)) { kfree(event); @@ -149,9 +135,11 @@ static ssize_t write_marker(struct file *file, const char __user *buffer, } spin_lock_irq(&trace_lock); +#if 0 /* XXX: convert this to use tracing */ if (is_enabled()) relay_write(chan, event, sizeof(*headp) + len); else +#endif len = -EINVAL; spin_unlock_irq(&trace_lock); kfree(event); @@ -242,7 +230,11 @@ static void pre(struct 
kmmio_probe *p, struct pt_regs *regs, else my_trace->rw.pc = 0; - record_timestamp(&my_trace->header); + /* + * XXX: the timestamp recorded will be *after* the tracing has been + * done, not at the time we hit the instruction. SMP implications + * on event ordering? + */ switch (type) { case REG_READ: @@ -295,77 +287,19 @@ static void post(struct kmmio_probe *p, unsigned long condition, default: break; } - relay_write(chan, my_trace, sizeof(*my_trace)); + + /* + * XXX: Several required values are ignored: + * - mapping id + * - program counter + * Also the address should be physical, not virtual. + */ + mmio_trace_record(my_trace->header.type, my_trace->rw.address, + my_trace->rw.value); put_cpu_var(cpu_trace); put_cpu_var(pf_reason); } -/* - * subbuf_start() relay callback. - * - * Defined so that we know when events are dropped due to the buffer-full - * condition. - */ -static int subbuf_start_handler(struct rchan_buf *buf, void *subbuf, - void *prev_subbuf, size_t prev_padding) -{ - unsigned int cpu = buf->cpu; - atomic_t *drop = &per_cpu(dropped, cpu); - int count; - if (relay_buf_full(buf)) { - if (atomic_inc_return(drop) == 1) - pr_err(NAME "cpu %d buffer full!\n", cpu); - return 0; - } - count = atomic_read(drop); - if (count) { - pr_err(NAME "cpu %d buffer no longer full, missed %d events.\n", - cpu, count); - atomic_sub(count, drop); - } - - return 1; -} - -static struct file_operations mmio_fops = { - .owner = THIS_MODULE, -}; - -/* file_create() callback. Creates relay file in debugfs. */ -static struct dentry *create_buf_file_handler(const char *filename, - struct dentry *parent, - int mode, - struct rchan_buf *buf, - int *is_global) -{ - struct dentry *buf_file; - - mmio_fops.read = relay_file_operations.read; - mmio_fops.open = relay_file_operations.open; - mmio_fops.poll = relay_file_operations.poll; - mmio_fops.mmap = relay_file_operations.mmap; - mmio_fops.release = relay_file_operations.release; - mmio_fops.splice_read = relay_file_operations.splice_read; - - buf_file = debugfs_create_file(filename, mode, parent, buf, - &mmio_fops); - - return buf_file; -} - -/* file_remove() default callback. Removes relay file in debugfs. */ -static int remove_buf_file_handler(struct dentry *dentry) -{ - debugfs_remove(dentry); - return 0; -} - -static struct rchan_callbacks relay_callbacks = { - .subbuf_start = subbuf_start_handler, - .create_buf_file = create_buf_file_handler, - .remove_buf_file = remove_buf_file_handler, -}; - static void ioremap_trace_core(unsigned long offset, unsigned long size, void __iomem *addr) { @@ -387,7 +321,6 @@ static void ioremap_trace_core(unsigned long offset, unsigned long size, .pc = 0 } }; - record_timestamp(&event.header); if (!trace) { pr_err(NAME "kmalloc failed in ioremap\n"); @@ -410,7 +343,10 @@ static void ioremap_trace_core(unsigned long offset, unsigned long size, if (!is_enabled()) goto not_enabled; - relay_write(chan, &event, sizeof(event)); + /* + * XXX: Insufficient data recorded! 
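+	 * Only the event type, virtual address and mapping length are
+	 * passed on; the physical address carried by the old relay event
+	 * is dropped for now.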
+ */ + mmio_trace_record(event.header.type, event.map.addr, event.map.len); list_add_tail(&trace->list, &trace_list); if (!nommiotrace) register_kmmio_probe(&trace->probe); @@ -454,7 +390,6 @@ static void iounmap_trace_core(volatile void __iomem *addr) struct remap_trace *found_trace = NULL; pr_debug(NAME "Unmapping %p.\n", addr); - record_timestamp(&event.header); spin_lock_irq(&trace_lock); if (!is_enabled()) @@ -469,7 +404,8 @@ static void iounmap_trace_core(volatile void __iomem *addr) break; } } - relay_write(chan, &event, sizeof(event)); + mmio_trace_record(event.header.type, event.map.addr, + found_trace ? found_trace->id : -1); not_enabled: spin_unlock_irq(&trace_lock); @@ -512,77 +448,23 @@ static void clear_trace_list(void) } } -static ssize_t read_enabled_file_bool(struct file *file, - char __user *user_buf, size_t count, loff_t *ppos) -{ - char buf[3]; - - if (is_enabled()) - buf[0] = '1'; - else - buf[0] = '0'; - buf[1] = '\n'; - buf[2] = '\0'; - return simple_read_from_buffer(user_buf, count, ppos, buf, 2); -} - -static void enable_mmiotrace(void); -static void disable_mmiotrace(void); - -static ssize_t write_enabled_file_bool(struct file *file, - const char __user *user_buf, size_t count, loff_t *ppos) -{ - char buf[32]; - int buf_size = min(count, (sizeof(buf)-1)); - - if (copy_from_user(buf, user_buf, buf_size)) - return -EFAULT; - - switch (buf[0]) { - case 'y': - case 'Y': - case '1': - enable_mmiotrace(); - break; - case 'n': - case 'N': - case '0': - disable_mmiotrace(); - break; - } - - return count; -} - -/* this ripped from kernel/kprobes.c */ -static struct file_operations fops_enabled = { - .owner = THIS_MODULE, - .read = read_enabled_file_bool, - .write = write_enabled_file_bool -}; - static struct file_operations fops_marker = { .owner = THIS_MODULE, .write = write_marker }; -static void enable_mmiotrace(void) +void enable_mmiotrace(void) { mutex_lock(&mmiotrace_mutex); if (is_enabled()) goto out; - chan = relay_open("cpu", dir, subbuf_size, n_subbufs, - &relay_callbacks, NULL); - if (!chan) { - pr_err(NAME "relay app channel creation failed.\n"); - goto out; - } - reference_kmmio(); +#if 0 /* XXX: tracing does not support text entries */ marker_file = debugfs_create_file("marker", 0660, dir, NULL, &fops_marker); +#endif if (!marker_file) pr_err(NAME "marker file creation failed.\n"); @@ -598,7 +480,7 @@ out: mutex_unlock(&mmiotrace_mutex); } -static void disable_mmiotrace(void) +void disable_mmiotrace(void) { mutex_lock(&mmiotrace_mutex); if (!is_enabled()) @@ -615,17 +497,13 @@ static void disable_mmiotrace(void) debugfs_remove(marker_file); marker_file = NULL; } - if (chan) { - relay_close(chan); - chan = NULL; - } pr_info(NAME "disabled.\n"); out: mutex_unlock(&mmiotrace_mutex); } -static int __init init(void) +int __init init_mmiotrace(void) { pr_debug(NAME "load...\n"); if (n_subbufs < 2) @@ -636,31 +514,5 @@ static int __init init(void) pr_err(NAME "Couldn't create relay app directory.\n"); return -ENOMEM; } - - enabled_file = debugfs_create_file("enabled", 0600, dir, NULL, - &fops_enabled); - if (!enabled_file) { - pr_err(NAME "Couldn't create enabled file.\n"); - debugfs_remove(dir); - return -ENOMEM; - } - - if (enable_now) - enable_mmiotrace(); - return 0; } - -static void __exit cleanup(void) -{ - pr_debug(NAME "unload...\n"); - if (enabled_file) - debugfs_remove(enabled_file); - disable_mmiotrace(); - if (dir) - debugfs_remove(dir); -} - -module_init(init); -module_exit(cleanup); -MODULE_LICENSE("GPL"); diff --git a/include/linux/mmiotrace.h 
b/include/linux/mmiotrace.h index cb5efd0c7f51..579b3b06c90e 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -54,6 +54,12 @@ static inline void mmiotrace_iounmap(volatile void __iomem *addr) } #endif /* CONFIG_MMIOTRACE_HOOKS */ +/* in kernel/trace/trace_mmiotrace.c */ +extern int __init init_mmiotrace(void); +extern void enable_mmiotrace(void); +extern void disable_mmiotrace(void); +extern void mmio_trace_record(u32 type, unsigned long addr, unsigned long arg); + #endif /* __KERNEL__ */ diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index d9efbbfa2bdf..c44a7dce9086 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -18,5 +18,6 @@ obj-$(CONFIG_FTRACE) += trace_functions.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o +obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c new file mode 100644 index 000000000000..e4dd03cc5aa6 --- /dev/null +++ b/kernel/trace/trace_mmiotrace.c @@ -0,0 +1,84 @@ +/* + * Memory mapped I/O tracing + * + * Copyright (C) 2008 Pekka Paalanen + */ + +#define DEBUG 1 + +#include +#include + +#include "trace.h" + +extern void +__trace_special(void *__tr, void *__data, + unsigned long arg1, unsigned long arg2, unsigned long arg3); + +static struct trace_array *mmio_trace_array; + + +static void mmio_trace_init(struct trace_array *tr) +{ + pr_debug("in %s\n", __func__); + mmio_trace_array = tr; + if (tr->ctrl) + enable_mmiotrace(); +} + +static void mmio_trace_reset(struct trace_array *tr) +{ + pr_debug("in %s\n", __func__); + if (tr->ctrl) + disable_mmiotrace(); +} + +static void mmio_trace_ctrl_update(struct trace_array *tr) +{ + pr_debug("in %s\n", __func__); + if (tr->ctrl) + enable_mmiotrace(); + else + disable_mmiotrace(); +} + +static struct tracer mmio_tracer __read_mostly = +{ + .name = "mmiotrace", + .init = mmio_trace_init, + .reset = mmio_trace_reset, + .ctrl_update = mmio_trace_ctrl_update, +}; + +__init static int init_mmio_trace(void) +{ + int ret = init_mmiotrace(); + if (ret) + return ret; + return register_tracer(&mmio_tracer); +} +device_initcall(init_mmio_trace); + +void mmio_trace_record(u32 type, unsigned long addr, unsigned long arg) +{ + struct trace_array *tr = mmio_trace_array; + struct trace_array_cpu *data = tr->data[smp_processor_id()]; + + if (!current || current->pid == 0) { + /* + * XXX: This is a problem. We need to able to record, no + * matter what. tracing_generic_entry_update() would crash. + */ + static unsigned limit; + if (limit++ < 12) + pr_err("Error in %s: no current.\n", __func__); + return; + } + if (!tr || !data) { + static unsigned limit; + if (limit++ < 12) + pr_err("%s: no tr or data\n", __func__); + return; + } + __trace_special(tr, data, type, addr, arg); +} -- cgit v1.2.3 From bd8ac686c73c7e925fcfe0b02dc4e7b947127864 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:20:57 +0200 Subject: ftrace: mmiotrace, updates here is a patch that makes mmiotrace work almost well within the tracing framework. The patch applies on top of my previous patch. I have my own output formatting in place now. 
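To give an idea of the new text format: mmio_print_rw() below emits one
event per line, so a traced 4-byte read would come out roughly as

    R 4 12.345678 1 0xd0000000 0xcafebabe 0x0 0

that is: width, timestamp, map id, physical address, value, program
counter and pid. (The values in this sample line are made up.)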
Summary of changes:
- fix the NULL dereference that was due to not calling tracing_reset()
- add print_line() callback into struct tracer
- implement print_line() for mmiotrace, producing up-to-spec text
- add my output header, but that is not really called in the right place
- rewrote the main structs in mmiotrace
- added two new trace entry types: TRACE_MMIO_RW and TRACE_MMIO_MAP
- made some functions in trace.c non-static
- check current==NULL in tracing_generic_entry_update()
- fix(?) comparison in trace_seq_printf()

Things seem to work fine except a few issues. Markers (text lines injected
into the mmiotrace log) are missing; I did not feel like hacking them in
before we have variable-length entries. My output header is printed only
for the 'trace' file, but not for 'trace_pipe'. For some reason, despite my
quick fix, iter->trace is NULL in print_trace_line() when called from the
'trace_pipe' file, which means I don't get proper output formatting.

I only tried by loading nouveau.ko, which just detects the card, and that
is traced fine. I didn't try further. Map, two reads and unmap. Works
perfectly.

I am missing the information about overflows; I'd prefer to have a counter
for lost events. I didn't try, but I guess currently there is no way of
knowing when it overflows?

So, not too far from being fully operational, it seems :-)
And looking at the diffstat, there are also some 700-900 lines of user
space code that just became obsolete.

Signed-off-by: Ingo Molnar
Signed-off-by: Thomas Gleixner
---
 arch/x86/Kconfig.debug | 2 +-
 arch/x86/kernel/mmiotrace/mmio-mod.c | 140 ++++++++++----------------------
 include/linux/mmiotrace.h | 85 ++++++--------------
 kernel/trace/trace.c | 34 ++++++++
 kernel/trace/trace.h | 14 ++++
 kernel/trace/trace_mmiotrace.c | 151 ++++++++++++++++++++++++++++-------
 6 files changed, 238 insertions(+), 188 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 7e4b8494078e..1d6de0d67f99 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -173,7 +173,7 @@ config MMIOTRACE_HOOKS
 
 config MMIOTRACE
 	bool "Memory mapped IO tracing"
-	depends on DEBUG_KERNEL && RELAY
+	depends on DEBUG_KERNEL
 	select TRACING
 	select MMIOTRACE_HOOKS
 	default y
diff --git a/arch/x86/kernel/mmiotrace/mmio-mod.c b/arch/x86/kernel/mmiotrace/mmio-mod.c
index c7a67d7e482b..62abc281a512 100644
--- a/arch/x86/kernel/mmiotrace/mmio-mod.c
+++ b/arch/x86/kernel/mmiotrace/mmio-mod.c
@@ -37,11 +37,6 @@
 #define NAME "mmiotrace: "
 
-/* This app's relay channel files will appear in /debug/mmio-trace */
-static const char APP_DIR[] = "mmio-trace";
-/* the marker injection file in /debug/APP_DIR */
-static const char MARKER_FILE[] = "mmio-marker";
-
 struct trap_reason {
 	unsigned long addr;
 	unsigned long ip;
@@ -56,18 +51,15 @@ struct remap_trace {
 	unsigned long id;
 };
 
-static const size_t subbuf_size = 256*1024;
-
 /* Accessed per-cpu. */
 static DEFINE_PER_CPU(struct trap_reason, pf_reason);
-static DEFINE_PER_CPU(struct mm_io_header_rw, cpu_trace);
+static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace);
 
 #if 0 /* XXX: no way gather this info anymore */
 /* Access to this is not per-cpu. */
 static DEFINE_PER_CPU(atomic_t, dropped);
 #endif
 
-static struct dentry *dir;
 static struct dentry *marker_file;
 
 static DEFINE_MUTEX(mmiotrace_mutex);
@@ -82,24 +74,21 @@ static LIST_HEAD(trace_list); /* struct remap_trace */
  * and trace_lock.
  * - Routines depending on is_enabled() must take trace_lock.
  * - trace_list users must hold trace_lock.
- * - is_enabled() guarantees that chan is valid. + * - is_enabled() guarantees that mmio_trace_record is allowed. * - pre/post callbacks assume the effect of is_enabled() being true. */ /* module parameters */ -static unsigned int n_subbufs = 32*4; static unsigned long filter_offset; static int nommiotrace; static int ISA_trace; static int trace_pc; -module_param(n_subbufs, uint, 0); module_param(filter_offset, ulong, 0); module_param(nommiotrace, bool, 0); module_param(ISA_trace, bool, 0); module_param(trace_pc, bool, 0); -MODULE_PARM_DESC(n_subbufs, "Number of 256kB buffers, default 128."); MODULE_PARM_DESC(filter_offset, "Start address of traced mappings."); MODULE_PARM_DESC(nommiotrace, "Disable actual MMIO tracing."); MODULE_PARM_DESC(ISA_trace, "Do not exclude the low ISA range."); @@ -110,6 +99,7 @@ static bool is_enabled(void) return atomic_read(&mmiotrace_enabled); } +#if 0 /* XXX: needs rewrite */ /* * Write callback for the debugfs entry: * Read a marker and write it to the mmio trace log @@ -145,6 +135,7 @@ static ssize_t write_marker(struct file *file, const char __user *buffer, kfree(event); return len; } +#endif static void print_pte(unsigned long address) { @@ -198,9 +189,10 @@ static void pre(struct kmmio_probe *p, struct pt_regs *regs, unsigned long addr) { struct trap_reason *my_reason = &get_cpu_var(pf_reason); - struct mm_io_header_rw *my_trace = &get_cpu_var(cpu_trace); + struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace); const unsigned long instptr = instruction_pointer(regs); const enum reason_type type = get_ins_type(instptr); + struct remap_trace *trace = p->user_data; /* it doesn't make sense to have more than one active trace per cpu */ if (my_reason->active_traces) @@ -212,23 +204,17 @@ static void pre(struct kmmio_probe *p, struct pt_regs *regs, my_reason->addr = addr; my_reason->ip = instptr; - my_trace->header.type = MMIO_MAGIC; - my_trace->header.pid = 0; - my_trace->header.data_len = sizeof(struct mm_io_rw); - my_trace->rw.address = addr; - /* - * struct remap_trace *trace = p->user_data; - * phys = addr - trace->probe.addr + trace->phys; - */ + my_trace->phys = addr - trace->probe.addr + trace->phys; + my_trace->map_id = trace->id; /* * Only record the program counter when requested. * It may taint clean-room reverse engineering. 
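  * (This is why the trace_pc module parameter defaults to off.)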
*/ if (trace_pc) - my_trace->rw.pc = instptr; + my_trace->pc = instptr; else - my_trace->rw.pc = 0; + my_trace->pc = 0; /* * XXX: the timestamp recorded will be *after* the tracing has been @@ -238,28 +224,25 @@ static void pre(struct kmmio_probe *p, struct pt_regs *regs, switch (type) { case REG_READ: - my_trace->header.type |= - (MMIO_READ << MMIO_OPCODE_SHIFT) | - (get_ins_mem_width(instptr) << MMIO_WIDTH_SHIFT); + my_trace->opcode = MMIO_READ; + my_trace->width = get_ins_mem_width(instptr); break; case REG_WRITE: - my_trace->header.type |= - (MMIO_WRITE << MMIO_OPCODE_SHIFT) | - (get_ins_mem_width(instptr) << MMIO_WIDTH_SHIFT); - my_trace->rw.value = get_ins_reg_val(instptr, regs); + my_trace->opcode = MMIO_WRITE; + my_trace->width = get_ins_mem_width(instptr); + my_trace->value = get_ins_reg_val(instptr, regs); break; case IMM_WRITE: - my_trace->header.type |= - (MMIO_WRITE << MMIO_OPCODE_SHIFT) | - (get_ins_mem_width(instptr) << MMIO_WIDTH_SHIFT); - my_trace->rw.value = get_ins_imm_val(instptr); + my_trace->opcode = MMIO_WRITE; + my_trace->width = get_ins_mem_width(instptr); + my_trace->value = get_ins_imm_val(instptr); break; default: { unsigned char *ip = (unsigned char *)instptr; - my_trace->header.type |= - (MMIO_UNKNOWN_OP << MMIO_OPCODE_SHIFT); - my_trace->rw.value = (*ip) << 16 | *(ip + 1) << 8 | + my_trace->opcode = MMIO_UNKNOWN_OP; + my_trace->width = 0; + my_trace->value = (*ip) << 16 | *(ip + 1) << 8 | *(ip + 2); } } @@ -271,7 +254,7 @@ static void post(struct kmmio_probe *p, unsigned long condition, struct pt_regs *regs) { struct trap_reason *my_reason = &get_cpu_var(pf_reason); - struct mm_io_header_rw *my_trace = &get_cpu_var(cpu_trace); + struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace); /* this should always return the active_trace count to 0 */ my_reason->active_traces--; @@ -282,20 +265,13 @@ static void post(struct kmmio_probe *p, unsigned long condition, switch (my_reason->type) { case REG_READ: - my_trace->rw.value = get_ins_reg_val(my_reason->ip, regs); + my_trace->value = get_ins_reg_val(my_reason->ip, regs); break; default: break; } - /* - * XXX: Several required values are ignored: - * - mapping id - * - program counter - * Also the address should be physical, not virtual. - */ - mmio_trace_record(my_trace->header.type, my_trace->rw.address, - my_trace->rw.value); + mmio_trace_rw(my_trace); put_cpu_var(cpu_trace); put_cpu_var(pf_reason); } @@ -305,21 +281,11 @@ static void ioremap_trace_core(unsigned long offset, unsigned long size, { static atomic_t next_id; struct remap_trace *trace = kmalloc(sizeof(*trace), GFP_KERNEL); - struct mm_io_header_map event = { - .header = { - .type = MMIO_MAGIC | - (MMIO_PROBE << MMIO_OPCODE_SHIFT), - .sec = 0, - .nsec = 0, - .pid = 0, - .data_len = sizeof(struct mm_io_map) - }, - .map = { - .phys = offset, - .addr = (unsigned long)addr, - .len = size, - .pc = 0 - } + struct mmiotrace_map map = { + .phys = offset, + .virt = (unsigned long)addr, + .len = size, + .opcode = MMIO_PROBE }; if (!trace) { @@ -338,15 +304,13 @@ static void ioremap_trace_core(unsigned long offset, unsigned long size, .phys = offset, .id = atomic_inc_return(&next_id) }; + map.map_id = trace->id; spin_lock_irq(&trace_lock); if (!is_enabled()) goto not_enabled; - /* - * XXX: Insufficient data recorded! 
- */ - mmio_trace_record(event.header.type, event.map.addr, event.map.len); + mmio_trace_mapping(&map); list_add_tail(&trace->list, &trace_list); if (!nommiotrace) register_kmmio_probe(&trace->probe); @@ -369,21 +333,11 @@ mmiotrace_ioremap(unsigned long offset, unsigned long size, void __iomem *addr) static void iounmap_trace_core(volatile void __iomem *addr) { - struct mm_io_header_map event = { - .header = { - .type = MMIO_MAGIC | - (MMIO_UNPROBE << MMIO_OPCODE_SHIFT), - .sec = 0, - .nsec = 0, - .pid = 0, - .data_len = sizeof(struct mm_io_map) - }, - .map = { - .phys = 0, - .addr = (unsigned long)addr, - .len = 0, - .pc = 0 - } + struct mmiotrace_map map = { + .phys = 0, + .virt = (unsigned long)addr, + .len = 0, + .opcode = MMIO_UNPROBE }; struct remap_trace *trace; struct remap_trace *tmp; @@ -404,8 +358,8 @@ static void iounmap_trace_core(volatile void __iomem *addr) break; } } - mmio_trace_record(event.header.type, event.map.addr, - found_trace ? found_trace->id : -1); + map.map_id = (found_trace) ? found_trace->id : -1; + mmio_trace_mapping(&map); not_enabled: spin_unlock_irq(&trace_lock); @@ -448,10 +402,12 @@ static void clear_trace_list(void) } } +#if 0 /* XXX: out of order */ static struct file_operations fops_marker = { .owner = THIS_MODULE, .write = write_marker }; +#endif void enable_mmiotrace(void) { @@ -464,9 +420,9 @@ void enable_mmiotrace(void) #if 0 /* XXX: tracing does not support text entries */ marker_file = debugfs_create_file("marker", 0660, dir, NULL, &fops_marker); -#endif if (!marker_file) pr_err(NAME "marker file creation failed.\n"); +#endif if (nommiotrace) pr_info(NAME "MMIO tracing disabled.\n"); @@ -502,17 +458,3 @@ void disable_mmiotrace(void) out: mutex_unlock(&mmiotrace_mutex); } - -int __init init_mmiotrace(void) -{ - pr_debug(NAME "load...\n"); - if (n_subbufs < 2) - return -EINVAL; - - dir = debugfs_create_dir(APP_DIR, NULL); - if (!dir) { - pr_err(NAME "Couldn't create relay app directory.\n"); - return -ENOMEM; - } - return 0; -} diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h index 579b3b06c90e..c88a9c197d22 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -54,73 +54,38 @@ static inline void mmiotrace_iounmap(volatile void __iomem *addr) } #endif /* CONFIG_MMIOTRACE_HOOKS */ -/* in kernel/trace/trace_mmiotrace.c */ -extern int __init init_mmiotrace(void); -extern void enable_mmiotrace(void); -extern void disable_mmiotrace(void); -extern void mmio_trace_record(u32 type, unsigned long addr, unsigned long arg); - -#endif /* __KERNEL__ */ - - -/* - * If you change anything here, you must bump MMIO_VERSION. - * This is the relay data format for user space. 
- */ -#define MMIO_VERSION 0x04 - -/* mm_io_header.type */ -#define MMIO_OPCODE_MASK 0xff -#define MMIO_OPCODE_SHIFT 0 -#define MMIO_WIDTH_MASK 0xff00 -#define MMIO_WIDTH_SHIFT 8 -#define MMIO_MAGIC (0x6f000000 | (MMIO_VERSION<<16)) -#define MMIO_MAGIC_MASK 0xffff0000 - -enum mm_io_opcode { /* payload type: */ - MMIO_READ = 0x1, /* struct mm_io_rw */ - MMIO_WRITE = 0x2, /* struct mm_io_rw */ - MMIO_PROBE = 0x3, /* struct mm_io_map */ - MMIO_UNPROBE = 0x4, /* struct mm_io_map */ +enum mm_io_opcode { + MMIO_READ = 0x1, /* struct mmiotrace_rw */ + MMIO_WRITE = 0x2, /* struct mmiotrace_rw */ + MMIO_PROBE = 0x3, /* struct mmiotrace_map */ + MMIO_UNPROBE = 0x4, /* struct mmiotrace_map */ MMIO_MARKER = 0x5, /* raw char data */ - MMIO_UNKNOWN_OP = 0x6, /* struct mm_io_rw */ + MMIO_UNKNOWN_OP = 0x6, /* struct mmiotrace_rw */ }; -struct mm_io_header { - __u32 type; /* see MMIO_* macros above */ - __u32 sec; /* timestamp */ - __u32 nsec; - __u32 pid; /* PID of the process, or 0 for kernel core */ - __u16 data_len; /* length of the following payload */ +struct mmiotrace_rw { + unsigned long phys; /* PCI address of register */ + unsigned long value; + unsigned long pc; /* optional program counter */ + int map_id; + unsigned char opcode; /* one of MMIO_{READ,WRITE,UNKNOWN_OP} */ + unsigned char width; /* size of register access in bytes */ }; -struct mm_io_rw { - __u64 address; /* virtual address of register */ - __u64 value; - __u64 pc; /* optional program counter */ +struct mmiotrace_map { + unsigned long phys; /* base address in PCI space */ + unsigned long virt; /* base virtual address */ + unsigned long len; /* mapping size */ + int map_id; + unsigned char opcode; /* MMIO_PROBE or MMIO_UNPROBE */ }; -struct mm_io_map { - __u64 phys; /* base address in PCI space */ - __u64 addr; /* base virtual address */ - __u64 len; /* mapping size */ - __u64 pc; /* optional program counter */ -}; - - -/* - * These structures are used to allow a single relay_write() - * call to write a full packet. 
- */ - -struct mm_io_header_rw { - struct mm_io_header header; - struct mm_io_rw rw; -} __attribute__((packed)); +/* in kernel/trace/trace_mmiotrace.c */ +extern void enable_mmiotrace(void); +extern void disable_mmiotrace(void); +extern void mmio_trace_rw(struct mmiotrace_rw *rw); +extern void mmio_trace_mapping(struct mmiotrace_map *map); -struct mm_io_header_map { - struct mm_io_header header; - struct mm_io_map map; -} __attribute__((packed)); +#endif /* __KERNEL__ */ #endif /* MMIOTRACE_H */ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3271916ff033..d14fe49e9638 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -831,6 +831,40 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, trace_function(tr, data, ip, parent_ip, flags); } +#ifdef CONFIG_MMIOTRACE +void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_array_cpu *data, + struct mmiotrace_rw *rw) +{ + struct trace_entry *entry; + unsigned long irq_flags; + + spin_lock_irqsave(&data->lock, irq_flags); + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, 0); + entry->type = TRACE_MMIO_RW; + entry->mmiorw = *rw; + spin_unlock_irqrestore(&data->lock, irq_flags); + + trace_wake_up(); +} + +void __trace_mmiotrace_map(struct trace_array *tr, struct trace_array_cpu *data, + struct mmiotrace_map *map) +{ + struct trace_entry *entry; + unsigned long irq_flags; + + spin_lock_irqsave(&data->lock, irq_flags); + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, 0); + entry->type = TRACE_MMIO_MAP; + entry->mmiomap = *map; + spin_unlock_irqrestore(&data->lock, irq_flags); + + trace_wake_up(); +} +#endif + void __trace_stack(struct trace_array *tr, struct trace_array_cpu *data, unsigned long flags, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c460e85e94ed..0ef9ef74c806 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -5,6 +5,7 @@ #include #include #include +#include enum trace_type { __TRACE_FIRST_TYPE = 0, @@ -14,6 +15,8 @@ enum trace_type { TRACE_WAKE, TRACE_STACK, TRACE_SPECIAL, + TRACE_MMIO_RW, + TRACE_MMIO_MAP, __TRACE_LAST_TYPE }; @@ -75,6 +78,8 @@ struct trace_entry { struct ctx_switch_entry ctx; struct special_entry special; struct stack_entry stack; + struct mmiotrace_rw mmiorw; + struct mmiotrace_map mmiomap; }; }; @@ -255,6 +260,15 @@ extern unsigned long ftrace_update_tot_cnt; extern int DYN_FTRACE_TEST_NAME(void); #endif +#ifdef CONFIG_MMIOTRACE +extern void __trace_mmiotrace_rw(struct trace_array *tr, + struct trace_array_cpu *data, + struct mmiotrace_rw *rw); +extern void __trace_mmiotrace_map(struct trace_array *tr, + struct trace_array_cpu *data, + struct mmiotrace_map *map); +#endif + #ifdef CONFIG_FTRACE_STARTUP_TEST #ifdef CONFIG_FTRACE extern int trace_selftest_startup_function(struct tracer *trace, diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index e4dd03cc5aa6..3a12b1ad0c63 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -11,19 +11,26 @@ #include "trace.h" -extern void -__trace_special(void *__tr, void *__data, - unsigned long arg1, unsigned long arg2, unsigned long arg3); - static struct trace_array *mmio_trace_array; +static void mmio_reset_data(struct trace_array *tr) +{ + int cpu; + + tr->time_start = ftrace_now(tr->cpu); + + for_each_online_cpu(cpu) + tracing_reset(tr->data[cpu]); +} static void mmio_trace_init(struct trace_array *tr) { pr_debug("in %s\n", __func__); mmio_trace_array = tr; - if (tr->ctrl) + if 
(tr->ctrl) { + mmio_reset_data(tr); enable_mmiotrace(); + } } static void mmio_trace_reset(struct trace_array *tr) @@ -31,15 +38,110 @@ static void mmio_trace_reset(struct trace_array *tr) pr_debug("in %s\n", __func__); if (tr->ctrl) disable_mmiotrace(); + mmio_reset_data(tr); + mmio_trace_array = NULL; } static void mmio_trace_ctrl_update(struct trace_array *tr) { pr_debug("in %s\n", __func__); - if (tr->ctrl) + if (tr->ctrl) { + mmio_reset_data(tr); enable_mmiotrace(); - else + } else { disable_mmiotrace(); + } +} + +/* XXX: This is not called for trace_pipe file! */ +void mmio_print_header(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + trace_seq_printf(s, "VERSION broken 20070824\n"); + /* TODO: print /proc/bus/pci/devices contents as PCIDEV lines */ +} + +static int mmio_print_rw(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + struct mmiotrace_rw *rw = &entry->mmiorw; + struct trace_seq *s = &iter->seq; + unsigned long long t = ns2usecs(entry->t); + unsigned long usec_rem = do_div(t, 1000000ULL); + unsigned secs = (unsigned long)t; + int ret = 1; + + switch (entry->mmiorw.opcode) { + case MMIO_READ: + ret = trace_seq_printf(s, + "R %d %lu.%06lu %d 0x%lx 0x%lx 0x%lx %d\n", + rw->width, secs, usec_rem, rw->map_id, rw->phys, + rw->value, rw->pc, entry->pid); + break; + case MMIO_WRITE: + ret = trace_seq_printf(s, + "W %d %lu.%06lu %d 0x%lx 0x%lx 0x%lx %d\n", + rw->width, secs, usec_rem, rw->map_id, rw->phys, + rw->value, rw->pc, entry->pid); + break; + case MMIO_UNKNOWN_OP: + ret = trace_seq_printf(s, + "UNKNOWN %lu.%06lu %d 0x%lx %02x,%02x,%02x 0x%lx %d\n", + secs, usec_rem, rw->map_id, rw->phys, + (rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff, + (rw->value >> 0) & 0xff, rw->pc, entry->pid); + break; + default: + ret = trace_seq_printf(s, "rw what?\n"); + break; + } + if (ret) + return 1; + return 0; +} + +static int mmio_print_map(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + struct mmiotrace_map *m = &entry->mmiomap; + struct trace_seq *s = &iter->seq; + unsigned long long t = ns2usecs(entry->t); + unsigned long usec_rem = do_div(t, 1000000ULL); + unsigned secs = (unsigned long)t; + int ret = 1; + + switch (entry->mmiorw.opcode) { + case MMIO_PROBE: + ret = trace_seq_printf(s, + "MAP %lu.%06lu %d 0x%lx 0x%lx 0x%lx 0x%lx %d\n", + secs, usec_rem, m->map_id, m->phys, m->virt, m->len, + 0UL, entry->pid); + break; + case MMIO_UNPROBE: + ret = trace_seq_printf(s, + "UNMAP %lu.%06lu %d 0x%lx %d\n", + secs, usec_rem, m->map_id, 0UL, entry->pid); + break; + default: + ret = trace_seq_printf(s, "map what?\n"); + break; + } + if (ret) + return 1; + return 0; +} + +/* return 0 to abort printing without consuming current entry in pipe mode */ +static int mmio_print_line(struct trace_iterator *iter) +{ + switch (iter->ent->type) { + case TRACE_MMIO_RW: + return mmio_print_rw(iter); + case TRACE_MMIO_MAP: + return mmio_print_map(iter); + default: + return 1; /* ignore unknown entries */ + } } static struct tracer mmio_tracer __read_mostly = @@ -47,38 +149,31 @@ static struct tracer mmio_tracer __read_mostly = .name = "mmiotrace", .init = mmio_trace_init, .reset = mmio_trace_reset, + .open = mmio_print_header, .ctrl_update = mmio_trace_ctrl_update, + .print_line = mmio_print_line, }; __init static int init_mmio_trace(void) { - int ret = init_mmiotrace(); - if (ret) - return ret; return register_tracer(&mmio_tracer); } device_initcall(init_mmio_trace); -void mmio_trace_record(u32 type, unsigned long addr, unsigned long 
arg)
+void mmio_trace_rw(struct mmiotrace_rw *rw)
 {
 	struct trace_array *tr = mmio_trace_array;
 	struct trace_array_cpu *data = tr->data[smp_processor_id()];
+	__trace_mmiotrace_rw(tr, data, rw);
+}
 
-	if (!current || current->pid == 0) {
-		/*
-		 * XXX: This is a problem. We need to able to record, no
-		 * matter what. tracing_generic_entry_update() would crash.
-		 */
-		static unsigned limit;
-		if (limit++ < 12)
-			pr_err("Error in %s: no current.\n", __func__);
-		return;
-	}
-	if (!tr || !data) {
-		static unsigned limit;
-		if (limit++ < 12)
-			pr_err("%s: no tr or data\n", __func__);
-		return;
-	}
-	__trace_special(tr, data, type, addr, arg);
+void mmio_trace_mapping(struct mmiotrace_map *map)
+{
+	struct trace_array *tr = mmio_trace_array;
+	struct trace_array_cpu *data;
+
+	preempt_disable();
+	data = tr->data[smp_processor_id()];
+	__trace_mmiotrace_map(tr, data, map);
+	preempt_enable();
 }
--
cgit v1.2.3

From 138295373ccf7625fcb0218dfea114837983bc39 Mon Sep 17 00:00:00 2001
From: Pekka Paalanen
Date: Mon, 12 May 2008 21:20:58 +0200
Subject: ftrace: mmiotrace update, #2

another weekend, another patch. This should apply on top of my previous
patch from March 23rd.

Summary of changes:
- Print PCI device list in output header
- work around recursive probe hits on SMP
- refactor dis/arm_kmmio_fault_page() and add check for page levels
- remove un/reference_kmmio(); the die notifier hook is registered
  permanently into the list
- explicitly check for single stepping in die notifier callback

I have tested this version on my UP Athlon64 desktop with Nouveau, and
on an SMP Core 2 Duo laptop with the proprietary nvidia driver. Both
systems are 64-bit.

One previously unknown bug crept into daylight: the ftrace framework's
output routines print the first entry last after the buffer has wrapped
around.

The most important regressions compared to non-ftrace mmiotrace at this
time are:
- failure of the trace_pipe file
- illegal lines in the output file
- no indication when data is lost due to a full buffer

Personally, I'd like to see these three solved before submitting to
mainline. Other issues may come up once we know when we lose events.

Signed-off-by: Pekka Paalanen
Signed-off-by: Ingo Molnar
Signed-off-by: Thomas Gleixner
---
 arch/x86/kernel/mmiotrace/kmmio.c | 186 ++++++++++++++--------------------
 arch/x86/kernel/mmiotrace/mmio-mod.c | 3 -
 include/linux/mmiotrace.h | 2 -
 kernel/trace/trace_mmiotrace.c | 47 ++++++++-
 4 files changed, 120 insertions(+), 118 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/mmiotrace/kmmio.c b/arch/x86/kernel/mmiotrace/kmmio.c
index efb467933087..cd0d95fe4fe6 100644
--- a/arch/x86/kernel/mmiotrace/kmmio.c
+++ b/arch/x86/kernel/mmiotrace/kmmio.c
@@ -5,15 +5,12 @@
  * 2008 Pekka Paalanen
  */
 
-#include
 #include
 #include
 #include
 #include
 #include
-#include
 #include
-#include
 #include
 #include
 #include
@@ -22,10 +19,9 @@
 #include
 #include
 #include
-#include
 #include
-#include
-
+#include
+#include
 #include
 
 #define KMMIO_PAGE_HASH_BITS 4
@@ -57,14 +53,9 @@ struct kmmio_context {
 	int active;
 };
 
-static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val,
-								void *args);
-
-static DEFINE_MUTEX(kmmio_init_mutex);
 static DEFINE_SPINLOCK(kmmio_lock);
 
-/* These are protected by kmmio_lock */
-static int kmmio_initialized;
+/* Protected by kmmio_lock */
 unsigned int kmmio_count;
 
 /* Read-protected by RCU, write-protected by kmmio_lock.
*/ @@ -79,60 +70,6 @@ static struct list_head *kmmio_page_list(unsigned long page) /* Accessed per-cpu */ static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx); -/* protected by kmmio_init_mutex */ -static struct notifier_block nb_die = { - .notifier_call = kmmio_die_notifier -}; - -/** - * Makes sure kmmio is initialized and usable. - * This must be called before any other kmmio function defined here. - * May sleep. - */ -void reference_kmmio(void) -{ - mutex_lock(&kmmio_init_mutex); - spin_lock_irq(&kmmio_lock); - if (!kmmio_initialized) { - int i; - for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) - INIT_LIST_HEAD(&kmmio_page_table[i]); - if (register_die_notifier(&nb_die)) - BUG(); - } - kmmio_initialized++; - spin_unlock_irq(&kmmio_lock); - mutex_unlock(&kmmio_init_mutex); -} -EXPORT_SYMBOL_GPL(reference_kmmio); - -/** - * Clean up kmmio after use. This must be called for every call to - * reference_kmmio(). All probes registered after the corresponding - * reference_kmmio() must have been unregistered when calling this. - * May sleep. - */ -void unreference_kmmio(void) -{ - bool unreg = false; - - mutex_lock(&kmmio_init_mutex); - spin_lock_irq(&kmmio_lock); - - if (kmmio_initialized == 1) { - BUG_ON(is_kmmio_active()); - unreg = true; - } - kmmio_initialized--; - BUG_ON(kmmio_initialized < 0); - spin_unlock_irq(&kmmio_lock); - - if (unreg) - unregister_die_notifier(&nb_die); /* calls sync_rcu() */ - mutex_unlock(&kmmio_init_mutex); -} -EXPORT_SYMBOL(unreference_kmmio); - /* * this is basically a dynamic stabbing problem: * Could use the existing prio tree code or @@ -167,58 +104,56 @@ static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) return NULL; } -/** Mark the given page as not present. Access to it will trigger a fault. */ -static void arm_kmmio_fault_page(unsigned long page, int *page_level) +static void set_page_present(unsigned long addr, bool present, int *pglevel) { - unsigned long address = page & PAGE_MASK; + pteval_t pteval; + pmdval_t pmdval; int level; - pte_t *pte = lookup_address(address, &level); + pmd_t *pmd; + pte_t *pte = lookup_address(addr, &level); if (!pte) { - pr_err("kmmio: Error in %s: no pte for page 0x%08lx\n", - __func__, page); + pr_err("kmmio: no pte for page 0x%08lx\n", addr); return; } - if (level == PG_LEVEL_2M) { - pmd_t *pmd = (pmd_t *)pte; - set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_PRESENT)); - } else { - /* PG_LEVEL_4K */ - set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT)); + if (pglevel) + *pglevel = level; + + switch (level) { + case PG_LEVEL_2M: + pmd = (pmd_t *)pte; + pmdval = pmd_val(*pmd) & ~_PAGE_PRESENT; + if (present) + pmdval |= _PAGE_PRESENT; + set_pmd(pmd, __pmd(pmdval)); + break; + + case PG_LEVEL_4K: + pteval = pte_val(*pte) & ~_PAGE_PRESENT; + if (present) + pteval |= _PAGE_PRESENT; + set_pte_atomic(pte, __pte(pteval)); + break; + + default: + pr_err("kmmio: unexpected page level 0x%x.\n", level); + return; } - if (page_level) - *page_level = level; + __flush_tlb_one(addr); +} - __flush_tlb_one(page); +/** Mark the given page as not present. Access to it will trigger a fault. */ +static void arm_kmmio_fault_page(unsigned long page, int *page_level) +{ + set_page_present(page & PAGE_MASK, false, page_level); } /** Mark the given page as present. 
*/ static void disarm_kmmio_fault_page(unsigned long page, int *page_level) { - unsigned long address = page & PAGE_MASK; - int level; - pte_t *pte = lookup_address(address, &level); - - if (!pte) { - pr_err("kmmio: Error in %s: no pte for page 0x%08lx\n", - __func__, page); - return; - } - - if (level == PG_LEVEL_2M) { - pmd_t *pmd = (pmd_t *)pte; - set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_PRESENT)); - } else { - /* PG_LEVEL_4K */ - set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT)); - } - - if (page_level) - *page_level = level; - - __flush_tlb_one(page); + set_page_present(page & PAGE_MASK, true, page_level); } /* @@ -240,6 +175,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) { struct kmmio_context *ctx; struct kmmio_fault_page *faultpage; + int ret = 0; /* default to fault not handled */ /* * Preemption is now disabled to prevent process switch during @@ -257,21 +193,35 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) /* * Either this page fault is not caused by kmmio, or * another CPU just pulled the kmmio probe from under - * our feet. In the latter case all hell breaks loose. + * our feet. The latter case should not be possible. */ goto no_kmmio; } ctx = &get_cpu_var(kmmio_ctx); if (ctx->active) { + disarm_kmmio_fault_page(faultpage->page, NULL); + if (addr == ctx->addr) { + /* + * On SMP we sometimes get recursive probe hits on the + * same address. Context is already saved, fall out. + */ + pr_debug("kmmio: duplicate probe hit on CPU %d, for " + "address 0x%08lx.\n", + smp_processor_id(), addr); + ret = 1; + goto no_kmmio_ctx; + } /* * Prevent overwriting already in-flight context. - * If this page fault really was due to kmmio trap, - * all hell breaks loose. + * This should not happen, let's hope disarming at least + * prevents a panic. */ pr_emerg("kmmio: recursive probe hit on CPU %d, " "for address 0x%08lx. 
Ignoring.\n", smp_processor_id(), addr); + pr_emerg("kmmio: previous hit was at 0x%08lx.\n", + ctx->addr); goto no_kmmio_ctx; } ctx->active++; @@ -302,14 +252,14 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) */ put_cpu_var(kmmio_ctx); - return 1; + return 1; /* fault handled */ no_kmmio_ctx: put_cpu_var(kmmio_ctx); no_kmmio: rcu_read_unlock(); preempt_enable_no_resched(); - return 0; /* page fault not handled by kmmio */ + return ret; } /* @@ -322,8 +272,11 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) int ret = 0; struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); - if (!ctx->active) + if (!ctx->active) { + pr_debug("kmmio: spurious debug trap on CPU %d.\n", + smp_processor_id()); goto out; + } if (ctx->probe && ctx->probe->post_handler) ctx->probe->post_handler(ctx->probe, condition, regs); @@ -525,9 +478,22 @@ static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val, { struct die_args *arg = args; - if (val == DIE_DEBUG) + if (val == DIE_DEBUG && (arg->err & DR_STEP)) if (post_kmmio_handler(arg->err, arg->regs) == 1) return NOTIFY_STOP; return NOTIFY_DONE; } + +static struct notifier_block nb_die = { + .notifier_call = kmmio_die_notifier +}; + +static int __init init_kmmio(void) +{ + int i; + for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) + INIT_LIST_HEAD(&kmmio_page_table[i]); + return register_die_notifier(&nb_die); +} +fs_initcall(init_kmmio); /* should be before device_initcall() */ diff --git a/arch/x86/kernel/mmiotrace/mmio-mod.c b/arch/x86/kernel/mmiotrace/mmio-mod.c index 62abc281a512..8256546d49bf 100644 --- a/arch/x86/kernel/mmiotrace/mmio-mod.c +++ b/arch/x86/kernel/mmiotrace/mmio-mod.c @@ -415,8 +415,6 @@ void enable_mmiotrace(void) if (is_enabled()) goto out; - reference_kmmio(); - #if 0 /* XXX: tracing does not support text entries */ marker_file = debugfs_create_file("marker", 0660, dir, NULL, &fops_marker); @@ -448,7 +446,6 @@ void disable_mmiotrace(void) spin_unlock_irq(&trace_lock); clear_trace_list(); /* guarantees: no more kmmio callbacks */ - unreference_kmmio(); if (marker_file) { debugfs_remove(marker_file); marker_file = NULL; diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h index c88a9c197d22..dd6b64b160fc 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -31,8 +31,6 @@ static inline int is_kmmio_active(void) return kmmio_count; } -extern void reference_kmmio(void); -extern void unreference_kmmio(void); extern int register_kmmio_probe(struct kmmio_probe *p); extern void unregister_kmmio_probe(struct kmmio_probe *p); diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 3a12b1ad0c63..361472b5788c 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -8,6 +8,7 @@ #include #include +#include #include "trace.h" @@ -53,12 +54,52 @@ static void mmio_trace_ctrl_update(struct trace_array *tr) } } +static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) +{ + int ret = 0; + int i; + resource_size_t start, end; + const struct pci_driver *drv = pci_dev_driver(dev); + + /* XXX: incomplete checks for trace_seq_printf() return value */ + ret += trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x", + dev->bus->number, dev->devfn, + dev->vendor, dev->device, dev->irq); + /* + * XXX: is pci_resource_to_user() appropriate, since we are + * supposed to interpret the __ioremap() phys_addr argument based on + * these printed values? 
+ */ + for (i = 0; i < 7; i++) { + pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); + ret += trace_seq_printf(s, " %llx", + (unsigned long long)(start | + (dev->resource[i].flags & PCI_REGION_FLAG_MASK))); + } + for (i = 0; i < 7; i++) { + pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); + ret += trace_seq_printf(s, " %llx", + dev->resource[i].start < dev->resource[i].end ? + (unsigned long long)(end - start) + 1 : 0); + } + if (drv) + ret += trace_seq_printf(s, " %s\n", drv->name); + else + ret += trace_seq_printf(s, " \n"); + return ret; +} + /* XXX: This is not called for trace_pipe file! */ -void mmio_print_header(struct trace_iterator *iter) +static void mmio_print_header(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; - trace_seq_printf(s, "VERSION broken 20070824\n"); - /* TODO: print /proc/bus/pci/devices contents as PCIDEV lines */ + struct pci_dev *dev = NULL; + + trace_seq_printf(s, "VERSION 20070824\n"); + + for_each_pci_dev(dev) + mmio_print_pcidev(s, dev); + /* XXX: return value? What if header is very long? */ } static int mmio_print_rw(struct trace_iterator *iter) -- cgit v1.2.3 From 970e6fa03885f32cc43e42cb08c73a5f54cd8bd9 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:21:03 +0200 Subject: mmiotrace: code style cleanups From c2da03771e29159627c5c7b9509ec70bce9f91ee Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 28 Apr 2008 21:25:22 +0300 Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar --- arch/x86/mm/kmmio.c | 4 ++-- arch/x86/mm/mmio-mod.c | 7 +++---- arch/x86/mm/testmmiotrace.c | 2 +- include/linux/mmiotrace.h | 6 +----- 4 files changed, 7 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 3ad27b8504a5..6a92d9111b64 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -17,10 +17,10 @@ #include #include #include -#include +#include #include #include -#include +#include #include #include diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 1f77d8532037..a8d2a0019da4 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include @@ -418,11 +418,10 @@ static void enter_uniprocessor(void) for_each_cpu_mask(cpu, downed_cpus) { err = cpu_down(cpu); - if (!err) { + if (!err) pr_info(NAME "CPU%d is down.\n", cpu); - } else { + else pr_err(NAME "Error taking CPU%d down: %d\n", cpu, err); - } } if (num_online_cpus() > 1) pr_warning(NAME "multiple CPUs still online, " diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c index cfa60b227c8d..d877c5b423ef 100644 --- a/arch/x86/mm/testmmiotrace.c +++ b/arch/x86/mm/testmmiotrace.c @@ -2,7 +2,7 @@ * Written by Pekka Paalanen, 2008 */ #include -#include +#include #define MODULE_NAME "testmmiotrace" diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h index dd6b64b160fc..de8e91258da7 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -1,9 +1,7 @@ #ifndef MMIOTRACE_H #define MMIOTRACE_H -#include - -#ifdef __KERNEL__ +#include #include @@ -84,6 +82,4 @@ extern void disable_mmiotrace(void); extern void mmio_trace_rw(struct mmiotrace_rw *rw); extern void mmio_trace_mapping(struct mmiotrace_map *map); -#endif /* __KERNEL__ */ - #endif /* MMIOTRACE_H */ -- cgit v1.2.3 From dee310d0adf41019aca476052ac3085ff286d9be Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:21:03 +0200 Subject: x86 mmiotrace: 
use resource_size_t for phys addresses Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar --- arch/x86/mm/mmio-mod.c | 11 ++++++----- include/linux/mmiotrace.h | 14 +++++++------- kernel/trace/trace_mmiotrace.c | 20 ++++++++++++-------- 3 files changed, 25 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 278998c1998f..3b04a0126121 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -48,7 +48,7 @@ struct trap_reason { struct remap_trace { struct list_head list; struct kmmio_probe probe; - unsigned long phys; + resource_size_t phys; unsigned long id; }; @@ -275,7 +275,7 @@ static void post(struct kmmio_probe *p, unsigned long condition, put_cpu_var(pf_reason); } -static void ioremap_trace_core(unsigned long offset, unsigned long size, +static void ioremap_trace_core(resource_size_t offset, unsigned long size, void __iomem *addr) { static atomic_t next_id; @@ -319,13 +319,14 @@ not_enabled: spin_unlock_irq(&trace_lock); } -void -mmiotrace_ioremap(unsigned long offset, unsigned long size, void __iomem *addr) +void mmiotrace_ioremap(resource_size_t offset, unsigned long size, + void __iomem *addr) { if (!is_enabled()) /* recheck and proper locking in *_core() */ return; - pr_debug(NAME "ioremap_*(0x%lx, 0x%lx) = %p\n", offset, size, addr); + pr_debug(NAME "ioremap_*(0x%llx, 0x%lx) = %p\n", + (unsigned long long)offset, size, addr); if ((filter_offset) && (offset != filter_offset)) return; ioremap_trace_core(offset, size, addr); diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h index de8e91258da7..5cbbc374e945 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -2,7 +2,6 @@ #define MMIOTRACE_H #include - #include struct kmmio_probe; @@ -37,14 +36,15 @@ extern int kmmio_handler(struct pt_regs *regs, unsigned long addr); /* Called from ioremap.c */ #ifdef CONFIG_MMIOTRACE -extern void -mmiotrace_ioremap(unsigned long offset, unsigned long size, void __iomem *addr); +extern void mmiotrace_ioremap(resource_size_t offset, unsigned long size, + void __iomem *addr); extern void mmiotrace_iounmap(volatile void __iomem *addr); #else -static inline void -mmiotrace_ioremap(unsigned long offset, unsigned long size, void __iomem *addr) +static inline void mmiotrace_ioremap(resource_size_t offset, + unsigned long size, void __iomem *addr) { } + static inline void mmiotrace_iounmap(volatile void __iomem *addr) { } @@ -60,7 +60,7 @@ enum mm_io_opcode { }; struct mmiotrace_rw { - unsigned long phys; /* PCI address of register */ + resource_size_t phys; /* PCI address of register */ unsigned long value; unsigned long pc; /* optional program counter */ int map_id; @@ -69,7 +69,7 @@ struct mmiotrace_rw { }; struct mmiotrace_map { - unsigned long phys; /* base address in PCI space */ + resource_size_t phys; /* base address in PCI space */ unsigned long virt; /* base virtual address */ unsigned long len; /* mapping size */ int map_id; diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 3c1dacdc2d85..b13dc19dcbb4 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -184,20 +184,23 @@ static int mmio_print_rw(struct trace_iterator *iter) switch (entry->mmiorw.opcode) { case MMIO_READ: ret = trace_seq_printf(s, - "R %d %lu.%06lu %d 0x%lx 0x%lx 0x%lx %d\n", - rw->width, secs, usec_rem, rw->map_id, rw->phys, + "R %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", + rw->width, secs, usec_rem, rw->map_id, + (unsigned long 
long)rw->phys, rw->value, rw->pc, 0); break; case MMIO_WRITE: ret = trace_seq_printf(s, - "W %d %lu.%06lu %d 0x%lx 0x%lx 0x%lx %d\n", - rw->width, secs, usec_rem, rw->map_id, rw->phys, + "W %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", + rw->width, secs, usec_rem, rw->map_id, + (unsigned long long)rw->phys, rw->value, rw->pc, 0); break; case MMIO_UNKNOWN_OP: ret = trace_seq_printf(s, - "UNKNOWN %lu.%06lu %d 0x%lx %02x,%02x,%02x 0x%lx %d\n", - secs, usec_rem, rw->map_id, rw->phys, + "UNKNOWN %lu.%06lu %d 0x%llx %02x,%02x,%02x 0x%lx %d\n", + secs, usec_rem, rw->map_id, + (unsigned long long)rw->phys, (rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff, (rw->value >> 0) & 0xff, rw->pc, 0); break; @@ -223,8 +226,9 @@ static int mmio_print_map(struct trace_iterator *iter) switch (entry->mmiorw.opcode) { case MMIO_PROBE: ret = trace_seq_printf(s, - "MAP %lu.%06lu %d 0x%lx 0x%lx 0x%lx 0x%lx %d\n", - secs, usec_rem, m->map_id, m->phys, m->virt, m->len, + "MAP %lu.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n", + secs, usec_rem, m->map_id, + (unsigned long long)m->phys, m->virt, m->len, 0UL, 0); break; case MMIO_UNPROBE: -- cgit v1.2.3 From a50445d76c22a34ae149704ea5adaef171c8acb7 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:21:03 +0200 Subject: mmiotrace: rename kmmio_probe::user_data to :private. Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar --- arch/x86/mm/mmio-mod.c | 4 ++-- include/linux/mmiotrace.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 3b04a0126121..ed0e0e90b3ef 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -191,7 +191,7 @@ static void pre(struct kmmio_probe *p, struct pt_regs *regs, struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace); const unsigned long instptr = instruction_pointer(regs); const enum reason_type type = get_ins_type(instptr); - struct remap_trace *trace = p->user_data; + struct remap_trace *trace = p->private; /* it doesn't make sense to have more than one active trace per cpu */ if (my_reason->active_traces) @@ -299,7 +299,7 @@ static void ioremap_trace_core(resource_size_t offset, unsigned long size, .len = size, .pre_handler = pre, .post_handler = post, - .user_data = trace + .private = trace }, .phys = offset, .id = atomic_inc_return(&next_id) diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h index 5cbbc374e945..61d19e1b7a0b 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -18,7 +18,7 @@ struct kmmio_probe { unsigned long len; /* length of the probe region */ kmmio_pre_handler_t pre_handler; /* Called before addr is executed. */ kmmio_post_handler_t post_handler; /* Called after addr is executed */ - void *user_data; + void *private; }; /* kmmio is active by some kmmio_probes? 
*/ -- cgit v1.2.3 From 42fdfa238a23643226910acf922ea930b3286032 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 May 2008 23:14:51 +0200 Subject: namespacecheck: more kernel/printk.c fixes [ Stephen Rothwell : build fix ] Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/kernel.h | 6 ------ kernel/printk.c | 13 ------------- 2 files changed, 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 792bf0aa779b..f2a668c195bf 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -184,9 +184,6 @@ asmlinkage int vprintk(const char *fmt, va_list args) __attribute__ ((format (printf, 1, 0))); asmlinkage int printk(const char * fmt, ...) __attribute__ ((format (printf, 1, 2))) __cold; -extern int log_buf_get_len(void); -extern int log_buf_read(int idx); -extern int log_buf_copy(char *dest, int idx, int len); extern int printk_ratelimit_jiffies; extern int printk_ratelimit_burst; @@ -202,9 +199,6 @@ static inline int vprintk(const char *s, va_list args) { return 0; } static inline int printk(const char *s, ...) __attribute__ ((format (printf, 1, 2))); static inline int __cold printk(const char *s, ...) { return 0; } -static inline int log_buf_get_len(void) { return 0; } -static inline int log_buf_read(int idx) { return 0; } -static inline int log_buf_copy(char *dest, int idx, int len) { return 0; } static inline int printk_ratelimit(void) { return 0; } static inline int __printk_ratelimit(int ratelimit_jiffies, \ int ratelimit_burst) { return 0; } diff --git a/kernel/printk.c b/kernel/printk.c index b620e3d96136..55d16e57499a 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -267,19 +267,6 @@ int log_buf_copy(char *dest, int idx, int len) return ret; } -/* - * Extract a single character from the log buffer. - */ -static int log_buf_read(int idx) -{ - char ret; - - if (log_buf_copy(&ret, idx, 1) == 1) - return ret; - else - return -1; -} - /* * Commands to do_syslog: * -- cgit v1.2.3 From 63687a528c39a67c1a213cdffa09feb0e6af9dbe Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 12 May 2008 15:44:41 +0200 Subject: x86: move tracedata to RODATA .. allowing it to be write-protected just as other read-only data under CONFIG_DEBUG_RODATA. Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/vmlinux_32.lds.S | 7 ------- arch/x86/kernel/vmlinux_64.lds.S | 7 ------- drivers/base/power/trace.c | 2 +- include/asm-generic/vmlinux.lds.h | 14 ++++++++++++++ include/asm-x86/resume-trace.h | 2 +- include/linux/resume-trace.h | 2 +- 6 files changed, 17 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index ce5ed083a1e9..2674f5796275 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -60,13 +60,6 @@ SECTIONS BUG_TABLE :text - . = ALIGN(4); - .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) { - __tracedata_start = .; - *(.tracedata) - __tracedata_end = .; - } - RODATA /* writeable */ diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index fad3674b06a5..687041bfbae5 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -53,13 +53,6 @@ SECTIONS RODATA - . = ALIGN(4); - .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) { - __tracedata_start = .; - *(.tracedata) - __tracedata_end = .; - } - . 
= ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */ /* Data */ .data : AT(ADDR(.data) - LOAD_OFFSET) { diff --git a/drivers/base/power/trace.c b/drivers/base/power/trace.c index 2b4b392dcbc1..87a7f1d02578 100644 --- a/drivers/base/power/trace.c +++ b/drivers/base/power/trace.c @@ -153,7 +153,7 @@ EXPORT_SYMBOL(set_trace_device); * it's not any guarantee, but it's a high _likelihood_ that * the match is valid). */ -void generate_resume_trace(void *tracedata, unsigned int user) +void generate_resume_trace(const void *tracedata, unsigned int user) { unsigned short lineno = *(unsigned short *)tracedata; const char *file = *(const char **)(tracedata + 2); diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index f054778e916c..f1992dc5c424 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -93,6 +93,8 @@ VMLINUX_SYMBOL(__end_rio_route_ops) = .; \ } \ \ + TRACEDATA \ + \ /* Kernel symbol table: Normal symbols */ \ __ksymtab : AT(ADDR(__ksymtab) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start___ksymtab) = .; \ @@ -318,6 +320,18 @@ __stop___bug_table = .; \ } +#ifdef CONFIG_PM_TRACE +#define TRACEDATA \ + . = ALIGN(4); \ + .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) { \ + __tracedata_start = .; \ + *(.tracedata) \ + __tracedata_end = .; \ + } +#else +#define TRACEDATA +#endif + #define NOTES \ .notes : AT(ADDR(.notes) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start_notes) = .; \ diff --git a/include/asm-x86/resume-trace.h b/include/asm-x86/resume-trace.h index 2557514d7ef6..8d9f0b41ee86 100644 --- a/include/asm-x86/resume-trace.h +++ b/include/asm-x86/resume-trace.h @@ -6,7 +6,7 @@ #define TRACE_RESUME(user) \ do { \ if (pm_trace_enabled) { \ - void *tracedata; \ + const void *tracedata; \ asm volatile(_ASM_MOV_UL " $1f,%0\n" \ ".section .tracedata,\"a\"\n" \ "1:\t.word %c1\n\t" \ diff --git a/include/linux/resume-trace.h b/include/linux/resume-trace.h index f3f4f28c6960..c9ba2fdf807d 100644 --- a/include/linux/resume-trace.h +++ b/include/linux/resume-trace.h @@ -8,7 +8,7 @@ extern int pm_trace_enabled; struct device; extern void set_trace_device(struct device *); -extern void generate_resume_trace(void *tracedata, unsigned int user); +extern void generate_resume_trace(const void *tracedata, unsigned int user); #define TRACE_DEVICE(dev) do { \ if (pm_trace_enabled) \ -- cgit v1.2.3 From 962cf36c5bf6d2840b8d66ee9a606fae2f540bbd Mon Sep 17 00:00:00 2001 From: "Carlos R. 
Mafra" Date: Thu, 15 May 2008 11:15:37 -0300 Subject: Remove argument from open_softirq which is always NULL As git-grep shows, open_softirq() is always called with the last argument being NULL: block/blk-core.c: open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL); kernel/hrtimer.c: open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq, NULL); kernel/rcuclassic.c: open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL); kernel/rcupreempt.c: open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL); kernel/sched.c: open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); kernel/softirq.c: open_softirq(TASKLET_SOFTIRQ, tasklet_action, NULL); kernel/softirq.c: open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL); kernel/timer.c: open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); net/core/dev.c: open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL); net/core/dev.c: open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL); This observation has already been made by Matthew Wilcox in June 2002 (http://www.cs.helsinki.fi/linux/linux-kernel/2002-25/0687.html): "I notice that none of the current softirq routines use the data element passed to them." and the situation hasn't changed since then. So it appears we can safely remove that extra argument to save 128 (54) bytes of kernel data (text). Signed-off-by: Carlos R. Mafra Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- block/blk-core.c | 2 +- include/linux/interrupt.h | 3 +-- kernel/hrtimer.c | 2 +- kernel/rcuclassic.c | 2 +- kernel/rcupreempt.c | 2 +- kernel/sched.c | 2 +- kernel/softirq.c | 7 +++---- kernel/timer.c | 2 +- net/core/dev.c | 4 ++-- 9 files changed, 12 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 6a9cc0d22a61..75fdc65136e8 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2048,7 +2048,7 @@ int __init blk_dev_init(void) for_each_possible_cpu(i) INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i)); - open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL); + open_softirq(BLOCK_SOFTIRQ, blk_done_softirq); register_hotcpu_notifier(&blk_cpu_notifier); return 0; diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index f1fc7470d26c..a86186dd0474 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -285,12 +285,11 @@ enum struct softirq_action { void (*action)(struct softirq_action *); - void *data; }; asmlinkage void do_softirq(void); asmlinkage void __do_softirq(void); -extern void open_softirq(int nr, void (*action)(struct softirq_action*), void *data); +extern void open_softirq(int nr, void (*action)(struct softirq_action *)); extern void softirq_init(void); #define __raise_softirq_irqoff(nr) do { or_softirq_pending(1UL << (nr)); } while (0) extern void raise_softirq_irqoff(unsigned int nr); diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 421be5fe5cc7..861b4088092a 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1669,7 +1669,7 @@ void __init hrtimers_init(void) (void *)(long)smp_processor_id()); register_cpu_notifier(&hrtimers_nb); #ifdef CONFIG_HIGH_RES_TIMERS - open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq, NULL); + open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq); #endif } diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index f4ffbd0f306f..f6e01f3ae9c6 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -529,7 +529,7 @@ static void __cpuinit rcu_online_cpu(int cpu) rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp); rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp); - open_softirq(RCU_SOFTIRQ,
rcu_process_callbacks, NULL); + open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); } static int __cpuinit rcu_cpu_notify(struct notifier_block *self, diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index e1cdf196a515..9dd827db359f 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -1125,7 +1125,7 @@ void __init __rcu_init(void) for_each_online_cpu(cpu) rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long) cpu); - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL); + open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); } /* diff --git a/kernel/sched.c b/kernel/sched.c index cfa222a91539..56ea3a203a5a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8154,7 +8154,7 @@ void __init sched_init(void) #endif #ifdef CONFIG_SMP - open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); + open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); #endif #ifdef CONFIG_RT_MUTEXES diff --git a/kernel/softirq.c b/kernel/softirq.c index 36e061740047..059256874e9b 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -347,9 +347,8 @@ void raise_softirq(unsigned int nr) local_irq_restore(flags); } -void open_softirq(int nr, void (*action)(struct softirq_action*), void *data) +void open_softirq(int nr, void (*action)(struct softirq_action *)) { - softirq_vec[nr].data = data; softirq_vec[nr].action = action; } @@ -503,8 +502,8 @@ void __init softirq_init(void) &per_cpu(tasklet_hi_vec, cpu).head; } - open_softirq(TASKLET_SOFTIRQ, tasklet_action, NULL); - open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL); + open_softirq(TASKLET_SOFTIRQ, tasklet_action); + open_softirq(HI_SOFTIRQ, tasklet_hi_action); } static int ksoftirqd(void * __bind_cpu) diff --git a/kernel/timer.c b/kernel/timer.c index ceacc6626572..b4da888497fa 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1502,7 +1502,7 @@ void __init init_timers(void) BUG_ON(err == NOTIFY_BAD); register_cpu_notifier(&timers_nb); - open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); + open_softirq(TIMER_SOFTIRQ, run_timer_softirq); } /** diff --git a/net/core/dev.c b/net/core/dev.c index 582963077877..cf0e16731dc7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4563,8 +4563,8 @@ static int __init net_dev_init(void) dev_boot_phase = 0; - open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL); - open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL); + open_softirq(NET_TX_SOFTIRQ, net_tx_action); + open_softirq(NET_RX_SOFTIRQ, net_rx_action); hotcpu_notifier(dev_cpu_callback, 0); dst_init(); -- cgit v1.2.3 From e9197bf0114661195bee35e7795cfc42164d9b2c Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Wed, 14 May 2008 08:15:10 -0700 Subject: x86 boot: remove some unused extern function declarations Remove three extern declarations for routines that don't exist. Fix a typo in a comment. 
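A side note on why such dead declarations go unnoticed: an extern declaration with no definition compiles cleanly, and an error surfaces only at link time if something actually calls it. A minimal sketch, reusing one of the prototypes this patch deletes (the caller below is invented for illustration):

	extern void efi_map_memmap(void);	/* declared, but defined nowhere */

	void show_stale_extern(void)
	{
		efi_map_memmap();	/* only a call like this fails, at link time */
	}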
Signed-off-by: Paul Jackson Signed-off-by: Ingo Molnar --- arch/x86/mm/numa_64.c | 2 +- include/linux/efi.h | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index c5066d519e5d..afb07ffb931d 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -233,7 +233,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, else bootmap_start = round_up(start, PAGE_SIZE); /* - * SMP_CAHCE_BYTES could be enough, but init_bootmem_node like + * SMP_CACHE_BYTES could be enough, but init_bootmem_node like * to use that to align to PAGE_SIZE */ bootmap = early_node_mem(nodeid, bootmap_start, end, diff --git a/include/linux/efi.h b/include/linux/efi.h index a5f359a7ad0e..807373d467f7 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -287,7 +287,6 @@ efi_guid_unparse(efi_guid_t *guid, char *out) extern void efi_init (void); extern void *efi_get_pal_addr (void); extern void efi_map_pal_code (void); -extern void efi_map_memmap(void); extern void efi_memmap_walk (efi_freemem_callback_t callback, void *arg); extern void efi_gettimeofday (struct timespec *ts); extern void efi_enter_virtual_mode (void); /* switch EFI to virtual mode, if possible */ @@ -295,14 +294,11 @@ extern u64 efi_get_iobase (void); extern u32 efi_mem_type (unsigned long phys_addr); extern u64 efi_mem_attributes (unsigned long phys_addr); extern u64 efi_mem_attribute (unsigned long phys_addr, unsigned long size); -extern int efi_mem_attribute_range (unsigned long phys_addr, unsigned long size, - u64 attr); extern int __init efi_uart_console_only (void); extern void efi_initialize_iomem_resources(struct resource *code_resource, struct resource *data_resource, struct resource *bss_resource); extern unsigned long efi_get_time(void); extern int efi_set_rtc_mmss(unsigned long nowtime); -extern int is_available_memory(efi_memory_desc_t * md); extern struct efi_memory_map memmap; /** -- cgit v1.2.3 From c801ed3860fe2f84281d4cae4c3e6ca87e81e890 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Wed, 14 May 2008 08:15:23 -0700 Subject: x86 boot: simplify pageblock_bits enum declaration The use of #defines with '##' pre-processor concatenation is a useful way to form several symbol names with a common pattern. But when there is just a single name obtained from that #define, it's just obfuscation. Better to just write the plain symbol name, as is. The following patch is a result of my wasting ten minutes looking through the kernel to figure out what 'PB_migrate_end' meant, and forgetting what I came to do, by the time I figured out that the #define PB_range macro defined it. 
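To make the obfuscation concrete: preprocessing the single PB_range() use yields exactly the enumerators the patch below open-codes. A standalone sketch of the expansion:

	/* PB_range(PB_migrate, 3) expands to: */
	enum pageblock_bits {
		PB_migrate,
		PB_migrate_end = (PB_migrate + 3) - 1,	/* 3 bits for migrate types */
		NR_PAGEBLOCK_BITS
	};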
Signed-off-by: Paul Jackson Signed-off-by: Ingo Molnar --- include/linux/pageblock-flags.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index e875905f7b12..e8c06122be36 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -25,13 +25,11 @@ #include -/* Macro to aid the definition of ranges of bits */ -#define PB_range(name, required_bits) \ - name, name ## _end = (name + required_bits) - 1 - /* Bit indices that affect a whole block of pages */ enum pageblock_bits { - PB_range(PB_migrate, 3), /* 3 bits required for migrate types */ + PB_migrate, + PB_migrate_end = PB_migrate + 3 - 1, + /* 3 bits required for migrate types */ NR_PAGEBLOCK_BITS }; -- cgit v1.2.3 From 41c52c0db9607e59f90da7da5309489fa06e887f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 22 May 2008 11:46:33 -0400 Subject: ftrace: set_ftrace_notrace feature While debugging latencies in the RT kernel, I found that it would be nice to be able to filter functions away from the trace rather than just to filter on functions. I added a new interface to the debugfs tracing directory called set_ftrace_notrace. When dynamic ftrace is enabled, this lets you filter away functions so that they will not be recorded in the trace. It is similar to adding 'notrace' to those functions, but without recompiling the kernel. Here's how set_ftrace_filter and set_ftrace_notrace interact. Remember, if set_ftrace_filter is set, it removes all functions from the trace except for those listed in the set_ftrace_filter. set_ftrace_notrace will prevent those functions from being traced. If you were to set the same function in both set_ftrace_filter and set_ftrace_notrace, then you would end up with an empty trace. The set of functions to trace is: if set_ftrace_filter is empty, all functions not in set_ftrace_notrace; otherwise, the set in set_ftrace_filter that is not in set_ftrace_notrace. Signed-off-by: Steven Rostedt Signed-off-by: Thomas Gleixner --- include/linux/ftrace.h | 1 + kernel/trace/ftrace.c | 170 +++++++++++++++++++++++++++++++++++++------------ 2 files changed, 131 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 922e23d0196f..ffbbd54a720e 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -48,6 +48,7 @@ enum { FTRACE_FL_FAILED = (1 << 1), FTRACE_FL_FILTER = (1 << 2), FTRACE_FL_ENABLED = (1 << 3), + FTRACE_FL_NOTRACE = (1 << 4), }; struct dyn_ftrace { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 89bd9a6f52ec..2552454609cf 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -170,7 +170,7 @@ static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu); static DEFINE_SPINLOCK(ftrace_shutdown_lock); static DEFINE_MUTEX(ftraced_lock); -static DEFINE_MUTEX(ftrace_filter_lock); +static DEFINE_MUTEX(ftrace_regex_lock); struct ftrace_page { struct ftrace_page *next; @@ -337,13 +337,12 @@ static void __ftrace_replace_code(struct dyn_ftrace *rec, unsigned char *old, unsigned char *new, int enable) { - unsigned long ip; + unsigned long ip, fl; int failed; ip = rec->ip; if (ftrace_filtered && enable) { - unsigned long fl; /* * If filtering is on: * @@ -356,13 +355,16 @@ __ftrace_replace_code(struct dyn_ftrace *rec, * If this record is not set to be filtered * and it is not enabled do nothing.
* + * If this record is set not to trace then + * do nothing. + * * If this record is not set to be filtered and * it is enabled, disable it. */ fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED); if ((fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) || - (fl == 0)) + (fl == 0) || (rec->flags & FTRACE_FL_NOTRACE)) return; /* @@ -380,9 +382,17 @@ __ftrace_replace_code(struct dyn_ftrace *rec, } } else { - if (enable) + if (enable) { + /* + * If this record is set not to trace and is + * not enabled, do nothing. + */ + fl = rec->flags & (FTRACE_FL_NOTRACE | FTRACE_FL_ENABLED); + if (fl == FTRACE_FL_NOTRACE) + return; + new = ftrace_call_replace(ip, FTRACE_ADDR); - else + } else old = ftrace_call_replace(ip, FTRACE_ADDR); if (enable) { @@ -721,6 +731,7 @@ static int __init ftrace_dyn_table_alloc(void) enum { FTRACE_ITER_FILTER = (1 << 0), FTRACE_ITER_CONT = (1 << 1), + FTRACE_ITER_NOTRACE = (1 << 2), }; #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ @@ -754,7 +765,9 @@ t_next(struct seq_file *m, void *v, loff_t *pos) rec = &iter->pg->records[iter->idx++]; if ((rec->flags & FTRACE_FL_FAILED) || ((iter->flags & FTRACE_ITER_FILTER) && - !(rec->flags & FTRACE_FL_FILTER))) { + !(rec->flags & FTRACE_FL_FILTER)) || + ((iter->flags & FTRACE_ITER_NOTRACE) && + !(rec->flags & FTRACE_FL_NOTRACE))) { rec = NULL; goto retry; } @@ -847,22 +860,24 @@ int ftrace_avail_release(struct inode *inode, struct file *file) return 0; } -static void ftrace_filter_reset(void) +static void ftrace_filter_reset(int enable) { struct ftrace_page *pg; struct dyn_ftrace *rec; + unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; unsigned i; /* keep kstop machine from running */ preempt_disable(); - ftrace_filtered = 0; + if (enable) + ftrace_filtered = 0; pg = ftrace_pages_start; while (pg) { for (i = 0; i < pg->index; i++) { rec = &pg->records[i]; if (rec->flags & FTRACE_FL_FAILED) continue; - rec->flags &= ~FTRACE_FL_FILTER; + rec->flags &= ~type; } pg = pg->next; } @@ -870,7 +885,7 @@ static void ftrace_filter_reset(void) } static int -ftrace_filter_open(struct inode *inode, struct file *file) +ftrace_regex_open(struct inode *inode, struct file *file, int enable) { struct ftrace_iterator *iter; int ret = 0; @@ -882,15 +897,16 @@ ftrace_filter_open(struct inode *inode, struct file *file) if (!iter) return -ENOMEM; - mutex_lock(&ftrace_filter_lock); + mutex_lock(&ftrace_regex_lock); if ((file->f_mode & FMODE_WRITE) && !(file->f_flags & O_APPEND)) - ftrace_filter_reset(); + ftrace_filter_reset(enable); if (file->f_mode & FMODE_READ) { iter->pg = ftrace_pages_start; iter->pos = -1; - iter->flags = FTRACE_ITER_FILTER; + iter->flags = enable ? 
FTRACE_ITER_FILTER : + FTRACE_ITER_NOTRACE; ret = seq_open(file, &show_ftrace_seq_ops); if (!ret) { @@ -900,13 +916,25 @@ ftrace_filter_open(struct inode *inode, struct file *file) kfree(iter); } else file->private_data = iter; - mutex_unlock(&ftrace_filter_lock); + mutex_unlock(&ftrace_regex_lock); return ret; } +static int +ftrace_filter_open(struct inode *inode, struct file *file) +{ + return ftrace_regex_open(inode, file, 1); +} + +static int +ftrace_notrace_open(struct inode *inode, struct file *file) +{ + return ftrace_regex_open(inode, file, 0); +} + static ssize_t -ftrace_filter_read(struct file *file, char __user *ubuf, +ftrace_regex_read(struct file *file, char __user *ubuf, size_t cnt, loff_t *ppos) { if (file->f_mode & FMODE_READ) @@ -916,7 +944,7 @@ ftrace_filter_read(struct file *file, char __user *ubuf, } static loff_t -ftrace_filter_lseek(struct file *file, loff_t offset, int origin) +ftrace_regex_lseek(struct file *file, loff_t offset, int origin) { loff_t ret; @@ -936,13 +964,14 @@ enum { }; static void -ftrace_match(unsigned char *buff, int len) +ftrace_match(unsigned char *buff, int len, int enable) { char str[KSYM_SYMBOL_LEN]; char *search = NULL; struct ftrace_page *pg; struct dyn_ftrace *rec; int type = MATCH_FULL; + unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; unsigned i, match = 0, search_len = 0; for (i = 0; i < len; i++) { @@ -966,7 +995,8 @@ ftrace_match(unsigned char *buff, int len) /* keep kstop machine from running */ preempt_disable(); - ftrace_filtered = 1; + if (enable) + ftrace_filtered = 1; pg = ftrace_pages_start; while (pg) { for (i = 0; i < pg->index; i++) { @@ -997,7 +1027,7 @@ ftrace_match(unsigned char *buff, int len) break; } if (matched) - rec->flags |= FTRACE_FL_FILTER; + rec->flags |= flag; } pg = pg->next; } @@ -1005,8 +1035,8 @@ ftrace_match(unsigned char *buff, int len) } static ssize_t -ftrace_filter_write(struct file *file, const char __user *ubuf, - size_t cnt, loff_t *ppos) +ftrace_regex_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos, int enable) { struct ftrace_iterator *iter; char ch; @@ -1016,7 +1046,7 @@ ftrace_filter_write(struct file *file, const char __user *ubuf, if (!cnt || cnt < 0) return 0; - mutex_lock(&ftrace_filter_lock); + mutex_lock(&ftrace_regex_lock); if (file->f_mode & FMODE_READ) { struct seq_file *m = file->private_data; @@ -1045,7 +1075,6 @@ ftrace_filter_write(struct file *file, const char __user *ubuf, cnt--; } - if (isspace(ch)) { file->f_pos += read; ret = read; @@ -1072,7 +1101,7 @@ ftrace_filter_write(struct file *file, const char __user *ubuf, if (isspace(ch)) { iter->filtered++; iter->buffer[iter->buffer_idx] = 0; - ftrace_match(iter->buffer, iter->buffer_idx); + ftrace_match(iter->buffer, iter->buffer_idx, enable); iter->buffer_idx = 0; } else iter->flags |= FTRACE_ITER_CONT; @@ -1082,11 +1111,39 @@ ftrace_filter_write(struct file *file, const char __user *ubuf, ret = read; out: - mutex_unlock(&ftrace_filter_lock); + mutex_unlock(&ftrace_regex_lock); return ret; } +static ssize_t +ftrace_filter_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return ftrace_regex_write(file, ubuf, cnt, ppos, 1); +} + +static ssize_t +ftrace_notrace_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return ftrace_regex_write(file, ubuf, cnt, ppos, 0); +} + +static void +ftrace_set_regex(unsigned char *buf, int len, int reset, int enable) +{ + if (unlikely(ftrace_disabled)) + return; + + 
mutex_lock(&ftrace_regex_lock); + if (reset) + ftrace_filter_reset(enable); + if (buf) + ftrace_match(buf, len, enable); + mutex_unlock(&ftrace_regex_lock); +} + /** * ftrace_set_filter - set a function to filter on in ftrace * @buf - the string that holds the function filter text. @@ -1098,24 +1155,31 @@ ftrace_filter_write(struct file *file, const char __user *ubuf, */ void ftrace_set_filter(unsigned char *buf, int len, int reset) { - if (unlikely(ftrace_disabled)) - return; + ftrace_set_regex(buf, len, reset, 1); +} - mutex_lock(&ftrace_filter_lock); - if (reset) - ftrace_filter_reset(); - if (buf) - ftrace_match(buf, len); - mutex_unlock(&ftrace_filter_lock); +/** + * ftrace_set_notrace - set a function to not trace in ftrace + * @buf - the string that holds the function notrace text. + * @len - the length of the string. + * @reset - non zero to reset all filters before applying this filter. + * + * Notrace Filters denote which functions should not be enabled when tracing + * is enabled. If @buf is NULL and reset is set, all functions will be enabled + * for tracing. + */ +void ftrace_set_notrace(unsigned char *buf, int len, int reset) +{ + ftrace_set_regex(buf, len, reset, 0); } static int -ftrace_filter_release(struct inode *inode, struct file *file) +ftrace_regex_release(struct inode *inode, struct file *file, int enable) { struct seq_file *m = (struct seq_file *)file->private_data; struct ftrace_iterator *iter; - mutex_lock(&ftrace_filter_lock); + mutex_lock(&ftrace_regex_lock); if (file->f_mode & FMODE_READ) { iter = m->private; @@ -1126,7 +1190,7 @@ ftrace_filter_release(struct inode *inode, struct file *file) if (iter->buffer_idx) { iter->filtered++; iter->buffer[iter->buffer_idx] = 0; - ftrace_match(iter->buffer, iter->buffer_idx); + ftrace_match(iter->buffer, iter->buffer_idx, enable); } mutex_lock(&ftrace_sysctl_lock); @@ -1137,10 +1201,22 @@ ftrace_filter_release(struct inode *inode, struct file *file) mutex_unlock(&ftrace_sysctl_lock); kfree(iter); - mutex_unlock(&ftrace_filter_lock); + mutex_unlock(&ftrace_regex_lock); return 0; } +static int +ftrace_filter_release(struct inode *inode, struct file *file) +{ + return ftrace_regex_release(inode, file, 1); +} + +static int +ftrace_notrace_release(struct inode *inode, struct file *file) +{ + return ftrace_regex_release(inode, file, 0); +} + static struct file_operations ftrace_avail_fops = { .open = ftrace_avail_open, .read = seq_read, @@ -1150,12 +1226,20 @@ static struct file_operations ftrace_avail_fops = { static struct file_operations ftrace_filter_fops = { .open = ftrace_filter_open, - .read = ftrace_filter_read, + .read = ftrace_regex_read, .write = ftrace_filter_write, - .llseek = ftrace_filter_lseek, + .llseek = ftrace_regex_lseek, .release = ftrace_filter_release, }; +static struct file_operations ftrace_notrace_fops = { + .open = ftrace_notrace_open, + .read = ftrace_regex_read, + .write = ftrace_notrace_write, + .llseek = ftrace_regex_lseek, + .release = ftrace_notrace_release, +}; + /** * ftrace_force_update - force an update to all recording ftrace functions * @@ -1239,6 +1323,12 @@ static __init int ftrace_init_debugfs(void) if (!entry) pr_warning("Could not create debugfs " "'set_ftrace_filter' entry\n"); + + entry = debugfs_create_file("set_ftrace_notrace", 0644, d_tracer, + NULL, &ftrace_notrace_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'set_ftrace_notrace' entry\n"); return 0; } -- cgit v1.2.3 From 9e124fe16ff24746d6de5a2ad685266d7bce0e08 Mon Sep 17 00:00:00 2001 From: Markus 
Armbruster Date: Mon, 26 May 2008 23:31:07 +0100 Subject: xen: Enable console tty by default in domU if it's not a dummy Without console= arguments on the kernel command line, the first console to register becomes enabled and the preferred console (the one behind /dev/console). This is normally tty (assuming CONFIG_VT_CONSOLE is enabled, which it commonly is). This is okay as long as tty is a useful console. But unless we have the PV framebuffer, and it is enabled for this domain, tty0 in domU is merely a dummy. In that case, we want the preferred console to be the Xen console hvc0, and we want it without having to fiddle with the kernel command line. Commit b8c2d3dfbc117dff26058fbac316b8acfc2cb5f7 did that for us. Since we now have the PV framebuffer, we want to enable and prefer tty again, but only when PVFB is enabled. But even then we still want to enable the Xen console as well. Problem: when tty registers, we can't yet know whether the PVFB is enabled. By the time we can know (xenstore is up), the console setup game is over. Solution: enable console tty by default, but keep hvc as the preferred console. Change the preferred console to tty when PVFB probes successfully, unless we've been given console kernel parameters. Signed-off-by: Markus Armbruster Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Thomas Gleixner --- arch/x86/xen/enlighten.c | 4 +++- drivers/video/xen-fbfront.c | 25 +++++++++++++++++++++++++ include/linux/console.h | 2 ++ kernel/printk.c | 3 +++ 4 files changed, 33 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 1b4b5fa498b3..6cfb708408e9 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1256,8 +1256,10 @@ asmlinkage void __init xen_start_kernel(void) ? __pa(xen_start_info->mod_start) : 0; boot_params.hdr.ramdisk_size = xen_start_info->mod_len; - if (!is_initial_xendomain()) + if (!is_initial_xendomain()) { + add_preferred_console("tty", 0, NULL); add_preferred_console("hvc", 0, NULL); + } /* Start the world */ start_kernel(); diff --git a/drivers/video/xen-fbfront.c b/drivers/video/xen-fbfront.c index 619a6f8d65a2..4e10876e62fc 100644 --- a/drivers/video/xen-fbfront.c +++ b/drivers/video/xen-fbfront.c @@ -18,6 +18,7 @@ * frame buffer.
*/ +#include #include #include #include @@ -48,6 +49,7 @@ struct xenfb_info { static u32 xenfb_mem_len = XENFB_WIDTH * XENFB_HEIGHT * XENFB_DEPTH / 8; +static void xenfb_make_preferred_console(void); static int xenfb_remove(struct xenbus_device *); static void xenfb_init_shared_page(struct xenfb_info *); static int xenfb_connect_backend(struct xenbus_device *, struct xenfb_info *); @@ -348,6 +350,7 @@ static int __devinit xenfb_probe(struct xenbus_device *dev, if (ret < 0) goto error; + xenfb_make_preferred_console(); return 0; error_nomem: @@ -358,6 +361,28 @@ static int __devinit xenfb_probe(struct xenbus_device *dev, return ret; } +static __devinit void +xenfb_make_preferred_console(void) +{ + struct console *c; + + if (console_set_on_cmdline) + return; + + acquire_console_sem(); + for (c = console_drivers; c; c = c->next) { + if (!strcmp(c->name, "tty") && c->index == 0) + break; + } + release_console_sem(); + if (c) { + unregister_console(c); + c->flags |= CON_CONSDEV; + c->flags &= ~CON_PRINTBUFFER; /* don't print again */ + register_console(c); + } +} + static int xenfb_resume(struct xenbus_device *dev) { struct xenfb_info *info = dev->dev.driver_data; diff --git a/include/linux/console.h b/include/linux/console.h index a4f27fbdf549..248e6e3b9b73 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -108,6 +108,8 @@ struct console { struct console *next; }; +extern int console_set_on_cmdline; + extern int add_preferred_console(char *name, int idx, char *options); extern int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options); extern void register_console(struct console *); diff --git a/kernel/printk.c b/kernel/printk.c index 8fb01c32aa3b..028ed75d4864 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -121,6 +121,8 @@ struct console_cmdline static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; static int selected_console = -1; static int preferred_console = -1; +int console_set_on_cmdline; +EXPORT_SYMBOL(console_set_on_cmdline); /* Flag: console code may call schedule() */ static int console_may_schedule; @@ -890,6 +892,7 @@ static int __init console_setup(char *str) *s = 0; __add_preferred_console(buf, idx, options, brl_options); + console_set_on_cmdline = 1; return 1; } __setup("console=", console_setup); -- cgit v1.2.3 From 0e91398f2a5d4eb6b07df8115917d0d1cf3e9b58 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 26 May 2008 23:31:27 +0100 Subject: xen: implement save/restore This patch implements Xen save/restore and migration. Saving is triggered via xenbus, which is polled in drivers/xen/manage.c. When a suspend request comes in, the kernel prepares itself for saving by: 1 - Freeze all processes. This is primarily to prevent any partially-completed pagetable updates from confusing the suspend process. If CONFIG_PREEMPT isn't defined, then this isn't necessary. 2 - Suspend xenbus and other devices 3 - Stop_machine, to make sure all the other vcpus are quiescent. The Xen tools require the domain to run its save off vcpu0. 4 - Within the stop_machine state, it pins any unpinned pgds (under construction or destruction), canonicalizes various other pieces of state (mostly converting mfns to pfns), and finally 5 - Suspend the domain Restore reverses the steps used to save the domain, ending when all the frozen processes are thawed.
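In code form, the save sequence above looks roughly like this; a hedged sketch only, with do_suspend_sketch and xen_suspend_fn invented names for this summary (the real logic is in the drivers/xen/manage.c hunks below):

	static int xen_suspend_fn(void *data);	/* steps 4-5: pin pgds, turn mfns
						   into pfns, suspend the domain */

	static void do_suspend_sketch(void)
	{
		if (freeze_processes())			/* step 1: quiesce userspace */
			return;
		/* step 2: suspend xenbus and other devices */
		stop_machine_run(xen_suspend_fn, NULL, 0);	/* step 3: quiesce the
								   other vcpus and run
								   the save off cpu 0 */
		/* on return: resume devices, then undo step 1 */
		thaw_processes();
	}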
Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Thomas Gleixner --- arch/x86/xen/Makefile | 2 +- arch/x86/xen/enlighten.c | 6 +-- arch/x86/xen/mmu.c | 46 +++++++++++++++++ arch/x86/xen/smp.c | 2 +- arch/x86/xen/suspend.c | 42 +++++++++++++++ arch/x86/xen/time.c | 8 +++ arch/x86/xen/xen-ops.h | 4 ++ drivers/xen/events.c | 83 +++++++++++++++++++++++++++++ drivers/xen/grant-table.c | 4 +- drivers/xen/manage.c | 126 ++++++++++++++++++++++++++++++++++++++++----- include/linux/page-flags.h | 1 + include/xen/events.h | 3 ++ include/xen/grant_table.h | 3 ++ include/xen/xen-ops.h | 9 ++++ 14 files changed, 318 insertions(+), 21 deletions(-) create mode 100644 arch/x86/xen/suspend.c (limited to 'include/linux') diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 40b119b6b103..2ba2d1649131 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -1,4 +1,4 @@ obj-y := enlighten.o setup.o multicalls.o mmu.o \ - time.o xen-asm.o grant-table.o + time.o xen-asm.o grant-table.o suspend.o obj-$(CONFIG_SMP) += smp.o diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index ce67dc8f36af..b94f63ac228b 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -857,7 +857,7 @@ static __init void xen_pagetable_setup_start(pgd_t *base) PFN_DOWN(__pa(xen_start_info->pt_base))); } -static __init void setup_shared_info(void) +void xen_setup_shared_info(void) { if (!xen_feature(XENFEAT_auto_translated_physmap)) { unsigned long addr = fix_to_virt(FIX_PARAVIRT_BOOTMAP); @@ -894,7 +894,7 @@ static __init void xen_pagetable_setup_done(pgd_t *base) pv_mmu_ops.release_pmd = xen_release_pmd; pv_mmu_ops.set_pte = xen_set_pte; - setup_shared_info(); + xen_setup_shared_info(); /* Actually pin the pagetable down, but we can't set PG_pinned yet because the page structures don't exist yet. */ @@ -902,7 +902,7 @@ static __init void xen_pagetable_setup_done(pgd_t *base) } /* This is called once we have the cpu_possible_map */ -void __init xen_setup_vcpu_info_placement(void) +void xen_setup_vcpu_info_placement(void) { int cpu; diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 4740cda36563..e95955968ba3 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -560,6 +560,29 @@ void xen_pgd_pin(pgd_t *pgd) xen_mc_issue(0); } +/* + * On save, we need to pin all pagetables to make sure they get their + * mfns turned into pfns. Search the list for any unpinned pgds and pin + * them (unpinned pgds are not currently in use, probably because the + * process is under construction or destruction). + */ +void xen_mm_pin_all(void) +{ + unsigned long flags; + struct page *page; + + spin_lock_irqsave(&pgd_lock, flags); + + list_for_each_entry(page, &pgd_list, lru) { + if (!PagePinned(page)) { + xen_pgd_pin((pgd_t *)page_address(page)); + SetPageSavePinned(page); + } + } + + spin_unlock_irqrestore(&pgd_lock, flags); +} + /* The init_mm pagetable is really pinned as soon as its created, but that's before we have page structures to store the bits. So do all the book-keeping now. */ @@ -617,6 +640,29 @@ static void xen_pgd_unpin(pgd_t *pgd) xen_mc_issue(0); } +/* + * On resume, undo any pinning done at save, so that the rest of the + * kernel doesn't see any unexpected pinned pagetables. 
+ */ +void xen_mm_unpin_all(void) +{ + unsigned long flags; + struct page *page; + + spin_lock_irqsave(&pgd_lock, flags); + + list_for_each_entry(page, &pgd_list, lru) { + if (PageSavePinned(page)) { + BUG_ON(!PagePinned(page)); + printk("unpinning pinned %p\n", page_address(page)); + xen_pgd_unpin((pgd_t *)page_address(page)); + ClearPageSavePinned(page); + } + } + + spin_unlock_irqrestore(&pgd_lock, flags); +} + void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) { spin_lock(&next->page_table_lock); diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 74ab8968c525..d2e3c20127d7 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -35,7 +35,7 @@ #include "xen-ops.h" #include "mmu.h" -static cpumask_t xen_cpu_initialized_map; +cpumask_t xen_cpu_initialized_map; static DEFINE_PER_CPU(int, resched_irq) = -1; static DEFINE_PER_CPU(int, callfunc_irq) = -1; static DEFINE_PER_CPU(int, debug_irq) = -1; diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c new file mode 100644 index 000000000000..7620a16fe535 --- /dev/null +++ b/arch/x86/xen/suspend.c @@ -0,0 +1,42 @@ +#include + +#include +#include +#include + +#include +#include + +#include "xen-ops.h" +#include "mmu.h" + +void xen_pre_suspend(void) +{ + xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn); + xen_start_info->console.domU.mfn = + mfn_to_pfn(xen_start_info->console.domU.mfn); + + BUG_ON(!irqs_disabled()); + + HYPERVISOR_shared_info = &xen_dummy_shared_info; + if (HYPERVISOR_update_va_mapping(fix_to_virt(FIX_PARAVIRT_BOOTMAP), + __pte_ma(0), 0)) + BUG(); +} + +void xen_post_suspend(int suspend_cancelled) +{ + if (suspend_cancelled) { + xen_start_info->store_mfn = + pfn_to_mfn(xen_start_info->store_mfn); + xen_start_info->console.domU.mfn = + pfn_to_mfn(xen_start_info->console.domU.mfn); + } else { +#ifdef CONFIG_SMP + xen_cpu_initialized_map = cpu_online_map; +#endif + } + + xen_setup_shared_info(); +} + diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index c39e1a5aa241..0bef256e5f2d 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -572,6 +572,14 @@ void xen_setup_cpu_clockevents(void) clockevents_register_device(&__get_cpu_var(xen_clock_events)); } +void xen_time_suspend(void) +{ +} + +void xen_time_resume(void) +{ +} + __init void xen_time_init(void) { int cpu = smp_processor_id(); diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index a1bc89a8f169..a0503acad664 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -9,6 +9,7 @@ extern const char xen_hypervisor_callback[]; extern const char xen_failsafe_callback[]; +struct trap_info; void xen_copy_trap_info(struct trap_info *traps); DECLARE_PER_CPU(unsigned long, xen_cr3); @@ -19,6 +20,7 @@ extern struct shared_info xen_dummy_shared_info; extern struct shared_info *HYPERVISOR_shared_info; void xen_setup_mfn_list_list(void); +void xen_setup_shared_info(void); char * __init xen_memory_setup(void); void __init xen_arch_setup(void); @@ -59,6 +61,8 @@ int xen_smp_call_function_single(int cpu, void (*func) (void *info), void *info, int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, int wait); +extern cpumask_t xen_cpu_initialized_map; + /* Declare an asm function, along with symbols needed to make it inlineable */ diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 70375a690761..73d78dc9b875 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -674,6 +674,89 @@ static int retrigger_dynirq(unsigned int irq) return ret; } +static void 
restore_cpu_virqs(unsigned int cpu) +{ + struct evtchn_bind_virq bind_virq; + int virq, irq, evtchn; + + for (virq = 0; virq < NR_VIRQS; virq++) { + if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1) + continue; + + BUG_ON(irq_info[irq].type != IRQT_VIRQ); + BUG_ON(irq_info[irq].index != virq); + + /* Get a new binding from Xen. */ + bind_virq.virq = virq; + bind_virq.vcpu = cpu; + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, + &bind_virq) != 0) + BUG(); + evtchn = bind_virq.port; + + /* Record the new mapping. */ + evtchn_to_irq[evtchn] = irq; + irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn); + bind_evtchn_to_cpu(evtchn, cpu); + + /* Ready for use. */ + unmask_evtchn(evtchn); + } +} + +static void restore_cpu_ipis(unsigned int cpu) +{ + struct evtchn_bind_ipi bind_ipi; + int ipi, irq, evtchn; + + for (ipi = 0; ipi < XEN_NR_IPIS; ipi++) { + if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1) + continue; + + BUG_ON(irq_info[irq].type != IRQT_IPI); + BUG_ON(irq_info[irq].index != ipi); + + /* Get a new binding from Xen. */ + bind_ipi.vcpu = cpu; + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, + &bind_ipi) != 0) + BUG(); + evtchn = bind_ipi.port; + + /* Record the new mapping. */ + evtchn_to_irq[evtchn] = irq; + irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn); + bind_evtchn_to_cpu(evtchn, cpu); + + /* Ready for use. */ + unmask_evtchn(evtchn); + + } +} + +void xen_irq_resume(void) +{ + unsigned int cpu, irq, evtchn; + + init_evtchn_cpu_bindings(); + + /* New event-channel space is not 'live' yet. */ + for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) + mask_evtchn(evtchn); + + /* No IRQ <-> event-channel mappings. */ + for (irq = 0; irq < NR_IRQS; irq++) + irq_info[irq].evtchn = 0; /* zap event-channel binding */ + + for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) + evtchn_to_irq[evtchn] = -1; + + for_each_possible_cpu(cpu) { + restore_cpu_virqs(cpu); + restore_cpu_ipis(cpu); + } +} + static struct irq_chip xen_dynamic_chip __read_mostly = { .name = "xen-dyn", .mask = disable_dynirq, diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 52b6b41b909d..e9e11168616a 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -471,14 +471,14 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) return 0; } -static int gnttab_resume(void) +int gnttab_resume(void) { if (max_nr_grant_frames() < nr_grant_frames) return -ENOSYS; return gnttab_map(0, nr_grant_frames - 1); } -static int gnttab_suspend(void) +int gnttab_suspend(void) { arch_gnttab_unmap_shared(shared, nr_grant_frames); return 0; diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index aa7af9e6abc0..ba85fa2cff33 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -5,21 +5,113 @@ #include #include #include +#include +#include #include - -#define SHUTDOWN_INVALID -1 -#define SHUTDOWN_POWEROFF 0 -#define SHUTDOWN_SUSPEND 2 -/* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only - * report a crash, not be instructed to crash! - * HALT is the same as POWEROFF, as far as we're concerned. The tools use - * the distinction when we return the reason code to them. - */ -#define SHUTDOWN_HALT 4 +#include +#include +#include +#include + +#include +#include + +enum shutdown_state { + SHUTDOWN_INVALID = -1, + SHUTDOWN_POWEROFF = 0, + SHUTDOWN_SUSPEND = 2, + /* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only + report a crash, not be instructed to crash! + HALT is the same as POWEROFF, as far as we're concerned. 
The tools use + the distinction when we return the reason code to them. */ + SHUTDOWN_HALT = 4, +}; /* Ignore multiple shutdown requests. */ -static int shutting_down = SHUTDOWN_INVALID; +static enum shutdown_state shutting_down = SHUTDOWN_INVALID; + +static int xen_suspend(void *data) +{ + int *cancelled = data; + + BUG_ON(!irqs_disabled()); + + load_cr3(swapper_pg_dir); + + xen_mm_pin_all(); + gnttab_suspend(); + xen_time_suspend(); + xen_pre_suspend(); + + /* + * This hypercall returns 1 if suspend was cancelled + * or the domain was merely checkpointed, and 0 if it + * is resuming in a new domain. + */ + *cancelled = HYPERVISOR_suspend(virt_to_mfn(xen_start_info)); + + xen_post_suspend(*cancelled); + xen_time_resume(); + gnttab_resume(); + xen_mm_unpin_all(); + + if (!*cancelled) { + xen_irq_resume(); + xen_console_resume(); + } + + return 0; +} + +static void do_suspend(void) +{ + int err; + int cancelled = 1; + + shutting_down = SHUTDOWN_SUSPEND; + +#ifdef CONFIG_PREEMPT + /* If the kernel is preemptible, we need to freeze all the processes + to prevent them from being in the middle of a pagetable update + during suspend. */ + err = freeze_processes(); + if (err) { + printk(KERN_ERR "xen suspend: freeze failed %d\n", err); + return; + } +#endif + + err = device_suspend(PMSG_SUSPEND); + if (err) { + printk(KERN_ERR "xen suspend: device_suspend %d\n", err); + goto out; + } + + printk("suspending xenbus...\n"); + /* XXX use normal device tree? */ + xenbus_suspend(); + + err = stop_machine_run(xen_suspend, &cancelled, 0); + if (err) { + printk(KERN_ERR "failed to start xen_suspend: %d\n", err); + goto out; + } + + if (!cancelled) + xenbus_resume(); + else + xenbus_suspend_cancel(); + + device_resume(); + + +out: +#ifdef CONFIG_PREEMPT + thaw_processes(); +#endif + shutting_down = SHUTDOWN_INVALID; +} static void shutdown_handler(struct xenbus_watch *watch, const char **vec, unsigned int len) @@ -52,11 +144,17 @@ static void shutdown_handler(struct xenbus_watch *watch, } if (strcmp(str, "poweroff") == 0 || - strcmp(str, "halt") == 0) + strcmp(str, "halt") == 0) { + shutting_down = SHUTDOWN_POWEROFF; orderly_poweroff(false); - else if (strcmp(str, "reboot") == 0) + } else if (strcmp(str, "reboot") == 0) { + shutting_down = SHUTDOWN_POWEROFF; /* ? 
*/ ctrl_alt_del(); - else { +#ifdef CONFIG_PM_SLEEP + } else if (strcmp(str, "suspend") == 0) { + do_suspend(); +#endif + } else { printk(KERN_INFO "Ignoring shutdown request: %s\n", str); shutting_down = SHUTDOWN_INVALID; } diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 590cff32415d..02955a1c3d76 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -157,6 +157,7 @@ PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active) __PAGEFLAG(Slab, slab) PAGEFLAG(Checked, owner_priv_1) /* Used by some filesystems */ PAGEFLAG(Pinned, owner_priv_1) TESTSCFLAG(Pinned, owner_priv_1) /* Xen */ +PAGEFLAG(SavePinned, dirty); /* Xen */ PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved) PAGEFLAG(Private, private) __CLEARPAGEFLAG(Private, private) __SETPAGEFLAG(Private, private) diff --git a/include/xen/events.h b/include/xen/events.h index a82ec0c45c38..67c4436554a9 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -41,4 +41,7 @@ static inline void notify_remote_via_evtchn(int port) } extern void notify_remote_via_irq(int irq); + +extern void xen_irq_resume(void); + #endif /* _XEN_EVENTS_H */ diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index 466204846121..a40f1cd91be1 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -51,6 +51,9 @@ struct gnttab_free_callback { u16 count; }; +int gnttab_suspend(void); +int gnttab_resume(void); + int gnttab_grant_foreign_access(domid_t domid, unsigned long frame, int readonly); diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h index 10ddfe0142d0..5d7a6db54a8c 100644 --- a/include/xen/xen-ops.h +++ b/include/xen/xen-ops.h @@ -5,4 +5,13 @@ DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu); +void xen_pre_suspend(void); +void xen_post_suspend(int suspend_cancelled); + +void xen_mm_pin_all(void); +void xen_mm_unpin_all(void); + +void xen_time_suspend(void); +void xen_time_resume(void); + #endif /* INCLUDE_XEN_OPS_H */ -- cgit v1.2.3 From b1829d2705daa7cb72eb1e08bdc8b7e9fad34266 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2008 01:22:08 +0200 Subject: ftrace: fix merge Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index ffbbd54a720e..b482fe88bc04 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -122,7 +122,7 @@ static inline void tracer_disable(void) # define trace_preempt_off(a0, a1) do { } while (0) #endif -#ifdef CONFIG_CONTEXT_SWITCH_TRACER +#ifdef CONFIG_TRACING extern void ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); #else -- cgit v1.2.3 From ad90c0e3ce8d20d6873b57e36181ef6d7a0097fe Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 27 May 2008 20:48:37 -0400 Subject: ftrace: user update and disable dynamic ftrace daemon In dynamic ftrace, the mcount function starts off pointing to a stub function that just returns. On start up, the call to the stub is modified to point to a "record_ip" function. The job of the record_ip function is to add the function to a pre-allocated hash list. If the function is already there, it simply is ignored, otherwise it is added to the list. Later, a ftraced daemon wakes up and calls kstop_machine if any functions have been recorded, and changes the calls to the recorded functions to a simple nop. If no functions were recorded, the daemon goes back to sleep. 
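As an aside, a minimal sketch of that recording step may help; hash_long(), FTRACE_HASHBITS, ftrace_ip_in_hash() and ftrace_add_hash() are the helpers used in kernel/trace/ftrace.c, while ftrace_alloc_dyn_node() stands in here for whatever allocates the record:

  /* sketch only -- called via the patched mcount stub on function entry */
  static void record_ip(unsigned long ip)
  {
  	unsigned long key = hash_long(ip, FTRACE_HASHBITS);

  	/* already recorded?  then simply ignore it */
  	if (ftrace_ip_in_hash(ip, key))
  		return;

  	/* remember it; ftraced patches it into a nop on a later pass */
  	ftrace_add_hash(ftrace_alloc_dyn_node(ip), key);
  }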
The daemon wakes up once a second to see if it needs to update any newly recorded functions into nops. Usually it does not, but if a lot of code has been executed for the first time in the kernel, the ftraced daemon will call kstop_machine to update those into nops. The problem currently is that there's no way to stop the daemon from doing this, and it can cause unneeded latencies (800us, which for some is bothersome). This patch adds a new file /debugfs/tracing/ftraced_enabled. If the daemon is active, reading this will return "enabled\n"; when the daemon is not running, it will return "disabled\n". To disable the daemon, the user can echo "0" or "disable" into this file, and "1" or "enable" to re-enable the daemon. Since the daemon is used to convert the functions into nops to increase the performance of the system, I also added that anytime something is written into the ftraced_enabled file, kstop_machine will run if there are new functions that have been detected that need to be converted. This way the user can disable the daemon but still be able to control the conversion of the mcount calls to nops simply by running "echo 0 > /debugfs/tracing/ftraced_enabled" when they need to do more conversions. To see the number of converted functions: "cat /debugfs/tracing/dyn_ftrace_total_info" Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 6 ++ kernel/trace/ftrace.c | 157 ++++++++++++++++++++++++++++++++++--------------- 2 files changed, 116 insertions(+), 47 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index b482fe88bc04..623819433ed5 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -72,9 +72,15 @@ extern int ftrace_update_ftrace_func(ftrace_func_t func); extern void ftrace_caller(void); extern void ftrace_call(void); extern void mcount_call(void); + +void ftrace_disable_daemon(void); +void ftrace_enable_daemon(void); + #else # define ftrace_force_update() ({ 0; }) # define ftrace_set_filter(buf, len, reset) do { } while (0) +# define ftrace_disable_daemon() do { } while (0) +# define ftrace_enable_daemon() do { } while (0) #endif /* totally disable ftrace - can not re-enable after this */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1843edc098a6..f762f5a2d331 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -151,8 +151,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) #ifdef CONFIG_DYNAMIC_FTRACE static struct task_struct *ftraced_task; -static DECLARE_WAIT_QUEUE_HEAD(ftraced_waiters); -static unsigned long ftraced_iteration_counter; enum { FTRACE_ENABLE_CALLS = (1 << 0), @@ -189,6 +187,7 @@ static struct ftrace_page *ftrace_pages; static int ftraced_trigger; static int ftraced_suspend; +static int ftraced_stop; static int ftrace_record_suspend; @@ -474,14 +473,21 @@ ftrace_code_disable(struct dyn_ftrace *rec) return 1; } +static int __ftrace_update_code(void *ignore); + static int __ftrace_modify_code(void *data) { unsigned long addr; int *command = data; - if (*command & FTRACE_ENABLE_CALLS) + if (*command & FTRACE_ENABLE_CALLS) { + /* + * Update any recorded ips now that we have the + * machine stopped + */ + __ftrace_update_code(NULL); ftrace_replace_code(1); - else if (*command & FTRACE_DISABLE_CALLS) + } else if (*command & FTRACE_DISABLE_CALLS) ftrace_replace_code(0); if (*command & FTRACE_UPDATE_TRACE_FUNC) @@ -503,6 +509,25 @@ static void ftrace_run_update_code(int command) stop_machine_run(__ftrace_modify_code, &command, NR_CPUS); }
+void ftrace_disable_daemon(void) +{ + /* Stop the daemon from calling kstop_machine */ + mutex_lock(&ftraced_lock); + ftraced_stop = 1; + mutex_unlock(&ftraced_lock); + + ftrace_force_update(); +} + +void ftrace_enable_daemon(void) +{ + mutex_lock(&ftraced_lock); + ftraced_stop = 0; + mutex_unlock(&ftraced_lock); + + ftrace_force_update(); +} + static ftrace_func_t saved_ftrace_func; static void ftrace_startup(void) @@ -603,6 +628,7 @@ static int __ftrace_update_code(void *ignore) int i; /* Don't be recording funcs now */ + ftrace_record_suspend++; save_ftrace_enabled = ftrace_enabled; ftrace_enabled = 0; @@ -628,18 +654,23 @@ static int __ftrace_update_code(void *ignore) stop = ftrace_now(raw_smp_processor_id()); ftrace_update_time = stop - start; ftrace_update_tot_cnt += ftrace_update_cnt; + ftraced_trigger = 0; ftrace_enabled = save_ftrace_enabled; + ftrace_record_suspend--; return 0; } -static void ftrace_update_code(void) +static int ftrace_update_code(void) { - if (unlikely(ftrace_disabled)) - return; + if (unlikely(ftrace_disabled) || + !ftrace_enabled || !ftraced_trigger) + return 0; stop_machine_run(__ftrace_update_code, NULL, NR_CPUS); + + return 1; } static int ftraced(void *ignore) @@ -658,14 +689,13 @@ static int ftraced(void *ignore) mutex_lock(&ftrace_sysctl_lock); mutex_lock(&ftraced_lock); - if (ftrace_enabled && ftraced_trigger && !ftraced_suspend) { - ftrace_record_suspend++; - ftrace_update_code(); + if (!ftraced_suspend && !ftraced_stop && + ftrace_update_code()) { usecs = nsecs_to_usecs(ftrace_update_time); if (ftrace_update_tot_cnt > 100000) { ftrace_update_tot_cnt = 0; pr_info("hm, dftrace overflow: %lu change%s" - " (%lu total) in %lu usec%s\n", + " (%lu total) in %lu usec%s\n", ftrace_update_cnt, ftrace_update_cnt != 1 ? "s" : "", ftrace_update_tot_cnt, @@ -673,15 +703,10 @@ static int ftraced(void *ignore) ftrace_disabled = 1; WARN_ON_ONCE(1); } - ftraced_trigger = 0; - ftrace_record_suspend--; } - ftraced_iteration_counter++; mutex_unlock(&ftraced_lock); mutex_unlock(&ftrace_sysctl_lock); - wake_up_interruptible(&ftraced_waiters); - ftrace_shutdown_replenish(); } __set_current_state(TASK_RUNNING); @@ -1219,6 +1244,55 @@ ftrace_notrace_release(struct inode *inode, struct file *file) return ftrace_regex_release(inode, file, 0); } +static ssize_t +ftraced_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + /* don't worry about races */ + char *buf = ftraced_stop ? 
"disabled\n" : "enabled\n"; + int r = strlen(buf); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +ftraced_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + long val; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + if (strncmp(buf, "enable", 6) == 0) + val = 1; + else if (strncmp(buf, "disable", 7) == 0) + val = 0; + else { + buf[cnt] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + val = !!val; + } + + if (val) + ftrace_enable_daemon(); + else + ftrace_disable_daemon(); + + filp->f_pos += cnt; + + return cnt; +} + static struct file_operations ftrace_avail_fops = { .open = ftrace_avail_open, .read = seq_read, @@ -1242,51 +1316,34 @@ static struct file_operations ftrace_notrace_fops = { .release = ftrace_notrace_release, }; +static struct file_operations ftraced_fops = { + .open = tracing_open_generic, + .read = ftraced_read, + .write = ftraced_write, +}; + /** * ftrace_force_update - force an update to all recording ftrace functions - * - * The ftrace dynamic update daemon only wakes up once a second. - * There may be cases where an update needs to be done immediately - * for tests or internal kernel tracing to begin. This function - * wakes the daemon to do an update and will not return until the - * update is complete. */ int ftrace_force_update(void) { - unsigned long last_counter; - DECLARE_WAITQUEUE(wait, current); int ret = 0; if (unlikely(ftrace_disabled)) return -ENODEV; + mutex_lock(&ftrace_sysctl_lock); mutex_lock(&ftraced_lock); - last_counter = ftraced_iteration_counter; - - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&ftraced_waiters, &wait); - if (unlikely(!ftraced_task)) { - ret = -ENODEV; - goto out; - } - - do { - mutex_unlock(&ftraced_lock); - wake_up_process(ftraced_task); - schedule(); - mutex_lock(&ftraced_lock); - if (signal_pending(current)) { - ret = -EINTR; - break; - } - set_current_state(TASK_INTERRUPTIBLE); - } while (last_counter == ftraced_iteration_counter); + /* + * If ftraced_trigger is not set, then there is nothing + * to update. + */ + if (ftraced_trigger && !ftrace_update_code()) + ret = -EBUSY; - out: mutex_unlock(&ftraced_lock); - remove_wait_queue(&ftraced_waiters, &wait); - set_current_state(TASK_RUNNING); + mutex_unlock(&ftrace_sysctl_lock); return ret; } @@ -1331,6 +1388,12 @@ static __init int ftrace_init_debugfs(void) if (!entry) pr_warning("Could not create debugfs " "'set_ftrace_notrace' entry\n"); + + entry = debugfs_create_file("ftraced_enabled", 0644, d_tracer, + NULL, &ftraced_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'ftraced_enabled' entry\n"); return 0; } -- cgit v1.2.3 From 18404756765c713a0be4eb1082920c04822ce588 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Thu, 29 May 2008 11:02:52 -0700 Subject: genirq: Expose default irq affinity mask (take 3) Current IRQ affinity interface does not provide a way to set affinity for the IRQs that will be allocated/activated in the future. This patch creates /proc/irq/default_smp_affinity that lets users set default affinity mask for the newly allocated IRQs. Changing the default does not affect affinity masks for the currently active IRQs, they have to be changed explicitly. Updated based on Paul J's comments and added some more documentation. 
Signed-off-by: Max Krasnyansky Cc: pj@sgi.com Cc: a.p.zijlstra@chello.nl Cc: tglx@linutronix.de Cc: rdunlap@xenotime.net Cc: mingo@elte.hu Signed-off-by: Thomas Gleixner --- Documentation/IRQ-affinity.txt | 37 ++++++++++++++++++------ Documentation/filesystems/proc.txt | 29 ++++++++++++------- arch/alpha/kernel/irq.c | 5 ++-- include/linux/interrupt.h | 5 ++++ include/linux/irq.h | 9 ------ kernel/irq/manage.c | 28 ++++++++++++++++-- kernel/irq/proc.c | 59 +++++++++++++++++++++++++++++++++++--- 7 files changed, 134 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/Documentation/IRQ-affinity.txt b/Documentation/IRQ-affinity.txt index 938d7dd05490..b4a615b78403 100644 --- a/Documentation/IRQ-affinity.txt +++ b/Documentation/IRQ-affinity.txt @@ -1,17 +1,26 @@ +ChangeLog: + Started by Ingo Molnar + Update by Max Krasnyansky -SMP IRQ affinity, started by Ingo Molnar - +SMP IRQ affinity /proc/irq/IRQ#/smp_affinity specifies which target CPUs are permitted for a given IRQ source. It's a bitmask of allowed CPUs. It's not allowed to turn off all CPUs, and if an IRQ controller does not support IRQ affinity then the value will not change from the default 0xffffffff. +/proc/irq/default_smp_affinity specifies default affinity mask that applies +to all non-active IRQs. Once IRQ is allocated/activated its affinity bitmask +will be set to the default mask. It can then be changed as described above. +Default mask is 0xffffffff. + Here is an example of restricting IRQ44 (eth1) to CPU0-3 then restricting -the IRQ to CPU4-7 (this is an 8-CPU SMP box): +it to CPU4-7 (this is an 8-CPU SMP box): +[root@moon 44]# cd /proc/irq/44 [root@moon 44]# cat smp_affinity ffffffff + [root@moon 44]# echo 0f > smp_affinity [root@moon 44]# cat smp_affinity 0000000f @@ -21,17 +30,27 @@ PING hell (195.4.7.3): 56 data bytes --- hell ping statistics --- 6029 packets transmitted, 6027 packets received, 0% packet loss round-trip min/avg/max = 0.1/0.1/0.4 ms -[root@moon 44]# cat /proc/interrupts | grep 44: - 44: 0 1785 1785 1783 1783 1 -1 0 IO-APIC-level eth1 +[root@moon 44]# cat /proc/interrupts | grep 'CPU\|44:' + CPU0 CPU1 CPU2 CPU3 CPU4 CPU5 CPU6 CPU7 + 44: 1068 1785 1785 1783 0 0 0 0 IO-APIC-level eth1 + +As can be seen from the line above, IRQ44 was delivered only to the first four +processors (0-3). +Now let's restrict that IRQ to CPU(4-7). + [root@moon 44]# echo f0 > smp_affinity +[root@moon 44]# cat smp_affinity +000000f0 [root@moon 44]# ping -f h PING hell (195.4.7.3): 56 data bytes .. --- hell ping statistics --- 2779 packets transmitted, 2777 packets received, 0% packet loss round-trip min/avg/max = 0.1/0.5/585.4 ms -[root@moon 44]# cat /proc/interrupts | grep 44: - 44: 1068 1785 1785 1784 1784 1069 1070 1069 IO-APIC-level eth1 -[root@moon 44]# +[root@moon 44]# cat /proc/interrupts | grep 'CPU\|44:' + CPU0 CPU1 CPU2 CPU3 CPU4 CPU5 CPU6 CPU7 + 44: 1068 1785 1785 1783 1784 1069 1070 1069 IO-APIC-level eth1 + +This time around IRQ44 was delivered only to the last four processors. +i.e. counters for the CPU0-3 did not change. diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index dbc3c6a3650f..7f268f327d75 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -380,28 +380,35 @@ i386 and x86_64 platforms support the new IRQ vector displays. Of some interest is the introduction of the /proc/irq directory to 2.4.
It could be used to set IRQ to CPU affinity, this means that you can "hook" an IRQ to only one CPU, or to exclude a CPU of handling IRQs. The contents of the -irq subdir is one subdir for each IRQ, and one file; prof_cpu_mask +irq subdir is one subdir for each IRQ, and two files; default_smp_affinity and +prof_cpu_mask. For example > ls /proc/irq/ 0 10 12 14 16 18 2 4 6 8 prof_cpu_mask - 1 11 13 15 17 19 3 5 7 9 + 1 11 13 15 17 19 3 5 7 9 default_smp_affinity > ls /proc/irq/0/ smp_affinity -The contents of the prof_cpu_mask file and each smp_affinity file for each IRQ -is the same by default: +smp_affinity is a bitmask, in which you can specify which CPUs can handle the +IRQ, you can set it by doing: - > cat /proc/irq/0/smp_affinity - ffffffff + > echo 1 > /proc/irq/10/smp_affinity + +This means that only the first CPU will handle the IRQ, but you can also echo +5 which means that only the first and third CPU can handle the IRQ. -It's a bitmask, in which you can specify which CPUs can handle the IRQ, you can -set it by doing: +The contents of each smp_affinity file is the same by default: + + > cat /proc/irq/0/smp_affinity + ffffffff - > echo 1 > /proc/irq/prof_cpu_mask +The default_smp_affinity mask applies to all non-active IRQs, which are the +IRQs which have not yet been allocated/activated, and hence which lack a +/proc/irq/[0-9]* directory. -This means that only the first CPU will handle the IRQ, but you can also echo 5 -which means that only the first and fourth CPU can handle the IRQ. +prof_cpu_mask specifies which CPUs are to be profiled by the system wide +profiler. Default value is ffffffff (all cpus). The way IRQs are routed is handled by the IO-APIC, and it's Round Robin between all the CPUs which are allowed to handle it. As usual the kernel has diff --git a/arch/alpha/kernel/irq.c b/arch/alpha/kernel/irq.c index facf82a5499a..c626a821cdcb 100644 --- a/arch/alpha/kernel/irq.c +++ b/arch/alpha/kernel/irq.c @@ -42,8 +42,7 @@ void ack_bad_irq(unsigned int irq) #ifdef CONFIG_SMP static char irq_user_affinity[NR_IRQS]; -int -select_smp_affinity(unsigned int irq) +int irq_select_affinity(unsigned int irq) { static int last_cpu; int cpu = last_cpu + 1; @@ -51,7 +50,7 @@ select_smp_affinity(unsigned int irq) if (!irq_desc[irq].chip->set_affinity || irq_user_affinity[irq]) return 1; - while (!cpu_possible(cpu)) + while (!cpu_possible(cpu) || !cpu_isset(cpu, irq_default_affinity)) cpu = (cpu < (NR_CPUS-1) ?
cpu + 1 : 0); last_cpu = cpu; diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index f1fc7470d26c..043400f3d458 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -104,8 +104,11 @@ extern void enable_irq(unsigned int irq); #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) +extern cpumask_t irq_default_affinity; + extern int irq_set_affinity(unsigned int irq, cpumask_t cpumask); extern int irq_can_set_affinity(unsigned int irq); +extern int irq_select_affinity(unsigned int irq); #else /* CONFIG_SMP */ @@ -119,6 +122,8 @@ static inline int irq_can_set_affinity(unsigned int irq) return 0; } +static inline int irq_select_affinity(unsigned int irq) { return 0; } + #endif /* CONFIG_SMP && CONFIG_GENERIC_HARDIRQS */ #ifdef CONFIG_GENERIC_HARDIRQS diff --git a/include/linux/irq.h b/include/linux/irq.h index 552e0ec269c9..8ccb462ea42c 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -244,15 +244,6 @@ static inline void set_balance_irq_affinity(unsigned int irq, cpumask_t mask) } #endif -#ifdef CONFIG_AUTO_IRQ_AFFINITY -extern int select_smp_affinity(unsigned int irq); -#else -static inline int select_smp_affinity(unsigned int irq) -{ - return 1; -} -#endif - extern int no_irq_affinity; static inline int irq_balancing_disabled(unsigned int irq) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 46d6611a33bb..469814e9b9ee 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -17,6 +17,8 @@ #ifdef CONFIG_SMP +cpumask_t irq_default_affinity = CPU_MASK_ALL; + /** * synchronize_irq - wait for pending IRQ handlers (on other CPUs) * @irq: interrupt number to wait for @@ -95,6 +97,27 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask) return 0; } +#ifndef CONFIG_AUTO_IRQ_AFFINITY +/* + * Generic version of the affinity autoselector. + */ +int irq_select_affinity(unsigned int irq) +{ + cpumask_t mask; + + if (!irq_can_set_affinity(irq)) + return 0; + + cpus_and(mask, cpu_online_map, irq_default_affinity); + + irq_desc[irq].affinity = mask; + irq_desc[irq].chip->set_affinity(irq, mask); + + set_balance_irq_affinity(irq, mask); + return 0; +} +#endif + #endif /** @@ -382,6 +405,9 @@ int setup_irq(unsigned int irq, struct irqaction *new) } else /* Undo nested disables: */ desc->depth = 1; + + /* Set default affinity mask once everything is setup */ + irq_select_affinity(irq); } /* Reset broken irq detection when installing new handler */ desc->irq_count = 0; @@ -571,8 +597,6 @@ int request_irq(unsigned int irq, irq_handler_t handler, action->next = NULL; action->dev_id = dev_id; - select_smp_affinity(irq); - #ifdef CONFIG_DEBUG_SHIRQ if (irqflags & IRQF_SHARED) { /* diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index c2f2ccb0549a..6c6d35d68ee9 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -44,7 +44,7 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer, unsigned long count, void *data) { unsigned int irq = (int)(long)data, full_count = count, err; - cpumask_t new_value, tmp; + cpumask_t new_value; if (!irq_desc[irq].chip->set_affinity || no_irq_affinity || irq_balancing_disabled(irq)) @@ -62,17 +62,51 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer, * way to make the system unusable accidentally :-) At least * one online CPU still has to be targeted. 
*/ - cpus_and(tmp, new_value, cpu_online_map); - if (cpus_empty(tmp)) + if (!cpus_intersects(new_value, cpu_online_map)) /* Special case for empty set - allow the architecture code to set default SMP affinity. */ - return select_smp_affinity(irq) ? -EINVAL : full_count; + return irq_select_affinity(irq) ? -EINVAL : full_count; irq_set_affinity(irq, new_value); return full_count; } +static int default_affinity_read(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int len = cpumask_scnprintf(page, count, irq_default_affinity); + if (count - len < 2) + return -EINVAL; + len += sprintf(page + len, "\n"); + return len; +} + +static int default_affinity_write(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + unsigned int full_count = count, err; + cpumask_t new_value; + + err = cpumask_parse_user(buffer, count, new_value); + if (err) + return err; + + if (!is_affinity_mask_valid(new_value)) + return -EINVAL; + + /* + * Do not allow disabling IRQs completely - it's a too easy + * way to make the system unusable accidentally :-) At least + * one online CPU still has to be targeted. + */ + if (!cpus_intersects(new_value, cpu_online_map)) + return -EINVAL; + + irq_default_affinity = new_value; + + return full_count; +} #endif static int irq_spurious_read(char *page, char **start, off_t off, @@ -171,6 +205,21 @@ void unregister_handler_proc(unsigned int irq, struct irqaction *action) remove_proc_entry(action->dir->name, irq_desc[irq].dir); } +void register_default_affinity_proc(void) +{ +#ifdef CONFIG_SMP + struct proc_dir_entry *entry; + + /* create /proc/irq/default_smp_affinity */ + entry = create_proc_entry("default_smp_affinity", 0600, root_irq_dir); + if (entry) { + entry->data = NULL; + entry->read_proc = default_affinity_read; + entry->write_proc = default_affinity_write; + } +#endif +} + void init_irq_proc(void) { int i; @@ -180,6 +229,8 @@ void init_irq_proc(void) if (!root_irq_dir) return; + register_default_affinity_proc(); + /* * Create entries for all existing IRQs. */ -- cgit v1.2.3 From 554ec22f075d46e4363520a407d2b7eeb5dfdd43 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:21:03 +0200 Subject: namespacecheck: more sched.c fixes [ Stephen Rothwell : build fix ] Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index ae0be3c62375..dc36c3aea018 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -134,7 +134,6 @@ extern unsigned long nr_running(void); extern unsigned long nr_uninterruptible(void); extern unsigned long nr_active(void); extern unsigned long nr_iowait(void); -extern unsigned long weighted_cpuload(const int cpu); struct seq_file; struct cfs_rq; @@ -823,23 +822,6 @@ extern int arch_reinit_sched_domains(void); #endif /* CONFIG_SMP */ -/* - * A runqueue laden with a single nice 0 task scores a weighted_cpuload of - * SCHED_LOAD_SCALE. 
This function returns 1 if any cpu is laden with a - * task of nice 0 or enough lower priority tasks to bring up the - * weighted_cpuload - */ -static inline int above_background_load(void) -{ - unsigned long cpu; - - for_each_online_cpu(cpu) { - if (weighted_cpuload(cpu) >= SCHED_LOAD_SCALE) - return 1; - } - return 0; -} - struct io_context; /* See blkdev.h */ #define NGROUPS_SMALL 32 #define NGROUPS_PER_BLOCK ((unsigned int)(PAGE_SIZE / sizeof(gid_t))) -- cgit v1.2.3 From c7aceaba042702538b23cf4e0de1b2891ad8e671 Mon Sep 17 00:00:00 2001 From: Richard Kennedy Date: Thu, 15 May 2008 12:09:15 +0100 Subject: sched: reorder task_struct to reduce padding on 64bit builds This patch removes 24 bytes of padding and allows 1 extra object per slab on my fedora based config. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index dc36c3aea018..ea2857b99596 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1021,6 +1021,7 @@ struct task_struct { #endif int prio, static_prio, normal_prio; + unsigned int rt_priority; const struct sched_class *sched_class; struct sched_entity se; struct sched_rt_entity rt; @@ -1104,7 +1105,6 @@ struct task_struct { int __user *set_child_tid; /* CLONE_CHILD_SETTID */ int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ - unsigned int rt_priority; cputime_t utime, stime, utimescaled, stimescaled; cputime_t gtime; cputime_t prev_utime, prev_stime; @@ -1123,12 +1123,12 @@ struct task_struct { gid_t gid,egid,sgid,fsgid; struct group_info *group_info; kernel_cap_t cap_effective, cap_inheritable, cap_permitted, cap_bset; - unsigned securebits; struct user_struct *user; + unsigned securebits; #ifdef CONFIG_KEYS + unsigned char jit_keyring; /* default keyring to attach requested keys to */ struct key *request_key_auth; /* assumed request_key authority */ struct key *thread_keyring; /* keyring private to this thread */ - unsigned char jit_keyring; /* default keyring to attach requested keys to */ #endif char comm[TASK_COMM_LEN]; /* executable name excluding path - access with [gs]et_task_comm (which lock @@ -1215,8 +1215,8 @@ struct task_struct { # define MAX_LOCK_DEPTH 48UL u64 curr_chain_key; int lockdep_depth; - struct held_lock held_locks[MAX_LOCK_DEPTH]; unsigned int lockdep_recursion; + struct held_lock held_locks[MAX_LOCK_DEPTH]; #endif /* journalling filesystem info */ @@ -1244,10 +1244,6 @@ struct task_struct { u64 acct_vm_mem1; /* accumulated virtual memory usage */ cputime_t acct_stimexpd;/* stime since last update */ #endif -#ifdef CONFIG_NUMA - struct mempolicy *mempolicy; - short il_next; -#endif #ifdef CONFIG_CPUSETS nodemask_t mems_allowed; int cpuset_mems_generation; @@ -1266,6 +1262,10 @@ struct task_struct { #endif struct list_head pi_state_list; struct futex_pi_state *pi_state_cache; +#endif +#ifdef CONFIG_NUMA + struct mempolicy *mempolicy; + short il_next; #endif atomic_t fs_excl; /* holding fs exclusive resources */ struct rcu_head rcu; -- cgit v1.2.3 From 1f11eb6a8bc92536d9e93ead48fa3ffbd1478571 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Wed, 4 Jun 2008 15:04:05 -0400 Subject: sched: fix cpupri hotplug support The RT folks over at RedHat found an issue w.r.t. 
hotplug support which was traced to problems with the cpupri infrastructure in the scheduler: https://bugzilla.redhat.com/show_bug.cgi?id=449676 This bug affects 23-rt12+, 24-rtX, 25-rtX, and sched-devel. This patch applies to 25.4-rt4, though it should trivially apply to most cpupri enabled kernels mentioned above. It turned out that the issue was that offline cpus could get inadvertently registered with cpupri so that they were erroneously selected during migration decisions. The end result would be an OOPS as the offline cpu had tasks routed to it. This patch generalizes the old join/leave domain interface into an online/offline interface, and adjusts the root-domain/hotplug code to utilize it. I was able to easily reproduce the issue prior to this patch, and am no longer able to reproduce it after this patch. I can offline cpus indefinitely and everything seems to be in working order. Thanks to Arnaldo (acme), Thomas, and Peter for doing the legwork to point me in the right direction. Also thank you to Peter for reviewing the early iterations of this patch. Signed-off-by: Gregory Haskins Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Arnaldo Carvalho de Melo Cc: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 4 ++-- kernel/sched.c | 54 ++++++++++++++++++++++++++++++++++++++------------- kernel/sched_rt.c | 24 +++++++++++++++++------ 3 files changed, 60 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index ea2857b99596..d25acf600a32 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -903,8 +903,8 @@ struct sched_class { void (*set_cpus_allowed)(struct task_struct *p, const cpumask_t *newmask); - void (*join_domain)(struct rq *rq); - void (*leave_domain)(struct rq *rq); + void (*rq_online)(struct rq *rq); + void (*rq_offline)(struct rq *rq); void (*switched_from) (struct rq *this_rq, struct task_struct *task, int running); diff --git a/kernel/sched.c b/kernel/sched.c index dc0be113f41d..f0ed81b71282 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -529,6 +529,7 @@ struct rq { int push_cpu; /* cpu of this runqueue: */ int cpu; + int online; struct task_struct *migration_thread; struct list_head migration_queue; @@ -1498,6 +1499,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) #endif #define sched_class_highest (&rt_sched_class) +#define for_each_class(class) \ + for (class = sched_class_highest; class; class = class->next) static inline void inc_load(struct rq *rq, const struct task_struct *p) { @@ -6065,6 +6068,36 @@ static void unregister_sched_domain_sysctl(void) } #endif +static void set_rq_online(struct rq *rq) +{ + if (!rq->online) { + const struct sched_class *class; + + cpu_set(rq->cpu, rq->rd->online); + rq->online = 1; + + for_each_class(class) { + if (class->rq_online) + class->rq_online(rq); + } + } +} + +static void set_rq_offline(struct rq *rq) +{ + if (rq->online) { + const struct sched_class *class; + + for_each_class(class) { + if (class->rq_offline) + class->rq_offline(rq); + } + + cpu_clear(rq->cpu, rq->rd->online); + rq->online = 0; + } +} + /* * migration_call - callback that gets triggered when a CPU is added. * Here we can start up the necessary migration thread for the new CPU.
@@ -6102,7 +6135,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { BUG_ON(!cpu_isset(cpu, rq->rd->span)); - cpu_set(cpu, rq->rd->online); + + set_rq_online(rq); } spin_unlock_irqrestore(&rq->lock, flags); break; @@ -6163,7 +6197,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { BUG_ON(!cpu_isset(cpu, rq->rd->span)); - cpu_clear(cpu, rq->rd->online); + set_rq_offline(rq); } spin_unlock_irqrestore(&rq->lock, flags); break; @@ -6385,20 +6419,16 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) static void rq_attach_root(struct rq *rq, struct root_domain *rd) { unsigned long flags; - const struct sched_class *class; spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { struct root_domain *old_rd = rq->rd; - for (class = sched_class_highest; class; class = class->next) { - if (class->leave_domain) - class->leave_domain(rq); - } + if (cpu_isset(rq->cpu, old_rd->online)) + set_rq_offline(rq); cpu_clear(rq->cpu, old_rd->span); - cpu_clear(rq->cpu, old_rd->online); if (atomic_dec_and_test(&old_rd->refcount)) kfree(old_rd); @@ -6409,12 +6439,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) cpu_set(rq->cpu, rd->span); if (cpu_isset(rq->cpu, cpu_online_map)) - cpu_set(rq->cpu, rd->online); - - for (class = sched_class_highest; class; class = class->next) { - if (class->join_domain) - class->join_domain(rq); - } + set_rq_online(rq); spin_unlock_irqrestore(&rq->lock, flags); } @@ -7824,6 +7849,7 @@ void __init sched_init(void) rq->next_balance = jiffies; rq->push_cpu = 0; rq->cpu = i; + rq->online = 0; rq->migration_thread = NULL; INIT_LIST_HEAD(&rq->migration_queue); rq_attach_root(rq, &def_root_domain); diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 44b06d75416e..e4821593d4de 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -12,6 +12,9 @@ static inline int rt_overloaded(struct rq *rq) static inline void rt_set_overload(struct rq *rq) { + if (!rq->online) + return; + cpu_set(rq->cpu, rq->rd->rto_mask); /* * Make sure the mask is visible before we set @@ -26,6 +29,9 @@ static inline void rt_set_overload(struct rq *rq) static inline void rt_clear_overload(struct rq *rq) { + if (!rq->online) + return; + /* the order here really doesn't matter */ atomic_dec(&rq->rd->rto_count); cpu_clear(rq->cpu, rq->rd->rto_mask); @@ -394,7 +400,10 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) if (rt_se_prio(rt_se) < rt_rq->highest_prio) { struct rq *rq = rq_of_rt_rq(rt_rq); rt_rq->highest_prio = rt_se_prio(rt_se); - cpupri_set(&rq->rd->cpupri, rq->cpu, rt_se_prio(rt_se)); + + if (rq->online) + cpupri_set(&rq->rd->cpupri, rq->cpu, + rt_se_prio(rt_se)); } #endif #ifdef CONFIG_SMP @@ -448,7 +457,10 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) if (rt_rq->highest_prio != highest_prio) { struct rq *rq = rq_of_rt_rq(rt_rq); - cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio); + + if (rq->online) + cpupri_set(&rq->rd->cpupri, rq->cpu, + rt_rq->highest_prio); } update_rt_migration(rq_of_rt_rq(rt_rq)); @@ -1154,7 +1166,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, } /* Assumes rq->lock is held */ -static void join_domain_rt(struct rq *rq) +static void rq_online_rt(struct rq *rq) { if (rq->rt.overloaded) rt_set_overload(rq); @@ -1163,7 +1175,7 @@ static void join_domain_rt(struct rq *rq) } /* Assumes rq->lock is held */ -static 
void leave_domain_rt(struct rq *rq) +static void rq_offline_rt(struct rq *rq) { if (rq->rt.overloaded) rt_clear_overload(rq); @@ -1331,8 +1343,8 @@ static const struct sched_class rt_sched_class = { .load_balance = load_balance_rt, .move_one_task = move_one_task_rt, .set_cpus_allowed = set_cpus_allowed_rt, - .join_domain = join_domain_rt, - .leave_domain = leave_domain_rt, + .rq_online = rq_online_rt, + .rq_offline = rq_offline_rt, .pre_schedule = pre_schedule_rt, .post_schedule = post_schedule_rt, .task_wake_up = task_wake_up_rt, -- cgit v1.2.3 From cc1a9d86ce989083703c4bdc11b75a87e1cc404a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 8 Jun 2008 19:39:16 -0700 Subject: mm, x86: shrink_active_range() should check all Now we are using register_e820_active_regions() instead of add_active_range() directly. So the end_pfn recorded in early_node_map can differ from node_end_pfn. So we need to make shrink_active_range() smarter. shrink_active_range() is a generic MM function in mm/page_alloc.c but it is only used on 32-bit x86. Should we move it back to some file in arch/x86? Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 2 +- include/linux/mm.h | 3 +-- mm/page_alloc.c | 44 ++++++++++++++++++++++++++++++++++---------- 3 files changed, 36 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index a89ccf3d4c14..489605bab85a 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -282,7 +282,7 @@ static unsigned long calculate_numa_remap_pages(void) node_end_pfn[nid] -= size; node_remap_start_pfn[nid] = node_end_pfn[nid]; - shrink_active_range(nid, old_end_pfn, node_end_pfn[nid]); + shrink_active_range(nid, node_end_pfn[nid]); } printk("Reserving total of %ld pages for numa KVA remap\n", reserve_pages); diff --git a/include/linux/mm.h b/include/linux/mm.h index c31a9cd2a30e..7cbd949f2516 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -997,8 +997,7 @@ extern void free_area_init_node(int nid, pg_data_t *pgdat, extern void free_area_init_nodes(unsigned long *max_zone_pfn); extern void add_active_range(unsigned int nid, unsigned long start_pfn, unsigned long end_pfn); -extern void shrink_active_range(unsigned int nid, unsigned long old_end_pfn, - unsigned long new_end_pfn); +extern void shrink_active_range(unsigned int nid, unsigned long new_end_pfn); extern void push_node_boundaries(unsigned int nid, unsigned long start_pfn, unsigned long end_pfn); extern void remove_all_active_ranges(void); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 502223c3c2c6..215408684076 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3579,25 +3579,49 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn, /** * shrink_active_range - Shrink an existing registered range of PFNs * @nid: The node id the range is on that should be shrunk - * @old_end_pfn: The old end PFN of the range * @new_end_pfn: The new PFN of the range * * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node. - * The map is kept at the end physical page range that has already been - * registered with add_active_range(). This function allows an arch to shrink - * an existing registered range. + * The map is kept near the end physical page range that has already been + * registered. This function allows an arch to shrink an existing registered + * range.
*/ -void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn, - unsigned long new_end_pfn) +void __init shrink_active_range(unsigned int nid, unsigned long new_end_pfn) { - int i; + int i, j; + int removed = 0; /* Find the old active region end and shrink */ - for_each_active_range_index_in_nid(i, nid) - if (early_node_map[i].end_pfn == old_end_pfn) { + for_each_active_range_index_in_nid(i, nid) { + if (early_node_map[i].start_pfn >= new_end_pfn) { + /* clear it */ + early_node_map[i].end_pfn = 0; + removed = 1; + continue; + } + if (early_node_map[i].end_pfn > new_end_pfn) { early_node_map[i].end_pfn = new_end_pfn; - break; + continue; } + } + + if (!removed) + return; + + /* remove the blank ones */ + for (i = nr_nodemap_entries - 1; i > 0; i--) { + if (early_node_map[i].nid != nid) + continue; + if (early_node_map[i].end_pfn) + continue; + /* we found it, get rid of it */ + for (j = i; j < nr_nodemap_entries - 1; j++) + memcpy(&early_node_map[j], &early_node_map[j+1], + sizeof(early_node_map[j])); + j = nr_nodemap_entries - 1; + memset(&early_node_map[j], 0, sizeof(early_node_map[j])); + nr_nodemap_entries--; + } } /** -- cgit v1.2.3 From 0eb967012ea15e6e8cfab483d9fa37bc602d400c Mon Sep 17 00:00:00 2001 From: Abhishek Sagar Date: Sun, 1 Jun 2008 21:47:30 +0530 Subject: ftrace: prevent freeing of all failed updates Prevent freeing of records which cause problems and correspond to functions from core kernel text. A new flag, FTRACE_FL_CONVERTED, is used to mark a record as "converted". All other records are patched lazily to NOPs. Failed records now also remain on the ftrace_hash table. Each invocation of ftrace_record_ip now checks whether the traced function has ever been recorded (including past failures) and does not record it again.
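A minimal sketch of that check, using the hash helpers already present in kernel/trace/ftrace.c (the wrapper name here is hypothetical; locking omitted):

  /* sketch: the test done early in ftrace_record_ip() -- failed records
   * stay on the ftrace_hash table, so an ip that ever failed (or
   * succeeded) is treated as already recorded */
  static int ftrace_ip_recorded(unsigned long ip)
  {
  	unsigned long key = hash_long(ip, FTRACE_HASHBITS);

  	return ftrace_ip_in_hash(ip, key);
  }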
Signed-off-by: Abhishek Sagar Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 1 + kernel/trace/ftrace.c | 76 ++++++++++++++++++++++++++++++-------------------- 2 files changed, 47 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 623819433ed5..20e14d0093c7 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -49,6 +49,7 @@ enum { FTRACE_FL_FILTER = (1 << 2), FTRACE_FL_ENABLED = (1 << 3), FTRACE_FL_NOTRACE = (1 << 4), + FTRACE_FL_CONVERTED = (1 << 5), }; struct dyn_ftrace { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f762f5a2d331..ec54cb7d69d6 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -216,6 +216,12 @@ ftrace_add_hash(struct dyn_ftrace *node, unsigned long key) hlist_add_head_rcu(&node->node, &ftrace_hash[key]); } +/* called from kstop_machine */ +static inline void ftrace_del_hash(struct dyn_ftrace *node) +{ + hlist_del(&node->node); +} + static void ftrace_free_rec(struct dyn_ftrace *rec) { /* no locking, only called from kstop_machine */ @@ -332,12 +338,11 @@ ftrace_record_ip(unsigned long ip) #define FTRACE_ADDR ((long)(ftrace_caller)) #define MCOUNT_ADDR ((long)(mcount)) -static void +static int __ftrace_replace_code(struct dyn_ftrace *rec, unsigned char *old, unsigned char *new, int enable) { unsigned long ip, fl; - int failed; ip = rec->ip; @@ -364,7 +369,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, if ((fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) || (fl == 0) || (rec->flags & FTRACE_FL_NOTRACE)) - return; + return 0; /* * If it is enabled disable it, @@ -388,7 +393,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, */ fl = rec->flags & (FTRACE_FL_NOTRACE | FTRACE_FL_ENABLED); if (fl == FTRACE_FL_NOTRACE) - return; + return 0; new = ftrace_call_replace(ip, FTRACE_ADDR); } else @@ -396,34 +401,24 @@ __ftrace_replace_code(struct dyn_ftrace *rec, if (enable) { if (rec->flags & FTRACE_FL_ENABLED) - return; + return 0; rec->flags |= FTRACE_FL_ENABLED; } else { if (!(rec->flags & FTRACE_FL_ENABLED)) - return; + return 0; rec->flags &= ~FTRACE_FL_ENABLED; } } - failed = ftrace_modify_code(ip, old, new); - if (failed) { - unsigned long key; - /* It is possible that the function hasn't been converted yet */ - key = hash_long(ip, FTRACE_HASHBITS); - if (!ftrace_ip_in_hash(ip, key)) { - rec->flags |= FTRACE_FL_FAILED; - ftrace_free_rec(rec); - } - - } + return ftrace_modify_code(ip, old, new); } static void ftrace_replace_code(int enable) { + int i, failed; unsigned char *new = NULL, *old = NULL; struct dyn_ftrace *rec; struct ftrace_page *pg; - int i; if (enable) old = ftrace_nop_replace(); @@ -438,7 +433,15 @@ static void ftrace_replace_code(int enable) if (rec->flags & FTRACE_FL_FAILED) continue; - __ftrace_replace_code(rec, old, new, enable); + failed = __ftrace_replace_code(rec, old, new, enable); + if (failed && (rec->flags & FTRACE_FL_CONVERTED)) { + rec->flags |= FTRACE_FL_FAILED; + if ((system_state == SYSTEM_BOOTING) || + !kernel_text_address(rec->ip)) { + ftrace_del_hash(rec); + ftrace_free_rec(rec); + } + } } } } @@ -467,7 +470,6 @@ ftrace_code_disable(struct dyn_ftrace *rec) failed = ftrace_modify_code(ip, call, nop); if (failed) { rec->flags |= FTRACE_FL_FAILED; - ftrace_free_rec(rec); return 0; } return 1; @@ -621,8 +623,7 @@ unsigned long ftrace_update_tot_cnt; static int __ftrace_update_code(void *ignore) { struct dyn_ftrace *p; - struct hlist_head head; - struct hlist_node *t; + struct hlist_node *t, *n; int 
save_ftrace_enabled; cycle_t start, stop; int i; @@ -637,18 +638,33 @@ static int __ftrace_update_code(void *ignore) /* No locks needed, the machine is stopped! */ for (i = 0; i < FTRACE_HASHSIZE; i++) { - if (hlist_empty(&ftrace_hash[i])) - continue; + /* all CPUS are stopped, we are safe to modify code */ + hlist_for_each_entry_safe(p, t, n, &ftrace_hash[i], node) { + /* Skip over failed records which have not been + * freed. */ + if (p->flags & FTRACE_FL_FAILED) + continue; - head = ftrace_hash[i]; - INIT_HLIST_HEAD(&ftrace_hash[i]); + /* Unconverted records are always at the head of the + * hash bucket. Once we encounter a converted record, + * simply skip over to the next bucket. Saves ftraced + * some processor cycles (ftrace does its bid for + * global warming :-p ). */ + if (p->flags & (FTRACE_FL_CONVERTED)) + break; - /* all CPUS are stopped, we are safe to modify code */ - hlist_for_each_entry(p, t, &head, node) { - if (ftrace_code_disable(p)) + if (ftrace_code_disable(p)) { + p->flags |= FTRACE_FL_CONVERTED; ftrace_update_cnt++; - } + } else { + if ((system_state == SYSTEM_BOOTING) || + !kernel_text_address(p->ip)) { + ftrace_del_hash(p); + ftrace_free_rec(p); + } + } + } } stop = ftrace_now(raw_smp_processor_id()); -- cgit v1.2.3 From 9985b0bab332289f14837eff3c6e0bcc658b58f7 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 5 Jun 2008 12:57:11 -0700 Subject: sched: prevent bound kthreads from changing cpus_allowed Kthreads that have called kthread_bind() are bound to specific cpus, so other tasks should not be able to change their cpus_allowed from under them. Otherwise, it is possible to move kthreads, such as the migration or software watchdog threads, so they are not allowed access to the cpu they work on. Cc: Peter Zijlstra Cc: Paul Menage Cc: Paul Jackson Signed-off-by: David Rientjes Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/cpuset.c | 14 +++++++++++++- kernel/kthread.c | 1 + kernel/sched.c | 6 ++++++ 4 files changed, 21 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index d25acf600a32..2db1485f865d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1486,6 +1486,7 @@ static inline void put_task_struct(struct task_struct *t) #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ +#define PF_THREAD_BOUND 0x04000000 /* Thread bound to specific cpu */ #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 6090d18b58a9..b84354f4de36 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1190,6 +1190,15 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) return -ENOSPC; + if (tsk->flags & PF_THREAD_BOUND) { + cpumask_t mask; + + mutex_lock(&callback_mutex); + mask = cs->cpus_allowed; + mutex_unlock(&callback_mutex); + if (!cpus_equal(tsk->cpus_allowed, mask)) + return -EINVAL; + } return security_task_setscheduler(tsk, 0, NULL); } @@ -1203,11 +1212,14 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct mm_struct *mm; struct cpuset *cs = cgroup_cs(cont); struct cpuset *oldcs = cgroup_cs(oldcont); + int err; 
mutex_lock(&callback_mutex); guarantee_online_cpus(cs, &cpus); - set_cpus_allowed_ptr(tsk, &cpus); + err = set_cpus_allowed_ptr(tsk, &cpus); mutex_unlock(&callback_mutex); + if (err) + return; from = oldcs->mems_allowed; to = cs->mems_allowed; diff --git a/kernel/kthread.c b/kernel/kthread.c index bd1b9ea024e1..97747cdd37c9 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -180,6 +180,7 @@ void kthread_bind(struct task_struct *k, unsigned int cpu) set_task_cpu(k, cpu); k->cpus_allowed = cpumask_of_cpu(cpu); k->rt.nr_cpus_allowed = 1; + k->flags |= PF_THREAD_BOUND; } EXPORT_SYMBOL(kthread_bind); diff --git a/kernel/sched.c b/kernel/sched.c index e9c24a128655..164fe7fe0d89 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5563,6 +5563,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) goto out; } + if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && + !cpus_equal(p->cpus_allowed, *new_mask))) { + ret = -EINVAL; + goto out; + } + if (p->sched_class->set_cpus_allowed) p->sched_class->set_cpus_allowed(p, new_mask); else { -- cgit v1.2.3 From c50cbb05a05cf1f9ca3592272eff053c847727d8 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Wed, 4 Jun 2008 21:47:29 -0700 Subject: cpu topology: always define CPU topology information This can result in an empty topology directory in sysfs, and requires in-kernel users to protect all uses with #ifdef - see . The documentation of CPU topology specifies what the defaults should be if only partial information is available from the hardware. So we can provide these defaults as a fallback. This patch: - Adds default definitions of the 4 topology macros to - Changes drivers/base/topology.c to use the topology macros unconditionally and to cope with definitions that aren't lvalues - Updates documentation accordingly [ From: Andrew Morton - fold now-duplicated code - fix layout ] Signed-off-by: Ben Hutchings Cc: Vegard Nossum Cc: Nick Piggin Cc: Chandra Seetharaman Cc: Suresh Siddha Cc: Mike Travis Cc: Christoph Lameter Cc: John Hawkes Cc: Zhang, Yanmin Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- Documentation/cputopology.txt | 26 +++++++++----------------- drivers/base/topology.c | 38 ++++++++++---------------------------- include/linux/topology.h | 13 +++++++++++++ 3 files changed, 32 insertions(+), 45 deletions(-) (limited to 'include/linux') diff --git a/Documentation/cputopology.txt b/Documentation/cputopology.txt index b61cb9564023..bd699da24666 100644 --- a/Documentation/cputopology.txt +++ b/Documentation/cputopology.txt @@ -14,9 +14,8 @@ represent the thread siblings to cpu X in the same physical package; To implement it in an architecture-neutral way, a new source file, drivers/base/topology.c, is to export the 4 attributes. -If one architecture wants to support this feature, it just needs to -implement 4 defines, typically in file include/asm-XXX/topology.h. -The 4 defines are: +For an architecture to support this feature, it must define some of +these macros in include/asm-XXX/topology.h: #define topology_physical_package_id(cpu) #define topology_core_id(cpu) #define topology_thread_siblings(cpu) @@ -25,17 +24,10 @@ The 4 defines are: The type of **_id is int. The type of siblings is cpumask_t. -To be consistent on all architectures, the 4 attributes should have -default values if their values are unavailable. Below is the rule. -1) physical_package_id: If cpu has no physical package id, -1 is the -default value. -2) core_id: If cpu doesn't support multi-core, its core id is 0. 
-3) thread_siblings: Just include itself, if the cpu doesn't support -HT/multi-thread. -4) core_siblings: Just include itself, if the cpu doesn't support -multi-core and HT/Multi-thread. - -So be careful when declaring the 4 defines in include/asm-XXX/topology.h. - -If an attribute isn't defined on an architecture, it won't be exported. - +To be consistent on all architectures, include/linux/topology.h +provides default definitions for any of the above macros that are +not defined by include/asm-XXX/topology.h: +1) physical_package_id: -1 +2) core_id: 0 +3) thread_siblings: just the given CPU +4) core_siblings: just the given CPU diff --git a/drivers/base/topology.c b/drivers/base/topology.c index fdf4044d2e74..24d29a9fc25b 100644 --- a/drivers/base/topology.c +++ b/drivers/base/topology.c @@ -59,60 +59,42 @@ static ssize_t show_cpumap(int type, cpumask_t *mask, char *buf) static inline ssize_t show_##name(struct sys_device *dev, char *buf) \ { \ unsigned int cpu = dev->id; \ - return show_cpumap(0, &(topology_##name(cpu)), buf); \ + cpumask_t siblings = topology_##name(cpu); \ + return show_cpumap(0, &siblings, buf); \ } #define define_siblings_show_list(name) \ static inline ssize_t show_##name##_list(struct sys_device *dev, char *buf) \ { \ unsigned int cpu = dev->id; \ - return show_cpumap(1, &(topology_##name(cpu)), buf); \ + cpumask_t siblings = topology_##name(cpu); \ + return show_cpumap(1, &siblings, buf); \ } #define define_siblings_show_func(name) \ define_siblings_show_map(name); define_siblings_show_list(name) -#ifdef topology_physical_package_id define_id_show_func(physical_package_id); define_one_ro(physical_package_id); -#define ref_physical_package_id_attr &attr_physical_package_id.attr, -#else -#define ref_physical_package_id_attr -#endif -#ifdef topology_core_id define_id_show_func(core_id); define_one_ro(core_id); -#define ref_core_id_attr &attr_core_id.attr, -#else -#define ref_core_id_attr -#endif -#ifdef topology_thread_siblings define_siblings_show_func(thread_siblings); define_one_ro(thread_siblings); define_one_ro(thread_siblings_list); -#define ref_thread_siblings_attr \ - &attr_thread_siblings.attr, &attr_thread_siblings_list.attr, -#else -#define ref_thread_siblings_attr -#endif -#ifdef topology_core_siblings define_siblings_show_func(core_siblings); define_one_ro(core_siblings); define_one_ro(core_siblings_list); -#define ref_core_siblings_attr \ - &attr_core_siblings.attr, &attr_core_siblings_list.attr, -#else -#define ref_core_siblings_attr -#endif static struct attribute *default_attrs[] = { - ref_physical_package_id_attr - ref_core_id_attr - ref_thread_siblings_attr - ref_core_siblings_attr + &attr_physical_package_id.attr, + &attr_core_id.attr, + &attr_thread_siblings.attr, + &attr_thread_siblings_list.attr, + &attr_core_siblings.attr, + &attr_core_siblings_list.attr, NULL }; diff --git a/include/linux/topology.h b/include/linux/topology.h index 24f3d2282e11..2158fc0d5a56 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -179,4 +179,17 @@ void arch_update_cpu_topology(void); #endif #endif /* CONFIG_NUMA */ +#ifndef topology_physical_package_id +#define topology_physical_package_id(cpu) ((void)(cpu), -1) +#endif +#ifndef topology_core_id +#define topology_core_id(cpu) ((void)(cpu), 0) +#endif +#ifndef topology_thread_siblings +#define topology_thread_siblings(cpu) cpumask_of_cpu(cpu) +#endif +#ifndef topology_core_siblings +#define topology_core_siblings(cpu) cpumask_of_cpu(cpu) +#endif + #endif /* _LINUX_TOPOLOGY_H */ -- cgit v1.2.3 
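[ Editor's note: a minimal sketch of what the fallback definitions buy consumers; print_cpu_topology() is a hypothetical helper, not part of the patch. With the defaults now in include/linux/topology.h, code like this compiles and behaves sensibly on every architecture, with no #ifdef guards. Note the local copy: the macros are not guaranteed to be lvalues, which is exactly why the show_##name helpers above were changed the same way. ]

	#include <linux/topology.h>

	static void print_cpu_topology(unsigned int cpu)
	{
		/* copy into a local: topology_thread_siblings() may expand
		 * to a non-lvalue such as cpumask_of_cpu(cpu) */
		cpumask_t threads = topology_thread_siblings(cpu);

		printk(KERN_INFO "cpu%u: package %d, core %d, %d thread sibling(s)\n",
		       cpu, topology_physical_package_id(cpu),
		       topology_core_id(cpu), cpus_weight(threads));
	}
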
From e17ba73b0ee6c0f24393c48b455e0d8db761782c Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Mon, 12 May 2008 15:44:40 +0200 Subject: x86, generic: mark early_printk as asmlinkage It's not explicitly marked as asmlinkage, but is invoked from x86_32 startup code with parameters on the stack. No other architectures define early_printk and none of them are affected by this change, since asmlinkage is defined as an empty token on those architectures. Signed-off-by: Jiri Slaby Cc: H. Peter Anvin Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/early_printk.c | 2 +- include/linux/kernel.h | 2 +- kernel/printk.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 643fd861b724..ff9e7350da54 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -196,7 +196,7 @@ static struct console simnow_console = { static struct console *early_console = &early_vga_console; static int early_console_initialized; -void early_printk(const char *fmt, ...) +asmlinkage void early_printk(const char *fmt, ...) { char buf[512]; int n; diff --git a/include/linux/kernel.h b/include/linux/kernel.h index f2a668c195bf..4cb8d3df414e 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -207,7 +207,7 @@ static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies, \ { return false; } #endif -extern void __attribute__((format(printf, 1, 2))) +extern void asmlinkage __attribute__((format(printf, 1, 2))) early_printk(const char *fmt, ...); unsigned long int_sqrt(unsigned long); diff --git a/kernel/printk.c b/kernel/printk.c index 70cfa5ac75ce..de1a4f4470c3 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -38,7 +38,7 @@ /* * Architectures can override it: */ -void __attribute__((weak)) early_printk(const char *fmt, ...) +void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) { } -- cgit v1.2.3 From 443cd507ce7f78c6f8742b72736585c031d5a921 Mon Sep 17 00:00:00 2001 From: "Huang, Ying" Date: Fri, 20 Jun 2008 16:39:21 +0800 Subject: lockdep: add lock_class information to lock_chain and output it This patch records the array of lock_classes held by each lock_chain, and exports lock_chain information via /proc/lockdep_chains. It is based on the x86/master branch of the git-x86 tree, and has been tested on the x86_64 platform.
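[ Editor's note: an illustrative sketch of the resulting layout; print_chain() is a hypothetical helper that mirrors what lc_show() does in the patch below. Each lock_chain owns a window of the shared chain_hlocks[] array: chain->base is its offset, chain->depth its length, and each entry is an index into lock_classes[]. ]

	static void print_chain(struct lock_chain *chain)
	{
		int i;

		for (i = 0; i < chain->depth; i++) {
			/* chain_hlocks[chain->base + i] indexes lock_classes[] */
			struct lock_class *class = lock_chain_get_class(chain, i);

			printk(KERN_INFO "  %d: %s\n", i, class->name);
		}
	}
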
Signed-off-by: Huang Ying Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 3 ++ kernel/lockdep.c | 38 +++++++++++++++++-- kernel/lockdep_internals.h | 6 +++ kernel/lockdep_proc.c | 91 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 135 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 4c4d236ded18..b26fbc715a50 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -182,6 +182,9 @@ struct lock_list { * We record lock dependency chains, so that we can cache them: */ struct lock_chain { + u8 irq_context; + u8 depth; + u16 base; struct list_head entry; u64 chain_key; }; diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 81a4e4a3f087..a796f1f38ac5 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -1458,7 +1458,14 @@ out_bug: } unsigned long nr_lock_chains; -static struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; +struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; +atomic_t nr_chain_hlocks; +static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS]; + +struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i) +{ + return lock_classes + chain_hlocks[chain->base + i]; +} /* * Look up a dependency chain. If the key is not present yet then @@ -1466,10 +1473,15 @@ static struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; * validated. If the key is already hashed, return 0. * (On return with 1 graph_lock is held.) */ -static inline int lookup_chain_cache(u64 chain_key, struct lock_class *class) +static inline int lookup_chain_cache(struct task_struct *curr, + struct held_lock *hlock, + u64 chain_key) { + struct lock_class *class = hlock->class; struct list_head *hash_head = chainhashentry(chain_key); struct lock_chain *chain; + struct held_lock *hlock_curr, *hlock_next; + int i, j, n; if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return 0; @@ -1517,6 +1529,26 @@ cache_hit: } chain = lock_chains + nr_lock_chains++; chain->chain_key = chain_key; + chain->irq_context = hlock->irq_context; + /* Find the first held_lock of current chain */ + hlock_next = hlock; + for (i = curr->lockdep_depth - 1; i >= 0; i--) { + hlock_curr = curr->held_locks + i; + if (hlock_curr->irq_context != hlock_next->irq_context) + break; + hlock_next = hlock; + } + i++; + chain->depth = curr->lockdep_depth + 1 - i; + n = atomic_add_return(chain->depth, &nr_chain_hlocks); + if (unlikely(n < MAX_LOCKDEP_CHAIN_HLOCKS)) { + chain->base = n - chain->depth; + for (j = 0; j < chain->depth - 1; j++, i++) { + int lock_id = curr->held_locks[i].class - lock_classes; + chain_hlocks[chain->base + j] = lock_id; + } + chain_hlocks[chain->base + j] = class - lock_classes; + } list_add_tail_rcu(&chain->entry, hash_head); debug_atomic_inc(&chain_lookup_misses); inc_chains(); @@ -1538,7 +1570,7 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, * graph_lock for us) */ if (!hlock->trylock && (hlock->check == 2) && - lookup_chain_cache(chain_key, hlock->class)) { + lookup_chain_cache(curr, hlock, chain_key)) { /* * Check whether last held lock: * diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h index 8ce09bc4613d..db09b176dd34 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/lockdep_internals.h @@ -23,6 +23,8 @@ #define MAX_LOCKDEP_CHAINS_BITS 14 #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) +#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) + /* * Stack-trace: tightly packed array of stack backtrace * addresses. 
Protected by the hash_lock. @@ -30,15 +32,19 @@ #define MAX_STACK_TRACE_ENTRIES 262144UL extern struct list_head all_lock_classes; +extern struct lock_chain lock_chains[]; extern void get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4); extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str); +struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i); + extern unsigned long nr_lock_classes; extern unsigned long nr_list_entries; extern unsigned long nr_lock_chains; +extern atomic_t nr_chain_hlocks; extern unsigned long nr_stack_trace_entries; extern unsigned int nr_hardirq_chains; diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 688c5f1940bd..14d052c8a835 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -178,6 +178,93 @@ static const struct file_operations proc_lockdep_operations = { .release = seq_release, }; +static void *lc_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct lock_chain *chain; + + (*pos)++; + + if (v == SEQ_START_TOKEN) + chain = m->private; + else { + chain = v; + + if (*pos < nr_lock_chains) + chain = lock_chains + *pos; + else + chain = NULL; + } + + return chain; +} + +static void *lc_start(struct seq_file *m, loff_t *pos) +{ + if (*pos == 0) + return SEQ_START_TOKEN; + + if (*pos < nr_lock_chains) + return lock_chains + *pos; + + return NULL; +} + +static void lc_stop(struct seq_file *m, void *v) +{ +} + +static int lc_show(struct seq_file *m, void *v) +{ + struct lock_chain *chain = v; + struct lock_class *class; + int i; + + if (v == SEQ_START_TOKEN) { + seq_printf(m, "all lock chains:\n"); + return 0; + } + + seq_printf(m, "irq_context: %d\n", chain->irq_context); + + for (i = 0; i < chain->depth; i++) { + class = lock_chain_get_class(chain, i); + seq_printf(m, "[%p] ", class->key); + print_name(m, class); + seq_puts(m, "\n"); + } + seq_puts(m, "\n"); + + return 0; +} + +static const struct seq_operations lockdep_chains_ops = { + .start = lc_start, + .next = lc_next, + .stop = lc_stop, + .show = lc_show, +}; + +static int lockdep_chains_open(struct inode *inode, struct file *file) +{ + int res = seq_open(file, &lockdep_chains_ops); + if (!res) { + struct seq_file *m = file->private_data; + + if (nr_lock_chains) + m->private = lock_chains; + else + m->private = NULL; + } + return res; +} + +static const struct file_operations proc_lockdep_chains_operations = { + .open = lockdep_chains_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + static void lockdep_stats_debug_show(struct seq_file *m) { #ifdef CONFIG_DEBUG_LOCKDEP @@ -294,6 +381,8 @@ static int lockdep_stats_show(struct seq_file *m, void *v) #ifdef CONFIG_PROVE_LOCKING seq_printf(m, " dependency chains: %11lu [max: %lu]\n", nr_lock_chains, MAX_LOCKDEP_CHAINS); + seq_printf(m, " dependency chain hlocks: %11d [max: %lu]\n", + atomic_read(&nr_chain_hlocks), MAX_LOCKDEP_CHAIN_HLOCKS); #endif #ifdef CONFIG_TRACE_IRQFLAGS @@ -661,6 +750,8 @@ static const struct file_operations proc_lock_stat_operations = { static int __init lockdep_proc_init(void) { proc_create("lockdep", S_IRUSR, NULL, &proc_lockdep_operations); + proc_create("lockdep_chains", S_IRUSR, NULL, + &proc_lockdep_chains_operations); proc_create("lockdep_stats", S_IRUSR, NULL, &proc_lockdep_stats_operations); -- cgit v1.2.3 From 0b2806768899dba5967bcd4a3b93eaed9a1dc4f3 Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Sun, 18 May 2008 14:27:41 -0600 Subject: Add cycle_kernel_lock() A number of driver functions are 
so obviously trivial that they do not need the big kernel lock - at least not overtly. It turns out that the acquisition of the BKL in driver open() functions can perform a sort of poor-hacker's serialization function, delaying the open operation until the driver is certain to have completed its initialization. Add a simple cycle_kernel_lock() function for these cases to make it clear that there is no need to *hold* the BKL, just to be sure that we can acquire it. Signed-off-by: Jonathan Corbet --- include/linux/smp_lock.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/smp_lock.h b/include/linux/smp_lock.h index aab3a4cff4e1..813be59bf345 100644 --- a/include/linux/smp_lock.h +++ b/include/linux/smp_lock.h @@ -27,11 +27,24 @@ static inline int reacquire_kernel_lock(struct task_struct *task) extern void __lockfunc lock_kernel(void) __acquires(kernel_lock); extern void __lockfunc unlock_kernel(void) __releases(kernel_lock); +/* + * Various legacy drivers don't really need the BKL in a specific + * function, but they *do* need to know that the BKL became available. + * This function just avoids wrapping a bunch of lock/unlock pairs + * around code which doesn't really need it. + */ +static inline void cycle_kernel_lock(void) +{ + lock_kernel(); + unlock_kernel(); +} + #else #define lock_kernel() do { } while(0) #define unlock_kernel() do { } while(0) #define release_kernel_lock(task) do { } while(0) +#define cycle_kernel_lock() do { } while(0) #define reacquire_kernel_lock(task) 0 #define kernel_locked() 1 -- cgit v1.2.3 From 395a59d0f8e86bb39cd700c3d185d30c670bb958 Mon Sep 17 00:00:00 2001 From: Abhishek Sagar Date: Sat, 21 Jun 2008 23:47:27 +0530 Subject: ftrace: store mcount address in rec->ip Record the address of the mcount call-site. Currently all archs except sparc64 record the address of the instruction following the mcount call-site. Some general cleanups are entailed. Storing mcount addresses in rec->ip enables looking them up in the kprobe hash table later on to check if they're kprobe'd. 
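[ Editor's note: a small sketch of the convention this patch establishes; mcount_call_site() is a hypothetical helper expressing in C what the per-arch entry stubs below now do in assembly (the subl/subi/sub of MCOUNT_INSN_SIZE before calling into the tracer). ]

	static unsigned long mcount_call_site(unsigned long return_address)
	{
		/* "call mcount" pushes the address of the *next* instruction;
		 * stepping back by the size of the call instruction (5 bytes
		 * on x86, 4 on arm, powerpc and sparc64) yields the call-site
		 * itself, which is what rec->ip now stores. */
		return return_address - MCOUNT_INSN_SIZE;
	}
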
Signed-off-by: Abhishek Sagar Cc: davem@davemloft.net Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/arm/kernel/armksyms.c | 10 +++++----- arch/arm/kernel/entry-common.S | 4 ++++ arch/arm/kernel/ftrace.c | 16 +++++++--------- arch/powerpc/kernel/entry_32.S | 4 ++++ arch/powerpc/kernel/entry_64.S | 5 ++++- arch/powerpc/kernel/ftrace.c | 21 +++++++-------------- arch/sparc64/kernel/ftrace.c | 10 ++++++---- arch/sparc64/kernel/sparc64_ksyms.c | 2 +- arch/x86/kernel/entry_32.S | 4 ++++ arch/x86/kernel/entry_64.S | 4 ++++ arch/x86/kernel/ftrace.c | 26 +++++++++----------------- arch/x86/kernel/i386_ksyms_32.c | 2 +- arch/x86/kernel/x8664_ksyms_64.c | 2 +- include/asm-arm/ftrace.h | 14 ++++++++++++++ include/asm-powerpc/ftrace.h | 8 ++++++++ include/asm-sparc64/ftrace.h | 14 ++++++++++++++ include/asm-x86/ftrace.h | 14 ++++++++++++++ include/linux/ftrace.h | 3 +-- kernel/trace/ftrace.c | 3 ++- 19 files changed, 110 insertions(+), 56 deletions(-) create mode 100644 include/asm-arm/ftrace.h create mode 100644 include/asm-sparc64/ftrace.h create mode 100644 include/asm-x86/ftrace.h (limited to 'include/linux') diff --git a/arch/arm/kernel/armksyms.c b/arch/arm/kernel/armksyms.c index 3b132215cbf8..cc7b246e9652 100644 --- a/arch/arm/kernel/armksyms.c +++ b/arch/arm/kernel/armksyms.c @@ -18,6 +18,7 @@ #include #include #include +#include /* * libgcc functions - functions that are used internally by the @@ -48,11 +49,6 @@ extern void __aeabi_ulcmp(void); extern void fpundefinstr(void); extern void fp_enter(void); -#ifdef CONFIG_FTRACE -extern void mcount(void); -EXPORT_SYMBOL(mcount); -#endif - /* * This has a special calling convention; it doesn't * modify any of the usual registers, except for LR. @@ -186,3 +182,7 @@ EXPORT_SYMBOL(_find_next_bit_be); #endif EXPORT_SYMBOL(copy_page); + +#ifdef CONFIG_FTRACE +EXPORT_SYMBOL(mcount); +#endif diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index 8f79a4789ed4..84694e88b428 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -9,6 +9,7 @@ */ #include +#include #include #include "entry-header.S" @@ -104,6 +105,7 @@ ENTRY(ret_from_fork) ENTRY(mcount) stmdb sp!, {r0-r3, lr} mov r0, lr + sub r0, r0, #MCOUNT_INSN_SIZE .globl mcount_call mcount_call: @@ -114,6 +116,7 @@ ENTRY(ftrace_caller) stmdb sp!, {r0-r3, lr} ldr r1, [fp, #-4] mov r0, lr + sub r0, r0, #MCOUNT_INSN_SIZE .globl ftrace_call ftrace_call: @@ -134,6 +137,7 @@ ENTRY(mcount) trace: ldr r1, [fp, #-4] mov r0, lr + sub r0, r0, #MCOUNT_INSN_SIZE mov lr, pc mov pc, r2 ldmia sp!, {r0-r3, pc} diff --git a/arch/arm/kernel/ftrace.c b/arch/arm/kernel/ftrace.c index 22f3d6e309f9..76d50e6091bc 100644 --- a/arch/arm/kernel/ftrace.c +++ b/arch/arm/kernel/ftrace.c @@ -12,9 +12,10 @@ */ #include + #include +#include -#define INSN_SIZE 4 #define PC_OFFSET 8 #define BL_OPCODE 0xeb000000 #define BL_OFFSET_MASK 0x00ffffff @@ -32,10 +33,10 @@ unsigned char *ftrace_call_replace(unsigned long pc, unsigned long addr) { long offset; - offset = (long)addr - (long)(pc - INSN_SIZE + PC_OFFSET); + offset = (long)addr - (long)(pc + PC_OFFSET); if (unlikely(offset < -33554432 || offset > 33554428)) { /* Can't generate branches that far (from ARM ARM). Ftrace - * doesn't generate branches outside of core kernel text. + * doesn't generate branches outside of kernel text. 
*/ WARN_ON_ONCE(1); return NULL; @@ -52,7 +53,6 @@ int ftrace_modify_code(unsigned long pc, unsigned char *old_code, old = *(unsigned long *)old_code; new = *(unsigned long *)new_code; - pc -= INSN_SIZE; __asm__ __volatile__ ( "1: ldr %1, [%2] \n" @@ -77,7 +77,7 @@ int ftrace_modify_code(unsigned long pc, unsigned char *old_code, : "memory"); if (!err && (replaced == old)) - flush_icache_range(pc, pc + INSN_SIZE); + flush_icache_range(pc, pc + MCOUNT_INSN_SIZE); return err; } @@ -89,8 +89,7 @@ int ftrace_update_ftrace_func(ftrace_func_t func) unsigned char *new; pc = (unsigned long)&ftrace_call; - pc += INSN_SIZE; - memcpy(&old, &ftrace_call, INSN_SIZE); + memcpy(&old, &ftrace_call, MCOUNT_INSN_SIZE); new = ftrace_call_replace(pc, (unsigned long)func); ret = ftrace_modify_code(pc, (unsigned char *)&old, new); return ret; @@ -103,8 +102,7 @@ int ftrace_mcount_set(unsigned long *data) unsigned char *new; pc = (unsigned long)&mcount_call; - pc += INSN_SIZE; - memcpy(&old, &mcount_call, INSN_SIZE); + memcpy(&old, &mcount_call, MCOUNT_INSN_SIZE); new = ftrace_call_replace(pc, *addr); *addr = ftrace_modify_code(pc, (unsigned char *)&old, new); return 0; diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 3b1dd29d9f91..7231a708af0d 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -30,6 +30,7 @@ #include #include #include +#include #undef SHOW_SYSCALLS #undef SHOW_SYSCALLS_TASK @@ -1053,6 +1054,7 @@ _GLOBAL(_mcount) stw r10,40(r1) stw r3, 44(r1) stw r5, 8(r1) + subi r3, r3, MCOUNT_INSN_SIZE .globl mcount_call mcount_call: bl ftrace_stub @@ -1090,6 +1092,7 @@ _GLOBAL(ftrace_caller) stw r10,40(r1) stw r3, 44(r1) stw r5, 8(r1) + subi r3, r3, MCOUNT_INSN_SIZE .globl ftrace_call ftrace_call: bl ftrace_stub @@ -1128,6 +1131,7 @@ _GLOBAL(_mcount) stw r3, 44(r1) stw r5, 8(r1) + subi r3, r3, MCOUNT_INSN_SIZE LOAD_REG_ADDR(r5, ftrace_trace_function) lwz r5,0(r5) diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 2c4d9e056ead..2f511a969d2c 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -31,6 +31,7 @@ #include #include #include +#include /* * System calls. 
@@ -879,6 +880,7 @@ _GLOBAL(_mcount) mflr r3 stdu r1, -112(r1) std r3, 128(r1) + subi r3, r3, MCOUNT_INSN_SIZE .globl mcount_call mcount_call: bl ftrace_stub @@ -895,6 +897,7 @@ _GLOBAL(ftrace_caller) stdu r1, -112(r1) std r3, 128(r1) ld r4, 16(r11) + subi r3, r3, MCOUNT_INSN_SIZE .globl ftrace_call ftrace_call: bl ftrace_stub @@ -916,7 +919,7 @@ _GLOBAL(_mcount) std r3, 128(r1) ld r4, 16(r11) - + subi r3, r3, MCOUNT_INSN_SIZE LOAD_REG_ADDR(r5,ftrace_trace_function) ld r5,0(r5) ld r5,0(r5) diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c index e12c593ab9ca..3855ceb937b0 100644 --- a/arch/powerpc/kernel/ftrace.c +++ b/arch/powerpc/kernel/ftrace.c @@ -15,8 +15,8 @@ #include #include +#include -#define CALL_BACK 4 static unsigned int ftrace_nop = 0x60000000; @@ -27,9 +27,10 @@ static unsigned int ftrace_nop = 0x60000000; # define GET_ADDR(addr) *(unsigned long *)addr #endif + static unsigned int notrace ftrace_calc_offset(long ip, long addr) { - return (int)((addr + CALL_BACK) - ip); + return (int)(addr - ip); } notrace unsigned char *ftrace_nop_replace(void) @@ -76,9 +77,6 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned new = *(unsigned *)new_code; int faulted = 0; - /* move the IP back to the start of the call */ - ip -= CALL_BACK; - /* * Note: Due to modules and __init, code can * disappear and change, we need to protect against faulting @@ -118,12 +116,10 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, notrace int ftrace_update_ftrace_func(ftrace_func_t func) { unsigned long ip = (unsigned long)(&ftrace_call); - unsigned char old[4], *new; + unsigned char old[MCOUNT_INSN_SIZE], *new; int ret; - ip += CALL_BACK; - - memcpy(old, &ftrace_call, 4); + memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE); new = ftrace_call_replace(ip, (unsigned long)func); ret = ftrace_modify_code(ip, old, new); @@ -134,16 +130,13 @@ notrace int ftrace_mcount_set(unsigned long *data) { unsigned long ip = (long)(&mcount_call); unsigned long *addr = data; - unsigned char old[4], *new; - - /* ip is at the location, but modify code will subtact this */ - ip += CALL_BACK; + unsigned char old[MCOUNT_INSN_SIZE], *new; /* * Replace the mcount stub with a pointer to the * ip recorder function. */ - memcpy(old, &mcount_call, 4); + memcpy(old, &mcount_call, MCOUNT_INSN_SIZE); new = ftrace_call_replace(ip, *addr); *addr = ftrace_modify_code(ip, old, new); diff --git a/arch/sparc64/kernel/ftrace.c b/arch/sparc64/kernel/ftrace.c index c17373195b1e..4298d0aee713 100644 --- a/arch/sparc64/kernel/ftrace.c +++ b/arch/sparc64/kernel/ftrace.c @@ -5,6 +5,8 @@ #include #include +#include + static const u32 ftrace_nop = 0x01000000; notrace unsigned char *ftrace_nop_replace(void) @@ -60,9 +62,9 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, notrace int ftrace_update_ftrace_func(ftrace_func_t func) { unsigned long ip = (unsigned long)(&ftrace_call); - unsigned char old[4], *new; + unsigned char old[MCOUNT_INSN_SIZE], *new; - memcpy(old, &ftrace_call, 4); + memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE); new = ftrace_call_replace(ip, (unsigned long)func); return ftrace_modify_code(ip, old, new); } @@ -71,13 +73,13 @@ notrace int ftrace_mcount_set(unsigned long *data) { unsigned long ip = (long)(&mcount_call); unsigned long *addr = data; - unsigned char old[4], *new; + unsigned char old[MCOUNT_INSN_SIZE], *new; /* * Replace the mcount stub with a pointer to the * ip recorder function. 
*/ - memcpy(old, &mcount_call, 4); + memcpy(old, &mcount_call, MCOUNT_INSN_SIZE); new = ftrace_call_replace(ip, *addr); *addr = ftrace_modify_code(ip, old, new); diff --git a/arch/sparc64/kernel/sparc64_ksyms.c b/arch/sparc64/kernel/sparc64_ksyms.c index 8ac0b99f2c55..b80d982a29c6 100644 --- a/arch/sparc64/kernel/sparc64_ksyms.c +++ b/arch/sparc64/kernel/sparc64_ksyms.c @@ -53,6 +53,7 @@ #include #include #include +#include struct poll { int fd; @@ -112,7 +113,6 @@ EXPORT_SYMBOL(smp_call_function); #endif /* CONFIG_SMP */ #if defined(CONFIG_MCOUNT) -extern void _mcount(void); EXPORT_SYMBOL(_mcount); #endif diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 04ea83ccb979..95e6bbe3665e 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -51,6 +51,7 @@ #include #include #include +#include #include "irq_vectors.h" /* @@ -1118,6 +1119,7 @@ ENTRY(mcount) pushl %ecx pushl %edx movl 0xc(%esp), %eax + subl $MCOUNT_INSN_SIZE, %eax .globl mcount_call mcount_call: @@ -1136,6 +1138,7 @@ ENTRY(ftrace_caller) pushl %edx movl 0xc(%esp), %eax movl 0x4(%ebp), %edx + subl $MCOUNT_INSN_SIZE, %eax .globl ftrace_call ftrace_call: @@ -1166,6 +1169,7 @@ trace: pushl %edx movl 0xc(%esp), %eax movl 0x4(%ebp), %edx + subl $MCOUNT_INSN_SIZE, %eax call *ftrace_trace_function diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index fe25e5febca3..b0f7308f78a6 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -51,6 +51,7 @@ #include #include #include +#include .code64 @@ -68,6 +69,7 @@ ENTRY(mcount) movq %r9, 48(%rsp) movq 0x38(%rsp), %rdi + subq $MCOUNT_INSN_SIZE, %rdi .globl mcount_call mcount_call: @@ -99,6 +101,7 @@ ENTRY(ftrace_caller) movq 0x38(%rsp), %rdi movq 8(%rbp), %rsi + subq $MCOUNT_INSN_SIZE, %rdi .globl ftrace_call ftrace_call: @@ -139,6 +142,7 @@ trace: movq 0x38(%rsp), %rdi movq 8(%rbp), %rsi + subq $MCOUNT_INSN_SIZE, %rdi call *ftrace_trace_function diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 55828149e01e..ab115cd15fdf 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -17,20 +17,21 @@ #include #include +#include -#define CALL_BACK 5 /* Long is fine, even if it is only 4 bytes ;-) */ static long *ftrace_nop; union ftrace_code_union { - char code[5]; + char code[MCOUNT_INSN_SIZE]; struct { char e8; int offset; } __attribute__((packed)); }; + static int notrace ftrace_calc_offset(long ip, long addr) { return (int)(addr - ip); @@ -46,7 +47,7 @@ notrace unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) static union ftrace_code_union calc; calc.e8 = 0xe8; - calc.offset = ftrace_calc_offset(ip, addr); + calc.offset = ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr); /* * No locking needed, this must be called via kstop_machine @@ -65,9 +66,6 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char newch = new_code[4]; int faulted = 0; - /* move the IP back to the start of the call */ - ip -= CALL_BACK; - /* * Note: Due to modules and __init, code can * disappear and change, we need to protect against faulting @@ -102,12 +100,10 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, notrace int ftrace_update_ftrace_func(ftrace_func_t func) { unsigned long ip = (unsigned long)(&ftrace_call); - unsigned char old[5], *new; + unsigned char old[MCOUNT_INSN_SIZE], *new; int ret; - ip += CALL_BACK; - - memcpy(old, &ftrace_call, 5); + memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE); new = ftrace_call_replace(ip, (unsigned 
long)func); ret = ftrace_modify_code(ip, old, new); @@ -118,16 +114,13 @@ notrace int ftrace_mcount_set(unsigned long *data) { unsigned long ip = (long)(&mcount_call); unsigned long *addr = data; - unsigned char old[5], *new; - - /* ip is at the location, but modify code will subtact this */ - ip += CALL_BACK; + unsigned char old[MCOUNT_INSN_SIZE], *new; /* * Replace the mcount stub with a pointer to the * ip recorder function. */ - memcpy(old, &mcount_call, 5); + memcpy(old, &mcount_call, MCOUNT_INSN_SIZE); new = ftrace_call_replace(ip, *addr); *addr = ftrace_modify_code(ip, old, new); @@ -142,8 +135,7 @@ int __init ftrace_dyn_arch_init(void *data) ftrace_mcount_set(data); - ftrace_nop = (unsigned long *)noptable[CALL_BACK]; + ftrace_nop = (unsigned long *)noptable[MCOUNT_INSN_SIZE]; return 0; } - diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c index 29999dbb754c..dd7ebee446af 100644 --- a/arch/x86/kernel/i386_ksyms_32.c +++ b/arch/x86/kernel/i386_ksyms_32.c @@ -1,9 +1,9 @@ -#include #include #include #include #include +#include #ifdef CONFIG_FTRACE /* mcount is defined in assembly */ diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index 122885bc5f3b..16ff4bf418d9 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -1,7 +1,6 @@ /* Exports for assembly files. All C exports should go in the respective C files. */ -#include #include #include @@ -11,6 +10,7 @@ #include #include #include +#include #ifdef CONFIG_FTRACE /* mcount is defined in assembly */ diff --git a/include/asm-arm/ftrace.h b/include/asm-arm/ftrace.h new file mode 100644 index 000000000000..584ef9a8e5a5 --- /dev/null +++ b/include/asm-arm/ftrace.h @@ -0,0 +1,14 @@ +#ifndef _ASM_ARM_FTRACE +#define _ASM_ARM_FTRACE + +#ifdef CONFIG_FTRACE +#define MCOUNT_ADDR ((long)(mcount)) +#define MCOUNT_INSN_SIZE 4 /* sizeof mcount call */ + +#ifndef __ASSEMBLY__ +extern void mcount(void); +#endif + +#endif + +#endif /* _ASM_ARM_FTRACE */ diff --git a/include/asm-powerpc/ftrace.h b/include/asm-powerpc/ftrace.h index b1bfa704b6e5..de921326cca8 100644 --- a/include/asm-powerpc/ftrace.h +++ b/include/asm-powerpc/ftrace.h @@ -1,6 +1,14 @@ #ifndef _ASM_POWERPC_FTRACE #define _ASM_POWERPC_FTRACE +#ifdef CONFIG_FTRACE +#define MCOUNT_ADDR ((long)(_mcount)) +#define MCOUNT_INSN_SIZE 4 /* sizeof mcount call */ + +#ifndef __ASSEMBLY__ extern void _mcount(void); +#endif #endif + +#endif /* _ASM_POWERPC_FTRACE */ diff --git a/include/asm-sparc64/ftrace.h b/include/asm-sparc64/ftrace.h new file mode 100644 index 000000000000..f76a40a338bb --- /dev/null +++ b/include/asm-sparc64/ftrace.h @@ -0,0 +1,14 @@ +#ifndef _ASM_SPARC64_FTRACE +#define _ASM_SPARC64_FTRACE + +#ifdef CONFIG_FTRACE +#define MCOUNT_ADDR ((long)(_mcount)) +#define MCOUNT_INSN_SIZE 4 /* sizeof mcount call */ + +#ifndef __ASSEMBLY__ +extern void _mcount(void); +#endif + +#endif + +#endif /* _ASM_SPARC64_FTRACE */ diff --git a/include/asm-x86/ftrace.h b/include/asm-x86/ftrace.h new file mode 100644 index 000000000000..c184441133f2 --- /dev/null +++ b/include/asm-x86/ftrace.h @@ -0,0 +1,14 @@ +#ifndef _ASM_X86_FTRACE +#define _ASM_SPARC64_FTRACE + +#ifdef CONFIG_FTRACE +#define MCOUNT_ADDR ((long)(mcount)) +#define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ + +#ifndef __ASSEMBLY__ +extern void mcount(void); +#endif + +#endif /* CONFIG_FTRACE */ + +#endif /* _ASM_X86_FTRACE */ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 20e14d0093c7..366098d591de 100644 --- 
a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -31,7 +31,6 @@ int unregister_ftrace_function(struct ftrace_ops *ops); void clear_ftrace_function(void); extern void ftrace_stub(unsigned long a0, unsigned long a1); -extern void mcount(void); #else /* !CONFIG_FTRACE */ # define register_ftrace_function(ops) do { } while (0) @@ -54,7 +53,7 @@ enum { struct dyn_ftrace { struct hlist_node node; - unsigned long ip; + unsigned long ip; /* address of mcount call-site */ unsigned long flags; }; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0d5bcf69952d..f1e9e5c74e64 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -27,6 +27,8 @@ #include #include +#include + #include "trace.h" /* ftrace_enabled is a method to turn ftrace on or off */ @@ -329,7 +331,6 @@ ftrace_record_ip(unsigned long ip) } #define FTRACE_ADDR ((long)(ftrace_caller)) -#define MCOUNT_ADDR ((long)(mcount)) static int __ftrace_replace_code(struct dyn_ftrace *rec, -- cgit v1.2.3 From 785656a41f9a9c0e843a23d1ae05d900b5158f8f Mon Sep 17 00:00:00 2001 From: Abhishek Sagar Date: Sat, 21 Jun 2008 23:47:39 +0530 Subject: kprobes: enable clean usage of get_kprobe Allow clean use of get_kprobe() outside of core kprobe code. Ftrace makes use of get_kprobe to identify probes installed on mcount call-sites. Signed-off-by: Abhishek Sagar Acked-by: Ananth N Mavinakayanahalli Cc: Masami Hiramatsu Cc: jkenisto@us.ibm.com Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/kprobes.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 1036631ff4fa..04a3556bdea6 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -259,6 +259,10 @@ void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head); struct jprobe; struct kretprobe; +static inline struct kprobe *get_kprobe(void *addr) +{ + return NULL; +} static inline struct kprobe *kprobe_running(void) { return NULL; -- cgit v1.2.3 From ecea656d1d5e912d2f3d332657ea4a6d8380f891 Mon Sep 17 00:00:00 2001 From: Abhishek Sagar Date: Sat, 21 Jun 2008 23:47:53 +0530 Subject: ftrace: freeze kprobe'd records Let records identified as being kprobe'd be marked as "frozen". The trouble with records which have a kprobe installed on their mcount call-site is that they don't get updated. So if such a function which is currently being traced gets its tracing disabled due to a new filter rule (or because it was added to the notrace list) then it won't be updated and will continue to be traced. This patch allows scanning of all frozen records during tracing to check if they should be traced.
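[ Editor's note: a sketch making the decision table explicit; frozen_rec_traceable() is a hypothetical refactoring of the checks that skip_trace() performs on a frozen record in the patch below. ]

	static int frozen_rec_traceable(struct dyn_ftrace *rec)
	{
		unsigned long fl;

		if (rec->flags & FTRACE_FL_FAILED)
			return 0;	/* patching this call-site failed */
		if (!(rec->flags & FTRACE_FL_CONVERTED))
			return 0;	/* call-site not converted yet */
		if (!tracing_on || !ftrace_enabled)
			return 0;	/* tracing is globally off */
		if (ftrace_filtered) {
			fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_NOTRACE);
			if (!fl || (fl & FTRACE_FL_NOTRACE))
				return 0;	/* filtered out or marked notrace */
		}
		return 1;
	}
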
Signed-off-by: Abhishek Sagar Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 6 ++++- kernel/trace/ftrace.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++- kernel/trace/trace.c | 3 +++ 3 files changed, 79 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 366098d591de..3121b95443d9 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -49,6 +49,7 @@ enum { FTRACE_FL_ENABLED = (1 << 3), FTRACE_FL_NOTRACE = (1 << 4), FTRACE_FL_CONVERTED = (1 << 5), + FTRACE_FL_FROZEN = (1 << 6), }; struct dyn_ftrace { @@ -73,15 +74,18 @@ extern void ftrace_caller(void); extern void ftrace_call(void); extern void mcount_call(void); +extern int skip_trace(unsigned long ip); + void ftrace_disable_daemon(void); void ftrace_enable_daemon(void); #else +# define skip_trace(ip) ({ 0; }) # define ftrace_force_update() ({ 0; }) # define ftrace_set_filter(buf, len, reset) do { } while (0) # define ftrace_disable_daemon() do { } while (0) # define ftrace_enable_daemon() do { } while (0) -#endif +#endif /* CONFIG_DYNAMIC_FTRACE */ /* totally disable ftrace - can not re-enable after this */ void ftrace_kill(void); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f1e9e5c74e64..d1238163155f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -163,6 +163,8 @@ enum { }; static int ftrace_filtered; +static int tracing_on; +static int frozen_record_count; static struct hlist_head ftrace_hash[FTRACE_HASHSIZE]; @@ -195,6 +197,71 @@ static int ftrace_record_suspend; static struct dyn_ftrace *ftrace_free_records; + +#ifdef CONFIG_KPROBES +static inline void freeze_record(struct dyn_ftrace *rec) +{ + if (!(rec->flags & FTRACE_FL_FROZEN)) { + rec->flags |= FTRACE_FL_FROZEN; + frozen_record_count++; + } +} + +static inline void unfreeze_record(struct dyn_ftrace *rec) +{ + if (rec->flags & FTRACE_FL_FROZEN) { + rec->flags &= ~FTRACE_FL_FROZEN; + frozen_record_count--; + } +} + +static inline int record_frozen(struct dyn_ftrace *rec) +{ + return rec->flags & FTRACE_FL_FROZEN; +} +#else +# define freeze_record(rec) ({ 0; }) +# define unfreeze_record(rec) ({ 0; }) +# define record_frozen(rec) ({ 0; }) +#endif /* CONFIG_KPROBES */ + +int skip_trace(unsigned long ip) +{ + unsigned long fl; + struct dyn_ftrace *rec; + struct hlist_node *t; + struct hlist_head *head; + + if (frozen_record_count == 0) + return 0; + + head = &ftrace_hash[hash_long(ip, FTRACE_HASHBITS)]; + hlist_for_each_entry_rcu(rec, t, head, node) { + if (rec->ip == ip) { + if (record_frozen(rec)) { + if (rec->flags & FTRACE_FL_FAILED) + return 1; + + if (!(rec->flags & FTRACE_FL_CONVERTED)) + return 1; + + if (!tracing_on || !ftrace_enabled) + return 1; + + if (ftrace_filtered) { + fl = rec->flags & (FTRACE_FL_FILTER | + FTRACE_FL_NOTRACE); + if (!fl || (fl & FTRACE_FL_NOTRACE)) + return 1; + } + } + break; + } + } + + return 0; +} + static inline int ftrace_ip_in_hash(unsigned long ip, unsigned long key) { @@ -489,8 +556,11 @@ static int __ftrace_modify_code(void *data) */ __ftrace_update_code(NULL); ftrace_replace_code(1); - } else if (*command & FTRACE_DISABLE_CALLS) + tracing_on = 1; + } else if (*command & FTRACE_DISABLE_CALLS) { ftrace_replace_code(0); + tracing_on = 0; + } if (*command & FTRACE_UPDATE_TRACE_FUNC) ftrace_update_ftrace_func(ftrace_trace_function); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6e9dae7eb418..9ade79369bfb 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ 
-988,6 +988,9 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) if (unlikely(!tracer_enabled)) return; + if (skip_trace(ip)) + return; + local_irq_save(flags); cpu = raw_smp_processor_id(); data = tr->data[cpu]; -- cgit v1.2.3 From 3da757daf86e498872855f0b5e101f763ba79499 Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Fri, 20 Jun 2008 15:06:33 -0700 Subject: x86: use cpu_khz for loops_per_jiffy calculation On the x86 platform we can use the value of tsc_khz computed during tsc calibration to calculate the loops_per_jiffy value. It is very important to keep the error in lpj values to a minimum, as any error there may result in a kernel panic in check_timer. In a virtualized environment, on a highly overloaded host, guest delay calibration may sometimes produce errors beyond the ~50% that timer_irq_works can handle, resulting in the guest panicking. This also makes some formatting changes to the lpj setup code, so that a single printk now prints the BogoMIPS value. We do this only for the boot processor, because the APs can have different base frequencies, or the BIOS might boot an AP at a different frequency. Signed-off-by: Alok N Kataria Cc: Arjan van de Ven Cc: Daniel Hecht Cc: Tim Mann Cc: Zach Amsden Cc: Sahil Rihan Signed-off-by: Ingo Molnar --- arch/x86/kernel/time_64.c | 2 ++ arch/x86/kernel/tsc_32.c | 5 +++++ include/linux/delay.h | 1 + init/calibrate.c | 36 +++++++++++++++++++----------------- 4 files changed, 27 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index c737849e2ef7..12b4a71bd074 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -123,6 +123,8 @@ void __init time_init(void) (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)) cpu_khz = calculate_cpu_khz(); + lpj_tsc = ((unsigned long)tsc_khz * 1000)/HZ; + if (unsynchronized_tsc()) mark_tsc_unstable("TSCs unsynchronized"); diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c index 068759db63dd..be729035b30b 100644 --- a/arch/x86/kernel/tsc_32.c +++ b/arch/x86/kernel/tsc_32.c @@ -401,6 +401,7 @@ static inline void check_geode_tsc_reliable(void) { } void __init tsc_init(void) { int cpu; + u64 lpj; if (!cpu_has_tsc || tsc_disabled) { /* Disable the TSC in case of !cpu_has_tsc */ @@ -421,6 +422,10 @@ void __init tsc_init(void) return; } + lpj = ((u64)tsc_khz * 1000); + do_div(lpj, HZ); + lpj_tsc = lpj; + printk("Detected %lu.%03lu MHz processor.\n", (unsigned long)cpu_khz / 1000, (unsigned long)cpu_khz % 1000); diff --git a/include/linux/delay.h b/include/linux/delay.h index 54552d21296e..01aec60590ab 100644 --- a/include/linux/delay.h +++ b/include/linux/delay.h @@ -41,6 +41,7 @@ static inline void ndelay(unsigned long x) #define ndelay(x) ndelay(x) #endif +extern unsigned long lpj_tsc; void calibrate_delay(void); void msleep(unsigned int msecs); unsigned long msleep_interruptible(unsigned int msecs); diff --git a/init/calibrate.c b/init/calibrate.c index ecb3822d4f70..86286974dada 100644 --- a/init/calibrate.c +++ b/init/calibrate.c @@ -8,7 +8,9 @@ #include #include #include +#include +unsigned long lpj_tsc; unsigned long preset_lpj; static int __init lpj_setup(char *str) { @@ -108,6 +110,10 @@ static unsigned long __cpuinit calibrate_delay_direct(void) {return 0;} * This is the number of bits of precision for the loops_per_jiffy. Each * bit takes on average 1.5/HZ seconds.
This (like the original) is a little * better than 1% + * For the boot cpu we can skip the delay calibration and assign it a value + * calculated based on the tsc frequency. + * For the rest of the CPUs we cannot assume that the tsc frequency is same as + * the cpu frequency, hence do the calibration for those. */ #define LPS_PREC 8 @@ -118,20 +124,20 @@ void __cpuinit calibrate_delay(void) if (preset_lpj) { loops_per_jiffy = preset_lpj; - printk("Calibrating delay loop (skipped)... " - "%lu.%02lu BogoMIPS preset\n", - loops_per_jiffy/(500000/HZ), - (loops_per_jiffy/(5000/HZ)) % 100); + printk(KERN_INFO + "Calibrating delay loop (skipped) preset value.. "); + } else if ((smp_processor_id() == 0) && lpj_tsc) { + loops_per_jiffy = lpj_tsc; + printk(KERN_INFO + "Calibrating delay loop (skipped), " + "using tsc calculated value.. "); } else if ((loops_per_jiffy = calibrate_delay_direct()) != 0) { - printk("Calibrating delay using timer specific routine.. "); - printk("%lu.%02lu BogoMIPS (lpj=%lu)\n", - loops_per_jiffy/(500000/HZ), - (loops_per_jiffy/(5000/HZ)) % 100, - loops_per_jiffy); + printk(KERN_INFO + "Calibrating delay using timer specific routine.. "); } else { loops_per_jiffy = (1<<12); - printk(KERN_DEBUG "Calibrating delay loop... "); + printk(KERN_INFO "Calibrating delay loop... "); while ((loops_per_jiffy <<= 1) != 0) { /* wait for "start of" clock tick */ ticks = jiffies; @@ -161,12 +167,8 @@ void __cpuinit calibrate_delay(void) if (jiffies != ticks) /* longer than 1 tick */ loops_per_jiffy &= ~loopbit; } - - /* Round the value and print it */ - printk("%lu.%02lu BogoMIPS (lpj=%lu)\n", - loops_per_jiffy/(500000/HZ), - (loops_per_jiffy/(5000/HZ)) % 100, - loops_per_jiffy); } - + printk(KERN_INFO "%lu.%02lu BogoMIPS (lpj=%lu)\n", + loops_per_jiffy/(500000/HZ), + (loops_per_jiffy/(5000/HZ)) % 100, loops_per_jiffy); } -- cgit v1.2.3 From 961ccddd59d627b89bd3dc284b6517833bbdf25d Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 23 Jun 2008 13:55:38 +1000 Subject: sched: add new API sched_setscheduler_nocheck: add a flag to control access checks Hidehiro Kawai noticed that sched_setscheduler() can fail in stop_machine: it calls sched_setscheduler() from insmod, which can have CAP_SYS_MODULE without CAP_SYS_NICE. 
Two cases could have failed, so are changed to sched_setscheduler_nocheck: kernel/softirq.c:cpu_callback() - CPU hotplug callback kernel/stop_machine.c:__stop_machine_run() - Called from various places, including modprobe() Signed-off-by: Rusty Russell Cc: Jeremy Fitzhardinge Cc: Hidehiro Kawai Cc: Andrew Morton Cc: linux-mm@kvack.org Cc: sugita Cc: Satoshi OSHIMA Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ kernel/sched.c | 48 ++++++++++++++++++++++++++++++++++++------------ kernel/softirq.c | 2 +- kernel/stop_machine.c | 2 +- 4 files changed, 40 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index c5d3f847ca8d..fe3b9b5d7390 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1655,6 +1655,8 @@ extern int can_nice(const struct task_struct *p, const int nice); extern int task_curr(const struct task_struct *p); extern int idle_cpu(int cpu); extern int sched_setscheduler(struct task_struct *, int, struct sched_param *); +extern int sched_setscheduler_nocheck(struct task_struct *, int, + struct sched_param *); extern struct task_struct *idle_task(int cpu); extern struct task_struct *curr_task(int cpu); extern void set_curr_task(int cpu, struct task_struct *p); diff --git a/kernel/sched.c b/kernel/sched.c index b048ad8a11af..8d7c246ab864 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4746,16 +4746,8 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) set_load_weight(p); } -/** - * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. - * @p: the task in question. - * @policy: new policy. - * @param: structure containing the new RT priority. - * - * NOTE that the task may be already dead. - */ -int sched_setscheduler(struct task_struct *p, int policy, - struct sched_param *param) +static int __sched_setscheduler(struct task_struct *p, int policy, + struct sched_param *param, bool user) { int retval, oldprio, oldpolicy = -1, on_rq, running; unsigned long flags; @@ -4787,7 +4779,7 @@ recheck: /* * Allow unprivileged RT tasks to decrease priority: */ - if (!capable(CAP_SYS_NICE)) { + if (user && !capable(CAP_SYS_NICE)) { if (rt_policy(policy)) { unsigned long rlim_rtprio; @@ -4823,7 +4815,8 @@ recheck: * Do not allow realtime tasks into groups that have no runtime * assigned. */ - if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) + if (user + && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) return -EPERM; #endif @@ -4872,8 +4865,39 @@ recheck: return 0; } + +/** + * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. + * @p: the task in question. + * @policy: new policy. + * @param: structure containing the new RT priority. + * + * NOTE that the task may be already dead. + */ +int sched_setscheduler(struct task_struct *p, int policy, + struct sched_param *param) +{ + return __sched_setscheduler(p, policy, param, true); +} EXPORT_SYMBOL_GPL(sched_setscheduler); +/** + * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. + * @p: the task in question. + * @policy: new policy. + * @param: structure containing the new RT priority. + * + * Just like sched_setscheduler, only don't bother checking if the + * current context has permission. For example, this is needed in + * stop_machine(): we create temporary high priority worker threads, + * but our caller might not have that capability. 
+ */ +int sched_setscheduler_nocheck(struct task_struct *p, int policy, + struct sched_param *param) +{ + return __sched_setscheduler(p, policy, param, false); +} + static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) { diff --git a/kernel/softirq.c b/kernel/softirq.c index 36e061740047..afd9120c2fc4 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -645,7 +645,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, p = per_cpu(ksoftirqd, hotcpu); per_cpu(ksoftirqd, hotcpu) = NULL; - sched_setscheduler(p, SCHED_FIFO, &param); + sched_setscheduler_nocheck(p, SCHED_FIFO, &param); kthread_stop(p); takeover_tasklets(hotcpu); break; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index b7350bbfb076..ba9b2054ecbd 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -187,7 +187,7 @@ struct task_struct *__stop_machine_run(int (*fn)(void *), void *data, struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; /* One high-prio thread per cpu. We'll do this one. */ - sched_setscheduler(p, SCHED_FIFO, &param); + sched_setscheduler_nocheck(p, SCHED_FIFO, &param); kthread_bind(p, cpu); wake_up_process(p); wait_for_completion(&smdata.done); -- cgit v1.2.3 From a033c332e047397904ed74816946b2edd9b0d5cd Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 23 Jun 2008 10:52:42 +0800 Subject: lockdep: remove duplicate definition of STATIC_LOCKDEP_MAP_INIT STATIC_LOCKDEP_MAP_INIT is defined twice in lockdep.h. I guess it's a copy & paste. Signed-off-by: Li Zefan Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index b26fbc715a50..2486eb4edbf1 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -278,14 +278,6 @@ extern void lockdep_init_map(struct lockdep_map *lock, const char *name, lockdep_init_map(&(lock)->dep_map, #lock, \ (lock)->dep_map.key, sub) -/* - * To initialize a lockdep_map statically use this macro. - * Note that _name must not be NULL. - */ -#define STATIC_LOCKDEP_MAP_INIT(_name, _key) \ - { .name = (_name), .key = (void *)(_key), } - - /* * Acquire a lock. * -- cgit v1.2.3 From f3f3149f35b9195ef4b761b1353fc0766b5f53be Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Mon, 23 Jun 2008 18:21:56 -0700 Subject: x86: use cpu_khz for loops_per_jiffy calculation, cleanup As suggested by Ingo, remove all references to tsc from init/calibrate.c. TSC is x86-specific, and using tsc in variable names in a generic file should be avoided. lpj_tsc is now called lpj_fine, since it is related to fine tuning of lpj value.
Also tsc_rate_* is called timer_rate_* Signed-off-by: Alok N Kataria Cc: Arjan van de Ven Cc: Daniel Hecht Cc: Tim Mann Cc: Zach Amsden Cc: Sahil Rihan Signed-off-by: Ingo Molnar --- arch/x86/kernel/time_64.c | 2 +- arch/x86/kernel/tsc_32.c | 2 +- include/linux/delay.h | 2 +- init/calibrate.c | 36 +++++++++++++++++++----------------- 4 files changed, 22 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index 12b4a71bd074..39ae8511a137 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -123,7 +123,7 @@ void __init time_init(void) (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)) cpu_khz = calculate_cpu_khz(); - lpj_tsc = ((unsigned long)tsc_khz * 1000)/HZ; + lpj_fine = ((unsigned long)tsc_khz * 1000)/HZ; if (unsynchronized_tsc()) mark_tsc_unstable("TSCs unsynchronized"); diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c index 0af49fb533e1..048baab77268 100644 --- a/arch/x86/kernel/tsc_32.c +++ b/arch/x86/kernel/tsc_32.c @@ -425,7 +425,7 @@ void __init tsc_init(void) lpj = ((u64)tsc_khz * 1000); do_div(lpj, HZ); - lpj_tsc = lpj; + lpj_fine = lpj; printk("Detected %lu.%03lu MHz processor.\n", (unsigned long)cpu_khz / 1000, diff --git a/include/linux/delay.h b/include/linux/delay.h index 01aec60590ab..fd832c6d419e 100644 --- a/include/linux/delay.h +++ b/include/linux/delay.h @@ -41,7 +41,7 @@ static inline void ndelay(unsigned long x) #define ndelay(x) ndelay(x) #endif -extern unsigned long lpj_tsc; +extern unsigned long lpj_fine; void calibrate_delay(void); void msleep(unsigned int msecs); unsigned long msleep_interruptible(unsigned int msecs); diff --git a/init/calibrate.c b/init/calibrate.c index 86286974dada..7963e3fc51d9 100644 --- a/init/calibrate.c +++ b/init/calibrate.c @@ -10,7 +10,7 @@ #include #include -unsigned long lpj_tsc; +unsigned long lpj_fine; unsigned long preset_lpj; static int __init lpj_setup(char *str) { @@ -35,9 +35,9 @@ static unsigned long __cpuinit calibrate_delay_direct(void) unsigned long pre_start, start, post_start; unsigned long pre_end, end, post_end; unsigned long start_jiffies; - unsigned long tsc_rate_min, tsc_rate_max; - unsigned long good_tsc_sum = 0; - unsigned long good_tsc_count = 0; + unsigned long timer_rate_min, timer_rate_max; + unsigned long good_timer_sum = 0; + unsigned long good_timer_count = 0; int i; if (read_current_timer(&pre_start) < 0 ) @@ -81,22 +81,24 @@ static unsigned long __cpuinit calibrate_delay_direct(void) } read_current_timer(&post_end); - tsc_rate_max = (post_end - pre_start) / DELAY_CALIBRATION_TICKS; - tsc_rate_min = (pre_end - post_start) / DELAY_CALIBRATION_TICKS; + timer_rate_max = (post_end - pre_start) / + DELAY_CALIBRATION_TICKS; + timer_rate_min = (pre_end - post_start) / + DELAY_CALIBRATION_TICKS; /* - * If the upper limit and lower limit of the tsc_rate is + * If the upper limit and lower limit of the timer_rate is * >= 12.5% apart, redo calibration. */ if (pre_start != 0 && pre_end != 0 && - (tsc_rate_max - tsc_rate_min) < (tsc_rate_max >> 3)) { - good_tsc_count++; - good_tsc_sum += tsc_rate_max; + (timer_rate_max - timer_rate_min) < (timer_rate_max >> 3)) { + good_timer_count++; + good_timer_sum += timer_rate_max; } } - if (good_tsc_count) - return (good_tsc_sum/good_tsc_count); + if (good_timer_count) + return (good_timer_sum/good_timer_count); printk(KERN_WARNING "calibrate_delay_direct() failed to get a good " "estimate for loops_per_jiffy.\nProbably due to long platform interrupts. 
Consider using \"lpj=\" boot option.\n"); @@ -111,8 +113,8 @@ static unsigned long __cpuinit calibrate_delay_direct(void) {return 0;} * bit takes on average 1.5/HZ seconds. This (like the original) is a little * better than 1% * For the boot cpu we can skip the delay calibration and assign it a value - * calculated based on the tsc frequency. - * For the rest of the CPUs we cannot assume that the tsc frequency is same as + * calculated based on the timer frequency. + * For the rest of the CPUs we cannot assume that the timer frequency is same as * the cpu frequency, hence do the calibration for those. */ #define LPS_PREC 8 @@ -126,11 +128,11 @@ void __cpuinit calibrate_delay(void) loops_per_jiffy = preset_lpj; printk(KERN_INFO "Calibrating delay loop (skipped) preset value.. "); - } else if ((smp_processor_id() == 0) && lpj_tsc) { - loops_per_jiffy = lpj_tsc; + } else if ((smp_processor_id() == 0) && lpj_fine) { + loops_per_jiffy = lpj_fine; printk(KERN_INFO "Calibrating delay loop (skipped), " - "using tsc calculated value.. "); + "value calculated using timer frequency.. "); } else if ((loops_per_jiffy = calibrate_delay_direct()) != 0) { printk(KERN_INFO "Calibrating delay using timer specific routine.. "); -- cgit v1.2.3 From 3d4422332711ef48ef0f132f1fcbfcbd56c7f3d1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 26 Jun 2008 11:21:34 +0200 Subject: Add generic helpers for arch IPI function calls This adds kernel/smp.c which contains helpers for IPI function calls. In addition to supporting the existing smp_call_function() in a more efficient manner, it also adds a more scalable variant called smp_call_function_single() for calling a given function on a single CPU only. The core of this is based on the x86-64 patch from Nick Piggin, lots of changes since then. "Alan D. Brunelle" has contributed lots of fixes and suggestions as well. Also thanks to Paul E. McKenney for reviewing RCU usage and getting rid of the data allocation fallback deadlock. Acked-by: Ingo Molnar Reviewed-by: Paul E. McKenney Signed-off-by: Jens Axboe --- arch/Kconfig | 3 + arch/sparc64/kernel/smp.c | 11 +- include/linux/smp.h | 35 ++++- init/main.c | 2 + kernel/Makefile | 1 + kernel/smp.c | 383 ++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 428 insertions(+), 7 deletions(-) create mode 100644 kernel/smp.c (limited to 'include/linux') diff --git a/arch/Kconfig b/arch/Kconfig index 3ea332b009e5..ad89a33d8c6e 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -39,3 +39,6 @@ config HAVE_KRETPROBES config HAVE_DMA_ATTRS def_bool n + +config USE_GENERIC_SMP_HELPERS + def_bool n diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c index fa63c68a1819..b82d017a1744 100644 --- a/arch/sparc64/kernel/smp.c +++ b/arch/sparc64/kernel/smp.c @@ -816,8 +816,9 @@ extern unsigned long xcall_call_function; * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. 
*/ -static int smp_call_function_mask(void (*func)(void *info), void *info, - int nonatomic, int wait, cpumask_t mask) +static int sparc64_smp_call_function_mask(void (*func)(void *info), void *info, + int nonatomic, int wait, + cpumask_t mask) { struct call_data_struct data; int cpus; @@ -855,8 +856,8 @@ out_unlock: int smp_call_function(void (*func)(void *info), void *info, int nonatomic, int wait) { - return smp_call_function_mask(func, info, nonatomic, wait, - cpu_online_map); + return sparc64_smp_call_function_mask(func, info, nonatomic, wait, + cpu_online_map); } void smp_call_function_client(int irq, struct pt_regs *regs) @@ -893,7 +894,7 @@ static void tsb_sync(void *info) void smp_tsb_sync(struct mm_struct *mm) { - smp_call_function_mask(tsb_sync, mm, 0, 1, mm->cpu_vm_mask); + sparc64_smp_call_function_mask(tsb_sync, mm, 0, 1, mm->cpu_vm_mask); } extern unsigned long xcall_flush_tlb_mm; diff --git a/include/linux/smp.h b/include/linux/smp.h index 55232ccf9cfd..eac3e062250f 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -7,9 +7,19 @@ */ #include +#include +#include +#include extern void cpu_idle(void); +struct call_single_data { + struct list_head list; + void (*func) (void *info); + void *info; + unsigned int flags; +}; + #ifdef CONFIG_SMP #include @@ -53,9 +63,28 @@ extern void smp_cpus_done(unsigned int max_cpus); * Call a function on all other processors */ int smp_call_function(void(*func)(void *info), void *info, int retry, int wait); - +int smp_call_function_mask(cpumask_t mask, void(*func)(void *info), void *info, + int wait); int smp_call_function_single(int cpuid, void (*func) (void *info), void *info, int retry, int wait); +void __smp_call_function_single(int cpuid, struct call_single_data *data); + +/* + * Generic and arch helpers + */ +#ifdef CONFIG_USE_GENERIC_SMP_HELPERS +void generic_smp_call_function_single_interrupt(void); +void generic_smp_call_function_interrupt(void); +void init_call_single_data(void); +void ipi_call_lock(void); +void ipi_call_unlock(void); +void ipi_call_lock_irq(void); +void ipi_call_unlock_irq(void); +#else +static inline void init_call_single_data(void) +{ +} +#endif /* * Call a function on all processors @@ -112,7 +141,9 @@ static inline void smp_send_reschedule(int cpu) { } }) #define smp_call_function_mask(mask, func, info, wait) \ (up_smp_call_function(func, info)) - +static inline void init_call_single_data(void) +{ +} #endif /* !SMP */ /* diff --git a/init/main.c b/init/main.c index f7fb20021d48..1efcccff1bdb 100644 --- a/init/main.c +++ b/init/main.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -779,6 +780,7 @@ static void __init do_pre_smp_initcalls(void) { extern int spawn_ksoftirqd(void); + init_call_single_data(); migration_init(); spawn_ksoftirqd(); if (!nosoftlockup) diff --git a/kernel/Makefile b/kernel/Makefile index 1c9938addb9d..9fa57976f252 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -28,6 +28,7 @@ obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += cpu.o spinlock.o +obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_UID16) += uid16.o diff --git a/kernel/smp.c b/kernel/smp.c new file mode 100644 index 000000000000..f77b75c027ad --- /dev/null +++ b/kernel/smp.c @@ -0,0 +1,383 @@ +/* + * Generic helpers for smp ipi calls + * + * (C) Jens Axboe 2008 + * + */ 
+#include +#include +#include +#include +#include + +static DEFINE_PER_CPU(struct call_single_queue, call_single_queue); +static LIST_HEAD(call_function_queue); +__cacheline_aligned_in_smp DEFINE_SPINLOCK(call_function_lock); + +enum { + CSD_FLAG_WAIT = 0x01, + CSD_FLAG_ALLOC = 0x02, +}; + +struct call_function_data { + struct call_single_data csd; + spinlock_t lock; + unsigned int refs; + cpumask_t cpumask; + struct rcu_head rcu_head; +}; + +struct call_single_queue { + struct list_head list; + spinlock_t lock; +}; + +void __cpuinit init_call_single_data(void) +{ + int i; + + for_each_possible_cpu(i) { + struct call_single_queue *q = &per_cpu(call_single_queue, i); + + spin_lock_init(&q->lock); + INIT_LIST_HEAD(&q->list); + } +} + +static void csd_flag_wait(struct call_single_data *data) +{ + /* Wait for response */ + do { + /* + * We need to see the flags store in the IPI handler + */ + smp_mb(); + if (!(data->flags & CSD_FLAG_WAIT)) + break; + cpu_relax(); + } while (1); +} + +/* + * Insert a previously allocated call_single_data element for execution + * on the given CPU. data must already have ->func, ->info, and ->flags set. + */ +static void generic_exec_single(int cpu, struct call_single_data *data) +{ + struct call_single_queue *dst = &per_cpu(call_single_queue, cpu); + int wait = data->flags & CSD_FLAG_WAIT, ipi; + unsigned long flags; + + spin_lock_irqsave(&dst->lock, flags); + ipi = list_empty(&dst->list); + list_add_tail(&data->list, &dst->list); + spin_unlock_irqrestore(&dst->lock, flags); + + if (ipi) + arch_send_call_function_single_ipi(cpu); + + if (wait) + csd_flag_wait(data); +} + +static void rcu_free_call_data(struct rcu_head *head) +{ + struct call_function_data *data; + + data = container_of(head, struct call_function_data, rcu_head); + + kfree(data); +} + +/* + * Invoked by arch to handle an IPI for call function. Must be called with + * interrupts disabled. + */ +void generic_smp_call_function_interrupt(void) +{ + struct call_function_data *data; + int cpu = get_cpu(); + + /* + * It's ok to use list_for_each_rcu() here even though we may delete + * 'pos', since list_del_rcu() doesn't clear ->next + */ + rcu_read_lock(); + list_for_each_entry_rcu(data, &call_function_queue, csd.list) { + int refs; + + if (!cpu_isset(cpu, data->cpumask)) + continue; + + data->csd.func(data->csd.info); + + spin_lock(&data->lock); + cpu_clear(cpu, data->cpumask); + WARN_ON(data->refs == 0); + data->refs--; + refs = data->refs; + spin_unlock(&data->lock); + + if (refs) + continue; + + spin_lock(&call_function_lock); + list_del_rcu(&data->csd.list); + spin_unlock(&call_function_lock); + + if (data->csd.flags & CSD_FLAG_WAIT) { + /* + * serialize stores to data with the flag clear + * and wakeup + */ + smp_wmb(); + data->csd.flags &= ~CSD_FLAG_WAIT; + } else + call_rcu(&data->rcu_head, rcu_free_call_data); + } + rcu_read_unlock(); + + put_cpu(); +} + +/* + * Invoked by arch to handle an IPI for call function single. Must be called + * from the arch with interrupts disabled. 
+ */ +void generic_smp_call_function_single_interrupt(void) +{ + struct call_single_queue *q = &__get_cpu_var(call_single_queue); + LIST_HEAD(list); + + /* + * Need to see other stores to list head for checking whether + * list is empty without holding q->lock + */ + smp_mb(); + while (!list_empty(&q->list)) { + unsigned int data_flags; + + spin_lock(&q->lock); + list_replace_init(&q->list, &list); + spin_unlock(&q->lock); + + while (!list_empty(&list)) { + struct call_single_data *data; + + data = list_entry(list.next, struct call_single_data, + list); + list_del(&data->list); + + /* + * 'data' can be invalid after this call if + * flags == 0 (when called through + * generic_exec_single(), so save them away before + * making the call. + */ + data_flags = data->flags; + + data->func(data->info); + + if (data_flags & CSD_FLAG_WAIT) { + smp_wmb(); + data->flags &= ~CSD_FLAG_WAIT; + } else if (data_flags & CSD_FLAG_ALLOC) + kfree(data); + } + /* + * See comment on outer loop + */ + smp_mb(); + } +} + +/* + * smp_call_function_single - Run a function on a specific CPU + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @retry: Unused + * @wait: If true, wait until function has completed on other CPUs. + * + * Returns 0 on success, else a negative status code. Note that @wait + * will be implicitly turned on in case of allocation failures, since + * we fall back to on-stack allocation. + */ +int smp_call_function_single(int cpu, void (*func) (void *info), void *info, + int retry, int wait) +{ + struct call_single_data d; + unsigned long flags; + /* prevent preemption and reschedule on another processor */ + int me = get_cpu(); + + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); + + if (cpu == me) { + local_irq_save(flags); + func(info); + local_irq_restore(flags); + } else { + struct call_single_data *data = NULL; + + if (!wait) { + data = kmalloc(sizeof(*data), GFP_ATOMIC); + if (data) + data->flags = CSD_FLAG_ALLOC; + } + if (!data) { + data = &d; + data->flags = CSD_FLAG_WAIT; + } + + data->func = func; + data->info = info; + generic_exec_single(cpu, data); + } + + put_cpu(); + return 0; +} +EXPORT_SYMBOL(smp_call_function_single); + +/** + * __smp_call_function_single(): Run a function on another CPU + * @cpu: The CPU to run on. + * @data: Pre-allocated and setup data structure + * + * Like smp_call_function_single(), but allow caller to pass in a pre-allocated + * data structure. Useful for embedding @data inside other structures, for + * instance. + * + */ +void __smp_call_function_single(int cpu, struct call_single_data *data) +{ + /* Can deadlock when called with interrupts disabled */ + WARN_ON((data->flags & CSD_FLAG_WAIT) && irqs_disabled()); + + generic_exec_single(cpu, data); +} + +/** + * smp_call_function_mask(): Run a function on a set of other CPUs. + * @mask: The set of cpus to run on. + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @wait: If true, wait (atomically) until function has completed on other CPUs. + * + * Returns 0 on success, else a negative status code. + * + * If @wait is true, then returns once @func has returned. Note that @wait + * will be implicitly turned on in case of allocation failures, since + * we fall back to on-stack allocation. 
+ * + * You must not call this function with disabled interrupts or from a + * hardware interrupt handler or from a bottom half handler. Preemption + * must be disabled when calling this function. + */ +int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, + int wait) +{ + struct call_function_data d; + struct call_function_data *data = NULL; + cpumask_t allbutself; + unsigned long flags; + int cpu, num_cpus; + + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); + + cpu = smp_processor_id(); + allbutself = cpu_online_map; + cpu_clear(cpu, allbutself); + cpus_and(mask, mask, allbutself); + num_cpus = cpus_weight(mask); + + /* + * If zero CPUs, return. If just a single CPU, turn this request + * into a targetted single call instead since it's faster. + */ + if (!num_cpus) + return 0; + else if (num_cpus == 1) { + cpu = first_cpu(mask); + return smp_call_function_single(cpu, func, info, 0, wait); + } + + if (!wait) { + data = kmalloc(sizeof(*data), GFP_ATOMIC); + if (data) + data->csd.flags = CSD_FLAG_ALLOC; + } + if (!data) { + data = &d; + data->csd.flags = CSD_FLAG_WAIT; + } + + spin_lock_init(&data->lock); + data->csd.func = func; + data->csd.info = info; + data->refs = num_cpus; + data->cpumask = mask; + + spin_lock_irqsave(&call_function_lock, flags); + list_add_tail_rcu(&data->csd.list, &call_function_queue); + spin_unlock_irqrestore(&call_function_lock, flags); + + /* Send a message to all CPUs in the map */ + arch_send_call_function_ipi(mask); + + /* optionally wait for the CPUs to complete */ + if (wait) + csd_flag_wait(&data->csd); + + return 0; +} +EXPORT_SYMBOL(smp_call_function_mask); + +/** + * smp_call_function(): Run a function on all other CPUs. + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @natomic: Unused + * @wait: If true, wait (atomically) until function has completed on other CPUs. + * + * Returns 0 on success, else a negative status code. + * + * If @wait is true, then returns once @func has returned; otherwise + * it returns just before the target cpu calls @func. In case of allocation + * failure, @wait will be implicitly turned on. + * + * You must not call this function with disabled interrupts or from a + * hardware interrupt handler or from a bottom half handler. + */ +int smp_call_function(void (*func)(void *), void *info, int natomic, int wait) +{ + int ret; + + preempt_disable(); + ret = smp_call_function_mask(cpu_online_map, func, info, wait); + preempt_enable(); + return ret; +} +EXPORT_SYMBOL(smp_call_function); + +void ipi_call_lock(void) +{ + spin_lock(&call_function_lock); +} + +void ipi_call_unlock(void) +{ + spin_unlock(&call_function_lock); +} + +void ipi_call_lock_irq(void) +{ + spin_lock_irq(&call_function_lock); +} + +void ipi_call_unlock_irq(void) +{ + spin_unlock_irq(&call_function_lock); +} -- cgit v1.2.3 From 8691e5a8f691cc2a4fda0651e8d307aaba0e7d68 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 6 Jun 2008 11:18:06 +0200 Subject: smp_call_function: get rid of the unused nonatomic/retry argument It's never used and the comments refer to nonatomic and retry interchangably. So get rid of it. 
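With the extra argument gone, callers pass only the function, its
argument, and the wait flag. A minimal sketch of the new calling
convention (the callback and the chosen CPU below are illustrative,
not taken from the patch):

	#include <linux/smp.h>

	/* Runs on each target CPU from IPI context; must be fast
	 * and must not block. */
	static void drain_local(void *info)
	{
	}

	void drain_all_example(void)
	{
		/* was: smp_call_function(drain_local, NULL, 0, 1); */
		smp_call_function(drain_local, NULL, 1);

		/* was: smp_call_function_single(0, drain_local, NULL, 0, 1); */
		smp_call_function_single(0, drain_local, NULL, 1);
	}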
Acked-by: Jeremy Fitzhardinge Signed-off-by: Jens Axboe --- arch/alpha/kernel/core_marvel.c | 2 +- arch/alpha/kernel/smp.c | 6 +++--- arch/alpha/oprofile/common.c | 6 +++--- arch/arm/oprofile/op_model_mpcore.c | 2 +- arch/arm/vfp/vfpmodule.c | 2 +- arch/cris/arch-v32/kernel/smp.c | 5 ++--- arch/ia64/kernel/mca.c | 2 +- arch/ia64/kernel/palinfo.c | 2 +- arch/ia64/kernel/perfmon.c | 2 +- arch/ia64/kernel/process.c | 2 +- arch/ia64/kernel/smpboot.c | 2 +- arch/ia64/kernel/uncached.c | 5 ++--- arch/ia64/sn/kernel/sn2/sn_hwperf.c | 2 +- arch/m32r/kernel/smp.c | 4 ++-- arch/mips/kernel/smp.c | 4 ++-- arch/mips/mm/c-r4k.c | 18 +++++++++--------- arch/mips/pmc-sierra/yosemite/prom.c | 2 +- arch/mips/sibyte/cfe/setup.c | 2 +- arch/mips/sibyte/sb1250/prom.c | 2 +- arch/powerpc/kernel/smp.c | 2 +- arch/s390/appldata/appldata_base.c | 4 ++-- arch/s390/kernel/smp.c | 16 ++++++---------- arch/s390/kernel/time.c | 4 ++-- arch/sh/kernel/smp.c | 10 +++++----- arch/sparc64/kernel/smp.c | 12 ++++-------- arch/um/kernel/smp.c | 3 +-- arch/x86/kernel/cpu/mtrr/main.c | 4 ++-- arch/x86/kernel/cpuid.c | 2 +- arch/x86/kernel/ldt.c | 2 +- arch/x86/kernel/nmi_32.c | 2 +- arch/x86/kernel/nmi_64.c | 2 +- arch/x86/kernel/smp.c | 2 +- arch/x86/kernel/vsyscall_64.c | 2 +- arch/x86/kvm/vmx.c | 2 +- arch/x86/kvm/x86.c | 2 +- arch/x86/lib/msr-on-cpu.c | 8 ++++---- arch/x86/mach-voyager/voyager_smp.c | 2 +- arch/x86/xen/smp.c | 2 +- drivers/acpi/processor_idle.c | 2 +- drivers/cpuidle/cpuidle.c | 2 +- include/asm-alpha/smp.h | 2 +- include/asm-sparc/smp.h | 2 +- include/linux/smp.h | 8 ++++---- kernel/smp.c | 6 ++---- kernel/softirq.c | 2 +- kernel/time/tick-broadcast.c | 2 +- net/core/flow.c | 2 +- net/iucv/iucv.c | 14 +++++++------- virt/kvm/kvm_main.c | 6 +++--- 49 files changed, 95 insertions(+), 108 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/kernel/core_marvel.c b/arch/alpha/kernel/core_marvel.c index ced4aae8b804..04dcc5e5d4c1 100644 --- a/arch/alpha/kernel/core_marvel.c +++ b/arch/alpha/kernel/core_marvel.c @@ -662,7 +662,7 @@ __marvel_rtc_io(u8 b, unsigned long addr, int write) if (smp_processor_id() != boot_cpuid) smp_call_function_single(boot_cpuid, __marvel_access_rtc, - &rtc_access, 1, 1); + &rtc_access, 1); else __marvel_access_rtc(&rtc_access); #else diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c index 95c905be9154..44114c8dbb2a 100644 --- a/arch/alpha/kernel/smp.c +++ b/arch/alpha/kernel/smp.c @@ -710,7 +710,7 @@ flush_tlb_mm(struct mm_struct *mm) } } - if (smp_call_function(ipi_flush_tlb_mm, mm, 1, 1)) { + if (smp_call_function(ipi_flush_tlb_mm, mm, 1)) { printk(KERN_CRIT "flush_tlb_mm: timed out\n"); } @@ -763,7 +763,7 @@ flush_tlb_page(struct vm_area_struct *vma, unsigned long addr) data.mm = mm; data.addr = addr; - if (smp_call_function(ipi_flush_tlb_page, &data, 1, 1)) { + if (smp_call_function(ipi_flush_tlb_page, &data, 1)) { printk(KERN_CRIT "flush_tlb_page: timed out\n"); } @@ -815,7 +815,7 @@ flush_icache_user_range(struct vm_area_struct *vma, struct page *page, } } - if (smp_call_function(ipi_flush_icache_page, mm, 1, 1)) { + if (smp_call_function(ipi_flush_icache_page, mm, 1)) { printk(KERN_CRIT "flush_icache_page: timed out\n"); } diff --git a/arch/alpha/oprofile/common.c b/arch/alpha/oprofile/common.c index 9fc0eeb4f0ab..7c3d5ec6ec67 100644 --- a/arch/alpha/oprofile/common.c +++ b/arch/alpha/oprofile/common.c @@ -65,7 +65,7 @@ op_axp_setup(void) model->reg_setup(®, ctr, &sys); /* Configure the registers on all cpus. 
*/ - (void)smp_call_function(model->cpu_setup, ®, 0, 1); + (void)smp_call_function(model->cpu_setup, ®, 1); model->cpu_setup(®); return 0; } @@ -86,7 +86,7 @@ op_axp_cpu_start(void *dummy) static int op_axp_start(void) { - (void)smp_call_function(op_axp_cpu_start, NULL, 0, 1); + (void)smp_call_function(op_axp_cpu_start, NULL, 1); op_axp_cpu_start(NULL); return 0; } @@ -101,7 +101,7 @@ op_axp_cpu_stop(void *dummy) static void op_axp_stop(void) { - (void)smp_call_function(op_axp_cpu_stop, NULL, 0, 1); + (void)smp_call_function(op_axp_cpu_stop, NULL, 1); op_axp_cpu_stop(NULL); } diff --git a/arch/arm/oprofile/op_model_mpcore.c b/arch/arm/oprofile/op_model_mpcore.c index 74fae6045650..4458705021e0 100644 --- a/arch/arm/oprofile/op_model_mpcore.c +++ b/arch/arm/oprofile/op_model_mpcore.c @@ -201,7 +201,7 @@ static int em_call_function(int (*fn)(void)) data.ret = 0; preempt_disable(); - smp_call_function(em_func, &data, 1, 1); + smp_call_function(em_func, &data, 1); em_func(&data); preempt_enable(); diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c index 32455c633f1c..c0d2c9bb952b 100644 --- a/arch/arm/vfp/vfpmodule.c +++ b/arch/arm/vfp/vfpmodule.c @@ -352,7 +352,7 @@ static int __init vfp_init(void) else if (vfpsid & FPSID_NODOUBLE) { printk("no double precision support\n"); } else { - smp_call_function(vfp_enable, NULL, 1, 1); + smp_call_function(vfp_enable, NULL, 1); VFP_arch = (vfpsid & FPSID_ARCH_MASK) >> FPSID_ARCH_BIT; /* Extract the architecture version */ printk("implementor %02x architecture %d part %02x variant %x rev %x\n", diff --git a/arch/cris/arch-v32/kernel/smp.c b/arch/cris/arch-v32/kernel/smp.c index a9c3334e46c9..952a24b2f5a9 100644 --- a/arch/cris/arch-v32/kernel/smp.c +++ b/arch/cris/arch-v32/kernel/smp.c @@ -194,7 +194,7 @@ void stop_this_cpu(void* dummy) /* Other calls */ void smp_send_stop(void) { - smp_call_function(stop_this_cpu, NULL, 1, 0); + smp_call_function(stop_this_cpu, NULL, 0); } int setup_profiling_timer(unsigned int multiplier) @@ -316,8 +316,7 @@ int send_ipi(int vector, int wait, cpumask_t cpu_mask) * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. 
*/ -int smp_call_function(void (*func)(void *info), void *info, - int nonatomic, int wait) +int smp_call_function(void (*func)(void *info), void *info, int wait) { cpumask_t cpu_mask = CPU_MASK_ALL; struct call_data_struct data; diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index 705176b434b3..9cd818cc7008 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -1881,7 +1881,7 @@ static int __cpuinit mca_cpu_callback(struct notifier_block *nfb, case CPU_ONLINE: case CPU_ONLINE_FROZEN: smp_call_function_single(hotcpu, ia64_mca_cmc_vector_adjust, - NULL, 1, 0); + NULL, 0); break; } return NOTIFY_OK; diff --git a/arch/ia64/kernel/palinfo.c b/arch/ia64/kernel/palinfo.c index 9dc00f7fe10e..e5c57f413ca2 100644 --- a/arch/ia64/kernel/palinfo.c +++ b/arch/ia64/kernel/palinfo.c @@ -921,7 +921,7 @@ int palinfo_handle_smp(pal_func_cpu_u_t *f, char *page) /* will send IPI to other CPU and wait for completion of remote call */ - if ((ret=smp_call_function_single(f->req_cpu, palinfo_smp_call, &ptr, 0, 1))) { + if ((ret=smp_call_function_single(f->req_cpu, palinfo_smp_call, &ptr, 1))) { printk(KERN_ERR "palinfo: remote CPU call from %d to %d on function %d: " "error %d\n", smp_processor_id(), f->req_cpu, f->func_id, ret); return 0; diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 7714a97b0104..9baa48255c12 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -1820,7 +1820,7 @@ pfm_syswide_cleanup_other_cpu(pfm_context_t *ctx) int ret; DPRINT(("calling CPU%d for cleanup\n", ctx->ctx_cpu)); - ret = smp_call_function_single(ctx->ctx_cpu, pfm_syswide_force_stop, ctx, 0, 1); + ret = smp_call_function_single(ctx->ctx_cpu, pfm_syswide_force_stop, ctx, 1); DPRINT(("called CPU%d for cleanup ret=%d\n", ctx->ctx_cpu, ret)); } #endif /* CONFIG_SMP */ diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index a3a34b4eb038..fabaf08d9a69 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -286,7 +286,7 @@ void cpu_idle_wait(void) { smp_mb(); /* kick all the CPUs so that they exit out of pm_idle */ - smp_call_function(do_nothing, NULL, 0, 1); + smp_call_function(do_nothing, NULL, 1); } EXPORT_SYMBOL_GPL(cpu_idle_wait); diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c index eaa1b6795a13..9d1d429c6c59 100644 --- a/arch/ia64/kernel/smpboot.c +++ b/arch/ia64/kernel/smpboot.c @@ -317,7 +317,7 @@ ia64_sync_itc (unsigned int master) go[MASTER] = 1; - if (smp_call_function_single(master, sync_master, NULL, 1, 0) < 0) { + if (smp_call_function_single(master, sync_master, NULL, 0) < 0) { printk(KERN_ERR "sync_itc: failed to get attention of CPU %u!\n", master); return; } diff --git a/arch/ia64/kernel/uncached.c b/arch/ia64/kernel/uncached.c index e77995a6e3ed..8eff8c1d40a6 100644 --- a/arch/ia64/kernel/uncached.c +++ b/arch/ia64/kernel/uncached.c @@ -123,8 +123,7 @@ static int uncached_add_chunk(struct uncached_pool *uc_pool, int nid) status = ia64_pal_prefetch_visibility(PAL_VISIBILITY_PHYSICAL); if (status == PAL_VISIBILITY_OK_REMOTE_NEEDED) { atomic_set(&uc_pool->status, 0); - status = smp_call_function(uncached_ipi_visibility, uc_pool, - 0, 1); + status = smp_call_function(uncached_ipi_visibility, uc_pool, 1); if (status || atomic_read(&uc_pool->status)) goto failed; } else if (status != PAL_VISIBILITY_OK) @@ -146,7 +145,7 @@ static int uncached_add_chunk(struct uncached_pool *uc_pool, int nid) if (status != PAL_STATUS_SUCCESS) goto failed; atomic_set(&uc_pool->status, 0); - status = 
smp_call_function(uncached_ipi_mc_drain, uc_pool, 0, 1); + status = smp_call_function(uncached_ipi_mc_drain, uc_pool, 1); if (status || atomic_read(&uc_pool->status)) goto failed; diff --git a/arch/ia64/sn/kernel/sn2/sn_hwperf.c b/arch/ia64/sn/kernel/sn2/sn_hwperf.c index 8cc0c4753d89..636588e7e068 100644 --- a/arch/ia64/sn/kernel/sn2/sn_hwperf.c +++ b/arch/ia64/sn/kernel/sn2/sn_hwperf.c @@ -629,7 +629,7 @@ static int sn_hwperf_op_cpu(struct sn_hwperf_op_info *op_info) if (use_ipi) { /* use an interprocessor interrupt to call SAL */ smp_call_function_single(cpu, sn_hwperf_call_sal, - op_info, 1, 1); + op_info, 1); } else { /* migrate the task before calling SAL */ diff --git a/arch/m32r/kernel/smp.c b/arch/m32r/kernel/smp.c index 74eb7bcd5a40..7577f971ea4e 100644 --- a/arch/m32r/kernel/smp.c +++ b/arch/m32r/kernel/smp.c @@ -212,7 +212,7 @@ void smp_flush_tlb_all(void) local_irq_save(flags); __flush_tlb_all(); local_irq_restore(flags); - smp_call_function(flush_tlb_all_ipi, NULL, 1, 1); + smp_call_function(flush_tlb_all_ipi, NULL, 1); preempt_enable(); } @@ -505,7 +505,7 @@ void smp_invalidate_interrupt(void) *==========================================================================*/ void smp_send_stop(void) { - smp_call_function(stop_this_cpu, NULL, 1, 0); + smp_call_function(stop_this_cpu, NULL, 0); } /*==========================================================================* diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c index c75b26cb61df..7a9ae830be86 100644 --- a/arch/mips/kernel/smp.c +++ b/arch/mips/kernel/smp.c @@ -167,7 +167,7 @@ static void stop_this_cpu(void *dummy) void smp_send_stop(void) { - smp_call_function(stop_this_cpu, NULL, 1, 0); + smp_call_function(stop_this_cpu, NULL, 0); } void __init smp_cpus_done(unsigned int max_cpus) @@ -266,7 +266,7 @@ static void flush_tlb_mm_ipi(void *mm) static inline void smp_on_other_tlbs(void (*func) (void *info), void *info) { #ifndef CONFIG_MIPS_MT_SMTC - smp_call_function(func, info, 1, 1); + smp_call_function(func, info, 1); #endif } diff --git a/arch/mips/mm/c-r4k.c b/arch/mips/mm/c-r4k.c index 27096751ddce..71df3390c07b 100644 --- a/arch/mips/mm/c-r4k.c +++ b/arch/mips/mm/c-r4k.c @@ -43,12 +43,12 @@ * primary cache. 
*/ static inline void r4k_on_each_cpu(void (*func) (void *info), void *info, - int retry, int wait) + int wait) { preempt_disable(); #if !defined(CONFIG_MIPS_MT_SMP) && !defined(CONFIG_MIPS_MT_SMTC) - smp_call_function(func, info, retry, wait); + smp_call_function(func, info, wait); #endif func(info); preempt_enable(); @@ -350,7 +350,7 @@ static inline void local_r4k___flush_cache_all(void * args) static void r4k___flush_cache_all(void) { - r4k_on_each_cpu(local_r4k___flush_cache_all, NULL, 1, 1); + r4k_on_each_cpu(local_r4k___flush_cache_all, NULL, 1); } static inline int has_valid_asid(const struct mm_struct *mm) @@ -397,7 +397,7 @@ static void r4k_flush_cache_range(struct vm_area_struct *vma, int exec = vma->vm_flags & VM_EXEC; if (cpu_has_dc_aliases || (exec && !cpu_has_ic_fills_f_dc)) - r4k_on_each_cpu(local_r4k_flush_cache_range, vma, 1, 1); + r4k_on_each_cpu(local_r4k_flush_cache_range, vma, 1); } static inline void local_r4k_flush_cache_mm(void * args) @@ -429,7 +429,7 @@ static void r4k_flush_cache_mm(struct mm_struct *mm) if (!cpu_has_dc_aliases) return; - r4k_on_each_cpu(local_r4k_flush_cache_mm, mm, 1, 1); + r4k_on_each_cpu(local_r4k_flush_cache_mm, mm, 1); } struct flush_cache_page_args { @@ -521,7 +521,7 @@ static void r4k_flush_cache_page(struct vm_area_struct *vma, args.addr = addr; args.pfn = pfn; - r4k_on_each_cpu(local_r4k_flush_cache_page, &args, 1, 1); + r4k_on_each_cpu(local_r4k_flush_cache_page, &args, 1); } static inline void local_r4k_flush_data_cache_page(void * addr) @@ -535,7 +535,7 @@ static void r4k_flush_data_cache_page(unsigned long addr) local_r4k_flush_data_cache_page((void *)addr); else r4k_on_each_cpu(local_r4k_flush_data_cache_page, (void *) addr, - 1, 1); + 1); } struct flush_icache_range_args { @@ -571,7 +571,7 @@ static void r4k_flush_icache_range(unsigned long start, unsigned long end) args.start = start; args.end = end; - r4k_on_each_cpu(local_r4k_flush_icache_range, &args, 1, 1); + r4k_on_each_cpu(local_r4k_flush_icache_range, &args, 1); instruction_hazard(); } @@ -672,7 +672,7 @@ static void local_r4k_flush_cache_sigtramp(void * arg) static void r4k_flush_cache_sigtramp(unsigned long addr) { - r4k_on_each_cpu(local_r4k_flush_cache_sigtramp, (void *) addr, 1, 1); + r4k_on_each_cpu(local_r4k_flush_cache_sigtramp, (void *) addr, 1); } static void r4k_flush_icache_all(void) diff --git a/arch/mips/pmc-sierra/yosemite/prom.c b/arch/mips/pmc-sierra/yosemite/prom.c index 35dc435846a6..cf4c868715ac 100644 --- a/arch/mips/pmc-sierra/yosemite/prom.c +++ b/arch/mips/pmc-sierra/yosemite/prom.c @@ -64,7 +64,7 @@ static void prom_exit(void) #ifdef CONFIG_SMP if (smp_processor_id()) /* CPU 1 */ - smp_call_function(prom_cpu0_exit, NULL, 1, 1); + smp_call_function(prom_cpu0_exit, NULL, 1); #endif prom_cpu0_exit(NULL); } diff --git a/arch/mips/sibyte/cfe/setup.c b/arch/mips/sibyte/cfe/setup.c index 33fce826f8bf..fd9604d5555a 100644 --- a/arch/mips/sibyte/cfe/setup.c +++ b/arch/mips/sibyte/cfe/setup.c @@ -74,7 +74,7 @@ static void __noreturn cfe_linux_exit(void *arg) if (!reboot_smp) { /* Get CPU 0 to do the cfe_exit */ reboot_smp = 1; - smp_call_function(cfe_linux_exit, arg, 1, 0); + smp_call_function(cfe_linux_exit, arg, 0); } } else { printk("Passing control back to CFE...\n"); diff --git a/arch/mips/sibyte/sb1250/prom.c b/arch/mips/sibyte/sb1250/prom.c index cf8f6b3de86c..65b1af66b674 100644 --- a/arch/mips/sibyte/sb1250/prom.c +++ b/arch/mips/sibyte/sb1250/prom.c @@ -66,7 +66,7 @@ static void prom_linux_exit(void) { #ifdef CONFIG_SMP if (smp_processor_id()) { - 
smp_call_function(prom_cpu0_exit, NULL, 1, 1); + smp_call_function(prom_cpu0_exit, NULL, 1); } #endif while(1); diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 37a5ab410dcc..5191b46a611e 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -168,7 +168,7 @@ static void stop_this_cpu(void *dummy) void smp_send_stop(void) { - smp_call_function(stop_this_cpu, NULL, 0, 0); + smp_call_function(stop_this_cpu, NULL, 0); } extern struct gettimeofday_struct do_gtod; diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c index ad40729bec3d..837a3b3e7759 100644 --- a/arch/s390/appldata/appldata_base.c +++ b/arch/s390/appldata/appldata_base.c @@ -209,7 +209,7 @@ __appldata_vtimer_setup(int cmd) per_cpu(appldata_timer, i).expires = per_cpu_interval; smp_call_function_single(i, add_virt_timer_periodic, &per_cpu(appldata_timer, i), - 0, 1); + 1); } appldata_timer_active = 1; P_INFO("Monitoring timer started.\n"); @@ -236,7 +236,7 @@ __appldata_vtimer_setup(int cmd) args.timer = &per_cpu(appldata_timer, i); args.expires = per_cpu_interval; smp_call_function_single(i, __appldata_mod_vtimer_wrap, - &args, 0, 1); + &args, 1); } } } diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 5d4fa4b1c74c..276b105fb2a4 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -109,7 +109,7 @@ static void do_call_function(void) } static void __smp_call_function_map(void (*func) (void *info), void *info, - int nonatomic, int wait, cpumask_t map) + int wait, cpumask_t map) { struct call_data_struct data; int cpu, local = 0; @@ -162,7 +162,6 @@ out: * smp_call_function: * @func: the function to run; this must be fast and non-blocking * @info: an arbitrary pointer to pass to the function - * @nonatomic: unused * @wait: if true, wait (atomically) until function has completed on other CPUs * * Run a function on all other CPUs. @@ -170,15 +169,14 @@ out: * You must not call this function with disabled interrupts, from a * hardware interrupt handler or from a bottom half. */ -int smp_call_function(void (*func) (void *info), void *info, int nonatomic, - int wait) +int smp_call_function(void (*func) (void *info), void *info, int wait) { cpumask_t map; spin_lock(&call_lock); map = cpu_online_map; cpu_clear(smp_processor_id(), map); - __smp_call_function_map(func, info, nonatomic, wait, map); + __smp_call_function_map(func, info, wait, map); spin_unlock(&call_lock); return 0; } @@ -189,7 +187,6 @@ EXPORT_SYMBOL(smp_call_function); * @cpu: the CPU where func should run * @func: the function to run; this must be fast and non-blocking * @info: an arbitrary pointer to pass to the function - * @nonatomic: unused * @wait: if true, wait (atomically) until function has completed on other CPUs * * Run a function on one processor. @@ -198,11 +195,10 @@ EXPORT_SYMBOL(smp_call_function); * hardware interrupt handler or from a bottom half. 
*/ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, - int nonatomic, int wait) + int wait) { spin_lock(&call_lock); - __smp_call_function_map(func, info, nonatomic, wait, - cpumask_of_cpu(cpu)); + __smp_call_function_map(func, info, wait, cpumask_of_cpu(cpu)); spin_unlock(&call_lock); return 0; } @@ -228,7 +224,7 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, { spin_lock(&call_lock); cpu_clear(smp_processor_id(), mask); - __smp_call_function_map(func, info, 0, wait, mask); + __smp_call_function_map(func, info, wait, mask); spin_unlock(&call_lock); return 0; } diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index 7aec676fefd5..bf7bf2c2236a 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -690,7 +690,7 @@ static int etr_sync_clock(struct etr_aib *aib, int port) */ memset(&etr_sync, 0, sizeof(etr_sync)); preempt_disable(); - smp_call_function(etr_sync_cpu_start, NULL, 0, 0); + smp_call_function(etr_sync_cpu_start, NULL, 0); local_irq_disable(); etr_enable_sync_clock(); @@ -729,7 +729,7 @@ static int etr_sync_clock(struct etr_aib *aib, int port) rc = -EAGAIN; } local_irq_enable(); - smp_call_function(etr_sync_cpu_end,NULL,0,0); + smp_call_function(etr_sync_cpu_end,NULL,0); preempt_enable(); return rc; } diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c index 2ed8dceb297b..71781ba2675b 100644 --- a/arch/sh/kernel/smp.c +++ b/arch/sh/kernel/smp.c @@ -168,7 +168,7 @@ static void stop_this_cpu(void *unused) void smp_send_stop(void) { - smp_call_function(stop_this_cpu, 0, 1, 0); + smp_call_function(stop_this_cpu, 0, 0); } void arch_send_call_function_ipi(cpumask_t mask) @@ -223,7 +223,7 @@ void flush_tlb_mm(struct mm_struct *mm) preempt_disable(); if ((atomic_read(&mm->mm_users) != 1) || (current->mm != mm)) { - smp_call_function(flush_tlb_mm_ipi, (void *)mm, 1, 1); + smp_call_function(flush_tlb_mm_ipi, (void *)mm, 1); } else { int i; for (i = 0; i < num_online_cpus(); i++) @@ -260,7 +260,7 @@ void flush_tlb_range(struct vm_area_struct *vma, fd.vma = vma; fd.addr1 = start; fd.addr2 = end; - smp_call_function(flush_tlb_range_ipi, (void *)&fd, 1, 1); + smp_call_function(flush_tlb_range_ipi, (void *)&fd, 1); } else { int i; for (i = 0; i < num_online_cpus(); i++) @@ -303,7 +303,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long page) fd.vma = vma; fd.addr1 = page; - smp_call_function(flush_tlb_page_ipi, (void *)&fd, 1, 1); + smp_call_function(flush_tlb_page_ipi, (void *)&fd, 1); } else { int i; for (i = 0; i < num_online_cpus(); i++) @@ -327,6 +327,6 @@ void flush_tlb_one(unsigned long asid, unsigned long vaddr) fd.addr1 = asid; fd.addr2 = vaddr; - smp_call_function(flush_tlb_one_ipi, (void *)&fd, 1, 1); + smp_call_function(flush_tlb_one_ipi, (void *)&fd, 1); local_flush_tlb_one(asid, vaddr); } diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c index b82d017a1744..c099d96f1239 100644 --- a/arch/sparc64/kernel/smp.c +++ b/arch/sparc64/kernel/smp.c @@ -807,7 +807,6 @@ extern unsigned long xcall_call_function; * smp_call_function(): Run a function on all other CPUs. * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. - * @nonatomic: currently unused. * @wait: If true, wait (atomically) until function has completed on other CPUs. * * Returns 0 on success, else a negative status code. 
Does not return until @@ -817,8 +816,7 @@ extern unsigned long xcall_call_function; * hardware interrupt handler or from a bottom half handler. */ static int sparc64_smp_call_function_mask(void (*func)(void *info), void *info, - int nonatomic, int wait, - cpumask_t mask) + int wait, cpumask_t mask) { struct call_data_struct data; int cpus; @@ -853,11 +851,9 @@ out_unlock: return 0; } -int smp_call_function(void (*func)(void *info), void *info, - int nonatomic, int wait) +int smp_call_function(void (*func)(void *info), void *info, int wait) { - return sparc64_smp_call_function_mask(func, info, nonatomic, wait, - cpu_online_map); + return sparc64_smp_call_function_mask(func, info, wait, cpu_online_map); } void smp_call_function_client(int irq, struct pt_regs *regs) @@ -894,7 +890,7 @@ static void tsb_sync(void *info) void smp_tsb_sync(struct mm_struct *mm) { - sparc64_smp_call_function_mask(tsb_sync, mm, 0, 1, mm->cpu_vm_mask); + sparc64_smp_call_function_mask(tsb_sync, mm, 1, mm->cpu_vm_mask); } extern unsigned long xcall_flush_tlb_mm; diff --git a/arch/um/kernel/smp.c b/arch/um/kernel/smp.c index e1062ec36d40..be2d50c3aa95 100644 --- a/arch/um/kernel/smp.c +++ b/arch/um/kernel/smp.c @@ -214,8 +214,7 @@ void smp_call_function_slave(int cpu) atomic_inc(&scf_finished); } -int smp_call_function(void (*_func)(void *info), void *_info, int nonatomic, - int wait) +int smp_call_function(void (*_func)(void *info), void *_info, int wait) { int cpus = num_online_cpus() - 1; int i; diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 6a1e278d9323..290652cefddb 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -222,7 +222,7 @@ static void set_mtrr(unsigned int reg, unsigned long base, atomic_set(&data.gate,0); /* Start the ball rolling on other CPUs */ - if (smp_call_function(ipi_handler, &data, 1, 0) != 0) + if (smp_call_function(ipi_handler, &data, 0) != 0) panic("mtrr: timed out waiting for other CPUs\n"); local_irq_save(flags); @@ -822,7 +822,7 @@ void mtrr_ap_init(void) */ void mtrr_save_state(void) { - smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1, 1); + smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1); } static int __init mtrr_init_finialize(void) diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index daff52a62248..336dd43c9158 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -95,7 +95,7 @@ static ssize_t cpuid_read(struct file *file, char __user *buf, for (; count; count -= 16) { cmd.eax = pos; cmd.ecx = pos >> 32; - smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1, 1); + smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1); if (copy_to_user(tmp, &cmd, 16)) return -EFAULT; tmp += 16; diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 0224c3637c73..cb0a6398c64b 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -68,7 +68,7 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) load_LDT(pc); mask = cpumask_of_cpu(smp_processor_id()); if (!cpus_equal(current->mm->cpu_vm_mask, mask)) - smp_call_function(flush_ldt, NULL, 1, 1); + smp_call_function(flush_ldt, NULL, 1); preempt_enable(); #else load_LDT(pc); diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c index 84160f74eeb0..5562dab0bd20 100644 --- a/arch/x86/kernel/nmi_32.c +++ b/arch/x86/kernel/nmi_32.c @@ -87,7 +87,7 @@ int __init check_nmi_watchdog(void) #ifdef CONFIG_SMP if (nmi_watchdog == NMI_LOCAL_APIC) - smp_call_function(nmi_cpu_busy, (void 
*)&endflag, 0, 0); + smp_call_function(nmi_cpu_busy, (void *)&endflag, 0); #endif for_each_possible_cpu(cpu) diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c index 5a29ded994fa..2f1e4f503c9e 100644 --- a/arch/x86/kernel/nmi_64.c +++ b/arch/x86/kernel/nmi_64.c @@ -96,7 +96,7 @@ int __init check_nmi_watchdog(void) #ifdef CONFIG_SMP if (nmi_watchdog == NMI_LOCAL_APIC) - smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0); + smp_call_function(nmi_cpu_busy, (void *)&endflag, 0); #endif for (cpu = 0; cpu < NR_CPUS; cpu++) diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 575aa3d7248a..56546e8a13ac 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -164,7 +164,7 @@ static void native_smp_send_stop(void) if (reboot_force) return; - smp_call_function(stop_this_cpu, NULL, 0, 0); + smp_call_function(stop_this_cpu, NULL, 0); local_irq_save(flags); disable_local_APIC(); local_irq_restore(flags); diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 61efa2f7d564..0a03d57f9b3b 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -278,7 +278,7 @@ cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg) { long cpu = (long)arg; if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) - smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1); + smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1); return NOTIFY_DONE; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 540e95179074..5534fe59b5fc 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -335,7 +335,7 @@ static void vcpu_clear(struct vcpu_vmx *vmx) { if (vmx->vcpu.cpu == -1) return; - smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 0, 1); + smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1); vmx->launched = 0; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 63a77caa59f1..0faa2546b1cd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4044,6 +4044,6 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) * So need not to call smp_call_function_single() in that case. 
*/ if (vcpu->guest_mode && vcpu->cpu != cpu) - smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0); + smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0); put_cpu(); } diff --git a/arch/x86/lib/msr-on-cpu.c b/arch/x86/lib/msr-on-cpu.c index 57d043fa893e..d5a2b39f882b 100644 --- a/arch/x86/lib/msr-on-cpu.c +++ b/arch/x86/lib/msr-on-cpu.c @@ -30,10 +30,10 @@ static int _rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h, int safe) rv.msr_no = msr_no; if (safe) { - smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 0, 1); + smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1); err = rv.err; } else { - smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 0, 1); + smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1); } *l = rv.l; *h = rv.h; @@ -64,10 +64,10 @@ static int _wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h, int safe) rv.l = l; rv.h = h; if (safe) { - smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 0, 1); + smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1); err = rv.err; } else { - smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 0, 1); + smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1); } return err; diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c index cb34407a9930..04f596eab749 100644 --- a/arch/x86/mach-voyager/voyager_smp.c +++ b/arch/x86/mach-voyager/voyager_smp.c @@ -1113,7 +1113,7 @@ int safe_smp_processor_id(void) /* broadcast a halt to all other CPUs */ static void voyager_smp_send_stop(void) { - smp_call_function(smp_stop_cpu_function, NULL, 1, 1); + smp_call_function(smp_stop_cpu_function, NULL, 1); } /* this function is triggered in time.c when a clock tick fires diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index b3786e749b8e..a1651d029ea8 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -331,7 +331,7 @@ static void stop_self(void *v) void xen_smp_send_stop(void) { - smp_call_function(stop_self, NULL, 0, 0); + smp_call_function(stop_self, NULL, 0); } void xen_smp_send_reschedule(int cpu) diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 556ee1585192..4976e5db2b3f 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -1339,7 +1339,7 @@ static void smp_callback(void *v) static int acpi_processor_latency_notify(struct notifier_block *b, unsigned long l, void *v) { - smp_call_function(smp_callback, NULL, 0, 1); + smp_call_function(smp_callback, NULL, 1); return NOTIFY_OK; } diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 23554b676d6e..5405769020a1 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -340,7 +340,7 @@ static void smp_callback(void *v) static int cpuidle_latency_notify(struct notifier_block *b, unsigned long l, void *v) { - smp_call_function(smp_callback, NULL, 0, 1); + smp_call_function(smp_callback, NULL, 1); return NOTIFY_OK; } diff --git a/include/asm-alpha/smp.h b/include/asm-alpha/smp.h index 2f60a362d75e..544c69af8168 100644 --- a/include/asm-alpha/smp.h +++ b/include/asm-alpha/smp.h @@ -53,7 +53,7 @@ extern void arch_send_call_function_ipi(cpumask_t mask); #else /* CONFIG_SMP */ #define hard_smp_processor_id() 0 -#define smp_call_function_on_cpu(func,info,retry,wait,cpu) ({ 0; }) +#define smp_call_function_on_cpu(func,info,wait,cpu) ({ 0; }) #endif /* CONFIG_SMP */ diff --git a/include/asm-sparc/smp.h b/include/asm-sparc/smp.h index e6d561599726..b61e74bea06a 100644 --- a/include/asm-sparc/smp.h +++ b/include/asm-sparc/smp.h 
@@ -72,7 +72,7 @@ static inline void xc5(smpfunc_t func, unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { smp_cross_call(func, arg1, arg2, arg3, arg4, arg5); } -static inline int smp_call_function(void (*func)(void *info), void *info, int nonatomic, int wait) +static inline int smp_call_function(void (*func)(void *info), void *info, int wait) { xc1((smpfunc_t)func, (unsigned long)info); return 0; diff --git a/include/linux/smp.h b/include/linux/smp.h index eac3e062250f..338cad1b9548 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -62,11 +62,11 @@ extern void smp_cpus_done(unsigned int max_cpus); /* * Call a function on all other processors */ -int smp_call_function(void(*func)(void *info), void *info, int retry, int wait); +int smp_call_function(void(*func)(void *info), void *info, int wait); int smp_call_function_mask(cpumask_t mask, void(*func)(void *info), void *info, int wait); int smp_call_function_single(int cpuid, void (*func) (void *info), void *info, - int retry, int wait); + int wait); void __smp_call_function_single(int cpuid, struct call_single_data *data); /* @@ -119,7 +119,7 @@ static inline int up_smp_call_function(void (*func)(void *), void *info) { return 0; } -#define smp_call_function(func, info, retry, wait) \ +#define smp_call_function(func, info, wait) \ (up_smp_call_function(func, info)) #define on_each_cpu(func,info,retry,wait) \ ({ \ @@ -131,7 +131,7 @@ static inline int up_smp_call_function(void (*func)(void *), void *info) static inline void smp_send_reschedule(int cpu) { } #define num_booting_cpus() 1 #define smp_prepare_boot_cpu() do {} while (0) -#define smp_call_function_single(cpuid, func, info, retry, wait) \ +#define smp_call_function_single(cpuid, func, info, wait) \ ({ \ WARN_ON(cpuid != 0); \ local_irq_disable(); \ diff --git a/kernel/smp.c b/kernel/smp.c index f77b75c027ad..7e0432a4a0e2 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -195,7 +195,6 @@ void generic_smp_call_function_single_interrupt(void) * smp_call_function_single - Run a function on a specific CPU * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. - * @retry: Unused * @wait: If true, wait until function has completed on other CPUs. * * Returns 0 on success, else a negative status code. Note that @wait @@ -203,7 +202,7 @@ void generic_smp_call_function_single_interrupt(void) * we fall back to on-stack allocation. */ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, - int retry, int wait) + int wait) { struct call_single_data d; unsigned long flags; @@ -339,7 +338,6 @@ EXPORT_SYMBOL(smp_call_function_mask); * smp_call_function(): Run a function on all other CPUs. * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. - * @natomic: Unused * @wait: If true, wait (atomically) until function has completed on other CPUs. * * Returns 0 on success, else a negative status code. @@ -351,7 +349,7 @@ EXPORT_SYMBOL(smp_call_function_mask); * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. 
*/ -int smp_call_function(void (*func)(void *), void *info, int natomic, int wait) +int smp_call_function(void (*func)(void *), void *info, int wait) { int ret; diff --git a/kernel/softirq.c b/kernel/softirq.c index 36e061740047..d73afb4764ef 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -679,7 +679,7 @@ int on_each_cpu(void (*func) (void *info), void *info, int retry, int wait) int ret = 0; preempt_disable(); - ret = smp_call_function(func, info, retry, wait); + ret = smp_call_function(func, info, wait); local_irq_disable(); func(info); local_irq_enable(); diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 57a1f02e5ec0..75e718539dcb 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -266,7 +266,7 @@ void tick_broadcast_on_off(unsigned long reason, int *oncpu) "offline CPU #%d\n", *oncpu); else smp_call_function_single(*oncpu, tick_do_broadcast_on_off, - &reason, 1, 1); + &reason, 1); } /* diff --git a/net/core/flow.c b/net/core/flow.c index 19991175fdeb..5cf81052d044 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -298,7 +298,7 @@ void flow_cache_flush(void) init_completion(&info.completion); local_bh_disable(); - smp_call_function(flow_cache_flush_per_cpu, &info, 1, 0); + smp_call_function(flow_cache_flush_per_cpu, &info, 0); flow_cache_flush_tasklet((unsigned long)&info); local_bh_enable(); diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c index 918970762131..94d5a45c3a57 100644 --- a/net/iucv/iucv.c +++ b/net/iucv/iucv.c @@ -480,7 +480,7 @@ static void iucv_setmask_mp(void) if (cpu_isset(cpu, iucv_buffer_cpumask) && !cpu_isset(cpu, iucv_irq_cpumask)) smp_call_function_single(cpu, iucv_allow_cpu, - NULL, 0, 1); + NULL, 1); preempt_enable(); } @@ -498,7 +498,7 @@ static void iucv_setmask_up(void) cpumask = iucv_irq_cpumask; cpu_clear(first_cpu(iucv_irq_cpumask), cpumask); for_each_cpu_mask(cpu, cpumask) - smp_call_function_single(cpu, iucv_block_cpu, NULL, 0, 1); + smp_call_function_single(cpu, iucv_block_cpu, NULL, 1); } /** @@ -523,7 +523,7 @@ static int iucv_enable(void) rc = -EIO; preempt_disable(); for_each_online_cpu(cpu) - smp_call_function_single(cpu, iucv_declare_cpu, NULL, 0, 1); + smp_call_function_single(cpu, iucv_declare_cpu, NULL, 1); preempt_enable(); if (cpus_empty(iucv_buffer_cpumask)) /* No cpu could declare an iucv buffer. */ @@ -580,7 +580,7 @@ static int __cpuinit iucv_cpu_notify(struct notifier_block *self, case CPU_ONLINE_FROZEN: case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: - smp_call_function_single(cpu, iucv_declare_cpu, NULL, 0, 1); + smp_call_function_single(cpu, iucv_declare_cpu, NULL, 1); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: @@ -589,10 +589,10 @@ static int __cpuinit iucv_cpu_notify(struct notifier_block *self, if (cpus_empty(cpumask)) /* Can't offline last IUCV enabled cpu. */ return NOTIFY_BAD; - smp_call_function_single(cpu, iucv_retrieve_cpu, NULL, 0, 1); + smp_call_function_single(cpu, iucv_retrieve_cpu, NULL, 1); if (cpus_empty(iucv_irq_cpumask)) smp_call_function_single(first_cpu(iucv_buffer_cpumask), - iucv_allow_cpu, NULL, 0, 1); + iucv_allow_cpu, NULL, 1); break; } return NOTIFY_OK; @@ -652,7 +652,7 @@ static void iucv_cleanup_queue(void) * pending interrupts force them to the work queue by calling * an empty function on all cpus. 
*/ - smp_call_function(__iucv_cleanup_queue, NULL, 0, 1); + smp_call_function(__iucv_cleanup_queue, NULL, 1); spin_lock_irq(&iucv_queue_lock); list_for_each_entry_safe(p, n, &iucv_task_queue, list) { /* Remove stale work items from the task queue. */ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2d29e260da3d..ea1f595f8a87 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1266,12 +1266,12 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, case CPU_UP_CANCELED: printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", cpu); - smp_call_function_single(cpu, hardware_disable, NULL, 0, 1); + smp_call_function_single(cpu, hardware_disable, NULL, 1); break; case CPU_ONLINE: printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n", cpu); - smp_call_function_single(cpu, hardware_enable, NULL, 0, 1); + smp_call_function_single(cpu, hardware_enable, NULL, 1); break; } return NOTIFY_OK; @@ -1474,7 +1474,7 @@ int kvm_init(void *opaque, unsigned int vcpu_size, for_each_online_cpu(cpu) { smp_call_function_single(cpu, kvm_arch_check_processor_compat, - &r, 0, 1); + &r, 1); if (r < 0) goto out_free_1; } -- cgit v1.2.3 From 15c8b6c1aaaf1c4edd67e2f02e4d8e1bd1a51c0d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 9 May 2008 09:39:44 +0200 Subject: on_each_cpu(): kill unused 'retry' parameter It's not even passed on to smp_call_function() anymore, since that was removed. So kill it. Acked-by: Jeremy Fitzhardinge Reviewed-by: Paul E. McKenney Signed-off-by: Jens Axboe --- arch/alpha/kernel/process.c | 2 +- arch/alpha/kernel/smp.c | 4 ++-- arch/arm/kernel/smp.c | 6 +++--- arch/ia64/kernel/mca.c | 4 ++-- arch/ia64/kernel/perfmon.c | 4 ++-- arch/ia64/kernel/smp.c | 4 ++-- arch/mips/kernel/irq-rm9000.c | 4 ++-- arch/mips/kernel/smp.c | 4 ++-- arch/mips/oprofile/common.c | 6 +++--- arch/parisc/kernel/cache.c | 6 +++--- arch/parisc/kernel/smp.c | 2 +- arch/parisc/mm/init.c | 2 +- arch/powerpc/kernel/rtas.c | 2 +- arch/powerpc/kernel/tau_6xx.c | 4 ++-- arch/powerpc/kernel/time.c | 2 +- arch/powerpc/mm/slice.c | 2 +- arch/powerpc/oprofile/common.c | 6 +++--- arch/s390/kernel/smp.c | 6 +++--- arch/s390/kernel/time.c | 2 +- arch/sh/kernel/smp.c | 4 ++-- arch/sparc64/mm/hugetlbpage.c | 2 +- arch/x86/kernel/cpu/mcheck/mce_64.c | 6 +++--- arch/x86/kernel/cpu/mcheck/non-fatal.c | 2 +- arch/x86/kernel/cpu/perfctr-watchdog.c | 4 ++-- arch/x86/kernel/io_apic_32.c | 2 +- arch/x86/kernel/io_apic_64.c | 2 +- arch/x86/kernel/nmi_32.c | 4 ++-- arch/x86/kernel/nmi_64.c | 4 ++-- arch/x86/kernel/tlb_32.c | 2 +- arch/x86/kernel/tlb_64.c | 2 +- arch/x86/kernel/vsyscall_64.c | 2 +- arch/x86/kvm/vmx.c | 2 +- arch/x86/mach-voyager/voyager_smp.c | 2 +- arch/x86/mm/pageattr.c | 4 ++-- arch/x86/oprofile/nmi_int.c | 10 +++++----- drivers/char/agp/generic.c | 2 +- drivers/lguest/x86/core.c | 4 ++-- fs/buffer.c | 2 +- include/linux/smp.h | 4 ++-- kernel/hrtimer.c | 2 +- kernel/profile.c | 6 +++--- kernel/rcupdate.c | 2 +- kernel/softirq.c | 2 +- mm/page_alloc.c | 2 +- mm/slab.c | 4 ++-- mm/slub.c | 2 +- net/iucv/iucv.c | 2 +- virt/kvm/kvm_main.c | 8 ++++---- 48 files changed, 84 insertions(+), 84 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index 96ed82fd9eef..351407e07e71 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -160,7 +160,7 @@ common_shutdown(int mode, char *restart_cmd) struct halt_info args; args.mode = mode; args.restart_cmd = restart_cmd; - 
on_each_cpu(common_shutdown_1, &args, 1, 0); + on_each_cpu(common_shutdown_1, &args, 0); } void diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c index 44114c8dbb2a..83df541650fc 100644 --- a/arch/alpha/kernel/smp.c +++ b/arch/alpha/kernel/smp.c @@ -657,7 +657,7 @@ void smp_imb(void) { /* Must wait other processors to flush their icache before continue. */ - if (on_each_cpu(ipi_imb, NULL, 1, 1)) + if (on_each_cpu(ipi_imb, NULL, 1)) printk(KERN_CRIT "smp_imb: timed out\n"); } EXPORT_SYMBOL(smp_imb); @@ -673,7 +673,7 @@ flush_tlb_all(void) { /* Although we don't have any data to pass, we do want to synchronize with the other processors. */ - if (on_each_cpu(ipi_flush_tlb_all, NULL, 1, 1)) { + if (on_each_cpu(ipi_flush_tlb_all, NULL, 1)) { printk(KERN_CRIT "flush_tlb_all: timed out\n"); } } diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 6344466b2113..5a7c09564d13 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -604,7 +604,7 @@ static inline void ipi_flush_tlb_kernel_range(void *arg) void flush_tlb_all(void) { - on_each_cpu(ipi_flush_tlb_all, NULL, 1, 1); + on_each_cpu(ipi_flush_tlb_all, NULL, 1); } void flush_tlb_mm(struct mm_struct *mm) @@ -631,7 +631,7 @@ void flush_tlb_kernel_page(unsigned long kaddr) ta.ta_start = kaddr; - on_each_cpu(ipi_flush_tlb_kernel_page, &ta, 1, 1); + on_each_cpu(ipi_flush_tlb_kernel_page, &ta, 1); } void flush_tlb_range(struct vm_area_struct *vma, @@ -654,5 +654,5 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end) ta.ta_start = start; ta.ta_end = end; - on_each_cpu(ipi_flush_tlb_kernel_range, &ta, 1, 1); + on_each_cpu(ipi_flush_tlb_kernel_range, &ta, 1); } diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index 9cd818cc7008..7dd96c127177 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -707,7 +707,7 @@ ia64_mca_cmc_vector_enable (void *dummy) static void ia64_mca_cmc_vector_disable_keventd(struct work_struct *unused) { - on_each_cpu(ia64_mca_cmc_vector_disable, NULL, 1, 0); + on_each_cpu(ia64_mca_cmc_vector_disable, NULL, 0); } /* @@ -719,7 +719,7 @@ ia64_mca_cmc_vector_disable_keventd(struct work_struct *unused) static void ia64_mca_cmc_vector_enable_keventd(struct work_struct *unused) { - on_each_cpu(ia64_mca_cmc_vector_enable, NULL, 1, 0); + on_each_cpu(ia64_mca_cmc_vector_enable, NULL, 0); } /* diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 9baa48255c12..19d4493c6193 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -6508,7 +6508,7 @@ pfm_install_alt_pmu_interrupt(pfm_intr_handler_desc_t *hdl) } /* save the current system wide pmu states */ - ret = on_each_cpu(pfm_alt_save_pmu_state, NULL, 0, 1); + ret = on_each_cpu(pfm_alt_save_pmu_state, NULL, 1); if (ret) { DPRINT(("on_each_cpu() failed: %d\n", ret)); goto cleanup_reserve; @@ -6553,7 +6553,7 @@ pfm_remove_alt_pmu_interrupt(pfm_intr_handler_desc_t *hdl) pfm_alt_intr_handler = NULL; - ret = on_each_cpu(pfm_alt_restore_pmu_state, NULL, 0, 1); + ret = on_each_cpu(pfm_alt_restore_pmu_state, NULL, 1); if (ret) { DPRINT(("on_each_cpu() failed: %d\n", ret)); } diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c index 19152dcbf6e4..3676468612b6 100644 --- a/arch/ia64/kernel/smp.c +++ b/arch/ia64/kernel/smp.c @@ -285,7 +285,7 @@ smp_flush_tlb_cpumask(cpumask_t xcpumask) void smp_flush_tlb_all (void) { - on_each_cpu((void (*)(void *))local_flush_tlb_all, NULL, 1, 1); + on_each_cpu((void (*)(void *))local_flush_tlb_all, NULL, 1); } void @@ -308,7 +308,7 @@ 
smp_flush_tlb_mm (struct mm_struct *mm) * anyhow, and once a CPU is interrupted, the cost of local_flush_tlb_all() is * rather trivial. */ - on_each_cpu((void (*)(void *))local_finish_flush_tlb_mm, mm, 1, 1); + on_each_cpu((void (*)(void *))local_finish_flush_tlb_mm, mm, 1); } void arch_send_call_function_single_ipi(int cpu) diff --git a/arch/mips/kernel/irq-rm9000.c b/arch/mips/kernel/irq-rm9000.c index ed9febe63d72..b47e4615ec12 100644 --- a/arch/mips/kernel/irq-rm9000.c +++ b/arch/mips/kernel/irq-rm9000.c @@ -49,7 +49,7 @@ static void local_rm9k_perfcounter_irq_startup(void *args) static unsigned int rm9k_perfcounter_irq_startup(unsigned int irq) { - on_each_cpu(local_rm9k_perfcounter_irq_startup, (void *) irq, 0, 1); + on_each_cpu(local_rm9k_perfcounter_irq_startup, (void *) irq, 1); return 0; } @@ -66,7 +66,7 @@ static void local_rm9k_perfcounter_irq_shutdown(void *args) static void rm9k_perfcounter_irq_shutdown(unsigned int irq) { - on_each_cpu(local_rm9k_perfcounter_irq_shutdown, (void *) irq, 0, 1); + on_each_cpu(local_rm9k_perfcounter_irq_shutdown, (void *) irq, 1); } static struct irq_chip rm9k_irq_controller = { diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c index 7a9ae830be86..4410f172b8ab 100644 --- a/arch/mips/kernel/smp.c +++ b/arch/mips/kernel/smp.c @@ -246,7 +246,7 @@ static void flush_tlb_all_ipi(void *info) void flush_tlb_all(void) { - on_each_cpu(flush_tlb_all_ipi, NULL, 1, 1); + on_each_cpu(flush_tlb_all_ipi, NULL, 1); } static void flush_tlb_mm_ipi(void *mm) @@ -366,7 +366,7 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end) .addr2 = end, }; - on_each_cpu(flush_tlb_kernel_range_ipi, &fd, 1, 1); + on_each_cpu(flush_tlb_kernel_range_ipi, &fd, 1); } static void flush_tlb_page_ipi(void *info) diff --git a/arch/mips/oprofile/common.c b/arch/mips/oprofile/common.c index b5f6f71b27bc..dd2fbd6645c1 100644 --- a/arch/mips/oprofile/common.c +++ b/arch/mips/oprofile/common.c @@ -27,7 +27,7 @@ static int op_mips_setup(void) model->reg_setup(ctr); /* Configure the registers on all cpus. */ - on_each_cpu(model->cpu_setup, NULL, 0, 1); + on_each_cpu(model->cpu_setup, NULL, 1); return 0; } @@ -58,7 +58,7 @@ static int op_mips_create_files(struct super_block * sb, struct dentry * root) static int op_mips_start(void) { - on_each_cpu(model->cpu_start, NULL, 0, 1); + on_each_cpu(model->cpu_start, NULL, 1); return 0; } @@ -66,7 +66,7 @@ static int op_mips_start(void) static void op_mips_stop(void) { /* Disable performance monitoring for all counters. 
*/ - on_each_cpu(model->cpu_stop, NULL, 0, 1); + on_each_cpu(model->cpu_stop, NULL, 1); } int __init oprofile_arch_init(struct oprofile_operations *ops) diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c index e10d25d2d9c9..5259d8c20676 100644 --- a/arch/parisc/kernel/cache.c +++ b/arch/parisc/kernel/cache.c @@ -51,12 +51,12 @@ static struct pdc_btlb_info btlb_info __read_mostly; void flush_data_cache(void) { - on_each_cpu(flush_data_cache_local, NULL, 1, 1); + on_each_cpu(flush_data_cache_local, NULL, 1); } void flush_instruction_cache(void) { - on_each_cpu(flush_instruction_cache_local, NULL, 1, 1); + on_each_cpu(flush_instruction_cache_local, NULL, 1); } #endif @@ -515,7 +515,7 @@ static void cacheflush_h_tmp_function(void *dummy) void flush_cache_all(void) { - on_each_cpu(cacheflush_h_tmp_function, NULL, 1, 1); + on_each_cpu(cacheflush_h_tmp_function, NULL, 1); } void flush_cache_mm(struct mm_struct *mm) diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c index 126105c76a44..d47f3975c9c6 100644 --- a/arch/parisc/kernel/smp.c +++ b/arch/parisc/kernel/smp.c @@ -292,7 +292,7 @@ void arch_send_call_function_single_ipi(int cpu) void smp_flush_tlb_all(void) { - on_each_cpu(flush_tlb_all_local, NULL, 1, 1); + on_each_cpu(flush_tlb_all_local, NULL, 1); } /* diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index ce0da689a89d..b4d6c8777ed0 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -1053,7 +1053,7 @@ void flush_tlb_all(void) do_recycle++; } spin_unlock(&sid_lock); - on_each_cpu(flush_tlb_all_local, NULL, 1, 1); + on_each_cpu(flush_tlb_all_local, NULL, 1); if (do_recycle) { spin_lock(&sid_lock); recycle_sids(recycle_ndirty,recycle_dirty_array); diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 34843c318419..647f3e8677dc 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -747,7 +747,7 @@ static int rtas_ibm_suspend_me(struct rtas_args *args) /* Call function on all CPUs. 
One of us will make the * rtas call */ - if (on_each_cpu(rtas_percpu_suspend_me, &data, 1, 0)) + if (on_each_cpu(rtas_percpu_suspend_me, &data, 0)) data.error = -EINVAL; wait_for_completion(&done); diff --git a/arch/powerpc/kernel/tau_6xx.c b/arch/powerpc/kernel/tau_6xx.c index 368a4934f7ee..c3a56d65c5a9 100644 --- a/arch/powerpc/kernel/tau_6xx.c +++ b/arch/powerpc/kernel/tau_6xx.c @@ -192,7 +192,7 @@ static void tau_timeout_smp(unsigned long unused) /* schedule ourselves to be run again */ mod_timer(&tau_timer, jiffies + shrink_timer) ; - on_each_cpu(tau_timeout, NULL, 1, 0); + on_each_cpu(tau_timeout, NULL, 0); } /* @@ -234,7 +234,7 @@ int __init TAU_init(void) tau_timer.expires = jiffies + shrink_timer; add_timer(&tau_timer); - on_each_cpu(TAU_init_smp, NULL, 1, 0); + on_each_cpu(TAU_init_smp, NULL, 0); printk("Thermal assist unit "); #ifdef CONFIG_TAU_INT diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 73401e83739a..f1a38a6c1e2d 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -322,7 +322,7 @@ void snapshot_timebases(void) { if (!cpu_has_feature(CPU_FTR_PURR)) return; - on_each_cpu(snapshot_tb_and_purr, NULL, 0, 1); + on_each_cpu(snapshot_tb_and_purr, NULL, 1); } /* diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index ad928edafb0a..2bd12d965db1 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -218,7 +218,7 @@ static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psiz mb(); /* XXX this is sub-optimal but will do for now */ - on_each_cpu(slice_flush_segments, mm, 0, 1); + on_each_cpu(slice_flush_segments, mm, 1); #ifdef CONFIG_SPU_BASE spu_flush_all_slbs(mm); #endif diff --git a/arch/powerpc/oprofile/common.c b/arch/powerpc/oprofile/common.c index 4908dc98f9ca..17807acb05d9 100644 --- a/arch/powerpc/oprofile/common.c +++ b/arch/powerpc/oprofile/common.c @@ -65,7 +65,7 @@ static int op_powerpc_setup(void) /* Configure the registers on all cpus. If an error occurs on one * of the cpus, op_per_cpu_rc will be set to the error */ - on_each_cpu(op_powerpc_cpu_setup, NULL, 0, 1); + on_each_cpu(op_powerpc_cpu_setup, NULL, 1); out: if (op_per_cpu_rc) { /* error on setup release the performance counter hardware */ @@ -100,7 +100,7 @@ static int op_powerpc_start(void) if (model->global_start) return model->global_start(ctr); if (model->start) { - on_each_cpu(op_powerpc_cpu_start, NULL, 0, 1); + on_each_cpu(op_powerpc_cpu_start, NULL, 1); return op_per_cpu_rc; } return -EIO; /* No start function is defined for this @@ -115,7 +115,7 @@ static inline void op_powerpc_cpu_stop(void *dummy) static void op_powerpc_stop(void) { if (model->stop) - on_each_cpu(op_powerpc_cpu_stop, NULL, 0, 1); + on_each_cpu(op_powerpc_cpu_stop, NULL, 1); if (model->global_stop) model->global_stop(); } diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 276b105fb2a4..b6781030cfbd 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -299,7 +299,7 @@ static void smp_ptlb_callback(void *info) void smp_ptlb_all(void) { - on_each_cpu(smp_ptlb_callback, NULL, 0, 1); + on_each_cpu(smp_ptlb_callback, NULL, 1); } EXPORT_SYMBOL(smp_ptlb_all); #endif /* ! 
CONFIG_64BIT */ @@ -347,7 +347,7 @@ void smp_ctl_set_bit(int cr, int bit) memset(&parms.orvals, 0, sizeof(parms.orvals)); memset(&parms.andvals, 0xff, sizeof(parms.andvals)); parms.orvals[cr] = 1 << bit; - on_each_cpu(smp_ctl_bit_callback, &parms, 0, 1); + on_each_cpu(smp_ctl_bit_callback, &parms, 1); } EXPORT_SYMBOL(smp_ctl_set_bit); @@ -361,7 +361,7 @@ void smp_ctl_clear_bit(int cr, int bit) memset(&parms.orvals, 0, sizeof(parms.orvals)); memset(&parms.andvals, 0xff, sizeof(parms.andvals)); parms.andvals[cr] = ~(1L << bit); - on_each_cpu(smp_ctl_bit_callback, &parms, 0, 1); + on_each_cpu(smp_ctl_bit_callback, &parms, 1); } EXPORT_SYMBOL(smp_ctl_clear_bit); diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index bf7bf2c2236a..6037ed2b7471 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -909,7 +909,7 @@ static void etr_work_fn(struct work_struct *work) if (!eacr.ea) { /* Both ports offline. Reset everything. */ eacr.dp = eacr.es = eacr.sl = 0; - on_each_cpu(etr_disable_sync_clock, NULL, 0, 1); + on_each_cpu(etr_disable_sync_clock, NULL, 1); del_timer_sync(&etr_timer); etr_update_eacr(eacr); set_bit(ETR_FLAG_EACCES, &etr_flags); diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c index 71781ba2675b..60c50841143e 100644 --- a/arch/sh/kernel/smp.c +++ b/arch/sh/kernel/smp.c @@ -197,7 +197,7 @@ static void flush_tlb_all_ipi(void *info) void flush_tlb_all(void) { - on_each_cpu(flush_tlb_all_ipi, 0, 1, 1); + on_each_cpu(flush_tlb_all_ipi, 0, 1); } static void flush_tlb_mm_ipi(void *mm) @@ -284,7 +284,7 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end) fd.addr1 = start; fd.addr2 = end; - on_each_cpu(flush_tlb_kernel_range_ipi, (void *)&fd, 1, 1); + on_each_cpu(flush_tlb_kernel_range_ipi, (void *)&fd, 1); } static void flush_tlb_page_ipi(void *info) diff --git a/arch/sparc64/mm/hugetlbpage.c b/arch/sparc64/mm/hugetlbpage.c index 6cfab2e4d340..ebefd2a14375 100644 --- a/arch/sparc64/mm/hugetlbpage.c +++ b/arch/sparc64/mm/hugetlbpage.c @@ -344,7 +344,7 @@ void hugetlb_prefault_arch_hook(struct mm_struct *mm) * also executing in this address space. */ mm->context.sparc64_ctx_val = ctx; - on_each_cpu(context_reload, mm, 0, 0); + on_each_cpu(context_reload, mm, 0); } spin_unlock(&ctx_alloc_lock); } diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index e07e8c068ae0..43b7cb594912 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -363,7 +363,7 @@ static void mcheck_check_cpu(void *info) static void mcheck_timer(struct work_struct *work) { - on_each_cpu(mcheck_check_cpu, NULL, 1, 1); + on_each_cpu(mcheck_check_cpu, NULL, 1); /* * Alert userspace if needed. If we logged an MCE, reduce the @@ -612,7 +612,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, * Collect entries that were still getting written before the * synchronize. 
*/ - on_each_cpu(collect_tscs, cpu_tsc, 1, 1); + on_each_cpu(collect_tscs, cpu_tsc, 1); for (i = next; i < MCE_LOG_LEN; i++) { if (mcelog.entry[i].finished && mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { @@ -737,7 +737,7 @@ static void mce_restart(void) if (next_interval) cancel_delayed_work(&mcheck_work); /* Timer race is harmless here */ - on_each_cpu(mce_init, NULL, 1, 1); + on_each_cpu(mce_init, NULL, 1); next_interval = check_interval * HZ; if (next_interval) schedule_delayed_work(&mcheck_work, diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c index 00ccb6c14ec2..cc1fccdd31e0 100644 --- a/arch/x86/kernel/cpu/mcheck/non-fatal.c +++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c @@ -59,7 +59,7 @@ static DECLARE_DELAYED_WORK(mce_work, mce_work_fn); static void mce_work_fn(struct work_struct *work) { - on_each_cpu(mce_checkregs, NULL, 1, 1); + on_each_cpu(mce_checkregs, NULL, 1); schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); } diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index f9ae93adffe5..58043f06d7e2 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -180,7 +180,7 @@ void disable_lapic_nmi_watchdog(void) if (atomic_read(&nmi_active) <= 0) return; - on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1); + on_each_cpu(stop_apic_nmi_watchdog, NULL, 1); wd_ops->unreserve(); BUG_ON(atomic_read(&nmi_active) != 0); @@ -202,7 +202,7 @@ void enable_lapic_nmi_watchdog(void) return; } - on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1); + on_each_cpu(setup_apic_nmi_watchdog, NULL, 1); touch_nmi_watchdog(); } diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 4dc8600d9d20..720640ff36ca 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -1565,7 +1565,7 @@ void /*__init*/ print_local_APIC(void * dummy) void print_all_local_APICs (void) { - on_each_cpu(print_local_APIC, NULL, 1, 1); + on_each_cpu(print_local_APIC, NULL, 1); } void /*__init*/ print_PIC(void) diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index ef1a8dfcc529..4504c7f50012 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -1146,7 +1146,7 @@ void __apicdebuginit print_local_APIC(void * dummy) void print_all_local_APICs (void) { - on_each_cpu(print_local_APIC, NULL, 1, 1); + on_each_cpu(print_local_APIC, NULL, 1); } void __apicdebuginit print_PIC(void) diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c index 5562dab0bd20..11008e0857c0 100644 --- a/arch/x86/kernel/nmi_32.c +++ b/arch/x86/kernel/nmi_32.c @@ -218,7 +218,7 @@ static void __acpi_nmi_enable(void *__unused) void acpi_nmi_enable(void) { if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); + on_each_cpu(__acpi_nmi_enable, NULL, 1); } static void __acpi_nmi_disable(void *__unused) @@ -232,7 +232,7 @@ static void __acpi_nmi_disable(void *__unused) void acpi_nmi_disable(void) { if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); + on_each_cpu(__acpi_nmi_disable, NULL, 1); } void setup_apic_nmi_watchdog(void *unused) diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c index 2f1e4f503c9e..bbdcb17b3dfe 100644 --- a/arch/x86/kernel/nmi_64.c +++ b/arch/x86/kernel/nmi_64.c @@ -225,7 +225,7 @@ static void __acpi_nmi_enable(void *__unused) void acpi_nmi_enable(void) { if 
(atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); + on_each_cpu(__acpi_nmi_enable, NULL, 1); } static void __acpi_nmi_disable(void *__unused) @@ -239,7 +239,7 @@ static void __acpi_nmi_disable(void *__unused) void acpi_nmi_disable(void) { if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); + on_each_cpu(__acpi_nmi_disable, NULL, 1); } void setup_apic_nmi_watchdog(void *unused) diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c index 9bb2363851af..fec1ecedc9b7 100644 --- a/arch/x86/kernel/tlb_32.c +++ b/arch/x86/kernel/tlb_32.c @@ -238,6 +238,6 @@ static void do_flush_tlb_all(void *info) void flush_tlb_all(void) { - on_each_cpu(do_flush_tlb_all, NULL, 1, 1); + on_each_cpu(do_flush_tlb_all, NULL, 1); } diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c index a1f07d793202..184a367516d3 100644 --- a/arch/x86/kernel/tlb_64.c +++ b/arch/x86/kernel/tlb_64.c @@ -270,5 +270,5 @@ static void do_flush_tlb_all(void *info) void flush_tlb_all(void) { - on_each_cpu(do_flush_tlb_all, NULL, 1, 1); + on_each_cpu(do_flush_tlb_all, NULL, 1); } diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 0a03d57f9b3b..0dcae19ed627 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -301,7 +301,7 @@ static int __init vsyscall_init(void) #ifdef CONFIG_SYSCTL register_sysctl_table(kernel_root_table2); #endif - on_each_cpu(cpu_vsyscall_init, NULL, 0, 1); + on_each_cpu(cpu_vsyscall_init, NULL, 1); hotcpu_notifier(cpu_vsyscall_notifier, 0); return 0; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5534fe59b5fc..10ce6ee4c491 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2968,7 +2968,7 @@ static void vmx_free_vmcs(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); if (vmx->vmcs) { - on_each_cpu(__vcpu_clear, vmx, 0, 1); + on_each_cpu(__vcpu_clear, vmx, 1); free_vmcs(vmx->vmcs); vmx->vmcs = NULL; } diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c index 04f596eab749..abea08459a73 100644 --- a/arch/x86/mach-voyager/voyager_smp.c +++ b/arch/x86/mach-voyager/voyager_smp.c @@ -1072,7 +1072,7 @@ static void do_flush_tlb_all(void *info) /* flush the TLB of every active CPU in the system */ void flush_tlb_all(void) { - on_each_cpu(do_flush_tlb_all, 0, 1, 1); + on_each_cpu(do_flush_tlb_all, 0, 1); } /* used to set up the trampoline for other CPUs when the memory manager diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 60bcb5b6a37e..9b836ba9dedd 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -106,7 +106,7 @@ static void cpa_flush_all(unsigned long cache) { BUG_ON(irqs_disabled()); - on_each_cpu(__cpa_flush_all, (void *) cache, 1, 1); + on_each_cpu(__cpa_flush_all, (void *) cache, 1); } static void __cpa_flush_range(void *arg) @@ -127,7 +127,7 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache) BUG_ON(irqs_disabled()); WARN_ON(PAGE_ALIGN(start) != start); - on_each_cpu(__cpa_flush_range, NULL, 1, 1); + on_each_cpu(__cpa_flush_range, NULL, 1); if (!cache) return; diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index cc48d3fde545..3238ad32ffd8 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -218,8 +218,8 @@ static int nmi_setup(void) } } - on_each_cpu(nmi_save_registers, NULL, 0, 1); - on_each_cpu(nmi_cpu_setup, NULL, 0, 1); + on_each_cpu(nmi_save_registers, 
NULL, 1); + on_each_cpu(nmi_cpu_setup, NULL, 1); nmi_enabled = 1; return 0; } @@ -271,7 +271,7 @@ static void nmi_shutdown(void) { struct op_msrs *msrs = &__get_cpu_var(cpu_msrs); nmi_enabled = 0; - on_each_cpu(nmi_cpu_shutdown, NULL, 0, 1); + on_each_cpu(nmi_cpu_shutdown, NULL, 1); unregister_die_notifier(&profile_exceptions_nb); model->shutdown(msrs); free_msrs(); @@ -285,7 +285,7 @@ static void nmi_cpu_start(void *dummy) static int nmi_start(void) { - on_each_cpu(nmi_cpu_start, NULL, 0, 1); + on_each_cpu(nmi_cpu_start, NULL, 1); return 0; } @@ -297,7 +297,7 @@ static void nmi_cpu_stop(void *dummy) static void nmi_stop(void) { - on_each_cpu(nmi_cpu_stop, NULL, 0, 1); + on_each_cpu(nmi_cpu_stop, NULL, 1); } struct op_counter_config counter_config[OP_MAX_COUNTER]; diff --git a/drivers/char/agp/generic.c b/drivers/char/agp/generic.c index 564daaa6c7d0..eaa1a355bb32 100644 --- a/drivers/char/agp/generic.c +++ b/drivers/char/agp/generic.c @@ -1249,7 +1249,7 @@ static void ipi_handler(void *null) void global_cache_flush(void) { - if (on_each_cpu(ipi_handler, NULL, 1, 1) != 0) + if (on_each_cpu(ipi_handler, NULL, 1) != 0) panic(PFX "timed out waiting for the other CPUs!\n"); } EXPORT_SYMBOL(global_cache_flush); diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 2e554a4ab337..95dfda52b4f9 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -478,7 +478,7 @@ void __init lguest_arch_host_init(void) cpu_had_pge = 1; /* adjust_pge is a helper function which sets or unsets the PGE * bit on its CPU, depending on the argument (0 == unset). */ - on_each_cpu(adjust_pge, (void *)0, 0, 1); + on_each_cpu(adjust_pge, (void *)0, 1); /* Turn off the feature in the global feature set. */ clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); } @@ -493,7 +493,7 @@ void __exit lguest_arch_host_fini(void) if (cpu_had_pge) { set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); /* adjust_pge's argument "1" means set PGE. 
*/ - on_each_cpu(adjust_pge, (void *)1, 0, 1); + on_each_cpu(adjust_pge, (void *)1, 1); } put_online_cpus(); } diff --git a/fs/buffer.c b/fs/buffer.c index a073f3f4f013..5c23ef560d01 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1464,7 +1464,7 @@ static void invalidate_bh_lru(void *arg) void invalidate_bh_lrus(void) { - on_each_cpu(invalidate_bh_lru, NULL, 1, 1); + on_each_cpu(invalidate_bh_lru, NULL, 1); } EXPORT_SYMBOL_GPL(invalidate_bh_lrus); diff --git a/include/linux/smp.h b/include/linux/smp.h index 338cad1b9548..55261101d09a 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -89,7 +89,7 @@ static inline void init_call_single_data(void) /* * Call a function on all processors */ -int on_each_cpu(void (*func) (void *info), void *info, int retry, int wait); +int on_each_cpu(void (*func) (void *info), void *info, int wait); #define MSG_ALL_BUT_SELF 0x8000 /* Assume <32768 CPU's */ #define MSG_ALL 0x8001 @@ -121,7 +121,7 @@ static inline int up_smp_call_function(void (*func)(void *), void *info) } #define smp_call_function(func, info, wait) \ (up_smp_call_function(func, info)) -#define on_each_cpu(func,info,retry,wait) \ +#define on_each_cpu(func,info,wait) \ ({ \ local_irq_disable(); \ func(info); \ diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 421be5fe5cc7..50e8616d7955 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -623,7 +623,7 @@ static void retrigger_next_event(void *arg) void clock_was_set(void) { /* Retrigger the CPU local events everywhere */ - on_each_cpu(retrigger_next_event, NULL, 0, 1); + on_each_cpu(retrigger_next_event, NULL, 1); } /* diff --git a/kernel/profile.c b/kernel/profile.c index ae7ead82cbc9..58926411eb2a 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -252,7 +252,7 @@ static void profile_flip_buffers(void) mutex_lock(&profile_flip_mutex); j = per_cpu(cpu_profile_flip, get_cpu()); put_cpu(); - on_each_cpu(__profile_flip_buffers, NULL, 0, 1); + on_each_cpu(__profile_flip_buffers, NULL, 1); for_each_online_cpu(cpu) { struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j]; for (i = 0; i < NR_PROFILE_HIT; ++i) { @@ -275,7 +275,7 @@ static void profile_discard_flip_buffers(void) mutex_lock(&profile_flip_mutex); i = per_cpu(cpu_profile_flip, get_cpu()); put_cpu(); - on_each_cpu(__profile_flip_buffers, NULL, 0, 1); + on_each_cpu(__profile_flip_buffers, NULL, 1); for_each_online_cpu(cpu) { struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i]; memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit)); @@ -558,7 +558,7 @@ static int __init create_hash_tables(void) out_cleanup: prof_on = 0; smp_mb(); - on_each_cpu(profile_nop, NULL, 0, 1); + on_each_cpu(profile_nop, NULL, 1); for_each_online_cpu(cpu) { struct page *page; diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index c09605f8d16c..6addab5e6d88 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -127,7 +127,7 @@ void rcu_barrier(void) * until all the callbacks are queued. 
*/ rcu_read_lock(); - on_each_cpu(rcu_barrier_func, NULL, 0, 1); + on_each_cpu(rcu_barrier_func, NULL, 1); rcu_read_unlock(); wait_for_completion(&rcu_barrier_completion); mutex_unlock(&rcu_barrier_mutex); diff --git a/kernel/softirq.c b/kernel/softirq.c index d73afb4764ef..c159fd094772 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -674,7 +674,7 @@ __init int spawn_ksoftirqd(void) /* * Call a function on all processors */ -int on_each_cpu(void (*func) (void *info), void *info, int retry, int wait) +int on_each_cpu(void (*func) (void *info), void *info, int wait) { int ret = 0; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2f552955a02f..53242344a774 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -918,7 +918,7 @@ void drain_local_pages(void *arg) */ void drain_all_pages(void) { - on_each_cpu(drain_local_pages, NULL, 0, 1); + on_each_cpu(drain_local_pages, NULL, 1); } #ifdef CONFIG_HIBERNATION diff --git a/mm/slab.c b/mm/slab.c index 046607f05f3e..0772abb412b9 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2454,7 +2454,7 @@ static void drain_cpu_caches(struct kmem_cache *cachep) struct kmem_list3 *l3; int node; - on_each_cpu(do_drain, cachep, 1, 1); + on_each_cpu(do_drain, cachep, 1); check_irq_on(); for_each_online_node(node) { l3 = cachep->nodelists[node]; @@ -3939,7 +3939,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, } new->cachep = cachep; - on_each_cpu(do_ccupdate_local, (void *)new, 1, 1); + on_each_cpu(do_ccupdate_local, (void *)new, 1); check_irq_on(); cachep->batchcount = batchcount; diff --git a/mm/slub.c b/mm/slub.c index 0987d1cd943c..44715eb70c06 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1497,7 +1497,7 @@ static void flush_cpu_slab(void *d) static void flush_all(struct kmem_cache *s) { #ifdef CONFIG_SMP - on_each_cpu(flush_cpu_slab, s, 1, 1); + on_each_cpu(flush_cpu_slab, s, 1); #else unsigned long flags; diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c index 94d5a45c3a57..a178e27e7b1a 100644 --- a/net/iucv/iucv.c +++ b/net/iucv/iucv.c @@ -545,7 +545,7 @@ out: */ static void iucv_disable(void) { - on_each_cpu(iucv_retrieve_cpu, NULL, 0, 1); + on_each_cpu(iucv_retrieve_cpu, NULL, 1); kfree(iucv_path_table); } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ea1f595f8a87..d4eae6af0738 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1286,7 +1286,7 @@ static int kvm_reboot(struct notifier_block *notifier, unsigned long val, * in vmx root mode. 
*/ printk(KERN_INFO "kvm: exiting hardware virtualization\n"); - on_each_cpu(hardware_disable, NULL, 0, 1); + on_each_cpu(hardware_disable, NULL, 1); } return NOTIFY_OK; } @@ -1479,7 +1479,7 @@ int kvm_init(void *opaque, unsigned int vcpu_size, goto out_free_1; } - on_each_cpu(hardware_enable, NULL, 0, 1); + on_each_cpu(hardware_enable, NULL, 1); r = register_cpu_notifier(&kvm_cpu_notifier); if (r) goto out_free_2; @@ -1525,7 +1525,7 @@ out_free_3: unregister_reboot_notifier(&kvm_reboot_notifier); unregister_cpu_notifier(&kvm_cpu_notifier); out_free_2: - on_each_cpu(hardware_disable, NULL, 0, 1); + on_each_cpu(hardware_disable, NULL, 1); out_free_1: kvm_arch_hardware_unsetup(); out_free_0: @@ -1547,7 +1547,7 @@ void kvm_exit(void) sysdev_class_unregister(&kvm_sysdev_class); unregister_reboot_notifier(&kvm_reboot_notifier); unregister_cpu_notifier(&kvm_cpu_notifier); - on_each_cpu(hardware_disable, NULL, 0, 1); + on_each_cpu(hardware_disable, NULL, 1); kvm_arch_hardware_unsetup(); kvm_arch_exit(); kvm_exit_debug(); -- cgit v1.2.3 From 1bdad606338debc6384b2844f1b53cc436b3ac90 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 3 Jun 2008 14:09:53 +0100 Subject: [GFS2] Remove remote lock dropping code There are several reasons why this is undesirable: 1. It never happens during normal operation anyway 2. If it does happen it causes performance to be very, very poor 3. It isn't likely to solve the original problem (memory shortage on remote DLM node) it was supposed to solve 4. It uses a bunch of arbitrary constants which are unlikely to be correct for any particular situation and for which the tuning seems to be a black art. 5. In an N node cluster, only 1/N of the dropped locks will actually contribute to solving the problem on average. So all in all we are better off without it. This also makes merging the lock_dlm module into GFS2 a bit easier. Signed-off-by: Steven Whitehouse --- fs/gfs2/gfs2.h | 5 ----- fs/gfs2/glock.c | 12 +++--------- fs/gfs2/glock.h | 2 +- fs/gfs2/locking/dlm/lock_dlm.h | 3 --- fs/gfs2/locking/dlm/mount.c | 3 --- fs/gfs2/locking/dlm/sysfs.c | 13 ------------- fs/gfs2/locking/dlm/thread.c | 19 ------------------- fs/gfs2/ops_fstype.c | 2 +- fs/gfs2/ops_super.c | 2 +- fs/gfs2/sys.c | 14 -------------- include/linux/lm_interface.h | 4 ---- 11 files changed, 6 insertions(+), 73 deletions(-) (limited to 'include/linux') diff --git a/fs/gfs2/gfs2.h b/fs/gfs2/gfs2.h index 3bb11c0f8b56..ef606e3a5cf4 100644 --- a/fs/gfs2/gfs2.h +++ b/fs/gfs2/gfs2.h @@ -15,11 +15,6 @@ enum { CREATE = 1, }; -enum { - NO_WAIT = 0, - WAIT = 1, -}; - enum { NO_FORCE = 0, FORCE = 1, diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index be7ed503f012..8d5450f3c3ef 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1316,11 +1316,6 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data) wake_up_process(sdp->sd_recoverd_process); return; - case LM_CB_DROPLOCKS: - gfs2_gl_hash_clear(sdp, NO_WAIT); - gfs2_quota_scan(sdp); - return; - default: gfs2_assert_warn(sdp, 0); return; @@ -1508,11 +1503,10 @@ static void clear_glock(struct gfs2_glock *gl) * @sdp: the filesystem * @wait: wait until it's all gone * - * Called when unmounting the filesystem, or when inter-node lock manager - * requests DROPLOCKS because it is running out of capacity. + * Called when unmounting the filesystem.
*/ -void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait) +void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) { unsigned long t; unsigned int x; @@ -1527,7 +1521,7 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait) cont = 1; } - if (!wait || !cont) + if (!cont) break; if (time_after_eq(jiffies, diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 7389f8ef0a31..971d92af70fc 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -132,7 +132,7 @@ void gfs2_lvb_unhold(struct gfs2_glock *gl); void gfs2_glock_cb(void *cb_data, unsigned int type, void *data); void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl); void gfs2_reclaim_glock(struct gfs2_sbd *sdp); -void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait); +void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); int __init gfs2_glock_init(void); void gfs2_glock_exit(void); diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h index ad944c64eab1..845a27fd303e 100644 --- a/fs/gfs2/locking/dlm/lock_dlm.h +++ b/fs/gfs2/locking/dlm/lock_dlm.h @@ -79,9 +79,6 @@ struct gdlm_ls { wait_queue_head_t wait_control; struct task_struct *thread; wait_queue_head_t thread_wait; - unsigned long drop_time; - int drop_locks_count; - int drop_locks_period; }; enum { diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c index 0628520a445f..fa31c54c2e67 100644 --- a/fs/gfs2/locking/dlm/mount.c +++ b/fs/gfs2/locking/dlm/mount.c @@ -22,8 +22,6 @@ static struct gdlm_ls *init_gdlm(lm_callback_t cb, struct gfs2_sbd *sdp, if (!ls) return NULL; - ls->drop_locks_count = GDLM_DROP_COUNT; - ls->drop_locks_period = GDLM_DROP_PERIOD; ls->fscb = cb; ls->sdp = sdp; ls->fsflags = flags; @@ -33,7 +31,6 @@ static struct gdlm_ls *init_gdlm(lm_callback_t cb, struct gfs2_sbd *sdp, INIT_LIST_HEAD(&ls->all_locks); init_waitqueue_head(&ls->thread_wait); init_waitqueue_head(&ls->wait_control); - ls->drop_time = jiffies; ls->jid = -1; strncpy(buf, table_name, 256); diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c index a4ff271df9ee..4ec571c3d8a9 100644 --- a/fs/gfs2/locking/dlm/sysfs.c +++ b/fs/gfs2/locking/dlm/sysfs.c @@ -114,17 +114,6 @@ static ssize_t recover_status_show(struct gdlm_ls *ls, char *buf) return sprintf(buf, "%d\n", ls->recover_jid_status); } -static ssize_t drop_count_show(struct gdlm_ls *ls, char *buf) -{ - return sprintf(buf, "%d\n", ls->drop_locks_count); -} - -static ssize_t drop_count_store(struct gdlm_ls *ls, const char *buf, size_t len) -{ - ls->drop_locks_count = simple_strtol(buf, NULL, 0); - return len; -} - struct gdlm_attr { struct attribute attr; ssize_t (*show)(struct gdlm_ls *, char *); @@ -144,7 +133,6 @@ GDLM_ATTR(first_done, 0444, first_done_show, NULL); GDLM_ATTR(recover, 0644, recover_show, recover_store); GDLM_ATTR(recover_done, 0444, recover_done_show, NULL); GDLM_ATTR(recover_status, 0444, recover_status_show, NULL); -GDLM_ATTR(drop_count, 0644, drop_count_show, drop_count_store); static struct attribute *gdlm_attrs[] = { &gdlm_attr_proto_name.attr, @@ -157,7 +145,6 @@ static struct attribute *gdlm_attrs[] = { &gdlm_attr_recover.attr, &gdlm_attr_recover_done.attr, &gdlm_attr_recover_status.attr, - &gdlm_attr_drop_count.attr, NULL, }; diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c index f30350abd62f..38823efd698c 100644 --- a/fs/gfs2/locking/dlm/thread.c +++ b/fs/gfs2/locking/dlm/thread.c @@ -20,19 +20,6 @@ static inline int no_work(struct gdlm_ls *ls) return ret; } -static inline int check_drop(struct gdlm_ls *ls) -{ - if (!ls->drop_locks_count) - return 0; 
- - if (time_after(jiffies, ls->drop_time + ls->drop_locks_period * HZ)) { - ls->drop_time = jiffies; - if (ls->all_locks_count >= ls->drop_locks_count) - return 1; - } - return 0; -} - static int gdlm_thread(void *data) { struct gdlm_ls *ls = (struct gdlm_ls *) data; @@ -52,12 +39,6 @@ static int gdlm_thread(void *data) gdlm_do_lock(lp); spin_lock(&ls->async_lock); } - /* Does this ever happen these days? I hope not anyway */ - if (check_drop(ls)) { - spin_unlock(&ls->async_lock); - ls->fscb(ls->sdp, LM_CB_DROPLOCKS, NULL); - spin_lock(&ls->async_lock); - } spin_unlock(&ls->async_lock); } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 9bd97c5543bd..6ba69dd1a729 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -874,7 +874,7 @@ fail_sb: fail_locking: init_locking(sdp, &mount_gh, UNDO); fail_lm: - gfs2_gl_hash_clear(sdp, WAIT); + gfs2_gl_hash_clear(sdp); gfs2_lm_unmount(sdp); while (invalidate_inodes(sb)) yield(); diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c index 66907922109f..f66ea0f7a356 100644 --- a/fs/gfs2/ops_super.c +++ b/fs/gfs2/ops_super.c @@ -126,7 +126,7 @@ static void gfs2_put_super(struct super_block *sb) gfs2_clear_rgrpd(sdp); gfs2_jindex_free(sdp); /* Take apart glock structures and buffer lists */ - gfs2_gl_hash_clear(sdp, WAIT); + gfs2_gl_hash_clear(sdp); /* Unmount the locking protocol */ gfs2_lm_unmount(sdp); diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 9ab9fc85ecd0..6f7e2e5858e0 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -110,18 +110,6 @@ static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf, return len; } -static ssize_t shrink_store(struct gfs2_sbd *sdp, const char *buf, size_t len) -{ - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - - if (simple_strtol(buf, NULL, 0) != 1) - return -EINVAL; - - gfs2_gl_hash_clear(sdp, NO_WAIT); - return len; -} - static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf, size_t len) { @@ -175,7 +163,6 @@ static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store) GFS2_ATTR(id, 0444, id_show, NULL); GFS2_ATTR(fsname, 0444, fsname_show, NULL); GFS2_ATTR(freeze, 0644, freeze_show, freeze_store); -GFS2_ATTR(shrink, 0200, NULL, shrink_store); GFS2_ATTR(withdraw, 0644, withdraw_show, withdraw_store); GFS2_ATTR(statfs_sync, 0200, NULL, statfs_sync_store); GFS2_ATTR(quota_sync, 0200, NULL, quota_sync_store); @@ -186,7 +173,6 @@ static struct attribute *gfs2_attrs[] = { &gfs2_attr_id.attr, &gfs2_attr_fsname.attr, &gfs2_attr_freeze.attr, - &gfs2_attr_shrink.attr, &gfs2_attr_withdraw.attr, &gfs2_attr_statfs_sync.attr, &gfs2_attr_quota_sync.attr, diff --git a/include/linux/lm_interface.h b/include/linux/lm_interface.h index f274997bc283..d0a7112b9719 100644 --- a/include/linux/lm_interface.h +++ b/include/linux/lm_interface.h @@ -138,9 +138,6 @@ typedef void (*lm_callback_t) (void *ptr, unsigned int type, void *data); * LM_CB_NEED_RECOVERY * The given journal needs to be recovered. * - * LM_CB_DROPLOCKS - * Reduce the number of cached locks. - * * LM_CB_ASYNC * The given lock has been granted. 
*/ @@ -149,7 +146,6 @@ typedef void (*lm_callback_t) (void *ptr, unsigned int type, void *data); #define LM_CB_NEED_D 258 #define LM_CB_NEED_S 259 #define LM_CB_NEED_RECOVERY 260 -#define LM_CB_DROPLOCKS 261 #define LM_CB_ASYNC 262 /* -- cgit v1.2.3 From b2cad26cfc2091050574a460b304ed103a35dbda Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 3 Jun 2008 14:34:14 +0100 Subject: [GFS2] Remove obsolete conversion deadlock avoidance code This is only used by GFS1 so can be removed. Signed-off-by: Steven Whitehouse --- fs/gfs2/locking/dlm/lock.c | 23 +---------------------- include/linux/lm_interface.h | 2 -- 2 files changed, 1 insertion(+), 24 deletions(-) (limited to 'include/linux') diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c index 871ffc9578f2..894df4567a03 100644 --- a/fs/gfs2/locking/dlm/lock.c +++ b/fs/gfs2/locking/dlm/lock.c @@ -80,7 +80,6 @@ static void process_complete(struct gdlm_lock *lp) { struct gdlm_ls *ls = lp->ls; struct lm_async_cb acb; - s16 prev_mode = lp->cur; memset(&acb, 0, sizeof(acb)); @@ -160,15 +159,7 @@ static void process_complete(struct gdlm_lock *lp) lp->lksb.sb_status, lp->lockname.ln_type, (unsigned long long)lp->lockname.ln_number, lp->flags); - if (lp->lksb.sb_status == -EDEADLOCK && - lp->ls->fsflags & LM_MFLAG_CONV_NODROP) { - lp->req = lp->cur; - acb.lc_ret |= LM_OUT_CONV_DEADLK; - if (lp->cur == DLM_LOCK_IV) - lp->lksb.sb_lkid = 0; - goto out; - } else - return; + return; } /* @@ -268,10 +259,6 @@ out: acb.lc_name = lp->lockname; acb.lc_ret |= gdlm_make_lmstate(lp->cur); - if (!test_and_clear_bit(LFL_NOCACHE, &lp->flags) && - (lp->cur > DLM_LOCK_NL) && (prev_mode > DLM_LOCK_NL)) - acb.lc_ret |= LM_OUT_CACHEABLE; - ls->fscb(ls->sdp, LM_CB_ASYNC, &acb); } @@ -376,14 +363,6 @@ static inline unsigned int make_flags(struct gdlm_lock *lp, if (lp->lksb.sb_lkid != 0) { lkf |= DLM_LKF_CONVERT; - - /* Conversion deadlock avoidance by DLM */ - - if (!(lp->ls->fsflags & LM_MFLAG_CONV_NODROP) && - !test_bit(LFL_FORCE_PROMOTE, &lp->flags) && - !(lkf & DLM_LKF_NOQUEUE) && - cur > DLM_LOCK_NL && req > DLM_LOCK_NL && cur != req) - lkf |= DLM_LKF_CONVDEADLK; } if (lp->lvb) diff --git a/include/linux/lm_interface.h b/include/linux/lm_interface.h index d0a7112b9719..2ed8fa1b762b 100644 --- a/include/linux/lm_interface.h +++ b/include/linux/lm_interface.h @@ -122,11 +122,9 @@ typedef void (*lm_callback_t) (void *ptr, unsigned int type, void *data); */ #define LM_OUT_ST_MASK 0x00000003 -#define LM_OUT_CACHEABLE 0x00000004 #define LM_OUT_CANCELED 0x00000008 #define LM_OUT_ASYNC 0x00000080 #define LM_OUT_ERROR 0x00000100 -#define LM_OUT_CONV_DEADLK 0x00000200 /* * lm_callback_t types -- cgit v1.2.3 From c09595f63bb1909c5dc4dca288f4fe818561b5f3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:14 +0200 Subject: sched: revert revert of: fair-group: SMP-nice for group scheduling Try again.. 
Initial commit: 18d95a2832c1392a2d63227a7a6d433cb9f2037e Revert: 6363ca57c76b7b83639ca8c83fc285fa26a7880e Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched.c | 430 ++++++++++++++++++++++++++++++++++++++++++++++---- kernel/sched_debug.c | 5 + kernel/sched_fair.c | 124 +++++++++------ kernel/sched_rt.c | 4 + 5 files changed, 489 insertions(+), 75 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index eaf821072dbd..97a58b622ee1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -765,6 +765,7 @@ struct sched_domain { struct sched_domain *child; /* bottom domain must be null terminated */ struct sched_group *groups; /* the balancing groups of the domain */ cpumask_t span; /* span of all CPUs in this domain */ + int first_cpu; /* cache of the first cpu in this domain */ unsigned long min_interval; /* Minimum balance interval ms */ unsigned long max_interval; /* Maximum balance interval ms */ unsigned int busy_factor; /* less balancing by factor if busy */ diff --git a/kernel/sched.c b/kernel/sched.c index f653af684fb3..874b6da15430 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -403,6 +403,43 @@ struct cfs_rq { */ struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */ + +#ifdef CONFIG_SMP + unsigned long task_weight; + unsigned long shares; + /* + * We need space to build a sched_domain wide view of the full task + * group tree, in order to avoid depending on dynamic memory allocation + * during the load balancing we place this in the per cpu task group + * hierarchy. This limits the load balancing to one instance per cpu, + * but more should not be needed anyway. + */ + struct aggregate_struct { + /* + * load = weight(cpus) * f(tg) + * + * Where f(tg) is the recursive weight fraction assigned to + * this group. + */ + unsigned long load; + + /* + * part of the group weight distributed to this span. + */ + unsigned long shares; + + /* + * The sum of all runqueue weights within this span. + */ + unsigned long rq_weight; + + /* + * Weight contributed by tasks; this is the part we can + * influence by moving tasks around. + */ + unsigned long task_weight; + } aggregate; +#endif #endif }; @@ -1484,6 +1521,326 @@ static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); static unsigned long cpu_avg_load_per_task(int cpu); static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); + +#ifdef CONFIG_FAIR_GROUP_SCHED + +/* + * Group load balancing. + * + * We calculate a few balance domain wide aggregate numbers; load and weight. + * Given the pictures below, and assuming each item has equal weight: + * + * root 1 - thread + * / | \ A - group + * A 1 B + * /|\ / \ + * C 2 D 3 4 + * | | + * 5 6 + * + * load: + * A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd, + * which equals 1/9-th of the total load. + * + * shares: + * The weight of this group on the selected cpus. + * + * rq_weight: + * Direct sum of all the cpu's their rq weight, e.g. A would get 3 while + * B would get 2. + * + * task_weight: + * Part of the rq_weight contributed by tasks; all groups except B would + * get 1, B gets 2. 
+ */ + +static inline struct aggregate_struct * +aggregate(struct task_group *tg, struct sched_domain *sd) +{ + return &tg->cfs_rq[sd->first_cpu]->aggregate; +} + +typedef void (*aggregate_func)(struct task_group *, struct sched_domain *); + +/* + * Iterate the full tree, calling @down when first entering a node and @up when + * leaving it for the final time. + */ +static +void aggregate_walk_tree(aggregate_func down, aggregate_func up, + struct sched_domain *sd) +{ + struct task_group *parent, *child; + + rcu_read_lock(); + parent = &root_task_group; +down: + (*down)(parent, sd); + list_for_each_entry_rcu(child, &parent->children, siblings) { + parent = child; + goto down; + +up: + continue; + } + (*up)(parent, sd); + + child = parent; + parent = parent->parent; + if (parent) + goto up; + rcu_read_unlock(); +} + +/* + * Calculate the aggregate runqueue weight. + */ +static +void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd) +{ + unsigned long rq_weight = 0; + unsigned long task_weight = 0; + int i; + + for_each_cpu_mask(i, sd->span) { + rq_weight += tg->cfs_rq[i]->load.weight; + task_weight += tg->cfs_rq[i]->task_weight; + } + + aggregate(tg, sd)->rq_weight = rq_weight; + aggregate(tg, sd)->task_weight = task_weight; +} + +/* + * Compute the weight of this group on the given cpus. + */ +static +void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd) +{ + unsigned long shares = 0; + int i; + + for_each_cpu_mask(i, sd->span) + shares += tg->cfs_rq[i]->shares; + + if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares) + shares = tg->shares; + + aggregate(tg, sd)->shares = shares; +} + +/* + * Compute the load fraction assigned to this group, relies on the aggregate + * weight and this group's parent's load, i.e. top-down. + */ +static +void aggregate_group_load(struct task_group *tg, struct sched_domain *sd) +{ + unsigned long load; + + if (!tg->parent) { + int i; + + load = 0; + for_each_cpu_mask(i, sd->span) + load += cpu_rq(i)->load.weight; + + } else { + load = aggregate(tg->parent, sd)->load; + + /* + * shares is our weight in the parent's rq so + * shares/parent->rq_weight gives our fraction of the load + */ + load *= aggregate(tg, sd)->shares; + load /= aggregate(tg->parent, sd)->rq_weight + 1; + } + + aggregate(tg, sd)->load = load; +} + +static void __set_se_shares(struct sched_entity *se, unsigned long shares); + +/* + * Calculate and set the cpu's group shares. + */ +static void +__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd, + int tcpu) +{ + int boost = 0; + unsigned long shares; + unsigned long rq_weight; + + if (!tg->se[tcpu]) + return; + + rq_weight = tg->cfs_rq[tcpu]->load.weight; + + /* + * If there are currently no tasks on the cpu pretend there is one of + * average load so that when a new task gets to run here it will not + * get delayed by group starvation. + */ + if (!rq_weight) { + boost = 1; + rq_weight = NICE_0_LOAD; + } + + /* + * \Sum shares * rq_weight + * shares = ----------------------- + * \Sum rq_weight + * + */ + shares = aggregate(tg, sd)->shares * rq_weight; + shares /= aggregate(tg, sd)->rq_weight + 1; + + /* + * record the actual number of shares, not the boosted amount. + */ + tg->cfs_rq[tcpu]->shares = boost ? 
0 : shares; + + if (shares < MIN_SHARES) + shares = MIN_SHARES; + else if (shares > MAX_SHARES) + shares = MAX_SHARES; + + __set_se_shares(tg->se[tcpu], shares); +} + +/* + * Re-adjust the weights on the cpu the task came from and on the cpu the + * task went to. + */ +static void +__move_group_shares(struct task_group *tg, struct sched_domain *sd, + int scpu, int dcpu) +{ + unsigned long shares; + + shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares; + + __update_group_shares_cpu(tg, sd, scpu); + __update_group_shares_cpu(tg, sd, dcpu); + + /* + * ensure we never loose shares due to rounding errors in the + * above redistribution. + */ + shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares; + if (shares) + tg->cfs_rq[dcpu]->shares += shares; +} + +/* + * Because changing a group's shares changes the weight of the super-group + * we need to walk up the tree and change all shares until we hit the root. + */ +static void +move_group_shares(struct task_group *tg, struct sched_domain *sd, + int scpu, int dcpu) +{ + while (tg) { + __move_group_shares(tg, sd, scpu, dcpu); + tg = tg->parent; + } +} + +static +void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd) +{ + unsigned long shares = aggregate(tg, sd)->shares; + int i; + + for_each_cpu_mask(i, sd->span) { + struct rq *rq = cpu_rq(i); + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + __update_group_shares_cpu(tg, sd, i); + spin_unlock_irqrestore(&rq->lock, flags); + } + + aggregate_group_shares(tg, sd); + + /* + * ensure we never loose shares due to rounding errors in the + * above redistribution. + */ + shares -= aggregate(tg, sd)->shares; + if (shares) { + tg->cfs_rq[sd->first_cpu]->shares += shares; + aggregate(tg, sd)->shares += shares; + } +} + +/* + * Calculate the accumulative weight and recursive load of each task group + * while walking down the tree. + */ +static +void aggregate_get_down(struct task_group *tg, struct sched_domain *sd) +{ + aggregate_group_weight(tg, sd); + aggregate_group_shares(tg, sd); + aggregate_group_load(tg, sd); +} + +/* + * Rebalance the cpu shares while walking back up the tree. 
+ */ +static +void aggregate_get_up(struct task_group *tg, struct sched_domain *sd) +{ + aggregate_group_set_shares(tg, sd); +} + +static DEFINE_PER_CPU(spinlock_t, aggregate_lock); + +static void __init init_aggregate(void) +{ + int i; + + for_each_possible_cpu(i) + spin_lock_init(&per_cpu(aggregate_lock, i)); +} + +static int get_aggregate(struct sched_domain *sd) +{ + if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu))) + return 0; + + aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd); + return 1; +} + +static void put_aggregate(struct sched_domain *sd) +{ + spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu)); +} + +static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) +{ + cfs_rq->shares = shares; +} + +#else + +static inline void init_aggregate(void) +{ +} + +static inline int get_aggregate(struct sched_domain *sd) +{ + return 0; +} + +static inline void put_aggregate(struct sched_domain *sd) +{ +} +#endif + #endif #include "sched_stats.h" @@ -1498,26 +1855,14 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); #define for_each_class(class) \ for (class = sched_class_highest; class; class = class->next) -static inline void inc_load(struct rq *rq, const struct task_struct *p) -{ - update_load_add(&rq->load, p->se.load.weight); -} - -static inline void dec_load(struct rq *rq, const struct task_struct *p) -{ - update_load_sub(&rq->load, p->se.load.weight); -} - -static void inc_nr_running(struct task_struct *p, struct rq *rq) +static void inc_nr_running(struct rq *rq) { rq->nr_running++; - inc_load(rq, p); } -static void dec_nr_running(struct task_struct *p, struct rq *rq) +static void dec_nr_running(struct rq *rq) { rq->nr_running--; - dec_load(rq, p); } static void set_load_weight(struct task_struct *p) @@ -1609,7 +1954,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) rq->nr_uninterruptible--; enqueue_task(rq, p, wakeup); - inc_nr_running(p, rq); + inc_nr_running(rq); } /* @@ -1621,7 +1966,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) rq->nr_uninterruptible++; dequeue_task(rq, p, sleep); - dec_nr_running(p, rq); + dec_nr_running(rq); } /** @@ -2274,7 +2619,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) * management (if any): */ p->sched_class->task_new(rq, p); - inc_nr_running(p, rq); + inc_nr_running(rq); } check_preempt_curr(rq, p); #ifdef CONFIG_SMP @@ -3265,9 +3610,12 @@ static int load_balance(int this_cpu, struct rq *this_rq, unsigned long imbalance; struct rq *busiest; unsigned long flags; + int unlock_aggregate; cpus_setall(*cpus); + unlock_aggregate = get_aggregate(sd); + /* * When power savings policy is enabled for the parent domain, idle * sibling can pick up load irrespective of busy siblings. 
In this case, @@ -3383,8 +3731,9 @@ redo: if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - return -1; - return ld_moved; + ld_moved = -1; + + goto out; out_balanced: schedstat_inc(sd, lb_balanced[idle]); @@ -3399,8 +3748,13 @@ out_one_pinned: if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - return -1; - return 0; + ld_moved = -1; + else + ld_moved = 0; +out: + if (unlock_aggregate) + put_aggregate(sd); + return ld_moved; } /* @@ -4588,10 +4942,8 @@ void set_user_nice(struct task_struct *p, long nice) goto out_unlock; } on_rq = p->se.on_rq; - if (on_rq) { + if (on_rq) dequeue_task(rq, p, 0); - dec_load(rq, p); - } p->static_prio = NICE_TO_PRIO(nice); set_load_weight(p); @@ -4601,7 +4953,6 @@ void set_user_nice(struct task_struct *p, long nice) if (on_rq) { enqueue_task(rq, p, 0); - inc_load(rq, p); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -7016,6 +7367,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, ALLNODES); set_domain_attribute(sd, attr); sd->span = *cpu_map; + sd->first_cpu = first_cpu(sd->span); cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); p = sd; sd_allnodes = 1; @@ -7026,6 +7378,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, NODE); set_domain_attribute(sd, attr); sched_domain_node_span(cpu_to_node(i), &sd->span); + sd->first_cpu = first_cpu(sd->span); sd->parent = p; if (p) p->child = sd; @@ -7037,6 +7390,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, CPU); set_domain_attribute(sd, attr); sd->span = *nodemask; + sd->first_cpu = first_cpu(sd->span); sd->parent = p; if (p) p->child = sd; @@ -7048,6 +7402,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, MC); set_domain_attribute(sd, attr); sd->span = cpu_coregroup_map(i); + sd->first_cpu = first_cpu(sd->span); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; @@ -7060,6 +7415,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, SIBLING); set_domain_attribute(sd, attr); sd->span = per_cpu(cpu_sibling_map, i); + sd->first_cpu = first_cpu(sd->span); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; @@ -7757,6 +8113,7 @@ void __init sched_init(void) } #ifdef CONFIG_SMP + init_aggregate(); init_defrootdomain(); #endif @@ -8322,14 +8679,11 @@ void sched_move_task(struct task_struct *tsk) #endif /* CONFIG_GROUP_SCHED */ #ifdef CONFIG_FAIR_GROUP_SCHED -static void set_se_shares(struct sched_entity *se, unsigned long shares) +static void __set_se_shares(struct sched_entity *se, unsigned long shares) { struct cfs_rq *cfs_rq = se->cfs_rq; - struct rq *rq = cfs_rq->rq; int on_rq; - spin_lock_irq(&rq->lock); - on_rq = se->on_rq; if (on_rq) dequeue_entity(cfs_rq, se, 0); @@ -8339,8 +8693,17 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares) if (on_rq) enqueue_entity(cfs_rq, se, 0); +} - spin_unlock_irq(&rq->lock); +static void set_se_shares(struct sched_entity *se, unsigned long shares) +{ + struct cfs_rq *cfs_rq = se->cfs_rq; + struct rq *rq = cfs_rq->rq; + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + __set_se_shares(se, shares); + spin_unlock_irqrestore(&rq->lock, flags); } static DEFINE_MUTEX(shares_mutex); @@ -8379,8 +8742,13 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) * w/o tripping rebalance_share or 
load_balance_fair. */ tg->shares = shares; - for_each_possible_cpu(i) + for_each_possible_cpu(i) { + /* + * force a rebalance + */ + cfs_rq_set_shares(tg->cfs_rq[i], 0); set_se_shares(tg->se[i], shares); + } /* * Enable load balance activity on this group, by inserting it back on diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 8e077b9c91cb..04394ccac88d 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -167,6 +167,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) #endif SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", cfs_rq->nr_spread_over); +#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_SMP + SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); +#endif +#endif } void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 2e197b8e43f1..183388c4dead 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -567,10 +567,27 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) * Scheduling class queueing methods: */ +#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED +static void +add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) +{ + cfs_rq->task_weight += weight; +} +#else +static inline void +add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) +{ +} +#endif + static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_add(&cfs_rq->load, se->load.weight); + if (!parent_entity(se)) + inc_cpu_load(rq_of(cfs_rq), se->load.weight); + if (entity_is_task(se)) + add_cfs_task_weight(cfs_rq, se->load.weight); cfs_rq->nr_running++; se->on_rq = 1; list_add(&se->group_node, &cfs_rq->tasks); @@ -580,6 +597,10 @@ static void account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_sub(&cfs_rq->load, se->load.weight); + if (!parent_entity(se)) + dec_cpu_load(rq_of(cfs_rq), se->load.weight); + if (entity_is_task(se)) + add_cfs_task_weight(cfs_rq, -se->load.weight); cfs_rq->nr_running--; se->on_rq = 0; list_del_init(&se->group_node); @@ -1372,75 +1393,90 @@ static struct task_struct *load_balance_next_fair(void *arg) return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); } -#ifdef CONFIG_FAIR_GROUP_SCHED -static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) +static unsigned long +__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, struct sched_domain *sd, + enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, + struct cfs_rq *cfs_rq) { - struct sched_entity *curr; - struct task_struct *p; - - if (!cfs_rq->nr_running || !first_fair(cfs_rq)) - return MAX_PRIO; - - curr = cfs_rq->curr; - if (!curr) - curr = __pick_next_entity(cfs_rq); + struct rq_iterator cfs_rq_iterator; - p = task_of(curr); + cfs_rq_iterator.start = load_balance_start_fair; + cfs_rq_iterator.next = load_balance_next_fair; + cfs_rq_iterator.arg = cfs_rq; - return p->prio; + return balance_tasks(this_rq, this_cpu, busiest, + max_load_move, sd, idle, all_pinned, + this_best_prio, &cfs_rq_iterator); } -#endif +#ifdef CONFIG_FAIR_GROUP_SCHED static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned, int *this_best_prio) { - struct cfs_rq *busy_cfs_rq; long rem_load_move = max_load_move; - struct rq_iterator cfs_rq_iterator; - - cfs_rq_iterator.start = load_balance_start_fair; - 
cfs_rq_iterator.next = load_balance_next_fair; + int busiest_cpu = cpu_of(busiest); + struct task_group *tg; - for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { -#ifdef CONFIG_FAIR_GROUP_SCHED - struct cfs_rq *this_cfs_rq; + rcu_read_lock(); + list_for_each_entry(tg, &task_groups, list) { long imbalance; - unsigned long maxload; + unsigned long this_weight, busiest_weight; + long rem_load, max_load, moved_load; + + /* + * empty group + */ + if (!aggregate(tg, sd)->task_weight) + continue; + + rem_load = rem_load_move * aggregate(tg, sd)->rq_weight; + rem_load /= aggregate(tg, sd)->load + 1; + + this_weight = tg->cfs_rq[this_cpu]->task_weight; + busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight; - this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); + imbalance = (busiest_weight - this_weight) / 2; - imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight; - /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ - if (imbalance <= 0) + if (imbalance < 0) + imbalance = busiest_weight; + + max_load = max(rem_load, imbalance); + moved_load = __load_balance_fair(this_rq, this_cpu, busiest, + max_load, sd, idle, all_pinned, this_best_prio, + tg->cfs_rq[busiest_cpu]); + + if (!moved_load) continue; - /* Don't pull more than imbalance/2 */ - imbalance /= 2; - maxload = min(rem_load_move, imbalance); + move_group_shares(tg, sd, busiest_cpu, this_cpu); - *this_best_prio = cfs_rq_best_prio(this_cfs_rq); -#else -# define maxload rem_load_move -#endif - /* - * pass busy_cfs_rq argument into - * load_balance_[start|next]_fair iterators - */ - cfs_rq_iterator.arg = busy_cfs_rq; - rem_load_move -= balance_tasks(this_rq, this_cpu, busiest, - maxload, sd, idle, all_pinned, - this_best_prio, - &cfs_rq_iterator); + moved_load *= aggregate(tg, sd)->load; + moved_load /= aggregate(tg, sd)->rq_weight + 1; - if (rem_load_move <= 0) + rem_load_move -= moved_load; + if (rem_load_move < 0) break; } + rcu_read_unlock(); return max_load_move - rem_load_move; } +#else +static unsigned long +load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned, int *this_best_prio) +{ + return __load_balance_fair(this_rq, this_cpu, busiest, + max_load_move, sd, idle, all_pinned, + this_best_prio, &busiest->cfs); +} +#endif static int move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 6b4a6b5a4167..765932d0399d 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -670,6 +670,8 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) rt_se->timeout = 0; enqueue_rt_entity(rt_se); + + inc_cpu_load(rq, p->se.load.weight); } static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) @@ -678,6 +680,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) update_curr_rt(rq); dequeue_rt_entity(rt_se); + + dec_cpu_load(rq, p->se.load.weight); } /* -- cgit v1.2.3 From b6a86c746f5b708012809958462234d19e9c8177 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:18 +0200 Subject: sched: fix sched_domain aggregation Keeping the aggregate on the first cpu of the sched domain has two problems: - it could collide between different sched domains on different cpus - it could slow things down because of the remote accesses Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 - 
kernel/sched.c | 113 ++++++++++++++++++++++++-------------------------- kernel/sched_fair.c | 12 +++--- 3 files changed, 60 insertions(+), 66 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 97a58b622ee1..eaf821072dbd 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -765,7 +765,6 @@ struct sched_domain { struct sched_domain *child; /* bottom domain must be null terminated */ struct sched_group *groups; /* the balancing groups of the domain */ cpumask_t span; /* span of all CPUs in this domain */ - int first_cpu; /* cache of the first cpu in this domain */ unsigned long min_interval; /* Minimum balance interval ms */ unsigned long max_interval; /* Maximum balance interval ms */ unsigned int busy_factor; /* less balancing by factor if busy */ diff --git a/kernel/sched.c b/kernel/sched.c index 7d282c52bd42..160d3c209b8f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1480,12 +1480,12 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); */ static inline struct aggregate_struct * -aggregate(struct task_group *tg, struct sched_domain *sd) +aggregate(struct task_group *tg, int cpu) { - return &tg->cfs_rq[sd->first_cpu]->aggregate; + return &tg->cfs_rq[cpu]->aggregate; } -typedef void (*aggregate_func)(struct task_group *, struct sched_domain *); +typedef void (*aggregate_func)(struct task_group *, int, struct sched_domain *); /* * Iterate the full tree, calling @down when first entering a node and @up when @@ -1493,14 +1493,14 @@ typedef void (*aggregate_func)(struct task_group *, struct sched_domain *); */ static void aggregate_walk_tree(aggregate_func down, aggregate_func up, - struct sched_domain *sd) + int cpu, struct sched_domain *sd) { struct task_group *parent, *child; rcu_read_lock(); parent = &root_task_group; down: - (*down)(parent, sd); + (*down)(parent, cpu, sd); list_for_each_entry_rcu(child, &parent->children, siblings) { parent = child; goto down; @@ -1508,7 +1508,7 @@ down: up: continue; } - (*up)(parent, sd); + (*up)(parent, cpu, sd); child = parent; parent = parent->parent; @@ -1520,8 +1520,8 @@ up: /* * Calculate the aggregate runqueue weight. */ -static -void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd) +static void +aggregate_group_weight(struct task_group *tg, int cpu, struct sched_domain *sd) { unsigned long rq_weight = 0; unsigned long task_weight = 0; @@ -1532,15 +1532,15 @@ void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd) task_weight += tg->cfs_rq[i]->task_weight; } - aggregate(tg, sd)->rq_weight = rq_weight; - aggregate(tg, sd)->task_weight = task_weight; + aggregate(tg, cpu)->rq_weight = rq_weight; + aggregate(tg, cpu)->task_weight = task_weight; } /* * Compute the weight of this group on the given cpus. 
*/ -static -void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd) +static void +aggregate_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd) { unsigned long shares = 0; int i; @@ -1548,18 +1548,18 @@ void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd) for_each_cpu_mask(i, sd->span) shares += tg->cfs_rq[i]->shares; - if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares) + if ((!shares && aggregate(tg, cpu)->rq_weight) || shares > tg->shares) shares = tg->shares; - aggregate(tg, sd)->shares = shares; + aggregate(tg, cpu)->shares = shares; } /* * Compute the load fraction assigned to this group, relies on the aggregate * weight and this group's parent's load, i.e. top-down. */ -static -void aggregate_group_load(struct task_group *tg, struct sched_domain *sd) +static void +aggregate_group_load(struct task_group *tg, int cpu, struct sched_domain *sd) { unsigned long load; @@ -1571,17 +1571,17 @@ void aggregate_group_load(struct task_group *tg, struct sched_domain *sd) load += cpu_rq(i)->load.weight; } else { - load = aggregate(tg->parent, sd)->load; + load = aggregate(tg->parent, cpu)->load; /* * shares is our weight in the parent's rq so * shares/parent->rq_weight gives our fraction of the load */ - load *= aggregate(tg, sd)->shares; - load /= aggregate(tg->parent, sd)->rq_weight + 1; + load *= aggregate(tg, cpu)->shares; + load /= aggregate(tg->parent, cpu)->rq_weight + 1; } - aggregate(tg, sd)->load = load; + aggregate(tg, cpu)->load = load; } static void __set_se_shares(struct sched_entity *se, unsigned long shares); @@ -1590,8 +1590,8 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares); * Calculate and set the cpu's group shares. */ static void -__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd, - int tcpu) +__update_group_shares_cpu(struct task_group *tg, int cpu, + struct sched_domain *sd, int tcpu) { int boost = 0; unsigned long shares; @@ -1618,8 +1618,8 @@ __update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd, * \Sum rq_weight * */ - shares = aggregate(tg, sd)->shares * rq_weight; - shares /= aggregate(tg, sd)->rq_weight + 1; + shares = aggregate(tg, cpu)->shares * rq_weight; + shares /= aggregate(tg, cpu)->rq_weight + 1; /* * record the actual number of shares, not the boosted amount. @@ -1639,15 +1639,15 @@ __update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd, * task went to. */ static void -__move_group_shares(struct task_group *tg, struct sched_domain *sd, +__move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd, int scpu, int dcpu) { unsigned long shares; shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares; - __update_group_shares_cpu(tg, sd, scpu); - __update_group_shares_cpu(tg, sd, dcpu); + __update_group_shares_cpu(tg, cpu, sd, scpu); + __update_group_shares_cpu(tg, cpu, sd, dcpu); /* * ensure we never loose shares due to rounding errors in the @@ -1663,19 +1663,19 @@ __move_group_shares(struct task_group *tg, struct sched_domain *sd, * we need to walk up the tree and change all shares until we hit the root. 
*/ static void -move_group_shares(struct task_group *tg, struct sched_domain *sd, +move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd, int scpu, int dcpu) { while (tg) { - __move_group_shares(tg, sd, scpu, dcpu); + __move_group_shares(tg, cpu, sd, scpu, dcpu); tg = tg->parent; } } -static -void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd) +static void +aggregate_group_set_shares(struct task_group *tg, int cpu, struct sched_domain *sd) { - unsigned long shares = aggregate(tg, sd)->shares; + unsigned long shares = aggregate(tg, cpu)->shares; int i; for_each_cpu_mask(i, sd->span) { @@ -1683,20 +1683,20 @@ void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd) unsigned long flags; spin_lock_irqsave(&rq->lock, flags); - __update_group_shares_cpu(tg, sd, i); + __update_group_shares_cpu(tg, cpu, sd, i); spin_unlock_irqrestore(&rq->lock, flags); } - aggregate_group_shares(tg, sd); + aggregate_group_shares(tg, cpu, sd); /* * ensure we never loose shares due to rounding errors in the * above redistribution. */ - shares -= aggregate(tg, sd)->shares; + shares -= aggregate(tg, cpu)->shares; if (shares) { - tg->cfs_rq[sd->first_cpu]->shares += shares; - aggregate(tg, sd)->shares += shares; + tg->cfs_rq[cpu]->shares += shares; + aggregate(tg, cpu)->shares += shares; } } @@ -1704,21 +1704,21 @@ void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd) * Calculate the accumulative weight and recursive load of each task group * while walking down the tree. */ -static -void aggregate_get_down(struct task_group *tg, struct sched_domain *sd) +static void +aggregate_get_down(struct task_group *tg, int cpu, struct sched_domain *sd) { - aggregate_group_weight(tg, sd); - aggregate_group_shares(tg, sd); - aggregate_group_load(tg, sd); + aggregate_group_weight(tg, cpu, sd); + aggregate_group_shares(tg, cpu, sd); + aggregate_group_load(tg, cpu, sd); } /* * Rebalance the cpu shares while walking back up the tree. 
*/ -static -void aggregate_get_up(struct task_group *tg, struct sched_domain *sd) +static void +aggregate_get_up(struct task_group *tg, int cpu, struct sched_domain *sd) { - aggregate_group_set_shares(tg, sd); + aggregate_group_set_shares(tg, cpu, sd); } static DEFINE_PER_CPU(spinlock_t, aggregate_lock); @@ -1731,18 +1731,18 @@ static void __init init_aggregate(void) spin_lock_init(&per_cpu(aggregate_lock, i)); } -static int get_aggregate(struct sched_domain *sd) +static int get_aggregate(int cpu, struct sched_domain *sd) { - if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu))) + if (!spin_trylock(&per_cpu(aggregate_lock, cpu))) return 0; - aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd); + aggregate_walk_tree(aggregate_get_down, aggregate_get_up, cpu, sd); return 1; } -static void put_aggregate(struct sched_domain *sd) +static void put_aggregate(int cpu, struct sched_domain *sd) { - spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu)); + spin_unlock(&per_cpu(aggregate_lock, cpu)); } static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) @@ -1756,12 +1756,12 @@ static inline void init_aggregate(void) { } -static inline int get_aggregate(struct sched_domain *sd) +static inline int get_aggregate(int cpu, struct sched_domain *sd) { return 0; } -static inline void put_aggregate(struct sched_domain *sd) +static inline void put_aggregate(int cpu, struct sched_domain *sd) { } #endif @@ -3539,7 +3539,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, cpus_setall(*cpus); - unlock_aggregate = get_aggregate(sd); + unlock_aggregate = get_aggregate(this_cpu, sd); /* * When power savings policy is enabled for the parent domain, idle @@ -3678,7 +3678,7 @@ out_one_pinned: ld_moved = 0; out: if (unlock_aggregate) - put_aggregate(sd); + put_aggregate(this_cpu, sd); return ld_moved; } @@ -7292,7 +7292,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, ALLNODES); set_domain_attribute(sd, attr); sd->span = *cpu_map; - sd->first_cpu = first_cpu(sd->span); cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); p = sd; sd_allnodes = 1; @@ -7303,7 +7302,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, NODE); set_domain_attribute(sd, attr); sched_domain_node_span(cpu_to_node(i), &sd->span); - sd->first_cpu = first_cpu(sd->span); sd->parent = p; if (p) p->child = sd; @@ -7315,7 +7313,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, CPU); set_domain_attribute(sd, attr); sd->span = *nodemask; - sd->first_cpu = first_cpu(sd->span); sd->parent = p; if (p) p->child = sd; @@ -7327,7 +7324,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, MC); set_domain_attribute(sd, attr); sd->span = cpu_coregroup_map(i); - sd->first_cpu = first_cpu(sd->span); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; @@ -7340,7 +7336,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, SIBLING); set_domain_attribute(sd, attr); sd->span = per_cpu(cpu_sibling_map, i); - sd->first_cpu = first_cpu(sd->span); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 509092af0330..40cf24ab4de8 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1429,11 +1429,11 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, /* * empty group */ - if (!aggregate(tg, sd)->task_weight) + if (!aggregate(tg, this_cpu)->task_weight) continue; - rem_load = 
rem_load_move * aggregate(tg, sd)->rq_weight; - rem_load /= aggregate(tg, sd)->load + 1; + rem_load = rem_load_move * aggregate(tg, this_cpu)->rq_weight; + rem_load /= aggregate(tg, this_cpu)->load + 1; this_weight = tg->cfs_rq[this_cpu]->task_weight; busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight; @@ -1451,10 +1451,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, if (!moved_load) continue; - move_group_shares(tg, sd, busiest_cpu, this_cpu); + move_group_shares(tg, this_cpu, sd, busiest_cpu, this_cpu); - moved_load *= aggregate(tg, sd)->load; - moved_load /= aggregate(tg, sd)->rq_weight + 1; + moved_load *= aggregate(tg, this_cpu)->load; + moved_load /= aggregate(tg, this_cpu)->rq_weight + 1; rem_load_move -= moved_load; if (rem_load_move < 0) -- cgit v1.2.3 From 2398f2c6d34b43025f274fc42eaca34d23ec2320 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Jun 2008 13:41:35 +0200 Subject: sched: update shares on wakeup We found that the affine wakeup code needs rather accurate load figures to be effective. The trouble is that updating the load figures is fairly expensive with group scheduling. Therefore ratelimit the updating. Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Mike Galbraith Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 +++ kernel/sched.c | 30 +++++++++++++++++++++++++++++- kernel/sched_features.h | 3 ++- kernel/sysctl.c | 8 ++++++++ 4 files changed, 42 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index eaf821072dbd..835b6c6fcc56 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -783,6 +783,8 @@ struct sched_domain { unsigned int balance_interval; /* initialise to 1. units in ms. */ unsigned int nr_balance_failed; /* initialise to 0 */ + u64 last_update; + #ifdef CONFIG_SCHEDSTATS /* load_balance() stats */ unsigned int lb_count[CPU_MAX_IDLE_TYPES]; @@ -1605,6 +1607,7 @@ extern unsigned int sysctl_sched_child_runs_first; extern unsigned int sysctl_sched_features; extern unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_nr_migrate; +extern unsigned int sysctl_sched_shares_ratelimit; int sched_nr_latency_handler(struct ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, diff --git a/kernel/sched.c b/kernel/sched.c index 1cff969f6646..62db0891025a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -777,6 +777,12 @@ late_initcall(sched_init_debug); */ const_debug unsigned int sysctl_sched_nr_migrate = 32; +/* + * ratelimit for updating the group shares. + * default: 0.5ms + */ +const_debug unsigned int sysctl_sched_shares_ratelimit = 500000; + /* * period over which we measure -rt task cpu usage in us. 
 * default: 1s
@@ -1590,7 +1596,13 @@ tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)

 static void update_shares(struct sched_domain *sd)
 {
-	walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
+	u64 now = cpu_clock(raw_smp_processor_id());
+	s64 elapsed = now - sd->last_update;
+
+	if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
+		sd->last_update = now;
+		walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
+	}
 }

 static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
@@ -2199,6 +2211,22 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 	if (!sched_feat(SYNC_WAKEUPS))
 		sync = 0;

+#ifdef CONFIG_SMP
+	if (sched_feat(LB_WAKEUP_UPDATE)) {
+		struct sched_domain *sd;
+
+		this_cpu = raw_smp_processor_id();
+		cpu = task_cpu(p);
+
+		for_each_domain(this_cpu, sd) {
+			if (cpu_isset(cpu, sd->span)) {
+				update_shares(sd);
+				break;
+			}
+		}
+	}
+#endif
+
 	smp_wmb();
 	rq = task_rq_lock(p, &flags);
 	old_state = p->state;
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index d56e3053e746..7d616d2a2a3f 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -8,4 +8,5 @@ SCHED_FEAT(SYNC_WAKEUPS, 1)
 SCHED_FEAT(HRTICK, 1)
 SCHED_FEAT(DOUBLE_TICK, 0)
 SCHED_FEAT(ASYM_GRAN, 1)
-SCHED_FEAT(LB_BIAS, 0)
\ No newline at end of file
+SCHED_FEAT(LB_BIAS, 0)
+SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 29116652dca8..fe8cdc80ff02 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -264,6 +264,14 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &min_wakeup_granularity_ns,
 		.extra2		= &max_wakeup_granularity_ns,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_shares_ratelimit",
+		.data		= &sysctl_sched_shares_ratelimit,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 	{
 		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "sched_child_runs_first",
--
cgit v1.2.3


From 9465efc9e96135a2cec8154c0c766fa59984a298 Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Fri, 27 Jun 2008 11:05:24 +0200
Subject: Remove BKL from remote_llseek v2

- Replace remote_llseek with generic_file_llseek_unlocked (to force
  compilation failures in all users)
- Change all users to either use generic_file_llseek_unlocked directly
  or take the BKL around it. I changed the file systems that don't use
  the BKL for anything (CIFS, GFS) to call it directly. NCPFS, SMBFS
  and NFS take the BKL, but explicitly in their own source now.

I moved them all over in a single patch to avoid unbisectable sections.

Open problem: 32-bit kernels can corrupt fpos because its modification
is not atomic, but they can do that anyway because there are other
paths that modify it without the BKL.

Do we need a special lock for the pos/f_version = 0 checks?

Trond says the NFS BKL is likely not needed, but keep it for now until
his full audit.
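A rough sketch of the conversion pattern for a filesystem that still wants
the BKL (a hypothetical "foofs"; this mirrors the ncpfs and smbfs hunks
further down, and is an editorial illustration rather than part of the patch):

static loff_t foofs_llseek(struct file *file, loff_t offset, int origin)
{
	loff_t ret;

	lock_kernel();	/* keep the old remote_llseek() BKL semantics */
	ret = generic_file_llseek_unlocked(file, offset, origin);
	unlock_kernel();
	return ret;
}

Filesystems that never relied on the BKL simply call
generic_file_llseek_unlocked directly, as the CIFS and GFS2 hunks below do.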
v2: Use generic_file_llseek_unlocked instead of remote_llseek_unlocked and factor duplicated code (suggested by hch) Cc: Trond.Myklebust@netapp.com Cc: swhiteho@redhat.com Cc: sfrench@samba.org Cc: vandrove@vc.cvut.cz Signed-off-by: Andi Kleen Signed-off-by: Andi Kleen Signed-off-by: Jonathan Corbet --- fs/cifs/cifsfs.c | 2 +- fs/gfs2/ops_file.c | 4 ++-- fs/ncpfs/file.c | 12 +++++++++++- fs/nfs/file.c | 6 +++++- fs/read_write.c | 38 +++++++++++--------------------------- fs/smbfs/file.c | 11 ++++++++++- include/linux/fs.h | 3 ++- 7 files changed, 42 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 427a7c695896..aeff0fe5b6b9 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -581,7 +581,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin) if (retval < 0) return (loff_t)retval; } - return remote_llseek(file, offset, origin); + return generic_file_llseek_unlocked(file, offset, origin); } struct file_system_type cifs_fs_type = { diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c index e1b7d525a066..24dd59450088 100644 --- a/fs/gfs2/ops_file.c +++ b/fs/gfs2/ops_file.c @@ -62,11 +62,11 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin) error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); if (!error) { - error = remote_llseek(file, offset, origin); + error = generic_file_llseek_unlocked(file, offset, origin); gfs2_glock_dq_uninit(&i_gh); } } else - error = remote_llseek(file, offset, origin); + error = generic_file_llseek_unlocked(file, offset, origin); return error; } diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c index 2b145de45b39..6a7d901f1936 100644 --- a/fs/ncpfs/file.c +++ b/fs/ncpfs/file.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include "ncplib_kernel.h" @@ -281,9 +282,18 @@ static int ncp_release(struct inode *inode, struct file *file) { return 0; } +static loff_t ncp_remote_llseek(struct file *file, loff_t offset, int origin) +{ + loff_t ret; + lock_kernel(); + ret = generic_file_llseek_unlocked(file, offset, origin); + unlock_kernel(); + return ret; +} + const struct file_operations ncp_file_operations = { - .llseek = remote_llseek, + .llseek = ncp_remote_llseek, .read = ncp_file_read, .write = ncp_file_write, .ioctl = ncp_ioctl, diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 3536b01164f9..a34eb78989f5 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -170,6 +170,7 @@ force_reval: static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) { + loff_t loff; /* origin == SEEK_END => we must revalidate the cached file length */ if (origin == SEEK_END) { struct inode *inode = filp->f_mapping->host; @@ -177,7 +178,10 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) if (retval < 0) return (loff_t)retval; } - return remote_llseek(filp, offset, origin); + lock_kernel(); /* BKL needed? 
*/ + loff = generic_file_llseek_unlocked(filp, offset, origin); + unlock_kernel(); + return loff; } /* diff --git a/fs/read_write.c b/fs/read_write.c index f0d1240a5c69..9ba495d5a29b 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -31,12 +31,12 @@ const struct file_operations generic_ro_fops = { EXPORT_SYMBOL(generic_ro_fops); -loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) +loff_t +generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin) { loff_t retval; struct inode *inode = file->f_mapping->host; - mutex_lock(&inode->i_mutex); switch (origin) { case SEEK_END: offset += inode->i_size; @@ -46,42 +46,26 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) } retval = -EINVAL; if (offset>=0 && offset<=inode->i_sb->s_maxbytes) { + /* Special lock needed here? */ if (offset != file->f_pos) { file->f_pos = offset; file->f_version = 0; } retval = offset; } - mutex_unlock(&inode->i_mutex); return retval; } +EXPORT_SYMBOL(generic_file_llseek_unlocked); -EXPORT_SYMBOL(generic_file_llseek); - -loff_t remote_llseek(struct file *file, loff_t offset, int origin) +loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) { - loff_t retval; - - lock_kernel(); - switch (origin) { - case SEEK_END: - offset += i_size_read(file->f_path.dentry->d_inode); - break; - case SEEK_CUR: - offset += file->f_pos; - } - retval = -EINVAL; - if (offset>=0 && offset<=file->f_path.dentry->d_inode->i_sb->s_maxbytes) { - if (offset != file->f_pos) { - file->f_pos = offset; - file->f_version = 0; - } - retval = offset; - } - unlock_kernel(); - return retval; + loff_t n; + mutex_lock(&file->f_dentry->d_inode->i_mutex); + n = generic_file_llseek_unlocked(file, offset, origin); + mutex_unlock(&file->f_dentry->d_inode->i_mutex); + return n; } -EXPORT_SYMBOL(remote_llseek); +EXPORT_SYMBOL(generic_file_llseek); loff_t no_llseek(struct file *file, loff_t offset, int origin) { diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c index efbe29af3d7a..2294783320cb 100644 --- a/fs/smbfs/file.c +++ b/fs/smbfs/file.c @@ -422,9 +422,18 @@ smb_file_permission(struct inode *inode, int mask, struct nameidata *nd) return error; } +static loff_t smb_remote_llseek(struct file *file, loff_t offset, int origin) +{ + loff_t ret; + lock_kernel(); + ret = generic_file_llseek_unlocked(file, offset, origin); + unlock_kernel(); + return ret; +} + const struct file_operations smb_file_operations = { - .llseek = remote_llseek, + .llseek = smb_remote_llseek, .read = do_sync_read, .aio_read = smb_file_aio_read, .write = do_sync_write, diff --git a/include/linux/fs.h b/include/linux/fs.h index f413085f748e..b158e5161bca 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1871,7 +1871,8 @@ extern void file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); extern loff_t no_llseek(struct file *file, loff_t offset, int origin); extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin); -extern loff_t remote_llseek(struct file *file, loff_t offset, int origin); +extern loff_t generic_file_llseek_unlocked(struct file *file, loff_t offset, + int origin); extern int generic_file_open(struct inode * inode, struct file * filp); extern int nonseekable_open(struct inode * inode, struct file * filp); -- cgit v1.2.3 From 02c62304e6af60f1963695c6bc1bbffe619aa585 Mon Sep 17 00:00:00 2001 From: "Alan D. 
Brunelle" Date: Wed, 11 Jun 2008 09:12:52 +0200 Subject: Added in user-injected messages into blk traces This allows a user to annotate the blk trace stream: writing a suitable message to {/sys/kernel/debug}/block//msg will have it propagated into the trace stream. Signed-off-by: Alan D. Brunelle Signed-off-by: Jens Axboe --- block/blktrace.c | 45 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/blktrace_api.h | 1 + 2 files changed, 46 insertions(+) (limited to 'include/linux') diff --git a/block/blktrace.c b/block/blktrace.c index 8d3a27780260..eb9651ccb241 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -244,6 +244,7 @@ err: static void blk_trace_cleanup(struct blk_trace *bt) { relay_close(bt->rchan); + debugfs_remove(bt->msg_file); debugfs_remove(bt->dropped_file); blk_remove_tree(bt->dir); free_percpu(bt->sequence); @@ -291,6 +292,44 @@ static const struct file_operations blk_dropped_fops = { .read = blk_dropped_read, }; +static int blk_msg_open(struct inode *inode, struct file *filp) +{ + filp->private_data = inode->i_private; + + return 0; +} + +static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, + size_t count, loff_t *ppos) +{ + char *msg; + struct blk_trace *bt; + + if (count > BLK_TN_MAX_MSG) + return -EINVAL; + + msg = kmalloc(count, GFP_KERNEL); + if (msg == NULL) + return -ENOMEM; + + if (copy_from_user(msg, buffer, count)) { + kfree(msg); + return -EFAULT; + } + + bt = filp->private_data; + __trace_note_message(bt, "%s", msg); + kfree(msg); + + return count; +} + +static const struct file_operations blk_msg_fops = { + .owner = THIS_MODULE, + .open = blk_msg_open, + .write = blk_msg_write, +}; + /* * Keep track of how many times we encountered a full subbuffer, to aid * the user space app in telling how many lost events there were. @@ -380,6 +419,10 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!bt->dropped_file) goto err; + bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops); + if (!bt->msg_file) + goto err; + bt->rchan = relay_open("trace", dir, buts->buf_size, buts->buf_nr, &blk_relay_callbacks, bt); if (!bt->rchan) @@ -409,6 +452,8 @@ err: if (dir) blk_remove_tree(dir); if (bt) { + if (bt->msg_file) + debugfs_remove(bt->msg_file); if (bt->dropped_file) debugfs_remove(bt->dropped_file); free_percpu(bt->sequence); diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index e3ef903aae88..d084b8d227a5 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -129,6 +129,7 @@ struct blk_trace { u32 dev; struct dentry *dir; struct dentry *dropped_file; + struct dentry *msg_file; atomic_t dropped; }; -- cgit v1.2.3 From 244b4d56f85bcd11b21ab0b94845a3dabeed5c10 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 12 Jun 2008 20:12:36 +0200 Subject: block: kill request_queue_t Everything was moved to struct request_queue a few kernel revisions ago, maintaining the deprecated typedef to avoid breaking things. Now the time has come to get rid of that typedef. 
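For any remaining user of the typedef, the conversion is mechanical; a
hypothetical out-of-tree snippet (not part of this patch) would change as:

-	request_queue_t *q = bdev_get_queue(bdev);
+	struct request_queue *q = bdev_get_queue(bdev);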
Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d2a1b71e93c3..6a3da6717135 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -23,7 +23,6 @@ struct scsi_ioctl_command; struct request_queue; -typedef struct request_queue request_queue_t __deprecated; struct elevator_queue; typedef struct elevator_queue elevator_t; struct request_pm_state; -- cgit v1.2.3 From 51d654e1d885607a6edd02b337105fa5c28b6d33 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Tue, 17 Jun 2008 18:59:56 +0200 Subject: block: Globalize bio_set and bio_vec_slab Move struct bio_set and biovec_slab definitions to bio.h so they can be used outside of bio.c. Signed-off-by: Martin K. Petersen Reviewed-by: Jeff Moyer Signed-off-by: Jens Axboe --- fs/bio.c | 30 ++---------------------------- include/linux/bio.h | 29 +++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/fs/bio.c b/fs/bio.c index 78562574cb52..7a6598abc967 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -28,25 +28,10 @@ #include #include /* for struct sg_iovec */ -#define BIO_POOL_SIZE 2 - static struct kmem_cache *bio_slab __read_mostly; -#define BIOVEC_NR_POOLS 6 - -/* - * a small number of entries is fine, not going to be performance critical. - * basically we just need to survive - */ -#define BIO_SPLIT_ENTRIES 2 mempool_t *bio_split_pool __read_mostly; -struct biovec_slab { - int nr_vecs; - char *name; - struct kmem_cache *slab; -}; - /* * if you change this list, also change bvec_alloc or things will * break badly! cannot be bigger than what you can fit into an @@ -59,24 +44,13 @@ static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = { }; #undef BV -/* - * bio_set is used to allow other portions of the IO system to - * allocate their own private memory pools for bio and iovec structures. - * These memory pools in turn all allocate from the bio_slab - * and the bvec_slabs[]. - */ -struct bio_set { - mempool_t *bio_pool; - mempool_t *bvec_pools[BIOVEC_NR_POOLS]; -}; - /* * fs_bio_set is the bio_set containing bio and iovec memory pools used by * IO code that does not need private memory pools. */ -static struct bio_set *fs_bio_set; +struct bio_set *fs_bio_set; -static inline struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs) +struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs) { struct bio_vec *bvl; diff --git a/include/linux/bio.h b/include/linux/bio.h index 61c15eaf3fb3..49dfb3cb7460 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -333,6 +333,35 @@ extern struct bio *bio_copy_user_iov(struct request_queue *, struct sg_iovec *, int, int); extern int bio_uncopy_user(struct bio *); void zero_fill_bio(struct bio *bio); +extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set *); + +/* + * bio_set is used to allow other portions of the IO system to + * allocate their own private memory pools for bio and iovec structures. + * These memory pools in turn all allocate from the bio_slab + * and the bvec_slabs[]. 
+ */ +#define BIO_POOL_SIZE 2 +#define BIOVEC_NR_POOLS 6 + +struct bio_set { + mempool_t *bio_pool; + mempool_t *bvec_pools[BIOVEC_NR_POOLS]; +}; + +struct biovec_slab { + int nr_vecs; + char *name; + struct kmem_cache *slab; +}; + +extern struct bio_set *fs_bio_set; + +/* + * a small number of entries is fine, not going to be performance critical. + * basically we just need to survive + */ +#define BIO_SPLIT_ENTRIES 2 #ifdef CONFIG_HIGHMEM /* -- cgit v1.2.3 From 7ba1ba12eeef0aa7113beb16410ef8b7c748e18b Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Mon, 30 Jun 2008 20:04:41 +0200 Subject: block: Block layer data integrity support Some block devices support verifying the integrity of requests by way of checksums or other protection information that is submitted along with the I/O. This patch implements support for generating and verifying integrity metadata, as well as correctly merging, splitting and cloning bios and requests that have this extra information attached. See Documentation/block/data-integrity.txt for more information. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/Kconfig | 12 + block/Makefile | 1 + block/blk-core.c | 7 + block/blk-integrity.c | 382 ++++++++++++++++++++++++++ block/blk-merge.c | 3 + block/blk.h | 8 + block/elevator.c | 6 + fs/Makefile | 1 + fs/bio-integrity.c | 708 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/bio.c | 32 ++- include/linux/bio.h | 94 ++++++- include/linux/blkdev.h | 105 ++++++++ include/linux/genhd.h | 3 + 13 files changed, 1355 insertions(+), 7 deletions(-) create mode 100644 block/blk-integrity.c create mode 100644 fs/bio-integrity.c (limited to 'include/linux') diff --git a/block/Kconfig b/block/Kconfig index 3e97f2bc446f..1ab7c15c8d7a 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -81,6 +81,18 @@ config BLK_DEV_BSG If unsure, say N. +config BLK_DEV_INTEGRITY + bool "Block layer data integrity support" + ---help--- + Some storage devices allow extra information to be + stored/retrieved to help protect the data. The block layer + data integrity option provides hooks which can be used by + filesystems to ensure better data integrity. + + Say yes here if you have a storage device that provides the + T10/SCSI Data Integrity Field or the T13/ATA External Path + Protection. If in doubt, say N. 
+ endif # BLOCK config BLOCK_COMPAT diff --git a/block/Makefile b/block/Makefile index 5a43c7d79594..045f7b62e4bb 100644 --- a/block/Makefile +++ b/block/Makefile @@ -14,3 +14,4 @@ obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o +obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o diff --git a/block/blk-core.c b/block/blk-core.c index 1905aaba49fb..e0fb0bcc0c17 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -143,6 +143,10 @@ static void req_bio_endio(struct request *rq, struct bio *bio, bio->bi_size -= nbytes; bio->bi_sector += (nbytes >> 9); + + if (bio_integrity(bio)) + bio_integrity_advance(bio, nbytes); + if (bio->bi_size == 0) bio_endio(bio, error); } else { @@ -1381,6 +1385,9 @@ end_io: */ blk_partition_remap(bio); + if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) + goto end_io; + if (old_sector != -1) blk_add_trace_remap(q, bio, old_dev, bio->bi_sector, old_sector); diff --git a/block/blk-integrity.c b/block/blk-integrity.c new file mode 100644 index 000000000000..65f23ef38bbe --- /dev/null +++ b/block/blk-integrity.c @@ -0,0 +1,382 @@ +/* + * blk-integrity.c - Block layer data integrity extensions + * + * Copyright (C) 2007, 2008 Oracle Corporation + * Written by: Martin K. Petersen + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, + * USA. + * + */ + +#include +#include +#include +#include + +#include "blk.h" + +static struct kmem_cache *integrity_cachep; + +/** + * blk_rq_count_integrity_sg - Count number of integrity scatterlist elements + * @rq: request with integrity metadata attached + * + * Description: Returns the number of elements required in a + * scatterlist corresponding to the integrity metadata in a request. + */ +int blk_rq_count_integrity_sg(struct request *rq) +{ + struct bio_vec *iv, *ivprv; + struct req_iterator iter; + unsigned int segments; + + ivprv = NULL; + segments = 0; + + rq_for_each_integrity_segment(iv, rq, iter) { + + if (!ivprv || !BIOVEC_PHYS_MERGEABLE(ivprv, iv)) + segments++; + + ivprv = iv; + } + + return segments; +} +EXPORT_SYMBOL(blk_rq_count_integrity_sg); + +/** + * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist + * @rq: request with integrity metadata attached + * @sglist: target scatterlist + * + * Description: Map the integrity vectors in request into a + * scatterlist. The scatterlist must be big enough to hold all + * elements. I.e. sized using blk_rq_count_integrity_sg(). 
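/*
 * Editorial sketch, not from the patch: the calling pattern the two
 * functions above and below imply -- size the scatterlist with
 * blk_rq_count_integrity_sg() first, then map into it. The allocation
 * strategy and (omitted) error handling here are hypothetical.
 */
	struct scatterlist *isg;
	int nents = blk_rq_count_integrity_sg(rq);

	isg = kmalloc(sizeof(*isg) * nents, GFP_ATOMIC);
	if (isg) {
		sg_init_table(isg, nents);
		blk_rq_map_integrity_sg(rq, isg);
	}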
+ */ +int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist) +{ + struct bio_vec *iv, *ivprv; + struct req_iterator iter; + struct scatterlist *sg; + unsigned int segments; + + ivprv = NULL; + sg = NULL; + segments = 0; + + rq_for_each_integrity_segment(iv, rq, iter) { + + if (ivprv) { + if (!BIOVEC_PHYS_MERGEABLE(ivprv, iv)) + goto new_segment; + + sg->length += iv->bv_len; + } else { +new_segment: + if (!sg) + sg = sglist; + else { + sg->page_link &= ~0x02; + sg = sg_next(sg); + } + + sg_set_page(sg, iv->bv_page, iv->bv_len, iv->bv_offset); + segments++; + } + + ivprv = iv; + } + + if (sg) + sg_mark_end(sg); + + return segments; +} +EXPORT_SYMBOL(blk_rq_map_integrity_sg); + +/** + * blk_integrity_compare - Compare integrity profile of two block devices + * @b1: Device to compare + * @b2: Device to compare + * + * Description: Meta-devices like DM and MD need to verify that all + * sub-devices use the same integrity format before advertising to + * upper layers that they can send/receive integrity metadata. This + * function can be used to check whether two block devices have + * compatible integrity formats. + */ +int blk_integrity_compare(struct block_device *bd1, struct block_device *bd2) +{ + struct blk_integrity *b1 = bd1->bd_disk->integrity; + struct blk_integrity *b2 = bd2->bd_disk->integrity; + + BUG_ON(bd1->bd_disk == NULL); + BUG_ON(bd2->bd_disk == NULL); + + if (!b1 || !b2) + return 0; + + if (b1->sector_size != b2->sector_size) { + printk(KERN_ERR "%s: %s/%s sector sz %u != %u\n", __func__, + bd1->bd_disk->disk_name, bd2->bd_disk->disk_name, + b1->sector_size, b2->sector_size); + return -1; + } + + if (b1->tuple_size != b2->tuple_size) { + printk(KERN_ERR "%s: %s/%s tuple sz %u != %u\n", __func__, + bd1->bd_disk->disk_name, bd2->bd_disk->disk_name, + b1->tuple_size, b2->tuple_size); + return -1; + } + + if (b1->tag_size && b2->tag_size && (b1->tag_size != b2->tag_size)) { + printk(KERN_ERR "%s: %s/%s tag sz %u != %u\n", __func__, + bd1->bd_disk->disk_name, bd2->bd_disk->disk_name, + b1->tag_size, b2->tag_size); + return -1; + } + + if (strcmp(b1->name, b2->name)) { + printk(KERN_ERR "%s: %s/%s type %s != %s\n", __func__, + bd1->bd_disk->disk_name, bd2->bd_disk->disk_name, + b1->name, b2->name); + return -1; + } + + return 0; +} +EXPORT_SYMBOL(blk_integrity_compare); + +struct integrity_sysfs_entry { + struct attribute attr; + ssize_t (*show)(struct blk_integrity *, char *); + ssize_t (*store)(struct blk_integrity *, const char *, size_t); +}; + +static ssize_t integrity_attr_show(struct kobject *kobj, struct attribute *attr, + char *page) +{ + struct blk_integrity *bi = + container_of(kobj, struct blk_integrity, kobj); + struct integrity_sysfs_entry *entry = + container_of(attr, struct integrity_sysfs_entry, attr); + + return entry->show(bi, page); +} + +static ssize_t integrity_attr_store(struct kobject *kobj, struct attribute *attr, + const char *page, size_t count) +{ + struct blk_integrity *bi = + container_of(kobj, struct blk_integrity, kobj); + struct integrity_sysfs_entry *entry = + container_of(attr, struct integrity_sysfs_entry, attr); + ssize_t ret = 0; + + if (entry->store) + ret = entry->store(bi, page, count); + + return ret; +} + +static ssize_t integrity_format_show(struct blk_integrity *bi, char *page) +{ + if (bi != NULL && bi->name != NULL) + return sprintf(page, "%s\n", bi->name); + else + return sprintf(page, "none\n"); +} + +static ssize_t integrity_tag_size_show(struct blk_integrity *bi, char *page) +{ + if (bi != NULL) + return 
sprintf(page, "%u\n", bi->tag_size); + else + return sprintf(page, "0\n"); +} + +static ssize_t integrity_read_store(struct blk_integrity *bi, + const char *page, size_t count) +{ + char *p = (char *) page; + unsigned long val = simple_strtoul(p, &p, 10); + + if (val) + set_bit(INTEGRITY_FLAG_READ, &bi->flags); + else + clear_bit(INTEGRITY_FLAG_READ, &bi->flags); + + return count; +} + +static ssize_t integrity_read_show(struct blk_integrity *bi, char *page) +{ + return sprintf(page, "%d\n", + test_bit(INTEGRITY_FLAG_READ, &bi->flags) ? 1 : 0); +} + +static ssize_t integrity_write_store(struct blk_integrity *bi, + const char *page, size_t count) +{ + char *p = (char *) page; + unsigned long val = simple_strtoul(p, &p, 10); + + if (val) + set_bit(INTEGRITY_FLAG_WRITE, &bi->flags); + else + clear_bit(INTEGRITY_FLAG_WRITE, &bi->flags); + + return count; +} + +static ssize_t integrity_write_show(struct blk_integrity *bi, char *page) +{ + return sprintf(page, "%d\n", + test_bit(INTEGRITY_FLAG_WRITE, &bi->flags) ? 1 : 0); +} + +static struct integrity_sysfs_entry integrity_format_entry = { + .attr = { .name = "format", .mode = S_IRUGO }, + .show = integrity_format_show, +}; + +static struct integrity_sysfs_entry integrity_tag_size_entry = { + .attr = { .name = "tag_size", .mode = S_IRUGO }, + .show = integrity_tag_size_show, +}; + +static struct integrity_sysfs_entry integrity_read_entry = { + .attr = { .name = "read_verify", .mode = S_IRUGO | S_IWUSR }, + .show = integrity_read_show, + .store = integrity_read_store, +}; + +static struct integrity_sysfs_entry integrity_write_entry = { + .attr = { .name = "write_generate", .mode = S_IRUGO | S_IWUSR }, + .show = integrity_write_show, + .store = integrity_write_store, +}; + +static struct attribute *integrity_attrs[] = { + &integrity_format_entry.attr, + &integrity_tag_size_entry.attr, + &integrity_read_entry.attr, + &integrity_write_entry.attr, + NULL, +}; + +static struct sysfs_ops integrity_ops = { + .show = &integrity_attr_show, + .store = &integrity_attr_store, +}; + +static int __init blk_dev_integrity_init(void) +{ + integrity_cachep = kmem_cache_create("blkdev_integrity", + sizeof(struct blk_integrity), + 0, SLAB_PANIC, NULL); + return 0; +} +subsys_initcall(blk_dev_integrity_init); + +static void blk_integrity_release(struct kobject *kobj) +{ + struct blk_integrity *bi = + container_of(kobj, struct blk_integrity, kobj); + + kmem_cache_free(integrity_cachep, bi); +} + +static struct kobj_type integrity_ktype = { + .default_attrs = integrity_attrs, + .sysfs_ops = &integrity_ops, + .release = blk_integrity_release, +}; + +/** + * blk_integrity_register - Register a gendisk as being integrity-capable + * @disk: struct gendisk pointer to make integrity-aware + * @template: integrity profile + * + * Description: When a device needs to advertise itself as being able + * to send/receive integrity metadata it must use this function to + * register the capability with the block layer. The template is a + * blk_integrity struct with values appropriate for the underlying + * hardware. See Documentation/block/data-integrity.txt. 
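/*
 * Editorial sketch, not from the patch: how a disk driver might
 * advertise T10 DIF Type 1 support. The "foo_*" callbacks are
 * hypothetical; the fields are the ones blk_integrity_register()
 * below copies from the template.
 */
static struct blk_integrity foo_dif_type1_integrity = {
	.name		= "FOO-DIF-TYPE1-CRC",
	.generate_fn	= foo_dif_generate,	/* fill PI tuples on write */
	.verify_fn	= foo_dif_verify,	/* check PI tuples on read */
	.tuple_size	= 8,			/* bytes of PI per sector */
	.tag_size	= 2,			/* app tag bytes per sector */
};

	/* at probe time, once the gendisk exists: */
	blk_integrity_register(disk, &foo_dif_type1_integrity);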
+ */ +int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) +{ + struct blk_integrity *bi; + + BUG_ON(disk == NULL); + BUG_ON(template == NULL); + + if (disk->integrity == NULL) { + bi = kmem_cache_alloc(integrity_cachep, GFP_KERNEL | __GFP_ZERO); + if (!bi) + return -1; + + if (kobject_init_and_add(&bi->kobj, &integrity_ktype, + &disk->dev.kobj, "%s", "integrity")) { + kmem_cache_free(integrity_cachep, bi); + return -1; + } + + kobject_uevent(&bi->kobj, KOBJ_ADD); + + set_bit(INTEGRITY_FLAG_READ, &bi->flags); + set_bit(INTEGRITY_FLAG_WRITE, &bi->flags); + bi->sector_size = disk->queue->hardsect_size; + disk->integrity = bi; + } else + bi = disk->integrity; + + /* Use the provided profile as template */ + bi->name = template->name; + bi->generate_fn = template->generate_fn; + bi->verify_fn = template->verify_fn; + bi->tuple_size = template->tuple_size; + bi->set_tag_fn = template->set_tag_fn; + bi->get_tag_fn = template->get_tag_fn; + bi->tag_size = template->tag_size; + + return 0; +} +EXPORT_SYMBOL(blk_integrity_register); + +/** + * blk_integrity_unregister - Remove block integrity profile + * @disk: disk whose integrity profile to deallocate + * + * Description: This function frees all memory used by the block + * integrity profile. To be called at device teardown. + */ +void blk_integrity_unregister(struct gendisk *disk) +{ + struct blk_integrity *bi; + + if (!disk || !disk->integrity) + return; + + bi = disk->integrity; + + kobject_uevent(&bi->kobj, KOBJ_REMOVE); + kobject_del(&bi->kobj); + kobject_put(&disk->dev.kobj); + kmem_cache_free(integrity_cachep, bi); +} +EXPORT_SYMBOL(blk_integrity_unregister); diff --git a/block/blk-merge.c b/block/blk-merge.c index 651136aae76e..5efc9e7a68b7 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -441,6 +441,9 @@ static int attempt_merge(struct request_queue *q, struct request *req, || next->special) return 0; + if (blk_integrity_rq(req) != blk_integrity_rq(next)) + return 0; + /* * If we are allowed to merge, then append bio list * from next to rq and release next. 
merge_requests_fn diff --git a/block/blk.h b/block/blk.h index 59776ab4742a..c79f30e1df52 100644 --- a/block/blk.h +++ b/block/blk.h @@ -51,4 +51,12 @@ static inline int queue_congestion_off_threshold(struct request_queue *q) return q->nr_congestion_off; } +#if defined(CONFIG_BLK_DEV_INTEGRITY) + +#define rq_for_each_integrity_segment(bvl, _rq, _iter) \ + __rq_for_each_bio(_iter.bio, _rq) \ + bip_for_each_vec(bvl, _iter.bio->bi_integrity, _iter.i) + +#endif /* BLK_DEV_INTEGRITY */ + #endif diff --git a/block/elevator.c b/block/elevator.c index 902dd1344d56..1f5bfe696026 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -86,6 +86,12 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio) if (rq->rq_disk != bio->bi_bdev->bd_disk || rq->special) return 0; + /* + * only merge integrity protected bio into ditto rq + */ + if (bio_integrity(bio) != blk_integrity_rq(rq)) + return 0; + if (!elv_iosched_allow_merge(rq, bio)) return 0; diff --git a/fs/Makefile b/fs/Makefile index 1e7a11bd4da1..277b079dec9e 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -19,6 +19,7 @@ else obj-y += no-block.o endif +obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o obj-$(CONFIG_INOTIFY) += inotify.o obj-$(CONFIG_INOTIFY_USER) += inotify_user.o obj-$(CONFIG_EPOLL) += eventpoll.o diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c new file mode 100644 index 000000000000..31b08878913d --- /dev/null +++ b/fs/bio-integrity.c @@ -0,0 +1,708 @@ +/* + * bio-integrity.c - bio data integrity extensions + * + * Copyright (C) 2007, 2008 Oracle Corporation + * Written by: Martin K. Petersen + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, + * USA. + * + */ + +#include +#include +#include +#include + +static struct kmem_cache *bio_integrity_slab __read_mostly; +static struct workqueue_struct *kintegrityd_wq; + +/** + * bio_integrity_alloc_bioset - Allocate integrity payload and attach it to bio + * @bio: bio to attach integrity metadata to + * @gfp_mask: Memory allocation mask + * @nr_vecs: Number of integrity metadata scatter-gather elements + * @bs: bio_set to allocate from + * + * Description: This function prepares a bio for attaching integrity + * metadata. nr_vecs specifies the maximum number of pages containing + * integrity metadata that can be attached. 
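/*
 * Editorial sketch, not from the patch: manually attaching one page of
 * integrity metadata with the helpers introduced here. "buf" and "len"
 * are assumed to describe a previously filled PI buffer; the
 * bio_integrity_prep() function further down automates all of this for
 * the common case.
 */
	struct bio_integrity_payload *bip;

	bip = bio_integrity_alloc(bio, GFP_NOIO, 1);
	if (!bip)
		return -EIO;
	bio_integrity_add_page(bio, virt_to_page(buf), len,
			       offset_in_page(buf));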
+ */ +struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio, gfp_t gfp_mask, unsigned int nr_vecs, struct bio_set *bs) +{ + struct bio_integrity_payload *bip; + struct bio_vec *iv; + unsigned long idx; + + BUG_ON(bio == NULL); + + bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask); + if (unlikely(bip == NULL)) { + printk(KERN_ERR "%s: could not alloc bip\n", __func__); + return NULL; + } + + memset(bip, 0, sizeof(*bip)); + + iv = bvec_alloc_bs(gfp_mask, nr_vecs, &idx, bs); + if (unlikely(iv == NULL)) { + printk(KERN_ERR "%s: could not alloc bip_vec\n", __func__); + mempool_free(bip, bs->bio_integrity_pool); + return NULL; + } + + bip->bip_pool = idx; + bip->bip_vec = iv; + bip->bip_bio = bio; + bio->bi_integrity = bip; + + return bip; +} +EXPORT_SYMBOL(bio_integrity_alloc_bioset); + +/** + * bio_integrity_alloc - Allocate integrity payload and attach it to bio + * @bio: bio to attach integrity metadata to + * @gfp_mask: Memory allocation mask + * @nr_vecs: Number of integrity metadata scatter-gather elements + * + * Description: This function prepares a bio for attaching integrity + * metadata. nr_vecs specifies the maximum number of pages containing + * integrity metadata that can be attached. + */ +struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp_mask, unsigned int nr_vecs) +{ + return bio_integrity_alloc_bioset(bio, gfp_mask, nr_vecs, fs_bio_set); +} +EXPORT_SYMBOL(bio_integrity_alloc); + +/** + * bio_integrity_free - Free bio integrity payload + * @bio: bio containing bip to be freed + * @bs: bio_set this bio was allocated from + * + * Description: Used to free the integrity portion of a bio. Usually + * called from bio_free(). + */ +void bio_integrity_free(struct bio *bio, struct bio_set *bs) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + + BUG_ON(bip == NULL); + + /* A cloned bio doesn't own the integrity metadata */ + if (!bio_flagged(bio, BIO_CLONED) && bip->bip_buf != NULL) + kfree(bip->bip_buf); + + mempool_free(bip->bip_vec, bs->bvec_pools[bip->bip_pool]); + mempool_free(bip, bs->bio_integrity_pool); + + bio->bi_integrity = NULL; +} +EXPORT_SYMBOL(bio_integrity_free); + +/** + * bio_integrity_add_page - Attach integrity metadata + * @bio: bio to update + * @page: page containing integrity metadata + * @len: number of bytes of integrity metadata in page + * @offset: start offset within page + * + * Description: Attach a page containing integrity metadata to bio. + */ +int bio_integrity_add_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int offset) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + struct bio_vec *iv; + + if (bip->bip_vcnt >= bvec_nr_vecs(bip->bip_pool)) { + printk(KERN_ERR "%s: bip_vec full\n", __func__); + return 0; + } + + iv = bip_vec_idx(bip, bip->bip_vcnt); + BUG_ON(iv == NULL); + BUG_ON(iv->bv_page != NULL); + + iv->bv_page = page; + iv->bv_len = len; + iv->bv_offset = offset; + bip->bip_vcnt++; + + return len; +} +EXPORT_SYMBOL(bio_integrity_add_page); + +/** + * bio_integrity_enabled - Check whether integrity can be passed + * @bio: bio to check + * + * Description: Determines whether bio_integrity_prep() can be called + * on this bio or not. bio data direction and target device must be + * set prior to calling. The functions honors the write_generate and + * read_verify flags in sysfs. + */ +int bio_integrity_enabled(struct bio *bio) +{ + /* Already protected? 
*/ + if (bio_integrity(bio)) + return 0; + + return bdev_integrity_enabled(bio->bi_bdev, bio_data_dir(bio)); +} +EXPORT_SYMBOL(bio_integrity_enabled); + +/** + * bio_integrity_hw_sectors - Convert 512b sectors to hardware ditto + * @bi: blk_integrity profile for device + * @sectors: Number of 512 sectors to convert + * + * Description: The block layer calculates everything in 512 byte + * sectors but integrity metadata is done in terms of the hardware + * sector size of the storage device. Convert the block layer sectors + * to physical sectors. + */ +static inline unsigned int bio_integrity_hw_sectors(struct blk_integrity *bi, unsigned int sectors) +{ + /* At this point there are only 512b or 4096b DIF/EPP devices */ + if (bi->sector_size == 4096) + return sectors >>= 3; + + return sectors; +} + +/** + * bio_integrity_tag_size - Retrieve integrity tag space + * @bio: bio to inspect + * + * Description: Returns the maximum number of tag bytes that can be + * attached to this bio. Filesystems can use this to determine how + * much metadata to attach to an I/O. + */ +unsigned int bio_integrity_tag_size(struct bio *bio) +{ + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + + BUG_ON(bio->bi_size == 0); + + return bi->tag_size * (bio->bi_size / bi->sector_size); +} +EXPORT_SYMBOL(bio_integrity_tag_size); + +int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, int set) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + unsigned int nr_sectors; + + BUG_ON(bip->bip_buf == NULL); + + if (bi->tag_size == 0) + return -1; + + nr_sectors = bio_integrity_hw_sectors(bi, DIV_ROUND_UP(len, bi->tag_size)); + + if (nr_sectors * bi->tuple_size > bip->bip_size) { + printk(KERN_ERR "%s: tag too big for bio: %u > %u\n", + __func__, nr_sectors * bi->tuple_size, bip->bip_size); + return -1; + } + + if (set) + bi->set_tag_fn(bip->bip_buf, tag_buf, nr_sectors); + else + bi->get_tag_fn(bip->bip_buf, tag_buf, nr_sectors); + + return 0; +} + +/** + * bio_integrity_set_tag - Attach a tag buffer to a bio + * @bio: bio to attach buffer to + * @tag_buf: Pointer to a buffer containing tag data + * @len: Length of the included buffer + * + * Description: Use this function to tag a bio by leveraging the extra + * space provided by devices formatted with integrity protection. The + * size of the integrity buffer must be <= to the size reported by + * bio_integrity_tag_size(). + */ +int bio_integrity_set_tag(struct bio *bio, void *tag_buf, unsigned int len) +{ + BUG_ON(bio_data_dir(bio) != WRITE); + + return bio_integrity_tag(bio, tag_buf, len, 1); +} +EXPORT_SYMBOL(bio_integrity_set_tag); + +/** + * bio_integrity_get_tag - Retrieve a tag buffer from a bio + * @bio: bio to retrieve buffer from + * @tag_buf: Pointer to a buffer for the tag data + * @len: Length of the target buffer + * + * Description: Use this function to retrieve the tag buffer from a + * completed I/O. The size of the integrity buffer must be <= to the + * size reported by bio_integrity_tag_size(). + */ +int bio_integrity_get_tag(struct bio *bio, void *tag_buf, unsigned int len) +{ + BUG_ON(bio_data_dir(bio) != READ); + + return bio_integrity_tag(bio, tag_buf, len, 0); +} +EXPORT_SYMBOL(bio_integrity_get_tag); + +/** + * bio_integrity_generate - Generate integrity metadata for a bio + * @bio: bio to generate integrity metadata for + * + * Description: Generates integrity metadata for a bio by calling the + * block device's generation callback function. 
The bio must have a + * bip attached with enough room to accommodate the generated + * integrity metadata. + */ +static void bio_integrity_generate(struct bio *bio) +{ + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + struct blk_integrity_exchg bix; + struct bio_vec *bv; + sector_t sector = bio->bi_sector; + unsigned int i, sectors, total; + void *prot_buf = bio->bi_integrity->bip_buf; + + total = 0; + bix.disk_name = bio->bi_bdev->bd_disk->disk_name; + bix.sector_size = bi->sector_size; + + bio_for_each_segment(bv, bio, i) { + void *kaddr = kmap_atomic(bv->bv_page, KM_USER0); + bix.data_buf = kaddr + bv->bv_offset; + bix.data_size = bv->bv_len; + bix.prot_buf = prot_buf; + bix.sector = sector; + + bi->generate_fn(&bix); + + sectors = bv->bv_len / bi->sector_size; + sector += sectors; + prot_buf += sectors * bi->tuple_size; + total += sectors * bi->tuple_size; + BUG_ON(total > bio->bi_integrity->bip_size); + + kunmap_atomic(kaddr, KM_USER0); + } +} + +/** + * bio_integrity_prep - Prepare bio for integrity I/O + * @bio: bio to prepare + * + * Description: Allocates a buffer for integrity metadata, maps the + * pages and attaches them to a bio. The bio must have data + * direction, target device and start sector set priot to calling. In + * the WRITE case, integrity metadata will be generated using the + * block device's integrity function. In the READ case, the buffer + * will be prepared for DMA and a suitable end_io handler set up. + */ +int bio_integrity_prep(struct bio *bio) +{ + struct bio_integrity_payload *bip; + struct blk_integrity *bi; + struct request_queue *q; + void *buf; + unsigned long start, end; + unsigned int len, nr_pages; + unsigned int bytes, offset, i; + unsigned int sectors; + + bi = bdev_get_integrity(bio->bi_bdev); + q = bdev_get_queue(bio->bi_bdev); + BUG_ON(bi == NULL); + BUG_ON(bio_integrity(bio)); + + sectors = bio_integrity_hw_sectors(bi, bio_sectors(bio)); + + /* Allocate kernel buffer for protection data */ + len = sectors * blk_integrity_tuple_size(bi); + buf = kmalloc(len, GFP_NOIO | __GFP_NOFAIL | q->bounce_gfp); + if (unlikely(buf == NULL)) { + printk(KERN_ERR "could not allocate integrity buffer\n"); + return -EIO; + } + + end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + start = ((unsigned long) buf) >> PAGE_SHIFT; + nr_pages = end - start; + + /* Allocate bio integrity payload and integrity vectors */ + bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages); + if (unlikely(bip == NULL)) { + printk(KERN_ERR "could not allocate data integrity bioset\n"); + kfree(buf); + return -EIO; + } + + bip->bip_buf = buf; + bip->bip_size = len; + bip->bip_sector = bio->bi_sector; + + /* Map it */ + offset = offset_in_page(buf); + for (i = 0 ; i < nr_pages ; i++) { + int ret; + bytes = PAGE_SIZE - offset; + + if (len <= 0) + break; + + if (bytes > len) + bytes = len; + + ret = bio_integrity_add_page(bio, virt_to_page(buf), + bytes, offset); + + if (ret == 0) + return 0; + + if (ret < bytes) + break; + + buf += bytes; + len -= bytes; + offset = 0; + } + + /* Install custom I/O completion handler if read verify is enabled */ + if (bio_data_dir(bio) == READ) { + bip->bip_end_io = bio->bi_end_io; + bio->bi_end_io = bio_integrity_endio; + } + + /* Auto-generate integrity metadata if this is a write */ + if (bio_data_dir(bio) == WRITE) + bio_integrity_generate(bio); + + return 0; +} +EXPORT_SYMBOL(bio_integrity_prep); + +/** + * bio_integrity_verify - Verify integrity metadata for a bio + * @bio: bio to verify + * + * Description: This 
function is called to verify the integrity of a + * bio. The data in the bio io_vec is compared to the integrity + * metadata returned by the HBA. + */ +static int bio_integrity_verify(struct bio *bio) +{ + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + struct blk_integrity_exchg bix; + struct bio_vec *bv; + sector_t sector = bio->bi_integrity->bip_sector; + unsigned int i, sectors, total, ret; + void *prot_buf = bio->bi_integrity->bip_buf; + + ret = total = 0; + bix.disk_name = bio->bi_bdev->bd_disk->disk_name; + bix.sector_size = bi->sector_size; + + bio_for_each_segment(bv, bio, i) { + void *kaddr = kmap_atomic(bv->bv_page, KM_USER0); + bix.data_buf = kaddr + bv->bv_offset; + bix.data_size = bv->bv_len; + bix.prot_buf = prot_buf; + bix.sector = sector; + + ret = bi->verify_fn(&bix); + + if (ret) { + kunmap_atomic(kaddr, KM_USER0); + break; + } + + sectors = bv->bv_len / bi->sector_size; + sector += sectors; + prot_buf += sectors * bi->tuple_size; + total += sectors * bi->tuple_size; + BUG_ON(total > bio->bi_integrity->bip_size); + + kunmap_atomic(kaddr, KM_USER0); + } + + return ret; +} + +/** + * bio_integrity_verify_fn - Integrity I/O completion worker + * @work: Work struct stored in bio to be verified + * + * Description: This workqueue function is called to complete a READ + * request. The function verifies the transferred integrity metadata + * and then calls the original bio end_io function. + */ +static void bio_integrity_verify_fn(struct work_struct *work) +{ + struct bio_integrity_payload *bip = + container_of(work, struct bio_integrity_payload, bip_work); + struct bio *bio = bip->bip_bio; + int error = bip->bip_error; + + if (bio_integrity_verify(bio)) { + clear_bit(BIO_UPTODATE, &bio->bi_flags); + error = -EIO; + } + + /* Restore original bio completion handler */ + bio->bi_end_io = bip->bip_end_io; + + if (bio->bi_end_io) + bio->bi_end_io(bio, error); +} + +/** + * bio_integrity_endio - Integrity I/O completion function + * @bio: Protected bio + * @error: Pointer to errno + * + * Description: Completion for integrity I/O + * + * Normally I/O completion is done in interrupt context. However, + * verifying I/O integrity is a time-consuming task which must be run + * in process context. This function postpones completion + * accordingly. 
+ */ +void bio_integrity_endio(struct bio *bio, int error) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + + BUG_ON(bip->bip_bio != bio); + + bip->bip_error = error; + INIT_WORK(&bip->bip_work, bio_integrity_verify_fn); + queue_work(kintegrityd_wq, &bip->bip_work); +} +EXPORT_SYMBOL(bio_integrity_endio); + +/** + * bio_integrity_mark_head - Advance bip_vec by skip bytes + * @bip: Integrity vector to advance + * @skip: Number of bytes to advance it + */ +void bio_integrity_mark_head(struct bio_integrity_payload *bip, unsigned int skip) +{ + struct bio_vec *iv; + unsigned int i; + + bip_for_each_vec(iv, bip, i) { + if (skip == 0) { + bip->bip_idx = i; + return; + } else if (skip >= iv->bv_len) { + skip -= iv->bv_len; + } else { /* skip < iv->bv_len */ + iv->bv_offset += skip; + iv->bv_len -= skip; + bip->bip_idx = i; + return; + } + } +} + +/** + * bio_integrity_mark_tail - Truncate bip_vec to be len bytes long + * @bip: Integrity vector to truncate + * @len: New length of integrity vector + */ +void bio_integrity_mark_tail(struct bio_integrity_payload *bip, unsigned int len) +{ + struct bio_vec *iv; + unsigned int i; + + bip_for_each_vec(iv, bip, i) { + if (len == 0) { + bip->bip_vcnt = i; + return; + } else if (len >= iv->bv_len) { + len -= iv->bv_len; + } else { /* len < iv->bv_len */ + iv->bv_len = len; + len = 0; + } + } +} + +/** + * bio_integrity_advance - Advance integrity vector + * @bio: bio whose integrity vector to update + * @bytes_done: number of data bytes that have been completed + * + * Description: This function calculates how many integrity bytes the + * number of completed data bytes correspond to and advances the + * integrity vector accordingly. + */ +void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + unsigned int nr_sectors; + + BUG_ON(bip == NULL); + BUG_ON(bi == NULL); + + nr_sectors = bio_integrity_hw_sectors(bi, bytes_done >> 9); + bio_integrity_mark_head(bip, nr_sectors * bi->tuple_size); +} +EXPORT_SYMBOL(bio_integrity_advance); + +/** + * bio_integrity_trim - Trim integrity vector + * @bio: bio whose integrity vector to update + * @offset: offset to first data sector + * @sectors: number of data sectors + * + * Description: Used to trim the integrity vector in a cloned bio. + * The ivec will be advanced corresponding to 'offset' data sectors + * and the length will be truncated corresponding to 'sectors' data + * sectors. + */ +void bio_integrity_trim(struct bio *bio, unsigned int offset, unsigned int sectors) +{ + struct bio_integrity_payload *bip = bio->bi_integrity; + struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); + unsigned int nr_sectors; + + BUG_ON(bip == NULL); + BUG_ON(bi == NULL); + BUG_ON(!bio_flagged(bio, BIO_CLONED)); + + nr_sectors = bio_integrity_hw_sectors(bi, sectors); + bip->bip_sector = bip->bip_sector + offset; + bio_integrity_mark_head(bip, offset * bi->tuple_size); + bio_integrity_mark_tail(bip, sectors * bi->tuple_size); +} +EXPORT_SYMBOL(bio_integrity_trim); + +/** + * bio_integrity_split - Split integrity metadata + * @bio: Protected bio + * @bp: Resulting bio_pair + * @sectors: Number of data sectors in the first part of the split + * + * Description: Splits an integrity page into a bio_pair. 
+ */ +void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors) +{ + struct blk_integrity *bi; + struct bio_integrity_payload *bip = bio->bi_integrity; + unsigned int nr_sectors; + + if (bio_integrity(bio) == 0) + return; + + bi = bdev_get_integrity(bio->bi_bdev); + BUG_ON(bi == NULL); + BUG_ON(bip->bip_vcnt != 1); + + nr_sectors = bio_integrity_hw_sectors(bi, sectors); + + bp->bio1.bi_integrity = &bp->bip1; + bp->bio2.bi_integrity = &bp->bip2; + + bp->iv1 = bip->bip_vec[0]; + bp->iv2 = bip->bip_vec[0]; + + bp->bip1.bip_vec = &bp->iv1; + bp->bip2.bip_vec = &bp->iv2; + + bp->iv1.bv_len = sectors * bi->tuple_size; + bp->iv2.bv_offset += sectors * bi->tuple_size; + bp->iv2.bv_len -= sectors * bi->tuple_size; + + bp->bip1.bip_sector = bio->bi_integrity->bip_sector; + bp->bip2.bip_sector = bio->bi_integrity->bip_sector + nr_sectors; + + bp->bip1.bip_vcnt = bp->bip2.bip_vcnt = 1; + bp->bip1.bip_idx = bp->bip2.bip_idx = 0; +} +EXPORT_SYMBOL(bio_integrity_split); + +/** + * bio_integrity_clone - Callback for cloning bios with integrity metadata + * @bio: New bio + * @bio_src: Original bio + * @bs: bio_set to allocate bip from + * + * Description: Called to allocate a bip when cloning a bio + */ +int bio_integrity_clone(struct bio *bio, struct bio *bio_src, struct bio_set *bs) +{ + struct bio_integrity_payload *bip_src = bio_src->bi_integrity; + struct bio_integrity_payload *bip; + + BUG_ON(bip_src == NULL); + + bip = bio_integrity_alloc_bioset(bio, GFP_NOIO, bip_src->bip_vcnt, bs); + + if (bip == NULL) + return -EIO; + + memcpy(bip->bip_vec, bip_src->bip_vec, + bip_src->bip_vcnt * sizeof(struct bio_vec)); + + bip->bip_sector = bip_src->bip_sector; + bip->bip_vcnt = bip_src->bip_vcnt; + bip->bip_idx = bip_src->bip_idx; + + return 0; +} +EXPORT_SYMBOL(bio_integrity_clone); + +int bioset_integrity_create(struct bio_set *bs, int pool_size) +{ + bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, + bio_integrity_slab); + if (!bs->bio_integrity_pool) + return -1; + + return 0; +} +EXPORT_SYMBOL(bioset_integrity_create); + +void bioset_integrity_free(struct bio_set *bs) +{ + if (bs->bio_integrity_pool) + mempool_destroy(bs->bio_integrity_pool); +} +EXPORT_SYMBOL(bioset_integrity_free); + +void __init bio_integrity_init_slab(void) +{ + bio_integrity_slab = KMEM_CACHE(bio_integrity_payload, + SLAB_HWCACHE_ALIGN|SLAB_PANIC); +} +EXPORT_SYMBOL(bio_integrity_init_slab); + +static int __init integrity_init(void) +{ + kintegrityd_wq = create_workqueue("kintegrityd"); + + if (!kintegrityd_wq) + panic("Failed to create kintegrityd\n"); + + return 0; +} +subsys_initcall(integrity_init); diff --git a/fs/bio.c b/fs/bio.c index 7a6598abc967..7761c84c7032 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -50,6 +50,11 @@ static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = { */ struct bio_set *fs_bio_set; +unsigned int bvec_nr_vecs(unsigned short idx) +{ + return bvec_slabs[idx].nr_vecs; +} + struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs) { struct bio_vec *bvl; @@ -91,6 +96,9 @@ void bio_free(struct bio *bio, struct bio_set *bio_set) mempool_free(bio->bi_io_vec, bio_set->bvec_pools[pool_idx]); } + if (bio_integrity(bio)) + bio_integrity_free(bio, bio_set); + mempool_free(bio, bio_set->bio_pool); } @@ -249,9 +257,19 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) { struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set); - if (b) { - b->bi_destructor = bio_fs_destructor; - __bio_clone(b, bio); + if (!b) + return 
NULL; + + b->bi_destructor = bio_fs_destructor; + __bio_clone(b, bio); + + if (bio_integrity(bio)) { + int ret; + + ret = bio_integrity_clone(b, bio, fs_bio_set); + + if (ret < 0) + return NULL; } return b; @@ -1223,6 +1241,9 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors) bp->bio1.bi_private = bi; bp->bio2.bi_private = pool; + if (bio_integrity(bi)) + bio_integrity_split(bi, bp, first_sectors); + return bp; } @@ -1264,6 +1285,7 @@ void bioset_free(struct bio_set *bs) if (bs->bio_pool) mempool_destroy(bs->bio_pool); + bioset_integrity_free(bs); biovec_free_pools(bs); kfree(bs); @@ -1280,6 +1302,9 @@ struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size) if (!bs->bio_pool) goto bad; + if (bioset_integrity_create(bs, bio_pool_size)) + goto bad; + if (!biovec_create_pools(bs, bvec_pool_size)) return bs; @@ -1306,6 +1331,7 @@ static int __init init_bio(void) { bio_slab = KMEM_CACHE(bio, SLAB_HWCACHE_ALIGN|SLAB_PANIC); + bio_integrity_init_slab(); biovec_init_slabs(); fs_bio_set = bioset_create(BIO_POOL_SIZE, 2); diff --git a/include/linux/bio.h b/include/linux/bio.h index 49dfb3cb7460..6bfc3e8d9d89 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -64,6 +64,7 @@ struct bio_vec { struct bio_set; struct bio; +struct bio_integrity_payload; typedef void (bio_end_io_t) (struct bio *, int); typedef void (bio_destructor_t) (struct bio *); @@ -112,6 +113,9 @@ struct bio { atomic_t bi_cnt; /* pin count */ void *bi_private; +#if defined(CONFIG_BLK_DEV_INTEGRITY) + struct bio_integrity_payload *bi_integrity; /* data integrity */ +#endif bio_destructor_t *bi_destructor; /* destructor */ }; @@ -271,6 +275,29 @@ static inline void *bio_data(struct bio *bio) */ #define bio_get(bio) atomic_inc(&(bio)->bi_cnt) +#if defined(CONFIG_BLK_DEV_INTEGRITY) +/* + * bio integrity payload + */ +struct bio_integrity_payload { + struct bio *bip_bio; /* parent bio */ + struct bio_vec *bip_vec; /* integrity data vector */ + + sector_t bip_sector; /* virtual start sector */ + + void *bip_buf; /* generated integrity data */ + bio_end_io_t *bip_end_io; /* saved I/O completion fn */ + + int bip_error; /* saved I/O error */ + unsigned int bip_size; + + unsigned short bip_pool; /* pool the ivec came from */ + unsigned short bip_vcnt; /* # of integrity bio_vecs */ + unsigned short bip_idx; /* current bip_vec index */ + + struct work_struct bip_work; /* I/O completion */ +}; +#endif /* CONFIG_BLK_DEV_INTEGRITY */ /* * A bio_pair is used when we need to split a bio. 
@@ -283,10 +310,14 @@ static inline void *bio_data(struct bio *bio) * in bio2.bi_private */ struct bio_pair { - struct bio bio1, bio2; - struct bio_vec bv1, bv2; - atomic_t cnt; - int error; + struct bio bio1, bio2; + struct bio_vec bv1, bv2; +#if defined(CONFIG_BLK_DEV_INTEGRITY) + struct bio_integrity_payload bip1, bip2; + struct bio_vec iv1, iv2; +#endif + atomic_t cnt; + int error; }; extern struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors); @@ -334,6 +365,7 @@ extern struct bio *bio_copy_user_iov(struct request_queue *, struct sg_iovec *, extern int bio_uncopy_user(struct bio *); void zero_fill_bio(struct bio *bio); extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set *); +extern unsigned int bvec_nr_vecs(unsigned short idx); /* * bio_set is used to allow other portions of the IO system to @@ -346,6 +378,9 @@ extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set struct bio_set { mempool_t *bio_pool; +#if defined(CONFIG_BLK_DEV_INTEGRITY) + mempool_t *bio_integrity_pool; +#endif mempool_t *bvec_pools[BIOVEC_NR_POOLS]; }; @@ -410,5 +445,56 @@ static inline char *__bio_kmap_irq(struct bio *bio, unsigned short idx, __bio_kmap_irq((bio), (bio)->bi_idx, (flags)) #define bio_kunmap_irq(buf,flags) __bio_kunmap_irq(buf, flags) +#if defined(CONFIG_BLK_DEV_INTEGRITY) + +#define bip_vec_idx(bip, idx) (&(bip->bip_vec[(idx)])) +#define bip_vec(bip) bip_vec_idx(bip, 0) + +#define __bip_for_each_vec(bvl, bip, i, start_idx) \ + for (bvl = bip_vec_idx((bip), (start_idx)), i = (start_idx); \ + i < (bip)->bip_vcnt; \ + bvl++, i++) + +#define bip_for_each_vec(bvl, bip, i) \ + __bip_for_each_vec(bvl, bip, i, (bip)->bip_idx) + +#define bio_integrity(bio) ((bio)->bi_integrity ? 1 : 0) + +extern struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *, gfp_t, unsigned int, struct bio_set *); +extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int); +extern void bio_integrity_free(struct bio *, struct bio_set *); +extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int); +extern int bio_integrity_enabled(struct bio *bio); +extern int bio_integrity_set_tag(struct bio *, void *, unsigned int); +extern int bio_integrity_get_tag(struct bio *, void *, unsigned int); +extern int bio_integrity_prep(struct bio *); +extern void bio_integrity_endio(struct bio *, int); +extern void bio_integrity_advance(struct bio *, unsigned int); +extern void bio_integrity_trim(struct bio *, unsigned int, unsigned int); +extern void bio_integrity_split(struct bio *, struct bio_pair *, int); +extern int bio_integrity_clone(struct bio *, struct bio *, struct bio_set *); +extern int bioset_integrity_create(struct bio_set *, int); +extern void bioset_integrity_free(struct bio_set *); +extern void bio_integrity_init_slab(void); + +#else /* CONFIG_BLK_DEV_INTEGRITY */ + +#define bio_integrity(a) (0) +#define bioset_integrity_create(a, b) (0) +#define bio_integrity_prep(a) (0) +#define bio_integrity_enabled(a) (0) +#define bio_integrity_clone(a, b, c) (0) +#define bioset_integrity_free(a) do { } while (0) +#define bio_integrity_free(a, b) do { } while (0) +#define bio_integrity_endio(a, b) do { } while (0) +#define bio_integrity_advance(a, b) do { } while (0) +#define bio_integrity_trim(a, b, c) do { } while (0) +#define bio_integrity_split(a, b, c) do { } while (0) +#define bio_integrity_set_tag(a, b, c) do { } while (0) +#define bio_integrity_get_tag(a, b, c) do { } while (0) 
+#define bio_integrity_init_slab(a) do { } while (0) + +#endif /* CONFIG_BLK_DEV_INTEGRITY */ + #endif /* CONFIG_BLOCK */ #endif /* __LINUX_BIO_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6a3da6717135..4a9ed45270ff 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -112,6 +112,7 @@ enum rq_flag_bits { __REQ_ALLOCED, /* request came from our alloc pool */ __REQ_RW_META, /* metadata io request */ __REQ_COPY_USER, /* contains copies of user pages */ + __REQ_INTEGRITY, /* integrity metadata has been remapped */ __REQ_NR_BITS, /* stops here */ }; @@ -134,6 +135,7 @@ enum rq_flag_bits { #define REQ_ALLOCED (1 << __REQ_ALLOCED) #define REQ_RW_META (1 << __REQ_RW_META) #define REQ_COPY_USER (1 << __REQ_COPY_USER) +#define REQ_INTEGRITY (1 << __REQ_INTEGRITY) #define BLK_MAX_CDB 16 @@ -865,6 +867,109 @@ void kblockd_flush_work(struct work_struct *work); MODULE_ALIAS("block-major-" __stringify(major) "-*") +#if defined(CONFIG_BLK_DEV_INTEGRITY) + +#define INTEGRITY_FLAG_READ 1 /* verify data integrity on read */ +#define INTEGRITY_FLAG_WRITE 2 /* generate data integrity on write */ + +struct blk_integrity_exchg { + void *prot_buf; + void *data_buf; + sector_t sector; + unsigned int data_size; + unsigned short sector_size; + const char *disk_name; +}; + +typedef void (integrity_gen_fn) (struct blk_integrity_exchg *); +typedef int (integrity_vrfy_fn) (struct blk_integrity_exchg *); +typedef void (integrity_set_tag_fn) (void *, void *, unsigned int); +typedef void (integrity_get_tag_fn) (void *, void *, unsigned int); + +struct blk_integrity { + integrity_gen_fn *generate_fn; + integrity_vrfy_fn *verify_fn; + integrity_set_tag_fn *set_tag_fn; + integrity_get_tag_fn *get_tag_fn; + + unsigned short flags; + unsigned short tuple_size; + unsigned short sector_size; + unsigned short tag_size; + + const char *name; + + struct kobject kobj; +}; + +extern int blk_integrity_register(struct gendisk *, struct blk_integrity *); +extern void blk_integrity_unregister(struct gendisk *); +extern int blk_integrity_compare(struct block_device *, struct block_device *); +extern int blk_rq_map_integrity_sg(struct request *, struct scatterlist *); +extern int blk_rq_count_integrity_sg(struct request *); + +static inline unsigned short blk_integrity_tuple_size(struct blk_integrity *bi) +{ + if (bi) + return bi->tuple_size; + + return 0; +} + +static inline struct blk_integrity *bdev_get_integrity(struct block_device *bdev) +{ + return bdev->bd_disk->integrity; +} + +static inline unsigned int bdev_get_tag_size(struct block_device *bdev) +{ + struct blk_integrity *bi = bdev_get_integrity(bdev); + + if (bi) + return bi->tag_size; + + return 0; +} + +static inline int bdev_integrity_enabled(struct block_device *bdev, int rw) +{ + struct blk_integrity *bi = bdev_get_integrity(bdev); + + if (bi == NULL) + return 0; + + if (rw == READ && bi->verify_fn != NULL && + test_bit(INTEGRITY_FLAG_READ, &bi->flags)) + return 1; + + if (rw == WRITE && bi->generate_fn != NULL && + test_bit(INTEGRITY_FLAG_WRITE, &bi->flags)) + return 1; + + return 0; +} + +static inline int blk_integrity_rq(struct request *rq) +{ + BUG_ON(rq->bio == NULL); + + return bio_integrity(rq->bio); +} + +#else /* CONFIG_BLK_DEV_INTEGRITY */ + +#define blk_integrity_rq(rq) (0) +#define blk_rq_count_integrity_sg(a) (0) +#define blk_rq_map_integrity_sg(a, b) (0) +#define bdev_get_integrity(a) (0) +#define bdev_get_tag_size(a) (0) +#define blk_integrity_compare(a, b) (0) +#define blk_integrity_register(a, b) (0) +#define 
blk_integrity_unregister(a) do { } while (0); + +#endif /* CONFIG_BLK_DEV_INTEGRITY */ + + #else /* CONFIG_BLOCK */ /* * stubs for when the block layer is configured out diff --git a/include/linux/genhd.h b/include/linux/genhd.h index ae7aec3cabee..524ec96f5a23 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -141,6 +141,9 @@ struct gendisk { struct disk_stats dkstats; #endif struct work_struct async_notify; +#ifdef CONFIG_BLK_DEV_INTEGRITY + struct blk_integrity *integrity; +#endif }; /* -- cgit v1.2.3 From da9cbc87395308a21465bd25441297bbba0477e1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 30 Jun 2008 20:42:08 +0200 Subject: block: blkdev.h cleanup, move iocontext stuff to iocontext.h Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 17 ----------------- include/linux/iocontext.h | 18 ++++++++++++++++++ kernel/exit.c | 1 + kernel/fork.c | 1 + 4 files changed, 20 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4a9ed45270ff..443df75d2cde 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -33,12 +33,6 @@ struct sg_io_hdr; #define BLKDEV_MIN_RQ 4 #define BLKDEV_MAX_RQ 128 /* Default maximum */ -int put_io_context(struct io_context *ioc); -void exit_io_context(void); -struct io_context *get_io_context(gfp_t gfp_flags, int node); -struct io_context *alloc_io_context(gfp_t gfp_flags, int node); -void copy_io_context(struct io_context **pdst, struct io_context **psrc); - struct request; typedef void (rq_end_io_fn)(struct request *, int); @@ -981,17 +975,6 @@ static inline long nr_blockdev_pages(void) return 0; } -static inline void exit_io_context(void) -{ -} - -struct io_context; -static inline int put_io_context(struct io_context *ioc) -{ - return 1; -} - - #endif /* CONFIG_BLOCK */ #endif diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 2b7a1187cb29..08b987bccf89 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -99,4 +99,22 @@ static inline struct io_context *ioc_task_link(struct io_context *ioc) return NULL; } +#ifdef CONFIG_BLOCK +int put_io_context(struct io_context *ioc); +void exit_io_context(void); +struct io_context *get_io_context(gfp_t gfp_flags, int node); +struct io_context *alloc_io_context(gfp_t gfp_flags, int node); +void copy_io_context(struct io_context **pdst, struct io_context **psrc); +#else +static inline void exit_io_context(void) +{ +} + +struct io_context; +static inline int put_io_context(struct io_context *ioc) +{ + return 1; +} +#endif + #endif diff --git a/kernel/exit.c b/kernel/exit.c index 8f6185e69b69..ceb258782835 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/fork.c b/kernel/fork.c index 19908b26cf80..b71ccd09fc8d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From 6e2401ad6f33de15ff00f78b88159f00a14f3b35 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 18 Jun 2008 10:15:02 +0200 Subject: block: integrity cleanups - No need to check for NULL bio, we'll get an immediate oops anyway. - Make bio_integrity() a proper function. 
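As a hedged illustration (explanatory only, not part of the patch): with the old stub macro the argument was never evaluated or type-checked when CONFIG_BLK_DEV_INTEGRITY was off, whereas the static inline below gives the same answer while keeping full type checking in both configurations:

    /* old stub: compiles even if 'a' is not a struct bio * */
    #define bio_integrity(a)	(0)

    /* new form: same result, argument always type-checked */
    static inline int bio_integrity(struct bio *bio)
    {
    #if defined(CONFIG_BLK_DEV_INTEGRITY)
    	return bio->bi_integrity != NULL;
    #else
    	return 0;
    #endif
    }
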
Signed-off-by: Jens Axboe --- include/linux/bio.h | 9 ++++++++- include/linux/blkdev.h | 4 ---- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 6bfc3e8d9d89..0933a14e6414 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -458,7 +458,14 @@ static inline char *__bio_kmap_irq(struct bio *bio, unsigned short idx, #define bip_for_each_vec(bvl, bip, i) \ __bip_for_each_vec(bvl, bip, i, (bip)->bip_idx) -#define bio_integrity(bio) ((bio)->bi_integrity ? 1 : 0) +static inline int bio_integrity(struct bio *bio) +{ +#if defined(CONFIG_BLK_DEV_INTEGRITY) + return bio->bi_integrity != NULL; +#else + return 0; +#endif +} extern struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *, gfp_t, unsigned int, struct bio_set *); extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 443df75d2cde..d3ae9ad97213 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -860,7 +860,6 @@ void kblockd_flush_work(struct work_struct *work); #define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \ MODULE_ALIAS("block-major-" __stringify(major) "-*") - #if defined(CONFIG_BLK_DEV_INTEGRITY) #define INTEGRITY_FLAG_READ 1 /* verify data integrity on read */ @@ -945,8 +944,6 @@ static inline int bdev_integrity_enabled(struct block_device *bdev, int rw) static inline int blk_integrity_rq(struct request *rq) { - BUG_ON(rq->bio == NULL); - return bio_integrity(rq->bio); } @@ -963,7 +960,6 @@ static inline int blk_integrity_rq(struct request *rq) #endif /* CONFIG_BLK_DEV_INTEGRITY */ - #else /* CONFIG_BLOCK */ /* * stubs for when the block layer is configured out -- cgit v1.2.3 From 0b07de85a76e1346e675f0e98437378932473df7 Mon Sep 17 00:00:00 2001 From: Adel Gadllah Date: Thu, 26 Jun 2008 13:48:27 +0200 Subject: allow userspace to modify scsi command filter on per device basis This patch exports the per-gendisk command filter to user space through sysfs, so it can be changed by the system administrator. All users of the old cmd filter have been converted to use the new one. Original patch from Peter Jones. 
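As a hedged usage sketch (the disk name and opcodes are illustrative, and the /sys/block/sda/cmd_filter path assumes the "cmd_filter" kobject that blk_register_filter() adds under the disk directory), a privileged helper could install a new write-safe table. Note that rcf_cmds_store() replaces the whole table rather than appending to it:

    #include <stdio.h>

    int main(void)
    {
    	/* storing requires CAP_SYS_RAWIO; path is hypothetical */
    	FILE *f = fopen("/sys/block/sda/cmd_filter/write_table", "w");

    	if (!f) {
    		perror("fopen");
    		return 1;
    	}

    	/* two-digit hex opcodes separated by single spaces, as parsed
    	 * by rcf_cmds_store(); this permits only WRITE_10 (0x2a) and
    	 * WRITE_16 (0x8a) */
    	fprintf(f, "2a 8a");
    	fclose(f);
    	return 0;
    }
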
Signed-off-by: Adel Gadllah Signed-off-by: Peter Jones Signed-off-by: Jens Axboe --- block/Makefile | 3 +- block/bsg.c | 38 ++++-- block/cmd-filter.c | 325 +++++++++++++++++++++++++++++++++++++++++++++++++ block/genhd.c | 2 + block/scsi_ioctl.c | 121 +----------------- drivers/scsi/sg.c | 40 ++---- include/linux/blkdev.h | 10 +- include/linux/genhd.h | 9 ++ 8 files changed, 389 insertions(+), 159 deletions(-) create mode 100644 block/cmd-filter.c (limited to 'include/linux') diff --git a/block/Makefile b/block/Makefile index 045f7b62e4bb..208000b0750d 100644 --- a/block/Makefile +++ b/block/Makefile @@ -4,7 +4,8 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \ - blk-exec.o blk-merge.o ioctl.o genhd.o scsi_ioctl.o + blk-exec.o blk-merge.o ioctl.o genhd.o scsi_ioctl.o \ + cmd-filter.o obj-$(CONFIG_BLK_DEV_BSG) += bsg.o obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o diff --git a/block/bsg.c b/block/bsg.c index f0b7cd343216..439940c3a1ff 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -44,11 +44,12 @@ struct bsg_device { char name[BUS_ID_SIZE]; int max_queue; unsigned long flags; + struct blk_scsi_cmd_filter *cmd_filter; + mode_t *f_mode; }; enum { BSG_F_BLOCK = 1, - BSG_F_WRITE_PERM = 2, }; #define BSG_DEFAULT_CMDS 64 @@ -172,7 +173,7 @@ unlock: } static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq, - struct sg_io_v4 *hdr, int has_write_perm) + struct sg_io_v4 *hdr, struct bsg_device *bd) { if (hdr->request_len > BLK_MAX_CDB) { rq->cmd = kzalloc(hdr->request_len, GFP_KERNEL); @@ -185,7 +186,8 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq, return -EFAULT; if (hdr->subprotocol == BSG_SUB_PROTOCOL_SCSI_CMD) { - if (blk_verify_command(rq->cmd, has_write_perm)) + if (blk_cmd_filter_verify_command(bd->cmd_filter, rq->cmd, + bd->f_mode)) return -EPERM; } else if (!capable(CAP_SYS_RAWIO)) return -EPERM; @@ -263,8 +265,7 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr) rq = blk_get_request(q, rw, GFP_KERNEL); if (!rq) return ERR_PTR(-ENOMEM); - ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, test_bit(BSG_F_WRITE_PERM, - &bd->flags)); + ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, bd); if (ret) goto out; @@ -566,12 +567,23 @@ static inline void bsg_set_block(struct bsg_device *bd, struct file *file) set_bit(BSG_F_BLOCK, &bd->flags); } -static inline void bsg_set_write_perm(struct bsg_device *bd, struct file *file) +static void bsg_set_cmd_filter(struct bsg_device *bd, + struct file *file) { - if (file->f_mode & FMODE_WRITE) - set_bit(BSG_F_WRITE_PERM, &bd->flags); - else - clear_bit(BSG_F_WRITE_PERM, &bd->flags); + struct inode *inode; + struct gendisk *disk; + + if (!file) + return; + + inode = file->f_dentry->d_inode; + if (!inode) + return; + + disk = inode->i_bdev->bd_disk; + + bd->cmd_filter = &disk->cmd_filter; + bd->f_mode = &file->f_mode; } /* @@ -595,6 +607,8 @@ bsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) dprintk("%s: read %Zd bytes\n", bd->name, count); bsg_set_block(bd, file); + bsg_set_cmd_filter(bd, file); + bytes_read = 0; ret = __bsg_read(buf, count, bd, NULL, &bytes_read); *ppos = bytes_read; @@ -668,7 +682,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) dprintk("%s: write %Zd bytes\n", bd->name, count); bsg_set_block(bd, file); - bsg_set_write_perm(bd, file); + bsg_set_cmd_filter(bd, file); bytes_written = 0; ret = __bsg_write(bd, buf, count, &bytes_written); @@ -771,7 +785,9 @@ static struct 
bsg_device *bsg_add_device(struct inode *inode, } bd->queue = rq; + bsg_set_block(bd, file); + bsg_set_cmd_filter(bd, file); atomic_set(&bd->ref_count, 1); mutex_lock(&bsg_mutex); diff --git a/block/cmd-filter.c b/block/cmd-filter.c new file mode 100644 index 000000000000..35e327ceaa97 --- /dev/null +++ b/block/cmd-filter.c @@ -0,0 +1,325 @@ +/* + * Copyright 2004 Peter M. Jones + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public Licens + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- + * + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +int blk_cmd_filter_verify_command(struct blk_scsi_cmd_filter *filter, + unsigned char *cmd, mode_t *f_mode) +{ + /* root can do any command. */ + if (capable(CAP_SYS_RAWIO)) + return 0; + + /* if there's no filter set, assume we're filtering everything out */ + if (!filter) + return -EPERM; + + /* Anybody who can open the device can do a read-safe command */ + if (test_bit(cmd[0], filter->read_ok)) + return 0; + + /* Write-safe commands require a writable open */ + if (test_bit(cmd[0], filter->write_ok) && (*f_mode & FMODE_WRITE)) + return 0; + + return -EPERM; +} +EXPORT_SYMBOL(blk_cmd_filter_verify_command); + +int blk_verify_command(struct file *file, unsigned char *cmd) +{ + struct gendisk *disk; + struct inode *inode; + + if (!file) + return -EINVAL; + + inode = file->f_dentry->d_inode; + if (!inode) + return -EINVAL; + + disk = inode->i_bdev->bd_disk; + + return blk_cmd_filter_verify_command(&disk->cmd_filter, + cmd, &file->f_mode); +} +EXPORT_SYMBOL(blk_verify_command); + +/* and now, the sysfs stuff */ +static ssize_t rcf_cmds_show(struct blk_scsi_cmd_filter *filter, char *page, + int rw) +{ + char *npage = page; + unsigned long *okbits; + int i; + + if (rw == READ) + okbits = filter->read_ok; + else + okbits = filter->write_ok; + + for (i = 0; i < BLK_SCSI_MAX_CMDS; i++) { + if (test_bit(i, okbits)) { + sprintf(npage, "%02x", i); + npage += 2; + if (i < BLK_SCSI_MAX_CMDS - 1) + sprintf(npage++, " "); + } + } + + if (npage != page) + npage += sprintf(npage, "\n"); + + return npage - page; +} + +static ssize_t rcf_readcmds_show(struct blk_scsi_cmd_filter *filter, char *page) +{ + return rcf_cmds_show(filter, page, READ); +} + +static ssize_t rcf_writecmds_show(struct blk_scsi_cmd_filter *filter, + char *page) +{ + return rcf_cmds_show(filter, page, WRITE); +} + +static ssize_t rcf_cmds_store(struct blk_scsi_cmd_filter *filter, + const char *page, size_t count, int rw) +{ + ssize_t ret = 0; + unsigned long okbits[BLK_SCSI_CMD_PER_LONG], *target_okbits; + int cmd, status, len; + substring_t ss; + + memset(&okbits, 0, sizeof(okbits)); + + for (len = strlen(page); len > 0; len -= 3) { + if (len < 2) + break; + ss.from = (char *) page + ret; + ss.to = (char *) page + ret + 2; + ret += 3; + status = match_hex(&ss, &cmd); + /* either of these cases means invalid input, so do nothing. 
*/ + if (status || cmd >= BLK_SCSI_MAX_CMDS) + return -EINVAL; + + __set_bit(cmd, okbits); + } + + if (rw == READ) + target_okbits = filter->read_ok; + else + target_okbits = filter->write_ok; + + memmove(target_okbits, okbits, sizeof(okbits)); + return count; +} + +static ssize_t rcf_readcmds_store(struct blk_scsi_cmd_filter *filter, + const char *page, size_t count) +{ + return rcf_cmds_store(filter, page, count, READ); +} + +static ssize_t rcf_writecmds_store(struct blk_scsi_cmd_filter *filter, + const char *page, size_t count) +{ + return rcf_cmds_store(filter, page, count, WRITE); +} + +struct rcf_sysfs_entry { + struct attribute attr; + ssize_t (*show)(struct blk_scsi_cmd_filter *, char *); + ssize_t (*store)(struct blk_scsi_cmd_filter *, const char *, size_t); +}; + +static struct rcf_sysfs_entry rcf_readcmds_entry = { + .attr = { .name = "read_table", .mode = S_IRUGO | S_IWUSR }, + .show = rcf_readcmds_show, + .store = rcf_readcmds_store, +}; + +static struct rcf_sysfs_entry rcf_writecmds_entry = { + .attr = {.name = "write_table", .mode = S_IRUGO | S_IWUSR }, + .show = rcf_writecmds_show, + .store = rcf_writecmds_store, +}; + +static struct attribute *default_attrs[] = { + &rcf_readcmds_entry.attr, + &rcf_writecmds_entry.attr, + NULL, +}; + +#define to_rcf(atr) container_of((atr), struct rcf_sysfs_entry, attr) + +static ssize_t +rcf_attr_show(struct kobject *kobj, struct attribute *attr, char *page) +{ + struct rcf_sysfs_entry *entry = to_rcf(attr); + struct blk_scsi_cmd_filter *filter; + + filter = container_of(kobj, struct blk_scsi_cmd_filter, kobj); + if (entry->show) + return entry->show(filter, page); + + return 0; +} + +static ssize_t +rcf_attr_store(struct kobject *kobj, struct attribute *attr, + const char *page, size_t length) +{ + struct rcf_sysfs_entry *entry = to_rcf(attr); + struct blk_scsi_cmd_filter *filter; + + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + + if (!entry->store) + return -EINVAL; + + filter = container_of(kobj, struct blk_scsi_cmd_filter, kobj); + return entry->store(filter, page, length); +} + +static struct sysfs_ops rcf_sysfs_ops = { + .show = rcf_attr_show, + .store = rcf_attr_store, +}; + +static struct kobj_type rcf_ktype = { + .sysfs_ops = &rcf_sysfs_ops, + .default_attrs = default_attrs, +}; + +static void rcf_set_defaults(struct blk_scsi_cmd_filter *filter) +{ + /* Basic read-only commands */ + __set_bit(TEST_UNIT_READY, filter->read_ok); + __set_bit(REQUEST_SENSE, filter->read_ok); + __set_bit(READ_6, filter->read_ok); + __set_bit(READ_10, filter->read_ok); + __set_bit(READ_12, filter->read_ok); + __set_bit(READ_16, filter->read_ok); + __set_bit(READ_BUFFER, filter->read_ok); + __set_bit(READ_DEFECT_DATA, filter->read_ok); + __set_bit(READ_LONG, filter->read_ok); + __set_bit(INQUIRY, filter->read_ok); + __set_bit(MODE_SENSE, filter->read_ok); + __set_bit(MODE_SENSE_10, filter->read_ok); + __set_bit(LOG_SENSE, filter->read_ok); + __set_bit(START_STOP, filter->read_ok); + __set_bit(GPCMD_VERIFY_10, filter->read_ok); + __set_bit(VERIFY_16, filter->read_ok); + __set_bit(GPCMD_READ_BUFFER_CAPACITY, filter->read_ok); + + /* Audio CD commands */ + __set_bit(GPCMD_PLAY_CD, filter->read_ok); + __set_bit(GPCMD_PLAY_AUDIO_10, filter->read_ok); + __set_bit(GPCMD_PLAY_AUDIO_MSF, filter->read_ok); + __set_bit(GPCMD_PLAY_AUDIO_TI, filter->read_ok); + __set_bit(GPCMD_PAUSE_RESUME, filter->read_ok); + + /* CD/DVD data reading */ + __set_bit(GPCMD_READ_CD, filter->read_ok); + __set_bit(GPCMD_READ_CD_MSF, filter->read_ok); + __set_bit(GPCMD_READ_DISC_INFO, 
filter->read_ok); + __set_bit(GPCMD_READ_CDVD_CAPACITY, filter->read_ok); + __set_bit(GPCMD_READ_DVD_STRUCTURE, filter->read_ok); + __set_bit(GPCMD_READ_HEADER, filter->read_ok); + __set_bit(GPCMD_READ_TRACK_RZONE_INFO, filter->read_ok); + __set_bit(GPCMD_READ_SUBCHANNEL, filter->read_ok); + __set_bit(GPCMD_READ_TOC_PMA_ATIP, filter->read_ok); + __set_bit(GPCMD_REPORT_KEY, filter->read_ok); + __set_bit(GPCMD_SCAN, filter->read_ok); + __set_bit(GPCMD_GET_CONFIGURATION, filter->read_ok); + __set_bit(GPCMD_READ_FORMAT_CAPACITIES, filter->read_ok); + __set_bit(GPCMD_GET_EVENT_STATUS_NOTIFICATION, filter->read_ok); + __set_bit(GPCMD_GET_PERFORMANCE, filter->read_ok); + __set_bit(GPCMD_SEEK, filter->read_ok); + __set_bit(GPCMD_STOP_PLAY_SCAN, filter->read_ok); + + /* Basic writing commands */ + __set_bit(WRITE_6, filter->write_ok); + __set_bit(WRITE_10, filter->write_ok); + __set_bit(WRITE_VERIFY, filter->write_ok); + __set_bit(WRITE_12, filter->write_ok); + __set_bit(WRITE_VERIFY_12, filter->write_ok); + __set_bit(WRITE_16, filter->write_ok); + __set_bit(WRITE_LONG, filter->write_ok); + __set_bit(WRITE_LONG_2, filter->write_ok); + __set_bit(ERASE, filter->write_ok); + __set_bit(GPCMD_MODE_SELECT_10, filter->write_ok); + __set_bit(MODE_SELECT, filter->write_ok); + __set_bit(LOG_SELECT, filter->write_ok); + __set_bit(GPCMD_BLANK, filter->write_ok); + __set_bit(GPCMD_CLOSE_TRACK, filter->write_ok); + __set_bit(GPCMD_FLUSH_CACHE, filter->write_ok); + __set_bit(GPCMD_FORMAT_UNIT, filter->write_ok); + __set_bit(GPCMD_REPAIR_RZONE_TRACK, filter->write_ok); + __set_bit(GPCMD_RESERVE_RZONE_TRACK, filter->write_ok); + __set_bit(GPCMD_SEND_DVD_STRUCTURE, filter->write_ok); + __set_bit(GPCMD_SEND_EVENT, filter->write_ok); + __set_bit(GPCMD_SEND_KEY, filter->write_ok); + __set_bit(GPCMD_SEND_OPC, filter->write_ok); + __set_bit(GPCMD_SEND_CUE_SHEET, filter->write_ok); + __set_bit(GPCMD_SET_SPEED, filter->write_ok); + __set_bit(GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL, filter->write_ok); + __set_bit(GPCMD_LOAD_UNLOAD, filter->write_ok); + __set_bit(GPCMD_SET_STREAMING, filter->write_ok); +} + +int blk_register_filter(struct gendisk *disk) +{ + int ret; + struct blk_scsi_cmd_filter *filter = &disk->cmd_filter; + struct kobject *parent = kobject_get(disk->holder_dir->parent); + + if (!parent) + return -ENODEV; + + ret = kobject_init_and_add(&filter->kobj, &rcf_ktype, parent, + "%s", "cmd_filter"); + + if (ret < 0) + return ret; + + rcf_set_defaults(filter); + return 0; +} + +void blk_unregister_filter(struct gendisk *disk) +{ + struct blk_scsi_cmd_filter *filter = &disk->cmd_filter; + + kobject_put(&filter->kobj); + kobject_put(disk->holder_dir->parent); +} + diff --git a/block/genhd.c b/block/genhd.c index 43e468ee5993..9074f384b097 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -189,6 +189,7 @@ void add_disk(struct gendisk *disk) disk->minors, NULL, exact_match, exact_lock, disk); register_disk(disk); blk_register_queue(disk); + blk_register_filter(disk); bdi = &disk->queue->backing_dev_info; bdi_register_dev(bdi, MKDEV(disk->major, disk->first_minor)); @@ -200,6 +201,7 @@ EXPORT_SYMBOL(del_gendisk); /* in partitions/check.c */ void unlink_gendisk(struct gendisk *disk) { + blk_unregister_filter(disk); sysfs_remove_link(&disk->dev.kobj, "bdi"); bdi_unregister(&disk->queue->backing_dev_info); blk_unregister_queue(disk); diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 78199c08ec92..c5b9bcfc0a6d 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -105,120 +105,12 @@ static int 
sg_emulated_host(struct request_queue *q, int __user *p) return put_user(1, p); } -#define CMD_READ_SAFE 0x01 -#define CMD_WRITE_SAFE 0x02 -#define CMD_WARNED 0x04 -#define safe_for_read(cmd) [cmd] = CMD_READ_SAFE -#define safe_for_write(cmd) [cmd] = CMD_WRITE_SAFE - -int blk_verify_command(unsigned char *cmd, int has_write_perm) -{ - static unsigned char cmd_type[256] = { - - /* Basic read-only commands */ - safe_for_read(TEST_UNIT_READY), - safe_for_read(REQUEST_SENSE), - safe_for_read(READ_6), - safe_for_read(READ_10), - safe_for_read(READ_12), - safe_for_read(READ_16), - safe_for_read(READ_BUFFER), - safe_for_read(READ_DEFECT_DATA), - safe_for_read(READ_LONG), - safe_for_read(INQUIRY), - safe_for_read(MODE_SENSE), - safe_for_read(MODE_SENSE_10), - safe_for_read(LOG_SENSE), - safe_for_read(START_STOP), - safe_for_read(GPCMD_VERIFY_10), - safe_for_read(VERIFY_16), - - /* Audio CD commands */ - safe_for_read(GPCMD_PLAY_CD), - safe_for_read(GPCMD_PLAY_AUDIO_10), - safe_for_read(GPCMD_PLAY_AUDIO_MSF), - safe_for_read(GPCMD_PLAY_AUDIO_TI), - safe_for_read(GPCMD_PAUSE_RESUME), - - /* CD/DVD data reading */ - safe_for_read(GPCMD_READ_BUFFER_CAPACITY), - safe_for_read(GPCMD_READ_CD), - safe_for_read(GPCMD_READ_CD_MSF), - safe_for_read(GPCMD_READ_DISC_INFO), - safe_for_read(GPCMD_READ_CDVD_CAPACITY), - safe_for_read(GPCMD_READ_DVD_STRUCTURE), - safe_for_read(GPCMD_READ_HEADER), - safe_for_read(GPCMD_READ_TRACK_RZONE_INFO), - safe_for_read(GPCMD_READ_SUBCHANNEL), - safe_for_read(GPCMD_READ_TOC_PMA_ATIP), - safe_for_read(GPCMD_REPORT_KEY), - safe_for_read(GPCMD_SCAN), - safe_for_read(GPCMD_GET_CONFIGURATION), - safe_for_read(GPCMD_READ_FORMAT_CAPACITIES), - safe_for_read(GPCMD_GET_EVENT_STATUS_NOTIFICATION), - safe_for_read(GPCMD_GET_PERFORMANCE), - safe_for_read(GPCMD_SEEK), - safe_for_read(GPCMD_STOP_PLAY_SCAN), - - /* Basic writing commands */ - safe_for_write(WRITE_6), - safe_for_write(WRITE_10), - safe_for_write(WRITE_VERIFY), - safe_for_write(WRITE_12), - safe_for_write(WRITE_VERIFY_12), - safe_for_write(WRITE_16), - safe_for_write(WRITE_LONG), - safe_for_write(WRITE_LONG_2), - safe_for_write(ERASE), - safe_for_write(GPCMD_MODE_SELECT_10), - safe_for_write(MODE_SELECT), - safe_for_write(LOG_SELECT), - safe_for_write(GPCMD_BLANK), - safe_for_write(GPCMD_CLOSE_TRACK), - safe_for_write(GPCMD_FLUSH_CACHE), - safe_for_write(GPCMD_FORMAT_UNIT), - safe_for_write(GPCMD_REPAIR_RZONE_TRACK), - safe_for_write(GPCMD_RESERVE_RZONE_TRACK), - safe_for_write(GPCMD_SEND_DVD_STRUCTURE), - safe_for_write(GPCMD_SEND_EVENT), - safe_for_write(GPCMD_SEND_KEY), - safe_for_write(GPCMD_SEND_OPC), - safe_for_write(GPCMD_SEND_CUE_SHEET), - safe_for_write(GPCMD_SET_SPEED), - safe_for_write(GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL), - safe_for_write(GPCMD_LOAD_UNLOAD), - safe_for_write(GPCMD_SET_STREAMING), - }; - unsigned char type = cmd_type[cmd[0]]; - - /* Anybody who can open the device can do a read-safe command */ - if (type & CMD_READ_SAFE) - return 0; - - /* Write-safe commands just require a writable open.. */ - if ((type & CMD_WRITE_SAFE) && has_write_perm) - return 0; - - /* And root can do any command.. 
*/ - if (capable(CAP_SYS_RAWIO)) - return 0; - - if (!type) { - cmd_type[cmd[0]] = CMD_WARNED; - printk(KERN_WARNING "scsi: unknown opcode 0x%02x\n", cmd[0]); - } - - /* Otherwise fail it with an "Operation not permitted" */ - return -EPERM; -} -EXPORT_SYMBOL_GPL(blk_verify_command); - static int blk_fill_sghdr_rq(struct request_queue *q, struct request *rq, - struct sg_io_hdr *hdr, int has_write_perm) + struct sg_io_hdr *hdr, struct file *file) { if (copy_from_user(rq->cmd, hdr->cmdp, hdr->cmd_len)) return -EFAULT; - if (blk_verify_command(rq->cmd, has_write_perm)) + if (blk_verify_command(file, rq->cmd)) return -EPERM; /* @@ -287,7 +179,7 @@ static int sg_io(struct file *file, struct request_queue *q, struct gendisk *bd_disk, struct sg_io_hdr *hdr) { unsigned long start_time; - int writing = 0, ret = 0, has_write_perm = 0; + int writing = 0, ret = 0; struct request *rq; char sense[SCSI_SENSE_BUFFERSIZE]; struct bio *bio; @@ -316,10 +208,7 @@ static int sg_io(struct file *file, struct request_queue *q, if (!rq) return -ENOMEM; - if (file) - has_write_perm = file->f_mode & FMODE_WRITE; - - if (blk_fill_sghdr_rq(q, rq, hdr, has_write_perm)) { + if (blk_fill_sghdr_rq(q, rq, hdr, file)) { blk_put_request(rq); return -EFAULT; } @@ -451,7 +340,7 @@ int sg_scsi_ioctl(struct file *file, struct request_queue *q, if (in_len && copy_from_user(buffer, sic->data + cmdlen, in_len)) goto error; - err = blk_verify_command(rq->cmd, file->f_mode & FMODE_WRITE); + err = blk_verify_command(file, rq->cmd); if (err) goto error; diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index ea0edd1b2e76..f7abccaffaec 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -182,8 +182,9 @@ static int sg_build_sgat(Sg_scatter_hold * schp, const Sg_fd * sfp, int tablesize); static ssize_t sg_new_read(Sg_fd * sfp, char __user *buf, size_t count, Sg_request * srp); -static ssize_t sg_new_write(Sg_fd * sfp, const char __user *buf, size_t count, - int blocking, int read_only, Sg_request ** o_srp); +static ssize_t sg_new_write(Sg_fd *sfp, struct file *file, + const char __user *buf, size_t count, int blocking, + int read_only, Sg_request **o_srp); static int sg_common_write(Sg_fd * sfp, Sg_request * srp, unsigned char *cmnd, int timeout, int blocking); static int sg_u_iovec(sg_io_hdr_t * hp, int sg_num, int ind, @@ -204,7 +205,6 @@ static Sg_request *sg_get_rq_mark(Sg_fd * sfp, int pack_id); static Sg_request *sg_add_request(Sg_fd * sfp); static int sg_remove_request(Sg_fd * sfp, Sg_request * srp); static int sg_res_in_use(Sg_fd * sfp); -static int sg_allow_access(unsigned char opcode, char dev_type); static int sg_build_direct(Sg_request * srp, Sg_fd * sfp, int dxfer_len); static Sg_device *sg_get_dev(int dev); #ifdef CONFIG_SCSI_PROC_FS @@ -544,7 +544,7 @@ sg_write(struct file *filp, const char __user *buf, size_t count, loff_t * ppos) return -EFAULT; blocking = !(filp->f_flags & O_NONBLOCK); if (old_hdr.reply_len < 0) - return sg_new_write(sfp, buf, count, blocking, 0, NULL); + return sg_new_write(sfp, filp, buf, count, blocking, 0, NULL); if (count < (SZ_SG_HEADER + 6)) return -EIO; /* The minimum scsi command length is 6 bytes. 
*/ @@ -621,8 +621,9 @@ sg_write(struct file *filp, const char __user *buf, size_t count, loff_t * ppos) } static ssize_t -sg_new_write(Sg_fd * sfp, const char __user *buf, size_t count, - int blocking, int read_only, Sg_request ** o_srp) +sg_new_write(Sg_fd *sfp, struct file *file, const char __user *buf, + size_t count, int blocking, int read_only, + Sg_request **o_srp) { int k; Sg_request *srp; @@ -678,8 +679,7 @@ sg_new_write(Sg_fd * sfp, const char __user *buf, size_t count, sg_remove_request(sfp, srp); return -EFAULT; } - if (read_only && - (!sg_allow_access(cmnd[0], sfp->parentdp->device->type))) { + if (read_only && (!blk_verify_command(file, cmnd))) { sg_remove_request(sfp, srp); return -EPERM; } @@ -799,7 +799,7 @@ sg_ioctl(struct inode *inode, struct file *filp, if (!access_ok(VERIFY_WRITE, p, SZ_SG_IO_HDR)) return -EFAULT; result = - sg_new_write(sfp, p, SZ_SG_IO_HDR, + sg_new_write(sfp, filp, p, SZ_SG_IO_HDR, blocking, read_only, &srp); if (result < 0) return result; @@ -1048,7 +1048,7 @@ sg_ioctl(struct inode *inode, struct file *filp, if (copy_from_user(&opcode, siocp->data, 1)) return -EFAULT; - if (!sg_allow_access(opcode, sdp->device->type)) + if (!blk_verify_command(filp, &opcode)) return -EPERM; } return sg_scsi_ioctl(filp, sdp->device->request_queue, NULL, p); @@ -2506,26 +2506,6 @@ sg_page_free(struct page *page, int size) #define MAINTENANCE_IN_CMD 0xa3 #endif -static unsigned char allow_ops[] = { TEST_UNIT_READY, REQUEST_SENSE, - INQUIRY, READ_CAPACITY, READ_BUFFER, READ_6, READ_10, READ_12, - READ_16, MODE_SENSE, MODE_SENSE_10, LOG_SENSE, REPORT_LUNS, - SERVICE_ACTION_IN, RECEIVE_DIAGNOSTIC, READ_LONG, MAINTENANCE_IN_CMD -}; - -static int -sg_allow_access(unsigned char opcode, char dev_type) -{ - int k; - - if (TYPE_SCANNER == dev_type) /* TYPE_ROM maybe burner */ - return 1; - for (k = 0; k < sizeof (allow_ops); ++k) { - if (opcode == allow_ops[k]) - return 1; - } - return 0; -} - #ifdef CONFIG_SCSI_PROC_FS static int sg_idr_max_id(int id, void *p, void *data) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d3ae9ad97213..a842b776d099 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -671,7 +671,6 @@ extern int blk_execute_rq(struct request_queue *, struct gendisk *, struct request *, int); extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, struct request *, int, rq_end_io_fn *); -extern int blk_verify_command(unsigned char *, int); extern void blk_unplug(struct request_queue *q); static inline struct request_queue *bdev_get_queue(struct block_device *bdev) @@ -797,6 +796,15 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt, extern int blkdev_issue_flush(struct block_device *, sector_t *); +/* +* command filter functions +*/ +extern int blk_verify_command(struct file *file, unsigned char *cmd); +extern int blk_cmd_filter_verify_command(struct blk_scsi_cmd_filter *filter, + unsigned char *cmd, mode_t *f_mode); +extern int blk_register_filter(struct gendisk *disk); +extern void blk_unregister_filter(struct gendisk *disk); + #define MAX_PHYS_SEGMENTS 128 #define MAX_HW_SEGMENTS 128 #define SAFE_MAX_SECTORS 255 diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 524ec96f5a23..e8787417f65a 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -110,6 +110,14 @@ struct hd_struct { #define GENHD_FL_SUPPRESS_PARTITION_INFO 32 #define GENHD_FL_FAIL 64 +#define BLK_SCSI_MAX_CMDS (256) +#define BLK_SCSI_CMD_PER_LONG (BLK_SCSI_MAX_CMDS / (sizeof(long) * 8)) + 
+struct blk_scsi_cmd_filter { + unsigned long read_ok[BLK_SCSI_CMD_PER_LONG]; + unsigned long write_ok[BLK_SCSI_CMD_PER_LONG]; + struct kobject kobj; +}; struct gendisk { int major; /* major number of driver */ @@ -120,6 +128,7 @@ struct gendisk { struct hd_struct **part; /* [indexed by minor] */ struct block_device_operations *fops; struct request_queue *queue; + struct blk_scsi_cmd_filter cmd_filter; void *private_data; sector_t capacity; -- cgit v1.2.3 From b24498d477a14680fc3bb3ad884fa9fa76a2d237 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 27 Jun 2008 09:12:09 +0200 Subject: block: integrity flags can't use bit ops on unsigned short Just use normal open coded bit operations instead, they need not be atomic. Signed-off-by: Jens Axboe --- block/blk-integrity.c | 17 +++++++---------- include/linux/blkdev.h | 8 ++++---- 2 files changed, 11 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 4ffa3814f6a9..3f1a8478cc38 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -217,17 +217,16 @@ static ssize_t integrity_read_store(struct blk_integrity *bi, unsigned long val = simple_strtoul(p, &p, 10); if (val) - set_bit(INTEGRITY_FLAG_READ, &bi->flags); + bi->flags |= INTEGRITY_FLAG_READ; else - clear_bit(INTEGRITY_FLAG_READ, &bi->flags); + bi->flags &= ~INTEGRITY_FLAG_READ; return count; } static ssize_t integrity_read_show(struct blk_integrity *bi, char *page) { - return sprintf(page, "%d\n", - test_bit(INTEGRITY_FLAG_READ, &bi->flags) ? 1 : 0); + return sprintf(page, "%d\n", (bi->flags & INTEGRITY_FLAG_READ) != 0); } static ssize_t integrity_write_store(struct blk_integrity *bi, @@ -237,17 +236,16 @@ static ssize_t integrity_write_store(struct blk_integrity *bi, unsigned long val = simple_strtoul(p, &p, 10); if (val) - set_bit(INTEGRITY_FLAG_WRITE, &bi->flags); + bi->flags |= INTEGRITY_FLAG_WRITE; else - clear_bit(INTEGRITY_FLAG_WRITE, &bi->flags); + bi->flags &= ~INTEGRITY_FLAG_WRITE; return count; } static ssize_t integrity_write_show(struct blk_integrity *bi, char *page) { - return sprintf(page, "%d\n", - test_bit(INTEGRITY_FLAG_WRITE, &bi->flags) ? 
1 : 0); + return sprintf(page, "%d\n", (bi->flags & INTEGRITY_FLAG_WRITE) != 0); } static struct integrity_sysfs_entry integrity_format_entry = { @@ -340,8 +338,7 @@ int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) kobject_uevent(&bi->kobj, KOBJ_ADD); - set_bit(INTEGRITY_FLAG_READ, &bi->flags); - set_bit(INTEGRITY_FLAG_WRITE, &bi->flags); + bi->flags |= INTEGRITY_FLAG_READ | INTEGRITY_FLAG_WRITE; bi->sector_size = disk->queue->hardsect_size; disk->integrity = bi; } else diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index a842b776d099..7ab8acad5b6e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -870,8 +870,8 @@ void kblockd_flush_work(struct work_struct *work); #if defined(CONFIG_BLK_DEV_INTEGRITY) -#define INTEGRITY_FLAG_READ 1 /* verify data integrity on read */ -#define INTEGRITY_FLAG_WRITE 2 /* generate data integrity on write */ +#define INTEGRITY_FLAG_READ 2 /* verify data integrity on read */ +#define INTEGRITY_FLAG_WRITE 4 /* generate data integrity on write */ struct blk_integrity_exchg { void *prot_buf; @@ -940,11 +940,11 @@ static inline int bdev_integrity_enabled(struct block_device *bdev, int rw) return 0; if (rw == READ && bi->verify_fn != NULL && - test_bit(INTEGRITY_FLAG_READ, &bi->flags)) + (bi->flags & INTEGRITY_FLAG_READ)) return 1; if (rw == WRITE && bi->generate_fn != NULL && - test_bit(INTEGRITY_FLAG_WRITE, &bi->flags)) + (bi->flags & INTEGRITY_FLAG_WRITE)) return 1; return 0; -- cgit v1.2.3 From cc371e66e340f35eed8dc4651c7c18e754c7fb26 Mon Sep 17 00:00:00 2001 From: Alasdair G Kergon Date: Thu, 3 Jul 2008 09:53:43 +0200 Subject: Add bvec_merge_data to handle stacked devices and ->merge_bvec() When devices are stacked, one device's merge_bvec_fn may need to perform the mapping and then call one or more functions for its underlying devices. The following bio fields are used: bio->bi_sector bio->bi_bdev bio->bi_size bio->bi_rw using bio_data_dir() This patch creates a new struct bvec_merge_data holding a copy of those fields to avoid having to change them directly in the struct bio when going down the stack only to have to change them back again on the way back up. (And then when the bio gets mapped for real, the whole exercise gets repeated, but that's a problem for another day...) 
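As a hedged sketch of the calling pattern this enables (struct my_dev, lower_bdev and my_map_sector() are hypothetical), a stacking driver can now remap a cheap on-stack copy instead of temporarily rewriting the bio fields:

    static int stacked_mergeable_bvec(struct request_queue *q,
    				  struct bvec_merge_data *bvm,
    				  struct bio_vec *biovec)
    {
    	struct my_dev *dev = q->queuedata;
    	struct bvec_merge_data child = *bvm;	/* bio itself untouched */
    	struct request_queue *child_q;

    	/* perform the mapping onto the underlying device */
    	child.bi_bdev = dev->lower_bdev;
    	child.bi_sector = my_map_sector(dev, bvm->bi_sector);

    	child_q = bdev_get_queue(child.bi_bdev);
    	if (child_q->merge_bvec_fn)
    		return child_q->merge_bvec_fn(child_q, &child, biovec);

    	return biovec->bv_len;	/* no constraint from below */
    }
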
Signed-off-by: Alasdair G Kergon Cc: Neil Brown Cc: Milan Broz Signed-off-by: Jens Axboe --- drivers/block/pktcdvd.c | 9 +++++---- drivers/md/linear.c | 10 ++++++---- drivers/md/raid0.c | 10 ++++++---- drivers/md/raid10.c | 15 ++++++++------- drivers/md/raid5.c | 10 ++++++---- fs/bio.c | 26 +++++++++++++++++++++----- include/linux/blkdev.h | 9 ++++++++- 7 files changed, 60 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 3ba1df93e9e3..589850cff359 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2633,11 +2633,12 @@ end_io: -static int pkt_merge_bvec(struct request_queue *q, struct bio *bio, struct bio_vec *bvec) +static int pkt_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd, + struct bio_vec *bvec) { struct pktcdvd_device *pd = q->queuedata; - sector_t zone = ZONE(bio->bi_sector, pd); - int used = ((bio->bi_sector - zone) << 9) + bio->bi_size; + sector_t zone = ZONE(bmd->bi_sector, pd); + int used = ((bmd->bi_sector - zone) << 9) + bmd->bi_size; int remaining = (pd->settings.size << 9) - used; int remaining2; @@ -2645,7 +2646,7 @@ static int pkt_merge_bvec(struct request_queue *q, struct bio *bio, struct bio_v * A bio <= PAGE_SIZE must be allowed. If it crosses a packet * boundary, pkt_make_request() will split the bio. */ - remaining2 = PAGE_SIZE - bio->bi_size; + remaining2 = PAGE_SIZE - bmd->bi_size; remaining = max(remaining, remaining2); BUG_ON(remaining < 0); diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 10748240cb2f..6a866d7c8ae5 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -50,17 +50,19 @@ static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector) /** * linear_mergeable_bvec -- tell bio layer if two requests can be merged * @q: request queue - * @bio: the buffer head that's been built up so far + * @bvm: properties of new bio * @biovec: the request that could be merged to it. * * Return amount of bytes we can take at this offset */ -static int linear_mergeable_bvec(struct request_queue *q, struct bio *bio, struct bio_vec *biovec) +static int linear_mergeable_bvec(struct request_queue *q, + struct bvec_merge_data *bvm, + struct bio_vec *biovec) { mddev_t *mddev = q->queuedata; dev_info_t *dev0; - unsigned long maxsectors, bio_sectors = bio->bi_size >> 9; - sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); + unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9; + sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); dev0 = which_dev(mddev, sector); maxsectors = (dev0->size << 1) - (sector - (dev0->offset<<1)); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 914c04ddec7c..bcbb82594a19 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -241,18 +241,20 @@ static int create_strip_zones (mddev_t *mddev) /** * raid0_mergeable_bvec -- tell bio layer if a two requests can be merged * @q: request queue - * @bio: the buffer head that's been built up so far + * @bvm: properties of new bio * @biovec: the request that could be merged to it. 
* * Return amount of bytes we can accept at this offset */ -static int raid0_mergeable_bvec(struct request_queue *q, struct bio *bio, struct bio_vec *biovec) +static int raid0_mergeable_bvec(struct request_queue *q, + struct bvec_merge_data *bvm, + struct bio_vec *biovec) { mddev_t *mddev = q->queuedata; - sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); + sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max; unsigned int chunk_sectors = mddev->chunk_size >> 9; - unsigned int bio_sectors = bio->bi_size >> 9; + unsigned int bio_sectors = bvm->bi_size >> 9; max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; if (max < 0) max = 0; /* bio_add cannot handle a negative return */ diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index a71277b640ab..22bb2b1b886d 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -439,26 +439,27 @@ static sector_t raid10_find_virt(conf_t *conf, sector_t sector, int dev) /** * raid10_mergeable_bvec -- tell bio layer if a two requests can be merged * @q: request queue - * @bio: the buffer head that's been built up so far + * @bvm: properties of new bio * @biovec: the request that could be merged to it. * * Return amount of bytes we can accept at this offset * If near_copies == raid_disk, there are no striping issues, * but in that case, the function isn't called at all. */ -static int raid10_mergeable_bvec(struct request_queue *q, struct bio *bio, - struct bio_vec *bio_vec) +static int raid10_mergeable_bvec(struct request_queue *q, + struct bvec_merge_data *bvm, + struct bio_vec *biovec) { mddev_t *mddev = q->queuedata; - sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); + sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max; unsigned int chunk_sectors = mddev->chunk_size >> 9; - unsigned int bio_sectors = bio->bi_size >> 9; + unsigned int bio_sectors = bvm->bi_size >> 9; max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; if (max < 0) max = 0; /* bio_add cannot handle a negative return */ - if (max <= bio_vec->bv_len && bio_sectors == 0) - return bio_vec->bv_len; + if (max <= biovec->bv_len && bio_sectors == 0) + return biovec->bv_len; else return max; } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 54c8ee28fcc4..9b00675dc64f 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3319,15 +3319,17 @@ static int raid5_congested(void *data, int bits) /* We want read requests to align with chunks where possible, * but write requests don't need to. 
*/ -static int raid5_mergeable_bvec(struct request_queue *q, struct bio *bio, struct bio_vec *biovec) +static int raid5_mergeable_bvec(struct request_queue *q, + struct bvec_merge_data *bvm, + struct bio_vec *biovec) { mddev_t *mddev = q->queuedata; - sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); + sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max; unsigned int chunk_sectors = mddev->chunk_size >> 9; - unsigned int bio_sectors = bio->bi_size >> 9; + unsigned int bio_sectors = bvm->bi_size >> 9; - if (bio_data_dir(bio) == WRITE) + if ((bvm->bi_rw & 1) == WRITE) return biovec->bv_len; /* always allow writes to be mergeable */ max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; diff --git a/fs/bio.c b/fs/bio.c index 7761c84c7032..88322b066acb 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -325,10 +325,19 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page if (page == prev->bv_page && offset == prev->bv_offset + prev->bv_len) { prev->bv_len += len; - if (q->merge_bvec_fn && - q->merge_bvec_fn(q, bio, prev) < len) { - prev->bv_len -= len; - return 0; + + if (q->merge_bvec_fn) { + struct bvec_merge_data bvm = { + .bi_bdev = bio->bi_bdev, + .bi_sector = bio->bi_sector, + .bi_size = bio->bi_size, + .bi_rw = bio->bi_rw, + }; + + if (q->merge_bvec_fn(q, &bvm, prev) < len) { + prev->bv_len -= len; + return 0; + } } goto done; @@ -369,11 +378,18 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page * queue to get further control */ if (q->merge_bvec_fn) { + struct bvec_merge_data bvm = { + .bi_bdev = bio->bi_bdev, + .bi_sector = bio->bi_sector, + .bi_size = bio->bi_size, + .bi_rw = bio->bi_rw, + }; + /* * merge_bvec_fn() returns number of bytes it can accept * at this offset */ - if (q->merge_bvec_fn(q, bio, bvec) < len) { + if (q->merge_bvec_fn(q, &bvm, bvec) < len) { bvec->bv_page = NULL; bvec->bv_len = 0; bvec->bv_offset = 0; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7ab8acad5b6e..ff9d0bdf2a16 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -254,7 +254,14 @@ typedef int (prep_rq_fn) (struct request_queue *, struct request *); typedef void (unplug_fn) (struct request_queue *); struct bio_vec; -typedef int (merge_bvec_fn) (struct request_queue *, struct bio *, struct bio_vec *); +struct bvec_merge_data { + struct block_device *bi_bdev; + sector_t bi_sector; + unsigned bi_size; + unsigned long bi_rw; +}; +typedef int (merge_bvec_fn) (struct request_queue *, struct bvec_merge_data *, + struct bio_vec *); typedef void (prepare_flush_fn) (struct request_queue *, struct request *); typedef void (softirq_done_fn)(struct request *); typedef int (dma_drain_needed_fn)(struct request *); -- cgit v1.2.3 From e48ec69005f02b70b7ecfde1bc39a599086d16ef Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 3 Jul 2008 13:18:54 +0200 Subject: block: extend queue_flag bitops Add test_and_clear and test_and_set. 
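A minimal before/after illustration of the pattern this enables, mirroring the blk_plug_device() conversion in the diff below:

    /* before: test and set as two separate steps under the queue lock */
    if (!test_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) {
    	__set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags);
    	mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
    }

    /* after: one helper that returns the previous state */
    if (!queue_flag_test_and_set(QUEUE_FLAG_PLUGGED, q))
    	mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
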
Signed-off-by: Jens Axboe --- block/blk-core.c | 12 ++++-------- include/linux/blkdev.h | 26 ++++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index e0fb0bcc0c17..dbc7f42b5d2b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -205,8 +205,7 @@ void blk_plug_device(struct request_queue *q) if (blk_queue_stopped(q)) return; - if (!test_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) { - __set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags); + if (!queue_flag_test_and_set(QUEUE_FLAG_PLUGGED, q)) { mod_timer(&q->unplug_timer, jiffies + q->unplug_delay); blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG); } @@ -221,10 +220,9 @@ int blk_remove_plug(struct request_queue *q) { WARN_ON(!irqs_disabled()); - if (!test_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) + if (!queue_flag_test_and_clear(QUEUE_FLAG_PLUGGED, q)) return 0; - queue_flag_clear(QUEUE_FLAG_PLUGGED, q); del_timer(&q->unplug_timer); return 1; } @@ -328,8 +326,7 @@ void blk_start_queue(struct request_queue *q) * one level of recursion is ok and is much faster than kicking * the unplug handling */ - if (!test_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) { - queue_flag_set(QUEUE_FLAG_REENTER, q); + if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) { q->request_fn(q); queue_flag_clear(QUEUE_FLAG_REENTER, q); } else { @@ -394,8 +391,7 @@ void __blk_run_queue(struct request_queue *q) * handling reinvoke the handler shortly if we already got there. */ if (!elv_queue_empty(q)) { - if (!test_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) { - queue_flag_set(QUEUE_FLAG_REENTER, q); + if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) { q->request_fn(q); queue_flag_clear(QUEUE_FLAG_REENTER, q); } else { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ff9d0bdf2a16..e04c4ac8a7cf 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -428,6 +428,32 @@ static inline void queue_flag_set_unlocked(unsigned int flag, __set_bit(flag, &q->queue_flags); } +static inline int queue_flag_test_and_clear(unsigned int flag, + struct request_queue *q) +{ + WARN_ON_ONCE(!queue_is_locked(q)); + + if (test_bit(flag, &q->queue_flags)) { + __clear_bit(flag, &q->queue_flags); + return 1; + } + + return 0; +} + +static inline int queue_flag_test_and_set(unsigned int flag, + struct request_queue *q) +{ + WARN_ON_ONCE(!queue_is_locked(q)); + + if (!test_bit(flag, &q->queue_flags)) { + __set_bit(flag, &q->queue_flags); + return 0; + } + + return 1; +} + static inline void queue_flag_set(unsigned int flag, struct request_queue *q) { WARN_ON_ONCE(!queue_is_locked(q)); -- cgit v1.2.3 From 42796d37da6ef4fd851dc6d5d0387baf7e2b0c3c Mon Sep 17 00:00:00 2001 From: eric miao Date: Mon, 14 Apr 2008 09:35:08 +0100 Subject: [ARM] pxa: add generic PWM backlight driver Patch mostly from Eric Miao, with minor edits by rmk to convert Eric's driver to a generic PWM-based backlight driver. 
Signed-off-by: eric miao Signed-off-by: Russell King --- drivers/video/backlight/Kconfig | 7 ++ drivers/video/backlight/Makefile | 1 + drivers/video/backlight/pwm_bl.c | 160 +++++++++++++++++++++++++++++++++++++++ include/linux/pwm_backlight.h | 14 ++++ 4 files changed, 182 insertions(+) create mode 100644 drivers/video/backlight/pwm_bl.c create mode 100644 include/linux/pwm_backlight.h (limited to 'include/linux') diff --git a/drivers/video/backlight/Kconfig b/drivers/video/backlight/Kconfig index dcd8073c2369..30bf7f2f1635 100644 --- a/drivers/video/backlight/Kconfig +++ b/drivers/video/backlight/Kconfig @@ -112,3 +112,10 @@ config BACKLIGHT_CARILLO_RANCH help If you have a Intel LE80578 (Carillo Ranch) say Y to enable the backlight driver. + +config BACKLIGHT_PWM + tristate "Generic PWM based Backlight Driver" + depends on BACKLIGHT_CLASS_DEVICE && HAVE_PWM + help + If you have a LCD backlight adjustable by PWM, say Y to enable + this driver. diff --git a/drivers/video/backlight/Makefile b/drivers/video/backlight/Makefile index 33f6c7cecc73..b51a7cd12500 100644 --- a/drivers/video/backlight/Makefile +++ b/drivers/video/backlight/Makefile @@ -10,3 +10,4 @@ obj-$(CONFIG_BACKLIGHT_LOCOMO) += locomolcd.o obj-$(CONFIG_BACKLIGHT_OMAP1) += omap1_bl.o obj-$(CONFIG_BACKLIGHT_PROGEAR) += progear_bl.o obj-$(CONFIG_BACKLIGHT_CARILLO_RANCH) += cr_bllcd.o +obj-$(CONFIG_BACKLIGHT_PWM) += pwm_bl.o diff --git a/drivers/video/backlight/pwm_bl.c b/drivers/video/backlight/pwm_bl.c new file mode 100644 index 000000000000..9637f5e08cb6 --- /dev/null +++ b/drivers/video/backlight/pwm_bl.c @@ -0,0 +1,160 @@ +/* + * linux/drivers/video/backlight/pwm_bl.c + * + * simple PWM based backlight control, board code has to setup + * 1) pin configuration so PWM waveforms can output + * 2) platform_data casts to the PWM id (0/1/2/3 on PXA) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/platform_device.h> +#include <linux/fb.h> +#include <linux/backlight.h> +#include <linux/err.h> +#include <linux/pwm.h> +#include <linux/pwm_backlight.h> + +struct pwm_bl_data { + struct pwm_device *pwm; + unsigned int period; +}; + +static int pwm_backlight_update_status(struct backlight_device *bl) +{ + struct pwm_bl_data *pb = dev_get_drvdata(&bl->dev); + int brightness = bl->props.brightness; + int max = bl->props.max_brightness; + + if (bl->props.power != FB_BLANK_UNBLANK) + brightness = 0; + + if (bl->props.fb_blank != FB_BLANK_UNBLANK) + brightness = 0; + + if (brightness == 0) { + pwm_config(pb->pwm, 0, pb->period); + pwm_disable(pb->pwm); + } else { + pwm_config(pb->pwm, brightness * pb->period / max, pb->period); + pwm_enable(pb->pwm); + } + return 0; +} + +static int pwm_backlight_get_brightness(struct backlight_device *bl) +{ + return bl->props.brightness; +} + +static struct backlight_ops pwm_backlight_ops = { + .update_status = pwm_backlight_update_status, + .get_brightness = pwm_backlight_get_brightness, +}; + +static int pwm_backlight_probe(struct platform_device *pdev) +{ + struct platform_pwm_backlight_data *data = pdev->dev.platform_data; + struct backlight_device *bl; + struct pwm_bl_data *pb; + + if (!data) + return -EINVAL; + + pb = kzalloc(sizeof(*pb), GFP_KERNEL); + if (!pb) + return -ENOMEM; + + pb->period = data->pwm_period_ns; + + pb->pwm = pwm_request(data->pwm_id, "backlight"); + if (pb->pwm == NULL) { + dev_err(&pdev->dev, "unable to request PWM for backlight\n"); + kfree(pb); + return -EBUSY; + } + + bl = backlight_device_register(pdev->name, &pdev->dev, + pb, &pwm_backlight_ops); + if (IS_ERR(bl)) { + dev_err(&pdev->dev, "failed to register backlight\n"); + pwm_free(pb->pwm); + kfree(pb); + return PTR_ERR(bl); + } + + bl->props.max_brightness = data->max_brightness; + bl->props.brightness = data->dft_brightness; + backlight_update_status(bl); + + platform_set_drvdata(pdev, bl); + return 0; +} + +static int pwm_backlight_remove(struct platform_device *pdev) +{ + struct backlight_device *bl = platform_get_drvdata(pdev); + struct pwm_bl_data *pb = dev_get_drvdata(&bl->dev); + + backlight_device_unregister(bl); + pwm_config(pb->pwm, 0, pb->period); + pwm_disable(pb->pwm); + pwm_free(pb->pwm); + kfree(pb); + return 0; +} + +#ifdef CONFIG_PM +static int pwm_backlight_suspend(struct platform_device *pdev, + pm_message_t state) +{ + struct backlight_device *bl = platform_get_drvdata(pdev); + struct pwm_bl_data *pb = dev_get_drvdata(&bl->dev); + + pwm_config(pb->pwm, 0, pb->period); + pwm_disable(pb->pwm); + return 0; +} + +static int pwm_backlight_resume(struct platform_device *pdev) +{ + struct backlight_device *bl = platform_get_drvdata(pdev); + + backlight_update_status(bl); + return 0; +} +#else +#define pwm_backlight_suspend NULL +#define pwm_backlight_resume NULL +#endif + +static struct platform_driver pwm_backlight_driver = { + .driver = { + .name = "pwm-backlight", + .owner = THIS_MODULE, + }, + .probe = pwm_backlight_probe, + .remove = pwm_backlight_remove, + .suspend = pwm_backlight_suspend, + .resume = pwm_backlight_resume, +}; + +static int __init pwm_backlight_init(void) +{ + return platform_driver_register(&pwm_backlight_driver); +} +module_init(pwm_backlight_init); + +static void __exit pwm_backlight_exit(void) +{ + platform_driver_unregister(&pwm_backlight_driver); +} +module_exit(pwm_backlight_exit); + +MODULE_DESCRIPTION("PWM based Backlight Driver") +MODULE_LICENSE("GPL"); diff --git a/include/linux/pwm_backlight.h b/include/linux/pwm_backlight.h new file mode 100644 index 000000000000..aeeffedbe822
--- /dev/null +++ b/include/linux/pwm_backlight.h @@ -0,0 +1,14 @@ +/* + * Generic PWM backlight driver data - see drivers/video/backlight/pwm_bl.c + */ +#ifndef __LINUX_PWM_BACKLIGHT_H +#define __LINUX_PWM_BACKLIGHT_H + +struct platform_pwm_backlight_data { + int pwm_id; + unsigned int max_brightness; + unsigned int dft_brightness; + unsigned int pwm_period_ns; +}; + +#endif -- cgit v1.2.3 From 3b73125af69f93972625f4b655675f42ca4274eb Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Thu, 22 May 2008 14:18:40 +0100 Subject: [ARM] 5044/1: pwm_bl: add init/notify/exit callbacks This allows platform code to manipulate GPIOs and brightness level as needed. Signed-off-by: Philipp Zabel Signed-off-by: Russell King --- drivers/video/backlight/pwm_bl.c | 39 ++++++++++++++++++++++++++++++++------- include/linux/pwm_backlight.h | 3 +++ 2 files changed, 35 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/video/backlight/pwm_bl.c b/drivers/video/backlight/pwm_bl.c index 9637f5e08cb6..8346dfc01cf6 100644 --- a/drivers/video/backlight/pwm_bl.c +++ b/drivers/video/backlight/pwm_bl.c @@ -23,6 +23,7 @@ struct pwm_bl_data { struct pwm_device *pwm; unsigned int period; + int (*notify)(int brightness); }; static int pwm_backlight_update_status(struct backlight_device *bl) @@ -37,6 +38,9 @@ static int pwm_backlight_update_status(struct backlight_device *bl) if (bl->props.fb_blank != FB_BLANK_UNBLANK) brightness = 0; + if (pb->notify) + brightness = pb->notify(brightness); + if (brightness == 0) { pwm_config(pb->pwm, 0, pb->period); pwm_disable(pb->pwm); @@ -62,30 +66,39 @@ static int pwm_backlight_probe(struct platform_device *pdev) struct platform_pwm_backlight_data *data = pdev->dev.platform_data; struct backlight_device *bl; struct pwm_bl_data *pb; + int ret; if (!data) return -EINVAL; + if (data->init) { + ret = data->init(&pdev->dev); + if (ret < 0) + return ret; + } + pb = kzalloc(sizeof(*pb), GFP_KERNEL); - if (!pb) - return -ENOMEM; + if (!pb) { + ret = -ENOMEM; + goto err_alloc; + } pb->period = data->pwm_period_ns; + pb->notify = data->notify; pb->pwm = pwm_request(data->pwm_id, "backlight"); if (pb->pwm == NULL) { dev_err(&pdev->dev, "unable to request PWM for backlight\n"); - kfree(pb); - return -EBUSY; + ret = -EBUSY; + goto err_pwm; } bl = backlight_device_register(pdev->name, &pdev->dev, pb, &pwm_backlight_ops); if (IS_ERR(bl)) { dev_err(&pdev->dev, "failed to register backlight\n"); - pwm_free(pb->pwm); - kfree(pb); - return PTR_ERR(bl); + ret = PTR_ERR(bl); + goto err_bl; } bl->props.max_brightness = data->max_brightness; @@ -94,10 +107,20 @@ static int pwm_backlight_probe(struct platform_device *pdev) platform_set_drvdata(pdev, bl); return 0; + +err_bl: + pwm_free(pb->pwm); +err_pwm: + kfree(pb); +err_alloc: + if (data->exit) + data->exit(&pdev->dev); + return ret; } static int pwm_backlight_remove(struct platform_device *pdev) { + struct platform_pwm_backlight_data *data = pdev->dev.platform_data; struct backlight_device *bl = platform_get_drvdata(pdev); struct pwm_bl_data *pb = dev_get_drvdata(&bl->dev); @@ -106,6 +129,8 @@ static int pwm_backlight_remove(struct platform_device *pdev) pwm_disable(pb->pwm); pwm_free(pb->pwm); kfree(pb); + if (data->exit) + data->exit(&pdev->dev); return 0; } diff --git a/include/linux/pwm_backlight.h b/include/linux/pwm_backlight.h index aeeffedbe822..7a9754c96775 100644 --- a/include/linux/pwm_backlight.h +++ b/include/linux/pwm_backlight.h @@ -9,6 +9,9 @@ struct platform_pwm_backlight_data { unsigned int max_brightness; 
unsigned int dft_brightness; unsigned int pwm_period_ns; + int (*init)(struct device *dev); + int (*notify)(int brightness); + void (*exit)(struct device *dev); }; #endif -- cgit v1.2.3 From 27f8221af406e43b529a5425bc99c9b1e9bdf521 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 4 Jul 2008 09:30:03 +0200 Subject: block: add blk_queue_update_dma_pad This adds blk_queue_update_dma_pad to prevent LLDs from overwriting the dma pad mask wrongly (we added blk_queue_update_dma_alignment due to the same reason). This also converts libata to use blk_queue_update_dma_pad instead of blk_queue_dma_pad. Signed-off-by: FUJITA Tomonori Cc: Tejun Heo Cc: Bartlomiej Zolnierkiewicz Cc: Thomas Bogendoerfer Cc: James Bottomley Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- block/blk-settings.c | 24 ++++++++++++++++++++---- drivers/ata/libata-scsi.c | 3 ++- include/linux/blkdev.h | 1 + 3 files changed, 23 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/block/blk-settings.c b/block/blk-settings.c index 8dd86418f35d..dfc77012843f 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -302,11 +302,10 @@ EXPORT_SYMBOL(blk_queue_stack_limits); * @q: the request queue for the device * @mask: pad mask * - * Set pad mask. Direct IO requests are padded to the mask specified. + * Set dma pad mask. * - * Appending pad buffer to a request modifies ->data_len such that it - * includes the pad buffer. The original requested data length can be - * obtained using blk_rq_raw_data_len(). + * Appending pad buffer to a request modifies the last entry of a + * scatter list such that it includes the pad buffer. **/ void blk_queue_dma_pad(struct request_queue *q, unsigned int mask) { @@ -314,6 +313,23 @@ void blk_queue_dma_pad(struct request_queue *q, unsigned int mask) } EXPORT_SYMBOL(blk_queue_dma_pad); +/** + * blk_queue_update_dma_pad - update pad mask + * @q: the request queue for the device + * @mask: pad mask + * + * Update dma pad mask. + * + * Appending pad buffer to a request modifies the last entry of a + * scatter list such that it includes the pad buffer. + **/ +void blk_queue_update_dma_pad(struct request_queue *q, unsigned int mask) +{ + if (mask > q->dma_pad_mask) + q->dma_pad_mask = mask; +} +EXPORT_SYMBOL(blk_queue_update_dma_pad); + /** * blk_queue_dma_drain - Set up a drain buffer for excess dma. 
* @q: the request queue for the device diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 57a43649a461..499ccc628d81 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -885,7 +885,8 @@ static int ata_scsi_dev_config(struct scsi_device *sdev, /* set the min alignment and padding */ blk_queue_update_dma_alignment(sdev->request_queue, ATA_DMA_PAD_SZ - 1); - blk_queue_dma_pad(sdev->request_queue, ATA_DMA_PAD_SZ - 1); + blk_queue_update_dma_pad(sdev->request_queue, + ATA_DMA_PAD_SZ - 1); /* configure draining */ buf = kmalloc(ATAPI_MAX_DRAIN, q->bounce_gfp | GFP_KERNEL); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e04c4ac8a7cf..1ffd8bfdc4c9 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -776,6 +776,7 @@ extern void blk_queue_max_segment_size(struct request_queue *, unsigned int); extern void blk_queue_hardsect_size(struct request_queue *, unsigned short); extern void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b); extern void blk_queue_dma_pad(struct request_queue *, unsigned int); +extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int); extern int blk_queue_dma_drain(struct request_queue *q, dma_drain_needed_fn *dma_drain_needed, void *buf, unsigned int size); -- cgit v1.2.3 From ba8dd03ac09f51a69c154b8cb508b701d713a2cd Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 4 Jul 2008 11:26:40 +0200 Subject: generic-ipi: fix s390 build bug forgot to remove #include <linux/spinlock.h> from linux/smp.h while fixing the original s390 build bug. Patch below fixes this build bug caused by header inclusion dependencies: CC kernel/timer.o In file included from include/linux/spinlock.h:87, from include/linux/smp.h:11, from include/linux/kernel_stat.h:4, from kernel/timer.c:22: include/asm/spinlock.h: In function '__raw_spin_lock': include/asm/spinlock.h:69: error: implicit declaration of function 'smp_processor_id' Signed-off-by: Heiko Carstens Signed-off-by: Ingo Molnar --- include/linux/smp.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/smp.h b/include/linux/smp.h index 55261101d09a..48262f86c969 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -8,7 +8,6 @@ #include <linux/errno.h> #include <linux/types.h> -#include <linux/spinlock.h> #include <linux/list.h> extern void cpu_idle(void); -- cgit v1.2.3 From d2dbf343329dc777d77488743465f7be4245971d Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 13 Jun 2008 02:00:56 -0700 Subject: x86: clean up reserve_bootmem_generic() and port it to 32-bit 1. add reserve_bootmem_generic for 32bit 2. change len to unsigned long 3.
make early_res_to_bootmem to use it Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 2 +- arch/x86/kernel/mpparse.c | 18 ++++++------------ arch/x86/mm/init_32.c | 6 ++++++ arch/x86/mm/init_64.c | 3 ++- include/asm-x86/proto.h | 2 -- include/linux/bootmem.h | 2 ++ 6 files changed, 17 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 4f2cd5d179e2..774063f11be0 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -635,7 +635,7 @@ void __init early_res_to_bootmem(u64 start, u64 end) continue; printk(KERN_INFO " early res: %d [%llx-%llx] %s\n", i, final_start, final_end - 1, r->name); - reserve_bootmem(final_start, final_end - final_start, + reserve_bootmem_generic(final_start, final_end - final_start, BOOTMEM_DEFAULT); } } diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 7ac1b689b70a..b62ac6ba1410 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -859,10 +859,11 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, if (!reserve) return 1; -#ifdef CONFIG_X86_32 - reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE, + reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE, BOOTMEM_DEFAULT); if (mpf->mpf_physptr) { + unsigned long size = PAGE_SIZE; +#ifdef CONFIG_X86_32 /* * We cannot access to MPC table to compute * table size yet, as only few megabytes from @@ -872,22 +873,15 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, * PAGE_SIZE from mpg->mpf_physptr yields BUG() * in reserve_bootmem. */ - unsigned long size = PAGE_SIZE; unsigned long end = max_low_pfn * PAGE_SIZE; if (mpf->mpf_physptr + size > end) size = end - mpf->mpf_physptr; - reserve_bootmem(mpf->mpf_physptr, size, +#endif + reserve_bootmem_generic(mpf->mpf_physptr, size, BOOTMEM_DEFAULT); } -#else - reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE, - BOOTMEM_DEFAULT); - if (mpf->mpf_physptr) - reserve_bootmem_generic(mpf->mpf_physptr, - PAGE_SIZE, BOOTMEM_DEFAULT); -#endif - return 1; + return 1; } bp += 4; length -= 16; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 0e7bb5e81670..abadb1da70df 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -785,3 +785,9 @@ void free_initrd_mem(unsigned long start, unsigned long end) free_init_pages("initrd memory", start, end); } #endif + +int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, + int flags) +{ + return reserve_bootmem(phys, len, flags); +} diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index bf7bf1de6c25..b8c2c1ef7ad5 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -799,7 +799,8 @@ void free_initrd_mem(unsigned long start, unsigned long end) } #endif -int __init reserve_bootmem_generic(unsigned long phys, unsigned len, int flags) +int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, + int flags) { #ifdef CONFIG_NUMA int nid, next_nid; diff --git a/include/asm-x86/proto.h b/include/asm-x86/proto.h index a9f51472521e..3dd458c385c0 100644 --- a/include/asm-x86/proto.h +++ b/include/asm-x86/proto.h @@ -14,8 +14,6 @@ extern void ia32_syscall(void); extern void ia32_cstar_target(void); extern void ia32_sysenter_target(void); -extern int reserve_bootmem_generic(unsigned long phys, unsigned len, int flags); - extern void syscall32_cpu_init(void); extern void check_efer(void); diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 
686895bacd9d..a1d9b79078ea 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -84,6 +84,8 @@ extern int reserve_bootmem(unsigned long addr, unsigned long size, int flags); __alloc_bootmem_low(x, PAGE_SIZE, 0) #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ +extern int reserve_bootmem_generic(unsigned long addr, unsigned long size, + int flags); extern unsigned long free_all_bootmem(void); extern unsigned long free_all_bootmem_node(pg_data_t *pgdat); extern void *__alloc_bootmem_node(pg_data_t *pgdat, -- cgit v1.2.3 From cc1050bafebfb1d7935331282e948b5016318192 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 13 Jun 2008 19:08:52 -0700 Subject: x86: replace shrink_active_range() with remove_active_range() If we have KVA before the ramdisk on a node, we still need to use those ranges. v2: reserve_early the KVA RAM area; if there are holes in highmem, this keeps those areas from being treated as free high pages. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 45 ++++++++++++++++++++++++--------------------- include/linux/mm.h | 3 ++- mm/page_alloc.c | 29 +++++++++++++++++++++++------ 3 files changed, 49 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index accc7c6c57fc..c3f119e99e0d 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -230,8 +230,8 @@ static unsigned long calculate_numa_remap_pages(void) unsigned long size, reserve_pages = 0; for_each_online_node(nid) { - u64 node_end_target; - u64 node_end_final; + u64 node_kva_target; + u64 node_kva_final; /* * The acpi/srat node info can show hot-add memroy zones @@ -254,42 +254,45 @@ static unsigned long calculate_numa_remap_pages(void) /* now the roundup is correct, convert to PAGE_SIZE pages */ size = size * PTRS_PER_PTE; - node_end_target = round_down(node_end_pfn[nid] - size, + node_kva_target = round_down(node_end_pfn[nid] - size, PTRS_PER_PTE); - node_end_target <<= PAGE_SHIFT; + node_kva_target <<= PAGE_SHIFT; do { - node_end_final = find_e820_area(node_end_target, + node_kva_final = find_e820_area(node_kva_target, ((u64)node_end_pfn[nid])<<PAGE_SHIFT, ((u64)size)<<PAGE_SHIFT, LARGE_PAGE_BYTES); - node_end_target -= LARGE_PAGE_BYTES; - } while (node_end_final == -1ULL && - (node_end_target>>PAGE_SHIFT) > (node_start_pfn[nid])); + node_kva_target -= LARGE_PAGE_BYTES; + } while (node_kva_final == -1ULL && + (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid])); - if (node_end_final == -1ULL) + if (node_kva_final == -1ULL) panic("Can not get kva ram\n"); - printk("Reserving %ld pages of KVA for lmem_map of node %d\n", - size, nid); node_remap_size[nid] = size; node_remap_offset[nid] = reserve_pages; reserve_pages += size; - printk("Shrinking node %d from %ld pages to %lld pages\n", - nid, node_end_pfn[nid], node_end_final>>PAGE_SHIFT); + printk("Reserving %ld pages of KVA for lmem_map of node %d at %llx\n", + size, nid, node_kva_final>>PAGE_SHIFT); /* * prevent kva address below max_low_pfn want it on system * with less memory later. * layout will be: KVA address , KVA RAM + * + * we are supposed to only record the one less then max_low_pfn + * but we could have some hole in high memory, and it will only + * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide + * to use it as free.
+ * So reserve_early here, hope we don't run out of that array */ - if ((node_end_final>>PAGE_SHIFT) < max_low_pfn) - reserve_early(node_end_final, - node_end_final+(((u64)size)<<PAGE_SHIFT), - "KVA RAM"); - node_end_pfn[nid] = node_end_final>>PAGE_SHIFT; - node_remap_start_pfn[nid] = node_end_pfn[nid]; - shrink_active_range(nid, node_end_pfn[nid]); + reserve_early(node_kva_final, + node_kva_final+(((u64)size)<<PAGE_SHIFT), + "KVA RAM"); + node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT; + remove_active_range(nid, node_remap_start_pfn[nid], + node_remap_start_pfn[nid] + size); } printk("Reserving total of %ld pages for numa KVA remap\n", reserve_pages); diff --git a/include/linux/mm.h b/include/linux/mm.h index ce8e397a61f6..034a3156d2f0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -998,7 +998,8 @@ extern void free_area_init_node(int nid, pg_data_t *pgdat, extern void free_area_init_nodes(unsigned long *max_zone_pfn); extern void add_active_range(unsigned int nid, unsigned long start_pfn, unsigned long end_pfn); -extern void shrink_active_range(unsigned int nid, unsigned long new_end_pfn); +extern void remove_active_range(unsigned int nid, unsigned long start_pfn, + unsigned long end_pfn); extern void push_node_boundaries(unsigned int nid, unsigned long start_pfn, unsigned long end_pfn); extern void remove_all_active_ranges(void); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eee5ba7509c1..d80e1868e570 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3552,30 +3552,47 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn, } /** - * shrink_active_range - Shrink an existing registered range of PFNs + * remove_active_range - Shrink an existing registered range of PFNs * @nid: The node id the range is on that should be shrunk - * @new_end_pfn: The new PFN of the range + * @start_pfn: The new PFN of the range + * @end_pfn: The new PFN of the range * * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node. * The map is kept near the end physical page range that has already been * registered. This function allows an arch to shrink an existing registered * range.
*/ -void __init shrink_active_range(unsigned int nid, unsigned long new_end_pfn) +void __init remove_active_range(unsigned int nid, unsigned long start_pfn, + unsigned long end_pfn) { int i, j; int removed = 0; + printk(KERN_DEBUG "remove_active_range (%d, %lu, %lu)\n", + nid, start_pfn, end_pfn); + /* Find the old active region end and shrink */ for_each_active_range_index_in_nid(i, nid) { - if (early_node_map[i].start_pfn >= new_end_pfn) { + if (early_node_map[i].start_pfn >= start_pfn && + early_node_map[i].end_pfn <= end_pfn) { /* clear it */ + early_node_map[i].start_pfn = 0; early_node_map[i].end_pfn = 0; removed = 1; continue; } - if (early_node_map[i].end_pfn > new_end_pfn) { - early_node_map[i].end_pfn = new_end_pfn; + if (early_node_map[i].start_pfn < start_pfn && + early_node_map[i].end_pfn > start_pfn) { + unsigned long temp_end_pfn = early_node_map[i].end_pfn; + early_node_map[i].end_pfn = start_pfn; + if (temp_end_pfn > end_pfn) + add_active_range(nid, end_pfn, temp_end_pfn); + continue; + } + if (early_node_map[i].start_pfn >= start_pfn && + early_node_map[i].end_pfn > end_pfn && + early_node_map[i].start_pfn < end_pfn) { + early_node_map[i].start_pfn = end_pfn; continue; } } -- cgit v1.2.3 From b5bc6c0e55000dab86b73f838f5ad02908b23755 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 14 Jun 2008 18:32:52 -0700 Subject: x86, mm: use add_highpages_with_active_regions() for high pages init v2 use early_node_map to init high pages, so we can remove page_is_ram() and page_is_reserved_early() in the big loop with add_one_highpage also remove page_is_reserved_early(), it is not needed anymore. v2: fix the build of other platforms Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 11 -------- arch/x86/mm/discontig_32.c | 19 ++++++-------- arch/x86/mm/init_32.c | 62 ++++++++++++++++++++++++++++++++++++++-------- include/asm-x86/e820.h | 1 - include/asm-x86/highmem.h | 3 +++ include/linux/mm.h | 2 ++ mm/page_alloc.c | 8 ++++++ 7 files changed, 71 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 5051ce744b4e..ed46b7a6bc13 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -612,17 +612,6 @@ void __init free_early(u64 start, u64 end) early_res[j - 1].end = 0; } -int __init page_is_reserved_early(unsigned long pagenr) -{ - u64 start = (u64)pagenr << PAGE_SHIFT; - int i; - struct early_res *r; - - i = find_overlapped_early(start, start + PAGE_SIZE); - r = &early_res[i]; - return (i < MAX_EARLY_RES && r->end); -} - void __init early_res_to_bootmem(u64 start, u64 end) { int i; diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index c3f119e99e0d..7c4d0255f8d8 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -100,7 +100,6 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, #endif extern unsigned long find_max_low_pfn(void); -extern void add_one_highpage_init(struct page *, int, int); extern unsigned long highend_pfn, highstart_pfn; #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) @@ -432,10 +431,10 @@ void __init set_highmem_pages_init(int bad_ppro) { #ifdef CONFIG_HIGHMEM struct zone *zone; - struct page *page; + int nid; for_each_zone(zone) { - unsigned long node_pfn, zone_start_pfn, zone_end_pfn; + unsigned long zone_start_pfn, zone_end_pfn; if (!is_highmem(zone)) continue; @@ -443,16 +442,12 @@ void __init set_highmem_pages_init(int bad_ppro) zone_start_pfn = zone->zone_start_pfn; 
zone_end_pfn = zone_start_pfn + zone->spanned_pages; + nid = zone_to_nid(zone); printk("Initializing %s for node %d (%08lx:%08lx)\n", - zone->name, zone_to_nid(zone), - zone_start_pfn, zone_end_pfn); - - for (node_pfn = zone_start_pfn; node_pfn < zone_end_pfn; node_pfn++) { - if (!pfn_valid(node_pfn)) - continue; - page = pfn_to_page(node_pfn); - add_one_highpage_init(page, node_pfn, bad_ppro); - } + zone->name, nid, zone_start_pfn, zone_end_pfn); + + add_highpages_with_active_regions(nid, zone_start_pfn, + zone_end_pfn, bad_ppro); } totalram_pages += totalhigh_pages; #endif diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index abadb1da70df..ba07a489230e 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -287,10 +287,10 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) pkmap_page_table = pte; } -void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) +static void __init +add_one_highpage_init(struct page *page, int pfn, int bad_ppro) { - if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn)) && - !page_is_reserved_early(pfn)) { + if (!(bad_ppro && page_kills_ppro(pfn))) { ClearPageReserved(page); init_page_count(page); __free_page(page); @@ -299,18 +299,58 @@ void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) SetPageReserved(page); } +struct add_highpages_data { + unsigned long start_pfn; + unsigned long end_pfn; + int bad_ppro; +}; + +static void __init add_highpages_work_fn(unsigned long start_pfn, + unsigned long end_pfn, void *datax) +{ + int node_pfn; + struct page *page; + unsigned long final_start_pfn, final_end_pfn; + struct add_highpages_data *data; + int bad_ppro; + + data = (struct add_highpages_data *)datax; + bad_ppro = data->bad_ppro; + + final_start_pfn = max(start_pfn, data->start_pfn); + final_end_pfn = min(end_pfn, data->end_pfn); + if (final_start_pfn >= final_end_pfn) + return; + + for (node_pfn = final_start_pfn; node_pfn < final_end_pfn; + node_pfn++) { + if (!pfn_valid(node_pfn)) + continue; + page = pfn_to_page(node_pfn); + add_one_highpage_init(page, node_pfn, bad_ppro); + } + +} + +void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn, + unsigned long end_pfn, + int bad_ppro) +{ + struct add_highpages_data data; + + data.start_pfn = start_pfn; + data.end_pfn = end_pfn; + data.bad_ppro = bad_ppro; + + work_with_active_regions(nid, add_highpages_work_fn, &data); +} + #ifndef CONFIG_NUMA static void __init set_highmem_pages_init(int bad_ppro) { - int pfn; + add_highpages_with_active_regions(0, highstart_pfn, highend_pfn, + bad_ppro); - for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) { - /* - * Holes under sparsemem might not have no mem_map[]: - */ - if (pfn_valid(pfn)) - add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); - } totalram_pages += totalhigh_pages; } #endif /* !CONFIG_NUMA */ diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h index 6b0ce745a60c..55d310596907 100644 --- a/include/asm-x86/e820.h +++ b/include/asm-x86/e820.h @@ -86,7 +86,6 @@ extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align); extern void reserve_early(u64 start, u64 end, char *name); extern void free_early(u64 start, u64 end); extern void early_res_to_bootmem(u64 start, u64 end); -extern int page_is_reserved_early(unsigned long pagenr); extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align); extern unsigned long e820_end_of_ram(void); diff --git a/include/asm-x86/highmem.h b/include/asm-x86/highmem.h index e153f3b44774..85c4fea41ff6 100644 
--- a/include/asm-x86/highmem.h +++ b/include/asm-x86/highmem.h @@ -74,6 +74,9 @@ struct page *kmap_atomic_to_page(void *ptr); #define flush_cache_kmaps() do { } while (0) +extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn, + unsigned long end_pfn, int bad_ppro); + #endif /* __KERNEL__ */ #endif /* _ASM_HIGHMEM_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 034a3156d2f0..e4de460907c1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1011,6 +1011,8 @@ extern unsigned long find_min_pfn_with_active_regions(void); extern unsigned long find_max_pfn_with_active_regions(void); extern void free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn); +typedef void (*work_fn_t)(unsigned long, unsigned long, void *); +extern void work_with_active_regions(int nid, work_fn_t work_fn, void *data); extern void sparse_memory_present_with_active_regions(int nid); #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID extern int early_pfn_to_nid(unsigned long pfn); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d80e1868e570..41c6e3aa059f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2929,6 +2929,14 @@ void __init free_bootmem_with_active_regions(int nid, } } +void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) +{ + int i; + + for_each_active_range_index_in_nid(i, nid) + work_fn(early_node_map[i].start_pfn, early_node_map[i].end_pfn, + data); +} /** * sparse_memory_present_with_active_regions - Call memory_present for each active range * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. -- cgit v1.2.3 From 3461b0af025251bbc6b3d56c821c6ac2de6f7209 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 12 May 2008 21:21:13 +0200 Subject: x86: remove static boot_cpu_pda array v2 * Remove the boot_cpu_pda array and pointer table from the data section. Allocate the pointer table and array during init. do_boot_cpu() will reallocate the pda in node local memory and if the cpu is being brought up before the bootmem array is released (after_bootmem = 0), then it will free the initial pda. This will happen for all cpus present at system startup. This removes 512k + 32k bytes from the data section. For inclusion into sched-devel/latest tree. Based on: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git + sched-devel/latest .../mingo/linux-2.6-sched-devel.git Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/head64.c | 26 +++++++++++++++-- arch/x86/kernel/setup.c | 73 ++++++++++++++++++++++++++++++++++++----------- arch/x86/kernel/setup64.c | 8 ++++-- arch/x86/kernel/smpboot.c | 59 +++++++++++++++++++++++++++++--------- include/asm-x86/pda.h | 6 ++-- include/linux/mm.h | 1 + 6 files changed, 135 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index e25c57b8aa84..0ab59edd7067 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -25,6 +25,24 @@ #include #include +/* boot cpu pda */ +static struct x8664_pda _boot_cpu_pda __read_mostly; + +#ifdef CONFIG_SMP +#ifdef CONFIG_DEBUG_PER_CPU_MAPS +/* + * We install an empty cpu_pda pointer table to trap references before + * the actual cpu_pda pointer table is created in setup_cpu_pda_map(). 
+ */ +static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata; +#else +static struct x8664_pda *__cpu_pda[1] __read_mostly; +#endif + +#else /* !CONFIG_SMP (NR_CPUS will be 1) */ +static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly; +#endif + static void __init zap_identity_mappings(void) { pgd_t *pgd = pgd_offset_k(0UL); @@ -156,10 +174,12 @@ void __init x86_64_start_kernel(char * real_mode_data) early_printk("Kernel alive\n"); - for (i = 0; i < NR_CPUS; i++) - cpu_pda(i) = &boot_cpu_pda[i]; - + _cpu_pda = __cpu_pda; + cpu_pda(0) = &_boot_cpu_pda; pda_init(0); + + early_printk("Kernel really alive\n"); + copy_bootdata(__va(real_mode_data)); reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 913af838c3c5..dd12c1c84a8f 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -101,6 +101,50 @@ static inline void setup_cpumask_of_cpu(void) { } */ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; EXPORT_SYMBOL(__per_cpu_offset); +static inline void setup_cpu_pda_map(void) { } + +#elif !defined(CONFIG_SMP) +static inline void setup_cpu_pda_map(void) { } + +#else /* CONFIG_SMP && CONFIG_X86_64 */ + +/* + * Allocate cpu_pda pointer table and array via alloc_bootmem. + */ +static void __init setup_cpu_pda_map(void) +{ + char *pda; + struct x8664_pda **new_cpu_pda; + unsigned long size; + int cpu; + + size = roundup(sizeof(struct x8664_pda), cache_line_size()); + + /* allocate cpu_pda array and pointer table */ + { + unsigned long tsize = nr_cpu_ids * sizeof(void *); + unsigned long asize = size * (nr_cpu_ids - 1); + + tsize = roundup(tsize, cache_line_size()); + new_cpu_pda = alloc_bootmem(tsize + asize); + pda = (char *)new_cpu_pda + tsize; + } + + /* initialize pointer table to static pda's */ + for_each_possible_cpu(cpu) { + if (cpu == 0) { + /* leave boot cpu pda in place */ + new_cpu_pda[0] = cpu_pda(0); + continue; + } + new_cpu_pda[cpu] = (struct x8664_pda *)pda; + new_cpu_pda[cpu]->in_bootmem = 1; + pda += size; + } + + /* point to new pointer table */ + _cpu_pda = new_cpu_pda; +} #endif /* @@ -110,46 +154,43 @@ EXPORT_SYMBOL(__per_cpu_offset); */ void __init setup_per_cpu_areas(void) { - int i, highest_cpu = 0; - unsigned long size; + ssize_t size = PERCPU_ENOUGH_ROOM; + char *ptr; + int cpu; #ifdef CONFIG_HOTPLUG_CPU prefill_possible_map(); +#else + nr_cpu_ids = num_processors; #endif + /* Setup cpu_pda map */ + setup_cpu_pda_map(); + /* Copy section for each CPU (we discard the original) */ size = PERCPU_ENOUGH_ROOM; printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size); - for_each_possible_cpu(i) { - char *ptr; + for_each_possible_cpu(cpu) { #ifndef CONFIG_NEED_MULTIPLE_NODES ptr = alloc_bootmem_pages(size); #else - int node = early_cpu_to_node(i); + int node = early_cpu_to_node(cpu); if (!node_online(node) || !NODE_DATA(node)) { ptr = alloc_bootmem_pages(size); printk(KERN_INFO "cpu %d has no node %d or node-local memory\n", - i, node); + cpu, node); } else ptr = alloc_bootmem_pages_node(NODE_DATA(node), size); #endif - if (!ptr) - panic("Cannot allocate cpu data for CPU %d\n", i); -#ifdef CONFIG_X86_64 - cpu_pda(i)->data_offset = ptr - __per_cpu_start; -#else - __per_cpu_offset[i] = ptr - __per_cpu_start; -#endif + per_cpu_offset(cpu) = ptr - __per_cpu_start; memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); - highest_cpu = i; } - nr_cpu_ids = highest_cpu + 1; printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n", NR_CPUS, 
nr_cpu_ids, nr_node_ids); @@ -199,7 +240,7 @@ void __cpuinit numa_set_node(int cpu, int node) { int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); - if (node != NUMA_NO_NODE) + if (cpu_pda(cpu) && node != NUMA_NO_NODE) cpu_pda(cpu)->nodenumber = node; if (cpu_to_node_map) diff --git a/arch/x86/kernel/setup64.c b/arch/x86/kernel/setup64.c index aee0e8200777..631ea6cc01d8 100644 --- a/arch/x86/kernel/setup64.c +++ b/arch/x86/kernel/setup64.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -34,9 +35,8 @@ struct boot_params boot_params; cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; -struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly; +struct x8664_pda **_cpu_pda __read_mostly; EXPORT_SYMBOL(_cpu_pda); -struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned; struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; @@ -114,8 +114,10 @@ void pda_init(int cpu) __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); if (!pda->irqstackptr) panic("cannot allocate irqstack for cpu %d", cpu); - } + if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE) + pda->nodenumber = cpu_to_node(cpu); + } pda->irqstackptr += IRQSTACKSIZE-64; } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 036604d3daed..bf0833487455 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -816,6 +816,43 @@ static void __cpuinit do_fork_idle(struct work_struct *work) complete(&c_idle->done); } +/* + * Allocate node local memory for the AP pda. + * + * Must be called after the _cpu_pda pointer table is initialized. + */ +static int __cpuinit get_local_pda(int cpu) +{ + struct x8664_pda *oldpda, *newpda; + unsigned long size = sizeof(struct x8664_pda); + int node = cpu_to_node(cpu); + + if (cpu_pda(cpu) && !cpu_pda(cpu)->in_bootmem) + return 0; + + oldpda = cpu_pda(cpu); + newpda = kmalloc_node(size, GFP_ATOMIC, node); + if (!newpda) { + printk(KERN_ERR "Could not allocate node local PDA " + "for CPU %d on node %d\n", cpu, node); + + if (oldpda) + return 0; /* have a usable pda */ + else + return -1; + } + + if (oldpda) { + memcpy(newpda, oldpda, size); + if (!after_bootmem) + free_bootmem((unsigned long)oldpda, size); + } + + newpda->in_bootmem = 0; + cpu_pda(cpu) = newpda; + return 0; +} + static int __cpuinit do_boot_cpu(int apicid, int cpu) /* * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad @@ -841,19 +878,11 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) } /* Allocate node local memory for AP pdas */ - if (cpu_pda(cpu) == &boot_cpu_pda[cpu]) { - struct x8664_pda *newpda, *pda; - int node = cpu_to_node(cpu); - pda = cpu_pda(cpu); - newpda = kmalloc_node(sizeof(struct x8664_pda), GFP_ATOMIC, - node); - if (newpda) { - memcpy(newpda, pda, sizeof(struct x8664_pda)); - cpu_pda(cpu) = newpda; - } else - printk(KERN_ERR - "Could not allocate node local PDA for CPU %d on node %d\n", - cpu, node); + if (cpu > 0) { + boot_error = get_local_pda(cpu); + if (boot_error) + goto restore_state; + /* if can't get pda memory, can't start cpu */ } #endif @@ -972,6 +1001,8 @@ do_rest: } } +restore_state: + if (boot_error) { /* Try to put things back the way they were before ... 
*/ unmap_cpu_to_logical_apicid(cpu); @@ -1347,6 +1378,8 @@ __init void prefill_possible_map(void) for (i = 0; i < possible; i++) cpu_set(i, cpu_possible_map); + + nr_cpu_ids = possible; } static void __ref remove_cpu_from_maps(int cpu) diff --git a/include/asm-x86/pda.h b/include/asm-x86/pda.h index de2ad9ac35a9..b34e9a7cc80b 100644 --- a/include/asm-x86/pda.h +++ b/include/asm-x86/pda.h @@ -22,7 +22,8 @@ struct x8664_pda { offset 40!!! */ #endif char *irqstackptr; - int nodenumber; /* number of current node */ + short nodenumber; /* number of current node (32k max) */ + short in_bootmem; /* pda lives in bootmem */ unsigned int __softirq_pending; unsigned int __nmi_count; /* number of NMI on this CPUs */ short mmu_state; @@ -38,8 +39,7 @@ struct x8664_pda { unsigned irq_spurious_count; } ____cacheline_aligned_in_smp; -extern struct x8664_pda *_cpu_pda[]; -extern struct x8664_pda boot_cpu_pda[]; +extern struct x8664_pda **_cpu_pda; extern void pda_init(int); #define cpu_pda(i) (_cpu_pda[i]) diff --git a/include/linux/mm.h b/include/linux/mm.h index 586a943cab01..0ea48a5af823 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1024,6 +1024,7 @@ extern void mem_init(void); extern void show_mem(void); extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); +extern int after_bootmem; #ifdef CONFIG_NUMA extern void setup_per_cpu_pageset(void); -- cgit v1.2.3 From a7bf0bd5e6af7fe69342dabf2a3b721f0163469a Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 28 May 2008 15:02:14 +0100 Subject: build: add __page_aligned_data and __page_aligned_bss Making a variable page-aligned by using __attribute__((section(".data.page_aligned"))) is fragile because if sizeof(variable) is not also a multiple of page size, it leaves variables in the remainder of the section unaligned. This patch introduces two new qualifiers, __page_aligned_data and __page_aligned_bss to set the section *and* the alignment of variables. This makes page-aligned variables more robust because the linker will make sure they're aligned properly. Unfortunately it requires *all* page-aligned data to use these macros... 
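A sketch of the conversion this enables, using boot_cpu_stack from the diff below. Before, alignment had to come from section placement alone:

char boot_cpu_stack[IRQSTACKSIZE]
	__attribute__((section(".bss.page_aligned")));

After, the qualifier sets the section and forces the alignment, so the linker keeps the variable page aligned even when its neighbours in the section have odd sizes:

char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;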
Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup64.c | 2 +- arch/x86/mm/ioremap.c | 3 +-- include/linux/linkage.h | 4 ++++ 3 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/setup64.c b/arch/x86/kernel/setup64.c index 631ea6cc01d8..fc1a56da8240 100644 --- a/arch/x86/kernel/setup64.c +++ b/arch/x86/kernel/setup64.c @@ -40,7 +40,7 @@ EXPORT_SYMBOL(_cpu_pda); struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; -char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned"))); +char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss; unsigned long __supported_pte_mask __read_mostly = ~0UL; EXPORT_SYMBOL_GPL(__supported_pte_mask); diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 416ea415f5c2..0561fde56a51 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -394,8 +394,7 @@ static int __init early_ioremap_debug_setup(char *str) early_param("early_ioremap_debug", early_ioremap_debug_setup); static __initdata int after_paging_init; -static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] - __section(.bss.page_aligned); +static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss; static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) { diff --git a/include/linux/linkage.h b/include/linux/linkage.h index 2119610b24f8..9fd1f859021b 100644 --- a/include/linux/linkage.h +++ b/include/linux/linkage.h @@ -1,6 +1,7 @@ #ifndef _LINUX_LINKAGE_H #define _LINUX_LINKAGE_H +#include <asm/page.h> #include <asm/linkage.h> #ifdef __cplusplus @@ -17,6 +18,9 @@ # define asmregparm #endif +#define __page_aligned_data __section(.data.page_aligned) __aligned(PAGE_SIZE) +#define __page_aligned_bss __section(.bss.page_aligned) __aligned(PAGE_SIZE) + /* * This is used by architectures to keep arguments on the stack * untouched by the compiler by keeping them live until the end. -- cgit v1.2.3 From d52d53b8a5b258bfaab9223a5e7284fcfdd48577 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 16 Jun 2008 20:10:55 -0700 Subject: RFC x86: try to remove arch_get_ram_range We want to remove arch_get_ram_range() and use early_node_map instead.
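A minimal sketch of the callback pattern this converts to (the example_ names are hypothetical): work_with_active_regions() invokes the callback once per early_node_map[] range on a node and, with the int return type introduced here, stops the walk as soon as the callback returns non-zero.

static int __init example_range_fn(unsigned long start_pfn,
				   unsigned long end_pfn, void *datax)
{
	unsigned long *total = datax;

	*total += end_pfn - start_pfn;
	return 0;	/* non-zero would terminate the walk early */
}

static unsigned long __init example_count_pages(int nid)
{
	unsigned long total = 0;

	work_with_active_regions(nid, example_range_fn, &total);
	return total;
}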
Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 6 ++++-- drivers/pci/intel-iommu.c | 51 ++++++++++++++++++++++++++++++++++------------- include/linux/mm.h | 2 +- mm/page_alloc.c | 10 +++++++--- 4 files changed, 49 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 65d55056b6e7..a0484adbf59d 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -298,7 +298,7 @@ struct add_highpages_data { unsigned long end_pfn; }; -static void __init add_highpages_work_fn(unsigned long start_pfn, +static int __init add_highpages_work_fn(unsigned long start_pfn, unsigned long end_pfn, void *datax) { int node_pfn; @@ -311,7 +311,7 @@ static void __init add_highpages_work_fn(unsigned long start_pfn, final_start_pfn = max(start_pfn, data->start_pfn); final_end_pfn = min(end_pfn, data->end_pfn); if (final_start_pfn >= final_end_pfn) - return; + return 0; for (node_pfn = final_start_pfn; node_pfn < final_end_pfn; node_pfn++) { @@ -321,6 +321,8 @@ static void __init add_highpages_work_fn(unsigned long start_pfn, add_one_highpage_init(page, node_pfn); } + return 0; + } void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn, diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 66c0fd21894b..bb0642318a95 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -1637,12 +1637,43 @@ static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr, } #ifdef CONFIG_DMAR_GFX_WA -extern int arch_get_ram_range(int slot, u64 *addr, u64 *size); +struct iommu_prepare_data { + struct pci_dev *pdev; + int ret; +}; + +static int __init iommu_prepare_work_fn(unsigned long start_pfn, + unsigned long end_pfn, void *datax) +{ + struct iommu_prepare_data *data; + + data = (struct iommu_prepare_data *)datax; + + data->ret = iommu_prepare_identity_map(data->pdev, + start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT); + + return data->ret; + +} + +static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev) +{ + int nid; + struct iommu_prepare_data data; + + data.pdev = pdev; + data.ret = 0; + + for_each_online_node(nid) { + work_with_active_regions(nid, iommu_prepare_work_fn, &data); + if (data.ret) + return data.ret; + } + return data.ret; +} + static void __init iommu_prepare_gfx_mapping(void) { struct pci_dev *pdev = NULL; - u64 base, size; - int slot; int ret; for_each_pci_dev(pdev) { @@ -1651,17 +1682,9 @@ static void __init iommu_prepare_gfx_mapping(void) continue; printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n", pci_name(pdev)); - slot = arch_get_ram_range(0, &base, &size); - while (slot >= 0) { - ret = iommu_prepare_identity_map(pdev, - base, base + size); - if (ret) - goto error; - slot = arch_get_ram_range(slot, &base, &size); - } - continue; -error: - printk(KERN_ERR "IOMMU: mapping reserved region failed\n"); + ret = iommu_prepare_with_active_regions(pdev); + if (ret) + printk(KERN_ERR "IOMMU: mapping reserved region failed\n"); } } #endif diff --git a/include/linux/mm.h b/include/linux/mm.h index 3d647b24041f..cf1cd3a2ed78 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1011,7 +1011,7 @@ extern unsigned long find_min_pfn_with_active_regions(void); extern unsigned long find_max_pfn_with_active_regions(void); extern void free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn); -typedef void (*work_fn_t)(unsigned long, unsigned long, void *); +typedef int (*work_fn_t)(unsigned long, unsigned long, void *); extern void work_with_active_regions(int nid, work_fn_t
work_fn, void *data); extern void sparse_memory_present_with_active_regions(int nid); #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 41c6e3aa059f..e25b6b24f844 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2932,10 +2932,14 @@ void __init free_bootmem_with_active_regions(int nid, void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) { int i; + int ret; - for_each_active_range_index_in_nid(i, nid) - work_fn(early_node_map[i].start_pfn, early_node_map[i].end_pfn, - data); + for_each_active_range_index_in_nid(i, nid) { + ret = work_fn(early_node_map[i].start_pfn, + early_node_map[i].end_pfn, data); + if (ret) + break; + } } /** * sparse_memory_present_with_active_regions - Call memory_present for each active range -- cgit v1.2.3 From 3c999f142665265afd0fe9190204dd051f17e505 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 20 Jun 2008 16:11:20 -0700 Subject: x86: check command line when CONFIG_X86_MPPARSE is not set, v2 if acpi=off, acpi=noirq and pci=noacpi, we need to disable apic. Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: "Maciej W. Rozycki" Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 14 ++++++++++++++ arch/x86/kernel/apic_32.c | 2 +- arch/x86/kernel/setup.c | 4 ++++ arch/x86/kernel/setup_32.c | 5 +++++ arch/x86/kernel/setup_64.c | 9 +++++++++ include/asm-x86/apic.h | 6 +++++- include/linux/acpi.h | 6 ++++++ 7 files changed, 44 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 6516359922ba..5c0107602b62 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1787,6 +1787,20 @@ static int __init parse_pci(char *arg) } early_param("pci", parse_pci); +int __init acpi_mps_check(void) +{ +#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_X86_MPPARSE) +/* mptable code is not built-in*/ + if (acpi_disabled || acpi_noirq) { + printk(KERN_WARNING "MPS support code is not built-in.\n" + "Using acpi=off or acpi=noirq or pci=noacpi " + "may have problem\n"); + return 1; + } +#endif + return 0; +} + #ifdef CONFIG_X86_IO_APIC static int __init parse_acpi_skip_timer_override(char *arg) { diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index dd8de26b2786..4932d7813bcd 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -57,7 +57,7 @@ unsigned long mp_lapic_addr; * * -1=force-disable, +1=force-enable */ -static int enable_local_apic __initdata; +int enable_local_apic; /* Local APIC timer verification ok */ static int local_apic_timer_verify_ok; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index c5330f601b68..56aee55cf8dc 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -161,6 +161,10 @@ void __init setup_per_cpu_areas(void) char *ptr; int cpu; + /* no processor from mptable or madt */ + if (!num_processors) + num_processors = 1; + #ifdef CONFIG_HOTPLUG_CPU prefill_possible_map(); #else diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 369d0fe1ff9c..cad4e893df05 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -677,6 +677,11 @@ void __init setup_arch(char **cmdline_p) parse_early_param(); + if (acpi_mps_check()){ + enable_local_apic = -1; + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); + } + finish_e820_parsing(); probe_roms(); diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index a93300de4da9..175c696ec536 100644 --- 
a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -302,6 +302,11 @@ void __init setup_arch(char **cmdline_p) parse_early_param(); + if (acpi_mps_check()) { + disable_apic = 1; + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); + } + #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT if (init_ohci1394_dma_early) init_ohci1394_dma_on_all_controllers(); @@ -723,6 +728,10 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) cpu_devs[c->x86_vendor]->c_early_init(c); validate_pat_support(c); + + /* early_param could clear that, but recall get it set again */ + if (disable_apic) + clear_cpu_cap(c, X86_FEATURE_APIC); } /* diff --git a/include/asm-x86/apic.h b/include/asm-x86/apic.h index 313bcaf4b6c3..9fe941cd843d 100644 --- a/include/asm-x86/apic.h +++ b/include/asm-x86/apic.h @@ -39,8 +39,12 @@ extern int apic_verbosity; extern int local_apic_timer_c2_ok; extern int ioapic_force; -extern int disable_apic; +#ifdef CONFIG_X86_64 +extern int disable_apic; +#else +extern int enable_local_apic; +#endif /* * Basic functions accessing APICs. */ diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 41f7ce7edd7a..0601075d09a1 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -82,6 +82,7 @@ char * __acpi_map_table (unsigned long phys_addr, unsigned long size); int early_acpi_boot_init(void); int acpi_boot_init (void); int acpi_boot_table_init (void); +int acpi_mps_check (void); int acpi_numa_init (void); int acpi_table_init (void); @@ -250,6 +251,11 @@ static inline int acpi_boot_table_init(void) return 0; } +static inline int acpi_mps_check(void) +{ + return 0; +} + static inline int acpi_check_resource_conflict(struct resource *res) { return 0; -- cgit v1.2.3 From 69ac9cd629ca96e59f34eb4ccd12d00b2c8276a7 Mon Sep 17 00:00:00 2001 From: Bernhard Walle Date: Fri, 27 Jun 2008 13:12:54 +0200 Subject: sysfs: add /sys/firmware/memmap This patch adds /sys/firmware/memmap interface that represents the BIOS (or Firmware) provided memory map. The tree looks like: /sys/firmware/memmap/0/start (hex number) end (hex number) type (string) ... /1/start end type With the following shell snippet one can print the memory map in the same form the kernel prints itself when booting on x86 (the E820 map). --------- 8< -------------------------- #!/bin/sh cd /sys/firmware/memmap for dir in * ; do start=$(cat $dir/start) end=$(cat $dir/end) type=$(cat $dir/type) printf "%016x-%016x (%s)\n" $start $[ $end +1] "$type" done --------- >8 -------------------------- That patch only provides the needed interface: 1. The sysfs interface. 2. The structure and enumeration definition. 3. The function firmware_map_add() and firmware_map_add_early() that should be called from architecture code (E820/EFI, for example) to add the contents to the interface. If the kernel is compiled without CONFIG_FIRMWARE_MEMMAP, the interface does nothing without cluttering the architecture-specific code with #ifdef's. The purpose of the new interface is kexec: While /proc/iomem represents the *used* memory map (e.g. modified via kernel parameters like 'memmap' and 'mem'), the /sys/firmware/memmap tree represents the unmodified memory map provided via the firmware. So kexec can: - use the original memory map for rebooting, - use the /proc/iomem for setting up the ELF core headers for kdump case that should only represent the memory of the system. The patch has been tested on i386 and x86_64. 
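A sketch of how architecture code might feed this interface. firmware_map_add_early() and the end-inclusive convention come from this patch's commit message and ABI document; the E820 loop and the e820_type_to_string() helper are placeholders for what the x86 side would supply, not part of this patch:

#include <linux/firmware-map.h>

static int __init example_register_e820_map(void)
{
	int i;

	for (i = 0; i < e820.nr_map; i++) {
		struct e820entry *e = &e820.map[i];

		/* the sysfs "end" attribute is inclusive, hence the -1 */
		firmware_map_add_early(e->addr, e->addr + e->size - 1,
				       e820_type_to_string(e->type));
	}
	return 0;
}

The _early variant exists because the memory map is registered before the slab allocator is up; later hot-add paths would use firmware_map_add() instead.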
Signed-off-by: Bernhard Walle Acked-by: Greg KH Acked-by: Vivek Goyal Cc: kexec@lists.infradead.org Cc: yhlu.kernel@gmail.com Signed-off-by: Ingo Molnar --- Documentation/ABI/testing/sysfs-firmware-memmap | 71 ++++++++ drivers/firmware/Kconfig | 10 ++ drivers/firmware/Makefile | 1 + drivers/firmware/memmap.c | 205 ++++++++++++++++++++++++ include/linux/firmware-map.h | 74 +++++++++ 5 files changed, 361 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-firmware-memmap create mode 100644 drivers/firmware/memmap.c create mode 100644 include/linux/firmware-map.h (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-firmware-memmap b/Documentation/ABI/testing/sysfs-firmware-memmap new file mode 100644 index 000000000000..0d99ee6ae02e --- /dev/null +++ b/Documentation/ABI/testing/sysfs-firmware-memmap @@ -0,0 +1,71 @@ +What: /sys/firmware/memmap/ +Date: June 2008 +Contact: Bernhard Walle +Description: + On all platforms, the firmware provides a memory map which the + kernel reads. The resources from that memory map are registered + in the kernel resource tree and exposed to userspace via + /proc/iomem (together with other resources). + + However, on most architectures that firmware-provided memory + map is modified afterwards by the kernel itself, either because + the kernel merges that memory map with other information or + just because the user overrides that memory map via the command + line. + + kexec needs the raw firmware-provided memory map to set up the + parameter segment of the kernel that should be booted with + kexec. Also, the raw memory map is useful for debugging. For + that reason, /sys/firmware/memmap is an interface that provides + the raw memory map to userspace. + + The structure is as follows: Under /sys/firmware/memmap there + are subdirectories with the number of the entry as their name: + + /sys/firmware/memmap/0 + /sys/firmware/memmap/1 + /sys/firmware/memmap/2 + /sys/firmware/memmap/3 + ... + + The maximum depends on the number of memory map entries provided + by the firmware. The order is simply the order in which the + firmware provides the entries. + + Each directory contains three files: + + start : The start address (as a hexadecimal number with the + '0x' prefix). + end : The end address, inclusive (regardless of whether the + firmware provides inclusive or exclusive ranges). + type : Type of the entry as a string. See below for a list of + valid types. + + So, for example: + + /sys/firmware/memmap/0/start + /sys/firmware/memmap/0/end + /sys/firmware/memmap/0/type + /sys/firmware/memmap/1/start + ... + + Currently, the following types exist: + + - System RAM + - ACPI Tables + - ACPI Non-volatile Storage + - reserved + + The following shell snippet can be used to display that memory + map in a human-readable format: + + -------------------- 8< ---------------------------------------- + #!/bin/bash + cd /sys/firmware/memmap + for dir in * ; do + start=$(cat $dir/start) + end=$(cat $dir/end) + type=$(cat $dir/type) + printf "%016x-%016x (%s)\n" $start $((end + 1)) "$type" + done + -------------------- >8 ---------------------------------------- diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig index dc2cec6127d1..ebb9e51deb0c 100644 --- a/drivers/firmware/Kconfig +++ b/drivers/firmware/Kconfig @@ -26,6 +26,16 @@ config EDD_OFF kernel. Say N if you want EDD enabled by default. EDD can be dynamically set using the kernel parameter 'edd={on|skipmbr|off}'.
+config FIRMWARE_MEMMAP + bool "Add firmware-provided memory map to sysfs" if EMBEDDED + default (X86_64 || X86_32) + help + Add the firmware-provided (unmodified) memory map to /sys/firmware/memmap. + That memory map is used, for example, by kexec to set up the parameter + area for the next kernel, but can also be used for debugging purposes. + + See also Documentation/ABI/testing/sysfs-firmware-memmap. + config EFI_VARS tristate "EFI Variable Support via sysfs" depends on EFI diff --git a/drivers/firmware/Makefile b/drivers/firmware/Makefile index 4c9147154df8..1c3c17343dbe 100644 --- a/drivers/firmware/Makefile +++ b/drivers/firmware/Makefile @@ -10,3 +10,4 @@ obj-$(CONFIG_DCDBAS) += dcdbas.o obj-$(CONFIG_DMIID) += dmi-id.o obj-$(CONFIG_ISCSI_IBFT_FIND) += iscsi_ibft_find.o obj-$(CONFIG_ISCSI_IBFT) += iscsi_ibft.o +obj-$(CONFIG_FIRMWARE_MEMMAP) += memmap.o diff --git a/drivers/firmware/memmap.c b/drivers/firmware/memmap.c new file mode 100644 index 000000000000..e23399c7f773 --- /dev/null +++ b/drivers/firmware/memmap.c @@ -0,0 +1,205 @@ +/* + * linux/drivers/firmware/memmap.c + * Copyright (C) 2008 SUSE LINUX Products GmbH + * by Bernhard Walle + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License v2.0 as published by + * the Free Software Foundation + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include +#include +#include +#include +#include +#include + +/* + * Data types ------------------------------------------------------------------ + */ + +/* + * Firmware map entry. Because firmware memory maps are flat and not + * hierarchical, it's OK to organise them in a linked list. Unlike for the + * resource tree, no parent information is necessary. + */ +struct firmware_map_entry { + resource_size_t start; /* start of the memory range */ + resource_size_t end; /* end of the memory range (incl.) */ + const char *type; /* type of the memory range */ + struct list_head list; /* entry for the linked list */ + struct kobject kobj; /* kobject for each entry */ +}; + +/* + * Forward declarations -------------------------------------------------------- + */ +static ssize_t memmap_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf); +static ssize_t start_show(struct firmware_map_entry *entry, char *buf); +static ssize_t end_show(struct firmware_map_entry *entry, char *buf); +static ssize_t type_show(struct firmware_map_entry *entry, char *buf); + +/* + * Static data ----------------------------------------------------------------- + */ + +struct memmap_attribute { + struct attribute attr; + ssize_t (*show)(struct firmware_map_entry *entry, char *buf); +}; + +struct memmap_attribute memmap_start_attr = __ATTR_RO(start); +struct memmap_attribute memmap_end_attr = __ATTR_RO(end); +struct memmap_attribute memmap_type_attr = __ATTR_RO(type); + +/* + * These are default attributes that are added for every memmap entry.
+ */ +static struct attribute *def_attrs[] = { + &memmap_start_attr.attr, + &memmap_end_attr.attr, + &memmap_type_attr.attr, + NULL +}; + +static struct sysfs_ops memmap_attr_ops = { + .show = memmap_attr_show, +}; + +static struct kobj_type memmap_ktype = { + .sysfs_ops = &memmap_attr_ops, + .default_attrs = def_attrs, +}; + +/* + * Registration functions ------------------------------------------------------ + */ + +/* + * Firmware memory map entries + */ +static LIST_HEAD(map_entries); + +/** + * Common implementation of firmware_map_add() and firmware_map_add_early() + * which expects a pre-allocated struct firmware_map_entry. + * + * @start: Start of the memory range. + * @end: End of the memory range (inclusive). + * @type: Type of the memory range. + * @entry: Pre-allocated (either kmalloc() or bootmem allocator), uninitialised + * entry. + */ +static int firmware_map_add_entry(resource_size_t start, resource_size_t end, + const char *type, + struct firmware_map_entry *entry) +{ + BUG_ON(start > end); + + entry->start = start; + entry->end = end; + entry->type = type; + INIT_LIST_HEAD(&entry->list); + kobject_init(&entry->kobj, &memmap_ktype); + + list_add_tail(&entry->list, &map_entries); + + return 0; +} + +/* + * See <linux/firmware-map.h> for documentation. + */ +int firmware_map_add(resource_size_t start, resource_size_t end, + const char *type) +{ + struct firmware_map_entry *entry; + + entry = kmalloc(sizeof(struct firmware_map_entry), GFP_ATOMIC); + WARN_ON(!entry); + if (!entry) + return -ENOMEM; + + return firmware_map_add_entry(start, end, type, entry); +} + +/* + * See <linux/firmware-map.h> for documentation. + */ +int __init firmware_map_add_early(resource_size_t start, resource_size_t end, + const char *type) +{ + struct firmware_map_entry *entry; + + entry = alloc_bootmem_low(sizeof(struct firmware_map_entry)); + WARN_ON(!entry); + if (!entry) + return -ENOMEM; + + return firmware_map_add_entry(start, end, type, entry); +} + +/* + * Sysfs functions ------------------------------------------------------------- + */ + +static ssize_t start_show(struct firmware_map_entry *entry, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "0x%llx\n", entry->start); +} + +static ssize_t end_show(struct firmware_map_entry *entry, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "0x%llx\n", entry->end); +} + +static ssize_t type_show(struct firmware_map_entry *entry, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%s\n", entry->type); +} + +#define to_memmap_attr(_attr) container_of(_attr, struct memmap_attribute, attr) +#define to_memmap_entry(obj) container_of(obj, struct firmware_map_entry, kobj) + +static ssize_t memmap_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct firmware_map_entry *entry = to_memmap_entry(kobj); + struct memmap_attribute *memmap_attr = to_memmap_attr(attr); + + return memmap_attr->show(entry, buf); +} + +/* + * Initialises the sysfs kset and adds the entries in the map_entries list to + * sysfs. It is important that firmware_map_add() and firmware_map_add_early() + * are called before late_initcall.
+ */ +static int __init memmap_init(void) +{ + int i = 0; + struct firmware_map_entry *entry; + struct kset *memmap_kset; + + memmap_kset = kset_create_and_add("memmap", NULL, firmware_kobj); + WARN_ON(!memmap_kset); + if (!memmap_kset) + return -ENOMEM; + + list_for_each_entry(entry, &map_entries, list) { + entry->kobj.kset = memmap_kset; + kobject_add(&entry->kobj, NULL, "%d", i++); + } + + return 0; +} +late_initcall(memmap_init); + diff --git a/include/linux/firmware-map.h b/include/linux/firmware-map.h new file mode 100644 index 000000000000..acbdbcc16051 --- /dev/null +++ b/include/linux/firmware-map.h @@ -0,0 +1,74 @@ +/* + * include/linux/firmware-map.h: + * Copyright (C) 2008 SUSE LINUX Products GmbH + * by Bernhard Walle + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License v2.0 as published by + * the Free Software Foundation + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ +#ifndef _LINUX_FIRMWARE_MAP_H +#define _LINUX_FIRMWARE_MAP_H + +#include +#include + +/* + * Provide a dummy interface if CONFIG_FIRMWARE_MEMMAP is disabled + */ +#ifdef CONFIG_FIRMWARE_MEMMAP + +/** + * Adds a firmware mapping entry. This function uses kmalloc() for memory + * allocation. Use firmware_map_add_early() if you want to use the bootmem + * allocator. + * + * This function must be called before late_initcall. + * + * @start: Start of the memory range. + * @end: End of the memory range (inclusive). + * @type: Type of the memory range. + * + * Returns 0 on success, or -ENOMEM if no memory could be allocated. + */ +int firmware_map_add(resource_size_t start, resource_size_t end, + const char *type); + +/** + * Adds a firmware mapping entry. This function uses the bootmem allocator + * for memory allocation. Use firmware_map_add() if you want to use kmalloc(). + * + * This function must be called before late_initcall. + * + * @start: Start of the memory range. + * @end: End of the memory range (inclusive). + * @type: Type of the memory range. + * + * Returns 0 on success, or -ENOMEM if no memory could be allocated. + */ +int firmware_map_add_early(resource_size_t start, resource_size_t end, + const char *type); + +#else /* CONFIG_FIRMWARE_MEMMAP */ + +static inline int firmware_map_add(resource_size_t start, resource_size_t end, + const char *type) +{ + return 0; +} + +static inline int firmware_map_add_early(resource_size_t start, + resource_size_t end, const char *type) +{ + return 0; +} + +#endif /* CONFIG_FIRMWARE_MEMMAP */ + +#endif /* _LINUX_FIRMWARE_MAP_H */ -- cgit v1.2.3 From a861beb1401d65e3f095fee074c13645ab06490e Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 8 Jul 2008 19:27:22 +0200 Subject: ide: add __ide_default_irq() inline helper Add an __ide_default_irq() inline helper and use it instead of ide_default_irq() in ide-probe.c and ns87415.c (all host drivers except IDE PCI ones always set up hwif->irq, so it is enough to check only for I/O bases 0x1f0 and 0x170). This fixes a post-2.6.25 regression, since an ide_default_irq() define could shadow the ide_default_irq() inline.
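The shadowing problem is easiest to see in a condensed, hypothetical sketch:

--------- 8< --------------------------
/* The generic fallback that this patch removes from the .c files: */
#ifndef ide_default_irq		/* only a macro would stop this */
#define ide_default_irq(base)	0	/* shadows any inline of that name */
#endif

/* An arch that provided ide_default_irq() as an inline function rather
 * than a macro does not trip the #ifndef above, so every call after this
 * point expands to 0 and the real per-arch lookup is silently unused. */
--------- >8 --------------------------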
Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-probe.c | 8 ++------ drivers/ide/pci/ns87415.c | 6 +----- include/linux/ide.h | 15 +++++++++++++++ 3 files changed, 18 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index d27061b39324..26e68b65b7cf 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -1218,16 +1218,12 @@ static void drive_release_dev (struct device *dev) complete(&drive->gendev_rel_comp); } -#ifndef ide_default_irq -#define ide_default_irq(irq) 0 -#endif - static int hwif_init(ide_hwif_t *hwif) { int old_irq; if (!hwif->irq) { - hwif->irq = ide_default_irq(hwif->io_ports.data_addr); + hwif->irq = __ide_default_irq(hwif->io_ports.data_addr); if (!hwif->irq) { printk("%s: DISABLED, NO IRQ\n", hwif->name); return 0; @@ -1257,7 +1253,7 @@ static int hwif_init(ide_hwif_t *hwif) * It failed to initialise. Find the default IRQ for * this port and try that. */ - hwif->irq = ide_default_irq(hwif->io_ports.data_addr); + hwif->irq = __ide_default_irq(hwif->io_ports.data_addr); if (!hwif->irq) { printk("%s: Disabled unable to get IRQ %d.\n", hwif->name, old_irq); diff --git a/drivers/ide/pci/ns87415.c b/drivers/ide/pci/ns87415.c index fec4955f449b..a7a41bb82778 100644 --- a/drivers/ide/pci/ns87415.c +++ b/drivers/ide/pci/ns87415.c @@ -225,10 +225,6 @@ static int ns87415_dma_setup(ide_drive_t *drive) return 1; } -#ifndef ide_default_irq -#define ide_default_irq(irq) 0 -#endif - static void __devinit init_hwif_ns87415 (ide_hwif_t *hwif) { struct pci_dev *dev = to_pci_dev(hwif->dev); @@ -288,7 +284,7 @@ static void __devinit init_hwif_ns87415 (ide_hwif_t *hwif) } if (!using_inta) - hwif->irq = ide_default_irq(hwif->io_ports.data_addr); + hwif->irq = __ide_default_irq(hwif->io_ports.data_addr); else if (!hwif->irq && hwif->mate && hwif->mate->irq) hwif->irq = hwif->mate->irq; /* share IRQ with mate */ diff --git a/include/linux/ide.h b/include/linux/ide.h index 9918772bf274..eddb6daadf4a 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -189,6 +189,21 @@ static inline void ide_std_init_ports(hw_regs_t *hw, hw->io_ports.ctl_addr = ctl_addr; } +/* for IDE PCI controllers in legacy mode, temporary */ +static inline int __ide_default_irq(unsigned long base) +{ + switch (base) { +#ifdef CONFIG_IA64 + case 0x1f0: return isa_irq_to_vector(14); + case 0x170: return isa_irq_to_vector(15); +#else + case 0x1f0: return 14; + case 0x170: return 15; +#endif + } + return 0; +} + #include #if !defined(MAX_HWIFS) || defined(CONFIG_EMBEDDED) -- cgit v1.2.3 From 004a403c2e954734090a69aedc7f4f822bdcc142 Mon Sep 17 00:00:00 2001 From: Loc Ho Date: Wed, 14 May 2008 20:41:47 +0800 Subject: [CRYPTO] hash: Add asynchronous hash support This patch adds asynchronous hash and digest support. 
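To give a feel for the new interface, here is a minimal sketch of computing a digest through the async API (the "sha1" algorithm name and the no-callback handling are illustrative assumptions; the functions used are the ones this patch adds):

--------- 8< --------------------------
static int example_ahash_digest(struct scatterlist *sg, unsigned int len,
				u8 *out)
{
	struct crypto_ahash *tfm;
	struct ahash_request *req;
	int err;

	tfm = crypto_alloc_ahash("sha1", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	req = ahash_request_alloc(tfm, GFP_KERNEL);
	if (!req) {
		crypto_free_ahash(tfm);
		return -ENOMEM;
	}

	/* no completion callback: assume a synchronous provider here */
	ahash_request_set_callback(req, 0, NULL, NULL);
	ahash_request_set_crypt(req, sg, out, len);

	err = crypto_ahash_digest(req); /* may be -EINPROGRESS if async */

	ahash_request_free(req);
	crypto_free_ahash(tfm);
	return err;
}
--------- >8 --------------------------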
Signed-off-by: Loc Ho Signed-off-by: Herbert Xu --- crypto/Makefile | 1 + crypto/ahash.c | 106 +++++++++++++++++++++++++++ crypto/api.c | 8 ++- crypto/digest.c | 81 +++++++++++++++++++++ crypto/hash.c | 102 +++++++++++++++++++++++--- crypto/internal.h | 1 + include/crypto/algapi.h | 36 ++++++++++ include/linux/crypto.h | 187 ++++++++++++++++++++++++++++++++++++++++++++++-- 8 files changed, 507 insertions(+), 15 deletions(-) create mode 100644 crypto/ahash.c (limited to 'include/linux') diff --git a/crypto/Makefile b/crypto/Makefile index 807656b64e02..d4f3ed857df0 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -19,6 +19,7 @@ obj-$(CONFIG_CRYPTO_BLKCIPHER) += crypto_blkcipher.o obj-$(CONFIG_CRYPTO_SEQIV) += seqiv.o crypto_hash-objs := hash.o +crypto_hash-objs += ahash.o obj-$(CONFIG_CRYPTO_HASH) += crypto_hash.o obj-$(CONFIG_CRYPTO_MANAGER) += cryptomgr.o diff --git a/crypto/ahash.c b/crypto/ahash.c new file mode 100644 index 000000000000..a83e035d9a3f --- /dev/null +++ b/crypto/ahash.c @@ -0,0 +1,106 @@ +/* + * Asynchronous Cryptographic Hash operations. + * + * This is the asynchronous version of hash.c with notification of + * completion via a callback. + * + * Copyright (c) 2008 Loc Ho + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +static int ahash_setkey_unaligned(struct crypto_ahash *tfm, const u8 *key, + unsigned int keylen) +{ + struct ahash_alg *ahash = crypto_ahash_alg(tfm); + unsigned long alignmask = crypto_ahash_alignmask(tfm); + int ret; + u8 *buffer, *alignbuffer; + unsigned long absize; + + absize = keylen + alignmask; + buffer = kmalloc(absize, GFP_ATOMIC); + if (!buffer) + return -ENOMEM; + + alignbuffer = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1); + memcpy(alignbuffer, key, keylen); + ret = ahash->setkey(tfm, alignbuffer, keylen); + memset(alignbuffer, 0, keylen); + kfree(buffer); + return ret; +} + +static int ahash_setkey(struct crypto_ahash *tfm, const u8 *key, + unsigned int keylen) +{ + struct ahash_alg *ahash = crypto_ahash_alg(tfm); + unsigned long alignmask = crypto_ahash_alignmask(tfm); + + if ((unsigned long)key & alignmask) + return ahash_setkey_unaligned(tfm, key, keylen); + + return ahash->setkey(tfm, key, keylen); +} + +static unsigned int crypto_ahash_ctxsize(struct crypto_alg *alg, u32 type, + u32 mask) +{ + return alg->cra_ctxsize; +} + +static int crypto_init_ahash_ops(struct crypto_tfm *tfm, u32 type, u32 mask) +{ + struct ahash_alg *alg = &tfm->__crt_alg->cra_ahash; + struct ahash_tfm *crt = &tfm->crt_ahash; + + if (alg->digestsize > crypto_tfm_alg_blocksize(tfm)) + return -EINVAL; + + crt->init = alg->init; + crt->update = alg->update; + crt->final = alg->final; + crt->digest = alg->digest; + crt->setkey = ahash_setkey; + crt->base = __crypto_ahash_cast(tfm); + crt->digestsize = alg->digestsize; + + return 0; +} + +static void crypto_ahash_show(struct seq_file *m, struct crypto_alg *alg) + __attribute__ ((unused)); +static void crypto_ahash_show(struct seq_file *m, struct crypto_alg *alg) +{ + seq_printf(m, "type : ahash\n"); + seq_printf(m, "async : %s\n", alg->cra_flags & CRYPTO_ALG_ASYNC ? 
+ "yes" : "no"); + seq_printf(m, "blocksize : %u\n", alg->cra_blocksize); + seq_printf(m, "digestsize : %u\n", alg->cra_hash.digestsize); +} + +const struct crypto_type crypto_ahash_type = { + .ctxsize = crypto_ahash_ctxsize, + .init = crypto_init_ahash_ops, +#ifdef CONFIG_PROC_FS + .show = crypto_ahash_show, +#endif +}; +EXPORT_SYMBOL_GPL(crypto_ahash_type); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Asynchronous cryptographic hash type"); diff --git a/crypto/api.c b/crypto/api.c index 0a0f41ef255f..d06e33270abe 100644 --- a/crypto/api.c +++ b/crypto/api.c @@ -235,8 +235,12 @@ static int crypto_init_ops(struct crypto_tfm *tfm, u32 type, u32 mask) return crypto_init_cipher_ops(tfm); case CRYPTO_ALG_TYPE_DIGEST: - return crypto_init_digest_ops(tfm); - + if ((mask & CRYPTO_ALG_TYPE_HASH_MASK) != + CRYPTO_ALG_TYPE_HASH_MASK) + return crypto_init_digest_ops_async(tfm); + else + return crypto_init_digest_ops(tfm); + case CRYPTO_ALG_TYPE_COMPRESS: return crypto_init_compress_ops(tfm); diff --git a/crypto/digest.c b/crypto/digest.c index b526cc348b79..025c9aea24ed 100644 --- a/crypto/digest.c +++ b/crypto/digest.c @@ -157,3 +157,84 @@ int crypto_init_digest_ops(struct crypto_tfm *tfm) void crypto_exit_digest_ops(struct crypto_tfm *tfm) { } + +static int digest_async_nosetkey(struct crypto_ahash *tfm_async, const u8 *key, + unsigned int keylen) +{ + crypto_ahash_clear_flags(tfm_async, CRYPTO_TFM_RES_MASK); + return -ENOSYS; +} + +static int digest_async_setkey(struct crypto_ahash *tfm_async, const u8 *key, + unsigned int keylen) +{ + struct crypto_tfm *tfm = crypto_ahash_tfm(tfm_async); + struct digest_alg *dalg = &tfm->__crt_alg->cra_digest; + + crypto_ahash_clear_flags(tfm_async, CRYPTO_TFM_RES_MASK); + return dalg->dia_setkey(tfm, key, keylen); +} + +static int digest_async_init(struct ahash_request *req) +{ + struct crypto_tfm *tfm = req->base.tfm; + struct digest_alg *dalg = &tfm->__crt_alg->cra_digest; + + dalg->dia_init(tfm); + return 0; +} + +static int digest_async_update(struct ahash_request *req) +{ + struct crypto_tfm *tfm = req->base.tfm; + struct hash_desc desc = { + .tfm = __crypto_hash_cast(tfm), + .flags = req->base.flags, + }; + + update(&desc, req->src, req->nbytes); + return 0; +} + +static int digest_async_final(struct ahash_request *req) +{ + struct crypto_tfm *tfm = req->base.tfm; + struct hash_desc desc = { + .tfm = __crypto_hash_cast(tfm), + .flags = req->base.flags, + }; + + final(&desc, req->result); + return 0; +} + +static int digest_async_digest(struct ahash_request *req) +{ + struct crypto_tfm *tfm = req->base.tfm; + struct hash_desc desc = { + .tfm = __crypto_hash_cast(tfm), + .flags = req->base.flags, + }; + + return digest(&desc, req->src, req->nbytes, req->result); +} + +int crypto_init_digest_ops_async(struct crypto_tfm *tfm) +{ + struct ahash_tfm *crt = &tfm->crt_ahash; + struct digest_alg *dalg = &tfm->__crt_alg->cra_digest; + + if (dalg->dia_digestsize > crypto_tfm_alg_blocksize(tfm)) + return -EINVAL; + + crt->init = digest_async_init; + crt->update = digest_async_update; + crt->final = digest_async_final; + crt->digest = digest_async_digest; + crt->setkey = dalg->dia_setkey ? 
digest_async_setkey : + digest_async_nosetkey; + crt->digestsize = dalg->dia_digestsize; + crt->base = __crypto_ahash_cast(tfm); + + return 0; +} diff --git a/crypto/hash.c b/crypto/hash.c index 7dcff671c19b..f9400a014e74 100644 --- a/crypto/hash.c +++ b/crypto/hash.c @@ -59,24 +59,108 @@ static int hash_setkey(struct crypto_hash *crt, const u8 *key, return alg->setkey(crt, key, keylen); } -static int crypto_init_hash_ops(struct crypto_tfm *tfm, u32 type, u32 mask) +static int hash_async_setkey(struct crypto_ahash *tfm_async, const u8 *key, + unsigned int keylen) +{ + struct crypto_tfm *tfm = crypto_ahash_tfm(tfm_async); + struct crypto_hash *tfm_hash = __crypto_hash_cast(tfm); + struct hash_alg *alg = &tfm->__crt_alg->cra_hash; + + return alg->setkey(tfm_hash, key, keylen); +} + +static int hash_async_init(struct ahash_request *req) +{ + struct crypto_tfm *tfm = req->base.tfm; + struct hash_alg *alg = &tfm->__crt_alg->cra_hash; + struct hash_desc desc = { + .tfm = __crypto_hash_cast(tfm), + .flags = req->base.flags, + }; + + return alg->init(&desc); +} + +static int hash_async_update(struct ahash_request *req) +{ + struct crypto_tfm *tfm = req->base.tfm; + struct hash_alg *alg = &tfm->__crt_alg->cra_hash; + struct hash_desc desc = { + .tfm = __crypto_hash_cast(tfm), + .flags = req->base.flags, + }; + + return alg->update(&desc, req->src, req->nbytes); +} + +static int hash_async_final(struct ahash_request *req) +{ + struct crypto_tfm *tfm = req->base.tfm; + struct hash_alg *alg = &tfm->__crt_alg->cra_hash; + struct hash_desc desc = { + .tfm = __crypto_hash_cast(tfm), + .flags = req->base.flags, + }; + + return alg->final(&desc, req->result); +} + +static int hash_async_digest(struct ahash_request *req) +{ + struct crypto_tfm *tfm = req->base.tfm; + struct hash_alg *alg = &tfm->__crt_alg->cra_hash; + struct hash_desc desc = { + .tfm = __crypto_hash_cast(tfm), + .flags = req->base.flags, + }; + + return alg->digest(&desc, req->src, req->nbytes, req->result); +} + +static int crypto_init_hash_ops_async(struct crypto_tfm *tfm) +{ + struct ahash_tfm *crt = &tfm->crt_ahash; + struct hash_alg *alg = &tfm->__crt_alg->cra_hash; + + crt->init = hash_async_init; + crt->update = hash_async_update; + crt->final = hash_async_final; + crt->digest = hash_async_digest; + crt->setkey = hash_async_setkey; + crt->digestsize = alg->digestsize; + crt->base = __crypto_ahash_cast(tfm); + + return 0; +} + +static int crypto_init_hash_ops_sync(struct crypto_tfm *tfm) { struct hash_tfm *crt = &tfm->crt_hash; struct hash_alg *alg = &tfm->__crt_alg->cra_hash; - if (alg->digestsize > crypto_tfm_alg_blocksize(tfm)) - return -EINVAL; - - crt->init = alg->init; - crt->update = alg->update; - crt->final = alg->final; - crt->digest = alg->digest; - crt->setkey = hash_setkey; + crt->init = alg->init; + crt->update = alg->update; + crt->final = alg->final; + crt->digest = alg->digest; + crt->setkey = hash_setkey; crt->digestsize = alg->digestsize; return 0; } +static int crypto_init_hash_ops(struct crypto_tfm *tfm, u32 type, u32 mask) +{ + struct hash_alg *alg = &tfm->__crt_alg->cra_hash; + + if (alg->digestsize > crypto_tfm_alg_blocksize(tfm)) + return -EINVAL; + + if ((mask & CRYPTO_ALG_TYPE_HASH_MASK) != CRYPTO_ALG_TYPE_HASH_MASK) + return crypto_init_hash_ops_async(tfm); + else + return crypto_init_hash_ops_sync(tfm); +} + static void crypto_hash_show(struct seq_file *m, struct crypto_alg *alg) __attribute__ ((unused)); static void crypto_hash_show(struct seq_file *m, struct crypto_alg *alg) diff --git 
a/crypto/internal.h b/crypto/internal.h index 32f4c2145603..683fcb2d91f4 100644 --- a/crypto/internal.h +++ b/crypto/internal.h @@ -86,6 +86,7 @@ struct crypto_alg *__crypto_alg_lookup(const char *name, u32 type, u32 mask); struct crypto_alg *crypto_alg_mod_lookup(const char *name, u32 type, u32 mask); int crypto_init_digest_ops(struct crypto_tfm *tfm); +int crypto_init_digest_ops_async(struct crypto_tfm *tfm); int crypto_init_cipher_ops(struct crypto_tfm *tfm); int crypto_init_compress_ops(struct crypto_tfm *tfm); diff --git a/include/crypto/algapi.h b/include/crypto/algapi.h index 60d06e784be3..fef272a8ceeb 100644 --- a/include/crypto/algapi.h +++ b/include/crypto/algapi.h @@ -98,6 +98,7 @@ extern const struct crypto_type crypto_ablkcipher_type; extern const struct crypto_type crypto_aead_type; extern const struct crypto_type crypto_blkcipher_type; extern const struct crypto_type crypto_hash_type; +extern const struct crypto_type crypto_ahash_type; void crypto_mod_put(struct crypto_alg *alg); @@ -314,5 +315,40 @@ static inline int crypto_requires_sync(u32 type, u32 mask) return (type ^ CRYPTO_ALG_ASYNC) & mask & CRYPTO_ALG_ASYNC; } +static inline void *crypto_ahash_ctx(struct crypto_ahash *tfm) +{ + return crypto_tfm_ctx(&tfm->base); +} + +static inline struct ahash_alg *crypto_ahash_alg( + struct crypto_ahash *tfm) +{ + return &crypto_ahash_tfm(tfm)->__crt_alg->cra_ahash; +} + +static inline int ahash_enqueue_request(struct crypto_queue *queue, + struct ahash_request *request) +{ + return crypto_enqueue_request(queue, &request->base); +} + +static inline struct ahash_request *ahash_dequeue_request( + struct crypto_queue *queue) +{ + return ahash_request_cast(crypto_dequeue_request(queue)); +} + +static inline void *ahash_request_ctx(struct ahash_request *req) +{ + return req->__ctx; +} + +static inline int ahash_tfm_in_queue(struct crypto_queue *queue, + struct crypto_ahash *tfm) +{ + return crypto_tfm_in_queue(queue, crypto_ahash_tfm(tfm)); +} + + #endif /* _CRYPTO_ALGAPI_H */ diff --git a/include/linux/crypto.h b/include/linux/crypto.h index 425824bd49f3..b6efe569128d 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -30,15 +30,17 @@ */ #define CRYPTO_ALG_TYPE_MASK 0x0000000f #define CRYPTO_ALG_TYPE_CIPHER 0x00000001 -#define CRYPTO_ALG_TYPE_DIGEST 0x00000002 -#define CRYPTO_ALG_TYPE_HASH 0x00000003 +#define CRYPTO_ALG_TYPE_COMPRESS 0x00000002 +#define CRYPTO_ALG_TYPE_AEAD 0x00000003 #define CRYPTO_ALG_TYPE_BLKCIPHER 0x00000004 #define CRYPTO_ALG_TYPE_ABLKCIPHER 0x00000005 #define CRYPTO_ALG_TYPE_GIVCIPHER 0x00000006 -#define CRYPTO_ALG_TYPE_COMPRESS 0x00000008 -#define CRYPTO_ALG_TYPE_AEAD 0x00000009 +#define CRYPTO_ALG_TYPE_DIGEST 0x00000008 +#define CRYPTO_ALG_TYPE_HASH 0x00000009 +#define CRYPTO_ALG_TYPE_AHASH 0x0000000a #define CRYPTO_ALG_TYPE_HASH_MASK 0x0000000e +#define CRYPTO_ALG_TYPE_AHASH_MASK 0x0000000c #define CRYPTO_ALG_TYPE_BLKCIPHER_MASK 0x0000000c #define CRYPTO_ALG_LARVAL 0x00000010 @@ -102,6 +104,7 @@ struct crypto_async_request; struct crypto_aead; struct crypto_blkcipher; struct crypto_hash; +struct crypto_ahash; struct crypto_tfm; struct crypto_type; struct aead_givcrypt_request; @@ -131,6 +134,18 @@ struct ablkcipher_request { void *__ctx[] CRYPTO_MINALIGN_ATTR; }; +struct ahash_request { + struct crypto_async_request base; + + void *info; + + unsigned int nbytes; + struct scatterlist *src; + u8 *result; + + void *__ctx[] CRYPTO_MINALIGN_ATTR; +}; + /** * struct aead_request - AEAD request * @base: Common attributes for async crypto requests @@ 
-195,6 +210,17 @@ struct ablkcipher_alg { unsigned int ivsize; }; +struct ahash_alg { + int (*init)(struct ahash_request *req); + int (*update)(struct ahash_request *req); + int (*final)(struct ahash_request *req); + int (*digest)(struct ahash_request *req); + int (*setkey)(struct crypto_ahash *tfm, const u8 *key, + unsigned int keylen); + + unsigned int digestsize; +}; + struct aead_alg { int (*setkey)(struct crypto_aead *tfm, const u8 *key, unsigned int keylen); @@ -272,6 +298,7 @@ struct compress_alg { #define cra_cipher cra_u.cipher #define cra_digest cra_u.digest #define cra_hash cra_u.hash +#define cra_ahash cra_u.ahash #define cra_compress cra_u.compress struct crypto_alg { @@ -298,6 +325,7 @@ struct crypto_alg { struct cipher_alg cipher; struct digest_alg digest; struct hash_alg hash; + struct ahash_alg ahash; struct compress_alg compress; } cra_u; @@ -383,6 +411,19 @@ struct hash_tfm { unsigned int digestsize; }; +struct ahash_tfm { + int (*init)(struct ahash_request *req); + int (*update)(struct ahash_request *req); + int (*final)(struct ahash_request *req); + int (*digest)(struct ahash_request *req); + int (*setkey)(struct crypto_ahash *tfm, const u8 *key, + unsigned int keylen); + + unsigned int digestsize; + struct crypto_ahash *base; + unsigned int reqsize; +}; + struct compress_tfm { int (*cot_compress)(struct crypto_tfm *tfm, const u8 *src, unsigned int slen, @@ -397,6 +438,7 @@ struct compress_tfm { #define crt_blkcipher crt_u.blkcipher #define crt_cipher crt_u.cipher #define crt_hash crt_u.hash +#define crt_ahash crt_u.ahash #define crt_compress crt_u.compress struct crypto_tfm { @@ -409,6 +451,7 @@ struct crypto_tfm { struct blkcipher_tfm blkcipher; struct cipher_tfm cipher; struct hash_tfm hash; + struct ahash_tfm ahash; struct compress_tfm compress; } crt_u; @@ -441,6 +484,10 @@ struct crypto_hash { struct crypto_tfm base; }; +struct crypto_ahash { + struct crypto_tfm base; +}; + enum { CRYPTOA_UNSPEC, CRYPTOA_ALG, @@ -1264,5 +1311,137 @@ static inline int crypto_comp_decompress(struct crypto_comp *tfm, src, slen, dst, dlen); } +static inline struct crypto_ahash *__crypto_ahash_cast(struct crypto_tfm *tfm) +{ + return (struct crypto_ahash *)tfm; +} + +static inline struct crypto_ahash *crypto_alloc_ahash(const char *alg_name, + u32 type, u32 mask) +{ + type &= ~CRYPTO_ALG_TYPE_MASK; + mask &= ~CRYPTO_ALG_TYPE_MASK; + type |= CRYPTO_ALG_TYPE_AHASH; + mask |= CRYPTO_ALG_TYPE_AHASH_MASK; + + return __crypto_ahash_cast(crypto_alloc_base(alg_name, type, mask)); +} + +static inline struct crypto_tfm *crypto_ahash_tfm(struct crypto_ahash *tfm) +{ + return &tfm->base; +} + +static inline void crypto_free_ahash(struct crypto_ahash *tfm) +{ + crypto_free_tfm(crypto_ahash_tfm(tfm)); +} + +static inline unsigned int crypto_ahash_alignmask( + struct crypto_ahash *tfm) +{ + return crypto_tfm_alg_alignmask(crypto_ahash_tfm(tfm)); +} + +static inline struct ahash_tfm *crypto_ahash_crt(struct crypto_ahash *tfm) +{ + return &crypto_ahash_tfm(tfm)->crt_ahash; +} + +static inline unsigned int crypto_ahash_digestsize(struct crypto_ahash *tfm) +{ + return crypto_ahash_crt(tfm)->digestsize; +} + +static inline u32 crypto_ahash_get_flags(struct crypto_ahash *tfm) +{ + return crypto_tfm_get_flags(crypto_ahash_tfm(tfm)); +} + +static inline void crypto_ahash_set_flags(struct crypto_ahash *tfm, u32 flags) +{ + crypto_tfm_set_flags(crypto_ahash_tfm(tfm), flags); +} + +static inline void crypto_ahash_clear_flags(struct crypto_ahash *tfm, u32 flags) +{ + 
crypto_tfm_clear_flags(crypto_ahash_tfm(tfm), flags); +} + +static inline struct crypto_ahash *crypto_ahash_reqtfm( + struct ahash_request *req) +{ + return __crypto_ahash_cast(req->base.tfm); +} + +static inline unsigned int crypto_ahash_reqsize(struct crypto_ahash *tfm) +{ + return crypto_ahash_crt(tfm)->reqsize; +} + +static inline int crypto_ahash_setkey(struct crypto_ahash *tfm, + const u8 *key, unsigned int keylen) +{ + struct ahash_tfm *crt = crypto_ahash_crt(tfm); + + return crt->setkey(crt->base, key, keylen); +} + +static inline int crypto_ahash_digest(struct ahash_request *req) +{ + struct ahash_tfm *crt = crypto_ahash_crt(crypto_ahash_reqtfm(req)); + return crt->digest(req); +} + +static inline void ahash_request_set_tfm(struct ahash_request *req, + struct crypto_ahash *tfm) +{ + req->base.tfm = crypto_ahash_tfm(crypto_ahash_crt(tfm)->base); +} + +static inline struct ahash_request *ahash_request_alloc( + struct crypto_ahash *tfm, gfp_t gfp) +{ + struct ahash_request *req; + + req = kmalloc(sizeof(struct ahash_request) + + crypto_ahash_reqsize(tfm), gfp); + + if (likely(req)) + ahash_request_set_tfm(req, tfm); + + return req; +} + +static inline void ahash_request_free(struct ahash_request *req) +{ + kfree(req); +} + +static inline struct ahash_request *ahash_request_cast( + struct crypto_async_request *req) +{ + return container_of(req, struct ahash_request, base); +} + +static inline void ahash_request_set_callback(struct ahash_request *req, + u32 flags, + crypto_completion_t complete, + void *data) +{ + req->base.complete = complete; + req->base.data = data; + req->base.flags = flags; +} + +static inline void ahash_request_set_crypt(struct ahash_request *req, + struct scatterlist *src, u8 *result, + unsigned int nbytes) +{ + req->src = src; + req->nbytes = nbytes; + req->result = result; +} + #endif /* _LINUX_CRYPTO_H */ -- cgit v1.2.3 From 166247f46a9c866e6f7f7d2212be875fb82212a1 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 7 Jul 2008 20:54:35 +0800 Subject: crypto: hash - Removed vestigial ahash fields The base field in ahash_tfm appears to have been cut-n-pasted from ablkcipher. It isn't needed here at all. Similarly, the info field in ahash_request also appears to have originated from its cipher counter-part and is vestigial. Signed-off-by: Herbert Xu --- crypto/ahash.c | 1 - crypto/digest.c | 1 - crypto/hash.c | 1 - include/linux/crypto.h | 7 ++----- 4 files changed, 2 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/crypto/ahash.c b/crypto/ahash.c index 8c1f918a6878..e6e5906ca80a 100644 --- a/crypto/ahash.c +++ b/crypto/ahash.c @@ -76,7 +76,6 @@ static int crypto_init_ahash_ops(struct crypto_tfm *tfm, u32 type, u32 mask) crt->final = alg->final; crt->digest = alg->digest; crt->setkey = ahash_setkey; - crt->base = __crypto_ahash_cast(tfm); crt->digestsize = alg->digestsize; return 0; diff --git a/crypto/digest.c b/crypto/digest.c index d63d5d96feec..bf332982c50d 100644 --- a/crypto/digest.c +++ b/crypto/digest.c @@ -234,7 +234,6 @@ int crypto_init_digest_ops_async(struct crypto_tfm *tfm) crt->setkey = dalg->dia_setkey ? 
digest_async_setkey : digest_async_nosetkey; crt->digestsize = dalg->dia_digestsize; - crt->base = __crypto_ahash_cast(tfm); return 0; } diff --git a/crypto/hash.c b/crypto/hash.c index 0d7caa9ab748..140a75565f15 100644 --- a/crypto/hash.c +++ b/crypto/hash.c @@ -128,7 +128,6 @@ static int crypto_init_hash_ops_async(struct crypto_tfm *tfm) crt->digest = hash_async_digest; crt->setkey = hash_async_setkey; crt->digestsize = alg->digestsize; - crt->base = __crypto_ahash_cast(tfm); return 0; } diff --git a/include/linux/crypto.h b/include/linux/crypto.h index b6efe569128d..68ef293644d3 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -137,8 +137,6 @@ struct ablkcipher_request { struct ahash_request { struct crypto_async_request base; - void *info; - unsigned int nbytes; struct scatterlist *src; u8 *result; @@ -420,7 +418,6 @@ struct ahash_tfm { unsigned int keylen); unsigned int digestsize; - struct crypto_ahash *base; unsigned int reqsize; }; @@ -1384,7 +1381,7 @@ static inline int crypto_ahash_setkey(struct crypto_ahash *tfm, { struct ahash_tfm *crt = crypto_ahash_crt(tfm); - return crt->setkey(crt->base, key, keylen); + return crt->setkey(tfm, key, keylen); } static inline int crypto_ahash_digest(struct ahash_request *req) @@ -1396,7 +1393,7 @@ static inline int crypto_ahash_digest(struct ahash_request *req) static inline void ahash_request_set_tfm(struct ahash_request *req, struct crypto_ahash *tfm) { - req->base.tfm = crypto_ahash_tfm(crypto_ahash_crt(tfm)->base); + req->base.tfm = crypto_ahash_tfm(tfm); } static inline struct ahash_request *ahash_request_alloc( -- cgit v1.2.3 From 18e33e6d5cc0495826f5245777cd267732815e01 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 10 Jul 2008 16:01:22 +0800 Subject: crypto: hash - Move ahash functions into crypto/hash.h All new crypto interfaces should go into individual files as much as possible in order to ensure that crypto.h does not collapse under its own weight. This patch moves the ahash code into crypto/hash.h and crypto/internal/hash.h respectively. Signed-off-by: Herbert Xu --- crypto/cryptd.c | 1 + crypto/digest.c | 1 + crypto/hash.c | 1 + crypto/tcrypt.c | 1 + include/crypto/algapi.h | 36 ---------- include/crypto/hash.h | 154 +++++++++++++++++++++++++++++++++++++++++ include/crypto/internal/hash.h | 37 ++++++++++ include/linux/crypto.h | 136 ------------------------------------ 8 files changed, 195 insertions(+), 172 deletions(-) create mode 100644 include/crypto/hash.h (limited to 'include/linux') diff --git a/crypto/cryptd.c b/crypto/cryptd.c index d3ecd7e73b7e..d29e06b350ff 100644 --- a/crypto/cryptd.c +++ b/crypto/cryptd.c @@ -11,6 +11,7 @@ */ #include +#include #include #include #include diff --git a/crypto/digest.c b/crypto/digest.c index bf332982c50d..ac0919460d14 100644 --- a/crypto/digest.c +++ b/crypto/digest.c @@ -12,6 +12,7 @@ * */ +#include #include #include #include diff --git a/crypto/hash.c b/crypto/hash.c index 140a75565f15..cb86b19fd105 100644 --- a/crypto/hash.c +++ b/crypto/hash.c @@ -9,6 +9,7 @@ * any later version. 
*/ +#include #include #include #include diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c index 87f08f9b0904..59821a22d752 100644 --- a/crypto/tcrypt.c +++ b/crypto/tcrypt.c @@ -15,6 +15,7 @@ * */ +#include #include #include #include diff --git a/include/crypto/algapi.h b/include/crypto/algapi.h index fef272a8ceeb..60d06e784be3 100644 --- a/include/crypto/algapi.h +++ b/include/crypto/algapi.h @@ -98,7 +98,6 @@ extern const struct crypto_type crypto_ablkcipher_type; extern const struct crypto_type crypto_aead_type; extern const struct crypto_type crypto_blkcipher_type; extern const struct crypto_type crypto_hash_type; -extern const struct crypto_type crypto_ahash_type; void crypto_mod_put(struct crypto_alg *alg); @@ -315,40 +314,5 @@ static inline int crypto_requires_sync(u32 type, u32 mask) return (type ^ CRYPTO_ALG_ASYNC) & mask & CRYPTO_ALG_ASYNC; } -static inline void *crypto_ahash_ctx(struct crypto_ahash *tfm) -{ - return crypto_tfm_ctx(&tfm->base); -} - -static inline struct ahash_alg *crypto_ahash_alg( - struct crypto_ahash *tfm) -{ - return &crypto_ahash_tfm(tfm)->__crt_alg->cra_ahash; -} - -static inline int ahash_enqueue_request(struct crypto_queue *queue, - struct ahash_request *request) -{ - return crypto_enqueue_request(queue, &request->base); -} - -static inline struct ahash_request *ahash_dequeue_request( - struct crypto_queue *queue) -{ - return ahash_request_cast(crypto_dequeue_request(queue)); -} - -static inline void *ahash_request_ctx(struct ahash_request *req) -{ - return req->__ctx; -} - -static inline int ahash_tfm_in_queue(struct crypto_queue *queue, - struct crypto_ahash *tfm) -{ - return crypto_tfm_in_queue(queue, crypto_ahash_tfm(tfm)); -} - - #endif /* _CRYPTO_ALGAPI_H */ diff --git a/include/crypto/hash.h b/include/crypto/hash.h new file mode 100644 index 000000000000..d12498ec8a4e --- /dev/null +++ b/include/crypto/hash.h @@ -0,0 +1,154 @@ +/* + * Hash: Hash algorithms under the crypto API + * + * Copyright (c) 2008 Herbert Xu + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
+ * + */ + +#ifndef _CRYPTO_HASH_H +#define _CRYPTO_HASH_H + +#include + +struct crypto_ahash { + struct crypto_tfm base; +}; + +static inline struct crypto_ahash *__crypto_ahash_cast(struct crypto_tfm *tfm) +{ + return (struct crypto_ahash *)tfm; +} + +static inline struct crypto_ahash *crypto_alloc_ahash(const char *alg_name, + u32 type, u32 mask) +{ + type &= ~CRYPTO_ALG_TYPE_MASK; + mask &= ~CRYPTO_ALG_TYPE_MASK; + type |= CRYPTO_ALG_TYPE_AHASH; + mask |= CRYPTO_ALG_TYPE_AHASH_MASK; + + return __crypto_ahash_cast(crypto_alloc_base(alg_name, type, mask)); +} + +static inline struct crypto_tfm *crypto_ahash_tfm(struct crypto_ahash *tfm) +{ + return &tfm->base; +} + +static inline void crypto_free_ahash(struct crypto_ahash *tfm) +{ + crypto_free_tfm(crypto_ahash_tfm(tfm)); +} + +static inline unsigned int crypto_ahash_alignmask( + struct crypto_ahash *tfm) +{ + return crypto_tfm_alg_alignmask(crypto_ahash_tfm(tfm)); +} + +static inline struct ahash_tfm *crypto_ahash_crt(struct crypto_ahash *tfm) +{ + return &crypto_ahash_tfm(tfm)->crt_ahash; +} + +static inline unsigned int crypto_ahash_digestsize(struct crypto_ahash *tfm) +{ + return crypto_ahash_crt(tfm)->digestsize; +} + +static inline u32 crypto_ahash_get_flags(struct crypto_ahash *tfm) +{ + return crypto_tfm_get_flags(crypto_ahash_tfm(tfm)); +} + +static inline void crypto_ahash_set_flags(struct crypto_ahash *tfm, u32 flags) +{ + crypto_tfm_set_flags(crypto_ahash_tfm(tfm), flags); +} + +static inline void crypto_ahash_clear_flags(struct crypto_ahash *tfm, u32 flags) +{ + crypto_tfm_clear_flags(crypto_ahash_tfm(tfm), flags); +} + +static inline struct crypto_ahash *crypto_ahash_reqtfm( + struct ahash_request *req) +{ + return __crypto_ahash_cast(req->base.tfm); +} + +static inline unsigned int crypto_ahash_reqsize(struct crypto_ahash *tfm) +{ + return crypto_ahash_crt(tfm)->reqsize; +} + +static inline int crypto_ahash_setkey(struct crypto_ahash *tfm, + const u8 *key, unsigned int keylen) +{ + struct ahash_tfm *crt = crypto_ahash_crt(tfm); + + return crt->setkey(tfm, key, keylen); +} + +static inline int crypto_ahash_digest(struct ahash_request *req) +{ + struct ahash_tfm *crt = crypto_ahash_crt(crypto_ahash_reqtfm(req)); + return crt->digest(req); +} + +static inline void ahash_request_set_tfm(struct ahash_request *req, + struct crypto_ahash *tfm) +{ + req->base.tfm = crypto_ahash_tfm(tfm); +} + +static inline struct ahash_request *ahash_request_alloc( + struct crypto_ahash *tfm, gfp_t gfp) +{ + struct ahash_request *req; + + req = kmalloc(sizeof(struct ahash_request) + + crypto_ahash_reqsize(tfm), gfp); + + if (likely(req)) + ahash_request_set_tfm(req, tfm); + + return req; +} + +static inline void ahash_request_free(struct ahash_request *req) +{ + kfree(req); +} + +static inline struct ahash_request *ahash_request_cast( + struct crypto_async_request *req) +{ + return container_of(req, struct ahash_request, base); +} + +static inline void ahash_request_set_callback(struct ahash_request *req, + u32 flags, + crypto_completion_t complete, + void *data) +{ + req->base.complete = complete; + req->base.data = data; + req->base.flags = flags; +} + +static inline void ahash_request_set_crypt(struct ahash_request *req, + struct scatterlist *src, u8 *result, + unsigned int nbytes) +{ + req->src = src; + req->nbytes = nbytes; + req->result = result; +} + +#endif /* _CRYPTO_HASH_H */ diff --git a/include/crypto/internal/hash.h b/include/crypto/internal/hash.h index 93ac42280221..917ae57bad4a 100644 --- a/include/crypto/internal/hash.h +++ 
b/include/crypto/internal/hash.h @@ -14,6 +14,7 @@ #define _CRYPTO_INTERNAL_HASH_H #include +#include struct ahash_request; struct scatterlist; @@ -33,9 +34,45 @@ struct crypto_hash_walk { unsigned int flags; }; +extern const struct crypto_type crypto_ahash_type; + int crypto_hash_walk_done(struct crypto_hash_walk *walk, int err); int crypto_hash_walk_first(struct ahash_request *req, struct crypto_hash_walk *walk); +static inline void *crypto_ahash_ctx(struct crypto_ahash *tfm) +{ + return crypto_tfm_ctx(&tfm->base); +} + +static inline struct ahash_alg *crypto_ahash_alg( + struct crypto_ahash *tfm) +{ + return &crypto_ahash_tfm(tfm)->__crt_alg->cra_ahash; +} + +static inline int ahash_enqueue_request(struct crypto_queue *queue, + struct ahash_request *request) +{ + return crypto_enqueue_request(queue, &request->base); +} + +static inline struct ahash_request *ahash_dequeue_request( + struct crypto_queue *queue) +{ + return ahash_request_cast(crypto_dequeue_request(queue)); +} + +static inline void *ahash_request_ctx(struct ahash_request *req) +{ + return req->__ctx; +} + +static inline int ahash_tfm_in_queue(struct crypto_queue *queue, + struct crypto_ahash *tfm) +{ + return crypto_tfm_in_queue(queue, crypto_ahash_tfm(tfm)); +} + #endif /* _CRYPTO_INTERNAL_HASH_H */ diff --git a/include/linux/crypto.h b/include/linux/crypto.h index 68ef293644d3..c43dc47fdf75 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -481,10 +481,6 @@ struct crypto_hash { struct crypto_tfm base; }; -struct crypto_ahash { - struct crypto_tfm base; -}; - enum { CRYPTOA_UNSPEC, CRYPTOA_ALG, @@ -1308,137 +1304,5 @@ static inline int crypto_comp_decompress(struct crypto_comp *tfm, src, slen, dst, dlen); } -static inline struct crypto_ahash *__crypto_ahash_cast(struct crypto_tfm *tfm) -{ - return (struct crypto_ahash *)tfm; -} - -static inline struct crypto_ahash *crypto_alloc_ahash(const char *alg_name, - u32 type, u32 mask) -{ - type &= ~CRYPTO_ALG_TYPE_MASK; - mask &= ~CRYPTO_ALG_TYPE_MASK; - type |= CRYPTO_ALG_TYPE_AHASH; - mask |= CRYPTO_ALG_TYPE_AHASH_MASK; - - return __crypto_ahash_cast(crypto_alloc_base(alg_name, type, mask)); -} - -static inline struct crypto_tfm *crypto_ahash_tfm(struct crypto_ahash *tfm) -{ - return &tfm->base; -} - -static inline void crypto_free_ahash(struct crypto_ahash *tfm) -{ - crypto_free_tfm(crypto_ahash_tfm(tfm)); -} - -static inline unsigned int crypto_ahash_alignmask( - struct crypto_ahash *tfm) -{ - return crypto_tfm_alg_alignmask(crypto_ahash_tfm(tfm)); -} - -static inline struct ahash_tfm *crypto_ahash_crt(struct crypto_ahash *tfm) -{ - return &crypto_ahash_tfm(tfm)->crt_ahash; -} - -static inline unsigned int crypto_ahash_digestsize(struct crypto_ahash *tfm) -{ - return crypto_ahash_crt(tfm)->digestsize; -} - -static inline u32 crypto_ahash_get_flags(struct crypto_ahash *tfm) -{ - return crypto_tfm_get_flags(crypto_ahash_tfm(tfm)); -} - -static inline void crypto_ahash_set_flags(struct crypto_ahash *tfm, u32 flags) -{ - crypto_tfm_set_flags(crypto_ahash_tfm(tfm), flags); -} - -static inline void crypto_ahash_clear_flags(struct crypto_ahash *tfm, u32 flags) -{ - crypto_tfm_clear_flags(crypto_ahash_tfm(tfm), flags); -} - -static inline struct crypto_ahash *crypto_ahash_reqtfm( - struct ahash_request *req) -{ - return __crypto_ahash_cast(req->base.tfm); -} - -static inline unsigned int crypto_ahash_reqsize(struct crypto_ahash *tfm) -{ - return crypto_ahash_crt(tfm)->reqsize; -} - -static inline int crypto_ahash_setkey(struct crypto_ahash *tfm, - const u8 *key, 
unsigned int keylen) -{ - struct ahash_tfm *crt = crypto_ahash_crt(tfm); - - return crt->setkey(tfm, key, keylen); -} - -static inline int crypto_ahash_digest(struct ahash_request *req) -{ - struct ahash_tfm *crt = crypto_ahash_crt(crypto_ahash_reqtfm(req)); - return crt->digest(req); -} - -static inline void ahash_request_set_tfm(struct ahash_request *req, - struct crypto_ahash *tfm) -{ - req->base.tfm = crypto_ahash_tfm(tfm); -} - -static inline struct ahash_request *ahash_request_alloc( - struct crypto_ahash *tfm, gfp_t gfp) -{ - struct ahash_request *req; - - req = kmalloc(sizeof(struct ahash_request) + - crypto_ahash_reqsize(tfm), gfp); - - if (likely(req)) - ahash_request_set_tfm(req, tfm); - - return req; -} - -static inline void ahash_request_free(struct ahash_request *req) -{ - kfree(req); -} - -static inline struct ahash_request *ahash_request_cast( - struct crypto_async_request *req) -{ - return container_of(req, struct ahash_request, base); -} - -static inline void ahash_request_set_callback(struct ahash_request *req, - u32 flags, - crypto_completion_t complete, - void *data) -{ - req->base.complete = complete; - req->base.data = data; - req->base.flags = flags; -} - -static inline void ahash_request_set_crypt(struct ahash_request *req, - struct scatterlist *src, u8 *result, - unsigned int nbytes) -{ - req->src = src; - req->nbytes = nbytes; - req->result = result; -} - #endif /* _LINUX_CRYPTO_H */ -- cgit v1.2.3 From b7a39bd0afc4021e8ad2b1189e884551e147427f Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 23 May 2008 18:38:49 +0100 Subject: firmware: make fw->data const In preparation for supporting firmware files linked into the static kernel, make fw->data const to ensure that users aren't modifying it (so that we can pass a pointer to the original in-kernel copy, rather than having to copy it). Signed-off-by: David Woodhouse --- drivers/base/firmware_class.c | 2 +- include/linux/firmware.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/firmware_class.c b/drivers/base/firmware_class.c index 9fd4a8534146..264b3a2cd860 100644 --- a/drivers/base/firmware_class.c +++ b/drivers/base/firmware_class.c @@ -257,7 +257,7 @@ firmware_data_write(struct kobject *kobj, struct bin_attribute *bin_attr, if (retval) goto out; - memcpy(fw->data + offset, buffer, count); + memcpy((u8 *)fw->data + offset, buffer, count); fw->size = max_t(size_t, offset + count, fw->size); retval = count; diff --git a/include/linux/firmware.h b/include/linux/firmware.h index 6c7eff2ebada..88718d60153c 100644 --- a/include/linux/firmware.h +++ b/include/linux/firmware.h @@ -8,7 +8,7 @@ struct firmware { size_t size; - u8 *data; + const u8 *data; }; struct device; -- cgit v1.2.3 From 5658c769443d543728b6c5c673dffc2df8676317 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 23 May 2008 13:52:42 +0100 Subject: firmware: allow firmware files to be built into kernel image Some drivers have their own hacks to bypass the kernel's firmware loader and build their firmware into the kernel; this renders those unnecessary. Other drivers don't use the firmware loader at all, because they always want the firmware to be available. This allows them to start using the firmware loader. A third set of drivers already use the firmware loader, but can't be used without help from userspace, which sometimes requires an initrd. This allows them to work in a static kernel. 
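For drivers, using it is a one-liner; a hedged sketch (the blob name and contents are invented for illustration):

--------- 8< --------------------------
/* Hypothetical firmware image, normally generated from a vendor blob. */
static u8 example_fw_blob[] = { 0x12, 0x34, 0x56, 0x78 };

DECLARE_BUILTIN_FIRMWARE("example/fw.bin", example_fw_blob);

/* request_firmware(&fw, "example/fw.bin", dev) now succeeds without any
 * userspace helper and hands back the in-kernel copy. */
--------- >8 --------------------------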
Signed-off-by: David Woodhouse --- drivers/base/firmware_class.c | 33 +++++++++++++++++++++++++++++++-- include/asm-generic/vmlinux.lds.h | 7 +++++++ include/linux/firmware.h | 21 +++++++++++++++++++++ 3 files changed, 59 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/firmware_class.c b/drivers/base/firmware_class.c index 264b3a2cd860..b0be1d18fee2 100644 --- a/drivers/base/firmware_class.c +++ b/drivers/base/firmware_class.c @@ -49,6 +49,14 @@ struct firmware_priv { struct timer_list timeout; }; +#ifdef CONFIG_FW_LOADER +extern struct builtin_fw __start_builtin_fw[]; +extern struct builtin_fw __end_builtin_fw[]; +#else /* Module case. Avoid ifdefs later; it'll all optimise out */ +static struct builtin_fw *__start_builtin_fw; +static struct builtin_fw *__end_builtin_fw; +#endif + static void fw_load_abort(struct firmware_priv *fw_priv) { @@ -391,13 +399,12 @@ _request_firmware(const struct firmware **firmware_p, const char *name, struct device *f_dev; struct firmware_priv *fw_priv; struct firmware *firmware; + struct builtin_fw *builtin; int retval; if (!firmware_p) return -EINVAL; - printk(KERN_INFO "firmware: requesting %s\n", name); - *firmware_p = firmware = kzalloc(sizeof(*firmware), GFP_KERNEL); if (!firmware) { printk(KERN_ERR "%s: kmalloc(struct firmware) failed\n", @@ -406,6 +413,20 @@ _request_firmware(const struct firmware **firmware_p, const char *name, goto out; } + for (builtin = __start_builtin_fw; builtin != __end_builtin_fw; + builtin++) { + if (strcmp(name, builtin->name)) + continue; + printk(KERN_INFO "firmware: using built-in firmware %s\n", + name); + firmware->size = builtin->size; + firmware->data = builtin->data; + return 0; + } + + if (uevent) + printk(KERN_INFO "firmware: requesting %s\n", name); + retval = fw_setup_device(firmware, &f_dev, name, device, uevent); if (retval) goto error_kfree_fw; @@ -473,8 +494,16 @@ request_firmware(const struct firmware **firmware_p, const char *name, void release_firmware(const struct firmware *fw) { + struct builtin_fw *builtin; + if (fw) { + for (builtin = __start_builtin_fw; builtin != __end_builtin_fw; + builtin++) { + if (fw->data == builtin->data) + goto free_fw; + } vfree(fw->data); + free_fw: kfree(fw); } } diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index f054778e916c..8d71a40625f3 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -86,6 +86,13 @@ VMLINUX_SYMBOL(__end_pci_fixups_resume) = .; \ } \ \ + /* Built-in firmware blobs */ \ + .builtin_fw : AT(ADDR(.builtin_fw) - LOAD_OFFSET) { \ + VMLINUX_SYMBOL(__start_builtin_fw) = .; \ + *(.builtin_fw) \ + VMLINUX_SYMBOL(__end_builtin_fw) = .; \ + } \ + \ /* RapidIO route ops */ \ .rio_route : AT(ADDR(.rio_route) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start_rio_route_ops) = .; \ diff --git a/include/linux/firmware.h b/include/linux/firmware.h index 88718d60153c..c8ecf5b2a207 100644 --- a/include/linux/firmware.h +++ b/include/linux/firmware.h @@ -1,7 +1,10 @@ #ifndef _LINUX_FIRMWARE_H #define _LINUX_FIRMWARE_H + #include #include +#include + #define FIRMWARE_NAME_MAX 30 #define FW_ACTION_NOHOTPLUG 0 #define FW_ACTION_HOTPLUG 1 @@ -13,6 +16,24 @@ struct firmware { struct device; +struct builtin_fw { + char *name; + void *data; + unsigned long size; +}; + +/* We have to play tricks here much like stringify() to get the + __COUNTER__ macro to be expanded as we want it */ +#define __fw_concat1(x, y) x##y +#define __fw_concat(x, y) __fw_concat1(x, y) + +#define 
DECLARE_BUILTIN_FIRMWARE(name, blob) \ + DECLARE_BUILTIN_FIRMWARE_SIZE(name, &(blob), sizeof(blob)) + +#define DECLARE_BUILTIN_FIRMWARE_SIZE(name, blob, size) \ + static const struct builtin_fw __fw_concat(__builtin_fw,__COUNTER__) \ + __used __section(.builtin_fw) = { name, blob, size } + #if defined(CONFIG_FW_LOADER) || (defined(CONFIG_FW_LOADER_MODULE) && defined(MODULE)) int request_firmware(const struct firmware **fw, const char *name, struct device *device); -- cgit v1.2.3 From bacfe09dd7545467965e8d8f1eab20bc62dce00d Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 30 May 2008 13:57:27 +0300 Subject: ihex.h: binary representation of ihex records Some devices need their firmware as a set of {address, len, data...} records in some specific order rather than a simple blob. The normal way of doing this kind of thing is 'ihex', which is a text format and not entirely suitable for use in the kernel. This provides a binary representation which is very similar, but much more compact -- and a helper routine to skip to the next record, because the alignment constraints mean that everybody will screw it up for themselves otherwise. Also a helper function which can verify that a 'struct firmware' contains a valid set of ihex records, and that following them won't run off the end of the loaded data. Signed-off-by: David Woodhouse --- include/linux/ihex.h | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 include/linux/ihex.h (limited to 'include/linux') diff --git a/include/linux/ihex.h b/include/linux/ihex.h new file mode 100644 index 000000000000..df89edd890ae --- /dev/null +++ b/include/linux/ihex.h @@ -0,0 +1,50 @@ +/* + * Compact binary representation of ihex records. Some devices need their + * firmware loaded in strange orders rather than a single big blob, but + * actually parsing ihex-as-text within the kernel seems silly. Thus,... + */ + +#ifndef __LINUX_IHEX_H__ +#define __LINUX_IHEX_H__ + +#include +#include + +/* Intel HEX files actually limit the length to 256 bytes, but we have + drivers which would benefit from using separate records which are + longer than that, so we extend to 16 bits of length */ +struct ihex_binrec { + __be32 addr; + __be16 len; + uint8_t data[0]; +} __attribute__((aligned(4))); + +/* Find the next record, taking into account the 4-byte alignment */ +static inline const struct ihex_binrec * +ihex_next_binrec(const struct ihex_binrec *rec) +{ + int next = ((be16_to_cpu(rec->len) + 5) & ~3) - 2; + rec = (void *)&rec->data[next]; + + return be16_to_cpu(rec->len) ? rec : NULL; +} + +/* Check that ihex_next_binrec() won't take us off the end of the image... */ +static inline int ihex_validate_fw(const struct firmware *fw) +{ + const struct ihex_binrec *rec; + size_t ofs = 0; + + while (ofs <= fw->size - sizeof(*rec)) { + rec = (void *)&fw->data[ofs]; + + /* Zero length marks end of records */ + if (!be16_to_cpu(rec->len)) + return 0; + + /* Point to next record... */ + ofs += (sizeof(*rec) + be16_to_cpu(rec->len) + 3) & ~3; + } + return -EINVAL; +} +#endif /* __LINUX_IHEX_H__ */ -- cgit v1.2.3 From f1485f3deb89e6ae10c4d34662ec9e692855ab5d Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 31 May 2008 15:20:37 +0300 Subject: ihex: request_ihex_firmware() function to load and validate firmware Provide a helper to load the file and validate it in one call, to simplify error handling in the drivers which are going to use it. 
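A driver's download path can then be as simple as this sketch (write_block() is a hypothetical device-specific helper, not part of this patch):

--------- 8< --------------------------
static int example_load_records(struct device *dev)
{
	const struct firmware *fw;
	const struct ihex_binrec *rec;
	int err;

	err = request_ihex_firmware(&fw, "example/fw.ihex", dev);
	if (err)
		return err;	/* already validated: safe to walk */

	for (rec = (const void *)fw->data; rec; rec = ihex_next_binrec(rec))
		write_block(dev, be32_to_cpu(rec->addr), rec->data,
			    be16_to_cpu(rec->len));

	release_firmware(fw);
	return 0;
}
--------- >8 --------------------------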
Signed-off-by: David Woodhouse --- include/linux/ihex.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ihex.h b/include/linux/ihex.h index df89edd890ae..2baace2788a7 100644 --- a/include/linux/ihex.h +++ b/include/linux/ihex.h @@ -9,6 +9,7 @@ #include #include +#include /* Intel HEX files actually limit the length to 256 bytes, but we have drivers which would benefit from using separate records which are @@ -47,4 +48,27 @@ static inline int ihex_validate_fw(const struct firmware *fw) } return -EINVAL; } + +/* Request firmware and validate it so that we can trust we won't + * run off the end while reading records... */ +static inline int request_ihex_firmware(const struct firmware **fw, + const char *fw_name, + struct device *dev) +{ + const struct firmware *lfw; + int ret; + + ret = request_firmware(&lfw, fw_name, dev); + if (ret) + return ret; + ret = ihex_validate_fw(lfw); + if (ret) { + dev_err(dev, "Firmware \"%s\" not valid IHEX records\n", + fw_name); + release_firmware(lfw); + return ret; + } + *fw = lfw; + return 0; +} #endif /* __LINUX_IHEX_H__ */ -- cgit v1.2.3 From ccf9b3b83d0e56fbf20c00a08b15031ce13204a7 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Thu, 10 Jul 2008 16:55:37 -0700 Subject: xfrm: Add a XFRM_STATE_AF_UNSPEC flag to xfrm_usersa_info Add a XFRM_STATE_AF_UNSPEC flag to handle the AF_UNSPEC behavior for the selector family. Userspace applications can set this flag to leave the selector family of the xfrm_state unspecified. This can be used to handle inter-family tunnels if the selector is not set from userspace. Signed-off-by: Steffen Klassert Acked-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/xfrm.h | 1 + net/xfrm/xfrm_user.c | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h index 2ca6bae88721..fb0c215a3051 100644 --- a/include/linux/xfrm.h +++ b/include/linux/xfrm.h @@ -339,6 +339,7 @@ struct xfrm_usersa_info { #define XFRM_STATE_NOPMTUDISC 4 #define XFRM_STATE_WILDRECV 8 #define XFRM_STATE_ICMP 16 +#define XFRM_STATE_AF_UNSPEC 32 }; struct xfrm_usersa_id { diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index b976d9ed10e4..04c41504f84c 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -277,9 +277,8 @@ static void copy_from_user_state(struct xfrm_state *x, struct xfrm_usersa_info * memcpy(&x->props.saddr, &p->saddr, sizeof(x->props.saddr)); x->props.flags = p->flags; - if (!x->sel.family) + if (!x->sel.family && !(p->flags & XFRM_STATE_AF_UNSPEC)) x->sel.family = p->family; - } /* -- cgit v1.2.3 From a2bb6a3d85ef3124cd336403a95abc0540d3fbe2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 10 Jul 2008 20:58:15 -0400 Subject: ftrace: add ftrace_kill_atomic It has been suggested that I add a way to disable the function tracer on an oops. This code adds ftrace_kill_atomic(). It is not meant to be used in normal situations. It will disable the ftrace tracer, but will not perform the nice shutdown that requires scheduling.
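To illustrate where this is expected to be called from (the notifier wiring below is a hypothetical sketch, not code from this series), panic paths can use the new hook because it never schedules:

	#include <linux/ftrace.h>
	#include <linux/kernel.h>
	#include <linux/notifier.h>

	static int example_panic_event(struct notifier_block *nb,
				       unsigned long event, void *unused)
	{
		/* Atomic context, possibly with interrupts off:
		 * ftrace_kill() is not safe here, the atomic variant is. */
		ftrace_kill_atomic();
		return NOTIFY_DONE;
	}

	static struct notifier_block example_panic_nb = {
		.notifier_call = example_panic_event,
	};

	/* registered at init time with:
	 * atomic_notifier_chain_register(&panic_notifier_list,
	 *				  &example_panic_nb);
	 */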
Signed-off-by: Steven Rostedt Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Signed-off-by: Ingo Molnar --- include/linux/ftrace.h | 1 + kernel/trace/ftrace.c | 15 +++++++++++++++ 2 files changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 3121b95443d9..f368d041e02d 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -89,6 +89,7 @@ void ftrace_enable_daemon(void); /* totally disable ftrace - can not re-enable after this */ void ftrace_kill(void); +void ftrace_kill_atomic(void); static inline void tracer_disable(void) { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0f271c45cd02..1359632668a4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1601,6 +1601,21 @@ core_initcall(ftrace_dynamic_init); # define ftrace_force_shutdown() do { } while (0) #endif /* CONFIG_DYNAMIC_FTRACE */ +/** + * ftrace_kill_atomic - kill ftrace from critical sections + * + * This function should be used by panic code. It stops ftrace + * but in a not so nice way. If you need to simply kill ftrace + * from a non-atomic section, use ftrace_kill. + */ +void ftrace_kill_atomic(void) +{ + ftrace_disabled = 1; + ftrace_enabled = 0; + ftraced_suspend = -1; + clear_ftrace_function(); +} + /** * ftrace_kill - totally shutdown ftrace * -- cgit v1.2.3 From af52a90a14cdaa54ecbfb6e6982abb13466a4b56 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 7 Jul 2008 14:16:52 -0400 Subject: sched_clock: stop maximum check on NO HZ Working with ftrace I would get large jumps of 11 millisecs or more with the clock tracer. This killed the latency timings of ftrace and also caused the irqoff self tests to fail. What was happening is that with NO_HZ, idle would stop the jiffy counter, and until the jiffy counter was updated again sched_clock would have a bad jiffies delta to compare against the gtod-based maximum. The jiffies would stop and the last sched_tick would record the last gtod. On wakeup, the sched clock update would take the gtod + delta jiffies (with the delta being zero) and compare it to the TSC. The TSC would have correctly (with a stable TSC) moved forward several jiffies. But because the jiffies had not been updated yet, the clock would be prevented from moving forward because it would appear that the TSC had jumped too far ahead. The clock would then virtually stop until the jiffies were updated. The next sched clock update would then see that the clock was very much behind, since the jiffies delta was now correct, and would jump the clock forward by several jiffies. This caused ftrace to report several milliseconds of interrupts-off latency at every resume from NO_HZ idle. This patch adds hooks into the nohz code to disable the checking of the maximum clock update when nohz is in effect. It resumes the max check when nohz has updated the jiffies again.
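To make the arithmetic concrete (a worked illustration assuming HZ=1000, so TICK_NSEC is roughly 1 ms; the clamp line is quoted from __update_sched_clock() in the hunk below):

	/* the existing clamp in __update_sched_clock(): */
	max_clock = scd->tick_gtod + (2 + delta_jiffies) * TICK_NSEC;

	/*
	 * While the tick is stopped, delta_jiffies stays 0, so the clock
	 * may only advance ~2 ms past the last recorded tick_gtod, even
	 * if a stable TSC has legitimately moved 11 ms. The new
	 * sched_clock_tick_stop()/sched_clock_tick_start() hooks make
	 * the clamp a no-op for the duration of the stopped tick.
	 */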
Signed-off-by: Steven Rostedt Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Signed-off-by: Ingo Molnar --- include/linux/sched.h | 17 ++++++++++++++++- kernel/sched_clock.c | 39 ++++++++++++++++++++++++++++++++++++++- kernel/time/tick-sched.c | 2 ++ 3 files changed, 56 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index c5d3f847ca8d..33a8f42041fa 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1573,13 +1573,28 @@ static inline void sched_clock_idle_sleep_event(void) static inline void sched_clock_idle_wakeup_event(u64 delta_ns) { } -#else + +#ifdef CONFIG_NO_HZ +static inline void sched_clock_tick_stop(int cpu) +{ +} + +static inline void sched_clock_tick_start(int cpu) +{ +} +#endif + +#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ extern void sched_clock_init(void); extern u64 sched_clock_cpu(int cpu); extern void sched_clock_tick(void); extern void sched_clock_idle_sleep_event(void); extern void sched_clock_idle_wakeup_event(u64 delta_ns); +#ifdef CONFIG_NO_HZ +extern void sched_clock_tick_stop(int cpu); +extern void sched_clock_tick_start(int cpu); #endif +#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ /* * For kernel-internal use: high-speed (but slightly incorrect) per-cpu diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 42b81fa38cbd..97159e225a77 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -45,6 +45,9 @@ struct sched_clock_data { u64 tick_raw; u64 tick_gtod; u64 clock; +#ifdef CONFIG_NO_HZ + int check_max; +#endif }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); @@ -76,11 +79,45 @@ void sched_clock_init(void) scd->tick_raw = 0; scd->tick_gtod = ktime_now; scd->clock = ktime_now; +#ifdef CONFIG_NO_HZ + scd->check_max = 1; +#endif } sched_clock_running = 1; } +#ifdef CONFIG_NO_HZ +/* + * The dynamic ticks makes the delta jiffies inaccurate. This + * prevents us from checking the maximum time update. + * Disable the maximum check during stopped ticks. 
+ */ +void sched_clock_tick_stop(int cpu) +{ + struct sched_clock_data *scd = cpu_sdc(cpu); + + scd->check_max = 0; +} + +void sched_clock_tick_start(int cpu) +{ + struct sched_clock_data *scd = cpu_sdc(cpu); + + scd->check_max = 1; +} + +static int check_max(struct sched_clock_data *scd) +{ + return scd->check_max; +} +#else +static int check_max(struct sched_clock_data *scd) +{ + return 1; +} +#endif /* CONFIG_NO_HZ */ + /* * update the percpu scd from the raw @now value * @@ -112,7 +149,7 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now) */ max_clock = scd->tick_gtod + (2 + delta_jiffies) * TICK_NSEC; - if (unlikely(clock + delta > max_clock)) { + if (unlikely(clock + delta > max_clock) && check_max(scd)) { if (clock < max_clock) clock = max_clock; else diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index b854a895591e..d63008b09a4c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -276,6 +276,7 @@ void tick_nohz_stop_sched_tick(void) ts->tick_stopped = 1; ts->idle_jiffies = last_jiffies; rcu_enter_nohz(); + sched_clock_tick_stop(cpu); } /* @@ -375,6 +376,7 @@ void tick_nohz_restart_sched_tick(void) select_nohz_load_balancer(0); now = ktime_get(); tick_do_update_jiffies64(now); + sched_clock_tick_start(cpu); cpu_clear(cpu, nohz_cpu_mask); /* -- cgit v1.2.3 From 736603ab297506f4396cb5af592004499950fcfd Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: jbd2: Add commit time into the commit block Carlo Wood has demonstrated that it's possible to recover deleted files from the journal. Something that will make this easier is if we can put the time of the commit into the commit block. Signed-off-by: "Theodore Ts'o" --- fs/jbd2/commit.c | 3 +++ include/linux/jbd2.h | 2 ++ 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index a2ed72f7ceee..92b6ac3df8ab 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -112,6 +112,7 @@ static int journal_submit_commit_record(journal_t *journal, struct buffer_head *bh; int ret; int barrier_done = 0; + struct timespec now = current_kernel_time(); if (is_journal_aborted(journal)) return 0; @@ -126,6 +127,8 @@ static int journal_submit_commit_record(journal_t *journal, tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); + tmp->h_commit_sec = cpu_to_be64(now.tv_sec); + tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec); if (JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM)) { diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index d147f0f90360..ec9cadf58227 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -168,6 +168,8 @@ struct commit_header { unsigned char h_chksum_size; unsigned char h_padding[2]; __be32 h_chksum[JBD2_CHECKSUM_BYTES]; + __be64 h_commit_sec; + __be32 h_commit_nsec; }; /* -- cgit v1.2.3 From f4c0a0fdfae708f7aa438c27a380ed4071294e11 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: vfs: export filemap_fdatawrite_range() Make the filemap_fdatawrite_range() function public, so that it can later be used in the ordered mode rewrite by JBD/JBD2.
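For illustration, a caller flushing a byte range of a file could use the newly-exported helper like this (a hypothetical sketch; JBD2's actual use appears in the following patches):

	#include <linux/fs.h>

	static int example_flush_range(struct inode *inode, loff_t from,
				       loff_t to)
	{
		/* Starts WB_SYNC_ALL writeback for the byte range; it
		 * does not wait for the writes to complete. */
		return filemap_fdatawrite_range(inode->i_mapping, from, to);
	}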
Signed-off-by: Jan Kara --- include/linux/fs.h | 2 ++ mm/filemap.c | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index d8e2762ed14d..97f992adc62d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1740,6 +1740,8 @@ extern int wait_on_page_writeback_range(struct address_space *mapping, pgoff_t start, pgoff_t end); extern int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end, int sync_mode); +extern int filemap_fdatawrite_range(struct address_space *mapping, + loff_t start, loff_t end); extern long do_fsync(struct file *file, int datasync); extern void sync_supers(void); diff --git a/mm/filemap.c b/mm/filemap.c index 1e6a7d34874f..65d9d9e2b755 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -236,11 +236,12 @@ int filemap_fdatawrite(struct address_space *mapping) } EXPORT_SYMBOL(filemap_fdatawrite); -static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, +int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end) { return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); } +EXPORT_SYMBOL(filemap_fdatawrite_range); /** * filemap_flush - mostly a non-blocking flush -- cgit v1.2.3 From c851ed540173736e60d48b53b91a16ea5c903896 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: jbd2: Implement data=ordered mode handling via inodes This patch adds the necessary framework into JBD2 to be able to track inodes with each transaction and write out their dirty data during transaction commit time. This new ordered mode brings all sorts of advantages, such as the possibility to get rid of journal heads and buffer heads for data buffers in ordered mode, better ordering of writes at transaction commit, simplification of some JBD code, and no more anonymous pages when a truncate of data being committed happens. With this new ordered mode, delayed allocation in ordered mode is also much simpler. Signed-off-by: Jan Kara --- fs/jbd2/commit.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++ fs/jbd2/journal.c | 52 +++++++++++++++++++++++++++++ fs/jbd2/transaction.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/jbd2.h | 42 ++++++++++++++++++++++++ 4 files changed, 270 insertions(+) (limited to 'include/linux') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 92b6ac3df8ab..3ca107b5c86b 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -355,6 +355,81 @@ write_out_data: journal_do_submit_data(wbuf, bufs); } +/* + * Submit all the data buffers of inode associated with the transaction to + * disk. + * + * We are in a committing transaction. Therefore no new inode can be added to + * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently + * operate on from being released while we write out pages.
+ */ +static int journal_submit_inode_data_buffers(journal_t *journal, + transaction_t *commit_transaction) +{ + struct jbd2_inode *jinode; + int err, ret = 0; + struct address_space *mapping; + + spin_lock(&journal->j_list_lock); + list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { + mapping = jinode->i_vfs_inode->i_mapping; + jinode->i_flags |= JI_COMMIT_RUNNING; + spin_unlock(&journal->j_list_lock); + err = filemap_fdatawrite_range(mapping, 0, + i_size_read(jinode->i_vfs_inode)); + if (!ret) + ret = err; + spin_lock(&journal->j_list_lock); + J_ASSERT(jinode->i_transaction == commit_transaction); + jinode->i_flags &= ~JI_COMMIT_RUNNING; + wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); + } + spin_unlock(&journal->j_list_lock); + return ret; +} + +/* + * Wait for data submitted for writeout, refile inodes to proper + * transaction if needed. + * + */ +static int journal_finish_inode_data_buffers(journal_t *journal, + transaction_t *commit_transaction) +{ + struct jbd2_inode *jinode, *next_i; + int err, ret = 0; + + /* For locking, see the comment in journal_submit_inode_data_buffers() */ + spin_lock(&journal->j_list_lock); + list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { + jinode->i_flags |= JI_COMMIT_RUNNING; + spin_unlock(&journal->j_list_lock); + err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping); + if (!ret) + ret = err; + spin_lock(&journal->j_list_lock); + jinode->i_flags &= ~JI_COMMIT_RUNNING; + wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); + } + + /* Now refile inode to proper lists */ + list_for_each_entry_safe(jinode, next_i, + &commit_transaction->t_inode_list, i_list) { + list_del(&jinode->i_list); + if (jinode->i_next_transaction) { + jinode->i_transaction = jinode->i_next_transaction; + jinode->i_next_transaction = NULL; + list_add(&jinode->i_list, + &jinode->i_transaction->t_inode_list); + } else { + jinode->i_transaction = NULL; + } + } + spin_unlock(&journal->j_list_lock); + + return ret; +} + static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) { struct page *page = bh->b_page; @@ -529,6 +604,9 @@ void jbd2_journal_commit_transaction(journal_t *journal) */ err = 0; journal_submit_data_buffers(journal, commit_transaction); + err = journal_submit_inode_data_buffers(journal, commit_transaction); + if (err) + jbd2_journal_abort(journal, err); /* * Wait for all previously submitted IO to complete if commit @@ -760,6 +838,17 @@ start_journal_io: __jbd2_journal_abort_hard(journal); } + /* + * This is the right place to wait for data buffers both for ASYNC + * and !ASYNC commit. If commit is ASYNC, we need to wait only after + * the commit block went to disk (which happens above). If commit is + * SYNC, we need to wait for data buffers before we start writing + * commit block, which happens below in such setting. + */ + err = journal_finish_inode_data_buffers(journal, commit_transaction); + if (err) + jbd2_journal_abort(journal, err); + /* Lo and behold: we have just managed to send a transaction to the log. Before we can commit it, wait for the IO so far to complete. 
Control buffers being written are on the @@ -880,6 +969,7 @@ wait_for_iobuf: jbd_debug(3, "JBD: commit phase 7\n"); J_ASSERT(commit_transaction->t_sync_datalist == NULL); + J_ASSERT(list_empty(&commit_transaction->t_inode_list)); J_ASSERT(commit_transaction->t_buffers == NULL); J_ASSERT(commit_transaction->t_checkpoint_list == NULL); J_ASSERT(commit_transaction->t_iobuf_list == NULL); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 2e24567c4a79..78cf7bd7f604 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -82,6 +82,10 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_page); EXPORT_SYMBOL(jbd2_journal_invalidatepage); EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); EXPORT_SYMBOL(jbd2_journal_force_commit); +EXPORT_SYMBOL(jbd2_journal_file_inode); +EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); +EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); +EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); static void __journal_abort_soft (journal_t *journal, int errno); @@ -2194,6 +2198,54 @@ void jbd2_journal_put_journal_head(struct journal_head *jh) jbd_unlock_bh_journal_head(bh); } +/* + * Initialize jbd inode head + */ +void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode) +{ + jinode->i_transaction = NULL; + jinode->i_next_transaction = NULL; + jinode->i_vfs_inode = inode; + jinode->i_flags = 0; + INIT_LIST_HEAD(&jinode->i_list); +} + +/* + * Function to be called before we start removing inode from memory (i.e., + * clear_inode() is a fine place to be called from). It removes inode from + * transaction's lists. + */ +void jbd2_journal_release_jbd_inode(journal_t *journal, + struct jbd2_inode *jinode) +{ + int writeout = 0; + + if (!journal) + return; +restart: + spin_lock(&journal->j_list_lock); + /* Is commit writing out inode - we have to wait */ + if (jinode->i_flags & JI_COMMIT_RUNNING) { + wait_queue_head_t *wq; + DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING); + wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING); + prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&journal->j_list_lock); + schedule(); + finish_wait(wq, &wait.wait); + goto restart; + } + + /* Do we need to wait for data writeback? */ + if (journal->j_committing_transaction == jinode->i_transaction) + writeout = 1; + if (jinode->i_transaction) { + list_del(&jinode->i_list); + jinode->i_transaction = NULL; + } + spin_unlock(&journal->j_list_lock); +} + /* * debugfs tunables */ diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index ba620c4493d2..98b596d23705 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -51,6 +51,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) transaction->t_tid = journal->j_transaction_sequence++; transaction->t_expires = jiffies + journal->j_commit_interval; spin_lock_init(&transaction->t_handle_lock); + INIT_LIST_HEAD(&transaction->t_inode_list); /* Set up the commit timer for the new transaction. 
*/ journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); @@ -2195,3 +2196,88 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh) spin_unlock(&journal->j_list_lock); __brelse(bh); } + +/* + * File inode in the inode list of the handle's transaction + */ +int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + + if (is_handle_aborted(handle)) + return -EIO; + + jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino, + transaction->t_tid); + + /* + * First check whether inode isn't already on the transaction's + * lists without taking the lock. Note that this check is safe + * without the lock as we cannot race with somebody removing inode + * from the transaction. The reason is that we remove inode from the + * transaction only in journal_release_jbd_inode() and when we commit + * the transaction. We are guarded from the first case by holding + * a reference to the inode. We are safe against the second case + * because if jinode->i_transaction == transaction, commit code + * cannot touch the transaction because we hold reference to it, + * and if jinode->i_next_transaction == transaction, commit code + * will only file the inode where we want it. + */ + if (jinode->i_transaction == transaction || + jinode->i_next_transaction == transaction) + return 0; + + spin_lock(&journal->j_list_lock); + + if (jinode->i_transaction == transaction || + jinode->i_next_transaction == transaction) + goto done; + + /* On some different transaction's list - should be + * the committing one */ + if (jinode->i_transaction) { + J_ASSERT(jinode->i_next_transaction == NULL); + J_ASSERT(jinode->i_transaction == + journal->j_committing_transaction); + jinode->i_next_transaction = transaction; + goto done; + } + /* Not on any transaction list... */ + J_ASSERT(!jinode->i_next_transaction); + jinode->i_transaction = transaction; + list_add(&jinode->i_list, &transaction->t_inode_list); +done: + spin_unlock(&journal->j_list_lock); + + return 0; +} + +/* + * This function must be called when inode is journaled in ordered mode + * before truncation happens. It starts writeout of truncated part in + * case it is in the committing transaction so that we stand to ordered + * mode consistency guarantees. + */ +int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, + loff_t new_size) +{ + journal_t *journal; + transaction_t *commit_trans; + int ret = 0; + + if (!inode->i_transaction && !inode->i_next_transaction) + goto out; + journal = inode->i_transaction->t_journal; + spin_lock(&journal->j_state_lock); + commit_trans = journal->j_committing_transaction; + spin_unlock(&journal->j_state_lock); + if (inode->i_transaction == commit_trans) { + ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping, + new_size, LLONG_MAX); + if (ret) + jbd2_journal_abort(journal, ret); + } +out: + return ret; +} diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index ec9cadf58227..622c3d8ca4ed 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -381,6 +381,38 @@ static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh) bit_spin_unlock(BH_JournalHead, &bh->b_state); } +/* Flags in jbd_inode->i_flags */ +#define __JI_COMMIT_RUNNING 0 +/* Commit of the inode data in progress. We use this flag to protect us from + * concurrent deletion of inode. 
We cannot use reference to inode for this + * since we cannot afford doing last iput() on behalf of kjournald + */ +#define JI_COMMIT_RUNNING (1 << __JI_COMMIT_RUNNING) + +/** + * struct jbd_inode is the structure linking inodes in ordered mode + * present in a transaction so that we can sync them during commit. + */ +struct jbd2_inode { + /* Which transaction does this inode belong to? Either the running + * transaction or the committing one. [j_list_lock] */ + transaction_t *i_transaction; + + /* Pointer to the running transaction modifying inode's data in case + * there is already a committing transaction touching it. [j_list_lock] */ + transaction_t *i_next_transaction; + + /* List of inodes in the i_transaction [j_list_lock] */ + struct list_head i_list; + + /* VFS inode this inode belongs to [constant during the lifetime + * of the structure] */ + struct inode *i_vfs_inode; + + /* Flags of inode [j_list_lock] */ + unsigned int i_flags; +}; + struct jbd2_revoke_table_s; /** @@ -566,6 +598,12 @@ struct transaction_s */ struct journal_head *t_log_list; + /* + * List of inodes whose data we've modified in data=ordered mode. + * [j_list_lock] + */ + struct list_head t_inode_list; + /* * Protects info related to handles */ @@ -1046,6 +1084,10 @@ extern void jbd2_journal_ack_err (journal_t *); extern int jbd2_journal_clear_err (journal_t *); extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *); extern int jbd2_journal_force_commit(journal_t *); +extern int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *inode); +extern int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, loff_t new_size); +extern void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode); +extern void jbd2_journal_release_jbd_inode(journal_t *journal, struct jbd2_inode *jinode); /* * journal_head management -- cgit v1.2.3 From 87c89c232c8f7b3820c33c3b9bc803e9358027da Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: jbd2: Remove data=ordered mode support using jbd buffer heads Signed-off-by: Jan Kara --- fs/jbd2/checkpoint.c | 1 - fs/jbd2/commit.c | 221 ++------------------------------------------------ fs/jbd2/journal.c | 1 - fs/jbd2/transaction.c | 217 ++----------------------------------------------- include/linux/jbd2.h | 29 ++----- 5 files changed, 21 insertions(+), 448 deletions(-) (limited to 'include/linux') diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 6914598022ce..91389c8aee8a 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -688,7 +688,6 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact J_ASSERT(transaction->t_state == T_FINISHED); J_ASSERT(transaction->t_buffers == NULL); - J_ASSERT(transaction->t_sync_datalist == NULL); J_ASSERT(transaction->t_forget == NULL); J_ASSERT(transaction->t_iobuf_list == NULL); J_ASSERT(transaction->t_shadow_list == NULL); diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 3ca107b5c86b..483183d15ed5 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -37,8 +37,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) } /* - * When an ext3-ordered file is truncated, it is possible that many pages are - * not sucessfully freed, because they are attached to a committing transaction. + * When an ext4 file is truncated, it is possible that some pages are not + * successfully freed, because they are attached to a committing transaction. 
* After the transaction commits, these pages are left on the LRU, with no * ->mapping, and with attached buffers. These pages are trivially reclaimable * by the VM, but their apparent absence upsets the VM accounting, and it makes @@ -79,21 +79,6 @@ nope: __brelse(bh); } -/* - * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is - * held. For ranking reasons we must trylock. If we lose, schedule away and - * return 0. j_list_lock is dropped in this case. - */ -static int inverted_lock(journal_t *journal, struct buffer_head *bh) -{ - if (!jbd_trylock_bh_state(bh)) { - spin_unlock(&journal->j_list_lock); - schedule(); - return 0; - } - return 1; -} - /* * Done it all: now submit the commit record. We should have * cleaned up our previous buffers by now, so if we are in abort @@ -199,162 +184,6 @@ static int journal_wait_on_commit_record(struct buffer_head *bh) return ret; } -/* - * Wait for all submitted IO to complete. - */ -static int journal_wait_on_locked_list(journal_t *journal, - transaction_t *commit_transaction) -{ - int ret = 0; - struct journal_head *jh; - - while (commit_transaction->t_locked_list) { - struct buffer_head *bh; - - jh = commit_transaction->t_locked_list->b_tprev; - bh = jh2bh(jh); - get_bh(bh); - if (buffer_locked(bh)) { - spin_unlock(&journal->j_list_lock); - wait_on_buffer(bh); - if (unlikely(!buffer_uptodate(bh))) - ret = -EIO; - spin_lock(&journal->j_list_lock); - } - if (!inverted_lock(journal, bh)) { - put_bh(bh); - spin_lock(&journal->j_list_lock); - continue; - } - if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { - __jbd2_journal_unfile_buffer(jh); - jbd_unlock_bh_state(bh); - jbd2_journal_remove_journal_head(bh); - put_bh(bh); - } else { - jbd_unlock_bh_state(bh); - } - put_bh(bh); - cond_resched_lock(&journal->j_list_lock); - } - return ret; - } - -static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) -{ - int i; - - for (i = 0; i < bufs; i++) { - wbuf[i]->b_end_io = end_buffer_write_sync; - /* We use-up our safety reference in submit_bh() */ - submit_bh(WRITE, wbuf[i]); - } -} - -/* - * Submit all the data buffers to disk - */ -static void journal_submit_data_buffers(journal_t *journal, - transaction_t *commit_transaction) -{ - struct journal_head *jh; - struct buffer_head *bh; - int locked; - int bufs = 0; - struct buffer_head **wbuf = journal->j_wbuf; - - /* - * Whenever we unlock the journal and sleep, things can get added - * onto ->t_sync_datalist, so we have to keep looping back to - * write_out_data until we *know* that the list is empty. - * - * Cleanup any flushed data buffers from the data list. Even in - * abort mode, we want to flush this out as soon as possible. - */ -write_out_data: - cond_resched(); - spin_lock(&journal->j_list_lock); - - while (commit_transaction->t_sync_datalist) { - jh = commit_transaction->t_sync_datalist; - bh = jh2bh(jh); - locked = 0; - - /* Get reference just to make sure buffer does not disappear - * when we are forced to drop various locks */ - get_bh(bh); - /* If the buffer is dirty, we need to submit IO and hence - * we need the buffer lock. We try to lock the buffer without - * blocking. If we fail, we need to drop j_list_lock and do - * blocking lock_buffer(). 
- */ - if (buffer_dirty(bh)) { - if (test_set_buffer_locked(bh)) { - BUFFER_TRACE(bh, "needs blocking lock"); - spin_unlock(&journal->j_list_lock); - /* Write out all data to prevent deadlocks */ - journal_do_submit_data(wbuf, bufs); - bufs = 0; - lock_buffer(bh); - spin_lock(&journal->j_list_lock); - } - locked = 1; - } - /* We have to get bh_state lock. Again out of order, sigh. */ - if (!inverted_lock(journal, bh)) { - jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); - } - /* Someone already cleaned up the buffer? */ - if (!buffer_jbd(bh) - || jh->b_transaction != commit_transaction - || jh->b_jlist != BJ_SyncData) { - jbd_unlock_bh_state(bh); - if (locked) - unlock_buffer(bh); - BUFFER_TRACE(bh, "already cleaned up"); - put_bh(bh); - continue; - } - if (locked && test_clear_buffer_dirty(bh)) { - BUFFER_TRACE(bh, "needs writeout, adding to array"); - wbuf[bufs++] = bh; - __jbd2_journal_file_buffer(jh, commit_transaction, - BJ_Locked); - jbd_unlock_bh_state(bh); - if (bufs == journal->j_wbufsize) { - spin_unlock(&journal->j_list_lock); - journal_do_submit_data(wbuf, bufs); - bufs = 0; - goto write_out_data; - } - } else if (!locked && buffer_locked(bh)) { - __jbd2_journal_file_buffer(jh, commit_transaction, - BJ_Locked); - jbd_unlock_bh_state(bh); - put_bh(bh); - } else { - BUFFER_TRACE(bh, "writeout complete: unfile"); - __jbd2_journal_unfile_buffer(jh); - jbd_unlock_bh_state(bh); - if (locked) - unlock_buffer(bh); - jbd2_journal_remove_journal_head(bh); - /* Once for our safety reference, once for - * jbd2_journal_remove_journal_head() */ - put_bh(bh); - put_bh(bh); - } - - if (need_resched() || spin_needbreak(&journal->j_list_lock)) { - spin_unlock(&journal->j_list_lock); - goto write_out_data; - } - } - spin_unlock(&journal->j_list_lock); - journal_do_submit_data(wbuf, bufs); -} - /* * Submit all the data buffers of inode associated with the transaction to * disk. @@ -602,24 +431,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) * Now start flushing things to disk, in the order they appear * on the transaction lists. Data blocks go first. */ - err = 0; - journal_submit_data_buffers(journal, commit_transaction); err = journal_submit_inode_data_buffers(journal, commit_transaction); - if (err) - jbd2_journal_abort(journal, err); - - /* - * Wait for all previously submitted IO to complete if commit - * record is to be written synchronously. - */ - spin_lock(&journal->j_list_lock); - if (!JBD2_HAS_INCOMPAT_FEATURE(journal, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) - err = journal_wait_on_locked_list(journal, - commit_transaction); - - spin_unlock(&journal->j_list_lock); - if (err) jbd2_journal_abort(journal, err); @@ -627,16 +439,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd_debug(3, "JBD: commit phase 2\n"); - /* - * If we found any dirty or locked buffers, then we should have - * looped back up to the write_out_data label. If there weren't - * any then journal_clean_data_list should have wiped the list - * clean by now, so check that it is in fact empty. - */ - J_ASSERT (commit_transaction->t_sync_datalist == NULL); - - jbd_debug (3, "JBD: commit phase 3\n"); - /* * Way to go: we have now written out all of the data for a * transaction! 
Now comes the tricky part: we need to write out @@ -655,6 +457,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) J_ASSERT(commit_transaction->t_nr_buffers <= commit_transaction->t_outstanding_credits); + err = 0; descriptor = NULL; bufs = 0; while (commit_transaction->t_buffers) { @@ -829,13 +632,6 @@ start_journal_io: &cbh, crc32_sum); if (err) __jbd2_journal_abort_hard(journal); - - spin_lock(&journal->j_list_lock); - err = journal_wait_on_locked_list(journal, - commit_transaction); - spin_unlock(&journal->j_list_lock); - if (err) - __jbd2_journal_abort_hard(journal); } /* @@ -860,7 +656,7 @@ start_journal_io: so we incur less scheduling load. */ - jbd_debug(3, "JBD: commit phase 4\n"); + jbd_debug(3, "JBD: commit phase 3\n"); /* * akpm: these are BJ_IO, and j_list_lock is not needed. @@ -919,7 +715,7 @@ wait_for_iobuf: J_ASSERT (commit_transaction->t_shadow_list == NULL); - jbd_debug(3, "JBD: commit phase 5\n"); + jbd_debug(3, "JBD: commit phase 4\n"); /* Here we wait for the revoke record and descriptor record buffers */ wait_for_ctlbuf: @@ -946,7 +742,7 @@ wait_for_iobuf: /* AKPM: bforget here */ } - jbd_debug(3, "JBD: commit phase 6\n"); + jbd_debug(3, "JBD: commit phase 5\n"); if (!JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { @@ -966,9 +762,8 @@ wait_for_iobuf: transaction can be removed from any checkpoint list it was on before. */ - jbd_debug(3, "JBD: commit phase 7\n"); + jbd_debug(3, "JBD: commit phase 6\n"); - J_ASSERT(commit_transaction->t_sync_datalist == NULL); J_ASSERT(list_empty(&commit_transaction->t_inode_list)); J_ASSERT(commit_transaction->t_buffers == NULL); J_ASSERT(commit_transaction->t_checkpoint_list == NULL); @@ -1090,7 +885,7 @@ restart_loop: /* Done with this transaction! */ - jbd_debug(3, "JBD: commit phase 8\n"); + jbd_debug(3, "JBD: commit phase 7\n"); J_ASSERT(commit_transaction->t_state == T_COMMIT); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 78cf7bd7f604..b26c6d9fe6ae 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -50,7 +50,6 @@ EXPORT_SYMBOL(jbd2_journal_unlock_updates); EXPORT_SYMBOL(jbd2_journal_get_write_access); EXPORT_SYMBOL(jbd2_journal_get_create_access); EXPORT_SYMBOL(jbd2_journal_get_undo_access); -EXPORT_SYMBOL(jbd2_journal_dirty_data); EXPORT_SYMBOL(jbd2_journal_dirty_metadata); EXPORT_SYMBOL(jbd2_journal_release_buffer); EXPORT_SYMBOL(jbd2_journal_forget); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 98b596d23705..4f7cadbb19fa 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -942,183 +942,6 @@ out: return err; } -/** - * int jbd2_journal_dirty_data() - mark a buffer as containing dirty data which - * needs to be flushed before we can commit the - * current transaction. - * @handle: transaction - * @bh: bufferhead to mark - * - * The buffer is placed on the transaction's data list and is marked as - * belonging to the transaction. - * - * Returns error number or 0 on success. - * - * jbd2_journal_dirty_data() can be called via page_launder->ext3_writepage - * by kswapd. - */ -int jbd2_journal_dirty_data(handle_t *handle, struct buffer_head *bh) -{ - journal_t *journal = handle->h_transaction->t_journal; - int need_brelse = 0; - struct journal_head *jh; - - if (is_handle_aborted(handle)) - return 0; - - jh = jbd2_journal_add_journal_head(bh); - JBUFFER_TRACE(jh, "entry"); - - /* - * The buffer could *already* be dirty. Writeout can start - * at any time. 
- */ - jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid); - - /* - * What if the buffer is already part of a running transaction? - * - * There are two cases: - * 1) It is part of the current running transaction. Refile it, - * just in case we have allocated it as metadata, deallocated - * it, then reallocated it as data. - * 2) It is part of the previous, still-committing transaction. - * If all we want to do is to guarantee that the buffer will be - * written to disk before this new transaction commits, then - * being sure that the *previous* transaction has this same - * property is sufficient for us! Just leave it on its old - * transaction. - * - * In case (2), the buffer must not already exist as metadata - * --- that would violate write ordering (a transaction is free - * to write its data at any point, even before the previous - * committing transaction has committed). The caller must - * never, ever allow this to happen: there's nothing we can do - * about it in this layer. - */ - jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); - - /* Now that we have bh_state locked, are we really still mapped? */ - if (!buffer_mapped(bh)) { - JBUFFER_TRACE(jh, "unmapped buffer, bailing out"); - goto no_journal; - } - - if (jh->b_transaction) { - JBUFFER_TRACE(jh, "has transaction"); - if (jh->b_transaction != handle->h_transaction) { - JBUFFER_TRACE(jh, "belongs to older transaction"); - J_ASSERT_JH(jh, jh->b_transaction == - journal->j_committing_transaction); - - /* @@@ IS THIS TRUE ? */ - /* - * Not any more. Scenario: someone does a write() - * in data=journal mode. The buffer's transaction has - * moved into commit. Then someone does another - * write() to the file. We do the frozen data copyout - * and set b_next_transaction to point to j_running_t. - * And while we're in that state, someone does a - * writepage() in an attempt to pageout the same area - * of the file via a shared mapping. At present that - * calls jbd2_journal_dirty_data(), and we get right here. - * It may be too late to journal the data. Simply - * falling through to the next test will suffice: the - * data will be dirty and wil be checkpointed. The - * ordering comments in the next comment block still - * apply. - */ - //J_ASSERT_JH(jh, jh->b_next_transaction == NULL); - - /* - * If we're journalling data, and this buffer was - * subject to a write(), it could be metadata, forget - * or shadow against the committing transaction. Now, - * someone has dirtied the same darn page via a mapping - * and it is being writepage()'d. - * We *could* just steal the page from commit, with some - * fancy locking there. Instead, we just skip it - - * don't tie the page's buffers to the new transaction - * at all. - * Implication: if we crash before the writepage() data - * is written into the filesystem, recovery will replay - * the write() data. - */ - if (jh->b_jlist != BJ_None && - jh->b_jlist != BJ_SyncData && - jh->b_jlist != BJ_Locked) { - JBUFFER_TRACE(jh, "Not stealing"); - goto no_journal; - } - - /* - * This buffer may be undergoing writeout in commit. We - * can't return from here and let the caller dirty it - * again because that can cause the write-out loop in - * commit to never terminate. - */ - if (buffer_dirty(bh)) { - get_bh(bh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - need_brelse = 1; - sync_dirty_buffer(bh); - jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); - /* Since we dropped the lock... 
*/ - if (!buffer_mapped(bh)) { - JBUFFER_TRACE(jh, "buffer got unmapped"); - goto no_journal; - } - /* The buffer may become locked again at any - time if it is redirtied */ - } - - /* journal_clean_data_list() may have got there first */ - if (jh->b_transaction != NULL) { - JBUFFER_TRACE(jh, "unfile from commit"); - __jbd2_journal_temp_unlink_buffer(jh); - /* It still points to the committing - * transaction; move it to this one so - * that the refile assert checks are - * happy. */ - jh->b_transaction = handle->h_transaction; - } - /* The buffer will be refiled below */ - - } - /* - * Special case --- the buffer might actually have been - * allocated and then immediately deallocated in the previous, - * committing transaction, so might still be left on that - * transaction's metadata lists. - */ - if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) { - JBUFFER_TRACE(jh, "not on correct data list: unfile"); - J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow); - __jbd2_journal_temp_unlink_buffer(jh); - jh->b_transaction = handle->h_transaction; - JBUFFER_TRACE(jh, "file as data"); - __jbd2_journal_file_buffer(jh, handle->h_transaction, - BJ_SyncData); - } - } else { - JBUFFER_TRACE(jh, "not on a transaction"); - __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_SyncData); - } -no_journal: - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - if (need_brelse) { - BUFFER_TRACE(bh, "brelse"); - __brelse(bh); - } - JBUFFER_TRACE(jh, "exit"); - jbd2_journal_put_journal_head(jh); - return 0; -} - /** * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata * @handle: transaction to add buffer to. @@ -1541,10 +1364,10 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh) * Remove a buffer from the appropriate transaction list. * * Note that this function can *change* the value of - * bh->b_transaction->t_sync_datalist, t_buffers, t_forget, - * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller - * is holding onto a copy of one of thee pointers, it could go bad. - * Generally the caller needs to re-read the pointer from the transaction_t. + * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list, + * t_log_list or t_reserved_list. If the caller is holding onto a copy of one + * of these pointers, it could go bad. Generally the caller needs to re-read + * the pointer from the transaction_t. * * Called under j_list_lock. The journal may not be locked. 
*/ @@ -1566,9 +1389,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) switch (jh->b_jlist) { case BJ_None: return; - case BJ_SyncData: - list = &transaction->t_sync_datalist; - break; case BJ_Metadata: transaction->t_nr_buffers--; J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0); @@ -1589,9 +1409,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) case BJ_Reserved: list = &transaction->t_reserved_list; break; - case BJ_Locked: - list = &transaction->t_locked_list; - break; } __blist_del_buffer(list, jh); @@ -1634,15 +1451,7 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) goto out; spin_lock(&journal->j_list_lock); - if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) { - if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) { - /* A written-back ordered data buffer */ - JBUFFER_TRACE(jh, "release data"); - __jbd2_journal_unfile_buffer(jh); - jbd2_journal_remove_journal_head(bh); - __brelse(bh); - } - } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { + if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { /* written-back checkpointed metadata buffer */ if (jh->b_jlist == BJ_None) { JBUFFER_TRACE(jh, "remove from checkpoint list"); @@ -1878,6 +1687,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) if (!buffer_jbd(bh)) goto zap_buffer_unlocked; + /* OK, we have data buffer in journaled mode */ spin_lock(&journal->j_state_lock); jbd_lock_bh_state(bh); spin_lock(&journal->j_list_lock); @@ -1941,15 +1751,6 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) } } else if (transaction == journal->j_committing_transaction) { JBUFFER_TRACE(jh, "on committing transaction"); - if (jh->b_jlist == BJ_Locked) { - /* - * The buffer is on the committing transaction's locked - * list. We have the buffer locked, so I/O has - * completed. So we can nail the buffer now. - */ - may_free = __dispose_buffer(jh, transaction); - goto zap_buffer; - } /* * If it is committing, we simply cannot touch it. 
We * can remove it's next_transaction pointer from the @@ -2082,9 +1883,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh, J_ASSERT_JH(jh, !jh->b_committed_data); J_ASSERT_JH(jh, !jh->b_frozen_data); return; - case BJ_SyncData: - list = &transaction->t_sync_datalist; - break; case BJ_Metadata: transaction->t_nr_buffers++; list = &transaction->t_buffers; @@ -2104,9 +1902,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh, case BJ_Reserved: list = &transaction->t_reserved_list; break; - case BJ_Locked: - list = &transaction->t_locked_list; - break; } __blist_add_buffer(list, jh); diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 622c3d8ca4ed..3dd209007098 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -542,24 +542,12 @@ struct transaction_s */ struct journal_head *t_reserved_list; - /* - * Doubly-linked circular list of all buffers under writeout during - * commit [j_list_lock] - */ - struct journal_head *t_locked_list; - /* * Doubly-linked circular list of all metadata buffers owned by this * transaction [j_list_lock] */ struct journal_head *t_buffers; - /* - * Doubly-linked circular list of all data buffers still to be - * flushed before this transaction can be committed [j_list_lock] - */ - struct journal_head *t_sync_datalist; - /* * Doubly-linked circular list of all forget buffers (superseded * buffers which we can un-checkpoint once this transaction commits) @@ -1044,7 +1032,6 @@ extern int jbd2_journal_extend (handle_t *, int nblocks); extern int jbd2_journal_get_write_access(handle_t *, struct buffer_head *); extern int jbd2_journal_get_create_access (handle_t *, struct buffer_head *); extern int jbd2_journal_get_undo_access(handle_t *, struct buffer_head *); -extern int jbd2_journal_dirty_data (handle_t *, struct buffer_head *); extern int jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *); extern void jbd2_journal_release_buffer (handle_t *, struct buffer_head *); extern int jbd2_journal_forget (handle_t *, struct buffer_head *); @@ -1223,15 +1210,13 @@ static inline int jbd_space_needed(journal_t *journal) /* journaling buffer types */ #define BJ_None 0 /* Not journaled */ -#define BJ_SyncData 1 /* Normal data: flush before commit */ -#define BJ_Metadata 2 /* Normal journaled metadata */ -#define BJ_Forget 3 /* Buffer superseded by this transaction */ -#define BJ_IO 4 /* Buffer is for temporary IO use */ -#define BJ_Shadow 5 /* Buffer contents being shadowed to the log */ -#define BJ_LogCtl 6 /* Buffer contains log descriptors */ -#define BJ_Reserved 7 /* Buffer is reserved for access by journal */ -#define BJ_Locked 8 /* Locked for I/O during commit */ -#define BJ_Types 9 +#define BJ_Metadata 1 /* Normal journaled metadata */ +#define BJ_Forget 2 /* Buffer superseded by this transaction */ +#define BJ_IO 3 /* Buffer is for temporary IO use */ +#define BJ_Shadow 4 /* Buffer contents being shadowed to the log */ +#define BJ_LogCtl 5 /* Buffer contains log descriptors */ +#define BJ_Reserved 6 /* Buffer is reserved for access by journal */ +#define BJ_Types 7 extern int jbd_blocks_per_page(struct inode *inode); -- cgit v1.2.3 From 29a814d2ee0e43c2980f33f91c1311ec06c0aa35 Mon Sep 17 00:00:00 2001 From: Alex Tomas Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: vfs: add hooks for ext4's delayed allocation support Export mpage_bio_submit() and __mpage_writepage() for the benefit of ext4's delayed allocation support. 
Also change __block_write_full_page so that get_block() is called for buffers that have the BH_Delay flag set, to get the physical block allocated, just as in the !BH_Mapped case. Signed-off-by: Alex Tomas Signed-off-by: "Theodore Ts'o" --- fs/buffer.c | 7 +++++-- fs/mpage.c | 14 +++++--------- include/linux/mpage.h | 10 ++++++++++ 3 files changed, 20 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/fs/buffer.c b/fs/buffer.c index f4b033237a02..5fa1512cd9a2 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1691,11 +1691,13 @@ static int __block_write_full_page(struct inode *inode, struct page *page, */ clear_buffer_dirty(bh); set_buffer_uptodate(bh); - } else if (!buffer_mapped(bh) && buffer_dirty(bh)) { + } else if ((!buffer_mapped(bh) || buffer_delay(bh)) && + buffer_dirty(bh)) { WARN_ON(bh->b_size != blocksize); err = get_block(inode, block, bh, 1); if (err) goto recover; + clear_buffer_delay(bh); if (buffer_new(bh)) { /* blockdev mappings never come here */ clear_buffer_new(bh); @@ -1774,7 +1776,8 @@ recover: bh = head; /* Recovery: lock and submit the mapped buffers */ do { - if (buffer_mapped(bh) && buffer_dirty(bh)) { + if (buffer_mapped(bh) && buffer_dirty(bh) && + !buffer_delay(bh)) { lock_buffer(bh); mark_buffer_async_write(bh); } else { diff --git a/fs/mpage.c b/fs/mpage.c index 235e4d3873a8..dbcc7af76a15 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -82,7 +82,7 @@ static void mpage_end_io_write(struct bio *bio, int err) bio_put(bio); } -static struct bio *mpage_bio_submit(int rw, struct bio *bio) +struct bio *mpage_bio_submit(int rw, struct bio *bio) { bio->bi_end_io = mpage_end_io_read; if (rw == WRITE) @@ -90,6 +90,7 @@ static struct bio *mpage_bio_submit(int rw, struct bio *bio) submit_bio(rw, bio); return NULL; } +EXPORT_SYMBOL(mpage_bio_submit); static struct bio * mpage_alloc(struct block_device *bdev, @@ -435,15 +436,9 @@ EXPORT_SYMBOL(mpage_readpage); * written, so it can intelligently allocate a suitably-sized BIO. For now, * just allocate full-size (16-page) BIOs.
*/ -struct mpage_data { - struct bio *bio; - sector_t last_block_in_bio; - get_block_t *get_block; - unsigned use_writepage; -}; -static int __mpage_writepage(struct page *page, struct writeback_control *wbc, - void *data) +int __mpage_writepage(struct page *page, struct writeback_control *wbc, + void *data) { struct mpage_data *mpd = data; struct bio *bio = mpd->bio; @@ -651,6 +646,7 @@ out: mpd->bio = bio; return ret; } +EXPORT_SYMBOL(__mpage_writepage); /** * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them diff --git a/include/linux/mpage.h b/include/linux/mpage.h index 068a0c9946af..5c42821da2d1 100644 --- a/include/linux/mpage.h +++ b/include/linux/mpage.h @@ -11,11 +11,21 @@ */ #ifdef CONFIG_BLOCK +struct mpage_data { + struct bio *bio; + sector_t last_block_in_bio; + get_block_t *get_block; + unsigned use_writepage; +}; + struct writeback_control; +struct bio *mpage_bio_submit(int rw, struct bio *bio); int mpage_readpages(struct address_space *mapping, struct list_head *pages, unsigned nr_pages, get_block_t get_block); int mpage_readpage(struct page *page, get_block_t get_block); +int __mpage_writepage(struct page *page, struct writeback_control *wbc, + void *data); int mpage_writepages(struct address_space *mapping, struct writeback_control *wbc, get_block_t get_block); int mpage_writepage(struct page *page, get_block_t *get_block, -- cgit v1.2.3 From e8ced39d5e8911c662d4d69a342b9d053eaaac4e Mon Sep 17 00:00:00 2001 From: Mingming Cao Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: percpu_counter: new function percpu_counter_sum_and_set Delayed allocation needs to check free blocks at every write. percpu_counter_read_positive() is not quite accurate, and delayed allocation needs more accurate accounting, but calling percpu_counter_sum_positive() frequently is quite expensive. This patch adds a new function that updates the central counter while summing the per-cpu counters, improving the accuracy of subsequent percpu_counter_read() calls and reducing the need for the expensive percpu_counter_sum().
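Illustrative use, modelled on the ext4 hunk below (the watermark argument and the example_ function are a sketch, not part of the patch):

	#include <linux/percpu_counter.h>

	static s64 example_accurate_count(struct percpu_counter *fbc,
					  s64 watermark)
	{
		s64 val = percpu_counter_read(fbc);

		/* Cheap, possibly stale read first; only pay for the
		 * cross-CPU sum (which also folds the per-cpu deltas
		 * into the central fbc->count) near the watermark. */
		if (val - watermark < FBC_BATCH)
			val = percpu_counter_sum_and_set(fbc);
		return val;
	}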
Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 2 +- include/linux/percpu_counter.h | 12 +++++++++--- lib/percpu_counter.c | 7 ++++++- 3 files changed, 16 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 25f63d8c1b3d..6369bacf0dcb 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -1621,7 +1621,7 @@ ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi, #ifdef CONFIG_SMP if (free_blocks - root_blocks < FBC_BATCH) free_blocks = - percpu_counter_sum_positive(&sbi->s_freeblocks_counter); + percpu_counter_sum_and_set(&sbi->s_freeblocks_counter); #endif if (free_blocks - root_blocks < nblocks) return free_blocks - root_blocks; diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h index 9007ccdfc112..208388835357 100644 --- a/include/linux/percpu_counter.h +++ b/include/linux/percpu_counter.h @@ -35,7 +35,7 @@ int percpu_counter_init_irq(struct percpu_counter *fbc, s64 amount); void percpu_counter_destroy(struct percpu_counter *fbc); void percpu_counter_set(struct percpu_counter *fbc, s64 amount); void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch); -s64 __percpu_counter_sum(struct percpu_counter *fbc); +s64 __percpu_counter_sum(struct percpu_counter *fbc, int set); static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount) { @@ -44,13 +44,19 @@ static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount) static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc) { - s64 ret = __percpu_counter_sum(fbc); + s64 ret = __percpu_counter_sum(fbc, 0); return ret < 0 ? 0 : ret; } +static inline s64 percpu_counter_sum_and_set(struct percpu_counter *fbc) +{ + return __percpu_counter_sum(fbc, 1); +} + + static inline s64 percpu_counter_sum(struct percpu_counter *fbc) { - return __percpu_counter_sum(fbc); + return __percpu_counter_sum(fbc, 0); } static inline s64 percpu_counter_read(struct percpu_counter *fbc) diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index 119174494cb5..4a8ba4bf5f6f 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -52,7 +52,7 @@ EXPORT_SYMBOL(__percpu_counter_add); * Add up all the per-cpu counts, return the result. This is a more accurate * but much slower version of percpu_counter_read_positive() */ -s64 __percpu_counter_sum(struct percpu_counter *fbc) +s64 __percpu_counter_sum(struct percpu_counter *fbc, int set) { s64 ret; int cpu; @@ -62,7 +62,12 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc) for_each_online_cpu(cpu) { s32 *pcount = per_cpu_ptr(fbc->counters, cpu); ret += *pcount; + if (set) + *pcount = 0; } + if (set) + fbc->count = ret; + spin_unlock(&fbc->lock); return ret; } -- cgit v1.2.3 From 06d6cf6959d22037fcec598f4f954db5db3d7356 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 11 Jul 2008 19:27:31 -0400 Subject: mm: Add range_cont mode for writeback Filesystems like ext4 need to start a new transaction in writepages for block allocation. This happens with delayed allocation, and there is a limit to how many credits we can request from the journal layer. So we call write_cache_pages multiple times with wbc->nr_to_write set to the maximum possible value, limited by the max journal credits available. Add a new mode to writeback that enables us to handle this behaviour. In the new mode we update wbc->range_start to point to the new offset to be written.
The next call to write_cache_pages will start writeout from the specified range_start offset. In the new mode we also limit writing to the specified wbc->range_end. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Mingming Cao Acked-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- include/linux/writeback.h | 1 + mm/page-writeback.c | 3 +++ 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index f462439cc288..0d8573e6b9ec 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -63,6 +63,7 @@ struct writeback_control { unsigned for_writepages:1; /* This is a writepages() call */ unsigned range_cyclic:1; /* range_start is cyclic */ unsigned more_io:1; /* more io to be dispatched */ + unsigned range_cont:1; }; /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 789b6adbef37..ded57d528060 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -956,6 +956,9 @@ retry: } if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = index; + + if (wbc->range_cont) + wbc->range_start = index << PAGE_CACHE_SHIFT; return ret; } EXPORT_SYMBOL(write_cache_pages); -- cgit v1.2.3 From 006ebb40d3d65338bd74abb03b945f8d60e362bd Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Mon, 19 May 2008 08:32:49 -0400 Subject: Security: split proc ptrace checking into read vs. attach Enable security modules to distinguish reading of process state via proc from full ptrace access by renaming ptrace_may_attach to ptrace_may_access and adding a mode argument indicating whether only read access or full attach access is requested. This allows security modules to permit access to reading process state without granting full ptrace access. The base DAC/capability checking remains unchanged. Read access to /proc/pid/mem continues to apply a full ptrace attach check since check_mem_permission() already requires the current task to be ptracing the target. The other ptrace checks within proc for elements like environ, maps, and fds are changed to pass the read mode instead of attach. In the SELinux case, we model such reading of process state as a reading of a proc file labeled with the target process' label. This enables SELinux policy to permit such reading of process state without permitting control or manipulation of the target process, as there are a number of cases where programs probe for such information via proc but do not need to be able to control the target (e.g. procps, lsof, PolicyKit, ConsoleKit). At present we have to choose between allowing full ptrace in policy (more permissive than required/desired) or breaking functionality (or in some cases just silencing the denials via dontaudit rules but this can hide genuine attacks). This version of the patch incorporates comments from Casey Schaufler (change/replace existing ptrace_may_attach interface, pass access mode), and Chris Wright (provide greater consistency in the checking). Note that like their predecessors __ptrace_may_attach and ptrace_may_attach, the __ptrace_may_access and ptrace_may_access interfaces use different return value conventions from each other (0 or -errno vs. 1 or 0). I retained this difference to avoid any changes to the caller logic but made the difference clearer by changing the latter interface to return a bool rather than an int and by adding a comment about it to ptrace.h for any future callers.
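For illustration (a hypothetical caller, not from this patch), a proc-style read path now requests read-only access, keeping the bool return convention in mind:

	#include <linux/errno.h>
	#include <linux/ptrace.h>
	#include <linux/sched.h>

	static int example_may_read_state(struct task_struct *task)
	{
		/* ptrace_may_access() returns true on success,
		 * false on denial. */
		if (!ptrace_may_access(task, PTRACE_MODE_READ))
			return -EACCES;
		return 0;
	}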
Signed-off-by: Stephen Smalley Acked-by: Chris Wright Signed-off-by: James Morris --- fs/proc/base.c | 9 +++++---- fs/proc/task_mmu.c | 6 +++--- fs/proc/task_nommu.c | 2 +- include/linux/ptrace.h | 8 ++++++-- include/linux/security.h | 16 +++++++++++----- kernel/ptrace.c | 15 ++++++++------- security/commoncap.c | 3 ++- security/dummy.c | 3 ++- security/security.c | 5 +++-- security/selinux/hooks.c | 13 +++++++++++-- security/smack/smack_lsm.c | 5 +++-- 11 files changed, 55 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/base.c b/fs/proc/base.c index 3b455371e7ff..58c3e6a8e15e 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -233,7 +233,7 @@ static int check_mem_permission(struct task_struct *task) */ if (task->parent == current && (task->ptrace & PT_PTRACED) && task_is_stopped_or_traced(task) && - ptrace_may_attach(task)) + ptrace_may_access(task, PTRACE_MODE_ATTACH)) return 0; /* @@ -251,7 +251,8 @@ struct mm_struct *mm_for_maps(struct task_struct *task) task_lock(task); if (task->mm != mm) goto out; - if (task->mm != current->mm && __ptrace_may_attach(task) < 0) + if (task->mm != current->mm && + __ptrace_may_access(task, PTRACE_MODE_READ) < 0) goto out; task_unlock(task); return mm; @@ -518,7 +519,7 @@ static int proc_fd_access_allowed(struct inode *inode) */ task = get_proc_task(inode); if (task) { - allowed = ptrace_may_attach(task); + allowed = ptrace_may_access(task, PTRACE_MODE_READ); put_task_struct(task); } return allowed; @@ -904,7 +905,7 @@ static ssize_t environ_read(struct file *file, char __user *buf, if (!task) goto out_no_task; - if (!ptrace_may_attach(task)) + if (!ptrace_may_access(task, PTRACE_MODE_READ)) goto out; ret = -ENOMEM; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index c492449f3b45..164bd9f9ede3 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -210,7 +210,7 @@ static int show_map(struct seq_file *m, void *v) dev_t dev = 0; int len; - if (maps_protect && !ptrace_may_attach(task)) + if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ)) return -EACCES; if (file) { @@ -646,7 +646,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, goto out; ret = -EACCES; - if (!ptrace_may_attach(task)) + if (!ptrace_may_access(task, PTRACE_MODE_READ)) goto out_task; ret = -EINVAL; @@ -747,7 +747,7 @@ static int show_numa_map_checked(struct seq_file *m, void *v) struct proc_maps_private *priv = m->private; struct task_struct *task = priv->task; - if (maps_protect && !ptrace_may_attach(task)) + if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ)) return -EACCES; return show_numa_map(m, v); diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 4b4f9cc2f186..5d84e7121df8 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -113,7 +113,7 @@ static int show_map(struct seq_file *m, void *_vml) struct proc_maps_private *priv = m->private; struct task_struct *task = priv->task; - if (maps_protect && !ptrace_may_attach(task)) + if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ)) return -EACCES; return nommu_vma_show(m, vml->vma); diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index f98501ba557e..c6f5f9dd0cee 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -95,8 +95,12 @@ extern void __ptrace_link(struct task_struct *child, struct task_struct *new_parent); extern void __ptrace_unlink(struct task_struct *child); extern void ptrace_untrace(struct task_struct *child); -extern int ptrace_may_attach(struct task_struct *task); -extern 
int __ptrace_may_attach(struct task_struct *task); +#define PTRACE_MODE_READ 1 +#define PTRACE_MODE_ATTACH 2 +/* Returns 0 on success, -errno on denial. */ +extern int __ptrace_may_access(struct task_struct *task, unsigned int mode); +/* Returns true on success, false on denial. */ +extern bool ptrace_may_access(struct task_struct *task, unsigned int mode); static inline int ptrace_reparented(struct task_struct *child) { diff --git a/include/linux/security.h b/include/linux/security.h index 50737c70e78e..62bd80cb7f87 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -46,7 +46,8 @@ struct audit_krule; */ extern int cap_capable(struct task_struct *tsk, int cap); extern int cap_settime(struct timespec *ts, struct timezone *tz); -extern int cap_ptrace(struct task_struct *parent, struct task_struct *child); +extern int cap_ptrace(struct task_struct *parent, struct task_struct *child, + unsigned int mode); extern int cap_capget(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); extern int cap_capset_check(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); extern void cap_capset_set(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); @@ -1170,6 +1171,7 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * attributes would be changed by the execve. * @parent contains the task_struct structure for parent process. * @child contains the task_struct structure for child process. + * @mode contains the PTRACE_MODE flags indicating the form of access. * Return 0 if permission is granted. * @capget: * Get the @effective, @inheritable, and @permitted capability sets for @@ -1295,7 +1297,8 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) struct security_operations { char name[SECURITY_NAME_MAX + 1]; - int (*ptrace) (struct task_struct *parent, struct task_struct *child); + int (*ptrace) (struct task_struct *parent, struct task_struct *child, + unsigned int mode); int (*capget) (struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted); @@ -1573,7 +1576,8 @@ extern struct dentry *securityfs_create_dir(const char *name, struct dentry *par extern void securityfs_remove(struct dentry *dentry); /* Security operations */ -int security_ptrace(struct task_struct *parent, struct task_struct *child); +int security_ptrace(struct task_struct *parent, struct task_struct *child, + unsigned int mode); int security_capget(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, @@ -1755,9 +1759,11 @@ static inline int security_init(void) return 0; } -static inline int security_ptrace(struct task_struct *parent, struct task_struct *child) +static inline int security_ptrace(struct task_struct *parent, + struct task_struct *child, + unsigned int mode) { - return cap_ptrace(parent, child); + return cap_ptrace(parent, child, mode); } static inline int security_capget(struct task_struct *target, diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 6c19e94fd0a5..e337390fce01 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -121,7 +121,7 @@ int ptrace_check_attach(struct task_struct *child, int kill) return ret; } -int __ptrace_may_attach(struct task_struct *task) +int __ptrace_may_access(struct task_struct *task, unsigned int mode) { /* May we inspect the given task? 
* This check is used both for attaching with ptrace @@ -148,16 +148,16 @@ int __ptrace_may_attach(struct task_struct *task) if (!dumpable && !capable(CAP_SYS_PTRACE)) return -EPERM; - return security_ptrace(current, task); + return security_ptrace(current, task, mode); } -int ptrace_may_attach(struct task_struct *task) +bool ptrace_may_access(struct task_struct *task, unsigned int mode) { int err; task_lock(task); - err = __ptrace_may_attach(task); + err = __ptrace_may_access(task, mode); task_unlock(task); - return !err; + return (!err ? true : false); } int ptrace_attach(struct task_struct *task) @@ -195,7 +195,7 @@ repeat: /* the same process cannot be attached many times */ if (task->ptrace & PT_PTRACED) goto bad; - retval = __ptrace_may_attach(task); + retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH); if (retval) goto bad; @@ -494,7 +494,8 @@ int ptrace_traceme(void) */ task_lock(current); if (!(current->ptrace & PT_PTRACED)) { - ret = security_ptrace(current->parent, current); + ret = security_ptrace(current->parent, current, + PTRACE_MODE_ATTACH); /* * Set the ptrace bit in the process ptrace flags. */ diff --git a/security/commoncap.c b/security/commoncap.c index 33d343308413..0b6537a3672d 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -63,7 +63,8 @@ int cap_settime(struct timespec *ts, struct timezone *tz) return 0; } -int cap_ptrace (struct task_struct *parent, struct task_struct *child) +int cap_ptrace (struct task_struct *parent, struct task_struct *child, + unsigned int mode) { /* Derived from arch/i386/kernel/ptrace.c:sys_ptrace. */ if (!cap_issubset(child->cap_permitted, parent->cap_permitted) && diff --git a/security/dummy.c b/security/dummy.c index b8916883b77f..1db712d99dc7 100644 --- a/security/dummy.c +++ b/security/dummy.c @@ -30,7 +30,8 @@ #include #include -static int dummy_ptrace (struct task_struct *parent, struct task_struct *child) +static int dummy_ptrace (struct task_struct *parent, struct task_struct *child, + unsigned int mode) { return 0; } diff --git a/security/security.c b/security/security.c index 59838a99b80e..c4507ce2a5a0 100644 --- a/security/security.c +++ b/security/security.c @@ -161,9 +161,10 @@ int mod_reg_security(const char *name, struct security_operations *ops) /* Security operations */ -int security_ptrace(struct task_struct *parent, struct task_struct *child) +int security_ptrace(struct task_struct *parent, struct task_struct *child, + unsigned int mode) { - return security_ops->ptrace(parent, child); + return security_ops->ptrace(parent, child, mode); } int security_capget(struct task_struct *target, diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index eca70f42e678..4be156334b22 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -1686,14 +1686,23 @@ static inline u32 file_to_av(struct file *file) /* Hook functions begin here. 
*/ -static int selinux_ptrace(struct task_struct *parent, struct task_struct *child) +static int selinux_ptrace(struct task_struct *parent, + struct task_struct *child, + unsigned int mode) { int rc; - rc = secondary_ops->ptrace(parent, child); + rc = secondary_ops->ptrace(parent, child, mode); if (rc) return rc; + if (mode == PTRACE_MODE_READ) { + struct task_security_struct *tsec = parent->security; + struct task_security_struct *csec = child->security; + return avc_has_perm(tsec->sid, csec->sid, + SECCLASS_FILE, FILE__READ, NULL); + } + return task_has_perm(parent, child, PROCESS__PTRACE); } diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 4a09293efa00..3c7150b3493d 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -95,11 +95,12 @@ struct inode_smack *new_inode_smack(char *smack) * * Do the capability checks, and require read and write. */ -static int smack_ptrace(struct task_struct *ptp, struct task_struct *ctp) +static int smack_ptrace(struct task_struct *ptp, struct task_struct *ctp, + unsigned int mode) { int rc; - rc = cap_ptrace(ptp, ctp); + rc = cap_ptrace(ptp, ctp, mode); if (rc != 0) return rc; -- cgit v1.2.3 From 2069f457848f846cb31149c9aa29b330a6b66d1b Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Fri, 4 Jul 2008 09:47:13 +1000 Subject: LSM/SELinux: show LSM mount options in /proc/mounts This patch causes SELinux mount options to show up in /proc/mounts. As with other code in the area, seq_put errors are ignored. Other LSMs will not have their mount options displayed until they fill in their own security_sb_show_options() function. Signed-off-by: Eric Paris Signed-off-by: Miklos Szeredi Signed-off-by: James Morris --- fs/namespace.c | 14 +++++++++--- include/linux/security.h | 9 ++++++++ security/dummy.c | 6 ++++++ security/security.c | 5 +++++ security/selinux/hooks.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++- 5 files changed, 85 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/namespace.c b/fs/namespace.c index 4fc302c2a0e0..4f6f7635b59c 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -750,7 +750,7 @@ struct proc_fs_info { const char *str; }; -static void show_sb_opts(struct seq_file *m, struct super_block *sb) +static int show_sb_opts(struct seq_file *m, struct super_block *sb) { static const struct proc_fs_info fs_info[] = { { MS_SYNCHRONOUS, ",sync" }, @@ -764,6 +764,8 @@ static void show_sb_opts(struct seq_file *m, struct super_block *sb) if (sb->s_flags & fs_infop->flag) seq_puts(m, fs_infop->str); } + + return security_sb_show_options(m, sb); } static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt) @@ -806,11 +808,14 @@ static int show_vfsmnt(struct seq_file *m, void *v) seq_putc(m, ' '); show_type(m, mnt->mnt_sb); seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw"); - show_sb_opts(m, mnt->mnt_sb); + err = show_sb_opts(m, mnt->mnt_sb); + if (err) + goto out; show_mnt_opts(m, mnt); if (mnt->mnt_sb->s_op->show_options) err = mnt->mnt_sb->s_op->show_options(m, mnt); seq_puts(m, " 0 0\n"); +out: return err; } @@ -865,10 +870,13 @@ static int show_mountinfo(struct seq_file *m, void *v) seq_putc(m, ' '); mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); seq_puts(m, sb->s_flags & MS_RDONLY ?
" ro" : " rw"); - show_sb_opts(m, sb); + err = show_sb_opts(m, sb); + if (err) + goto out; if (sb->s_op->show_options) err = sb->s_op->show_options(m, mnt); seq_putc(m, '\n'); +out: return err; } diff --git a/include/linux/security.h b/include/linux/security.h index 62bd80cb7f87..c8ad8ec684b4 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -80,6 +80,7 @@ struct xfrm_selector; struct xfrm_policy; struct xfrm_state; struct xfrm_user_sec_ctx; +struct seq_file; extern int cap_netlink_send(struct sock *sk, struct sk_buff *skb); extern int cap_netlink_recv(struct sk_buff *skb, int cap); @@ -1331,6 +1332,7 @@ struct security_operations { void (*sb_free_security) (struct super_block *sb); int (*sb_copy_data) (char *orig, char *copy); int (*sb_kern_mount) (struct super_block *sb, void *data); + int (*sb_show_options) (struct seq_file *m, struct super_block *sb); int (*sb_statfs) (struct dentry *dentry); int (*sb_mount) (char *dev_name, struct path *path, char *type, unsigned long flags, void *data); @@ -1610,6 +1612,7 @@ int security_sb_alloc(struct super_block *sb); void security_sb_free(struct super_block *sb); int security_sb_copy_data(char *orig, char *copy); int security_sb_kern_mount(struct super_block *sb, void *data); +int security_sb_show_options(struct seq_file *m, struct super_block *sb); int security_sb_statfs(struct dentry *dentry); int security_sb_mount(char *dev_name, struct path *path, char *type, unsigned long flags, void *data); @@ -1887,6 +1890,12 @@ static inline int security_sb_kern_mount(struct super_block *sb, void *data) return 0; } +static inline int security_sb_show_options(struct seq_file *m, + struct super_block *sb) +{ + return 0; +} + static inline int security_sb_statfs(struct dentry *dentry) { return 0; diff --git a/security/dummy.c b/security/dummy.c index 1db712d99dc7..c155f08e9dd8 100644 --- a/security/dummy.c +++ b/security/dummy.c @@ -194,6 +194,11 @@ static int dummy_sb_kern_mount (struct super_block *sb, void *data) return 0; } +static int dummy_sb_show_options(struct seq_file *m, struct super_block *sb) +{ + return 0; +} + static int dummy_sb_statfs (struct dentry *dentry) { return 0; @@ -1088,6 +1093,7 @@ void security_fixup_ops (struct security_operations *ops) set_to_dummy_if_null(ops, sb_free_security); set_to_dummy_if_null(ops, sb_copy_data); set_to_dummy_if_null(ops, sb_kern_mount); + set_to_dummy_if_null(ops, sb_show_options); set_to_dummy_if_null(ops, sb_statfs); set_to_dummy_if_null(ops, sb_mount); set_to_dummy_if_null(ops, sb_check_sb); diff --git a/security/security.c b/security/security.c index 2c0a5876b939..de74fdccde26 100644 --- a/security/security.c +++ b/security/security.c @@ -292,6 +292,11 @@ int security_sb_kern_mount(struct super_block *sb, void *data) return security_ops->sb_kern_mount(sb, data); } +int security_sb_show_options(struct seq_file *m, struct super_block *sb) +{ + return security_ops->sb_show_options(m, sb); +} + int security_sb_statfs(struct dentry *dentry) { return security_ops->sb_statfs(dentry); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 85f74f665765..33dee83fdd2f 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -9,7 +9,8 @@ * James Morris * * Copyright (C) 2001,2002 Networks Associates Technology, Inc. - * Copyright (C) 2003 Red Hat, Inc., James Morris + * Copyright (C) 2003-2008 Red Hat, Inc., James Morris + * Eric Paris * Copyright (C) 2004-2005 Trusted Computer Solutions, Inc. 
* * Copyright (C) 2006, 2007 Hewlett-Packard Development Company, L.P. @@ -970,6 +971,57 @@ out_err: return rc; } +void selinux_write_opts(struct seq_file *m, struct security_mnt_opts *opts) +{ + int i; + char *prefix; + + for (i = 0; i < opts->num_mnt_opts; i++) { + char *has_comma = strchr(opts->mnt_opts[i], ','); + + switch (opts->mnt_opts_flags[i]) { + case CONTEXT_MNT: + prefix = CONTEXT_STR; + break; + case FSCONTEXT_MNT: + prefix = FSCONTEXT_STR; + break; + case ROOTCONTEXT_MNT: + prefix = ROOTCONTEXT_STR; + break; + case DEFCONTEXT_MNT: + prefix = DEFCONTEXT_STR; + break; + default: + BUG(); + }; + /* we need a comma before each option */ + seq_putc(m, ','); + seq_puts(m, prefix); + if (has_comma) + seq_putc(m, '\"'); + seq_puts(m, opts->mnt_opts[i]); + if (has_comma) + seq_putc(m, '\"'); + } +} + +static int selinux_sb_show_options(struct seq_file *m, struct super_block *sb) +{ + struct security_mnt_opts opts; + int rc; + + rc = selinux_get_mnt_opts(sb, &opts); + if (rc) + return rc; + + selinux_write_opts(m, &opts); + + security_free_mnt_opts(&opts); + + return rc; +} + static inline u16 inode_mode_to_security_class(umode_t mode) { switch (mode & S_IFMT) { @@ -5365,6 +5417,7 @@ static struct security_operations selinux_ops = { .sb_free_security = selinux_sb_free_security, .sb_copy_data = selinux_sb_copy_data, .sb_kern_mount = selinux_sb_kern_mount, + .sb_show_options = selinux_sb_show_options, .sb_statfs = selinux_sb_statfs, .sb_mount = selinux_mount, .sb_umount = selinux_umount, -- cgit v1.2.3 From b478a9f9889c81e88077d1495daadee64c0af541 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 3 Jul 2008 20:56:04 +0200 Subject: security: remove unused sb_get_mnt_opts hook The sb_get_mnt_opts() hook is unused, and is superseded by the sb_show_options() hook. Signed-off-by: Miklos Szeredi Acked-by: James Morris --- include/linux/security.h | 14 -------------- security/dummy.c | 8 -------- security/security.c | 6 ------ security/selinux/hooks.c | 1 - 4 files changed, 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index c8ad8ec684b4..43c6357568a3 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -291,10 +291,6 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * Update module state after a successful pivot. * @old_path contains the path for the old root. * @new_path contains the path for the new root. 
- * @sb_get_mnt_opts: - * Get the security relevant mount options used for a superblock - * @sb the superblock to get security mount options from - * @opts binary data structure containing all lsm mount data * @sb_set_mnt_opts: * Set the security relevant mount options used for a superblock * @sb the superblock to set security mount options for @@ -1348,8 +1344,6 @@ struct security_operations { struct path *new_path); void (*sb_post_pivotroot) (struct path *old_path, struct path *new_path); - int (*sb_get_mnt_opts) (const struct super_block *sb, - struct security_mnt_opts *opts); int (*sb_set_mnt_opts) (struct super_block *sb, struct security_mnt_opts *opts); void (*sb_clone_mnt_opts) (const struct super_block *oldsb, @@ -1624,8 +1618,6 @@ void security_sb_post_remount(struct vfsmount *mnt, unsigned long flags, void *d void security_sb_post_addmount(struct vfsmount *mnt, struct path *mountpoint); int security_sb_pivotroot(struct path *old_path, struct path *new_path); void security_sb_post_pivotroot(struct path *old_path, struct path *new_path); -int security_sb_get_mnt_opts(const struct super_block *sb, - struct security_mnt_opts *opts); int security_sb_set_mnt_opts(struct super_block *sb, struct security_mnt_opts *opts); void security_sb_clone_mnt_opts(const struct super_block *oldsb, struct super_block *newsb); @@ -1942,12 +1934,6 @@ static inline int security_sb_pivotroot(struct path *old_path, static inline void security_sb_post_pivotroot(struct path *old_path, struct path *new_path) { } -static inline int security_sb_get_mnt_opts(const struct super_block *sb, - struct security_mnt_opts *opts) -{ - security_init_mnt_opts(opts); - return 0; -} static inline int security_sb_set_mnt_opts(struct super_block *sb, struct security_mnt_opts *opts) diff --git a/security/dummy.c b/security/dummy.c index c155f08e9dd8..793856691641 100644 --- a/security/dummy.c +++ b/security/dummy.c @@ -252,13 +252,6 @@ static void dummy_sb_post_pivotroot (struct path *old_path, struct path *new_pat return; } -static int dummy_sb_get_mnt_opts(const struct super_block *sb, - struct security_mnt_opts *opts) -{ - security_init_mnt_opts(opts); - return 0; -} - static int dummy_sb_set_mnt_opts(struct super_block *sb, struct security_mnt_opts *opts) { @@ -1104,7 +1097,6 @@ void security_fixup_ops (struct security_operations *ops) set_to_dummy_if_null(ops, sb_post_addmount); set_to_dummy_if_null(ops, sb_pivotroot); set_to_dummy_if_null(ops, sb_post_pivotroot); - set_to_dummy_if_null(ops, sb_get_mnt_opts); set_to_dummy_if_null(ops, sb_set_mnt_opts); set_to_dummy_if_null(ops, sb_clone_mnt_opts); set_to_dummy_if_null(ops, sb_parse_opts_str); diff --git a/security/security.c b/security/security.c index de74fdccde26..28b2860c1129 100644 --- a/security/security.c +++ b/security/security.c @@ -348,12 +348,6 @@ void security_sb_post_pivotroot(struct path *old_path, struct path *new_path) security_ops->sb_post_pivotroot(old_path, new_path); } -int security_sb_get_mnt_opts(const struct super_block *sb, - struct security_mnt_opts *opts) -{ - return security_ops->sb_get_mnt_opts(sb, opts); -} - int security_sb_set_mnt_opts(struct super_block *sb, struct security_mnt_opts *opts) { diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 33dee83fdd2f..745a69e74e38 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -5421,7 +5421,6 @@ static struct security_operations selinux_ops = { .sb_statfs = selinux_sb_statfs, .sb_mount = selinux_mount, .sb_umount = selinux_umount, - .sb_get_mnt_opts = 
selinux_get_mnt_opts, .sb_set_mnt_opts = selinux_set_mnt_opts, .sb_clone_mnt_opts = selinux_sb_clone_mnt_opts, .sb_parse_opts_str = selinux_parse_opts_str, -- cgit v1.2.3 From 6f0f0fd496333777d53daff21a4e3b28c4d03a6d Mon Sep 17 00:00:00 2001 From: James Morris Date: Thu, 10 Jul 2008 17:02:07 +0900 Subject: security: remove register_security hook The register security hook is no longer required, as the capability module is always registered. LSMs wishing to stack capability as a secondary module should do so explicitly. Signed-off-by: James Morris Acked-by: Stephen Smalley Acked-by: Greg Kroah-Hartman --- include/linux/security.h | 10 ---------- security/capability.c | 7 ------- security/root_plug.c | 9 --------- security/security.c | 29 ----------------------------- security/selinux/hooks.c | 32 +++++--------------------------- security/smack/smack_lsm.c | 23 ----------------------- 6 files changed, 5 insertions(+), 105 deletions(-) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index 43c6357568a3..31c8851ec5d0 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1239,11 +1239,6 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @pages contains the number of pages. * Return 0 if permission is granted. * - * @register_security: - * allow module stacking. - * @name contains the name of the security module being stacked. - * @ops contains a pointer to the struct security_operations of the module to stack. - * * @secid_to_secctx: * Convert secid to security context. * @secid contains the security ID. @@ -1471,10 +1466,6 @@ struct security_operations { int (*netlink_send) (struct sock *sk, struct sk_buff *skb); int (*netlink_recv) (struct sk_buff *skb, int cap); - /* allow module stacking */ - int (*register_security) (const char *name, - struct security_operations *ops); - void (*d_instantiate) (struct dentry *dentry, struct inode *inode); int (*getprocattr) (struct task_struct *p, char *name, char **value); @@ -1564,7 +1555,6 @@ struct security_operations { extern int security_init(void); extern int security_module_enable(struct security_operations *ops); extern int register_security(struct security_operations *ops); -extern int mod_reg_security(const char *name, struct security_operations *ops); extern struct dentry *securityfs_create_file(const char *name, mode_t mode, struct dentry *parent, void *data, const struct file_operations *fops); diff --git a/security/capability.c b/security/capability.c index 6e0671c82018..5b01c0b02422 100644 --- a/security/capability.c +++ b/security/capability.c @@ -721,12 +721,6 @@ static int cap_xfrm_decode_session(struct sk_buff *skb, u32 *fl, int ckall) } #endif /* CONFIG_SECURITY_NETWORK_XFRM */ -static int cap_register_security(const char *name, - struct security_operations *ops) -{ - return -EINVAL; -} - static void cap_d_instantiate(struct dentry *dentry, struct inode *inode) { } @@ -940,7 +934,6 @@ void security_fixup_ops(struct security_operations *ops) set_to_cap_if_null(ops, sem_semop); set_to_cap_if_null(ops, netlink_send); set_to_cap_if_null(ops, netlink_recv); - set_to_cap_if_null(ops, register_security); set_to_cap_if_null(ops, d_instantiate); set_to_cap_if_null(ops, getprocattr); set_to_cap_if_null(ops, setprocattr); diff --git a/security/root_plug.c b/security/root_plug.c index a41cf42a4fa0..be0ebec2580b 100644 --- a/security/root_plug.c +++ b/security/root_plug.c @@ -28,9 +28,6 @@ #include #include -/* flag to keep track of how we were registered */ 
-static int secondary; - /* default is a generic type of usb to serial converter */ static int vendor_id = 0x0557; static int product_id = 0x2008; @@ -97,13 +94,7 @@ static int __init rootplug_init (void) if (register_security (&rootplug_security_ops)) { printk (KERN_INFO "Failure registering Root Plug module with the kernel\n"); - /* try registering with primary module */ - if (mod_reg_security (MY_NAME, &rootplug_security_ops)) { - printk (KERN_INFO "Failure registering Root Plug " - " module with primary security module.\n"); return -EINVAL; - } - secondary = 1; } printk (KERN_INFO "Root Plug module initialized, " "vendor_id = %4.4x, product id = %4.4x\n", vendor_id, product_id); diff --git a/security/security.c b/security/security.c index 30b0278de394..59f23b5918b3 100644 --- a/security/security.c +++ b/security/security.c @@ -125,35 +125,6 @@ int register_security(struct security_operations *ops) return 0; } -/** - * mod_reg_security - allows security modules to be "stacked" - * @name: a pointer to a string with the name of the security_options to be registered - * @ops: a pointer to the struct security_options that is to be registered - * - * This function allows security modules to be stacked if the currently loaded - * security module allows this to happen. It passes the @name and @ops to the - * register_security function of the currently loaded security module. - * - * The return value depends on the currently loaded security module, with 0 as - * success. - */ -int mod_reg_security(const char *name, struct security_operations *ops) -{ - if (verify(ops)) { - printk(KERN_INFO "%s could not verify " - "security operations.\n", __func__); - return -EINVAL; - } - - if (ops == security_ops) { - printk(KERN_INFO "%s security operations " - "already registered.\n", __func__); - return -EINVAL; - } - - return security_ops->register_security(name, ops); -} - /* Security operations */ int security_ptrace(struct task_struct *parent, struct task_struct *child, diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 745a69e74e38..91200feb3f9c 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -126,13 +126,11 @@ __setup("selinux=", selinux_enabled_setup); int selinux_enabled = 1; #endif -/* Original (dummy) security module. */ -static struct security_operations *original_ops; -/* Minimal support for a secondary security module, - just to allow the use of the dummy or capability modules. - The owlsm module can alternatively be used as a secondary - module as long as CONFIG_OWLSM_FD is not enabled. */ +/* + * Minimal support for a secondary security module, + * just to allow the use of the capability module. 
+ */ static struct security_operations *secondary_ops; /* Lists of inode and superblock security structures initialized @@ -5115,24 +5113,6 @@ static void selinux_ipc_getsecid(struct kern_ipc_perm *ipcp, u32 *secid) *secid = isec->sid; } -/* module stacking operations */ -static int selinux_register_security(const char *name, struct security_operations *ops) -{ - if (secondary_ops != original_ops) { - printk(KERN_ERR "%s: There is already a secondary security " - "module registered.\n", __func__); - return -EINVAL; - } - - secondary_ops = ops; - - printk(KERN_INFO "%s: Registering secondary module %s\n", - __func__, - name); - - return 0; -} - static void selinux_d_instantiate(struct dentry *dentry, struct inode *inode) { if (inode) @@ -5517,8 +5497,6 @@ static struct security_operations selinux_ops = { .sem_semctl = selinux_sem_semctl, .sem_semop = selinux_sem_semop, - .register_security = selinux_register_security, - .d_instantiate = selinux_d_instantiate, .getprocattr = selinux_getprocattr, @@ -5612,7 +5590,7 @@ static __init int selinux_init(void) 0, SLAB_PANIC, NULL); avc_init(); - original_ops = secondary_ops = security_ops; + secondary_ops = security_ops; if (!secondary_ops) panic("SELinux: No initial security operations\n"); if (register_security(&selinux_ops)) diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 3c7150b3493d..ee5a51cbc5eb 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -1822,27 +1822,6 @@ static void smack_ipc_getsecid(struct kern_ipc_perm *ipp, u32 *secid) *secid = smack_to_secid(smack); } -/* module stacking operations */ - -/** - * smack_register_security - stack capability module - * @name: module name - * @ops: module operations - ignored - * - * Allow the capability module to register. - */ -static int smack_register_security(const char *name, - struct security_operations *ops) -{ - if (strcmp(name, "capability") != 0) - return -EINVAL; - - printk(KERN_INFO "%s: Registering secondary module %s\n", - __func__, name); - - return 0; -} - /** * smack_d_instantiate - Make sure the blob is correct on an inode * @opt_dentry: unused @@ -2673,8 +2652,6 @@ struct security_operations smack_ops = { .netlink_send = cap_netlink_send, .netlink_recv = cap_netlink_recv, - .register_security = smack_register_security, - .d_instantiate = smack_d_instantiate, .getprocattr = smack_getprocattr, -- cgit v1.2.3 From 7e9db9eaefdb8798730790214ff1b7746006ec98 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Mon, 14 Jul 2008 09:58:44 +0200 Subject: [S390] cio: Introduce modalias for css bus. Add modalias and subchannel type attributes for all subchannels. I/O subchannel specific attributes are now created in io_subchannel_probe(). modalias and subchannel type are also added to the uevent for the css bus. Also make the css modalias known. 
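For reference, the alias is just "css:t" followed by the subchannel type as a single hex digit. A hedged sketch of producing it (the helper name is invented; the format string matches the modalias_show() and css_uevent() routines in the diff below):

#include <linux/kernel.h>
#include <linux/types.h>

/* Hypothetical helper: formats the alias userspace sees, e.g. "css:t0"
 * for an I/O subchannel of type 0. */
static void example_css_modalias(u8 st, char *buf, size_t len)
{
	snprintf(buf, len, "css:t%01X", st);
}

udev can then resolve a MODALIAS=css:tN uevent to a driver module whose alias was generated from its device table by file2alias.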
Signed-off-by: Cornelia Huck Signed-off-by: Martin Schwidefsky Signed-off-by: Heiko Carstens --- Documentation/ABI/testing/sysfs-bus-css | 35 +++++++++++++++++ drivers/s390/cio/cio.h | 1 + drivers/s390/cio/css.c | 69 ++++++++++++++++++++++++++++++--- drivers/s390/cio/css.h | 2 - drivers/s390/cio/device.c | 47 ++++++++++++++-------- include/linux/mod_devicetable.h | 9 +++++ scripts/mod/file2alias.c | 12 ++++++ 7 files changed, 151 insertions(+), 24 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-bus-css (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-bus-css b/Documentation/ABI/testing/sysfs-bus-css new file mode 100644 index 000000000000..b585ec258a08 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-bus-css @@ -0,0 +1,35 @@ +What: /sys/bus/css/devices/.../type +Date: March 2008 +Contact: Cornelia Huck + linux-s390@vger.kernel.org +Description: Contains the subchannel type, as reported by the hardware. + This attribute is present for all subchannel types. + +What: /sys/bus/css/devices/.../modalias +Date: March 2008 +Contact: Cornelia Huck + linux-s390@vger.kernel.org +Description: Contains the module alias as reported with uevents. + It is of the format css:t and present for all + subchannel types. + +What: /sys/bus/css/drivers/io_subchannel/.../chpids +Date: December 2002 +Contact: Cornelia Huck + linux-s390@vger.kernel.org +Description: Contains the ids of the channel paths used by this + subchannel, as reported by the channel subsystem + during subchannel recognition. + Note: This is an I/O-subchannel specific attribute. +Users: s390-tools, HAL + +What: /sys/bus/css/drivers/io_subchannel/.../pimpampom +Date: December 2002 +Contact: Cornelia Huck + linux-s390@vger.kernel.org +Description: Contains the PIM/PAM/POM values, as reported by the + channel subsystem when last queried by the common I/O + layer (this implies that this attribute is not neccessarily + in sync with the values current in the channel subsystem). + Note: This is an I/O-subchannel specific attribute. +Users: s390-tools, HAL diff --git a/drivers/s390/cio/cio.h b/drivers/s390/cio/cio.h index 6e933aebe013..4062748e8346 100644 --- a/drivers/s390/cio/cio.h +++ b/drivers/s390/cio/cio.h @@ -3,6 +3,7 @@ #include #include +#include #include #include "chsc.h" #include "schid.h" diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c index b7f4b52c5a9a..53e7496dc90c 100644 --- a/drivers/s390/cio/css.c +++ b/drivers/s390/cio/css.c @@ -2,8 +2,7 @@ * drivers/s390/cio/css.c * driver for channel subsystem * - * Copyright (C) 2002 IBM Deutschland Entwicklung GmbH, - * IBM Corporation + * Copyright IBM Corp. 
2002,2008 * Author(s): Arnd Bergmann (arndb@de.ibm.com) * Cornelia Huck (cornelia.huck@de.ibm.com) */ @@ -210,6 +209,41 @@ void css_update_ssd_info(struct subchannel *sch) } } +static ssize_t type_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct subchannel *sch = to_subchannel(dev); + + return sprintf(buf, "%01x\n", sch->st); +} + +static DEVICE_ATTR(type, 0444, type_show, NULL); + +static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct subchannel *sch = to_subchannel(dev); + + return sprintf(buf, "css:t%01X\n", sch->st); +} + +static DEVICE_ATTR(modalias, 0444, modalias_show, NULL); + +static struct attribute *subch_attrs[] = { + &dev_attr_type.attr, + &dev_attr_modalias.attr, + NULL, +}; + +static struct attribute_group subch_attr_group = { + .attrs = subch_attrs, +}; + +static struct attribute_group *default_subch_attr_groups[] = { + &subch_attr_group, + NULL, +}; + static int css_register_subchannel(struct subchannel *sch) { int ret; @@ -218,16 +252,17 @@ static int css_register_subchannel(struct subchannel *sch) sch->dev.parent = &channel_subsystems[0]->device; sch->dev.bus = &css_bus_type; sch->dev.release = &css_subchannel_release; - sch->dev.groups = subch_attr_groups; + sch->dev.groups = default_subch_attr_groups; /* * We don't want to generate uevents for I/O subchannels that don't * have a working ccw device behind them since they will be * unregistered before they can be used anyway, so we delay the add * uevent until after device recognition was successful. + * Note that we suppress the uevent for all subchannel types; + * the subchannel driver can decide itself when it wants to inform + * userspace of its existence. */ - if (!cio_is_console(sch->schid)) - /* Console is special, no need to suppress. */ - sch->dev.uevent_suppress = 1; + sch->dev.uevent_suppress = 1; css_update_ssd_info(sch); /* make it known to the system */ ret = css_sch_device_register(sch); @@ -236,6 +271,15 @@ static int css_register_subchannel(struct subchannel *sch) sch->schid.ssid, sch->schid.sch_no, ret); return ret; } + if (!sch->driver) { + /* + * No driver matched. Generate the uevent now so that + * a fitting driver module may be loaded based on the + * modalias. 
+ */ + sch->dev.uevent_suppress = 0; + kobject_uevent(&sch->dev.kobj, KOBJ_ADD); + } return ret; } @@ -926,12 +970,25 @@ static void css_shutdown(struct device *dev) sch->driver->shutdown(sch); } +static int css_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + struct subchannel *sch = to_subchannel(dev); + int ret; + + ret = add_uevent_var(env, "ST=%01X", sch->st); + if (ret) + return ret; + ret = add_uevent_var(env, "MODALIAS=css:t%01X", sch->st); + return ret; +} + struct bus_type css_bus_type = { .name = "css", .match = css_bus_match, .probe = css_probe, .remove = css_remove, .shutdown = css_shutdown, + .uevent = css_uevent, }; /** diff --git a/drivers/s390/cio/css.h b/drivers/s390/cio/css.h index bfe0ada43f2c..e0fc7b499784 100644 --- a/drivers/s390/cio/css.h +++ b/drivers/s390/cio/css.h @@ -143,6 +143,4 @@ int css_sch_is_valid(struct schib *); extern struct workqueue_struct *slow_path_wq; void css_wait_for_slow_path(void); - -extern struct attribute_group *subch_attr_groups[]; #endif diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c index 0ed5a81260bc..23b129fd4d8d 100644 --- a/drivers/s390/cio/device.c +++ b/drivers/s390/cio/device.c @@ -585,19 +585,14 @@ static DEVICE_ATTR(modalias, 0444, modalias_show, NULL); static DEVICE_ATTR(online, 0644, online_show, online_store); static DEVICE_ATTR(availability, 0444, available_show, NULL); -static struct attribute * subch_attrs[] = { +static struct attribute *io_subchannel_attrs[] = { &dev_attr_chpids.attr, &dev_attr_pimpampom.attr, NULL, }; -static struct attribute_group subch_attr_group = { - .attrs = subch_attrs, -}; - -struct attribute_group *subch_attr_groups[] = { - &subch_attr_group, - NULL, +static struct attribute_group io_subchannel_attr_group = { + .attrs = io_subchannel_attrs, }; static struct attribute * ccwdev_attrs[] = { @@ -1157,11 +1152,21 @@ static int io_subchannel_probe(struct subchannel *sch) cdev = sch_get_cdev(sch); if (cdev) { + rc = sysfs_create_group(&sch->dev.kobj, + &io_subchannel_attr_group); + if (rc) + CIO_MSG_EVENT(0, "Failed to create io subchannel " + "attributes for subchannel " + "0.%x.%04x (rc=%d)\n", + sch->schid.ssid, sch->schid.sch_no, rc); /* * This subchannel already has an associated ccw_device. - * Register it and exit. This happens for all early - * device, e.g. the console. + * Throw the delayed uevent for the subchannel, register + * the ccw_device and exit. This happens for all early + * devices, e.g. the console. */ + sch->dev.uevent_suppress = 0; + kobject_uevent(&sch->dev.kobj, KOBJ_ADD); cdev->dev.groups = ccwdev_attr_groups; device_initialize(&cdev->dev); ccw_device_register(cdev); @@ -1184,11 +1189,17 @@ static int io_subchannel_probe(struct subchannel *sch) */ dev_id.devno = sch->schib.pmcw.dev; dev_id.ssid = sch->schid.ssid; + rc = sysfs_create_group(&sch->dev.kobj, + &io_subchannel_attr_group); + if (rc) + return rc; /* Allocate I/O subchannel private data. 
*/ sch->private = kzalloc(sizeof(struct io_subchannel_private), GFP_KERNEL | GFP_DMA); - if (!sch->private) - return -ENOMEM; + if (!sch->private) { + rc = -ENOMEM; + goto out_err; + } cdev = get_disc_ccwdev_by_dev_id(&dev_id, NULL); if (!cdev) cdev = get_orphaned_ccwdev_by_dev_id(to_css(sch->dev.parent), @@ -1207,8 +1218,8 @@ static int io_subchannel_probe(struct subchannel *sch) } cdev = io_subchannel_create_ccwdev(sch); if (IS_ERR(cdev)) { - kfree(sch->private); - return PTR_ERR(cdev); + rc = PTR_ERR(cdev); + goto out_err; } rc = io_subchannel_recog(cdev, sch); if (rc) { @@ -1217,9 +1228,12 @@ static int io_subchannel_probe(struct subchannel *sch) spin_unlock_irqrestore(sch->lock, flags); if (cdev->dev.release) cdev->dev.release(&cdev->dev); - kfree(sch->private); + goto out_err; } - + return 0; +out_err: + kfree(sch->private); + sysfs_remove_group(&sch->dev.kobj, &io_subchannel_attr_group); return rc; } @@ -1240,6 +1254,7 @@ io_subchannel_remove (struct subchannel *sch) ccw_device_unregister(cdev); put_device(&cdev->dev); kfree(sch->private); + sysfs_remove_group(&sch->dev.kobj, &io_subchannel_attr_group); return 0; } diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 69b2342d5ebb..1fd03e732e07 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -159,6 +159,15 @@ struct ap_device_id { #define AP_DEVICE_ID_MATCH_DEVICE_TYPE 0x01 +/* s390 css bus devices (subchannels) */ +struct css_device_id { + __u8 type; /* subchannel type */ + __u8 pad1; + __u16 pad2; + __u32 pad3; + kernel_ulong_t driver_data; +}; + #define ACPI_ID_LEN 16 /* only 9 bytes needed here, 16 bytes are used */ /* to workaround crosscompile issues */ diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c index cea4a790e1e9..37d5c363fbcd 100644 --- a/scripts/mod/file2alias.c +++ b/scripts/mod/file2alias.c @@ -304,6 +304,14 @@ static int do_ap_entry(const char *filename, return 1; } +/* looks like: "css:tN" */ +static int do_css_entry(const char *filename, + struct css_device_id *id, char *alias) +{ + sprintf(alias, "css:t%01X", id->type); + return 1; +} + /* Looks like: "serio:tyNprNidNexN" */ static int do_serio_entry(const char *filename, struct serio_device_id *id, char *alias) @@ -680,6 +688,10 @@ void handle_moddevtable(struct module *mod, struct elf_info *info, do_table(symval, sym->st_size, sizeof(struct ap_device_id), "ap", do_ap_entry, mod); + else if (sym_is(symname, "__mod_css_device_table")) + do_table(symval, sym->st_size, + sizeof(struct css_device_id), "css", + do_css_entry, mod); else if (sym_is(symname, "__mod_serio_device_table")) do_table(symval, sym->st_size, sizeof(struct serio_device_id), "serio", -- cgit v1.2.3 From f08adc008d84f6b03d377ede951e29ed169e76e2 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Mon, 14 Jul 2008 09:59:03 +0200 Subject: [S390] css: Use css_device_id for bus matching. css_device_id exists, so use it for determining the right driver (and add a match_flags which is always 1 for valid types). 
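In practice a subchannel driver now declares a table of ids rather than a single type value. A minimal sketch for a hypothetical driver (names invented; the structure mirrors the io_subchannel_ids table in the diff below, and an entry with match_flags == 0 terminates the table):

#include <linux/mod_devicetable.h>
#include <linux/module.h>

static struct css_device_id example_subchannel_ids[] = {
	{ .match_flags = 0x1, .type = SUBCHANNEL_TYPE_IO, },
	{ /* end of list */ },
};
MODULE_DEVICE_TABLE(css, example_subchannel_ids);

static struct css_driver example_driver = {
	.owner = THIS_MODULE,
	.subchannel_type = example_subchannel_ids,
	.name = "example_subchannel",
};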
Signed-off-by: Cornelia Huck Signed-off-by: Martin Schwidefsky Signed-off-by: Heiko Carstens --- drivers/s390/cio/css.c | 15 ++++++--------- drivers/s390/cio/css.h | 2 +- drivers/s390/cio/device.c | 8 +++++++- include/linux/mod_devicetable.h | 2 +- 4 files changed, 15 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c index 45ba07c0a286..4e2f2bbf4ba5 100644 --- a/drivers/s390/cio/css.c +++ b/drivers/s390/cio/css.c @@ -850,19 +850,16 @@ int sch_is_pseudo_sch(struct subchannel *sch) return sch == to_css(sch->dev.parent)->pseudo_subchannel; } -/* - * find a driver for a subchannel. They identify by the subchannel - * type with the exception that the console subchannel driver has its own - * subchannel type although the device is an i/o subchannel - */ -static int -css_bus_match (struct device *dev, struct device_driver *drv) +static int css_bus_match(struct device *dev, struct device_driver *drv) { struct subchannel *sch = to_subchannel(dev); struct css_driver *driver = to_cssdriver(drv); + struct css_device_id *id; - if (sch->st == driver->subchannel_type) - return 1; + for (id = driver->subchannel_type; id->match_flags; id++) { + if (sch->st == id->type) + return 1; + } return 0; } diff --git a/drivers/s390/cio/css.h b/drivers/s390/cio/css.h index 38bf9ddb8412..58020bf41ed0 100644 --- a/drivers/s390/cio/css.h +++ b/drivers/s390/cio/css.h @@ -75,7 +75,7 @@ struct chp_link; */ struct css_driver { struct module *owner; - unsigned int subchannel_type; + struct css_device_id *subchannel_type; struct device_driver drv; void (*irq)(struct subchannel *); int (*chp_event)(struct subchannel *, struct chp_link *, int); diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c index 522d47afc950..c904cb84d75e 100644 --- a/drivers/s390/cio/device.c +++ b/drivers/s390/cio/device.c @@ -131,9 +131,15 @@ static int io_subchannel_sch_event(struct subchannel *, int); static int io_subchannel_chp_event(struct subchannel *, struct chp_link *, int); +static struct css_device_id io_subchannel_ids[] = { + { .match_flags = 0x1, .type = SUBCHANNEL_TYPE_IO, }, + { /* end of list */ }, +}; +MODULE_DEVICE_TABLE(css, io_subchannel_ids); + static struct css_driver io_subchannel_driver = { .owner = THIS_MODULE, - .subchannel_type = SUBCHANNEL_TYPE_IO, + .subchannel_type = io_subchannel_ids, .name = "io_subchannel", .irq = io_subchannel_irq, .sch_event = io_subchannel_sch_event, diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 1fd03e732e07..c4db5827963d 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -161,8 +161,8 @@ struct ap_device_id { /* s390 css bus devices (subchannels) */ struct css_device_id { + __u8 match_flags; __u8 type; /* subchannel type */ - __u8 pad1; __u16 pad2; __u32 pad3; kernel_ulong_t driver_data; -- cgit v1.2.3 From 341c2c958ec7bdd9f54733a8b0b432fe76842a82 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 20 May 2008 02:17:51 +0900 Subject: libata: consistently use msecs for time durations libata has been using a mix of jiffies and msecs for time durations. This is getting confusing. As writing sub-HZ values in jiffies is a PITA and msecs_to_jiffies() can't be used as an initializer, unify the unit for all time durations to msecs. So, durations are in msecs and deadlines are in jiffies. ata_deadline() is added to compute a deadline from a start time and a duration in msecs.
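The convention is easiest to see in a small sketch: durations stay plain msec constants (and so can be used as enum initializers), while a jiffies deadline is derived only at the point of use via the ata_deadline() helper added at the end of this patch. The polling function here is hypothetical:

#include <linux/delay.h>
#include <linux/jiffies.h>
#include <linux/libata.h>

enum { EXAMPLE_TMOUT = 5000 };	/* duration in msecs, no HZ arithmetic */

static void example_poll(void)
{
	/* deadline in jiffies, computed from start time + msec duration */
	unsigned long deadline = ata_deadline(jiffies, EXAMPLE_TMOUT);

	while (time_before(jiffies, deadline))
		msleep(50);	/* polling interval also given in msecs */
}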
While at it, drop the now-superfluous _msec suffix from arguments and rename @timeout to @deadline where it represents a fixed point in time rather than a duration. Signed-off-by: Tejun Heo Signed-off-by: Jeff Garzik --- drivers/ata/libata-core.c | 44 +++++++++++++++++++++----------------------- drivers/ata/libata-eh.c | 33 +++++++++++++++++---------------- drivers/ata/libata-pmp.c | 3 ++- drivers/ata/libata-sff.c | 15 ++++++++------- drivers/ata/pata_bf54x.c | 6 +++--- drivers/ata/pata_scc.c | 2 +- include/linux/libata.h | 26 ++++++++++++++++---------- 7 files changed, 68 insertions(+), 61 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 303fc0d2b978..c5c3b1b516e1 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -54,7 +54,6 @@ #include #include #include -#include #include #include #include @@ -145,7 +144,7 @@ static int libata_dma_mask = ATA_DMA_MASK_ATA|ATA_DMA_MASK_ATAPI|ATA_DMA_MASK_CF module_param_named(dma, libata_dma_mask, int, 0444); MODULE_PARM_DESC(dma, "DMA enable/disable (0x1==ATA, 0x2==ATAPI, 0x4==CF)"); -static int ata_probe_timeout = ATA_TMOUT_INTERNAL / HZ; +static int ata_probe_timeout = ATA_TMOUT_INTERNAL / 1000; module_param(ata_probe_timeout, int, 0444); MODULE_PARM_DESC(ata_probe_timeout, "Set ATA probing timeout (seconds)"); @@ -1533,7 +1532,7 @@ unsigned long ata_id_xfermask(const u16 *id) * @ap: The ata_port to queue port_task for * @fn: workqueue function to be scheduled * @data: data for @fn to use - * @delay: delay time for workqueue function + * @delay: delay time in msecs for workqueue function * * Schedule @fn(@data) for execution after @delay jiffies using * port_task. There is one port_task per port and it's the @@ -1552,7 +1551,7 @@ void ata_pio_queue_task(struct ata_port *ap, void *data, unsigned long delay) ap->port_task_data = data; /* may fail if ata_port_flush_task() in progress */ - queue_delayed_work(ata_wq, &ap->port_task, delay); + queue_delayed_work(ata_wq, &ap->port_task, msecs_to_jiffies(delay)); } /** @@ -1685,7 +1684,7 @@ unsigned ata_exec_internal_sg(struct ata_device *dev, spin_unlock_irqrestore(ap->lock, flags); if (!timeout) - timeout = ata_probe_timeout * 1000 / HZ; + timeout = ata_probe_timeout * 1000; rc = wait_for_completion_timeout(&wait, msecs_to_jiffies(timeout)); @@ -3319,7 +3318,7 @@ int ata_wait_ready(struct ata_link *link, unsigned long deadline, int (*check_ready)(struct ata_link *link)) { unsigned long start = jiffies; - unsigned long nodev_deadline = start + ATA_TMOUT_FF_WAIT; + unsigned long nodev_deadline = ata_deadline(start, ATA_TMOUT_FF_WAIT); int warned = 0; if (time_after(nodev_deadline, deadline)) @@ -3387,7 +3386,7 @@ int ata_wait_ready(struct ata_link *link, unsigned long deadline, int ata_wait_after_reset(struct ata_link *link, unsigned long deadline, int (*check_ready)(struct ata_link *link)) { - msleep(ATA_WAIT_AFTER_RESET_MSECS); + msleep(ATA_WAIT_AFTER_RESET); return ata_wait_ready(link, deadline, check_ready); } @@ -3417,13 +3416,13 @@ int ata_wait_after_reset(struct ata_link *link, unsigned long deadline, int sata_link_debounce(struct ata_link *link, const unsigned long *params, unsigned long deadline) { - unsigned long interval_msec = params[0]; - unsigned long duration = msecs_to_jiffies(params[1]); + unsigned long interval = params[0]; + unsigned long duration = params[1]; unsigned long last_jiffies, t; u32 last, cur; int rc; - t = jiffies + msecs_to_jiffies(params[2]); + t = ata_deadline(jiffies, params[2]); if
(time_before(t, deadline)) deadline = t; @@ -3435,7 +3434,7 @@ int sata_link_debounce(struct ata_link *link, const unsigned long *params, last_jiffies = jiffies; while (1) { - msleep(interval_msec); + msleep(interval); if ((rc = sata_scr_read(link, SCR_STATUS, &cur))) return rc; cur &= 0xf; @@ -3444,7 +3443,8 @@ int sata_link_debounce(struct ata_link *link, const unsigned long *params, if (cur == last) { if (cur == 1 && time_before(jiffies, deadline)) continue; - if (time_after(jiffies, last_jiffies + duration)) + if (time_after(jiffies, + ata_deadline(last_jiffies, duration))) return 0; continue; } @@ -3636,7 +3636,8 @@ int sata_link_hardreset(struct ata_link *link, const unsigned long *timing, if (check_ready) { unsigned long pmp_deadline; - pmp_deadline = jiffies + ATA_TMOUT_PMP_SRST_WAIT; + pmp_deadline = ata_deadline(jiffies, + ATA_TMOUT_PMP_SRST_WAIT); if (time_after(pmp_deadline, deadline)) pmp_deadline = deadline; ata_wait_ready(link, pmp_deadline, check_ready); @@ -6073,8 +6074,6 @@ static void __init ata_parse_force_param(void) static int __init ata_init(void) { - ata_probe_timeout *= HZ; - ata_parse_force_param(); ata_wq = create_workqueue("ata"); @@ -6127,8 +6126,8 @@ int ata_ratelimit(void) * @reg: IO-mapped register * @mask: Mask to apply to read register value * @val: Wait condition - * @interval_msec: polling interval in milliseconds - * @timeout_msec: timeout in milliseconds + * @interval: polling interval in milliseconds + * @timeout: timeout in milliseconds * * Waiting for some bits of register to change is a common * operation for ATA controllers. This function reads 32bit LE @@ -6146,10 +6145,9 @@ int ata_ratelimit(void) * The final register value. */ u32 ata_wait_register(void __iomem *reg, u32 mask, u32 val, - unsigned long interval_msec, - unsigned long timeout_msec) + unsigned long interval, unsigned long timeout) { - unsigned long timeout; + unsigned long deadline; u32 tmp; tmp = ioread32(reg); @@ -6158,10 +6156,10 @@ u32 ata_wait_register(void __iomem *reg, u32 mask, u32 val, * preceding writes reach the controller before starting to * eat away the timeout. */ - timeout = jiffies + (timeout_msec * HZ) / 1000; + deadline = ata_deadline(jiffies, timeout); - while ((tmp & mask) == val && time_before(jiffies, timeout)) { - msleep(interval_msec); + while ((tmp & mask) == val && time_before(jiffies, deadline)) { + msleep(interval); tmp = ioread32(reg); } diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c index 7894d83ea1eb..08dd07f10008 100644 --- a/drivers/ata/libata-eh.c +++ b/drivers/ata/libata-eh.c @@ -66,15 +66,14 @@ enum { ATA_ECAT_DUBIOUS_TOUT_HSM = 6, ATA_ECAT_DUBIOUS_UNK_DEV = 7, ATA_ECAT_NR = 8, -}; -/* Waiting in ->prereset can never be reliable. It's sometimes nice - * to wait there but it can't be depended upon; otherwise, we wouldn't - * be resetting. Just give it enough time for most drives to spin up. - */ -enum { - ATA_EH_PRERESET_TIMEOUT = 10 * HZ, - ATA_EH_FASTDRAIN_INTERVAL = 3 * HZ, + /* Waiting in ->prereset can never be reliable. It's + * sometimes nice to wait there but it can't be depended upon; + * otherwise, we wouldn't be resetting. Just give it enough + * time for most drives to spin up. + */ + ATA_EH_PRERESET_TIMEOUT = 10000, + ATA_EH_FASTDRAIN_INTERVAL = 3000, }; /* The following table determines how we sequence resets. Each entry @@ -84,10 +83,10 @@ enum { * are mostly for error handling, hotplug and retarded devices. 
*/ static const unsigned long ata_eh_reset_timeouts[] = { - 10 * HZ, /* most drives spin up by 10sec */ - 10 * HZ, /* > 99% working drives spin up before 20sec */ - 35 * HZ, /* give > 30 secs of idleness for retarded devices */ - 5 * HZ, /* and sweet one last chance */ + 10000, /* most drives spin up by 10sec */ + 10000, /* > 99% working drives spin up before 20sec */ + 35000, /* give > 30 secs of idleness for retarded devices */ + 5000, /* and sweet one last chance */ /* > 1 min has elapsed, give up */ }; @@ -641,7 +640,7 @@ void ata_eh_fastdrain_timerfn(unsigned long arg) /* some qcs have finished, give it another chance */ ap->fastdrain_cnt = cnt; ap->fastdrain_timer.expires = - jiffies + ATA_EH_FASTDRAIN_INTERVAL; + ata_deadline(jiffies, ATA_EH_FASTDRAIN_INTERVAL); add_timer(&ap->fastdrain_timer); } @@ -681,7 +680,8 @@ static void ata_eh_set_pending(struct ata_port *ap, int fastdrain) /* activate fast drain */ ap->fastdrain_cnt = cnt; - ap->fastdrain_timer.expires = jiffies + ATA_EH_FASTDRAIN_INTERVAL; + ap->fastdrain_timer.expires = + ata_deadline(jiffies, ATA_EH_FASTDRAIN_INTERVAL); add_timer(&ap->fastdrain_timer); } @@ -2125,7 +2125,8 @@ int ata_eh_reset(struct ata_link *link, int classify, } if (prereset) { - rc = prereset(link, jiffies + ATA_EH_PRERESET_TIMEOUT); + rc = prereset(link, + ata_deadline(jiffies, ATA_EH_PRERESET_TIMEOUT)); if (rc) { if (rc == -ENOENT) { ata_link_printk(link, KERN_DEBUG, @@ -2160,7 +2161,7 @@ int ata_eh_reset(struct ata_link *link, int classify, if (ata_is_host_link(link)) ata_eh_freeze_port(ap); - deadline = jiffies + ata_eh_reset_timeouts[try++]; + deadline = ata_deadline(jiffies, ata_eh_reset_timeouts[try++]); if (reset) { if (verbose) diff --git a/drivers/ata/libata-pmp.c b/drivers/ata/libata-pmp.c index 7daf4c0f6216..63691d77ac43 100644 --- a/drivers/ata/libata-pmp.c +++ b/drivers/ata/libata-pmp.c @@ -785,7 +785,8 @@ static int sata_pmp_eh_handle_disabled_links(struct ata_port *ap) * SError.N working. */ sata_link_hardreset(link, sata_deb_timing_normal, - jiffies + ATA_TMOUT_INTERNAL_QUICK, NULL, NULL); + ata_deadline(jiffies, ATA_TMOUT_INTERNAL_QUICK), + NULL, NULL); /* unconditionally clear SError.N */ rc = sata_scr_write(link, SCR_ERROR, SERR_PHYRDY_CHG); diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c index c0908c225483..304fdc6f1dc2 100644 --- a/drivers/ata/libata-sff.c +++ b/drivers/ata/libata-sff.c @@ -345,8 +345,8 @@ void ata_sff_dma_pause(struct ata_port *ap) /** * ata_sff_busy_sleep - sleep until BSY clears, or timeout * @ap: port containing status register to be polled - * @tmout_pat: impatience timeout - * @tmout: overall timeout + * @tmout_pat: impatience timeout in msecs + * @tmout: overall timeout in msecs * * Sleep until ATA Status register bit BSY clears, * or a timeout occurs. 
@@ -365,7 +365,7 @@ int ata_sff_busy_sleep(struct ata_port *ap, status = ata_sff_busy_wait(ap, ATA_BUSY, 300); timer_start = jiffies; - timeout = timer_start + tmout_pat; + timeout = ata_deadline(timer_start, tmout_pat); while (status != 0xff && (status & ATA_BUSY) && time_before(jiffies, timeout)) { msleep(50); @@ -377,7 +377,7 @@ int ata_sff_busy_sleep(struct ata_port *ap, "port is slow to respond, please be patient " "(Status 0x%x)\n", status); - timeout = timer_start + tmout; + timeout = ata_deadline(timer_start, tmout); while (status != 0xff && (status & ATA_BUSY) && time_before(jiffies, timeout)) { msleep(50); @@ -390,7 +390,7 @@ int ata_sff_busy_sleep(struct ata_port *ap, if (status & ATA_BUSY) { ata_port_printk(ap, KERN_ERR, "port failed to respond " "(%lu secs, Status 0x%x)\n", - tmout / HZ, status); + DIV_ROUND_UP(tmout, 1000), status); return -EBUSY; } @@ -1888,7 +1888,7 @@ int ata_sff_wait_after_reset(struct ata_link *link, unsigned int devmask, unsigned int dev1 = devmask & (1 << 1); int rc, ret = 0; - msleep(ATA_WAIT_AFTER_RESET_MSECS); + msleep(ATA_WAIT_AFTER_RESET); /* always check readiness of the master device */ rc = ata_sff_wait_ready(link, deadline); @@ -2371,7 +2371,8 @@ void ata_bus_reset(struct ata_port *ap) /* issue bus reset */ if (ap->flags & ATA_FLAG_SRST) { - rc = ata_bus_softreset(ap, devmask, jiffies + 40 * HZ); + rc = ata_bus_softreset(ap, devmask, + ata_deadline(jiffies, 40000)); if (rc && rc != -ENODEV) goto err_out; } diff --git a/drivers/ata/pata_bf54x.c b/drivers/ata/pata_bf54x.c index 55516103626a..d3932901a3b3 100644 --- a/drivers/ata/pata_bf54x.c +++ b/drivers/ata/pata_bf54x.c @@ -1011,7 +1011,7 @@ static void bfin_bus_post_reset(struct ata_port *ap, unsigned int devmask) void __iomem *base = (void __iomem *)ap->ioaddr.ctl_addr; unsigned int dev0 = devmask & (1 << 0); unsigned int dev1 = devmask & (1 << 1); - unsigned long timeout; + unsigned long deadline; /* if device 0 was found in ata_devchk, wait for its * BSY bit to clear @@ -1022,7 +1022,7 @@ static void bfin_bus_post_reset(struct ata_port *ap, unsigned int devmask) /* if device 1 was found in ata_devchk, wait for * register access, then wait for BSY to clear */ - timeout = jiffies + ATA_TMOUT_BOOT; + deadline = ata_deadline(jiffies, ATA_TMOUT_BOOT); while (dev1) { u8 nsect, lbal; @@ -1031,7 +1031,7 @@ static void bfin_bus_post_reset(struct ata_port *ap, unsigned int devmask) lbal = read_atapi_register(base, ATA_REG_LBAL); if ((nsect == 1) && (lbal == 1)) break; - if (time_after(jiffies, timeout)) { + if (time_after(jiffies, deadline)) { dev1 = 0; break; } diff --git a/drivers/ata/pata_scc.c b/drivers/ata/pata_scc.c index bbf5aa345e68..16673d168573 100644 --- a/drivers/ata/pata_scc.c +++ b/drivers/ata/pata_scc.c @@ -696,7 +696,7 @@ static void scc_bmdma_stop (struct ata_queued_cmd *qc) if (reg & INTSTS_BMSINT) { unsigned int classes; - unsigned long deadline = jiffies + ATA_TMOUT_BOOT; + unsigned long deadline = ata_deadline(jiffies, ATA_TMOUT_BOOT); printk(KERN_WARNING "%s: Internal Bus Error\n", DRV_NAME); out_be32(bmid_base + SCC_DMA_INTST, INTSTS_BMSINT); /* TBD: SW reset */ diff --git a/include/linux/libata.h b/include/linux/libata.h index e57e5d08312d..94110b652b30 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -27,6 +27,7 @@ #define __LINUX_LIBATA_H__ #include +#include #include #include #include @@ -115,7 +116,7 @@ enum { /* tag ATA_MAX_QUEUE - 1 is reserved for internal commands */ ATA_MAX_QUEUE = 32, ATA_TAG_INTERNAL = ATA_MAX_QUEUE - 1, - ATA_SHORT_PAUSE = (HZ 
>> 6) + 1, + ATA_SHORT_PAUSE = 16, ATAPI_MAX_DRAIN = 16 << 10, @@ -234,17 +235,17 @@ enum { /* bits 24:31 of host->flags are reserved for LLD specific flags */ /* various lengths of time */ - ATA_TMOUT_BOOT = 30 * HZ, /* heuristic */ - ATA_TMOUT_BOOT_QUICK = 7 * HZ, /* heuristic */ - ATA_TMOUT_INTERNAL = 30 * HZ, - ATA_TMOUT_INTERNAL_QUICK = 5 * HZ, + ATA_TMOUT_BOOT = 30000, /* heuristic */ + ATA_TMOUT_BOOT_QUICK = 7000, /* heuristic */ + ATA_TMOUT_INTERNAL = 30000, + ATA_TMOUT_INTERNAL_QUICK = 5000, /* FIXME: GoVault needs 2s but we can't afford that without * parallel probing. 800ms is enough for iVDR disk * HHD424020F7SV00. Increase to 2secs when parallel probing * is in place. */ - ATA_TMOUT_FF_WAIT = 4 * HZ / 5, + ATA_TMOUT_FF_WAIT = 800, /* Spec mandates to wait for ">= 2ms" before checking status * after reset. We wait 150ms, because that was the magic @@ -256,14 +257,14 @@ enum { * * Old drivers/ide uses the 2mS rule and then waits for ready. */ - ATA_WAIT_AFTER_RESET_MSECS = 150, + ATA_WAIT_AFTER_RESET = 150, /* If PMP is supported, we have to do follow-up SRST. As some * PMPs don't send D2H Reg FIS after hardreset, LLDs are * advised to wait only for the following duration before * doing SRST. */ - ATA_TMOUT_PMP_SRST_WAIT = 1 * HZ, + ATA_TMOUT_PMP_SRST_WAIT = 1000, /* ATA bus states */ BUS_UNKNOWN = 0, @@ -895,8 +896,7 @@ extern void ata_host_resume(struct ata_host *host); #endif extern int ata_ratelimit(void); extern u32 ata_wait_register(void __iomem *reg, u32 mask, u32 val, - unsigned long interval_msec, - unsigned long timeout_msec); + unsigned long interval, unsigned long timeout); extern int atapi_cmd_type(u8 opcode); extern void ata_tf_to_fis(const struct ata_taskfile *tf, u8 pmp, int is_cmd, u8 *fis); @@ -1389,6 +1389,12 @@ static inline int ata_check_ready(u8 status) return 0; } +static inline unsigned long ata_deadline(unsigned long from_jiffies, + unsigned long timeout_msecs) +{ + return from_jiffies + msecs_to_jiffies(timeout_msecs); +} + /************************************************************************** * PMP - drivers/ata/libata-pmp.c -- cgit v1.2.3 From 0a2c0f56159999e20015241d3b8fa89b1ab14309 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 20 May 2008 02:17:52 +0900 Subject: libata: improve EH retry delay handling EH retries were delayed by 5 seconds to ensure that resets don't occur back-to-back. However, this 5 second delay is superfluous or excessive in many cases. For example, after IDENTIFY times out, there's no reason to wait five more seconds before retrying. This patch adds an ehc->last_reset timestamp, records the time of the last reset attempt or success, and uses it to space resets by ATA_EH_RESET_COOL_DOWN, which is 5 secs, removing the unconditional 5 sec sleeps. As this change often makes inter-try waits shorter and the messages are redundant anyway, this patch also removes the "retrying..." messages. While at it, convert an explicit rounding-up division to DIV_ROUND_UP(). This change speeds up EH in many cases w/o sacrificing robustness.
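The gist of the change is a few lines of bookkeeping at the top of ata_eh_reset(); a condensed sketch, with all names and constants taken from the hunks below:

	unsigned long now = jiffies;
	unsigned long deadline = ata_deadline(ehc->last_reset,
					      ATA_EH_RESET_COOL_DOWN);

	/* Space reset attempts by ATA_EH_RESET_COOL_DOWN (5000 ms).
	 * ehc->last_reset is seeded to "a minute ago" in ata_scsi_error(),
	 * so the first reset of an EH session is never delayed.
	 */
	if (time_before(now, deadline))
		schedule_timeout_uninterruptible(deadline - now);

	ehc->last_reset = jiffies;	/* record this attempt */
	/* ... issue prereset/reset as before ... */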
Signed-off-by: Tejun Heo Signed-off-by: Jeff Garzik --- drivers/ata/libata-eh.c | 38 ++++++++++++++++++++------------------ drivers/ata/libata-pmp.c | 10 ---------- include/linux/libata.h | 2 ++ 3 files changed, 22 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c index 08dd07f10008..5b5ae631ed03 100644 --- a/drivers/ata/libata-eh.c +++ b/drivers/ata/libata-eh.c @@ -67,6 +67,9 @@ enum { ATA_ECAT_DUBIOUS_UNK_DEV = 7, ATA_ECAT_NR = 8, + /* always put at least this amount of time between resets */ + ATA_EH_RESET_COOL_DOWN = 5000, + /* Waiting in ->prereset can never be reliable. It's * sometimes nice to wait there but it can't be depended upon; * otherwise, we wouldn't be resetting. Just give it enough @@ -485,6 +488,9 @@ void ata_scsi_error(struct Scsi_Host *host) if (ata_ncq_enabled(dev)) ehc->saved_ncq_enabled |= 1 << devno; } + + /* set last reset timestamp to some time in the past */ + ehc->last_reset = jiffies - 60 * HZ; } ap->pflags |= ATA_PFLAG_EH_IN_PROGRESS; @@ -2088,11 +2094,17 @@ int ata_eh_reset(struct ata_link *link, int classify, /* * Prepare to reset */ + now = jiffies; + deadline = ata_deadline(ehc->last_reset, ATA_EH_RESET_COOL_DOWN); + if (time_before(now, deadline)) + schedule_timeout_uninterruptible(deadline - now); + spin_lock_irqsave(ap->lock, flags); ap->pflags |= ATA_PFLAG_RESETTING; spin_unlock_irqrestore(ap->lock, flags); ata_eh_about_to_do(link, NULL, ATA_EH_RESET); + ehc->last_reset = jiffies; ata_link_for_each_dev(dev, link) { /* If we issue an SRST then an ATA drive (not ATAPI) @@ -2158,6 +2170,7 @@ int ata_eh_reset(struct ata_link *link, int classify, /* * Perform reset */ + ehc->last_reset = jiffies; if (ata_is_host_link(link)) ata_eh_freeze_port(ap); @@ -2278,6 +2291,7 @@ int ata_eh_reset(struct ata_link *link, int classify, /* reset successful, schedule revalidation */ ata_eh_done(link, NULL, ATA_EH_RESET); + ehc->last_reset = jiffies; ehc->i.action |= ATA_EH_REVALIDATE; rc = 0; @@ -2304,9 +2318,9 @@ int ata_eh_reset(struct ata_link *link, int classify, if (time_before(now, deadline)) { unsigned long delta = deadline - now; - ata_link_printk(link, KERN_WARNING, "reset failed " - "(errno=%d), retrying in %u secs\n", - rc, (jiffies_to_msecs(delta) + 999) / 1000); + ata_link_printk(link, KERN_WARNING, + "reset failed (errno=%d), retrying in %u secs\n", + rc, DIV_ROUND_UP(jiffies_to_msecs(delta), 1000)); while (delta) delta = schedule_timeout_uninterruptible(delta); @@ -2623,7 +2637,7 @@ int ata_eh_recover(struct ata_port *ap, ata_prereset_fn_t prereset, { struct ata_link *link; struct ata_device *dev; - int nr_failed_devs, nr_disabled_devs; + int nr_failed_devs; int rc; unsigned long flags; @@ -2666,7 +2680,6 @@ int ata_eh_recover(struct ata_port *ap, ata_prereset_fn_t prereset, retry: rc = 0; nr_failed_devs = 0; - nr_disabled_devs = 0; /* if UNLOADING, finish immediately */ if (ap->pflags & ATA_PFLAG_UNLOADING) @@ -2733,8 +2746,7 @@ int ata_eh_recover(struct ata_port *ap, ata_prereset_fn_t prereset, dev_fail: nr_failed_devs++; - if (ata_eh_handle_dev_fail(dev, rc)) - nr_disabled_devs++; + ata_eh_handle_dev_fail(dev, rc); if (ap->pflags & ATA_PFLAG_FROZEN) { /* PMP reset requires working host port. 
@@ -2746,18 +2758,8 @@ dev_fail: } } - if (nr_failed_devs) { - if (nr_failed_devs != nr_disabled_devs) { - ata_port_printk(ap, KERN_WARNING, "failed to recover " - "some devices, retrying in 5 secs\n"); - ssleep(5); - } else { - /* no device left to recover, repeat fast */ - msleep(500); - } - + if (nr_failed_devs) goto retry; - } out: if (rc && r_failed_link) diff --git a/drivers/ata/libata-pmp.c b/drivers/ata/libata-pmp.c index 63691d77ac43..b65db309c181 100644 --- a/drivers/ata/libata-pmp.c +++ b/drivers/ata/libata-pmp.c @@ -727,19 +727,12 @@ static int sata_pmp_eh_recover_pmp(struct ata_port *ap, } if (tries) { - int sleep = ehc->i.flags & ATA_EHI_DID_RESET; - /* consecutive revalidation failures? speed down */ if (reval_failed) sata_down_spd_limit(link); else reval_failed = 1; - ata_dev_printk(dev, KERN_WARNING, - "retrying reset%s\n", - sleep ? " in 5 secs" : ""); - if (sleep) - ssleep(5); ehc->i.action |= ATA_EH_RESET; goto retry; } else { @@ -991,10 +984,7 @@ static int sata_pmp_eh_recover(struct ata_port *ap) goto retry; if (--pmp_tries) { - ata_port_printk(ap, KERN_WARNING, - "failed to recover PMP, retrying in 5 secs\n"); pmp_ehc->i.action |= ATA_EH_RESET; - ssleep(5); goto retry; } diff --git a/include/linux/libata.h b/include/linux/libata.h index 94110b652b30..9058c2a325a9 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -602,6 +602,8 @@ struct ata_eh_context { unsigned int did_probe_mask; unsigned int saved_ncq_enabled; u8 saved_xfer_mode[ATA_MAX_DEVICES]; + /* timestamp for the last reset attempt or success */ + unsigned long last_reset; }; struct ata_acpi_drive -- cgit v1.2.3 From 87fbc5a060faf2394bee88a93519f9b9d434727c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 20 May 2008 02:17:54 +0900 Subject: libata: improve EH internal command timeout handling ATA_TMOUT_INTERNAL, which was 30secs, was used for all internal commands; that is way too long when something goes wrong. This patch implements command-type-based stepped timeouts. Different command types can use different timeouts, and each command type can use different timeout values after successive timeouts, i.e. the initial timeout is set to a value which should cover most cases but is not so long that runaway cases delay things too much. After the first try times out, the second try can use a longer timeout, and if that one times out too, it can go for the full 30sec timeout. IDENTIFYs use 5s - 10s - 30s timeouts and all other commands use 5s - 10s timeouts. This patch significantly cuts down the time needed to handle failure cases while still allowing libata to work with nut job devices through retries.
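In code, the stepped timeouts reduce to a per-class timeout table plus a per-device index that is advanced on every timeout; a condensed sketch using the names from the hunks below (the full table also covers READ/SET NATIVE MAX, SET FEATURES and INIT DEV PARAMS):

	static const unsigned long ata_eh_identify_timeouts[] = {
		5000, 10000, 30000, ULONG_MAX,	/* 5s -> 10s -> 30s, then give up */
	};

	unsigned long ata_internal_cmd_timeout(struct ata_device *dev, u8 cmd)
	{
		int ent = ata_lookup_timeout_table(cmd);	/* command class */
		int idx;

		if (ent < 0)
			return ATA_EH_CMD_DFL_TIMEOUT;		/* 5000 ms */

		idx = dev->link->eh_context.cmd_timeout_idx[dev->devno][ent];
		return ata_eh_cmd_timeout_table[ent].timeouts[idx];
	}

	/* On timeout, ata_internal_cmd_timed_out() bumps the index unless the
	 * next entry is the ULONG_MAX sentinel, so later tries escalate. */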
Signed-off-by: Tejun Heo Signed-off-by: Jeff Garzik --- drivers/ata/libata-core.c | 16 ++++-- drivers/ata/libata-eh.c | 121 +++++++++++++++++++++++++++++++++++++++++++++- drivers/ata/libata.h | 2 + include/linux/libata.h | 8 ++- 4 files changed, 142 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index c5c3b1b516e1..9bef1a84fe3f 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -144,7 +144,7 @@ static int libata_dma_mask = ATA_DMA_MASK_ATA|ATA_DMA_MASK_ATAPI|ATA_DMA_MASK_CF module_param_named(dma, libata_dma_mask, int, 0444); MODULE_PARM_DESC(dma, "DMA enable/disable (0x1==ATA, 0x2==ATAPI, 0x4==CF)"); -static int ata_probe_timeout = ATA_TMOUT_INTERNAL / 1000; +static int ata_probe_timeout; module_param(ata_probe_timeout, int, 0444); MODULE_PARM_DESC(ata_probe_timeout, "Set ATA probing timeout (seconds)"); @@ -1611,6 +1611,7 @@ unsigned ata_exec_internal_sg(struct ata_device *dev, struct ata_link *link = dev->link; struct ata_port *ap = link->ap; u8 command = tf->command; + int auto_timeout = 0; struct ata_queued_cmd *qc; unsigned int tag, preempted_tag; u32 preempted_sactive, preempted_qc_active; @@ -1683,8 +1684,14 @@ unsigned ata_exec_internal_sg(struct ata_device *dev, spin_unlock_irqrestore(ap->lock, flags); - if (!timeout) - timeout = ata_probe_timeout * 1000; + if (!timeout) { + if (ata_probe_timeout) + timeout = ata_probe_timeout * 1000; + else { + timeout = ata_internal_cmd_timeout(dev, command); + auto_timeout = 1; + } + } rc = wait_for_completion_timeout(&wait, msecs_to_jiffies(timeout)); @@ -1760,6 +1767,9 @@ unsigned ata_exec_internal_sg(struct ata_device *dev, spin_unlock_irqrestore(ap->lock, flags); + if ((err_mask & AC_ERR_TIMEOUT) && auto_timeout) + ata_internal_cmd_timed_out(dev, command); + return err_mask; } diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c index 83d1451fa714..d5f03a6e3334 100644 --- a/drivers/ata/libata-eh.c +++ b/drivers/ata/libata-eh.c @@ -67,6 +67,8 @@ enum { ATA_ECAT_DUBIOUS_UNK_DEV = 7, ATA_ECAT_NR = 8, + ATA_EH_CMD_DFL_TIMEOUT = 5000, + /* always put at least this amount of time between resets */ ATA_EH_RESET_COOL_DOWN = 5000, @@ -93,6 +95,53 @@ static const unsigned long ata_eh_reset_timeouts[] = { ULONG_MAX, /* > 1 min has elapsed, give up */ }; +static const unsigned long ata_eh_identify_timeouts[] = { + 5000, /* covers > 99% of successes and not too boring on failures */ + 10000, /* combined time till here is enough even for media access */ + 30000, /* for true idiots */ + ULONG_MAX, +}; + +static const unsigned long ata_eh_other_timeouts[] = { + 5000, /* same rationale as identify timeout */ + 10000, /* ditto */ + /* but no merciful 30sec for other commands, it just isn't worth it */ + ULONG_MAX, +}; + +struct ata_eh_cmd_timeout_ent { + const u8 *commands; + const unsigned long *timeouts; +}; + +/* The following table determines timeouts to use for EH internal + * commands. Each table entry is a command class and matches the + * commands the entry applies to and the timeout table to use. + * + * On the retry after a command timed out, the next timeout value from + * the table is used. If the table doesn't contain further entries, + * the last value is used. + * + * ehc->cmd_timeout_idx keeps track of which timeout to use per + * command class, so if SET_FEATURES times out on the first try, the + * next try will use the second timeout value only for that class. + */ +#define CMDS(cmds...) 
(const u8 []){ cmds, 0 } +static const struct ata_eh_cmd_timeout_ent +ata_eh_cmd_timeout_table[ATA_EH_CMD_TIMEOUT_TABLE_SIZE] = { + { .commands = CMDS(ATA_CMD_ID_ATA, ATA_CMD_ID_ATAPI), + .timeouts = ata_eh_identify_timeouts, }, + { .commands = CMDS(ATA_CMD_READ_NATIVE_MAX, ATA_CMD_READ_NATIVE_MAX_EXT), + .timeouts = ata_eh_other_timeouts, }, + { .commands = CMDS(ATA_CMD_SET_MAX, ATA_CMD_SET_MAX_EXT), + .timeouts = ata_eh_other_timeouts, }, + { .commands = CMDS(ATA_CMD_SET_FEATURES), + .timeouts = ata_eh_other_timeouts, }, + { .commands = CMDS(ATA_CMD_INIT_DEV_PARAMS), + .timeouts = ata_eh_other_timeouts, }, +}; +#undef CMDS + static void __ata_port_freeze(struct ata_port *ap); #ifdef CONFIG_PM static void ata_eh_handle_port_suspend(struct ata_port *ap); @@ -238,6 +287,73 @@ void ata_port_pbar_desc(struct ata_port *ap, int bar, ssize_t offset, #endif /* CONFIG_PCI */ +static int ata_lookup_timeout_table(u8 cmd) +{ + int i; + + for (i = 0; i < ATA_EH_CMD_TIMEOUT_TABLE_SIZE; i++) { + const u8 *cur; + + for (cur = ata_eh_cmd_timeout_table[i].commands; *cur; cur++) + if (*cur == cmd) + return i; + } + + return -1; +} + +/** + * ata_internal_cmd_timeout - determine timeout for an internal command + * @dev: target device + * @cmd: internal command to be issued + * + * Determine timeout for internal command @cmd for @dev. + * + * LOCKING: + * EH context. + * + * RETURNS: + * Determined timeout. + */ +unsigned long ata_internal_cmd_timeout(struct ata_device *dev, u8 cmd) +{ + struct ata_eh_context *ehc = &dev->link->eh_context; + int ent = ata_lookup_timeout_table(cmd); + int idx; + + if (ent < 0) + return ATA_EH_CMD_DFL_TIMEOUT; + + idx = ehc->cmd_timeout_idx[dev->devno][ent]; + return ata_eh_cmd_timeout_table[ent].timeouts[idx]; +} + +/** + * ata_internal_cmd_timed_out - notification for internal command timeout + * @dev: target device + * @cmd: internal command which timed out + * + * Notify EH that internal command @cmd for @dev timed out. This + * function should be called only for commands whose timeouts are + * determined using ata_internal_cmd_timeout(). + * + * LOCKING: + * EH context. 
+ */ +void ata_internal_cmd_timed_out(struct ata_device *dev, u8 cmd) +{ + struct ata_eh_context *ehc = &dev->link->eh_context; + int ent = ata_lookup_timeout_table(cmd); + int idx; + + if (ent < 0) + return; + + idx = ehc->cmd_timeout_idx[dev->devno][ent]; + if (ata_eh_cmd_timeout_table[ent].timeouts[idx + 1] != ULONG_MAX) + ehc->cmd_timeout_idx[dev->devno][ent]++; +} + static void ata_ering_record(struct ata_ering *ering, unsigned int eflags, unsigned int err_mask) { @@ -2600,8 +2716,11 @@ static int ata_eh_handle_dev_fail(struct ata_device *dev, int err) ata_eh_detach_dev(dev); /* schedule probe if necessary */ - if (ata_eh_schedule_probe(dev)) + if (ata_eh_schedule_probe(dev)) { ehc->tries[dev->devno] = ATA_EH_DEV_TRIES; + memset(ehc->cmd_timeout_idx[dev->devno], 0, + sizeof(ehc->cmd_timeout_idx[dev->devno])); + } return 1; } else { diff --git a/drivers/ata/libata.h b/drivers/ata/libata.h index 1cf803adbc95..f6f9c28ec7f8 100644 --- a/drivers/ata/libata.h +++ b/drivers/ata/libata.h @@ -151,6 +151,8 @@ extern void ata_scsi_dev_rescan(struct work_struct *work); extern int ata_bus_probe(struct ata_port *ap); /* libata-eh.c */ +extern unsigned long ata_internal_cmd_timeout(struct ata_device *dev, u8 cmd); +extern void ata_internal_cmd_timed_out(struct ata_device *dev, u8 cmd); extern enum scsi_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd); extern void ata_scsi_error(struct Scsi_Host *host); extern void ata_port_wait_eh(struct ata_port *ap); diff --git a/include/linux/libata.h b/include/linux/libata.h index 9058c2a325a9..035f8e1cd0ac 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -237,7 +237,6 @@ enum { /* various lengths of time */ ATA_TMOUT_BOOT = 30000, /* heuristic */ ATA_TMOUT_BOOT_QUICK = 7000, /* heuristic */ - ATA_TMOUT_INTERNAL = 30000, ATA_TMOUT_INTERNAL_QUICK = 5000, /* FIXME: GoVault needs 2s but we can't afford that without @@ -341,6 +340,11 @@ enum { SATA_PMP_RW_TIMEOUT = 3000, /* PMP read/write timeout */ + /* This should match the actual table size of + * ata_eh_cmd_timeout_table in libata-eh.c. + */ + ATA_EH_CMD_TIMEOUT_TABLE_SIZE = 5, + /* Horkage types. May be set by libata or controller on drives (some horkage may be drive/controller pair dependant */ @@ -598,6 +602,8 @@ struct ata_eh_info { struct ata_eh_context { struct ata_eh_info i; int tries[ATA_MAX_DEVICES]; + int cmd_timeout_idx[ATA_MAX_DEVICES] + [ATA_EH_CMD_TIMEOUT_TABLE_SIZE]; unsigned int classes[ATA_MAX_DEVICES]; unsigned int did_probe_mask; unsigned int saved_ncq_enabled; -- cgit v1.2.3 From 18f7ba4c2f4be6b37d925931f04d6cc28d88d1ee Mon Sep 17 00:00:00 2001 From: Kristen Carlson Accardi Date: Tue, 3 Jun 2008 10:33:55 -0700 Subject: libata/ahci: enclosure management support Add Enclosure Management support to libata and ahci. 
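The LED message the driver exchanges with the enclosure processor is two 32-bit words: a header carrying the message size and a state word saying where and what to light. A condensed sketch of how ahci_transmit_led_message() composes and sends it, with register and field names taken from the hunks below:

	u32 header = 4 << 8;	/* byte 1 of the header holds the size: 4 bytes */

	/* state word: host port in the low bits, PM slot in bits 8..15,
	 * activity LED state in bit 16 */
	u32 state = (led_state & 0xfffffff0) | ap->port_no | (link->pmp << 8);

	writel(header, mmio + hpriv->em_loc);
	writel(state, mmio + hpriv->em_loc + 4);

	/* ask the controller to transmit, unless a message is still pending */
	em_ctl = readl(mmio + HOST_EM_CTL);
	if (!(em_ctl & EM_CTL_TM))
		writel(em_ctl | EM_CTL_TM, mmio + HOST_EM_CTL);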
Signed-off-by: Kristen Carlson Accardi Signed-off-by: Jeff Garzik --- drivers/ata/ahci.c | 321 +++++++++++++++++++++++++++++++++++++++++++++- drivers/ata/libata-scsi.c | 79 ++++++++++++ include/linux/libata.h | 21 +++ 3 files changed, 419 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index 5e6468a7ca4b..65d4e968feb4 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -56,6 +56,12 @@ MODULE_PARM_DESC(skip_host_reset, "skip global host reset (0=don't skip, 1=skip) static int ahci_enable_alpm(struct ata_port *ap, enum link_pm policy); static void ahci_disable_alpm(struct ata_port *ap); +static ssize_t ahci_led_show(struct ata_port *ap, char *buf); +static ssize_t ahci_led_store(struct ata_port *ap, const char *buf, + size_t size); +static ssize_t ahci_transmit_led_message(struct ata_port *ap, u32 state, + ssize_t size); +#define MAX_SLOTS 8 enum { AHCI_PCI_BAR = 5, @@ -98,6 +104,8 @@ enum { HOST_IRQ_STAT = 0x08, /* interrupt status */ HOST_PORTS_IMPL = 0x0c, /* bitmap of implemented ports */ HOST_VERSION = 0x10, /* AHCI spec. version compliancy */ + HOST_EM_LOC = 0x1c, /* Enclosure Management location */ + HOST_EM_CTL = 0x20, /* Enclosure Management Control */ /* HOST_CTL bits */ HOST_RESET = (1 << 0), /* reset controller; self-clear */ @@ -105,6 +113,7 @@ enum { HOST_AHCI_EN = (1 << 31), /* AHCI enabled */ /* HOST_CAP bits */ + HOST_CAP_EMS = (1 << 6), /* Enclosure Management support */ HOST_CAP_SSC = (1 << 14), /* Slumber capable */ HOST_CAP_PMP = (1 << 17), /* Port Multiplier support */ HOST_CAP_CLO = (1 << 24), /* Command List Override support */ @@ -202,6 +211,11 @@ enum { ATA_FLAG_IPM, ICH_MAP = 0x90, /* ICH MAP register */ + + /* em_ctl bits */ + EM_CTL_RST = (1 << 9), /* Reset */ + EM_CTL_TM = (1 << 8), /* Transmit Message */ + EM_CTL_ALHD = (1 << 26), /* Activity LED */ }; struct ahci_cmd_hdr { @@ -219,12 +233,21 @@ struct ahci_sg { __le32 flags_size; }; +struct ahci_em_priv { + enum sw_activity blink_policy; + struct timer_list timer; + unsigned long saved_activity; + unsigned long activity; + unsigned long led_state; +}; + struct ahci_host_priv { unsigned int flags; /* AHCI_HFLAG_* */ u32 cap; /* cap to use */ u32 port_map; /* port map to use */ u32 saved_cap; /* saved initial cap */ u32 saved_port_map; /* saved initial port_map */ + u32 em_loc; /* enclosure management location */ }; struct ahci_port_priv { @@ -240,6 +263,8 @@ struct ahci_port_priv { unsigned int ncq_saw_dmas:1; unsigned int ncq_saw_sdb:1; u32 intr_mask; /* interrupts to enable */ + struct ahci_em_priv em_priv[MAX_SLOTS];/* enclosure management info + * per PM slot */ }; static int ahci_scr_read(struct ata_port *ap, unsigned int sc_reg, u32 *val); @@ -277,9 +302,20 @@ static int ahci_port_suspend(struct ata_port *ap, pm_message_t mesg); static int ahci_pci_device_suspend(struct pci_dev *pdev, pm_message_t mesg); static int ahci_pci_device_resume(struct pci_dev *pdev); #endif +static ssize_t ahci_activity_show(struct ata_device *dev, char *buf); +static ssize_t ahci_activity_store(struct ata_device *dev, + enum sw_activity val); +static void ahci_init_sw_activity(struct ata_link *link); static struct device_attribute *ahci_shost_attrs[] = { &dev_attr_link_power_management_policy, + &dev_attr_em_message_type, + &dev_attr_em_message, + NULL +}; + +static struct device_attribute *ahci_sdev_attrs[] = { + &dev_attr_sw_activity, NULL }; @@ -289,6 +325,7 @@ static struct scsi_host_template ahci_sht = { .sg_tablesize = AHCI_MAX_SG, .dma_boundary = 
AHCI_DMA_BOUNDARY, .shost_attrs = ahci_shost_attrs, + .sdev_attrs = ahci_sdev_attrs, }; static struct ata_port_operations ahci_ops = { @@ -316,6 +353,10 @@ static struct ata_port_operations ahci_ops = { .enable_pm = ahci_enable_alpm, .disable_pm = ahci_disable_alpm, + .em_show = ahci_led_show, + .em_store = ahci_led_store, + .sw_activity_show = ahci_activity_show, + .sw_activity_store = ahci_activity_store, #ifdef CONFIG_PM .port_suspend = ahci_port_suspend, .port_resume = ahci_port_resume, @@ -561,6 +602,11 @@ static struct pci_driver ahci_pci_driver = { #endif }; +static int ahci_em_messages = 1; +module_param(ahci_em_messages, int, 0444); +/* add other LED protocol types when they become supported */ +MODULE_PARM_DESC(ahci_em_messages, + "Set AHCI Enclosure Management Message type (0 = disabled, 1 = LED"); static inline int ahci_nr_ports(u32 cap) { @@ -1031,11 +1077,28 @@ static void ahci_power_down(struct ata_port *ap) static void ahci_start_port(struct ata_port *ap) { + struct ahci_port_priv *pp = ap->private_data; + struct ata_link *link; + struct ahci_em_priv *emp; + /* enable FIS reception */ ahci_start_fis_rx(ap); /* enable DMA */ ahci_start_engine(ap); + + /* turn on LEDs */ + if (ap->flags & ATA_FLAG_EM) { + ata_port_for_each_link(link, ap) { + emp = &pp->em_priv[link->pmp]; + ahci_transmit_led_message(ap, emp->led_state, 4); + } + } + + if (ap->flags & ATA_FLAG_SW_ACTIVITY) + ata_port_for_each_link(link, ap) + ahci_init_sw_activity(link); + } static int ahci_deinit_port(struct ata_port *ap, const char **emsg) @@ -1116,6 +1179,230 @@ static int ahci_reset_controller(struct ata_host *host) return 0; } +static void ahci_sw_activity(struct ata_link *link) +{ + struct ata_port *ap = link->ap; + struct ahci_port_priv *pp = ap->private_data; + struct ahci_em_priv *emp = &pp->em_priv[link->pmp]; + + if (!(link->flags & ATA_LFLAG_SW_ACTIVITY)) + return; + + emp->activity++; + if (!timer_pending(&emp->timer)) + mod_timer(&emp->timer, jiffies + msecs_to_jiffies(10)); +} + +static void ahci_sw_activity_blink(unsigned long arg) +{ + struct ata_link *link = (struct ata_link *)arg; + struct ata_port *ap = link->ap; + struct ahci_port_priv *pp = ap->private_data; + struct ahci_em_priv *emp = &pp->em_priv[link->pmp]; + unsigned long led_message = emp->led_state; + u32 activity_led_state; + + led_message &= 0xffff0000; + led_message |= ap->port_no | (link->pmp << 8); + + /* check to see if we've had activity. If so, + * toggle state of LED and reset timer. If not, + * turn LED to desired idle state. 
+ */ + if (emp->saved_activity != emp->activity) { + emp->saved_activity = emp->activity; + /* get the current LED state */ + activity_led_state = led_message & 0x00010000; + + if (activity_led_state) + activity_led_state = 0; + else + activity_led_state = 1; + + /* clear old state */ + led_message &= 0xfff8ffff; + + /* toggle state */ + led_message |= (activity_led_state << 16); + mod_timer(&emp->timer, jiffies + msecs_to_jiffies(100)); + } else { + /* switch to idle */ + led_message &= 0xfff8ffff; + if (emp->blink_policy == BLINK_OFF) + led_message |= (1 << 16); + } + ahci_transmit_led_message(ap, led_message, 4); +} + +static void ahci_init_sw_activity(struct ata_link *link) +{ + struct ata_port *ap = link->ap; + struct ahci_port_priv *pp = ap->private_data; + struct ahci_em_priv *emp = &pp->em_priv[link->pmp]; + + /* init activity stats, setup timer */ + emp->saved_activity = emp->activity = 0; + setup_timer(&emp->timer, ahci_sw_activity_blink, (unsigned long)link); + + /* check our blink policy and set flag for link if it's enabled */ + if (emp->blink_policy) + link->flags |= ATA_LFLAG_SW_ACTIVITY; +} + +static int ahci_reset_em(struct ata_host *host) +{ + void __iomem *mmio = host->iomap[AHCI_PCI_BAR]; + u32 em_ctl; + + em_ctl = readl(mmio + HOST_EM_CTL); + if ((em_ctl & EM_CTL_TM) || (em_ctl & EM_CTL_RST)) + return -EINVAL; + + writel(em_ctl | EM_CTL_RST, mmio + HOST_EM_CTL); + return 0; +} + +static ssize_t ahci_transmit_led_message(struct ata_port *ap, u32 state, + ssize_t size) +{ + struct ahci_host_priv *hpriv = ap->host->private_data; + struct ahci_port_priv *pp = ap->private_data; + void __iomem *mmio = ap->host->iomap[AHCI_PCI_BAR]; + u32 em_ctl; + u32 message[] = {0, 0}; + unsigned int flags; + int pmp; + struct ahci_em_priv *emp; + + /* get the slot number from the message */ + pmp = (state & 0x0000ff00) >> 8; + if (pmp < MAX_SLOTS) + emp = &pp->em_priv[pmp]; + else + return -EINVAL; + + spin_lock_irqsave(ap->lock, flags); + + /* + * if we are still busy transmitting a previous message, + * do not allow + */ + em_ctl = readl(mmio + HOST_EM_CTL); + if (em_ctl & EM_CTL_TM) { + spin_unlock_irqrestore(ap->lock, flags); + return -EINVAL; + } + + /* + * create message header - this is all zero except for + * the message size, which is 4 bytes. 
+ */ + message[0] |= (4 << 8); + + /* ignore 0:4 of byte zero, fill in port info yourself */ + message[1] = ((state & 0xfffffff0) | ap->port_no); + + /* write message to EM_LOC */ + writel(message[0], mmio + hpriv->em_loc); + writel(message[1], mmio + hpriv->em_loc+4); + + /* save off new led state for port/slot */ + emp->led_state = message[1]; + + /* + * tell hardware to transmit the message + */ + writel(em_ctl | EM_CTL_TM, mmio + HOST_EM_CTL); + + spin_unlock_irqrestore(ap->lock, flags); + return size; +} + +static ssize_t ahci_led_show(struct ata_port *ap, char *buf) +{ + struct ahci_port_priv *pp = ap->private_data; + struct ata_link *link; + struct ahci_em_priv *emp; + int rc = 0; + + ata_port_for_each_link(link, ap) { + emp = &pp->em_priv[link->pmp]; + rc += sprintf(buf, "%lx\n", emp->led_state); + } + return rc; +} + +static ssize_t ahci_led_store(struct ata_port *ap, const char *buf, + size_t size) +{ + int state; + int pmp; + struct ahci_port_priv *pp = ap->private_data; + struct ahci_em_priv *emp; + + state = simple_strtoul(buf, NULL, 0); + + /* get the slot number from the message */ + pmp = (state & 0x0000ff00) >> 8; + if (pmp < MAX_SLOTS) + emp = &pp->em_priv[pmp]; + else + return -EINVAL; + + /* mask off the activity bits if we are in sw_activity + * mode, user should turn off sw_activity before setting + * activity led through em_message + */ + if (emp->blink_policy) + state &= 0xfff8ffff; + + return ahci_transmit_led_message(ap, state, size); +} + +static ssize_t ahci_activity_store(struct ata_device *dev, enum sw_activity val) +{ + struct ata_link *link = dev->link; + struct ata_port *ap = link->ap; + struct ahci_port_priv *pp = ap->private_data; + struct ahci_em_priv *emp = &pp->em_priv[link->pmp]; + u32 port_led_state = emp->led_state; + + /* save the desired Activity LED behavior */ + if (val == OFF) { + /* clear LFLAG */ + link->flags &= ~(ATA_LFLAG_SW_ACTIVITY); + + /* set the LED to OFF */ + port_led_state &= 0xfff80000; + port_led_state |= (ap->port_no | (link->pmp << 8)); + ahci_transmit_led_message(ap, port_led_state, 4); + } else { + link->flags |= ATA_LFLAG_SW_ACTIVITY; + if (val == BLINK_OFF) { + /* set LED to ON for idle */ + port_led_state &= 0xfff80000; + port_led_state |= (ap->port_no | (link->pmp << 8)); + port_led_state |= 0x00010000; /* check this */ + ahci_transmit_led_message(ap, port_led_state, 4); + } + } + emp->blink_policy = val; + return 0; +} + +static ssize_t ahci_activity_show(struct ata_device *dev, char *buf) +{ + struct ata_link *link = dev->link; + struct ata_port *ap = link->ap; + struct ahci_port_priv *pp = ap->private_data; + struct ahci_em_priv *emp = &pp->em_priv[link->pmp]; + + /* display the saved value of activity behavior for this + * disk. + */ + return sprintf(buf, "%d\n", emp->blink_policy); +} + static void ahci_port_init(struct pci_dev *pdev, struct ata_port *ap, int port_no, void __iomem *mmio, void __iomem *port_mmio) @@ -1848,6 +2135,8 @@ static unsigned int ahci_qc_issue(struct ata_queued_cmd *qc) writel(1 << qc->tag, port_mmio + PORT_CMD_ISSUE); readl(port_mmio + PORT_CMD_ISSUE); /* flush */ + ahci_sw_activity(qc->dev->link); + return 0; } @@ -2154,7 +2443,8 @@ static void ahci_print_info(struct ata_host *host) dev_printk(KERN_INFO, &pdev->dev, "flags: " "%s%s%s%s%s%s%s" - "%s%s%s%s%s%s%s\n" + "%s%s%s%s%s%s%s" + "%s\n" , cap & (1 << 31) ? "64bit " : "", @@ -2171,7 +2461,8 @@ static void ahci_print_info(struct ata_host *host) cap & (1 << 17) ? "pmp " : "", cap & (1 << 15) ? "pio " : "", cap & (1 << 14) ? 
"slum " : "", - cap & (1 << 13) ? "part " : "" + cap & (1 << 13) ? "part " : "", + cap & (1 << 6) ? "ems ": "" ); } @@ -2291,6 +2582,24 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) if (hpriv->cap & HOST_CAP_PMP) pi.flags |= ATA_FLAG_PMP; + if (ahci_em_messages && (hpriv->cap & HOST_CAP_EMS)) { + u8 messages; + void __iomem *mmio = pcim_iomap_table(pdev)[AHCI_PCI_BAR]; + u32 em_loc = readl(mmio + HOST_EM_LOC); + u32 em_ctl = readl(mmio + HOST_EM_CTL); + + messages = (em_ctl & 0x000f0000) >> 16; + + /* we only support LED message type right now */ + if ((messages & 0x01) && (ahci_em_messages == 1)) { + /* store em_loc */ + hpriv->em_loc = ((em_loc >> 16) * 4); + pi.flags |= ATA_FLAG_EM; + if (!(em_ctl & EM_CTL_ALHD)) + pi.flags |= ATA_FLAG_SW_ACTIVITY; + } + } + /* CAP.NP sometimes indicate the index of the last enabled * port, at other times, that of the last possible port, so * determining the maximum port number requires looking at @@ -2304,6 +2613,9 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) host->iomap = pcim_iomap_table(pdev); host->private_data = hpriv; + if (pi.flags & ATA_FLAG_EM) + ahci_reset_em(host); + for (i = 0; i < host->n_ports; i++) { struct ata_port *ap = host->ports[i]; @@ -2314,6 +2626,11 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) /* set initial link pm policy */ ap->pm_policy = NOT_AVAILABLE; + /* set enclosure management message type */ + if (ap->flags & ATA_FLAG_EM) + ap->em_message_type = ahci_em_messages; + + /* disabled/not-implemented port */ if (!(hpriv->port_map & (1 << i))) ap->ops = &ata_dummy_port_ops; diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 57a43649a461..b578b11caa7b 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -190,6 +190,85 @@ static void ata_scsi_set_sense(struct scsi_cmnd *cmd, u8 sk, u8 asc, u8 ascq) scsi_build_sense_buffer(0, cmd->sense_buffer, sk, asc, ascq); } +static ssize_t +ata_scsi_em_message_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct Scsi_Host *shost = class_to_shost(dev); + struct ata_port *ap = ata_shost_to_port(shost); + if (ap->ops->em_store && (ap->flags & ATA_FLAG_EM)) + return ap->ops->em_store(ap, buf, count); + return -EINVAL; +} + +static ssize_t +ata_scsi_em_message_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct Scsi_Host *shost = class_to_shost(dev); + struct ata_port *ap = ata_shost_to_port(shost); + + if (ap->ops->em_show && (ap->flags & ATA_FLAG_EM)) + return ap->ops->em_show(ap, buf); + return -EINVAL; +} +DEVICE_ATTR(em_message, S_IRUGO | S_IWUGO, + ata_scsi_em_message_show, ata_scsi_em_message_store); +EXPORT_SYMBOL_GPL(dev_attr_em_message); + +static ssize_t +ata_scsi_em_message_type_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct Scsi_Host *shost = class_to_shost(dev); + struct ata_port *ap = ata_shost_to_port(shost); + + return snprintf(buf, 23, "%d\n", ap->em_message_type); +} +DEVICE_ATTR(em_message_type, S_IRUGO, + ata_scsi_em_message_type_show, NULL); +EXPORT_SYMBOL_GPL(dev_attr_em_message_type); + +static ssize_t +ata_scsi_activity_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct scsi_device *sdev = to_scsi_device(dev); + struct ata_port *ap = ata_shost_to_port(sdev->host); + struct ata_device *atadev = ata_scsi_find_dev(ap, sdev); + + if (ap->ops->sw_activity_show && (ap->flags & 
ATA_FLAG_SW_ACTIVITY)) + return ap->ops->sw_activity_show(atadev, buf); + return -EINVAL; +} + +static ssize_t +ata_scsi_activity_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct scsi_device *sdev = to_scsi_device(dev); + struct ata_port *ap = ata_shost_to_port(sdev->host); + struct ata_device *atadev = ata_scsi_find_dev(ap, sdev); + enum sw_activity val; + int rc; + + if (ap->ops->sw_activity_store && (ap->flags & ATA_FLAG_SW_ACTIVITY)) { + val = simple_strtoul(buf, NULL, 0); + switch (val) { + case OFF: case BLINK_ON: case BLINK_OFF: + rc = ap->ops->sw_activity_store(atadev, val); + if (!rc) + return count; + else + return rc; + } + } + return -EINVAL; +} +DEVICE_ATTR(sw_activity, S_IWUGO | S_IRUGO, ata_scsi_activity_show, + ata_scsi_activity_store); +EXPORT_SYMBOL_GPL(dev_attr_sw_activity); + static void ata_scsi_invalid_field(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmnd *)) { diff --git a/include/linux/libata.h b/include/linux/libata.h index 035f8e1cd0ac..5b247b8a6b3b 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -169,6 +169,7 @@ enum { ATA_LFLAG_ASSUME_CLASS = ATA_LFLAG_ASSUME_ATA | ATA_LFLAG_ASSUME_SEMB, ATA_LFLAG_NO_RETRY = (1 << 5), /* don't retry this link */ ATA_LFLAG_DISABLED = (1 << 6), /* link is disabled */ + ATA_LFLAG_SW_ACTIVITY = (1 << 7), /* keep activity stats */ /* struct ata_port flags */ ATA_FLAG_SLAVE_POSS = (1 << 0), /* host supports slave dev */ @@ -191,6 +192,10 @@ enum { ATA_FLAG_AN = (1 << 18), /* controller supports AN */ ATA_FLAG_PMP = (1 << 19), /* controller supports PMP */ ATA_FLAG_IPM = (1 << 20), /* driver can handle IPM */ + ATA_FLAG_EM = (1 << 21), /* driver supports enclosure + * management */ + ATA_FLAG_SW_ACTIVITY = (1 << 22), /* driver supports sw activity + * led */ /* The following flag belongs to ap->pflags but is kept in * ap->flags because it's referenced in many LLDs and will be @@ -446,6 +451,15 @@ enum link_pm { MEDIUM_POWER, }; extern struct device_attribute dev_attr_link_power_management_policy; +extern struct device_attribute dev_attr_em_message_type; +extern struct device_attribute dev_attr_em_message; +extern struct device_attribute dev_attr_sw_activity; + +enum sw_activity { + OFF, + BLINK_ON, + BLINK_OFF, +}; #ifdef CONFIG_ATA_SFF struct ata_ioports { @@ -701,6 +715,7 @@ struct ata_port { struct timer_list fastdrain_timer; unsigned long fastdrain_cnt; + int em_message_type; void *private_data; #ifdef CONFIG_ATA_ACPI @@ -792,6 +807,12 @@ struct ata_port_operations { u8 (*bmdma_status)(struct ata_port *ap); #endif /* CONFIG_ATA_SFF */ + ssize_t (*em_show)(struct ata_port *ap, char *buf); + ssize_t (*em_store)(struct ata_port *ap, const char *message, + size_t size); + ssize_t (*sw_activity_show)(struct ata_device *dev, char *buf); + ssize_t (*sw_activity_store)(struct ata_device *dev, + enum sw_activity val); /* * Obsolete */ -- cgit v1.2.3 From 20a9b6e7c303f2a6f9afe17c0997bc9a3c734442 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 14 Jul 2008 22:38:22 +0200 Subject: i2c: Remove 3 deprecated bus drivers This patch contains the scheduled removal of i2c-i810, i2c-prosavage and i2c-savage4. 
Signed-off-by: Adrian Bunk Signed-off-by: Jean Delvare --- Documentation/feature-removal-schedule.txt | 7 - Documentation/i2c/busses/i2c-i810 | 47 ----- Documentation/i2c/busses/i2c-prosavage | 23 -- Documentation/i2c/busses/i2c-savage4 | 26 --- drivers/i2c/busses/Kconfig | 52 ----- drivers/i2c/busses/Makefile | 3 - drivers/i2c/busses/i2c-i810.c | 260 ----------------------- drivers/i2c/busses/i2c-prosavage.c | 325 ----------------------------- drivers/i2c/busses/i2c-savage4.c | 185 ---------------- include/linux/i2c-id.h | 1 - 10 files changed, 929 deletions(-) delete mode 100644 Documentation/i2c/busses/i2c-i810 delete mode 100644 Documentation/i2c/busses/i2c-prosavage delete mode 100644 Documentation/i2c/busses/i2c-savage4 delete mode 100644 drivers/i2c/busses/i2c-i810.c delete mode 100644 drivers/i2c/busses/i2c-prosavage.c delete mode 100644 drivers/i2c/busses/i2c-savage4.c (limited to 'include/linux') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 46ece3fba6f9..65a1482457a8 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -222,13 +222,6 @@ Who: Thomas Gleixner --------------------------- -What: i2c-i810, i2c-prosavage and i2c-savage4 -When: May 2008 -Why: These drivers are superseded by i810fb, intelfb and savagefb. -Who: Jean Delvare - ---------------------------- - What (Why): - include/linux/netfilter_ipv4/ipt_TOS.h ipt_tos.h header files (superseded by xt_TOS/xt_tos target & match) diff --git a/Documentation/i2c/busses/i2c-i810 b/Documentation/i2c/busses/i2c-i810 deleted file mode 100644 index 778210ee1583..000000000000 --- a/Documentation/i2c/busses/i2c-i810 +++ /dev/null @@ -1,47 +0,0 @@ -Kernel driver i2c-i810 - -Supported adapters: - * Intel 82810, 82810-DC100, 82810E, and 82815 (GMCH) - * Intel 82845G (GMCH) - -Authors: - Frodo Looijaard , - Philip Edelbrock , - Kyösti Mälkki , - Ralph Metzler , - Mark D. Studebaker - -Main contact: Mark Studebaker - -Description ------------ - -WARNING: If you have an '810' or '815' motherboard, your standard I2C -temperature sensors are most likely on the 801's I2C bus. You want the -i2c-i801 driver for those, not this driver. - -Now for the i2c-i810... - -The GMCH chip contains two I2C interfaces. - -The first interface is used for DDC (Data Display Channel) which is a -serial channel through the VGA monitor connector to a DDC-compliant -monitor. This interface is defined by the Video Electronics Standards -Association (VESA). The standards are available for purchase at -http://www.vesa.org . - -The second interface is a general-purpose I2C bus. It may be connected to a -TV-out chip such as the BT869 or possibly to a digital flat-panel display. - -Features --------- - -Both busses use the i2c-algo-bit driver for 'bit banging' -and support for specific transactions is provided by i2c-algo-bit. - -Issues ------- - -If you enable bus testing in i2c-algo-bit (insmod i2c-algo-bit bit_test=1), -the test may fail; if so, the i2c-i810 driver won't be inserted. However, -we think this has been fixed. 
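All three removed drivers are thin wrappers around the same i2c-algo-bit interface noted above: each supplies four line accessors that poke GPIO bits in the graphics chip, and the algorithm layer bit-bangs the I2C protocol through them. A minimal sketch of the pattern; the my_set*/my_get* callbacks are illustrative stand-ins for the register accessors visible in the deleted sources below:

	static struct i2c_algo_bit_data bit_data = {
		.setsda  = my_setsda,	/* drive SDA high/low */
		.setscl  = my_setscl,	/* drive SCL high/low */
		.getsda  = my_getsda,	/* sample SDA */
		.getscl  = my_getscl,	/* sample SCL */
		.udelay  = 10,		/* half-clock delay in us (CYCLE_DELAY) */
		.timeout = HZ / 2,	/* bus timeout (TIMEOUT) */
	};

	static struct i2c_adapter adapter = {
		.owner     = THIS_MODULE,
		.name      = "example bit-banged bus",
		.algo_data = &bit_data,
	};

	/* registration wires up the algorithm (and runs bit_test if enabled) */
	ret = i2c_bit_add_bus(&adapter);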
diff --git a/Documentation/i2c/busses/i2c-prosavage b/Documentation/i2c/busses/i2c-prosavage deleted file mode 100644 index 703687902511..000000000000 --- a/Documentation/i2c/busses/i2c-prosavage +++ /dev/null @@ -1,23 +0,0 @@ -Kernel driver i2c-prosavage - -Supported adapters: - - S3/VIA KM266/VT8375 aka ProSavage8 - S3/VIA KM133/VT8365 aka Savage4 - -Author: Henk Vergonet - -Description ------------ - -The Savage4 chips contain two I2C interfaces (aka a I2C 'master' or -'host'). - -The first interface is used for DDC (Data Display Channel) which is a -serial channel through the VGA monitor connector to a DDC-compliant -monitor. This interface is defined by the Video Electronics Standards -Association (VESA). The standards are available for purchase at -http://www.vesa.org . The second interface is a general-purpose I2C bus. - -Usefull for gaining access to the TV Encoder chips. - diff --git a/Documentation/i2c/busses/i2c-savage4 b/Documentation/i2c/busses/i2c-savage4 deleted file mode 100644 index 6ecceab618d3..000000000000 --- a/Documentation/i2c/busses/i2c-savage4 +++ /dev/null @@ -1,26 +0,0 @@ -Kernel driver i2c-savage4 - -Supported adapters: - * Savage4 - * Savage2000 - -Authors: - Alexander Wold , - Mark D. Studebaker - -Description ------------ - -The Savage4 chips contain two I2C interfaces (aka a I2C 'master' -or 'host'). - -The first interface is used for DDC (Data Display Channel) which is a -serial channel through the VGA monitor connector to a DDC-compliant -monitor. This interface is defined by the Video Electronics Standards -Association (VESA). The standards are available for purchase at -http://www.vesa.org . The DDC bus is not yet supported because its register -is not directly memory-mapped. - -The second interface is a general-purpose I2C bus. This is the only -interface supported by the driver at the moment. - diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index 00d76e13588f..b7cce9211838 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -186,26 +186,6 @@ config I2C_I801 This driver can also be built as a module. If so, the module will be called i2c-i801. -config I2C_I810 - tristate "Intel 810/815 (DEPRECATED)" - default n - depends on PCI - select I2C_ALGOBIT - help - If you say yes to this option, support will be included for the Intel - 810/815 family of mainboard I2C interfaces. Specifically, the - following versions of the chipset are supported: - i810AA - i810AB - i810E - i815 - i845G - - This driver is deprecated in favor of the i810fb and intelfb drivers. - - This driver can also be built as a module. If so, the module - will be called i2c-i810. - config I2C_PXA tristate "Intel PXA2XX I2C adapter (EXPERIMENTAL)" depends on EXPERIMENTAL && ARCH_PXA @@ -402,24 +382,6 @@ config I2C_PASEMI help Supports the PA Semi PWRficient on-chip SMBus interfaces. -config I2C_PROSAVAGE - tristate "S3/VIA (Pro)Savage (DEPRECATED)" - default n - depends on PCI - select I2C_ALGOBIT - help - If you say yes to this option, support will be included for the - I2C bus and DDC bus of the S3VIA embedded Savage4 and ProSavage8 - graphics processors. - chipsets supported: - S3/VIA KM266/VT8375 aka ProSavage8 - S3/VIA KM133/VT8365 aka Savage4 - - This driver is deprecated in favor of the savagefb driver. - - This support is also available as a module. If so, the module - will be called i2c-prosavage. 
- config I2C_S3C2410 tristate "S3C2410 I2C Driver" depends on ARCH_S3C2410 @@ -427,20 +389,6 @@ config I2C_S3C2410 Say Y here to include support for I2C controller in the Samsung S3C2410 based System-on-Chip devices. -config I2C_SAVAGE4 - tristate "S3 Savage 4 (DEPRECATED)" - default n - depends on PCI - select I2C_ALGOBIT - help - If you say yes to this option, support will be included for the - S3 Savage 4 I2C interface. - - This driver is deprecated in favor of the savagefb driver. - - This driver can also be built as a module. If so, the module - will be called i2c-savage4. - config I2C_SIBYTE tristate "SiByte SMBus interface" depends on SIBYTE_SB1xxx_SOC diff --git a/drivers/i2c/busses/Makefile b/drivers/i2c/busses/Makefile index 8b0a8c257905..81bb407d24cc 100644 --- a/drivers/i2c/busses/Makefile +++ b/drivers/i2c/busses/Makefile @@ -16,7 +16,6 @@ obj-$(CONFIG_I2C_ELEKTOR) += i2c-elektor.o obj-$(CONFIG_I2C_GPIO) += i2c-gpio.o obj-$(CONFIG_I2C_HYDRA) += i2c-hydra.o obj-$(CONFIG_I2C_I801) += i2c-i801.o -obj-$(CONFIG_I2C_I810) += i2c-i810.o obj-$(CONFIG_I2C_IBM_IIC) += i2c-ibm_iic.o obj-$(CONFIG_I2C_IOP3XX) += i2c-iop3xx.o obj-$(CONFIG_I2C_IXP2000) += i2c-ixp2000.o @@ -35,10 +34,8 @@ obj-$(CONFIG_I2C_PCA_PLATFORM) += i2c-pca-platform.o obj-$(CONFIG_I2C_PIIX4) += i2c-piix4.o obj-$(CONFIG_I2C_PMCMSP) += i2c-pmcmsp.o obj-$(CONFIG_I2C_PNX) += i2c-pnx.o -obj-$(CONFIG_I2C_PROSAVAGE) += i2c-prosavage.o obj-$(CONFIG_I2C_PXA) += i2c-pxa.o obj-$(CONFIG_I2C_S3C2410) += i2c-s3c2410.o -obj-$(CONFIG_I2C_SAVAGE4) += i2c-savage4.o obj-$(CONFIG_I2C_SH7760) += i2c-sh7760.o obj-$(CONFIG_I2C_SH_MOBILE) += i2c-sh_mobile.o obj-$(CONFIG_I2C_SIBYTE) += i2c-sibyte.o diff --git a/drivers/i2c/busses/i2c-i810.c b/drivers/i2c/busses/i2c-i810.c deleted file mode 100644 index 42e8d94c276f..000000000000 --- a/drivers/i2c/busses/i2c-i810.c +++ /dev/null @@ -1,260 +0,0 @@ -/* - i2c-i810.c - Part of lm_sensors, Linux kernel modules for hardware - monitoring - Copyright (c) 1998, 1999, 2000 Frodo Looijaard , - Philip Edelbrock , - Ralph Metzler , and - Mark D. Studebaker - - Based on code written by Ralph Metzler and - Simon Vogl - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -*/ -/* - This interfaces to the I810/I815 to provide access to - the DDC Bus and the I2C Bus. 
- - SUPPORTED DEVICES PCI ID - i810AA 7121 - i810AB 7123 - i810E 7125 - i815 1132 - i845G 2562 -*/ - -#include -#include -#include -#include -#include -#include -#include - -/* GPIO register locations */ -#define I810_IOCONTROL_OFFSET 0x5000 -#define I810_HVSYNC 0x00 /* not used */ -#define I810_GPIOA 0x10 -#define I810_GPIOB 0x14 - -/* bit locations in the registers */ -#define SCL_DIR_MASK 0x0001 -#define SCL_DIR 0x0002 -#define SCL_VAL_MASK 0x0004 -#define SCL_VAL_OUT 0x0008 -#define SCL_VAL_IN 0x0010 -#define SDA_DIR_MASK 0x0100 -#define SDA_DIR 0x0200 -#define SDA_VAL_MASK 0x0400 -#define SDA_VAL_OUT 0x0800 -#define SDA_VAL_IN 0x1000 - -/* initialization states */ -#define INIT1 0x1 -#define INIT2 0x2 -#define INIT3 0x4 - -/* delays */ -#define CYCLE_DELAY 10 -#define TIMEOUT (HZ / 2) - -static void __iomem *ioaddr; - -/* The i810 GPIO registers have individual masks for each bit - so we never have to read before writing. Nice. */ - -static void bit_i810i2c_setscl(void *data, int val) -{ - writel((val ? SCL_VAL_OUT : 0) | SCL_DIR | SCL_DIR_MASK | SCL_VAL_MASK, - ioaddr + I810_GPIOB); - readl(ioaddr + I810_GPIOB); /* flush posted write */ -} - -static void bit_i810i2c_setsda(void *data, int val) -{ - writel((val ? SDA_VAL_OUT : 0) | SDA_DIR | SDA_DIR_MASK | SDA_VAL_MASK, - ioaddr + I810_GPIOB); - readl(ioaddr + I810_GPIOB); /* flush posted write */ -} - -/* The GPIO pins are open drain, so the pins could always remain outputs. - However, some chip versions don't latch the inputs unless they - are set as inputs. - We rely on the i2c-algo-bit routines to set the pins high before - reading the input from other chips. Following guidance in the 815 - prog. ref. guide, we do a "dummy write" of 0 to the register before - reading which forces the input value to be latched. We presume this - applies to the 810 as well; shouldn't hurt anyway. This is necessary to get - i2c_algo_bit bit_test=1 to pass. */ - -static int bit_i810i2c_getscl(void *data) -{ - writel(SCL_DIR_MASK, ioaddr + I810_GPIOB); - writel(0, ioaddr + I810_GPIOB); - return (0 != (readl(ioaddr + I810_GPIOB) & SCL_VAL_IN)); -} - -static int bit_i810i2c_getsda(void *data) -{ - writel(SDA_DIR_MASK, ioaddr + I810_GPIOB); - writel(0, ioaddr + I810_GPIOB); - return (0 != (readl(ioaddr + I810_GPIOB) & SDA_VAL_IN)); -} - -static void bit_i810ddc_setscl(void *data, int val) -{ - writel((val ? SCL_VAL_OUT : 0) | SCL_DIR | SCL_DIR_MASK | SCL_VAL_MASK, - ioaddr + I810_GPIOA); - readl(ioaddr + I810_GPIOA); /* flush posted write */ -} - -static void bit_i810ddc_setsda(void *data, int val) -{ - writel((val ? 
SDA_VAL_OUT : 0) | SDA_DIR | SDA_DIR_MASK | SDA_VAL_MASK, - ioaddr + I810_GPIOA); - readl(ioaddr + I810_GPIOA); /* flush posted write */ -} - -static int bit_i810ddc_getscl(void *data) -{ - writel(SCL_DIR_MASK, ioaddr + I810_GPIOA); - writel(0, ioaddr + I810_GPIOA); - return (0 != (readl(ioaddr + I810_GPIOA) & SCL_VAL_IN)); -} - -static int bit_i810ddc_getsda(void *data) -{ - writel(SDA_DIR_MASK, ioaddr + I810_GPIOA); - writel(0, ioaddr + I810_GPIOA); - return (0 != (readl(ioaddr + I810_GPIOA) & SDA_VAL_IN)); -} - -static int config_i810(struct pci_dev *dev) -{ - unsigned long cadr; - - /* map I810 memory */ - cadr = dev->resource[1].start; - cadr += I810_IOCONTROL_OFFSET; - cadr &= PCI_BASE_ADDRESS_MEM_MASK; - ioaddr = ioremap_nocache(cadr, 0x1000); - if (ioaddr) { - bit_i810i2c_setscl(NULL, 1); - bit_i810i2c_setsda(NULL, 1); - bit_i810ddc_setscl(NULL, 1); - bit_i810ddc_setsda(NULL, 1); - return 0; - } - return -ENODEV; -} - -static struct i2c_algo_bit_data i810_i2c_bit_data = { - .setsda = bit_i810i2c_setsda, - .setscl = bit_i810i2c_setscl, - .getsda = bit_i810i2c_getsda, - .getscl = bit_i810i2c_getscl, - .udelay = CYCLE_DELAY, - .timeout = TIMEOUT, -}; - -static struct i2c_adapter i810_i2c_adapter = { - .owner = THIS_MODULE, - .id = I2C_HW_B_I810, - .name = "I810/I815 I2C Adapter", - .algo_data = &i810_i2c_bit_data, -}; - -static struct i2c_algo_bit_data i810_ddc_bit_data = { - .setsda = bit_i810ddc_setsda, - .setscl = bit_i810ddc_setscl, - .getsda = bit_i810ddc_getsda, - .getscl = bit_i810ddc_getscl, - .udelay = CYCLE_DELAY, - .timeout = TIMEOUT, -}; - -static struct i2c_adapter i810_ddc_adapter = { - .owner = THIS_MODULE, - .id = I2C_HW_B_I810, - .name = "I810/I815 DDC Adapter", - .algo_data = &i810_ddc_bit_data, -}; - -static struct pci_device_id i810_ids[] __devinitdata = { - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82810_IG1) }, - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82810_IG3) }, - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82810E_IG) }, - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82815_CGC) }, - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82845G_IG) }, - { 0, }, -}; - -MODULE_DEVICE_TABLE (pci, i810_ids); - -static int __devinit i810_probe(struct pci_dev *dev, const struct pci_device_id *id) -{ - int retval; - - retval = config_i810(dev); - if (retval) - return retval; - dev_info(&dev->dev, "i810/i815 i2c device found.\n"); - - /* set up the sysfs linkage to our parent device */ - i810_i2c_adapter.dev.parent = &dev->dev; - i810_ddc_adapter.dev.parent = &dev->dev; - - retval = i2c_bit_add_bus(&i810_i2c_adapter); - if (retval) - return retval; - retval = i2c_bit_add_bus(&i810_ddc_adapter); - if (retval) - i2c_del_adapter(&i810_i2c_adapter); - return retval; -} - -static void __devexit i810_remove(struct pci_dev *dev) -{ - i2c_del_adapter(&i810_ddc_adapter); - i2c_del_adapter(&i810_i2c_adapter); - iounmap(ioaddr); -} - -static struct pci_driver i810_driver = { - .name = "i810_smbus", - .id_table = i810_ids, - .probe = i810_probe, - .remove = __devexit_p(i810_remove), -}; - -static int __init i2c_i810_init(void) -{ - return pci_register_driver(&i810_driver); -} - -static void __exit i2c_i810_exit(void) -{ - pci_unregister_driver(&i810_driver); -} - -MODULE_AUTHOR("Frodo Looijaard , " - "Philip Edelbrock , " - "Ralph Metzler , " - "and Mark D. 
Studebaker "); -MODULE_DESCRIPTION("I810/I815 I2C/DDC driver"); -MODULE_LICENSE("GPL"); - -module_init(i2c_i810_init); -module_exit(i2c_i810_exit); diff --git a/drivers/i2c/busses/i2c-prosavage.c b/drivers/i2c/busses/i2c-prosavage.c deleted file mode 100644 index 07c1f1e27df1..000000000000 --- a/drivers/i2c/busses/i2c-prosavage.c +++ /dev/null @@ -1,325 +0,0 @@ -/* - * kernel/busses/i2c-prosavage.c - * - * i2c bus driver for S3/VIA 8365/8375 graphics processor. - * Copyright (c) 2003 Henk Vergonet - * Based on code written by: - * Frodo Looijaard , - * Philip Edelbrock , - * Ralph Metzler , and - * Mark D. Studebaker - * Simon Vogl - * and others - * - * Please read the lm_sensors documentation for details on use. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - */ -/* 18-05-2003 HVE - created - * 14-06-2003 HVE - adapted for lm_sensors2 - * 17-06-2003 HVE - linux 2.5.xx compatible - * 18-06-2003 HVE - codingstyle - * 21-06-2003 HVE - compatibility lm_sensors2 and linux 2.5.xx - * codingstyle, mmio enabled - * - * This driver interfaces to the I2C bus of the VIA north bridge embedded - * ProSavage4/8 devices. Usefull for gaining access to the TV Encoder chips. - * - * Graphics cores: - * S3/VIA KM266/VT8375 aka ProSavage8 - * S3/VIA KM133/VT8365 aka Savage4 - * - * Two serial busses are implemented: - * SERIAL1 - I2C serial communications interface - * SERIAL2 - DDC2 monitor communications interface - * - * Tested on a FX41 mainboard, see http://www.shuttle.com - * - * - * TODO: - * - integration with prosavage framebuffer device - * (Additional documentation needed :( - */ - -#include -#include -#include -#include -#include -#include - -/* - * driver configuration - */ -#define MAX_BUSSES 2 - -struct s_i2c_bus { - void __iomem *mmvga; - int i2c_reg; - int adap_ok; - struct i2c_adapter adap; - struct i2c_algo_bit_data algo; -}; - -struct s_i2c_chip { - void __iomem *mmio; - struct s_i2c_bus i2c_bus[MAX_BUSSES]; -}; - - -/* - * i2c configuration - */ -#define CYCLE_DELAY 10 -#define TIMEOUT (HZ / 2) - - -/* - * S3/VIA 8365/8375 registers - */ -#define VGA_CR_IX 0x3d4 -#define VGA_CR_DATA 0x3d5 - -#define CR_SERIAL1 0xa0 /* I2C serial communications interface */ -#define MM_SERIAL1 0xff20 -#define CR_SERIAL2 0xb1 /* DDC2 monitor communications interface */ - -/* based on vt8365 documentation */ -#define I2C_ENAB 0x10 -#define I2C_SCL_OUT 0x01 -#define I2C_SDA_OUT 0x02 -#define I2C_SCL_IN 0x04 -#define I2C_SDA_IN 0x08 - -#define SET_CR_IX(p, val) writeb((val), (p)->mmvga + VGA_CR_IX) -#define SET_CR_DATA(p, val) writeb((val), (p)->mmvga + VGA_CR_DATA) -#define GET_CR_DATA(p) readb((p)->mmvga + VGA_CR_DATA) - - -/* - * Serial bus line handling - * - * serial communications register as parameter in private data - * - * TODO: locks with other code sections accessing video registers? 
- */ -static void bit_s3via_setscl(void *bus, int val) -{ - struct s_i2c_bus *p = (struct s_i2c_bus *)bus; - unsigned int r; - - SET_CR_IX(p, p->i2c_reg); - r = GET_CR_DATA(p); - r |= I2C_ENAB; - if (val) { - r |= I2C_SCL_OUT; - } else { - r &= ~I2C_SCL_OUT; - } - SET_CR_DATA(p, r); -} - -static void bit_s3via_setsda(void *bus, int val) -{ - struct s_i2c_bus *p = (struct s_i2c_bus *)bus; - unsigned int r; - - SET_CR_IX(p, p->i2c_reg); - r = GET_CR_DATA(p); - r |= I2C_ENAB; - if (val) { - r |= I2C_SDA_OUT; - } else { - r &= ~I2C_SDA_OUT; - } - SET_CR_DATA(p, r); -} - -static int bit_s3via_getscl(void *bus) -{ - struct s_i2c_bus *p = (struct s_i2c_bus *)bus; - - SET_CR_IX(p, p->i2c_reg); - return (0 != (GET_CR_DATA(p) & I2C_SCL_IN)); -} - -static int bit_s3via_getsda(void *bus) -{ - struct s_i2c_bus *p = (struct s_i2c_bus *)bus; - - SET_CR_IX(p, p->i2c_reg); - return (0 != (GET_CR_DATA(p) & I2C_SDA_IN)); -} - - -/* - * adapter initialisation - */ -static int i2c_register_bus(struct pci_dev *dev, struct s_i2c_bus *p, void __iomem *mmvga, u32 i2c_reg) -{ - int ret; - p->adap.owner = THIS_MODULE; - p->adap.id = I2C_HW_B_S3VIA; - p->adap.algo_data = &p->algo; - p->adap.dev.parent = &dev->dev; - p->algo.setsda = bit_s3via_setsda; - p->algo.setscl = bit_s3via_setscl; - p->algo.getsda = bit_s3via_getsda; - p->algo.getscl = bit_s3via_getscl; - p->algo.udelay = CYCLE_DELAY; - p->algo.timeout = TIMEOUT; - p->algo.data = p; - p->mmvga = mmvga; - p->i2c_reg = i2c_reg; - - ret = i2c_bit_add_bus(&p->adap); - if (ret) { - return ret; - } - - p->adap_ok = 1; - return 0; -} - - -/* - * Cleanup stuff - */ -static void prosavage_remove(struct pci_dev *dev) -{ - struct s_i2c_chip *chip; - int i, ret; - - chip = (struct s_i2c_chip *)pci_get_drvdata(dev); - - if (!chip) { - return; - } - for (i = MAX_BUSSES - 1; i >= 0; i--) { - if (chip->i2c_bus[i].adap_ok == 0) - continue; - - ret = i2c_del_adapter(&chip->i2c_bus[i].adap); - if (ret) { - dev_err(&dev->dev, "%s not removed\n", - chip->i2c_bus[i].adap.name); - } - } - if (chip->mmio) { - iounmap(chip->mmio); - } - kfree(chip); -} - - -/* - * Detect chip and initialize it - */ -static int __devinit prosavage_probe(struct pci_dev *dev, const struct pci_device_id *id) -{ - int ret; - unsigned long base, len; - struct s_i2c_chip *chip; - struct s_i2c_bus *bus; - - pci_set_drvdata(dev, kzalloc(sizeof(struct s_i2c_chip), GFP_KERNEL)); - chip = (struct s_i2c_chip *)pci_get_drvdata(dev); - if (chip == NULL) { - return -ENOMEM; - } - - base = dev->resource[0].start & PCI_BASE_ADDRESS_MEM_MASK; - len = dev->resource[0].end - base + 1; - chip->mmio = ioremap_nocache(base, len); - - if (chip->mmio == NULL) { - dev_err(&dev->dev, "ioremap failed\n"); - prosavage_remove(dev); - return -ENODEV; - } - - - /* - * Chip initialisation - */ - /* Unlock Extended IO Space ??? 
*/ - - - /* - * i2c bus registration - */ - bus = &chip->i2c_bus[0]; - snprintf(bus->adap.name, sizeof(bus->adap.name), - "ProSavage I2C bus at %02x:%02x.%x", - dev->bus->number, PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn)); - ret = i2c_register_bus(dev, bus, chip->mmio + 0x8000, CR_SERIAL1); - if (ret) { - goto err_adap; - } - /* - * ddc bus registration - */ - bus = &chip->i2c_bus[1]; - snprintf(bus->adap.name, sizeof(bus->adap.name), - "ProSavage DDC bus at %02x:%02x.%x", - dev->bus->number, PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn)); - ret = i2c_register_bus(dev, bus, chip->mmio + 0x8000, CR_SERIAL2); - if (ret) { - goto err_adap; - } - return 0; -err_adap: - dev_err(&dev->dev, "%s failed\n", bus->adap.name); - prosavage_remove(dev); - return ret; -} - - -/* - * Data for PCI driver interface - */ -static struct pci_device_id prosavage_pci_tbl[] = { - { PCI_DEVICE(PCI_VENDOR_ID_S3, PCI_DEVICE_ID_S3_SAVAGE4) }, - { PCI_DEVICE(PCI_VENDOR_ID_S3, PCI_DEVICE_ID_S3_PROSAVAGE8) }, - { 0, }, -}; - -MODULE_DEVICE_TABLE (pci, prosavage_pci_tbl); - -static struct pci_driver prosavage_driver = { - .name = "prosavage_smbus", - .id_table = prosavage_pci_tbl, - .probe = prosavage_probe, - .remove = prosavage_remove, -}; - -static int __init i2c_prosavage_init(void) -{ - return pci_register_driver(&prosavage_driver); -} - -static void __exit i2c_prosavage_exit(void) -{ - pci_unregister_driver(&prosavage_driver); -} - -MODULE_DEVICE_TABLE(pci, prosavage_pci_tbl); -MODULE_AUTHOR("Henk Vergonet"); -MODULE_DESCRIPTION("ProSavage VIA 8365/8375 smbus driver"); -MODULE_LICENSE("GPL"); - -module_init (i2c_prosavage_init); -module_exit (i2c_prosavage_exit); diff --git a/drivers/i2c/busses/i2c-savage4.c b/drivers/i2c/busses/i2c-savage4.c deleted file mode 100644 index 8adf4abaa035..000000000000 --- a/drivers/i2c/busses/i2c-savage4.c +++ /dev/null @@ -1,185 +0,0 @@ -/* - i2c-savage4.c - Part of lm_sensors, Linux kernel modules for hardware - monitoring - Copyright (C) 1998-2003 The LM Sensors Team - Alexander Wold - Mark D. Studebaker - - Based on i2c-voodoo3.c. - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -*/ - -/* This interfaces to the I2C bus of the Savage4 to gain access to - the BT869 and possibly other I2C devices. The DDC bus is not - yet supported because its register is not memory-mapped. 
-*/ - -#include -#include -#include -#include -#include -#include -#include - -/* device IDs */ -#define PCI_CHIP_SAVAGE4 0x8A22 -#define PCI_CHIP_SAVAGE2000 0x9102 - -#define REG 0xff20 /* Serial Port 1 Register */ - -/* bit locations in the register */ -#define I2C_ENAB 0x00000020 -#define I2C_SCL_OUT 0x00000001 -#define I2C_SDA_OUT 0x00000002 -#define I2C_SCL_IN 0x00000008 -#define I2C_SDA_IN 0x00000010 - -/* delays */ -#define CYCLE_DELAY 10 -#define TIMEOUT (HZ / 2) - - -static void __iomem *ioaddr; - -/* The sav GPIO registers don't have individual masks for each bit - so we always have to read before writing. */ - -static void bit_savi2c_setscl(void *data, int val) -{ - unsigned int r; - r = readl(ioaddr + REG); - if(val) - r |= I2C_SCL_OUT; - else - r &= ~I2C_SCL_OUT; - writel(r, ioaddr + REG); - readl(ioaddr + REG); /* flush posted write */ -} - -static void bit_savi2c_setsda(void *data, int val) -{ - unsigned int r; - r = readl(ioaddr + REG); - if(val) - r |= I2C_SDA_OUT; - else - r &= ~I2C_SDA_OUT; - writel(r, ioaddr + REG); - readl(ioaddr + REG); /* flush posted write */ -} - -/* The GPIO pins are open drain, so the pins always remain outputs. - We rely on the i2c-algo-bit routines to set the pins high before - reading the input from other chips. */ - -static int bit_savi2c_getscl(void *data) -{ - return (0 != (readl(ioaddr + REG) & I2C_SCL_IN)); -} - -static int bit_savi2c_getsda(void *data) -{ - return (0 != (readl(ioaddr + REG) & I2C_SDA_IN)); -} - -/* Configures the chip */ - -static int config_s4(struct pci_dev *dev) -{ - unsigned long cadr; - - /* map memory */ - cadr = dev->resource[0].start; - cadr &= PCI_BASE_ADDRESS_MEM_MASK; - ioaddr = ioremap_nocache(cadr, 0x0080000); - if (ioaddr) { - /* writel(0x8160, ioaddr + REG2); */ - writel(0x00000020, ioaddr + REG); - dev_info(&dev->dev, "Using Savage4 at %p\n", ioaddr); - return 0; - } - return -ENODEV; -} - -static struct i2c_algo_bit_data sav_i2c_bit_data = { - .setsda = bit_savi2c_setsda, - .setscl = bit_savi2c_setscl, - .getsda = bit_savi2c_getsda, - .getscl = bit_savi2c_getscl, - .udelay = CYCLE_DELAY, - .timeout = TIMEOUT -}; - -static struct i2c_adapter savage4_i2c_adapter = { - .owner = THIS_MODULE, - .id = I2C_HW_B_SAVAGE, - .name = "I2C Savage4 adapter", - .algo_data = &sav_i2c_bit_data, -}; - -static struct pci_device_id savage4_ids[] __devinitdata = { - { PCI_DEVICE(PCI_VENDOR_ID_S3, PCI_CHIP_SAVAGE4) }, - { PCI_DEVICE(PCI_VENDOR_ID_S3, PCI_CHIP_SAVAGE2000) }, - { 0, } -}; - -MODULE_DEVICE_TABLE (pci, savage4_ids); - -static int __devinit savage4_probe(struct pci_dev *dev, const struct pci_device_id *id) -{ - int retval; - - retval = config_s4(dev); - if (retval) - return retval; - - /* set up the sysfs linkage to our parent device */ - savage4_i2c_adapter.dev.parent = &dev->dev; - - return i2c_bit_add_bus(&savage4_i2c_adapter); -} - -static void __devexit savage4_remove(struct pci_dev *dev) -{ - i2c_del_adapter(&savage4_i2c_adapter); - iounmap(ioaddr); -} - -static struct pci_driver savage4_driver = { - .name = "savage4_smbus", - .id_table = savage4_ids, - .probe = savage4_probe, - .remove = __devexit_p(savage4_remove), -}; - -static int __init i2c_savage4_init(void) -{ - return pci_register_driver(&savage4_driver); -} - -static void __exit i2c_savage4_exit(void) -{ - pci_unregister_driver(&savage4_driver); -} - -MODULE_AUTHOR("Alexander Wold " - "and Mark D. 
Studebaker "); -MODULE_DESCRIPTION("Savage4 I2C/SMBus driver"); -MODULE_LICENSE("GPL"); - -module_init(i2c_savage4_init); -module_exit(i2c_savage4_exit); diff --git a/include/linux/i2c-id.h b/include/linux/i2c-id.h index 580acc93903e..988e566d3ed5 100644 --- a/include/linux/i2c-id.h +++ b/include/linux/i2c-id.h @@ -111,7 +111,6 @@ #define I2C_HW_B_RIVA 0x010010 /* Riva based graphics cards */ #define I2C_HW_B_IOC 0x010011 /* IOC bit-wiggling */ #define I2C_HW_B_IXP2000 0x010016 /* GPIO on IXP2000 systems */ -#define I2C_HW_B_S3VIA 0x010018 /* S3Via ProSavage adapter */ #define I2C_HW_B_ZR36067 0x010019 /* Zoran-36057/36067 based boards */ #define I2C_HW_B_PCILYNX 0x01001a /* TI PCILynx I2C adapter */ #define I2C_HW_B_CX2388x 0x01001b /* connexant 2388x based tv cards */ -- cgit v1.2.3 From 67c2e66571c383404a5acd08189194da660da942 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Mon, 14 Jul 2008 22:38:23 +0200 Subject: i2c: Delete unused function i2c_smbus_write_quick Function i2c_smbus_write_quick has no users left, so we can delete it. Also update the list of these helper functions which are gone but could be added back if needed. Signed-off-by: Jean Delvare --- Documentation/i2c/smbus-protocol | 4 ++-- Documentation/i2c/writing-clients | 14 +++++++------- drivers/i2c/i2c-core.c | 7 ------- include/linux/i2c.h | 1 - 4 files changed, 9 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/Documentation/i2c/smbus-protocol b/Documentation/i2c/smbus-protocol index 03f08fb491cc..24bfb65da17d 100644 --- a/Documentation/i2c/smbus-protocol +++ b/Documentation/i2c/smbus-protocol @@ -42,8 +42,8 @@ Count (8 bits): A data byte containing the length of a block operation. [..]: Data sent by I2C device, as opposed to data sent by the host adapter. -SMBus Quick Command: i2c_smbus_write_quick() -============================================= +SMBus Quick Command +=================== This sends a single bit to the device, at the place of the Rd/Wr bit. diff --git a/Documentation/i2c/writing-clients b/Documentation/i2c/writing-clients index ba5d1971f35f..63722d3c9cdf 100644 --- a/Documentation/i2c/writing-clients +++ b/Documentation/i2c/writing-clients @@ -569,7 +569,6 @@ SMBus communication in terms of it. Never use this function directly! 
- extern s32 i2c_smbus_write_quick(struct i2c_client * client, u8 value); extern s32 i2c_smbus_read_byte(struct i2c_client * client); extern s32 i2c_smbus_write_byte(struct i2c_client * client, u8 value); extern s32 i2c_smbus_read_byte_data(struct i2c_client * client, u8 command); @@ -578,20 +577,21 @@ SMBus communication extern s32 i2c_smbus_read_word_data(struct i2c_client * client, u8 command); extern s32 i2c_smbus_write_word_data(struct i2c_client * client, u8 command, u16 value); + extern s32 i2c_smbus_read_block_data(struct i2c_client * client, + u8 command, u8 *values); extern s32 i2c_smbus_write_block_data(struct i2c_client * client, u8 command, u8 length, u8 *values); extern s32 i2c_smbus_read_i2c_block_data(struct i2c_client * client, u8 command, u8 length, u8 *values); - -These ones were removed in Linux 2.6.10 because they had no users, but could -be added back later if needed: - - extern s32 i2c_smbus_read_block_data(struct i2c_client * client, - u8 command, u8 *values); extern s32 i2c_smbus_write_i2c_block_data(struct i2c_client * client, u8 command, u8 length, u8 *values); + +These ones were removed from i2c-core because they had no users, but could +be added back later if needed: + + extern s32 i2c_smbus_write_quick(struct i2c_client * client, u8 value); extern s32 i2c_smbus_process_call(struct i2c_client * client, u8 command, u16 value); extern s32 i2c_smbus_block_process_call(struct i2c_client *client, diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c index 937f1dcbf3d7..3695a4a1ab77 100644 --- a/drivers/i2c/i2c-core.c +++ b/drivers/i2c/i2c-core.c @@ -1303,13 +1303,6 @@ static int i2c_smbus_check_pec(u8 cpec, struct i2c_msg *msg) return 0; } -s32 i2c_smbus_write_quick(struct i2c_client *client, u8 value) -{ - return i2c_smbus_xfer(client->adapter,client->addr,client->flags, - value,0,I2C_SMBUS_QUICK,NULL); -} -EXPORT_SYMBOL(i2c_smbus_write_quick); - s32 i2c_smbus_read_byte(struct i2c_client *client) { union i2c_smbus_data data; diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 8dc730132192..b3695f353f79 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -71,7 +71,6 @@ extern s32 i2c_smbus_xfer (struct i2c_adapter * adapter, u16 addr, /* Now follow the 'nice' access routines. These also document the calling conventions of smbus_access. */ -extern s32 i2c_smbus_write_quick(struct i2c_client * client, u8 value); extern s32 i2c_smbus_read_byte(struct i2c_client * client); extern s32 i2c_smbus_write_byte(struct i2c_client * client, u8 value); extern s32 i2c_smbus_read_byte_data(struct i2c_client * client, u8 command); -- cgit v1.2.3 From ae7193f7fa3e1735ab70807eb6e35a2a6575623f Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Mon, 14 Jul 2008 22:38:24 +0200 Subject: i2c: Update stray references to smbus_access That function is actually named i2c_smbus_xfer. Signed-off-by: Jean Delvare --- include/linux/i2c.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index b3695f353f79..7c36d5188d39 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -69,7 +69,7 @@ extern s32 i2c_smbus_xfer (struct i2c_adapter * adapter, u16 addr, union i2c_smbus_data * data); /* Now follow the 'nice' access routines. These also document the calling - conventions of smbus_access. */ + conventions of i2c_smbus_xfer. 
*/ extern s32 i2c_smbus_read_byte(struct i2c_client * client); extern s32 i2c_smbus_write_byte(struct i2c_client * client, u8 value); @@ -536,7 +536,7 @@ union i2c_smbus_data { /* and one more for user-space compatibility */ }; -/* smbus_access read or write markers */ +/* i2c_smbus_xfer read or write markers */ #define I2C_SMBUS_READ 1 #define I2C_SMBUS_WRITE 0 -- cgit v1.2.3 From c1b6b4f2342d073698dfc2547240c35045a1d00e Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Mon, 14 Jul 2008 22:38:28 +0200 Subject: i2c: Let framebuffer drivers set their I2C bus class to DDC Let framebuffer drivers set their I2C bus class to DDC. Once this is done, we will be able to tell the eeprom driver to only probe for EDID EEPROMs on these buses. Signed-off-by: Jean Delvare --- drivers/video/fb_ddc.c | 1 + drivers/video/intelfb/intelfb_i2c.c | 12 +++++++----- drivers/video/matrox/i2c-matroxfb.c | 20 +++++++++++++++----- include/linux/i2c.h | 2 +- 4 files changed, 24 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/video/fb_ddc.c b/drivers/video/fb_ddc.c index a0df63289b5f..0cf96eb8a60f 100644 --- a/drivers/video/fb_ddc.c +++ b/drivers/video/fb_ddc.c @@ -106,6 +106,7 @@ unsigned char *fb_ddc_read(struct i2c_adapter *adapter) algo_data->setsda(algo_data->data, 1); algo_data->setscl(algo_data->data, 1); + adapter->class |= I2C_CLASS_DDC; return edid; } diff --git a/drivers/video/intelfb/intelfb_i2c.c b/drivers/video/intelfb/intelfb_i2c.c index ca95f09d8b43..fcf9fadbf572 100644 --- a/drivers/video/intelfb/intelfb_i2c.c +++ b/drivers/video/intelfb/intelfb_i2c.c @@ -100,7 +100,8 @@ static int intelfb_gpio_getsda(void *data) static int intelfb_setup_i2c_bus(struct intelfb_info *dinfo, struct intelfb_i2c_chan *chan, - const u32 reg, const char *name) + const u32 reg, const char *name, + int class) { int rc; @@ -108,6 +109,7 @@ static int intelfb_setup_i2c_bus(struct intelfb_info *dinfo, chan->reg = reg; snprintf(chan->adapter.name, sizeof(chan->adapter.name), "intelfb %s", name); + chan->adapter.class = class; chan->adapter.owner = THIS_MODULE; chan->adapter.id = I2C_HW_B_INTELFB; chan->adapter.algo_data = &chan->algo; @@ -145,7 +147,7 @@ void intelfb_create_i2c_busses(struct intelfb_info *dinfo) /* setup the DDC bus for analog output */ intelfb_setup_i2c_bus(dinfo, &dinfo->output[i].ddc_bus, GPIOA, - "CRTDDC_A"); + "CRTDDC_A", I2C_CLASS_DDC); i++; /* need to add the output busses for each device @@ -159,9 +161,9 @@ void intelfb_create_i2c_busses(struct intelfb_info *dinfo) case INTEL_865G: dinfo->output[i].type = INTELFB_OUTPUT_DVO; intelfb_setup_i2c_bus(dinfo, &dinfo->output[i].ddc_bus, - GPIOD, "DVODDC_D"); + GPIOD, "DVODDC_D", I2C_CLASS_DDC); intelfb_setup_i2c_bus(dinfo, &dinfo->output[i].i2c_bus, - GPIOE, "DVOI2C_E"); + GPIOE, "DVOI2C_E", 0); i++; break; case INTEL_915G: @@ -174,7 +176,7 @@ void intelfb_create_i2c_busses(struct intelfb_info *dinfo) /* SDVO ports have a single control bus - 2 devices */ dinfo->output[i].type = INTELFB_OUTPUT_SDVO; intelfb_setup_i2c_bus(dinfo, &dinfo->output[i].i2c_bus, - GPIOE, "SDVOCTRL_E"); + GPIOE, "SDVOCTRL_E", 0); /* TODO: initialize the SDVO */ /* I830SDVOInit(pScrn, i, DVOB); */ i++; diff --git a/drivers/video/matrox/i2c-matroxfb.c b/drivers/video/matrox/i2c-matroxfb.c index 4baab7be58de..75ee5a12e549 100644 --- a/drivers/video/matrox/i2c-matroxfb.c +++ b/drivers/video/matrox/i2c-matroxfb.c @@ -104,7 +104,9 @@ static struct i2c_algo_bit_data matrox_i2c_algo_template = }; static int i2c_bus_reg(struct i2c_bit_adapter* b, struct 
matrox_fb_info* minfo, - unsigned int data, unsigned int clock, const char* name) { + unsigned int data, unsigned int clock, const char *name, + int class) +{ int err; b->minfo = minfo; @@ -114,6 +116,7 @@ static int i2c_bus_reg(struct i2c_bit_adapter* b, struct matrox_fb_info* minfo, snprintf(b->adapter.name, sizeof(b->adapter.name), name, minfo->fbcon.node); i2c_set_adapdata(&b->adapter, b); + b->adapter.class = class; b->adapter.algo_data = &b->bac; b->adapter.dev.parent = &ACCESS_FBINFO(pcidev)->dev; b->bac = matrox_i2c_algo_template; @@ -159,22 +162,29 @@ static void* i2c_matroxfb_probe(struct matrox_fb_info* minfo) { switch (ACCESS_FBINFO(chip)) { case MGA_2064: case MGA_2164: - err = i2c_bus_reg(&m2info->ddc1, minfo, DDC1B_DATA, DDC1B_CLK, "DDC:fb%u #0"); + err = i2c_bus_reg(&m2info->ddc1, minfo, + DDC1B_DATA, DDC1B_CLK, + "DDC:fb%u #0", I2C_CLASS_DDC); break; default: - err = i2c_bus_reg(&m2info->ddc1, minfo, DDC1_DATA, DDC1_CLK, "DDC:fb%u #0"); + err = i2c_bus_reg(&m2info->ddc1, minfo, + DDC1_DATA, DDC1_CLK, + "DDC:fb%u #0", I2C_CLASS_DDC); break; } if (err) goto fail_ddc1; if (ACCESS_FBINFO(devflags.dualhead)) { - err = i2c_bus_reg(&m2info->ddc2, minfo, DDC2_DATA, DDC2_CLK, "DDC:fb%u #1"); + err = i2c_bus_reg(&m2info->ddc2, minfo, + DDC2_DATA, DDC2_CLK, + "DDC:fb%u #1", I2C_CLASS_DDC); if (err == -ENODEV) { printk(KERN_INFO "i2c-matroxfb: VGA->TV plug detected, DDC unavailable.\n"); } else if (err) printk(KERN_INFO "i2c-matroxfb: Could not register secondary output i2c bus. Continuing anyway.\n"); /* Register maven bus even on G450/G550 */ - err = i2c_bus_reg(&m2info->maven, minfo, MAT_DATA, MAT_CLK, "MAVEN:fb%u"); + err = i2c_bus_reg(&m2info->maven, minfo, + MAT_DATA, MAT_CLK, "MAVEN:fb%u", 0); if (err) printk(KERN_INFO "i2c-matroxfb: Could not register Maven i2c bus. Continuing anyway.\n"); } diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 7c36d5188d39..145797fe6a31 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -349,7 +349,7 @@ static inline void i2c_set_adapdata (struct i2c_adapter *dev, void *data) #define I2C_CLASS_HWMON (1<<0) /* lm_sensors, ... */ #define I2C_CLASS_TV_ANALOG (1<<1) /* bttv + friends */ #define I2C_CLASS_TV_DIGITAL (1<<2) /* dvb cards */ -#define I2C_CLASS_DDC (1<<3) /* i2c-matroxfb ? */ +#define I2C_CLASS_DDC (1<<3) /* DDC bus on graphics adapters */ #define I2C_CLASS_CAM_ANALOG (1<<4) /* camera with analog CCD */ #define I2C_CLASS_CAM_DIGITAL (1<<5) /* most webcams */ #define I2C_CLASS_SOUND (1<<6) /* sound devices */ -- cgit v1.2.3 From 3401b2fff38fbb8b73ea6bcc69a8370ae5d2a7a0 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Mon, 14 Jul 2008 22:38:29 +0200 Subject: i2c: Let bus drivers add SPD to their class Let general purpose I2C/SMBus bus drivers add SPD to their class. Once this is done, we will be able to tell the eeprom driver to only probe for SPD EEPROMs and similar on these buses. Note that I took a conservative approach here, adding I2C_CLASS_SPD to many drivers that have no idea whether they can host SPD EEPROMs or not. This is to make sure that the eeprom driver doesn't stop probing buses where SPD EEPROMs or equivalent live. So, bus driver maintainers and users should feel free to remove the SPD class from drivers whose buses never have SPD EEPROMs, or where they don't want the eeprom driver to bind to them. Likewise, feel free to add the SPD class to any bus driver I might have missed.
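As a rough illustration of the intended use (a sketch, not part of this patch), a class-aware chip driver can then skip buses of the wrong class before probing anything:

	/* e.g. early in a legacy chip driver's attach_adapter path */
	if (!(adapter->class & I2C_CLASS_SPD))
		return 0;	/* no SPD EEPROMs expected on this bus */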
Signed-off-by: Jean Delvare --- drivers/i2c/busses/i2c-ali1535.c | 2 +- drivers/i2c/busses/i2c-ali1563.c | 2 +- drivers/i2c/busses/i2c-ali15x3.c | 2 +- drivers/i2c/busses/i2c-amd756.c | 2 +- drivers/i2c/busses/i2c-amd8111.c | 2 +- drivers/i2c/busses/i2c-cpm.c | 2 +- drivers/i2c/busses/i2c-elektor.c | 2 +- drivers/i2c/busses/i2c-gpio.c | 2 +- drivers/i2c/busses/i2c-i801.c | 2 +- drivers/i2c/busses/i2c-ibm_iic.c | 4 ++-- drivers/i2c/busses/i2c-iop3xx.c | 2 +- drivers/i2c/busses/i2c-isch.c | 2 +- drivers/i2c/busses/i2c-mpc.c | 2 +- drivers/i2c/busses/i2c-mv64xxx.c | 2 +- drivers/i2c/busses/i2c-nforce2.c | 2 +- drivers/i2c/busses/i2c-ocores.c | 2 +- drivers/i2c/busses/i2c-pasemi.c | 2 +- drivers/i2c/busses/i2c-piix4.c | 2 +- drivers/i2c/busses/i2c-pmcmsp.c | 2 +- drivers/i2c/busses/i2c-s3c2410.c | 2 +- drivers/i2c/busses/i2c-sibyte.c | 4 ++-- drivers/i2c/busses/i2c-sis5595.c | 2 +- drivers/i2c/busses/i2c-sis630.c | 2 +- drivers/i2c/busses/i2c-sis96x.c | 2 +- drivers/i2c/busses/i2c-stub.c | 2 +- drivers/i2c/busses/i2c-via.c | 2 +- drivers/i2c/busses/i2c-viapro.c | 2 +- drivers/i2c/busses/scx200_acb.c | 2 +- include/linux/i2c.h | 1 + 29 files changed, 31 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/drivers/i2c/busses/i2c-ali1535.c b/drivers/i2c/busses/i2c-ali1535.c index 704436cdec8e..8d1d90ab3a90 100644 --- a/drivers/i2c/busses/i2c-ali1535.c +++ b/drivers/i2c/busses/i2c-ali1535.c @@ -473,7 +473,7 @@ static const struct i2c_algorithm smbus_algorithm = { static struct i2c_adapter ali1535_adapter = { .owner = THIS_MODULE, .id = I2C_HW_SMBUS_ALI1535, - .class = I2C_CLASS_HWMON, + .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .algo = &smbus_algorithm, }; diff --git a/drivers/i2c/busses/i2c-ali1563.c b/drivers/i2c/busses/i2c-ali1563.c index da5a382eee93..4b55ae19db8d 100644 --- a/drivers/i2c/busses/i2c-ali1563.c +++ b/drivers/i2c/busses/i2c-ali1563.c @@ -382,7 +382,7 @@ static const struct i2c_algorithm ali1563_algorithm = { static struct i2c_adapter ali1563_adapter = { .owner = THIS_MODULE, .id = I2C_HW_SMBUS_ALI1563, - .class = I2C_CLASS_HWMON, + .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .algo = &ali1563_algorithm, }; diff --git a/drivers/i2c/busses/i2c-ali15x3.c b/drivers/i2c/busses/i2c-ali15x3.c index 7b029b147a8e..e922c3950fcd 100644 --- a/drivers/i2c/busses/i2c-ali15x3.c +++ b/drivers/i2c/busses/i2c-ali15x3.c @@ -471,7 +471,7 @@ static const struct i2c_algorithm smbus_algorithm = { static struct i2c_adapter ali15x3_adapter = { .owner = THIS_MODULE, .id = I2C_HW_SMBUS_ALI15X3, - .class = I2C_CLASS_HWMON, + .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .algo = &smbus_algorithm, }; diff --git a/drivers/i2c/busses/i2c-amd756.c b/drivers/i2c/busses/i2c-amd756.c index f0baea62067d..bd4f6380fabe 100644 --- a/drivers/i2c/busses/i2c-amd756.c +++ b/drivers/i2c/busses/i2c-amd756.c @@ -301,7 +301,7 @@ static const struct i2c_algorithm smbus_algorithm = { struct i2c_adapter amd756_smbus = { .owner = THIS_MODULE, .id = I2C_HW_SMBUS_AMD756, - .class = I2C_CLASS_HWMON, + .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .algo = &smbus_algorithm, }; diff --git a/drivers/i2c/busses/i2c-amd8111.c b/drivers/i2c/busses/i2c-amd8111.c index a4f687915de1..0e18fe846010 100644 --- a/drivers/i2c/busses/i2c-amd8111.c +++ b/drivers/i2c/busses/i2c-amd8111.c @@ -383,7 +383,7 @@ static int __devinit amd8111_probe(struct pci_dev *dev, snprintf(smbus->adapter.name, sizeof(smbus->adapter.name), "SMBus2 AMD8111 adapter at %04x", smbus->base); smbus->adapter.id = I2C_HW_SMBUS_AMD8111; - smbus->adapter.class = 
I2C_CLASS_HWMON; + smbus->adapter.class = I2C_CLASS_HWMON | I2C_CLASS_SPD; smbus->adapter.algo = &smbus_algorithm; smbus->adapter.algo_data = smbus; diff --git a/drivers/i2c/busses/i2c-cpm.c b/drivers/i2c/busses/i2c-cpm.c index 53af744a91c1..8164de1f4d72 100644 --- a/drivers/i2c/busses/i2c-cpm.c +++ b/drivers/i2c/busses/i2c-cpm.c @@ -423,7 +423,7 @@ static const struct i2c_adapter cpm_ops = { .owner = THIS_MODULE, .name = "i2c-cpm", .algo = &cpm_i2c_algo, - .class = I2C_CLASS_HWMON, + .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, }; static int __devinit cpm_i2c_setup(struct cpm_i2c *cpm) diff --git a/drivers/i2c/busses/i2c-elektor.c b/drivers/i2c/busses/i2c-elektor.c index b7a9977b025f..c251cf21a62b 100644 --- a/drivers/i2c/busses/i2c-elektor.c +++ b/drivers/i2c/busses/i2c-elektor.c @@ -202,7 +202,7 @@ static struct i2c_algo_pcf_data pcf_isa_data = { static struct i2c_adapter pcf_isa_ops = { .owner = THIS_MODULE, - .class = I2C_CLASS_HWMON, + .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .id = I2C_HW_P_ELEK, .algo_data = &pcf_isa_data, .name = "i2c-elektor", diff --git a/drivers/i2c/busses/i2c-gpio.c b/drivers/i2c/busses/i2c-gpio.c index 7c1b762aa681..79b455a1f090 100644 --- a/drivers/i2c/busses/i2c-gpio.c +++ b/drivers/i2c/busses/i2c-gpio.c @@ -140,7 +140,7 @@ static int __init i2c_gpio_probe(struct platform_device *pdev) adap->owner = THIS_MODULE; snprintf(adap->name, sizeof(adap->name), "i2c-gpio%d", pdev->id); adap->algo_data = bit_data; - adap->class = I2C_CLASS_HWMON; + adap->class = I2C_CLASS_HWMON | I2C_CLASS_SPD; adap->dev.parent = &pdev->dev; /* diff --git a/drivers/i2c/busses/i2c-i801.c b/drivers/i2c/busses/i2c-i801.c index 213119211e58..9717ffe12921 100644 --- a/drivers/i2c/busses/i2c-i801.c +++ b/drivers/i2c/busses/i2c-i801.c @@ -573,7 +573,7 @@ static const struct i2c_algorithm smbus_algorithm = { static struct i2c_adapter i801_adapter = { .owner = THIS_MODULE, .id = I2C_HW_SMBUS_I801, - .class = I2C_CLASS_HWMON, + .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .algo = &smbus_algorithm, }; diff --git a/drivers/i2c/busses/i2c-ibm_iic.c b/drivers/i2c/busses/i2c-ibm_iic.c index 85dbf34382e1..6f7bfdec3c69 100644 --- a/drivers/i2c/busses/i2c-ibm_iic.c +++ b/drivers/i2c/busses/i2c-ibm_iic.c @@ -740,7 +740,7 @@ static int __devinit iic_probe(struct ocp_device *ocp){ strcpy(adap->name, "IBM IIC"); i2c_set_adapdata(adap, dev); adap->id = I2C_HW_OCP; - adap->class = I2C_CLASS_HWMON; + adap->class = I2C_CLASS_HWMON | I2C_CLASS_SPD; adap->algo = &iic_algo; adap->client_register = NULL; adap->client_unregister = NULL; @@ -934,7 +934,7 @@ static int __devinit iic_probe(struct of_device *ofdev, strlcpy(adap->name, "IBM IIC", sizeof(adap->name)); i2c_set_adapdata(adap, dev); adap->id = I2C_HW_OCP; - adap->class = I2C_CLASS_HWMON; + adap->class = I2C_CLASS_HWMON | I2C_CLASS_SPD; adap->algo = &iic_algo; adap->timeout = 1; adap->nr = dev->idx; diff --git a/drivers/i2c/busses/i2c-iop3xx.c b/drivers/i2c/busses/i2c-iop3xx.c index 39884e797594..fc2714ac0c0f 100644 --- a/drivers/i2c/busses/i2c-iop3xx.c +++ b/drivers/i2c/busses/i2c-iop3xx.c @@ -482,7 +482,7 @@ iop3xx_i2c_probe(struct platform_device *pdev) memcpy(new_adapter->name, pdev->name, strlen(pdev->name)); new_adapter->id = I2C_HW_IOP3XX; new_adapter->owner = THIS_MODULE; - new_adapter->class = I2C_CLASS_HWMON; + new_adapter->class = I2C_CLASS_HWMON | I2C_CLASS_SPD; new_adapter->dev.parent = &pdev->dev; new_adapter->nr = pdev->id; diff --git a/drivers/i2c/busses/i2c-isch.c b/drivers/i2c/busses/i2c-isch.c index c9cd46b22692..8d648911a7f5 100644 --- 
a/drivers/i2c/busses/i2c-isch.c +++ b/drivers/i2c/busses/i2c-isch.c @@ -251,7 +251,7 @@ static const struct i2c_algorithm smbus_algorithm = { static struct i2c_adapter sch_adapter = { .owner = THIS_MODULE, - .class = I2C_CLASS_HWMON, + .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .algo = &smbus_algorithm, }; diff --git a/drivers/i2c/busses/i2c-mpc.c b/drivers/i2c/busses/i2c-mpc.c index a076129de7e8..10b9342a36c2 100644 --- a/drivers/i2c/busses/i2c-mpc.c +++ b/drivers/i2c/busses/i2c-mpc.c @@ -311,7 +311,7 @@ static struct i2c_adapter mpc_ops = { .name = "MPC adapter", .id = I2C_HW_MPC107, .algo = &mpc_algo, - .class = I2C_CLASS_HWMON, + .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .timeout = 1, }; diff --git a/drivers/i2c/busses/i2c-mv64xxx.c b/drivers/i2c/busses/i2c-mv64xxx.c index 036e6a883e67..9e8118d2fe64 100644 --- a/drivers/i2c/busses/i2c-mv64xxx.c +++ b/drivers/i2c/busses/i2c-mv64xxx.c @@ -530,7 +530,7 @@ mv64xxx_i2c_probe(struct platform_device *pd) drv_data->adapter.id = I2C_HW_MV64XXX; drv_data->adapter.algo = &mv64xxx_i2c_algo; drv_data->adapter.owner = THIS_MODULE; - drv_data->adapter.class = I2C_CLASS_HWMON; + drv_data->adapter.class = I2C_CLASS_HWMON | I2C_CLASS_SPD; drv_data->adapter.timeout = pdata->timeout; drv_data->adapter.nr = pd->id; platform_set_drvdata(pd, drv_data); diff --git a/drivers/i2c/busses/i2c-nforce2.c b/drivers/i2c/busses/i2c-nforce2.c index 081fdf3393f4..2654f20d3a62 100644 --- a/drivers/i2c/busses/i2c-nforce2.c +++ b/drivers/i2c/busses/i2c-nforce2.c @@ -350,7 +350,7 @@ static int __devinit nforce2_probe_smb (struct pci_dev *dev, int bar, } smbus->adapter.owner = THIS_MODULE; smbus->adapter.id = I2C_HW_SMBUS_NFORCE2; - smbus->adapter.class = I2C_CLASS_HWMON; + smbus->adapter.class = I2C_CLASS_HWMON | I2C_CLASS_SPD; smbus->adapter.algo = &smbus_algorithm; smbus->adapter.algo_data = smbus; smbus->adapter.dev.parent = &dev->dev; diff --git a/drivers/i2c/busses/i2c-ocores.c b/drivers/i2c/busses/i2c-ocores.c index f145692cbb76..51ca79bf6480 100644 --- a/drivers/i2c/busses/i2c-ocores.c +++ b/drivers/i2c/busses/i2c-ocores.c @@ -205,7 +205,7 @@ static const struct i2c_algorithm ocores_algorithm = { static struct i2c_adapter ocores_adapter = { .owner = THIS_MODULE, .name = "i2c-ocores", - .class = I2C_CLASS_HWMON, + .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .algo = &ocores_algorithm, }; diff --git a/drivers/i2c/busses/i2c-pasemi.c b/drivers/i2c/busses/i2c-pasemi.c index 1603c81e39d4..adf0fbb902f0 100644 --- a/drivers/i2c/busses/i2c-pasemi.c +++ b/drivers/i2c/busses/i2c-pasemi.c @@ -365,7 +365,7 @@ static int __devinit pasemi_smb_probe(struct pci_dev *dev, smbus->adapter.owner = THIS_MODULE; snprintf(smbus->adapter.name, sizeof(smbus->adapter.name), "PA Semi SMBus adapter at 0x%lx", smbus->base); - smbus->adapter.class = I2C_CLASS_HWMON; + smbus->adapter.class = I2C_CLASS_HWMON | I2C_CLASS_SPD; smbus->adapter.algo = &smbus_algorithm; smbus->adapter.algo_data = smbus; smbus->adapter.nr = PCI_FUNC(dev->devfn); diff --git a/drivers/i2c/busses/i2c-piix4.c b/drivers/i2c/busses/i2c-piix4.c index 2bde47509e1a..85d69f3e624f 100644 --- a/drivers/i2c/busses/i2c-piix4.c +++ b/drivers/i2c/busses/i2c-piix4.c @@ -402,7 +402,7 @@ static const struct i2c_algorithm smbus_algorithm = { static struct i2c_adapter piix4_adapter = { .owner = THIS_MODULE, .id = I2C_HW_SMBUS_PIIX4, - .class = I2C_CLASS_HWMON, + .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .algo = &smbus_algorithm, }; diff --git a/drivers/i2c/busses/i2c-pmcmsp.c b/drivers/i2c/busses/i2c-pmcmsp.c index 63b3e2c11cff..dcf2045b5222 
100644 --- a/drivers/i2c/busses/i2c-pmcmsp.c +++ b/drivers/i2c/busses/i2c-pmcmsp.c @@ -622,7 +622,7 @@ static struct i2c_algorithm pmcmsptwi_algo = { static struct i2c_adapter pmcmsptwi_adapter = { .owner = THIS_MODULE, - .class = I2C_CLASS_HWMON, + .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .algo = &pmcmsptwi_algo, .name = DRV_NAME, }; diff --git a/drivers/i2c/busses/i2c-s3c2410.c b/drivers/i2c/busses/i2c-s3c2410.c index 9e8c875437be..007390ad9810 100644 --- a/drivers/i2c/busses/i2c-s3c2410.c +++ b/drivers/i2c/busses/i2c-s3c2410.c @@ -590,7 +590,7 @@ static struct s3c24xx_i2c s3c24xx_i2c = { .owner = THIS_MODULE, .algo = &s3c24xx_i2c_algorithm, .retries = 2, - .class = I2C_CLASS_HWMON, + .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, }, }; diff --git a/drivers/i2c/busses/i2c-sibyte.c b/drivers/i2c/busses/i2c-sibyte.c index 114634da6c6e..ac8822e7a5b4 100644 --- a/drivers/i2c/busses/i2c-sibyte.c +++ b/drivers/i2c/busses/i2c-sibyte.c @@ -156,7 +156,7 @@ static struct i2c_adapter sibyte_board_adapter[2] = { { .owner = THIS_MODULE, .id = I2C_HW_SIBYTE, - .class = I2C_CLASS_HWMON, + .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .algo = NULL, .algo_data = &sibyte_board_data[0], .name = "SiByte SMBus 0", @@ -164,7 +164,7 @@ static struct i2c_adapter sibyte_board_adapter[2] = { { .owner = THIS_MODULE, .id = I2C_HW_SIBYTE, - .class = I2C_CLASS_HWMON, + .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .algo = NULL, .algo_data = &sibyte_board_data[1], .name = "SiByte SMBus 1", diff --git a/drivers/i2c/busses/i2c-sis5595.c b/drivers/i2c/busses/i2c-sis5595.c index 328441bb5470..f76944b384f5 100644 --- a/drivers/i2c/busses/i2c-sis5595.c +++ b/drivers/i2c/busses/i2c-sis5595.c @@ -362,7 +362,7 @@ static const struct i2c_algorithm smbus_algorithm = { static struct i2c_adapter sis5595_adapter = { .owner = THIS_MODULE, .id = I2C_HW_SMBUS_SIS5595, - .class = I2C_CLASS_HWMON, + .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .algo = &smbus_algorithm, }; diff --git a/drivers/i2c/busses/i2c-sis630.c b/drivers/i2c/busses/i2c-sis630.c index d7e6ff3e0187..eb2b2181fed7 100644 --- a/drivers/i2c/busses/i2c-sis630.c +++ b/drivers/i2c/busses/i2c-sis630.c @@ -462,7 +462,7 @@ static const struct i2c_algorithm smbus_algorithm = { static struct i2c_adapter sis630_adapter = { .owner = THIS_MODULE, .id = I2C_HW_SMBUS_SIS630, - .class = I2C_CLASS_HWMON, + .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .algo = &smbus_algorithm, }; diff --git a/drivers/i2c/busses/i2c-sis96x.c b/drivers/i2c/busses/i2c-sis96x.c index cde8e5880368..413e9e477723 100644 --- a/drivers/i2c/busses/i2c-sis96x.c +++ b/drivers/i2c/busses/i2c-sis96x.c @@ -244,7 +244,7 @@ static const struct i2c_algorithm smbus_algorithm = { static struct i2c_adapter sis96x_adapter = { .owner = THIS_MODULE, .id = I2C_HW_SMBUS_SIS96X, - .class = I2C_CLASS_HWMON, + .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .algo = &smbus_algorithm, }; diff --git a/drivers/i2c/busses/i2c-stub.c b/drivers/i2c/busses/i2c-stub.c index e37ccd80f77a..1b7b2af94036 100644 --- a/drivers/i2c/busses/i2c-stub.c +++ b/drivers/i2c/busses/i2c-stub.c @@ -140,7 +140,7 @@ static const struct i2c_algorithm smbus_algorithm = { static struct i2c_adapter stub_adapter = { .owner = THIS_MODULE, - .class = I2C_CLASS_HWMON, + .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .algo = &smbus_algorithm, .name = "SMBus stub driver", }; diff --git a/drivers/i2c/busses/i2c-via.c b/drivers/i2c/busses/i2c-via.c index 61716f6b14dc..6517f8a6d911 100644 --- a/drivers/i2c/busses/i2c-via.c +++ b/drivers/i2c/busses/i2c-via.c @@ -87,7 +87,7 @@ static struct 
i2c_algo_bit_data bit_data = { static struct i2c_adapter vt586b_adapter = { .owner = THIS_MODULE, .id = I2C_HW_B_VIA, - .class = I2C_CLASS_HWMON, + .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .name = "VIA i2c", .algo_data = &bit_data, }; diff --git a/drivers/i2c/busses/i2c-viapro.c b/drivers/i2c/busses/i2c-viapro.c index c611905df009..7957ce515891 100644 --- a/drivers/i2c/busses/i2c-viapro.c +++ b/drivers/i2c/busses/i2c-viapro.c @@ -311,7 +311,7 @@ static const struct i2c_algorithm smbus_algorithm = { static struct i2c_adapter vt596_adapter = { .owner = THIS_MODULE, .id = I2C_HW_SMBUS_VIA2, - .class = I2C_CLASS_HWMON, + .class = I2C_CLASS_HWMON | I2C_CLASS_SPD, .algo = &smbus_algorithm, }; diff --git a/drivers/i2c/busses/scx200_acb.c b/drivers/i2c/busses/scx200_acb.c index 61abe0f33255..ed794b145a11 100644 --- a/drivers/i2c/busses/scx200_acb.c +++ b/drivers/i2c/busses/scx200_acb.c @@ -442,7 +442,7 @@ static __init struct scx200_acb_iface *scx200_create_iface(const char *text, adapter->owner = THIS_MODULE; adapter->id = I2C_HW_SMBUS_SCX200; adapter->algo = &scx200_acb_algorithm; - adapter->class = I2C_CLASS_HWMON; + adapter->class = I2C_CLASS_HWMON | I2C_CLASS_SPD; adapter->dev.parent = dev; mutex_init(&iface->mutex); diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 145797fe6a31..839d0ea3dca3 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -353,6 +353,7 @@ static inline void i2c_set_adapdata (struct i2c_adapter *dev, void *data) #define I2C_CLASS_CAM_ANALOG (1<<4) /* camera with analog CCD */ #define I2C_CLASS_CAM_DIGITAL (1<<5) /* most webcams */ #define I2C_CLASS_SOUND (1<<6) /* sound devices */ +#define I2C_CLASS_SPD (1<<7) /* SPD EEPROMs and similar */ #define I2C_CLASS_ALL (UINT_MAX) /* all of the above */ /* i2c_client_address_data is the struct for holding default client -- cgit v1.2.3 From 0573d11b2bbd0e4774f33f4c1959c1939c055e96 Mon Sep 17 00:00:00 2001 From: Eric Brower Date: Mon, 14 Jul 2008 22:38:31 +0200 Subject: i2c-algo-pcf: Multi-master lost-arbitration improvement Improve lost-arbitration handling of PCF8584. This is necessary for support of a currently out-of-kernel driver for Sun Microsystems E250 environmental management; perhaps others. Signed-off-by: Eric Brower Acked-by: Dan Smolik Signed-off-by: Jean Delvare --- drivers/i2c/algos/i2c-algo-pcf.c | 48 ++++++++++++++++++++++++++-------------- include/linux/i2c-algo-pcf.h | 6 +++++ 2 files changed, 37 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/i2c/algos/i2c-algo-pcf.c b/drivers/i2c/algos/i2c-algo-pcf.c index 8907b0191677..1e328d19cd6d 100644 --- a/drivers/i2c/algos/i2c-algo-pcf.c +++ b/drivers/i2c/algos/i2c-algo-pcf.c @@ -78,6 +78,36 @@ static void i2c_stop(struct i2c_algo_pcf_data *adap) set_pcf(adap, 1, I2C_PCF_STOP); } +static void handle_lab(struct i2c_algo_pcf_data *adap, const int *status) +{ + DEB2(printk(KERN_INFO + "i2c-algo-pcf.o: lost arbitration (CSR 0x%02x)\n", + *status)); + + /* Cleanup from LAB -- reset and enable ESO. + * This resets the PCF8584; since we've lost the bus, no + * further attempts should be made by callers to clean up + * (no i2c_stop() etc.) + */ + set_pcf(adap, 1, I2C_PCF_PIN); + set_pcf(adap, 1, I2C_PCF_ESO); + + /* We pause for a time period sufficient for any running + * I2C transaction to complete -- the arbitration logic won't + * work properly until the next START is seen. + * It is assumed the bus driver or client has set a proper value. 
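+ * (adap->lab_mdelay is in milliseconds; when it is left at zero,
+ * the pause below is skipped entirely.)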
+ * + * REVISIT: should probably use msleep instead of mdelay if we + * know we can sleep. + */ + if (adap->lab_mdelay) + mdelay(adap->lab_mdelay); + + DEB2(printk(KERN_INFO + "i2c-algo-pcf.o: reset LAB condition (CSR 0x%02x)\n", + get_pcf(adap, 1))); +} + static int wait_for_bb(struct i2c_algo_pcf_data *adap) { int timeout = DEF_TIMEOUT; @@ -109,23 +139,7 @@ static int wait_for_pin(struct i2c_algo_pcf_data *adap, int *status) { *status = get_pcf(adap, 1); } if (*status & I2C_PCF_LAB) { - DEB2(printk(KERN_INFO - "i2c-algo-pcf.o: lost arbitration (CSR 0x%02x)\n", - *status)); - /* Cleanup from LAB-- reset and enable ESO. - * This resets the PCF8584; since we've lost the bus, no - * further attempts should be made by callers to clean up - * (no i2c_stop() etc.) - */ - set_pcf(adap, 1, I2C_PCF_PIN); - set_pcf(adap, 1, I2C_PCF_ESO); - /* TODO: we should pause for a time period sufficient for any - * running I2C transaction to complete-- the arbitration - * logic won't work properly until the next START is seen. - */ - DEB2(printk(KERN_INFO - "i2c-algo-pcf.o: reset LAB condition (CSR 0x%02x)\n", - get_pcf(adap,1))); + handle_lab(adap, status); return(-EINTR); } #endif diff --git a/include/linux/i2c-algo-pcf.h b/include/linux/i2c-algo-pcf.h index 77afbb60fd11..74fb6f889a77 100644 --- a/include/linux/i2c-algo-pcf.h +++ b/include/linux/i2c-algo-pcf.h @@ -36,6 +36,12 @@ struct i2c_algo_pcf_data { /* local settings */ int udelay; int timeout; + + /* Multi-master lost arbitration back-off delay (msecs) + * This should be set by the bus adapter or knowledgable client + * if bus is multi-mastered, else zero + */ + unsigned long lab_mdelay; }; int i2c_pcf_add_bus(struct i2c_adapter *); -- cgit v1.2.3 From e3e7fc3c401a5d53f0599a357b3cf65d6a4f52e3 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Mon, 14 Jul 2008 22:38:31 +0200 Subject: i2c-algo-pcf: Drop unused struct members Struct members udelay and timeout aren't used anywhere, so drop them. Signed-off-by: Jean Delvare Acked-by: Eric Brower --- drivers/i2c/busses/i2c-elektor.c | 2 -- include/linux/i2c-algo-pcf.h | 4 ---- 2 files changed, 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/i2c/busses/i2c-elektor.c b/drivers/i2c/busses/i2c-elektor.c index c251cf21a62b..7f38c01fb3a0 100644 --- a/drivers/i2c/busses/i2c-elektor.c +++ b/drivers/i2c/busses/i2c-elektor.c @@ -196,8 +196,6 @@ static struct i2c_algo_pcf_data pcf_isa_data = { .getown = pcf_isa_getown, .getclock = pcf_isa_getclock, .waitforpin = pcf_isa_waitforpin, - .udelay = 10, - .timeout = 100, }; static struct i2c_adapter pcf_isa_ops = { diff --git a/include/linux/i2c-algo-pcf.h b/include/linux/i2c-algo-pcf.h index 74fb6f889a77..0177d280f733 100644 --- a/include/linux/i2c-algo-pcf.h +++ b/include/linux/i2c-algo-pcf.h @@ -33,10 +33,6 @@ struct i2c_algo_pcf_data { int (*getclock) (void *data); void (*waitforpin) (void); - /* local settings */ - int udelay; - int timeout; - /* Multi-master lost arbitration back-off delay (msecs) * This should be set by the bus adapter or knowledgable client * if bus is multi-mastered, else zero -- cgit v1.2.3 From f6a7110520037ba786f17b53790c6eb8a3d4ef55 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Mon, 14 Jul 2008 22:38:34 +0200 Subject: i2c-dev: Delete empty detach_client callback Implementing detach_client is optional, so there is no point in an empty implementation. Likewise, i2c driver IDs are optional, and we don't need one. 
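For reference, a new-style driver in the same situation reduces to a minimal sketch like this ("foo" is a placeholder name, not a real driver):

	static struct i2c_driver foo_driver = {
		.driver = {
			.name	= "foo",
		},
		.attach_adapter	= foo_attach_adapter,
		.detach_adapter	= foo_detach_adapter,
		/* .id and the empty .detach_client are simply left out */
	};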
Signed-off-by: Jean Delvare --- drivers/i2c/i2c-dev.c | 7 ------- include/linux/i2c-id.h | 2 -- 2 files changed, 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/i2c/i2c-dev.c b/drivers/i2c/i2c-dev.c index e96d98696782..50df53640c78 100644 --- a/drivers/i2c/i2c-dev.c +++ b/drivers/i2c/i2c-dev.c @@ -548,19 +548,12 @@ static int i2cdev_detach_adapter(struct i2c_adapter *adap) return 0; } -static int i2cdev_detach_client(struct i2c_client *client) -{ - return 0; -} - static struct i2c_driver i2cdev_driver = { .driver = { .name = "dev_driver", }, - .id = I2C_DRIVERID_I2CDEV, .attach_adapter = i2cdev_attach_adapter, .detach_adapter = i2cdev_detach_adapter, - .detach_client = i2cdev_detach_client, }; /* ------------------------------------------------------------------------- */ diff --git a/include/linux/i2c-id.h b/include/linux/i2c-id.h index 988e566d3ed5..ef13b7c66df3 100644 --- a/include/linux/i2c-id.h +++ b/include/linux/i2c-id.h @@ -91,8 +91,6 @@ #define I2C_DRIVERID_M52790 95 /* Mitsubishi M52790SP/FP AV switch */ #define I2C_DRIVERID_CS5345 96 /* cs5345 audio processor */ -#define I2C_DRIVERID_I2CDEV 900 - #define I2C_DRIVERID_OV7670 1048 /* Omnivision 7670 camera */ /* -- cgit v1.2.3 From e9ca9eb9d7fc7bf3dc3cec5ba7edb089c4625f7b Mon Sep 17 00:00:00 2001 From: Jon Smirl Date: Mon, 14 Jul 2008 22:38:35 +0200 Subject: i2c: Export the i2c_bus_type symbol Export the root of the i2c bus so that PowerPC device tree code can iterate over devices on the i2c bus. Signed-off-by: Jon Smirl Signed-off-by: Jean Delvare --- drivers/i2c/i2c-core.c | 3 ++- include/linux/i2c.h | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c index d6cc58abf3ff..e45bb2838f42 100644 --- a/drivers/i2c/i2c-core.c +++ b/drivers/i2c/i2c-core.c @@ -201,7 +201,7 @@ static struct device_attribute i2c_dev_attrs[] = { { }, }; -static struct bus_type i2c_bus_type = { +struct bus_type i2c_bus_type = { .name = "i2c", .dev_attrs = i2c_dev_attrs, .match = i2c_device_match, @@ -212,6 +212,7 @@ static struct bus_type i2c_bus_type = { .suspend = i2c_device_suspend, .resume = i2c_device_resume, }; +EXPORT_SYMBOL_GPL(i2c_bus_type); /** diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 839d0ea3dca3..50cbab4b62b0 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -35,6 +35,8 @@ #include /* for completion */ #include +extern struct bus_type i2c_bus_type; + /* --- General options ------------------------------------------------ */ struct i2c_msg; -- cgit v1.2.3 From 2b7a5056a0a7ff17d5d2004c29c852a92a6bd632 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 14 Jul 2008 22:38:35 +0200 Subject: i2c: New-style EEPROM driver using device IDs Add a new-style driver for most I2C EEPROMs, giving sysfs read/write access to their data. Tested with various chips and clock rates. Signed-off-by: Wolfram Sang Signed-off-by: Jean Delvare --- drivers/i2c/chips/Kconfig | 26 ++ drivers/i2c/chips/Makefile | 1 + drivers/i2c/chips/at24.c | 583 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/i2c/at24.h | 28 +++ 4 files changed, 638 insertions(+) create mode 100644 drivers/i2c/chips/at24.c create mode 100644 include/linux/i2c/at24.h (limited to 'include/linux') diff --git a/drivers/i2c/chips/Kconfig b/drivers/i2c/chips/Kconfig index 6326468d5f0b..50e0a4653741 100644 --- a/drivers/i2c/chips/Kconfig +++ b/drivers/i2c/chips/Kconfig @@ -14,6 +14,32 @@ config DS1682 This driver can also be built as a module. 
If so, the module will be called ds1682. +config AT24 + tristate "EEPROMs from most vendors" + depends on SYSFS && EXPERIMENTAL + help + Enable this driver to get read/write support to most I2C EEPROMs, + after you configure the driver to know about each EEPROM on + your target board. Use these generic chip names, instead of + vendor-specific ones like at24c64 or 24lc02: + + 24c00, 24c01, 24c02, spd (readonly 24c02), 24c04, 24c08, + 24c16, 24c32, 24c64, 24c128, 24c256, 24c512, 24c1024 + + Unless you like data loss puzzles, always be sure that any chip + you configure as a 24c32 (32 kbit) or larger is NOT really a + 24c16 (16 kbit) or smaller, and vice versa. Marking the chip + as read-only won't help recover from this. Also, if your chip + has any software write-protect mechanism you may want to review the + code to make sure this driver won't turn it on by accident. + + If you use this with an SMBus adapter instead of an I2C adapter, + full functionality is not available. Only smaller devices are + supported (24c16 and below, max 4 kByte). + + This driver can also be built as a module. If so, the module + will be called at24. + config SENSORS_EEPROM tristate "EEPROM reader" depends on EXPERIMENTAL diff --git a/drivers/i2c/chips/Makefile b/drivers/i2c/chips/Makefile index e47aca0ca5ae..39e3e69ed125 100644 --- a/drivers/i2c/chips/Makefile +++ b/drivers/i2c/chips/Makefile @@ -10,6 +10,7 @@ # obj-$(CONFIG_DS1682) += ds1682.o +obj-$(CONFIG_AT24) += at24.o obj-$(CONFIG_SENSORS_EEPROM) += eeprom.o obj-$(CONFIG_SENSORS_MAX6875) += max6875.o obj-$(CONFIG_SENSORS_PCA9539) += pca9539.o diff --git a/drivers/i2c/chips/at24.c b/drivers/i2c/chips/at24.c new file mode 100644 index 000000000000..e764c94f3e3d --- /dev/null +++ b/drivers/i2c/chips/at24.c @@ -0,0 +1,583 @@ +/* + * at24.c - handle most I2C EEPROMs + * + * Copyright (C) 2005-2007 David Brownell + * Copyright (C) 2008 Wolfram Sang, Pengutronix + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * I2C EEPROMs from most vendors are inexpensive and mostly interchangeable. + * Differences between different vendor product lines (like Atmel AT24C or + * MicroChip 24LC, etc) won't much matter for typical read/write access. + * There are also I2C RAM chips, likewise interchangeable. One example + * would be the PCF8570, which acts like a 24c02 EEPROM (256 bytes). + * + * However, misconfiguration can lose data. "Set 16-bit memory address" + * to a part with 8-bit addressing will overwrite data. Writing with too + * big a page size also loses data. And it's not safe to assume that the + * conventional addresses 0x50..0x57 only hold eeproms; a PCF8563 RTC + * uses 0x51, for just one example. + * + * Accordingly, explicit board-specific configuration data should be used + * in almost all cases. (One partial exception is an SMBus used to access + * "SPD" data for DRAM sticks. Those only use 24c02 EEPROMs.) + * + * So this driver uses "new style" I2C driver binding, expecting to be + * told what devices exist. That may be in arch/X/mach-Y/board-Z.c or + * similar kernel-resident tables; or, configuration data coming from + * a bootloader. 
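+ *
+ * As an illustrative sketch (bus number, chip name and address are only
+ * examples, not taken from this file), a board file could say:
+ *
+ *	static struct i2c_board_info board_i2c_devs[] __initdata = {
+ *		{ I2C_BOARD_INFO("24c64", 0x50), },
+ *	};
+ *
+ *	i2c_register_board_info(0, board_i2c_devs,
+ *				ARRAY_SIZE(board_i2c_devs));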
+ * + * Other than binding model, current differences from "eeprom" driver are + * that this one handles write access and isn't restricted to 24c02 devices. + * It also handles larger devices (32 kbit and up) with two-byte addresses, + * which won't work on pure SMBus systems. + */ + +struct at24_data { + struct at24_platform_data chip; + bool use_smbus; + + /* + * Lock protects against activities from other Linux tasks, + * but not from changes by other I2C masters. + */ + struct mutex lock; + struct bin_attribute bin; + + u8 *writebuf; + unsigned write_max; + unsigned num_addresses; + + /* + * Some chips tie up multiple I2C addresses; dummy devices reserve + * them for us, and we'll use them with SMBus calls. + */ + struct i2c_client *client[]; +}; + +/* + * This parameter is to help this driver avoid blocking other drivers out + * of I2C for potentially troublesome amounts of time. With a 100 kHz I2C + * clock, one 256 byte read takes about 1/43 second which is excessive; + * but the 1/170 second it takes at 400 kHz may be quite reasonable; and + * at 1 MHz (Fm+) a 1/430 second delay could easily be invisible. + * + * This value is forced to be a power of two so that writes align on pages. + */ +static unsigned io_limit = 128; +module_param(io_limit, uint, 0); +MODULE_PARM_DESC(io_limit, "Maximum bytes per I/O (default 128)"); + +/* + * Specs often allow 5 msec for a page write, sometimes 20 msec; + * it's important to recover from write timeouts. + */ +static unsigned write_timeout = 25; +module_param(write_timeout, uint, 0); +MODULE_PARM_DESC(write_timeout, "Time (in ms) to try writes (default 25)"); + +#define AT24_SIZE_BYTELEN 5 +#define AT24_SIZE_FLAGS 8 + +#define AT24_BITMASK(x) (BIT(x) - 1) + +/* create non-zero magic value for given eeprom parameters */ +#define AT24_DEVICE_MAGIC(_len, _flags) \ + ((1 << AT24_SIZE_FLAGS | (_flags)) \ + << AT24_SIZE_BYTELEN | ilog2(_len)) + +static const struct i2c_device_id at24_ids[] = { + /* needs 8 addresses as A0-A2 are ignored */ + { "24c00", AT24_DEVICE_MAGIC(128 / 8, AT24_FLAG_TAKE8ADDR) }, + /* old variants can't be handled with this generic entry! */ + { "24c01", AT24_DEVICE_MAGIC(1024 / 8, 0) }, + { "24c02", AT24_DEVICE_MAGIC(2048 / 8, 0) }, + /* spd is a 24c02 in memory DIMMs */ + { "spd", AT24_DEVICE_MAGIC(2048 / 8, + AT24_FLAG_READONLY | AT24_FLAG_IRUGO) }, + { "24c04", AT24_DEVICE_MAGIC(4096 / 8, 0) }, + /* 24rf08 quirk is handled at i2c-core */ + { "24c08", AT24_DEVICE_MAGIC(8192 / 8, 0) }, + { "24c16", AT24_DEVICE_MAGIC(16384 / 8, 0) }, + { "24c32", AT24_DEVICE_MAGIC(32768 / 8, AT24_FLAG_ADDR16) }, + { "24c64", AT24_DEVICE_MAGIC(65536 / 8, AT24_FLAG_ADDR16) }, + { "24c128", AT24_DEVICE_MAGIC(131072 / 8, AT24_FLAG_ADDR16) }, + { "24c256", AT24_DEVICE_MAGIC(262144 / 8, AT24_FLAG_ADDR16) }, + { "24c512", AT24_DEVICE_MAGIC(524288 / 8, AT24_FLAG_ADDR16) }, + { "24c1024", AT24_DEVICE_MAGIC(1048576 / 8, AT24_FLAG_ADDR16) }, + { "at24", 0 }, + { /* END OF LIST */ } +}; +MODULE_DEVICE_TABLE(i2c, at24_ids); + +/*-------------------------------------------------------------------------*/ + +/* + * This routine supports chips which consume multiple I2C addresses. It + * computes the addressing information to be used for a given r/w request. + * Assumes that sanity checks for offset happened at sysfs-layer. 
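+ *
+ * For example (sketch): on a 24c04, which occupies two 256 byte banks at
+ * adjacent I2C addresses, an offset of 0x123 selects client[1] and leaves
+ * an in-chip offset of 0x23.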
+ */ +static struct i2c_client *at24_translate_offset(struct at24_data *at24, + unsigned *offset) +{ + unsigned i; + + if (at24->chip.flags & AT24_FLAG_ADDR16) { + i = *offset >> 16; + *offset &= 0xffff; + } else { + i = *offset >> 8; + *offset &= 0xff; + } + + return at24->client[i]; +} + +static ssize_t at24_eeprom_read(struct at24_data *at24, char *buf, + unsigned offset, size_t count) +{ + struct i2c_msg msg[2]; + u8 msgbuf[2]; + struct i2c_client *client; + int status, i; + + memset(msg, 0, sizeof(msg)); + + /* + * REVISIT some multi-address chips don't rollover page reads to + * the next slave address, so we may need to truncate the count. + * Those chips might need another quirk flag. + * + * If the real hardware used four adjacent 24c02 chips and that + * were misconfigured as one 24c08, that would be a similar effect: + * one "eeprom" file not four, but larger reads would fail when + * they crossed certain pages. + */ + + /* + * Slave address and byte offset derive from the offset. Always + * set the byte address; on a multi-master board, another master + * may have changed the chip's "current" address pointer. + */ + client = at24_translate_offset(at24, &offset); + + if (count > io_limit) + count = io_limit; + + /* Smaller eeproms can work given some SMBus extension calls */ + if (at24->use_smbus) { + if (count > I2C_SMBUS_BLOCK_MAX) + count = I2C_SMBUS_BLOCK_MAX; + status = i2c_smbus_read_i2c_block_data(client, offset, + count, buf); + dev_dbg(&client->dev, "smbus read %zd@%d --> %d\n", + count, offset, status); + return (status < 0) ? -EIO : status; + } + + /* + * When we have a better choice than SMBus calls, use a combined + * I2C message. Write address; then read up to io_limit data bytes. + * Note that read page rollover helps us here (unlike writes). + * msgbuf is u8 and will cast to our needs. + */ + i = 0; + if (at24->chip.flags & AT24_FLAG_ADDR16) + msgbuf[i++] = offset >> 8; + msgbuf[i++] = offset; + + msg[0].addr = client->addr; + msg[0].buf = msgbuf; + msg[0].len = i; + + msg[1].addr = client->addr; + msg[1].flags = I2C_M_RD; + msg[1].buf = buf; + msg[1].len = count; + + status = i2c_transfer(client->adapter, msg, 2); + dev_dbg(&client->dev, "i2c read %zd@%d --> %d\n", + count, offset, status); + + if (status == 2) + return count; + else if (status >= 0) + return -EIO; + else + return status; +} + +static ssize_t at24_bin_read(struct kobject *kobj, struct bin_attribute *attr, + char *buf, loff_t off, size_t count) +{ + struct at24_data *at24; + ssize_t retval = 0; + + at24 = dev_get_drvdata(container_of(kobj, struct device, kobj)); + + if (unlikely(!count)) + return count; + + /* + * Read data from chip, protecting against concurrent updates + * from this host, but not from other I2C masters. + */ + mutex_lock(&at24->lock); + + while (count) { + ssize_t status; + + status = at24_eeprom_read(at24, buf, off, count); + if (status <= 0) { + if (retval == 0) + retval = status; + break; + } + buf += status; + off += status; + count -= status; + retval += status; + } + + mutex_unlock(&at24->lock); + + return retval; +} + + +/* + * REVISIT: export at24_bin{read,write}() to let other kernel code use + * eeprom data. For example, it might hold a board's Ethernet address, or + * board-specific calibration data generated on the manufacturing floor. + */ + + +/* + * Note that if the hardware write-protect pin is pulled high, the whole + * chip is normally write protected. But there are plenty of product + * variants here, including OTP fuses and partial chip protect. 
+ * + * We only use page mode writes; the alternative is sloooow. This routine + * writes at most one page. + */ +static ssize_t at24_eeprom_write(struct at24_data *at24, char *buf, + unsigned offset, size_t count) +{ + struct i2c_client *client; + struct i2c_msg msg; + ssize_t status; + unsigned long timeout, write_time; + unsigned next_page; + + /* Get corresponding I2C address and adjust offset */ + client = at24_translate_offset(at24, &offset); + + /* write_max is at most a page */ + if (count > at24->write_max) + count = at24->write_max; + + /* Never roll over backwards, to the start of this page */ + next_page = roundup(offset + 1, at24->chip.page_size); + if (offset + count > next_page) + count = next_page - offset; + + /* If we'll use I2C calls for I/O, set up the message */ + if (!at24->use_smbus) { + int i = 0; + + msg.addr = client->addr; + msg.flags = 0; + + /* msg.buf is u8 and casts will mask the values */ + msg.buf = at24->writebuf; + if (at24->chip.flags & AT24_FLAG_ADDR16) + msg.buf[i++] = offset >> 8; + + msg.buf[i++] = offset; + memcpy(&msg.buf[i], buf, count); + msg.len = i + count; + } + + /* + * Writes fail if the previous one didn't complete yet. We may + * loop a few times until this one succeeds, waiting at least + * long enough for one entire page write to work. + */ + timeout = jiffies + msecs_to_jiffies(write_timeout); + do { + write_time = jiffies; + if (at24->use_smbus) { + status = i2c_smbus_write_i2c_block_data(client, + offset, count, buf); + if (status == 0) + status = count; + } else { + status = i2c_transfer(client->adapter, &msg, 1); + if (status == 1) + status = count; + } + dev_dbg(&client->dev, "write %zd@%d --> %zd (%ld)\n", + count, offset, status, jiffies); + + if (status == count) + return count; + + /* REVISIT: at HZ=100, this is sloooow */ + msleep(1); + } while (time_before(write_time, timeout)); + + return -ETIMEDOUT; +} + +static ssize_t at24_bin_write(struct kobject *kobj, struct bin_attribute *attr, + char *buf, loff_t off, size_t count) +{ + struct at24_data *at24; + ssize_t retval = 0; + + at24 = dev_get_drvdata(container_of(kobj, struct device, kobj)); + + if (unlikely(!count)) + return count; + + /* + * Write data to chip, protecting against concurrent updates + * from this host, but not from other I2C masters. + */ + mutex_lock(&at24->lock); + + while (count) { + ssize_t status; + + status = at24_eeprom_write(at24, buf, off, count); + if (status <= 0) { + if (retval == 0) + retval = status; + break; + } + buf += status; + off += status; + count -= status; + retval += status; + } + + mutex_unlock(&at24->lock); + + return retval; +} + +/*-------------------------------------------------------------------------*/ + +static int at24_probe(struct i2c_client *client, const struct i2c_device_id *id) +{ + struct at24_platform_data chip; + bool writable; + bool use_smbus = false; + struct at24_data *at24; + int err; + unsigned i, num_addresses; + kernel_ulong_t magic; + + if (client->dev.platform_data) { + chip = *(struct at24_platform_data *)client->dev.platform_data; + } else { + if (!id->driver_data) { + err = -ENODEV; + goto err_out; + } + magic = id->driver_data; + chip.byte_len = BIT(magic & AT24_BITMASK(AT24_SIZE_BYTELEN)); + magic >>= AT24_SIZE_BYTELEN; + chip.flags = magic & AT24_BITMASK(AT24_SIZE_FLAGS); + /* + * This is slow, but we can't know all eeproms, so we better + * play safe. Specifying custom eeprom-types via platform_data + * is recommended anyhow. 
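+ * (A page_size of 1 makes at24_eeprom_write() transfer a single byte
+ * per write cycle, which is safe for every chip but slow.)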
+ */ + chip.page_size = 1; + } + + if (!is_power_of_2(chip.byte_len)) + dev_warn(&client->dev, + "byte_len looks suspicious (no power of 2)!\n"); + if (!is_power_of_2(chip.page_size)) + dev_warn(&client->dev, + "page_size looks suspicious (no power of 2)!\n"); + + /* Use I2C operations unless we're stuck with SMBus extensions. */ + if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C)) { + if (chip.flags & AT24_FLAG_ADDR16) { + err = -EPFNOSUPPORT; + goto err_out; + } + if (!i2c_check_functionality(client->adapter, + I2C_FUNC_SMBUS_READ_I2C_BLOCK)) { + err = -EPFNOSUPPORT; + goto err_out; + } + use_smbus = true; + } + + if (chip.flags & AT24_FLAG_TAKE8ADDR) + num_addresses = 8; + else + num_addresses = DIV_ROUND_UP(chip.byte_len, + (chip.flags & AT24_FLAG_ADDR16) ? 65536 : 256); + + at24 = kzalloc(sizeof(struct at24_data) + + num_addresses * sizeof(struct i2c_client *), GFP_KERNEL); + if (!at24) { + err = -ENOMEM; + goto err_out; + } + + mutex_init(&at24->lock); + at24->use_smbus = use_smbus; + at24->chip = chip; + at24->num_addresses = num_addresses; + + /* + * Export the EEPROM bytes through sysfs, since that's convenient. + * By default, only root should see the data (maybe passwords etc) + */ + at24->bin.attr.name = "eeprom"; + at24->bin.attr.mode = chip.flags & AT24_FLAG_IRUGO ? S_IRUGO : S_IRUSR; + at24->bin.attr.owner = THIS_MODULE; + at24->bin.read = at24_bin_read; + at24->bin.size = chip.byte_len; + + writable = !(chip.flags & AT24_FLAG_READONLY); + if (writable) { + if (!use_smbus || i2c_check_functionality(client->adapter, + I2C_FUNC_SMBUS_WRITE_I2C_BLOCK)) { + + unsigned write_max = chip.page_size; + + at24->bin.write = at24_bin_write; + at24->bin.attr.mode |= S_IWUSR; + + if (write_max > io_limit) + write_max = io_limit; + if (use_smbus && write_max > I2C_SMBUS_BLOCK_MAX) + write_max = I2C_SMBUS_BLOCK_MAX; + at24->write_max = write_max; + + /* buffer (data + address at the beginning) */ + at24->writebuf = kmalloc(write_max + 2, GFP_KERNEL); + if (!at24->writebuf) { + err = -ENOMEM; + goto err_struct; + } + } else { + dev_warn(&client->dev, + "cannot write due to controller restrictions."); + } + } + + at24->client[0] = client; + + /* use dummy devices for multiple-address chips */ + for (i = 1; i < num_addresses; i++) { + at24->client[i] = i2c_new_dummy(client->adapter, + client->addr + i); + if (!at24->client[i]) { + dev_err(&client->dev, "address 0x%02x unavailable\n", + client->addr + i); + err = -EADDRINUSE; + goto err_clients; + } + } + + err = sysfs_create_bin_file(&client->dev.kobj, &at24->bin); + if (err) + goto err_clients; + + i2c_set_clientdata(client, at24); + + dev_info(&client->dev, "%Zd byte %s EEPROM %s\n", + at24->bin.size, client->name, + writable ? "(writable)" : "(read-only)"); + dev_dbg(&client->dev, + "page_size %d, num_addresses %d, write_max %d%s\n", + chip.page_size, num_addresses, + at24->write_max, + use_smbus ? 
", use_smbus" : ""); + + return 0; + +err_clients: + for (i = 1; i < num_addresses; i++) + if (at24->client[i]) + i2c_unregister_device(at24->client[i]); + + kfree(at24->writebuf); +err_struct: + kfree(at24); +err_out: + dev_dbg(&client->dev, "probe error %d\n", err); + return err; +} + +static int __devexit at24_remove(struct i2c_client *client) +{ + struct at24_data *at24; + int i; + + at24 = i2c_get_clientdata(client); + sysfs_remove_bin_file(&client->dev.kobj, &at24->bin); + + for (i = 1; i < at24->num_addresses; i++) + i2c_unregister_device(at24->client[i]); + + kfree(at24->writebuf); + kfree(at24); + i2c_set_clientdata(client, NULL); + return 0; +} + +/*-------------------------------------------------------------------------*/ + +static struct i2c_driver at24_driver = { + .driver = { + .name = "at24", + .owner = THIS_MODULE, + }, + .probe = at24_probe, + .remove = __devexit_p(at24_remove), + .id_table = at24_ids, +}; + +static int __init at24_init(void) +{ + io_limit = rounddown_pow_of_two(io_limit); + return i2c_add_driver(&at24_driver); +} +module_init(at24_init); + +static void __exit at24_exit(void) +{ + i2c_del_driver(&at24_driver); +} +module_exit(at24_exit); + +MODULE_DESCRIPTION("Driver for most I2C EEPROMs"); +MODULE_AUTHOR("David Brownell and Wolfram Sang"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/i2c/at24.h b/include/linux/i2c/at24.h new file mode 100644 index 000000000000..f6edd522a929 --- /dev/null +++ b/include/linux/i2c/at24.h @@ -0,0 +1,28 @@ +#ifndef _LINUX_AT24_H +#define _LINUX_AT24_H + +#include + +/* + * As seen through Linux I2C, differences between the most common types of I2C + * memory include: + * - How much memory is available (usually specified in bit)? + * - What write page size does it support? + * - Special flags (16 bit addresses, read_only, world readable...)? + * + * If you set up a custom eeprom type, please double-check the parameters. + * Especially page_size needs extra care, as you risk data loss if your value + * is bigger than what the chip actually supports! + */ + +struct at24_platform_data { + u32 byte_len; /* size (sum of all addr) */ + u16 page_size; /* for writes */ + u8 flags; +#define AT24_FLAG_ADDR16 0x80 /* address pointer is 16 bit */ +#define AT24_FLAG_READONLY 0x40 /* sysfs-entry will be read-only */ +#define AT24_FLAG_IRUGO 0x20 /* sysfs-entry will be world-readable */ +#define AT24_FLAG_TAKE8ADDR 0x10 /* take always 8 addresses (24c00) */ +}; + +#endif /* _LINUX_AT24_H */ -- cgit v1.2.3 From 4735c98f8447acb1c8977e2b8024640f7bf36dd6 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Mon, 14 Jul 2008 22:38:36 +0200 Subject: i2c: Add detection capability to new-style drivers Add a mechanism to let new-style i2c drivers optionally autodetect devices they would support on selected buses and ask i2c-core to instantiate them. This is a replacement for legacy i2c drivers, much cleaner. Where drivers had to implement both a legacy i2c_driver and a new-style i2c_driver so far, this mechanism makes it possible to get rid of the legacy i2c_driver and implement both enumerated and detected device support with just one (new-style) i2c_driver. Here is a quick conversion guide for these drivers, step by step: * Delete the legacy driver definition, registration and removal. Delete the attach_adapter and detach_client methods of the legacy driver. 
* Change the prototype of the legacy detect function from static int foo_detect(struct i2c_adapter *adapter, int address, int kind); to static int foo_detect(struct i2c_client *client, int kind, struct i2c_board_info *info); * Set the new-style driver detect callback to this new function, and set its address_data to &addr_data (addr_data is generally provided by I2C_CLIENT_INSMOD.) * Add the appropriate class to the new-style driver. This is typically the class the legacy attach_adapter method was checking for. Class checking is now mandatory (done by i2c-core.) See <linux/i2c.h> for the list of available classes. * Remove the i2c_client allocation and freeing from the detect function. A pre-allocated client is now handed to you by i2c-core, and is freed automatically. * Make the detect function fill the type field of the i2c_board_info structure it was passed as a parameter, and return 0 on success. If the detection fails, return -ENODEV. Signed-off-by: Jean Delvare --- Documentation/i2c/writing-clients | 29 +++++ drivers/i2c/i2c-core.c | 223 ++++++++++++++++++++++++++++++++++++-- include/linux/i2c.h | 36 +++++- 3 files changed, 272 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/Documentation/i2c/writing-clients b/Documentation/i2c/writing-clients index 63722d3c9cdf..6b61b3a2e90b 100644 --- a/Documentation/i2c/writing-clients +++ b/Documentation/i2c/writing-clients @@ -44,6 +44,10 @@ static struct i2c_driver foo_driver = { .id_table = foo_ids, .probe = foo_probe, .remove = foo_remove, + /* if device autodetection is needed: */ + .class = I2C_CLASS_SOMETHING, + .detect = foo_detect, + .address_data = &addr_data, /* else, driver uses "legacy" binding model: */ .attach_adapter = foo_attach_adapter, @@ -217,6 +221,31 @@ in the I2C bus driver. You may want to save the returned i2c_client reference for later use. +Device Detection (Standard driver model) +---------------------------------------- + +Sometimes you do not know in advance which I2C devices are connected to +a given I2C bus. This is for example the case of hardware monitoring +devices on a PC's SMBus. In that case, you may want to let your driver +detect supported devices automatically. This is how the legacy model +was working, and is now available as an extension to the standard +driver model (so that we can finally get rid of the legacy model.) + +You simply have to define a detect callback which will attempt to +identify supported devices (returning 0 for supported ones and -ENODEV +for unsupported ones), a list of addresses to probe, and a device type +(or class) so that only I2C buses which may have that type of device +connected (and not otherwise enumerated) will be probed. The i2c +core will then call you back as needed and will instantiate a device +for you for every successful detection. + +Note that this mechanism is purely optional and not suitable for all +devices. You need some reliable way to identify the supported devices +(typically using device-specific, dedicated identification registers), +otherwise misdetections are likely to occur and things can go wrong +quickly.
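To make the shape of such a callback concrete, a minimal detect function under this new mechanism might look as follows (an illustrative sketch, not part of this patch; FOO_REG_CHIPID and FOO_CHIPID stand in for a real device's identification register and expected value):

static int foo_detect(struct i2c_client *client, int kind,
		      struct i2c_board_info *info)
{
	/*
	 * Identify the chip through a dedicated ID register; anything
	 * less reliable invites the misdetections warned about above.
	 * kind is negative for probed addresses, so the check may be
	 * skipped for forced entries if desired.
	 */
	if (kind < 0 &&
	    i2c_smbus_read_byte_data(client, FOO_REG_CHIPID) != FOO_CHIPID)
		return -ENODEV;

	/* Report the device type; i2c-core then instantiates the device */
	strlcpy(info->type, "foo", I2C_NAME_SIZE);
	return 0;
}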
+ + Device Deletion (Standard driver model) --------------------------------------- diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c index 5e249d758828..0a79f7661017 100644 --- a/drivers/i2c/i2c-core.c +++ b/drivers/i2c/i2c-core.c @@ -42,7 +42,9 @@ static DEFINE_MUTEX(core_lock); static DEFINE_IDR(i2c_adapter_idr); -#define is_newstyle_driver(d) ((d)->probe || (d)->remove) +#define is_newstyle_driver(d) ((d)->probe || (d)->remove || (d)->detect) + +static int i2c_detect(struct i2c_adapter *adapter, struct i2c_driver *driver); /* ------------------------------------------------------------------------- */ @@ -418,6 +420,10 @@ static int i2c_do_add_adapter(struct device_driver *d, void *data) struct i2c_driver *driver = to_i2c_driver(d); struct i2c_adapter *adap = data; + /* Detect supported devices on that bus, and instantiate them */ + i2c_detect(adap, driver); + + /* Let legacy drivers scan this bus for matching devices */ if (driver->attach_adapter) { /* We ignore the return code; if it fails, too bad */ driver->attach_adapter(adap); @@ -457,7 +463,7 @@ static int i2c_register_adapter(struct i2c_adapter *adap) if (adap->nr < __i2c_first_dynamic_bus_num) i2c_scan_static_board_info(adap); - /* let legacy drivers scan this bus for matching devices */ + /* Notify drivers */ dummy = bus_for_each_drv(&i2c_bus_type, NULL, adap, i2c_do_add_adapter); @@ -563,8 +569,19 @@ static int i2c_do_del_adapter(struct device_driver *d, void *data) { struct i2c_driver *driver = to_i2c_driver(d); struct i2c_adapter *adapter = data; + struct i2c_client *client, *_n; int res; + /* Remove the devices we created ourselves */ + list_for_each_entry_safe(client, _n, &driver->clients, detected) { + if (client->adapter == adapter) { + dev_dbg(&adapter->dev, "Removing %s at 0x%x\n", + client->name, client->addr); + list_del(&client->detected); + i2c_unregister_device(client); + } + } + if (!driver->detach_adapter) return 0; res = driver->detach_adapter(adapter); @@ -651,7 +668,11 @@ static int __attach_adapter(struct device *dev, void *data) struct i2c_adapter *adapter = to_i2c_adapter(dev); struct i2c_driver *driver = data; - driver->attach_adapter(adapter); + i2c_detect(adapter, driver); + + /* Legacy drivers scan i2c busses directly */ + if (driver->attach_adapter) + driver->attach_adapter(adapter); return 0; } @@ -695,10 +716,9 @@ int i2c_register_driver(struct module *owner, struct i2c_driver *driver) pr_debug("i2c-core: driver [%s] registered\n", driver->driver.name); - /* legacy drivers scan i2c busses directly */ - if (driver->attach_adapter) - class_for_each_device(&i2c_adapter_class, driver, - __attach_adapter); + INIT_LIST_HEAD(&driver->clients); + /* Walk the adapters that are already present */ + class_for_each_device(&i2c_adapter_class, driver, __attach_adapter); mutex_unlock(&core_lock); return 0; @@ -709,6 +729,17 @@ static int __detach_adapter(struct device *dev, void *data) { struct i2c_adapter *adapter = to_i2c_adapter(dev); struct i2c_driver *driver = data; + struct i2c_client *client, *_n; + + list_for_each_entry_safe(client, _n, &driver->clients, detected) { + dev_dbg(&adapter->dev, "Removing %s at 0x%x\n", + client->name, client->addr); + list_del(&client->detected); + i2c_unregister_device(client); + } + + if (is_newstyle_driver(driver)) + return 0; /* Have a look at each adapter, if clients of this driver are still * attached. 
If so, detach them to be able to kill the driver @@ -747,10 +778,7 @@ void i2c_del_driver(struct i2c_driver *driver) { mutex_lock(&core_lock); - /* legacy driver? */ - if (!is_newstyle_driver(driver)) - class_for_each_device(&i2c_adapter_class, driver, - __detach_adapter); + class_for_each_device(&i2c_adapter_class, driver, __detach_adapter); driver_unregister(&driver->driver); pr_debug("i2c-core: driver [%s] unregistered\n", driver->driver.name); @@ -1205,6 +1233,179 @@ int i2c_probe(struct i2c_adapter *adapter, } EXPORT_SYMBOL(i2c_probe); +/* Separate detection function for new-style drivers */ +static int i2c_detect_address(struct i2c_client *temp_client, int kind, + struct i2c_driver *driver) +{ + struct i2c_board_info info; + struct i2c_adapter *adapter = temp_client->adapter; + int addr = temp_client->addr; + int err; + + /* Make sure the address is valid */ + if (addr < 0x03 || addr > 0x77) { + dev_warn(&adapter->dev, "Invalid probe address 0x%02x\n", + addr); + return -EINVAL; + } + + /* Skip if already in use */ + if (i2c_check_addr(adapter, addr)) + return 0; + + /* Make sure there is something at this address, unless forced */ + if (kind < 0) { + if (i2c_smbus_xfer(adapter, addr, 0, 0, 0, + I2C_SMBUS_QUICK, NULL) < 0) + return 0; + + /* prevent 24RF08 corruption */ + if ((addr & ~0x0f) == 0x50) + i2c_smbus_xfer(adapter, addr, 0, 0, 0, + I2C_SMBUS_QUICK, NULL); + } + + /* Finally call the custom detection function */ + memset(&info, 0, sizeof(struct i2c_board_info)); + info.addr = addr; + err = driver->detect(temp_client, kind, &info); + if (err) { + /* -ENODEV is returned if the detection fails. We catch it + here as this isn't an error. */ + return err == -ENODEV ? 0 : err; + } + + /* Consistency check */ + if (info.type[0] == '\0') { + dev_err(&adapter->dev, "%s detection function provided " + "no name for 0x%x\n", driver->driver.name, + addr); + } else { + struct i2c_client *client; + + /* Detection succeeded, instantiate the device */ + dev_dbg(&adapter->dev, "Creating %s at 0x%02x\n", + info.type, info.addr); + client = i2c_new_device(adapter, &info); + if (client) + list_add_tail(&client->detected, &driver->clients); + else + dev_err(&adapter->dev, "Failed creating %s at 0x%02x\n", + info.type, info.addr); + } + return 0; +} + +static int i2c_detect(struct i2c_adapter *adapter, struct i2c_driver *driver) +{ + const struct i2c_client_address_data *address_data; + struct i2c_client *temp_client; + int i, err = 0; + int adap_id = i2c_adapter_id(adapter); + + address_data = driver->address_data; + if (!driver->detect || !address_data) + return 0; + + /* Set up a temporary client to help detect callback */ + temp_client = kzalloc(sizeof(struct i2c_client), GFP_KERNEL); + if (!temp_client) + return -ENOMEM; + temp_client->adapter = adapter; + + /* Force entries are done first, and are not affected by ignore + entries */ + if (address_data->forces) { + const unsigned short * const *forces = address_data->forces; + int kind; + + for (kind = 0; forces[kind]; kind++) { + for (i = 0; forces[kind][i] != I2C_CLIENT_END; + i += 2) { + if (forces[kind][i] == adap_id + || forces[kind][i] == ANY_I2C_BUS) { + dev_dbg(&adapter->dev, "found force " + "parameter for adapter %d, " + "addr 0x%02x, kind %d\n", + adap_id, forces[kind][i + 1], + kind); + temp_client->addr = forces[kind][i + 1]; + err = i2c_detect_address(temp_client, + kind, driver); + if (err) + goto exit_free; + } + } + } + } + + /* Stop here if we can't use SMBUS_QUICK */ + if (!i2c_check_functionality(adapter, 
I2C_FUNC_SMBUS_QUICK)) { + if (address_data->probe[0] == I2C_CLIENT_END + && address_data->normal_i2c[0] == I2C_CLIENT_END) + goto exit_free; + + dev_warn(&adapter->dev, "SMBus Quick command not supported, " + "can't probe for chips\n"); + err = -EOPNOTSUPP; + goto exit_free; + } + + /* Stop here if the classes do not match */ + if (!(adapter->class & driver->class)) + goto exit_free; + + /* Probe entries are done second, and are not affected by ignore + entries either */ + for (i = 0; address_data->probe[i] != I2C_CLIENT_END; i += 2) { + if (address_data->probe[i] == adap_id + || address_data->probe[i] == ANY_I2C_BUS) { + dev_dbg(&adapter->dev, "found probe parameter for " + "adapter %d, addr 0x%02x\n", adap_id, + address_data->probe[i + 1]); + temp_client->addr = address_data->probe[i + 1]; + err = i2c_detect_address(temp_client, -1, driver); + if (err) + goto exit_free; + } + } + + /* Normal entries are done last, unless shadowed by an ignore entry */ + for (i = 0; address_data->normal_i2c[i] != I2C_CLIENT_END; i += 1) { + int j, ignore; + + ignore = 0; + for (j = 0; address_data->ignore[j] != I2C_CLIENT_END; + j += 2) { + if ((address_data->ignore[j] == adap_id || + address_data->ignore[j] == ANY_I2C_BUS) + && address_data->ignore[j + 1] + == address_data->normal_i2c[i]) { + dev_dbg(&adapter->dev, "found ignore " + "parameter for adapter %d, " + "addr 0x%02x\n", adap_id, + address_data->ignore[j + 1]); + ignore = 1; + break; + } + } + if (ignore) + continue; + + dev_dbg(&adapter->dev, "found normal entry for adapter %d, " + "addr 0x%02x\n", adap_id, + address_data->normal_i2c[i]); + temp_client->addr = address_data->normal_i2c[i]; + err = i2c_detect_address(temp_client, -1, driver); + if (err) + goto exit_free; + } + + exit_free: + kfree(temp_client); + return err; +} + struct i2c_client * i2c_new_probed_device(struct i2c_adapter *adap, struct i2c_board_info *info, diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 50cbab4b62b0..08be0d21864c 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -45,6 +45,7 @@ struct i2c_adapter; struct i2c_client; struct i2c_driver; union i2c_smbus_data; +struct i2c_board_info; /* * The master routines are the ones normally used to transmit data to devices @@ -94,15 +95,33 @@ extern s32 i2c_smbus_write_i2c_block_data(struct i2c_client * client, u8 command, u8 length, const u8 *values); -/* - * A driver is capable of handling one or more physical devices present on - * I2C adapters. This information is used to inform the driver of adapter - * events. +/** + * struct i2c_driver - represent an I2C device driver + * @class: What kind of i2c device we instantiate (for detect) + * @detect: Callback for device detection + * @address_data: The I2C addresses to probe, ignore or force (for detect) + * @clients: List of detected clients we created (for i2c-core use only) * * The driver.owner field should be set to the module owner of this driver. * The driver.name field should be set to the name of this driver. + * + * For automatic device detection, both @detect and @address_data must + * be defined. @class should also be set, otherwise only devices forced + * with module parameters will be created. The detect function must + * fill at least the name field of the i2c_board_info structure it is + * handed upon successful detection, and possibly also the flags field. + * + * If @detect is missing, the driver will still work fine for enumerated + * devices. Detected devices simply won't be supported. 
This is expected + for the many I2C/SMBus devices which can't be detected reliably, and + the ones which can always be enumerated in practice. + * + * The i2c_client structure which is handed to the @detect callback is + * not a real i2c_client. It is initialized just enough so that you can + * call i2c_smbus_read_byte_data and friends on it. Don't do anything + * else with it. In particular, calling dev_dbg and friends on it is + * not allowed. */ - struct i2c_driver { int id; unsigned int class; @@ -142,6 +161,11 @@ struct i2c_driver { struct device_driver driver; const struct i2c_device_id *id_table; + + /* Device detection callback for automatic device creation */ + int (*detect)(struct i2c_client *, int kind, struct i2c_board_info *); + const struct i2c_client_address_data *address_data; + struct list_head clients; }; #define to_i2c_driver(d) container_of(d, struct i2c_driver, driver) @@ -157,6 +181,7 @@ struct i2c_driver { * @dev: Driver model device node for the slave. * @irq: indicates the IRQ generated by this device (if any) * @list: list of active/busy clients (DEPRECATED) + * @detected: member of an i2c_driver.clients list * @released: used to synchronize client releases & detaches and references * * An i2c_client identifies a single device (i.e. chip) connected to an @@ -174,6 +199,7 @@ struct i2c_client { struct device dev; /* the device structure */ int irq; /* irq issued by device */ struct list_head list; /* DEPRECATED */ + struct list_head detected; struct completion released; }; #define to_i2c_client(d) container_of(d, struct i2c_client, dev) -- cgit v1.2.3 From 521e575b9a7324a0bca762622139f69582a042bf Mon Sep 17 00:00:00 2001 From: Ron Livne Date: Mon, 14 Jul 2008 23:48:48 -0700 Subject: IB/mlx4: Add support for blocking multicast loopback packets Add support for handling the IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK flag by using the per-multicast group loopback blocking feature of mlx4 hardware.
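For context, a kernel consumer opts into this behavior at QP creation time through the create_flags field of the QP init attributes. A minimal sketch follows (not from this patch; pd, send_cq and recv_cq are assumed to exist, and per this patch the flag is honored only for kernel UD QPs):

struct ib_qp_init_attr init_attr = {
	.qp_type      = IB_QPT_UD,
	.send_cq      = send_cq,
	.recv_cq      = recv_cq,
	.cap          = { .max_send_wr = 64, .max_recv_wr = 64,
			  .max_send_sge = 1, .max_recv_sge = 1 },
	/* multicast sends from this QP will not be looped back to it */
	.create_flags = IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK,
};
struct ib_qp *qp = ib_create_qp(pd, &init_attr);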
Signed-off-by: Ron Livne Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/main.c | 7 +++++-- drivers/infiniband/hw/mlx4/mlx4_ib.h | 3 ++- drivers/infiniband/hw/mlx4/qp.c | 21 ++++++++++++++++++--- drivers/net/mlx4/mcg.c | 17 +++++++++++++---- include/linux/mlx4/device.h | 3 ++- 5 files changed, 40 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 4d61e32866c6..bcf50648fa18 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -90,7 +90,8 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, props->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT | IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SYS_IMAGE_GUID | - IB_DEVICE_RC_RNR_NAK_GEN; + IB_DEVICE_RC_RNR_NAK_GEN | + IB_DEVICE_BLOCK_MULTICAST_LOOPBACK; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR) props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR) @@ -437,7 +438,9 @@ static int mlx4_ib_dealloc_pd(struct ib_pd *pd) static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { return mlx4_multicast_attach(to_mdev(ibqp->device)->dev, - &to_mqp(ibqp)->mqp, gid->raw); + &to_mqp(ibqp)->mqp, gid->raw, + !!(to_mqp(ibqp)->flags & + MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK)); } static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index 5cf994794d25..c4cf5b69eefa 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -101,7 +101,8 @@ struct mlx4_ib_wq { }; enum mlx4_ib_qp_flags { - MLX4_IB_QP_LSO = 1 << 0 + MLX4_IB_QP_LSO = 1 << 0, + MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK = 1 << 1, }; struct mlx4_ib_qp { diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 44bbd6c2e315..91590e7fba0c 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -511,6 +511,9 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, } else { qp->sq_no_prefetch = 0; + if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) + qp->flags |= MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK; + if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO) qp->flags |= MLX4_IB_QP_LSO; @@ -684,10 +687,15 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, struct mlx4_ib_qp *qp; int err; - /* We only support LSO, and only for kernel UD QPs. */ - if (init_attr->create_flags & ~IB_QP_CREATE_IPOIB_UD_LSO) + /* + * We only support LSO and multicast loopback blocking, and + * only for kernel UD QPs. 
+ */ + if (init_attr->create_flags & ~(IB_QP_CREATE_IPOIB_UD_LSO | + IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)) return ERR_PTR(-EINVAL); - if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO && + + if (init_attr->create_flags && (pd->uobject || init_attr->qp_type != IB_QPT_UD)) return ERR_PTR(-EINVAL); @@ -1844,6 +1852,13 @@ done: qp_init_attr->cap = qp_attr->cap; + qp_init_attr->create_flags = 0; + if (qp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK) + qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK; + + if (qp->flags & MLX4_IB_QP_LSO) + qp_init_attr->create_flags |= IB_QP_CREATE_IPOIB_UD_LSO; + out: mutex_unlock(&qp->mutex); return err; diff --git a/drivers/net/mlx4/mcg.c b/drivers/net/mlx4/mcg.c index 57f7f1f0d4ec..b4b57870ddfd 100644 --- a/drivers/net/mlx4/mcg.c +++ b/drivers/net/mlx4/mcg.c @@ -38,6 +38,9 @@ #include "mlx4.h" +#define MGM_QPN_MASK 0x00FFFFFF +#define MGM_BLCK_LB_BIT 30 + struct mlx4_mgm { __be32 next_gid_index; __be32 members_count; @@ -153,7 +156,8 @@ static int find_mgm(struct mlx4_dev *dev, return err; } -int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16]) +int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], + int block_mcast_loopback) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_cmd_mailbox *mailbox; @@ -202,13 +206,18 @@ int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16]) } for (i = 0; i < members_count; ++i) - if (mgm->qp[i] == cpu_to_be32(qp->qpn)) { + if ((be32_to_cpu(mgm->qp[i]) & MGM_QPN_MASK) == qp->qpn) { mlx4_dbg(dev, "QP %06x already a member of MGM\n", qp->qpn); err = 0; goto out; } - mgm->qp[members_count++] = cpu_to_be32(qp->qpn); + if (block_mcast_loopback) + mgm->qp[members_count++] = cpu_to_be32((qp->qpn & MGM_QPN_MASK) | + (1 << MGM_BLCK_LB_BIT)); + else + mgm->qp[members_count++] = cpu_to_be32(qp->qpn & MGM_QPN_MASK); + mgm->members_count = cpu_to_be32(members_count); err = mlx4_WRITE_MCG(dev, index, mailbox); @@ -283,7 +292,7 @@ int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16]) members_count = be32_to_cpu(mgm->members_count); for (loc = -1, i = 0; i < members_count; ++i) - if (mgm->qp[i] == cpu_to_be32(qp->qpn)) + if ((be32_to_cpu(mgm->qp[i]) & MGM_QPN_MASK) == qp->qpn) loc = i; if (loc == -1) { diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index a744383d16e9..81b3dd5206e0 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -398,7 +398,8 @@ int mlx4_srq_query(struct mlx4_dev *dev, struct mlx4_srq *srq, int *limit_waterm int mlx4_INIT_PORT(struct mlx4_dev *dev, int port); int mlx4_CLOSE_PORT(struct mlx4_dev *dev, int port); -int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16]); +int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], + int block_mcast_loopback); int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16]); int mlx4_map_phys_fmr(struct mlx4_dev *dev, struct mlx4_fmr *fmr, u64 *page_list, -- cgit v1.2.3 From 124cafc5eb973e748c4ce3dc1caad29274e64613 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 15 Jul 2008 21:21:44 +0200 Subject: ide: remove ide_init_drive_cmd ide_init_drive_cmd just calls blk_rq_init. This converts the users of ide_init_drive_cmd to use blk_rq_init directly and removes ide_init_drive_cmd. 
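The conversion is mechanical; each call site changes as follows (a sketch, where rq is whatever request the driver is preparing):

/* before this patch */
ide_init_drive_cmd(rq);

/* after this patch: call the block layer helper directly; passing a
 * NULL request_queue matches what the removed wrapper did internally */
blk_rq_init(NULL, rq);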
Signed-off-by: FUJITA Tomonori Cc: Borislav Petkov Cc: Jens Axboe Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-cd.c | 2 +- drivers/ide/ide-floppy.c | 2 +- drivers/ide/ide-io.c | 17 ----------------- drivers/scsi/ide-scsi.c | 4 ++-- include/linux/ide.h | 2 -- 5 files changed, 4 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 792a3cf73d6e..7917cd576446 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -193,7 +193,7 @@ void ide_cd_init_rq(ide_drive_t *drive, struct request *rq) { struct cdrom_info *cd = drive->driver_data; - ide_init_drive_cmd(rq); + blk_rq_init(NULL, rq); rq->cmd_type = REQ_TYPE_ATA_PC; rq->rq_disk = cd->disk; } diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index b10e9a813cde..9161cd92a842 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -286,7 +286,7 @@ static void idefloppy_queue_pc_head(ide_drive_t *drive, struct ide_atapi_pc *pc, { struct ide_floppy_obj *floppy = drive->driver_data; - ide_init_drive_cmd(rq); + blk_rq_init(NULL, rq); rq->buffer = (char *) pc; rq->cmd_type = REQ_TYPE_SPECIAL; rq->cmd_flags |= REQ_PREEMPT; diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 29f5cc863f6e..d8b4d9f81ae2 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -1538,23 +1538,6 @@ irqreturn_t ide_intr (int irq, void *dev_id) return IRQ_HANDLED; } -/** - * ide_init_drive_cmd - initialize a drive command request - * @rq: request object - * - * Initialize a request before we fill it in and send it down to - * ide_do_drive_cmd. Commands must be set up by this function. Right - * now it doesn't do a lot, but if that changes abusers will have a - * nasty surprise. - */ - -void ide_init_drive_cmd (struct request *rq) -{ - blk_rq_init(NULL, rq); -} - -EXPORT_SYMBOL(ide_init_drive_cmd); - /** * ide_do_drive_cmd - issue IDE special command * @drive: device to issue command diff --git a/drivers/scsi/ide-scsi.c b/drivers/scsi/ide-scsi.c index 89ecf0132191..da261806d62a 100644 --- a/drivers/scsi/ide-scsi.c +++ b/drivers/scsi/ide-scsi.c @@ -228,7 +228,7 @@ static int idescsi_check_condition(ide_drive_t *drive, kfree(pc); return -ENOMEM; } - ide_init_drive_cmd(rq); + blk_rq_init(NULL, rq); rq->special = (char *) pc; pc->rq = rq; pc->buf = buf; @@ -786,7 +786,7 @@ static int idescsi_queue (struct scsi_cmnd *cmd, } } - ide_init_drive_cmd (rq); + blk_rq_init(NULL, rq); rq->special = (char *) pc; rq->cmd_type = REQ_TYPE_SPECIAL; spin_unlock_irq(host->host_lock); diff --git a/include/linux/ide.h b/include/linux/ide.h index eddb6daadf4a..3261c6691759 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -857,8 +857,6 @@ int ide_wait_stat(ide_startstop_t *, ide_drive_t *, u8, u8, unsigned long); extern ide_startstop_t ide_do_reset (ide_drive_t *); -extern void ide_init_drive_cmd (struct request *rq); - /* * "action" parameter type for ide_do_drive_cmd() below. */ -- cgit v1.2.3 From 681a561b7ec7fdcd8f35b68e44ac6d6c70aecc04 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 15 Jul 2008 21:21:45 +0200 Subject: block: unexport blk_end_sync_rq All the users of blk_end_sync_rq have gone (they are converted to use blk_execute_rq). This unexports blk_end_sync_rq.
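For reference, the blk_execute_rq() path that replaced the open-coded blk_end_sync_rq/completion pattern looks roughly like this (a hedged sketch; the queue q, the gendisk and the actual command setup belong to the caller):

struct request *rq;
int err;

rq = blk_get_request(q, READ, __GFP_WAIT);
rq->cmd_type = REQ_TYPE_SPECIAL;
/* blk_execute_rq() inserts the request, waits for it to complete and
 * returns its error status, so no private end_io handler is needed */
err = blk_execute_rq(q, disk, rq, 0);
blk_put_request(rq);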
Signed-off-by: FUJITA Tomonori Cc: Borislav Petkov Signed-off-by: Jens Axboe Signed-off-by: Bartlomiej Zolnierkiewicz --- block/blk-exec.c | 3 +-- include/linux/blkdev.h | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/block/blk-exec.c b/block/blk-exec.c index 4f52f2792059..9bceff7674f2 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -18,7 +18,7 @@ * @rq: request to complete * @error: end io status of the request */ -void blk_end_sync_rq(struct request *rq, int error) +static void blk_end_sync_rq(struct request *rq, int error) { struct completion *waiting = rq->end_io_data; @@ -31,7 +31,6 @@ void blk_end_sync_rq(struct request *rq, int error) */ complete(waiting); } -EXPORT_SYMBOL(blk_end_sync_rq); /** * blk_execute_rq_nowait - insert a request into queue for execution diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d2a1b71e93c3..1171abd7eb17 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -623,7 +623,6 @@ extern void generic_make_request(struct bio *bio); extern void blk_rq_init(struct request_queue *q, struct request *rq); extern void blk_put_request(struct request *); extern void __blk_put_request(struct request_queue *, struct request *); -extern void blk_end_sync_rq(struct request *rq, int error); extern struct request *blk_get_request(struct request_queue *, int, gfp_t); extern void blk_insert_request(struct request_queue *, struct request *, int, void *); extern void blk_requeue_request(struct request_queue *, struct request *); -- cgit v1.2.3 From 30e5ee4d1a651a0c66e86c6612c003034bd20ba2 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 15 Jul 2008 21:21:46 +0200 Subject: ide: remove obsoleted "idebus=" kernel parameter * Remove obsoleted "idebus=" kernel parameter. * Remove no longer needed ide_system_bus_speed() and system_bus_clock() (together with idebus_parameter and system_bus_speed variables). Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide.c | 71 +------------------------------------------- drivers/ide/legacy/ali14xx.c | 2 +- drivers/ide/legacy/ht6560b.c | 2 +- drivers/ide/legacy/qd65xx.c | 4 +-- drivers/ide/pci/aec62xx.c | 2 +- drivers/ide/pci/alim15x3.c | 2 +- drivers/ide/pci/amd74xx.c | 2 +- drivers/ide/pci/cmd640.c | 8 ++--- drivers/ide/pci/cmd64x.c | 4 +-- drivers/ide/pci/cy82c693.c | 2 +- drivers/ide/pci/via82cxxx.c | 2 +- include/linux/ide.h | 2 -- 12 files changed, 15 insertions(+), 88 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide.c b/drivers/ide/ide.c index 8823df1b8716..f65be738b16a 100644 --- a/drivers/ide/ide.c +++ b/drivers/ide/ide.c @@ -86,9 +86,6 @@ static const u8 ide_hwif_to_major[] = { IDE0_MAJOR, IDE1_MAJOR, IDE6_MAJOR, IDE7_MAJOR, IDE8_MAJOR, IDE9_MAJOR }; -static int idebus_parameter; /* holds the "idebus=" parameter */ -static int system_bus_speed; /* holds what we think is VESA/PCI bus speed */ - DEFINE_MUTEX(ide_cfg_mtx); __cacheline_aligned_in_smp DEFINE_SPINLOCK(ide_lock); @@ -189,38 +186,6 @@ static void __init init_ide_data (void) } } -/** - * ide_system_bus_speed - guess bus speed - * - * ide_system_bus_speed() returns what we think is the system VESA/PCI - * bus speed (in MHz). This is used for calculating interface PIO timings. - * The default is 40 for known PCI systems, 50 otherwise. - * The "idebus=xx" parameter can be used to override this value. - * The actual value to be used is computed/displayed the first time - * through. Drivers should only use this as a last resort. 
- * - * Returns a guessed speed in MHz. - */ - -static int ide_system_bus_speed(void) -{ -#ifdef CONFIG_PCI - static struct pci_device_id pci_default[] = { - { PCI_DEVICE(PCI_ANY_ID, PCI_ANY_ID) }, - { } - }; -#else -#define pci_default 0 -#endif /* CONFIG_PCI */ - - /* user supplied value */ - if (idebus_parameter) - return idebus_parameter; - - /* safe default value for PCI or VESA and PCI*/ - return pci_dev_present(pci_default) ? 33 : 50; -} - void ide_remove_port_from_hwgroup(ide_hwif_t *hwif) { ide_hwgroup_t *hwgroup = hwif->hwgroup; @@ -540,20 +505,6 @@ static int set_unmaskirq(ide_drive_t *drive, int arg) return 0; } -/** - * system_bus_clock - clock guess - * - * External version of the bus clock guess used by very old IDE drivers - * for things like VLB timings. Should not be used. - */ - -int system_bus_clock (void) -{ - return system_bus_speed; -} - -EXPORT_SYMBOL(system_bus_clock); - static int generic_ide_suspend(struct device *dev, pm_message_t mesg) { ide_drive_t *drive = dev->driver_data; @@ -851,7 +802,7 @@ static int __init ide_setup(char *s) if (strncmp(s,"hd",2) == 0 && s[2] == '=') /* hd= is for hd.c */ return 0; /* driver and not us */ - if (strncmp(s,"ide",3) && strncmp(s,"idebus",6) && strncmp(s,"hd",2)) + if (strncmp(s, "ide", 3) && strncmp(s, "hd", 2)) return 0; printk(KERN_INFO "ide_setup: %s", s); @@ -951,21 +902,6 @@ static int __init ide_setup(char *s) } } - if (s[0] != 'i' || s[1] != 'd' || s[2] != 'e') - goto bad_option; - /* - * Look for bus speed option: "idebus=" - */ - if (s[3] == 'b' && s[4] == 'u' && s[5] == 's') { - if (match_parm(&s[6], NULL, vals, 1) != 1) - goto bad_option; - if (vals[0] >= 20 && vals[0] <= 66) { - idebus_parameter = vals[0]; - } else - printk(" -- BAD BUS SPEED! Expected value from 20 to 66"); - goto obsolete_option; - } - bad_option: printk(" -- BAD OPTION\n"); return 1; @@ -1287,11 +1223,6 @@ static int __init ide_init(void) int ret; printk(KERN_INFO "Uniform Multi-Platform E-IDE driver\n"); - system_bus_speed = ide_system_bus_speed(); - - printk(KERN_INFO "ide: Assuming %dMHz system bus speed " - "for PIO modes%s\n", system_bus_speed, - idebus_parameter ? "" : "; override with idebus=xx"); ret = bus_register(&ide_bus_type); if (ret < 0) { diff --git a/drivers/ide/legacy/ali14xx.c b/drivers/ide/legacy/ali14xx.c index 90c65cf97448..052125fafcfa 100644 --- a/drivers/ide/legacy/ali14xx.c +++ b/drivers/ide/legacy/ali14xx.c @@ -116,7 +116,7 @@ static void ali14xx_set_pio_mode(ide_drive_t *drive, const u8 pio) int time1, time2; u8 param1, param2, param3, param4; unsigned long flags; - int bus_speed = ide_vlb_clk ? ide_vlb_clk : system_bus_clock(); + int bus_speed = ide_vlb_clk ? ide_vlb_clk : 50; /* calculate timing, according to PIO mode */ time1 = ide_pio_cycle_time(drive, pio); diff --git a/drivers/ide/legacy/ht6560b.c b/drivers/ide/legacy/ht6560b.c index 4fe516df9f74..dd6dfb32e853 100644 --- a/drivers/ide/legacy/ht6560b.c +++ b/drivers/ide/legacy/ht6560b.c @@ -212,7 +212,7 @@ static u8 ht_pio2timings(ide_drive_t *drive, const u8 pio) { int active_time, recovery_time; int active_cycles, recovery_cycles; - int bus_speed = ide_vlb_clk ? ide_vlb_clk : system_bus_clock(); + int bus_speed = ide_vlb_clk ? 
ide_vlb_clk : 50; if (pio) { unsigned int cycle_time; diff --git a/drivers/ide/legacy/qd65xx.c b/drivers/ide/legacy/qd65xx.c index 6424af154325..51dba82f8812 100644 --- a/drivers/ide/legacy/qd65xx.c +++ b/drivers/ide/legacy/qd65xx.c @@ -110,7 +110,7 @@ static void qd65xx_select(ide_drive_t *drive) static u8 qd6500_compute_timing (ide_hwif_t *hwif, int active_time, int recovery_time) { - int clk = ide_vlb_clk ? ide_vlb_clk : system_bus_clock(); + int clk = ide_vlb_clk ? ide_vlb_clk : 50; u8 act_cyc, rec_cyc; if (clk <= 33) { @@ -132,7 +132,7 @@ static u8 qd6500_compute_timing (ide_hwif_t *hwif, int active_time, int recovery static u8 qd6580_compute_timing (int active_time, int recovery_time) { - int clk = ide_vlb_clk ? ide_vlb_clk : system_bus_clock(); + int clk = ide_vlb_clk ? ide_vlb_clk : 50; u8 act_cyc, rec_cyc; act_cyc = 17 - IDE_IN(active_time * clk / 1000 + 1, 2, 17); diff --git a/drivers/ide/pci/aec62xx.c b/drivers/ide/pci/aec62xx.c index 7f46c224b7c4..ae7a4329a581 100644 --- a/drivers/ide/pci/aec62xx.c +++ b/drivers/ide/pci/aec62xx.c @@ -140,7 +140,7 @@ static void aec_set_pio_mode(ide_drive_t *drive, const u8 pio) static unsigned int __devinit init_chipset_aec62xx(struct pci_dev *dev, const char *name) { - int bus_speed = ide_pci_clk ? ide_pci_clk : system_bus_clock(); + int bus_speed = ide_pci_clk ? ide_pci_clk : 33; if (bus_speed <= 33) pci_set_drvdata(dev, (void *) aec6xxx_33_base); diff --git a/drivers/ide/pci/alim15x3.c b/drivers/ide/pci/alim15x3.c index f2129d5e07f2..f2de00adf147 100644 --- a/drivers/ide/pci/alim15x3.c +++ b/drivers/ide/pci/alim15x3.c @@ -72,7 +72,7 @@ static void ali_set_pio_mode(ide_drive_t *drive, const u8 pio) int s_time, a_time, c_time; u8 s_clc, a_clc, r_clc; unsigned long flags; - int bus_speed = ide_pci_clk ? ide_pci_clk : system_bus_clock(); + int bus_speed = ide_pci_clk ? ide_pci_clk : 33; int port = hwif->channel ? 0x5c : 0x58; int portFIFO = hwif->channel ? 0x55 : 0x54; u8 cd_dma_fifo = 0; diff --git a/drivers/ide/pci/amd74xx.c b/drivers/ide/pci/amd74xx.c index a373101747b6..ad222206a429 100644 --- a/drivers/ide/pci/amd74xx.c +++ b/drivers/ide/pci/amd74xx.c @@ -179,7 +179,7 @@ static unsigned int __devinit init_chipset_amd74xx(struct pci_dev *dev, * Determine the system bus clock. */ - amd_clock = (ide_pci_clk ? ide_pci_clk : system_bus_clock()) * 1000; + amd_clock = (ide_pci_clk ? ide_pci_clk : 33) * 1000; switch (amd_clock) { case 33000: amd_clock = 33333; break; diff --git a/drivers/ide/pci/cmd640.c b/drivers/ide/pci/cmd640.c index b38a1980dcd5..cd1ba14984ab 100644 --- a/drivers/ide/pci/cmd640.c +++ b/drivers/ide/pci/cmd640.c @@ -525,12 +525,10 @@ static void cmd640_set_mode(ide_drive_t *drive, unsigned int index, u8 setup_count, active_count, recovery_count, recovery_count2, cycle_count; int bus_speed; - if (cmd640_vlb && ide_vlb_clk) - bus_speed = ide_vlb_clk; - else if (!cmd640_vlb && ide_pci_clk) - bus_speed = ide_pci_clk; + if (cmd640_vlb) + bus_speed = ide_vlb_clk ? ide_vlb_clk : 50; else - bus_speed = system_bus_clock(); + bus_speed = ide_pci_clk ? ide_pci_clk : 33; if (pio_mode > 5) pio_mode = 5; diff --git a/drivers/ide/pci/cmd64x.c b/drivers/ide/pci/cmd64x.c index 08674711d089..ca4774aa27ee 100644 --- a/drivers/ide/pci/cmd64x.c +++ b/drivers/ide/pci/cmd64x.c @@ -69,7 +69,7 @@ static u8 quantize_timing(int timing, int quant) static void program_cycle_times (ide_drive_t *drive, int cycle_time, int active_time) { struct pci_dev *dev = to_pci_dev(drive->hwif->dev); - int clock_time = 1000 / (ide_pci_clk ? 
ide_pci_clk : system_bus_clock()); + int clock_time = 1000 / (ide_pci_clk ? ide_pci_clk : 33); u8 cycle_count, active_count, recovery_count, drwtim; static const u8 recovery_values[] = {15, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0}; @@ -128,7 +128,7 @@ static void cmd64x_tune_pio(ide_drive_t *drive, const u8 pio) ide_pio_timings[pio].active_time); setup_count = quantize_timing(ide_pio_timings[pio].setup_time, - 1000 / (ide_pci_clk ? ide_pci_clk : system_bus_clock())); + 1000 / (ide_pci_clk ? ide_pci_clk : 33)); /* * The primary channel has individual address setup timing registers diff --git a/drivers/ide/pci/cy82c693.c b/drivers/ide/pci/cy82c693.c index 77cc22c2ad45..8c534afcb6c8 100644 --- a/drivers/ide/pci/cy82c693.c +++ b/drivers/ide/pci/cy82c693.c @@ -134,7 +134,7 @@ static int calc_clk(int time, int bus_speed) static void compute_clocks(u8 pio, pio_clocks_t *p_pclk) { int clk1, clk2; - int bus_speed = ide_pci_clk ? ide_pci_clk : system_bus_clock(); + int bus_speed = ide_pci_clk ? ide_pci_clk : 33; /* we don't check against CY82C693's min and max speed, * so you can play with the idebus=xx parameter diff --git a/drivers/ide/pci/via82cxxx.c b/drivers/ide/pci/via82cxxx.c index e8c2570003ff..3ed9728abd24 100644 --- a/drivers/ide/pci/via82cxxx.c +++ b/drivers/ide/pci/via82cxxx.c @@ -340,7 +340,7 @@ static unsigned int __devinit init_chipset_via82cxxx(struct pci_dev *dev, const * Determine system bus clock. */ - via_clock = (ide_pci_clk ? ide_pci_clk : system_bus_clock()) * 1000; + via_clock = (ide_pci_clk ? ide_pci_clk : 33) * 1000; switch (via_clock) { case 33000: via_clock = 33333; break; diff --git a/include/linux/ide.h b/include/linux/ide.h index 3261c6691759..dad535659249 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -994,8 +994,6 @@ int ide_taskfile_ioctl(ide_drive_t *, unsigned int, unsigned long); int ide_cmd_ioctl(ide_drive_t *, unsigned int, unsigned long); int ide_task_ioctl(ide_drive_t *, unsigned int, unsigned long); -extern int system_bus_clock(void); - extern int ide_driveid_update(ide_drive_t *); extern int ide_config_drive_speed(ide_drive_t *, u8); extern u8 eighty_ninty_three (ide_drive_t *); -- cgit v1.2.3 From 931ee0dc5c69e8113233d21942681ab8fecde7f9 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 15 Jul 2008 21:21:47 +0200 Subject: ide: remove obsoleted "ide=" kernel parameters * Remove obsoleted "ide=" kernel parameters. * Remove no longer needed: - ide_setup() - parse_options() - __setup("", ...) - module_param(options, ...) * Use module_{init,exit}() for MODULE=y case and remove MODULE ifdef. * Make ide_*acpi* and ide_doubler variables static. Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-acpi.c | 6 +-- drivers/ide/ide-dma.c | 2 +- drivers/ide/ide.c | 91 +++------------------------------------------- drivers/ide/legacy/gayle.c | 4 +- include/linux/ide.h | 4 -- 5 files changed, 10 insertions(+), 97 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-acpi.c b/drivers/ide/ide-acpi.c index 9d3601fa5680..6f704628c27d 100644 --- a/drivers/ide/ide-acpi.c +++ b/drivers/ide/ide-acpi.c @@ -60,15 +60,15 @@ struct ide_acpi_hwif_link { #define DEBPRINT(fmt, args...) 
do {} while (0) #endif /* DEBUGGING */ -int ide_noacpi; +static int ide_noacpi; module_param_named(noacpi, ide_noacpi, bool, 0); MODULE_PARM_DESC(noacpi, "disable IDE ACPI support"); -int ide_acpigtf; +static int ide_acpigtf; module_param_named(acpigtf, ide_acpigtf, bool, 0); MODULE_PARM_DESC(acpigtf, "enable IDE ACPI _GTF support"); -int ide_acpionboot; +static int ide_acpionboot; module_param_named(acpionboot, ide_acpionboot, bool, 0); MODULE_PARM_DESC(acpionboot, "call IDE ACPI methods on boot"); diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c index 653b1ade13d3..174f4704614c 100644 --- a/drivers/ide/ide-dma.c +++ b/drivers/ide/ide-dma.c @@ -692,7 +692,7 @@ static int ide_tune_dma(ide_drive_t *drive) ide_hwif_t *hwif = drive->hwif; u8 speed; - if (noautodma || drive->nodma || (drive->id->capability & 1) == 0) + if (drive->nodma || (drive->id->capability & 1) == 0) return 0; /* consult the list of known "bad" drives */ diff --git a/drivers/ide/ide.c b/drivers/ide/ide.c index e8c88ff2f6b6..1defba3eefe7 100644 --- a/drivers/ide/ide.c +++ b/drivers/ide/ide.c @@ -87,9 +87,9 @@ static const u8 ide_hwif_to_major[] = { IDE0_MAJOR, IDE1_MAJOR, IDE8_MAJOR, IDE9_MAJOR }; DEFINE_MUTEX(ide_cfg_mtx); - __cacheline_aligned_in_smp DEFINE_SPINLOCK(ide_lock); -int noautodma = 0; +__cacheline_aligned_in_smp DEFINE_SPINLOCK(ide_lock); +EXPORT_SYMBOL(ide_lock); ide_hwif_t ide_hwifs[MAX_HWIFS]; /* master data repository */ @@ -698,59 +698,6 @@ set_val: EXPORT_SYMBOL(generic_ide_ioctl); -/* - * ide_setup() gets called VERY EARLY during initialization, - * to handle kernel "command line" strings beginning with "ide". - * - * Remember to update Documentation/ide/ide.txt if you change something here. - */ -static int __init ide_setup(char *s) -{ - printk(KERN_INFO "ide_setup: %s", s); - -#ifdef CONFIG_BLK_DEV_IDEDOUBLER - if (!strcmp(s, "ide=doubler")) { - extern int ide_doubler; - - printk(" : Enabled support for IDE doublers\n"); - ide_doubler = 1; - goto obsolete_option; - } -#endif /* CONFIG_BLK_DEV_IDEDOUBLER */ - - if (!strcmp(s, "ide=nodma")) { - printk(" : Prevented DMA\n"); - noautodma = 1; - goto obsolete_option; - } - -#ifdef CONFIG_BLK_DEV_IDEACPI - if (!strcmp(s, "ide=noacpi")) { - //printk(" : Disable IDE ACPI support.\n"); - ide_noacpi = 1; - goto obsolete_option; - } - if (!strcmp(s, "ide=acpigtf")) { - //printk(" : Enable IDE ACPI _GTF support.\n"); - ide_acpigtf = 1; - goto obsolete_option; - } - if (!strcmp(s, "ide=acpionboot")) { - //printk(" : Call IDE ACPI methods on boot.\n"); - ide_acpionboot = 1; - goto obsolete_option; - } -#endif /* CONFIG_BLK_DEV_IDEACPI */ - - printk(" -- BAD OPTION\n"); - return 1; -obsolete_option: - printk(" -- OBSOLETE OPTION, WILL BE REMOVED SOON!\n"); - return 1; -} - -EXPORT_SYMBOL(ide_lock); - static int ide_bus_match(struct device *dev, struct device_driver *drv) { return 1; @@ -1087,32 +1034,7 @@ out_port_class: return ret; } -#ifdef MODULE -static char *options = NULL; -module_param(options, charp, 0); -MODULE_LICENSE("GPL"); - -static void __init parse_options (char *line) -{ - char *next = line; - - if (line == NULL || !*line) - return; - while ((line = next) != NULL) { - if ((next = strchr(line,' ')) != NULL) - *next++ = 0; - if (!ide_setup(line)) - printk (KERN_INFO "Unknown option '%s'\n", line); - } -} - -int __init init_module (void) -{ - parse_options(options); - return ide_init(); -} - -void __exit cleanup_module (void) +static void __exit ide_exit(void) { proc_ide_destroy(); @@ -1121,10 +1043,7 @@ void __exit cleanup_module (void) 
bus_unregister(&ide_bus_type); } -#else /* !MODULE */ - -__setup("", ide_setup); - module_init(ide_init); +module_exit(ide_exit); -#endif /* MODULE */ +MODULE_LICENSE("GPL"); diff --git a/drivers/ide/legacy/gayle.c b/drivers/ide/legacy/gayle.c index fed7d812761c..b78941680c32 100644 --- a/drivers/ide/legacy/gayle.c +++ b/drivers/ide/legacy/gayle.c @@ -64,9 +64,7 @@ #define GAYLE_HAS_CONTROL_REG (!ide_doubler) #define GAYLE_IDEREG_SIZE (ide_doubler ? 0x1000 : 0x2000) -int ide_doubler = 0; /* support IDE doublers? */ -EXPORT_SYMBOL_GPL(ide_doubler); - +static int ide_doubler; module_param_named(doubler, ide_doubler, bool, 0); MODULE_PARM_DESC(doubler, "enable support for IDE doublers"); #endif /* CONFIG_BLK_DEV_IDEDOUBLER */ diff --git a/include/linux/ide.h b/include/linux/ide.h index dad535659249..0fa1812d0438 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -813,10 +813,6 @@ int generic_ide_ioctl(ide_drive_t *, struct file *, struct block_device *, unsig #ifndef _IDE_C extern ide_hwif_t ide_hwifs[]; /* master data repository */ #endif -extern int ide_noacpi; -extern int ide_acpigtf; -extern int ide_acpionboot; -extern int noautodma; extern int ide_vlb_clk; extern int ide_pci_clk; -- cgit v1.2.3 From 9a410e79b552bacb4481f85618aa7333b7776ed7 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 15 Jul 2008 21:21:48 +0200 Subject: ide: remove IDE_TFLAG_NO_SELECT_MASK taskfile flag Always call SELECT_MASK(..., 0) in ide_tf_load() (needs to be done to match ide_set_irq(..., 1)) and then remove IDE_TFLAG_NO_SELECT_MASK taskfile flag. This change should only affect hpt366 and icside host drivers since ->maskproc(..., 0) for sgiioc4 is equivalent to ide_set_irq(..., 1). Cc: Sergei Shtylyov Cc: Russell King Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-cd.c | 4 ++-- drivers/ide/ide-disk.c | 3 +-- drivers/ide/ide-floppy.c | 3 +-- drivers/ide/ide-iops.c | 4 +--- drivers/ide/ide-tape.c | 3 +-- drivers/scsi/ide-scsi.c | 2 +- include/linux/ide.h | 1 - 7 files changed, 7 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index ac542ffffa49..0fbc2d8d0d53 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -530,8 +530,8 @@ static ide_startstop_t cdrom_start_packet_command(ide_drive_t *drive, info->dma = !hwif->dma_ops->dma_setup(drive); /* set up the controller registers */ - ide_pktcmd_tf_load(drive, IDE_TFLAG_OUT_NSECT | IDE_TFLAG_OUT_LBAL | - IDE_TFLAG_NO_SELECT_MASK, xferlen, info->dma); + ide_pktcmd_tf_load(drive, IDE_TFLAG_OUT_NSECT | IDE_TFLAG_OUT_LBAL, + xferlen, info->dma); if (info->cd_flags & IDE_CD_FLAG_DRQ_INTERRUPT) { /* waiting for CDB interrupt, not DMA yet. */ diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index c5f22ef8ed24..5f49a4ae9dd8 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -198,8 +198,7 @@ static ide_startstop_t __ide_do_rw_disk(ide_drive_t *drive, struct request *rq, } memset(&task, 0, sizeof(task)); - task.tf_flags = IDE_TFLAG_NO_SELECT_MASK; /* FIXME? 
*/ - task.tf_flags |= (IDE_TFLAG_TF | IDE_TFLAG_DEVICE); + task.tf_flags = IDE_TFLAG_TF | IDE_TFLAG_DEVICE; if (drive->select.b.lba) { if (lba48) { diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index 9161cd92a842..1852008d9ee4 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -667,8 +667,7 @@ static ide_startstop_t idefloppy_issue_pc(ide_drive_t *drive, if ((pc->flags & PC_FLAG_DMA_RECOMMENDED) && drive->using_dma) dma = !hwif->dma_ops->dma_setup(drive); - ide_pktcmd_tf_load(drive, IDE_TFLAG_NO_SELECT_MASK | - IDE_TFLAG_OUT_DEVICE, bcount, dma); + ide_pktcmd_tf_load(drive, IDE_TFLAG_OUT_DEVICE, bcount, dma); if (dma) { /* Begin DMA, if necessary */ diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index 2de4c8f581eb..9f9916fe6c2f 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -121,9 +121,7 @@ static void ide_tf_load(ide_drive_t *drive, ide_task_t *task) HIHI = 0xFF; ide_set_irq(drive, 1); - - if ((task->tf_flags & IDE_TFLAG_NO_SELECT_MASK) == 0) - SELECT_MASK(drive, 0); + SELECT_MASK(drive, 0); if (task->tf_flags & IDE_TFLAG_OUT_DATA) { u16 data = (tf->hob_data << 8) | tf->data; diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index a5f0b774527b..cc7991c7c252 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -1046,8 +1046,7 @@ static ide_startstop_t idetape_issue_pc(ide_drive_t *drive, if ((pc->flags & PC_FLAG_DMA_RECOMMENDED) && drive->using_dma) dma_ok = !hwif->dma_ops->dma_setup(drive); - ide_pktcmd_tf_load(drive, IDE_TFLAG_NO_SELECT_MASK | - IDE_TFLAG_OUT_DEVICE, bcount, dma_ok); + ide_pktcmd_tf_load(drive, IDE_TFLAG_OUT_DEVICE, bcount, dma_ok); if (dma_ok) /* Will begin DMA later */ diff --git a/drivers/scsi/ide-scsi.c b/drivers/scsi/ide-scsi.c index da261806d62a..3222aa589dbf 100644 --- a/drivers/scsi/ide-scsi.c +++ b/drivers/scsi/ide-scsi.c @@ -564,7 +564,7 @@ static ide_startstop_t idescsi_issue_pc(ide_drive_t *drive, hwif->sg_mapped = 0; } - ide_pktcmd_tf_load(drive, IDE_TFLAG_NO_SELECT_MASK, bcount, dma); + ide_pktcmd_tf_load(drive, 0, bcount, dma); if (dma) pc->flags |= PC_FLAG_DMA_OK; diff --git a/include/linux/ide.h b/include/linux/ide.h index 0fa1812d0438..d4a910cdb907 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -869,7 +869,6 @@ extern void ide_end_drive_cmd(ide_drive_t *, u8, u8); enum { IDE_TFLAG_LBA48 = (1 << 0), - IDE_TFLAG_NO_SELECT_MASK = (1 << 1), IDE_TFLAG_FLAGGED = (1 << 2), IDE_TFLAG_OUT_DATA = (1 << 3), IDE_TFLAG_OUT_HOB_FEATURE = (1 << 4), -- cgit v1.2.3 From ed4af48fd660176680da905817f6e40d51436e4c Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 15 Jul 2008 21:21:48 +0200 Subject: ide: move IRQ unmasking out from ->tf_load method Move IRQ unmasking out from ->tf_load method to its users. There should be no functional changes caused by this patch (SELECT_MASK() is NOP except for hpt366, icside and sgiioc4). 
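After this change, every ->tf_load call site performs the unmasking itself, following the pattern visible in the hunks below (sketch):

ide_set_irq(drive, 1);		/* unmask the drive's IRQ */
SELECT_MASK(drive, 0);		/* clear host-specific masking, if any */
hwif->tf_load(drive, task);	/* then load the taskfile registers */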
Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/h8300/ide-h8300.c | 2 -- drivers/ide/ide-io.c | 2 ++ drivers/ide/ide-iops.c | 5 +---- drivers/ide/ide-taskfile.c | 2 ++ drivers/ide/pci/scc_pata.c | 2 -- include/linux/ide.h | 1 + 6 files changed, 6 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/h8300/ide-h8300.c b/drivers/ide/h8300/ide-h8300.c index ecf53bb0d2aa..d5afc28eaae7 100644 --- a/drivers/ide/h8300/ide-h8300.c +++ b/drivers/ide/h8300/ide-h8300.c @@ -52,8 +52,6 @@ static void h8300_tf_load(ide_drive_t *drive, ide_task_t *task) if (task->tf_flags & IDE_TFLAG_FLAGGED) HIHI = 0xFF; - ide_set_irq(drive, 1); - if (task->tf_flags & IDE_TFLAG_OUT_DATA) mm_outw((tf->hob_data << 8) | tf->data, io_ports->data_addr); diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index c22a337ced4e..2083cc08b2ce 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -1579,6 +1579,8 @@ void ide_pktcmd_tf_load(ide_drive_t *drive, u32 tf_flags, u16 bcount, u8 dma) task.tf.lbah = (bcount >> 8) & 0xff; ide_tf_dump(drive->name, &task.tf); + ide_set_irq(drive, 1); + SELECT_MASK(drive, 0); drive->hwif->tf_load(drive, &task); } diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index 9f9916fe6c2f..491980aab866 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -95,7 +95,7 @@ void SELECT_DRIVE (ide_drive_t *drive) hwif->OUTB(drive->select.all, hwif->io_ports.device_addr); } -static void SELECT_MASK(ide_drive_t *drive, int mask) +void SELECT_MASK(ide_drive_t *drive, int mask) { const struct ide_port_ops *port_ops = drive->hwif->port_ops; @@ -120,9 +120,6 @@ static void ide_tf_load(ide_drive_t *drive, ide_task_t *task) if (task->tf_flags & IDE_TFLAG_FLAGGED) HIHI = 0xFF; - ide_set_irq(drive, 1); - SELECT_MASK(drive, 0); - if (task->tf_flags & IDE_TFLAG_OUT_DATA) { u16 data = (tf->hob_data << 8) | tf->data; diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index b6a1c4b51129..6a17ab54f801 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -109,6 +109,8 @@ ide_startstop_t do_rw_taskfile (ide_drive_t *drive, ide_task_t *task) if ((task->tf_flags & IDE_TFLAG_DMA_PIO_FALLBACK) == 0) { ide_tf_dump(drive->name, tf); + ide_set_irq(drive, 1); + SELECT_MASK(drive, 0); hwif->tf_load(drive, task); } diff --git a/drivers/ide/pci/scc_pata.c b/drivers/ide/pci/scc_pata.c index 910fb00deb71..37e8cfcabb4b 100644 --- a/drivers/ide/pci/scc_pata.c +++ b/drivers/ide/pci/scc_pata.c @@ -662,8 +662,6 @@ static void scc_tf_load(ide_drive_t *drive, ide_task_t *task) if (task->tf_flags & IDE_TFLAG_FLAGGED) HIHI = 0xFF; - ide_set_irq(drive, 1); - if (task->tf_flags & IDE_TFLAG_OUT_DATA) out_be32((void *)io_ports->data_addr, (tf->hob_data << 8) | tf->data); diff --git a/include/linux/ide.h b/include/linux/ide.h index d4a910cdb907..56d0bc2dffee 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -973,6 +973,7 @@ typedef struct ide_task_s { void ide_tf_dump(const char *, struct ide_taskfile *); extern void SELECT_DRIVE(ide_drive_t *); +void SELECT_MASK(ide_drive_t *, int); extern int drive_is_ready(ide_drive_t *); -- cgit v1.2.3 From 135721446144af005109c25eeacca4fdddcd9a66 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 15 Jul 2008 21:21:49 +0200 Subject: ide: remove ->mmio flag from ide_hwif_t Since scc_pata host driver no longer uses IDE PCI layer / ide_dma_setup() and all other ->mmio users set also IDE_HFLAG_MMIO host flag we can safely remove ->mmio flag. 
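The mechanical part of the conversion replaces the per-hwif bit with a host-flag test, as in the hunks below (sketch; use_mmio_accessors() is a placeholder for whatever the branch guards):

/* before: dedicated structure member */
if (hwif->mmio)
	use_mmio_accessors();

/* after: regular host flag, set once at port setup time */
if (hwif->host_flags & IDE_HFLAG_MMIO)
	use_mmio_accessors();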
There should be no functional changes caused by this patch. Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/arm/palm_bk3710.c | 1 - drivers/ide/ide-dma.c | 2 +- drivers/ide/pci/scc_pata.c | 1 - drivers/ide/pci/siimage.c | 25 +++++++++++++------------ drivers/ide/setup-pci.c | 4 ++-- include/linux/ide.h | 1 - 6 files changed, 16 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/arm/palm_bk3710.c b/drivers/ide/arm/palm_bk3710.c index 74a05dc6d1e6..3839f5722985 100644 --- a/drivers/ide/arm/palm_bk3710.c +++ b/drivers/ide/arm/palm_bk3710.c @@ -405,7 +405,6 @@ static int __devinit palm_bk3710_probe(struct platform_device *pdev) ide_init_port_data(hwif, i); ide_init_port_hw(hwif, &hw); - hwif->mmio = 1; default_hwif_mmiops(hwif); idx[0] = i; diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c index 174f4704614c..7ee44f86bc54 100644 --- a/drivers/ide/ide-dma.c +++ b/drivers/ide/ide-dma.c @@ -463,7 +463,7 @@ int ide_dma_setup(ide_drive_t *drive) } /* PRD table */ - if (hwif->mmio) + if (hwif->host_flags & IDE_HFLAG_MMIO) writel(hwif->dmatable_dma, (void __iomem *)(hwif->dma_base + ATA_DMA_TABLE_OFS)); else diff --git a/drivers/ide/pci/scc_pata.c b/drivers/ide/pci/scc_pata.c index 37e8cfcabb4b..133053c7a480 100644 --- a/drivers/ide/pci/scc_pata.c +++ b/drivers/ide/pci/scc_pata.c @@ -793,7 +793,6 @@ static void __devinit init_mmio_iops_scc(ide_hwif_t *hwif) hwif->dma_base = dma_base; hwif->config_data = ports->ctl; - hwif->mmio = 1; } /** diff --git a/drivers/ide/pci/siimage.c b/drivers/ide/pci/siimage.c index 0006b9e58567..b75e9bb390a7 100644 --- a/drivers/ide/pci/siimage.c +++ b/drivers/ide/pci/siimage.c @@ -94,7 +94,7 @@ static unsigned long siimage_selreg(ide_hwif_t *hwif, int r) unsigned long base = (unsigned long)hwif->hwif_data; base += 0xA0 + r; - if (hwif->mmio) + if (hwif->host_flags & IDE_HFLAG_MMIO) base += hwif->channel << 6; else base += hwif->channel << 4; @@ -117,7 +117,7 @@ static inline unsigned long siimage_seldev(ide_drive_t *drive, int r) unsigned long base = (unsigned long)hwif->hwif_data; base += 0xA0 + r; - if (hwif->mmio) + if (hwif->host_flags & IDE_HFLAG_MMIO) base += hwif->channel << 6; else base += hwif->channel << 4; @@ -190,7 +190,9 @@ static u8 sil_pata_udma_filter(ide_drive_t *drive) unsigned long base = (unsigned long)hwif->hwif_data; u8 scsc, mask = 0; - scsc = sil_ioread8(dev, base + (hwif->mmio ? 0x4A : 0x8A)); + base += (hwif->host_flags & IDE_HFLAG_MMIO) ? 0x4A : 0x8A; + + scsc = sil_ioread8(dev, base); switch (scsc & 0x30) { case 0x10: /* 133 */ @@ -238,8 +240,9 @@ static void sil_set_pio_mode(ide_drive_t *drive, u8 pio) unsigned long tfaddr = siimage_selreg(hwif, 0x02); unsigned long base = (unsigned long)hwif->hwif_data; u8 tf_pio = pio; - u8 addr_mask = hwif->channel ? (hwif->mmio ? 0xF4 : 0x84) - : (hwif->mmio ? 0xB4 : 0x80); + u8 mmio = (hwif->host_flags & IDE_HFLAG_MMIO) ? 1 : 0; + u8 addr_mask = hwif->channel ? (mmio ? 0xF4 : 0x84) + : (mmio ? 0xB4 : 0x80); u8 mode = 0; u8 unit = drive->select.b.unit; @@ -290,13 +293,13 @@ static void sil_set_dma_mode(ide_drive_t *drive, const u8 speed) u16 ultra = 0, multi = 0; u8 mode = 0, unit = drive->select.b.unit; unsigned long base = (unsigned long)hwif->hwif_data; - u8 scsc = 0, addr_mask = hwif->channel ? - (hwif->mmio ? 0xF4 : 0x84) : - (hwif->mmio ? 0xB4 : 0x80); + u8 mmio = (hwif->host_flags & IDE_HFLAG_MMIO) ? 1 : 0; + u8 scsc = 0, addr_mask = hwif->channel ? (mmio ? 0xF4 : 0x84) + : (mmio ? 
0xB4 : 0x80); unsigned long ma = siimage_seldev(drive, 0x08); unsigned long ua = siimage_seldev(drive, 0x0C); - scsc = sil_ioread8 (dev, base + (hwif->mmio ? 0x4A : 0x8A)); + scsc = sil_ioread8 (dev, base + (mmio ? 0x4A : 0x8A)); mode = sil_ioread8 (dev, base + addr_mask); multi = sil_ioread16(dev, ma); ultra = sil_ioread16(dev, ua); @@ -391,7 +394,7 @@ static int siimage_mmio_dma_test_irq(ide_drive_t *drive) static int siimage_dma_test_irq(ide_drive_t *drive) { - if (drive->hwif->mmio) + if (drive->hwif->host_flags & IDE_HFLAG_MMIO) return siimage_mmio_dma_test_irq(drive); else return siimage_io_dma_test_irq(drive); @@ -640,8 +643,6 @@ static void __devinit init_mmio_iops_siimage(ide_hwif_t *hwif) hwif->irq = dev->irq; hwif->dma_base = (unsigned long)addr + (ch ? 0x08 : 0x00); - - hwif->mmio = 1; } static int is_dev_seagate_sata(ide_drive_t *drive) diff --git a/drivers/ide/setup-pci.c b/drivers/ide/setup-pci.c index 5171601fb255..abcfb1739d4d 100644 --- a/drivers/ide/setup-pci.c +++ b/drivers/ide/setup-pci.c @@ -87,7 +87,7 @@ unsigned long ide_pci_dma_base(ide_hwif_t *hwif, const struct ide_port_info *d) unsigned long dma_base = 0; u8 dma_stat = 0; - if (hwif->mmio) + if (hwif->host_flags & IDE_HFLAG_MMIO) return hwif->dma_base; if (hwif->mate && hwif->mate->dma_base) { @@ -374,7 +374,7 @@ int ide_hwif_setup_dma(ide_hwif_t *hwif, const struct ide_port_info *d) if (base == 0 || ide_pci_set_master(dev, d->name) < 0) return -1; - if (hwif->mmio) + if (hwif->host_flags & IDE_HFLAG_MMIO) printk(KERN_INFO " %s: MMIO-DMA\n", hwif->name); else printk(KERN_INFO " %s: BM-DMA at 0x%04lx-0x%04lx\n", diff --git a/include/linux/ide.h b/include/linux/ide.h index 56d0bc2dffee..b01b102be4de 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -532,7 +532,6 @@ typedef struct hwif_s { unsigned serialized : 1; /* serialized all channel operation */ unsigned sharing_irq: 1; /* 1 = sharing irq with another hwif */ unsigned sg_mapped : 1; /* sg_table and sg_nents are ready */ - unsigned mmio : 1; /* host uses MMIO */ struct device gendev; struct device *portdev; -- cgit v1.2.3 From f8c4bd0ab2b8783c0f080957781e9f70bee48eaa Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 15 Jul 2008 21:21:49 +0200 Subject: ide: pass 'hwif *' instead of 'drive *' to ->OUTBSYNC method There should be no functional changes caused by this patch. 
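[ Illustration -- not part of the patch. Handing the hook the hwif directly matches what every implementation actually needs: the old prototype received a drive only so the body could chase drive->hwif back out of it, and an hwif-taking hook can also be called from code that has no drive at hand. A rough standalone sketch of the signature change, with illustrative types only:

    #include <stdio.h>

    struct hwif {
            unsigned long ctl_port;
            void (*outbsync)(struct hwif *, unsigned char, unsigned long);
    };

    struct drive {
            struct hwif *hwif;
    };

    static void outbsync_impl(struct hwif *h, unsigned char val,
                              unsigned long port)
    {
            (void)h;        /* real implementations may touch hwif state */
            printf("outb 0x%02x -> port 0x%lx\n", val, port);
    }

    int main(void)
    {
            struct hwif h = { .ctl_port = 0x3f6, .outbsync = outbsync_impl };
            struct drive d = { .hwif = &h };

            /* before: the hook took &d and immediately fetched d.hwif */
            d.hwif->outbsync(d.hwif, 0x08, h.ctl_port);
            return 0;
    }
]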
Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-io.c | 2 +- drivers/ide/ide-iops.c | 18 +++++++++--------- drivers/ide/ide-probe.c | 6 +++--- drivers/ide/ide-taskfile.c | 2 +- drivers/ide/pci/scc_pata.c | 5 +---- drivers/ide/ppc/pmac.c | 6 +++--- drivers/scsi/ide-scsi.c | 2 +- include/linux/ide.h | 2 +- 8 files changed, 20 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 2083cc08b2ce..c28fcdf0ee9e 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -437,7 +437,7 @@ static ide_startstop_t ide_atapi_error(ide_drive_t *drive, struct request *rq, u if (ide_read_status(drive) & (BUSY_STAT | DRQ_STAT)) /* force an abort */ - hwif->OUTBSYNC(drive, WIN_IDLEIMMEDIATE, + hwif->OUTBSYNC(hwif, WIN_IDLEIMMEDIATE, hwif->io_ports.command_addr); if (rq->errors >= ERROR_MAX) { diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index 491980aab866..4c32cf0b623c 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -42,7 +42,7 @@ static void ide_outb (u8 val, unsigned long port) outb(val, port); } -static void ide_outbsync (ide_drive_t *drive, u8 addr, unsigned long port) +static void ide_outbsync(ide_hwif_t *hwif, u8 addr, unsigned long port) { outb(addr, port); } @@ -68,7 +68,7 @@ static void ide_mm_outb (u8 value, unsigned long port) writeb(value, (void __iomem *) port); } -static void ide_mm_outbsync (ide_drive_t *drive, u8 value, unsigned long port) +static void ide_mm_outbsync(ide_hwif_t *hwif, u8 value, unsigned long port) { writeb(value, (void __iomem *) port); } @@ -686,7 +686,7 @@ int ide_driveid_update(ide_drive_t *drive) SELECT_MASK(drive, 1); ide_set_irq(drive, 0); msleep(50); - hwif->OUTBSYNC(drive, WIN_IDENTIFY, hwif->io_ports.command_addr); + hwif->OUTBSYNC(hwif, WIN_IDENTIFY, hwif->io_ports.command_addr); timeout = jiffies + WAIT_WORSTCASE; do { if (time_after(jiffies, timeout)) { @@ -773,7 +773,7 @@ int ide_config_drive_speed(ide_drive_t *drive, u8 speed) ide_set_irq(drive, 0); hwif->OUTB(speed, io_ports->nsect_addr); hwif->OUTB(SETFEATURES_XFER, io_ports->feature_addr); - hwif->OUTBSYNC(drive, WIN_SETFEATURES, io_ports->command_addr); + hwif->OUTBSYNC(hwif, WIN_SETFEATURES, io_ports->command_addr); if (drive->quirk_list == 2) ide_set_irq(drive, 1); @@ -881,7 +881,7 @@ void ide_execute_command(ide_drive_t *drive, u8 cmd, ide_handler_t *handler, spin_lock_irqsave(&ide_lock, flags); __ide_set_handler(drive, handler, timeout, expiry); - hwif->OUTBSYNC(drive, cmd, hwif->io_ports.command_addr); + hwif->OUTBSYNC(hwif, cmd, hwif->io_ports.command_addr); /* * Drive takes 400nS to respond, we must avoid the IRQ being * serviced before that. @@ -899,7 +899,7 @@ void ide_execute_pkt_cmd(ide_drive_t *drive) unsigned long flags; spin_lock_irqsave(&ide_lock, flags); - hwif->OUTBSYNC(drive, WIN_PACKETCMD, hwif->io_ports.command_addr); + hwif->OUTBSYNC(hwif, WIN_PACKETCMD, hwif->io_ports.command_addr); ndelay(400); spin_unlock_irqrestore(&ide_lock, flags); } @@ -1094,7 +1094,7 @@ static ide_startstop_t do_reset1 (ide_drive_t *drive, int do_not_try_atapi) pre_reset(drive); SELECT_DRIVE(drive); udelay (20); - hwif->OUTBSYNC(drive, WIN_SRST, io_ports->command_addr); + hwif->OUTBSYNC(hwif, WIN_SRST, io_ports->command_addr); ndelay(400); hwgroup->poll_timeout = jiffies + WAIT_WORSTCASE; hwgroup->polling = 1; @@ -1125,14 +1125,14 @@ static ide_startstop_t do_reset1 (ide_drive_t *drive, int do_not_try_atapi) * recover from reset very quickly, saving us the first 50ms wait time. 
*/ /* set SRST and nIEN */ - hwif->OUTBSYNC(drive, drive->ctl|6, io_ports->ctl_addr); + hwif->OUTBSYNC(hwif, drive->ctl | 6, io_ports->ctl_addr); /* more than enough time */ udelay(10); if (drive->quirk_list == 2) ctl = drive->ctl; /* clear SRST and nIEN */ else ctl = drive->ctl | 2; /* clear SRST, leave nIEN */ - hwif->OUTBSYNC(drive, ctl, io_ports->ctl_addr); + hwif->OUTBSYNC(hwif, ctl, io_ports->ctl_addr); /* more than enough time */ udelay(10); hwgroup->poll_timeout = jiffies + WAIT_WORSTCASE; diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 12513c45d700..b010633eb5ba 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -293,7 +293,7 @@ static int actual_try_to_identify (ide_drive_t *drive, u8 cmd) hwif->OUTB(0, io_ports->feature_addr); /* ask drive for ID */ - hwif->OUTBSYNC(drive, cmd, io_ports->command_addr); + hwif->OUTBSYNC(hwif, cmd, hwif->io_ports.command_addr); timeout = ((cmd == WIN_IDENTIFY) ? WAIT_WORSTCASE : WAIT_PIDENTIFY) / 2; timeout += jiffies; @@ -480,7 +480,7 @@ static int do_probe (ide_drive_t *drive, u8 cmd) msleep(50); SELECT_DRIVE(drive); msleep(50); - hwif->OUTBSYNC(drive, WIN_SRST, io_ports->command_addr); + hwif->OUTBSYNC(hwif, WIN_SRST, io_ports->command_addr); (void)ide_busy_sleep(hwif); rc = try_to_identify(drive, cmd); } @@ -516,7 +516,7 @@ static void enable_nest (ide_drive_t *drive) printk("%s: enabling %s -- ", hwif->name, drive->id->model); SELECT_DRIVE(drive); msleep(50); - hwif->OUTBSYNC(drive, EXABYTE_ENABLE_NEST, hwif->io_ports.command_addr); + hwif->OUTBSYNC(hwif, EXABYTE_ENABLE_NEST, hwif->io_ports.command_addr); if (ide_busy_sleep(hwif)) { printk(KERN_CONT "failed (timeout)\n"); diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index 6a17ab54f801..cf55a48a7dd2 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -117,7 +117,7 @@ ide_startstop_t do_rw_taskfile (ide_drive_t *drive, ide_task_t *task) switch (task->data_phase) { case TASKFILE_MULTI_OUT: case TASKFILE_OUT: - hwif->OUTBSYNC(drive, tf->command, hwif->io_ports.command_addr); + hwif->OUTBSYNC(hwif, tf->command, hwif->io_ports.command_addr); ndelay(400); /* FIXME */ return pre_task_out_intr(drive, task->rq); case TASKFILE_MULTI_IN: diff --git a/drivers/ide/pci/scc_pata.c b/drivers/ide/pci/scc_pata.c index 133053c7a480..32eb0877fce2 100644 --- a/drivers/ide/pci/scc_pata.c +++ b/drivers/ide/pci/scc_pata.c @@ -148,11 +148,8 @@ static void scc_ide_outb(u8 addr, unsigned long port) out_be32((void*)port, addr); } -static void -scc_ide_outbsync(ide_drive_t * drive, u8 addr, unsigned long port) +static void scc_ide_outbsync(ide_hwif_t *hwif, u8 addr, unsigned long port) { - ide_hwif_t *hwif = HWIF(drive); - out_be32((void*)port, addr); eieio(); in_be32((void*)(hwif->dma_base + 0x01c)); diff --git a/drivers/ide/ppc/pmac.c b/drivers/ide/ppc/pmac.c index ba2d58727964..dcb2c466bb97 100644 --- a/drivers/ide/ppc/pmac.c +++ b/drivers/ide/ppc/pmac.c @@ -480,13 +480,13 @@ pmac_ide_do_update_timings(ide_drive_t *drive) pmac_ide_selectproc(drive); } -static void -pmac_outbsync(ide_drive_t *drive, u8 value, unsigned long port) +static void pmac_outbsync(ide_hwif_t *hwif, u8 value, unsigned long port) { u32 tmp; writeb(value, (void __iomem *) port); - tmp = readl(PMAC_IDE_REG(IDE_TIMING_CONFIG)); + tmp = readl((void __iomem *)(hwif->io_ports.data_addr + + IDE_TIMING_CONFIG)); } /* diff --git a/drivers/scsi/ide-scsi.c b/drivers/scsi/ide-scsi.c index 3222aa589dbf..d7fd5e550a25 100644 --- a/drivers/scsi/ide-scsi.c +++ 
b/drivers/scsi/ide-scsi.c @@ -257,7 +257,7 @@ idescsi_atapi_error(ide_drive_t *drive, struct request *rq, u8 stat, u8 err) if (ide_read_status(drive) & (BUSY_STAT | DRQ_STAT)) /* force an abort */ - hwif->OUTBSYNC(drive, WIN_IDLEIMMEDIATE, + hwif->OUTBSYNC(hwif, WIN_IDLEIMMEDIATE, hwif->io_ports.command_addr); rq->errors++; diff --git a/include/linux/ide.h b/include/linux/ide.h index b01b102be4de..1c3431469649 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -493,7 +493,7 @@ typedef struct hwif_s { void (*ide_dma_clear_irq)(ide_drive_t *drive); void (*OUTB)(u8 addr, unsigned long port); - void (*OUTBSYNC)(ide_drive_t *drive, u8 addr, unsigned long port); + void (*OUTBSYNC)(struct hwif_s *hwif, u8 addr, unsigned long port); u8 (*INB)(unsigned long port); -- cgit v1.2.3 From 0fd04dcc2ebb6ec9088c24b368b0ce1f42a98ef5 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 15 Jul 2008 21:21:50 +0200 Subject: ide: use ->OUTBSYNC in ide_set_irq() Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/ide.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ide.h b/include/linux/ide.h index 1c3431469649..4d1c9714f1d9 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1340,7 +1340,8 @@ static inline void ide_set_irq(ide_drive_t *drive, int on) { ide_hwif_t *hwif = drive->hwif; - hwif->OUTB(drive->ctl | (on ? 0 : 2), hwif->io_ports.ctl_addr); + hwif->OUTBSYNC(hwif, drive->ctl | (on ? 0 : 2), + hwif->io_ports.ctl_addr); } static inline u8 ide_read_status(ide_drive_t *drive) -- cgit v1.2.3 From ff07488346702f554aaeb6aae982540aa0302373 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 15 Jul 2008 21:21:50 +0200 Subject: ide: remove drive->ctl Remove drive->ctl (it is always equal to 0x08 after init time). While at it: * Use ATA_DEVCTL_OBS define. There should be no functional changes caused by this patch. 
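[ Illustration -- not part of the patch. The constants involved are bits of the ATA Device Control register: bit 3 is the obsolete always-set bit (hence ATA_DEVCTL_OBS == 0x08, the value drive->ctl always held), bit 1 is nIEN (interrupt mask), bit 2 is SRST (soft reset), and bit 7 is HOB (select the high-order LBA48 bytes on reads). A small self-checking sketch; the bit names follow the ATA spec and the composite values are the ones this patch writes:

    #include <stdio.h>

    enum {
            DEVCTL_NIEN = 1 << 1,   /* 1 = mask the drive interrupt */
            DEVCTL_SRST = 1 << 2,   /* software reset */
            DEVCTL_OBS  = 1 << 3,   /* obsolete "always 1" bit */
            DEVCTL_HOB  = 1 << 7,   /* read high-order taskfile bytes */
    };

    int main(void)
    {
            printf("enable irq:  0x%02x\n", DEVCTL_OBS);                /* 0x08 */
            printf("mask irq:    0x%02x\n", DEVCTL_OBS | DEVCTL_NIEN);  /* 0x0a */
            printf("srst + nien: 0x%02x\n",
                   DEVCTL_OBS | DEVCTL_SRST | DEVCTL_NIEN);             /* 0x0e */
            printf("hob select:  0x%02x\n", DEVCTL_OBS | DEVCTL_HOB);   /* 0x88 */
            return 0;
    }
]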
Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/h8300/ide-h8300.c | 4 ++-- drivers/ide/ide-iops.c | 10 +++++----- drivers/ide/ide-probe.c | 2 +- drivers/ide/ide.c | 1 - drivers/ide/pci/hpt366.c | 3 +-- drivers/ide/pci/ns87415.c | 4 ++-- drivers/ide/pci/scc_pata.c | 4 ++-- drivers/ide/pci/sgiioc4.c | 2 +- include/linux/ide.h | 3 +-- 9 files changed, 15 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/h8300/ide-h8300.c b/drivers/ide/h8300/ide-h8300.c index d5afc28eaae7..ae37ee58bae2 100644 --- a/drivers/ide/h8300/ide-h8300.c +++ b/drivers/ide/h8300/ide-h8300.c @@ -96,7 +96,7 @@ static void h8300_tf_read(ide_drive_t *drive, ide_task_t *task) } /* be sure we're looking at the low order bits */ - outb(drive->ctl & ~0x80, io_ports->ctl_addr); + outb(ATA_DEVCTL_OBS & ~0x80, io_ports->ctl_addr); if (task->tf_flags & IDE_TFLAG_IN_NSECT) tf->nsect = inb(io_ports->nsect_addr); @@ -110,7 +110,7 @@ static void h8300_tf_read(ide_drive_t *drive, ide_task_t *task) tf->device = inb(io_ports->device_addr); if (task->tf_flags & IDE_TFLAG_LBA48) { - outb(drive->ctl | 0x80, io_ports->ctl_addr); + outb(ATA_DEVCTL_OBS | 0x80, io_ports->ctl_addr); if (task->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) tf->hob_feature = inb(io_ports->feature_addr); diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index 4c32cf0b623c..80ad4f234f3f 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -186,7 +186,7 @@ static void ide_tf_read(ide_drive_t *drive, ide_task_t *task) } /* be sure we're looking at the low order bits */ - tf_outb(drive->ctl & ~0x80, io_ports->ctl_addr); + tf_outb(ATA_DEVCTL_OBS & ~0x80, io_ports->ctl_addr); if (task->tf_flags & IDE_TFLAG_IN_NSECT) tf->nsect = tf_inb(io_ports->nsect_addr); @@ -200,7 +200,7 @@ static void ide_tf_read(ide_drive_t *drive, ide_task_t *task) tf->device = tf_inb(io_ports->device_addr); if (task->tf_flags & IDE_TFLAG_LBA48) { - tf_outb(drive->ctl | 0x80, io_ports->ctl_addr); + tf_outb(ATA_DEVCTL_OBS | 0x80, io_ports->ctl_addr); if (task->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) tf->hob_feature = tf_inb(io_ports->feature_addr); @@ -1125,13 +1125,13 @@ static ide_startstop_t do_reset1 (ide_drive_t *drive, int do_not_try_atapi) * recover from reset very quickly, saving us the first 50ms wait time. 
*/ /* set SRST and nIEN */ - hwif->OUTBSYNC(hwif, drive->ctl | 6, io_ports->ctl_addr); + hwif->OUTBSYNC(hwif, ATA_DEVCTL_OBS | 6, io_ports->ctl_addr); /* more than enough time */ udelay(10); if (drive->quirk_list == 2) - ctl = drive->ctl; /* clear SRST and nIEN */ + ctl = ATA_DEVCTL_OBS; /* clear SRST and nIEN */ else - ctl = drive->ctl | 2; /* clear SRST, leave nIEN */ + ctl = ATA_DEVCTL_OBS | 2; /* clear SRST, leave nIEN */ hwif->OUTBSYNC(hwif, ctl, io_ports->ctl_addr); /* more than enough time */ udelay(10); diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 809362b13c99..d21e51a02c3e 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -1065,7 +1065,7 @@ static int init_irq (ide_hwif_t *hwif) if (io_ports->ctl_addr) /* clear nIEN */ - hwif->OUTBSYNC(hwif, 0x08, io_ports->ctl_addr); + hwif->OUTBSYNC(hwif, ATA_DEVCTL_OBS, io_ports->ctl_addr); if (request_irq(hwif->irq,&ide_intr,sa,hwif->name,hwgroup)) goto out_unlink; diff --git a/drivers/ide/ide.c b/drivers/ide/ide.c index 1defba3eefe7..2b8453510e09 100644 --- a/drivers/ide/ide.c +++ b/drivers/ide/ide.c @@ -136,7 +136,6 @@ static void ide_port_init_devices_data(ide_hwif_t *hwif) drive->media = ide_disk; drive->select.all = (unit<<4)|0xa0; drive->hwif = hwif; - drive->ctl = 0x08; drive->ready_stat = READY_STAT; drive->bad_wstat = BAD_W_STAT; drive->special.b.recalibrate = 1; diff --git a/drivers/ide/pci/hpt366.c b/drivers/ide/pci/hpt366.c index c929dadaaaff..397c6cbe953c 100644 --- a/drivers/ide/pci/hpt366.c +++ b/drivers/ide/pci/hpt366.c @@ -759,8 +759,7 @@ static void hpt3xx_maskproc(ide_drive_t *drive, int mask) enable_irq (hwif->irq); } } else - outb(mask ? (drive->ctl | 2) : (drive->ctl & ~2), - hwif->io_ports.ctl_addr); + outb(ATA_DEVCTL_OBS | (mask ? 2 : 0), hwif->io_ports.ctl_addr); } /* diff --git a/drivers/ide/pci/ns87415.c b/drivers/ide/pci/ns87415.c index a7a41bb82778..45ba71a7182f 100644 --- a/drivers/ide/pci/ns87415.c +++ b/drivers/ide/pci/ns87415.c @@ -76,7 +76,7 @@ static void superio_tf_read(ide_drive_t *drive, ide_task_t *task) } /* be sure we're looking at the low order bits */ - outb(drive->ctl & ~0x80, io_ports->ctl_addr); + outb(ATA_DEVCTL_OBS & ~0x80, io_ports->ctl_addr); if (task->tf_flags & IDE_TFLAG_IN_NSECT) tf->nsect = inb(io_ports->nsect_addr); @@ -90,7 +90,7 @@ static void superio_tf_read(ide_drive_t *drive, ide_task_t *task) tf->device = superio_ide_inb(io_ports->device_addr); if (task->tf_flags & IDE_TFLAG_LBA48) { - outb(drive->ctl | 0x80, io_ports->ctl_addr); + outb(ATA_DEVCTL_OBS | 0x80, io_ports->ctl_addr); if (task->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) tf->hob_feature = inb(io_ports->feature_addr); diff --git a/drivers/ide/pci/scc_pata.c b/drivers/ide/pci/scc_pata.c index 32eb0877fce2..1584ebb6a185 100644 --- a/drivers/ide/pci/scc_pata.c +++ b/drivers/ide/pci/scc_pata.c @@ -703,7 +703,7 @@ static void scc_tf_read(ide_drive_t *drive, ide_task_t *task) } /* be sure we're looking at the low order bits */ - scc_ide_outb(drive->ctl & ~0x80, io_ports->ctl_addr); + scc_ide_outb(ATA_DEVCTL_OBS & ~0x80, io_ports->ctl_addr); if (task->tf_flags & IDE_TFLAG_IN_NSECT) tf->nsect = scc_ide_inb(io_ports->nsect_addr); @@ -717,7 +717,7 @@ static void scc_tf_read(ide_drive_t *drive, ide_task_t *task) tf->device = scc_ide_inb(io_ports->device_addr); if (task->tf_flags & IDE_TFLAG_LBA48) { - scc_ide_outb(drive->ctl | 0x80, io_ports->ctl_addr); + scc_ide_outb(ATA_DEVCTL_OBS | 0x80, io_ports->ctl_addr); if (task->tf_flags & IDE_TFLAG_IN_HOB_FEATURE) tf->hob_feature = 
scc_ide_inb(io_ports->feature_addr); diff --git a/drivers/ide/pci/sgiioc4.c b/drivers/ide/pci/sgiioc4.c index c1b667c53f2b..24513e3dcd6b 100644 --- a/drivers/ide/pci/sgiioc4.c +++ b/drivers/ide/pci/sgiioc4.c @@ -111,7 +111,7 @@ sgiioc4_init_hwif_ports(hw_regs_t * hw, unsigned long data_port, static void sgiioc4_maskproc(ide_drive_t * drive, int mask) { - writeb(mask ? (drive->ctl | 2) : (drive->ctl & ~2), + writeb(ATA_DEVCTL_OBS | (mask ? 2 : 0), (void __iomem *)drive->hwif->io_ports.ctl_addr); } diff --git a/include/linux/ide.h b/include/linux/ide.h index 4d1c9714f1d9..d8c86f0362c4 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -364,7 +364,6 @@ typedef struct ide_drive_s { u8 wcache; /* status of write cache */ u8 acoustic; /* acoustic management */ u8 media; /* disk, cdrom, tape, floppy, ... */ - u8 ctl; /* "normal" value for Control register */ u8 ready_stat; /* min status value for drive ready */ u8 mult_count; /* current multiple sector setting */ u8 mult_req; /* requested multiple sector setting */ @@ -1340,7 +1339,7 @@ static inline void ide_set_irq(ide_drive_t *drive, int on) { ide_hwif_t *hwif = drive->hwif; - hwif->OUTBSYNC(hwif, drive->ctl | (on ? 0 : 2), + hwif->OUTBSYNC(hwif, ATA_DEVCTL_OBS | (on ? 0 : 2), hwif->io_ports.ctl_addr); } -- cgit v1.2.3 From 63f5abb0959337db0d5bece9cefba03cdcadec51 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 15 Jul 2008 21:21:51 +0200 Subject: ide: remove action argument in ide_do_drive_cmd ide_do_drive_cmd is called only with ide_preempt action argument. So we can remove the action argument in ide_do_drive_cmd and ide_action_t typedef. This patch also includes two minor cleanups: 1) ide_do_drive_cmd always succeeds so we don't need the return value; 2) the callers use blk_rq_init before ide_do_drive_cmd so there is no need to initialize rq->errors. Signed-off-by: FUJITA Tomonori Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-cd.c | 2 +- drivers/ide/ide-floppy.c | 2 +- drivers/ide/ide-io.c | 40 +++++++++------------------------------- drivers/ide/ide-tape.c | 2 +- drivers/scsi/ide-scsi.c | 3 ++- include/linux/ide.h | 12 +----------- 6 files changed, 15 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 0fbc2d8d0d53..043129c422fe 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -213,7 +213,7 @@ static void cdrom_queue_request_sense(ide_drive_t *drive, void *sense, /* NOTE! 
Save the failed command in "rq->buffer" */ rq->buffer = (void *) failed_command; - (void) ide_do_drive_cmd(drive, rq, ide_preempt); + ide_do_drive_cmd(drive, rq); } static void cdrom_end_request(ide_drive_t *drive, int uptodate) diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index 1852008d9ee4..53209a473937 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -291,7 +291,7 @@ static void idefloppy_queue_pc_head(ide_drive_t *drive, struct ide_atapi_pc *pc, rq->cmd_type = REQ_TYPE_SPECIAL; rq->cmd_flags |= REQ_PREEMPT; rq->rq_disk = floppy->disk; - (void) ide_do_drive_cmd(drive, rq, ide_preempt); + ide_do_drive_cmd(drive, rq); } static struct ide_atapi_pc *idefloppy_next_pc_storage(ide_drive_t *drive) diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index c28fcdf0ee9e..28057747c1f8 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -1520,49 +1520,27 @@ irqreturn_t ide_intr (int irq, void *dev_id) * ide_do_drive_cmd - issue IDE special command * @drive: device to issue command * @rq: request to issue - * @action: action for processing * * This function issues a special IDE device request * onto the request queue. * - * If action is ide_wait, then the rq is queued at the end of the - * request queue, and the function sleeps until it has been processed. - * This is for use when invoked from an ioctl handler. - * - * If action is ide_preempt, then the rq is queued at the head of - * the request queue, displacing the currently-being-processed - * request and this function returns immediately without waiting - * for the new rq to be completed. This is VERY DANGEROUS, and is - * intended for careful use by the ATAPI tape/cdrom driver code. - * - * If action is ide_end, then the rq is queued at the end of the - * request queue, and the function returns immediately without waiting - * for the new rq to be completed. This is again intended for careful - * use by the ATAPI tape/cdrom driver code. + * the rq is queued at the head of the request queue, displacing + * the currently-being-processed request and this function + * returns immediately without waiting for the new rq to be + * completed. This is VERY DANGEROUS, and is intended for + * careful use by the ATAPI tape/cdrom driver code. 
*/ - -int ide_do_drive_cmd (ide_drive_t *drive, struct request *rq, ide_action_t action) + +void ide_do_drive_cmd(ide_drive_t *drive, struct request *rq) { unsigned long flags; ide_hwgroup_t *hwgroup = HWGROUP(drive); - int where = ELEVATOR_INSERT_BACK; - - rq->errors = 0; - - if (action == ide_preempt) - where = ELEVATOR_INSERT_FRONT; spin_lock_irqsave(&ide_lock, flags); - if (action == ide_preempt) - hwgroup->rq = NULL; - __elv_add_request(drive->queue, rq, where, 1); + hwgroup->rq = NULL; + __elv_add_request(drive->queue, rq, ELEVATOR_INSERT_FRONT, 1); __generic_unplug_device(drive->queue); - /* the queue is stopped so it won't be plugged+unplugged */ - if (blk_pm_resume_request(rq)) - do_ide_request(drive->queue); spin_unlock_irqrestore(&ide_lock, flags); - - return 0; } EXPORT_SYMBOL(ide_do_drive_cmd); diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index cc7991c7c252..a562df820777 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -691,7 +691,7 @@ static void idetape_queue_pc_head(ide_drive_t *drive, struct ide_atapi_pc *pc, rq->cmd_flags |= REQ_PREEMPT; rq->buffer = (char *) pc; rq->rq_disk = tape->disk; - (void) ide_do_drive_cmd(drive, rq, ide_preempt); + ide_do_drive_cmd(drive, rq); } /* diff --git a/drivers/scsi/ide-scsi.c b/drivers/scsi/ide-scsi.c index 58e30efe7a74..569ffde6d047 100644 --- a/drivers/scsi/ide-scsi.c +++ b/drivers/scsi/ide-scsi.c @@ -245,7 +245,8 @@ static int idescsi_check_condition(ide_drive_t *drive, ide_scsi_hex_dump(pc->c, 6); } rq->rq_disk = scsi->disk; - return ide_do_drive_cmd(drive, rq, ide_preempt); + ide_do_drive_cmd(drive, rq); + return 0; } static int idescsi_end_request(ide_drive_t *, int, int); diff --git a/include/linux/ide.h b/include/linux/ide.h index d8c86f0362c4..04267dc1edf2 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -851,17 +851,7 @@ int ide_wait_stat(ide_startstop_t *, ide_drive_t *, u8, u8, unsigned long); extern ide_startstop_t ide_do_reset (ide_drive_t *); -/* - * "action" parameter type for ide_do_drive_cmd() below. - */ -typedef enum { - ide_wait, /* insert rq at end of list, and wait for it */ - ide_preempt, /* insert rq in front of current request */ - ide_head_wait, /* insert rq in front of current request and wait for it */ - ide_end /* insert rq at end of list, but don't wait for it */ -} ide_action_t; - -extern int ide_do_drive_cmd(ide_drive_t *, struct request *, ide_action_t); +extern void ide_do_drive_cmd(ide_drive_t *, struct request *); extern void ide_end_drive_cmd(ide_drive_t *, u8, u8); -- cgit v1.2.3 From 92f5daff2b8439fa4c57c57f47823ffc459c3bd9 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 15 Jul 2008 21:21:55 +0200 Subject: ide-tape: make pc->idetape_callback void There should be no functional changes caused by this patch. 
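[ Illustration -- not part of the patch. The callback can become void because every implementation finished the same way, so the single call site may return ide_stopped itself. In miniature, with hypothetical names:

    #include <stdio.h>

    typedef enum { ide_stopped, ide_started } startstop_t;

    static void tape_callback(int uptodate)  /* was: startstop_t (*)(int) */
    {
            printf("complete request, uptodate=%d\n", uptodate);
    }

    static startstop_t command_finished(void (*cb)(int), int uptodate)
    {
            cb(uptodate);           /* callback no longer picks the status */
            return ide_stopped;     /* the lone caller does, once */
    }

    int main(void)
    {
            return command_finished(tape_callback, 1) == ide_stopped ? 0 : 1;
    }
]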
Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-tape.c | 13 +++++++------ include/linux/ide.h | 2 +- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index d387aaf0eb39..88d26efdf844 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -619,7 +619,7 @@ static int idetape_end_request(ide_drive_t *drive, int uptodate, int nr_sects) return 0; } -static ide_startstop_t ide_tape_callback(ide_drive_t *drive) +static void ide_tape_callback(ide_drive_t *drive) { idetape_tape_t *tape = drive->driver_data; struct ide_atapi_pc *pc = tape->pc; @@ -675,8 +675,6 @@ static ide_startstop_t ide_tape_callback(ide_drive_t *drive) } idetape_end_request(drive, uptodate, 0); - - return ide_stopped; } static void idetape_init_pc(struct ide_atapi_pc *pc) @@ -843,7 +841,8 @@ static ide_startstop_t idetape_pc_intr(ide_drive_t *drive) if (tape->failed_pc == pc) tape->failed_pc = NULL; /* Command finished - Call the callback function */ - return pc->idetape_callback(drive); + pc->idetape_callback(drive); + return ide_stopped; } if (pc->flags & PC_FLAG_DMA_IN_PROGRESS) { @@ -1035,7 +1034,8 @@ static ide_startstop_t idetape_issue_pc(ide_drive_t *drive, pc->error = IDETAPE_ERROR_GENERAL; } tape->failed_pc = NULL; - return pc->idetape_callback(drive); + pc->idetape_callback(drive); + return ide_stopped; } debug_log(DBG_SENSE, "Retry #%d, cmd = %02X\n", pc->retries, pc->c[0]); @@ -1120,7 +1120,8 @@ static ide_startstop_t idetape_media_access_finished(ide_drive_t *drive) pc->error = IDETAPE_ERROR_GENERAL; tape->failed_pc = NULL; } - return pc->idetape_callback(drive); + pc->idetape_callback(drive); + return ide_stopped; } static void idetape_create_read_cmd(idetape_tape_t *tape, diff --git a/include/linux/ide.h b/include/linux/ide.h index 04267dc1edf2..8936b21a7030 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -641,7 +641,7 @@ struct ide_atapi_pc { */ u8 pc_buf[256]; void (*idefloppy_callback) (ide_drive_t *); - ide_startstop_t (*idetape_callback) (ide_drive_t *); + void (*idetape_callback) (ide_drive_t *); /* idetape only */ struct idetape_bh *bh; -- cgit v1.2.3 From 1b06e92aa03018e4b3ba281e03a7711d9b71a998 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 15 Jul 2008 21:21:56 +0200 Subject: ide-{floppy,tape}: merge pc->idefloppy_callback and pc->idetape_callback Merge pc->idefloppy_callback and pc->idetape_callback into pc->callback. There should be no functional changes caused by this patch. 
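[ Illustration -- not part of the patch. Once the two per-driver pointers share one signature they can collapse into a single generic member, which is what later lets common helpers complete a command without knowing which driver owns it. A toy model with a hypothetical struct:

    #include <stdio.h>

    struct packet_cmd {
            unsigned char cdb[12];
            void (*callback)(struct packet_cmd *); /* one field, any driver */
    };

    static void floppy_done(struct packet_cmd *pc) { (void)pc; printf("floppy done\n"); }
    static void tape_done(struct packet_cmd *pc)   { (void)pc; printf("tape done\n"); }

    int main(void)
    {
            struct packet_cmd a = { .callback = floppy_done };
            struct packet_cmd b = { .callback = tape_done };

            a.callback(&a);  /* generic code treats both identically */
            b.callback(&b);
            return 0;
    }
]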
Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-floppy.c | 6 +++--- drivers/ide/ide-tape.c | 8 ++++---- include/linux/ide.h | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index 2058a6f3f331..a9f3127a74ed 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -354,7 +354,7 @@ static void idefloppy_init_pc(struct ide_atapi_pc *pc) pc->req_xfer = 0; pc->buf = pc->pc_buf; pc->buf_size = IDEFLOPPY_PC_BUFFER_SIZE; - pc->idefloppy_callback = &ide_floppy_callback; + pc->callback = ide_floppy_callback; } static void idefloppy_create_request_sense_cmd(struct ide_atapi_pc *pc) @@ -438,7 +438,7 @@ static ide_startstop_t idefloppy_pc_intr(ide_drive_t *drive) if (floppy->failed_pc == pc) floppy->failed_pc = NULL; /* Command finished - Call the callback function */ - pc->idefloppy_callback(drive); + pc->callback(drive); return ide_stopped; } @@ -612,7 +612,7 @@ static ide_startstop_t idefloppy_issue_pc(ide_drive_t *drive, pc->error = IDEFLOPPY_ERROR_GENERAL; floppy->failed_pc = NULL; - pc->idefloppy_callback(drive); + pc->callback(drive); return ide_stopped; } diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 88d26efdf844..ce9b6d327528 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -687,7 +687,7 @@ static void idetape_init_pc(struct ide_atapi_pc *pc) pc->buf_size = IDETAPE_PC_BUFFER_SIZE; pc->bh = NULL; pc->b_data = NULL; - pc->idetape_callback = ide_tape_callback; + pc->callback = ide_tape_callback; } static void idetape_create_request_sense_cmd(struct ide_atapi_pc *pc) @@ -841,7 +841,7 @@ static ide_startstop_t idetape_pc_intr(ide_drive_t *drive) if (tape->failed_pc == pc) tape->failed_pc = NULL; /* Command finished - Call the callback function */ - pc->idetape_callback(drive); + pc->callback(drive); return ide_stopped; } @@ -1034,7 +1034,7 @@ static ide_startstop_t idetape_issue_pc(ide_drive_t *drive, pc->error = IDETAPE_ERROR_GENERAL; } tape->failed_pc = NULL; - pc->idetape_callback(drive); + pc->callback(drive); return ide_stopped; } debug_log(DBG_SENSE, "Retry #%d, cmd = %02X\n", pc->retries, pc->c[0]); @@ -1120,7 +1120,7 @@ static ide_startstop_t idetape_media_access_finished(ide_drive_t *drive) pc->error = IDETAPE_ERROR_GENERAL; tape->failed_pc = NULL; } - pc->idetape_callback(drive); + pc->callback(drive); return ide_stopped; } diff --git a/include/linux/ide.h b/include/linux/ide.h index 8936b21a7030..f079456adfdb 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -640,8 +640,8 @@ struct ide_atapi_pc { * to change/removal later. */ u8 pc_buf[256]; - void (*idefloppy_callback) (ide_drive_t *); - void (*idetape_callback) (ide_drive_t *); + + void (*callback)(ide_drive_t *); /* idetape only */ struct idetape_bh *bh; -- cgit v1.2.3 From 5e3310958204912f3f00be2592c945fbc37db6ae Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 15 Jul 2008 21:21:56 +0200 Subject: ide-{floppy,tape}: PC_FLAG_DMA_RECOMMENDED -> PC_FLAG_DMA_OK * Use PC_FLAG_DMA_OK flag instead of PC_FLAG_DMA_RECOMMENDED one. * Remove no longer used PC_FLAG_DMA_RECOMMENDED flag. There should be no functional changes caused by this patch. 
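[ Illustration -- not part of the patch. Deleting a bit and renumbering the ones behind it is safe here because these flags live only in an in-memory struct for the lifetime of one command; nothing persists them or exposes them across an ABI. The values below mirror the resulting enum, and the assertions show the usual test/set/clear idiom:

    #include <assert.h>

    enum {
            PC_FLAG_DMA_OK          = 1 << 3,
            PC_FLAG_DMA_IN_PROGRESS = 1 << 4,  /* was 1 << 5 */
            PC_FLAG_DMA_ERROR       = 1 << 5,  /* was 1 << 6 */
            PC_FLAG_WRITING         = 1 << 6,  /* was 1 << 7 */
            PC_FLAG_TIMEDOUT        = 1 << 7,  /* was 1 << 8 */
    };

    int main(void)
    {
            unsigned int flags = PC_FLAG_DMA_OK | PC_FLAG_WRITING;

            flags &= ~PC_FLAG_DMA_OK;            /* clear one flag */
            assert(!(flags & PC_FLAG_DMA_OK));
            assert(flags & PC_FLAG_WRITING);     /* neighbours untouched */
            return 0;
    }
]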
Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-floppy.c | 6 +++--- drivers/ide/ide-tape.c | 6 +++--- include/linux/ide.h | 9 ++++----- 3 files changed, 10 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index a9f3127a74ed..dbefe35c1396 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -630,7 +630,7 @@ static ide_startstop_t idefloppy_issue_pc(ide_drive_t *drive, } dma = 0; - if ((pc->flags & PC_FLAG_DMA_RECOMMENDED) && drive->using_dma) + if ((pc->flags & PC_FLAG_DMA_OK) && drive->using_dma) dma = !hwif->dma_ops->dma_setup(drive); ide_pktcmd_tf_load(drive, IDE_TFLAG_OUT_DEVICE, bcount, dma); @@ -755,7 +755,7 @@ static void idefloppy_create_rw_cmd(idefloppy_floppy_t *floppy, pc->flags |= PC_FLAG_WRITING; pc->buf = NULL; pc->req_xfer = pc->buf_size = blocks * floppy->block_size; - pc->flags |= PC_FLAG_DMA_RECOMMENDED; + pc->flags |= PC_FLAG_DMA_OK; } static void idefloppy_blockpc_cmd(idefloppy_floppy_t *floppy, @@ -769,7 +769,7 @@ static void idefloppy_blockpc_cmd(idefloppy_floppy_t *floppy, pc->flags |= PC_FLAG_WRITING; pc->buf = rq->data; if (rq->bio) - pc->flags |= PC_FLAG_DMA_RECOMMENDED; + pc->flags |= PC_FLAG_DMA_OK; /* * possibly problematic, doesn't look like ide-floppy correctly * handled scattered requests if dma fails... diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index ce9b6d327528..e8a5852fa2d4 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -1050,7 +1050,7 @@ static ide_startstop_t idetape_issue_pc(ide_drive_t *drive, pc->flags &= ~PC_FLAG_DMA_ERROR; ide_dma_off(drive); } - if ((pc->flags & PC_FLAG_DMA_RECOMMENDED) && drive->using_dma) + if ((pc->flags & PC_FLAG_DMA_OK) && drive->using_dma) dma_ok = !hwif->dma_ops->dma_setup(drive); ide_pktcmd_tf_load(drive, IDE_TFLAG_OUT_DEVICE, bcount, dma_ok); @@ -1138,7 +1138,7 @@ static void idetape_create_read_cmd(idetape_tape_t *tape, pc->buf_size = length * tape->blk_size; pc->req_xfer = pc->buf_size; if (pc->req_xfer == tape->buffer_size) - pc->flags |= PC_FLAG_DMA_RECOMMENDED; + pc->flags |= PC_FLAG_DMA_OK; } static void idetape_create_write_cmd(idetape_tape_t *tape, @@ -1157,7 +1157,7 @@ static void idetape_create_write_cmd(idetape_tape_t *tape, pc->buf_size = length * tape->blk_size; pc->req_xfer = pc->buf_size; if (pc->req_xfer == tape->buffer_size) - pc->flags |= PC_FLAG_DMA_RECOMMENDED; + pc->flags |= PC_FLAG_DMA_OK; } static ide_startstop_t idetape_do_request(ide_drive_t *drive, diff --git a/include/linux/ide.h b/include/linux/ide.h index f079456adfdb..63cee2947f60 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -602,12 +602,11 @@ enum { PC_FLAG_SUPPRESS_ERROR = (1 << 1), PC_FLAG_WAIT_FOR_DSC = (1 << 2), PC_FLAG_DMA_OK = (1 << 3), - PC_FLAG_DMA_RECOMMENDED = (1 << 4), - PC_FLAG_DMA_IN_PROGRESS = (1 << 5), - PC_FLAG_DMA_ERROR = (1 << 6), - PC_FLAG_WRITING = (1 << 7), + PC_FLAG_DMA_IN_PROGRESS = (1 << 4), + PC_FLAG_DMA_ERROR = (1 << 5), + PC_FLAG_WRITING = (1 << 6), /* command timed out */ - PC_FLAG_TIMEDOUT = (1 << 8), + PC_FLAG_TIMEDOUT = (1 << 7), }; struct ide_atapi_pc { -- cgit v1.2.3 From 5d41893c0f9caf94b449eada0279a08c86f0212e Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 15 Jul 2008 21:21:57 +0200 Subject: ide: add PC_FLAG_ZIP_DRIVE pc flag Add PC_FLAG_ZIP_DRIVE pc flag, set it in idefloppy_do_request() and check for it (instead of checking for IDEFLOPPY_FLAG_ZIP_DRIVE) in idefloppy_transfer_pc(). 
This is a preparation for adding generic ide_transfer_pc() helper. There should be no functional changes caused by this patch. Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-floppy.c | 8 ++++++-- include/linux/ide.h | 1 + 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index 1df6a3143596..cff90c4b217e 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -559,7 +559,7 @@ static ide_startstop_t idefloppy_transfer_pc1(ide_drive_t *drive) * 40 and 50msec work well. idefloppy_pc_intr will not be actually * used until after the packet is moved in about 50 msec. */ - if (floppy->flags & IDEFLOPPY_FLAG_ZIP_DRIVE) { + if (pc->flags & PC_FLAG_ZIP_DRIVE) { timeout = floppy->ticks; expiry = &idefloppy_transfer_pc2; } else { @@ -575,7 +575,7 @@ static ide_startstop_t idefloppy_transfer_pc1(ide_drive_t *drive) hwif->dma_ops->dma_start(drive); } - if ((floppy->flags & IDEFLOPPY_FLAG_ZIP_DRIVE) == 0) + if ((pc->flags & PC_FLAG_ZIP_DRIVE) == 0) /* Send the actual packet */ hwif->output_data(drive, NULL, floppy->pc->c, 12); @@ -826,7 +826,11 @@ static ide_startstop_t idefloppy_do_request(ide_drive_t *drive, return ide_stopped; } + if (floppy->flags & IDEFLOPPY_FLAG_ZIP_DRIVE) + pc->flags |= PC_FLAG_ZIP_DRIVE; + pc->rq = rq; + return idefloppy_issue_pc(drive, pc); } diff --git a/include/linux/ide.h b/include/linux/ide.h index 63cee2947f60..89feaea9e20b 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -607,6 +607,7 @@ enum { PC_FLAG_WRITING = (1 << 6), /* command timed out */ PC_FLAG_TIMEDOUT = (1 << 7), + PC_FLAG_ZIP_DRIVE = (1 << 8), }; struct ide_atapi_pc { -- cgit v1.2.3 From 594c16d8dd54cd7b1c5ef1ec3ac0f6bf34301dad Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 15 Jul 2008 21:21:58 +0200 Subject: ide: add ide_transfer_pc() helper * Add ide-atapi.c file for generic ATAPI support together with CONFIG_IDE_ATAPI config option. * Add generic ide_transfer_pc() helper to ide-atapi.c and then convert ide-{floppy,tape,scsi} device drivers to use it. There should be no functional changes caused by this patch. Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/Kconfig | 6 +++++ drivers/ide/Makefile | 1 + drivers/ide/ide-atapi.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++ drivers/ide/ide-floppy.c | 28 +------------------ drivers/ide/ide-tape.c | 56 ++------------------------------------ drivers/scsi/ide-scsi.c | 30 ++------------------- include/linux/ide.h | 3 +++ 7 files changed, 85 insertions(+), 109 deletions(-) create mode 100644 drivers/ide/ide-atapi.c (limited to 'include/linux') diff --git a/drivers/ide/Kconfig b/drivers/ide/Kconfig index 1607536ff5fb..cf707c8f08d4 100644 --- a/drivers/ide/Kconfig +++ b/drivers/ide/Kconfig @@ -98,6 +98,9 @@ if BLK_DEV_IDE comment "Please see Documentation/ide/ide.txt for help/info on IDE drives" +config IDE_ATAPI + bool + config BLK_DEV_IDE_SATA bool "Support for SATA (deprecated; conflicts with libata SATA driver)" default n @@ -201,6 +204,7 @@ config BLK_DEV_IDECD_VERBOSE_ERRORS config BLK_DEV_IDETAPE tristate "Include IDE/ATAPI TAPE support" + select IDE_ATAPI help If you have an IDE tape drive using the ATAPI protocol, say Y. 
ATAPI is a newer protocol used by IDE tape and CD-ROM drives, @@ -223,6 +227,7 @@ config BLK_DEV_IDETAPE config BLK_DEV_IDEFLOPPY tristate "Include IDE/ATAPI FLOPPY support" + select IDE_ATAPI ---help--- If you have an IDE floppy drive which uses the ATAPI protocol, answer Y. ATAPI is a newer protocol used by IDE CD-ROM/tape/floppy @@ -246,6 +251,7 @@ config BLK_DEV_IDEFLOPPY config BLK_DEV_IDESCSI tristate "SCSI emulation support" depends on SCSI + select IDE_ATAPI ---help--- WARNING: ide-scsi is no longer needed for cd writing applications! The 2.6 kernel supports direct writing to ide-cd, which eliminates diff --git a/drivers/ide/Makefile b/drivers/ide/Makefile index f94b679b611e..a2b3f84d710d 100644 --- a/drivers/ide/Makefile +++ b/drivers/ide/Makefile @@ -14,6 +14,7 @@ EXTRA_CFLAGS += -Idrivers/ide ide-core-y += ide.o ide-io.o ide-iops.o ide-lib.o ide-probe.o ide-taskfile.o # core IDE code +ide-core-$(CONFIG_IDE_ATAPI) += ide-atapi.o ide-core-$(CONFIG_BLK_DEV_IDEPCI) += setup-pci.o ide-core-$(CONFIG_BLK_DEV_IDEDMA) += ide-dma.o ide-core-$(CONFIG_IDE_PROC_FS) += ide-proc.o diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c new file mode 100644 index 000000000000..25939bc60402 --- /dev/null +++ b/drivers/ide/ide-atapi.c @@ -0,0 +1,70 @@ +/* + * ATAPI support. + */ + +#include <linux/kernel.h> +#include <linux/delay.h> +#include <linux/ide.h> + +static u8 ide_wait_ireason(ide_drive_t *drive, u8 ireason) +{ + ide_hwif_t *hwif = drive->hwif; + int retries = 100; + + while (retries-- && ((ireason & CD) == 0 || (ireason & IO))) { + printk(KERN_ERR "%s: (IO,CoD != (0,1) while issuing " + "a packet command, retrying\n", drive->name); + udelay(100); + ireason = hwif->INB(hwif->io_ports.nsect_addr); + if (retries == 0) { + printk(KERN_ERR "%s: (IO,CoD != (0,1) while issuing " + "a packet command, ignoring\n", + drive->name); + ireason |= CD; + ireason &= ~IO; + } + } + + return ireason; +} + +ide_startstop_t ide_transfer_pc(ide_drive_t *drive, struct ide_atapi_pc *pc, + ide_handler_t *handler, unsigned int timeout, + ide_expiry_t *expiry) +{ + ide_hwif_t *hwif = drive->hwif; + ide_startstop_t startstop; + u8 ireason; + + if (ide_wait_stat(&startstop, drive, DRQ_STAT, BUSY_STAT, WAIT_READY)) { + printk(KERN_ERR "%s: Strange, packet command initiated yet " + "DRQ isn't asserted\n", drive->name); + return startstop; + } + + ireason = hwif->INB(hwif->io_ports.nsect_addr); + if (drive->media == ide_tape && !drive->scsi) + ireason = ide_wait_ireason(drive, ireason); + + if ((ireason & CD) == 0 || (ireason & IO)) { + printk(KERN_ERR "%s: (IO,CoD) != (0,1) while issuing " + "a packet command\n", drive->name); + return ide_do_reset(drive); + } + + /* Set the interrupt routine */ + ide_set_handler(drive, handler, timeout, expiry); + + /* Begin DMA, if necessary */ + if (pc->flags & PC_FLAG_DMA_OK) { + pc->flags |= PC_FLAG_DMA_IN_PROGRESS; + hwif->dma_ops->dma_start(drive); + } + + /* Send the actual packet */ + if ((pc->flags & PC_FLAG_ZIP_DRIVE) == 0) + hwif->output_data(drive, NULL, pc->c, 12); + + return ide_started; +} +EXPORT_SYMBOL_GPL(ide_transfer_pc); diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index a7c138dc324c..e7a1025c03c4 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -532,25 +532,11 @@ static int idefloppy_transfer_pc2(ide_drive_t *drive) static ide_startstop_t idefloppy_transfer_pc1(ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; idefloppy_floppy_t *floppy = drive->driver_data; struct ide_atapi_pc *pc = floppy->pc; ide_expiry_t *expiry; unsigned int timeout; -
ide_startstop_t startstop; - u8 ireason; - if (ide_wait_stat(&startstop, drive, DRQ_STAT, BUSY_STAT, WAIT_READY)) { - printk(KERN_ERR "%s: Strange, packet command initiated yet " - "DRQ isn't asserted\n", drive->name); - return startstop; - } - ireason = hwif->INB(hwif->io_ports.nsect_addr); - if ((ireason & CD) == 0 || (ireason & IO)) { - printk(KERN_ERR "%s: (IO,CoD) != (0,1) while issuing " - "a packet command\n", drive->name); - return ide_do_reset(drive); - } /* * The following delay solves a problem with ATAPI Zip 100 drives * where the Busy flag was apparently being deasserted before the @@ -567,19 +553,7 @@ static ide_startstop_t idefloppy_transfer_pc1(ide_drive_t *drive) expiry = NULL; } - ide_set_handler(drive, &idefloppy_pc_intr, timeout, expiry); - - /* Begin DMA, if necessary */ - if (pc->flags & PC_FLAG_DMA_OK) { - pc->flags |= PC_FLAG_DMA_IN_PROGRESS; - hwif->dma_ops->dma_start(drive); - } - - if ((pc->flags & PC_FLAG_ZIP_DRIVE) == 0) - /* Send the actual packet */ - hwif->output_data(drive, NULL, floppy->pc->c, 12); - - return ide_started; + return ide_transfer_pc(drive, pc, idefloppy_pc_intr, timeout, expiry); } static void ide_floppy_report_error(idefloppy_floppy_t *floppy, diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 2a362138f973..5adc2c9ae418 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -947,64 +947,12 @@ static ide_startstop_t idetape_pc_intr(ide_drive_t *drive) * again, the callback function will be called and then we will handle the next * request. */ - -static u8 ide_tape_wait_ireason(ide_drive_t *drive, u8 ireason) -{ - ide_hwif_t *hwif = drive->hwif; - int retries = 100; - - while (retries-- && ((ireason & CD) == 0 || (ireason & IO))) { - printk(KERN_ERR "%s: (IO,CoD != (0,1) while issuing " - "a packet command, retrying\n", drive->name); - udelay(100); - ireason = hwif->INB(hwif->io_ports.nsect_addr); - if (retries == 0) { - printk(KERN_ERR "%s: (IO,CoD != (0,1) while issuing " - "a packet command, ignoring\n", - drive->name); - ireason |= CD; - ireason &= ~IO; - } - } - - return ireason; -} - static ide_startstop_t idetape_transfer_pc(ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; idetape_tape_t *tape = drive->driver_data; - struct ide_atapi_pc *pc = tape->pc; - ide_startstop_t startstop; - u8 ireason; - - if (ide_wait_stat(&startstop, drive, DRQ_STAT, BUSY_STAT, WAIT_READY)) { - printk(KERN_ERR "%s: Strange, packet command initiated yet " - "DRQ isn't asserted\n", drive->name); - return startstop; - } - - ireason = hwif->INB(hwif->io_ports.nsect_addr); - ireason = ide_tape_wait_ireason(drive, ireason); - if ((ireason & CD) == 0 || (ireason & IO)) { - printk(KERN_ERR "%s: (IO,CoD) != (0,1) while issuing " - "a packet command\n", drive->name); - return ide_do_reset(drive); - } - /* Set the interrupt routine */ - ide_set_handler(drive, &idetape_pc_intr, IDETAPE_WAIT_CMD, NULL); - - /* Begin DMA, if necessary */ - if (pc->flags & PC_FLAG_DMA_OK) { - pc->flags |= PC_FLAG_DMA_IN_PROGRESS; - hwif->dma_ops->dma_start(drive); - } - - /* Send the actual packet */ - hwif->output_data(drive, NULL, pc->c, 12); - - return ide_started; + return ide_transfer_pc(drive, tape->pc, idetape_pc_intr, + IDETAPE_WAIT_CMD, NULL); } static ide_startstop_t idetape_issue_pc(ide_drive_t *drive, diff --git a/drivers/scsi/ide-scsi.c b/drivers/scsi/ide-scsi.c index c9fdf60c9dcf..d41348f2245e 100644 --- a/drivers/scsi/ide-scsi.c +++ b/drivers/scsi/ide-scsi.c @@ -453,36 +453,10 @@ static ide_startstop_t idescsi_pc_intr (ide_drive_t *drive) 
static ide_startstop_t idescsi_transfer_pc(ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; idescsi_scsi_t *scsi = drive_to_idescsi(drive); - struct ide_atapi_pc *pc = scsi->pc; - ide_startstop_t startstop; - u8 ireason; - - if (ide_wait_stat(&startstop,drive,DRQ_STAT,BUSY_STAT,WAIT_READY)) { - printk(KERN_ERR "%s: Strange, packet command initiated yet " - "DRQ isn't asserted\n", drive->name); - return startstop; - } - ireason = hwif->INB(hwif->io_ports.nsect_addr); - if ((ireason & CD) == 0 || (ireason & IO)) { - printk(KERN_ERR "%s: (IO,CoD) != (0,1) while issuing " - "a packet command\n", drive->name); - return ide_do_reset (drive); - } - /* Set the interrupt routine */ - ide_set_handler(drive, &idescsi_pc_intr, get_timeout(pc), idescsi_expiry); - - if (pc->flags & PC_FLAG_DMA_OK) { - pc->flags |= PC_FLAG_DMA_IN_PROGRESS; - hwif->dma_ops->dma_start(drive); - } - - /* Send the actual packet */ - hwif->output_data(drive, NULL, scsi->pc->c, 12); - - return ide_started; + return ide_transfer_pc(drive, scsi->pc, idescsi_pc_intr, + get_timeout(scsi->pc), idescsi_expiry); } static inline int idescsi_set_direction(struct ide_atapi_pc *pc) diff --git a/include/linux/ide.h b/include/linux/ide.h index 89feaea9e20b..bed3c58798ae 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -967,6 +967,9 @@ extern int drive_is_ready(ide_drive_t *); void ide_pktcmd_tf_load(ide_drive_t *, u32, u16, u8); +ide_startstop_t ide_transfer_pc(ide_drive_t *, struct ide_atapi_pc *, + ide_handler_t *, unsigned int, ide_expiry_t *); + ide_startstop_t do_rw_taskfile(ide_drive_t *, ide_task_t *); void task_end_request(ide_drive_t *, struct request *, u8); -- cgit v1.2.3 From 28c7214bd8c2bbd4873b8f1e7f58d86d3731124f Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 15 Jul 2008 21:21:59 +0200 Subject: ide: add PC_FLAG_DRQ_INTERRUPT pc flag Add PC_FLAG_DRQ_INTERRUPT pc flag, set it in ide*_do_request() and check for it (instead of checking for IDE*_FLAG_DRQ_INTERRUPT) in ide*_issue_pc(). This is a preparation for adding generic ide_issue_pc() helper. There should be no functional changes caused by this patch. 
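[ Illustration -- not part of the patch. Copying the per-drive quirk onto the command itself is what will free the generic issue path from peeking at driver-private flag words. A standalone sketch of the hand-off; the names are invented and only the (1 << 9) value matches the hunk:

    #include <stdio.h>

    #define DRIVE_QUIRK_DRQ_IRQ  (1 << 0)   /* driver-private knowledge */
    #define PC_FLAG_DRQ_IRQ      (1 << 9)   /* generic, carried by the command */

    struct cmd {
            unsigned int flags;
    };

    static void issue(struct cmd *pc)       /* future generic helper */
    {
            if (pc->flags & PC_FLAG_DRQ_IRQ)
                    printf("wait for DRQ interrupt before sending packet\n");
            else
                    printf("send packet immediately\n");
    }

    int main(void)
    {
            unsigned int drive_flags = DRIVE_QUIRK_DRQ_IRQ;
            struct cmd pc = { .flags = 0 };

            if (drive_flags & DRIVE_QUIRK_DRQ_IRQ) /* translated once, in the driver */
                    pc.flags |= PC_FLAG_DRQ_IRQ;

            issue(&pc);
            return 0;
    }
]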
Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-floppy.c | 5 ++++- drivers/ide/ide-tape.c | 11 ++++++++--- drivers/scsi/ide-scsi.c | 6 +++++- include/linux/ide.h | 1 + 4 files changed, 18 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index e7a1025c03c4..13f650fa2125 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -619,7 +619,7 @@ static ide_startstop_t idefloppy_issue_pc(ide_drive_t *drive, ide_pktcmd_tf_load(drive, IDE_TFLAG_OUT_DEVICE, bcount, dma); - if (floppy->flags & IDEFLOPPY_FLAG_DRQ_INTERRUPT) { + if (pc->flags & PC_FLAG_DRQ_INTERRUPT) { /* Issue the packet command */ ide_execute_command(drive, WIN_PACKETCMD, &idefloppy_transfer_pc1, @@ -800,6 +800,9 @@ static ide_startstop_t idefloppy_do_request(ide_drive_t *drive, return ide_stopped; } + if (floppy->flags & IDEFLOPPY_FLAG_DRQ_INTERRUPT) + pc->flags |= PC_FLAG_DRQ_INTERRUPT; + if (floppy->flags & IDEFLOPPY_FLAG_ZIP_DRIVE) pc->flags |= PC_FLAG_ZIP_DRIVE; diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 5adc2c9ae418..cba18a675506 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -1020,7 +1020,7 @@ static ide_startstop_t idetape_issue_pc(ide_drive_t *drive, ide_pktcmd_tf_load(drive, IDE_TFLAG_OUT_DEVICE, bcount, dma_ok); - if (test_bit(IDETAPE_FLAG_DRQ_INTERRUPT, &tape->flags)) { + if (pc->flags & PC_FLAG_DRQ_INTERRUPT) { ide_execute_command(drive, WIN_PACKETCMD, &idetape_transfer_pc, IDETAPE_WAIT_CMD, NULL); return ide_started; @@ -1143,8 +1143,10 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive, } /* Retry a failed packet command */ - if (tape->failed_pc && tape->pc->c[0] == REQUEST_SENSE) - return idetape_issue_pc(drive, tape->failed_pc); + if (tape->failed_pc && tape->pc->c[0] == REQUEST_SENSE) { + pc = tape->failed_pc; + goto out; + } if (postponed_rq != NULL) if (rq != postponed_rq) { @@ -1216,6 +1218,9 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive, } BUG(); out: + if (test_bit(IDETAPE_FLAG_DRQ_INTERRUPT, &tape->flags)) + pc->flags |= PC_FLAG_DRQ_INTERRUPT; + return idetape_issue_pc(drive, pc); } diff --git a/drivers/scsi/ide-scsi.c b/drivers/scsi/ide-scsi.c index 1d261298d61a..b7c5e8391575 100644 --- a/drivers/scsi/ide-scsi.c +++ b/drivers/scsi/ide-scsi.c @@ -525,7 +525,7 @@ static ide_startstop_t idescsi_issue_pc(ide_drive_t *drive, ide_pktcmd_tf_load(drive, 0, bcount, dma); - if (test_bit(IDESCSI_DRQ_INTERRUPT, &scsi->flags)) { + if (pc->flags & PC_FLAG_DRQ_INTERRUPT) { ide_execute_command(drive, WIN_PACKETCMD, &idescsi_transfer_pc, get_timeout(pc), idescsi_expiry); return ide_started; @@ -548,6 +548,10 @@ static ide_startstop_t idescsi_do_request (ide_drive_t *drive, struct request *r if (blk_sense_request(rq) || blk_special_request(rq)) { struct ide_atapi_pc *pc = (struct ide_atapi_pc *)rq->special; + idescsi_scsi_t *scsi = drive_to_idescsi(drive); + + if (test_bit(IDESCSI_DRQ_INTERRUPT, &scsi->flags)) + pc->flags |= PC_FLAG_DRQ_INTERRUPT; if (drive->using_dma && !idescsi_map_sg(drive, pc)) pc->flags |= PC_FLAG_DMA_OK; diff --git a/include/linux/ide.h b/include/linux/ide.h index bed3c58798ae..c2274ad44b2e 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -608,6 +608,7 @@ enum { /* command timed out */ PC_FLAG_TIMEDOUT = (1 << 7), PC_FLAG_ZIP_DRIVE = (1 << 8), + PC_FLAG_DRQ_INTERRUPT = (1 << 9), }; struct ide_atapi_pc { -- cgit v1.2.3 From 6bf1641ca1c7554f0da54aaf89788731b541bacc Mon Sep 17 00:00:00 2001 From: 
Bartlomiej Zolnierkiewicz Date: Tue, 15 Jul 2008 21:22:00 +0200 Subject: ide: add ide_issue_pc() helper Add generic ide_issue_pc() helper to ide-atapi.c and then convert ide-{floppy,tape,scsi} device drivers to use it. There should be no functional changes caused by this patch. Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-atapi.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++++ drivers/ide/ide-floppy.c | 35 ++-------------------------------- drivers/ide/ide-tape.c | 30 ++--------------------------- drivers/scsi/ide-scsi.c | 30 ++--------------------------- include/linux/ide.h | 2 ++ 5 files changed, 57 insertions(+), 89 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 25939bc60402..932a83abaf06 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -68,3 +68,52 @@ ide_startstop_t ide_transfer_pc(ide_drive_t *drive, struct ide_atapi_pc *pc, return ide_started; } EXPORT_SYMBOL_GPL(ide_transfer_pc); + +ide_startstop_t ide_issue_pc(ide_drive_t *drive, struct ide_atapi_pc *pc, + ide_handler_t *handler, unsigned int timeout, + ide_expiry_t *expiry) +{ + ide_hwif_t *hwif = drive->hwif; + u16 bcount; + u8 dma = 0; + + /* We haven't transferred any data yet */ + pc->xferred = 0; + pc->cur_pos = pc->buf; + + /* Request to transfer the entire buffer at once */ + if (drive->media == ide_tape && !drive->scsi) + bcount = pc->req_xfer; + else + bcount = min(pc->req_xfer, 63 * 1024); + + if (pc->flags & PC_FLAG_DMA_ERROR) { + pc->flags &= ~PC_FLAG_DMA_ERROR; + ide_dma_off(drive); + } + + if ((pc->flags & PC_FLAG_DMA_OK) && drive->using_dma) { + if (drive->scsi) + hwif->sg_mapped = 1; + dma = !hwif->dma_ops->dma_setup(drive); + if (drive->scsi) + hwif->sg_mapped = 0; + } + + if (!dma) + pc->flags &= ~PC_FLAG_DMA_OK; + + ide_pktcmd_tf_load(drive, drive->scsi ? 
0 : IDE_TFLAG_OUT_DEVICE, + bcount, dma); + + /* Issue the packet command */ + if (pc->flags & PC_FLAG_DRQ_INTERRUPT) { + ide_execute_command(drive, WIN_PACKETCMD, handler, + timeout, NULL); + return ide_started; + } else { + ide_execute_pkt_cmd(drive); + return (*handler)(drive); + } +} +EXPORT_SYMBOL_GPL(ide_issue_pc); diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index 13f650fa2125..e658aafc51da 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -576,9 +576,6 @@ static ide_startstop_t idefloppy_issue_pc(ide_drive_t *drive, struct ide_atapi_pc *pc) { idefloppy_floppy_t *floppy = drive->driver_data; - ide_hwif_t *hwif = drive->hwif; - u16 bcount; - u8 dma; if (floppy->failed_pc == NULL && pc->c[0] != GPCMD_REQUEST_SENSE) @@ -600,37 +597,9 @@ static ide_startstop_t idefloppy_issue_pc(ide_drive_t *drive, debug_log("Retry number - %d\n", pc->retries); pc->retries++; - /* We haven't transferred any data yet */ - pc->xferred = 0; - pc->cur_pos = pc->buf; - bcount = min(pc->req_xfer, 63 * 1024); - - if (pc->flags & PC_FLAG_DMA_ERROR) { - pc->flags &= ~PC_FLAG_DMA_ERROR; - ide_dma_off(drive); - } - dma = 0; - if ((pc->flags & PC_FLAG_DMA_OK) && drive->using_dma) - dma = !hwif->dma_ops->dma_setup(drive); - - if (!dma) - pc->flags &= ~PC_FLAG_DMA_OK; - - ide_pktcmd_tf_load(drive, IDE_TFLAG_OUT_DEVICE, bcount, dma); - - if (pc->flags & PC_FLAG_DRQ_INTERRUPT) { - /* Issue the packet command */ - ide_execute_command(drive, WIN_PACKETCMD, - &idefloppy_transfer_pc1, - IDEFLOPPY_WAIT_CMD, - NULL); - return ide_started; - } else { - /* Issue the packet command */ - ide_execute_pkt_cmd(drive); - return idefloppy_transfer_pc1(drive); - } + return ide_issue_pc(drive, pc, idefloppy_transfer_pc1, + IDEFLOPPY_WAIT_CMD, NULL); } static void idefloppy_create_prevent_cmd(struct ide_atapi_pc *pc, int prevent) diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index cba18a675506..7907a1e41918 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -958,10 +958,7 @@ static ide_startstop_t idetape_transfer_pc(ide_drive_t *drive) static ide_startstop_t idetape_issue_pc(ide_drive_t *drive, struct ide_atapi_pc *pc) { - ide_hwif_t *hwif = drive->hwif; idetape_tape_t *tape = drive->driver_data; - int dma_ok = 0; - u16 bcount; if (tape->pc->c[0] == REQUEST_SENSE && pc->c[0] == REQUEST_SENSE) { @@ -1002,32 +999,9 @@ static ide_startstop_t idetape_issue_pc(ide_drive_t *drive, debug_log(DBG_SENSE, "Retry #%d, cmd = %02X\n", pc->retries, pc->c[0]); pc->retries++; - /* We haven't transferred any data yet */ - pc->xferred = 0; - pc->cur_pos = pc->buf; - /* Request to transfer the entire buffer at once */ - bcount = pc->req_xfer; - - if (pc->flags & PC_FLAG_DMA_ERROR) { - pc->flags &= ~PC_FLAG_DMA_ERROR; - ide_dma_off(drive); - } - if ((pc->flags & PC_FLAG_DMA_OK) && drive->using_dma) - dma_ok = !hwif->dma_ops->dma_setup(drive); - - if (!dma_ok) - pc->flags &= ~PC_FLAG_DMA_OK; - - ide_pktcmd_tf_load(drive, IDE_TFLAG_OUT_DEVICE, bcount, dma_ok); - if (pc->flags & PC_FLAG_DRQ_INTERRUPT) { - ide_execute_command(drive, WIN_PACKETCMD, &idetape_transfer_pc, - IDETAPE_WAIT_CMD, NULL); - return ide_started; - } else { - ide_execute_pkt_cmd(drive); - return idetape_transfer_pc(drive); - } + return ide_issue_pc(drive, pc, idetape_transfer_pc, + IDETAPE_WAIT_CMD, NULL); } /* A mode sense command is used to "sense" tape parameters. 
*/ diff --git a/drivers/scsi/ide-scsi.c b/drivers/scsi/ide-scsi.c index b7c5e8391575..32415466fbfe 100644 --- a/drivers/scsi/ide-scsi.c +++ b/drivers/scsi/ide-scsi.c @@ -502,38 +502,12 @@ static ide_startstop_t idescsi_issue_pc(ide_drive_t *drive, struct ide_atapi_pc *pc) { idescsi_scsi_t *scsi = drive_to_idescsi(drive); - ide_hwif_t *hwif = drive->hwif; - u16 bcount; - u8 dma = 0; /* Set the current packet command */ scsi->pc = pc; - /* We haven't transferred any data yet */ - pc->xferred = 0; - pc->cur_pos = pc->buf; - /* Request to transfer the entire buffer at once */ - bcount = min(pc->req_xfer, 63 * 1024); - - if ((pc->flags & PC_FLAG_DMA_OK) && drive->using_dma) { - hwif->sg_mapped = 1; - dma = !hwif->dma_ops->dma_setup(drive); - hwif->sg_mapped = 0; - } - - if (!dma) - pc->flags &= ~PC_FLAG_DMA_OK; - ide_pktcmd_tf_load(drive, 0, bcount, dma); - - if (pc->flags & PC_FLAG_DRQ_INTERRUPT) { - ide_execute_command(drive, WIN_PACKETCMD, &idescsi_transfer_pc, - get_timeout(pc), idescsi_expiry); - return ide_started; - } else { - /* Issue the packet command */ - ide_execute_pkt_cmd(drive); - return idescsi_transfer_pc(drive); - } + return ide_issue_pc(drive, pc, idescsi_transfer_pc, + get_timeout(pc), idescsi_expiry); } /* diff --git a/include/linux/ide.h b/include/linux/ide.h index c2274ad44b2e..fee07a7edb19 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -970,6 +970,8 @@ void ide_pktcmd_tf_load(ide_drive_t *, u32, u16, u8); ide_startstop_t ide_transfer_pc(ide_drive_t *, struct ide_atapi_pc *, ide_handler_t *, unsigned int, ide_expiry_t *); +ide_startstop_t ide_issue_pc(ide_drive_t *, struct ide_atapi_pc *, + ide_handler_t *, unsigned int, ide_expiry_t *); ide_startstop_t do_rw_taskfile(ide_drive_t *, ide_task_t *); -- cgit v1.2.3 From 646c0cb6c430f8d3ad3769dd1518fe664ff0ce27 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 15 Jul 2008 21:22:03 +0200 Subject: ide: add ide_pc_intr() helper * ide-tape.c: add 'drive' argument to idetape_update_buffers(). * Add generic ide_pc_intr() helper to ide-atapi.c and then convert ide-{floppy,tape,scsi} device drivers to use it. * ide-tape.c: remove no longer needed DBG_PC_INTR. There should be no functional changes caused by this patch (unless the debugging is explicitely compiled in). Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-atapi.c | 177 +++++++++++++++++++++++++++++++++++++++++++++++ drivers/ide/ide-floppy.c | 128 +--------------------------------- drivers/ide/ide-tape.c | 132 ++--------------------------------- drivers/scsi/ide-scsi.c | 115 +----------------------------- include/linux/ide.h | 6 ++ 5 files changed, 195 insertions(+), 363 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 932a83abaf06..2802031de670 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -5,6 +5,183 @@ #include #include #include +#include + +#ifdef DEBUG +#define debug_log(fmt, args...) \ + printk(KERN_INFO "ide: " fmt, ## args) +#else +#define debug_log(fmt, args...) 
do {} while (0) +#endif + +/* TODO: unify the code thus making some arguments go away */ +ide_startstop_t ide_pc_intr(ide_drive_t *drive, struct ide_atapi_pc *pc, + ide_handler_t *handler, unsigned int timeout, ide_expiry_t *expiry, + void (*update_buffers)(ide_drive_t *, struct ide_atapi_pc *), + void (*retry_pc)(ide_drive_t *), void (*dsc_handle)(ide_drive_t *), + void (*io_buffers)(ide_drive_t *, struct ide_atapi_pc *, unsigned, int)) +{ + ide_hwif_t *hwif = drive->hwif; + xfer_func_t *xferfunc; + unsigned int temp; + u16 bcount; + u8 stat, ireason, scsi = drive->scsi; + + debug_log("Enter %s - interrupt handler\n", __func__); + + if (pc->flags & PC_FLAG_TIMEDOUT) { + pc->callback(drive); + return ide_stopped; + } + + /* Clear the interrupt */ + stat = ide_read_status(drive); + + if (pc->flags & PC_FLAG_DMA_IN_PROGRESS) { + if (hwif->dma_ops->dma_end(drive) || + (drive->media == ide_tape && !scsi && (stat & ERR_STAT))) { + if (drive->media == ide_floppy && !scsi) + printk(KERN_ERR "%s: DMA %s error\n", + drive->name, rq_data_dir(pc->rq) + ? "write" : "read"); + pc->flags |= PC_FLAG_DMA_ERROR; + } else { + pc->xferred = pc->req_xfer; + if (update_buffers) + update_buffers(drive, pc); + } + debug_log("%s: DMA finished\n", drive->name); + } + + /* No more interrupts */ + if ((stat & DRQ_STAT) == 0) { + debug_log("Packet command completed, %d bytes transferred\n", + pc->xferred); + + pc->flags &= ~PC_FLAG_DMA_IN_PROGRESS; + + local_irq_enable_in_hardirq(); + + if (drive->media == ide_tape && !scsi && + (stat & ERR_STAT) && pc->c[0] == REQUEST_SENSE) + stat &= ~ERR_STAT; + if ((stat & ERR_STAT) || (pc->flags & PC_FLAG_DMA_ERROR)) { + /* Error detected */ + debug_log("%s: I/O error\n", drive->name); + + if (drive->media != ide_tape || scsi) { + pc->rq->errors++; + if (scsi) + goto cmd_finished; + } + + if (pc->c[0] == REQUEST_SENSE) { + printk(KERN_ERR "%s: I/O error in request sense" + " command\n", drive->name); + return ide_do_reset(drive); + } + + debug_log("[cmd %x]: check condition\n", pc->c[0]); + + /* Retry operation */ + retry_pc(drive); + /* queued, but not started */ + return ide_stopped; + } +cmd_finished: + pc->error = 0; + if ((pc->flags & PC_FLAG_WAIT_FOR_DSC) && + (stat & SEEK_STAT) == 0) { + dsc_handle(drive); + return ide_stopped; + } + /* Command finished - Call the callback function */ + pc->callback(drive); + return ide_stopped; + } + + if (pc->flags & PC_FLAG_DMA_IN_PROGRESS) { + pc->flags &= ~PC_FLAG_DMA_IN_PROGRESS; + printk(KERN_ERR "%s: The device wants to issue more interrupts " + "in DMA mode\n", drive->name); + ide_dma_off(drive); + return ide_do_reset(drive); + } + /* Get the number of bytes to transfer on this interrupt. */ + bcount = (hwif->INB(hwif->io_ports.lbah_addr) << 8) | + hwif->INB(hwif->io_ports.lbam_addr); + + ireason = hwif->INB(hwif->io_ports.nsect_addr); + + if (ireason & CD) { + printk(KERN_ERR "%s: CoD != 0 in %s\n", drive->name, __func__); + return ide_do_reset(drive); + } + if (((ireason & IO) == IO) == !!(pc->flags & PC_FLAG_WRITING)) { + /* Hopefully, we will never get here */ + printk(KERN_ERR "%s: We wanted to %s, but the device wants us " + "to %s!\n", drive->name, + (ireason & IO) ? "Write" : "Read", + (ireason & IO) ? 
"Read" : "Write"); + return ide_do_reset(drive); + } + if (!(pc->flags & PC_FLAG_WRITING)) { + /* Reading - Check that we have enough space */ + temp = pc->xferred + bcount; + if (temp > pc->req_xfer) { + if (temp > pc->buf_size) { + printk(KERN_ERR "%s: The device wants to send " + "us more data than expected - " + "discarding data\n", + drive->name); + if (scsi) + temp = pc->buf_size - pc->xferred; + else + temp = 0; + if (temp) { + if (pc->sg) + io_buffers(drive, pc, temp, 0); + else + hwif->input_data(drive, NULL, + pc->cur_pos, temp); + printk(KERN_ERR "%s: transferred %d of " + "%d bytes\n", + drive->name, + temp, bcount); + } + pc->xferred += temp; + pc->cur_pos += temp; + ide_pad_transfer(drive, 0, bcount - temp); + ide_set_handler(drive, handler, timeout, + expiry); + return ide_started; + } + debug_log("The device wants to send us more data than " + "expected - allowing transfer\n"); + } + xferfunc = hwif->input_data; + } else + xferfunc = hwif->output_data; + + if ((drive->media == ide_floppy && !scsi && !pc->buf) || + (drive->media == ide_tape && !scsi && pc->bh) || + (scsi && pc->sg)) + io_buffers(drive, pc, bcount, !!(pc->flags & PC_FLAG_WRITING)); + else + xferfunc(drive, NULL, pc->cur_pos, bcount); + + /* Update the current position */ + pc->xferred += bcount; + pc->cur_pos += bcount; + + debug_log("[cmd %x] transferred %d bytes on that intr.\n", + pc->c[0], bcount); + + /* And set the interrupt handler again */ + ide_set_handler(drive, handler, timeout, expiry); + return ide_started; +} +EXPORT_SYMBOL_GPL(ide_pc_intr); static u8 ide_wait_ireason(ide_drive_t *drive, u8 ireason) { diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index 70aef97fb8bc..0f3602a5efb0 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -388,132 +388,10 @@ static void idefloppy_retry_pc(ide_drive_t *drive) static ide_startstop_t idefloppy_pc_intr(ide_drive_t *drive) { idefloppy_floppy_t *floppy = drive->driver_data; - ide_hwif_t *hwif = drive->hwif; - struct ide_atapi_pc *pc = floppy->pc; - struct request *rq = pc->rq; - xfer_func_t *xferfunc; - unsigned int temp; - int dma_error = 0; - u16 bcount; - u8 stat, ireason; - - debug_log("Enter %s - interrupt handler\n", __func__); - - /* Clear the interrupt */ - stat = ide_read_status(drive); - - if (pc->flags & PC_FLAG_DMA_IN_PROGRESS) { - dma_error = hwif->dma_ops->dma_end(drive); - if (dma_error) { - printk(KERN_ERR "%s: DMA %s error\n", drive->name, - rq_data_dir(rq) ? 
"write" : "read"); - pc->flags |= PC_FLAG_DMA_ERROR; - } else { - pc->xferred = pc->req_xfer; - idefloppy_update_buffers(drive, pc); - } - debug_log("%s: DMA finished\n", drive->name); - } - - /* No more interrupts */ - if ((stat & DRQ_STAT) == 0) { - debug_log("Packet command completed, %d bytes transferred\n", - pc->xferred); - pc->flags &= ~PC_FLAG_DMA_IN_PROGRESS; - - local_irq_enable_in_hardirq(); - - if ((stat & ERR_STAT) || (pc->flags & PC_FLAG_DMA_ERROR)) { - /* Error detected */ - debug_log("%s: I/O error\n", drive->name); - rq->errors++; - if (pc->c[0] == GPCMD_REQUEST_SENSE) { - printk(KERN_ERR "%s: I/O error in request sense" - " command\n", drive->name); - return ide_do_reset(drive); - } - - debug_log("[cmd %x]: check condition\n", pc->c[0]); - - /* Retry operation */ - idefloppy_retry_pc(drive); - /* queued, but not started */ - return ide_stopped; - } - pc->error = 0; - /* Command finished - Call the callback function */ - pc->callback(drive); - return ide_stopped; - } - - if (pc->flags & PC_FLAG_DMA_IN_PROGRESS) { - pc->flags &= ~PC_FLAG_DMA_IN_PROGRESS; - printk(KERN_ERR "%s: The device wants to issue more interrupts " - "in DMA mode\n", drive->name); - ide_dma_off(drive); - return ide_do_reset(drive); - } - - /* Get the number of bytes to transfer */ - bcount = (hwif->INB(hwif->io_ports.lbah_addr) << 8) | - hwif->INB(hwif->io_ports.lbam_addr); - /* on this interrupt */ - ireason = hwif->INB(hwif->io_ports.nsect_addr); - - if (ireason & CD) { - printk(KERN_ERR "%s: CoD != 0 in %s\n", drive->name, __func__); - return ide_do_reset(drive); - } - if (((ireason & IO) == IO) == !!(pc->flags & PC_FLAG_WRITING)) { - /* Hopefully, we will never get here */ - printk(KERN_ERR "%s: We wanted to %s, but the device wants us " - "to %s!\n", drive->name, - (ireason & IO) ? "Write" : "Read", - (ireason & IO) ? 
"Read" : "Write"); - return ide_do_reset(drive); - } - if (!(pc->flags & PC_FLAG_WRITING)) { - /* Reading - Check that we have enough space */ - temp = pc->xferred + bcount; - if (temp > pc->req_xfer) { - if (temp > pc->buf_size) { - printk(KERN_ERR "%s: The device wants to send " - "us more data than expected - " - "discarding data\n", - drive->name); - ide_pad_transfer(drive, 0, bcount); - - ide_set_handler(drive, - &idefloppy_pc_intr, - IDEFLOPPY_WAIT_CMD, - NULL); - return ide_started; - } - debug_log("The device wants to send us more data than " - "expected - allowing transfer\n"); - } - } - if (pc->flags & PC_FLAG_WRITING) - xferfunc = hwif->output_data; - else - xferfunc = hwif->input_data; - - if (pc->buf) - xferfunc(drive, NULL, pc->cur_pos, bcount); - else - ide_floppy_io_buffers(drive, pc, bcount, - !!(pc->flags & PC_FLAG_WRITING)); - - /* Update the current position */ - pc->xferred += bcount; - pc->cur_pos += bcount; - - debug_log("[cmd %x] transferred %d bytes on that intr.\n", - pc->c[0], bcount); - /* And set the interrupt handler again */ - ide_set_handler(drive, &idefloppy_pc_intr, IDEFLOPPY_WAIT_CMD, NULL); - return ide_started; + return ide_pc_intr(drive, floppy->pc, idefloppy_pc_intr, + IDEFLOPPY_WAIT_CMD, NULL, idefloppy_update_buffers, + idefloppy_retry_pc, NULL, ide_floppy_io_buffers); } /* diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 10f2d3336286..0afa109ec99a 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -56,8 +56,6 @@ enum { DBG_PROCS = (1 << 3), /* buffer alloc info (pc_stack & rq_stack) */ DBG_PCRQ_STACK = (1 << 4), - /* IRQ handler (always log debug info if debugging is on) */ - DBG_PC_INTR = (1 << 5), }; /* define to see debug info */ @@ -66,7 +64,7 @@ enum { #if IDETAPE_DEBUG_LOG #define debug_log(lvl, fmt, args...) 
\ { \ - if ((lvl & DBG_PC_INTR) || (tape->debug_mask & lvl)) \ + if (tape->debug_mask & lvl) \ printk(KERN_INFO "ide-tape: " fmt, ## args); \ } #else @@ -441,7 +439,7 @@ static void idetape_output_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc, } } -static void idetape_update_buffers(struct ide_atapi_pc *pc) +static void idetape_update_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc) { struct idetape_bh *bh = pc->bh; int count; @@ -526,7 +524,7 @@ static void idetape_analyze_error(ide_drive_t *drive, u8 *sense) pc->xferred = pc->req_xfer - tape->blk_size * get_unaligned_be32(&sense[3]); - idetape_update_buffers(pc); + idetape_update_buffers(drive, pc); } /* @@ -800,129 +798,11 @@ static void ide_tape_io_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc, */ static ide_startstop_t idetape_pc_intr(ide_drive_t *drive) { - ide_hwif_t *hwif = drive->hwif; idetape_tape_t *tape = drive->driver_data; - struct ide_atapi_pc *pc = tape->pc; - xfer_func_t *xferfunc; - unsigned int temp; - u16 bcount; - u8 stat, ireason; - - debug_log(DBG_PC_INTR, "Enter %s - interrupt handler\n", __func__); - - /* Clear the interrupt */ - stat = ide_read_status(drive); - - if (pc->flags & PC_FLAG_DMA_IN_PROGRESS) { - if (hwif->dma_ops->dma_end(drive) || (stat & ERR_STAT)) { - pc->flags |= PC_FLAG_DMA_ERROR; - } else { - pc->xferred = pc->req_xfer; - idetape_update_buffers(pc); - } - debug_log(DBG_PC_INTR, "%s: DMA finished\n", drive->name); - } - - /* No more interrupts */ - if ((stat & DRQ_STAT) == 0) { - debug_log(DBG_PC_INTR, "Packet command completed, %d bytes" - " transferred\n", pc->xferred); - - pc->flags &= ~PC_FLAG_DMA_IN_PROGRESS; - local_irq_enable_in_hardirq(); - - if ((stat & ERR_STAT) && pc->c[0] == REQUEST_SENSE) - stat &= ~ERR_STAT; - if ((stat & ERR_STAT) || (pc->flags & PC_FLAG_DMA_ERROR)) { - /* Error detected */ - debug_log(DBG_PC_INTR, "%s: I/O error\n", drive->name); - - if (pc->c[0] == REQUEST_SENSE) { - printk(KERN_ERR "%s: I/O error in request sense" - " command\n", drive->name); - return ide_do_reset(drive); - } - debug_log(DBG_PC_INTR, "[cmd %x]: check condition\n", - pc->c[0]); - - /* Retry operation */ - idetape_retry_pc(drive); - return ide_stopped; - } - pc->error = 0; - if ((pc->flags & PC_FLAG_WAIT_FOR_DSC) && - (stat & SEEK_STAT) == 0) { - ide_tape_handle_dsc(drive); - return ide_stopped; - } - /* Command finished - Call the callback function */ - pc->callback(drive); - return ide_stopped; - } - - if (pc->flags & PC_FLAG_DMA_IN_PROGRESS) { - pc->flags &= ~PC_FLAG_DMA_IN_PROGRESS; - printk(KERN_ERR "%s: The device wants to issue more interrupts " - "in DMA mode\n", drive->name); - ide_dma_off(drive); - return ide_do_reset(drive); - } - /* Get the number of bytes to transfer on this interrupt. */ - bcount = (hwif->INB(hwif->io_ports.lbah_addr) << 8) | - hwif->INB(hwif->io_ports.lbam_addr); - - ireason = hwif->INB(hwif->io_ports.nsect_addr); - - if (ireason & CD) { - printk(KERN_ERR "%s: CoD != 0 in %s\n", drive->name, __func__); - return ide_do_reset(drive); - } - if (((ireason & IO) == IO) == !!(pc->flags & PC_FLAG_WRITING)) { - /* Hopefully, we will never get here */ - printk(KERN_ERR "%s: We wanted to %s, but the device wants us " - "to %s!\n", drive->name, - (ireason & IO) ? "Write" : "Read", - (ireason & IO) ? 
"Read" : "Write"); - return ide_do_reset(drive); - } - if (!(pc->flags & PC_FLAG_WRITING)) { - /* Reading - Check that we have enough space */ - temp = pc->xferred + bcount; - if (temp > pc->req_xfer) { - if (temp > pc->buf_size) { - printk(KERN_ERR "%s: The device wants to send " - "us more data than expected - " - "discarding data\n", - drive->name); - ide_pad_transfer(drive, 0, bcount); - ide_set_handler(drive, &idetape_pc_intr, - IDETAPE_WAIT_CMD, NULL); - return ide_started; - } - debug_log(DBG_PC_INTR, "The device wants to send us more " - "data than expected - allowing transfer\n"); - } - xferfunc = hwif->input_data; - } else { - xferfunc = hwif->output_data; - } - - if (pc->bh) - ide_tape_io_buffers(drive, pc, bcount, - !!(pc->flags & PC_FLAG_WRITING)); - else - xferfunc(drive, NULL, pc->cur_pos, bcount); - - /* Update the current position */ - pc->xferred += bcount; - pc->cur_pos += bcount; - - debug_log(DBG_PC_INTR, "[cmd %x] transferred %d bytes on that intr.\n", - pc->c[0], bcount); - /* And set the interrupt handler again */ - ide_set_handler(drive, &idetape_pc_intr, IDETAPE_WAIT_CMD, NULL); - return ide_started; + return ide_pc_intr(drive, tape->pc, idetape_pc_intr, IDETAPE_WAIT_CMD, + NULL, idetape_update_buffers, idetape_retry_pc, + ide_tape_handle_dsc, ide_tape_io_buffers); } /* diff --git a/drivers/scsi/ide-scsi.c b/drivers/scsi/ide-scsi.c index ada733ca6725..683bce375c74 100644 --- a/drivers/scsi/ide-scsi.c +++ b/drivers/scsi/ide-scsi.c @@ -356,120 +356,11 @@ static int idescsi_expiry(ide_drive_t *drive) static ide_startstop_t idescsi_pc_intr (ide_drive_t *drive) { idescsi_scsi_t *scsi = drive_to_idescsi(drive); - ide_hwif_t *hwif = drive->hwif; struct ide_atapi_pc *pc = scsi->pc; - struct request *rq = pc->rq; - xfer_func_t *xferfunc; - unsigned int temp; - u16 bcount; - u8 stat, ireason; - - debug_log("Enter %s - interrupt handler\n", __func__); - - if (pc->flags & PC_FLAG_TIMEDOUT) { - pc->callback(drive); - return ide_stopped; - } - - /* Clear the interrupt */ - stat = ide_read_status(drive); - - if (pc->flags & PC_FLAG_DMA_IN_PROGRESS) { - if (hwif->dma_ops->dma_end(drive)) - pc->flags |= PC_FLAG_DMA_ERROR; - else - pc->xferred = pc->req_xfer; - debug_log("%s: DMA finished\n", drive->name); - } - - if ((stat & DRQ_STAT) == 0) { - /* No more interrupts */ - debug_log("Packet command completed, %d bytes transferred\n", - pc->xferred); - pc->flags &= ~PC_FLAG_DMA_IN_PROGRESS; - local_irq_enable_in_hardirq(); - if ((stat & ERR_STAT) || (pc->flags & PC_FLAG_DMA_ERROR)) { - /* Error detected */ - debug_log("%s: I/O error\n", drive->name); - - rq->errors++; - } - pc->callback(drive); - return ide_stopped; - } - if (pc->flags & PC_FLAG_DMA_IN_PROGRESS) { - pc->flags &= ~PC_FLAG_DMA_IN_PROGRESS; - printk(KERN_ERR "%s: The device wants to issue more interrupts " - "in DMA mode\n", drive->name); - ide_dma_off(drive); - return ide_do_reset(drive); - } - bcount = (hwif->INB(hwif->io_ports.lbah_addr) << 8) | - hwif->INB(hwif->io_ports.lbam_addr); - ireason = hwif->INB(hwif->io_ports.nsect_addr); - - if (ireason & CD) { - printk(KERN_ERR "%s: CoD != 0 in %s\n", drive->name, __func__); - return ide_do_reset (drive); - } - if (((ireason & IO) == IO) == !!(pc->flags & PC_FLAG_WRITING)) { - /* Hopefully, we will never get here */ - printk(KERN_ERR "%s: We wanted to %s, but the device wants us " - "to %s!\n", drive->name, - (ireason & IO) ? "Write" : "Read", - (ireason & IO) ? 
"Read" : "Write"); - return ide_do_reset(drive); - } - if (!(pc->flags & PC_FLAG_WRITING)) { - temp = pc->xferred + bcount; - if (temp > pc->req_xfer) { - if (temp > pc->buf_size) { - printk(KERN_ERR "%s: The device wants to send " - "us more data than expected - " - "discarding data\n", - drive->name); - temp = pc->buf_size - pc->xferred; - if (temp) { - if (pc->sg) - ide_scsi_io_buffers(drive, pc, - temp, 0); - else - hwif->input_data(drive, NULL, - pc->cur_pos, temp); - printk(KERN_ERR "%s: transferred %d of " - "%d bytes\n", - drive->name, - temp, bcount); - } - pc->xferred += temp; - pc->cur_pos += temp; - ide_pad_transfer(drive, 0, bcount - temp); - ide_set_handler(drive, &idescsi_pc_intr, get_timeout(pc), idescsi_expiry); - return ide_started; - } - debug_log("The device wants to send us more data than " - "expected - allowing transfer\n"); - } - xferfunc = hwif->input_data; - } else - xferfunc = hwif->output_data; - - if (pc->sg) - ide_scsi_io_buffers(drive, pc, bcount, - !!(pc->flags & PC_FLAG_WRITING)); - else - xferfunc(drive, NULL, pc->cur_pos, bcount); - - /* Update the current position */ - pc->xferred += bcount; - pc->cur_pos += bcount; - - debug_log("[cmd %x] transferred %d bytes on that intr.\n", - pc->c[0], bcount); - /* And set the interrupt handler again */ - ide_set_handler(drive, &idescsi_pc_intr, get_timeout(pc), idescsi_expiry); - return ide_started; + return ide_pc_intr(drive, pc, idescsi_pc_intr, get_timeout(pc), + idescsi_expiry, NULL, NULL, NULL, + ide_scsi_io_buffers); } static ide_startstop_t idescsi_transfer_pc(ide_drive_t *drive) diff --git a/include/linux/ide.h b/include/linux/ide.h index fee07a7edb19..ac4eeb2932ef 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -968,6 +968,12 @@ extern int drive_is_ready(ide_drive_t *); void ide_pktcmd_tf_load(ide_drive_t *, u32, u16, u8); +ide_startstop_t ide_pc_intr(ide_drive_t *drive, struct ide_atapi_pc *pc, + ide_handler_t *handler, unsigned int timeout, ide_expiry_t *expiry, + void (*update_buffers)(ide_drive_t *, struct ide_atapi_pc *), + void (*retry_pc)(ide_drive_t *), void (*dsc_handle)(ide_drive_t *), + void (*io_buffers)(ide_drive_t *, struct ide_atapi_pc *, unsigned int, + int)); ide_startstop_t ide_transfer_pc(ide_drive_t *, struct ide_atapi_pc *, ide_handler_t *, unsigned int, ide_expiry_t *); ide_startstop_t ide_issue_pc(ide_drive_t *, struct ide_atapi_pc *, -- cgit v1.2.3