From 0b7018aae7e1798f55f736b9a77c201708aa0e33 Mon Sep 17 00:00:00 2001
From: Imre Deak <imre.deak@nokia.com>
Date: Tue, 11 Apr 2006 23:42:03 -0400
Subject: Input: ads7846 - debouncing and rudimentary sample filtering
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some touchscreens seem to oscillate heavily for a while after touching
the screen.  Implement support for sampling the screen until we get two
consecutive values that are close enough.

Signed-off-by: Imre Deak <imre.deak@nokia.com>
Signed-off-by: Juha Yrjola <juha.yrjola@nokia.com>
Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 include/linux/spi/ads7846.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/spi/ads7846.h b/include/linux/spi/ads7846.h
index 72261e0f2ac1..3f7664951256 100644
--- a/include/linux/spi/ads7846.h
+++ b/include/linux/spi/ads7846.h
@@ -14,5 +14,8 @@ struct ads7846_platform_data {
 	u16	x_min, x_max;
 	u16	y_min, y_max;
 	u16	pressure_min, pressure_max;
+
+	u16	debounce_max;		/* max number of readings per sample */
+	u16	debounce_tol;		/* tolerance used for filtering */
 };
 
-- 
cgit v1.2.3


From c9e617a563ad646239270fa2222cdb06966cf1fa Mon Sep 17 00:00:00 2001
From: Imre Deak <imre.deak@nokia.com>
Date: Tue, 11 Apr 2006 23:44:05 -0400
Subject: Input: ads7846 - handle IRQs that were latched during disabled IRQs

The pen down IRQ will toggle during each X,Y,Z measurement cycle.
Even though the IRQ is disabled it will be latched and delivered
when after enable_irq. Thus in the IRQ handler we must avoid
starting a new measurement cycle when such an "unwanted" IRQ happens.
Add a get_pendown_state platform function, which will probably
determine this by reading the current GPIO level of the pen IRQ pin.

Move the IRQ reenabling from the SPI RX function to the timer. After
the last power down message the pen IRQ pin is still active for a
while and get_pendown_state would report incorrectly a pen down state.

When suspending we should check the ts->pending flag instead of
ts->pendown, since the timer can be pending regardless of ts->pendown.
Also if ts->pending is set we can be sure that the timer is running,
so no need to rearm it. Similarly if ts->pending is not set we can
be sure that the IRQ is enabled (and the timer is not).

Signed-off-by: Imre Deak <imre.deak@nokia.com>
Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/input/touchscreen/ads7846.c | 82 ++++++++++++++++++++++---------------
 include/linux/spi/ads7846.h         |  2 +
 2 files changed, 50 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/touchscreen/ads7846.c b/drivers/input/touchscreen/ads7846.c
index fec3b9b22309..e7cabf12c8dc 100644
--- a/drivers/input/touchscreen/ads7846.c
+++ b/drivers/input/touchscreen/ads7846.c
@@ -102,6 +102,8 @@ struct ads7846 {
 // FIXME remove "irq_disabled"
 	unsigned		irq_disabled:1;	/* P: lock */
 	unsigned		disabled:1;
+
+	int			(*get_pendown_state)(void);
 };
 
 /* leave chip selected when we're done, for quicker re-select? */
@@ -175,6 +177,12 @@ struct ser_req {
 static void ads7846_enable(struct ads7846 *ts);
 static void ads7846_disable(struct ads7846 *ts);
 
+static int device_suspended(struct device *dev)
+{
+	struct ads7846 *ts = dev_get_drvdata(dev);
+	return dev->power.power_state.event != PM_EVENT_ON || ts->disabled;
+}
+
 static int ads7846_read12_ser(struct device *dev, unsigned command)
 {
 	struct spi_device	*spi = to_spi_device(dev);
@@ -227,8 +235,10 @@ static int ads7846_read12_ser(struct device *dev, unsigned command)
 	for (i = 0; i < 6; i++)
 		spi_message_add_tail(&req->xfer[i], &req->msg);
 
+	ts->irq_disabled = 1;
 	disable_irq(spi->irq);
 	status = spi_sync(spi, &req->msg);
+	ts->irq_disabled = 0;
 	enable_irq(spi->irq);
 
 	if (req->msg.status)
@@ -333,7 +343,7 @@ static void ads7846_rx(void *ads)
 	if (x == MAX_12BIT)
 		x = 0;
 
-	if (x && z1 && ts->spi->dev.power.power_state.event == PM_EVENT_ON) {
+	if (likely(x && z1 && !device_suspended(&ts->spi->dev))) {
 		/* compute touch pressure resistance using equation #2 */
 		Rt = z2;
 		Rt -= z1;
@@ -377,20 +387,10 @@ static void ads7846_rx(void *ads)
 			x, y, Rt, Rt ? "" : " UP");
 #endif
 
-	/* don't retrigger while we're suspended */
 	spin_lock_irqsave(&ts->lock, flags);
 
 	ts->pendown = (Rt != 0);
-	ts->pending = 0;
-
-	if (ts->spi->dev.power.power_state.event == PM_EVENT_ON) {
-		if (ts->pendown)
-			mod_timer(&ts->timer, jiffies + TS_POLL_PERIOD);
-		else if (ts->irq_disabled) {
-			ts->irq_disabled = 0;
-			enable_irq(ts->spi->irq);
-		}
-	}
+	mod_timer(&ts->timer, jiffies + TS_POLL_PERIOD);
 
 	spin_unlock_irqrestore(&ts->lock, flags);
 }
@@ -431,10 +431,25 @@ static void ads7846_timer(unsigned long handle)
 	struct ads7846	*ts = (void *)handle;
 	int		status = 0;
 
-	ts->msg_idx = 0;
-	status = spi_async(ts->spi, &ts->msg[0]);
-	if (status)
-		dev_err(&ts->spi->dev, "spi_async --> %d\n", status);
+	spin_lock_irq(&ts->lock);
+
+	if (unlikely(ts->msg_idx && !ts->pendown)) {
+		/* measurment cycle ended */
+		if (!device_suspended(&ts->spi->dev)) {
+			ts->irq_disabled = 0;
+			enable_irq(ts->spi->irq);
+		}
+		ts->pending = 0;
+		ts->msg_idx = 0;
+	} else {
+		/* pen is still down, continue with the measurement */
+		ts->msg_idx = 0;
+		status = spi_async(ts->spi, &ts->msg[0]);
+		if (status)
+			dev_err(&ts->spi->dev, "spi_async --> %d\n", status);
+	}
+
+	spin_unlock_irq(&ts->lock);
 }
 
 static irqreturn_t ads7846_irq(int irq, void *handle, struct pt_regs *regs)
@@ -443,7 +458,7 @@ static irqreturn_t ads7846_irq(int irq, void *handle, struct pt_regs *regs)
 	unsigned long flags;
 
 	spin_lock_irqsave(&ts->lock, flags);
-	if (likely(!ts->irq_disabled && !ts->disabled)) {
+	if (likely(ts->get_pendown_state())) {
 		if (!ts->irq_disabled) {
 			/* REVISIT irq logic for many ARM chips has cloned a
 			 * bug wherein disabling an irq in its handler won't
@@ -452,10 +467,7 @@ static irqreturn_t ads7846_irq(int irq, void *handle, struct pt_regs *regs)
 			 * that state here.
 			 */
 			ts->irq_disabled = 1;
-
 			disable_irq(ts->spi->irq);
-		}
-		if (!ts->pending) {
 			ts->pending = 1;
 			mod_timer(&ts->timer, jiffies);
 		}
@@ -473,20 +485,17 @@ static void ads7846_disable(struct ads7846 *ts)
 	if (ts->disabled)
 		return;
 
+	ts->disabled = 1;
+
 	/* are we waiting for IRQ, or polling? */
-	if (!ts->pendown) {
-		if (!ts->irq_disabled) {
-			ts->irq_disabled = 1;
-			disable_irq(ts->spi->irq);
-		}
+	if (!ts->pending) {
+		ts->irq_disabled = 1;
+		disable_irq(ts->spi->irq);
 	} else {
-		/* polling; force a final SPI completion;
-		 * that will clean things up neatly
+		/* the timer will run at least once more, and
+		 * leave everything in a clean state, IRQ disabled
 		 */
-		if (!ts->pending)
-			mod_timer(&ts->timer, jiffies);
-
-		while (ts->pendown || ts->pending) {
+		while (ts->pending) {
 			spin_unlock_irq(&ts->lock);
 			msleep(1);
 			spin_lock_irq(&ts->lock);
@@ -497,7 +506,6 @@ static void ads7846_disable(struct ads7846 *ts)
 	 * leave it that way after every request
 	 */
 
-	ts->disabled = 1;
 }
 
 /* Must be called with ts->lock held */
@@ -566,6 +574,11 @@ static int __devinit ads7846_probe(struct spi_device *spi)
 		return -EINVAL;
 	}
 
+	if (pdata->get_pendown_state == NULL) {
+		dev_dbg(&spi->dev, "no get_pendown_state function?\n");
+		return -EINVAL;
+	}
+
 	/* We'd set the wordsize to 12 bits ... except that some controllers
 	 * will then treat the 8 bit command words as 12 bits (and drop the
 	 * four MSBs of the 12 bit result).  Result: inputs must be shifted
@@ -596,6 +609,7 @@ static int __devinit ads7846_probe(struct spi_device *spi)
 	ts->x_plate_ohms = pdata->x_plate_ohms ? : 400;
 	ts->debounce_max = pdata->debounce_max ? : 1;
 	ts->debounce_tol = pdata->debounce_tol ? : 10;
+	ts->get_pendown_state = pdata->get_pendown_state;
 
 	snprintf(ts->phys, sizeof(ts->phys), "%s/input0", spi->dev.bus_id);
 
@@ -786,8 +800,8 @@ static int __devexit ads7846_remove(struct spi_device *spi)
 	device_remove_file(&spi->dev, &dev_attr_vaux);
 
 	free_irq(ts->spi->irq, ts);
-	if (ts->irq_disabled)
-		enable_irq(ts->spi->irq);
+	/* suspend left the IRQ disabled */
+	enable_irq(ts->spi->irq);
 
 	kfree(ts);
 
diff --git a/include/linux/spi/ads7846.h b/include/linux/spi/ads7846.h
index 3f7664951256..d8823e237e0b 100644
--- a/include/linux/spi/ads7846.h
+++ b/include/linux/spi/ads7846.h
@@ -17,5 +17,7 @@ struct ads7846_platform_data {
 
 	u16	debounce_max;		/* max number of readings per sample */
 	u16	debounce_tol;		/* tolerance used for filtering */
+
+	int	(*get_pendown_state)(void);
 };
 
-- 
cgit v1.2.3


From 13626a887fad4220bc7ca85f4b42ca8cfb805e11 Mon Sep 17 00:00:00 2001
From: Ralf Baechle <ralf@linux-mips.org>
Date: Sun, 2 Apr 2006 13:17:58 +0100
Subject: [MIPS] MV6434x: The name of the CPP symbol is __mips__, not __MIPS__.

Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
---
 include/linux/mv643xx.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mv643xx.h b/include/linux/mv643xx.h
index 955d3069d727..edfa012fad3a 100644
--- a/include/linux/mv643xx.h
+++ b/include/linux/mv643xx.h
@@ -13,7 +13,7 @@
 #ifndef __ASM_MV643XX_H
 #define __ASM_MV643XX_H
 
-#ifdef __MIPS__
+#ifdef __mips__
 #include <asm/addrspace.h>
 #include <asm/marvell.h>
 #endif
-- 
cgit v1.2.3


From 5e85d4abe3f43bb5362f384bab0e20ef082ce0b5 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 18 Apr 2006 22:20:16 -0700
Subject: [PATCH] task: Make task list manipulations RCU safe

While we can currently walk through thread groups, process groups, and
sessions with just the rcu_read_lock, this opens the door to walking the
entire task list.

We already have all of the other RCU guarantees so there is no cost in
doing this, this should be enough so that proc can stop taking the
tasklist lock during readdir.

prev_task was killed because it has no users, and using it will miss new
tasks when doing an rcu traversal.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c             | 2 +-
 include/linux/sched.h | 3 +--
 kernel/exit.c         | 2 +-
 kernel/fork.c         | 2 +-
 4 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/fs/exec.c b/fs/exec.c
index 4121bb559739..3a79d97ac234 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -712,7 +712,7 @@ static int de_thread(struct task_struct *tsk)
 		attach_pid(current, PIDTYPE_PID,  current->pid);
 		attach_pid(current, PIDTYPE_PGID, current->signal->pgrp);
 		attach_pid(current, PIDTYPE_SID,  current->signal->session);
-		list_add_tail(&current->tasks, &init_task.tasks);
+		list_add_tail_rcu(&current->tasks, &init_task.tasks);
 
 		current->group_leader = current;
 		leader->group_leader = current;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b7d31e2e1729..29b7d4f87d20 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1192,8 +1192,7 @@ extern void wait_task_inactive(task_t * p);
 #define remove_parent(p)	list_del_init(&(p)->sibling)
 #define add_parent(p)		list_add_tail(&(p)->sibling,&(p)->parent->children)
 
-#define next_task(p)	list_entry((p)->tasks.next, struct task_struct, tasks)
-#define prev_task(p)	list_entry((p)->tasks.prev, struct task_struct, tasks)
+#define next_task(p)	list_entry(rcu_dereference((p)->tasks.next), struct task_struct, tasks)
 
 #define for_each_process(p) \
 	for (p = &init_task ; (p = next_task(p)) != &init_task ; )
diff --git a/kernel/exit.c b/kernel/exit.c
index 1a9787ac6173..f86434d7b3d1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -56,7 +56,7 @@ static void __unhash_process(struct task_struct *p)
 		detach_pid(p, PIDTYPE_PGID);
 		detach_pid(p, PIDTYPE_SID);
 
-		list_del_init(&p->tasks);
+		list_del_rcu(&p->tasks);
 		__get_cpu_var(process_counts)--;
 	}
 	list_del_rcu(&p->thread_group);
diff --git a/kernel/fork.c b/kernel/fork.c
index 54b15f8cda53..34515772611e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1204,7 +1204,7 @@ static task_t *copy_process(unsigned long clone_flags,
 			attach_pid(p, PIDTYPE_PGID, process_group(p));
 			attach_pid(p, PIDTYPE_SID, p->signal->session);
 
-			list_add_tail(&p->tasks, &init_task.tasks);
+			list_add_tail_rcu(&p->tasks, &init_task.tasks);
 			__get_cpu_var(process_counts)++;
 		}
 		attach_pid(p, PIDTYPE_PID, p->pid);
-- 
cgit v1.2.3


From 6e89280184e4990f5ea80d2504af89b6099523c4 Mon Sep 17 00:00:00 2001
From: Anatoli Antonovitch <antonovi@ati.com>
Date: Tue, 18 Apr 2006 22:22:05 -0700
Subject: [PATCH] ide: ATI SB600 IDE support

Add support for the IDE device on ATI SB600

Signed-off-by: Felix Kuehling <fkuehlin@ati.com>
Acked-by: Bartlomiej Zolnierkiewicz <B.Zolnierkiewicz@elka.pw.edu.pl>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Acked-by: Jeff Garzik <jeff@garzik.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/ide/pci/atiixp.c | 1 +
 include/linux/pci_ids.h  | 4 ++++
 2 files changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/ide/pci/atiixp.c b/drivers/ide/pci/atiixp.c
index df9ee9a78435..900efd1da587 100644
--- a/drivers/ide/pci/atiixp.c
+++ b/drivers/ide/pci/atiixp.c
@@ -348,6 +348,7 @@ static struct pci_device_id atiixp_pci_tbl[] = {
 	{ PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP200_IDE, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
 	{ PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP300_IDE, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
 	{ PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_IDE, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
+	{ PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP600_IDE, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
 	{ 0, },
 };
 MODULE_DEVICE_TABLE(pci, atiixp_pci_tbl);
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 8d03e10212f5..d6fe048376ab 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -356,6 +356,10 @@
 #define PCI_DEVICE_ID_ATI_IXP300_SATA   0x436e
 #define PCI_DEVICE_ID_ATI_IXP400_IDE	0x4376
 #define PCI_DEVICE_ID_ATI_IXP400_SATA   0x4379
+#define PCI_DEVICE_ID_ATI_IXP400_SATA2	0x437a
+#define PCI_DEVICE_ID_ATI_IXP600_SATA	0x4380
+#define PCI_DEVICE_ID_ATI_IXP600_SRAID	0x4381
+#define PCI_DEVICE_ID_ATI_IXP600_IDE	0x438c
 
 #define PCI_VENDOR_ID_VLSI		0x1004
 #define PCI_DEVICE_ID_VLSI_82C592	0x0005
-- 
cgit v1.2.3


From d3a7b202995421631f486313aacf9ab2ad48b2c8 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Tue, 18 Apr 2006 22:22:07 -0700
Subject: [PATCH] remove the obsolete IDEPCI_FLAG_FORCE_PDC

Noted by Sergei Shtylylov <sshtylyov@ru.mvista.com>

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Acked-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/ide/pci/pdc202xx_old.c |  2 --
 drivers/ide/setup-pci.c        | 13 -------------
 include/linux/ide.h            |  1 -
 3 files changed, 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/pci/pdc202xx_old.c b/drivers/ide/pci/pdc202xx_old.c
index 6f8f8645b02c..7ce5bf783688 100644
--- a/drivers/ide/pci/pdc202xx_old.c
+++ b/drivers/ide/pci/pdc202xx_old.c
@@ -798,7 +798,6 @@ static ide_pci_device_t pdc202xx_chipsets[] __devinitdata = {
 		.autodma	= AUTODMA,
 		.bootable	= OFF_BOARD,
 		.extra		= 48,
-		.flags		= IDEPCI_FLAG_FORCE_PDC,
 	},{	/* 2 */
 		.name		= "PDC20263",
 		.init_setup	= init_setup_pdc202ata4,
@@ -819,7 +818,6 @@ static ide_pci_device_t pdc202xx_chipsets[] __devinitdata = {
 		.autodma	= AUTODMA,
 		.bootable	= OFF_BOARD,
 		.extra		= 48,
-		.flags		= IDEPCI_FLAG_FORCE_PDC,
 	},{	/* 4 */
 		.name		= "PDC20267",
 		.init_setup	= init_setup_pdc202xx,
diff --git a/drivers/ide/setup-pci.c b/drivers/ide/setup-pci.c
index 7ebf992e8c2f..462ed3006c30 100644
--- a/drivers/ide/setup-pci.c
+++ b/drivers/ide/setup-pci.c
@@ -580,7 +580,6 @@ void ide_pci_setup_ports(struct pci_dev *dev, ide_pci_device_t *d, int pciirq, a
 	int port;
 	int at_least_one_hwif_enabled = 0;
 	ide_hwif_t *hwif, *mate = NULL;
-	static int secondpdc = 0;
 	u8 tmp;
 
 	index->all = 0xf0f0;
@@ -592,21 +591,9 @@ void ide_pci_setup_ports(struct pci_dev *dev, ide_pci_device_t *d, int pciirq, a
 	for (port = 0; port <= 1; ++port) {
 		ide_pci_enablebit_t *e = &(d->enablebits[port]);
 	
-		/* 
-		 * If this is a Promise FakeRaid controller,
-		 * the 2nd controller will be marked as 
-		 * disabled while it is actually there and enabled
-		 * by the bios for raid purposes. 
-		 * Skip the normal "is it enabled" test for those.
-		 */
-		if ((d->flags & IDEPCI_FLAG_FORCE_PDC) &&
-		    (secondpdc++==1) && (port==1))
-			goto controller_ok;
-			
 		if (e->reg && (pci_read_config_byte(dev, e->reg, &tmp) ||
 		    (tmp & e->mask) != e->val))
 			continue;	/* port not enabled */
-controller_ok:
 
 		if (d->channels	<= port)
 			break;
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 8d2db412ba9c..a8bef1d1371c 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -1220,7 +1220,6 @@ typedef struct ide_pci_enablebit_s {
 enum {
 	/* Uses ISA control ports not PCI ones. */
 	IDEPCI_FLAG_ISA_PORTS		= (1 << 0),
-	IDEPCI_FLAG_FORCE_PDC		= (1 << 1),
 };
 
 typedef struct ide_pci_device_s {
-- 
cgit v1.2.3


From 7866babad542bb5e1dc95deb5800b577abef58dd Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Tue, 18 Apr 2006 13:14:13 -0400
Subject: NFS: fix PROC_FS=n compile error

fs/built-in.o: In function `nfs_show_stats':inode.c:(.text+0x15481a): undefined reference to `rpc_print_iostats'
net/built-in.o: In function `rpc_destroy_client': undefined reference to `rpc_free_iostats'
net/built-in.o: In function `rpc_clone_client': undefined reference to `rpc_alloc_iostats'
net/built-in.o: In function `rpc_new_client': undefined reference to `rpc_alloc_iostats'
net/built-in.o: In function `xprt_release': undefined reference to `rpc_count_iostats'
make: *** [.tmp_vmlinux1] Error 1

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/metrics.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/metrics.h b/include/linux/sunrpc/metrics.h
index 8f96e9dc369a..77f78e56c481 100644
--- a/include/linux/sunrpc/metrics.h
+++ b/include/linux/sunrpc/metrics.h
@@ -69,9 +69,21 @@ struct rpc_clnt;
 /*
  * EXPORTed functions for managing rpc_iostats structures
  */
+
+#ifdef CONFIG_PROC_FS
+
 struct rpc_iostats *	rpc_alloc_iostats(struct rpc_clnt *);
 void			rpc_count_iostats(struct rpc_task *);
 void			rpc_print_iostats(struct seq_file *, struct rpc_clnt *);
 void			rpc_free_iostats(struct rpc_iostats *);
 
+#else  /*  CONFIG_PROC_FS  */
+
+static inline struct rpc_iostats *rpc_alloc_iostats(struct rpc_clnt *clnt) { return NULL; }
+static inline void rpc_count_iostats(struct rpc_task *task) {}
+static inline void rpc_print_iostats(struct seq_file *seq, struct rpc_clnt *clnt) {}
+static inline void rpc_free_iostats(struct rpc_iostats *stats) {}
+
+#endif  /*  CONFIG_PROC_FS  */
+
 #endif /* _LINUX_SUNRPC_METRICS_H */
-- 
cgit v1.2.3


From e99170ff3b799a9fd43d538932a9231fac1de9d4 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 18 Apr 2006 13:21:42 -0400
Subject: NFS,SUNRPC: Fix compiler warnings if CONFIG_PROC_FS & CONFIG_SYSCTL
 are unset

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/direct.c             | 8 +++-----
 fs/nfs/file.c               | 5 ++---
 include/linux/sunrpc/xprt.h | 1 +
 3 files changed, 6 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 0f583cb16ddb..3c72b0c07283 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -112,10 +112,9 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode
  */
 ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs)
 {
-	struct dentry *dentry = iocb->ki_filp->f_dentry;
-
 	dprintk("NFS: nfs_direct_IO (%s) off/no(%Ld/%lu) EINVAL\n",
-			dentry->d_name.name, (long long) pos, nr_segs);
+			iocb->ki_filp->f_dentry->d_name.name,
+			(long long) pos, nr_segs);
 
 	return -EINVAL;
 }
@@ -468,7 +467,6 @@ static const struct rpc_call_ops nfs_commit_direct_ops = {
 static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
 {
 	struct nfs_write_data *data = dreq->commit_data;
-	struct rpc_task *task = &data->task;
 
 	data->inode = dreq->inode;
 	data->cred = dreq->ctx->cred;
@@ -489,7 +487,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
 	/* Note: task.tk_ops->rpc_release will free dreq->commit_data */
 	dreq->commit_data = NULL;
 
-	dprintk("NFS: %5u initiated commit call\n", task->tk_pid);
+	dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
 
 	lock_kernel();
 	rpc_execute(&data->task);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index f1df2c8d9259..fade02c15e6e 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -534,10 +534,9 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
  */
 static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
 {
-	struct inode * inode = filp->f_mapping->host;
-
 	dprintk("NFS: nfs_flock(f=%s/%ld, t=%x, fl=%x)\n",
-			inode->i_sb->s_id, inode->i_ino,
+			filp->f_dentry->d_inode->i_sb->s_id,
+			filp->f_dentry->d_inode->i_ino,
 			fl->fl_type, fl->fl_flags);
 
 	/*
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 7eebbab7160b..e8bbe8118de8 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -53,6 +53,7 @@ struct rpc_timeout {
 
 struct rpc_task;
 struct rpc_xprt;
+struct seq_file;
 
 /*
  * This describes a complete RPC request
-- 
cgit v1.2.3


From dc6de33674608f978ec29f5c2f7e3af458c06f78 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Thu, 20 Apr 2006 00:10:50 -0700
Subject: [NET]: Add skb->truesize assertion checking.

Add some sanity checking.  truesize should be at least sizeof(struct
sk_buff) plus the current packet length.  If not, then truesize is
seriously mangled and deserves a kernel log message.

Currently we'll do the check for release of stream socket buffers.

But we can add checks to more spots over time.

Incorporating ideas from Herbert Xu.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 7 +++++++
 include/net/sock.h     | 1 +
 net/core/skbuff.c      | 8 ++++++++
 net/core/stream.c      | 1 +
 4 files changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c4619a428d9b..f8f234708b98 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -344,6 +344,13 @@ extern void	      skb_over_panic(struct sk_buff *skb, int len,
 				     void *here);
 extern void	      skb_under_panic(struct sk_buff *skb, int len,
 				      void *here);
+extern void	      skb_truesize_bug(struct sk_buff *skb);
+
+static inline void skb_truesize_check(struct sk_buff *skb)
+{
+	if (unlikely((int)skb->truesize < sizeof(struct sk_buff) + skb->len))
+		skb_truesize_bug(skb);
+}
 
 extern int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
 			int getfrag(void *from, char *to, int offset,
diff --git a/include/net/sock.h b/include/net/sock.h
index af2b0544586e..ff8b0dad7b0f 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -454,6 +454,7 @@ static inline void sk_stream_set_owner_r(struct sk_buff *skb, struct sock *sk)
 
 static inline void sk_stream_free_skb(struct sock *sk, struct sk_buff *skb)
 {
+	skb_truesize_check(skb);
 	sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
 	sk->sk_wmem_queued   -= skb->truesize;
 	sk->sk_forward_alloc += skb->truesize;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 09464fa8d72f..fb3770f9c094 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -112,6 +112,14 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here)
 	BUG();
 }
 
+void skb_truesize_bug(struct sk_buff *skb)
+{
+	printk(KERN_ERR "SKB BUG: Invalid truesize (%u) "
+	       "len=%u, sizeof(sk_buff)=%Zd\n",
+	       skb->truesize, skb->len, sizeof(struct sk_buff));
+}
+EXPORT_SYMBOL(skb_truesize_bug);
+
 /* 	Allocate a new skbuff. We do this ourselves so we can fill in a few
  *	'private' fields and also do memory statistics to find all the
  *	[BEEP] leaks.
diff --git a/net/core/stream.c b/net/core/stream.c
index 35e25259fd95..e9489696f694 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -176,6 +176,7 @@ void sk_stream_rfree(struct sk_buff *skb)
 {
 	struct sock *sk = skb->sk;
 
+	skb_truesize_check(skb);
 	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
 	sk->sk_forward_alloc += skb->truesize;
 }
-- 
cgit v1.2.3


From 72b38d436e4cd18185de11f4b48a6e62eb104644 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Thu, 20 Apr 2006 02:43:23 -0700
Subject: [PATCH] memory_hotplug.h cleanup

We don't have to #if guard prototypes.

This also fixes a bug observed by Randy Dunlap due to a misspelled
option in the #if.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/memory_hotplug.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 4ca3e6ad03ec..911206386171 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -99,10 +99,7 @@ static inline int __remove_pages(struct zone *zone, unsigned long start_pfn,
 	return -ENOSYS;
 }
 
-#if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_ACPI_HOTPLUG_MEMORY) \
-	|| defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)
 extern int add_memory(u64 start, u64 size);
 extern int remove_memory(u64 start, u64 size);
-#endif
 
 #endif /* __LINUX_MEMORY_HOTPLUG_H */
-- 
cgit v1.2.3


From 55fe5866366ae42f259f27ae5962eb267d9ce172 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Mon, 24 Apr 2006 17:16:28 -0700
Subject: [NETFILTER]: Fix compat_xt_counters alignment for non-x86

Some (?) non-x86 architectures require 8byte alignment for u_int64_t
even when compiled for 32bit, using u_int32_t in compat_xt_counters
breaks on these architectures, use u_int64_t for everything but x86.

Reported by Andreas Schwab <schwab@suse.de>.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter/x_tables.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index f6bdef82a322..38701454e197 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -361,7 +361,11 @@ struct compat_xt_entry_target
 
 struct compat_xt_counters
 {
+#if defined(CONFIG_X86_64) || defined(CONFIG_IA64)
 	u_int32_t cnt[4];
+#else
+	u_int64_t cnt[2];
+#endif
 };
 
 struct compat_xt_counters_info
-- 
cgit v1.2.3


From d5b415c95f0e6510451f1446cea832c1f77bd7ea Mon Sep 17 00:00:00 2001
From: Imre Deak <imre.deak@nokia.com>
Date: Wed, 26 Apr 2006 00:13:18 -0400
Subject: Input: ads7846 - improve filtering for thumb press accuracy

Providing more accurate coordinates for thumb press requires additional
steps in the filtering logic:

- Ignore samples found invalid by the debouncing logic, or the ones that
  have out of bound pressure value.
- Add a parameter to repeat debouncing, so that more then two consecutive
  good readings are required for a valid sample.

Signed-off-by: Imre Deak <imre.deak@nokia.com>
Acked-by: Juha Yrjola <juha.yrjola@nokia.com>
Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/input/touchscreen/ads7846.c | 72 +++++++++++++++++++++++++++++--------
 include/linux/spi/ads7846.h         |  6 ++--
 2 files changed, 61 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/touchscreen/ads7846.c b/drivers/input/touchscreen/ads7846.c
index 1aaa153a2774..1494175ac6fe 100644
--- a/drivers/input/touchscreen/ads7846.c
+++ b/drivers/input/touchscreen/ads7846.c
@@ -71,6 +71,7 @@ struct ts_event {
 	__be16 x;
 	__be16 y;
 	__be16 z1, z2;
+	int    ignore;
 };
 
 struct ads7846 {
@@ -81,6 +82,7 @@ struct ads7846 {
 	u16			model;
 	u16			vref_delay_usecs;
 	u16			x_plate_ohms;
+	u16			pressure_max;
 
 	u8			read_x, read_y, read_z1, read_z2, pwrdown;
 	u16			dummy;		/* for the pwrdown read */
@@ -88,12 +90,15 @@ struct ads7846 {
 
 	struct spi_transfer	xfer[10];
 	struct spi_message	msg[5];
+	struct spi_message	*last_msg;
 	int			msg_idx;
 	int			read_cnt;
+	int			read_rep;
 	int			last_read;
 
 	u16			debounce_max;
 	u16			debounce_tol;
+	u16			debounce_rep;
 
 	spinlock_t		lock;
 	struct timer_list	timer;		/* P: lock */
@@ -354,6 +359,14 @@ static void ads7846_rx(void *ads)
 	} else
 		Rt = 0;
 
+	/* Sample found inconsistent by debouncing or pressure is beyond
+	* the maximum. Don't report it to user space, repeat at least
+	* once more the measurement */
+	if (ts->tc.ignore || Rt > ts->pressure_max) {
+		mod_timer(&ts->timer, jiffies + TS_POLL_PERIOD);
+		return;
+	}
+
 	/* NOTE:  "pendown" is inferred from pressure; we don't rely on
 	 * being able to check nPENIRQ status, or "friendly" trigger modes
 	 * (both-edges is much better than just-falling or low-level).
@@ -402,25 +415,45 @@ static void ads7846_debounce(void *ads)
 	struct ads7846		*ts = ads;
 	struct spi_message	*m;
 	struct spi_transfer	*t;
-	u16			val;
+	int			val;
 	int			status;
 
 	m = &ts->msg[ts->msg_idx];
 	t = list_entry(m->transfers.prev, struct spi_transfer, transfer_list);
 	val = (*(u16 *)t->rx_buf) >> 3;
-
-	if (!ts->read_cnt || (abs(ts->last_read - val) > ts->debounce_tol
-				&& ts->read_cnt < ts->debounce_max)) {
-		/* Repeat it, if this was the first read or the read wasn't
-		 * consistent enough
-		 */
-		ts->read_cnt++;
-		ts->last_read = val;
+	if (!ts->read_cnt || (abs(ts->last_read - val) > ts->debounce_tol)) {
+		/* Repeat it, if this was the first read or the read
+		 * wasn't consistent enough. */
+		if (ts->read_cnt < ts->debounce_max) {
+			ts->last_read = val;
+			ts->read_cnt++;
+		} else {
+			/* Maximum number of debouncing reached and still
+			 * not enough number of consistent readings. Abort
+			 * the whole sample, repeat it in the next sampling
+			 * period.
+			 */
+			ts->tc.ignore = 1;
+			ts->read_cnt = 0;
+			/* Last message will contain ads7846_rx() as the
+			 * completion function.
+			 */
+			m = ts->last_msg;
+		}
+		/* Start over collecting consistent readings. */
+		ts->read_rep = 0;
 	} else {
-		/* Go for the next read */
-		ts->msg_idx++;
-		ts->read_cnt = 0;
-		m++;
+		if (++ts->read_rep > ts->debounce_rep) {
+			/* Got a good reading for this coordinate,
+			 * go for the next one. */
+			ts->tc.ignore = 0;
+			ts->msg_idx++;
+			ts->read_cnt = 0;
+			ts->read_rep = 0;
+			m++;
+		} else
+			/* Read more values that are consistent. */
+			ts->read_cnt++;
 	}
 	status = spi_async(ts->spi, m);
 	if (status)
@@ -609,8 +642,15 @@ static int __devinit ads7846_probe(struct spi_device *spi)
 	ts->model = pdata->model ? : 7846;
 	ts->vref_delay_usecs = pdata->vref_delay_usecs ? : 100;
 	ts->x_plate_ohms = pdata->x_plate_ohms ? : 400;
-	ts->debounce_max = pdata->debounce_max ? : 1;
-	ts->debounce_tol = pdata->debounce_tol ? : 10;
+	ts->pressure_max = pdata->pressure_max ? : ~0;
+	if (pdata->debounce_max) {
+		ts->debounce_max = pdata->debounce_max;
+		ts->debounce_tol = pdata->debounce_tol;
+		ts->debounce_rep = pdata->debounce_rep;
+		if (ts->debounce_rep > ts->debounce_max + 1)
+			ts->debounce_rep = ts->debounce_max - 1;
+	} else
+		ts->debounce_tol = ~0;
 	ts->get_pendown_state = pdata->get_pendown_state;
 
 	snprintf(ts->phys, sizeof(ts->phys), "%s/input0", spi->dev.bus_id);
@@ -728,6 +768,8 @@ static int __devinit ads7846_probe(struct spi_device *spi)
 	m->complete = ads7846_rx;
 	m->context = ts;
 
+	ts->last_msg = m;
+
 	if (request_irq(spi->irq, ads7846_irq,
 			SA_SAMPLE_RANDOM | SA_TRIGGER_FALLING,
 			spi->dev.bus_id, ts)) {
diff --git a/include/linux/spi/ads7846.h b/include/linux/spi/ads7846.h
index d8823e237e0b..adb3dafd33e9 100644
--- a/include/linux/spi/ads7846.h
+++ b/include/linux/spi/ads7846.h
@@ -15,9 +15,11 @@ struct ads7846_platform_data {
 	u16	y_min, y_max;
 	u16	pressure_min, pressure_max;
 
-	u16	debounce_max;		/* max number of readings per sample */
+	u16	debounce_max;		/* max number of additional readings
+					 * per sample */
 	u16	debounce_tol;		/* tolerance used for filtering */
-
+	u16	debounce_rep;		/* additional consecutive good readings
+					 * required after the first two */
 	int	(*get_pendown_state)(void);
 };
 
-- 
cgit v1.2.3


From 1a0ccece05efb8a9c04b1130c24a2652239f3bea Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dtor_core@ameritech.net>
Date: Wed, 26 Apr 2006 00:13:57 -0400
Subject: Input: allow passing NULL to input_free_device()

Many drivers rely on input_free_device() behaving like kfree().

Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 include/linux/input.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/input.h b/include/linux/input.h
index b0e612dda0cf..16c19d710a4d 100644
--- a/include/linux/input.h
+++ b/include/linux/input.h
@@ -1016,7 +1016,8 @@ static inline void input_put_device(struct input_dev *dev)
 
 static inline void input_free_device(struct input_dev *dev)
 {
-	input_put_device(dev);
+	if (dev)
+		input_put_device(dev);
 }
 
 int input_register_device(struct input_dev *);
-- 
cgit v1.2.3


From ddc5d3414593e4d7ad7fbd33e7f7517fcc234544 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dtor_core@ameritech.net>
Date: Wed, 26 Apr 2006 00:14:19 -0400
Subject: Input: move input_device_id to mod_devicetable.h

Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/input/input.c           |   8 ++--
 include/linux/input.h           | 104 ++++++++++++++++++++--------------------
 include/linux/mod_devicetable.h |  48 +++++++++++++++++++
 scripts/mod/file2alias.c        |  36 +++++++-------
 4 files changed, 122 insertions(+), 74 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/input.c b/drivers/input/input.c
index a935abeffffc..591c70d80cd8 100644
--- a/drivers/input/input.c
+++ b/drivers/input/input.c
@@ -286,19 +286,19 @@ static struct input_device_id *input_match_device(struct input_device_id *id, st
 	for (; id->flags || id->driver_info; id++) {
 
 		if (id->flags & INPUT_DEVICE_ID_MATCH_BUS)
-			if (id->id.bustype != dev->id.bustype)
+			if (id->bustype != dev->id.bustype)
 				continue;
 
 		if (id->flags & INPUT_DEVICE_ID_MATCH_VENDOR)
-			if (id->id.vendor != dev->id.vendor)
+			if (id->vendor != dev->id.vendor)
 				continue;
 
 		if (id->flags & INPUT_DEVICE_ID_MATCH_PRODUCT)
-			if (id->id.product != dev->id.product)
+			if (id->product != dev->id.product)
 				continue;
 
 		if (id->flags & INPUT_DEVICE_ID_MATCH_VERSION)
-			if (id->id.version != dev->id.version)
+			if (id->version != dev->id.version)
 				continue;
 
 		MATCH_BIT(evbit,  EV_MAX);
diff --git a/include/linux/input.h b/include/linux/input.h
index 16c19d710a4d..8298b4bf5a07 100644
--- a/include/linux/input.h
+++ b/include/linux/input.h
@@ -12,8 +12,6 @@
 #ifdef __KERNEL__
 #include <linux/time.h>
 #include <linux/list.h>
-#include <linux/device.h>
-#include <linux/mod_devicetable.h>
 #else
 #include <sys/time.h>
 #include <sys/ioctl.h>
@@ -577,15 +575,15 @@ struct input_absinfo {
  * Switch events
  */
 
-#define SW_0		0x00
-#define SW_1		0x01
-#define SW_2		0x02
-#define SW_3		0x03
-#define SW_4		0x04
-#define SW_5		0x05
-#define SW_6		0x06
-#define SW_7		0x07
-#define SW_MAX		0x0f
+#define SW_0			0x00
+#define SW_1			0x01
+#define SW_2			0x02
+#define SW_3			0x03
+#define SW_4			0x04
+#define SW_5			0x05
+#define SW_6			0x06
+#define SW_7			0x07
+#define SW_MAX			0x0f
 
 /*
  * Misc events
@@ -805,52 +803,16 @@ struct ff_effect {
 
 #define FF_MAX		0x7f
 
-struct input_device_id {
-
-	kernel_ulong_t flags;
-
-	struct input_id id;
-
-	kernel_ulong_t evbit[EV_MAX/BITS_PER_LONG+1];
-	kernel_ulong_t keybit[KEY_MAX/BITS_PER_LONG+1];
-	kernel_ulong_t relbit[REL_MAX/BITS_PER_LONG+1];
-	kernel_ulong_t absbit[ABS_MAX/BITS_PER_LONG+1];
-	kernel_ulong_t mscbit[MSC_MAX/BITS_PER_LONG+1];
-	kernel_ulong_t ledbit[LED_MAX/BITS_PER_LONG+1];
-	kernel_ulong_t sndbit[SND_MAX/BITS_PER_LONG+1];
-	kernel_ulong_t ffbit[FF_MAX/BITS_PER_LONG+1];
-	kernel_ulong_t swbit[SW_MAX/BITS_PER_LONG+1];
-
-	kernel_ulong_t driver_info;
-};
-
-/*
- * Structure for hotplug & device<->driver matching.
- */
-
-#define INPUT_DEVICE_ID_MATCH_BUS	1
-#define INPUT_DEVICE_ID_MATCH_VENDOR	2
-#define INPUT_DEVICE_ID_MATCH_PRODUCT	4
-#define INPUT_DEVICE_ID_MATCH_VERSION	8
-
-#define INPUT_DEVICE_ID_MATCH_EVBIT	0x010
-#define INPUT_DEVICE_ID_MATCH_KEYBIT	0x020
-#define INPUT_DEVICE_ID_MATCH_RELBIT	0x040
-#define INPUT_DEVICE_ID_MATCH_ABSBIT	0x080
-#define INPUT_DEVICE_ID_MATCH_MSCIT	0x100
-#define INPUT_DEVICE_ID_MATCH_LEDBIT	0x200
-#define INPUT_DEVICE_ID_MATCH_SNDBIT	0x400
-#define INPUT_DEVICE_ID_MATCH_FFBIT	0x800
-#define INPUT_DEVICE_ID_MATCH_SWBIT	0x1000
-
 #ifdef __KERNEL__
 
 /*
  * In-kernel definitions.
  */
 
+#include <linux/device.h>
 #include <linux/fs.h>
 #include <linux/timer.h>
+#include <linux/mod_devicetable.h>
 
 #define NBITS(x) (((x)/BITS_PER_LONG)+1)
 #define BIT(x)	(1UL<<((x)%BITS_PER_LONG))
@@ -951,9 +913,49 @@ struct input_dev {
 };
 #define to_input_dev(d) container_of(d, struct input_dev, cdev)
 
-#define INPUT_DEVICE_ID_MATCH_DEVICE\
+/*
+ * Verify that we are in sync with input_device_id mod_devicetable.h #defines
+ */
+
+#if EV_MAX != INPUT_DEVICE_ID_EV_MAX
+#error "EV_MAX and INPUT_DEVICE_ID_EV_MAX do not match"
+#endif
+
+#if KEY_MAX != INPUT_DEVICE_ID_KEY_MAX
+#error "KEY_MAX and INPUT_DEVICE_ID_KEY_MAX do not match"
+#endif
+
+#if REL_MAX != INPUT_DEVICE_ID_REL_MAX
+#error "REL_MAX and INPUT_DEVICE_ID_REL_MAX do not match"
+#endif
+
+#if ABS_MAX != INPUT_DEVICE_ID_ABS_MAX
+#error "ABS_MAX and INPUT_DEVICE_ID_ABS_MAX do not match"
+#endif
+
+#if MSC_MAX != INPUT_DEVICE_ID_MSC_MAX
+#error "MSC_MAX and INPUT_DEVICE_ID_MSC_MAX do not match"
+#endif
+
+#if LED_MAX != INPUT_DEVICE_ID_LED_MAX
+#error "LED_MAX and INPUT_DEVICE_ID_LED_MAX do not match"
+#endif
+
+#if SND_MAX != INPUT_DEVICE_ID_SND_MAX
+#error "SND_MAX and INPUT_DEVICE_ID_SND_MAX do not match"
+#endif
+
+#if FF_MAX != INPUT_DEVICE_ID_FF_MAX
+#error "FF_MAX and INPUT_DEVICE_ID_FF_MAX do not match"
+#endif
+
+#if SW_MAX != INPUT_DEVICE_ID_SW_MAX
+#error "SW_MAX and INPUT_DEVICE_ID_SW_MAX do not match"
+#endif
+
+#define INPUT_DEVICE_ID_MATCH_DEVICE \
 	(INPUT_DEVICE_ID_MATCH_BUS | INPUT_DEVICE_ID_MATCH_VENDOR | INPUT_DEVICE_ID_MATCH_PRODUCT)
-#define INPUT_DEVICE_ID_MATCH_DEVICE_AND_VERSION\
+#define INPUT_DEVICE_ID_MATCH_DEVICE_AND_VERSION \
 	(INPUT_DEVICE_ID_MATCH_DEVICE | INPUT_DEVICE_ID_MATCH_VERSION)
 
 struct input_handle;
diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index 7b08c11ec4cc..f6977708585c 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -249,4 +249,52 @@ struct i2c_device_id {
 	__u16 id;
 };
 
+/* Input */
+#define INPUT_DEVICE_ID_EV_MAX		0x1f
+#define INPUT_DEVICE_ID_KEY_MAX		0x1ff
+#define INPUT_DEVICE_ID_REL_MAX		0x0f
+#define INPUT_DEVICE_ID_ABS_MAX		0x3f
+#define INPUT_DEVICE_ID_MSC_MAX		0x07
+#define INPUT_DEVICE_ID_LED_MAX		0x0f
+#define INPUT_DEVICE_ID_SND_MAX		0x07
+#define INPUT_DEVICE_ID_FF_MAX		0x7f
+#define INPUT_DEVICE_ID_SW_MAX		0x0f
+
+#define INPUT_DEVICE_ID_MATCH_BUS	1
+#define INPUT_DEVICE_ID_MATCH_VENDOR	2
+#define INPUT_DEVICE_ID_MATCH_PRODUCT	4
+#define INPUT_DEVICE_ID_MATCH_VERSION	8
+
+#define INPUT_DEVICE_ID_MATCH_EVBIT	0x0010
+#define INPUT_DEVICE_ID_MATCH_KEYBIT	0x0020
+#define INPUT_DEVICE_ID_MATCH_RELBIT	0x0040
+#define INPUT_DEVICE_ID_MATCH_ABSBIT	0x0080
+#define INPUT_DEVICE_ID_MATCH_MSCIT	0x0100
+#define INPUT_DEVICE_ID_MATCH_LEDBIT	0x0200
+#define INPUT_DEVICE_ID_MATCH_SNDBIT	0x0400
+#define INPUT_DEVICE_ID_MATCH_FFBIT	0x0800
+#define INPUT_DEVICE_ID_MATCH_SWBIT	0x1000
+
+struct input_device_id {
+
+	kernel_ulong_t flags;
+
+	__u16 bustype;
+	__u16 vendor;
+	__u16 product;
+	__u16 version;
+
+	kernel_ulong_t evbit[INPUT_DEVICE_ID_EV_MAX / BITS_PER_LONG + 1];
+	kernel_ulong_t keybit[INPUT_DEVICE_ID_KEY_MAX / BITS_PER_LONG + 1];
+	kernel_ulong_t relbit[INPUT_DEVICE_ID_REL_MAX / BITS_PER_LONG + 1];
+	kernel_ulong_t absbit[INPUT_DEVICE_ID_ABS_MAX / BITS_PER_LONG + 1];
+	kernel_ulong_t mscbit[INPUT_DEVICE_ID_MSC_MAX / BITS_PER_LONG + 1];
+	kernel_ulong_t ledbit[INPUT_DEVICE_ID_LED_MAX / BITS_PER_LONG + 1];
+	kernel_ulong_t sndbit[INPUT_DEVICE_ID_SND_MAX / BITS_PER_LONG + 1];
+	kernel_ulong_t ffbit[INPUT_DEVICE_ID_FF_MAX / BITS_PER_LONG + 1];
+	kernel_ulong_t swbit[INPUT_DEVICE_ID_SW_MAX / BITS_PER_LONG + 1];
+
+	kernel_ulong_t driver_info;
+};
+
 #endif /* LINUX_MOD_DEVICETABLE_H */
diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c
index 84e21201f3c0..37f67c23e11b 100644
--- a/scripts/mod/file2alias.c
+++ b/scripts/mod/file2alias.c
@@ -374,10 +374,10 @@ static void do_input(char *alias,
 		     kernel_ulong_t *arr, unsigned int min, unsigned int max)
 {
 	unsigned int i;
-	for (i = min; i < max; i++) {
-		if (arr[i/BITS_PER_LONG] & (1 << (i%BITS_PER_LONG)))
-			sprintf(alias+strlen(alias), "%X,*", i);
-	}
+
+	for (i = min; i < max; i++)
+		if (arr[i / BITS_PER_LONG] & (1 << (i%BITS_PER_LONG)))
+			sprintf(alias + strlen(alias), "%X,*", i);
 }
 
 /* input:b0v0p0e0-eXkXrXaXmXlXsXfXwX where X is comma-separated %02X. */
@@ -386,39 +386,37 @@ static int do_input_entry(const char *filename, struct input_device_id *id,
 {
 	sprintf(alias, "input:");
 
-	ADD(alias, "b", id->flags&INPUT_DEVICE_ID_MATCH_BUS, id->id.bustype);
-	ADD(alias, "v", id->flags&INPUT_DEVICE_ID_MATCH_VENDOR, id->id.vendor);
-	ADD(alias, "p", id->flags&INPUT_DEVICE_ID_MATCH_PRODUCT,
-	    id->id.product);
-	ADD(alias, "e", id->flags&INPUT_DEVICE_ID_MATCH_VERSION,
-	    id->id.version);
+	ADD(alias, "b", id->flags & INPUT_DEVICE_ID_MATCH_BUS, id->bustype);
+	ADD(alias, "v", id->flags & INPUT_DEVICE_ID_MATCH_VENDOR, id->vendor);
+	ADD(alias, "p", id->flags & INPUT_DEVICE_ID_MATCH_PRODUCT, id->product);
+	ADD(alias, "e", id->flags & INPUT_DEVICE_ID_MATCH_VERSION, id->version);
 
 	sprintf(alias + strlen(alias), "-e*");
-	if (id->flags&INPUT_DEVICE_ID_MATCH_EVBIT)
+	if (id->flags & INPUT_DEVICE_ID_MATCH_EVBIT)
 		do_input(alias, id->evbit, 0, EV_MAX);
 	sprintf(alias + strlen(alias), "k*");
-	if (id->flags&INPUT_DEVICE_ID_MATCH_KEYBIT)
+	if (id->flags & INPUT_DEVICE_ID_MATCH_KEYBIT)
 		do_input(alias, id->keybit, KEY_MIN_INTERESTING, KEY_MAX);
 	sprintf(alias + strlen(alias), "r*");
-	if (id->flags&INPUT_DEVICE_ID_MATCH_RELBIT)
+	if (id->flags & INPUT_DEVICE_ID_MATCH_RELBIT)
 		do_input(alias, id->relbit, 0, REL_MAX);
 	sprintf(alias + strlen(alias), "a*");
-	if (id->flags&INPUT_DEVICE_ID_MATCH_ABSBIT)
+	if (id->flags & INPUT_DEVICE_ID_MATCH_ABSBIT)
 		do_input(alias, id->absbit, 0, ABS_MAX);
 	sprintf(alias + strlen(alias), "m*");
-	if (id->flags&INPUT_DEVICE_ID_MATCH_MSCIT)
+	if (id->flags & INPUT_DEVICE_ID_MATCH_MSCIT)
 		do_input(alias, id->mscbit, 0, MSC_MAX);
 	sprintf(alias + strlen(alias), "l*");
-	if (id->flags&INPUT_DEVICE_ID_MATCH_LEDBIT)
+	if (id->flags & INPUT_DEVICE_ID_MATCH_LEDBIT)
 		do_input(alias, id->ledbit, 0, LED_MAX);
 	sprintf(alias + strlen(alias), "s*");
-	if (id->flags&INPUT_DEVICE_ID_MATCH_SNDBIT)
+	if (id->flags & INPUT_DEVICE_ID_MATCH_SNDBIT)
 		do_input(alias, id->sndbit, 0, SND_MAX);
 	sprintf(alias + strlen(alias), "f*");
-	if (id->flags&INPUT_DEVICE_ID_MATCH_FFBIT)
+	if (id->flags & INPUT_DEVICE_ID_MATCH_FFBIT)
 		do_input(alias, id->ffbit, 0, FF_MAX);
 	sprintf(alias + strlen(alias), "w*");
-	if (id->flags&INPUT_DEVICE_ID_MATCH_SWBIT)
+	if (id->flags & INPUT_DEVICE_ID_MATCH_SWBIT)
 		do_input(alias, id->swbit, 0, SW_MAX);
 	return 1;
 }
-- 
cgit v1.2.3


From 912d35f86781e64d73be1ef358f703c08905ac37 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Wed, 26 Apr 2006 10:59:21 +0200
Subject: [PATCH] Add support for the sys_vmsplice syscall

sys_splice() moves data to/from pipes with a file input/output. sys_vmsplice()
moves data to a pipe, with the input being a user address range instead.

This uses an approach suggested by Linus, where we can hold partial ranges
inside the pages[] map. Hopefully this will be useful for network
receive support as well.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 arch/ia64/kernel/entry.S                    |   1 +
 arch/powerpc/kernel/systbl.S                |   1 +
 arch/powerpc/platforms/cell/spu_callbacks.c |   1 +
 fs/splice.c                                 | 292 ++++++++++++++++++++++++----
 include/asm-i386/unistd.h                   |   3 +-
 include/asm-ia64/unistd.h                   |   3 +-
 include/asm-powerpc/unistd.h                |   3 +-
 include/asm-x86_64/unistd.h                 |   4 +-
 include/linux/syscalls.h                    |   3 +
 9 files changed, 268 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S
index e30798811216..bcb80ca5cf40 100644
--- a/arch/ia64/kernel/entry.S
+++ b/arch/ia64/kernel/entry.S
@@ -1610,5 +1610,6 @@ sys_call_table:
 	data8 sys_get_robust_list
 	data8 sys_sync_file_range		// 1300
 	data8 sys_tee
+	data8 sys_vmsplice
 
 	.org sys_call_table + 8*NR_syscalls	// guard against failures to increase NR_syscalls
diff --git a/arch/powerpc/kernel/systbl.S b/arch/powerpc/kernel/systbl.S
index 8d1522690501..0b98eea73c5e 100644
--- a/arch/powerpc/kernel/systbl.S
+++ b/arch/powerpc/kernel/systbl.S
@@ -324,6 +324,7 @@ COMPAT_SYS(ppoll)
 SYSCALL(unshare)
 SYSCALL(splice)
 SYSCALL(tee)
+SYSCALL(vmsplice)
 
 /*
  * please add new calls to arch/powerpc/platforms/cell/spu_callbacks.c
diff --git a/arch/powerpc/platforms/cell/spu_callbacks.c b/arch/powerpc/platforms/cell/spu_callbacks.c
index deb3afb94484..b283380a2a18 100644
--- a/arch/powerpc/platforms/cell/spu_callbacks.c
+++ b/arch/powerpc/platforms/cell/spu_callbacks.c
@@ -318,6 +318,7 @@ void *spu_syscall_table[] = {
 	[__NR_unshare]			sys_unshare,
 	[__NR_splice]			sys_splice,
 	[__NR_tee]			sys_tee,
+	[__NR_vmsplice]			sys_vmsplice,
 };
 
 long spu_sys_callback(struct spu_syscall_block *s)
diff --git a/fs/splice.c b/fs/splice.c
index 8c6030c762e2..0b2c1f060cae 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -27,6 +27,7 @@
 #include <linux/buffer_head.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
+#include <linux/uio.h>
 
 /*
  * Passed to the actors
@@ -38,6 +39,22 @@ struct splice_desc {
 	loff_t pos;			/* file position */
 };
 
+struct partial_page {
+	unsigned int offset;
+	unsigned int len;
+};
+
+/*
+ * Passed to move_to_pipe
+ */
+struct splice_pipe_desc {
+	struct page **pages;		/* page map */
+	struct partial_page *partial;	/* pages[] may not be contig */
+	int nr_pages;			/* number of pages in map */
+	unsigned int flags;		/* splice flags */
+	struct pipe_buf_operations *ops;/* ops associated with output pipe */
+};
+
 /*
  * Attempt to steal a page from a pipe buffer. This should perhaps go into
  * a vm helper function, it's already simplified quite a bit by the
@@ -128,6 +145,19 @@ static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info,
 	kunmap(buf->page);
 }
 
+static void *user_page_pipe_buf_map(struct file *file,
+				    struct pipe_inode_info *pipe,
+				    struct pipe_buffer *buf)
+{
+	return kmap(buf->page);
+}
+
+static void user_page_pipe_buf_unmap(struct pipe_inode_info *pipe,
+				     struct pipe_buffer *buf)
+{
+	kunmap(buf->page);
+}
+
 static void page_cache_pipe_buf_get(struct pipe_inode_info *info,
 				    struct pipe_buffer *buf)
 {
@@ -143,19 +173,33 @@ static struct pipe_buf_operations page_cache_pipe_buf_ops = {
 	.get = page_cache_pipe_buf_get,
 };
 
+static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
+				    struct pipe_buffer *buf)
+{
+	return 1;
+}
+
+static struct pipe_buf_operations user_page_pipe_buf_ops = {
+	.can_merge = 0,
+	.map = user_page_pipe_buf_map,
+	.unmap = user_page_pipe_buf_unmap,
+	.release = page_cache_pipe_buf_release,
+	.steal = user_page_pipe_buf_steal,
+	.get = page_cache_pipe_buf_get,
+};
+
 /*
  * Pipe output worker. This sets up our pipe format with the page cache
  * pipe buffer operations. Otherwise very similar to the regular pipe_writev().
  */
-static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages,
-			    int nr_pages, unsigned long len,
-			    unsigned int offset, unsigned int flags)
+static ssize_t move_to_pipe(struct pipe_inode_info *pipe,
+			    struct splice_pipe_desc *spd)
 {
-	int ret, do_wakeup, i;
+	int ret, do_wakeup, page_nr;
 
 	ret = 0;
 	do_wakeup = 0;
-	i = 0;
+	page_nr = 0;
 
 	if (pipe->inode)
 		mutex_lock(&pipe->inode->i_mutex);
@@ -171,27 +215,19 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages,
 		if (pipe->nrbufs < PIPE_BUFFERS) {
 			int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
 			struct pipe_buffer *buf = pipe->bufs + newbuf;
-			struct page *page = pages[i++];
-			unsigned long this_len;
 
-			this_len = PAGE_CACHE_SIZE - offset;
-			if (this_len > len)
-				this_len = len;
-
-			buf->page = page;
-			buf->offset = offset;
-			buf->len = this_len;
-			buf->ops = &page_cache_pipe_buf_ops;
+			buf->page = spd->pages[page_nr];
+			buf->offset = spd->partial[page_nr].offset;
+			buf->len = spd->partial[page_nr].len;
+			buf->ops = spd->ops;
 			pipe->nrbufs++;
+			page_nr++;
+			ret += buf->len;
+
 			if (pipe->inode)
 				do_wakeup = 1;
 
-			ret += this_len;
-			len -= this_len;
-			offset = 0;
-			if (!--nr_pages)
-				break;
-			if (!len)
+			if (!--spd->nr_pages)
 				break;
 			if (pipe->nrbufs < PIPE_BUFFERS)
 				continue;
@@ -199,7 +235,7 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages,
 			break;
 		}
 
-		if (flags & SPLICE_F_NONBLOCK) {
+		if (spd->flags & SPLICE_F_NONBLOCK) {
 			if (!ret)
 				ret = -EAGAIN;
 			break;
@@ -234,8 +270,8 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages,
 		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 	}
 
-	while (i < nr_pages)
-		page_cache_release(pages[i++]);
+	while (page_nr < spd->nr_pages)
+		page_cache_release(spd->pages[page_nr++]);
 
 	return ret;
 }
@@ -246,17 +282,24 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 			   unsigned int flags)
 {
 	struct address_space *mapping = in->f_mapping;
-	unsigned int loff, offset, nr_pages;
+	unsigned int loff, nr_pages;
 	struct page *pages[PIPE_BUFFERS];
+	struct partial_page partial[PIPE_BUFFERS];
 	struct page *page;
 	pgoff_t index, end_index;
 	loff_t isize;
-	size_t bytes;
-	int i, error;
+	size_t total_len;
+	int error;
+	struct splice_pipe_desc spd = {
+		.pages = pages,
+		.partial = partial,
+		.flags = flags,
+		.ops = &page_cache_pipe_buf_ops,
+	};
 
 	index = *ppos >> PAGE_CACHE_SHIFT;
-	loff = offset = *ppos & ~PAGE_CACHE_MASK;
-	nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	loff = *ppos & ~PAGE_CACHE_MASK;
+	nr_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 
 	if (nr_pages > PIPE_BUFFERS)
 		nr_pages = PIPE_BUFFERS;
@@ -266,15 +309,15 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 	 * read-ahead if this is a non-zero offset (we are likely doing small
 	 * chunk splice and the page is already there) for a single page.
 	 */
-	if (!offset || nr_pages > 1)
-		do_page_cache_readahead(mapping, in, index, nr_pages);
+	if (!loff || spd.nr_pages > 1)
+		do_page_cache_readahead(mapping, in, index, spd.nr_pages);
 
 	/*
 	 * Now fill in the holes:
 	 */
 	error = 0;
-	bytes = 0;
-	for (i = 0; i < nr_pages; i++, index++) {
+	total_len = 0;
+	for (spd.nr_pages = 0; spd.nr_pages < nr_pages; spd.nr_pages++, index++) {
 		unsigned int this_len;
 
 		if (!len)
@@ -367,26 +410,29 @@ readpage:
 			 */
 			if (end_index == index) {
 				loff = PAGE_CACHE_SIZE - (isize & ~PAGE_CACHE_MASK);
-				if (bytes + loff > isize) {
+				if (total_len + loff > isize) {
 					page_cache_release(page);
 					break;
 				}
 				/*
 				 * force quit after adding this page
 				 */
-				nr_pages = i;
+				nr_pages = spd.nr_pages;
 				this_len = min(this_len, loff);
+				loff = 0;
 			}
 		}
 fill_it:
-		pages[i] = page;
-		bytes += this_len;
+		pages[spd.nr_pages] = page;
+		partial[spd.nr_pages].offset = loff;
+		partial[spd.nr_pages].len = this_len;
 		len -= this_len;
+		total_len += this_len;
 		loff = 0;
 	}
 
-	if (i)
-		return move_to_pipe(pipe, pages, i, bytes, offset, flags);
+	if (spd.nr_pages)
+		return move_to_pipe(pipe, &spd);
 
 	return error;
 }
@@ -1018,6 +1064,174 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 	return -EINVAL;
 }
 
+/*
+ * Map an iov into an array of pages and offset/length tupples. With the
+ * partial_page structure, we can map several non-contiguous ranges into
+ * our ones pages[] map instead of splitting that operation into pieces.
+ * Could easily be exported as a generic helper for other users, in which
+ * case one would probably want to add a 'max_nr_pages' parameter as well.
+ */
+static int get_iovec_page_array(const struct iovec __user *iov,
+				unsigned int nr_vecs, struct page **pages,
+				struct partial_page *partial)
+{
+	int buffers = 0, error = 0;
+
+	/*
+	 * It's ok to take the mmap_sem for reading, even
+	 * across a "get_user()".
+	 */
+	down_read(&current->mm->mmap_sem);
+
+	while (nr_vecs) {
+		unsigned long off, npages;
+		void __user *base;
+		size_t len;
+		int i;
+
+		/*
+		 * Get user address base and length for this iovec.
+		 */
+		error = get_user(base, &iov->iov_base);
+		if (unlikely(error))
+			break;
+		error = get_user(len, &iov->iov_len);
+		if (unlikely(error))
+			break;
+
+		/*
+		 * Sanity check this iovec. 0 read succeeds.
+		 */
+		if (unlikely(!len))
+			break;
+		error = -EFAULT;
+		if (unlikely(!base))
+			break;
+
+		/*
+		 * Get this base offset and number of pages, then map
+		 * in the user pages.
+		 */
+		off = (unsigned long) base & ~PAGE_MASK;
+		npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+		if (npages > PIPE_BUFFERS - buffers)
+			npages = PIPE_BUFFERS - buffers;
+
+		error = get_user_pages(current, current->mm,
+				       (unsigned long) base, npages, 0, 0,
+				       &pages[buffers], NULL);
+
+		if (unlikely(error <= 0))
+			break;
+
+		/*
+		 * Fill this contiguous range into the partial page map.
+		 */
+		for (i = 0; i < error; i++) {
+			const int plen = min_t(size_t, len, PAGE_SIZE) - off;
+
+			partial[buffers].offset = off;
+			partial[buffers].len = plen;
+
+			off = 0;
+			len -= plen;
+			buffers++;
+		}
+
+		/*
+		 * We didn't complete this iov, stop here since it probably
+		 * means we have to move some of this into a pipe to
+		 * be able to continue.
+		 */
+		if (len)
+			break;
+
+		/*
+		 * Don't continue if we mapped fewer pages than we asked for,
+		 * or if we mapped the max number of pages that we have
+		 * room for.
+		 */
+		if (error < npages || buffers == PIPE_BUFFERS)
+			break;
+
+		nr_vecs--;
+		iov++;
+	}
+
+	up_read(&current->mm->mmap_sem);
+
+	if (buffers)
+		return buffers;
+
+	return error;
+}
+
+/*
+ * vmsplice splices a user address range into a pipe. It can be thought of
+ * as splice-from-memory, where the regular splice is splice-from-file (or
+ * to file). In both cases the output is a pipe, naturally.
+ *
+ * Note that vmsplice only supports splicing _from_ user memory to a pipe,
+ * not the other way around. Splicing from user memory is a simple operation
+ * that can be supported without any funky alignment restrictions or nasty
+ * vm tricks. We simply map in the user memory and fill them into a pipe.
+ * The reverse isn't quite as easy, though. There are two possible solutions
+ * for that:
+ *
+ *	- memcpy() the data internally, at which point we might as well just
+ *	  do a regular read() on the buffer anyway.
+ *	- Lots of nasty vm tricks, that are neither fast nor flexible (it
+ *	  has restriction limitations on both ends of the pipe).
+ *
+ * Alas, it isn't here.
+ *
+ */
+static long do_vmsplice(struct file *file, const struct iovec __user *iov,
+			unsigned long nr_segs, unsigned int flags)
+{
+	struct pipe_inode_info *pipe = file->f_dentry->d_inode->i_pipe;
+	struct page *pages[PIPE_BUFFERS];
+	struct partial_page partial[PIPE_BUFFERS];
+	struct splice_pipe_desc spd = {
+		.pages = pages,
+		.partial = partial,
+		.flags = flags,
+		.ops = &user_page_pipe_buf_ops,
+	};
+
+	if (unlikely(!pipe))
+		return -EBADF;
+	if (unlikely(nr_segs > UIO_MAXIOV))
+		return -EINVAL;
+	else if (unlikely(!nr_segs))
+		return 0;
+
+	spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial);
+	if (spd.nr_pages <= 0)
+		return spd.nr_pages;
+
+	return move_to_pipe(pipe, &spd);
+}
+
+asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
+			     unsigned long nr_segs, unsigned int flags)
+{
+	struct file *file;
+	long error;
+	int fput;
+
+	error = -EBADF;
+	file = fget_light(fd, &fput);
+	if (file) {
+		if (file->f_mode & FMODE_WRITE)
+			error = do_vmsplice(file, iov, nr_segs, flags);
+
+		fput_light(file, fput);
+	}
+
+	return error;
+}
+
 asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
 			   int fd_out, loff_t __user *off_out,
 			   size_t len, unsigned int flags)
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index d81d6cfc1bb4..eb4b152c82fc 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -321,8 +321,9 @@
 #define __NR_splice		313
 #define __NR_sync_file_range	314
 #define __NR_tee		315
+#define __NR_vmsplice		316
 
-#define NR_syscalls 316
+#define NR_syscalls 317
 
 /*
  * user-visible error numbers are in the range -1 - -128: see
diff --git a/include/asm-ia64/unistd.h b/include/asm-ia64/unistd.h
index a40ebec6aeeb..7107763168bf 100644
--- a/include/asm-ia64/unistd.h
+++ b/include/asm-ia64/unistd.h
@@ -290,12 +290,13 @@
 #define __NR_get_robust_list		1299
 #define __NR_sync_file_range		1300
 #define __NR_tee			1301
+#define __NR_vmsplice			1302
 
 #ifdef __KERNEL__
 
 #include <linux/config.h>
 
-#define NR_syscalls			278 /* length of syscall table */
+#define NR_syscalls			279 /* length of syscall table */
 
 #define __ARCH_WANT_SYS_RT_SIGACTION
 
diff --git a/include/asm-powerpc/unistd.h b/include/asm-powerpc/unistd.h
index c612f1a62772..34325e292596 100644
--- a/include/asm-powerpc/unistd.h
+++ b/include/asm-powerpc/unistd.h
@@ -303,8 +303,9 @@
 #define __NR_unshare		282
 #define __NR_splice		283
 #define __NR_tee		284
+#define __NR_vmsplice		285
 
-#define __NR_syscalls		285
+#define __NR_syscalls		286
 
 #ifdef __KERNEL__
 #define __NR__exit __NR_exit
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index 98c36eae567c..feb77cb8c044 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -615,8 +615,10 @@ __SYSCALL(__NR_splice, sys_splice)
 __SYSCALL(__NR_tee, sys_tee)
 #define __NR_sync_file_range	277
 __SYSCALL(__NR_sync_file_range, sys_sync_file_range)
+#define __NR_vmsplice		278
+__SYSCALL(__NR_vmsplice, sys_vmsplice)
 
-#define __NR_syscall_max __NR_sync_file_range
+#define __NR_syscall_max __NR_vmsplice
 
 #ifndef __NO_STUBS
 
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index d3ebc0e68b2b..3996960fc565 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -574,6 +574,9 @@ asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
 			   int fd_out, loff_t __user *off_out,
 			   size_t len, unsigned int flags);
 
+asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
+			     unsigned long nr_segs, unsigned int flags);
+
 asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags);
 
 asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
-- 
cgit v1.2.3


From 734cbc363b159caee158d5a83408c72d98bcacf0 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Tue, 25 Apr 2006 10:58:50 -0700
Subject: [PATCH] sky2: reschedule if irq still pending

This is a workaround for the case edge-triggered irq's. Several users
seem to have broken configurations sharing edge-triggered irq's. To avoid
losing IRQ's, reshedule if more work arrives.

The changes to netdevice.h are to extract the part that puts device
back in list into separate inline.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
---
 drivers/net/sky2.c        | 20 ++++++++++++++++----
 include/linux/netdevice.h | 18 ++++++++++--------
 2 files changed, 26 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/sky2.c b/drivers/net/sky2.c
index 67b0eab16589..618fde8622ca 100644
--- a/drivers/net/sky2.c
+++ b/drivers/net/sky2.c
@@ -2093,6 +2093,7 @@ static int sky2_poll(struct net_device *dev0, int *budget)
 	int work_done = 0;
 	u32 status = sky2_read32(hw, B0_Y2_SP_EISR);
 
+ restart_poll:
 	if (unlikely(status & ~Y2_IS_STAT_BMU)) {
 		if (status & Y2_IS_HW_ERR)
 			sky2_hw_intr(hw);
@@ -2123,7 +2124,7 @@ static int sky2_poll(struct net_device *dev0, int *budget)
 	}
 
 	if (status & Y2_IS_STAT_BMU) {
-		work_done = sky2_status_intr(hw, work_limit);
+		work_done += sky2_status_intr(hw, work_limit - work_done);
 		*budget -= work_done;
 		dev0->quota -= work_done;
 
@@ -2133,9 +2134,22 @@ static int sky2_poll(struct net_device *dev0, int *budget)
 		sky2_write32(hw, STAT_CTRL, SC_STAT_CLR_IRQ);
 	}
 
-	netif_rx_complete(dev0);
+	local_irq_disable();
+	__netif_rx_complete(dev0);
 
 	status = sky2_read32(hw, B0_Y2_SP_LISR);
+
+	if (unlikely(status)) {
+		/* More work pending, try and keep going */
+		if (__netif_rx_schedule_prep(dev0)) {
+			__netif_rx_reschedule(dev0, work_done);
+			status = sky2_read32(hw, B0_Y2_SP_EISR);
+			local_irq_enable();
+			goto restart_poll;
+		}
+	}
+
+	local_irq_enable();
 	return 0;
 }
 
@@ -2153,8 +2167,6 @@ static irqreturn_t sky2_intr(int irq, void *dev_id, struct pt_regs *regs)
 	prefetch(&hw->st_le[hw->st_idx]);
 	if (likely(__netif_rx_schedule_prep(dev0)))
 		__netif_rx_schedule(dev0);
-	else
-		printk(KERN_DEBUG PFX "irq race detected\n");
 
 	return IRQ_HANDLED;
 }
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 40ccf8cc4239..01db7b88a2b1 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -829,19 +829,21 @@ static inline void netif_rx_schedule(struct net_device *dev)
 		__netif_rx_schedule(dev);
 }
 
-/* Try to reschedule poll. Called by dev->poll() after netif_rx_complete().
- * Do not inline this?
- */
+
+static inline void  __netif_rx_reschedule(struct net_device *dev, int undo)
+{
+	dev->quota += undo;
+	list_add_tail(&dev->poll_list, &__get_cpu_var(softnet_data).poll_list);
+	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
+}
+
+/* Try to reschedule poll. Called by dev->poll() after netif_rx_complete(). */
 static inline int netif_rx_reschedule(struct net_device *dev, int undo)
 {
 	if (netif_rx_schedule_prep(dev)) {
 		unsigned long flags;
-
-		dev->quota += undo;
-
 		local_irq_save(flags);
-		list_add_tail(&dev->poll_list, &__get_cpu_var(softnet_data).poll_list);
-		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
+		__netif_rx_reschedule(dev, undo);
 		local_irq_restore(flags);
 		return 1;
 	}
-- 
cgit v1.2.3


From 00522fb41a2a9bf0f98a007c0e2b516a3873148c Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Wed, 26 Apr 2006 14:39:29 +0200
Subject: [PATCH] splice: rearrange moving to/from pipe helpers

We need these for people writing their own ->splice_read/write hooks.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 fs/splice.c               | 35 +++++++++++------------------------
 include/linux/pipe_fs_i.h | 17 +++++++++++++++++
 2 files changed, 28 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/fs/splice.c b/fs/splice.c
index 0b2c1f060cae..447ebc0a37f3 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -29,23 +29,13 @@
 #include <linux/syscalls.h>
 #include <linux/uio.h>
 
-/*
- * Passed to the actors
- */
-struct splice_desc {
-	unsigned int len, total_len;	/* current and remaining length */
-	unsigned int flags;		/* splice flags */
-	struct file *file;		/* file to read/write */
-	loff_t pos;			/* file position */
-};
-
 struct partial_page {
 	unsigned int offset;
 	unsigned int len;
 };
 
 /*
- * Passed to move_to_pipe
+ * Passed to splice_to_pipe
  */
 struct splice_pipe_desc {
 	struct page **pages;		/* page map */
@@ -192,8 +182,8 @@ static struct pipe_buf_operations user_page_pipe_buf_ops = {
  * Pipe output worker. This sets up our pipe format with the page cache
  * pipe buffer operations. Otherwise very similar to the regular pipe_writev().
  */
-static ssize_t move_to_pipe(struct pipe_inode_info *pipe,
-			    struct splice_pipe_desc *spd)
+static ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
+			      struct splice_pipe_desc *spd)
 {
 	int ret, do_wakeup, page_nr;
 
@@ -432,7 +422,7 @@ fill_it:
 	}
 
 	if (spd.nr_pages)
-		return move_to_pipe(pipe, &spd);
+		return splice_to_pipe(pipe, &spd);
 
 	return error;
 }
@@ -666,17 +656,14 @@ out_nomem:
 	return ret;
 }
 
-typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *,
-			   struct splice_desc *);
-
 /*
  * Pipe input worker. Most of this logic works like a regular pipe, the
  * key here is the 'actor' worker passed in that actually moves the data
  * to the wanted destination. See pipe_to_file/pipe_to_sendpage above.
  */
-static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out,
-			      loff_t *ppos, size_t len, unsigned int flags,
-			      splice_actor *actor)
+ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
+			 loff_t *ppos, size_t len, unsigned int flags,
+			 splice_actor *actor)
 {
 	int ret, do_wakeup, err;
 	struct splice_desc sd;
@@ -795,7 +782,7 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 	struct address_space *mapping = out->f_mapping;
 	ssize_t ret;
 
-	ret = move_from_pipe(pipe, out, ppos, len, flags, pipe_to_file);
+	ret = splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file);
 	if (ret > 0) {
 		struct inode *inode = mapping->host;
 
@@ -837,7 +824,7 @@ EXPORT_SYMBOL(generic_file_splice_write);
 ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
 				loff_t *ppos, size_t len, unsigned int flags)
 {
-	return move_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
+	return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
 }
 
 EXPORT_SYMBOL(generic_splice_sendpage);
@@ -924,7 +911,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
 
 		/*
 		 * We don't have an immediate reader, but we'll read the stuff
-		 * out of the pipe right after the move_to_pipe(). So set
+		 * out of the pipe right after the splice_to_pipe(). So set
 		 * PIPE_READERS appropriately.
 		 */
 		pipe->readers = 1;
@@ -1210,7 +1197,7 @@ static long do_vmsplice(struct file *file, const struct iovec __user *iov,
 	if (spd.nr_pages <= 0)
 		return spd.nr_pages;
 
-	return move_to_pipe(pipe, &spd);
+	return splice_to_pipe(pipe, &spd);
 }
 
 asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index ef7f33c0be19..0008d4bd4059 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -61,4 +61,21 @@ void __free_pipe_info(struct pipe_inode_info *);
 				 /* from/to, of course */
 #define SPLICE_F_MORE	(0x04)	/* expect more data */
 
+/*
+ * Passed to the actors
+ */
+struct splice_desc {
+	unsigned int len, total_len;	/* current and remaining length */
+	unsigned int flags;		/* splice flags */
+	struct file *file;		/* file to read/write */
+	loff_t pos;			/* file position */
+};
+
+typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *,
+			   struct splice_desc *);
+
+extern ssize_t splice_from_pipe(struct pipe_inode_info *, struct file *,
+				loff_t *, size_t, unsigned int,
+				splice_actor *);
+
 #endif
-- 
cgit v1.2.3


From ebf43500ef148a380bd132743c3fc530111ac620 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Thu, 27 Apr 2006 08:46:01 +0200
Subject: [PATCH] Add find_get_pages_contig(): contiguous variant of
 find_get_pages()

find_get_pages_contig() will break out if we hit a hole in the page cache.
From Andrew Morton, small modifications and documentation by me.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 include/linux/pagemap.h |  2 ++
 mm/filemap.c            | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 9539efd4f7e6..7a1af574dedf 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -78,6 +78,8 @@ extern struct page * find_or_create_page(struct address_space *mapping,
 				unsigned long index, gfp_t gfp_mask);
 unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
 			unsigned int nr_pages, struct page **pages);
+unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start,
+			       unsigned int nr_pages, struct page **pages);
 unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
 			int tag, unsigned int nr_pages, struct page **pages);
 
diff --git a/mm/filemap.c b/mm/filemap.c
index 3ef20739e725..fd57442186cb 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -697,6 +697,38 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
 	return ret;
 }
 
+/**
+ * find_get_pages_contig - gang contiguous pagecache lookup
+ * @mapping:	The address_space to search
+ * @index:	The starting page index
+ * @nr_pages:	The maximum number of pages
+ * @pages:	Where the resulting pages are placed
+ *
+ * find_get_pages_contig() works exactly like find_get_pages(), except
+ * that the returned number of pages are guaranteed to be contiguous.
+ *
+ * find_get_pages_contig() returns the number of pages which were found.
+ */
+unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
+			       unsigned int nr_pages, struct page **pages)
+{
+	unsigned int i;
+	unsigned int ret;
+
+	read_lock_irq(&mapping->tree_lock);
+	ret = radix_tree_gang_lookup(&mapping->page_tree,
+				(void **)pages, index, nr_pages);
+	for (i = 0; i < ret; i++) {
+		if (pages[i]->mapping == NULL || pages[i]->index != index)
+			break;
+
+		page_cache_get(pages[i]);
+		index++;
+	}
+	read_unlock_irq(&mapping->tree_lock);
+	return i;
+}
+
 /*
  * Like find_get_pages, except we only return pages which are tagged with
  * `tag'.   We update *index to index the next page for the traversal.
-- 
cgit v1.2.3


From 4d17ffda331ba6030bb8c233c73d6a87954d8ea7 Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@vrfy.org>
Date: Tue, 25 Apr 2006 15:37:26 +0200
Subject: [PATCH] Kobject: fix build error

This fixes a build error for various odd combinations of CONFIG_HOTPLUG
and CONFIG_NET.

Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Cc: Nigel Cunningham <ncunningham@cyclades.com>
Cc: Andrew Morton <akpm@osdl.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/kobject.h | 2 +-
 lib/kobject_uevent.c    | 8 +++++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kobject.h b/include/linux/kobject.h
index dcd0623be892..e38bb357282a 100644
--- a/include/linux/kobject.h
+++ b/include/linux/kobject.h
@@ -259,7 +259,7 @@ struct subsys_attribute {
 extern int subsys_create_file(struct subsystem * , struct subsys_attribute *);
 extern void subsys_remove_file(struct subsystem * , struct subsys_attribute *);
 
-#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
+#if defined(CONFIG_HOTPLUG)
 void kobject_uevent(struct kobject *kobj, enum kobject_action action);
 
 int add_uevent_var(char **envp, int num_envp, int *cur_index,
diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
index 982226daf939..7f20e7b857cb 100644
--- a/lib/kobject_uevent.c
+++ b/lib/kobject_uevent.c
@@ -25,11 +25,13 @@
 #define BUFFER_SIZE	2048	/* buffer for the variables */
 #define NUM_ENVP	32	/* number of env pointers */
 
-#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
+#if defined(CONFIG_HOTPLUG)
 u64 uevent_seqnum;
 char uevent_helper[UEVENT_HELPER_PATH_LEN] = "/sbin/hotplug";
 static DEFINE_SPINLOCK(sequence_lock);
+#if defined(CONFIG_NET)
 static struct sock *uevent_sock;
+#endif
 
 static char *action_to_string(enum kobject_action action)
 {
@@ -155,6 +157,7 @@ void kobject_uevent(struct kobject *kobj, enum kobject_action action)
 	spin_unlock(&sequence_lock);
 	sprintf(seq_buff, "SEQNUM=%llu", (unsigned long long)seq);
 
+#if defined(CONFIG_NET)
 	/* send netlink message */
 	if (uevent_sock) {
 		struct sk_buff *skb;
@@ -179,6 +182,7 @@ void kobject_uevent(struct kobject *kobj, enum kobject_action action)
 			netlink_broadcast(uevent_sock, skb, 0, 1, GFP_KERNEL);
 		}
 	}
+#endif
 
 	/* call uevent_helper, usually only enabled during early boot */
 	if (uevent_helper[0]) {
@@ -249,6 +253,7 @@ int add_uevent_var(char **envp, int num_envp, int *cur_index,
 }
 EXPORT_SYMBOL_GPL(add_uevent_var);
 
+#if defined(CONFIG_NET)
 static int __init kobject_uevent_init(void)
 {
 	uevent_sock = netlink_kernel_create(NETLINK_KOBJECT_UEVENT, 1, NULL,
@@ -264,5 +269,6 @@ static int __init kobject_uevent_init(void)
 }
 
 postcore_initcall(kobject_uevent_init);
+#endif
 
 #endif /* CONFIG_HOTPLUG */
-- 
cgit v1.2.3


From bde11d794206ae8d72defd0e8a481181200f7dc4 Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Tue, 18 Apr 2006 21:30:22 -0700
Subject: [PATCH] Fix OCFS2 warning when DEBUG_FS is not enabled

Fix the following warning which happens when OCFS2_FS is enabled but
DEBUG_FS isn't:

fs/ocfs2/dlmglue.c: In function `ocfs2_dlm_init_debug':
fs/ocfs2/dlmglue.c:2036: warning: passing arg 5 of `debugfs_create_file' discards qualifiers from pointer target type

Signed-off-by: Jean Delvare <khali@linux-fr.org>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Joel Becker <Joel.Becker@oracle.com>
Acked-by: Mark Fasheh <mark.fasheh@oracle.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/debugfs.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index 176e2d371577..047567d34ca7 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -58,9 +58,8 @@ struct dentry *debugfs_create_blob(const char *name, mode_t mode,
  */
 
 static inline struct dentry *debugfs_create_file(const char *name, mode_t mode,
-						 struct dentry *parent,
-						 void *data,
-						 struct file_operations *fops)
+					struct dentry *parent, void *data,
+					const struct file_operations *fops)
 {
 	return ERR_PTR(-ENODEV);
 }
-- 
cgit v1.2.3


From 5b3ef14e3e9d745a512d65fcb4ef9be541226d80 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Sat, 22 Apr 2006 12:14:44 +0200
Subject: [PATCH] Kobject: possible cleanups

This patch contains the following possible cleanups:
- #if 0 the following unused global function:
  - subsys_remove_file()
- remove the following unused EXPORT_SYMBOL's:
  - kset_find_obj
  - subsystem_init
- remove the following unused EXPORT_SYMBOL_GPL:
  - kobject_add_dir

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/kobject.h | 1 -
 lib/kobject.c           | 7 ++-----
 2 files changed, 2 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kobject.h b/include/linux/kobject.h
index e38bb357282a..c187c53cecd0 100644
--- a/include/linux/kobject.h
+++ b/include/linux/kobject.h
@@ -257,7 +257,6 @@ struct subsys_attribute {
 };
 
 extern int subsys_create_file(struct subsystem * , struct subsys_attribute *);
-extern void subsys_remove_file(struct subsystem * , struct subsys_attribute *);
 
 #if defined(CONFIG_HOTPLUG)
 void kobject_uevent(struct kobject *kobj, enum kobject_action action);
diff --git a/lib/kobject.c b/lib/kobject.c
index 01d957513940..b46350c27837 100644
--- a/lib/kobject.c
+++ b/lib/kobject.c
@@ -422,7 +422,6 @@ struct kobject *kobject_add_dir(struct kobject *parent, const char *name)
 
 	return k;
 }
-EXPORT_SYMBOL_GPL(kobject_add_dir);
 
 /**
  *	kset_init - initialize a kset for use
@@ -569,7 +568,7 @@ int subsys_create_file(struct subsystem * s, struct subsys_attribute * a)
  *	@s:	subsystem.
  *	@a:	attribute desciptor.
  */
-
+#if 0
 void subsys_remove_file(struct subsystem * s, struct subsys_attribute * a)
 {
 	if (subsys_get(s)) {
@@ -577,6 +576,7 @@ void subsys_remove_file(struct subsystem * s, struct subsys_attribute * a)
 		subsys_put(s);
 	}
 }
+#endif  /*  0  */
 
 EXPORT_SYMBOL(kobject_init);
 EXPORT_SYMBOL(kobject_register);
@@ -588,10 +588,7 @@ EXPORT_SYMBOL(kobject_del);
 
 EXPORT_SYMBOL(kset_register);
 EXPORT_SYMBOL(kset_unregister);
-EXPORT_SYMBOL(kset_find_obj);
 
-EXPORT_SYMBOL(subsystem_init);
 EXPORT_SYMBOL(subsystem_register);
 EXPORT_SYMBOL(subsystem_unregister);
 EXPORT_SYMBOL(subsys_create_file);
-EXPORT_SYMBOL(subsys_remove_file);
-- 
cgit v1.2.3


From 13e87ec68641fd54f3fa04eef3419d034ed2115a Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Thu, 27 Apr 2006 18:39:18 -0700
Subject: [PATCH] request_irq(): remove warnings from irq probing

- Add new SA_PROBEIRQ which suppresses the new sharing-mismatch warning.
  Some drivers like to use request_irq() to find an unused interrupt slot.

- Use it in i82365.c

- Kill unused SA_PROBE.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/pcmcia/i82365.c     | 7 ++++---
 include/asm-xtensa/signal.h | 2 +-
 include/linux/signal.h      | 4 +++-
 kernel/irq/manage.c         | 6 ++++--
 4 files changed, 12 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pcmcia/i82365.c b/drivers/pcmcia/i82365.c
index bd0308e89815..a2f05f485156 100644
--- a/drivers/pcmcia/i82365.c
+++ b/drivers/pcmcia/i82365.c
@@ -509,7 +509,8 @@ static irqreturn_t i365_count_irq(int irq, void *dev, struct pt_regs *regs)
 static u_int __init test_irq(u_short sock, int irq)
 {
     debug(2, "  testing ISA irq %d\n", irq);
-    if (request_irq(irq, i365_count_irq, 0, "scan", i365_count_irq) != 0)
+    if (request_irq(irq, i365_count_irq, SA_PROBEIRQ, "scan",
+			i365_count_irq) != 0)
 	return 1;
     irq_hits = 0; irq_sock = sock;
     msleep(10);
@@ -561,7 +562,7 @@ static u_int __init isa_scan(u_short sock, u_int mask0)
     } else {
 	/* Fallback: just find interrupts that aren't in use */
 	for (i = 0; i < 16; i++)
-	    if ((mask0 & (1 << i)) && (_check_irq(i, 0) == 0))
+	    if ((mask0 & (1 << i)) && (_check_irq(i, SA_PROBEIRQ) == 0))
 		mask1 |= (1 << i);
 	printk("default");
 	/* If scan failed, default to polled status */
@@ -725,7 +726,7 @@ static void __init add_pcic(int ns, int type)
 	u_int cs_mask = mask & ((cs_irq) ? (1<<cs_irq) : ~(1<<12));
 	for (cs_irq = 15; cs_irq > 0; cs_irq--)
 	    if ((cs_mask & (1 << cs_irq)) &&
-		(_check_irq(cs_irq, 0) == 0))
+		(_check_irq(cs_irq, SA_PROBEIRQ) == 0))
 		break;
 	if (cs_irq) {
 	    grab_irq = 1;
diff --git a/include/asm-xtensa/signal.h b/include/asm-xtensa/signal.h
index 5d6fc9cdf58d..a99c9aec64ec 100644
--- a/include/asm-xtensa/signal.h
+++ b/include/asm-xtensa/signal.h
@@ -118,9 +118,9 @@ typedef struct {
  * SA_INTERRUPT is also used by the irq handling routines.
  * SA_SHIRQ is for shared interrupt support on PCI and EISA.
  */
-#define SA_PROBE		SA_ONESHOT
 #define SA_SAMPLE_RANDOM	SA_RESTART
 #define SA_SHIRQ		0x04000000
+#define SA_PROBEIRQ		0x08000000
 #endif
 
 #define SIG_BLOCK          0	/* for blocking signals */
diff --git a/include/linux/signal.h b/include/linux/signal.h
index 162a8fd10b29..70739f51a09f 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -14,10 +14,12 @@
  *
  * SA_INTERRUPT is also used by the irq handling routines.
  * SA_SHIRQ is for shared interrupt support on PCI and EISA.
+ * SA_PROBEIRQ is set by callers when they expect sharing mismatches to occur
  */
-#define SA_PROBE		SA_ONESHOT
 #define SA_SAMPLE_RANDOM	SA_RESTART
 #define SA_SHIRQ		0x04000000
+#define SA_PROBEIRQ		0x08000000
+
 /*
  * As above, these correspond to the IORESOURCE_IRQ_* defines in
  * linux/ioport.h to select the interrupt line behaviour.  When
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index ac766ad573e8..1279e3499534 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -246,8 +246,10 @@ int setup_irq(unsigned int irq, struct irqaction * new)
 
 mismatch:
 	spin_unlock_irqrestore(&desc->lock, flags);
-	printk(KERN_ERR "%s: irq handler mismatch\n", __FUNCTION__);
-	dump_stack();
+	if (!(new->flags & SA_PROBEIRQ)) {
+		printk(KERN_ERR "%s: irq handler mismatch\n", __FUNCTION__);
+		dump_stack();
+	}
 	return -EBUSY;
 }
 
-- 
cgit v1.2.3


From 08791e5cf62b6952ca32106aebb79b6066005de4 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dtor_core@ameritech.net>
Date: Sat, 29 Apr 2006 01:13:21 -0400
Subject: Input: ressurect EVIOCGREP and EVIOCSREP

While writing to an event device allows to set repeat rate for an
individual input device there is no way to retrieve current settings
so we need to ressurect EVIOCGREP. Also ressurect EVIOCSREP so we
have a symmetrical interface.

Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/input/evdev.c | 21 +++++++++++++++++++++
 include/linux/input.h |  2 ++
 2 files changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c
index a34e3d91d9ed..ba325f16d077 100644
--- a/drivers/input/evdev.c
+++ b/drivers/input/evdev.c
@@ -403,6 +403,27 @@ static long evdev_ioctl_handler(struct file *file, unsigned int cmd,
 		case EVIOCGID:
 			if (copy_to_user(p, &dev->id, sizeof(struct input_id)))
 				return -EFAULT;
+			return 0;
+
+		case EVIOCGREP:
+			if (!test_bit(EV_REP, dev->evbit))
+				return -ENOSYS;
+			if (put_user(dev->rep[REP_DELAY], ip))
+				return -EFAULT;
+			if (put_user(dev->rep[REP_PERIOD], ip + 1))
+				return -EFAULT;
+			return 0;
+
+		case EVIOCSREP:
+			if (!test_bit(EV_REP, dev->evbit))
+				return -ENOSYS;
+			if (get_user(u, ip))
+				return -EFAULT;
+			if (get_user(v, ip + 1))
+				return -EFAULT;
+
+			input_event(dev, EV_REP, REP_DELAY, u);
+			input_event(dev, EV_REP, REP_PERIOD, v);
 
 			return 0;
 
diff --git a/include/linux/input.h b/include/linux/input.h
index 8298b4bf5a07..50e338d2ffda 100644
--- a/include/linux/input.h
+++ b/include/linux/input.h
@@ -56,6 +56,8 @@ struct input_absinfo {
 
 #define EVIOCGVERSION		_IOR('E', 0x01, int)			/* get driver version */
 #define EVIOCGID		_IOR('E', 0x02, struct input_id)	/* get device ID */
+#define EVIOCGREP		_IOR('E', 0x03, int[2])			/* get repeat settings */
+#define EVIOCSREP		_IOW('E', 0x03, int[2])			/* set repeat settings */
 #define EVIOCGKEYCODE		_IOR('E', 0x04, int[2])			/* get keycode */
 #define EVIOCSKEYCODE		_IOW('E', 0x04, int[2])			/* set keycode */
 
-- 
cgit v1.2.3


From da753beaeb1446aa87bcca7e8a0026633a8914f0 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <mita@miraclelinux.com>
Date: Fri, 28 Apr 2006 15:21:23 -0700
Subject: [NET]: use hlist_unhashed()

Use hlist_unhashed() rather than accessing inside data structure.

Signed-off-by: Akinobu Mita <mita@miraclelinux.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/list.h             | 2 +-
 include/net/inet_timewait_sock.h | 2 +-
 include/net/sock.h               | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/list.h b/include/linux/list.h
index 67258b47e9ca..76f05718342c 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -619,7 +619,7 @@ static inline void hlist_del_rcu(struct hlist_node *n)
 
 static inline void hlist_del_init(struct hlist_node *n)
 {
-	if (n->pprev)  {
+	if (!hlist_unhashed(n)) {
 		__hlist_del(n);
 		INIT_HLIST_NODE(n);
 	}
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index 1da294c47522..e837f98fdb50 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -150,7 +150,7 @@ static inline void inet_twsk_add_bind_node(struct inet_timewait_sock *tw,
 
 static inline int inet_twsk_dead_hashed(const struct inet_timewait_sock *tw)
 {
-	return tw->tw_death_node.pprev != NULL;
+	return !hlist_unhashed(&tw->tw_death_node);
 }
 
 static inline void inet_twsk_dead_node_init(struct inet_timewait_sock *tw)
diff --git a/include/net/sock.h b/include/net/sock.h
index ff8b0dad7b0f..c9fad6fb629b 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -279,7 +279,7 @@ static inline int sk_unhashed(const struct sock *sk)
 
 static inline int sk_hashed(const struct sock *sk)
 {
-	return sk->sk_node.pprev != NULL;
+	return !sk_unhashed(sk);
 }
 
 static __inline__ void sk_node_init(struct hlist_node *node)
-- 
cgit v1.2.3


From 5411be59db80333039386f3b1ccfe5eb9023a916 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 29 Mar 2006 20:23:36 -0500
Subject: [PATCH] drop task argument of audit_syscall_{entry,exit}

... it's always current, and that's a good thing - allows simpler locking.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/i386/kernel/ptrace.c    | 7 +++----
 arch/i386/kernel/vm86.c      | 2 +-
 arch/ia64/kernel/ptrace.c    | 4 ++--
 arch/mips/kernel/ptrace.c    | 4 ++--
 arch/powerpc/kernel/ptrace.c | 5 ++---
 arch/s390/kernel/ptrace.c    | 5 ++---
 arch/sparc64/kernel/ptrace.c | 5 ++---
 arch/um/kernel/ptrace.c      | 6 ++----
 arch/x86_64/kernel/ptrace.c  | 6 +++---
 include/linux/audit.h        | 8 ++++----
 kernel/auditsc.c             | 8 ++++----
 11 files changed, 27 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c
index 506462ef36a0..fd7eaf7866e0 100644
--- a/arch/i386/kernel/ptrace.c
+++ b/arch/i386/kernel/ptrace.c
@@ -671,7 +671,7 @@ int do_syscall_trace(struct pt_regs *regs, int entryexit)
 
 	if (unlikely(current->audit_context)) {
 		if (entryexit)
-			audit_syscall_exit(current, AUDITSC_RESULT(regs->eax),
+			audit_syscall_exit(AUDITSC_RESULT(regs->eax),
 						regs->eax);
 		/* Debug traps, when using PTRACE_SINGLESTEP, must be sent only
 		 * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is
@@ -720,14 +720,13 @@ int do_syscall_trace(struct pt_regs *regs, int entryexit)
 	ret = is_sysemu;
 out:
 	if (unlikely(current->audit_context) && !entryexit)
-		audit_syscall_entry(current, AUDIT_ARCH_I386, regs->orig_eax,
+		audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_eax,
 				    regs->ebx, regs->ecx, regs->edx, regs->esi);
 	if (ret == 0)
 		return 0;
 
 	regs->orig_eax = -1; /* force skip of syscall restarting */
 	if (unlikely(current->audit_context))
-		audit_syscall_exit(current, AUDITSC_RESULT(regs->eax),
-				regs->eax);
+		audit_syscall_exit(AUDITSC_RESULT(regs->eax), regs->eax);
 	return 1;
 }
diff --git a/arch/i386/kernel/vm86.c b/arch/i386/kernel/vm86.c
index aee14fafd13d..00e0118e717c 100644
--- a/arch/i386/kernel/vm86.c
+++ b/arch/i386/kernel/vm86.c
@@ -312,7 +312,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
 
 	/*call audit_syscall_exit since we do not exit via the normal paths */
 	if (unlikely(current->audit_context))
-		audit_syscall_exit(current, AUDITSC_RESULT(eax), eax);
+		audit_syscall_exit(AUDITSC_RESULT(eax), eax);
 
 	__asm__ __volatile__(
 		"movl %0,%%esp\n\t"
diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c
index 9887c8787e7a..e61e15e28d8b 100644
--- a/arch/ia64/kernel/ptrace.c
+++ b/arch/ia64/kernel/ptrace.c
@@ -1644,7 +1644,7 @@ syscall_trace_enter (long arg0, long arg1, long arg2, long arg3,
 			arch = AUDIT_ARCH_IA64;
 		}
 
-		audit_syscall_entry(current, arch, syscall, arg0, arg1, arg2, arg3);
+		audit_syscall_entry(arch, syscall, arg0, arg1, arg2, arg3);
 	}
 
 }
@@ -1662,7 +1662,7 @@ syscall_trace_leave (long arg0, long arg1, long arg2, long arg3,
 
 		if (success != AUDITSC_SUCCESS)
 			result = -result;
-		audit_syscall_exit(current, success, result);
+		audit_syscall_exit(success, result);
 	}
 
 	if (test_thread_flag(TIF_SYSCALL_TRACE)
diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c
index f3106d0771b0..9b4733c12395 100644
--- a/arch/mips/kernel/ptrace.c
+++ b/arch/mips/kernel/ptrace.c
@@ -483,7 +483,7 @@ static inline int audit_arch(void)
 asmlinkage void do_syscall_trace(struct pt_regs *regs, int entryexit)
 {
 	if (unlikely(current->audit_context) && entryexit)
-		audit_syscall_exit(current, AUDITSC_RESULT(regs->regs[2]),
+		audit_syscall_exit(AUDITSC_RESULT(regs->regs[2]),
 		                   regs->regs[2]);
 
 	if (!(current->ptrace & PT_PTRACED))
@@ -507,7 +507,7 @@ asmlinkage void do_syscall_trace(struct pt_regs *regs, int entryexit)
 	}
  out:
 	if (unlikely(current->audit_context) && !entryexit)
-		audit_syscall_entry(current, audit_arch(), regs->regs[2],
+		audit_syscall_entry(audit_arch(), regs->regs[2],
 				    regs->regs[4], regs->regs[5],
 				    regs->regs[6], regs->regs[7]);
 }
diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index bcb83574335b..4a677d1bd4ef 100644
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -538,7 +538,7 @@ void do_syscall_trace_enter(struct pt_regs *regs)
 		do_syscall_trace();
 
 	if (unlikely(current->audit_context))
-		audit_syscall_entry(current,
+		audit_syscall_entry(
 #ifdef CONFIG_PPC32
 				    AUDIT_ARCH_PPC,
 #else
@@ -556,8 +556,7 @@ void do_syscall_trace_leave(struct pt_regs *regs)
 #endif
 
 	if (unlikely(current->audit_context))
-		audit_syscall_exit(current,
-				   (regs->ccr&0x1000)?AUDITSC_FAILURE:AUDITSC_SUCCESS,
+		audit_syscall_exit((regs->ccr&0x1000)?AUDITSC_FAILURE:AUDITSC_SUCCESS,
 				   regs->result);
 
 	if ((test_thread_flag(TIF_SYSCALL_TRACE)
diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index 37dfe33dab73..8f36504075ed 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -734,7 +734,7 @@ asmlinkage void
 syscall_trace(struct pt_regs *regs, int entryexit)
 {
 	if (unlikely(current->audit_context) && entryexit)
-		audit_syscall_exit(current, AUDITSC_RESULT(regs->gprs[2]), regs->gprs[2]);
+		audit_syscall_exit(AUDITSC_RESULT(regs->gprs[2]), regs->gprs[2]);
 
 	if (!test_thread_flag(TIF_SYSCALL_TRACE))
 		goto out;
@@ -761,8 +761,7 @@ syscall_trace(struct pt_regs *regs, int entryexit)
 	}
  out:
 	if (unlikely(current->audit_context) && !entryexit)
-		audit_syscall_entry(current, 
-				    test_thread_flag(TIF_31BIT)?AUDIT_ARCH_S390:AUDIT_ARCH_S390X,
+		audit_syscall_entry(test_thread_flag(TIF_31BIT)?AUDIT_ARCH_S390:AUDIT_ARCH_S390X,
 				    regs->gprs[2], regs->orig_gpr2, regs->gprs[3],
 				    regs->gprs[4], regs->gprs[5]);
 }
diff --git a/arch/sparc64/kernel/ptrace.c b/arch/sparc64/kernel/ptrace.c
index 49e6dedd027d..d31975e6d6f6 100644
--- a/arch/sparc64/kernel/ptrace.c
+++ b/arch/sparc64/kernel/ptrace.c
@@ -653,7 +653,7 @@ asmlinkage void syscall_trace(struct pt_regs *regs, int syscall_exit_p)
 		if (unlikely(tstate & (TSTATE_XCARRY | TSTATE_ICARRY)))
 			result = AUDITSC_FAILURE;
 
-		audit_syscall_exit(current, result, regs->u_regs[UREG_I0]);
+		audit_syscall_exit(result, regs->u_regs[UREG_I0]);
 	}
 
 	if (!(current->ptrace & PT_PTRACED))
@@ -677,8 +677,7 @@ asmlinkage void syscall_trace(struct pt_regs *regs, int syscall_exit_p)
 
 out:
 	if (unlikely(current->audit_context) && !syscall_exit_p)
-		audit_syscall_entry(current,
-				    (test_thread_flag(TIF_32BIT) ?
+		audit_syscall_entry((test_thread_flag(TIF_32BIT) ?
 				     AUDIT_ARCH_SPARC :
 				     AUDIT_ARCH_SPARC64),
 				    regs->u_regs[UREG_G1],
diff --git a/arch/um/kernel/ptrace.c b/arch/um/kernel/ptrace.c
index 60d2eda995c1..9a77fb3c269d 100644
--- a/arch/um/kernel/ptrace.c
+++ b/arch/um/kernel/ptrace.c
@@ -275,15 +275,13 @@ void syscall_trace(union uml_pt_regs *regs, int entryexit)
 
 	if (unlikely(current->audit_context)) {
 		if (!entryexit)
-			audit_syscall_entry(current,
-                                            HOST_AUDIT_ARCH,
+			audit_syscall_entry(HOST_AUDIT_ARCH,
 					    UPT_SYSCALL_NR(regs),
 					    UPT_SYSCALL_ARG1(regs),
 					    UPT_SYSCALL_ARG2(regs),
 					    UPT_SYSCALL_ARG3(regs),
 					    UPT_SYSCALL_ARG4(regs));
-		else audit_syscall_exit(current,
-                                        AUDITSC_RESULT(UPT_SYSCALL_RET(regs)),
+		else audit_syscall_exit(AUDITSC_RESULT(UPT_SYSCALL_RET(regs)),
                                         UPT_SYSCALL_RET(regs));
 	}
 
diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c
index da8e7903d817..2d50024c9f30 100644
--- a/arch/x86_64/kernel/ptrace.c
+++ b/arch/x86_64/kernel/ptrace.c
@@ -600,12 +600,12 @@ asmlinkage void syscall_trace_enter(struct pt_regs *regs)
 
 	if (unlikely(current->audit_context)) {
 		if (test_thread_flag(TIF_IA32)) {
-			audit_syscall_entry(current, AUDIT_ARCH_I386,
+			audit_syscall_entry(AUDIT_ARCH_I386,
 					    regs->orig_rax,
 					    regs->rbx, regs->rcx,
 					    regs->rdx, regs->rsi);
 		} else {
-			audit_syscall_entry(current, AUDIT_ARCH_X86_64,
+			audit_syscall_entry(AUDIT_ARCH_X86_64,
 					    regs->orig_rax,
 					    regs->rdi, regs->rsi,
 					    regs->rdx, regs->r10);
@@ -616,7 +616,7 @@ asmlinkage void syscall_trace_enter(struct pt_regs *regs)
 asmlinkage void syscall_trace_leave(struct pt_regs *regs)
 {
 	if (unlikely(current->audit_context))
-		audit_syscall_exit(current, AUDITSC_RESULT(regs->rax), regs->rax);
+		audit_syscall_exit(AUDITSC_RESULT(regs->rax), regs->rax);
 
 	if ((test_thread_flag(TIF_SYSCALL_TRACE)
 	     || test_thread_flag(TIF_SINGLESTEP))
diff --git a/include/linux/audit.h b/include/linux/audit.h
index 1c47c59058c1..39fef6ebb854 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -287,10 +287,10 @@ struct netlink_skb_parms;
 				/* Public API */
 extern int  audit_alloc(struct task_struct *task);
 extern void audit_free(struct task_struct *task);
-extern void audit_syscall_entry(struct task_struct *task, int arch,
+extern void audit_syscall_entry(int arch,
 				int major, unsigned long a0, unsigned long a1,
 				unsigned long a2, unsigned long a3);
-extern void audit_syscall_exit(struct task_struct *task, int failed, long return_code);
+extern void audit_syscall_exit(int failed, long return_code);
 extern void audit_getname(const char *name);
 extern void audit_putname(const char *name);
 extern void __audit_inode(const char *name, const struct inode *inode, unsigned flags);
@@ -323,8 +323,8 @@ extern int audit_set_macxattr(const char *name);
 #else
 #define audit_alloc(t) ({ 0; })
 #define audit_free(t) do { ; } while (0)
-#define audit_syscall_entry(t,ta,a,b,c,d,e) do { ; } while (0)
-#define audit_syscall_exit(t,f,r) do { ; } while (0)
+#define audit_syscall_entry(ta,a,b,c,d,e) do { ; } while (0)
+#define audit_syscall_exit(f,r) do { ; } while (0)
 #define audit_getname(n) do { ; } while (0)
 #define audit_putname(n) do { ; } while (0)
 #define __audit_inode(n,i,f) do { ; } while (0)
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index ba0ec1ba6698..7ed82b088e4b 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -736,10 +736,11 @@ void audit_free(struct task_struct *tsk)
  * will only be written if another part of the kernel requests that it
  * be written).
  */
-void audit_syscall_entry(struct task_struct *tsk, int arch, int major,
+void audit_syscall_entry(int arch, int major,
 			 unsigned long a1, unsigned long a2,
 			 unsigned long a3, unsigned long a4)
 {
+	struct task_struct *tsk = current;
 	struct audit_context *context = tsk->audit_context;
 	enum audit_state     state;
 
@@ -817,12 +818,11 @@ void audit_syscall_entry(struct task_struct *tsk, int arch, int major,
  * message), then write out the syscall information.  In call cases,
  * free the names stored from getname().
  */
-void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code)
+void audit_syscall_exit(int valid, long return_code)
 {
+	struct task_struct *tsk = current;
 	struct audit_context *context;
 
-	/* tsk == current */
-
 	get_task_struct(tsk);
 	task_lock(tsk);
 	context = audit_get_context(tsk, valid, return_code);
-- 
cgit v1.2.3


From 376bd9cb357ec945ac893feaeb63af7370a6e70b Mon Sep 17 00:00:00 2001
From: Darrel Goeddel <dgoeddel@trustedcs.com>
Date: Fri, 24 Feb 2006 15:44:05 -0600
Subject: [PATCH] support for context based audit filtering

The following patch provides selinux interfaces that will allow the audit
system to perform filtering based on the process context (user, role, type,
sensitivity, and clearance).  These interfaces will allow the selinux
module to perform efficient matches based on lower level selinux constructs,
rather than relying on context retrievals and string comparisons within
the audit module.  It also allows for dominance checks on the mls portion
of the contexts that are impossible with only string comparisons.

Signed-off-by: Darrel Goeddel <dgoeddel@trustedcs.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/audit.h          |   5 +
 include/linux/selinux.h        | 112 ++++++++++++++++++++
 security/selinux/Makefile      |   2 +-
 security/selinux/avc.c         |  13 +--
 security/selinux/exports.c     |  28 +++++
 security/selinux/ss/mls.c      |  30 +++++-
 security/selinux/ss/mls.h      |   4 +-
 security/selinux/ss/services.c | 235 ++++++++++++++++++++++++++++++++++++++++-
 8 files changed, 419 insertions(+), 10 deletions(-)
 create mode 100644 include/linux/selinux.h
 create mode 100644 security/selinux/exports.c

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 39fef6ebb854..740f950397b7 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -145,6 +145,11 @@
 #define AUDIT_PERS	10
 #define AUDIT_ARCH	11
 #define AUDIT_MSGTYPE	12
+#define AUDIT_SE_USER	13	/* security label user */
+#define AUDIT_SE_ROLE	14	/* security label role */
+#define AUDIT_SE_TYPE	15	/* security label type */
+#define AUDIT_SE_SEN	16	/* security label sensitivity label */
+#define AUDIT_SE_CLR	17	/* security label clearance label */
 
 				/* These are ONLY useful when checking
 				 * at syscall exit time (AUDIT_AT_EXIT). */
diff --git a/include/linux/selinux.h b/include/linux/selinux.h
new file mode 100644
index 000000000000..9d684b1728b0
--- /dev/null
+++ b/include/linux/selinux.h
@@ -0,0 +1,112 @@
+/*
+ * SELinux services exported to the rest of the kernel.
+ *
+ * Author: James Morris <jmorris@redhat.com>
+ *
+ * Copyright (C) 2005 Red Hat, Inc., James Morris <jmorris@redhat.com>
+ * Copyright (C) 2006 Trusted Computer Solutions, Inc. <dgoeddel@trustedcs.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2,
+ * as published by the Free Software Foundation.
+ */
+#ifndef _LINUX_SELINUX_H
+#define _LINUX_SELINUX_H
+
+struct selinux_audit_rule;
+struct audit_context;
+
+#ifdef CONFIG_SECURITY_SELINUX
+
+/**
+ *	selinux_audit_rule_init - alloc/init an selinux audit rule structure.
+ *	@field: the field this rule refers to
+ *	@op: the operater the rule uses
+ *	@rulestr: the text "target" of the rule
+ *	@rule: pointer to the new rule structure returned via this
+ *
+ *	Returns 0 if successful, -errno if not.  On success, the rule structure
+ *	will be allocated internally.  The caller must free this structure with
+ *	selinux_audit_rule_free() after use.
+ */
+int selinux_audit_rule_init(u32 field, u32 op, char *rulestr,
+                            struct selinux_audit_rule **rule);
+
+/**
+ *	selinux_audit_rule_free - free an selinux audit rule structure.
+ *	@rule: pointer to the audit rule to be freed
+ *
+ *	This will free all memory associated with the given rule.
+ *	If @rule is NULL, no operation is performed.
+ */
+void selinux_audit_rule_free(struct selinux_audit_rule *rule);
+
+/**
+ *	selinux_audit_rule_match - determine if a context ID matches a rule.
+ *	@ctxid: the context ID to check
+ *	@field: the field this rule refers to
+ *	@op: the operater the rule uses
+ *	@rule: pointer to the audit rule to check against
+ *	@actx: the audit context (can be NULL) associated with the check
+ *
+ *	Returns 1 if the context id matches the rule, 0 if it does not, and
+ *	-errno on failure.
+ */
+int selinux_audit_rule_match(u32 ctxid, u32 field, u32 op,
+                             struct selinux_audit_rule *rule,
+                             struct audit_context *actx);
+
+/**
+ *	selinux_audit_set_callback - set the callback for policy reloads.
+ *	@callback: the function to call when the policy is reloaded
+ *
+ *	This sets the function callback function that will update the rules
+ *	upon policy reloads.  This callback should rebuild all existing rules
+ *	using selinux_audit_rule_init().
+ */
+void selinux_audit_set_callback(int (*callback)(void));
+
+/**
+ *	selinux_task_ctxid - determine a context ID for a process.
+ *	@tsk: the task object
+ *	@ctxid: ID value returned via this
+ *
+ *	On return, ctxid will contain an ID for the context.  This value
+ *	should only be used opaquely.
+ */
+void selinux_task_ctxid(struct task_struct *tsk, u32 *ctxid);
+
+#else
+
+static inline int selinux_audit_rule_init(u32 field, u32 op,
+                                          char *rulestr,
+                                          struct selinux_audit_rule **rule)
+{
+	return -ENOTSUPP;
+}
+
+static inline void selinux_audit_rule_free(struct selinux_audit_rule *rule)
+{
+	return;
+}
+
+static inline int selinux_audit_rule_match(u32 ctxid, u32 field, u32 op,
+                                           struct selinux_audit_rule *rule,
+                                           struct audit_context *actx)
+{
+	return 0;
+}
+
+static inline void selinux_audit_set_callback(int (*callback)(void))
+{
+	return;
+}
+
+static inline void selinux_task_ctxid(struct task_struct *tsk, u32 *ctxid)
+{
+	*ctxid = 0;
+}
+
+#endif	/* CONFIG_SECURITY_SELINUX */
+
+#endif /* _LINUX_SELINUX_H */
diff --git a/security/selinux/Makefile b/security/selinux/Makefile
index 688c0a267b62..faf2e02e4410 100644
--- a/security/selinux/Makefile
+++ b/security/selinux/Makefile
@@ -4,7 +4,7 @@
 
 obj-$(CONFIG_SECURITY_SELINUX) := selinux.o ss/
 
-selinux-y := avc.o hooks.o selinuxfs.o netlink.o nlmsgtab.o netif.o
+selinux-y := avc.o hooks.o selinuxfs.o netlink.o nlmsgtab.o netif.o exports.o
 
 selinux-$(CONFIG_SECURITY_NETWORK_XFRM) += xfrm.o
 
diff --git a/security/selinux/avc.c b/security/selinux/avc.c
index ac5d69bb3377..a300702da527 100644
--- a/security/selinux/avc.c
+++ b/security/selinux/avc.c
@@ -800,7 +800,7 @@ out:
 int avc_ss_reset(u32 seqno)
 {
 	struct avc_callback_node *c;
-	int i, rc = 0;
+	int i, rc = 0, tmprc;
 	unsigned long flag;
 	struct avc_node *node;
 
@@ -813,15 +813,16 @@ int avc_ss_reset(u32 seqno)
 
 	for (c = avc_callbacks; c; c = c->next) {
 		if (c->events & AVC_CALLBACK_RESET) {
-			rc = c->callback(AVC_CALLBACK_RESET,
-					 0, 0, 0, 0, NULL);
-			if (rc)
-				goto out;
+			tmprc = c->callback(AVC_CALLBACK_RESET,
+			                    0, 0, 0, 0, NULL);
+			/* save the first error encountered for the return
+			   value and continue processing the callbacks */
+			if (!rc)
+				rc = tmprc;
 		}
 	}
 
 	avc_latest_notif_update(seqno, 0);
-out:
 	return rc;
 }
 
diff --git a/security/selinux/exports.c b/security/selinux/exports.c
new file mode 100644
index 000000000000..333c4c7824d8
--- /dev/null
+++ b/security/selinux/exports.c
@@ -0,0 +1,28 @@
+/*
+ * SELinux services exported to the rest of the kernel.
+ *
+ * Author: James Morris <jmorris@redhat.com>
+ *
+ * Copyright (C) 2005 Red Hat, Inc., James Morris <jmorris@redhat.com>
+ * Copyright (C) 2006 Trusted Computer Solutions, Inc. <dgoeddel@trustedcs.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2,
+ * as published by the Free Software Foundation.
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/selinux.h>
+
+#include "security.h"
+#include "objsec.h"
+
+void selinux_task_ctxid(struct task_struct *tsk, u32 *ctxid)
+{
+	struct task_security_struct *tsec = tsk->security;
+	if (selinux_enabled)
+		*ctxid = tsec->sid;
+	else
+		*ctxid = 0;
+}
diff --git a/security/selinux/ss/mls.c b/security/selinux/ss/mls.c
index 84047f69f9c1..7bc5b6440f70 100644
--- a/security/selinux/ss/mls.c
+++ b/security/selinux/ss/mls.c
@@ -8,7 +8,7 @@
  *
  *	Support for enhanced MLS infrastructure.
  *
- * Copyright (C) 2004-2005 Trusted Computer Solutions, Inc.
+ * Copyright (C) 2004-2006 Trusted Computer Solutions, Inc.
  */
 
 #include <linux/kernel.h>
@@ -384,6 +384,34 @@ out:
 	return rc;
 }
 
+/*
+ * Set the MLS fields in the security context structure
+ * `context' based on the string representation in
+ * the string `str'.  This function will allocate temporary memory with the
+ * given constraints of gfp_mask.
+ */
+int mls_from_string(char *str, struct context *context, gfp_t gfp_mask)
+{
+	char *tmpstr, *freestr;
+	int rc;
+
+	if (!selinux_mls_enabled)
+		return -EINVAL;
+
+	/* we need freestr because mls_context_to_sid will change
+	   the value of tmpstr */
+	tmpstr = freestr = kstrdup(str, gfp_mask);
+	if (!tmpstr) {
+		rc = -ENOMEM;
+	} else {
+		rc = mls_context_to_sid(':', &tmpstr, context,
+		                        NULL, SECSID_NULL);
+		kfree(freestr);
+	}
+
+	return rc;
+}
+
 /*
  * Copies the effective MLS range from `src' into `dst'.
  */
diff --git a/security/selinux/ss/mls.h b/security/selinux/ss/mls.h
index 03de697c8058..fbb42f07dd7c 100644
--- a/security/selinux/ss/mls.h
+++ b/security/selinux/ss/mls.h
@@ -8,7 +8,7 @@
  *
  *	Support for enhanced MLS infrastructure.
  *
- * Copyright (C) 2004-2005 Trusted Computer Solutions, Inc.
+ * Copyright (C) 2004-2006 Trusted Computer Solutions, Inc.
  */
 
 #ifndef _SS_MLS_H_
@@ -27,6 +27,8 @@ int mls_context_to_sid(char oldc,
 		       struct sidtab *s,
 		       u32 def_sid);
 
+int mls_from_string(char *str, struct context *context, gfp_t gfp_mask);
+
 int mls_convert_context(struct policydb *oldp,
 			struct policydb *newp,
 			struct context *context);
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index 61492485de84..7177e98df7f3 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -7,12 +7,13 @@
  * Updated: Trusted Computer Solutions, Inc. <dgoeddel@trustedcs.com>
  *
  *	Support for enhanced MLS infrastructure.
+ *	Support for context based audit filters.
  *
  * Updated: Frank Mayer <mayerf@tresys.com> and Karl MacMillan <kmacmillan@tresys.com>
  *
  * 	Added conditional policy language extensions
  *
- * Copyright (C) 2004-2005 Trusted Computer Solutions, Inc.
+ * Copyright (C) 2004-2006 Trusted Computer Solutions, Inc.
  * Copyright (C) 2003 - 2004 Tresys Technology, LLC
  * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com>
  *	This program is free software; you can redistribute it and/or modify
@@ -1811,3 +1812,235 @@ out:
 	POLICY_RDUNLOCK;
 	return rc;
 }
+
+struct selinux_audit_rule {
+	u32 au_seqno;
+	struct context au_ctxt;
+};
+
+void selinux_audit_rule_free(struct selinux_audit_rule *rule)
+{
+	if (rule) {
+		context_destroy(&rule->au_ctxt);
+		kfree(rule);
+	}
+}
+
+int selinux_audit_rule_init(u32 field, u32 op, char *rulestr,
+                            struct selinux_audit_rule **rule)
+{
+	struct selinux_audit_rule *tmprule;
+	struct role_datum *roledatum;
+	struct type_datum *typedatum;
+	struct user_datum *userdatum;
+	int rc = 0;
+
+	*rule = NULL;
+
+	if (!ss_initialized)
+		return -ENOTSUPP;
+
+	switch (field) {
+	case AUDIT_SE_USER:
+	case AUDIT_SE_ROLE:
+	case AUDIT_SE_TYPE:
+		/* only 'equals' and 'not equals' fit user, role, and type */
+		if (op != AUDIT_EQUAL && op != AUDIT_NOT_EQUAL)
+			return -EINVAL;
+		break;
+	case AUDIT_SE_SEN:
+	case AUDIT_SE_CLR:
+		/* we do not allow a range, indicated by the presense of '-' */
+		if (strchr(rulestr, '-'))
+			return -EINVAL;
+		break;
+	default:
+		/* only the above fields are valid */
+		return -EINVAL;
+	}
+
+	tmprule = kzalloc(sizeof(struct selinux_audit_rule), GFP_KERNEL);
+	if (!tmprule)
+		return -ENOMEM;
+
+	context_init(&tmprule->au_ctxt);
+
+	POLICY_RDLOCK;
+
+	tmprule->au_seqno = latest_granting;
+
+	switch (field) {
+	case AUDIT_SE_USER:
+		userdatum = hashtab_search(policydb.p_users.table, rulestr);
+		if (!userdatum)
+			rc = -EINVAL;
+		else
+			tmprule->au_ctxt.user = userdatum->value;
+		break;
+	case AUDIT_SE_ROLE:
+		roledatum = hashtab_search(policydb.p_roles.table, rulestr);
+		if (!roledatum)
+			rc = -EINVAL;
+		else
+			tmprule->au_ctxt.role = roledatum->value;
+		break;
+	case AUDIT_SE_TYPE:
+		typedatum = hashtab_search(policydb.p_types.table, rulestr);
+		if (!typedatum)
+			rc = -EINVAL;
+		else
+			tmprule->au_ctxt.type = typedatum->value;
+		break;
+	case AUDIT_SE_SEN:
+	case AUDIT_SE_CLR:
+		rc = mls_from_string(rulestr, &tmprule->au_ctxt, GFP_ATOMIC);
+		break;
+	}
+
+	POLICY_RDUNLOCK;
+
+	if (rc) {
+		selinux_audit_rule_free(tmprule);
+		tmprule = NULL;
+	}
+
+	*rule = tmprule;
+
+	return rc;
+}
+
+int selinux_audit_rule_match(u32 ctxid, u32 field, u32 op,
+                             struct selinux_audit_rule *rule,
+                             struct audit_context *actx)
+{
+	struct context *ctxt;
+	struct mls_level *level;
+	int match = 0;
+
+	if (!rule) {
+		audit_log(actx, GFP_ATOMIC, AUDIT_SELINUX_ERR,
+		          "selinux_audit_rule_match: missing rule\n");
+		return -ENOENT;
+	}
+
+	POLICY_RDLOCK;
+
+	if (rule->au_seqno < latest_granting) {
+		audit_log(actx, GFP_ATOMIC, AUDIT_SELINUX_ERR,
+		          "selinux_audit_rule_match: stale rule\n");
+		match = -ESTALE;
+		goto out;
+	}
+
+	ctxt = sidtab_search(&sidtab, ctxid);
+	if (!ctxt) {
+		audit_log(actx, GFP_ATOMIC, AUDIT_SELINUX_ERR,
+		          "selinux_audit_rule_match: unrecognized SID %d\n",
+		          ctxid);
+		match = -ENOENT;
+		goto out;
+	}
+
+	/* a field/op pair that is not caught here will simply fall through
+	   without a match */
+	switch (field) {
+	case AUDIT_SE_USER:
+		switch (op) {
+		case AUDIT_EQUAL:
+			match = (ctxt->user == rule->au_ctxt.user);
+			break;
+		case AUDIT_NOT_EQUAL:
+			match = (ctxt->user != rule->au_ctxt.user);
+			break;
+		}
+		break;
+	case AUDIT_SE_ROLE:
+		switch (op) {
+		case AUDIT_EQUAL:
+			match = (ctxt->role == rule->au_ctxt.role);
+			break;
+		case AUDIT_NOT_EQUAL:
+			match = (ctxt->role != rule->au_ctxt.role);
+			break;
+		}
+		break;
+	case AUDIT_SE_TYPE:
+		switch (op) {
+		case AUDIT_EQUAL:
+			match = (ctxt->type == rule->au_ctxt.type);
+			break;
+		case AUDIT_NOT_EQUAL:
+			match = (ctxt->type != rule->au_ctxt.type);
+			break;
+		}
+		break;
+	case AUDIT_SE_SEN:
+	case AUDIT_SE_CLR:
+		level = (op == AUDIT_SE_SEN ?
+		         &ctxt->range.level[0] : &ctxt->range.level[1]);
+		switch (op) {
+		case AUDIT_EQUAL:
+			match = mls_level_eq(&rule->au_ctxt.range.level[0],
+			                     level);
+			break;
+		case AUDIT_NOT_EQUAL:
+			match = !mls_level_eq(&rule->au_ctxt.range.level[0],
+			                      level);
+			break;
+		case AUDIT_LESS_THAN:
+			match = (mls_level_dom(&rule->au_ctxt.range.level[0],
+			                       level) &&
+			         !mls_level_eq(&rule->au_ctxt.range.level[0],
+			                       level));
+			break;
+		case AUDIT_LESS_THAN_OR_EQUAL:
+			match = mls_level_dom(&rule->au_ctxt.range.level[0],
+			                      level);
+			break;
+		case AUDIT_GREATER_THAN:
+			match = (mls_level_dom(level,
+			                      &rule->au_ctxt.range.level[0]) &&
+			         !mls_level_eq(level,
+			                       &rule->au_ctxt.range.level[0]));
+			break;
+		case AUDIT_GREATER_THAN_OR_EQUAL:
+			match = mls_level_dom(level,
+			                      &rule->au_ctxt.range.level[0]);
+			break;
+		}
+	}
+
+out:
+	POLICY_RDUNLOCK;
+	return match;
+}
+
+static int (*aurule_callback)(void) = NULL;
+
+static int aurule_avc_callback(u32 event, u32 ssid, u32 tsid,
+                               u16 class, u32 perms, u32 *retained)
+{
+	int err = 0;
+
+	if (event == AVC_CALLBACK_RESET && aurule_callback)
+		err = aurule_callback();
+	return err;
+}
+
+static int __init aurule_init(void)
+{
+	int err;
+
+	err = avc_add_callback(aurule_avc_callback, AVC_CALLBACK_RESET,
+	                       SECSID_NULL, SECSID_NULL, SECCLASS_NULL, 0);
+	if (err)
+		panic("avc_add_callback() failed, error %d\n", err);
+
+	return err;
+}
+__initcall(aurule_init);
+
+void selinux_audit_set_callback(int (*callback)(void))
+{
+	aurule_callback = callback;
+}
-- 
cgit v1.2.3


From 1b50eed9cac0e8e5e4d3a522d8aa267f7f8f8acb Mon Sep 17 00:00:00 2001
From: Steve Grubb <sgrubb@redhat.com>
Date: Mon, 3 Apr 2006 14:06:13 -0400
Subject: [PATCH] audit inode patch

Previously, we were gathering the context instead of the sid. Now in this patch,
we gather just the sid and convert to context only if an audit event is being
output.

This patch brings the performance hit from 146% down to 23%

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/selinux.h    | 34 +++++++++++++++++++++++++++++
 kernel/auditsc.c           | 53 ++++++++++++++--------------------------------
 security/selinux/exports.c | 24 +++++++++++++++++++++
 3 files changed, 74 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/selinux.h b/include/linux/selinux.h
index 9d684b1728b0..84a6c7404687 100644
--- a/include/linux/selinux.h
+++ b/include/linux/selinux.h
@@ -15,6 +15,7 @@
 
 struct selinux_audit_rule;
 struct audit_context;
+struct inode;
 
 #ifdef CONFIG_SECURITY_SELINUX
 
@@ -76,6 +77,27 @@ void selinux_audit_set_callback(int (*callback)(void));
  */
 void selinux_task_ctxid(struct task_struct *tsk, u32 *ctxid);
 
+/**
+ *     selinux_ctxid_to_string - map a security context ID to a string
+ *     @ctxid: security context ID to be converted.
+ *     @ctx: address of context string to be returned
+ *     @ctxlen: length of returned context string.
+ *
+ *     Returns 0 if successful, -errno if not.  On success, the context
+ *     string will be allocated internally, and the caller must call
+ *     kfree() on it after use.
+ */
+int selinux_ctxid_to_string(u32 ctxid, char **ctx, u32 *ctxlen);
+
+/**
+ *     selinux_get_inode_sid - get the inode's security context ID
+ *     @inode: inode structure to get the sid from.
+ *     @sid: pointer to security context ID to be filled in.
+ *
+ *     Returns nothing
+ */
+void selinux_get_inode_sid(const struct inode *inode, u32 *sid);
+
 #else
 
 static inline int selinux_audit_rule_init(u32 field, u32 op,
@@ -107,6 +129,18 @@ static inline void selinux_task_ctxid(struct task_struct *tsk, u32 *ctxid)
 	*ctxid = 0;
 }
 
+static inline int selinux_ctxid_to_string(u32 ctxid, char **ctx, u32 *ctxlen)
+{
+       *ctx = NULL;
+       *ctxlen = 0;
+       return 0;
+}
+
+static inline void selinux_get_inode_sid(const struct inode *inode, u32 *sid)
+{
+	*sid = 0;
+}
+
 #endif	/* CONFIG_SECURITY_SELINUX */
 
 #endif /* _LINUX_SELINUX_H */
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index d3d97d28b69a..2e123a8a0d60 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -90,7 +90,7 @@ struct audit_names {
 	uid_t		uid;
 	gid_t		gid;
 	dev_t		rdev;
-	char		*ctx;
+	u32		osid;
 };
 
 struct audit_aux_data {
@@ -410,9 +410,6 @@ static inline void audit_free_names(struct audit_context *context)
 #endif
 
 	for (i = 0; i < context->name_count; i++) {
-		char *p = context->names[i].ctx;
-		context->names[i].ctx = NULL;
-		kfree(p);
 		if (context->names[i].name)
 			__putname(context->names[i].name);
 	}
@@ -674,6 +671,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 		}
 	}
 	for (i = 0; i < context->name_count; i++) {
+		int call_panic = 0;
 		unsigned long ino  = context->names[i].ino;
 		unsigned long pino = context->names[i].pino;
 
@@ -703,12 +701,22 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 					 context->names[i].gid, 
 					 MAJOR(context->names[i].rdev), 
 					 MINOR(context->names[i].rdev));
-		if (context->names[i].ctx) {
-			audit_log_format(ab, " obj=%s",
-					context->names[i].ctx);
+		if (context->names[i].osid != 0) {
+			char *ctx = NULL;
+			u32 len;
+			if (selinux_ctxid_to_string(
+				context->names[i].osid, &ctx, &len)) {
+				audit_log_format(ab, " obj=%u",
+						context->names[i].osid);
+				call_panic = 1;
+			} else
+				audit_log_format(ab, " obj=%s", ctx);
+			kfree(ctx);
 		}
 
 		audit_log_end(ab);
+		if (call_panic)
+			audit_panic("error converting sid to string");
 	}
 }
 
@@ -946,37 +954,8 @@ void audit_putname(const char *name)
 void audit_inode_context(int idx, const struct inode *inode)
 {
 	struct audit_context *context = current->audit_context;
-	const char *suffix = security_inode_xattr_getsuffix();
-	char *ctx = NULL;
-	int len = 0;
-
-	if (!suffix)
-		goto ret;
-
-	len = security_inode_getsecurity(inode, suffix, NULL, 0, 0);
-	if (len == -EOPNOTSUPP)
-		goto ret;
-	if (len < 0) 
-		goto error_path;
-
-	ctx = kmalloc(len, GFP_KERNEL);
-	if (!ctx) 
-		goto error_path;
-
-	len = security_inode_getsecurity(inode, suffix, ctx, len, 0);
-	if (len < 0)
-		goto error_path;
-
-	kfree(context->names[idx].ctx);
-	context->names[idx].ctx = ctx;
-	goto ret;
 
-error_path:
-	if (ctx)
-		kfree(ctx);
-	audit_panic("error in audit_inode_context");
-ret:
-	return;
+	selinux_get_inode_sid(inode, &context->names[idx].osid);
 }
 
 
diff --git a/security/selinux/exports.c b/security/selinux/exports.c
index 333c4c7824d8..07ddce7bf374 100644
--- a/security/selinux/exports.c
+++ b/security/selinux/exports.c
@@ -14,6 +14,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/selinux.h>
+#include <linux/fs.h>
 
 #include "security.h"
 #include "objsec.h"
@@ -26,3 +27,26 @@ void selinux_task_ctxid(struct task_struct *tsk, u32 *ctxid)
 	else
 		*ctxid = 0;
 }
+
+int selinux_ctxid_to_string(u32 ctxid, char **ctx, u32 *ctxlen)
+{
+	if (selinux_enabled)
+		return security_sid_to_context(ctxid, ctx, ctxlen);
+	else {
+		*ctx = NULL;
+		*ctxlen = 0;
+	}
+
+	return 0;
+}
+
+void selinux_get_inode_sid(const struct inode *inode, u32 *sid)
+{
+	if (selinux_enabled) {
+		struct inode_security_struct *isec = inode->i_security;
+		*sid = isec->sid;
+		return;
+	}
+	*sid = 0;
+}
+
-- 
cgit v1.2.3


From 9c7aa6aa74fa8a5cda36e54cbbe4fffe0214497d Mon Sep 17 00:00:00 2001
From: Steve Grubb <sgrubb@redhat.com>
Date: Fri, 31 Mar 2006 15:22:49 -0500
Subject: [PATCH] change lspp ipc auditing

Hi,

The patch below converts IPC auditing to collect sid's and convert to context
string only if it needs to output an audit record. This patch depends on the
inode audit change patch already being applied.

Signed-off-by: Steve Grubb <sgrubb@redhat.com>

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/security.h   | 16 -----------
 include/linux/selinux.h    | 15 ++++++++++
 kernel/auditsc.c           | 68 ++++++++++++++--------------------------------
 security/dummy.c           |  6 ----
 security/selinux/exports.c | 11 ++++++++
 security/selinux/hooks.c   |  8 ------
 6 files changed, 47 insertions(+), 77 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/security.h b/include/linux/security.h
index aaa0a5cdbf75..1bab48f6aeac 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -869,11 +869,6 @@ struct swap_info_struct;
  *	@ipcp contains the kernel IPC permission structure
  *	@flag contains the desired (requested) permission set
  *	Return 0 if permission is granted.
- * @ipc_getsecurity:
- *      Copy the security label associated with the ipc object into
- *      @buffer.  @buffer may be NULL to request the size of the buffer 
- *      required.  @size indicates the size of @buffer in bytes. Return 
- *      number of bytes used/required on success.
  *
  * Security hooks for individual messages held in System V IPC message queues
  * @msg_msg_alloc_security:
@@ -1223,7 +1218,6 @@ struct security_operations {
 	void (*task_to_inode)(struct task_struct *p, struct inode *inode);
 
 	int (*ipc_permission) (struct kern_ipc_perm * ipcp, short flag);
-	int (*ipc_getsecurity)(struct kern_ipc_perm *ipcp, void *buffer, size_t size);
 
 	int (*msg_msg_alloc_security) (struct msg_msg * msg);
 	void (*msg_msg_free_security) (struct msg_msg * msg);
@@ -1887,11 +1881,6 @@ static inline int security_ipc_permission (struct kern_ipc_perm *ipcp,
 	return security_ops->ipc_permission (ipcp, flag);
 }
 
-static inline int security_ipc_getsecurity(struct kern_ipc_perm *ipcp, void *buffer, size_t size)
-{
-	return security_ops->ipc_getsecurity(ipcp, buffer, size);
-}
-
 static inline int security_msg_msg_alloc (struct msg_msg * msg)
 {
 	return security_ops->msg_msg_alloc_security (msg);
@@ -2532,11 +2521,6 @@ static inline int security_ipc_permission (struct kern_ipc_perm *ipcp,
 	return 0;
 }
 
-static inline int security_ipc_getsecurity(struct kern_ipc_perm *ipcp, void *buffer, size_t size)
-{
-	return -EOPNOTSUPP;
-}
-
 static inline int security_msg_msg_alloc (struct msg_msg * msg)
 {
 	return 0;
diff --git a/include/linux/selinux.h b/include/linux/selinux.h
index 84a6c7404687..413d66773b91 100644
--- a/include/linux/selinux.h
+++ b/include/linux/selinux.h
@@ -16,6 +16,7 @@
 struct selinux_audit_rule;
 struct audit_context;
 struct inode;
+struct kern_ipc_perm;
 
 #ifdef CONFIG_SECURITY_SELINUX
 
@@ -98,6 +99,15 @@ int selinux_ctxid_to_string(u32 ctxid, char **ctx, u32 *ctxlen);
  */
 void selinux_get_inode_sid(const struct inode *inode, u32 *sid);
 
+/**
+ *     selinux_get_ipc_sid - get the ipc security context ID
+ *     @ipcp: ipc structure to get the sid from.
+ *     @sid: pointer to security context ID to be filled in.
+ *
+ *     Returns nothing
+ */
+void selinux_get_ipc_sid(const struct kern_ipc_perm *ipcp, u32 *sid);
+
 #else
 
 static inline int selinux_audit_rule_init(u32 field, u32 op,
@@ -141,6 +151,11 @@ static inline void selinux_get_inode_sid(const struct inode *inode, u32 *sid)
 	*sid = 0;
 }
 
+static inline void selinux_get_ipc_sid(const struct kern_ipc_perm *ipcp, u32 *sid)
+{
+	*sid = 0;
+}
+
 #endif	/* CONFIG_SECURITY_SELINUX */
 
 #endif /* _LINUX_SELINUX_H */
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 2e123a8a0d60..b4f7223811fe 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -107,7 +107,7 @@ struct audit_aux_data_ipcctl {
 	uid_t			uid;
 	gid_t			gid;
 	mode_t			mode;
-	char 			*ctx;
+	u32			osid;
 };
 
 struct audit_aux_data_socketcall {
@@ -432,11 +432,6 @@ static inline void audit_free_aux(struct audit_context *context)
 			dput(axi->dentry);
 			mntput(axi->mnt);
 		}
-		if ( aux->type == AUDIT_IPC ) {
-			struct audit_aux_data_ipcctl *axi = (void *)aux;
-			if (axi->ctx)
-				kfree(axi->ctx);
-		}
 
 		context->aux = aux->next;
 		kfree(aux);
@@ -584,7 +579,7 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk
 
 static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
 {
-	int i;
+	int i, call_panic = 0;
 	struct audit_buffer *ab;
 	struct audit_aux_data *aux;
 	const char *tty;
@@ -635,8 +630,20 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 		case AUDIT_IPC: {
 			struct audit_aux_data_ipcctl *axi = (void *)aux;
 			audit_log_format(ab, 
-					 " qbytes=%lx iuid=%u igid=%u mode=%x obj=%s",
-					 axi->qbytes, axi->uid, axi->gid, axi->mode, axi->ctx);
+				 " qbytes=%lx iuid=%u igid=%u mode=%x",
+				 axi->qbytes, axi->uid, axi->gid, axi->mode);
+			if (axi->osid != 0) {
+				char *ctx = NULL;
+				u32 len;
+				if (selinux_ctxid_to_string(
+						axi->osid, &ctx, &len)) {
+					audit_log_format(ab, " obj=%u",
+							axi->osid);
+					call_panic = 1;
+				} else
+					audit_log_format(ab, " obj=%s", ctx);
+				kfree(ctx);
+			}
 			break; }
 
 		case AUDIT_SOCKETCALL: {
@@ -671,7 +678,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 		}
 	}
 	for (i = 0; i < context->name_count; i++) {
-		int call_panic = 0;
 		unsigned long ino  = context->names[i].ino;
 		unsigned long pino = context->names[i].pino;
 
@@ -708,16 +714,16 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 				context->names[i].osid, &ctx, &len)) {
 				audit_log_format(ab, " obj=%u",
 						context->names[i].osid);
-				call_panic = 1;
+				call_panic = 2;
 			} else
 				audit_log_format(ab, " obj=%s", ctx);
 			kfree(ctx);
 		}
 
 		audit_log_end(ab);
-		if (call_panic)
-			audit_panic("error converting sid to string");
 	}
+	if (call_panic)
+		audit_panic("error converting sid to string");
 }
 
 /**
@@ -951,7 +957,7 @@ void audit_putname(const char *name)
 #endif
 }
 
-void audit_inode_context(int idx, const struct inode *inode)
+static void audit_inode_context(int idx, const struct inode *inode)
 {
 	struct audit_context *context = current->audit_context;
 
@@ -1141,38 +1147,6 @@ uid_t audit_get_loginuid(struct audit_context *ctx)
 	return ctx ? ctx->loginuid : -1;
 }
 
-static char *audit_ipc_context(struct kern_ipc_perm *ipcp)
-{
-	struct audit_context *context = current->audit_context;
-	char *ctx = NULL;
-	int len = 0;
-
-	if (likely(!context))
-		return NULL;
-
-	len = security_ipc_getsecurity(ipcp, NULL, 0);
-	if (len == -EOPNOTSUPP)
-		goto ret;
-	if (len < 0)
-		goto error_path;
-
-	ctx = kmalloc(len, GFP_ATOMIC);
-	if (!ctx)
-		goto error_path;
-
-	len = security_ipc_getsecurity(ipcp, ctx, len);
-	if (len < 0)
-		goto error_path;
-
-	return ctx;
-
-error_path:
-	kfree(ctx);
-	audit_panic("error in audit_ipc_context");
-ret:
-	return NULL;
-}
-
 /**
  * audit_ipc_perms - record audit data for ipc
  * @qbytes: msgq bytes
@@ -1198,7 +1172,7 @@ int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, str
 	ax->uid = uid;
 	ax->gid = gid;
 	ax->mode = mode;
-	ax->ctx = audit_ipc_context(ipcp);
+	selinux_get_ipc_sid(ipcp, &ax->osid);
 
 	ax->d.type = AUDIT_IPC;
 	ax->d.next = context->aux;
diff --git a/security/dummy.c b/security/dummy.c
index fd99429278e9..8ccccccc12ac 100644
--- a/security/dummy.c
+++ b/security/dummy.c
@@ -563,11 +563,6 @@ static int dummy_ipc_permission (struct kern_ipc_perm *ipcp, short flag)
 	return 0;
 }
 
-static int dummy_ipc_getsecurity(struct kern_ipc_perm *ipcp, void *buffer, size_t size)
-{
-	return -EOPNOTSUPP;
-}
-
 static int dummy_msg_msg_alloc_security (struct msg_msg *msg)
 {
 	return 0;
@@ -976,7 +971,6 @@ void security_fixup_ops (struct security_operations *ops)
 	set_to_dummy_if_null(ops, task_reparent_to_init);
  	set_to_dummy_if_null(ops, task_to_inode);
 	set_to_dummy_if_null(ops, ipc_permission);
-	set_to_dummy_if_null(ops, ipc_getsecurity);
 	set_to_dummy_if_null(ops, msg_msg_alloc_security);
 	set_to_dummy_if_null(ops, msg_msg_free_security);
 	set_to_dummy_if_null(ops, msg_queue_alloc_security);
diff --git a/security/selinux/exports.c b/security/selinux/exports.c
index 07ddce7bf374..7357cf247f60 100644
--- a/security/selinux/exports.c
+++ b/security/selinux/exports.c
@@ -15,6 +15,7 @@
 #include <linux/module.h>
 #include <linux/selinux.h>
 #include <linux/fs.h>
+#include <linux/ipc.h>
 
 #include "security.h"
 #include "objsec.h"
@@ -50,3 +51,13 @@ void selinux_get_inode_sid(const struct inode *inode, u32 *sid)
 	*sid = 0;
 }
 
+void selinux_get_ipc_sid(const struct kern_ipc_perm *ipcp, u32 *sid)
+{
+	if (selinux_enabled) {
+		struct ipc_security_struct *isec = ipcp->security;
+		*sid = isec->sid;
+		return;
+	}
+	*sid = 0;
+}
+
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index b61b9554bc27..3cf368a16448 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -4052,13 +4052,6 @@ static int selinux_ipc_permission(struct kern_ipc_perm *ipcp, short flag)
 	return ipc_has_perm(ipcp, av);
 }
 
-static int selinux_ipc_getsecurity(struct kern_ipc_perm *ipcp, void *buffer, size_t size)
-{
-	struct ipc_security_struct *isec = ipcp->security;
-
-	return selinux_getsecurity(isec->sid, buffer, size);
-}
-
 /* module stacking operations */
 static int selinux_register_security (const char *name, struct security_operations *ops)
 {
@@ -4321,7 +4314,6 @@ static struct security_operations selinux_ops = {
 	.task_to_inode =                selinux_task_to_inode,
 
 	.ipc_permission =		selinux_ipc_permission,
-	.ipc_getsecurity =		selinux_ipc_getsecurity,
 
 	.msg_msg_alloc_security =	selinux_msg_msg_alloc_security,
 	.msg_msg_free_security =	selinux_msg_msg_free_security,
-- 
cgit v1.2.3


From e7c3497013a7e5496ce3d5fd3c73b5cf5af7a56e Mon Sep 17 00:00:00 2001
From: Steve Grubb <sgrubb@redhat.com>
Date: Mon, 3 Apr 2006 09:08:13 -0400
Subject: [PATCH] Reworked patch for labels on user space messages

The below patch should be applied after the inode and ipc sid patches.
This patch is a reworking of Tim's patch that has been updated to match
the inode and ipc patches since its similar.

[updated:
>  Stephen Smalley also wanted to change a variable from isec to tsec in the
>  user sid patch.                                                              ]

Signed-off-by: Steve Grubb <sgrubb@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/netlink.h    |  1 +
 include/linux/selinux.h    | 16 ++++++++++++++++
 kernel/audit.c             | 22 +++++++++++++++++++---
 net/netlink/af_netlink.c   |  2 ++
 security/selinux/exports.c | 11 +++++++++++
 5 files changed, 49 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index f8f3d1c927f8..87b8a5703ebc 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -143,6 +143,7 @@ struct netlink_skb_parms
 	__u32			dst_group;
 	kernel_cap_t		eff_cap;
 	__u32			loginuid;	/* Login (audit) uid */
+	__u32			sid;		/* SELinux security id */
 };
 
 #define NETLINK_CB(skb)		(*(struct netlink_skb_parms*)&((skb)->cb))
diff --git a/include/linux/selinux.h b/include/linux/selinux.h
index 413d66773b91..4047bcde4484 100644
--- a/include/linux/selinux.h
+++ b/include/linux/selinux.h
@@ -5,6 +5,7 @@
  *
  * Copyright (C) 2005 Red Hat, Inc., James Morris <jmorris@redhat.com>
  * Copyright (C) 2006 Trusted Computer Solutions, Inc. <dgoeddel@trustedcs.com>
+ * Copyright (C) 2006 IBM Corporation, Timothy R. Chavez <tinytim@us.ibm.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2,
@@ -108,6 +109,16 @@ void selinux_get_inode_sid(const struct inode *inode, u32 *sid);
  */
 void selinux_get_ipc_sid(const struct kern_ipc_perm *ipcp, u32 *sid);
 
+/**
+ *     selinux_get_task_sid - return the SID of task
+ *     @tsk: the task whose SID will be returned
+ *     @sid: pointer to security context ID to be filled in.
+ *
+ *     Returns nothing
+ */
+void selinux_get_task_sid(struct task_struct *tsk, u32 *sid);
+
+
 #else
 
 static inline int selinux_audit_rule_init(u32 field, u32 op,
@@ -156,6 +167,11 @@ static inline void selinux_get_ipc_sid(const struct kern_ipc_perm *ipcp, u32 *si
 	*sid = 0;
 }
 
+static inline void selinux_get_task_sid(struct task_struct *tsk, u32 *sid)
+{
+	*sid = 0;
+}
+
 #endif	/* CONFIG_SECURITY_SELINUX */
 
 #endif /* _LINUX_SELINUX_H */
diff --git a/kernel/audit.c b/kernel/audit.c
index 9060be750c48..7ec9ccae1299 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -390,7 +390,7 @@ static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type)
 
 static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 {
-	u32			uid, pid, seq;
+	u32			uid, pid, seq, sid;
 	void			*data;
 	struct audit_status	*status_get, status_set;
 	int			err;
@@ -416,6 +416,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	pid  = NETLINK_CREDS(skb)->pid;
 	uid  = NETLINK_CREDS(skb)->uid;
 	loginuid = NETLINK_CB(skb).loginuid;
+	sid  = NETLINK_CB(skb).sid;
 	seq  = nlh->nlmsg_seq;
 	data = NLMSG_DATA(nlh);
 
@@ -468,8 +469,23 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 			ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
 			if (ab) {
 				audit_log_format(ab,
-						 "user pid=%d uid=%u auid=%u msg='%.1024s'",
-						 pid, uid, loginuid, (char *)data);
+						 "user pid=%d uid=%u auid=%u",
+						 pid, uid, loginuid);
+				if (sid) {
+					char *ctx = NULL;
+					u32 len;
+					if (selinux_ctxid_to_string(
+							sid, &ctx, &len)) {
+						audit_log_format(ab, 
+							" subj=%u", sid);
+						/* Maybe call audit_panic? */
+					} else
+						audit_log_format(ab, 
+							" subj=%s", ctx);
+					kfree(ctx);
+				}
+				audit_log_format(ab, " msg='%.1024s'",
+					 (char *)data);
 				audit_set_pid(ab, pid);
 				audit_log_end(ab);
 			}
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 2a233ffcf618..09fbc4bc7088 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -56,6 +56,7 @@
 #include <linux/mm.h>
 #include <linux/types.h>
 #include <linux/audit.h>
+#include <linux/selinux.h>
 
 #include <net/sock.h>
 #include <net/scm.h>
@@ -1157,6 +1158,7 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
 	NETLINK_CB(skb).dst_pid = dst_pid;
 	NETLINK_CB(skb).dst_group = dst_group;
 	NETLINK_CB(skb).loginuid = audit_get_loginuid(current->audit_context);
+	selinux_get_task_sid(current, &(NETLINK_CB(skb).sid));
 	memcpy(NETLINK_CREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
 
 	/* What can I do? Netlink is asynchronous, so that
diff --git a/security/selinux/exports.c b/security/selinux/exports.c
index 7357cf247f60..ae4c73eb3085 100644
--- a/security/selinux/exports.c
+++ b/security/selinux/exports.c
@@ -5,6 +5,7 @@
  *
  * Copyright (C) 2005 Red Hat, Inc., James Morris <jmorris@redhat.com>
  * Copyright (C) 2006 Trusted Computer Solutions, Inc. <dgoeddel@trustedcs.com>
+ * Copyright (C) 2006 IBM Corporation, Timothy R. Chavez <tinytim@us.ibm.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2,
@@ -61,3 +62,13 @@ void selinux_get_ipc_sid(const struct kern_ipc_perm *ipcp, u32 *sid)
 	*sid = 0;
 }
 
+void selinux_get_task_sid(struct task_struct *tsk, u32 *sid)
+{
+	if (selinux_enabled) {
+		struct task_security_struct *tsec = tsk->security;
+		*sid = tsec->sid;
+		return;
+	}
+	*sid = 0;
+}
+
-- 
cgit v1.2.3


From ce29b682e228c70cdc91a1b2935c5adb2087bab8 Mon Sep 17 00:00:00 2001
From: Steve Grubb <sgrubb@redhat.com>
Date: Sat, 1 Apr 2006 18:29:34 -0500
Subject: [PATCH] More user space subject labels

Hi,

The patch below builds upon the patch sent earlier and adds subject label to
all audit events generated via the netlink interface. It also cleans up a few
other minor things.

Signed-off-by: Steve Grubb <sgrubb@redhat.com>

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/audit.h |   2 +-
 kernel/audit.c        | 132 ++++++++++++++++++++++++++++++++++++++------------
 kernel/auditfilter.c  |  44 ++++++++++++++---
 kernel/auditsc.c      |   4 +-
 4 files changed, 142 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 740f950397b7..d5c40823e166 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -371,7 +371,7 @@ extern void		    audit_log_d_path(struct audit_buffer *ab,
 extern int audit_filter_user(struct netlink_skb_parms *cb, int type);
 extern int audit_filter_type(int type);
 extern int  audit_receive_filter(int type, int pid, int uid, int seq,
-				 void *data, size_t datasz, uid_t loginuid);
+			 void *data, size_t datasz, uid_t loginuid, u32 sid);
 #else
 #define audit_log(c,g,t,f,...) do { ; } while (0)
 #define audit_log_start(c,g,t) ({ NULL; })
diff --git a/kernel/audit.c b/kernel/audit.c
index 7ec9ccae1299..df57b493e1cb 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -230,49 +230,103 @@ void audit_log_lost(const char *message)
 	}
 }
 
-static int audit_set_rate_limit(int limit, uid_t loginuid)
+static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid)
 {
-	int old		 = audit_rate_limit;
-	audit_rate_limit = limit;
-	audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 
+	int old	= audit_rate_limit;
+
+	if (sid) {
+		char *ctx = NULL;
+		u32 len;
+		int rc;
+		if ((rc = selinux_ctxid_to_string(sid, &ctx, &len)))
+			return rc;
+		else
+			audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
+				"audit_rate_limit=%d old=%d by auid=%u subj=%s",
+				limit, old, loginuid, ctx);
+		kfree(ctx);
+	} else
+		audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
 			"audit_rate_limit=%d old=%d by auid=%u",
-			audit_rate_limit, old, loginuid);
+			limit, old, loginuid);
+	audit_rate_limit = limit;
 	return old;
 }
 
-static int audit_set_backlog_limit(int limit, uid_t loginuid)
+static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid)
 {
-	int old		 = audit_backlog_limit;
-	audit_backlog_limit = limit;
-	audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
+	int old	= audit_backlog_limit;
+
+	if (sid) {
+		char *ctx = NULL;
+		u32 len;
+		int rc;
+		if ((rc = selinux_ctxid_to_string(sid, &ctx, &len)))
+			return rc;
+		else
+			audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
+			    "audit_backlog_limit=%d old=%d by auid=%u subj=%s",
+				limit, old, loginuid, ctx);
+		kfree(ctx);
+	} else
+		audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
 			"audit_backlog_limit=%d old=%d by auid=%u",
-			audit_backlog_limit, old, loginuid);
+			limit, old, loginuid);
+	audit_backlog_limit = limit;
 	return old;
 }
 
-static int audit_set_enabled(int state, uid_t loginuid)
+static int audit_set_enabled(int state, uid_t loginuid, u32 sid)
 {
-	int old		 = audit_enabled;
+	int old = audit_enabled;
+
 	if (state != 0 && state != 1)
 		return -EINVAL;
-	audit_enabled = state;
-	audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
+
+	if (sid) {
+		char *ctx = NULL;
+		u32 len;
+		int rc;
+		if ((rc = selinux_ctxid_to_string(sid, &ctx, &len)))
+			return rc;
+		else
+			audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
+				"audit_enabled=%d old=%d by auid=%u subj=%s",
+				state, old, loginuid, ctx);
+		kfree(ctx);
+	} else
+		audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
 			"audit_enabled=%d old=%d by auid=%u",
-			audit_enabled, old, loginuid);
+			state, old, loginuid);
+	audit_enabled = state;
 	return old;
 }
 
-static int audit_set_failure(int state, uid_t loginuid)
+static int audit_set_failure(int state, uid_t loginuid, u32 sid)
 {
-	int old		 = audit_failure;
+	int old = audit_failure;
+
 	if (state != AUDIT_FAIL_SILENT
 	    && state != AUDIT_FAIL_PRINTK
 	    && state != AUDIT_FAIL_PANIC)
 		return -EINVAL;
-	audit_failure = state;
-	audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
+
+	if (sid) {
+		char *ctx = NULL;
+		u32 len;
+		int rc;
+		if ((rc = selinux_ctxid_to_string(sid, &ctx, &len)))
+			return rc;
+		else
+			audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
+				"audit_failure=%d old=%d by auid=%u subj=%s",
+				state, old, loginuid, ctx);
+		kfree(ctx);
+	} else
+		audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
 			"audit_failure=%d old=%d by auid=%u",
-			audit_failure, old, loginuid);
+			state, old, loginuid);
+	audit_failure = state;
 	return old;
 }
 
@@ -437,25 +491,43 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 			return -EINVAL;
 		status_get   = (struct audit_status *)data;
 		if (status_get->mask & AUDIT_STATUS_ENABLED) {
-			err = audit_set_enabled(status_get->enabled, loginuid);
+			err = audit_set_enabled(status_get->enabled,
+							loginuid, sid);
 			if (err < 0) return err;
 		}
 		if (status_get->mask & AUDIT_STATUS_FAILURE) {
-			err = audit_set_failure(status_get->failure, loginuid);
+			err = audit_set_failure(status_get->failure,
+							 loginuid, sid);
 			if (err < 0) return err;
 		}
 		if (status_get->mask & AUDIT_STATUS_PID) {
 			int old   = audit_pid;
+			if (sid) {
+				char *ctx = NULL;
+				u32 len;
+				int rc;
+				if ((rc = selinux_ctxid_to_string(
+						sid, &ctx, &len)))
+					return rc;
+				else
+					audit_log(NULL, GFP_KERNEL,
+						AUDIT_CONFIG_CHANGE,
+						"audit_pid=%d old=%d by auid=%u subj=%s",
+						status_get->pid, old,
+						loginuid, ctx);
+				kfree(ctx);
+			} else
+				audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
+					"audit_pid=%d old=%d by auid=%u",
+					  status_get->pid, old, loginuid);
 			audit_pid = status_get->pid;
-			audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
-				"audit_pid=%d old=%d by auid=%u",
-				  audit_pid, old, loginuid);
 		}
 		if (status_get->mask & AUDIT_STATUS_RATE_LIMIT)
-			audit_set_rate_limit(status_get->rate_limit, loginuid);
+			audit_set_rate_limit(status_get->rate_limit,
+							 loginuid, sid);
 		if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
 			audit_set_backlog_limit(status_get->backlog_limit,
-							loginuid);
+							loginuid, sid);
 		break;
 	case AUDIT_USER:
 	case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG:
@@ -477,7 +549,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 					if (selinux_ctxid_to_string(
 							sid, &ctx, &len)) {
 						audit_log_format(ab, 
-							" subj=%u", sid);
+							" ssid=%u", sid);
 						/* Maybe call audit_panic? */
 					} else
 						audit_log_format(ab, 
@@ -499,7 +571,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	case AUDIT_LIST:
 		err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
 					   uid, seq, data, nlmsg_len(nlh),
-					   loginuid);
+					   loginuid, sid);
 		break;
 	case AUDIT_ADD_RULE:
 	case AUDIT_DEL_RULE:
@@ -509,7 +581,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	case AUDIT_LIST_RULES:
 		err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
 					   uid, seq, data, nlmsg_len(nlh),
-					   loginuid);
+					   loginuid, sid);
 		break;
 	case AUDIT_SIGNAL_INFO:
 		sig_data.uid = audit_sig_uid;
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 85a7862143a1..7c134906d689 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -586,9 +586,10 @@ static int audit_list_rules(void *_dest)
  * @data: payload data
  * @datasz: size of payload data
  * @loginuid: loginuid of sender
+ * @sid: SE Linux Security ID of sender
  */
 int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
-			 size_t datasz, uid_t loginuid)
+			 size_t datasz, uid_t loginuid, u32 sid)
 {
 	struct task_struct *tsk;
 	int *dest;
@@ -631,9 +632,23 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
 
 		err = audit_add_rule(entry,
 				     &audit_filter_list[entry->rule.listnr]);
-		audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
-			"auid=%u add rule to list=%d res=%d\n",
-			loginuid, entry->rule.listnr, !err);
+		if (sid) {
+			char *ctx = NULL;
+			u32 len;
+			if (selinux_ctxid_to_string(sid, &ctx, &len)) {
+				/* Maybe call audit_panic? */
+				audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
+				 "auid=%u ssid=%u add rule to list=%d res=%d",
+				 loginuid, sid, entry->rule.listnr, !err);
+			} else
+				audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
+				 "auid=%u subj=%s add rule to list=%d res=%d",
+				 loginuid, ctx, entry->rule.listnr, !err);
+			kfree(ctx);
+		} else
+			audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
+				"auid=%u add rule to list=%d res=%d",
+				loginuid, entry->rule.listnr, !err);
 
 		if (err)
 			audit_free_rule(entry);
@@ -649,9 +664,24 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
 
 		err = audit_del_rule(entry,
 				     &audit_filter_list[entry->rule.listnr]);
-		audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
-			"auid=%u remove rule from list=%d res=%d\n",
-			loginuid, entry->rule.listnr, !err);
+
+		if (sid) {
+			char *ctx = NULL;
+			u32 len;
+			if (selinux_ctxid_to_string(sid, &ctx, &len)) {
+				/* Maybe call audit_panic? */
+				audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
+					"auid=%u ssid=%u remove rule from list=%d res=%d",
+					 loginuid, sid, entry->rule.listnr, !err);
+			} else
+				audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
+					"auid=%u subj=%s remove rule from list=%d res=%d",
+					 loginuid, ctx, entry->rule.listnr, !err);
+			kfree(ctx);
+		} else
+			audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
+				"auid=%u remove rule from list=%d res=%d",
+				loginuid, entry->rule.listnr, !err);
 
 		audit_free_rule(entry);
 		break;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index b4f7223811fe..d94e0404113c 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -637,7 +637,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 				u32 len;
 				if (selinux_ctxid_to_string(
 						axi->osid, &ctx, &len)) {
-					audit_log_format(ab, " obj=%u",
+					audit_log_format(ab, " osid=%u",
 							axi->osid);
 					call_panic = 1;
 				} else
@@ -712,7 +712,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 			u32 len;
 			if (selinux_ctxid_to_string(
 				context->names[i].osid, &ctx, &len)) {
-				audit_log_format(ab, " obj=%u",
+				audit_log_format(ab, " osid=%u",
 						context->names[i].osid);
 				call_panic = 2;
 			} else
-- 
cgit v1.2.3


From 073115d6b29c7910feaa08241c6484637f5ca958 Mon Sep 17 00:00:00 2001
From: Steve Grubb <sgrubb@redhat.com>
Date: Sun, 2 Apr 2006 17:07:33 -0400
Subject: [PATCH] Rework of IPC auditing

1) The audit_ipc_perms() function has been split into two different
functions:
        - audit_ipc_obj()
        - audit_ipc_set_perm()

There's a key shift here...  The audit_ipc_obj() collects the uid, gid,
mode, and SElinux context label of the current ipc object.  This
audit_ipc_obj() hook is now found in several places.  Most notably, it
is hooked in ipcperms(), which is called in various places around the
ipc code permforming a MAC check.  Additionally there are several places
where *checkid() is used to validate that an operation is being
performed on a valid object while not necessarily having a nearby
ipcperms() call.  In these locations, audit_ipc_obj() is called to
ensure that the information is captured by the audit system.

The audit_set_new_perm() function is called any time the permissions on
the ipc object changes.  In this case, the NEW permissions are recorded
(and note that an audit_ipc_obj() call exists just a few lines before
each instance).

2) Support for an AUDIT_IPC_SET_PERM audit message type.  This allows
for separate auxiliary audit records for normal operations on an IPC
object and permissions changes.  Note that the same struct
audit_aux_data_ipcctl is used and populated, however there are separate
audit_log_format statements based on the type of the message.  Finally,
the AUDIT_IPC block of code in audit_free_aux() was extended to handle
aux messages of this new type.  No more mem leaks I hope ;-)

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/audit.h |  7 +++++--
 ipc/msg.c             | 11 ++++++++++-
 ipc/sem.c             | 11 ++++++++++-
 ipc/shm.c             | 19 +++++++++++++++---
 ipc/util.c            |  7 ++++++-
 kernel/auditsc.c      | 54 ++++++++++++++++++++++++++++++++++++++++++++++++---
 6 files changed, 98 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index d5c40823e166..b74c148f14e3 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -83,6 +83,7 @@
 #define AUDIT_CONFIG_CHANGE	1305	/* Audit system configuration change */
 #define AUDIT_SOCKADDR		1306	/* sockaddr copied as syscall arg */
 #define AUDIT_CWD		1307	/* Current working directory */
+#define AUDIT_IPC_SET_PERM	1311	/* IPC new permissions record type */
 
 #define AUDIT_AVC		1400	/* SE Linux avc denial or grant */
 #define AUDIT_SELINUX_ERR	1401	/* Internal SE Linux Errors */
@@ -319,7 +320,8 @@ extern void auditsc_get_stamp(struct audit_context *ctx,
 			      struct timespec *t, unsigned int *serial);
 extern int  audit_set_loginuid(struct task_struct *task, uid_t loginuid);
 extern uid_t audit_get_loginuid(struct audit_context *ctx);
-extern int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, struct kern_ipc_perm *ipcp);
+extern int audit_ipc_obj(struct kern_ipc_perm *ipcp);
+extern int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, struct kern_ipc_perm *ipcp);
 extern int audit_socketcall(int nargs, unsigned long *args);
 extern int audit_sockaddr(int len, void *addr);
 extern int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt);
@@ -338,7 +340,8 @@ extern int audit_set_macxattr(const char *name);
 #define audit_inode_child(d,i,p) do { ; } while (0)
 #define auditsc_get_stamp(c,t,s) do { BUG(); } while (0)
 #define audit_get_loginuid(c) ({ -1; })
-#define audit_ipc_perms(q,u,g,m,i) ({ 0; })
+#define audit_ipc_obj(i) ({ 0; })
+#define audit_ipc_set_perm(q,u,g,m,i) ({ 0; })
 #define audit_socketcall(n,a) ({ 0; })
 #define audit_sockaddr(len, addr) ({ 0; })
 #define audit_avc_path(dentry, mnt) ({ 0; })
diff --git a/ipc/msg.c b/ipc/msg.c
index 48a7f17a7236..7d1340ccb16b 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -13,6 +13,9 @@
  * mostly rewritten, threaded and wake-one semantics added
  * MSGMAX limit removed, sysctl's added
  * (c) 1999 Manfred Spraul <manfred@colorfullife.com>
+ *
+ * support for audit of ipc object properties and permission changes
+ * Dustin Kirkland <dustin.kirkland@us.ibm.com>
  */
 
 #include <linux/capability.h>
@@ -447,6 +450,11 @@ asmlinkage long sys_msgctl (int msqid, int cmd, struct msqid_ds __user *buf)
 	if (msg_checkid(msq,msqid))
 		goto out_unlock_up;
 	ipcp = &msq->q_perm;
+
+	err = audit_ipc_obj(ipcp);
+	if (err)
+		goto out_unlock_up;
+
 	err = -EPERM;
 	if (current->euid != ipcp->cuid && 
 	    current->euid != ipcp->uid && !capable(CAP_SYS_ADMIN))
@@ -460,7 +468,8 @@ asmlinkage long sys_msgctl (int msqid, int cmd, struct msqid_ds __user *buf)
 	switch (cmd) {
 	case IPC_SET:
 	{
-		if ((err = audit_ipc_perms(setbuf.qbytes, setbuf.uid, setbuf.gid, setbuf.mode, ipcp)))
+		err = audit_ipc_set_perm(setbuf.qbytes, setbuf.uid, setbuf.gid, setbuf.mode, ipcp);
+		if (err)
 			goto out_unlock_up;
 
 		err = -EPERM;
diff --git a/ipc/sem.c b/ipc/sem.c
index 642659cd596b..7919f8ece6ba 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -61,6 +61,9 @@
  * (c) 2001 Red Hat Inc <alan@redhat.com>
  * Lockless wakeup
  * (c) 2003 Manfred Spraul <manfred@colorfullife.com>
+ *
+ * support for audit of ipc object properties and permission changes
+ * Dustin Kirkland <dustin.kirkland@us.ibm.com>
  */
 
 #include <linux/config.h>
@@ -820,6 +823,11 @@ static int semctl_down(int semid, int semnum, int cmd, int version, union semun
 		goto out_unlock;
 	}	
 	ipcp = &sma->sem_perm;
+
+	err = audit_ipc_obj(ipcp);
+	if (err)
+		goto out_unlock;
+
 	if (current->euid != ipcp->cuid && 
 	    current->euid != ipcp->uid && !capable(CAP_SYS_ADMIN)) {
 	    	err=-EPERM;
@@ -836,7 +844,8 @@ static int semctl_down(int semid, int semnum, int cmd, int version, union semun
 		err = 0;
 		break;
 	case IPC_SET:
-		if ((err = audit_ipc_perms(0, setbuf.uid, setbuf.gid, setbuf.mode, ipcp)))
+		err = audit_ipc_set_perm(0, setbuf.uid, setbuf.gid, setbuf.mode, ipcp);
+		if (err)
 			goto out_unlock;
 		ipcp->uid = setbuf.uid;
 		ipcp->gid = setbuf.gid;
diff --git a/ipc/shm.c b/ipc/shm.c
index 1c2faf62bc73..809896851902 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -13,6 +13,8 @@
  * Shared /dev/zero support, Kanoj Sarcar <kanoj@sgi.com>
  * Move the mm functionality over to mm/shmem.c, Christoph Rohland <cr@sap.com>
  *
+ * support for audit of ipc object properties and permission changes
+ * Dustin Kirkland <dustin.kirkland@us.ibm.com>
  */
 
 #include <linux/config.h>
@@ -542,6 +544,10 @@ asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds __user *buf)
 		if(err)
 			goto out_unlock;
 
+		err = audit_ipc_obj(&(shp->shm_perm));
+		if (err)
+			goto out_unlock;
+
 		if (!capable(CAP_IPC_LOCK)) {
 			err = -EPERM;
 			if (current->euid != shp->shm_perm.uid &&
@@ -594,6 +600,10 @@ asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds __user *buf)
 		if(err)
 			goto out_unlock_up;
 
+		err = audit_ipc_obj(&(shp->shm_perm));
+		if (err)
+			goto out_unlock_up;
+
 		if (current->euid != shp->shm_perm.uid &&
 		    current->euid != shp->shm_perm.cuid && 
 		    !capable(CAP_SYS_ADMIN)) {
@@ -627,12 +637,15 @@ asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds __user *buf)
 		err=-EINVAL;
 		if(shp==NULL)
 			goto out_up;
-		if ((err = audit_ipc_perms(0, setbuf.uid, setbuf.gid,
-					setbuf.mode, &(shp->shm_perm))))
-			goto out_unlock_up;
 		err = shm_checkid(shp,shmid);
 		if(err)
 			goto out_unlock_up;
+		err = audit_ipc_obj(&(shp->shm_perm));
+		if (err)
+			goto out_unlock_up;
+		err = audit_ipc_set_perm(0, setbuf.uid, setbuf.gid, setbuf.mode, &(shp->shm_perm));
+		if (err)
+			goto out_unlock_up;
 		err=-EPERM;
 		if (current->euid != shp->shm_perm.uid &&
 		    current->euid != shp->shm_perm.cuid && 
diff --git a/ipc/util.c b/ipc/util.c
index b3dcfad3b4f7..8193299f45f6 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -10,6 +10,8 @@
  *	      Manfred Spraul <manfred@colorfullife.com>
  * Oct 2002 - One lock per IPC id. RCU ipc_free for lock-free grow_ary().
  *            Mingming Cao <cmm@us.ibm.com>
+ * Mar 2006 - support for audit of ipc object properties
+ *            Dustin Kirkland <dustin.kirkland@us.ibm.com>
  */
 
 #include <linux/config.h>
@@ -27,6 +29,7 @@
 #include <linux/workqueue.h>
 #include <linux/seq_file.h>
 #include <linux/proc_fs.h>
+#include <linux/audit.h>
 
 #include <asm/unistd.h>
 
@@ -464,8 +467,10 @@ void ipc_rcu_putref(void *ptr)
  
 int ipcperms (struct kern_ipc_perm *ipcp, short flag)
 {	/* flag will most probably be 0 or S_...UGO from <linux/stat.h> */
-	int requested_mode, granted_mode;
+	int requested_mode, granted_mode, err;
 
+	if (unlikely((err = audit_ipc_obj(ipcp))))
+		return err;
 	requested_mode = (flag >> 6) | (flag >> 3) | flag;
 	granted_mode = ipcp->mode;
 	if (current->euid == ipcp->cuid || current->euid == ipcp->uid)
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index d94e0404113c..a300736ee037 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -646,6 +646,25 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 			}
 			break; }
 
+		case AUDIT_IPC_SET_PERM: {
+			struct audit_aux_data_ipcctl *axi = (void *)aux;
+			audit_log_format(ab,
+				" new qbytes=%lx new iuid=%u new igid=%u new mode=%x",
+				axi->qbytes, axi->uid, axi->gid, axi->mode);
+			if (axi->osid != 0) {
+				char *ctx = NULL;
+				u32 len;
+				if (selinux_ctxid_to_string(
+						axi->osid, &ctx, &len)) {
+					audit_log_format(ab, " osid=%u",
+							axi->osid);
+					call_panic = 1;
+				} else
+					audit_log_format(ab, " obj=%s", ctx);
+				kfree(ctx);
+			}
+			break; }
+
 		case AUDIT_SOCKETCALL: {
 			int i;
 			struct audit_aux_data_socketcall *axs = (void *)aux;
@@ -1148,7 +1167,36 @@ uid_t audit_get_loginuid(struct audit_context *ctx)
 }
 
 /**
- * audit_ipc_perms - record audit data for ipc
+ * audit_ipc_obj - record audit data for ipc object
+ * @ipcp: ipc permissions
+ *
+ * Returns 0 for success or NULL context or < 0 on error.
+ */
+int audit_ipc_obj(struct kern_ipc_perm *ipcp)
+{
+	struct audit_aux_data_ipcctl *ax;
+	struct audit_context *context = current->audit_context;
+
+	if (likely(!context))
+		return 0;
+
+	ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
+	if (!ax)
+		return -ENOMEM;
+
+	ax->uid = ipcp->uid;
+	ax->gid = ipcp->gid;
+	ax->mode = ipcp->mode;
+	selinux_get_ipc_sid(ipcp, &ax->osid);
+
+	ax->d.type = AUDIT_IPC;
+	ax->d.next = context->aux;
+	context->aux = (void *)ax;
+	return 0;
+}
+
+/**
+ * audit_ipc_set_perm - record audit data for new ipc permissions
  * @qbytes: msgq bytes
  * @uid: msgq user id
  * @gid: msgq group id
@@ -1156,7 +1204,7 @@ uid_t audit_get_loginuid(struct audit_context *ctx)
  *
  * Returns 0 for success or NULL context or < 0 on error.
  */
-int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, struct kern_ipc_perm *ipcp)
+int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, struct kern_ipc_perm *ipcp)
 {
 	struct audit_aux_data_ipcctl *ax;
 	struct audit_context *context = current->audit_context;
@@ -1174,7 +1222,7 @@ int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, str
 	ax->mode = mode;
 	selinux_get_ipc_sid(ipcp, &ax->osid);
 
-	ax->d.type = AUDIT_IPC;
+	ax->d.type = AUDIT_IPC_SET_PERM;
 	ax->d.next = context->aux;
 	context->aux = (void *)ax;
 	return 0;
-- 
cgit v1.2.3


From 0568b409c74f7a125d92a09a3f386785700ef688 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Mon, 1 May 2006 19:50:48 +0200
Subject: [PATCH] splice: fix bugs in pipe_to_file()

Found by Oleg Nesterov <oleg@tv-sign.ru>, fixed by me.

- Only allow full pages to go to the page cache.
- Check page != buf->page instead of using PIPE_BUF_FLAG_STOLEN.
- Remember to clear 'stolen' if add_to_page_cache() fails.

And as a cleanup on that:

- Make the bottom fall-through logic a little less convoluted. Also make
  the steal path hold an extra reference to the page, so we don't have
  to differentiate between stolen and non-stolen at the end.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 fs/pipe.c                 |  3 ---
 fs/splice.c               | 37 +++++++++++++++++++------------------
 include/linux/pipe_fs_i.h |  3 +--
 3 files changed, 20 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/fs/pipe.c b/fs/pipe.c
index 5a369273c51b..888f265011bf 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -99,8 +99,6 @@ static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
 {
 	struct page *page = buf->page;
 
-	buf->flags &= ~PIPE_BUF_FLAG_STOLEN;
-
 	/*
 	 * If nobody else uses this page, and we don't already have a
 	 * temporary page, let's keep track of it as a one-deep
@@ -130,7 +128,6 @@ static int anon_pipe_buf_steal(struct pipe_inode_info *pipe,
 	struct page *page = buf->page;
 
 	if (page_count(page) == 1) {
-		buf->flags |= PIPE_BUF_FLAG_STOLEN;
 		lock_page(page);
 		return 0;
 	}
diff --git a/fs/splice.c b/fs/splice.c
index 9df28d30efa0..1633778f3652 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -78,7 +78,7 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *info,
 		return 1;
 	}
 
-	buf->flags |= PIPE_BUF_FLAG_STOLEN | PIPE_BUF_FLAG_LRU;
+	buf->flags |= PIPE_BUF_FLAG_LRU;
 	return 0;
 }
 
@@ -87,7 +87,7 @@ static void page_cache_pipe_buf_release(struct pipe_inode_info *info,
 {
 	page_cache_release(buf->page);
 	buf->page = NULL;
-	buf->flags &= ~(PIPE_BUF_FLAG_STOLEN | PIPE_BUF_FLAG_LRU);
+	buf->flags &= ~PIPE_BUF_FLAG_LRU;
 }
 
 static void *page_cache_pipe_buf_map(struct file *file,
@@ -587,9 +587,10 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
 		this_len = PAGE_CACHE_SIZE - offset;
 
 	/*
-	 * Reuse buf page, if SPLICE_F_MOVE is set.
+	 * Reuse buf page, if SPLICE_F_MOVE is set and we are doing a full
+	 * page.
 	 */
-	if (sd->flags & SPLICE_F_MOVE) {
+	if ((sd->flags & SPLICE_F_MOVE) && this_len == PAGE_CACHE_SIZE) {
 		/*
 		 * If steal succeeds, buf->page is now pruned from the vm
 		 * side (LRU and page cache) and we can reuse it. The page
@@ -604,6 +605,8 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
 			goto find_page;
 		}
 
+		page_cache_get(page);
+
 		if (!(buf->flags & PIPE_BUF_FLAG_LRU))
 			lru_cache_add(page);
 	} else {
@@ -662,7 +665,7 @@ find_page:
 	} else if (ret)
 		goto out;
 
-	if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) {
+	if (buf->page != page) {
 		char *dst = kmap_atomic(page, KM_USER0);
 
 		memcpy(dst + offset, src + buf->offset, this_len);
@@ -671,22 +674,20 @@ find_page:
 	}
 
 	ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len);
-	if (ret == AOP_TRUNCATED_PAGE) {
+	if (!ret) {
+		/*
+		 * Return the number of bytes written and mark page as
+		 * accessed, we are now done!
+		 */
+		ret = this_len;
+		mark_page_accessed(page);
+		balance_dirty_pages_ratelimited(mapping);
+	} else if (ret == AOP_TRUNCATED_PAGE) {
 		page_cache_release(page);
 		goto find_page;
-	} else if (ret)
-		goto out;
-
-	/*
-	 * Return the number of bytes written.
-	 */
-	ret = this_len;
-	mark_page_accessed(page);
-	balance_dirty_pages_ratelimited(mapping);
+	}
 out:
-	if (!(buf->flags & PIPE_BUF_FLAG_STOLEN))
-		page_cache_release(page);
-
+	page_cache_release(page);
 	unlock_page(page);
 out_nomem:
 	buf->ops->unmap(info, buf);
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 0008d4bd4059..3130977fc6ab 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -5,8 +5,7 @@
 
 #define PIPE_BUFFERS (16)
 
-#define PIPE_BUF_FLAG_STOLEN	0x01
-#define PIPE_BUF_FLAG_LRU	0x02
+#define PIPE_BUF_FLAG_LRU	0x01
 
 struct pipe_buffer {
 	struct page *page;
-- 
cgit v1.2.3


From f84d751994441292593523c7069ed147176f6cab Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Mon, 1 May 2006 19:59:03 +0200
Subject: [PATCH] pipe: introduce ->pin() buffer operation

The ->map() function is really expensive on highmem machines right now,
since it has to use the slower kmap() instead of kmap_atomic(). Splice
rarely needs to access the virtual address of a page, so it's a waste
of time doing it.

Introduce ->pin() to take over the responsibility of making sure the
page data is valid. ->map() is then reduced to just kmap(). That way we
can also share a most of the pipe buffer ops between pipe.c and splice.c

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 fs/pipe.c                 | 39 +++++++++++---------
 fs/splice.c               | 91 ++++++++++++++++-------------------------------
 include/linux/pipe_fs_i.h | 21 ++++++++++-
 3 files changed, 73 insertions(+), 78 deletions(-)

(limited to 'include/linux')

diff --git a/fs/pipe.c b/fs/pipe.c
index 888f265011bf..d9644fd9cc0d 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -110,14 +110,14 @@ static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
 		page_cache_release(page);
 }
 
-static void * anon_pipe_buf_map(struct file *file, struct pipe_inode_info *pipe,
-				struct pipe_buffer *buf)
+void *generic_pipe_buf_map(struct pipe_inode_info *pipe,
+			   struct pipe_buffer *buf)
 {
 	return kmap(buf->page);
 }
 
-static void anon_pipe_buf_unmap(struct pipe_inode_info *pipe,
-				struct pipe_buffer *buf)
+void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,
+			    struct pipe_buffer *buf)
 {
 	kunmap(buf->page);
 }
@@ -135,19 +135,24 @@ static int anon_pipe_buf_steal(struct pipe_inode_info *pipe,
 	return 1;
 }
 
-static void anon_pipe_buf_get(struct pipe_inode_info *info,
-			      struct pipe_buffer *buf)
+void generic_pipe_buf_get(struct pipe_inode_info *info, struct pipe_buffer *buf)
 {
 	page_cache_get(buf->page);
 }
 
+int generic_pipe_buf_pin(struct pipe_inode_info *info, struct pipe_buffer *buf)
+{
+	return 0;
+}
+
 static struct pipe_buf_operations anon_pipe_buf_ops = {
 	.can_merge = 1,
-	.map = anon_pipe_buf_map,
-	.unmap = anon_pipe_buf_unmap,
+	.map = generic_pipe_buf_map,
+	.unmap = generic_pipe_buf_unmap,
+	.pin = generic_pipe_buf_pin,
 	.release = anon_pipe_buf_release,
 	.steal = anon_pipe_buf_steal,
-	.get = anon_pipe_buf_get,
+	.get = generic_pipe_buf_get,
 };
 
 static ssize_t
@@ -183,12 +188,14 @@ pipe_readv(struct file *filp, const struct iovec *_iov,
 			if (chars > total_len)
 				chars = total_len;
 
-			addr = ops->map(filp, pipe, buf);
-			if (IS_ERR(addr)) {
+			error = ops->pin(pipe, buf);
+			if (error) {
 				if (!ret)
-					ret = PTR_ERR(addr);
+					error = ret;
 				break;
 			}
+
+			addr = ops->map(pipe, buf);
 			error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars);
 			ops->unmap(pipe, buf);
 			if (unlikely(error)) {
@@ -300,11 +307,11 @@ pipe_writev(struct file *filp, const struct iovec *_iov,
 			void *addr;
 			int error;
 
-			addr = ops->map(filp, pipe, buf);
-			if (IS_ERR(addr)) {
-				error = PTR_ERR(addr);
+			error = ops->pin(pipe, buf);
+			if (error)
 				goto out;
-			}
+
+			addr = ops->map(pipe, buf);
 			error = pipe_iov_copy_from_user(offset + addr, iov,
 							chars);
 			ops->unmap(pipe, buf);
diff --git a/fs/splice.c b/fs/splice.c
index 1633778f3652..d7538d83c367 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -90,9 +90,8 @@ static void page_cache_pipe_buf_release(struct pipe_inode_info *info,
 	buf->flags &= ~PIPE_BUF_FLAG_LRU;
 }
 
-static void *page_cache_pipe_buf_map(struct file *file,
-				     struct pipe_inode_info *info,
-				     struct pipe_buffer *buf)
+static int page_cache_pipe_buf_pin(struct pipe_inode_info *info,
+				   struct pipe_buffer *buf)
 {
 	struct page *page = buf->page;
 	int err;
@@ -118,49 +117,25 @@ static void *page_cache_pipe_buf_map(struct file *file,
 		}
 
 		/*
-		 * Page is ok afterall, fall through to mapping.
+		 * Page is ok afterall, we are done.
 		 */
 		unlock_page(page);
 	}
 
-	return kmap(page);
+	return 0;
 error:
 	unlock_page(page);
-	return ERR_PTR(err);
-}
-
-static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info,
-				      struct pipe_buffer *buf)
-{
-	kunmap(buf->page);
-}
-
-static void *user_page_pipe_buf_map(struct file *file,
-				    struct pipe_inode_info *pipe,
-				    struct pipe_buffer *buf)
-{
-	return kmap(buf->page);
-}
-
-static void user_page_pipe_buf_unmap(struct pipe_inode_info *pipe,
-				     struct pipe_buffer *buf)
-{
-	kunmap(buf->page);
-}
-
-static void page_cache_pipe_buf_get(struct pipe_inode_info *info,
-				    struct pipe_buffer *buf)
-{
-	page_cache_get(buf->page);
+	return err;
 }
 
 static struct pipe_buf_operations page_cache_pipe_buf_ops = {
 	.can_merge = 0,
-	.map = page_cache_pipe_buf_map,
-	.unmap = page_cache_pipe_buf_unmap,
+	.map = generic_pipe_buf_map,
+	.unmap = generic_pipe_buf_unmap,
+	.pin = page_cache_pipe_buf_pin,
 	.release = page_cache_pipe_buf_release,
 	.steal = page_cache_pipe_buf_steal,
-	.get = page_cache_pipe_buf_get,
+	.get = generic_pipe_buf_get,
 };
 
 static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
@@ -171,11 +146,12 @@ static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
 
 static struct pipe_buf_operations user_page_pipe_buf_ops = {
 	.can_merge = 0,
-	.map = user_page_pipe_buf_map,
-	.unmap = user_page_pipe_buf_unmap,
+	.map = generic_pipe_buf_map,
+	.unmap = generic_pipe_buf_unmap,
+	.pin = generic_pipe_buf_pin,
 	.release = page_cache_pipe_buf_release,
 	.steal = user_page_pipe_buf_steal,
-	.get = page_cache_pipe_buf_get,
+	.get = generic_pipe_buf_get,
 };
 
 /*
@@ -517,26 +493,16 @@ static int pipe_to_sendpage(struct pipe_inode_info *info,
 {
 	struct file *file = sd->file;
 	loff_t pos = sd->pos;
-	ssize_t ret;
-	void *ptr;
-	int more;
+	int ret, more;
 
-	/*
-	 * Sub-optimal, but we are limited by the pipe ->map. We don't
-	 * need a kmap'ed buffer here, we just want to make sure we
-	 * have the page pinned if the pipe page originates from the
-	 * page cache.
-	 */
-	ptr = buf->ops->map(file, info, buf);
-	if (IS_ERR(ptr))
-		return PTR_ERR(ptr);
-
-	more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
+	ret = buf->ops->pin(info, buf);
+	if (!ret) {
+		more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
 
-	ret = file->f_op->sendpage(file, buf->page, buf->offset, sd->len,
-				   &pos, more);
+		ret = file->f_op->sendpage(file, buf->page, buf->offset,
+					   sd->len, &pos, more);
+	}
 
-	buf->ops->unmap(info, buf);
 	return ret;
 }
 
@@ -569,15 +535,14 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
 	unsigned int offset, this_len;
 	struct page *page;
 	pgoff_t index;
-	char *src;
 	int ret;
 
 	/*
 	 * make sure the data in this buffer is uptodate
 	 */
-	src = buf->ops->map(file, info, buf);
-	if (IS_ERR(src))
-		return PTR_ERR(src);
+	ret = buf->ops->pin(info, buf);
+	if (unlikely(ret))
+		return ret;
 
 	index = sd->pos >> PAGE_CACHE_SHIFT;
 	offset = sd->pos & ~PAGE_CACHE_MASK;
@@ -666,11 +631,16 @@ find_page:
 		goto out;
 
 	if (buf->page != page) {
-		char *dst = kmap_atomic(page, KM_USER0);
+		/*
+		 * Careful, ->map() uses KM_USER0!
+		 */
+		char *src = buf->ops->map(info, buf);
+		char *dst = kmap_atomic(page, KM_USER1);
 
 		memcpy(dst + offset, src + buf->offset, this_len);
 		flush_dcache_page(page);
-		kunmap_atomic(dst, KM_USER0);
+		kunmap_atomic(dst, KM_USER1);
+		buf->ops->unmap(info, buf);
 	}
 
 	ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len);
@@ -690,7 +660,6 @@ out:
 	page_cache_release(page);
 	unlock_page(page);
 out_nomem:
-	buf->ops->unmap(info, buf);
 	return ret;
 }
 
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 3130977fc6ab..b8aae1fc5185 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -14,10 +14,23 @@ struct pipe_buffer {
 	unsigned int flags;
 };
 
+/*
+ * Note on the nesting of these functions:
+ *
+ * ->pin()
+ *	->steal()
+ *	...
+ *	->map()
+ *	...
+ *	->unmap()
+ *
+ * That is, ->map() must be called on a pinned buffer, same goes for ->steal().
+ */
 struct pipe_buf_operations {
 	int can_merge;
-	void * (*map)(struct file *, struct pipe_inode_info *, struct pipe_buffer *);
+	void * (*map)(struct pipe_inode_info *, struct pipe_buffer *);
 	void (*unmap)(struct pipe_inode_info *, struct pipe_buffer *);
+	int (*pin)(struct pipe_inode_info *, struct pipe_buffer *);
 	void (*release)(struct pipe_inode_info *, struct pipe_buffer *);
 	int (*steal)(struct pipe_inode_info *, struct pipe_buffer *);
 	void (*get)(struct pipe_inode_info *, struct pipe_buffer *);
@@ -50,6 +63,12 @@ struct pipe_inode_info * alloc_pipe_info(struct inode * inode);
 void free_pipe_info(struct inode * inode);
 void __free_pipe_info(struct pipe_inode_info *);
 
+/* Generic pipe buffer ops functions */
+void *generic_pipe_buf_map(struct pipe_inode_info *, struct pipe_buffer *);
+void generic_pipe_buf_unmap(struct pipe_inode_info *, struct pipe_buffer *);
+void generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *);
+int generic_pipe_buf_pin(struct pipe_inode_info *, struct pipe_buffer *);
+
 /*
  * splice is tied to pipes as a transport (at least for now), so we'll just
  * add the splice flags here.
-- 
cgit v1.2.3


From f6762b7ad8edd6abc802542ce845d3bc8adcb92f Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Mon, 1 May 2006 20:02:05 +0200
Subject: [PATCH] pipe: enable atomic copying of pipe data to/from user space

The pipe ->map() method uses kmap() to virtually map the pages, which
is both slow and has known scalability issues on SMP. This patch enables
atomic copying of pipe pages, by pre-faulting data and using kmap_atomic()
instead.

lmbench bw_pipe and lat_pipe measurements agree this is a Good Thing. Here
are results from that on a UP machine with highmem (1.5GiB of RAM), running
first a UP kernel, SMP kernel, and SMP kernel patched.

Vanilla-UP:
Pipe bandwidth: 1622.28 MB/sec
Pipe bandwidth: 1610.59 MB/sec
Pipe bandwidth: 1608.30 MB/sec
Pipe latency: 7.3275 microseconds
Pipe latency: 7.2995 microseconds
Pipe latency: 7.3097 microseconds

Vanilla-SMP:
Pipe bandwidth: 1382.19 MB/sec
Pipe bandwidth: 1317.27 MB/sec
Pipe bandwidth: 1355.61 MB/sec
Pipe latency: 9.6402 microseconds
Pipe latency: 9.6696 microseconds
Pipe latency: 9.6153 microseconds

Patched-SMP:
Pipe bandwidth: 1578.70 MB/sec
Pipe bandwidth: 1579.95 MB/sec
Pipe bandwidth: 1578.63 MB/sec
Pipe latency: 9.1654 microseconds
Pipe latency: 9.2266 microseconds
Pipe latency: 9.1527 microseconds

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 fs/pipe.c                 | 141 ++++++++++++++++++++++++++++++++++++++--------
 fs/splice.c               |   4 +-
 include/linux/pipe_fs_i.h |  11 ++--
 3 files changed, 126 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/fs/pipe.c b/fs/pipe.c
index d9644fd9cc0d..3941a7f78b5d 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -55,7 +55,8 @@ void pipe_wait(struct pipe_inode_info *pipe)
 }
 
 static int
-pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len)
+pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len,
+			int atomic)
 {
 	unsigned long copy;
 
@@ -64,8 +65,13 @@ pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len)
 			iov++;
 		copy = min_t(unsigned long, len, iov->iov_len);
 
-		if (copy_from_user(to, iov->iov_base, copy))
-			return -EFAULT;
+		if (atomic) {
+			if (__copy_from_user_inatomic(to, iov->iov_base, copy))
+				return -EFAULT;
+		} else {
+			if (copy_from_user(to, iov->iov_base, copy))
+				return -EFAULT;
+		}
 		to += copy;
 		len -= copy;
 		iov->iov_base += copy;
@@ -75,7 +81,8 @@ pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len)
 }
 
 static int
-pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len)
+pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len,
+		      int atomic)
 {
 	unsigned long copy;
 
@@ -84,8 +91,13 @@ pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len)
 			iov++;
 		copy = min_t(unsigned long, len, iov->iov_len);
 
-		if (copy_to_user(iov->iov_base, from, copy))
-			return -EFAULT;
+		if (atomic) {
+			if (__copy_to_user_inatomic(iov->iov_base, from, copy))
+				return -EFAULT;
+		} else {
+			if (copy_to_user(iov->iov_base, from, copy))
+				return -EFAULT;
+		}
 		from += copy;
 		len -= copy;
 		iov->iov_base += copy;
@@ -94,6 +106,47 @@ pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len)
 	return 0;
 }
 
+/*
+ * Attempt to pre-fault in the user memory, so we can use atomic copies.
+ * Returns the number of bytes not faulted in.
+ */
+static int iov_fault_in_pages_write(struct iovec *iov, unsigned long len)
+{
+	while (!iov->iov_len)
+		iov++;
+
+	while (len > 0) {
+		unsigned long this_len;
+
+		this_len = min_t(unsigned long, len, iov->iov_len);
+		if (fault_in_pages_writeable(iov->iov_base, this_len))
+			break;
+
+		len -= this_len;
+		iov++;
+	}
+
+	return len;
+}
+
+/*
+ * Pre-fault in the user memory, so we can use atomic copies.
+ */
+static void iov_fault_in_pages_read(struct iovec *iov, unsigned long len)
+{
+	while (!iov->iov_len)
+		iov++;
+
+	while (len > 0) {
+		unsigned long this_len;
+
+		this_len = min_t(unsigned long, len, iov->iov_len);
+		fault_in_pages_readable(iov->iov_base, this_len);
+		len -= this_len;
+		iov++;
+	}
+}
+
 static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
 				  struct pipe_buffer *buf)
 {
@@ -111,15 +164,24 @@ static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
 }
 
 void *generic_pipe_buf_map(struct pipe_inode_info *pipe,
-			   struct pipe_buffer *buf)
+			   struct pipe_buffer *buf, int atomic)
 {
+	if (atomic) {
+		buf->flags |= PIPE_BUF_FLAG_ATOMIC;
+		return kmap_atomic(buf->page, KM_USER0);
+	}
+
 	return kmap(buf->page);
 }
 
 void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,
-			    struct pipe_buffer *buf)
+			    struct pipe_buffer *buf, void *map_data)
 {
-	kunmap(buf->page);
+	if (buf->flags & PIPE_BUF_FLAG_ATOMIC) {
+		buf->flags &= ~PIPE_BUF_FLAG_ATOMIC;
+		kunmap_atomic(map_data, KM_USER0);
+	} else
+		kunmap(buf->page);
 }
 
 static int anon_pipe_buf_steal(struct pipe_inode_info *pipe,
@@ -183,7 +245,7 @@ pipe_readv(struct file *filp, const struct iovec *_iov,
 			struct pipe_buf_operations *ops = buf->ops;
 			void *addr;
 			size_t chars = buf->len;
-			int error;
+			int error, atomic;
 
 			if (chars > total_len)
 				chars = total_len;
@@ -195,12 +257,21 @@ pipe_readv(struct file *filp, const struct iovec *_iov,
 				break;
 			}
 
-			addr = ops->map(pipe, buf);
-			error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars);
-			ops->unmap(pipe, buf);
+			atomic = !iov_fault_in_pages_write(iov, chars);
+redo:
+			addr = ops->map(pipe, buf, atomic);
+			error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic);
+			ops->unmap(pipe, buf, addr);
 			if (unlikely(error)) {
+				/*
+				 * Just retry with the slow path if we failed.
+				 */
+				if (atomic) {
+					atomic = 0;
+					goto redo;
+				}
 				if (!ret)
-					ret = -EFAULT;
+					ret = error;
 				break;
 			}
 			ret += chars;
@@ -304,21 +375,28 @@ pipe_writev(struct file *filp, const struct iovec *_iov,
 		int offset = buf->offset + buf->len;
 
 		if (ops->can_merge && offset + chars <= PAGE_SIZE) {
+			int error, atomic = 1;
 			void *addr;
-			int error;
 
 			error = ops->pin(pipe, buf);
 			if (error)
 				goto out;
 
-			addr = ops->map(pipe, buf);
+			iov_fault_in_pages_read(iov, chars);
+redo1:
+			addr = ops->map(pipe, buf, atomic);
 			error = pipe_iov_copy_from_user(offset + addr, iov,
-							chars);
-			ops->unmap(pipe, buf);
+							chars, atomic);
+			ops->unmap(pipe, buf, addr);
 			ret = error;
 			do_wakeup = 1;
-			if (error)
+			if (error) {
+				if (atomic) {
+					atomic = 0;
+					goto redo1;
+				}
 				goto out;
+			}
 			buf->len += chars;
 			total_len -= chars;
 			ret = chars;
@@ -341,7 +419,8 @@ pipe_writev(struct file *filp, const struct iovec *_iov,
 			int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1);
 			struct pipe_buffer *buf = pipe->bufs + newbuf;
 			struct page *page = pipe->tmp_page;
-			int error;
+			char *src;
+			int error, atomic = 1;
 
 			if (!page) {
 				page = alloc_page(GFP_HIGHUSER);
@@ -361,11 +440,27 @@ pipe_writev(struct file *filp, const struct iovec *_iov,
 			if (chars > total_len)
 				chars = total_len;
 
-			error = pipe_iov_copy_from_user(kmap(page), iov, chars);
-			kunmap(page);
+			iov_fault_in_pages_read(iov, chars);
+redo2:
+			if (atomic)
+				src = kmap_atomic(page, KM_USER0);
+			else
+				src = kmap(page);
+
+			error = pipe_iov_copy_from_user(src, iov, chars,
+							atomic);
+			if (atomic)
+				kunmap_atomic(src, KM_USER0);
+			else
+				kunmap(page);
+
 			if (unlikely(error)) {
+				if (atomic) {
+					atomic = 0;
+					goto redo2;
+				}
 				if (!ret)
-					ret = -EFAULT;
+					ret = error;
 				break;
 			}
 			ret += chars;
diff --git a/fs/splice.c b/fs/splice.c
index 0a6916423e7d..d4664a297bab 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -640,13 +640,13 @@ find_page:
 		/*
 		 * Careful, ->map() uses KM_USER0!
 		 */
-		char *src = buf->ops->map(info, buf);
+		char *src = buf->ops->map(info, buf, 1);
 		char *dst = kmap_atomic(page, KM_USER1);
 
 		memcpy(dst + offset, src + buf->offset, this_len);
 		flush_dcache_page(page);
 		kunmap_atomic(dst, KM_USER1);
-		buf->ops->unmap(info, buf);
+		buf->ops->unmap(info, buf, src);
 	}
 
 	ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len);
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index b8aae1fc5185..4c054491e38e 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -5,7 +5,8 @@
 
 #define PIPE_BUFFERS (16)
 
-#define PIPE_BUF_FLAG_LRU	0x01
+#define PIPE_BUF_FLAG_LRU	0x01	/* page is on the LRU */
+#define PIPE_BUF_FLAG_ATOMIC	0x02	/* was atomically mapped */
 
 struct pipe_buffer {
 	struct page *page;
@@ -28,8 +29,8 @@ struct pipe_buffer {
  */
 struct pipe_buf_operations {
 	int can_merge;
-	void * (*map)(struct pipe_inode_info *, struct pipe_buffer *);
-	void (*unmap)(struct pipe_inode_info *, struct pipe_buffer *);
+	void * (*map)(struct pipe_inode_info *, struct pipe_buffer *, int);
+	void (*unmap)(struct pipe_inode_info *, struct pipe_buffer *, void *);
 	int (*pin)(struct pipe_inode_info *, struct pipe_buffer *);
 	void (*release)(struct pipe_inode_info *, struct pipe_buffer *);
 	int (*steal)(struct pipe_inode_info *, struct pipe_buffer *);
@@ -64,8 +65,8 @@ void free_pipe_info(struct inode * inode);
 void __free_pipe_info(struct pipe_inode_info *);
 
 /* Generic pipe buffer ops functions */
-void *generic_pipe_buf_map(struct pipe_inode_info *, struct pipe_buffer *);
-void generic_pipe_buf_unmap(struct pipe_inode_info *, struct pipe_buffer *);
+void *generic_pipe_buf_map(struct pipe_inode_info *, struct pipe_buffer *, int);
+void generic_pipe_buf_unmap(struct pipe_inode_info *, struct pipe_buffer *, void *);
 void generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *);
 int generic_pipe_buf_pin(struct pipe_inode_info *, struct pipe_buffer *);
 
-- 
cgit v1.2.3


From 7afa6fd037e51e95d322990cb127bb2b1217251a Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Mon, 1 May 2006 20:02:33 +0200
Subject: [PATCH] vmsplice: allow user to pass in gift pages

If SPLICE_F_GIFT is set, the user is basically giving this pages away to
the kernel. That means we can steal them for eg page cache uses instead
of copying it.

The data must be properly page aligned and also a multiple of the page size
in length.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 fs/splice.c               | 28 +++++++++++++++++++++++++---
 include/linux/pipe_fs_i.h |  2 ++
 2 files changed, 27 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/splice.c b/fs/splice.c
index d4664a297bab..b150493b6fc3 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -141,7 +141,10 @@ static struct pipe_buf_operations page_cache_pipe_buf_ops = {
 static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
 				    struct pipe_buffer *buf)
 {
-	return 1;
+	if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
+		return 1;
+
+	return 0;
 }
 
 static struct pipe_buf_operations user_page_pipe_buf_ops = {
@@ -186,6 +189,9 @@ static ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
 			buf->offset = spd->partial[page_nr].offset;
 			buf->len = spd->partial[page_nr].len;
 			buf->ops = spd->ops;
+			if (spd->flags & SPLICE_F_GIFT)
+				buf->flags |= PIPE_BUF_FLAG_GIFT;
+
 			pipe->nrbufs++;
 			page_nr++;
 			ret += buf->len;
@@ -1073,7 +1079,7 @@ static long do_splice(struct file *in, loff_t __user *off_in,
  */
 static int get_iovec_page_array(const struct iovec __user *iov,
 				unsigned int nr_vecs, struct page **pages,
-				struct partial_page *partial)
+				struct partial_page *partial, int aligned)
 {
 	int buffers = 0, error = 0;
 
@@ -1113,6 +1119,15 @@ static int get_iovec_page_array(const struct iovec __user *iov,
 		 * in the user pages.
 		 */
 		off = (unsigned long) base & ~PAGE_MASK;
+
+		/*
+		 * If asked for alignment, the offset must be zero and the
+		 * length a multiple of the PAGE_SIZE.
+		 */
+		error = -EINVAL;
+		if (aligned && (off || len & ~PAGE_MASK))
+			break;
+
 		npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
 		if (npages > PIPE_BUFFERS - buffers)
 			npages = PIPE_BUFFERS - buffers;
@@ -1206,7 +1221,8 @@ static long do_vmsplice(struct file *file, const struct iovec __user *iov,
 	else if (unlikely(!nr_segs))
 		return 0;
 
-	spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial);
+	spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial,
+					    flags & SPLICE_F_GIFT);
 	if (spd.nr_pages <= 0)
 		return spd.nr_pages;
 
@@ -1314,6 +1330,12 @@ static int link_pipe(struct pipe_inode_info *ipipe,
 				obuf = opipe->bufs + nbuf;
 				*obuf = *ibuf;
 
+				/*
+				 * Don't inherit the gift flag, we need to
+				 * prevent multiple steals of this page.
+				 */
+				obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
+
 				if (obuf->len > len)
 					obuf->len = len;
 
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 4c054491e38e..df4d3fa7d3dc 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -7,6 +7,7 @@
 
 #define PIPE_BUF_FLAG_LRU	0x01	/* page is on the LRU */
 #define PIPE_BUF_FLAG_ATOMIC	0x02	/* was atomically mapped */
+#define PIPE_BUF_FLAG_GIFT	0x04	/* page is a gift */
 
 struct pipe_buffer {
 	struct page *page;
@@ -79,6 +80,7 @@ int generic_pipe_buf_pin(struct pipe_inode_info *, struct pipe_buffer *);
 				 /* we may still block on the fd we splice */
 				 /* from/to, of course */
 #define SPLICE_F_MORE	(0x04)	/* expect more data */
+#define SPLICE_F_GIFT	(0x08)	/* pages passed in are a gift */
 
 /*
  * Passed to the actors
-- 
cgit v1.2.3


From 46c5ea3c9ae7fbc6e52a13c92e59d4fc7f4ca80a Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Tue, 2 May 2006 05:12:22 +0200
Subject: [NETFILTER] x_tables: fix compat related crash on non-x86

When iptables userspace adds an ipt_standard_target, it calculates the size
of the entire entry as:

sizeof(struct ipt_entry) + XT_ALIGN(sizeof(struct ipt_standard_target))

ipt_standard_target looks like this:

  struct xt_standard_target
  {
        struct xt_entry_target target;
        int verdict;
  };

xt_entry_target contains a pointer, so when compiled for 64 bit the
structure gets an extra 4 byte of padding at the end. On 32 bit
architectures where iptables aligns to 8 byte it will also have 4
byte padding at the end because it is only 36 bytes large.

The compat_ipt_standard_fn in the kernel adjusts the offsets by

  sizeof(struct ipt_standard_target) - sizeof(struct compat_ipt_standard_target),

which will always result in 4, even if the structure from userspace
was already padded to a multiple of 8. On x86 this works out by
accident because userspace only aligns to 4, on all other
architectures this is broken and causes incorrect adjustments to
the size and following offsets.

Thanks to Linus for lots of debugging help and testing.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/netfilter/x_tables.h |  8 ++++++++
 net/ipv4/netfilter/ip_tables.c     | 33 ++++++++++++++-------------------
 2 files changed, 22 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 38701454e197..48cc32d83f77 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -337,6 +337,10 @@ struct compat_xt_entry_match
 			char name[XT_FUNCTION_MAXNAMELEN - 1];
 			u_int8_t revision;
 		} user;
+		struct {
+			u_int16_t match_size;
+			compat_uptr_t match;
+		} kernel;
 		u_int16_t match_size;
 	} u;
 	unsigned char data[0];
@@ -350,6 +354,10 @@ struct compat_xt_entry_target
 			char name[XT_FUNCTION_MAXNAMELEN - 1];
 			u_int8_t revision;
 		} user;
+		struct {
+			u_int16_t target_size;
+			compat_uptr_t target;
+		} kernel;
 		u_int16_t target_size;
 	} u;
 	unsigned char data[0];
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index d25ac8ba6eba..6d1c11563943 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -956,15 +956,16 @@ struct compat_ipt_standard_target
 	compat_int_t verdict;
 };
 
-#define IPT_ST_OFFSET	(sizeof(struct ipt_standard_target) - \
-				sizeof(struct compat_ipt_standard_target))
-
 struct compat_ipt_standard
 {
 	struct compat_ipt_entry entry;
 	struct compat_ipt_standard_target target;
 };
 
+#define IPT_ST_LEN		XT_ALIGN(sizeof(struct ipt_standard_target))
+#define IPT_ST_COMPAT_LEN	COMPAT_XT_ALIGN(sizeof(struct compat_ipt_standard_target))
+#define IPT_ST_OFFSET		(IPT_ST_LEN - IPT_ST_COMPAT_LEN)
+
 static int compat_ipt_standard_fn(void *target,
 		void **dstptr, int *size, int convert)
 {
@@ -975,35 +976,29 @@ static int compat_ipt_standard_fn(void *target,
 	ret = 0;
 	switch (convert) {
 		case COMPAT_TO_USER:
-			pst = (struct ipt_standard_target *)target;
+			pst = target;
 			memcpy(&compat_st.target, &pst->target,
-					sizeof(struct ipt_entry_target));
+				sizeof(compat_st.target));
 			compat_st.verdict = pst->verdict;
 			if (compat_st.verdict > 0)
 				compat_st.verdict -=
 					compat_calc_jump(compat_st.verdict);
-			compat_st.target.u.user.target_size =
-			sizeof(struct compat_ipt_standard_target);
-			if (__copy_to_user(*dstptr, &compat_st,
-				sizeof(struct compat_ipt_standard_target)))
+			compat_st.target.u.user.target_size = IPT_ST_COMPAT_LEN;
+			if (copy_to_user(*dstptr, &compat_st, IPT_ST_COMPAT_LEN))
 				ret = -EFAULT;
 			*size -= IPT_ST_OFFSET;
-			*dstptr += sizeof(struct compat_ipt_standard_target);
+			*dstptr += IPT_ST_COMPAT_LEN;
 			break;
 		case COMPAT_FROM_USER:
-			pcompat_st =
-				(struct compat_ipt_standard_target *)target;
-			memcpy(&st.target, &pcompat_st->target,
-					sizeof(struct ipt_entry_target));
+			pcompat_st = target;
+			memcpy(&st.target, &pcompat_st->target, IPT_ST_COMPAT_LEN);
 			st.verdict = pcompat_st->verdict;
 			if (st.verdict > 0)
 				st.verdict += compat_calc_jump(st.verdict);
-			st.target.u.user.target_size =
-			sizeof(struct ipt_standard_target);
-			memcpy(*dstptr, &st,
-					sizeof(struct ipt_standard_target));
+			st.target.u.user.target_size = IPT_ST_LEN;
+			memcpy(*dstptr, &st, IPT_ST_LEN);
 			*size += IPT_ST_OFFSET;
-			*dstptr += sizeof(struct ipt_standard_target);
+			*dstptr += IPT_ST_LEN;
 			break;
 		case COMPAT_CALC_SIZE:
 			*size += IPT_ST_OFFSET;
-- 
cgit v1.2.3


From a893b99be71f1d669b74f840e3a683dd077d007b Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Tue, 2 May 2006 15:03:27 +0200
Subject: [PATCH] splice: fix page LRU accounting

Currently we rely on the PIPE_BUF_FLAG_LRU flag being set correctly
to know whether we need to fiddle with page LRU state after stealing it,
however for some origins we just don't know if the page is on the LRU
list or not.

So remove PIPE_BUF_FLAG_LRU and do this check/add manually in pipe_to_file()
instead.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 fs/splice.c               | 31 +++++++++++++++++++++----------
 include/linux/pipe_fs_i.h |  5 ++---
 2 files changed, 23 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/fs/splice.c b/fs/splice.c
index b0c157d76948..3318b965c10b 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -78,7 +78,6 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *info,
 		return 1;
 	}
 
-	buf->flags |= PIPE_BUF_FLAG_LRU;
 	return 0;
 }
 
@@ -86,8 +85,6 @@ static void page_cache_pipe_buf_release(struct pipe_inode_info *info,
 					struct pipe_buffer *buf)
 {
 	page_cache_release(buf->page);
-	buf->page = NULL;
-	buf->flags &= ~PIPE_BUF_FLAG_LRU;
 }
 
 static int page_cache_pipe_buf_pin(struct pipe_inode_info *info,
@@ -570,22 +567,36 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
 	if ((sd->flags & SPLICE_F_MOVE) && this_len == PAGE_CACHE_SIZE) {
 		/*
 		 * If steal succeeds, buf->page is now pruned from the vm
-		 * side (LRU and page cache) and we can reuse it. The page
-		 * will also be looked on successful return.
+		 * side (page cache) and we can reuse it. The page will also
+		 * be locked on successful return.
 		 */
 		if (buf->ops->steal(info, buf))
 			goto find_page;
 
 		page = buf->page;
+		page_cache_get(page);
+
+		/*
+		 * page must be on the LRU for adding to the pagecache.
+		 * Check this without grabbing the zone lock, if it isn't
+		 * the do grab the zone lock, recheck, and add if necessary.
+		 */
+		if (!PageLRU(page)) {
+			struct zone *zone = page_zone(page);
+
+			spin_lock_irq(&zone->lru_lock);
+			if (!PageLRU(page)) {
+				SetPageLRU(page);
+				add_page_to_inactive_list(zone, page);
+			}
+			spin_unlock_irq(&zone->lru_lock);
+		}
+
 		if (add_to_page_cache(page, mapping, index, gfp_mask)) {
+			page_cache_release(page);
 			unlock_page(page);
 			goto find_page;
 		}
-
-		page_cache_get(page);
-
-		if (!(buf->flags & PIPE_BUF_FLAG_LRU))
-			lru_cache_add(page);
 	} else {
 find_page:
 		page = find_lock_page(mapping, index);
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index df4d3fa7d3dc..070954f05947 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -5,9 +5,8 @@
 
 #define PIPE_BUFFERS (16)
 
-#define PIPE_BUF_FLAG_LRU	0x01	/* page is on the LRU */
-#define PIPE_BUF_FLAG_ATOMIC	0x02	/* was atomically mapped */
-#define PIPE_BUF_FLAG_GIFT	0x04	/* page is a gift */
+#define PIPE_BUF_FLAG_ATOMIC	0x01	/* was atomically mapped */
+#define PIPE_BUF_FLAG_GIFT	0x02	/* page is a gift */
 
 struct pipe_buffer {
 	struct page *page;
-- 
cgit v1.2.3


From 330ab71619bacc4d4494227a6cfc9b7f5500403d Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Tue, 2 May 2006 15:29:57 +0200
Subject: [PATCH] vmsplice: restrict stealing a little more

Apply the same rules as the anon pipe pages, only allow stealing
if no one else is using the page.

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 fs/pipe.c                 | 6 +++---
 fs/splice.c               | 2 +-
 include/linux/pipe_fs_i.h | 1 +
 3 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/pipe.c b/fs/pipe.c
index 3941a7f78b5d..5acd8954aaa0 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -184,8 +184,8 @@ void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,
 		kunmap(buf->page);
 }
 
-static int anon_pipe_buf_steal(struct pipe_inode_info *pipe,
-			       struct pipe_buffer *buf)
+int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
+			   struct pipe_buffer *buf)
 {
 	struct page *page = buf->page;
 
@@ -213,7 +213,7 @@ static struct pipe_buf_operations anon_pipe_buf_ops = {
 	.unmap = generic_pipe_buf_unmap,
 	.pin = generic_pipe_buf_pin,
 	.release = anon_pipe_buf_release,
-	.steal = anon_pipe_buf_steal,
+	.steal = generic_pipe_buf_steal,
 	.get = generic_pipe_buf_get,
 };
 
diff --git a/fs/splice.c b/fs/splice.c
index 3318b965c10b..7fb04970c72d 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -141,7 +141,7 @@ static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
 	if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
 		return 1;
 
-	return 0;
+	return generic_pipe_buf_steal(pipe, buf);
 }
 
 static struct pipe_buf_operations user_page_pipe_buf_ops = {
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 070954f05947..ba73108cbf8b 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -69,6 +69,7 @@ void *generic_pipe_buf_map(struct pipe_inode_info *, struct pipe_buffer *, int);
 void generic_pipe_buf_unmap(struct pipe_inode_info *, struct pipe_buffer *, void *);
 void generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *);
 int generic_pipe_buf_pin(struct pipe_inode_info *, struct pipe_buffer *);
+int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *);
 
 /*
  * splice is tied to pipes as a transport (at least for now), so we'll just
-- 
cgit v1.2.3