From a22b169df1b9f259391cf3b8ad8bfeea3d7be3f1 Mon Sep 17 00:00:00 2001
From: Vasily Tarasov <vtaras@openvz.org>
Date: Wed, 11 Oct 2006 09:24:27 +0200
Subject: [PATCH] block layer: elevator_find function cleanup

We can easily search through the elevator list
without introducing an additional elevator_type variable.

Signed-off-by: Vasily Tarasov <vtaras@openvz.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/elevator.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

(limited to 'block')

diff --git a/block/elevator.c b/block/elevator.c
index 487dd3da8853..d8030a84773a 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -93,21 +93,18 @@ static inline int elv_try_merge(struct request *__rq, struct bio *bio)
 
 static struct elevator_type *elevator_find(const char *name)
 {
-	struct elevator_type *e = NULL;
+	struct elevator_type *e;
 	struct list_head *entry;
 
 	list_for_each(entry, &elv_list) {
-		struct elevator_type *__e;
 
-		__e = list_entry(entry, struct elevator_type, list);
+		e = list_entry(entry, struct elevator_type, list);
 
-		if (!strcmp(__e->elevator_name, name)) {
-			e = __e;
-			break;
-		}
+		if (!strcmp(e->elevator_name, name))
+			return e;
 	}
 
-	return e;
+	return NULL;
 }
 
 static void elevator_put(struct elevator_type *e)
-- 
cgit v1.2.3


From c5841642242e9ae817275e09b36b298456dc17d2 Mon Sep 17 00:00:00 2001
From: Vasily Tarasov <vtaras@openvz.org>
Date: Wed, 11 Oct 2006 13:26:30 +0200
Subject: [PATCH] block layer: elv_iosched_show should get elv_list_lock

The elv_iosched_show function iterates over elv_list, hence
elv_list_lock should be taken.

Signed-off-by: Vasily Tarasov <vtaras@openvz.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/elevator.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/elevator.c b/block/elevator.c
index d8030a84773a..8ccd163254b8 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -1085,7 +1085,7 @@ ssize_t elv_iosched_show(request_queue_t *q, char *name)
 	struct list_head *entry;
 	int len = 0;
 
-	spin_lock_irq(q->queue_lock);
+	spin_lock_irq(&elv_list_lock);
 	list_for_each(entry, &elv_list) {
 		struct elevator_type *__e;
 
@@ -1095,7 +1095,7 @@ ssize_t elv_iosched_show(request_queue_t *q, char *name)
 		else
 			len += sprintf(name+len, "%s ", __e->elevator_name);
 	}
-	spin_unlock_irq(q->queue_lock);
+	spin_unlock_irq(&elv_list_lock);
 
 	len += sprintf(len+name, "\n");
 	return len;
-- 
cgit v1.2.3


From 79e2de4bc53d7ca2a8eedee49e4a92479b4b530e Mon Sep 17 00:00:00 2001
From: Thomas Maier <balagi@justmail.de>
Date: Thu, 19 Oct 2006 23:28:15 -0700
Subject: [PATCH] export clear_queue_congested and set_queue_congested

Export the clear_queue_congested() and set_queue_congested() functions
located in ll_rw_blk.c

The functions are renamed to blk_clear_queue_congested() and
blk_set_queue_congested().

(needed in the pktcdvd driver's bio write congestion control)

Signed-off-by: Thomas Maier <balagi@justmail.de>
Cc: Peter Osterlund <petero2@telia.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 block/ll_rw_blk.c      | 20 ++++++++++----------
 include/linux/blkdev.h |  2 ++
 2 files changed, 12 insertions(+), 10 deletions(-)

(limited to 'block')

diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index c847e17e5caa..132a858ce2c5 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -117,7 +117,7 @@ static void blk_queue_congestion_threshold(struct request_queue *q)
  * congested queues, and wake up anyone who was waiting for requests to be
  * put back.
  */
-static void clear_queue_congested(request_queue_t *q, int rw)
+void blk_clear_queue_congested(request_queue_t *q, int rw)
 {
 	enum bdi_state bit;
 	wait_queue_head_t *wqh = &congestion_wqh[rw];
@@ -128,18 +128,20 @@ static void clear_queue_congested(request_queue_t *q, int rw)
 	if (waitqueue_active(wqh))
 		wake_up(wqh);
 }
+EXPORT_SYMBOL(blk_clear_queue_congested);
 
 /*
  * A queue has just entered congestion.  Flag that in the queue's VM-visible
  * state flags and increment the global gounter of congested queues.
  */
-static void set_queue_congested(request_queue_t *q, int rw)
+void blk_set_queue_congested(request_queue_t *q, int rw)
 {
 	enum bdi_state bit;
 
 	bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested;
 	set_bit(bit, &q->backing_dev_info.state);
 }
+EXPORT_SYMBOL(blk_set_queue_congested);
 
 /**
  * blk_get_backing_dev_info - get the address of a queue's backing_dev_info
@@ -159,7 +161,6 @@ struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev)
 		ret = &q->backing_dev_info;
 	return ret;
 }
-
 EXPORT_SYMBOL(blk_get_backing_dev_info);
 
 void blk_queue_activity_fn(request_queue_t *q, activity_fn *fn, void *data)
@@ -167,7 +168,6 @@ void blk_queue_activity_fn(request_queue_t *q, activity_fn *fn, void *data)
 	q->activity_fn = fn;
 	q->activity_data = data;
 }
-
 EXPORT_SYMBOL(blk_queue_activity_fn);
 
 /**
@@ -2067,7 +2067,7 @@ static void __freed_request(request_queue_t *q, int rw)
 	struct request_list *rl = &q->rq;
 
 	if (rl->count[rw] < queue_congestion_off_threshold(q))
-		clear_queue_congested(q, rw);
+		blk_clear_queue_congested(q, rw);
 
 	if (rl->count[rw] + 1 <= q->nr_requests) {
 		if (waitqueue_active(&rl->wait[rw]))
@@ -2137,7 +2137,7 @@ static struct request *get_request(request_queue_t *q, int rw, struct bio *bio,
 				}
 			}
 		}
-		set_queue_congested(q, rw);
+		blk_set_queue_congested(q, rw);
 	}
 
 	/*
@@ -3765,14 +3765,14 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
 	blk_queue_congestion_threshold(q);
 
 	if (rl->count[READ] >= queue_congestion_on_threshold(q))
-		set_queue_congested(q, READ);
+		blk_set_queue_congested(q, READ);
 	else if (rl->count[READ] < queue_congestion_off_threshold(q))
-		clear_queue_congested(q, READ);
+		blk_clear_queue_congested(q, READ);
 
 	if (rl->count[WRITE] >= queue_congestion_on_threshold(q))
-		set_queue_congested(q, WRITE);
+		blk_set_queue_congested(q, WRITE);
 	else if (rl->count[WRITE] < queue_congestion_off_threshold(q))
-		clear_queue_congested(q, WRITE);
+		blk_clear_queue_congested(q, WRITE);
 
 	if (rl->count[READ] >= q->nr_requests) {
 		blk_set_queue_full(q, READ);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d370d2cfe138..9575e3a5ff2a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -651,6 +651,8 @@ extern void blk_recount_segments(request_queue_t *, struct bio *);
 extern int scsi_cmd_ioctl(struct file *, struct gendisk *, unsigned int, void __user *);
 extern int sg_scsi_ioctl(struct file *, struct request_queue *,
 		struct gendisk *, struct scsi_ioctl_command __user *);
+extern void blk_clear_queue_congested(request_queue_t *q, int rw);
+extern void blk_set_queue_congested(request_queue_t *q, int rw);
 extern void blk_start_queue(request_queue_t *q);
 extern void blk_stop_queue(request_queue_t *q);
 extern void blk_sync_queue(struct request_queue *q);
-- 
cgit v1.2.3


From 3fcfab16c5b86eaa3db3a9a31adba550c5b67141 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Thu, 19 Oct 2006 23:28:16 -0700
Subject: [PATCH] separate bdi congestion functions from queue congestion
 functions

Separate out the concept of "queue congestion" from "backing-dev congestion".
Congestion is a backing-dev concept, not a queue concept.

The blk_* congestion functions are retained, as wrappers around the core
backing-dev congestion functions.

This proper layering is needed so that NFS can cleanly use the congestion
functions, and so that CONFIG_BLOCK=n actually links.

Cc: "Thomas Maier" <balagi@justmail.de>
Cc: "Jens Axboe" <jens.axboe@oracle.com>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: David Howells <dhowells@redhat.com>
Cc: Peter Osterlund <petero2@telia.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/lib/usercopy.c    |  3 +-
 block/ll_rw_blk.c           | 71 ---------------------------------------------
 drivers/md/dm-crypt.c       |  3 +-
 fs/fat/file.c               |  3 +-
 fs/nfs/write.c              |  4 ++-
 fs/reiserfs/journal.c       |  3 +-
 fs/xfs/linux-2.6/kmem.c     |  5 ++--
 fs/xfs/linux-2.6/xfs_buf.c  |  3 +-
 include/linux/backing-dev.h |  7 +++++
 include/linux/blkdev.h      | 24 ++++++++++++---
 include/linux/writeback.h   |  1 -
 mm/Makefile                 |  3 +-
 mm/backing-dev.c            | 69 +++++++++++++++++++++++++++++++++++++++++++
 mm/page-writeback.c         | 17 +++--------
 mm/page_alloc.c             |  5 ++--
 mm/shmem.c                  |  3 +-
 mm/vmscan.c                 |  6 ++--
 17 files changed, 126 insertions(+), 104 deletions(-)
 create mode 100644 mm/backing-dev.c

(limited to 'block')

diff --git a/arch/i386/lib/usercopy.c b/arch/i386/lib/usercopy.c
index 258df6b4d7d7..d22cfc9d656c 100644
--- a/arch/i386/lib/usercopy.c
+++ b/arch/i386/lib/usercopy.c
@@ -9,6 +9,7 @@
 #include <linux/highmem.h>
 #include <linux/blkdev.h>
 #include <linux/module.h>
+#include <linux/backing-dev.h>
 #include <asm/uaccess.h>
 #include <asm/mmx.h>
 
@@ -741,7 +742,7 @@ survive:
 
 			if (retval == -ENOMEM && is_init(current)) {
 				up_read(&current->mm->mmap_sem);
-				blk_congestion_wait(WRITE, HZ/50);
+				congestion_wait(WRITE, HZ/50);
 				goto survive;
 			}
 
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index 132a858ce2c5..136066583c68 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -56,11 +56,6 @@ static kmem_cache_t *requestq_cachep;
  */
 static kmem_cache_t *iocontext_cachep;
 
-static wait_queue_head_t congestion_wqh[2] = {
-		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
-		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
-	};
-
 /*
  * Controlling structure to kblockd
  */
@@ -112,37 +107,6 @@ static void blk_queue_congestion_threshold(struct request_queue *q)
 	q->nr_congestion_off = nr;
 }
 
-/*
- * A queue has just exitted congestion.  Note this in the global counter of
- * congested queues, and wake up anyone who was waiting for requests to be
- * put back.
- */
-void blk_clear_queue_congested(request_queue_t *q, int rw)
-{
-	enum bdi_state bit;
-	wait_queue_head_t *wqh = &congestion_wqh[rw];
-
-	bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested;
-	clear_bit(bit, &q->backing_dev_info.state);
-	smp_mb__after_clear_bit();
-	if (waitqueue_active(wqh))
-		wake_up(wqh);
-}
-EXPORT_SYMBOL(blk_clear_queue_congested);
-
-/*
- * A queue has just entered congestion.  Flag that in the queue's VM-visible
- * state flags and increment the global gounter of congested queues.
- */
-void blk_set_queue_congested(request_queue_t *q, int rw)
-{
-	enum bdi_state bit;
-
-	bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested;
-	set_bit(bit, &q->backing_dev_info.state);
-}
-EXPORT_SYMBOL(blk_set_queue_congested);
-
 /**
  * blk_get_backing_dev_info - get the address of a queue's backing_dev_info
  * @bdev:	device
@@ -2755,41 +2719,6 @@ void blk_end_sync_rq(struct request *rq, int error)
 }
 EXPORT_SYMBOL(blk_end_sync_rq);
 
-/**
- * blk_congestion_wait - wait for a queue to become uncongested
- * @rw: READ or WRITE
- * @timeout: timeout in jiffies
- *
- * Waits for up to @timeout jiffies for a queue (any queue) to exit congestion.
- * If no queues are congested then just wait for the next request to be
- * returned.
- */
-long blk_congestion_wait(int rw, long timeout)
-{
-	long ret;
-	DEFINE_WAIT(wait);
-	wait_queue_head_t *wqh = &congestion_wqh[rw];
-
-	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
-	ret = io_schedule_timeout(timeout);
-	finish_wait(wqh, &wait);
-	return ret;
-}
-
-EXPORT_SYMBOL(blk_congestion_wait);
-
-/**
- * blk_congestion_end - wake up sleepers on a congestion queue
- * @rw: READ or WRITE
- */
-void blk_congestion_end(int rw)
-{
-	wait_queue_head_t *wqh = &congestion_wqh[rw];
-
-	if (waitqueue_active(wqh))
-		wake_up(wqh);
-}
-
 /*
  * Has to be called with the request spinlock acquired
  */
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 655d816760e5..a625576fdeeb 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -16,6 +16,7 @@
 #include <linux/slab.h>
 #include <linux/crypto.h>
 #include <linux/workqueue.h>
+#include <linux/backing-dev.h>
 #include <asm/atomic.h>
 #include <linux/scatterlist.h>
 #include <asm/page.h>
@@ -602,7 +603,7 @@ static void process_write(struct crypt_io *io)
 
 		/* out of memory -> run queues */
 		if (remaining)
-			blk_congestion_wait(bio_data_dir(clone), HZ/100);
+			congestion_wait(bio_data_dir(clone), HZ/100);
 	}
 }
 
diff --git a/fs/fat/file.c b/fs/fat/file.c
index f4b8f8b3fbdd..8337451e7897 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -13,6 +13,7 @@
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
+#include <linux/backing-dev.h>
 #include <linux/blkdev.h>
 
 int fat_generic_ioctl(struct inode *inode, struct file *filp,
@@ -118,7 +119,7 @@ static int fat_file_release(struct inode *inode, struct file *filp)
 	if ((filp->f_mode & FMODE_WRITE) &&
 	     MSDOS_SB(inode->i_sb)->options.flush) {
 		fat_flush_inodes(inode->i_sb, inode, NULL);
-		blk_congestion_wait(WRITE, HZ/10);
+		congestion_wait(WRITE, HZ/10);
 	}
 	return 0;
 }
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index f6675d2c386c..ca92ac36fe9d 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -57,6 +57,8 @@
 #include <linux/nfs_fs.h>
 #include <linux/nfs_mount.h>
 #include <linux/nfs_page.h>
+#include <linux/backing-dev.h>
+
 #include <asm/uaccess.h>
 #include <linux/smp_lock.h>
 
@@ -395,7 +397,7 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
 out:
 	clear_bit(BDI_write_congested, &bdi->state);
 	wake_up_all(&nfs_write_congestion);
-	writeback_congestion_end();
+	congestion_end(WRITE);
 	return err;
 }
 
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index ad8cbc49883a..85ce23268302 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -53,6 +53,7 @@
 #include <linux/workqueue.h>
 #include <linux/writeback.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 
 /* gets a struct reiserfs_journal_list * from a list head */
 #define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \
@@ -970,7 +971,7 @@ int reiserfs_async_progress_wait(struct super_block *s)
 	DEFINE_WAIT(wait);
 	struct reiserfs_journal *j = SB_JOURNAL(s);
 	if (atomic_read(&j->j_async_throttle))
-		blk_congestion_wait(WRITE, HZ / 10);
+		congestion_wait(WRITE, HZ / 10);
 	return 0;
 }
 
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index d59737589815..004baf600611 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -21,6 +21,7 @@
 #include <linux/highmem.h>
 #include <linux/swap.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include "time.h"
 #include "kmem.h"
 
@@ -53,7 +54,7 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
 			printk(KERN_ERR "XFS: possible memory allocation "
 					"deadlock in %s (mode:0x%x)\n",
 					__FUNCTION__, lflags);
-		blk_congestion_wait(WRITE, HZ/50);
+		congestion_wait(WRITE, HZ/50);
 	} while (1);
 }
 
@@ -131,7 +132,7 @@ kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags)
 			printk(KERN_ERR "XFS: possible memory allocation "
 					"deadlock in %s (mode:0x%x)\n",
 					__FUNCTION__, lflags);
-		blk_congestion_wait(WRITE, HZ/50);
+		congestion_wait(WRITE, HZ/50);
 	} while (1);
 }
 
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 9bbadafdcb00..db5f5a3608ca 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -30,6 +30,7 @@
 #include <linux/hash.h>
 #include <linux/kthread.h>
 #include <linux/migrate.h>
+#include <linux/backing-dev.h>
 #include "xfs_linux.h"
 
 STATIC kmem_zone_t *xfs_buf_zone;
@@ -395,7 +396,7 @@ _xfs_buf_lookup_pages(
 
 			XFS_STATS_INC(xb_page_retries);
 			xfsbufd_wakeup(0, gfp_mask);
-			blk_congestion_wait(WRITE, HZ/50);
+			congestion_wait(WRITE, HZ/50);
 			goto retry;
 		}
 
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index f7a1390d67f5..7011d6255593 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -10,6 +10,8 @@
 
 #include <asm/atomic.h>
 
+struct page;
+
 /*
  * Bits in backing_dev_info.state
  */
@@ -88,6 +90,11 @@ static inline int bdi_rw_congested(struct backing_dev_info *bdi)
 				  (1 << BDI_write_congested));
 }
 
+void clear_bdi_congested(struct backing_dev_info *bdi, int rw);
+void set_bdi_congested(struct backing_dev_info *bdi, int rw);
+long congestion_wait(int rw, long timeout);
+void congestion_end(int rw);
+
 #define bdi_cap_writeback_dirty(bdi) \
 	(!((bdi)->capabilities & BDI_CAP_NO_WRITEBACK))
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 9575e3a5ff2a..7bfcde2d5578 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -651,8 +651,26 @@ extern void blk_recount_segments(request_queue_t *, struct bio *);
 extern int scsi_cmd_ioctl(struct file *, struct gendisk *, unsigned int, void __user *);
 extern int sg_scsi_ioctl(struct file *, struct request_queue *,
 		struct gendisk *, struct scsi_ioctl_command __user *);
-extern void blk_clear_queue_congested(request_queue_t *q, int rw);
-extern void blk_set_queue_congested(request_queue_t *q, int rw);
+
+/*
+ * A queue has just exitted congestion.  Note this in the global counter of
+ * congested queues, and wake up anyone who was waiting for requests to be
+ * put back.
+ */
+static inline void blk_clear_queue_congested(request_queue_t *q, int rw)
+{
+	clear_bdi_congested(&q->backing_dev_info, rw);
+}
+
+/*
+ * A queue has just entered congestion.  Flag that in the queue's VM-visible
+ * state flags and increment the global gounter of congested queues.
+ */
+static inline void blk_set_queue_congested(request_queue_t *q, int rw)
+{
+	set_bdi_congested(&q->backing_dev_info, rw);
+}
+
 extern void blk_start_queue(request_queue_t *q);
 extern void blk_stop_queue(request_queue_t *q);
 extern void blk_sync_queue(struct request_queue *q);
@@ -767,10 +785,8 @@ extern int blk_queue_init_tags(request_queue_t *, int, struct blk_queue_tag *);
 extern void blk_queue_free_tags(request_queue_t *);
 extern int blk_queue_resize_tags(request_queue_t *, int);
 extern void blk_queue_invalidate_tags(request_queue_t *);
-extern long blk_congestion_wait(int rw, long timeout);
 extern struct blk_queue_tag *blk_init_tags(int);
 extern void blk_free_tags(struct blk_queue_tag *);
-extern void blk_congestion_end(int rw);
 
 static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
 						int tag)
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index a341c8032866..fc35e6bdfb93 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -85,7 +85,6 @@ int wakeup_pdflush(long nr_pages);
 void laptop_io_completion(void);
 void laptop_sync_completion(void);
 void throttle_vm_writeout(void);
-void writeback_congestion_end(void);
 
 /* These are exported to sysctl. */
 extern int dirty_background_ratio;
diff --git a/mm/Makefile b/mm/Makefile
index 12b3a4eee88d..f3c077eb0b8e 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -10,7 +10,8 @@ mmu-$(CONFIG_MMU)	:= fremap.o highmem.o madvise.o memory.o mincore.o \
 obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 			   page_alloc.o page-writeback.o pdflush.o \
 			   readahead.o swap.o truncate.o vmscan.o \
-			   prio_tree.o util.o mmzone.o vmstat.o $(mmu-y)
+			   prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
+			   $(mmu-y)
 
 ifeq ($(CONFIG_MMU)$(CONFIG_BLOCK),yy)
 obj-y			+= bounce.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
new file mode 100644
index 000000000000..f50a2811f9dc
--- /dev/null
+++ b/mm/backing-dev.c
@@ -0,0 +1,69 @@
+
+#include <linux/wait.h>
+#include <linux/backing-dev.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+
+static wait_queue_head_t congestion_wqh[2] = {
+		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
+		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
+	};
+
+
+void clear_bdi_congested(struct backing_dev_info *bdi, int rw)
+{
+	enum bdi_state bit;
+	wait_queue_head_t *wqh = &congestion_wqh[rw];
+
+	bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested;
+	clear_bit(bit, &bdi->state);
+	smp_mb__after_clear_bit();
+	if (waitqueue_active(wqh))
+		wake_up(wqh);
+}
+EXPORT_SYMBOL(clear_bdi_congested);
+
+void set_bdi_congested(struct backing_dev_info *bdi, int rw)
+{
+	enum bdi_state bit;
+
+	bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested;
+	set_bit(bit, &bdi->state);
+}
+EXPORT_SYMBOL(set_bdi_congested);
+
+/**
+ * congestion_wait - wait for a backing_dev to become uncongested
+ * @rw: READ or WRITE
+ * @timeout: timeout in jiffies
+ *
+ * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit
+ * write congestion.  If no backing_devs are congested then just wait for the
+ * next write to be completed.
+ */
+long congestion_wait(int rw, long timeout)
+{
+	long ret;
+	DEFINE_WAIT(wait);
+	wait_queue_head_t *wqh = &congestion_wqh[rw];
+
+	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+	ret = io_schedule_timeout(timeout);
+	finish_wait(wqh, &wait);
+	return ret;
+}
+EXPORT_SYMBOL(congestion_wait);
+
+/**
+ * congestion_end - wake up sleepers on a congested backing_dev_info
+ * @rw: READ or WRITE
+ */
+void congestion_end(int rw)
+{
+	wait_queue_head_t *wqh = &congestion_wqh[rw];
+
+	if (waitqueue_active(wqh))
+		wake_up(wqh);
+}
+EXPORT_SYMBOL(congestion_end);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index a0f339057449..8d9b19f239c3 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -222,7 +222,7 @@ static void balance_dirty_pages(struct address_space *mapping)
 			if (pages_written >= write_chunk)
 				break;		/* We've done our duty */
 		}
-		blk_congestion_wait(WRITE, HZ/10);
+		congestion_wait(WRITE, HZ/10);
 	}
 
 	if (nr_reclaimable + global_page_state(NR_WRITEBACK)
@@ -314,7 +314,7 @@ void throttle_vm_writeout(void)
                 if (global_page_state(NR_UNSTABLE_NFS) +
 			global_page_state(NR_WRITEBACK) <= dirty_thresh)
                         	break;
-                blk_congestion_wait(WRITE, HZ/10);
+                congestion_wait(WRITE, HZ/10);
         }
 }
 
@@ -351,7 +351,7 @@ static void background_writeout(unsigned long _min_pages)
 		min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 		if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
 			/* Wrote less than expected */
-			blk_congestion_wait(WRITE, HZ/10);
+			congestion_wait(WRITE, HZ/10);
 			if (!wbc.encountered_congestion)
 				break;
 		}
@@ -422,7 +422,7 @@ static void wb_kupdate(unsigned long arg)
 		writeback_inodes(&wbc);
 		if (wbc.nr_to_write > 0) {
 			if (wbc.encountered_congestion)
-				blk_congestion_wait(WRITE, HZ/10);
+				congestion_wait(WRITE, HZ/10);
 			else
 				break;	/* All the old data is written */
 		}
@@ -955,15 +955,6 @@ int test_set_page_writeback(struct page *page)
 }
 EXPORT_SYMBOL(test_set_page_writeback);
 
-/*
- * Wakes up tasks that are being throttled due to writeback congestion
- */
-void writeback_congestion_end(void)
-{
-	blk_congestion_end(WRITE);
-}
-EXPORT_SYMBOL(writeback_congestion_end);
-
 /*
  * Return true if any of the pages in the mapping are marged with the
  * passed tag.
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 40db96a655d0..afee38f04d84 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -39,6 +39,7 @@
 #include <linux/stop_machine.h>
 #include <linux/sort.h>
 #include <linux/pfn.h>
+#include <linux/backing-dev.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -1050,7 +1051,7 @@ nofail_alloc:
 			if (page)
 				goto got_pg;
 			if (gfp_mask & __GFP_NOFAIL) {
-				blk_congestion_wait(WRITE, HZ/50);
+				congestion_wait(WRITE, HZ/50);
 				goto nofail_alloc;
 			}
 		}
@@ -1113,7 +1114,7 @@ rebalance:
 			do_retry = 1;
 	}
 	if (do_retry) {
-		blk_congestion_wait(WRITE, HZ/50);
+		congestion_wait(WRITE, HZ/50);
 		goto rebalance;
 	}
 
diff --git a/mm/shmem.c b/mm/shmem.c
index b378f66cf2f9..4959535fc14c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -48,6 +48,7 @@
 #include <linux/ctype.h>
 #include <linux/migrate.h>
 #include <linux/highmem.h>
+#include <linux/backing-dev.h>
 
 #include <asm/uaccess.h>
 #include <asm/div64.h>
@@ -1131,7 +1132,7 @@ repeat:
 			page_cache_release(swappage);
 			if (error == -ENOMEM) {
 				/* let kswapd refresh zone for GFP_ATOMICs */
-				blk_congestion_wait(WRITE, HZ/50);
+				congestion_wait(WRITE, HZ/50);
 			}
 			goto repeat;
 		}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index af73c14f9d88..f05527bf792b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1059,7 +1059,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 
 		/* Take a nap, wait for some writeback to complete */
 		if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
-			blk_congestion_wait(WRITE, HZ/10);
+			congestion_wait(WRITE, HZ/10);
 	}
 	/* top priority shrink_caches still had more to do? don't OOM, then */
 	if (!sc.all_unreclaimable)
@@ -1214,7 +1214,7 @@ scan:
 		 * another pass across the zones.
 		 */
 		if (total_scanned && priority < DEF_PRIORITY - 2)
-			blk_congestion_wait(WRITE, HZ/10);
+			congestion_wait(WRITE, HZ/10);
 
 		/*
 		 * We do this so kswapd doesn't build up large priorities for
@@ -1458,7 +1458,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
 				goto out;
 
 			if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
-				blk_congestion_wait(WRITE, HZ / 10);
+				congestion_wait(WRITE, HZ / 10);
 		}
 
 		lru_pages = 0;
-- 
cgit v1.2.3


From 0261d6886eb5822867a5310dc1e4479b940a1942 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Mon, 30 Oct 2006 19:07:48 +0100
Subject: [PATCH] CFQ: use irq safe locking in cfq_cic_link()

If cfq_set_request() is called for a new process AND a non-fs io
request (so that __GFP_WAIT may not be set), cfq_cic_link() may
use spin_lock_irq() and spin_unlock_irq() with interrupts already
disabled.

Fix is to always use irq safe locking in cfq_cic_link()

Acked-By: Arjan van de Ven <arjan@linux.intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 block/cfq-iosched.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index d3d76136f53a..5c3da894a56c 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1362,6 +1362,7 @@ cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
 	struct rb_node **p;
 	struct rb_node *parent;
 	struct cfq_io_context *__cic;
+	unsigned long flags;
 	void *k;
 
 	cic->ioc = ioc;
@@ -1391,9 +1392,9 @@ restart:
 	rb_link_node(&cic->rb_node, parent, p);
 	rb_insert_color(&cic->rb_node, &ioc->cic_root);
 
-	spin_lock_irq(cfqd->queue->queue_lock);
+	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
 	list_add(&cic->queue_list, &cfqd->cic_list);
-	spin_unlock_irq(cfqd->queue->queue_lock);
+	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
 }
 
 /*
-- 
cgit v1.2.3


From c1b707d253fe918b92882cff1dbd926b47e14fd2 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Mon, 30 Oct 2006 19:54:23 +0100
Subject: [PATCH] CFQ: bad locking in changed_ioprio()

When the ioprio code recently got juggled a bit, a bug was introduced.
changed_ioprio() is no longer called with interrupts disabled, so using
plain spin_lock() on the queue_lock is a bug.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 block/cfq-iosched.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 5c3da894a56c..25c4e7ed0d00 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1215,11 +1215,12 @@ static inline void changed_ioprio(struct cfq_io_context *cic)
 {
 	struct cfq_data *cfqd = cic->key;
 	struct cfq_queue *cfqq;
+	unsigned long flags;
 
 	if (unlikely(!cfqd))
 		return;
 
-	spin_lock(cfqd->queue->queue_lock);
+	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
 
 	cfqq = cic->cfqq[ASYNC];
 	if (cfqq) {
@@ -1236,7 +1237,7 @@ static inline void changed_ioprio(struct cfq_io_context *cic)
 	if (cfqq)
 		cfq_mark_cfqq_prio_changed(cfqq);
 
-	spin_unlock(cfqd->queue->queue_lock);
+	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
 }
 
 static void cfq_ioc_set_ioprio(struct io_context *ioc)
-- 
cgit v1.2.3


From 5ddfe9691c91a244e8d1be597b6428fcefd58103 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 30 Oct 2006 22:07:21 -0800
Subject: [PATCH] md: check bio address after mapping through partitions.

Partitions are not limited to live within a device.  So we should range
check after partition mapping.

Note that 'maxsector' was being used for two different things.  I have
split off the second usage into 'old_sector' so that maxsector can still
be used for its primary usage later in the function.

Cc: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Neil Brown <neilb@suse.de>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 block/ll_rw_blk.c | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

(limited to 'block')

diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index 136066583c68..c7b1dac8bee9 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -2999,6 +2999,7 @@ void generic_make_request(struct bio *bio)
 {
 	request_queue_t *q;
 	sector_t maxsector;
+	sector_t old_sector;
 	int ret, nr_sectors = bio_sectors(bio);
 	dev_t old_dev;
 
@@ -3027,7 +3028,7 @@ void generic_make_request(struct bio *bio)
 	 * NOTE: we don't repeat the blk_size check for each new device.
 	 * Stacking drivers are expected to know what they are doing.
 	 */
-	maxsector = -1;
+	old_sector = -1;
 	old_dev = 0;
 	do {
 		char b[BDEVNAME_SIZE];
@@ -3061,15 +3062,30 @@ end_io:
 		 */
 		blk_partition_remap(bio);
 
-		if (maxsector != -1)
+		if (old_sector != -1)
 			blk_add_trace_remap(q, bio, old_dev, bio->bi_sector, 
-					    maxsector);
+					    old_sector);
 
 		blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
 
-		maxsector = bio->bi_sector;
+		old_sector = bio->bi_sector;
 		old_dev = bio->bi_bdev->bd_dev;
 
+		maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
+		if (maxsector) {
+			sector_t sector = bio->bi_sector;
+
+			if (maxsector < nr_sectors || maxsector - nr_sectors < sector) {
+				/*
+				 * This may well happen - partitions are not checked
+				 * to make sure they are within the size of the
+				 * whole device.
+				 */
+				handle_bad_sector(bio);
+				goto end_io;
+			}
+		}
+
 		ret = q->make_request_fn(q, bio);
 	} while (ret);
 }
-- 
cgit v1.2.3


From 5fccbf61be2a7f32d2002b04afca4c5009612a58 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 31 Oct 2006 14:21:55 +0100
Subject: [PATCH] CFQ: request <-> request merging rr_list fixup

In very rare circumstances we would prune a merged request and at
the same time delete the implicated cfqq from the rr_list, and not re-add
it when the merged request got added. This could cause io stalls until
that process issued io again.

Fix it up by putting the rr_list add handling into cfq_add_rq_rb(),
identical to how pruning is handled in cfq_del_rq_rb(). This fixes a
hang reproducible with fsx-linux.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 block/cfq-iosched.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 25c4e7ed0d00..1d9c3c70a9a0 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -456,6 +456,9 @@ static void cfq_add_rq_rb(struct request *rq)
 	 */
 	while ((__alias = elv_rb_add(&cfqq->sort_list, rq)) != NULL)
 		cfq_dispatch_insert(cfqd->queue, __alias);
+
+	if (!cfq_cfqq_on_rr(cfqq))
+		cfq_add_cfqq_rr(cfqd, cfqq);
 }
 
 static inline void
@@ -1652,9 +1655,6 @@ static void cfq_insert_request(request_queue_t *q, struct request *rq)
 
 	cfq_add_rq_rb(rq);
 
-	if (!cfq_cfqq_on_rr(cfqq))
-		cfq_add_cfqq_rr(cfqd, cfqq);
-
 	list_add_tail(&rq->queuelist, &cfqq->fifo);
 
 	cfq_rq_enqueued(cfqd, cfqq, rq);
-- 
cgit v1.2.3


From df66b8552be5fdab5c4b4d53ee08b99388b9bd02 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Thu, 2 Nov 2006 22:06:56 -0800
Subject: [PATCH] tidy "md: check bio address after mapping through partitions"

Neil's xterms are too wide.

Cc: Neil Brown <neilb@cse.unsw.edu.au>
Cc: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 block/ll_rw_blk.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'block')

diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index c7b1dac8bee9..9eaee6640535 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -3075,11 +3075,12 @@ end_io:
 		if (maxsector) {
 			sector_t sector = bio->bi_sector;
 
-			if (maxsector < nr_sectors || maxsector - nr_sectors < sector) {
+			if (maxsector < nr_sectors ||
+					maxsector - nr_sectors < sector) {
 				/*
-				 * This may well happen - partitions are not checked
-				 * to make sure they are within the size of the
-				 * whole device.
+				 * This may well happen - partitions are not
+				 * checked to make sure they are within the size
+				 * of the whole device.
 				 */
 				handle_bad_sector(bio);
 				goto end_io;
-- 
cgit v1.2.3


From 616e8a091a035c0bd9b871695f4af191df123caa Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Mon, 13 Nov 2006 18:04:59 +0100
Subject: [PATCH] Fix bad data direction in SG_IO

Contrary to what the name would lead you to believe, SG_DXFER_TO_FROM_DEV
is really just a normal read seen from the device side.

This patch fixes http://lkml.org/lkml/2006/10/13/100

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 block/scsi_ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 2dc326421a24..a19338e6215d 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -246,10 +246,10 @@ static int sg_io(struct file *file, request_queue_t *q,
 		switch (hdr->dxfer_direction) {
 		default:
 			return -EINVAL;
-		case SG_DXFER_TO_FROM_DEV:
 		case SG_DXFER_TO_DEV:
 			writing = 1;
 			break;
+		case SG_DXFER_TO_FROM_DEV:
 		case SG_DXFER_FROM_DEV:
 			break;
 		}
-- 
cgit v1.2.3


From 097b8457dafe7efc22201b4062e2d1e82e494067 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Thu, 16 Nov 2006 01:19:31 -0800
Subject: [PATCH] scsi: clear garbage after CDBs on SG_IO

ATAPI devices transfer a fixed number of bytes for CDBs (12 or 16).  Some
ATAPI devices choke when a shorter CDB is used and the remaining bytes
contain garbage.  Block SG_IO cleared the remaining bytes but SCSI SG_IO
didn't.  This patch makes SCSI SG_IO clear them and simplifies CDB clearing
in block SG_IO.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Cc: Mathieu Fluhr <mfluhr@nero.com>
Cc: James Bottomley <James.Bottomley@steeleye.com>
Cc: Douglas Gilbert <dougg@torque.net>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Cc: <stable@kernel.org>
Acked-by: Jeff Garzik <jgarzik@pobox.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 block/scsi_ioctl.c      | 3 +--
 drivers/scsi/scsi_lib.c | 1 +
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index a19338e6215d..e55a75621437 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -286,9 +286,8 @@ static int sg_io(struct file *file, request_queue_t *q,
 	 * fill in request structure
 	 */
 	rq->cmd_len = hdr->cmd_len;
+	memset(rq->cmd, 0, BLK_MAX_CDB); /* ATAPI hates garbage after CDB */
 	memcpy(rq->cmd, cmd, hdr->cmd_len);
-	if (sizeof(rq->cmd) != hdr->cmd_len)
-		memset(rq->cmd + hdr->cmd_len, 0, sizeof(rq->cmd) - hdr->cmd_len);
 
 	memset(sense, 0, sizeof(sense));
 	rq->sense = sense;
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index d2c02df12fdc..3ac4890ce086 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -410,6 +410,7 @@ int scsi_execute_async(struct scsi_device *sdev, const unsigned char *cmd,
 		goto free_req;
 
 	req->cmd_len = cmd_len;
+	memset(req->cmd, 0, BLK_MAX_CDB); /* ATAPI hates garbage after CDB */
 	memcpy(req->cmd, cmd, req->cmd_len);
 	req->sense = sioc->sense;
 	req->sense_len = 0;
-- 
cgit v1.2.3