From 5a0e3ad6af8660be21ca98a971cd00f331318c05 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 24 Mar 2010 17:04:11 +0900
Subject: include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h

percpu.h is included by sched.h and module.h and thus ends up being included when building most .c files. percpu.h includes slab.h, which in turn includes gfp.h, making everything defined by the two files universally available and complicating inclusion dependencies.

The percpu.h -> slab.h dependency is about to be removed. Prepare for this change by updating users of gfp and slab facilities to include those headers directly instead of assuming availability. As this conversion needs to touch a large number of source files, the following script was used as the basis of the conversion.

http://userweb.kernel.org/~tj/misc/slabh-sweep.py

The script does the following:

* Scan files for gfp and slab usages and update includes such that only the necessary includes are there, i.e. gfp.h if only gfp is used, slab.h if slab is used.

* When the script inserts a new include, it looks at the include blocks and tries to place the new include so that its order conforms to its surroundings. It is put in the include block that contains core kernel includes, in the same order that the rest are ordered - alphabetical, Christmas tree, rev-Xmas-tree, or at the end if there doesn't seem to be any matching order.

* If the script can't find a place to put a new include (mostly because the file doesn't have a fitting include block), it prints out an error message indicating which .h file needs to be added to the file.

The conversion was done in the following steps.

1. The initial automatic conversion of all .c files updated slightly over 4000 files, deleting around 700 includes and adding ~480 gfp.h and ~3000 slab.h inclusions. The script emitted errors for ~400 files.

2. Each error was manually checked. Some didn't need the inclusion, some needed manual addition, and for others adding it to an implementation .h or embedding .c file was more appropriate. This step added inclusions to around 150 files.

3. The script was run again and the output was compared to the edits from #2 to make sure no file was left behind.

4. Several build tests were done and a couple of problems were fixed, e.g. lib/decompress_*.c used malloc/free() wrappers around slab APIs, requiring slab.h to be added manually.

5. The script was run on all .h files, but without automatically editing them, as sprinkling gfp.h and slab.h inclusions around .h files could easily lead to inclusion dependency hell. Most gfp.h inclusion directives were ignored, as stuff from gfp.h was usually widely available and often used in preprocessor macros. Each slab.h inclusion directive was examined and added manually as necessary.

6. percpu.h was updated not to include slab.h.

7. Build tests were done on the following configurations and failures were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my distributed build env didn't work with gcov compiles) and a few more options had to be turned off depending on the arch to make things build (like ipr on powerpc/64, which failed due to a missing writeq).

* x86 and x86_64 UP and SMP allmodconfig and a custom test config.
* powerpc and powerpc64 SMP allmodconfig
* sparc and sparc64 SMP allmodconfig
* ia64 SMP allmodconfig
* s390 SMP allmodconfig
* alpha SMP allmodconfig
* um on x86_64 SMP allmodconfig

8.
percpu.h modifications were reverted so that it could be applied as a separate patch and serve as a bisection point.

Given that I had only a couple of failures from the build tests in step 7, I'm fairly confident about the coverage of this conversion patch. If there is a breakage, it's likely to be something in one of the arch headers, which should be easily discoverable on most builds of the specific arch.

Signed-off-by: Tejun Heo
Guess-its-ok-by: Christoph Lameter
Cc: Ingo Molnar
Cc: Lee Schermerhorn

--- drivers/dma/at_hdmac.c | 1 + drivers/dma/coh901318_lli.c | 1 + drivers/dma/dmaengine.c | 1 + drivers/dma/dmatest.c | 1 + drivers/dma/fsldma.c | 1 + drivers/dma/ioat/dma.c | 1 + drivers/dma/ioat/dma_v2.c | 1 + drivers/dma/ioat/dma_v3.c | 1 + drivers/dma/ioat/pci.c | 1 + drivers/dma/iop-adma.c | 1 + drivers/dma/iovlock.c | 1 + drivers/dma/mpc512x_dma.c | 1 + drivers/dma/mv_xor.c | 1 + drivers/dma/ppc4xx/adma.c | 1 + drivers/dma/shdma.c | 1 + 15 files changed, 15 insertions(+) (limited to 'drivers/dma')
diff --git a/drivers/dma/at_hdmac.c b/drivers/dma/at_hdmac.c index efc1a61ca231..278cf5bceef2 100644 --- a/drivers/dma/at_hdmac.c +++ b/drivers/dma/at_hdmac.c @@ -22,6 +22,7 @@ #include #include #include +#include <linux/slab.h> #include "at_hdmac_regs.h"
diff --git a/drivers/dma/coh901318_lli.c b/drivers/dma/coh901318_lli.c index 71d58c1a1e86..9f7e0e6a7eea 100644 --- a/drivers/dma/coh901318_lli.c +++ b/drivers/dma/coh901318_lli.c @@ -11,6 +11,7 @@ #include #include #include +#include <linux/slab.h> #include #include "coh901318_lli.h"
diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 87399cafce37..d18b5d069d7e 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -58,6 +58,7 @@ #include #include #include +#include <linux/slab.h> static DEFINE_MUTEX(dma_list_mutex); static LIST_HEAD(dma_device_list);
diff --git a/drivers/dma/dmatest.c b/drivers/dma/dmatest.c index 6fa55fe3dd24..68d58c414cf0 100644 --- a/drivers/dma/dmatest.c +++ b/drivers/dma/dmatest.c @@ -14,6 +14,7 @@ #include #include #include +#include <linux/slab.h> #include static unsigned int test_buf_size = 16384;
diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c index bbb4be5a3ff4..88f470f0d820 100644 --- a/drivers/dma/fsldma.c +++ b/drivers/dma/fsldma.c @@ -27,6 +27,7 @@ #include #include #include +#include <linux/slab.h> #include #include #include
diff --git a/drivers/dma/ioat/dma.c b/drivers/dma/ioat/dma.c index 0099340b9616..3e5a8005c62b 100644 --- a/drivers/dma/ioat/dma.c +++ b/drivers/dma/ioat/dma.c @@ -27,6 +27,7 @@ #include #include +#include <linux/slab.h> #include #include #include
diff --git a/drivers/dma/ioat/dma_v2.c b/drivers/dma/ioat/dma_v2.c index 1ed5d66d7dca..b5ae56c211e6 100644 --- a/drivers/dma/ioat/dma_v2.c +++ b/drivers/dma/ioat/dma_v2.c @@ -27,6 +27,7 @@ #include #include +#include <linux/slab.h> #include #include #include
diff --git a/drivers/dma/ioat/dma_v3.c b/drivers/dma/ioat/dma_v3.c index 26febc56dab1..6740e319c9cf 100644 --- a/drivers/dma/ioat/dma_v3.c +++ b/drivers/dma/ioat/dma_v3.c @@ -57,6 +57,7 @@ */ #include +#include <linux/gfp.h> #include #include "registers.h"
diff --git a/drivers/dma/ioat/pci.c b/drivers/dma/ioat/pci.c index d545fae30f37..99ec26725bae 100644 --- a/drivers/dma/ioat/pci.c +++ b/drivers/dma/ioat/pci.c @@ -30,6 +30,7 @@ #include #include #include +#include <linux/slab.h> #include "dma.h" #include "dma_v2.h" #include "registers.h"
diff --git a/drivers/dma/iop-adma.c b/drivers/dma/iop-adma.c index ca6e6a0cb793..1ebc801678b0 100644 --- a/drivers/dma/iop-adma.c +++ b/drivers/dma/iop-adma.c @@ -32,6 +32,7 @@ #include #include #include +#include <linux/slab.h> #include
diff --git
a/drivers/dma/iovlock.c b/drivers/dma/iovlock.c index c0a272c73682..bb48a57c2fc1 100644 --- a/drivers/dma/iovlock.c +++ b/drivers/dma/iovlock.c @@ -27,6 +27,7 @@ #include #include +#include <linux/slab.h> #include /* for memcpy_toiovec */ #include #include
diff --git a/drivers/dma/mpc512x_dma.c b/drivers/dma/mpc512x_dma.c index 3fdf1f46bd63..bbbd58566625 100644 --- a/drivers/dma/mpc512x_dma.c +++ b/drivers/dma/mpc512x_dma.c @@ -37,6 +37,7 @@ #include #include #include +#include <linux/slab.h> #include #include
diff --git a/drivers/dma/mv_xor.c b/drivers/dma/mv_xor.c index 466ab10c1ff1..e2fd34da64f2 100644 --- a/drivers/dma/mv_xor.c +++ b/drivers/dma/mv_xor.c @@ -18,6 +18,7 @@ #include #include +#include <linux/slab.h> #include #include #include
diff --git a/drivers/dma/ppc4xx/adma.c b/drivers/dma/ppc4xx/adma.c index e69d87f24a25..d44626fa35ad 100644 --- a/drivers/dma/ppc4xx/adma.c +++ b/drivers/dma/ppc4xx/adma.c @@ -38,6 +38,7 @@ #include #include #include +#include <linux/slab.h> #include #include #include
diff --git a/drivers/dma/shdma.c b/drivers/dma/shdma.c index 5d17e09cb625..7cc31b3f40d8 100644 --- a/drivers/dma/shdma.c +++ b/drivers/dma/shdma.c @@ -19,6 +19,7 @@ #include #include +#include <linux/slab.h> #include #include #include
-- cgit v1.2.3

From abb12dfd50c7580d7dcbd581cf6265ba4d01ea7e Mon Sep 17 00:00:00 2001
From: Dan Williams
Date: Sat, 1 May 2010 15:22:54 -0700
Subject: ioat: convert to circ_buf

Use the common power-of-2 circular buffer macros.

Signed-off-by: Dan Williams

--- drivers/dma/ioat/dma_v2.c | 2 +- drivers/dma/ioat/dma_v2.h | 18 +++++++----------- 2 files changed, 8 insertions(+), 12 deletions(-) (limited to 'drivers/dma')
diff --git a/drivers/dma/ioat/dma_v2.c b/drivers/dma/ioat/dma_v2.c index b5ae56c211e6..b6699a350989 100644 --- a/drivers/dma/ioat/dma_v2.c +++ b/drivers/dma/ioat/dma_v2.c @@ -553,7 +553,7 @@ bool reshape_ring(struct ioat2_dma_chan *ioat, int order) */ struct ioat_chan_common *chan = &ioat->base; struct dma_chan *c = &chan->common; - const u16 curr_size = ioat2_ring_mask(ioat) + 1; + const u16 curr_size = ioat2_ring_size(ioat); const u16 active = ioat2_ring_active(ioat); const u16 new_size = 1 << order; struct ioat_ring_ent **ring;
diff --git a/drivers/dma/ioat/dma_v2.h b/drivers/dma/ioat/dma_v2.h index ef2871fd7868..d7b64f188f78 100644 --- a/drivers/dma/ioat/dma_v2.h +++ b/drivers/dma/ioat/dma_v2.h @@ -22,6 +22,7 @@ #define IOATDMA_V2_H #include +#include <linux/circ_buf.h> #include "dma.h" #include "hw.h" @@ -71,31 +72,26 @@ static inline struct ioat2_dma_chan *to_ioat2_chan(struct dma_chan *c) return container_of(chan, struct ioat2_dma_chan, base); } -static inline u16 ioat2_ring_mask(struct ioat2_dma_chan *ioat) +static inline u16 ioat2_ring_size(struct ioat2_dma_chan *ioat) { - return (1 << ioat->alloc_order) - 1; + return 1 << ioat->alloc_order; } /* count of descriptors in flight with the engine */ static inline u16 ioat2_ring_active(struct ioat2_dma_chan *ioat) { - return (ioat->head - ioat->tail) & ioat2_ring_mask(ioat); + return CIRC_CNT(ioat->head, ioat->tail, ioat2_ring_size(ioat)); } /* count of descriptors pending submission to hardware */ static inline u16 ioat2_ring_pending(struct ioat2_dma_chan *ioat) { - return (ioat->head - ioat->issued) & ioat2_ring_mask(ioat); + return CIRC_CNT(ioat->head, ioat->issued, ioat2_ring_size(ioat)); } static inline u16 ioat2_ring_space(struct ioat2_dma_chan *ioat) { - u16 num_descs = ioat2_ring_mask(ioat) + 1; - u16 active = ioat2_ring_active(ioat); - - BUG_ON(active > num_descs); - - return num_descs - active; + return ioat2_ring_size(ioat) - ioat2_ring_active(ioat); } /* assumes caller
already checked space */ @@ -151,7 +147,7 @@ struct ioat_ring_ent { static inline struct ioat_ring_ent * ioat2_get_ring_ent(struct ioat2_dma_chan *ioat, u16 idx) { - return ioat->ring[idx & ioat2_ring_mask(ioat)]; + return ioat->ring[idx & (ioat2_ring_size(ioat) - 1)]; } static inline void ioat2_set_chainaddr(struct ioat2_dma_chan *ioat, u64 addr) -- cgit v1.2.3 From 074cc47679f8b0931d7d5384e95822d82768f149 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 1 May 2010 15:22:55 -0700 Subject: ioat2,3: convert to producer/consumer locking Use separate locks for the descriptor prep (producer) and descriptor cleanup (consumer) paths. Allows the producer path to run concurrently with the cleanup path. Inspired by Documentation/circular-buffer.txt. Cc: David Howells Cc: Paul E. McKenney Cc: Maciej Sosnowski Signed-off-by: Dan Williams --- drivers/dma/ioat/dma.h | 1 + drivers/dma/ioat/dma_v2.c | 182 +++++++++++++++++++++++----------------------- drivers/dma/ioat/dma_v2.h | 15 ++-- drivers/dma/ioat/dma_v3.c | 117 +++++++++-------------------- 4 files changed, 134 insertions(+), 181 deletions(-) (limited to 'drivers/dma') diff --git a/drivers/dma/ioat/dma.h b/drivers/dma/ioat/dma.h index 86b97ac8774e..0c76578e9911 100644 --- a/drivers/dma/ioat/dma.h +++ b/drivers/dma/ioat/dma.h @@ -96,6 +96,7 @@ struct ioat_chan_common { #define IOAT_COMPLETION_ACK 1 #define IOAT_RESET_PENDING 2 #define IOAT_KOBJ_INIT_FAIL 3 + #define IOAT_RESHAPE_PENDING 4 struct timer_list timer; #define COMPLETION_TIMEOUT msecs_to_jiffies(100) #define IDLE_TIMEOUT msecs_to_jiffies(2000) diff --git a/drivers/dma/ioat/dma_v2.c b/drivers/dma/ioat/dma_v2.c index b6699a350989..e75d0299bb82 100644 --- a/drivers/dma/ioat/dma_v2.c +++ b/drivers/dma/ioat/dma_v2.c @@ -56,8 +56,6 @@ void __ioat2_issue_pending(struct ioat2_dma_chan *ioat) ioat->dmacount += ioat2_ring_pending(ioat); ioat->issued = ioat->head; - /* make descriptor updates globally visible before notifying channel */ - wmb(); writew(ioat->dmacount, chan->reg_base + IOAT_CHAN_DMACOUNT_OFFSET); dev_dbg(to_dev(chan), "%s: head: %#x tail: %#x issued: %#x count: %#x\n", @@ -69,9 +67,9 @@ void ioat2_issue_pending(struct dma_chan *c) struct ioat2_dma_chan *ioat = to_ioat2_chan(c); if (ioat2_ring_pending(ioat)) { - spin_lock_bh(&ioat->ring_lock); + spin_lock_bh(&ioat->prep_lock); __ioat2_issue_pending(ioat); - spin_unlock_bh(&ioat->ring_lock); + spin_unlock_bh(&ioat->prep_lock); } } @@ -80,7 +78,7 @@ void ioat2_issue_pending(struct dma_chan *c) * @ioat: ioat2+ channel * * Check if the number of unsubmitted descriptors has exceeded the - * watermark. Called with ring_lock held + * watermark. 
Called with prep_lock held */ static void ioat2_update_pending(struct ioat2_dma_chan *ioat) { @@ -92,7 +90,6 @@ static void __ioat2_start_null_desc(struct ioat2_dma_chan *ioat) { struct ioat_ring_ent *desc; struct ioat_dma_descriptor *hw; - int idx; if (ioat2_ring_space(ioat) < 1) { dev_err(to_dev(&ioat->base), @@ -102,8 +99,7 @@ static void __ioat2_start_null_desc(struct ioat2_dma_chan *ioat) dev_dbg(to_dev(&ioat->base), "%s: head: %#x tail: %#x issued: %#x\n", __func__, ioat->head, ioat->tail, ioat->issued); - idx = ioat2_desc_alloc(ioat, 1); - desc = ioat2_get_ring_ent(ioat, idx); + desc = ioat2_get_ring_ent(ioat, ioat->head); hw = desc->hw; hw->ctl = 0; @@ -117,14 +113,16 @@ static void __ioat2_start_null_desc(struct ioat2_dma_chan *ioat) async_tx_ack(&desc->txd); ioat2_set_chainaddr(ioat, desc->txd.phys); dump_desc_dbg(ioat, desc); + wmb(); + ioat->head += 1; __ioat2_issue_pending(ioat); } static void ioat2_start_null_desc(struct ioat2_dma_chan *ioat) { - spin_lock_bh(&ioat->ring_lock); + spin_lock_bh(&ioat->prep_lock); __ioat2_start_null_desc(ioat); - spin_unlock_bh(&ioat->ring_lock); + spin_unlock_bh(&ioat->prep_lock); } static void __cleanup(struct ioat2_dma_chan *ioat, unsigned long phys_complete) @@ -134,15 +132,16 @@ static void __cleanup(struct ioat2_dma_chan *ioat, unsigned long phys_complete) struct ioat_ring_ent *desc; bool seen_current = false; u16 active; - int i; + int idx = ioat->tail, i; dev_dbg(to_dev(chan), "%s: head: %#x tail: %#x issued: %#x\n", __func__, ioat->head, ioat->tail, ioat->issued); active = ioat2_ring_active(ioat); for (i = 0; i < active && !seen_current; i++) { - prefetch(ioat2_get_ring_ent(ioat, ioat->tail + i + 1)); - desc = ioat2_get_ring_ent(ioat, ioat->tail + i); + smp_read_barrier_depends(); + prefetch(ioat2_get_ring_ent(ioat, idx + i + 1)); + desc = ioat2_get_ring_ent(ioat, idx + i); tx = &desc->txd; dump_desc_dbg(ioat, desc); if (tx->cookie) { @@ -158,11 +157,12 @@ static void __cleanup(struct ioat2_dma_chan *ioat, unsigned long phys_complete) if (tx->phys == phys_complete) seen_current = true; } - ioat->tail += i; + smp_mb(); /* finish all descriptor reads before incrementing tail */ + ioat->tail = idx + i; BUG_ON(active && !seen_current); /* no active descs have written a completion? 
*/ chan->last_completion = phys_complete; - if (ioat->head == ioat->tail) { + if (active - i == 0) { dev_dbg(to_dev(chan), "%s: cancel completion timeout\n", __func__); clear_bit(IOAT_COMPLETION_PENDING, &chan->state); @@ -179,24 +179,9 @@ static void ioat2_cleanup(struct ioat2_dma_chan *ioat) struct ioat_chan_common *chan = &ioat->base; unsigned long phys_complete; - prefetch(chan->completion); - - if (!spin_trylock_bh(&chan->cleanup_lock)) - return; - - if (!ioat_cleanup_preamble(chan, &phys_complete)) { - spin_unlock_bh(&chan->cleanup_lock); - return; - } - - if (!spin_trylock_bh(&ioat->ring_lock)) { - spin_unlock_bh(&chan->cleanup_lock); - return; - } - - __cleanup(ioat, phys_complete); - - spin_unlock_bh(&ioat->ring_lock); + spin_lock_bh(&chan->cleanup_lock); + if (ioat_cleanup_preamble(chan, &phys_complete)) + __cleanup(ioat, phys_complete); spin_unlock_bh(&chan->cleanup_lock); } @@ -287,12 +272,10 @@ void ioat2_timer_event(unsigned long data) struct ioat2_dma_chan *ioat = to_ioat2_chan((void *) data); struct ioat_chan_common *chan = &ioat->base; - spin_lock_bh(&chan->cleanup_lock); if (test_bit(IOAT_COMPLETION_PENDING, &chan->state)) { unsigned long phys_complete; u64 status; - spin_lock_bh(&ioat->ring_lock); status = ioat_chansts(chan); /* when halted due to errors check for channel @@ -311,26 +294,31 @@ void ioat2_timer_event(unsigned long data) * acknowledged a pending completion once, then be more * forceful with a restart */ - if (ioat_cleanup_preamble(chan, &phys_complete)) + spin_lock_bh(&chan->cleanup_lock); + if (ioat_cleanup_preamble(chan, &phys_complete)) { __cleanup(ioat, phys_complete); - else if (test_bit(IOAT_COMPLETION_ACK, &chan->state)) + } else if (test_bit(IOAT_COMPLETION_ACK, &chan->state)) { + spin_lock_bh(&ioat->prep_lock); ioat2_restart_channel(ioat); - else { + spin_unlock_bh(&ioat->prep_lock); + } else { set_bit(IOAT_COMPLETION_ACK, &chan->state); mod_timer(&chan->timer, jiffies + COMPLETION_TIMEOUT); } - spin_unlock_bh(&ioat->ring_lock); + spin_unlock_bh(&chan->cleanup_lock); } else { u16 active; /* if the ring is idle, empty, and oversized try to step * down the size */ - spin_lock_bh(&ioat->ring_lock); + spin_lock_bh(&chan->cleanup_lock); + spin_lock_bh(&ioat->prep_lock); active = ioat2_ring_active(ioat); if (active == 0 && ioat->alloc_order > ioat_get_alloc_order()) reshape_ring(ioat, ioat->alloc_order-1); - spin_unlock_bh(&ioat->ring_lock); + spin_unlock_bh(&ioat->prep_lock); + spin_unlock_bh(&chan->cleanup_lock); /* keep shrinking until we get back to our minimum * default size @@ -338,7 +326,6 @@ void ioat2_timer_event(unsigned long data) if (ioat->alloc_order > ioat_get_alloc_order()) mod_timer(&chan->timer, jiffies + IDLE_TIMEOUT); } - spin_unlock_bh(&chan->cleanup_lock); } static int ioat2_reset_hw(struct ioat_chan_common *chan) @@ -392,7 +379,7 @@ int ioat2_enumerate_channels(struct ioatdma_device *device) ioat_init_channel(device, &ioat->base, i); ioat->xfercap_log = xfercap_log; - spin_lock_init(&ioat->ring_lock); + spin_lock_init(&ioat->prep_lock); if (device->reset_hw(&ioat->base)) { i = 0; break; @@ -418,8 +405,17 @@ static dma_cookie_t ioat2_tx_submit_unlock(struct dma_async_tx_descriptor *tx) if (!test_and_set_bit(IOAT_COMPLETION_PENDING, &chan->state)) mod_timer(&chan->timer, jiffies + COMPLETION_TIMEOUT); + + /* make descriptor updates visible before advancing ioat->head, + * this is purposefully not smp_wmb() since we are also + * publishing the descriptor updates to a dma device + */ + wmb(); + + ioat->head += ioat->produce; + 
ioat2_update_pending(ioat); - spin_unlock_bh(&ioat->ring_lock); + spin_unlock_bh(&ioat->prep_lock); return cookie; } @@ -531,13 +527,15 @@ int ioat2_alloc_chan_resources(struct dma_chan *c) if (!ring) return -ENOMEM; - spin_lock_bh(&ioat->ring_lock); + spin_lock_bh(&chan->cleanup_lock); + spin_lock_bh(&ioat->prep_lock); ioat->ring = ring; ioat->head = 0; ioat->issued = 0; ioat->tail = 0; ioat->alloc_order = order; - spin_unlock_bh(&ioat->ring_lock); + spin_unlock_bh(&ioat->prep_lock); + spin_unlock_bh(&chan->cleanup_lock); tasklet_enable(&chan->cleanup_task); ioat2_start_null_desc(ioat); @@ -653,54 +651,61 @@ bool reshape_ring(struct ioat2_dma_chan *ioat, int order) } /** - * ioat2_alloc_and_lock - common descriptor alloc boilerplate for ioat2,3 ops - * @idx: gets starting descriptor index on successful allocation + * ioat2_check_space_lock - verify space and grab ring producer lock * @ioat: ioat2,3 channel (ring) to operate on * @num_descs: allocation length */ -int ioat2_alloc_and_lock(u16 *idx, struct ioat2_dma_chan *ioat, int num_descs) +int ioat2_check_space_lock(struct ioat2_dma_chan *ioat, int num_descs) { struct ioat_chan_common *chan = &ioat->base; + bool retry; - spin_lock_bh(&ioat->ring_lock); + retry: + spin_lock_bh(&ioat->prep_lock); /* never allow the last descriptor to be consumed, we need at * least one free at all times to allow for on-the-fly ring * resizing. */ - while (unlikely(ioat2_ring_space(ioat) <= num_descs)) { - if (reshape_ring(ioat, ioat->alloc_order + 1) && - ioat2_ring_space(ioat) > num_descs) - break; - - if (printk_ratelimit()) - dev_dbg(to_dev(chan), - "%s: ring full! num_descs: %d (%x:%x:%x)\n", - __func__, num_descs, ioat->head, ioat->tail, - ioat->issued); - spin_unlock_bh(&ioat->ring_lock); - - /* progress reclaim in the allocation failure case we - * may be called under bh_disabled so we need to trigger - * the timer event directly - */ - spin_lock_bh(&chan->cleanup_lock); - if (jiffies > chan->timer.expires && - timer_pending(&chan->timer)) { - struct ioatdma_device *device = chan->device; - - mod_timer(&chan->timer, jiffies + COMPLETION_TIMEOUT); - spin_unlock_bh(&chan->cleanup_lock); - device->timer_fn((unsigned long) &chan->common); - } else - spin_unlock_bh(&chan->cleanup_lock); - return -ENOMEM; + if (likely(ioat2_ring_space(ioat) > num_descs)) { + dev_dbg(to_dev(chan), "%s: num_descs: %d (%x:%x:%x)\n", + __func__, num_descs, ioat->head, ioat->tail, ioat->issued); + ioat->produce = num_descs; + return 0; /* with ioat->prep_lock held */ } + retry = test_and_set_bit(IOAT_RESHAPE_PENDING, &chan->state); + spin_unlock_bh(&ioat->prep_lock); - dev_dbg(to_dev(chan), "%s: num_descs: %d (%x:%x:%x)\n", - __func__, num_descs, ioat->head, ioat->tail, ioat->issued); + /* is another cpu already trying to expand the ring? */ + if (retry) + goto retry; - *idx = ioat2_desc_alloc(ioat, num_descs); - return 0; /* with ioat->ring_lock held */ + spin_lock_bh(&chan->cleanup_lock); + spin_lock_bh(&ioat->prep_lock); + retry = reshape_ring(ioat, ioat->alloc_order + 1); + clear_bit(IOAT_RESHAPE_PENDING, &chan->state); + spin_unlock_bh(&ioat->prep_lock); + spin_unlock_bh(&chan->cleanup_lock); + + /* if we were able to expand the ring retry the allocation */ + if (retry) + goto retry; + + if (printk_ratelimit()) + dev_dbg(to_dev(chan), "%s: ring full! 
num_descs: %d (%x:%x:%x)\n", + __func__, num_descs, ioat->head, ioat->tail, ioat->issued); + + /* progress reclaim in the allocation failure case we may be + * called under bh_disabled so we need to trigger the timer + * event directly + */ + if (jiffies > chan->timer.expires && timer_pending(&chan->timer)) { + struct ioatdma_device *device = chan->device; + + mod_timer(&chan->timer, jiffies + COMPLETION_TIMEOUT); + device->timer_fn((unsigned long) &chan->common); + } + + return -ENOMEM; } struct dma_async_tx_descriptor * @@ -713,14 +718,11 @@ ioat2_dma_prep_memcpy_lock(struct dma_chan *c, dma_addr_t dma_dest, dma_addr_t dst = dma_dest; dma_addr_t src = dma_src; size_t total_len = len; - int num_descs; - u16 idx; - int i; + int num_descs, idx, i; num_descs = ioat2_xferlen_to_descs(ioat, len); - if (likely(num_descs) && - ioat2_alloc_and_lock(&idx, ioat, num_descs) == 0) - /* pass */; + if (likely(num_descs) && ioat2_check_space_lock(ioat, num_descs) == 0) + idx = ioat->head; else return NULL; i = 0; @@ -777,7 +779,8 @@ void ioat2_free_chan_resources(struct dma_chan *c) device->cleanup_fn((unsigned long) c); device->reset_hw(chan); - spin_lock_bh(&ioat->ring_lock); + spin_lock_bh(&chan->cleanup_lock); + spin_lock_bh(&ioat->prep_lock); descs = ioat2_ring_space(ioat); dev_dbg(to_dev(chan), "freeing %d idle descriptors\n", descs); for (i = 0; i < descs; i++) { @@ -800,7 +803,8 @@ void ioat2_free_chan_resources(struct dma_chan *c) ioat->alloc_order = 0; pci_pool_free(device->completion_pool, chan->completion, chan->completion_dma); - spin_unlock_bh(&ioat->ring_lock); + spin_unlock_bh(&ioat->prep_lock); + spin_unlock_bh(&chan->cleanup_lock); chan->last_completion = 0; chan->completion_dma = 0; diff --git a/drivers/dma/ioat/dma_v2.h b/drivers/dma/ioat/dma_v2.h index d7b64f188f78..a2c413b2b8d8 100644 --- a/drivers/dma/ioat/dma_v2.h +++ b/drivers/dma/ioat/dma_v2.h @@ -50,8 +50,9 @@ extern int ioat_ring_alloc_order; * @tail: cleanup index * @dmacount: identical to 'head' except for occasionally resetting to zero * @alloc_order: log2 of the number of allocated descriptors + * @produce: number of descriptors to produce at submit time * @ring: software ring buffer implementation of hardware ring - * @ring_lock: protects ring attributes + * @prep_lock: serializes descriptor preparation (producers) */ struct ioat2_dma_chan { struct ioat_chan_common base; @@ -61,8 +62,9 @@ struct ioat2_dma_chan { u16 tail; u16 dmacount; u16 alloc_order; + u16 produce; struct ioat_ring_ent **ring; - spinlock_t ring_lock; + spinlock_t prep_lock; }; static inline struct ioat2_dma_chan *to_ioat2_chan(struct dma_chan *c) @@ -94,13 +96,6 @@ static inline u16 ioat2_ring_space(struct ioat2_dma_chan *ioat) return ioat2_ring_size(ioat) - ioat2_ring_active(ioat); } -/* assumes caller already checked space */ -static inline u16 ioat2_desc_alloc(struct ioat2_dma_chan *ioat, u16 len) -{ - ioat->head += len; - return ioat->head - len; -} - static inline u16 ioat2_xferlen_to_descs(struct ioat2_dma_chan *ioat, size_t len) { u16 num_descs = len >> ioat->xfercap_log; @@ -164,7 +159,7 @@ int __devinit ioat2_dma_probe(struct ioatdma_device *dev, int dca); int __devinit ioat3_dma_probe(struct ioatdma_device *dev, int dca); struct dca_provider * __devinit ioat2_dca_init(struct pci_dev *pdev, void __iomem *iobase); struct dca_provider * __devinit ioat3_dca_init(struct pci_dev *pdev, void __iomem *iobase); -int ioat2_alloc_and_lock(u16 *idx, struct ioat2_dma_chan *ioat, int num_descs); +int ioat2_check_space_lock(struct ioat2_dma_chan *ioat, int 
num_descs); int ioat2_enumerate_channels(struct ioatdma_device *device); struct dma_async_tx_descriptor * ioat2_dma_prep_memcpy_lock(struct dma_chan *c, dma_addr_t dma_dest, diff --git a/drivers/dma/ioat/dma_v3.c b/drivers/dma/ioat/dma_v3.c index 6740e319c9cf..8b573fac2a25 100644 --- a/drivers/dma/ioat/dma_v3.c +++ b/drivers/dma/ioat/dma_v3.c @@ -260,8 +260,8 @@ static void __cleanup(struct ioat2_dma_chan *ioat, unsigned long phys_complete) struct ioat_chan_common *chan = &ioat->base; struct ioat_ring_ent *desc; bool seen_current = false; + int idx = ioat->tail, i; u16 active; - int i; dev_dbg(to_dev(chan), "%s: head: %#x tail: %#x issued: %#x\n", __func__, ioat->head, ioat->tail, ioat->issued); @@ -270,13 +270,14 @@ static void __cleanup(struct ioat2_dma_chan *ioat, unsigned long phys_complete) for (i = 0; i < active && !seen_current; i++) { struct dma_async_tx_descriptor *tx; - prefetch(ioat2_get_ring_ent(ioat, ioat->tail + i + 1)); - desc = ioat2_get_ring_ent(ioat, ioat->tail + i); + smp_read_barrier_depends(); + prefetch(ioat2_get_ring_ent(ioat, idx + i + 1)); + desc = ioat2_get_ring_ent(ioat, idx + i); dump_desc_dbg(ioat, desc); tx = &desc->txd; if (tx->cookie) { chan->completed_cookie = tx->cookie; - ioat3_dma_unmap(ioat, desc, ioat->tail + i); + ioat3_dma_unmap(ioat, desc, idx + i); tx->cookie = 0; if (tx->callback) { tx->callback(tx->callback_param); @@ -293,69 +294,30 @@ static void __cleanup(struct ioat2_dma_chan *ioat, unsigned long phys_complete) i++; } } - ioat->tail += i; + smp_mb(); /* finish all descriptor reads before incrementing tail */ + ioat->tail = idx + i; BUG_ON(active && !seen_current); /* no active descs have written a completion? */ chan->last_completion = phys_complete; - active = ioat2_ring_active(ioat); - if (active == 0) { + if (active - i == 0) { dev_dbg(to_dev(chan), "%s: cancel completion timeout\n", __func__); clear_bit(IOAT_COMPLETION_PENDING, &chan->state); mod_timer(&chan->timer, jiffies + IDLE_TIMEOUT); } /* 5 microsecond delay per pending descriptor */ - writew(min((5 * active), IOAT_INTRDELAY_MASK), + writew(min((5 * (active - i)), IOAT_INTRDELAY_MASK), chan->device->reg_base + IOAT_INTRDELAY_OFFSET); } -/* try to cleanup, but yield (via spin_trylock) to incoming submissions - * with the expectation that we will immediately poll again shortly - */ -static void ioat3_cleanup_poll(struct ioat2_dma_chan *ioat) +static void ioat3_cleanup(struct ioat2_dma_chan *ioat) { struct ioat_chan_common *chan = &ioat->base; unsigned long phys_complete; - prefetch(chan->completion); - - if (!spin_trylock_bh(&chan->cleanup_lock)) - return; - - if (!ioat_cleanup_preamble(chan, &phys_complete)) { - spin_unlock_bh(&chan->cleanup_lock); - return; - } - - if (!spin_trylock_bh(&ioat->ring_lock)) { - spin_unlock_bh(&chan->cleanup_lock); - return; - } - - __cleanup(ioat, phys_complete); - - spin_unlock_bh(&ioat->ring_lock); - spin_unlock_bh(&chan->cleanup_lock); -} - -/* run cleanup now because we already delayed the interrupt via INTRDELAY */ -static void ioat3_cleanup_sync(struct ioat2_dma_chan *ioat) -{ - struct ioat_chan_common *chan = &ioat->base; - unsigned long phys_complete; - - prefetch(chan->completion); - spin_lock_bh(&chan->cleanup_lock); - if (!ioat_cleanup_preamble(chan, &phys_complete)) { - spin_unlock_bh(&chan->cleanup_lock); - return; - } - spin_lock_bh(&ioat->ring_lock); - - __cleanup(ioat, phys_complete); - - spin_unlock_bh(&ioat->ring_lock); + if (ioat_cleanup_preamble(chan, &phys_complete)) + __cleanup(ioat, phys_complete); 
spin_unlock_bh(&chan->cleanup_lock); } @@ -363,7 +325,7 @@ static void ioat3_cleanup_event(unsigned long data) { struct ioat2_dma_chan *ioat = to_ioat2_chan((void *) data); - ioat3_cleanup_sync(ioat); + ioat3_cleanup(ioat); writew(IOAT_CHANCTRL_RUN, ioat->base.reg_base + IOAT_CHANCTRL_OFFSET); } @@ -384,12 +346,10 @@ static void ioat3_timer_event(unsigned long data) struct ioat2_dma_chan *ioat = to_ioat2_chan((void *) data); struct ioat_chan_common *chan = &ioat->base; - spin_lock_bh(&chan->cleanup_lock); if (test_bit(IOAT_COMPLETION_PENDING, &chan->state)) { unsigned long phys_complete; u64 status; - spin_lock_bh(&ioat->ring_lock); status = ioat_chansts(chan); /* when halted due to errors check for channel @@ -408,26 +368,31 @@ static void ioat3_timer_event(unsigned long data) * acknowledged a pending completion once, then be more * forceful with a restart */ + spin_lock_bh(&chan->cleanup_lock); if (ioat_cleanup_preamble(chan, &phys_complete)) __cleanup(ioat, phys_complete); - else if (test_bit(IOAT_COMPLETION_ACK, &chan->state)) + else if (test_bit(IOAT_COMPLETION_ACK, &chan->state)) { + spin_lock_bh(&ioat->prep_lock); ioat3_restart_channel(ioat); - else { + spin_unlock_bh(&ioat->prep_lock); + } else { set_bit(IOAT_COMPLETION_ACK, &chan->state); mod_timer(&chan->timer, jiffies + COMPLETION_TIMEOUT); } - spin_unlock_bh(&ioat->ring_lock); + spin_unlock_bh(&chan->cleanup_lock); } else { u16 active; /* if the ring is idle, empty, and oversized try to step * down the size */ - spin_lock_bh(&ioat->ring_lock); + spin_lock_bh(&chan->cleanup_lock); + spin_lock_bh(&ioat->prep_lock); active = ioat2_ring_active(ioat); if (active == 0 && ioat->alloc_order > ioat_get_alloc_order()) reshape_ring(ioat, ioat->alloc_order-1); - spin_unlock_bh(&ioat->ring_lock); + spin_unlock_bh(&ioat->prep_lock); + spin_unlock_bh(&chan->cleanup_lock); /* keep shrinking until we get back to our minimum * default size @@ -435,7 +400,6 @@ static void ioat3_timer_event(unsigned long data) if (ioat->alloc_order > ioat_get_alloc_order()) mod_timer(&chan->timer, jiffies + IDLE_TIMEOUT); } - spin_unlock_bh(&chan->cleanup_lock); } static enum dma_status @@ -447,7 +411,7 @@ ioat3_is_complete(struct dma_chan *c, dma_cookie_t cookie, if (ioat_is_complete(c, cookie, done, used) == DMA_SUCCESS) return DMA_SUCCESS; - ioat3_cleanup_poll(ioat); + ioat3_cleanup(ioat); return ioat_is_complete(c, cookie, done, used); } @@ -460,15 +424,12 @@ ioat3_prep_memset_lock(struct dma_chan *c, dma_addr_t dest, int value, struct ioat_ring_ent *desc; size_t total_len = len; struct ioat_fill_descriptor *fill; - int num_descs; u64 src_data = (0x0101010101010101ULL) * (value & 0xff); - u16 idx; - int i; + int num_descs, idx, i; num_descs = ioat2_xferlen_to_descs(ioat, len); - if (likely(num_descs) && - ioat2_alloc_and_lock(&idx, ioat, num_descs) == 0) - /* pass */; + if (likely(num_descs) && ioat2_check_space_lock(ioat, num_descs) == 0) + idx = ioat->head; else return NULL; i = 0; @@ -513,11 +474,8 @@ __ioat3_prep_xor_lock(struct dma_chan *c, enum sum_check_flags *result, struct ioat_xor_descriptor *xor; struct ioat_xor_ext_descriptor *xor_ex = NULL; struct ioat_dma_descriptor *hw; + int num_descs, with_ext, idx, i; u32 offset = 0; - int num_descs; - int with_ext; - int i; - u16 idx; u8 op = result ? IOAT_OP_XOR_VAL : IOAT_OP_XOR; BUG_ON(src_cnt < 2); @@ -537,9 +495,8 @@ __ioat3_prep_xor_lock(struct dma_chan *c, enum sum_check_flags *result, * (legacy) descriptor to ensure all completion writes arrive in * order. 
*/ - if (likely(num_descs) && - ioat2_alloc_and_lock(&idx, ioat, num_descs+1) == 0) - /* pass */; + if (likely(num_descs) && ioat2_check_space_lock(ioat, num_descs+1) == 0) + idx = ioat->head; else return NULL; i = 0; @@ -657,11 +614,8 @@ __ioat3_prep_pq_lock(struct dma_chan *c, enum sum_check_flags *result, struct ioat_pq_ext_descriptor *pq_ex = NULL; struct ioat_dma_descriptor *hw; u32 offset = 0; - int num_descs; - int with_ext; - int i, s; - u16 idx; u8 op = result ? IOAT_OP_PQ_VAL : IOAT_OP_PQ; + int i, s, idx, with_ext, num_descs; dev_dbg(to_dev(chan), "%s\n", __func__); /* the engine requires at least two sources (we provide @@ -687,8 +641,8 @@ __ioat3_prep_pq_lock(struct dma_chan *c, enum sum_check_flags *result, * order. */ if (likely(num_descs) && - ioat2_alloc_and_lock(&idx, ioat, num_descs+1) == 0) - /* pass */; + ioat2_check_space_lock(ioat, num_descs+1) == 0) + idx = ioat->head; else return NULL; i = 0; @@ -851,10 +805,9 @@ ioat3_prep_interrupt_lock(struct dma_chan *c, unsigned long flags) struct ioat2_dma_chan *ioat = to_ioat2_chan(c); struct ioat_ring_ent *desc; struct ioat_dma_descriptor *hw; - u16 idx; - if (ioat2_alloc_and_lock(&idx, ioat, 1) == 0) - desc = ioat2_get_ring_ent(ioat, idx); + if (ioat2_check_space_lock(ioat, 1) == 0) + desc = ioat2_get_ring_ent(ioat, ioat->head); else return NULL; -- cgit v1.2.3 From 2adfc550b6d9646301c810643bc309fa49375987 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 1 May 2010 15:22:56 -0700 Subject: ioat3: disable cacheline-unaligned transfers for raid operations There are cases where cacheline-unaligned raid operations can hang the dma channel. Simply disable these operations by increasing the alignment constraints published to async_tx. The raid456 driver always issues page aligned requests, so the only in-kernel user of the ioatdma driver that is affected by this change is dmatest. 
Signed-off-by: Dan Williams

--- drivers/dma/ioat/dma_v3.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/dma')
diff --git a/drivers/dma/ioat/dma_v3.c b/drivers/dma/ioat/dma_v3.c index 8b573fac2a25..e2a23952172d 100644 --- a/drivers/dma/ioat/dma_v3.c +++ b/drivers/dma/ioat/dma_v3.c @@ -1175,7 +1175,7 @@ int __devinit ioat3_dma_probe(struct ioatdma_device *device, int dca) if (cap & IOAT_CAP_XOR) { is_raid_device = true; dma->max_xor = 8; - dma->xor_align = 2; + dma->xor_align = 6; dma_cap_set(DMA_XOR, dma->cap_mask); dma->device_prep_dma_xor = ioat3_prep_xor; @@ -1186,7 +1186,7 @@ if (cap & IOAT_CAP_PQ) { is_raid_device = true; dma_set_maxpq(dma, 8, 0); - dma->pq_align = 2; + dma->pq_align = 6; dma_cap_set(DMA_PQ, dma->cap_mask); dma->device_prep_dma_pq = ioat3_prep_pq; @@ -1196,7 +1196,7 @@ if (!(cap & IOAT_CAP_XOR)) { dma->max_xor = 8; - dma->xor_align = 2; + dma->xor_align = 6; dma_cap_set(DMA_XOR, dma->cap_mask); dma->device_prep_dma_xor = ioat3_prep_pqxor;
-- cgit v1.2.3

From c86e1401c9f2ba8d989fa1c4b33d0f0ec3ba8aaf Mon Sep 17 00:00:00 2001
From: Minskey Guo
Date: Sun, 2 May 2010 12:52:35 -0700
Subject: ioat: Remove duplicated devm_kzalloc() calls for ioatdma_device

The memory for the ioatdma_device structure is already allocated in alloc_ioatdma().

Signed-off-by: Minskey Guo
Signed-off-by: Dan Williams

--- drivers/dma/ioat/pci.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'drivers/dma')
diff --git a/drivers/dma/ioat/pci.c b/drivers/dma/ioat/pci.c index 99ec26725bae..fab37d1cf48d 100644 --- a/drivers/dma/ioat/pci.c +++ b/drivers/dma/ioat/pci.c @@ -138,15 +138,10 @@ static int __devinit ioat_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (err) return err; - device = devm_kzalloc(dev, sizeof(*device), GFP_KERNEL); - if (!device) - return -ENOMEM; - - pci_set_master(pdev); - device = alloc_ioatdma(pdev, iomap[IOAT_MMIO_BAR]); if (!device) return -ENOMEM; + pci_set_master(pdev); pci_set_drvdata(pdev, device); device->version = readb(device->reg_base + IOAT_VER_OFFSET);
-- cgit v1.2.3

From caa20d974c86af496b419eef70010e63b7fab7ac Mon Sep 17 00:00:00 2001
From: Dan Williams
Date: Mon, 17 May 2010 16:24:16 -0700
Subject: async_tx: trim dma_async_tx_descriptor in 'no channel switch' case

Saves 24 bytes per descriptor (64-bit) when the channel-switching capabilities of async_tx are not required.
Signed-off-by: Dan Williams --- crypto/async_tx/async_tx.c | 46 +++++++++++++++-------------------- drivers/dma/dmaengine.c | 16 +++++++------ include/linux/dmaengine.h | 60 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 88 insertions(+), 34 deletions(-) (limited to 'drivers/dma') diff --git a/crypto/async_tx/async_tx.c b/crypto/async_tx/async_tx.c index f9cdf04fe7c0..7f2c00a45205 100644 --- a/crypto/async_tx/async_tx.c +++ b/crypto/async_tx/async_tx.c @@ -81,18 +81,13 @@ async_tx_channel_switch(struct dma_async_tx_descriptor *depend_tx, struct dma_device *device = chan->device; struct dma_async_tx_descriptor *intr_tx = (void *) ~0; - #ifdef CONFIG_ASYNC_TX_DISABLE_CHANNEL_SWITCH - BUG(); - #endif - /* first check to see if we can still append to depend_tx */ - spin_lock_bh(&depend_tx->lock); - if (depend_tx->parent && depend_tx->chan == tx->chan) { - tx->parent = depend_tx; - depend_tx->next = tx; + txd_lock(depend_tx); + if (txd_parent(depend_tx) && depend_tx->chan == tx->chan) { + txd_chain(depend_tx, tx); intr_tx = NULL; } - spin_unlock_bh(&depend_tx->lock); + txd_unlock(depend_tx); /* attached dependency, flush the parent channel */ if (!intr_tx) { @@ -111,24 +106,22 @@ async_tx_channel_switch(struct dma_async_tx_descriptor *depend_tx, if (intr_tx) { intr_tx->callback = NULL; intr_tx->callback_param = NULL; - tx->parent = intr_tx; - /* safe to set ->next outside the lock since we know we are + /* safe to chain outside the lock since we know we are * not submitted yet */ - intr_tx->next = tx; + txd_chain(intr_tx, tx); /* check if we need to append */ - spin_lock_bh(&depend_tx->lock); - if (depend_tx->parent) { - intr_tx->parent = depend_tx; - depend_tx->next = intr_tx; + txd_lock(depend_tx); + if (txd_parent(depend_tx)) { + txd_chain(depend_tx, intr_tx); async_tx_ack(intr_tx); intr_tx = NULL; } - spin_unlock_bh(&depend_tx->lock); + txd_unlock(depend_tx); if (intr_tx) { - intr_tx->parent = NULL; + txd_clear_parent(intr_tx); intr_tx->tx_submit(intr_tx); async_tx_ack(intr_tx); } @@ -176,21 +169,20 @@ async_tx_submit(struct dma_chan *chan, struct dma_async_tx_descriptor *tx, * 2/ dependencies are 1:1 i.e. 
two transactions can * not depend on the same parent */ - BUG_ON(async_tx_test_ack(depend_tx) || depend_tx->next || - tx->parent); + BUG_ON(async_tx_test_ack(depend_tx) || txd_next(depend_tx) || + txd_parent(tx)); /* the lock prevents async_tx_run_dependencies from missing * the setting of ->next when ->parent != NULL */ - spin_lock_bh(&depend_tx->lock); - if (depend_tx->parent) { + txd_lock(depend_tx); + if (txd_parent(depend_tx)) { /* we have a parent so we can not submit directly * if we are staying on the same channel: append * else: channel switch */ if (depend_tx->chan == chan) { - tx->parent = depend_tx; - depend_tx->next = tx; + txd_chain(depend_tx, tx); s = ASYNC_TX_SUBMITTED; } else s = ASYNC_TX_CHANNEL_SWITCH; @@ -203,7 +195,7 @@ async_tx_submit(struct dma_chan *chan, struct dma_async_tx_descriptor *tx, else s = ASYNC_TX_CHANNEL_SWITCH; } - spin_unlock_bh(&depend_tx->lock); + txd_unlock(depend_tx); switch (s) { case ASYNC_TX_SUBMITTED: @@ -212,12 +204,12 @@ async_tx_submit(struct dma_chan *chan, struct dma_async_tx_descriptor *tx, async_tx_channel_switch(depend_tx, tx); break; case ASYNC_TX_DIRECT_SUBMIT: - tx->parent = NULL; + txd_clear_parent(tx); tx->tx_submit(tx); break; } } else { - tx->parent = NULL; + txd_clear_parent(tx); tx->tx_submit(tx); } diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index d18b5d069d7e..fcfe1a62ffa5 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -978,7 +978,9 @@ void dma_async_tx_descriptor_init(struct dma_async_tx_descriptor *tx, struct dma_chan *chan) { tx->chan = chan; + #ifndef CONFIG_ASYNC_TX_DISABLE_CHANNEL_SWITCH spin_lock_init(&tx->lock); + #endif } EXPORT_SYMBOL(dma_async_tx_descriptor_init); @@ -1011,7 +1013,7 @@ EXPORT_SYMBOL_GPL(dma_wait_for_async_tx); */ void dma_run_dependencies(struct dma_async_tx_descriptor *tx) { - struct dma_async_tx_descriptor *dep = tx->next; + struct dma_async_tx_descriptor *dep = txd_next(tx); struct dma_async_tx_descriptor *dep_next; struct dma_chan *chan; @@ -1019,7 +1021,7 @@ void dma_run_dependencies(struct dma_async_tx_descriptor *tx) return; /* we'll submit tx->next now, so clear the link */ - tx->next = NULL; + txd_clear_next(tx); chan = dep->chan; /* keep submitting up until a channel switch is detected @@ -1027,14 +1029,14 @@ void dma_run_dependencies(struct dma_async_tx_descriptor *tx) * processing the interrupt from async_tx_channel_switch */ for (; dep; dep = dep_next) { - spin_lock_bh(&dep->lock); - dep->parent = NULL; - dep_next = dep->next; + txd_lock(dep); + txd_clear_parent(dep); + dep_next = txd_next(dep); if (dep_next && dep_next->chan == chan) - dep->next = NULL; /* ->next will be submitted */ + txd_clear_next(dep); /* ->next will be submitted */ else dep_next = NULL; /* submit current dep and terminate */ - spin_unlock_bh(&dep->lock); + txd_unlock(dep); dep->tx_submit(dep); } diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 20ea12c86fd0..cb234979fc6b 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -230,11 +230,71 @@ struct dma_async_tx_descriptor { dma_cookie_t (*tx_submit)(struct dma_async_tx_descriptor *tx); dma_async_tx_callback callback; void *callback_param; +#ifndef CONFIG_ASYNC_TX_DISABLE_CHANNEL_SWITCH struct dma_async_tx_descriptor *next; struct dma_async_tx_descriptor *parent; spinlock_t lock; +#endif }; +#ifdef CONFIG_ASYNC_TX_DISABLE_CHANNEL_SWITCH +static inline void txd_lock(struct dma_async_tx_descriptor *txd) +{ +} +static inline void txd_unlock(struct dma_async_tx_descriptor *txd) +{ +} 
+static inline void txd_chain(struct dma_async_tx_descriptor *txd, struct dma_async_tx_descriptor *next) +{ + BUG(); +} +static inline void txd_clear_parent(struct dma_async_tx_descriptor *txd) +{ +} +static inline void txd_clear_next(struct dma_async_tx_descriptor *txd) +{ +} +static inline struct dma_async_tx_descriptor *txd_next(struct dma_async_tx_descriptor *txd) +{ + return NULL; +} +static inline struct dma_async_tx_descriptor *txd_parent(struct dma_async_tx_descriptor *txd) +{ + return NULL; +} + +#else +static inline void txd_lock(struct dma_async_tx_descriptor *txd) +{ + spin_lock_bh(&txd->lock); +} +static inline void txd_unlock(struct dma_async_tx_descriptor *txd) +{ + spin_unlock_bh(&txd->lock); +} +static inline void txd_chain(struct dma_async_tx_descriptor *txd, struct dma_async_tx_descriptor *next) +{ + txd->next = next; + next->parent = txd; +} +static inline void txd_clear_parent(struct dma_async_tx_descriptor *txd) +{ + txd->parent = NULL; +} +static inline void txd_clear_next(struct dma_async_tx_descriptor *txd) +{ + txd->next = NULL; +} +static inline struct dma_async_tx_descriptor *txd_parent(struct dma_async_tx_descriptor *txd) +{ + return txd->parent; +} +static inline struct dma_async_tx_descriptor *txd_next(struct dma_async_tx_descriptor *txd) +{ + return txd->next; +} +#endif + /** * struct dma_device - info on the entity supplying DMA services * @chancnt: how many DMA channels are supported -- cgit v1.2.3
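To make the CIRC_CNT conversion and the head/issued/tail accounting in the patches above concrete, here is a minimal, self-contained user-space sketch. Only the CIRC_CNT definition is taken verbatim from include/linux/circ_buf.h; struct toy_ring and the ring_*() helpers are illustrative stand-ins for struct ioat2_dma_chan and the ioat2_ring_*() helpers, not driver code.

/* Stand-alone model of the power-of-2 ring accounting adopted in the
 * circ_buf conversion. Build with: cc -o toy_ring toy_ring.c
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* verbatim from include/linux/circ_buf.h */
#define CIRC_CNT(head, tail, size) (((head) - (tail)) & ((size) - 1))

struct toy_ring {
	uint16_t head;		/* producer: next slot to be prepared */
	uint16_t issued;	/* producer: last slot handed to hardware */
	uint16_t tail;		/* consumer: next slot to be retired */
	uint16_t alloc_order;	/* ring holds 1 << alloc_order slots */
};

static uint16_t ring_size(const struct toy_ring *r)
{
	return 1 << r->alloc_order;
}

/* descriptors in flight with the engine (head ahead of tail) */
static uint16_t ring_active(const struct toy_ring *r)
{
	return CIRC_CNT(r->head, r->tail, ring_size(r));
}

/* descriptors prepared but not yet submitted (head ahead of issued) */
static uint16_t ring_pending(const struct toy_ring *r)
{
	return CIRC_CNT(r->head, r->issued, ring_size(r));
}

static uint16_t ring_space(const struct toy_ring *r)
{
	return ring_size(r) - ring_active(r);
}

int main(void)
{
	struct toy_ring r = { .alloc_order = 4 };	/* 16 slots */

	r.head = 5;		/* producer prepared five descriptors */
	assert(ring_active(&r) == 5 && ring_pending(&r) == 5);

	r.issued = r.head;	/* issue_pending: hand them to hardware */
	r.tail = 3;		/* cleanup retired three completions */
	assert(ring_pending(&r) == 0);
	assert(ring_active(&r) == 2 && ring_space(&r) == 14);

	/* the free-running u16 indices may wrap; the power-of-2 mask
	 * inside CIRC_CNT keeps the counts correct across the wrap
	 */
	r.head = 2; r.issued = 2; r.tail = 65535;
	assert(ring_active(&r) == 3);

	printf("active=%u pending=%u space=%u\n",
	       (unsigned)ring_active(&r), (unsigned)ring_pending(&r),
	       (unsigned)ring_space(&r));
	return 0;
}

Note that ring_space() here, like ioat2_ring_space() after the conversion, counts every slot as usable; the driver enforces the usual "keep one slot free" rule one level up instead, in ioat2_check_space_lock(), which only proceeds while ioat2_ring_space(ioat) > num_descs so that an in-place reshape_ring() always has a spare descriptor to work with.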