From be4fdb501e9209bd8ef5d625310dae94fb056a41 Mon Sep 17 00:00:00 2001 From: Jordan Crouse Date: Mon, 13 Feb 2017 10:14:23 -0700 Subject: drm/msm: Remove memptrs->wptr memptrs->wptr seems to be unused. Remove it to avoid confusing the upcoming preemption code. Change-Id: Ic0dedbadacef5e866bd37a332019f1133f1def49 Signed-off-by: Jordan Crouse --- drivers/gpu/drm/msm/adreno/adreno_gpu.c | 3 --- drivers/gpu/drm/msm/adreno/adreno_gpu.h | 1 - 2 files changed, 4 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c index e67c63c9a3ac..e4a6d2ba93e3 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c @@ -120,7 +120,6 @@ void adreno_recover(struct msm_gpu *gpu) /* reset completed fence seqno, just discard anything pending: */ adreno_gpu->memptrs->fence = gpu->submitted_fence; adreno_gpu->memptrs->rptr = 0; - adreno_gpu->memptrs->wptr = 0; gpu->funcs->pm_resume(gpu); @@ -262,7 +261,6 @@ void adreno_show(struct msm_gpu *gpu, struct seq_file *m) seq_printf(m, "fence: %d/%d\n", adreno_gpu->memptrs->fence, gpu->submitted_fence); seq_printf(m, "rptr: %d\n", get_rptr(adreno_gpu)); - seq_printf(m, "wptr: %d\n", adreno_gpu->memptrs->wptr); seq_printf(m, "rb wptr: %d\n", get_wptr(gpu->rb)); gpu->funcs->pm_resume(gpu); @@ -303,7 +301,6 @@ void adreno_dump_info(struct msm_gpu *gpu) printk("fence: %d/%d\n", adreno_gpu->memptrs->fence, gpu->submitted_fence); printk("rptr: %d\n", get_rptr(adreno_gpu)); - printk("wptr: %d\n", adreno_gpu->memptrs->wptr); printk("rb wptr: %d\n", get_wptr(gpu->rb)); for (i = 0; i < 8; i++) { diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.h b/drivers/gpu/drm/msm/adreno/adreno_gpu.h index 9bcf4bb705bd..86a993bcce32 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.h +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.h @@ -88,7 +88,6 @@ const struct adreno_info *adreno_info(struct adreno_rev rev); struct adreno_rbmemptrs { volatile uint32_t rptr; - volatile uint32_t wptr; volatile uint32_t fence; }; -- cgit v1.2.3 From b0dfb38811ed601bca0f52a2425fa4eae760f9aa Mon Sep 17 00:00:00 2001 From: Jordan Crouse Date: Mon, 13 Feb 2017 10:14:23 -0700 Subject: drm/msm: Add a property for the GMEM base Return the base address of GMEM in virtual address space as a parameter. 
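(Illustration only, not part of the patch: userspace is expected to query the new property through the existing GET_PARAM ioctl. The sketch below assumes an open DRM fd for the msm driver and the stock libdrm helper; the helper function name is made up.)

    #include <stdint.h>
    #include <xf86drm.h>
    #include "msm_drm.h"

    /* Hypothetical helper: ask the kernel for the GMEM base address */
    static int get_gmem_base(int fd, uint64_t *base)
    {
            struct drm_msm_param req = {
                    .pipe = MSM_PIPE_3D0,
                    .param = MSM_PARAM_GMEM_BASE,
            };
            int ret = drmCommandWriteRead(fd, DRM_MSM_GET_PARAM,
                            &req, sizeof(req));

            if (ret == 0)
                    *base = req.value; /* 0x100000 on current targets */

            return ret;
    }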
Change-Id: Ic0dedbad3b849052313e4673efcf6c22bc81f21f Signed-off-by: Jordan Crouse --- drivers/gpu/drm/msm/adreno/adreno_gpu.c | 3 +++ include/uapi/drm/msm_drm.h | 1 + 2 files changed, 4 insertions(+) diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c index e4a6d2ba93e3..fad7a5fd9d60 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c @@ -35,6 +35,9 @@ int adreno_get_param(struct msm_gpu *gpu, uint32_t param, uint64_t *value) case MSM_PARAM_GMEM_SIZE: *value = adreno_gpu->gmem; return 0; + case MSM_PARAM_GMEM_BASE: + *value = 0x100000; + return 0; case MSM_PARAM_CHIP_ID: *value = adreno_gpu->rev.patchid | (adreno_gpu->rev.minor << 8) | diff --git a/include/uapi/drm/msm_drm.h b/include/uapi/drm/msm_drm.h index 587d35ce1638..20ef9bc424f3 100644 --- a/include/uapi/drm/msm_drm.h +++ b/include/uapi/drm/msm_drm.h @@ -54,6 +54,7 @@ struct drm_msm_timespec { #define MSM_PARAM_CHIP_ID 0x03 #define MSM_PARAM_MAX_FREQ 0x04 #define MSM_PARAM_TIMESTAMP 0x05 +#define MSM_PARAM_GMEM_BASE 0x06 struct drm_msm_param { __u32 pipe; /* in, MSM_PIPE_x */ -- cgit v1.2.3 From 378583458fa167277b15d145dccce253459393ec Mon Sep 17 00:00:00 2001 From: Jordan Crouse Date: Mon, 13 Feb 2017 10:14:24 -0700 Subject: drm/msm: Add support for multiple ringbuffers Add the infrastructure for supporting multiple ringbuffers. Change-Id: Ic0dedbada90ec5c4c8074ffce33c3fe275b0cda1 Signed-off-by: Jordan Crouse --- drivers/gpu/drm/msm/adreno/a3xx_gpu.c | 14 +-- drivers/gpu/drm/msm/adreno/a4xx_gpu.c | 14 +-- drivers/gpu/drm/msm/adreno/a5xx_gpu.c | 72 ++++++++++---- drivers/gpu/drm/msm/adreno/a5xx_gpu.h | 1 + drivers/gpu/drm/msm/adreno/a5xx_power.c | 6 +- drivers/gpu/drm/msm/adreno/adreno_gpu.c | 168 ++++++++++++++++++++----------- drivers/gpu/drm/msm/adreno/adreno_gpu.h | 36 ++++--- drivers/gpu/drm/msm/msm_drv.c | 44 ++++---- drivers/gpu/drm/msm/msm_drv.h | 34 ++++++- drivers/gpu/drm/msm/msm_gem.h | 1 + drivers/gpu/drm/msm/msm_gem_submit.c | 7 +- drivers/gpu/drm/msm/msm_gpu.c | 171 +++++++++++++++++++++----------- drivers/gpu/drm/msm/msm_gpu.h | 45 +++++++-- drivers/gpu/drm/msm/msm_ringbuffer.c | 12 +-- drivers/gpu/drm/msm/msm_ringbuffer.h | 6 +- include/uapi/drm/msm_drm.h | 17 +++- 16 files changed, 446 insertions(+), 202 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/a3xx_gpu.c b/drivers/gpu/drm/msm/adreno/a3xx_gpu.c index 5a061ad6225a..c4f886fd6037 100644 --- a/drivers/gpu/drm/msm/adreno/a3xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a3xx_gpu.c @@ -40,10 +40,11 @@ extern bool hang_debug; static void a3xx_dump(struct msm_gpu *gpu); +static bool a3xx_idle(struct msm_gpu *gpu); static bool a3xx_me_init(struct msm_gpu *gpu) { - struct msm_ringbuffer *ring = gpu->rb; + struct msm_ringbuffer *ring = gpu->rb[0]; OUT_PKT3(ring, CP_ME_INIT, 17); OUT_RING(ring, 0x000003f7); @@ -64,8 +65,8 @@ static bool a3xx_me_init(struct msm_gpu *gpu) OUT_RING(ring, 0x00000000); OUT_RING(ring, 0x00000000); - gpu->funcs->flush(gpu); - return gpu->funcs->idle(gpu); + gpu->funcs->flush(gpu, ring); + return a3xx_idle(gpu); } static int a3xx_hw_init(struct msm_gpu *gpu) @@ -331,7 +332,7 @@ static void a3xx_destroy(struct msm_gpu *gpu) static bool a3xx_idle(struct msm_gpu *gpu) { /* wait for ringbuffer to drain: */ - if (!adreno_idle(gpu)) + if (!adreno_idle(gpu, gpu->rb[0])) return false; /* then wait for GPU to finish: */ @@ -439,9 +440,10 @@ static const struct adreno_gpu_funcs funcs = { .pm_resume = msm_gpu_pm_resume, .recover = a3xx_recover, .last_fence = 
adreno_last_fence, + .submitted_fence = adreno_submitted_fence, .submit = adreno_submit, .flush = adreno_flush, - .idle = a3xx_idle, + .active_ring = adreno_active_ring, .irq = a3xx_irq, .destroy = a3xx_destroy, #ifdef CONFIG_DEBUG_FS @@ -489,7 +491,7 @@ struct msm_gpu *a3xx_gpu_init(struct drm_device *dev) adreno_gpu->registers = a3xx_registers; adreno_gpu->reg_offsets = a3xx_register_offsets; - ret = adreno_gpu_init(dev, pdev, adreno_gpu, &funcs); + ret = adreno_gpu_init(dev, pdev, adreno_gpu, &funcs, 1); if (ret) goto fail; diff --git a/drivers/gpu/drm/msm/adreno/a4xx_gpu.c b/drivers/gpu/drm/msm/adreno/a4xx_gpu.c index 47c9b22b0801..534a7c3fbdca 100644 --- a/drivers/gpu/drm/msm/adreno/a4xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a4xx_gpu.c @@ -31,6 +31,7 @@ extern bool hang_debug; static void a4xx_dump(struct msm_gpu *gpu); +static bool a4xx_idle(struct msm_gpu *gpu); /* * a4xx_enable_hwcg() - Program the clock control registers @@ -115,7 +116,7 @@ static void a4xx_enable_hwcg(struct msm_gpu *gpu) static bool a4xx_me_init(struct msm_gpu *gpu) { - struct msm_ringbuffer *ring = gpu->rb; + struct msm_ringbuffer *ring = gpu->rb[0]; OUT_PKT3(ring, CP_ME_INIT, 17); OUT_RING(ring, 0x000003f7); @@ -136,8 +137,8 @@ static bool a4xx_me_init(struct msm_gpu *gpu) OUT_RING(ring, 0x00000000); OUT_RING(ring, 0x00000000); - gpu->funcs->flush(gpu); - return gpu->funcs->idle(gpu); + gpu->funcs->flush(gpu, ring); + return a4xx_idle(gpu); } static int a4xx_hw_init(struct msm_gpu *gpu) @@ -329,7 +330,7 @@ static void a4xx_destroy(struct msm_gpu *gpu) static bool a4xx_idle(struct msm_gpu *gpu) { /* wait for ringbuffer to drain: */ - if (!adreno_idle(gpu)) + if (!adreno_idle(gpu, gpu->rb[0])) return false; /* then wait for GPU to finish: */ @@ -522,9 +523,10 @@ static const struct adreno_gpu_funcs funcs = { .pm_resume = a4xx_pm_resume, .recover = a4xx_recover, .last_fence = adreno_last_fence, + .submitted_fence = adreno_submitted_fence, .submit = adreno_submit, .flush = adreno_flush, - .idle = a4xx_idle, + .active_ring = adreno_active_ring, .irq = a4xx_irq, .destroy = a4xx_destroy, #ifdef CONFIG_DEBUG_FS @@ -566,7 +568,7 @@ struct msm_gpu *a4xx_gpu_init(struct drm_device *dev) adreno_gpu->registers = a4xx_registers; adreno_gpu->reg_offsets = a4xx_register_offsets; - ret = adreno_gpu_init(dev, pdev, adreno_gpu, &funcs); + ret = adreno_gpu_init(dev, pdev, adreno_gpu, &funcs, 1); if (ret) goto fail; diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c index 8bc3c7bee3fb..1e554ef51968 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c @@ -22,7 +22,7 @@ static int a5xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); struct msm_drm_private *priv = gpu->dev->dev_private; - struct msm_ringbuffer *ring = gpu->rb; + struct msm_ringbuffer *ring = gpu->rb[submit->ring]; unsigned int i, ibs = 0; for (i = 0; i < submit->nr_cmds; i++) { @@ -47,11 +47,12 @@ static int a5xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, OUT_PKT7(ring, CP_EVENT_WRITE, 4); OUT_RING(ring, CACHE_FLUSH_TS | (1 << 31)); - OUT_RING(ring, lower_32_bits(rbmemptr(adreno_gpu, fence))); - OUT_RING(ring, upper_32_bits(rbmemptr(adreno_gpu, fence))); + + OUT_RING(ring, lower_32_bits(rbmemptr(adreno_gpu, ring->id, fence))); + OUT_RING(ring, upper_32_bits(rbmemptr(adreno_gpu, ring->id, fence))); OUT_RING(ring, submit->fence); - gpu->funcs->flush(gpu); + gpu->funcs->flush(gpu, ring); return 0; } @@ -175,7 
+176,7 @@ static void a5xx_enable_hwcg(struct msm_gpu *gpu) static int a5xx_me_init(struct msm_gpu *gpu) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); - struct msm_ringbuffer *ring = gpu->rb; + struct msm_ringbuffer *ring = gpu->rb[0]; OUT_PKT7(ring, CP_ME_INIT, 8); @@ -206,9 +207,8 @@ static int a5xx_me_init(struct msm_gpu *gpu) OUT_RING(ring, 0x00000000); OUT_RING(ring, 0x00000000); - gpu->funcs->flush(gpu); - - return gpu->funcs->idle(gpu) ? 0 : -EINVAL; + gpu->funcs->flush(gpu, ring); + return a5xx_idle(gpu, ring) ? 0 : -EINVAL; } static struct drm_gem_object *a5xx_ucode_load_bo(struct msm_gpu *gpu, @@ -545,11 +545,11 @@ static int a5xx_hw_init(struct msm_gpu *gpu) * ticking correctly */ if (adreno_is_a530(adreno_gpu)) { - OUT_PKT7(gpu->rb, CP_EVENT_WRITE, 1); - OUT_RING(gpu->rb, 0x0F); + OUT_PKT7(gpu->rb[0], CP_EVENT_WRITE, 1); + OUT_RING(gpu->rb[0], 0x0F); - gpu->funcs->flush(gpu); - if (!gpu->funcs->idle(gpu)) + gpu->funcs->flush(gpu, gpu->rb[0]); + if (!a5xx_idle(gpu, gpu->rb[0])) return -EINVAL; } @@ -562,13 +562,13 @@ static int a5xx_hw_init(struct msm_gpu *gpu) * cause a XPU violation. */ if (test_bit(A5XX_ZAP_SHADER_LOADED, &a5xx_gpu->flags)) { - struct msm_ringbuffer *ring = gpu->rb; + struct msm_ringbuffer *ring = gpu->rb[0]; OUT_PKT7(ring, CP_SET_SECURE_MODE, 1); OUT_RING(ring, 0x00000000); - gpu->funcs->flush(gpu); - if (!gpu->funcs->idle(gpu)) + gpu->funcs->flush(gpu, gpu->rb[0]); + if (!a5xx_idle(gpu, gpu->rb[0])) return -EINVAL; } else { /* Print a warning so if we die, we know why */ @@ -639,16 +639,19 @@ static inline bool _a5xx_check_idle(struct msm_gpu *gpu) A5XX_RBBM_INT_0_MASK_MISC_HANG_DETECT); } -static bool a5xx_idle(struct msm_gpu *gpu) +bool a5xx_idle(struct msm_gpu *gpu, struct msm_ringbuffer *ring) { /* wait for CP to drain ringbuffer: */ - if (!adreno_idle(gpu)) + if (!adreno_idle(gpu, ring)) return false; if (spin_until(_a5xx_check_idle(gpu))) { - DRM_ERROR("%s: timeout waiting for GPU to idle: status %8.8X irq %8.8X\n", - gpu->name, + DRM_ERROR( + "%s: timeout waiting for GPU RB %d to idle: status %8.8X rptr/wptr: %4.4X/%4.4X irq %8.8X\n", + gpu->name, ring->id, gpu_read(gpu, REG_A5XX_RBBM_STATUS), + gpu_read(gpu, REG_A5XX_CP_RB_RPTR), + gpu_read(gpu, REG_A5XX_CP_RB_WPTR), gpu_read(gpu, REG_A5XX_RBBM_INT_0_STATUS)); return false; @@ -768,6 +771,32 @@ static void a5xx_fault_detect_irq(struct msm_gpu *gpu) { struct drm_device *dev = gpu->dev; struct msm_drm_private *priv = dev->dev_private; + struct msm_ringbuffer *ring; + int i; + + dev_err(dev->dev, "GPU fault detected: status %8.8X\n", + gpu_read(gpu, REG_A5XX_RBBM_STATUS)); + + dev_err(dev->dev, + "RPTR/WPTR %4.4X/%4.4X IB1 %16.16llX/%4.4X IB2 %16.16llX/%4.4X\n", + gpu_read(gpu, REG_A5XX_CP_RB_RPTR), + gpu_read(gpu, REG_A5XX_CP_RB_WPTR), + gpu_read64(gpu, REG_A5XX_CP_IB1_BASE, REG_A5XX_CP_IB1_BASE_HI), + gpu_read(gpu, REG_A5XX_CP_IB1_BUFSZ), + gpu_read64(gpu, REG_A5XX_CP_IB2_BASE, REG_A5XX_CP_IB2_BASE_HI), + gpu_read(gpu, REG_A5XX_CP_IB2_BUFSZ)); + + FOR_EACH_RING(gpu, ring, i) { + if (!ring) + continue; + + dev_err(dev->dev, "RB %d: submitted %d retired %d\n", ring->id, + adreno_submitted_fence(gpu, ring), + adreno_last_fence(gpu, ring)); + } + + /* Turn off the hangcheck timer to keep it from bothering us */ + del_timer(&gpu->hangcheck_timer); queue_work(priv->wq, &gpu->recover_work); } @@ -951,9 +980,10 @@ static const struct adreno_gpu_funcs funcs = { .pm_resume = a5xx_pm_resume, .recover = a5xx_recover, .last_fence = adreno_last_fence, + .submitted_fence = adreno_submitted_fence, .submit = 
a5xx_submit, .flush = adreno_flush, - .idle = a5xx_idle, + .active_ring = adreno_active_ring, .irq = a5xx_irq, .destroy = a5xx_destroy, #ifdef CONFIG_DEBUG_FS @@ -1073,7 +1103,7 @@ struct msm_gpu *a5xx_gpu_init(struct drm_device *dev) /* Check the efuses for some configuration */ a5xx_efuses_read(pdev, adreno_gpu); - ret = adreno_gpu_init(dev, pdev, adreno_gpu, &funcs); + ret = adreno_gpu_init(dev, pdev, adreno_gpu, &funcs, 1); if (ret) { a5xx_destroy(&(a5xx_gpu->base.base)); return ERR_PTR(ret); diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.h b/drivers/gpu/drm/msm/adreno/a5xx_gpu.h index e82f54063877..6c8d1f41b680 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.h +++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.h @@ -62,5 +62,6 @@ static inline int spin_usecs(struct msm_gpu *gpu, uint32_t usecs, return -ETIMEDOUT; } +bool a5xx_idle(struct msm_gpu *gpu, struct msm_ringbuffer *ring); #endif /* __A5XX_GPU_H__ */ diff --git a/drivers/gpu/drm/msm/adreno/a5xx_power.c b/drivers/gpu/drm/msm/adreno/a5xx_power.c index 18bca141b425..2040b18f731f 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_power.c +++ b/drivers/gpu/drm/msm/adreno/a5xx_power.c @@ -229,7 +229,7 @@ static int a5xx_gpmu_init(struct msm_gpu *gpu) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); struct a5xx_gpu *a5xx_gpu = to_a5xx_gpu(adreno_gpu); - struct msm_ringbuffer *ring = gpu->rb; + struct msm_ringbuffer *ring = gpu->rb[0]; if (!a5xx_gpu->gpmu_dwords) return 0; @@ -248,9 +248,9 @@ static int a5xx_gpmu_init(struct msm_gpu *gpu) OUT_PKT7(ring, CP_SET_PROTECTED_MODE, 1); OUT_RING(ring, 1); - gpu->funcs->flush(gpu); + gpu->funcs->flush(gpu, ring); - if (!gpu->funcs->idle(gpu)) { + if (!a5xx_idle(gpu, ring)) { DRM_ERROR("%s: Unable to load GPMU firmware. GPMU will not be active\n", gpu->name); return -EINVAL; diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c index fad7a5fd9d60..8506ca2e9b8a 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c @@ -21,7 +21,6 @@ #include "msm_gem.h" #include "msm_mmu.h" -#define RB_SIZE SZ_32K #define RB_BLKSIZE 32 int adreno_get_param(struct msm_gpu *gpu, uint32_t param, uint64_t *value) @@ -60,30 +59,34 @@ int adreno_get_param(struct msm_gpu *gpu, uint32_t param, uint64_t *value) int adreno_hw_init(struct msm_gpu *gpu) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); - int ret; + int i; DBG("%s", gpu->name); - ret = msm_gem_get_iova(gpu->rb->bo, gpu->aspace, &gpu->rb_iova); - if (ret) { - gpu->rb_iova = 0; - dev_err(gpu->dev->dev, "could not map ringbuffer: %d\n", ret); - return ret; + for (i = 0; i < gpu->nr_rings; i++) { + int ret = msm_gem_get_iova(gpu->rb[i]->bo, gpu->aspace, + &gpu->rb[i]->iova); + if (ret) { + gpu->rb[i]->iova = 0; + dev_err(gpu->dev->dev, + "could not map ringbuffer %d: %d\n", i, ret); + return ret; + } } /* Setup REG_CP_RB_CNTL: */ adreno_gpu_write(adreno_gpu, REG_ADRENO_CP_RB_CNTL, - /* size is log2(quad-words): */ - AXXX_CP_RB_CNTL_BUFSZ(ilog2(gpu->rb->size / 8)) | - AXXX_CP_RB_CNTL_BLKSZ(ilog2(RB_BLKSIZE / 8)) | - (adreno_is_a430(adreno_gpu) ? AXXX_CP_RB_CNTL_NO_UPDATE : 0)); + /* size is log2(quad-words): */ + AXXX_CP_RB_CNTL_BUFSZ(ilog2(MSM_GPU_RINGBUFFER_SZ / 8)) | + AXXX_CP_RB_CNTL_BLKSZ(ilog2(RB_BLKSIZE / 8)) | + (adreno_is_a430(adreno_gpu) ? 
AXXX_CP_RB_CNTL_NO_UPDATE : 0)); - /* Setup ringbuffer address */ + /* Setup ringbuffer address - use ringbuffer[0] for GPU init */ adreno_gpu_write64(adreno_gpu, REG_ADRENO_CP_RB_BASE, - REG_ADRENO_CP_RB_BASE_HI, gpu->rb_iova); + REG_ADRENO_CP_RB_BASE_HI, gpu->rb[0]->iova); adreno_gpu_write64(adreno_gpu, REG_ADRENO_CP_RB_RPTR_ADDR, - REG_ADRENO_CP_RB_RPTR_ADDR_HI, rbmemptr(adreno_gpu, rptr)); + REG_ADRENO_CP_RB_RPTR_ADDR_HI, rbmemptr(adreno_gpu, 0, rptr)); return 0; } @@ -94,35 +97,68 @@ static uint32_t get_wptr(struct msm_ringbuffer *ring) } /* Use this helper to read rptr, since a430 doesn't update rptr in memory */ -static uint32_t get_rptr(struct adreno_gpu *adreno_gpu) +static uint32_t get_rptr(struct adreno_gpu *adreno_gpu, + struct msm_ringbuffer *ring) { - if (adreno_is_a430(adreno_gpu)) - return adreno_gpu->memptrs->rptr = adreno_gpu_read( + if (adreno_is_a430(adreno_gpu)) { + /* + * If index is anything but 0 this will probably break horribly, + * but I think that we have enough infrastructure in place to + * ensure that it won't be. If not then this is why your + * a430 stopped working. + */ + return adreno_gpu->memptrs->rptr[ring->id] = adreno_gpu_read( adreno_gpu, REG_ADRENO_CP_RB_RPTR); - else - return adreno_gpu->memptrs->rptr; + } else + return adreno_gpu->memptrs->rptr[ring->id]; } -uint32_t adreno_last_fence(struct msm_gpu *gpu) +struct msm_ringbuffer *adreno_active_ring(struct msm_gpu *gpu) +{ + return gpu->rb[0]; +} + +uint32_t adreno_submitted_fence(struct msm_gpu *gpu, + struct msm_ringbuffer *ring) +{ + if (!ring) + return 0; + + return ring->submitted_fence; +} + +uint32_t adreno_last_fence(struct msm_gpu *gpu, struct msm_ringbuffer *ring) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); - return adreno_gpu->memptrs->fence; + + if (!ring) + return 0; + + return adreno_gpu->memptrs->fence[ring->id]; } void adreno_recover(struct msm_gpu *gpu) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); struct drm_device *dev = gpu->dev; - int ret; + struct msm_ringbuffer *ring; + int ret, i; gpu->funcs->pm_suspend(gpu); - /* reset ringbuffer: */ - gpu->rb->cur = gpu->rb->start; + /* reset ringbuffer(s): */ + + FOR_EACH_RING(gpu, ring, i) { + if (!ring) + continue; + + ring->cur = ring->start; - /* reset completed fence seqno, just discard anything pending: */ - adreno_gpu->memptrs->fence = gpu->submitted_fence; - adreno_gpu->memptrs->rptr = 0; + /* reset completed fence seqno, discard anything pending: */ + adreno_gpu->memptrs->fence[ring->id] = + adreno_submitted_fence(gpu, ring); + adreno_gpu->memptrs->rptr[ring->id] = 0; + } gpu->funcs->pm_resume(gpu); @@ -140,7 +176,7 @@ int adreno_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); struct msm_drm_private *priv = gpu->dev->dev_private; - struct msm_ringbuffer *ring = gpu->rb; + struct msm_ringbuffer *ring = gpu->rb[submit->ring]; unsigned i, ibs = 0; for (i = 0; i < submit->nr_cmds; i++) { @@ -186,7 +222,7 @@ int adreno_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, OUT_PKT3(ring, CP_EVENT_WRITE, 3); OUT_RING(ring, CACHE_FLUSH_TS); - OUT_RING(ring, rbmemptr(adreno_gpu, fence)); + OUT_RING(ring, rbmemptr(adreno_gpu, ring->id, fence)); OUT_RING(ring, submit->fence); /* we could maybe be clever and only CP_COND_EXEC the interrupt: */ @@ -213,12 +249,12 @@ int adreno_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, } #endif - gpu->funcs->flush(gpu); + gpu->funcs->flush(gpu, ring); return 0; } -void adreno_flush(struct msm_gpu *gpu) +void 
adreno_flush(struct msm_gpu *gpu, struct msm_ringbuffer *ring) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); uint32_t wptr; @@ -228,7 +264,7 @@ void adreno_flush(struct msm_gpu *gpu) * to account for the possibility that the last command fit exactly into * the ringbuffer and rb->next hasn't wrapped to zero yet */ - wptr = get_wptr(gpu->rb) & ((gpu->rb->size / 4) - 1); + wptr = get_wptr(ring) % (MSM_GPU_RINGBUFFER_SZ >> 2); /* ensure writes to ringbuffer have hit system memory: */ mb(); @@ -236,17 +272,18 @@ void adreno_flush(struct msm_gpu *gpu) adreno_gpu_write(adreno_gpu, REG_ADRENO_CP_RB_WPTR, wptr); } -bool adreno_idle(struct msm_gpu *gpu) +bool adreno_idle(struct msm_gpu *gpu, struct msm_ringbuffer *ring) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); - uint32_t wptr = get_wptr(gpu->rb); + uint32_t wptr = get_wptr(ring); /* wait for CP to drain ringbuffer: */ - if (!spin_until(get_rptr(adreno_gpu) == wptr)) + if (!spin_until(get_rptr(adreno_gpu, ring) == wptr)) return true; /* TODO maybe we need to reset GPU here to recover from hang? */ - DRM_ERROR("%s: timeout waiting to drain ringbuffer!\n", gpu->name); + DRM_ERROR("%s: timeout waiting to drain ringbuffer %d!\n", gpu->name, + ring->id); return false; } @@ -254,6 +291,7 @@ bool adreno_idle(struct msm_gpu *gpu) void adreno_show(struct msm_gpu *gpu, struct seq_file *m) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); + struct msm_ringbuffer *ring; int i; seq_printf(m, "revision: %d (%d.%d.%d.%d)\n", @@ -261,10 +299,18 @@ void adreno_show(struct msm_gpu *gpu, struct seq_file *m) adreno_gpu->rev.major, adreno_gpu->rev.minor, adreno_gpu->rev.patchid); - seq_printf(m, "fence: %d/%d\n", adreno_gpu->memptrs->fence, - gpu->submitted_fence); - seq_printf(m, "rptr: %d\n", get_rptr(adreno_gpu)); - seq_printf(m, "rb wptr: %d\n", get_wptr(gpu->rb)); + FOR_EACH_RING(gpu, ring, i) { + if (!ring) + continue; + + seq_printf(m, "rb %d: fence: %d/%d\n", i, + adreno_last_fence(gpu, ring), + adreno_submitted_fence(gpu, ring)); + + seq_printf(m, " rptr: %d\n", + get_rptr(adreno_gpu, ring)); + seq_printf(m, "rb wptr: %d\n", get_wptr(ring)); + } gpu->funcs->pm_resume(gpu); @@ -294,20 +340,28 @@ void adreno_show(struct msm_gpu *gpu, struct seq_file *m) void adreno_dump_info(struct msm_gpu *gpu) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); + struct msm_ringbuffer *ring; int i; - printk("revision: %d (%d.%d.%d.%d)\n", + pr_err("revision: %d (%d.%d.%d.%d)\n", adreno_gpu->info->revn, adreno_gpu->rev.core, adreno_gpu->rev.major, adreno_gpu->rev.minor, adreno_gpu->rev.patchid); - printk("fence: %d/%d\n", adreno_gpu->memptrs->fence, - gpu->submitted_fence); - printk("rptr: %d\n", get_rptr(adreno_gpu)); - printk("rb wptr: %d\n", get_wptr(gpu->rb)); + FOR_EACH_RING(gpu, ring, i) { + if (!ring) + continue; + + pr_err("rb %d: fence: %d/%d\n", i, + adreno_last_fence(gpu, ring), + adreno_submitted_fence(gpu, ring)); + + pr_err("rptr: %d\n", get_rptr(adreno_gpu, ring)); + pr_err("rb wptr: %d\n", get_wptr(ring)); + } for (i = 0; i < 8; i++) { - printk("CP_SCRATCH_REG%d: %u\n", i, + pr_err("CP_SCRATCH_REG%d: %u\n", i, gpu_read(gpu, REG_AXXX_CP_SCRATCH_REG0 + i)); } } @@ -332,19 +386,20 @@ void adreno_dump(struct msm_gpu *gpu) } } -static uint32_t ring_freewords(struct msm_gpu *gpu) +static uint32_t ring_freewords(struct msm_ringbuffer *ring) { - struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); - uint32_t size = gpu->rb->size / 4; - uint32_t wptr = get_wptr(gpu->rb); - uint32_t rptr = get_rptr(adreno_gpu); + struct adreno_gpu *adreno_gpu = 
to_adreno_gpu(ring->gpu); + uint32_t size = MSM_GPU_RINGBUFFER_SZ >> 2; + uint32_t wptr = get_wptr(ring); + uint32_t rptr = get_rptr(adreno_gpu, ring); return (rptr + (size - 1) - wptr) % size; } -void adreno_wait_ring(struct msm_gpu *gpu, uint32_t ndwords) +void adreno_wait_ring(struct msm_ringbuffer *ring, uint32_t ndwords) { - if (spin_until(ring_freewords(gpu) >= ndwords)) - DRM_ERROR("%s: timeout waiting for ringbuffer space\n", gpu->name); + if (spin_until(ring_freewords(ring) >= ndwords)) + DRM_ERROR("%s: timeout waiting for space in ringbuffer %d\n", + ring->gpu->name, ring->id); } static const char *iommu_ports[] = { @@ -465,7 +520,8 @@ static int adreno_of_parse(struct platform_device *pdev, struct msm_gpu *gpu) } int adreno_gpu_init(struct drm_device *drm, struct platform_device *pdev, - struct adreno_gpu *adreno_gpu, const struct adreno_gpu_funcs *funcs) + struct adreno_gpu *adreno_gpu, + const struct adreno_gpu_funcs *funcs, int nr_rings) { struct adreno_platform_config *config = pdev->dev.platform_data; struct msm_gpu *gpu = &adreno_gpu->base; @@ -483,7 +539,7 @@ int adreno_gpu_init(struct drm_device *drm, struct platform_device *pdev, ret = msm_gpu_init(drm, pdev, &adreno_gpu->base, &funcs->base, adreno_gpu->info->name, "kgsl_3d0_reg_memory", "kgsl_3d0_irq", - RB_SIZE); + nr_rings); if (ret) return ret; diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.h b/drivers/gpu/drm/msm/adreno/adreno_gpu.h index 86a993bcce32..9b4e1828bb6f 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.h +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.h @@ -83,12 +83,18 @@ struct adreno_info { const struct adreno_info *adreno_info(struct adreno_rev rev); -#define rbmemptr(adreno_gpu, member) \ +#define _sizeof(member) \ + sizeof(((struct adreno_rbmemptrs *) 0)->member[0]) + +#define _base(adreno_gpu, member) \ ((adreno_gpu)->memptrs_iova + offsetof(struct adreno_rbmemptrs, member)) +#define rbmemptr(adreno_gpu, index, member) \ + (_base((adreno_gpu), member) + ((index) * _sizeof(member))) + struct adreno_rbmemptrs { - volatile uint32_t rptr; - volatile uint32_t fence; + volatile uint32_t rptr[MSM_GPU_MAX_RINGS]; + volatile uint32_t fence[MSM_GPU_MAX_RINGS]; }; struct adreno_gpu { @@ -205,21 +211,25 @@ static inline int adreno_is_a540(struct adreno_gpu *gpu) int adreno_get_param(struct msm_gpu *gpu, uint32_t param, uint64_t *value); int adreno_hw_init(struct msm_gpu *gpu); -uint32_t adreno_last_fence(struct msm_gpu *gpu); +uint32_t adreno_last_fence(struct msm_gpu *gpu, struct msm_ringbuffer *ring); +uint32_t adreno_submitted_fence(struct msm_gpu *gpu, + struct msm_ringbuffer *ring); void adreno_recover(struct msm_gpu *gpu); int adreno_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, struct msm_file_private *ctx); -void adreno_flush(struct msm_gpu *gpu); -bool adreno_idle(struct msm_gpu *gpu); +void adreno_flush(struct msm_gpu *gpu, struct msm_ringbuffer *ring); +bool adreno_idle(struct msm_gpu *gpu, struct msm_ringbuffer *ring); #ifdef CONFIG_DEBUG_FS void adreno_show(struct msm_gpu *gpu, struct seq_file *m); #endif void adreno_dump_info(struct msm_gpu *gpu); void adreno_dump(struct msm_gpu *gpu); -void adreno_wait_ring(struct msm_gpu *gpu, uint32_t ndwords); +void adreno_wait_ring(struct msm_ringbuffer *ring, uint32_t ndwords); +struct msm_ringbuffer *adreno_active_ring(struct msm_gpu *gpu); int adreno_gpu_init(struct drm_device *drm, struct platform_device *pdev, - struct adreno_gpu *gpu, const struct adreno_gpu_funcs *funcs); + struct adreno_gpu *gpu, const struct adreno_gpu_funcs *funcs, 
+ int nr_rings); void adreno_gpu_cleanup(struct adreno_gpu *gpu); @@ -228,7 +238,7 @@ void adreno_gpu_cleanup(struct adreno_gpu *gpu); static inline void OUT_PKT0(struct msm_ringbuffer *ring, uint16_t regindx, uint16_t cnt) { - adreno_wait_ring(ring->gpu, cnt+1); + adreno_wait_ring(ring, cnt+1); OUT_RING(ring, CP_TYPE0_PKT | ((cnt-1) << 16) | (regindx & 0x7FFF)); } @@ -236,14 +246,14 @@ OUT_PKT0(struct msm_ringbuffer *ring, uint16_t regindx, uint16_t cnt) static inline void OUT_PKT2(struct msm_ringbuffer *ring) { - adreno_wait_ring(ring->gpu, 1); + adreno_wait_ring(ring, 1); OUT_RING(ring, CP_TYPE2_PKT); } static inline void OUT_PKT3(struct msm_ringbuffer *ring, uint8_t opcode, uint16_t cnt) { - adreno_wait_ring(ring->gpu, cnt+1); + adreno_wait_ring(ring, cnt+1); OUT_RING(ring, CP_TYPE3_PKT | ((cnt-1) << 16) | ((opcode & 0xFF) << 8)); } @@ -265,14 +275,14 @@ static inline u32 PM4_PARITY(u32 val) static inline void OUT_PKT4(struct msm_ringbuffer *ring, uint16_t regindx, uint16_t cnt) { - adreno_wait_ring(ring->gpu, cnt + 1); + adreno_wait_ring(ring, cnt + 1); OUT_RING(ring, PKT4(regindx, cnt)); } static inline void OUT_PKT7(struct msm_ringbuffer *ring, uint8_t opcode, uint16_t cnt) { - adreno_wait_ring(ring->gpu, cnt + 1); + adreno_wait_ring(ring, cnt + 1); OUT_RING(ring, CP_TYPE7_PKT | (cnt << 0) | (PM4_PARITY(cnt) << 15) | ((opcode & 0x7F) << 16) | (PM4_PARITY(opcode) << 23)); } diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c index 9163b90981a2..940674c8d759 100644 --- a/drivers/gpu/drm/msm/msm_drv.c +++ b/drivers/gpu/drm/msm/msm_drv.c @@ -939,14 +939,23 @@ int msm_wait_fence(struct drm_device *dev, uint32_t fence, ktime_t *timeout , bool interruptible) { struct msm_drm_private *priv = dev->dev_private; + struct msm_gpu *gpu = priv->gpu; + int index = FENCE_RING(fence); + uint32_t submitted; int ret; - if (!priv->gpu) + if (!gpu) return -ENXIO; - if (fence > priv->gpu->submitted_fence) { + if (index > MSM_GPU_MAX_RINGS || index >= gpu->nr_rings || + !gpu->rb[index]) + return -EINVAL; + + submitted = gpu->funcs->submitted_fence(gpu, gpu->rb[index]); + + if (fence > submitted) { DRM_ERROR("waiting on invalid fence: %u (of %u)\n", - fence, priv->gpu->submitted_fence); + fence, submitted); return -EINVAL; } @@ -976,7 +985,7 @@ int msm_wait_fence(struct drm_device *dev, uint32_t fence, if (ret == 0) { DBG("timeout waiting for fence: %u (completed: %u)", - fence, priv->completed_fence); + fence, priv->completed_fence[index]); ret = -ETIMEDOUT; } else if (ret != -ERESTARTSYS) { ret = 0; @@ -990,12 +999,13 @@ int msm_queue_fence_cb(struct drm_device *dev, struct msm_fence_cb *cb, uint32_t fence) { struct msm_drm_private *priv = dev->dev_private; + int index = FENCE_RING(fence); int ret = 0; mutex_lock(&dev->struct_mutex); if (!list_empty(&cb->work.entry)) { ret = -EINVAL; - } else if (fence > priv->completed_fence) { + } else if (fence > priv->completed_fence[index]) { cb->fence = fence; list_add_tail(&cb->work.entry, &priv->fence_cbs); } else { @@ -1010,21 +1020,21 @@ int msm_queue_fence_cb(struct drm_device *dev, void msm_update_fence(struct drm_device *dev, uint32_t fence) { struct msm_drm_private *priv = dev->dev_private; + struct msm_fence_cb *cb, *tmp; + int index = FENCE_RING(fence); - mutex_lock(&dev->struct_mutex); - priv->completed_fence = max(fence, priv->completed_fence); - - while (!list_empty(&priv->fence_cbs)) { - struct msm_fence_cb *cb; - - cb = list_first_entry(&priv->fence_cbs, - struct msm_fence_cb, work.entry); + if (index >= MSM_GPU_MAX_RINGS) + 
return; - if (cb->fence > priv->completed_fence) - break; + mutex_lock(&dev->struct_mutex); priv->completed_fence[index] = max(fence, priv->completed_fence[index]); - list_del_init(&cb->work.entry); - queue_work(priv->wq, &cb->work); + list_for_each_entry_safe(cb, tmp, &priv->fence_cbs, work.entry) { + if (COMPARE_FENCE_LTE(cb->fence, + priv->completed_fence[index])) { + list_del_init(&cb->work.entry); + queue_work(priv->wq, &cb->work); + } } mutex_unlock(&dev->struct_mutex); diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h index 406b40f0f8de..c058ec56aafc 100644 --- a/drivers/gpu/drm/msm/msm_drv.h +++ b/drivers/gpu/drm/msm/msm_drv.h @@ -250,6 +250,8 @@ struct msm_drm_commit { struct kthread_worker worker; }; +#define MSM_GPU_MAX_RINGS 1 + struct msm_drm_private { struct msm_kms *kms; @@ -280,7 +282,9 @@ struct msm_drm_private { struct drm_fb_helper *fbdev; - uint32_t next_fence, completed_fence; + uint32_t next_fence[MSM_GPU_MAX_RINGS]; + uint32_t completed_fence[MSM_GPU_MAX_RINGS]; + wait_queue_head_t fence_event; struct msm_rd_state *rd; @@ -351,6 +355,31 @@ struct msm_format { uint32_t pixel_format; }; +/* + * Some GPU targets can support multiple ringbuffers and preempt between them. + * In order to do this without massive API changes we will steal two bits from + * the top of the fence and use them to identify the ringbuffer (0x00000001 for + * ring 0, 0x40000001 for ring 1, 0x80000001 for ring 2, etc). If you are going + * to do a fence comparison you have to make sure you are only comparing + * against fences from the same ring, but since fences within a ringbuffer are + * still contiguous you can still use straight comparisons (i.e. 0x40000001 is + * older than 0x40000002). Mathematically there will be 0x3FFFFFFF timestamps + * per ring or ~103 days at 120 interrupts per second (two interrupts per frame + * at 60 FPS). + */ +#define FENCE_RING(_fence) ((_fence >> 30) & 3) +#define FENCE(_ring, _fence) ((((_ring) & 3) << 30) | ((_fence) & 0x3FFFFFFF)) + +static inline bool COMPARE_FENCE_LTE(uint32_t a, uint32_t b) +{ + return ((FENCE_RING(a) == FENCE_RING(b)) && a <= b); +} + +static inline bool COMPARE_FENCE_LT(uint32_t a, uint32_t b) +{ + return ((FENCE_RING(a) == FENCE_RING(b)) && a < b); +} + /* callback from wq once fence has passed: */ struct msm_fence_cb { struct work_struct work; @@ -529,7 +558,8 @@ u32 msm_readl(const void __iomem *addr); static inline bool fence_completed(struct drm_device *dev, uint32_t fence) { struct msm_drm_private *priv = dev->dev_private; - return priv->completed_fence >= fence; + + return priv->completed_fence[FENCE_RING(fence)] >= fence; } static inline int align_pitch(int width, int bpp) diff --git a/drivers/gpu/drm/msm/msm_gem.h b/drivers/gpu/drm/msm/msm_gem.h index d9c3977434d9..1bd5b21f0763 100644 --- a/drivers/gpu/drm/msm/msm_gem.h +++ b/drivers/gpu/drm/msm/msm_gem.h @@ -121,6 +121,7 @@ struct msm_gem_submit { struct list_head bo_list; struct ww_acquire_ctx ticket; uint32_t fence; + int ring; bool valid; unsigned int nr_cmds; unsigned int nr_bos; diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index f7b5e30b41eb..7c83a9acd3e6 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -349,7 +349,7 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, /* for now, we just have 3d pipe.. 
eventually this would need to * be more clever to dispatch to appropriate gpu module: */ - if (args->pipe != MSM_PIPE_3D0) + if (MSM_PIPE_ID(args->flags) != MSM_PIPE_3D0) return -EINVAL; gpu = priv->gpu; @@ -435,6 +435,11 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, submit->nr_cmds = i; + /* Clamp the user submitted ring to the range of available rings */ + submit->ring = clamp_t(uint32_t, + (args->flags & MSM_SUBMIT_RING_MASK) >> MSM_SUBMIT_RING_SHIFT, + 0, gpu->nr_rings - 1); + ret = msm_gpu_submit(gpu, submit, ctx); args->fence = submit->fence; diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c index 08ecc089611f..57d518300425 100644 --- a/drivers/gpu/drm/msm/msm_gpu.c +++ b/drivers/gpu/drm/msm/msm_gpu.c @@ -273,15 +273,33 @@ static void recover_worker(struct work_struct *work) mutex_lock(&dev->struct_mutex); if (msm_gpu_active(gpu)) { struct msm_gem_submit *submit; - uint32_t fence = gpu->funcs->last_fence(gpu); - - /* retire completed submits, plus the one that hung: */ - retire_submits(gpu, fence + 1); + struct msm_ringbuffer *ring; + int i; inactive_cancel(gpu); + + FOR_EACH_RING(gpu, ring, i) { + uint32_t fence; + + if (!ring) + continue; + + fence = gpu->funcs->last_fence(gpu, ring); + + /* + * Retire the faulting command on the active ring and + * make sure the other rings are cleaned up + */ + if (ring == gpu->funcs->active_ring(gpu)) + retire_submits(gpu, fence + 1); + else + retire_submits(gpu, fence); + } + + /* Recover the GPU */ gpu->funcs->recover(gpu); - /* replay the remaining submits after the one that hung: */ + /* replay the remaining submits for all rings: */ list_for_each_entry(submit, &gpu->submit_list, node) { gpu->funcs->submit(gpu, submit, NULL); } @@ -303,25 +321,28 @@ static void hangcheck_handler(unsigned long data) struct msm_gpu *gpu = (struct msm_gpu *)data; struct drm_device *dev = gpu->dev; struct msm_drm_private *priv = dev->dev_private; - uint32_t fence = gpu->funcs->last_fence(gpu); + struct msm_ringbuffer *ring = gpu->funcs->active_ring(gpu); + uint32_t fence = gpu->funcs->last_fence(gpu, ring); + uint32_t submitted = gpu->funcs->submitted_fence(gpu, ring); - if (fence != gpu->hangcheck_fence) { + if (fence != gpu->hangcheck_fence[ring->id]) { /* some progress has been made.. ya! */ - gpu->hangcheck_fence = fence; - } else if (fence < gpu->submitted_fence) { + gpu->hangcheck_fence[ring->id] = fence; + } else if (fence < submitted) { /* no progress and not done.. hung! 
*/ - gpu->hangcheck_fence = fence; - dev_err(dev->dev, "%s: hangcheck detected gpu lockup!\n", - gpu->name); + gpu->hangcheck_fence[ring->id] = fence; + dev_err(dev->dev, "%s: hangcheck detected gpu lockup rb %d!\n", + gpu->name, ring->id); dev_err(dev->dev, "%s: completed fence: %u\n", gpu->name, fence); dev_err(dev->dev, "%s: submitted fence: %u\n", - gpu->name, gpu->submitted_fence); + gpu->name, submitted); + queue_work(priv->wq, &gpu->recover_work); } /* if still more pending work, reset the hangcheck timer: */ - if (gpu->submitted_fence > gpu->hangcheck_fence) + if (submitted > gpu->hangcheck_fence[ring->id]) hangcheck_timer_reset(gpu); /* workaround for missing irq: */ @@ -430,54 +451,66 @@ out: static void retire_submits(struct msm_gpu *gpu, uint32_t fence) { struct drm_device *dev = gpu->dev; + struct msm_gem_submit *submit, *tmp; WARN_ON(!mutex_is_locked(&dev->struct_mutex)); - while (!list_empty(&gpu->submit_list)) { - struct msm_gem_submit *submit; - - submit = list_first_entry(&gpu->submit_list, - struct msm_gem_submit, node); + /* + * Find and retire all the submits in the same ring that are older than + * or equal to 'fence' + */ - if (submit->fence <= fence) { + list_for_each_entry_safe(submit, tmp, &gpu->submit_list, node) { + if (COMPARE_FENCE_LTE(submit->fence, fence)) { list_del(&submit->node); kfree(submit); - } else { - break; } } } -static void retire_worker(struct work_struct *work) +static bool _fence_signaled(struct msm_gem_object *obj, uint32_t fence) { - struct msm_gpu *gpu = container_of(work, struct msm_gpu, retire_work); - struct drm_device *dev = gpu->dev; - uint32_t fence = gpu->funcs->last_fence(gpu); + if (obj->write_fence & 0x3FFFFFFF) + return COMPARE_FENCE_LTE(obj->write_fence, fence); - msm_update_fence(gpu->dev, fence); + return COMPARE_FENCE_LTE(obj->read_fence, fence); +} - mutex_lock(&dev->struct_mutex); +static void _retire_ring(struct msm_gpu *gpu, uint32_t fence) +{ + struct msm_gem_object *obj, *tmp; retire_submits(gpu, fence); - while (!list_empty(&gpu->active_list)) { - struct msm_gem_object *obj; - - obj = list_first_entry(&gpu->active_list, - struct msm_gem_object, mm_list); - - if ((obj->read_fence <= fence) && - (obj->write_fence <= fence)) { - /* move to inactive: */ + list_for_each_entry_safe(obj, tmp, &gpu->active_list, mm_list) { + if (_fence_signaled(obj, fence)) { msm_gem_move_to_inactive(&obj->base); msm_gem_put_iova(&obj->base, gpu->aspace); drm_gem_object_unreference(&obj->base); - } else { - break; } } +} - mutex_unlock(&dev->struct_mutex); +static void retire_worker(struct work_struct *work) +{ + struct msm_gpu *gpu = container_of(work, struct msm_gpu, retire_work); + struct drm_device *dev = gpu->dev; + struct msm_ringbuffer *ring; + int i; + + FOR_EACH_RING(gpu, ring, i) { + uint32_t fence; + + if (!ring) + continue; + + fence = gpu->funcs->last_fence(gpu, ring); + msm_update_fence(gpu->dev, fence); + + mutex_lock(&dev->struct_mutex); + _retire_ring(gpu, fence); + mutex_unlock(&dev->struct_mutex); + } if (!msm_gpu_active(gpu)) inactive_start(gpu); @@ -497,13 +530,12 @@ int msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, { struct drm_device *dev = gpu->dev; struct msm_drm_private *priv = dev->dev_private; + struct msm_ringbuffer *ring = gpu->rb[submit->ring]; int i, ret; WARN_ON(!mutex_is_locked(&dev->struct_mutex)); - submit->fence = ++priv->next_fence; - - gpu->submitted_fence = submit->fence; + submit->fence = FENCE(submit->ring, ++priv->next_fence[submit->ring]); inactive_cancel(gpu); @@ -511,7 +543,7 
@@ int msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, msm_rd_dump_submit(submit); - gpu->submitted_fence = submit->fence; + ring->submitted_fence = submit->fence; update_sw_cntrs(gpu); @@ -564,7 +596,8 @@ static const char *clk_names[] = { int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev, struct msm_gpu *gpu, const struct msm_gpu_funcs *funcs, - const char *name, const char *ioname, const char *irqname, int ringsz) + const char *name, const char *ioname, const char *irqname, + int nr_rings) { struct iommu_domain *iommu; int i, ret; @@ -666,17 +699,28 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev, dev_info(drm->dev, "%s: no IOMMU, fallback to VRAM carveout!\n", name); } - /* Create ringbuffer: */ - mutex_lock(&drm->struct_mutex); - gpu->rb = msm_ringbuffer_new(gpu, ringsz); - mutex_unlock(&drm->struct_mutex); - if (IS_ERR(gpu->rb)) { - ret = PTR_ERR(gpu->rb); - gpu->rb = NULL; - dev_err(drm->dev, "could not create ringbuffer: %d\n", ret); - goto fail; + if (nr_rings > ARRAY_SIZE(gpu->rb)) { + WARN(1, "Only creating %lu ringbuffers\n", ARRAY_SIZE(gpu->rb)); + nr_rings = ARRAY_SIZE(gpu->rb); } + /* Create ringbuffer(s): */ + for (i = 0; i < nr_rings; i++) { + mutex_lock(&drm->struct_mutex); + gpu->rb[i] = msm_ringbuffer_new(gpu, i); + mutex_unlock(&drm->struct_mutex); + + if (IS_ERR(gpu->rb[i])) { + ret = PTR_ERR(gpu->rb[i]); + gpu->rb[i] = NULL; + dev_err(drm->dev, + "could not create ringbuffer %d: %d\n", i, ret); + goto fail; + } + } + + gpu->nr_rings = nr_rings; + #ifdef CONFIG_SMP gpu->pm_qos_req_dma.type = PM_QOS_REQ_AFFINE_IRQ; gpu->pm_qos_req_dma.irq = gpu->irq; @@ -690,20 +734,31 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev, return 0; fail: + for (i = 0; i < ARRAY_SIZE(gpu->rb); i++) { + if (gpu->rb[i]) + msm_ringbuffer_destroy(gpu->rb[i]); + } + return ret; } void msm_gpu_cleanup(struct msm_gpu *gpu) { + int i; + DBG("%s", gpu->name); WARN_ON(!list_empty(&gpu->active_list)); bs_fini(gpu); - if (gpu->rb) { - if (gpu->rb_iova) - msm_gem_put_iova(gpu->rb->bo, gpu->aspace); - msm_ringbuffer_destroy(gpu->rb); + for (i = 0; i < ARRAY_SIZE(gpu->rb); i++) { + if (!gpu->rb[i]) + continue; + + if (gpu->rb[i]->iova) + msm_gem_put_iova(gpu->rb[i]->bo, gpu->aspace); + + msm_ringbuffer_destroy(gpu->rb[i]); } } diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h index c6f1d3bd36e9..bb2d7d6727fb 100644 --- a/drivers/gpu/drm/msm/msm_gpu.h +++ b/drivers/gpu/drm/msm/msm_gpu.h @@ -49,10 +49,13 @@ struct msm_gpu_funcs { int (*pm_resume)(struct msm_gpu *gpu); int (*submit)(struct msm_gpu *gpu, struct msm_gem_submit *submit, struct msm_file_private *ctx); - void (*flush)(struct msm_gpu *gpu); - bool (*idle)(struct msm_gpu *gpu); + void (*flush)(struct msm_gpu *gpu, struct msm_ringbuffer *ring); irqreturn_t (*irq)(struct msm_gpu *irq); - uint32_t (*last_fence)(struct msm_gpu *gpu); + uint32_t (*last_fence)(struct msm_gpu *gpu, + struct msm_ringbuffer *ring); + uint32_t (*submitted_fence)(struct msm_gpu *gpu, + struct msm_ringbuffer *ring); + struct msm_ringbuffer *(*active_ring)(struct msm_gpu *gpu); void (*recover)(struct msm_gpu *gpu); void (*destroy)(struct msm_gpu *gpu); #ifdef CONFIG_DEBUG_FS @@ -78,14 +81,12 @@ struct msm_gpu { const struct msm_gpu_perfcntr *perfcntrs; uint32_t num_perfcntrs; - struct msm_ringbuffer *rb; - uint64_t rb_iova; + struct msm_ringbuffer *rb[MSM_GPU_MAX_RINGS]; + int nr_rings; /* list of GEM active objects: */ struct list_head active_list; - uint32_t 
submitted_fence; - /* is gpu powered/active? */ int active_cnt; bool inactive; @@ -123,15 +124,37 @@ struct msm_gpu { #define DRM_MSM_HANGCHECK_PERIOD 500 /* in ms */ #define DRM_MSM_HANGCHECK_JIFFIES msecs_to_jiffies(DRM_MSM_HANGCHECK_PERIOD) struct timer_list hangcheck_timer; - uint32_t hangcheck_fence; + uint32_t hangcheck_fence[MSM_GPU_MAX_RINGS]; struct work_struct recover_work; struct list_head submit_list; }; +/* It turns out that all targets use the same ringbuffer size. */ +#define MSM_GPU_RINGBUFFER_SZ SZ_32K + +static inline struct msm_ringbuffer *__get_ring(struct msm_gpu *gpu, int index) +{ + return (index < ARRAY_SIZE(gpu->rb) ? gpu->rb[index] : NULL); +} + +#define FOR_EACH_RING(gpu, ring, index) \ + for (index = 0, ring = (gpu)->rb[0]; \ + index < (gpu)->nr_rings && index < ARRAY_SIZE((gpu)->rb); \ + index++, ring = __get_ring(gpu, index)) + static inline bool msm_gpu_active(struct msm_gpu *gpu) { - return gpu->submitted_fence > gpu->funcs->last_fence(gpu); + struct msm_ringbuffer *ring; + int i; + + FOR_EACH_RING(gpu, ring, i) { + if (gpu->funcs->submitted_fence(gpu, ring) > + gpu->funcs->last_fence(gpu, ring)) + return true; + } + + return false; } /* Perf-Counters: @@ -210,7 +233,9 @@ int msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev, struct msm_gpu *gpu, const struct msm_gpu_funcs *funcs, - const char *name, const char *ioname, const char *irqname, int ringsz); + const char *name, const char *ioname, const char *irqname, + int nr_rings); + void msm_gpu_cleanup(struct msm_gpu *gpu); struct msm_gpu *adreno_load_gpu(struct drm_device *dev); diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.c b/drivers/gpu/drm/msm/msm_ringbuffer.c index 1f14b908b221..19ac38b2eba2 100644 --- a/drivers/gpu/drm/msm/msm_ringbuffer.c +++ b/drivers/gpu/drm/msm/msm_ringbuffer.c @@ -18,12 +18,13 @@ #include "msm_ringbuffer.h" #include "msm_gpu.h" -struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int size) +struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int id) { struct msm_ringbuffer *ring; int ret; - size = ALIGN(size, 4); /* size should be dword aligned */ + /* We assume everywhere that MSM_GPU_RINGBUFFER_SZ is a power of 2 */ + BUILD_BUG_ON(!is_power_of_2(MSM_GPU_RINGBUFFER_SZ)); ring = kzalloc(sizeof(*ring), GFP_KERNEL); if (!ring) { @@ -32,7 +33,8 @@ struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int size) } ring->gpu = gpu; - ring->bo = msm_gem_new(gpu->dev, size, MSM_BO_WC); + ring->id = id; + ring->bo = msm_gem_new(gpu->dev, MSM_GPU_RINGBUFFER_SZ, MSM_BO_WC); if (IS_ERR(ring->bo)) { ret = PTR_ERR(ring->bo); ring->bo = NULL; @@ -40,11 +42,9 @@ struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int size) } ring->start = msm_gem_vaddr_locked(ring->bo); - ring->end = ring->start + (size / 4); + ring->end = ring->start + (MSM_GPU_RINGBUFFER_SZ >> 2); ring->cur = ring->start; - ring->size = size; - return ring; fail: diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.h b/drivers/gpu/drm/msm/msm_ringbuffer.h index 6e0e1049fa4f..e40d130853e0 100644 --- a/drivers/gpu/drm/msm/msm_ringbuffer.h +++ b/drivers/gpu/drm/msm/msm_ringbuffer.h @@ -22,12 +22,14 @@ struct msm_ringbuffer { struct msm_gpu *gpu; - int size; + int id; struct drm_gem_object *bo; uint32_t *start, *end, *cur; + uint64_t iova; + uint32_t submitted_fence; }; -struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int size); +struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, 
int id); void msm_ringbuffer_destroy(struct msm_ringbuffer *ring); /* ringbuffer helpers (the parts that are same for a3xx/a2xx/z180..) */ diff --git a/include/uapi/drm/msm_drm.h b/include/uapi/drm/msm_drm.h index 20ef9bc424f3..be4a18c0712c 100644 --- a/include/uapi/drm/msm_drm.h +++ b/include/uapi/drm/msm_drm.h @@ -40,6 +40,15 @@ #define MSM_PIPE_2D1 0x02 #define MSM_PIPE_3D0 0x10 +/* The pipe-id just uses the lower bits, so can be OR'd with flags in + * the upper 16 bits (which could be extended further, if needed, maybe + * we extend/overload the pipe-id some day to deal with multiple rings, + * but even then I don't think we need the full lower 16 bits). + */ +#define MSM_PIPE_ID_MASK 0xffff +#define MSM_PIPE_ID(x) ((x) & MSM_PIPE_ID_MASK) +#define MSM_PIPE_FLAGS(x) ((x) & ~MSM_PIPE_ID_MASK) + /* timeouts are specified in clock-monotonic absolute times (to simplify * restarting interrupted ioctls). The following struct is logically the * same as 'struct timespec' but 32/64b ABI safe. @@ -178,12 +187,18 @@ struct drm_msm_gem_submit_bo { __u64 presumed; /* in/out, presumed buffer address */ }; +/* Valid submit ioctl flags: */ +#define MSM_SUBMIT_RING_MASK 0x000F0000 +#define MSM_SUBMIT_RING_SHIFT 16 + +#define MSM_SUBMIT_FLAGS (MSM_SUBMIT_RING_MASK) + /* Each cmdstream submit consists of a table of buffers involved, and * one or more cmdstream buffers. This allows for conditional execution * (context-restore), and IB buffers needed for per tile/bin draw cmds. */ struct drm_msm_gem_submit { - __u32 pipe; /* in, MSM_PIPE_x */ + __u32 flags; /* MSM_PIPE_x | MSM_SUBMIT_x */ __u32 fence; /* out */ __u32 nr_bos; /* in, number of submit_bo's */ __u32 nr_cmds; /* in, number of submit_cmd's */ -- cgit v1.2.3 From 64e3375be29e5634337724a7a51ee019cb2de146 Mon Sep 17 00:00:00 2001 From: Jordan Crouse Date: Mon, 13 Feb 2017 10:14:24 -0700 Subject: drm/msm: Shadow current pointer in the ring until command is complete Add a shadow pointer to track the current command being written into the ring. Don't commit it as 'cur' until the command is submitted. Because 'cur' is used to construct the software copy of the wptr, this ensures that somebody peeking in on the ring doesn't assume that a command is in flight while it is being written. This isn't a huge deal with a single ring (though technically the hangcheck could prematurely assume the system is busy when it isn't) but it will be rather important for preemption, where the decision to preempt is based on a non-empty ringbuffer. Without a shadow an aggressive preemption scheme could assume that the ringbuffer is non-empty and switch to it before the CPU is done writing the command, and boom. Even though preemption won't be supported for all targets, because of the way the code is organized it is simpler to make this generic for all targets. The extra load for non-preemption targets should be minimal. 
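(Illustrative sketch, not part of the patch: with the shadow in place, an observer derives the wptr from 'cur', so dwords still being written through 'next' stay invisible until the flush commits them. The struct fields and get_rptr() follow the patch; the helper itself is hypothetical.)

    /* Hypothetical emptiness test a preempt or hangcheck path could use */
    static bool ring_has_work(struct adreno_gpu *adreno_gpu,
                    struct msm_ringbuffer *ring)
    {
            /* wptr is derived from 'cur', which only moves at flush time */
            uint32_t wptr = (ring->cur - ring->start) %
                    (MSM_GPU_RINGBUFFER_SZ >> 2);

            /* only commands committed by a flush are counted as pending */
            return get_rptr(adreno_gpu, ring) != wptr;
    }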
Change-Id: Ic0dedbad83247c3e77de6f4f24bbb97db10e5edd Signed-off-by: Jordan Crouse --- drivers/gpu/drm/msm/adreno/adreno_gpu.c | 9 +++++++-- drivers/gpu/drm/msm/msm_ringbuffer.c | 1 + drivers/gpu/drm/msm/msm_ringbuffer.h | 12 ++++++++---- 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c index 8506ca2e9b8a..065b0f401a17 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c @@ -153,6 +153,7 @@ void adreno_recover(struct msm_gpu *gpu) continue; ring->cur = ring->start; + ring->next = ring->start; /* reset completed fence seqno, discard anything pending: */ adreno_gpu->memptrs->fence[ring->id] = @@ -259,12 +260,15 @@ void adreno_flush(struct msm_gpu *gpu, struct msm_ringbuffer *ring) struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); uint32_t wptr; + /* Copy the shadow to the actual register */ + ring->cur = ring->next; + /* * Mask the wptr value that we calculate to fit in the HW range. This is * to account for the possibility that the last command fit exactly into * the ringbuffer and rb->next hasn't wrapped to zero yet */ - wptr = get_wptr(ring) % (MSM_GPU_RINGBUFFER_SZ >> 2); + wptr = (ring->cur - ring->start) % (MSM_GPU_RINGBUFFER_SZ >> 2); /* ensure writes to ringbuffer have hit system memory: */ mb(); @@ -390,7 +394,8 @@ static uint32_t ring_freewords(struct msm_ringbuffer *ring) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(ring->gpu); uint32_t size = MSM_GPU_RINGBUFFER_SZ >> 2; - uint32_t wptr = get_wptr(ring); + /* Use ring->next to calculate free size */ + uint32_t wptr = ring->next - ring->start; uint32_t rptr = get_rptr(adreno_gpu, ring); return (rptr + (size - 1) - wptr) % size; } diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.c b/drivers/gpu/drm/msm/msm_ringbuffer.c index 19ac38b2eba2..2dd6bc6c725c 100644 --- a/drivers/gpu/drm/msm/msm_ringbuffer.c +++ b/drivers/gpu/drm/msm/msm_ringbuffer.c @@ -43,6 +43,7 @@ struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int id) ring->start = msm_gem_vaddr_locked(ring->bo); ring->end = ring->start + (MSM_GPU_RINGBUFFER_SZ >> 2); + ring->next = ring->start; ring->cur = ring->start; return ring; diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.h b/drivers/gpu/drm/msm/msm_ringbuffer.h index e40d130853e0..4e0b302501fa 100644 --- a/drivers/gpu/drm/msm/msm_ringbuffer.h +++ b/drivers/gpu/drm/msm/msm_ringbuffer.h @@ -24,7 +24,7 @@ struct msm_ringbuffer { struct msm_gpu *gpu; int id; struct drm_gem_object *bo; - uint32_t *start, *end, *cur; + uint32_t *start, *end, *cur, *next; uint64_t iova; uint32_t submitted_fence; }; @@ -37,9 +37,13 @@ void msm_ringbuffer_destroy(struct msm_ringbuffer *ring); static inline void OUT_RING(struct msm_ringbuffer *ring, uint32_t data) { - if (ring->cur == ring->end) - ring->cur = ring->start; - *(ring->cur++) = data; + /* + * ring->next points to the current command being written - it won't be + * committed as ring->cur until the flush + */ + if (ring->next == ring->end) + ring->next = ring->start; + *(ring->next++) = data; } #endif /* __MSM_RINGBUFFER_H__ */ -- cgit v1.2.3 From 76eb0ae231831f40c631959b98c786facde0d266 Mon Sep 17 00:00:00 2001 From: Jordan Crouse Date: Mon, 13 Feb 2017 10:14:25 -0700 Subject: drm/msm: Make the value of RB_CNTL (almost) generic We use a global ringbuffer size and block size for all targets, and at least for 5XX preemption we need to know the value of RB_CNTL in several locations, so it makes sense to calculate it once and use it 
everywhere. The only monkey wrench is that we need to disable the RPTR shadow for A430 targets but that only needs to be done once and doesn't affect A5XX so we can OR in the value at init time. Change-Id: Ic0dedbadca31e835f014037ea3f9741048df3b98 Signed-off-by: Jordan Crouse --- drivers/gpu/drm/msm/adreno/adreno_gpu.c | 12 +++++++----- drivers/gpu/drm/msm/msm_gpu.h | 5 +++++ 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c index 065b0f401a17..c0ed98358b2c 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c @@ -21,7 +21,6 @@ #include "msm_gem.h" #include "msm_mmu.h" -#define RB_BLKSIZE 32 int adreno_get_param(struct msm_gpu *gpu, uint32_t param, uint64_t *value) { @@ -74,11 +73,14 @@ int adreno_hw_init(struct msm_gpu *gpu) } } - /* Setup REG_CP_RB_CNTL: */ + /* + * Setup REG_CP_RB_CNTL. The same value is used across targets (with + * the exception of A430 that disables the RPTR shadow) - the calculation + * for the ringbuffer size and block size is moved to msm_gpu.h for the + * pre-processor to deal with and the A430 variant is ORed in here + */ adreno_gpu_write(adreno_gpu, REG_ADRENO_CP_RB_CNTL, - /* size is log2(quad-words): */ - AXXX_CP_RB_CNTL_BUFSZ(ilog2(MSM_GPU_RINGBUFFER_SZ / 8)) | - AXXX_CP_RB_CNTL_BLKSZ(ilog2(RB_BLKSIZE / 8)) | + MSM_GPU_RB_CNTL_DEFAULT | (adreno_is_a430(adreno_gpu) ? AXXX_CP_RB_CNTL_NO_UPDATE : 0)); /* Setup ringbuffer address - use ringbuffer[0] for GPU init */ diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h index bb2d7d6727fb..48303d3d0e44 100644 --- a/drivers/gpu/drm/msm/msm_gpu.h +++ b/drivers/gpu/drm/msm/msm_gpu.h @@ -132,6 +132,11 @@ struct msm_gpu { /* It turns out that all targets use the same ringbuffer size. */ #define MSM_GPU_RINGBUFFER_SZ SZ_32K +#define MSM_GPU_RINGBUFFER_BLKSIZE 32 + +#define MSM_GPU_RB_CNTL_DEFAULT \ + (AXXX_CP_RB_CNTL_BUFSZ(ilog2(MSM_GPU_RINGBUFFER_SZ / 8)) | \ + AXXX_CP_RB_CNTL_BLKSZ(ilog2(MSM_GPU_RINGBUFFER_BLKSIZE / 8))) static inline struct msm_ringbuffer *__get_ring(struct msm_gpu *gpu, int index) { -- cgit v1.2.3 From 425372c0ba8e332995e05be0ba6ec513192512d2 Mon Sep 17 00:00:00 2001 From: Jordan Crouse Date: Mon, 13 Feb 2017 10:14:26 -0700 Subject: drm/msm: Set IOMMU map attributes Remove the IOMMU_WRITE bit from buffer objects that are marked MSM_BO_GPU_READONLY. Add a new flag (MSM_BO_PRIVILEGED) to pass through IOMMU_PRIV for those IOMMU targets that support it. 
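(Illustrative sketch, not part of the patch: with the flags plumbed through to the map path, a kernel-side allocation can request a mapping the GPU may read but never write, and that is marked privileged, purely via BO flags. The buffer name and size below are made up; msm_gem_new() is used as shown elsewhere in this series.)

    /* Hypothetical allocation of a GPU read-only, privileged buffer -
     * the map path in the diff below drops IOMMU_WRITE and adds
     * IOMMU_PRIV for it
     */
    struct drm_gem_object *fw_bo = msm_gem_new(dev, fw_size,
            MSM_BO_WC | MSM_BO_GPU_READONLY | MSM_BO_PRIVILEGED);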
Change-Id: Ic0dedbad8d9d3f461a47ea093fad3fdd90f46535 Signed-off-by: Jordan Crouse --- drivers/gpu/drm/msm/msm_drv.h | 3 ++- drivers/gpu/drm/msm/msm_gem.c | 6 +++++- drivers/gpu/drm/msm/msm_gem.h | 2 +- drivers/gpu/drm/msm/msm_gem_vma.c | 23 +++++++++++++++-------- include/uapi/drm/msm_drm.h | 1 + 5 files changed, 24 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h index c058ec56aafc..7e229aeb8a8b 100644 --- a/drivers/gpu/drm/msm/msm_drv.h +++ b/drivers/gpu/drm/msm/msm_drv.h @@ -408,7 +408,8 @@ void msm_gem_unmap_vma(struct msm_gem_address_space *aspace, void *priv); int msm_gem_map_vma(struct msm_gem_address_space *aspace, struct msm_gem_vma *vma, struct sg_table *sgt, - void *priv); + void *priv, unsigned int flags); + void msm_gem_address_space_destroy(struct msm_gem_address_space *aspace); /* For GPU and legacy display */ diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index f42c7bff18fb..63128d11767e 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -330,6 +330,10 @@ static struct msm_gem_vma *obj_get_domain(struct drm_gem_object *obj, return NULL; } +#ifndef IOMMU_PRIV +#define IOMMU_PRIV 0 +#endif + /* should be called under struct_mutex.. although it can be called * from atomic context without struct_mutex to acquire an extra * iova ref if you know one is already held. @@ -369,7 +373,7 @@ int msm_gem_get_iova_locked(struct drm_gem_object *obj, } ret = msm_gem_map_vma(aspace, domain, msm_obj->sgt, - get_dmabuf_ptr(obj)); + get_dmabuf_ptr(obj), msm_obj->flags); } if (!ret) diff --git a/drivers/gpu/drm/msm/msm_gem.h b/drivers/gpu/drm/msm/msm_gem.h index 1bd5b21f0763..8d9976ce8feb 100644 --- a/drivers/gpu/drm/msm/msm_gem.h +++ b/drivers/gpu/drm/msm/msm_gem.h @@ -26,7 +26,7 @@ struct msm_gem_aspace_ops { int (*map)(struct msm_gem_address_space *, struct msm_gem_vma *, - struct sg_table *sgt, void *priv); + struct sg_table *sgt, void *priv, unsigned int flags); void (*unmap)(struct msm_gem_address_space *, struct msm_gem_vma *, struct sg_table *sgt, void *priv); diff --git a/drivers/gpu/drm/msm/msm_gem_vma.c b/drivers/gpu/drm/msm/msm_gem_vma.c index 53e70263e03c..8ad96e9e2371 100644 --- a/drivers/gpu/drm/msm/msm_gem_vma.c +++ b/drivers/gpu/drm/msm/msm_gem_vma.c @@ -39,7 +39,7 @@ static void smmu_aspace_unmap_vma(struct msm_gem_address_space *aspace, static int smmu_aspace_map_vma(struct msm_gem_address_space *aspace, struct msm_gem_vma *vma, struct sg_table *sgt, - void *priv) + void *priv, unsigned int flags) { struct dma_buf *buf = priv; int ret; @@ -107,13 +107,20 @@ static void iommu_aspace_unmap_vma(struct msm_gem_address_space *aspace, } static int iommu_aspace_map_vma(struct msm_gem_address_space *aspace, - struct msm_gem_vma *vma, struct sg_table *sgt, - void *priv) + struct msm_gem_vma *vma, struct sg_table *sgt, void *priv, + unsigned int flags) { struct msm_iommu_aspace *local = to_iommu_aspace(aspace); size_t size = 0; struct scatterlist *sg; - int ret = 0, i; + int ret, i; + int iommu_flags = IOMMU_READ; + + if (!(flags & MSM_BO_GPU_READONLY)) + iommu_flags |= IOMMU_WRITE; + + if (flags & MSM_BO_PRIVILEGED) + iommu_flags |= IOMMU_PRIV; if (WARN_ON(drm_mm_node_allocated(&vma->node))) return 0; @@ -129,8 +136,8 @@ static int iommu_aspace_map_vma(struct msm_gem_address_space *aspace, vma->iova = vma->node.start << PAGE_SHIFT; if (aspace->mmu) - ret = aspace->mmu->funcs->map(aspace->mmu, vma->iova, - sgt, IOMMU_READ | IOMMU_WRITE); + ret = 
aspace->mmu->funcs->map(aspace->mmu, vma->iova, sgt, + iommu_flags); return ret; } @@ -174,10 +181,10 @@ msm_gem_address_space_new(struct msm_mmu *mmu, const char *name, int msm_gem_map_vma(struct msm_gem_address_space *aspace, struct msm_gem_vma *vma, struct sg_table *sgt, - void *priv) + void *priv, unsigned int flags) { if (aspace && aspace->ops->map) - return aspace->ops->map(aspace, vma, sgt, priv); + return aspace->ops->map(aspace, vma, sgt, priv, flags); return -EINVAL; } diff --git a/include/uapi/drm/msm_drm.h b/include/uapi/drm/msm_drm.h index be4a18c0712c..d2f19ac6f536 100644 --- a/include/uapi/drm/msm_drm.h +++ b/include/uapi/drm/msm_drm.h @@ -77,6 +77,7 @@ struct drm_msm_param { #define MSM_BO_SCANOUT 0x00000001 /* scanout capable */ #define MSM_BO_GPU_READONLY 0x00000002 +#define MSM_BO_PRIVILEGED 0x00000004 #define MSM_BO_CACHE_MASK 0x000f0000 /* cache modes */ #define MSM_BO_CACHED 0x00010000 -- cgit v1.2.3 From 3b045f8fff410937f91e95156a0c955297a3bd39 Mon Sep 17 00:00:00 2001 From: Jordan Crouse Date: Mon, 13 Feb 2017 10:14:26 -0700 Subject: drm/msm: Implement preemption for A5XX targets Implement preemption for A5XX targets - this allows multiple ringbuffers for different priorities with automatic preemption of a lower priority ringbuffer if a higher one is ready. Change-Id: Ic0dedbad428360d23768d52b585021237c6bc3d3 Signed-off-by: Jordan Crouse --- drivers/gpu/drm/msm/Makefile | 3 +- drivers/gpu/drm/msm/adreno/a5xx_gpu.c | 192 +++++++++++++++-- drivers/gpu/drm/msm/adreno/a5xx_gpu.h | 102 ++++++++- drivers/gpu/drm/msm/adreno/a5xx_preempt.c | 343 ++++++++++++++++++++++++++++++ drivers/gpu/drm/msm/adreno/adreno_gpu.c | 27 ++- drivers/gpu/drm/msm/adreno/adreno_gpu.h | 7 +- drivers/gpu/drm/msm/msm_drv.h | 2 +- drivers/gpu/drm/msm/msm_ringbuffer.c | 2 + drivers/gpu/drm/msm/msm_ringbuffer.h | 1 + 9 files changed, 640 insertions(+), 39 deletions(-) create mode 100644 drivers/gpu/drm/msm/adreno/a5xx_preempt.c diff --git a/drivers/gpu/drm/msm/Makefile b/drivers/gpu/drm/msm/Makefile index 9f035b10875b..6086c0f9f13c 100644 --- a/drivers/gpu/drm/msm/Makefile +++ b/drivers/gpu/drm/msm/Makefile @@ -54,7 +54,8 @@ msm_drm-y += adreno/adreno_device.o \ adreno/a3xx_gpu.o \ adreno/a4xx_gpu.o \ adreno/a5xx_gpu.o \ - adreno/a5xx_power.o + adreno/a5xx_power.o \ + adreno/a5xx_preempt.o endif msm_drm-$(CONFIG_DRM_MSM_MDP4) += mdp/mdp4/mdp4_crtc.o \ diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c index 1e554ef51968..85fa69f2bbce 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c @@ -17,14 +17,66 @@ extern bool hang_debug; static void a5xx_dump(struct msm_gpu *gpu); +static void a5xx_flush(struct msm_gpu *gpu, struct msm_ringbuffer *ring) +{ + struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); + struct a5xx_gpu *a5xx_gpu = to_a5xx_gpu(adreno_gpu); + uint32_t wptr; + unsigned long flags; + + spin_lock_irqsave(&ring->lock, flags); + + /* Copy the shadow to the actual register */ + ring->cur = ring->next; + + /* Make sure to wrap wptr if we need to */ + wptr = get_wptr(ring); + + spin_unlock_irqrestore(&ring->lock, flags); + + /* Make sure everything is posted before making a decision */ + mb(); + + /* Update HW if this is the current ring and we are not in preempt */ + if (a5xx_gpu->cur_ring == ring && !a5xx_in_preempt(a5xx_gpu)) + gpu_write(gpu, REG_A5XX_CP_RB_WPTR, wptr); +} + static int a5xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, struct msm_file_private *ctx) { struct adreno_gpu *adreno_gpu = 
to_adreno_gpu(gpu); + struct a5xx_gpu *a5xx_gpu = to_a5xx_gpu(adreno_gpu); + struct msm_drm_private *priv = gpu->dev->dev_private; struct msm_ringbuffer *ring = gpu->rb[submit->ring]; unsigned int i, ibs = 0; + OUT_PKT7(ring, CP_PREEMPT_ENABLE_GLOBAL, 1); + OUT_RING(ring, 0x02); + + /* Turn off protected mode to write to special registers */ + OUT_PKT7(ring, CP_SET_PROTECTED_MODE, 1); + OUT_RING(ring, 0); + + /* Set the save preemption record for the ring/command */ + OUT_PKT4(ring, REG_A5XX_CP_CONTEXT_SWITCH_SAVE_ADDR_LO, 2); + OUT_RING(ring, lower_32_bits(a5xx_gpu->preempt_iova[submit->ring])); + OUT_RING(ring, upper_32_bits(a5xx_gpu->preempt_iova[submit->ring])); + + /* Turn back on protected mode */ + OUT_PKT7(ring, CP_SET_PROTECTED_MODE, 1); + OUT_RING(ring, 1); + + /* Enable local preemption for finegrain preemption */ + OUT_PKT7(ring, CP_PREEMPT_ENABLE_LOCAL, 1); + OUT_RING(ring, 0x02); + + /* Allow CP_CONTEXT_SWITCH_YIELD packets in the IB2 */ + OUT_PKT7(ring, CP_YIELD_ENABLE, 1); + OUT_RING(ring, 0x02); + + /* Submit the commands */ for (i = 0; i < submit->nr_cmds; i++) { switch (submit->cmd[i].type) { case MSM_SUBMIT_CMD_IB_TARGET_BUF: @@ -42,9 +94,30 @@ static int a5xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, } } + /* + * Write the render mode to NULL (0) to indicate to the CP that the IBs + * are done rendering - otherwise a lucky preemption would start + * replaying from the last checkpoint + */ + OUT_PKT7(ring, CP_SET_RENDER_MODE, 5); + OUT_RING(ring, 0); + OUT_RING(ring, 0); + OUT_RING(ring, 0); + OUT_RING(ring, 0); + OUT_RING(ring, 0); + + /* Turn off IB level preemptions */ + OUT_PKT7(ring, CP_YIELD_ENABLE, 1); + OUT_RING(ring, 0x01); + + /* Write the fence to the scratch register */ OUT_PKT4(ring, REG_A5XX_CP_SCRATCH_REG(2), 1); OUT_RING(ring, submit->fence); + /* + * Execute a CACHE_FLUSH_TS event. This will ensure that the + * timestamp is written to the memory and then triggers the interrupt + */ OUT_PKT7(ring, CP_EVENT_WRITE, 4); OUT_RING(ring, CACHE_FLUSH_TS | (1 << 31)); @@ -52,7 +125,24 @@ static int a5xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, OUT_RING(ring, upper_32_bits(rbmemptr(adreno_gpu, ring->id, fence))); OUT_RING(ring, submit->fence); - gpu->funcs->flush(gpu, ring); + /* Yield the floor on command completion */ + OUT_PKT7(ring, CP_CONTEXT_SWITCH_YIELD, 4); + /* + * If dword[2:1] are non-zero, they specify an address for the CP to + * write the value of dword[3] to on preemption complete. Write 0 to + * skip the write + */ + OUT_RING(ring, 0x00); + OUT_RING(ring, 0x00); + /* Data value - not used if the address above is 0 */ + OUT_RING(ring, 0x01); + /* Set bit 0 to trigger an interrupt on preempt complete */ + OUT_RING(ring, 0x01); + + a5xx_flush(gpu, ring); + + /* Check to see if we need to start preemption */ + a5xx_preempt_trigger(gpu); return 0; } @@ -211,6 +301,50 @@ static int a5xx_me_init(struct msm_gpu *gpu) return a5xx_idle(gpu, ring) ?
0 : -EINVAL; } +static int a5xx_preempt_start(struct msm_gpu *gpu) +{ + struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); + struct a5xx_gpu *a5xx_gpu = to_a5xx_gpu(adreno_gpu); + struct msm_ringbuffer *ring = gpu->rb[0]; + + if (gpu->nr_rings == 1) + return 0; + + /* Turn off protected mode to write to special registers */ + OUT_PKT7(ring, CP_SET_PROTECTED_MODE, 1); + OUT_RING(ring, 0); + + /* Set the save preemption record for the ring/command */ + OUT_PKT4(ring, REG_A5XX_CP_CONTEXT_SWITCH_SAVE_ADDR_LO, 2); + OUT_RING(ring, lower_32_bits(a5xx_gpu->preempt_iova[ring->id])); + OUT_RING(ring, upper_32_bits(a5xx_gpu->preempt_iova[ring->id])); + + /* Turn back on protected mode */ + OUT_PKT7(ring, CP_SET_PROTECTED_MODE, 1); + OUT_RING(ring, 1); + + OUT_PKT7(ring, CP_PREEMPT_ENABLE_GLOBAL, 1); + OUT_RING(ring, 0x00); + + OUT_PKT7(ring, CP_PREEMPT_ENABLE_LOCAL, 1); + OUT_RING(ring, 0x01); + + OUT_PKT7(ring, CP_YIELD_ENABLE, 1); + OUT_RING(ring, 0x01); + + /* Yield the floor on command completion */ + OUT_PKT7(ring, CP_CONTEXT_SWITCH_YIELD, 4); + OUT_RING(ring, 0x00); + OUT_RING(ring, 0x00); + OUT_RING(ring, 0x01); + OUT_RING(ring, 0x01); + + gpu->funcs->flush(gpu, ring); + + return a5xx_idle(gpu, ring) ? 0 : -EINVAL; +} + + static struct drm_gem_object *a5xx_ucode_load_bo(struct msm_gpu *gpu, const struct firmware *fw, u64 *iova) { @@ -354,6 +488,7 @@ static void a5xx_zap_shader_init(struct msm_gpu *gpu) A5XX_RBBM_INT_0_MASK_RBBM_ATB_ASYNC_OVERFLOW | \ A5XX_RBBM_INT_0_MASK_CP_HW_ERROR | \ A5XX_RBBM_INT_0_MASK_MISC_HANG_DETECT | \ + A5XX_RBBM_INT_0_MASK_CP_SW | \ A5XX_RBBM_INT_0_MASK_CP_CACHE_FLUSH_TS | \ A5XX_RBBM_INT_0_MASK_UCHE_OOB_ACCESS | \ A5XX_RBBM_INT_0_MASK_GPMU_VOLTAGE_DROOP) @@ -523,6 +658,8 @@ static int a5xx_hw_init(struct msm_gpu *gpu) if (ret) return ret; + a5xx_preempt_hw_init(gpu); + ret = a5xx_ucode_init(gpu); if (ret) return ret; @@ -577,6 +714,9 @@ static int a5xx_hw_init(struct msm_gpu *gpu) gpu_write(gpu, REG_A5XX_RBBM_SECVID_TRUST_CNTL, 0x0); } + /* Last step - yield the ringbuffer */ + a5xx_preempt_start(gpu); + pm_qos_update_request(&gpu->pm_qos_req_dma, 501); return 0; @@ -604,6 +744,8 @@ static void a5xx_destroy(struct msm_gpu *gpu) DBG("%s", gpu->name); + a5xx_preempt_fini(gpu); + if (a5xx_gpu->pm4_bo) { if (a5xx_gpu->pm4_iova) msm_gem_put_iova(a5xx_gpu->pm4_bo, gpu->aspace); @@ -641,6 +783,14 @@ static inline bool _a5xx_check_idle(struct msm_gpu *gpu) bool a5xx_idle(struct msm_gpu *gpu, struct msm_ringbuffer *ring) { + struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); + struct a5xx_gpu *a5xx_gpu = to_a5xx_gpu(adreno_gpu); + + if (ring != a5xx_gpu->cur_ring) { + WARN(1, "Tried to idle a non-current ringbuffer\n"); + return false; + } + /* wait for CP to drain ringbuffer: */ if (!adreno_idle(gpu, ring)) return false; @@ -771,14 +921,11 @@ static void a5xx_fault_detect_irq(struct msm_gpu *gpu) { struct drm_device *dev = gpu->dev; struct msm_drm_private *priv = dev->dev_private; - struct msm_ringbuffer *ring; - int i; + struct msm_ringbuffer *ring = gpu->funcs->active_ring(gpu); - dev_err(dev->dev, "GPU fault detected: status %8.8X\n", - gpu_read(gpu, REG_A5XX_RBBM_STATUS)); - - dev_err(dev->dev, - "RPTR/WPTR %4.4X/%4.4X IB1 %16.16llX/%4.4X IB2 %16.16llX/%4.4X\n", + dev_err(dev->dev, "gpu fault ring %d fence %x status %8.8X rb %4.4x/%4.4x ib1 %16.16llX/%4.4x ib2 %16.16llX/%4.4x\n", + ring ? 
ring->id : -1, adreno_submitted_fence(gpu, ring), + gpu_read(gpu, REG_A5XX_RBBM_STATUS), gpu_read(gpu, REG_A5XX_CP_RB_RPTR), gpu_read(gpu, REG_A5XX_CP_RB_WPTR), gpu_read64(gpu, REG_A5XX_CP_IB1_BASE, REG_A5XX_CP_IB1_BASE_HI), @@ -786,15 +933,6 @@ static void a5xx_fault_detect_irq(struct msm_gpu *gpu) gpu_read64(gpu, REG_A5XX_CP_IB2_BASE, REG_A5XX_CP_IB2_BASE_HI), gpu_read(gpu, REG_A5XX_CP_IB2_BUFSZ)); - FOR_EACH_RING(gpu, ring, i) { - if (!ring) - continue; - - dev_err(dev->dev, "RB %d: submitted %d retired %d\n", ring->id, - adreno_last_fence(gpu, ring), - adreno_submitted_fence(gpu, ring)); - } - /* Turn off the hangcheck timer to keep it from bothering us */ del_timer(&gpu->hangcheck_timer); @@ -839,6 +977,9 @@ static irqreturn_t a5xx_irq(struct msm_gpu *gpu) if (status & A5XX_RBBM_INT_0_MASK_CP_CACHE_FLUSH_TS) msm_gpu_retire(gpu); + if (status & A5XX_RBBM_INT_0_MASK_CP_SW) + a5xx_preempt_irq(gpu); + return IRQ_HANDLED; } @@ -972,6 +1113,14 @@ static void a5xx_show(struct msm_gpu *gpu, struct seq_file *m) } #endif +static struct msm_ringbuffer *a5xx_active_ring(struct msm_gpu *gpu) +{ + struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); + struct a5xx_gpu *a5xx_gpu = to_a5xx_gpu(adreno_gpu); + + return a5xx_gpu->cur_ring; +} + static const struct adreno_gpu_funcs funcs = { .base = { .get_param = adreno_get_param, @@ -982,8 +1131,8 @@ static const struct adreno_gpu_funcs funcs = { .last_fence = adreno_last_fence, .submitted_fence = adreno_submitted_fence, .submit = a5xx_submit, - .flush = adreno_flush, - .active_ring = adreno_active_ring, + .flush = a5xx_flush, + .active_ring = a5xx_active_ring, .irq = a5xx_irq, .destroy = a5xx_destroy, #ifdef CONFIG_DEBUG_FS @@ -1103,11 +1252,14 @@ struct msm_gpu *a5xx_gpu_init(struct drm_device *dev) /* Check the efuses for some configuration */ a5xx_efuses_read(pdev, adreno_gpu); - ret = adreno_gpu_init(dev, pdev, adreno_gpu, &funcs, 1); + ret = adreno_gpu_init(dev, pdev, adreno_gpu, &funcs, 4); if (ret) { a5xx_destroy(&(a5xx_gpu->base.base)); return ERR_PTR(ret); } + /* Set up the preemption specific bits and pieces for each ringbuffer */ + a5xx_preempt_init(gpu); + return gpu; } diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.h b/drivers/gpu/drm/msm/adreno/a5xx_gpu.h index 6c8d1f41b680..a0f8bb2ac42b 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.h +++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 The Linux Foundation. All rights reserved. +/* Copyright (c) 2016-2017 The Linux Foundation. All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 and @@ -42,10 +42,98 @@ struct a5xx_gpu { uint32_t gpmu_dwords; uint32_t lm_leakage; + + struct msm_ringbuffer *cur_ring; + struct msm_ringbuffer *next_ring; + + struct drm_gem_object *preempt_bo[MSM_GPU_MAX_RINGS]; + struct a5xx_preempt_record *preempt[MSM_GPU_MAX_RINGS]; + uint64_t preempt_iova[MSM_GPU_MAX_RINGS]; + + atomic_t preempt_state; + struct timer_list preempt_timer; + }; #define to_a5xx_gpu(x) container_of(x, struct a5xx_gpu, base) +/* + * In order to do lockless preemption we use a simple state machine to progress + * through the process. + * + * PREEMPT_NONE - no preemption in progress. Next state START. + * PREEMPT_START - The trigger is evaluating if preemption is possible. Next + * states: TRIGGERED, NONE + * PREEMPT_TRIGGERED: A preemption has been executed on the hardware.
Next + * states: FAULTED, PENDING + * PREEMPT_FAULTED: A preemption timed out (never completed). This will trigger + * recovery. Next state: N/A + * PREEMPT_PENDING: Preemption complete interrupt fired - the callback is + * checking the success of the operation. Next state: FAULTED, NONE. + */ + +enum preempt_state { + PREEMPT_NONE = 0, + PREEMPT_START, + PREEMPT_TRIGGERED, + PREEMPT_FAULTED, + PREEMPT_PENDING, +}; + +/* + * struct a5xx_preempt_record is a shared buffer between the microcode and the + * CPU to store the state for preemption. The record itself is much larger + * (64k) but most of that is used by the CP for storage. + * + * There is a preemption record assigned per ringbuffer. When the CPU triggers a + * preemption, it fills out the record with the useful information (wptr, ring + * base, etc) and the microcode uses that information to set up the CP following + * the preemption. When a ring is switched out, the CP will save the ringbuffer + * state back to the record. In this way, once the records are properly set up + * the CPU can quickly switch back and forth between ringbuffers by only + * updating a few registers (often only the wptr). + * + * These are the CPU aware registers in the record: + * @magic: Must always be 0x27C4BAFC + * @info: Type of the record - written 0 by the CPU, updated by the CP + * @data: Data field from SET_RENDER_MODE or a checkpoint. Written and used by + * the CP + * @cntl: Value of RB_CNTL written by CPU, save/restored by CP + * @rptr: Value of RB_RPTR written by CPU, save/restored by CP + * @wptr: Value of RB_WPTR written by CPU, save/restored by CP + * @rptr_addr: Value of RB_RPTR_ADDR written by CPU, save/restored by CP + * @rbase: Value of RB_BASE written by CPU, save/restored by CP + * @counter: GPU address of the storage area for the performance counters + */ +struct a5xx_preempt_record { + uint32_t magic; + uint32_t info; + uint32_t data; + uint32_t cntl; + uint32_t rptr; + uint32_t wptr; + uint64_t rptr_addr; + uint64_t rbase; + uint64_t counter; +}; + +/* Magic identifier for the preemption record */ +#define A5XX_PREEMPT_RECORD_MAGIC 0x27C4BAFCUL + +/* + * Even though the structure above is only a few bytes, we need a full 64k to + * store the entire preemption record from the CP + */ +#define A5XX_PREEMPT_RECORD_SIZE (64 * 1024) + +/* + * The preemption counter block is a storage area for the value of the + * preemption counters that are saved immediately before context switch. We + * append it on to the end of the allocadtion for the preemption record. 
+ */ +#define A5XX_PREEMPT_COUNTER_SIZE (16 * 4) + + int a5xx_power_init(struct msm_gpu *gpu); void a5xx_gpmu_ucode_init(struct msm_gpu *gpu); @@ -64,4 +152,16 @@ static inline int spin_usecs(struct msm_gpu *gpu, uint32_t usecs, bool a5xx_idle(struct msm_gpu *gpu, struct msm_ringbuffer *ring); +void a5xx_preempt_init(struct msm_gpu *gpu); +void a5xx_preempt_hw_init(struct msm_gpu *gpu); +void a5xx_preempt_trigger(struct msm_gpu *gpu); +void a5xx_preempt_irq(struct msm_gpu *gpu); +void a5xx_preempt_fini(struct msm_gpu *gpu); + +/* Return true if we are in a preempt state */ +static inline bool a5xx_in_preempt(struct a5xx_gpu *a5xx_gpu) +{ + return !(atomic_read(&a5xx_gpu->preempt_state) == PREEMPT_NONE); +} + #endif /* __A5XX_GPU_H__ */ diff --git a/drivers/gpu/drm/msm/adreno/a5xx_preempt.c b/drivers/gpu/drm/msm/adreno/a5xx_preempt.c new file mode 100644 index 000000000000..9403853d86c4 --- /dev/null +++ b/drivers/gpu/drm/msm/adreno/a5xx_preempt.c @@ -0,0 +1,343 @@ +/* Copyright (c) 2016-2017 The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include "msm_gem.h" +#include "a5xx_gpu.h" + +static void *alloc_kernel_bo(struct drm_device *drm, struct msm_gpu *gpu, + size_t size, uint32_t flags, struct drm_gem_object **bo, + u64 *iova) +{ + struct drm_gem_object *_bo; + u64 _iova; + void *ptr; + int ret; + + mutex_lock(&drm->struct_mutex); + _bo = msm_gem_new(drm, size, flags); + mutex_unlock(&drm->struct_mutex); + + if (IS_ERR(_bo)) + return _bo; + + ret = msm_gem_get_iova(_bo, gpu->aspace, &_iova); + if (ret) + goto out; + + ptr = msm_gem_vaddr(_bo); + if (!ptr) { + ret = -ENOMEM; + goto out; + } + + if (bo) + *bo = _bo; + if (iova) + *iova = _iova; + + return ptr; +out: + drm_gem_object_unreference_unlocked(_bo); + return ERR_PTR(ret); +} + +/* + * Try to transition the preemption state from old to new. Return + * true on success or false if the original state wasn't 'old' + */ +static inline bool try_preempt_state(struct a5xx_gpu *a5xx_gpu, + enum preempt_state old, enum preempt_state new) +{ + enum preempt_state cur = atomic_cmpxchg(&a5xx_gpu->preempt_state, + old, new); + + return (cur == old); +} + +/* + * Force the preemption state to the specified state. This is used in cases + * where the current state is known and won't change + */ +static inline void set_preempt_state(struct a5xx_gpu *gpu, + enum preempt_state new) +{ + /* + * preempt_state may be read by other cores trying to trigger a + * preemption or in the interrupt handler so barriers are needed + * before... + */ + smp_mb__before_atomic(); + atomic_set(&gpu->preempt_state, new); + /* ... 
and after */ +} + +/* Write the most recent wptr for the given ring into the hardware */ +static inline void update_wptr(struct msm_gpu *gpu, struct msm_ringbuffer *ring) +{ + unsigned long flags; + uint32_t wptr; + + if (!ring) + return; + + spin_lock_irqsave(&ring->lock, flags); + wptr = get_wptr(ring); + spin_unlock_irqrestore(&ring->lock, flags); + + gpu_write(gpu, REG_A5XX_CP_RB_WPTR, wptr); +} + +/* Return the highest priority ringbuffer with something in it */ +static struct msm_ringbuffer *get_next_ring(struct msm_gpu *gpu) +{ + struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); + unsigned long flags; + int i; + + for (i = gpu->nr_rings - 1; i >= 0; i--) { + bool empty; + struct msm_ringbuffer *ring = gpu->rb[i]; + + spin_lock_irqsave(&ring->lock, flags); + empty = (get_wptr(ring) == adreno_gpu->memptrs->rptr[ring->id]); + spin_unlock_irqrestore(&ring->lock, flags); + + if (!empty) + return ring; + } + + return NULL; +} + +static void a5xx_preempt_timer(unsigned long data) +{ + struct a5xx_gpu *a5xx_gpu = (struct a5xx_gpu *) data; + struct msm_gpu *gpu = &a5xx_gpu->base.base; + struct drm_device *dev = gpu->dev; + struct msm_drm_private *priv = dev->dev_private; + + if (!try_preempt_state(a5xx_gpu, PREEMPT_TRIGGERED, PREEMPT_FAULTED)) + return; + + dev_err(dev->dev, "%s: preemption timed out\n", gpu->name); + queue_work(priv->wq, &gpu->recover_work); +} + +/* Try to trigger a preemption switch */ +void a5xx_preempt_trigger(struct msm_gpu *gpu) +{ + struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); + struct a5xx_gpu *a5xx_gpu = to_a5xx_gpu(adreno_gpu); + unsigned long flags; + struct msm_ringbuffer *ring; + + if (gpu->nr_rings == 1) + return; + + /* + * Try to start preemption by moving from NONE to START. If + * unsuccessful, a preemption is already in flight + */ + if (!try_preempt_state(a5xx_gpu, PREEMPT_NONE, PREEMPT_START)) + return; + + /* Get the next ring to preempt to */ + ring = get_next_ring(gpu); + + /* + * If no ring is populated or the highest priority ring is the current + * one, do nothing except to update the wptr to the latest and greatest + */ + if (!ring || (a5xx_gpu->cur_ring == ring)) { + update_wptr(gpu, ring); + + /* Set the state back to NONE */ + set_preempt_state(a5xx_gpu, PREEMPT_NONE); + return; + } + + /* Make sure the wptr doesn't update while we're in motion */ + spin_lock_irqsave(&ring->lock, flags); + a5xx_gpu->preempt[ring->id]->wptr = get_wptr(ring); + spin_unlock_irqrestore(&ring->lock, flags); + + /* Set the address of the incoming preemption record */ + gpu_write64(gpu, REG_A5XX_CP_CONTEXT_SWITCH_RESTORE_ADDR_LO, + REG_A5XX_CP_CONTEXT_SWITCH_RESTORE_ADDR_HI, + a5xx_gpu->preempt_iova[ring->id]); + + a5xx_gpu->next_ring = ring; + + /* Start a timer to catch a stuck preemption */ + mod_timer(&a5xx_gpu->preempt_timer, jiffies + msecs_to_jiffies(10000)); + + /* Set the preemption state to triggered */ + set_preempt_state(a5xx_gpu, PREEMPT_TRIGGERED); + + /* Make sure everything is written before hitting the button */ + wmb(); + + /* And actually start the preemption */ + gpu_write(gpu, REG_A5XX_CP_CONTEXT_SWITCH_CNTL, 1); +} + +void a5xx_preempt_irq(struct msm_gpu *gpu) +{ + uint32_t status; + struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); + struct a5xx_gpu *a5xx_gpu = to_a5xx_gpu(adreno_gpu); + struct drm_device *dev = gpu->dev; + struct msm_drm_private *priv = dev->dev_private; + + if (!try_preempt_state(a5xx_gpu, PREEMPT_TRIGGERED, PREEMPT_PENDING)) + return; + + /* Delete the preemption watchdog timer */ +
del_timer(&a5xx_gpu->preempt_timer); + + /* + * The hardware should be setting CP_CONTEXT_SWITCH_CNTL to zero before + * firing the interrupt, but there is a non-zero chance of a hardware + * condition or a software race that could set it again before we have a + * chance to finish. If that happens, log and go for recovery + */ + status = gpu_read(gpu, REG_A5XX_CP_CONTEXT_SWITCH_CNTL); + if (unlikely(status)) { + set_preempt_state(a5xx_gpu, PREEMPT_FAULTED); + dev_err(dev->dev, "%s: Preemption failed to complete\n", + gpu->name); + queue_work(priv->wq, &gpu->recover_work); + return; + } + + a5xx_gpu->cur_ring = a5xx_gpu->next_ring; + a5xx_gpu->next_ring = NULL; + + update_wptr(gpu, a5xx_gpu->cur_ring); + + set_preempt_state(a5xx_gpu, PREEMPT_NONE); +} + +void a5xx_preempt_hw_init(struct msm_gpu *gpu) +{ + struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); + struct a5xx_gpu *a5xx_gpu = to_a5xx_gpu(adreno_gpu); + struct msm_ringbuffer *ring; + int i; + + if (gpu->nr_rings > 1) { + /* Clear the preemption records */ + FOR_EACH_RING(gpu, ring, i) { + if (ring) { + a5xx_gpu->preempt[ring->id]->wptr = 0; + a5xx_gpu->preempt[ring->id]->rptr = 0; + a5xx_gpu->preempt[ring->id]->rbase = ring->iova; + } + } + } + + /* Write a 0 to signal that we aren't switching pagetables */ + gpu_write64(gpu, REG_A5XX_CP_CONTEXT_SWITCH_SMMU_INFO_LO, + REG_A5XX_CP_CONTEXT_SWITCH_SMMU_INFO_HI, 0); + + /* Reset the preemption state */ + set_preempt_state(a5xx_gpu, PREEMPT_NONE); + + /* Always come up on rb 0 */ + a5xx_gpu->cur_ring = gpu->rb[0]; +} + +static int preempt_init_ring(struct a5xx_gpu *a5xx_gpu, + struct msm_ringbuffer *ring) +{ + struct adreno_gpu *adreno_gpu = &a5xx_gpu->base; + struct msm_gpu *gpu = &adreno_gpu->base; + struct a5xx_preempt_record *ptr; + struct drm_gem_object *bo; + u64 iova; + + ptr = alloc_kernel_bo(gpu->dev, gpu, + A5XX_PREEMPT_RECORD_SIZE + A5XX_PREEMPT_COUNTER_SIZE, + MSM_BO_UNCACHED | MSM_BO_PRIVILEGED, + &bo, &iova); + + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + + a5xx_gpu->preempt_bo[ring->id] = bo; + a5xx_gpu->preempt_iova[ring->id] = iova; + a5xx_gpu->preempt[ring->id] = ptr; + + /* Set up the defaults on the preemption record */ + + ptr->magic = A5XX_PREEMPT_RECORD_MAGIC; + ptr->info = 0; + ptr->data = 0; + ptr->cntl = MSM_GPU_RB_CNTL_DEFAULT; + ptr->rptr_addr = rbmemptr(adreno_gpu, ring->id, rptr); + ptr->counter = iova + A5XX_PREEMPT_RECORD_SIZE; + + return 0; +} + +void a5xx_preempt_fini(struct msm_gpu *gpu) +{ + struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); + struct a5xx_gpu *a5xx_gpu = to_a5xx_gpu(adreno_gpu); + struct msm_ringbuffer *ring; + int i; + + FOR_EACH_RING(gpu, ring, i) { + if (!ring || !a5xx_gpu->preempt_bo[i]) + continue; + + if (a5xx_gpu->preempt_iova[i]) + msm_gem_put_iova(a5xx_gpu->preempt_bo[i], gpu->aspace); + + drm_gem_object_unreference_unlocked(a5xx_gpu->preempt_bo[i]); + + a5xx_gpu->preempt_bo[i] = NULL; + } +} + +void a5xx_preempt_init(struct msm_gpu *gpu) +{ + struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); + struct a5xx_gpu *a5xx_gpu = to_a5xx_gpu(adreno_gpu); + struct msm_ringbuffer *ring; + int i; + + /* No preemption if we only have one ring */ + if (gpu->nr_rings <= 1) + return; + + FOR_EACH_RING(gpu, ring, i) { + if (!ring) + continue; + + if (preempt_init_ring(a5xx_gpu, ring)) { + /* + * On any failure our adventure is over.
Clean up and + * set nr_rings to 1 to force preemption off + */ + a5xx_preempt_fini(gpu); + gpu->nr_rings = 1; + + return; + } + } + + setup_timer(&a5xx_gpu->preempt_timer, a5xx_preempt_timer, + (unsigned long) a5xx_gpu); +} diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c index c0ed98358b2c..236aa48b26aa 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c @@ -2,7 +2,7 @@ * Copyright (C) 2013 Red Hat * Author: Rob Clark * - * Copyright (c) 2014,2016 The Linux Foundation. All rights reserved. + * Copyright (c) 2014,2016-2017 The Linux Foundation. All rights reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 as published by @@ -93,11 +93,6 @@ int adreno_hw_init(struct msm_gpu *gpu) return 0; } -static uint32_t get_wptr(struct msm_ringbuffer *ring) -{ - return ring->cur - ring->start; -} - /* Use this helper to read rptr, since a430 doesn't update rptr in memory */ static uint32_t get_rptr(struct adreno_gpu *adreno_gpu, struct msm_ringbuffer *ring) @@ -154,6 +149,7 @@ void adreno_recover(struct msm_gpu *gpu) if (!ring) continue; + /* No need for a lock here, nobody else is peeking in */ ring->cur = ring->start; ring->next = ring->start; @@ -270,7 +266,7 @@ void adreno_flush(struct msm_gpu *gpu, struct msm_ringbuffer *ring) * to account for the possibility that the last command fit exactly into * the ringbuffer and rb->next hasn't wrapped to zero yet */ - wptr = (ring->cur - ring->start) % (MSM_GPU_RINGBUFFER_SZ >> 2); + wptr = get_wptr(ring); /* ensure writes to ringbuffer have hit system memory: */ mb(); @@ -288,8 +284,9 @@ bool adreno_idle(struct msm_gpu *gpu, struct msm_ringbuffer *ring) return true; /* TODO maybe we need to reset GPU here to recover from hang? */ - DRM_ERROR("%s: timeout waiting to drain ringbuffer %d!\n", gpu->name, - ring->id); + DRM_ERROR("%s: timeout waiting to drain ringbuffer %d rptr/wptr = %X/%X\n", + gpu->name, ring->id, get_rptr(adreno_gpu, ring), wptr); + return false; } @@ -345,11 +342,12 @@ void adreno_show(struct msm_gpu *gpu, struct seq_file *m) */ void adreno_dump_info(struct msm_gpu *gpu) { + struct drm_device *dev = gpu->dev; struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); struct msm_ringbuffer *ring; int i; - pr_err("revision: %d (%d.%d.%d.%d)\n", + dev_err(dev->dev, "revision: %d (%d.%d.%d.%d)\n", adreno_gpu->info->revn, adreno_gpu->rev.core, adreno_gpu->rev.major, adreno_gpu->rev.minor, adreno_gpu->rev.patchid); @@ -358,12 +356,11 @@ void adreno_dump_info(struct msm_gpu *gpu) if (!ring) continue; - pr_err("rb %d: fence: %d/%d\n", i, + dev_err(dev->dev, " ring %d: fence %d/%d rptr/wptr %x/%x\n", i, adreno_last_fence(gpu, ring), - adreno_submitted_fence(gpu, ring)); - - pr_err("rptr: %d\n", get_rptr(adreno_gpu, ring)); - pr_err("rb wptr: %d\n", get_wptr(ring)); + adreno_submitted_fence(gpu, ring), + get_rptr(adreno_gpu, ring), + get_wptr(ring)); } for (i = 0; i < 8; i++) { diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.h b/drivers/gpu/drm/msm/adreno/adreno_gpu.h index 9b4e1828bb6f..c7a9506397f6 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.h +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.h @@ -2,7 +2,7 @@ * Copyright (C) 2013 Red Hat * Author: Rob Clark * - * Copyright (c) 2014,2016 The Linux Foundation. All rights reserved. + * Copyright (c) 2014,2016-2017 The Linux Foundation. All rights reserved. 
* * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 as published by @@ -337,6 +337,11 @@ static inline void adreno_gpu_write64(struct adreno_gpu *gpu, adreno_gpu_write(gpu, hi, upper_32_bits(data)); } +static inline uint32_t get_wptr(struct msm_ringbuffer *ring) +{ + return (ring->cur - ring->start) % (MSM_GPU_RINGBUFFER_SZ >> 2); +} + /* * Given a register and a count, return a value to program into * REG_CP_PROTECT_REG(n) - this will block both reads and writes for _len diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h index 7e229aeb8a8b..3df2c7e9b4b6 100644 --- a/drivers/gpu/drm/msm/msm_drv.h +++ b/drivers/gpu/drm/msm/msm_drv.h @@ -250,7 +250,7 @@ struct msm_drm_commit { struct kthread_worker worker; }; -#define MSM_GPU_MAX_RINGS 1 +#define MSM_GPU_MAX_RINGS 4 struct msm_drm_private { diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.c b/drivers/gpu/drm/msm/msm_ringbuffer.c index 2dd6bc6c725c..14a16c4578d9 100644 --- a/drivers/gpu/drm/msm/msm_ringbuffer.c +++ b/drivers/gpu/drm/msm/msm_ringbuffer.c @@ -46,6 +46,8 @@ struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int id) ring->next = ring->start; ring->cur = ring->start; + spin_lock_init(&ring->lock); + return ring; fail: diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.h b/drivers/gpu/drm/msm/msm_ringbuffer.h index 4e0b302501fa..1e84905073bf 100644 --- a/drivers/gpu/drm/msm/msm_ringbuffer.h +++ b/drivers/gpu/drm/msm/msm_ringbuffer.h @@ -27,6 +27,7 @@ struct msm_ringbuffer { uint32_t *start, *end, *cur, *next; uint64_t iova; uint32_t submitted_fence; + spinlock_t lock; }; struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int id); -- cgit v1.2.3 From b9148c855ae81a1478c0a9e508d15067609e1c22 Mon Sep 17 00:00:00 2001 From: Jordan Crouse Date: Mon, 13 Feb 2017 10:14:27 -0700 Subject: drm/msm: Add a struct to pass configuration to msm_gpu_init() The amount of information that we need to pass into msm_gpu_init() is steadily increasing, so add a new struct to stabilize the function call and make it easier to add new configuration down the line. 
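As a sketch of the intended call pattern (the field values mirror the adreno_gpu_init() hunk below; nothing here goes beyond the API the patch itself adds):

	struct msm_gpu_config config = {
		.ioname = "kgsl_3d0_reg_memory",
		.irqname = "kgsl_3d0_irq",
		.va_start = SZ_16M,
		.va_end = 0xffffffff,
		.nr_rings = nr_rings,
	};

	ret = msm_gpu_init(drm, pdev, &adreno_gpu->base, &funcs->base,
		adreno_gpu->info->name, &config);

Future knobs can then be added to struct msm_gpu_config without churning every msm_gpu_init() caller.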
Change-Id: Ic0dedbad6c62d6859c90764245437c222d61f00d Signed-off-by: Jordan Crouse --- drivers/gpu/drm/msm/adreno/adreno_gpu.c | 12 ++++++++++-- drivers/gpu/drm/msm/msm_gpu.c | 15 ++++++++------- drivers/gpu/drm/msm/msm_gpu.h | 11 +++++++++-- 3 files changed, 27 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c index 236aa48b26aa..16eede8165bd 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c @@ -528,6 +528,7 @@ int adreno_gpu_init(struct drm_device *drm, struct platform_device *pdev, const struct adreno_gpu_funcs *funcs, int nr_rings) { struct adreno_platform_config *config = pdev->dev.platform_data; + struct msm_gpu_config adreno_gpu_config = { 0 }; struct msm_gpu *gpu = &adreno_gpu->base; struct msm_mmu *mmu; int ret; @@ -541,9 +542,16 @@ int adreno_gpu_init(struct drm_device *drm, struct platform_device *pdev, /* Get the rest of the target configuration from the device tree */ adreno_of_parse(pdev, gpu); + adreno_gpu_config.ioname = "kgsl_3d0_reg_memory"; + adreno_gpu_config.irqname = "kgsl_3d0_irq"; + + adreno_gpu_config.va_start = SZ_16M; + adreno_gpu_config.va_end = 0xffffffff; + + adreno_gpu_config.nr_rings = nr_rings; + ret = msm_gpu_init(drm, pdev, &adreno_gpu->base, &funcs->base, - adreno_gpu->info->name, "kgsl_3d0_reg_memory", "kgsl_3d0_irq", - nr_rings); + adreno_gpu->info->name, &adreno_gpu_config); if (ret) return ret; diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c index 57d518300425..b95f51f2e4be 100644 --- a/drivers/gpu/drm/msm/msm_gpu.c +++ b/drivers/gpu/drm/msm/msm_gpu.c @@ -596,11 +596,10 @@ static const char *clk_names[] = { int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev, struct msm_gpu *gpu, const struct msm_gpu_funcs *funcs, - const char *name, const char *ioname, const char *irqname, - int nr_rings) + const char *name, struct msm_gpu_config *config) { struct iommu_domain *iommu; - int i, ret; + int i, ret, nr_rings; if (WARN_ON(gpu->num_perfcntrs > ARRAY_SIZE(gpu->last_cntrs))) gpu->num_perfcntrs = ARRAY_SIZE(gpu->last_cntrs); @@ -627,14 +626,14 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev, BUG_ON(ARRAY_SIZE(clk_names) != ARRAY_SIZE(gpu->grp_clks)); /* Map registers: */ - gpu->mmio = msm_ioremap(pdev, ioname, name); + gpu->mmio = msm_ioremap(pdev, config->ioname, name); if (IS_ERR(gpu->mmio)) { ret = PTR_ERR(gpu->mmio); goto fail; } /* Get Interrupt: */ - gpu->irq = platform_get_irq_byname(pdev, irqname); + gpu->irq = platform_get_irq_byname(pdev, config->irqname); if (gpu->irq < 0) { ret = gpu->irq; dev_err(drm->dev, "failed to get irq: %d\n", ret); @@ -681,8 +680,8 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev, iommu = iommu_domain_alloc(&platform_bus_type); if (iommu) { /* TODO 32b vs 64b address space..
*/ - iommu->geometry.aperture_start = 0x1000; - iommu->geometry.aperture_end = 0xffffffff; + iommu->geometry.aperture_start = config->va_start; + iommu->geometry.aperture_end = config->va_end; dev_info(drm->dev, "%s: using IOMMU\n", name); gpu->aspace = msm_gem_address_space_create(&pdev->dev, @@ -699,6 +698,8 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev, dev_info(drm->dev, "%s: no IOMMU, fallback to VRAM carveout!\n", name); } + nr_rings = config->nr_rings; + if (nr_rings > ARRAY_SIZE(gpu->rb)) { WARN(1, "Only creating %lu ringbuffers\n", ARRAY_SIZE(gpu->rb)); nr_rings = ARRAY_SIZE(gpu->rb); diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h index 48303d3d0e44..70a0ccc3dc0c 100644 --- a/drivers/gpu/drm/msm/msm_gpu.h +++ b/drivers/gpu/drm/msm/msm_gpu.h @@ -28,6 +28,14 @@ struct msm_gem_submit; struct msm_gpu_perfcntr; +struct msm_gpu_config { + const char *ioname; + const char *irqname; + int nr_rings; + uint64_t va_start; + uint64_t va_end; +}; + /* So far, with hardware that I've seen to date, we can have: * + zero, one, or two z180 2d cores * + a3xx or a2xx 3d core, which share a common CP (the firmware @@ -238,8 +246,7 @@ int msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev, struct msm_gpu *gpu, const struct msm_gpu_funcs *funcs, - const char *name, const char *ioname, const char *irqname, - int nr_rings); + const char *name, struct msm_gpu_config *config); void msm_gpu_cleanup(struct msm_gpu *gpu); -- cgit v1.2.3 From 40b74543b5e0f7a786a344b1e2112dba7ca1f150 Mon Sep 17 00:00:00 2001 From: Jordan Crouse Date: Mon, 13 Feb 2017 10:14:27 -0700 Subject: drm/msm: a5xx: Enable 64 bit mode by default A5XX GPUs can be run in either 32 or 64 bit mode. The GPU registers and the microcode use 64 bit virtual addressing by default but the upper 32 bits are ignored if the GPU is in 32 bit mode. There is no performance disadvantage to remaining in 64 bit mode even if we are only generating 32 bit addresses so switch over now to prepare for possibly using addresses above 4G for those targets that support them. 
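The twelve per-block enables in the a5xx_hw_init() hunk below could equally be written table-driven; a sketch (the register names are the ones the patch uses, the array itself is hypothetical):

	static const u32 a5xx_addr_mode_regs[] = {
		REG_A5XX_CP_ADDR_MODE_CNTL,
		REG_A5XX_VSC_ADDR_MODE_CNTL,
		REG_A5XX_GRAS_ADDR_MODE_CNTL,
		REG_A5XX_RB_ADDR_MODE_CNTL,
		REG_A5XX_PC_ADDR_MODE_CNTL,
		REG_A5XX_HLSQ_ADDR_MODE_CNTL,
		REG_A5XX_VFD_ADDR_MODE_CNTL,
		REG_A5XX_VPC_ADDR_MODE_CNTL,
		REG_A5XX_UCHE_ADDR_MODE_CNTL,
		REG_A5XX_SP_ADDR_MODE_CNTL,
		REG_A5XX_TPL1_ADDR_MODE_CNTL,
		REG_A5XX_RBBM_SECVID_TSB_ADDR_MODE_CNTL,
	};
	int i;

	/* Writing 0x1 selects 64 bit addressing in each block */
	for (i = 0; i < ARRAY_SIZE(a5xx_addr_mode_regs); i++)
		gpu_write(gpu, a5xx_addr_mode_regs[i], 0x1);

The patch writes them out longhand instead, which keeps each register greppable in the init sequence.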
Change-Id: Ic0dedbad7e527c4b1fe87878e943619c5e0ad869 Signed-off-by: Jordan Crouse --- drivers/gpu/drm/msm/adreno/a5xx_gpu.c | 14 ++++++++++++++ drivers/gpu/drm/msm/adreno/adreno_gpu.c | 9 +++++++++ 2 files changed, 23 insertions(+) diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c index 85fa69f2bbce..893969985c26 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c @@ -651,6 +651,20 @@ static int a5xx_hw_init(struct msm_gpu *gpu) REG_A5XX_RBBM_SECVID_TSB_TRUSTED_BASE_HI, 0x00000000); gpu_write(gpu, REG_A5XX_RBBM_SECVID_TSB_TRUSTED_SIZE, 0x00000000); + /* Put the GPU into 64 bit by default */ + gpu_write(gpu, REG_A5XX_CP_ADDR_MODE_CNTL, 0x1); + gpu_write(gpu, REG_A5XX_VSC_ADDR_MODE_CNTL, 0x1); + gpu_write(gpu, REG_A5XX_GRAS_ADDR_MODE_CNTL, 0x1); + gpu_write(gpu, REG_A5XX_RB_ADDR_MODE_CNTL, 0x1); + gpu_write(gpu, REG_A5XX_PC_ADDR_MODE_CNTL, 0x1); + gpu_write(gpu, REG_A5XX_HLSQ_ADDR_MODE_CNTL, 0x1); + gpu_write(gpu, REG_A5XX_VFD_ADDR_MODE_CNTL, 0x1); + gpu_write(gpu, REG_A5XX_VPC_ADDR_MODE_CNTL, 0x1); + gpu_write(gpu, REG_A5XX_UCHE_ADDR_MODE_CNTL, 0x1); + gpu_write(gpu, REG_A5XX_SP_ADDR_MODE_CNTL, 0x1); + gpu_write(gpu, REG_A5XX_TPL1_ADDR_MODE_CNTL, 0x1); + gpu_write(gpu, REG_A5XX_RBBM_SECVID_TSB_ADDR_MODE_CNTL, 0x1); + /* Load the GPMU firmware before starting the HW init */ a5xx_gpmu_ucode_init(gpu); diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c index 16eede8165bd..9afb4b75d991 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c @@ -548,6 +548,15 @@ int adreno_gpu_init(struct drm_device *drm, struct platform_device *pdev, adreno_gpu_config.va_start = SZ_16M; adreno_gpu_config.va_end = 0xffffffff; + if (adreno_gpu->revn >= 500) { + /* 5XX targets use a 64 bit region */ + adreno_gpu_config.va_start = 0x800000000; + adreno_gpu_config.va_end = 0x8ffffffff; + } else { + adreno_gpu_config.va_start = 0x300000; + adreno_gpu_config.va_end = 0xffffffff; + } + adreno_gpu_config.nr_rings = nr_rings; ret = msm_gpu_init(drm, pdev, &adreno_gpu->base, &funcs->base, -- cgit v1.2.3 From 663d4c0a64159fb15cc0817424081695901f39cb Mon Sep 17 00:00:00 2001 From: Jordan Crouse Date: Mon, 13 Feb 2017 10:14:28 -0700 Subject: iommu/arm-smmu: Add support for TTBR1 Allow a domain to opt into allocating and maintaining a TTBR1 pagetable. The size of the TTBR1 region will be the same as the TTBR0 size with the sign extension bit set on the highest bit in the region. For example, given a TTBR0/TTBR1 virtual address range of 36 bits, the memory map will look like this: TTBR0 [0x000000000:0x7FFFFFFFF] TTBR1 [0x800000000:0xFFFFFFFFF] The map/unmap operations will automatically use the appropriate pagetable for the given iova.
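The iova-to-pagetable routing then reduces to a sign-bit test; a sketch of the arm_lpae_get_table() helper the io-pgtable-arm.c hunk below adds (names match the patch):

	static inline arm_lpae_iopte *arm_lpae_get_table(
		struct arm_lpae_io_pgtable *data, unsigned long iova)
	{
		struct io_pgtable_cfg *cfg = &data->iop.cfg;

		/* The top bit of the ias-wide address selects TTBR1 */
		return ((cfg->quirks & IO_PGTABLE_QUIRK_ARM_TTBR1) &&
			(iova & (1UL << (cfg->ias - 1)))) ?
			data->pgd[1] : data->pgd[0];
	}

With a 36 bit address range, bit 35 is the selector, which yields the TTBR0/TTBR1 split shown above.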
Change-Id: Ic0dedbad2b2c58cd9c47ce31356472e0463d4228 Signed-off-by: Jordan Crouse --- drivers/iommu/arm-smmu.c | 29 ++++---- drivers/iommu/io-pgtable-arm.c | 158 ++++++++++++++++++++++++++++++++++++----- drivers/iommu/io-pgtable.h | 1 + include/linux/iommu.h | 1 + 4 files changed, 160 insertions(+), 29 deletions(-) diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c index ce15e150277e..ce1eb562be36 100644 --- a/drivers/iommu/arm-smmu.c +++ b/drivers/iommu/arm-smmu.c @@ -249,17 +249,6 @@ #define RESUME_RETRY (0 << 0) #define RESUME_TERMINATE (1 << 0) -#define TTBCR2_SEP_SHIFT 15 -#define TTBCR2_SEP_UPSTREAM (0x7 << TTBCR2_SEP_SHIFT) - -#define TTBCR2_SEP_31 0 -#define TTBCR2_SEP_35 1 -#define TTBCR2_SEP_39 2 -#define TTBCR2_SEP_41 3 -#define TTBCR2_SEP_43 4 -#define TTBCR2_SEP_47 5 -#define TTBCR2_SEP_NOSIGN 7 - #define TTBRn_ASID_SHIFT 48 #define FSR_MULTI (1 << 31) @@ -1614,7 +1603,6 @@ static void arm_smmu_init_context_bank(struct arm_smmu_domain *smmu_domain, writel_relaxed(reg, cb_base + ARM_SMMU_CB_TTBCR); if (smmu->version > ARM_SMMU_V1) { reg = pgtbl_cfg->arm_lpae_s1_cfg.tcr >> 32; - reg |= TTBCR2_SEP_UPSTREAM; writel_relaxed(reg, cb_base + ARM_SMMU_CB_TTBCR2); } } else { @@ -1745,7 +1733,9 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain, struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct arm_smmu_cfg *cfg = &smmu_domain->cfg; bool is_fast = smmu_domain->attributes & (1 << DOMAIN_ATTR_FAST); - unsigned long quirks = 0; + unsigned long quirks = + smmu_domain->attributes & (1 << DOMAIN_ATTR_ENABLE_TTBR1) ? + IO_PGTABLE_QUIRK_ARM_TTBR1 : 0; if (smmu_domain->smmu) goto out; @@ -1837,6 +1827,7 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain, }; fmt = ARM_MSM_SECURE; } else { + smmu_domain->pgtbl_cfg = (struct io_pgtable_cfg) { .quirks = quirks, .pgsize_bitmap = arm_smmu_ops.pgsize_bitmap, @@ -3140,6 +3131,12 @@ static int arm_smmu_domain_get_attr(struct iommu_domain *domain, & (1 << DOMAIN_ATTR_PAGE_TABLE_FORCE_COHERENT)); ret = 0; break; + case DOMAIN_ATTR_ENABLE_TTBR1: + *((int *)data) = !!(smmu_domain->attributes + & (1 << DOMAIN_ATTR_ENABLE_TTBR1)); + ret = 0; + break; + default: ret = -ENODEV; break; @@ -3283,6 +3280,12 @@ static int arm_smmu_domain_set_attr(struct iommu_domain *domain, ret = 0; break; } + case DOMAIN_ATTR_ENABLE_TTBR1: + if (*((int *)data)) + smmu_domain->attributes |= + 1 << DOMAIN_ATTR_ENABLE_TTBR1; + ret = 0; + break; default: ret = -ENODEV; break; diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 0d057ca92972..5f2b66286c0c 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -131,14 +131,21 @@ #define ARM_LPAE_TCR_TG0_64K (1 << 14) #define ARM_LPAE_TCR_TG0_16K (2 << 14) +#define ARM_LPAE_TCR_TG1_16K 1ULL +#define ARM_LPAE_TCR_TG1_4K 2ULL +#define ARM_LPAE_TCR_TG1_64K 3ULL + #define ARM_LPAE_TCR_SH0_SHIFT 12 #define ARM_LPAE_TCR_SH0_MASK 0x3 +#define ARM_LPAE_TCR_SH1_SHIFT 28 #define ARM_LPAE_TCR_SH_NS 0 #define ARM_LPAE_TCR_SH_OS 2 #define ARM_LPAE_TCR_SH_IS 3 #define ARM_LPAE_TCR_ORGN0_SHIFT 10 +#define ARM_LPAE_TCR_ORGN1_SHIFT 26 #define ARM_LPAE_TCR_IRGN0_SHIFT 8 +#define ARM_LPAE_TCR_IRGN1_SHIFT 24 #define ARM_LPAE_TCR_RGN_MASK 0x3 #define ARM_LPAE_TCR_RGN_NC 0 #define ARM_LPAE_TCR_RGN_WBWA 1 @@ -151,6 +158,9 @@ #define ARM_LPAE_TCR_T0SZ_SHIFT 0 #define ARM_LPAE_TCR_SZ_MASK 0xf +#define ARM_LPAE_TCR_T1SZ_SHIFT 16 +#define ARM_LPAE_TCR_T1SZ_MASK 0x3f + #define ARM_LPAE_TCR_PS_SHIFT 16 #define ARM_LPAE_TCR_PS_MASK 0x7 @@ 
-167,6 +177,16 @@ #define ARM_LPAE_TCR_EPD1_SHIFT 23 #define ARM_LPAE_TCR_EPD1_FAULT 1 +#define ARM_LPAE_TCR_SEP_SHIFT (15 + 32) + +#define ARM_LPAE_TCR_SEP_31 0ULL +#define ARM_LPAE_TCR_SEP_35 1ULL +#define ARM_LPAE_TCR_SEP_39 2ULL +#define ARM_LPAE_TCR_SEP_41 3ULL +#define ARM_LPAE_TCR_SEP_43 4ULL +#define ARM_LPAE_TCR_SEP_47 5ULL +#define ARM_LPAE_TCR_SEP_UPSTREAM 7ULL + #define ARM_LPAE_MAIR_ATTR_SHIFT(n) ((n) << 3) #define ARM_LPAE_MAIR_ATTR_MASK 0xff #define ARM_LPAE_MAIR_ATTR_DEVICE 0x04 @@ -206,7 +226,7 @@ struct arm_lpae_io_pgtable { unsigned long pg_shift; unsigned long bits_per_level; - void *pgd; + void *pgd[2]; }; typedef u64 arm_lpae_iopte; @@ -524,14 +544,26 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data, return pte; } +static inline arm_lpae_iopte *arm_lpae_get_table( + struct arm_lpae_io_pgtable *data, unsigned long iova) +{ + struct io_pgtable_cfg *cfg = &data->iop.cfg; + + return ((cfg->quirks & IO_PGTABLE_QUIRK_ARM_TTBR1) && + (iova & (1UL << (cfg->ias - 1)))) ? + data->pgd[1] : data->pgd[0]; +} + static int arm_lpae_map(struct io_pgtable_ops *ops, unsigned long iova, phys_addr_t paddr, size_t size, int iommu_prot) { struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops); - arm_lpae_iopte *ptep = data->pgd; + arm_lpae_iopte *ptep; int ret, lvl = ARM_LPAE_START_LVL(data); arm_lpae_iopte prot; + ptep = arm_lpae_get_table(data, iova); + /* If no access, then nothing to do */ if (!(iommu_prot & (IOMMU_READ | IOMMU_WRITE))) return 0; @@ -554,7 +586,7 @@ static int arm_lpae_map_sg(struct io_pgtable_ops *ops, unsigned long iova, { struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops); struct io_pgtable_cfg *cfg = &data->iop.cfg; - arm_lpae_iopte *ptep = data->pgd; + arm_lpae_iopte *ptep; int lvl = ARM_LPAE_START_LVL(data); arm_lpae_iopte prot; struct scatterlist *s; @@ -563,6 +595,8 @@ static int arm_lpae_map_sg(struct io_pgtable_ops *ops, unsigned long iova, unsigned int min_pagesz; struct map_state ms; + ptep = arm_lpae_get_table(data, iova); + /* If no access, then nothing to do */ if (!(iommu_prot & (IOMMU_READ | IOMMU_WRITE))) goto out_err; @@ -672,7 +706,10 @@ static void arm_lpae_free_pgtable(struct io_pgtable *iop) { struct arm_lpae_io_pgtable *data = io_pgtable_to_data(iop); - __arm_lpae_free_pgtable(data, ARM_LPAE_START_LVL(data), data->pgd); + __arm_lpae_free_pgtable(data, ARM_LPAE_START_LVL(data), data->pgd[0]); + if (data->pgd[1]) + __arm_lpae_free_pgtable(data, ARM_LPAE_START_LVL(data), + data->pgd[1]); kfree(data); } @@ -800,9 +837,11 @@ static size_t arm_lpae_unmap(struct io_pgtable_ops *ops, unsigned long iova, size_t unmapped = 0; struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops); struct io_pgtable *iop = &data->iop; - arm_lpae_iopte *ptep = data->pgd; + arm_lpae_iopte *ptep; int lvl = ARM_LPAE_START_LVL(data); + ptep = arm_lpae_get_table(data, iova); + while (unmapped < size) { size_t ret, size_to_unmap, remaining; @@ -828,7 +867,10 @@ static int arm_lpae_iova_to_pte(struct arm_lpae_io_pgtable *data, unsigned long iova, int *plvl_ret, arm_lpae_iopte *ptep_ret) { - arm_lpae_iopte pte, *ptep = data->pgd; + arm_lpae_iopte pte, *ptep; + + ptep = arm_lpae_get_table(data, iova); + *plvl_ret = ARM_LPAE_START_LVL(data); *ptep_ret = 0; @@ -994,6 +1036,71 @@ arm_lpae_alloc_pgtable(struct io_pgtable_cfg *cfg) return data; } +static u64 arm64_lpae_setup_ttbr1(struct io_pgtable_cfg *cfg, + struct arm_lpae_io_pgtable *data) + +{ + u64 reg; + + /* If TTBR1 is disabled, disable speculative walks through the TTBR1 
*/ + if (!(cfg->quirks & IO_PGTABLE_QUIRK_ARM_TTBR1)) { + reg = ARM_LPAE_TCR_EPD1; + reg |= (ARM_LPAE_TCR_SEP_UPSTREAM << ARM_LPAE_TCR_SEP_SHIFT); + return reg; + } + + if (cfg->iommu_dev && cfg->iommu_dev->archdata.dma_coherent) + reg = (ARM_LPAE_TCR_SH_OS << ARM_LPAE_TCR_SH1_SHIFT) | + (ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_IRGN1_SHIFT) | + (ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_ORGN1_SHIFT); + else + reg = (ARM_LPAE_TCR_SH_OS << ARM_LPAE_TCR_SH1_SHIFT) | + (ARM_LPAE_TCR_RGN_NC << ARM_LPAE_TCR_IRGN1_SHIFT) | + (ARM_LPAE_TCR_RGN_NC << ARM_LPAE_TCR_ORGN1_SHIFT); + + switch (1 << data->pg_shift) { + case SZ_4K: + reg |= (ARM_LPAE_TCR_TG1_4K << 30); + break; + case SZ_16K: + reg |= (ARM_LPAE_TCR_TG1_16K << 30); + break; + case SZ_64K: + reg |= (ARM_LPAE_TCR_TG1_64K << 30); + break; + } + + /* Set T1SZ */ + reg |= (64ULL - cfg->ias) << ARM_LPAE_TCR_T1SZ_SHIFT; + + /* Set the SEP bit based on the size */ + switch (cfg->ias) { + case 32: + reg |= (ARM_LPAE_TCR_SEP_31 << ARM_LPAE_TCR_SEP_SHIFT); + break; + case 36: + reg |= (ARM_LPAE_TCR_SEP_35 << ARM_LPAE_TCR_SEP_SHIFT); + break; + case 40: + reg |= (ARM_LPAE_TCR_SEP_39 << ARM_LPAE_TCR_SEP_SHIFT); + break; + case 42: + reg |= (ARM_LPAE_TCR_SEP_41 << ARM_LPAE_TCR_SEP_SHIFT); + break; + case 44: + reg |= (ARM_LPAE_TCR_SEP_43 << ARM_LPAE_TCR_SEP_SHIFT); + break; + case 48: + reg |= (ARM_LPAE_TCR_SEP_47 << ARM_LPAE_TCR_SEP_SHIFT); + break; + default: + reg |= (ARM_LPAE_TCR_SEP_UPSTREAM << ARM_LPAE_TCR_SEP_SHIFT); + break; + } + + return reg; +} + static struct io_pgtable * arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie) { @@ -1050,8 +1157,9 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie) reg |= (64ULL - cfg->ias) << ARM_LPAE_TCR_T0SZ_SHIFT; - /* Disable speculative walks through TTBR1 */ - reg |= ARM_LPAE_TCR_EPD1; + /* Bring in the TTBR1 configuration */ + reg |= arm64_lpae_setup_ttbr1(cfg, data); + cfg->arm_lpae_s1_cfg.tcr = reg; /* MAIRs */ @@ -1066,16 +1174,33 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie) cfg->arm_lpae_s1_cfg.mair[1] = 0; /* Looking good; allocate a pgd */ - data->pgd = __arm_lpae_alloc_pages(data->pgd_size, GFP_KERNEL, cfg, cookie); - if (!data->pgd) + data->pgd[0] = __arm_lpae_alloc_pages(data->pgd_size, GFP_KERNEL, cfg, + cookie); + if (!data->pgd[0]) goto out_free_data; + + if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_TTBR1) { + data->pgd[1] = __arm_lpae_alloc_pages(data->pgd_size, + GFP_KERNEL, cfg, cookie); + if (!data->pgd[1]) { + __arm_lpae_free_pages(data->pgd[0], data->pgd_size, cfg, + cookie); + goto out_free_data; + } + } else { + data->pgd[1] = NULL; + } + /* Ensure the empty pgd is visible before any actual TTBR write */ wmb(); /* TTBRs */ - cfg->arm_lpae_s1_cfg.ttbr[0] = virt_to_phys(data->pgd); - cfg->arm_lpae_s1_cfg.ttbr[1] = 0; + cfg->arm_lpae_s1_cfg.ttbr[0] = virt_to_phys(data->pgd[0]); + + if (data->pgd[1]) + cfg->arm_lpae_s1_cfg.ttbr[1] = virt_to_phys(data->pgd[1]); + return &data->iop; out_free_data: @@ -1155,15 +1280,16 @@ arm_64_lpae_alloc_pgtable_s2(struct io_pgtable_cfg *cfg, void *cookie) cfg->arm_lpae_s2_cfg.vtcr = reg; /* Allocate pgd pages */ - data->pgd = __arm_lpae_alloc_pages(data->pgd_size, GFP_KERNEL, cfg, cookie); - if (!data->pgd) + data->pgd[0] = __arm_lpae_alloc_pages(data->pgd_size, GFP_KERNEL, cfg, + cookie); + if (!data->pgd[0]) goto out_free_data; /* Ensure the empty pgd is visible before any actual TTBR write */ wmb(); /* VTTBR */ - cfg->arm_lpae_s2_cfg.vttbr = virt_to_phys(data->pgd); + cfg->arm_lpae_s2_cfg.vttbr 
= virt_to_phys(data->pgd[0]); return &data->iop; out_free_data: @@ -1261,7 +1387,7 @@ static void __init arm_lpae_dump_ops(struct io_pgtable_ops *ops) cfg->pgsize_bitmap, cfg->ias); pr_err("data: %d levels, 0x%zx pgd_size, %lu pg_shift, %lu bits_per_level, pgd @ %p\n", data->levels, data->pgd_size, data->pg_shift, - data->bits_per_level, data->pgd); + data->bits_per_level, data->pgd[0]); } #define __FAIL(ops, i) ({ \ diff --git a/drivers/iommu/io-pgtable.h b/drivers/iommu/io-pgtable.h index a3f366f559a7..f4533040806f 100644 --- a/drivers/iommu/io-pgtable.h +++ b/drivers/iommu/io-pgtable.h @@ -62,6 +62,7 @@ struct io_pgtable_cfg { */ #define IO_PGTABLE_QUIRK_ARM_NS (1 << 0) /* Set NS bit in PTEs */ #define IO_PGTABLE_QUIRK_PAGE_TABLE_COHERENT (1 << 1) + #define IO_PGTABLE_QUIRK_ARM_TTBR1 (1 << 2) /* Allocate TTBR1 PT */ int quirks; unsigned long pgsize_bitmap; unsigned int ias; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 1b3f20e8fb74..c4c25651ff21 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -136,6 +136,7 @@ enum iommu_attr { DOMAIN_ATTR_EARLY_MAP, DOMAIN_ATTR_PAGE_TABLE_IS_COHERENT, DOMAIN_ATTR_PAGE_TABLE_FORCE_COHERENT, + DOMAIN_ATTR_ENABLE_TTBR1, DOMAIN_ATTR_MAX, }; -- cgit v1.2.3 From 231c57eeaf8e10ec2a4510ffc98382ef1d7513ed Mon Sep 17 00:00:00 2001 From: Jordan Crouse Date: Mon, 13 Feb 2017 10:14:28 -0700 Subject: drm/msm: Pass the MMU domain index in struct msm_file_private Pass the index of the MMU domain in struct msm_file_private instead of assuming gpu->id throughout the submit path. Change-Id: Ic0dedbad3761b0f72ad6b1789f69458896214239 Signed-off-by: Jordan Crouse --- drivers/gpu/drm/msm/adreno/a5xx_gpu.c | 8 +------- drivers/gpu/drm/msm/adreno/adreno_gpu.c | 6 +----- drivers/gpu/drm/msm/adreno/adreno_gpu.h | 3 +-- drivers/gpu/drm/msm/msm_drv.c | 19 +++++++++---------- drivers/gpu/drm/msm/msm_drv.h | 7 +------ drivers/gpu/drm/msm/msm_gem.h | 2 +- drivers/gpu/drm/msm/msm_gem_submit.c | 12 ++++++------ drivers/gpu/drm/msm/msm_gpu.c | 10 ++++------ drivers/gpu/drm/msm/msm_gpu.h | 6 ++---- 9 files changed, 26 insertions(+), 47 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c index 893969985c26..687ff90a2a7e 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c @@ -42,13 +42,10 @@ static void a5xx_flush(struct msm_gpu *gpu, struct msm_ringbuffer *ring) gpu_write(gpu, REG_A5XX_CP_RB_WPTR, wptr); } -static int a5xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, - struct msm_file_private *ctx) +static int a5xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); struct a5xx_gpu *a5xx_gpu = to_a5xx_gpu(adreno_gpu); - - struct msm_drm_private *priv = gpu->dev->dev_private; struct msm_ringbuffer *ring = gpu->rb[submit->ring]; unsigned int i, ibs = 0; @@ -81,9 +78,6 @@ static int a5xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, switch (submit->cmd[i].type) { case MSM_SUBMIT_CMD_IB_TARGET_BUF: break; - case MSM_SUBMIT_CMD_CTX_RESTORE_BUF: - if (priv->lastctx == ctx) - break; case MSM_SUBMIT_CMD_BUF: OUT_PKT7(ring, CP_INDIRECT_BUFFER_PFE, 3); OUT_RING(ring, lower_32_bits(submit->cmd[i].iova)); diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c index 9afb4b75d991..0db0b70edab7 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c @@ -170,11 +170,9 @@ void adreno_recover(struct msm_gpu 
*gpu) enable_irq(gpu->irq); } -int adreno_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, - struct msm_file_private *ctx) +int adreno_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); - struct msm_drm_private *priv = gpu->dev->dev_private; struct msm_ringbuffer *ring = gpu->rb[submit->ring]; unsigned i, ibs = 0; @@ -184,8 +182,6 @@ int adreno_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, /* ignore IB-targets */ break; case MSM_SUBMIT_CMD_CTX_RESTORE_BUF: - /* ignore if there has not been a ctx switch: */ - if (priv->lastctx == ctx) break; case MSM_SUBMIT_CMD_BUF: OUT_PKT3(ring, adreno_is_a430(adreno_gpu) ? diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.h b/drivers/gpu/drm/msm/adreno/adreno_gpu.h index c7a9506397f6..95389fc453b4 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.h +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.h @@ -215,8 +215,7 @@ uint32_t adreno_last_fence(struct msm_gpu *gpu, struct msm_ringbuffer *ring); uint32_t adreno_submitted_fence(struct msm_gpu *gpu, struct msm_ringbuffer *ring); void adreno_recover(struct msm_gpu *gpu); -int adreno_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, - struct msm_file_private *ctx); +int adreno_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit); void adreno_flush(struct msm_gpu *gpu, struct msm_ringbuffer *ring); bool adreno_idle(struct msm_gpu *gpu, struct msm_ringbuffer *ring); #ifdef CONFIG_DEBUG_FS diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c index 940674c8d759..d08f8ee1aaaa 100644 --- a/drivers/gpu/drm/msm/msm_drv.c +++ b/drivers/gpu/drm/msm/msm_drv.c @@ -565,16 +565,20 @@ static int msm_open(struct drm_device *dev, struct drm_file *file) if (!ctx) return -ENOMEM; - file->driver_priv = ctx; - if (dev && dev->dev_private) { struct msm_drm_private *priv = dev->dev_private; struct msm_kms *kms; - kms = priv->kms; - if (kms && kms->funcs && kms->funcs->postopen) - kms->funcs->postopen(kms, file); + if (priv) { + ctx->aspace = priv->gpu->aspace; + kms = priv->kms; + + if (kms && kms->funcs && kms->funcs->postopen) + kms->funcs->postopen(kms, file); + } } + + file->driver_priv = ctx; return 0; } @@ -596,11 +600,6 @@ static void msm_postclose(struct drm_device *dev, struct drm_file *file) if (kms && kms->funcs && kms->funcs->postclose) kms->funcs->postclose(kms, file); - mutex_lock(&dev->struct_mutex); - if (ctx == priv->lastctx) - priv->lastctx = NULL; - mutex_unlock(&dev->struct_mutex); - kfree(ctx); } diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h index 3df2c7e9b4b6..3efd4466d4bc 100644 --- a/drivers/gpu/drm/msm/msm_drv.h +++ b/drivers/gpu/drm/msm/msm_drv.h @@ -74,11 +74,7 @@ struct msm_gem_vma; #define MAX_CONNECTORS 8 struct msm_file_private { - /* currently we don't do anything useful with this.. but when - * per-context address spaces are supported we'd keep track of - * the context's page-tables here. 
- */ - int dummy; + struct msm_gem_address_space *aspace; }; enum msm_mdp_plane_property { @@ -278,7 +274,6 @@ struct msm_drm_private { /* when we have more than one 'msm_gpu' these need to be an array: */ struct msm_gpu *gpu; - struct msm_file_private *lastctx; struct drm_fb_helper *fbdev; diff --git a/drivers/gpu/drm/msm/msm_gem.h b/drivers/gpu/drm/msm/msm_gem.h index 8d9976ce8feb..e70e8833b30c 100644 --- a/drivers/gpu/drm/msm/msm_gem.h +++ b/drivers/gpu/drm/msm/msm_gem.h @@ -116,7 +116,7 @@ static inline uint32_t msm_gem_fence(struct msm_gem_object *msm_obj, */ struct msm_gem_submit { struct drm_device *dev; - struct msm_gpu *gpu; + struct msm_gem_address_space *aspace; struct list_head node; /* node in gpu submit_list */ struct list_head bo_list; struct ww_acquire_ctx ticket; diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index 7c83a9acd3e6..c04c3c8e0eca 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -34,7 +34,7 @@ static inline void __user *to_user_ptr(u64 address) } static struct msm_gem_submit *submit_create(struct drm_device *dev, - struct msm_gpu *gpu, int nr) + struct msm_gem_address_space *aspace, int nr) { struct msm_gem_submit *submit; int sz = sizeof(*submit) + (nr * sizeof(submit->bos[0])); @@ -42,7 +42,7 @@ static struct msm_gem_submit *submit_create(struct drm_device *dev, submit = kmalloc(sz, GFP_TEMPORARY | __GFP_NOWARN | __GFP_NORETRY); if (submit) { submit->dev = dev; - submit->gpu = gpu; + submit->aspace = aspace; /* initially, until copy_from_user() and bo lookup succeeds: */ submit->nr_bos = 0; @@ -142,7 +142,7 @@ static void submit_unlock_unpin_bo(struct msm_gem_submit *submit, int i) struct msm_gem_object *msm_obj = submit->bos[i].obj; if (submit->bos[i].flags & BO_PINNED) - msm_gem_put_iova(&msm_obj->base, submit->gpu->aspace); + msm_gem_put_iova(&msm_obj->base, submit->aspace); if (submit->bos[i].flags & BO_LOCKED) ww_mutex_unlock(&msm_obj->resv->lock); @@ -181,7 +181,7 @@ retry: /* if locking succeeded, pin bo: */ ret = msm_gem_get_iova_locked(&msm_obj->base, - submit->gpu->aspace, &iova); + submit->aspace, &iova); /* this would break the logic in the fail path.. 
there is no * reason for this to happen, but just to be on the safe side @@ -361,7 +361,7 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, mutex_lock(&dev->struct_mutex); - submit = submit_create(dev, gpu, args->nr_bos); + submit = submit_create(dev, ctx->aspace, args->nr_bos); if (!submit) { ret = -ENOMEM; goto out; @@ -440,7 +440,7 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, (args->flags & MSM_SUBMIT_RING_MASK) >> MSM_SUBMIT_RING_SHIFT, 0, gpu->nr_rings - 1); - ret = msm_gpu_submit(gpu, submit, ctx); + ret = msm_gpu_submit(gpu, submit); args->fence = submit->fence; diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c index b95f51f2e4be..3b0c591122d0 100644 --- a/drivers/gpu/drm/msm/msm_gpu.c +++ b/drivers/gpu/drm/msm/msm_gpu.c @@ -301,7 +301,7 @@ static void recover_worker(struct work_struct *work) /* replay the remaining submits for all rings: */ list_for_each_entry(submit, &gpu->submit_list, node) { - gpu->funcs->submit(gpu, submit, NULL); + gpu->funcs->submit(gpu, submit); } } mutex_unlock(&dev->struct_mutex); @@ -525,8 +525,7 @@ void msm_gpu_retire(struct msm_gpu *gpu) } /* add bo's to gpu's ring, and kick gpu: */ -int msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, - struct msm_file_private *ctx) +int msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit) { struct drm_device *dev = gpu->dev; struct msm_drm_private *priv = dev->dev_private; @@ -561,7 +560,7 @@ int msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, /* ring takes a reference to the bo and iova: */ drm_gem_object_reference(&msm_obj->base); msm_gem_get_iova_locked(&msm_obj->base, - submit->gpu->aspace, &iova); + submit->aspace, &iova); } if (submit->bos[i].flags & MSM_SUBMIT_BO_READ) @@ -571,8 +570,7 @@ int msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, msm_gem_move_to_active(&msm_obj->base, gpu, true, submit->fence); } - ret = gpu->funcs->submit(gpu, submit, ctx); - priv->lastctx = ctx; + ret = gpu->funcs->submit(gpu, submit); hangcheck_timer_reset(gpu); diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h index 70a0ccc3dc0c..e6699c286da0 100644 --- a/drivers/gpu/drm/msm/msm_gpu.h +++ b/drivers/gpu/drm/msm/msm_gpu.h @@ -55,8 +55,7 @@ struct msm_gpu_funcs { int (*hw_init)(struct msm_gpu *gpu); int (*pm_suspend)(struct msm_gpu *gpu); int (*pm_resume)(struct msm_gpu *gpu); - int (*submit)(struct msm_gpu *gpu, struct msm_gem_submit *submit, - struct msm_file_private *ctx); + int (*submit)(struct msm_gpu *gpu, struct msm_gem_submit *submit); void (*flush)(struct msm_gpu *gpu, struct msm_ringbuffer *ring); irqreturn_t (*irq)(struct msm_gpu *irq); uint32_t (*last_fence)(struct msm_gpu *gpu, @@ -241,8 +240,7 @@ int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime, uint32_t *totaltime, uint32_t ncntrs, uint32_t *cntrs); void msm_gpu_retire(struct msm_gpu *gpu); -int msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, - struct msm_file_private *ctx); +int msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit); int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev, struct msm_gpu *gpu, const struct msm_gpu_funcs *funcs, -- cgit v1.2.3 From 7591b1ab7db50ca1d8b8e934129e8572de1f5b6d Mon Sep 17 00:00:00 2001 From: Jordan Crouse Date: Mon, 13 Feb 2017 10:14:29 -0700 Subject: drm/msm: Support dynamic IOMMU domains Dynamic IOMMU domains allow multiple pagetables to be attached to the same IOMMU device. 
These can be used by smart devices like the GPU that can switch the pagetable dynamically between DRM instances. Add support for dynamic IOMMU domains if they are enabled and supported by your friendly neighborhood IOMMU driver. Change-Id: Ic0dedbaded3a9e57a7fbb8e745c44c183f6b4655 Signed-off-by: Jordan Crouse --- drivers/gpu/drm/msm/adreno/adreno_gpu.c | 3 +- drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.c | 3 +- drivers/gpu/drm/msm/msm_iommu.c | 137 ++++++++++++++++++++++++++++---- drivers/gpu/drm/msm/msm_iommu.h | 34 ++++++++ drivers/gpu/drm/msm/msm_mmu.h | 3 +- drivers/gpu/drm/msm/msm_smmu.c | 2 +- drivers/gpu/drm/msm/sde/sde_kms.c | 3 +- 7 files changed, 163 insertions(+), 22 deletions(-) create mode 100644 drivers/gpu/drm/msm/msm_iommu.h diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c index 0db0b70edab7..93d495770b32 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c @@ -625,8 +625,7 @@ void adreno_gpu_cleanup(struct adreno_gpu *gpu) msm_gpu_cleanup(&gpu->base); if (aspace) { - aspace->mmu->funcs->detach(aspace->mmu, - iommu_ports, ARRAY_SIZE(iommu_ports)); + aspace->mmu->funcs->detach(aspace->mmu); msm_gem_address_space_destroy(aspace); } } diff --git a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.c b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.c index ede681e12a68..2e5754bd8280 100644 --- a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.c +++ b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.c @@ -136,8 +136,7 @@ static void mdp5_destroy(struct msm_kms *kms) mdp5_irq_domain_fini(mdp5_kms); if (aspace) { - aspace->mmu->funcs->detach(aspace->mmu, - iommu_ports, ARRAY_SIZE(iommu_ports)); + aspace->mmu->funcs->detach(aspace->mmu); msm_gem_address_space_destroy(aspace); } diff --git a/drivers/gpu/drm/msm/msm_iommu.c b/drivers/gpu/drm/msm/msm_iommu.c index b4a5015ff922..acbab7494de4 100644 --- a/drivers/gpu/drm/msm/msm_iommu.c +++ b/drivers/gpu/drm/msm/msm_iommu.c @@ -17,13 +17,7 @@ #include #include "msm_drv.h" -#include "msm_mmu.h" - -struct msm_iommu { - struct msm_mmu base; - struct iommu_domain *domain; -}; -#define to_msm_iommu(x) container_of(x, struct msm_iommu, base) +#include "msm_iommu.h" static int msm_fault_handler(struct iommu_domain *iommu, struct device *dev, unsigned long iova, int flags, void *arg) @@ -32,9 +26,9 @@ static int msm_fault_handler(struct iommu_domain *iommu, struct device *dev, return 0; } -static int msm_iommu_attach(struct msm_mmu *mmu, const char **names, int cnt) +static int _attach_iommu_device(struct msm_mmu *mmu, + struct iommu_domain *domain, const char **names, int cnt) { - struct msm_iommu *iommu = to_msm_iommu(mmu); int i; /* See if there is a iommus member in the current device. 
If not, look @@ -42,7 +36,7 @@ static int msm_iommu_attach(struct msm_mmu *mmu, const char **names, int cnt) */ if (of_find_property(mmu->dev->of_node, "iommus", NULL)) - return iommu_attach_device(iommu->domain, mmu->dev); + return iommu_attach_device(domain, mmu->dev); /* Look through the list of names for a target */ for (i = 0; i < cnt; i++) { @@ -65,7 +59,7 @@ static int msm_iommu_attach(struct msm_mmu *mmu, const char **names, int cnt) continue; mmu->dev = &pdev->dev; - return iommu_attach_device(iommu->domain, mmu->dev); + return iommu_attach_device(domain, mmu->dev); } } @@ -73,7 +67,67 @@ static int msm_iommu_attach(struct msm_mmu *mmu, const char **names, int cnt) return -ENODEV; } -static void msm_iommu_detach(struct msm_mmu *mmu, const char **names, int cnt) +static int msm_iommu_attach(struct msm_mmu *mmu, const char **names, int cnt) +{ + struct msm_iommu *iommu = to_msm_iommu(mmu); + int val = 1, ret; + + /* Hope springs eternal */ + iommu->allow_dynamic = true; + + /* per-instance pagetables need TTBR1 support in the IOMMU driver */ + ret = iommu_domain_set_attr(iommu->domain, + DOMAIN_ATTR_ENABLE_TTBR1, &val); + if (ret) + iommu->allow_dynamic = false; + + /* Attach the device to the domain */ + ret = _attach_iommu_device(mmu, iommu->domain, names, cnt); + if (ret) + return ret; + + /* + * Get the context bank for the base domain; this will be shared with + * the children. + */ + iommu->cb = -1; + if (iommu_domain_get_attr(iommu->domain, DOMAIN_ATTR_CONTEXT_BANK, + &iommu->cb)) + iommu->allow_dynamic = false; + + return 0; +} + +static int msm_iommu_attach_dynamic(struct msm_mmu *mmu, const char **names, + int cnt) +{ + static unsigned int procid; + struct msm_iommu *iommu = to_msm_iommu(mmu); + int ret; + unsigned int id; + + /* Assign a unique procid for the domain to cut down on TLB churn */ + id = ++procid; + + iommu_domain_set_attr(iommu->domain, DOMAIN_ATTR_PROCID, &id); + + ret = iommu_attach_device(iommu->domain, mmu->dev); + if (ret) + return ret; + + /* + * Get the TTBR0 and the CONTEXTIDR - these will be used by the GPU to + * switch the pagetable on its own. 
+	 */
+	iommu_domain_get_attr(iommu->domain, DOMAIN_ATTR_TTBR0,
+		&iommu->ttbr0);
+	iommu_domain_get_attr(iommu->domain, DOMAIN_ATTR_CONTEXTIDR,
+		&iommu->contextidr);
+
+	return 0;
+}
+
+static void msm_iommu_detach(struct msm_mmu *mmu)
 {
 	struct msm_iommu *iommu = to_msm_iommu(mmu);
 	iommu_detach_device(iommu->domain, mmu->dev);
@@ -160,7 +214,16 @@ static const struct msm_mmu_funcs funcs = {
 	.destroy = msm_iommu_destroy,
 };
 
-struct msm_mmu *msm_iommu_new(struct device *dev, struct iommu_domain *domain)
+static const struct msm_mmu_funcs dynamic_funcs = {
+	.attach = msm_iommu_attach_dynamic,
+	.detach = msm_iommu_detach,
+	.map = msm_iommu_map,
+	.unmap = msm_iommu_unmap,
+	.destroy = msm_iommu_destroy,
+};
+
+struct msm_mmu *_msm_iommu_new(struct device *dev, struct iommu_domain *domain,
+	const struct msm_mmu_funcs *funcs)
 {
 	struct msm_iommu *iommu;
@@ -169,8 +232,54 @@ struct msm_mmu *msm_iommu_new(struct device *dev, struct iommu_domain *domain)
 		return ERR_PTR(-ENOMEM);
 
 	iommu->domain = domain;
-	msm_mmu_init(&iommu->base, dev, &funcs);
+	msm_mmu_init(&iommu->base, dev, funcs);
 	iommu_set_fault_handler(domain, msm_fault_handler, dev);
 
 	return &iommu->base;
 }
+
+struct msm_mmu *msm_iommu_new(struct device *dev, struct iommu_domain *domain)
+{
+	return _msm_iommu_new(dev, domain, &funcs);
+}
+
+/*
+ * Given a base domain that is attached to an IOMMU device, try to create a
+ * dynamic domain that is also attached to the same device but allocates a new
+ * pagetable. This is used to allow multiple pagetables to be attached to the
+ * same device.
+ */
+struct msm_mmu *msm_iommu_new_dynamic(struct msm_mmu *base)
+{
+	struct msm_iommu *base_iommu = to_msm_iommu(base);
+	struct iommu_domain *domain;
+	struct msm_mmu *mmu;
+	int ret, val = 1;
+
+	/* Don't continue if the base domain didn't have the support we need */
+	if (!base || base_iommu->allow_dynamic == false)
+		return ERR_PTR(-EOPNOTSUPP);
+
+	domain = iommu_domain_alloc(&platform_bus_type);
+	if (!domain)
+		return ERR_PTR(-ENODEV);
+
+	mmu = _msm_iommu_new(base->dev, domain, &dynamic_funcs);
+
+	if (IS_ERR(mmu)) {
+		if (domain)
+			iommu_domain_free(domain);
+		return mmu;
+	}
+
+	ret = iommu_domain_set_attr(domain, DOMAIN_ATTR_DYNAMIC, &val);
+	if (ret) {
+		msm_iommu_destroy(mmu);
+		return ERR_PTR(ret);
+	}
+
+	/* Set the context bank to match the base domain */
+	iommu_domain_set_attr(domain, DOMAIN_ATTR_CONTEXT_BANK,
+		&base_iommu->cb);
+
+	return mmu;
+}
diff --git a/drivers/gpu/drm/msm/msm_iommu.h b/drivers/gpu/drm/msm/msm_iommu.h
new file mode 100644
index 000000000000..bfb75015efbc
--- /dev/null
+++ b/drivers/gpu/drm/msm/msm_iommu.h
@@ -0,0 +1,34 @@
+/* Copyright (c) 2016 The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _MSM_IOMMU_H_
+#define _MSM_IOMMU_H_
+
+#include "msm_mmu.h"
+
+struct msm_iommu {
+	struct msm_mmu base;
+	struct iommu_domain *domain;
+	int cb;
+	phys_addr_t ttbr0;
+	uint32_t contextidr;
+	bool allow_dynamic;
+};
+#define to_msm_iommu(x) container_of(x, struct msm_iommu, base)
+
+static inline bool msm_iommu_allow_dynamic(struct msm_mmu *mmu)
+{
+	struct msm_iommu *iommu = to_msm_iommu(mmu);
+
+	return iommu->allow_dynamic;
+}
+#endif
diff --git a/drivers/gpu/drm/msm/msm_mmu.h b/drivers/gpu/drm/msm/msm_mmu.h
index 8464b9e04b63..501f12bef00d 100644
--- a/drivers/gpu/drm/msm/msm_mmu.h
+++ b/drivers/gpu/drm/msm/msm_mmu.h
@@ -32,7 +32,7 @@ enum msm_mmu_domain_type {
 struct msm_mmu_funcs {
 	int (*attach)(struct msm_mmu *mmu, const char **names, int cnt);
-	void (*detach)(struct msm_mmu *mmu, const char **names, int cnt);
+	void (*detach)(struct msm_mmu *mmu);
 	int (*map)(struct msm_mmu *mmu, uint64_t iova, struct sg_table *sgt,
 			int prot);
 	int (*unmap)(struct msm_mmu *mmu, uint64_t iova, struct sg_table *sgt);
@@ -62,5 +62,6 @@ static inline void msm_mmu_init(struct msm_mmu *mmu, struct device *dev,
 struct msm_mmu *msm_iommu_new(struct device *dev, struct iommu_domain *domain);
 struct msm_mmu *msm_smmu_new(struct device *dev,
 		enum msm_mmu_domain_type domain);
+struct msm_mmu *msm_iommu_new_dynamic(struct msm_mmu *orig);
 
 #endif /* __MSM_MMU_H__ */
diff --git a/drivers/gpu/drm/msm/msm_smmu.c b/drivers/gpu/drm/msm/msm_smmu.c
index 500e8a5e6247..c99f51e09700 100644
--- a/drivers/gpu/drm/msm/msm_smmu.c
+++ b/drivers/gpu/drm/msm/msm_smmu.c
@@ -86,7 +86,7 @@ static int msm_smmu_attach(struct msm_mmu *mmu, const char **names, int cnt)
 	return 0;
 }
 
-static void msm_smmu_detach(struct msm_mmu *mmu, const char **names, int cnt)
+static void msm_smmu_detach(struct msm_mmu *mmu)
 {
 	struct msm_smmu *smmu = to_msm_smmu(mmu);
 	struct msm_smmu_client *client = msm_smmu_to_client(smmu);
diff --git a/drivers/gpu/drm/msm/sde/sde_kms.c b/drivers/gpu/drm/msm/sde/sde_kms.c
index e906b66dcd95..4907b3233894 100644
--- a/drivers/gpu/drm/msm/sde/sde_kms.c
+++ b/drivers/gpu/drm/msm/sde/sde_kms.c
@@ -946,8 +946,7 @@ static int _sde_kms_mmu_destroy(struct sde_kms *sde_kms)
 
 		mmu = sde_kms->aspace[i]->mmu;
 
-		mmu->funcs->detach(mmu, (const char **)iommu_ports,
-				ARRAY_SIZE(iommu_ports));
+		mmu->funcs->detach(mmu);
 		msm_gem_address_space_destroy(sde_kms->aspace[i]);
 
 		sde_kms->aspace[i] = NULL;
-- cgit v1.2.3

From 64eeed7a4b465870b77d8832537124c4822f07b7 Mon Sep 17 00:00:00 2001
From: Jordan Crouse
Date: Mon, 13 Feb 2017 10:14:29 -0700
Subject: drm/msm: Support per-instance pagetables

Support per-instance pagetables for 5XX targets.

Per-instance pagetables allow each open DRM instance to have its own VM
memory space to prevent accidentally or maliciously copying or overwriting
buffers from other instances. It also opens the door for SVM, since any
given CPU-side address can be more reliably mapped into the instance's
GPU VM space without conflict.

To support this, create a new dynamic domain (pagetable) for each open DRM
file and map buffer objects for each instance into that pagetable. Use the
GPU to switch to the pagetable for the instance while doing a submit.
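In outline, the open() path gives each DRM file its own address space; a
condensed sketch of the flow this patch implements (locking and error
unwinding trimmed - not the literal hunks):

	static struct msm_file_private *setup_pagetable_sketch(
			struct msm_drm_private *priv)
	{
		struct msm_file_private *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

		if (!ctx)
			return ERR_PTR(-ENOMEM);

		/* One dynamic domain (private pagetable) per open DRM file */
		ctx->aspace = msm_gem_address_space_create_instance(
			priv->gpu->aspace->mmu, "gpu",
			0x100000000, 0x1ffffffff);

		/* No dynamic domain support: fall back to the shared pagetable */
		if (IS_ERR(ctx->aspace))
			ctx->aspace = priv->gpu->aspace;

		return ctx;
	}

Each submit then resolves its buffers against submit->aspace, and the 5XX
submit path programs the matching TTBR0 with CP_SMMU_TABLE_UPDATE before
the IBs run.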
Change-Id: Ic0dedbad22d157d514ed1628b83e8cded5490dec
Signed-off-by: Jordan Crouse
---
 drivers/gpu/drm/msm/adreno/a5xx_gpu.c     | 63 ++++++++++++++++++++++
 drivers/gpu/drm/msm/adreno/a5xx_gpu.h     | 17 ++++++++
 drivers/gpu/drm/msm/adreno/a5xx_preempt.c | 62 ++++++++++++++++++++++-----
 drivers/gpu/drm/msm/adreno/adreno_gpu.c   |  2 +-
 drivers/gpu/drm/msm/adreno/adreno_gpu.h   |  2 +
 drivers/gpu/drm/msm/msm_drv.c             | 69 ++++++++++++++++++++++++-------
 drivers/gpu/drm/msm/msm_drv.h             |  3 ++
 drivers/gpu/drm/msm/msm_gem_vma.c         | 13 ++++++
 8 files changed, 205 insertions(+), 26 deletions(-)

diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
index 687ff90a2a7e..7add88f62bb0 100644
--- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
@@ -12,6 +12,7 @@
  */
 
 #include "msm_gem.h"
+#include "msm_iommu.h"
 #include "a5xx_gpu.h"
 
 extern bool hang_debug;
@@ -42,6 +43,66 @@ static void a5xx_flush(struct msm_gpu *gpu, struct msm_ringbuffer *ring)
 	gpu_write(gpu, REG_A5XX_CP_RB_WPTR, wptr);
 }
 
+static void a5xx_set_pagetable(struct msm_gpu *gpu, struct msm_ringbuffer *ring,
+	struct msm_gem_address_space *aspace)
+{
+	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
+	struct msm_mmu *mmu = aspace->mmu;
+	struct msm_iommu *iommu = to_msm_iommu(mmu);
+
+	if (!iommu->ttbr0)
+		return;
+
+	/* Turn off protected mode */
+	OUT_PKT7(ring, CP_SET_PROTECTED_MODE, 1);
+	OUT_RING(ring, 0);
+
+	/* Turn on APRIV mode to access critical regions */
+	OUT_PKT4(ring, REG_A5XX_CP_CNTL, 1);
+	OUT_RING(ring, 1);
+
+	/* Make sure the ME is synchronized before starting the update */
+	OUT_PKT7(ring, CP_WAIT_FOR_ME, 0);
+
+	/* Execute the table update */
+	OUT_PKT7(ring, CP_SMMU_TABLE_UPDATE, 3);
+	OUT_RING(ring, lower_32_bits(iommu->ttbr0));
+	OUT_RING(ring, upper_32_bits(iommu->ttbr0));
+	OUT_RING(ring, iommu->contextidr);
+
+	/*
+	 * Write the new TTBR0 to the preemption records - this will be used to
+	 * reload the pagetable if the current ring gets preempted out.
+	 */
+	OUT_PKT7(ring, CP_MEM_WRITE, 4);
+	OUT_RING(ring, lower_32_bits(rbmemptr(adreno_gpu, ring->id, ttbr0)));
+	OUT_RING(ring, upper_32_bits(rbmemptr(adreno_gpu, ring->id, ttbr0)));
+	OUT_RING(ring, lower_32_bits(iommu->ttbr0));
+	OUT_RING(ring, upper_32_bits(iommu->ttbr0));
+
+	/* Also write the current contextidr (ASID) */
+	OUT_PKT7(ring, CP_MEM_WRITE, 3);
+	OUT_RING(ring, lower_32_bits(rbmemptr(adreno_gpu, ring->id,
+		contextidr)));
+	OUT_RING(ring, upper_32_bits(rbmemptr(adreno_gpu, ring->id,
+		contextidr)));
+	OUT_RING(ring, iommu->contextidr);
+
+	/* Invalidate the draw state so we start off fresh */
+	OUT_PKT7(ring, CP_SET_DRAW_STATE, 3);
+	OUT_RING(ring, 0x40000);
+	OUT_RING(ring, 1);
+	OUT_RING(ring, 0);
+
+	/* Turn off APRIV */
+	OUT_PKT4(ring, REG_A5XX_CP_CNTL, 1);
+	OUT_RING(ring, 0);
+
+	/* Turn protected mode back on */
+	OUT_PKT7(ring, CP_SET_PROTECTED_MODE, 1);
+	OUT_RING(ring, 1);
+}
+
 static int a5xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
 {
 	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
@@ -49,6 +110,8 @@ static int a5xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
 	struct msm_ringbuffer *ring = gpu->rb[submit->ring];
 	unsigned int i, ibs = 0;
 
+	a5xx_set_pagetable(gpu, ring, submit->aspace);
+
 	OUT_PKT7(ring, CP_PREEMPT_ENABLE_GLOBAL, 1);
 	OUT_RING(ring, 0x02);
 
diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.h b/drivers/gpu/drm/msm/adreno/a5xx_gpu.h
index a0f8bb2ac42b..6327cde998a0 100644
--- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.h
+++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.h
@@ -53,6 +53,9 @@ struct a5xx_gpu {
 
 	atomic_t preempt_state;
 	struct timer_list preempt_timer;
+	struct a5xx_smmu_info *smmu_info;
+	struct drm_gem_object *smmu_info_bo;
+	uint64_t smmu_info_iova;
 };
 
 #define to_a5xx_gpu(x) container_of(x, struct a5xx_gpu, base)
@@ -133,6 +136,20 @@ struct a5xx_preempt_record {
  */
#define A5XX_PREEMPT_COUNTER_SIZE (16 * 4)
 
+/*
+ * This is a global structure that the preemption code uses to switch in the
+ * pagetable for the preempted process - the code switches in whatever we
+ * need after preempting in a new ring.
+ */ +struct a5xx_smmu_info { + uint32_t magic; + uint32_t _pad4; + uint64_t ttbr0; + uint32_t asid; + uint32_t contextidr; +}; + +#define A5XX_SMMU_INFO_MAGIC 0x3618CDA3UL int a5xx_power_init(struct msm_gpu *gpu); void a5xx_gpmu_ucode_init(struct msm_gpu *gpu); diff --git a/drivers/gpu/drm/msm/adreno/a5xx_preempt.c b/drivers/gpu/drm/msm/adreno/a5xx_preempt.c index 9403853d86c4..648494c75abc 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_preempt.c +++ b/drivers/gpu/drm/msm/adreno/a5xx_preempt.c @@ -12,6 +12,7 @@ */ #include "msm_gem.h" +#include "msm_iommu.h" #include "a5xx_gpu.h" static void *alloc_kernel_bo(struct drm_device *drm, struct msm_gpu *gpu, @@ -172,6 +173,17 @@ void a5xx_preempt_trigger(struct msm_gpu *gpu) a5xx_gpu->preempt[ring->id]->wptr = get_wptr(ring); spin_unlock_irqrestore(&ring->lock, flags); + /* Do read barrier to make sure we have updated pagetable info */ + rmb(); + + /* Set the SMMU info for the preemption */ + if (a5xx_gpu->smmu_info) { + a5xx_gpu->smmu_info->ttbr0 = + adreno_gpu->memptrs->ttbr0[ring->id]; + a5xx_gpu->smmu_info->contextidr = + adreno_gpu->memptrs->contextidr[ring->id]; + } + /* Set the address of the incoming preemption record */ gpu_write64(gpu, REG_A5XX_CP_CONTEXT_SWITCH_RESTORE_ADDR_LO, REG_A5XX_CP_CONTEXT_SWITCH_RESTORE_ADDR_HI, @@ -247,9 +259,10 @@ void a5xx_preempt_hw_init(struct msm_gpu *gpu) } } - /* Write a 0 to signal that we aren't switching pagetables */ + /* Tell the CP where to find the smmu_info buffer */ gpu_write64(gpu, REG_A5XX_CP_CONTEXT_SWITCH_SMMU_INFO_LO, - REG_A5XX_CP_CONTEXT_SWITCH_SMMU_INFO_HI, 0); + REG_A5XX_CP_CONTEXT_SWITCH_SMMU_INFO_HI, + a5xx_gpu->smmu_info_iova); /* Reset the preemption state */ set_preempt_state(a5xx_gpu, PREEMPT_NONE); @@ -309,6 +322,13 @@ void a5xx_preempt_fini(struct msm_gpu *gpu) a5xx_gpu->preempt_bo[i] = NULL; } + + if (a5xx_gpu->smmu_info_bo) { + if (a5xx_gpu->smmu_info_iova) + msm_gem_put_iova(a5xx_gpu->smmu_info_bo, gpu->aspace); + drm_gem_object_unreference_unlocked(a5xx_gpu->smmu_info_bo); + a5xx_gpu->smmu_info_bo = NULL; + } } void a5xx_preempt_init(struct msm_gpu *gpu) @@ -316,6 +336,9 @@ void a5xx_preempt_init(struct msm_gpu *gpu) struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); struct a5xx_gpu *a5xx_gpu = to_a5xx_gpu(adreno_gpu); struct msm_ringbuffer *ring; + struct a5xx_smmu_info *ptr; + struct drm_gem_object *bo; + uint64_t iova; int i; /* No preemption if we only have one ring */ @@ -326,18 +349,35 @@ void a5xx_preempt_init(struct msm_gpu *gpu) if (!ring) continue; - if (preempt_init_ring(a5xx_gpu, ring)) { - /* - * On any failure our adventure is over. Clean up and - * set nr_rings to 1 to force preemption off - */ - a5xx_preempt_fini(gpu); - gpu->nr_rings = 1; + if (preempt_init_ring(a5xx_gpu, ring)) + goto fail; + } + + if (msm_iommu_allow_dynamic(gpu->aspace->mmu)) { + ptr = alloc_kernel_bo(gpu->dev, gpu, + sizeof(struct a5xx_smmu_info), + MSM_BO_UNCACHED | MSM_BO_PRIVILEGED, + &bo, &iova); - return; - } + if (IS_ERR(ptr)) + goto fail; + + ptr->magic = A5XX_SMMU_INFO_MAGIC; + + a5xx_gpu->smmu_info_bo = bo; + a5xx_gpu->smmu_info_iova = iova; + a5xx_gpu->smmu_info = ptr; } setup_timer(&a5xx_gpu->preempt_timer, a5xx_preempt_timer, (unsigned long) a5xx_gpu); + + return; +fail: + /* + * On any failure our adventure is over. 
Clean up and + * set nr_rings to 1 to force preemption off + */ + a5xx_preempt_fini(gpu); + gpu->nr_rings = 1; } diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c index 93d495770b32..6600c62288d5 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c @@ -616,7 +616,7 @@ void adreno_gpu_cleanup(struct adreno_gpu *gpu) if (gpu->memptrs_bo) { if (gpu->memptrs_iova) - msm_gem_put_iova(gpu->memptrs_bo, gpu->base.aspace); + msm_gem_put_iova(gpu->memptrs_bo, aspace); drm_gem_object_unreference_unlocked(gpu->memptrs_bo); } release_firmware(gpu->pm4); diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.h b/drivers/gpu/drm/msm/adreno/adreno_gpu.h index 95389fc453b4..55602d20a205 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.h +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.h @@ -95,6 +95,8 @@ const struct adreno_info *adreno_info(struct adreno_rev rev); struct adreno_rbmemptrs { volatile uint32_t rptr[MSM_GPU_MAX_RINGS]; volatile uint32_t fence[MSM_GPU_MAX_RINGS]; + volatile uint64_t ttbr0[MSM_GPU_MAX_RINGS]; + volatile unsigned int contextidr[MSM_GPU_MAX_RINGS]; }; struct adreno_gpu { diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c index d08f8ee1aaaa..bc014c547908 100644 --- a/drivers/gpu/drm/msm/msm_drv.c +++ b/drivers/gpu/drm/msm/msm_drv.c @@ -23,6 +23,8 @@ #include "sde_wb.h" #define TEARDOWN_DEADLOCK_RETRY_MAX 5 +#include "msm_gem.h" +#include "msm_mmu.h" static void msm_fb_output_poll_changed(struct drm_device *dev) { @@ -552,33 +554,65 @@ static void load_gpu(struct drm_device *dev) } #endif -static int msm_open(struct drm_device *dev, struct drm_file *file) +static struct msm_file_private *setup_pagetable(struct msm_drm_private *priv) { struct msm_file_private *ctx; - /* For now, load gpu on open.. to avoid the requirement of having - * firmware in the initrd. - */ - load_gpu(dev); + if (!priv || !priv->gpu) + return NULL; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) - return -ENOMEM; + return ERR_PTR(-ENOMEM); - if (dev && dev->dev_private) { - struct msm_drm_private *priv = dev->dev_private; - struct msm_kms *kms; + ctx->aspace = msm_gem_address_space_create_instance( + priv->gpu->aspace->mmu, "gpu", 0x100000000, 0x1ffffffff); - if (priv) { - ctx->aspace = priv->gpu->aspace; - kms = priv->kms; + if (IS_ERR(ctx->aspace)) { + int ret = PTR_ERR(ctx->aspace); - if (kms && kms->funcs && kms->funcs->postopen) - kms->funcs->postopen(kms, file); + /* + * If dynamic domains are not supported, everybody uses the + * same pagetable + */ + if (ret != -EOPNOTSUPP) { + kfree(ctx); + return ERR_PTR(ret); } + + ctx->aspace = priv->gpu->aspace; } + ctx->aspace->mmu->funcs->attach(ctx->aspace->mmu, NULL, 0); + return ctx; +} + +static int msm_open(struct drm_device *dev, struct drm_file *file) +{ + struct msm_file_private *ctx = NULL; + struct msm_drm_private *priv; + struct msm_kms *kms; + + if (!dev || !dev->dev_private) + return -ENODEV; + + priv = dev->dev_private; + /* For now, load gpu on open.. to avoid the requirement of having + * firmware in the initrd. 
+ */ + load_gpu(dev); + + ctx = setup_pagetable(priv); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + file->driver_priv = ctx; + + kms = priv->kms; + + if (kms && kms->funcs && kms->funcs->postopen) + kms->funcs->postopen(kms, file); + return 0; } @@ -600,6 +634,13 @@ static void msm_postclose(struct drm_device *dev, struct drm_file *file) if (kms && kms->funcs && kms->funcs->postclose) kms->funcs->postclose(kms, file); + mutex_lock(&dev->struct_mutex); + if (ctx && ctx->aspace && ctx->aspace != priv->gpu->aspace) { + ctx->aspace->mmu->funcs->detach(ctx->aspace->mmu); + msm_gem_address_space_destroy(ctx->aspace); + } + mutex_unlock(&dev->struct_mutex); + kfree(ctx); } diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h index 3efd4466d4bc..fe32ea1d4e00 100644 --- a/drivers/gpu/drm/msm/msm_drv.h +++ b/drivers/gpu/drm/msm/msm_drv.h @@ -411,6 +411,9 @@ void msm_gem_address_space_destroy(struct msm_gem_address_space *aspace); struct msm_gem_address_space * msm_gem_address_space_create(struct device *dev, struct iommu_domain *domain, const char *name); +struct msm_gem_address_space * +msm_gem_address_space_create_instance(struct msm_mmu *parent, const char *name, + uint64_t start, uint64_t end); /* For SDE display */ struct msm_gem_address_space * diff --git a/drivers/gpu/drm/msm/msm_gem_vma.c b/drivers/gpu/drm/msm/msm_gem_vma.c index 8ad96e9e2371..2baa5a3e6ce4 100644 --- a/drivers/gpu/drm/msm/msm_gem_vma.c +++ b/drivers/gpu/drm/msm/msm_gem_vma.c @@ -218,3 +218,16 @@ msm_gem_address_space_destroy(struct msm_gem_address_space *aspace) kfree(aspace); } + +/* Create a new dynamic instance */ +struct msm_gem_address_space * +msm_gem_address_space_create_instance(struct msm_mmu *parent, const char *name, + uint64_t start, uint64_t end) +{ + struct msm_mmu *child = msm_iommu_new_dynamic(parent); + + if (IS_ERR(child)) + return (struct msm_gem_address_space *) child; + + return msm_gem_address_space_new(child, name, start, end); +} -- cgit v1.2.3 From 75bc0cc55c2c563a0585846e5e8fe244de186580 Mon Sep 17 00:00:00 2001 From: Jordan Crouse Date: Mon, 13 Feb 2017 10:14:30 -0700 Subject: drm/msm: Reference count address spaces There are reasons for a memory object to outlive the file descriptor that created it and so the address space that a buffer object is attached to must also outlive the file descriptor. Reference count the address space so that it can remain viable until all the objects have released their addresses. 
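The mechanics are the stock kernel kref pattern, condensed here from the
hunks below:

	static void msm_gem_address_space_destroy(struct kref *kref)
	{
		struct msm_gem_address_space *aspace = container_of(kref,
			struct msm_gem_address_space, kref);

		if (aspace->ops->destroy)
			aspace->ops->destroy(aspace);

		kfree(aspace);
	}

	/*
	 * creation paths:   kref_init(&aspace->kref);
	 * each mapped vma:  kref_get(&aspace->kref);
	 * unmap/final put:  kref_put(&aspace->kref,
	 *                            msm_gem_address_space_destroy);
	 */

Every map takes a reference and every unmap drops one, so the address space
lives exactly as long as the last mapping that uses it.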
Change-Id: Ic0dedbad3769801b62152d81b37f2f43f962d308 Signed-off-by: Jordan Crouse --- drivers/gpu/drm/msm/adreno/adreno_gpu.c | 2 +- drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.c | 2 +- drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.c | 2 +- drivers/gpu/drm/msm/msm_drv.c | 2 +- drivers/gpu/drm/msm/msm_drv.h | 2 +- drivers/gpu/drm/msm/msm_gem.h | 2 ++ drivers/gpu/drm/msm/msm_gem_vma.c | 41 +++++++++++++++++++++++++-------- drivers/gpu/drm/msm/sde/sde_kms.c | 4 ++-- 8 files changed, 41 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c index 6600c62288d5..337ed53b7bc8 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c @@ -626,6 +626,6 @@ void adreno_gpu_cleanup(struct adreno_gpu *gpu) if (aspace) { aspace->mmu->funcs->detach(aspace->mmu); - msm_gem_address_space_destroy(aspace); + msm_gem_address_space_put(aspace); } } diff --git a/drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.c b/drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.c index da9d58eccc3a..b6cddee0cf34 100644 --- a/drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.c +++ b/drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.c @@ -204,7 +204,7 @@ static void mdp4_destroy(struct msm_kms *kms) if (aspace) { aspace->mmu->funcs->detach(aspace->mmu, iommu_ports, ARRAY_SIZE(iommu_ports)); - msm_gem_address_space_destroy(aspace); + msm_gem_address_space_put(aspace); } kfree(mdp4_kms); diff --git a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.c b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.c index 2e5754bd8280..e4e69ebd116e 100644 --- a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.c +++ b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.c @@ -137,7 +137,7 @@ static void mdp5_destroy(struct msm_kms *kms) if (aspace) { aspace->mmu->funcs->detach(aspace->mmu); - msm_gem_address_space_destroy(aspace); + msm_gem_address_space_put(aspace); } if (mdp5_kms->ctlm) diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c index bc014c547908..4d39eb439793 100644 --- a/drivers/gpu/drm/msm/msm_drv.c +++ b/drivers/gpu/drm/msm/msm_drv.c @@ -637,7 +637,7 @@ static void msm_postclose(struct drm_device *dev, struct drm_file *file) mutex_lock(&dev->struct_mutex); if (ctx && ctx->aspace && ctx->aspace != priv->gpu->aspace) { ctx->aspace->mmu->funcs->detach(ctx->aspace->mmu); - msm_gem_address_space_destroy(ctx->aspace); + msm_gem_address_space_put(ctx->aspace); } mutex_unlock(&dev->struct_mutex); diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h index fe32ea1d4e00..0a06c1cb27a4 100644 --- a/drivers/gpu/drm/msm/msm_drv.h +++ b/drivers/gpu/drm/msm/msm_drv.h @@ -405,7 +405,7 @@ int msm_gem_map_vma(struct msm_gem_address_space *aspace, struct msm_gem_vma *vma, struct sg_table *sgt, void *priv, unsigned int flags); -void msm_gem_address_space_destroy(struct msm_gem_address_space *aspace); +void msm_gem_address_space_put(struct msm_gem_address_space *aspace); /* For GPU and legacy display */ struct msm_gem_address_space * diff --git a/drivers/gpu/drm/msm/msm_gem.h b/drivers/gpu/drm/msm/msm_gem.h index e70e8833b30c..ac46c473791f 100644 --- a/drivers/gpu/drm/msm/msm_gem.h +++ b/drivers/gpu/drm/msm/msm_gem.h @@ -18,6 +18,7 @@ #ifndef __MSM_GEM_H__ #define __MSM_GEM_H__ +#include #include #include "msm_drv.h" @@ -38,6 +39,7 @@ struct msm_gem_address_space { const char *name; struct msm_mmu *mmu; const struct msm_gem_aspace_ops *ops; + struct kref kref; }; struct msm_gem_vma { diff --git a/drivers/gpu/drm/msm/msm_gem_vma.c b/drivers/gpu/drm/msm/msm_gem_vma.c index 2baa5a3e6ce4..7ca96831a9b3 100644 
--- a/drivers/gpu/drm/msm/msm_gem_vma.c +++ b/drivers/gpu/drm/msm/msm_gem_vma.c @@ -19,6 +19,24 @@ #include "msm_gem.h" #include "msm_mmu.h" +static void +msm_gem_address_space_destroy(struct kref *kref) +{ + struct msm_gem_address_space *aspace = container_of(kref, + struct msm_gem_address_space, kref); + + if (aspace->ops->destroy) + aspace->ops->destroy(aspace); + + kfree(aspace); +} + +void msm_gem_address_space_put(struct msm_gem_address_space *aspace) +{ + if (aspace) + kref_put(&aspace->kref, msm_gem_address_space_destroy); +} + /* SDE address space operations */ static void smmu_aspace_unmap_vma(struct msm_gem_address_space *aspace, struct msm_gem_vma *vma, struct sg_table *sgt, @@ -34,6 +52,8 @@ static void smmu_aspace_unmap_vma(struct msm_gem_address_space *aspace, DMA_BIDIRECTIONAL); vma->iova = 0; + + msm_gem_address_space_put(aspace); } @@ -54,6 +74,9 @@ static int smmu_aspace_map_vma(struct msm_gem_address_space *aspace, if (!ret) vma->iova = sg_dma_address(sgt->sgl); + /* Get a reference to the aspace to keep it around */ + kref_get(&aspace->kref); + return ret; } @@ -79,6 +102,8 @@ msm_gem_smmu_address_space_create(struct device *dev, struct msm_mmu *mmu, aspace->mmu = mmu; aspace->ops = &smmu_aspace_ops; + kref_init(&aspace->kref); + return aspace; } @@ -104,6 +129,8 @@ static void iommu_aspace_unmap_vma(struct msm_gem_address_space *aspace, drm_mm_remove_node(&vma->node); vma->iova = 0; + + msm_gem_address_space_put(aspace); } static int iommu_aspace_map_vma(struct msm_gem_address_space *aspace, @@ -139,6 +166,9 @@ static int iommu_aspace_map_vma(struct msm_gem_address_space *aspace, ret = aspace->mmu->funcs->map(aspace->mmu, vma->iova, sgt, iommu_flags); + /* Get a reference to the aspace to keep it around */ + kref_get(&aspace->kref); + return ret; } @@ -176,6 +206,8 @@ msm_gem_address_space_new(struct msm_mmu *mmu, const char *name, local->base.mmu = mmu; local->base.ops = &msm_iommu_aspace_ops; + kref_init(&local->base.kref); + return &local->base; } @@ -210,15 +242,6 @@ msm_gem_address_space_create(struct device *dev, struct iommu_domain *domain, domain->geometry.aperture_end); } -void -msm_gem_address_space_destroy(struct msm_gem_address_space *aspace) -{ - if (aspace && aspace->ops->destroy) - aspace->ops->destroy(aspace); - - kfree(aspace); -} - /* Create a new dynamic instance */ struct msm_gem_address_space * msm_gem_address_space_create_instance(struct msm_mmu *parent, const char *name, diff --git a/drivers/gpu/drm/msm/sde/sde_kms.c b/drivers/gpu/drm/msm/sde/sde_kms.c index 4907b3233894..576e7d1e7189 100644 --- a/drivers/gpu/drm/msm/sde/sde_kms.c +++ b/drivers/gpu/drm/msm/sde/sde_kms.c @@ -947,7 +947,7 @@ static int _sde_kms_mmu_destroy(struct sde_kms *sde_kms) mmu = sde_kms->aspace[i]->mmu; mmu->funcs->detach(mmu); - msm_gem_address_space_destroy(sde_kms->aspace[i]); + msm_gem_address_space_put(sde_kms->aspace[i]); sde_kms->aspace[i] = NULL; } @@ -986,7 +986,7 @@ static int _sde_kms_mmu_init(struct sde_kms *sde_kms) ARRAY_SIZE(iommu_ports)); if (ret) { SDE_ERROR("failed to attach iommu %d: %d\n", i, ret); - msm_gem_address_space_destroy(aspace); + msm_gem_address_space_put(aspace); goto fail; } -- cgit v1.2.3 From 8268f30aebac89703d540563299f0a414a7cb210 Mon Sep 17 00:00:00 2001 From: Jordan Crouse Date: Mon, 13 Feb 2017 10:14:30 -0700 Subject: msm/drm: Dynamically locate the clocks from the device tree Instead of using a fixed list of clock names, use the clock-names list in the device tree to discover and get the list of clocks that we need. 
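The discovery itself is the usual OF string-list walk; a condensed sketch of
the change (error handling trimmed, not the literal hunk):

	static int get_clocks_sketch(struct platform_device *pdev,
			struct msm_gpu *gpu)
	{
		struct device *dev = &pdev->dev;
		struct property *prop;
		const char *name;
		int i = 0;

		/* Size the array from the DT instead of a fixed table */
		gpu->nr_clocks = of_property_count_strings(dev->of_node,
			"clock-names");
		gpu->grp_clks = devm_kcalloc(dev, gpu->nr_clocks,
			sizeof(struct clk *), GFP_KERNEL);

		of_property_for_each_string(dev->of_node, "clock-names",
				prop, name) {
			/* Lookup failures become NULL, so optional clocks
			 * are quietly skipped by enable/disable
			 */
			gpu->grp_clks[i] = get_clock(dev, name);

			/* Remember handles that need explicit rate control */
			if (!strcmp(name, "core_clk"))
				gpu->core_clk = gpu->grp_clks[i];
			else if (!strcmp(name, "rbbmtimer_clk"))
				gpu->rbbmtimer_clk = gpu->grp_clks[i];
			i++;
		}

		return 0;
	}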
Change-Id: Ic0dedbad629743ff078177c301ffda3dbce88d3c Signed-off-by: Jordan Crouse --- drivers/gpu/drm/msm/msm_gpu.c | 82 ++++++++++++++++++++++++++++++------------- drivers/gpu/drm/msm/msm_gpu.h | 4 ++- 2 files changed, 61 insertions(+), 25 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c index 3b0c591122d0..000a369e2537 100644 --- a/drivers/gpu/drm/msm/msm_gpu.c +++ b/drivers/gpu/drm/msm/msm_gpu.c @@ -93,17 +93,17 @@ static int enable_clk(struct msm_gpu *gpu) uint32_t rate = gpu->gpufreq[gpu->active_level]; int i; - clk_set_rate(gpu->grp_clks[0], rate); + if (gpu->core_clk) + clk_set_rate(gpu->core_clk, rate); - if (gpu->grp_clks[3]) - clk_set_rate(gpu->grp_clks[3], 19200000); + if (gpu->rbbmtimer_clk) + clk_set_rate(gpu->rbbmtimer_clk, 19200000); - /* NOTE: kgsl_pwrctrl_clk() ignores grp_clks[0].. */ - for (i = ARRAY_SIZE(gpu->grp_clks) - 1; i > 0; i--) + for (i = gpu->nr_clocks - 1; i >= 0; i--) if (gpu->grp_clks[i]) clk_prepare(gpu->grp_clks[i]); - for (i = ARRAY_SIZE(gpu->grp_clks) - 1; i > 0; i--) + for (i = gpu->nr_clocks - 1; i >= 0; i--) if (gpu->grp_clks[i]) clk_enable(gpu->grp_clks[i]); @@ -115,16 +115,20 @@ static int disable_clk(struct msm_gpu *gpu) uint32_t rate = gpu->gpufreq[gpu->nr_pwrlevels - 1]; int i; - /* NOTE: kgsl_pwrctrl_clk() ignores grp_clks[0].. */ - for (i = ARRAY_SIZE(gpu->grp_clks) - 1; i > 0; i--) + for (i = gpu->nr_clocks - 1; i >= 0; i--) if (gpu->grp_clks[i]) clk_disable(gpu->grp_clks[i]); - for (i = ARRAY_SIZE(gpu->grp_clks) - 1; i > 0; i--) + for (i = gpu->nr_clocks - 1; i >= 0; i--) if (gpu->grp_clks[i]) clk_unprepare(gpu->grp_clks[i]); - clk_set_rate(gpu->grp_clks[0], rate); + if (gpu->core_clk) + clk_set_rate(gpu->core_clk, rate); + + if (gpu->rbbmtimer_clk) + clk_set_rate(gpu->rbbmtimer_clk, 0); + return 0; } @@ -587,10 +591,47 @@ static irqreturn_t irq_handler(int irq, void *data) return gpu->funcs->irq(gpu); } -static const char *clk_names[] = { - "src_clk", "core_clk", "iface_clk", "rbbmtimer_clk", - "mem_clk", "mem_iface_clk", "alt_mem_iface_clk", -}; +static struct clk *get_clock(struct device *dev, const char *name) +{ + struct clk *clk = devm_clk_get(dev, name); + + DBG("clks[%s]: %p", name, clk); + + return IS_ERR(clk) ? 
NULL : clk; +} + +static int get_clocks(struct platform_device *pdev, struct msm_gpu *gpu) +{ + struct device *dev = &pdev->dev; + struct property *prop; + const char *name; + int i = 0; + + gpu->nr_clocks = of_property_count_strings(dev->of_node, "clock-names"); + if (gpu->nr_clocks < 1) { + gpu->nr_clocks = 0; + return 0; + } + + gpu->grp_clks = devm_kcalloc(dev, sizeof(struct clk *), gpu->nr_clocks, + GFP_KERNEL); + if (!gpu->grp_clks) + return -ENOMEM; + + of_property_for_each_string(dev->of_node, "clock-names", prop, name) { + gpu->grp_clks[i] = get_clock(dev, name); + + /* Remember the key clocks that we need to control later */ + if (!strcmp(name, "core_clk")) + gpu->core_clk = gpu->grp_clks[i]; + else if (!strcmp(name, "rbbmtimer_clk")) + gpu->rbbmtimer_clk = gpu->grp_clks[i]; + + ++i; + } + + return 0; +} int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev, struct msm_gpu *gpu, const struct msm_gpu_funcs *funcs, @@ -621,7 +662,6 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev, spin_lock_init(&gpu->perf_lock); - BUG_ON(ARRAY_SIZE(clk_names) != ARRAY_SIZE(gpu->grp_clks)); /* Map registers: */ gpu->mmio = msm_ioremap(pdev, config->ioname, name); @@ -645,15 +685,9 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev, goto fail; } - /* Acquire clocks: */ - for (i = 0; i < ARRAY_SIZE(clk_names); i++) { - gpu->grp_clks[i] = devm_clk_get(&pdev->dev, clk_names[i]); - DBG("grp_clks[%s]: %p", clk_names[i], gpu->grp_clks[i]); - if (IS_ERR(gpu->grp_clks[i])) - gpu->grp_clks[i] = NULL; - } - - gpu->grp_clks[0] = gpu->grp_clks[1]; + ret = get_clocks(pdev, gpu); + if (ret) + goto fail; gpu->ebi1_clk = devm_clk_get(&pdev->dev, "bus_clk"); DBG("ebi1_clk: %p", gpu->ebi1_clk); diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h index e6699c286da0..bab4f9349e64 100644 --- a/drivers/gpu/drm/msm/msm_gpu.h +++ b/drivers/gpu/drm/msm/msm_gpu.h @@ -108,7 +108,9 @@ struct msm_gpu { /* Power Control: */ struct regulator *gpu_reg, *gpu_cx; - struct clk *ebi1_clk, *grp_clks[7]; + struct clk **grp_clks; + struct clk *ebi1_clk, *core_clk, *rbbmtimer_clk; + int nr_clocks; uint32_t gpufreq[10]; uint32_t busfreq[10]; -- cgit v1.2.3 From 7110a92f3da98b055c9c4e23b52d7620c2b78849 Mon Sep 17 00:00:00 2001 From: Jordan Crouse Date: Mon, 13 Feb 2017 10:14:31 -0700 Subject: drm/msm: Update the list of A5XX registers Update the list of the A5XX register ranges that can be read on a hang. The new list adds some registers that were previously missed, and omits registers that are write only. 
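For reference, these tables are pairs of inclusive start/end dword offsets
terminated by ~0; a sketch of how a dumper walks them (this mirrors the way
adreno_show()/adreno_dump() consume the list):

	static void dump_ranges_sketch(struct msm_gpu *gpu,
			const u32 *registers)
	{
		int i;

		/* Each pair is an inclusive [start, end] range of offsets */
		for (i = 0; registers[i] != ~0; i += 2) {
			u32 start = registers[i];
			u32 end = registers[i + 1];
			u32 addr;

			for (addr = start; addr <= end; addr++)
				printk("0x%04x: 0x%08x\n", addr,
					gpu_read(gpu, addr));
		}
	}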
Change-Id: Ic0dedbadaf6969892c0563d9cfd8fa2869008417 Signed-off-by: Jordan Crouse --- drivers/gpu/drm/msm/adreno/a5xx_gpu.c | 67 +++++++++++++++++++++-------------- 1 file changed, 40 insertions(+), 27 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c index 7add88f62bb0..cc4d378cf7e0 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c @@ -1066,33 +1066,46 @@ static const u32 a5xx_register_offsets[REG_ADRENO_REGISTER_MAX] = { }; static const u32 a5xx_registers[] = { - 0x0000, 0x0002, 0x0004, 0x0020, 0x0022, 0x0026, 0x0029, 0x002B, - 0x002E, 0x0035, 0x0038, 0x0042, 0x0044, 0x0044, 0x0047, 0x0095, - 0x0097, 0x00BB, 0x03A0, 0x0464, 0x0469, 0x046F, 0x04D2, 0x04D3, - 0x04E0, 0x0533, 0x0540, 0x0555, 0xF400, 0xF400, 0xF800, 0xF807, - 0x0800, 0x081A, 0x081F, 0x0841, 0x0860, 0x0860, 0x0880, 0x08A0, - 0x0B00, 0x0B12, 0x0B15, 0x0B28, 0x0B78, 0x0B7F, 0x0BB0, 0x0BBD, - 0x0BC0, 0x0BC6, 0x0BD0, 0x0C53, 0x0C60, 0x0C61, 0x0C80, 0x0C82, - 0x0C84, 0x0C85, 0x0C90, 0x0C98, 0x0CA0, 0x0CA0, 0x0CB0, 0x0CB2, - 0x2180, 0x2185, 0x2580, 0x2585, 0x0CC1, 0x0CC1, 0x0CC4, 0x0CC7, - 0x0CCC, 0x0CCC, 0x0CD0, 0x0CD8, 0x0CE0, 0x0CE5, 0x0CE8, 0x0CE8, - 0x0CEC, 0x0CF1, 0x0CFB, 0x0D0E, 0x2100, 0x211E, 0x2140, 0x2145, - 0x2500, 0x251E, 0x2540, 0x2545, 0x0D10, 0x0D17, 0x0D20, 0x0D23, - 0x0D30, 0x0D30, 0x20C0, 0x20C0, 0x24C0, 0x24C0, 0x0E40, 0x0E43, - 0x0E4A, 0x0E4A, 0x0E50, 0x0E57, 0x0E60, 0x0E7C, 0x0E80, 0x0E8E, - 0x0E90, 0x0E96, 0x0EA0, 0x0EA8, 0x0EB0, 0x0EB2, 0xE140, 0xE147, - 0xE150, 0xE187, 0xE1A0, 0xE1A9, 0xE1B0, 0xE1B6, 0xE1C0, 0xE1C7, - 0xE1D0, 0xE1D1, 0xE200, 0xE201, 0xE210, 0xE21C, 0xE240, 0xE268, - 0xE000, 0xE006, 0xE010, 0xE09A, 0xE0A0, 0xE0A4, 0xE0AA, 0xE0EB, - 0xE100, 0xE105, 0xE380, 0xE38F, 0xE3B0, 0xE3B0, 0xE400, 0xE405, - 0xE408, 0xE4E9, 0xE4F0, 0xE4F0, 0xE280, 0xE280, 0xE282, 0xE2A3, - 0xE2A5, 0xE2C2, 0xE940, 0xE947, 0xE950, 0xE987, 0xE9A0, 0xE9A9, - 0xE9B0, 0xE9B6, 0xE9C0, 0xE9C7, 0xE9D0, 0xE9D1, 0xEA00, 0xEA01, - 0xEA10, 0xEA1C, 0xEA40, 0xEA68, 0xE800, 0xE806, 0xE810, 0xE89A, - 0xE8A0, 0xE8A4, 0xE8AA, 0xE8EB, 0xE900, 0xE905, 0xEB80, 0xEB8F, - 0xEBB0, 0xEBB0, 0xEC00, 0xEC05, 0xEC08, 0xECE9, 0xECF0, 0xECF0, - 0xEA80, 0xEA80, 0xEA82, 0xEAA3, 0xEAA5, 0xEAC2, 0xA800, 0xA8FF, - 0xAC60, 0xAC60, 0xB000, 0xB97F, 0xB9A0, 0xB9BF, + 0x0000, 0x0002, 0x0004, 0x0020, 0x0022, 0x0026, 0x0029, 0x002b, + 0x002e, 0x0035, 0x0038, 0x0042, 0x0044, 0x0044, 0x0047, 0x0095, + 0x0097, 0x00bb, 0x03a0, 0x0464, 0x0469, 0x046f, 0x04d2, 0x04d3, + 0x04e0, 0x0533, 0x0540, 0x0555, 0x0800, 0x081a, 0x081f, 0x0841, + 0x0860, 0x0860, 0x0880, 0x08a0, 0x0b00, 0x0b12, 0x0b14, 0x0b28, + 0x0b78, 0x0b7f, 0x0bb0, 0x0bbd, 0x0bc0, 0x0bc6, 0x0bd0, 0x0c53, + 0x0c60, 0x0c61, 0x0c80, 0x0c82, 0x0c84, 0x0c85, 0x0c90, 0x0c9b, + 0x0ca0, 0x0ca0, 0x0cb0, 0x0cb2, 0x0cc1, 0x0cc1, 0x0cc4, 0x0cc7, + 0x0ccc, 0x0ccc, 0x0cd0, 0x0cdb, 0x0ce0, 0x0ce5, 0x0ce8, 0x0ce8, + 0x0cec, 0x0cf1, 0x0cfb, 0x0d0e, 0x0d10, 0x0d17, 0x0d20, 0x0d23, + 0x0d30, 0x0d30, 0x0e40, 0x0e43, 0x0e4a, 0x0e4a, 0x0e50, 0x0e57, + 0x0e60, 0x0e7c, 0x0e80, 0x0e8e, 0x0e90, 0x0e96, 0x0ea0, 0x0eab, + 0x0eb0, 0x0eb2, 0x2100, 0x211e, 0x2140, 0x2145, 0x2180, 0x2185, + 0x2500, 0x251e, 0x2540, 0x2545, 0x2580, 0x2585, 0x3000, 0x3014, + 0x3018, 0x302c, 0x3030, 0x3030, 0x3034, 0x3036, 0x303c, 0x303d, + 0x3040, 0x3040, 0x3042, 0x3042, 0x3049, 0x3049, 0x3058, 0x3058, + 0x305a, 0x3061, 0x3064, 0x3068, 0x306c, 0x306d, 0x3080, 0x3088, + 0x308b, 0x308c, 0x3090, 0x3094, 0x3098, 0x3098, 0x309c, 0x309c, + 0x3124, 0x3124, 0x340c, 0x340c, 
0x3410, 0x3410, 0x3800, 0x3801, + 0xa800, 0xa800, 0xa820, 0xa828, 0xa840, 0xa87d, 0xa880, 0xa88d, + 0xa890, 0xa8a3, 0xa8a8, 0xa8aa, 0xa8c0, 0xa8c3, 0xa8c6, 0xa8ca, + 0xa8cc, 0xa8cf, 0xa8d1, 0xa8d8, 0xa8dc, 0xa8dc, 0xa8e0, 0xa8f5, + 0xac00, 0xac06, 0xac40, 0xac47, 0xac60, 0xac62, 0xac80, 0xac82, + 0xb800, 0xb808, 0xb80c, 0xb812, 0xb814, 0xb817, 0xb900, 0xb904, + 0xb906, 0xb90a, 0xb90c, 0xb90f, 0xb920, 0xb924, 0xb926, 0xb92a, + 0xb92c, 0xb92f, 0xb940, 0xb944, 0xb946, 0xb94a, 0xb94c, 0xb94f, + 0xb960, 0xb964, 0xb966, 0xb96a, 0xb96c, 0xb96f, 0xb980, 0xb984, + 0xb986, 0xb98a, 0xb98c, 0xb98f, 0xb9a0, 0xb9b0, 0xb9b8, 0xb9ba, + 0xd200, 0xd23f, 0xe000, 0xe006, 0xe010, 0xe09a, 0xe0a0, 0xe0a4, + 0xe0aa, 0xe0eb, 0xe100, 0xe105, 0xe140, 0xe147, 0xe150, 0xe187, + 0xe1a0, 0xe1a9, 0xe1b0, 0xe1b6, 0xe1c0, 0xe1c7, 0xe1d0, 0xe1d1, + 0xe200, 0xe201, 0xe210, 0xe21c, 0xe240, 0xe268, 0xe280, 0xe280, + 0xe282, 0xe2a3, 0xe2a5, 0xe2c2, 0xe380, 0xe38f, 0xe3b0, 0xe3b0, + 0xe400, 0xe405, 0xe408, 0xe4e9, 0xe4f0, 0xe4f0, 0xe800, 0xe806, + 0xe810, 0xe89a, 0xe8a0, 0xe8a4, 0xe8aa, 0xe8eb, 0xe900, 0xe905, + 0xe940, 0xe947, 0xe950, 0xe987, 0xe9a0, 0xe9a9, 0xe9b0, 0xe9b6, + 0xe9c0, 0xe9c7, 0xe9d0, 0xe9d1, 0xea00, 0xea01, 0xea10, 0xea1c, + 0xea40, 0xea68, 0xea80, 0xea80, 0xea82, 0xeaa3, 0xeaa5, 0xeac2, + 0xeb80, 0xeb8f, 0xebb0, 0xebb0, 0xec00, 0xec05, 0xec08, 0xece9, + 0xecf0, 0xecf0, 0xf400, 0xf400, 0xf800, 0xf807, ~0 }; -- cgit v1.2.3 From 869486c969c819ef907437ac74381ffe1ae6f662 Mon Sep 17 00:00:00 2001 From: Jordan Crouse Date: Mon, 13 Feb 2017 10:14:31 -0700 Subject: drm/msm: Allow hardware clock gating to be toggled There are some use cases wherein we need to turn off hardware clock gating before reading certain registers. Modify the A5XX HWCG function to allow user to enable or disable clock gating at will. Change-Id: Ic0dedbade1264785b3436099e638a5678a62818f Signed-off-by: Jordan Crouse --- drivers/gpu/drm/msm/adreno/a5xx_gpu.c | 17 ++++++++++------- drivers/gpu/drm/msm/adreno/a5xx_gpu.h | 1 + drivers/gpu/drm/msm/adreno/a5xx_preempt.c | 4 ++++ 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c index cc4d378cf7e0..9933e932679a 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c @@ -302,22 +302,25 @@ static const struct { {REG_A5XX_RBBM_CLOCK_DELAY_VFD, 0x00002222} }; -static void a5xx_enable_hwcg(struct msm_gpu *gpu) +void a5xx_set_hwcg(struct msm_gpu *gpu, bool state) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); unsigned int i; for (i = 0; i < ARRAY_SIZE(a5xx_hwcg); i++) - gpu_write(gpu, a5xx_hwcg[i].offset, a5xx_hwcg[i].value); + gpu_write(gpu, a5xx_hwcg[i].offset, + state ? a5xx_hwcg[i].value : 0); /* There are a few additional registers just for A540 */ if (adreno_is_a540(adreno_gpu)) { - gpu_write(gpu, REG_A5XX_RBBM_CLOCK_DELAY_GPMU, 0x770); - gpu_write(gpu, REG_A5XX_RBBM_CLOCK_HYST_GPMU, 0x004); + gpu_write(gpu, REG_A5XX_RBBM_CLOCK_DELAY_GPMU, + state ? 0x770 : 0); + gpu_write(gpu, REG_A5XX_RBBM_CLOCK_HYST_GPMU, + state ? 0x004 : 0); } - gpu_write(gpu, REG_A5XX_RBBM_CLOCK_CNTL, 0xAAA8AA00); - gpu_write(gpu, REG_A5XX_RBBM_ISDB_CNT, 0x182); + gpu_write(gpu, REG_A5XX_RBBM_CLOCK_CNTL, state ? 0xAAA8AA00 : 0); + gpu_write(gpu, REG_A5XX_RBBM_ISDB_CNT, state ? 
0x182 : 0x180); } static int a5xx_me_init(struct msm_gpu *gpu) @@ -637,7 +640,7 @@ static int a5xx_hw_init(struct msm_gpu *gpu) gpu_write(gpu, REG_A5XX_RBBM_AHB_CNTL1, 0xA6FFFFFF); /* Enable HWCG */ - a5xx_enable_hwcg(gpu); + a5xx_set_hwcg(gpu, true); gpu_write(gpu, REG_A5XX_RBBM_AHB_CNTL2, 0x0000003F); diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.h b/drivers/gpu/drm/msm/adreno/a5xx_gpu.h index 6327cde998a0..366f37545589 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.h +++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.h @@ -167,6 +167,7 @@ static inline int spin_usecs(struct msm_gpu *gpu, uint32_t usecs, return -ETIMEDOUT; } +void a5xx_set_hwcg(struct msm_gpu *gpu, bool state); bool a5xx_idle(struct msm_gpu *gpu, struct msm_ringbuffer *ring); void a5xx_preempt_init(struct msm_gpu *gpu); diff --git a/drivers/gpu/drm/msm/adreno/a5xx_preempt.c b/drivers/gpu/drm/msm/adreno/a5xx_preempt.c index 648494c75abc..f8e6bc4dc432 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_preempt.c +++ b/drivers/gpu/drm/msm/adreno/a5xx_preempt.c @@ -46,6 +46,10 @@ static void *alloc_kernel_bo(struct drm_device *drm, struct msm_gpu *gpu, if (iova) *iova = _iova; + pr_err("[%ps] buffer size %x, iova [%llx : %llx]\n", + __builtin_return_address(0), size, + _iova, _iova+size-1); + return ptr; out: drm_gem_object_unreference_unlocked(_bo); -- cgit v1.2.3 From 20e281de4868145daafdf6e0da9b1924a869e152 Mon Sep 17 00:00:00 2001 From: Jordan Crouse Date: Tue, 21 Feb 2017 14:50:45 -0700 Subject: drm/msm: Add support for the QTI GPU snapshot format When a fault happens on the Adreno GPU we want to collect a considerable amount of information to diagnose the problem including registers, caches, and GPU memory structures (ringbuffers, etc). The snapshot collects all of this information following a GPU fault and encodes it into a binary file format that can be pulled from debugfs or extracted from a memory dump. This may seem a duplication of other debug methods (the ->show functions for example) and while that is true for small numbers of registers the snapshot goes much further - it collects hundreds (thousands) of registers in addition to memory and other structures that would be impractical to dump as ascii. The binary format allows for the snapshot to be easily shared and post-processed in different ways to extract patterns. Add the basic snapshot infrastructure and enable ringbuffer, register and shader bank collection for A5XX targets. 
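The exact section layout is defined by msm_snapshot_api.h (added below but
not reproduced in full here). Purely as an illustration of the idea - the
struct and field names in this sketch are hypothetical placeholders, not the
real API - a post-processor walks the dump section by section:

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	/* Hypothetical section header; see msm_snapshot_api.h for the
	 * real layout
	 */
	struct section_hdr {
		uint32_t magic;	/* per-section marker */
		uint32_t id;	/* registers, ringbuffer, shader bank, ... */
		uint32_t size;	/* total section size in bytes */
	};

	static void walk_snapshot(const uint8_t *buf, size_t len)
	{
		size_t off = 0;

		while (off + sizeof(struct section_hdr) <= len) {
			struct section_hdr hdr;

			memcpy(&hdr, buf + off, sizeof(hdr));
			if (hdr.size < sizeof(hdr))
				break;	/* truncated or corrupt dump */

			printf("section %u: %u bytes\n", hdr.id, hdr.size);
			off += hdr.size;	/* sections are self-describing */
		}
	}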
Change-Id: Ic0dedbadcf0513096d05870f522ac73da74ceb31 Signed-off-by: Jordan Crouse --- drivers/gpu/drm/msm/Makefile | 6 +- drivers/gpu/drm/msm/adreno/a5xx.xml.h | 147 ++++++ drivers/gpu/drm/msm/adreno/a5xx_gpu.c | 14 +- drivers/gpu/drm/msm/adreno/a5xx_gpu.h | 2 + drivers/gpu/drm/msm/adreno/a5xx_preempt.c | 4 - drivers/gpu/drm/msm/adreno/a5xx_snapshot.c | 796 +++++++++++++++++++++++++++++ drivers/gpu/drm/msm/adreno/adreno_gpu.c | 80 +++ drivers/gpu/drm/msm/adreno/adreno_gpu.h | 1 + drivers/gpu/drm/msm/msm_drv.c | 18 + drivers/gpu/drm/msm/msm_gpu.c | 6 + drivers/gpu/drm/msm/msm_gpu.h | 4 + drivers/gpu/drm/msm/msm_snapshot.c | 105 ++++ drivers/gpu/drm/msm/msm_snapshot.h | 85 +++ drivers/gpu/drm/msm/msm_snapshot_api.h | 121 +++++ 14 files changed, 1371 insertions(+), 18 deletions(-) create mode 100644 drivers/gpu/drm/msm/adreno/a5xx_snapshot.c create mode 100644 drivers/gpu/drm/msm/msm_snapshot.c create mode 100644 drivers/gpu/drm/msm/msm_snapshot.h create mode 100644 drivers/gpu/drm/msm/msm_snapshot_api.h diff --git a/drivers/gpu/drm/msm/Makefile b/drivers/gpu/drm/msm/Makefile index 6086c0f9f13c..b77fdd098471 100644 --- a/drivers/gpu/drm/msm/Makefile +++ b/drivers/gpu/drm/msm/Makefile @@ -55,7 +55,8 @@ msm_drm-y += adreno/adreno_device.o \ adreno/a4xx_gpu.o \ adreno/a5xx_gpu.o \ adreno/a5xx_power.o \ - adreno/a5xx_preempt.o + adreno/a5xx_preempt.o \ + adreno/a5xx_snapshot.o endif msm_drm-$(CONFIG_DRM_MSM_MDP4) += mdp/mdp4/mdp4_crtc.o \ @@ -131,6 +132,7 @@ msm_drm-$(CONFIG_DRM_MSM) += \ msm_perf.o \ msm_rd.o \ msm_ringbuffer.o \ - msm_prop.o + msm_prop.o \ + msm_snapshot.o obj-$(CONFIG_DRM_MSM) += msm_drm.o diff --git a/drivers/gpu/drm/msm/adreno/a5xx.xml.h b/drivers/gpu/drm/msm/adreno/a5xx.xml.h index bfee2fd83462..56dad2217289 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx.xml.h +++ b/drivers/gpu/drm/msm/adreno/a5xx.xml.h @@ -155,6 +155,114 @@ enum a5xx_depth_format { DEPTH5_32 = 4, }; +enum a5xx_debugbus { + A5XX_RBBM_DBGBUS_CP = 1, + A5XX_RBBM_DBGBUS_RBBM = 2, + A5XX_RBBM_DBGBUS_VBIF = 3, + A5XX_RBBM_DBGBUS_HLSQ = 4, + A5XX_RBBM_DBGBUS_UCHE = 5, + A5XX_RBBM_DBGBUS_DPM = 6, + A5XX_RBBM_DBGBUS_TESS = 7, + A5XX_RBBM_DBGBUS_PC = 8, + A5XX_RBBM_DBGBUS_VFDP = 9, + A5XX_RBBM_DBGBUS_VPC = 10, + A5XX_RBBM_DBGBUS_TSE = 11, + A5XX_RBBM_DBGBUS_RAS = 12, + A5XX_RBBM_DBGBUS_VSC = 13, + A5XX_RBBM_DBGBUS_COM = 14, + A5XX_RBBM_DBGBUS_DCOM = 15, + A5XX_RBBM_DBGBUS_LRZ = 16, + A5XX_RBBM_DBGBUS_A2D_DSP = 17, + A5XX_RBBM_DBGBUS_CCUFCHE = 18, + A5XX_RBBM_DBGBUS_GPMU = 19, + A5XX_RBBM_DBGBUS_RBP = 20, + A5XX_RBBM_DBGBUS_HM = 21, + A5XX_RBBM_DBGBUS_RBBM_CFG = 22, + A5XX_RBBM_DBGBUS_VBIF_CX = 23, + A5XX_RBBM_DBGBUS_GPC = 29, + A5XX_RBBM_DBGBUS_LARC = 30, + A5XX_RBBM_DBGBUS_HLSQ_SPTP = 31, + A5XX_RBBM_DBGBUS_RB_0 = 32, + A5XX_RBBM_DBGBUS_RB_1 = 33, + A5XX_RBBM_DBGBUS_RB_2 = 34, + A5XX_RBBM_DBGBUS_RB_3 = 35, + A5XX_RBBM_DBGBUS_CCU_0 = 40, + A5XX_RBBM_DBGBUS_CCU_1 = 41, + A5XX_RBBM_DBGBUS_CCU_2 = 42, + A5XX_RBBM_DBGBUS_CCU_3 = 43, + A5XX_RBBM_DBGBUS_A2D_RAS_0 = 48, + A5XX_RBBM_DBGBUS_A2D_RAS_1 = 49, + A5XX_RBBM_DBGBUS_A2D_RAS_2 = 50, + A5XX_RBBM_DBGBUS_A2D_RAS_3 = 51, + A5XX_RBBM_DBGBUS_VFD_0 = 56, + A5XX_RBBM_DBGBUS_VFD_1 = 57, + A5XX_RBBM_DBGBUS_VFD_2 = 58, + A5XX_RBBM_DBGBUS_VFD_3 = 59, + A5XX_RBBM_DBGBUS_SP_0 = 64, + A5XX_RBBM_DBGBUS_SP_1 = 65, + A5XX_RBBM_DBGBUS_SP_2 = 66, + A5XX_RBBM_DBGBUS_SP_3 = 67, + A5XX_RBBM_DBGBUS_TPL1_0 = 72, + A5XX_RBBM_DBGBUS_TPL1_1 = 73, + A5XX_RBBM_DBGBUS_TPL1_2 = 74, + A5XX_RBBM_DBGBUS_TPL1_3 = 75, +}; + +enum a5xx_shader_blocks { + A5XX_TP_W_MEMOBJ = 1, + A5XX_TP_W_SAMPLER = 2, + 
A5XX_TP_W_MIPMAP_BASE = 3, + A5XX_TP_W_MEMOBJ_TAG = 4, + A5XX_TP_W_SAMPLER_TAG = 5, + A5XX_TP_S_3D_MEMOBJ = 6, + A5XX_TP_S_3D_SAMPLER = 7, + A5XX_TP_S_3D_MEMOBJ_TAG = 8, + A5XX_TP_S_3D_SAMPLER_TAG = 9, + A5XX_TP_S_CS_MEMOBJ = 10, + A5XX_TP_S_CS_SAMPLER = 11, + A5XX_TP_S_CS_MEMOBJ_TAG = 12, + A5XX_TP_S_CS_SAMPLER_TAG = 13, + A5XX_SP_W_INSTR = 14, + A5XX_SP_W_CONST = 15, + A5XX_SP_W_UAV_SIZE = 16, + A5XX_SP_W_CB_SIZE = 17, + A5XX_SP_W_UAV_BASE = 18, + A5XX_SP_W_CB_BASE = 19, + A5XX_SP_W_INST_TAG = 20, + A5XX_SP_W_STATE = 21, + A5XX_SP_S_3D_INSTR = 22, + A5XX_SP_S_3D_CONST = 23, + A5XX_SP_S_3D_CB_BASE = 24, + A5XX_SP_S_3D_CB_SIZE = 25, + A5XX_SP_S_3D_UAV_BASE = 26, + A5XX_SP_S_3D_UAV_SIZE = 27, + A5XX_SP_S_CS_INSTR = 28, + A5XX_SP_S_CS_CONST = 29, + A5XX_SP_S_CS_CB_BASE = 30, + A5XX_SP_S_CS_CB_SIZE = 31, + A5XX_SP_S_CS_UAV_BASE = 32, + A5XX_SP_S_CS_UAV_SIZE = 33, + A5XX_SP_S_3D_INSTR_DIRTY = 34, + A5XX_SP_S_3D_CONST_DIRTY = 35, + A5XX_SP_S_3D_CB_BASE_DIRTY = 36, + A5XX_SP_S_3D_CB_SIZE_DIRTY = 37, + A5XX_SP_S_3D_UAV_BASE_DIRTY = 38, + A5XX_SP_S_3D_UAV_SIZE_DIRTY = 39, + A5XX_SP_S_CS_INSTR_DIRTY = 40, + A5XX_SP_S_CS_CONST_DIRTY = 41, + A5XX_SP_S_CS_CB_BASE_DIRTY = 42, + A5XX_SP_S_CS_CB_SIZE_DIRTY = 43, + A5XX_SP_S_CS_UAV_BASE_DIRTY = 44, + A5XX_SP_S_CS_UAV_SIZE_DIRTY = 45, + A5XX_HLSQ_ICB = 46, + A5XX_HLSQ_ICB_DIRTY = 47, + A5XX_HLSQ_ICB_CB_BASE_DIRTY = 48, + A5XX_SP_POWER_RESTORE_RAM = 64, + A5XX_SP_POWER_RESTORE_RAM_TAG = 65, + A5XX_TP_POWER_RESTORE_RAM = 66, + A5XX_TP_POWER_RESTORE_RAM_TAG = 67, +}; + enum a5xx_tex_filter { A5XX_TEX_NEAREST = 0, A5XX_TEX_LINEAR = 1, @@ -396,6 +504,18 @@ static inline uint32_t A5XX_CP_PROTECT_REG_MASK_LEN(uint32_t val) #define REG_A5XX_CP_POWERCTR_CP_SEL_3 0x00000bbd #define REG_A5XX_RBBM_CFG_DBGBUS_SEL_A 0x00000004 +#define A5XX_RBBM_CFG_DBGBUS_SEL_A_PING_INDEX__MASK 0x000000ff +#define A5XX_RBBM_CFG_DBGBUS_SEL_A_PING_INDEX__SHIFT 0 +static inline uint32_t A5XX_RBBM_CFG_DBGBUS_SEL_A_PING_INDEX(uint32_t val) +{ + return ((val) << A5XX_RBBM_CFG_DBGBUS_SEL_A_PING_INDEX__SHIFT) & A5XX_RBBM_CFG_DBGBUS_SEL_A_PING_INDEX__MASK; +} +#define A5XX_RBBM_CFG_DBGBUS_SEL_A_PING_BLK_SEL__MASK 0x0000ff00 +#define A5XX_RBBM_CFG_DBGBUS_SEL_A_PING_BLK_SEL__SHIFT 8 +static inline uint32_t A5XX_RBBM_CFG_DBGBUS_SEL_A_PING_BLK_SEL(uint32_t val) +{ + return ((val) << A5XX_RBBM_CFG_DBGBUS_SEL_A_PING_BLK_SEL__SHIFT) & A5XX_RBBM_CFG_DBGBUS_SEL_A_PING_BLK_SEL__MASK; +} #define REG_A5XX_RBBM_CFG_DBGBUS_SEL_B 0x00000005 @@ -406,6 +526,12 @@ static inline uint32_t A5XX_CP_PROTECT_REG_MASK_LEN(uint32_t val) #define REG_A5XX_RBBM_CFG_DBGBUS_CNTLT 0x00000008 #define REG_A5XX_RBBM_CFG_DBGBUS_CNTLM 0x00000009 +#define A5XX_RBBM_CFG_DBGBUS_CNTLM_ENABLE__MASK 0x0f000000 +#define A5XX_RBBM_CFG_DBGBUS_CNTLM_ENABLE__SHIFT 24 +static inline uint32_t A5XX_RBBM_CFG_DBGBUS_CNTLM_ENABLE(uint32_t val) +{ + return ((val) << A5XX_RBBM_CFG_DBGBUS_CNTLM_ENABLE__SHIFT) & A5XX_RBBM_CFG_DBGBUS_CNTLM_ENABLE__MASK; +} #define REG_A5XX_RBBM_CFG_DEBBUS_CTLTM_ENABLE_SHIFT 0x00000018 @@ -1413,6 +1539,12 @@ static inline uint32_t A5XX_VSC_BIN_SIZE_Y(uint32_t val) #define REG_A5XX_HLSQ_SPTP_RDSEL 0x00000f08 #define REG_A5XX_HLSQ_DBG_READ_SEL 0x0000bc00 +#define A5XX_HLSQ_DBG_READ_SEL_STATETYPE__MASK 0x0000ff00 +#define A5XX_HLSQ_DBG_READ_SEL_STATETYPE__SHIFT 8 +static inline uint32_t A5XX_HLSQ_DBG_READ_SEL_STATETYPE(uint32_t val) +{ + return ((val) << A5XX_HLSQ_DBG_READ_SEL_STATETYPE__SHIFT) & A5XX_HLSQ_DBG_READ_SEL_STATETYPE__MASK; +} #define REG_A5XX_HLSQ_DBG_AHB_READ_APERTURE 0x0000a000 @@ -1583,6 +1715,8 @@ static 
inline uint32_t A5XX_VSC_BIN_SIZE_Y(uint32_t val) #define REG_A5XX_VBIF_VERSION 0x00003000 #define REG_A5XX_VBIF_CLKON 0x00003001 +#define A5XX_VBIF_CLKON_FORCE_ON 0x00000001 +#define A5XX_VBIF_CLKON_FORCE_ON_TESTBUS 0x00000002 #define REG_A5XX_VBIF_ABIT_SORT 0x00003028 @@ -1601,14 +1735,27 @@ static inline uint32_t A5XX_VSC_BIN_SIZE_Y(uint32_t val) #define REG_A5XX_VBIF_XIN_HALT_CTRL1 0x00003081 #define REG_A5XX_VBIF_TEST_BUS_OUT_CTRL 0x00003084 +#define A5XX_VBIF_TEST_BUS_OUT_CTRL_TEST_BUS_CTRL_EN 0x00000001 #define REG_A5XX_VBIF_TEST_BUS1_CTRL0 0x00003085 #define REG_A5XX_VBIF_TEST_BUS1_CTRL1 0x00003086 +#define A5XX_VBIF_TEST_BUS1_CTRL1_TEST_BUS1_DATA_SEL__MASK 0x0000000f +#define A5XX_VBIF_TEST_BUS1_CTRL1_TEST_BUS1_DATA_SEL__SHIFT 0 +static inline uint32_t A5XX_VBIF_TEST_BUS1_CTRL1_TEST_BUS1_DATA_SEL(uint32_t val) +{ + return ((val) << A5XX_VBIF_TEST_BUS1_CTRL1_TEST_BUS1_DATA_SEL__SHIFT) & A5XX_VBIF_TEST_BUS1_CTRL1_TEST_BUS1_DATA_SEL__MASK; +} #define REG_A5XX_VBIF_TEST_BUS2_CTRL0 0x00003087 #define REG_A5XX_VBIF_TEST_BUS2_CTRL1 0x00003088 +#define A5XX_VBIF_TEST_BUS2_CTRL1_TEST_BUS2_DATA_SEL__MASK 0x0000001f +#define A5XX_VBIF_TEST_BUS2_CTRL1_TEST_BUS2_DATA_SEL__SHIFT 0 +static inline uint32_t A5XX_VBIF_TEST_BUS2_CTRL1_TEST_BUS2_DATA_SEL(uint32_t val) +{ + return ((val) << A5XX_VBIF_TEST_BUS2_CTRL1_TEST_BUS2_DATA_SEL__SHIFT) & A5XX_VBIF_TEST_BUS2_CTRL1_TEST_BUS2_DATA_SEL__MASK; +} #define REG_A5XX_VBIF_TEST_BUS_OUT 0x0000308c diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c index 9933e932679a..a49a7b247547 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c @@ -15,9 +15,6 @@ #include "msm_iommu.h" #include "a5xx_gpu.h" -extern bool hang_debug; -static void a5xx_dump(struct msm_gpu *gpu); - static void a5xx_flush(struct msm_gpu *gpu, struct msm_ringbuffer *ring) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); @@ -800,8 +797,7 @@ static void a5xx_recover(struct msm_gpu *gpu) { adreno_dump_info(gpu); - if (hang_debug) - a5xx_dump(gpu); + msm_gpu_snapshot(gpu, gpu->snapshot); /* Reset the GPU so it can work again */ gpu_write(gpu, REG_A5XX_RBBM_SW_RESET_CMD, 1); @@ -1112,13 +1108,6 @@ static const u32 a5xx_registers[] = { ~0 }; -static void a5xx_dump(struct msm_gpu *gpu) -{ - dev_info(gpu->dev->dev, "status: %08x\n", - gpu_read(gpu, REG_A5XX_RBBM_STATUS)); - adreno_dump(gpu); -} - static int a5xx_pm_resume(struct msm_gpu *gpu) { int ret; @@ -1225,6 +1214,7 @@ static const struct adreno_gpu_funcs funcs = { #ifdef CONFIG_DEBUG_FS .show = a5xx_show, #endif + .snapshot = a5xx_snapshot, }, .get_timestamp = a5xx_get_timestamp, }; diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.h b/drivers/gpu/drm/msm/adreno/a5xx_gpu.h index 366f37545589..3de14fe42a1b 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.h +++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.h @@ -176,6 +176,8 @@ void a5xx_preempt_trigger(struct msm_gpu *gpu); void a5xx_preempt_irq(struct msm_gpu *gpu); void a5xx_preempt_fini(struct msm_gpu *gpu); +int a5xx_snapshot(struct msm_gpu *gpu, struct msm_snapshot *snapshot); + /* Return true if we are in a preempt state */ static inline bool a5xx_in_preempt(struct a5xx_gpu *a5xx_gpu) { diff --git a/drivers/gpu/drm/msm/adreno/a5xx_preempt.c b/drivers/gpu/drm/msm/adreno/a5xx_preempt.c index f8e6bc4dc432..648494c75abc 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_preempt.c +++ b/drivers/gpu/drm/msm/adreno/a5xx_preempt.c @@ -46,10 +46,6 @@ static void *alloc_kernel_bo(struct drm_device *drm, struct msm_gpu *gpu, if 
(iova) *iova = _iova; - pr_err("[%ps] buffer size %x, iova [%llx : %llx]\n", - __builtin_return_address(0), size, - _iova, _iova+size-1); - return ptr; out: drm_gem_object_unreference_unlocked(_bo); diff --git a/drivers/gpu/drm/msm/adreno/a5xx_snapshot.c b/drivers/gpu/drm/msm/adreno/a5xx_snapshot.c new file mode 100644 index 000000000000..5a2edb0ea518 --- /dev/null +++ b/drivers/gpu/drm/msm/adreno/a5xx_snapshot.c @@ -0,0 +1,796 @@ +/* Copyright (c) 2016-2017 The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include "msm_gpu.h" +#include "msm_gem.h" +#include "a5xx_gpu.h" +#include "msm_snapshot_api.h" + +#define A5XX_NR_SHADER_BANKS 4 + +/* + * This is a list of the registers that need to be read through the HLSQ + * aperture via the crashdumper. These are not nominally accessible from + * the CPU on a secure platform. + */ +static const struct { + u32 type; + u32 regoffset; + u32 count; +} a5xx_hlsq_aperture_regs[] = { + { 0x35, 0xE00, 0x32 }, /* HLSQ non-context */ + { 0x31, 0x2080, 0x1 }, /* HLSQ 2D context 0 */ + { 0x33, 0x2480, 0x1 }, /* HLSQ 2D context 1 */ + { 0x32, 0xE780, 0x62 }, /* HLSQ 3D context 0 */ + { 0x34, 0xEF80, 0x62 }, /* HLSQ 3D context 1 */ + { 0x3f, 0x0EC0, 0x40 }, /* SP non-context */ + { 0x3d, 0x2040, 0x1 }, /* SP 2D context 0 */ + { 0x3b, 0x2440, 0x1 }, /* SP 2D context 1 */ + { 0x3e, 0xE580, 0x180 }, /* SP 3D context 0 */ + { 0x3c, 0xED80, 0x180 }, /* SP 3D context 1 */ + { 0x3a, 0x0F00, 0x1c }, /* TP non-context */ + { 0x38, 0x2000, 0xa }, /* TP 2D context 0 */ + { 0x36, 0x2400, 0xa }, /* TP 2D context 1 */ + { 0x39, 0xE700, 0x80 }, /* TP 3D context 0 */ + { 0x37, 0xEF00, 0x80 }, /* TP 3D context 1 */ +}; + +/* + * The debugbus registers contain device state that presumably makes + * sense to the hardware designers.
'count' is the number of indexes to read, + * each index value is 64 bits + */ +static const struct { + enum a5xx_debugbus id; + u32 count; +} a5xx_debugbus_blocks[] = { + { A5XX_RBBM_DBGBUS_CP, 0x100, }, + { A5XX_RBBM_DBGBUS_RBBM, 0x100, }, + { A5XX_RBBM_DBGBUS_HLSQ, 0x100, }, + { A5XX_RBBM_DBGBUS_UCHE, 0x100, }, + { A5XX_RBBM_DBGBUS_DPM, 0x100, }, + { A5XX_RBBM_DBGBUS_TESS, 0x100, }, + { A5XX_RBBM_DBGBUS_PC, 0x100, }, + { A5XX_RBBM_DBGBUS_VFDP, 0x100, }, + { A5XX_RBBM_DBGBUS_VPC, 0x100, }, + { A5XX_RBBM_DBGBUS_TSE, 0x100, }, + { A5XX_RBBM_DBGBUS_RAS, 0x100, }, + { A5XX_RBBM_DBGBUS_VSC, 0x100, }, + { A5XX_RBBM_DBGBUS_COM, 0x100, }, + { A5XX_RBBM_DBGBUS_DCOM, 0x100, }, + { A5XX_RBBM_DBGBUS_LRZ, 0x100, }, + { A5XX_RBBM_DBGBUS_A2D_DSP, 0x100, }, + { A5XX_RBBM_DBGBUS_CCUFCHE, 0x100, }, + { A5XX_RBBM_DBGBUS_GPMU, 0x100, }, + { A5XX_RBBM_DBGBUS_RBP, 0x100, }, + { A5XX_RBBM_DBGBUS_HM, 0x100, }, + { A5XX_RBBM_DBGBUS_RBBM_CFG, 0x100, }, + { A5XX_RBBM_DBGBUS_VBIF_CX, 0x100, }, + { A5XX_RBBM_DBGBUS_GPC, 0x100, }, + { A5XX_RBBM_DBGBUS_LARC, 0x100, }, + { A5XX_RBBM_DBGBUS_HLSQ_SPTP, 0x100, }, + { A5XX_RBBM_DBGBUS_RB_0, 0x100, }, + { A5XX_RBBM_DBGBUS_RB_1, 0x100, }, + { A5XX_RBBM_DBGBUS_RB_2, 0x100, }, + { A5XX_RBBM_DBGBUS_RB_3, 0x100, }, + { A5XX_RBBM_DBGBUS_CCU_0, 0x100, }, + { A5XX_RBBM_DBGBUS_CCU_1, 0x100, }, + { A5XX_RBBM_DBGBUS_CCU_2, 0x100, }, + { A5XX_RBBM_DBGBUS_CCU_3, 0x100, }, + { A5XX_RBBM_DBGBUS_A2D_RAS_0, 0x100, }, + { A5XX_RBBM_DBGBUS_A2D_RAS_1, 0x100, }, + { A5XX_RBBM_DBGBUS_A2D_RAS_2, 0x100, }, + { A5XX_RBBM_DBGBUS_A2D_RAS_3, 0x100, }, + { A5XX_RBBM_DBGBUS_VFD_0, 0x100, }, + { A5XX_RBBM_DBGBUS_VFD_1, 0x100, }, + { A5XX_RBBM_DBGBUS_VFD_2, 0x100, }, + { A5XX_RBBM_DBGBUS_VFD_3, 0x100, }, + { A5XX_RBBM_DBGBUS_SP_0, 0x100, }, + { A5XX_RBBM_DBGBUS_SP_1, 0x100, }, + { A5XX_RBBM_DBGBUS_SP_2, 0x100, }, + { A5XX_RBBM_DBGBUS_SP_3, 0x100, }, + { A5XX_RBBM_DBGBUS_TPL1_0, 0x100, }, + { A5XX_RBBM_DBGBUS_TPL1_1, 0x100, }, + { A5XX_RBBM_DBGBUS_TPL1_2, 0x100, }, + { A5XX_RBBM_DBGBUS_TPL1_3, 0x100, }, +}; + +/* + * The shader blocks are read from the HLSQ aperture - each one has its own + * identifier for the aperture read + */ +static const struct { + enum a5xx_shader_blocks id; + u32 size; +} a5xx_shader_blocks[] = { + {A5XX_TP_W_MEMOBJ, 0x200}, + {A5XX_TP_W_MIPMAP_BASE, 0x3C0}, + {A5XX_TP_W_SAMPLER_TAG, 0x40}, + {A5XX_TP_S_3D_SAMPLER, 0x80}, + {A5XX_TP_S_3D_SAMPLER_TAG, 0x20}, + {A5XX_TP_S_CS_SAMPLER, 0x40}, + {A5XX_TP_S_CS_SAMPLER_TAG, 0x10}, + {A5XX_SP_W_CONST, 0x800}, + {A5XX_SP_W_CB_SIZE, 0x30}, + {A5XX_SP_W_CB_BASE, 0xF0}, + {A5XX_SP_W_STATE, 0x1}, + {A5XX_SP_S_3D_CONST, 0x800}, + {A5XX_SP_S_3D_CB_SIZE, 0x28}, + {A5XX_SP_S_3D_UAV_SIZE, 0x80}, + {A5XX_SP_S_CS_CONST, 0x400}, + {A5XX_SP_S_CS_CB_SIZE, 0x8}, + {A5XX_SP_S_CS_UAV_SIZE, 0x80}, + {A5XX_SP_S_3D_CONST_DIRTY, 0x12}, + {A5XX_SP_S_3D_CB_SIZE_DIRTY, 0x1}, + {A5XX_SP_S_3D_UAV_SIZE_DIRTY, 0x2}, + {A5XX_SP_S_CS_CONST_DIRTY, 0xA}, + {A5XX_SP_S_CS_CB_SIZE_DIRTY, 0x1}, + {A5XX_SP_S_CS_UAV_SIZE_DIRTY, 0x2}, + {A5XX_HLSQ_ICB_DIRTY, 0xB}, + {A5XX_SP_POWER_RESTORE_RAM_TAG, 0xA}, + {A5XX_TP_POWER_RESTORE_RAM_TAG, 0xA}, + {A5XX_TP_W_SAMPLER, 0x80}, + {A5XX_TP_W_MEMOBJ_TAG, 0x40}, + {A5XX_TP_S_3D_MEMOBJ, 0x200}, + {A5XX_TP_S_3D_MEMOBJ_TAG, 0x20}, + {A5XX_TP_S_CS_MEMOBJ, 0x100}, + {A5XX_TP_S_CS_MEMOBJ_TAG, 0x10}, + {A5XX_SP_W_INSTR, 0x800}, + {A5XX_SP_W_UAV_SIZE, 0x80}, + {A5XX_SP_W_UAV_BASE, 0x80}, + {A5XX_SP_W_INST_TAG, 0x40}, + {A5XX_SP_S_3D_INSTR, 0x800}, + {A5XX_SP_S_3D_CB_BASE, 0xC8}, + {A5XX_SP_S_3D_UAV_BASE, 0x80}, + {A5XX_SP_S_CS_INSTR, 0x400}, + 
{A5XX_SP_S_CS_CB_BASE, 0x28}, + {A5XX_SP_S_CS_UAV_BASE, 0x80}, + {A5XX_SP_S_3D_INSTR_DIRTY, 0x1}, + {A5XX_SP_S_3D_CB_BASE_DIRTY, 0x5}, + {A5XX_SP_S_3D_UAV_BASE_DIRTY, 0x2}, + {A5XX_SP_S_CS_INSTR_DIRTY, 0x1}, + {A5XX_SP_S_CS_CB_BASE_DIRTY, 0x1}, + {A5XX_SP_S_CS_UAV_BASE_DIRTY, 0x2}, + {A5XX_HLSQ_ICB, 0x200}, + {A5XX_HLSQ_ICB_CB_BASE_DIRTY, 0x4}, + {A5XX_SP_POWER_RESTORE_RAM, 0x140}, + {A5XX_TP_POWER_RESTORE_RAM, 0x40}, +}; + +/* + * The A5XX architecture has a built-in engine to asynchronously dump + * registers from the GPU. It is used to accelerate the copy of hundreds + * (thousands) of registers and as a safe way to access registers that might + * have secure data in them (if the GPU is in secure mode, the crashdumper returns + * bogus values for those registers). On a fully secured device the CPU will be + * blocked from accessing those registers directly and so the crashdump is the + * only way that we can access context registers and the shader banks for debug + * purposes. + * + * The downside of the crashdump is that it requires access to GPU-accessible + * memory (so the VBIF and the bus and the SMMU need to be up and working) and + * you need enough memory to write the script for the crashdumper and to store + * the data that you are dumping, so there is a balancing act between the work to + * set up a crash dumper and the value we get out of it. + */ + +/* + * The crashdump uses a pseudo-script format to read and write registers. Each + * operation is two 64-bit values. + * + * READ: + * [qword 0] [63:00] - The absolute IOVA address target for the register value + * [qword 1] [63:44] - the dword address of the register offset to read + * [15:00] - Number of dwords to read at once + * + * WRITE: + * [qword 0] [31:0] 32 bit value to write to the register + * [qword 1] [63:44] - the dword address of the register offset to write + * [21:21] - set 1 to write + * [15:00] - Number of dwords to write (usually 1) + * + * At the bottom of the script, write a pair of zero quadwords to trigger the end.
+ */ +struct crashdump { + struct drm_gem_object *bo; + void *ptr; + u64 iova; + u32 index; +}; + +#define CRASHDUMP_BO_SIZE (SZ_1M) +#define CRASHDUMP_SCRIPT_SIZE (256 * SZ_1K) +#define CRASHDUMP_DATA_SIZE (CRASHDUMP_BO_SIZE - CRASHDUMP_SCRIPT_SIZE) + +static int crashdump_init(struct msm_gpu *gpu, struct crashdump *crashdump) +{ + struct drm_device *drm = gpu->dev; + int ret = -ENOMEM; + + crashdump->bo = msm_gem_new(drm, CRASHDUMP_BO_SIZE, MSM_BO_UNCACHED); + if (IS_ERR(crashdump->bo)) { + ret = PTR_ERR(crashdump->bo); + crashdump->bo = NULL; + return ret; + } + + crashdump->ptr = msm_gem_vaddr_locked(crashdump->bo); + if (!crashdump->ptr) + goto out; + + ret = msm_gem_get_iova_locked(crashdump->bo, gpu->aspace, + &crashdump->iova); + +out: + if (ret) { + drm_gem_object_unreference(crashdump->bo); + crashdump->bo = NULL; + } + + return ret; +} + +static int crashdump_run(struct msm_gpu *gpu, struct crashdump *crashdump) +{ + if (!crashdump->ptr || !crashdump->index) + return -EINVAL; + + gpu_write(gpu, REG_A5XX_CP_CRASH_SCRIPT_BASE_LO, + lower_32_bits(crashdump->iova)); + gpu_write(gpu, REG_A5XX_CP_CRASH_SCRIPT_BASE_HI, + upper_32_bits(crashdump->iova)); + + gpu_write(gpu, REG_A5XX_CP_CRASH_DUMP_CNTL, 1); + + return spin_until(gpu_read(gpu, REG_A5XX_CP_CRASH_DUMP_CNTL) & 0x04); +} + +static void crashdump_destroy(struct msm_gpu *gpu, struct crashdump *crashdump) +{ + if (!crashdump->bo) + return; + + if (crashdump->iova) + msm_gem_put_iova(crashdump->bo, gpu->aspace); + + drm_gem_object_unreference(crashdump->bo); + + memset(crashdump, 0, sizeof(*crashdump)); +} + +static inline void CRASHDUMP_SCRIPT_WRITE(struct crashdump *crashdump, + u32 reg, u32 val) +{ + u64 *ptr = crashdump->ptr + crashdump->index; + + if (WARN_ON(crashdump->index + (2 * sizeof(u64)) + >= CRASHDUMP_SCRIPT_SIZE)) + return; + + /* This is the value to write */ + ptr[0] = (u64) val; + + /* + * This triggers a write to the specified register. 
1 is the size of + * the write in dwords + */ + ptr[1] = (((u64) reg) << 44) | (1 << 21) | 1; + + crashdump->index += 2 * sizeof(u64); +} + +static inline void CRASHDUMP_SCRIPT_READ(struct crashdump *crashdump, + u32 reg, u32 count, u32 offset) +{ + u64 *ptr = crashdump->ptr + crashdump->index; + + if (WARN_ON(crashdump->index + (2 * sizeof(u64)) + >= CRASHDUMP_SCRIPT_SIZE)) + return; + + if (WARN_ON(offset + (count * sizeof(u32)) >= CRASHDUMP_DATA_SIZE)) + return; + + ptr[0] = (u64) crashdump->iova + CRASHDUMP_SCRIPT_SIZE + offset; + ptr[1] = (((u64) reg) << 44) | count; + + crashdump->index += 2 * sizeof(u64); +} + +static inline void *CRASHDUMP_DATA_PTR(struct crashdump *crashdump, u32 offset) +{ + if (WARN_ON(!crashdump->ptr || offset >= CRASHDUMP_DATA_SIZE)) + return NULL; + + return crashdump->ptr + CRASHDUMP_SCRIPT_SIZE + offset; +} + +static inline u32 CRASHDUMP_DATA_READ(struct crashdump *crashdump, u32 offset) +{ + return *((u32 *) CRASHDUMP_DATA_PTR(crashdump, offset)); +} + +static inline void CRASHDUMP_RESET(struct crashdump *crashdump) +{ + crashdump->index = 0; +} + +static inline void CRASHDUMP_END(struct crashdump *crashdump) +{ + u64 *ptr = crashdump->ptr + crashdump->index; + + if (WARN_ON((crashdump->index + (2 * sizeof(u64))) + >= CRASHDUMP_SCRIPT_SIZE)) + return; + + ptr[0] = 0; + ptr[1] = 0; + + crashdump->index += 2 * sizeof(u64); +} + +static u32 _crashdump_read_hlsq_aperture(struct crashdump *crashdump, + u32 offset, u32 statetype, u32 bank, + u32 count) +{ + CRASHDUMP_SCRIPT_WRITE(crashdump, REG_A5XX_HLSQ_DBG_READ_SEL, + A5XX_HLSQ_DBG_READ_SEL_STATETYPE(statetype) | bank); + + CRASHDUMP_SCRIPT_READ(crashdump, REG_A5XX_HLSQ_DBG_AHB_READ_APERTURE, + count, offset); + + return count * sizeof(u32); +} + +static u32 _copy_registers(struct msm_snapshot *snapshot, + struct crashdump *crashdump, u32 reg, u32 count, + u32 offset) +{ + int i; + u32 *ptr = (u32 *) (crashdump->ptr + CRASHDUMP_SCRIPT_SIZE + offset); + /* + * Write the offset of the first register of the group and the number of + * registers in the group + */ + SNAPSHOT_WRITE_U32(snapshot, ((count << 16) | reg)); + + /* Followed by each register value in the group */ + for (i = 0; i < count; i++) + SNAPSHOT_WRITE_U32(snapshot, ptr[i]); + + return count * sizeof(u32); +} + +/* + * Return the number of registers in each register group from the + * adreno_gpu->registers array + */ +static inline u32 REG_COUNT(const unsigned int *ptr) +{ + return (ptr[1] - ptr[0]) + 1; +} + +/* + * Capture what registers we can from the CPU in case the crashdumper is + * unavailable or broken.
This will omit the SP, TP and HLSQ registers, but + * you'll get everything else, and that ain't bad + */ +static void a5xx_snapshot_registers_cpu(struct msm_gpu *gpu, + struct msm_snapshot *snapshot) +{ + struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); + struct msm_snapshot_regs header; + u32 regcount = 0, groups = 0; + int i; + + /* + * Before we write the section we need to figure out how big our data + * section will be + */ + for (i = 0; adreno_gpu->registers[i] != ~0; i += 2) { + regcount += REG_COUNT(&(adreno_gpu->registers[i])); + groups++; + } + + header.count = groups; + + /* + * We need one dword for each group and then one dword for each register + * value in that group + */ + if (!SNAPSHOT_HEADER(snapshot, header, SNAPSHOT_SECTION_REGS_V2, + regcount + groups)) + return; + + for (i = 0; adreno_gpu->registers[i] != ~0; i += 2) { + u32 count = REG_COUNT(&(adreno_gpu->registers[i])); + u32 reg = adreno_gpu->registers[i]; + int j; + + /* Write the offset and count for the group */ + SNAPSHOT_WRITE_U32(snapshot, (count << 16) | reg); + + /* Write each value in the group */ + for (j = 0; j < count; j++) + SNAPSHOT_WRITE_U32(snapshot, gpu_read(gpu, reg++)); + } +} + +static void a5xx_snapshot_registers(struct msm_gpu *gpu, + struct msm_snapshot *snapshot) +{ + struct msm_snapshot_regs header; + struct crashdump *crashdump = snapshot->priv; + u32 offset = 0, regcount = 0, groups = 0; + int i; + + /* + * First snapshot all the registers that we can from the CPU. Do this + * because the crashdumper has a tendency to "taint" the value of some + * of the registers (because the GPU implements the crashdumper) so we + * only want to use the crash dump facility if we have to + */ + a5xx_snapshot_registers_cpu(gpu, snapshot); + + if (!crashdump) + return; + + CRASHDUMP_RESET(crashdump); + + /* HLSQ and context registers behind the aperture */ + for (i = 0; i < ARRAY_SIZE(a5xx_hlsq_aperture_regs); i++) { + u32 count = a5xx_hlsq_aperture_regs[i].count; + + offset += _crashdump_read_hlsq_aperture(crashdump, offset, + a5xx_hlsq_aperture_regs[i].type, 0, count); + regcount += count; + + groups++; + } + + CRASHDUMP_END(crashdump); + + if (crashdump_run(gpu, crashdump)) + return; + + header.count = groups; + + /* + * The size of the data will be one dword for each "group" of registers, + * and then one dword for each of the registers in that group + */ + if (!SNAPSHOT_HEADER(snapshot, header, SNAPSHOT_SECTION_REGS_V2, + groups + regcount)) + return; + + /* Copy the registers to the snapshot */ + for (i = 0; i < ARRAY_SIZE(a5xx_hlsq_aperture_regs); i++) + offset += _copy_registers(snapshot, crashdump, + a5xx_hlsq_aperture_regs[i].regoffset, + a5xx_hlsq_aperture_regs[i].count, offset); +} + +static void _a5xx_snapshot_shader_bank(struct msm_snapshot *snapshot, + struct crashdump *crashdump, u32 block, u32 bank, + u32 size, u32 offset) +{ + void *src; + + struct msm_snapshot_shader header = { + .type = block, + .index = bank, + .size = size, + }; + + if (!SNAPSHOT_HEADER(snapshot, header, SNAPSHOT_SECTION_SHADER, size)) + return; + + src = CRASHDUMP_DATA_PTR(crashdump, offset); + + if (src) + SNAPSHOT_MEMCPY(snapshot, src, size * sizeof(u32)); +} + +static void a5xx_snapshot_shader_memory(struct msm_gpu *gpu, + struct msm_snapshot *snapshot) +{ + struct crashdump *crashdump = snapshot->priv; + u32 offset = 0; + int i; + + /* We can only get shader memory through the crashdump */ + if (!crashdump) + return; + + CRASHDUMP_RESET(crashdump); + + /* For each shader block */ + for (i = 0; i <
ARRAY_SIZE(a5xx_shader_blocks); i++) { + int j; + + /* For each block, dump 4 banks */ + for (j = 0; j < A5XX_NR_SHADER_BANKS; j++) + offset += _crashdump_read_hlsq_aperture(crashdump, + offset, a5xx_shader_blocks[i].id, j, + a5xx_shader_blocks[i].size); + } + + CRASHDUMP_END(crashdump); + + /* If the crashdump fails we can't get shader memory any other way */ + if (crashdump_run(gpu, crashdump)) + return; + + /* Each bank of each shader gets its own snapshot section */ + for (offset = 0, i = 0; i < ARRAY_SIZE(a5xx_shader_blocks); i++) { + int j; + + for (j = 0; j < A5XX_NR_SHADER_BANKS; j++) { + _a5xx_snapshot_shader_bank(snapshot, crashdump, + a5xx_shader_blocks[i].id, j, + a5xx_shader_blocks[i].size, offset); + offset += a5xx_shader_blocks[i].size * sizeof(u32); + } + } +} + +#define A5XX_NUM_AXI_ARB_BLOCKS 2 +#define A5XX_NUM_XIN_BLOCKS 4 +#define VBIF_DATA_SIZE ((16 * A5XX_NUM_AXI_ARB_BLOCKS) + \ + (18 * A5XX_NUM_XIN_BLOCKS) + (12 * A5XX_NUM_XIN_BLOCKS)) + +static void a5xx_snapshot_debugbus_vbif(struct msm_gpu *gpu, + struct msm_snapshot *snapshot) +{ + int i; + struct msm_snapshot_debugbus header = { + .id = A5XX_RBBM_DBGBUS_VBIF, + .count = VBIF_DATA_SIZE, + }; + + if (!SNAPSHOT_HEADER(snapshot, header, SNAPSHOT_SECTION_DEBUGBUS, + VBIF_DATA_SIZE)) + return; + + gpu_rmw(gpu, REG_A5XX_VBIF_CLKON, A5XX_VBIF_CLKON_FORCE_ON_TESTBUS, + A5XX_VBIF_CLKON_FORCE_ON_TESTBUS); + + gpu_write(gpu, REG_A5XX_VBIF_TEST_BUS1_CTRL0, 0); + gpu_write(gpu, REG_A5XX_VBIF_TEST_BUS_OUT_CTRL, + A5XX_VBIF_TEST_BUS_OUT_CTRL_TEST_BUS_CTRL_EN); + + for (i = 0; i < A5XX_NUM_AXI_ARB_BLOCKS; i++) { + int j; + + gpu_write(gpu, REG_A5XX_VBIF_TEST_BUS2_CTRL0, 1 << (i + 16)); + for (j = 0; j < 16; j++) { + gpu_write(gpu, REG_A5XX_VBIF_TEST_BUS2_CTRL1, + A5XX_VBIF_TEST_BUS2_CTRL1_TEST_BUS2_DATA_SEL(j)); + SNAPSHOT_WRITE_U32(snapshot, gpu_read(gpu, + REG_A5XX_VBIF_TEST_BUS_OUT)); + } + } + + for (i = 0; i < A5XX_NUM_XIN_BLOCKS; i++) { + int j; + + gpu_write(gpu, REG_A5XX_VBIF_TEST_BUS2_CTRL0, 1 << i); + for (j = 0; j < 18; j++) { + gpu_write(gpu, REG_A5XX_VBIF_TEST_BUS2_CTRL1, + A5XX_VBIF_TEST_BUS2_CTRL1_TEST_BUS2_DATA_SEL(j)); + SNAPSHOT_WRITE_U32(snapshot, + gpu_read(gpu, REG_A5XX_VBIF_TEST_BUS_OUT)); + } + } + + for (i = 0; i < A5XX_NUM_XIN_BLOCKS; i++) { + int j; + + gpu_write(gpu, REG_A5XX_VBIF_TEST_BUS1_CTRL0, 1 << i); + for (j = 0; j < 12; j++) { + gpu_write(gpu, REG_A5XX_VBIF_TEST_BUS1_CTRL1, + A5XX_VBIF_TEST_BUS1_CTRL1_TEST_BUS1_DATA_SEL(j)); + SNAPSHOT_WRITE_U32(snapshot, gpu_read(gpu, + REG_A5XX_VBIF_TEST_BUS_OUT)); + } + } + +} + +static void a5xx_snapshot_debugbus_block(struct msm_gpu *gpu, + struct msm_snapshot *snapshot, u32 block, u32 count) +{ + int i; + struct msm_snapshot_debugbus header = { + .id = block, + .count = count * 2, /* Each value is 2 dwords */ + }; + + if (!SNAPSHOT_HEADER(snapshot, header, SNAPSHOT_SECTION_DEBUGBUS, + (count * 2))) + return; + + for (i = 0; i < count; i++) { + u32 reg = A5XX_RBBM_CFG_DBGBUS_SEL_A_PING_INDEX(i) | + A5XX_RBBM_CFG_DBGBUS_SEL_A_PING_BLK_SEL(block); + + gpu_write(gpu, REG_A5XX_RBBM_CFG_DBGBUS_SEL_A, reg); + gpu_write(gpu, REG_A5XX_RBBM_CFG_DBGBUS_SEL_B, reg); + gpu_write(gpu, REG_A5XX_RBBM_CFG_DBGBUS_SEL_C, reg); + gpu_write(gpu, REG_A5XX_RBBM_CFG_DBGBUS_SEL_D, reg); + + /* Each debugbus entry is a quad word */ + SNAPSHOT_WRITE_U32(snapshot, gpu_read(gpu, + REG_A5XX_RBBM_CFG_DBGBUS_TRACE_BUF2)); + SNAPSHOT_WRITE_U32(snapshot, + gpu_read(gpu, REG_A5XX_RBBM_CFG_DBGBUS_TRACE_BUF1)); + } +} + +static void a5xx_snapshot_debugbus(struct msm_gpu *gpu, + struct 
msm_snapshot *snapshot) +{ + int i; + + gpu_write(gpu, REG_A5XX_RBBM_CFG_DBGBUS_CNTLM, + A5XX_RBBM_CFG_DBGBUS_CNTLM_ENABLE(0xF)); + + for (i = 0; i < ARRAY_SIZE(a5xx_debugbus_blocks); i++) + a5xx_snapshot_debugbus_block(gpu, snapshot, + a5xx_debugbus_blocks[i].id, + a5xx_debugbus_blocks[i].count); + + /* VBIF is special and not in a good way */ + a5xx_snapshot_debugbus_vbif(gpu, snapshot); +} + +static void a5xx_snapshot_cp_merciu(struct msm_gpu *gpu, + struct msm_snapshot *snapshot) +{ + unsigned int i; + struct msm_snapshot_debug header = { + .type = SNAPSHOT_DEBUG_CP_MERCIU, + .size = 64 << 1, /* Data size is 2 dwords per entry */ + }; + + if (!SNAPSHOT_HEADER(snapshot, header, SNAPSHOT_SECTION_DEBUG, 64 << 1)) + return; + + gpu_write(gpu, REG_A5XX_CP_MERCIU_DBG_ADDR, 0); + for (i = 0; i < 64; i++) { + SNAPSHOT_WRITE_U32(snapshot, + gpu_read(gpu, REG_A5XX_CP_MERCIU_DBG_DATA_1)); + SNAPSHOT_WRITE_U32(snapshot, + gpu_read(gpu, REG_A5XX_CP_MERCIU_DBG_DATA_2)); + } +} + +static void a5xx_snapshot_cp_roq(struct msm_gpu *gpu, + struct msm_snapshot *snapshot) +{ + int i; + struct msm_snapshot_debug header = { + .type = SNAPSHOT_DEBUG_CP_ROQ, + .size = 512, + }; + + if (!SNAPSHOT_HEADER(snapshot, header, SNAPSHOT_SECTION_DEBUG, 512)) + return; + + gpu_write(gpu, REG_A5XX_CP_ROQ_DBG_ADDR, 0); + for (i = 0; i < 512; i++) + SNAPSHOT_WRITE_U32(snapshot, + gpu_read(gpu, REG_A5XX_CP_ROQ_DBG_DATA)); +} + +static void a5xx_snapshot_cp_meq(struct msm_gpu *gpu, + struct msm_snapshot *snapshot) +{ + int i; + struct msm_snapshot_debug header = { + .type = SNAPSHOT_DEBUG_CP_MEQ, + .size = 64, + }; + + if (!SNAPSHOT_HEADER(snapshot, header, SNAPSHOT_SECTION_DEBUG, 64)) + return; + + gpu_write(gpu, REG_A5XX_CP_MEQ_DBG_ADDR, 0); + for (i = 0; i < 64; i++) + SNAPSHOT_WRITE_U32(snapshot, + gpu_read(gpu, REG_A5XX_CP_MEQ_DBG_DATA)); +} + +static void a5xx_snapshot_indexed_registers(struct msm_gpu *gpu, + struct msm_snapshot *snapshot, u32 addr, u32 data, + u32 count) +{ + unsigned int i; + struct msm_snapshot_indexed_regs header = { + .index_reg = addr, + .data_reg = data, + .start = 0, + .count = count, + }; + + if (!SNAPSHOT_HEADER(snapshot, header, SNAPSHOT_SECTION_INDEXED_REGS, + count)) + return; + + for (i = 0; i < count; i++) { + gpu_write(gpu, addr, i); + SNAPSHOT_WRITE_U32(snapshot, gpu_read(gpu, data)); + } +} + +int a5xx_snapshot(struct msm_gpu *gpu, struct msm_snapshot *snapshot) +{ + struct crashdump crashdump = { 0 }; + + if (!crashdump_init(gpu, &crashdump)) + snapshot->priv = &crashdump; + + /* To accurately read all registers, disable hardware clock gating */ + a5xx_set_hwcg(gpu, false); + + /* Kick it up to the generic level */ + adreno_snapshot(gpu, snapshot); + + /* Read the GPU registers */ + a5xx_snapshot_registers(gpu, snapshot); + + /* Read the shader memory banks */ + a5xx_snapshot_shader_memory(gpu, snapshot); + + /* Read the debugbus registers */ + a5xx_snapshot_debugbus(gpu, snapshot); + + /* PFP data */ + a5xx_snapshot_indexed_registers(gpu, snapshot, + REG_A5XX_CP_PFP_STAT_ADDR, REG_A5XX_CP_PFP_STAT_DATA, 36); + + /* ME data */ + a5xx_snapshot_indexed_registers(gpu, snapshot, + REG_A5XX_CP_ME_STAT_ADDR, REG_A5XX_CP_ME_STAT_DATA, 29); + + /* DRAW_STATE data */ + a5xx_snapshot_indexed_registers(gpu, snapshot, + REG_A5XX_CP_DRAW_STATE_ADDR, REG_A5XX_CP_DRAW_STATE_DATA, + 256); + + /* ME cache */ + a5xx_snapshot_indexed_registers(gpu, snapshot, + REG_A5XX_CP_ME_UCODE_DBG_ADDR, REG_A5XX_CP_ME_UCODE_DBG_DATA, + 0x53F); + + /* PFP cache */ + a5xx_snapshot_indexed_registers(gpu, snapshot, + 
REG_A5XX_CP_PFP_UCODE_DBG_ADDR, REG_A5XX_CP_PFP_UCODE_DBG_DATA, + 0x53F); + + /* ME queue */ + a5xx_snapshot_cp_meq(gpu, snapshot); + + /* CP ROQ */ + a5xx_snapshot_cp_roq(gpu, snapshot); + + /* CP MERCIU */ + a5xx_snapshot_cp_merciu(gpu, snapshot); + + crashdump_destroy(gpu, &crashdump); + snapshot->priv = NULL; + + /* Re-enable HWCG */ + a5xx_set_hwcg(gpu, true); + return 0; +} diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c index 337ed53b7bc8..f1883825354e 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c @@ -17,7 +17,9 @@ * this program. If not, see <http://www.gnu.org/licenses/>. */ +#include <linux/utsname.h> #include "adreno_gpu.h" +#include "msm_snapshot.h" #include "msm_gem.h" #include "msm_mmu.h" @@ -629,3 +631,81 @@ void adreno_gpu_cleanup(struct adreno_gpu *gpu) msm_gem_address_space_put(aspace); } } + +static void adreno_snapshot_os(struct msm_gpu *gpu, + struct msm_snapshot *snapshot) +{ + struct msm_snapshot_linux header; + + memset(&header, 0, sizeof(header)); + + header.osid = SNAPSHOT_OS_LINUX_V3; + strlcpy(header.release, utsname()->release, sizeof(header.release)); + strlcpy(header.version, utsname()->version, sizeof(header.version)); + + header.seconds = get_seconds(); + header.ctxtcount = 0; + + SNAPSHOT_HEADER(snapshot, header, SNAPSHOT_SECTION_OS, 0); +} + +static void adreno_snapshot_ringbuffer(struct msm_gpu *gpu, + struct msm_snapshot *snapshot, struct msm_ringbuffer *ring) +{ + struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); + struct msm_snapshot_ringbuffer header; + unsigned int i, end = 0; + unsigned int *data = ring->start; + + memset(&header, 0, sizeof(header)); + + /* + * We only want to copy the active contents of each ring, so find the + * last valid entry in the ringbuffer + */ + for (i = 0; i < MSM_GPU_RINGBUFFER_SZ >> 2; i++) { + if (data[i]) + end = i; + } + + /* The dump always starts at 0 */ + header.start = 0; + header.end = end; + + /* This is the number of dwords being dumped */ + header.count = end + 1; + + /* This is the size of the actual ringbuffer */ + header.rbsize = MSM_GPU_RINGBUFFER_SZ >> 2; + + header.id = ring->id; + header.gpuaddr = ring->iova; + header.rptr = get_rptr(adreno_gpu, ring); + header.wptr = get_wptr(ring); + header.timestamp_queued = adreno_submitted_fence(gpu, ring); + header.timestamp_retired = adreno_last_fence(gpu, ring); + + /* Write the header even if the ringbuffer data is empty */ + if (!SNAPSHOT_HEADER(snapshot, header, SNAPSHOT_SECTION_RB_V2, + header.count)) + return; + + SNAPSHOT_MEMCPY(snapshot, ring->start, header.count * sizeof(u32)); +} + +static void adreno_snapshot_ringbuffers(struct msm_gpu *gpu, + struct msm_snapshot *snapshot) +{ + struct msm_ringbuffer *ring; + int i; + + /* Write a new section for each ringbuffer */ + FOR_EACH_RING(gpu, ring, i) + adreno_snapshot_ringbuffer(gpu, snapshot, ring); +} + +void adreno_snapshot(struct msm_gpu *gpu, struct msm_snapshot *snapshot) +{ + adreno_snapshot_os(gpu, snapshot); + adreno_snapshot_ringbuffers(gpu, snapshot); +} diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.h b/drivers/gpu/drm/msm/adreno/adreno_gpu.h index 55602d20a205..30461115281c 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.h +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.h @@ -233,6 +233,7 @@ int adreno_gpu_init(struct drm_device *drm, struct platform_device *pdev, int nr_rings); void adreno_gpu_cleanup(struct adreno_gpu *gpu); +void adreno_snapshot(struct msm_gpu *gpu, struct msm_snapshot *snapshot); /* ringbuffer helpers (the parts that
are adreno specific) */ diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c index 4d39eb439793..0231ac3f269f 100644 --- a/drivers/gpu/drm/msm/msm_drv.c +++ b/drivers/gpu/drm/msm/msm_drv.c @@ -837,6 +837,13 @@ static int msm_gpu_show(struct drm_device *dev, struct seq_file *m) return 0; } +static int msm_snapshot_show(struct drm_device *dev, struct seq_file *m) +{ + struct msm_drm_private *priv = dev->dev_private; + + return msm_snapshot_write(priv->gpu, m); +} + static int msm_gem_show(struct drm_device *dev, struct seq_file *m) { struct msm_drm_private *priv = dev->dev_private; @@ -901,11 +908,22 @@ static int show_locked(struct seq_file *m, void *arg) return ret; } +static int show_unlocked(struct seq_file *m, void *arg) +{ + struct drm_info_node *node = (struct drm_info_node *) m->private; + struct drm_device *dev = node->minor->dev; + int (*show)(struct drm_device *dev, struct seq_file *m) = + node->info_ent->data; + + return show(dev, m); +} + static struct drm_info_list msm_debugfs_list[] = { {"gpu", show_locked, 0, msm_gpu_show}, {"gem", show_locked, 0, msm_gem_show}, { "mm", show_locked, 0, msm_mm_show }, { "fb", show_locked, 0, msm_fb_show }, + { "snapshot", show_unlocked, 0, msm_snapshot_show }, }; static int late_init_minor(struct drm_minor *minor) diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c index 000a369e2537..3fb480f41fde 100644 --- a/drivers/gpu/drm/msm/msm_gpu.c +++ b/drivers/gpu/drm/msm/msm_gpu.c @@ -764,6 +764,10 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev, bs_init(gpu); + gpu->snapshot = msm_snapshot_new(gpu); + if (IS_ERR(gpu->snapshot)) + gpu->snapshot = NULL; + return 0; fail: @@ -794,4 +798,6 @@ void msm_gpu_cleanup(struct msm_gpu *gpu) msm_ringbuffer_destroy(gpu->rb[i]); } + + msm_snapshot_destroy(gpu, gpu->snapshot); } diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h index bab4f9349e64..06dfaabbfcfe 100644 --- a/drivers/gpu/drm/msm/msm_gpu.h +++ b/drivers/gpu/drm/msm/msm_gpu.h @@ -24,6 +24,7 @@ #include "msm_drv.h" #include "msm_ringbuffer.h" +#include "msm_snapshot.h" struct msm_gem_submit; struct msm_gpu_perfcntr; @@ -69,6 +70,7 @@ struct msm_gpu_funcs { /* show GPU status in debugfs: */ void (*show)(struct msm_gpu *gpu, struct seq_file *m); #endif + int (*snapshot)(struct msm_gpu *gpu, struct msm_snapshot *snapshot); }; struct msm_gpu { @@ -137,6 +139,8 @@ struct msm_gpu { struct work_struct recover_work; struct list_head submit_list; + + struct msm_snapshot *snapshot; }; /* It turns out that all targets use the same ringbuffer size. */ diff --git a/drivers/gpu/drm/msm/msm_snapshot.c b/drivers/gpu/drm/msm/msm_snapshot.c new file mode 100644 index 000000000000..30f3e5c64ebd --- /dev/null +++ b/drivers/gpu/drm/msm/msm_snapshot.c @@ -0,0 +1,105 @@ +/* Copyright (c) 2016 The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#include "msm_gpu.h" +#include "msm_gem.h" +#include "msm_snapshot_api.h" + +void msm_snapshot_destroy(struct msm_gpu *gpu, struct msm_snapshot *snapshot) +{ + struct drm_device *dev = gpu->dev; + struct msm_drm_private *priv = dev->dev_private; + struct platform_device *pdev = priv->gpu_pdev; + + if (!snapshot) + return; + + dma_free_coherent(&pdev->dev, SZ_1M, snapshot->ptr, + snapshot->physaddr); + + kfree(snapshot); +} + +struct msm_snapshot *msm_snapshot_new(struct msm_gpu *gpu) +{ + struct drm_device *dev = gpu->dev; + struct msm_drm_private *priv = dev->dev_private; + struct platform_device *pdev = priv->gpu_pdev; + struct msm_snapshot *snapshot; + + snapshot = kzalloc(sizeof(*snapshot), GFP_KERNEL); + if (!snapshot) + return ERR_PTR(-ENOMEM); + + snapshot->ptr = dma_alloc_coherent(&pdev->dev, SZ_1M, + &snapshot->physaddr, GFP_KERNEL); + + if (!snapshot->ptr) { + kfree(snapshot); + return ERR_PTR(-ENOMEM); + } + + seq_buf_init(&snapshot->buf, snapshot->ptr, SZ_1M); + + return snapshot; +} + +int msm_gpu_snapshot(struct msm_gpu *gpu, struct msm_snapshot *snapshot) +{ + int ret; + struct msm_snapshot_header header; + uint64_t val; + + if (!snapshot) + return -ENOMEM; + + /* + * For now, blow away the snapshot and take a new one - the most + * interesting hang is the last one we saw + */ + seq_buf_init(&snapshot->buf, snapshot->ptr, SZ_1M); + + header.magic = SNAPSHOT_MAGIC; + gpu->funcs->get_param(gpu, MSM_PARAM_GPU_ID, &val); + header.gpuid = lower_32_bits(val); + + gpu->funcs->get_param(gpu, MSM_PARAM_CHIP_ID, &val); + header.chipid = lower_32_bits(val); + + seq_buf_putmem(&snapshot->buf, &header, sizeof(header)); + + ret = gpu->funcs->snapshot(gpu, snapshot); + + if (!ret) { + struct msm_snapshot_section_header end; + + end.magic = SNAPSHOT_SECTION_MAGIC; + end.id = SNAPSHOT_SECTION_END; + end.size = sizeof(end); + + seq_buf_putmem(&snapshot->buf, &end, sizeof(end)); + + dev_info(gpu->dev->dev, "GPU snapshot created [0x%pa (%d bytes)]\n", + &snapshot->physaddr, seq_buf_used(&snapshot->buf)); + } + + return ret; +} + +int msm_snapshot_write(struct msm_gpu *gpu, struct seq_file *m) +{ + if (gpu && gpu->snapshot) + seq_write(m, gpu->snapshot->ptr, + seq_buf_used(&gpu->snapshot->buf)); + + return 0; +} diff --git a/drivers/gpu/drm/msm/msm_snapshot.h b/drivers/gpu/drm/msm/msm_snapshot.h new file mode 100644 index 000000000000..247e1358c885 --- /dev/null +++ b/drivers/gpu/drm/msm/msm_snapshot.h @@ -0,0 +1,85 @@ +/* Copyright (c) 2016 The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#ifndef MSM_SNAPSHOT_H_ +#define MSM_SNAPSHOT_H_ + +#include <linux/seq_buf.h> +#include <linux/seq_file.h> +#include "msm_snapshot_api.h" + +struct msm_snapshot { + void *ptr; + struct seq_buf buf; + phys_addr_t physaddr; + uint32_t index; + uint32_t remain; + unsigned long timestamp; + void *priv; +}; + +/* Write a uint32_t value to the next position in the snapshot buffer */ +static inline void SNAPSHOT_WRITE_U32(struct msm_snapshot *snapshot, + uint32_t value) +{ + seq_buf_putmem(&snapshot->buf, &value, sizeof(value)); +} + +/* Copy a block of memory to the next position in the snapshot buffer */ +static inline void SNAPSHOT_MEMCPY(struct msm_snapshot *snapshot, void *src, + uint32_t size) +{ + if (size) + seq_buf_putmem(&snapshot->buf, src, size); +} + +static inline bool _snapshot_header(struct msm_snapshot *snapshot, + struct msm_snapshot_section_header *header, + u32 headsz, u32 datasz, u32 id) +{ + u32 size = headsz + datasz; + + if (seq_buf_buffer_left(&snapshot->buf) <= size) + return false; + + /* Fill in the section header */ + header->magic = SNAPSHOT_SECTION_MAGIC; + header->id = id; + header->size = headsz + datasz; + + /* Write the section header */ + seq_buf_putmem(&snapshot->buf, header, headsz); + + /* The caller will fill in the data from here */ + return true; +} + +/* SNAPSHOT_HEADER + * _snapshot: pointer to struct msm_snapshot + * _header: Local variable containing the sub-section header + * _id: Section ID to write + * _dwords: Size of the data section (in dwords) + */ +#define SNAPSHOT_HEADER(_snapshot, _header, _id, _dwords) \ + _snapshot_header((_snapshot), \ + (struct msm_snapshot_section_header *) &(_header), \ + sizeof(_header), (_dwords) << 2, (_id)) + +struct msm_gpu; + +struct msm_snapshot *msm_snapshot_new(struct msm_gpu *gpu); +void msm_snapshot_destroy(struct msm_gpu *gpu, struct msm_snapshot *snapshot); +int msm_gpu_snapshot(struct msm_gpu *gpu, struct msm_snapshot *snapshot); +int msm_snapshot_write(struct msm_gpu *gpu, struct seq_file *m); + +#endif + diff --git a/drivers/gpu/drm/msm/msm_snapshot_api.h b/drivers/gpu/drm/msm/msm_snapshot_api.h new file mode 100644 index 000000000000..9f0adb9ee784 --- /dev/null +++ b/drivers/gpu/drm/msm/msm_snapshot_api.h @@ -0,0 +1,121 @@ +/* Copyright (c) 2016 The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details.
+ */ + +#ifndef MSM_SNAPSHOT_API_H_ +#define MSM_SNAPSHOT_API_H_ + +#include <linux/types.h> + +/* High word is the magic, low word is the snapshot header version */ +#define SNAPSHOT_MAGIC 0x504D0002 + +struct msm_snapshot_header { + __u32 magic; + __u32 gpuid; + __u32 chipid; +} __packed; + +#define SNAPSHOT_SECTION_MAGIC 0xABCD + +struct msm_snapshot_section_header { + __u16 magic; + __u16 id; + __u32 size; +} __packed; + +/* Section identifiers */ +#define SNAPSHOT_SECTION_OS 0x0101 +#define SNAPSHOT_SECTION_REGS_V2 0x0202 +#define SNAPSHOT_SECTION_RB_V2 0x0302 +#define SNAPSHOT_SECTION_IB_V2 0x0402 +#define SNAPSHOT_SECTION_INDEXED_REGS 0x0501 +#define SNAPSHOT_SECTION_DEBUG 0x0901 +#define SNAPSHOT_SECTION_DEBUGBUS 0x0A01 +#define SNAPSHOT_SECTION_GPU_OBJECT_V2 0x0B02 +#define SNAPSHOT_SECTION_MEMLIST_V2 0x0E02 +#define SNAPSHOT_SECTION_SHADER 0x1201 +#define SNAPSHOT_SECTION_END 0xFFFF + +#define SNAPSHOT_OS_LINUX_V3 0x00000202 + +struct msm_snapshot_linux { + struct msm_snapshot_section_header header; + int osid; + __u32 seconds; + __u32 power_flags; + __u32 power_level; + __u32 power_interval_timeout; + __u32 grpclk; + __u32 busclk; + __u64 ptbase; + __u32 pid; + __u32 current_context; + __u32 ctxtcount; + unsigned char release[32]; + unsigned char version[32]; + unsigned char comm[16]; +} __packed; + +struct msm_snapshot_ringbuffer { + struct msm_snapshot_section_header header; + int start; + int end; + int rbsize; + int wptr; + int rptr; + int count; + __u32 timestamp_queued; + __u32 timestamp_retired; + __u64 gpuaddr; + __u32 id; +} __packed; + +struct msm_snapshot_regs { + struct msm_snapshot_section_header header; + __u32 count; +} __packed; + +struct msm_snapshot_indexed_regs { + struct msm_snapshot_section_header header; + __u32 index_reg; + __u32 data_reg; + __u32 start; + __u32 count; +} __packed; + +#define SNAPSHOT_DEBUG_CP_MEQ 7 +#define SNAPSHOT_DEBUG_CP_PM4_RAM 8 +#define SNAPSHOT_DEBUG_CP_PFP_RAM 9 +#define SNAPSHOT_DEBUG_CP_ROQ 10 +#define SNAPSHOT_DEBUG_SHADER_MEMORY 11 +#define SNAPSHOT_DEBUG_CP_MERCIU 12 + +struct msm_snapshot_debug { + struct msm_snapshot_section_header header; + __u32 type; + __u32 size; +} __packed; + +struct msm_snapshot_debugbus { + struct msm_snapshot_section_header header; + __u32 id; + __u32 count; +} __packed; + +struct msm_snapshot_shader { + struct msm_snapshot_section_header header; + __u32 type; + __u32 index; + __u32 size; +} __packed; + +#endif -- cgit v1.2.3 From 4b508ebeb81e6908d8c2eb359e90329f661ecdac Mon Sep 17 00:00:00 2001 From: Jordan Crouse Date: Mon, 13 Feb 2017 10:14:32 -0700 Subject: drm/msm: Fix the check for the command size The overrun check for the size of submitted commands is off by one. It should allow the offset plus the size to be equal to the size of the memory object when the command stream is very tightly constructed.
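To make the new bound concrete, here is a minimal standalone sketch of the corrected check (an illustration only: cmd_in_bounds() and its size_t parameters are hypothetical stand-ins for submit_cmd.size, submit_cmd.submit_offset and msm_obj->base.size, and are not part of the patch):

#include <stdbool.h>
#include <stddef.h>

/*
 * Illustrative sketch (not from the patch): a command stream that ends
 * exactly at the end of the backing object is valid, so reject only a
 * zero-size command or one that runs strictly past the object.
 */
static bool cmd_in_bounds(size_t size, size_t offset, size_t obj_size)
{
	if (size == 0)
		return false;	/* an empty command stream is invalid */
	if (offset > obj_size)
		return false;	/* the offset itself is out of range */
	/* equivalent to (size + offset) <= obj_size, without overflow */
	return size <= obj_size - offset;
}

Phrasing the final comparison as a subtraction also sidesteps the integer wrap-around that summing size + offset could hit with hostile inputs, which is worth keeping in mind alongside the off-by-one fix itself.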
Change-Id: Ic0dedbadec41fb8be84d7522b4dc923dbd537ce5 Signed-off-by: Jordan Crouse --- drivers/gpu/drm/msm/msm_gem_submit.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index c04c3c8e0eca..0566cefaae81 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -412,8 +412,9 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, goto out; } - if ((submit_cmd.size + submit_cmd.submit_offset) >= - msm_obj->base.size) { + if (!(submit_cmd.size) || + ((submit_cmd.size + submit_cmd.submit_offset) > + msm_obj->base.size)) { DRM_ERROR("invalid cmdstream size: %u\n", submit_cmd.size); ret = -EINVAL; goto out; -- cgit v1.2.3 From 9f8cd5dfb437c689b563c6e9a2d3d4316655ab28 Mon Sep 17 00:00:00 2001 From: Jordan Crouse Date: Mon, 13 Feb 2017 10:14:32 -0700 Subject: drm/msm: Get and enable the IOMMU clocks If we do not enable the IOMMU clocks at attach time, they might be shut off automatically when other devices power-collapse, which would affect our ability to switch the pagetable dynamically. There is little power downside to just leaving them on for as long as the main device is attached (which, in practice, is all the time). Change-Id: Ic0dedbad8f6d2ee2a2cb9516e062af2421d91052 Signed-off-by: Jordan Crouse --- drivers/gpu/drm/msm/msm_iommu.c | 60 ++++++++++++++++++++++++++++++++++++++++- drivers/gpu/drm/msm/msm_iommu.h | 5 +++- 2 files changed, 63 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_iommu.c b/drivers/gpu/drm/msm/msm_iommu.c index acbab7494de4..3c16222b8890 100644 --- a/drivers/gpu/drm/msm/msm_iommu.c +++ b/drivers/gpu/drm/msm/msm_iommu.c @@ -26,6 +26,47 @@ static int msm_fault_handler(struct iommu_domain *iommu, struct device *dev, return 0; } +/* + * Get and enable the IOMMU clocks so that they stay on for the entire + * time we are attached and we can safely change the pagetable from the + * GPU + */ +static void _get_iommu_clocks(struct msm_mmu *mmu, struct platform_device *pdev) +{ + struct msm_iommu *iommu = to_msm_iommu(mmu); + struct device *dev; + struct property *prop; + const char *name; + int i = 0; + + if (WARN_ON(!pdev)) + return; + + dev = &pdev->dev; + + iommu->nr_clocks = + of_property_count_strings(dev->of_node, "clock-names"); + + if (iommu->nr_clocks < 0) { + iommu->nr_clocks = 0; + return; + } + + if (WARN_ON(iommu->nr_clocks > ARRAY_SIZE(iommu->clocks))) + iommu->nr_clocks = ARRAY_SIZE(iommu->clocks); + + of_property_for_each_string(dev->of_node, "clock-names", prop, name) { + if (i == iommu->nr_clocks) + break; + + iommu->clocks[i] = clk_get(dev, name); + if (!IS_ERR(iommu->clocks[i])) + clk_prepare_enable(iommu->clocks[i]); + + i++; + } +} + static int _attach_iommu_device(struct msm_mmu *mmu, struct iommu_domain *domain, const char **names, int cnt) { @@ -58,7 +99,11 @@ static int _attach_iommu_device(struct msm_mmu *mmu, if (!pdev) continue; + _get_iommu_clocks(mmu, + of_find_device_by_node(node->parent)); + mmu->dev = &pdev->dev; + return iommu_attach_device(domain, mmu->dev); } } @@ -128,6 +173,19 @@ static int msm_iommu_attach_dynamic(struct msm_mmu *mmu, const char **names, } static void msm_iommu_detach(struct msm_mmu *mmu) +{ + struct msm_iommu *iommu = to_msm_iommu(mmu); + int i; + + iommu_detach_device(iommu->domain, mmu->dev); + + for (i = 0; i < iommu->nr_clocks; i++) { + if (!IS_ERR_OR_NULL(iommu->clocks[i])) + clk_disable_unprepare(iommu->clocks[i]); + } +} + +static void msm_iommu_detach_dynamic(struct
msm_mmu *mmu) { struct msm_iommu *iommu = to_msm_iommu(mmu); iommu_detach_device(iommu->domain, mmu->dev); @@ -216,7 +274,7 @@ static const struct msm_mmu_funcs funcs = { static const struct msm_mmu_funcs dynamic_funcs = { .attach = msm_iommu_attach_dynamic, - .detach = msm_iommu_detach, + .detach = msm_iommu_detach_dynamic, .map = msm_iommu_map, .unmap = msm_iommu_unmap, .destroy = msm_iommu_destroy, diff --git a/drivers/gpu/drm/msm/msm_iommu.h b/drivers/gpu/drm/msm/msm_iommu.h index bfb75015efbc..d005cfb9758f 100644 --- a/drivers/gpu/drm/msm/msm_iommu.h +++ b/drivers/gpu/drm/msm/msm_iommu.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 The Linux Foundation. All rights reserved. +/* Copyright (c) 2016-2017 The Linux Foundation. All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 and @@ -22,6 +22,9 @@ struct msm_iommu { phys_addr_t ttbr0; uint32_t contextidr; bool allow_dynamic; + + struct clk *clocks[5]; + int nr_clocks; }; #define to_msm_iommu(x) container_of(x, struct msm_iommu, base) -- cgit v1.2.3
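For readers following the clock handling in this patch, below is a condensed, self-contained sketch of the acquire/enable/teardown pattern it relies on, written with explicit ERR_PTR handling (the example_* helpers and the MAX_CLOCKS bound are illustrative and not part of the driver):

#include <linux/clk.h>
#include <linux/err.h>
#include <linux/of.h>

#define MAX_CLOCKS 5

/*
 * Illustrative sketch: grab every clock named in the device's
 * "clock-names" property and leave it prepared and enabled until
 * teardown. clk_get() returns an ERR_PTR on failure, never NULL, so
 * the result must be checked with IS_ERR() before use.
 */
static void example_enable_clocks(struct device *dev,
		struct clk **clocks, int *nr_clocks)
{
	struct property *prop;
	const char *name;
	int i = 0;

	*nr_clocks = of_property_count_strings(dev->of_node, "clock-names");
	if (*nr_clocks < 0)
		*nr_clocks = 0;
	if (*nr_clocks > MAX_CLOCKS)
		*nr_clocks = MAX_CLOCKS;

	of_property_for_each_string(dev->of_node, "clock-names", prop, name) {
		if (i == *nr_clocks)
			break;

		clocks[i] = clk_get(dev, name);
		if (!IS_ERR(clocks[i]))
			clk_prepare_enable(clocks[i]);
		i++;
	}
}

static void example_disable_clocks(struct clk **clocks, int nr_clocks)
{
	int i;

	for (i = 0; i < nr_clocks; i++) {
		if (IS_ERR_OR_NULL(clocks[i]))
			continue;
		clk_disable_unprepare(clocks[i]);
		clk_put(clocks[i]);
	}
}

A clk_bulk-style helper could arguably shrink this, but the open-coded loop keeps the per-clock error policy explicit: a missing or bad clock is skipped rather than failing the whole attach.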