author		Linux Build Service Account <lnxbuild@localhost>	2017-02-23 21:35:10 -0800
committer	Gerrit - the friendly Code Review server <code-review@localhost>	2017-02-23 21:35:09 -0800
commit		ef49b42e4abb3f8c6f8a81d794332f078aeaea2a (patch)
tree		243313f0f3e6d11f44502481547443d2e024be23
parent		fb98e68c1c8cb2c66827e10ae7c8049ad52638b2 (diff)
parent		9f8cd5dfb437c689b563c6e9a2d3d4316655ab28 (diff)
Merge "drm/msm: Get and enable the IOMMU clocks"
36 files changed, 3262 insertions, 420 deletions
diff --git a/drivers/gpu/drm/msm/Makefile b/drivers/gpu/drm/msm/Makefile
index 9f035b10875b..b77fdd098471 100644
--- a/drivers/gpu/drm/msm/Makefile
+++ b/drivers/gpu/drm/msm/Makefile
@@ -54,7 +54,9 @@ msm_drm-y += adreno/adreno_device.o \
 	adreno/a3xx_gpu.o \
 	adreno/a4xx_gpu.o \
 	adreno/a5xx_gpu.o \
-	adreno/a5xx_power.o
+	adreno/a5xx_power.o \
+	adreno/a5xx_preempt.o \
+	adreno/a5xx_snapshot.o
 endif
 
 msm_drm-$(CONFIG_DRM_MSM_MDP4) += mdp/mdp4/mdp4_crtc.o \
@@ -130,6 +132,7 @@ msm_drm-$(CONFIG_DRM_MSM) += \
 	msm_perf.o \
 	msm_rd.o \
 	msm_ringbuffer.o \
-	msm_prop.o
+	msm_prop.o \
+	msm_snapshot.o
 
 obj-$(CONFIG_DRM_MSM) += msm_drm.o
diff --git a/drivers/gpu/drm/msm/adreno/a3xx_gpu.c b/drivers/gpu/drm/msm/adreno/a3xx_gpu.c
index 5a061ad6225a..c4f886fd6037 100644
--- a/drivers/gpu/drm/msm/adreno/a3xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a3xx_gpu.c
@@ -40,10 +40,11 @@ extern bool hang_debug;
 
 static void a3xx_dump(struct msm_gpu *gpu);
+static bool a3xx_idle(struct msm_gpu *gpu);
 
 static bool a3xx_me_init(struct msm_gpu *gpu)
 {
-	struct msm_ringbuffer *ring = gpu->rb;
+	struct msm_ringbuffer *ring = gpu->rb[0];
 
 	OUT_PKT3(ring, CP_ME_INIT, 17);
 	OUT_RING(ring, 0x000003f7);
@@ -64,8 +65,8 @@ static bool a3xx_me_init(struct msm_gpu *gpu)
 	OUT_RING(ring, 0x00000000);
 	OUT_RING(ring, 0x00000000);
 
-	gpu->funcs->flush(gpu);
-	return gpu->funcs->idle(gpu);
+	gpu->funcs->flush(gpu, ring);
+	return a3xx_idle(gpu);
 }
 
 static int a3xx_hw_init(struct msm_gpu *gpu)
@@ -331,7 +332,7 @@ static void a3xx_destroy(struct msm_gpu *gpu)
 static bool a3xx_idle(struct msm_gpu *gpu)
 {
 	/* wait for ringbuffer to drain: */
-	if (!adreno_idle(gpu))
+	if (!adreno_idle(gpu, gpu->rb[0]))
 		return false;
 
 	/* then wait for GPU to finish: */
@@ -439,9 +440,10 @@ static const struct adreno_gpu_funcs funcs = {
 		.pm_resume = msm_gpu_pm_resume,
 		.recover = a3xx_recover,
 		.last_fence = adreno_last_fence,
+		.submitted_fence = adreno_submitted_fence,
 		.submit = adreno_submit,
 		.flush = adreno_flush,
-		.idle = a3xx_idle,
+		.active_ring = adreno_active_ring,
 		.irq = a3xx_irq,
 		.destroy = a3xx_destroy,
 #ifdef CONFIG_DEBUG_FS
@@ -489,7 +491,7 @@ struct msm_gpu *a3xx_gpu_init(struct drm_device *dev)
 	adreno_gpu->registers = a3xx_registers;
 	adreno_gpu->reg_offsets = a3xx_register_offsets;
 
-	ret = adreno_gpu_init(dev, pdev, adreno_gpu, &funcs);
+	ret = adreno_gpu_init(dev, pdev, adreno_gpu, &funcs, 1);
 	if (ret)
 		goto fail;
diff --git a/drivers/gpu/drm/msm/adreno/a4xx_gpu.c b/drivers/gpu/drm/msm/adreno/a4xx_gpu.c
index 47c9b22b0801..534a7c3fbdca 100644
--- a/drivers/gpu/drm/msm/adreno/a4xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a4xx_gpu.c
@@ -31,6 +31,7 @@ extern bool hang_debug;
 
 static void a4xx_dump(struct msm_gpu *gpu);
+static bool a4xx_idle(struct msm_gpu *gpu);
 
 /*
  * a4xx_enable_hwcg() - Program the clock control registers
@@ -115,7 +116,7 @@ static void a4xx_enable_hwcg(struct msm_gpu *gpu)
 
 static bool a4xx_me_init(struct msm_gpu *gpu)
 {
-	struct msm_ringbuffer *ring = gpu->rb;
+	struct msm_ringbuffer *ring = gpu->rb[0];
 
 	OUT_PKT3(ring, CP_ME_INIT, 17);
 	OUT_RING(ring, 0x000003f7);
@@ -136,8 +137,8 @@ static bool a4xx_me_init(struct msm_gpu *gpu)
 	OUT_RING(ring, 0x00000000);
 	OUT_RING(ring, 0x00000000);
 
-	gpu->funcs->flush(gpu);
-	return gpu->funcs->idle(gpu);
+	gpu->funcs->flush(gpu, ring);
+	return a4xx_idle(gpu);
 }
 
 static int a4xx_hw_init(struct msm_gpu *gpu)
@@ -329,7 +330,7 @@ static void a4xx_destroy(struct msm_gpu *gpu)
 static bool a4xx_idle(struct msm_gpu *gpu)
 {
 	/* wait for ringbuffer to drain: */
-	if (!adreno_idle(gpu))
+	if (!adreno_idle(gpu, gpu->rb[0]))
 		return false;
 
 	/* then wait for GPU to finish: */
@@ -522,9 +523,10 @@ static const struct adreno_gpu_funcs funcs = {
 		.pm_resume = a4xx_pm_resume,
 		.recover = a4xx_recover,
 		.last_fence = adreno_last_fence,
+		.submitted_fence = adreno_submitted_fence,
 		.submit = adreno_submit,
 		.flush = adreno_flush,
-		.idle = a4xx_idle,
+		.active_ring = adreno_active_ring,
 		.irq = a4xx_irq,
 		.destroy = a4xx_destroy,
 #ifdef CONFIG_DEBUG_FS
@@ -566,7 +568,7 @@ struct msm_gpu *a4xx_gpu_init(struct drm_device *dev)
 	adreno_gpu->registers = a4xx_registers;
 	adreno_gpu->reg_offsets = a4xx_register_offsets;
 
-	ret = adreno_gpu_init(dev, pdev, adreno_gpu, &funcs);
+	ret = adreno_gpu_init(dev, pdev, adreno_gpu, &funcs, 1);
 	if (ret)
 		goto fail;
diff --git a/drivers/gpu/drm/msm/adreno/a5xx.xml.h b/drivers/gpu/drm/msm/adreno/a5xx.xml.h
index bfee2fd83462..56dad2217289 100644
--- a/drivers/gpu/drm/msm/adreno/a5xx.xml.h
+++ b/drivers/gpu/drm/msm/adreno/a5xx.xml.h
@@ -155,6 +155,114 @@ enum a5xx_depth_format {
 	DEPTH5_32 = 4,
 };
 
+enum a5xx_debugbus {
+	A5XX_RBBM_DBGBUS_CP = 1,
+	A5XX_RBBM_DBGBUS_RBBM = 2,
+	A5XX_RBBM_DBGBUS_VBIF = 3,
+	A5XX_RBBM_DBGBUS_HLSQ = 4,
+	A5XX_RBBM_DBGBUS_UCHE = 5,
+	A5XX_RBBM_DBGBUS_DPM = 6,
+	A5XX_RBBM_DBGBUS_TESS = 7,
+	A5XX_RBBM_DBGBUS_PC = 8,
+	A5XX_RBBM_DBGBUS_VFDP = 9,
+	A5XX_RBBM_DBGBUS_VPC = 10,
+	A5XX_RBBM_DBGBUS_TSE = 11,
+	A5XX_RBBM_DBGBUS_RAS = 12,
+	A5XX_RBBM_DBGBUS_VSC = 13,
+	A5XX_RBBM_DBGBUS_COM = 14,
+	A5XX_RBBM_DBGBUS_DCOM = 15,
+	A5XX_RBBM_DBGBUS_LRZ = 16,
+	A5XX_RBBM_DBGBUS_A2D_DSP = 17,
+	A5XX_RBBM_DBGBUS_CCUFCHE = 18,
+	A5XX_RBBM_DBGBUS_GPMU = 19,
+	A5XX_RBBM_DBGBUS_RBP = 20,
+	A5XX_RBBM_DBGBUS_HM = 21,
+	A5XX_RBBM_DBGBUS_RBBM_CFG = 22,
+	A5XX_RBBM_DBGBUS_VBIF_CX = 23,
+	A5XX_RBBM_DBGBUS_GPC = 29,
+	A5XX_RBBM_DBGBUS_LARC = 30,
+	A5XX_RBBM_DBGBUS_HLSQ_SPTP = 31,
+	A5XX_RBBM_DBGBUS_RB_0 = 32,
+	A5XX_RBBM_DBGBUS_RB_1 = 33,
+	A5XX_RBBM_DBGBUS_RB_2 = 34,
+	A5XX_RBBM_DBGBUS_RB_3 = 35,
+	A5XX_RBBM_DBGBUS_CCU_0 = 40,
+	A5XX_RBBM_DBGBUS_CCU_1 = 41,
+	A5XX_RBBM_DBGBUS_CCU_2 = 42,
+	A5XX_RBBM_DBGBUS_CCU_3 = 43,
+	A5XX_RBBM_DBGBUS_A2D_RAS_0 = 48,
+	A5XX_RBBM_DBGBUS_A2D_RAS_1 = 49,
+	A5XX_RBBM_DBGBUS_A2D_RAS_2 = 50,
+	A5XX_RBBM_DBGBUS_A2D_RAS_3 = 51,
+	A5XX_RBBM_DBGBUS_VFD_0 = 56,
+	A5XX_RBBM_DBGBUS_VFD_1 = 57,
+	A5XX_RBBM_DBGBUS_VFD_2 = 58,
+	A5XX_RBBM_DBGBUS_VFD_3 = 59,
+	A5XX_RBBM_DBGBUS_SP_0 = 64,
+	A5XX_RBBM_DBGBUS_SP_1 = 65,
+	A5XX_RBBM_DBGBUS_SP_2 = 66,
+	A5XX_RBBM_DBGBUS_SP_3 = 67,
+	A5XX_RBBM_DBGBUS_TPL1_0 = 72,
+	A5XX_RBBM_DBGBUS_TPL1_1 = 73,
+	A5XX_RBBM_DBGBUS_TPL1_2 = 74,
+	A5XX_RBBM_DBGBUS_TPL1_3 = 75,
+};
+
+enum a5xx_shader_blocks {
+	A5XX_TP_W_MEMOBJ = 1,
+	A5XX_TP_W_SAMPLER = 2,
+	A5XX_TP_W_MIPMAP_BASE = 3,
+	A5XX_TP_W_MEMOBJ_TAG = 4,
+	A5XX_TP_W_SAMPLER_TAG = 5,
+	A5XX_TP_S_3D_MEMOBJ = 6,
+	A5XX_TP_S_3D_SAMPLER = 7,
+	A5XX_TP_S_3D_MEMOBJ_TAG = 8,
+	A5XX_TP_S_3D_SAMPLER_TAG = 9,
+	A5XX_TP_S_CS_MEMOBJ = 10,
+	A5XX_TP_S_CS_SAMPLER = 11,
+	A5XX_TP_S_CS_MEMOBJ_TAG = 12,
+	A5XX_TP_S_CS_SAMPLER_TAG = 13,
+	A5XX_SP_W_INSTR = 14,
+	A5XX_SP_W_CONST = 15,
+	A5XX_SP_W_UAV_SIZE = 16,
+	A5XX_SP_W_CB_SIZE = 17,
+	A5XX_SP_W_UAV_BASE = 18,
+	A5XX_SP_W_CB_BASE = 19,
+	A5XX_SP_W_INST_TAG = 20,
+	A5XX_SP_W_STATE = 21,
+	A5XX_SP_S_3D_INSTR = 22,
+	A5XX_SP_S_3D_CONST = 23,
+	A5XX_SP_S_3D_CB_BASE = 24,
+	A5XX_SP_S_3D_CB_SIZE = 25,
+	A5XX_SP_S_3D_UAV_BASE = 26,
+	A5XX_SP_S_3D_UAV_SIZE = 27,
+	A5XX_SP_S_CS_INSTR = 28,
+	A5XX_SP_S_CS_CONST = 29,
+	A5XX_SP_S_CS_CB_BASE = 30,
+	A5XX_SP_S_CS_CB_SIZE = 31,
+	A5XX_SP_S_CS_UAV_BASE = 32,
+	A5XX_SP_S_CS_UAV_SIZE = 33,
+	A5XX_SP_S_3D_INSTR_DIRTY = 34,
+	A5XX_SP_S_3D_CONST_DIRTY = 35,
+	A5XX_SP_S_3D_CB_BASE_DIRTY = 36,
+	A5XX_SP_S_3D_CB_SIZE_DIRTY = 37,
+	A5XX_SP_S_3D_UAV_BASE_DIRTY = 38,
+	A5XX_SP_S_3D_UAV_SIZE_DIRTY = 39,
+	A5XX_SP_S_CS_INSTR_DIRTY = 40,
+	A5XX_SP_S_CS_CONST_DIRTY = 41,
+	A5XX_SP_S_CS_CB_BASE_DIRTY = 42,
+	A5XX_SP_S_CS_CB_SIZE_DIRTY = 43,
+	A5XX_SP_S_CS_UAV_BASE_DIRTY = 44,
+	A5XX_SP_S_CS_UAV_SIZE_DIRTY = 45,
+	A5XX_HLSQ_ICB = 46,
+	A5XX_HLSQ_ICB_DIRTY = 47,
+	A5XX_HLSQ_ICB_CB_BASE_DIRTY = 48,
+	A5XX_SP_POWER_RESTORE_RAM = 64,
+	A5XX_SP_POWER_RESTORE_RAM_TAG = 65,
+	A5XX_TP_POWER_RESTORE_RAM = 66,
+	A5XX_TP_POWER_RESTORE_RAM_TAG = 67,
+};
+
 enum a5xx_tex_filter {
 	A5XX_TEX_NEAREST = 0,
 	A5XX_TEX_LINEAR = 1,
@@ -396,6 +504,18 @@ static inline uint32_t A5XX_CP_PROTECT_REG_MASK_LEN(uint32_t val)
 #define REG_A5XX_CP_POWERCTR_CP_SEL_3				0x00000bbd
 
 #define REG_A5XX_RBBM_CFG_DBGBUS_SEL_A				0x00000004
+#define A5XX_RBBM_CFG_DBGBUS_SEL_A_PING_INDEX__MASK		0x000000ff
+#define A5XX_RBBM_CFG_DBGBUS_SEL_A_PING_INDEX__SHIFT		0
+static inline uint32_t A5XX_RBBM_CFG_DBGBUS_SEL_A_PING_INDEX(uint32_t val)
+{
+	return ((val) << A5XX_RBBM_CFG_DBGBUS_SEL_A_PING_INDEX__SHIFT) & A5XX_RBBM_CFG_DBGBUS_SEL_A_PING_INDEX__MASK;
+}
+#define A5XX_RBBM_CFG_DBGBUS_SEL_A_PING_BLK_SEL__MASK		0x0000ff00
+#define A5XX_RBBM_CFG_DBGBUS_SEL_A_PING_BLK_SEL__SHIFT		8
+static inline uint32_t A5XX_RBBM_CFG_DBGBUS_SEL_A_PING_BLK_SEL(uint32_t val)
+{
+	return ((val) << A5XX_RBBM_CFG_DBGBUS_SEL_A_PING_BLK_SEL__SHIFT) & A5XX_RBBM_CFG_DBGBUS_SEL_A_PING_BLK_SEL__MASK;
+}
 
 #define REG_A5XX_RBBM_CFG_DBGBUS_SEL_B				0x00000005
 
@@ -406,6 +526,12 @@ static inline uint32_t A5XX_CP_PROTECT_REG_MASK_LEN(uint32_t val)
 #define REG_A5XX_RBBM_CFG_DBGBUS_CNTLT				0x00000008
 
 #define REG_A5XX_RBBM_CFG_DBGBUS_CNTLM				0x00000009
+#define A5XX_RBBM_CFG_DBGBUS_CNTLM_ENABLE__MASK			0x0f000000
+#define A5XX_RBBM_CFG_DBGBUS_CNTLM_ENABLE__SHIFT		24
+static inline uint32_t A5XX_RBBM_CFG_DBGBUS_CNTLM_ENABLE(uint32_t val)
+{
+	return ((val) << A5XX_RBBM_CFG_DBGBUS_CNTLM_ENABLE__SHIFT) & A5XX_RBBM_CFG_DBGBUS_CNTLM_ENABLE__MASK;
+}
 
 #define REG_A5XX_RBBM_CFG_DEBBUS_CTLTM_ENABLE_SHIFT		0x00000018
 
@@ -1413,6 +1539,12 @@ static inline uint32_t A5XX_VSC_BIN_SIZE_Y(uint32_t val)
 #define REG_A5XX_HLSQ_SPTP_RDSEL				0x00000f08
 
 #define REG_A5XX_HLSQ_DBG_READ_SEL				0x0000bc00
+#define A5XX_HLSQ_DBG_READ_SEL_STATETYPE__MASK			0x0000ff00
+#define A5XX_HLSQ_DBG_READ_SEL_STATETYPE__SHIFT			8
+static inline uint32_t A5XX_HLSQ_DBG_READ_SEL_STATETYPE(uint32_t val)
+{
+	return ((val) << A5XX_HLSQ_DBG_READ_SEL_STATETYPE__SHIFT) & A5XX_HLSQ_DBG_READ_SEL_STATETYPE__MASK;
+}
 
 #define REG_A5XX_HLSQ_DBG_AHB_READ_APERTURE			0x0000a000
 
@@ -1583,6 +1715,8 @@ static inline uint32_t A5XX_VSC_BIN_SIZE_Y(uint32_t val)
 #define REG_A5XX_VBIF_VERSION					0x00003000
 
 #define REG_A5XX_VBIF_CLKON					0x00003001
+#define A5XX_VBIF_CLKON_FORCE_ON				0x00000001
+#define A5XX_VBIF_CLKON_FORCE_ON_TESTBUS			0x00000002
 
 #define REG_A5XX_VBIF_ABIT_SORT					0x00003028
 
@@ -1601,14 +1735,27 @@ static inline uint32_t A5XX_VSC_BIN_SIZE_Y(uint32_t val)
 #define REG_A5XX_VBIF_XIN_HALT_CTRL1				0x00003081
 
 #define REG_A5XX_VBIF_TEST_BUS_OUT_CTRL				0x00003084
+#define A5XX_VBIF_TEST_BUS_OUT_CTRL_TEST_BUS_CTRL_EN		0x00000001
 
 #define REG_A5XX_VBIF_TEST_BUS1_CTRL0				0x00003085
 
 #define REG_A5XX_VBIF_TEST_BUS1_CTRL1				0x00003086
+#define A5XX_VBIF_TEST_BUS1_CTRL1_TEST_BUS1_DATA_SEL__MASK	0x0000000f
+#define A5XX_VBIF_TEST_BUS1_CTRL1_TEST_BUS1_DATA_SEL__SHIFT	0
+static inline uint32_t A5XX_VBIF_TEST_BUS1_CTRL1_TEST_BUS1_DATA_SEL(uint32_t val)
+{
+	return ((val) << A5XX_VBIF_TEST_BUS1_CTRL1_TEST_BUS1_DATA_SEL__SHIFT) & A5XX_VBIF_TEST_BUS1_CTRL1_TEST_BUS1_DATA_SEL__MASK;
+}
 
 #define REG_A5XX_VBIF_TEST_BUS2_CTRL0				0x00003087
 
 #define REG_A5XX_VBIF_TEST_BUS2_CTRL1				0x00003088
+#define A5XX_VBIF_TEST_BUS2_CTRL1_TEST_BUS2_DATA_SEL__MASK	0x0000001f
+#define A5XX_VBIF_TEST_BUS2_CTRL1_TEST_BUS2_DATA_SEL__SHIFT	0
+static inline uint32_t A5XX_VBIF_TEST_BUS2_CTRL1_TEST_BUS2_DATA_SEL(uint32_t val)
+{
+	return ((val) << A5XX_VBIF_TEST_BUS2_CTRL1_TEST_BUS2_DATA_SEL__SHIFT) & A5XX_VBIF_TEST_BUS2_CTRL1_TEST_BUS2_DATA_SEL__MASK;
+}
 
 #define REG_A5XX_VBIF_TEST_BUS_OUT				0x0000308c
diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
index 8bc3c7bee3fb..a49a7b247547 100644
--- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
@@ -12,26 +12,132 @@
  */
 
 #include "msm_gem.h"
+#include "msm_iommu.h"
 #include "a5xx_gpu.h"
 
-extern bool hang_debug;
-static void a5xx_dump(struct msm_gpu *gpu);
+static void a5xx_flush(struct msm_gpu *gpu, struct msm_ringbuffer *ring)
+{
+	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
+	struct a5xx_gpu *a5xx_gpu = to_a5xx_gpu(adreno_gpu);
+	uint32_t wptr;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ring->lock, flags);
+
+	/* Copy the shadow to the actual register */
+	ring->cur = ring->next;
+
+	/* Make sure to wrap wptr if we need to */
+	wptr = get_wptr(ring);
+
+	spin_unlock_irqrestore(&ring->lock, flags);
+
+	/* Make sure everything is posted before making a decision */
+	mb();
+
+	/* Update HW if this is the current ring and we are not in preempt */
+	if (a5xx_gpu->cur_ring == ring && !a5xx_in_preempt(a5xx_gpu))
+		gpu_write(gpu, REG_A5XX_CP_RB_WPTR, wptr);
+}
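The flush path above is the heart of the per-ring wptr handling: software writes are staged through ring->next and only published to CP_RB_WPTR when the ring is current and no preemption is in flight. A minimal compile-only sketch of that shadow-wptr pattern follows; the ring layout and get_wptr() here are illustrative stand-ins, not the driver's actual definitions.

/* Sketch of the shadow-wptr pattern used by a5xx_flush(); the types
 * and get_wptr() are hypothetical, for illustration only. */
#include <stddef.h>
#include <stdint.h>

struct ring {
	uint32_t *start;   /* ring buffer base */
	uint32_t *cur;     /* last position published to hardware */
	uint32_t *next;    /* position software has written up to */
	size_t ndwords;    /* ring size in dwords */
};

/* wptr is a dword index into the ring, wrapped at the ring size */
static uint32_t get_wptr(struct ring *r)
{
	return (uint32_t)((r->cur - r->start) % r->ndwords);
}

static uint32_t flush(struct ring *r)
{
	r->cur = r->next;   /* publish everything written so far */
	return get_wptr(r); /* value a real driver would write to CP_RB_WPTR */
}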
 
-static int a5xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,
-	struct msm_file_private *ctx)
+static void a5xx_set_pagetable(struct msm_gpu *gpu, struct msm_ringbuffer *ring,
+	struct msm_gem_address_space *aspace)
 {
 	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
-	struct msm_drm_private *priv = gpu->dev->dev_private;
-	struct msm_ringbuffer *ring = gpu->rb;
+	struct msm_mmu *mmu = aspace->mmu;
+	struct msm_iommu *iommu = to_msm_iommu(mmu);
+
+	if (!iommu->ttbr0)
+		return;
+
+	/* Turn off protected mode */
+	OUT_PKT7(ring, CP_SET_PROTECTED_MODE, 1);
+	OUT_RING(ring, 0);
+
+	/* Turn on APRIV mode to access critical regions */
+	OUT_PKT4(ring, REG_A5XX_CP_CNTL, 1);
+	OUT_RING(ring, 1);
+
+	/* Make sure the ME is synchronized before starting the update */
+	OUT_PKT7(ring, CP_WAIT_FOR_ME, 0);
+
+	/* Execute the table update */
+	OUT_PKT7(ring, CP_SMMU_TABLE_UPDATE, 3);
+	OUT_RING(ring, lower_32_bits(iommu->ttbr0));
+	OUT_RING(ring, upper_32_bits(iommu->ttbr0));
+	OUT_RING(ring, iommu->contextidr);
+
+	/*
+	 * Write the new TTBR0 to the preemption records - this will be used to
+	 * reload the pagetable if the current ring gets preempted out.
+	 */
+	OUT_PKT7(ring, CP_MEM_WRITE, 4);
+	OUT_RING(ring, lower_32_bits(rbmemptr(adreno_gpu, ring->id, ttbr0)));
+	OUT_RING(ring, upper_32_bits(rbmemptr(adreno_gpu, ring->id, ttbr0)));
+	OUT_RING(ring, lower_32_bits(iommu->ttbr0));
+	OUT_RING(ring, upper_32_bits(iommu->ttbr0));
+
+	/* Also write the current contextidr (ASID) */
+	OUT_PKT7(ring, CP_MEM_WRITE, 3);
+	OUT_RING(ring, lower_32_bits(rbmemptr(adreno_gpu, ring->id,
+		contextidr)));
+	OUT_RING(ring, upper_32_bits(rbmemptr(adreno_gpu, ring->id,
+		contextidr)));
+	OUT_RING(ring, iommu->contextidr);
+
+	/* Invalidate the draw state so we start off fresh */
+	OUT_PKT7(ring, CP_SET_DRAW_STATE, 3);
+	OUT_RING(ring, 0x40000);
+	OUT_RING(ring, 1);
+	OUT_RING(ring, 0);
+
+	/* Turn off APRIV */
+	OUT_PKT4(ring, REG_A5XX_CP_CNTL, 1);
+	OUT_RING(ring, 0);
+
+	/* Turn protected mode back on */
+	OUT_PKT7(ring, CP_SET_PROTECTED_MODE, 1);
+	OUT_RING(ring, 1);
+}
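a5xx_set_pagetable() stores the incoming TTBR0/CONTEXTIDR both in the SMMU (via CP_SMMU_TABLE_UPDATE) and in per-ring scratch memory through rbmemptr(), so a later preemption can restore them. Below is a hypothetical sketch of how such a per-ring memptrs block and offset macro could be laid out; the real structure lives in adreno_gpu.h, outside this patch, and may differ.

/* Hypothetical layout implied by the rbmemptr(adreno_gpu, ring_id,
 * member) calls in this patch - for illustration only. */
#include <stddef.h>
#include <stdint.h>

#define NR_RINGS 4

struct ring_memptrs {
	uint32_t rptr[NR_RINGS];
	uint32_t fence[NR_RINGS];
	uint64_t ttbr0[NR_RINGS];
	uint32_t contextidr[NR_RINGS];
};

/* GPU address of one member for one ring: base iova + field offset */
#define rbmemptr(base_iova, id, member) \
	((base_iova) + offsetof(struct ring_memptrs, member) + \
	 (id) * sizeof(((struct ring_memptrs *)0)->member[0]))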
 
+static int a5xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
+{
+	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
+	struct a5xx_gpu *a5xx_gpu = to_a5xx_gpu(adreno_gpu);
+	struct msm_ringbuffer *ring = gpu->rb[submit->ring];
 	unsigned int i, ibs = 0;
 
+	a5xx_set_pagetable(gpu, ring, submit->aspace);
+
+	OUT_PKT7(ring, CP_PREEMPT_ENABLE_GLOBAL, 1);
+	OUT_RING(ring, 0x02);
+
+	/* Turn off protected mode to write to special registers */
+	OUT_PKT7(ring, CP_SET_PROTECTED_MODE, 1);
+	OUT_RING(ring, 0);
+
+	/* Set the save preemption record for the ring/command */
+	OUT_PKT4(ring, REG_A5XX_CP_CONTEXT_SWITCH_SAVE_ADDR_LO, 2);
+	OUT_RING(ring, lower_32_bits(a5xx_gpu->preempt_iova[submit->ring]));
+	OUT_RING(ring, upper_32_bits(a5xx_gpu->preempt_iova[submit->ring]));
+
+	/* Turn back on protected mode */
+	OUT_PKT7(ring, CP_SET_PROTECTED_MODE, 1);
+	OUT_RING(ring, 1);
+
+	/* Enable local preemption for finegrain preemption */
+	OUT_PKT7(ring, CP_PREEMPT_ENABLE_GLOBAL, 1);
+	OUT_RING(ring, 0x02);
+
+	/* Allow CP_CONTEXT_SWITCH_YIELD packets in the IB2 */
+	OUT_PKT7(ring, CP_YIELD_ENABLE, 1);
+	OUT_RING(ring, 0x02);
+
+	/* Submit the commands */
 	for (i = 0; i < submit->nr_cmds; i++) {
 		switch (submit->cmd[i].type) {
 		case MSM_SUBMIT_CMD_IB_TARGET_BUF:
 			break;
-		case MSM_SUBMIT_CMD_CTX_RESTORE_BUF:
-			if (priv->lastctx == ctx)
-				break;
 		case MSM_SUBMIT_CMD_BUF:
 			OUT_PKT7(ring, CP_INDIRECT_BUFFER_PFE, 3);
 			OUT_RING(ring, lower_32_bits(submit->cmd[i].iova));
@@ -42,16 +148,55 @@ static int a5xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,
 		}
 	}
 
+	/*
+	 * Write the render mode to NULL (0) to indicate to the CP that the IBs
+	 * are done rendering - otherwise a lucky preemption would start
+	 * replaying from the last checkpoint
+	 */
+	OUT_PKT7(ring, CP_SET_RENDER_MODE, 5);
+	OUT_RING(ring, 0);
+	OUT_RING(ring, 0);
+	OUT_RING(ring, 0);
+	OUT_RING(ring, 0);
+	OUT_RING(ring, 0);
+
+	/* Turn off IB level preemptions */
+	OUT_PKT7(ring, CP_YIELD_ENABLE, 1);
+	OUT_RING(ring, 0x01);
+
+	/* Write the fence to the scratch register */
 	OUT_PKT4(ring, REG_A5XX_CP_SCRATCH_REG(2), 1);
 	OUT_RING(ring, submit->fence);
 
+	/*
+	 * Execute a CACHE_FLUSH_TS event. This will ensure that the
+	 * timestamp is written to the memory and then triggers the interrupt
+	 */
 	OUT_PKT7(ring, CP_EVENT_WRITE, 4);
 	OUT_RING(ring, CACHE_FLUSH_TS | (1 << 31));
-	OUT_RING(ring, lower_32_bits(rbmemptr(adreno_gpu, fence)));
-	OUT_RING(ring, upper_32_bits(rbmemptr(adreno_gpu, fence)));
+	OUT_RING(ring, lower_32_bits(rbmemptr(adreno_gpu, ring->id, fence)));
+	OUT_RING(ring, upper_32_bits(rbmemptr(adreno_gpu, ring->id, fence)));
 	OUT_RING(ring, submit->fence);
 
-	gpu->funcs->flush(gpu);
+	/* Yield the floor on command completion */
+	OUT_PKT7(ring, CP_CONTEXT_SWITCH_YIELD, 4);
+	/*
+	 * If dword[2:1] are non zero, they specify an address for the CP to
+	 * write the value of dword[3] to on preemption complete. Write 0 to
+	 * skip the write
+	 */
+	OUT_RING(ring, 0x00);
+	OUT_RING(ring, 0x00);
+	/* Data value - not used if the address above is 0 */
+	OUT_RING(ring, 0x01);
+	/* Set bit 0 to trigger an interrupt on preempt complete */
+	OUT_RING(ring, 0x01);
+
+	a5xx_flush(gpu, ring);
+
+	/* Check to see if we need to start preemption */
+	a5xx_preempt_trigger(gpu);
 
 	return 0;
 }
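Per the comments in a5xx_submit() above, the CP_CONTEXT_SWITCH_YIELD payload is two dwords of optional completion-write address (zero to skip), one data dword, and one dword whose bit 0 requests an interrupt. A small runnable sketch that packs the payload exactly as described; the struct and helper names are invented for illustration.

/* Sketch of the CP_CONTEXT_SWITCH_YIELD payload described above. */
#include <stdint.h>

struct yield_payload {
	uint32_t dwords[4];
};

static struct yield_payload yield_pkt(uint64_t addr, uint32_t value, int irq)
{
	struct yield_payload p;

	p.dwords[0] = (uint32_t)(addr & 0xffffffff); /* write address, low */
	p.dwords[1] = (uint32_t)(addr >> 32);        /* write address, high */
	p.dwords[2] = value;                         /* ignored if addr == 0 */
	p.dwords[3] = irq ? 0x01 : 0x00;             /* bit 0: irq on complete */
	return p;
}

/* The submit path above emits the equivalent of yield_pkt(0, 1, 1):
 * no completion write, but an interrupt when the preemption finishes. */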
@@ -154,28 +299,31 @@ static const struct {
 	{REG_A5XX_RBBM_CLOCK_DELAY_VFD, 0x00002222}
 };
 
-static void a5xx_enable_hwcg(struct msm_gpu *gpu)
+void a5xx_set_hwcg(struct msm_gpu *gpu, bool state)
 {
 	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
 	unsigned int i;
 
 	for (i = 0; i < ARRAY_SIZE(a5xx_hwcg); i++)
-		gpu_write(gpu, a5xx_hwcg[i].offset, a5xx_hwcg[i].value);
+		gpu_write(gpu, a5xx_hwcg[i].offset,
+			state ? a5xx_hwcg[i].value : 0);
 
 	/* There are a few additional registers just for A540 */
 	if (adreno_is_a540(adreno_gpu)) {
-		gpu_write(gpu, REG_A5XX_RBBM_CLOCK_DELAY_GPMU, 0x770);
-		gpu_write(gpu, REG_A5XX_RBBM_CLOCK_HYST_GPMU, 0x004);
+		gpu_write(gpu, REG_A5XX_RBBM_CLOCK_DELAY_GPMU,
+			state ? 0x770 : 0);
+		gpu_write(gpu, REG_A5XX_RBBM_CLOCK_HYST_GPMU,
+			state ? 0x004 : 0);
 	}
 
-	gpu_write(gpu, REG_A5XX_RBBM_CLOCK_CNTL, 0xAAA8AA00);
-	gpu_write(gpu, REG_A5XX_RBBM_ISDB_CNT, 0x182);
+	gpu_write(gpu, REG_A5XX_RBBM_CLOCK_CNTL, state ? 0xAAA8AA00 : 0);
+	gpu_write(gpu, REG_A5XX_RBBM_ISDB_CNT, state ? 0x182 : 0x180);
 }
 
 static int a5xx_me_init(struct msm_gpu *gpu)
 {
 	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
-	struct msm_ringbuffer *ring = gpu->rb;
+	struct msm_ringbuffer *ring = gpu->rb[0];
 
 	OUT_PKT7(ring, CP_ME_INIT, 8);
 
@@ -206,11 +354,54 @@ static int a5xx_me_init(struct msm_gpu *gpu)
 	OUT_RING(ring, 0x00000000);
 	OUT_RING(ring, 0x00000000);
 
-	gpu->funcs->flush(gpu);
+	gpu->funcs->flush(gpu, ring);
+	return a5xx_idle(gpu, ring) ? 0 : -EINVAL;
+}
+
+static int a5xx_preempt_start(struct msm_gpu *gpu)
+{
+	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
+	struct a5xx_gpu *a5xx_gpu = to_a5xx_gpu(adreno_gpu);
+	struct msm_ringbuffer *ring = gpu->rb[0];
+
+	if (gpu->nr_rings == 1)
+		return 0;
+
+	/* Turn off protected mode to write to special registers */
+	OUT_PKT7(ring, CP_SET_PROTECTED_MODE, 1);
+	OUT_RING(ring, 0);
+
+	/* Set the save preemption record for the ring/command */
+	OUT_PKT4(ring, REG_A5XX_CP_CONTEXT_SWITCH_SAVE_ADDR_LO, 2);
+	OUT_RING(ring, lower_32_bits(a5xx_gpu->preempt_iova[ring->id]));
+	OUT_RING(ring, upper_32_bits(a5xx_gpu->preempt_iova[ring->id]));
+
+	/* Turn back on protected mode */
+	OUT_PKT7(ring, CP_SET_PROTECTED_MODE, 1);
+	OUT_RING(ring, 1);
 
-	return gpu->funcs->idle(gpu) ? 0 : -EINVAL;
+	OUT_PKT7(ring, CP_PREEMPT_ENABLE_GLOBAL, 1);
+	OUT_RING(ring, 0x00);
+
+	OUT_PKT7(ring, CP_PREEMPT_ENABLE_LOCAL, 1);
+	OUT_RING(ring, 0x01);
+
+	OUT_PKT7(ring, CP_YIELD_ENABLE, 1);
+	OUT_RING(ring, 0x01);
+
+	/* Yield the floor on command completion */
+	OUT_PKT7(ring, CP_CONTEXT_SWITCH_YIELD, 4);
+	OUT_RING(ring, 0x00);
+	OUT_RING(ring, 0x00);
+	OUT_RING(ring, 0x01);
+	OUT_RING(ring, 0x01);
+
+	gpu->funcs->flush(gpu, ring);
+
+	return a5xx_idle(gpu, ring) ? 0 : -EINVAL;
 }
+
 static struct drm_gem_object *a5xx_ucode_load_bo(struct msm_gpu *gpu,
 	const struct firmware *fw, u64 *iova)
 {
@@ -354,6 +545,7 @@ static void a5xx_zap_shader_init(struct msm_gpu *gpu)
 	  A5XX_RBBM_INT_0_MASK_RBBM_ATB_ASYNC_OVERFLOW | \
 	  A5XX_RBBM_INT_0_MASK_CP_HW_ERROR | \
 	  A5XX_RBBM_INT_0_MASK_MISC_HANG_DETECT | \
+	  A5XX_RBBM_INT_0_MASK_CP_SW | \
 	  A5XX_RBBM_INT_0_MASK_CP_CACHE_FLUSH_TS | \
 	  A5XX_RBBM_INT_0_MASK_UCHE_OOB_ACCESS | \
 	  A5XX_RBBM_INT_0_MASK_GPMU_VOLTAGE_DROOP)
@@ -445,7 +637,7 @@ static int a5xx_hw_init(struct msm_gpu *gpu)
 	gpu_write(gpu, REG_A5XX_RBBM_AHB_CNTL1, 0xA6FFFFFF);
 
 	/* Enable HWCG */
-	a5xx_enable_hwcg(gpu);
+	a5xx_set_hwcg(gpu, true);
 
 	gpu_write(gpu, REG_A5XX_RBBM_AHB_CNTL2, 0x0000003F);
 
@@ -516,6 +708,20 @@ static int a5xx_hw_init(struct msm_gpu *gpu)
 		REG_A5XX_RBBM_SECVID_TSB_TRUSTED_BASE_HI, 0x00000000);
 	gpu_write(gpu, REG_A5XX_RBBM_SECVID_TSB_TRUSTED_SIZE, 0x00000000);
 
+	/* Put the GPU into 64 bit by default */
+	gpu_write(gpu, REG_A5XX_CP_ADDR_MODE_CNTL, 0x1);
+	gpu_write(gpu, REG_A5XX_VSC_ADDR_MODE_CNTL, 0x1);
+	gpu_write(gpu, REG_A5XX_GRAS_ADDR_MODE_CNTL, 0x1);
+	gpu_write(gpu, REG_A5XX_RB_ADDR_MODE_CNTL, 0x1);
+	gpu_write(gpu, REG_A5XX_PC_ADDR_MODE_CNTL, 0x1);
+	gpu_write(gpu, REG_A5XX_HLSQ_ADDR_MODE_CNTL, 0x1);
+	gpu_write(gpu, REG_A5XX_VFD_ADDR_MODE_CNTL, 0x1);
+	gpu_write(gpu, REG_A5XX_VPC_ADDR_MODE_CNTL, 0x1);
+	gpu_write(gpu, REG_A5XX_UCHE_ADDR_MODE_CNTL, 0x1);
+	gpu_write(gpu, REG_A5XX_SP_ADDR_MODE_CNTL, 0x1);
+	gpu_write(gpu, REG_A5XX_TPL1_ADDR_MODE_CNTL, 0x1);
+	gpu_write(gpu, REG_A5XX_RBBM_SECVID_TSB_ADDR_MODE_CNTL, 0x1);
+
 	/* Load the GPMU firmware before starting the HW init */
 	a5xx_gpmu_ucode_init(gpu);
 
@@ -523,6 +729,8 @@ static int a5xx_hw_init(struct msm_gpu *gpu)
 	if (ret)
 		return ret;
 
+	a5xx_preempt_hw_init(gpu);
+
 	ret = a5xx_ucode_init(gpu);
 	if (ret)
 		return ret;
@@ -545,11 +753,11 @@ static int a5xx_hw_init(struct msm_gpu *gpu)
 	 * ticking correctly
 	 */
 	if (adreno_is_a530(adreno_gpu)) {
-		OUT_PKT7(gpu->rb, CP_EVENT_WRITE, 1);
-		OUT_RING(gpu->rb, 0x0F);
+		OUT_PKT7(gpu->rb[0], CP_EVENT_WRITE, 1);
+		OUT_RING(gpu->rb[0], 0x0F);
 
-		gpu->funcs->flush(gpu);
-		if (!gpu->funcs->idle(gpu))
+		gpu->funcs->flush(gpu, gpu->rb[0]);
+		if (!a5xx_idle(gpu, gpu->rb[0]))
 			return -EINVAL;
 	}
 
@@ -562,13 +770,13 @@ static int a5xx_hw_init(struct msm_gpu *gpu)
 	 * cause an XPU violation.
 	 */
 	if (test_bit(A5XX_ZAP_SHADER_LOADED, &a5xx_gpu->flags)) {
-		struct msm_ringbuffer *ring = gpu->rb;
+		struct msm_ringbuffer *ring = gpu->rb[0];
 
 		OUT_PKT7(ring, CP_SET_SECURE_MODE, 1);
 		OUT_RING(ring, 0x00000000);
 
-		gpu->funcs->flush(gpu);
-		if (!gpu->funcs->idle(gpu))
+		gpu->funcs->flush(gpu, gpu->rb[0]);
+		if (!a5xx_idle(gpu, gpu->rb[0]))
 			return -EINVAL;
 	} else {
 		/* Print a warning so if we die, we know why */
@@ -577,6 +785,9 @@ static int a5xx_hw_init(struct msm_gpu *gpu)
 		gpu_write(gpu, REG_A5XX_RBBM_SECVID_TRUST_CNTL, 0x0);
 	}
 
+	/* Last step - yield the ringbuffer */
+	a5xx_preempt_start(gpu);
+
 	pm_qos_update_request(&gpu->pm_qos_req_dma, 501);
 
 	return 0;
@@ -586,8 +797,7 @@ static void a5xx_recover(struct msm_gpu *gpu)
 {
 	adreno_dump_info(gpu);
 
-	if (hang_debug)
-		a5xx_dump(gpu);
+	msm_gpu_snapshot(gpu, gpu->snapshot);
 
 	/* Reset the GPU so it can work again */
 	gpu_write(gpu, REG_A5XX_RBBM_SW_RESET_CMD, 1);
@@ -604,6 +814,8 @@ static void a5xx_destroy(struct msm_gpu *gpu)
 
 	DBG("%s", gpu->name);
 
+	a5xx_preempt_fini(gpu);
+
 	if (a5xx_gpu->pm4_bo) {
 		if (a5xx_gpu->pm4_iova)
 			msm_gem_put_iova(a5xx_gpu->pm4_bo, gpu->aspace);
@@ -639,16 +851,27 @@ static inline bool _a5xx_check_idle(struct msm_gpu *gpu)
 		A5XX_RBBM_INT_0_MASK_MISC_HANG_DETECT);
 }
 
-static bool a5xx_idle(struct msm_gpu *gpu)
+bool a5xx_idle(struct msm_gpu *gpu, struct msm_ringbuffer *ring)
 {
+	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
+	struct a5xx_gpu *a5xx_gpu = to_a5xx_gpu(adreno_gpu);
+
+	if (ring != a5xx_gpu->cur_ring) {
+		WARN(1, "Tried to idle a non-current ringbuffer\n");
+		return false;
+	}
+
 	/* wait for CP to drain ringbuffer: */
-	if (!adreno_idle(gpu))
+	if (!adreno_idle(gpu, ring))
 		return false;
 
 	if (spin_until(_a5xx_check_idle(gpu))) {
-		DRM_ERROR("%s: timeout waiting for GPU to idle: status %8.8X irq %8.8X\n",
-			gpu->name,
+		DRM_ERROR(
+			"%s: timeout waiting for GPU RB %d to idle: status %8.8X rptr/wptr: %4.4X/%4.4X irq %8.8X\n",
+			gpu->name, ring->id,
 			gpu_read(gpu, REG_A5XX_RBBM_STATUS),
+			gpu_read(gpu, REG_A5XX_CP_RB_RPTR),
+			gpu_read(gpu, REG_A5XX_CP_RB_WPTR),
 			gpu_read(gpu, REG_A5XX_RBBM_INT_0_STATUS));
 
 		return false;
@@ -768,6 +991,20 @@ static void a5xx_fault_detect_irq(struct msm_gpu *gpu)
 {
 	struct drm_device *dev = gpu->dev;
 	struct msm_drm_private *priv = dev->dev_private;
+	struct msm_ringbuffer *ring = gpu->funcs->active_ring(gpu);
+
+	dev_err(dev->dev, "gpu fault ring %d fence %x status %8.8X rb %4.4x/%4.4x ib1 %16.16llX/%4.4x ib2 %16.16llX/%4.4x\n",
+		ring ? ring->id : -1, adreno_submitted_fence(gpu, ring),
+		gpu_read(gpu, REG_A5XX_RBBM_STATUS),
+		gpu_read(gpu, REG_A5XX_CP_RB_RPTR),
+		gpu_read(gpu, REG_A5XX_CP_RB_WPTR),
+		gpu_read64(gpu, REG_A5XX_CP_IB1_BASE, REG_A5XX_CP_IB1_BASE_HI),
+		gpu_read(gpu, REG_A5XX_CP_IB1_BUFSZ),
+		gpu_read64(gpu, REG_A5XX_CP_IB2_BASE, REG_A5XX_CP_IB2_BASE_HI),
+		gpu_read(gpu, REG_A5XX_CP_IB2_BUFSZ));
+
+	/* Turn off the hangcheck timer to keep it from bothering us */
+	del_timer(&gpu->hangcheck_timer);
 
 	queue_work(priv->wq, &gpu->recover_work);
 }
@@ -810,6 +1047,9 @@ static irqreturn_t a5xx_irq(struct msm_gpu *gpu)
 	if (status & A5XX_RBBM_INT_0_MASK_CP_CACHE_FLUSH_TS)
 		msm_gpu_retire(gpu);
 
+	if (status & A5XX_RBBM_INT_0_MASK_CP_SW)
+		a5xx_preempt_irq(gpu);
+
 	return IRQ_HANDLED;
 }
 
@@ -825,43 +1065,49 @@ static const u32 a5xx_register_offsets[REG_ADRENO_REGISTER_MAX] = {
 };
 
 static const u32 a5xx_registers[] = {
-	0x0000, 0x0002, 0x0004, 0x0020, 0x0022, 0x0026, 0x0029, 0x002B,
-	0x002E, 0x0035, 0x0038, 0x0042, 0x0044, 0x0044, 0x0047, 0x0095,
-	0x0097, 0x00BB, 0x03A0, 0x0464, 0x0469, 0x046F, 0x04D2, 0x04D3,
-	0x04E0, 0x0533, 0x0540, 0x0555, 0xF400, 0xF400, 0xF800, 0xF807,
-	0x0800, 0x081A, 0x081F, 0x0841, 0x0860, 0x0860, 0x0880, 0x08A0,
-	0x0B00, 0x0B12, 0x0B15, 0x0B28, 0x0B78, 0x0B7F, 0x0BB0, 0x0BBD,
-	0x0BC0, 0x0BC6, 0x0BD0, 0x0C53, 0x0C60, 0x0C61, 0x0C80, 0x0C82,
-	0x0C84, 0x0C85, 0x0C90, 0x0C98, 0x0CA0, 0x0CA0, 0x0CB0, 0x0CB2,
-	0x2180, 0x2185, 0x2580, 0x2585, 0x0CC1, 0x0CC1, 0x0CC4, 0x0CC7,
-	0x0CCC, 0x0CCC, 0x0CD0, 0x0CD8, 0x0CE0, 0x0CE5, 0x0CE8, 0x0CE8,
-	0x0CEC, 0x0CF1, 0x0CFB, 0x0D0E, 0x2100, 0x211E, 0x2140, 0x2145,
-	0x2500, 0x251E, 0x2540, 0x2545, 0x0D10, 0x0D17, 0x0D20, 0x0D23,
-	0x0D30, 0x0D30, 0x20C0, 0x20C0, 0x24C0, 0x24C0, 0x0E40, 0x0E43,
-	0x0E4A, 0x0E4A, 0x0E50, 0x0E57, 0x0E60, 0x0E7C, 0x0E80, 0x0E8E,
-	0x0E90, 0x0E96, 0x0EA0, 0x0EA8, 0x0EB0, 0x0EB2, 0xE140, 0xE147,
-	0xE150, 0xE187, 0xE1A0, 0xE1A9, 0xE1B0, 0xE1B6, 0xE1C0, 0xE1C7,
-	0xE1D0, 0xE1D1, 0xE200, 0xE201, 0xE210, 0xE21C, 0xE240, 0xE268,
-	0xE000, 0xE006, 0xE010, 0xE09A, 0xE0A0, 0xE0A4, 0xE0AA, 0xE0EB,
-	0xE100, 0xE105, 0xE380, 0xE38F, 0xE3B0, 0xE3B0, 0xE400, 0xE405,
-	0xE408, 0xE4E9, 0xE4F0, 0xE4F0, 0xE280, 0xE280, 0xE282, 0xE2A3,
-	0xE2A5, 0xE2C2, 0xE940, 0xE947, 0xE950, 0xE987, 0xE9A0, 0xE9A9,
-	0xE9B0, 0xE9B6, 0xE9C0, 0xE9C7, 0xE9D0, 0xE9D1, 0xEA00, 0xEA01,
-	0xEA10, 0xEA1C, 0xEA40, 0xEA68, 0xE800, 0xE806, 0xE810, 0xE89A,
-	0xE8A0, 0xE8A4, 0xE8AA, 0xE8EB, 0xE900, 0xE905, 0xEB80, 0xEB8F,
-	0xEBB0, 0xEBB0, 0xEC00, 0xEC05, 0xEC08, 0xECE9, 0xECF0, 0xECF0,
-	0xEA80, 0xEA80, 0xEA82, 0xEAA3, 0xEAA5, 0xEAC2, 0xA800, 0xA8FF,
-	0xAC60, 0xAC60, 0xB000, 0xB97F, 0xB9A0, 0xB9BF,
+	0x0000, 0x0002, 0x0004, 0x0020, 0x0022, 0x0026, 0x0029, 0x002b,
+	0x002e, 0x0035, 0x0038, 0x0042, 0x0044, 0x0044, 0x0047, 0x0095,
+	0x0097, 0x00bb, 0x03a0, 0x0464, 0x0469, 0x046f, 0x04d2, 0x04d3,
+	0x04e0, 0x0533, 0x0540, 0x0555, 0x0800, 0x081a, 0x081f, 0x0841,
+	0x0860, 0x0860, 0x0880, 0x08a0, 0x0b00, 0x0b12, 0x0b14, 0x0b28,
+	0x0b78, 0x0b7f, 0x0bb0, 0x0bbd, 0x0bc0, 0x0bc6, 0x0bd0, 0x0c53,
+	0x0c60, 0x0c61, 0x0c80, 0x0c82, 0x0c84, 0x0c85, 0x0c90, 0x0c9b,
+	0x0ca0, 0x0ca0, 0x0cb0, 0x0cb2, 0x0cc1, 0x0cc1, 0x0cc4, 0x0cc7,
+	0x0ccc, 0x0ccc, 0x0cd0, 0x0cdb, 0x0ce0, 0x0ce5, 0x0ce8, 0x0ce8,
+	0x0cec, 0x0cf1, 0x0cfb, 0x0d0e, 0x0d10, 0x0d17, 0x0d20, 0x0d23,
+	0x0d30, 0x0d30, 0x0e40, 0x0e43, 0x0e4a, 0x0e4a, 0x0e50, 0x0e57,
+	0x0e60, 0x0e7c, 0x0e80, 0x0e8e, 0x0e90, 0x0e96, 0x0ea0, 0x0eab,
+	0x0eb0, 0x0eb2, 0x2100, 0x211e, 0x2140, 0x2145, 0x2180, 0x2185,
+	0x2500, 0x251e, 0x2540, 0x2545, 0x2580, 0x2585, 0x3000, 0x3014,
+	0x3018, 0x302c, 0x3030, 0x3030, 0x3034, 0x3036, 0x303c, 0x303d,
+	0x3040, 0x3040, 0x3042, 0x3042, 0x3049, 0x3049, 0x3058, 0x3058,
+	0x305a, 0x3061, 0x3064, 0x3068, 0x306c, 0x306d, 0x3080, 0x3088,
+	0x308b, 0x308c, 0x3090, 0x3094, 0x3098, 0x3098, 0x309c, 0x309c,
+	0x3124, 0x3124, 0x340c, 0x340c, 0x3410, 0x3410, 0x3800, 0x3801,
+	0xa800, 0xa800, 0xa820, 0xa828, 0xa840, 0xa87d, 0xa880, 0xa88d,
+	0xa890, 0xa8a3, 0xa8a8, 0xa8aa, 0xa8c0, 0xa8c3, 0xa8c6, 0xa8ca,
+	0xa8cc, 0xa8cf, 0xa8d1, 0xa8d8, 0xa8dc, 0xa8dc, 0xa8e0, 0xa8f5,
+	0xac00, 0xac06, 0xac40, 0xac47, 0xac60, 0xac62, 0xac80, 0xac82,
+	0xb800, 0xb808, 0xb80c, 0xb812, 0xb814, 0xb817, 0xb900, 0xb904,
+	0xb906, 0xb90a, 0xb90c, 0xb90f, 0xb920, 0xb924, 0xb926, 0xb92a,
+	0xb92c, 0xb92f, 0xb940, 0xb944, 0xb946, 0xb94a, 0xb94c, 0xb94f,
+	0xb960, 0xb964, 0xb966, 0xb96a, 0xb96c, 0xb96f, 0xb980, 0xb984,
+	0xb986, 0xb98a, 0xb98c, 0xb98f, 0xb9a0, 0xb9b0, 0xb9b8, 0xb9ba,
+	0xd200, 0xd23f, 0xe000, 0xe006, 0xe010, 0xe09a, 0xe0a0, 0xe0a4,
+	0xe0aa, 0xe0eb, 0xe100, 0xe105, 0xe140, 0xe147, 0xe150, 0xe187,
+	0xe1a0, 0xe1a9, 0xe1b0, 0xe1b6, 0xe1c0, 0xe1c7, 0xe1d0, 0xe1d1,
+	0xe200, 0xe201, 0xe210, 0xe21c, 0xe240, 0xe268, 0xe280, 0xe280,
+	0xe282, 0xe2a3, 0xe2a5, 0xe2c2, 0xe380, 0xe38f, 0xe3b0, 0xe3b0,
+	0xe400, 0xe405, 0xe408, 0xe4e9, 0xe4f0, 0xe4f0, 0xe800, 0xe806,
+	0xe810, 0xe89a, 0xe8a0, 0xe8a4, 0xe8aa, 0xe8eb, 0xe900, 0xe905,
+	0xe940, 0xe947, 0xe950, 0xe987, 0xe9a0, 0xe9a9, 0xe9b0, 0xe9b6,
+	0xe9c0, 0xe9c7, 0xe9d0, 0xe9d1, 0xea00, 0xea01, 0xea10, 0xea1c,
+	0xea40, 0xea68, 0xea80, 0xea80, 0xea82, 0xeaa3, 0xeaa5, 0xeac2,
+	0xeb80, 0xeb8f, 0xebb0, 0xebb0, 0xec00, 0xec05, 0xec08, 0xece9,
+	0xecf0, 0xecf0, 0xf400, 0xf400, 0xf800, 0xf807,
 	~0
 };
 
-static void a5xx_dump(struct msm_gpu *gpu)
-{
-	dev_info(gpu->dev->dev, "status:   %08x\n",
-		gpu_read(gpu, REG_A5XX_RBBM_STATUS));
-	adreno_dump(gpu);
-}
-
 static int a5xx_pm_resume(struct msm_gpu *gpu)
 {
 	int ret;
@@ -943,6 +1189,14 @@ static void a5xx_show(struct msm_gpu *gpu, struct seq_file *m)
 }
 #endif
 
+static struct msm_ringbuffer *a5xx_active_ring(struct msm_gpu *gpu)
+{
+	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
+	struct a5xx_gpu *a5xx_gpu = to_a5xx_gpu(adreno_gpu);
+
+	return a5xx_gpu->cur_ring;
+}
+
 static const struct adreno_gpu_funcs funcs = {
 	.base = {
 		.get_param = adreno_get_param,
@@ -951,14 +1205,16 @@ static const struct adreno_gpu_funcs funcs = {
 		.pm_resume = a5xx_pm_resume,
 		.recover = a5xx_recover,
 		.last_fence = adreno_last_fence,
+		.submitted_fence = adreno_submitted_fence,
 		.submit = a5xx_submit,
-		.flush = adreno_flush,
-		.idle = a5xx_idle,
+		.flush = a5xx_flush,
+		.active_ring = a5xx_active_ring,
 		.irq = a5xx_irq,
 		.destroy = a5xx_destroy,
 #ifdef CONFIG_DEBUG_FS
 		.show = a5xx_show,
 #endif
+		.snapshot = a5xx_snapshot,
 	},
 	.get_timestamp = a5xx_get_timestamp,
 };
@@ -1073,11 +1329,14 @@ struct msm_gpu *a5xx_gpu_init(struct drm_device *dev)
 	/* Check the efuses for some configuration */
 	a5xx_efuses_read(pdev, adreno_gpu);
 
-	ret = adreno_gpu_init(dev, pdev, adreno_gpu, &funcs);
+	ret = adreno_gpu_init(dev, pdev, adreno_gpu, &funcs, 4);
 	if (ret) {
 		a5xx_destroy(&(a5xx_gpu->base.base));
 		return ERR_PTR(ret);
 	}
 
+	/* Set up the preemption specific bits and pieces for each ringbuffer */
+	a5xx_preempt_init(gpu);
+
 	return gpu;
 }
diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.h b/drivers/gpu/drm/msm/adreno/a5xx_gpu.h
index e82f54063877..3de14fe42a1b 100644
--- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.h
+++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 The Linux Foundation. All rights reserved.
+/* Copyright (c) 2016-2017 The Linux Foundation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 and
@@ -42,10 +42,115 @@ struct a5xx_gpu {
 	uint32_t gpmu_dwords;
 
 	uint32_t lm_leakage;
+
+	struct msm_ringbuffer *cur_ring;
+	struct msm_ringbuffer *next_ring;
+
+	struct drm_gem_object *preempt_bo[MSM_GPU_MAX_RINGS];
+	struct a5xx_preempt_record *preempt[MSM_GPU_MAX_RINGS];
+	uint64_t preempt_iova[MSM_GPU_MAX_RINGS];
+
+	atomic_t preempt_state;
+	struct timer_list preempt_timer;
+
+	struct a5xx_smmu_info *smmu_info;
+	struct drm_gem_object *smmu_info_bo;
+	uint64_t smmu_info_iova;
 };
 
 #define to_a5xx_gpu(x) container_of(x, struct a5xx_gpu, base)
 
+/*
+ * In order to do lockless preemption we use a simple state machine to progress
+ * through the process.
+ *
+ * PREEMPT_NONE - no preemption in progress. Next state: START.
+ * PREEMPT_START - The trigger is evaluating if preemption is possible. Next
+ * states: TRIGGERED, NONE.
+ * PREEMPT_TRIGGERED - A preemption has been executed on the hardware. Next
+ * states: FAULTED, PENDING.
+ * PREEMPT_FAULTED - A preemption timed out (never completed). This will trigger
+ * recovery. Next state: N/A.
+ * PREEMPT_PENDING - Preemption complete interrupt fired - the callback is
+ * checking the success of the operation. Next states: FAULTED, NONE.
+ */
+
+enum preempt_state {
+	PREEMPT_NONE = 0,
+	PREEMPT_START,
+	PREEMPT_TRIGGERED,
+	PREEMPT_FAULTED,
+	PREEMPT_PENDING,
+};
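A runnable sketch of the lockless transition helper this state machine implies, using C11 atomics in place of the kernel's atomic_t and atomic_cmpxchg(); the driver's own implementation appears in a5xx_preempt.c further down.

/* Standalone demo of the compare-and-swap state transition. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum preempt_state {
	PREEMPT_NONE = 0,
	PREEMPT_START,
	PREEMPT_TRIGGERED,
	PREEMPT_FAULTED,
	PREEMPT_PENDING,
};

static _Atomic int preempt_state = PREEMPT_NONE;

/* Succeeds for exactly one caller racing on the same transition */
static bool try_preempt_state(enum preempt_state old, enum preempt_state new)
{
	int expected = old;

	return atomic_compare_exchange_strong(&preempt_state, &expected, new);
}

int main(void)
{
	/* Only the first NONE->START transition wins, as in a5xx_preempt_trigger() */
	printf("%d\n", try_preempt_state(PREEMPT_NONE, PREEMPT_START)); /* 1 */
	printf("%d\n", try_preempt_state(PREEMPT_NONE, PREEMPT_START)); /* 0 */
	return 0;
}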
+
+/*
+ * struct a5xx_preempt_record is a shared buffer between the microcode and the
+ * CPU to store the state for preemption. The record itself is much larger
+ * (64k) but most of that is used by the CP for storage.
+ *
+ * There is a preemption record assigned per ringbuffer. When the CPU triggers a
+ * preemption, it fills out the record with the useful information (wptr, ring
+ * base, etc) and the microcode uses that information to set up the CP following
+ * the preemption. When a ring is switched out, the CP will save the ringbuffer
+ * state back to the record. In this way, once the records are properly set up
+ * the CPU can quickly switch back and forth between ringbuffers by only
+ * updating a few registers (often only the wptr).
+ *
+ * These are the CPU aware registers in the record:
+ * @magic: Must always be 0x27C4BAFC
+ * @info: Type of the record - written 0 by the CPU, updated by the CP
+ * @data: Data field from SET_RENDER_MODE or a checkpoint. Written and used by
+ * the CP
+ * @cntl: Value of RB_CNTL written by CPU, save/restored by CP
+ * @rptr: Value of RB_RPTR written by CPU, save/restored by CP
+ * @wptr: Value of RB_WPTR written by CPU, save/restored by CP
+ * @rptr_addr: Value of RB_RPTR_ADDR written by CPU, save/restored by CP
+ * @rbase: Value of RB_BASE written by CPU, save/restored by CP
+ * @counter: GPU address of the storage area for the performance counters
+ */
+struct a5xx_preempt_record {
+	uint32_t magic;
+	uint32_t info;
+	uint32_t data;
+	uint32_t cntl;
+	uint32_t rptr;
+	uint32_t wptr;
+	uint64_t rptr_addr;
+	uint64_t rbase;
+	uint64_t counter;
+};
+
+/* Magic identifier for the preemption record */
+#define A5XX_PREEMPT_RECORD_MAGIC 0x27C4BAFCUL
+
+/*
+ * Even though the structure above is only a few bytes, we need a full 64k to
+ * store the entire preemption record from the CP
+ */
+#define A5XX_PREEMPT_RECORD_SIZE (64 * 1024)
+
+/*
+ * The preemption counter block is a storage area for the value of the
+ * preemption counters that are saved immediately before context switch. We
+ * append it on to the end of the allocation for the preemption record.
+ */
+#define A5XX_PREEMPT_COUNTER_SIZE (16 * 4)
+
+/*
+ * This is a global structure that the preemption code uses to switch in the
+ * pagetable for the preempted process - the code switches in whatever pagetable
+ * we stored here after preempting in a new ring.
+ */
+struct a5xx_smmu_info {
+	uint32_t magic;
+	uint32_t _pad4;
+	uint64_t ttbr0;
+	uint32_t asid;
+	uint32_t contextidr;
+};
+
+#define A5XX_SMMU_INFO_MAGIC 0x3618CDA3UL
+
 int a5xx_power_init(struct msm_gpu *gpu);
 void a5xx_gpmu_ucode_init(struct msm_gpu *gpu);
 
@@ -62,5 +167,21 @@ static inline int spin_usecs(struct msm_gpu *gpu, uint32_t usecs,
 	return -ETIMEDOUT;
 }
 
+void a5xx_set_hwcg(struct msm_gpu *gpu, bool state);
+bool a5xx_idle(struct msm_gpu *gpu, struct msm_ringbuffer *ring);
+
+void a5xx_preempt_init(struct msm_gpu *gpu);
+void a5xx_preempt_hw_init(struct msm_gpu *gpu);
+void a5xx_preempt_trigger(struct msm_gpu *gpu);
+void a5xx_preempt_irq(struct msm_gpu *gpu);
+void a5xx_preempt_fini(struct msm_gpu *gpu);
+
+int a5xx_snapshot(struct msm_gpu *gpu, struct msm_snapshot *snapshot);
+
+/* Return true if we are in a preempt state */
+static inline bool a5xx_in_preempt(struct a5xx_gpu *a5xx_gpu)
+{
+	return !(atomic_read(&a5xx_gpu->preempt_state) == PREEMPT_NONE);
+}
+
 #endif /* __A5XX_GPU_H__ */
diff --git a/drivers/gpu/drm/msm/adreno/a5xx_power.c b/drivers/gpu/drm/msm/adreno/a5xx_power.c
index 18bca141b425..2040b18f731f 100644
--- a/drivers/gpu/drm/msm/adreno/a5xx_power.c
+++ b/drivers/gpu/drm/msm/adreno/a5xx_power.c
@@ -229,7 +229,7 @@ static int a5xx_gpmu_init(struct msm_gpu *gpu)
 {
 	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
 	struct a5xx_gpu *a5xx_gpu = to_a5xx_gpu(adreno_gpu);
-	struct msm_ringbuffer *ring = gpu->rb;
+	struct msm_ringbuffer *ring = gpu->rb[0];
 
 	if (!a5xx_gpu->gpmu_dwords)
 		return 0;
@@ -248,9 +248,9 @@ static int a5xx_gpmu_init(struct msm_gpu *gpu)
 	OUT_PKT7(ring, CP_SET_PROTECTED_MODE, 1);
 	OUT_RING(ring, 1);
 
-	gpu->funcs->flush(gpu);
+	gpu->funcs->flush(gpu, ring);
 
-	if (!gpu->funcs->idle(gpu)) {
+	if (!a5xx_idle(gpu, ring)) {
 		DRM_ERROR("%s: Unable to load GPMU firmware. GPMU will not be active\n",
 			gpu->name);
 		return -EINVAL;
diff --git a/drivers/gpu/drm/msm/adreno/a5xx_preempt.c b/drivers/gpu/drm/msm/adreno/a5xx_preempt.c
new file mode 100644
index 000000000000..648494c75abc
--- /dev/null
+++ b/drivers/gpu/drm/msm/adreno/a5xx_preempt.c
@@ -0,0 +1,383 @@
+/* Copyright (c) 2016-2017 The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include "msm_gem.h"
+#include "msm_iommu.h"
+#include "a5xx_gpu.h"
+
+static void *alloc_kernel_bo(struct drm_device *drm, struct msm_gpu *gpu,
+		size_t size, uint32_t flags, struct drm_gem_object **bo,
+		u64 *iova)
+{
+	struct drm_gem_object *_bo;
+	u64 _iova;
+	void *ptr;
+	int ret;
+
+	mutex_lock(&drm->struct_mutex);
+	_bo = msm_gem_new(drm, size, flags);
+	mutex_unlock(&drm->struct_mutex);
+
+	if (IS_ERR(_bo))
+		return _bo;
+
+	ret = msm_gem_get_iova(_bo, gpu->aspace, &_iova);
+	if (ret)
+		goto out;
+
+	ptr = msm_gem_vaddr(_bo);
+	if (!ptr) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	if (bo)
+		*bo = _bo;
+	if (iova)
+		*iova = _iova;
+
+	return ptr;
+out:
+	drm_gem_object_unreference_unlocked(_bo);
+	return ERR_PTR(ret);
+}
+
+/*
+ * Try to transition the preemption state from old to new. Return
+ * true on success or false if the original state wasn't 'old'
+ */
+static inline bool try_preempt_state(struct a5xx_gpu *a5xx_gpu,
+		enum preempt_state old, enum preempt_state new)
+{
+	enum preempt_state cur = atomic_cmpxchg(&a5xx_gpu->preempt_state,
+		old, new);
+
+	return (cur == old);
+}
+
+/*
+ * Force the preemption state to the specified state.  This is used in cases
+ * where the current state is known and won't change
+ */
+static inline void set_preempt_state(struct a5xx_gpu *gpu,
+		enum preempt_state new)
+{
+	/*
+	 * preempt_state may be read by other cores trying to trigger a
+	 * preemption or in the interrupt handler so barriers are needed
+	 * before...
+	 */
+	smp_mb__before_atomic();
+	atomic_set(&gpu->preempt_state, new);
+	/* ... and after */
+	smp_mb__after_atomic();
+}
+
+/* Write the most recent wptr for the given ring into the hardware */
+static inline void update_wptr(struct msm_gpu *gpu, struct msm_ringbuffer *ring)
+{
+	unsigned long flags;
+	uint32_t wptr;
+
+	if (!ring)
+		return;
+
+	spin_lock_irqsave(&ring->lock, flags);
+	wptr = get_wptr(ring);
+	spin_unlock_irqrestore(&ring->lock, flags);
+
+	gpu_write(gpu, REG_A5XX_CP_RB_WPTR, wptr);
+}
+
+/* Return the highest priority ringbuffer with something in it */
+static struct msm_ringbuffer *get_next_ring(struct msm_gpu *gpu)
+{
+	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
+	unsigned long flags;
+	int i;
+
+	for (i = gpu->nr_rings - 1; i >= 0; i--) {
+		bool empty;
+		struct msm_ringbuffer *ring = gpu->rb[i];
+
+		spin_lock_irqsave(&ring->lock, flags);
+		empty = (get_wptr(ring) == adreno_gpu->memptrs->rptr[ring->id]);
+		spin_unlock_irqrestore(&ring->lock, flags);
+
+		if (!empty)
+			return ring;
+	}
+
+	return NULL;
+}
+
+static void a5xx_preempt_timer(unsigned long data)
+{
+	struct a5xx_gpu *a5xx_gpu = (struct a5xx_gpu *) data;
+	struct msm_gpu *gpu = &a5xx_gpu->base.base;
+	struct drm_device *dev = gpu->dev;
+	struct msm_drm_private *priv = dev->dev_private;
+
+	if (!try_preempt_state(a5xx_gpu, PREEMPT_TRIGGERED, PREEMPT_FAULTED))
+		return;
+
+	dev_err(dev->dev, "%s: preemption timed out\n", gpu->name);
+	queue_work(priv->wq, &gpu->recover_work);
+}
+
+/* Try to trigger a preemption switch */
+void a5xx_preempt_trigger(struct msm_gpu *gpu)
+{
+	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
+	struct a5xx_gpu *a5xx_gpu = to_a5xx_gpu(adreno_gpu);
+	unsigned long flags;
+	struct msm_ringbuffer *ring;
+
+	if (gpu->nr_rings == 1)
+		return;
+
+	/*
+	 * Try to start preemption by moving from NONE to START. If
+	 * unsuccessful, a preemption is already in flight
+	 */
+	if (!try_preempt_state(a5xx_gpu, PREEMPT_NONE, PREEMPT_START))
+		return;
+
+	/* Get the next ring to preempt to */
+	ring = get_next_ring(gpu);
+
+	/*
+	 * If no ring is populated or the highest priority ring is the current
+	 * one do nothing except to update the wptr to the latest and greatest
+	 */
+	if (!ring || (a5xx_gpu->cur_ring == ring)) {
+		update_wptr(gpu, ring);
+
+		/* Set the state back to NONE */
+		set_preempt_state(a5xx_gpu, PREEMPT_NONE);
+		return;
+	}
+
+	/* Make sure the wptr doesn't update while we're in motion */
+	spin_lock_irqsave(&ring->lock, flags);
+	a5xx_gpu->preempt[ring->id]->wptr = get_wptr(ring);
+	spin_unlock_irqrestore(&ring->lock, flags);
+
+	/* Do read barrier to make sure we have updated pagetable info */
+	rmb();
+
+	/* Set the SMMU info for the preemption */
+	if (a5xx_gpu->smmu_info) {
+		a5xx_gpu->smmu_info->ttbr0 =
+			adreno_gpu->memptrs->ttbr0[ring->id];
+		a5xx_gpu->smmu_info->contextidr =
+			adreno_gpu->memptrs->contextidr[ring->id];
+	}
+
+	/* Set the address of the incoming preemption record */
+	gpu_write64(gpu, REG_A5XX_CP_CONTEXT_SWITCH_RESTORE_ADDR_LO,
+		REG_A5XX_CP_CONTEXT_SWITCH_RESTORE_ADDR_HI,
+		a5xx_gpu->preempt_iova[ring->id]);
+
+	a5xx_gpu->next_ring = ring;
+
+	/* Start a timer to catch a stuck preemption */
+	mod_timer(&a5xx_gpu->preempt_timer, jiffies + msecs_to_jiffies(10000));
+
+	/* Set the preemption state to triggered */
+	set_preempt_state(a5xx_gpu, PREEMPT_TRIGGERED);
+
+	/* Make sure everything is written before hitting the button */
+	wmb();
+
+	/* And actually start the preemption */
+	gpu_write(gpu, REG_A5XX_CP_CONTEXT_SWITCH_CNTL, 1);
+}
+
+void a5xx_preempt_irq(struct msm_gpu *gpu)
+{
+	uint32_t status;
+	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
+	struct a5xx_gpu *a5xx_gpu = to_a5xx_gpu(adreno_gpu);
+	struct drm_device *dev = gpu->dev;
+	struct msm_drm_private *priv = dev->dev_private;
+
+	if (!try_preempt_state(a5xx_gpu, PREEMPT_TRIGGERED, PREEMPT_PENDING))
+		return;
+
+	/* Delete the preemption watchdog timer */
+	del_timer(&a5xx_gpu->preempt_timer);
+
+	/*
+	 * The hardware should be setting CP_CONTEXT_SWITCH_CNTL to zero before
+	 * firing the interrupt, but there is a non-zero chance of a hardware
+	 * condition or a software race that could set it again before we have a
+	 * chance to finish. If that happens, log and go for recovery
+	 */
+	status = gpu_read(gpu, REG_A5XX_CP_CONTEXT_SWITCH_CNTL);
+	if (unlikely(status)) {
+		set_preempt_state(a5xx_gpu, PREEMPT_FAULTED);
+		dev_err(dev->dev, "%s: Preemption failed to complete\n",
+			gpu->name);
+		queue_work(priv->wq, &gpu->recover_work);
+		return;
+	}
+
+	a5xx_gpu->cur_ring = a5xx_gpu->next_ring;
+	a5xx_gpu->next_ring = NULL;
+
+	update_wptr(gpu, a5xx_gpu->cur_ring);
+
+	set_preempt_state(a5xx_gpu, PREEMPT_NONE);
+}
+
+void a5xx_preempt_hw_init(struct msm_gpu *gpu)
+{
+	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
+	struct a5xx_gpu *a5xx_gpu = to_a5xx_gpu(adreno_gpu);
+	struct msm_ringbuffer *ring;
+	int i;
+
+	if (gpu->nr_rings > 1) {
+		/* Clear the preemption records */
+		FOR_EACH_RING(gpu, ring, i) {
+			if (ring) {
+				a5xx_gpu->preempt[ring->id]->wptr = 0;
+				a5xx_gpu->preempt[ring->id]->rptr = 0;
+				a5xx_gpu->preempt[ring->id]->rbase = ring->iova;
+			}
+		}
+	}
+
+	/* Tell the CP where to find the smmu_info buffer */
+	gpu_write64(gpu, REG_A5XX_CP_CONTEXT_SWITCH_SMMU_INFO_LO,
+		REG_A5XX_CP_CONTEXT_SWITCH_SMMU_INFO_HI,
+		a5xx_gpu->smmu_info_iova);
+
+	/* Reset the preemption state */
+	set_preempt_state(a5xx_gpu, PREEMPT_NONE);
+
+	/* Always come up on rb 0 */
+	a5xx_gpu->cur_ring = gpu->rb[0];
+}
+
+static int preempt_init_ring(struct a5xx_gpu *a5xx_gpu,
+		struct msm_ringbuffer *ring)
+{
+	struct adreno_gpu *adreno_gpu = &a5xx_gpu->base;
+	struct msm_gpu *gpu = &adreno_gpu->base;
+	struct a5xx_preempt_record *ptr;
+	struct drm_gem_object *bo;
+	u64 iova;
+
+	ptr = alloc_kernel_bo(gpu->dev, gpu,
+		A5XX_PREEMPT_RECORD_SIZE + A5XX_PREEMPT_COUNTER_SIZE,
+		MSM_BO_UNCACHED | MSM_BO_PRIVILEGED,
+		&bo, &iova);
+
+	if (IS_ERR(ptr))
+		return PTR_ERR(ptr);
+
+	a5xx_gpu->preempt_bo[ring->id] = bo;
+	a5xx_gpu->preempt_iova[ring->id] = iova;
+	a5xx_gpu->preempt[ring->id] = ptr;
+
+	/* Set up the defaults on the preemption record */
+
+	ptr->magic = A5XX_PREEMPT_RECORD_MAGIC;
+	ptr->info = 0;
+	ptr->data = 0;
+	ptr->cntl = MSM_GPU_RB_CNTL_DEFAULT;
+	ptr->rptr_addr = rbmemptr(adreno_gpu, ring->id, rptr);
+	ptr->counter = iova + A5XX_PREEMPT_RECORD_SIZE;
+
+	return 0;
+}
+
+void a5xx_preempt_fini(struct msm_gpu *gpu)
+{
+	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
+	struct a5xx_gpu *a5xx_gpu = to_a5xx_gpu(adreno_gpu);
+	struct msm_ringbuffer *ring;
+	int i;
+
+	FOR_EACH_RING(gpu, ring, i) {
+		if (!ring || !a5xx_gpu->preempt_bo[i])
+			continue;
+
+		if (a5xx_gpu->preempt_iova[i])
+			msm_gem_put_iova(a5xx_gpu->preempt_bo[i], gpu->aspace);
+
+		drm_gem_object_unreference_unlocked(a5xx_gpu->preempt_bo[i]);
+
+		a5xx_gpu->preempt_bo[i] = NULL;
+	}
+
+	if (a5xx_gpu->smmu_info_bo) {
+		if (a5xx_gpu->smmu_info_iova)
+			msm_gem_put_iova(a5xx_gpu->smmu_info_bo, gpu->aspace);
+		drm_gem_object_unreference_unlocked(a5xx_gpu->smmu_info_bo);
+		a5xx_gpu->smmu_info_bo = NULL;
+	}
+}
+
+void a5xx_preempt_init(struct msm_gpu *gpu)
+{
+	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
+	struct a5xx_gpu *a5xx_gpu = to_a5xx_gpu(adreno_gpu);
+	struct msm_ringbuffer *ring;
+	struct a5xx_smmu_info *ptr;
+	struct drm_gem_object *bo;
+	uint64_t iova;
+	int i;
+
+	/* No preemption if we only have one ring */
+	if (gpu->nr_rings <= 1)
+		return;
+
+	FOR_EACH_RING(gpu, ring, i) {
+		if (!ring)
+			continue;
+
+		if (preempt_init_ring(a5xx_gpu, ring))
+			goto fail;
+	}
+
+	if (msm_iommu_allow_dynamic(gpu->aspace->mmu)) {
+		ptr = alloc_kernel_bo(gpu->dev, gpu,
+			sizeof(struct a5xx_smmu_info),
+			MSM_BO_UNCACHED | MSM_BO_PRIVILEGED,
+			&bo, &iova);
+
+		if (IS_ERR(ptr))
+			goto fail;
+
+		ptr->magic = A5XX_SMMU_INFO_MAGIC;
+
+		a5xx_gpu->smmu_info_bo = bo;
+		a5xx_gpu->smmu_info_iova = iova;
+		a5xx_gpu->smmu_info = ptr;
+	}
+
+	setup_timer(&a5xx_gpu->preempt_timer, a5xx_preempt_timer,
+		(unsigned long) a5xx_gpu);
+
+	return;
+fail:
+	/*
+	 * On any failure our adventure is over. Clean up and
+	 * set nr_rings to 1 to force preemption off
+	 */
+	a5xx_preempt_fini(gpu);
+	gpu->nr_rings = 1;
+}
diff --git a/drivers/gpu/drm/msm/adreno/a5xx_snapshot.c b/drivers/gpu/drm/msm/adreno/a5xx_snapshot.c
new file mode 100644
index 000000000000..5a2edb0ea518
--- /dev/null
+++ b/drivers/gpu/drm/msm/adreno/a5xx_snapshot.c
@@ -0,0 +1,796 @@
+/* Copyright (c) 2016-2017 The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include "msm_gpu.h"
+#include "msm_gem.h"
+#include "a5xx_gpu.h"
+#include "msm_snapshot_api.h"
+
+#define A5XX_NR_SHADER_BANKS 4
+
+/*
+ * These are a list of the registers that need to be read through the HLSQ
+ * aperture through the crashdumper.  These are not nominally accessible from
+ * the CPU on a secure platform.
+ */
+static const struct {
+	u32 type;
+	u32 regoffset;
+	u32 count;
+} a5xx_hlsq_aperture_regs[] = {
+	{ 0x35, 0xE00, 0x32 },   /* HLSQ non-context */
+	{ 0x31, 0x2080, 0x1 },   /* HLSQ 2D context 0 */
+	{ 0x33, 0x2480, 0x1 },   /* HLSQ 2D context 1 */
+	{ 0x32, 0xE780, 0x62 },  /* HLSQ 3D context 0 */
+	{ 0x34, 0xEF80, 0x62 },  /* HLSQ 3D context 1 */
+	{ 0x3f, 0x0EC0, 0x40 },  /* SP non-context */
+	{ 0x3d, 0x2040, 0x1 },   /* SP 2D context 0 */
+	{ 0x3b, 0x2440, 0x1 },   /* SP 2D context 1 */
+	{ 0x3e, 0xE580, 0x180 }, /* SP 3D context 0 */
+	{ 0x3c, 0xED80, 0x180 }, /* SP 3D context 1 */
+	{ 0x3a, 0x0F00, 0x1c },  /* TP non-context */
+	{ 0x38, 0x2000, 0xa },   /* TP 2D context 0 */
+	{ 0x36, 0x2400, 0xa },   /* TP 2D context 1 */
+	{ 0x39, 0xE700, 0x80 },  /* TP 3D context 0 */
+	{ 0x37, 0xEF00, 0x80 },  /* TP 3D context 1 */
+};
+
+/*
+ * The debugbus registers contain device state that presumably makes
+ * sense to the hardware designers. 'count' is the number of indexes to read,
+ * each index value is 64 bits
+ */
+static const struct {
+	enum a5xx_debugbus id;
+	u32 count;
+} a5xx_debugbus_blocks[] = {
+	{ A5XX_RBBM_DBGBUS_CP, 0x100, },
+	{ A5XX_RBBM_DBGBUS_RBBM, 0x100, },
+	{ A5XX_RBBM_DBGBUS_HLSQ, 0x100, },
+	{ A5XX_RBBM_DBGBUS_UCHE, 0x100, },
+	{ A5XX_RBBM_DBGBUS_DPM, 0x100, },
+	{ A5XX_RBBM_DBGBUS_TESS, 0x100, },
+	{ A5XX_RBBM_DBGBUS_PC, 0x100, },
+	{ A5XX_RBBM_DBGBUS_VFDP, 0x100, },
+	{ A5XX_RBBM_DBGBUS_VPC, 0x100, },
+	{ A5XX_RBBM_DBGBUS_TSE, 0x100, },
+	{ A5XX_RBBM_DBGBUS_RAS, 0x100, },
+	{ A5XX_RBBM_DBGBUS_VSC, 0x100, },
+	{ A5XX_RBBM_DBGBUS_COM, 0x100, },
+	{ A5XX_RBBM_DBGBUS_DCOM, 0x100, },
+	{ A5XX_RBBM_DBGBUS_LRZ, 0x100, },
+	{ A5XX_RBBM_DBGBUS_A2D_DSP, 0x100, },
+	{ A5XX_RBBM_DBGBUS_CCUFCHE, 0x100, },
+	{ A5XX_RBBM_DBGBUS_GPMU, 0x100, },
+	{ A5XX_RBBM_DBGBUS_RBP, 0x100, },
+	{ A5XX_RBBM_DBGBUS_HM, 0x100, },
+	{ A5XX_RBBM_DBGBUS_RBBM_CFG, 0x100, },
+	{ A5XX_RBBM_DBGBUS_VBIF_CX, 0x100, },
+	{ A5XX_RBBM_DBGBUS_GPC, 0x100, },
+	{ A5XX_RBBM_DBGBUS_LARC, 0x100, },
+	{ A5XX_RBBM_DBGBUS_HLSQ_SPTP, 0x100, },
+	{ A5XX_RBBM_DBGBUS_RB_0, 0x100, },
+	{ A5XX_RBBM_DBGBUS_RB_1, 0x100, },
+	{ A5XX_RBBM_DBGBUS_RB_2, 0x100, },
+	{ A5XX_RBBM_DBGBUS_RB_3, 0x100, },
+	{ A5XX_RBBM_DBGBUS_CCU_0, 0x100, },
+	{ A5XX_RBBM_DBGBUS_CCU_1, 0x100, },
+	{ A5XX_RBBM_DBGBUS_CCU_2, 0x100, },
+	{ A5XX_RBBM_DBGBUS_CCU_3, 0x100, },
+	{ A5XX_RBBM_DBGBUS_A2D_RAS_0, 0x100, },
+	{ A5XX_RBBM_DBGBUS_A2D_RAS_1, 0x100, },
+	{ A5XX_RBBM_DBGBUS_A2D_RAS_2, 0x100, },
+	{ A5XX_RBBM_DBGBUS_A2D_RAS_3, 0x100, },
+	{ A5XX_RBBM_DBGBUS_VFD_0, 0x100, },
+	{ A5XX_RBBM_DBGBUS_VFD_1, 0x100, },
+	{ A5XX_RBBM_DBGBUS_VFD_2, 0x100, },
+	{ A5XX_RBBM_DBGBUS_VFD_3, 0x100, },
+	{ A5XX_RBBM_DBGBUS_SP_0, 0x100, },
+	{ A5XX_RBBM_DBGBUS_SP_1, 0x100, },
+	{ A5XX_RBBM_DBGBUS_SP_2, 0x100, },
+	{ A5XX_RBBM_DBGBUS_SP_3, 0x100, },
+	{ A5XX_RBBM_DBGBUS_TPL1_0, 0x100, },
+	{ A5XX_RBBM_DBGBUS_TPL1_1, 0x100, },
+	{ A5XX_RBBM_DBGBUS_TPL1_2, 0x100, },
+	{ A5XX_RBBM_DBGBUS_TPL1_3, 0x100, },
+};
+
+/*
+ * The shader blocks are read from the HLSQ aperture - each one has its own
+ * identifier for the aperture read
+ */
+static const struct {
+	enum a5xx_shader_blocks id;
+	u32 size;
+} a5xx_shader_blocks[] = {
+	{A5XX_TP_W_MEMOBJ, 0x200},
+	{A5XX_TP_W_MIPMAP_BASE, 0x3C0},
+	{A5XX_TP_W_SAMPLER_TAG, 0x40},
+	{A5XX_TP_S_3D_SAMPLER, 0x80},
+	{A5XX_TP_S_3D_SAMPLER_TAG, 0x20},
+	{A5XX_TP_S_CS_SAMPLER, 0x40},
+	{A5XX_TP_S_CS_SAMPLER_TAG, 0x10},
+	{A5XX_SP_W_CONST, 0x800},
+	{A5XX_SP_W_CB_SIZE, 0x30},
+	{A5XX_SP_W_CB_BASE, 0xF0},
+	{A5XX_SP_W_STATE, 0x1},
+	{A5XX_SP_S_3D_CONST, 0x800},
+	{A5XX_SP_S_3D_CB_SIZE, 0x28},
+	{A5XX_SP_S_3D_UAV_SIZE, 0x80},
+	{A5XX_SP_S_CS_CONST, 0x400},
+	{A5XX_SP_S_CS_CB_SIZE, 0x8},
+	{A5XX_SP_S_CS_UAV_SIZE, 0x80},
+	{A5XX_SP_S_3D_CONST_DIRTY, 0x12},
+	{A5XX_SP_S_3D_CB_SIZE_DIRTY, 0x1},
+	{A5XX_SP_S_3D_UAV_SIZE_DIRTY, 0x2},
+	{A5XX_SP_S_CS_CONST_DIRTY, 0xA},
+	{A5XX_SP_S_CS_CB_SIZE_DIRTY, 0x1},
+	{A5XX_SP_S_CS_UAV_SIZE_DIRTY, 0x2},
+	{A5XX_HLSQ_ICB_DIRTY, 0xB},
+	{A5XX_SP_POWER_RESTORE_RAM_TAG, 0xA},
+	{A5XX_TP_POWER_RESTORE_RAM_TAG, 0xA},
+	{A5XX_TP_W_SAMPLER, 0x80},
+	{A5XX_TP_W_MEMOBJ_TAG, 0x40},
+	{A5XX_TP_S_3D_MEMOBJ, 0x200},
+	{A5XX_TP_S_3D_MEMOBJ_TAG, 0x20},
+	{A5XX_TP_S_CS_MEMOBJ, 0x100},
+	{A5XX_TP_S_CS_MEMOBJ_TAG, 0x10},
+	{A5XX_SP_W_INSTR, 0x800},
+	{A5XX_SP_W_UAV_SIZE, 0x80},
+	{A5XX_SP_W_UAV_BASE, 0x80},
+	{A5XX_SP_W_INST_TAG, 0x40},
+	{A5XX_SP_S_3D_INSTR, 0x800},
+	{A5XX_SP_S_3D_CB_BASE, 0xC8},
+	{A5XX_SP_S_3D_UAV_BASE, 0x80},
+	{A5XX_SP_S_CS_INSTR, 0x400},
+	{A5XX_SP_S_CS_CB_BASE, 0x28},
+	{A5XX_SP_S_CS_UAV_BASE, 0x80},
+	{A5XX_SP_S_3D_INSTR_DIRTY, 0x1},
+	{A5XX_SP_S_3D_CB_BASE_DIRTY, 0x5},
+	{A5XX_SP_S_3D_UAV_BASE_DIRTY, 0x2},
+	{A5XX_SP_S_CS_INSTR_DIRTY, 0x1},
+	{A5XX_SP_S_CS_CB_BASE_DIRTY, 0x1},
+	{A5XX_SP_S_CS_UAV_BASE_DIRTY, 0x2},
+	{A5XX_HLSQ_ICB, 0x200},
+	{A5XX_HLSQ_ICB_CB_BASE_DIRTY, 0x4},
+	{A5XX_SP_POWER_RESTORE_RAM, 0x140},
+	{A5XX_TP_POWER_RESTORE_RAM, 0x40},
+};
+
+/*
+ * The A5XX architecture has a built-in engine to asynchronously dump
+ * registers from the GPU. It is used to accelerate the copy of hundreds
+ * (thousands) of registers and as a safe way to access registers that might
+ * have secure data in them (if the GPU is in secure, the crashdumper returns
+ * bogus values for those registers). On a fully secured device the CPU will be
+ * blocked from accessing those registers directly and so the crashdump is the
+ * only way that we can access context registers and the shader banks for debug
+ * purposes.
+ *
+ * The downside of the crashdump is that it requires access to GPU accessible
+ * memory (so the VBIF and the bus and the SMMU need to be up and working) and
+ * you need enough memory to write the script for the crashdumper and to store
+ * the data that you are dumping so there is a balancing act between the work to
+ * set up a crash dumper and the value we get out of it.
+ */
+
+/*
+ * The crashdump uses a pseudo-script format to read and write registers. Each
+ * operation is two 64 bit values.
+ *
+ * READ:
+ * [qword 0] [63:00] - The absolute IOVA address target for the register value
+ * [qword 1] [63:44] - the dword address of the register offset to read
+ *           [15:00] - Number of dwords to read at once
+ *
+ * WRITE:
+ * [qword 0] [31:0] 32 bit value to write to the register
+ * [qword 1] [63:44] - the dword address of the register offset to write
+ *           [21:21] - set 1 to write
+ *           [15:00] - Number of dwords to write (usually 1)
+ *
+ * At the bottom of the script, write quadword zeros to trigger the end.
+ */
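A runnable sketch of that opcode encoding, following the bit layout documented above (register dword address in bits [63:44], write flag in bit 21, dword count in [15:0]); the helper names are invented for illustration.

/* Encode crashdump READ/WRITE qwords per the documented layout. */
#include <stdint.h>
#include <stdio.h>

static void script_read(uint64_t op[2], uint64_t target_iova, uint32_t reg,
		uint32_t count)
{
	op[0] = target_iova;                   /* where the CP dumps the data */
	op[1] = ((uint64_t)reg << 44) | count; /* register offset + count */
}

static void script_write(uint64_t op[2], uint32_t reg, uint32_t val)
{
	op[0] = val;                                   /* value to write */
	op[1] = ((uint64_t)reg << 44) | (1 << 21) | 1; /* write flag, 1 dword */
}

int main(void)
{
	uint64_t op[2];

	script_write(op, 0xbc00, 0x35); /* e.g. select an HLSQ statetype */
	printf("%016llx %016llx\n", (unsigned long long)op[0],
		(unsigned long long)op[1]);

	script_read(op, 0x100000, 0xa000, 0x32); /* read 0x32 dwords */
	printf("%016llx %016llx\n", (unsigned long long)op[0],
		(unsigned long long)op[1]);
	return 0;
}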
+struct crashdump {
+	struct drm_gem_object *bo;
+	void *ptr;
+	u64 iova;
+	u32 index;
+};
+
+#define CRASHDUMP_BO_SIZE (SZ_1M)
+#define CRASHDUMP_SCRIPT_SIZE (256 * SZ_1K)
+#define CRASHDUMP_DATA_SIZE (CRASHDUMP_BO_SIZE - CRASHDUMP_SCRIPT_SIZE)
+
+static int crashdump_init(struct msm_gpu *gpu, struct crashdump *crashdump)
+{
+	struct drm_device *drm = gpu->dev;
+	int ret = -ENOMEM;
+
+	crashdump->bo = msm_gem_new(drm, CRASHDUMP_BO_SIZE, MSM_BO_UNCACHED);
+	if (IS_ERR(crashdump->bo)) {
+		ret = PTR_ERR(crashdump->bo);
+		crashdump->bo = NULL;
+		return ret;
+	}
+
+	crashdump->ptr = msm_gem_vaddr_locked(crashdump->bo);
+	if (!crashdump->ptr)
+		goto out;
+
+	ret = msm_gem_get_iova_locked(crashdump->bo, gpu->aspace,
+		&crashdump->iova);
+
+out:
+	if (ret) {
+		drm_gem_object_unreference(crashdump->bo);
+		crashdump->bo = NULL;
+	}
+
+	return ret;
+}
+
+static int crashdump_run(struct msm_gpu *gpu, struct crashdump *crashdump)
+{
+	if (!crashdump->ptr || !crashdump->index)
+		return -EINVAL;
+
+	gpu_write(gpu, REG_A5XX_CP_CRASH_SCRIPT_BASE_LO,
+		lower_32_bits(crashdump->iova));
+	gpu_write(gpu, REG_A5XX_CP_CRASH_SCRIPT_BASE_HI,
+		upper_32_bits(crashdump->iova));
+
+	gpu_write(gpu, REG_A5XX_CP_CRASH_DUMP_CNTL, 1);
+
+	return spin_until(gpu_read(gpu, REG_A5XX_CP_CRASH_DUMP_CNTL) & 0x04);
+}
+
+static void crashdump_destroy(struct msm_gpu *gpu, struct crashdump *crashdump)
+{
+	if (!crashdump->bo)
+		return;
+
+	if (crashdump->iova)
+		msm_gem_put_iova(crashdump->bo, gpu->aspace);
+
+	drm_gem_object_unreference(crashdump->bo);
+
+	memset(crashdump, 0, sizeof(*crashdump));
+}
+
+static inline void CRASHDUMP_SCRIPT_WRITE(struct crashdump *crashdump,
+		u32 reg, u32 val)
+{
+	u64 *ptr = crashdump->ptr + crashdump->index;
+
+	if (WARN_ON(crashdump->index + (2 * sizeof(u64))
+		>= CRASHDUMP_SCRIPT_SIZE))
+		return;
+
+	/* This is the value to write */
+	ptr[0] = (u64) val;
+
+	/*
+	 * This triggers a write to the specified register.  1 is the size of
+	 * the write in dwords
+	 */
+	ptr[1] = (((u64) reg) << 44) | (1 << 21) | 1;
+
+	crashdump->index += 2 * sizeof(u64);
+}
+
+static inline void CRASHDUMP_SCRIPT_READ(struct crashdump *crashdump,
+		u32 reg, u32 count, u32 offset)
+{
+	u64 *ptr = crashdump->ptr + crashdump->index;
+
+	if (WARN_ON(crashdump->index + (2 * sizeof(u64))
+		>= CRASHDUMP_SCRIPT_SIZE))
+		return;
+
+	if (WARN_ON(offset + (count * sizeof(u32)) >= CRASHDUMP_DATA_SIZE))
+		return;
+
+	ptr[0] = (u64) crashdump->iova + CRASHDUMP_SCRIPT_SIZE + offset;
+	ptr[1] = (((u64) reg) << 44) | count;
+
+	crashdump->index += 2 * sizeof(u64);
+}
+
+static inline void *CRASHDUMP_DATA_PTR(struct crashdump *crashdump, u32 offset)
+{
+	if (WARN_ON(!crashdump->ptr || offset >= CRASHDUMP_DATA_SIZE))
+		return NULL;
+
+	return crashdump->ptr + CRASHDUMP_SCRIPT_SIZE + offset;
+}
+
+static inline u32 CRASHDUMP_DATA_READ(struct crashdump *crashdump, u32 offset)
+{
+	return *((u32 *) CRASHDUMP_DATA_PTR(crashdump, offset));
+}
+
+static inline void CRASHDUMP_RESET(struct crashdump *crashdump)
+{
+	crashdump->index = 0;
+}
+
+static inline void CRASHDUMP_END(struct crashdump *crashdump)
+{
+	u64 *ptr = crashdump->ptr + crashdump->index;
+
+	if (WARN_ON((crashdump->index + (2 * sizeof(u64)))
+		>= CRASHDUMP_SCRIPT_SIZE))
+		return;
+
+	ptr[0] = 0;
+	ptr[1] = 0;
+
+	crashdump->index += 2 * sizeof(u64);
+}
+
+static u32 _crashdump_read_hlsq_aperture(struct crashdump *crashdump,
+		u32 offset, u32 statetype, u32 bank,
+		u32 count)
+{
+	CRASHDUMP_SCRIPT_WRITE(crashdump, REG_A5XX_HLSQ_DBG_READ_SEL,
+		A5XX_HLSQ_DBG_READ_SEL_STATETYPE(statetype) | bank);
+
+	CRASHDUMP_SCRIPT_READ(crashdump, REG_A5XX_HLSQ_DBG_AHB_READ_APERTURE,
+		count, offset);
+
+	return count * sizeof(u32);
+}
+
+static u32 _copy_registers(struct msm_snapshot *snapshot,
+		struct crashdump *crashdump, u32 reg, u32 count,
+		u32 offset)
+{
+	int i;
+	u32 *ptr = (u32 *) (crashdump->ptr + CRASHDUMP_SCRIPT_SIZE + offset);
+	/*
+	 * Write the offset of the first register of the group and the number of
+	 * registers in the group
+	 */
+	SNAPSHOT_WRITE_U32(snapshot, ((count << 16) | reg));
+
+	/* Followed by each register value in the group */
+	for (i = 0; i < count; i++)
+		SNAPSHOT_WRITE_U32(snapshot, ptr[i]);
+
+	return count * sizeof(u32);
+}
+
+/*
+ * Return the number of registers in each register group from the
+ * adreno_gpu->registers
+ */
+static inline u32 REG_COUNT(const unsigned int *ptr)
+{
+	return (ptr[1] - ptr[0]) + 1;
+}
This will omit the SP,TP and HLSQ registers, but + * you'll get everything else and that ain't bad + */ +static void a5xx_snapshot_registers_cpu(struct msm_gpu *gpu, + struct msm_snapshot *snapshot) +{ + struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); + struct msm_snapshot_regs header; + u32 regcount = 0, groups = 0; + int i; + + /* + * Before we write the section we need to figure out how big our data + * section will be + */ + for (i = 0; adreno_gpu->registers[i] != ~0; i += 2) { + regcount += REG_COUNT(&(adreno_gpu->registers[i])); + groups++; + } + + header.count = groups; + + /* + * We need one dword for each group and then one dword for each register + * value in that group + */ + if (!SNAPSHOT_HEADER(snapshot, header, SNAPSHOT_SECTION_REGS_V2, + regcount + groups)) + return; + + for (i = 0; adreno_gpu->registers[i] != ~0; i += 2) { + u32 count = REG_COUNT(&(adreno_gpu->registers[i])); + u32 reg = adreno_gpu->registers[i]; + int j; + + /* Write the offset and count for the group */ + SNAPSHOT_WRITE_U32(snapshot, (count << 16) | reg); + + /* Write each value in the group */ + for (j = 0; j < count; j++) + SNAPSHOT_WRITE_U32(snapshot, gpu_read(gpu, reg++)); + } +} + +static void a5xx_snapshot_registers(struct msm_gpu *gpu, + struct msm_snapshot *snapshot) +{ + struct msm_snapshot_regs header; + struct crashdump *crashdump = snapshot->priv; + u32 offset = 0, regcount = 0, groups = 0; + int i; + + /* + * First snapshot all the registers that we can from the CPU. Do this + * because the crashdumper has a tendency to "taint" the value of some + * of the registers (because the GPU implements the crashdumper) so we + * only want to use the crash dump facility if we have to + */ + a5xx_snapshot_registers_cpu(gpu, snapshot); + + if (!crashdump) + return; + + CRASHDUMP_RESET(crashdump); + + /* HLSQ and context registers behind the aperture */ + for (i = 0; i < ARRAY_SIZE(a5xx_hlsq_aperture_regs); i++) { + u32 count = a5xx_hlsq_aperture_regs[i].count; + + offset += _crashdump_read_hlsq_aperture(crashdump, offset, + a5xx_hlsq_aperture_regs[i].type, 0, count); + regcount += count; + + groups++; + } + + CRASHDUMP_END(crashdump); + + if (crashdump_run(gpu, crashdump)) + return; + + header.count = groups; + + /* + * The size of the data will be one dword for each "group" of registers, + * and then one dword for each of the registers in that group + */ + if (!SNAPSHOT_HEADER(snapshot, header, SNAPSHOT_SECTION_REGS_V2, + groups + regcount)) + return; + + /* Copy the registers to the snapshot */ + for (i = 0; i < ARRAY_SIZE(a5xx_hlsq_aperture_regs); i++) + offset += _copy_registers(snapshot, crashdump, + a5xx_hlsq_aperture_regs[i].regoffset, + a5xx_hlsq_aperture_regs[i].count, offset); +} + +static void _a5xx_snapshot_shader_bank(struct msm_snapshot *snapshot, + struct crashdump *crashdump, u32 block, u32 bank, + u32 size, u32 offset) +{ + void *src; + + struct msm_snapshot_shader header = { + .type = block, + .index = bank, + .size = size, + }; + + if (!SNAPSHOT_HEADER(snapshot, header, SNAPSHOT_SECTION_SHADER, size)) + return; + + src = CRASHDUMP_DATA_PTR(crashdump, offset); + + if (src) + SNAPSHOT_MEMCPY(snapshot, src, size * sizeof(u32)); +} + +static void a5xx_snapshot_shader_memory(struct msm_gpu *gpu, + struct msm_snapshot *snapshot) +{ + struct crashdump *crashdump = snapshot->priv; + u32 offset = 0; + int i; + + /* We can only get shader memory through the crashdump */ + if (!crashdump) + return; + + CRASHDUMP_RESET(crashdump); + + /* For each shader block */ + for (i = 0; i < 
ARRAY_SIZE(a5xx_shader_blocks); i++) { + int j; + + /* For each block, dump 4 banks */ + for (j = 0; j < A5XX_NR_SHADER_BANKS; j++) + offset += _crashdump_read_hlsq_aperture(crashdump, + offset, a5xx_shader_blocks[i].id, j, + a5xx_shader_blocks[i].size); + } + + CRASHDUMP_END(crashdump); + + /* If the crashdump fails we can't get shader memory any other way */ + if (crashdump_run(gpu, crashdump)) + return; + + /* Each bank of each shader gets its own snapshot section */ + for (offset = 0, i = 0; i < ARRAY_SIZE(a5xx_shader_blocks); i++) { + int j; + + for (j = 0; j < A5XX_NR_SHADER_BANKS; j++) { + _a5xx_snapshot_shader_bank(snapshot, crashdump, + a5xx_shader_blocks[i].id, j, + a5xx_shader_blocks[i].size, offset); + offset += a5xx_shader_blocks[i].size * sizeof(u32); + } + } +} + +#define A5XX_NUM_AXI_ARB_BLOCKS 2 +#define A5XX_NUM_XIN_BLOCKS 4 +#define VBIF_DATA_SIZE ((16 * A5XX_NUM_AXI_ARB_BLOCKS) + \ + (18 * A5XX_NUM_XIN_BLOCKS) + (12 * A5XX_NUM_XIN_BLOCKS)) + +static void a5xx_snapshot_debugbus_vbif(struct msm_gpu *gpu, + struct msm_snapshot *snapshot) +{ + int i; + struct msm_snapshot_debugbus header = { + .id = A5XX_RBBM_DBGBUS_VBIF, + .count = VBIF_DATA_SIZE, + }; + + if (!SNAPSHOT_HEADER(snapshot, header, SNAPSHOT_SECTION_DEBUGBUS, + VBIF_DATA_SIZE)) + return; + + gpu_rmw(gpu, REG_A5XX_VBIF_CLKON, A5XX_VBIF_CLKON_FORCE_ON_TESTBUS, + A5XX_VBIF_CLKON_FORCE_ON_TESTBUS); + + gpu_write(gpu, REG_A5XX_VBIF_TEST_BUS1_CTRL0, 0); + gpu_write(gpu, REG_A5XX_VBIF_TEST_BUS_OUT_CTRL, + A5XX_VBIF_TEST_BUS_OUT_CTRL_TEST_BUS_CTRL_EN); + + for (i = 0; i < A5XX_NUM_AXI_ARB_BLOCKS; i++) { + int j; + + gpu_write(gpu, REG_A5XX_VBIF_TEST_BUS2_CTRL0, 1 << (i + 16)); + for (j = 0; j < 16; j++) { + gpu_write(gpu, REG_A5XX_VBIF_TEST_BUS2_CTRL1, + A5XX_VBIF_TEST_BUS2_CTRL1_TEST_BUS2_DATA_SEL(j)); + SNAPSHOT_WRITE_U32(snapshot, gpu_read(gpu, + REG_A5XX_VBIF_TEST_BUS_OUT)); + } + } + + for (i = 0; i < A5XX_NUM_XIN_BLOCKS; i++) { + int j; + + gpu_write(gpu, REG_A5XX_VBIF_TEST_BUS2_CTRL0, 1 << i); + for (j = 0; j < 18; j++) { + gpu_write(gpu, REG_A5XX_VBIF_TEST_BUS2_CTRL1, + A5XX_VBIF_TEST_BUS2_CTRL1_TEST_BUS2_DATA_SEL(j)); + SNAPSHOT_WRITE_U32(snapshot, + gpu_read(gpu, REG_A5XX_VBIF_TEST_BUS_OUT)); + } + } + + for (i = 0; i < A5XX_NUM_XIN_BLOCKS; i++) { + int j; + + gpu_write(gpu, REG_A5XX_VBIF_TEST_BUS1_CTRL0, 1 << i); + for (j = 0; j < 12; j++) { + gpu_write(gpu, REG_A5XX_VBIF_TEST_BUS1_CTRL1, + A5XX_VBIF_TEST_BUS1_CTRL1_TEST_BUS1_DATA_SEL(j)); + SNAPSHOT_WRITE_U32(snapshot, gpu_read(gpu, + REG_A5XX_VBIF_TEST_BUS_OUT)); + } + } + +} + +static void a5xx_snapshot_debugbus_block(struct msm_gpu *gpu, + struct msm_snapshot *snapshot, u32 block, u32 count) +{ + int i; + struct msm_snapshot_debugbus header = { + .id = block, + .count = count * 2, /* Each value is 2 dwords */ + }; + + if (!SNAPSHOT_HEADER(snapshot, header, SNAPSHOT_SECTION_DEBUGBUS, + (count * 2))) + return; + + for (i = 0; i < count; i++) { + u32 reg = A5XX_RBBM_CFG_DBGBUS_SEL_A_PING_INDEX(i) | + A5XX_RBBM_CFG_DBGBUS_SEL_A_PING_BLK_SEL(block); + + gpu_write(gpu, REG_A5XX_RBBM_CFG_DBGBUS_SEL_A, reg); + gpu_write(gpu, REG_A5XX_RBBM_CFG_DBGBUS_SEL_B, reg); + gpu_write(gpu, REG_A5XX_RBBM_CFG_DBGBUS_SEL_C, reg); + gpu_write(gpu, REG_A5XX_RBBM_CFG_DBGBUS_SEL_D, reg); + + /* Each debugbus entry is a quad word */ + SNAPSHOT_WRITE_U32(snapshot, gpu_read(gpu, + REG_A5XX_RBBM_CFG_DBGBUS_TRACE_BUF2)); + SNAPSHOT_WRITE_U32(snapshot, + gpu_read(gpu, REG_A5XX_RBBM_CFG_DBGBUS_TRACE_BUF1)); + } +} + +static void a5xx_snapshot_debugbus(struct msm_gpu *gpu, + struct 
msm_snapshot *snapshot) +{ + int i; + + gpu_write(gpu, REG_A5XX_RBBM_CFG_DBGBUS_CNTLM, + A5XX_RBBM_CFG_DBGBUS_CNTLM_ENABLE(0xF)); + + for (i = 0; i < ARRAY_SIZE(a5xx_debugbus_blocks); i++) + a5xx_snapshot_debugbus_block(gpu, snapshot, + a5xx_debugbus_blocks[i].id, + a5xx_debugbus_blocks[i].count); + + /* VBIF is special and not in a good way */ + a5xx_snapshot_debugbus_vbif(gpu, snapshot); +} + +static void a5xx_snapshot_cp_merciu(struct msm_gpu *gpu, + struct msm_snapshot *snapshot) +{ + unsigned int i; + struct msm_snapshot_debug header = { + .type = SNAPSHOT_DEBUG_CP_MERCIU, + .size = 64 << 1, /* Data size is 2 dwords per entry */ + }; + + if (!SNAPSHOT_HEADER(snapshot, header, SNAPSHOT_SECTION_DEBUG, 64 << 1)) + return; + + gpu_write(gpu, REG_A5XX_CP_MERCIU_DBG_ADDR, 0); + for (i = 0; i < 64; i++) { + SNAPSHOT_WRITE_U32(snapshot, + gpu_read(gpu, REG_A5XX_CP_MERCIU_DBG_DATA_1)); + SNAPSHOT_WRITE_U32(snapshot, + gpu_read(gpu, REG_A5XX_CP_MERCIU_DBG_DATA_2)); + } +} + +static void a5xx_snapshot_cp_roq(struct msm_gpu *gpu, + struct msm_snapshot *snapshot) +{ + int i; + struct msm_snapshot_debug header = { + .type = SNAPSHOT_DEBUG_CP_ROQ, + .size = 512, + }; + + if (!SNAPSHOT_HEADER(snapshot, header, SNAPSHOT_SECTION_DEBUG, 512)) + return; + + gpu_write(gpu, REG_A5XX_CP_ROQ_DBG_ADDR, 0); + for (i = 0; i < 512; i++) + SNAPSHOT_WRITE_U32(snapshot, + gpu_read(gpu, REG_A5XX_CP_ROQ_DBG_DATA)); +} + +static void a5xx_snapshot_cp_meq(struct msm_gpu *gpu, + struct msm_snapshot *snapshot) +{ + int i; + struct msm_snapshot_debug header = { + .type = SNAPSHOT_DEBUG_CP_MEQ, + .size = 64, + }; + + if (!SNAPSHOT_HEADER(snapshot, header, SNAPSHOT_SECTION_DEBUG, 64)) + return; + + gpu_write(gpu, REG_A5XX_CP_MEQ_DBG_ADDR, 0); + for (i = 0; i < 64; i++) + SNAPSHOT_WRITE_U32(snapshot, + gpu_read(gpu, REG_A5XX_CP_MEQ_DBG_DATA)); +} + +static void a5xx_snapshot_indexed_registers(struct msm_gpu *gpu, + struct msm_snapshot *snapshot, u32 addr, u32 data, + u32 count) +{ + unsigned int i; + struct msm_snapshot_indexed_regs header = { + .index_reg = addr, + .data_reg = data, + .start = 0, + .count = count, + }; + + if (!SNAPSHOT_HEADER(snapshot, header, SNAPSHOT_SECTION_INDEXED_REGS, + count)) + return; + + for (i = 0; i < count; i++) { + gpu_write(gpu, addr, i); + SNAPSHOT_WRITE_U32(snapshot, gpu_read(gpu, data)); + } +} + +int a5xx_snapshot(struct msm_gpu *gpu, struct msm_snapshot *snapshot) +{ + struct crashdump crashdump = { 0 }; + + if (!crashdump_init(gpu, &crashdump)) + snapshot->priv = &crashdump; + + /* To accurately read all registers, disable hardware clock gating */ + a5xx_set_hwcg(gpu, false); + + /* Kick it up to the generic level */ + adreno_snapshot(gpu, snapshot); + + /* Read the GPU registers */ + a5xx_snapshot_registers(gpu, snapshot); + + /* Read the shader memory banks */ + a5xx_snapshot_shader_memory(gpu, snapshot); + + /* Read the debugbus registers */ + a5xx_snapshot_debugbus(gpu, snapshot); + + /* PFP data */ + a5xx_snapshot_indexed_registers(gpu, snapshot, + REG_A5XX_CP_PFP_STAT_ADDR, REG_A5XX_CP_PFP_STAT_DATA, 36); + + /* ME data */ + a5xx_snapshot_indexed_registers(gpu, snapshot, + REG_A5XX_CP_ME_STAT_ADDR, REG_A5XX_CP_ME_STAT_DATA, 29); + + /* DRAW_STATE data */ + a5xx_snapshot_indexed_registers(gpu, snapshot, + REG_A5XX_CP_DRAW_STATE_ADDR, REG_A5XX_CP_DRAW_STATE_DATA, + 256); + + /* ME cache */ + a5xx_snapshot_indexed_registers(gpu, snapshot, + REG_A5XX_CP_ME_UCODE_DBG_ADDR, REG_A5XX_CP_ME_UCODE_DBG_DATA, + 0x53F); + + /* PFP cache */ + a5xx_snapshot_indexed_registers(gpu, snapshot, + 
REG_A5XX_CP_PFP_UCODE_DBG_ADDR, REG_A5XX_CP_PFP_UCODE_DBG_DATA,
+		0x53F);
+
+	/* ME queue */
+	a5xx_snapshot_cp_meq(gpu, snapshot);
+
+	/* CP ROQ */
+	a5xx_snapshot_cp_roq(gpu, snapshot);
+
+	/* CP MERCIU */
+	a5xx_snapshot_cp_merciu(gpu, snapshot);
+
+	crashdump_destroy(gpu, &crashdump);
+	snapshot->priv = NULL;
+
+	/* Re-enable HWCG */
+	a5xx_set_hwcg(gpu, true);
+	return 0;
+}
diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
index e67c63c9a3ac..f1883825354e 100644
--- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
@@ -2,7 +2,7 @@
 * Copyright (C) 2013 Red Hat
 * Author: Rob Clark <robdclark@gmail.com>
 *
- * Copyright (c) 2014,2016 The Linux Foundation. All rights reserved.
+ * Copyright (c) 2014,2016-2017 The Linux Foundation. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published by
@@ -17,12 +17,12 @@
 * this program. If not, see <http://www.gnu.org/licenses/>.
 */
+#include <linux/utsname.h>
 #include "adreno_gpu.h"
+#include "msm_snapshot.h"
 #include "msm_gem.h"
 #include "msm_mmu.h"
-#define RB_SIZE    SZ_32K
-#define RB_BLKSIZE 32
 int adreno_get_param(struct msm_gpu *gpu, uint32_t param, uint64_t *value)
 {
@@ -35,6 +35,9 @@ int adreno_get_param(struct msm_gpu *gpu, uint32_t param, uint64_t *value)
 	case MSM_PARAM_GMEM_SIZE:
 		*value = adreno_gpu->gmem;
 		return 0;
+	case MSM_PARAM_GMEM_BASE:
+		*value = 0x100000;
+		return 0;
 	case MSM_PARAM_CHIP_ID:
 		*value = adreno_gpu->rev.patchid |
 			(adreno_gpu->rev.minor << 8) |
@@ -57,70 +60,106 @@ int adreno_hw_init(struct msm_gpu *gpu)
 {
 	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
-	int ret;
+	int i;
 
 	DBG("%s", gpu->name);
 
-	ret = msm_gem_get_iova(gpu->rb->bo, gpu->aspace, &gpu->rb_iova);
-	if (ret) {
-		gpu->rb_iova = 0;
-		dev_err(gpu->dev->dev, "could not map ringbuffer: %d\n", ret);
-		return ret;
+	for (i = 0; i < gpu->nr_rings; i++) {
+		int ret = msm_gem_get_iova(gpu->rb[i]->bo, gpu->aspace,
+			&gpu->rb[i]->iova);
+		if (ret) {
+			gpu->rb[i]->iova = 0;
+			dev_err(gpu->dev->dev,
+				"could not map ringbuffer %d: %d\n", i, ret);
+			return ret;
+		}
 	}
 
-	/* Setup REG_CP_RB_CNTL: */
+	/*
+	 * Setup REG_CP_RB_CNTL. The same value is used across targets (with
+	 * the exception of A430 that disables the RPTR shadow) - the
+	 * calculation for the ringbuffer size and block size is moved to
+	 * msm_gpu.h for the pre-processor to deal with and the A430 variant
+	 * is ORed in here
+	 */
 	adreno_gpu_write(adreno_gpu, REG_ADRENO_CP_RB_CNTL,
-		/* size is log2(quad-words): */
-		AXXX_CP_RB_CNTL_BUFSZ(ilog2(gpu->rb->size / 8)) |
-		AXXX_CP_RB_CNTL_BLKSZ(ilog2(RB_BLKSIZE / 8)) |
-		(adreno_is_a430(adreno_gpu) ? AXXX_CP_RB_CNTL_NO_UPDATE : 0));
+		MSM_GPU_RB_CNTL_DEFAULT |
+		(adreno_is_a430(adreno_gpu) ?
AXXX_CP_RB_CNTL_NO_UPDATE : 0)); - /* Setup ringbuffer address */ + /* Setup ringbuffer address - use ringbuffer[0] for GPU init */ adreno_gpu_write64(adreno_gpu, REG_ADRENO_CP_RB_BASE, - REG_ADRENO_CP_RB_BASE_HI, gpu->rb_iova); + REG_ADRENO_CP_RB_BASE_HI, gpu->rb[0]->iova); adreno_gpu_write64(adreno_gpu, REG_ADRENO_CP_RB_RPTR_ADDR, - REG_ADRENO_CP_RB_RPTR_ADDR_HI, rbmemptr(adreno_gpu, rptr)); + REG_ADRENO_CP_RB_RPTR_ADDR_HI, rbmemptr(adreno_gpu, 0, rptr)); return 0; } -static uint32_t get_wptr(struct msm_ringbuffer *ring) +/* Use this helper to read rptr, since a430 doesn't update rptr in memory */ +static uint32_t get_rptr(struct adreno_gpu *adreno_gpu, + struct msm_ringbuffer *ring) { - return ring->cur - ring->start; + if (adreno_is_a430(adreno_gpu)) { + /* + * If index is anything but 0 this will probably break horribly, + * but I think that we have enough infrastructure in place to + * ensure that it won't be. If not then this is why your + * a430 stopped working. + */ + return adreno_gpu->memptrs->rptr[ring->id] = adreno_gpu_read( + adreno_gpu, REG_ADRENO_CP_RB_RPTR); + } else + return adreno_gpu->memptrs->rptr[ring->id]; } -/* Use this helper to read rptr, since a430 doesn't update rptr in memory */ -static uint32_t get_rptr(struct adreno_gpu *adreno_gpu) +struct msm_ringbuffer *adreno_active_ring(struct msm_gpu *gpu) { - if (adreno_is_a430(adreno_gpu)) - return adreno_gpu->memptrs->rptr = adreno_gpu_read( - adreno_gpu, REG_ADRENO_CP_RB_RPTR); - else - return adreno_gpu->memptrs->rptr; + return gpu->rb[0]; } -uint32_t adreno_last_fence(struct msm_gpu *gpu) +uint32_t adreno_submitted_fence(struct msm_gpu *gpu, + struct msm_ringbuffer *ring) +{ + if (!ring) + return 0; + + return ring->submitted_fence; +} + +uint32_t adreno_last_fence(struct msm_gpu *gpu, struct msm_ringbuffer *ring) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); - return adreno_gpu->memptrs->fence; + + if (!ring) + return 0; + + return adreno_gpu->memptrs->fence[ring->id]; } void adreno_recover(struct msm_gpu *gpu) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); struct drm_device *dev = gpu->dev; - int ret; + struct msm_ringbuffer *ring; + int ret, i; gpu->funcs->pm_suspend(gpu); - /* reset ringbuffer: */ - gpu->rb->cur = gpu->rb->start; + /* reset ringbuffer(s): */ - /* reset completed fence seqno, just discard anything pending: */ - adreno_gpu->memptrs->fence = gpu->submitted_fence; - adreno_gpu->memptrs->rptr = 0; - adreno_gpu->memptrs->wptr = 0; + FOR_EACH_RING(gpu, ring, i) { + if (!ring) + continue; + + /* No need for a lock here, nobody else is peeking in */ + ring->cur = ring->start; + ring->next = ring->start; + + /* reset completed fence seqno, discard anything pending: */ + adreno_gpu->memptrs->fence[ring->id] = + adreno_submitted_fence(gpu, ring); + adreno_gpu->memptrs->rptr[ring->id] = 0; + } gpu->funcs->pm_resume(gpu); @@ -133,12 +172,10 @@ void adreno_recover(struct msm_gpu *gpu) enable_irq(gpu->irq); } -int adreno_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, - struct msm_file_private *ctx) +int adreno_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); - struct msm_drm_private *priv = gpu->dev->dev_private; - struct msm_ringbuffer *ring = gpu->rb; + struct msm_ringbuffer *ring = gpu->rb[submit->ring]; unsigned i, ibs = 0; for (i = 0; i < submit->nr_cmds; i++) { @@ -147,8 +184,6 @@ int adreno_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, /* ignore IB-targets */ break; case 
MSM_SUBMIT_CMD_CTX_RESTORE_BUF: - /* ignore if there has not been a ctx switch: */ - if (priv->lastctx == ctx) break; case MSM_SUBMIT_CMD_BUF: OUT_PKT3(ring, adreno_is_a430(adreno_gpu) ? @@ -184,7 +219,7 @@ int adreno_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, OUT_PKT3(ring, CP_EVENT_WRITE, 3); OUT_RING(ring, CACHE_FLUSH_TS); - OUT_RING(ring, rbmemptr(adreno_gpu, fence)); + OUT_RING(ring, rbmemptr(adreno_gpu, ring->id, fence)); OUT_RING(ring, submit->fence); /* we could maybe be clever and only CP_COND_EXEC the interrupt: */ @@ -211,22 +246,25 @@ int adreno_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, } #endif - gpu->funcs->flush(gpu); + gpu->funcs->flush(gpu, ring); return 0; } -void adreno_flush(struct msm_gpu *gpu) +void adreno_flush(struct msm_gpu *gpu, struct msm_ringbuffer *ring) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); uint32_t wptr; + /* Copy the shadow to the actual register */ + ring->cur = ring->next; + /* * Mask the wptr value that we calculate to fit in the HW range. This is * to account for the possibility that the last command fit exactly into * the ringbuffer and rb->next hasn't wrapped to zero yet */ - wptr = get_wptr(gpu->rb) & ((gpu->rb->size / 4) - 1); + wptr = get_wptr(ring); /* ensure writes to ringbuffer have hit system memory: */ mb(); @@ -234,17 +272,19 @@ void adreno_flush(struct msm_gpu *gpu) adreno_gpu_write(adreno_gpu, REG_ADRENO_CP_RB_WPTR, wptr); } -bool adreno_idle(struct msm_gpu *gpu) +bool adreno_idle(struct msm_gpu *gpu, struct msm_ringbuffer *ring) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); - uint32_t wptr = get_wptr(gpu->rb); + uint32_t wptr = get_wptr(ring); /* wait for CP to drain ringbuffer: */ - if (!spin_until(get_rptr(adreno_gpu) == wptr)) + if (!spin_until(get_rptr(adreno_gpu, ring) == wptr)) return true; /* TODO maybe we need to reset GPU here to recover from hang? 
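 * (in practice the hangcheck timer in msm_gpu.c notices the stalled
 * fence and queues recover_work, so the GPU does get reset eventually)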
*/ - DRM_ERROR("%s: timeout waiting to drain ringbuffer!\n", gpu->name); + DRM_ERROR("%s: timeout waiting to drain ringbuffer %d rptr/wptr = %X/%X\n", + gpu->name, ring->id, get_rptr(adreno_gpu, ring), wptr); + return false; } @@ -252,6 +292,7 @@ bool adreno_idle(struct msm_gpu *gpu) void adreno_show(struct msm_gpu *gpu, struct seq_file *m) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); + struct msm_ringbuffer *ring; int i; seq_printf(m, "revision: %d (%d.%d.%d.%d)\n", @@ -259,11 +300,18 @@ void adreno_show(struct msm_gpu *gpu, struct seq_file *m) adreno_gpu->rev.major, adreno_gpu->rev.minor, adreno_gpu->rev.patchid); - seq_printf(m, "fence: %d/%d\n", adreno_gpu->memptrs->fence, - gpu->submitted_fence); - seq_printf(m, "rptr: %d\n", get_rptr(adreno_gpu)); - seq_printf(m, "wptr: %d\n", adreno_gpu->memptrs->wptr); - seq_printf(m, "rb wptr: %d\n", get_wptr(gpu->rb)); + FOR_EACH_RING(gpu, ring, i) { + if (!ring) + continue; + + seq_printf(m, "rb %d: fence: %d/%d\n", i, + adreno_last_fence(gpu, ring), + adreno_submitted_fence(gpu, ring)); + + seq_printf(m, " rptr: %d\n", + get_rptr(adreno_gpu, ring)); + seq_printf(m, "rb wptr: %d\n", get_wptr(ring)); + } gpu->funcs->pm_resume(gpu); @@ -292,22 +340,29 @@ void adreno_show(struct msm_gpu *gpu, struct seq_file *m) */ void adreno_dump_info(struct msm_gpu *gpu) { + struct drm_device *dev = gpu->dev; struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); + struct msm_ringbuffer *ring; int i; - printk("revision: %d (%d.%d.%d.%d)\n", + dev_err(dev->dev, "revision: %d (%d.%d.%d.%d)\n", adreno_gpu->info->revn, adreno_gpu->rev.core, adreno_gpu->rev.major, adreno_gpu->rev.minor, adreno_gpu->rev.patchid); - printk("fence: %d/%d\n", adreno_gpu->memptrs->fence, - gpu->submitted_fence); - printk("rptr: %d\n", get_rptr(adreno_gpu)); - printk("wptr: %d\n", adreno_gpu->memptrs->wptr); - printk("rb wptr: %d\n", get_wptr(gpu->rb)); + FOR_EACH_RING(gpu, ring, i) { + if (!ring) + continue; + + dev_err(dev->dev, " ring %d: fence %d/%d rptr/wptr %x/%x\n", i, + adreno_last_fence(gpu, ring), + adreno_submitted_fence(gpu, ring), + get_rptr(adreno_gpu, ring), + get_wptr(ring)); + } for (i = 0; i < 8; i++) { - printk("CP_SCRATCH_REG%d: %u\n", i, + pr_err("CP_SCRATCH_REG%d: %u\n", i, gpu_read(gpu, REG_AXXX_CP_SCRATCH_REG0 + i)); } } @@ -332,19 +387,21 @@ void adreno_dump(struct msm_gpu *gpu) } } -static uint32_t ring_freewords(struct msm_gpu *gpu) +static uint32_t ring_freewords(struct msm_ringbuffer *ring) { - struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); - uint32_t size = gpu->rb->size / 4; - uint32_t wptr = get_wptr(gpu->rb); - uint32_t rptr = get_rptr(adreno_gpu); + struct adreno_gpu *adreno_gpu = to_adreno_gpu(ring->gpu); + uint32_t size = MSM_GPU_RINGBUFFER_SZ >> 2; + /* Use ring->next to calculate free size */ + uint32_t wptr = ring->next - ring->start; + uint32_t rptr = get_rptr(adreno_gpu, ring); return (rptr + (size - 1) - wptr) % size; } -void adreno_wait_ring(struct msm_gpu *gpu, uint32_t ndwords) +void adreno_wait_ring(struct msm_ringbuffer *ring, uint32_t ndwords) { - if (spin_until(ring_freewords(gpu) >= ndwords)) - DRM_ERROR("%s: timeout waiting for ringbuffer space\n", gpu->name); + if (spin_until(ring_freewords(ring) >= ndwords)) + DRM_ERROR("%s: timeout waiting for space in ringubffer %d\n", + ring->gpu->name, ring->id); } static const char *iommu_ports[] = { @@ -465,9 +522,11 @@ static int adreno_of_parse(struct platform_device *pdev, struct msm_gpu *gpu) } int adreno_gpu_init(struct drm_device *drm, struct platform_device *pdev, - struct 
adreno_gpu *adreno_gpu, const struct adreno_gpu_funcs *funcs) + struct adreno_gpu *adreno_gpu, + const struct adreno_gpu_funcs *funcs, int nr_rings) { struct adreno_platform_config *config = pdev->dev.platform_data; + struct msm_gpu_config adreno_gpu_config = { 0 }; struct msm_gpu *gpu = &adreno_gpu->base; struct msm_mmu *mmu; int ret; @@ -481,9 +540,26 @@ int adreno_gpu_init(struct drm_device *drm, struct platform_device *pdev, /* Get the rest of the target configuration from the device tree */ adreno_of_parse(pdev, gpu); + adreno_gpu_config.ioname = "kgsl_3d0_reg_memory"; + adreno_gpu_config.irqname = "kgsl_3d0_irq"; + adreno_gpu_config.nr_rings = nr_rings; + + adreno_gpu_config.va_start = SZ_16M; + adreno_gpu_config.va_end = 0xffffffff; + + if (adreno_gpu->revn >= 500) { + /* 5XX targets use a 64 bit region */ + adreno_gpu_config.va_start = 0x800000000; + adreno_gpu_config.va_end = 0x8ffffffff; + } else { + adreno_gpu_config.va_start = 0x300000; + adreno_gpu_config.va_end = 0xffffffff; + } + + adreno_gpu_config.nr_rings = nr_rings; + ret = msm_gpu_init(drm, pdev, &adreno_gpu->base, &funcs->base, - adreno_gpu->info->name, "kgsl_3d0_reg_memory", "kgsl_3d0_irq", - RB_SIZE); + adreno_gpu->info->name, &adreno_gpu_config); if (ret) return ret; @@ -542,7 +618,7 @@ void adreno_gpu_cleanup(struct adreno_gpu *gpu) if (gpu->memptrs_bo) { if (gpu->memptrs_iova) - msm_gem_put_iova(gpu->memptrs_bo, gpu->base.aspace); + msm_gem_put_iova(gpu->memptrs_bo, aspace); drm_gem_object_unreference_unlocked(gpu->memptrs_bo); } release_firmware(gpu->pm4); @@ -551,8 +627,85 @@ void adreno_gpu_cleanup(struct adreno_gpu *gpu) msm_gpu_cleanup(&gpu->base); if (aspace) { - aspace->mmu->funcs->detach(aspace->mmu, - iommu_ports, ARRAY_SIZE(iommu_ports)); - msm_gem_address_space_destroy(aspace); + aspace->mmu->funcs->detach(aspace->mmu); + msm_gem_address_space_put(aspace); } } + +static void adreno_snapshot_os(struct msm_gpu *gpu, + struct msm_snapshot *snapshot) +{ + struct msm_snapshot_linux header; + + memset(&header, 0, sizeof(header)); + + header.osid = SNAPSHOT_OS_LINUX_V3; + strlcpy(header.release, utsname()->release, sizeof(header.release)); + strlcpy(header.version, utsname()->version, sizeof(header.version)); + + header.seconds = get_seconds(); + header.ctxtcount = 0; + + SNAPSHOT_HEADER(snapshot, header, SNAPSHOT_SECTION_OS, 0); +} + +static void adreno_snapshot_ringbuffer(struct msm_gpu *gpu, + struct msm_snapshot *snapshot, struct msm_ringbuffer *ring) +{ + struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); + struct msm_snapshot_ringbuffer header; + unsigned int i, end = 0; + unsigned int *data = ring->start; + + memset(&header, 0, sizeof(header)); + + /* + * We only want to copy the active contents of each ring, so find the + * last valid entry in the ringbuffer + */ + for (i = 0; i < MSM_GPU_RINGBUFFER_SZ >> 2; i++) { + if (data[i]) + end = i; + } + + /* The dump always starts at 0 */ + header.start = 0; + header.end = end; + + /* This is the number of dwords being dumped */ + header.count = end + 1; + + /* This is the size of the actual ringbuffer */ + header.rbsize = MSM_GPU_RINGBUFFER_SZ >> 2; + + header.id = ring->id; + header.gpuaddr = ring->iova; + header.rptr = get_rptr(adreno_gpu, ring); + header.wptr = get_wptr(ring); + header.timestamp_queued = adreno_submitted_fence(gpu, ring); + header.timestamp_retired = adreno_last_fence(gpu, ring); + + /* Write the header even if the ringbuffer data is empty */ + if (!SNAPSHOT_HEADER(snapshot, header, SNAPSHOT_SECTION_RB_V2, + header.count)) + return; + + 
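+	/* Copy the active dwords (0 through 'end') that were counted above */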
SNAPSHOT_MEMCPY(snapshot, ring->start, header.count * sizeof(u32)); +} + +static void adreno_snapshot_ringbuffers(struct msm_gpu *gpu, + struct msm_snapshot *snapshot) +{ + struct msm_ringbuffer *ring; + int i; + + /* Write a new section for each ringbuffer */ + FOR_EACH_RING(gpu, ring, i) + adreno_snapshot_ringbuffer(gpu, snapshot, ring); +} + +void adreno_snapshot(struct msm_gpu *gpu, struct msm_snapshot *snapshot) +{ + adreno_snapshot_os(gpu, snapshot); + adreno_snapshot_ringbuffers(gpu, snapshot); +} diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.h b/drivers/gpu/drm/msm/adreno/adreno_gpu.h index 9bcf4bb705bd..30461115281c 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.h +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.h @@ -2,7 +2,7 @@ * Copyright (C) 2013 Red Hat * Author: Rob Clark <robdclark@gmail.com> * - * Copyright (c) 2014,2016 The Linux Foundation. All rights reserved. + * Copyright (c) 2014,2016-2017 The Linux Foundation. All rights reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 as published by @@ -83,13 +83,20 @@ struct adreno_info { const struct adreno_info *adreno_info(struct adreno_rev rev); -#define rbmemptr(adreno_gpu, member) \ +#define _sizeof(member) \ + sizeof(((struct adreno_rbmemptrs *) 0)->member[0]) + +#define _base(adreno_gpu, member) \ ((adreno_gpu)->memptrs_iova + offsetof(struct adreno_rbmemptrs, member)) +#define rbmemptr(adreno_gpu, index, member) \ + (_base((adreno_gpu), member) + ((index) * _sizeof(member))) + struct adreno_rbmemptrs { - volatile uint32_t rptr; - volatile uint32_t wptr; - volatile uint32_t fence; + volatile uint32_t rptr[MSM_GPU_MAX_RINGS]; + volatile uint32_t fence[MSM_GPU_MAX_RINGS]; + volatile uint64_t ttbr0[MSM_GPU_MAX_RINGS]; + volatile unsigned int contextidr[MSM_GPU_MAX_RINGS]; }; struct adreno_gpu { @@ -206,30 +213,34 @@ static inline int adreno_is_a540(struct adreno_gpu *gpu) int adreno_get_param(struct msm_gpu *gpu, uint32_t param, uint64_t *value); int adreno_hw_init(struct msm_gpu *gpu); -uint32_t adreno_last_fence(struct msm_gpu *gpu); +uint32_t adreno_last_fence(struct msm_gpu *gpu, struct msm_ringbuffer *ring); +uint32_t adreno_submitted_fence(struct msm_gpu *gpu, + struct msm_ringbuffer *ring); void adreno_recover(struct msm_gpu *gpu); -int adreno_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, - struct msm_file_private *ctx); -void adreno_flush(struct msm_gpu *gpu); -bool adreno_idle(struct msm_gpu *gpu); +int adreno_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit); +void adreno_flush(struct msm_gpu *gpu, struct msm_ringbuffer *ring); +bool adreno_idle(struct msm_gpu *gpu, struct msm_ringbuffer *ring); #ifdef CONFIG_DEBUG_FS void adreno_show(struct msm_gpu *gpu, struct seq_file *m); #endif void adreno_dump_info(struct msm_gpu *gpu); void adreno_dump(struct msm_gpu *gpu); -void adreno_wait_ring(struct msm_gpu *gpu, uint32_t ndwords); +void adreno_wait_ring(struct msm_ringbuffer *ring, uint32_t ndwords); +struct msm_ringbuffer *adreno_active_ring(struct msm_gpu *gpu); int adreno_gpu_init(struct drm_device *drm, struct platform_device *pdev, - struct adreno_gpu *gpu, const struct adreno_gpu_funcs *funcs); + struct adreno_gpu *gpu, const struct adreno_gpu_funcs *funcs, + int nr_rings); void adreno_gpu_cleanup(struct adreno_gpu *gpu); +void adreno_snapshot(struct msm_gpu *gpu, struct msm_snapshot *snapshot); /* ringbuffer helpers (the parts that are adreno specific) */ static inline void OUT_PKT0(struct 
msm_ringbuffer *ring, uint16_t regindx, uint16_t cnt) { - adreno_wait_ring(ring->gpu, cnt+1); + adreno_wait_ring(ring, cnt+1); OUT_RING(ring, CP_TYPE0_PKT | ((cnt-1) << 16) | (regindx & 0x7FFF)); } @@ -237,14 +248,14 @@ OUT_PKT0(struct msm_ringbuffer *ring, uint16_t regindx, uint16_t cnt) static inline void OUT_PKT2(struct msm_ringbuffer *ring) { - adreno_wait_ring(ring->gpu, 1); + adreno_wait_ring(ring, 1); OUT_RING(ring, CP_TYPE2_PKT); } static inline void OUT_PKT3(struct msm_ringbuffer *ring, uint8_t opcode, uint16_t cnt) { - adreno_wait_ring(ring->gpu, cnt+1); + adreno_wait_ring(ring, cnt+1); OUT_RING(ring, CP_TYPE3_PKT | ((cnt-1) << 16) | ((opcode & 0xFF) << 8)); } @@ -266,14 +277,14 @@ static inline u32 PM4_PARITY(u32 val) static inline void OUT_PKT4(struct msm_ringbuffer *ring, uint16_t regindx, uint16_t cnt) { - adreno_wait_ring(ring->gpu, cnt + 1); + adreno_wait_ring(ring, cnt + 1); OUT_RING(ring, PKT4(regindx, cnt)); } static inline void OUT_PKT7(struct msm_ringbuffer *ring, uint8_t opcode, uint16_t cnt) { - adreno_wait_ring(ring->gpu, cnt + 1); + adreno_wait_ring(ring, cnt + 1); OUT_RING(ring, CP_TYPE7_PKT | (cnt << 0) | (PM4_PARITY(cnt) << 15) | ((opcode & 0x7F) << 16) | (PM4_PARITY(opcode) << 23)); } @@ -328,6 +339,11 @@ static inline void adreno_gpu_write64(struct adreno_gpu *gpu, adreno_gpu_write(gpu, hi, upper_32_bits(data)); } +static inline uint32_t get_wptr(struct msm_ringbuffer *ring) +{ + return (ring->cur - ring->start) % (MSM_GPU_RINGBUFFER_SZ >> 2); +} + /* * Given a register and a count, return a value to program into * REG_CP_PROTECT_REG(n) - this will block both reads and writes for _len diff --git a/drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.c b/drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.c index da9d58eccc3a..b6cddee0cf34 100644 --- a/drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.c +++ b/drivers/gpu/drm/msm/mdp/mdp4/mdp4_kms.c @@ -204,7 +204,7 @@ static void mdp4_destroy(struct msm_kms *kms) if (aspace) { aspace->mmu->funcs->detach(aspace->mmu, iommu_ports, ARRAY_SIZE(iommu_ports)); - msm_gem_address_space_destroy(aspace); + msm_gem_address_space_put(aspace); } kfree(mdp4_kms); diff --git a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.c b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.c index ede681e12a68..e4e69ebd116e 100644 --- a/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.c +++ b/drivers/gpu/drm/msm/mdp/mdp5/mdp5_kms.c @@ -136,9 +136,8 @@ static void mdp5_destroy(struct msm_kms *kms) mdp5_irq_domain_fini(mdp5_kms); if (aspace) { - aspace->mmu->funcs->detach(aspace->mmu, - iommu_ports, ARRAY_SIZE(iommu_ports)); - msm_gem_address_space_destroy(aspace); + aspace->mmu->funcs->detach(aspace->mmu); + msm_gem_address_space_put(aspace); } if (mdp5_kms->ctlm) diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c index 9163b90981a2..0231ac3f269f 100644 --- a/drivers/gpu/drm/msm/msm_drv.c +++ b/drivers/gpu/drm/msm/msm_drv.c @@ -23,6 +23,8 @@ #include "sde_wb.h" #define TEARDOWN_DEADLOCK_RETRY_MAX 5 +#include "msm_gem.h" +#include "msm_mmu.h" static void msm_fb_output_poll_changed(struct drm_device *dev) { @@ -552,29 +554,65 @@ static void load_gpu(struct drm_device *dev) } #endif -static int msm_open(struct drm_device *dev, struct drm_file *file) +static struct msm_file_private *setup_pagetable(struct msm_drm_private *priv) { struct msm_file_private *ctx; + if (!priv || !priv->gpu) + return NULL; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return ERR_PTR(-ENOMEM); + + ctx->aspace = msm_gem_address_space_create_instance( + priv->gpu->aspace->mmu, "gpu", 0x100000000, 
0x1ffffffff); + + if (IS_ERR(ctx->aspace)) { + int ret = PTR_ERR(ctx->aspace); + + /* + * If dynamic domains are not supported, everybody uses the + * same pagetable + */ + if (ret != -EOPNOTSUPP) { + kfree(ctx); + return ERR_PTR(ret); + } + + ctx->aspace = priv->gpu->aspace; + } + + ctx->aspace->mmu->funcs->attach(ctx->aspace->mmu, NULL, 0); + return ctx; +} + +static int msm_open(struct drm_device *dev, struct drm_file *file) +{ + struct msm_file_private *ctx = NULL; + struct msm_drm_private *priv; + struct msm_kms *kms; + + if (!dev || !dev->dev_private) + return -ENODEV; + + priv = dev->dev_private; /* For now, load gpu on open.. to avoid the requirement of having * firmware in the initrd. */ load_gpu(dev); - ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); - if (!ctx) - return -ENOMEM; + ctx = setup_pagetable(priv); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); file->driver_priv = ctx; - if (dev && dev->dev_private) { - struct msm_drm_private *priv = dev->dev_private; - struct msm_kms *kms; + kms = priv->kms; + + if (kms && kms->funcs && kms->funcs->postopen) + kms->funcs->postopen(kms, file); - kms = priv->kms; - if (kms && kms->funcs && kms->funcs->postopen) - kms->funcs->postopen(kms, file); - } return 0; } @@ -597,8 +635,10 @@ static void msm_postclose(struct drm_device *dev, struct drm_file *file) kms->funcs->postclose(kms, file); mutex_lock(&dev->struct_mutex); - if (ctx == priv->lastctx) - priv->lastctx = NULL; + if (ctx && ctx->aspace && ctx->aspace != priv->gpu->aspace) { + ctx->aspace->mmu->funcs->detach(ctx->aspace->mmu); + msm_gem_address_space_put(ctx->aspace); + } mutex_unlock(&dev->struct_mutex); kfree(ctx); @@ -797,6 +837,13 @@ static int msm_gpu_show(struct drm_device *dev, struct seq_file *m) return 0; } +static int msm_snapshot_show(struct drm_device *dev, struct seq_file *m) +{ + struct msm_drm_private *priv = dev->dev_private; + + return msm_snapshot_write(priv->gpu, m); +} + static int msm_gem_show(struct drm_device *dev, struct seq_file *m) { struct msm_drm_private *priv = dev->dev_private; @@ -861,11 +908,22 @@ static int show_locked(struct seq_file *m, void *arg) return ret; } +static int show_unlocked(struct seq_file *m, void *arg) +{ + struct drm_info_node *node = (struct drm_info_node *) m->private; + struct drm_device *dev = node->minor->dev; + int (*show)(struct drm_device *dev, struct seq_file *m) = + node->info_ent->data; + + return show(dev, m); +} + static struct drm_info_list msm_debugfs_list[] = { {"gpu", show_locked, 0, msm_gpu_show}, {"gem", show_locked, 0, msm_gem_show}, { "mm", show_locked, 0, msm_mm_show }, { "fb", show_locked, 0, msm_fb_show }, + { "snapshot", show_unlocked, 0, msm_snapshot_show }, }; static int late_init_minor(struct drm_minor *minor) @@ -939,14 +997,23 @@ int msm_wait_fence(struct drm_device *dev, uint32_t fence, ktime_t *timeout , bool interruptible) { struct msm_drm_private *priv = dev->dev_private; + struct msm_gpu *gpu = priv->gpu; + int index = FENCE_RING(fence); + uint32_t submitted; int ret; - if (!priv->gpu) + if (!gpu) return -ENXIO; - if (fence > priv->gpu->submitted_fence) { + if (index > MSM_GPU_MAX_RINGS || index >= gpu->nr_rings || + !gpu->rb[index]) + return -EINVAL; + + submitted = gpu->funcs->submitted_fence(gpu, gpu->rb[index]); + + if (fence > submitted) { DRM_ERROR("waiting on invalid fence: %u (of %u)\n", - fence, priv->gpu->submitted_fence); + fence, submitted); return -EINVAL; } @@ -976,7 +1043,7 @@ int msm_wait_fence(struct drm_device *dev, uint32_t fence, if (ret == 0) { DBG("timeout waiting for fence: %u 
(completed: %u)", - fence, priv->completed_fence); + fence, priv->completed_fence[index]); ret = -ETIMEDOUT; } else if (ret != -ERESTARTSYS) { ret = 0; @@ -990,12 +1057,13 @@ int msm_queue_fence_cb(struct drm_device *dev, struct msm_fence_cb *cb, uint32_t fence) { struct msm_drm_private *priv = dev->dev_private; + int index = FENCE_RING(fence); int ret = 0; mutex_lock(&dev->struct_mutex); if (!list_empty(&cb->work.entry)) { ret = -EINVAL; - } else if (fence > priv->completed_fence) { + } else if (fence > priv->completed_fence[index]) { cb->fence = fence; list_add_tail(&cb->work.entry, &priv->fence_cbs); } else { @@ -1010,21 +1078,21 @@ int msm_queue_fence_cb(struct drm_device *dev, void msm_update_fence(struct drm_device *dev, uint32_t fence) { struct msm_drm_private *priv = dev->dev_private; + struct msm_fence_cb *cb, *tmp; + int index = FENCE_RING(fence); - mutex_lock(&dev->struct_mutex); - priv->completed_fence = max(fence, priv->completed_fence); - - while (!list_empty(&priv->fence_cbs)) { - struct msm_fence_cb *cb; - - cb = list_first_entry(&priv->fence_cbs, - struct msm_fence_cb, work.entry); + if (index >= MSM_GPU_MAX_RINGS) + return; - if (cb->fence > priv->completed_fence) - break; + mutex_lock(&dev->struct_mutex); + priv->completed_fence[index] = max(fence, priv->completed_fence[index]); - list_del_init(&cb->work.entry); - queue_work(priv->wq, &cb->work); + list_for_each_entry_safe(cb, tmp, &priv->fence_cbs, work.entry) { + if (COMPARE_FENCE_LTE(cb->fence, + priv->completed_fence[index])) { + list_del_init(&cb->work.entry); + queue_work(priv->wq, &cb->work); + } } mutex_unlock(&dev->struct_mutex); diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h index 406b40f0f8de..0a06c1cb27a4 100644 --- a/drivers/gpu/drm/msm/msm_drv.h +++ b/drivers/gpu/drm/msm/msm_drv.h @@ -74,11 +74,7 @@ struct msm_gem_vma; #define MAX_CONNECTORS 8 struct msm_file_private { - /* currently we don't do anything useful with this.. but when - * per-context address spaces are supported we'd keep track of - * the context's page-tables here. - */ - int dummy; + struct msm_gem_address_space *aspace; }; enum msm_mdp_plane_property { @@ -250,6 +246,8 @@ struct msm_drm_commit { struct kthread_worker worker; }; +#define MSM_GPU_MAX_RINGS 4 + struct msm_drm_private { struct msm_kms *kms; @@ -276,11 +274,12 @@ struct msm_drm_private { /* when we have more than one 'msm_gpu' these need to be an array: */ struct msm_gpu *gpu; - struct msm_file_private *lastctx; struct drm_fb_helper *fbdev; - uint32_t next_fence, completed_fence; + uint32_t next_fence[MSM_GPU_MAX_RINGS]; + uint32_t completed_fence[MSM_GPU_MAX_RINGS]; + wait_queue_head_t fence_event; struct msm_rd_state *rd; @@ -351,6 +350,31 @@ struct msm_format { uint32_t pixel_format; }; +/* + * Some GPU targets can support multiple ringbuffers and preempt between them. + * In order to do this without massive API changes we will steal two bits from + * the top of the fence and use them to identify the ringbuffer, (0x00000001 for + * riug 0, 0x40000001 for ring 1, 0x50000001 for ring 2, etc). If you are going + * to do a fence comparision you have to make sure you are only comparing + * against fences from the same ring, but since fences within a ringbuffer are + * still contigious you can still use straight comparisons (i.e 0x40000001 is + * older than 0x40000002). Mathmatically there will be 0x3FFFFFFF timestamps + * per ring or ~103 days of 120 interrupts per second (two interrupts per frame + * at 60 FPS). 
+ */ +#define FENCE_RING(_fence) ((_fence >> 30) & 3) +#define FENCE(_ring, _fence) ((((_ring) & 3) << 30) | ((_fence) & 0x3FFFFFFF)) + +static inline bool COMPARE_FENCE_LTE(uint32_t a, uint32_t b) +{ + return ((FENCE_RING(a) == FENCE_RING(b)) && a <= b); +} + +static inline bool COMPARE_FENCE_LT(uint32_t a, uint32_t b) +{ + return ((FENCE_RING(a) == FENCE_RING(b)) && a < b); +} + /* callback from wq once fence has passed: */ struct msm_fence_cb { struct work_struct work; @@ -379,13 +403,17 @@ void msm_gem_unmap_vma(struct msm_gem_address_space *aspace, void *priv); int msm_gem_map_vma(struct msm_gem_address_space *aspace, struct msm_gem_vma *vma, struct sg_table *sgt, - void *priv); -void msm_gem_address_space_destroy(struct msm_gem_address_space *aspace); + void *priv, unsigned int flags); + +void msm_gem_address_space_put(struct msm_gem_address_space *aspace); /* For GPU and legacy display */ struct msm_gem_address_space * msm_gem_address_space_create(struct device *dev, struct iommu_domain *domain, const char *name); +struct msm_gem_address_space * +msm_gem_address_space_create_instance(struct msm_mmu *parent, const char *name, + uint64_t start, uint64_t end); /* For SDE display */ struct msm_gem_address_space * @@ -529,7 +557,8 @@ u32 msm_readl(const void __iomem *addr); static inline bool fence_completed(struct drm_device *dev, uint32_t fence) { struct msm_drm_private *priv = dev->dev_private; - return priv->completed_fence >= fence; + + return priv->completed_fence[FENCE_RING(fence)] >= fence; } static inline int align_pitch(int width, int bpp) diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index f42c7bff18fb..63128d11767e 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -330,6 +330,10 @@ static struct msm_gem_vma *obj_get_domain(struct drm_gem_object *obj, return NULL; } +#ifndef IOMMU_PRIV +#define IOMMU_PRIV 0 +#endif + /* should be called under struct_mutex.. although it can be called * from atomic context without struct_mutex to acquire an extra * iova ref if you know one is already held. 
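Taken together, the hunks above and below define a get/put lifetime for GPU
address spaces. A minimal sketch of the intended flow, using the helpers added
in this patch (error handling and locking omitted; 'parent_mmu', 'vma', 'sgt',
'priv' and 'flags' are stand-ins):

	/* Per-process pagetable: a dynamic aspace cloned from the GPU MMU */
	struct msm_gem_address_space *aspace =
		msm_gem_address_space_create_instance(parent_mmu, "gpu",
			0x100000000ULL, 0x1ffffffffULL);

	/* Each successful map takes a reference (kref_get in the map ops) */
	msm_gem_map_vma(aspace, vma, sgt, priv, flags);

	/* Each unmap drops one reference via msm_gem_address_space_put() */
	msm_gem_unmap_vma(aspace, vma, sgt, priv);

	/* The owner's reference goes last; the kref release frees the aspace */
	msm_gem_address_space_put(aspace);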
@@ -369,7 +373,7 @@ int msm_gem_get_iova_locked(struct drm_gem_object *obj, } ret = msm_gem_map_vma(aspace, domain, msm_obj->sgt, - get_dmabuf_ptr(obj)); + get_dmabuf_ptr(obj), msm_obj->flags); } if (!ret) diff --git a/drivers/gpu/drm/msm/msm_gem.h b/drivers/gpu/drm/msm/msm_gem.h index d9c3977434d9..ac46c473791f 100644 --- a/drivers/gpu/drm/msm/msm_gem.h +++ b/drivers/gpu/drm/msm/msm_gem.h @@ -18,6 +18,7 @@ #ifndef __MSM_GEM_H__ #define __MSM_GEM_H__ +#include <linux/kref.h> #include <linux/reservation.h> #include "msm_drv.h" @@ -26,7 +27,7 @@ struct msm_gem_aspace_ops { int (*map)(struct msm_gem_address_space *, struct msm_gem_vma *, - struct sg_table *sgt, void *priv); + struct sg_table *sgt, void *priv, unsigned int flags); void (*unmap)(struct msm_gem_address_space *, struct msm_gem_vma *, struct sg_table *sgt, void *priv); @@ -38,6 +39,7 @@ struct msm_gem_address_space { const char *name; struct msm_mmu *mmu; const struct msm_gem_aspace_ops *ops; + struct kref kref; }; struct msm_gem_vma { @@ -116,11 +118,12 @@ static inline uint32_t msm_gem_fence(struct msm_gem_object *msm_obj, */ struct msm_gem_submit { struct drm_device *dev; - struct msm_gpu *gpu; + struct msm_gem_address_space *aspace; struct list_head node; /* node in gpu submit_list */ struct list_head bo_list; struct ww_acquire_ctx ticket; uint32_t fence; + int ring; bool valid; unsigned int nr_cmds; unsigned int nr_bos; diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index f7b5e30b41eb..0566cefaae81 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -34,7 +34,7 @@ static inline void __user *to_user_ptr(u64 address) } static struct msm_gem_submit *submit_create(struct drm_device *dev, - struct msm_gpu *gpu, int nr) + struct msm_gem_address_space *aspace, int nr) { struct msm_gem_submit *submit; int sz = sizeof(*submit) + (nr * sizeof(submit->bos[0])); @@ -42,7 +42,7 @@ static struct msm_gem_submit *submit_create(struct drm_device *dev, submit = kmalloc(sz, GFP_TEMPORARY | __GFP_NOWARN | __GFP_NORETRY); if (submit) { submit->dev = dev; - submit->gpu = gpu; + submit->aspace = aspace; /* initially, until copy_from_user() and bo lookup succeeds: */ submit->nr_bos = 0; @@ -142,7 +142,7 @@ static void submit_unlock_unpin_bo(struct msm_gem_submit *submit, int i) struct msm_gem_object *msm_obj = submit->bos[i].obj; if (submit->bos[i].flags & BO_PINNED) - msm_gem_put_iova(&msm_obj->base, submit->gpu->aspace); + msm_gem_put_iova(&msm_obj->base, submit->aspace); if (submit->bos[i].flags & BO_LOCKED) ww_mutex_unlock(&msm_obj->resv->lock); @@ -181,7 +181,7 @@ retry: /* if locking succeeded, pin bo: */ ret = msm_gem_get_iova_locked(&msm_obj->base, - submit->gpu->aspace, &iova); + submit->aspace, &iova); /* this would break the logic in the fail path.. there is no * reason for this to happen, but just to be on the safe side @@ -349,7 +349,7 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, /* for now, we just have 3d pipe.. 
eventually this would need to * be more clever to dispatch to appropriate gpu module: */ - if (args->pipe != MSM_PIPE_3D0) + if (MSM_PIPE_ID(args->flags) != MSM_PIPE_3D0) return -EINVAL; gpu = priv->gpu; @@ -361,7 +361,7 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, mutex_lock(&dev->struct_mutex); - submit = submit_create(dev, gpu, args->nr_bos); + submit = submit_create(dev, ctx->aspace, args->nr_bos); if (!submit) { ret = -ENOMEM; goto out; @@ -412,8 +412,9 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, goto out; } - if ((submit_cmd.size + submit_cmd.submit_offset) >= - msm_obj->base.size) { + if (!(submit_cmd.size) || + ((submit_cmd.size + submit_cmd.submit_offset) > + msm_obj->base.size)) { DRM_ERROR("invalid cmdstream size: %u\n", submit_cmd.size); ret = -EINVAL; goto out; @@ -435,7 +436,12 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, submit->nr_cmds = i; - ret = msm_gpu_submit(gpu, submit, ctx); + /* Clamp the user submitted ring to the range of available rings */ + submit->ring = clamp_t(uint32_t, + (args->flags & MSM_SUBMIT_RING_MASK) >> MSM_SUBMIT_RING_SHIFT, + 0, gpu->nr_rings - 1); + + ret = msm_gpu_submit(gpu, submit); args->fence = submit->fence; diff --git a/drivers/gpu/drm/msm/msm_gem_vma.c b/drivers/gpu/drm/msm/msm_gem_vma.c index 53e70263e03c..7ca96831a9b3 100644 --- a/drivers/gpu/drm/msm/msm_gem_vma.c +++ b/drivers/gpu/drm/msm/msm_gem_vma.c @@ -19,6 +19,24 @@ #include "msm_gem.h" #include "msm_mmu.h" +static void +msm_gem_address_space_destroy(struct kref *kref) +{ + struct msm_gem_address_space *aspace = container_of(kref, + struct msm_gem_address_space, kref); + + if (aspace->ops->destroy) + aspace->ops->destroy(aspace); + + kfree(aspace); +} + +void msm_gem_address_space_put(struct msm_gem_address_space *aspace) +{ + if (aspace) + kref_put(&aspace->kref, msm_gem_address_space_destroy); +} + /* SDE address space operations */ static void smmu_aspace_unmap_vma(struct msm_gem_address_space *aspace, struct msm_gem_vma *vma, struct sg_table *sgt, @@ -34,12 +52,14 @@ static void smmu_aspace_unmap_vma(struct msm_gem_address_space *aspace, DMA_BIDIRECTIONAL); vma->iova = 0; + + msm_gem_address_space_put(aspace); } static int smmu_aspace_map_vma(struct msm_gem_address_space *aspace, struct msm_gem_vma *vma, struct sg_table *sgt, - void *priv) + void *priv, unsigned int flags) { struct dma_buf *buf = priv; int ret; @@ -54,6 +74,9 @@ static int smmu_aspace_map_vma(struct msm_gem_address_space *aspace, if (!ret) vma->iova = sg_dma_address(sgt->sgl); + /* Get a reference to the aspace to keep it around */ + kref_get(&aspace->kref); + return ret; } @@ -79,6 +102,8 @@ msm_gem_smmu_address_space_create(struct device *dev, struct msm_mmu *mmu, aspace->mmu = mmu; aspace->ops = &smmu_aspace_ops; + kref_init(&aspace->kref); + return aspace; } @@ -104,16 +129,25 @@ static void iommu_aspace_unmap_vma(struct msm_gem_address_space *aspace, drm_mm_remove_node(&vma->node); vma->iova = 0; + + msm_gem_address_space_put(aspace); } static int iommu_aspace_map_vma(struct msm_gem_address_space *aspace, - struct msm_gem_vma *vma, struct sg_table *sgt, - void *priv) + struct msm_gem_vma *vma, struct sg_table *sgt, void *priv, + unsigned int flags) { struct msm_iommu_aspace *local = to_iommu_aspace(aspace); size_t size = 0; struct scatterlist *sg; - int ret = 0, i; + int ret, i; + int iommu_flags = IOMMU_READ; + + if (!(flags & MSM_BO_GPU_READONLY)) + iommu_flags |= IOMMU_WRITE; + + if (flags & MSM_BO_PRIVILEGED) + iommu_flags |= IOMMU_PRIV; if 
(WARN_ON(drm_mm_node_allocated(&vma->node))) return 0; @@ -129,8 +163,11 @@ static int iommu_aspace_map_vma(struct msm_gem_address_space *aspace, vma->iova = vma->node.start << PAGE_SHIFT; if (aspace->mmu) - ret = aspace->mmu->funcs->map(aspace->mmu, vma->iova, - sgt, IOMMU_READ | IOMMU_WRITE); + ret = aspace->mmu->funcs->map(aspace->mmu, vma->iova, sgt, + iommu_flags); + + /* Get a reference to the aspace to keep it around */ + kref_get(&aspace->kref); return ret; } @@ -169,15 +206,17 @@ msm_gem_address_space_new(struct msm_mmu *mmu, const char *name, local->base.mmu = mmu; local->base.ops = &msm_iommu_aspace_ops; + kref_init(&local->base.kref); + return &local->base; } int msm_gem_map_vma(struct msm_gem_address_space *aspace, struct msm_gem_vma *vma, struct sg_table *sgt, - void *priv) + void *priv, unsigned int flags) { if (aspace && aspace->ops->map) - return aspace->ops->map(aspace, vma, sgt, priv); + return aspace->ops->map(aspace, vma, sgt, priv, flags); return -EINVAL; } @@ -203,11 +242,15 @@ msm_gem_address_space_create(struct device *dev, struct iommu_domain *domain, domain->geometry.aperture_end); } -void -msm_gem_address_space_destroy(struct msm_gem_address_space *aspace) +/* Create a new dynamic instance */ +struct msm_gem_address_space * +msm_gem_address_space_create_instance(struct msm_mmu *parent, const char *name, + uint64_t start, uint64_t end) { - if (aspace && aspace->ops->destroy) - aspace->ops->destroy(aspace); + struct msm_mmu *child = msm_iommu_new_dynamic(parent); - kfree(aspace); + if (IS_ERR(child)) + return (struct msm_gem_address_space *) child; + + return msm_gem_address_space_new(child, name, start, end); } diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c index 08ecc089611f..3fb480f41fde 100644 --- a/drivers/gpu/drm/msm/msm_gpu.c +++ b/drivers/gpu/drm/msm/msm_gpu.c @@ -93,17 +93,17 @@ static int enable_clk(struct msm_gpu *gpu) uint32_t rate = gpu->gpufreq[gpu->active_level]; int i; - clk_set_rate(gpu->grp_clks[0], rate); + if (gpu->core_clk) + clk_set_rate(gpu->core_clk, rate); - if (gpu->grp_clks[3]) - clk_set_rate(gpu->grp_clks[3], 19200000); + if (gpu->rbbmtimer_clk) + clk_set_rate(gpu->rbbmtimer_clk, 19200000); - /* NOTE: kgsl_pwrctrl_clk() ignores grp_clks[0].. */ - for (i = ARRAY_SIZE(gpu->grp_clks) - 1; i > 0; i--) + for (i = gpu->nr_clocks - 1; i >= 0; i--) if (gpu->grp_clks[i]) clk_prepare(gpu->grp_clks[i]); - for (i = ARRAY_SIZE(gpu->grp_clks) - 1; i > 0; i--) + for (i = gpu->nr_clocks - 1; i >= 0; i--) if (gpu->grp_clks[i]) clk_enable(gpu->grp_clks[i]); @@ -115,16 +115,20 @@ static int disable_clk(struct msm_gpu *gpu) uint32_t rate = gpu->gpufreq[gpu->nr_pwrlevels - 1]; int i; - /* NOTE: kgsl_pwrctrl_clk() ignores grp_clks[0].. 
*/ - for (i = ARRAY_SIZE(gpu->grp_clks) - 1; i > 0; i--) + for (i = gpu->nr_clocks - 1; i >= 0; i--) if (gpu->grp_clks[i]) clk_disable(gpu->grp_clks[i]); - for (i = ARRAY_SIZE(gpu->grp_clks) - 1; i > 0; i--) + for (i = gpu->nr_clocks - 1; i >= 0; i--) if (gpu->grp_clks[i]) clk_unprepare(gpu->grp_clks[i]); - clk_set_rate(gpu->grp_clks[0], rate); + if (gpu->core_clk) + clk_set_rate(gpu->core_clk, rate); + + if (gpu->rbbmtimer_clk) + clk_set_rate(gpu->rbbmtimer_clk, 0); + return 0; } @@ -273,17 +277,35 @@ static void recover_worker(struct work_struct *work) mutex_lock(&dev->struct_mutex); if (msm_gpu_active(gpu)) { struct msm_gem_submit *submit; - uint32_t fence = gpu->funcs->last_fence(gpu); - - /* retire completed submits, plus the one that hung: */ - retire_submits(gpu, fence + 1); + struct msm_ringbuffer *ring; + int i; inactive_cancel(gpu); + + FOR_EACH_RING(gpu, ring, i) { + uint32_t fence; + + if (!ring) + continue; + + fence = gpu->funcs->last_fence(gpu, ring); + + /* + * Retire the faulting command on the active ring and + * make sure the other rings are cleaned up + */ + if (ring == gpu->funcs->active_ring(gpu)) + retire_submits(gpu, fence + 1); + else + retire_submits(gpu, fence); + } + + /* Recover the GPU */ gpu->funcs->recover(gpu); - /* replay the remaining submits after the one that hung: */ + /* replay the remaining submits for all rings: */ list_for_each_entry(submit, &gpu->submit_list, node) { - gpu->funcs->submit(gpu, submit, NULL); + gpu->funcs->submit(gpu, submit); } } mutex_unlock(&dev->struct_mutex); @@ -303,25 +325,28 @@ static void hangcheck_handler(unsigned long data) struct msm_gpu *gpu = (struct msm_gpu *)data; struct drm_device *dev = gpu->dev; struct msm_drm_private *priv = dev->dev_private; - uint32_t fence = gpu->funcs->last_fence(gpu); + struct msm_ringbuffer *ring = gpu->funcs->active_ring(gpu); + uint32_t fence = gpu->funcs->last_fence(gpu, ring); + uint32_t submitted = gpu->funcs->submitted_fence(gpu, ring); - if (fence != gpu->hangcheck_fence) { + if (fence != gpu->hangcheck_fence[ring->id]) { /* some progress has been made.. ya! */ - gpu->hangcheck_fence = fence; - } else if (fence < gpu->submitted_fence) { + gpu->hangcheck_fence[ring->id] = fence; + } else if (fence < submitted) { /* no progress and not done.. hung! 
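 * (the fence for this ring stopped advancing while submits are still
 * outstanding, so recover_work is queued below to reset and replay)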
*/ - gpu->hangcheck_fence = fence; - dev_err(dev->dev, "%s: hangcheck detected gpu lockup!\n", - gpu->name); + gpu->hangcheck_fence[ring->id] = fence; + dev_err(dev->dev, "%s: hangcheck detected gpu lockup rb %d!\n", + gpu->name, ring->id); dev_err(dev->dev, "%s: completed fence: %u\n", gpu->name, fence); dev_err(dev->dev, "%s: submitted fence: %u\n", - gpu->name, gpu->submitted_fence); + gpu->name, submitted); + queue_work(priv->wq, &gpu->recover_work); } /* if still more pending work, reset the hangcheck timer: */ - if (gpu->submitted_fence > gpu->hangcheck_fence) + if (submitted > gpu->hangcheck_fence[ring->id]) hangcheck_timer_reset(gpu); /* workaround for missing irq: */ @@ -430,54 +455,66 @@ out: static void retire_submits(struct msm_gpu *gpu, uint32_t fence) { struct drm_device *dev = gpu->dev; + struct msm_gem_submit *submit, *tmp; WARN_ON(!mutex_is_locked(&dev->struct_mutex)); - while (!list_empty(&gpu->submit_list)) { - struct msm_gem_submit *submit; - - submit = list_first_entry(&gpu->submit_list, - struct msm_gem_submit, node); + /* + * Find and retire all the submits in the same ring that are older than + * or equal to 'fence' + */ - if (submit->fence <= fence) { + list_for_each_entry_safe(submit, tmp, &gpu->submit_list, node) { + if (COMPARE_FENCE_LTE(submit->fence, fence)) { list_del(&submit->node); kfree(submit); - } else { - break; } } } -static void retire_worker(struct work_struct *work) +static bool _fence_signaled(struct msm_gem_object *obj, uint32_t fence) { - struct msm_gpu *gpu = container_of(work, struct msm_gpu, retire_work); - struct drm_device *dev = gpu->dev; - uint32_t fence = gpu->funcs->last_fence(gpu); + if (obj->write_fence & 0x3FFFFFFF) + return COMPARE_FENCE_LTE(obj->write_fence, fence); - msm_update_fence(gpu->dev, fence); + return COMPARE_FENCE_LTE(obj->read_fence, fence); +} - mutex_lock(&dev->struct_mutex); +static void _retire_ring(struct msm_gpu *gpu, uint32_t fence) +{ + struct msm_gem_object *obj, *tmp; retire_submits(gpu, fence); - while (!list_empty(&gpu->active_list)) { - struct msm_gem_object *obj; - - obj = list_first_entry(&gpu->active_list, - struct msm_gem_object, mm_list); - - if ((obj->read_fence <= fence) && - (obj->write_fence <= fence)) { - /* move to inactive: */ + list_for_each_entry_safe(obj, tmp, &gpu->active_list, mm_list) { + if (_fence_signaled(obj, fence)) { msm_gem_move_to_inactive(&obj->base); msm_gem_put_iova(&obj->base, gpu->aspace); drm_gem_object_unreference(&obj->base); - } else { - break; } } +} - mutex_unlock(&dev->struct_mutex); +static void retire_worker(struct work_struct *work) +{ + struct msm_gpu *gpu = container_of(work, struct msm_gpu, retire_work); + struct drm_device *dev = gpu->dev; + struct msm_ringbuffer *ring; + int i; + + FOR_EACH_RING(gpu, ring, i) { + uint32_t fence; + + if (!ring) + continue; + + fence = gpu->funcs->last_fence(gpu, ring); + msm_update_fence(gpu->dev, fence); + + mutex_lock(&dev->struct_mutex); + _retire_ring(gpu, fence); + mutex_unlock(&dev->struct_mutex); + } if (!msm_gpu_active(gpu)) inactive_start(gpu); @@ -492,18 +529,16 @@ void msm_gpu_retire(struct msm_gpu *gpu) } /* add bo's to gpu's ring, and kick gpu: */ -int msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, - struct msm_file_private *ctx) +int msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit) { struct drm_device *dev = gpu->dev; struct msm_drm_private *priv = dev->dev_private; + struct msm_ringbuffer *ring = gpu->rb[submit->ring]; int i, ret; WARN_ON(!mutex_is_locked(&dev->struct_mutex)); - 
-
-	gpu->submitted_fence = submit->fence;
+	submit->fence = FENCE(submit->ring, ++priv->next_fence[submit->ring]);
 
 	inactive_cancel(gpu);
 
@@ -511,7 +546,7 @@ int msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,
 
 	msm_rd_dump_submit(submit);
 
-	gpu->submitted_fence = submit->fence;
+	ring->submitted_fence = submit->fence;
 
 	update_sw_cntrs(gpu);
 
@@ -529,7 +564,7 @@ int msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,
 			/* ring takes a reference to the bo and iova: */
 			drm_gem_object_reference(&msm_obj->base);
 			msm_gem_get_iova_locked(&msm_obj->base,
-					submit->gpu->aspace, &iova);
+					submit->aspace, &iova);
 		}
 
 		if (submit->bos[i].flags & MSM_SUBMIT_BO_READ)
@@ -539,8 +574,7 @@ int msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,
 			msm_gem_move_to_active(&msm_obj->base, gpu, true, submit->fence);
 	}
 
-	ret = gpu->funcs->submit(gpu, submit, ctx);
-	priv->lastctx = ctx;
+	ret = gpu->funcs->submit(gpu, submit);
 
 	hangcheck_timer_reset(gpu);
 
@@ -557,17 +591,54 @@ static irqreturn_t irq_handler(int irq, void *data)
 	return gpu->funcs->irq(gpu);
 }
 
-static const char *clk_names[] = {
-	"src_clk", "core_clk", "iface_clk", "rbbmtimer_clk",
-	"mem_clk", "mem_iface_clk", "alt_mem_iface_clk",
-};
+static struct clk *get_clock(struct device *dev, const char *name)
+{
+	struct clk *clk = devm_clk_get(dev, name);
+
+	DBG("clks[%s]: %p", name, clk);
+
+	return IS_ERR(clk) ? NULL : clk;
+}
+
+static int get_clocks(struct platform_device *pdev, struct msm_gpu *gpu)
+{
+	struct device *dev = &pdev->dev;
+	struct property *prop;
+	const char *name;
+	int i = 0;
+
+	gpu->nr_clocks = of_property_count_strings(dev->of_node, "clock-names");
+	if (gpu->nr_clocks < 1) {
+		gpu->nr_clocks = 0;
+		return 0;
+	}
+
+	gpu->grp_clks = devm_kcalloc(dev, gpu->nr_clocks, sizeof(struct clk *),
+		GFP_KERNEL);
+	if (!gpu->grp_clks)
+		return -ENOMEM;
+
+	of_property_for_each_string(dev->of_node, "clock-names", prop, name) {
+		gpu->grp_clks[i] = get_clock(dev, name);
+
+		/* Remember the key clocks that we need to control later */
+		if (!strcmp(name, "core_clk"))
+			gpu->core_clk = gpu->grp_clks[i];
+		else if (!strcmp(name, "rbbmtimer_clk"))
+			gpu->rbbmtimer_clk = gpu->grp_clks[i];
+
+		++i;
+	}
+
+	return 0;
+}
 
 int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
 		struct msm_gpu *gpu, const struct msm_gpu_funcs *funcs,
-		const char *name, const char *ioname, const char *irqname, int ringsz)
+		const char *name, struct msm_gpu_config *config)
 {
 	struct iommu_domain *iommu;
-	int i, ret;
+	int i, ret, nr_rings;
 
 	if (WARN_ON(gpu->num_perfcntrs > ARRAY_SIZE(gpu->last_cntrs)))
 		gpu->num_perfcntrs = ARRAY_SIZE(gpu->last_cntrs);
@@ -591,17 +662,16 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
 
 	spin_lock_init(&gpu->perf_lock);
 
-	BUG_ON(ARRAY_SIZE(clk_names) != ARRAY_SIZE(gpu->grp_clks));
 
 	/* Map registers: */
-	gpu->mmio = msm_ioremap(pdev, ioname, name);
+	gpu->mmio = msm_ioremap(pdev, config->ioname, name);
 	if (IS_ERR(gpu->mmio)) {
 		ret = PTR_ERR(gpu->mmio);
 		goto fail;
 	}
 
 	/* Get Interrupt: */
-	gpu->irq = platform_get_irq_byname(pdev, irqname);
+	gpu->irq = platform_get_irq_byname(pdev, config->irqname);
 	if (gpu->irq < 0) {
 		ret = gpu->irq;
 		dev_err(drm->dev, "failed to get irq: %d\n", ret);
 		goto fail;
 	}
@@ -615,15 +685,9 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
 		goto fail;
 	}
 
-	/* Acquire clocks: */
-	for (i = 0; i < ARRAY_SIZE(clk_names); i++) {
-		gpu->grp_clks[i] = devm_clk_get(&pdev->dev, clk_names[i]);
-		DBG("grp_clks[%s]: %p", clk_names[i], gpu->grp_clks[i]);
DBG("grp_clks[%s]: %p", clk_names[i], gpu->grp_clks[i]); - if (IS_ERR(gpu->grp_clks[i])) - gpu->grp_clks[i] = NULL; - } - - gpu->grp_clks[0] = gpu->grp_clks[1]; + ret = get_clocks(pdev, gpu); + if (ret) + goto fail; gpu->ebi1_clk = devm_clk_get(&pdev->dev, "bus_clk"); DBG("ebi1_clk: %p", gpu->ebi1_clk); @@ -648,8 +712,8 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev, iommu = iommu_domain_alloc(&platform_bus_type); if (iommu) { /* TODO 32b vs 64b address space.. */ - iommu->geometry.aperture_start = 0x1000; - iommu->geometry.aperture_end = 0xffffffff; + iommu->geometry.aperture_start = config->va_start; + iommu->geometry.aperture_end = config->va_end; dev_info(drm->dev, "%s: using IOMMU\n", name); gpu->aspace = msm_gem_address_space_create(&pdev->dev, @@ -666,17 +730,30 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev, dev_info(drm->dev, "%s: no IOMMU, fallback to VRAM carveout!\n", name); } - /* Create ringbuffer: */ - mutex_lock(&drm->struct_mutex); - gpu->rb = msm_ringbuffer_new(gpu, ringsz); - mutex_unlock(&drm->struct_mutex); - if (IS_ERR(gpu->rb)) { - ret = PTR_ERR(gpu->rb); - gpu->rb = NULL; - dev_err(drm->dev, "could not create ringbuffer: %d\n", ret); - goto fail; + nr_rings = config->nr_rings; + + if (nr_rings > ARRAY_SIZE(gpu->rb)) { + WARN(1, "Only creating %lu ringbuffers\n", ARRAY_SIZE(gpu->rb)); + nr_rings = ARRAY_SIZE(gpu->rb); } + /* Create ringbuffer(s): */ + for (i = 0; i < nr_rings; i++) { + mutex_lock(&drm->struct_mutex); + gpu->rb[i] = msm_ringbuffer_new(gpu, i); + mutex_unlock(&drm->struct_mutex); + + if (IS_ERR(gpu->rb[i])) { + ret = PTR_ERR(gpu->rb[i]); + gpu->rb[i] = NULL; + dev_err(drm->dev, + "could not create ringbuffer %d: %d\n", i, ret); + goto fail; + } + } + + gpu->nr_rings = nr_rings; + #ifdef CONFIG_SMP gpu->pm_qos_req_dma.type = PM_QOS_REQ_AFFINE_IRQ; gpu->pm_qos_req_dma.irq = gpu->irq; @@ -687,23 +764,40 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev, bs_init(gpu); + gpu->snapshot = msm_snapshot_new(gpu); + if (IS_ERR(gpu->snapshot)) + gpu->snapshot = NULL; + return 0; fail: + for (i = 0; i < ARRAY_SIZE(gpu->rb); i++) { + if (gpu->rb[i]) + msm_ringbuffer_destroy(gpu->rb[i]); + } + return ret; } void msm_gpu_cleanup(struct msm_gpu *gpu) { + int i; + DBG("%s", gpu->name); WARN_ON(!list_empty(&gpu->active_list)); bs_fini(gpu); - if (gpu->rb) { - if (gpu->rb_iova) - msm_gem_put_iova(gpu->rb->bo, gpu->aspace); - msm_ringbuffer_destroy(gpu->rb); + for (i = 0; i < ARRAY_SIZE(gpu->rb); i++) { + if (!gpu->rb[i]) + continue; + + if (gpu->rb[i]->iova) + msm_gem_put_iova(gpu->rb[i]->bo, gpu->aspace); + + msm_ringbuffer_destroy(gpu->rb[i]); } + + msm_snapshot_destroy(gpu, gpu->snapshot); } diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h index c6f1d3bd36e9..06dfaabbfcfe 100644 --- a/drivers/gpu/drm/msm/msm_gpu.h +++ b/drivers/gpu/drm/msm/msm_gpu.h @@ -24,10 +24,19 @@ #include "msm_drv.h" #include "msm_ringbuffer.h" +#include "msm_snapshot.h" struct msm_gem_submit; struct msm_gpu_perfcntr; +struct msm_gpu_config { + const char *ioname; + const char *irqname; + int nr_rings; + uint64_t va_start; + uint64_t va_end; +}; + /* So far, with hardware that I've seen to date, we can have: * + zero, one, or two z180 2d cores * + a3xx or a2xx 3d core, which share a common CP (the firmware @@ -47,18 +56,21 @@ struct msm_gpu_funcs { int (*hw_init)(struct msm_gpu *gpu); int (*pm_suspend)(struct msm_gpu *gpu); int (*pm_resume)(struct msm_gpu *gpu); - int (*submit)(struct msm_gpu 
-			struct msm_file_private *ctx);
-	void (*flush)(struct msm_gpu *gpu);
-	bool (*idle)(struct msm_gpu *gpu);
+	int (*submit)(struct msm_gpu *gpu, struct msm_gem_submit *submit);
+	void (*flush)(struct msm_gpu *gpu, struct msm_ringbuffer *ring);
 	irqreturn_t (*irq)(struct msm_gpu *gpu);
-	uint32_t (*last_fence)(struct msm_gpu *gpu);
+	uint32_t (*last_fence)(struct msm_gpu *gpu,
+			struct msm_ringbuffer *ring);
+	uint32_t (*submitted_fence)(struct msm_gpu *gpu,
+			struct msm_ringbuffer *ring);
+	struct msm_ringbuffer *(*active_ring)(struct msm_gpu *gpu);
 	void (*recover)(struct msm_gpu *gpu);
 	void (*destroy)(struct msm_gpu *gpu);
 #ifdef CONFIG_DEBUG_FS
 	/* show GPU status in debugfs: */
 	void (*show)(struct msm_gpu *gpu, struct seq_file *m);
 #endif
+	int (*snapshot)(struct msm_gpu *gpu, struct msm_snapshot *snapshot);
 };
 
 struct msm_gpu {
@@ -78,14 +90,12 @@ struct msm_gpu {
 	const struct msm_gpu_perfcntr *perfcntrs;
 	uint32_t num_perfcntrs;
 
-	struct msm_ringbuffer *rb;
-	uint64_t rb_iova;
+	struct msm_ringbuffer *rb[MSM_GPU_MAX_RINGS];
+	int nr_rings;
 
 	/* list of GEM active objects: */
 	struct list_head active_list;
 
-	uint32_t submitted_fence;
-
 	/* is gpu powered/active? */
 	int active_cnt;
 	bool inactive;
@@ -100,7 +110,9 @@ struct msm_gpu {
 
 	/* Power Control: */
 	struct regulator *gpu_reg, *gpu_cx;
-	struct clk *ebi1_clk, *grp_clks[7];
+	struct clk **grp_clks;
+	struct clk *ebi1_clk, *core_clk, *rbbmtimer_clk;
+	int nr_clocks;
 	uint32_t gpufreq[10];
 	uint32_t busfreq[10];
@@ -123,15 +135,44 @@ struct msm_gpu {
 #define DRM_MSM_HANGCHECK_PERIOD 500 /* in ms */
 #define DRM_MSM_HANGCHECK_JIFFIES msecs_to_jiffies(DRM_MSM_HANGCHECK_PERIOD)
 	struct timer_list hangcheck_timer;
-	uint32_t hangcheck_fence;
+	uint32_t hangcheck_fence[MSM_GPU_MAX_RINGS];
 	struct work_struct recover_work;
 
 	struct list_head submit_list;
+
+	struct msm_snapshot *snapshot;
 };
 
+/* It turns out that all targets use the same ringbuffer size. */
+#define MSM_GPU_RINGBUFFER_SZ SZ_32K
+#define MSM_GPU_RINGBUFFER_BLKSIZE 32
+
+#define MSM_GPU_RB_CNTL_DEFAULT \
+		(AXXX_CP_RB_CNTL_BUFSZ(ilog2(MSM_GPU_RINGBUFFER_SZ / 8)) | \
+		AXXX_CP_RB_CNTL_BLKSZ(ilog2(MSM_GPU_RINGBUFFER_BLKSIZE / 8)))
+
+static inline struct msm_ringbuffer *__get_ring(struct msm_gpu *gpu, int index)
+{
+	return (index < ARRAY_SIZE(gpu->rb) ? gpu->rb[index] : NULL);
+}
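The per-ring fence plumbing above leans on FENCE() and COMPARE_FENCE_LTE() helpers that never appear in this diff. Judging by the 0x3FFFFFFF mask in _fence_signaled() earlier, the seqno occupies the low 30 bits with the ring id above it; a rough sketch of one plausible encoding, plus the ring selection from the submit flags defined in the uapi header at the end of this diff (all names here are hypothetical, not the driver's actual macros):

/* Sketch only - the real FENCE()/COMPARE_FENCE_LTE() live elsewhere */
#define SKETCH_FENCE_RING_SHIFT	30
#define SKETCH_FENCE_SEQ_MASK	((1U << SKETCH_FENCE_RING_SHIFT) - 1) /* 0x3FFFFFFF */

#define SKETCH_FENCE(ring, seq) \
	(((uint32_t)(ring) << SKETCH_FENCE_RING_SHIFT) | \
	 ((seq) & SKETCH_FENCE_SEQ_MASK))

/* Only fences from the same ring compare; a fence from another ring is
 * never considered "older", which is why retire_submits() can walk the
 * whole submit_list and still only retire entries for the right ring.
 */
#define SKETCH_FENCE_LTE(a, b) \
	(((a) & ~SKETCH_FENCE_SEQ_MASK) == ((b) & ~SKETCH_FENCE_SEQ_MASK) && \
	 ((a) & SKETCH_FENCE_SEQ_MASK) <= ((b) & SKETCH_FENCE_SEQ_MASK))

static inline unsigned int sketch_submit_ring(uint32_t flags)
{
	/* MSM_SUBMIT_RING_MASK/SHIFT come from the uapi header below */
	return (flags & 0x000F0000) >> 16;
}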
+
+#define FOR_EACH_RING(gpu, ring, index) \
+	for (index = 0, ring = (gpu)->rb[0]; \
+		index < (gpu)->nr_rings && index < ARRAY_SIZE((gpu)->rb); \
+		index++, ring = __get_ring(gpu, index))
+
 static inline bool msm_gpu_active(struct msm_gpu *gpu)
 {
-	return gpu->submitted_fence > gpu->funcs->last_fence(gpu);
+	struct msm_ringbuffer *ring;
+	int i;
+
+	FOR_EACH_RING(gpu, ring, i) {
+		if (gpu->funcs->submitted_fence(gpu, ring) >
+			gpu->funcs->last_fence(gpu, ring))
+			return true;
+	}
+
+	return false;
 }
 
 /* Perf-Counters:
@@ -205,12 +246,12 @@ int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime,
 		uint32_t *totaltime, uint32_t ncntrs, uint32_t *cntrs);
 
 void msm_gpu_retire(struct msm_gpu *gpu);
-int msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,
-		struct msm_file_private *ctx);
+int msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit);
 
 int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
 		struct msm_gpu *gpu, const struct msm_gpu_funcs *funcs,
-		const char *name, const char *ioname, const char *irqname, int ringsz);
+		const char *name, struct msm_gpu_config *config);
+
 void msm_gpu_cleanup(struct msm_gpu *gpu);
 
 struct msm_gpu *adreno_load_gpu(struct drm_device *dev);
diff --git a/drivers/gpu/drm/msm/msm_iommu.c b/drivers/gpu/drm/msm/msm_iommu.c
index b4a5015ff922..3c16222b8890 100644
--- a/drivers/gpu/drm/msm/msm_iommu.c
+++ b/drivers/gpu/drm/msm/msm_iommu.c
@@ -17,13 +17,7 @@
 
 #include <linux/of_platform.h>
 #include "msm_drv.h"
-#include "msm_mmu.h"
-
-struct msm_iommu {
-	struct msm_mmu base;
-	struct iommu_domain *domain;
-};
-#define to_msm_iommu(x) container_of(x, struct msm_iommu, base)
+#include "msm_iommu.h"
 
 static int msm_fault_handler(struct iommu_domain *iommu, struct device *dev,
 		unsigned long iova, int flags, void *arg)
@@ -32,9 +26,50 @@ static int msm_fault_handler(struct iommu_domain *iommu, struct device *dev,
 	return 0;
 }
 
-static int msm_iommu_attach(struct msm_mmu *mmu, const char **names, int cnt)
+/*
+ * Get and enable the IOMMU clocks so that we can be sure that they stay
+ * on for as long as we need them; changing the pagetable from the GPU is
+ * only safe while the IOMMU is clocked.
+ */
+static void _get_iommu_clocks(struct msm_mmu *mmu, struct platform_device *pdev)
 {
 	struct msm_iommu *iommu = to_msm_iommu(mmu);
+	struct device *dev;
+	struct property *prop;
+	const char *name;
+	int i = 0;
+
+	if (WARN_ON(!pdev))
+		return;
+
+	dev = &pdev->dev;
+
+	iommu->nr_clocks =
+		of_property_count_strings(dev->of_node, "clock-names");
+
+	if (iommu->nr_clocks < 0) {
+		iommu->nr_clocks = 0;
+		return;
+	}
+
+	if (WARN_ON(iommu->nr_clocks > ARRAY_SIZE(iommu->clocks)))
+		iommu->nr_clocks = ARRAY_SIZE(iommu->clocks);
+
+	of_property_for_each_string(dev->of_node, "clock-names", prop, name) {
+		if (i == iommu->nr_clocks)
+			break;
+
+		/* clk_get() returns an ERR_PTR, not NULL, on failure */
+		iommu->clocks[i] = clk_get(dev, name);
+		if (IS_ERR(iommu->clocks[i]))
+			iommu->clocks[i] = NULL;
+		else
+			clk_prepare_enable(iommu->clocks[i]);
+
+		i++;
+	}
+}
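The clock walk above assumes nothing beyond a "clock-names" property on the IOMMU node; a minimal sketch of the kind of devicetree fragment it would consume (node, phandle, and clock names here are all hypothetical):

/*
 *	gpu_iommu: iommu@b40000 {
 *		...
 *		clocks = <&gcc GCC_SMMU_CFG_CLK>,
 *			 <&gcc GCC_GPU_SMMU_AXI_CLK>;
 *		clock-names = "iface_clk", "bus_clk";
 *	};
 */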
+
+static int _attach_iommu_device(struct msm_mmu *mmu,
+		struct iommu_domain *domain, const char **names, int cnt)
+{
 	int i;
 
 	/* See if there is an iommus member in the current device.  If not, look
@@ -42,7 +77,7 @@ static int msm_iommu_attach(struct msm_mmu *mmu, const char **names, int cnt)
 	 */
 	if (of_find_property(mmu->dev->of_node, "iommus", NULL))
-		return iommu_attach_device(iommu->domain, mmu->dev);
+		return iommu_attach_device(domain, mmu->dev);
 
 	/* Look through the list of names for a target */
 	for (i = 0; i < cnt; i++) {
@@ -64,8 +99,12 @@ static int msm_iommu_attach(struct msm_mmu *mmu, const char **names, int cnt)
 			if (!pdev)
 				continue;
 
+			_get_iommu_clocks(mmu,
+				of_find_device_by_node(node->parent));
+
 			mmu->dev = &pdev->dev;
-			return iommu_attach_device(iommu->domain, mmu->dev);
+
+			return iommu_attach_device(domain, mmu->dev);
 		}
 	}
 
@@ -73,7 +112,80 @@ static int msm_iommu_attach(struct msm_mmu *mmu, const char **names, int cnt)
 	return -ENODEV;
 }
 
-static void msm_iommu_detach(struct msm_mmu *mmu, const char **names, int cnt)
+static int msm_iommu_attach(struct msm_mmu *mmu, const char **names, int cnt)
+{
+	struct msm_iommu *iommu = to_msm_iommu(mmu);
+	int val = 1, ret;
+
+	/* Hope springs eternal */
+	iommu->allow_dynamic = true;
+
+	/* per-instance pagetables need TTBR1 support in the IOMMU driver */
+	ret = iommu_domain_set_attr(iommu->domain,
+		DOMAIN_ATTR_ENABLE_TTBR1, &val);
+	if (ret)
+		iommu->allow_dynamic = false;
+
+	/* Attach the device to the domain */
+	ret = _attach_iommu_device(mmu, iommu->domain, names, cnt);
+	if (ret)
+		return ret;
+
+	/*
+	 * Get the context bank for the base domain; this will be shared with
+	 * the children.
+	 */
+	iommu->cb = -1;
+	if (iommu_domain_get_attr(iommu->domain, DOMAIN_ATTR_CONTEXT_BANK,
+		&iommu->cb))
+		iommu->allow_dynamic = false;
+
+	return 0;
+}
+
+static int msm_iommu_attach_dynamic(struct msm_mmu *mmu, const char **names,
+		int cnt)
+{
+	static unsigned int procid;
+	struct msm_iommu *iommu = to_msm_iommu(mmu);
+	int ret;
+	unsigned int id;
+
+	/* Assign a unique procid for the domain to cut down on TLB churn */
+	id = ++procid;
+
+	iommu_domain_set_attr(iommu->domain, DOMAIN_ATTR_PROCID, &id);
+
+	ret = iommu_attach_device(iommu->domain, mmu->dev);
+	if (ret)
+		return ret;
+
+	/*
+	 * Get the TTBR0 and the CONTEXTIDR - these will be used by the GPU to
+	 * switch the pagetable on its own.
+	 */
+	iommu_domain_get_attr(iommu->domain, DOMAIN_ATTR_TTBR0,
+		&iommu->ttbr0);
+	iommu_domain_get_attr(iommu->domain, DOMAIN_ATTR_CONTEXTIDR,
+		&iommu->contextidr);
+
+	return 0;
+}
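A dynamic domain thus hands the driver everything it needs (TTBR0 value and CONTEXTIDR) to switch pagetables without further help from the IOMMU driver. A rough sketch of how a per-process address space might be built on top of this, using msm_iommu_new_dynamic() and msm_iommu_allow_dynamic() introduced elsewhere in this patch (the wrapper function itself is illustrative only):

/* Sketch only: pair a fresh dynamic domain with the base GPU domain */
static struct msm_mmu *sketch_per_process_mmu(struct msm_mmu *base,
		const char **names, int cnt)
{
	struct msm_mmu *mmu;
	int ret;

	if (!msm_iommu_allow_dynamic(base))
		return ERR_PTR(-EOPNOTSUPP);

	/* Shares the context bank of 'base', but owns a new pagetable */
	mmu = msm_iommu_new_dynamic(base);
	if (IS_ERR(mmu))
		return mmu;

	/* Lands in msm_iommu_attach_dynamic(), filling ttbr0/contextidr */
	ret = mmu->funcs->attach(mmu, names, cnt);
	if (ret) {
		mmu->funcs->destroy(mmu);
		return ERR_PTR(ret);
	}

	return mmu;
}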
+
+static void msm_iommu_detach(struct msm_mmu *mmu)
+{
+	struct msm_iommu *iommu = to_msm_iommu(mmu);
+	int i;
+
+	iommu_detach_device(iommu->domain, mmu->dev);
+
+	for (i = 0; i < iommu->nr_clocks; i++) {
+		if (iommu->clocks[i])
+			clk_disable_unprepare(iommu->clocks[i]);
+	}
+}
+
+static void msm_iommu_detach_dynamic(struct msm_mmu *mmu)
 {
 	struct msm_iommu *iommu = to_msm_iommu(mmu);
 
 	iommu_detach_device(iommu->domain, mmu->dev);
@@ -160,7 +272,16 @@ static const struct msm_mmu_funcs funcs = {
 		.destroy = msm_iommu_destroy,
 };
 
-struct msm_mmu *msm_iommu_new(struct device *dev, struct iommu_domain *domain)
+static const struct msm_mmu_funcs dynamic_funcs = {
+		.attach = msm_iommu_attach_dynamic,
+		.detach = msm_iommu_detach_dynamic,
+		.map = msm_iommu_map,
+		.unmap = msm_iommu_unmap,
+		.destroy = msm_iommu_destroy,
+};
+
+struct msm_mmu *_msm_iommu_new(struct device *dev, struct iommu_domain *domain,
+		const struct msm_mmu_funcs *funcs)
 {
 	struct msm_iommu *iommu;
 
@@ -169,8 +290,54 @@ struct msm_mmu *msm_iommu_new(struct device *dev, struct iommu_domain *domain)
 		return ERR_PTR(-ENOMEM);
 
 	iommu->domain = domain;
-	msm_mmu_init(&iommu->base, dev, &funcs);
+	msm_mmu_init(&iommu->base, dev, funcs);
 	iommu_set_fault_handler(domain, msm_fault_handler, dev);
 
 	return &iommu->base;
 }
 
+struct msm_mmu *msm_iommu_new(struct device *dev, struct iommu_domain *domain)
+{
+	return _msm_iommu_new(dev, domain, &funcs);
+}
+
+/*
+ * Given a base domain that is attached to an IOMMU device, try to create a
+ * dynamic domain that is also attached to the same device but allocates a new
+ * pagetable. This is used to allow multiple pagetables to be attached to the
+ * same device.
+ */
+struct msm_mmu *msm_iommu_new_dynamic(struct msm_mmu *base)
+{
+	struct msm_iommu *base_iommu = to_msm_iommu(base);
+	struct iommu_domain *domain;
+	struct msm_mmu *mmu;
+	int ret, val = 1;
+
+	/* Don't continue if the base domain didn't have the support we need */
+	if (!base || !base_iommu->allow_dynamic)
+		return ERR_PTR(-EOPNOTSUPP);
+
+	domain = iommu_domain_alloc(&platform_bus_type);
+	if (!domain)
+		return ERR_PTR(-ENODEV);
+
+	mmu = _msm_iommu_new(base->dev, domain, &dynamic_funcs);
+
+	if (IS_ERR(mmu)) {
+		if (domain)
+			iommu_domain_free(domain);
+		return mmu;
+	}
+
+	ret = iommu_domain_set_attr(domain, DOMAIN_ATTR_DYNAMIC, &val);
+	if (ret) {
+		msm_iommu_destroy(mmu);
+		return ERR_PTR(ret);
+	}
+
+	/* Set the context bank to match the base domain */
+	iommu_domain_set_attr(domain, DOMAIN_ATTR_CONTEXT_BANK,
+		&base_iommu->cb);
+
+	return mmu;
+}
diff --git a/drivers/gpu/drm/msm/msm_iommu.h b/drivers/gpu/drm/msm/msm_iommu.h
new file mode 100644
index 000000000000..d005cfb9758f
--- /dev/null
+++ b/drivers/gpu/drm/msm/msm_iommu.h
@@ -0,0 +1,37 @@
+/* Copyright (c) 2016-2017 The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _MSM_IOMMU_H_
+#define _MSM_IOMMU_H_
+
+#include "msm_mmu.h"
+
+struct msm_iommu {
+	struct msm_mmu base;
+	struct iommu_domain *domain;
+	int cb;
+	phys_addr_t ttbr0;
+	uint32_t contextidr;
+	bool allow_dynamic;
+
+	struct clk *clocks[5];
+	int nr_clocks;
+};
+#define to_msm_iommu(x) container_of(x, struct msm_iommu, base)
+
+static inline bool msm_iommu_allow_dynamic(struct msm_mmu *mmu)
+{
+	struct msm_iommu *iommu = to_msm_iommu(mmu);
+
+	return iommu->allow_dynamic;
+}
+#endif
diff --git a/drivers/gpu/drm/msm/msm_mmu.h b/drivers/gpu/drm/msm/msm_mmu.h
index 8464b9e04b63..501f12bef00d 100644
--- a/drivers/gpu/drm/msm/msm_mmu.h
+++ b/drivers/gpu/drm/msm/msm_mmu.h
@@ -32,7 +32,7 @@ enum msm_mmu_domain_type {
 
 struct msm_mmu_funcs {
 	int (*attach)(struct msm_mmu *mmu, const char **names, int cnt);
-	void (*detach)(struct msm_mmu *mmu, const char **names, int cnt);
+	void (*detach)(struct msm_mmu *mmu);
 	int (*map)(struct msm_mmu *mmu, uint64_t iova, struct sg_table *sgt,
 			int prot);
 	int (*unmap)(struct msm_mmu *mmu, uint64_t iova, struct sg_table *sgt);
@@ -62,5 +62,6 @@ static inline void msm_mmu_init(struct msm_mmu *mmu, struct device *dev,
 struct msm_mmu *msm_iommu_new(struct device *dev, struct iommu_domain *domain);
 struct msm_mmu *msm_smmu_new(struct device *dev,
 	enum msm_mmu_domain_type domain);
+struct msm_mmu *msm_iommu_new_dynamic(struct msm_mmu *orig);
 
 #endif /* __MSM_MMU_H__ */
diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.c b/drivers/gpu/drm/msm/msm_ringbuffer.c
index 1f14b908b221..14a16c4578d9 100644
--- a/drivers/gpu/drm/msm/msm_ringbuffer.c
+++ b/drivers/gpu/drm/msm/msm_ringbuffer.c
@@ -18,12 +18,13 @@
 #include "msm_ringbuffer.h"
 #include "msm_gpu.h"
 
-struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int size)
+struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int id)
 {
 	struct msm_ringbuffer *ring;
 	int ret;
 
-	size = ALIGN(size, 4);   /* size should be dword aligned */
+	/* We assume everywhere that MSM_GPU_RINGBUFFER_SZ is a power of 2 */
+	BUILD_BUG_ON(!is_power_of_2(MSM_GPU_RINGBUFFER_SZ));
 
 	ring = kzalloc(sizeof(*ring), GFP_KERNEL);
 	if (!ring) {
@@ -32,7 +33,8 @@ struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int size)
 	}
 
 	ring->gpu = gpu;
-	ring->bo = msm_gem_new(gpu->dev, size, MSM_BO_WC);
+	ring->id = id;
+	ring->bo = msm_gem_new(gpu->dev, MSM_GPU_RINGBUFFER_SZ, MSM_BO_WC);
 	if (IS_ERR(ring->bo)) {
 		ret = PTR_ERR(ring->bo);
 		ring->bo = NULL;
@@ -40,10 +42,11 @@ struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int size)
 	}
 
 	ring->start = msm_gem_vaddr_locked(ring->bo);
-	ring->end   = ring->start + (size / 4);
+	ring->end   = ring->start + (MSM_GPU_RINGBUFFER_SZ >> 2);
+	ring->next  = ring->start;
 	ring->cur   = ring->start;
 
-	ring->size = size;
+	spin_lock_init(&ring->lock);
 
 	return ring;
diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.h b/drivers/gpu/drm/msm/msm_ringbuffer.h
index 6e0e1049fa4f..1e84905073bf 100644
--- a/drivers/gpu/drm/msm/msm_ringbuffer.h
+++ b/drivers/gpu/drm/msm/msm_ringbuffer.h
@@ -22,12 +22,15 @@
 
 struct msm_ringbuffer {
 	struct msm_gpu *gpu;
-	int size;
+	int id;
 	struct drm_gem_object *bo;
-	uint32_t *start, *end, *cur;
+	uint32_t *start, *end, *cur, *next;
+	uint64_t iova;
+	uint32_t submitted_fence;
+	spinlock_t lock;
 };
 
-struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int size);
+struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int id);
 void msm_ringbuffer_destroy(struct msm_ringbuffer *ring);
 
 /* ringbuffer helpers (the parts that are the same for a3xx/a2xx/z180..) */
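With the new next/cur split (see OUT_RING() in the hunk just below), writes are staged through ring->next and only become visible once a flush commits them. A minimal sketch of what that commit step could look like, assuming the hardware WPTR takes a dword offset (this is an illustration, not the driver's actual flush):

/* Sketch only: commit staged ring writes under ring->lock */
static inline uint32_t sketch_ring_commit(struct msm_ringbuffer *ring)
{
	unsigned long flags;
	uint32_t wptr;

	spin_lock_irqsave(&ring->lock, flags);
	ring->cur = ring->next;           /* publish everything OUT_RING'd */
	wptr = ring->cur - ring->start;   /* dword offset for the WPTR reg */
	spin_unlock_irqrestore(&ring->lock, flags);

	return wptr;
}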
@@ -35,9 +38,13 @@ void msm_ringbuffer_destroy(struct msm_ringbuffer *ring);
 
 static inline void
 OUT_RING(struct msm_ringbuffer *ring, uint32_t data)
 {
-	if (ring->cur == ring->end)
-		ring->cur = ring->start;
-	*(ring->cur++) = data;
+	/*
+	 * ring->next points to the current command being written - it won't be
+	 * committed as ring->cur until the flush
+	 */
+	if (ring->next == ring->end)
+		ring->next = ring->start;
+	*(ring->next++) = data;
 }
 
 #endif /* __MSM_RINGBUFFER_H__ */
diff --git a/drivers/gpu/drm/msm/msm_smmu.c b/drivers/gpu/drm/msm/msm_smmu.c
index 500e8a5e6247..c99f51e09700 100644
--- a/drivers/gpu/drm/msm/msm_smmu.c
+++ b/drivers/gpu/drm/msm/msm_smmu.c
@@ -86,7 +86,7 @@ static int msm_smmu_attach(struct msm_mmu *mmu, const char **names, int cnt)
 	return 0;
 }
 
-static void msm_smmu_detach(struct msm_mmu *mmu, const char **names, int cnt)
+static void msm_smmu_detach(struct msm_mmu *mmu)
 {
 	struct msm_smmu *smmu = to_msm_smmu(mmu);
 	struct msm_smmu_client *client = msm_smmu_to_client(smmu);
diff --git a/drivers/gpu/drm/msm/msm_snapshot.c b/drivers/gpu/drm/msm/msm_snapshot.c
new file mode 100644
index 000000000000..30f3e5c64ebd
--- /dev/null
+++ b/drivers/gpu/drm/msm/msm_snapshot.c
@@ -0,0 +1,105 @@
+/* Copyright (c) 2016 The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include "msm_gpu.h"
+#include "msm_gem.h"
+#include "msm_snapshot_api.h"
+
+void msm_snapshot_destroy(struct msm_gpu *gpu, struct msm_snapshot *snapshot)
+{
+	struct drm_device *dev = gpu->dev;
+	struct msm_drm_private *priv = dev->dev_private;
+	struct platform_device *pdev = priv->gpu_pdev;
+
+	if (!snapshot)
+		return;
+
+	dma_free_coherent(&pdev->dev, SZ_1M, snapshot->ptr,
+		snapshot->physaddr);
+
+	kfree(snapshot);
+}
+
+struct msm_snapshot *msm_snapshot_new(struct msm_gpu *gpu)
+{
+	struct drm_device *dev = gpu->dev;
+	struct msm_drm_private *priv = dev->dev_private;
+	struct platform_device *pdev = priv->gpu_pdev;
+	struct msm_snapshot *snapshot;
+
+	snapshot = kzalloc(sizeof(*snapshot), GFP_KERNEL);
+	if (!snapshot)
+		return ERR_PTR(-ENOMEM);
+
+	snapshot->ptr = dma_alloc_coherent(&pdev->dev, SZ_1M,
+		&snapshot->physaddr, GFP_KERNEL);
+
+	if (!snapshot->ptr) {
+		kfree(snapshot);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	seq_buf_init(&snapshot->buf, snapshot->ptr, SZ_1M);
+
+	return snapshot;
+}
+
+int msm_gpu_snapshot(struct msm_gpu *gpu, struct msm_snapshot *snapshot)
+{
+	int ret;
+	struct msm_snapshot_header header;
+	uint64_t val;
+
+	if (!snapshot)
+		return -ENOMEM;
+
+	/*
+	 * For now, blow away the snapshot and take a new one  - the most
+	 * interesting hang is the last one we saw
+	 */
+	seq_buf_init(&snapshot->buf, snapshot->ptr, SZ_1M);
+
+	header.magic = SNAPSHOT_MAGIC;
+	gpu->funcs->get_param(gpu, MSM_PARAM_GPU_ID, &val);
+	header.gpuid = lower_32_bits(val);
+
+	gpu->funcs->get_param(gpu, MSM_PARAM_CHIP_ID, &val);
+	header.chipid = lower_32_bits(val);
+
+	seq_buf_putmem(&snapshot->buf, &header, sizeof(header));
+
+	ret = gpu->funcs->snapshot(gpu, snapshot);
+
+	if (!ret) {
+		struct msm_snapshot_section_header end;
+
+		end.magic = SNAPSHOT_SECTION_MAGIC;
+		end.id = SNAPSHOT_SECTION_END;
+		end.size = sizeof(end);
+
+		seq_buf_putmem(&snapshot->buf, &end, sizeof(end));
+
+		dev_info(gpu->dev->dev, "GPU snapshot created [0x%pa (%d bytes)]\n",
+			&snapshot->physaddr, seq_buf_used(&snapshot->buf));
+	}
+
+	return ret;
+}
+
+int msm_snapshot_write(struct msm_gpu *gpu, struct seq_file *m)
+{
+	if (gpu && gpu->snapshot)
+		seq_write(m, gpu->snapshot->ptr,
+			seq_buf_used(&gpu->snapshot->buf));
+
+	return 0;
+}
diff --git a/drivers/gpu/drm/msm/msm_snapshot.h b/drivers/gpu/drm/msm/msm_snapshot.h
new file mode 100644
index 000000000000..247e1358c885
--- /dev/null
+++ b/drivers/gpu/drm/msm/msm_snapshot.h
@@ -0,0 +1,85 @@
+/* Copyright (c) 2016 The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef MSM_SNAPSHOT_H_
+#define MSM_SNAPSHOT_H_
+
+#include <linux/string.h>
+#include <linux/seq_buf.h>
+#include "msm_snapshot_api.h"
+
+struct msm_snapshot {
+	void *ptr;
+	struct seq_buf buf;
+	phys_addr_t physaddr;
+	uint32_t index;
+	uint32_t remain;
+	unsigned long timestamp;
+	void *priv;
+};
+
+/* Write a uint32_t value to the next position in the snapshot buffer */
+static inline void SNAPSHOT_WRITE_U32(struct msm_snapshot *snapshot,
+		uint32_t value)
+{
+	seq_buf_putmem(&snapshot->buf, &value, sizeof(value));
+}
+
+/* Copy a block of memory to the next position in the snapshot buffer */
+static inline void SNAPSHOT_MEMCPY(struct msm_snapshot *snapshot, void *src,
+		uint32_t size)
+{
+	if (size)
+		seq_buf_putmem(&snapshot->buf, src, size);
+}
+
+static inline bool _snapshot_header(struct msm_snapshot *snapshot,
+		struct msm_snapshot_section_header *header,
+		u32 headsz, u32 datasz, u32 id)
+{
+	u32 size = headsz + datasz;
+
+	if (seq_buf_buffer_left(&snapshot->buf) <= size)
+		return false;
+
+	/* Fill in the section header */
+	header->magic = SNAPSHOT_SECTION_MAGIC;
+	header->id = id;
+	header->size = size;
+
+	/* Write the header into the snapshot buffer */
+	seq_buf_putmem(&snapshot->buf, header, headsz);
+
+	/* The caller will fill in the data from here */
+	return true;
+}
+
+/* SNAPSHOT_HEADER
+ * _snapshot: pointer to struct msm_snapshot
+ * _header: Local variable containing the sub-section header
+ * _id: Section ID to write
+ * _dwords: Size of the data section (in dwords)
+ */
+#define SNAPSHOT_HEADER(_snapshot, _header, _id, _dwords) \
+	_snapshot_header((_snapshot), \
+		(struct msm_snapshot_section_header *) &(_header), \
+		sizeof(_header), (_dwords) << 2, (_id))
+
+struct msm_gpu;
+
+struct msm_snapshot *msm_snapshot_new(struct msm_gpu *gpu);
+void msm_snapshot_destroy(struct msm_gpu *gpu, struct msm_snapshot *snapshot);
+int msm_gpu_snapshot(struct msm_gpu *gpu, struct msm_snapshot *snapshot);
+int msm_snapshot_write(struct msm_gpu *gpu, struct seq_file *m);
+
+#endif
+
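Putting these helpers together, a target's ->snapshot() callback emits one section at a time; a sketch of how a register section might be written, using the msm_snapshot_regs layout from the API header that follows (the register range is made up, and gpu_read() is assumed to be the usual mmio accessor from msm_gpu.h):

/* Sketch only: dump 16 registers as (offset, value) pairs */
static void sketch_snapshot_regs(struct msm_gpu *gpu,
		struct msm_snapshot *snapshot)
{
	struct msm_snapshot_regs header = {
		.count = 16,
	};
	int i;

	/* 16 pairs of (offset, value) -> 32 dwords of section data */
	if (!SNAPSHOT_HEADER(snapshot, header, SNAPSHOT_SECTION_REGS_V2, 32))
		return;

	for (i = 0; i < 16; i++) {
		SNAPSHOT_WRITE_U32(snapshot, i);                /* offset */
		SNAPSHOT_WRITE_U32(snapshot, gpu_read(gpu, i)); /* value */
	}
}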
diff --git a/drivers/gpu/drm/msm/msm_snapshot_api.h b/drivers/gpu/drm/msm/msm_snapshot_api.h
new file mode 100644
index 000000000000..9f0adb9ee784
--- /dev/null
+++ b/drivers/gpu/drm/msm/msm_snapshot_api.h
@@ -0,0 +1,121 @@
+/* Copyright (c) 2016 The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef MSM_SNAPSHOT_API_H_
+#define MSM_SNAPSHOT_API_H_
+
+#include <linux/types.h>
+
+/* High word is the magic, low word is the snapshot header version */
+#define SNAPSHOT_MAGIC 0x504D0002
+
+struct msm_snapshot_header {
+	__u32 magic;
+	__u32 gpuid;
+	__u32 chipid;
+} __packed;
+
+#define SNAPSHOT_SECTION_MAGIC 0xABCD
+
+struct msm_snapshot_section_header {
+	__u16 magic;
+	__u16 id;
+	__u32 size;
+} __packed;
+
+/* Section identifiers */
+#define SNAPSHOT_SECTION_OS		0x0101
+#define SNAPSHOT_SECTION_REGS_V2	0x0202
+#define SNAPSHOT_SECTION_RB_V2		0x0302
+#define SNAPSHOT_SECTION_IB_V2		0x0402
+#define SNAPSHOT_SECTION_INDEXED_REGS	0x0501
+#define SNAPSHOT_SECTION_DEBUG		0x0901
+#define SNAPSHOT_SECTION_DEBUGBUS	0x0A01
+#define SNAPSHOT_SECTION_GPU_OBJECT_V2	0x0B02
+#define SNAPSHOT_SECTION_MEMLIST_V2	0x0E02
+#define SNAPSHOT_SECTION_SHADER		0x1201
+#define SNAPSHOT_SECTION_END		0xFFFF
+
+#define SNAPSHOT_OS_LINUX_V3 0x00000202
+
+struct msm_snapshot_linux {
+	struct msm_snapshot_section_header header;
+	int osid;
+	__u32 seconds;
+	__u32 power_flags;
+	__u32 power_level;
+	__u32 power_interval_timeout;
+	__u32 grpclk;
+	__u32 busclk;
+	__u64 ptbase;
+	__u32 pid;
+	__u32 current_context;
+	__u32 ctxtcount;
+	unsigned char release[32];
+	unsigned char version[32];
+	unsigned char comm[16];
+} __packed;
+
+struct msm_snapshot_ringbuffer {
+	struct msm_snapshot_section_header header;
+	int start;
+	int end;
+	int rbsize;
+	int wptr;
+	int rptr;
+	int count;
+	__u32 timestamp_queued;
+	__u32 timestamp_retired;
+	__u64 gpuaddr;
+	__u32 id;
+} __packed;
+
+struct msm_snapshot_regs {
+	struct msm_snapshot_section_header header;
+	__u32 count;
+} __packed;
+
+struct msm_snapshot_indexed_regs {
+	struct msm_snapshot_section_header header;
+	__u32 index_reg;
+	__u32 data_reg;
+	__u32 start;
+	__u32 count;
+} __packed;
+
+#define SNAPSHOT_DEBUG_CP_MEQ		7
+#define SNAPSHOT_DEBUG_CP_PM4_RAM	8
+#define SNAPSHOT_DEBUG_CP_PFP_RAM	9
+#define SNAPSHOT_DEBUG_CP_ROQ		10
+#define SNAPSHOT_DEBUG_SHADER_MEMORY	11
+#define SNAPSHOT_DEBUG_CP_MERCIU	12
+
+struct msm_snapshot_debug {
+	struct msm_snapshot_section_header header;
+	__u32 type;
+	__u32 size;
+} __packed;
+
+struct msm_snapshot_debugbus {
+	struct msm_snapshot_section_header header;
+	__u32 id;
+	__u32 count;
+} __packed;
+
+struct msm_snapshot_shader {
+	struct msm_snapshot_section_header header;
+	__u32 type;
+	__u32 index;
+	__u32 size;
+} __packed;
+
+#endif
diff --git a/drivers/gpu/drm/msm/sde/sde_kms.c b/drivers/gpu/drm/msm/sde/sde_kms.c
index e906b66dcd95..576e7d1e7189 100644
--- a/drivers/gpu/drm/msm/sde/sde_kms.c
+++ b/drivers/gpu/drm/msm/sde/sde_kms.c
@@ -946,9 +946,8 @@ static int _sde_kms_mmu_destroy(struct sde_kms *sde_kms)
 
 		mmu = sde_kms->aspace[i]->mmu;
 
-		mmu->funcs->detach(mmu, (const char **)iommu_ports,
-				ARRAY_SIZE(iommu_ports));
-		msm_gem_address_space_destroy(sde_kms->aspace[i]);
+		mmu->funcs->detach(mmu);
+		msm_gem_address_space_put(sde_kms->aspace[i]);
 
 		sde_kms->aspace[i] = NULL;
 	}
@@ -987,7 +986,7 @@ static int _sde_kms_mmu_init(struct sde_kms *sde_kms)
 				ARRAY_SIZE(iommu_ports));
 		if (ret) {
 			SDE_ERROR("failed to attach iommu %d: %d\n", i, ret);
-			msm_gem_address_space_destroy(aspace);
+			msm_gem_address_space_put(aspace);
 			goto fail;
 		}
 
diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
index ce15e150277e..ce1eb562be36 100644
--- a/drivers/iommu/arm-smmu.c
+++ b/drivers/iommu/arm-smmu.c
@@ -249,17 +249,6 @@
 #define RESUME_RETRY			(0 << 0)
 #define RESUME_TERMINATE		(1 << 0)
 
-#define TTBCR2_SEP_SHIFT		15
-#define TTBCR2_SEP_UPSTREAM		(0x7 << TTBCR2_SEP_SHIFT)
-
-#define TTBCR2_SEP_31			0
-#define TTBCR2_SEP_35			1
-#define TTBCR2_SEP_39			2
-#define TTBCR2_SEP_41			3
-#define TTBCR2_SEP_43			4
-#define TTBCR2_SEP_47			5
-#define TTBCR2_SEP_NOSIGN		7
-
 #define TTBRn_ASID_SHIFT		48
 
 #define FSR_MULTI			(1 << 31)
@@ -1614,7 +1603,6 @@ static void arm_smmu_init_context_bank(struct arm_smmu_domain *smmu_domain,
 		writel_relaxed(reg, cb_base + ARM_SMMU_CB_TTBCR);
 		if (smmu->version > ARM_SMMU_V1) {
 			reg = pgtbl_cfg->arm_lpae_s1_cfg.tcr >> 32;
-			reg |= TTBCR2_SEP_UPSTREAM;
 			writel_relaxed(reg, cb_base + ARM_SMMU_CB_TTBCR2);
 		}
 	} else {
@@ -1745,7 +1733,9 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain,
 	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
 	struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
 	bool is_fast = smmu_domain->attributes & (1 << DOMAIN_ATTR_FAST);
-	unsigned long quirks = 0;
+	unsigned long quirks =
+		smmu_domain->attributes & (1 << DOMAIN_ATTR_ENABLE_TTBR1) ?
+			IO_PGTABLE_QUIRK_ARM_TTBR1 : 0;
 
 	if (smmu_domain->smmu)
 		goto out;
@@ -1837,6 +1827,7 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain,
 		};
 		fmt = ARM_MSM_SECURE;
 	} else {
+
 		smmu_domain->pgtbl_cfg = (struct io_pgtable_cfg) {
 			.quirks		= quirks,
 			.pgsize_bitmap	= arm_smmu_ops.pgsize_bitmap,
@@ -3140,6 +3131,12 @@ static int arm_smmu_domain_get_attr(struct iommu_domain *domain,
 			& (1 << DOMAIN_ATTR_PAGE_TABLE_FORCE_COHERENT));
 		ret = 0;
 		break;
+	case DOMAIN_ATTR_ENABLE_TTBR1:
+		*((int *)data) = !!(smmu_domain->attributes
+					& (1 << DOMAIN_ATTR_ENABLE_TTBR1));
+		ret = 0;
+		break;
+
 	default:
 		ret = -ENODEV;
 		break;
@@ -3283,6 +3280,12 @@ static int arm_smmu_domain_set_attr(struct iommu_domain *domain,
 		ret = 0;
 		break;
 	}
+	case DOMAIN_ATTR_ENABLE_TTBR1:
+		if (*((int *)data))
+			smmu_domain->attributes |=
+				1 << DOMAIN_ATTR_ENABLE_TTBR1;
+		ret = 0;
+		break;
 	default:
 		ret = -ENODEV;
 		break;
diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index 0d057ca92972..5f2b66286c0c 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -131,14 +131,21 @@
 #define ARM_LPAE_TCR_TG0_64K		(1 << 14)
 #define ARM_LPAE_TCR_TG0_16K		(2 << 14)
 
+#define ARM_LPAE_TCR_TG1_16K		1ULL
+#define ARM_LPAE_TCR_TG1_4K		2ULL
+#define ARM_LPAE_TCR_TG1_64K		3ULL
+
 #define ARM_LPAE_TCR_SH0_SHIFT		12
 #define ARM_LPAE_TCR_SH0_MASK		0x3
+#define ARM_LPAE_TCR_SH1_SHIFT		28
 #define ARM_LPAE_TCR_SH_NS		0
 #define ARM_LPAE_TCR_SH_OS		2
 #define ARM_LPAE_TCR_SH_IS		3
 
 #define ARM_LPAE_TCR_ORGN0_SHIFT	10
+#define ARM_LPAE_TCR_ORGN1_SHIFT	26
 #define ARM_LPAE_TCR_IRGN0_SHIFT	8
+#define ARM_LPAE_TCR_IRGN1_SHIFT	24
 #define ARM_LPAE_TCR_RGN_MASK		0x3
 #define ARM_LPAE_TCR_RGN_NC		0
 #define ARM_LPAE_TCR_RGN_WBWA		1
@@ -151,6 +158,9 @@
 #define ARM_LPAE_TCR_T0SZ_SHIFT		0
 #define ARM_LPAE_TCR_SZ_MASK		0xf
 
+#define ARM_LPAE_TCR_T1SZ_SHIFT		16
+#define ARM_LPAE_TCR_T1SZ_MASK		0x3f
+
 #define ARM_LPAE_TCR_PS_SHIFT		16
 #define ARM_LPAE_TCR_PS_MASK		0x7
 
@@ -167,6 +177,16 @@
 #define ARM_LPAE_TCR_EPD1_SHIFT		23
 #define ARM_LPAE_TCR_EPD1_FAULT		1
 
+#define ARM_LPAE_TCR_SEP_SHIFT		(15 + 32)
+
+#define ARM_LPAE_TCR_SEP_31		0ULL
+#define ARM_LPAE_TCR_SEP_35		1ULL
+#define ARM_LPAE_TCR_SEP_39		2ULL
+#define ARM_LPAE_TCR_SEP_41		3ULL
+#define ARM_LPAE_TCR_SEP_43		4ULL
+#define ARM_LPAE_TCR_SEP_47		5ULL
+#define ARM_LPAE_TCR_SEP_UPSTREAM	7ULL
+
 #define ARM_LPAE_MAIR_ATTR_SHIFT(n)	((n) << 3)
 #define ARM_LPAE_MAIR_ATTR_MASK		0xff
 #define ARM_LPAE_MAIR_ATTR_DEVICE	0x04
@@ -206,7 +226,7 @@ struct arm_lpae_io_pgtable {
 	unsigned long		pg_shift;
 	unsigned long		bits_per_level;
 
-	void			*pgd;
+	void			*pgd[2];
 };
 
 typedef u64 arm_lpae_iopte;
@@ -524,14 +544,26 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data,
 	return pte;
 }
 
+static inline arm_lpae_iopte *arm_lpae_get_table(
+		struct arm_lpae_io_pgtable *data, unsigned long iova)
+{
+	struct io_pgtable_cfg *cfg = &data->iop.cfg;
+
+	return ((cfg->quirks & IO_PGTABLE_QUIRK_ARM_TTBR1) &&
+			(iova & (1UL << (cfg->ias - 1)))) ?
+		data->pgd[1] : data->pgd[0];
+}
+
 static int arm_lpae_map(struct io_pgtable_ops *ops, unsigned long iova,
 			phys_addr_t paddr, size_t size, int iommu_prot)
 {
 	struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
-	arm_lpae_iopte *ptep = data->pgd;
+	arm_lpae_iopte *ptep;
 	int ret, lvl = ARM_LPAE_START_LVL(data);
 	arm_lpae_iopte prot;
 
+	ptep = arm_lpae_get_table(data, iova);
+
 	/* If no access, then nothing to do */
 	if (!(iommu_prot & (IOMMU_READ | IOMMU_WRITE)))
 		return 0;
@@ -554,7 +586,7 @@ static int arm_lpae_map_sg(struct io_pgtable_ops *ops, unsigned long iova,
 {
 	struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
 	struct io_pgtable_cfg *cfg = &data->iop.cfg;
-	arm_lpae_iopte *ptep = data->pgd;
+	arm_lpae_iopte *ptep;
 	int lvl = ARM_LPAE_START_LVL(data);
 	arm_lpae_iopte prot;
 	struct scatterlist *s;
@@ -563,6 +595,8 @@ static int arm_lpae_map_sg(struct io_pgtable_ops *ops, unsigned long iova,
 	unsigned int min_pagesz;
 	struct map_state ms;
 
+	ptep = arm_lpae_get_table(data, iova);
+
 	/* If no access, then nothing to do */
 	if (!(iommu_prot & (IOMMU_READ | IOMMU_WRITE)))
 		goto out_err;
@@ -672,7 +706,10 @@ static void arm_lpae_free_pgtable(struct io_pgtable *iop)
 {
 	struct arm_lpae_io_pgtable *data = io_pgtable_to_data(iop);
 
-	__arm_lpae_free_pgtable(data, ARM_LPAE_START_LVL(data), data->pgd);
+	__arm_lpae_free_pgtable(data, ARM_LPAE_START_LVL(data), data->pgd[0]);
+	if (data->pgd[1])
+		__arm_lpae_free_pgtable(data, ARM_LPAE_START_LVL(data),
+			data->pgd[1]);
 	kfree(data);
 }
 
@@ -800,9 +837,11 @@ static size_t arm_lpae_unmap(struct io_pgtable_ops *ops, unsigned long iova,
 	size_t unmapped = 0;
 	struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
 	struct io_pgtable *iop = &data->iop;
-	arm_lpae_iopte *ptep = data->pgd;
+	arm_lpae_iopte *ptep;
 	int lvl = ARM_LPAE_START_LVL(data);
 
+	ptep = arm_lpae_get_table(data, iova);
+
 	while (unmapped < size) {
 		size_t ret, size_to_unmap, remaining;
 
@@ -828,7 +867,10 @@ static int arm_lpae_iova_to_pte(struct arm_lpae_io_pgtable *data,
 				unsigned long iova, int *plvl_ret,
 				arm_lpae_iopte *ptep_ret)
 {
-	arm_lpae_iopte pte, *ptep = data->pgd;
+	arm_lpae_iopte pte, *ptep;
+
+	ptep = arm_lpae_get_table(data, iova);
+
 	*plvl_ret = ARM_LPAE_START_LVL(data);
 	*ptep_ret = 0;
 
@@ -994,6 +1036,71 @@ arm_lpae_alloc_pgtable(struct io_pgtable_cfg *cfg)
 	return data;
 }
 
+static u64 arm64_lpae_setup_ttbr1(struct io_pgtable_cfg *cfg,
+		struct arm_lpae_io_pgtable *data)
+
+{
+	u64 reg;
+
+	/* If TTBR1 is disabled, disable speculative walks through the TTBR1 */
+	if (!(cfg->quirks & IO_PGTABLE_QUIRK_ARM_TTBR1)) {
+		reg = ARM_LPAE_TCR_EPD1;
+		reg |= (ARM_LPAE_TCR_SEP_UPSTREAM << ARM_LPAE_TCR_SEP_SHIFT);
+		return reg;
+	}
+
+	if (cfg->iommu_dev && cfg->iommu_dev->archdata.dma_coherent)
+		reg = (ARM_LPAE_TCR_SH_OS << ARM_LPAE_TCR_SH1_SHIFT) |
+			(ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_IRGN1_SHIFT) |
+			(ARM_LPAE_TCR_RGN_WBWA << ARM_LPAE_TCR_ORGN1_SHIFT);
+	else
+		reg = (ARM_LPAE_TCR_SH_OS << ARM_LPAE_TCR_SH1_SHIFT) |
+			(ARM_LPAE_TCR_RGN_NC << ARM_LPAE_TCR_IRGN1_SHIFT) |
+			(ARM_LPAE_TCR_RGN_NC << ARM_LPAE_TCR_ORGN1_SHIFT);
+
+	switch (1 << data->pg_shift) {
+	case SZ_4K:
+		reg |= (ARM_LPAE_TCR_TG1_4K << 30);
+		break;
+	case SZ_16K:
+		reg |= (ARM_LPAE_TCR_TG1_16K << 30);
+		break;
+	case SZ_64K:
+		reg |= (ARM_LPAE_TCR_TG1_64K << 30);
+		break;
+	}
+
+	/* Set T1SZ */
+	reg |= (64ULL - cfg->ias) << ARM_LPAE_TCR_T1SZ_SHIFT;
+
+	/* Set the SEP bit based on the size */
+	switch (cfg->ias) {
+	case 32:
+		reg |= (ARM_LPAE_TCR_SEP_31 << ARM_LPAE_TCR_SEP_SHIFT);
+		break;
+	case 36:
+		reg |= (ARM_LPAE_TCR_SEP_35 << ARM_LPAE_TCR_SEP_SHIFT);
+		break;
+	case 40:
+		reg |= (ARM_LPAE_TCR_SEP_39 << ARM_LPAE_TCR_SEP_SHIFT);
+		break;
+	case 42:
+		reg |= (ARM_LPAE_TCR_SEP_41 << ARM_LPAE_TCR_SEP_SHIFT);
+		break;
+	case 44:
+		reg |= (ARM_LPAE_TCR_SEP_43 << ARM_LPAE_TCR_SEP_SHIFT);
+		break;
+	case 48:
+		reg |= (ARM_LPAE_TCR_SEP_47 << ARM_LPAE_TCR_SEP_SHIFT);
+		break;
+	default:
+		reg |= (ARM_LPAE_TCR_SEP_UPSTREAM << ARM_LPAE_TCR_SEP_SHIFT);
+		break;
+	}
+
+	return reg;
+}
+
 static struct io_pgtable *
 arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie)
 {
@@ -1050,8 +1157,9 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie)
 
 	reg |= (64ULL - cfg->ias) << ARM_LPAE_TCR_T0SZ_SHIFT;
 
-	/* Disable speculative walks through TTBR1 */
-	reg |= ARM_LPAE_TCR_EPD1;
+	/* Bring in the TTBR1 configuration */
+	reg |= arm64_lpae_setup_ttbr1(cfg, data);
+
 	cfg->arm_lpae_s1_cfg.tcr = reg;
 
 	/* MAIRs */
@@ -1066,16 +1174,33 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie)
 	cfg->arm_lpae_s1_cfg.mair[1] = 0;
 
 	/* Looking good; allocate a pgd */
-	data->pgd = __arm_lpae_alloc_pages(data->pgd_size, GFP_KERNEL, cfg, cookie);
-	if (!data->pgd)
+	data->pgd[0] = __arm_lpae_alloc_pages(data->pgd_size, GFP_KERNEL, cfg,
+		cookie);
+	if (!data->pgd[0])
 		goto out_free_data;
+
+	if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_TTBR1) {
+		data->pgd[1] = __arm_lpae_alloc_pages(data->pgd_size,
+			GFP_KERNEL, cfg, cookie);
+		if (!data->pgd[1]) {
+			__arm_lpae_free_pages(data->pgd[0], data->pgd_size, cfg,
+				cookie);
+			goto out_free_data;
+		}
+	} else {
+		data->pgd[1] = NULL;
+	}
+
 	/* Ensure the empty pgd is visible before any actual TTBR write */
 	wmb();
 
 	/* TTBRs */
-	cfg->arm_lpae_s1_cfg.ttbr[0] = virt_to_phys(data->pgd);
-	cfg->arm_lpae_s1_cfg.ttbr[1] = 0;
+	cfg->arm_lpae_s1_cfg.ttbr[0] = virt_to_phys(data->pgd[0]);
+
+	if (data->pgd[1])
+		cfg->arm_lpae_s1_cfg.ttbr[1] = virt_to_phys(data->pgd[1]);
+
 	return &data->iop;
 
 out_free_data:
@@ -1155,15 +1280,16 @@ arm_64_lpae_alloc_pgtable_s2(struct io_pgtable_cfg *cfg, void *cookie)
 	cfg->arm_lpae_s2_cfg.vtcr = reg;
 
 	/* Allocate pgd pages */
-	data->pgd = __arm_lpae_alloc_pages(data->pgd_size, GFP_KERNEL, cfg, cookie);
-	if (!data->pgd)
+	data->pgd[0] = __arm_lpae_alloc_pages(data->pgd_size, GFP_KERNEL, cfg,
+		cookie);
+	if (!data->pgd[0])
 		goto out_free_data;
 
 	/* Ensure the empty pgd is visible before any actual TTBR write */
 	wmb();
 
 	/* VTTBR */
-	cfg->arm_lpae_s2_cfg.vttbr = virt_to_phys(data->pgd);
+	cfg->arm_lpae_s2_cfg.vttbr = virt_to_phys(data->pgd[0]);
 	return &data->iop;
 
 out_free_data:
@@ -1261,7 +1387,7 @@ static void __init arm_lpae_dump_ops(struct io_pgtable_ops *ops)
 		cfg->pgsize_bitmap, cfg->ias);
 	pr_err("data: %d levels, 0x%zx pgd_size, %lu pg_shift, %lu bits_per_level, pgd @ %p\n",
 		data->levels, data->pgd_size, data->pg_shift,
-		data->bits_per_level, data->pgd);
+		data->bits_per_level, data->pgd[0]);
 }
 
 #define __FAIL(ops, i)	({ \
diff --git a/drivers/iommu/io-pgtable.h b/drivers/iommu/io-pgtable.h
index a3f366f559a7..f4533040806f 100644
--- a/drivers/iommu/io-pgtable.h
+++ b/drivers/iommu/io-pgtable.h
@@ -62,6 +62,7 @@ struct io_pgtable_cfg {
 	 */
 	#define IO_PGTABLE_QUIRK_ARM_NS	(1 << 0)	/* Set NS bit in PTEs */
 	#define IO_PGTABLE_QUIRK_PAGE_TABLE_COHERENT (1 << 1)
+	#define IO_PGTABLE_QUIRK_ARM_TTBR1 (1 << 2)	/* Allocate TTBR1 PT */
 	int				quirks;
 	unsigned long			pgsize_bitmap;
 	unsigned int			ias;
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 1b3f20e8fb74..c4c25651ff21 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -136,6 +136,7 @@ enum iommu_attr {
 	DOMAIN_ATTR_EARLY_MAP,
 	DOMAIN_ATTR_PAGE_TABLE_IS_COHERENT,
 	DOMAIN_ATTR_PAGE_TABLE_FORCE_COHERENT,
+	DOMAIN_ATTR_ENABLE_TTBR1,
 	DOMAIN_ATTR_MAX,
 };
 
diff --git a/include/uapi/drm/msm_drm.h b/include/uapi/drm/msm_drm.h
index 587d35ce1638..d2f19ac6f536 100644
--- a/include/uapi/drm/msm_drm.h
+++ b/include/uapi/drm/msm_drm.h
@@ -40,6 +40,15 @@
 #define MSM_PIPE_2D1         0x02
 #define MSM_PIPE_3D0         0x10
 
+/* The pipe-id just uses the lower bits, so can be OR'd with flags in
+ * the upper 16 bits (which could be extended further, if needed, maybe
+ * we extend/overload the pipe-id some day to deal with multiple rings,
+ * but even then I don't think we need the full lower 16 bits).
+ */
+#define MSM_PIPE_ID_MASK     0xffff
+#define MSM_PIPE_ID(x)       ((x) & MSM_PIPE_ID_MASK)
+#define MSM_PIPE_FLAGS(x)    ((x) & ~MSM_PIPE_ID_MASK)
+
 /* timeouts are specified in clock-monotonic absolute times (to simplify
  * restarting interrupted ioctls).  The following struct is logically the
  * same as 'struct timespec' but 32/64b ABI safe.
@@ -54,6 +63,7 @@ struct drm_msm_timespec {
 #define MSM_PARAM_CHIP_ID    0x03
 #define MSM_PARAM_MAX_FREQ   0x04
 #define MSM_PARAM_TIMESTAMP  0x05
+#define MSM_PARAM_GMEM_BASE  0x06
 
 struct drm_msm_param {
 	__u32 pipe;           /* in, MSM_PIPE_x */
@@ -67,6 +77,7 @@ struct drm_msm_param {
 
 #define MSM_BO_SCANOUT       0x00000001     /* scanout capable */
 #define MSM_BO_GPU_READONLY  0x00000002
+#define MSM_BO_PRIVILEGED    0x00000004
 #define MSM_BO_CACHE_MASK    0x000f0000
 /* cache modes */
 #define MSM_BO_CACHED        0x00010000
@@ -177,12 +188,18 @@ struct drm_msm_gem_submit_bo {
 	__u64 presumed;       /* in/out, presumed buffer address */
 };
 
+/* Valid submit ioctl flags: */
+#define MSM_SUBMIT_RING_MASK 0x000F0000
+#define MSM_SUBMIT_RING_SHIFT 16
+
+#define MSM_SUBMIT_FLAGS     (MSM_SUBMIT_RING_MASK)
+
 /* Each cmdstream submit consists of a table of buffers involved, and
  * one or more cmdstream buffers.  This allows for conditional execution
  * (context-restore), and IB buffers needed for per tile/bin draw cmds.
  */
 struct drm_msm_gem_submit {
-	__u32 pipe;           /* in, MSM_PIPE_x */
+	__u32 flags;          /* MSM_PIPE_x | MSM_SUBMIT_x */
 	__u32 fence;          /* out */
 	__u32 nr_bos;         /* in, number of submit_bo's */
 	__u32 nr_cmds;        /* in, number of submit_cmd's */