diff options
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/edac/Kconfig | 40 | ||||
-rw-r--r-- | drivers/edac/Makefile | 1 | ||||
-rw-r--r-- | drivers/edac/cortex_arm64_edac.c | 900 |
3 files changed, 941 insertions, 0 deletions
diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index ef25000a5bc6..f61271201906 100644 --- a/drivers/edac/Kconfig +++ b/drivers/edac/Kconfig @@ -390,4 +390,44 @@ config EDAC_XGENE Support for error detection and correction on the APM X-Gene family of SOCs. +config EDAC_CORTEX_ARM64 + depends on EDAC_MM_EDAC && ARM64 + bool "ARM Cortex A CPUs L1/L2 Caches" + help + Support for error detection and correction on the + ARM Cortex A53 and A57 CPUs. For debugging issues having to do with + stability and overall system health, you should probably say 'Y' + here. + +config EDAC_CORTEX_ARM64_PANIC_ON_CE + depends on EDAC_CORTEX_ARM64 + bool "Panic on correctable errors" + help + Forcibly cause a kernel panic if a correctable error (CE) is + detected, even though the error is (by definition) correctable and + would otherwise result in no adverse system effects. This can reduce + debugging times on hardware which may be operating at voltages or + frequencies outside normal specification. + + For production builds, you should definitely say 'N' here. + +config EDAC_CORTEX_ARM64_DBE_IRQ_ONLY + depends on EDAC_CORTEX_ARM64 + bool "Only check for parity errors when an irq is generated" + help + In ARM64, parity errors will cause an interrupt + to be triggered but may also cause a data abort to + occur. Only check for EDAC errors for the interrupt. + If unsure, say no. + +config EDAC_CORTEX_ARM64_PANIC_ON_UE + depends on EDAC_CORTEX_ARM64 + bool "Panic on uncorrectable errors" + help + Forcibly cause a kernel panic if an uncorrectable error (UE) is + detected. This can reduce debugging times on hardware which may be + operating at voltages or frequencies outside normal specification. + + For production builds, you should probably say 'N' here. 
+ endif # EDAC diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile index dbf53e08bdd1..d4f49f97b615 100644 --- a/drivers/edac/Makefile +++ b/drivers/edac/Makefile @@ -70,3 +70,4 @@ obj-$(CONFIG_EDAC_OCTEON_PCI) += octeon_edac-pci.o obj-$(CONFIG_EDAC_ALTERA_MC) += altera_edac.o obj-$(CONFIG_EDAC_SYNOPSYS) += synopsys_edac.o obj-$(CONFIG_EDAC_XGENE) += xgene_edac.o +obj-$(CONFIG_EDAC_CORTEX_ARM64) += cortex_arm64_edac.o diff --git a/drivers/edac/cortex_arm64_edac.c b/drivers/edac/cortex_arm64_edac.c new file mode 100644 index 000000000000..8a0f0fb9811f --- /dev/null +++ b/drivers/edac/cortex_arm64_edac.c @@ -0,0 +1,900 @@ +/* Copyright (c) 2014-2015, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#include <linux/kernel.h> +#include <linux/edac.h> +#include <linux/interrupt.h> +#include <linux/of_device.h> +#include <linux/platform_device.h> +#include <linux/perf_event.h> +#include <linux/smp.h> +#include <linux/cpu.h> +#include <linux/cpu_pm.h> +#include <linux/of_irq.h> +#include <linux/spinlock.h> +#include <linux/slab.h> +#include <linux/workqueue.h> +#include <linux/percpu.h> +#include <linux/msm_rtb.h> + +#include <asm/cputype.h> +#include <asm/esr.h> + +#include "edac_core.h" + +#define A53_CPUMERRSR_FATAL(a) ((a) & (1LL << 63)) +#define A53_CPUMERRSR_OTHER(a) (((a) >> 40) & 0xff) +#define A53_CPUMERRSR_REPT(a) (((a) >> 32) & 0xff) +#define A53_CPUMERRSR_VALID(a) ((a) & (1 << 31)) +#define A53_CPUMERRSR_RAMID(a) (((a) >> 24) & 0x7f) +#define A53_CPUMERRSR_CPUID(a) (((a) >> 18) & 0x07) +#define A53_CPUMERRSR_ADDR(a) ((a) & 0xfff) + +#define A53_L2MERRSR_FATAL(a) ((a) & (1LL << 63)) +#define A53_L2MERRSR_OTHER(a) (((a) >> 40) & 0xff) +#define A53_L2MERRSR_REPT(a) (((a) >> 32) & 0xff) +#define A53_L2MERRSR_VALID(a) ((a) & (1 << 31)) +#define A53_L2MERRSR_RAMID(a) (((a) >> 24) & 0x7f) +#define A53_L2MERRSR_CPUID(a) (((a) >> 18) & 0x0f) +#define A53_L2MERRSR_INDEX(a) (((a) >> 3) & 0x3fff) + +#define A57_CPUMERRSR_FATAL(a) ((a) & (1LL << 63)) +#define A57_CPUMERRSR_OTHER(a) (((a) >> 40) & 0xff) +#define A57_CPUMERRSR_REPT(a) (((a) >> 32) & 0xff) +#define A57_CPUMERRSR_VALID(a) ((a) & (1 << 31)) +#define A57_CPUMERRSR_RAMID(a) (((a) >> 24) & 0x7f) +#define A57_CPUMERRSR_BANK(a) (((a) >> 18) & 0x1f) +#define A57_CPUMERRSR_INDEX(a) ((a) & 0x1ffff) + +#define A57_L2MERRSR_FATAL(a) ((a) & (1LL << 63)) +#define A57_L2MERRSR_OTHER(a) (((a) >> 40) & 0xff) +#define A57_L2MERRSR_REPT(a) (((a) >> 32) & 0xff) +#define A57_L2MERRSR_VALID(a) ((a) & (1 << 31)) +#define A57_L2MERRSR_RAMID(a) (((a) >> 24) & 0x7f) +#define A57_L2MERRSR_CPUID(a) (((a) >> 18) & 0x0f) +#define A57_L2MERRSR_INDEX(a) ((a) & 0x1ffff) + +#define L2ECTLR_INT_ERR (1 << 30) +#define 
L2ECTLR_EXT_ERR (1 << 29) + +#define ESR_SERROR(a) ((a) >> ESR_ELx_EC_SHIFT == ESR_ELx_EC_SERROR) +#define ESR_VALID(a) ((a) & BIT(24)) +#define ESR_L2_DBE(a) (ESR_SERROR(a) && ESR_VALID(a) && \ + (((a) & 0x00C00003) == 0x1)) + +#define CCI_IMPRECISEERROR_REG 0x10 + +#define L1_CACHE 0 +#define L2_CACHE 1 +#define CCI 2 + +#define A53_L1_CE 0 +#define A53_L1_UE 1 +#define A53_L2_CE 2 +#define A53_L2_UE 3 +#define A57_L1_CE 4 +#define A57_L1_UE 5 +#define A57_L2_CE 6 +#define A57_L2_UE 7 +#define L2_EXT_UE 8 +#define CCI_UE 9 + +#ifdef CONFIG_EDAC_CORTEX_ARM64_PANIC_ON_UE +#define ARM64_ERP_PANIC_ON_UE 1 +#else +#define ARM64_ERP_PANIC_ON_UE 0 +#endif + +#ifdef CONFIG_EDAC_CORTEX_ARM64_PANIC_ON_CE +static int panic_on_ce = 1; +#else +static int panic_on_ce; +#endif +module_param(panic_on_ce, int, 0); + +#define EDAC_CPU "arm64" + +enum error_type { + SBE, + DBE, +}; + +const char *err_name[] = { + "Single-bit", + "Double-bit", +}; + +struct erp_drvdata { + struct edac_device_ctl_info *edev_ctl; + void __iomem *cci_base; + struct notifier_block nb_pm; + struct notifier_block nb_cpu; + struct notifier_block nb_panic; + struct work_struct work; + struct perf_event *memerr_counters[NR_CPUS]; +}; + +static struct erp_drvdata *panic_handler_drvdata; + +struct erp_local_data { + struct erp_drvdata *drv; + enum error_type err; +}; + +#define MEM_ERROR_EVENT 0x1A + +struct errors_edac { + const char * const msg; + void (*func)(struct edac_device_ctl_info *edac_dev, + int inst_nr, int block_nr, const char *msg); +}; + +static const struct errors_edac errors[] = { + {"A53 L1 Correctable Error", edac_device_handle_ce }, + {"A53 L1 Uncorrectable Error", edac_device_handle_ue }, + {"A53 L2 Correctable Error", edac_device_handle_ce }, + {"A53 L2 Uncorrectable Error", edac_device_handle_ue }, + {"A57 L1 Correctable Error", edac_device_handle_ce }, + {"A57 L1 Uncorrectable Error", edac_device_handle_ue }, + {"A57 L2 Correctable Error", edac_device_handle_ce }, + {"A57 L2 
Uncorrectable Error", edac_device_handle_ue }, + {"L2 External Error", edac_device_handle_ue }, + {"CCI Error", edac_device_handle_ue }, +}; + +#define read_l2merrsr_el1 ({ \ + u64 __val; \ + asm("mrs %0, s3_1_c15_c2_3" : "=r" (__val)); \ + __val; \ +}) + +#define read_l2ectlr_el1 ({ \ + u32 __val; \ + asm("mrs %0, s3_1_c11_c0_3" : "=r" (__val)); \ + __val; \ +}) + +#define read_cpumerrsr_el1 ({ \ + u64 __val; \ + asm("mrs %0, s3_1_c15_c2_2" : "=r" (__val)); \ + __val; \ +}) + +#define read_esr_el1 ({ \ + u64 __val; \ + asm("mrs %0, esr_el1" : "=r" (__val)); \ + __val; \ +}) + +#define write_l2merrsr_el1(val) ({ \ + asm("msr s3_1_c15_c2_3, %0" : : "r" (val)); \ +}) + +#define write_l2ectlr_el1(val) ({ \ + asm("msr s3_1_c11_c0_3, %0" : : "r" (val)); \ +}) + +#define write_cpumerrsr_el1(val) ({ \ + asm("msr s3_1_c15_c2_2, %0" : : "r" (val)); \ +}) + +static void ca53_ca57_print_error_state_regs(void) +{ + u64 l2merrsr; + u64 cpumerrsr; + u32 esr_el1; + u32 l2ectlr; + + cpumerrsr = read_cpumerrsr_el1; + l2merrsr = read_l2merrsr_el1; + esr_el1 = read_esr_el1; + l2ectlr = read_l2ectlr_el1; + + /* store data in uncached rtb logs */ + uncached_logk_pc(LOGK_READL, __builtin_return_address(0), + (void *)cpumerrsr); + uncached_logk_pc(LOGK_READL, __builtin_return_address(0), + (void *)l2merrsr); + uncached_logk_pc(LOGK_READL, __builtin_return_address(0), + (void *)((u64)esr_el1)); + uncached_logk_pc(LOGK_READL, __builtin_return_address(0), + (void *)((u64)l2ectlr)); + + edac_printk(KERN_CRIT, EDAC_CPU, "CPUMERRSR value = %#llx\n", + cpumerrsr); + edac_printk(KERN_CRIT, EDAC_CPU, "L2MERRSR value = %#llx\n", l2merrsr); + + edac_printk(KERN_CRIT, EDAC_CPU, "ESR value = %#x\n", esr_el1); + edac_printk(KERN_CRIT, EDAC_CPU, "L2ECTLR value = %#x\n", l2ectlr); + if (ESR_L2_DBE(esr_el1)) + edac_printk(KERN_CRIT, EDAC_CPU, + "Double bit error on dirty L2 cacheline\n"); +} + +static void ca53_parse_cpumerrsr(struct erp_local_data *ed) +{ + u64 cpumerrsr; + int cpuid; + + cpumerrsr = 
read_cpumerrsr_el1; + + if (!A53_CPUMERRSR_VALID(cpumerrsr)) + return; + + if (A53_CPUMERRSR_FATAL(cpumerrsr)) + ed->err = DBE; + + edac_printk(KERN_CRIT, EDAC_CPU, "Cortex A53 CPU%d L1 %s Error detected\n", + smp_processor_id(), err_name[ed->err]); + ca53_ca57_print_error_state_regs(); + if (ed->err == DBE) + edac_printk(KERN_CRIT, EDAC_CPU, "Fatal error\n"); + + cpuid = A53_CPUMERRSR_CPUID(cpumerrsr); + + switch (A53_CPUMERRSR_RAMID(cpumerrsr)) { + case 0x0: + edac_printk(KERN_CRIT, EDAC_CPU, + "L1 Instruction tag RAM way is %d\n", cpuid); + break; + case 0x1: + edac_printk(KERN_CRIT, EDAC_CPU, + "L1 Instruction data RAM bank is %d\n", cpuid); + break; + case 0x8: + edac_printk(KERN_CRIT, EDAC_CPU, + "L1 Data tag RAM cpu %d way is %d\n", + cpuid / 4, cpuid % 4); + break; + case 0x9: + edac_printk(KERN_CRIT, EDAC_CPU, + "L1 Data data RAM cpu %d way is %d\n", + cpuid / 4, cpuid % 4); + break; + case 0xA: + edac_printk(KERN_CRIT, EDAC_CPU, + "L1 Data dirty RAM cpu %d way is %d\n", + cpuid / 4, cpuid % 4); + break; + case 0x18: + edac_printk(KERN_CRIT, EDAC_CPU, "TLB RAM way is %d\n", cpuid); + break; + default: + edac_printk(KERN_CRIT, EDAC_CPU, + "Error in unknown RAM ID: %d\n", + (int) A53_CPUMERRSR_RAMID(cpumerrsr)); + break; + } + + edac_printk(KERN_CRIT, EDAC_CPU, "Repeated error count: %d\n", + (int) A53_CPUMERRSR_REPT(cpumerrsr)); + edac_printk(KERN_CRIT, EDAC_CPU, "Other error count: %d\n", + (int) A53_CPUMERRSR_OTHER(cpumerrsr)); + + if (ed->err == SBE) + errors[A53_L1_CE].func(ed->drv->edev_ctl, smp_processor_id(), + L1_CACHE, errors[A53_L1_CE].msg); + else if (ed->err == DBE) + errors[A53_L1_UE].func(ed->drv->edev_ctl, smp_processor_id(), + L1_CACHE, errors[A53_L1_UE].msg); + write_cpumerrsr_el1(0); +} + +static void ca53_parse_l2merrsr(struct erp_local_data *ed) +{ + u64 l2merrsr; + u32 l2ectlr; + int cpuid; + + l2merrsr = read_l2merrsr_el1; + l2ectlr = read_l2ectlr_el1; + + if (!A53_L2MERRSR_VALID(l2merrsr)) + return; + + if 
(A53_L2MERRSR_FATAL(l2merrsr)) + ed->err = DBE; + + edac_printk(KERN_CRIT, EDAC_CPU, "CortexA53 L2 %s Error detected\n", + err_name[ed->err]); + ca53_ca57_print_error_state_regs(); + if (ed->err == DBE) + edac_printk(KERN_CRIT, EDAC_CPU, "Fatal error\n"); + + cpuid = A53_L2MERRSR_CPUID(l2merrsr); + + switch (A53_L2MERRSR_RAMID(l2merrsr)) { + case 0x10: + edac_printk(KERN_CRIT, EDAC_CPU, + "L2 tag RAM way is %d\n", cpuid); + break; + case 0x11: + edac_printk(KERN_CRIT, EDAC_CPU, + "L2 data RAM bank is %d\n", cpuid); + break; + case 0x12: + edac_printk(KERN_CRIT, EDAC_CPU, + "SCU snoop filter RAM cpu %d way is %d\n", + cpuid / 4, cpuid % 4); + break; + default: + edac_printk(KERN_CRIT, EDAC_CPU, + "Error in unknown RAM ID: %d\n", + (int) A53_L2MERRSR_RAMID(l2merrsr)); + break; + } + + edac_printk(KERN_CRIT, EDAC_CPU, "Repeated error count: %d\n", + (int) A53_L2MERRSR_REPT(l2merrsr)); + edac_printk(KERN_CRIT, EDAC_CPU, "Other error count: %d\n", + (int) A53_L2MERRSR_OTHER(l2merrsr)); + + if (ed->err == SBE) + errors[A53_L2_CE].func(ed->drv->edev_ctl, smp_processor_id(), + L2_CACHE, errors[A53_L2_CE].msg); + else if (ed->err == DBE) + errors[A53_L2_UE].func(ed->drv->edev_ctl, smp_processor_id(), + L2_CACHE, errors[A53_L2_UE].msg); + write_l2merrsr_el1(0); +} + + +static void ca57_parse_cpumerrsr(struct erp_local_data *ed) +{ + u64 cpumerrsr; + int bank; + + cpumerrsr = read_cpumerrsr_el1; + + if (!A57_CPUMERRSR_VALID(cpumerrsr)) + return; + + if (A57_CPUMERRSR_FATAL(cpumerrsr)) + ed->err = DBE; + + edac_printk(KERN_CRIT, EDAC_CPU, "Cortex A57 CPU%d L1 %s Error detected\n", + smp_processor_id(), err_name[ed->err]); + ca53_ca57_print_error_state_regs(); + if (ed->err == DBE) + edac_printk(KERN_CRIT, EDAC_CPU, "Fatal error\n"); + + bank = A57_CPUMERRSR_BANK(cpumerrsr); + + switch (A57_CPUMERRSR_RAMID(cpumerrsr)) { + case 0x0: + edac_printk(KERN_CRIT, EDAC_CPU, + "L1 Instruction tag RAM bank %d\n", bank); + break; + case 0x1: + edac_printk(KERN_CRIT, EDAC_CPU, + "L1 
Instruction data RAM bank %d\n", bank); + break; + case 0x8: + edac_printk(KERN_CRIT, EDAC_CPU, + "L1 Data tag RAM bank %d\n", bank); + break; + case 0x9: + edac_printk(KERN_CRIT, EDAC_CPU, + "L1 Data data RAM bank %d\n", bank); + break; + case 0x18: + edac_printk(KERN_CRIT, EDAC_CPU, + "TLB RAM bank %d\n", bank); + break; + default: + edac_printk(KERN_CRIT, EDAC_CPU, + "Error in unknown RAM ID: %d\n", + (int) A57_CPUMERRSR_RAMID(cpumerrsr)); + break; + } + + edac_printk(KERN_CRIT, EDAC_CPU, "Repeated error count: %d\n", + (int) A57_CPUMERRSR_REPT(cpumerrsr)); + edac_printk(KERN_CRIT, EDAC_CPU, "Other error count: %d\n", + (int) A57_CPUMERRSR_OTHER(cpumerrsr)); + + if (ed->err == SBE) + errors[A57_L1_CE].func(ed->drv->edev_ctl, smp_processor_id(), + L1_CACHE, errors[A57_L1_CE].msg); + else if (ed->err == DBE) + errors[A57_L1_UE].func(ed->drv->edev_ctl, smp_processor_id(), + L1_CACHE, errors[A57_L1_UE].msg); + write_cpumerrsr_el1(0); +} + +static void ca57_parse_l2merrsr(struct erp_local_data *ed) +{ + u64 l2merrsr; + u32 l2ectlr; + int cpuid; + + l2merrsr = read_l2merrsr_el1; + l2ectlr = read_l2ectlr_el1; + + if (!A57_L2MERRSR_VALID(l2merrsr)) + return; + + if (A57_L2MERRSR_FATAL(l2merrsr)) + ed->err = DBE; + + edac_printk(KERN_CRIT, EDAC_CPU, "CortexA57 L2 %s Error detected\n", + err_name[ed->err]); + ca53_ca57_print_error_state_regs(); + if (ed->err == DBE) + edac_printk(KERN_CRIT, EDAC_CPU, "Fatal error\n"); + + cpuid = A57_L2MERRSR_CPUID(l2merrsr); + + switch (A57_L2MERRSR_RAMID(l2merrsr)) { + case 0x10: + edac_printk(KERN_CRIT, EDAC_CPU, + "L2 tag RAM cpu %d way is %d\n", + cpuid / 2, cpuid % 2); + break; + case 0x11: + edac_printk(KERN_CRIT, EDAC_CPU, + "L2 data RAM cpu %d bank is %d\n", + cpuid / 2, cpuid % 2); + break; + case 0x12: + edac_printk(KERN_CRIT, EDAC_CPU, + "SCU snoop tag RAM bank is %d\n", cpuid); + break; + case 0x14: + edac_printk(KERN_CRIT, EDAC_CPU, + "L2 dirty RAM cpu %d bank is %d\n", + cpuid / 2, cpuid % 2); + break; + case 0x18: + 
edac_printk(KERN_CRIT, EDAC_CPU, + "L2 inclusion PF RAM bank is %d\n", cpuid); + break; + default: + edac_printk(KERN_CRIT, EDAC_CPU, + "Error in unknown RAM ID: %d\n", + (int) A57_L2MERRSR_RAMID(l2merrsr)); + break; + } + + edac_printk(KERN_CRIT, EDAC_CPU, "Repeated error count: %d\n", + (int) A57_L2MERRSR_REPT(l2merrsr)); + edac_printk(KERN_CRIT, EDAC_CPU, "Other error count: %d\n", + (int) A57_L2MERRSR_OTHER(l2merrsr)); + + if (ed->err == SBE) { + errors[A57_L2_CE].func(ed->drv->edev_ctl, smp_processor_id(), + L2_CACHE, errors[A57_L2_CE].msg); + } else if (ed->err == DBE) { + errors[A57_L2_UE].func(ed->drv->edev_ctl, smp_processor_id(), + L2_CACHE, errors[A57_L2_UE].msg); + } + write_l2merrsr_el1(0); +} + +static DEFINE_SPINLOCK(local_handler_lock); +static DEFINE_SPINLOCK(l2ectlr_lock); + +static void arm64_erp_local_handler(void *info) +{ + struct erp_local_data *errdata = info; + unsigned int cpuid = read_cpuid_id(); + unsigned int partnum = read_cpuid_part_number(); + unsigned long flags, flags2; + u32 l2ectlr; + + spin_lock_irqsave(&local_handler_lock, flags); + edac_printk(KERN_CRIT, EDAC_CPU, "%s error information from CPU %d, MIDR=%#08x:\n", + err_name[errdata->err], raw_smp_processor_id(), cpuid); + + switch (partnum) { + case ARM_CPU_PART_CORTEX_A53: + ca53_parse_cpumerrsr(errdata); + ca53_parse_l2merrsr(errdata); + break; + + case ARM_CPU_PART_CORTEX_A72: + case ARM_CPU_PART_CORTEX_A57: + ca57_parse_cpumerrsr(errdata); + ca57_parse_l2merrsr(errdata); + break; + + default: + edac_printk(KERN_CRIT, EDAC_CPU, "Unknown CPU Part Number in MIDR: %#04x (%#08x)\n", + partnum, cpuid); + }; + + /* Acknowledge internal error in L2ECTLR */ + spin_lock_irqsave(&l2ectlr_lock, flags2); + + l2ectlr = read_l2ectlr_el1; + + if (l2ectlr & L2ECTLR_INT_ERR) { + l2ectlr &= ~L2ECTLR_INT_ERR; + write_l2ectlr_el1(l2ectlr); + } + + spin_unlock_irqrestore(&l2ectlr_lock, flags2); + spin_unlock_irqrestore(&local_handler_lock, flags); +} + +static irqreturn_t arm64_dbe_handler(int 
irq, void *drvdata) +{ + struct erp_local_data errdata; + + errdata.drv = drvdata; + errdata.err = DBE; + edac_printk(KERN_CRIT, EDAC_CPU, "ARM64 CPU ERP: Double-bit error interrupt received!\n"); + + on_each_cpu(arm64_erp_local_handler, &errdata, 1); + + return IRQ_HANDLED; +} + +static void arm64_ext_local_handler(void *info) +{ + struct erp_drvdata *drv = info; + unsigned long flags, flags2; + u32 l2ectlr; + + spin_lock_irqsave(&local_handler_lock, flags); + + /* TODO: Shared locking for L2ECTLR access */ + spin_lock_irqsave(&l2ectlr_lock, flags2); + + l2ectlr = read_l2ectlr_el1; + + if (l2ectlr & L2ECTLR_EXT_ERR) { + edac_printk(KERN_CRIT, EDAC_CPU, + "L2 external error detected by CPU%d\n", + smp_processor_id()); + + errors[L2_EXT_UE].func(drv->edev_ctl, smp_processor_id(), + L2_CACHE, errors[L2_EXT_UE].msg); + + l2ectlr &= ~L2ECTLR_EXT_ERR; + write_l2ectlr_el1(l2ectlr); + } + + spin_unlock_irqrestore(&l2ectlr_lock, flags2); + spin_unlock_irqrestore(&local_handler_lock, flags); +} + +static irqreturn_t arm64_ext_handler(int irq, void *drvdata) +{ + edac_printk(KERN_CRIT, EDAC_CPU, "External error interrupt received!\n"); + + on_each_cpu(arm64_ext_local_handler, drvdata, 1); + + return IRQ_HANDLED; +} + +static irqreturn_t arm64_cci_handler(int irq, void *drvdata) +{ + struct erp_drvdata *drv = drvdata; + u32 cci_err_reg; + + edac_printk(KERN_CRIT, EDAC_CPU, "CCI error interrupt received!\n"); + + if (drv->cci_base) { + cci_err_reg = readl_relaxed(drv->cci_base + + CCI_IMPRECISEERROR_REG); + + edac_printk(KERN_CRIT, EDAC_CPU, "CCI imprecise error register: %#08x.\n", + cci_err_reg); + + /* This register has write-clear semantics */ + writel_relaxed(cci_err_reg, drv->cci_base + + CCI_IMPRECISEERROR_REG); + + /* Ensure error bits cleared before exiting ISR */ + mb(); + } else { + edac_printk(KERN_CRIT, EDAC_CPU, "CCI registers not available.\n"); + } + + errors[CCI_UE].func(drv->edev_ctl, 0, CCI, errors[CCI_UE].msg); + + return IRQ_HANDLED; +} + +static void 
arm64_sbe_handler(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct erp_local_data errdata; + int cpu = raw_smp_processor_id(); + + errdata.drv = event->overflow_handler_context; + errdata.err = SBE; + edac_printk(KERN_CRIT, EDAC_CPU, "ARM64 CPU ERP: Single-bit error interrupt received on CPU %d!\n", + cpu); + WARN_ON(!panic_on_ce); + arm64_erp_local_handler(&errdata); +} + +static int request_erp_irq(struct platform_device *pdev, const char *propname, + const char *desc, irq_handler_t handler, + void *ed) +{ + int rc; + struct resource *r; + + r = platform_get_resource_byname(pdev, IORESOURCE_IRQ, propname); + + if (!r) { + pr_err("ARM64 CPU ERP: Could not find <%s> IRQ property. Proceeding anyway.\n", + propname); + return -EINVAL; + } + + rc = devm_request_threaded_irq(&pdev->dev, r->start, NULL, + handler, + IRQF_ONESHOT | IRQF_TRIGGER_RISING, + desc, + ed); + + if (rc) { + pr_err("ARM64 CPU ERP: Failed to request IRQ %d: %d (%s / %s). Proceeding anyway.\n", + (int) r->start, rc, propname, desc); + return -EINVAL; + } + + return 0; +} + +static void check_sbe_event(struct erp_drvdata *drv) +{ + unsigned int partnum = read_cpuid_part_number(); + struct erp_local_data errdata; + unsigned long flags; + + errdata.drv = drv; + errdata.err = SBE; + + spin_lock_irqsave(&local_handler_lock, flags); + switch (partnum) { + case ARM_CPU_PART_CORTEX_A53: + ca53_parse_cpumerrsr(&errdata); + ca53_parse_l2merrsr(&errdata); + break; + + case ARM_CPU_PART_CORTEX_A72: + case ARM_CPU_PART_CORTEX_A57: + ca57_parse_cpumerrsr(&errdata); + ca57_parse_l2merrsr(&errdata); + break; + }; + spin_unlock_irqrestore(&local_handler_lock, flags); +} + +#ifdef CONFIG_EDAC_CORTEX_ARM64_DBE_IRQ_ONLY +static void create_sbe_counter(int cpu, void *info) +{ } +#else +static void create_sbe_counter(int cpu, void *info) +{ + struct erp_drvdata *drv = info; + struct perf_event *event = drv->memerr_counters[cpu]; + struct perf_event_attr attr = { + .pinned = 
1, + .disabled = 0, /* 0 will enable the counter upon creation */ + .sample_period = 1, /* 1 will set the counter to max int */ + .type = PERF_TYPE_RAW, + .config = MEM_ERROR_EVENT, + .size = sizeof(struct perf_event_attr), + }; + + if (event) + return; + + /* Fails if cpu is not online */ + event = perf_event_create_kernel_counter(&attr, cpu, NULL, + arm64_sbe_handler, + drv); + if (IS_ERR(event)) { + pr_err("PERF Event creation failed on cpu %d ptr_err %ld\n", + cpu, PTR_ERR(event)); + return; + } + drv->memerr_counters[cpu] = event; +} +#endif + +static int arm64_pmu_cpu_pm_notify(struct notifier_block *self, + unsigned long action, void *v) +{ + struct erp_drvdata *drv = container_of(self, struct erp_drvdata, nb_pm); + + switch (action) { + case CPU_PM_EXIT: + check_sbe_event(drv); + break; + } + + return NOTIFY_OK; +} + +static int arm64_edac_pmu_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + struct erp_drvdata *drv = container_of(self, struct erp_drvdata, + nb_cpu); + unsigned long cpu = (unsigned long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_ONLINE: + create_sbe_counter(cpu, drv); + break; + }; + + return NOTIFY_OK; +} + +#ifndef CONFIG_EDAC_CORTEX_ARM64_DBE_IRQ_ONLY +void arm64_check_cache_ecc(void *info) +{ + if (panic_handler_drvdata) + check_sbe_event(panic_handler_drvdata); +} +#else +static inline void arm64_check_cache_ecc(void *info) {} +#endif + +static int arm64_erp_panic_notify(struct notifier_block *this, + unsigned long event, void *ptr) +{ + arm64_check_cache_ecc(NULL); + + return NOTIFY_OK; +} + +static void arm64_monitor_cache_errors(struct edac_device_ctl_info *edev) +{ + struct cpumask cluster_mask, old_mask; + int cpu; + + cpumask_clear(&cluster_mask); + cpumask_clear(&old_mask); + + for_each_possible_cpu(cpu) { + cpumask_copy(&cluster_mask, topology_core_cpumask(cpu)); + if (cpumask_equal(&cluster_mask, &old_mask)) + continue; + cpumask_copy(&old_mask, &cluster_mask); + 
smp_call_function_any(&cluster_mask, + arm64_check_cache_ecc, NULL, 0); + } +} + +static int arm64_cpu_erp_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct erp_drvdata *drv; + struct resource *r; + int cpu; + u32 poll_msec; + + int rc, fail = 0; + + drv = devm_kzalloc(dev, sizeof(*drv), GFP_KERNEL); + + if (!drv) + return -ENOMEM; + + drv->edev_ctl = edac_device_alloc_ctl_info(0, "cpu", + num_possible_cpus(), "L", 3, 1, NULL, 0, + edac_device_alloc_index()); + + if (!drv->edev_ctl) + return -ENOMEM; + + rc = of_property_read_u32(pdev->dev.of_node, "poll-delay-ms", + &poll_msec); + if (!rc && !IS_ENABLED(CONFIG_EDAC_CORTEX_ARM64_DBE_IRQ_ONLY)) { + drv->edev_ctl->edac_check = arm64_monitor_cache_errors; + drv->edev_ctl->poll_msec = poll_msec; + drv->edev_ctl->defer_work = 1; + } + drv->edev_ctl->dev = dev; + drv->edev_ctl->mod_name = dev_name(dev); + drv->edev_ctl->dev_name = dev_name(dev); + drv->edev_ctl->ctl_name = "cache"; + drv->edev_ctl->panic_on_ce = panic_on_ce; + drv->edev_ctl->panic_on_ue = ARM64_ERP_PANIC_ON_UE; + + rc = edac_device_add_device(drv->edev_ctl); + if (rc) + goto out_mem; + + r = platform_get_resource_byname(pdev, IORESOURCE_MEM, "cci"); + if (r) + drv->cci_base = devm_ioremap_resource(dev, r); + + if (request_erp_irq(pdev, "pri-dbe-irq", "ARM64 primary DBE IRQ", + arm64_dbe_handler, drv)) + fail++; + + if (request_erp_irq(pdev, "sec-dbe-irq", "ARM64 secondary DBE IRQ", + arm64_dbe_handler, drv)) + fail++; + + if (request_erp_irq(pdev, "pri-ext-irq", "ARM64 primary ext IRQ", + arm64_ext_handler, drv)) + fail++; + + if (request_erp_irq(pdev, "sec-ext-irq", "ARM64 secondary ext IRQ", + arm64_ext_handler, drv)) + fail++; + + /* + * We still try to register a handler for CCI errors even if we don't + * have access to cci_base, but error reporting becomes best-effort in + * that case. 
+ */ + if (request_erp_irq(pdev, "cci-irq", "CCI error IRQ", + arm64_cci_handler, drv)) + fail++; + + if (IS_ENABLED(CONFIG_EDAC_CORTEX_ARM64_DBE_IRQ_ONLY)) { + pr_err("ARM64 CPU ERP: SBE detection is disabled.\n"); + goto out_irq; + } + + drv->nb_pm.notifier_call = arm64_pmu_cpu_pm_notify; + cpu_pm_register_notifier(&(drv->nb_pm)); + drv->nb_panic.notifier_call = arm64_erp_panic_notify; + atomic_notifier_chain_register(&panic_notifier_list, + &drv->nb_panic); + drv->nb_cpu.notifier_call = arm64_edac_pmu_cpu_notify; + register_cpu_notifier(&drv->nb_cpu); + get_online_cpus(); + for_each_online_cpu(cpu) + create_sbe_counter(cpu, drv); + put_online_cpus(); + +out_irq: + if (fail == of_irq_count(dev->of_node)) { + pr_err("ARM64 CPU ERP: Could not request any IRQs. Giving up.\n"); + rc = -ENODEV; + goto out_dev; + } + + panic_handler_drvdata = drv; + + return 0; + +out_dev: + edac_device_del_device(dev); +out_mem: + edac_device_free_ctl_info(drv->edev_ctl); + return rc; +} + +static const struct of_device_id arm64_cpu_erp_match_table[] = { + { .compatible = "arm,arm64-cpu-erp" }, + { } +}; + +static struct platform_driver arm64_cpu_erp_driver = { + .probe = arm64_cpu_erp_probe, + .driver = { + .name = "arm64_cpu_cache_erp", + .owner = THIS_MODULE, + .of_match_table = of_match_ptr(arm64_cpu_erp_match_table), + }, +}; + +static int __init arm64_cpu_erp_init(void) +{ + return platform_driver_register(&arm64_cpu_erp_driver); +} +device_initcall_sync(arm64_cpu_erp_init); |