From 11d549048e5bea3055053d8dcf45089b1fd711f1 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Fri, 8 Aug 2014 22:22:12 -0700 Subject: powerpc: Fix "attempt to move .org backwards" error Once again, we see arch/powerpc/kernel/exceptions-64s.S: Assembler messages: arch/powerpc/kernel/exceptions-64s.S:865: Error: attempt to move .org backwards arch/powerpc/kernel/exceptions-64s.S:866: Error: attempt to move .org backwards arch/powerpc/kernel/exceptions-64s.S:890: Error: attempt to move .org backwards when compiling ppc:allmodconfig. This time the problem has been caused by to commit 0869b6fd209bda ("powerpc/book3s: Add basic infrastructure to handle HMI in Linux"), which adds functions hmi_exception_early and hmi_exception_after_realmode into a critical (size-limited) code area, even though that does not appear to be necessary. Move those functions to a non-critical area of the file. Signed-off-by: Guenter Roeck Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/exceptions-64s.S | 110 +++++++++++++++++------------------ 1 file changed, 55 insertions(+), 55 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 6144d5a6bfe7..050f79a4a168 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -592,61 +592,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR) MASKABLE_EXCEPTION_HV_OOL(0xe62, hmi_exception) KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe62) - .globl hmi_exception_early -hmi_exception_early: - EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0xe60) - mr r10,r1 /* Save r1 */ - ld r1,PACAEMERGSP(r13) /* Use emergency stack */ - subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */ - std r9,_CCR(r1) /* save CR in stackframe */ - mfspr r11,SPRN_HSRR0 /* Save HSRR0 */ - std r11,_NIP(r1) /* save HSRR0 in stackframe */ - mfspr r12,SPRN_HSRR1 /* Save SRR1 */ - std r12,_MSR(r1) /* save SRR1 in stackframe */ - std r10,0(r1) /* make stack chain pointer */ - std r0,GPR0(r1) /* save r0 in stackframe */ - std r10,GPR1(r1) /* save r1 in stackframe */ - EXCEPTION_PROLOG_COMMON_2(PACA_EXGEN) - EXCEPTION_PROLOG_COMMON_3(0xe60) - addi r3,r1,STACK_FRAME_OVERHEAD - bl hmi_exception_realmode - /* Windup the stack. */ - /* Clear MSR_RI before setting SRR0 and SRR1. */ - li r0,MSR_RI - mfmsr r9 /* get MSR value */ - andc r9,r9,r0 - mtmsrd r9,1 /* Clear MSR_RI */ - /* Move original HSRR0 and HSRR1 into the respective regs */ - ld r9,_MSR(r1) - mtspr SPRN_HSRR1,r9 - ld r3,_NIP(r1) - mtspr SPRN_HSRR0,r3 - ld r9,_CTR(r1) - mtctr r9 - ld r9,_XER(r1) - mtxer r9 - ld r9,_LINK(r1) - mtlr r9 - REST_GPR(0, r1) - REST_8GPRS(2, r1) - REST_GPR(10, r1) - ld r11,_CCR(r1) - mtcr r11 - REST_GPR(11, r1) - REST_2GPRS(12, r1) - /* restore original r1. */ - ld r1,GPR1(r1) - - /* - * Go to virtual mode and pull the HMI event information from - * firmware. - */ - .globl hmi_exception_after_realmode -hmi_exception_after_realmode: - SET_SCRATCH0(r13) - EXCEPTION_PROLOG_0(PACA_EXGEN) - b hmi_exception_hv - MASKABLE_EXCEPTION_HV_OOL(0xe82, h_doorbell) KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe82) @@ -1306,6 +1251,61 @@ fwnmi_data_area: . = 0x8000 #endif /* defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) */ + .globl hmi_exception_early +hmi_exception_early: + EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0xe60) + mr r10,r1 /* Save r1 */ + ld r1,PACAEMERGSP(r13) /* Use emergency stack */ + subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */ + std r9,_CCR(r1) /* save CR in stackframe */ + mfspr r11,SPRN_HSRR0 /* Save HSRR0 */ + std r11,_NIP(r1) /* save HSRR0 in stackframe */ + mfspr r12,SPRN_HSRR1 /* Save SRR1 */ + std r12,_MSR(r1) /* save SRR1 in stackframe */ + std r10,0(r1) /* make stack chain pointer */ + std r0,GPR0(r1) /* save r0 in stackframe */ + std r10,GPR1(r1) /* save r1 in stackframe */ + EXCEPTION_PROLOG_COMMON_2(PACA_EXGEN) + EXCEPTION_PROLOG_COMMON_3(0xe60) + addi r3,r1,STACK_FRAME_OVERHEAD + bl hmi_exception_realmode + /* Windup the stack. */ + /* Clear MSR_RI before setting SRR0 and SRR1. */ + li r0,MSR_RI + mfmsr r9 /* get MSR value */ + andc r9,r9,r0 + mtmsrd r9,1 /* Clear MSR_RI */ + /* Move original HSRR0 and HSRR1 into the respective regs */ + ld r9,_MSR(r1) + mtspr SPRN_HSRR1,r9 + ld r3,_NIP(r1) + mtspr SPRN_HSRR0,r3 + ld r9,_CTR(r1) + mtctr r9 + ld r9,_XER(r1) + mtxer r9 + ld r9,_LINK(r1) + mtlr r9 + REST_GPR(0, r1) + REST_8GPRS(2, r1) + REST_GPR(10, r1) + ld r11,_CCR(r1) + mtcr r11 + REST_GPR(11, r1) + REST_2GPRS(12, r1) + /* restore original r1. */ + ld r1,GPR1(r1) + + /* + * Go to virtual mode and pull the HMI event information from + * firmware. + */ + .globl hmi_exception_after_realmode +hmi_exception_after_realmode: + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXGEN) + b hmi_exception_hv + #ifdef CONFIG_PPC_POWERNV _GLOBAL(opal_mc_secondary_handler) HMT_MEDIUM_PPR_DISCARD -- cgit v1.2.3 From 763fe0addb8fe15ccea67c0aebddc06f4bb25439 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Wed, 6 Aug 2014 17:10:16 +1000 Subject: powerpc/powernv: Fix IOMMU group lost When we take full hotplug to recover from EEH errors, PCI buses could be involved. For the case, the child devices of involved PCI buses can't be attached to IOMMU group properly, which is caused by commit 3f28c5a ("powerpc/powernv: Reduce multi-hit of iommu_add_device()"). When adding the PCI devices of the newly created PCI buses to the system, the IOMMU group is expected to be added in (C). (A) fails to bind the IOMMU group because bus->is_added is false. (B) fails because the device doesn't have binding IOMMU table yet. bus->is_added is set to true at end of (C) and pdev->is_added is set to true at (D). pcibios_add_pci_devices() pci_scan_bridge() pci_scan_child_bus() pci_scan_slot() pci_scan_single_device() pci_scan_device() pci_device_add() pcibios_add_device() A: Ignore device_add() B: Ignore pcibios_fixup_bus() pcibios_setup_bus_devices() pcibios_setup_device() C: Hit pcibios_finish_adding_to_bus() pci_bus_add_devices() pci_bus_add_device() D: Add device If the parent PCI bus isn't involved in hotplug, the IOMMU group is expected to be bound in (B). (A) should fail as the sysfs entries aren't populated. The patch fixes the issue by reverting commit 3f28c5a and remove WARN_ON() in iommu_add_device() to allow calling the function even the specified device already has associated IOMMU group. Cc: # 3.16+ Reported-by: Thadeu Lima de Souza Cascardo Signed-off-by: Gavin Shan Acked-by: Wei Yang Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/iommu.c | 38 +++++++++++++++++-------------- arch/powerpc/platforms/powernv/pci-ioda.c | 2 +- 2 files changed, 22 insertions(+), 18 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index f84f799babb1..a10642a0d861 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1120,37 +1120,41 @@ EXPORT_SYMBOL_GPL(iommu_release_ownership); int iommu_add_device(struct device *dev) { struct iommu_table *tbl; - int ret = 0; - if (WARN_ON(dev->iommu_group)) { - pr_warn("iommu_tce: device %s is already in iommu group %d, skipping\n", - dev_name(dev), - iommu_group_id(dev->iommu_group)); + /* + * The sysfs entries should be populated before + * binding IOMMU group. If sysfs entries isn't + * ready, we simply bail. + */ + if (!device_is_registered(dev)) + return -ENOENT; + + if (dev->iommu_group) { + pr_debug("%s: Skipping device %s with iommu group %d\n", + __func__, dev_name(dev), + iommu_group_id(dev->iommu_group)); return -EBUSY; } tbl = get_iommu_table_base(dev); if (!tbl || !tbl->it_group) { - pr_debug("iommu_tce: skipping device %s with no tbl\n", - dev_name(dev)); + pr_debug("%s: Skipping device %s with no tbl\n", + __func__, dev_name(dev)); return 0; } - pr_debug("iommu_tce: adding %s to iommu group %d\n", - dev_name(dev), iommu_group_id(tbl->it_group)); + pr_debug("%s: Adding %s to iommu group %d\n", + __func__, dev_name(dev), + iommu_group_id(tbl->it_group)); if (PAGE_SIZE < IOMMU_PAGE_SIZE(tbl)) { - pr_err("iommu_tce: unsupported iommu page size."); - pr_err("%s has not been added\n", dev_name(dev)); + pr_err("%s: Invalid IOMMU page size %lx (%lx) on %s\n", + __func__, IOMMU_PAGE_SIZE(tbl), + PAGE_SIZE, dev_name(dev)); return -EINVAL; } - ret = iommu_group_add_device(tbl->it_group, dev); - if (ret < 0) - pr_err("iommu_tce: %s has not been added, ret=%d\n", - dev_name(dev), ret); - - return ret; + return iommu_group_add_device(tbl->it_group, dev); } EXPORT_SYMBOL_GPL(iommu_add_device); diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index b136108ddc99..df241b11d4f7 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -857,7 +857,7 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev pe = &phb->ioda.pe_array[pdn->pe_number]; WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops); - set_iommu_table_base(&pdev->dev, &pe->tce32_table); + set_iommu_table_base_and_group(&pdev->dev, &pe->tce32_table); } static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb, -- cgit v1.2.3 From 97b3be1e9404f4c1645f3286428abd4d2052041e Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Wed, 6 Aug 2014 17:03:09 +1000 Subject: powerpc/ppc476: Disable BTAC This patch disables the branch target address CAM which under specific circumstances may cause the processor to skip execution of 1-4 instructions. This fixes IBM Erratum #47. Signed-off-by: Alistair Popple Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/head_44x.S | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S index c334f53453f7..b5061abbd2e0 100644 --- a/arch/powerpc/kernel/head_44x.S +++ b/arch/powerpc/kernel/head_44x.S @@ -1210,10 +1210,12 @@ clear_utlb_entry: /* We configure icbi to invalidate 128 bytes at a time since the * current 32-bit kernel code isn't too happy with icache != dcache - * block size + * block size. We also disable the BTAC as this can cause errors + * in some circumstances (see IBM Erratum 47). */ mfspr r3,SPRN_CCR0 oris r3,r3,0x0020 + ori r3,r3,0x0040 mtspr SPRN_CCR0,r3 isync -- cgit v1.2.3 From 2fabf084b6ad6337675d700b159a6091023544f2 Mon Sep 17 00:00:00 2001 From: Nishanth Aravamudan Date: Thu, 17 Jul 2014 16:15:12 -0700 Subject: powerpc: reorder per-cpu NUMA information's initialization There is an issue currently where NUMA information is used on powerpc (and possibly ia64) before it has been read from the device-tree, which leads to large slab consumption with CONFIG_SLUB and memoryless nodes. NUMA powerpc non-boot CPU's cpu_to_node/cpu_to_mem is only accurate after start_secondary(), similar to ia64, which is invoked via smp_init(). Commit 6ee0578b4daae ("workqueue: mark init_workqueues() as early_initcall()") made init_workqueues() be invoked via do_pre_smp_initcalls(), which is obviously before the secondary processors are online. Additionally, the following commits changed init_workqueues() to use cpu_to_node to determine the node to use for kthread_create_on_node: bce903809ab3f ("workqueue: add wq_numa_tbl_len and wq_numa_possible_cpumask[]") f3f90ad469342 ("workqueue: determine NUMA node of workers accourding to the allowed cpumask") Therefore, when init_workqueues() runs, it sees all CPUs as being on Node 0. On LPARs or KVM guests where Node 0 is memoryless, this leads to a high number of slab deactivations (http://www.spinics.net/lists/linux-mm/msg67489.html). Fix this by initializing the powerpc-specific CPU<->node/local memory node mapping as early as possible, which on powerpc is do_init_bootmem(). Currently that function initializes the mapping for the boot CPU, but we extend it to setup the mapping for all possible CPUs. Then, in smp_prepare_cpus(), we can correspondingly set the per-cpu values for all possible CPUs. That ensures that before the early_initcalls run (and really as early as possible), the per-cpu NUMA mapping is accurate. While testing memoryless nodes on PowerKVM guests with a fix to the workqueue logic to use cpu_to_mem() instead of cpu_to_node(), with a guest topology of: available: 2 nodes (0-1) node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 node 0 size: 0 MB node 0 free: 0 MB node 1 cpus: 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 node 1 size: 16336 MB node 1 free: 15329 MB node distances: node 0 1 0: 10 40 1: 40 10 the slab consumption decreases from Slab: 932416 kB SUnreclaim: 902336 kB to Slab: 395264 kB SUnreclaim: 359424 kB And we a corresponding increase in the slab efficiency from slab mem objs slabs used active active ------------------------------------------------------------ kmalloc-16384 337 MB 11.28% 100.00% task_struct 288 MB 9.93% 100.00% to slab mem objs slabs used active active ------------------------------------------------------------ kmalloc-16384 37 MB 100.00% 100.00% task_struct 31 MB 100.00% 100.00% Powerpc didn't support memoryless nodes until recently (64bb80d87f01 "powerpc/numa: Enable CONFIG_HAVE_MEMORYLESS_NODES" and 8c272261194d "powerpc/numa: Enable USE_PERCPU_NUMA_NODE_ID"). Those commits also helped improve memory consumption with these kind of environments. Signed-off-by: Nishanth Aravamudan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/smp.c | 11 +++++------ arch/powerpc/mm/numa.c | 13 ++++++++++--- 2 files changed, 15 insertions(+), 9 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 1007fb802e6b..a0738af4aba6 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -376,6 +376,11 @@ void __init smp_prepare_cpus(unsigned int max_cpus) GFP_KERNEL, cpu_to_node(cpu)); zalloc_cpumask_var_node(&per_cpu(cpu_core_map, cpu), GFP_KERNEL, cpu_to_node(cpu)); + /* + * numa_node_id() works after this. + */ + set_cpu_numa_node(cpu, numa_cpu_lookup_table[cpu]); + set_cpu_numa_mem(cpu, local_memory_node(numa_cpu_lookup_table[cpu])); } cpumask_set_cpu(boot_cpuid, cpu_sibling_mask(boot_cpuid)); @@ -723,12 +728,6 @@ void start_secondary(void *unused) } traverse_core_siblings(cpu, true); - /* - * numa_node_id() works after this. - */ - set_numa_node(numa_cpu_lookup_table[cpu]); - set_numa_mem(local_memory_node(numa_cpu_lookup_table[cpu])); - smp_wmb(); notify_cpu_starting(cpu); set_cpu_online(cpu, true); diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index d3e9a78eaed3..d7737a542fd7 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1049,7 +1049,7 @@ static void __init mark_reserved_regions_for_nid(int nid) void __init do_init_bootmem(void) { - int nid; + int nid, cpu; min_low_pfn = 0; max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT; @@ -1122,8 +1122,15 @@ void __init do_init_bootmem(void) reset_numa_cpu_lookup_table(); register_cpu_notifier(&ppc64_numa_nb); - cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE, - (void *)(unsigned long)boot_cpuid); + /* + * We need the numa_cpu_lookup_table to be accurate for all CPUs, + * even before we online them, so that we can use cpu_to_{node,mem} + * early in boot, cf. smp_prepare_cpus(). + */ + for_each_possible_cpu(cpu) { + cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE, + (void *)(unsigned long)cpu); + } } void __init paging_init(void) -- cgit v1.2.3