diff options
author | Ingo Molnar <mingo@kernel.org> | 2014-07-28 10:00:33 +0200 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2014-07-28 10:00:33 +0200 |
commit | 5030c69755416d19516c0a61cd988a0e0062e041 (patch) | |
tree | ca4d3088428cf033420f9df9111d796650bc59e5 /arch/x86 | |
parent | 2336ebc32676df5b794acfe0c980583ec6c05f34 (diff) | |
parent | 64aa90f26c06e1cb2aacfb98a7d0eccfbd6c1a91 (diff) |
Merge tag 'v3.16-rc7' into perf/core, to merge in the latest fixes before applying new changes
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'arch/x86')
34 files changed, 540 insertions, 180 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a8f749ef0fdc..d24887b645dc 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -131,6 +131,7 @@ config X86 select HAVE_CC_STACKPROTECTOR select GENERIC_CPU_AUTOPROBE select HAVE_ARCH_AUDITSYSCALL + select ARCH_SUPPORTS_ATOMIC_RMW config INSTRUCTION_DECODER def_bool y diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 84c223479e3c..7a6d43a554d7 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -91,10 +91,9 @@ bs_die: .section ".bsdata", "a" bugger_off_msg: - .ascii "Direct floppy boot is not supported. " - .ascii "Use a boot loader program instead.\r\n" + .ascii "Use a boot loader.\r\n" .ascii "\n" - .ascii "Remove disk and press any key to reboot ...\r\n" + .ascii "Remove disk and press any key to reboot...\r\n" .byte 0 #ifdef CONFIG_EFI_STUB @@ -108,7 +107,7 @@ coff_header: #else .word 0x8664 # x86-64 #endif - .word 3 # nr_sections + .word 4 # nr_sections .long 0 # TimeDateStamp .long 0 # PointerToSymbolTable .long 1 # NumberOfSymbols @@ -250,6 +249,25 @@ section_table: .word 0 # NumberOfLineNumbers .long 0x60500020 # Characteristics (section flags) + # + # The offset & size fields are filled in by build.c. + # + .ascii ".bss" + .byte 0 + .byte 0 + .byte 0 + .byte 0 + .long 0 + .long 0x0 + .long 0 # Size of initialized data + # on disk + .long 0x0 + .long 0 # PointerToRelocations + .long 0 # PointerToLineNumbers + .word 0 # NumberOfRelocations + .word 0 # NumberOfLineNumbers + .long 0xc8000080 # Characteristics (section flags) + #endif /* CONFIG_EFI_STUB */ # Kernel attributes; used by setup. This is part 1 of the diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c index 1a2f2121cada..a7661c430cd9 100644 --- a/arch/x86/boot/tools/build.c +++ b/arch/x86/boot/tools/build.c @@ -143,7 +143,7 @@ static void usage(void) #ifdef CONFIG_EFI_STUB -static void update_pecoff_section_header(char *section_name, u32 offset, u32 size) +static void update_pecoff_section_header_fields(char *section_name, u32 vma, u32 size, u32 datasz, u32 offset) { unsigned int pe_header; unsigned short num_sections; @@ -164,10 +164,10 @@ static void update_pecoff_section_header(char *section_name, u32 offset, u32 siz put_unaligned_le32(size, section + 0x8); /* section header vma field */ - put_unaligned_le32(offset, section + 0xc); + put_unaligned_le32(vma, section + 0xc); /* section header 'size of initialised data' field */ - put_unaligned_le32(size, section + 0x10); + put_unaligned_le32(datasz, section + 0x10); /* section header 'file offset' field */ put_unaligned_le32(offset, section + 0x14); @@ -179,6 +179,11 @@ static void update_pecoff_section_header(char *section_name, u32 offset, u32 siz } } +static void update_pecoff_section_header(char *section_name, u32 offset, u32 size) +{ + update_pecoff_section_header_fields(section_name, offset, size, size, offset); +} + static void update_pecoff_setup_and_reloc(unsigned int size) { u32 setup_offset = 0x200; @@ -203,9 +208,6 @@ static void update_pecoff_text(unsigned int text_start, unsigned int file_sz) pe_header = get_unaligned_le32(&buf[0x3c]); - /* Size of image */ - put_unaligned_le32(file_sz, &buf[pe_header + 0x50]); - /* * Size of code: Subtract the size of the first sector (512 bytes) * which includes the header. @@ -220,6 +222,22 @@ static void update_pecoff_text(unsigned int text_start, unsigned int file_sz) update_pecoff_section_header(".text", text_start, text_sz); } +static void update_pecoff_bss(unsigned int file_sz, unsigned int init_sz) +{ + unsigned int pe_header; + unsigned int bss_sz = init_sz - file_sz; + + pe_header = get_unaligned_le32(&buf[0x3c]); + + /* Size of uninitialized data */ + put_unaligned_le32(bss_sz, &buf[pe_header + 0x24]); + + /* Size of image */ + put_unaligned_le32(init_sz, &buf[pe_header + 0x50]); + + update_pecoff_section_header_fields(".bss", file_sz, bss_sz, 0, 0); +} + static int reserve_pecoff_reloc_section(int c) { /* Reserve 0x20 bytes for .reloc section */ @@ -259,6 +277,8 @@ static void efi_stub_entry_update(void) static inline void update_pecoff_setup_and_reloc(unsigned int size) {} static inline void update_pecoff_text(unsigned int text_start, unsigned int file_sz) {} +static inline void update_pecoff_bss(unsigned int file_sz, + unsigned int init_sz) {} static inline void efi_stub_defaults(void) {} static inline void efi_stub_entry_update(void) {} @@ -310,7 +330,7 @@ static void parse_zoffset(char *fname) int main(int argc, char ** argv) { - unsigned int i, sz, setup_sectors; + unsigned int i, sz, setup_sectors, init_sz; int c; u32 sys_size; struct stat sb; @@ -376,7 +396,9 @@ int main(int argc, char ** argv) buf[0x1f1] = setup_sectors-1; put_unaligned_le32(sys_size, &buf[0x1f4]); - update_pecoff_text(setup_sectors * 512, sz + i + ((sys_size * 16) - sz)); + update_pecoff_text(setup_sectors * 512, i + (sys_size * 16)); + init_sz = get_unaligned_le32(&buf[0x260]); + update_pecoff_bss(i + (sys_size * 16), init_sz); efi_stub_entry_update(); diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c index f30cd10293f0..8626b03e83b7 100644 --- a/arch/x86/crypto/sha512_ssse3_glue.c +++ b/arch/x86/crypto/sha512_ssse3_glue.c @@ -141,7 +141,7 @@ static int sha512_ssse3_final(struct shash_desc *desc, u8 *out) /* save number of bits */ bits[1] = cpu_to_be64(sctx->count[0] << 3); - bits[0] = cpu_to_be64(sctx->count[1] << 3) | sctx->count[0] >> 61; + bits[0] = cpu_to_be64(sctx->count[1] << 3 | sctx->count[0] >> 61); /* Pad out to 112 mod 128 and append length */ index = sctx->count[0] & 0x7f; diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index cb6cfcd034cf..a80cbb88ea91 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -43,7 +43,7 @@ extern int vector_used_by_percpu_irq(unsigned int vector); extern void init_ISA_irqs(void); #ifdef CONFIG_X86_LOCAL_APIC -void arch_trigger_all_cpu_backtrace(void); +void arch_trigger_all_cpu_backtrace(bool); #define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace #endif diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 49314155b66c..49205d01b9ad 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -95,7 +95,7 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) #define KVM_REFILL_PAGES 25 #define KVM_MAX_CPUID_ENTRIES 80 #define KVM_NR_FIXED_MTRR_REGION 88 -#define KVM_NR_VAR_MTRR 8 +#define KVM_NR_VAR_MTRR 10 #define ASYNC_PF_PER_VCPU 64 @@ -461,7 +461,7 @@ struct kvm_vcpu_arch { bool nmi_injected; /* Trying to inject an NMI this entry */ struct mtrr_state_type mtrr_state; - u32 pat; + u64 pat; unsigned switch_db_regs; unsigned long db[KVM_NR_DB_REGS]; diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 14fd6fd75a19..6205f0c434db 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -231,6 +231,22 @@ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, #define ARCH_HAS_USER_SINGLE_STEP_INFO +/* + * When hitting ptrace_stop(), we cannot return using SYSRET because + * that does not restore the full CPU state, only a minimal set. The + * ptracer can change arbitrary register values, which is usually okay + * because the usual ptrace stops run off the signal delivery path which + * forces IRET; however, ptrace_event() stops happen in arbitrary places + * in the kernel and don't force IRET path. + * + * So force IRET path after a ptrace stop. + */ +#define arch_ptrace_stop_needed(code, info) \ +({ \ + set_thread_flag(TIF_NOTIFY_RESUME); \ + false; \ +}) + struct user_desc; extern int do_get_thread_area(struct task_struct *p, int idx, struct user_desc __user *info); diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index c3fcb5de5083..6a1e71bde323 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -33,31 +33,41 @@ static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; /* "in progress" flag of arch_trigger_all_cpu_backtrace */ static unsigned long backtrace_flag; -void arch_trigger_all_cpu_backtrace(void) +void arch_trigger_all_cpu_backtrace(bool include_self) { int i; + int cpu = get_cpu(); - if (test_and_set_bit(0, &backtrace_flag)) + if (test_and_set_bit(0, &backtrace_flag)) { /* * If there is already a trigger_all_cpu_backtrace() in progress * (backtrace_flag == 1), don't output double cpu dump infos. */ + put_cpu(); return; + } cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); + if (!include_self) + cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); - printk(KERN_INFO "sending NMI to all CPUs:\n"); - apic->send_IPI_all(NMI_VECTOR); + if (!cpumask_empty(to_cpumask(backtrace_mask))) { + pr_info("sending NMI to %s CPUs:\n", + (include_self ? "all" : "other")); + apic->send_IPI_mask(to_cpumask(backtrace_mask), NMI_VECTOR); + } /* Wait for up to 10 seconds for all CPUs to do the backtrace */ for (i = 0; i < 10 * 1000; i++) { if (cpumask_empty(to_cpumask(backtrace_mask))) break; mdelay(1); + touch_softlockup_watchdog(); } clear_bit(0, &backtrace_flag); smp_mb__after_atomic(); + put_cpu(); } static int diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index f3a1f04ed4cb..584874451414 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -841,7 +841,6 @@ static int apm_do_idle(void) u32 eax; u8 ret = 0; int idled = 0; - int polling; int err = 0; if (!need_resched()) { diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index a80029035bf2..f9e4fdd3b877 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -370,6 +370,17 @@ static void init_intel(struct cpuinfo_x86 *c) */ detect_extended_topology(c); + if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) { + /* + * let's use the legacy cpuid vector 0x1 and 0x4 for topology + * detection. + */ + c->x86_max_cores = intel_num_cpu_cores(c); +#ifdef CONFIG_X86_32 + detect_ht(c); +#endif + } + l2 = init_intel_cacheinfo(c); if (c->cpuid_level > 9) { unsigned eax = cpuid_eax(10); @@ -438,17 +449,6 @@ static void init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_P3); #endif - if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) { - /* - * let's use the legacy cpuid vector 0x1 and 0x4 for topology - * detection. - */ - c->x86_max_cores = intel_num_cpu_cores(c); -#ifdef CONFIG_X86_32 - detect_ht(c); -#endif - } - /* Work around errata */ srat_detect_node(c); diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index a952e9c85b6f..9c8f7394c612 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -730,6 +730,18 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c) #endif } +#ifdef CONFIG_X86_HT + /* + * If cpu_llc_id is not yet set, this means cpuid_level < 4 which in + * turns means that the only possibility is SMT (as indicated in + * cpuid1). Since cpuid2 doesn't specify shared caches, and we know + * that SMT shares all caches, we can unconditionally set cpu_llc_id to + * c->phys_proc_id. + */ + if (per_cpu(cpu_llc_id, cpu) == BAD_APICID) + per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; +#endif + c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d)); return l2; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index bb92f38153b2..9a79c8dbd8e8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -2451,6 +2451,12 @@ static __init int mcheck_init_device(void) for_each_online_cpu(i) { err = mce_device_create(i); if (err) { + /* + * Register notifier anyway (and do not unreg it) so + * that we don't leave undeleted timers, see notifier + * callback above. + */ + __register_hotcpu_notifier(&mce_cpu_notifier); cpu_notifier_register_done(); goto err_device_create; } @@ -2471,10 +2477,6 @@ static __init int mcheck_init_device(void) err_register: unregister_syscore_ops(&mce_syscore_ops); - cpu_notifier_register_begin(); - __unregister_hotcpu_notifier(&mce_cpu_notifier); - cpu_notifier_register_done(); - err_device_create: /* * We didn't keep track of which devices were created above, but diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 2bdfbff8a4f6..2879ecdaac43 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -118,6 +118,9 @@ static int x86_pmu_extra_regs(u64 config, struct perf_event *event) continue; if (event->attr.config1 & ~er->valid_mask) return -EINVAL; + /* Check if the extra msrs can be safely accessed*/ + if (!er->extra_msr_access) + return -ENXIO; reg->idx = er->idx; reg->config = event->attr.config1; diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 3b2f9bdd974b..8ade93111e03 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -295,14 +295,16 @@ struct extra_reg { u64 config_mask; u64 valid_mask; int idx; /* per_xxx->regs[] reg index */ + bool extra_msr_access; }; #define EVENT_EXTRA_REG(e, ms, m, vm, i) { \ - .event = (e), \ - .msr = (ms), \ - .config_mask = (m), \ - .valid_mask = (vm), \ - .idx = EXTRA_REG_##i, \ + .event = (e), \ + .msr = (ms), \ + .config_mask = (m), \ + .valid_mask = (vm), \ + .idx = EXTRA_REG_##i, \ + .extra_msr_access = true, \ } #define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx) \ diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index adb02aa62af5..2502d0d9d246 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1382,6 +1382,15 @@ again: intel_pmu_lbr_read(); /* + * CondChgd bit 63 doesn't mean any overflow status. Ignore + * and clear the bit. + */ + if (__test_and_clear_bit(63, (unsigned long *)&status)) { + if (!status) + goto done; + } + + /* * PEBS overflow sets bit 62 in the global status register */ if (__test_and_clear_bit(62, (unsigned long *)&status)) { @@ -2173,6 +2182,41 @@ static void intel_snb_check_microcode(void) } } +/* + * Under certain circumstances, access certain MSR may cause #GP. + * The function tests if the input MSR can be safely accessed. + */ +static bool check_msr(unsigned long msr, u64 mask) +{ + u64 val_old, val_new, val_tmp; + + /* + * Read the current value, change it and read it back to see if it + * matches, this is needed to detect certain hardware emulators + * (qemu/kvm) that don't trap on the MSR access and always return 0s. + */ + if (rdmsrl_safe(msr, &val_old)) + return false; + + /* + * Only change the bits which can be updated by wrmsrl. + */ + val_tmp = val_old ^ mask; + if (wrmsrl_safe(msr, val_tmp) || + rdmsrl_safe(msr, &val_new)) + return false; + + if (val_new != val_tmp) + return false; + + /* Here it's sure that the MSR can be safely accessed. + * Restore the old value and return. + */ + wrmsrl(msr, val_old); + + return true; +} + static __init void intel_sandybridge_quirk(void) { x86_pmu.check_microcode = intel_snb_check_microcode; @@ -2262,7 +2306,8 @@ __init int intel_pmu_init(void) union cpuid10_ebx ebx; struct event_constraint *c; unsigned int unused; - int version; + struct extra_reg *er; + int version, i; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { switch (boot_cpu_data.x86) { @@ -2465,6 +2510,9 @@ __init int intel_pmu_init(void) case 62: /* IvyBridge EP */ memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + /* dTLB-load-misses on IVB is different than SNB */ + hw_cache_event_ids[C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = 0x8108; /* DTLB_LOAD_MISSES.DEMAND_LD_MISS_CAUSES_A_WALK */ + memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); @@ -2565,6 +2613,34 @@ __init int intel_pmu_init(void) } } + /* + * Access LBR MSR may cause #GP under certain circumstances. + * E.g. KVM doesn't support LBR MSR + * Check all LBT MSR here. + * Disable LBR access if any LBR MSRs can not be accessed. + */ + if (x86_pmu.lbr_nr && !check_msr(x86_pmu.lbr_tos, 0x3UL)) + x86_pmu.lbr_nr = 0; + for (i = 0; i < x86_pmu.lbr_nr; i++) { + if (!(check_msr(x86_pmu.lbr_from + i, 0xffffUL) && + check_msr(x86_pmu.lbr_to + i, 0xffffUL))) + x86_pmu.lbr_nr = 0; + } + + /* + * Access extra MSR may cause #GP under certain circumstances. + * E.g. KVM doesn't support offcore event + * Check all extra_regs here. + */ + if (x86_pmu.extra_regs) { + for (er = x86_pmu.extra_regs; er->msr; er++) { + er->extra_msr_access = check_msr(er->msr, 0x1ffUL); + /* Disable LBR select mapping */ + if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access) + x86_pmu.lbr_sel_map = NULL; + } + } + /* Support full width counters using alternative MSR range */ if (x86_pmu.intel_cap.full_width_write) { x86_pmu.max_period = x86_pmu.cntval_mask; diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 980970cb744d..696ade311ded 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -311,9 +311,11 @@ static int alloc_bts_buffer(int cpu) if (!x86_pmu.bts) return 0; - buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL, node); - if (unlikely(!buffer)) + buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node); + if (unlikely(!buffer)) { + WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__); return -ENOMEM; + } max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE; thresh = max / 16; diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index e009f3cedf89..cfc6f9dfcd90 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -550,16 +550,16 @@ static struct extra_reg snbep_uncore_cbox_extra_regs[] = { SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x6), SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x8), SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0xc), - SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0xc), + SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0xa), + SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0xa), SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x2), SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x2), SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x2), SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x2), SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x8), SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x8), - SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0xc), - SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0xc), + SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0xa), + SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0xa), SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x2), SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x2), SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x2), @@ -1222,6 +1222,7 @@ static struct extra_reg ivt_uncore_cbox_extra_regs[] = { SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN, SNBEP_CBO_PMON_CTL_TID_EN, 0x1), SNBEP_CBO_EVENT_EXTRA_REG(0x1031, 0x10ff, 0x2), + SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4), SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0xc), SNBEP_CBO_EVENT_EXTRA_REG(0x5134, 0xffff, 0xc), @@ -1245,7 +1246,7 @@ static struct extra_reg ivt_uncore_cbox_extra_regs[] = { SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10), SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10), SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10), - SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10), SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10), SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18), SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18), diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index f0da82b8e634..0d0c9d4ab6d5 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -423,8 +423,9 @@ sysenter_past_esp: jnz sysenter_audit sysenter_do_call: cmpl $(NR_syscalls), %eax - jae syscall_badsys + jae sysenter_badsys call *sys_call_table(,%eax,4) +sysenter_after_call: movl %eax,PT_EAX(%esp) LOCKDEP_SYS_EXIT DISABLE_INTERRUPTS(CLBR_ANY) @@ -501,6 +502,7 @@ ENTRY(system_call) jae syscall_badsys syscall_call: call *sys_call_table(,%eax,4) +syscall_after_call: movl %eax,PT_EAX(%esp) # store the return value syscall_exit: LOCKDEP_SYS_EXIT @@ -674,8 +676,13 @@ syscall_fault: END(syscall_fault) syscall_badsys: - movl $-ENOSYS,PT_EAX(%esp) - jmp resume_userspace + movl $-ENOSYS,%eax + jmp syscall_after_call +END(syscall_badsys) + +sysenter_badsys: + movl $-ENOSYS,%eax + jmp sysenter_after_call END(syscall_badsys) CFI_ENDPROC diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 6afbb16e9b79..94d857fb1033 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -175,7 +175,7 @@ void init_espfix_ap(void) if (!pud_present(pud)) { pmd_p = (pmd_t *)__get_free_page(PGALLOC_GFP); pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask)); - paravirt_alloc_pud(&init_mm, __pa(pmd_p) >> PAGE_SHIFT); + paravirt_alloc_pmd(&init_mm, __pa(pmd_p) >> PAGE_SHIFT); for (n = 0; n < ESPFIX_PUD_CLONES; n++) set_pud(&pud_p[n], pud); } @@ -185,7 +185,7 @@ void init_espfix_ap(void) if (!pmd_present(pmd)) { pte_p = (pte_t *)__get_free_page(PGALLOC_GFP); pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask)); - paravirt_alloc_pmd(&init_mm, __pa(pte_p) >> PAGE_SHIFT); + paravirt_alloc_pte(&init_mm, __pa(pte_p) >> PAGE_SHIFT); for (n = 0; n < ESPFIX_PMD_CLONES; n++) set_pmd(&pmd_p[n], pmd); } @@ -193,7 +193,6 @@ void init_espfix_ap(void) pte_p = pte_offset_kernel(&pmd, addr); stack_page = (void *)__get_free_page(GFP_KERNEL); pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask)); - paravirt_alloc_pte(&init_mm, __pa(stack_page) >> PAGE_SHIFT); for (n = 0; n < ESPFIX_PTE_CLONES; n++) set_pte(&pte_p[n*PTE_STRIDE], pte); diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 7596df664901..67e6d19ef1be 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -574,6 +574,9 @@ int kprobe_int3_handler(struct pt_regs *regs) struct kprobe *p; struct kprobe_ctlblk *kcb; + if (user_mode_vm(regs)) + return 0; + addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t)); /* * We don't want to be preempted for the entire diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index a0da58db43a8..2851d63c1202 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -363,7 +363,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, /* Set up to return from userspace. */ restorer = current->mm->context.vdso + - selected_vdso32->sym___kernel_sigreturn; + selected_vdso32->sym___kernel_rt_sigreturn; if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = ksig->ka.sa.sa_restorer; put_user_ex(restorer, &frame->pretcode); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 57e5ce126d5a..ea030319b321 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -920,9 +920,9 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); if (!(freq->flags & CPUFREQ_CONST_LOOPS)) mark_tsc_unstable("cpufreq changes"); - } - set_cyc2ns_scale(tsc_khz, freq->cpu); + set_cyc2ns_scale(tsc_khz, freq->cpu); + } return 0; } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ec8366c5cfea..b5e994ad0135 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1462,6 +1462,7 @@ static void svm_get_segment(struct kvm_vcpu *vcpu, */ if (var->unusable) var->db = 0; + var->dpl = to_svm(vcpu)->vmcb->save.cpl; break; } } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f32a02578c0d..ef432f891d30 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1898,7 +1898,7 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data) if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE)) break; gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT; - if (kvm_write_guest(kvm, data, + if (kvm_write_guest(kvm, gfn << HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT, &tsc_ref, sizeof(tsc_ref))) return 1; mark_page_dirty(kvm, gfn); @@ -5887,6 +5887,18 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) kvm_x86_ops->set_nmi(vcpu); } } else if (kvm_cpu_has_injectable_intr(vcpu)) { + /* + * Because interrupts can be injected asynchronously, we are + * calling check_nested_events again here to avoid a race condition. + * See https://lkml.org/lkml/2014/7/2/60 for discussion about this + * proposal and current concerns. Perhaps we should be setting + * KVM_REQ_EVENT only on certain events and not unconditionally? + */ + if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) { + r = kvm_x86_ops->check_nested_events(vcpu, req_int_win); + if (r != 0) + return r; + } if (kvm_x86_ops->interrupt_allowed(vcpu)) { kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false); diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index 3c0809a0631f..61b04fe36e66 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile @@ -11,7 +11,6 @@ VDSO32-$(CONFIG_COMPAT) := y # files to link into the vdso vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o vdso-fakesections.o -vobjs-nox32 := vdso-fakesections.o # files to link into kernel obj-y += vma.o @@ -67,7 +66,8 @@ $(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso2c FORCE # CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \ $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \ - -fno-omit-frame-pointer -foptimize-sibling-calls + -fno-omit-frame-pointer -foptimize-sibling-calls \ + -DDISABLE_BRANCH_PROFILING $(vobjs): KBUILD_CFLAGS += $(CFL) @@ -134,7 +134,7 @@ override obj-dirs = $(dir $(obj)) $(obj)/vdso32/ targets += vdso32/vdso32.lds targets += vdso32/note.o vdso32/vclock_gettime.o $(vdso32.so-y:%=vdso32/%.o) -targets += vdso32/vclock_gettime.o +targets += vdso32/vclock_gettime.o vdso32/vdso-fakesections.o $(obj)/vdso32.o: $(vdso32-images:%=$(obj)/%) @@ -150,11 +150,13 @@ KBUILD_CFLAGS_32 += -m32 -msoft-float -mregparm=0 -fpic KBUILD_CFLAGS_32 += $(call cc-option, -fno-stack-protector) KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls) KBUILD_CFLAGS_32 += -fno-omit-frame-pointer +KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING $(vdso32-images:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) $(vdso32-images:%=$(obj)/%.dbg): $(obj)/vdso32-%.so.dbg: FORCE \ $(obj)/vdso32/vdso32.lds \ $(obj)/vdso32/vclock_gettime.o \ + $(obj)/vdso32/vdso-fakesections.o \ $(obj)/vdso32/note.o \ $(obj)/vdso32/%.o $(call if_changed,vdso) @@ -169,14 +171,24 @@ quiet_cmd_vdso = VDSO $@ sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@' VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) \ - -Wl,-Bsymbolic $(LTO_CFLAGS) + $(call cc-ldoption, -Wl$(comma)--build-id) -Wl,-Bsymbolic $(LTO_CFLAGS) GCOV_PROFILE := n # -# Install the unstripped copies of vdso*.so. +# Install the unstripped copies of vdso*.so. If our toolchain supports +# build-id, install .build-id links as well. # quiet_cmd_vdso_install = INSTALL $(@:install_%=%) - cmd_vdso_install = cp $< $(MODLIB)/vdso/$(@:install_%=%) +define cmd_vdso_install + cp $< "$(MODLIB)/vdso/$(@:install_%=%)"; \ + if readelf -n $< |grep -q 'Build ID'; then \ + buildid=`readelf -n $< |grep 'Build ID' |sed -e 's/^.*Build ID: \(.*\)$$/\1/'`; \ + first=`echo $$buildid | cut -b-2`; \ + last=`echo $$buildid | cut -b3-`; \ + mkdir -p "$(MODLIB)/vdso/.build-id/$$first"; \ + ln -sf "../../$(@:install_%=%)" "$(MODLIB)/vdso/.build-id/$$first/$$last.debug"; \ + fi +endef vdso_img_insttargets := $(vdso_img_sodbg:%.dbg=install_%) diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index b2e4f493e5b0..9793322751e0 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -11,9 +11,6 @@ * Check with readelf after changing. */ -/* Disable profiling for userspace code: */ -#define DISABLE_BRANCH_PROFILING - #include <uapi/linux/time.h> #include <asm/vgtod.h> #include <asm/hpet.h> diff --git a/arch/x86/vdso/vdso-fakesections.c b/arch/x86/vdso/vdso-fakesections.c index cb8a8d72c24b..aa5fbfab20a5 100644 --- a/arch/x86/vdso/vdso-fakesections.c +++ b/arch/x86/vdso/vdso-fakesections.c @@ -2,31 +2,20 @@ * Copyright 2014 Andy Lutomirski * Subject to the GNU Public License, v.2 * - * Hack to keep broken Go programs working. - * - * The Go runtime had a couple of bugs: it would read the section table to try - * to figure out how many dynamic symbols there were (it shouldn't have looked - * at the section table at all) and, if there were no SHT_SYNDYM section table - * entry, it would use an uninitialized value for the number of symbols. As a - * workaround, we supply a minimal section table. vdso2c will adjust the - * in-memory image so that "vdso_fake_sections" becomes the section table. - * - * The bug was introduced by: - * https://code.google.com/p/go/source/detail?r=56ea40aac72b (2012-08-31) - * and is being addressed in the Go runtime in this issue: - * https://code.google.com/p/go/issues/detail?id=8197 + * String table for loadable section headers. See vdso2c.h for why + * this exists. */ -#ifndef __x86_64__ -#error This hack is specific to the 64-bit vDSO -#endif - -#include <linux/elf.h> - -extern const __visible struct elf64_shdr vdso_fake_sections[]; -const __visible struct elf64_shdr vdso_fake_sections[] = { - { - .sh_type = SHT_DYNSYM, - .sh_entsize = sizeof(Elf64_Sym), - } -}; +const char fake_shstrtab[] __attribute__((section(".fake_shstrtab"))) = + ".hash\0" + ".dynsym\0" + ".dynstr\0" + ".gnu.version\0" + ".gnu.version_d\0" + ".dynamic\0" + ".rodata\0" + ".fake_shstrtab\0" /* Yay, self-referential code. */ + ".note\0" + ".eh_frame_hdr\0" + ".eh_frame\0" + ".text"; diff --git a/arch/x86/vdso/vdso-layout.lds.S b/arch/x86/vdso/vdso-layout.lds.S index 2ec72f651ebf..9197544eea9a 100644 --- a/arch/x86/vdso/vdso-layout.lds.S +++ b/arch/x86/vdso/vdso-layout.lds.S @@ -6,6 +6,16 @@ * This script controls its layout. */ +#if defined(BUILD_VDSO64) +# define SHDR_SIZE 64 +#elif defined(BUILD_VDSO32) || defined(BUILD_VDSOX32) +# define SHDR_SIZE 40 +#else +# error unknown VDSO target +#endif + +#define NUM_FAKE_SHDRS 13 + SECTIONS { . = SIZEOF_HEADERS; @@ -18,36 +28,53 @@ SECTIONS .gnu.version_d : { *(.gnu.version_d) } .gnu.version_r : { *(.gnu.version_r) } + .dynamic : { *(.dynamic) } :text :dynamic + + .rodata : { + *(.rodata*) + *(.data*) + *(.sdata*) + *(.got.plt) *(.got) + *(.gnu.linkonce.d.*) + *(.bss*) + *(.dynbss*) + *(.gnu.linkonce.b.*) + + /* + * Ideally this would live in a C file, but that won't + * work cleanly for x32 until we start building the x32 + * C code using an x32 toolchain. + */ + VDSO_FAKE_SECTION_TABLE_START = .; + . = . + NUM_FAKE_SHDRS * SHDR_SIZE; + VDSO_FAKE_SECTION_TABLE_END = .; + } :text + + .fake_shstrtab : { *(.fake_shstrtab) } :text + + .note : { *(.note.*) } :text :note .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr .eh_frame : { KEEP (*(.eh_frame)) } :text - .dynamic : { *(.dynamic) } :text :dynamic - - .rodata : { *(.rodata*) } :text - .data : { - *(.data*) - *(.sdata*) - *(.got.plt) *(.got) - *(.gnu.linkonce.d.*) - *(.bss*) - *(.dynbss*) - *(.gnu.linkonce.b.*) - } - - .altinstructions : { *(.altinstructions) } - .altinstr_replacement : { *(.altinstr_replacement) } /* - * Align the actual code well away from the non-instruction data. - * This is the best thing for the I-cache. + * Text is well-separated from actual data: there's plenty of + * stuff that isn't used at runtime in between. */ - . = ALIGN(0x100); .text : { *(.text*) } :text =0x90909090, /* + * At the end so that eu-elflint stays happy when vdso2c strips + * these. A better implementation would avoid allocating space + * for these. + */ + .altinstructions : { *(.altinstructions) } :text + .altinstr_replacement : { *(.altinstr_replacement) } :text + + /* * The remainder of the vDSO consists of special pages that are * shared between the kernel and userspace. It needs to be at the * end so that it doesn't overlap the mapping of the actual @@ -75,6 +102,7 @@ SECTIONS /DISCARD/ : { *(.discard) *(.discard.*) + *(__bug_table) } } diff --git a/arch/x86/vdso/vdso.lds.S b/arch/x86/vdso/vdso.lds.S index 75e3404c83b1..6807932643c2 100644 --- a/arch/x86/vdso/vdso.lds.S +++ b/arch/x86/vdso/vdso.lds.S @@ -6,6 +6,8 @@ * the DSO. */ +#define BUILD_VDSO64 + #include "vdso-layout.lds.S" /* diff --git a/arch/x86/vdso/vdso2c.c b/arch/x86/vdso/vdso2c.c index 7a6bf50f9165..238dbe82776e 100644 --- a/arch/x86/vdso/vdso2c.c +++ b/arch/x86/vdso/vdso2c.c @@ -23,6 +23,8 @@ enum { sym_vvar_page, sym_hpet_page, sym_end_mapping, + sym_VDSO_FAKE_SECTION_TABLE_START, + sym_VDSO_FAKE_SECTION_TABLE_END, }; const int special_pages[] = { @@ -30,15 +32,26 @@ const int special_pages[] = { sym_hpet_page, }; -char const * const required_syms[] = { - [sym_vvar_page] = "vvar_page", - [sym_hpet_page] = "hpet_page", - [sym_end_mapping] = "end_mapping", - "VDSO32_NOTE_MASK", - "VDSO32_SYSENTER_RETURN", - "__kernel_vsyscall", - "__kernel_sigreturn", - "__kernel_rt_sigreturn", +struct vdso_sym { + const char *name; + bool export; +}; + +struct vdso_sym required_syms[] = { + [sym_vvar_page] = {"vvar_page", true}, + [sym_hpet_page] = {"hpet_page", true}, + [sym_end_mapping] = {"end_mapping", true}, + [sym_VDSO_FAKE_SECTION_TABLE_START] = { + "VDSO_FAKE_SECTION_TABLE_START", false + }, + [sym_VDSO_FAKE_SECTION_TABLE_END] = { + "VDSO_FAKE_SECTION_TABLE_END", false + }, + {"VDSO32_NOTE_MASK", true}, + {"VDSO32_SYSENTER_RETURN", true}, + {"__kernel_vsyscall", true}, + {"__kernel_sigreturn", true}, + {"__kernel_rt_sigreturn", true}, }; __attribute__((format(printf, 1, 2))) __attribute__((noreturn)) @@ -83,37 +96,21 @@ extern void bad_put_le(void); #define NSYMS (sizeof(required_syms) / sizeof(required_syms[0])) -#define BITS 64 -#define GOFUNC go64 -#define Elf_Ehdr Elf64_Ehdr -#define Elf_Shdr Elf64_Shdr -#define Elf_Phdr Elf64_Phdr -#define Elf_Sym Elf64_Sym -#define Elf_Dyn Elf64_Dyn +#define BITSFUNC3(name, bits) name##bits +#define BITSFUNC2(name, bits) BITSFUNC3(name, bits) +#define BITSFUNC(name) BITSFUNC2(name, ELF_BITS) + +#define ELF_BITS_XFORM2(bits, x) Elf##bits##_##x +#define ELF_BITS_XFORM(bits, x) ELF_BITS_XFORM2(bits, x) +#define ELF(x) ELF_BITS_XFORM(ELF_BITS, x) + +#define ELF_BITS 64 #include "vdso2c.h" -#undef BITS -#undef GOFUNC -#undef Elf_Ehdr -#undef Elf_Shdr -#undef Elf_Phdr -#undef Elf_Sym -#undef Elf_Dyn - -#define BITS 32 -#define GOFUNC go32 -#define Elf_Ehdr Elf32_Ehdr -#define Elf_Shdr Elf32_Shdr -#define Elf_Phdr Elf32_Phdr -#define Elf_Sym Elf32_Sym -#define Elf_Dyn Elf32_Dyn +#undef ELF_BITS + +#define ELF_BITS 32 #include "vdso2c.h" -#undef BITS -#undef GOFUNC -#undef Elf_Ehdr -#undef Elf_Shdr -#undef Elf_Phdr -#undef Elf_Sym -#undef Elf_Dyn +#undef ELF_BITS static void go(void *addr, size_t len, FILE *outfile, const char *name) { diff --git a/arch/x86/vdso/vdso2c.h b/arch/x86/vdso/vdso2c.h index c6eefaf389b9..11b65d4f9414 100644 --- a/arch/x86/vdso/vdso2c.h +++ b/arch/x86/vdso/vdso2c.h @@ -4,23 +4,139 @@ * are built for 32-bit userspace. */ -static void GOFUNC(void *addr, size_t len, FILE *outfile, const char *name) +/* + * We're writing a section table for a few reasons: + * + * The Go runtime had a couple of bugs: it would read the section + * table to try to figure out how many dynamic symbols there were (it + * shouldn't have looked at the section table at all) and, if there + * were no SHT_SYNDYM section table entry, it would use an + * uninitialized value for the number of symbols. An empty DYNSYM + * table would work, but I see no reason not to write a valid one (and + * keep full performance for old Go programs). This hack is only + * needed on x86_64. + * + * The bug was introduced on 2012-08-31 by: + * https://code.google.com/p/go/source/detail?r=56ea40aac72b + * and was fixed on 2014-06-13 by: + * https://code.google.com/p/go/source/detail?r=fc1cd5e12595 + * + * Binutils has issues debugging the vDSO: it reads the section table to + * find SHT_NOTE; it won't look at PT_NOTE for the in-memory vDSO, which + * would break build-id if we removed the section table. Binutils + * also requires that shstrndx != 0. See: + * https://sourceware.org/bugzilla/show_bug.cgi?id=17064 + * + * elfutils might not look for PT_NOTE if there is a section table at + * all. I don't know whether this matters for any practical purpose. + * + * For simplicity, rather than hacking up a partial section table, we + * just write a mostly complete one. We omit non-dynamic symbols, + * though, since they're rather large. + * + * Once binutils gets fixed, we might be able to drop this for all but + * the 64-bit vdso, since build-id only works in kernel RPMs, and + * systems that update to new enough kernel RPMs will likely update + * binutils in sync. build-id has never worked for home-built kernel + * RPMs without manual symlinking, and I suspect that no one ever does + * that. + */ +struct BITSFUNC(fake_sections) +{ + ELF(Shdr) *table; + unsigned long table_offset; + int count, max_count; + + int in_shstrndx; + unsigned long shstr_offset; + const char *shstrtab; + size_t shstrtab_len; + + int out_shstrndx; +}; + +static unsigned int BITSFUNC(find_shname)(struct BITSFUNC(fake_sections) *out, + const char *name) +{ + const char *outname = out->shstrtab; + while (outname - out->shstrtab < out->shstrtab_len) { + if (!strcmp(name, outname)) + return (outname - out->shstrtab) + out->shstr_offset; + outname += strlen(outname) + 1; + } + + if (*name) + printf("Warning: could not find output name \"%s\"\n", name); + return out->shstr_offset + out->shstrtab_len - 1; /* Use a null. */ +} + +static void BITSFUNC(init_sections)(struct BITSFUNC(fake_sections) *out) +{ + if (!out->in_shstrndx) + fail("didn't find the fake shstrndx\n"); + + memset(out->table, 0, out->max_count * sizeof(ELF(Shdr))); + + if (out->max_count < 1) + fail("we need at least two fake output sections\n"); + + PUT_LE(&out->table[0].sh_type, SHT_NULL); + PUT_LE(&out->table[0].sh_name, BITSFUNC(find_shname)(out, "")); + + out->count = 1; +} + +static void BITSFUNC(copy_section)(struct BITSFUNC(fake_sections) *out, + int in_idx, const ELF(Shdr) *in, + const char *name) +{ + uint64_t flags = GET_LE(&in->sh_flags); + + bool copy = flags & SHF_ALLOC && + (GET_LE(&in->sh_size) || + (GET_LE(&in->sh_type) != SHT_RELA && + GET_LE(&in->sh_type) != SHT_REL)) && + strcmp(name, ".altinstructions") && + strcmp(name, ".altinstr_replacement"); + + if (!copy) + return; + + if (out->count >= out->max_count) + fail("too many copied sections (max = %d)\n", out->max_count); + + if (in_idx == out->in_shstrndx) + out->out_shstrndx = out->count; + + out->table[out->count] = *in; + PUT_LE(&out->table[out->count].sh_name, + BITSFUNC(find_shname)(out, name)); + + /* elfutils requires that a strtab have the correct type. */ + if (!strcmp(name, ".fake_shstrtab")) + PUT_LE(&out->table[out->count].sh_type, SHT_STRTAB); + + out->count++; +} + +static void BITSFUNC(go)(void *addr, size_t len, + FILE *outfile, const char *name) { int found_load = 0; unsigned long load_size = -1; /* Work around bogus warning */ unsigned long data_size; - Elf_Ehdr *hdr = (Elf_Ehdr *)addr; + ELF(Ehdr) *hdr = (ELF(Ehdr) *)addr; int i; unsigned long j; - Elf_Shdr *symtab_hdr = NULL, *strtab_hdr, *secstrings_hdr, + ELF(Shdr) *symtab_hdr = NULL, *strtab_hdr, *secstrings_hdr, *alt_sec = NULL; - Elf_Dyn *dyn = 0, *dyn_end = 0; + ELF(Dyn) *dyn = 0, *dyn_end = 0; const char *secstrings; uint64_t syms[NSYMS] = {}; - uint64_t fake_sections_value = 0, fake_sections_size = 0; + struct BITSFUNC(fake_sections) fake_sections = {}; - Elf_Phdr *pt = (Elf_Phdr *)(addr + GET_LE(&hdr->e_phoff)); + ELF(Phdr) *pt = (ELF(Phdr) *)(addr + GET_LE(&hdr->e_phoff)); /* Walk the segment table. */ for (i = 0; i < GET_LE(&hdr->e_phnum); i++) { @@ -51,7 +167,7 @@ static void GOFUNC(void *addr, size_t len, FILE *outfile, const char *name) for (i = 0; dyn + i < dyn_end && GET_LE(&dyn[i].d_tag) != DT_NULL; i++) { typeof(dyn[i].d_tag) tag = GET_LE(&dyn[i].d_tag); - if (tag == DT_REL || tag == DT_RELSZ || + if (tag == DT_REL || tag == DT_RELSZ || tag == DT_RELA || tag == DT_RELENT || tag == DT_TEXTREL) fail("vdso image contains dynamic relocations\n"); } @@ -61,7 +177,7 @@ static void GOFUNC(void *addr, size_t len, FILE *outfile, const char *name) GET_LE(&hdr->e_shentsize)*GET_LE(&hdr->e_shstrndx); secstrings = addr + GET_LE(&secstrings_hdr->sh_offset); for (i = 0; i < GET_LE(&hdr->e_shnum); i++) { - Elf_Shdr *sh = addr + GET_LE(&hdr->e_shoff) + + ELF(Shdr) *sh = addr + GET_LE(&hdr->e_shoff) + GET_LE(&hdr->e_shentsize) * i; if (GET_LE(&sh->sh_type) == SHT_SYMTAB) symtab_hdr = sh; @@ -82,29 +198,63 @@ static void GOFUNC(void *addr, size_t len, FILE *outfile, const char *name) i < GET_LE(&symtab_hdr->sh_size) / GET_LE(&symtab_hdr->sh_entsize); i++) { int k; - Elf_Sym *sym = addr + GET_LE(&symtab_hdr->sh_offset) + + ELF(Sym) *sym = addr + GET_LE(&symtab_hdr->sh_offset) + GET_LE(&symtab_hdr->sh_entsize) * i; const char *name = addr + GET_LE(&strtab_hdr->sh_offset) + GET_LE(&sym->st_name); for (k = 0; k < NSYMS; k++) { - if (!strcmp(name, required_syms[k])) { + if (!strcmp(name, required_syms[k].name)) { if (syms[k]) { fail("duplicate symbol %s\n", - required_syms[k]); + required_syms[k].name); } syms[k] = GET_LE(&sym->st_value); } } - if (!strcmp(name, "vdso_fake_sections")) { - if (fake_sections_value) - fail("duplicate vdso_fake_sections\n"); - fake_sections_value = GET_LE(&sym->st_value); - fake_sections_size = GET_LE(&sym->st_size); + if (!strcmp(name, "fake_shstrtab")) { + ELF(Shdr) *sh; + + fake_sections.in_shstrndx = GET_LE(&sym->st_shndx); + fake_sections.shstrtab = addr + GET_LE(&sym->st_value); + fake_sections.shstrtab_len = GET_LE(&sym->st_size); + sh = addr + GET_LE(&hdr->e_shoff) + + GET_LE(&hdr->e_shentsize) * + fake_sections.in_shstrndx; + fake_sections.shstr_offset = GET_LE(&sym->st_value) - + GET_LE(&sh->sh_addr); } } + /* Build the output section table. */ + if (!syms[sym_VDSO_FAKE_SECTION_TABLE_START] || + !syms[sym_VDSO_FAKE_SECTION_TABLE_END]) + fail("couldn't find fake section table\n"); + if ((syms[sym_VDSO_FAKE_SECTION_TABLE_END] - + syms[sym_VDSO_FAKE_SECTION_TABLE_START]) % sizeof(ELF(Shdr))) + fail("fake section table size isn't a multiple of sizeof(Shdr)\n"); + fake_sections.table = addr + syms[sym_VDSO_FAKE_SECTION_TABLE_START]; + fake_sections.table_offset = syms[sym_VDSO_FAKE_SECTION_TABLE_START]; + fake_sections.max_count = (syms[sym_VDSO_FAKE_SECTION_TABLE_END] - + syms[sym_VDSO_FAKE_SECTION_TABLE_START]) / + sizeof(ELF(Shdr)); + + BITSFUNC(init_sections)(&fake_sections); + for (i = 0; i < GET_LE(&hdr->e_shnum); i++) { + ELF(Shdr) *sh = addr + GET_LE(&hdr->e_shoff) + + GET_LE(&hdr->e_shentsize) * i; + BITSFUNC(copy_section)(&fake_sections, i, sh, + secstrings + GET_LE(&sh->sh_name)); + } + if (!fake_sections.out_shstrndx) + fail("didn't generate shstrndx?!?\n"); + + PUT_LE(&hdr->e_shoff, fake_sections.table_offset); + PUT_LE(&hdr->e_shentsize, sizeof(ELF(Shdr))); + PUT_LE(&hdr->e_shnum, fake_sections.count); + PUT_LE(&hdr->e_shstrndx, fake_sections.out_shstrndx); + /* Validate mapping addresses. */ for (i = 0; i < sizeof(special_pages) / sizeof(special_pages[0]); i++) { if (!syms[i]) @@ -112,25 +262,17 @@ static void GOFUNC(void *addr, size_t len, FILE *outfile, const char *name) if (syms[i] % 4096) fail("%s must be a multiple of 4096\n", - required_syms[i]); + required_syms[i].name); if (syms[i] < data_size) fail("%s must be after the text mapping\n", - required_syms[i]); + required_syms[i].name); if (syms[sym_end_mapping] < syms[i] + 4096) - fail("%s overruns end_mapping\n", required_syms[i]); + fail("%s overruns end_mapping\n", + required_syms[i].name); } if (syms[sym_end_mapping] % 4096) fail("end_mapping must be a multiple of 4096\n"); - /* Remove sections or use fakes */ - if (fake_sections_size % sizeof(Elf_Shdr)) - fail("vdso_fake_sections size is not a multiple of %ld\n", - (long)sizeof(Elf_Shdr)); - PUT_LE(&hdr->e_shoff, fake_sections_value); - PUT_LE(&hdr->e_shentsize, fake_sections_value ? sizeof(Elf_Shdr) : 0); - PUT_LE(&hdr->e_shnum, fake_sections_size / sizeof(Elf_Shdr)); - PUT_LE(&hdr->e_shstrndx, SHN_UNDEF); - if (!name) { fwrite(addr, load_size, 1, outfile); return; @@ -168,9 +310,9 @@ static void GOFUNC(void *addr, size_t len, FILE *outfile, const char *name) (unsigned long)GET_LE(&alt_sec->sh_size)); } for (i = 0; i < NSYMS; i++) { - if (syms[i]) + if (required_syms[i].export && syms[i]) fprintf(outfile, "\t.sym_%s = 0x%" PRIx64 ",\n", - required_syms[i], syms[i]); + required_syms[i].name, syms[i]); } fprintf(outfile, "};\n"); } diff --git a/arch/x86/vdso/vdso32/vdso-fakesections.c b/arch/x86/vdso/vdso32/vdso-fakesections.c new file mode 100644 index 000000000000..541468e25265 --- /dev/null +++ b/arch/x86/vdso/vdso32/vdso-fakesections.c @@ -0,0 +1 @@ +#include "../vdso-fakesections.c" diff --git a/arch/x86/vdso/vdsox32.lds.S b/arch/x86/vdso/vdsox32.lds.S index 46b991b578a8..697c11ece90c 100644 --- a/arch/x86/vdso/vdsox32.lds.S +++ b/arch/x86/vdso/vdsox32.lds.S @@ -6,6 +6,8 @@ * the DSO. */ +#define BUILD_VDSOX32 + #include "vdso-layout.lds.S" /* diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index e1513c47872a..5a5176de8d0a 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -62,6 +62,9 @@ struct linux_binprm; Only used for the 64-bit and x32 vdsos. */ static unsigned long vdso_addr(unsigned long start, unsigned len) { +#ifdef CONFIG_X86_32 + return 0; +#else unsigned long addr, end; unsigned offset; end = (start + PMD_SIZE - 1) & PMD_MASK; @@ -83,6 +86,7 @@ static unsigned long vdso_addr(unsigned long start, unsigned len) addr = align_vdso_addr(addr); return addr; +#endif } static int map_vdso(const struct vdso_image *image, bool calculate_addr) |