From: www.kernel.org
Subject: Update to 2.6.24
Patch-mainline: 2.6.24

Automatically created from "patches.kernel.org/patch-2.6.24" by xen-port-patches.py

Acked-by: jbeulich@novell.com

---
 include/asm-x86/mach-xen/asm/mmu_64.h | 31 --
 include/asm-x86/mach-xen/asm/scatterlist_32.h | 24 -
 arch/x86/Kconfig | 8
 arch/x86/Makefile | 6
 arch/x86/ia32/ia32entry-xen.S | 19 -
 arch/x86/kernel/Makefile | 2
 arch/x86/kernel/acpi/sleep_32-xen.c | 2
 arch/x86/kernel/acpi/sleep_64-xen.c | 3
 arch/x86/kernel/apic_64-xen.c | 46 ++-
 arch/x86/kernel/cpu/common-xen.c | 2
 arch/x86/kernel/e820_32-xen.c | 23 +
 arch/x86/kernel/e820_64-xen.c | 41 ++-
 arch/x86/kernel/early_printk-xen.c | 15 -
 arch/x86/kernel/entry_32-xen.S | 8
 arch/x86/kernel/entry_64-xen.S | 16 -
 arch/x86/kernel/genapic_64-xen.c | 19 +
 arch/x86/kernel/head64-xen.c | 24 -
 arch/x86/kernel/init_task-xen.c | 15 -
 arch/x86/kernel/io_apic_32-xen.c | 58 +++-
 arch/x86/kernel/io_apic_64-xen.c | 114 +++++++-
 arch/x86/kernel/ioport_32-xen.c | 2
 arch/x86/kernel/ioport_64-xen.c | 2
 arch/x86/kernel/irq_32-xen.c | 59 +++-
 arch/x86/kernel/irq_64-xen.c | 57 +++-
 arch/x86/kernel/ldt_32-xen.c | 16 -
 arch/x86/kernel/ldt_64-xen.c | 16 -
 arch/x86/kernel/mpparse_32-xen.c | 2
 arch/x86/kernel/mpparse_64-xen.c | 19 +
 arch/x86/kernel/pci-dma_32-xen.c | 42 +--
 arch/x86/kernel/pci-swiotlb_64-xen.c | 2
 arch/x86/kernel/process_32-xen.c | 71 +++--
 arch/x86/kernel/process_64-xen.c | 20 +
 arch/x86/kernel/quirks-xen.c | 352 +++++++++++++++++++++++++-
 arch/x86/kernel/setup64-xen.c | 16 -
 arch/x86/kernel/setup_32-xen.c | 139 ++++++----
 arch/x86/kernel/setup_64-xen.c | 219 ++++++++++------
 arch/x86/kernel/smp_32-xen.c | 9
 arch/x86/kernel/smp_64-xen.c | 125 +++++----
 arch/x86/kernel/time_32-xen.c | 27 +
 arch/x86/kernel/traps_32-xen.c | 105 ++-----
 arch/x86/kernel/traps_64-xen.c | 49 +--
 arch/x86/kernel/vsyscall_64-xen.c | 34 +-
 arch/x86/mm/fault_32-xen.c | 72 ++---
 arch/x86/mm/fault_64-xen.c | 71 ++---
 arch/x86/mm/init_32-xen.c | 47 +--
 arch/x86/mm/init_64-xen.c | 61 +++-
 arch/x86/mm/pageattr_64-xen.c | 18 -
 arch/x86/mm/pgtable_32-xen.c | 5
 arch/x86/pci/Makefile | 4
 arch/x86/pci/Makefile_32 | 4
 arch/x86/pci/Makefile_64 | 4
 arch/x86/pci/irq-xen.c | 47 +++
 drivers/xen/blkback/blkback.c | 5
 drivers/xen/blkfront/blkfront.c | 55 +---
 drivers/xen/core/machine_kexec.c | 4
 drivers/xen/core/smpboot.c | 53 +--
 drivers/xen/netback/loopback.c | 4
 drivers/xen/netback/netback.c | 4
 drivers/xen/netback/xenbus.c | 12
 drivers/xen/netfront/accel.c | 8
 drivers/xen/netfront/netfront.c | 26 -
 drivers/xen/netfront/netfront.h | 2
 drivers/xen/pciback/Makefile | 4
 drivers/xen/pcifront/Makefile | 4
 drivers/xen/sfc_netback/accel_fwd.c | 12
 drivers/xen/sfc_netback/accel_msg.c | 4
 drivers/xen/sfc_netfront/accel_msg.c | 23 +
 drivers/xen/sfc_netfront/accel_vi.c | 11
 drivers/xen/sfc_netutil/accel_util.h | 3
 drivers/xen/xenbus/xenbus_probe.c | 13
 drivers/xen/xenbus/xenbus_probe_backend.c | 27 -
 fs/xfs/linux-2.6/xfs_buf.c | 2
 include/asm-x86/mach-xen/asm/agp.h | 30 +-
 include/asm-x86/mach-xen/asm/desc.h | 5
 include/asm-x86/mach-xen/asm/desc_64.h | 34 ++
 include/asm-x86/mach-xen/asm/dma-mapping.h | 5
 include/asm-x86/mach-xen/asm/dma-mapping_32.h | 2
 include/asm-x86/mach-xen/asm/dma-mapping_64.h | 5
 include/asm-x86/mach-xen/asm/e820.h | 33 ++
 include/asm-x86/mach-xen/asm/e820_64.h | 20 -
 include/asm-x86/mach-xen/asm/fixmap.h | 5
 include/asm-x86/mach-xen/asm/hw_irq.h | 5
 include/asm-x86/mach-xen/asm/hw_irq_64.h | 35 +-
 include/asm-x86/mach-xen/asm/hypercall.h | 5
 include/asm-x86/mach-xen/asm/io.h | 5
include/asm-x86/mach-xen/asm/io_32.h | 28 -- include/asm-x86/mach-xen/asm/io_64.h | 6 include/asm-x86/mach-xen/asm/irq.h | 5 include/asm-x86/mach-xen/asm/irqflags.h | 5 include/asm-x86/mach-xen/asm/irqflags_32.h | 30 ++ include/asm-x86/mach-xen/asm/irqflags_64.h | 30 ++ include/asm-x86/mach-xen/asm/maddr.h | 5 include/asm-x86/mach-xen/asm/mmu.h | 28 +- include/asm-x86/mach-xen/asm/mmu_context.h | 5 include/asm-x86/mach-xen/asm/nmi.h | 7 include/asm-x86/mach-xen/asm/page.h | 13 include/asm-x86/mach-xen/asm/page_64.h | 1 include/asm-x86/mach-xen/asm/pci.h | 100 +++++++ include/asm-x86/mach-xen/asm/pci_32.h | 65 ---- include/asm-x86/mach-xen/asm/pci_64.h | 62 ---- include/asm-x86/mach-xen/asm/pgalloc.h | 5 include/asm-x86/mach-xen/asm/pgtable.h | 5 include/asm-x86/mach-xen/asm/pgtable_32.h | 7 include/asm-x86/mach-xen/asm/pgtable_64.h | 3 include/asm-x86/mach-xen/asm/processor.h | 5 include/asm-x86/mach-xen/asm/processor_32.h | 47 ++- include/asm-x86/mach-xen/asm/processor_64.h | 43 ++- include/asm-x86/mach-xen/asm/scatterlist.h | 1 include/asm-x86/mach-xen/asm/segment.h | 5 include/asm-x86/mach-xen/asm/smp.h | 5 include/asm-x86/mach-xen/asm/smp_32.h | 12 include/asm-x86/mach-xen/asm/smp_64.h | 25 + include/asm-x86/mach-xen/asm/swiotlb.h | 5 include/asm-x86/mach-xen/asm/system.h | 5 include/asm-x86/mach-xen/asm/system_32.h | 28 +- include/asm-x86/mach-xen/asm/system_64.h | 27 + include/asm-x86/mach-xen/asm/tlbflush.h | 5 include/asm-x86/mach-xen/asm/tlbflush_32.h | 7 include/asm-x86/mach-xen/asm/tlbflush_64.h | 9 include/asm-x86/mach-xen/asm/xor.h | 5 include/asm-x86/mach-xen/mach_time.h | 113 -------- include/asm-x86/mach-xen/mach_timer.h | 51 --- include/linux/sysctl.h | 1 include/xen/pcifront.h | 20 - include/xen/sysctl.h | 11 include/xen/xenbus.h | 2 kernel/kexec.c | 3 kernel/sysctl_check.c | 12 lib/swiotlb-xen.c | 35 +- mm/memory.c | 2 130 files changed, 2202 insertions(+), 1349 deletions(-) --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -48,15 +48,15 @@ config CLOCKSOURCE_WATCHDOG def_bool y - depends on !X86_XEN + depends on !X86_XEN && !X86_64_XEN config GENERIC_CLOCKEVENTS def_bool y - depends on !X86_XEN + depends on !X86_XEN && !X86_64_XEN config GENERIC_CLOCKEVENTS_BROADCAST def_bool y - depends on X86_64 || (X86_32 && X86_LOCAL_APIC && !X86_XEN) + depends on (X86_64 && !X86_64_XEN) || (X86_32 && X86_LOCAL_APIC && !X86_XEN) config LOCKDEP_SUPPORT def_bool y @@ -257,6 +257,7 @@ config X86_XEN bool "Xen-compatible" + depends on X86_32 select XEN select X86_UP_APIC if !SMP && XEN_PRIVILEGED_GUEST select X86_UP_IOAPIC if !SMP && XEN_PRIVILEGED_GUEST @@ -363,6 +364,7 @@ config X86_64_XEN bool "Enable Xen compatible kernel" + depends on X86_64 select XEN select SWIOTLB help --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -216,8 +216,8 @@ zdisk bzdisk fdimage fdimage144 fdimage288 isoimage install ifdef CONFIG_XEN -CPPFLAGS := -D__XEN_INTERFACE_VERSION__=$(CONFIG_XEN_INTERFACE_VERSION) \ - -Iinclude$(if $(KBUILD_SRC),2)/asm/mach-xen $(CPPFLAGS) +KBUILD_CPPFLAGS := -D__XEN_INTERFACE_VERSION__=$(CONFIG_XEN_INTERFACE_VERSION) \ + -Iinclude$(if $(KBUILD_SRC),2)/asm/mach-xen $(KBUILD_CPPFLAGS) ifdef CONFIG_X86_64 LDFLAGS_vmlinux := -e startup_64 @@ -231,6 +231,8 @@ vmlinuz: vmlinux $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE) + $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot + $(Q)ln -fsn ../../x86/boot/$@ $(objtree)/arch/$(UTS_MACHINE)/boot/$@ else # Default kernel to build all: bzImage --- a/arch/x86/ia32/ia32entry-xen.S +++ b/arch/x86/ia32/ia32entry-xen.S @@ -125,20 +125,16 @@ jmp 
int_ret_from_sys_call sysenter_tracesys: + xchgl %r9d,%ebp SAVE_REST CLEAR_RREGS + movq %r9,R9(%rsp) movq $-ENOSYS,RAX(%rsp) /* really needed? */ movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ RESTORE_REST - movl %ebp, %ebp - /* no need to do an access_ok check here because rbp has been - 32bit zero extended */ -1: movl (%rbp),%r9d - .section __ex_table,"a" - .quad 1b,ia32_badarg - .previous + xchgl %ebp,%r9d jmp sysenter_do_call CFI_ENDPROC ENDPROC(ia32_sysenter_target) @@ -200,20 +196,17 @@ jmp int_ret_from_sys_call cstar_tracesys: + xchgl %r9d,%ebp SAVE_REST CLEAR_RREGS + movq %r9,R9(%rsp) movq $-ENOSYS,RAX(%rsp) /* really needed? */ movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ RESTORE_REST + xchgl %ebp,%r9d movl RSP-ARGOFFSET(%rsp), %r8d - /* no need to do an access_ok check here because r8 has been - 32bit zero extended */ -1: movl (%r8),%r9d - .section __ex_table,"a" - .quad 1b,ia32_badarg - .previous jmp cstar_do_call END(ia32_cstar_target) --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -106,4 +106,4 @@ disabled-obj-$(CONFIG_XEN) := early-quirks.o hpet.o i8253.o i8259_$(BITS).o reboot.o \ smpboot_$(BITS).o tsc_$(BITS).o tsc_sync.o -%/head_64.o %/head_64.s: $(if $(CONFIG_XEN),EXTRA_AFLAGS,dummy) := +%/head_64.o %/head_64.s: asflags-$(CONFIG_XEN) := --- a/arch/x86/kernel/acpi/sleep_32-xen.c +++ b/arch/x86/kernel/acpi/sleep_32-xen.c @@ -90,7 +90,7 @@ /* Ouch, we want to delete this. We already have better version in userspace, in s2ram from suspend.sf.net project */ -static __init int reset_videomode_after_s3(struct dmi_system_id *d) +static __init int reset_videomode_after_s3(const struct dmi_system_id *d) { acpi_realmode_flags |= 2; return 0; --- a/arch/x86/kernel/acpi/sleep_64-xen.c +++ b/arch/x86/kernel/acpi/sleep_64-xen.c @@ -123,6 +123,3 @@ __setup("acpi_sleep=", acpi_sleep_setup); #endif /* CONFIG_ACPI_PV_SLEEP */ -void acpi_pci_link_exit(void) -{ -} --- a/arch/x86/kernel/apic_64-xen.c +++ b/arch/x86/kernel/apic_64-xen.c @@ -63,22 +63,38 @@ void smp_local_timer_interrupt(void) { - profile_tick(CPU_PROFILING); #ifndef CONFIG_XEN -#ifdef CONFIG_SMP - update_process_times(user_mode(get_irq_regs())); -#endif -#endif + int cpu = smp_processor_id(); + struct clock_event_device *evt = &per_cpu(lapic_events, cpu); + /* - * We take the 'long' return path, and there every subsystem - * grabs the appropriate locks (kernel lock/ irq lock). + * Normally we should not be here till LAPIC has been initialized but + * in some cases like kdump, its possible that there is a pending LAPIC + * timer interrupt from previous kernel's context and is delivered in + * new kernel the moment interrupts are enabled. * - * We might want to decouple profiling from the 'long path', - * and do the profiling totally in assembly. - * - * Currently this isn't too much of an issue (performance wise), - * we can take more than 100K local irqs per second on a 100 MHz P5. + * Interrupts are enabled early and LAPIC is setup much later, hence + * its possible that when we get here evt->event_handler is NULL. + * Check for event_handler being NULL and discard the interrupt as + * spurious. 
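[ Editor's note: illustrative sketch, not part of the patch. The comment above explains why smp_local_timer_interrupt() now checks evt->event_handler before dispatching: after a kexec/kdump, a stale LAPIC timer tick can arrive before the clock-event handler is registered. Below is a stand-alone C model of that guard; the struct and function names are stand-ins for the kernel types, not kernel API. ]

#include <stdio.h>

struct clock_event {
	void (*event_handler)(struct clock_event *evt);
};

static void lapic_timer_shutdown(struct clock_event *evt)
{
	/* would mask the timer so it stops firing (CLOCK_EVT_MODE_SHUTDOWN) */
}

static void timer_interrupt(struct clock_event *evt, int cpu)
{
	if (!evt->event_handler) {
		/* nothing registered yet: report, mask, and drop the tick */
		fprintf(stderr, "Spurious LAPIC timer interrupt on cpu %d\n", cpu);
		lapic_timer_shutdown(evt);
		return;
	}
	evt->event_handler(evt);	/* normal path once setup has run */
}

int main(void)
{
	struct clock_event evt = { .event_handler = NULL };
	timer_interrupt(&evt, 0);	/* discarded as spurious, no NULL call */
	return 0;
}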
+ */ + if (!evt->event_handler) { + printk(KERN_WARNING + "Spurious LAPIC timer interrupt on cpu %d\n", cpu); + /* Switch it off */ + lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt); + return; + } +#endif + + /* + * the NMI deadlock-detector uses this. */ + add_pda(apic_timer_irqs, 1); + +#ifndef CONFIG_XEN + evt->event_handler(evt); +#endif } /* @@ -94,11 +110,6 @@ struct pt_regs *old_regs = set_irq_regs(regs); /* - * the NMI deadlock-detector uses this. - */ - add_pda(apic_timer_irqs, 1); - - /* * NOTE! We'd better ACK the irq immediately, * because timer handling can be slow. */ @@ -132,6 +143,7 @@ if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) ack_APIC_irq(); + add_pda(irq_spurious_count, 1); irq_exit(); } --- a/arch/x86/kernel/cpu/common-xen.c +++ b/arch/x86/kernel/cpu/common-xen.c @@ -214,7 +214,7 @@ static int __init x86_fxsr_setup(char * s) { - /* Tell all the other CPU's to not use it... */ + /* Tell all the other CPUs to not use it... */ disable_x86_fxsr = 1; /* --- a/arch/x86/kernel/e820_32-xen.c +++ b/arch/x86/kernel/e820_32-xen.c @@ -52,6 +52,13 @@ .flags = IORESOURCE_BUSY | IORESOURCE_MEM }; +struct resource bss_resource = { + .name = "Kernel bss", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + static struct resource system_rom_resource = { .name = "System ROM", .start = 0xf0000, @@ -266,7 +273,9 @@ * and also for regions reported as reserved by the e820. */ static void __init -legacy_init_iomem_resources(struct resource *code_resource, struct resource *data_resource) +legacy_init_iomem_resources(struct resource *code_resource, + struct resource *data_resource, + struct resource *bss_resource) { int i; @@ -300,9 +309,11 @@ #ifndef CONFIG_XEN request_resource(res, code_resource); request_resource(res, data_resource); + request_resource(res, bss_resource); #endif #ifdef CONFIG_KEXEC - request_resource(res, &crashk_res); + if (crashk_res.start != crashk_res.end) + request_resource(res, &crashk_res); #ifdef CONFIG_XEN xen_machine_kexec_register_resources(res); #endif @@ -329,9 +340,11 @@ printk("Setting up standard PCI resources\n"); if (efi_enabled) - efi_initialize_iomem_resources(&code_resource, &data_resource); + efi_initialize_iomem_resources(&code_resource, + &data_resource, &bss_resource); else - legacy_init_iomem_resources(&code_resource, &data_resource); + legacy_init_iomem_resources(&code_resource, + &data_resource, &bss_resource); /* EFI systems may still have VGA */ request_resource(&iomem_resource, &video_ram_resource); @@ -759,7 +772,7 @@ #endif /* - * Search for the bigest gap in the low 32 bits of the e820 + * Search for the biggest gap in the low 32 bits of the e820 * memory space. 
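[ Editor's note: illustrative sketch, not part of the patch. The e820_32 hunk above ends at the comment about scanning for the biggest gap in the low 32 bits of the address space (used to place PCI resources). This is a self-contained model of such a scan, reconstructed from that comment; the map contents and variable names are invented for the example. ]

#include <stdio.h>
typedef unsigned long long u64;

struct region { u64 addr, size; };

int main(void)
{
	/* stand-in e820 RAM map, sorted by address, all below 4 GB */
	const struct region map[] = {
		{ 0x00000000ull, 0x0009f000ull },
		{ 0x00100000ull, 0x3ff00000ull },
		{ 0x50000000ull, 0x10000000ull },
	};
	int n = sizeof(map) / sizeof(map[0]);
	u64 last = 0x100000000ull;	/* scan downward from 4 GB */
	u64 gap_start = 0, gap_size = 0;

	for (int i = n - 1; i >= 0; i--) {
		u64 end = map[i].addr + map[i].size;
		if (last > end && last - end > gap_size) {
			gap_size = last - end;	/* hole between regions */
			gap_start = end;
		}
		if (map[i].addr < last)
			last = map[i].addr;
	}
	printf("biggest gap: %#llx, size %#llx\n", gap_start, gap_size);
	return 0;
}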
*/ last = 0x100000000ull; --- a/arch/x86/kernel/e820_64-xen.c +++ b/arch/x86/kernel/e820_64-xen.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include @@ -51,7 +51,7 @@ */ static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT; -extern struct resource code_resource, data_resource; +extern struct resource code_resource, data_resource, bss_resource; /* Check for some hardcoded bad areas that early boot is not allowed to touch */ static inline int bad_addr(unsigned long *addrp, unsigned long size) @@ -73,10 +73,15 @@ /* initrd */ #ifdef CONFIG_BLK_DEV_INITRD - if (LOADER_TYPE && INITRD_START && last >= INITRD_START && - addr < INITRD_START+INITRD_SIZE) { - *addrp = PAGE_ALIGN(INITRD_START + INITRD_SIZE); - return 1; + if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { + unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; + unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; + unsigned long ramdisk_end = ramdisk_image+ramdisk_size; + + if (last >= ramdisk_image && addr < ramdisk_end) { + *addrp = PAGE_ALIGN(ramdisk_end); + return 1; + } } #endif /* kernel code */ @@ -251,6 +256,7 @@ #ifndef CONFIG_XEN request_resource(res, &code_resource); request_resource(res, &data_resource); + request_resource(res, &bss_resource); #endif #ifdef CONFIG_KEXEC if (crashk_res.start != crashk_res.end) @@ -637,8 +643,8 @@ * Otherwise fake a memory map; one section from 0k->640k, * the next section from 1mb->appropriate_mem_k */ - sanitize_e820_map(E820_MAP, &E820_MAP_NR); - if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) + sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries); + if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0) early_panic("Cannot find a valid memory map"); printk(KERN_INFO "BIOS-provided physical RAM map:\n"); e820_print_map("BIOS-e820"); @@ -823,3 +829,22 @@ printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", pci_mem_start, gapstart, gapsize); } + +int __init arch_get_ram_range(int slot, u64 *addr, u64 *size) +{ + int i; + + if (slot < 0 || slot >= e820.nr_map) + return -1; + for (i = slot; i < e820.nr_map; i++) { + if (e820.map[i].type != E820_RAM) + continue; + break; + } + if (i == e820.nr_map || e820.map[i].addr > (max_pfn << PAGE_SHIFT)) + return -1; + *addr = e820.map[i].addr; + *size = min_t(u64, e820.map[i].size + e820.map[i].addr, + max_pfn << PAGE_SHIFT) - *addr; + return i + 1; +} --- a/arch/x86/kernel/early_printk-xen.c +++ b/arch/x86/kernel/early_printk-xen.c @@ -6,14 +6,9 @@ #include #include #include +#include /* Simple VGA output */ - -#ifdef __i386__ -#include -#else -#include -#endif #define VGABASE (__ISA_IO_base + 0xb8000) #ifndef CONFIG_XEN @@ -264,10 +259,10 @@ early_console = &early_serial_console; } else if (!strncmp(buf, "vga", 3)) { #ifndef CONFIG_XEN - && SCREEN_INFO.orig_video_isVGA == 1) { - max_xpos = SCREEN_INFO.orig_video_cols; - max_ypos = SCREEN_INFO.orig_video_lines; - current_ypos = SCREEN_INFO.orig_y; + && boot_params.screen_info.orig_video_isVGA == 1) { + max_xpos = boot_params.screen_info.orig_video_cols; + max_ypos = boot_params.screen_info.orig_video_lines; + current_ypos = boot_params.screen_info.orig_y; #endif early_console = &early_vga_console; } else if (!strncmp(buf, "simnow", 6)) { --- a/arch/x86/kernel/entry_32-xen.S +++ b/arch/x86/kernel/entry_32-xen.S @@ -254,6 +254,7 @@ jb resume_kernel # not returning to v8086 or userspace ENTRY(resume_userspace) + LOCKDEP_SYS_EXIT DISABLE_INTERRUPTS(CLBR_ANY) # make sure we 
don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret @@ -341,6 +342,7 @@ jae syscall_badsys call *sys_call_table(,%eax,4) movl %eax,PT_EAX(%esp) + LOCKDEP_SYS_EXIT DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx @@ -406,6 +408,7 @@ call *sys_call_table(,%eax,4) movl %eax,PT_EAX(%esp) # store the return value syscall_exit: + LOCKDEP_SYS_EXIT DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret @@ -478,7 +481,7 @@ * is still available to implement the setting of the high * 16-bits in the INTERRUPT_RETURN paravirt-op. */ - cmpl $0, paravirt_ops+PARAVIRT_enabled + cmpl $0, pv_info+PARAVIRT_enabled jne restore_nocheck #endif @@ -540,6 +543,7 @@ jz work_notifysig work_resched: call schedule + LOCKDEP_SYS_EXIT DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret @@ -1264,6 +1268,6 @@ ENDPROC(kernel_thread_helper) .section .rodata,"a" -#include "syscall_table.S" +#include "syscall_table_32.S" syscall_table_size=(.-sys_call_table) --- a/arch/x86/kernel/entry_64-xen.S +++ b/arch/x86/kernel/entry_64-xen.S @@ -57,7 +57,7 @@ #include #include -#include "xen_entry.S" +#include "xen_entry_64.S" .code64 @@ -275,6 +275,7 @@ movl $_TIF_ALLWORK_MASK,%edi /* edi: flagmask */ sysret_check: + LOCKDEP_SYS_EXIT GET_THREAD_INFO(%rcx) XEN_BLOCK_EVENTS(%rsi) TRACE_IRQS_OFF @@ -365,6 +366,7 @@ movl $_TIF_ALLWORK_MASK,%edi /* edi: mask to check */ int_with_check: + LOCKDEP_SYS_EXIT_IRQ GET_THREAD_INFO(%rcx) movl threadinfo_flags(%rcx),%edx andl %edi,%edx @@ -516,11 +518,12 @@ retint_check: CFI_DEFAULT_STACK adj=1 + LOCKDEP_SYS_EXIT_IRQ movl threadinfo_flags(%rcx),%edx andl %edi,%edx CFI_REMEMBER_STATE jnz retint_careful -retint_restore_args: +retint_restore_args: /* return to kernel space */ movl EFLAGS-REST_SKIP(%rsp), %eax shr $9, %eax # EAX[0] == IRET_EFLAGS.IF XEN_GET_VCPU_INFO(%rsi) @@ -841,7 +844,7 @@ movq ORIG_RAX(%rsp),%rsi # get error code movq $-1,ORIG_RAX(%rsp) call *%rax -error_exit: +error_exit: RESTORE_REST /* cli */ XEN_BLOCK_EVENTS(%rsi) @@ -849,14 +852,11 @@ GET_THREAD_INFO(%rcx) testb $3,CS-ARGOFFSET(%rsp) jz retint_kernel + LOCKDEP_SYS_EXIT_IRQ movl threadinfo_flags(%rcx),%edx movl $_TIF_WORK_MASK,%edi andl %edi,%edx jnz retint_careful - /* - * The iret might restore flags: - */ - TRACE_IRQS_IRETQ jmp retint_restore_args #if 0 @@ -1071,7 +1071,7 @@ movq %rsi, %rdi call *%rax # exit - xorl %edi, %edi + mov %eax, %edi call do_exit CFI_ENDPROC ENDPROC(child_rip) --- a/arch/x86/kernel/genapic_64-xen.c +++ b/arch/x86/kernel/genapic_64-xen.c @@ -24,12 +24,21 @@ #include #endif -/* which logical CPU number maps to which CPU (physical APIC ID) */ -u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly +/* + * which logical CPU number maps to which CPU (physical APIC ID) + * + * The following static array is used during kernel startup + * and the x86_cpu_to_apicid_ptr contains the address of the + * array during this time. Is it zeroed when the per_cpu + * data area is removed. + */ +#ifndef CONFIG_XEN +u8 x86_cpu_to_apicid_init[NR_CPUS] __initdata = { [0 ... NR_CPUS-1] = BAD_APICID }; -EXPORT_SYMBOL(x86_cpu_to_apicid); - -u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... 
NR_CPUS-1] = BAD_APICID }; +void *x86_cpu_to_apicid_ptr; +#endif +DEFINE_PER_CPU(u8, x86_cpu_to_apicid) = BAD_APICID; +EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid); #ifndef CONFIG_XEN struct genapic __read_mostly *genapic = &apic_flat; --- a/arch/x86/kernel/head64-xen.c +++ b/arch/x86/kernel/head64-xen.c @@ -1,5 +1,5 @@ /* - * linux/arch/x86_64/kernel/head64.c -- prepare to run common code + * prepare to run common code * * Copyright (C) 2000 Andrea Arcangeli SuSE * @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include @@ -44,27 +43,16 @@ } #endif -#define NEW_CL_POINTER 0x228 /* Relative to real mode data */ -#define OLD_CL_MAGIC_ADDR 0x20 -#define OLD_CL_MAGIC 0xA33F -#define OLD_CL_OFFSET 0x22 - static void __init copy_bootdata(char *real_mode_data) { #ifndef CONFIG_XEN - unsigned long new_data; char * command_line; - memcpy(x86_boot_params, real_mode_data, BOOT_PARAM_SIZE); - new_data = *(u32 *) (x86_boot_params + NEW_CL_POINTER); - if (!new_data) { - if (OLD_CL_MAGIC != *(u16 *)(real_mode_data + OLD_CL_MAGIC_ADDR)) { - return; - } - new_data = __pa(real_mode_data) + *(u16 *)(real_mode_data + OLD_CL_OFFSET); + memcpy(&boot_params, real_mode_data, sizeof boot_params); + if (boot_params.hdr.cmd_line_ptr) { + command_line = __va(boot_params.hdr.cmd_line_ptr); + memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); } - command_line = __va(new_data); - memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); #else int max_cmdline; @@ -114,7 +102,7 @@ for (i = 0; i < IDT_ENTRIES; i++) set_intr_gate(i, early_idt_handler); - asm volatile("lidt %0" :: "m" (idt_descr)); + load_idt((const struct desc_ptr *)&idt_descr); #endif early_printk("Kernel alive\n"); --- a/arch/x86/kernel/init_task-xen.c +++ b/arch/x86/kernel/init_task-xen.c @@ -14,11 +14,11 @@ static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); - +#ifdef CONFIG_X86_XEN #define swapper_pg_dir ((pgd_t *)NULL) +#endif struct mm_struct init_mm = INIT_MM(init_mm); #undef swapper_pg_dir - EXPORT_SYMBOL(init_mm); /* @@ -28,7 +28,7 @@ * way process stacks are handled. This is done by having a special * "init_task" linker map entry.. */ -union thread_union init_thread_union +union thread_union init_thread_union __attribute__((__section__(".data.init_task"))) = { INIT_THREAD_INFO(init_task) }; @@ -38,14 +38,15 @@ * All other task structs will be allocated on slabs in fork.c */ struct task_struct init_task = INIT_TASK(init_task); - EXPORT_SYMBOL(init_task); #ifndef CONFIG_X86_NO_TSS /* - * per-CPU TSS segments. Threads are completely 'soft' on Linux, - * no more per-task TSS's. - */ + * no more per-task TSS's. The TSS size is kept cacheline-aligned + * so they are allowed to end up in the .data.cacheline_aligned + * section. Since TSS's are completely CPU-local, we want them + * on exact cacheline boundaries, to eliminate cacheline ping-pong. + */ DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; #endif --- a/arch/x86/kernel/io_apic_32-xen.c +++ b/arch/x86/kernel/io_apic_32-xen.c @@ -422,7 +422,7 @@ #define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask) -#define CPU_TO_PACKAGEINDEX(i) (first_cpu(cpu_sibling_map[i])) +#define CPU_TO_PACKAGEINDEX(i) (first_cpu(per_cpu(cpu_sibling_map, i))) static cpumask_t balance_irq_affinity[NR_IRQS] = { [0 ... 
NR_IRQS-1] = CPU_MASK_ALL @@ -628,7 +628,7 @@ imbalance = move_this_load; - /* For physical_balance case, we accumlated both load + /* For physical_balance case, we accumulated both load * values in the one of the siblings cpu_irq[], * to use the same code for physical and logical processors * as much as possible. @@ -642,7 +642,7 @@ * (A+B)/2 vs B */ load = CPU_IRQ(min_loaded) >> 1; - for_each_cpu_mask(j, cpu_sibling_map[min_loaded]) { + for_each_cpu_mask(j, per_cpu(cpu_sibling_map, min_loaded)) { if (load > CPU_IRQ(j)) { /* This won't change cpu_sibling_map[min_loaded] */ load = CPU_IRQ(j); @@ -1011,7 +1011,7 @@ #define default_MCA_trigger(idx) (1) #define default_MCA_polarity(idx) (0) -static int __init MPBIOS_polarity(int idx) +static int MPBIOS_polarity(int idx) { int bus = mp_irqs[idx].mpc_srcbus; int polarity; @@ -1337,6 +1337,11 @@ continue; } + if (!first_notcon) { + apic_printk(APIC_VERBOSE, " not connected.\n"); + first_notcon = 1; + } + entry.trigger = irq_trigger(idx); entry.polarity = irq_polarity(idx); @@ -1922,13 +1927,16 @@ static int __init timer_irq_works(void) { unsigned long t1 = jiffies; + unsigned long flags; if (no_timer_check) return 1; + local_save_flags(flags); local_irq_enable(); /* Let ten ticks pass... */ mdelay((10 * 1000) / HZ); + local_irq_restore(flags); /* * Expect a few ticks at least, to be sure some possible @@ -2209,6 +2217,9 @@ { int apic1, pin1, apic2, pin2; int vector; + unsigned long flags; + + local_irq_save(flags); /* * get/set the timer IRQ vector: @@ -2254,7 +2265,7 @@ } if (disable_timer_pin_1 > 0) clear_IO_APIC_pin(0, pin1); - return; + goto out; } clear_IO_APIC_pin(apic1, pin1); printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to " @@ -2277,7 +2288,7 @@ if (nmi_watchdog == NMI_IO_APIC) { setup_nmi(); } - return; + goto out; } /* * Cleanup, just in case ... @@ -2301,7 +2312,7 @@ if (timer_irq_works()) { printk(" works.\n"); - return; + goto out; } apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); printk(" failed.\n"); @@ -2317,11 +2328,13 @@ if (timer_irq_works()) { printk(" works.\n"); - return; + goto out; } printk(" failed :(.\n"); panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " "report. Then try booting with the 'noapic' option"); +out: + local_irq_restore(flags); } #else int timer_uses_ioapic_pin_0 = 0; @@ -2339,6 +2352,14 @@ void __init setup_IO_APIC(void) { +#ifndef CONFIG_XEN + int i; + + /* Reserve all the system vectors. 
*/ + for (i = FIRST_SYSTEM_VECTOR; i < NR_VECTORS; i++) + set_bit(i, used_vectors); +#endif + enable_IO_APIC(); if (acpi_ioapic) @@ -2526,7 +2547,7 @@ #endif /* - * MSI mesage composition + * MSI message composition */ #ifdef CONFIG_PCI_MSI static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) @@ -2883,6 +2904,25 @@ return 0; } +int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) +{ + int i; + + if (skip_ioapic_setup) + return -1; + + for (i = 0; i < mp_irq_entries; i++) + if (mp_irqs[i].mpc_irqtype == mp_INT && + mp_irqs[i].mpc_srcbusirq == bus_irq) + break; + if (i >= mp_irq_entries) + return -1; + + *trigger = irq_trigger(i); + *polarity = irq_polarity(i); + return 0; +} + #endif /* CONFIG_ACPI */ static int __init parse_disable_timer_pin_1(char *arg) --- a/arch/x86/kernel/io_apic_64-xen.c +++ b/arch/x86/kernel/io_apic_64-xen.c @@ -31,6 +31,7 @@ #include #include #include +#include #ifdef CONFIG_ACPI #include #endif @@ -581,7 +582,7 @@ #define default_PCI_trigger(idx) (1) #define default_PCI_polarity(idx) (1) -static int __init MPBIOS_polarity(int idx) +static int MPBIOS_polarity(int idx) { int bus = mp_irqs[idx].mpc_srcbus; int polarity; @@ -864,6 +865,10 @@ apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin); continue; } + if (!first_notcon) { + apic_printk(APIC_VERBOSE, " not connected.\n"); + first_notcon = 1; + } irq = pin_2_irq(idx, apic, pin); add_pin_to_irq(irq, apic, pin); @@ -874,7 +879,7 @@ } if (!first_notcon) - apic_printk(APIC_VERBOSE," not connected.\n"); + apic_printk(APIC_VERBOSE, " not connected.\n"); } #ifndef CONFIG_XEN @@ -1270,10 +1275,13 @@ static int __init timer_irq_works(void) { unsigned long t1 = jiffies; + unsigned long flags; + local_save_flags(flags); local_irq_enable(); /* Let ten ticks pass... */ mdelay((10 * 1000) / HZ); + local_irq_restore(flags); /* * Expect a few ticks at least, to be sure some possible @@ -1648,6 +1656,9 @@ { struct irq_cfg *cfg = irq_cfg + 0; int apic1, pin1, apic2, pin2; + unsigned long flags; + + local_irq_save(flags); /* * get/set the timer IRQ vector: @@ -1689,7 +1700,7 @@ } if (disable_timer_pin_1 > 0) clear_IO_APIC_pin(0, pin1); - return; + goto out; } clear_IO_APIC_pin(apic1, pin1); apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not " @@ -1711,7 +1722,7 @@ if (nmi_watchdog == NMI_IO_APIC) { setup_nmi(); } - return; + goto out; } /* * Cleanup, just in case ... @@ -1734,7 +1745,7 @@ if (timer_irq_works()) { apic_printk(APIC_VERBOSE," works.\n"); - return; + goto out; } apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); apic_printk(APIC_VERBOSE," failed.\n"); @@ -1749,10 +1760,12 @@ if (timer_irq_works()) { apic_printk(APIC_VERBOSE," works.\n"); - return; + goto out; } apic_printk(APIC_VERBOSE," failed :(.\n"); panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n"); +out: + local_irq_restore(flags); } #else #define check_timer() ((void)0) @@ -1768,7 +1781,7 @@ /* * - * IRQ's that are handled by the PIC in the MPS IOAPIC case. + * IRQs that are handled by the PIC in the MPS IOAPIC case. * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ. * Linux doesn't really care, as it's not actually used * for any interrupt handling anyway. 
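[ Editor's note: illustrative sketch, not part of the patch. The timer_irq_works()/check_timer() hunks above all apply one pattern: capture the current interrupt-enable state, change it temporarily, and restore the saved state on every exit path (hence the new "goto out" labels), instead of leaving interrupts unconditionally enabled. A user-space model, with the flag variable standing in for EFLAGS.IF: ]

#include <stdio.h>

static int irqs_enabled;	/* stand-in for the CPU interrupt-enable flag */

static unsigned long local_save_flags(void) { return irqs_enabled; }
static void local_irq_enable(void) { irqs_enabled = 1; }
static void local_irq_restore(unsigned long f) { irqs_enabled = (int)f; }

static void timer_irq_works_model(void)
{
	unsigned long flags = local_save_flags();

	local_irq_enable();
	/* ... let ten ticks pass, count jiffies ... */
	local_irq_restore(flags);	/* back to the caller's state */
}

int main(void)
{
	irqs_enabled = 0;		/* caller runs with IRQs off */
	timer_irq_works_model();
	printf("IRQs enabled after probe: %d (still off, as expected)\n",
	       irqs_enabled);
	return 0;
}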
@@ -1849,7 +1862,7 @@ static int __init ioapic_init_sysfs(void) { struct sys_device * dev; - int i, size, error = 0; + int i, size, error; error = sysdev_class_register(&ioapic_sysdev_class); if (error) @@ -1858,12 +1871,11 @@ for (i = 0; i < nr_ioapics; i++ ) { size = sizeof(struct sys_device) + nr_ioapic_registers[i] * sizeof(struct IO_APIC_route_entry); - mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL); + mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL); if (!mp_ioapic_data[i]) { printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); continue; } - memset(mp_ioapic_data[i], 0, size); dev = &mp_ioapic_data[i]->dev; dev->id = i; dev->cls = &ioapic_sysdev_class; @@ -1924,7 +1936,7 @@ #endif /* - * MSI mesage composition + * MSI message composition */ #ifdef CONFIG_PCI_MSI static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) @@ -2034,8 +2046,64 @@ destroy_irq(irq); } -#endif /* CONFIG_PCI_MSI */ +#ifdef CONFIG_DMAR +#ifdef CONFIG_SMP +static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) +{ + struct irq_cfg *cfg = irq_cfg + irq; + struct msi_msg msg; + unsigned int dest; + cpumask_t tmp; + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + return; + + if (assign_irq_vector(irq, mask)) + return; + + cpus_and(tmp, cfg->domain, mask); + dest = cpu_mask_to_apicid(tmp); + + dmar_msi_read(irq, &msg); + + msg.data &= ~MSI_DATA_VECTOR_MASK; + msg.data |= MSI_DATA_VECTOR(cfg->vector); + msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; + msg.address_lo |= MSI_ADDR_DEST_ID(dest); + + dmar_msi_write(irq, &msg); + irq_desc[irq].affinity = mask; +} +#endif /* CONFIG_SMP */ + +struct irq_chip dmar_msi_type = { + .name = "DMAR_MSI", + .unmask = dmar_msi_unmask, + .mask = dmar_msi_mask, + .ack = ack_apic_edge, +#ifdef CONFIG_SMP + .set_affinity = dmar_msi_set_affinity, +#endif + .retrigger = ioapic_retrigger_irq, +}; + +int arch_setup_dmar_msi(unsigned int irq) +{ + int ret; + struct msi_msg msg; + + ret = msi_compose_msg(NULL, irq, &msg); + if (ret < 0) + return ret; + dmar_msi_write(irq, &msg); + set_irq_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, + "edge"); + return 0; +} +#endif +#endif /* CONFIG_PCI_MSI */ /* * Hypertransport interrupt support */ @@ -2168,8 +2236,27 @@ return 0; } -#endif /* CONFIG_ACPI */ +int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) +{ + int i; + + if (skip_ioapic_setup) + return -1; + + for (i = 0; i < mp_irq_entries; i++) + if (mp_irqs[i].mpc_irqtype == mp_INT && + mp_irqs[i].mpc_srcbusirq == bus_irq) + break; + if (i >= mp_irq_entries) + return -1; + + *trigger = irq_trigger(i); + *polarity = irq_polarity(i); + return 0; +} + +#endif /* CONFIG_ACPI */ #ifndef CONFIG_XEN /* @@ -2208,3 +2295,4 @@ } #endif #endif /* !CONFIG_XEN */ + --- a/arch/x86/kernel/ioport_32-xen.c +++ b/arch/x86/kernel/ioport_32-xen.c @@ -1,6 +1,4 @@ /* - * linux/arch/i386/kernel/ioport.c - * * This contains the io-permission bitmap code - written by obz, with changes * by Linus. */ --- a/arch/x86/kernel/ioport_64-xen.c +++ b/arch/x86/kernel/ioport_64-xen.c @@ -1,6 +1,4 @@ /* - * linux/arch/x86_64/kernel/ioport.c - * * This contains the io-permission bitmap code - written by obz, with changes * by Linus. 
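[ Editor's note: illustrative sketch, not part of the patch. It shows the calling convention of the acpi_get_override_irq() helper added above (in both the 32- and 64-bit io_apic files): 0 on success with trigger/polarity filled in, -1 when no MADT override entry matches, in which case a caller would fall back to the ISA defaults (edge, active high). The helper is stubbed here so the sketch stands alone; the stub's table contents are invented. ]

#include <stdio.h>

static int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
{
	if (bus_irq != 9)	/* stand-in table: only IRQ 9 is overridden */
		return -1;
	*trigger = 1;		/* level */
	*polarity = 1;		/* active low */
	return 0;
}

static void describe_isa_irq(int irq)
{
	int trigger = 0, polarity = 0;	/* ISA defaults: edge, active high */

	acpi_get_override_irq(irq, &trigger, &polarity);
	printf("IRQ %d: %s triggered, active %s\n", irq,
	       trigger ? "level" : "edge", polarity ? "low" : "high");
}

int main(void)
{
	describe_isa_irq(9);	/* typically the ACPI SCI: level, low */
	describe_isa_irq(4);	/* no override entry: edge, high */
	return 0;
}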
*/ --- a/arch/x86/kernel/irq_32-xen.c +++ b/arch/x86/kernel/irq_32-xen.c @@ -1,6 +1,4 @@ /* - * linux/arch/i386/kernel/irq.c - * * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar * * This file contains the lowest level x86-specific interrupt @@ -231,8 +229,6 @@ local_irq_restore(flags); } - -EXPORT_SYMBOL(do_softirq); #endif /* @@ -259,9 +255,17 @@ } if (i < NR_IRQS) { + unsigned any_count = 0; + spin_lock_irqsave(&irq_desc[i].lock, flags); +#ifndef CONFIG_SMP + any_count = kstat_irqs(i); +#else + for_each_online_cpu(j) + any_count |= kstat_cpu(j).irqs[i]; +#endif action = irq_desc[i].action; - if (!action) + if (!action && !any_count) goto skip; seq_printf(p, "%3d: ",i); #ifndef CONFIG_SMP @@ -272,10 +276,12 @@ #endif seq_printf(p, " %8s", irq_desc[i].chip->name); seq_printf(p, "-%-8s", irq_desc[i].name); - seq_printf(p, " %s", action->name); - for (action=action->next; action; action = action->next) - seq_printf(p, ", %s", action->name); + if (action) { + seq_printf(p, " %s", action->name); + while ((action = action->next) != NULL) + seq_printf(p, ", %s", action->name); + } seq_putc(p, '\n'); skip: @@ -284,13 +290,46 @@ seq_printf(p, "NMI: "); for_each_online_cpu(j) seq_printf(p, "%10u ", nmi_count(j)); - seq_putc(p, '\n'); + seq_printf(p, " Non-maskable interrupts\n"); #ifdef CONFIG_X86_LOCAL_APIC seq_printf(p, "LOC: "); for_each_online_cpu(j) seq_printf(p, "%10u ", per_cpu(irq_stat,j).apic_timer_irqs); - seq_putc(p, '\n'); + seq_printf(p, " Local timer interrupts\n"); +#endif +#ifdef CONFIG_SMP + seq_printf(p, "RES: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + per_cpu(irq_stat,j).irq_resched_count); + seq_printf(p, " Rescheduling interrupts\n"); + seq_printf(p, "CAL: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + per_cpu(irq_stat,j).irq_call_count); + seq_printf(p, " function call interrupts\n"); +#ifndef CONFIG_XEN + seq_printf(p, "TLB: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + per_cpu(irq_stat,j).irq_tlb_count); + seq_printf(p, " TLB shootdowns\n"); +#endif +#endif +#ifdef CONFIG_X86_MCE + seq_printf(p, "TRM: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + per_cpu(irq_stat,j).irq_thermal_count); + seq_printf(p, " Thermal event interrupts\n"); +#endif +#ifdef CONFIG_X86_LOCAL_APIC + seq_printf(p, "SPU: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + per_cpu(irq_stat,j).irq_spurious_count); + seq_printf(p, " Spurious interrupts\n"); #endif seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); #if defined(CONFIG_X86_IO_APIC) --- a/arch/x86/kernel/irq_64-xen.c +++ b/arch/x86/kernel/irq_64-xen.c @@ -1,6 +1,4 @@ /* - * linux/arch/x86_64/kernel/irq.c - * * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar * * This file contains the lowest level x86_64-specific interrupt @@ -64,9 +62,17 @@ } if (i < NR_IRQS) { + unsigned any_count = 0; + spin_lock_irqsave(&irq_desc[i].lock, flags); +#ifndef CONFIG_SMP + any_count = kstat_irqs(i); +#else + for_each_online_cpu(j) + any_count |= kstat_cpu(j).irqs[i]; +#endif action = irq_desc[i].action; - if (!action) + if (!action && !any_count) goto skip; seq_printf(p, "%3d: ",i); #ifndef CONFIG_SMP @@ -78,9 +84,11 @@ seq_printf(p, " %8s", irq_desc[i].chip->name); seq_printf(p, "-%-8s", irq_desc[i].name); - seq_printf(p, " %s", action->name); - for (action=action->next; action; action = action->next) - seq_printf(p, ", %s", action->name); + if (action) { + seq_printf(p, " %s", action->name); + while ((action = action->next) != NULL) + seq_printf(p, ", %s", action->name); + } seq_putc(p, '\n'); 
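[ Editor's note: illustrative sketch, not part of the patch. The show_interrupts() hunks above change the /proc/interrupts row filter: previously a row was skipped whenever no handler (action) was attached; now it is also printed if any CPU has a nonzero count for it, so counts on handler-less IRQs stay visible. A stand-alone model of that filter, with invented sample data: ]

#include <stdio.h>

#define NCPUS 2

struct irq_row {
	const char *action;		/* NULL when no handler is attached */
	unsigned counts[NCPUS];
};

int main(void)
{
	const struct irq_row rows[] = {
		{ "timer", { 1000, 990 } },
		{ NULL,    {    0,   0 } },	/* still skipped */
		{ NULL,    {    3,   0 } },	/* now shown: count != 0 */
	};
	for (unsigned i = 0; i < sizeof(rows) / sizeof(rows[0]); i++) {
		unsigned any_count = 0;
		for (int j = 0; j < NCPUS; j++)
			any_count |= rows[i].counts[j];
		if (!rows[i].action && !any_count)
			continue;	/* the patched skip condition */
		printf("%3u: %10u %10u  %s\n", i, rows[i].counts[0],
		       rows[i].counts[1],
		       rows[i].action ? rows[i].action : "(none)");
	}
	return 0;
}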
skip: spin_unlock_irqrestore(&irq_desc[i].lock, flags); @@ -88,12 +96,44 @@ seq_printf(p, "NMI: "); for_each_online_cpu(j) seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count); - seq_putc(p, '\n'); + seq_printf(p, " Non-maskable interrupts\n"); #ifdef CONFIG_X86_LOCAL_APIC seq_printf(p, "LOC: "); for_each_online_cpu(j) seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs); - seq_putc(p, '\n'); + seq_printf(p, " Local timer interrupts\n"); +#endif +#ifdef CONFIG_SMP + seq_printf(p, "RES: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", cpu_pda(j)->irq_resched_count); + seq_printf(p, " Rescheduling interrupts\n"); + seq_printf(p, "CAL: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", cpu_pda(j)->irq_call_count); + seq_printf(p, " function call interrupts\n"); +#ifndef CONFIG_XEN + seq_printf(p, "TLB: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count); + seq_printf(p, " TLB shootdowns\n"); +#endif +#endif +#ifdef CONFIG_X86_MCE + seq_printf(p, "TRM: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", cpu_pda(j)->irq_thermal_count); + seq_printf(p, " Thermal event interrupts\n"); + seq_printf(p, "THR: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", cpu_pda(j)->irq_threshold_count); + seq_printf(p, " Threshold APIC interrupts\n"); +#endif +#ifdef CONFIG_X86_LOCAL_APIC + seq_printf(p, "SPU: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", cpu_pda(j)->irq_spurious_count); + seq_printf(p, " Spurious interrupts\n"); #endif seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); } @@ -211,7 +251,6 @@ } local_irq_restore(flags); } -EXPORT_SYMBOL(do_softirq); #ifndef CONFIG_X86_LOCAL_APIC /* --- a/arch/x86/kernel/ldt_32-xen.c +++ b/arch/x86/kernel/ldt_32-xen.c @@ -1,6 +1,4 @@ /* - * linux/arch/i386/kernel/ldt.c - * * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds * Copyright (C) 1999 Ingo Molnar */ @@ -106,14 +104,14 @@ struct mm_struct * old_mm; int retval = 0; - init_MUTEX(&mm->context.sem); + mutex_init(&mm->context.lock); mm->context.size = 0; mm->context.has_foreign_mappings = 0; old_mm = current->mm; if (old_mm && old_mm->context.size > 0) { - down(&old_mm->context.sem); + mutex_lock(&old_mm->context.lock); retval = copy_ldt(&mm->context, &old_mm->context); - up(&old_mm->context.sem); + mutex_unlock(&old_mm->context.lock); } return retval; } @@ -149,7 +147,7 @@ if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; - down(&mm->context.sem); + mutex_lock(&mm->context.lock); size = mm->context.size*LDT_ENTRY_SIZE; if (size > bytecount) size = bytecount; @@ -157,7 +155,7 @@ err = 0; if (copy_to_user(ptr, mm->context.ldt, size)) err = -EFAULT; - up(&mm->context.sem); + mutex_unlock(&mm->context.lock); if (err < 0) goto error_return; if (size != bytecount) { @@ -213,7 +211,7 @@ goto out; } - down(&mm->context.sem); + mutex_lock(&mm->context.lock); if (ldt_info.entry_number >= mm->context.size) { error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1); if (error < 0) @@ -240,7 +238,7 @@ entry_1, entry_2); out_unlock: - up(&mm->context.sem); + mutex_unlock(&mm->context.lock); out: return error; } --- a/arch/x86/kernel/ldt_64-xen.c +++ b/arch/x86/kernel/ldt_64-xen.c @@ -1,6 +1,4 @@ /* - * linux/arch/x86_64/kernel/ldt.c - * * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds * Copyright (C) 1999 Ingo Molnar * Copyright (C) 2002 Andi Kleen @@ -112,14 +110,14 @@ int retval = 0; memset(&mm->context, 0, sizeof(mm->context)); - init_MUTEX(&mm->context.sem); + mutex_init(&mm->context.lock); old_mm =
current->mm; if (old_mm) mm->context.vdso = old_mm->context.vdso; if (old_mm && old_mm->context.size > 0) { - down(&old_mm->context.sem); + mutex_lock(&old_mm->context.lock); retval = copy_ldt(&mm->context, &old_mm->context); - up(&old_mm->context.sem); + mutex_unlock(&old_mm->context.lock); } if (retval == 0) { spin_lock(&mm_unpinned_lock); @@ -166,7 +164,7 @@ if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; - down(&mm->context.sem); + mutex_lock(&mm->context.lock); size = mm->context.size*LDT_ENTRY_SIZE; if (size > bytecount) size = bytecount; @@ -174,7 +172,7 @@ err = 0; if (copy_to_user(ptr, mm->context.ldt, size)) err = -EFAULT; - up(&mm->context.sem); + mutex_unlock(&mm->context.lock); if (err < 0) goto error_return; if (size != bytecount) { @@ -227,7 +225,7 @@ goto out; } - down(&mm->context.sem); + mutex_lock(&mm->context.lock); if (ldt_info.entry_number >= (unsigned)mm->context.size) { error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1); if (error < 0) @@ -256,7 +254,7 @@ error = HYPERVISOR_update_descriptor(mach_lp, (unsigned long)((entry_1 | (unsigned long) entry_2 << 32))); out_unlock: - up(&mm->context.sem); + mutex_unlock(&mm->context.lock); out: return error; } --- a/arch/x86/kernel/mpparse_32-xen.c +++ b/arch/x86/kernel/mpparse_32-xen.c @@ -1023,7 +1023,7 @@ /* * Use the default configuration for the IRQs 0-15. Unless - * overriden by (MADT) interrupt source override entries. + * overridden by (MADT) interrupt source override entries. */ for (i = 0; i < 16; i++) { int idx; --- a/arch/x86/kernel/mpparse_64-xen.c +++ b/arch/x86/kernel/mpparse_64-xen.c @@ -57,6 +57,8 @@ /* Processor that is doing the boot up */ unsigned int boot_cpu_id = -1U; +EXPORT_SYMBOL(boot_cpu_id); + /* Internal processor count */ unsigned int num_processors __cpuinitdata = 0; @@ -87,7 +89,7 @@ } #ifndef CONFIG_XEN -static void __cpuinit MP_processor_info (struct mpc_config_processor *m) +static void __cpuinit MP_processor_info(struct mpc_config_processor *m) { int cpu; cpumask_t tmp_map; @@ -124,13 +126,24 @@ cpu = 0; } bios_cpu_apicid[cpu] = m->mpc_apicid; - x86_cpu_to_apicid[cpu] = m->mpc_apicid; + /* + * We get called early in the start_kernel initialization + * process when the per_cpu data area is not yet setup, so we + * use a static array that is removed after the per_cpu data + * area is created.
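[ Editor's note: illustrative sketch, not part of the patch. The comment above and the genapic_64/mpparse_64 hunks implement a two-phase store: while the per-CPU area does not yet exist, APIC IDs land in a static __initdata array published via x86_cpu_to_apicid_ptr; once per-CPU data is up, the pointer is cleared and per_cpu() is used directly. A user-space model of that dispatch, with a plain array standing in for per_cpu(): ]

#include <stdio.h>

#define NR_CPUS 4
#define BAD_APICID 0xffu

static unsigned char x86_cpu_to_apicid_init[NR_CPUS] = {
	[0 ... NR_CPUS - 1] = BAD_APICID	/* GNU range initializer, as in the patch */
};
static void *x86_cpu_to_apicid_ptr = x86_cpu_to_apicid_init;
static unsigned char per_cpu_apicid[NR_CPUS];	/* stand-in for per_cpu() */

static void record_apicid(int cpu, unsigned char apicid)
{
	if (x86_cpu_to_apicid_ptr) {	/* early boot: per-CPU area absent */
		unsigned char *map = x86_cpu_to_apicid_ptr;
		map[cpu] = apicid;
	} else {			/* later: the real per-CPU copy */
		per_cpu_apicid[cpu] = apicid;
	}
}

int main(void)
{
	record_apicid(0, 0);		/* goes to the static array */
	x86_cpu_to_apicid_ptr = NULL;	/* per-CPU area "created" */
	record_apicid(1, 2);		/* goes to the per-CPU copy */
	printf("early: %u, late: %u\n", x86_cpu_to_apicid_init[0],
	       per_cpu_apicid[1]);
	return 0;
}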
+ */ + if (x86_cpu_to_apicid_ptr) { + u8 *x86_cpu_to_apicid = (u8 *)x86_cpu_to_apicid_ptr; + x86_cpu_to_apicid[cpu] = m->mpc_apicid; + } else { + per_cpu(x86_cpu_to_apicid, cpu) = m->mpc_apicid; + } cpu_set(cpu, cpu_possible_map); cpu_set(cpu, cpu_present_map); } #else -static void __cpuinit MP_processor_info (struct mpc_config_processor *m) +static void __cpuinit MP_processor_info(struct mpc_config_processor *m) { num_processors++; } --- a/arch/x86/kernel/pci-dma_32-xen.c +++ b/arch/x86/kernel/pci-dma_32-xen.c @@ -13,14 +13,13 @@ #include #include #include -#include #include #include #include #include #include -#include -#include +#include +#include #include #ifdef __x86_64__ @@ -112,27 +111,29 @@ } int -dma_map_sg(struct device *hwdev, struct scatterlist *sg, int nents, +dma_map_sg(struct device *hwdev, struct scatterlist *sgl, int nents, enum dma_data_direction direction) { int i, rc; BUG_ON(!valid_dma_direction(direction)); - WARN_ON(nents == 0 || sg[0].length == 0); + WARN_ON(nents == 0 || sgl->length == 0); if (swiotlb) { - rc = swiotlb_map_sg(hwdev, sg, nents, direction); + rc = swiotlb_map_sg(hwdev, sgl, nents, direction); } else { - for (i = 0; i < nents; i++ ) { - BUG_ON(!sg[i].page); - sg[i].dma_address = - gnttab_dma_map_page(sg[i].page) + sg[i].offset; - sg[i].dma_length = sg[i].length; + struct scatterlist *sg; + + for_each_sg(sgl, sg, nents, i) { + BUG_ON(!sg_page(sg)); + sg->dma_address = + gnttab_dma_map_page(sg_page(sg)) + sg->offset; + sg->dma_length = sg->length; IOMMU_BUG_ON(address_needs_mapping( - hwdev, sg[i].dma_address)); + hwdev, sg->dma_address)); IOMMU_BUG_ON(range_straddles_page_boundary( - page_to_pseudophys(sg[i].page) + sg[i].offset, - sg[i].length)); + page_to_pseudophys(sg_page(sg)) + sg->offset, + sg->length)); } rc = nents; } @@ -143,17 +144,19 @@ EXPORT_SYMBOL(dma_map_sg); void -dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents, +dma_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nents, enum dma_data_direction direction) { int i; BUG_ON(!valid_dma_direction(direction)); if (swiotlb) - swiotlb_unmap_sg(hwdev, sg, nents, direction); + swiotlb_unmap_sg(hwdev, sgl, nents, direction); else { - for (i = 0; i < nents; i++ ) - gnttab_dma_unmap_page(sg[i].dma_address); + struct scatterlist *sg; + + for_each_sg(sgl, sg, nents, i) + gnttab_dma_unmap_page(sg->dma_address); } } EXPORT_SYMBOL(dma_unmap_sg); @@ -267,7 +270,8 @@ { struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; int order = get_order(size); - + + WARN_ON(irqs_disabled()); /* for portability */ if (mem && vaddr >= mem->virt_base && vaddr < (mem->virt_base + (mem->size << PAGE_SHIFT))) { int page = (vaddr - mem->virt_base) >> PAGE_SHIFT; --- a/arch/x86/kernel/pci-swiotlb_64-xen.c +++ b/arch/x86/kernel/pci-swiotlb_64-xen.c @@ -5,7 +5,7 @@ #include #include -#include +#include #include #include --- a/arch/x86/kernel/process_32-xen.c +++ b/arch/x86/kernel/process_32-xen.c @@ -1,6 +1,4 @@ /* - * linux/arch/i386/kernel/process.c - * * Copyright (C) 1995 Linus Torvalds * * Pentium III FXSR, SSE support @@ -190,6 +188,10 @@ } } +static void do_nothing(void *unused) +{ +} + void cpu_idle_wait(void) { unsigned int cpu, this_cpu = get_cpu(); @@ -214,13 +216,20 @@ cpu_clear(cpu, map); } cpus_and(map, map, cpu_online_map); + /* + * We waited 1 sec, if a CPU still did not call idle + * it may be because it is in idle and not waking up + * because it has nothing to do. + * Give all the remaining CPUS a kick. 
+ */ + smp_call_function_mask(map, do_nothing, 0, 0); } while (!cpus_empty(map)); set_cpus_allowed(current, tmp); } EXPORT_SYMBOL_GPL(cpu_idle_wait); -void __devinit select_idle_routine(const struct cpuinfo_x86 *c) +void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) { } @@ -238,34 +247,52 @@ } early_param("idle", idle_setup); -void show_regs(struct pt_regs * regs) +void __show_registers(struct pt_regs *regs, int all) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; unsigned long d0, d1, d2, d3, d6, d7; + unsigned long esp; + unsigned short ss, gs; + + if (user_mode_vm(regs)) { + esp = regs->esp; + ss = regs->xss & 0xffff; + savesegment(gs, gs); + } else { + esp = (unsigned long) (&regs->esp); + savesegment(ss, ss); + savesegment(gs, gs); + } printk("\n"); - printk("Pid: %d, comm: %20s\n", current->pid, current->comm); - printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id()); + printk("Pid: %d, comm: %s %s (%s %.*s)\n", + task_pid_nr(current), current->comm, + print_tainted(), init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); + + printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", + 0xffff & regs->xcs, regs->eip, regs->eflags, + smp_processor_id()); print_symbol("EIP is at %s\n", regs->eip); - if (user_mode_vm(regs)) - printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp); - printk(" EFLAGS: %08lx %s (%s %.*s)\n", - regs->eflags, print_tainted(), init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", - regs->eax,regs->ebx,regs->ecx,regs->edx); - printk("ESI: %08lx EDI: %08lx EBP: %08lx", - regs->esi, regs->edi, regs->ebp); - printk(" DS: %04x ES: %04x FS: %04x\n", - 0xffff & regs->xds,0xffff & regs->xes, 0xffff & regs->xfs); + regs->eax, regs->ebx, regs->ecx, regs->edx); + printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", + regs->esi, regs->edi, regs->ebp, esp); + printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", + regs->xds & 0xffff, regs->xes & 0xffff, + regs->xfs & 0xffff, gs, ss); + + if (!all) + return; cr0 = read_cr0(); cr2 = read_cr2(); cr3 = read_cr3(); cr4 = read_cr4_safe(); - printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); + printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", + cr0, cr2, cr3, cr4); get_debugreg(d0, 0); get_debugreg(d1, 1); @@ -273,10 +300,16 @@ get_debugreg(d3, 3); printk("DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", d0, d1, d2, d3); + get_debugreg(d6, 6); get_debugreg(d7, 7); - printk("DR6: %08lx DR7: %08lx\n", d6, d7); + printk("DR6: %08lx DR7: %08lx\n", + d6, d7); +} +void show_regs(struct pt_regs *regs) +{ + __show_registers(regs, 1); show_trace(NULL, regs, &regs->esp); } --- a/arch/x86/kernel/process_64-xen.c +++ b/arch/x86/kernel/process_64-xen.c @@ -1,6 +1,4 @@ /* - * linux/arch/x86-64/kernel/process.c - * * Copyright (C) 1995 Linus Torvalds * * Pentium III FXSR, SSE support @@ -41,6 +39,7 @@ #include #include #include +#include #include #include @@ -172,6 +171,9 @@ if (__get_cpu_var(cpu_idle_state)) __get_cpu_var(cpu_idle_state) = 0; + + tick_nohz_stop_sched_tick(); + rmb(); idle = xen_idle; /* no alternatives */ if (cpu_is_offline(smp_processor_id())) @@ -190,12 +192,17 @@ __exit_idle(); } + tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); schedule(); preempt_disable(); } } +static void do_nothing(void *unused) +{ +} + void cpu_idle_wait(void) { unsigned int cpu, this_cpu = get_cpu(); @@ -221,6
+228,13 @@ cpu_clear(cpu, map); } cpus_and(map, map, cpu_online_map); + /* + * We waited 1 sec, if a CPU still did not call idle + * it may be because it is in idle and not waking up + * because it has nothing to do. + * Give all the remaining CPUS a kick. + */ + smp_call_function_mask(map, do_nothing, 0, 0); } while (!cpus_empty(map)); set_cpus_allowed(current, tmp); @@ -528,7 +542,7 @@ * * Kprobes not supported here. Set the probe on schedule instead. */ -__kprobes struct task_struct * +struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread, --- a/arch/x86/kernel/quirks-xen.c +++ b/arch/x86/kernel/quirks-xen.c @@ -41,7 +41,353 @@ if (!(config & 0x2)) pci_write_config_byte(dev, 0xf4, config); } -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, + quirk_intel_irqbalance); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, + quirk_intel_irqbalance); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, + quirk_intel_irqbalance); +#endif + +#if defined(CONFIG_HPET_TIMER) +#include + +unsigned long force_hpet_address; + +static enum { + NONE_FORCE_HPET_RESUME, + OLD_ICH_FORCE_HPET_RESUME, + ICH_FORCE_HPET_RESUME, + VT8237_FORCE_HPET_RESUME, + NVIDIA_FORCE_HPET_RESUME, +} force_hpet_resume_type; + +static void __iomem *rcba_base; + +static void ich_force_hpet_resume(void) +{ + u32 val; + + if (!force_hpet_address) + return; + + if (rcba_base == NULL) + BUG(); + + /* read the Function Disable register, dword mode only */ + val = readl(rcba_base + 0x3404); + if (!(val & 0x80)) { + /* HPET disabled in HPTC. Trying to enable */ + writel(val | 0x80, rcba_base + 0x3404); + } + + val = readl(rcba_base + 0x3404); + if (!(val & 0x80)) + BUG(); + else + printk(KERN_DEBUG "Force enabled HPET at resume\n"); + + return; +} + +static void ich_force_enable_hpet(struct pci_dev *dev) +{ + u32 val; + u32 uninitialized_var(rcba); + int err = 0; + + if (hpet_address || force_hpet_address) + return; + + pci_read_config_dword(dev, 0xF0, &rcba); + rcba &= 0xFFFFC000; + if (rcba == 0) { + printk(KERN_DEBUG "RCBA disabled. Cannot force enable HPET\n"); + return; + } + + /* use bits 31:14, 16 kB aligned */ + rcba_base = ioremap_nocache(rcba, 0x4000); + if (rcba_base == NULL) { + printk(KERN_DEBUG "ioremap failed. Cannot force enable HPET\n"); + return; + } + + /* read the Function Disable register, dword mode only */ + val = readl(rcba_base + 0x3404); + + if (val & 0x80) { + /* HPET is enabled in HPTC. Just not reported by BIOS */ + val = val & 0x3; + force_hpet_address = 0xFED00000 | (val << 12); + printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n", + force_hpet_address); + iounmap(rcba_base); + return; + } + + /* HPET disabled in HPTC. 
Trying to enable */ + writel(val | 0x80, rcba_base + 0x3404); + + val = readl(rcba_base + 0x3404); + if (!(val & 0x80)) { + err = 1; + } else { + val = val & 0x3; + force_hpet_address = 0xFED00000 | (val << 12); + } + + if (err) { + force_hpet_address = 0; + iounmap(rcba_base); + printk(KERN_DEBUG "Failed to force enable HPET\n"); + } else { + force_hpet_resume_type = ICH_FORCE_HPET_RESUME; + printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n", + force_hpet_address); + } +} + +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB2_0, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_1, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_0, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_1, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_31, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1, + ich_force_enable_hpet); + + +static struct pci_dev *cached_dev; + +static void old_ich_force_hpet_resume(void) +{ + u32 val; + u32 uninitialized_var(gen_cntl); + + if (!force_hpet_address || !cached_dev) + return; + + pci_read_config_dword(cached_dev, 0xD0, &gen_cntl); + gen_cntl &= (~(0x7 << 15)); + gen_cntl |= (0x4 << 15); + + pci_write_config_dword(cached_dev, 0xD0, gen_cntl); + pci_read_config_dword(cached_dev, 0xD0, &gen_cntl); + val = gen_cntl >> 15; + val &= 0x7; + if (val == 0x4) + printk(KERN_DEBUG "Force enabled HPET at resume\n"); + else + BUG(); +} + +static void old_ich_force_enable_hpet(struct pci_dev *dev) +{ + u32 val; + u32 uninitialized_var(gen_cntl); + + if (hpet_address || force_hpet_address) + return; + + pci_read_config_dword(dev, 0xD0, &gen_cntl); + /* + * Bit 17 is HPET enable bit. + * Bit 16:15 control the HPET base address. + */ + val = gen_cntl >> 15; + val &= 0x7; + if (val & 0x4) { + val &= 0x3; + force_hpet_address = 0xFED00000 | (val << 12); + printk(KERN_DEBUG "HPET at base address 0x%lx\n", + force_hpet_address); + return; + } + + /* + * HPET is disabled. Trying enabling at FED00000 and check + * whether it sticks + */ + gen_cntl &= (~(0x7 << 15)); + gen_cntl |= (0x4 << 15); + pci_write_config_dword(dev, 0xD0, gen_cntl); + + pci_read_config_dword(dev, 0xD0, &gen_cntl); + + val = gen_cntl >> 15; + val &= 0x7; + if (val & 0x4) { + /* HPET is enabled in HPTC. Just not reported by BIOS */ + val &= 0x3; + force_hpet_address = 0xFED00000 | (val << 12); + printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n", + force_hpet_address); + cached_dev = dev; + force_hpet_resume_type = OLD_ICH_FORCE_HPET_RESUME; + return; + } + + printk(KERN_DEBUG "Failed to force enable HPET\n"); +} + +/* + * Undocumented chipset features. Make sure that the user enforced + * this. 
+ */ +static void old_ich_force_enable_hpet_user(struct pci_dev *dev) +{ + if (hpet_force_user) + old_ich_force_enable_hpet(dev); +} + +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_0, + old_ich_force_enable_hpet_user); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_12, + old_ich_force_enable_hpet_user); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801DB_0, + old_ich_force_enable_hpet_user); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801DB_12, + old_ich_force_enable_hpet_user); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801EB_0, + old_ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801EB_12, + old_ich_force_enable_hpet); + + +static void vt8237_force_hpet_resume(void) +{ + u32 val; + + if (!force_hpet_address || !cached_dev) + return; + + val = 0xfed00000 | 0x80; + pci_write_config_dword(cached_dev, 0x68, val); + + pci_read_config_dword(cached_dev, 0x68, &val); + if (val & 0x80) + printk(KERN_DEBUG "Force enabled HPET at resume\n"); + else + BUG(); +} + +static void vt8237_force_enable_hpet(struct pci_dev *dev) +{ + u32 uninitialized_var(val); + + if (!hpet_force_user || hpet_address || force_hpet_address) + return; + + pci_read_config_dword(dev, 0x68, &val); + /* + * Bit 7 is HPET enable bit. + * Bit 31:10 is HPET base address (contrary to what datasheet claims) + */ + if (val & 0x80) { + force_hpet_address = (val & ~0x3ff); + printk(KERN_DEBUG "HPET at base address 0x%lx\n", + force_hpet_address); + return; + } + + /* + * HPET is disabled. Trying enabling at FED00000 and check + * whether it sticks + */ + val = 0xfed00000 | 0x80; + pci_write_config_dword(dev, 0x68, val); + + pci_read_config_dword(dev, 0x68, &val); + if (val & 0x80) { + force_hpet_address = (val & ~0x3ff); + printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n", + force_hpet_address); + cached_dev = dev; + force_hpet_resume_type = VT8237_FORCE_HPET_RESUME; + return; + } + + printk(KERN_DEBUG "Failed to force enable HPET\n"); +} + +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, + vt8237_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237, + vt8237_force_enable_hpet); + +/* + * Undocumented chipset feature taken from LinuxBIOS. 
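[ Editor's note: illustrative sketch, not part of the patch. Every force_enable variant in the quirks-xen.c hunks above follows the same write-then-verify shape: if the enable bit is already set, just record the BIOS-hidden base address; otherwise write the desired value into PCI config space, read it back, and trust the feature only if the bit stuck. Config space is faked here with a plain variable; 0xfed00000 and bit 7 mirror the VT8237 hunk, the rest is invented for the example. ]

#include <stdio.h>
typedef unsigned int u32;

static u32 fake_cfg_reg;	/* stand-in for pci_{read,write}_config_dword */
static unsigned long force_hpet_address;

static void vt8237_style_force_enable(void)
{
	u32 val = fake_cfg_reg;

	if (val & 0x80) {			/* already enabled, just unreported */
		force_hpet_address = val & ~0x3ffu;
		return;
	}
	fake_cfg_reg = 0xfed00000 | 0x80;	/* try to enable it ... */
	val = fake_cfg_reg;			/* ... and read back */
	if (val & 0x80) {			/* did the bit stick? */
		force_hpet_address = val & ~0x3ffu;
		printf("Force enabled HPET at base address 0x%lx\n",
		       force_hpet_address);
	} else {
		printf("Failed to force enable HPET\n");
	}
}

int main(void)
{
	vt8237_style_force_enable();
	return 0;
}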
+ */
+static void nvidia_force_hpet_resume(void)
+{
+	pci_write_config_dword(cached_dev, 0x44, 0xfed00001);
+	printk(KERN_DEBUG "Force enabled HPET at resume\n");
+}
+
+static void nvidia_force_enable_hpet(struct pci_dev *dev)
+{
+	u32 uninitialized_var(val);
+
+	if (!hpet_force_user || hpet_address || force_hpet_address)
+		return;
+
+	pci_write_config_dword(dev, 0x44, 0xfed00001);
+	pci_read_config_dword(dev, 0x44, &val);
+	force_hpet_address = val & 0xfffffffe;
+	force_hpet_resume_type = NVIDIA_FORCE_HPET_RESUME;
+	printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n",
+	       force_hpet_address);
+	cached_dev = dev;
+	return;
+}
+
+/* ISA Bridges */
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0050,
+			 nvidia_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0051,
+			 nvidia_force_enable_hpet);
+
+/* LPC bridges */
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0360,
+			 nvidia_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0361,
+			 nvidia_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0362,
+			 nvidia_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0363,
+			 nvidia_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0364,
+			 nvidia_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0365,
+			 nvidia_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0366,
+			 nvidia_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0367,
+			 nvidia_force_enable_hpet);
+
+void force_hpet_resume(void)
+{
+	switch (force_hpet_resume_type) {
+	case ICH_FORCE_HPET_RESUME:
+		return ich_force_hpet_resume();
+
+	case OLD_ICH_FORCE_HPET_RESUME:
+		return old_ich_force_hpet_resume();
+
+	case VT8237_FORCE_HPET_RESUME:
+		return vt8237_force_hpet_resume();
+
+	case NVIDIA_FORCE_HPET_RESUME:
+		return nvidia_force_hpet_resume();
+
+	default:
+		break;
+	}
+}
+
 #endif
--- a/arch/x86/kernel/setup64-xen.c
+++ b/arch/x86/kernel/setup64-xen.c
@@ -15,7 +15,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -27,11 +26,12 @@
 #include
 #include
 #include
+#include
 #ifdef CONFIG_XEN
 #include
 #endif
 
-char x86_boot_params[BOOT_PARAM_SIZE] __initdata;
+struct boot_params __initdata boot_params;
 
 cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
@@ -159,8 +159,8 @@
 
 static void __cpuinit cpu_gdt_init(const struct desc_ptr *gdt_descr)
 {
-	asm volatile("lgdt %0" :: "m" (*gdt_descr));
-	asm volatile("lidt %0" :: "m" (idt_descr));
+	load_gdt(gdt_descr);
+	load_idt(idt_descr);
 }
 #endif
@@ -252,6 +252,14 @@
 
 unsigned long kernel_eflags;
 
+#ifndef CONFIG_X86_NO_TSS
+/*
+ * Copies of the original ist values from the tss are only accessed during
+ * debugging, no special alignment required.
+ */
+DEFINE_PER_CPU(struct orig_ist, orig_ist);
+#endif
+
 /*
  * cpu_init() initializes state that is per-CPU. Some data is already
  * initialized (naturally) in the bootstrap process, such as the GDT
--- a/arch/x86/kernel/setup_32-xen.c
+++ b/arch/x86/kernel/setup_32-xen.c
@@ -1,6 +1,4 @@
 /*
- *  linux/arch/i386/kernel/setup.c
- *
  *  Copyright (C) 1995  Linus Torvalds
  *
  *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
@@ -70,6 +68,7 @@
 #include
 #include
 #include
+#include
 
 #ifdef CONFIG_XEN
 #include
@@ -83,13 +82,14 @@
 extern char hypercall_page[PAGE_SIZE];
 EXPORT_SYMBOL(hypercall_page);
 
-int disable_pse __devinitdata = 0;
+int disable_pse __cpuinitdata = 0;
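The setup64-xen.c hunk above retires the raw x86_boot_params byte array in favour of a typed struct boot_params; the setup_32/setup_64 hunks that follow then read named fields instead of offset macros. A sketch of the difference (the 0x1FA offset is the conventional zero-page location of vid_mode, given here only as an assumed example):

	/* old style: pick a field out of an opaque byte buffer */
	#define VIDEO_MODE_DEMO (*(u16 *)(x86_boot_params + 0x1FA))

	/* new style: the same bytes, but named and type-checked */
	u16 mode = boot_params.hdr.vid_mode;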
 /*
  * Machine setup..
  */
 extern struct resource code_resource;
 extern struct resource data_resource;
+extern struct resource bss_resource;
 
 /* cpu data as detected by the assembly code in head.S */
 struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
@@ -101,9 +101,6 @@
 
 /* for MCA, but anyone else can use it if they want */
 unsigned int machine_id;
-#ifdef CONFIG_MCA
-EXPORT_SYMBOL(machine_id);
-#endif
 unsigned int machine_submodel_id;
 unsigned int BIOS_revision;
 unsigned int mca_pentium_flag;
@@ -124,7 +121,7 @@
 struct edid_info edid_info;
 EXPORT_SYMBOL_GPL(edid_info);
 #ifndef CONFIG_XEN
-#define copy_edid() (edid_info = EDID_INFO)
+#define copy_edid() (edid_info = boot_params.edid_info)
 #endif
 struct ist_info ist_info;
 #if defined(CONFIG_X86_SPEEDSTEP_SMI) || \
@@ -173,10 +170,11 @@
  */
 static inline void copy_edd(void)
 {
-     memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature));
-     memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info));
-     edd.mbr_signature_nr = EDD_MBR_SIG_NR;
-     edd.edd_info_nr = EDD_NR;
+     memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
+	    sizeof(edd.mbr_signature));
+     memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
+     edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
+     edd.edd_info_nr = boot_params.eddbuf_entries;
 }
 #endif
 #else
@@ -419,6 +417,53 @@
 extern void zone_sizes_init(void);
 #endif /* !CONFIG_NEED_MULTIPLE_NODES */
 
+static inline unsigned long long get_total_mem(void)
+{
+	unsigned long long total;
+
+	total = max_low_pfn - min_low_pfn;
+#ifdef CONFIG_HIGHMEM
+	total += highend_pfn - highstart_pfn;
+#endif
+
+	return total << PAGE_SHIFT;
+}
+
+#ifdef CONFIG_KEXEC
+#ifndef CONFIG_XEN
+static void __init reserve_crashkernel(void)
+{
+	unsigned long long total_mem;
+	unsigned long long crash_size, crash_base;
+	int ret;
+
+	total_mem = get_total_mem();
+
+	ret = parse_crashkernel(boot_command_line, total_mem,
+			&crash_size, &crash_base);
+	if (ret == 0 && crash_size > 0) {
+		if (crash_base > 0) {
+			printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
+					"for crashkernel (System RAM: %ldMB)\n",
+					(unsigned long)(crash_size >> 20),
+					(unsigned long)(crash_base >> 20),
+					(unsigned long)(total_mem >> 20));
+			crashk_res.start = crash_base;
+			crashk_res.end   = crash_base + crash_size - 1;
+			reserve_bootmem(crash_base, crash_size);
+		} else
+			printk(KERN_INFO "crashkernel reservation failed - "
+					"you have to specify a base address\n");
+	}
+}
+#else
+#define reserve_crashkernel xen_machine_kexec_setup_resources
+#endif
+#else
+static inline void __init reserve_crashkernel(void)
+{}
+#endif
+
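reserve_crashkernel() now delegates command-line parsing to the generic parse_crashkernel() helper and only performs the reservation itself. A hedged usage sketch, assuming the 2.6.24-era signature shown in the hunk above:

	/* For a boot command line such as "crashkernel=64M@16M": */
	unsigned long long size, base;

	if (parse_crashkernel(boot_command_line, total_mem, &size, &base) == 0
	    && size > 0 && base > 0) {
		crashk_res.start = base;		/* 16MB in the example */
		crashk_res.end   = base + size - 1;	/* 64MB window */
		reserve_bootmem(base, size);
	}

total_mem here is whatever the caller considers System RAM (get_total_mem() on i386, free_mem in the x86-64 variant further down).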
 void __init setup_bootmem_allocator(void)
 {
 	unsigned long bootmap_size;
@@ -474,30 +519,25 @@
 
 #ifdef CONFIG_BLK_DEV_INITRD
 	if (xen_start_info->mod_start) {
-		if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
-			/*reserve_bootmem(INITRD_START, INITRD_SIZE);*/
-			initrd_start = INITRD_START + PAGE_OFFSET;
-			initrd_end = initrd_start+INITRD_SIZE;
+		unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
+		unsigned long ramdisk_size  = xen_start_info->mod_len;
+		unsigned long ramdisk_end   = ramdisk_image + ramdisk_size;
+		unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
+
+		if (ramdisk_end <= end_of_lowmem) {
+			/*reserve_bootmem(ramdisk_image, ramdisk_size);*/
+			initrd_start = ramdisk_image + PAGE_OFFSET;
+			initrd_end = initrd_start+ramdisk_size;
 			initrd_below_start_ok = 1;
-		}
-		else {
+		} else {
 			printk(KERN_ERR "initrd extends beyond end of memory "
-			       "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
-			       INITRD_START + INITRD_SIZE,
-			       max_low_pfn << PAGE_SHIFT);
+			       "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
+			       ramdisk_end, end_of_lowmem);
 			initrd_start = 0;
 		}
 	}
 #endif
-#ifdef CONFIG_KEXEC
-#ifdef CONFIG_XEN
-	xen_machine_kexec_setup_resources();
-#else
-	if (crashk_res.start != crashk_res.end)
-		reserve_bootmem(crashk_res.start,
-			crashk_res.end - crashk_res.start + 1);
-#endif
-#endif
+	reserve_crashkernel();
 }
 
 /*
@@ -575,7 +615,8 @@
 	 * the system table is valid.  If not, then initialize normally.
 	 */
 #ifdef CONFIG_EFI
-	if ((LOADER_TYPE == 0x50) && EFI_SYSTAB)
+	if ((boot_params.hdr.type_of_loader == 0x50) &&
+	    boot_params.efi_info.efi_systab)
 		efi_enabled = 1;
 #endif
 
@@ -583,18 +624,18 @@
 	   properly.  Setting ROOT_DEV to default to /dev/ram0 breaks initrd.
 	*/
 	ROOT_DEV = MKDEV(UNNAMED_MAJOR,0);
-	screen_info = SCREEN_INFO;
+	screen_info = boot_params.screen_info;
 	copy_edid();
-	apm_info.bios = APM_BIOS_INFO;
-	ist_info = IST_INFO;
-	saved_videomode = VIDEO_MODE;
-	if( SYS_DESC_TABLE.length != 0 ) {
-		set_mca_bus(SYS_DESC_TABLE.table[3] & 0x2);
-		machine_id = SYS_DESC_TABLE.table[0];
-		machine_submodel_id = SYS_DESC_TABLE.table[1];
-		BIOS_revision = SYS_DESC_TABLE.table[2];
+	apm_info.bios = boot_params.apm_bios_info;
+	ist_info = boot_params.ist_info;
+	saved_videomode = boot_params.hdr.vid_mode;
+	if( boot_params.sys_desc_table.length != 0 ) {
+		set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
+		machine_id = boot_params.sys_desc_table.table[0];
+		machine_submodel_id = boot_params.sys_desc_table.table[1];
+		BIOS_revision = boot_params.sys_desc_table.table[2];
 	}
-	bootloader_type = LOADER_TYPE;
+	bootloader_type = boot_params.hdr.type_of_loader;
 
 	if (is_initial_xendomain()) {
 		const struct dom0_vga_console_info *info =
@@ -609,9 +650,9 @@
 		screen_info.orig_video_isVGA = 0;
 
 #ifdef CONFIG_BLK_DEV_RAM
-	rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
-	rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
-	rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
+	rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
+	rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
+	rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
#endif
 
 	ARCH_SETUP
@@ -624,7 +665,7 @@
 
 	copy_edd();
 
-	if (!MOUNT_ROOT_RDONLY)
+	if (!boot_params.hdr.root_flags)
 		root_mountflags &= ~MS_RDONLY;
 	init_mm.start_code = (unsigned long) _text;
 	init_mm.end_code = (unsigned long) _etext;
@@ -636,6 +677,8 @@
 	code_resource.end = virt_to_phys(_etext)-1;
 	data_resource.start = virt_to_phys(_etext);
 	data_resource.end = virt_to_phys(_edata)-1;
+	bss_resource.start = virt_to_phys(&__bss_start);
+	bss_resource.end = virt_to_phys(&__bss_stop)-1;
 
 	if ((i = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
 		i = COMMAND_LINE_SIZE;
@@ -664,7 +707,7 @@
 	/*
 	 * NOTE: before this point _nobody_ is allowed to allocate
 	 * any memory using the bootmem allocator.  Although the
-	 * alloctor is now initialised only the first 8Mb of the kernel
+	 * allocator is now initialised only the first 8Mb of the kernel
 	 * virtual address space has been mapped. All allocations before
 	 * paging_init() has completed must use the alloc_bootmem_low_pages()
 	 * variant (which allocates DMA'able memory) and care must be taken
@@ -787,10 +830,8 @@
 	acpi_boot_table_init();
 #endif
 
-#ifdef CONFIG_PCI
-#ifdef CONFIG_X86_IO_APIC
-	check_acpi_pci();	/* Checks more than just ACPI actually */
-#endif
+#if defined(CONFIG_PCI) && !defined(CONFIG_XEN)
+	early_quirks();
 #endif
 
 #ifdef CONFIG_ACPI
--- a/arch/x86/kernel/setup_64-xen.c
+++ b/arch/x86/kernel/setup_64-xen.c
@@ -1,10 +1,5 @@
 /*
- *  linux/arch/x86-64/kernel/setup.c
- *
  *  Copyright (C) 1995  Linus Torvalds
- *
- *  Nov 2001 Dave Jones
- *  Forked from i386 setup code.
 */
 
 /*
@@ -57,13 +52,13 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
 #include
 #include
 #include
+#include
 #ifdef CONFIG_XEN
 #include
 #include
@@ -183,6 +178,12 @@
 	.end = 0,
 	.flags = IORESOURCE_RAM,
 };
+struct resource bss_resource = {
+	.name = "Kernel bss",
+	.start = 0,
+	.end = 0,
+	.flags = IORESOURCE_RAM,
+};
 
 #ifdef CONFIG_PROC_VMCORE
 /* elfcorehdr= specifies the location of elf core header
@@ -234,10 +235,11 @@
  */
 static inline void copy_edd(void)
 {
-     memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature));
-     memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info));
-     edd.mbr_signature_nr = EDD_MBR_SIG_NR;
-     edd.edd_info_nr = EDD_NR;
+     memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
+	    sizeof(edd.mbr_signature));
+     memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info));
+     edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries;
+     edd.edd_info_nr = boot_params.eddbuf_entries;
 }
 #endif
 #else
@@ -246,6 +248,41 @@
 }
 #endif
 
+#ifdef CONFIG_KEXEC
+#ifndef CONFIG_XEN
+static void __init reserve_crashkernel(void)
+{
+	unsigned long long free_mem;
+	unsigned long long crash_size, crash_base;
+	int ret;
+
+	free_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT;
+
+	ret = parse_crashkernel(boot_command_line, free_mem,
+			&crash_size, &crash_base);
+	if (ret == 0 && crash_size) {
+		if (crash_base > 0) {
+			printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
+					"for crashkernel (System RAM: %ldMB)\n",
+					(unsigned long)(crash_size >> 20),
+					(unsigned long)(crash_base >> 20),
+					(unsigned long)(free_mem >> 20));
+			crashk_res.start = crash_base;
+			crashk_res.end   = crash_base + crash_size - 1;
+			reserve_bootmem(crash_base, crash_size);
+		} else
+			printk(KERN_INFO "crashkernel reservation failed - "
					"you have to specify a base address\n");
+	}
+}
+#else
+#define reserve_crashkernel xen_machine_kexec_setup_resources
+#endif
+#else
+static inline void __init reserve_crashkernel(void)
+{}
+#endif
+
 #ifndef CONFIG_XEN
 #define EBDA_ADDR_POINTER 0x40E
@@ -286,7 +323,7 @@
 	atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
 
 	ROOT_DEV = MKDEV(RAMDISK_MAJOR,0);
-	screen_info = SCREEN_INFO;
+	screen_info = boot_params.screen_info;
 
 	if (is_initial_xendomain()) {
 		const struct dom0_vga_console_info *info =
@@ -309,22 +346,22 @@
 #else
 	printk(KERN_INFO "Command line: %s\n", boot_command_line);
 
-	ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
-	screen_info = SCREEN_INFO;
-	edid_info = EDID_INFO;
+	ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
+	screen_info = boot_params.screen_info;
+	edid_info = boot_params.edid_info;
 #endif	/* !CONFIG_XEN */
-	saved_video_mode = SAVED_VIDEO_MODE;
-	bootloader_type = LOADER_TYPE;
+	saved_video_mode = boot_params.hdr.vid_mode;
+	bootloader_type = boot_params.hdr.type_of_loader;
 
 #ifdef CONFIG_BLK_DEV_RAM
-	rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
-	rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
-	rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
+	rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
+	rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
+	rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
 #endif
 	setup_memory_region();
 	copy_edd();
 
-	if (!MOUNT_ROOT_RDONLY)
+	if (!boot_params.hdr.root_flags)
 		root_mountflags &= ~MS_RDONLY;
 	init_mm.start_code = (unsigned long) &_text;
 	init_mm.end_code = (unsigned long) &_etext;
@@ -335,6 +372,8 @@
 	code_resource.end = virt_to_phys(&_etext)-1;
 	data_resource.start = virt_to_phys(&_etext);
 	data_resource.end = virt_to_phys(&_edata)-1;
+	bss_resource.start = virt_to_phys(&__bss_start);
+	bss_resource.end = virt_to_phys(&__bss_stop)-1;
 
 	early_identify_cpu(&boot_cpu_data);
@@ -362,6 +401,11 @@
 	if (is_initial_xendomain())
 		dmi_scan_machine();
 
+#if defined(CONFIG_SMP) && !defined(CONFIG_XEN)
+	/* setup to use the static apicid table during kernel startup */
+	x86_cpu_to_apicid_ptr = (void *)&x86_cpu_to_apicid_init;
+#endif
+
 	/* How many end-of-memory variables you have, grandma! */
 	max_low_pfn = end_pfn;
 	max_pfn = end_pfn;
@@ -426,52 +470,37 @@
 	 */
 	acpi_reserve_bootmem();
 #endif
-#ifdef CONFIG_XEN
 #ifdef CONFIG_BLK_DEV_INITRD
+#ifdef CONFIG_XEN
 	if (xen_start_info->mod_start) {
-		if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) {
-			/*reserve_bootmem_generic(INITRD_START, INITRD_SIZE);*/
-			initrd_start = INITRD_START + PAGE_OFFSET;
-			initrd_end = initrd_start+INITRD_SIZE;
+		unsigned long ramdisk_image = __pa(xen_start_info->mod_start);
+		unsigned long ramdisk_size  = xen_start_info->mod_len;
+#else
	if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
+		unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
+		unsigned long ramdisk_size  = boot_params.hdr.ramdisk_size;
+#endif
+		unsigned long ramdisk_end   = ramdisk_image + ramdisk_size;
+		unsigned long end_of_mem    = end_pfn << PAGE_SHIFT;
+
+		if (ramdisk_end <= end_of_mem) {
+#ifndef CONFIG_XEN
+			reserve_bootmem_generic(ramdisk_image, ramdisk_size);
+#endif
+			initrd_start = ramdisk_image + PAGE_OFFSET;
+			initrd_end = initrd_start+ramdisk_size;
+#ifdef CONFIG_XEN
 			initrd_below_start_ok = 1;
-		} else {
-			printk(KERN_ERR "initrd extends beyond end of memory "
-			       "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
-			       (unsigned long)(INITRD_START + INITRD_SIZE),
-			       (unsigned long)(end_pfn << PAGE_SHIFT));
-			initrd_start = 0;
-		}
-	}
 #endif
-#else	/* CONFIG_XEN */
-#ifdef CONFIG_BLK_DEV_INITRD
-	if (LOADER_TYPE && INITRD_START) {
-		if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) {
-			reserve_bootmem_generic(INITRD_START, INITRD_SIZE);
-			initrd_start = INITRD_START + PAGE_OFFSET;
-			initrd_end = initrd_start+INITRD_SIZE;
-		}
-		else {
+		} else {
 			printk(KERN_ERR "initrd extends beyond end of memory "
-			       "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
-			       (unsigned long)(INITRD_START + INITRD_SIZE),
-			       (unsigned long)(end_pfn << PAGE_SHIFT));
+			       "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
+			       ramdisk_end, end_of_mem);
 			initrd_start = 0;
 		}
 	}
 #endif
-#endif	/* !CONFIG_XEN */
-#ifdef CONFIG_KEXEC
-#ifdef CONFIG_XEN
-	xen_machine_kexec_setup_resources();
-#else
-	if (crashk_res.start != crashk_res.end) {
-		reserve_bootmem_generic(crashk_res.start,
-			crashk_res.end - crashk_res.start + 1);
-	}
-#endif
-#endif
-
+	reserve_crashkernel();
 	paging_init();
 #ifdef CONFIG_X86_LOCAL_APIC
 	/*
@@ -796,7 +825,7 @@
 	   but in the same order as the HT nodeids. If that doesn't result
 	   in a usable node fall back to the path for the previous case. */
-	int ht_nodeid = apicid - (cpu_data[0].phys_proc_id << bits);
+	int ht_nodeid = apicid - (cpu_data(0).phys_proc_id << bits);
 
 	if (ht_nodeid >= 0 &&
 	    apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
 		node = apicid_to_node[ht_nodeid];
@@ -811,6 +840,39 @@
 #endif
 }
 
+#define ENABLE_C1E_MASK		0x18000000
+#define CPUID_PROCESSOR_SIGNATURE	1
+#define CPUID_XFAM		0x0ff00000
+#define CPUID_XFAM_K8		0x00000000
+#define CPUID_XFAM_10H		0x00100000
+#define CPUID_XFAM_11H		0x00200000
+#define CPUID_XMOD		0x000f0000
+#define CPUID_XMOD_REV_F	0x00040000
+
+#ifndef CONFIG_XEN
+/* AMD systems with C1E don't have a working lAPIC timer. Check for that. */
+static __cpuinit int amd_apic_timer_broken(void)
+{
+	u32 lo, hi;
+	u32 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
+	switch (eax & CPUID_XFAM) {
+	case CPUID_XFAM_K8:
+		if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F)
+			break;
+	case CPUID_XFAM_10H:
+	case CPUID_XFAM_11H:
+		rdmsr(MSR_K8_ENABLE_C1E, lo, hi);
+		if (lo & ENABLE_C1E_MASK)
+			return 1;
+		break;
+	default:
+		/* err on the side of caution */
+		return 1;
+	}
+	return 0;
+}
+#endif
+
 static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 {
 	unsigned level;
@@ -840,7 +902,7 @@
 	level = cpuid_eax(1);
 	if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58))
 		set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
-	if (c->x86 == 0x10)
+	if (c->x86 == 0x10 || c->x86 == 0x11)
 		set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
 
 	/* Enable workaround for FXSAVE leak */
@@ -882,6 +944,11 @@
 	/* Family 10 doesn't support C states in MWAIT so don't use it */
 	if (c->x86 == 0x10 && !force_mwait)
 		clear_bit(X86_FEATURE_MWAIT, &c->x86_capability);
+
+#ifndef CONFIG_XEN
+	if (amd_apic_timer_broken())
+		disable_apic_timer = 1;
+#endif
 }
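amd_apic_timer_broken() classifies the CPU by the extended-family bits of CPUID leaf 1 before probing MSR_K8_ENABLE_C1E. A sketch of the mask arithmetic (EAX layout: 3:0 stepping, 7:4 model, 11:8 family, 19:16 ext. model, 27:20 ext. family):

	static int demo_is_family_10h_or_11h(u32 eax)
	{
		u32 xfam = eax & 0x0ff00000;	/* CPUID_XFAM, bits 27:20 */

		return xfam == 0x00100000 ||	/* CPUID_XFAM_10H */
		       xfam == 0x00200000;	/* CPUID_XFAM_11H */
	}

Note the deliberate fall-through in the switch above: a K8 at revision F or newer continues into the same C1E check as family 10h/11h.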
 
 static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
@@ -1192,6 +1259,7 @@
 static int show_cpuinfo(struct seq_file *m, void *v)
 {
 	struct cpuinfo_x86 *c = v;
+	int cpu = 0;
 
 	/*
	 * These flag bits must match the definitions in .
@@ -1201,7 +1269,7 @@
 	 * applications want to get the raw CPUID data, they should access
 	 * /dev/cpu//cpuid instead.
 	 */
-	static char *x86_cap_flags[] = {
+	static const char *const x86_cap_flags[] = {
 		/* Intel-defined */
 	        "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
 	        "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
@@ -1232,7 +1300,7 @@
 		/* Intel-defined (#2) */
 		"pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
 		"tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
-		NULL, NULL, "dca", NULL, NULL, NULL, NULL, "popcnt",
+		NULL, NULL, "dca", "sse4_1", "sse4_2", NULL, NULL, "popcnt",
 		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 
 		/* VIA/Cyrix/Centaur-defined */
@@ -1242,10 +1310,10 @@
 		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 
 		/* AMD-defined (#2) */
-		"lahf_lm", "cmp_legacy", "svm", "extapic", "cr8_legacy",
-		"altmovcr8", "abm", "sse4a",
-		"misalignsse", "3dnowprefetch",
-		"osvw", "ibs", NULL, NULL, NULL, NULL,
+		"lahf_lm", "cmp_legacy", "svm", "extapic",
+		"cr8_legacy", "abm", "sse4a", "misalignsse",
+		"3dnowprefetch", "osvw", "ibs", "sse5",
+		"skinit", "wdt", NULL, NULL, NULL, NULL,
 		NULL, NULL, NULL, NULL,
 		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
@@ -1255,7 +1323,7 @@
 		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 	};
-	static char *x86_power_flags[] = {
+	static const char *const x86_power_flags[] = {
 		"ts",	/* temperature sensor */
 		"fid",  /* frequency id control */
 		"vid",  /* voltage id control */
@@ -1270,8 +1338,7 @@
 
 
 #ifdef CONFIG_SMP
-	if (!cpu_online(c-cpu_data))
-		return 0;
+	cpu = c->cpu_index;
 #endif
 
 	seq_printf(m,"processor\t: %u\n"
@@ -1279,7 +1346,7 @@
 		"cpu family\t: %d\n"
 		"model\t\t: %d\n"
 		"model name\t: %s\n",
-		(unsigned)(c-cpu_data),
+		(unsigned)cpu,
 		c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
 		c->x86,
 		(int)c->x86_model,
@@ -1291,7 +1358,7 @@
 		seq_printf(m, "stepping\t: unknown\n");
 
 	if (cpu_has(c,X86_FEATURE_TSC)) {
-		unsigned int freq = cpufreq_quick_get((unsigned)(c-cpu_data));
+		unsigned int freq = cpufreq_quick_get((unsigned)cpu);
 		if (!freq)
 			freq = cpu_khz;
 		seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
@@ -1304,9 +1371,9 @@
 
 #ifdef CONFIG_SMP
 	if (smp_num_siblings * c->x86_max_cores > 1) {
-		int cpu = c - cpu_data;
 		seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
-		seq_printf(m, "siblings\t: %d\n", cpus_weight(cpu_core_map[cpu]));
+		seq_printf(m, "siblings\t: %d\n",
+			   cpus_weight(per_cpu(cpu_core_map, cpu)));
 		seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
 		seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
 	}
@@ -1361,12 +1428,16 @@
 
 static void *c_start(struct seq_file *m, loff_t *pos)
 {
-	return *pos < NR_CPUS ? cpu_data + *pos : NULL;
+	if (*pos == 0)	/* just in case, cpu 0 is not the first */
+		*pos = first_cpu(cpu_online_map);
+	if ((*pos) < NR_CPUS && cpu_online(*pos))
+		return &cpu_data(*pos);
+	return NULL;
 }
 
 static void *c_next(struct seq_file *m, void *v, loff_t *pos)
 {
-	++*pos;
+	*pos = next_cpu(*pos, cpu_online_map);
 	return c_start(m, pos);
 }
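The reworked c_start()/c_next() pair walks cpu_online_map instead of indexing a flat cpu_data[] array, so /proc/cpuinfo now skips offline or sparsely numbered CPUs. The pair plugs into the usual seq_file iterator contract; a sketch of the wiring (c_stop and show_cpuinfo elided here):

	static const struct seq_operations demo_cpuinfo_op = {
		.start	= c_start,	/* re-derives the cpu from *pos */
		.next	= c_next,	/* advances *pos to the next online cpu */
		.stop	= c_stop,	/* not shown */
		.show	= show_cpuinfo,
	};

Returning NULL from c_start() once cpu_online(*pos) fails is what terminates the sequence.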
--- a/arch/x86/kernel/smp_32-xen.c
+++ b/arch/x86/kernel/smp_32-xen.c
@@ -72,7 +72,7 @@
  *
  *	B stepping CPUs may hang. There are hardware work arounds
  *	for this. We warn about it in case your board doesn't have the work
- *	arounds. Basically thats so I can tell anyone with a B stepping
+ *	arounds. Basically that's so I can tell anyone with a B stepping
  *	CPU and SMP problems "tough".
  *
  *	Specific items [From Pentium Processor Specification Update]
@@ -241,7 +241,7 @@
  * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
  *	Stop ipi delivery for the old mm. This is not synchronized with
  *	the other cpus, but smp_invalidate_interrupt ignore flush ipis
- *	for the wrong mm, and in the worst case we perform a superflous
+ *	for the wrong mm, and in the worst case we perform a superfluous
  *	tlb flush.
  * 1a2) set cpu_tlbstate to TLBSTATE_OK
  *	Now the smp_invalidate_interrupt won't call leave_mm if cpu0
@@ -309,6 +309,7 @@
 	smp_mb__after_clear_bit();
 out:
 	put_cpu_no_resched();
+	__get_cpu_var(irq_stat).irq_tlb_count++;
 
 	return IRQ_HANDLED;
 }
@@ -580,7 +581,7 @@
 	 */
 	cpu_clear(smp_processor_id(), cpu_online_map);
 	disable_all_local_evtchn();
-	if (cpu_data[smp_processor_id()].hlt_works_ok)
+	if (cpu_data(smp_processor_id()).hlt_works_ok)
 		for(;;) halt();
 	for (;;);
 }
@@ -610,6 +611,7 @@
 */
 irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id)
 {
+	__get_cpu_var(irq_stat).irq_resched_count++;
 
 	return IRQ_HANDLED;
 }
@@ -632,6 +634,7 @@
 	 */
 	irq_enter();
 	(*func)(info);
+	__get_cpu_var(irq_stat).irq_call_count++;
 	irq_exit();
 
 	if (wait) {
--- a/arch/x86/kernel/smp_64-xen.c
+++ b/arch/x86/kernel/smp_64-xen.c
@@ -167,6 +167,7 @@
 out:
 	ack_APIC_irq();
 	cpu_clear(cpu, f->flush_cpumask);
+	add_pda(irq_tlb_count, 1);
 }
 
 static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
@@ -326,17 +327,27 @@
 }
 
 /*
- * this function sends a 'generic call function' IPI to one other CPU
- * in the system.
- *
- * cpu is a standard Linux logical CPU number.
+ * this function sends a 'generic call function' IPI to all other CPUs
+ * of the system defined in the mask.
 */
-static void
-__smp_call_function_single(int cpu, void (*func) (void *info), void *info,
-			   int nonatomic, int wait)
+
+static int
+__smp_call_function_mask(cpumask_t mask,
+			 void (*func)(void *), void *info,
+			 int wait)
 {
 	struct call_data_struct data;
-	int cpus = 1;
+	cpumask_t allbutself;
+	int cpus;
+
+	allbutself = cpu_online_map;
+	cpu_clear(smp_processor_id(), allbutself);
+
+	cpus_and(mask, mask, allbutself);
+	cpus = cpus_weight(mask);
+
+	if (!cpus)
+		return 0;
 
 	data.func = func;
 	data.info = info;
@@ -347,19 +358,55 @@
 
 	call_data = &data;
 	wmb();
-	/* Send a message to all other CPUs and wait for them to respond */
-	send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR);
+
+	/* Send a message to other CPUs */
+	if (cpus_equal(mask, allbutself))
+		send_IPI_allbutself(CALL_FUNCTION_VECTOR);
+	else
+		send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
 
 	/* Wait for response */
 	while (atomic_read(&data.started) != cpus)
 		cpu_relax();
 
 	if (!wait)
-		return;
+		return 0;
 
 	while (atomic_read(&data.finished) != cpus)
 		cpu_relax();
+
+	return 0;
 }
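__smp_call_function_mask() strips the calling CPU from the mask and then chooses between an all-but-self broadcast and a targeted IPI. A hedged usage sketch (hypothetical callback; 2.6.24-era cpumask API):

	static void demo_drain(void *unused)
	{
		/* runs on each target CPU in interrupt context */
	}

	static void demo_call_two_cpus(void)
	{
		cpumask_t mask = CPU_MASK_NONE;

		cpu_set(1, mask);
		cpu_set(2, mask);
		/* wait=1: return only after both CPUs ran demo_drain() */
		smp_call_function_mask(mask, demo_drain, NULL, 1);
	}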
+
+/**
+ * smp_call_function_mask(): Run a function on a set of other CPUs.
+ * @mask: The set of cpus to run on.  Must not include the current cpu.
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @wait: If true, wait (atomically) until function has completed on other CPUs.
+ *
+ * Returns 0 on success, else a negative status code.
+ *
+ * If @wait is true, then returns once @func has returned; otherwise
+ * it returns just before the target cpu calls @func.
+ *
+ * You must not call this function with disabled interrupts or from a
+ * hardware interrupt handler or from a bottom half handler.
+ */
+int smp_call_function_mask(cpumask_t mask,
+			   void (*func)(void *), void *info,
+			   int wait)
+{
+	int ret;
+
+	/* Can deadlock when called with interrupts disabled */
+	WARN_ON(irqs_disabled());
+
+	spin_lock(&call_lock);
+	ret = __smp_call_function_mask(mask, func, info, wait);
+	spin_unlock(&call_lock);
+	return ret;
+}
+EXPORT_SYMBOL(smp_call_function_mask);
 
 /*
  * smp_call_function_single - Run a function on a specific CPU
@@ -378,6 +425,7 @@
 	int nonatomic, int wait)
 {
 	/* prevent preemption and reschedule on another processor */
+	int ret;
 	int me = get_cpu();
 
 	/* Can deadlock when called with interrupts disabled */
@@ -391,51 +439,14 @@
 		return 0;
 	}
 
-	spin_lock(&call_lock);
-	__smp_call_function_single(cpu, func, info, nonatomic, wait);
-	spin_unlock(&call_lock);
+	ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, wait);
+
 	put_cpu();
-	return 0;
+	return ret;
 }
 EXPORT_SYMBOL(smp_call_function_single);
 
 /*
- * this function sends a 'generic call function' IPI to all other CPUs
- * in the system.
- */
-static void __smp_call_function (void (*func) (void *info), void *info,
-				int nonatomic, int wait)
-{
-	struct call_data_struct data;
-	int cpus = num_online_cpus()-1;
-
-	if (!cpus)
-		return;
-
-	data.func = func;
-	data.info = info;
-	atomic_set(&data.started, 0);
-	data.wait = wait;
-	if (wait)
-		atomic_set(&data.finished, 0);
-
-	call_data = &data;
-	wmb();
-	/* Send a message to all other CPUs and wait for them to respond */
-	send_IPI_allbutself(CALL_FUNCTION_VECTOR);
-
-	/* Wait for response */
-	while (atomic_read(&data.started) != cpus)
-		cpu_relax();
-
-	if (!wait)
-		return;
-
-	while (atomic_read(&data.finished) != cpus)
-		cpu_relax();
-}
-
-/*
  * smp_call_function - run a function on all other CPUs.
  * @func: The function to run. This must be fast and non-blocking.
  * @info: An arbitrary pointer to pass to the function.
@@ -453,10 +464,7 @@
 int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
			int wait)
 {
-	spin_lock(&call_lock);
-	__smp_call_function(func,info,nonatomic,wait);
-	spin_unlock(&call_lock);
-	return 0;
+	return smp_call_function_mask(cpu_online_map, func, info, wait);
 }
 EXPORT_SYMBOL(smp_call_function);
 
@@ -485,7 +493,7 @@
 	/* Don't deadlock on the call lock in panic */
 	nolock = !spin_trylock(&call_lock);
 	local_irq_save(flags);
-	__smp_call_function(stop_this_cpu, NULL, 0, 0);
+	__smp_call_function_mask(cpu_online_map, stop_this_cpu, NULL, 0);
 	if (!nolock)
 		spin_unlock(&call_lock);
 	disable_all_local_evtchn();
@@ -505,7 +513,9 @@
 {
 #ifndef CONFIG_XEN
 	ack_APIC_irq();
-#else
+#endif
+	add_pda(irq_resched_count, 1);
+#ifdef CONFIG_XEN
 	return IRQ_HANDLED;
 #endif
 }
@@ -535,6 +545,7 @@
 	exit_idle();
 	irq_enter();
 	(*func)(info);
+	add_pda(irq_call_count, 1);
 	irq_exit();
 	if (wait) {
 		mb();
--- a/arch/x86/kernel/time_32-xen.c
+++ b/arch/x86/kernel/time_32-xen.c
@@ -1,6 +1,4 @@
 /*
- *  linux/arch/i386/kernel/time.c
- *
  *  Copyright (C) 1991, 1992, 1995  Linus Torvalds
  *
  * This file contains the PC-specific time handling details:
@@ -74,6 +72,7 @@
 #include
 #include
+#include
 
 #include
 #include
@@ -539,6 +538,13 @@
 	struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
 	struct vcpu_runstate_info runstate;
 
+	/* Keep nmi watchdog up to date */
+#ifdef __i386__
+	per_cpu(irq_stat, smp_processor_id()).irq0_irqs++;
+#else
+	add_pda(irq0_irqs, 1);
+#endif
+
 	/*
	 * Here we are in the timer irq handler. We just have irqs locally
 	 * disabled but we don't know if the timer_bh is running on the other
@@ -987,7 +993,7 @@
 	struct cpufreq_freqs *freq = data;
 	struct xen_platform_op op;
 
-	if (cpu_has(&cpu_data[freq->cpu], X86_FEATURE_CONSTANT_TSC))
+	if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC))
 		return 0;
 
 	if (val == CPUFREQ_PRECHANGE)
@@ -1025,30 +1031,33 @@
 */
 static ctl_table xen_subtable[] = {
 	{
-		.ctl_name	= 1,
+		.ctl_name	= CTL_XEN_INDEPENDENT_WALLCLOCK,
 		.procname	= "independent_wallclock",
 		.data		= &independent_wallclock,
 		.maxlen		= sizeof(independent_wallclock),
 		.mode		= 0644,
+		.strategy	= sysctl_data,
 		.proc_handler	= proc_dointvec
 	},
 	{
-		.ctl_name	= 2,
+		.ctl_name	= CTL_XEN_PERMITTED_CLOCK_JITTER,
 		.procname	= "permitted_clock_jitter",
 		.data		= &permitted_clock_jitter,
 		.maxlen		= sizeof(permitted_clock_jitter),
 		.mode		= 0644,
+		.strategy	= sysctl_data,
 		.proc_handler	= proc_doulongvec_minmax
 	},
-	{ 0 }
+	{ }
 };
 static ctl_table xen_table[] = {
 	{
-		.ctl_name	= 123,
+		.ctl_name	= CTL_XEN,
 		.procname	= "xen",
 		.mode		= 0555,
-		.child		= xen_subtable},
-	{ 0 }
+		.child		= xen_subtable
	},
+	{ }
 };
 static int __init xen_sysctl_init(void)
 {
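The tables now carry named ctl_name constants and a .strategy of sysctl_data, which is what the new kernel/sysctl_check.c expects for binary-sysctl access. A minimal sketch of such an entry for a hypothetical integer knob (2.6.24-era API):

	static int demo_knob;

	static ctl_table demo_subtable[] = {
		{
			.ctl_name	= 1,		/* stable binary id */
			.procname	= "demo_knob",
			.data		= &demo_knob,
			.maxlen		= sizeof(demo_knob),
			.mode		= 0644,
			.strategy	= sysctl_data,	/* generic binary path */
			.proc_handler	= proc_dointvec,
		},
		{ }	/* empty entry terminates the table */
	};

The terminating entry is now written { } rather than { 0 }, matching the style adopted in the hunks above.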
--- a/arch/x86/kernel/traps_32-xen.c
+++ b/arch/x86/kernel/traps_32-xen.c
@@ -1,6 +1,4 @@
 /*
- *  linux/arch/i386/traps.c
- *
  *  Copyright (C) 1991, 1992  Linus Torvalds
  *
  *  Pentium III FXSR, SSE support
@@ -65,6 +63,11 @@
 
 int panic_on_unrecovered_nmi;
 
+#ifndef CONFIG_XEN
+DECLARE_BITMAP(used_vectors, NR_VECTORS);
+EXPORT_SYMBOL_GPL(used_vectors);
+#endif
+
 asmlinkage int system_call(void);
 
 /* Do we ignore FPU interrupts ? */
@@ -120,7 +123,7 @@
 static inline unsigned long print_context_stack(struct thread_info *tinfo,
 				unsigned long *stack, unsigned long ebp,
-				struct stacktrace_ops *ops, void *data)
+				const struct stacktrace_ops *ops, void *data)
 {
 #ifdef	CONFIG_FRAME_POINTER
 	struct stack_frame *frame = (struct stack_frame *)ebp;
@@ -157,7 +160,7 @@
 
 void dump_trace(struct task_struct *task, struct pt_regs *regs,
 	        unsigned long *stack,
-		struct stacktrace_ops *ops, void *data)
+		const struct stacktrace_ops *ops, void *data)
 {
 	unsigned long ebp = 0;
 
@@ -229,7 +232,7 @@
 	touch_nmi_watchdog();
 }
 
-static struct stacktrace_ops print_trace_ops = {
+static const struct stacktrace_ops print_trace_ops = {
 	.warning = print_trace_warning,
 	.warning_symbol = print_trace_warning_symbol,
 	.stack = print_trace_stack,
@@ -288,6 +291,11 @@
 {
 	unsigned long stack;
 
+	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
+		current->pid, current->comm, print_tainted(),
+		init_utsname()->release,
+		(int)strcspn(init_utsname()->version, " "),
+		init_utsname()->version);
 	show_trace(current, NULL, &stack);
 }
 
@@ -296,48 +304,24 @@
 void show_registers(struct pt_regs *regs)
 {
 	int i;
-	int in_kernel = 1;
-	unsigned long esp;
-	unsigned short ss, gs;
-
-	esp = (unsigned long) (&regs->esp);
-	savesegment(ss, ss);
-	savesegment(gs, gs);
-	if (user_mode_vm(regs)) {
-		in_kernel = 0;
-		esp = regs->esp;
-		ss = regs->xss & 0xffff;
-	}
+
 	print_modules();
-	printk(KERN_EMERG "CPU:    %d\n"
-		KERN_EMERG "EIP:    %04x:[<%08lx>]    %s VLI\n"
-		KERN_EMERG "EFLAGS: %08lx   (%s %.*s)\n",
-		smp_processor_id(), 0xffff & regs->xcs, regs->eip,
-		print_tainted(), regs->eflags, init_utsname()->release,
-		(int)strcspn(init_utsname()->version, " "),
-		init_utsname()->version);
-	print_symbol(KERN_EMERG "EIP is at %s\n", regs->eip);
-	printk(KERN_EMERG "eax: %08lx   ebx: %08lx   ecx: %08lx   edx: %08lx\n",
-		regs->eax, regs->ebx, regs->ecx, regs->edx);
-	printk(KERN_EMERG "esi: %08lx   edi: %08lx   ebp: %08lx   esp: %08lx\n",
-		regs->esi, regs->edi, regs->ebp, esp);
-	printk(KERN_EMERG "ds: %04x   es: %04x   fs: %04x  gs: %04x  ss: %04x\n",
-		regs->xds & 0xffff, regs->xes & 0xffff, regs->xfs & 0xffff, gs, ss);
+	__show_registers(regs, 0);
 	printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)",
-		TASK_COMM_LEN, current->comm, current->pid,
+		TASK_COMM_LEN, current->comm, task_pid_nr(current),
 		current_thread_info(), current, task_thread_info(current));
 	/*
 	 * When in-kernel, we also print out the stack and code at the
 	 * time of the fault..
 	 */
-	if (in_kernel) {
+	if (!user_mode_vm(regs)) {
 		u8 *eip;
 		unsigned int code_prologue = code_bytes * 43 / 64;
 		unsigned int code_len = code_bytes;
 		unsigned char c;
 
 		printk("\n" KERN_EMERG "Stack: ");
-		show_stack_log_lvl(NULL, regs, (unsigned long *)esp, KERN_EMERG);
+		show_stack_log_lvl(NULL, regs, &regs->esp, KERN_EMERG);
 
 		printk(KERN_EMERG "Code: ");
 
@@ -382,11 +366,11 @@
 void die(const char * str, struct pt_regs * regs, long err)
 {
 	static struct {
-		spinlock_t lock;
+		raw_spinlock_t lock;
 		u32 lock_owner;
 		int lock_owner_depth;
 	} die = {
-		.lock =			__SPIN_LOCK_UNLOCKED(die.lock),
+		.lock =			__RAW_SPIN_LOCK_UNLOCKED,
 		.lock_owner =		-1,
 		.lock_owner_depth =	0
 	};
@@ -397,40 +381,33 @@
 
 	if (die.lock_owner != raw_smp_processor_id()) {
 		console_verbose();
-		spin_lock_irqsave(&die.lock, flags);
+		raw_local_irq_save(flags);
+		__raw_spin_lock(&die.lock);
 		die.lock_owner = smp_processor_id();
 		die.lock_owner_depth = 0;
 		bust_spinlocks(1);
-	}
-	else
-		local_save_flags(flags);
+	} else
+		raw_local_irq_save(flags);
 
 	if (++die.lock_owner_depth < 3) {
-		int nl = 0;
 		unsigned long esp;
 		unsigned short ss;
 
 		report_bug(regs->eip, regs);
 
-		printk(KERN_EMERG "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter);
+		printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff,
+		       ++die_counter);
 #ifdef CONFIG_PREEMPT
-		printk(KERN_EMERG "PREEMPT ");
-		nl = 1;
+		printk("PREEMPT ");
 #endif
 #ifdef CONFIG_SMP
-		if (!nl)
-			printk(KERN_EMERG);
 		printk("SMP ");
-		nl = 1;
 #endif
 #ifdef CONFIG_DEBUG_PAGEALLOC
-		if (!nl)
-			printk(KERN_EMERG);
 		printk("DEBUG_PAGEALLOC");
-		nl = 1;
 #endif
-		if (nl)
-			printk("\n");
+		printk("\n");
+
 		if (notify_die(DIE_OOPS, str, regs, err,
 				current->thread.trap_no, SIGSEGV) !=
 				NOTIFY_STOP) {
@@ -454,7 +431,8 @@
 	bust_spinlocks(0);
 	die.lock_owner = -1;
 	add_taint(TAINT_DIE);
-	spin_unlock_irqrestore(&die.lock, flags);
+	__raw_spin_unlock(&die.lock);
+	raw_local_irq_restore(flags);
 
 	if (!regs)
 		return;
@@ -571,6 +549,7 @@
 	info.si_errno = 0; \
 	info.si_code = sicode; \
 	info.si_addr = (void __user *)siaddr; \
+	trace_hardirqs_fixup(); \
 	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
						== NOTIFY_STOP) \
		return; \
@@ -606,7 +585,7 @@
 	    printk_ratelimit())
 		printk(KERN_INFO
 		    "%s[%d] general protection eip:%lx esp:%lx error:%lx\n",
-		    current->comm, current->pid,
+		    current->comm, task_pid_nr(current),
 		    regs->eip, regs->esp, error_code);
 
 	force_sig(SIGSEGV, current);
@@ -785,6 +764,8 @@
 #ifdef CONFIG_KPROBES
 fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code)
 {
+	trace_hardirqs_fixup();
+
 	if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
 			== NOTIFY_STOP)
 		return;
@@ -822,6 +803,8 @@
 	unsigned int condition;
 	struct task_struct *tsk = current;
 
+	trace_hardirqs_fixup();
+
 	get_debugreg(condition, 6);
 
 	if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
@@ -1084,20 +1067,6 @@
 
 #endif /* CONFIG_MATH_EMULATION */
 
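The die() rework above swaps the spinlock for a raw spinlock plus explicit owner tracking, so a recursive oops on the same CPU re-enters the printing path instead of self-deadlocking. The skeleton of that pattern, reduced from the hunk:

	static raw_spinlock_t demo_die_lock = __RAW_SPIN_LOCK_UNLOCKED;
	static int demo_die_owner = -1;

	static void demo_oops_begin(unsigned long *flags)
	{
		if (demo_die_owner != raw_smp_processor_id()) {
			raw_local_irq_save(*flags);	/* irqs off first */
			__raw_spin_lock(&demo_die_lock);
			demo_die_owner = smp_processor_id();
		} else {
			/* nested oops on this CPU: lock already held */
			raw_local_irq_save(*flags);
		}
	}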
-#ifdef CONFIG_X86_F00F_BUG
-void __init trap_init_f00f_bug(void)
-{
-	__set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO);
-
-	/*
-	 * Update the IDT descriptor and reload the IDT so that
-	 * it uses the read-only mapped virtual address.
-	 */
-	idt_descr.address = fix_to_virt(FIX_F00F_IDT);
-	load_idt(&idt_descr);
-}
-#endif
-
 /*
  * NB. All these are "trap gates" (i.e. events_mask isn't set) except
--- a/arch/x86/kernel/traps_64-xen.c
+++ b/arch/x86/kernel/traps_64-xen.c
@@ -1,6 +1,4 @@
 /*
- *  linux/arch/x86-64/traps.c
- *
  *  Copyright (C) 1991, 1992  Linus Torvalds
  *  Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
  *
@@ -33,6 +31,7 @@
 #include
 #include
 #include
+#include
 
 #if defined(CONFIG_EDAC)
 #include
@@ -205,7 +204,7 @@
 #define MSG(txt) ops->warning(data, txt)
 
 /*
- * x86-64 can have upto three kernel stacks:
+ * x86-64 can have up to three kernel stacks:
 * process stack
 * interrupt stack
 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
@@ -219,7 +218,7 @@
 
 void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
 		unsigned long *stack,
-		struct stacktrace_ops *ops, void *data)
+		const struct stacktrace_ops *ops, void *data)
 {
 	const unsigned cpu = get_cpu();
 	unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr;
@@ -340,7 +339,7 @@
 	printk_address(addr);
 }
 
-static struct stacktrace_ops print_trace_ops = {
+static const struct stacktrace_ops print_trace_ops = {
 	.warning = print_trace_warning,
 	.warning_symbol = print_trace_warning_symbol,
 	.stack = print_trace_stack,
@@ -404,6 +403,12 @@
 void dump_stack(void)
 {
 	unsigned long dummy;
+
+	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
+		current->pid, current->comm, print_tainted(),
+		init_utsname()->release,
+		(int)strcspn(init_utsname()->version, " "),
+		init_utsname()->version);
 	show_trace(NULL, NULL, &dummy);
 }
 
@@ -466,7 +471,7 @@
 EXPORT_SYMBOL(out_of_line_bug);
 #endif
 
-static DEFINE_SPINLOCK(die_lock);
+static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
 static int die_owner = -1;
 static unsigned int die_nest_count;
 
@@ -478,13 +483,13 @@
 	oops_enter();
 
 	/* racy, but better than risking deadlock. */
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	cpu = smp_processor_id();
-	if (!spin_trylock(&die_lock)) {
+	if (!__raw_spin_trylock(&die_lock)) {
 		if (cpu == die_owner)
 			/* nested oops. should stop eventually */;
 		else
-			spin_lock(&die_lock);
+			__raw_spin_lock(&die_lock);
 	}
 	die_nest_count++;
 	die_owner = cpu;
@@ -498,12 +503,10 @@
 	die_owner = -1;
 	bust_spinlocks(0);
 	die_nest_count--;
-	if (die_nest_count)
-		/* We still own the lock */
-		local_irq_restore(flags);
-	else
+	if (!die_nest_count)
 		/* Nest count reaches zero, release the lock. */
-		spin_unlock_irqrestore(&die_lock, flags);
+		__raw_spin_unlock(&die_lock);
+	raw_local_irq_restore(flags);
 	if (panic_on_oops)
 		panic("Fatal exception");
 	oops_exit();
@@ -636,6 +639,7 @@
 	info.si_errno = 0; \
 	info.si_code = sicode; \
 	info.si_addr = (void __user *)siaddr; \
+	trace_hardirqs_fixup(); \
 	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
							== NOTIFY_STOP) \
		return; \
@@ -741,11 +745,8 @@
 
 	printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
 
-#if 0 /* XEN */
 	/* Clear and disable the memory parity error line. */
-	reason = (reason & 0xf) | 4;
-	outb(reason, 0x61);
-#endif /* XEN */
+	clear_mem_error(reason);
 }
 
 static __kprobes void
@@ -754,14 +755,8 @@
 	printk("NMI: IOCK error (debug interrupt?)\n");
 	show_registers(regs);
 
-#if 0 /* XEN */
 	/* Re-enable the IOCK line, wait for a few seconds */
-	reason = (reason & 0xf) | 8;
-	outb(reason, 0x61);
-	mdelay(2000);
-	reason &= ~8;
-	outb(reason, 0x61);
-#endif /* XEN */
+	clear_io_check_error(reason);
 }
 
 static __kprobes void
@@ -821,6 +816,8 @@
 /* runs on IST stack. */
 asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code)
 {
+	trace_hardirqs_fixup();
+
 	if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) {
 		return;
 	}
@@ -858,6 +855,8 @@
 	struct task_struct *tsk = current;
 	siginfo_t info;
 
+	trace_hardirqs_fixup();
+
 	get_debugreg(condition, 6);
 
 	if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
--- a/arch/x86/kernel/vsyscall_64-xen.c
+++ b/arch/x86/kernel/vsyscall_64-xen.c
@@ -1,6 +1,4 @@
 /*
- *  linux/arch/x86_64/kernel/vsyscall.c
- *
 *  Copyright (C) 2001 Andrea Arcangeli SuSE
 *  Copyright 2003 Andi Kleen, SuSE Labs.
 *
@@ -50,12 +48,12 @@
 	({unsigned long v;  		\
 	extern char __vsyscall_0; 	\
 	  asm("" : "=r" (v) : "0" (x)); \
-	  ((v - VSYSCALL_FIRST_PAGE) + __pa_symbol(&__vsyscall_0)); })
+	  ((v - VSYSCALL_START) + __pa_symbol(&__vsyscall_0)); })
 
 /*
 * vsyscall_gtod_data contains data that is :
 * - readonly from vsyscalls
- * - writen by timer interrupt or systcl (/proc/sys/kernel/vsyscall64)
+ * - written by timer interrupt or sysctl (/proc/sys/kernel/vsyscall64)
 * Try to keep this structure as small as possible to avoid cache line ping pongs
 */
 int __vgetcpu_mode __section_vgetcpu_mode;
@@ -66,6 +64,16 @@
 	.sysctl_enabled = 1,
 };
 
+void update_vsyscall_tz(void)
+{
+	unsigned long flags;
+
+	write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
+	/* sys_tz has changed */
+	vsyscall_gtod_data.sys_tz = sys_tz;
+	write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
+}
+
 void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
 {
 	unsigned long flags;
@@ -79,8 +87,6 @@
 	vsyscall_gtod_data.clock.shift = clock->shift;
 	vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
 	vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
-	vsyscall_gtod_data.sys_tz = sys_tz;
-	vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
 	vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic;
 	write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
 }
@@ -166,7 +172,7 @@
 	if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
 		return time_syscall(t);
 
-	vgettimeofday(&tv, 0);
+	vgettimeofday(&tv, NULL);
 	result = tv.tv_sec;
 	if (t)
 		*t = result;
@@ -260,18 +266,10 @@
 	return ret;
 }
 
-static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen,
-				void __user *oldval, size_t __user *oldlenp,
-				void __user *newval, size_t newlen)
-{
-	return -ENOSYS;
-}
-
 static ctl_table kernel_table2[] = {
-	{ .ctl_name = 99, .procname = "vsyscall64",
+	{ .procname = "vsyscall64",
 	  .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
 	  .mode = 0644,
-	  .strategy = vsyscall_sysctl_nostrat,
 	  .proc_handler = vsyscall_sysctl_change },
 	{}
 };
@@ -291,9 +289,9 @@
 	unsigned long d;
 	unsigned long node = 0;
 #ifdef CONFIG_NUMA
-	node = cpu_to_node[cpu];
+	node = cpu_to_node(cpu);
 #endif
-	if (cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP))
+	if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP))
 		write_rdtscp_aux((node << 12) | cpu);
 
 	/* Store cpu number in limit so that it can be loaded quickly
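update_vsyscall_tz() publishes sys_tz under the gtod seqlock, so vsyscall readers see either the old or the new timezone, never a torn one. The matching read side retries while the sequence changes underneath it; a self-contained sketch:

	static seqlock_t demo_lock = SEQLOCK_UNLOCKED;	/* 2.6.24-era init */
	static struct timezone demo_tz;

	static struct timezone demo_read_tz(void)
	{
		struct timezone tz;
		unsigned seq;

		do {
			seq = read_seqbegin(&demo_lock);
			tz = demo_tz;			/* speculative copy */
		} while (read_seqretry(&demo_lock, seq));
		return tz;
	}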
--- a/arch/x86/mm/fault_32-xen.c
+++ b/arch/x86/mm/fault_32-xen.c
@@ -25,6 +25,7 @@
 #include
 #include
 #include
+#include
 
 #include
 #include
@@ -32,33 +33,27 @@
 
 extern void die(const char *,struct pt_regs *,long);
 
-static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
-
-int register_page_fault_notifier(struct notifier_block *nb)
+#ifdef CONFIG_KPROBES
+static inline int notify_page_fault(struct pt_regs *regs)
 {
-	vmalloc_sync_all();
-	return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
-}
-EXPORT_SYMBOL_GPL(register_page_fault_notifier);
+	int ret = 0;
 
-int unregister_page_fault_notifier(struct notifier_block *nb)
-{
-	return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
-}
-EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);
+	/* kprobe_running() needs smp_processor_id() */
+	if (!user_mode_vm(regs)) {
+		preempt_disable();
+		if (kprobe_running() && kprobe_fault_handler(regs, 14))
+			ret = 1;
+		preempt_enable();
+	}
 
-static inline int notify_page_fault(struct pt_regs *regs, long err)
+	return ret;
+}
+#else
+static inline int notify_page_fault(struct pt_regs *regs)
 {
-	struct die_args args = {
-		.regs = regs,
-		.str = "page fault",
-		.err = err,
-		.trapnr = 14,
-		.signr = SIGSEGV
-	};
-	return atomic_notifier_call_chain(&notify_page_fault_chain,
-	                                  DIE_PAGE_FAULT, &args);
+	return 0;
 }
+#endif
 
 /*
 * Return EIP plus the CS segment base.  The segment limit is also
@@ -110,7 +105,7 @@
 	   LDT and other horrors are only used in user space. */
 	if (seg & (1<<2)) {
 		/* Must lock the LDT while reading it. */
-		down(&current->mm->context.sem);
+		mutex_lock(&current->mm->context.lock);
 		desc = current->mm->context.ldt;
 		desc = (void *)desc + (seg & ~7);
 	} else {
@@ -123,7 +118,7 @@
 	base = get_desc_base((unsigned long *)desc);
 
 	if (seg & (1<<2)) {
-		up(&current->mm->context.sem);
+		mutex_unlock(&current->mm->context.lock);
 	} else
 		put_cpu();
 
@@ -244,7 +239,7 @@
 	if (mfn_to_pfn(mfn) >= highstart_pfn)
 		return;
 #endif
-	if (p[0] & _PAGE_PRESENT) {
+	if ((p[0] & _PAGE_PRESENT) && !(p[0] & _PAGE_PSE)) {
 		page = mfn_to_pfn(mfn) << PAGE_SHIFT;
 		p  = (unsigned long *) __va(page);
 		address &= 0x001fffff;
@@ -270,7 +265,8 @@
 	 * it's allocated already.
 	 */
 	if ((machine_to_phys(page) >> PAGE_SHIFT) < max_low_pfn
-	    && (page & _PAGE_PRESENT)) {
+	    && (page & _PAGE_PRESENT)
+	    && !(page & _PAGE_PSE)) {
 		page = machine_to_phys(page & PAGE_MASK);
 		page = ((unsigned long *) __va(page))[(address >> PAGE_SHIFT)
		                                      & (PTRS_PER_PTE - 1)];
@@ -413,6 +409,11 @@
 	int write, si_code;
 	int fault;
 
+	/*
	 * We can fault from pretty much anywhere, with unknown IRQ state.
+	 */
+	trace_hardirqs_fixup();
+
 	/* get the address */
 	address = read_cr2();
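The notifier chain is gone: the fault path now asks kprobes directly, with preemption pinned so that kprobe_running()'s use of smp_processor_id() is stable. The skeleton of the check, as introduced above (trap number 14 is the page-fault vector):

	int ret = 0;

	/* kernel-mode faults only; user faults are never kprobe traps */
	if (!user_mode_vm(regs)) {
		preempt_disable();
		if (kprobe_running() && kprobe_fault_handler(regs, 14))
			ret = 1;	/* kprobes consumed the fault */
		preempt_enable();
	}

When CONFIG_KPROBES is off, the inline stub compiles away to a constant 0, so the fast path pays nothing.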
@@ -450,7 +451,7 @@
 		/* Can take a spurious fault if mapping changes R/O -> R/W. */
 		if (spurious_fault(regs, address, error_code))
 			return;
-		if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
+		if (notify_page_fault(regs))
 			return;
 		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 */
		goto bad_area_nosemaphore;
	}
 
-	if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
+	if (notify_page_fault(regs))
 		return;
 
 	/* It's safe to allow irq's after cr2 has been saved and the vmalloc
@@ -478,7 +479,7 @@
 
 	/* When running in the kernel we expect faults to occur only to
	 * addresses in user space.  All other faults represent errors in the
-	 * kernel and should generate an OOPS.  Unfortunatly, in the case of an
+	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space.  Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
-	 * the source reference check when there is a possibilty of a deadlock.
+	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space, if we cannot we then validate the
	 * source.  If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
@@ -595,8 +596,8 @@
 		    printk_ratelimit()) {
 			printk("%s%s[%d]: segfault at %08lx eip %08lx "
			    "esp %08lx error %lx\n",
-			    tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
-			    tsk->comm, tsk->pid, address, regs->eip,
+			    task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
+			    tsk->comm, task_pid_nr(tsk), address, regs->eip,
			    regs->esp, error_code);
 		}
 		tsk->thread.cr2 = address;
@@ -661,8 +662,7 @@
 		printk(KERN_ALERT "BUG: unable to handle kernel paging"
 			" request");
 		printk(" at virtual address %08lx\n",address);
-		printk(KERN_ALERT " printing eip:\n");
-		printk("%08lx\n", regs->eip);
+		printk(KERN_ALERT "printing eip: %08lx\n", regs->eip);
 		dump_fault_path(address);
 	}
 	tsk->thread.cr2 = address;
@@ -678,14 +678,14 @@
 */
out_of_memory:
 	up_read(&mm->mmap_sem);
-	if (is_init(tsk)) {
+	if (is_global_init(tsk)) {
 		yield();
 		down_read(&mm->mmap_sem);
 		goto survive;
 	}
 	printk("VM: killing process %s\n", tsk->comm);
 	if (error_code & 4)
-		do_exit(SIGKILL);
+		do_group_exit(SIGKILL);
 	goto no_context;
 
do_sigbus:
--- a/arch/x86/mm/fault_64-xen.c
+++ b/arch/x86/mm/fault_64-xen.c
@@ -25,6 +25,7 @@
 #include
 #include
 #include
+#include
 
 #include
 #include
@@ -40,34 +41,27 @@
 #define PF_RSVD	(1<<3)
 #define PF_INSTR	(1<<4)
 
-static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
-
-/* Hook to register for page fault notifications */
-int register_page_fault_notifier(struct notifier_block *nb)
+#ifdef CONFIG_KPROBES
+static inline int notify_page_fault(struct pt_regs *regs)
 {
-	vmalloc_sync_all();
-	return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
-}
-EXPORT_SYMBOL_GPL(register_page_fault_notifier);
+	int ret = 0;
 
-int unregister_page_fault_notifier(struct notifier_block *nb)
-{
-	return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
-}
-EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);
+	/* kprobe_running() needs smp_processor_id() */
+	if (!user_mode(regs)) {
+		preempt_disable();
+		if (kprobe_running() && kprobe_fault_handler(regs, 14))
+			ret = 1;
+		preempt_enable();
+	}
 
-static inline int notify_page_fault(struct pt_regs *regs, long err)
+	return ret;
+}
+#else
+static inline int notify_page_fault(struct pt_regs *regs)
 {
-	struct die_args args = {
-		.regs = regs,
-		.str = "page fault",
-		.err = err,
-		.trapnr = 14,
-		.signr = SIGSEGV
-	};
-	return atomic_notifier_call_chain(&notify_page_fault_chain,
-	                                  DIE_PAGE_FAULT, &args);
+	return 0;
 }
+#endif
 
 /* Sometimes the CPU reports invalid exceptions on prefetch.
    Check that here and ignore.
@@ -175,7 +169,7 @@
 	pmd = pmd_offset(pud, address);
 	if (bad_address(pmd)) goto bad;
 	printk("PMD %lx ", pmd_val(*pmd));
-	if (!pmd_present(*pmd)) goto ret;
+	if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret;
 
 	pte = pte_offset_kernel(pmd, address);
 	if (bad_address(pte)) goto bad;
@@ -294,7 +288,6 @@
 	return 0;
 }
 
-static int page_fault_trace;
 int show_unhandled_signals = 1;
 
@@ -371,6 +364,11 @@
 	if (!user_mode(regs))
 		error_code &= ~PF_USER; /* means kernel */
 
+	/*
	 * We can fault from pretty much anywhere, with unknown IRQ state.
+	 */
+	trace_hardirqs_fixup();
+
 	tsk = current;
 	mm = tsk->mm;
 	prefetchw(&mm->mmap_sem);
@@ -408,7 +406,7 @@
 		/* Can take a spurious fault if mapping changes R/O -> R/W. */
 		if (spurious_fault(regs, address, error_code))
 			return;
-		if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
+		if (notify_page_fault(regs))
 			return;
 		/*
		 * Don't take the mm semaphore here. If we fixup a prefetch
		 */
		goto bad_area_nosemaphore;
	}
 
-	if (notify_page_fault(regs, error_code) == NOTIFY_STOP)
+	if (notify_page_fault(regs))
 		return;
 
 	if (likely(regs->eflags & X86_EFLAGS_IF))
 		local_irq_enable();
 
-	if (unlikely(page_fault_trace))
-		printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
-		       regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code);
-
 	if (unlikely(error_code & PF_RSVD))
 		pgtable_bad(address, regs, error_code);
 
@@ -447,7 +441,7 @@
 again:
 	/* When running in the kernel we expect faults to occur only to
	 * addresses in user space.  All other faults represent errors in the
-	 * kernel and should generate an OOPS.  Unfortunatly, in the case of an
+	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
	 * erroneous fault occurring in a code path which already holds mmap_sem
	 * we will deadlock attempting to validate the fault against the
	 * address space.  Luckily the kernel only validly references user
	 * space from well defined areas of code, which are listed in the
	 * exceptions table.
	 *
	 * As the vast majority of faults will be valid we will only perform
-	 * the source reference check when there is a possibilty of a deadlock.
+	 * the source reference check when there is a possibility of a deadlock.
	 * Attempt to lock the address space, if we cannot we then validate the
	 * source.  If this is invalid we can skip the address space check,
	 * thus avoiding the deadlock.
@@ -557,7 +551,7 @@
 		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
 		    printk_ratelimit()) {
 			printk(
-		       "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
+		       "%s%s[%d]: segfault at %lx rip %lx rsp %lx error %lx\n",
 					tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
 					tsk->comm, tsk->pid, address, regs->rip,
 					regs->rsp, error_code);
@@ -623,7 +617,7 @@
 */
out_of_memory:
 	up_read(&mm->mmap_sem);
-	if (is_init(current)) {
+	if (is_global_init(current)) {
 		yield();
 		goto again;
 	}
@@ -690,10 +684,3 @@
 	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
 			(__START_KERNEL & PGDIR_MASK)));
 }
-
-static int __init enable_pagefaulttrace(char *str)
-{
-	page_fault_trace = 1;
-	return 1;
-}
-__setup("pagefaulttrace", enable_pagefaulttrace);
--- a/arch/x86/mm/init_32-xen.c
+++ b/arch/x86/mm/init_32-xen.c
@@ -96,7 +96,14 @@
 #else
 	if (!(__pmd_val(*pmd) & _PAGE_PRESENT)) {
 #endif
-		pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
+		pte_t *page_table = NULL;
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+		page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
+#endif
+		if (!page_table)
+			page_table =
+				(pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
 
 		paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT);
 		make_lowmem_page_readonly(page_table,
@@ -104,7 +111,7 @@
 		set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
 		BUG_ON(page_table != pte_offset_kernel(pmd, 0));
 	}
-	
+
 	return pte_offset_kernel(pmd, 0);
 }
 
@@ -362,8 +369,13 @@
 static void __init set_highmem_pages_init(int bad_ppro)
 {
 	int pfn;
-	for (pfn = highstart_pfn; pfn < highend_pfn; pfn++)
-		add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
+	for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) {
+		/*
		 * Holes under sparsemem might have no mem_map[]:
+		 */
+		if (pfn_valid(pfn))
+			add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
+	}
 	totalram_pages += totalhigh_pages;
 }
 #endif /* CONFIG_FLATMEM */
@@ -786,35 +798,18 @@
 	return __add_pages(zone, start_pfn, nr_pages);
 }
 
-int remove_memory(u64 start, u64 size)
-{
-	return -EINVAL;
-}
-EXPORT_SYMBOL_GPL(remove_memory);
 #endif
 
struct kmem_cache *pmd_cache;
 
void __init pgtable_cache_init(void)
{
-	size_t pgd_size = PTRS_PER_PGD*sizeof(pgd_t);
-
-	if (PTRS_PER_PMD > 1) {
+	if (PTRS_PER_PMD > 1)
 		pmd_cache = kmem_cache_create("pmd",
-					PTRS_PER_PMD*sizeof(pmd_t),
-					PTRS_PER_PMD*sizeof(pmd_t),
-					SLAB_PANIC,
-					pmd_ctor);
-		if (!SHARED_KERNEL_PMD) {
-			/* If we're in PAE mode and have a non-shared
-			   kernel pmd, then the pgd size must be a
-			   page size.  This is because the pgd_list
-			   links through the page structure, so there
-			   can only be one pgd per page for this to
-			   work. */
-			pgd_size = PAGE_SIZE;
-		}
-	}
+					      PTRS_PER_PMD*sizeof(pmd_t),
+					      PTRS_PER_PMD*sizeof(pmd_t),
+					      SLAB_PANIC,
+					      pmd_ctor);
}
 
/*
--- a/arch/x86/mm/init_64-xen.c
+++ b/arch/x86/mm/init_64-xen.c
@@ -761,7 +761,7 @@
 /* Setup the direct mapping of the physical memory at PAGE_OFFSET.
    This runs before bootmem is initialized and gets pages directly from the
    physical memory. To access them they are temporarily mapped.
 */
-void __meminit init_memory_mapping(unsigned long start, unsigned long end)
+void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
 {
 	unsigned long next;
 
@@ -897,12 +897,6 @@
 }
 EXPORT_SYMBOL_GPL(arch_add_memory);
 
-int remove_memory(u64 start, u64 size)
-{
-	return -EINVAL;
-}
-EXPORT_SYMBOL_GPL(remove_memory);
-
 #if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
 int memory_add_physaddr_to_nid(u64 start)
 {
@@ -1176,14 +1170,6 @@
 	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
 }
 
-#ifndef CONFIG_XEN
-void * __init alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
-{
-	return __alloc_bootmem_core(pgdat->bdata, size,
-			SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0);
-}
-#endif
-
 const char *arch_vma_name(struct vm_area_struct *vma)
 {
 	if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
@@ -1192,3 +1178,48 @@
 		return "[vsyscall]";
 	return NULL;
 }
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+/*
+ * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
+ */
+int __meminit vmemmap_populate(struct page *start_page,
+						unsigned long size, int node)
+{
+	unsigned long addr = (unsigned long)start_page;
+	unsigned long end = (unsigned long)(start_page + size);
+	unsigned long next;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+
+	for (; addr < end; addr = next) {
+		next = pmd_addr_end(addr, end);
+
+		pgd = vmemmap_pgd_populate(addr, node);
+		if (!pgd)
+			return -ENOMEM;
+		pud = vmemmap_pud_populate(pgd, addr, node);
+		if (!pud)
+			return -ENOMEM;
+
+		pmd = pmd_offset(pud, addr);
+		if (pmd_none(*pmd)) {
+			pte_t entry;
+			void *p = vmemmap_alloc_block(PMD_SIZE, node);
+			if (!p)
+				return -ENOMEM;
+
+			entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
+			mk_pte_huge(entry);
+			set_pmd(pmd, __pmd(pte_val(entry)));
+
+			printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n",
+				addr, addr + PMD_SIZE - 1, p, node);
+		} else
+			vmemmap_verify((pte_t *)pmd, node, addr, next);
+	}
+
+	return 0;
+}
+#endif
--- a/arch/x86/mm/pageattr_64-xen.c
+++ b/arch/x86/mm/pageattr_64-xen.c
@@ -324,11 +324,11 @@
 	return base;
 }
 
-static void cache_flush_page(void *adr)
+void clflush_cache_range(void *adr, int size)
 {
 	int i;
-	for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size)
-		asm volatile("clflush (%0)" :: "r" (adr + i));
+	for (i = 0; i < size; i += boot_cpu_data.x86_clflush_size)
+		clflush(adr+i);
 }
 
 static void flush_kernel_map(void *arg)
@@ -343,7 +343,7 @@
 		asm volatile("wbinvd" ::: "memory");
 	else list_for_each_entry(pg, l, lru) {
 		void *adr = page_address(pg);
-		cache_flush_page(adr);
+		clflush_cache_range(adr, PAGE_SIZE);
 	}
 	__flush_tlb_all();
 }
@@ -411,6 +411,7 @@
 		split = split_large_page(address, prot, ref_prot2);
 		if (!split)
 			return -ENOMEM;
+		pgprot_val(ref_prot2) &= ~_PAGE_NX;
 		set_pte(kpte, mk_pte(split, ref_prot2));
 		kpte_page = split;
 	}
@@ -503,9 +504,14 @@
 	struct page *pg, *next;
 	struct list_head l;
 
-	down_read(&init_mm.mmap_sem);
+	/*
	 * Write-protect the semaphore, to exclude two contexts
+	 * doing a list_replace_init() call in parallel and to
+	 * exclude new additions to the deferred_pages list:
+	 */
+	down_write(&init_mm.mmap_sem);
 	list_replace_init(&deferred_pages, &l);
-	up_read(&init_mm.mmap_sem);
+	up_write(&init_mm.mmap_sem);
 
 	flush_map(&l);
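clflush_cache_range() generalizes the old page-sized helper: it steps through an arbitrary buffer at the cache-line granularity the CPU reported via CPUID. A sketch of the pattern:

	static void demo_flush(void *addr, int size)
	{
		int stride = boot_cpu_data.x86_clflush_size;	/* e.g. 64 */
		int i;

		for (i = 0; i < size; i += stride)
			clflush(addr + i);	/* one line per iteration */
	}

Callers still need their own fencing; clflush itself is only ordered with respect to other stores by an explicit mfence.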
MAX_ORDER_NR_PAGES == 0)) + touch_nmi_watchdog(); page = pgdat_page_nr(pgdat, i); total++; if (PageHighMem(page)) @@ -200,7 +203,7 @@ __free_page(pte); } -void pmd_ctor(void *pmd, struct kmem_cache *cache, unsigned long flags) +void pmd_ctor(struct kmem_cache *cache, void *pmd) { memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); } --- a/arch/x86/pci/Makefile +++ b/arch/x86/pci/Makefile @@ -3,3 +3,7 @@ else include ${srctree}/arch/x86/pci/Makefile_64 endif + +# pcifront should be after mmconfig.o and direct.o as it should only +# take over if direct access to the PCI bus is unavailable +obj-$(CONFIG_XEN_PCIDEV_FRONTEND) += pcifront.o --- a/arch/x86/pci/Makefile_32 +++ b/arch/x86/pci/Makefile_32 @@ -4,10 +4,6 @@ obj-$(CONFIG_PCI_MMCONFIG) += mmconfig_32.o direct.o mmconfig-shared.o obj-$(CONFIG_PCI_DIRECT) += direct.o -# pcifront should be after pcbios.o, mmconfig.o, and direct.o as it should only -# take over if direct access to the PCI bus is unavailable -obj-$(CONFIG_XEN_PCIDEV_FRONTEND) += pcifront.o - pci-y := fixup.o pci-$(CONFIG_ACPI) += acpi.o pci-y += legacy.o irq.o --- a/arch/x86/pci/Makefile_64 +++ b/arch/x86/pci/Makefile_64 @@ -15,7 +15,3 @@ obj-$(CONFIG_NUMA) += k8-bus_64.o -# pcifront should be after mmconfig.o and direct.o as it should only -# take over if direct access to the PCI bus is unavailable -obj-$(CONFIG_XEN_PCIDEV_FRONTEND) += pcifront.o - --- a/arch/x86/pci/irq-xen.c +++ b/arch/x86/pci/irq-xen.c @@ -173,7 +173,7 @@ } /* - * Common IRQ routing practice: nybbles in config space, + * Common IRQ routing practice: nibbles in config space, * offset by some magic constant. */ static unsigned int read_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr) @@ -496,6 +496,26 @@ return 1; } +/* + * PicoPower PT86C523 + */ +static int pirq_pico_get(struct pci_dev *router, struct pci_dev *dev, int pirq) +{ + outb(0x10 + ((pirq - 1) >> 1), 0x24); + return ((pirq - 1) & 1) ? (inb(0x26) >> 4) : (inb(0x26) & 0xf); +} + +static int pirq_pico_set(struct pci_dev *router, struct pci_dev *dev, int pirq, + int irq) +{ + unsigned int x; + outb(0x10 + ((pirq - 1) >> 1), 0x24); + x = inb(0x26); + x = ((pirq - 1) & 1) ? ((x & 0x0f) | (irq << 4)) : ((x & 0xf0) | (irq)); + outb(x, 0x26); + return 1; +} + #ifdef CONFIG_PCI_BIOS static int pirq_bios_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) @@ -569,7 +589,7 @@ /* FIXME: We should move some of the quirk fixup stuff here */ /* - * work arounds for some buggy BIOSes + * workarounds for some buggy BIOSes */ if (device == PCI_DEVICE_ID_VIA_82C586_0) { switch(router->device) { @@ -725,6 +745,24 @@ return 1; } +static __init int pico_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) +{ + switch (device) { + case PCI_DEVICE_ID_PICOPOWER_PT86C523: + r->name = "PicoPower PT86C523"; + r->get = pirq_pico_get; + r->set = pirq_pico_set; + return 1; + + case PCI_DEVICE_ID_PICOPOWER_PT86C523BBP: + r->name = "PicoPower PT86C523 rev. 
BB+"; + r->get = pirq_pico_get; + r->set = pirq_pico_set; + return 1; + } + return 0; +} + static __initdata struct irq_router_handler pirq_routers[] = { { PCI_VENDOR_ID_INTEL, intel_router_probe }, { PCI_VENDOR_ID_AL, ali_router_probe }, @@ -736,6 +774,7 @@ { PCI_VENDOR_ID_VLSI, vlsi_router_probe }, { PCI_VENDOR_ID_SERVERWORKS, serverworks_router_probe }, { PCI_VENDOR_ID_AMD, amd_router_probe }, + { PCI_VENDOR_ID_PICOPOWER, pico_router_probe }, /* Someone with docs needs to add the ATI Radeon IGP */ { 0, NULL } }; @@ -1014,7 +1053,7 @@ * Work around broken HP Pavilion Notebooks which assign USB to * IRQ 9 even though it is actually wired to IRQ 11 */ -static int __init fix_broken_hp_bios_irq9(struct dmi_system_id *d) +static int __init fix_broken_hp_bios_irq9(const struct dmi_system_id *d) { if (!broken_hp_bios_irq9) { broken_hp_bios_irq9 = 1; @@ -1027,7 +1066,7 @@ * Work around broken Acer TravelMate 360 Notebooks which assign * Cardbus to IRQ 11 even though it is actually wired to IRQ 10 */ -static int __init fix_acer_tm360_irqrouting(struct dmi_system_id *d) +static int __init fix_acer_tm360_irqrouting(const struct dmi_system_id *d) { if (!acer_tm360_irqrouting) { acer_tm360_irqrouting = 1; --- a/drivers/xen/blkback/blkback.c +++ b/drivers/xen/blkback/blkback.c @@ -269,13 +269,10 @@ } } -static int end_block_io_op(struct bio *bio, unsigned int done, int error) +static void end_block_io_op(struct bio *bio, int error) { - if (bio->bi_size != 0) - return 1; __end_block_io_op(bio->bi_private, error); bio_put(bio); - return error; } --- a/drivers/xen/blkfront/blkfront.c +++ b/drivers/xen/blkfront/blkfront.c @@ -573,9 +573,8 @@ struct blkfront_info *info = req->rq_disk->private_data; unsigned long buffer_mfn; blkif_request_t *ring_req; - struct bio *bio; struct bio_vec *bvec; - int idx; + struct req_iterator iter; unsigned long id; unsigned int fsect, lsect; int ref; @@ -609,34 +608,32 @@ ring_req->operation = BLKIF_OP_WRITE_BARRIER; ring_req->nr_segments = 0; - rq_for_each_bio (bio, req) { - bio_for_each_segment (bvec, bio, idx) { - BUG_ON(ring_req->nr_segments - == BLKIF_MAX_SEGMENTS_PER_REQUEST); - buffer_mfn = page_to_phys(bvec->bv_page) >> PAGE_SHIFT; - fsect = bvec->bv_offset >> 9; - lsect = fsect + (bvec->bv_len >> 9) - 1; - /* install a grant reference. */ - ref = gnttab_claim_grant_reference(&gref_head); - BUG_ON(ref == -ENOSPC); - - gnttab_grant_foreign_access_ref( - ref, - info->xbdev->otherend_id, - buffer_mfn, - rq_data_dir(req) ? GTF_readonly : 0 ); - - info->shadow[id].frame[ring_req->nr_segments] = - mfn_to_pfn(buffer_mfn); - - ring_req->seg[ring_req->nr_segments] = - (struct blkif_request_segment) { - .gref = ref, - .first_sect = fsect, - .last_sect = lsect }; + rq_for_each_segment(bvec, req, iter) { + BUG_ON(ring_req->nr_segments + == BLKIF_MAX_SEGMENTS_PER_REQUEST); + buffer_mfn = page_to_phys(bvec->bv_page) >> PAGE_SHIFT; + fsect = bvec->bv_offset >> 9; + lsect = fsect + (bvec->bv_len >> 9) - 1; + /* install a grant reference. */ + ref = gnttab_claim_grant_reference(&gref_head); + BUG_ON(ref == -ENOSPC); + + gnttab_grant_foreign_access_ref( + ref, + info->xbdev->otherend_id, + buffer_mfn, + rq_data_dir(req) ? 
GTF_readonly : 0 ); + + info->shadow[id].frame[ring_req->nr_segments] = + mfn_to_pfn(buffer_mfn); + + ring_req->seg[ring_req->nr_segments] = + (struct blkif_request_segment) { + .gref = ref, + .first_sect = fsect, + .last_sect = lsect }; - ring_req->nr_segments++; - } + ring_req->nr_segments++; } info->ring.req_prod_pvt++; --- a/drivers/xen/core/machine_kexec.c +++ b/drivers/xen/core/machine_kexec.c @@ -25,6 +25,10 @@ struct resource *res; int k = 0; + if (strstr(boot_command_line, "crashkernel=")) + printk(KERN_WARNING "Ignoring crashkernel command line, " + "parameter will be supplied by xen\n"); + if (!is_initial_xendomain()) return; --- a/drivers/xen/core/smpboot.c +++ b/drivers/xen/core/smpboot.c @@ -45,8 +45,8 @@ EXPORT_SYMBOL(cpu_possible_map); cpumask_t cpu_initialized_map; -struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; -EXPORT_SYMBOL(cpu_data); +DEFINE_PER_CPU(struct cpuinfo_x86, cpu_info); +EXPORT_PER_CPU_SYMBOL(cpu_info); #ifdef CONFIG_HOTPLUG_CPU DEFINE_PER_CPU(int, cpu_state) = { 0 }; @@ -59,13 +59,13 @@ u8 cpu_2_logical_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; -cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned; -cpumask_t cpu_core_map[NR_CPUS] __cacheline_aligned; -EXPORT_SYMBOL(cpu_core_map); +DEFINE_PER_CPU(cpumask_t, cpu_sibling_map); +DEFINE_PER_CPU(cpumask_t, cpu_core_map); +EXPORT_PER_CPU_SYMBOL(cpu_core_map); #if defined(__i386__) -u8 x86_cpu_to_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = 0xff }; -EXPORT_SYMBOL(x86_cpu_to_apicid); +DEFINE_PER_CPU(u8, x86_cpu_to_apicid) = BAD_APICID; +EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid); #endif void __init prefill_possible_map(void) @@ -90,25 +90,25 @@ static inline void set_cpu_sibling_map(unsigned int cpu) { - cpu_data[cpu].phys_proc_id = cpu; - cpu_data[cpu].cpu_core_id = 0; + cpu_data(cpu).phys_proc_id = cpu; + cpu_data(cpu).cpu_core_id = 0; - cpu_sibling_map[cpu] = cpumask_of_cpu(cpu); - cpu_core_map[cpu] = cpumask_of_cpu(cpu); + per_cpu(cpu_sibling_map, cpu) = cpumask_of_cpu(cpu); + per_cpu(cpu_core_map, cpu) = cpumask_of_cpu(cpu); - cpu_data[cpu].booted_cores = 1; + cpu_data(cpu).booted_cores = 1; } static void remove_siblinginfo(unsigned int cpu) { - cpu_data[cpu].phys_proc_id = BAD_APICID; - cpu_data[cpu].cpu_core_id = BAD_APICID; + cpu_data(cpu).phys_proc_id = BAD_APICID; + cpu_data(cpu).cpu_core_id = BAD_APICID; - cpus_clear(cpu_sibling_map[cpu]); - cpus_clear(cpu_core_map[cpu]); + cpus_clear(per_cpu(cpu_sibling_map, cpu)); + cpus_clear(per_cpu(cpu_core_map, cpu)); - cpu_data[cpu].booted_cores = 0; + cpu_data(cpu).booted_cores = 0; } static int __cpuinit xen_smp_intr_init(unsigned int cpu) @@ -167,9 +167,9 @@ { cpu_init(); #ifdef __i386__ - identify_secondary_cpu(cpu_data + smp_processor_id()); + identify_secondary_cpu(&current_cpu_data); #else - identify_cpu(cpu_data + smp_processor_id()); + identify_cpu(&current_cpu_data); #endif touch_softlockup_watchdog(); preempt_disable(); @@ -270,16 +270,16 @@ if (HYPERVISOR_vcpu_op(VCPUOP_get_physid, 0, &cpu_id) == 0) apicid = xen_vcpu_physid_to_x86_apicid(cpu_id.phys_id); boot_cpu_data.apicid = apicid; - cpu_data[0] = boot_cpu_data; + cpu_data(0) = boot_cpu_data; cpu_2_logical_apicid[0] = apicid; - x86_cpu_to_apicid[0] = apicid; + per_cpu(x86_cpu_to_apicid, 0) = apicid; current_thread_info()->cpu = 0; for (cpu = 0; cpu < NR_CPUS; cpu++) { - cpus_clear(cpu_sibling_map[cpu]); - cpus_clear(cpu_core_map[cpu]); + cpus_clear(per_cpu(cpu_sibling_map, cpu)); + cpus_clear(per_cpu(cpu_core_map, cpu)); } set_cpu_sibling_map(0); @@ -324,11 +324,12 @@ apicid = cpu; if
(HYPERVISOR_vcpu_op(VCPUOP_get_physid, cpu, &cpu_id) == 0) apicid = xen_vcpu_physid_to_x86_apicid(cpu_id.phys_id); - cpu_data[cpu] = boot_cpu_data; - cpu_data[cpu].apicid = apicid; + cpu_data(cpu) = boot_cpu_data; + cpu_data(cpu).cpu_index = cpu; + cpu_data(cpu).apicid = apicid; cpu_2_logical_apicid[cpu] = apicid; - x86_cpu_to_apicid[cpu] = apicid; + per_cpu(x86_cpu_to_apicid, cpu) = apicid; #ifdef __x86_64__ cpu_pda(cpu)->pcurrent = idle; --- a/drivers/xen/netback/loopback.c +++ b/drivers/xen/netback/loopback.c @@ -285,9 +285,9 @@ char dev_name[IFNAMSIZ]; sprintf(dev_name, "vif0.%d", i); - dev1 = dev_get_by_name(dev_name); + dev1 = dev_get_by_name(&init_net, dev_name); sprintf(dev_name, "veth%d", i); - dev2 = dev_get_by_name(dev_name); + dev2 = dev_get_by_name(&init_net, dev_name); if (dev1 && dev2) { unregister_netdev(dev2); unregister_netdev(dev1); --- a/drivers/xen/netback/netback.c +++ b/drivers/xen/netback/netback.c @@ -335,8 +335,8 @@ { static struct net_device *eth0_dev = NULL; if (unlikely(eth0_dev == NULL)) - eth0_dev = __dev_get_by_name("eth0"); - netif_rx_schedule(eth0_dev); + eth0_dev = __dev_get_by_name(&init_net, "eth0"); + netif_rx_schedule(eth0_dev, ???); } /* * Add following to poll() function in NAPI driver (Tigon3 is example): --- a/drivers/xen/netback/xenbus.c +++ b/drivers/xen/netback/xenbus.c @@ -148,12 +148,10 @@ * and vif variables to the environment, for the benefit of the vif-* hotplug * scripts. */ -static int netback_uevent(struct xenbus_device *xdev, char **envp, - int num_envp, char *buffer, int buffer_size) +static int netback_uevent(struct xenbus_device *xdev, struct kobj_uevent_env *env) { struct backend_info *be = xdev->dev.driver_data; netif_t *netif = be->netif; - int i = 0, length = 0; char *val; DPRINTK("netback_uevent"); @@ -165,15 +163,11 @@ return err; } else { - add_uevent_var(envp, num_envp, &i, buffer, buffer_size, - &length, "script=%s", val); + add_uevent_var(env, "script=%s", val); kfree(val); } - add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length, - "vif=%s", netif->dev->name); - - envp[i] = NULL; + add_uevent_var(env, "vif=%s", netif->dev->name); return 0; } --- a/drivers/xen/netfront/accel.c +++ b/drivers/xen/netfront/accel.c @@ -325,13 +325,13 @@ DPRINTK("%p\n",vif_state); /* Make sure there are no data path operations going on */ - netif_poll_disable(vif_state->np->netdev); + napi_disable(&vif_state->np->napi); netif_tx_lock_bh(vif_state->np->netdev); vif_state->hooks = vif_state->np->accelerator->hooks; netif_tx_unlock_bh(vif_state->np->netdev); - netif_poll_enable(vif_state->np->netdev); + napi_enable(&vif_state->np->napi); } @@ -509,7 +509,7 @@ struct netfront_accel_vif_state *vif_state) { /* Make sure there are no data path operations going on */ - netif_poll_disable(vif_state->np->netdev); + napi_disable(&vif_state->np->napi); netif_tx_lock_bh(vif_state->np->netdev); /* @@ -521,7 +521,7 @@ vif_state->hooks = NULL; netif_tx_unlock_bh(vif_state->np->netdev); - netif_poll_enable(vif_state->np->netdev); + napi_enable(&vif_state->np->napi); } --- a/drivers/xen/netfront/netfront.c +++ b/drivers/xen/netfront/netfront.c @@ -634,7 +634,7 @@ if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx)){ netfront_accelerator_call_stop_napi_irq(np, dev); - netif_rx_schedule(dev); + netif_rx_schedule(dev, &np->napi); } } spin_unlock_bh(&np->rx_lock); @@ -706,7 +706,7 @@ netfront_accelerator_call_stop_napi_irq(np, dev); - netif_rx_schedule(dev); + netif_rx_schedule(dev, &np->napi); } static void network_alloc_rx_buffers(struct net_device *dev) @@ 
-1063,7 +1063,7 @@ if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx)) { netfront_accelerator_call_stop_napi_irq(np, dev); - netif_rx_schedule(dev); + netif_rx_schedule(dev, &np->napi); dev->last_rx = jiffies; } } @@ -1316,16 +1316,17 @@ #endif } -static int netif_poll(struct net_device *dev, int *pbudget) +static int netif_poll(struct napi_struct *napi, int budget) { - struct netfront_info *np = netdev_priv(dev); + struct netfront_info *np = container_of(napi, struct netfront_info, napi); + struct net_device *dev = np->netdev; struct sk_buff *skb; struct netfront_rx_info rinfo; struct netif_rx_response *rx = &rinfo.rx; struct netif_extra_info *extras = rinfo.extras; RING_IDX i, rp; struct multicall_entry *mcl; - int work_done, budget, more_to_do = 1, accel_more_to_do = 1; + int work_done, more_to_do = 1, accel_more_to_do = 1; struct sk_buff_head rxq; struct sk_buff_head errq; struct sk_buff_head tmpq; @@ -1345,8 +1346,6 @@ skb_queue_head_init(&errq); skb_queue_head_init(&tmpq); - if ((budget = *pbudget) > dev->quota) - budget = dev->quota; rp = np->rx.sring->rsp_prod; rmb(); /* Ensure we see queued responses up to 'rp'. */ @@ -1508,9 +1507,6 @@ accel_more_to_do = 0; } - *pbudget -= work_done; - dev->quota -= work_done; - if (work_done < budget) { local_irq_save(flags); @@ -1527,14 +1523,14 @@ } if (!more_to_do && !accel_more_to_do) - __netif_rx_complete(dev); + __netif_rx_complete(dev, napi); local_irq_restore(flags); } spin_unlock(&np->rx_lock); - return more_to_do | accel_more_to_do; + return work_done; } static void netif_release_tx_bufs(struct netfront_info *np) @@ -2089,16 +2085,14 @@ netdev->hard_start_xmit = network_start_xmit; netdev->stop = network_close; netdev->get_stats = network_get_stats; - netdev->poll = netif_poll; + netif_napi_add(netdev, &np->napi, netif_poll, 64); netdev->set_multicast_list = network_set_multicast_list; netdev->uninit = netif_uninit; netdev->set_mac_address = xennet_set_mac_address; netdev->change_mtu = xennet_change_mtu; - netdev->weight = 64; netdev->features = NETIF_F_IP_CSUM; SET_ETHTOOL_OPS(netdev, &network_ethtool_ops); - SET_MODULE_OWNER(netdev); SET_NETDEV_DEV(netdev, &dev->dev); np->netdev = netdev; --- a/drivers/xen/netfront/netfront.h +++ b/drivers/xen/netfront/netfront.h @@ -157,6 +157,8 @@ spinlock_t tx_lock; spinlock_t rx_lock; + struct napi_struct napi; + unsigned int irq; unsigned int copying_receiver; unsigned int carrier; --- a/drivers/xen/pciback/Makefile +++ b/drivers/xen/pciback/Makefile @@ -11,6 +11,4 @@ pciback-$(CONFIG_XEN_PCIDEV_BACKEND_PASS) += passthrough.o pciback-$(CONFIG_XEN_PCIDEV_BACKEND_CONTROLLER) += controller.o -ifeq ($(CONFIG_XEN_PCIDEV_BE_DEBUG),y) -EXTRA_CFLAGS += -DDEBUG -endif +ccflags-$(CONFIG_XEN_PCIDEV_BE_DEBUG) += -DDEBUG --- a/drivers/xen/pcifront/Makefile +++ b/drivers/xen/pcifront/Makefile @@ -2,6 +2,4 @@ pcifront-y := pci_op.o xenbus.o pci.o -ifeq ($(CONFIG_XEN_PCIDEV_FE_DEBUG),y) -EXTRA_CFLAGS += -DDEBUG -endif +ccflags-$(CONFIG_XEN_PCIDEV_FE_DEBUG) += -DDEBUG --- a/drivers/xen/sfc_netback/accel_fwd.c +++ b/drivers/xen/sfc_netback/accel_fwd.c @@ -181,10 +181,11 @@ unsigned long flags; cuckoo_hash_mac_key key = cuckoo_mac_to_key(mac); struct port_fwd *fwd_set = (struct port_fwd *)fwd_priv; + DECLARE_MAC_BUF(buf); BUG_ON(fwd_priv == NULL); - DPRINTK("Adding mac " MAC_FMT "\n", MAC_ARG(mac)); + DPRINTK("Adding mac %s\n", print_mac(buf, mac)); spin_lock_irqsave(&fwd_set->fwd_lock, flags); @@ -235,8 +236,9 @@ unsigned long flags; cuckoo_hash_mac_key key = cuckoo_mac_to_key(mac); struct port_fwd *fwd_set = (struct
port_fwd *)fwd_priv; + DECLARE_MAC_BUF(buf); - DPRINTK("Removing mac " MAC_FMT "\n", MAC_ARG(mac)); + DPRINTK("Removing mac %s\n", print_mac(buf, mac)); BUG_ON(fwd_priv == NULL); @@ -394,14 +396,16 @@ if (is_broadcast_ether_addr(skb_mac_header(skb)) && packet_is_arp_reply(skb)) { + DECLARE_MAC_BUF(buf); + /* * update our fast path forwarding to reflect this * gratuitous ARP */ mac = skb_mac_header(skb)+ETH_ALEN; - DPRINTK("%s: found gratuitous ARP for " MAC_FMT "\n", - __FUNCTION__, MAC_ARG(mac)); + DPRINTK("%s: found gratuitous ARP for %s\n", + __FUNCTION__, print_mac(buf, mac)); spin_lock_irqsave(&fwd_set->fwd_lock, flags); /* --- a/drivers/xen/sfc_netback/accel_msg.c +++ b/drivers/xen/sfc_netback/accel_msg.c @@ -57,11 +57,11 @@ { unsigned long lock_state; struct net_accel_msg *msg; + DECLARE_MAC_BUF(buf); BUG_ON(bend == NULL || mac == NULL); - VPRINTK("Sending local mac message: " MAC_FMT "\n", - MAC_ARG((const char *)mac)); + VPRINTK("Sending local mac message: %s\n", print_mac(buf, mac)); msg = net_accel_msg_start_send(bend->shared_page, &bend->to_domU, &lock_state); --- a/drivers/xen/sfc_netfront/accel_msg.c +++ b/drivers/xen/sfc_netfront/accel_msg.c @@ -41,11 +41,13 @@ /* Prime our interrupt */ spin_lock_irqsave(&vnic->irq_enabled_lock, flags); if (!netfront_accel_vi_enable_interrupts(vnic)) { + struct netfront_info *np = netdev_priv(vnic->net_dev); + /* Cripes, that was quick, better pass it up */ netfront_accel_disable_net_interrupts(vnic); vnic->irq_enabled = 0; NETFRONT_ACCEL_STATS_OP(vnic->stats.poll_schedule_count++); - netif_rx_schedule(vnic->net_dev); + netif_rx_schedule(vnic->net_dev, &np->napi); } else { /* * Nothing yet, make sure we get interrupts through @@ -72,6 +74,7 @@ static void vnic_start_fastpath(netfront_accel_vnic *vnic) { struct net_device *net_dev = vnic->net_dev; + struct netfront_info *np = netdev_priv(net_dev); unsigned long flags; DPRINTK("%s\n", __FUNCTION__); @@ -80,9 +83,9 @@ vnic->tx_enabled = 1; spin_unlock_irqrestore(&vnic->tx_lock, flags); - netif_poll_disable(net_dev); + napi_disable(&np->napi); vnic->poll_enabled = 1; - netif_poll_enable(net_dev); + napi_enable(&np->napi); vnic_start_interrupts(vnic); } @@ -114,11 +117,11 @@ spin_unlock_irqrestore(&vnic->tx_lock, flags1); /* Must prevent polls and hold lock to modify poll_enabled */ - netif_poll_disable(net_dev); + napi_disable(&np->napi); spin_lock_irqsave(&vnic->irq_enabled_lock, flags1); vnic->poll_enabled = 0; spin_unlock_irqrestore(&vnic->irq_enabled_lock, flags1); - netif_poll_enable(net_dev); + napi_enable(&np->napi); } @@ -326,8 +329,10 @@ cuckoo_hash_mac_key key; if (msg->u.localmac.flags & NET_ACCEL_MSG_ADD) { - DPRINTK("MAC has moved, could be local: " MAC_FMT "\n", - MAC_ARG(msg->u.localmac.mac)); + DECLARE_MAC_BUF(buf); + + DPRINTK("MAC has moved, could be local: %s\n", + print_mac(buf, msg->u.localmac.mac)); key = cuckoo_mac_to_key(msg->u.localmac.mac); spin_lock_irqsave(&vnic->table_lock, flags); /* Try to remove it, not a big deal if not there */ @@ -515,6 +520,8 @@ spin_lock_irqsave(&vnic->irq_enabled_lock, flags); if (vnic->irq_enabled) { + struct netfront_info *np = netdev_priv(net_dev); + netfront_accel_disable_net_interrupts(vnic); vnic->irq_enabled = 0; spin_unlock_irqrestore(&vnic->irq_enabled_lock, flags); @@ -527,7 +534,7 @@ vnic->stats.event_count_since_irq; vnic->stats.event_count_since_irq = 0; #endif - netif_rx_schedule(net_dev); + netif_rx_schedule(net_dev, &np->napi); } else { spin_unlock_irqrestore(&vnic->irq_enabled_lock, flags); ---
a/drivers/xen/sfc_netfront/accel_vi.c +++ b/drivers/xen/sfc_netfront/accel_vi.c @@ -641,8 +641,10 @@ (cuckoo_hash_key *)(&key), &value); if (!try_fastpath) { - VPRINTK("try fast path false for mac: " MAC_FMT "\n", - MAC_ARG(skb->data)); + DECLARE_MAC_BUF(buf); + + VPRINTK("try fast path false for mac: %s\n", + print_mac(buf, skb->data)); return NETFRONT_ACCEL_STATUS_CANT; } @@ -768,9 +770,10 @@ if (compare_ether_addr(skb->data, vnic->mac)) { struct iphdr *ip = (struct iphdr *)(skb->data + ETH_HLEN); u16 port; + DECLARE_MAC_BUF(buf); - DPRINTK("%s: saw wrong MAC address " MAC_FMT "\n", - __FUNCTION__, MAC_ARG(skb->data)); + DPRINTK("%s: saw wrong MAC address %s\n", + __FUNCTION__, print_mac(buf, skb->data)); if (ip->protocol == IPPROTO_TCP) { struct tcphdr *tcp = (struct tcphdr *) --- a/drivers/xen/sfc_netutil/accel_util.h +++ b/drivers/xen/sfc_netutil/accel_util.h @@ -63,9 +63,6 @@ DPRINTK("%s at %s:%d\n", #exp, __FILE__, __LINE__); \ } while(0) -#define MAC_FMT "%.2x:%.2x:%.2x:%.2x:%.2x:%.2x" -#define MAC_ARG(_mac) (_mac)[0], (_mac)[1], (_mac)[2], (_mac)[3], (_mac)[4], (_mac)[5] - #include /*! Map a set of pages from another domain --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -174,11 +174,9 @@ } #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) && (defined(CONFIG_XEN) || defined(MODULE)) -static int xenbus_uevent_frontend(struct device *dev, char **envp, - int num_envp, char *buffer, int buffer_size) +static int xenbus_uevent_frontend(struct device *dev, struct kobj_uevent_env *env) { struct xenbus_device *xdev; - int length = 0, i = 0; if (dev == NULL) return -ENODEV; @@ -187,12 +185,9 @@ return -ENODEV; /* stuff we want to pass to /sbin/hotplug */ - add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length, - "XENBUS_TYPE=%s", xdev->devicetype); - add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length, - "XENBUS_PATH=%s", xdev->nodename); - add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length, - "MODALIAS=xen:%s", xdev->devicetype); + add_uevent_var(env, "XENBUS_TYPE=%s", xdev->devicetype); + add_uevent_var(env, "XENBUS_PATH=%s", xdev->nodename); + add_uevent_var(env, "MODALIAS=xen:%s", xdev->devicetype); return 0; } --- a/drivers/xen/xenbus/xenbus_probe_backend.c +++ b/drivers/xen/xenbus/xenbus_probe_backend.c @@ -60,8 +60,7 @@ #include #endif -static int xenbus_uevent_backend(struct device *dev, char **envp, - int num_envp, char *buffer, int buffer_size); +static int xenbus_uevent_backend(struct device *dev, struct kobj_uevent_env *env); static int xenbus_probe_backend(const char *type, const char *domid); extern int read_otherend_details(struct xenbus_device *xendev, @@ -128,13 +127,10 @@ }, }; -static int xenbus_uevent_backend(struct device *dev, char **envp, - int num_envp, char *buffer, int buffer_size) +static int xenbus_uevent_backend(struct device *dev, struct kobj_uevent_env *env) { struct xenbus_device *xdev; struct xenbus_driver *drv; - int i = 0; - int length = 0; DPRINTK(""); @@ -146,27 +142,16 @@ return -ENODEV; /* stuff we want to pass to /sbin/hotplug */ - add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length, - "XENBUS_TYPE=%s", xdev->devicetype); + add_uevent_var(env, "XENBUS_TYPE=%s", xdev->devicetype); - add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length, - "XENBUS_PATH=%s", xdev->nodename); + add_uevent_var(env, "XENBUS_PATH=%s", xdev->nodename); - add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length, - "XENBUS_BASE_PATH=%s", xenbus_backend.root); - - /*
terminate, set to next free slot, shrink available space */ - envp[i] = NULL; - envp = &envp[i]; - num_envp -= i; - buffer = &buffer[length]; - buffer_size -= length; + add_uevent_var(env, "XENBUS_BASE_PATH=%s", xenbus_backend.root); if (dev->driver) { drv = to_xenbus_driver(dev->driver); if (drv && drv->uevent) - return drv->uevent(xdev, envp, num_envp, buffer, - buffer_size); + return drv->uevent(xdev, env); } return 0; --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -187,7 +187,7 @@ { a_list_t *aentry; -#ifdef CONFIG_XEN +#if defined(CONFIG_XEN) || defined(CONFIG_PARAVIRT_XEN) /* * Xen needs to be able to make sure it can get an exclusive * RO mapping of pages it wants to turn into a pagetable. If --- a/include/asm-x86/mach-xen/asm/agp.h +++ b/include/asm-x86/mach-xen/asm/agp.h @@ -1,20 +1,22 @@ -#ifndef AGP_H -#define AGP_H 1 +#ifndef _ASM_X86_AGP_H +#define _ASM_X86_AGP_H #include #include #include -/* - * Functions to keep the agpgart mappings coherent with the MMU. - * The GART gives the CPU a physical alias of pages in memory. The alias region is - * mapped uncacheable. Make sure there are no conflicting mappings - * with different cachability attributes for the same page. This avoids - * data corruption on some CPUs. +/* + * Functions to keep the agpgart mappings coherent with the MMU. The + * GART gives the CPU a physical alias of pages in memory. The alias + * region is mapped uncacheable. Make sure there are no conflicting + * mappings with different cachability attributes for the same + * page. This avoids data corruption on some CPUs. */ -/* Caller's responsibility to call global_flush_tlb() for - * performance reasons */ +/* + * Caller's responsibility to call global_flush_tlb() for performance + * reasons + */ #define map_page_into_agp(page) ( \ xen_create_contiguous_region((unsigned long)page_address(page), 0, 32) \ ?: change_page_attr(page, 1, PAGE_KERNEL_NOCACHE)) @@ -24,9 +26,11 @@ change_page_attr(page, 1, PAGE_KERNEL)) #define flush_agp_mappings() global_flush_tlb() -/* Could use CLFLUSH here if the cpu supports it. But then it would - need to be called for each cacheline of the whole page so it may not be - worth it. Would need a page for it. */ +/* + * Could use CLFLUSH here if the cpu supports it. But then it would + * need to be called for each cacheline of the whole page so it may + * not be worth it. Would need a page for it. + */ #define flush_agp_cache() wbinvd() /* Convert a physical address to an address suitable for the GART. */ --- /dev/null +++ b/include/asm-x86/mach-xen/asm/desc.h @@ -0,0 +1,5 @@ +#ifdef CONFIG_X86_32 +# include "desc_32.h" +#else +# include "desc_64.h" +#endif --- a/include/asm-x86/mach-xen/asm/desc_64.h +++ b/include/asm-x86/mach-xen/asm/desc_64.h @@ -34,6 +34,18 @@ put_cpu(); } +#ifndef CONFIG_X86_NO_TSS +static inline unsigned long __store_tr(void) +{ + unsigned long tr; + + asm volatile ("str %w0":"=r" (tr)); + return tr; +} + +#define store_tr(tr) (tr) = __store_tr() +#endif + /* * This is the ldt that every process will get unless we need * something other than this. 
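The lgdt/sgdt wrappers added to desc_64.h in the next hunk are fenced with #ifndef CONFIG_XEN because a paravirtualized guest runs outside ring 0 and cannot execute the bare descriptor-table instructions; it installs its GDT through a hypercall instead. A rough sketch of the split, for orientation only: HYPERVISOR_set_gdt() follows the hypercall wrappers elsewhere in this tree, while native_load_gdt() and xen_load_gdt_sketch() are hypothetical helpers, not something this patch adds.

	/* Native: the privileged instruction is directly available. */
	static inline void native_load_gdt(const struct desc_ptr *ptr)
	{
		asm volatile("lgdt %0" : : "m" (*ptr));
	}

	/*
	 * Xen guest: hand the machine frame numbers backing the GDT to
	 * the hypervisor, which validates the entries and maps the table.
	 */
	static inline int xen_load_gdt_sketch(unsigned long *frame_list,
					      int entries)
	{
		return HYPERVISOR_set_gdt(frame_list, entries);
	}
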
@@ -47,6 +59,18 @@ /* the cpu gdt accessor */ #define cpu_gdt(_cpu) ((struct desc_struct *)cpu_gdt_descr[_cpu].address) +#ifndef CONFIG_XEN +static inline void load_gdt(const struct desc_ptr *ptr) +{ + asm volatile("lgdt %w0"::"m" (*ptr)); +} + +static inline void store_gdt(struct desc_ptr *ptr) +{ + asm("sgdt %w0":"=m" (*ptr)); +} +#endif + static inline void _set_gate(void *adr, unsigned type, unsigned long func, unsigned dpl, unsigned ist) { struct gate_struct s; @@ -87,6 +111,16 @@ { _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, ist); } + +static inline void load_idt(const struct desc_ptr *ptr) +{ + asm volatile("lidt %w0"::"m" (*ptr)); +} + +static inline void store_idt(struct desc_ptr *dtr) +{ + asm("sidt %w0":"=m" (*dtr)); +} #endif static inline void set_tssldt_descriptor(void *ptr, unsigned long tss, unsigned type, --- /dev/null +++ b/include/asm-x86/mach-xen/asm/dma-mapping.h @@ -0,0 +1,5 @@ +#ifdef CONFIG_X86_32 +# include "dma-mapping_32.h" +#else +# include "dma-mapping_64.h" +#endif --- a/include/asm-x86/mach-xen/asm/dma-mapping_32.h +++ b/include/asm-x86/mach-xen/asm/dma-mapping_32.h @@ -7,9 +7,9 @@ */ #include +#include #include #include -#include #include static inline int --- a/include/asm-x86/mach-xen/asm/dma-mapping_64.h +++ b/include/asm-x86/mach-xen/asm/dma-mapping_64.h @@ -6,8 +6,7 @@ * documentation. */ - -#include +#include #include struct dma_mapping_ops { @@ -203,4 +202,4 @@ #endif /* _X8664_DMA_MAPPING_H */ -#include +#include "dma-mapping_32.h" --- /dev/null +++ b/include/asm-x86/mach-xen/asm/e820.h @@ -0,0 +1,33 @@ +#ifndef __ASM_E820_H +#define __ASM_E820_H +#define E820MAP 0x2d0 /* our map */ +#define E820MAX 128 /* number of entries in E820MAP */ +#define E820NR 0x1e8 /* # entries in E820MAP */ + +#define E820_RAM 1 +#define E820_RESERVED 2 +#define E820_ACPI 3 +#define E820_NVS 4 + +#ifndef __ASSEMBLY__ +struct e820entry { + __u64 addr; /* start of memory segment */ + __u64 size; /* size of memory segment */ + __u32 type; /* type of memory segment */ +} __attribute__((packed)); + +struct e820map { + __u32 nr_map; + struct e820entry map[E820MAX]; +}; +#endif /* __ASSEMBLY__ */ + +#ifdef __KERNEL__ +#ifdef CONFIG_X86_32 +# include "../../e820_32.h" +#else +# include "e820_64.h" +#endif +#endif /* __KERNEL__ */ + +#endif /* __ASM_E820_H */ --- a/include/asm-x86/mach-xen/asm/e820_64.h +++ b/include/asm-x86/mach-xen/asm/e820_64.h @@ -11,27 +11,7 @@ #ifndef __E820_HEADER #define __E820_HEADER -#define E820MAP 0x2d0 /* our map */ -#define E820MAX 128 /* number of entries in E820MAP */ -#define E820NR 0x1e8 /* # entries in E820MAP */ - -#define E820_RAM 1 -#define E820_RESERVED 2 -#define E820_ACPI 3 -#define E820_NVS 4 - #ifndef __ASSEMBLY__ -struct e820entry { - u64 addr; /* start of memory segment */ - u64 size; /* size of memory segment */ - u32 type; /* type of memory segment */ -} __attribute__((packed)); - -struct e820map { - u32 nr_map; - struct e820entry map[E820MAX]; -}; - extern unsigned long find_e820_area(unsigned long start, unsigned long end, unsigned size); extern void add_memory_region(unsigned long start, unsigned long size, --- /dev/null +++ b/include/asm-x86/mach-xen/asm/fixmap.h @@ -0,0 +1,5 @@ +#ifdef CONFIG_X86_32 +# include "fixmap_32.h" +#else +# include "fixmap_64.h" +#endif --- /dev/null +++ b/include/asm-x86/mach-xen/asm/hw_irq.h @@ -0,0 +1,5 @@ +#ifdef CONFIG_X86_32 +# include "hw_irq_32.h" +#else +# include "hw_irq_64.h" +#endif --- a/include/asm-x86/mach-xen/asm/hw_irq_64.h +++ 
b/include/asm-x86/mach-xen/asm/hw_irq_64.h @@ -41,22 +41,22 @@ /* * Vectors 0x30-0x3f are used for ISA interrupts. */ -#define IRQ0_VECTOR FIRST_EXTERNAL_VECTOR + 0x10 -#define IRQ1_VECTOR IRQ0_VECTOR + 1 -#define IRQ2_VECTOR IRQ0_VECTOR + 2 -#define IRQ3_VECTOR IRQ0_VECTOR + 3 -#define IRQ4_VECTOR IRQ0_VECTOR + 4 -#define IRQ5_VECTOR IRQ0_VECTOR + 5 -#define IRQ6_VECTOR IRQ0_VECTOR + 6 -#define IRQ7_VECTOR IRQ0_VECTOR + 7 -#define IRQ8_VECTOR IRQ0_VECTOR + 8 -#define IRQ9_VECTOR IRQ0_VECTOR + 9 -#define IRQ10_VECTOR IRQ0_VECTOR + 10 -#define IRQ11_VECTOR IRQ0_VECTOR + 11 -#define IRQ12_VECTOR IRQ0_VECTOR + 12 -#define IRQ13_VECTOR IRQ0_VECTOR + 13 -#define IRQ14_VECTOR IRQ0_VECTOR + 14 -#define IRQ15_VECTOR IRQ0_VECTOR + 15 +#define IRQ0_VECTOR (FIRST_EXTERNAL_VECTOR + 0x10) +#define IRQ1_VECTOR (IRQ0_VECTOR + 1) +#define IRQ2_VECTOR (IRQ0_VECTOR + 2) +#define IRQ3_VECTOR (IRQ0_VECTOR + 3) +#define IRQ4_VECTOR (IRQ0_VECTOR + 4) +#define IRQ5_VECTOR (IRQ0_VECTOR + 5) +#define IRQ6_VECTOR (IRQ0_VECTOR + 6) +#define IRQ7_VECTOR (IRQ0_VECTOR + 7) +#define IRQ8_VECTOR (IRQ0_VECTOR + 8) +#define IRQ9_VECTOR (IRQ0_VECTOR + 9) +#define IRQ10_VECTOR (IRQ0_VECTOR + 10) +#define IRQ11_VECTOR (IRQ0_VECTOR + 11) +#define IRQ12_VECTOR (IRQ0_VECTOR + 12) +#define IRQ13_VECTOR (IRQ0_VECTOR + 13) +#define IRQ14_VECTOR (IRQ0_VECTOR + 14) +#define IRQ15_VECTOR (IRQ0_VECTOR + 15) /* * Special IRQ vectors used by the SMP architecture, 0xf0-0xff @@ -150,9 +150,6 @@ #define IO_APIC_IRQ(x) (((x) >= 16) || ((1<<(x)) & io_apic_irqs)) -#define __STR(x) #x -#define STR(x) __STR(x) - #include #define IRQ_NAME2(nr) nr##_interrupt(void) --- /dev/null +++ b/include/asm-x86/mach-xen/asm/hypercall.h @@ -0,0 +1,5 @@ +#ifdef CONFIG_X86_32 +# include "hypercall_32.h" +#else +# include "hypercall_64.h" +#endif --- /dev/null +++ b/include/asm-x86/mach-xen/asm/io.h @@ -0,0 +1,5 @@ +#ifdef CONFIG_X86_32 +# include "io_32.h" +#else +# include "io_64.h" +#endif --- a/include/asm-x86/mach-xen/asm/io_32.h +++ b/include/asm-x86/mach-xen/asm/io_32.h @@ -212,17 +212,22 @@ #define mmiowb() -static inline void memset_io(volatile void __iomem *addr, unsigned char val, int count) +static inline void +memset_io(volatile void __iomem *addr, unsigned char val, int count) { - memset((void __force *) addr, val, count); + memset((void __force *)addr, val, count); } -static inline void memcpy_fromio(void *dst, const volatile void __iomem *src, int count) + +static inline void +memcpy_fromio(void *dst, const volatile void __iomem *src, int count) { - __memcpy(dst, (void __force *) src, count); + __memcpy(dst, (const void __force *)src, count); } -static inline void memcpy_toio(volatile void __iomem *dst, const void *src, int count) + +static inline void +memcpy_toio(volatile void __iomem *dst, const void *src, int count) { - __memcpy((void __force *) dst, src, count); + __memcpy((void __force *)dst, src, count); } /* @@ -250,18 +255,9 @@ __asm__ __volatile__ ("lock; addl $0,0(%%esp)": : :"memory"); } -#define dma_cache_inv(_start,_size) flush_write_buffers() -#define dma_cache_wback(_start,_size) flush_write_buffers() -#define dma_cache_wback_inv(_start,_size) flush_write_buffers() - #else -/* Nothing to do */ - -#define dma_cache_inv(_start,_size) do { } while (0) -#define dma_cache_wback(_start,_size) do { } while (0) -#define dma_cache_wback_inv(_start,_size) do { } while (0) -#define flush_write_buffers() +#define flush_write_buffers() do { } while (0) #endif --- a/include/asm-x86/mach-xen/asm/io_64.h +++ b/include/asm-x86/mach-xen/asm/io_64.h 
@@ -268,12 +268,6 @@ */ #define __ISA_IO_base ((char __iomem *)(fix_to_virt(FIX_ISAMAP_BEGIN))) -/* Nothing to do */ - -#define dma_cache_inv(_start,_size) do { } while (0) -#define dma_cache_wback(_start,_size) do { } while (0) -#define dma_cache_wback_inv(_start,_size) do { } while (0) - #define flush_write_buffers() extern int iommu_bio_merge; --- /dev/null +++ b/include/asm-x86/mach-xen/asm/irq.h @@ -0,0 +1,5 @@ +#ifdef CONFIG_X86_32 +# include "../../irq_32.h" +#else +# include "irq_64.h" +#endif --- /dev/null +++ b/include/asm-x86/mach-xen/asm/irqflags.h @@ -0,0 +1,5 @@ +#ifdef CONFIG_X86_32 +# include "irqflags_32.h" +#else +# include "irqflags_64.h" +#endif --- a/include/asm-x86/mach-xen/asm/irqflags_32.h +++ b/include/asm-x86/mach-xen/asm/irqflags_32.h @@ -151,6 +151,23 @@ \ raw_irqs_disabled_flags(flags); \ }) + +/* + * makes the traced hardirq state match with the machine state + * + * should be a rarely used function, only in places where its + * otherwise impossible to know the irq state, like in traps. + */ +static inline void trace_hardirqs_fixup_flags(unsigned long flags) +{ + if (raw_irqs_disabled_flags(flags)) + trace_hardirqs_off(); + else + trace_hardirqs_on(); +} + +#define trace_hardirqs_fixup() \ + trace_hardirqs_fixup_flags(__raw_local_save_flags()) #endif /* __ASSEMBLY__ */ /* @@ -182,4 +199,17 @@ # define TRACE_IRQS_OFF #endif +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define LOCKDEP_SYS_EXIT \ + pushl %eax; \ + pushl %ecx; \ + pushl %edx; \ + call lockdep_sys_exit; \ + popl %edx; \ + popl %ecx; \ + popl %eax; +#else +# define LOCKDEP_SYS_EXIT +#endif + #endif --- a/include/asm-x86/mach-xen/asm/irqflags_64.h +++ b/include/asm-x86/mach-xen/asm/irqflags_64.h @@ -116,6 +116,22 @@ }) /* + * makes the traced hardirq state match with the machine state + * + * should be a rarely used function, only in places where its + * otherwise impossible to know the irq state, like in traps. + */ +static inline void trace_hardirqs_fixup_flags(unsigned long flags) +{ + if (raw_irqs_disabled_flags(flags)) + trace_hardirqs_off(); + else + trace_hardirqs_on(); +} + +#define trace_hardirqs_fixup() \ + trace_hardirqs_fixup_flags(__raw_local_save_flags()) +/* * Used in the idle loop; sti takes one instruction cycle * to complete: */ @@ -143,6 +159,20 @@ # define TRACE_IRQS_ON # define TRACE_IRQS_OFF # endif +# ifdef CONFIG_DEBUG_LOCK_ALLOC +# define LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk +# define LOCKDEP_SYS_EXIT_IRQ \ + TRACE_IRQS_ON; \ + sti; \ + SAVE_REST; \ + LOCKDEP_SYS_EXIT; \ + RESTORE_REST; \ + cli; \ + TRACE_IRQS_OFF; +# else +# define LOCKDEP_SYS_EXIT +# define LOCKDEP_SYS_EXIT_IRQ +# endif #endif #endif --- /dev/null +++ b/include/asm-x86/mach-xen/asm/maddr.h @@ -0,0 +1,5 @@ +#ifdef CONFIG_X86_32 +# include "maddr_32.h" +#else +# include "maddr_64.h" +#endif --- a/include/asm-x86/mach-xen/asm/mmu.h +++ b/include/asm-x86/mach-xen/asm/mmu.h @@ -1,21 +1,33 @@ -#ifndef __i386_MMU_H -#define __i386_MMU_H +#ifndef _ASM_X86_MMU_H +#define _ASM_X86_MMU_H + +#include +#include -#include /* - * The i386 doesn't have a mmu context, but + * The x86 doesn't have a mmu context, but * we put the segment information here. * * cpu_vm_mask is used to optimize ldt flushing. 
*/ typedef struct { - int size; - struct semaphore sem; void *ldt; +#ifdef CONFIG_X86_64 + rwlock_t ldtlock; +#endif + int size; + struct mutex lock; void *vdso; -#ifdef CONFIG_XEN - int has_foreign_mappings; + unsigned has_foreign_mappings:1; +#ifdef CONFIG_X86_64 + unsigned pinned:1; + struct list_head unpinned; #endif } mm_context_t; +#ifdef CONFIG_X86_64 +extern struct list_head mm_unpinned; +extern spinlock_t mm_unpinned_lock; #endif + +#endif /* _ASM_X86_MMU_H */ --- a/include/asm-x86/mach-xen/asm/mmu_64.h +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef __x86_64_MMU_H -#define __x86_64_MMU_H - -#include -#include - -/* - * The x86_64 doesn't have a mmu context, but - * we put the segment information here. - * - * cpu_vm_mask is used to optimize ldt flushing. - */ -typedef struct { - void *ldt; - rwlock_t ldtlock; - int size; - struct semaphore sem; - void *vdso; -#ifdef CONFIG_XEN - unsigned pinned:1; - unsigned has_foreign_mappings:1; - struct list_head unpinned; -#endif -} mm_context_t; - -#ifdef CONFIG_XEN -extern struct list_head mm_unpinned; -extern spinlock_t mm_unpinned_lock; -#endif - -#endif --- /dev/null +++ b/include/asm-x86/mach-xen/asm/mmu_context.h @@ -0,0 +1,5 @@ +#ifdef CONFIG_X86_32 +# include "mmu_context_32.h" +#else +# include "mmu_context_64.h" +#endif --- /dev/null +++ b/include/asm-x86/mach-xen/asm/nmi.h @@ -0,0 +1,7 @@ +#ifdef CONFIG_X86_32 +# include "../../nmi_32.h" +#else +# include "../../nmi_64.h" +# undef get_nmi_reason +# include "../mach_traps.h" +#endif --- /dev/null +++ b/include/asm-x86/mach-xen/asm/page.h @@ -0,0 +1,13 @@ +#ifdef __KERNEL__ +# ifdef CONFIG_X86_32 +# include "page_32.h" +# else +# include "page_64.h" +# endif +#else +# ifdef __i386__ +# include "page_32.h" +# else +# include "page_64.h" +# endif +#endif --- a/include/asm-x86/mach-xen/asm/page_64.h +++ b/include/asm-x86/mach-xen/asm/page_64.h @@ -207,6 +207,7 @@ VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) #define __HAVE_ARCH_GATE_AREA 1 +#define vmemmap ((struct page *)VMEMMAP_START) #include #include --- /dev/null +++ b/include/asm-x86/mach-xen/asm/pci.h @@ -0,0 +1,100 @@ +#ifndef __x86_PCI_H +#define __x86_PCI_H + +#include /* for struct page */ +#include +#include +#include +#include +#include + + +#ifdef __KERNEL__ + +struct pci_sysdata { + int domain; /* PCI domain */ + int node; /* NUMA node */ +#ifdef CONFIG_X86_64 + void* iommu; /* IOMMU private data */ +#endif +#ifdef CONFIG_XEN_PCIDEV_FRONTEND + struct pcifront_device *pdev; +#endif +}; + +/* scan a bus after allocating a pci_sysdata for it */ +extern struct pci_bus *pci_scan_bus_with_sysdata(int busno); + +static inline int pci_domain_nr(struct pci_bus *bus) +{ + struct pci_sysdata *sd = bus->sysdata; + return sd->domain; +} + +static inline int pci_proc_domain(struct pci_bus *bus) +{ + return pci_domain_nr(bus); +} + + +/* Can be used to override the logic in pci_scan_bus for skipping + already-configured bus numbers - to be used for buggy BIOSes + or architectures with incomplete PCI setup by the loader */ + +#ifdef CONFIG_PCI +extern unsigned int pcibios_assign_all_busses(void); +#else +#define pcibios_assign_all_busses() 0 +#endif + +#include +#define pcibios_scan_all_fns(a, b) (!is_initial_xendomain()) + +extern unsigned long pci_mem_start; +#define PCIBIOS_MIN_IO 0x1000 +#define PCIBIOS_MIN_MEM (pci_mem_start) + +#define PCIBIOS_MIN_CARDBUS_IO 0x4000 + +void pcibios_config_init(void); +struct pci_bus * pcibios_scan_root(int bus); + +void pcibios_set_master(struct pci_dev *dev); +void 
pcibios_penalize_isa_irq(int irq, int active); +struct irq_routing_table *pcibios_get_irq_routing_table(void); +int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq); + + +#define HAVE_PCI_MMAP +extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, + enum pci_mmap_state mmap_state, int write_combine); + + +#ifdef CONFIG_PCI +static inline void pci_dma_burst_advice(struct pci_dev *pdev, + enum pci_dma_burst_strategy *strat, + unsigned long *strategy_parameter) +{ + *strat = PCI_DMA_BURST_INFINITY; + *strategy_parameter = ~0UL; +} +#endif + + +#endif /* __KERNEL__ */ + +#ifdef CONFIG_X86_32 +# include "pci_32.h" +#else +# include "pci_64.h" +#endif + +/* implement the pci_ DMA API in terms of the generic device dma_ one */ +#include + +/* generic pci stuff */ +#include + + + +#endif --- a/include/asm-x86/mach-xen/asm/pci_32.h +++ b/include/asm-x86/mach-xen/asm/pci_32.h @@ -4,52 +4,10 @@ #ifdef __KERNEL__ -struct pci_sysdata { - int node; /* NUMA node */ -}; - -/* scan a bus after allocating a pci_sysdata for it */ -extern struct pci_bus *pci_scan_bus_with_sysdata(int busno); - -#include /* for struct page */ - -/* Can be used to override the logic in pci_scan_bus for skipping - already-configured bus numbers - to be used for buggy BIOSes - or architectures with incomplete PCI setup by the loader */ - -#ifdef CONFIG_PCI -extern unsigned int pcibios_assign_all_busses(void); -#else -#define pcibios_assign_all_busses() 0 -#endif - -#include -#define pcibios_scan_all_fns(a, b) (!is_initial_xendomain()) - -extern unsigned long pci_mem_start; -#define PCIBIOS_MIN_IO 0x1000 -#define PCIBIOS_MIN_MEM (pci_mem_start) - -#define PCIBIOS_MIN_CARDBUS_IO 0x4000 - -void pcibios_config_init(void); -struct pci_bus * pcibios_scan_root(int bus); - -void pcibios_set_master(struct pci_dev *dev); -void pcibios_penalize_isa_irq(int irq, int active); -struct irq_routing_table *pcibios_get_irq_routing_table(void); -int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq); - /* Dynamic DMA mapping stuff. * i386 has everything mapped statically. 
*/ -#include -#include -#include -#include -#include - struct pci_dev; #ifdef CONFIG_SWIOTLB @@ -89,31 +47,8 @@ #endif -#define HAVE_PCI_MMAP -extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, - enum pci_mmap_state mmap_state, int write_combine); - - -#ifdef CONFIG_PCI -static inline void pci_dma_burst_advice(struct pci_dev *pdev, - enum pci_dma_burst_strategy *strat, - unsigned long *strategy_parameter) -{ - *strat = PCI_DMA_BURST_INFINITY; - *strategy_parameter = ~0UL; -} -#endif #endif /* __KERNEL__ */ -#ifdef CONFIG_XEN_PCIDEV_FRONTEND -#include -#endif /* CONFIG_XEN_PCIDEV_FRONTEND */ - -/* implement the pci_ DMA API in terms of the generic device dma_ one */ -#include - -/* generic pci stuff */ -#include #endif /* __i386_PCI_H */ --- a/include/asm-x86/mach-xen/asm/pci_64.h +++ b/include/asm-x86/mach-xen/asm/pci_64.h @@ -1,16 +1,9 @@ #ifndef __x8664_PCI_H #define __x8664_PCI_H -#include #ifdef __KERNEL__ -struct pci_sysdata { - int node; /* NUMA node */ - void* iommu; /* IOMMU private data */ -}; - -extern struct pci_bus *pci_scan_bus_with_sysdata(int busno); #ifdef CONFIG_CALGARY_IOMMU static inline void* pci_iommu(struct pci_bus *bus) @@ -26,42 +19,11 @@ } #endif /* CONFIG_CALGARY_IOMMU */ -#include /* for struct page */ - -/* Can be used to override the logic in pci_scan_bus for skipping - already-configured bus numbers - to be used for buggy BIOSes - or architectures with incomplete PCI setup by the loader */ - -#ifdef CONFIG_PCI -extern unsigned int pcibios_assign_all_busses(void); -#else -#define pcibios_assign_all_busses() 0 -#endif - -#include -#define pcibios_scan_all_fns(a, b) (!is_initial_xendomain()) - -extern unsigned long pci_mem_start; -#define PCIBIOS_MIN_IO 0x1000 -#define PCIBIOS_MIN_MEM (pci_mem_start) - -#define PCIBIOS_MIN_CARDBUS_IO 0x4000 -void pcibios_config_init(void); -struct pci_bus * pcibios_scan_root(int bus); extern int (*pci_config_read)(int seg, int bus, int dev, int fn, int reg, int len, u32 *value); extern int (*pci_config_write)(int seg, int bus, int dev, int fn, int reg, int len, u32 value); -void pcibios_set_master(struct pci_dev *dev); -void pcibios_penalize_isa_irq(int irq, int active); -struct irq_routing_table *pcibios_get_irq_routing_table(void); -int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq); - -#include -#include -#include -#include -#include + extern void pci_iommu_alloc(void); extern int iommu_setup(char *opt); @@ -75,7 +37,7 @@ */ #define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys) -#if defined(CONFIG_IOMMU) || defined(CONFIG_CALGARY_IOMMU) +#if defined(CONFIG_GART_IOMMU) || defined(CONFIG_CALGARY_IOMMU) #define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \ dma_addr_t ADDR_NAME; @@ -119,27 +81,7 @@ #endif -#include - -#ifdef CONFIG_PCI -static inline void pci_dma_burst_advice(struct pci_dev *pdev, - enum pci_dma_burst_strategy *strat, - unsigned long *strategy_parameter) -{ - *strat = PCI_DMA_BURST_INFINITY; - *strategy_parameter = ~0UL; -} -#endif - -#define HAVE_PCI_MMAP -extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, - enum pci_mmap_state mmap_state, int write_combine); - #endif /* __KERNEL__ */ -/* generic pci stuff */ -#ifdef CONFIG_PCI -#include -#endif #endif /* __x8664_PCI_H */ --- /dev/null +++ b/include/asm-x86/mach-xen/asm/pgalloc.h @@ -0,0 +1,5 @@ +#ifdef CONFIG_X86_32 +# include "pgalloc_32.h" +#else +# include "pgalloc_64.h" +#endif --- /dev/null +++ b/include/asm-x86/mach-xen/asm/pgtable.h @@ -0,0 +1,5 @@ +#ifdef CONFIG_X86_32 +# include "pgtable_32.h" 
+#else +# include "pgtable_64.h" +#endif --- a/include/asm-x86/mach-xen/asm/pgtable_32.h +++ b/include/asm-x86/mach-xen/asm/pgtable_32.h @@ -17,10 +17,7 @@ #include #include -#ifndef _I386_BITOPS_H -#include -#endif - +#include #include #include #include @@ -40,7 +37,7 @@ extern struct page *pgd_list; void check_pgt_cache(void); -void pmd_ctor(void *, struct kmem_cache *, unsigned long); +void pmd_ctor(struct kmem_cache *, void *); void pgtable_cache_init(void); void paging_init(void); --- a/include/asm-x86/mach-xen/asm/pgtable_64.h +++ b/include/asm-x86/mach-xen/asm/pgtable_64.h @@ -9,7 +9,7 @@ * the x86-64 page table tree. */ #include -#include +#include #include #include #include @@ -138,6 +138,7 @@ #define MAXMEM _AC(0x3fffffffffff, UL) #define VMALLOC_START _AC(0xffffc20000000000, UL) #define VMALLOC_END _AC(0xffffe1ffffffffff, UL) +#define VMEMMAP_START _AC(0xffffe20000000000, UL) #define MODULES_VADDR _AC(0xffffffff88000000, UL) #define MODULES_END _AC(0xfffffffffff00000, UL) #define MODULES_LEN (MODULES_END - MODULES_VADDR) --- /dev/null +++ b/include/asm-x86/mach-xen/asm/processor.h @@ -0,0 +1,5 @@ +#ifdef CONFIG_X86_32 +# include "processor_32.h" +#else +# include "processor_64.h" +#endif --- a/include/asm-x86/mach-xen/asm/processor_32.h +++ b/include/asm-x86/mach-xen/asm/processor_32.h @@ -80,6 +80,7 @@ unsigned char booted_cores; /* number of cores as seen by OS */ __u8 phys_proc_id; /* Physical processor id. */ __u8 cpu_core_id; /* Core id */ + __u8 cpu_index; /* index into per_cpu list */ #endif } __attribute__((__aligned__(SMP_CACHE_BYTES))); @@ -106,14 +107,19 @@ #endif #ifdef CONFIG_SMP -extern struct cpuinfo_x86 cpu_data[]; -#define current_cpu_data cpu_data[smp_processor_id()] +DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info); +#define cpu_data(cpu) per_cpu(cpu_info, cpu) +#define current_cpu_data cpu_data(smp_processor_id()) #else -#define cpu_data (&boot_cpu_data) -#define current_cpu_data boot_cpu_data +#define cpu_data(cpu) boot_cpu_data +#define current_cpu_data boot_cpu_data #endif -extern int cpu_llc_id[NR_CPUS]; +/* + * the following now lives in the per cpu area: + * extern int cpu_llc_id[NR_CPUS]; + */ +DECLARE_PER_CPU(u8, cpu_llc_id); extern char ignore_fpu_irq; void __init cpu_detect(struct cpuinfo_x86 *c); @@ -560,7 +566,9 @@ * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx * resulting in stale register contents being returned. 
*/ -static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) +static inline void cpuid(unsigned int op, + unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) { *eax = op; *ecx = 0; @@ -568,8 +576,9 @@ } /* Some CPUID calls want 'count' to be placed in ecx */ -static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx, - int *edx) +static inline void cpuid_count(unsigned int op, int count, + unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) { *eax = op; *ecx = count; @@ -639,6 +648,17 @@ #define K7_NOP7 ".byte 0x8D,0x04,0x05,0,0,0,0\n" #define K7_NOP8 K7_NOP7 ASM_NOP1 +/* P6 nops */ +/* uses eax dependencies (Intel-recommended choice) */ +#define P6_NOP1 GENERIC_NOP1 +#define P6_NOP2 ".byte 0x66,0x90\n" +#define P6_NOP3 ".byte 0x0f,0x1f,0x00\n" +#define P6_NOP4 ".byte 0x0f,0x1f,0x40,0\n" +#define P6_NOP5 ".byte 0x0f,0x1f,0x44,0x00,0\n" +#define P6_NOP6 ".byte 0x66,0x0f,0x1f,0x44,0x00,0\n" +#define P6_NOP7 ".byte 0x0f,0x1f,0x80,0,0,0,0\n" +#define P6_NOP8 ".byte 0x0f,0x1f,0x84,0x00,0,0,0,0\n" + #ifdef CONFIG_MK8 #define ASM_NOP1 K8_NOP1 #define ASM_NOP2 K8_NOP2 @@ -657,6 +677,17 @@ #define ASM_NOP6 K7_NOP6 #define ASM_NOP7 K7_NOP7 #define ASM_NOP8 K7_NOP8 +#elif defined(CONFIG_M686) || defined(CONFIG_MPENTIUMII) || \ + defined(CONFIG_MPENTIUMIII) || defined(CONFIG_MPENTIUMM) || \ + defined(CONFIG_MCORE2) || defined(CONFIG_MPENTIUM4) +#define ASM_NOP1 P6_NOP1 +#define ASM_NOP2 P6_NOP2 +#define ASM_NOP3 P6_NOP3 +#define ASM_NOP4 P6_NOP4 +#define ASM_NOP5 P6_NOP5 +#define ASM_NOP6 P6_NOP6 +#define ASM_NOP7 P6_NOP7 +#define ASM_NOP8 P6_NOP8 #else #define ASM_NOP1 GENERIC_NOP1 #define ASM_NOP2 GENERIC_NOP2 --- a/include/asm-x86/mach-xen/asm/processor_64.h +++ b/include/asm-x86/mach-xen/asm/processor_64.h @@ -74,6 +74,7 @@ __u8 booted_cores; /* number of cores as seen by OS */ __u8 phys_proc_id; /* Physical Processor id. */ __u8 cpu_core_id; /* Core id.
*/ + __u8 cpu_index; /* index into per_cpu list */ #endif } ____cacheline_aligned; @@ -88,11 +89,12 @@ #define X86_VENDOR_UNKNOWN 0xff #ifdef CONFIG_SMP -extern struct cpuinfo_x86 cpu_data[]; -#define current_cpu_data cpu_data[smp_processor_id()] +DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info); +#define cpu_data(cpu) per_cpu(cpu_info, cpu) +#define current_cpu_data cpu_data(smp_processor_id()) #else -#define cpu_data (&boot_cpu_data) -#define current_cpu_data boot_cpu_data +#define cpu_data(cpu) boot_cpu_data +#define current_cpu_data boot_cpu_data #endif extern char ignore_irq13; @@ -343,6 +345,16 @@ }; +#if defined(CONFIG_MPSC) || defined(CONFIG_MCORE2) +#define ASM_NOP1 P6_NOP1 +#define ASM_NOP2 P6_NOP2 +#define ASM_NOP3 P6_NOP3 +#define ASM_NOP4 P6_NOP4 +#define ASM_NOP5 P6_NOP5 +#define ASM_NOP6 P6_NOP6 +#define ASM_NOP7 P6_NOP7 +#define ASM_NOP8 P6_NOP8 +#else #define ASM_NOP1 K8_NOP1 #define ASM_NOP2 K8_NOP2 #define ASM_NOP3 K8_NOP3 @@ -351,6 +363,7 @@ #define ASM_NOP6 K8_NOP6 #define ASM_NOP7 K8_NOP7 #define ASM_NOP8 K8_NOP8 +#endif /* Opteron nops */ #define K8_NOP1 ".byte 0x90\n" @@ -362,6 +375,17 @@ #define K8_NOP7 K8_NOP4 K8_NOP3 #define K8_NOP8 K8_NOP4 K8_NOP4 +/* P6 nops */ +/* uses eax dependencies (Intel-recommended choice) */ +#define P6_NOP1 ".byte 0x90\n" +#define P6_NOP2 ".byte 0x66,0x90\n" +#define P6_NOP3 ".byte 0x0f,0x1f,0x00\n" +#define P6_NOP4 ".byte 0x0f,0x1f,0x40,0\n" +#define P6_NOP5 ".byte 0x0f,0x1f,0x44,0x00,0\n" +#define P6_NOP6 ".byte 0x66,0x0f,0x1f,0x44,0x00,0\n" +#define P6_NOP7 ".byte 0x0f,0x1f,0x80,0,0,0,0\n" +#define P6_NOP8 ".byte 0x0f,0x1f,0x84,0x00,0,0,0,0\n" + #define ASM_NOP_MAX 8 /* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ @@ -377,12 +401,6 @@ asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory"); } -#define ARCH_HAS_PREFETCH -static inline void prefetch(void *x) -{ - asm volatile("prefetcht0 (%0)" :: "r" (x)); -} - #define ARCH_HAS_PREFETCHW 1 static inline void prefetchw(void *x) { @@ -398,11 +416,6 @@ #define cpu_relax() rep_nop() -static inline void serialize_cpu(void) -{ - __asm__ __volatile__ ("cpuid" : : : "ax", "bx", "cx", "dx"); -} - static inline void __monitor(const void *eax, unsigned long ecx, unsigned long edx) { --- /dev/null +++ b/include/asm-x86/mach-xen/asm/scatterlist.h @@ -0,0 +1 @@ +#include "../../scatterlist_64.h" --- a/include/asm-x86/mach-xen/asm/scatterlist_32.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef _I386_SCATTERLIST_H -#define _I386_SCATTERLIST_H - -#include - -struct scatterlist { - struct page *page; - unsigned int offset; - unsigned int length; - dma_addr_t dma_address; - unsigned int dma_length; -}; - -/* These macros should be used after a pci_map_sg call has been done - * to get bus addresses of each of the SG entries and their lengths. - * You should only work with the number of sg entries pci_map_sg - * returns. 
- */ -#define sg_dma_address(sg) ((sg)->dma_address) -#define sg_dma_len(sg) ((sg)->dma_length) - -#define ISA_DMA_THRESHOLD (0x00ffffff) - -#endif /* !(_I386_SCATTERLIST_H) */ --- /dev/null +++ b/include/asm-x86/mach-xen/asm/segment.h @@ -0,0 +1,5 @@ +#ifdef CONFIG_X86_32 +# include "segment_32.h" +#else +# include "../../segment_64.h" +#endif --- /dev/null +++ b/include/asm-x86/mach-xen/asm/smp.h @@ -0,0 +1,5 @@ +#ifdef CONFIG_X86_32 +# include "smp_32.h" +#else +# include "smp_64.h" +#endif --- a/include/asm-x86/mach-xen/asm/smp_32.h +++ b/include/asm-x86/mach-xen/asm/smp_32.h @@ -11,7 +11,7 @@ #endif #if defined(CONFIG_X86_LOCAL_APIC) && !defined(__ASSEMBLY__) -#include +#include #include #include #ifdef CONFIG_X86_IO_APIC @@ -30,8 +30,8 @@ extern void smp_alloc_memory(void); extern int pic_mode; extern int smp_num_siblings; -extern cpumask_t cpu_sibling_map[]; -extern cpumask_t cpu_core_map[]; +DECLARE_PER_CPU(cpumask_t, cpu_sibling_map); +DECLARE_PER_CPU(cpumask_t, cpu_core_map); extern void (*mtrr_hook) (void); extern void zap_low_mappings (void); @@ -39,9 +39,11 @@ extern void unlock_ipi_call_lock(void); #define MAX_APICID 256 -extern u8 x86_cpu_to_apicid[]; +extern u8 __initdata x86_cpu_to_apicid_init[]; +extern void *x86_cpu_to_apicid_ptr; +DECLARE_PER_CPU(u8, x86_cpu_to_apicid); -#define cpu_physical_id(cpu) x86_cpu_to_apicid[cpu] +#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu) #ifdef CONFIG_HOTPLUG_CPU extern void cpu_exit_clear(void); --- a/include/asm-x86/mach-xen/asm/smp_64.h +++ b/include/asm-x86/mach-xen/asm/smp_64.h @@ -40,10 +40,19 @@ extern void unlock_ipi_call_lock(void); extern int smp_num_siblings; extern void smp_send_reschedule(int cpu); +extern int smp_call_function_mask(cpumask_t mask, void (*func)(void *), + void *info, int wait); -extern cpumask_t cpu_sibling_map[NR_CPUS]; -extern cpumask_t cpu_core_map[NR_CPUS]; -extern u8 cpu_llc_id[NR_CPUS]; +/* + * cpu_sibling_map and cpu_core_map now live + * in the per cpu area + * + * extern cpumask_t cpu_sibling_map[NR_CPUS]; + * extern cpumask_t cpu_core_map[NR_CPUS]; + */ +DECLARE_PER_CPU(cpumask_t, cpu_sibling_map); +DECLARE_PER_CPU(cpumask_t, cpu_core_map); +DECLARE_PER_CPU(u8, cpu_llc_id); #define SMP_TRAMPOLINE_BASE 0x6000 @@ -70,6 +79,8 @@ #endif /* CONFIG_SMP */ +#define safe_smp_processor_id() smp_processor_id() + #ifdef CONFIG_X86_LOCAL_APIC static inline int hard_smp_processor_id(void) { @@ -82,8 +93,9 @@ * Some lowlevel functions might want to know about * the real APIC ID <-> CPU # mapping. 
*/ -extern u8 x86_cpu_to_apicid[NR_CPUS]; /* physical ID */ -extern u8 x86_cpu_to_log_apicid[NR_CPUS]; +extern u8 __initdata x86_cpu_to_apicid_init[]; +extern void *x86_cpu_to_apicid_ptr; +DECLARE_PER_CPU(u8, x86_cpu_to_apicid); /* physical ID */ extern u8 bios_cpu_apicid[]; #ifdef CONFIG_X86_LOCAL_APIC @@ -118,8 +130,9 @@ #endif #ifdef CONFIG_SMP -#define cpu_physical_id(cpu) x86_cpu_to_apicid[cpu] +#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu) #else +extern unsigned int boot_cpu_id; #define cpu_physical_id(cpu) boot_cpu_id #endif /* !CONFIG_SMP */ #endif --- /dev/null +++ b/include/asm-x86/mach-xen/asm/swiotlb.h @@ -0,0 +1,5 @@ +#ifdef CONFIG_X86_32 +# include "swiotlb_32.h" +#else +# include "../../swiotlb.h" +#endif --- /dev/null +++ b/include/asm-x86/mach-xen/asm/system.h @@ -0,0 +1,5 @@ +#ifdef CONFIG_X86_32 +# include "system_32.h" +#else +# include "system_64.h" +#endif --- a/include/asm-x86/mach-xen/asm/system_32.h +++ b/include/asm-x86/mach-xen/asm/system_32.h @@ -9,6 +9,7 @@ #include #ifdef __KERNEL__ +#define AT_VECTOR_SIZE_ARCH 2 /* entries in ARCH_DLINFO */ struct task_struct; /* one of the stranger aspects of C forward declarations.. */ extern struct task_struct * FASTCALL(__switch_to(struct task_struct *prev, struct task_struct *next)); @@ -138,7 +139,7 @@ { unsigned long val; /* This could fault if %cr4 does not exist */ - asm("1: movl %%cr4, %0 \n" + asm volatile("1: movl %%cr4, %0 \n" "2: \n" ".section __ex_table,\"a\" \n" ".long 1b,2b \n" @@ -157,6 +158,11 @@ asm volatile("wbinvd": : :"memory"); } +static inline void clflush(volatile void *__p) +{ + asm volatile("clflush %0" : "+m" (*(char __force *)__p)); +} + #define read_cr0() (xen_read_cr0()) #define write_cr0(x) (xen_write_cr0(x)) #define read_cr2() (xen_read_cr2()) @@ -207,6 +213,7 @@ #define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2) #define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2) +#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM) /** * read_barrier_depends - Flush all pending reads that subsequents reads @@ -262,18 +269,18 @@ #define read_barrier_depends() do { } while(0) +#ifdef CONFIG_SMP +#define smp_mb() mb() +#ifdef CONFIG_X86_PPRO_FENCE +# define smp_rmb() rmb() +#else +# define smp_rmb() barrier() +#endif #ifdef CONFIG_X86_OOSTORE -/* Actually there are no OOO store capable CPUs for now that do SSE, - but make it already an possibility. 
--- a/include/asm-x86/mach-xen/asm/system_64.h
+++ b/include/asm-x86/mach-xen/asm/system_64.h
@@ -11,8 +11,12 @@
 
 #ifdef __KERNEL__
 
-#define __STR(x) #x
-#define STR(x) __STR(x)
+/* entries in ARCH_DLINFO: */
+#ifdef CONFIG_IA32_EMULATION
+# define AT_VECTOR_SIZE_ARCH 2
+#else
+# define AT_VECTOR_SIZE_ARCH 1
+#endif
 
 #define __SAVE(reg,offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t"
 #define __RESTORE(reg,offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"
@@ -92,7 +96,7 @@
 
 #define read_cr3() ({ \
 	unsigned long __dummy; \
-	asm("movq %%cr3,%0" : "=r" (__dummy)); \
+	asm volatile("movq %%cr3,%0" : "=r" (__dummy)); \
 	machine_to_phys(__dummy); \
 })
 
@@ -105,7 +109,7 @@
 static inline unsigned long read_cr4(void)
 {
 	unsigned long cr4;
-	asm("movq %%cr4,%0" : "=r" (cr4));
+	asm volatile("movq %%cr4,%0" : "=r" (cr4));
 	return cr4;
 }
 
@@ -131,12 +135,17 @@
 
 #endif /* __KERNEL__ */
 
+static inline void clflush(volatile void *__p)
+{
+	asm volatile("clflush %0" : "+m" (*(char __force *)__p));
+}
+
 #define nop() __asm__ __volatile__ ("nop")
 
 #ifdef CONFIG_SMP
 #define smp_mb()	mb()
-#define smp_rmb()	rmb()
-#define smp_wmb()	wmb()
+#define smp_rmb()	barrier()
+#define smp_wmb()	barrier()
 #define smp_read_barrier_depends()	do {} while(0)
 #else
 #define smp_mb()	barrier()
@@ -153,12 +162,8 @@
  */
 #define mb()	asm volatile("mfence":::"memory")
 #define rmb()	asm volatile("lfence":::"memory")
-
-#ifdef CONFIG_UNORDERED_IO
 #define wmb()	asm volatile("sfence" ::: "memory")
-#else
-#define wmb()	asm volatile("" ::: "memory")
-#endif
+
 #define read_barrier_depends()	do {} while(0)
 #define set_mb(var, value) do { (void) xchg(&var, value); } while (0)
 
--- /dev/null
+++ b/include/asm-x86/mach-xen/asm/tlbflush.h
@@ -0,0 +1,5 @@
+#ifdef CONFIG_X86_32
+# include "tlbflush_32.h"
+#else
+# include "tlbflush_64.h"
+#endif
--- a/include/asm-x86/mach-xen/asm/tlbflush_32.h
+++ b/include/asm-x86/mach-xen/asm/tlbflush_32.h
@@ -23,7 +23,6 @@
  * - flush_tlb_page(vma, vmaddr) flushes one page
  * - flush_tlb_range(vma, start, end) flushes a range of pages
  * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
- * - flush_tlb_pgtables(mm, start, end) flushes a range of page tables
  *
  * ..but the i386 has somewhat limited tlb flushing capabilities,
  * and page-granular flushes are available only on i486 and up.
@@ -97,10 +96,4 @@
 	flush_tlb_all();
 }
 
-static inline void flush_tlb_pgtables(struct mm_struct *mm,
-				      unsigned long start, unsigned long end)
-{
-	/* i386 does not keep any page table caches in TLB */
-}
-
 #endif /* _I386_TLBFLUSH_H */
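Note: on x86-64 the system_64.h hunk relaxes smp_rmb() and smp_wmb() to plain compiler barriers, since the CPU already keeps loads ordered with loads and stores with stores, and wmb() becomes an unconditional sfence now that CONFIG_UNORDERED_IO is gone. What the smp_* pairs must still guarantee is the usual publish/consume ordering; a minimal sketch, with all variable and function names invented for illustration:

    #include <asm/system.h>	/* smp_wmb(), smp_rmb() */

    static int payload;
    static int ready;

    static void publish(void)
    {
        payload = 42;
        smp_wmb();	/* payload must be visible before ready */
        ready = 1;
    }

    static int consume(void)
    {
        if (!ready)
            return -1;
        smp_rmb();	/* read ready before reading payload */
        return payload;
    }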
--- a/include/asm-x86/mach-xen/asm/tlbflush_64.h
+++ b/include/asm-x86/mach-xen/asm/tlbflush_64.h
@@ -28,7 +28,6 @@
 * - flush_tlb_page(vma, vmaddr) flushes one page
 * - flush_tlb_range(vma, start, end) flushes a range of pages
 * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
- * - flush_tlb_pgtables(mm, start, end) flushes a range of page tables
 *
 * x86-64 can only flush individual pages or full VMs. For a range flush
 * we always do the full VM. Might be worth trying if for a small
@@ -95,12 +94,4 @@
 	flush_tlb_all();
 }
 
-static inline void flush_tlb_pgtables(struct mm_struct *mm,
-				      unsigned long start, unsigned long end)
-{
-	/* x86_64 does not keep any page table caches in a software TLB.
-	   The CPUs do in their hardware TLBs, but they are handled
-	   by the normal TLB flushing algorithms. */
-}
-
 #endif /* _X8664_TLBFLUSH_H */
--- /dev/null
+++ b/include/asm-x86/mach-xen/asm/xor.h
@@ -0,0 +1,5 @@
+#ifdef CONFIG_X86_32
+# include "../../xor_32.h"
+#else
+# include "xor_64.h"
+#endif
--- a/include/asm-x86/mach-xen/mach_time.h
+++ b/include/asm-x86/mach-xen/mach_time.h
@@ -1,111 +1,2 @@
-/*
- * include/asm-i386/mach-default/mach_time.h
- *
- * Machine specific set RTC function for generic.
- * Split out from time.c by Osamu Tomita <tomita@cinet.co.jp>
- */
-#ifndef _MACH_TIME_H
-#define _MACH_TIME_H
-
-#include <linux/mc146818rtc.h>
-
-/* for check timing call set_rtc_mmss() 500ms */
-/* used in arch/i386/time.c::do_timer_interrupt() */
-#define USEC_AFTER	500000
-#define USEC_BEFORE	500000
-
-/*
- * In order to set the CMOS clock precisely, set_rtc_mmss has to be
- * called 500 ms after the second nowtime has started, because when
- * nowtime is written into the registers of the CMOS clock, it will
- * jump to the next second precisely 500 ms later. Check the Motorola
- * MC146818A or Dallas DS12887 data sheet for details.
- *
- * BUG: This routine does not handle hour overflow properly; it just
- *      sets the minutes. Usually you'll only notice that after reboot!
- */
-static inline int mach_set_rtc_mmss(unsigned long nowtime)
-{
-	int retval = 0;
-	int real_seconds, real_minutes, cmos_minutes;
-	unsigned char save_control, save_freq_select;
-
-	save_control = CMOS_READ(RTC_CONTROL); /* tell the clock it's being set */
-	CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL);
-
-	save_freq_select = CMOS_READ(RTC_FREQ_SELECT); /* stop and reset prescaler */
-	CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT);
-
-	cmos_minutes = CMOS_READ(RTC_MINUTES);
-	if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
-		BCD_TO_BIN(cmos_minutes);
-
-	/*
-	 * since we're only adjusting minutes and seconds,
-	 * don't interfere with hour overflow. This avoids
-	 * messing with unknown time zones but requires your
-	 * RTC not to be off by more than 15 minutes
-	 */
-	real_seconds = nowtime % 60;
-	real_minutes = nowtime / 60;
-	if (((abs(real_minutes - cmos_minutes) + 15)/30) & 1)
-		real_minutes += 30;		/* correct for half hour time zone */
-	real_minutes %= 60;
-
-	if (abs(real_minutes - cmos_minutes) < 30) {
-		if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
-			BIN_TO_BCD(real_seconds);
-			BIN_TO_BCD(real_minutes);
-		}
-		CMOS_WRITE(real_seconds,RTC_SECONDS);
-		CMOS_WRITE(real_minutes,RTC_MINUTES);
-	} else {
-		printk(KERN_WARNING
-		       "set_rtc_mmss: can't update from %d to %d\n",
-		       cmos_minutes, real_minutes);
-		retval = -1;
-	}
-
-	/* The following flags have to be released exactly in this order,
-	 * otherwise the DS12887 (popular MC146818A clone with integrated
-	 * battery and quartz) will not reset the oscillator and will not
-	 * update precisely 500 ms later. You won't find this mentioned in
-	 * the Dallas Semiconductor data sheets, but who believes data
-	 * sheets anyway ...                           -- Markus Kuhn
-	 */
-	CMOS_WRITE(save_control, RTC_CONTROL);
-	CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
-
-	return retval;
-}
-
-static inline unsigned long mach_get_cmos_time(void)
-{
-	unsigned int year, mon, day, hour, min, sec;
-
-	do {
-		sec = CMOS_READ(RTC_SECONDS);
-		min = CMOS_READ(RTC_MINUTES);
-		hour = CMOS_READ(RTC_HOURS);
-		day = CMOS_READ(RTC_DAY_OF_MONTH);
-		mon = CMOS_READ(RTC_MONTH);
-		year = CMOS_READ(RTC_YEAR);
-	} while (sec != CMOS_READ(RTC_SECONDS));
-
-	if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
-		BCD_TO_BIN(sec);
-		BCD_TO_BIN(min);
-		BCD_TO_BIN(hour);
-		BCD_TO_BIN(day);
-		BCD_TO_BIN(mon);
-		BCD_TO_BIN(year);
-	}
-
-	year += 1900;
-	if (year < 1970)
-		year += 100;
-
-	return mktime(year, mon, day, hour, min, sec);
-}
-
-#endif /* !_MACH_TIME_H */
+#include "../mc146818rtc_32.h"
+#include "../mach-default/mach_time.h"
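Note: the mach_time.h body deleted above duplicates the mach-default version, so the file collapses to a pair of includes. The only subtle logic in it is the half-hour time-zone correction in mach_set_rtc_mmss(): the CMOS/system minute delta is rounded to the nearest half hour, and an odd result shifts the write by 30 minutes. A standalone worked example of just that arithmetic, with invented sample values:

    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
        /* System clock says minute 10; the RTC, running on a zone
         * offset by half an hour, says minute 50. */
        int real_minutes = 10, cmos_minutes = 50;

        /* |10-50| = 40; (40+15)/30 = 1, odd -> apply the 30 min shift */
        if (((abs(real_minutes - cmos_minutes) + 15) / 30) & 1)
            real_minutes += 30;
        real_minutes %= 60;

        /* now |40-50| = 10 < 30, so the RTC update would be accepted */
        printf("write minute %d (CMOS had %d)\n", real_minutes, cmos_minutes);
        return 0;
    }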
--- a/include/asm-x86/mach-xen/mach_timer.h
+++ b/include/asm-x86/mach-xen/mach_timer.h
@@ -1,50 +1 @@
-/*
- * include/asm-i386/mach-default/mach_timer.h
- *
- * Machine specific calibrate_tsc() for generic.
- * Split out from timer_tsc.c by Osamu Tomita <tomita@cinet.co.jp>
- */
-/* ------ Calibrate the TSC -------
- * Return 2^32 * (1 / (TSC clocks per usec)) for do_fast_gettimeoffset().
- * Too much 64-bit arithmetic here to do this cleanly in C, and for
- * accuracy's sake we want to keep the overhead on the CTC speaker (channel 2)
- * output busy loop as low as possible. We avoid reading the CTC registers
- * directly because of the awkward 8-bit access mechanism of the 82C54
- * device.
- */
-#ifndef _MACH_TIMER_H
-#define _MACH_TIMER_H
-
-#define CALIBRATE_TIME_MSEC 30 /* 30 msecs */
-#define CALIBRATE_LATCH	\
-	((CLOCK_TICK_RATE * CALIBRATE_TIME_MSEC + 1000/2)/1000)
-
-static inline void mach_prepare_counter(void)
-{
-	/* Set the Gate high, disable speaker */
-	outb((inb(0x61) & ~0x02) | 0x01, 0x61);
-
-	/*
-	 * Now let's take care of CTC channel 2
-	 *
-	 * Set the Gate high, program CTC channel 2 for mode 0,
-	 * (interrupt on terminal count mode), binary count,
-	 * load 5 * LATCH count, (LSB and MSB) to begin countdown.
-	 *
-	 * Some devices need a delay here.
-	 */
-	outb(0xb0, 0x43);			/* binary, mode 0, LSB/MSB, Ch 2 */
-	outb_p(CALIBRATE_LATCH & 0xff, 0x42);	/* LSB of count */
-	outb_p(CALIBRATE_LATCH >> 8, 0x42);	/* MSB of count */
-}
-
-static inline void mach_countup(unsigned long *count_p)
-{
-	unsigned long count = 0;
-	do {
-		count++;
-	} while ((inb_p(0x61) & 0x20) == 0);
-	*count_p = count;
-}
-
-#endif /* !_MACH_TIMER_H */
+#include "../mach-default/mach_timer.h"
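Note: mach_timer.h likewise collapses to the mach-default include. The deleted helpers program PIT channel 2 in mode 0 with a CALIBRATE_LATCH countdown (about 30 ms at the 1193182 Hz PIT input clock) while the TSC free-runs; dividing the TSC delta by the window length gives the TSC rate. A standalone sketch of the arithmetic only, where the TSC delta is an invented sample value:

    #include <stdio.h>

    #define CLOCK_TICK_RATE		1193182UL	/* PIT input clock, Hz */
    #define CALIBRATE_TIME_MSEC	30
    #define CALIBRATE_LATCH \
        ((CLOCK_TICK_RATE * CALIBRATE_TIME_MSEC + 1000/2) / 1000)

    int main(void)
    {
        /* Suppose the TSC advanced this much during the countdown. */
        unsigned long long tsc_delta = 72000000ULL;

        printf("PIT latch: %lu ticks\n", (unsigned long)CALIBRATE_LATCH);
        printf("TSC rate:  %llu cycles/ms (~%.1f GHz)\n",
               tsc_delta / CALIBRATE_TIME_MSEC,
               tsc_delta / CALIBRATE_TIME_MSEC / 1e6);
        return 0;
    }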
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -69,6 +69,7 @@
 	CTL_BUS=8,		/* Busses */
 	CTL_ABI=9,		/* Binary emulation */
 	CTL_CPU=10,		/* CPU stuff (speed scaling, etc) */
+	CTL_XEN=123,		/* Xen info and control */
 	CTL_ARLAN=254,		/* arlan wireless driver */
 	CTL_S390DBF=5677,	/* s390 debug */
 	CTL_SUNRPC=7249,	/* sunrpc debug */
--- a/include/xen/pcifront.h
+++ b/include/xen/pcifront.h
@@ -12,13 +12,11 @@
 
 #ifndef __ia64__
 
+#include <asm/pci.h>
+
 struct pcifront_device;
 struct pci_bus;
-
-struct pcifront_sd {
-	int domain;
-	struct pcifront_device *pdev;
-};
+#define pcifront_sd pci_sysdata
 
 static inline struct pcifront_device *
 pcifront_get_pdev(struct pcifront_sd *sd)
@@ -34,18 +32,6 @@
 	sd->pdev = pdev;
 }
 
-#if defined(CONFIG_PCI_DOMAINS)
-static inline int pci_domain_nr(struct pci_bus *bus)
-{
-	struct pcifront_sd *sd = bus->sysdata;
-	return sd->domain;
-}
-static inline int pci_proc_domain(struct pci_bus *bus)
-{
-	return pci_domain_nr(bus);
-}
-#endif /* CONFIG_PCI_DOMAINS */
-
 static inline void pcifront_setup_root_resources(struct pci_bus *bus,
 						 struct pcifront_sd *sd)
 {
--- /dev/null
+++ b/include/xen/sysctl.h
@@ -0,0 +1,11 @@
+#ifndef _XEN_SYSCTL_H
+#define _XEN_SYSCTL_H
+
+/* CTL_XEN names: */
+enum
+{
+	CTL_XEN_INDEPENDENT_WALLCLOCK=1,
+	CTL_XEN_PERMITTED_CLOCK_JITTER=2,
+};
+
+#endif /* _XEN_SYSCTL_H */
--- a/include/xen/xenbus.h
+++ b/include/xen/xenbus.h
@@ -107,7 +107,7 @@
 	int (*suspend)(struct xenbus_device *dev);
 	int (*suspend_cancel)(struct xenbus_device *dev);
 	int (*resume)(struct xenbus_device *dev);
-	int (*uevent)(struct xenbus_device *, char **, int, char *, int);
+	int (*uevent)(struct xenbus_device *, struct kobj_uevent_env *);
 	struct device_driver driver;
 	int (*read_otherend_details)(struct xenbus_device *dev);
 	int (*is_ready)(struct xenbus_device *dev);
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1190,6 +1190,7 @@
 
 module_init(crash_notes_memory_init)
 
+#ifndef CONFIG_XEN
 /*
  * parsing the "crashkernel" commandline
  *
@@ -1352,7 +1353,7 @@
 
 	return 0;
 }
-
+#endif
 
 void crash_save_vmcoreinfo(void)
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -4,6 +4,7 @@
 #include
 #include
 #include
+#include <xen/sysctl.h>
 
 struct trans_ctl_table {
 	int ctl_name;
@@ -895,6 +896,14 @@
 	{}
 };
 
+#ifdef CONFIG_XEN
+static struct trans_ctl_table trans_xen_table[] = {
+	{ CTL_XEN_INDEPENDENT_WALLCLOCK,	"independent_wallclock" },
+	{ CTL_XEN_PERMITTED_CLOCK_JITTER,	"permitted_clock_jitter" },
+	{}
+};
+#endif
+
 static const struct trans_ctl_table trans_arlan_conf_table0[] = {
 	{ 1,	"spreadingCode" },
 	{ 2,	"channelNumber" },
@@ -1230,6 +1239,9 @@
 	{ CTL_BUS,	"bus",		trans_bus_table },
 	{ CTL_ABI,	"abi" },
 	/* CTL_CPU not used */
+#ifdef CONFIG_XEN
+	{ CTL_XEN,	"xen",		trans_xen_table },
+#endif
 	{ CTL_ARLAN,	"arlan",	trans_arlan_table },
 	{ CTL_S390DBF,	"s390dbf",	trans_s390dbf_table },
 	{ CTL_SUNRPC,	"sunrpc",	trans_sunrpc_table },
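Note: the xenbus.h hunk above adopts the 2.6.24 uevent interface: instead of formatting into a caller-supplied envp/buffer quadruple, callbacks append variables to a struct kobj_uevent_env via add_uevent_var(), which returns -ENOMEM when the env fills up. A minimal sketch of a converted xenbus callback; the function name and the exact variable emitted are illustrative, though devicetype is a real xenbus_device field:

    #include <linux/kobject.h>
    #include <linux/errno.h>
    #include <xen/xenbus.h>

    static int sketch_uevent(struct xenbus_device *xdev,
                             struct kobj_uevent_env *env)
    {
        /* each add_uevent_var() replaces one snprintf into the old
         * fixed-size buffer */
        if (add_uevent_var(env, "XENBUS_TYPE=%s", xdev->devicetype))
            return -ENOMEM;
        return 0;
    }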
--- a/lib/swiotlb-xen.c
+++ b/lib/swiotlb-xen.c
@@ -27,7 +27,7 @@
 #include
 #include
 #include
-#include
+#include <linux/scatterlist.h>
 
 int swiotlb;
 EXPORT_SYMBOL(swiotlb);
@@ -580,9 +580,10 @@
  * same here.
  */
 int
-swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg, int nelems,
+swiotlb_map_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
 	       int dir)
 {
+	struct scatterlist *sg;
 	struct phys_addr buffer;
 	dma_addr_t dev_addr;
 	char *map;
@@ -590,22 +591,22 @@
 
 	BUG_ON(dir == DMA_NONE);
 
-	for (i = 0; i < nelems; i++, sg++) {
-		dev_addr = gnttab_dma_map_page(sg->page) + sg->offset;
+	for_each_sg(sgl, sg, nelems, i) {
+		dev_addr = gnttab_dma_map_page(sg_page(sg)) + sg->offset;
 
-		if (range_straddles_page_boundary(page_to_pseudophys(sg->page)
+		if (range_straddles_page_boundary(page_to_pseudophys(sg_page(sg))
 						  + sg->offset, sg->length)
 		    || address_needs_mapping(hwdev, dev_addr)) {
 			gnttab_dma_unmap_page(dev_addr);
-			buffer.page   = sg->page;
+			buffer.page   = sg_page(sg);
 			buffer.offset = sg->offset;
 			map = map_single(hwdev, buffer, sg->length, dir);
 			if (!map) {
 				/* Don't panic here, we expect map_sg users
 				   to do proper error handling. */
 				swiotlb_full(hwdev, sg->length, dir, 0);
-				swiotlb_unmap_sg(hwdev, sg - i, i, dir);
-				sg[0].dma_length = 0;
+				swiotlb_unmap_sg(hwdev, sgl, i, dir);
+				sgl[0].dma_length = 0;
 				return 0;
 			}
 			sg->dma_address = virt_to_bus(map);
@@ -621,19 +622,21 @@
 * concerning calls here are the same as for swiotlb_unmap_single() above.
 */
 void
-swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nelems,
+swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
 		 int dir)
 {
+	struct scatterlist *sg;
 	int i;
 
 	BUG_ON(dir == DMA_NONE);
 
-	for (i = 0; i < nelems; i++, sg++)
+	for_each_sg(sgl, sg, nelems, i) {
 		if (in_swiotlb_aperture(sg->dma_address))
 			unmap_single(hwdev, bus_to_virt(sg->dma_address),
 				     sg->dma_length, dir);
 		else
 			gnttab_dma_unmap_page(sg->dma_address);
+	}
 }
 
 /*
@@ -644,31 +647,35 @@
 * and usage.
 */
 void
-swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
+swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sgl,
 			int nelems, int dir)
 {
+	struct scatterlist *sg;
 	int i;
 
 	BUG_ON(dir == DMA_NONE);
 
-	for (i = 0; i < nelems; i++, sg++)
+	for_each_sg(sgl, sg, nelems, i) {
 		if (in_swiotlb_aperture(sg->dma_address))
 			sync_single(hwdev, bus_to_virt(sg->dma_address),
 				    sg->dma_length, dir);
+	}
 }
 
 void
-swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
+swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sgl,
 			   int nelems, int dir)
 {
+	struct scatterlist *sg;
 	int i;
 
 	BUG_ON(dir == DMA_NONE);
 
-	for (i = 0; i < nelems; i++, sg++)
+	for_each_sg(sgl, sg, nelems, i) {
 		if (in_swiotlb_aperture(sg->dma_address))
 			sync_single(hwdev, bus_to_virt(sg->dma_address),
 				    sg->dma_length, dir);
+	}
 }
 
 #ifdef CONFIG_HIGHMEM
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -408,7 +408,7 @@
 		return NULL;
 #endif
 
-#ifdef CONFIG_DEBUG_VM
+#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_XEN)
 	/*
 	 * Add some anal sanity checks for now. Eventually,
 	 * we should just do "return pfn_to_page(pfn)", but
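Note: the swiotlb-xen.c hunks above are this tree's share of the 2.6.24 scatterlist-chaining conversion: open-coded sg++ walks become for_each_sg(), which follows chain links, and page-pointer access goes through sg_page(). The general shape of the conversion, as a sketch where map_one() is a hypothetical stand-in for the per-entry work:

    #include <linux/scatterlist.h>

    static void map_one(struct page *pg, unsigned int off, unsigned int len);

    /* Before: correct only for a flat array of scatterlist entries. */
    static void walk_flat(struct scatterlist *sg, int nelems)
    {
        int i;

        for (i = 0; i < nelems; i++, sg++)
            map_one(sg_page(sg), sg->offset, sg->length);
    }

    /* After: for_each_sg() steps through chained lists transparently. */
    static void walk_chained(struct scatterlist *sgl, int nelems)
    {
        struct scatterlist *sg;
        int i;

        for_each_sg(sgl, sg, nelems, i)
            map_one(sg_page(sg), sg->offset, sg->length);
    }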