Magellan Linux

Annotation of /trunk/kernel26-xen/patches-2.6.25-r1/1020-2.6.25-xen-patch-2.6.19.patch



Revision 612
Sat May 24 01:03:50 2008 UTC by niro
File size: 318811 bytes
-fixed patch again

1 niro 612 From: www.kernel.org
2     Subject: Linux 2.6.19
3     Patch-mainline: 2.6.19
4    
5     Automatically created from "patches.kernel.org/patch-2.6.19" by xen-port-patches.py
6    
7     Acked-by: jbeulich@novell.com
8    
9     ---
10     arch/x86/Kconfig | 1
11     arch/x86/ia32/ia32entry-xen.S | 9
12     arch/x86/kernel/Makefile | 5
13     arch/x86/kernel/apic_32-xen.c | 9
14     arch/x86/kernel/apic_64-xen.c | 20
15     arch/x86/kernel/cpu/common-xen.c | 20
16     arch/x86/kernel/e820_64-xen.c | 320 +++---
17     arch/x86/kernel/early_printk-xen.c | 20
18     arch/x86/kernel/entry_32-xen.S | 139 +-
19     arch/x86/kernel/entry_64-xen.S | 106 --
20     arch/x86/kernel/genapic_xen_64.c | 9
21     arch/x86/kernel/head64-xen.c | 44
22     arch/x86/kernel/head_32-xen.S | 2
23     arch/x86/kernel/head_64-xen.S | 5
24     arch/x86/kernel/io_apic_32-xen.c | 750 +++++++++------
25     arch/x86/kernel/io_apic_64-xen.c | 1250 +++++++++++---------------
26     arch/x86/kernel/ioport_64-xen.c | 1
27     arch/x86/kernel/irq_32-xen.c | 19
28     arch/x86/kernel/irq_64-xen.c | 35
29     arch/x86/kernel/ldt_32-xen.c | 2
30     arch/x86/kernel/microcode-xen.c | 85 +
31     arch/x86/kernel/mpparse_32-xen.c | 70 -
32     arch/x86/kernel/mpparse_64-xen.c | 313 +-----
33     arch/x86/kernel/pci-dma_32-xen.c | 16
34     arch/x86/kernel/pci-swiotlb_64-xen.c | 3
35     arch/x86/kernel/process_32-xen.c | 29
36     arch/x86/kernel/process_64-xen.c | 90 +
37     arch/x86/kernel/setup64-xen.c | 41
38     arch/x86/kernel/setup_32-xen.c | 430 +++-----
39     arch/x86/kernel/setup_64-xen.c | 271 +----
40     arch/x86/kernel/smp_32-xen.c | 75 +
41     arch/x86/kernel/smp_64-xen.c | 35
42     arch/x86/kernel/time_32-xen.c | 86 -
43     arch/x86/kernel/traps_32-xen.c | 238 +++-
44     arch/x86/kernel/traps_64-xen.c | 220 +++-
45     arch/x86/kernel/vsyscall_64-xen.c | 117 ++
46     arch/x86/mach-xen/setup.c | 6
47     arch/x86/mm/fault_32-xen.c | 29
48     arch/x86/mm/fault_64-xen.c | 34
49     arch/x86/mm/highmem_32-xen.c | 31
50     arch/x86/mm/hypervisor.c | 9
51     arch/x86/mm/init_32-xen.c | 89 +
52     arch/x86/mm/init_64-xen.c | 184 +--
53     arch/x86/mm/ioremap_32-xen.c | 10
54     arch/x86/mm/pageattr_64-xen.c | 24
55     arch/x86/mm/pgtable_32-xen.c | 31
56     arch/x86/pci/irq-xen.c | 38
57     drivers/char/tpm/tpm_xen.c | 5
58     drivers/pci/Kconfig | 2
59     drivers/xen/Kconfig | 3
60     drivers/xen/balloon/balloon.c | 2
61     drivers/xen/blkback/blkback.c | 2
62     drivers/xen/blkback/common.h | 2
63     drivers/xen/blkfront/blkfront.c | 4
64     drivers/xen/blktap/blktap.c | 2
65     drivers/xen/blktap/common.h | 2
66     drivers/xen/console/console.c | 10
67     drivers/xen/console/xencons_ring.c | 4
68     drivers/xen/core/evtchn.c | 50 -
69     drivers/xen/core/reboot.c | 3
70     drivers/xen/core/smpboot.c | 6
71     drivers/xen/fbfront/xenfb.c | 3
72     drivers/xen/fbfront/xenkbd.c | 2
73     drivers/xen/gntdev/gntdev.c | 11
74     drivers/xen/netback/accel.c | 2
75     drivers/xen/netback/common.h | 2
76     drivers/xen/netback/loopback.c | 2
77     drivers/xen/netback/netback.c | 6
78     drivers/xen/netfront/netfront.c | 8
79     drivers/xen/pciback/pciback.h | 2
80     drivers/xen/pciback/pciback_ops.c | 2
81     drivers/xen/pcifront/pci_op.c | 8
82     drivers/xen/privcmd/compat_privcmd.c | 1
83     drivers/xen/privcmd/privcmd.c | 2
84     drivers/xen/sfc_netback/accel_xenbus.c | 6
85     drivers/xen/sfc_netfront/accel.h | 6
86     drivers/xen/sfc_netfront/accel_msg.c | 6
87     drivers/xen/sfc_netfront/accel_tso.c | 2
88     drivers/xen/sfc_netfront/accel_vi.c | 4
89     drivers/xen/tpmback/common.h | 2
90     drivers/xen/tpmback/tpmback.c | 4
91     drivers/xen/xenbus/xenbus_comms.c | 2
92     drivers/xen/xenoprof/xenoprofile.c | 2
93     include/asm-generic/pgtable.h | 2
94     include/asm-x86/mach-xen/asm/desc_32.h | 127 +-
95     include/asm-x86/mach-xen/asm/dma-mapping_64.h | 7
96     include/asm-x86/mach-xen/asm/e820_64.h | 15
97     include/asm-x86/mach-xen/asm/fixmap_32.h | 5
98     include/asm-x86/mach-xen/asm/fixmap_64.h | 2
99     include/asm-x86/mach-xen/asm/hw_irq_32.h | 8
100     include/asm-x86/mach-xen/asm/hw_irq_64.h | 10
101     include/asm-x86/mach-xen/asm/io_32.h | 27
102     include/asm-x86/mach-xen/asm/io_64.h | 27
103     include/asm-x86/mach-xen/asm/pgtable-2level.h | 12
104     include/asm-x86/mach-xen/asm/pgtable-3level.h | 14
105     include/asm-x86/mach-xen/asm/pgtable_32.h | 143 +-
106     include/asm-x86/mach-xen/asm/pgtable_64.h | 86 +
107     include/asm-x86/mach-xen/asm/processor_32.h | 62 -
108     include/asm-x86/mach-xen/asm/processor_64.h | 2
109     include/asm-x86/mach-xen/asm/segment_32.h | 19
110     include/asm-x86/mach-xen/asm/smp_32.h | 25
111     include/asm-x86/mach-xen/asm/smp_64.h | 27
112     include/asm-x86/mach-xen/asm/system_32.h | 36
113     include/asm-x86/mach-xen/asm/system_64.h | 1
114     include/asm-x86/mach-xen/asm/tlbflush_32.h | 2
115     include/asm-x86/mach-xen/asm/tlbflush_64.h | 3
116     include/asm-x86/thread_info_64.h | 4
117     include/linux/skbuff.h | 7
118     include/xen/evtchn.h | 10
119     include/xen/xencons.h | 2
120     mm/mprotect.c | 2
121     net/core/dev.c | 8
122     112 files changed, 3102 insertions(+), 3145 deletions(-)
123    
124     --- a/arch/x86/Kconfig
125     +++ b/arch/x86/Kconfig
126     @@ -390,6 +390,7 @@
127    
128     menuconfig PARAVIRT_GUEST
129     bool "Paravirtualized guest support"
130     + depends on !X86_XEN && !X86_64_XEN
131     help
132     Say Y here to get to see options related to running Linux under
133     various hypervisors. This option alone does not add any kernel code.
134     --- a/arch/x86/ia32/ia32entry-xen.S
135     +++ b/arch/x86/ia32/ia32entry-xen.S
136 niro 609 @@ -83,6 +83,7 @@
137     */
138     ENTRY(ia32_sysenter_target)
139     CFI_STARTPROC32 simple
140     + CFI_SIGNAL_FRAME
141     CFI_DEF_CFA rsp,SS+8-RIP+16
142     /*CFI_REL_OFFSET ss,SS-RIP+16*/
143     CFI_REL_OFFSET rsp,RSP-RIP+16
144     @@ -164,6 +165,7 @@
145     */
146     ENTRY(ia32_cstar_target)
147     CFI_STARTPROC32 simple
148     + CFI_SIGNAL_FRAME
149     CFI_DEF_CFA rsp,SS+8-RIP+16
150     /*CFI_REL_OFFSET ss,SS-RIP+16*/
151     CFI_REL_OFFSET rsp,RSP-RIP+16
152     @@ -243,6 +245,7 @@
153    
154     ENTRY(ia32_syscall)
155     CFI_STARTPROC simple
156     + CFI_SIGNAL_FRAME
157     CFI_DEF_CFA rsp,SS+8-RIP+16
158     /*CFI_REL_OFFSET ss,SS-RIP+16*/
159     CFI_REL_OFFSET rsp,RSP-RIP+16
160     @@ -320,6 +323,7 @@
161     popq %r11
162     CFI_ENDPROC
163     CFI_STARTPROC32 simple
164     + CFI_SIGNAL_FRAME
165     CFI_DEF_CFA rsp,SS+8-ARGOFFSET
166     CFI_REL_OFFSET rax,RAX-ARGOFFSET
167     CFI_REL_OFFSET rcx,RCX-ARGOFFSET
168     @@ -653,8 +657,8 @@
169     .quad sys_readlinkat /* 305 */
170     .quad sys_fchmodat
171     .quad sys_faccessat
172     - .quad quiet_ni_syscall /* pselect6 for now */
173     - .quad quiet_ni_syscall /* ppoll for now */
174     + .quad compat_sys_pselect6
175     + .quad compat_sys_ppoll
176     .quad sys_unshare /* 310 */
177     .quad compat_sys_set_robust_list
178     .quad compat_sys_get_robust_list
179     @@ -663,4 +667,5 @@
180     .quad sys_tee
181     .quad compat_sys_vmsplice
182     .quad compat_sys_move_pages
183     + .quad sys_getcpu
184     ia32_syscall_end:
185 niro 612 --- a/arch/x86/kernel/Makefile
186     +++ b/arch/x86/kernel/Makefile
187     @@ -91,7 +91,7 @@
188     ###
189     # 64 bit specific files
190     ifeq ($(CONFIG_X86_64),y)
191     - obj-y += genapic_64.o genapic_flat_64.o
192     + obj-$(CONFIG_X86_LOCAL_APIC) += genapic_64.o genapic_flat_64.o
193     obj-$(CONFIG_X86_XEN_GENAPIC) += genapic_64.o genapic_xen_64.o
194     obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
195     obj-$(CONFIG_AUDIT) += audit_64.o
196     @@ -104,5 +104,6 @@
197     pci-dma_64-$(CONFIG_XEN) += pci-dma_32.o
198     endif
199 niro 609
200 niro 612 -disabled-obj-$(CONFIG_XEN) := i8253.o i8259_$(BITS).o reboot.o smpboot_$(BITS).o tsc_$(BITS).o
201     +disabled-obj-$(CONFIG_XEN) := early-quirks.o i8253.o i8259_$(BITS).o reboot.o \
202     + smpboot_$(BITS).o tsc_$(BITS).o
203     %/head_$(BITS).o %/head_$(BITS).s: $(if $(CONFIG_XEN),EXTRA_AFLAGS,dummy) :=
204     --- a/arch/x86/kernel/apic_32-xen.c
205     +++ b/arch/x86/kernel/apic_32-xen.c
206 niro 609 @@ -54,7 +54,6 @@
207     /*
208     * Knob to control our willingness to enable the local APIC.
209     */
210     -int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */
211    
212     /*
213     * Debug level
214     @@ -102,7 +101,7 @@
215    
216     #ifndef CONFIG_XEN
217     #ifndef CONFIG_SMP
218     -static void up_apic_timer_interrupt_call(struct pt_regs *regs)
219     +static void up_apic_timer_interrupt_call(void)
220     {
221     int cpu = smp_processor_id();
222    
223     @@ -111,11 +110,11 @@
224     */
225     per_cpu(irq_stat, cpu).apic_timer_irqs++;
226    
227     - smp_local_timer_interrupt(regs);
228     + smp_local_timer_interrupt();
229     }
230     #endif
231    
232     -void smp_send_timer_broadcast_ipi(struct pt_regs *regs)
233     +void smp_send_timer_broadcast_ipi(void)
234     {
235     cpumask_t mask;
236    
237     @@ -128,7 +127,7 @@
238     * We can directly call the apic timer interrupt handler
239     * in UP case. Minus all irq related functions
240     */
241     - up_apic_timer_interrupt_call(regs);
242     + up_apic_timer_interrupt_call();
243     #endif
244     }
245     }
246 niro 612 --- a/arch/x86/kernel/apic_64-xen.c
247     +++ b/arch/x86/kernel/apic_64-xen.c
248 niro 609 @@ -43,7 +43,7 @@
249     */
250     void ack_bad_irq(unsigned int irq)
251     {
252     - printk("unexpected IRQ trap at vector %02x\n", irq);
253     + printk("unexpected IRQ trap at irq %02x\n", irq);
254     /*
255     * Currently unexpected vectors happen only on SMP and APIC.
256     * We _must_ ack these because every local APIC has only N
257     @@ -62,19 +62,19 @@
258     return -EINVAL;
259     }
260    
261     -void smp_local_timer_interrupt(struct pt_regs *regs)
262     +void smp_local_timer_interrupt(void)
263     {
264     - profile_tick(CPU_PROFILING, regs);
265     + profile_tick(CPU_PROFILING);
266     #ifndef CONFIG_XEN
267     #ifdef CONFIG_SMP
268     - update_process_times(user_mode(regs));
269     + update_process_times(user_mode(get_irq_regs()));
270     #endif
271     #endif
272     /*
273     * We take the 'long' return path, and there every subsystem
274     * grabs the appropriate locks (kernel lock/ irq lock).
275     *
276     - * we might want to decouple profiling from the 'long path',
277     + * We might want to decouple profiling from the 'long path',
278     * and do the profiling totally in assembly.
279     *
280     * Currently this isn't too much of an issue (performance wise),
281     @@ -92,6 +92,8 @@
282     */
283     void smp_apic_timer_interrupt(struct pt_regs *regs)
284     {
285     + struct pt_regs *old_regs = set_irq_regs(regs);
286     +
287     /*
288     * the NMI deadlock-detector uses this.
289     */
290     @@ -109,8 +111,9 @@
291     */
292     exit_idle();
293     irq_enter();
294     - smp_local_timer_interrupt(regs);
295     + smp_local_timer_interrupt();
296     irq_exit();
297     + set_irq_regs(old_regs);
298     }
299    
300     /*
301     @@ -188,9 +191,8 @@
302     int __init APIC_init_uniprocessor (void)
303     {
304     #ifdef CONFIG_X86_IO_APIC
305     - if (smp_found_config)
306     - if (!skip_ioapic_setup && nr_ioapics)
307     - setup_IO_APIC();
308     + if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
309     + setup_IO_APIC();
310     #endif
311    
312     return 1;
313 niro 612 --- a/arch/x86/kernel/cpu/common-xen.c
314     +++ b/arch/x86/kernel/cpu/common-xen.c
315 niro 609 @@ -43,7 +43,7 @@
316    
317     extern int disable_pse;
318    
319     -static void default_init(struct cpuinfo_x86 * c)
320     +static void __cpuinit default_init(struct cpuinfo_x86 * c)
321     {
322     /* Not much we can do here... */
323     /* Check if at least it has cpuid */
324     @@ -56,7 +56,7 @@
325     }
326     }
327    
328     -static struct cpu_dev default_cpu = {
329     +static struct cpu_dev __cpuinitdata default_cpu = {
330     .c_init = default_init,
331     .c_vendor = "Unknown",
332     };
333     @@ -191,7 +191,16 @@
334    
335     static int __init x86_fxsr_setup(char * s)
336     {
337     + /* Tell all the other CPU's to not use it... */
338     disable_x86_fxsr = 1;
339     +
340     + /*
341     + * ... and clear the bits early in the boot_cpu_data
342     + * so that the bootup process doesn't try to do this
343     + * either.
344     + */
345     + clear_bit(X86_FEATURE_FXSR, boot_cpu_data.x86_capability);
346     + clear_bit(X86_FEATURE_XMM, boot_cpu_data.x86_capability);
347     return 1;
348     }
349     __setup("nofxsr", x86_fxsr_setup);
350     @@ -272,7 +281,7 @@
351     }
352     }
353    
354     -void __cpuinit generic_identify(struct cpuinfo_x86 * c)
355     +static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
356     {
357     u32 tfms, xlvl;
358     int ebx;
359     @@ -698,8 +707,7 @@
360     */
361     atomic_inc(&init_mm.mm_count);
362     current->active_mm = &init_mm;
363     - if (current->mm)
364     - BUG();
365     + BUG_ON(current->mm);
366     enter_lazy_tlb(&init_mm, current);
367    
368     load_esp0(t, thread);
369     @@ -712,7 +720,7 @@
370     #endif
371    
372     /* Clear %fs and %gs. */
373     - asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs");
374     + asm volatile ("movl %0, %%fs; movl %0, %%gs" : : "r" (0));
375    
376     /* Clear all 6 debug registers: */
377     set_debugreg(0, 0);
378 niro 612 --- a/arch/x86/kernel/e820_64-xen.c
379     +++ b/arch/x86/kernel/e820_64-xen.c
380 niro 609 @@ -16,6 +16,7 @@
381     #include <linux/string.h>
382     #include <linux/kexec.h>
383     #include <linux/module.h>
384     +#include <linux/mm.h>
385    
386     #include <asm/pgtable.h>
387     #include <asm/page.h>
388     @@ -25,6 +26,11 @@
389     #include <asm/sections.h>
390     #include <xen/interface/memory.h>
391    
392     +struct e820map e820 __initdata;
393     +#ifdef CONFIG_XEN
394     +struct e820map machine_e820 __initdata;
395     +#endif
396     +
397     /*
398     * PFN of last memory page.
399     */
400     @@ -41,7 +47,7 @@
401     /*
402     * Last pfn which the user wants to use.
403     */
404     -unsigned long end_user_pfn = MAXMEM>>PAGE_SHIFT;
405     +static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
406    
407     extern struct resource code_resource, data_resource;
408    
409     @@ -53,13 +59,13 @@
410     #ifndef CONFIG_XEN
411     /* various gunk below that needed for SMP startup */
412     if (addr < 0x8000) {
413     - *addrp = 0x8000;
414     + *addrp = PAGE_ALIGN(0x8000);
415     return 1;
416     }
417    
418     /* direct mapping tables of the kernel */
419     if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) {
420     - *addrp = table_end << PAGE_SHIFT;
421     + *addrp = PAGE_ALIGN(table_end << PAGE_SHIFT);
422     return 1;
423     }
424    
425     @@ -67,23 +73,18 @@
426     #ifdef CONFIG_BLK_DEV_INITRD
427     if (LOADER_TYPE && INITRD_START && last >= INITRD_START &&
428     addr < INITRD_START+INITRD_SIZE) {
429     - *addrp = INITRD_START + INITRD_SIZE;
430     + *addrp = PAGE_ALIGN(INITRD_START + INITRD_SIZE);
431     return 1;
432     }
433     #endif
434     - /* kernel code + 640k memory hole (later should not be needed, but
435     - be paranoid for now) */
436     - if (last >= 640*1024 && addr < 1024*1024) {
437     - *addrp = 1024*1024;
438     - return 1;
439     - }
440     - if (last >= __pa_symbol(&_text) && last < __pa_symbol(&_end)) {
441     - *addrp = __pa_symbol(&_end);
442     + /* kernel code */
443     + if (last >= __pa_symbol(&_text) && addr < __pa_symbol(&_end)) {
444     + *addrp = PAGE_ALIGN(__pa_symbol(&_end));
445     return 1;
446     }
447    
448     if (last >= ebda_addr && addr < ebda_addr + ebda_size) {
449     - *addrp = ebda_addr + ebda_size;
450     + *addrp = PAGE_ALIGN(ebda_addr + ebda_size);
451     return 1;
452     }
453    
454     @@ -141,8 +142,6 @@
455     for (i = 0; i < e820.nr_map; i++) {
456     struct e820entry *ei = &e820.map[i];
457     #else
458     - extern struct e820map machine_e820;
459     -
460     if (!is_initial_xendomain())
461     return 0;
462     for (i = 0; i < machine_e820.nr_map; i++) {
463     @@ -184,7 +183,7 @@
464     continue;
465     while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size)
466     ;
467     - last = addr + size;
468     + last = PAGE_ALIGN(addr) + size;
469     if (last > ei->addr + ei->size)
470     continue;
471     if (last > end)
472     @@ -194,59 +193,14 @@
473     return -1UL;
474     }
475    
476     -/*
477     - * Free bootmem based on the e820 table for a node.
478     - */
479     -void __init e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end)
480     -{
481     - int i;
482     - for (i = 0; i < e820.nr_map; i++) {
483     - struct e820entry *ei = &e820.map[i];
484     - unsigned long last, addr;
485     -
486     - if (ei->type != E820_RAM ||
487     - ei->addr+ei->size <= start ||
488     - ei->addr >= end)
489     - continue;
490     -
491     - addr = round_up(ei->addr, PAGE_SIZE);
492     - if (addr < start)
493     - addr = start;
494     -
495     - last = round_down(ei->addr + ei->size, PAGE_SIZE);
496     - if (last >= end)
497     - last = end;
498     -
499     - if (last > addr && last-addr >= PAGE_SIZE)
500     - free_bootmem_node(pgdat, addr, last-addr);
501     - }
502     -}
503     -
504     /*
505     * Find the highest page frame number we have available
506     */
507     unsigned long __init e820_end_of_ram(void)
508     {
509     - int i;
510     unsigned long end_pfn = 0;
511     + end_pfn = find_max_pfn_with_active_regions();
512    
513     - for (i = 0; i < e820.nr_map; i++) {
514     - struct e820entry *ei = &e820.map[i];
515     - unsigned long start, end;
516     -
517     - start = round_up(ei->addr, PAGE_SIZE);
518     - end = round_down(ei->addr + ei->size, PAGE_SIZE);
519     - if (start >= end)
520     - continue;
521     - if (ei->type == E820_RAM) {
522     - if (end > end_pfn<<PAGE_SHIFT)
523     - end_pfn = end>>PAGE_SHIFT;
524     - } else {
525     - if (end > end_pfn_map<<PAGE_SHIFT)
526     - end_pfn_map = end>>PAGE_SHIFT;
527     - }
528     - }
529     -
530     if (end_pfn > end_pfn_map)
531     end_pfn_map = end_pfn;
532     if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
533     @@ -256,43 +210,10 @@
534     if (end_pfn > end_pfn_map)
535     end_pfn = end_pfn_map;
536    
537     + printk("end_pfn_map = %lu\n", end_pfn_map);
538     return end_pfn;
539     }
540    
541     -/*
542     - * Compute how much memory is missing in a range.
543     - * Unlike the other functions in this file the arguments are in page numbers.
544     - */
545     -unsigned long __init
546     -e820_hole_size(unsigned long start_pfn, unsigned long end_pfn)
547     -{
548     - unsigned long ram = 0;
549     - unsigned long start = start_pfn << PAGE_SHIFT;
550     - unsigned long end = end_pfn << PAGE_SHIFT;
551     - int i;
552     - for (i = 0; i < e820.nr_map; i++) {
553     - struct e820entry *ei = &e820.map[i];
554     - unsigned long last, addr;
555     -
556     - if (ei->type != E820_RAM ||
557     - ei->addr+ei->size <= start ||
558     - ei->addr >= end)
559     - continue;
560     -
561     - addr = round_up(ei->addr, PAGE_SIZE);
562     - if (addr < start)
563     - addr = start;
564     -
565     - last = round_down(ei->addr + ei->size, PAGE_SIZE);
566     - if (last >= end)
567     - last = end;
568     -
569     - if (last > addr)
570     - ram += last - addr;
571     - }
572     - return ((end - start) - ram) >> PAGE_SHIFT;
573     -}
574     -
575     /*
576     * Mark e820 reserved areas as busy for the resource manager.
577     */
578     @@ -333,6 +254,98 @@
579     }
580     }
581    
582     +#ifndef CONFIG_XEN
583     +/* Mark pages corresponding to given address range as nosave */
584     +static void __init
585     +e820_mark_nosave_range(unsigned long start, unsigned long end)
586     +{
587     + unsigned long pfn, max_pfn;
588     +
589     + if (start >= end)
590     + return;
591     +
592     + printk("Nosave address range: %016lx - %016lx\n", start, end);
593     + max_pfn = end >> PAGE_SHIFT;
594     + for (pfn = start >> PAGE_SHIFT; pfn < max_pfn; pfn++)
595     + if (pfn_valid(pfn))
596     + SetPageNosave(pfn_to_page(pfn));
597     +}
598     +
599     +/*
600     + * Find the ranges of physical addresses that do not correspond to
601     + * e820 RAM areas and mark the corresponding pages as nosave for software
602     + * suspend and suspend to RAM.
603     + *
604     + * This function requires the e820 map to be sorted and without any
605     + * overlapping entries and assumes the first e820 area to be RAM.
606     + */
607     +void __init e820_mark_nosave_regions(void)
608     +{
609     + int i;
610     + unsigned long paddr;
611     +
612     + paddr = round_down(e820.map[0].addr + e820.map[0].size, PAGE_SIZE);
613     + for (i = 1; i < e820.nr_map; i++) {
614     + struct e820entry *ei = &e820.map[i];
615     +
616     + if (paddr < ei->addr)
617     + e820_mark_nosave_range(paddr,
618     + round_up(ei->addr, PAGE_SIZE));
619     +
620     + paddr = round_down(ei->addr + ei->size, PAGE_SIZE);
621     + if (ei->type != E820_RAM)
622     + e820_mark_nosave_range(round_up(ei->addr, PAGE_SIZE),
623     + paddr);
624     +
625     + if (paddr >= (end_pfn << PAGE_SHIFT))
626     + break;
627     + }
628     +}
629     +#endif
630     +
631     +/* Walk the e820 map and register active regions within a node */
632     +void __init
633     +e820_register_active_regions(int nid, unsigned long start_pfn,
634     + unsigned long end_pfn)
635     +{
636     + int i;
637     + unsigned long ei_startpfn, ei_endpfn;
638     + for (i = 0; i < e820.nr_map; i++) {
639     + struct e820entry *ei = &e820.map[i];
640     + ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT;
641     + ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE)
642     + >> PAGE_SHIFT;
643     +
644     + /* Skip map entries smaller than a page */
645     + if (ei_startpfn >= ei_endpfn)
646     + continue;
647     +
648     + /* Check if end_pfn_map should be updated */
649     + if (ei->type != E820_RAM && ei_endpfn > end_pfn_map)
650     + end_pfn_map = ei_endpfn;
651     +
652     + /* Skip if map is outside the node */
653     + if (ei->type != E820_RAM ||
654     + ei_endpfn <= start_pfn ||
655     + ei_startpfn >= end_pfn)
656     + continue;
657     +
658     + /* Check for overlaps */
659     + if (ei_startpfn < start_pfn)
660     + ei_startpfn = start_pfn;
661     + if (ei_endpfn > end_pfn)
662     + ei_endpfn = end_pfn;
663     +
664     + /* Obey end_user_pfn to save on memmap */
665     + if (ei_startpfn >= end_user_pfn)
666     + continue;
667     + if (ei_endpfn > end_user_pfn)
668     + ei_endpfn = end_user_pfn;
669     +
670     + add_active_range(nid, ei_startpfn, ei_endpfn);
671     + }
672     +}
673     +
674     /*
675     * Add a memory region to the kernel e820 map.
676     */
677     @@ -553,13 +566,6 @@
678     * If we're lucky and live on a modern system, the setup code
679     * will have given us a memory map that we can use to properly
680     * set up memory. If we aren't, we'll fake a memory map.
681     - *
682     - * We check to see that the memory map contains at least 2 elements
683     - * before we'll use it, because the detection code in setup.S may
684     - * not be perfect and most every PC known to man has two memory
685     - * regions: one from 0 to 640k, and one from 1mb up. (The IBM
686     - * thinkpad 560x, for example, does not cooperate with the memory
687     - * detection code.)
688     */
689     static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
690     {
691     @@ -581,37 +587,20 @@
692     if (start > end)
693     return -1;
694    
695     -#ifndef CONFIG_XEN
696     - /*
697     - * Some BIOSes claim RAM in the 640k - 1M region.
698     - * Not right. Fix it up.
699     - *
700     - * This should be removed on Hammer which is supposed to not
701     - * have non e820 covered ISA mappings there, but I had some strange
702     - * problems so it stays for now. -AK
703     - */
704     - if (type == E820_RAM) {
705     - if (start < 0x100000ULL && end > 0xA0000ULL) {
706     - if (start < 0xA0000ULL)
707     - add_memory_region(start, 0xA0000ULL-start, type);
708     - if (end <= 0x100000ULL)
709     - continue;
710     - start = 0x100000ULL;
711     - size = end - start;
712     - }
713     - }
714     -#endif
715     -
716     add_memory_region(start, size, type);
717     } while (biosmap++,--nr_map);
718     return 0;
719     }
720    
721     +void early_panic(char *msg)
722     +{
723     + early_printk(msg);
724     + panic(msg);
725     +}
726     +
727     #ifndef CONFIG_XEN
728     void __init setup_memory_region(void)
729     {
730     - char *who = "BIOS-e820";
731     -
732     /*
733     * Try to copy the BIOS-supplied E820-map.
734     *
735     @@ -619,24 +608,10 @@
736     * the next section from 1mb->appropriate_mem_k
737     */
738     sanitize_e820_map(E820_MAP, &E820_MAP_NR);
739     - if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) {
740     - unsigned long mem_size;
741     -
742     - /* compare results from other methods and take the greater */
743     - if (ALT_MEM_K < EXT_MEM_K) {
744     - mem_size = EXT_MEM_K;
745     - who = "BIOS-88";
746     - } else {
747     - mem_size = ALT_MEM_K;
748     - who = "BIOS-e801";
749     - }
750     -
751     - e820.nr_map = 0;
752     - add_memory_region(0, LOWMEMSIZE(), E820_RAM);
753     - add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
754     - }
755     + if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0)
756     + early_panic("Cannot find a valid memory map");
757     printk(KERN_INFO "BIOS-provided physical RAM map:\n");
758     - e820_print_map(who);
759     + e820_print_map("BIOS-e820");
760     }
761    
762     #else /* CONFIG_XEN */
763     @@ -668,20 +643,23 @@
764    
765     sanitize_e820_map(map, (char *)&memmap.nr_entries);
766    
767     - BUG_ON(copy_e820_map(map, (char)memmap.nr_entries) < 0);
768     + if (copy_e820_map(map, (char)memmap.nr_entries) < 0)
769     + early_panic("Cannot find a valid memory map");
770    
771     printk(KERN_INFO "BIOS-provided physical RAM map:\n");
772     e820_print_map("Xen");
773     }
774     #endif
775    
776     -void __init parse_memopt(char *p, char **from)
777     -{
778     +static int __init parse_memopt(char *p)
779     +{
780     int i;
781     unsigned long current_end;
782     unsigned long end;
783    
784     - end_user_pfn = memparse(p, from);
785     + if (!p)
786     + return -EINVAL;
787     + end_user_pfn = memparse(p, &p);
788     end_user_pfn >>= PAGE_SHIFT;
789    
790     end = end_user_pfn<<PAGE_SHIFT;
791     @@ -698,27 +676,61 @@
792     else
793     add_memory_region(current_end, end - current_end, E820_RAM);
794     }
795     +
796     + return 0;
797     }
798     +early_param("mem", parse_memopt);
799     +
800     +static int userdef __initdata;
801    
802     -void __init parse_memmapopt(char *p, char **from)
803     +static int __init parse_memmap_opt(char *p)
804     {
805     + char *oldp;
806     unsigned long long start_at, mem_size;
807    
808     - mem_size = memparse(p, from);
809     - p = *from;
810     + if (!strcmp(p, "exactmap")) {
811     +#ifdef CONFIG_CRASH_DUMP
812     + /* If we are doing a crash dump, we
813     + * still need to know the real mem
814     + * size before original memory map is
815     + * reset.
816     + */
817     + e820_register_active_regions(0, 0, -1UL);
818     + saved_max_pfn = e820_end_of_ram();
819     + remove_all_active_ranges();
820     +#endif
821     + end_pfn_map = 0;
822     + e820.nr_map = 0;
823     + userdef = 1;
824     + return 0;
825     + }
826     +
827     + oldp = p;
828     + mem_size = memparse(p, &p);
829     + if (p == oldp)
830     + return -EINVAL;
831     if (*p == '@') {
832     - start_at = memparse(p+1, from);
833     + start_at = memparse(p+1, &p);
834     add_memory_region(start_at, mem_size, E820_RAM);
835     } else if (*p == '#') {
836     - start_at = memparse(p+1, from);
837     + start_at = memparse(p+1, &p);
838     add_memory_region(start_at, mem_size, E820_ACPI);
839     } else if (*p == '$') {
840     - start_at = memparse(p+1, from);
841     + start_at = memparse(p+1, &p);
842     add_memory_region(start_at, mem_size, E820_RESERVED);
843     } else {
844     end_user_pfn = (mem_size >> PAGE_SHIFT);
845     }
846     - p = *from;
847     + return *p == '\0' ? 0 : -EINVAL;
848     +}
849     +early_param("memmap", parse_memmap_opt);
850     +
851     +void finish_e820_parsing(void)
852     +{
853     + if (userdef) {
854     + printk(KERN_INFO "user-defined physical RAM map:\n");
855     + e820_print_map("user");
856     + }
857     }
858    
859     unsigned long pci_mem_start = 0xaeedbabe;
860 niro 612 --- a/arch/x86/kernel/early_printk-xen.c
861     +++ b/arch/x86/kernel/early_printk-xen.c
862 niro 609 @@ -244,20 +244,16 @@
863    
864     static int __initdata keep_early;
865    
866     -int __init setup_early_printk(char *opt)
867     +static int __init setup_early_printk(char *buf)
868     {
869     - char *space;
870     - char buf[256];
871     + if (!buf)
872     + return 0;
873    
874     if (early_console_initialized)
875     - return 1;
876     -
877     - strlcpy(buf,opt,sizeof(buf));
878     - space = strchr(buf, ' ');
879     - if (space)
880     - *space = 0;
881     + return 0;
882     + early_console_initialized = 1;
883    
884     - if (strstr(buf,"keep"))
885     + if (strstr(buf, "keep"))
886     keep_early = 1;
887    
888     if (!strncmp(buf, "serial", 6)) {
889     @@ -281,11 +277,12 @@
890     early_console = &simnow_console;
891     keep_early = 1;
892     }
893     - early_console_initialized = 1;
894     register_console(early_console);
895     return 0;
896     }
897    
898     +early_param("earlyprintk", setup_early_printk);
899     +
900     void __init disable_early_printk(void)
901     {
902     if (!early_console_initialized || !early_console)
903     @@ -299,4 +296,3 @@
904     }
905     }
906    
907     -__setup("earlyprintk=", setup_early_printk);
908 niro 612 --- a/arch/x86/kernel/entry_32-xen.S
909     +++ b/arch/x86/kernel/entry_32-xen.S
910 niro 609 @@ -80,8 +80,12 @@
911     NMI_MASK = 0x80000000
912    
913     #ifndef CONFIG_XEN
914     -#define DISABLE_INTERRUPTS cli
915     -#define ENABLE_INTERRUPTS sti
916     +/* These are replaces for paravirtualization */
917     +#define DISABLE_INTERRUPTS cli
918     +#define ENABLE_INTERRUPTS sti
919     +#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit
920     +#define INTERRUPT_RETURN iret
921     +#define GET_CR0_INTO_EAX movl %cr0, %eax
922     #else
923     /* Offsets into shared_info_t. */
924     #define evtchn_upcall_pending /* 0 */
925     @@ -99,15 +103,29 @@
926    
927     #define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(%esi)
928     #define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(%esi)
929     +#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(%esi)
930     #define DISABLE_INTERRUPTS GET_VCPU_INFO ; \
931     __DISABLE_INTERRUPTS
932     #define ENABLE_INTERRUPTS GET_VCPU_INFO ; \
933     __ENABLE_INTERRUPTS
934     -#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(%esi)
935     +#define ENABLE_INTERRUPTS_SYSEXIT __ENABLE_INTERRUPTS ; \
936     +sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ ; \
937     + __TEST_PENDING ; \
938     + jnz 14f # process more events if necessary... ; \
939     + movl ESI(%esp), %esi ; \
940     + sysexit ; \
941     +14: __DISABLE_INTERRUPTS ; \
942     + TRACE_IRQS_OFF ; \
943     +sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/ ; \
944     + push %esp ; \
945     + call evtchn_do_upcall ; \
946     + add $4,%esp ; \
947     + jmp ret_from_intr
948     +#define INTERRUPT_RETURN iret
949     #endif
950    
951     #ifdef CONFIG_PREEMPT
952     -#define preempt_stop cli; TRACE_IRQS_OFF
953     +#define preempt_stop DISABLE_INTERRUPTS; TRACE_IRQS_OFF
954     #else
955     #define preempt_stop
956     #define resume_kernel restore_nocheck
957     @@ -206,18 +224,21 @@
958    
959     #define RING0_INT_FRAME \
960     CFI_STARTPROC simple;\
961     + CFI_SIGNAL_FRAME;\
962     CFI_DEF_CFA esp, 3*4;\
963     /*CFI_OFFSET cs, -2*4;*/\
964     CFI_OFFSET eip, -3*4
965    
966     #define RING0_EC_FRAME \
967     CFI_STARTPROC simple;\
968     + CFI_SIGNAL_FRAME;\
969     CFI_DEF_CFA esp, 4*4;\
970     /*CFI_OFFSET cs, -2*4;*/\
971     CFI_OFFSET eip, -3*4
972    
973     #define RING0_PTREGS_FRAME \
974     CFI_STARTPROC simple;\
975     + CFI_SIGNAL_FRAME;\
976     CFI_DEF_CFA esp, OLDESP-EBX;\
977     /*CFI_OFFSET cs, CS-OLDESP;*/\
978     CFI_OFFSET eip, EIP-OLDESP;\
979     @@ -263,8 +284,9 @@
980     check_userspace:
981     movl EFLAGS(%esp), %eax # mix EFLAGS and CS
982     movb CS(%esp), %al
983     - testl $(VM_MASK | 2), %eax
984     - jz resume_kernel
985     + andl $(VM_MASK | SEGMENT_RPL_MASK), %eax
986     + cmpl $USER_RPL, %eax
987     + jb resume_kernel # not returning to v8086 or userspace
988     ENTRY(resume_userspace)
989     DISABLE_INTERRUPTS # make sure we don't miss an interrupt
990     # setting need_resched or sigpending
991     @@ -277,7 +299,7 @@
992    
993     #ifdef CONFIG_PREEMPT
994     ENTRY(resume_kernel)
995     - cli
996     + DISABLE_INTERRUPTS
997     cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
998     jnz restore_nocheck
999     need_resched:
1000     @@ -297,6 +319,7 @@
1001     # sysenter call handler stub
1002     ENTRY(sysenter_entry)
1003     CFI_STARTPROC simple
1004     + CFI_SIGNAL_FRAME
1005     CFI_DEF_CFA esp, 0
1006     CFI_REGISTER esp, ebp
1007     movl SYSENTER_stack_esp0(%esp),%esp
1008     @@ -305,7 +328,7 @@
1009     * No need to follow this irqs on/off section: the syscall
1010     * disabled irqs and here we enable it straight after entry:
1011     */
1012     - sti
1013     + ENABLE_INTERRUPTS
1014     pushl $(__USER_DS)
1015     CFI_ADJUST_CFA_OFFSET 4
1016     /*CFI_REL_OFFSET ss, 0*/
1017     @@ -359,26 +382,8 @@
1018     movl EIP(%esp), %edx
1019     movl OLDESP(%esp), %ecx
1020     xorl %ebp,%ebp
1021     -#ifdef CONFIG_XEN
1022     TRACE_IRQS_ON
1023     - __ENABLE_INTERRUPTS
1024     -sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/
1025     - __TEST_PENDING
1026     - jnz 14f # process more events if necessary...
1027     - movl ESI(%esp), %esi
1028     - sysexit
1029     -14: __DISABLE_INTERRUPTS
1030     - TRACE_IRQS_OFF
1031     -sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/
1032     - push %esp
1033     - call evtchn_do_upcall
1034     - add $4,%esp
1035     - jmp ret_from_intr
1036     -#else
1037     - TRACE_IRQS_ON
1038     - sti
1039     - sysexit
1040     -#endif /* !CONFIG_XEN */
1041     + ENABLE_INTERRUPTS_SYSEXIT
1042     CFI_ENDPROC
1043    
1044     # pv sysenter call handler stub
1045     @@ -444,8 +449,8 @@
1046     # See comments in process.c:copy_thread() for details.
1047     movb OLDSS(%esp), %ah
1048     movb CS(%esp), %al
1049     - andl $(VM_MASK | (4 << 8) | 3), %eax
1050     - cmpl $((4 << 8) | 3), %eax
1051     + andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
1052     + cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
1053     CFI_REMEMBER_STATE
1054     je ldt_ss # returning to user-space with LDT SS
1055     restore_nocheck:
1056     @@ -467,12 +472,11 @@
1057     RESTORE_REGS
1058     addl $4, %esp
1059     CFI_ADJUST_CFA_OFFSET -4
1060     -1: iret
1061     +1: INTERRUPT_RETURN
1062     .section .fixup,"ax"
1063     iret_exc:
1064     #ifndef CONFIG_XEN
1065     - TRACE_IRQS_ON
1066     - sti
1067     + ENABLE_INTERRUPTS
1068     #endif
1069     pushl $0 # no error code
1070     pushl $do_iret_error
1071     @@ -498,7 +502,7 @@
1072     * dosemu and wine happy. */
1073     subl $8, %esp # reserve space for switch16 pointer
1074     CFI_ADJUST_CFA_OFFSET 8
1075     - cli
1076     + DISABLE_INTERRUPTS
1077     TRACE_IRQS_OFF
1078     movl %esp, %eax
1079     /* Set up the 16bit stack frame with switch32 pointer on top,
1080     @@ -508,7 +512,7 @@
1081     TRACE_IRQS_IRET
1082     RESTORE_REGS
1083     lss 20+4(%esp), %esp # switch to 16bit stack
1084     -1: iret
1085     +1: INTERRUPT_RETURN
1086     .section __ex_table,"a"
1087     .align 4
1088     .long 1b,iret_exc
1089     @@ -524,7 +528,7 @@
1090     RESTORE_REGS
1091     addl $4, %esp
1092     CFI_ADJUST_CFA_OFFSET -4
1093     -1: iret
1094     +1: INTERRUPT_RETURN
1095     .section __ex_table,"a"
1096     .align 4
1097     .long 1b,iret_exc
1098     @@ -713,11 +717,9 @@
1099     #define UNWIND_ESPFIX_STACK
1100     #endif
1101    
1102     -ENTRY(divide_error)
1103     - RING0_INT_FRAME
1104     - pushl $0 # no error code
1105     - CFI_ADJUST_CFA_OFFSET 4
1106     - pushl $do_divide_error
1107     +KPROBE_ENTRY(page_fault)
1108     + RING0_EC_FRAME
1109     + pushl $do_page_fault
1110     CFI_ADJUST_CFA_OFFSET 4
1111     ALIGN
1112     error_code:
1113     @@ -767,6 +769,7 @@
1114     call *%edi
1115     jmp ret_from_exception
1116     CFI_ENDPROC
1117     +KPROBE_END(page_fault)
1118    
1119     #ifdef CONFIG_XEN
1120     # A note on the "critical region" in our callback handler.
1121     @@ -926,7 +929,7 @@
1122     CFI_ADJUST_CFA_OFFSET 4
1123     SAVE_ALL
1124     #ifndef CONFIG_XEN
1125     - movl %cr0, %eax
1126     + GET_CR0_INTO_EAX
1127     testl $0x4, %eax # EM (math emulation bit)
1128     je device_available_emulate
1129     pushl $0 # temporary storage for ORIG_EIP
1130     @@ -961,9 +964,15 @@
1131     jne ok; \
1132     label: \
1133     movl SYSENTER_stack_esp0+offset(%esp),%esp; \
1134     + CFI_DEF_CFA esp, 0; \
1135     + CFI_UNDEFINED eip; \
1136     pushfl; \
1137     + CFI_ADJUST_CFA_OFFSET 4; \
1138     pushl $__KERNEL_CS; \
1139     - pushl $sysenter_past_esp
1140     + CFI_ADJUST_CFA_OFFSET 4; \
1141     + pushl $sysenter_past_esp; \
1142     + CFI_ADJUST_CFA_OFFSET 4; \
1143     + CFI_REL_OFFSET eip, 0
1144     #endif /* CONFIG_XEN */
1145    
1146     KPROBE_ENTRY(debug)
1147     @@ -982,7 +991,8 @@
1148     call do_debug
1149     jmp ret_from_exception
1150     CFI_ENDPROC
1151     - .previous .text
1152     +KPROBE_END(debug)
1153     +
1154     #ifndef CONFIG_XEN
1155     /*
1156     * NMI is doubly nasty. It can happen _while_ we're handling
1157     @@ -992,7 +1002,7 @@
1158     * check whether we got an NMI on the debug path where the debug
1159     * fault happened on the sysenter path.
1160     */
1161     -ENTRY(nmi)
1162     +KPROBE_ENTRY(nmi)
1163     RING0_INT_FRAME
1164     pushl %eax
1165     CFI_ADJUST_CFA_OFFSET 4
1166     @@ -1017,6 +1027,7 @@
1167     cmpl $sysenter_entry,12(%esp)
1168     je nmi_debug_stack_check
1169     nmi_stack_correct:
1170     + /* We have a RING0_INT_FRAME here */
1171     pushl %eax
1172     CFI_ADJUST_CFA_OFFSET 4
1173     SAVE_ALL
1174     @@ -1027,9 +1038,12 @@
1175     CFI_ENDPROC
1176    
1177     nmi_stack_fixup:
1178     + RING0_INT_FRAME
1179     FIX_STACK(12,nmi_stack_correct, 1)
1180     jmp nmi_stack_correct
1181     +
1182     nmi_debug_stack_check:
1183     + /* We have a RING0_INT_FRAME here */
1184     cmpw $__KERNEL_CS,16(%esp)
1185     jne nmi_stack_correct
1186     cmpl $debug,(%esp)
1187     @@ -1040,8 +1054,10 @@
1188     jmp nmi_stack_correct
1189    
1190     nmi_16bit_stack:
1191     - RING0_INT_FRAME
1192     - /* create the pointer to lss back */
1193     + /* We have a RING0_INT_FRAME here.
1194     + *
1195     + * create the pointer to lss back
1196     + */
1197     pushl %ss
1198     CFI_ADJUST_CFA_OFFSET 4
1199     pushl %esp
1200     @@ -1062,14 +1078,14 @@
1201     call do_nmi
1202     RESTORE_REGS
1203     lss 12+4(%esp), %esp # back to 16bit stack
1204     -1: iret
1205     +1: INTERRUPT_RETURN
1206     CFI_ENDPROC
1207     .section __ex_table,"a"
1208     .align 4
1209     .long 1b,iret_exc
1210     .previous
1211     #else
1212     -ENTRY(nmi)
1213     +KPROBE_ENTRY(nmi)
1214     RING0_INT_FRAME
1215     pushl %eax
1216     CFI_ADJUST_CFA_OFFSET 4
1217     @@ -1081,6 +1097,7 @@
1218     jmp restore_all
1219     CFI_ENDPROC
1220     #endif
1221     +KPROBE_END(nmi)
1222    
1223     KPROBE_ENTRY(int3)
1224     RING0_INT_FRAME
1225     @@ -1092,7 +1109,7 @@
1226     call do_int3
1227     jmp ret_from_exception
1228     CFI_ENDPROC
1229     - .previous .text
1230     +KPROBE_END(int3)
1231    
1232     ENTRY(overflow)
1233     RING0_INT_FRAME
1234     @@ -1157,7 +1174,7 @@
1235     CFI_ADJUST_CFA_OFFSET 4
1236     jmp error_code
1237     CFI_ENDPROC
1238     - .previous .text
1239     +KPROBE_END(general_protection)
1240    
1241     ENTRY(alignment_check)
1242     RING0_EC_FRAME
1243     @@ -1166,13 +1183,14 @@
1244     jmp error_code
1245     CFI_ENDPROC
1246    
1247     -KPROBE_ENTRY(page_fault)
1248     - RING0_EC_FRAME
1249     - pushl $do_page_fault
1250     +ENTRY(divide_error)
1251     + RING0_INT_FRAME
1252     + pushl $0 # no error code
1253     + CFI_ADJUST_CFA_OFFSET 4
1254     + pushl $do_divide_error
1255     CFI_ADJUST_CFA_OFFSET 4
1256     jmp error_code
1257     CFI_ENDPROC
1258     - .previous .text
1259    
1260     #ifdef CONFIG_X86_MCE
1261     ENTRY(machine_check)
1262     @@ -1234,6 +1252,19 @@
1263     jmp error_code
1264     CFI_ENDPROC
1265    
1266     +ENTRY(kernel_thread_helper)
1267     + pushl $0 # fake return address for unwinder
1268     + CFI_STARTPROC
1269     + movl %edx,%eax
1270     + push %edx
1271     + CFI_ADJUST_CFA_OFFSET 4
1272     + call *%ebx
1273     + push %eax
1274     + CFI_ADJUST_CFA_OFFSET 4
1275     + call do_exit
1276     + CFI_ENDPROC
1277     +ENDPROC(kernel_thread_helper)
1278     +
1279     .section .rodata,"a"
1280     #include "syscall_table.S"
1281    
1282 niro 612 --- a/arch/x86/kernel/entry_64-xen.S
1283     +++ b/arch/x86/kernel/entry_64-xen.S
1284 niro 609 @@ -26,15 +23,25 @@
1285     * at the top of the kernel process stack.
1286     * - partial stack frame: partially saved registers upto R11.
1287     * - full stack frame: Like partial stack frame, but all register saved.
1288     - *
1289     - * TODO:
1290     - * - schedule it carefully for the final hardware.
1291     + *
1292     + * Some macro usage:
1293     + * - CFI macros are used to generate dwarf2 unwind information for better
1294     + * backtraces. They don't change any code.
1295     + * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
1296     + * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
1297     + * There are unfortunately lots of special cases where some registers
1298     + * not touched. The macro is a big mess that should be cleaned up.
1299     + * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
1300     + * Gives a full stack frame.
1301     + * - ENTRY/END Define functions in the symbol table.
1302     + * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
1303     + * frame that is otherwise undefined after a SYSCALL
1304     + * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
1305     + * - errorentry/paranoidentry/zeroentry - Define exception entry points.
1306     */
1307    
1308     -#define ASSEMBLY 1
1309     #include <linux/linkage.h>
1310     #include <asm/segment.h>
1311     -#include <asm/smp.h>
1312     #include <asm/cache.h>
1313     #include <asm/errno.h>
1314     #include <asm/dwarf2.h>
1315     @@ -117,6 +124,7 @@
1316     .macro CFI_DEFAULT_STACK start=1,adj=0
1317     .if \start
1318     CFI_STARTPROC simple
1319     + CFI_SIGNAL_FRAME
1320     CFI_DEF_CFA rsp,SS+8-(\adj*ARGOFFSET)
1321     .else
1322     CFI_DEF_CFA_OFFSET SS+8-(\adj*ARGOFFSET)
1323     @@ -207,6 +215,7 @@
1324     */
1325     .macro _frame ref
1326     CFI_STARTPROC simple
1327     + CFI_SIGNAL_FRAME
1328     CFI_DEF_CFA rsp,SS+8-\ref
1329     /*CFI_REL_OFFSET ss,SS-\ref*/
1330     CFI_REL_OFFSET rsp,RSP-\ref
1331 niro 612 @@ -334,6 +343,8 @@
1332     LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
1333     RESTORE_REST
1334 niro 609 cmpq $__NR_syscall_max,%rax
1335     + movq $-ENOSYS,%rcx
1336     + cmova %rcx,%rax
1337 niro 612 ja 1f
1338     movq %r10,%rcx /* fixup for C */
1339     call *sys_call_table(,%rax,8)
1340 niro 609 @@ -349,6 +360,7 @@
1341     */
1342     ENTRY(int_ret_from_sys_call)
1343     CFI_STARTPROC simple
1344     + CFI_SIGNAL_FRAME
1345     CFI_DEF_CFA rsp,SS+8-ARGOFFSET
1346     /*CFI_REL_OFFSET ss,SS-ARGOFFSET*/
1347     CFI_REL_OFFSET rsp,RSP-ARGOFFSET
1348     @@ -583,8 +595,7 @@
1349     #ifdef CONFIG_PREEMPT
1350     /* Returning to kernel space. Check if we need preemption */
1351     /* rcx: threadinfo. interrupts off. */
1352     - .p2align
1353     -retint_kernel:
1354     +ENTRY(retint_kernel)
1355     cmpl $0,threadinfo_preempt_count(%rcx)
1356     jnz retint_restore_args
1357     bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx)
1358     @@ -644,7 +655,6 @@
1359     END(call_function_interrupt)
1360     #endif
1361    
1362     -#ifdef CONFIG_X86_LOCAL_APIC
1363     ENTRY(apic_timer_interrupt)
1364     apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
1365     END(apic_timer_interrupt)
1366     @@ -656,7 +666,6 @@
1367     ENTRY(spurious_interrupt)
1368     apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
1369     END(spurious_interrupt)
1370     -#endif
1371     #endif /* !CONFIG_XEN */
1372    
1373     /*
1374     @@ -755,7 +764,9 @@
1375     testl $3,CS(%rsp)
1376     jnz paranoid_userspace\trace
1377     paranoid_swapgs\trace:
1378     + .if \trace
1379     TRACE_IRQS_IRETQ 0
1380     + .endif
1381     swapgs
1382     paranoid_restore\trace:
1383     RESTORE_ALL 8
1384     @@ -802,7 +813,7 @@
1385     * Exception entry point. This expects an error code/orig_rax on the stack
1386     * and the exception handler in %rax.
1387     */
1388     -ENTRY(error_entry)
1389     +KPROBE_ENTRY(error_entry)
1390     _frame RDI
1391     CFI_REL_OFFSET rax,0
1392     /* rdi slot contains rax, oldrax contains error code */
1393     @@ -896,7 +907,7 @@
1394     jmp error_sti
1395     #endif
1396     CFI_ENDPROC
1397     -END(error_entry)
1398     +KPROBE_END(error_entry)
1399    
1400     ENTRY(hypervisor_callback)
1401     zeroentry do_hypervisor_callback
1402     @@ -936,26 +947,6 @@
1403     CFI_ENDPROC
1404     END(do_hypervisor_callback)
1405    
1406     -#ifdef CONFIG_X86_LOCAL_APIC
1407     -KPROBE_ENTRY(nmi)
1408     - zeroentry do_nmi_callback
1409     -ENTRY(do_nmi_callback)
1410     - CFI_STARTPROC
1411     - addq $8, %rsp
1412     - CFI_ENDPROC
1413     - CFI_DEFAULT_STACK
1414     - call do_nmi
1415     - orl $NMI_MASK,EFLAGS(%rsp)
1416     - RESTORE_REST
1417     - XEN_BLOCK_EVENTS(%rsi)
1418     - TRACE_IRQS_OFF
1419     - GET_THREAD_INFO(%rcx)
1420     - jmp retint_restore_args
1421     - CFI_ENDPROC
1422     - .previous .text
1423     -END(nmi)
1424     -#endif
1425     -
1426     ALIGN
1427     restore_all_enable_events:
1428     CFI_DEFAULT_STACK adj=1
1429     @@ -1121,7 +1112,7 @@
1430     * do_sys_execve asm fallback arguments:
1431     * rdi: name, rsi: argv, rdx: envp, fake frame on the stack
1432     */
1433     -ENTRY(execve)
1434     +ENTRY(kernel_execve)
1435     CFI_STARTPROC
1436     FAKE_STACK_FRAME $0
1437     SAVE_ALL
1438     @@ -1135,12 +1126,11 @@
1439     UNFAKE_STACK_FRAME
1440     ret
1441     CFI_ENDPROC
1442     -ENDPROC(execve)
1443     +ENDPROC(kernel_execve)
1444    
1445     KPROBE_ENTRY(page_fault)
1446     errorentry do_page_fault
1447     -END(page_fault)
1448     - .previous .text
1449     +KPROBE_END(page_fault)
1450    
1451     ENTRY(coprocessor_error)
1452     zeroentry do_coprocessor_error
1453     @@ -1162,25 +1152,25 @@
1454     zeroentry do_debug
1455     /* paranoidexit
1456     CFI_ENDPROC */
1457     -END(debug)
1458     - .previous .text
1459     +KPROBE_END(debug)
1460    
1461     -#if 0
1462     - /* runs on exception stack */
1463     KPROBE_ENTRY(nmi)
1464     - INTR_FRAME
1465     - pushq $-1
1466     - CFI_ADJUST_CFA_OFFSET 8
1467     - paranoidentry do_nmi, 0, 0
1468     -#ifdef CONFIG_TRACE_IRQFLAGS
1469     - paranoidexit 0
1470     -#else
1471     - jmp paranoid_exit1
1472     - CFI_ENDPROC
1473     -#endif
1474     -END(nmi)
1475     - .previous .text
1476     -#endif
1477     + zeroentry do_nmi_callback
1478     +KPROBE_END(nmi)
1479     +do_nmi_callback:
1480     + CFI_STARTPROC
1481     + addq $8, %rsp
1482     + CFI_ENDPROC
1483     + CFI_DEFAULT_STACK
1484     + call do_nmi
1485     + orl $NMI_MASK,EFLAGS(%rsp)
1486     + RESTORE_REST
1487     + XEN_BLOCK_EVENTS(%rsi)
1488     + TRACE_IRQS_OFF
1489     + GET_THREAD_INFO(%rcx)
1490     + jmp retint_restore_args
1491     + CFI_ENDPROC
1492     +END(do_nmi_callback)
1493    
1494     KPROBE_ENTRY(int3)
1495     /* INTR_FRAME
1496     @@ -1189,8 +1179,7 @@
1497     zeroentry do_int3
1498     /* jmp paranoid_exit1
1499     CFI_ENDPROC */
1500     -END(int3)
1501     - .previous .text
1502     +KPROBE_END(int3)
1503    
1504     ENTRY(overflow)
1505     zeroentry do_overflow
1506     @@ -1241,8 +1230,7 @@
1507    
1508     KPROBE_ENTRY(general_protection)
1509     errorentry do_general_protection
1510     -END(general_protection)
1511     - .previous .text
1512     +KPROBE_END(general_protection)
1513    
1514     ENTRY(alignment_check)
1515     errorentry do_alignment_check
1516 niro 612 --- a/arch/x86/kernel/genapic_xen_64.c
1517     +++ b/arch/x86/kernel/genapic_xen_64.c
1518 niro 609 @@ -71,6 +71,13 @@
1519     return cpu_online_map;
1520     }
1521    
1522     +static cpumask_t xen_vector_allocation_domain(int cpu)
1523     +{
1524     + cpumask_t domain = CPU_MASK_NONE;
1525     + cpu_set(cpu, domain);
1526     + return domain;
1527     +}
1528     +
1529     /*
1530     * Set up the logical destination ID.
1531     * Do nothing, not called now.
1532     @@ -147,8 +154,8 @@
1533     .int_delivery_mode = dest_LowestPrio,
1534     #endif
1535     .int_dest_mode = (APIC_DEST_LOGICAL != 0),
1536     - .int_delivery_dest = APIC_DEST_LOGICAL | APIC_DM_LOWEST,
1537     .target_cpus = xen_target_cpus,
1538     + .vector_allocation_domain = xen_vector_allocation_domain,
1539     #ifdef CONFIG_XEN_PRIVILEGED_GUEST
1540     .apic_id_registered = xen_apic_id_registered,
1541     #endif
1542 niro 612 --- a/arch/x86/kernel/head64-xen.c
1543     +++ b/arch/x86/kernel/head64-xen.c
1544 niro 609 @@ -54,11 +54,9 @@
1545     new_data = *(int *) (x86_boot_params + NEW_CL_POINTER);
1546     if (!new_data) {
1547     if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) {
1548     - printk("so old bootloader that it does not support commandline?!\n");
1549     return;
1550     }
1551     new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET;
1552     - printk("old bootloader convention, maybe loadlin?\n");
1553     }
1554     command_line = (char *) ((u64)(new_data));
1555     memcpy(saved_command_line, command_line, COMMAND_LINE_SIZE);
1556     @@ -70,25 +68,6 @@
1557     memcpy(saved_command_line, xen_start_info->cmd_line, max_cmdline);
1558     saved_command_line[max_cmdline-1] = '\0';
1559     #endif
1560     - printk("Bootdata ok (command line is %s)\n", saved_command_line);
1561     -}
1562     -
1563     -static void __init setup_boot_cpu_data(void)
1564     -{
1565     - unsigned int dummy, eax;
1566     -
1567     - /* get vendor info */
1568     - cpuid(0, (unsigned int *)&boot_cpu_data.cpuid_level,
1569     - (unsigned int *)&boot_cpu_data.x86_vendor_id[0],
1570     - (unsigned int *)&boot_cpu_data.x86_vendor_id[8],
1571     - (unsigned int *)&boot_cpu_data.x86_vendor_id[4]);
1572     -
1573     - /* get cpu type */
1574     - cpuid(1, &eax, &dummy, &dummy,
1575     - (unsigned int *) &boot_cpu_data.x86_capability);
1576     - boot_cpu_data.x86 = (eax >> 8) & 0xf;
1577     - boot_cpu_data.x86_model = (eax >> 4) & 0xf;
1578     - boot_cpu_data.x86_mask = eax & 0xf;
1579     }
1580    
1581     #include <xen/interface/memory.h>
1582     @@ -101,7 +80,6 @@
1583     {
1584     struct xen_machphys_mapping mapping;
1585     unsigned long machine_to_phys_nr_ents;
1586     - char *s;
1587     int i;
1588    
1589     setup_xen_features();
1590     @@ -128,10 +106,7 @@
1591     asm volatile("lidt %0" :: "m" (idt_descr));
1592     #endif
1593    
1594     - /*
1595     - * This must be called really, really early:
1596     - */
1597     - lockdep_init();
1598     + early_printk("Kernel alive\n");
1599    
1600     for (i = 0; i < NR_CPUS; i++)
1601     cpu_pda(i) = &boot_cpu_pda[i];
1602     @@ -141,22 +116,5 @@
1603     #ifdef CONFIG_SMP
1604     cpu_set(0, cpu_online_map);
1605     #endif
1606     - s = strstr(saved_command_line, "earlyprintk=");
1607     - if (s != NULL)
1608     - setup_early_printk(strchr(s, '=') + 1);
1609     -#ifdef CONFIG_NUMA
1610     - s = strstr(saved_command_line, "numa=");
1611     - if (s != NULL)
1612     - numa_setup(s+5);
1613     -#endif
1614     -#ifdef CONFIG_X86_IO_APIC
1615     - if (strstr(saved_command_line, "disableapic"))
1616     - disable_apic = 1;
1617     -#endif
1618     - /* You need early console to see that */
1619     - if (__pa_symbol(&_end) >= KERNEL_TEXT_SIZE)
1620     - panic("Kernel too big for kernel mapping\n");
1621     -
1622     - setup_boot_cpu_data();
1623     start_kernel();
1624     }
1625 niro 612 --- a/arch/x86/kernel/head_32-xen.S
1626     +++ b/arch/x86/kernel/head_32-xen.S
1627     @@ -62,7 +62,7 @@
1628     movl %eax,%gs
1629     cld # gcc2 wants the direction flag cleared at all times
1630    
1631     - pushl %eax # fake return address
1632     + pushl $0 # fake return address for unwinder
1633     jmp start_kernel
1634    
1635     #define HYPERCALL_PAGE_OFFSET 0x1000
1636     --- a/arch/x86/kernel/head_64-xen.S
1637     +++ b/arch/x86/kernel/head_64-xen.S
1638 niro 609 @@ -149,7 +146,7 @@
1639     .quad 0,0 /* TSS */
1640     .quad 0,0 /* LDT */
1641     .quad 0,0,0 /* three TLS descriptors */
1642     - .quad 0 /* unused */
1643     + .quad 0x0000f40000000000 /* node/CPU stored in limit */
1644     gdt_end:
1645     /* asm/segment.h:GDT_ENTRIES must match this */
1646     /* This should be a multiple of the cache line size */
1647 niro 612 --- a/arch/x86/kernel/io_apic_32-xen.c
1648     +++ b/arch/x86/kernel/io_apic_32-xen.c
1649 niro 609 @@ -31,6 +31,9 @@
1650     #include <linux/acpi.h>
1651     #include <linux/module.h>
1652     #include <linux/sysdev.h>
1653     +#include <linux/pci.h>
1654     +#include <linux/msi.h>
1655     +#include <linux/htirq.h>
1656    
1657     #include <asm/io.h>
1658     #include <asm/smp.h>
1659     @@ -38,13 +41,15 @@
1660     #include <asm/timer.h>
1661     #include <asm/i8259.h>
1662     #include <asm/nmi.h>
1663     +#include <asm/msidef.h>
1664     +#include <asm/hypertransport.h>
1665    
1666     #include <mach_apic.h>
1667     +#include <mach_apicdef.h>
1668    
1669     #include "io_ports.h"
1670    
1671     #ifdef CONFIG_XEN
1672     -
1673     #include <xen/interface/xen.h>
1674     #include <xen/interface/physdev.h>
1675    
1676     @@ -55,32 +60,7 @@
1677    
1678     unsigned long io_apic_irqs;
1679    
1680     -static inline unsigned int xen_io_apic_read(unsigned int apic, unsigned int reg)
1681     -{
1682     - struct physdev_apic apic_op;
1683     - int ret;
1684     -
1685     - apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
1686     - apic_op.reg = reg;
1687     - ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
1688     - if (ret)
1689     - return ret;
1690     - return apic_op.value;
1691     -}
1692     -
1693     -static inline void xen_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
1694     -{
1695     - struct physdev_apic apic_op;
1696     -
1697     - apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
1698     - apic_op.reg = reg;
1699     - apic_op.value = value;
1700     - WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
1701     -}
1702     -
1703     -#define io_apic_read(a,r) xen_io_apic_read(a,r)
1704     -#define io_apic_write(a,r,v) xen_io_apic_write(a,r,v)
1705     -
1706     +#define clear_IO_APIC() ((void)0)
1707     #endif /* CONFIG_XEN */
1708    
1709     int (*ioapic_renumber_irq)(int ioapic, int irq);
1710     @@ -105,7 +85,7 @@
1711     */
1712     int nr_ioapic_registers[MAX_IO_APICS];
1713    
1714     -int disable_timer_pin_1 __initdata;
1715     +static int disable_timer_pin_1 __initdata;
1716    
1717     /*
1718     * Rough estimation of how many shared IRQs there are, can
1719     @@ -125,12 +105,122 @@
1720     int apic, pin, next;
1721     } irq_2_pin[PIN_MAP_SIZE];
1722    
1723     -int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1};
1724     -#ifdef CONFIG_PCI_MSI
1725     -#define vector_to_irq(vector) \
1726     - (platform_legacy_irq(vector) ? vector : vector_irq[vector])
1727     +#ifndef CONFIG_XEN
1728     +struct io_apic {
1729     + unsigned int index;
1730     + unsigned int unused[3];
1731     + unsigned int data;
1732     +};
1733     +
1734     +static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
1735     +{
1736     + return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
1737     + + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
1738     +}
1739     +#endif
1740     +
1741     +static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
1742     +{
1743     +#ifndef CONFIG_XEN
1744     + struct io_apic __iomem *io_apic = io_apic_base(apic);
1745     + writel(reg, &io_apic->index);
1746     + return readl(&io_apic->data);
1747     +#else
1748     + struct physdev_apic apic_op;
1749     + int ret;
1750     +
1751     + apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
1752     + apic_op.reg = reg;
1753     + ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
1754     + if (ret)
1755     + return ret;
1756     + return apic_op.value;
1757     +#endif
1758     +}
1759     +
1760     +static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
1761     +{
1762     +#ifndef CONFIG_XEN
1763     + struct io_apic __iomem *io_apic = io_apic_base(apic);
1764     + writel(reg, &io_apic->index);
1765     + writel(value, &io_apic->data);
1766     +#else
1767     + struct physdev_apic apic_op;
1768     +
1769     + apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
1770     + apic_op.reg = reg;
1771     + apic_op.value = value;
1772     + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
1773     +#endif
1774     +}
1775     +
1776     +#ifndef CONFIG_XEN
1777     +/*
1778     + * Re-write a value: to be used for read-modify-write
1779     + * cycles where the read already set up the index register.
1780     + *
1781     + * Older SiS APIC requires we rewrite the index register
1782     + */
1783     +static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
1784     +{
1785     + volatile struct io_apic *io_apic = io_apic_base(apic);
1786     + if (sis_apic_bug)
1787     + writel(reg, &io_apic->index);
1788     + writel(value, &io_apic->data);
1789     +}
1790     #else
1791     -#define vector_to_irq(vector) (vector)
1792     +#define io_apic_modify io_apic_write
1793     +#endif
1794     +
1795     +union entry_union {
1796     + struct { u32 w1, w2; };
1797     + struct IO_APIC_route_entry entry;
1798     +};
1799     +
1800     +static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
1801     +{
1802     + union entry_union eu;
1803     + unsigned long flags;
1804     + spin_lock_irqsave(&ioapic_lock, flags);
1805     + eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
1806     + eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
1807     + spin_unlock_irqrestore(&ioapic_lock, flags);
1808     + return eu.entry;
1809     +}
1810     +
1811     +/*
1812     + * When we write a new IO APIC routing entry, we need to write the high
1813     + * word first! If the mask bit in the low word is clear, we will enable
1814     + * the interrupt, and we need to make sure the entry is fully populated
1815     + * before that happens.
1816     + */
1817     +static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
1818     +{
1819     + unsigned long flags;
1820     + union entry_union eu;
1821     + eu.entry = e;
1822     + spin_lock_irqsave(&ioapic_lock, flags);
1823     + io_apic_write(apic, 0x11 + 2*pin, eu.w2);
1824     + io_apic_write(apic, 0x10 + 2*pin, eu.w1);
1825     + spin_unlock_irqrestore(&ioapic_lock, flags);
1826     +}
1827     +
1828     +#ifndef CONFIG_XEN
1829     +/*
1830     + * When we mask an IO APIC routing entry, we need to write the low
1831     + * word first, in order to set the mask bit before we change the
1832     + * high bits!
1833     + */
1834     +static void ioapic_mask_entry(int apic, int pin)
1835     +{
1836     + unsigned long flags;
1837     + union entry_union eu = { .entry.mask = 1 };
1838     +
1839     + spin_lock_irqsave(&ioapic_lock, flags);
1840     + io_apic_write(apic, 0x10 + 2*pin, eu.w1);
1841     + io_apic_write(apic, 0x11 + 2*pin, eu.w2);
1842     + spin_unlock_irqrestore(&ioapic_lock, flags);
1843     +}
1844     #endif
1845    
1846     /*
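[Editor's note: with ioapic_read_entry()/ioapic_write_entry() in place, callers stop casting route entries to int pairs, and the documented word-ordering rules live in one spot. A typical read-modify-write through the helpers, as a minimal sketch:

    static void set_pin_mask(int apic, int pin, int masked)
    {
            struct IO_APIC_route_entry e = ioapic_read_entry(apic, pin);

            e.mask = masked ? 1 : 0;
            /* ioapic_write_entry() writes the high word first, so the
             * entry is fully populated before the mask bit can clear. */
            ioapic_write_entry(apic, pin, e);
    }
]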
1847     @@ -156,9 +246,7 @@
1848     entry->pin = pin;
1849     }
1850    
1851     -#ifdef CONFIG_XEN
1852     -#define clear_IO_APIC() ((void)0)
1853     -#else
1854     +#ifndef CONFIG_XEN
1855     /*
1856     * Reroute an IRQ to a different pin.
1857     */
1858     @@ -243,25 +331,16 @@
1859     static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
1860     {
1861     struct IO_APIC_route_entry entry;
1862     - unsigned long flags;
1863    
1864     /* Check delivery_mode to be sure we're not clearing an SMI pin */
1865     - spin_lock_irqsave(&ioapic_lock, flags);
1866     - *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
1867     - *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
1868     - spin_unlock_irqrestore(&ioapic_lock, flags);
1869     + entry = ioapic_read_entry(apic, pin);
1870     if (entry.delivery_mode == dest_SMI)
1871     return;
1872    
1873     /*
1874     * Disable it in the IO-APIC irq-routing table:
1875     */
1876     - memset(&entry, 0, sizeof(entry));
1877     - entry.mask = 1;
1878     - spin_lock_irqsave(&ioapic_lock, flags);
1879     - io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
1880     - io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
1881     - spin_unlock_irqrestore(&ioapic_lock, flags);
1882     + ioapic_mask_entry(apic, pin);
1883     }
1884    
1885     static void clear_IO_APIC (void)
1886     @@ -301,7 +380,7 @@
1887     break;
1888     entry = irq_2_pin + entry->next;
1889     }
1890     - set_irq_info(irq, cpumask);
1891     + set_native_irq_info(irq, cpumask);
1892     spin_unlock_irqrestore(&ioapic_lock, flags);
1893     }
1894    
1895     @@ -1207,40 +1286,40 @@
1896     /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
1897     u8 irq_vector[NR_IRQ_VECTORS] __read_mostly; /* = { FIRST_DEVICE_VECTOR , 0 }; */
1898    
1899     -int assign_irq_vector(int irq)
1900     +static int __assign_irq_vector(int irq)
1901     {
1902     - unsigned long flags;
1903     int vector;
1904     struct physdev_irq irq_op;
1905    
1906     - BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS);
1907     -
1908     - spin_lock_irqsave(&vector_lock, flags);
1909     + BUG_ON((unsigned)irq >= NR_IRQ_VECTORS);
1910    
1911     - if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) {
1912     - spin_unlock_irqrestore(&vector_lock, flags);
1913     - return IO_APIC_VECTOR(irq);
1914     - }
1915     + if (irq_vector[irq] > 0)
1916     + return irq_vector[irq];
1917    
1918     irq_op.irq = irq;
1919     - if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) {
1920     - spin_unlock_irqrestore(&vector_lock, flags);
1921     + if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op))
1922     return -ENOSPC;
1923     - }
1924    
1925     vector = irq_op.vector;
1926     - vector_irq[vector] = irq;
1927     - if (irq != AUTO_ASSIGN)
1928     - IO_APIC_VECTOR(irq) = vector;
1929     + irq_vector[irq] = vector;
1930     +
1931     + return vector;
1932     +}
1933    
1934     +static int assign_irq_vector(int irq)
1935     +{
1936     + unsigned long flags;
1937     + int vector;
1938     +
1939     + spin_lock_irqsave(&vector_lock, flags);
1940     + vector = __assign_irq_vector(irq);
1941     spin_unlock_irqrestore(&vector_lock, flags);
1942    
1943     return vector;
1944     }
1945    
1946     #ifndef CONFIG_XEN
1947     -static struct hw_interrupt_type ioapic_level_type;
1948     -static struct hw_interrupt_type ioapic_edge_type;
1949     +static struct irq_chip ioapic_chip;
1950    
1951     #define IOAPIC_AUTO -1
1952     #define IOAPIC_EDGE 0
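[Editor's note: splitting assign_irq_vector() into a bare __assign_irq_vector() plus a locking wrapper is the usual pattern for letting callers that already hold vector_lock (create_irq() later in this patch, for instance) reuse the logic without self-deadlock. The shape in the abstract, with illustrative names:

    static DEFINE_SPINLOCK(res_lock);

    static int __alloc_res(int id)          /* caller holds res_lock */
    {
            /* ... the allocation proper ... */
            return id;
    }

    static int alloc_res(int id)            /* locking wrapper */
    {
            unsigned long flags;
            int ret;

            spin_lock_irqsave(&res_lock, flags);
            ret = __alloc_res(id);
            spin_unlock_irqrestore(&res_lock, flags);
            return ret;
    }
]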
1953     @@ -1248,16 +1327,16 @@
1954    
1955     static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
1956     {
1957     - unsigned idx;
1958     -
1959     - idx = use_pci_vector() && !platform_legacy_irq(irq) ? vector : irq;
1960     -
1961     if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
1962     trigger == IOAPIC_LEVEL)
1963     - irq_desc[idx].chip = &ioapic_level_type;
1964     - else
1965     - irq_desc[idx].chip = &ioapic_edge_type;
1966     - set_intr_gate(vector, interrupt[idx]);
1967     + set_irq_chip_and_handler_name(irq, &ioapic_chip,
1968     + handle_fasteoi_irq, "fasteoi");
1969     + else {
1970     + irq_desc[irq].status |= IRQ_DELAYED_DISABLE;
1971     + set_irq_chip_and_handler_name(irq, &ioapic_chip,
1972     + handle_edge_irq, "edge");
1973     + }
1974     + set_intr_gate(vector, interrupt[irq]);
1975     }
1976     #else
1977     #define ioapic_register_intr(_irq,_vector,_trigger) ((void)0)
1978     @@ -1328,9 +1407,8 @@
1979     if (!apic && (irq < 16))
1980     disable_8259A_irq(irq);
1981     }
1982     + ioapic_write_entry(apic, pin, entry);
1983     spin_lock_irqsave(&ioapic_lock, flags);
1984     - io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
1985     - io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
1986     set_native_irq_info(irq, TARGET_CPUS);
1987     spin_unlock_irqrestore(&ioapic_lock, flags);
1988     }
1989     @@ -1347,7 +1425,6 @@
1990     static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
1991     {
1992     struct IO_APIC_route_entry entry;
1993     - unsigned long flags;
1994    
1995     memset(&entry,0,sizeof(entry));
1996    
1997     @@ -1372,15 +1449,13 @@
1998     * The timer IRQ doesn't have to know that behind the
1999     * scene we have an 8259A-master in AEOI mode ...
2000     */
2001     - irq_desc[0].chip = &ioapic_edge_type;
2002     + irq_desc[0].chip = &ioapic_chip;
2003     + set_irq_handler(0, handle_edge_irq);
2004    
2005     /*
2006     * Add it to the IO-APIC irq-routing table:
2007     */
2008     - spin_lock_irqsave(&ioapic_lock, flags);
2009     - io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
2010     - io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
2011     - spin_unlock_irqrestore(&ioapic_lock, flags);
2012     + ioapic_write_entry(apic, pin, entry);
2013    
2014     enable_8259A_irq(0);
2015     }
2016     @@ -1490,10 +1565,7 @@
2017     for (i = 0; i <= reg_01.bits.entries; i++) {
2018     struct IO_APIC_route_entry entry;
2019    
2020     - spin_lock_irqsave(&ioapic_lock, flags);
2021     - *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2);
2022     - *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2);
2023     - spin_unlock_irqrestore(&ioapic_lock, flags);
2024     + entry = ioapic_read_entry(apic, i);
2025    
2026     printk(KERN_DEBUG " %02x %03X %02X ",
2027     i,
2028     @@ -1513,17 +1585,12 @@
2029     );
2030     }
2031     }
2032     - if (use_pci_vector())
2033     - printk(KERN_INFO "Using vector-based indexing\n");
2034     printk(KERN_DEBUG "IRQ to pin mappings:\n");
2035     for (i = 0; i < NR_IRQS; i++) {
2036     struct irq_pin_list *entry = irq_2_pin + i;
2037     if (entry->pin < 0)
2038     continue;
2039     - if (use_pci_vector() && !platform_legacy_irq(i))
2040     - printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i));
2041     - else
2042     - printk(KERN_DEBUG "IRQ%d ", i);
2043     + printk(KERN_DEBUG "IRQ%d ", i);
2044     for (;;) {
2045     printk("-> %d:%d", entry->apic, entry->pin);
2046     if (!entry->next)
2047     @@ -1709,10 +1776,7 @@
2048     /* See if any of the pins is in ExtINT mode */
2049     for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
2050     struct IO_APIC_route_entry entry;
2051     - spin_lock_irqsave(&ioapic_lock, flags);
2052     - *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
2053     - *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
2054     - spin_unlock_irqrestore(&ioapic_lock, flags);
2055     + entry = ioapic_read_entry(apic, pin);
2056    
2057    
2058     /* If the interrupt line is enabled and in ExtInt mode
2059     @@ -1770,7 +1834,6 @@
2060     */
2061     if (ioapic_i8259.pin != -1) {
2062     struct IO_APIC_route_entry entry;
2063     - unsigned long flags;
2064    
2065     memset(&entry, 0, sizeof(entry));
2066     entry.mask = 0; /* Enabled */
2067     @@ -1787,12 +1850,7 @@
2068     /*
2069     * Add it to the IO-APIC irq-routing table:
2070     */
2071     - spin_lock_irqsave(&ioapic_lock, flags);
2072     - io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin,
2073     - *(((int *)&entry)+1));
2074     - io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin,
2075     - *(((int *)&entry)+0));
2076     - spin_unlock_irqrestore(&ioapic_lock, flags);
2077     + ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
2078     }
2079     disconnect_bsp_APIC(ioapic_i8259.pin != -1);
2080     #endif
2081     @@ -1959,6 +2017,8 @@
2082     */
2083    
2084     /*
2085     + * Startup quirk:
2086     + *
2087     * Starting up an edge-triggered IO-APIC interrupt is
2088     * nasty - we need to make sure that we get the edge.
2089     * If it is already asserted for some reason, we need
2090     @@ -1966,8 +2026,10 @@
2091     *
2092     * This is not complete - we should be able to fake
2093     * an edge even if it isn't on the 8259A...
2094     + *
2095     + * (We do this for level-triggered IRQs too - it cannot hurt.)
2096     */
2097     -static unsigned int startup_edge_ioapic_irq(unsigned int irq)
2098     +static unsigned int startup_ioapic_irq(unsigned int irq)
2099     {
2100     int was_pending = 0;
2101     unsigned long flags;
2102     @@ -1984,47 +2046,18 @@
2103     return was_pending;
2104     }
2105    
2106     -/*
2107     - * Once we have recorded IRQ_PENDING already, we can mask the
2108     - * interrupt for real. This prevents IRQ storms from unhandled
2109     - * devices.
2110     - */
2111     -static void ack_edge_ioapic_irq(unsigned int irq)
2112     -{
2113     - move_irq(irq);
2114     - if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
2115     - == (IRQ_PENDING | IRQ_DISABLED))
2116     - mask_IO_APIC_irq(irq);
2117     - ack_APIC_irq();
2118     -}
2119     -
2120     -/*
2121     - * Level triggered interrupts can just be masked,
2122     - * and shutting down and starting up the interrupt
2123     - * is the same as enabling and disabling them -- except
2124     - * with a startup need to return a "was pending" value.
2125     - *
2126     - * Level triggered interrupts are special because we
2127     - * do not touch any IO-APIC register while handling
2128     - * them. We ack the APIC in the end-IRQ handler, not
2129     - * in the start-IRQ-handler. Protection against reentrance
2130     - * from the same interrupt is still provided, both by the
2131     - * generic IRQ layer and by the fact that an unacked local
2132     - * APIC does not accept IRQs.
2133     - */
2134     -static unsigned int startup_level_ioapic_irq (unsigned int irq)
2135     +static void ack_ioapic_irq(unsigned int irq)
2136     {
2137     - unmask_IO_APIC_irq(irq);
2138     -
2139     - return 0; /* don't check for pending */
2140     + move_native_irq(irq);
2141     + ack_APIC_irq();
2142     }
2143    
2144     -static void end_level_ioapic_irq (unsigned int irq)
2145     +static void ack_ioapic_quirk_irq(unsigned int irq)
2146     {
2147     unsigned long v;
2148     int i;
2149    
2150     - move_irq(irq);
2151     + move_native_irq(irq);
2152     /*
2153     * It appears there is an erratum which affects at least version 0x11
2154     * of I/O APIC (that's the 82093AA and cores integrated into various
2155     @@ -2044,7 +2077,7 @@
2156     * operation to prevent an edge-triggered interrupt escaping meanwhile.
2157     * The idea is from Manfred Spraul. --macro
2158     */
2159     - i = IO_APIC_VECTOR(irq);
2160     + i = irq_vector[irq];
2161    
2162     v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
2163    
2164     @@ -2059,104 +2092,24 @@
2165     }
2166     }
2167    
2168     -#ifdef CONFIG_PCI_MSI
2169     -static unsigned int startup_edge_ioapic_vector(unsigned int vector)
2170     -{
2171     - int irq = vector_to_irq(vector);
2172     -
2173     - return startup_edge_ioapic_irq(irq);
2174     -}
2175     -
2176     -static void ack_edge_ioapic_vector(unsigned int vector)
2177     -{
2178     - int irq = vector_to_irq(vector);
2179     -
2180     - move_native_irq(vector);
2181     - ack_edge_ioapic_irq(irq);
2182     -}
2183     -
2184     -static unsigned int startup_level_ioapic_vector (unsigned int vector)
2185     -{
2186     - int irq = vector_to_irq(vector);
2187     -
2188     - return startup_level_ioapic_irq (irq);
2189     -}
2190     -
2191     -static void end_level_ioapic_vector (unsigned int vector)
2192     -{
2193     - int irq = vector_to_irq(vector);
2194     -
2195     - move_native_irq(vector);
2196     - end_level_ioapic_irq(irq);
2197     -}
2198     -
2199     -static void mask_IO_APIC_vector (unsigned int vector)
2200     -{
2201     - int irq = vector_to_irq(vector);
2202     -
2203     - mask_IO_APIC_irq(irq);
2204     -}
2205     -
2206     -static void unmask_IO_APIC_vector (unsigned int vector)
2207     -{
2208     - int irq = vector_to_irq(vector);
2209     -
2210     - unmask_IO_APIC_irq(irq);
2211     -}
2212     -
2213     -#ifdef CONFIG_SMP
2214     -static void set_ioapic_affinity_vector (unsigned int vector,
2215     - cpumask_t cpu_mask)
2216     -{
2217     - int irq = vector_to_irq(vector);
2218     -
2219     - set_native_irq_info(vector, cpu_mask);
2220     - set_ioapic_affinity_irq(irq, cpu_mask);
2221     -}
2222     -#endif
2223     -#endif
2224     -
2225     -static int ioapic_retrigger(unsigned int irq)
2226     +static int ioapic_retrigger_irq(unsigned int irq)
2227     {
2228     - send_IPI_self(IO_APIC_VECTOR(irq));
2229     + send_IPI_self(irq_vector[irq]);
2230    
2231     return 1;
2232     }
2233    
2234     -/*
2235     - * Level and edge triggered IO-APIC interrupts need different handling,
2236     - * so we use two separate IRQ descriptors. Edge triggered IRQs can be
2237     - * handled with the level-triggered descriptor, but that one has slightly
2238     - * more overhead. Level-triggered interrupts cannot be handled with the
2239     - * edge-triggered handler, without risking IRQ storms and other ugly
2240     - * races.
2241     - */
2242     -static struct hw_interrupt_type ioapic_edge_type __read_mostly = {
2243     - .typename = "IO-APIC-edge",
2244     - .startup = startup_edge_ioapic,
2245     - .shutdown = shutdown_edge_ioapic,
2246     - .enable = enable_edge_ioapic,
2247     - .disable = disable_edge_ioapic,
2248     - .ack = ack_edge_ioapic,
2249     - .end = end_edge_ioapic,
2250     -#ifdef CONFIG_SMP
2251     - .set_affinity = set_ioapic_affinity,
2252     -#endif
2253     - .retrigger = ioapic_retrigger,
2254     -};
2255     -
2256     -static struct hw_interrupt_type ioapic_level_type __read_mostly = {
2257     - .typename = "IO-APIC-level",
2258     - .startup = startup_level_ioapic,
2259     - .shutdown = shutdown_level_ioapic,
2260     - .enable = enable_level_ioapic,
2261     - .disable = disable_level_ioapic,
2262     - .ack = mask_and_ack_level_ioapic,
2263     - .end = end_level_ioapic,
2264     +static struct irq_chip ioapic_chip __read_mostly = {
2265     + .name = "IO-APIC",
2266     + .startup = startup_ioapic_irq,
2267     + .mask = mask_IO_APIC_irq,
2268     + .unmask = unmask_IO_APIC_irq,
2269     + .ack = ack_ioapic_irq,
2270     + .eoi = ack_ioapic_quirk_irq,
2271     #ifdef CONFIG_SMP
2272     - .set_affinity = set_ioapic_affinity,
2273     + .set_affinity = set_ioapic_affinity_irq,
2274     #endif
2275     - .retrigger = ioapic_retrigger,
2276     + .retrigger = ioapic_retrigger_irq,
2277     };
2278     #endif /* !CONFIG_XEN */
2279    
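[Editor's note: this hunk is the genirq conversion proper. The two hw_interrupt_type descriptors with their startup/shutdown/enable/disable/ack/end slots collapse into a single struct irq_chip, and the edge/level distinction moves into the flow handler chosen at set_irq_chip_and_handler_name() time. Stripped to its essentials (handler bodies are placeholders):

    static void demo_mask(unsigned int irq)   { /* mask the line */ }
    static void demo_unmask(unsigned int irq) { /* unmask the line */ }
    static void demo_ack(unsigned int irq)    { /* ack the controller */ }

    static struct irq_chip demo_chip = {
            .name   = "demo",
            .mask   = demo_mask,
            .unmask = demo_unmask,
            .ack    = demo_ack,     /* used by handle_edge_irq */
            .eoi    = demo_ack,     /* used by handle_fasteoi_irq */
    };

    /* edge:  set_irq_chip_and_handler_name(irq, &demo_chip,
     *                                      handle_edge_irq, "edge");
     * level: set_irq_chip_and_handler_name(irq, &demo_chip,
     *                                      handle_fasteoi_irq, "fasteoi"); */
]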
2280     @@ -2177,12 +2130,7 @@
2281     */
2282     for (irq = 0; irq < NR_IRQS ; irq++) {
2283     int tmp = irq;
2284     - if (use_pci_vector()) {
2285     - if (!platform_legacy_irq(tmp))
2286     - if ((tmp = vector_to_irq(tmp)) == -1)
2287     - continue;
2288     - }
2289     - if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) {
2290     + if (IO_APIC_IRQ(tmp) && !irq_vector[tmp]) {
2291     /*
2292     * Hmm.. We don't have an entry for this,
2293     * so default to an old-fashioned 8259
2294     @@ -2193,22 +2141,23 @@
2295     #ifndef CONFIG_XEN
2296     else
2297     /* Strange. Oh, well.. */
2298     - irq_desc[irq].chip = &no_irq_type;
2299     + irq_desc[irq].chip = &no_irq_chip;
2300     #endif
2301     }
2302     }
2303     }
2304    
2305     #ifndef CONFIG_XEN
2306     -static void enable_lapic_irq (unsigned int irq)
2307     -{
2308     - unsigned long v;
2309     +/*
2310     + * The local APIC irq-chip implementation:
2311     + */
2312    
2313     - v = apic_read(APIC_LVT0);
2314     - apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
2315     +static void ack_apic(unsigned int irq)
2316     +{
2317     + ack_APIC_irq();
2318     }
2319    
2320     -static void disable_lapic_irq (unsigned int irq)
2321     +static void mask_lapic_irq (unsigned int irq)
2322     {
2323     unsigned long v;
2324    
2325     @@ -2216,21 +2165,19 @@
2326     apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
2327     }
2328    
2329     -static void ack_lapic_irq (unsigned int irq)
2330     +static void unmask_lapic_irq (unsigned int irq)
2331     {
2332     - ack_APIC_irq();
2333     -}
2334     + unsigned long v;
2335    
2336     -static void end_lapic_irq (unsigned int i) { /* nothing */ }
2337     + v = apic_read(APIC_LVT0);
2338     + apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
2339     +}
2340    
2341     -static struct hw_interrupt_type lapic_irq_type __read_mostly = {
2342     - .typename = "local-APIC-edge",
2343     - .startup = NULL, /* startup_irq() not used for IRQ0 */
2344     - .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
2345     - .enable = enable_lapic_irq,
2346     - .disable = disable_lapic_irq,
2347     - .ack = ack_lapic_irq,
2348     - .end = end_lapic_irq
2349     +static struct irq_chip lapic_chip __read_mostly = {
2350     + .name = "local-APIC-edge",
2351     + .mask = mask_lapic_irq,
2352     + .unmask = unmask_lapic_irq,
2353     + .eoi = ack_apic,
2354     };
2355    
2356     static void setup_nmi (void)
2357     @@ -2263,17 +2210,13 @@
2358     int apic, pin, i;
2359     struct IO_APIC_route_entry entry0, entry1;
2360     unsigned char save_control, save_freq_select;
2361     - unsigned long flags;
2362    
2363     pin = find_isa_irq_pin(8, mp_INT);
2364     apic = find_isa_irq_apic(8, mp_INT);
2365     if (pin == -1)
2366     return;
2367    
2368     - spin_lock_irqsave(&ioapic_lock, flags);
2369     - *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
2370     - *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
2371     - spin_unlock_irqrestore(&ioapic_lock, flags);
2372     + entry0 = ioapic_read_entry(apic, pin);
2373     clear_IO_APIC_pin(apic, pin);
2374    
2375     memset(&entry1, 0, sizeof(entry1));
2376     @@ -2286,10 +2229,7 @@
2377     entry1.trigger = 0;
2378     entry1.vector = 0;
2379    
2380     - spin_lock_irqsave(&ioapic_lock, flags);
2381     - io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
2382     - io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
2383     - spin_unlock_irqrestore(&ioapic_lock, flags);
2384     + ioapic_write_entry(apic, pin, entry1);
2385    
2386     save_control = CMOS_READ(RTC_CONTROL);
2387     save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
2388     @@ -2308,10 +2248,7 @@
2389     CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
2390     clear_IO_APIC_pin(apic, pin);
2391    
2392     - spin_lock_irqsave(&ioapic_lock, flags);
2393     - io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
2394     - io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
2395     - spin_unlock_irqrestore(&ioapic_lock, flags);
2396     + ioapic_write_entry(apic, pin, entry0);
2397     }
2398    
2399     int timer_uses_ioapic_pin_0;
2400     @@ -2411,7 +2348,8 @@
2401     printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
2402    
2403     disable_8259A_irq(0);
2404     - irq_desc[0].chip = &lapic_irq_type;
2405     + set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq,
2406     + "fasteoi");
2407     apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
2408     enable_8259A_irq(0);
2409    
2410     @@ -2523,17 +2461,12 @@
2411     {
2412     struct IO_APIC_route_entry *entry;
2413     struct sysfs_ioapic_data *data;
2414     - unsigned long flags;
2415     int i;
2416    
2417     data = container_of(dev, struct sysfs_ioapic_data, dev);
2418     entry = data->entry;
2419     - spin_lock_irqsave(&ioapic_lock, flags);
2420     - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
2421     - *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i);
2422     - *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i);
2423     - }
2424     - spin_unlock_irqrestore(&ioapic_lock, flags);
2425     + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
2426     + entry[i] = ioapic_read_entry(dev->id, i);
2427    
2428     return 0;
2429     }
2430     @@ -2555,11 +2488,9 @@
2431     reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
2432     io_apic_write(dev->id, 0, reg_00.raw);
2433     }
2434     - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
2435     - io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1));
2436     - io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0));
2437     - }
2438     spin_unlock_irqrestore(&ioapic_lock, flags);
2439     + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
2440     + ioapic_write_entry(dev->id, i, entry[i]);
2441    
2442     return 0;
2443     }
2444     @@ -2605,6 +2536,240 @@
2445    
2446     device_initcall(ioapic_init_sysfs);
2447    
2448     +#ifndef CONFIG_XEN
2449     +/*
2450     + * Dynamic irq allocate and deallocation
2451     + */
2452     +int create_irq(void)
2453     +{
2454     + /* Allocate an unused irq */
2455     + int irq, new, vector;
2456     + unsigned long flags;
2457     +
2458     + irq = -ENOSPC;
2459     + spin_lock_irqsave(&vector_lock, flags);
2460     + for (new = (NR_IRQS - 1); new >= 0; new--) {
2461     + if (platform_legacy_irq(new))
2462     + continue;
2463     + if (irq_vector[new] != 0)
2464     + continue;
2465     + vector = __assign_irq_vector(new);
2466     + if (likely(vector > 0))
2467     + irq = new;
2468     + break;
2469     + }
2470     + spin_unlock_irqrestore(&vector_lock, flags);
2471     +
2472     + if (irq >= 0) {
2473     + set_intr_gate(vector, interrupt[irq]);
2474     + dynamic_irq_init(irq);
2475     + }
2476     + return irq;
2477     +}
2478     +
2479     +void destroy_irq(unsigned int irq)
2480     +{
2481     + unsigned long flags;
2482     +
2483     + dynamic_irq_cleanup(irq);
2484     +
2485     + spin_lock_irqsave(&vector_lock, flags);
2486     + irq_vector[irq] = 0;
2487     + spin_unlock_irqrestore(&vector_lock, flags);
2488     +}
2489     +#endif
2490     +
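[Editor's note: create_irq()/destroy_irq() provide the dynamic interrupts the MSI and HT code below relies on. A consumer pairs them roughly like this (my_handler and dev are illustrative):

    int irq = create_irq();

    if (irq < 0)
            return irq;             /* -ENOSPC: no free vector */
    if (request_irq(irq, my_handler, 0, "my-dev", dev)) {
            destroy_irq(irq);
            return -EBUSY;
    }
]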
2491     +/*
2492     + * MSI message composition
2493     + */
2494     +#ifdef CONFIG_PCI_MSI
2495     +static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
2496     +{
2497     + int vector;
2498     + unsigned dest;
2499     +
2500     + vector = assign_irq_vector(irq);
2501     + if (vector >= 0) {
2502     + dest = cpu_mask_to_apicid(TARGET_CPUS);
2503     +
2504     + msg->address_hi = MSI_ADDR_BASE_HI;
2505     + msg->address_lo =
2506     + MSI_ADDR_BASE_LO |
2507     + ((INT_DEST_MODE == 0) ?
2508     + MSI_ADDR_DEST_MODE_PHYSICAL:
2509     + MSI_ADDR_DEST_MODE_LOGICAL) |
2510     + ((INT_DELIVERY_MODE != dest_LowestPrio) ?
2511     + MSI_ADDR_REDIRECTION_CPU:
2512     + MSI_ADDR_REDIRECTION_LOWPRI) |
2513     + MSI_ADDR_DEST_ID(dest);
2514     +
2515     + msg->data =
2516     + MSI_DATA_TRIGGER_EDGE |
2517     + MSI_DATA_LEVEL_ASSERT |
2518     + ((INT_DELIVERY_MODE != dest_LowestPrio) ?
2519     + MSI_DATA_DELIVERY_FIXED:
2520     + MSI_DATA_DELIVERY_LOWPRI) |
2521     + MSI_DATA_VECTOR(vector);
2522     + }
2523     + return vector;
2524     +}
2525     +
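[Editor's note: msi_compose_msg() builds the standard x86 MSI address/data pair: the address word selects destination mode and redirection and carries the APIC destination ID, the data word carries trigger mode and vector. Worked through for a hypothetical case, vector 0x31 delivered fixed/physical to APIC ID 3 (constants per asm/msidef.h):

    /* address_hi = MSI_ADDR_BASE_HI                  (0x00000000)
     * address_lo = MSI_ADDR_BASE_LO                  (0xfee00000)
     *            | MSI_ADDR_DEST_MODE_PHYSICAL
     *            | MSI_ADDR_REDIRECTION_CPU
     *            | MSI_ADDR_DEST_ID(3)     dest ID in bits 12-19
     * data       = MSI_DATA_TRIGGER_EDGE
     *            | MSI_DATA_LEVEL_ASSERT
     *            | MSI_DATA_DELIVERY_FIXED
     *            | MSI_DATA_VECTOR(0x31)   vector in bits 0-7
     */
]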
2526     +#ifdef CONFIG_SMP
2527     +static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
2528     +{
2529     + struct msi_msg msg;
2530     + unsigned int dest;
2531     + cpumask_t tmp;
2532     + int vector;
2533     +
2534     + cpus_and(tmp, mask, cpu_online_map);
2535     + if (cpus_empty(tmp))
2536     + tmp = TARGET_CPUS;
2537     +
2538     + vector = assign_irq_vector(irq);
2539     + if (vector < 0)
2540     + return;
2541     +
2542     + dest = cpu_mask_to_apicid(mask);
2543     +
2544     + read_msi_msg(irq, &msg);
2545     +
2546     + msg.data &= ~MSI_DATA_VECTOR_MASK;
2547     + msg.data |= MSI_DATA_VECTOR(vector);
2548     + msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
2549     + msg.address_lo |= MSI_ADDR_DEST_ID(dest);
2550     +
2551     + write_msi_msg(irq, &msg);
2552     + set_native_irq_info(irq, mask);
2553     +}
2554     +#endif /* CONFIG_SMP */
2555     +
2556     +/*
2557     + * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
2558     + * which implement the MSI or MSI-X Capability Structure.
2559     + */
2560     +static struct irq_chip msi_chip = {
2561     + .name = "PCI-MSI",
2562     + .unmask = unmask_msi_irq,
2563     + .mask = mask_msi_irq,
2564     + .ack = ack_ioapic_irq,
2565     +#ifdef CONFIG_SMP
2566     + .set_affinity = set_msi_irq_affinity,
2567     +#endif
2568     + .retrigger = ioapic_retrigger_irq,
2569     +};
2570     +
2571     +int arch_setup_msi_irq(unsigned int irq, struct pci_dev *dev)
2572     +{
2573     + struct msi_msg msg;
2574     + int ret;
2575     + ret = msi_compose_msg(dev, irq, &msg);
2576     + if (ret < 0)
2577     + return ret;
2578     +
2579     + write_msi_msg(irq, &msg);
2580     +
2581     + set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq,
2582     + "edge");
2583     +
2584     + return 0;
2585     +}
2586     +
2587     +void arch_teardown_msi_irq(unsigned int irq)
2588     +{
2589     + return;
2590     +}
2591     +
2592     +#endif /* CONFIG_PCI_MSI */
2593     +
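[Editor's note: none of this is called by drivers directly; pci_enable_msi() makes the MSI core allocate an irq and route it through arch_setup_msi_irq() above. A hedged consumer sketch (my_handler is illustrative):

    if (pci_enable_msi(pdev) == 0) {
            if (request_irq(pdev->irq, my_handler, 0, "mydev", pdev)) {
                    pci_disable_msi(pdev);
                    /* fall back to legacy INTx delivery */
            }
    }
]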
2594     +/*
2595     + * Hypertransport interrupt support
2596     + */
2597     +#ifdef CONFIG_HT_IRQ
2598     +
2599     +#ifdef CONFIG_SMP
2600     +
2601     +static void target_ht_irq(unsigned int irq, unsigned int dest)
2602     +{
2603     + struct ht_irq_msg msg;
2604     + fetch_ht_irq_msg(irq, &msg);
2605     +
2606     + msg.address_lo &= ~(HT_IRQ_LOW_DEST_ID_MASK);
2607     + msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
2608     +
2609     + msg.address_lo |= HT_IRQ_LOW_DEST_ID(dest);
2610     + msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
2611     +
2612     + write_ht_irq_msg(irq, &msg);
2613     +}
2614     +
2615     +static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
2616     +{
2617     + unsigned int dest;
2618     + cpumask_t tmp;
2619     +
2620     + cpus_and(tmp, mask, cpu_online_map);
2621     + if (cpus_empty(tmp))
2622     + tmp = TARGET_CPUS;
2623     +
2624     + cpus_and(mask, tmp, CPU_MASK_ALL);
2625     +
2626     + dest = cpu_mask_to_apicid(mask);
2627     +
2628     + target_ht_irq(irq, dest);
2629     + set_native_irq_info(irq, mask);
2630     +}
2631     +#endif
2632     +
2633     +static struct irq_chip ht_irq_chip = {
2634     + .name = "PCI-HT",
2635     + .mask = mask_ht_irq,
2636     + .unmask = unmask_ht_irq,
2637     + .ack = ack_ioapic_irq,
2638     +#ifdef CONFIG_SMP
2639     + .set_affinity = set_ht_irq_affinity,
2640     +#endif
2641     + .retrigger = ioapic_retrigger_irq,
2642     +};
2643     +
2644     +int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
2645     +{
2646     + int vector;
2647     +
2648     + vector = assign_irq_vector(irq);
2649     + if (vector >= 0) {
2650     + struct ht_irq_msg msg;
2651     + unsigned dest;
2652     + cpumask_t tmp;
2653     +
2654     + cpus_clear(tmp);
2655     + cpu_set(vector >> 8, tmp);
2656     + dest = cpu_mask_to_apicid(tmp);
2657     +
2658     + msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
2659     +
2660     + msg.address_lo =
2661     + HT_IRQ_LOW_BASE |
2662     + HT_IRQ_LOW_DEST_ID(dest) |
2663     + HT_IRQ_LOW_VECTOR(vector) |
2664     + ((INT_DEST_MODE == 0) ?
2665     + HT_IRQ_LOW_DM_PHYSICAL :
2666     + HT_IRQ_LOW_DM_LOGICAL) |
2667     + HT_IRQ_LOW_RQEOI_EDGE |
2668     + ((INT_DELIVERY_MODE != dest_LowestPrio) ?
2669     + HT_IRQ_LOW_MT_FIXED :
2670     + HT_IRQ_LOW_MT_ARBITRATED) |
2671     + HT_IRQ_LOW_IRQ_MASKED;
2672     +
2673     + write_ht_irq_msg(irq, &msg);
2674     +
2675     + set_irq_chip_and_handler_name(irq, &ht_irq_chip,
2676     + handle_edge_irq, "edge");
2677     + }
2678     + return vector;
2679     +}
2680     +#endif /* CONFIG_HT_IRQ */
2681     +
2682     /* --------------------------------------------------------------------------
2683     ACPI-based IOAPIC Configuration
2684     -------------------------------------------------------------------------- */
2685     @@ -2758,13 +2923,34 @@
2686     if (!ioapic && (irq < 16))
2687     disable_8259A_irq(irq);
2688    
2689     + ioapic_write_entry(ioapic, pin, entry);
2690     spin_lock_irqsave(&ioapic_lock, flags);
2691     - io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
2692     - io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
2693     - set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS);
2694     + set_native_irq_info(irq, TARGET_CPUS);
2695     spin_unlock_irqrestore(&ioapic_lock, flags);
2696    
2697     return 0;
2698     }
2699    
2700     #endif /* CONFIG_ACPI */
2701     +
2702     +static int __init parse_disable_timer_pin_1(char *arg)
2703     +{
2704     + disable_timer_pin_1 = 1;
2705     + return 0;
2706     +}
2707     +early_param("disable_timer_pin_1", parse_disable_timer_pin_1);
2708     +
2709     +static int __init parse_enable_timer_pin_1(char *arg)
2710     +{
2711     + disable_timer_pin_1 = -1;
2712     + return 0;
2713     +}
2714     +early_param("enable_timer_pin_1", parse_enable_timer_pin_1);
2715     +
2716     +static int __init parse_noapic(char *arg)
2717     +{
2718     + /* disable IO-APIC */
2719     + disable_ioapic_setup();
2720     + return 0;
2721     +}
2722     +early_param("noapic", parse_noapic);
2723 niro 612 --- a/arch/x86/kernel/io_apic_64-xen.c
2724     +++ b/arch/x86/kernel/io_apic_64-xen.c
2725 niro 609 @@ -26,9 +26,12 @@
2726     #include <linux/delay.h>
2727     #include <linux/sched.h>
2728     #include <linux/smp_lock.h>
2729     +#include <linux/pci.h>
2730     #include <linux/mc146818rtc.h>
2731     #include <linux/acpi.h>
2732     #include <linux/sysdev.h>
2733     +#include <linux/msi.h>
2734     +#include <linux/htirq.h>
2735     #ifdef CONFIG_ACPI
2736     #include <acpi/acpi_bus.h>
2737     #endif
2738     @@ -41,6 +44,10 @@
2739     #include <asm/acpi.h>
2740     #include <asm/dma.h>
2741     #include <asm/nmi.h>
2742     +#include <asm/msidef.h>
2743     +#include <asm/hypertransport.h>
2744     +
2745     +static int assign_irq_vector(int irq, cpumask_t mask, cpumask_t *result);
2746    
2747     #define __apicdebuginit __init
2748    
2749     @@ -48,17 +55,29 @@
2750    
2751     static int no_timer_check;
2752    
2753     -int disable_timer_pin_1 __initdata;
2754     +static int disable_timer_pin_1 __initdata;
2755    
2756     -#ifndef CONFIG_XEN
2757     -int timer_over_8254 __initdata = 0;
2758     +#ifdef CONFIG_XEN
2759     +#include <xen/interface/xen.h>
2760     +#include <xen/interface/physdev.h>
2761     +
2762     +/* Fake i8259 */
2763     +#define make_8259A_irq(_irq) (io_apic_irqs &= ~(1UL<<(_irq)))
2764     +#define disable_8259A_irq(_irq) ((void)0)
2765     +#define i8259A_irq_pending(_irq) (0)
2766     +
2767     +unsigned long io_apic_irqs;
2768     +
2769     +#define clear_IO_APIC() ((void)0)
2770     +#else
2771     +int timer_over_8254 __initdata = 1;
2772    
2773     /* Where if anywhere is the i8259 connected in external int mode */
2774     static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
2775     #endif
2776    
2777     static DEFINE_SPINLOCK(ioapic_lock);
2778     -static DEFINE_SPINLOCK(vector_lock);
2779     +DEFINE_SPINLOCK(vector_lock);
2780    
2781     /*
2782     * # of IRQ routing registers
2783     @@ -83,28 +102,27 @@
2784     short apic, pin, next;
2785     } irq_2_pin[PIN_MAP_SIZE];
2786    
2787     -int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1};
2788     -#ifdef CONFIG_PCI_MSI
2789     -#define vector_to_irq(vector) \
2790     - (platform_legacy_irq(vector) ? vector : vector_irq[vector])
2791     -#else
2792     -#define vector_to_irq(vector) (vector)
2793     -#endif
2794     -
2795     -#ifdef CONFIG_XEN
2796     -
2797     -#include <xen/interface/xen.h>
2798     -#include <xen/interface/physdev.h>
2799     -
2800     -/* Fake i8259 */
2801     -#define make_8259A_irq(_irq) (io_apic_irqs &= ~(1UL<<(_irq)))
2802     -#define disable_8259A_irq(_irq) ((void)0)
2803     -#define i8259A_irq_pending(_irq) (0)
2804     +#ifndef CONFIG_XEN
2805     +struct io_apic {
2806     + unsigned int index;
2807     + unsigned int unused[3];
2808     + unsigned int data;
2809     +};
2810    
2811     -unsigned long io_apic_irqs;
2812     +static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
2813     +{
2814     + return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
2815     + + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
2816     +}
2817     +#endif
2818    
2819     -static inline unsigned int xen_io_apic_read(unsigned int apic, unsigned int reg)
2820     +static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
2821     {
2822     +#ifndef CONFIG_XEN
2823     + struct io_apic __iomem *io_apic = io_apic_base(apic);
2824     + writel(reg, &io_apic->index);
2825     + return readl(&io_apic->data);
2826     +#else
2827     struct physdev_apic apic_op;
2828     int ret;
2829    
2830     @@ -114,31 +132,131 @@
2831     if (ret)
2832     return ret;
2833     return apic_op.value;
2834     +#endif
2835     }
2836    
2837     -static inline void xen_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
2838     +static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
2839     {
2840     +#ifndef CONFIG_XEN
2841     + struct io_apic __iomem *io_apic = io_apic_base(apic);
2842     + writel(reg, &io_apic->index);
2843     + writel(value, &io_apic->data);
2844     +#else
2845     struct physdev_apic apic_op;
2846    
2847     apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
2848     apic_op.reg = reg;
2849     apic_op.value = value;
2850     WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
2851     +#endif
2852 niro 612 }
2853    
2854     -#define io_apic_read(a,r) xen_io_apic_read(a,r)
2855     -#define io_apic_write(a,r,v) xen_io_apic_write(a,r,v)
2856 niro 609 +#ifndef CONFIG_XEN
2857     +/*
2858     + * Re-write a value: to be used for read-modify-write
2859     + * cycles where the read already set up the index register.
2860     + */
2861     +static inline void io_apic_modify(unsigned int apic, unsigned int value)
2862     +{
2863     + struct io_apic __iomem *io_apic = io_apic_base(apic);
2864     + writel(value, &io_apic->data);
2865 niro 612 +}
2866 niro 609 +#else
2867     +#define io_apic_modify io_apic_write
2868     +#endif
2869    
2870 niro 612 -#define clear_IO_APIC() ((void)0)
2871 niro 609 +/*
2872     + * Synchronize the IO-APIC and the CPU by doing
2873     + * a dummy read from the IO-APIC
2874     + */
2875     +static inline void io_apic_sync(unsigned int apic)
2876     +{
2877     +#ifndef CONFIG_XEN
2878     + struct io_apic __iomem *io_apic = io_apic_base(apic);
2879     + readl(&io_apic->data);
2880     +#endif
2881     +}
2882    
2883 niro 612 -#else
2884 niro 609 +union entry_union {
2885     + struct { u32 w1, w2; };
2886     + struct IO_APIC_route_entry entry;
2887     +};
2888 niro 612 +
2889 niro 609 +static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
2890     +{
2891     + union entry_union eu;
2892     + unsigned long flags;
2893     + spin_lock_irqsave(&ioapic_lock, flags);
2894     + eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
2895     + eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
2896     + spin_unlock_irqrestore(&ioapic_lock, flags);
2897     + return eu.entry;
2898     +}
2899     +
2900     +/*
2901     + * When we write a new IO APIC routing entry, we need to write the high
2902     + * word first! If the mask bit in the low word is clear, we will enable
2903     + * the interrupt, and we need to make sure the entry is fully populated
2904     + * before that happens.
2905     + */
2906     +static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
2907     +{
2908     + unsigned long flags;
2909     + union entry_union eu;
2910     + eu.entry = e;
2911     + spin_lock_irqsave(&ioapic_lock, flags);
2912     + io_apic_write(apic, 0x11 + 2*pin, eu.w2);
2913     + io_apic_write(apic, 0x10 + 2*pin, eu.w1);
2914     + spin_unlock_irqrestore(&ioapic_lock, flags);
2915     +}
2916     +
2917     +#ifndef CONFIG_XEN
2918     +/*
2919     + * When we mask an IO APIC routing entry, we need to write the low
2920     + * word first, in order to set the mask bit before we change the
2921     + * high bits!
2922     + */
2923     +static void ioapic_mask_entry(int apic, int pin)
2924     +{
2925     + unsigned long flags;
2926     + union entry_union eu = { .entry.mask = 1 };
2927     +
2928     + spin_lock_irqsave(&ioapic_lock, flags);
2929     + io_apic_write(apic, 0x10 + 2*pin, eu.w1);
2930     + io_apic_write(apic, 0x11 + 2*pin, eu.w2);
2931     + spin_unlock_irqrestore(&ioapic_lock, flags);
2932     +}
2933    
2934     #ifdef CONFIG_SMP
2935     +static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
2936     +{
2937     + int apic, pin;
2938     + struct irq_pin_list *entry = irq_2_pin + irq;
2939     +
2940     + BUG_ON(irq >= NR_IRQS);
2941     + for (;;) {
2942     + unsigned int reg;
2943     + apic = entry->apic;
2944     + pin = entry->pin;
2945     + if (pin == -1)
2946     + break;
2947     + io_apic_write(apic, 0x11 + pin*2, dest);
2948     + reg = io_apic_read(apic, 0x10 + pin*2);
2949     + reg &= ~0x000000ff;
2950     + reg |= vector;
2951     + io_apic_modify(apic, reg);
2952     + if (!entry->next)
2953     + break;
2954     + entry = irq_2_pin + entry->next;
2955     + }
2956     +}
2957     +
2958     static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
2959     {
2960     unsigned long flags;
2961     unsigned int dest;
2962     cpumask_t tmp;
2963     + int vector;
2964    
2965     cpus_and(tmp, mask, cpu_online_map);
2966     if (cpus_empty(tmp))
2967     @@ -146,7 +264,11 @@
2968    
2969     cpus_and(mask, tmp, CPU_MASK_ALL);
2970    
2971     - dest = cpu_mask_to_apicid(mask);
2972     + vector = assign_irq_vector(irq, mask, &tmp);
2973     + if (vector < 0)
2974     + return;
2975     +
2976     + dest = cpu_mask_to_apicid(tmp);
2977    
2978     /*
2979     * Only the high 8 bits are valid.
2980     @@ -154,13 +276,12 @@
2981     dest = SET_APIC_LOGICAL_ID(dest);
2982    
2983     spin_lock_irqsave(&ioapic_lock, flags);
2984     - __DO_ACTION(1, = dest, )
2985     - set_irq_info(irq, mask);
2986     + __target_IO_APIC_irq(irq, dest, vector);
2987     + set_native_irq_info(irq, mask);
2988     spin_unlock_irqrestore(&ioapic_lock, flags);
2989     }
2990     #endif
2991     -
2992     -#endif /* !CONFIG_XEN */
2993     +#endif
2994    
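[Editor's note: note the ordering in set_ioapic_affinity_irq(): the vector is (re)allocated against the new mask first, since on x86-64 the vector may change with the destination set, and __target_IO_APIC_irq() then writes the destination word before rewriting the vector in the low word. The caller-side contract, sketched with illustrative variables:

    /* new_mask is the requested affinity; tmp returns what the
     * allocator actually honoured. Sketch only, not from the patch. */
    cpumask_t tmp;
    int vector = assign_irq_vector(irq, new_mask, &tmp);

    if (vector >= 0) {
            unsigned int dest = SET_APIC_LOGICAL_ID(cpu_mask_to_apicid(tmp));
            __target_IO_APIC_irq(irq, dest, vector);  /* ioapic_lock held */
    }
]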
2995     /*
2996     * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
2997     @@ -240,24 +361,15 @@
2998     static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
2999     {
3000     struct IO_APIC_route_entry entry;
3001     - unsigned long flags;
3002    
3003     /* Check delivery_mode to be sure we're not clearing an SMI pin */
3004     - spin_lock_irqsave(&ioapic_lock, flags);
3005     - *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
3006     - *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
3007     - spin_unlock_irqrestore(&ioapic_lock, flags);
3008     + entry = ioapic_read_entry(apic, pin);
3009     if (entry.delivery_mode == dest_SMI)
3010     return;
3011     /*
3012     * Disable it in the IO-APIC irq-routing table:
3013     */
3014     - memset(&entry, 0, sizeof(entry));
3015     - entry.mask = 1;
3016     - spin_lock_irqsave(&ioapic_lock, flags);
3017     - io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
3018     - io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
3019     - spin_unlock_irqrestore(&ioapic_lock, flags);
3020     + ioapic_mask_entry(apic, pin);
3021     }
3022    
3023     static void clear_IO_APIC (void)
3024     @@ -271,16 +383,6 @@
3025    
3026     #endif /* !CONFIG_XEN */
3027    
3028     -static u8 gsi_2_irq[NR_IRQ_VECTORS] = { [0 ... NR_IRQ_VECTORS-1] = 0xFF };
3029     -
3030     -/*
3031     - * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
3032     - * specific CPU-side IRQs.
3033     - */
3034     -
3035     -#define MAX_PIRQS 8
3036     -static int pirq_entries [MAX_PIRQS];
3037     -static int pirqs_enabled;
3038     int skip_ioapic_setup;
3039     int ioapic_force;
3040    
3041     @@ -289,18 +391,17 @@
3042     static int __init disable_ioapic_setup(char *str)
3043     {
3044     skip_ioapic_setup = 1;
3045     - return 1;
3046     + return 0;
3047     }
3048     +early_param("noapic", disable_ioapic_setup);
3049    
3050     -static int __init enable_ioapic_setup(char *str)
3051     +/* Actually the next is obsolete, but keep it for paranoid reasons -AK */
3052     +static int __init disable_timer_pin_setup(char *arg)
3053     {
3054     - ioapic_force = 1;
3055     - skip_ioapic_setup = 0;
3056     + disable_timer_pin_1 = 1;
3057     return 1;
3058     }
3059     -
3060     -__setup("noapic", disable_ioapic_setup);
3061     -__setup("apic", enable_ioapic_setup);
3062     +__setup("disable_timer_pin_1", disable_timer_pin_setup);
3063    
3064     #ifndef CONFIG_XEN
3065     static int __init setup_disable_8254_timer(char *s)
3066     @@ -318,137 +419,6 @@
3067     __setup("enable_8254_timer", setup_enable_8254_timer);
3068     #endif /* !CONFIG_XEN */
3069    
3070     -#include <asm/pci-direct.h>
3071     -#include <linux/pci_ids.h>
3072     -#include <linux/pci.h>
3073     -
3074     -
3075     -#ifdef CONFIG_ACPI
3076     -
3077     -static int nvidia_hpet_detected __initdata;
3078     -
3079     -static int __init nvidia_hpet_check(unsigned long phys, unsigned long size)
3080     -{
3081     - nvidia_hpet_detected = 1;
3082     - return 0;
3083     -}
3084     -#endif
3085     -
3086     -/* Temporary Hack. Nvidia and VIA boards currently only work with IO-APIC
3087     - off. Check for an Nvidia or VIA PCI bridge and turn it off.
3088     - Use pci direct infrastructure because this runs before the PCI subsystem.
3089     -
3090     - Can be overwritten with "apic"
3091     -
3092     - And another hack to disable the IOMMU on VIA chipsets.
3093     -
3094     - ... and others. Really should move this somewhere else.
3095     -
3096     - Kludge-O-Rama. */
3097     -void __init check_ioapic(void)
3098     -{
3099     - int num,slot,func;
3100     - /* Poor man's PCI discovery */
3101     - for (num = 0; num < 32; num++) {
3102     - for (slot = 0; slot < 32; slot++) {
3103     - for (func = 0; func < 8; func++) {
3104     - u32 class;
3105     - u32 vendor;
3106     - u8 type;
3107     - class = read_pci_config(num,slot,func,
3108     - PCI_CLASS_REVISION);
3109     - if (class == 0xffffffff)
3110     - break;
3111     -
3112     - if ((class >> 16) != PCI_CLASS_BRIDGE_PCI)
3113     - continue;
3114     -
3115     - vendor = read_pci_config(num, slot, func,
3116     - PCI_VENDOR_ID);
3117     - vendor &= 0xffff;
3118     - switch (vendor) {
3119     - case PCI_VENDOR_ID_VIA:
3120     -#ifdef CONFIG_IOMMU
3121     - if ((end_pfn > MAX_DMA32_PFN ||
3122     - force_iommu) &&
3123     - !iommu_aperture_allowed) {
3124     - printk(KERN_INFO
3125     - "Looks like a VIA chipset. Disabling IOMMU. Override with \"iommu=allowed\"\n");
3126     - iommu_aperture_disabled = 1;
3127     - }
3128     -#endif
3129     - return;
3130     - case PCI_VENDOR_ID_NVIDIA:
3131     -#ifdef CONFIG_ACPI
3132     - /*
3133     - * All timer overrides on Nvidia are
3134     - * wrong unless HPET is enabled.
3135     - */
3136     - nvidia_hpet_detected = 0;
3137     - acpi_table_parse(ACPI_HPET,
3138     - nvidia_hpet_check);
3139     - if (nvidia_hpet_detected == 0) {
3140     - acpi_skip_timer_override = 1;
3141     - printk(KERN_INFO "Nvidia board "
3142     - "detected. Ignoring ACPI "
3143     - "timer override.\n");
3144     - }
3145     -#endif
3146     - /* RED-PEN skip them on mptables too? */
3147     - return;
3148     - case PCI_VENDOR_ID_ATI:
3149     -
3150     - /* This should be actually default, but
3151     - for 2.6.16 let's do it for ATI only where
3152     - it's really needed. */
3153     -#ifndef CONFIG_XEN
3154     - if (timer_over_8254 == 1) {
3155     - timer_over_8254 = 0;
3156     - printk(KERN_INFO
3157     - "ATI board detected. Disabling timer routing over 8254.\n");
3158     - }
3159     -#endif
3160     - return;
3161     - }
3162     -
3163     -
3164     - /* No multi-function device? */
3165     - type = read_pci_config_byte(num,slot,func,
3166     - PCI_HEADER_TYPE);
3167     - if (!(type & 0x80))
3168     - break;
3169     - }
3170     - }
3171     - }
3172     -}
3173     -
3174     -static int __init ioapic_pirq_setup(char *str)
3175     -{
3176     - int i, max;
3177     - int ints[MAX_PIRQS+1];
3178     -
3179     - get_options(str, ARRAY_SIZE(ints), ints);
3180     -
3181     - for (i = 0; i < MAX_PIRQS; i++)
3182     - pirq_entries[i] = -1;
3183     -
3184     - pirqs_enabled = 1;
3185     - apic_printk(APIC_VERBOSE, "PIRQ redirection, working around broken MP-BIOS.\n");
3186     - max = MAX_PIRQS;
3187     - if (ints[0] < MAX_PIRQS)
3188     - max = ints[0];
3189     -
3190     - for (i = 0; i < max; i++) {
3191     - apic_printk(APIC_VERBOSE, "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
3192     - /*
3193     - * PIRQs are mapped upside down, usually.
3194     - */
3195     - pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
3196     - }
3197     - return 1;
3198     -}
3199     -
3200     -__setup("pirq=", ioapic_pirq_setup);
3201    
3202     /*
3203     * Find the IRQ entry number of a certain pin.
3204     @@ -478,9 +448,7 @@
3205     for (i = 0; i < mp_irq_entries; i++) {
3206     int lbus = mp_irqs[i].mpc_srcbus;
3207    
3208     - if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
3209     - mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
3210     - mp_bus_id_to_type[lbus] == MP_BUS_MCA) &&
3211     + if (test_bit(lbus, mp_bus_not_pci) &&
3212     (mp_irqs[i].mpc_irqtype == type) &&
3213     (mp_irqs[i].mpc_srcbusirq == irq))
3214    
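[Editor's note: the per-bus-type switch statements throughout this file give way to one bitmap, mp_bus_not_pci, filled in while the MP table is parsed (see the mpparse_64-xen.c entry in this patch's index). Roughly, assuming MAX_MP_BUSSES from the MP-table headers:

    DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);

    /* At parse time each non-PCI bus sets its bit once ... */
    static void __init record_bus(int busid, int bus_is_pci)
    {
            if (!bus_is_pci)
                    set_bit(busid, mp_bus_not_pci);
    }

    /* ... so the hot paths reduce to a single test:
     *     if (test_bit(lbus, mp_bus_not_pci))  ISA-like defaults
     *     else                                 PCI defaults        */
]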
3215     @@ -496,9 +464,7 @@
3216     for (i = 0; i < mp_irq_entries; i++) {
3217     int lbus = mp_irqs[i].mpc_srcbus;
3218    
3219     - if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
3220     - mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
3221     - mp_bus_id_to_type[lbus] == MP_BUS_MCA) &&
3222     + if (test_bit(lbus, mp_bus_not_pci) &&
3223     (mp_irqs[i].mpc_irqtype == type) &&
3224     (mp_irqs[i].mpc_srcbusirq == irq))
3225     break;
3226     @@ -539,7 +505,7 @@
3227     mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
3228     break;
3229    
3230     - if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
3231     + if (!test_bit(lbus, mp_bus_not_pci) &&
3232     !mp_irqs[i].mpc_irqtype &&
3233     (bus == lbus) &&
3234     (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
3235     @@ -562,27 +528,6 @@
3236     return best_guess;
3237     }
3238    
3239     -/*
3240     - * EISA Edge/Level control register, ELCR
3241     - */
3242     -static int EISA_ELCR(unsigned int irq)
3243     -{
3244     - if (irq < 16) {
3245     - unsigned int port = 0x4d0 + (irq >> 3);
3246     - return (inb(port) >> (irq & 7)) & 1;
3247     - }
3248     - apic_printk(APIC_VERBOSE, "Broken MPtable reports ISA irq %d\n", irq);
3249     - return 0;
3250     -}
3251     -
3252     -/* EISA interrupts are always polarity zero and can be edge or level
3253     - * trigger depending on the ELCR value. If an interrupt is listed as
3254     - * EISA conforming in the MP table, that means its trigger type must
3255     - * be read in from the ELCR */
3256     -
3257     -#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
3258     -#define default_EISA_polarity(idx) (0)
3259     -
3260     /* ISA interrupts are always polarity zero edge triggered,
3261     * when listed as conforming in the MP table. */
3262    
3263     @@ -595,12 +540,6 @@
3264     #define default_PCI_trigger(idx) (1)
3265     #define default_PCI_polarity(idx) (1)
3266    
3267     -/* MCA interrupts are always polarity zero level triggered,
3268     - * when listed as conforming in the MP table. */
3269     -
3270     -#define default_MCA_trigger(idx) (1)
3271     -#define default_MCA_polarity(idx) (0)
3272     -
3273     static int __init MPBIOS_polarity(int idx)
3274     {
3275     int bus = mp_irqs[idx].mpc_srcbus;
3276     @@ -612,38 +551,11 @@
3277     switch (mp_irqs[idx].mpc_irqflag & 3)
3278     {
3279     case 0: /* conforms, ie. bus-type dependent polarity */
3280     - {
3281     - switch (mp_bus_id_to_type[bus])
3282     - {
3283     - case MP_BUS_ISA: /* ISA pin */
3284     - {
3285     - polarity = default_ISA_polarity(idx);
3286     - break;
3287     - }
3288     - case MP_BUS_EISA: /* EISA pin */
3289     - {
3290     - polarity = default_EISA_polarity(idx);
3291     - break;
3292     - }
3293     - case MP_BUS_PCI: /* PCI pin */
3294     - {
3295     - polarity = default_PCI_polarity(idx);
3296     - break;
3297     - }
3298     - case MP_BUS_MCA: /* MCA pin */
3299     - {
3300     - polarity = default_MCA_polarity(idx);
3301     - break;
3302     - }
3303     - default:
3304     - {
3305     - printk(KERN_WARNING "broken BIOS!!\n");
3306     - polarity = 1;
3307     - break;
3308     - }
3309     - }
3310     + if (test_bit(bus, mp_bus_not_pci))
3311     + polarity = default_ISA_polarity(idx);
3312     + else
3313     + polarity = default_PCI_polarity(idx);
3314     break;
3315     - }
3316     case 1: /* high active */
3317     {
3318     polarity = 0;
3319     @@ -681,38 +593,11 @@
3320     switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
3321     {
3322     case 0: /* conforms, ie. bus-type dependent */
3323     - {
3324     - switch (mp_bus_id_to_type[bus])
3325     - {
3326     - case MP_BUS_ISA: /* ISA pin */
3327     - {
3328     - trigger = default_ISA_trigger(idx);
3329     - break;
3330     - }
3331     - case MP_BUS_EISA: /* EISA pin */
3332     - {
3333     - trigger = default_EISA_trigger(idx);
3334     - break;
3335     - }
3336     - case MP_BUS_PCI: /* PCI pin */
3337     - {
3338     - trigger = default_PCI_trigger(idx);
3339     - break;
3340     - }
3341     - case MP_BUS_MCA: /* MCA pin */
3342     - {
3343     - trigger = default_MCA_trigger(idx);
3344     - break;
3345     - }
3346     - default:
3347     - {
3348     - printk(KERN_WARNING "broken BIOS!!\n");
3349     - trigger = 1;
3350     - break;
3351     - }
3352     - }
3353     + if (test_bit(bus, mp_bus_not_pci))
3354     + trigger = default_ISA_trigger(idx);
3355     + else
3356     + trigger = default_PCI_trigger(idx);
3357     break;
3358     - }
3359     case 1: /* edge */
3360     {
3361     trigger = 0;
3362     @@ -749,64 +634,6 @@
3363     return MPBIOS_trigger(idx);
3364     }
3365    
3366     -static int next_irq = 16;
3367     -
3368     -/*
3369     - * gsi_irq_sharing -- Name overload! "irq" can be either a legacy IRQ
3370     - * in the range 0-15, a linux IRQ in the range 0-223, or a GSI number
3371     - * from ACPI, which can reach 800 in large boxen.
3372     - *
3373     - * Compact the sparse GSI space into a sequential IRQ series and reuse
3374     - * vectors if possible.
3375     - */
3376     -int gsi_irq_sharing(int gsi)
3377     -{
3378     - int i, tries, vector;
3379     -
3380     - BUG_ON(gsi >= NR_IRQ_VECTORS);
3381     -
3382     - if (platform_legacy_irq(gsi))
3383     - return gsi;
3384     -
3385     - if (gsi_2_irq[gsi] != 0xFF)
3386     - return (int)gsi_2_irq[gsi];
3387     -
3388     - tries = NR_IRQS;
3389     - try_again:
3390     - vector = assign_irq_vector(gsi);
3391     -
3392     - /*
3393     - * Sharing vectors means sharing IRQs, so scan irq_vectors for previous
3394     - * use of vector and if found, return that IRQ. However, we never want
3395     - * to share legacy IRQs, which usually have a different trigger mode
3396     - * than PCI.
3397     - */
3398     - for (i = 0; i < NR_IRQS; i++)
3399     - if (IO_APIC_VECTOR(i) == vector)
3400     - break;
3401     - if (platform_legacy_irq(i)) {
3402     - if (--tries >= 0) {
3403     - IO_APIC_VECTOR(i) = 0;
3404     - goto try_again;
3405     - }
3406     - panic("gsi_irq_sharing: didn't find an IRQ using vector 0x%02X for GSI %d", vector, gsi);
3407     - }
3408     - if (i < NR_IRQS) {
3409     - gsi_2_irq[gsi] = i;
3410     - printk(KERN_INFO "GSI %d sharing vector 0x%02X and IRQ %d\n",
3411     - gsi, vector, i);
3412     - return i;
3413     - }
3414     -
3415     - i = next_irq++;
3416     - BUG_ON(i >= NR_IRQS);
3417     - gsi_2_irq[gsi] = i;
3418     - IO_APIC_VECTOR(i) = vector;
3419     - printk(KERN_INFO "GSI %d assigned vector 0x%02X and IRQ %d\n",
3420     - gsi, vector, i);
3421     - return i;
3422     -}
3423     -
3424     static int pin_2_irq(int idx, int apic, int pin)
3425     {
3426     int irq, i;
3427     @@ -818,49 +645,16 @@
3428     if (mp_irqs[idx].mpc_dstirq != pin)
3429     printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
3430    
3431     - switch (mp_bus_id_to_type[bus])
3432     - {
3433     - case MP_BUS_ISA: /* ISA pin */
3434     - case MP_BUS_EISA:
3435     - case MP_BUS_MCA:
3436     - {
3437     - irq = mp_irqs[idx].mpc_srcbusirq;
3438     - break;
3439     - }
3440     - case MP_BUS_PCI: /* PCI pin */
3441     - {
3442     - /*
3443     - * PCI IRQs are mapped in order
3444     - */
3445     - i = irq = 0;
3446     - while (i < apic)
3447     - irq += nr_ioapic_registers[i++];
3448     - irq += pin;
3449     - irq = gsi_irq_sharing(irq);
3450     - break;
3451     - }
3452     - default:
3453     - {
3454     - printk(KERN_ERR "unknown bus type %d.\n",bus);
3455     - irq = 0;
3456     - break;
3457     - }
3458     - }
3459     - BUG_ON(irq >= NR_IRQS);
3460     -
3461     - /*
3462     - * PCI IRQ command line redirection. Yes, limits are hardcoded.
3463     - */
3464     - if ((pin >= 16) && (pin <= 23)) {
3465     - if (pirq_entries[pin-16] != -1) {
3466     - if (!pirq_entries[pin-16]) {
3467     - apic_printk(APIC_VERBOSE, "disabling PIRQ%d\n", pin-16);
3468     - } else {
3469     - irq = pirq_entries[pin-16];
3470     - apic_printk(APIC_VERBOSE, "using PIRQ%d -> IRQ %d\n",
3471     - pin-16, irq);
3472     - }
3473     - }
3474     + if (test_bit(bus, mp_bus_not_pci)) {
3475     + irq = mp_irqs[idx].mpc_srcbusirq;
3476     + } else {
3477     + /*
3478     + * PCI IRQs are mapped in order
3479     + */
3480     + i = irq = 0;
3481     + while (i < apic)
3482     + irq += nr_ioapic_registers[i++];
3483     + irq += pin;
3484     }
3485     BUG_ON(irq >= NR_IRQS);
3486     return irq;
3487     @@ -884,43 +678,68 @@
3488     }
3489    
3490     /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
3491     -u8 irq_vector[NR_IRQ_VECTORS] __read_mostly;
3492     +static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly;
3493    
3494     -int assign_irq_vector(int irq)
3495     +static int __assign_irq_vector(int irq, cpumask_t mask, cpumask_t *result)
3496     {
3497     - unsigned long flags;
3498     int vector;
3499     struct physdev_irq irq_op;
3500    
3501     - BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS);
3502     + BUG_ON((unsigned)irq >= NR_IRQ_VECTORS);
3503    
3504     - spin_lock_irqsave(&vector_lock, flags);
3505     + cpus_and(*result, mask, cpu_online_map);
3506    
3507     - if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) {
3508     - spin_unlock_irqrestore(&vector_lock, flags);
3509     - return IO_APIC_VECTOR(irq);
3510     - }
3511     + if (irq_vector[irq] > 0)
3512     + return irq_vector[irq];
3513    
3514     irq_op.irq = irq;
3515     - if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) {
3516     - spin_unlock_irqrestore(&vector_lock, flags);
3517     + if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op))
3518     return -ENOSPC;
3519     - }
3520    
3521     vector = irq_op.vector;
3522     - vector_irq[vector] = irq;
3523     - if (irq != AUTO_ASSIGN)
3524     - IO_APIC_VECTOR(irq) = vector;
3525     + irq_vector[irq] = vector;
3526    
3527     - spin_unlock_irqrestore(&vector_lock, flags);
3528     + return vector;
3529     +}
3530    
3531     +static int assign_irq_vector(int irq, cpumask_t mask, cpumask_t *result)
3532     +{
3533     + int vector;
3534     + unsigned long flags;
3535     +
3536     + spin_lock_irqsave(&vector_lock, flags);
3537     + vector = __assign_irq_vector(irq, mask, result);
3538     + spin_unlock_irqrestore(&vector_lock, flags);
3539     return vector;
3540     }
3541    
3542     -extern void (*interrupt[NR_IRQS])(void);
3543     #ifndef CONFIG_XEN
3544     -static struct hw_interrupt_type ioapic_level_type;
3545     -static struct hw_interrupt_type ioapic_edge_type;
3546     +void __setup_vector_irq(int cpu)
3547     +{
3548     + /* Initialize vector_irq on a new cpu */
3549     + /* This function must be called with vector_lock held */
3550     + int irq, vector;
3551     +
3552     + /* Mark the inuse vectors */
3553     + for (irq = 0; irq < NR_IRQ_VECTORS; ++irq) {
3554     + if (!cpu_isset(cpu, irq_domain[irq]))
3555     + continue;
3556     + vector = irq_vector[irq];
3557     + per_cpu(vector_irq, cpu)[vector] = irq;
3558     + }
3559     + /* Mark the free vectors */
3560     + for (vector = 0; vector < NR_VECTORS; ++vector) {
3561     + irq = per_cpu(vector_irq, cpu)[vector];
3562     + if (irq < 0)
3563     + continue;
3564     + if (!cpu_isset(cpu, irq_domain[irq]))
3565     + per_cpu(vector_irq, cpu)[vector] = -1;
3566     + }
3567     +}
3568     +
3569     +extern void (*interrupt[NR_IRQS])(void);
3570     +
3571     +static struct irq_chip ioapic_chip;
3572    
3573     #define IOAPIC_AUTO -1
3574     #define IOAPIC_EDGE 0
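
The __assign_irq_vector()/assign_irq_vector() split above follows the usual kernel naming convention: the double-underscore variant must be entered with vector_lock already held (which is what lets create_irq() further down reuse it inside its own critical section), while the plain variant is just the locking wrapper. A userspace sketch of the pattern, assuming pthreads and purely illustrative names:

	#include <pthread.h>
	#include <stdio.h>

	static pthread_mutex_t vector_lock = PTHREAD_MUTEX_INITIALIZER;
	static int irq_vector[16];	/* 0 means "no vector assigned yet" */
	static int next_vector = 0x20;

	/* Caller must hold vector_lock -- mirrors __assign_irq_vector(). */
	static int __assign_vector(int irq)
	{
		if (irq_vector[irq] > 0)	/* already assigned: reuse it */
			return irq_vector[irq];
		irq_vector[irq] = next_vector++;
		return irq_vector[irq];
	}

	/* Locking wrapper -- mirrors assign_irq_vector(). */
	static int assign_vector(int irq)
	{
		pthread_mutex_lock(&vector_lock);
		int v = __assign_vector(irq);
		pthread_mutex_unlock(&vector_lock);
		return v;
	}

	int main(void)
	{
		printf("irq 5 -> vector 0x%02x\n", assign_vector(5));
		printf("irq 5 -> vector 0x%02x (cached)\n", assign_vector(5));
		return 0;
	}
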
3575     @@ -928,16 +747,15 @@
3576    
3577     static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
3578     {
3579     - unsigned idx;
3580     -
3581     - idx = use_pci_vector() && !platform_legacy_irq(irq) ? vector : irq;
3582     -
3583     if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
3584     trigger == IOAPIC_LEVEL)
3585     - irq_desc[idx].chip = &ioapic_level_type;
3586     - else
3587     - irq_desc[idx].chip = &ioapic_edge_type;
3588     - set_intr_gate(vector, interrupt[idx]);
3589     + set_irq_chip_and_handler_name(irq, &ioapic_chip,
3590     + handle_fasteoi_irq, "fasteoi");
3591     + else {
3592     + irq_desc[irq].status |= IRQ_DELAYED_DISABLE;
3593     + set_irq_chip_and_handler_name(irq, &ioapic_chip,
3594     + handle_edge_irq, "edge");
3595     + }
3596     }
3597     #else
3598     #define ioapic_register_intr(_irq,_vector,_trigger) ((void)0)
3599     @@ -990,16 +808,21 @@
3600     continue;
3601    
3602     if (IO_APIC_IRQ(irq)) {
3603     - vector = assign_irq_vector(irq);
3604     + cpumask_t mask;
3605     + vector = assign_irq_vector(irq, TARGET_CPUS, &mask);
3606     + if (vector < 0)
3607     + continue;
3608     +
3609     + entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask);
3610     entry.vector = vector;
3611    
3612     ioapic_register_intr(irq, vector, IOAPIC_AUTO);
3613     if (!apic && (irq < 16))
3614     disable_8259A_irq(irq);
3615     }
3616     + ioapic_write_entry(apic, pin, entry);
3617     +
3618     spin_lock_irqsave(&ioapic_lock, flags);
3619     - io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
3620     - io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
3621     set_native_irq_info(irq, TARGET_CPUS);
3622     spin_unlock_irqrestore(&ioapic_lock, flags);
3623     }
3624     @@ -1042,7 +865,7 @@
3625     * The timer IRQ doesn't have to know that behind the
3626     * scene we have a 8259A-master in AEOI mode ...
3627     */
3628     - irq_desc[0].chip = &ioapic_edge_type;
3629     + set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
3630    
3631     /*
3632     * Add it to the IO-APIC irq-routing table:
3633     @@ -1138,10 +961,7 @@
3634     for (i = 0; i <= reg_01.bits.entries; i++) {
3635     struct IO_APIC_route_entry entry;
3636    
3637     - spin_lock_irqsave(&ioapic_lock, flags);
3638     - *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2);
3639     - *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2);
3640     - spin_unlock_irqrestore(&ioapic_lock, flags);
3641     + entry = ioapic_read_entry(apic, i);
3642    
3643     printk(KERN_DEBUG " %02x %03X %02X ",
3644     i,
3645     @@ -1161,17 +981,12 @@
3646     );
3647     }
3648     }
3649     - if (use_pci_vector())
3650     - printk(KERN_INFO "Using vector-based indexing\n");
3651     printk(KERN_DEBUG "IRQ to pin mappings:\n");
3652     for (i = 0; i < NR_IRQS; i++) {
3653     struct irq_pin_list *entry = irq_2_pin + i;
3654     if (entry->pin < 0)
3655     continue;
3656     - if (use_pci_vector() && !platform_legacy_irq(i))
3657     - printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i));
3658     - else
3659     - printk(KERN_DEBUG "IRQ%d ", i);
3660     + printk(KERN_DEBUG "IRQ%d ", i);
3661     for (;;) {
3662     printk("-> %d:%d", entry->apic, entry->pin);
3663     if (!entry->next)
3664     @@ -1335,9 +1150,6 @@
3665     irq_2_pin[i].pin = -1;
3666     irq_2_pin[i].next = 0;
3667     }
3668     - if (!pirqs_enabled)
3669     - for (i = 0; i < MAX_PIRQS; i++)
3670     - pirq_entries[i] = -1;
3671    
3672     /*
3673     * The number of IO-APIC IRQ registers (== #pins):
3674     @@ -1354,11 +1166,7 @@
3675     /* See if any of the pins is in ExtINT mode */
3676     for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
3677     struct IO_APIC_route_entry entry;
3678     - spin_lock_irqsave(&ioapic_lock, flags);
3679     - *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
3680     - *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
3681     - spin_unlock_irqrestore(&ioapic_lock, flags);
3682     -
3683     + entry = ioapic_read_entry(apic, pin);
3684    
3685     /* If the interrupt line is enabled and in ExtInt mode
3686     * I have found the pin where the i8259 is connected.
3687     @@ -1412,7 +1220,6 @@
3688     */
3689     if (ioapic_i8259.pin != -1) {
3690     struct IO_APIC_route_entry entry;
3691     - unsigned long flags;
3692    
3693     memset(&entry, 0, sizeof(entry));
3694     entry.mask = 0; /* Enabled */
3695     @@ -1429,12 +1236,7 @@
3696     /*
3697     * Add it to the IO-APIC irq-routing table:
3698     */
3699     - spin_lock_irqsave(&ioapic_lock, flags);
3700     - io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin,
3701     - *(((int *)&entry)+1));
3702     - io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin,
3703     - *(((int *)&entry)+0));
3704     - spin_unlock_irqrestore(&ioapic_lock, flags);
3705     + ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
3706     }
3707    
3708     disconnect_bsp_APIC(ioapic_i8259.pin != -1);
3709     @@ -1442,76 +1244,6 @@
3710     }
3711    
3712     /*
3713     - * function to set the IO-APIC physical IDs based on the
3714     - * values stored in the MPC table.
3715     - *
3716     - * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
3717     - */
3718     -
3719     -#ifndef CONFIG_XEN
3720     -static void __init setup_ioapic_ids_from_mpc (void)
3721     -{
3722     - union IO_APIC_reg_00 reg_00;
3723     - int apic;
3724     - int i;
3725     - unsigned char old_id;
3726     - unsigned long flags;
3727     -
3728     - /*
3729     - * Set the IOAPIC ID to the value stored in the MPC table.
3730     - */
3731     - for (apic = 0; apic < nr_ioapics; apic++) {
3732     -
3733     - /* Read the register 0 value */
3734     - spin_lock_irqsave(&ioapic_lock, flags);
3735     - reg_00.raw = io_apic_read(apic, 0);
3736     - spin_unlock_irqrestore(&ioapic_lock, flags);
3737     -
3738     - old_id = mp_ioapics[apic].mpc_apicid;
3739     -
3740     -
3741     - printk(KERN_INFO "Using IO-APIC %d\n", mp_ioapics[apic].mpc_apicid);
3742     -
3743     -
3744     - /*
3745     - * We need to adjust the IRQ routing table
3746     - * if the ID changed.
3747     - */
3748     - if (old_id != mp_ioapics[apic].mpc_apicid)
3749     - for (i = 0; i < mp_irq_entries; i++)
3750     - if (mp_irqs[i].mpc_dstapic == old_id)
3751     - mp_irqs[i].mpc_dstapic
3752     - = mp_ioapics[apic].mpc_apicid;
3753     -
3754     - /*
3755     - * Read the right value from the MPC table and
3756     - * write it into the ID register.
3757     - */
3758     - apic_printk(APIC_VERBOSE,KERN_INFO "...changing IO-APIC physical APIC ID to %d ...",
3759     - mp_ioapics[apic].mpc_apicid);
3760     -
3761     - reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
3762     - spin_lock_irqsave(&ioapic_lock, flags);
3763     - io_apic_write(apic, 0, reg_00.raw);
3764     - spin_unlock_irqrestore(&ioapic_lock, flags);
3765     -
3766     - /*
3767     - * Sanity check
3768     - */
3769     - spin_lock_irqsave(&ioapic_lock, flags);
3770     - reg_00.raw = io_apic_read(apic, 0);
3771     - spin_unlock_irqrestore(&ioapic_lock, flags);
3772     - if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
3773     - printk("could not set ID!\n");
3774     - else
3775     - apic_printk(APIC_VERBOSE," ok.\n");
3776     - }
3777     -}
3778     -#else
3779     -static void __init setup_ioapic_ids_from_mpc(void) { }
3780     -#endif
3781     -
3782     -/*
3783     * There is a nasty bug in some older SMP boards, their mptable lies
3784     * about the timer IRQ. We do the following to work around the situation:
3785     *
3786     @@ -1565,7 +1297,7 @@
3787     * an edge even if it isn't on the 8259A...
3788     */
3789    
3790     -static unsigned int startup_edge_ioapic_irq(unsigned int irq)
3791     +static unsigned int startup_ioapic_irq(unsigned int irq)
3792     {
3793     int was_pending = 0;
3794     unsigned long flags;
3795     @@ -1582,107 +1314,19 @@
3796     return was_pending;
3797     }
3798    
3799     -/*
3800     - * Once we have recorded IRQ_PENDING already, we can mask the
3801     - * interrupt for real. This prevents IRQ storms from unhandled
3802     - * devices.
3803     - */
3804     -static void ack_edge_ioapic_irq(unsigned int irq)
3805     -{
3806     - move_irq(irq);
3807     - if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
3808     - == (IRQ_PENDING | IRQ_DISABLED))
3809     - mask_IO_APIC_irq(irq);
3810     - ack_APIC_irq();
3811     -}
3812     -
3813     -/*
3814     - * Level triggered interrupts can just be masked,
3815     - * and shutting down and starting up the interrupt
3816     - * is the same as enabling and disabling them -- except
3817     - * with a startup need to return a "was pending" value.
3818     - *
3819     - * Level triggered interrupts are special because we
3820     - * do not touch any IO-APIC register while handling
3821     - * them. We ack the APIC in the end-IRQ handler, not
3822     - * in the start-IRQ-handler. Protection against reentrance
3823     - * from the same interrupt is still provided, both by the
3824     - * generic IRQ layer and by the fact that an unacked local
3825     - * APIC does not accept IRQs.
3826     - */
3827     -static unsigned int startup_level_ioapic_irq (unsigned int irq)
3828     -{
3829     - unmask_IO_APIC_irq(irq);
3830     -
3831     - return 0; /* don't check for pending */
3832     -}
3833     -
3834     -static void end_level_ioapic_irq (unsigned int irq)
3835     -{
3836     - move_irq(irq);
3837     - ack_APIC_irq();
3838     -}
3839     -
3840     -#ifdef CONFIG_PCI_MSI
3841     -static unsigned int startup_edge_ioapic_vector(unsigned int vector)
3842     -{
3843     - int irq = vector_to_irq(vector);
3844     -
3845     - return startup_edge_ioapic_irq(irq);
3846     -}
3847     -
3848     -static void ack_edge_ioapic_vector(unsigned int vector)
3849     -{
3850     - int irq = vector_to_irq(vector);
3851     -
3852     - move_native_irq(vector);
3853     - ack_edge_ioapic_irq(irq);
3854     -}
3855     -
3856     -static unsigned int startup_level_ioapic_vector (unsigned int vector)
3857     -{
3858     - int irq = vector_to_irq(vector);
3859     -
3860     - return startup_level_ioapic_irq (irq);
3861     -}
3862     -
3863     -static void end_level_ioapic_vector (unsigned int vector)
3864     -{
3865     - int irq = vector_to_irq(vector);
3866     -
3867     - move_native_irq(vector);
3868     - end_level_ioapic_irq(irq);
3869     -}
3870     -
3871     -static void mask_IO_APIC_vector (unsigned int vector)
3872     -{
3873     - int irq = vector_to_irq(vector);
3874     -
3875     - mask_IO_APIC_irq(irq);
3876     -}
3877     -
3878     -static void unmask_IO_APIC_vector (unsigned int vector)
3879     -{
3880     - int irq = vector_to_irq(vector);
3881     -
3882     - unmask_IO_APIC_irq(irq);
3883     -}
3884     -
3885     -#ifdef CONFIG_SMP
3886     -static void set_ioapic_affinity_vector (unsigned int vector,
3887     - cpumask_t cpu_mask)
3888     +static int ioapic_retrigger_irq(unsigned int irq)
3889     {
3890     - int irq = vector_to_irq(vector);
3891     + cpumask_t mask;
3892     + unsigned vector;
3893     + unsigned long flags;
3894    
3895     - set_native_irq_info(vector, cpu_mask);
3896     - set_ioapic_affinity_irq(irq, cpu_mask);
3897     -}
3898     -#endif // CONFIG_SMP
3899     -#endif // CONFIG_PCI_MSI
3900     + spin_lock_irqsave(&vector_lock, flags);
3901     + vector = irq_vector[irq];
3902     + cpus_clear(mask);
3903     + cpu_set(first_cpu(irq_domain[irq]), mask);
3904    
3905     -static int ioapic_retrigger(unsigned int irq)
3906     -{
3907     - send_IPI_self(IO_APIC_VECTOR(irq));
3908     + send_IPI_mask(mask, vector);
3909     + spin_unlock_irqrestore(&vector_lock, flags);
3910    
3911     return 1;
3912     }
3913     @@ -1696,32 +1340,47 @@
3914     * races.
3915     */
3916    
3917     -static struct hw_interrupt_type ioapic_edge_type __read_mostly = {
3918     - .typename = "IO-APIC-edge",
3919     - .startup = startup_edge_ioapic,
3920     - .shutdown = shutdown_edge_ioapic,
3921     - .enable = enable_edge_ioapic,
3922     - .disable = disable_edge_ioapic,
3923     - .ack = ack_edge_ioapic,
3924     - .end = end_edge_ioapic,
3925     -#ifdef CONFIG_SMP
3926     - .set_affinity = set_ioapic_affinity,
3927     +static void ack_apic_edge(unsigned int irq)
3928     +{
3929     + move_native_irq(irq);
3930     + ack_APIC_irq();
3931     +}
3932     +
3933     +static void ack_apic_level(unsigned int irq)
3934     +{
3935     + int do_unmask_irq = 0;
3936     +
3937     +#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)
3938     + /* If we are moving the irq we need to mask it */
3939     + if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) {
3940     + do_unmask_irq = 1;
3941     + mask_IO_APIC_irq(irq);
3942     + }
3943     #endif
3944     - .retrigger = ioapic_retrigger,
3945     -};
3946    
3947     -static struct hw_interrupt_type ioapic_level_type __read_mostly = {
3948     - .typename = "IO-APIC-level",
3949     - .startup = startup_level_ioapic,
3950     - .shutdown = shutdown_level_ioapic,
3951     - .enable = enable_level_ioapic,
3952     - .disable = disable_level_ioapic,
3953     - .ack = mask_and_ack_level_ioapic,
3954     - .end = end_level_ioapic,
3955     + /*
3956     + * We must acknowledge the irq before we move it or the acknowledge will
3957     + * not propagate properly.
3958     + */
3959     + ack_APIC_irq();
3960     +
3961     + /* Now we can move and re-enable the irq */
3962     + move_masked_irq(irq);
3963     + if (unlikely(do_unmask_irq))
3964     + unmask_IO_APIC_irq(irq);
3965     +}
3966     +
3967     +static struct irq_chip ioapic_chip __read_mostly = {
3968     + .name = "IO-APIC",
3969     + .startup = startup_ioapic_irq,
3970     + .mask = mask_IO_APIC_irq,
3971     + .unmask = unmask_IO_APIC_irq,
3972     + .ack = ack_apic_edge,
3973     + .eoi = ack_apic_level,
3974     #ifdef CONFIG_SMP
3975     - .set_affinity = set_ioapic_affinity,
3976     + .set_affinity = set_ioapic_affinity_irq,
3977     #endif
3978     - .retrigger = ioapic_retrigger,
3979     + .retrigger = ioapic_retrigger_irq,
3980     };
3981     #endif /* !CONFIG_XEN */
3982    
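
With the hunk above, the separate edge and level hw_interrupt_type tables collapse into a single irq_chip: the flow handler installed by ioapic_register_intr() decides which callback fires, handle_edge_irq using .ack and handle_fasteoi_irq using .eoi. The struct-of-function-pointers dispatch, reduced to a runnable userspace sketch (names mirror the patch; the behaviour is only printed, not performed):

	#include <stdio.h>

	/* Cut-down analog of struct irq_chip: one ops table, per-flow callbacks. */
	struct irq_chip {
		const char *name;
		void (*ack)(unsigned int irq);	/* used by the edge flow handler */
		void (*eoi)(unsigned int irq);	/* used by the level (fasteoi) flow */
	};

	static void ack_apic_edge(unsigned int irq)
	{
		printf("irq %u: edge ack\n", irq);
	}

	static void ack_apic_level(unsigned int irq)
	{
		/* Mirrors the ordering in the patch: ack first, then migrate/unmask. */
		printf("irq %u: level eoi (ack, then move if pending)\n", irq);
	}

	static struct irq_chip ioapic_chip = {
		.name = "IO-APIC",
		.ack  = ack_apic_edge,
		.eoi  = ack_apic_level,
	};

	/* Stand-ins for handle_edge_irq()/handle_fasteoi_irq() picking a callback. */
	static void handle_edge_irq(unsigned int irq)	{ ioapic_chip.ack(irq); }
	static void handle_fasteoi_irq(unsigned int irq) { ioapic_chip.eoi(irq); }

	int main(void)
	{
		handle_edge_irq(4);	/* edge-triggered pin */
		handle_fasteoi_irq(9);	/* level-triggered pin */
		return 0;
	}
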
3983     @@ -1742,12 +1401,7 @@
3984     */
3985     for (irq = 0; irq < NR_IRQS ; irq++) {
3986     int tmp = irq;
3987     - if (use_pci_vector()) {
3988     - if (!platform_legacy_irq(tmp))
3989     - if ((tmp = vector_to_irq(tmp)) == -1)
3990     - continue;
3991     - }
3992     - if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) {
3993     + if (IO_APIC_IRQ(tmp) && !irq_vector[tmp]) {
3994     /*
3995     * Hmm.. We don't have an entry for this,
3996     * so default to an old-fashioned 8259
3997     @@ -1758,7 +1412,7 @@
3998     #ifndef CONFIG_XEN
3999     else
4000     /* Strange. Oh, well.. */
4001     - irq_desc[irq].chip = &no_irq_type;
4002     + irq_desc[irq].chip = &no_irq_chip;
4003     #endif
4004     }
4005     }
4006     @@ -1879,8 +1533,6 @@
4007     spin_unlock_irqrestore(&ioapic_lock, flags);
4008     }
4009    
4010     -int timer_uses_ioapic_pin_0;
4011     -
4012     /*
4013     * This code may look a bit paranoid, but it's supposed to cooperate with
4014     * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
4015     @@ -1893,13 +1545,13 @@
4016     {
4017     int apic1, pin1, apic2, pin2;
4018     int vector;
4019     + cpumask_t mask;
4020    
4021     /*
4022     * get/set the timer IRQ vector:
4023     */
4024     disable_8259A_irq(0);
4025     - vector = assign_irq_vector(0);
4026     - set_intr_gate(vector, interrupt[0]);
4027     + vector = assign_irq_vector(0, TARGET_CPUS, &mask);
4028    
4029     /*
4030     * Subtle, code in do_timer_interrupt() expects an AEOI
4031     @@ -1918,9 +1570,6 @@
4032     pin2 = ioapic_i8259.pin;
4033     apic2 = ioapic_i8259.apic;
4034    
4035     - if (pin1 == 0)
4036     - timer_uses_ioapic_pin_0 = 1;
4037     -
4038     apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
4039     vector, apic1, pin1, apic2, pin2);
4040    
4041     @@ -2035,11 +1684,6 @@
4042    
4043     apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
4044    
4045     - /*
4046     - * Set up the IO-APIC IRQ routing table.
4047     - */
4048     - if (!acpi_ioapic)
4049     - setup_ioapic_ids_from_mpc();
4050     #ifndef CONFIG_XEN
4051     sync_Arb_IDs();
4052     #endif /* !CONFIG_XEN */
4053     @@ -2060,17 +1704,12 @@
4054     {
4055     struct IO_APIC_route_entry *entry;
4056     struct sysfs_ioapic_data *data;
4057     - unsigned long flags;
4058     int i;
4059    
4060     data = container_of(dev, struct sysfs_ioapic_data, dev);
4061     entry = data->entry;
4062     - spin_lock_irqsave(&ioapic_lock, flags);
4063     - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
4064     - *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i);
4065     - *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i);
4066     - }
4067     - spin_unlock_irqrestore(&ioapic_lock, flags);
4068     + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ )
4069     + *entry = ioapic_read_entry(dev->id, i);
4070    
4071     return 0;
4072     }
4073     @@ -2092,11 +1731,9 @@
4074     reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
4075     io_apic_write(dev->id, 0, reg_00.raw);
4076     }
4077     - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
4078     - io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1));
4079     - io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0));
4080     - }
4081     spin_unlock_irqrestore(&ioapic_lock, flags);
4082     + for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
4083     + ioapic_write_entry(dev->id, i, entry[i]);
4084    
4085     return 0;
4086     }
4087     @@ -2142,26 +1779,254 @@
4088    
4089     device_initcall(ioapic_init_sysfs);
4090    
4091     -/* --------------------------------------------------------------------------
4092     - ACPI-based IOAPIC Configuration
4093     - -------------------------------------------------------------------------- */
4094     +#ifndef CONFIG_XEN
4095     +/*
4096     + * Dynamic irq allocation and deallocation
4097     + */
4098     +int create_irq(void)
4099     +{
4100     + /* Allocate an unused irq */
4101     + int irq;
4102     + int new;
4103     + int vector = 0;
4104     + unsigned long flags;
4105     + cpumask_t mask;
4106    
4107     -#ifdef CONFIG_ACPI
4108     + irq = -ENOSPC;
4109     + spin_lock_irqsave(&vector_lock, flags);
4110     + for (new = (NR_IRQS - 1); new >= 0; new--) {
4111     + if (platform_legacy_irq(new))
4112     + continue;
4113     + if (irq_vector[new] != 0)
4114     + continue;
4115     + vector = __assign_irq_vector(new, TARGET_CPUS, &mask);
4116     + if (likely(vector > 0))
4117     + irq = new;
4118     + break;
4119     + }
4120     + spin_unlock_irqrestore(&vector_lock, flags);
4121    
4122     -#define IO_APIC_MAX_ID 0xFE
4123     + if (irq >= 0) {
4124     + dynamic_irq_init(irq);
4125     + }
4126     + return irq;
4127     +}
4128    
4129     -int __init io_apic_get_version (int ioapic)
4130     +void destroy_irq(unsigned int irq)
4131     {
4132     - union IO_APIC_reg_01 reg_01;
4133     unsigned long flags;
4134    
4135     - spin_lock_irqsave(&ioapic_lock, flags);
4136     - reg_01.raw = io_apic_read(ioapic, 1);
4137     - spin_unlock_irqrestore(&ioapic_lock, flags);
4138     + dynamic_irq_cleanup(irq);
4139     +
4140     + spin_lock_irqsave(&vector_lock, flags);
4141     + irq_vector[irq] = 0;
4142     + spin_unlock_irqrestore(&vector_lock, flags);
4143     +}
4144     +#endif
4145     +
4146     +/*
4147     + * MSI message composition
4148     + */
4149     +#ifdef CONFIG_PCI_MSI
4150     +static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
4151     +{
4152     + int vector;
4153     + unsigned dest;
4154     + cpumask_t tmp;
4155     +
4156     + vector = assign_irq_vector(irq, TARGET_CPUS, &tmp);
4157     + if (vector >= 0) {
4158     + dest = cpu_mask_to_apicid(tmp);
4159     +
4160     + msg->address_hi = MSI_ADDR_BASE_HI;
4161     + msg->address_lo =
4162     + MSI_ADDR_BASE_LO |
4163     + ((INT_DEST_MODE == 0) ?
4164     + MSI_ADDR_DEST_MODE_PHYSICAL:
4165     + MSI_ADDR_DEST_MODE_LOGICAL) |
4166     + ((INT_DELIVERY_MODE != dest_LowestPrio) ?
4167     + MSI_ADDR_REDIRECTION_CPU:
4168     + MSI_ADDR_REDIRECTION_LOWPRI) |
4169     + MSI_ADDR_DEST_ID(dest);
4170     +
4171     + msg->data =
4172     + MSI_DATA_TRIGGER_EDGE |
4173     + MSI_DATA_LEVEL_ASSERT |
4174     + ((INT_DELIVERY_MODE != dest_LowestPrio) ?
4175     + MSI_DATA_DELIVERY_FIXED:
4176     + MSI_DATA_DELIVERY_LOWPRI) |
4177     + MSI_DATA_VECTOR(vector);
4178     + }
4179     + return vector;
4180     +}
4181     +
4182     +#ifdef CONFIG_SMP
4183     +static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
4184     +{
4185     + struct msi_msg msg;
4186     + unsigned int dest;
4187     + cpumask_t tmp;
4188     + int vector;
4189     +
4190     + cpus_and(tmp, mask, cpu_online_map);
4191     + if (cpus_empty(tmp))
4192     + tmp = TARGET_CPUS;
4193     +
4194     + cpus_and(mask, tmp, CPU_MASK_ALL);
4195     +
4196     + vector = assign_irq_vector(irq, mask, &tmp);
4197     + if (vector < 0)
4198     + return;
4199     +
4200     + dest = cpu_mask_to_apicid(tmp);
4201     +
4202     + read_msi_msg(irq, &msg);
4203     +
4204     + msg.data &= ~MSI_DATA_VECTOR_MASK;
4205     + msg.data |= MSI_DATA_VECTOR(vector);
4206     + msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
4207     + msg.address_lo |= MSI_ADDR_DEST_ID(dest);
4208     +
4209     + write_msi_msg(irq, &msg);
4210     + set_native_irq_info(irq, mask);
4211     +}
4212     +#endif /* CONFIG_SMP */
4213     +
4214     +/*
4215     + * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
4216     + * which implement the MSI or MSI-X Capability Structure.
4217     + */
4218     +static struct irq_chip msi_chip = {
4219     + .name = "PCI-MSI",
4220     + .unmask = unmask_msi_irq,
4221     + .mask = mask_msi_irq,
4222     + .ack = ack_apic_edge,
4223     +#ifdef CONFIG_SMP
4224     + .set_affinity = set_msi_irq_affinity,
4225     +#endif
4226     + .retrigger = ioapic_retrigger_irq,
4227     +};
4228     +
4229     +int arch_setup_msi_irq(unsigned int irq, struct pci_dev *dev)
4230     +{
4231     + struct msi_msg msg;
4232     + int ret;
4233     + ret = msi_compose_msg(dev, irq, &msg);
4234     + if (ret < 0)
4235     + return ret;
4236     +
4237     + write_msi_msg(irq, &msg);
4238     +
4239     + set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
4240     +
4241     + return 0;
4242     +}
4243     +
4244     +void arch_teardown_msi_irq(unsigned int irq)
4245     +{
4246     + return;
4247     +}
4248     +
4249     +#endif /* CONFIG_PCI_MSI */
4250     +
4251     +/*
4252     + * Hypertransport interrupt support
4253     + */
4254     +#ifdef CONFIG_HT_IRQ
4255     +
4256     +#ifdef CONFIG_SMP
4257     +
4258     +static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
4259     +{
4260     + struct ht_irq_msg msg;
4261     + fetch_ht_irq_msg(irq, &msg);
4262     +
4263     + msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK);
4264     + msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
4265    
4266     - return reg_01.bits.version;
4267     + msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest);
4268     + msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
4269     +
4270     + write_ht_irq_msg(irq, &msg);
4271     }
4272    
4273     +static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
4274     +{
4275     + unsigned int dest;
4276     + cpumask_t tmp;
4277     + int vector;
4278     +
4279     + cpus_and(tmp, mask, cpu_online_map);
4280     + if (cpus_empty(tmp))
4281     + tmp = TARGET_CPUS;
4282     +
4283     + cpus_and(mask, tmp, CPU_MASK_ALL);
4284     +
4285     + vector = assign_irq_vector(irq, mask, &tmp);
4286     + if (vector < 0)
4287     + return;
4288     +
4289     + dest = cpu_mask_to_apicid(tmp);
4290     +
4291     + target_ht_irq(irq, dest, vector);
4292     + set_native_irq_info(irq, mask);
4293     +}
4294     +#endif
4295     +
4296     +static struct irq_chip ht_irq_chip = {
4297     + .name = "PCI-HT",
4298     + .mask = mask_ht_irq,
4299     + .unmask = unmask_ht_irq,
4300     + .ack = ack_apic_edge,
4301     +#ifdef CONFIG_SMP
4302     + .set_affinity = set_ht_irq_affinity,
4303     +#endif
4304     + .retrigger = ioapic_retrigger_irq,
4305     +};
4306     +
4307     +int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
4308     +{
4309     + int vector;
4310     + cpumask_t tmp;
4311     +
4312     + vector = assign_irq_vector(irq, TARGET_CPUS, &tmp);
4313     + if (vector >= 0) {
4314     + struct ht_irq_msg msg;
4315     + unsigned dest;
4316     +
4317     + dest = cpu_mask_to_apicid(tmp);
4318     +
4319     + msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
4320     +
4321     + msg.address_lo =
4322     + HT_IRQ_LOW_BASE |
4323     + HT_IRQ_LOW_DEST_ID(dest) |
4324     + HT_IRQ_LOW_VECTOR(vector) |
4325     + ((INT_DEST_MODE == 0) ?
4326     + HT_IRQ_LOW_DM_PHYSICAL :
4327     + HT_IRQ_LOW_DM_LOGICAL) |
4328     + HT_IRQ_LOW_RQEOI_EDGE |
4329     + ((INT_DELIVERY_MODE != dest_LowestPrio) ?
4330     + HT_IRQ_LOW_MT_FIXED :
4331     + HT_IRQ_LOW_MT_ARBITRATED) |
4332     + HT_IRQ_LOW_IRQ_MASKED;
4333     +
4334     + write_ht_irq_msg(irq, &msg);
4335     +
4336     + set_irq_chip_and_handler_name(irq, &ht_irq_chip,
4337     + handle_edge_irq, "edge");
4338     + }
4339     + return vector;
4340     +}
4341     +#endif /* CONFIG_HT_IRQ */
4342     +
4343     +/* --------------------------------------------------------------------------
4344     + ACPI-based IOAPIC Configuration
4345     + -------------------------------------------------------------------------- */
4346     +
4347     +#ifdef CONFIG_ACPI
4348     +
4349     +#define IO_APIC_MAX_ID 0xFE
4350    
4351     int __init io_apic_get_redir_entries (int ioapic)
4352     {
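
msi_compose_msg() above is essentially bit-packing: on x86 the destination APIC ID is folded into bits 19:12 of the MSI address (base 0xFEE00000) and the vector into bits 7:0 of the data word, with the remaining flags selecting destination and delivery mode. A simplified, self-contained sketch of that packing (trigger and delivery-mode bits omitted; the macros are cut-down approximations of the real <asm/msidef.h> ones):

	#include <stdint.h>
	#include <stdio.h>

	/* Simplified x86 MSI layout: dest APIC ID in address bits 19:12,
	 * vector number in data bits 7:0, base address 0xFEE00000. */
	#define MSI_ADDR_BASE        0xFEE00000u
	#define MSI_ADDR_DEST_ID(d)  (((uint32_t)(d) & 0xFF) << 12)
	#define MSI_DATA_VECTOR(v)   ((uint32_t)(v) & 0xFF)

	struct msi_msg {
		uint32_t address_hi;
		uint32_t address_lo;
		uint32_t data;
	};

	/* Analog of msi_compose_msg(): fold vector + destination into the message. */
	static void compose(struct msi_msg *msg, unsigned dest, unsigned vector)
	{
		msg->address_hi = 0;
		msg->address_lo = MSI_ADDR_BASE | MSI_ADDR_DEST_ID(dest);
		msg->data = MSI_DATA_VECTOR(vector);
	}

	int main(void)
	{
		struct msi_msg msg;
		compose(&msg, 3, 0x31);	/* APIC ID 3, vector 0x31 */
		printf("addr=0x%08x data=0x%08x\n", msg.address_lo, msg.data);
		return 0;
	}
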
4353     @@ -2180,6 +2045,8 @@
4354     {
4355     struct IO_APIC_route_entry entry;
4356     unsigned long flags;
4357     + int vector;
4358     + cpumask_t mask;
4359    
4360     if (!IO_APIC_IRQ(irq)) {
4361     apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
4362     @@ -2188,6 +2055,17 @@
4363     }
4364    
4365     /*
4366     + * IRQs < 16 are already in the irq_2_pin[] map
4367     + */
4368     + if (irq >= 16)
4369     + add_pin_to_irq(irq, ioapic, pin);
4370     +
4371     +
4372     + vector = assign_irq_vector(irq, TARGET_CPUS, &mask);
4373     + if (vector < 0)
4374     + return vector;
4375     +
4376     + /*
4377     * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
4378     * Note that we mask (disable) IRQs now -- these get enabled when the
4379     * corresponding device driver registers for this IRQ.
4380     @@ -2197,19 +2075,11 @@
4381    
4382     entry.delivery_mode = INT_DELIVERY_MODE;
4383     entry.dest_mode = INT_DEST_MODE;
4384     - entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
4385     + entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask);
4386     entry.trigger = edge_level;
4387     entry.polarity = active_high_low;
4388     entry.mask = 1; /* Disabled (masked) */
4389     -
4390     - irq = gsi_irq_sharing(irq);
4391     - /*
4392     - * IRQs < 16 are already in the irq_2_pin[] map
4393     - */
4394     - if (irq >= 16)
4395     - add_pin_to_irq(irq, ioapic, pin);
4396     -
4397     - entry.vector = assign_irq_vector(irq);
4398     + entry.vector = vector & 0xff;
4399    
4400     apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry (%d-%d -> 0x%x -> "
4401     "IRQ %d Mode:%i Active:%i)\n", ioapic,
4402     @@ -2221,10 +2091,10 @@
4403     if (!ioapic && (irq < 16))
4404     disable_8259A_irq(irq);
4405    
4406     + ioapic_write_entry(ioapic, pin, entry);
4407     +
4408     spin_lock_irqsave(&ioapic_lock, flags);
4409     - io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
4410     - io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
4411     - set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS);
4412     + set_native_irq_info(irq, TARGET_CPUS);
4413     spin_unlock_irqrestore(&ioapic_lock, flags);
4414    
4415     return 0;
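
A pattern worth noting before moving on: throughout this file the open-coded pairs of io_apic_write(apic, 0x11+2*pin, ...) / io_apic_write(apic, 0x10+2*pin, ...) give way to the ioapic_read_entry()/ioapic_write_entry() helpers. Underneath, those helpers type-pun the route entry through a union so it moves as two 32-bit words, writing the high word first so the entry is never live in a half-written state. A rough sketch (the real helpers also take ioapic_lock, and these bitfields are a cut-down stand-in for struct IO_APIC_route_entry):

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	/* Cut-down route entry: just enough to show the two-word split. */
	struct route_entry {
		uint32_t vector   : 8;
		uint32_t mask     : 1;	/* 1 = pin masked (disabled) */
		uint32_t reserved : 23;
		uint32_t dest;		/* destination lives in the high word */
	};

	union entry_union {
		struct { uint32_t w1, w2; } w;
		struct route_entry entry;
	};

	static uint32_t ioregs[2];	/* stand-in for the pin's two I/O APIC regs */

	static void ioapic_write_entry(struct route_entry e)
	{
		union entry_union eu;
		eu.entry = e;
		ioregs[1] = eu.w.w2;	/* high word first ... */
		ioregs[0] = eu.w.w1;	/* ... low word (vector + mask bit) last */
	}

	int main(void)
	{
		struct route_entry e;
		memset(&e, 0, sizeof(e));
		e.vector = 0x31;
		e.mask = 1;
		e.dest = 0x03000000;
		ioapic_write_entry(e);
		printf("reg0=0x%08x reg1=0x%08x\n", ioregs[0], ioregs[1]);
		return 0;
	}
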
4416 niro 612 --- a/arch/x86/kernel/ioport_64-xen.c
4417     +++ b/arch/x86/kernel/ioport_64-xen.c
4418 niro 609 @@ -58,6 +58,7 @@
4419    
4420     memset(bitmap, 0xff, IO_BITMAP_BYTES);
4421     t->io_bitmap_ptr = bitmap;
4422     + set_thread_flag(TIF_IO_BITMAP);
4423    
4424     set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap);
4425     set_iobitmap.nr_ports = IO_BITMAP_BITS;
4426 niro 612 --- a/arch/x86/kernel/irq_32-xen.c
4427     +++ b/arch/x86/kernel/irq_32-xen.c
4428 niro 609 @@ -53,8 +53,10 @@
4429     */
4430     fastcall unsigned int do_IRQ(struct pt_regs *regs)
4431     {
4432     + struct pt_regs *old_regs;
4433     /* high bit used in ret_from_ code */
4434     int irq = ~regs->orig_eax;
4435     + struct irq_desc *desc = irq_desc + irq;
4436     #ifdef CONFIG_4KSTACKS
4437     union irq_ctx *curctx, *irqctx;
4438     u32 *isp;
4439     @@ -66,6 +68,7 @@
4440     BUG();
4441     }
4442    
4443     + old_regs = set_irq_regs(regs);
4444     irq_enter();
4445     #ifdef CONFIG_DEBUG_STACKOVERFLOW
4446     /* Debugging check for stack overflow: is there less than 1KB free? */
4447     @@ -110,19 +113,20 @@
4448     (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
4449    
4450     asm volatile(
4451     - " xchgl %%ebx,%%esp \n"
4452     - " call __do_IRQ \n"
4453     + " xchgl %%ebx,%%esp \n"
4454     + " call *%%edi \n"
4455     " movl %%ebx,%%esp \n"
4456     : "=a" (arg1), "=d" (arg2), "=b" (ebx)
4457     - : "0" (irq), "1" (regs), "2" (isp)
4458     - : "memory", "cc", "ecx"
4459     + : "0" (irq), "1" (desc), "2" (isp),
4460     + "D" (desc->handle_irq)
4461     + : "memory", "cc"
4462     );
4463     } else
4464     #endif
4465     - __do_IRQ(irq, regs);
4466     + desc->handle_irq(irq, desc);
4467    
4468     irq_exit();
4469     -
4470     + set_irq_regs(old_regs);
4471     return 1;
4472     }
4473    
4474     @@ -253,7 +257,8 @@
4475     for_each_online_cpu(j)
4476     seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
4477     #endif
4478     - seq_printf(p, " %14s", irq_desc[i].chip->typename);
4479     + seq_printf(p, " %8s", irq_desc[i].chip->name);
4480     + seq_printf(p, "-%-8s", irq_desc[i].name);
4481     seq_printf(p, " %s", action->name);
4482    
4483     for (action=action->next; action; action = action->next)
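
The seq_printf change at the tail of this hunk is what re-shapes the /proc/interrupts type column: the old single typename such as "IO-APIC-edge" becomes the chip name plus the flow-handler name joined by a dash. The formatting, demonstrated with made-up sample strings:

	#include <stdio.h>

	int main(void)
	{
		/* Old style: one preformatted type name. */
		printf("%14s\n", "IO-APIC-edge");

		/* New style: chip name + '-' + flow handler name as two fields,
		 * matching the "%8s" / "-%-8s" pair in the patch. */
		printf("%8s-%-8s\n", "IO-APIC", "edge");
		printf("%8s-%-8s\n", "IO-APIC", "fasteoi");
		return 0;
	}
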
4484 niro 612 --- a/arch/x86/kernel/irq_64-xen.c
4485     +++ b/arch/x86/kernel/irq_64-xen.c
4486 niro 609 @@ -20,11 +20,6 @@
4487     #include <asm/idle.h>
4488    
4489     atomic_t irq_err_count;
4490     -#ifdef CONFIG_X86_IO_APIC
4491     -#ifdef APIC_MISMATCH_DEBUG
4492     -atomic_t irq_mis_count;
4493     -#endif
4494     -#endif
4495    
4496     #ifdef CONFIG_DEBUG_STACKOVERFLOW
4497     /*
4498     @@ -79,7 +74,8 @@
4499     for_each_online_cpu(j)
4500     seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
4501     #endif
4502     - seq_printf(p, " %14s", irq_desc[i].chip->typename);
4503     + seq_printf(p, " %8s", irq_desc[i].chip->name);
4504     + seq_printf(p, "-%-8s", irq_desc[i].name);
4505    
4506     seq_printf(p, " %s", action->name);
4507     for (action=action->next; action; action = action->next)
4508     @@ -99,11 +95,6 @@
4509     seq_putc(p, '\n');
4510     #endif
4511     seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
4512     -#ifdef CONFIG_X86_IO_APIC
4513     -#ifdef APIC_MISMATCH_DEBUG
4514     - seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
4515     -#endif
4516     -#endif
4517     }
4518     return 0;
4519     }
4520     @@ -114,24 +105,28 @@
4521     * handlers).
4522     */
4523     asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
4524     -{
4525     +{
4526     + struct pt_regs *old_regs = set_irq_regs(regs);
4527     +
4528     /* high bit used in ret_from_ code */
4529     unsigned irq = ~regs->orig_rax;
4530    
4531     - if (unlikely(irq >= NR_IRQS)) {
4532     - printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
4533     - __FUNCTION__, irq);
4534     - BUG();
4535     - }
4536     -
4537     exit_idle();
4538     irq_enter();
4539     +
4540     #ifdef CONFIG_DEBUG_STACKOVERFLOW
4541     stack_overflow_check(regs);
4542     #endif
4543     - __do_IRQ(irq, regs);
4544     +
4545     + if (likely(irq < NR_IRQS))
4546     + generic_handle_irq(irq);
4547     + else
4548     + printk(KERN_EMERG "%s: %d.%d No irq handler for irq\n",
4549     + __func__, smp_processor_id(), irq);
4550     +
4551     irq_exit();
4552    
4553     + set_irq_regs(old_regs);
4554     return 1;
4555     }
4556    
4557     @@ -192,6 +187,6 @@
4558     */
4559     void ack_bad_irq(unsigned int irq)
4560     {
4561     - printk("unexpected IRQ trap at vector %02x\n", irq);
4562     + printk("unexpected IRQ trap at irq %02x\n", irq);
4563     }
4564     #endif
4565 niro 612 --- a/arch/x86/kernel/ldt_32-xen.c
4566     +++ b/arch/x86/kernel/ldt_32-xen.c
4567 niro 609 @@ -1,5 +1,5 @@
4568     /*
4569     - * linux/kernel/ldt.c
4570     + * linux/arch/i386/kernel/ldt.c
4571     *
4572     * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
4573     * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
4574 niro 612 --- a/arch/x86/kernel/microcode-xen.c
4575     +++ b/arch/x86/kernel/microcode-xen.c
4576 niro 609 @@ -2,6 +2,7 @@
4577     * Intel CPU Microcode Update Driver for Linux
4578     *
4579     * Copyright (C) 2000-2004 Tigran Aivazian
4580     + * 2006 Shaohua Li <shaohua.li@intel.com>
4581     *
4582     * This driver allows to upgrade microcode on Intel processors
4583     * belonging to IA-32 family - PentiumPro, Pentium II,
4584     @@ -33,7 +34,9 @@
4585     #include <linux/spinlock.h>
4586     #include <linux/mm.h>
4587     #include <linux/mutex.h>
4588     -#include <linux/syscalls.h>
4589     +#include <linux/cpu.h>
4590     +#include <linux/firmware.h>
4591     +#include <linux/platform_device.h>
4592    
4593     #include <asm/msr.h>
4594     #include <asm/uaccess.h>
4595     @@ -55,12 +58,7 @@
4596     /* no concurrent ->write()s are allowed on /dev/cpu/microcode */
4597     static DEFINE_MUTEX(microcode_mutex);
4598    
4599     -static int microcode_open (struct inode *unused1, struct file *unused2)
4600     -{
4601     - return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
4602     -}
4603     -
4604     -
4605     +#ifdef CONFIG_MICROCODE_OLD_INTERFACE
4606     static int do_microcode_update (const void __user *ubuf, size_t len)
4607     {
4608     int err;
4609     @@ -85,6 +83,11 @@
4610     return err;
4611     }
4612    
4613     +static int microcode_open (struct inode *unused1, struct file *unused2)
4614     +{
4615     + return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
4616     +}
4617     +
4618     static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos)
4619     {
4620     ssize_t ret;
4621     @@ -117,7 +120,7 @@
4622     .fops = &microcode_fops,
4623     };
4624    
4625     -static int __init microcode_init (void)
4626     +static int __init microcode_dev_init (void)
4627     {
4628     int error;
4629    
4630     @@ -129,6 +132,68 @@
4631     return error;
4632     }
4633    
4634     + return 0;
4635     +}
4636     +
4637     +static void __exit microcode_dev_exit (void)
4638     +{
4639     + misc_deregister(&microcode_dev);
4640     +}
4641     +
4642     +MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
4643     +#else
4644     +#define microcode_dev_init() 0
4645     +#define microcode_dev_exit() do { } while(0)
4646     +#endif
4647     +
4648     +/* fake device for request_firmware */
4649     +static struct platform_device *microcode_pdev;
4650     +
4651     +static int request_microcode(void)
4652     +{
4653     + char name[30];
4654     + const struct cpuinfo_x86 *c = &boot_cpu_data;
4655     + const struct firmware *firmware;
4656     + int error;
4657     + struct xen_platform_op op;
4658     +
4659     + sprintf(name,"intel-ucode/%02x-%02x-%02x",
4660     + c->x86, c->x86_model, c->x86_mask);
4661     + error = request_firmware(&firmware, name, &microcode_pdev->dev);
4662     + if (error) {
4663     + pr_debug("ucode data file %s load failed\n", name);
4664     + return error;
4665     + }
4666     +
4667     + op.cmd = XENPF_microcode_update;
4668     + set_xen_guest_handle(op.u.microcode.data, (void *)firmware->data);
4669     + op.u.microcode.length = firmware->size;
4670     + error = HYPERVISOR_platform_op(&op);
4671     +
4672     + release_firmware(firmware);
4673     +
4674     + if (error)
4675     + pr_debug("ucode load failed\n");
4676     +
4677     + return error;
4678     +}
4679     +
4680     +static int __init microcode_init (void)
4681     +{
4682     + int error;
4683     +
4684     + error = microcode_dev_init();
4685     + if (error)
4686     + return error;
4687     + microcode_pdev = platform_device_register_simple("microcode", -1,
4688     + NULL, 0);
4689     + if (IS_ERR(microcode_pdev)) {
4690     + microcode_dev_exit();
4691     + return PTR_ERR(microcode_pdev);
4692     + }
4693     +
4694     + request_microcode();
4695     +
4696     printk(KERN_INFO
4697     "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@veritas.com>\n");
4698     return 0;
4699     @@ -136,9 +201,9 @@
4700    
4701     static void __exit microcode_exit (void)
4702     {
4703     - misc_deregister(&microcode_dev);
4704     + microcode_dev_exit();
4705     + platform_device_unregister(microcode_pdev);
4706     }
4707    
4708     module_init(microcode_init)
4709     module_exit(microcode_exit)
4710     -MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
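
The firmware path built by request_microcode() above encodes the CPU's family, model and stepping, so a Family 6 / Model 15 / Stepping 2 part asks for intel-ucode/06-0f-02. A sketch of just the name construction (values hard-coded here; the kernel reads them from boot_cpu_data):

	#include <stdio.h>

	int main(void)
	{
		/* Example CPUID values; the kernel takes these from boot_cpu_data. */
		unsigned x86 = 6, x86_model = 15, x86_mask = 2;	/* stepping */
		char name[30];

		/* Same format string as request_microcode() in the patch. */
		snprintf(name, sizeof(name), "intel-ucode/%02x-%02x-%02x",
			 x86, x86_model, x86_mask);
		printf("%s\n", name);	/* -> intel-ucode/06-0f-02 */
		return 0;
	}
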
4711 niro 612 --- a/arch/x86/kernel/mpparse_32-xen.c
4712     +++ b/arch/x86/kernel/mpparse_32-xen.c
4713 niro 609 @@ -30,6 +30,7 @@
4714     #include <asm/io_apic.h>
4715    
4716     #include <mach_apic.h>
4717     +#include <mach_apicdef.h>
4718     #include <mach_mpparse.h>
4719     #include <bios_ebda.h>
4720    
4721     @@ -68,7 +69,7 @@
4722     /* Processor that is doing the boot up */
4723     unsigned int boot_cpu_physical_apicid = -1U;
4724     /* Internal processor count */
4725     -static unsigned int __devinitdata num_processors;
4726     +unsigned int __cpuinitdata num_processors;
4727    
4728     /* Bitmask of physically existing CPUs */
4729     physid_mask_t phys_cpu_present_map;
4730     @@ -235,12 +236,14 @@
4731    
4732     mpc_oem_bus_info(m, str, translation_table[mpc_record]);
4733    
4734     +#if MAX_MP_BUSSES < 256
4735     if (m->mpc_busid >= MAX_MP_BUSSES) {
4736     printk(KERN_WARNING "MP table busid value (%d) for bustype %s "
4737     " is too large, max. supported is %d\n",
4738     m->mpc_busid, str, MAX_MP_BUSSES - 1);
4739     return;
4740     }
4741     +#endif
4742    
4743     if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) {
4744     mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
4745     @@ -300,19 +303,6 @@
4746     m->mpc_irqtype, m->mpc_irqflag & 3,
4747     (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
4748     m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
4749     - /*
4750     - * Well it seems all SMP boards in existence
4751     - * use ExtINT/LVT1 == LINT0 and
4752     - * NMI/LVT2 == LINT1 - the following check
4753     - * will show us if this assumptions is false.
4754     - * Until then we do not have to add baggage.
4755     - */
4756     - if ((m->mpc_irqtype == mp_ExtINT) &&
4757     - (m->mpc_destapiclint != 0))
4758     - BUG();
4759     - if ((m->mpc_irqtype == mp_NMI) &&
4760     - (m->mpc_destapiclint != 1))
4761     - BUG();
4762     }
4763    
4764     #ifdef CONFIG_X86_NUMAQ
4765     @@ -838,8 +828,7 @@
4766    
4767     #ifdef CONFIG_ACPI
4768    
4769     -void __init mp_register_lapic_address (
4770     - u64 address)
4771     +void __init mp_register_lapic_address(u64 address)
4772     {
4773     #ifndef CONFIG_XEN
4774     mp_lapic_addr = (unsigned long) address;
4775     @@ -853,13 +842,10 @@
4776     #endif
4777     }
4778    
4779     -
4780     -void __devinit mp_register_lapic (
4781     - u8 id,
4782     - u8 enabled)
4783     +void __devinit mp_register_lapic (u8 id, u8 enabled)
4784     {
4785     struct mpc_config_processor processor;
4786     - int boot_cpu = 0;
4787     + int boot_cpu = 0;
4788    
4789     if (MAX_APICS - id <= 0) {
4790     printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
4791     @@ -898,11 +884,9 @@
4792     u32 pin_programmed[4];
4793     } mp_ioapic_routing[MAX_IO_APICS];
4794    
4795     -
4796     -static int mp_find_ioapic (
4797     - int gsi)
4798     +static int mp_find_ioapic (int gsi)
4799     {
4800     - int i = 0;
4801     + int i = 0;
4802    
4803     /* Find the IOAPIC that manages this GSI. */
4804     for (i = 0; i < nr_ioapics; i++) {
4805     @@ -915,15 +899,11 @@
4806    
4807     return -1;
4808     }
4809     -
4810    
4811     -void __init mp_register_ioapic (
4812     - u8 id,
4813     - u32 address,
4814     - u32 gsi_base)
4815     +void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
4816     {
4817     - int idx = 0;
4818     - int tmpid;
4819     + int idx = 0;
4820     + int tmpid;
4821    
4822     if (nr_ioapics >= MAX_IO_APICS) {
4823     printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
4824     @@ -971,16 +951,10 @@
4825     mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
4826     mp_ioapic_routing[idx].gsi_base,
4827     mp_ioapic_routing[idx].gsi_end);
4828     -
4829     - return;
4830     }
4831    
4832     -
4833     -void __init mp_override_legacy_irq (
4834     - u8 bus_irq,
4835     - u8 polarity,
4836     - u8 trigger,
4837     - u32 gsi)
4838     +void __init
4839     +mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
4840     {
4841     struct mpc_config_intsrc intsrc;
4842     int ioapic = -1;
4843     @@ -1018,15 +992,13 @@
4844     mp_irqs[mp_irq_entries] = intsrc;
4845     if (++mp_irq_entries == MAX_IRQ_SOURCES)
4846     panic("Max # of irq sources exceeded!\n");
4847     -
4848     - return;
4849     }
4850    
4851     void __init mp_config_acpi_legacy_irqs (void)
4852     {
4853     struct mpc_config_intsrc intsrc;
4854     - int i = 0;
4855     - int ioapic = -1;
4856     + int i = 0;
4857     + int ioapic = -1;
4858    
4859     /*
4860     * Fabricate the legacy ISA bus (bus #31).
4861     @@ -1095,12 +1067,12 @@
4862    
4863     #define MAX_GSI_NUM 4096
4864    
4865     -int mp_register_gsi (u32 gsi, int triggering, int polarity)
4866     +int mp_register_gsi(u32 gsi, int triggering, int polarity)
4867     {
4868     - int ioapic = -1;
4869     - int ioapic_pin = 0;
4870     - int idx, bit = 0;
4871     - static int pci_irq = 16;
4872     + int ioapic = -1;
4873     + int ioapic_pin = 0;
4874     + int idx, bit = 0;
4875     + static int pci_irq = 16;
4876     * Mapping between Global System Interrupts, which
4877     * Mapping between Global System Interrups, which
4878     * represent all possible interrupts, and IRQs
4879 niro 612 --- a/arch/x86/kernel/mpparse_64-xen.c
4880     +++ b/arch/x86/kernel/mpparse_64-xen.c
4881 niro 609 @@ -41,8 +41,7 @@
4882     * Various Linux-internal data structures created from the
4883     * MP-table.
4884     */
4885     -unsigned char apic_version [MAX_APICS];
4886     -unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
4887     +DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
4888     int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
4889    
4890     static int mp_current_pci_id = 0;
4891     @@ -56,7 +55,6 @@
4892     int mp_irq_entries;
4893    
4894     int nr_ioapics;
4895     -int pic_mode;
4896     unsigned long mp_lapic_addr = 0;
4897    
4898    
4899     @@ -71,19 +69,6 @@
4900     /* Bitmask of physically existing CPUs */
4901     physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
4902    
4903     -/* ACPI MADT entry parsing functions */
4904     -#ifdef CONFIG_ACPI
4905     -extern struct acpi_boot_flags acpi_boot;
4906     -#ifdef CONFIG_X86_LOCAL_APIC
4907     -extern int acpi_parse_lapic (acpi_table_entry_header *header);
4908     -extern int acpi_parse_lapic_addr_ovr (acpi_table_entry_header *header);
4909     -extern int acpi_parse_lapic_nmi (acpi_table_entry_header *header);
4910     -#endif /*CONFIG_X86_LOCAL_APIC*/
4911     -#ifdef CONFIG_X86_IO_APIC
4912     -extern int acpi_parse_ioapic (acpi_table_entry_header *header);
4913     -#endif /*CONFIG_X86_IO_APIC*/
4914     -#endif /*CONFIG_ACPI*/
4915     -
4916     u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
4917    
4918    
4919     @@ -109,24 +94,20 @@
4920     static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
4921     {
4922     int cpu;
4923     - unsigned char ver;
4924     cpumask_t tmp_map;
4925     + char *bootup_cpu = "";
4926    
4927     if (!(m->mpc_cpuflag & CPU_ENABLED)) {
4928     disabled_cpus++;
4929     return;
4930     }
4931     -
4932     - printk(KERN_INFO "Processor #%d %d:%d APIC version %d\n",
4933     - m->mpc_apicid,
4934     - (m->mpc_cpufeature & CPU_FAMILY_MASK)>>8,
4935     - (m->mpc_cpufeature & CPU_MODEL_MASK)>>4,
4936     - m->mpc_apicver);
4937     -
4938     if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
4939     - Dprintk(" Bootup CPU\n");
4940     + bootup_cpu = " (Bootup-CPU)";
4941     boot_cpu_id = m->mpc_apicid;
4942     }
4943     +
4944     + printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu);
4945     +
4946     if (num_processors >= NR_CPUS) {
4947     printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
4948     " Processor ignored.\n", NR_CPUS);
4949     @@ -137,24 +118,7 @@
4950     cpus_complement(tmp_map, cpu_present_map);
4951     cpu = first_cpu(tmp_map);
4952    
4953     -#if MAX_APICS < 255
4954     - if ((int)m->mpc_apicid > MAX_APICS) {
4955     - printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
4956     - m->mpc_apicid, MAX_APICS);
4957     - return;
4958     - }
4959     -#endif
4960     - ver = m->mpc_apicver;
4961     -
4962     physid_set(m->mpc_apicid, phys_cpu_present_map);
4963     - /*
4964     - * Validate version
4965     - */
4966     - if (ver == 0x0) {
4967     - printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! fixing up to 0x10. (tell your hw vendor)\n", m->mpc_apicid);
4968     - ver = 0x10;
4969     - }
4970     - apic_version[m->mpc_apicid] = ver;
4971     if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
4972     /*
4973     * bios_cpu_apicid is required to have processors listed
4974     @@ -185,37 +149,42 @@
4975     Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
4976    
4977     if (strncmp(str, "ISA", 3) == 0) {
4978     - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
4979     - } else if (strncmp(str, "EISA", 4) == 0) {
4980     - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
4981     + set_bit(m->mpc_busid, mp_bus_not_pci);
4982     } else if (strncmp(str, "PCI", 3) == 0) {
4983     - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
4984     + clear_bit(m->mpc_busid, mp_bus_not_pci);
4985     mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
4986     mp_current_pci_id++;
4987     - } else if (strncmp(str, "MCA", 3) == 0) {
4988     - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
4989     } else {
4990     printk(KERN_ERR "Unknown bustype %s\n", str);
4991     }
4992     }
4993    
4994     +static int bad_ioapic(unsigned long address)
4995     +{
4996     + if (nr_ioapics >= MAX_IO_APICS) {
4997     + printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
4998     + "(found %d)\n", MAX_IO_APICS, nr_ioapics);
4999     + panic("Recompile kernel with bigger MAX_IO_APICS!\n");
5000     + }
5001     + if (!address) {
5002     + printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
5003     + " found in table, skipping!\n");
5004     + return 1;
5005     + }
5006     + return 0;
5007     +}
5008     +
5009     static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
5010     {
5011     if (!(m->mpc_flags & MPC_APIC_USABLE))
5012     return;
5013    
5014     - printk("I/O APIC #%d Version %d at 0x%X.\n",
5015     - m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
5016     - if (nr_ioapics >= MAX_IO_APICS) {
5017     - printk(KERN_ERR "Max # of I/O APICs (%d) exceeded (found %d).\n",
5018     - MAX_IO_APICS, nr_ioapics);
5019     - panic("Recompile kernel with bigger MAX_IO_APICS!.\n");
5020     - }
5021     - if (!m->mpc_apicaddr) {
5022     - printk(KERN_ERR "WARNING: bogus zero I/O APIC address"
5023     - " found in MP table, skipping!\n");
5024     + printk("I/O APIC #%d at 0x%X.\n",
5025     + m->mpc_apicid, m->mpc_apicaddr);
5026     +
5027     + if (bad_ioapic(m->mpc_apicaddr))
5028     return;
5029     - }
5030     +
5031     mp_ioapics[nr_ioapics] = *m;
5032     nr_ioapics++;
5033     }
5034     @@ -239,19 +208,6 @@
5035     m->mpc_irqtype, m->mpc_irqflag & 3,
5036     (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
5037     m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
5038     - /*
5039     - * Well it seems all SMP boards in existence
5040     - * use ExtINT/LVT1 == LINT0 and
5041     - * NMI/LVT2 == LINT1 - the following check
5042     - * will show us if this assumptions is false.
5043     - * Until then we do not have to add baggage.
5044     - */
5045     - if ((m->mpc_irqtype == mp_ExtINT) &&
5046     - (m->mpc_destapiclint != 0))
5047     - BUG();
5048     - if ((m->mpc_irqtype == mp_NMI) &&
5049     - (m->mpc_destapiclint != 1))
5050     - BUG();
5051     }
5052    
5053     /*
5054     @@ -265,7 +221,7 @@
5055     unsigned char *mpt=((unsigned char *)mpc)+count;
5056    
5057     if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
5058     - printk("SMP mptable: bad signature [%c%c%c%c]!\n",
5059     + printk("MPTABLE: bad signature [%c%c%c%c]!\n",
5060     mpc->mpc_signature[0],
5061     mpc->mpc_signature[1],
5062     mpc->mpc_signature[2],
5063     @@ -273,31 +229,31 @@
5064     return 0;
5065     }
5066     if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
5067     - printk("SMP mptable: checksum error!\n");
5068     + printk("MPTABLE: checksum error!\n");
5069     return 0;
5070     }
5071     if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
5072     - printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n",
5073     + printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n",
5074     mpc->mpc_spec);
5075     return 0;
5076     }
5077     if (!mpc->mpc_lapic) {
5078     - printk(KERN_ERR "SMP mptable: null local APIC address!\n");
5079     + printk(KERN_ERR "MPTABLE: null local APIC address!\n");
5080     return 0;
5081     }
5082     memcpy(str,mpc->mpc_oem,8);
5083     - str[8]=0;
5084     - printk(KERN_INFO "OEM ID: %s ",str);
5085     + str[8] = 0;
5086     + printk(KERN_INFO "MPTABLE: OEM ID: %s ",str);
5087    
5088     memcpy(str,mpc->mpc_productid,12);
5089     - str[12]=0;
5090     - printk("Product ID: %s ",str);
5091     + str[12] = 0;
5092     + printk("MPTABLE: Product ID: %s ",str);
5093    
5094     - printk("APIC at: 0x%X\n",mpc->mpc_lapic);
5095     + printk("MPTABLE: APIC at: 0x%X\n",mpc->mpc_lapic);
5096    
5097     /* save the local APIC address, it might be non-default */
5098     if (!acpi_lapic)
5099     - mp_lapic_addr = mpc->mpc_lapic;
5100     + mp_lapic_addr = mpc->mpc_lapic;
5101    
5102     /*
5103     * Now process the configuration blocks.
5104     @@ -309,7 +265,7 @@
5105     struct mpc_config_processor *m=
5106     (struct mpc_config_processor *)mpt;
5107     if (!acpi_lapic)
5108     - MP_processor_info(m);
5109     + MP_processor_info(m);
5110     mpt += sizeof(*m);
5111     count += sizeof(*m);
5112     break;
5113     @@ -328,8 +284,8 @@
5114     struct mpc_config_ioapic *m=
5115     (struct mpc_config_ioapic *)mpt;
5116     MP_ioapic_info(m);
5117     - mpt+=sizeof(*m);
5118     - count+=sizeof(*m);
5119     + mpt += sizeof(*m);
5120     + count += sizeof(*m);
5121     break;
5122     }
5123     case MP_INTSRC:
5124     @@ -338,8 +294,8 @@
5125     (struct mpc_config_intsrc *)mpt;
5126    
5127     MP_intsrc_info(m);
5128     - mpt+=sizeof(*m);
5129     - count+=sizeof(*m);
5130     + mpt += sizeof(*m);
5131     + count += sizeof(*m);
5132     break;
5133     }
5134     case MP_LINTSRC:
5135     @@ -347,15 +303,15 @@
5136     struct mpc_config_lintsrc *m=
5137     (struct mpc_config_lintsrc *)mpt;
5138     MP_lintsrc_info(m);
5139     - mpt+=sizeof(*m);
5140     - count+=sizeof(*m);
5141     + mpt += sizeof(*m);
5142     + count += sizeof(*m);
5143     break;
5144     }
5145     }
5146     }
5147     clustered_apic_check();
5148     if (!num_processors)
5149     - printk(KERN_ERR "SMP mptable: no processors registered!\n");
5150     + printk(KERN_ERR "MPTABLE: no processors registered!\n");
5151     return num_processors;
5152     }
5153    
5154     @@ -451,13 +407,10 @@
5155     * 2 CPUs, numbered 0 & 1.
5156     */
5157     processor.mpc_type = MP_PROCESSOR;
5158     - /* Either an integrated APIC or a discrete 82489DX. */
5159     - processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
5160     + processor.mpc_apicver = 0;
5161     processor.mpc_cpuflag = CPU_ENABLED;
5162     - processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
5163     - (boot_cpu_data.x86_model << 4) |
5164     - boot_cpu_data.x86_mask;
5165     - processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
5166     + processor.mpc_cpufeature = 0;
5167     + processor.mpc_featureflag = 0;
5168     processor.mpc_reserved[0] = 0;
5169     processor.mpc_reserved[1] = 0;
5170     for (i = 0; i < 2; i++) {
5171     @@ -476,14 +429,6 @@
5172     case 5:
5173     memcpy(bus.mpc_bustype, "ISA ", 6);
5174     break;
5175     - case 2:
5176     - case 6:
5177     - case 3:
5178     - memcpy(bus.mpc_bustype, "EISA ", 6);
5179     - break;
5180     - case 4:
5181     - case 7:
5182     - memcpy(bus.mpc_bustype, "MCA ", 6);
5183     }
5184     MP_bus_info(&bus);
5185     if (mpc_default_type > 4) {
5186     @@ -494,7 +439,7 @@
5187    
5188     ioapic.mpc_type = MP_IOAPIC;
5189     ioapic.mpc_apicid = 2;
5190     - ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
5191     + ioapic.mpc_apicver = 0;
5192     ioapic.mpc_flags = MPC_APIC_USABLE;
5193     ioapic.mpc_apicaddr = 0xFEC00000;
5194     MP_ioapic_info(&ioapic);
5195     @@ -537,13 +482,6 @@
5196     printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
5197    
5198     printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
5199     - if (mpf->mpf_feature2 & (1<<7)) {
5200     - printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
5201     - pic_mode = 1;
5202     - } else {
5203     - printk(KERN_INFO " Virtual Wire compatibility mode.\n");
5204     - pic_mode = 0;
5205     - }
5206    
5207     /*
5208     * Now see if we need to read further.
5209     @@ -620,7 +558,7 @@
5210     return 0;
5211     }
5212    
5213     -void __init find_intel_smp (void)
5214     +void __init find_smp_config(void)
5215     {
5216     unsigned int address;
5217    
5218     @@ -637,9 +575,7 @@
5219     smp_scan_config(0xF0000,0x10000))
5220     return;
5221     /*
5222     - * If it is an SMP machine we should know now, unless the
5223     - * configuration is in an EISA/MCA bus machine with an
5224     - * extended bios data area.
5225     + * If it is an SMP machine we should know now.
5226     *
5227     * there is a real-mode segmented pointer pointing to the
5228     * 4K EBDA area at 0x40E, calculate and scan it here.
5229     @@ -660,64 +596,38 @@
5230     printk(KERN_INFO "No mptable found.\n");
5231     }
5232    
5233     -/*
5234     - * - Intel MP Configuration Table
5235     - */
5236     -void __init find_smp_config (void)
5237     -{
5238     -#ifdef CONFIG_X86_LOCAL_APIC
5239     - find_intel_smp();
5240     -#endif
5241     -}
5242     -
5243     -
5244     /* --------------------------------------------------------------------------
5245     ACPI-based MP Configuration
5246     -------------------------------------------------------------------------- */
5247    
5248     #ifdef CONFIG_ACPI
5249    
5250     -void __init mp_register_lapic_address (
5251     - u64 address)
5252     +void __init mp_register_lapic_address(u64 address)
5253     {
5254     #ifndef CONFIG_XEN
5255     mp_lapic_addr = (unsigned long) address;
5256     -
5257     set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
5258     -
5259     if (boot_cpu_id == -1U)
5260     boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
5261     -
5262     - Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
5263     #endif
5264     }
5265    
5266     -
5267     -void __cpuinit mp_register_lapic (
5268     - u8 id,
5269     - u8 enabled)
5270     +void __cpuinit mp_register_lapic (u8 id, u8 enabled)
5271     {
5272     struct mpc_config_processor processor;
5273     int boot_cpu = 0;
5274    
5275     - if (id >= MAX_APICS) {
5276     - printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
5277     - id, MAX_APICS);
5278     - return;
5279     - }
5280     -
5281     - if (id == boot_cpu_physical_apicid)
5282     + if (id == boot_cpu_id)
5283     boot_cpu = 1;
5284    
5285     #ifndef CONFIG_XEN
5286     processor.mpc_type = MP_PROCESSOR;
5287     processor.mpc_apicid = id;
5288     - processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR));
5289     + processor.mpc_apicver = 0;
5290     processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
5291     processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
5292     - processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
5293     - (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
5294     - processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
5295     + processor.mpc_cpufeature = 0;
5296     + processor.mpc_featureflag = 0;
5297     processor.mpc_reserved[0] = 0;
5298     processor.mpc_reserved[1] = 0;
5299     #endif
5300     @@ -725,8 +635,6 @@
5301     MP_processor_info(&processor);
5302     }
5303    
5304     -#ifdef CONFIG_X86_IO_APIC
5305     -
5306     #define MP_ISA_BUS 0
5307     #define MP_MAX_IOAPIC_PIN 127
5308    
5309     @@ -737,11 +645,9 @@
5310     u32 pin_programmed[4];
5311     } mp_ioapic_routing[MAX_IO_APICS];
5312    
5313     -
5314     -static int mp_find_ioapic (
5315     - int gsi)
5316     +static int mp_find_ioapic(int gsi)
5317     {
5318     - int i = 0;
5319     + int i = 0;
5320    
5321     /* Find the IOAPIC that manages this GSI. */
5322     for (i = 0; i < nr_ioapics; i++) {
5323     @@ -751,28 +657,15 @@
5324     }
5325    
5326     printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
5327     -
5328     return -1;
5329     }
5330     -
5331    
5332     -void __init mp_register_ioapic (
5333     - u8 id,
5334     - u32 address,
5335     - u32 gsi_base)
5336     +void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
5337     {
5338     - int idx = 0;
5339     + int idx = 0;
5340    
5341     - if (nr_ioapics >= MAX_IO_APICS) {
5342     - printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
5343     - "(found %d)\n", MAX_IO_APICS, nr_ioapics);
5344     - panic("Recompile kernel with bigger MAX_IO_APICS!\n");
5345     - }
5346     - if (!address) {
5347     - printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
5348     - " found in MADT table, skipping!\n");
5349     + if (bad_ioapic(address))
5350     return;
5351     - }
5352    
5353     idx = nr_ioapics++;
5354    
5355     @@ -784,7 +677,7 @@
5356     set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
5357     #endif
5358     mp_ioapics[idx].mpc_apicid = id;
5359     - mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
5360     + mp_ioapics[idx].mpc_apicver = 0;
5361    
5362     /*
5363     * Build basic IRQ lookup table to facilitate gsi->io_apic lookups
5364     @@ -795,21 +688,15 @@
5365     mp_ioapic_routing[idx].gsi_end = gsi_base +
5366     io_apic_get_redir_entries(idx);
5367    
5368     - printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
5369     + printk(KERN_INFO "IOAPIC[%d]: apic_id %d, address 0x%x, "
5370     "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
5371     - mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
5372     + mp_ioapics[idx].mpc_apicaddr,
5373     mp_ioapic_routing[idx].gsi_start,
5374     mp_ioapic_routing[idx].gsi_end);
5375     -
5376     - return;
5377     }
5378    
5379     -
5380     -void __init mp_override_legacy_irq (
5381     - u8 bus_irq,
5382     - u8 polarity,
5383     - u8 trigger,
5384     - u32 gsi)
5385     +void __init
5386     +mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
5387     {
5388     struct mpc_config_intsrc intsrc;
5389     int ioapic = -1;
5390     @@ -847,22 +734,18 @@
5391     mp_irqs[mp_irq_entries] = intsrc;
5392     if (++mp_irq_entries == MAX_IRQ_SOURCES)
5393     panic("Max # of irq sources exceeded!\n");
5394     -
5395     - return;
5396     }
5397    
5398     -
5399     -void __init mp_config_acpi_legacy_irqs (void)
5400     +void __init mp_config_acpi_legacy_irqs(void)
5401     {
5402     struct mpc_config_intsrc intsrc;
5403     - int i = 0;
5404     - int ioapic = -1;
5405     + int i = 0;
5406     + int ioapic = -1;
5407    
5408     /*
5409     * Fabricate the legacy ISA bus (bus #31).
5410     */
5411     - mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
5412     - Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
5413     + set_bit(MP_ISA_BUS, mp_bus_not_pci);
5414    
5415     /*
5416     * Locate the IOAPIC that manages the ISA IRQs (0-15).
5417     @@ -915,24 +798,13 @@
5418     if (++mp_irq_entries == MAX_IRQ_SOURCES)
5419     panic("Max # of irq sources exceeded!\n");
5420     }
5421     -
5422     - return;
5423     }
5424    
5425     -#define MAX_GSI_NUM 4096
5426     -
5427     int mp_register_gsi(u32 gsi, int triggering, int polarity)
5428     {
5429     - int ioapic = -1;
5430     - int ioapic_pin = 0;
5431     - int idx, bit = 0;
5432     - static int pci_irq = 16;
5433     - /*
5434     - * Mapping between Global System Interrupts, which
5435     - * represent all possible interrupts, to the IRQs
5436     - * assigned to actual devices.
5437     - */
5438     - static int gsi_to_irq[MAX_GSI_NUM];
5439     + int ioapic = -1;
5440     + int ioapic_pin = 0;
5441     + int idx, bit = 0;
5442    
5443     if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
5444     return gsi;
5445     @@ -965,47 +837,14 @@
5446     if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
5447     Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
5448     mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
5449     - return gsi_to_irq[gsi];
5450     + return gsi;
5451     }
5452    
5453     mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
5454    
5455     - if (triggering == ACPI_LEVEL_SENSITIVE) {
5456     - /*
5457     - * For PCI devices assign IRQs in order, avoiding gaps
5458     - * due to unused I/O APIC pins.
5459     - */
5460     - int irq = gsi;
5461     - if (gsi < MAX_GSI_NUM) {
5462     - /*
5463     - * Retain the VIA chipset work-around (gsi > 15), but
5464     - * avoid a problem where the 8254 timer (IRQ0) is setup
5465     - * via an override (so it's not on pin 0 of the ioapic),
5466     - * and at the same time, the pin 0 interrupt is a PCI
5467     - * type. The gsi > 15 test could cause these two pins
5468     - * to be shared as IRQ0, and they are not shareable.
5469     - * So test for this condition, and if necessary, avoid
5470     - * the pin collision.
5471     - */
5472     - if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0))
5473     - gsi = pci_irq++;
5474     - /*
5475     - * Don't assign IRQ used by ACPI SCI
5476     - */
5477     - if (gsi == acpi_fadt.sci_int)
5478     - gsi = pci_irq++;
5479     - gsi_to_irq[irq] = gsi;
5480     - } else {
5481     - printk(KERN_ERR "GSI %u is too high\n", gsi);
5482     - return gsi;
5483     - }
5484     - }
5485     -
5486     io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
5487     triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
5488     polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
5489     return gsi;
5490     }
5491     -
5492     -#endif /*CONFIG_X86_IO_APIC*/
5493     #endif /*CONFIG_ACPI*/
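The mpparse hunks above make two structural simplifications. First, the per-bus type array mp_bus_id_to_type gives way to a single mp_bus_not_pci bitmap: EISA and MCA handling is dropped, so the only per-bus question left is whether a bus is PCI. A minimal sketch of the resulting idiom — note_bus_type() and bus_is_pci() are illustrative names, not the kernel's, and MAX_MP_BUSSES stands in for the MP-spec bus-table bound from the kernel headers:

	#include <linux/types.h>
	#include <linux/bitops.h>
	#include <linux/string.h>

	static DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);

	static void note_bus_type(int busid, const char *type)
	{
		if (strncmp(type, "PCI", 3) == 0)
			clear_bit(busid, mp_bus_not_pci);	/* PCI bus */
		else
			set_bit(busid, mp_bus_not_pci);		/* ISA or unknown */
	}

	static int bus_is_pci(int busid)
	{
		return !test_bit(busid, mp_bus_not_pci);
	}

Second, the duplicated MAX_IO_APICS overflow and zero-address checks are hoisted out of MP_ioapic_info() and mp_register_ioapic() into the shared bad_ioapic() helper, so the MP-table and ACPI MADT paths now reject bogus I/O APIC entries identically.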
5494 niro 612 --- a/arch/x86/kernel/pci-dma_32-xen.c
5495     +++ b/arch/x86/kernel/pci-dma_32-xen.c
5496 niro 609 @@ -116,8 +116,7 @@
5497     {
5498     int i, rc;
5499    
5500     - if (direction == DMA_NONE)
5501     - BUG();
5502     + BUG_ON(!valid_dma_direction(direction));
5503     WARN_ON(nents == 0 || sg[0].length == 0);
5504    
5505     if (swiotlb) {
5506     @@ -148,7 +147,7 @@
5507     {
5508     int i;
5509    
5510     - BUG_ON(direction == DMA_NONE);
5511     + BUG_ON(!valid_dma_direction(direction));
5512     if (swiotlb)
5513     swiotlb_unmap_sg(hwdev, sg, nents, direction);
5514     else {
5515     @@ -165,8 +164,7 @@
5516     {
5517     dma_addr_t dma_addr;
5518    
5519     - BUG_ON(direction == DMA_NONE);
5520     -
5521     + BUG_ON(!valid_dma_direction(direction));
5522     if (swiotlb) {
5523     dma_addr = swiotlb_map_page(
5524     dev, page, offset, size, direction);
5525     @@ -183,7 +181,7 @@
5526     dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
5527     enum dma_data_direction direction)
5528     {
5529     - BUG_ON(direction == DMA_NONE);
5530     + BUG_ON(!valid_dma_direction(direction));
5531     if (swiotlb)
5532     swiotlb_unmap_page(dev, dma_address, size, direction);
5533     else
5534     @@ -365,8 +363,7 @@
5535     {
5536     dma_addr_t dma;
5537    
5538     - if (direction == DMA_NONE)
5539     - BUG();
5540     + BUG_ON(!valid_dma_direction(direction));
5541     WARN_ON(size == 0);
5542    
5543     if (swiotlb) {
5544     @@ -387,8 +384,7 @@
5545     dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
5546     enum dma_data_direction direction)
5547     {
5548     - if (direction == DMA_NONE)
5549     - BUG();
5550     + BUG_ON(!valid_dma_direction(direction));
5551     if (swiotlb)
5552     swiotlb_unmap_single(dev, dma_addr, size, direction);
5553     else
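Every DMA entry point in pci-dma_32-xen.c trades the open-coded "if (direction == DMA_NONE) BUG();" for BUG_ON(!valid_dma_direction(direction)). That is stricter, not just shorter: the old test let any garbage value through as long as it was not DMA_NONE. The 2.6.19-era helper from <linux/dma-mapping.h> looks roughly like this:

	static inline int valid_dma_direction(int dma_direction)
	{
		/* Only the three real transfer directions pass; DMA_NONE
		 * and out-of-range values now trip the BUG_ON() callers. */
		return dma_direction == DMA_BIDIRECTIONAL ||
		       dma_direction == DMA_TO_DEVICE ||
		       dma_direction == DMA_FROM_DEVICE;
	}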
5554 niro 612 --- a/arch/x86/kernel/pci-swiotlb_64-xen.c
5555     +++ b/arch/x86/kernel/pci-swiotlb_64-xen.c
5556 niro 609 @@ -3,7 +3,8 @@
5557     #include <linux/pci.h>
5558     #include <linux/cache.h>
5559     #include <linux/module.h>
5560     -#include <asm/dma-mapping.h>
5561     +#include <linux/dma-mapping.h>
5562     +
5563     #include <asm/proto.h>
5564     #include <asm/swiotlb.h>
5565     #include <asm/dma.h>
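The swiotlb hunk is pure include hygiene: code outside the arch layer is expected to pull in the generic wrapper header, which includes the arch header itself along with the arch-independent declarations:

	#include <linux/dma-mapping.h>	/* preferred over <asm/dma-mapping.h> */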
5566 niro 612 --- a/arch/x86/kernel/process_32-xen.c
5567     +++ b/arch/x86/kernel/process_32-xen.c
5568 niro 609 @@ -37,6 +37,7 @@
5569     #include <linux/kallsyms.h>
5570     #include <linux/ptrace.h>
5571     #include <linux/random.h>
5572     +#include <linux/personality.h>
5573    
5574     #include <asm/uaccess.h>
5575     #include <asm/pgtable.h>
5576     @@ -186,7 +187,7 @@
5577     void cpu_idle_wait(void)
5578     {
5579     unsigned int cpu, this_cpu = get_cpu();
5580     - cpumask_t map;
5581     + cpumask_t map, tmp = current->cpus_allowed;
5582    
5583     set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
5584     put_cpu();
5585     @@ -208,6 +209,8 @@
5586     }
5587     cpus_and(map, map, cpu_online_map);
5588     } while (!cpus_empty(map));
5589     +
5590     + set_cpus_allowed(current, tmp);
5591     }
5592     EXPORT_SYMBOL_GPL(cpu_idle_wait);
5593    
5594     @@ -240,9 +243,9 @@
5595     if (user_mode_vm(regs))
5596     printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp);
5597     printk(" EFLAGS: %08lx %s (%s %.*s)\n",
5598     - regs->eflags, print_tainted(), system_utsname.release,
5599     - (int)strcspn(system_utsname.version, " "),
5600     - system_utsname.version);
5601     + regs->eflags, print_tainted(), init_utsname()->release,
5602     + (int)strcspn(init_utsname()->version, " "),
5603     + init_utsname()->version);
5604     printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
5605     regs->eax,regs->ebx,regs->ecx,regs->edx);
5606     printk("ESI: %08lx EDI: %08lx EBP: %08lx",
5607     @@ -264,15 +267,6 @@
5608     * the "args".
5609     */
5610     extern void kernel_thread_helper(void);
5611     -__asm__(".section .text\n"
5612     - ".align 4\n"
5613     - "kernel_thread_helper:\n\t"
5614     - "movl %edx,%eax\n\t"
5615     - "pushl %edx\n\t"
5616     - "call *%ebx\n\t"
5617     - "pushl %eax\n\t"
5618     - "call do_exit\n"
5619     - ".previous");
5620    
5621     /*
5622     * Create a kernel thread
5623     @@ -290,7 +284,7 @@
5624     regs.xes = __USER_DS;
5625     regs.orig_eax = -1;
5626     regs.eip = (unsigned long) kernel_thread_helper;
5627     - regs.xcs = GET_KERNEL_CS();
5628     + regs.xcs = __KERNEL_CS | get_kernel_rpl();
5629     regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
5630    
5631     /* Ok, create the new process.. */
5632     @@ -369,13 +363,12 @@
5633    
5634     tsk = current;
5635     if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
5636     - p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
5637     + p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
5638     + IO_BITMAP_BYTES, GFP_KERNEL);
5639     if (!p->thread.io_bitmap_ptr) {
5640     p->thread.io_bitmap_max = 0;
5641     return -ENOMEM;
5642     }
5643     - memcpy(p->thread.io_bitmap_ptr, tsk->thread.io_bitmap_ptr,
5644     - IO_BITMAP_BYTES);
5645     set_tsk_thread_flag(p, TIF_IO_BITMAP);
5646     }
5647    
5648     @@ -850,7 +843,7 @@
5649    
5650     unsigned long arch_align_stack(unsigned long sp)
5651     {
5652     - if (randomize_va_space)
5653     + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
5654     sp -= get_random_int() % 8192;
5655     return sp & ~0xf;
5656     }
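Besides the init_utsname() conversion, three small fixes land here: cpu_idle_wait() now saves and restores current->cpus_allowed instead of leaving the caller pinned to one CPU, arch_align_stack() honours the per-process ADDR_NO_RANDOMIZE personality flag (hence the new <linux/personality.h> include), and the io_bitmap copy collapses a kmalloc()+memcpy() pair into 2.6.19's kmemdup(). The kmemdup() idiom, shown with a hypothetical helper:

	#include <linux/slab.h>
	#include <linux/string.h>

	/* Illustrative only: duplicate a caller-supplied buffer. */
	static void *dup_buffer(const void *src, size_t len)
	{
		/* Allocate-and-copy in one call; returns NULL on failure. */
		return kmemdup(src, len, GFP_KERNEL);
	}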
5657 niro 612 --- a/arch/x86/kernel/process_64-xen.c
5658     +++ b/arch/x86/kernel/process_64-xen.c
5659 niro 609 @@ -89,25 +89,24 @@
5660     }
5661     EXPORT_SYMBOL(idle_notifier_unregister);
5662    
5663     -enum idle_state { CPU_IDLE, CPU_NOT_IDLE };
5664     -static DEFINE_PER_CPU(enum idle_state, idle_state) = CPU_NOT_IDLE;
5665     -
5666     void enter_idle(void)
5667     {
5668     - __get_cpu_var(idle_state) = CPU_IDLE;
5669     + write_pda(isidle, 1);
5670     atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
5671     }
5672    
5673     static void __exit_idle(void)
5674     {
5675     - __get_cpu_var(idle_state) = CPU_NOT_IDLE;
5676     + if (test_and_clear_bit_pda(0, isidle) == 0)
5677     + return;
5678     atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
5679     }
5680    
5681     /* Called from interrupts to signify idle end */
5682     void exit_idle(void)
5683     {
5684     - if (current->pid | read_pda(irqcount))
5685     + /* idle loop has pid 0 */
5686     + if (current->pid)
5687     return;
5688     __exit_idle();
5689     }
5690     @@ -184,6 +183,9 @@
5691     play_dead();
5692     enter_idle();
5693     idle();
5694     + /* In many cases the interrupt that ended idle
5695     + has already called exit_idle. But some idle
5696     + loops can be woken up without interrupt. */
5697     __exit_idle();
5698     }
5699    
5700     @@ -196,7 +198,7 @@
5701     void cpu_idle_wait(void)
5702     {
5703     unsigned int cpu, this_cpu = get_cpu();
5704     - cpumask_t map;
5705     + cpumask_t map, tmp = current->cpus_allowed;
5706    
5707     set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
5708     put_cpu();
5709     @@ -219,6 +221,8 @@
5710     }
5711     cpus_and(map, map, cpu_online_map);
5712     } while (!cpus_empty(map));
5713     +
5714     + set_cpus_allowed(current, tmp);
5715     }
5716     EXPORT_SYMBOL_GPL(cpu_idle_wait);
5717    
5718     @@ -250,9 +254,9 @@
5719     print_modules();
5720     printk("Pid: %d, comm: %.20s %s %s %.*s\n",
5721     current->pid, current->comm, print_tainted(),
5722     - system_utsname.release,
5723     - (int)strcspn(system_utsname.version, " "),
5724     - system_utsname.version);
5725     + init_utsname()->release,
5726     + (int)strcspn(init_utsname()->version, " "),
5727     + init_utsname()->version);
5728     printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
5729     printk_address(regs->rip);
5730     printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
5731     @@ -310,6 +314,7 @@
5732    
5733     kfree(t->io_bitmap_ptr);
5734     t->io_bitmap_ptr = NULL;
5735     + clear_thread_flag(TIF_IO_BITMAP);
5736     /*
5737     * Careful, clear this in the TSS too:
5738     */
5739     @@ -340,6 +345,7 @@
5740     if (t->flags & _TIF_IA32)
5741     current_thread_info()->status |= TS_COMPAT;
5742     }
5743     + t->flags &= ~_TIF_DEBUG;
5744    
5745     tsk->thread.debugreg0 = 0;
5746     tsk->thread.debugreg1 = 0;
5747     @@ -432,7 +438,7 @@
5748     asm("mov %%es,%0" : "=m" (p->thread.es));
5749     asm("mov %%ds,%0" : "=m" (p->thread.ds));
5750    
5751     - if (unlikely(me->thread.io_bitmap_ptr != NULL)) {
5752     + if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
5753     p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
5754     if (!p->thread.io_bitmap_ptr) {
5755     p->thread.io_bitmap_max = 0;
5756     @@ -440,6 +446,7 @@
5757     }
5758     memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
5759     IO_BITMAP_BYTES);
5760     + set_tsk_thread_flag(p, TIF_IO_BITMAP);
5761     }
5762    
5763     /*
5764     @@ -474,6 +481,30 @@
5765     }
5766    
5767     /*
5768     + * This special macro can be used to load a debugging register
5769     + */
5770     +#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r)
5771     +
5772     +static inline void __switch_to_xtra(struct task_struct *prev_p,
5773     + struct task_struct *next_p)
5774     +{
5775     + struct thread_struct *prev, *next;
5776     +
5777     + prev = &prev_p->thread,
5778     + next = &next_p->thread;
5779     +
5780     + if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
5781     + loaddebug(next, 0);
5782     + loaddebug(next, 1);
5783     + loaddebug(next, 2);
5784     + loaddebug(next, 3);
5785     + /* no 4 and 5 */
5786     + loaddebug(next, 6);
5787     + loaddebug(next, 7);
5788     + }
5789     +}
5790     +
5791     +/*
5792     * switch_to(x,y) should switch tasks from x to y.
5793     *
5794     * This could still be optimized:
5795     @@ -495,6 +526,10 @@
5796     struct physdev_set_iobitmap iobmp_op;
5797     multicall_entry_t _mcl[8], *mcl = _mcl;
5798    
5799     + /* we're going to use this soon, after a few expensive things */
5800     + if (next_p->fpu_counter>5)
5801     + prefetch(&next->i387.fxsave);
5802     +
5803     /*
5804     * This is basically '__unlazy_fpu', except that we queue a
5805     * multicall to indicate FPU task switch, rather than
5806     @@ -507,7 +542,8 @@
5807     mcl->op = __HYPERVISOR_fpu_taskswitch;
5808     mcl->args[0] = 1;
5809     mcl++;
5810     - }
5811     + } else
5812     + prev_p->fpu_counter = 0;
5813    
5814     /*
5815     * Reload esp0, LDT and the page table pointer:
5816     @@ -587,21 +623,29 @@
5817     write_pda(oldrsp, next->userrsp);
5818     write_pda(pcurrent, next_p);
5819     write_pda(kernelstack,
5820     - task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
5821     + (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
5822     +#ifdef CONFIG_CC_STACKPROTECTOR
5823     + write_pda(stack_canary, next_p->stack_canary);
5824     +
5825     + /*
5826     + * Build time only check to make sure the stack_canary is at
5827     + * offset 40 in the pda; this is a gcc ABI requirement
5828     + */
5829     + BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
5830     +#endif
5831    
5832     /*
5833     * Now maybe reload the debug registers
5834     */
5835     - if (unlikely(next->debugreg7)) {
5836     - set_debugreg(next->debugreg0, 0);
5837     - set_debugreg(next->debugreg1, 1);
5838     - set_debugreg(next->debugreg2, 2);
5839     - set_debugreg(next->debugreg3, 3);
5840     - /* no 4 and 5 */
5841     - set_debugreg(next->debugreg6, 6);
5842     - set_debugreg(next->debugreg7, 7);
5843     - }
5844     + if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
5845     + __switch_to_xtra(prev_p, next_p);
5846    
5847     + /* If the task has used fpu the last 5 timeslices, just do a full
5848     + * restore of the math state immediately to avoid the trap; the
5849     + * chances of needing FPU soon are obviously high now
5850     + */
5851     + if (next_p->fpu_counter>5)
5852     + math_state_restore();
5853     return prev_p;
5854     }
5855    
5856     @@ -821,7 +865,7 @@
5857    
5858     unsigned long arch_align_stack(unsigned long sp)
5859     {
5860     - if (randomize_va_space)
5861     + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
5862     sp -= get_random_int() % 8192;
5863     return sp & ~0xf;
5864     }
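The process_64 hunks fold the debug-register reloads behind a single _TIF_WORK_CTXSW test in __switch_to_xtra(), move the per-CPU idle flag into the PDA (write_pda(isidle, 1) on entry, an atomic test_and_clear on exit), and add the FPU preload heuristic. fpu_counter is zeroed whenever the outgoing task finishes a timeslice without touching the FPU (the increment lives in the trap path, outside these hunks); once it climbs past 5, the switch-in path restores the math state eagerly rather than waiting for the device-not-available trap. Both ends of the heuristic, as they appear in the hunks above:

	/* Top of __switch_to(): warm the cache if a restore is likely. */
	if (next_p->fpu_counter > 5)
		prefetch(&next->i387.fxsave);

	/* Bottom of __switch_to(): restore outright, skipping the #NM trap. */
	if (next_p->fpu_counter > 5)
		math_state_restore();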
5865 niro 612 --- a/arch/x86/kernel/setup64-xen.c
5866     +++ b/arch/x86/kernel/setup64-xen.c
5867     @@ -31,7 +31,7 @@
5868     #include <asm/hypervisor.h>
5869     #endif
5870    
5871     -char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,};
5872     +char x86_boot_params[BOOT_PARAM_SIZE] __initdata;
5873    
5874     cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
5875    
5876     @@ -55,8 +55,10 @@
5877     on Enable(default)
5878     off Disable
5879     */
5880     -int __init nonx_setup(char *str)
5881     +static int __init nonx_setup(char *str)
5882     {
5883     + if (!str)
5884     + return -EINVAL;
5885     if (!strncmp(str, "on", 2)) {
5886     __supported_pte_mask |= _PAGE_NX;
5887     do_not_nx = 0;
5888     @@ -64,9 +66,9 @@
5889     do_not_nx = 1;
5890     __supported_pte_mask &= ~_PAGE_NX;
5891     }
5892     - return 1;
5893     + return 0;
5894     }
5895     -__setup("noexec=", nonx_setup); /* parsed early actually */
5896     +early_param("noexec", nonx_setup);
5897    
5898     int force_personality32 = 0;
5899    
5900     @@ -102,12 +104,9 @@
5901     #endif
5902    
5903     /* Copy section for each CPU (we discard the original) */
5904     - size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES);
5905     -#ifdef CONFIG_MODULES
5906     - if (size < PERCPU_ENOUGH_ROOM)
5907     - size = PERCPU_ENOUGH_ROOM;
5908     -#endif
5909     + size = PERCPU_ENOUGH_ROOM;
5910    
5911     + printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size);
5912     for_each_cpu_mask (i, cpu_possible_map) {
5913     char *ptr;
5914    
5915     @@ -169,7 +168,10 @@
5916     /* Setup up data that may be needed in __get_free_pages early */
5917     asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
5918     #ifndef CONFIG_XEN
5919     + /* Memory clobbers used to order PDA accessed */
5920     + mb();
5921     wrmsrl(MSR_GS_BASE, pda);
5922     + mb();
5923     #else
5924     if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL,
5925     (unsigned long)pda))
5926     @@ -302,28 +304,17 @@
5927     * set up and load the per-CPU TSS
5928     */
5929     for (v = 0; v < N_EXCEPTION_STACKS; v++) {
5930     + static const unsigned int order[N_EXCEPTION_STACKS] = {
5931     + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
5932     + [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
5933     + };
5934     if (cpu) {
5935     - static const unsigned int order[N_EXCEPTION_STACKS] = {
5936     - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
5937     - [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
5938     - };
5939     -
5940     estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
5941     if (!estacks)
5942     panic("Cannot allocate exception stack %ld %d\n",
5943     v, cpu);
5944     }
5945     - switch (v + 1) {
5946     -#if DEBUG_STKSZ > EXCEPTION_STKSZ
5947     - case DEBUG_STACK:
5948     - cpu_pda(cpu)->debugstack = (unsigned long)estacks;
5949     - estacks += DEBUG_STKSZ;
5950     - break;
5951     -#endif
5952     - default:
5953     - estacks += EXCEPTION_STKSZ;
5954     - break;
5955     - }
5956     + estacks += PAGE_SIZE << order[v];
5957     orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks;
5958     }
5959    
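noexec= is the template for this patch's command-line rework: __setup() handlers become early_param() handlers. The differences matter in practice: the handler runs from parse_early_param(), early enough to influence memory setup; it receives only the text after the '=' and can be handed NULL when the option is given bare (hence the new !str check); and it reports success with 0/-EINVAL instead of __setup()'s 1. The registration pattern, with a hypothetical option name:

	#include <linux/init.h>

	static int __init parse_myopt(char *arg)
	{
		if (!arg)		/* "myopt" passed without "=value" */
			return -EINVAL;
		/* ... parse arg here ... */
		return 0;
	}
	early_param("myopt", parse_myopt);
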
5960     --- a/arch/x86/kernel/setup_32-xen.c
5961     +++ b/arch/x86/kernel/setup_32-xen.c
5962 niro 609 @@ -56,6 +56,7 @@
5963     #include <asm/apic.h>
5964     #include <asm/e820.h>
5965     #include <asm/mpspec.h>
5966     +#include <asm/mmzone.h>
5967     #include <asm/setup.h>
5968     #include <asm/arch_hooks.h>
5969     #include <asm/sections.h>
5970     @@ -105,18 +106,6 @@
5971    
5972     unsigned long mmu_cr4_features;
5973    
5974     -#ifdef CONFIG_ACPI
5975     - int acpi_disabled = 0;
5976     -#else
5977     - int acpi_disabled = 1;
5978     -#endif
5979     -EXPORT_SYMBOL(acpi_disabled);
5980     -
5981     -#ifdef CONFIG_ACPI
5982     -int __initdata acpi_force = 0;
5983     -extern acpi_interrupt_flags acpi_sci_flags;
5984     -#endif
5985     -
5986     /* for MCA, but anyone else can use it if they want */
5987     unsigned int machine_id;
5988     #ifdef CONFIG_MCA
5989     @@ -170,7 +159,6 @@
5990     #endif
5991    
5992     extern void early_cpu_init(void);
5993     -extern void generic_apic_probe(char *);
5994     extern int root_mountflags;
5995    
5996     unsigned long saved_videomode;
5997     @@ -243,9 +231,6 @@
5998     .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
5999     } };
6000    
6001     -#define ADAPTER_ROM_RESOURCES \
6002     - (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0])
6003     -
6004     static struct resource video_rom_resource = {
6005     .name = "Video ROM",
6006     .start = 0xc0000,
6007     @@ -307,9 +292,6 @@
6008     .flags = IORESOURCE_BUSY | IORESOURCE_IO
6009     } };
6010    
6011     -#define STANDARD_IO_RESOURCES \
6012     - (sizeof standard_io_resources / sizeof standard_io_resources[0])
6013     -
6014     #define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
6015    
6016     static int __init romchecksum(unsigned char *rom, unsigned long length)
6017     @@ -372,7 +354,7 @@
6018     }
6019    
6020     /* check for adapter roms on 2k boundaries */
6021     - for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) {
6022     + for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
6023     rom = isa_bus_to_virt(start);
6024     if (!romsignature(rom))
6025     continue;
6026     @@ -764,246 +746,152 @@
6027     }
6028     #endif
6029    
6030     -static void __init parse_cmdline_early (char ** cmdline_p)
6031     +static int __initdata user_defined_memmap = 0;
6032     +
6033     +/*
6034     + * "mem=nopentium" disables the 4MB page tables.
6035     + * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
6036     + * to <mem>, overriding the bios size.
6037     + * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
6038     + * <start> to <start>+<mem>, overriding the bios size.
6039     + *
6040     + * HPA tells me bootloaders need to parse mem=, so no new
6041     + * option should be mem= [also see Documentation/i386/boot.txt]
6042     + */
6043     +static int __init parse_mem(char *arg)
6044     {
6045     - char c = ' ', *to = command_line, *from = saved_command_line;
6046     - int len = 0, max_cmdline;
6047     - int userdef = 0;
6048     -
6049     - if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
6050     - max_cmdline = COMMAND_LINE_SIZE;
6051     - memcpy(saved_command_line, xen_start_info->cmd_line, max_cmdline);
6052     - /* Save unparsed command line copy for /proc/cmdline */
6053     - saved_command_line[max_cmdline-1] = '\0';
6054     -
6055     - for (;;) {
6056     - if (c != ' ')
6057     - goto next_char;
6058     - /*
6059     - * "mem=nopentium" disables the 4MB page tables.
6060     - * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
6061     - * to <mem>, overriding the bios size.
6062     - * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
6063     - * <start> to <start>+<mem>, overriding the bios size.
6064     - *
6065     - * HPA tells me bootloaders need to parse mem=, so no new
6066     - * option should be mem= [also see Documentation/i386/boot.txt]
6067     - */
6068     - if (!memcmp(from, "mem=", 4)) {
6069     - if (to != command_line)
6070     - to--;
6071     - if (!memcmp(from+4, "nopentium", 9)) {
6072     - from += 9+4;
6073     - clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
6074     - disable_pse = 1;
6075     - } else {
6076     - /* If the user specifies memory size, we
6077     - * limit the BIOS-provided memory map to
6078     - * that size. exactmap can be used to specify
6079     - * the exact map. mem=number can be used to
6080     - * trim the existing memory map.
6081     - */
6082     - unsigned long long mem_size;
6083     -
6084     - mem_size = memparse(from+4, &from);
6085     - limit_regions(mem_size);
6086     - userdef=1;
6087     - }
6088     - }
6089     + if (!arg)
6090     + return -EINVAL;
6091    
6092     - else if (!memcmp(from, "memmap=", 7)) {
6093     - if (to != command_line)
6094     - to--;
6095     - if (!memcmp(from+7, "exactmap", 8)) {
6096     -#ifdef CONFIG_CRASH_DUMP
6097     - /* If we are doing a crash dump, we
6098     - * still need to know the real mem
6099     - * size before original memory map is
6100     - * reset.
6101     - */
6102     - find_max_pfn();
6103     - saved_max_pfn = max_pfn;
6104     -#endif
6105     - from += 8+7;
6106     - e820.nr_map = 0;
6107     - userdef = 1;
6108     - } else {
6109     - /* If the user specifies memory size, we
6110     - * limit the BIOS-provided memory map to
6111     - * that size. exactmap can be used to specify
6112     - * the exact map. mem=number can be used to
6113     - * trim the existing memory map.
6114     - */
6115     - unsigned long long start_at, mem_size;
6116     + if (strcmp(arg, "nopentium") == 0) {
6117     + clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
6118     + disable_pse = 1;
6119     + } else {
6120     + /* If the user specifies memory size, we
6121     + * limit the BIOS-provided memory map to
6122     + * that size. exactmap can be used to specify
6123     + * the exact map. mem=number can be used to
6124     + * trim the existing memory map.
6125     + */
6126     + unsigned long long mem_size;
6127    
6128     - mem_size = memparse(from+7, &from);
6129     - if (*from == '@') {
6130     - start_at = memparse(from+1, &from);
6131     - add_memory_region(start_at, mem_size, E820_RAM);
6132     - } else if (*from == '#') {
6133     - start_at = memparse(from+1, &from);
6134     - add_memory_region(start_at, mem_size, E820_ACPI);
6135     - } else if (*from == '$') {
6136     - start_at = memparse(from+1, &from);
6137     - add_memory_region(start_at, mem_size, E820_RESERVED);
6138     - } else {
6139     - limit_regions(mem_size);
6140     - userdef=1;
6141     - }
6142     - }
6143     - }
6144     -
6145     - else if (!memcmp(from, "noexec=", 7))
6146     - noexec_setup(from + 7);
6147     + mem_size = memparse(arg, &arg);
6148     + limit_regions(mem_size);
6149     + user_defined_memmap = 1;
6150     + }
6151     + return 0;
6152     +}
6153     +early_param("mem", parse_mem);
6154    
6155     +static int __init parse_memmap(char *arg)
6156     +{
6157     + if (!arg)
6158     + return -EINVAL;
6159    
6160     -#ifdef CONFIG_X86_MPPARSE
6161     - /*
6162     - * If the BIOS enumerates physical processors before logical,
6163     - * maxcpus=N at enumeration-time can be used to disable HT.
6164     + if (strcmp(arg, "exactmap") == 0) {
6165     +#ifdef CONFIG_CRASH_DUMP
6166     + /* If we are doing a crash dump, we
6167     + * still need to know the real mem
6168     + * size before original memory map is
6169     + * reset.
6170     */
6171     - else if (!memcmp(from, "maxcpus=", 8)) {
6172     - extern unsigned int maxcpus;
6173     -
6174     - maxcpus = simple_strtoul(from + 8, NULL, 0);
6175     - }
6176     + find_max_pfn();
6177     + saved_max_pfn = max_pfn;
6178     #endif
6179     + e820.nr_map = 0;
6180     + user_defined_memmap = 1;
6181     + } else {
6182     + /* If the user specifies memory size, we
6183     + * limit the BIOS-provided memory map to
6184     + * that size. exactmap can be used to specify
6185     + * the exact map. mem=number can be used to
6186     + * trim the existing memory map.
6187     + */
6188     + unsigned long long start_at, mem_size;
6189    
6190     -#ifdef CONFIG_ACPI
6191     - /* "acpi=off" disables both ACPI table parsing and interpreter */
6192     - else if (!memcmp(from, "acpi=off", 8)) {
6193     - disable_acpi();
6194     - }
6195     -
6196     - /* acpi=force to over-ride black-list */
6197     - else if (!memcmp(from, "acpi=force", 10)) {
6198     - acpi_force = 1;
6199     - acpi_ht = 1;
6200     - acpi_disabled = 0;
6201     - }
6202     -
6203     - /* acpi=strict disables out-of-spec workarounds */
6204     - else if (!memcmp(from, "acpi=strict", 11)) {
6205     - acpi_strict = 1;
6206     - }
6207     -
6208     - /* Limit ACPI just to boot-time to enable HT */
6209     - else if (!memcmp(from, "acpi=ht", 7)) {
6210     - if (!acpi_force)
6211     - disable_acpi();
6212     - acpi_ht = 1;
6213     - }
6214     -
6215     - /* "pci=noacpi" disable ACPI IRQ routing and PCI scan */
6216     - else if (!memcmp(from, "pci=noacpi", 10)) {
6217     - acpi_disable_pci();
6218     - }
6219     - /* "acpi=noirq" disables ACPI interrupt routing */
6220     - else if (!memcmp(from, "acpi=noirq", 10)) {
6221     - acpi_noirq_set();
6222     + mem_size = memparse(arg, &arg);
6223     + if (*arg == '@') {
6224     + start_at = memparse(arg+1, &arg);
6225     + add_memory_region(start_at, mem_size, E820_RAM);
6226     + } else if (*arg == '#') {
6227     + start_at = memparse(arg+1, &arg);
6228     + add_memory_region(start_at, mem_size, E820_ACPI);
6229     + } else if (*arg == '$') {
6230     + start_at = memparse(arg+1, &arg);
6231     + add_memory_region(start_at, mem_size, E820_RESERVED);
6232     + } else {
6233     + limit_regions(mem_size);
6234     + user_defined_memmap = 1;
6235     }
6236     + }
6237     + return 0;
6238     +}
6239     +early_param("memmap", parse_memmap);
6240    
6241     - else if (!memcmp(from, "acpi_sci=edge", 13))
6242     - acpi_sci_flags.trigger = 1;
6243     +#ifdef CONFIG_PROC_VMCORE
6244     +/* elfcorehdr= specifies the location of elf core header
6245     + * stored by the crashed kernel.
6246     + */
6247     +static int __init parse_elfcorehdr(char *arg)
6248     +{
6249     + if (!arg)
6250     + return -EINVAL;
6251    
6252 niro 612 - else if (!memcmp(from, "acpi_sci=level", 14))
6253     - acpi_sci_flags.trigger = 3;
6254 niro 609 + elfcorehdr_addr = memparse(arg, &arg);
6255     + return 0;
6256     +}
6257     +early_param("elfcorehdr", parse_elfcorehdr);
6258     +#endif /* CONFIG_PROC_VMCORE */
6259    
6260 niro 612 - else if (!memcmp(from, "acpi_sci=high", 13))
6261     - acpi_sci_flags.polarity = 1;
6262 niro 609 +/*
6263     + * highmem=size forces highmem to be exactly 'size' bytes.
6264     + * This works even on boxes that have no highmem otherwise.
6265     + * This also works to reduce highmem size on bigger boxes.
6266     + */
6267     +static int __init parse_highmem(char *arg)
6268     +{
6269     + if (!arg)
6270     + return -EINVAL;
6271    
6272 niro 612 - else if (!memcmp(from, "acpi_sci=low", 12))
6273     - acpi_sci_flags.polarity = 3;
6274 niro 609 + highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
6275     + return 0;
6276     +}
6277     +early_param("highmem", parse_highmem);
6278    
6279 niro 612 -#ifdef CONFIG_X86_IO_APIC
6280     - else if (!memcmp(from, "acpi_skip_timer_override", 24))
6281     - acpi_skip_timer_override = 1;
6282     +/*
6283     + * vmalloc=size forces the vmalloc area to be exactly 'size'
6284     + * bytes. This can be used to increase (or decrease) the
6285     + * vmalloc area - the default is 128m.
6286     + */
6287     +static int __init parse_vmalloc(char *arg)
6288     +{
6289     + if (!arg)
6290     + return -EINVAL;
6291    
6292 niro 611 - if (!memcmp(from, "disable_timer_pin_1", 19))
6293     - disable_timer_pin_1 = 1;
6294     - if (!memcmp(from, "enable_timer_pin_1", 18))
6295     - disable_timer_pin_1 = -1;
6296     -
6297     - /* disable IO-APIC */
6298     - else if (!memcmp(from, "noapic", 6))
6299     - disable_ioapic_setup();
6300     -#endif /* CONFIG_X86_IO_APIC */
6301     -#endif /* CONFIG_ACPI */
6302 niro 612 -
6303 niro 609 -#ifdef CONFIG_X86_LOCAL_APIC
6304     - /* enable local APIC */
6305     - else if (!memcmp(from, "lapic", 5))
6306     - lapic_enable();
6307     -
6308     - /* disable local APIC */
6309     - else if (!memcmp(from, "nolapic", 6))
6310     - lapic_disable();
6311     -#endif /* CONFIG_X86_LOCAL_APIC */
6312     + __VMALLOC_RESERVE = memparse(arg, &arg);
6313     + return 0;
6314     +}
6315     +early_param("vmalloc", parse_vmalloc);
6316    
6317     -#ifdef CONFIG_KEXEC
6318     - /* crashkernel=size@addr specifies the location to reserve for
6319     - * a crash kernel. By reserving this memory we guarantee
6320     - * that linux never set's it up as a DMA target.
6321     - * Useful for holding code to do something appropriate
6322     - * after a kernel panic.
6323     - */
6324     - else if (!memcmp(from, "crashkernel=", 12)) {
6325     #ifndef CONFIG_XEN
6326     - unsigned long size, base;
6327     - size = memparse(from+12, &from);
6328     - if (*from == '@') {
6329     - base = memparse(from+1, &from);
6330     - /* FIXME: Do I want a sanity check
6331     - * to validate the memory range?
6332     - */
6333     - crashk_res.start = base;
6334     - crashk_res.end = base + size - 1;
6335     - }
6336     -#else
6337     - printk("Ignoring crashkernel command line, "
6338     - "parameter will be supplied by xen\n");
6339     -#endif
6340     - }
6341     -#endif
6342     -#ifdef CONFIG_PROC_VMCORE
6343     - /* elfcorehdr= specifies the location of elf core header
6344     - * stored by the crashed kernel.
6345     - */
6346     - else if (!memcmp(from, "elfcorehdr=", 11))
6347     - elfcorehdr_addr = memparse(from+11, &from);
6348     -#endif
6349     +/*
6350     + * reservetop=size reserves a hole at the top of the kernel address space which
6351     + * a hypervisor can load into later. Needed for dynamically loaded hypervisors,
6352     + * so relocating the fixmap can be done before paging initialization.
6353     + */
6354     +static int __init parse_reservetop(char *arg)
6355     +{
6356     + unsigned long address;
6357    
6358     - /*
6359     - * highmem=size forces highmem to be exactly 'size' bytes.
6360     - * This works even on boxes that have no highmem otherwise.
6361     - * This also works to reduce highmem size on bigger boxes.
6362     - */
6363     - else if (!memcmp(from, "highmem=", 8))
6364     - highmem_pages = memparse(from+8, &from) >> PAGE_SHIFT;
6365     -
6366     - /*
6367     - * vmalloc=size forces the vmalloc area to be exactly 'size'
6368     - * bytes. This can be used to increase (or decrease) the
6369     - * vmalloc area - the default is 128m.
6370     - */
6371     - else if (!memcmp(from, "vmalloc=", 8))
6372     - __VMALLOC_RESERVE = memparse(from+8, &from);
6373     + if (!arg)
6374     + return -EINVAL;
6375    
6376     - next_char:
6377     - c = *(from++);
6378     - if (!c)
6379     - break;
6380     - if (COMMAND_LINE_SIZE <= ++len)
6381     - break;
6382     - *(to++) = c;
6383     - }
6384     - *to = '\0';
6385     - *cmdline_p = command_line;
6386     - if (userdef) {
6387     - printk(KERN_INFO "user-defined physical RAM map:\n");
6388     - print_memory_map("user");
6389     - }
6390     + address = memparse(arg, &arg);
6391     + reserve_top_address(address);
6392     + return 0;
6393     }
6394     +early_param("reservetop", parse_reservetop);
6395     +#endif
6396    
6397     /*
6398     * Callback for efi_memory_walk.
6399     @@ -1024,7 +912,7 @@
6400     static int __init
6401     efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
6402     {
6403     - memory_present(0, start, end);
6404     + memory_present(0, PFN_UP(start), PFN_DOWN(end));
6405     return 0;
6406     }
6407    
6408     @@ -1291,6 +1179,14 @@
6409     }
6410     printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
6411     pages_to_mb(highend_pfn - highstart_pfn));
6412     + num_physpages = highend_pfn;
6413     + high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
6414     +#else
6415     + num_physpages = max_low_pfn;
6416     + high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
6417     +#endif
6418     +#ifdef CONFIG_FLATMEM
6419     + max_mapnr = num_physpages;
6420     #endif
6421     printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
6422     pages_to_mb(max_low_pfn));
6423     @@ -1302,22 +1198,19 @@
6424    
6425     void __init zone_sizes_init(void)
6426     {
6427     - unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
6428     - unsigned int max_dma, low;
6429     -
6430     - max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
6431     - low = max_low_pfn;
6432     -
6433     - if (low < max_dma)
6434     - zones_size[ZONE_DMA] = low;
6435     - else {
6436     - zones_size[ZONE_DMA] = max_dma;
6437     - zones_size[ZONE_NORMAL] = low - max_dma;
6438     + unsigned long max_zone_pfns[MAX_NR_ZONES];
6439     + memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
6440     + max_zone_pfns[ZONE_DMA] =
6441     + virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
6442     + max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
6443     #ifdef CONFIG_HIGHMEM
6444     - zones_size[ZONE_HIGHMEM] = highend_pfn - low;
6445     + max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
6446     + add_active_range(0, 0, highend_pfn);
6447     +#else
6448     + add_active_range(0, 0, max_low_pfn);
6449     #endif
6450     - }
6451     - free_area_init(zones_size);
6452     +
6453     + free_area_init_nodes(max_zone_pfns);
6454     }
6455     #else
6456     extern unsigned long __init setup_memory(void);
6457     @@ -1374,6 +1267,7 @@
6458     */
6459     acpi_reserve_bootmem();
6460     #endif
6461     + numa_kva_reserve();
6462     #endif /* !CONFIG_XEN */
6463    
6464     #ifdef CONFIG_BLK_DEV_INITRD
6465     @@ -1559,7 +1453,7 @@
6466     request_resource(&iomem_resource, &video_ram_resource);
6467    
6468     /* request I/O space for devices used on all i[345]86 PCs */
6469     - for (i = 0; i < STANDARD_IO_RESOURCES; i++)
6470     + for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
6471     request_resource(&ioport_resource, &standard_io_resources[i]);
6472     return 0;
6473     }
6474     @@ -1700,17 +1594,19 @@
6475     data_resource.start = virt_to_phys(_etext);
6476     data_resource.end = virt_to_phys(_edata)-1;
6477    
6478     - parse_cmdline_early(cmdline_p);
6479     + if ((i = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
6480     + i = COMMAND_LINE_SIZE;
6481     + memcpy(saved_command_line, xen_start_info->cmd_line, i);
6482     + saved_command_line[i - 1] = '\0';
6483     + parse_early_param();
6484    
6485     -#ifdef CONFIG_EARLY_PRINTK
6486     - {
6487     - char *s = strstr(*cmdline_p, "earlyprintk=");
6488     - if (s) {
6489     - setup_early_printk(strchr(s, '=') + 1);
6490     - printk("early console enabled\n");
6491     - }
6492     + if (user_defined_memmap) {
6493     + printk(KERN_INFO "user-defined physical RAM map:\n");
6494     + print_memory_map("user");
6495     }
6496     -#endif
6497     +
6498     + strlcpy(command_line, saved_command_line, COMMAND_LINE_SIZE);
6499     + *cmdline_p = command_line;
6500    
6501     max_low_pfn = setup_memory();
6502    
6503     @@ -1817,7 +1713,7 @@
6504     dmi_scan_machine();
6505    
6506     #ifdef CONFIG_X86_GENERICARCH
6507     - generic_apic_probe(*cmdline_p);
6508     + generic_apic_probe();
6509     #endif
6510     if (efi_enabled)
6511     efi_map_memmap();
6512     @@ -1838,9 +1734,11 @@
6513     acpi_boot_table_init();
6514     #endif
6515    
6516     +#ifdef CONFIG_PCI
6517     #ifdef CONFIG_X86_IO_APIC
6518     check_acpi_pci(); /* Checks more than just ACPI actually */
6519     #endif
6520     +#endif
6521    
6522     #ifdef CONFIG_ACPI
6523     acpi_boot_init();
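With the hand-rolled scanner in parse_cmdline_early() gone, each option becomes an independent early_param() handler built on memparse(), which reads a number with an optional K/M/G suffix and advances the cursor — that is what makes the compact memmap=size@start grammar workable. A reduced sketch of the parsing step, reusing this file's add_memory_region()/limit_regions() helpers:

	unsigned long long start_at, mem_size;

	mem_size = memparse(arg, &arg);		/* "64M@1G": mem_size = 64M, arg at '@' */
	if (*arg == '@') {
		start_at = memparse(arg + 1, &arg);
		add_memory_region(start_at, mem_size, E820_RAM);
	} else {
		limit_regions(mem_size);	/* bare size: clip the BIOS map */
	}

The companion zone_sizes_init() rewrite is part of the same 2.6.19 cleanup: instead of computing zone sizes by hand, it records the node's PFN span with add_active_range() and hands per-zone limits to free_area_init_nodes().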
6524 niro 612 --- a/arch/x86/kernel/setup_64-xen.c
6525     +++ b/arch/x86/kernel/setup_64-xen.c
6526 niro 609 @@ -118,16 +118,6 @@
6527    
6528     unsigned long mmu_cr4_features;
6529    
6530     -int acpi_disabled;
6531     -EXPORT_SYMBOL(acpi_disabled);
6532     -#ifdef CONFIG_ACPI
6533     -extern int __initdata acpi_ht;
6534     -extern acpi_interrupt_flags acpi_sci_flags;
6535     -int __initdata acpi_force = 0;
6536     -#endif
6537     -
6538     -int acpi_numa __initdata;
6539     -
6540     /* Boot loader ID as an integer, for the benefit of proc_dointvec */
6541     int bootloader_type;
6542    
6543     @@ -151,10 +141,6 @@
6544    
6545     struct edid_info edid_info;
6546     EXPORT_SYMBOL_GPL(edid_info);
6547     -struct e820map e820;
6548     -#ifdef CONFIG_XEN
6549     -struct e820map machine_e820;
6550     -#endif
6551    
6552     extern int root_mountflags;
6553    
6554     @@ -181,9 +167,6 @@
6555     .flags = IORESOURCE_BUSY | IORESOURCE_IO }
6556     };
6557    
6558     -#define STANDARD_IO_RESOURCES \
6559     - (sizeof standard_io_resources / sizeof standard_io_resources[0])
6560     -
6561     #define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
6562    
6563     struct resource data_resource = {
6564     @@ -230,9 +213,6 @@
6565     .flags = IORESOURCE_ROM }
6566     };
6567    
6568     -#define ADAPTER_ROM_RESOURCES \
6569     - (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0])
6570     -
6571     static struct resource video_rom_resource = {
6572     .name = "Video ROM",
6573     .start = 0xc0000,
6574     @@ -309,7 +289,8 @@
6575     }
6576    
6577     /* check for adapter roms on 2k boundaries */
6578     - for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) {
6579     + for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper;
6580     + start += 2048) {
6581     rom = isa_bus_to_virt(start);
6582     if (!romsignature(rom))
6583     continue;
6584     @@ -329,187 +310,22 @@
6585     }
6586     }
6587    
6588     -/* Check for full argument with no trailing characters */
6589     -static int fullarg(char *p, char *arg)
6590     +#ifdef CONFIG_PROC_VMCORE
6591     +/* elfcorehdr= specifies the location of elf core header
6592     + * stored by the crashed kernel. This option will be passed
6593     + * by kexec loader to the capture kernel.
6594     + */
6595     +static int __init setup_elfcorehdr(char *arg)
6596     {
6597     - int l = strlen(arg);
6598     - return !memcmp(p, arg, l) && (p[l] == 0 || isspace(p[l]));
6599     + char *end;
6600     + if (!arg)
6601     + return -EINVAL;
6602     + elfcorehdr_addr = memparse(arg, &end);
6603     + return end > arg ? 0 : -EINVAL;
6604     }
6605     -
6606     -static __init void parse_cmdline_early (char ** cmdline_p)
6607     -{
6608     - char c = ' ', *to = command_line, *from = COMMAND_LINE;
6609     - int len = 0;
6610     - int userdef = 0;
6611     -
6612     - for (;;) {
6613     - if (c != ' ')
6614     - goto next_char;
6615     -
6616     -#ifdef CONFIG_SMP
6617     - /*
6618     - * If the BIOS enumerates physical processors before logical,
6619     - * maxcpus=N at enumeration-time can be used to disable HT.
6620     - */
6621     - else if (!memcmp(from, "maxcpus=", 8)) {
6622     - extern unsigned int maxcpus;
6623     -
6624     - maxcpus = simple_strtoul(from + 8, NULL, 0);
6625     - }
6626     -#endif
6627     -#ifdef CONFIG_ACPI
6628     - /* "acpi=off" disables both ACPI table parsing and interpreter init */
6629     - if (fullarg(from,"acpi=off"))
6630     - disable_acpi();
6631     -
6632     - if (fullarg(from, "acpi=force")) {
6633     - /* add later when we do DMI horrors: */
6634     - acpi_force = 1;
6635     - acpi_disabled = 0;
6636     - }
6637     -
6638     - /* acpi=ht just means: do ACPI MADT parsing
6639     - at bootup, but don't enable the full ACPI interpreter */
6640     - if (fullarg(from, "acpi=ht")) {
6641     - if (!acpi_force)
6642     - disable_acpi();
6643     - acpi_ht = 1;
6644     - }
6645     - else if (fullarg(from, "pci=noacpi"))
6646     - acpi_disable_pci();
6647     - else if (fullarg(from, "acpi=noirq"))
6648     - acpi_noirq_set();
6649     -
6650     - else if (fullarg(from, "acpi_sci=edge"))
6651     - acpi_sci_flags.trigger = 1;
6652     - else if (fullarg(from, "acpi_sci=level"))
6653     - acpi_sci_flags.trigger = 3;
6654     - else if (fullarg(from, "acpi_sci=high"))
6655     - acpi_sci_flags.polarity = 1;
6656     - else if (fullarg(from, "acpi_sci=low"))
6657     - acpi_sci_flags.polarity = 3;
6658     -
6659     - /* acpi=strict disables out-of-spec workarounds */
6660     - else if (fullarg(from, "acpi=strict")) {
6661     - acpi_strict = 1;
6662     - }
6663     -#ifdef CONFIG_X86_IO_APIC
6664     - else if (fullarg(from, "acpi_skip_timer_override"))
6665     - acpi_skip_timer_override = 1;
6666     -#endif
6667     -#endif
6668     -
6669     -#ifndef CONFIG_XEN
6670     - if (fullarg(from, "nolapic") || fullarg(from, "disableapic")) {
6671     - clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
6672     - disable_apic = 1;
6673     - }
6674     -
6675     - if (fullarg(from, "noapic"))
6676     - skip_ioapic_setup = 1;
6677     -
6678     - if (fullarg(from,"apic")) {
6679     - skip_ioapic_setup = 0;
6680     - ioapic_force = 1;
6681     - }
6682     -#endif
6683     -
6684     - if (!memcmp(from, "mem=", 4))
6685     - parse_memopt(from+4, &from);
6686     -
6687     - if (!memcmp(from, "memmap=", 7)) {
6688     - /* exactmap option is for used defined memory */
6689     - if (!memcmp(from+7, "exactmap", 8)) {
6690     -#ifdef CONFIG_CRASH_DUMP
6691     - /* If we are doing a crash dump, we
6692     - * still need to know the real mem
6693     - * size before original memory map is
6694     - * reset.
6695     - */
6696     - saved_max_pfn = e820_end_of_ram();
6697     -#endif
6698     - from += 8+7;
6699     - end_pfn_map = 0;
6700     - e820.nr_map = 0;
6701     - userdef = 1;
6702     - }
6703     - else {
6704     - parse_memmapopt(from+7, &from);
6705     - userdef = 1;
6706     - }
6707     - }
6708     -
6709     -#ifdef CONFIG_NUMA
6710     - if (!memcmp(from, "numa=", 5))
6711     - numa_setup(from+5);
6712     +early_param("elfcorehdr", setup_elfcorehdr);
6713     #endif
6714    
6715     - if (!memcmp(from,"iommu=",6)) {
6716     - iommu_setup(from+6);
6717     - }
6718     -
6719     - if (fullarg(from,"oops=panic"))
6720     - panic_on_oops = 1;
6721     -
6722     - if (!memcmp(from, "noexec=", 7))
6723     - nonx_setup(from + 7);
6724     -
6725     -#ifdef CONFIG_KEXEC
6726     - /* crashkernel=size@addr specifies the location to reserve for
6727     - * a crash kernel. By reserving this memory we guarantee
6728     - * that linux never set's it up as a DMA target.
6729     - * Useful for holding code to do something appropriate
6730     - * after a kernel panic.
6731     - */
6732     - else if (!memcmp(from, "crashkernel=", 12)) {
6733     -#ifndef CONFIG_XEN
6734     - unsigned long size, base;
6735     - size = memparse(from+12, &from);
6736     - if (*from == '@') {
6737     - base = memparse(from+1, &from);
6738     - /* FIXME: Do I want a sanity check
6739     - * to validate the memory range?
6740     - */
6741     - crashk_res.start = base;
6742     - crashk_res.end = base + size - 1;
6743     - }
6744     -#else
6745     - printk("Ignoring crashkernel command line, "
6746     - "parameter will be supplied by xen\n");
6747     -#endif
6748     - }
6749     -#endif
6750     -
6751     -#ifdef CONFIG_PROC_VMCORE
6752     - /* elfcorehdr= specifies the location of elf core header
6753     - * stored by the crashed kernel. This option will be passed
6754     - * by kexec loader to the capture kernel.
6755     - */
6756     - else if(!memcmp(from, "elfcorehdr=", 11))
6757     - elfcorehdr_addr = memparse(from+11, &from);
6758     -#endif
6759     -
6760     -#if defined(CONFIG_HOTPLUG_CPU) && !defined(CONFIG_XEN)
6761     - else if (!memcmp(from, "additional_cpus=", 16))
6762     - setup_additional_cpus(from+16);
6763     -#endif
6764     -
6765     - next_char:
6766     - c = *(from++);
6767     - if (!c)
6768     - break;
6769     - if (COMMAND_LINE_SIZE <= ++len)
6770     - break;
6771     - *(to++) = c;
6772     - }
6773     - if (userdef) {
6774     - printk(KERN_INFO "user-defined physical RAM map:\n");
6775     - e820_print_map("user");
6776     - }
6777     - *to = '\0';
6778     - *cmdline_p = command_line;
6779     -}
6780     -
6781     #ifndef CONFIG_NUMA
6782     static void __init
6783     contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
6784     @@ -521,10 +337,11 @@
6785     if (bootmap == -1L)
6786     panic("Cannot find bootmem map of size %ld\n",bootmap_size);
6787     bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
6788     + e820_register_active_regions(0, start_pfn, end_pfn);
6789     #ifdef CONFIG_XEN
6790     - e820_bootmem_free(NODE_DATA(0), 0, xen_start_info->nr_pages<<PAGE_SHIFT);
6791     + free_bootmem_with_active_regions(0, xen_start_info->nr_pages);
6792     #else
6793     - e820_bootmem_free(NODE_DATA(0), 0, end_pfn << PAGE_SHIFT);
6794     + free_bootmem_with_active_regions(0, end_pfn);
6795     #endif
6796     reserve_bootmem(bootmap, bootmap_size);
6797     }
6798     @@ -587,6 +404,10 @@
6799     void __init setup_arch(char **cmdline_p)
6800     {
6801     #ifdef CONFIG_XEN
6802     + extern struct e820map machine_e820;
6803     +
6804     + printk(KERN_INFO "Command line: %s\n", saved_command_line);
6805     +
6806     /* Register a call for panic conditions. */
6807     atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
6808    
6809     @@ -612,6 +433,8 @@
6810    
6811     ARCH_SETUP
6812     #else
6813     + printk(KERN_INFO "Command line: %s\n", saved_command_line);
6814     +
6815     ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
6816     screen_info = SCREEN_INFO;
6817     edid_info = EDID_INFO;
6818     @@ -639,16 +462,22 @@
6819     data_resource.start = virt_to_phys(&_etext);
6820     data_resource.end = virt_to_phys(&_edata)-1;
6821    
6822     - parse_cmdline_early(cmdline_p);
6823     -
6824     early_identify_cpu(&boot_cpu_data);
6825    
6826     + strlcpy(command_line, saved_command_line, COMMAND_LINE_SIZE);
6827     + *cmdline_p = command_line;
6828     +
6829     + parse_early_param();
6830     +
6831     + finish_e820_parsing();
6832     +
6833     + e820_register_active_regions(0, 0, -1UL);
6834     /*
6835     * partially used pages are not usable - thus
6836     * we are rounding upwards:
6837     */
6838     end_pfn = e820_end_of_ram();
6839     - num_physpages = end_pfn; /* for pfn_valid */
6840     + num_physpages = end_pfn;
6841    
6842     check_efer();
6843    
6844     @@ -659,6 +488,14 @@
6845     if (is_initial_xendomain())
6846     dmi_scan_machine();
6847    
6848     + /* How many end-of-memory variables you have, grandma! */
6849     + max_low_pfn = end_pfn;
6850     + max_pfn = end_pfn;
6851     + high_memory = (void *)__va(end_pfn * PAGE_SIZE - 1) + 1;
6852     +
6853     + /* Remove active ranges so rediscovery with NUMA-awareness happens */
6854     + remove_all_active_ranges();
6855     +
6856     #ifdef CONFIG_ACPI_NUMA
6857     /*
6858     * Parse SRAT to discover nodes.
6859     @@ -848,16 +685,16 @@
6860     BUG();
6861     }
6862    
6863     +#ifdef CONFIG_ACPI
6864     if (!is_initial_xendomain()) {
6865     acpi_disabled = 1;
6866     -#ifdef CONFIG_ACPI
6867     acpi_ht = 0;
6868     -#endif
6869     }
6870     #endif
6871     +#endif
6872    
6873     -#ifndef CONFIG_XEN
6874     - check_ioapic();
6875     +#if defined(CONFIG_PCI) && !defined(CONFIG_XEN)
6876     + early_quirks();
6877     #endif
6878    
6879     zap_low_mappings(0);
6880     @@ -917,6 +754,7 @@
6881     }
6882     #else
6883     e820_reserve_resources(e820.map, e820.nr_map);
6884     + e820_mark_nosave_regions();
6885     #endif
6886    
6887     request_resource(&iomem_resource, &video_ram_resource);
6888     @@ -924,7 +762,7 @@
6889     {
6890     unsigned i;
6891     /* request I/O space for devices used on all i[345]86 PCs */
6892     - for (i = 0; i < STANDARD_IO_RESOURCES; i++)
6893     + for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
6894     request_resource(&ioport_resource, &standard_io_resources[i]);
6895     }
6896    
6897     @@ -1108,7 +946,7 @@
6898     #endif
6899     }
6900    
6901     -static void __init init_amd(struct cpuinfo_x86 *c)
6902     +static void __cpuinit init_amd(struct cpuinfo_x86 *c)
6903     {
6904     unsigned level;
6905    
6906     @@ -1164,6 +1002,12 @@
6907    
6908     /* Fix cpuid4 emulation for more */
6909     num_cache_leaves = 3;
6910     +
6911     + /* When there is only one core there is no need to synchronize RDTSC */
6912     + if (num_possible_cpus() == 1)
6913     + set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
6914     + else
6915     + clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
6916     }
6917    
6918     static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
6919     @@ -1245,8 +1089,7 @@
6920     node = first_node(node_online_map);
6921     numa_set_node(cpu, node);
6922    
6923     - if (acpi_numa > 0)
6924     - printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
6925     + printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
6926     #endif
6927     }
6928    
6929     @@ -1280,6 +1123,8 @@
6930     if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
6931     (c->x86 == 0x6 && c->x86_model >= 0x0e))
6932     set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
6933     + if (c->x86 == 6)
6934     + set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
6935     set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
6936     c->x86_max_cores = intel_num_cpu_cores(c);
6937    
6938     @@ -1498,8 +1343,8 @@
6939    
6940     /* Intel-defined (#2) */
6941     "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
6942     - "tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL,
6943     - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
6944     + "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
6945     + NULL, NULL, "dca", NULL, NULL, NULL, NULL, NULL,
6946     NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
6947    
6948     /* VIA/Cyrix/Centaur-defined */
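
The setup_64 hunks above retire the hand-rolled parse_cmdline_early() scanner in favour of per-option early_param() handlers registered next to the code they tune, with parse_early_param() invoked from setup_arch(). A minimal sketch of the pattern, using a hypothetical "demo" option; early_param() and memparse() are the real interfaces, everything else is illustrative:

    #include <linux/init.h>
    #include <linux/kernel.h>               /* memparse() */

    static unsigned long demo_size;         /* hypothetical tunable */

    /* Early handlers receive the raw value string (possibly NULL) and
     * return 0 on success or -EINVAL on malformed input. */
    static int __init setup_demo(char *arg)
    {
            char *end;

            if (!arg)
                    return -EINVAL;
            demo_size = memparse(arg, &end);        /* "16M" -> 16 << 20 */
            return end > arg ? 0 : -EINVAL;
    }
    early_param("demo", setup_demo);

Booting with demo=16M would then populate demo_size before setup_arch() reaches finish_e820_parsing().
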
6949 niro 612 --- a/arch/x86/kernel/smp_32-xen.c
6950     +++ b/arch/x86/kernel/smp_32-xen.c
6951 niro 609 @@ -279,8 +279,7 @@
6952     * 2) Leave the mm if we are in the lazy tlb mode.
6953     */
6954    
6955     -irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id,
6956     - struct pt_regs *regs)
6957     +irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id)
6958     {
6959     unsigned long cpu;
6960    
6961     @@ -567,16 +566,14 @@
6962     * all the work is done automatically when
6963     * we return from the interrupt.
6964     */
6965     -irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id,
6966     - struct pt_regs *regs)
6967     +irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id)
6968     {
6969    
6970     return IRQ_HANDLED;
6971     }
6972    
6973     #include <linux/kallsyms.h>
6974     -irqreturn_t smp_call_function_interrupt(int irq, void *dev_id,
6975     - struct pt_regs *regs)
6976     +irqreturn_t smp_call_function_interrupt(int irq, void *dev_id)
6977     {
6978     void (*func) (void *info) = call_data->func;
6979     void *info = call_data->info;
6980     @@ -603,3 +600,69 @@
6981     return IRQ_HANDLED;
6982     }
6983    
6984     +/*
6985     + * this function sends a 'generic call function' IPI to one other CPU
6986     + * in the system.
6987     + *
6988     + * cpu is a standard Linux logical CPU number.
6989     + */
6990     +static void
6991     +__smp_call_function_single(int cpu, void (*func) (void *info), void *info,
6992     + int nonatomic, int wait)
6993     +{
6994     + struct call_data_struct data;
6995     + int cpus = 1;
6996     +
6997     + data.func = func;
6998     + data.info = info;
6999     + atomic_set(&data.started, 0);
7000     + data.wait = wait;
7001     + if (wait)
7002     + atomic_set(&data.finished, 0);
7003     +
7004     + call_data = &data;
7005     + wmb();
7006     + /* Send a message to the target CPU and wait for it to respond */
7007     + send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR);
7008     +
7009     + /* Wait for response */
7010     + while (atomic_read(&data.started) != cpus)
7011     + cpu_relax();
7012     +
7013     + if (!wait)
7014     + return;
7015     +
7016     + while (atomic_read(&data.finished) != cpus)
7017     + cpu_relax();
7018     +}
7019     +
7020     +/*
7021     + * smp_call_function_single - Run a function on another CPU
7022     + * @func: The function to run. This must be fast and non-blocking.
7023     + * @info: An arbitrary pointer to pass to the function.
7024     + * @nonatomic: Currently unused.
7025     + * @wait: If true, wait until function has completed on the other CPU.
7026     + *
7027     + * Returns 0 on success, else a negative status code.
7028     + *
7029     + * Does not return until the remote CPU is nearly ready to execute <func>,
7030     + * or has already executed it.
7031     + */
7032     +
7033     +int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
7034     + int nonatomic, int wait)
7035     +{
7036     + /* prevent preemption and reschedule on another processor */
7037     + int me = get_cpu();
7038     + if (cpu == me) {
7039     + WARN_ON(1);
7040     + put_cpu();
7041     + return -EBUSY;
7042     + }
7043     + spin_lock_bh(&call_lock);
7044     + __smp_call_function_single(cpu, func, info, nonatomic, wait);
7045     + spin_unlock_bh(&call_lock);
7046     + put_cpu();
7047     + return 0;
7048     +}
7049     +EXPORT_SYMBOL(smp_call_function_single);
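
The smp_call_function_single() added above deliberately rejects the calling CPU with WARN_ON() and -EBUSY. A hedged usage sketch; demo_flush() and demo_kick() are hypothetical names:

    #include <linux/smp.h>

    static void demo_flush(void *info)
    {
            /* Runs from the IPI handler on the target CPU: must be
             * fast and must not sleep. */
    }

    static int demo_kick(int target_cpu)
    {
            /* nonatomic is currently unused; wait=1 blocks until
             * demo_flush() has finished remotely. Returns -EBUSY if
             * target_cpu is the CPU we are running on. */
            return smp_call_function_single(target_cpu, demo_flush,
                                            NULL, 0, 1);
    }
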
7050 niro 612 --- a/arch/x86/kernel/smp_64-xen.c
7051     +++ b/arch/x86/kernel/smp_64-xen.c
7052 niro 609 @@ -381,9 +381,8 @@
7053     /* prevent preemption and reschedule on another processor */
7054     int me = get_cpu();
7055     if (cpu == me) {
7056     - WARN_ON(1);
7057     put_cpu();
7058     - return -EBUSY;
7059     + return 0;
7060     }
7061     spin_lock_bh(&call_lock);
7062     __smp_call_function_single(cpu, func, info, nonatomic, wait);
7063     @@ -501,7 +500,7 @@
7064     #ifndef CONFIG_XEN
7065     asmlinkage void smp_reschedule_interrupt(void)
7066     #else
7067     -asmlinkage irqreturn_t smp_reschedule_interrupt(void)
7068     +asmlinkage irqreturn_t smp_reschedule_interrupt(int irq, void *ctx)
7069     #endif
7070     {
7071     #ifndef CONFIG_XEN
7072     @@ -514,7 +513,7 @@
7073     #ifndef CONFIG_XEN
7074     asmlinkage void smp_call_function_interrupt(void)
7075     #else
7076     -asmlinkage irqreturn_t smp_call_function_interrupt(void)
7077     +asmlinkage irqreturn_t smp_call_function_interrupt(int irq, void *ctx)
7078     #endif
7079     {
7080     void (*func) (void *info) = call_data->func;
7081     @@ -545,31 +544,3 @@
7082     return IRQ_HANDLED;
7083     #endif
7084     }
7085     -
7086     -int safe_smp_processor_id(void)
7087     -{
7088     -#ifdef CONFIG_XEN
7089     - return smp_processor_id();
7090     -#else
7091     - unsigned apicid, i;
7092     -
7093     - if (disable_apic)
7094     - return 0;
7095     -
7096     - apicid = hard_smp_processor_id();
7097     - if (apicid < NR_CPUS && x86_cpu_to_apicid[apicid] == apicid)
7098     - return apicid;
7099     -
7100     - for (i = 0; i < NR_CPUS; ++i) {
7101     - if (x86_cpu_to_apicid[i] == apicid)
7102     - return i;
7103     - }
7104     -
7105     - /* No entries in x86_cpu_to_apicid? Either no MPS|ACPI,
7106     - * or called too early. Either way, we must be CPU 0. */
7107     - if (x86_cpu_to_apicid[0] == BAD_APICID)
7108     - return 0;
7109     -
7110     - return 0; /* Should not happen */
7111     -#endif
7112     -}
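
Note the asymmetry this hunk introduces: the 64-bit variant now returns 0 for cpu == smp_processor_id() without invoking func, while the 32-bit version above keeps returning -EBUSY. A sketch of a caller that stays portable by handling the self case explicitly; run_on() is hypothetical:

    #include <linux/smp.h>

    static int run_on(int cpu, void (*func)(void *info), void *info)
    {
            int me = get_cpu();     /* block preemption while we decide */
            int ret = 0;

            if (cpu == me)
                    func(info);     /* run locally, no IPI round-trip */
            else
                    ret = smp_call_function_single(cpu, func, info, 0, 1);
            put_cpu();
            return ret;
    }
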
7113 niro 612 --- a/arch/x86/kernel/time_32-xen.c
7114     +++ b/arch/x86/kernel/time_32-xen.c
7115 niro 609 @@ -89,7 +89,6 @@
7116     unsigned long vxtime_hz = PIT_TICK_RATE;
7117     struct vxtime_data __vxtime __section_vxtime; /* for vsyscalls */
7118     volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
7119     -unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES;
7120     struct timespec __xtime __section_xtime;
7121     struct timezone __sys_tz __section_sys_tz;
7122     #endif
7123     @@ -97,8 +96,6 @@
7124     unsigned int cpu_khz; /* Detected as we calibrate the TSC */
7125     EXPORT_SYMBOL(cpu_khz);
7126    
7127     -extern unsigned long wall_jiffies;
7128     -
7129     DEFINE_SPINLOCK(rtc_lock);
7130     EXPORT_SYMBOL(rtc_lock);
7131    
7132     @@ -265,11 +262,10 @@
7133     time_t wtm_sec, xtime_sec;
7134     u64 tmp, wc_nsec;
7135    
7136     - /* Adjust wall-clock time base based on wall_jiffies ticks. */
7137     + /* Adjust wall-clock time base. */
7138     wc_nsec = processed_system_time;
7139     wc_nsec += sec * (u64)NSEC_PER_SEC;
7140     wc_nsec += nsec;
7141     - wc_nsec -= (jiffies - wall_jiffies) * (u64)NS_PER_TICK;
7142    
7143     /* Split wallclock base into seconds and nanoseconds. */
7144     tmp = wc_nsec;
7145     @@ -387,16 +383,10 @@
7146     shadow = &per_cpu(shadow_time, cpu);
7147    
7148     do {
7149     - unsigned long lost;
7150     -
7151     local_time_version = shadow->version;
7152     seq = read_seqbegin(&xtime_lock);
7153    
7154     usec = get_usec_offset(shadow);
7155     - lost = jiffies - wall_jiffies;
7156     -
7157     - if (unlikely(lost))
7158     - usec += lost * (USEC_PER_SEC / HZ);
7159    
7160     sec = xtime.tv_sec;
7161     usec += (xtime.tv_nsec / NSEC_PER_USEC);
7162     @@ -519,7 +509,7 @@
7163     write_seqlock_irq(&xtime_lock);
7164    
7165     sec = xtime.tv_sec;
7166     - nsec = xtime.tv_nsec + ((jiffies - wall_jiffies) * (u64)NS_PER_TICK);
7167     + nsec = xtime.tv_nsec;
7168     __normalize_time(&sec, &nsec);
7169    
7170     op.cmd = XENPF_settime;
7171     @@ -593,42 +583,49 @@
7172     }
7173     #endif
7174    
7175     -#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
7176     unsigned long profile_pc(struct pt_regs *regs)
7177     {
7178     unsigned long pc = instruction_pointer(regs);
7179    
7180     -#ifdef __x86_64__
7181     - /* Assume the lock function has either no stack frame or only a single word.
7182     - This checks if the address on the stack looks like a kernel text address.
7183     - There is a small window for false hits, but in that case the tick
7184     - is just accounted to the spinlock function.
7185     - Better would be to write these functions in assembler again
7186     - and check exactly. */
7187     +#if defined(CONFIG_SMP) || defined(__x86_64__)
7188     if (!user_mode_vm(regs) && in_lock_functions(pc)) {
7189     - char *v = *(char **)regs->rsp;
7190     - if ((v >= _stext && v <= _etext) ||
7191     - (v >= _sinittext && v <= _einittext) ||
7192     - (v >= (char *)MODULES_VADDR && v <= (char *)MODULES_END))
7193     - return (unsigned long)v;
7194     - return ((unsigned long *)regs->rsp)[1];
7195     +# ifdef CONFIG_FRAME_POINTER
7196     +# ifdef __i386__
7197     + return ((unsigned long *)regs->ebp)[1];
7198     +# else
7199     + return ((unsigned long *)regs->rbp)[1];
7200     +# endif
7201     +# else
7202     +# ifdef __i386__
7203     + unsigned long *sp;
7204     + if ((regs->xcs & 2) == 0)
7205     + sp = (unsigned long *)&regs->esp;
7206     + else
7207     + sp = (unsigned long *)regs->esp;
7208     +# else
7209     + unsigned long *sp = (unsigned long *)regs->rsp;
7210     +# endif
7211     + /* Return address is either directly at the stack pointer
7212     + or above a saved eflags. Eflags has bits 22-31 zero,
7213     + kernel addresses don't. */
7214     + if (sp[0] >> 22)
7215     + return sp[0];
7216     + if (sp[1] >> 22)
7217     + return sp[1];
7218     +# endif
7219     }
7220     -#else
7221     - if (!user_mode_vm(regs) && in_lock_functions(pc))
7222     - return *(unsigned long *)(regs->ebp + 4);
7223     #endif
7224    
7225     return pc;
7226     }
7227     EXPORT_SYMBOL(profile_pc);
7228     -#endif
7229    
7230     /*
7231     * This is the same as the above, except we _also_ save the current
7232     * Time Stamp Counter value at the time of the timer interrupt, so that
7233     * we later on can estimate the time of day more exactly.
7234     */
7235     -irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
7236     +irqreturn_t timer_interrupt(int irq, void *dev_id)
7237     {
7238     s64 delta, delta_cpu, stolen, blocked;
7239     u64 sched_time;
7240     @@ -686,10 +683,14 @@
7241     }
7242    
7243     /* System-wide jiffy work. */
7244     - while (delta >= NS_PER_TICK) {
7245     - delta -= NS_PER_TICK;
7246     - processed_system_time += NS_PER_TICK;
7247     - do_timer(regs);
7248     + if (delta >= NS_PER_TICK) {
7249     + do_div(delta, NS_PER_TICK);
7250     + processed_system_time += delta * NS_PER_TICK;
7251     + while (delta > HZ) {
7252     + do_timer(HZ);
7253     + delta -= HZ;
7254     + }
7255     + do_timer(delta);
7256     }
7257    
7258     if (shadow_tv_version != HYPERVISOR_shared_info->wc_version) {
7259     @@ -734,7 +735,7 @@
7260     if (delta_cpu > 0) {
7261     do_div(delta_cpu, NS_PER_TICK);
7262     per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK;
7263     - if (user_mode_vm(regs))
7264     + if (user_mode_vm(get_irq_regs()))
7265     account_user_time(current, (cputime_t)delta_cpu);
7266     else
7267     account_system_time(current, HARDIRQ_OFFSET,
7268     @@ -748,10 +749,10 @@
7269     /* Local timer processing (see update_process_times()). */
7270     run_local_timers();
7271     if (rcu_pending(cpu))
7272     - rcu_check_callbacks(cpu, user_mode_vm(regs));
7273     + rcu_check_callbacks(cpu, user_mode_vm(get_irq_regs()));
7274     scheduler_tick();
7275     run_posix_cpu_timers(current);
7276     - profile_tick(CPU_PROFILING, regs);
7277     + profile_tick(CPU_PROFILING);
7278    
7279     return IRQ_HANDLED;
7280     }
7281     @@ -959,10 +960,11 @@
7282     /* Duplicate of time_init() below, with hpet_enable part added */
7283     static void __init hpet_time_init(void)
7284     {
7285     - xtime.tv_sec = get_cmos_time();
7286     - xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
7287     - set_normalized_timespec(&wall_to_monotonic,
7288     - -xtime.tv_sec, -xtime.tv_nsec);
7289     + struct timespec ts;
7290     + ts.tv_sec = get_cmos_time();
7291     + ts.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
7292     +
7293     + do_settimeofday(&ts);
7294    
7295     if ((hpet_enable() >= 0) && hpet_use_timer) {
7296     printk("Using HPET for base-timer\n");
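
The timer_interrupt() hunk above batches jiffy accounting: one do_div() converts the nanosecond backlog into whole ticks, and do_timer() is then fed at most HZ ticks per call instead of looping once per tick. A sketch of the idiom, assuming the file's NS_PER_TICK constant and the 2.6.19 do_timer(unsigned long ticks) prototype:

    #include <asm/div64.h>

    static void demo_account(u64 *processed_ns, u64 now_ns)
    {
            u64 ticks = now_ns - *processed_ns;     /* ns behind */

            do_div(ticks, NS_PER_TICK);             /* u64 divide in place,
                                                       ticks = whole ticks */
            *processed_ns += ticks * NS_PER_TICK;   /* sub-tick ns carry */
            while (ticks > HZ) {
                    do_timer(HZ);                   /* feed jiffies in chunks */
                    ticks -= HZ;
            }
            do_timer(ticks);
    }
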
7297 niro 612 --- a/arch/x86/kernel/traps_32-xen.c
7298     +++ b/arch/x86/kernel/traps_32-xen.c
7299 niro 609 @@ -28,6 +28,7 @@
7300     #include <linux/kprobes.h>
7301     #include <linux/kexec.h>
7302     #include <linux/unwind.h>
7303     +#include <linux/uaccess.h>
7304    
7305     #ifdef CONFIG_EISA
7306     #include <linux/ioport.h>
7307     @@ -40,7 +41,6 @@
7308    
7309     #include <asm/processor.h>
7310     #include <asm/system.h>
7311     -#include <asm/uaccess.h>
7312     #include <asm/io.h>
7313     #include <asm/atomic.h>
7314     #include <asm/debugreg.h>
7315     @@ -51,11 +51,14 @@
7316     #include <asm/smp.h>
7317     #include <asm/arch_hooks.h>
7318     #include <asm/kdebug.h>
7319     +#include <asm/stacktrace.h>
7320    
7321     #include <linux/module.h>
7322    
7323     #include "mach_traps.h"
7324    
7325     +int panic_on_unrecovered_nmi;
7326     +
7327     asmlinkage int system_call(void);
7328    
7329     struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 },
7330     @@ -124,62 +127,63 @@
7331     p < (void *)tinfo + THREAD_SIZE - 3;
7332     }
7333    
7334     -/*
7335     - * Print one address/symbol entries per line.
7336     - */
7337     -static inline void print_addr_and_symbol(unsigned long addr, char *log_lvl)
7338     -{
7339     - printk(" [<%08lx>] ", addr);
7340     -
7341     - print_symbol("%s\n", addr);
7342     -}
7343     -
7344     static inline unsigned long print_context_stack(struct thread_info *tinfo,
7345     unsigned long *stack, unsigned long ebp,
7346     - char *log_lvl)
7347     + struct stacktrace_ops *ops, void *data)
7348     {
7349     unsigned long addr;
7350    
7351     #ifdef CONFIG_FRAME_POINTER
7352     while (valid_stack_ptr(tinfo, (void *)ebp)) {
7353     + unsigned long new_ebp;
7354     addr = *(unsigned long *)(ebp + 4);
7355     - print_addr_and_symbol(addr, log_lvl);
7356     + ops->address(data, addr);
7357     /*
7358     * break out of recursive entries (such as
7359     - * end_of_stack_stop_unwind_function):
7360     + * end_of_stack_stop_unwind_function). Also,
7361     + * we can never allow a frame pointer to
7362     + * move downwards!
7363     */
7364     - if (ebp == *(unsigned long *)ebp)
7365     + new_ebp = *(unsigned long *)ebp;
7366     + if (new_ebp <= ebp)
7367     break;
7368     - ebp = *(unsigned long *)ebp;
7369     + ebp = new_ebp;
7370     }
7371     #else
7372     while (valid_stack_ptr(tinfo, stack)) {
7373     addr = *stack++;
7374     if (__kernel_text_address(addr))
7375     - print_addr_and_symbol(addr, log_lvl);
7376     + ops->address(data, addr);
7377     }
7378     #endif
7379     return ebp;
7380     }
7381    
7382     +struct ops_and_data {
7383     + struct stacktrace_ops *ops;
7384     + void *data;
7385     +};
7386     +
7387     static asmlinkage int
7388     -show_trace_unwind(struct unwind_frame_info *info, void *log_lvl)
7389     +dump_trace_unwind(struct unwind_frame_info *info, void *data)
7390     {
7391     + struct ops_and_data *oad = (struct ops_and_data *)data;
7392     int n = 0;
7393    
7394     while (unwind(info) == 0 && UNW_PC(info)) {
7395     n++;
7396     - print_addr_and_symbol(UNW_PC(info), log_lvl);
7397     + oad->ops->address(oad->data, UNW_PC(info));
7398     if (arch_unw_user_mode(info))
7399     break;
7400     }
7401     return n;
7402     }
7403    
7404     -static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
7405     - unsigned long *stack, char *log_lvl)
7406     +void dump_trace(struct task_struct *task, struct pt_regs *regs,
7407     + unsigned long *stack,
7408     + struct stacktrace_ops *ops, void *data)
7409     {
7410     - unsigned long ebp;
7411     + unsigned long ebp = 0;
7412    
7413     if (!task)
7414     task = current;
7415     @@ -187,54 +191,116 @@
7416     if (call_trace >= 0) {
7417     int unw_ret = 0;
7418     struct unwind_frame_info info;
7419     + struct ops_and_data oad = { .ops = ops, .data = data };
7420    
7421     if (regs) {
7422     if (unwind_init_frame_info(&info, task, regs) == 0)
7423     - unw_ret = show_trace_unwind(&info, log_lvl);
7424     + unw_ret = dump_trace_unwind(&info, &oad);
7425     } else if (task == current)
7426     - unw_ret = unwind_init_running(&info, show_trace_unwind, log_lvl);
7427     + unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad);
7428     else {
7429     if (unwind_init_blocked(&info, task) == 0)
7430     - unw_ret = show_trace_unwind(&info, log_lvl);
7431     + unw_ret = dump_trace_unwind(&info, &oad);
7432     }
7433     if (unw_ret > 0) {
7434     if (call_trace == 1 && !arch_unw_user_mode(&info)) {
7435     - print_symbol("DWARF2 unwinder stuck at %s\n",
7436     + ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n",
7437     UNW_PC(&info));
7438     if (UNW_SP(&info) >= PAGE_OFFSET) {
7439     - printk("Leftover inexact backtrace:\n");
7440     + ops->warning(data, "Leftover inexact backtrace:\n");
7441     stack = (void *)UNW_SP(&info);
7442     + if (!stack)
7443     + return;
7444     + ebp = UNW_FP(&info);
7445     } else
7446     - printk("Full inexact backtrace again:\n");
7447     + ops->warning(data, "Full inexact backtrace again:\n");
7448     } else if (call_trace >= 1)
7449     return;
7450     else
7451     - printk("Full inexact backtrace again:\n");
7452     + ops->warning(data, "Full inexact backtrace again:\n");
7453     } else
7454     - printk("Inexact backtrace:\n");
7455     + ops->warning(data, "Inexact backtrace:\n");
7456     }
7457     -
7458     - if (task == current) {
7459     - /* Grab ebp right from our regs */
7460     - asm ("movl %%ebp, %0" : "=r" (ebp) : );
7461     - } else {
7462     - /* ebp is the last reg pushed by switch_to */
7463     - ebp = *(unsigned long *) task->thread.esp;
7464     + if (!stack) {
7465     + unsigned long dummy;
7466     + stack = &dummy;
7467     + if (task && task != current)
7468     + stack = (unsigned long *)task->thread.esp;
7469     + }
7470     +
7471     +#ifdef CONFIG_FRAME_POINTER
7472     + if (!ebp) {
7473     + if (task == current) {
7474     + /* Grab ebp right from our regs */
7475     + asm ("movl %%ebp, %0" : "=r" (ebp) : );
7476     + } else {
7477     + /* ebp is the last reg pushed by switch_to */
7478     + ebp = *(unsigned long *) task->thread.esp;
7479     + }
7480     }
7481     +#endif
7482    
7483     while (1) {
7484     struct thread_info *context;
7485     context = (struct thread_info *)
7486     ((unsigned long)stack & (~(THREAD_SIZE - 1)));
7487     - ebp = print_context_stack(context, stack, ebp, log_lvl);
7488     + ebp = print_context_stack(context, stack, ebp, ops, data);
7489     + /* Should be after the line below, but somewhere
7490     + in early boot context comes out corrupted and we
7491     + can't reference it -AK */
7492     + if (ops->stack(data, "IRQ") < 0)
7493     + break;
7494     stack = (unsigned long*)context->previous_esp;
7495     if (!stack)
7496     break;
7497     - printk("%s =======================\n", log_lvl);
7498     }
7499     }
7500     +EXPORT_SYMBOL(dump_trace);
7501 niro 612
7502     -void show_trace(struct task_struct *task, struct pt_regs *regs, unsigned long * stack)
7503 niro 609 +static void
7504     +print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
7505     +{
7506     + printk(data);
7507     + print_symbol(msg, symbol);
7508     + printk("\n");
7509     +}
7510     +
7511     +static void print_trace_warning(void *data, char *msg)
7512     +{
7513     + printk("%s%s\n", (char *)data, msg);
7514     +}
7515 niro 612 +
7516 niro 609 +static int print_trace_stack(void *data, char *name)
7517     +{
7518     + return 0;
7519     +}
7520     +
7521     +/*
7522     + * Print one address/symbol entries per line.
7523     + */
7524     +static void print_trace_address(void *data, unsigned long addr)
7525     +{
7526     + printk("%s [<%08lx>] ", (char *)data, addr);
7527     + print_symbol("%s\n", addr);
7528     +}
7529     +
7530     +static struct stacktrace_ops print_trace_ops = {
7531     + .warning = print_trace_warning,
7532     + .warning_symbol = print_trace_warning_symbol,
7533     + .stack = print_trace_stack,
7534     + .address = print_trace_address,
7535     +};
7536     +
7537     +static void
7538     +show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
7539     + unsigned long * stack, char *log_lvl)
7540     +{
7541     + dump_trace(task, regs, stack, &print_trace_ops, log_lvl);
7542     + printk("%s =======================\n", log_lvl);
7543     +}
7544     +
7545     +void show_trace(struct task_struct *task, struct pt_regs *regs,
7546     + unsigned long * stack)
7547     {
7548     show_trace_log_lvl(task, regs, stack, "");
7549     }
7550     @@ -297,12 +363,13 @@
7551     ss = regs->xss & 0xffff;
7552     }
7553     print_modules();
7554     - printk(KERN_EMERG "CPU: %d\nEIP: %04x:[<%08lx>] %s VLI\n"
7555     - "EFLAGS: %08lx (%s %.*s) \n",
7556     + printk(KERN_EMERG "CPU: %d\n"
7557     + KERN_EMERG "EIP: %04x:[<%08lx>] %s VLI\n"
7558     + KERN_EMERG "EFLAGS: %08lx (%s %.*s)\n",
7559     smp_processor_id(), 0xffff & regs->xcs, regs->eip,
7560     - print_tainted(), regs->eflags, system_utsname.release,
7561     - (int)strcspn(system_utsname.version, " "),
7562     - system_utsname.version);
7563     + print_tainted(), regs->eflags, init_utsname()->release,
7564     + (int)strcspn(init_utsname()->version, " "),
7565     + init_utsname()->version);
7566     print_symbol(KERN_EMERG "EIP is at %s\n", regs->eip);
7567     printk(KERN_EMERG "eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n",
7568     regs->eax, regs->ebx, regs->ecx, regs->edx);
7569     @@ -319,6 +386,8 @@
7570     */
7571     if (in_kernel) {
7572     u8 __user *eip;
7573     + int code_bytes = 64;
7574     + unsigned char c;
7575    
7576     printk("\n" KERN_EMERG "Stack: ");
7577     show_stack_log_lvl(NULL, regs, (unsigned long *)esp, KERN_EMERG);
7578     @@ -326,9 +395,12 @@
7579     printk(KERN_EMERG "Code: ");
7580    
7581     eip = (u8 __user *)regs->eip - 43;
7582     - for (i = 0; i < 64; i++, eip++) {
7583     - unsigned char c;
7584     -
7585     + if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) {
7586     + /* try starting at EIP */
7587     + eip = (u8 __user *)regs->eip;
7588     + code_bytes = 32;
7589     + }
7590     + for (i = 0; i < code_bytes; i++, eip++) {
7591     if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) {
7592     printk(" Bad EIP value.");
7593     break;
7594     @@ -349,7 +421,7 @@
7595    
7596     if (eip < PAGE_OFFSET)
7597     return;
7598     - if (__get_user(ud2, (unsigned short __user *)eip))
7599     + if (probe_kernel_address((unsigned short __user *)eip, ud2))
7600     return;
7601     if (ud2 != 0x0b0f)
7602     return;
7603     @@ -362,7 +434,8 @@
7604     char *file;
7605     char c;
7606    
7607     - if (__get_user(line, (unsigned short __user *)(eip + 2)))
7608     + if (probe_kernel_address((unsigned short __user *)(eip + 2),
7609     + line))
7610     break;
7611     if (__get_user(file, (char * __user *)(eip + 4)) ||
7612     (unsigned long)file < PAGE_OFFSET || __get_user(c, file))
7613     @@ -604,18 +677,24 @@
7614     }
7615     }
7616    
7617     -static void mem_parity_error(unsigned char reason, struct pt_regs * regs)
7618     +static __kprobes void
7619     +mem_parity_error(unsigned char reason, struct pt_regs * regs)
7620     {
7621     - printk(KERN_EMERG "Uhhuh. NMI received. Dazed and confused, but trying "
7622     - "to continue\n");
7623     + printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
7624     + "CPU %d.\n", reason, smp_processor_id());
7625     printk(KERN_EMERG "You probably have a hardware problem with your RAM "
7626     "chips\n");
7627     + if (panic_on_unrecovered_nmi)
7628     + panic("NMI: Not continuing");
7629     +
7630     + printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
7631    
7632     /* Clear and disable the memory parity error line. */
7633     clear_mem_error(reason);
7634     }
7635    
7636     -static void io_check_error(unsigned char reason, struct pt_regs * regs)
7637     +static __kprobes void
7638     +io_check_error(unsigned char reason, struct pt_regs * regs)
7639     {
7640     printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
7641     show_registers(regs);
7642     @@ -624,7 +703,8 @@
7643     clear_io_check_error(reason);
7644     }
7645    
7646     -static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
7647     +static __kprobes void
7648     +unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
7649     {
7650     #ifdef CONFIG_MCA
7651     /* Might actually be able to figure out what the guilty party
7652     @@ -634,15 +714,18 @@
7653     return;
7654     }
7655     #endif
7656     - printk("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
7657     - reason, smp_processor_id());
7658     - printk("Dazed and confused, but trying to continue\n");
7659     - printk("Do you have a strange power saving mode enabled?\n");
7660     + printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
7661     + "CPU %d.\n", reason, smp_processor_id());
7662     + printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
7663     + if (panic_on_unrecovered_nmi)
7664     + panic("NMI: Not continuing");
7665     +
7666     + printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
7667     }
7668    
7669     static DEFINE_SPINLOCK(nmi_print_lock);
7670    
7671     -void die_nmi (struct pt_regs *regs, const char *msg)
7672     +void __kprobes die_nmi(struct pt_regs *regs, const char *msg)
7673     {
7674     if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) ==
7675     NOTIFY_STOP)
7676     @@ -674,7 +757,7 @@
7677     do_exit(SIGSEGV);
7678     }
7679    
7680     -static void default_do_nmi(struct pt_regs * regs)
7681     +static __kprobes void default_do_nmi(struct pt_regs * regs)
7682     {
7683     unsigned char reason = 0;
7684    
7685     @@ -691,12 +774,12 @@
7686     * Ok, so this is none of the documented NMI sources,
7687     * so it must be the NMI watchdog.
7688     */
7689     - if (nmi_watchdog) {
7690     - nmi_watchdog_tick(regs);
7691     + if (nmi_watchdog_tick(regs, reason))
7692     return;
7693     - }
7694     + if (!do_nmi_callback(regs, smp_processor_id()))
7695     #endif
7696     - unknown_nmi_error(reason, regs);
7697     + unknown_nmi_error(reason, regs);
7698     +
7699     return;
7700     }
7701     if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
7702     @@ -712,14 +795,7 @@
7703     reassert_nmi();
7704     }
7705    
7706     -static int dummy_nmi_callback(struct pt_regs * regs, int cpu)
7707     -{
7708     - return 0;
7709     -}
7710     -
7711     -static nmi_callback_t nmi_callback = dummy_nmi_callback;
7712     -
7713     -fastcall void do_nmi(struct pt_regs * regs, long error_code)
7714     +fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code)
7715     {
7716     int cpu;
7717    
7718     @@ -729,25 +805,11 @@
7719    
7720     ++nmi_count(cpu);
7721    
7722     - if (!rcu_dereference(nmi_callback)(regs, cpu))
7723     - default_do_nmi(regs);
7724     + default_do_nmi(regs);
7725    
7726     nmi_exit();
7727     }
7728    
7729     -void set_nmi_callback(nmi_callback_t callback)
7730     -{
7731     - vmalloc_sync_all();
7732     - rcu_assign_pointer(nmi_callback, callback);
7733     -}
7734     -EXPORT_SYMBOL_GPL(set_nmi_callback);
7735     -
7736     -void unset_nmi_callback(void)
7737     -{
7738     - nmi_callback = dummy_nmi_callback;
7739     -}
7740     -EXPORT_SYMBOL_GPL(unset_nmi_callback);
7741     -
7742     #ifdef CONFIG_KPROBES
7743     fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code)
7744     {
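
The traps_32 rework above turns the printing backtracer into a generic dump_trace() driven by struct stacktrace_ops callbacks, with show_trace_log_lvl() reduced to one consumer. A hedged sketch of another consumer that merely counts kernel text addresses; every name except dump_trace() and the ops fields is invented:

    #include <linux/sched.h>
    #include <asm/stacktrace.h>

    static void count_warning(void *data, char *msg) { }
    static void count_warning_symbol(void *data, char *msg,
                                     unsigned long sym) { }

    static int count_stack(void *data, char *name)
    {
            return 0;                       /* keep walking nested stacks */
    }

    static void count_address(void *data, unsigned long addr)
    {
            ++*(unsigned int *)data;        /* one hit per text address */
    }

    static struct stacktrace_ops count_ops = {
            .warning        = count_warning,
            .warning_symbol = count_warning_symbol,
            .stack          = count_stack,
            .address        = count_address,
    };

    static unsigned int count_kernel_frames(void)
    {
            unsigned int n = 0;

            dump_trace(current, NULL, NULL, &count_ops, &n);
            return n;
    }
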
7745 niro 612 --- a/arch/x86/kernel/traps_64-xen.c
7746     +++ b/arch/x86/kernel/traps_64-xen.c
7747 niro 609 @@ -23,6 +23,7 @@
7748     #include <linux/delay.h>
7749     #include <linux/spinlock.h>
7750     #include <linux/interrupt.h>
7751     +#include <linux/kallsyms.h>
7752     #include <linux/module.h>
7753     #include <linux/moduleparam.h>
7754     #include <linux/nmi.h>
7755     @@ -45,6 +46,7 @@
7756     #include <asm/pda.h>
7757     #include <asm/proto.h>
7758     #include <asm/nmi.h>
7759     +#include <asm/stacktrace.h>
7760    
7761     asmlinkage void divide_error(void);
7762     asmlinkage void debug(void);
7763     @@ -114,7 +116,6 @@
7764     #endif
7765    
7766     #ifdef CONFIG_KALLSYMS
7767     -# include <linux/kallsyms.h>
7768     void printk_address(unsigned long address)
7769     {
7770     unsigned long offset = 0, symsize;
7771     @@ -142,7 +143,7 @@
7772     #endif
7773    
7774     static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
7775     - unsigned *usedp, const char **idp)
7776     + unsigned *usedp, char **idp)
7777     {
7778     #ifndef CONFIG_X86_NO_TSS
7779     static char ids[][8] = {
7780     @@ -162,26 +163,7 @@
7781     * 'stack' is in one of them:
7782     */
7783     for (k = 0; k < N_EXCEPTION_STACKS; k++) {
7784     - unsigned long end;
7785     -
7786     - /*
7787     - * set 'end' to the end of the exception stack.
7788     - */
7789     - switch (k + 1) {
7790     - /*
7791     - * TODO: this block is not needed i think, because
7792     - * setup64.c:cpu_init() sets up t->ist[DEBUG_STACK]
7793     - * properly too.
7794     - */
7795     -#if DEBUG_STKSZ > EXCEPTION_STKSZ
7796     - case DEBUG_STACK:
7797     - end = cpu_pda(cpu)->debugstack + DEBUG_STKSZ;
7798     - break;
7799     -#endif
7800     - default:
7801     - end = per_cpu(orig_ist, cpu).ist[k];
7802     - break;
7803     - }
7804     + unsigned long end = per_cpu(orig_ist, cpu).ist[k];
7805     /*
7806     * Is 'stack' above this exception frame's end?
7807     * If yes then skip to the next frame.
7808     @@ -236,13 +218,19 @@
7809     return NULL;
7810     }
7811    
7812     -static int show_trace_unwind(struct unwind_frame_info *info, void *context)
7813     +struct ops_and_data {
7814     + struct stacktrace_ops *ops;
7815     + void *data;
7816     +};
7817     +
7818     +static int dump_trace_unwind(struct unwind_frame_info *info, void *context)
7819     {
7820     + struct ops_and_data *oad = (struct ops_and_data *)context;
7821     int n = 0;
7822    
7823     while (unwind(info) == 0 && UNW_PC(info)) {
7824     n++;
7825     - printk_address(UNW_PC(info));
7826     + oad->ops->address(oad->data, UNW_PC(info));
7827     if (arch_unw_user_mode(info))
7828     break;
7829     }
7830     @@ -256,13 +244,19 @@
7831     * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
7832     */
7833    
7834     -void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack)
7835     +static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
7836     {
7837     - const unsigned cpu = safe_smp_processor_id();
7838     + void *t = (void *)tinfo;
7839     + return p > t && p < t + THREAD_SIZE - 3;
7840     +}
7841     +
7842     +void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack,
7843     + struct stacktrace_ops *ops, void *data)
7844     +{
7845     + const unsigned cpu = smp_processor_id();
7846     unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
7847     unsigned used = 0;
7848     -
7849     - printk("\nCall Trace:\n");
7850     + struct thread_info *tinfo;
7851    
7852     if (!tsk)
7853     tsk = current;
7854     @@ -270,32 +264,47 @@
7855     if (call_trace >= 0) {
7856     int unw_ret = 0;
7857     struct unwind_frame_info info;
7858     + struct ops_and_data oad = { .ops = ops, .data = data };
7859    
7860     if (regs) {
7861     if (unwind_init_frame_info(&info, tsk, regs) == 0)
7862     - unw_ret = show_trace_unwind(&info, NULL);
7863     + unw_ret = dump_trace_unwind(&info, &oad);
7864     } else if (tsk == current)
7865     - unw_ret = unwind_init_running(&info, show_trace_unwind, NULL);
7866     + unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad);
7867     else {
7868     if (unwind_init_blocked(&info, tsk) == 0)
7869     - unw_ret = show_trace_unwind(&info, NULL);
7870     + unw_ret = dump_trace_unwind(&info, &oad);
7871     }
7872     if (unw_ret > 0) {
7873     if (call_trace == 1 && !arch_unw_user_mode(&info)) {
7874     - print_symbol("DWARF2 unwinder stuck at %s\n",
7875     + ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n",
7876     UNW_PC(&info));
7877     if ((long)UNW_SP(&info) < 0) {
7878     - printk("Leftover inexact backtrace:\n");
7879     + ops->warning(data, "Leftover inexact backtrace:\n");
7880     stack = (unsigned long *)UNW_SP(&info);
7881     + if (!stack)
7882     + return;
7883     } else
7884     - printk("Full inexact backtrace again:\n");
7885     + ops->warning(data, "Full inexact backtrace again:\n");
7886     } else if (call_trace >= 1)
7887     return;
7888     else
7889     - printk("Full inexact backtrace again:\n");
7890     + ops->warning(data, "Full inexact backtrace again:\n");
7891     } else
7892     - printk("Inexact backtrace:\n");
7893     + ops->warning(data, "Inexact backtrace:\n");
7894     + }
7895     + if (!stack) {
7896     + unsigned long dummy;
7897     + stack = &dummy;
7898     + if (tsk && tsk != current)
7899     + stack = (unsigned long *)tsk->thread.rsp;
7900     }
7901     + /*
7902     + * Align the stack pointer on word boundary, later loops
7903     + * rely on that (and corruption / debug info bugs can cause
7904     + * unaligned values here):
7905     + */
7906     + stack = (unsigned long *)((unsigned long)stack & ~(sizeof(long)-1));
7907    
7908     /*
7909     * Print function call entries within a stack. 'cond' is the
7910     @@ -305,7 +314,9 @@
7911     #define HANDLE_STACK(cond) \
7912     do while (cond) { \
7913     unsigned long addr = *stack++; \
7914     - if (kernel_text_address(addr)) { \
7915     + if (oops_in_progress ? \
7916     + __kernel_text_address(addr) : \
7917     + kernel_text_address(addr)) { \
7918     /* \
7919     * If the address is either in the text segment of the \
7920     * kernel, or in the region which contains vmalloc'ed \
7921     @@ -314,7 +325,7 @@
7922     * down the cause of the crash will be able to figure \
7923     * out the call path that was taken. \
7924     */ \
7925     - printk_address(addr); \
7926     + ops->address(data, addr); \
7927     } \
7928     } while (0)
7929    
7930     @@ -323,16 +334,17 @@
7931     * current stack address. If the stacks consist of nested
7932     * exceptions
7933     */
7934     - for ( ; ; ) {
7935     - const char *id;
7936     + for (;;) {
7937     + char *id;
7938     unsigned long *estack_end;
7939     estack_end = in_exception_stack(cpu, (unsigned long)stack,
7940     &used, &id);
7941    
7942     if (estack_end) {
7943     - printk(" <%s>", id);
7944     + if (ops->stack(data, id) < 0)
7945     + break;
7946     HANDLE_STACK (stack < estack_end);
7947     - printk(" <EOE>");
7948     + ops->stack(data, "<EOE>");
7949     /*
7950     * We link to the next stack via the
7951     * second-to-last pointer (index -2 to end) in the
7952     @@ -347,7 +359,8 @@
7953     (IRQSTACKSIZE - 64) / sizeof(*irqstack);
7954    
7955     if (stack >= irqstack && stack < irqstack_end) {
7956     - printk(" <IRQ>");
7957     + if (ops->stack(data, "IRQ") < 0)
7958     + break;
7959     HANDLE_STACK (stack < irqstack_end);
7960     /*
7961     * We link to the next stack (which would be
7962     @@ -356,7 +369,7 @@
7963     */
7964     stack = (unsigned long *) (irqstack_end[-1]);
7965     irqstack_end = NULL;
7966     - printk(" <EOI>");
7967     + ops->stack(data, "EOI");
7968     continue;
7969     }
7970     }
7971     @@ -364,19 +377,58 @@
7972     }
7973    
7974     /*
7975     - * This prints the process stack:
7976     + * This handles the process stack:
7977     */
7978     - HANDLE_STACK (((long) stack & (THREAD_SIZE-1)) != 0);
7979     + tinfo = current_thread_info();
7980     + HANDLE_STACK (valid_stack_ptr(tinfo, stack));
7981     #undef HANDLE_STACK
7982     +}
7983     +EXPORT_SYMBOL(dump_trace);
7984     +
7985     +static void
7986     +print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
7987     +{
7988     + print_symbol(msg, symbol);
7989     + printk("\n");
7990     +}
7991     +
7992     +static void print_trace_warning(void *data, char *msg)
7993     +{
7994     + printk("%s\n", msg);
7995     +}
7996     +
7997     +static int print_trace_stack(void *data, char *name)
7998     +{
7999     + printk(" <%s> ", name);
8000     + return 0;
8001     +}
8002     +
8003     +static void print_trace_address(void *data, unsigned long addr)
8004     +{
8005     + printk_address(addr);
8006     +}
8007     +
8008     +static struct stacktrace_ops print_trace_ops = {
8009     + .warning = print_trace_warning,
8010     + .warning_symbol = print_trace_warning_symbol,
8011     + .stack = print_trace_stack,
8012     + .address = print_trace_address,
8013     +};
8014    
8015     +void
8016     +show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack)
8017     +{
8018     + printk("\nCall Trace:\n");
8019     + dump_trace(tsk, regs, stack, &print_trace_ops, NULL);
8020     printk("\n");
8021     }
8022    
8023     -static void _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long * rsp)
8024     +static void
8025     +_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp)
8026     {
8027     unsigned long *stack;
8028     int i;
8029     - const int cpu = safe_smp_processor_id();
8030     + const int cpu = smp_processor_id();
8031     unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr);
8032     unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
8033    
8034     @@ -430,7 +482,7 @@
8035     int i;
8036     int in_kernel = !user_mode(regs);
8037     unsigned long rsp;
8038     - const int cpu = safe_smp_processor_id();
8039     + const int cpu = smp_processor_id();
8040     struct task_struct *cur = cpu_pda(cpu)->pcurrent;
8041    
8042     rsp = regs->rsp;
8043     @@ -505,9 +557,11 @@
8044    
8045     unsigned __kprobes long oops_begin(void)
8046     {
8047     - int cpu = safe_smp_processor_id();
8048     + int cpu = smp_processor_id();
8049     unsigned long flags;
8050    
8051     + oops_enter();
8052     +
8053     /* racy, but better than risking deadlock. */
8054     local_irq_save(flags);
8055     if (!spin_trylock(&die_lock)) {
8056     @@ -536,6 +590,7 @@
8057     spin_unlock_irqrestore(&die_lock, flags);
8058     if (panic_on_oops)
8059     panic("Fatal exception");
8060     + oops_exit();
8061     }
8062    
8063     void __kprobes __die(const char * str, struct pt_regs * regs, long err)
8064     @@ -573,7 +628,7 @@
8065     }
8066    
8067     #ifdef CONFIG_X86_LOCAL_APIC
8068     -void __kprobes die_nmi(char *str, struct pt_regs *regs)
8069     +void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
8070     {
8071     unsigned long flags = oops_begin();
8072    
8073     @@ -581,13 +636,12 @@
8074     * We are in trouble anyway, lets at least try
8075     * to get a message out.
8076     */
8077     - printk(str, safe_smp_processor_id());
8078     + printk(str, smp_processor_id());
8079     show_registers(regs);
8080     if (kexec_should_crash(current))
8081     crash_kexec(regs);
8082     - if (panic_on_timeout || panic_on_oops)
8083     - panic("nmi watchdog");
8084     - printk("console shuts up ...\n");
8085     + if (do_panic || panic_on_oops)
8086     + panic("Non maskable interrupt");
8087     oops_end(flags);
8088     nmi_exit();
8089     local_irq_enable();
8090     @@ -734,8 +788,15 @@
8091     static __kprobes void
8092     mem_parity_error(unsigned char reason, struct pt_regs * regs)
8093     {
8094     - printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n");
8095     - printk("You probably have a hardware problem with your RAM chips\n");
8096     + printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
8097     + reason);
8098     + printk(KERN_EMERG "You probably have a hardware problem with your "
8099     + "RAM chips\n");
8100     +
8101     + if (panic_on_unrecovered_nmi)
8102     + panic("NMI: Not continuing");
8103     +
8104     + printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
8105    
8106     #if 0 /* XEN */
8107     /* Clear and disable the memory parity error line. */
8108     @@ -762,9 +823,15 @@
8109    
8110     static __kprobes void
8111     unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
8112     -{ printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
8113     - printk("Dazed and confused, but trying to continue\n");
8114     - printk("Do you have a strange power saving mode enabled?\n");
8115     +{
8116     + printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
8117     + reason);
8118     + printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
8119     +
8120     + if (panic_on_unrecovered_nmi)
8121     + panic("NMI: Not continuing");
8122     +
8123     + printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
8124     }
8125    
8126     /* Runs on IST stack. This code must keep interrupts off all the time.
8127     @@ -789,12 +856,12 @@
8128     * Ok, so this is none of the documented NMI sources,
8129     * so it must be the NMI watchdog.
8130     */
8131     - if (nmi_watchdog > 0) {
8132     - nmi_watchdog_tick(regs,reason);
8133     + if (nmi_watchdog_tick(regs,reason))
8134     return;
8135     - }
8136     #endif
8137     - unknown_nmi_error(reason, regs);
8138     + if (!do_nmi_callback(regs,cpu))
8139     + unknown_nmi_error(reason, regs);
8140     +
8141     return;
8142     }
8143     if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
8144     @@ -1081,6 +1148,7 @@
8145     init_fpu(me);
8146     restore_fpu_checking(&me->thread.i387.fxsave);
8147     task_thread_info(me)->status |= TS_USEDFPU;
8148     + me->fpu_counter++;
8149     }
8150    
8151    
8152     @@ -1141,24 +1209,30 @@
8153     }
8154    
8155    
8156     -/* Actual parsing is done early in setup.c. */
8157     -static int __init oops_dummy(char *s)
8158     +static int __init oops_setup(char *s)
8159     {
8160     - panic_on_oops = 1;
8161     - return 1;
8162     + if (!s)
8163     + return -EINVAL;
8164     + if (!strcmp(s, "panic"))
8165     + panic_on_oops = 1;
8166     + return 0;
8167     }
8168     -__setup("oops=", oops_dummy);
8169     +early_param("oops", oops_setup);
8170    
8171     static int __init kstack_setup(char *s)
8172     {
8173     + if (!s)
8174     + return -EINVAL;
8175     kstack_depth_to_print = simple_strtoul(s,NULL,0);
8176     - return 1;
8177     + return 0;
8178     }
8179     -__setup("kstack=", kstack_setup);
8180     +early_param("kstack", kstack_setup);
8181    
8182     #ifdef CONFIG_STACK_UNWIND
8183     static int __init call_trace_setup(char *s)
8184     {
8185     + if (!s)
8186     + return -EINVAL;
8187     if (strcmp(s, "old") == 0)
8188     call_trace = -1;
8189     else if (strcmp(s, "both") == 0)
8190     @@ -1167,7 +1241,7 @@
8191     call_trace = 1;
8192     else if (strcmp(s, "new") == 0)
8193     call_trace = 2;
8194     - return 1;
8195     + return 0;
8196     }
8197     -__setup("call_trace=", call_trace_setup);
8198     +early_param("call_trace", call_trace_setup);
8199     #endif
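
The option-parsing hunks above flip the return convention along with the registration macro: __setup() handlers report "consumed" with a non-zero return, while early_param() handlers return 0 on success and -EINVAL on a missing or bad value. Side by side, with hypothetical option names:

    #include <linux/init.h>

    static int __init demo_old(char *s)
    {
            return 1;               /* __setup(): non-zero == consumed */
    }
    __setup("demo_old=", demo_old);

    static int __init demo_new(char *s)
    {
            if (!s)
                    return -EINVAL; /* early_param() may pass NULL */
            return 0;               /* 0 == success */
    }
    early_param("demo_new", demo_new);
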
8200 niro 612 --- a/arch/x86/kernel/vsyscall_64-xen.c
8201     +++ b/arch/x86/kernel/vsyscall_64-xen.c
8202 niro 609 @@ -26,6 +26,10 @@
8203     #include <linux/seqlock.h>
8204     #include <linux/jiffies.h>
8205     #include <linux/sysctl.h>
8206     +#include <linux/getcpu.h>
8207     +#include <linux/cpu.h>
8208     +#include <linux/smp.h>
8209     +#include <linux/notifier.h>
8210    
8211     #include <asm/vsyscall.h>
8212     #include <asm/pgtable.h>
8213     @@ -33,11 +37,15 @@
8214     #include <asm/fixmap.h>
8215     #include <asm/errno.h>
8216     #include <asm/io.h>
8217     +#include <asm/segment.h>
8218     +#include <asm/desc.h>
8219     +#include <asm/topology.h>
8220    
8221     #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
8222    
8223     int __sysctl_vsyscall __section_sysctl_vsyscall = 1;
8224     seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED;
8225     +int __vgetcpu_mode __section_vgetcpu_mode;
8226    
8227     #include <asm/unistd.h>
8228    
8229     @@ -61,8 +69,7 @@
8230     sequence = read_seqbegin(&__xtime_lock);
8231    
8232     sec = __xtime.tv_sec;
8233     - usec = (__xtime.tv_nsec / 1000) +
8234     - (__jiffies - __wall_jiffies) * (1000000 / HZ);
8235     + usec = __xtime.tv_nsec / 1000;
8236    
8237     if (__vxtime.mode != VXTIME_HPET) {
8238     t = get_cycles_sync();
8239     @@ -72,7 +79,8 @@
8240     __vxtime.tsc_quot) >> 32;
8241     /* See comment in x86_64 do_gettimeofday. */
8242     } else {
8243     - usec += ((readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0) -
8244     + usec += ((readl((void __iomem *)
8245     + fix_to_virt(VSYSCALL_HPET) + 0xf0) -
8246     __vxtime.last) * __vxtime.quot) >> 32;
8247     }
8248     } while (read_seqretry(&__xtime_lock, sequence));
8249     @@ -127,9 +135,46 @@
8250     return __xtime.tv_sec;
8251     }
8252    
8253     -long __vsyscall(2) venosys_0(void)
8254     -{
8255     - return -ENOSYS;
8256     +/* Fast way to get current CPU and node.
8257     + This helps to do per node and per CPU caches in user space.
8258     + The result is not guaranteed without CPU affinity, but usually
8259     + works out because the scheduler tries to keep a thread on the same
8260     + CPU.
8261     +
8262     + tcache must point to an array of at least two longs.
8263     + All arguments can be NULL. */
8264     +long __vsyscall(2)
8265     +vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
8266     +{
8267     + unsigned int dummy, p;
8268     + unsigned long j = 0;
8269     +
8270     + /* Fast cache - only recompute value once per jiffy and avoid
8271     + relatively costly rdtscp/cpuid otherwise.
8272     + This works because the scheduler usually keeps the process
8273     + on the same CPU and this syscall doesn't guarantee its
8274     + results anyway.
8275     + We do this here because otherwise user space would do it on
8276     + its own in a likely inferior way (no access to jiffies).
8277     + If you don't like it pass NULL. */
8278     + if (tcache && tcache->blob[0] == (j = __jiffies)) {
8279     + p = tcache->blob[1];
8280     + } else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
8281     + /* Load per CPU data from RDTSCP */
8282     + rdtscp(dummy, dummy, p);
8283     + } else {
8284     + /* Load per CPU data from GDT */
8285     + asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
8286     + }
8287     + if (tcache) {
8288     + tcache->blob[0] = j;
8289     + tcache->blob[1] = p;
8290     + }
8291     + if (cpu)
8292     + *cpu = p & 0xfff;
8293     + if (node)
8294     + *node = p >> 12;
8295     + return 0;
8296     }
8297    
8298     long __vsyscall(3) venosys_1(void)
8299     @@ -149,7 +194,8 @@
8300     void __user *buffer, size_t *lenp, loff_t *ppos)
8301     {
8302     extern u16 vsysc1, vsysc2;
8303     - u16 *map1, *map2;
8304     + u16 __iomem *map1;
8305     + u16 __iomem *map2;
8306     int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
8307     if (!write)
8308     return ret;
8309     @@ -164,11 +210,11 @@
8310     goto out;
8311     }
8312     if (!sysctl_vsyscall) {
8313     - *map1 = SYSCALL;
8314     - *map2 = SYSCALL;
8315     + writew(SYSCALL, map1);
8316     + writew(SYSCALL, map2);
8317     } else {
8318     - *map1 = NOP2;
8319     - *map2 = NOP2;
8320     + writew(NOP2, map1);
8321     + writew(NOP2, map2);
8322     }
8323     iounmap(map2);
8324     out:
8325     @@ -200,6 +246,48 @@
8326    
8327     #endif
8328    
8329     +/* Assume __initcall executes before all user space. Hopefully kmod
8330     + doesn't violate that. We'll find out if it does. */
8331     +static void __cpuinit vsyscall_set_cpu(int cpu)
8332     +{
8333     + unsigned long d;
8334     + unsigned long node = 0;
8335     +#ifdef CONFIG_NUMA
8336     + node = cpu_to_node[cpu];
8337     +#endif
8338     + if (cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP))
8339     + write_rdtscp_aux((node << 12) | cpu);
8340     +
8341     + /* Store cpu number in limit so that it can be loaded quickly
8342     + in user space in vgetcpu.
8343     + 12 bits for the CPU and 8 bits for the node. */
8344     + d = 0x0f40000000000ULL;
8345     + d |= cpu;
8346     + d |= (node & 0xf) << 12;
8347     + d |= (node >> 4) << 48;
8348     + if (HYPERVISOR_update_descriptor(virt_to_machine(cpu_gdt(cpu)
8349     + + GDT_ENTRY_PER_CPU),
8350     + d))
8351     + BUG();
8352     +}
8353     +
8354     +static void __cpuinit cpu_vsyscall_init(void *arg)
8355     +{
8356     + /* preemption should be already off */
8357     + vsyscall_set_cpu(raw_smp_processor_id());
8358     +}
8359     +
8360     +#ifdef CONFIG_HOTPLUG_CPU
8361     +static int __cpuinit
8362     +cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
8363     +{
8364     + long cpu = (long)arg;
8365     + if (action == CPU_ONLINE)
8366     + smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1);
8367     + return NOTIFY_DONE;
8368     +}
8369     +#endif
8370     +
8371     static void __init map_vsyscall(void)
8372     {
8373     extern char __vsyscall_0;
8374     @@ -214,13 +302,20 @@
8375     VSYSCALL_ADDR(__NR_vgettimeofday)));
8376     BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
8377     BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
8378     + BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
8379     map_vsyscall();
8380     #ifdef CONFIG_XEN
8381     sysctl_vsyscall = 0; /* disable vgettimeofday() */
8382     + if (boot_cpu_has(X86_FEATURE_RDTSCP))
8383     + vgetcpu_mode = VGETCPU_RDTSCP;
8384     + else
8385     + vgetcpu_mode = VGETCPU_LSL;
8386     #endif
8387     #ifdef CONFIG_SYSCTL
8388     register_sysctl_table(kernel_root_table2, 0);
8389     #endif
8390     + on_each_cpu(cpu_vsyscall_init, NULL, 0, 1);
8391     + hotcpu_notifier(cpu_vsyscall_notifier, 0);
8392     return 0;
8393     }
8394    
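The vgetcpu() vsyscall added above packs the CPU number into the low 12 bits of the per-CPU GDT segment limit and the node number into the bits above it, so user space recovers both with a single lsl (or rdtscp where available) and caches the result once per jiffy. A minimal user-space sketch of a caller; this is not part of the patch, and the fixed 0xffffffffff600800 entry address (VSYSCALL_START + __NR_vgetcpu * 1024) and the getcpu_cache layout are assumptions based on the historical x86-64 vsyscall map:

#include <stdio.h>

/* cache block whose first two longs are used, per the comment above */
struct getcpu_cache { unsigned long blob[128 / sizeof(long)]; };

typedef long (*vgetcpu_t)(unsigned *cpu, unsigned *node,
                          struct getcpu_cache *tcache);

int main(void)
{
        vgetcpu_t vgetcpu = (vgetcpu_t)0xffffffffff600800UL; /* assumed address */
        struct getcpu_cache cache = { { 0 } };
        unsigned cpu, node;

        if (vgetcpu(&cpu, &node, &cache) == 0)
                /* kernel side returned p & 0xfff and p >> 12 */
                printf("running on cpu %u, node %u\n", cpu, node);
        return 0;
}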
8395 niro 612 --- a/arch/x86/mach-xen/setup.c
8396     +++ b/arch/x86/mach-xen/setup.c
8397 niro 609 @@ -103,8 +103,10 @@
8398    
8399     setup_xen_features();
8400    
8401     - if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
8402     - set_fixaddr_top(pp.virt_start);
8403     + if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) {
8404     + hypervisor_virt_start = pp.virt_start;
8405     + reserve_top_address(0UL - pp.virt_start);
8406     + }
8407    
8408     if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
8409     machine_to_phys_mapping = (unsigned long *)mapping.v_start;
8410 niro 612 --- a/arch/x86/mm/fault_32-xen.c
8411     +++ b/arch/x86/mm/fault_32-xen.c
8412 niro 609 @@ -27,21 +27,24 @@
8413     #include <asm/uaccess.h>
8414     #include <asm/desc.h>
8415     #include <asm/kdebug.h>
8416     +#include <asm/segment.h>
8417    
8418     extern void die(const char *,struct pt_regs *,long);
8419    
8420     -#ifdef CONFIG_KPROBES
8421     -ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
8422     +static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
8423     +
8424     int register_page_fault_notifier(struct notifier_block *nb)
8425     {
8426     vmalloc_sync_all();
8427     return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
8428     }
8429     +EXPORT_SYMBOL_GPL(register_page_fault_notifier);
8430    
8431     int unregister_page_fault_notifier(struct notifier_block *nb)
8432     {
8433     return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
8434     }
8435     +EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);
8436    
8437     static inline int notify_page_fault(enum die_val val, const char *str,
8438     struct pt_regs *regs, long err, int trap, int sig)
8439     @@ -55,14 +58,6 @@
8440     };
8441     return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args);
8442     }
8443     -#else
8444     -static inline int notify_page_fault(enum die_val val, const char *str,
8445     - struct pt_regs *regs, long err, int trap, int sig)
8446     -{
8447     - return NOTIFY_DONE;
8448     -}
8449     -#endif
8450     -
8451    
8452     /*
8453     * Unlock any spinlocks which will prevent us from getting the
8454     @@ -119,10 +114,10 @@
8455     }
8456    
8457     /* The standard kernel/user address space limit. */
8458     - *eip_limit = (seg & 2) ? USER_DS.seg : KERNEL_DS.seg;
8459     + *eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg;
8460    
8461     /* By far the most common cases. */
8462     - if (likely(seg == __USER_CS || seg == GET_KERNEL_CS()))
8463     + if (likely(SEGMENT_IS_FLAT_CODE(seg)))
8464     return eip;
8465    
8466     /* Check the segment exists, is within the current LDT/GDT size,
8467     @@ -559,11 +554,7 @@
8468     write = 0;
8469     switch (error_code & 3) {
8470     default: /* 3: write, present */
8471     -#ifdef TEST_VERIFY_AREA
8472     - if (regs->cs == GET_KERNEL_CS())
8473     - printk("WP fault at %08lx\n", regs->eip);
8474     -#endif
8475     - /* fall through */
8476     + /* fall through */
8477     case 2: /* write, not present */
8478     if (!(vma->vm_flags & VM_WRITE))
8479     goto bad_area;
8480     @@ -572,7 +563,7 @@
8481     case 1: /* read, present */
8482     goto bad_area;
8483     case 0: /* read, not present */
8484     - if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
8485     + if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
8486     goto bad_area;
8487     }
8488    
8489     @@ -704,7 +695,7 @@
8490     */
8491     out_of_memory:
8492     up_read(&mm->mmap_sem);
8493     - if (tsk->pid == 1) {
8494     + if (is_init(tsk)) {
8495     yield();
8496     down_read(&mm->mmap_sem);
8497     goto survive;
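With the CONFIG_KPROBES guard removed above, the page-fault notifier chain is always compiled in and register/unregister_page_fault_notifier() become GPL exports usable from modules (the 64-bit file below gets the same change). A minimal sketch of a client module, assuming the 2.6.19-era struct die_args and DIE_PAGE_FAULT from <asm/kdebug.h>; the module and its names are illustrative:

#include <linux/module.h>
#include <linux/notifier.h>
#include <asm/kdebug.h>

static int my_fault_event(struct notifier_block *nb, unsigned long val,
                          void *data)
{
        struct die_args *args = data;

        if (val == DIE_PAGE_FAULT)
                printk(KERN_DEBUG "page fault, error code %ld\n", args->err);
        return NOTIFY_DONE;     /* never swallow the fault */
}

static struct notifier_block my_fault_nb = {
        .notifier_call = my_fault_event,
};

static int __init my_init(void)
{
        return register_page_fault_notifier(&my_fault_nb);
}

static void __exit my_exit(void)
{
        unregister_page_fault_notifier(&my_fault_nb);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");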
8498 niro 612 --- a/arch/x86/mm/fault_64-xen.c
8499     +++ b/arch/x86/mm/fault_64-xen.c
8500 niro 609 @@ -40,8 +40,7 @@
8501     #define PF_RSVD (1<<3)
8502     #define PF_INSTR (1<<4)
8503    
8504     -#ifdef CONFIG_KPROBES
8505     -ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
8506     +static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
8507    
8508     /* Hook to register for page fault notifications */
8509     int register_page_fault_notifier(struct notifier_block *nb)
8510     @@ -49,11 +48,13 @@
8511     vmalloc_sync_all();
8512     return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
8513     }
8514     +EXPORT_SYMBOL_GPL(register_page_fault_notifier);
8515    
8516     int unregister_page_fault_notifier(struct notifier_block *nb)
8517     {
8518     return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
8519     }
8520     +EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);
8521    
8522     static inline int notify_page_fault(enum die_val val, const char *str,
8523     struct pt_regs *regs, long err, int trap, int sig)
8524     @@ -67,13 +68,6 @@
8525     };
8526     return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args);
8527     }
8528     -#else
8529     -static inline int notify_page_fault(enum die_val val, const char *str,
8530     - struct pt_regs *regs, long err, int trap, int sig)
8531     -{
8532     - return NOTIFY_DONE;
8533     -}
8534     -#endif
8535    
8536     void bust_spinlocks(int yes)
8537     {
8538     @@ -102,7 +96,7 @@
8539     static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
8540     unsigned long error_code)
8541     {
8542     - unsigned char *instr;
8543     + unsigned char __user *instr;
8544     int scan_more = 1;
8545     int prefetch = 0;
8546     unsigned char *max_instr;
8547     @@ -111,7 +105,7 @@
8548     if (error_code & PF_INSTR)
8549     return 0;
8550    
8551     - instr = (unsigned char *)convert_rip_to_linear(current, regs);
8552     + instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
8553     max_instr = instr + 15;
8554    
8555     if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
8556     @@ -122,7 +116,7 @@
8557     unsigned char instr_hi;
8558     unsigned char instr_lo;
8559    
8560     - if (__get_user(opcode, instr))
8561     + if (__get_user(opcode, (char __user *)instr))
8562     break;
8563    
8564     instr_hi = opcode & 0xf0;
8565     @@ -160,7 +154,7 @@
8566     case 0x00:
8567     /* Prefetch instruction is 0x0F0D or 0x0F18 */
8568     scan_more = 0;
8569     - if (__get_user(opcode, instr))
8570     + if (__get_user(opcode, (char __user *)instr))
8571     break;
8572     prefetch = (instr_lo == 0xF) &&
8573     (opcode == 0x0D || opcode == 0x18);
8574     @@ -176,7 +170,7 @@
8575     static int bad_address(void *p)
8576     {
8577     unsigned long dummy;
8578     - return __get_user(dummy, (unsigned long *)p);
8579     + return __get_user(dummy, (unsigned long __user *)p);
8580     }
8581    
8582     void dump_pagetable(unsigned long address)
8583     @@ -248,7 +242,7 @@
8584    
8585     int unhandled_signal(struct task_struct *tsk, int sig)
8586     {
8587     - if (tsk->pid == 1)
8588     + if (is_init(tsk))
8589     return 1;
8590     if (tsk->ptrace & PT_PTRACED)
8591     return 0;
8592     @@ -300,7 +294,7 @@
8593     if (pgd_none(*pgd))
8594     set_pgd(pgd, *pgd_ref);
8595     else
8596     - BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref));
8597     + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
8598    
8599     /* Below here mismatches are bugs because these lower tables
8600     are shared */
8601     @@ -309,7 +303,7 @@
8602     pud_ref = pud_offset(pgd_ref, address);
8603     if (pud_none(*pud_ref))
8604     return -1;
8605     - if (pud_none(*pud) || pud_page(*pud) != pud_page(*pud_ref))
8606     + if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
8607     BUG();
8608     pmd = pmd_offset(pud, address);
8609     pmd_ref = pmd_offset(pud_ref, address);
8610     @@ -531,7 +525,7 @@
8611     case PF_PROT: /* read, present */
8612     goto bad_area;
8613     case 0: /* read, not present */
8614     - if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
8615     + if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
8616     goto bad_area;
8617     }
8618    
8619     @@ -647,7 +641,7 @@
8620     */
8621     out_of_memory:
8622     up_read(&mm->mmap_sem);
8623     - if (current->pid == 1) {
8624     + if (is_init(current)) {
8625     yield();
8626     goto again;
8627     }
8628     @@ -702,7 +696,7 @@
8629     if (pgd_none(*pgd))
8630     set_pgd(pgd, *pgd_ref);
8631     else
8632     - BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref));
8633     + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
8634     }
8635     spin_unlock(&pgd_lock);
8636     set_bit(pgd_index(address), insync);
8637 niro 612 --- a/arch/x86/mm/highmem_32-xen.c
8638     +++ b/arch/x86/mm/highmem_32-xen.c
8639 niro 609 @@ -38,11 +38,9 @@
8640    
8641     idx = type + KM_TYPE_NR*smp_processor_id();
8642     vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
8643     -#ifdef CONFIG_DEBUG_HIGHMEM
8644     if (!pte_none(*(kmap_pte-idx)))
8645     BUG();
8646     -#endif
8647     - set_pte_at_sync(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot));
8648     + set_pte_at(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot));
8649    
8650     return (void*) vaddr;
8651     }
8652     @@ -62,36 +60,26 @@
8653    
8654     void kunmap_atomic(void *kvaddr, enum km_type type)
8655     {
8656     -#if defined(CONFIG_DEBUG_HIGHMEM) || defined(CONFIG_XEN)
8657     unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
8658     enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
8659    
8660     - if (vaddr < FIXADDR_START) { // FIXME
8661     +#ifdef CONFIG_DEBUG_HIGHMEM
8662     + if (vaddr >= PAGE_OFFSET && vaddr < (unsigned long)high_memory) {
8663     dec_preempt_count();
8664     preempt_check_resched();
8665     return;
8666     }
8667     -#endif
8668    
8669     -#if defined(CONFIG_DEBUG_HIGHMEM)
8670     if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx))
8671     BUG();
8672     -
8673     - /*
8674     - * force other mappings to Oops if they'll try to access
8675     - * this pte without first remap it
8676     - */
8677     - pte_clear(&init_mm, vaddr, kmap_pte-idx);
8678     - __flush_tlb_one(vaddr);
8679     -#elif defined(CONFIG_XEN)
8680     +#endif
8681     /*
8682     - * We must ensure there are no dangling pagetable references when
8683     - * returning memory to Xen (decrease_reservation).
8684     - * XXX TODO: We could make this faster by only zapping when
8685     - * kmap_flush_unused is called but that is trickier and more invasive.
8686     + * Force other mappings to Oops if they'll try to access this pte
8687     + * without first remapping it. Keeping stale mappings around is a bad idea
8688     + * also, in case the page changes cacheability attributes or becomes
8689     + * a protected page in a hypervisor.
8690     */
8691     - pte_clear(&init_mm, vaddr, kmap_pte-idx);
8692     -#endif
8693     + kpte_clear_flush(kmap_pte-idx, vaddr);
8694    
8695     dec_preempt_count();
8696     preempt_check_resched();
8697     @@ -110,7 +98,6 @@
8698     idx = type + KM_TYPE_NR*smp_processor_id();
8699     vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
8700     set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot));
8701     - __flush_tlb_one(vaddr);
8702    
8703     return (void*) vaddr;
8704     }
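The rework above routes every kunmap_atomic() through kpte_clear_flush(), instead of zapping the slot only under CONFIG_DEBUG_HIGHMEM or CONFIG_XEN, so a stale kmap PTE can no longer dangle when a page is handed back to the hypervisor. Callers do not change; a minimal sketch of the usual pairing (the helper name is illustrative):

#include <linux/highmem.h>
#include <linux/string.h>

static void zero_highpage(struct page *page)
{
        void *vaddr = kmap_atomic(page, KM_USER0);  /* map into a fixmap slot */

        memset(vaddr, 0, PAGE_SIZE);
        kunmap_atomic(vaddr, KM_USER0);  /* slot PTE now cleared and flushed */
}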
8705 niro 612 --- a/arch/x86/mm/hypervisor.c
8706     +++ b/arch/x86/mm/hypervisor.c
8707 niro 609 @@ -569,7 +569,8 @@
8708     #define MAX_BATCHED_FULL_PTES 32
8709    
8710     int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
8711     - unsigned long addr, unsigned long end, pgprot_t newprot)
8712     + unsigned long addr, unsigned long end, pgprot_t newprot,
8713     + int dirty_accountable)
8714     {
8715     int rc = 0, i = 0;
8716     mmu_update_t u[MAX_BATCHED_FULL_PTES];
8717     @@ -582,10 +583,14 @@
8718     pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
8719     do {
8720     if (pte_present(*pte)) {
8721     + pte_t ptent = pte_modify(*pte, newprot);
8722     +
8723     + if (dirty_accountable && pte_dirty(ptent))
8724     + ptent = pte_mkwrite(ptent);
8725     u[i].ptr = (__pmd_val(*pmd) & PHYSICAL_PAGE_MASK)
8726     | ((unsigned long)pte & ~PAGE_MASK)
8727     | MMU_PT_UPDATE_PRESERVE_AD;
8728     - u[i].val = __pte_val(pte_modify(*pte, newprot));
8729     + u[i].val = __pte_val(ptent);
8730     if (++i == MAX_BATCHED_FULL_PTES) {
8731     if ((rc = HYPERVISOR_mmu_update(
8732     &u[0], i, NULL, DOMID_SELF)) != 0)
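xen_change_pte_range() above batches up to MAX_BATCHED_FULL_PTES protection changes into a single mmu_update hypercall, and now computes the new PTE value once so the dirty_accountable case can re-add write permission to already-dirty entries. The batching idiom in isolation, with locking and partial-batch flushing elided; the helper is a sketch, not part of the patch:

#include <asm/hypervisor.h>

static int batch_set_ptes(pte_t *ptep[], pte_t val[], int n)
{
        mmu_update_t u[MAX_BATCHED_FULL_PTES];
        int i;

        BUG_ON(n > MAX_BATCHED_FULL_PTES);
        for (i = 0; i < n; i++) {
                /* machine address of the PTE slot, accessed/dirty preserved */
                u[i].ptr = virt_to_machine(ptep[i]) | MMU_PT_UPDATE_PRESERVE_AD;
                u[i].val = __pte_val(val[i]);
        }
        /* one hypercall applies the whole batch */
        return HYPERVISOR_mmu_update(u, n, NULL, DOMID_SELF);
}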
8733 niro 612 --- a/arch/x86/mm/init_32-xen.c
8734     +++ b/arch/x86/mm/init_32-xen.c
8735 niro 609 @@ -464,16 +464,22 @@
8736     * on Enable
8737     * off Disable
8738     */
8739     -void __init noexec_setup(const char *str)
8740     +static int __init noexec_setup(char *str)
8741     {
8742     - if (!strncmp(str, "on",2) && cpu_has_nx) {
8743     - __supported_pte_mask |= _PAGE_NX;
8744     - disable_nx = 0;
8745     - } else if (!strncmp(str,"off",3)) {
8746     + if (!str || !strcmp(str, "on")) {
8747     + if (cpu_has_nx) {
8748     + __supported_pte_mask |= _PAGE_NX;
8749     + disable_nx = 0;
8750     + }
8751     + } else if (!strcmp(str,"off")) {
8752     disable_nx = 1;
8753     __supported_pte_mask &= ~_PAGE_NX;
8754     - }
8755     + } else
8756     + return -EINVAL;
8757     +
8758     + return 0;
8759     }
8760     +early_param("noexec", noexec_setup);
8761    
8762     int nx_enabled = 0;
8763     #ifdef CONFIG_X86_PAE
8764     @@ -516,6 +522,7 @@
8765     pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
8766     else
8767     pte->pte_high |= 1 << (_PAGE_BIT_NX - 32);
8768     + pte_update_defer(&init_mm, vaddr, pte);
8769     __flush_tlb_all();
8770     out:
8771     return ret;
8772     @@ -598,18 +605,6 @@
8773     }
8774     }
8775    
8776     -static void __init set_max_mapnr_init(void)
8777     -{
8778     -#ifdef CONFIG_HIGHMEM
8779     - num_physpages = highend_pfn;
8780     -#else
8781     - num_physpages = max_low_pfn;
8782     -#endif
8783     -#ifdef CONFIG_FLATMEM
8784     - max_mapnr = num_physpages;
8785     -#endif
8786     -}
8787     -
8788     static struct kcore_list kcore_mem, kcore_vmalloc;
8789    
8790     void __init mem_init(void)
8791     @@ -630,8 +625,7 @@
8792     #endif
8793    
8794     #ifdef CONFIG_FLATMEM
8795     - if (!mem_map)
8796     - BUG();
8797     + BUG_ON(!mem_map);
8798     #endif
8799    
8800     bad_ppro = ppro_with_ram_bug();
8801     @@ -646,17 +640,6 @@
8802     }
8803     #endif
8804    
8805     - set_max_mapnr_init();
8806     -
8807     -#ifdef CONFIG_HIGHMEM
8808     - high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
8809     -#else
8810     - high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
8811     -#endif
8812     - printk("vmalloc area: %lx-%lx, maxmem %lx\n",
8813     - VMALLOC_START,VMALLOC_END,MAXMEM);
8814     - BUG_ON(VMALLOC_START > VMALLOC_END);
8815     -
8816     /* this will put all low memory onto the freelists */
8817     totalram_pages += free_all_bootmem();
8818     /* XEN: init and count low-mem pages outside initial allocation. */
8819     @@ -694,6 +677,48 @@
8820     (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
8821     );
8822    
8823     +#if 1 /* double-sanity-check paranoia */
8824     + printk("virtual kernel memory layout:\n"
8825     + " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
8826     +#ifdef CONFIG_HIGHMEM
8827     + " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
8828     +#endif
8829     + " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n"
8830     + " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n"
8831     + " .init : 0x%08lx - 0x%08lx (%4ld kB)\n"
8832     + " .data : 0x%08lx - 0x%08lx (%4ld kB)\n"
8833     + " .text : 0x%08lx - 0x%08lx (%4ld kB)\n",
8834     + FIXADDR_START, FIXADDR_TOP,
8835     + (FIXADDR_TOP - FIXADDR_START) >> 10,
8836     +
8837     +#ifdef CONFIG_HIGHMEM
8838     + PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
8839     + (LAST_PKMAP*PAGE_SIZE) >> 10,
8840     +#endif
8841     +
8842     + VMALLOC_START, VMALLOC_END,
8843     + (VMALLOC_END - VMALLOC_START) >> 20,
8844     +
8845     + (unsigned long)__va(0), (unsigned long)high_memory,
8846     + ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
8847     +
8848     + (unsigned long)&__init_begin, (unsigned long)&__init_end,
8849     + ((unsigned long)&__init_end - (unsigned long)&__init_begin) >> 10,
8850     +
8851     + (unsigned long)&_etext, (unsigned long)&_edata,
8852     + ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
8853     +
8854     + (unsigned long)&_text, (unsigned long)&_etext,
8855     + ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
8856     +
8857     +#ifdef CONFIG_HIGHMEM
8858     + BUG_ON(PKMAP_BASE+LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
8859     + BUG_ON(VMALLOC_END > PKMAP_BASE);
8860     +#endif
8861     + BUG_ON(VMALLOC_START > VMALLOC_END);
8862     + BUG_ON((unsigned long)high_memory > VMALLOC_START);
8863     +#endif /* double-sanity-check paranoia */
8864     +
8865     #ifdef CONFIG_X86_PAE
8866     if (!cpu_has_pae)
8867     panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!");
8868     @@ -724,7 +749,7 @@
8869     int arch_add_memory(int nid, u64 start, u64 size)
8870     {
8871     struct pglist_data *pgdata = &contig_page_data;
8872     - struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1;
8873     + struct zone *zone = pgdata->node_zones + ZONE_HIGHMEM;
8874     unsigned long start_pfn = start >> PAGE_SHIFT;
8875     unsigned long nr_pages = size >> PAGE_SHIFT;
8876    
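The noexec_setup() conversion at the top of this file's hunks moves the option to the early_param() convention: the handler returns 0 on success or -EINVAL on a malformed argument, and a NULL argument stands for a bare "noexec" (treated like "noexec=on"), while "noexec=off" still clears _PAGE_NX support. The same handler shape generically; "myopt" is made up:

#include <linux/init.h>
#include <linux/string.h>
#include <linux/errno.h>

static int myopt_enabled;

static int __init myopt_setup(char *str)
{
        if (!str || !strcmp(str, "on"))         /* "myopt" or "myopt=on" */
                myopt_enabled = 1;
        else if (!strcmp(str, "off"))
                myopt_enabled = 0;
        else
                return -EINVAL;                 /* reject junk */
        return 0;
}
early_param("myopt", myopt_setup);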
8877 niro 612 --- a/arch/x86/mm/init_64-xen.c
8878     +++ b/arch/x86/mm/init_64-xen.c
8879 niro 609 @@ -61,8 +61,6 @@
8880    
8881     extern unsigned long *contiguous_bitmap;
8882    
8883     -static unsigned long dma_reserve __initdata;
8884     -
8885     DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
8886     extern unsigned long start_pfn;
8887    
8888     @@ -416,7 +414,6 @@
8889    
8890     /* actually usually some more */
8891     if (size >= LARGE_PAGE_SIZE) {
8892     - printk("SMBIOS area too long %lu\n", size);
8893     return NULL;
8894     }
8895     set_pmd(temp_mappings[0].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
8896     @@ -438,13 +435,15 @@
8897     #endif
8898    
8899     static void __meminit
8900     -phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end)
8901     +phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
8902     {
8903     - int i, k;
8904     + int i = pmd_index(address);
8905    
8906     - for (i = 0; i < PTRS_PER_PMD; pmd++, i++) {
8907     + for (; i < PTRS_PER_PMD; i++) {
8908     unsigned long pte_phys;
8909     + pmd_t *pmd = pmd_page + i;
8910     pte_t *pte, *pte_save;
8911     + int k;
8912    
8913     if (address >= end) {
8914     if (!after_bootmem)
8915     @@ -452,6 +451,12 @@
8916     set_pmd(pmd, __pmd(0));
8917     break;
8918     }
8919     +
8920     + if (__pmd_val(*pmd)) {
8921     + address += PMD_SIZE;
8922     + continue;
8923     + }
8924     +
8925     pte = alloc_static_page(&pte_phys);
8926     pte_save = pte;
8927     for (k = 0; k < PTRS_PER_PTE; pte++, k++, address += PTE_SIZE) {
8928     @@ -474,40 +479,35 @@
8929     static void __meminit
8930     phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
8931     {
8932     - pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address));
8933     -
8934     - if (pmd_none(*pmd)) {
8935     - spin_lock(&init_mm.page_table_lock);
8936     - phys_pmd_init(pmd, address, end);
8937     - spin_unlock(&init_mm.page_table_lock);
8938     - __flush_tlb_all();
8939     - }
8940     + pmd_t *pmd = pmd_offset(pud,0);
8941     + spin_lock(&init_mm.page_table_lock);
8942     + phys_pmd_init(pmd, address, end);
8943     + spin_unlock(&init_mm.page_table_lock);
8944     + __flush_tlb_all();
8945     }
8946    
8947     -static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
8948     +static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
8949     {
8950     - long i = pud_index(address);
8951     -
8952     - pud = pud + i;
8953     -
8954     - if (after_bootmem && pud_val(*pud)) {
8955     - phys_pmd_update(pud, address, end);
8956     - return;
8957     - }
8958     + int i = pud_index(addr);
8959    
8960     - for (; i < PTRS_PER_PUD; pud++, i++) {
8961     - unsigned long paddr, pmd_phys;
8962     + for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) {
8963     + unsigned long pmd_phys;
8964     + pud_t *pud = pud_page + pud_index(addr);
8965     pmd_t *pmd;
8966    
8967     - paddr = (address & PGDIR_MASK) + i*PUD_SIZE;
8968     - if (paddr >= end)
8969     + if (addr >= end)
8970     break;
8971    
8972     + if (__pud_val(*pud)) {
8973     + phys_pmd_update(pud, addr, end);
8974     + continue;
8975     + }
8976     +
8977     pmd = alloc_static_page(&pmd_phys);
8978     early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
8979     spin_lock(&init_mm.page_table_lock);
8980     set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
8981     - phys_pmd_init(pmd, paddr, end);
8982     + phys_pmd_init(pmd, addr, end);
8983     spin_unlock(&init_mm.page_table_lock);
8984     }
8985     __flush_tlb();
8986     @@ -771,69 +771,18 @@
8987     #endif
8988     }
8989    
8990     -/* Compute zone sizes for the DMA and DMA32 zones in a node. */
8991     -__init void
8992     -size_zones(unsigned long *z, unsigned long *h,
8993     - unsigned long start_pfn, unsigned long end_pfn)
8994     -{
8995     - int i;
8996     - unsigned long w;
8997     -
8998     - for (i = 0; i < MAX_NR_ZONES; i++)
8999     - z[i] = 0;
9000     -
9001     - if (start_pfn < MAX_DMA_PFN)
9002     - z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
9003     - if (start_pfn < MAX_DMA32_PFN) {
9004     - unsigned long dma32_pfn = MAX_DMA32_PFN;
9005     - if (dma32_pfn > end_pfn)
9006     - dma32_pfn = end_pfn;
9007     - z[ZONE_DMA32] = dma32_pfn - start_pfn;
9008     - }
9009     - z[ZONE_NORMAL] = end_pfn - start_pfn;
9010     -
9011     - /* Remove lower zones from higher ones. */
9012     - w = 0;
9013     - for (i = 0; i < MAX_NR_ZONES; i++) {
9014     - if (z[i])
9015     - z[i] -= w;
9016     - w += z[i];
9017     - }
9018     -
9019     - /* Compute holes */
9020     - w = start_pfn;
9021     - for (i = 0; i < MAX_NR_ZONES; i++) {
9022     - unsigned long s = w;
9023     - w += z[i];
9024     - h[i] = e820_hole_size(s, w);
9025     - }
9026     -
9027     - /* Add the space pace needed for mem_map to the holes too. */
9028     - for (i = 0; i < MAX_NR_ZONES; i++)
9029     - h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE;
9030     -
9031     - /* The 16MB DMA zone has the kernel and other misc mappings.
9032     - Account them too */
9033     - if (h[ZONE_DMA]) {
9034     - h[ZONE_DMA] += dma_reserve;
9035     - if (h[ZONE_DMA] >= z[ZONE_DMA]) {
9036     - printk(KERN_WARNING
9037     - "Kernel too large and filling up ZONE_DMA?\n");
9038     - h[ZONE_DMA] = z[ZONE_DMA];
9039     - }
9040     - }
9041     -}
9042     -
9043     #ifndef CONFIG_NUMA
9044     void __init paging_init(void)
9045     {
9046     - unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];
9047     + unsigned long max_zone_pfns[MAX_NR_ZONES];
9048     + memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
9049     + max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
9050     + max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
9051     + max_zone_pfns[ZONE_NORMAL] = end_pfn;
9052    
9053     memory_present(0, 0, end_pfn);
9054     sparse_init();
9055     - size_zones(zones, holes, 0, end_pfn);
9056     - free_area_init_node(0, NODE_DATA(0), zones,
9057     - __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
9058     + free_area_init_nodes(max_zone_pfns);
9059    
9060     init_mm.context.pinned = 1;
9061     }
9062     @@ -887,36 +836,23 @@
9063    
9064     #ifdef CONFIG_MEMORY_HOTPLUG
9065     /*
9066     - * XXX: memory_add_physaddr_to_nid() is to find node id from physical address
9067     - * via probe interface of sysfs. If acpi notifies hot-add event, then it
9068     - * can tell node id by searching dsdt. But, probe interface doesn't have
9069     - * node id. So, return 0 as node id at this time.
9070     - */
9071     -#ifdef CONFIG_NUMA
9072     -int memory_add_physaddr_to_nid(u64 start)
9073     -{
9074     - return 0;
9075     -}
9076     -#endif
9077     -
9078     -/*
9079     * Memory is added always to NORMAL zone. This means you will never get
9080     * additional DMA/DMA32 memory.
9081     */
9082     int arch_add_memory(int nid, u64 start, u64 size)
9083     {
9084     struct pglist_data *pgdat = NODE_DATA(nid);
9085     - struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-2;
9086     + struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
9087     unsigned long start_pfn = start >> PAGE_SHIFT;
9088     unsigned long nr_pages = size >> PAGE_SHIFT;
9089     int ret;
9090    
9091     + init_memory_mapping(start, (start + size -1));
9092     +
9093     ret = __add_pages(zone, start_pfn, nr_pages);
9094     if (ret)
9095     goto error;
9096    
9097     - init_memory_mapping(start, (start + size -1));
9098     -
9099     return ret;
9100     error:
9101     printk("%s: Problem encountered in __add_pages!\n", __func__);
9102     @@ -930,7 +866,17 @@
9103     }
9104     EXPORT_SYMBOL_GPL(remove_memory);
9105    
9106     -#else /* CONFIG_MEMORY_HOTPLUG */
9107     +#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
9108     +int memory_add_physaddr_to_nid(u64 start)
9109     +{
9110     + return 0;
9111     +}
9112     +EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
9113     +#endif
9114     +
9115     +#endif /* CONFIG_MEMORY_HOTPLUG */
9116     +
9117     +#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
9118     /*
9119     * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
9120     * just online the pages.
9121     @@ -956,7 +902,7 @@
9122     }
9123     return err;
9124     }
9125     -#endif /* CONFIG_MEMORY_HOTPLUG */
9126     +#endif
9127    
9128     static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
9129     kcore_vsyscall;
9130     @@ -973,12 +919,6 @@
9131    
9132     pci_iommu_alloc();
9133    
9134     - /* How many end-of-memory variables you have, grandma! */
9135     - max_low_pfn = end_pfn;
9136     - max_pfn = end_pfn;
9137     - num_physpages = end_pfn;
9138     - high_memory = (void *) __va(end_pfn * PAGE_SIZE);
9139     -
9140     /* clear the zero-page */
9141     memset(empty_zero_page, 0, PAGE_SIZE);
9142    
9143     @@ -996,7 +936,8 @@
9144     init_page_count(pfn_to_page(pfn));
9145     totalram_pages++;
9146     }
9147     - reservedpages = end_pfn - totalram_pages - e820_hole_size(0, end_pfn);
9148     + reservedpages = end_pfn - totalram_pages -
9149     + absent_pages_in_range(0, end_pfn);
9150    
9151     after_bootmem = 1;
9152    
9153     @@ -1103,15 +1044,34 @@
9154    
9155     void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
9156     {
9157     - /* Should check here against the e820 map to avoid double free */
9158     #ifdef CONFIG_NUMA
9159     int nid = phys_to_nid(phys);
9160     +#endif
9161     + unsigned long pfn = phys >> PAGE_SHIFT;
9162     + if (pfn >= end_pfn) {
9163     + /* This can happen with kdump kernels when accessing firmware
9164     + tables. */
9165     + if (pfn < end_pfn_map)
9166     + return;
9167     + printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
9168     + phys, len);
9169     + return;
9170     + }
9171     +
9172     + /* Should check here against the e820 map to avoid double free */
9173     +#ifdef CONFIG_NUMA
9174     reserve_bootmem_node(NODE_DATA(nid), phys, len);
9175     #else
9176     reserve_bootmem(phys, len);
9177     #endif
9178     - if (phys+len <= MAX_DMA_PFN*PAGE_SIZE)
9179     +#ifndef CONFIG_XEN
9180     + if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
9181     + static unsigned long dma_reserve __initdata;
9182     +
9183     dma_reserve += len / PAGE_SIZE;
9184     + set_dma_reserve(dma_reserve);
9185     + }
9186     +#endif
9187     }
9188    
9189     int kern_addr_valid(unsigned long addr)
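The paging_init() rewrite above retires the hand-rolled size_zones()/holes arithmetic: each zone is now described only by its highest PFN and free_area_init_nodes() derives per-node sizes and holes itself (absent_pages_in_range() likewise replaces e820_hole_size() in mem_init()). The same pattern with example figures for a 4GB machine; the wrapper is illustrative:

#include <linux/mm.h>
#include <linux/string.h>

static void __init example_zone_setup(unsigned long end_pfn)
{
        unsigned long max_zone_pfns[MAX_NR_ZONES];

        memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
        max_zone_pfns[ZONE_DMA]    = MAX_DMA_PFN;    /* 16MB boundary, pfn 0x1000   */
        max_zone_pfns[ZONE_DMA32]  = MAX_DMA32_PFN;  /* 4GB boundary, pfn 0x100000  */
        max_zone_pfns[ZONE_NORMAL] = end_pfn;        /* top of RAM                  */
        free_area_init_nodes(max_zone_pfns);         /* sizes and holes derived     */
}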
9190 niro 612 --- a/arch/x86/mm/ioremap_32-xen.c
9191     +++ b/arch/x86/mm/ioremap_32-xen.c
9192 niro 609 @@ -12,7 +12,7 @@
9193     #include <linux/init.h>
9194     #include <linux/slab.h>
9195     #include <linux/module.h>
9196     -#include <asm/io.h>
9197     +#include <linux/io.h>
9198     #include <asm/fixmap.h>
9199     #include <asm/cacheflush.h>
9200     #include <asm/tlbflush.h>
9201     @@ -118,7 +118,7 @@
9202     if (domid == DOMID_SELF)
9203     return -EINVAL;
9204    
9205     - vma->vm_flags |= VM_IO | VM_RESERVED;
9206     + vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
9207    
9208     vma->vm_mm->context.has_foreign_mappings = 1;
9209    
9210     @@ -203,6 +203,7 @@
9211     void __iomem * addr;
9212     struct vm_struct * area;
9213     unsigned long offset, last_addr;
9214     + pgprot_t prot;
9215     domid_t domid = DOMID_IO;
9216    
9217     /* Don't allow wraparound or zero size */
9218     @@ -234,6 +235,8 @@
9219     domid = DOMID_SELF;
9220     }
9221    
9222     + prot = __pgprot(_KERNPG_TABLE | flags);
9223     +
9224     /*
9225     * Mappings have to be page-aligned
9226     */
9227     @@ -249,10 +252,9 @@
9228     return NULL;
9229     area->phys_addr = phys_addr;
9230     addr = (void __iomem *) area->addr;
9231     - flags |= _KERNPG_TABLE;
9232     if (__direct_remap_pfn_range(&init_mm, (unsigned long)addr,
9233     phys_addr>>PAGE_SHIFT,
9234     - size, __pgprot(flags), domid)) {
9235     + size, prot, domid)) {
9236     vunmap((void __force *) addr);
9237     return NULL;
9238     }
9239 niro 612 --- a/arch/x86/mm/pageattr_64-xen.c
9240     +++ b/arch/x86/mm/pageattr_64-xen.c
9241 niro 609 @@ -371,8 +371,8 @@
9242     BUG_ON(pud_none(*pud));
9243     pmd = pmd_offset(pud, address);
9244     BUG_ON(__pmd_val(*pmd) & _PAGE_PSE);
9245     - pgprot_val(ref_prot) |= _PAGE_PSE;
9246     large_pte = mk_pte_phys(__pa(address) & LARGE_PAGE_MASK, ref_prot);
9247     + large_pte = pte_mkhuge(large_pte);
9248     set_pte((pte_t *)pmd, large_pte);
9249     }
9250    
9251     @@ -382,32 +382,28 @@
9252     {
9253     pte_t *kpte;
9254     struct page *kpte_page;
9255     - unsigned kpte_flags;
9256     pgprot_t ref_prot2;
9257     kpte = lookup_address(address);
9258     if (!kpte) return 0;
9259     kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
9260     - kpte_flags = pte_val(*kpte);
9261     if (pgprot_val(prot) != pgprot_val(ref_prot)) {
9262     - if ((kpte_flags & _PAGE_PSE) == 0) {
9263     + if (!pte_huge(*kpte)) {
9264     set_pte(kpte, pfn_pte(pfn, prot));
9265     } else {
9266     /*
9267     * split_large_page will take the reference for this
9268     * change_page_attr on the split page.
9269     */
9270     -
9271     struct page *split;
9272     - ref_prot2 = __pgprot(pgprot_val(pte_pgprot(*lookup_address(address))) & ~(1<<_PAGE_BIT_PSE));
9273     -
9274     + ref_prot2 = pte_pgprot(pte_clrhuge(*kpte));
9275     split = split_large_page(address, prot, ref_prot2);
9276     if (!split)
9277     return -ENOMEM;
9278     - set_pte(kpte,mk_pte(split, ref_prot2));
9279     + set_pte(kpte, mk_pte(split, ref_prot2));
9280     kpte_page = split;
9281     - }
9282     + }
9283     page_private(kpte_page)++;
9284     - } else if ((kpte_flags & _PAGE_PSE) == 0) {
9285     + } else if (!pte_huge(*kpte)) {
9286     set_pte(kpte, pfn_pte(pfn, ref_prot));
9287     BUG_ON(page_private(kpte_page) == 0);
9288     page_private(kpte_page)--;
9289     @@ -464,10 +460,12 @@
9290     * lowmem */
9291     if (__pa(address) < KERNEL_TEXT_SIZE) {
9292     unsigned long addr2;
9293     - pgprot_t prot2 = prot;
9294     + pgprot_t prot2;
9295     addr2 = __START_KERNEL_map + __pa(address);
9296     - pgprot_val(prot2) &= ~_PAGE_NX;
9297     - err = __change_page_attr(addr2, pfn, prot2, PAGE_KERNEL_EXEC);
9298     + /* Make sure the kernel mappings stay executable */
9299     + prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot)));
9300     + err = __change_page_attr(addr2, pfn, prot2,
9301     + PAGE_KERNEL_EXEC);
9302     }
9303     }
9304     up_write(&init_mm.mmap_sem);
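The hunks above replace raw _PAGE_PSE bit twiddling with the pte_huge()/pte_mkhuge()/pte_clrhuge() accessors, keeping large-page handling behind the pgtable API. The equivalences in isolation (helper names are made up):

#include <asm/pgtable.h>

static pte_t make_large_pte(unsigned long pfn, pgprot_t prot)
{
        /* was: pgprot_val(prot) |= _PAGE_PSE */
        return pte_mkhuge(pfn_pte(pfn, prot));
}

static pgprot_t base_prot_of(pte_t pte)
{
        /* was: __pgprot(pgprot_val(pte_pgprot(pte)) & ~(1 << _PAGE_BIT_PSE)) */
        return pte_pgprot(pte_clrhuge(pte));
}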
9305 niro 612 --- a/arch/x86/mm/pgtable_32-xen.c
9306     +++ b/arch/x86/mm/pgtable_32-xen.c
9307 niro 609 @@ -68,7 +68,9 @@
9308     printk(KERN_INFO "%lu pages writeback\n",
9309     global_page_state(NR_WRITEBACK));
9310     printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
9311     - printk(KERN_INFO "%lu pages slab\n", global_page_state(NR_SLAB));
9312     + printk(KERN_INFO "%lu pages slab\n",
9313     + global_page_state(NR_SLAB_RECLAIMABLE) +
9314     + global_page_state(NR_SLAB_UNRECLAIMABLE));
9315     printk(KERN_INFO "%lu pages pagetables\n",
9316     global_page_state(NR_PAGETABLE));
9317     }
9318     @@ -108,18 +110,11 @@
9319     __flush_tlb_one(vaddr);
9320     }
9321    
9322     -static int nr_fixmaps = 0;
9323     +static int fixmaps;
9324     unsigned long hypervisor_virt_start = HYPERVISOR_VIRT_START;
9325     -unsigned long __FIXADDR_TOP = (HYPERVISOR_VIRT_START - 2 * PAGE_SIZE);
9326     +unsigned long __FIXADDR_TOP = (HYPERVISOR_VIRT_START - PAGE_SIZE);
9327     EXPORT_SYMBOL(__FIXADDR_TOP);
9328    
9329     -void __init set_fixaddr_top(unsigned long top)
9330     -{
9331     - BUG_ON(nr_fixmaps > 0);
9332     - hypervisor_virt_start = top;
9333     - __FIXADDR_TOP = hypervisor_virt_start - 2 * PAGE_SIZE;
9334     -}
9335     -
9336     void __set_fixmap (enum fixed_addresses idx, maddr_t phys, pgprot_t flags)
9337     {
9338     unsigned long address = __fix_to_virt(idx);
9339     @@ -141,7 +136,21 @@
9340     if (HYPERVISOR_update_va_mapping(address, pte,
9341     UVMF_INVLPG|UVMF_ALL))
9342     BUG();
9343     - nr_fixmaps++;
9344     + fixmaps++;
9345     +}
9346     +
9347     +/**
9348     + * reserve_top_address - reserves a hole in the top of kernel address space
9349     + * @reserve - size of hole to reserve
9350     + *
9351     + * Can be used to relocate the fixmap area and poke a hole in the top
9352     + * of kernel address space to make room for a hypervisor.
9353     + */
9354     +void __init reserve_top_address(unsigned long reserve)
9355     +{
9356     + BUG_ON(fixmaps > 0);
9357     + __FIXADDR_TOP = -reserve - PAGE_SIZE;
9358     + __VMALLOC_RESERVE += reserve;
9359     }
9360    
9361     pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
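Together with the mach-xen/setup.c hunk earlier, this replaces the Xen-private set_fixaddr_top() with the generic reserve_top_address(): the caller passes the size of the hypervisor hole as 0UL - pp.virt_start, the fixmap top drops one guard page below the hole, and the vmalloc reservation grows by the same amount. A worked example, assuming the common 32-bit layout where Xen owns the top 64MB (virt_start 0xFC000000):

#include <asm/pgtable.h>

void __init example_reserve_hypervisor_hole(void)
{
        unsigned long virt_start = 0xFC000000UL;  /* assumed Xen layout    */
        unsigned long reserve = 0UL - virt_start; /* 0x04000000, i.e. 64MB */

        reserve_top_address(reserve);
        /* now __FIXADDR_TOP == -reserve - PAGE_SIZE == 0xFBFFF000,
         * and __VMALLOC_RESERVE has grown by the same 64MB */
}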
9362 niro 612 --- a/arch/x86/pci/irq-xen.c
9363     +++ b/arch/x86/pci/irq-xen.c
9364 niro 609 @@ -991,10 +991,6 @@
9365     pci_name(bridge), 'A' + pin, irq);
9366     }
9367     if (irq >= 0) {
9368     - if (use_pci_vector() &&
9369     - !platform_legacy_irq(irq))
9370     - irq = IO_APIC_VECTOR(irq);
9371     -
9372     printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
9373     pci_name(dev), 'A' + pin, irq);
9374     dev->irq = irq;
9375     @@ -1155,10 +1151,6 @@
9376     }
9377     dev = temp_dev;
9378     if (irq >= 0) {
9379     -#ifdef CONFIG_PCI_MSI
9380     - if (!platform_legacy_irq(irq))
9381     - irq = IO_APIC_VECTOR(irq);
9382     -#endif
9383     printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
9384     pci_name(dev), 'A' + pin, irq);
9385     dev->irq = irq;
9386     @@ -1179,33 +1171,3 @@
9387     }
9388     return 0;
9389     }
9390     -
9391     -int pci_vector_resources(int last, int nr_released)
9392     -{
9393     - int count = nr_released;
9394     -
9395     - int next = last;
9396     - int offset = (last % 8);
9397     -
9398     - while (next < FIRST_SYSTEM_VECTOR) {
9399     - next += 8;
9400     -#ifdef CONFIG_X86_64
9401     - if (next == IA32_SYSCALL_VECTOR)
9402     - continue;
9403     -#else
9404     - if (next == SYSCALL_VECTOR)
9405     - continue;
9406     -#endif
9407     - count++;
9408     - if (next >= FIRST_SYSTEM_VECTOR) {
9409     - if (offset%8) {
9410     - next = FIRST_DEVICE_VECTOR + offset;
9411     - offset++;
9412     - continue;
9413     - }
9414     - count--;
9415     - }
9416     - }
9417     -
9418     - return count;
9419     -}
9420 niro 612 --- a/drivers/char/tpm/tpm_xen.c
9421     +++ b/drivers/char/tpm/tpm_xen.c
9422 niro 609 @@ -85,8 +85,7 @@
9423    
9424     /* local function prototypes */
9425     static irqreturn_t tpmif_int(int irq,
9426     - void *tpm_priv,
9427     - struct pt_regs *ptregs);
9428     + void *tpm_priv);
9429     static void tpmif_rx_action(unsigned long unused);
9430     static int tpmif_connect(struct xenbus_device *dev,
9431     struct tpm_private *tp,
9432     @@ -559,7 +558,7 @@
9433     }
9434    
9435    
9436     -static irqreturn_t tpmif_int(int irq, void *tpm_priv, struct pt_regs *ptregs)
9437     +static irqreturn_t tpmif_int(int irq, void *tpm_priv)
9438     {
9439     struct tpm_private *tp = tpm_priv;
9440     unsigned long flags;
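tpmif_int() is the first of many handlers below converted to the 2.6.19 interrupt API: struct pt_regs * disappears from the prototype, and a handler that genuinely needs the interrupted register frame fetches it itself. The general shape of the conversion ("mydev" is made up):

#include <linux/interrupt.h>
#include <asm/irq_regs.h>

static irqreturn_t mydev_int(int irq, void *dev_id)
{
        struct pt_regs *regs = get_irq_regs();  /* only if actually needed */

        (void)regs;
        /* acknowledge the device, kick the bottom half, ... */
        return IRQ_HANDLED;
}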
9441 niro 612 --- a/drivers/pci/Kconfig
9442     +++ b/drivers/pci/Kconfig
9443 niro 609 @@ -45,7 +45,7 @@
9444     config HT_IRQ
9445     bool "Interrupts on hypertransport devices"
9446     default y
9447     - depends on PCI && X86_LOCAL_APIC && X86_IO_APIC
9448     + depends on PCI && X86_LOCAL_APIC && X86_IO_APIC && !XEN
9449     help
9450     This allows native hypertransport devices to use interrupts.
9451    
9452 niro 612 --- a/drivers/xen/Kconfig
9453     +++ b/drivers/xen/Kconfig
9454     @@ -278,6 +278,9 @@
9455     config HAVE_IRQ_IGNORE_UNHANDLED
9456     def_bool y
9457    
9458     +config GENERIC_HARDIRQS_NO__DO_IRQ
9459     + def_bool y
9460     +
9461     config NO_IDLE_HZ
9462     def_bool y
9463    
9464     --- a/drivers/xen/balloon/balloon.c
9465     +++ b/drivers/xen/balloon/balloon.c
9466 niro 609 @@ -84,7 +84,7 @@
9467     /* VM /proc information for memory */
9468     extern unsigned long totalram_pages;
9469    
9470     -#ifndef MODULE
9471     +#if !defined(MODULE) && defined(CONFIG_HIGHMEM)
9472     extern unsigned long totalhigh_pages;
9473     #define inc_totalhigh_pages() (totalhigh_pages++)
9474     #define dec_totalhigh_pages() (totalhigh_pages--)
9475 niro 612 --- a/drivers/xen/blkback/blkback.c
9476     +++ b/drivers/xen/blkback/blkback.c
9477 niro 609 @@ -288,7 +288,7 @@
9478     wake_up(&blkif->wq);
9479     }
9480    
9481     -irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
9482     +irqreturn_t blkif_be_int(int irq, void *dev_id)
9483     {
9484     blkif_notify_work(dev_id);
9485     return IRQ_HANDLED;
9486 niro 612 --- a/drivers/xen/blkback/common.h
9487     +++ b/drivers/xen/blkback/common.h
9488 niro 609 @@ -130,7 +130,7 @@
9489    
9490     void blkif_xenbus_init(void);
9491    
9492     -irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs);
9493     +irqreturn_t blkif_be_int(int irq, void *dev_id);
9494     int blkif_schedule(void *arg);
9495    
9496     int blkback_barrier(struct xenbus_transaction xbt,
9497 niro 612 --- a/drivers/xen/blkfront/blkfront.c
9498     +++ b/drivers/xen/blkfront/blkfront.c
9499 niro 609 @@ -69,7 +69,7 @@
9500    
9501     static void kick_pending_request_queues(struct blkfront_info *);
9502    
9503     -static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs);
9504     +static irqreturn_t blkif_int(int irq, void *dev_id);
9505     static void blkif_restart_queue(void *arg);
9506     static void blkif_recover(struct blkfront_info *);
9507     static void blkif_completion(struct blk_shadow *);
9508     @@ -698,7 +698,7 @@
9509     }
9510    
9511    
9512     -static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs)
9513     +static irqreturn_t blkif_int(int irq, void *dev_id)
9514     {
9515     struct request *req;
9516     blkif_response_t *bret;
9517 niro 612 --- a/drivers/xen/blktap/blktap.c
9518     +++ b/drivers/xen/blktap/blktap.c
9519 niro 609 @@ -1175,7 +1175,7 @@
9520     wake_up(&blkif->wq);
9521     }
9522    
9523     -irqreturn_t tap_blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
9524     +irqreturn_t tap_blkif_be_int(int irq, void *dev_id)
9525     {
9526     blkif_notify_work(dev_id);
9527     return IRQ_HANDLED;
9528 niro 612 --- a/drivers/xen/blktap/common.h
9529     +++ b/drivers/xen/blktap/common.h
9530 niro 609 @@ -112,7 +112,7 @@
9531    
9532     void tap_blkif_xenbus_init(void);
9533    
9534     -irqreturn_t tap_blkif_be_int(int irq, void *dev_id, struct pt_regs *regs);
9535     +irqreturn_t tap_blkif_be_int(int irq, void *dev_id);
9536     int tap_blkif_schedule(void *arg);
9537    
9538     int dom_to_devid(domid_t domid, int xenbus_id, blkif_t *blkif);
9539 niro 612 --- a/drivers/xen/console/console.c
9540     +++ b/drivers/xen/console/console.c
9541 niro 609 @@ -345,7 +345,7 @@
9542     static int xencons_priv_irq;
9543     static char x_char;
9544    
9545     -void xencons_rx(char *buf, unsigned len, struct pt_regs *regs)
9546     +void xencons_rx(char *buf, unsigned len)
9547     {
9548     int i;
9549     unsigned long flags;
9550     @@ -370,8 +370,7 @@
9551     if (time_before(jiffies, sysrq_timeout)) {
9552     spin_unlock_irqrestore(
9553     &xencons_lock, flags);
9554     - handle_sysrq(
9555     - buf[i], regs, xencons_tty);
9556     + handle_sysrq(buf[i], xencons_tty);
9557     spin_lock_irqsave(
9558     &xencons_lock, flags);
9559     continue;
9560     @@ -436,14 +435,13 @@
9561     }
9562    
9563     /* Privileged receive callback and transmit kicker. */
9564     -static irqreturn_t xencons_priv_interrupt(int irq, void *dev_id,
9565     - struct pt_regs *regs)
9566     +static irqreturn_t xencons_priv_interrupt(int irq, void *dev_id)
9567     {
9568     static char rbuf[16];
9569     int l;
9570    
9571     while ((l = HYPERVISOR_console_io(CONSOLEIO_read, 16, rbuf)) > 0)
9572     - xencons_rx(rbuf, l, regs);
9573     + xencons_rx(rbuf, l);
9574    
9575     xencons_tx();
9576    
9577 niro 612 --- a/drivers/xen/console/xencons_ring.c
9578     +++ b/drivers/xen/console/xencons_ring.c
9579 niro 609 @@ -83,7 +83,7 @@
9580     return sent;
9581     }
9582    
9583     -static irqreturn_t handle_input(int irq, void *unused, struct pt_regs *regs)
9584     +static irqreturn_t handle_input(int irq, void *unused)
9585     {
9586     struct xencons_interface *intf = xencons_interface();
9587     XENCONS_RING_IDX cons, prod;
9588     @@ -94,7 +94,7 @@
9589     BUG_ON((prod - cons) > sizeof(intf->in));
9590    
9591     while (cons != prod) {
9592     - xencons_rx(intf->in+MASK_XENCONS_IDX(cons,intf->in), 1, regs);
9593     + xencons_rx(intf->in+MASK_XENCONS_IDX(cons,intf->in), 1);
9594     cons++;
9595     }
9596    
9597 niro 612 --- a/drivers/xen/core/evtchn.c
9598     +++ b/drivers/xen/core/evtchn.c
9599 niro 609 @@ -507,7 +507,7 @@
9600    
9601     int bind_caller_port_to_irqhandler(
9602     unsigned int caller_port,
9603     - irqreturn_t (*handler)(int, void *, struct pt_regs *),
9604     + irq_handler_t handler,
9605     unsigned long irqflags,
9606     const char *devname,
9607     void *dev_id)
9608     @@ -530,7 +530,7 @@
9609    
9610     int bind_listening_port_to_irqhandler(
9611     unsigned int remote_domain,
9612     - irqreturn_t (*handler)(int, void *, struct pt_regs *),
9613     + irq_handler_t handler,
9614     unsigned long irqflags,
9615     const char *devname,
9616     void *dev_id)
9617     @@ -554,7 +554,7 @@
9618     int bind_interdomain_evtchn_to_irqhandler(
9619     unsigned int remote_domain,
9620     unsigned int remote_port,
9621     - irqreturn_t (*handler)(int, void *, struct pt_regs *),
9622     + irq_handler_t handler,
9623     unsigned long irqflags,
9624     const char *devname,
9625     void *dev_id)
9626     @@ -578,7 +578,7 @@
9627     int bind_virq_to_irqhandler(
9628     unsigned int virq,
9629     unsigned int cpu,
9630     - irqreturn_t (*handler)(int, void *, struct pt_regs *),
9631     + irq_handler_t handler,
9632     unsigned long irqflags,
9633     const char *devname,
9634     void *dev_id)
9635     @@ -602,7 +602,7 @@
9636     int bind_ipi_to_irqhandler(
9637     unsigned int ipi,
9638     unsigned int cpu,
9639     - irqreturn_t (*handler)(int, void *, struct pt_regs *),
9640     + irq_handler_t handler,
9641     unsigned long irqflags,
9642     const char *devname,
9643     void *dev_id)
9644     @@ -687,15 +687,7 @@
9645     return 0;
9646     }
9647    
9648     -static void shutdown_dynirq(unsigned int irq)
9649     -{
9650     - int evtchn = evtchn_from_irq(irq);
9651     -
9652     - if (VALID_EVTCHN(evtchn))
9653     - mask_evtchn(evtchn);
9654     -}
9655     -
9656     -static void enable_dynirq(unsigned int irq)
9657     +static void unmask_dynirq(unsigned int irq)
9658     {
9659     int evtchn = evtchn_from_irq(irq);
9660    
9661     @@ -703,7 +695,7 @@
9662     unmask_evtchn(evtchn);
9663     }
9664    
9665     -static void disable_dynirq(unsigned int irq)
9666     +static void mask_dynirq(unsigned int irq)
9667     {
9668     int evtchn = evtchn_from_irq(irq);
9669    
9670     @@ -731,12 +723,12 @@
9671     unmask_evtchn(evtchn);
9672     }
9673    
9674     -static struct hw_interrupt_type dynirq_type = {
9675     - .typename = "Dynamic-irq",
9676     +static struct irq_chip dynirq_chip = {
9677     + .name = "Dynamic-irq",
9678     .startup = startup_dynirq,
9679     - .shutdown = shutdown_dynirq,
9680     - .enable = enable_dynirq,
9681     - .disable = disable_dynirq,
9682     + .mask = mask_dynirq,
9683     + .unmask = unmask_dynirq,
9684     + .mask_ack = ack_dynirq,
9685     .ack = ack_dynirq,
9686     .end = end_dynirq,
9687     #ifdef CONFIG_SMP
9688     @@ -820,12 +812,12 @@
9689     irq_info[irq] = IRQ_UNBOUND;
9690     }
9691    
9692     -static void enable_pirq(unsigned int irq)
9693     +static void unmask_pirq(unsigned int irq)
9694     {
9695     startup_pirq(irq);
9696     }
9697    
9698     -static void disable_pirq(unsigned int irq)
9699     +static void mask_pirq(unsigned int irq)
9700     {
9701     }
9702    
9703     @@ -854,12 +846,14 @@
9704     }
9705     }
9706    
9707     -static struct hw_interrupt_type pirq_type = {
9708     +static struct irq_chip pirq_chip = {
9709     + .name = "Phys-irq",
9710     .typename = "Phys-irq",
9711     .startup = startup_pirq,
9712     .shutdown = shutdown_pirq,
9713     - .enable = enable_pirq,
9714     - .disable = disable_pirq,
9715     + .mask = mask_pirq,
9716     + .unmask = unmask_pirq,
9717     + .mask_ack = ack_pirq,
9718     .ack = ack_pirq,
9719     .end = end_pirq,
9720     #ifdef CONFIG_SMP
9721     @@ -1043,7 +1037,8 @@
9722     irq_desc[dynirq_to_irq(i)].status = IRQ_DISABLED;
9723     irq_desc[dynirq_to_irq(i)].action = NULL;
9724     irq_desc[dynirq_to_irq(i)].depth = 1;
9725     - irq_desc[dynirq_to_irq(i)].chip = &dynirq_type;
9726     + set_irq_chip_and_handler_name(dynirq_to_irq(i), &dynirq_chip,
9727     + handle_level_irq, "level");
9728     }
9729    
9730     /* Phys IRQ space is statically bound (1:1 mapping). Nail refcnts. */
9731     @@ -1059,6 +1054,7 @@
9732     irq_desc[pirq_to_irq(i)].status = IRQ_DISABLED;
9733     irq_desc[pirq_to_irq(i)].action = NULL;
9734     irq_desc[pirq_to_irq(i)].depth = 1;
9735     - irq_desc[pirq_to_irq(i)].chip = &pirq_type;
9736     + set_irq_chip_and_handler_name(pirq_to_irq(i), &pirq_chip,
9737     + handle_level_irq, "level");
9738     }
9739     }
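The conversion above retires struct hw_interrupt_type in favour of struct irq_chip: enable/disable/shutdown give way to mask/unmask/mask_ack, and each IRQ is bound to a generic flow handler at setup time with set_irq_chip_and_handler_name() rather than hardwiring the flow into the type. A minimal sketch of the new-style registration; all names are made up:

#include <linux/irq.h>

static void example_mask(unsigned int irq)   { /* silence the source */ }
static void example_unmask(unsigned int irq) { /* reopen the source */ }

static struct irq_chip example_chip = {
        .name     = "example",
        .mask     = example_mask,
        .unmask   = example_unmask,
        .mask_ack = example_mask,  /* mask and ack in one step */
};

static void __init example_bind(unsigned int irq)
{
        set_irq_chip_and_handler_name(irq, &example_chip,
                                      handle_level_irq, "level");
}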
9740 niro 612 --- a/drivers/xen/core/reboot.c
9741     +++ b/drivers/xen/core/reboot.c
9742 niro 609 @@ -13,6 +13,7 @@
9743    
9744     #ifdef HAVE_XEN_PLATFORM_COMPAT_H
9745     #include <xen/platform-compat.h>
9746     +#undef handle_sysrq
9747     #endif
9748    
9749     MODULE_LICENSE("Dual BSD/GPL");
9750     @@ -203,7 +204,7 @@
9751    
9752     #ifdef CONFIG_MAGIC_SYSRQ
9753     if (sysrq_key != '\0')
9754     - handle_sysrq(sysrq_key, NULL, NULL);
9755     + handle_sysrq(sysrq_key, NULL);
9756     #endif
9757     }
9758    
9759 niro 612 --- a/drivers/xen/core/smpboot.c
9760     +++ b/drivers/xen/core/smpboot.c
9761 niro 609 @@ -25,8 +25,8 @@
9762     #include <xen/cpu_hotplug.h>
9763     #include <xen/xenbus.h>
9764    
9765     -extern irqreturn_t smp_reschedule_interrupt(int, void *, struct pt_regs *);
9766     -extern irqreturn_t smp_call_function_interrupt(int, void *, struct pt_regs *);
9767     +extern irqreturn_t smp_reschedule_interrupt(int, void *);
9768     +extern irqreturn_t smp_call_function_interrupt(int, void *);
9769    
9770     extern int local_setup_timer(unsigned int cpu);
9771     extern void local_teardown_timer(unsigned int cpu);
9772     @@ -66,8 +66,6 @@
9773     #if defined(__i386__)
9774     u8 x86_cpu_to_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = 0xff };
9775     EXPORT_SYMBOL(x86_cpu_to_apicid);
9776     -#elif !defined(CONFIG_X86_IO_APIC)
9777     -unsigned int maxcpus = NR_CPUS;
9778     #endif
9779    
9780     void __init prefill_possible_map(void)
9781 niro 612 --- a/drivers/xen/fbfront/xenfb.c
9782     +++ b/drivers/xen/fbfront/xenfb.c
9783 niro 609 @@ -523,8 +523,7 @@
9784     .fb_set_par = xenfb_set_par,
9785     };
9786    
9787     -static irqreturn_t xenfb_event_handler(int rq, void *dev_id,
9788     - struct pt_regs *regs)
9789     +static irqreturn_t xenfb_event_handler(int rq, void *dev_id)
9790     {
9791     /*
9792     * No in events recognized, simply ignore them all.
9793 niro 612 --- a/drivers/xen/fbfront/xenkbd.c
9794     +++ b/drivers/xen/fbfront/xenkbd.c
9795 niro 609 @@ -46,7 +46,7 @@
9796     * to do that.
9797     */
9798    
9799     -static irqreturn_t input_handler(int rq, void *dev_id, struct pt_regs *regs)
9800     +static irqreturn_t input_handler(int rq, void *dev_id)
9801     {
9802     struct xenkbd_info *info = dev_id;
9803     struct xenkbd_page *page = info->page;
9804 niro 612 --- a/drivers/xen/gntdev/gntdev.c
9805     +++ b/drivers/xen/gntdev/gntdev.c
9806 niro 609 @@ -755,9 +755,6 @@
9807     BUG();
9808     }
9809    
9810     - /* Copy the existing value of the PTE for returning. */
9811     - copy = *ptep;
9812     -
9813     /* Calculate the grant relating to this PTE. */
9814     slot_index = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
9815    
9816     @@ -772,6 +769,10 @@
9817     GNTDEV_INVALID_HANDLE &&
9818     !xen_feature(XENFEAT_auto_translated_physmap)) {
9819     /* NOT USING SHADOW PAGE TABLES. */
9820     +
9821     + /* Copy the existing value of the PTE for returning. */
9822     + copy = *ptep;
9823     +
9824     gnttab_set_unmap_op(&op, virt_to_machine(ptep),
9825     GNTMAP_contains_pte,
9826     private_data->grants[slot_index]
9827     @@ -784,7 +785,7 @@
9828     op.status);
9829     } else {
9830     /* USING SHADOW PAGE TABLES. */
9831     - pte_clear_full(vma->vm_mm, addr, ptep, is_fullmm);
9832     + copy = ptep_get_and_clear_full(vma->vm_mm, addr, ptep, is_fullmm);
9833     }
9834    
9835     /* Finally, we unmap the grant from kernel space. */
9836     @@ -812,7 +813,7 @@
9837     >> PAGE_SHIFT, INVALID_P2M_ENTRY);
9838    
9839     } else {
9840     - pte_clear_full(vma->vm_mm, addr, ptep, is_fullmm);
9841     + copy = ptep_get_and_clear_full(vma->vm_mm, addr, ptep, is_fullmm);
9842     }
9843    
9844     return copy;
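The gntdev change above is twofold: the saved PTE is now captured inside the non-shadow branch, where it is actually valid, and both call sites switch from pte_clear_full() to ptep_get_and_clear_full(), because the zap callback must hand the old PTE contents back to the core VM rather than a discarded value. The idiom in isolation (the wrapper is illustrative):

#include <linux/mm.h>
#include <asm/pgtable.h>

static pte_t zap_pte(struct vm_area_struct *vma, unsigned long addr,
                     pte_t *ptep, int is_fullmm)
{
        /* was: pte_clear_full(vma->vm_mm, addr, ptep, is_fullmm);
         * which threw the previous contents away */
        return ptep_get_and_clear_full(vma->vm_mm, addr, ptep, is_fullmm);
}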
9845 niro 612 --- a/drivers/xen/netback/accel.c
9846     +++ b/drivers/xen/netback/accel.c
9847 niro 609 @@ -65,7 +65,7 @@
9848    
9849     if (IS_ERR(eth_name)) {
9850     /* Probably means not present */
9851     - DPRINTK("%s: no match due to xenbus_read accel error %d\n",
9852     + DPRINTK("%s: no match due to xenbus_read accel error %ld\n",
9853     __FUNCTION__, PTR_ERR(eth_name));
9854     return 0;
9855     } else {
9856 niro 612 --- a/drivers/xen/netback/common.h
9857     +++ b/drivers/xen/netback/common.h
9858 niro 609 @@ -200,7 +200,7 @@
9859    
9860     int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev);
9861     struct net_device_stats *netif_be_get_stats(struct net_device *dev);
9862     -irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs);
9863     +irqreturn_t netif_be_int(int irq, void *dev_id);
9864    
9865     static inline int netbk_can_queue(struct net_device *dev)
9866     {
9867 niro 612 --- a/drivers/xen/netback/loopback.c
9868     +++ b/drivers/xen/netback/loopback.c
9869 niro 609 @@ -151,7 +151,7 @@
9870     np->stats.rx_bytes += skb->len;
9871     np->stats.rx_packets++;
9872    
9873     - if (skb->ip_summed == CHECKSUM_HW) {
9874     + if (skb->ip_summed == CHECKSUM_PARTIAL) {
9875     /* Defer checksum calculation. */
9876     skb->proto_csum_blank = 1;
9877     /* Must be a local packet: assert its integrity. */
9878 niro 612 --- a/drivers/xen/netback/netback.c
9879     +++ b/drivers/xen/netback/netback.c
9880 niro 609 @@ -677,7 +677,7 @@
9881     id = meta[npo.meta_cons].id;
9882     flags = nr_frags ? NETRXF_more_data : 0;
9883    
9884     - if (skb->ip_summed == CHECKSUM_HW) /* local packet? */
9885     + if (skb->ip_summed == CHECKSUM_PARTIAL) /* local packet? */
9886     flags |= NETRXF_csum_blank | NETRXF_data_validated;
9887     else if (skb->proto_data_valid) /* remote but checksummed? */
9888     flags |= NETRXF_data_validated;
9889     @@ -1441,7 +1441,7 @@
9890     netif_idx_release(netif_page_index(page));
9891     }
9892    
9893     -irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs)
9894     +irqreturn_t netif_be_int(int irq, void *dev_id)
9895     {
9896     netif_t *netif = dev_id;
9897    
9898     @@ -1508,7 +1508,7 @@
9899     }
9900    
9901     #ifdef NETBE_DEBUG_INTERRUPT
9902     -static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs)
9903     +static irqreturn_t netif_be_dbg(int irq, void *dev_id)
9904     {
9905     struct list_head *ent;
9906     netif_t *netif;
9907 niro 612 --- a/drivers/xen/netfront/netfront.c
9908     +++ b/drivers/xen/netfront/netfront.c
9909 niro 609 @@ -136,7 +136,7 @@
9910     {
9911     return skb_is_gso(skb) &&
9912     (!skb_gso_ok(skb, dev->features) ||
9913     - unlikely(skb->ip_summed != CHECKSUM_HW));
9914     + unlikely(skb->ip_summed != CHECKSUM_PARTIAL));
9915     }
9916     #else
9917     #define HAVE_GSO 0
9918     @@ -222,7 +222,7 @@
9919     static void network_alloc_rx_buffers(struct net_device *);
9920     static void send_fake_arp(struct net_device *);
9921    
9922     -static irqreturn_t netif_int(int irq, void *dev_id, struct pt_regs *ptregs);
9923     +static irqreturn_t netif_int(int irq, void *dev_id);
9924    
9925     #ifdef CONFIG_SYSFS
9926     static int xennet_sysfs_addif(struct net_device *netdev);
9927     @@ -992,7 +992,7 @@
9928     tx->flags = 0;
9929     extra = NULL;
9930    
9931     - if (skb->ip_summed == CHECKSUM_HW) /* local packet? */
9932     + if (skb->ip_summed == CHECKSUM_PARTIAL) /* local packet? */
9933     tx->flags |= NETTXF_csum_blank | NETTXF_data_validated;
9934     #ifdef CONFIG_XEN
9935     if (skb->proto_data_valid) /* remote but checksummed? */
9936     @@ -1049,7 +1049,7 @@
9937     return 0;
9938     }
9939    
9940     -static irqreturn_t netif_int(int irq, void *dev_id, struct pt_regs *ptregs)
9941     +static irqreturn_t netif_int(int irq, void *dev_id)
9942     {
9943     struct net_device *dev = dev_id;
9944     struct netfront_info *np = netdev_priv(dev);
9945 niro 612 --- a/drivers/xen/pciback/pciback.h
9946     +++ b/drivers/xen/pciback/pciback.h
9947 niro 609 @@ -87,7 +87,7 @@
9948     void pciback_release_devices(struct pciback_device *pdev);
9949    
9950     /* Handles events from front-end */
9951     -irqreturn_t pciback_handle_event(int irq, void *dev_id, struct pt_regs *regs);
9952     +irqreturn_t pciback_handle_event(int irq, void *dev_id);
9953     void pciback_do_op(void *data);
9954    
9955     int pciback_xenbus_register(void);
9956 niro 612 --- a/drivers/xen/pciback/pciback_ops.c
9957     +++ b/drivers/xen/pciback/pciback_ops.c
9958 niro 609 @@ -85,7 +85,7 @@
9959     test_and_schedule_op(pdev);
9960     }
9961    
9962     -irqreturn_t pciback_handle_event(int irq, void *dev_id, struct pt_regs *regs)
9963     +irqreturn_t pciback_handle_event(int irq, void *dev_id)
9964     {
9965     struct pciback_device *pdev = dev_id;
9966    
9967 niro 612 --- a/drivers/xen/pcifront/pci_op.c
9968     +++ b/drivers/xen/pcifront/pci_op.c
9969 niro 609 @@ -392,10 +392,16 @@
9970    
9971     d = pci_scan_single_device(b, devfn);
9972     if (d) {
9973     + int err;
9974     +
9975     dev_info(&pdev->xdev->dev, "New device on "
9976     "%04x:%02x:%02x.%02x found.\n", domain, bus,
9977     PCI_SLOT(devfn), PCI_FUNC(devfn));
9978     - pci_bus_add_device(d);
9979     + err = pci_bus_add_device(d);
9980     + if (err)
9981     + dev_err(&pdev->xdev->dev,
9982     + "error %d adding device, continuing.\n",
9983     + err);
9984     }
9985     }
9986    
9987 niro 612 --- a/drivers/xen/privcmd/compat_privcmd.c
9988     +++ b/drivers/xen/privcmd/compat_privcmd.c
9989 niro 609 @@ -18,7 +18,6 @@
9990     * Authors: Jimi Xenidis <jimix@watson.ibm.com>
9991     */
9992    
9993     -#include <linux/config.h>
9994     #include <linux/compat.h>
9995     #include <linux/ioctl.h>
9996     #include <linux/syscalls.h>
9997 niro 612 --- a/drivers/xen/privcmd/privcmd.c
9998     +++ b/drivers/xen/privcmd/privcmd.c
9999 niro 609 @@ -236,7 +236,7 @@
10000     #endif
10001    
10002     /* DONTCOPY is essential for Xen as copy_page_range is broken. */
10003     - vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY;
10004     + vma->vm_flags |= VM_RESERVED | VM_IO | VM_PFNMAP | VM_DONTCOPY;
10005     vma->vm_ops = &privcmd_vm_ops;
10006     vma->vm_private_data = NULL;
10007    
10008 niro 612 --- a/drivers/xen/sfc_netback/accel_xenbus.c
10009     +++ b/drivers/xen/sfc_netback/accel_xenbus.c
10010 niro 609 @@ -68,8 +68,7 @@
10011    
10012    
10013     /* Demultiplex a message IRQ from the frontend driver. */
10014     -static irqreturn_t msgirq_from_frontend(int irq, void *context,
10015     - struct pt_regs *unused)
10016     +static irqreturn_t msgirq_from_frontend(int irq, void *context)
10017     {
10018     struct xenbus_device *dev = context;
10019     struct netback_accel *bend = NETBACK_ACCEL_FROM_XENBUS_DEVICE(dev);
10020     @@ -84,8 +83,7 @@
10021     * functionally, but we need it to pass to the bind function, and may
10022     * get called spuriously
10023     */
10024     -static irqreturn_t netirq_from_frontend(int irq, void *context,
10025     - struct pt_regs *unused)
10026     +static irqreturn_t netirq_from_frontend(int irq, void *context)
10027     {
10028     VPRINTK("netirq %d from device %s\n", irq,
10029     ((struct xenbus_device *)context)->nodename);
10030 niro 612 --- a/drivers/xen/sfc_netfront/accel.h
10031     +++ b/drivers/xen/sfc_netfront/accel.h
10032 niro 609 @@ -449,10 +449,8 @@
10033     u32 ip, u16 port, u8 protocol);
10034    
10035     /* Process an IRQ received from back end driver */
10036     -irqreturn_t netfront_accel_msg_channel_irq_from_bend(int irq, void *context,
10037     - struct pt_regs *unused);
10038     -irqreturn_t netfront_accel_net_channel_irq_from_bend(int irq, void *context,
10039     - struct pt_regs *unused);
10040     +irqreturn_t netfront_accel_msg_channel_irq_from_bend(int irq, void *context);
10041     +irqreturn_t netfront_accel_net_channel_irq_from_bend(int irq, void *context);
10042    
10043     #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
10044     extern void netfront_accel_msg_from_bend(struct work_struct *context);
10045 niro 612 --- a/drivers/xen/sfc_netfront/accel_msg.c
10046     +++ b/drivers/xen/sfc_netfront/accel_msg.c
10047 niro 609 @@ -490,8 +490,7 @@
10048     }
10049    
10050    
10051     -irqreturn_t netfront_accel_msg_channel_irq_from_bend(int irq, void *context,
10052     - struct pt_regs *unused)
10053     +irqreturn_t netfront_accel_msg_channel_irq_from_bend(int irq, void *context)
10054     {
10055     netfront_accel_vnic *vnic = (netfront_accel_vnic *)context;
10056     VPRINTK("irq %d from device %s\n", irq, vnic->dev->nodename);
10057     @@ -502,8 +501,7 @@
10058     }
10059    
10060     /* Process an interrupt received from the NIC via backend */
10061     -irqreturn_t netfront_accel_net_channel_irq_from_bend(int irq, void *context,
10062     - struct pt_regs *unused)
10063     +irqreturn_t netfront_accel_net_channel_irq_from_bend(int irq, void *context)
10064     {
10065     netfront_accel_vnic *vnic = (netfront_accel_vnic *)context;
10066     struct net_device *net_dev = vnic->net_dev;
10067 niro 612 --- a/drivers/xen/sfc_netfront/accel_tso.c
10068     +++ b/drivers/xen/sfc_netfront/accel_tso.c
10069 niro 609 @@ -363,7 +363,7 @@
10070    
10071     tso_check_safe(skb);
10072    
10073     - if (skb->ip_summed != CHECKSUM_HW)
10074     + if (skb->ip_summed != CHECKSUM_PARTIAL)
10075     EPRINTK("Trying to TSO send a packet without HW checksum\n");
10076    
10077     tso_start(&state, skb);
10078 niro 612 --- a/drivers/xen/sfc_netfront/accel_vi.c
10079     +++ b/drivers/xen/sfc_netfront/accel_vi.c
10080 niro 609 @@ -461,7 +461,7 @@
10081    
10082     frag_i = -1;
10083    
10084     - if (skb->ip_summed == CHECKSUM_HW) {
10085     + if (skb->ip_summed == CHECKSUM_PARTIAL) {
10086     /* Set to zero to encourage falcon to work it out for us */
10087     *(u16*)(skb->h.raw + skb->csum) = 0;
10088     }
10089     @@ -580,7 +580,7 @@
10090    
10091     kva = buf->pkt_kva;
10092    
10093     - if (skb->ip_summed == CHECKSUM_HW) {
10094     + if (skb->ip_summed == CHECKSUM_PARTIAL) {
10095     /* Set to zero to encourage falcon to work it out for us */
10096     *(u16*)(skb->h.raw + skb->csum) = 0;
10097     }
10098 niro 612 --- a/drivers/xen/tpmback/common.h
10099     +++ b/drivers/xen/tpmback/common.h
10100 niro 609 @@ -61,7 +61,7 @@
10101     void tpmif_xenbus_init(void);
10102     void tpmif_xenbus_exit(void);
10103     int tpmif_map(tpmif_t *tpmif, unsigned long shared_page, unsigned int evtchn);
10104     -irqreturn_t tpmif_be_int(int irq, void *dev_id, struct pt_regs *regs);
10105     +irqreturn_t tpmif_be_int(int irq, void *dev_id);
10106    
10107     long int tpmback_get_instance(struct backend_info *bi);
10108    
10109 niro 612 --- a/drivers/xen/tpmback/tpmback.c
10110     +++ b/drivers/xen/tpmback/tpmback.c
10111 niro 609 @@ -502,7 +502,7 @@
10112     list_del(&pak->next);
10113     write_unlock_irqrestore(&dataex.pak_lock, flags);
10114    
10115     - DPRINTK("size given by app: %d, available: %d\n", size, left);
10116     + DPRINTK("size given by app: %zu, available: %u\n", size, left);
10117    
10118     ret_size = min_t(size_t, size, left);
10119    
10120     @@ -899,7 +899,7 @@
10121     }
10122     }
10123    
10124     -irqreturn_t tpmif_be_int(int irq, void *dev_id, struct pt_regs *regs)
10125     +irqreturn_t tpmif_be_int(int irq, void *dev_id)
10126     {
10127     tpmif_t *tpmif = (tpmif_t *) dev_id;
10128    
10129 niro 612 --- a/drivers/xen/xenbus/xenbus_comms.c
10130     +++ b/drivers/xen/xenbus/xenbus_comms.c
10131 niro 609 @@ -55,7 +55,7 @@
10132    
10133     static DECLARE_WAIT_QUEUE_HEAD(xb_waitq);
10134    
10135     -static irqreturn_t wake_waiting(int irq, void *unused, struct pt_regs *regs)
10136     +static irqreturn_t wake_waiting(int irq, void *unused)
10137     {
10138     if (unlikely(xenstored_ready == 0)) {
10139     xenstored_ready = 1;
10140 niro 612 --- a/drivers/xen/xenoprof/xenoprofile.c
10141     +++ b/drivers/xen/xenoprof/xenoprofile.c
10142 niro 609 @@ -195,7 +195,7 @@
10143     }
10144    
10145     static irqreturn_t
10146     -xenoprof_ovf_interrupt(int irq, void * dev_id, struct pt_regs * regs)
10147     +xenoprof_ovf_interrupt(int irq, void * dev_id)
10148     {
10149     struct xenoprof_buf * buf;
10150     static unsigned long flag;
10151 niro 612 --- a/include/asm-generic/pgtable.h
10152     +++ b/include/asm-generic/pgtable.h
10153 niro 609 @@ -100,7 +100,7 @@
10154     #endif
10155    
10156     #ifndef arch_change_pte_range
10157     -#define arch_change_pte_range(mm, pmd, addr, end, newprot) 0
10158     +#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) 0
10159     #endif
10160    
10161     #ifndef __HAVE_ARCH_PTE_SAME
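
asm-generic/pgtable.h supplies a no-op default for arch_change_pte_range unless the architecture defined it first, so growing the hook by a dirty_accountable parameter only has to touch the default above and the Xen overrides later in this patch. A minimal illustration of that override pattern, compressed into one translation unit with placeholder names:

        #include <stdio.h>

        /* An arch header would define this before the generic one is seen. */
        #define HAVE_ARCH_HOOK
        #ifdef HAVE_ARCH_HOOK
        static int arch_hook(int arg)
        {
                printf("arch override ran with %d\n", arg);
                return 1;
        }
        #endif

        /* Generic header: do-nothing default only if no override exists. */
        #ifndef HAVE_ARCH_HOOK
        #define arch_hook(arg) 0
        #endif

        int main(void)
        {
                if (!arch_hook(42))
                        puts("fell back to the generic code path");
                return 0;
        }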
10162 niro 612 --- a/include/asm-x86/mach-xen/asm/desc_32.h
10163     +++ b/include/asm-x86/mach-xen/asm/desc_32.h
10164 niro 609 @@ -32,52 +32,110 @@
10165     return (struct desc_struct *)per_cpu(cpu_gdt_descr, cpu).address;
10166     }
10167    
10168     +/*
10169     + * This is the ldt that every process will get unless we need
10170     + * something other than this.
10171     + */
10172     +extern struct desc_struct default_ldt[];
10173     +extern struct desc_struct idt_table[];
10174     +extern void set_intr_gate(unsigned int irq, void * addr);
10175     +
10176     +static inline void pack_descriptor(__u32 *a, __u32 *b,
10177     + unsigned long base, unsigned long limit, unsigned char type, unsigned char flags)
10178     +{
10179     + *a = ((base & 0xffff) << 16) | (limit & 0xffff);
10180     + *b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
10181     + (limit & 0x000f0000) | ((type & 0xff) << 8) | ((flags & 0xf) << 20);
10182     +}
10183     +
10184     +static inline void pack_gate(__u32 *a, __u32 *b,
10185     + unsigned long base, unsigned short seg, unsigned char type, unsigned char flags)
10186     +{
10187     + *a = (seg << 16) | (base & 0xffff);
10188     + *b = (base & 0xffff0000) | ((type & 0xff) << 8) | (flags & 0xff);
10189     +}
10190     +
10191     +#define DESCTYPE_LDT 0x82 /* present, system, DPL-0, LDT */
10192     +#define DESCTYPE_TSS 0x89 /* present, system, DPL-0, 32-bit TSS */
10193     +#define DESCTYPE_TASK 0x85 /* present, system, DPL-0, task gate */
10194     +#define DESCTYPE_INT 0x8e /* present, system, DPL-0, interrupt gate */
10195     +#define DESCTYPE_TRAP 0x8f /* present, system, DPL-0, trap gate */
10196     +#define DESCTYPE_DPL3 0x60 /* DPL-3 */
10197     +#define DESCTYPE_S 0x10 /* !system */
10198     +
10199     #define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8))
10200     #define load_LDT_desc() __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8))
10201    
10202     #define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
10203     #define load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr))
10204     -#define load_tr(tr) __asm__ __volatile("ltr %0"::"mr" (tr))
10205     -#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"mr" (ldt))
10206     +#define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr))
10207     +#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt))
10208    
10209     #define store_gdt(dtr) __asm__ ("sgdt %0":"=m" (*dtr))
10210     #define store_idt(dtr) __asm__ ("sidt %0":"=m" (*dtr))
10211     -#define store_tr(tr) __asm__ ("str %0":"=mr" (tr))
10212     -#define store_ldt(ldt) __asm__ ("sldt %0":"=mr" (ldt))
10213     +#define store_tr(tr) __asm__ ("str %0":"=m" (tr))
10214     +#define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt))
10215    
10216     -/*
10217     - * This is the ldt that every process will get unless we need
10218     - * something other than this.
10219     - */
10220     -extern struct desc_struct default_ldt[];
10221     -extern void set_intr_gate(unsigned int irq, void * addr);
10222     +#if TLS_SIZE != 24
10223     +# error update this code.
10224     +#endif
10225     +
10226     +static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
10227     +{
10228     +#define C(i) if (HYPERVISOR_update_descriptor(virt_to_machine(&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]), \
10229     + *(u64 *)&t->tls_array[i])) \
10230     + BUG();
10231     + C(0); C(1); C(2);
10232     +#undef C
10233     +}
10234    
10235     -#define _set_tssldt_desc(n,addr,limit,type) \
10236     -__asm__ __volatile__ ("movw %w3,0(%2)\n\t" \
10237     - "movw %w1,2(%2)\n\t" \
10238     - "rorl $16,%1\n\t" \
10239     - "movb %b1,4(%2)\n\t" \
10240     - "movb %4,5(%2)\n\t" \
10241     - "movb $0,6(%2)\n\t" \
10242     - "movb %h1,7(%2)\n\t" \
10243     - "rorl $16,%1" \
10244     - : "=m"(*(n)) : "q" (addr), "r"(n), "ir"(limit), "i"(type))
10245     +#ifndef CONFIG_XEN
10246     +static inline void write_dt_entry(void *dt, int entry, __u32 entry_a, __u32 entry_b)
10247     +{
10248     + __u32 *lp = (__u32 *)((char *)dt + entry*8);
10249     + *lp = entry_a;
10250     + *(lp+1) = entry_b;
10251     +}
10252    
10253     -#ifndef CONFIG_X86_NO_TSS
10254     -static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, void *addr)
10255     +#define write_ldt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
10256     +#define write_gdt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
10257     +#else
10258     +extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b);
10259     +extern int write_gdt_entry(void *gdt, int entry, __u32 entry_a, __u32 entry_b);
10260     +#endif
10261     +#ifndef CONFIG_X86_NO_IDT
10262     +#define write_idt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
10263     +
10264     +static inline void _set_gate(int gate, unsigned int type, void *addr, unsigned short seg)
10265     {
10266     - _set_tssldt_desc(&get_cpu_gdt_table(cpu)[entry], (int)addr,
10267     - offsetof(struct tss_struct, __cacheline_filler) - 1, 0x89);
10268     + __u32 a, b;
10269     + pack_gate(&a, &b, (unsigned long)addr, seg, type, 0);
10270     + write_idt_entry(idt_table, gate, a, b);
10271     }
10272     +#endif
10273    
10274     -#define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
10275     +#ifndef CONFIG_X86_NO_TSS
10276     +static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, const void *addr)
10277     +{
10278     + __u32 a, b;
10279     + pack_descriptor(&a, &b, (unsigned long)addr,
10280     + offsetof(struct tss_struct, __cacheline_filler) - 1,
10281     + DESCTYPE_TSS, 0);
10282     + write_gdt_entry(get_cpu_gdt_table(cpu), entry, a, b);
10283     +}
10284     #endif
10285    
10286     -static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int size)
10287     +static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int entries)
10288     {
10289     - _set_tssldt_desc(&get_cpu_gdt_table(cpu)[GDT_ENTRY_LDT], (int)addr, ((size << 3)-1), 0x82);
10290     + __u32 a, b;
10291     + pack_descriptor(&a, &b, (unsigned long)addr,
10292     + entries * sizeof(struct desc_struct) - 1,
10293     + DESCTYPE_LDT, 0);
10294     + write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, a, b);
10295     }
10296    
10297     +#define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
10298     +
10299     #define LDT_entry_a(info) \
10300     ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
10301    
10302     @@ -103,21 +161,6 @@
10303     (info)->seg_not_present == 1 && \
10304     (info)->useable == 0 )
10305    
10306     -extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b);
10307     -
10308     -#if TLS_SIZE != 24
10309     -# error update this code.
10310     -#endif
10311     -
10312     -static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
10313     -{
10314     -#define C(i) if (HYPERVISOR_update_descriptor(virt_to_machine(&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]), \
10315     - *(u64 *)&t->tls_array[i])) \
10316     - BUG();
10317     - C(0); C(1); C(2);
10318     -#undef C
10319     -}
10320     -
10321     static inline void clear_LDT(void)
10322     {
10323     int cpu = get_cpu();
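
pack_descriptor() above replaces the old _set_tssldt_desc() asm by assembling the two 32-bit words of a GDT descriptor in C: the low word interleaves base[15:0] and limit[15:0]; the high word carries base[31:24], base[23:16], limit[19:16], the type byte, and the flags nibble. A standalone check of that layout, using the same bit math as the hunk:

        #include <stdio.h>

        typedef unsigned int __u32;

        static void pack_descriptor(__u32 *a, __u32 *b, unsigned long base,
                                    unsigned long limit, unsigned char type,
                                    unsigned char flags)
        {
                *a = ((base & 0xffff) << 16) | (limit & 0xffff);
                *b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
                     (limit & 0x000f0000) | ((type & 0xff) << 8) |
                     ((flags & 0xf) << 20);
        }

        int main(void)
        {
                __u32 a, b;
                /* An LDT descriptor (DESCTYPE_LDT, 0x82) at base 0x12345678,
                 * limit 0x2f. */
                pack_descriptor(&a, &b, 0x12345678, 0x2f, 0x82, 0);
                printf("a=%08x b=%08x\n", a, b);  /* a=5678002f b=12008234 */
                return 0;
        }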
10324 niro 612 --- a/include/asm-x86/mach-xen/asm/dma-mapping_64.h
10325     +++ b/include/asm-x86/mach-xen/asm/dma-mapping_64.h
10326 niro 609 @@ -55,13 +55,6 @@
10327     extern struct dma_mapping_ops* dma_ops;
10328     extern int iommu_merge;
10329    
10330     -static inline int valid_dma_direction(int dma_direction)
10331     -{
10332     - return ((dma_direction == DMA_BIDIRECTIONAL) ||
10333     - (dma_direction == DMA_TO_DEVICE) ||
10334     - (dma_direction == DMA_FROM_DEVICE));
10335     -}
10336     -
10337     #if 0
10338     static inline int dma_mapping_error(dma_addr_t dma_addr)
10339     {
10340 niro 612 --- a/include/asm-x86/mach-xen/asm/e820_64.h
10341     +++ b/include/asm-x86/mach-xen/asm/e820_64.h
10342 niro 609 @@ -19,13 +19,9 @@
10343    
10344     #define E820_RAM 1
10345     #define E820_RESERVED 2
10346     -#define E820_ACPI 3 /* usable as RAM once ACPI tables have been read */
10347     +#define E820_ACPI 3
10348     #define E820_NVS 4
10349    
10350     -#define HIGH_MEMORY (1024*1024)
10351     -
10352     -#define LOWMEMSIZE() (0x9f000)
10353     -
10354     #ifndef __ASSEMBLY__
10355     struct e820entry {
10356     u64 addr; /* start of memory segment */
10357     @@ -46,17 +42,16 @@
10358     extern void contig_e820_setup(void);
10359     extern unsigned long e820_end_of_ram(void);
10360     extern void e820_reserve_resources(struct e820entry *e820, int nr_map);
10361     +extern void e820_mark_nosave_regions(void);
10362     extern void e820_print_map(char *who);
10363     extern int e820_any_mapped(unsigned long start, unsigned long end, unsigned type);
10364     extern int e820_all_mapped(unsigned long start, unsigned long end, unsigned type);
10365    
10366     -extern void e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end);
10367     extern void e820_setup_gap(struct e820entry *e820, int nr_map);
10368     -extern unsigned long e820_hole_size(unsigned long start_pfn,
10369     - unsigned long end_pfn);
10370     +extern void e820_register_active_regions(int nid,
10371     + unsigned long start_pfn, unsigned long end_pfn);
10372    
10373     -extern void __init parse_memopt(char *p, char **end);
10374     -extern void __init parse_memmapopt(char *p, char **end);
10375     +extern void finish_e820_parsing(void);
10376    
10377     extern struct e820map e820;
10378    
10379 niro 612 --- a/include/asm-x86/mach-xen/asm/fixmap_32.h
10380     +++ b/include/asm-x86/mach-xen/asm/fixmap_32.h
10381 niro 609 @@ -55,7 +55,7 @@
10382     #ifdef CONFIG_X86_LOCAL_APIC
10383     FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
10384     #endif
10385     -#ifdef CONFIG_X86_IO_APIC
10386     +#if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_XEN)
10387     FIX_IO_APIC_BASE_0,
10388     FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
10389     #endif
10390     @@ -95,10 +95,9 @@
10391     __end_of_fixed_addresses
10392     };
10393    
10394     -extern void set_fixaddr_top(unsigned long top);
10395     -
10396     extern void __set_fixmap(enum fixed_addresses idx,
10397     maddr_t phys, pgprot_t flags);
10398     +extern void reserve_top_address(unsigned long reserve);
10399    
10400     #define set_fixmap(idx, phys) \
10401     __set_fixmap(idx, phys, PAGE_KERNEL)
10402 niro 612 --- a/include/asm-x86/mach-xen/asm/fixmap_64.h
10403     +++ b/include/asm-x86/mach-xen/asm/fixmap_64.h
10404 niro 609 @@ -41,7 +41,7 @@
10405     #ifdef CONFIG_X86_LOCAL_APIC
10406     FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
10407     #endif
10408     -#ifdef CONFIG_X86_IO_APIC
10409     +#ifndef CONFIG_XEN
10410     FIX_IO_APIC_BASE_0,
10411     FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
10412     #endif
10413 niro 612 --- a/include/asm-x86/mach-xen/asm/hw_irq_32.h
10414     +++ b/include/asm-x86/mach-xen/asm/hw_irq_32.h
10415 niro 609 @@ -17,8 +17,6 @@
10416     #include <asm/irq.h>
10417     #include <asm/sections.h>
10418    
10419     -struct hw_interrupt_type;
10420     -
10421     #define NMI_VECTOR 0x02
10422    
10423     /*
10424     @@ -28,10 +26,6 @@
10425     * Interrupt entry/exit code at both C and assembly level
10426     */
10427    
10428     -extern u8 irq_vector[NR_IRQ_VECTORS];
10429     -#define IO_APIC_VECTOR(irq) (irq_vector[irq])
10430     -#define AUTO_ASSIGN -1
10431     -
10432     extern void (*interrupt[NR_IRQS])(void);
10433    
10434     #ifdef CONFIG_SMP
10435     @@ -44,7 +38,7 @@
10436     fastcall void apic_timer_interrupt(void);
10437     fastcall void error_interrupt(void);
10438     fastcall void spurious_interrupt(void);
10439     -fastcall void thermal_interrupt(struct pt_regs *);
10440     +fastcall void thermal_interrupt(void);
10441     #define platform_legacy_irq(irq) ((irq) < 16)
10442     #endif
10443    
10444 niro 612 --- a/include/asm-x86/mach-xen/asm/hw_irq_64.h
10445     +++ b/include/asm-x86/mach-xen/asm/hw_irq_64.h
10446 niro 609 @@ -19,8 +19,7 @@
10447     #include <asm/irq.h>
10448     #include <linux/profile.h>
10449     #include <linux/smp.h>
10450     -
10451     -struct hw_interrupt_type;
10452     +#include <linux/percpu.h>
10453     #endif
10454    
10455     #define NMI_VECTOR 0x02
10456     @@ -77,9 +76,10 @@
10457    
10458    
10459     #ifndef __ASSEMBLY__
10460     -extern u8 irq_vector[NR_IRQ_VECTORS];
10461     -#define IO_APIC_VECTOR(irq) (irq_vector[irq])
10462     -#define AUTO_ASSIGN -1
10463     +typedef int vector_irq_t[NR_VECTORS];
10464     +DECLARE_PER_CPU(vector_irq_t, vector_irq);
10465     +extern void __setup_vector_irq(int cpu);
10466     +extern spinlock_t vector_lock;
10467    
10468     /*
10469     * Various low-level irq details needed by irq.c, process.c,
10470 niro 612 --- a/include/asm-x86/mach-xen/asm/io_32.h
10471     +++ b/include/asm-x86/mach-xen/asm/io_32.h
10472 niro 609 @@ -237,33 +237,6 @@
10473    
10474     #define eth_io_copy_and_sum(a,b,c,d) eth_copy_and_sum((a),(void __force *)(b),(c),(d))
10475    
10476     -/**
10477     - * check_signature - find BIOS signatures
10478     - * @io_addr: mmio address to check
10479     - * @signature: signature block
10480     - * @length: length of signature
10481     - *
10482     - * Perform a signature comparison with the mmio address io_addr. This
10483     - * address should have been obtained by ioremap.
10484     - * Returns 1 on a match.
10485     - */
10486     -
10487     -static inline int check_signature(volatile void __iomem * io_addr,
10488     - const unsigned char *signature, int length)
10489     -{
10490     - int retval = 0;
10491     - do {
10492     - if (readb(io_addr) != *signature)
10493     - goto out;
10494     - io_addr++;
10495     - signature++;
10496     - length--;
10497     - } while (length);
10498     - retval = 1;
10499     -out:
10500     - return retval;
10501     -}
10502     -
10503     /*
10504     * Cache management
10505     *
10506 niro 612 --- a/include/asm-x86/mach-xen/asm/io_64.h
10507     +++ b/include/asm-x86/mach-xen/asm/io_64.h
10508 niro 609 @@ -273,33 +273,6 @@
10509    
10510     #define eth_io_copy_and_sum(a,b,c,d) eth_copy_and_sum((a),(void *)(b),(c),(d))
10511    
10512     -/**
10513     - * check_signature - find BIOS signatures
10514     - * @io_addr: mmio address to check
10515     - * @signature: signature block
10516     - * @length: length of signature
10517     - *
10518     - * Perform a signature comparison with the mmio address io_addr. This
10519     - * address should have been obtained by ioremap.
10520     - * Returns 1 on a match.
10521     - */
10522     -
10523     -static inline int check_signature(void __iomem *io_addr,
10524     - const unsigned char *signature, int length)
10525     -{
10526     - int retval = 0;
10527     - do {
10528     - if (readb(io_addr) != *signature)
10529     - goto out;
10530     - io_addr++;
10531     - signature++;
10532     - length--;
10533     - } while (length);
10534     - retval = 1;
10535     -out:
10536     - return retval;
10537     -}
10538     -
10539     /* Nothing to do */
10540    
10541     #define dma_cache_inv(_start,_size) do { } while (0)
10542 niro 612 --- a/include/asm-x86/mach-xen/asm/pgtable-2level.h
10543     +++ b/include/asm-x86/mach-xen/asm/pgtable-2level.h
10544 niro 609 @@ -23,14 +23,6 @@
10545     set_pte((ptep), (pteval)); \
10546     } while (0)
10547    
10548     -#define set_pte_at_sync(_mm,addr,ptep,pteval) do { \
10549     - if (((_mm) != current->mm && (_mm) != &init_mm) || \
10550     - HYPERVISOR_update_va_mapping((addr), (pteval), UVMF_INVLPG)) { \
10551     - set_pte((ptep), (pteval)); \
10552     - xen_invlpg((addr)); \
10553     - } \
10554     -} while (0)
10555     -
10556     #define set_pte_atomic(pteptr, pteval) set_pte(pteptr,pteval)
10557    
10558     #define set_pmd(pmdptr, pmdval) xen_l2_entry_update((pmdptr), (pmdval))
10559     @@ -40,6 +32,7 @@
10560    
10561     #define pte_none(x) (!(x).pte_low)
10562    
10563     +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
10564     static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
10565     {
10566     pte_t pte = *ptep;
10567     @@ -51,6 +44,7 @@
10568     return pte;
10569     }
10570    
10571     +#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
10572     #define ptep_clear_flush(vma, addr, ptep) \
10573     ({ \
10574     pte_t *__ptep = (ptep); \
10575     @@ -66,8 +60,6 @@
10576     __res; \
10577     })
10578    
10579     -#define pte_same(a, b) ((a).pte_low == (b).pte_low)
10580     -
10581     #define __pte_mfn(_pte) ((_pte).pte_low >> PAGE_SHIFT)
10582     #define pte_mfn(_pte) ((_pte).pte_low & _PAGE_PRESENT ? \
10583     __pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte)))
10584 niro 612 --- a/include/asm-x86/mach-xen/asm/pgtable-3level.h
10585     +++ b/include/asm-x86/mach-xen/asm/pgtable-3level.h
10586     @@ -53,7 +53,6 @@
10587     * not possible, use pte_get_and_clear to obtain the old pte
10588     * value and then use set_pte to update it. -ben
10589     */
10590     -#define __HAVE_ARCH_SET_PTE_ATOMIC
10591    
10592     static inline void set_pte(pte_t *ptep, pte_t pte)
10593     {
10594     @@ -70,14 +69,6 @@
10595     set_pte((ptep), (pteval)); \
10596     } while (0)
10597    
10598     -#define set_pte_at_sync(_mm,addr,ptep,pteval) do { \
10599     - if (((_mm) != current->mm && (_mm) != &init_mm) || \
10600     - HYPERVISOR_update_va_mapping((addr), (pteval), UVMF_INVLPG)) { \
10601     - set_pte((ptep), (pteval)); \
10602     - xen_invlpg((addr)); \
10603     - } \
10604     -} while (0)
10605     -
10606     #define set_pmd(pmdptr,pmdval) \
10607     xen_l2_entry_update((pmdptr), (pmdval))
10608     #define set_pud(pudptr,pudval) \
10609     @@ -94,7 +85,7 @@
10610     #define pud_page(pud) \
10611     ((struct page *) __va(pud_val(pud) & PAGE_MASK))
10612    
10613     -#define pud_page_kernel(pud) \
10614     +#define pud_page_vaddr(pud) \
10615     ((unsigned long) __va(pud_val(pud) & PAGE_MASK))
10616    
10617    
10618     @@ -124,6 +115,7 @@
10619    
10620     #define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
10621    
10622     +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
10623     static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
10624     {
10625     pte_t pte = *ptep;
10626     @@ -142,6 +134,7 @@
10627     return pte;
10628     }
10629    
10630     +#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
10631     #define ptep_clear_flush(vma, addr, ptep) \
10632     ({ \
10633     pte_t *__ptep = (ptep); \
10634     @@ -159,6 +152,7 @@
10635     __res; \
10636     })
10637    
10638     +#define __HAVE_ARCH_PTE_SAME
10639     static inline int pte_same(pte_t a, pte_t b)
10640     {
10641     return a.pte_low == b.pte_low && a.pte_high == b.pte_high;
10642     --- a/include/asm-x86/mach-xen/asm/pgtable_32.h
10643     +++ b/include/asm-x86/mach-xen/asm/pgtable_32.h
10644 niro 609 @@ -260,31 +260,89 @@
10645     # include <asm/pgtable-2level.h>
10646     #endif
10647    
10648     -#define ptep_test_and_clear_dirty(vma, addr, ptep) \
10649     +/*
10650     + * Rules for using pte_update - it must be called after any PTE update which
10651     + * has not been done using the set_pte / clear_pte interfaces. It is used by
10652     + * shadow mode hypervisors to resynchronize the shadow page tables. Kernel PTE
10653     + * updates should either be sets, clears, or set_pte_atomic for P->P
10654     + * transitions, which means this hook should only be called for user PTEs.
10655     + * This hook implies a P->P protection or access change has taken place, which
10656     + * requires a subsequent TLB flush. The notification can optionally be delayed
10657     + * until the TLB flush event by using the pte_update_defer form of the
10658     + * interface, but care must be taken to assure that the flush happens while
10659     + * still holding the same page table lock so that the shadow and primary pages
10660     + * do not become out of sync on SMP.
10661     + */
10662     +#define pte_update(mm, addr, ptep) do { } while (0)
10663     +#define pte_update_defer(mm, addr, ptep) do { } while (0)
10664     +
10665     +
10666     +/*
10667     + * We only update the dirty/accessed state if we set
10668     + * the dirty bit by hand in the kernel, since the hardware
10669     + * will do the accessed bit for us, and we don't want to
10670     + * race with other CPU's that might be updating the dirty
10671     + * bit at the same time.
10672     + */
10673     +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
10674     +#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
10675     +do { \
10676     + if (dirty) \
10677     + ptep_establish(vma, address, ptep, entry); \
10678     +} while (0)
10679     +
10680     +/*
10681     + * We don't actually have these, but we want to advertise them so that
10682     + * we can encompass the flush here.
10683     + */
10684     +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
10685     +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
10686     +
10687     +/*
10688     + * Rules for using ptep_establish: the pte MUST be a user pte, and
10689     + * must be a present->present transition.
10690     + */
10691     +#define __HAVE_ARCH_PTEP_ESTABLISH
10692     +#define ptep_establish(vma, address, ptep, pteval) \
10693     +do { \
10694     + if ( likely((vma)->vm_mm == current->mm) ) { \
10695     + BUG_ON(HYPERVISOR_update_va_mapping(address, \
10696     + pteval, \
10697     + (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
10698     + UVMF_INVLPG|UVMF_MULTI)); \
10699     + } else { \
10700     + xen_l1_entry_update(ptep, pteval); \
10701     + flush_tlb_page(vma, address); \
10702     + } \
10703     +} while (0)
10704     +
10705     +#define __HAVE_ARCH_PTEP_CLEAR_DIRTY_FLUSH
10706     +#define ptep_clear_flush_dirty(vma, address, ptep) \
10707     ({ \
10708     pte_t __pte = *(ptep); \
10709     - int __ret = pte_dirty(__pte); \
10710     - if (__ret) { \
10711     - __pte = pte_mkclean(__pte); \
10712     - if ((vma)->vm_mm != current->mm || \
10713     - HYPERVISOR_update_va_mapping(addr, __pte, 0)) \
10714     - (ptep)->pte_low = __pte.pte_low; \
10715     - } \
10716     - __ret; \
10717     + int __dirty = pte_dirty(__pte); \
10718     + __pte = pte_mkclean(__pte); \
10719     + if (test_bit(PG_pinned, &virt_to_page((vma)->vm_mm->pgd)->flags)) \
10720     + ptep_set_access_flags(vma, address, ptep, __pte, __dirty); \
10721     + else if (__dirty) \
10722     + (ptep)->pte_low = __pte.pte_low; \
10723     + __dirty; \
10724     })
10725    
10726     -#define ptep_test_and_clear_young(vma, addr, ptep) \
10727     +#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
10728     +#define ptep_clear_flush_young(vma, address, ptep) \
10729     ({ \
10730     pte_t __pte = *(ptep); \
10731     - int __ret = pte_young(__pte); \
10732     - if (__ret) \
10733     - __pte = pte_mkold(__pte); \
10734     - if ((vma)->vm_mm != current->mm || \
10735     - HYPERVISOR_update_va_mapping(addr, __pte, 0)) \
10736     - (ptep)->pte_low = __pte.pte_low; \
10737     - __ret; \
10738     + int __young = pte_young(__pte); \
10739     + __pte = pte_mkold(__pte); \
10740     + if (test_bit(PG_pinned, &virt_to_page((vma)->vm_mm->pgd)->flags)) \
10741     + ptep_set_access_flags(vma, address, ptep, __pte, __young); \
10742     + else if (__young) \
10743     + (ptep)->pte_low = __pte.pte_low; \
10744     + __young; \
10745     })
10746    
10747     +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
10748     #define ptep_get_and_clear_full(mm, addr, ptep, full) \
10749     ((full) ? ({ \
10750     pte_t __res = *(ptep); \
10751     @@ -296,6 +354,7 @@
10752     }) : \
10753     ptep_get_and_clear(mm, addr, ptep))
10754    
10755     +#define __HAVE_ARCH_PTEP_SET_WRPROTECT
10756     static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
10757     {
10758     pte_t pte = *ptep;
10759     @@ -391,11 +450,11 @@
10760     #define pte_index(address) \
10761     (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
10762     #define pte_offset_kernel(dir, address) \
10763     - ((pte_t *) pmd_page_kernel(*(dir)) + pte_index(address))
10764     + ((pte_t *) pmd_page_vaddr(*(dir)) + pte_index(address))
10765    
10766     #define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
10767    
10768     -#define pmd_page_kernel(pmd) \
10769     +#define pmd_page_vaddr(pmd) \
10770     ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
10771    
10772     /*
10773     @@ -418,8 +477,6 @@
10774     static inline int set_kernel_exec(unsigned long vaddr, int enable) { return 0;}
10775     #endif
10776    
10777     -extern void noexec_setup(const char *str);
10778     -
10779     #if defined(CONFIG_HIGHPTE)
10780     #define pte_offset_map(dir, address) \
10781     ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + \
10782     @@ -437,37 +494,17 @@
10783     #define pte_unmap_nested(pte) do { } while (0)
10784     #endif
10785    
10786     -#define __HAVE_ARCH_PTEP_ESTABLISH
10787     -#define ptep_establish(vma, address, ptep, pteval) \
10788     - do { \
10789     - if ( likely((vma)->vm_mm == current->mm) ) { \
10790     - BUG_ON(HYPERVISOR_update_va_mapping(address, \
10791     - pteval, \
10792     - (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
10793     - UVMF_INVLPG|UVMF_MULTI)); \
10794     - } else { \
10795     - xen_l1_entry_update(ptep, pteval); \
10796     - flush_tlb_page(vma, address); \
10797     - } \
10798     - } while (0)
10799     +/* Clear a kernel PTE and flush it from the TLB */
10800     +#define kpte_clear_flush(ptep, vaddr) do { \
10801     + if (HYPERVISOR_update_va_mapping(vaddr, __pte(0), UVMF_INVLPG)) \
10802     + BUG(); \
10803     +} while (0)
10804    
10805     /*
10806     * The i386 doesn't have any external MMU info: the kernel page
10807     * tables contain all the necessary information.
10808     - *
10809     - * Also, we only update the dirty/accessed state if we set
10810     - * the dirty bit by hand in the kernel, since the hardware
10811     - * will do the accessed bit for us, and we don't want to
10812     - * race with other CPU's that might be updating the dirty
10813     - * bit at the same time.
10814     */
10815     #define update_mmu_cache(vma,address,pte) do { } while (0)
10816     -#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
10817     -#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
10818     - do { \
10819     - if (dirty) \
10820     - ptep_establish(vma, address, ptep, entry); \
10821     - } while (0)
10822    
10823     #include <xen/features.h>
10824     void make_lowmem_page_readonly(void *va, unsigned int feature);
10825     @@ -516,10 +553,11 @@
10826     unsigned long size);
10827    
10828     int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
10829     - unsigned long addr, unsigned long end, pgprot_t newprot);
10830     + unsigned long addr, unsigned long end, pgprot_t newprot,
10831     + int dirty_accountable);
10832    
10833     -#define arch_change_pte_range(mm, pmd, addr, end, newprot) \
10834     - xen_change_pte_range(mm, pmd, addr, end, newprot)
10835     +#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
10836     + xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
10837    
10838     #define io_remap_pfn_range(vma,from,pfn,size,prot) \
10839     direct_remap_pfn_range(vma,from,pfn,size,prot,DOMID_IO)
10840     @@ -528,13 +566,6 @@
10841     #define GET_IOSPACE(pfn) 0
10842     #define GET_PFN(pfn) (pfn)
10843    
10844     -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
10845     -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
10846     -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
10847     -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
10848     -#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
10849     -#define __HAVE_ARCH_PTEP_SET_WRPROTECT
10850     -#define __HAVE_ARCH_PTE_SAME
10851     #include <asm-generic/pgtable.h>
10852    
10853     #endif /* _I386_PGTABLE_H */
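
The rewritten ptep_clear_flush_dirty/_young helpers above share one idea: read the pte, clear the bit of interest, and only pay for a hypervisor-mediated update plus TLB flush when the page table is pinned (registered with Xen); otherwise a plain low-word store suffices. A toy model of that decision, with the PG_pinned test reduced to a boolean:

        #include <stdio.h>

        #define PTE_YOUNG 0x20                  /* stand-in for _PAGE_ACCESSED */

        struct toy_pte { unsigned long low; };
        static int pgd_pinned;                  /* stand-in for PG_pinned test */

        static int ptep_clear_flush_young(struct toy_pte *ptep)
        {
                struct toy_pte pte = *ptep;
                int young = !!(pte.low & PTE_YOUNG);
                pte.low &= ~PTE_YOUNG;
                if (pgd_pinned) {
                        /* real code: ptep_set_access_flags() -> hypercall
                         * plus TLB flush */
                        puts("pinned: routing update through the hypervisor");
                        ptep->low = pte.low;
                } else if (young) {
                        ptep->low = pte.low;    /* direct store is enough */
                }
                return young;
        }

        int main(void)
        {
                struct toy_pte pte = { .low = 0x1047 | PTE_YOUNG };
                printf("was young: %d, now %#lx\n",
                       ptep_clear_flush_young(&pte), pte.low);
                return 0;
        }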
10854 niro 612 --- a/include/asm-x86/mach-xen/asm/pgtable_64.h
10855     +++ b/include/asm-x86/mach-xen/asm/pgtable_64.h
10856 niro 609 @@ -43,12 +43,9 @@
10857    
10858     #define swapper_pg_dir init_level4_pgt
10859    
10860     -extern int nonx_setup(char *str);
10861     extern void paging_init(void);
10862     extern void clear_kernel_mapping(unsigned long addr, unsigned long size);
10863    
10864     -extern unsigned long pgkern_mask;
10865     -
10866     /*
10867     * ZERO_PAGE is a global shared page that is always zero: used
10868     * for zero-mapped memory areas etc..
10869     @@ -118,9 +115,6 @@
10870     set_pgd(__user_pgd(pgd), __pgd(0));
10871     }
10872    
10873     -#define pud_page(pud) \
10874     - ((unsigned long) __va(pud_val(pud) & PHYSICAL_PAGE_MASK))
10875     -
10876     #define pte_same(a, b) ((a).pte == (b).pte)
10877    
10878     #define pte_pgprot(a) (__pgprot((a).pte & ~PHYSICAL_PAGE_MASK))
10879     @@ -332,7 +326,7 @@
10880     #define __LARGE_PTE (_PAGE_PSE|_PAGE_PRESENT)
10881     static inline int pte_user(pte_t pte) { return __pte_val(pte) & _PAGE_USER; }
10882     static inline int pte_read(pte_t pte) { return __pte_val(pte) & _PAGE_USER; }
10883     -static inline int pte_exec(pte_t pte) { return __pte_val(pte) & _PAGE_USER; }
10884     +static inline int pte_exec(pte_t pte) { return !(__pte_val(pte) & _PAGE_NX); }
10885     static inline int pte_dirty(pte_t pte) { return __pte_val(pte) & _PAGE_DIRTY; }
10886     static inline int pte_young(pte_t pte) { return __pte_val(pte) & _PAGE_ACCESSED; }
10887     static inline int pte_write(pte_t pte) { return __pte_val(pte) & _PAGE_RW; }
10888     @@ -345,29 +339,12 @@
10889     static inline pte_t pte_mkold(pte_t pte) { __pte_val(pte) &= ~_PAGE_ACCESSED; return pte; }
10890     static inline pte_t pte_wrprotect(pte_t pte) { __pte_val(pte) &= ~_PAGE_RW; return pte; }
10891     static inline pte_t pte_mkread(pte_t pte) { __pte_val(pte) |= _PAGE_USER; return pte; }
10892     -static inline pte_t pte_mkexec(pte_t pte) { __pte_val(pte) |= _PAGE_USER; return pte; }
10893     +static inline pte_t pte_mkexec(pte_t pte) { __pte_val(pte) &= ~_PAGE_NX; return pte; }
10894     static inline pte_t pte_mkdirty(pte_t pte) { __pte_val(pte) |= _PAGE_DIRTY; return pte; }
10895     static inline pte_t pte_mkyoung(pte_t pte) { __pte_val(pte) |= _PAGE_ACCESSED; return pte; }
10896     static inline pte_t pte_mkwrite(pte_t pte) { __pte_val(pte) |= _PAGE_RW; return pte; }
10897     static inline pte_t pte_mkhuge(pte_t pte) { __pte_val(pte) |= _PAGE_PSE; return pte; }
10898     -
10899     -#define ptep_test_and_clear_dirty(vma, addr, ptep) \
10900     -({ \
10901     - pte_t __pte = *(ptep); \
10902     - int __ret = pte_dirty(__pte); \
10903     - if (__ret) \
10904     - set_pte_at((vma)->vm_mm, addr, ptep, pte_mkclean(__pte)); \
10905     - __ret; \
10906     -})
10907     -
10908     -#define ptep_test_and_clear_young(vma, addr, ptep) \
10909     -({ \
10910     - pte_t __pte = *(ptep); \
10911     - int __ret = pte_young(__pte); \
10912     - if (__ret) \
10913     - set_pte_at((vma)->vm_mm, addr, ptep, pte_mkold(__pte)); \
10914     - __ret; \
10915     -})
10916     +static inline pte_t pte_clrhuge(pte_t pte) { __pte_val(pte) &= ~_PAGE_PSE; return pte; }
10917    
10918     static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
10919     {
10920     @@ -395,7 +372,8 @@
10921     * Level 4 access.
10922     * Never use these in the common code.
10923     */
10924     -#define pgd_page(pgd) ((unsigned long) __va(pgd_val(pgd) & PTE_MASK))
10925     +#define pgd_page_vaddr(pgd) ((unsigned long) __va(pgd_val(pgd) & PTE_MASK))
10926     +#define pgd_page(pgd) (pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT))
10927     #define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
10928     #define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr))
10929     #define pgd_offset_k(address) (init_level4_pgt + pgd_index(address))
10930     @@ -404,16 +382,18 @@
10931    
10932     /* PUD - Level3 access */
10933     /* to find an entry in a page-table-directory. */
10934     +#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PHYSICAL_PAGE_MASK))
10935     +#define pud_page(pud) (pfn_to_page(pud_val(pud) >> PAGE_SHIFT))
10936     #define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
10937     -#define pud_offset(pgd, address) ((pud_t *) pgd_page(*(pgd)) + pud_index(address))
10938     +#define pud_offset(pgd, address) ((pud_t *) pgd_page_vaddr(*(pgd)) + pud_index(address))
10939     #define pud_present(pud) (__pud_val(pud) & _PAGE_PRESENT)
10940    
10941     /* PMD - Level 2 access */
10942     -#define pmd_page_kernel(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK))
10943     +#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK))
10944     #define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
10945    
10946     #define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
10947     -#define pmd_offset(dir, address) ((pmd_t *) pud_page(*(dir)) + \
10948     +#define pmd_offset(dir, address) ((pmd_t *) pud_page_vaddr(*(dir)) + \
10949     pmd_index(address))
10950     #define pmd_none(x) (!__pmd_val(x))
10951     #if CONFIG_XEN_COMPAT <= 0x030002
10952     @@ -444,6 +424,7 @@
10953     {
10954     unsigned long pteval;
10955     pteval = physpage | pgprot_val(pgprot);
10956     + pteval &= __supported_pte_mask;
10957     return __pte(pteval);
10958     }
10959    
10960     @@ -465,7 +446,7 @@
10961    
10962     #define pte_index(address) \
10963     (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
10964     -#define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_kernel(*(dir)) + \
10965     +#define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_vaddr(*(dir)) + \
10966     pte_index(address))
10967    
10968     /* x86-64 always has all page tables mapped. */
10969     @@ -506,6 +487,40 @@
10970     ptep_establish(vma, address, ptep, entry); \
10971     } while (0)
10972    
10973     +
10974     +/*
10975     + * i386 says: We don't actually have these, but we want to advertise
10976     + * them so that we can encompass the flush here.
10977     + */
10978     +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
10979     +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
10980     +
10981     +#define __HAVE_ARCH_PTEP_CLEAR_DIRTY_FLUSH
10982     +#define ptep_clear_flush_dirty(vma, address, ptep) \
10983     +({ \
10984     + pte_t __pte = *(ptep); \
10985     + int __dirty = pte_dirty(__pte); \
10986     + __pte = pte_mkclean(__pte); \
10987     + if ((vma)->vm_mm->context.pinned) \
10988     + ptep_set_access_flags(vma, address, ptep, __pte, __dirty); \
10989     + else if (__dirty) \
10990     + set_pte(ptep, __pte); \
10991     + __dirty; \
10992     +})
10993     +
10994     +#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
10995     +#define ptep_clear_flush_young(vma, address, ptep) \
10996     +({ \
10997     + pte_t __pte = *(ptep); \
10998     + int __young = pte_young(__pte); \
10999     + __pte = pte_mkold(__pte); \
11000     + if ((vma)->vm_mm->context.pinned) \
11001     + ptep_set_access_flags(vma, address, ptep, __pte, __young); \
11002     + else if (__young) \
11003     + set_pte(ptep, __pte); \
11004     + __young; \
11005     +})
11006     +
11007     /* Encode and de-code a swap entry */
11008     #define __swp_type(x) (((x).val >> 1) & 0x3f)
11009     #define __swp_offset(x) ((x).val >> 8)
11010     @@ -547,10 +562,11 @@
11011     unsigned long size);
11012    
11013     int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
11014     - unsigned long addr, unsigned long end, pgprot_t newprot);
11015     + unsigned long addr, unsigned long end, pgprot_t newprot,
11016     + int dirty_accountable);
11017    
11018     -#define arch_change_pte_range(mm, pmd, addr, end, newprot) \
11019     - xen_change_pte_range(mm, pmd, addr, end, newprot)
11020     +#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
11021     + xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
11022    
11023     #define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
11024     direct_remap_pfn_range(vma,vaddr,pfn,size,prot,DOMID_IO)
11025     @@ -572,8 +588,6 @@
11026     #define kc_offset_to_vaddr(o) \
11027     (((o) & (1UL << (__VIRTUAL_MASK_SHIFT-1))) ? ((o) | (~__VIRTUAL_MASK)) : (o))
11028    
11029     -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
11030     -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
11031     #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
11032     #define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
11033     #define __HAVE_ARCH_PTEP_CLEAR_FLUSH
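
The pte_exec/pte_mkexec hunk above fixes inverted-flag logic: with NX paging, "executable" means the no-execute bit is absent, not that _PAGE_USER is present, so pte_mkexec must clear _PAGE_NX rather than set anything. A self-contained check of that logic; the bit positions match x86-64 (_PAGE_NX is bit 63):

        #include <stdio.h>

        #define _PAGE_USER 0x004ull
        #define _PAGE_NX   (1ull << 63)

        typedef struct { unsigned long long val; } pte_t;

        /* Fixed semantics from the hunk: exec == NX bit clear. */
        static int pte_exec(pte_t pte)     { return !(pte.val & _PAGE_NX); }
        static pte_t pte_mkexec(pte_t pte) { pte.val &= ~_PAGE_NX; return pte; }

        int main(void)
        {
                pte_t pte = { _PAGE_USER | _PAGE_NX };  /* user page, no-exec */
                printf("exec before: %d\n", pte_exec(pte));  /* 0 */
                pte = pte_mkexec(pte);
                printf("exec after:  %d\n", pte_exec(pte));  /* 1 */
                return 0;
        }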
11034 niro 612 --- a/include/asm-x86/mach-xen/asm/processor_32.h
11035     +++ b/include/asm-x86/mach-xen/asm/processor_32.h
11036 niro 609 @@ -146,6 +146,18 @@
11037     #define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */
11038     #define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */
11039    
11040     +static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
11041     + unsigned int *ecx, unsigned int *edx)
11042     +{
11043     + /* ecx is often an input as well as an output. */
11044     + __asm__(XEN_CPUID
11045     + : "=a" (*eax),
11046     + "=b" (*ebx),
11047     + "=c" (*ecx),
11048     + "=d" (*edx)
11049     + : "0" (*eax), "2" (*ecx));
11050     +}
11051     +
11052     /*
11053     * Generic CPUID function
11054     * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
11055     @@ -153,24 +165,18 @@
11056     */
11057     static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx)
11058     {
11059     - __asm__(XEN_CPUID
11060     - : "=a" (*eax),
11061     - "=b" (*ebx),
11062     - "=c" (*ecx),
11063     - "=d" (*edx)
11064     - : "0" (op), "c"(0));
11065     + *eax = op;
11066     + *ecx = 0;
11067     + __cpuid(eax, ebx, ecx, edx);
11068     }
11069    
11070     /* Some CPUID calls want 'count' to be placed in ecx */
11071     static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx,
11072     - int *edx)
11073     + int *edx)
11074     {
11075     - __asm__(XEN_CPUID
11076     - : "=a" (*eax),
11077     - "=b" (*ebx),
11078     - "=c" (*ecx),
11079     - "=d" (*edx)
11080     - : "0" (op), "c" (count));
11081     + *eax = op;
11082     + *ecx = count;
11083     + __cpuid(eax, ebx, ecx, edx);
11084     }
11085    
11086     /*
11087     @@ -178,42 +184,30 @@
11088     */
11089     static inline unsigned int cpuid_eax(unsigned int op)
11090     {
11091     - unsigned int eax;
11092     + unsigned int eax, ebx, ecx, edx;
11093    
11094     - __asm__(XEN_CPUID
11095     - : "=a" (eax)
11096     - : "0" (op)
11097     - : "bx", "cx", "dx");
11098     + cpuid(op, &eax, &ebx, &ecx, &edx);
11099     return eax;
11100     }
11101     static inline unsigned int cpuid_ebx(unsigned int op)
11102     {
11103     - unsigned int eax, ebx;
11104     + unsigned int eax, ebx, ecx, edx;
11105    
11106     - __asm__(XEN_CPUID
11107     - : "=a" (eax), "=b" (ebx)
11108     - : "0" (op)
11109     - : "cx", "dx" );
11110     + cpuid(op, &eax, &ebx, &ecx, &edx);
11111     return ebx;
11112     }
11113     static inline unsigned int cpuid_ecx(unsigned int op)
11114     {
11115     - unsigned int eax, ecx;
11116     + unsigned int eax, ebx, ecx, edx;
11117    
11118     - __asm__(XEN_CPUID
11119     - : "=a" (eax), "=c" (ecx)
11120     - : "0" (op)
11121     - : "bx", "dx" );
11122     + cpuid(op, &eax, &ebx, &ecx, &edx);
11123     return ecx;
11124     }
11125     static inline unsigned int cpuid_edx(unsigned int op)
11126     {
11127     - unsigned int eax, edx;
11128     + unsigned int eax, ebx, ecx, edx;
11129    
11130     - __asm__(XEN_CPUID
11131     - : "=a" (eax), "=d" (edx)
11132     - : "0" (op)
11133     - : "bx", "cx");
11134     + cpuid(op, &eax, &ebx, &ecx, &edx);
11135     return edx;
11136     }
11137    
11138     @@ -315,6 +309,8 @@
11139     : :"a" (eax), "c" (ecx));
11140     }
11141    
11142     +extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
11143     +
11144     /* from system description table in BIOS. Mostly for MCA use, but
11145     others may find it useful. */
11146     extern unsigned int machine_id;
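
The refactor above funnels every cpuid_* accessor through one __cpuid() primitive that treats eax/ecx as in-out operands, replacing four hand-written asm variants with differing clobber lists. The same shape works in plain user code; this sketch uses the native cpuid instruction rather than the patch's XEN_CPUID paravirt hook, so it is x86-only and an analogue, not the patched header:

        #include <stdio.h>

        /* One primitive: eax/ecx are inputs and outputs, ebx/edx outputs. */
        static void __cpuid(unsigned *eax, unsigned *ebx,
                            unsigned *ecx, unsigned *edx)
        {
                __asm__("cpuid"
                        : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
                        : "0" (*eax), "2" (*ecx));
        }

        /* Every convenience wrapper reduces to loads around the primitive. */
        static unsigned cpuid_ebx(unsigned op)
        {
                unsigned eax = op, ebx, ecx = 0, edx;
                __cpuid(&eax, &ebx, &ecx, &edx);
                return ebx;
        }

        int main(void)
        {
                unsigned eax = 0, ebx, ecx = 0, edx;
                __cpuid(&eax, &ebx, &ecx, &edx);  /* leaf 0: vendor string */
                printf("max leaf %u, vendor starts \"%.4s\", ebx=%08x\n",
                       eax, (char *)&ebx, cpuid_ebx(0));
                return 0;
        }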
11147 niro 612 --- a/include/asm-x86/mach-xen/asm/processor_64.h
11148     +++ b/include/asm-x86/mach-xen/asm/processor_64.h
11149 niro 609 @@ -484,6 +484,8 @@
11150     : :"a" (eax), "c" (ecx));
11151     }
11152    
11153     +extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
11154     +
11155     #define stack_current() \
11156     ({ \
11157     struct thread_info *ti; \
11158 niro 612 --- a/include/asm-x86/mach-xen/asm/segment_32.h
11159     +++ b/include/asm-x86/mach-xen/asm/segment_32.h
11160 niro 609 @@ -61,11 +61,9 @@
11161    
11162     #define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0)
11163     #define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
11164     -#define GET_KERNEL_CS() (__KERNEL_CS | (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1) )
11165    
11166     #define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1)
11167     #define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
11168     -#define GET_KERNEL_DS() (__KERNEL_DS | (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1) )
11169    
11170     #define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4)
11171     #define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5)
11172     @@ -85,6 +83,11 @@
11173    
11174     #define GDT_SIZE (GDT_ENTRIES * 8)
11175    
11176     +/* Matches __KERNEL_CS and __USER_CS (they must be 2 entries apart) */
11177     +#define SEGMENT_IS_FLAT_CODE(x) (((x) & 0xec) == GDT_ENTRY_KERNEL_CS * 8)
11178     +/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */
11179     +#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8)
11180     +
11181     /* Simple and small GDT entries for booting only */
11182    
11183     #define GDT_ENTRY_BOOT_CS 2
11184     @@ -114,4 +117,16 @@
11185     */
11186     #define IDT_ENTRIES 256
11187    
11188     +/* Bottom two bits of selector give the ring privilege level */
11189     +#define SEGMENT_RPL_MASK 0x3
11190     +/* Bit 2 is table indicator (LDT/GDT) */
11191     +#define SEGMENT_TI_MASK 0x4
11192     +
11193     +/* User mode is privilege level 3 */
11194     +#define USER_RPL 0x3
11195     +/* LDT segment has TI set, GDT has it cleared */
11196     +#define SEGMENT_LDT 0x4
11197     +#define SEGMENT_GDT 0x0
11198     +
11199     +#define get_kernel_rpl() (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1)
11200     #endif
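
The new SEGMENT_RPL_MASK/SEGMENT_TI_MASK constants above encode how an x86 selector is parsed: bits 0-1 hold the requested privilege level and bit 2 picks LDT versus GDT, which is also why get_kernel_rpl() evaluates to 1 for a Xen guest kernel running in ring 1. A quick decoding example using the same masks:

        #include <stdio.h>

        #define SEGMENT_RPL_MASK 0x3  /* bottom two bits: privilege level */
        #define SEGMENT_TI_MASK  0x4  /* bit 2: table indicator, 1 = LDT */
        #define USER_RPL         0x3

        int main(void)
        {
                unsigned short sel = 0x73;  /* a typical user-mode selector */
                printf("index=%u table=%s rpl=%u user=%d\n",
                       sel >> 3,
                       (sel & SEGMENT_TI_MASK) ? "LDT" : "GDT",
                       sel & SEGMENT_RPL_MASK,
                       (sel & SEGMENT_RPL_MASK) == USER_RPL);
                return 0;
        }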
11201 niro 612 --- a/include/asm-x86/mach-xen/asm/smp_32.h
11202     +++ b/include/asm-x86/mach-xen/asm/smp_32.h
11203 niro 609 @@ -79,25 +79,36 @@
11204     return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID));
11205     }
11206     #endif
11207     -
11208     -static __inline int logical_smp_processor_id(void)
11209     -{
11210     - /* we don't want to mark this access volatile - bad code generation */
11211     - return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
11212     -}
11213     -
11214     #endif
11215    
11216     +#define safe_smp_processor_id() smp_processor_id()
11217     extern int __cpu_disable(void);
11218     extern void __cpu_die(unsigned int cpu);
11219     extern void prefill_possible_map(void);
11220     +extern unsigned int num_processors;
11221     +
11222     #endif /* !__ASSEMBLY__ */
11223    
11224     #else /* CONFIG_SMP */
11225    
11226     +#define safe_smp_processor_id() 0
11227     #define cpu_physical_id(cpu) boot_cpu_physical_apicid
11228    
11229     #define NO_PROC_ID 0xFF /* No processor magic marker */
11230    
11231     #endif
11232     +
11233     +#ifndef __ASSEMBLY__
11234     +
11235     +extern u8 apicid_2_node[];
11236     +
11237     +#ifdef CONFIG_X86_LOCAL_APIC
11238     +static __inline int logical_smp_processor_id(void)
11239     +{
11240     + /* we don't want to mark this access volatile - bad code generation */
11241     + return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
11242     +}
11243     +#endif
11244     +#endif
11245     +
11246     #endif
--- a/include/asm-x86/mach-xen/asm/smp_64.h
+++ b/include/asm-x86/mach-xen/asm/smp_64.h
@@ -4,15 +4,12 @@
 /*
  * We need the APIC definitions automatically as part of 'smp.h'
  */
-#ifndef __ASSEMBLY__
 #include <linux/threads.h>
 #include <linux/cpumask.h>
 #include <linux/bitops.h>
 extern int disable_apic;
-#endif

 #ifdef CONFIG_X86_LOCAL_APIC
-#ifndef __ASSEMBLY__
 #include <asm/fixmap.h>
 #include <asm/mpspec.h>
 #ifdef CONFIG_X86_IO_APIC
@@ -21,10 +18,8 @@
 #include <asm/apic.h>
 #include <asm/thread_info.h>
 #endif
-#endif

 #ifdef CONFIG_SMP
-#ifndef ASSEMBLY

 #include <asm/pda.h>

@@ -41,14 +36,11 @@

 extern void smp_alloc_memory(void);
 extern volatile unsigned long smp_invalidate_needed;
-extern int pic_mode;
 extern void lock_ipi_call_lock(void);
 extern void unlock_ipi_call_lock(void);
 extern int smp_num_siblings;
 extern void smp_send_reschedule(int cpu);
 void smp_stop_cpu(void);
-extern int smp_call_function_single(int cpuid, void (*func) (void *info),
-				void *info, int retry, int wait);

 extern cpumask_t cpu_sibling_map[NR_CPUS];
 extern cpumask_t cpu_core_map[NR_CPUS];
@@ -77,20 +69,16 @@
 }
 #endif

-extern int safe_smp_processor_id(void);
 extern int __cpu_disable(void);
 extern void __cpu_die(unsigned int cpu);
 extern void prefill_possible_map(void);
 extern unsigned num_processors;
 extern unsigned disabled_cpus;

-#endif /* !ASSEMBLY */
-
 #define NO_PROC_ID	0xFF	/* No processor magic marker */

 #endif

-#ifndef ASSEMBLY
 /*
  * Some lowlevel functions might want to know about
  * the real APIC ID <-> CPU # mapping.
@@ -114,11 +102,8 @@
 }
 #endif

-#endif /* !ASSEMBLY */
-
 #ifndef CONFIG_SMP
 #define stack_smp_processor_id() 0
-#define safe_smp_processor_id() 0
 #define cpu_logical_map(x) (x)
 #else
 #include <asm/thread_info.h>
@@ -130,7 +115,6 @@
 })
 #endif

-#ifndef __ASSEMBLY__
 #ifdef CONFIG_X86_LOCAL_APIC
 static __inline int logical_smp_processor_id(void)
 {
@@ -138,13 +122,18 @@
 	return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
 }
 #endif
-#endif

 #ifdef CONFIG_SMP
 #define cpu_physical_id(cpu) x86_cpu_to_apicid[cpu]
 #else
 #define cpu_physical_id(cpu) boot_cpu_id
-#endif
-
+static inline int smp_call_function_single(int cpuid, void (*func) (void *info),
+					void *info, int retry, int wait)
+{
+	/* Disable interrupts here? */
+	func(info);
+	return 0;
+}
+#endif /* !CONFIG_SMP */
 #endif

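The UP replacement above turns smp_call_function_single() into a static
inline that simply runs the callback on the local CPU, so callers need no
CONFIG_SMP guards. A minimal caller sketch under that assumption follows;
bump_counter() and bump_on() are hypothetical, not part of this patch:

    /* Sketch only: bump_counter() is a hypothetical callback. */
    static void bump_counter(void *info)
    {
        unsigned long *counter = info;
        (*counter)++;
    }

    /* On a !CONFIG_SMP kernel the inline stub above makes this a plain
     * direct call that returns 0; cpu, retry and wait are effectively
     * ignored. */
    static int bump_on(int cpu, unsigned long *counter)
    {
        return smp_call_function_single(cpu, bump_counter, counter, 0, 1);
    }
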
--- a/include/asm-x86/mach-xen/asm/system_32.h
+++ b/include/asm-x86/mach-xen/asm/system_32.h
@@ -267,6 +267,9 @@
 #define cmpxchg(ptr,o,n)\
 	((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
 					(unsigned long)(n),sizeof(*(ptr))))
+#define sync_cmpxchg(ptr,o,n)\
+	((__typeof__(*(ptr)))__sync_cmpxchg((ptr),(unsigned long)(o),\
+					(unsigned long)(n),sizeof(*(ptr))))
 #endif

 static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
@@ -291,6 +294,39 @@
 		: "=a"(prev)
 		: "r"(new), "m"(*__xg(ptr)), "0"(old)
 		: "memory");
+		return prev;
+	}
+	return old;
+}
+
+/*
+ * Always use locked operations when touching memory shared with a
+ * hypervisor, since the system may be SMP even if the guest kernel
+ * isn't.
+ */
+static inline unsigned long __sync_cmpxchg(volatile void *ptr,
+					unsigned long old,
+					unsigned long new, int size)
+{
+	unsigned long prev;
+	switch (size) {
+	case 1:
+		__asm__ __volatile__("lock; cmpxchgb %b1,%2"
+				: "=a"(prev)
+				: "q"(new), "m"(*__xg(ptr)), "0"(old)
+				: "memory");
+		return prev;
+	case 2:
+		__asm__ __volatile__("lock; cmpxchgw %w1,%2"
+				: "=a"(prev)
+				: "r"(new), "m"(*__xg(ptr)), "0"(old)
+				: "memory");
+		return prev;
+	case 4:
+		__asm__ __volatile__("lock; cmpxchgl %1,%2"
+				: "=a"(prev)
+				: "r"(new), "m"(*__xg(ptr)), "0"(old)
+				: "memory");
 		return prev;
 	}
 	return old;
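sync_cmpxchg() pairs with the comment above: unlike plain cmpxchg(), it
keeps the lock prefix even in a uniprocessor guest, because the hypervisor
or another domain may touch the same memory concurrently on a real SMP
host. A minimal usage sketch, assuming a hypothetical flag word in a page
shared with the hypervisor:

    /* Sketch only: atomically set bit 0 of a hypothetical shared word. */
    static int claim_shared_flag(unsigned int *shared_flags)
    {
        unsigned int old = *shared_flags;

        /* Locked even on UP: the other side runs concurrently. */
        if (sync_cmpxchg(shared_flags, old, old | 1u) == old)
            return 1;    /* our update won */
        return 0;        /* raced; caller may reread and retry */
    }
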
--- a/include/asm-x86/mach-xen/asm/system_64.h
+++ b/include/asm-x86/mach-xen/asm/system_64.h
@@ -24,6 +24,7 @@
 #define __EXTRA_CLOBBER \
 	,"rcx","rbx","rdx","r8","r9","r10","r11","r12","r13","r14","r15"

+/* Save and restore flags across the switch, to clear a leaking NT flag */
 #define switch_to(prev,next,last) \
 	asm volatile(SAVE_CONTEXT \
 	"movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
--- a/include/asm-x86/mach-xen/asm/tlbflush_32.h
+++ b/include/asm-x86/mach-xen/asm/tlbflush_32.h
@@ -8,8 +8,6 @@
 #define __flush_tlb_global() xen_tlb_flush()
 #define __flush_tlb_all() xen_tlb_flush()

-extern unsigned long pgkern_mask;
-
 #define cpu_has_invlpg	(boot_cpu_data.x86 > 3)

 #define __flush_tlb_single(addr) xen_invlpg(addr)
--- a/include/asm-x86/mach-xen/asm/tlbflush_64.h
+++ b/include/asm-x86/mach-xen/asm/tlbflush_64.h
@@ -12,9 +12,6 @@
  */
 #define __flush_tlb_global() xen_tlb_flush()

-
-extern unsigned long pgkern_mask;
-
 #define __flush_tlb_all() __flush_tlb_global()

 #define __flush_tlb_one(addr) xen_invlpg((unsigned long)addr)
--- a/include/asm-x86/thread_info_64.h
+++ b/include/asm-x86/thread_info_64.h
@@ -157,10 +157,14 @@
 	(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY|_TIF_HRTICK_RESCHED)

 /* flags to check in __switch_to() */
+#ifndef CONFIG_XEN
 #define _TIF_WORK_CTXSW \
 	(_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_BTS_TRACE_TS)
 #define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
 #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
+#else
+#define _TIF_WORK_CTXSW _TIF_DEBUG
+#endif

 #define PREEMPT_ACTIVE	0x10000000

--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1821,5 +1821,12 @@
 }

 bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off);
+
+#ifdef CONFIG_XEN
+int skb_checksum_setup(struct sk_buff *skb);
+#else
+static inline int skb_checksum_setup(struct sk_buff *skb) { return 0; }
+#endif
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_SKBUFF_H */
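With the !CONFIG_XEN stub returning 0, callers can invoke
skb_checksum_setup() unconditionally and the call compiles away on non-Xen
kernels. A hedged sketch of that calling pattern; inject_guest_skb() is an
illustrative name, not a function this patch adds:

    /* Sketch only: validate checksum metadata before injecting an skb. */
    static int inject_guest_skb(struct sk_buff *skb)
    {
        int err = skb_checksum_setup(skb);    /* always 0 on !CONFIG_XEN */

        if (err) {
            kfree_skb(skb);                   /* bad checksum offsets */
            return err;
        }
        netif_rx(skb);
        return 0;
    }
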
--- a/include/xen/evtchn.h
+++ b/include/xen/evtchn.h
@@ -54,34 +54,34 @@
  */
 int bind_caller_port_to_irqhandler(
 	unsigned int caller_port,
-	irqreturn_t (*handler)(int, void *, struct pt_regs *),
+	irq_handler_t handler,
 	unsigned long irqflags,
 	const char *devname,
 	void *dev_id);
 int bind_listening_port_to_irqhandler(
 	unsigned int remote_domain,
-	irqreturn_t (*handler)(int, void *, struct pt_regs *),
+	irq_handler_t handler,
 	unsigned long irqflags,
 	const char *devname,
 	void *dev_id);
 int bind_interdomain_evtchn_to_irqhandler(
 	unsigned int remote_domain,
 	unsigned int remote_port,
-	irqreturn_t (*handler)(int, void *, struct pt_regs *),
+	irq_handler_t handler,
 	unsigned long irqflags,
 	const char *devname,
 	void *dev_id);
 int bind_virq_to_irqhandler(
 	unsigned int virq,
 	unsigned int cpu,
-	irqreturn_t (*handler)(int, void *, struct pt_regs *),
+	irq_handler_t handler,
 	unsigned long irqflags,
 	const char *devname,
 	void *dev_id);
 int bind_ipi_to_irqhandler(
 	unsigned int ipi,
 	unsigned int cpu,
-	irqreturn_t (*handler)(int, void *, struct pt_regs *),
+	irq_handler_t handler,
 	unsigned long irqflags,
 	const char *devname,
 	void *dev_id);
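All five binding helpers move to the irq_handler_t typedef because 2.6.19
dropped the struct pt_regs argument from interrupt handlers. A handler
registered through these helpers now looks like the sketch below (the
handler name and registration call are hypothetical); code that still
needs the register state can call get_irq_regs() instead:

    /* Sketch only: an event-channel handler in the two-argument form. */
    static irqreturn_t my_evtchn_handler(int irq, void *dev_id)
    {
        /* dev_id is the pointer passed at bind time */
        return IRQ_HANDLED;
    }

    /* Hypothetical registration:
     *   bind_virq_to_irqhandler(VIRQ_TIMER, cpu, my_evtchn_handler,
     *                           0, "my-evtchn", dev_id);
     */
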
--- a/include/xen/xencons.h
+++ b/include/xen/xencons.h
@@ -8,7 +8,7 @@
 void xencons_resume(void);

 /* Interrupt work hooks. Receive data, or kick data out. */
-void xencons_rx(char *buf, unsigned len, struct pt_regs *regs);
+void xencons_rx(char *buf, unsigned len);
 void xencons_tx(void);

 int xencons_ring_init(void);
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -86,7 +86,7 @@
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		if (arch_change_pte_range(mm, pmd, addr, next, newprot))
+		if (arch_change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable))
 			continue;
 		change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable);
 	} while (pmd++, addr = next, addr != end);
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1611,15 +1611,14 @@
 	}
 	if ((skb->h.raw + skb->csum + 2) > skb->tail)
 		goto out;
-	skb->ip_summed = CHECKSUM_HW;
+	skb->ip_summed = CHECKSUM_PARTIAL;
 	skb->proto_csum_blank = 0;
 	}
 	return 0;
 out:
 	return -EPROTO;
 }
-#else
-inline int skb_checksum_setup(struct sk_buff *skb) { return 0; }
+EXPORT_SYMBOL(skb_checksum_setup);
 #endif

 /**
@@ -2115,7 +2114,7 @@
 	case CHECKSUM_UNNECESSARY:
 		skb->proto_data_valid = 1;
 		break;
-	case CHECKSUM_HW:
+	case CHECKSUM_PARTIAL:
 		/* XXX Implement me. */
 	default:
 		skb->proto_data_valid = 0;
@@ -4648,7 +4647,6 @@
 EXPORT_SYMBOL(net_enable_timestamp);
 EXPORT_SYMBOL(net_disable_timestamp);
 EXPORT_SYMBOL(dev_get_flags);
-EXPORT_SYMBOL(skb_checksum_setup);

 #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
 EXPORT_SYMBOL(br_handle_frame_hook);
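The CHECKSUM_HW conversions above track 2.6.19's split of that value into
CHECKSUM_PARTIAL (transmit: the checksum still has to be finished, starting
at skb->h.raw, with the 16-bit result stored at offset skb->csum) and
CHECKSUM_COMPLETE (receive). The bounds test in the first hunk can be read
as the following condensed sketch; the helper name is illustrative:

    /* Sketch only: the 2-byte checksum slot at h.raw + csum must lie
     * inside the buffer before the skb may be marked CHECKSUM_PARTIAL. */
    static int csum_slot_in_bounds(const struct sk_buff *skb)
    {
        return skb->h.raw + skb->csum + 2 <= skb->tail;
    }
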