Magellan Linux

Annotation of /trunk/kernel26-xen/patches-2.6.25-r1/1023-2.6.25-xen-patch-2.6.22.patch



Revision 609
Fri May 23 17:35:37 2008 UTC by niro
File size: 212197 bytes
- using openSUSE Xen patchset, updated kernel configs

1 niro 609 From: www.kernel.org
2     Subject: Update to 2.6.22
3     Patch-mainline: 2.6.22
4    
5     Automatically created from "patches.kernel.org/patch-2.6.22" by xen-port-patches.py
6    
7     Acked-by: jbeulich@novell.com
8    
9     ---
10     arch/x86/Kconfig | 5
11     arch/x86/ia32/ia32entry-xen.S | 18 -
12     arch/x86/kernel/Makefile | 2
13     arch/x86/kernel/acpi/sleep_64-xen.c | 26 -
14     arch/x86/kernel/apic_32-xen.c | 1
15     arch/x86/kernel/apic_64-xen.c | 1
16     arch/x86/kernel/cpu/common-xen.c | 224 ++++---------
17     arch/x86/kernel/cpu/mtrr/main-xen.c | 2
18     arch/x86/kernel/e820_32-xen.c | 46 +-
19     arch/x86/kernel/e820_64-xen.c | 28 -
20     arch/x86/kernel/early_printk-xen.c | 27 -
21     arch/x86/kernel/entry_32-xen.S | 30 -
22     arch/x86/kernel/entry_64-xen.S | 7
23     arch/x86/kernel/genapic_64-xen.c | 106 +-----
24     arch/x86/kernel/genapic_xen_64.c | 3
25     arch/x86/kernel/head64-xen.c | 32 +
26     arch/x86/kernel/head_32-xen.S | 101 ------
27     arch/x86/kernel/head_64-xen.S | 37 --
28     arch/x86/kernel/io_apic_32-xen.c | 43 --
29     arch/x86/kernel/io_apic_64-xen.c | 39 --
30     arch/x86/kernel/ioport_32-xen.c | 2
31     arch/x86/kernel/ioport_64-xen.c | 2
32     arch/x86/kernel/irq_32-xen.c | 3
33     arch/x86/kernel/irq_64-xen.c | 34 +-
34     arch/x86/kernel/ldt_32-xen.c | 1
35     arch/x86/kernel/ldt_64-xen.c | 1
36     arch/x86/kernel/microcode-xen.c | 2
37     arch/x86/kernel/mpparse_32-xen.c | 3
38     arch/x86/kernel/mpparse_64-xen.c | 3
39     arch/x86/kernel/pci-dma_32-xen.c | 29 +
40     arch/x86/kernel/pci-swiotlb_64-xen.c | 2
41     arch/x86/kernel/process_32-xen.c | 27 +
42     arch/x86/kernel/process_64-xen.c | 16
43     arch/x86/kernel/quirks-xen.c | 63 ---
44     arch/x86/kernel/setup64-xen.c | 17 -
45     arch/x86/kernel/setup_64-xen.c | 30 -
46     arch/x86/kernel/smp_32-xen.c | 191 ++++-------
47     arch/x86/kernel/smp_64-xen.c | 29 -
48     arch/x86/kernel/time_32-xen.c | 62 +--
49     arch/x86/kernel/traps_32-xen.c | 46 +-
50     arch/x86/kernel/traps_64-xen.c | 55 +--
51     arch/x86/kernel/vsyscall_64-xen.c | 73 +++-
52     arch/x86/mm/fault_32-xen.c | 42 +-
53     arch/x86/mm/fault_64-xen.c | 15
54     arch/x86/mm/highmem_32-xen.c | 14
55     arch/x86/mm/init_32-xen.c | 157 ++++++---
56     arch/x86/mm/init_64-xen.c | 132 ++++---
57     arch/x86/mm/ioremap_32-xen.c | 1
58     arch/x86/mm/pageattr_64-xen.c | 27 +
59     arch/x86/mm/pgtable_32-xen.c | 210 +++++++-----
60     drivers/char/tpm/tpm_xen.c | 2
61     drivers/xen/blkfront/blkfront.c | 2
62     drivers/xen/char/mem.c | 1
63     drivers/xen/core/hypervisor_sysfs.c | 2
64     drivers/xen/core/smpboot.c | 49 +-
65     drivers/xen/core/xen_sysfs.c | 20 -
66     drivers/xen/netback/netback.c | 14
67     drivers/xen/netfront/netfront.c | 2
68     drivers/xen/pciback/xenbus.c | 2
69     drivers/xen/pcifront/xenbus.c | 4
70     drivers/xen/sfc_netback/accel_fwd.c | 7
71     drivers/xen/sfc_netback/accel_solarflare.c | 2
72     drivers/xen/sfc_netfront/accel_tso.c | 28 -
73     drivers/xen/sfc_netfront/accel_vi.c | 4
74     drivers/xen/sfc_netfront/accel_xenbus.c | 4
75     drivers/xen/xenoprof/xenoprofile.c | 2
76     fs/aio.c | 7
77     include/asm-x86/mach-xen/asm/desc_32.h | 119 ++++---
78     include/asm-x86/mach-xen/asm/desc_64.h | 30 -
79     include/asm-x86/mach-xen/asm/dma-mapping_64.h | 2
80     include/asm-x86/mach-xen/asm/fixmap_32.h | 9
81     include/asm-x86/mach-xen/asm/fixmap_64.h | 1
82     include/asm-x86/mach-xen/asm/highmem.h | 6
83     include/asm-x86/mach-xen/asm/io_32.h | 13
84     include/asm-x86/mach-xen/asm/irqflags_32.h | 78 ++--
85     include/asm-x86/mach-xen/asm/irqflags_64.h | 19 -
86     include/asm-x86/mach-xen/asm/mmu.h | 8
87     include/asm-x86/mach-xen/asm/mmu_64.h | 8
88     include/asm-x86/mach-xen/asm/mmu_context_32.h | 29 +
89     include/asm-x86/mach-xen/asm/mmu_context_64.h | 3
90     include/asm-x86/mach-xen/asm/page_64.h | 61 +--
91     include/asm-x86/mach-xen/asm/pgalloc_32.h | 3
92     include/asm-x86/mach-xen/asm/pgalloc_64.h | 15
93     include/asm-x86/mach-xen/asm/pgtable-2level.h | 43 +-
94     include/asm-x86/mach-xen/asm/pgtable-3level-defs.h | 2
95     include/asm-x86/mach-xen/asm/pgtable-3level.h | 61 ++-
96     include/asm-x86/mach-xen/asm/pgtable_32.h | 80 ++--
97     include/asm-x86/mach-xen/asm/pgtable_64.h | 83 ++---
98     include/asm-x86/mach-xen/asm/processor_32.h | 141 +++-----
99     include/asm-x86/mach-xen/asm/processor_64.h | 55 ---
100     include/asm-x86/mach-xen/asm/scatterlist_32.h | 2
101     include/asm-x86/mach-xen/asm/segment_32.h | 10
102     include/asm-x86/mach-xen/asm/smp_32.h | 117 +++++--
103     include/asm-x86/mach-xen/asm/smp_64.h | 20 -
104     include/asm-x86/mach-xen/asm/system_32.h | 348 ++++-----------------
105     include/asm-x86/mach-xen/asm/system_64.h | 106 ------
106     include/asm-x86/mach-xen/asm/tlbflush_32.h | 11
107     include/asm-x86/mach-xen/asm/tlbflush_64.h | 2
108     lib/swiotlb-xen.c | 1
109     net/core/dev.c | 15
110     scripts/Makefile.xen.awk | 2
111     101 files changed, 1642 insertions(+), 2080 deletions(-)
112    
113     --- a/arch/x86/Kconfig
114     +++ b/arch/x86/Kconfig
115     @@ -1222,7 +1222,7 @@
116    
117     config RELOCATABLE
118     bool "Build a relocatable kernel (EXPERIMENTAL)"
119     - depends on EXPERIMENTAL && !X86_XEN
120     + depends on EXPERIMENTAL && !X86_XEN && !X86_64_XEN
121     help
122     This builds a kernel image that retains relocation information
123     so it can be loaded someplace besides the default 1MB.
124     @@ -1276,7 +1276,6 @@
125     def_bool y
126     prompt "Compat VDSO support"
127     depends on X86_32 || IA32_EMULATION
128     - depends on !X86_XEN
129     help
130     Map the 32-bit VDSO to the predictable old-style address too.
131     ---help---
132     @@ -1453,7 +1452,7 @@
133     bool "PCI support" if !X86_VISWS
134     depends on !X86_VOYAGER
135     default y
136     - select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC)
137     + select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC && !X86_XEN && !X86_64_XEN)
138     help
139     Find out whether you have a PCI motherboard. PCI is the name of a
140     bus system, i.e. the way the CPU talks to the other stuff inside
141     --- a/arch/x86/ia32/ia32entry-xen.S
142     +++ b/arch/x86/ia32/ia32entry-xen.S
143     @@ -431,11 +431,7 @@
144     .quad sys_symlink
145     .quad sys_lstat
146     .quad sys_readlink /* 85 */
147     -#ifdef CONFIG_IA32_AOUT
148     .quad sys_uselib
149     -#else
150     - .quad quiet_ni_syscall
151     -#endif
152     .quad sys_swapon
153     .quad sys_reboot
154     .quad compat_sys_old_readdir
155     @@ -574,7 +570,7 @@
156     .quad quiet_ni_syscall /* tux */
157     .quad quiet_ni_syscall /* security */
158     .quad sys_gettid
159     - .quad sys_readahead /* 225 */
160     + .quad sys32_readahead /* 225 */
161     .quad sys_setxattr
162     .quad sys_lsetxattr
163     .quad sys_fsetxattr
164     @@ -599,7 +595,7 @@
165     .quad compat_sys_io_getevents
166     .quad compat_sys_io_submit
167     .quad sys_io_cancel
168     - .quad sys_fadvise64 /* 250 */
169     + .quad sys32_fadvise64 /* 250 */
170     .quad quiet_ni_syscall /* free_huge_pages */
171     .quad sys_exit_group
172     .quad sys32_lookup_dcookie
173     @@ -663,10 +659,14 @@
174     .quad compat_sys_set_robust_list
175     .quad compat_sys_get_robust_list
176     .quad sys_splice
177     - .quad sys_sync_file_range
178     - .quad sys_tee
179     + .quad sys32_sync_file_range
180     + .quad sys_tee /* 315 */
181     .quad compat_sys_vmsplice
182     .quad compat_sys_move_pages
183     .quad sys_getcpu
184     .quad sys_epoll_pwait
185     -ia32_syscall_end:
186     + .quad compat_sys_utimensat /* 320 */
187     + .quad compat_sys_signalfd
188     + .quad compat_sys_timerfd
189     + .quad sys_eventfd
190     +ia32_syscall_end:
191     --- a/arch/x86/kernel/Makefile
192     +++ b/arch/x86/kernel/Makefile
193     @@ -106,4 +106,4 @@
194    
195     disabled-obj-$(CONFIG_XEN) := early-quirks.o hpet.o i8253.o i8259_$(BITS).o reboot.o \
196     smpboot_$(BITS).o tsc_$(BITS).o tsc_sync.o
197     -%/head_$(BITS).o %/head_$(BITS).s: $(if $(CONFIG_XEN),EXTRA_AFLAGS,dummy) :=
198     +%/head_64.o %/head_64.s: $(if $(CONFIG_XEN),EXTRA_AFLAGS,dummy) :=
199     --- a/arch/x86/kernel/acpi/sleep_64-xen.c
200     +++ b/arch/x86/kernel/acpi/sleep_64-xen.c
201     @@ -60,19 +60,6 @@
202     extern char wakeup_start, wakeup_end;
203    
204     extern unsigned long acpi_copy_wakeup_routine(unsigned long);
205     -
206     -static pgd_t low_ptr;
207     -
208     -static void init_low_mapping(void)
209     -{
210     - pgd_t *slot0 = pgd_offset(current->mm, 0UL);
211     - low_ptr = *slot0;
212     - /* FIXME: We're playing with the current task's page tables here, which
213     - * is potentially dangerous on SMP systems.
214     - */
215     - set_pgd(slot0, *pgd_offset(current->mm, PAGE_OFFSET));
216     - local_flush_tlb();
217     -}
218     #endif
219    
220     /**
221     @@ -84,8 +71,6 @@
222     int acpi_save_state_mem(void)
223     {
224     #ifndef CONFIG_ACPI_PV_SLEEP
225     - init_low_mapping();
226     -
227     memcpy((void *)acpi_wakeup_address, &wakeup_start,
228     &wakeup_end - &wakeup_start);
229     acpi_copy_wakeup_routine(acpi_wakeup_address);
230     @@ -98,10 +83,6 @@
231     */
232     void acpi_restore_state_mem(void)
233     {
234     -#ifndef CONFIG_ACPI_PV_SLEEP
235     - set_pgd(pgd_offset(current->mm, 0UL), low_ptr);
236     - local_flush_tlb();
237     -#endif
238     }
239    
240     /**
241     @@ -115,10 +96,11 @@
242     void __init acpi_reserve_bootmem(void)
243     {
244     #ifndef CONFIG_ACPI_PV_SLEEP
245     - acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE);
246     - if ((&wakeup_end - &wakeup_start) > PAGE_SIZE)
247     + acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2);
248     + if ((&wakeup_end - &wakeup_start) > (PAGE_SIZE*2))
249     printk(KERN_CRIT
250     - "ACPI: Wakeup code way too big, will crash on attempt to suspend\n");
251     + "ACPI: Wakeup code way too big, will crash on attempt"
252     + " to suspend\n");
253     #endif
254     }
255    
256     --- a/arch/x86/kernel/apic_32-xen.c
257     +++ b/arch/x86/kernel/apic_32-xen.c
258     @@ -19,7 +19,6 @@
259     #include <linux/mm.h>
260     #include <linux/delay.h>
261     #include <linux/bootmem.h>
262     -#include <linux/smp_lock.h>
263     #include <linux/interrupt.h>
264     #include <linux/mc146818rtc.h>
265     #include <linux/kernel_stat.h>
266     --- a/arch/x86/kernel/apic_64-xen.c
267     +++ b/arch/x86/kernel/apic_64-xen.c
268     @@ -19,7 +19,6 @@
269     #include <linux/mm.h>
270     #include <linux/delay.h>
271     #include <linux/bootmem.h>
272     -#include <linux/smp_lock.h>
273     #include <linux/interrupt.h>
274     #include <linux/mc146818rtc.h>
275     #include <linux/kernel_stat.h>
276     --- a/arch/x86/kernel/cpu/common-xen.c
277     +++ b/arch/x86/kernel/cpu/common-xen.c
278     @@ -22,16 +22,40 @@
279     #define phys_pkg_id(a,b) a
280     #endif
281     #endif
282     -#include <asm/pda.h>
283     #include <asm/hypervisor.h>
284    
285     #include "cpu.h"
286    
287     -DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
288     -EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr);
289     +DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
290     + [GDT_ENTRY_KERNEL_CS] = { 0x0000ffff, 0x00cf9a00 },
291     + [GDT_ENTRY_KERNEL_DS] = { 0x0000ffff, 0x00cf9200 },
292     + [GDT_ENTRY_DEFAULT_USER_CS] = { 0x0000ffff, 0x00cffa00 },
293     + [GDT_ENTRY_DEFAULT_USER_DS] = { 0x0000ffff, 0x00cff200 },
294     +#ifndef CONFIG_XEN
295     + /*
296     + * Segments used for calling PnP BIOS have byte granularity.
297     + * They code segments and data segments have fixed 64k limits,
298     + * the transfer segment sizes are set at run time.
299     + */
300     + [GDT_ENTRY_PNPBIOS_CS32] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
301     + [GDT_ENTRY_PNPBIOS_CS16] = { 0x0000ffff, 0x00009a00 },/* 16-bit code */
302     + [GDT_ENTRY_PNPBIOS_DS] = { 0x0000ffff, 0x00009200 }, /* 16-bit data */
303     + [GDT_ENTRY_PNPBIOS_TS1] = { 0x00000000, 0x00009200 },/* 16-bit data */
304     + [GDT_ENTRY_PNPBIOS_TS2] = { 0x00000000, 0x00009200 },/* 16-bit data */
305     + /*
306     + * The APM segments have byte granularity and their bases
307     + * are set at run time. All have 64k limits.
308     + */
309     + [GDT_ENTRY_APMBIOS_BASE] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
310     + /* 16-bit code */
311     + [GDT_ENTRY_APMBIOS_BASE+1] = { 0x0000ffff, 0x00009a00 },
312     + [GDT_ENTRY_APMBIOS_BASE+2] = { 0x0000ffff, 0x00409200 }, /* data */
313    
314     -struct i386_pda *_cpu_pda[NR_CPUS] __read_mostly;
315     -EXPORT_SYMBOL(_cpu_pda);
316     + [GDT_ENTRY_ESPFIX_SS] = { 0x00000000, 0x00c09200 },
317     +#endif
318     + [GDT_ENTRY_PERCPU] = { 0x00000000, 0x00000000 },
319     +} };
320     +EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
321    
322     static int cachesize_override __cpuinitdata = -1;
323     static int disable_x86_fxsr __cpuinitdata;
324     @@ -373,7 +397,7 @@
325     /*
326     * This does the hard work of actually picking apart the CPU stuff...
327     */
328     -void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
329     +static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
330     {
331     int i;
332    
333     @@ -484,15 +508,22 @@
334    
335     /* Init Machine Check Exception if available. */
336     mcheck_init(c);
337     +}
338    
339     - if (c == &boot_cpu_data)
340     - sysenter_setup();
341     +void __init identify_boot_cpu(void)
342     +{
343     + identify_cpu(&boot_cpu_data);
344     + sysenter_setup();
345     enable_sep_cpu();
346     + mtrr_bp_init();
347     +}
348    
349     - if (c == &boot_cpu_data)
350     - mtrr_bp_init();
351     - else
352     - mtrr_ap_init();
353     +void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
354     +{
355     + BUG_ON(c == &boot_cpu_data);
356     + identify_cpu(c);
357     + enable_sep_cpu();
358     + mtrr_ap_init();
359     }
360    
361     #ifdef CONFIG_X86_HT
362     @@ -606,136 +637,47 @@
363     #endif
364     }
365    
366     -/* Make sure %gs is initialized properly in idle threads */
367     +/* Make sure %fs is initialized properly in idle threads */
368     struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
369     {
370     memset(regs, 0, sizeof(struct pt_regs));
371     - regs->xfs = __KERNEL_PDA;
372     + regs->xfs = __KERNEL_PERCPU;
373     return regs;
374     }
375    
376     -static __cpuinit int alloc_gdt(int cpu)
377     +/* Current gdt points %fs at the "master" per-cpu area: after this,
378     + * it's on the real one. */
379     +void switch_to_new_gdt(void)
380     {
381     - struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
382     - struct desc_struct *gdt;
383     - struct i386_pda *pda;
384     -
385     - gdt = (struct desc_struct *)cpu_gdt_descr->address;
386     - pda = cpu_pda(cpu);
387     -
388     - /*
389     - * This is a horrible hack to allocate the GDT. The problem
390     - * is that cpu_init() is called really early for the boot CPU
391     - * (and hence needs bootmem) but much later for the secondary
392     - * CPUs, when bootmem will have gone away
393     - */
394     - if (NODE_DATA(0)->bdata->node_bootmem_map) {
395     - BUG_ON(gdt != NULL || pda != NULL);
396     -
397     - gdt = alloc_bootmem_pages(PAGE_SIZE);
398     - pda = alloc_bootmem(sizeof(*pda));
399     - /* alloc_bootmem(_pages) panics on failure, so no check */
400     -
401     - memset(gdt, 0, PAGE_SIZE);
402     - memset(pda, 0, sizeof(*pda));
403     - } else {
404     - /* GDT and PDA might already have been allocated if
405     - this is a CPU hotplug re-insertion. */
406     - if (gdt == NULL)
407     - gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL);
408     -
409     - if (pda == NULL)
410     - pda = kmalloc_node(sizeof(*pda), GFP_KERNEL, cpu_to_node(cpu));
411     -
412     - if (unlikely(!gdt || !pda)) {
413     - free_pages((unsigned long)gdt, 0);
414     - kfree(pda);
415     - return 0;
416     - }
417     - }
418     -
419     - cpu_gdt_descr->address = (unsigned long)gdt;
420     - cpu_pda(cpu) = pda;
421     -
422     - return 1;
423     -}
424     -
425     -/* Initial PDA used by boot CPU */
426     -struct i386_pda boot_pda = {
427     - ._pda = &boot_pda,
428     - .cpu_number = 0,
429     - .pcurrent = &init_task,
430     -};
431     -
432     -static inline void set_kernel_fs(void)
433     -{
434     - /* Set %fs for this CPU's PDA. Memory clobber is to create a
435     - barrier with respect to any PDA operations, so the compiler
436     - doesn't move any before here. */
437     - asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_PDA) : "memory");
438     -}
439     -
440     -/* Initialize the CPU's GDT and PDA. The boot CPU does this for
441     - itself, but secondaries find this done for them. */
442     -__cpuinit int init_gdt(int cpu, struct task_struct *idle)
443     -{
444     - struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
445     - struct desc_struct *gdt;
446     - struct i386_pda *pda;
447     -
448     - /* For non-boot CPUs, the GDT and PDA should already have been
449     - allocated. */
450     - if (!alloc_gdt(cpu)) {
451     - printk(KERN_CRIT "CPU%d failed to allocate GDT or PDA\n", cpu);
452     - return 0;
453     - }
454     -
455     - gdt = (struct desc_struct *)cpu_gdt_descr->address;
456     - pda = cpu_pda(cpu);
457     -
458     - BUG_ON(gdt == NULL || pda == NULL);
459     -
460     - /*
461     - * Initialize the per-CPU GDT with the boot GDT,
462     - * and set up the GDT descriptor:
463     - */
464     - memcpy(gdt, cpu_gdt_table, GDT_SIZE);
465     - cpu_gdt_descr->size = GDT_SIZE - 1;
466     -
467     - pack_descriptor((u32 *)&gdt[GDT_ENTRY_PDA].a,
468     - (u32 *)&gdt[GDT_ENTRY_PDA].b,
469     - (unsigned long)pda, sizeof(*pda) - 1,
470     - 0x80 | DESCTYPE_S | 0x2, 0); /* present read-write data segment */
471     -
472     - memset(pda, 0, sizeof(*pda));
473     - pda->_pda = pda;
474     - pda->cpu_number = cpu;
475     - pda->pcurrent = idle;
476     -
477     - return 1;
478     -}
479     -
480     -void __cpuinit cpu_set_gdt(int cpu)
481     -{
482     - struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
483     + struct Xgt_desc_struct gdt_descr;
484     unsigned long va, frames[16];
485     int f;
486    
487     - for (va = cpu_gdt_descr->address, f = 0;
488     - va < cpu_gdt_descr->address + cpu_gdt_descr->size;
489     + gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
490     + gdt_descr.size = GDT_SIZE - 1;
491     +
492     + for (va = gdt_descr.address, f = 0;
493     + va < gdt_descr.address + gdt_descr.size;
494     va += PAGE_SIZE, f++) {
495     frames[f] = virt_to_mfn(va);
496     make_lowmem_page_readonly(
497     (void *)va, XENFEAT_writable_descriptor_tables);
498     }
499     - BUG_ON(HYPERVISOR_set_gdt(frames, (cpu_gdt_descr->size + 1) / 8));
500     -
501     - set_kernel_fs();
502     + if (HYPERVISOR_set_gdt(frames, (gdt_descr.size + 1) / 8))
503     + BUG();
504     + asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
505     }
506    
507     -/* Common CPU init for both boot and secondary CPUs */
508     -static void __cpuinit _cpu_init(int cpu, struct task_struct *curr)
509     +/*
510     + * cpu_init() initializes state that is per-CPU. Some data is already
511     + * initialized (naturally) in the bootstrap process, such as the GDT
512     + * and IDT. We reload them nevertheless, this function acts as a
513     + * 'CPU state barrier', nothing should get across.
514     + */
515     +void __cpuinit cpu_init(void)
516     {
517     + int cpu = smp_processor_id();
518     + struct task_struct *curr = current;
519     #ifndef CONFIG_X86_NO_TSS
520     struct tss_struct * t = &per_cpu(init_tss, cpu);
521     #endif
522     @@ -757,6 +699,8 @@
523     set_in_cr4(X86_CR4_TSD);
524     }
525    
526     + switch_to_new_gdt();
527     +
528     /*
529     * Set up and load the per-CPU TSS and LDT
530     */
531     @@ -794,38 +738,6 @@
532     mxcsr_feature_mask_init();
533     }
534    
535     -/* Entrypoint to initialize secondary CPU */
536     -void __cpuinit secondary_cpu_init(void)
537     -{
538     - int cpu = smp_processor_id();
539     - struct task_struct *curr = current;
540     -
541     - _cpu_init(cpu, curr);
542     -}
543     -
544     -/*
545     - * cpu_init() initializes state that is per-CPU. Some data is already
546     - * initialized (naturally) in the bootstrap process, such as the GDT
547     - * and IDT. We reload them nevertheless, this function acts as a
548     - * 'CPU state barrier', nothing should get across.
549     - */
550     -void __cpuinit cpu_init(void)
551     -{
552     - int cpu = smp_processor_id();
553     - struct task_struct *curr = current;
554     -
555     - /* Set up the real GDT and PDA, so we can transition from the
556     - boot versions. */
557     - if (!init_gdt(cpu, curr)) {
558     - /* failed to allocate something; not much we can do... */
559     - for (;;)
560     - local_irq_enable();
561     - }
562     -
563     - cpu_set_gdt(cpu);
564     - _cpu_init(cpu, curr);
565     -}
566     -
567     #ifdef CONFIG_HOTPLUG_CPU
568     void __cpuinit cpu_uninit(void)
569     {
570     --- a/arch/x86/kernel/cpu/mtrr/main-xen.c
571     +++ b/arch/x86/kernel/cpu/mtrr/main-xen.c
572     @@ -167,7 +167,7 @@
573     EXPORT_SYMBOL(mtrr_add);
574     EXPORT_SYMBOL(mtrr_del);
575    
576     -void __init mtrr_bp_init(void)
577     +__init void mtrr_bp_init(void)
578     {
579     }
580    
581     --- a/arch/x86/kernel/e820_32-xen.c
582     +++ b/arch/x86/kernel/e820_32-xen.c
583     @@ -162,26 +162,27 @@
584    
585     static int __init romsignature(const unsigned char *rom)
586     {
587     + const unsigned short * const ptr = (const unsigned short *)rom;
588     unsigned short sig;
589    
590     - return probe_kernel_address((const unsigned short *)rom, sig) == 0 &&
591     - sig == ROMSIGNATURE;
592     + return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE;
593     }
594    
595     -static int __init romchecksum(unsigned char *rom, unsigned long length)
596     +static int __init romchecksum(const unsigned char *rom, unsigned long length)
597     {
598     - unsigned char sum;
599     + unsigned char sum, c;
600    
601     - for (sum = 0; length; length--)
602     - sum += *rom++;
603     - return sum == 0;
604     + for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--)
605     + sum += c;
606     + return !length && !sum;
607     }
608    
609     static void __init probe_roms(void)
610     {
611     + const unsigned char *rom;
612     unsigned long start, length, upper;
613     - unsigned char *rom;
614     - int i;
615     + unsigned char c;
616     + int i;
617    
618     #ifdef CONFIG_XEN
619     /* Nothing to do if not running in dom0. */
620     @@ -198,8 +199,11 @@
621    
622     video_rom_resource.start = start;
623    
624     + if (probe_kernel_address(rom + 2, c) != 0)
625     + continue;
626     +
627     /* 0 < length <= 0x7f * 512, historically */
628     - length = rom[2] * 512;
629     + length = c * 512;
630    
631     /* if checksum okay, trust length byte */
632     if (length && romchecksum(rom, length))
633     @@ -233,8 +237,11 @@
634     if (!romsignature(rom))
635     continue;
636    
637     + if (probe_kernel_address(rom + 2, c) != 0)
638     + continue;
639     +
640     /* 0 < length <= 0x7f * 512, historically */
641     - length = rom[2] * 512;
642     + length = c * 512;
643    
644     /* but accept any length that fits if checksum okay */
645     if (!length || start + length > upper || !romchecksum(rom, length))
646     @@ -249,7 +256,7 @@
647     }
648    
649     #ifdef CONFIG_XEN
650     -static struct e820map machine_e820 __initdata;
651     +static struct e820map machine_e820;
652     #define e820 machine_e820
653     #endif
654    
655     @@ -409,10 +416,8 @@
656     ____________________33__
657     ______________________4_
658     */
659     - printk("sanitize start\n");
660     /* if there's only one memory region, don't bother */
661     if (*pnr_map < 2) {
662     - printk("sanitize bail 0\n");
663     return -1;
664     }
665    
666     @@ -421,7 +426,6 @@
667     /* bail out if we find any unreasonable addresses in bios map */
668     for (i=0; i<old_nr; i++)
669     if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) {
670     - printk("sanitize bail 1\n");
671     return -1;
672     }
673    
674     @@ -517,7 +521,6 @@
675     memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
676     *pnr_map = new_nr;
677    
678     - printk("sanitize end\n");
679     return 0;
680     }
681    
682     @@ -552,7 +555,6 @@
683     unsigned long long size = biosmap->size;
684     unsigned long long end = start + size;
685     unsigned long type = biosmap->type;
686     - printk("copy_e820_map() start: %016Lx size: %016Lx end: %016Lx type: %ld\n", start, size, end, type);
687    
688     /* Overflow in 64 bits? Ignore the memory map. */
689     if (start > end)
690     @@ -564,17 +566,11 @@
691     * Not right. Fix it up.
692     */
693     if (type == E820_RAM) {
694     - printk("copy_e820_map() type is E820_RAM\n");
695     if (start < 0x100000ULL && end > 0xA0000ULL) {
696     - printk("copy_e820_map() lies in range...\n");
697     - if (start < 0xA0000ULL) {
698     - printk("copy_e820_map() start < 0xA0000ULL\n");
699     + if (start < 0xA0000ULL)
700     add_memory_region(start, 0xA0000ULL-start, type);
701     - }
702     - if (end <= 0x100000ULL) {
703     - printk("copy_e820_map() end <= 0x100000ULL\n");
704     + if (end <= 0x100000ULL)
705     continue;
706     - }
707     start = 0x100000ULL;
708     size = end - start;
709     }
710     --- a/arch/x86/kernel/e820_64-xen.c
711     +++ b/arch/x86/kernel/e820_64-xen.c
712     @@ -17,6 +17,8 @@
713     #include <linux/kexec.h>
714     #include <linux/module.h>
715     #include <linux/mm.h>
716     +#include <linux/suspend.h>
717     +#include <linux/pfn.h>
718    
719     #include <asm/pgtable.h>
720     #include <asm/page.h>
721     @@ -28,7 +30,7 @@
722    
723     struct e820map e820 __initdata;
724     #ifdef CONFIG_XEN
725     -struct e820map machine_e820 __initdata;
726     +struct e820map machine_e820;
727     #endif
728    
729     /*
730     @@ -293,22 +295,6 @@
731     }
732    
733     #ifndef CONFIG_XEN
734     -/* Mark pages corresponding to given address range as nosave */
735     -static void __init
736     -e820_mark_nosave_range(unsigned long start, unsigned long end)
737     -{
738     - unsigned long pfn, max_pfn;
739     -
740     - if (start >= end)
741     - return;
742     -
743     - printk("Nosave address range: %016lx - %016lx\n", start, end);
744     - max_pfn = end >> PAGE_SHIFT;
745     - for (pfn = start >> PAGE_SHIFT; pfn < max_pfn; pfn++)
746     - if (pfn_valid(pfn))
747     - SetPageNosave(pfn_to_page(pfn));
748     -}
749     -
750     /*
751     * Find the ranges of physical addresses that do not correspond to
752     * e820 RAM areas and mark the corresponding pages as nosave for software
753     @@ -327,13 +313,13 @@
754     struct e820entry *ei = &e820.map[i];
755    
756     if (paddr < ei->addr)
757     - e820_mark_nosave_range(paddr,
758     - round_up(ei->addr, PAGE_SIZE));
759     + register_nosave_region(PFN_DOWN(paddr),
760     + PFN_UP(ei->addr));
761    
762     paddr = round_down(ei->addr + ei->size, PAGE_SIZE);
763     if (ei->type != E820_RAM)
764     - e820_mark_nosave_range(round_up(ei->addr, PAGE_SIZE),
765     - paddr);
766     + register_nosave_region(PFN_UP(ei->addr),
767     + PFN_DOWN(paddr));
768    
769     if (paddr >= (end_pfn << PAGE_SHIFT))
770     break;
771     --- a/arch/x86/kernel/early_printk-xen.c
772     +++ b/arch/x86/kernel/early_printk-xen.c
773     @@ -11,11 +11,10 @@
774    
775     #ifdef __i386__
776     #include <asm/setup.h>
777     -#define VGABASE (__ISA_IO_base + 0xb8000)
778     #else
779     #include <asm/bootsetup.h>
780     -#define VGABASE ((void __iomem *)0xffffffff800b8000UL)
781     #endif
782     +#define VGABASE (__ISA_IO_base + 0xb8000)
783    
784     #ifndef CONFIG_XEN
785     static int max_ypos = 25, max_xpos = 80;
786     @@ -93,9 +92,9 @@
787     static void early_serial_write(struct console *con, const char *s, unsigned n)
788     {
789     while (*s && n-- > 0) {
790     - early_serial_putc(*s);
791     if (*s == '\n')
792     early_serial_putc('\r');
793     + early_serial_putc(*s);
794     s++;
795     }
796     }
797     @@ -205,7 +204,7 @@
798     return ret;
799     }
800    
801     -void __init simnow_init(char *str)
802     +static void __init simnow_init(char *str)
803     {
804     char *fn = "klog";
805     if (*str == '=')
806     @@ -277,22 +276,12 @@
807     early_console = &simnow_console;
808     keep_early = 1;
809     }
810     +
811     + if (keep_early)
812     + early_console->flags &= ~CON_BOOT;
813     + else
814     + early_console->flags |= CON_BOOT;
815     register_console(early_console);
816     return 0;
817     }
818     -
819     early_param("earlyprintk", setup_early_printk);
820     -
821     -void __init disable_early_printk(void)
822     -{
823     - if (!early_console_initialized || !early_console)
824     - return;
825     - if (!keep_early) {
826     - printk("disabling early console\n");
827     - unregister_console(early_console);
828     - early_console_initialized = 0;
829     - } else {
830     - printk("keeping early console\n");
831     - }
832     -}
833     -
834     --- a/arch/x86/kernel/entry_32-xen.S
835     +++ b/arch/x86/kernel/entry_32-xen.S
836     @@ -15,7 +15,7 @@
837     * I changed all the .align's to 4 (16 byte alignment), as that's faster
838     * on a 486.
839     *
840     - * Stack layout in 'ret_from_system_call':
841     + * Stack layout in 'syscall_exit':
842     * ptrace needs to have all regs on the stack.
843     * if the order here is changed, it needs to be
844     * updated in fork.c:copy_process, signal.c:do_signal,
845     @@ -135,7 +135,7 @@
846     movl $(__USER_DS), %edx; \
847     movl %edx, %ds; \
848     movl %edx, %es; \
849     - movl $(__KERNEL_PDA), %edx; \
850     + movl $(__KERNEL_PERCPU), %edx; \
851     movl %edx, %fs
852    
853     #define RESTORE_INT_REGS \
854     @@ -308,16 +308,12 @@
855     pushl $(__USER_CS)
856     CFI_ADJUST_CFA_OFFSET 4
857     /*CFI_REL_OFFSET cs, 0*/
858     -#ifndef CONFIG_COMPAT_VDSO
859     /*
860     * Push current_thread_info()->sysenter_return to the stack.
861     * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
862     * pushed above; +8 corresponds to copy_thread's esp0 setting.
863     */
864     pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
865     -#else
866     - pushl $SYSENTER_RETURN
867     -#endif
868     CFI_ADJUST_CFA_OFFSET 4
869     CFI_REL_OFFSET eip, 0
870    
871     @@ -345,7 +341,7 @@
872     jae syscall_badsys
873     call *sys_call_table(,%eax,4)
874     movl %eax,PT_EAX(%esp)
875     - DISABLE_INTERRUPTS(CLBR_ECX|CLBR_EDX)
876     + DISABLE_INTERRUPTS(CLBR_ANY)
877     TRACE_IRQS_OFF
878     movl TI_flags(%ebp), %ecx
879     testw $_TIF_ALLWORK_MASK, %cx
880     @@ -400,10 +396,6 @@
881     CFI_ADJUST_CFA_OFFSET 4
882     SAVE_ALL
883     GET_THREAD_INFO(%ebp)
884     - testl $TF_MASK,PT_EFLAGS(%esp)
885     - jz no_singlestep
886     - orl $_TIF_SINGLESTEP,TI_flags(%ebp)
887     -no_singlestep:
888     # system call tracing in operation / emulation
889     /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
890     testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
891     @@ -418,6 +410,10 @@
892     # setting need_resched or sigpending
893     # between sampling and the iret
894     TRACE_IRQS_OFF
895     + testl $TF_MASK,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit
896     + jz no_singlestep
897     + orl $_TIF_SINGLESTEP,TI_flags(%ebp)
898     +no_singlestep:
899     movl TI_flags(%ebp), %ecx
900     testw $_TIF_ALLWORK_MASK, %cx # current->work
901     jne syscall_exit_work
902     @@ -635,9 +631,7 @@
903     #ifndef CONFIG_XEN
904     #define FIXUP_ESPFIX_STACK \
905     /* since we are on a wrong stack, we cant make it a C code :( */ \
906     - movl %fs:PDA_cpu, %ebx; \
907     - PER_CPU(cpu_gdt_descr, %ebx); \
908     - movl GDS_address(%ebx), %ebx; \
909     + PER_CPU(gdt_page, %ebx); \
910     GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \
911     addl %esp, %eax; \
912     pushl $__KERNEL_DS; \
913     @@ -710,7 +704,7 @@
914     SAVE_ALL; \
915     TRACE_IRQS_OFF \
916     movl %esp,%eax; \
917     - call smp_/**/name; \
918     + call smp_##name; \
919     jmp ret_from_intr; \
920     CFI_ENDPROC; \
921     ENDPROC(name)
922     @@ -718,10 +712,6 @@
923     /* The include is where all of the SMP etc. interrupts come from */
924     #include "entry_arch.h"
925    
926     -/* This alternate entry is needed because we hijack the apic LVTT */
927     -#if defined(CONFIG_VMI) && defined(CONFIG_X86_LOCAL_APIC)
928     -BUILD_INTERRUPT(apic_vmi_timer_interrupt,LOCAL_TIMER_VECTOR)
929     -#endif
930     #else
931     #define UNWIND_ESPFIX_STACK
932     #endif
933     @@ -764,7 +754,7 @@
934     pushl %fs
935     CFI_ADJUST_CFA_OFFSET 4
936     /*CFI_REL_OFFSET fs, 0*/
937     - movl $(__KERNEL_PDA), %ecx
938     + movl $(__KERNEL_PERCPU), %ecx
939     movl %ecx, %fs
940     UNWIND_ESPFIX_STACK
941     popl %ecx
942     --- a/arch/x86/kernel/entry_64-xen.S
943     +++ b/arch/x86/kernel/entry_64-xen.S
944     @@ -1254,3 +1254,10 @@
945     ret
946     CFI_ENDPROC
947     ENDPROC(call_softirq)
948     +
949     +KPROBE_ENTRY(ignore_sysret)
950     + CFI_STARTPROC
951     + mov $-ENOSYS,%eax
952     + HYPERVISOR_IRET 0
953     + CFI_ENDPROC
954     +ENDPROC(ignore_sysret)
955     --- a/arch/x86/kernel/genapic_64-xen.c
956     +++ b/arch/x86/kernel/genapic_64-xen.c
957     @@ -11,123 +11,57 @@
958     #include <linux/threads.h>
959     #include <linux/cpumask.h>
960     #include <linux/string.h>
961     +#include <linux/module.h>
962     #include <linux/kernel.h>
963     #include <linux/ctype.h>
964     #include <linux/init.h>
965     -#include <linux/module.h>
966    
967     #include <asm/smp.h>
968     #include <asm/ipi.h>
969     +#include <asm/genapic.h>
970    
971     -#if defined(CONFIG_ACPI)
972     +#ifdef CONFIG_ACPI
973     #include <acpi/acpi_bus.h>
974     #endif
975    
976     /* which logical CPU number maps to which CPU (physical APIC ID) */
977     -u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
978     +u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly
979     + = { [0 ... NR_CPUS-1] = BAD_APICID };
980     EXPORT_SYMBOL(x86_cpu_to_apicid);
981     -u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
982    
983     -extern struct genapic apic_cluster;
984     -extern struct genapic apic_flat;
985     -extern struct genapic apic_physflat;
986     +u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
987    
988     #ifndef CONFIG_XEN
989     -struct genapic *genapic = &apic_flat;
990     -struct genapic *genapic_force;
991     +struct genapic __read_mostly *genapic = &apic_flat;
992     #else
993     extern struct genapic apic_xen;
994     -struct genapic *genapic = &apic_xen;
995     +struct genapic __read_mostly *genapic = &apic_xen;
996     #endif
997    
998    
999     /*
1000     * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
1001     */
1002     -void __init clustered_apic_check(void)
1003     +void __init setup_apic_routing(void)
1004     {
1005     #ifndef CONFIG_XEN
1006     - long i;
1007     - u8 clusters, max_cluster;
1008     - u8 id;
1009     - u8 cluster_cnt[NUM_APIC_CLUSTERS];
1010     - int max_apic = 0;
1011     -
1012     - /* genapic selection can be forced because of certain quirks.
1013     - */
1014     - if (genapic_force) {
1015     - genapic = genapic_force;
1016     - goto print;
1017     - }
1018     -
1019     -#if defined(CONFIG_ACPI)
1020     +#ifdef CONFIG_ACPI
1021     /*
1022     - * Some x86_64 machines use physical APIC mode regardless of how many
1023     - * procs/clusters are present (x86_64 ES7000 is an example).
1024     + * Quirk: some x86_64 machines can only use physical APIC mode
1025     + * regardless of how many processors are present (x86_64 ES7000
1026     + * is an example).
1027     */
1028     - if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID)
1029     - if (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL) {
1030     - genapic = &apic_cluster;
1031     - goto print;
1032     - }
1033     -#endif
1034     -
1035     - memset(cluster_cnt, 0, sizeof(cluster_cnt));
1036     - for (i = 0; i < NR_CPUS; i++) {
1037     - id = bios_cpu_apicid[i];
1038     - if (id == BAD_APICID)
1039     - continue;
1040     - if (id > max_apic)
1041     - max_apic = id;
1042     - cluster_cnt[APIC_CLUSTERID(id)]++;
1043     - }
1044     -
1045     - /* Don't use clustered mode on AMD platforms. */
1046     - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
1047     + if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
1048     + (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL))
1049     genapic = &apic_physflat;
1050     -#ifndef CONFIG_HOTPLUG_CPU
1051     - /* In the CPU hotplug case we cannot use broadcast mode
1052     - because that opens a race when a CPU is removed.
1053     - Stay at physflat mode in this case.
1054     - It is bad to do this unconditionally though. Once
1055     - we have ACPI platform support for CPU hotplug
1056     - we should detect hotplug capablity from ACPI tables and
1057     - only do this when really needed. -AK */
1058     - if (max_apic <= 8)
1059     - genapic = &apic_flat;
1060     + else
1061     #endif
1062     - goto print;
1063     - }
1064    
1065     - clusters = 0;
1066     - max_cluster = 0;
1067     -
1068     - for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
1069     - if (cluster_cnt[i] > 0) {
1070     - ++clusters;
1071     - if (cluster_cnt[i] > max_cluster)
1072     - max_cluster = cluster_cnt[i];
1073     - }
1074     - }
1075     -
1076     - /*
1077     - * If we have clusters <= 1 and CPUs <= 8 in cluster 0, then flat mode,
1078     - * else if max_cluster <= 4 and cluster_cnt[15] == 0, clustered logical
1079     - * else physical mode.
1080     - * (We don't use lowest priority delivery + HW APIC IRQ steering, so
1081     - * can ignore the clustered logical case and go straight to physical.)
1082     - */
1083     - if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster) {
1084     -#ifdef CONFIG_HOTPLUG_CPU
1085     - /* Don't use APIC shortcuts in CPU hotplug to avoid races */
1086     - genapic = &apic_physflat;
1087     -#else
1088     + if (cpus_weight(cpu_possible_map) <= 8)
1089     genapic = &apic_flat;
1090     -#endif
1091     - } else
1092     - genapic = &apic_cluster;
1093     + else
1094     + genapic = &apic_physflat;
1095    
1096     -print:
1097     #else
1098     /* hardcode to xen apic functions */
1099     genapic = &apic_xen;
1100     @@ -135,7 +69,7 @@
1101     printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
1102     }
1103    
1104     -/* Same for both flat and clustered. */
1105     +/* Same for both flat and physical. */
1106    
1107     #ifdef CONFIG_XEN
1108     extern void xen_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest);
1109     --- a/arch/x86/kernel/genapic_xen_64.c
1110     +++ b/arch/x86/kernel/genapic_xen_64.c
1111     @@ -21,9 +21,8 @@
1112     #include <asm/ipi.h>
1113     #else
1114     #include <asm/apic.h>
1115     -#include <asm/apicdef.h>
1116     -#include <asm/genapic.h>
1117     #endif
1118     +#include <asm/genapic.h>
1119     #include <xen/evtchn.h>
1120    
1121     DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
1122     --- a/arch/x86/kernel/head64-xen.c
1123     +++ b/arch/x86/kernel/head64-xen.c
1124     @@ -22,13 +22,21 @@
1125     #include <asm/setup.h>
1126     #include <asm/desc.h>
1127     #include <asm/pgtable.h>
1128     +#include <asm/tlbflush.h>
1129     #include <asm/sections.h>
1130    
1131     unsigned long start_pfn;
1132    
1133     +#ifndef CONFIG_XEN
1134     +static void __init zap_identity_mappings(void)
1135     +{
1136     + pgd_t *pgd = pgd_offset_k(0UL);
1137     + pgd_clear(pgd);
1138     + __flush_tlb();
1139     +}
1140     +
1141     /* Don't add a printk in there. printk relies on the PDA which is not initialized
1142     yet. */
1143     -#if 0
1144     static void __init clear_bss(void)
1145     {
1146     memset(__bss_start, 0,
1147     @@ -37,26 +45,25 @@
1148     #endif
1149    
1150     #define NEW_CL_POINTER 0x228 /* Relative to real mode data */
1151     -#define OLD_CL_MAGIC_ADDR 0x90020
1152     +#define OLD_CL_MAGIC_ADDR 0x20
1153     #define OLD_CL_MAGIC 0xA33F
1154     -#define OLD_CL_BASE_ADDR 0x90000
1155     -#define OLD_CL_OFFSET 0x90022
1156     +#define OLD_CL_OFFSET 0x22
1157    
1158     static void __init copy_bootdata(char *real_mode_data)
1159     {
1160     #ifndef CONFIG_XEN
1161     - int new_data;
1162     + unsigned long new_data;
1163     char * command_line;
1164    
1165     memcpy(x86_boot_params, real_mode_data, BOOT_PARAM_SIZE);
1166     - new_data = *(int *) (x86_boot_params + NEW_CL_POINTER);
1167     + new_data = *(u32 *) (x86_boot_params + NEW_CL_POINTER);
1168     if (!new_data) {
1169     - if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) {
1170     + if (OLD_CL_MAGIC != *(u16 *)(real_mode_data + OLD_CL_MAGIC_ADDR)) {
1171     return;
1172     }
1173     - new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET;
1174     + new_data = __pa(real_mode_data) + *(u16 *)(real_mode_data + OLD_CL_OFFSET);
1175     }
1176     - command_line = (char *) ((u64)(new_data));
1177     + command_line = __va(new_data);
1178     memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE);
1179     #else
1180     int max_cmdline;
1181     @@ -98,10 +105,13 @@
1182     while ((1UL << machine_to_phys_order) < machine_to_phys_nr_ents )
1183     machine_to_phys_order++;
1184    
1185     -#if 0
1186     +#ifndef CONFIG_XEN
1187     /* clear bss before set_intr_gate with early_idt_handler */
1188     clear_bss();
1189    
1190     + /* Make NULL pointers segfault */
1191     + zap_identity_mappings();
1192     +
1193     for (i = 0; i < IDT_ENTRIES; i++)
1194     set_intr_gate(i, early_idt_handler);
1195     asm volatile("lidt %0" :: "m" (idt_descr));
1196     @@ -113,7 +123,7 @@
1197     cpu_pda(i) = &boot_cpu_pda[i];
1198    
1199     pda_init(0);
1200     - copy_bootdata(real_mode_data);
1201     + copy_bootdata(__va(real_mode_data));
1202     #ifdef CONFIG_SMP
1203     cpu_set(0, cpu_online_map);
1204     #endif
1205     --- a/arch/x86/kernel/head_32-xen.S
1206     +++ b/arch/x86/kernel/head_32-xen.S
1207     @@ -37,7 +37,8 @@
1208     /* Set up the stack pointer */
1209     movl $(init_thread_union+THREAD_SIZE),%esp
1210    
1211     - call setup_pda
1212     + movl %ss,%eax
1213     + movl %eax,%fs # gets reset once there's real percpu
1214    
1215     /* get vendor info */
1216     xorl %eax,%eax # call CPUID with 0 -> return vendor ID
1217     @@ -64,55 +65,11 @@
1218     xorl %eax,%eax # Clear GS
1219     movl %eax,%gs
1220    
1221     - movl $(__KERNEL_PDA),%eax
1222     - mov %eax,%fs
1223     -
1224     cld # gcc2 wants the direction flag cleared at all times
1225    
1226     pushl $0 # fake return address for unwinder
1227     jmp start_kernel
1228    
1229     -/*
1230     - * Point the GDT at this CPU's PDA. This will be
1231     - * cpu_gdt_table and boot_pda.
1232     - */
1233     -ENTRY(setup_pda)
1234     - /* get the PDA pointer */
1235     - movl $boot_pda, %eax
1236     -
1237     - /* slot the PDA address into the GDT */
1238     - mov $cpu_gdt_table, %ecx
1239     - mov %ax, (__KERNEL_PDA+0+2)(%ecx) /* base & 0x0000ffff */
1240     - shr $16, %eax
1241     - mov %al, (__KERNEL_PDA+4+0)(%ecx) /* base & 0x00ff0000 */
1242     - mov %ah, (__KERNEL_PDA+4+3)(%ecx) /* base & 0xff000000 */
1243     -
1244     - # %esi still points to start_info, and no registers
1245     - # need to be preserved.
1246     -
1247     - movl XEN_START_mfn_list(%esi), %ebx
1248     - movl $(cpu_gdt_table - __PAGE_OFFSET), %eax
1249     - shrl $PAGE_SHIFT, %eax
1250     - movl (%ebx,%eax,4), %ecx
1251     - pushl %ecx # frame number for set_gdt below
1252     -
1253     - xorl %esi, %esi
1254     - xorl %edx, %edx
1255     - shldl $PAGE_SHIFT, %ecx, %edx
1256     - shll $PAGE_SHIFT, %ecx
1257     - orl $0x61, %ecx
1258     - movl $cpu_gdt_table, %ebx
1259     - movl $__HYPERVISOR_update_va_mapping, %eax
1260     - int $0x82
1261     -
1262     - movl $(PAGE_SIZE_asm / 8), %ecx
1263     - movl %esp, %ebx
1264     - movl $__HYPERVISOR_set_gdt, %eax
1265     - int $0x82
1266     -
1267     - popl %ecx
1268     - ret
1269     -
1270     #define HYPERCALL_PAGE_OFFSET 0x1000
1271     .org HYPERCALL_PAGE_OFFSET
1272     ENTRY(hypercall_page)
1273     @@ -138,60 +95,6 @@
1274     */
1275     .data
1276    
1277     -/*
1278     - * The Global Descriptor Table contains 28 quadwords, per-CPU.
1279     - */
1280     - .section .data.page_aligned, "aw"
1281     - .align PAGE_SIZE_asm
1282     -ENTRY(cpu_gdt_table)
1283     - .quad 0x0000000000000000 /* NULL descriptor */
1284     - .quad 0x0000000000000000 /* 0x0b reserved */
1285     - .quad 0x0000000000000000 /* 0x13 reserved */
1286     - .quad 0x0000000000000000 /* 0x1b reserved */
1287     - .quad 0x0000000000000000 /* 0x20 unused */
1288     - .quad 0x0000000000000000 /* 0x28 unused */
1289     - .quad 0x0000000000000000 /* 0x33 TLS entry 1 */
1290     - .quad 0x0000000000000000 /* 0x3b TLS entry 2 */
1291     - .quad 0x0000000000000000 /* 0x43 TLS entry 3 */
1292     - .quad 0x0000000000000000 /* 0x4b reserved */
1293     - .quad 0x0000000000000000 /* 0x53 reserved */
1294     - .quad 0x0000000000000000 /* 0x5b reserved */
1295     -
1296     - .quad 0x00cf9a000000ffff /* 0x60 kernel 4GB code at 0x00000000 */
1297     - .quad 0x00cf92000000ffff /* 0x68 kernel 4GB data at 0x00000000 */
1298     - .quad 0x00cffa000000ffff /* 0x73 user 4GB code at 0x00000000 */
1299     - .quad 0x00cff2000000ffff /* 0x7b user 4GB data at 0x00000000 */
1300     -
1301     - .quad 0x0000000000000000 /* 0x80 TSS descriptor */
1302     - .quad 0x0000000000000000 /* 0x88 LDT descriptor */
1303     -
1304     - /*
1305     - * Segments used for calling PnP BIOS have byte granularity.
1306     - * They code segments and data segments have fixed 64k limits,
1307     - * the transfer segment sizes are set at run time.
1308     - */
1309     - .quad 0x0000000000000000 /* 0x90 32-bit code */
1310     - .quad 0x0000000000000000 /* 0x98 16-bit code */
1311     - .quad 0x0000000000000000 /* 0xa0 16-bit data */
1312     - .quad 0x0000000000000000 /* 0xa8 16-bit data */
1313     - .quad 0x0000000000000000 /* 0xb0 16-bit data */
1314     -
1315     - /*
1316     - * The APM segments have byte granularity and their bases
1317     - * are set at run time. All have 64k limits.
1318     - */
1319     - .quad 0x0000000000000000 /* 0xb8 APM CS code */
1320     - .quad 0x0000000000000000 /* 0xc0 APM CS 16 code (16 bit) */
1321     - .quad 0x0000000000000000 /* 0xc8 APM DS data */
1322     -
1323     - .quad 0x0000000000000000 /* 0xd0 - ESPFIX SS */
1324     - .quad 0x00cf92000000ffff /* 0xd8 - PDA */
1325     - .quad 0x0000000000000000 /* 0xe0 - unused */
1326     - .quad 0x0000000000000000 /* 0xe8 - unused */
1327     - .quad 0x0000000000000000 /* 0xf0 - unused */
1328     - .quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */
1329     - .align PAGE_SIZE_asm
1330     -
1331     #if CONFIG_XEN_COMPAT <= 0x030002
1332     /*
1333     * __xen_guest information
1334     --- a/arch/x86/kernel/head_64-xen.S
1335     +++ b/arch/x86/kernel/head_64-xen.S
1336     @@ -5,6 +5,7 @@
1337     * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
1338     * Copyright (C) 2000 Karsten Keil <kkeil@suse.de>
1339     * Copyright (C) 2001,2002 Andi Kleen <ak@suse.de>
1340     + * Copyright (C) 2005 Eric Biederman <ebiederm@xmission.com>
1341     * Jun Nakajima <jun.nakajima@intel.com>
1342     * Modified for Xen
1343     */
1344     @@ -41,18 +42,15 @@
1345     .word gdt_end-cpu_gdt_table-1
1346     .long cpu_gdt_table-__START_KERNEL_map
1347     #endif
1348     -ENTRY(stext)
1349     -ENTRY(_stext)
1350    
1351     - $page = 0
1352     +.balign PAGE_SIZE
1353     +
1354     #define NEXT_PAGE(name) \
1355     - $page = $page + 1; \
1356     - .org $page * 0x1000; \
1357     - phys_##name = $page * 0x1000 + __PHYSICAL_START; \
1358     + .balign PAGE_SIZE; \
1359     + phys_##name = . - .bootstrap.text; \
1360     ENTRY(name)
1361    
1362     NEXT_PAGE(init_level4_pgt)
1363     - /* This gets initialized in x86_64_start_kernel */
1364     .fill 512,8,0
1365     NEXT_PAGE(init_level4_user_pgt)
1366     /*
1367     @@ -136,13 +134,13 @@
1368    
1369     ENTRY(cpu_gdt_table)
1370     .quad 0x0000000000000000 /* NULL descriptor */
1371     + .quad 0x00cf9b000000ffff /* __KERNEL32_CS */
1372     + .quad 0x00af9b000000ffff /* __KERNEL_CS */
1373     + .quad 0x00cf93000000ffff /* __KERNEL_DS */
1374     + .quad 0x00cffb000000ffff /* __USER32_CS */
1375     + .quad 0x00cff3000000ffff /* __USER_DS, __USER32_DS */
1376     + .quad 0x00affb000000ffff /* __USER_CS */
1377     .quad 0x0 /* unused */
1378     - .quad 0x00af9a000000ffff /* __KERNEL_CS */
1379     - .quad 0x00cf92000000ffff /* __KERNEL_DS */
1380     - .quad 0x00cffa000000ffff /* __USER32_CS */
1381     - .quad 0x00cff2000000ffff /* __USER_DS, __USER32_DS */
1382     - .quad 0x00affa000000ffff /* __USER_CS */
1383     - .quad 0x00cf9a000000ffff /* __KERNEL32_CS */
1384     .quad 0,0 /* TSS */
1385     .quad 0,0 /* LDT */
1386     .quad 0,0,0 /* three TLS descriptors */
1387     @@ -165,14 +163,11 @@
1388     * __xen_guest information
1389     */
1390     .macro utoh value
1391     - .if (\value) < 0 || (\value) >= 0x10
1392     - utoh (((\value)>>4)&0x0fffffffffffffff)
1393     - .endif
1394     - .if ((\value) & 0xf) < 10
1395     - .byte '0' + ((\value) & 0xf)
1396     - .else
1397     - .byte 'A' + ((\value) & 0xf) - 10
1398     - .endif
1399     + i = 64
1400     + .rept 16
1401     + i = i - 4
1402     + .byte '0' + ((((\value) >> i) & 0xf) > 9) * ('0' - 'A' + 10) + (((\value) >> i) & 0xf)
1403     + .endr
1404     .endm
1405    
1406     .section __xen_guest
1407     --- a/arch/x86/kernel/io_apic_32-xen.c
1408     +++ b/arch/x86/kernel/io_apic_32-xen.c
1409     @@ -25,7 +25,6 @@
1410     #include <linux/init.h>
1411     #include <linux/delay.h>
1412     #include <linux/sched.h>
1413     -#include <linux/smp_lock.h>
1414     #include <linux/mc146818rtc.h>
1415     #include <linux/compiler.h>
1416     #include <linux/acpi.h>
1417     @@ -35,6 +34,7 @@
1418     #include <linux/msi.h>
1419     #include <linux/htirq.h>
1420     #include <linux/freezer.h>
1421     +#include <linux/kthread.h>
1422    
1423     #include <asm/io.h>
1424     #include <asm/smp.h>
1425     @@ -705,8 +705,6 @@
1426     unsigned long prev_balance_time = jiffies;
1427     long time_remaining = balanced_irq_interval;
1428    
1429     - daemonize("kirqd");
1430     -
1431     /* push everything to CPU 0 to give us a starting point. */
1432     for (i = 0 ; i < NR_IRQS ; i++) {
1433     irq_desc[i].pending_mask = cpumask_of_cpu(0);
1434     @@ -766,10 +764,9 @@
1435     }
1436    
1437     printk(KERN_INFO "Starting balanced_irq\n");
1438     - if (kernel_thread(balanced_irq, NULL, CLONE_KERNEL) >= 0)
1439     + if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd")))
1440     return 0;
1441     - else
1442     - printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
1443     + printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
1444     failed:
1445     for_each_possible_cpu(i) {
1446     kfree(irq_cpu_data[i].irq_delta);
1447     @@ -1445,10 +1442,6 @@
1448     enable_8259A_irq(0);
1449     }
1450    
1451     -static inline void UNEXPECTED_IO_APIC(void)
1452     -{
1453     -}
1454     -
1455     void __init print_IO_APIC(void)
1456     {
1457     int apic, i;
1458     @@ -1488,34 +1481,12 @@
1459     printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
1460     printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
1461     printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
1462     - if (reg_00.bits.ID >= get_physical_broadcast())
1463     - UNEXPECTED_IO_APIC();
1464     - if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2)
1465     - UNEXPECTED_IO_APIC();
1466    
1467     printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw);
1468     printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
1469     - if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */
1470     - (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */
1471     - (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */
1472     - (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */
1473     - (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */
1474     - (reg_01.bits.entries != 0x2E) &&
1475     - (reg_01.bits.entries != 0x3F)
1476     - )
1477     - UNEXPECTED_IO_APIC();
1478    
1479     printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
1480     printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
1481     - if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */
1482     - (reg_01.bits.version != 0x10) && /* oldest IO-APICs */
1483     - (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */
1484     - (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */
1485     - (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */
1486     - )
1487     - UNEXPECTED_IO_APIC();
1488     - if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2)
1489     - UNEXPECTED_IO_APIC();
1490    
1491     /*
1492     * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
1493     @@ -1525,8 +1496,6 @@
1494     if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
1495     printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
1496     printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
1497     - if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2)
1498     - UNEXPECTED_IO_APIC();
1499     }
1500    
1501     /*
1502     @@ -1538,8 +1507,6 @@
1503     reg_03.raw != reg_01.raw) {
1504     printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
1505     printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT);
1506     - if (reg_03.bits.__reserved_1)
1507     - UNEXPECTED_IO_APIC();
1508     }
1509    
1510     printk(KERN_DEBUG ".... IRQ redirection table:\n");
1511     @@ -2670,19 +2637,19 @@
1512     if (irq < 0)
1513     return irq;
1514    
1515     - set_irq_msi(irq, desc);
1516     ret = msi_compose_msg(dev, irq, &msg);
1517     if (ret < 0) {
1518     destroy_irq(irq);
1519     return ret;
1520     }
1521    
1522     + set_irq_msi(irq, desc);
1523     write_msi_msg(irq, &msg);
1524    
1525     set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq,
1526     "edge");
1527    
1528     - return irq;
1529     + return 0;
1530     }
1531    
1532     void arch_teardown_msi_irq(unsigned int irq)
1533     --- a/arch/x86/kernel/io_apic_64-xen.c
1534     +++ b/arch/x86/kernel/io_apic_64-xen.c
1535     @@ -25,7 +25,6 @@
1536     #include <linux/init.h>
1537     #include <linux/delay.h>
1538     #include <linux/sched.h>
1539     -#include <linux/smp_lock.h>
1540     #include <linux/pci.h>
1541     #include <linux/mc146818rtc.h>
1542     #include <linux/acpi.h>
1543     @@ -897,10 +896,6 @@
1544     enable_8259A_irq(0);
1545     }
1546    
1547     -void __init UNEXPECTED_IO_APIC(void)
1548     -{
1549     -}
1550     -
1551     void __apicdebuginit print_IO_APIC(void)
1552     {
1553     int apic, i;
1554     @@ -936,40 +931,16 @@
1555     printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
1556     printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
1557     printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
1558     - if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2)
1559     - UNEXPECTED_IO_APIC();
1560    
1561     printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
1562     printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
1563     - if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */
1564     - (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */
1565     - (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */
1566     - (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */
1567     - (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */
1568     - (reg_01.bits.entries != 0x2E) &&
1569     - (reg_01.bits.entries != 0x3F) &&
1570     - (reg_01.bits.entries != 0x03)
1571     - )
1572     - UNEXPECTED_IO_APIC();
1573    
1574     printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
1575     printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
1576     - if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */
1577     - (reg_01.bits.version != 0x02) && /* 82801BA IO-APICs (ICH2) */
1578     - (reg_01.bits.version != 0x10) && /* oldest IO-APICs */
1579     - (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */
1580     - (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */
1581     - (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */
1582     - )
1583     - UNEXPECTED_IO_APIC();
1584     - if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2)
1585     - UNEXPECTED_IO_APIC();
1586    
1587     if (reg_01.bits.version >= 0x10) {
1588     printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
1589     printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
1590     - if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2)
1591     - UNEXPECTED_IO_APIC();
1592     }
1593    
1594     printk(KERN_DEBUG ".... IRQ redirection table:\n");
1595     @@ -1401,8 +1372,7 @@
1596    
1597     vector = ~get_irq_regs()->orig_rax;
1598     me = smp_processor_id();
1599     - if ((vector == cfg->vector) &&
1600     - cpu_isset(smp_processor_id(), cfg->domain)) {
1601     + if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
1602     cpumask_t cleanup_mask;
1603    
1604     cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
1605     @@ -1437,7 +1407,7 @@
1606    
1607     /*
1608     * We must acknowledge the irq before we move it or the acknowledge will
1609     - * not propogate properly.
1610     + * not propagate properly.
1611     */
1612     ack_APIC_irq();
1613    
1614     @@ -1520,6 +1490,7 @@
1615     static void end_lapic_irq (unsigned int i) { /* nothing */ }
1616    
1617     static struct hw_interrupt_type lapic_irq_type __read_mostly = {
1618     + .name = "local-APIC",
1619     .typename = "local-APIC-edge",
1620     .startup = NULL, /* startup_irq() not used for IRQ0 */
1621     .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
1622     @@ -1989,18 +1960,18 @@
1623     if (irq < 0)
1624     return irq;
1625    
1626     - set_irq_msi(irq, desc);
1627     ret = msi_compose_msg(dev, irq, &msg);
1628     if (ret < 0) {
1629     destroy_irq(irq);
1630     return ret;
1631     }
1632    
1633     + set_irq_msi(irq, desc);
1634     write_msi_msg(irq, &msg);
1635    
1636     set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
1637    
1638     - return irq;
1639     + return 0;
1640     }
1641    
1642     void arch_teardown_msi_irq(unsigned int irq)
1643     --- a/arch/x86/kernel/ioport_32-xen.c
1644     +++ b/arch/x86/kernel/ioport_32-xen.c
1645     @@ -12,10 +12,10 @@
1646     #include <linux/types.h>
1647     #include <linux/ioport.h>
1648     #include <linux/smp.h>
1649     -#include <linux/smp_lock.h>
1650     #include <linux/stddef.h>
1651     #include <linux/slab.h>
1652     #include <linux/thread_info.h>
1653     +#include <linux/syscalls.h>
1654     #include <xen/interface/physdev.h>
1655    
1656     /* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
1657     --- a/arch/x86/kernel/ioport_64-xen.c
1658     +++ b/arch/x86/kernel/ioport_64-xen.c
1659     @@ -13,10 +13,10 @@
1660     #include <linux/ioport.h>
1661     #include <linux/mm.h>
1662     #include <linux/smp.h>
1663     -#include <linux/smp_lock.h>
1664     #include <linux/stddef.h>
1665     #include <linux/slab.h>
1666     #include <linux/thread_info.h>
1667     +#include <linux/syscalls.h>
1668     #include <xen/interface/physdev.h>
1669    
1670     /* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
1671     --- a/arch/x86/kernel/irq_32-xen.c
1672     +++ b/arch/x86/kernel/irq_32-xen.c
1673     @@ -24,6 +24,9 @@
1674     DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp;
1675     EXPORT_PER_CPU_SYMBOL(irq_stat);
1676    
1677     +DEFINE_PER_CPU(struct pt_regs *, irq_regs);
1678     +EXPORT_PER_CPU_SYMBOL(irq_regs);
1679     +
1680     /*
1681     * 'what should we do if we get a hw irq event on an illegal vector'.
1682     * each architecture has to answer this themselves.
1683     --- a/arch/x86/kernel/irq_64-xen.c
1684     +++ b/arch/x86/kernel/irq_64-xen.c
1685     @@ -32,7 +32,7 @@
1686     */
1687     static inline void stack_overflow_check(struct pt_regs *regs)
1688     {
1689     - u64 curbase = (u64) current->thread_info;
1690     + u64 curbase = (u64)task_stack_page(current);
1691     static unsigned long warned = -60*HZ;
1692    
1693     if (regs->rsp >= curbase && regs->rsp <= curbase + THREAD_SIZE &&
1694     @@ -145,17 +145,43 @@
1695    
1696     for (irq = 0; irq < NR_IRQS; irq++) {
1697     cpumask_t mask;
1698     + int break_affinity = 0;
1699     + int set_affinity = 1;
1700     +
1701     if (irq == 2)
1702     continue;
1703    
1704     + /* interrupts are disabled at this point */
1705     + spin_lock(&irq_desc[irq].lock);
1706     +
1707     + if (!irq_has_action(irq) ||
1708     + cpus_equal(irq_desc[irq].affinity, map)) {
1709     + spin_unlock(&irq_desc[irq].lock);
1710     + continue;
1711     + }
1712     +
1713     cpus_and(mask, irq_desc[irq].affinity, map);
1714     - if (any_online_cpu(mask) == NR_CPUS) {
1715     - printk("Breaking affinity for irq %i\n", irq);
1716     + if (cpus_empty(mask)) {
1717     + break_affinity = 1;
1718     mask = map;
1719     }
1720     +
1721     + if (irq_desc[irq].chip->mask)
1722     + irq_desc[irq].chip->mask(irq);
1723     +
1724     if (irq_desc[irq].chip->set_affinity)
1725     irq_desc[irq].chip->set_affinity(irq, mask);
1726     - else if (irq_desc[irq].action && !(warned++))
1727     + else if (!(warned++))
1728     + set_affinity = 0;
1729     +
1730     + if (irq_desc[irq].chip->unmask)
1731     + irq_desc[irq].chip->unmask(irq);
1732     +
1733     + spin_unlock(&irq_desc[irq].lock);
1734     +
1735     + if (break_affinity && set_affinity)
1736     + printk("Broke affinity for irq %i\n", irq);
1737     + else if (!set_affinity)
1738     printk("Cannot set affinity for irq %i\n", irq);
1739     }
1740    
1741     --- a/arch/x86/kernel/ldt_32-xen.c
1742     +++ b/arch/x86/kernel/ldt_32-xen.c
1743     @@ -10,7 +10,6 @@
1744     #include <linux/string.h>
1745     #include <linux/mm.h>
1746     #include <linux/smp.h>
1747     -#include <linux/smp_lock.h>
1748     #include <linux/vmalloc.h>
1749     #include <linux/slab.h>
1750    
1751     --- a/arch/x86/kernel/ldt_64-xen.c
1752     +++ b/arch/x86/kernel/ldt_64-xen.c
1753     @@ -13,7 +13,6 @@
1754     #include <linux/string.h>
1755     #include <linux/mm.h>
1756     #include <linux/smp.h>
1757     -#include <linux/smp_lock.h>
1758     #include <linux/vmalloc.h>
1759     #include <linux/slab.h>
1760    
1761     --- a/arch/x86/kernel/microcode-xen.c
1762     +++ b/arch/x86/kernel/microcode-xen.c
1763     @@ -135,7 +135,7 @@
1764     return 0;
1765     }
1766    
1767     -static void __exit microcode_dev_exit (void)
1768     +static void microcode_dev_exit (void)
1769     {
1770     misc_deregister(&microcode_dev);
1771     }
1772     --- a/arch/x86/kernel/mpparse_32-xen.c
1773     +++ b/arch/x86/kernel/mpparse_32-xen.c
1774     @@ -18,7 +18,6 @@
1775     #include <linux/acpi.h>
1776     #include <linux/delay.h>
1777     #include <linux/bootmem.h>
1778     -#include <linux/smp_lock.h>
1779     #include <linux/kernel_stat.h>
1780     #include <linux/mc146818rtc.h>
1781     #include <linux/bitops.h>
1782     @@ -484,7 +483,7 @@
1783     }
1784     ++mpc_record;
1785     }
1786     - clustered_apic_check();
1787     + setup_apic_routing();
1788     if (!num_processors)
1789     printk(KERN_ERR "SMP mptable: no processors registered!\n");
1790     return num_processors;
1791     --- a/arch/x86/kernel/mpparse_64-xen.c
1792     +++ b/arch/x86/kernel/mpparse_64-xen.c
1793     @@ -17,7 +17,6 @@
1794     #include <linux/init.h>
1795     #include <linux/delay.h>
1796     #include <linux/bootmem.h>
1797     -#include <linux/smp_lock.h>
1798     #include <linux/kernel_stat.h>
1799     #include <linux/mc146818rtc.h>
1800     #include <linux/acpi.h>
1801     @@ -307,7 +306,7 @@
1802     }
1803     }
1804     }
1805     - clustered_apic_check();
1806     + setup_apic_routing();
1807     if (!num_processors)
1808     printk(KERN_ERR "MPTABLE: no processors registered!\n");
1809     return num_processors;
1810     --- a/arch/x86/kernel/pci-dma_32-xen.c
1811     +++ b/arch/x86/kernel/pci-dma_32-xen.c
1812     @@ -13,6 +13,7 @@
1813     #include <linux/pci.h>
1814     #include <linux/module.h>
1815     #include <linux/version.h>
1816     +#include <linux/pci.h>
1817     #include <asm/io.h>
1818     #include <xen/balloon.h>
1819     #include <xen/gnttab.h>
1820     @@ -284,7 +285,7 @@
1821     {
1822     void __iomem *mem_base = NULL;
1823     int pages = size >> PAGE_SHIFT;
1824     - int bitmap_size = (pages + 31)/32;
1825     + int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
1826    
1827     if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
1828     goto out;
1829     @@ -357,6 +358,32 @@
1830     EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
1831     #endif /* ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY */
1832    
1833     +#if defined(CONFIG_PCI) && !defined(CONFIG_XEN)
1834     +/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
1835     +
1836     +int forbid_dac;
1837     +EXPORT_SYMBOL(forbid_dac);
1838     +
1839     +static __devinit void via_no_dac(struct pci_dev *dev)
1840     +{
1841     + if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
1842     + printk(KERN_INFO "PCI: VIA PCI bridge detected. Disabling DAC.\n");
1843     + forbid_dac = 1;
1844     + }
1845     +}
1846     +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
1847     +
1848     +static int check_iommu(char *s)
1849     +{
1850     + if (!strcmp(s, "usedac")) {
1851     + forbid_dac = -1;
1852     + return 1;
1853     + }
1854     + return 0;
1855     +}
1856     +__setup("iommu=", check_iommu);
1857     +#endif
1858     +
1859     dma_addr_t
1860     dma_map_single(struct device *dev, void *ptr, size_t size,
1861     enum dma_data_direction direction)
1862     --- a/arch/x86/kernel/pci-swiotlb_64-xen.c
1863     +++ b/arch/x86/kernel/pci-swiotlb_64-xen.c
1864     @@ -16,7 +16,7 @@
1865    
1866     void swiotlb_init(void);
1867    
1868     -struct dma_mapping_ops swiotlb_dma_ops = {
1869     +const struct dma_mapping_ops swiotlb_dma_ops = {
1870     #if 0
1871     .mapping_error = swiotlb_dma_mapping_error,
1872     .alloc_coherent = swiotlb_alloc_coherent,
1873     --- a/arch/x86/kernel/process_32-xen.c
1874     +++ b/arch/x86/kernel/process_32-xen.c
1875     @@ -21,7 +21,6 @@
1876     #include <linux/mm.h>
1877     #include <linux/elfcore.h>
1878     #include <linux/smp.h>
1879     -#include <linux/smp_lock.h>
1880     #include <linux/stddef.h>
1881     #include <linux/slab.h>
1882     #include <linux/vmalloc.h>
1883     @@ -39,6 +38,7 @@
1884     #include <linux/random.h>
1885     #include <linux/personality.h>
1886     #include <linux/tick.h>
1887     +#include <linux/percpu.h>
1888    
1889     #include <asm/uaccess.h>
1890     #include <asm/pgtable.h>
1891     @@ -61,7 +61,6 @@
1892    
1893     #include <asm/tlbflush.h>
1894     #include <asm/cpu.h>
1895     -#include <asm/pda.h>
1896    
1897     asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
1898    
1899     @@ -70,6 +69,12 @@
1900     unsigned long boot_option_idle_override = 0;
1901     EXPORT_SYMBOL(boot_option_idle_override);
1902    
1903     +DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
1904     +EXPORT_PER_CPU_SYMBOL(current_task);
1905     +
1906     +DEFINE_PER_CPU(int, cpu_number);
1907     +EXPORT_PER_CPU_SYMBOL(cpu_number);
1908     +
1909     /*
1910     * Return saved PC of a blocked thread.
1911     */
1912     @@ -168,6 +173,7 @@
1913     if (__get_cpu_var(cpu_idle_state))
1914     __get_cpu_var(cpu_idle_state) = 0;
1915    
1916     + check_pgt_cache();
1917     rmb();
1918     idle = xen_idle; /* no alternatives */
1919    
1920     @@ -218,18 +224,19 @@
1921     {
1922     }
1923    
1924     -static int __init idle_setup (char *str)
1925     +static int __init idle_setup(char *str)
1926     {
1927     - if (!strncmp(str, "poll", 4)) {
1928     + if (!strcmp(str, "poll")) {
1929     printk("using polling idle threads.\n");
1930     pm_idle = poll_idle;
1931     }
1932     + else
1933     + return -1;
1934    
1935     boot_option_idle_override = 1;
1936     - return 1;
1937     + return 0;
1938     }
1939     -
1940     -__setup("idle=", idle_setup);
1941     +early_param("idle", idle_setup);
1942    
1943     void show_regs(struct pt_regs * regs)
1944     {
1945     @@ -282,7 +289,7 @@
1946    
1947     regs.xds = __USER_DS;
1948     regs.xes = __USER_DS;
1949     - regs.xfs = __KERNEL_PDA;
1950     + regs.xfs = __KERNEL_PERCPU;
1951     regs.orig_eax = -1;
1952     regs.eip = (unsigned long) kernel_thread_helper;
1953     regs.xcs = __KERNEL_CS | get_kernel_rpl();
1954     @@ -556,7 +563,7 @@
1955     * multicall to indicate FPU task switch, rather than
1956     * synchronously trapping to Xen.
1957     */
1958     - if (prev_p->thread_info->status & TS_USEDFPU) {
1959     + if (task_thread_info(prev_p)->status & TS_USEDFPU) {
1960     __save_init_fpu(prev_p); /* _not_ save_init_fpu() */
1961     mcl->op = __HYPERVISOR_fpu_taskswitch;
1962     mcl->args[0] = 1;
1963     @@ -648,7 +655,7 @@
1964     if (prev->gs | next->gs)
1965     loadsegment(gs, next->gs);
1966    
1967     - write_pda(pcurrent, next_p);
1968     + x86_write_percpu(current_task, next_p);
1969    
1970     return prev_p;
1971     }
1972     --- a/arch/x86/kernel/process_64-xen.c
1973     +++ b/arch/x86/kernel/process_64-xen.c
1974     @@ -39,6 +39,7 @@
1975     #include <linux/random.h>
1976     #include <linux/notifier.h>
1977     #include <linux/kprobes.h>
1978     +#include <linux/kdebug.h>
1979    
1980     #include <asm/uaccess.h>
1981     #include <asm/pgtable.h>
1982     @@ -49,7 +50,6 @@
1983     #include <asm/mmu_context.h>
1984     #include <asm/pda.h>
1985     #include <asm/prctl.h>
1986     -#include <asm/kdebug.h>
1987     #include <xen/interface/platform.h>
1988     #include <xen/interface/physdev.h>
1989     #include <xen/interface/vcpu.h>
1990     @@ -232,16 +232,18 @@
1991    
1992     static int __init idle_setup (char *str)
1993     {
1994     - if (!strncmp(str, "poll", 4)) {
1995     + if (!strcmp(str, "poll")) {
1996     printk("using polling idle threads.\n");
1997     pm_idle = poll_idle;
1998     - }
1999     + } else if (!strcmp(str, "mwait"))
2000     + force_mwait = 1;
2001     + else
2002     + return -1;
2003    
2004     boot_option_idle_override = 1;
2005     - return 1;
2006     + return 0;
2007     }
2008     -
2009     -__setup("idle=", idle_setup);
2010     +early_param("idle", idle_setup);
2011    
2012     /* Prints also some state that isn't saved in the pt_regs */
2013     void __show_regs(struct pt_regs * regs)
2014     @@ -540,7 +542,7 @@
2015     * The AMD workaround requires it to be after DS reload, or
2016     * after DS has been cleared, which we do in __prepare_arch_switch.
2017     */
2018     - if (prev_p->thread_info->status & TS_USEDFPU) {
2019     + if (task_thread_info(prev_p)->status & TS_USEDFPU) {
2020     __save_init_fpu(prev_p); /* _not_ save_init_fpu() */
2021     mcl->op = __HYPERVISOR_fpu_taskswitch;
2022     mcl->args[0] = 1;
2023     --- a/arch/x86/kernel/quirks-xen.c
2024     +++ b/arch/x86/kernel/quirks-xen.c
2025     @@ -3,12 +3,10 @@
2026     */
2027     #include <linux/pci.h>
2028     #include <linux/irq.h>
2029     -#include <asm/pci-direct.h>
2030     -#include <asm/genapic.h>
2031     -#include <asm/cpu.h>
2032    
2033     #if defined(CONFIG_X86_IO_APIC) && (defined(CONFIG_SMP) || defined(CONFIG_XEN)) && defined(CONFIG_PCI)
2034     -static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev)
2035     +
2036     +static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
2037     {
2038     u8 config, rev;
2039     u32 word;
2040     @@ -16,7 +14,7 @@
2041     /* BIOS may enable hardware IRQ balancing for
2042     * E7520/E7320/E7525(revision ID 0x9 and below)
2043     * based platforms.
2044     - * For those platforms, make sure that the genapic is set to 'flat'
2045     + * Disable SW irqbalance/affinity on those platforms.
2046     */
2047     pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
2048     if (rev > 0x9)
2049     @@ -30,59 +28,20 @@
2050     raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
2051    
2052     if (!(word & (1 << 13))) {
2053     -#ifndef CONFIG_XEN
2054     -#ifdef CONFIG_X86_64
2055     - if (genapic != &apic_flat)
2056     - panic("APIC mode must be flat on this system\n");
2057     -#elif defined(CONFIG_X86_GENERICARCH)
2058     - if (genapic != &apic_default)
2059     - panic("APIC mode must be default(flat) on this system. Use apic=default\n");
2060     -#endif
2061     -#endif
2062     - }
2063     -
2064     - /* put back the original value for config space*/
2065     - if (!(config & 0x2))
2066     - pci_write_config_byte(dev, 0xf4, config);
2067     -}
2068     -
2069     -void __init quirk_intel_irqbalance(void)
2070     -{
2071     - u8 config, rev;
2072     - u32 word;
2073     -
2074     - /* BIOS may enable hardware IRQ balancing for
2075     - * E7520/E7320/E7525(revision ID 0x9 and below)
2076     - * based platforms.
2077     - * Disable SW irqbalance/affinity on those platforms.
2078     - */
2079     - rev = read_pci_config_byte(0, 0, 0, PCI_CLASS_REVISION);
2080     - if (rev > 0x9)
2081     - return;
2082     -
2083     - printk(KERN_INFO "Intel E7520/7320/7525 detected.");
2084     -
2085     - /* enable access to config space */
2086     - config = read_pci_config_byte(0, 0, 0, 0xf4);
2087     - write_pci_config_byte(0, 0, 0, 0xf4, config|0x2);
2088     -
2089     - /* read xTPR register */
2090     - word = read_pci_config_16(0, 0, 0x40, 0x4c);
2091     -
2092     - if (!(word & (1 << 13))) {
2093     struct xen_platform_op op;
2094     - printk(KERN_INFO "Disabling irq balancing and affinity\n");
2095     +
2096     + printk(KERN_INFO "Intel E7520/7320/7525 detected. "
2097     + "Disabling irq balancing and affinity\n");
2098     op.cmd = XENPF_platform_quirk;
2099     op.u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING;
2100     WARN_ON(HYPERVISOR_platform_op(&op));
2101     }
2102    
2103     - /* put back the original value for config space */
2104     + /* put back the original value for config space*/
2105     if (!(config & 0x2))
2106     - write_pci_config_byte(0, 0, 0, 0xf4, config);
2107     + pci_write_config_byte(dev, 0xf4, config);
2108     }
2109     -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, verify_quirk_intel_irqbalance);
2110     -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, verify_quirk_intel_irqbalance);
2111     -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, verify_quirk_intel_irqbalance);
2112     -
2113     +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance);
2114     +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance);
2115     +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance);
2116     #endif
2117     --- a/arch/x86/kernel/setup64-xen.c
2118     +++ b/arch/x86/kernel/setup64-xen.c
2119     @@ -113,9 +113,9 @@
2120     if (!NODE_DATA(cpu_to_node(i))) {
2121     printk("cpu with no node %d, num_online_nodes %d\n",
2122     i, num_online_nodes());
2123     - ptr = alloc_bootmem(size);
2124     + ptr = alloc_bootmem_pages(size);
2125     } else {
2126     - ptr = alloc_bootmem_node(NODE_DATA(cpu_to_node(i)), size);
2127     + ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(i)), size);
2128     }
2129     if (!ptr)
2130     panic("Cannot allocate cpu data for CPU %d\n", i);
2131     @@ -208,6 +208,8 @@
2132     __attribute__((section(".bss.page_aligned")));
2133     #endif
2134    
2135     +extern asmlinkage void ignore_sysret(void);
2136     +
2137     /* May not be marked __init: used by software suspend */
2138     void syscall_init(void)
2139     {
2140     @@ -219,12 +221,22 @@
2141     */
2142     wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
2143     wrmsrl(MSR_LSTAR, system_call);
2144     + wrmsrl(MSR_CSTAR, ignore_sysret);
2145    
2146     /* Flags to clear on syscall */
2147     wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000);
2148     #endif
2149     #ifdef CONFIG_IA32_EMULATION
2150     syscall32_cpu_init ();
2151     +#else
2152     + {
2153     + static const struct callback_register cstar = {
2154     + .type = CALLBACKTYPE_syscall32,
2155     + .address = (unsigned long)ignore_sysret
2156     + };
2157     + if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar))
2158     + printk(KERN_WARNING "Unable to register CSTAR callback\n");
2159     + }
2160     #endif
2161     }
2162    
2163     @@ -262,7 +274,6 @@
2164     /* CPU 0 is initialised in head64.c */
2165     if (cpu != 0) {
2166     pda_init(cpu);
2167     - zap_low_mappings(cpu);
2168     }
2169     #ifndef CONFIG_X86_NO_TSS
2170     else
2171     --- a/arch/x86/kernel/setup_64-xen.c
2172     +++ b/arch/x86/kernel/setup_64-xen.c
2173     @@ -123,6 +123,8 @@
2174    
2175     unsigned long saved_video_mode;
2176    
2177     +int force_mwait __cpuinitdata;
2178     +
2179     /*
2180     * Early DMI memory
2181     */
2182     @@ -256,10 +258,10 @@
2183     * there is a real-mode segmented pointer pointing to the
2184     * 4K EBDA area at 0x40E
2185     */
2186     - ebda_addr = *(unsigned short *)EBDA_ADDR_POINTER;
2187     + ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER);
2188     ebda_addr <<= 4;
2189    
2190     - ebda_size = *(unsigned short *)(unsigned long)ebda_addr;
2191     + ebda_size = *(unsigned short *)__va(ebda_addr);
2192    
2193     /* Round EBDA up to pages */
2194     if (ebda_size == 0)
2195     @@ -413,15 +415,8 @@
2196     #endif
2197    
2198     #ifdef CONFIG_SMP
2199     - /*
2200     - * But first pinch a few for the stack/trampoline stuff
2201     - * FIXME: Don't need the extra page at 4K, but need to fix
2202     - * trampoline before removing it. (see the GDT stuff)
2203     - */
2204     - reserve_bootmem_generic(PAGE_SIZE, PAGE_SIZE);
2205     -
2206     /* Reserve SMP trampoline */
2207     - reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, PAGE_SIZE);
2208     + reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, 2*PAGE_SIZE);
2209     #endif
2210     #endif
2211    
2212     @@ -573,8 +568,6 @@
2213     early_quirks();
2214     #endif
2215    
2216     - zap_low_mappings(0);
2217     -
2218     /*
2219     * set this early, so we dont allocate cpu0
2220     * if MADT list doesnt list BSP first
2221     @@ -877,6 +870,10 @@
2222    
2223     /* RDTSC can be speculated around */
2224     clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
2225     +
2226     + /* Family 10 doesn't support C states in MWAIT so don't use it */
2227     + if (c->x86 == 0x10 && !force_mwait)
2228     + clear_bit(X86_FEATURE_MWAIT, &c->x86_capability);
2229     }
2230    
2231     static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
2232     @@ -1159,9 +1156,7 @@
2233     #ifdef CONFIG_X86_MCE
2234     mcheck_init(c);
2235     #endif
2236     - if (c == &boot_cpu_data)
2237     - mtrr_bp_init();
2238     - else
2239     + if (c != &boot_cpu_data)
2240     mtrr_ap_init();
2241     #ifdef CONFIG_NUMA
2242     numa_add_cpu(smp_processor_id());
2243     @@ -1252,9 +1247,8 @@
2244     "stc",
2245     "100mhzsteps",
2246     "hwpstate",
2247     - NULL, /* tsc invariant mapped to constant_tsc */
2248     - NULL,
2249     - /* nothing */ /* constant_tsc - moved to flags */
2250     + "", /* tsc invariant mapped to constant_tsc */
2251     + /* nothing */
2252     };
2253    
2254    
2255     --- a/arch/x86/kernel/smp_32-xen.c
2256     +++ b/arch/x86/kernel/smp_32-xen.c
2257     @@ -13,7 +13,6 @@
2258     #include <linux/mm.h>
2259     #include <linux/delay.h>
2260     #include <linux/spinlock.h>
2261     -#include <linux/smp_lock.h>
2262     #include <linux/kernel_stat.h>
2263     #include <linux/mc146818rtc.h>
2264     #include <linux/cache.h>
2265     @@ -216,7 +215,6 @@
2266     static struct mm_struct * flush_mm;
2267     static unsigned long flush_va;
2268     static DEFINE_SPINLOCK(tlbstate_lock);
2269     -#define FLUSH_ALL 0xffffffff
2270    
2271     /*
2272     * We cannot call mmdrop() because we are in interrupt context,
2273     @@ -298,7 +296,7 @@
2274    
2275     if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
2276     if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
2277     - if (flush_va == FLUSH_ALL)
2278     + if (flush_va == TLB_FLUSH_ALL)
2279     local_flush_tlb();
2280     else
2281     __flush_tlb_one(flush_va);
2282     @@ -314,9 +312,11 @@
2283     return IRQ_HANDLED;
2284     }
2285    
2286     -static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
2287     - unsigned long va)
2288     +void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
2289     + unsigned long va)
2290     {
2291     + cpumask_t cpumask = *cpumaskp;
2292     +
2293     /*
2294     * A couple of (to be removed) sanity checks:
2295     *
2296     @@ -327,10 +327,12 @@
2297     BUG_ON(cpu_isset(smp_processor_id(), cpumask));
2298     BUG_ON(!mm);
2299    
2300     +#ifdef CONFIG_HOTPLUG_CPU
2301     /* If a CPU which we ran on has gone down, OK. */
2302     cpus_and(cpumask, cpumask, cpu_online_map);
2303     - if (cpus_empty(cpumask))
2304     + if (unlikely(cpus_empty(cpumask)))
2305     return;
2306     +#endif
2307    
2308     /*
2309     * i'm not happy about this global shared spinlock in the
2310     @@ -341,17 +343,7 @@
2311    
2312     flush_mm = mm;
2313     flush_va = va;
2314     -#if NR_CPUS <= BITS_PER_LONG
2315     - atomic_set_mask(cpumask, &flush_cpumask);
2316     -#else
2317     - {
2318     - int k;
2319     - unsigned long *flush_mask = (unsigned long *)&flush_cpumask;
2320     - unsigned long *cpu_mask = (unsigned long *)&cpumask;
2321     - for (k = 0; k < BITS_TO_LONGS(NR_CPUS); ++k)
2322     - atomic_set_mask(cpu_mask[k], &flush_mask[k]);
2323     - }
2324     -#endif
2325     + cpus_or(flush_cpumask, cpumask, flush_cpumask);
2326     /*
2327     * We have to send the IPI only to
2328     * CPUs affected.
2329     @@ -378,7 +370,7 @@
2330    
2331     local_flush_tlb();
2332     if (!cpus_empty(cpu_mask))
2333     - flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
2334     + flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
2335     preempt_enable();
2336     }
2337    
2338     @@ -397,7 +389,7 @@
2339     leave_mm(smp_processor_id());
2340     }
2341     if (!cpus_empty(cpu_mask))
2342     - flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
2343     + flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
2344    
2345     preempt_enable();
2346     }
2347     @@ -446,7 +438,7 @@
2348     * it goes straight through and wastes no time serializing
2349     * anything. Worst case is that we lose a reschedule ...
2350     */
2351     -void smp_send_reschedule(int cpu)
2352     +void xen_smp_send_reschedule(int cpu)
2353     {
2354     WARN_ON(cpu_is_offline(cpu));
2355     send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
2356     @@ -478,36 +470,79 @@
2357    
2358     static struct call_data_struct *call_data;
2359    
2360     +static void __smp_call_function(void (*func) (void *info), void *info,
2361     + int nonatomic, int wait)
2362     +{
2363     + struct call_data_struct data;
2364     + int cpus = num_online_cpus() - 1;
2365     +
2366     + if (!cpus)
2367     + return;
2368     +
2369     + data.func = func;
2370     + data.info = info;
2371     + atomic_set(&data.started, 0);
2372     + data.wait = wait;
2373     + if (wait)
2374     + atomic_set(&data.finished, 0);
2375     +
2376     + call_data = &data;
2377     + mb();
2378     +
2379     + /* Send a message to all other CPUs and wait for them to respond */
2380     + send_IPI_allbutself(CALL_FUNCTION_VECTOR);
2381     +
2382     + /* Wait for response */
2383     + while (atomic_read(&data.started) != cpus)
2384     + cpu_relax();
2385     +
2386     + if (wait)
2387     + while (atomic_read(&data.finished) != cpus)
2388     + cpu_relax();
2389     +}
2390     +
2391     +
2392     /**
2393     - * smp_call_function(): Run a function on all other CPUs.
2394     + * smp_call_function_mask(): Run a function on a set of other CPUs.
2395     + * @mask: The set of cpus to run on. Must not include the current cpu.
2396     * @func: The function to run. This must be fast and non-blocking.
2397     * @info: An arbitrary pointer to pass to the function.
2398     - * @nonatomic: currently unused.
2399     * @wait: If true, wait (atomically) until function has completed on other CPUs.
2400     *
2401     - * Returns 0 on success, else a negative status code. Does not return until
2402     - * remote CPUs are nearly ready to execute <<func>> or are or have executed.
2403     + * Returns 0 on success, else a negative status code.
2404     + *
2405     + * If @wait is true, then returns once @func has returned; otherwise
2406     + * it returns just before the target cpu calls @func.
2407     *
2408     * You must not call this function with disabled interrupts or from a
2409     * hardware interrupt handler or from a bottom half handler.
2410     */
2411     -int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
2412     - int wait)
2413     +int
2414     +xen_smp_call_function_mask(cpumask_t mask,
2415     + void (*func)(void *), void *info,
2416     + int wait)
2417     {
2418     struct call_data_struct data;
2419     + cpumask_t allbutself;
2420     int cpus;
2421    
2422     + /* Can deadlock when called with interrupts disabled */
2423     + WARN_ON(irqs_disabled());
2424     +
2425     /* Holding any lock stops cpus from going down. */
2426     spin_lock(&call_lock);
2427     - cpus = num_online_cpus() - 1;
2428     +
2429     + allbutself = cpu_online_map;
2430     + cpu_clear(smp_processor_id(), allbutself);
2431     +
2432     + cpus_and(mask, mask, allbutself);
2433     + cpus = cpus_weight(mask);
2434     +
2435     if (!cpus) {
2436     spin_unlock(&call_lock);
2437     return 0;
2438     }
2439    
2440     - /* Can deadlock when called with interrupts disabled */
2441     - WARN_ON(irqs_disabled());
2442     -
2443     data.func = func;
2444     data.info = info;
2445     atomic_set(&data.started, 0);
2446     @@ -517,9 +552,12 @@
2447    
2448     call_data = &data;
2449     mb();
2450     -
2451     - /* Send a message to all other CPUs and wait for them to respond */
2452     - send_IPI_allbutself(CALL_FUNCTION_VECTOR);
2453     +
2454     + /* Send a message to other CPUs */
2455     + if (cpus_equal(mask, allbutself))
2456     + send_IPI_allbutself(CALL_FUNCTION_VECTOR);
2457     + else
2458     + send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
2459    
2460     /* Wait for response */
2461     while (atomic_read(&data.started) != cpus)
2462     @@ -532,15 +570,14 @@
2463    
2464     return 0;
2465     }
2466     -EXPORT_SYMBOL(smp_call_function);
2467    
2468     static void stop_this_cpu (void * dummy)
2469     {
2470     + local_irq_disable();
2471     /*
2472     * Remove this CPU:
2473     */
2474     cpu_clear(smp_processor_id(), cpu_online_map);
2475     - local_irq_disable();
2476     disable_all_local_evtchn();
2477     if (cpu_data[smp_processor_id()].hlt_works_ok)
2478     for(;;) halt();
2479     @@ -551,13 +588,18 @@
2480     * this function calls the 'stop' function on all other CPUs in the system.
2481     */
2482    
2483     -void smp_send_stop(void)
2484     +void xen_smp_send_stop(void)
2485     {
2486     - smp_call_function(stop_this_cpu, NULL, 1, 0);
2487     + /* Don't deadlock on the call lock in panic */
2488     + int nolock = !spin_trylock(&call_lock);
2489     + unsigned long flags;
2490    
2491     - local_irq_disable();
2492     + local_irq_save(flags);
2493     + __smp_call_function(stop_this_cpu, NULL, 0, 0);
2494     + if (!nolock)
2495     + spin_unlock(&call_lock);
2496     disable_all_local_evtchn();
2497     - local_irq_enable();
2498     + local_irq_restore(flags);
2499     }
2500    
2501     /*
2502     @@ -598,74 +640,3 @@
2503    
2504     return IRQ_HANDLED;
2505     }
2506     -
2507     -/*
2508     - * this function sends a 'generic call function' IPI to one other CPU
2509     - * in the system.
2510     - *
2511     - * cpu is a standard Linux logical CPU number.
2512     - */
2513     -static void
2514     -__smp_call_function_single(int cpu, void (*func) (void *info), void *info,
2515     - int nonatomic, int wait)
2516     -{
2517     - struct call_data_struct data;
2518     - int cpus = 1;
2519     -
2520     - data.func = func;
2521     - data.info = info;
2522     - atomic_set(&data.started, 0);
2523     - data.wait = wait;
2524     - if (wait)
2525     - atomic_set(&data.finished, 0);
2526     -
2527     - call_data = &data;
2528     - wmb();
2529     - /* Send a message to all other CPUs and wait for them to respond */
2530     - send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR);
2531     -
2532     - /* Wait for response */
2533     - while (atomic_read(&data.started) != cpus)
2534     - cpu_relax();
2535     -
2536     - if (!wait)
2537     - return;
2538     -
2539     - while (atomic_read(&data.finished) != cpus)
2540     - cpu_relax();
2541     -}
2542     -
2543     -/*
2544     - * smp_call_function_single - Run a function on another CPU
2545     - * @func: The function to run. This must be fast and non-blocking.
2546     - * @info: An arbitrary pointer to pass to the function.
2547     - * @nonatomic: Currently unused.
2548     - * @wait: If true, wait until function has completed on other CPUs.
2549     - *
2550     - * Retrurns 0 on success, else a negative status code.
2551     - *
2552     - * Does not return until the remote CPU is nearly ready to execute <func>
2553     - * or is or has executed.
2554     - */
2555     -
2556     -int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
2557     - int nonatomic, int wait)
2558     -{
2559     - /* prevent preemption and reschedule on another processor */
2560     - int me = get_cpu();
2561     - if (cpu == me) {
2562     - WARN_ON(1);
2563     - put_cpu();
2564     - return -EBUSY;
2565     - }
2566     -
2567     - /* Can deadlock when called with interrupts disabled */
2568     - WARN_ON(irqs_disabled());
2569     -
2570     - spin_lock_bh(&call_lock);
2571     - __smp_call_function_single(cpu, func, info, nonatomic, wait);
2572     - spin_unlock_bh(&call_lock);
2573     - put_cpu();
2574     - return 0;
2575     -}
2576     -EXPORT_SYMBOL(smp_call_function_single);
2577     --- a/arch/x86/kernel/smp_64-xen.c
2578     +++ b/arch/x86/kernel/smp_64-xen.c
2579     @@ -14,7 +14,6 @@
2580     #include <linux/mm.h>
2581     #include <linux/delay.h>
2582     #include <linux/spinlock.h>
2583     -#include <linux/smp_lock.h>
2584     #include <linux/smp.h>
2585     #include <linux/kernel_stat.h>
2586     #include <linux/mc146818rtc.h>
2587     @@ -457,44 +456,36 @@
2588     }
2589     EXPORT_SYMBOL(smp_call_function);
2590    
2591     -void smp_stop_cpu(void)
2592     +static void stop_this_cpu(void *dummy)
2593     {
2594     - unsigned long flags;
2595     + local_irq_disable();
2596     /*
2597     * Remove this CPU:
2598     */
2599     cpu_clear(smp_processor_id(), cpu_online_map);
2600     - local_irq_save(flags);
2601     disable_all_local_evtchn();
2602     - local_irq_restore(flags);
2603     -}
2604     -
2605     -static void smp_really_stop_cpu(void *dummy)
2606     -{
2607     - smp_stop_cpu();
2608     for (;;)
2609     halt();
2610     }
2611    
2612     void smp_send_stop(void)
2613     {
2614     - int nolock = 0;
2615     + int nolock;
2616     + unsigned long flags;
2617     +
2618     #ifndef CONFIG_XEN
2619     if (reboot_force)
2620     return;
2621     #endif
2622     +
2623     /* Don't deadlock on the call lock in panic */
2624     - if (!spin_trylock(&call_lock)) {
2625     - /* ignore locking because we have panicked anyways */
2626     - nolock = 1;
2627     - }
2628     - __smp_call_function(smp_really_stop_cpu, NULL, 0, 0);
2629     + nolock = !spin_trylock(&call_lock);
2630     + local_irq_save(flags);
2631     + __smp_call_function(stop_this_cpu, NULL, 0, 0);
2632     if (!nolock)
2633     spin_unlock(&call_lock);
2634     -
2635     - local_irq_disable();
2636     disable_all_local_evtchn();
2637     - local_irq_enable();
2638     + local_irq_restore(flags);
2639     }
2640    
2641     /*
2642     --- a/arch/x86/kernel/time_32-xen.c
2643     +++ b/arch/x86/kernel/time_32-xen.c
2644     @@ -80,7 +80,6 @@
2645     #include <asm/i8253.h>
2646     DEFINE_SPINLOCK(i8253_lock);
2647     EXPORT_SYMBOL(i8253_lock);
2648     -int pit_latch_buggy; /* extern */
2649     #else
2650     volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
2651     #endif
2652     @@ -589,7 +588,7 @@
2653     return IRQ_HANDLED;
2654     }
2655    
2656     -void mark_tsc_unstable(void)
2657     +void mark_tsc_unstable(char *reason)
2658     {
2659     #ifndef CONFIG_XEN /* XXX Should tell the hypervisor about this fact. */
2660     tsc_unstable = 1;
2661     @@ -597,17 +596,18 @@
2662     }
2663     EXPORT_SYMBOL_GPL(mark_tsc_unstable);
2664    
2665     +static cycle_t cs_last;
2666     +
2667     static cycle_t xen_clocksource_read(void)
2668     {
2669     cycle_t ret = sched_clock();
2670    
2671     #ifdef CONFIG_SMP
2672     for (;;) {
2673     - static cycle_t last_ret;
2674     #ifndef CONFIG_64BIT
2675     - cycle_t last = cmpxchg64(&last_ret, 0, 0);
2676     + cycle_t last = cmpxchg64(&cs_last, 0, 0);
2677     #else
2678     - cycle_t last = last_ret;
2679     + cycle_t last = cs_last;
2680     #define cmpxchg64 cmpxchg
2681     #endif
2682    
2683     @@ -627,7 +627,7 @@
2684     }
2685     ret = last;
2686     }
2687     - if (cmpxchg64(&last_ret, last, ret) == last)
2688     + if (cmpxchg64(&cs_last, last, ret) == last)
2689     break;
2690     }
2691     #endif
2692     @@ -635,6 +635,14 @@
2693     return ret;
2694     }
2695    
2696     +static void xen_clocksource_resume(void)
2697     +{
2698     + extern void time_resume(void);
2699     +
2700     + time_resume();
2701     + cs_last = sched_clock();
2702     +}
2703     +
2704     static struct clocksource clocksource_xen = {
2705     .name = "xen",
2706     .rating = 400,
2707     @@ -643,6 +651,7 @@
2708     .mult = 1 << XEN_SHIFT, /* time directly in nanoseconds */
2709     .shift = XEN_SHIFT,
2710     .flags = CLOCK_SOURCE_IS_CONTINUOUS,
2711     + .resume = xen_clocksource_resume,
2712     };
2713    
2714     static void init_missing_ticks_accounting(unsigned int cpu)
2715     @@ -731,35 +740,6 @@
2716     mod_timer(&sync_xen_wallclock_timer, jiffies + 1);
2717     }
2718    
2719     -static int timer_resume(struct sys_device *dev)
2720     -{
2721     - extern void time_resume(void);
2722     - time_resume();
2723     - return 0;
2724     -}
2725     -
2726     -static struct sysdev_class timer_sysclass = {
2727     - .resume = timer_resume,
2728     - set_kset_name("timer"),
2729     -};
2730     -
2731     -
2732     -/* XXX this driverfs stuff should probably go elsewhere later -john */
2733     -static struct sys_device device_timer = {
2734     - .id = 0,
2735     - .cls = &timer_sysclass,
2736     -};
2737     -
2738     -static int time_init_device(void)
2739     -{
2740     - int error = sysdev_class_register(&timer_sysclass);
2741     - if (!error)
2742     - error = sysdev_register(&device_timer);
2743     - return error;
2744     -}
2745     -
2746     -device_initcall(time_init_device);
2747     -
2748     extern void (*late_time_init)(void);
2749    
2750     /* Dynamically-mapped IRQ. */
2751     @@ -772,7 +752,7 @@
2752     VIRQ_TIMER,
2753     0,
2754     timer_interrupt,
2755     - SA_INTERRUPT,
2756     + IRQF_DISABLED,
2757     "timer0",
2758     NULL);
2759     BUG_ON(per_cpu(timer_irq, 0) < 0);
2760     @@ -890,21 +870,21 @@
2761     cpu_clear(smp_processor_id(), nohz_cpu_mask);
2762     }
2763    
2764     -void raw_safe_halt(void)
2765     +void xen_safe_halt(void)
2766     {
2767     stop_hz_timer();
2768     /* Blocking includes an implicit local_irq_enable(). */
2769     HYPERVISOR_block();
2770     start_hz_timer();
2771     }
2772     -EXPORT_SYMBOL(raw_safe_halt);
2773     +EXPORT_SYMBOL(xen_safe_halt);
2774    
2775     -void halt(void)
2776     +void xen_halt(void)
2777     {
2778     if (irqs_disabled())
2779     VOID(HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL));
2780     }
2781     -EXPORT_SYMBOL(halt);
2782     +EXPORT_SYMBOL(xen_halt);
2783    
2784     /* No locking required. Interrupts are disabled on all CPUs. */
2785     void time_resume(void)
2786     @@ -967,7 +947,7 @@
2787     irq = bind_virq_to_irqhandler(VIRQ_TIMER,
2788     cpu,
2789     timer_interrupt,
2790     - SA_INTERRUPT,
2791     + IRQF_DISABLED,
2792     timer_name[cpu],
2793     NULL);
2794     if (irq < 0)
2795     --- a/arch/x86/kernel/traps_32-xen.c
2796     +++ b/arch/x86/kernel/traps_32-xen.c
2797     @@ -52,7 +52,7 @@
2798     #include <asm/unwind.h>
2799     #include <asm/smp.h>
2800     #include <asm/arch_hooks.h>
2801     -#include <asm/kdebug.h>
2802     +#include <linux/kdebug.h>
2803     #include <asm/stacktrace.h>
2804    
2805     #include <linux/module.h>
2806     @@ -101,20 +101,6 @@
2807    
2808     int kstack_depth_to_print = 24;
2809     static unsigned int code_bytes = 64;
2810     -ATOMIC_NOTIFIER_HEAD(i386die_chain);
2811     -
2812     -int register_die_notifier(struct notifier_block *nb)
2813     -{
2814     - vmalloc_sync_all();
2815     - return atomic_notifier_chain_register(&i386die_chain, nb);
2816     -}
2817     -EXPORT_SYMBOL(register_die_notifier); /* used modular by kdb */
2818     -
2819     -int unregister_die_notifier(struct notifier_block *nb)
2820     -{
2821     - return atomic_notifier_chain_unregister(&i386die_chain, nb);
2822     -}
2823     -EXPORT_SYMBOL(unregister_die_notifier); /* used modular by kdb */
2824    
2825     static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
2826     {
2827     @@ -325,7 +311,7 @@
2828     regs->xds & 0xffff, regs->xes & 0xffff, regs->xfs & 0xffff, gs, ss);
2829     printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)",
2830     TASK_COMM_LEN, current->comm, current->pid,
2831     - current_thread_info(), current, current->thread_info);
2832     + current_thread_info(), current, task_thread_info(current));
2833     /*
2834     * When in-kernel, we also print out the stack and code at the
2835     * time of the fault..
2836     @@ -482,8 +468,6 @@
2837     siginfo_t *info)
2838     {
2839     struct task_struct *tsk = current;
2840     - tsk->thread.error_code = error_code;
2841     - tsk->thread.trap_no = trapnr;
2842    
2843     if (regs->eflags & VM_MASK) {
2844     if (vm86)
2845     @@ -495,6 +479,18 @@
2846     goto kernel_trap;
2847    
2848     trap_signal: {
2849     + /*
2850     + * We want error_code and trap_no set for userspace faults and
2851     + * kernelspace faults which result in die(), but not
2852     + * kernelspace faults which are fixed up. die() gives the
2853     + * process no chance to handle the signal and notice the
2854     + * kernel fault information, so that won't result in polluting
2855     + * the information about previously queued, but not yet
2856     + * delivered, faults. See also do_general_protection below.
2857     + */
2858     + tsk->thread.error_code = error_code;
2859     + tsk->thread.trap_no = trapnr;
2860     +
2861     if (info)
2862     force_sig_info(signr, info, tsk);
2863     else
2864     @@ -503,8 +499,11 @@
2865     }
2866    
2867     kernel_trap: {
2868     - if (!fixup_exception(regs))
2869     + if (!fixup_exception(regs)) {
2870     + tsk->thread.error_code = error_code;
2871     + tsk->thread.trap_no = trapnr;
2872     die(str, regs, error_code);
2873     + }
2874     return;
2875     }
2876    
2877     @@ -578,9 +577,6 @@
2878     fastcall void __kprobes do_general_protection(struct pt_regs * regs,
2879     long error_code)
2880     {
2881     - current->thread.error_code = error_code;
2882     - current->thread.trap_no = 13;
2883     -
2884     if (regs->eflags & VM_MASK)
2885     goto gp_in_vm86;
2886    
2887     @@ -599,6 +595,8 @@
2888    
2889     gp_in_kernel:
2890     if (!fixup_exception(regs)) {
2891     + current->thread.error_code = error_code;
2892     + current->thread.trap_no = 13;
2893     if (notify_die(DIE_GPF, "general protection fault", regs,
2894     error_code, 13, SIGSEGV) == NOTIFY_STOP)
2895     return;
2896     @@ -987,9 +985,7 @@
2897     fastcall unsigned long patch_espfix_desc(unsigned long uesp,
2898     unsigned long kesp)
2899     {
2900     - int cpu = smp_processor_id();
2901     - struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
2902     - struct desc_struct *gdt = (struct desc_struct *)cpu_gdt_descr->address;
2903     + struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
2904     unsigned long base = (kesp - uesp) & -THREAD_SIZE;
2905     unsigned long new_kesp = kesp - base;
2906     unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
2907     --- a/arch/x86/kernel/traps_64-xen.c
2908     +++ b/arch/x86/kernel/traps_64-xen.c
2909     @@ -32,6 +32,7 @@
2910     #include <linux/unwind.h>
2911     #include <linux/uaccess.h>
2912     #include <linux/bug.h>
2913     +#include <linux/kdebug.h>
2914    
2915     #include <asm/system.h>
2916     #include <asm/io.h>
2917     @@ -39,7 +40,6 @@
2918     #include <asm/debugreg.h>
2919     #include <asm/desc.h>
2920     #include <asm/i387.h>
2921     -#include <asm/kdebug.h>
2922     #include <asm/processor.h>
2923     #include <asm/unwind.h>
2924     #include <asm/smp.h>
2925     @@ -71,22 +71,6 @@
2926     asmlinkage void machine_check(void);
2927     asmlinkage void spurious_interrupt_bug(void);
2928    
2929     -ATOMIC_NOTIFIER_HEAD(die_chain);
2930     -EXPORT_SYMBOL(die_chain);
2931     -
2932     -int register_die_notifier(struct notifier_block *nb)
2933     -{
2934     - vmalloc_sync_all();
2935     - return atomic_notifier_chain_register(&die_chain, nb);
2936     -}
2937     -EXPORT_SYMBOL(register_die_notifier); /* used modular by kdb */
2938     -
2939     -int unregister_die_notifier(struct notifier_block *nb)
2940     -{
2941     - return atomic_notifier_chain_unregister(&die_chain, nb);
2942     -}
2943     -EXPORT_SYMBOL(unregister_die_notifier); /* used modular by kdb */
2944     -
2945     static inline void conditional_sti(struct pt_regs *regs)
2946     {
2947     if (regs->eflags & X86_EFLAGS_IF)
2948     @@ -428,8 +412,7 @@
2949     const int cpu = smp_processor_id();
2950     struct task_struct *cur = cpu_pda(cpu)->pcurrent;
2951    
2952     - rsp = regs->rsp;
2953     -
2954     + rsp = regs->rsp;
2955     printk("CPU %d ", cpu);
2956     __show_regs(regs);
2957     printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
2958     @@ -440,7 +423,6 @@
2959     * time of the fault..
2960     */
2961     if (in_kernel) {
2962     -
2963     printk("Stack: ");
2964     _show_stack(NULL, regs, (unsigned long*)rsp);
2965    
2966     @@ -485,13 +467,14 @@
2967    
2968     unsigned __kprobes long oops_begin(void)
2969     {
2970     - int cpu = smp_processor_id();
2971     + int cpu;
2972     unsigned long flags;
2973    
2974     oops_enter();
2975    
2976     /* racy, but better than risking deadlock. */
2977     local_irq_save(flags);
2978     + cpu = smp_processor_id();
2979     if (!spin_trylock(&die_lock)) {
2980     if (cpu == die_owner)
2981     /* nested oops. should stop eventually */;
2982     @@ -585,10 +568,20 @@
2983     {
2984     struct task_struct *tsk = current;
2985    
2986     - tsk->thread.error_code = error_code;
2987     - tsk->thread.trap_no = trapnr;
2988     -
2989     if (user_mode(regs)) {
2990     + /*
2991     + * We want error_code and trap_no set for userspace
2992     + * faults and kernelspace faults which result in
2993     + * die(), but not kernelspace faults which are fixed
2994     + * up. die() gives the process no chance to handle
2995     + * the signal and notice the kernel fault information,
2996     + * so that won't result in polluting the information
2997     + * about previously queued, but not yet delivered,
2998     + * faults. See also do_general_protection below.
2999     + */
3000     + tsk->thread.error_code = error_code;
3001     + tsk->thread.trap_no = trapnr;
3002     +
3003     if (exception_trace && unhandled_signal(tsk, signr))
3004     printk(KERN_INFO
3005     "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n",
3006     @@ -609,8 +602,11 @@
3007     fixup = search_exception_tables(regs->rip);
3008     if (fixup)
3009     regs->rip = fixup->fixup;
3010     - else
3011     + else {
3012     + tsk->thread.error_code = error_code;
3013     + tsk->thread.trap_no = trapnr;
3014     die(str, regs, error_code);
3015     + }
3016     return;
3017     }
3018     }
3019     @@ -686,10 +682,10 @@
3020    
3021     conditional_sti(regs);
3022    
3023     - tsk->thread.error_code = error_code;
3024     - tsk->thread.trap_no = 13;
3025     -
3026     if (user_mode(regs)) {
3027     + tsk->thread.error_code = error_code;
3028     + tsk->thread.trap_no = 13;
3029     +
3030     if (exception_trace && unhandled_signal(tsk, SIGSEGV))
3031     printk(KERN_INFO
3032     "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n",
3033     @@ -708,6 +704,9 @@
3034     regs->rip = fixup->fixup;
3035     return;
3036     }
3037     +
3038     + tsk->thread.error_code = error_code;
3039     + tsk->thread.trap_no = 13;
3040     if (notify_die(DIE_GPF, "general protection fault", regs,
3041     error_code, 13, SIGSEGV) == NOTIFY_STOP)
3042     return;
3043     --- a/arch/x86/kernel/vsyscall_64-xen.c
3044     +++ b/arch/x86/kernel/vsyscall_64-xen.c
3045     @@ -45,14 +45,34 @@
3046    
3047     #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
3048     #define __syscall_clobber "r11","rcx","memory"
3049     +#define __pa_vsymbol(x) \
3050     + ({unsigned long v; \
3051     + extern char __vsyscall_0; \
3052     + asm("" : "=r" (v) : "0" (x)); \
3053     + ((v - VSYSCALL_FIRST_PAGE) + __pa_symbol(&__vsyscall_0)); })
3054    
3055     +/*
3056     + * vsyscall_gtod_data contains data that is:
3057     + * - readonly from vsyscalls
3058     + * - written by timer interrupt or sysctl (/proc/sys/kernel/vsyscall64)
3059     + * Try to keep this structure as small as possible to avoid cache line ping pongs
3060     + */
3061     struct vsyscall_gtod_data_t {
3062     - seqlock_t lock;
3063     - int sysctl_enabled;
3064     - struct timeval wall_time_tv;
3065     + seqlock_t lock;
3066     +
3067     + /* open coded 'struct timespec' */
3068     + time_t wall_time_sec;
3069     + u32 wall_time_nsec;
3070     +
3071     + int sysctl_enabled;
3072     struct timezone sys_tz;
3073     - cycle_t offset_base;
3074     - struct clocksource clock;
3075     + struct { /* extract of a clocksource struct */
3076     + cycle_t (*vread)(void);
3077     + cycle_t cycle_last;
3078     + cycle_t mask;
3079     + u32 mult;
3080     + u32 shift;
3081     + } clock;
3082     };
3083     int __vgetcpu_mode __section_vgetcpu_mode;
3084    
3085     @@ -68,9 +88,13 @@
3086    
3087     write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
3088     /* copy vsyscall data */
3089     - vsyscall_gtod_data.clock = *clock;
3090     - vsyscall_gtod_data.wall_time_tv.tv_sec = wall_time->tv_sec;
3091     - vsyscall_gtod_data.wall_time_tv.tv_usec = wall_time->tv_nsec/1000;
3092     + vsyscall_gtod_data.clock.vread = clock->vread;
3093     + vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
3094     + vsyscall_gtod_data.clock.mask = clock->mask;
3095     + vsyscall_gtod_data.clock.mult = clock->mult;
3096     + vsyscall_gtod_data.clock.shift = clock->shift;
3097     + vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
3098     + vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
3099     vsyscall_gtod_data.sys_tz = sys_tz;
3100     write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
3101     }
3102     @@ -105,7 +129,8 @@
3103     static __always_inline void do_vgettimeofday(struct timeval * tv)
3104     {
3105     cycle_t now, base, mask, cycle_delta;
3106     - unsigned long seq, mult, shift, nsec_delta;
3107     + unsigned seq;
3108     + unsigned long mult, shift, nsec;
3109     cycle_t (*vread)(void);
3110     do {
3111     seq = read_seqbegin(&__vsyscall_gtod_data.lock);
3112     @@ -121,21 +146,20 @@
3113     mult = __vsyscall_gtod_data.clock.mult;
3114     shift = __vsyscall_gtod_data.clock.shift;
3115    
3116     - *tv = __vsyscall_gtod_data.wall_time_tv;
3117     -
3118     + tv->tv_sec = __vsyscall_gtod_data.wall_time_sec;
3119     + nsec = __vsyscall_gtod_data.wall_time_nsec;
3120     } while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
3121    
3122     /* calculate interval: */
3123     cycle_delta = (now - base) & mask;
3124     /* convert to nsecs: */
3125     - nsec_delta = (cycle_delta * mult) >> shift;
3126     + nsec += (cycle_delta * mult) >> shift;
3127    
3128     - /* convert to usecs and add to timespec: */
3129     - tv->tv_usec += nsec_delta / NSEC_PER_USEC;
3130     - while (tv->tv_usec > USEC_PER_SEC) {
3131     + while (nsec >= NSEC_PER_SEC) {
3132     tv->tv_sec += 1;
3133     - tv->tv_usec -= USEC_PER_SEC;
3134     + nsec -= NSEC_PER_SEC;
3135     }
3136     + tv->tv_usec = nsec / NSEC_PER_USEC;
3137     }
3138    
3139     int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
3140     @@ -151,11 +175,16 @@
3141     * unlikely */
3142     time_t __vsyscall(1) vtime(time_t *t)
3143     {
3144     + struct timeval tv;
3145     + time_t result;
3146     if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
3147     return time_syscall(t);
3148     - else if (t)
3149     - *t = __vsyscall_gtod_data.wall_time_tv.tv_sec;
3150     - return __vsyscall_gtod_data.wall_time_tv.tv_sec;
3151     +
3152     + vgettimeofday(&tv, 0);
3153     + result = tv.tv_sec;
3154     + if (t)
3155     + *t = result;
3156     + return result;
3157     }
3158    
3159     /* Fast way to get current CPU and node.
3160     @@ -224,10 +253,10 @@
3161     return ret;
3162     /* gcc has some trouble with __va(__pa()), so just do it this
3163     way. */
3164     - map1 = ioremap(__pa_symbol(&vsysc1), 2);
3165     + map1 = ioremap(__pa_vsymbol(&vsysc1), 2);
3166     if (!map1)
3167     return -ENOMEM;
3168     - map2 = ioremap(__pa_symbol(&vsysc2), 2);
3169     + map2 = ioremap(__pa_vsymbol(&vsysc2), 2);
3170     if (!map2) {
3171     ret = -ENOMEM;
3172     goto out;
3173     @@ -304,7 +333,7 @@
3174     cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
3175     {
3176     long cpu = (long)arg;
3177     - if (action == CPU_ONLINE)
3178     + if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
3179     smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1);
3180     return NOTIFY_DONE;
3181     }
3182     --- a/arch/x86/mm/fault_32-xen.c
3183     +++ b/arch/x86/mm/fault_32-xen.c
3184     @@ -14,19 +14,20 @@
3185     #include <linux/mman.h>
3186     #include <linux/mm.h>
3187     #include <linux/smp.h>
3188     -#include <linux/smp_lock.h>
3189     #include <linux/interrupt.h>
3190     #include <linux/init.h>
3191     #include <linux/tty.h>
3192     #include <linux/vt_kern.h> /* For unblank_screen() */
3193     #include <linux/highmem.h>
3194     +#include <linux/bootmem.h> /* for max_low_pfn */
3195     +#include <linux/vmalloc.h>
3196     #include <linux/module.h>
3197     #include <linux/kprobes.h>
3198     #include <linux/uaccess.h>
3199     +#include <linux/kdebug.h>
3200    
3201     #include <asm/system.h>
3202     #include <asm/desc.h>
3203     -#include <asm/kdebug.h>
3204     #include <asm/segment.h>
3205    
3206     extern void die(const char *,struct pt_regs *,long);
3207     @@ -259,25 +260,20 @@
3208     unsigned long page;
3209    
3210     page = read_cr3();
3211     - page = ((unsigned long *) __va(page))[address >> 22];
3212     - if (oops_may_print())
3213     - printk(KERN_ALERT "*pde = ma %08lx pa %08lx\n", page,
3214     - machine_to_phys(page));
3215     + page = ((unsigned long *) __va(page))[address >> PGDIR_SHIFT];
3216     + printk(KERN_ALERT "*pde = ma %08lx pa %08lx\n", page,
3217     + machine_to_phys(page));
3218     /*
3219     * We must not directly access the pte in the highpte
3220     * case if the page table is located in highmem.
3221     * And lets rather not kmap-atomic the pte, just in case
3222     * it's allocated already.
3223     */
3224     -#ifdef CONFIG_HIGHPTE
3225     - if ((page >> PAGE_SHIFT) >= highstart_pfn)
3226     - return;
3227     -#endif
3228     - if ((page & 1) && oops_may_print()) {
3229     - page &= PAGE_MASK;
3230     - address &= 0x003ff000;
3231     - page = machine_to_phys(page);
3232     - page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT];
3233     + if ((machine_to_phys(page) >> PAGE_SHIFT) < max_low_pfn
3234     + && (page & _PAGE_PRESENT)) {
3235     + page = machine_to_phys(page & PAGE_MASK);
3236     + page = ((unsigned long *) __va(page))[(address >> PAGE_SHIFT)
3237     + & (PTRS_PER_PTE - 1)];
3238     printk(KERN_ALERT "*pte = ma %08lx pa %08lx\n", page,
3239     machine_to_phys(page));
3240     }
3241     @@ -581,6 +577,11 @@
3242     bad_area_nosemaphore:
3243     /* User mode accesses just cause a SIGSEGV */
3244     if (error_code & 4) {
3245     + /*
3246     + * It's possible to have interrupts off here.
3247     + */
3248     + local_irq_enable();
3249     +
3250     /*
3251     * Valid to do another page fault here because this one came
3252     * from user space.
3253     @@ -633,7 +634,7 @@
3254     bust_spinlocks(1);
3255    
3256     if (oops_may_print()) {
3257     - #ifdef CONFIG_X86_PAE
3258     +#ifdef CONFIG_X86_PAE
3259     if (error_code & 16) {
3260     pte_t *pte = lookup_address(address);
3261    
3262     @@ -642,7 +643,7 @@
3263     "NX-protected page - exploit attempt? "
3264     "(uid: %d)\n", current->uid);
3265     }
3266     - #endif
3267     +#endif
3268     if (address < PAGE_SIZE)
3269     printk(KERN_ALERT "BUG: unable to handle kernel NULL "
3270     "pointer dereference");
3271     @@ -652,8 +653,8 @@
3272     printk(" at virtual address %08lx\n",address);
3273     printk(KERN_ALERT " printing eip:\n");
3274     printk("%08lx\n", regs->eip);
3275     + dump_fault_path(address);
3276     }
3277     - dump_fault_path(address);
3278     tsk->thread.cr2 = address;
3279     tsk->thread.trap_no = 14;
3280     tsk->thread.error_code = error_code;
3281     @@ -694,7 +695,6 @@
3282     force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
3283     }
3284    
3285     -#if !HAVE_SHARED_KERNEL_PMD
3286     void vmalloc_sync_all(void)
3287     {
3288     /*
3289     @@ -710,6 +710,9 @@
3290     static unsigned long start = TASK_SIZE;
3291     unsigned long address;
3292    
3293     + if (SHARED_KERNEL_PMD)
3294     + return;
3295     +
3296     BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
3297     for (address = start;
3298     address >= TASK_SIZE && address < hypervisor_virt_start;
3299     @@ -739,4 +742,3 @@
3300     start = address + (1UL << PMD_SHIFT);
3301     }
3302     }
3303     -#endif
3304     --- a/arch/x86/mm/fault_64-xen.c
3305     +++ b/arch/x86/mm/fault_64-xen.c
3306     @@ -15,22 +15,22 @@
3307     #include <linux/mman.h>
3308     #include <linux/mm.h>
3309     #include <linux/smp.h>
3310     -#include <linux/smp_lock.h>
3311     #include <linux/interrupt.h>
3312     #include <linux/init.h>
3313     #include <linux/tty.h>
3314     #include <linux/vt_kern.h> /* For unblank_screen() */
3315     #include <linux/compiler.h>
3316     +#include <linux/vmalloc.h>
3317     #include <linux/module.h>
3318     #include <linux/kprobes.h>
3319     #include <linux/uaccess.h>
3320     +#include <linux/kdebug.h>
3321    
3322     #include <asm/system.h>
3323     #include <asm/pgalloc.h>
3324     #include <asm/smp.h>
3325     #include <asm/tlbflush.h>
3326     #include <asm/proto.h>
3327     -#include <asm/kdebug.h>
3328     #include <asm-generic/sections.h>
3329    
3330     /* Page fault error code bits */
3331     @@ -537,6 +537,12 @@
3332     bad_area_nosemaphore:
3333     /* User mode accesses just cause a SIGSEGV */
3334     if (error_code & PF_USER) {
3335     +
3336     + /*
3337     + * It's possible to have interrupts off here.
3338     + */
3339     + local_irq_enable();
3340     +
3341     if (is_prefetch(regs, address, error_code))
3342     return;
3343    
3344     @@ -646,7 +652,7 @@
3345     }
3346    
3347     DEFINE_SPINLOCK(pgd_lock);
3348     -struct page *pgd_list;
3349     +LIST_HEAD(pgd_list);
3350    
3351     void vmalloc_sync_all(void)
3352     {
3353     @@ -666,8 +672,7 @@
3354     if (pgd_none(*pgd_ref))
3355     continue;
3356     spin_lock(&pgd_lock);
3357     - for (page = pgd_list; page;
3358     - page = (struct page *)page->index) {
3359     + list_for_each_entry(page, &pgd_list, lru) {
3360     pgd_t *pgd;
3361     pgd = (pgd_t *)page_address(page) + pgd_index(address);
3362     if (pgd_none(*pgd))
3363     --- a/arch/x86/mm/highmem_32-xen.c
3364     +++ b/arch/x86/mm/highmem_32-xen.c
3365     @@ -26,7 +26,7 @@
3366     * However when holding an atomic kmap is is not legal to sleep, so atomic
3367     * kmaps are appropriate for short, tight code paths only.
3368     */
3369     -static void *__kmap_atomic(struct page *page, enum km_type type, pgprot_t prot)
3370     +void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
3371     {
3372     enum fixed_addresses idx;
3373     unsigned long vaddr;
3374     @@ -49,15 +49,7 @@
3375    
3376     void *kmap_atomic(struct page *page, enum km_type type)
3377     {
3378     - return __kmap_atomic(page, type, kmap_prot);
3379     -}
3380     -
3381     -/* Same as kmap_atomic but with PAGE_KERNEL_RO page protection. */
3382     -void *kmap_atomic_pte(struct page *page, enum km_type type)
3383     -{
3384     - return __kmap_atomic(page, type,
3385     - test_bit(PG_pinned, &page->flags)
3386     - ? PAGE_KERNEL_RO : kmap_prot);
3387     + return kmap_atomic_prot(page, type, kmap_prot);
3388     }
3389    
3390     void kunmap_atomic(void *kvaddr, enum km_type type)
3391     @@ -80,6 +72,7 @@
3392     #endif
3393     }
3394    
3395     + arch_flush_lazy_mmu_mode();
3396     pagefault_enable();
3397     }
3398    
3399     @@ -117,6 +110,5 @@
3400     EXPORT_SYMBOL(kmap);
3401     EXPORT_SYMBOL(kunmap);
3402     EXPORT_SYMBOL(kmap_atomic);
3403     -EXPORT_SYMBOL(kmap_atomic_pte);
3404     EXPORT_SYMBOL(kunmap_atomic);
3405     EXPORT_SYMBOL(kmap_atomic_to_page);
3406     --- a/arch/x86/mm/init_32-xen.c
3407     +++ b/arch/x86/mm/init_32-xen.c
3408     @@ -22,6 +22,7 @@
3409     #include <linux/init.h>
3410     #include <linux/highmem.h>
3411     #include <linux/pagemap.h>
3412     +#include <linux/pfn.h>
3413     #include <linux/poison.h>
3414     #include <linux/bootmem.h>
3415     #include <linux/slab.h>
3416     @@ -67,17 +68,19 @@
3417     pmd_t *pmd_table;
3418    
3419     #ifdef CONFIG_X86_PAE
3420     - pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
3421     - paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT);
3422     - make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
3423     - set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
3424     - pud = pud_offset(pgd, 0);
3425     - if (pmd_table != pmd_offset(pud, 0))
3426     - BUG();
3427     -#else
3428     + if (!(__pgd_val(*pgd) & _PAGE_PRESENT)) {
3429     + pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
3430     +
3431     + paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT);
3432     + make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
3433     + set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
3434     + pud = pud_offset(pgd, 0);
3435     + if (pmd_table != pmd_offset(pud, 0))
3436     + BUG();
3437     + }
3438     +#endif
3439     pud = pud_offset(pgd, 0);
3440     pmd_table = pmd_offset(pud, 0);
3441     -#endif
3442    
3443     return pmd_table;
3444     }
3445     @@ -88,16 +91,18 @@
3446     */
3447     static pte_t * __init one_page_table_init(pmd_t *pmd)
3448     {
3449     +#if CONFIG_XEN_COMPAT <= 0x030002
3450     if (pmd_none(*pmd)) {
3451     +#else
3452     + if (!(__pmd_val(*pmd) & _PAGE_PRESENT)) {
3453     +#endif
3454     pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
3455     +
3456     paravirt_alloc_pt(__pa(page_table) >> PAGE_SHIFT);
3457     make_lowmem_page_readonly(page_table,
3458     XENFEAT_writable_page_tables);
3459     set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
3460     - if (page_table != pte_offset_kernel(pmd, 0))
3461     - BUG();
3462     -
3463     - return page_table;
3464     + BUG_ON(page_table != pte_offset_kernel(pmd, 0));
3465     }
3466    
3467     return pte_offset_kernel(pmd, 0);
3468     @@ -117,7 +122,6 @@
3469     static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base)
3470     {
3471     pgd_t *pgd;
3472     - pud_t *pud;
3473     pmd_t *pmd;
3474     int pgd_idx, pmd_idx;
3475     unsigned long vaddr;
3476     @@ -128,12 +132,10 @@
3477     pgd = pgd_base + pgd_idx;
3478    
3479     for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
3480     - if (pgd_none(*pgd))
3481     - one_md_table_init(pgd);
3482     - pud = pud_offset(pgd, vaddr);
3483     - pmd = pmd_offset(pud, vaddr);
3484     + pmd = one_md_table_init(pgd);
3485     + pmd = pmd + pmd_index(vaddr);
3486     for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) {
3487     - if (vaddr < hypervisor_virt_start && pmd_none(*pmd))
3488     + if (vaddr < hypervisor_virt_start)
3489     one_page_table_init(pmd);
3490    
3491     vaddr += PMD_SIZE;
3492     @@ -196,24 +198,25 @@
3493     /* Map with big pages if possible, otherwise create normal page tables. */
3494     if (cpu_has_pse) {
3495     unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1;
3496     -
3497     if (is_kernel_text(address) || is_kernel_text(address2))
3498     set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
3499     else
3500     set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
3501     +
3502     pfn += PTRS_PER_PTE;
3503     } else {
3504     pte = one_page_table_init(pmd);
3505    
3506     - pte += pte_ofs;
3507     - for (; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) {
3508     - /* XEN: Only map initial RAM allocation. */
3509     - if ((pfn >= max_ram_pfn) || pte_present(*pte))
3510     - continue;
3511     - if (is_kernel_text(address))
3512     - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
3513     - else
3514     - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
3515     + for (pte += pte_ofs;
3516     + pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
3517     + pte++, pfn++, pte_ofs++, address += PAGE_SIZE) {
3518     + /* XEN: Only map initial RAM allocation. */
3519     + if ((pfn >= max_ram_pfn) || pte_present(*pte))
3520     + continue;
3521     + if (is_kernel_text(address))
3522     + set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
3523     + else
3524     + set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
3525     }
3526     pte_ofs = 0;
3527     }
3528     @@ -383,15 +386,44 @@
3529    
3530     pgd_t *swapper_pg_dir;
3531    
3532     +static void __init xen_pagetable_setup_start(pgd_t *base)
3533     +{
3534     +}
3535     +
3536     +static void __init xen_pagetable_setup_done(pgd_t *base)
3537     +{
3538     +}
3539     +
3540     +/*
3541     + * Build a proper pagetable for the kernel mappings. Up until this
3542     + * point, we've been running on some set of pagetables constructed by
3543     + * the boot process.
3544     + *
3545     + * If we're booting on native hardware, this will be a pagetable
3546     + * constructed in arch/i386/kernel/head.S, and not running in PAE mode
3547     + * (even if we'll end up running in PAE). The root of the pagetable
3548     + * will be swapper_pg_dir.
3549     + *
3550     + * If we're booting paravirtualized under a hypervisor, then there are
3551     + * more options: we may already be running PAE, and the pagetable may
3552     + * or may not be based in swapper_pg_dir. In any case,
3553     + * paravirt_pagetable_setup_start() will set up swapper_pg_dir
3554     + * appropriately for the rest of the initialization to work.
3555     + *
3556     + * In general, pagetable_init() assumes that the pagetable may already
3557     + * be partially populated, and so it avoids stomping on any existing
3558     + * mappings.
3559     + */
3560     static void __init pagetable_init (void)
3561     {
3562     - unsigned long vaddr;
3563     + unsigned long vaddr, end;
3564     pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base;
3565    
3566     + xen_pagetable_setup_start(pgd_base);
3567     +
3568     /* Enable PSE if available */
3569     - if (cpu_has_pse) {
3570     + if (cpu_has_pse)
3571     set_in_cr4(X86_CR4_PSE);
3572     - }
3573    
3574     /* Enable PGE if available */
3575     if (cpu_has_pge) {
3576     @@ -408,9 +440,12 @@
3577     * created - mappings will be set by set_fixmap():
3578     */
3579     vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
3580     - page_table_range_init(vaddr, hypervisor_virt_start, pgd_base);
3581     + end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
3582     + page_table_range_init(vaddr, end, pgd_base);
3583    
3584     permanent_kmaps_init(pgd_base);
3585     +
3586     + xen_pagetable_setup_done(pgd_base);
3587     }
3588    
3589     #if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_ACPI_SLEEP)
3590     @@ -757,34 +792,29 @@
3591     EXPORT_SYMBOL_GPL(remove_memory);
3592     #endif
3593    
3594     -struct kmem_cache *pgd_cache;
3595     struct kmem_cache *pmd_cache;
3596    
3597     void __init pgtable_cache_init(void)
3598     {
3599     + size_t pgd_size = PTRS_PER_PGD*sizeof(pgd_t);
3600     +
3601     if (PTRS_PER_PMD > 1) {
3602     pmd_cache = kmem_cache_create("pmd",
3603     PTRS_PER_PMD*sizeof(pmd_t),
3604     PTRS_PER_PMD*sizeof(pmd_t),
3605     - 0,
3606     + SLAB_PANIC,
3607     pmd_ctor,
3608     NULL);
3609     - if (!pmd_cache)
3610     - panic("pgtable_cache_init(): cannot create pmd cache");
3611     + if (!SHARED_KERNEL_PMD) {
3612     + /* If we're in PAE mode and have a non-shared
3613     + kernel pmd, then the pgd size must be a
3614     + page size. This is because the pgd_list
3615     + links through the page structure, so there
3616     + can only be one pgd per page for this to
3617     + work. */
3618     + pgd_size = PAGE_SIZE;
3619     + }
3620     }
3621     - pgd_cache = kmem_cache_create("pgd",
3622     -#ifndef CONFIG_XEN
3623     - PTRS_PER_PGD*sizeof(pgd_t),
3624     - PTRS_PER_PGD*sizeof(pgd_t),
3625     -#else
3626     - PAGE_SIZE,
3627     - PAGE_SIZE,
3628     -#endif
3629     - 0,
3630     - pgd_ctor,
3631     - PTRS_PER_PMD == 1 ? pgd_dtor : NULL);
3632     - if (!pgd_cache)
3633     - panic("pgtable_cache_init(): Cannot create pgd cache");
3634     }
3635    
3636     /*
3637     @@ -818,13 +848,26 @@
3638    
3639     void mark_rodata_ro(void)
3640     {
3641     - unsigned long addr = (unsigned long)__start_rodata;
3642     -
3643     - for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE)
3644     - change_page_attr(virt_to_page(addr), 1, PAGE_KERNEL_RO);
3645     + unsigned long start = PFN_ALIGN(_text);
3646     + unsigned long size = PFN_ALIGN(_etext) - start;
3647    
3648     - printk("Write protecting the kernel read-only data: %uk\n",
3649     - (__end_rodata - __start_rodata) >> 10);
3650     +#ifndef CONFIG_KPROBES
3651     +#ifdef CONFIG_HOTPLUG_CPU
3652     + /* It must still be possible to apply SMP alternatives. */
3653     + if (num_possible_cpus() <= 1)
3654     +#endif
3655     + {
3656     + change_page_attr(virt_to_page(start),
3657     + size >> PAGE_SHIFT, PAGE_KERNEL_RX);
3658     + printk("Write protecting the kernel text: %luk\n", size >> 10);
3659     + }
3660     +#endif
3661     + start += size;
3662     + size = (unsigned long)__end_rodata - start;
3663     + change_page_attr(virt_to_page(start),
3664     + size >> PAGE_SHIFT, PAGE_KERNEL_RO);
3665     + printk("Write protecting the kernel read-only data: %luk\n",
3666     + size >> 10);
3667    
3668     /*
3669     * change_page_attr() requires a global_flush_tlb() call after it.
3670     @@ -847,7 +890,7 @@
3671     free_page(addr);
3672     totalram_pages++;
3673     }
3674     - printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
3675     + printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
3676     }
3677    
3678     void free_initmem(void)
3679     --- a/arch/x86/mm/init_64-xen.c
3680     +++ b/arch/x86/mm/init_64-xen.c
3681     @@ -25,10 +25,12 @@
3682     #include <linux/bootmem.h>
3683     #include <linux/proc_fs.h>
3684     #include <linux/pci.h>
3685     +#include <linux/pfn.h>
3686     #include <linux/poison.h>
3687     #include <linux/dma-mapping.h>
3688     #include <linux/module.h>
3689     #include <linux/memory_hotplug.h>
3690     +#include <linux/nmi.h>
3691    
3692     #include <asm/processor.h>
3693     #include <asm/system.h>
3694     @@ -51,7 +53,7 @@
3695     #define Dprintk(x...)
3696     #endif
3697    
3698     -struct dma_mapping_ops* dma_ops;
3699     +const struct dma_mapping_ops* dma_ops;
3700     EXPORT_SYMBOL(dma_ops);
3701    
3702     #if CONFIG_XEN_COMPAT <= 0x030002
3703     @@ -189,6 +191,13 @@
3704    
3705     for_each_online_pgdat(pgdat) {
3706     for (i = 0; i < pgdat->node_spanned_pages; ++i) {
3707     + /* this loop can take a while with 256 GB and 4k pages
3708     + so update the NMI watchdog */
3709     + if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) {
3710     + touch_nmi_watchdog();
3711     + }
3712     + if (!pfn_valid(pgdat->node_start_pfn + i))
3713     + continue;
3714     page = pfn_to_page(pgdat->node_start_pfn + i);
3715     total++;
3716     if (PageReserved(page))
3717     @@ -350,7 +359,7 @@
3718     }
3719     }
3720    
3721     -unsigned long __initdata table_start, table_end;
3722     +unsigned long __meminitdata table_start, table_end;
3723    
3724     static __meminit void *alloc_static_page(unsigned long *phys)
3725     {
3726     @@ -367,7 +376,7 @@
3727     start_pfn++;
3728     memset((void *)va, 0, PAGE_SIZE);
3729     return (void *)va;
3730     -}
3731     +}
3732    
3733     #define PTE_SIZE PAGE_SIZE
3734    
3735     @@ -408,28 +417,46 @@
3736    
3737     #ifndef CONFIG_XEN
3738     /* Must run before zap_low_mappings */
3739     -__init void *early_ioremap(unsigned long addr, unsigned long size)
3740     +__meminit void *early_ioremap(unsigned long addr, unsigned long size)
3741     {
3742     - unsigned long map = round_down(addr, LARGE_PAGE_SIZE);
3743     -
3744     - /* actually usually some more */
3745     - if (size >= LARGE_PAGE_SIZE) {
3746     - return NULL;
3747     + unsigned long vaddr;
3748     + pmd_t *pmd, *last_pmd;
3749     + int i, pmds;
3750     +
3751     + pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
3752     + vaddr = __START_KERNEL_map;
3753     + pmd = level2_kernel_pgt;
3754     + last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
3755     + for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
3756     + for (i = 0; i < pmds; i++) {
3757     + if (pmd_present(pmd[i]))
3758     + goto next;
3759     + }
3760     + vaddr += addr & ~PMD_MASK;
3761     + addr &= PMD_MASK;
3762     + for (i = 0; i < pmds; i++, addr += PMD_SIZE)
3763     + set_pmd(pmd + i,__pmd(addr | _KERNPG_TABLE | _PAGE_PSE));
3764     + __flush_tlb();
3765     + return (void *)vaddr;
3766     + next:
3767     + ;
3768     }
3769     - set_pmd(temp_mappings[0].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
3770     - map += LARGE_PAGE_SIZE;
3771     - set_pmd(temp_mappings[1].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
3772     - __flush_tlb();
3773     - return temp_mappings[0].address + (addr & (LARGE_PAGE_SIZE-1));
3774     + printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
3775     + return NULL;
3776     }
3777    
3778     /* To avoid virtual aliases later */
3779     -__init void early_iounmap(void *addr, unsigned long size)
3780     +__meminit void early_iounmap(void *addr, unsigned long size)
3781     {
3782     - if ((void *)round_down((unsigned long)addr, LARGE_PAGE_SIZE) != temp_mappings[0].address)
3783     - printk("early_iounmap: bad address %p\n", addr);
3784     - set_pmd(temp_mappings[0].pmd, __pmd(0));
3785     - set_pmd(temp_mappings[1].pmd, __pmd(0));
3786     + unsigned long vaddr;
3787     + pmd_t *pmd;
3788     + int i, pmds;
3789     +
3790     + vaddr = (unsigned long)addr;
3791     + pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
3792     + pmd = level2_kernel_pgt + pmd_index(vaddr);
3793     + for (i = 0; i < pmds; i++)
3794     + pmd_clear(pmd + i);
3795     __flush_tlb();
3796     }
3797     #endif
3798     @@ -763,14 +790,6 @@
3799     __flush_tlb_all();
3800     }
3801    
3802     -void __cpuinit zap_low_mappings(int cpu)
3803     -{
3804     - /* this is not required for Xen */
3805     -#if 0
3806     - swap_low_mappings();
3807     -#endif
3808     -}
3809     -
3810     #ifndef CONFIG_NUMA
3811     void __init paging_init(void)
3812     {
3813     @@ -961,17 +980,6 @@
3814     reservedpages << (PAGE_SHIFT-10),
3815     datasize >> 10,
3816     initsize >> 10);
3817     -
3818     -#ifndef CONFIG_XEN
3819     -#ifdef CONFIG_SMP
3820     - /*
3821     - * Sync boot_level4_pgt mappings with the init_level4_pgt
3822     - * except for the low identity mappings which are already zapped
3823     - * in init_level4_pgt. This sync-up is essential for AP's bringup
3824     - */
3825     - memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t));
3826     -#endif
3827     -#endif
3828     }
3829    
3830     void free_init_pages(char *what, unsigned long begin, unsigned long end)
3831     @@ -981,7 +989,7 @@
3832     if (begin >= end)
3833     return;
3834    
3835     - printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
3836     + printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
3837     for (addr = begin; addr < end; addr += PAGE_SIZE) {
3838     ClearPageReserved(virt_to_page(addr));
3839     init_page_count(virt_to_page(addr));
3840     @@ -990,24 +998,17 @@
3841     if (addr >= __START_KERNEL_map) {
3842     /* make_readonly() reports all kernel addresses. */
3843     __make_page_writable(__va(__pa(addr)));
3844     - if (HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) {
3845     - pgd_t *pgd = pgd_offset_k(addr);
3846     - pud_t *pud = pud_offset(pgd, addr);
3847     - pmd_t *pmd = pmd_offset(pud, addr);
3848     - pte_t *pte = pte_offset_kernel(pmd, addr);
3849     -
3850     - xen_l1_entry_update(pte, __pte(0)); /* fallback */
3851     - }
3852     + change_page_attr_addr(addr, 1, __pgprot(0));
3853     }
3854     free_page(addr);
3855     totalram_pages++;
3856     }
3857     + if (addr > __START_KERNEL_map)
3858     + global_flush_tlb();
3859     }
3860    
3861     void free_initmem(void)
3862     {
3863     - memset(__initdata_begin, POISON_FREE_INITDATA,
3864     - __initdata_end - __initdata_begin);
3865     free_init_pages("unused kernel memory",
3866     (unsigned long)(&__init_begin),
3867     (unsigned long)(&__init_end));
3868     @@ -1017,13 +1018,28 @@
3869    
3870     void mark_rodata_ro(void)
3871     {
3872     - unsigned long addr = (unsigned long)__start_rodata;
3873     + unsigned long start = (unsigned long)_stext, end;
3874     +
3875     +#ifdef CONFIG_HOTPLUG_CPU
3876     + /* It must still be possible to apply SMP alternatives. */
3877     + if (num_possible_cpus() > 1)
3878     + start = (unsigned long)_etext;
3879     +#endif
3880     +
3881     +#ifdef CONFIG_KPROBES
3882     + start = (unsigned long)__start_rodata;
3883     +#endif
3884     +
3885     + end = (unsigned long)__end_rodata;
3886     + start = (start + PAGE_SIZE - 1) & PAGE_MASK;
3887     + end &= PAGE_MASK;
3888     + if (end <= start)
3889     + return;
3890    
3891     - for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE)
3892     - change_page_attr_addr(addr, 1, PAGE_KERNEL_RO);
3893     + change_page_attr_addr(start, (end - start) >> PAGE_SHIFT, PAGE_KERNEL_RO);
3894    
3895     - printk ("Write protecting the kernel read-only data: %luk\n",
3896     - (__end_rodata - __start_rodata) >> 10);
3897     + printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
3898     + (end - start) >> 10);
3899    
3900     /*
3901     * change_page_attr_addr() requires a global_flush_tlb() call after it.
3902     @@ -1176,3 +1192,11 @@
3903     {
3904     return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
3905     }
3906     +
3907     +#ifndef CONFIG_XEN
3908     +void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
3909     +{
3910     + return __alloc_bootmem_core(pgdat->bdata, size,
3911     + SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0);
3912     +}
3913     +#endif
3914     --- a/arch/x86/mm/ioremap_32-xen.c
3915     +++ b/arch/x86/mm/ioremap_32-xen.c
3916     @@ -13,6 +13,7 @@
3917     #include <linux/slab.h>
3918     #include <linux/module.h>
3919     #include <linux/io.h>
3920     +#include <linux/sched.h>
3921     #include <asm/fixmap.h>
3922     #include <asm/cacheflush.h>
3923     #include <asm/tlbflush.h>
3924     --- a/arch/x86/mm/pageattr_64-xen.c
3925     +++ b/arch/x86/mm/pageattr_64-xen.c
3926     @@ -215,13 +215,13 @@
3927     preempt_enable();
3928     }
3929    
3930     -void _arch_dup_mmap(struct mm_struct *mm)
3931     +void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
3932     {
3933     if (!mm->context.pinned)
3934     mm_pin(mm);
3935     }
3936    
3937     -void _arch_exit_mmap(struct mm_struct *mm)
3938     +void arch_exit_mmap(struct mm_struct *mm)
3939     {
3940     struct task_struct *tsk = current;
3941    
3942     @@ -337,10 +337,11 @@
3943     struct page *pg;
3944    
3945     /* When clflush is available always use it because it is
3946     - much cheaper than WBINVD */
3947     - if (!cpu_has_clflush)
3948     + much cheaper than WBINVD. Disable clflush for now because
3949     + the high level code is not ready yet */
3950     + if (1 || !cpu_has_clflush)
3951     asm volatile("wbinvd" ::: "memory");
3952     - list_for_each_entry(pg, l, lru) {
3953     + else list_for_each_entry(pg, l, lru) {
3954     void *adr = page_address(pg);
3955     if (cpu_has_clflush)
3956     cache_flush_page(adr);
3957     @@ -454,16 +455,24 @@
3958     */
3959     int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
3960     {
3961     - int err = 0;
3962     + int err = 0, kernel_map = 0;
3963     int i;
3964    
3965     + if (address >= __START_KERNEL_map
3966     + && address < __START_KERNEL_map + KERNEL_TEXT_SIZE) {
3967     + address = (unsigned long)__va(__pa(address));
3968     + kernel_map = 1;
3969     + }
3970     +
3971     down_write(&init_mm.mmap_sem);
3972     for (i = 0; i < numpages; i++, address += PAGE_SIZE) {
3973     unsigned long pfn = __pa(address) >> PAGE_SHIFT;
3974    
3975     - err = __change_page_attr(address, pfn, prot, PAGE_KERNEL);
3976     - if (err)
3977     - break;
3978     + if (!kernel_map || pte_present(pfn_pte(0, prot))) {
3979     + err = __change_page_attr(address, pfn, prot, PAGE_KERNEL);
3980     + if (err)
3981     + break;
3982     + }
3983     /* Handle kernel mapping too which aliases part of the
3984     * lowmem */
3985     if (__pa(address) < KERNEL_TEXT_SIZE) {
3986     --- a/arch/x86/mm/pgtable_32-xen.c
3987     +++ b/arch/x86/mm/pgtable_32-xen.c
3988     @@ -13,6 +13,7 @@
3989     #include <linux/pagemap.h>
3990     #include <linux/spinlock.h>
3991     #include <linux/module.h>
3992     +#include <linux/quicklist.h>
3993    
3994     #include <asm/system.h>
3995     #include <asm/pgtable.h>
3996     @@ -212,8 +213,6 @@
3997     * against pageattr.c; it is the unique case in which a valid change
3998     * of kernel pagetables can't be lazily synchronized by vmalloc faults.
3999     * vmalloc faults work because attached pagetables are never freed.
4000     - * The locking scheme was chosen on the basis of manfred's
4001     - * recommendations and having no core impact whatsoever.
4002     * -- wli
4003     */
4004     DEFINE_SPINLOCK(pgd_lock);
4005     @@ -239,37 +238,59 @@
4006     set_page_private(next, (unsigned long)pprev);
4007     }
4008    
4009     -void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused)
4010     +
4011     +
4012     +#if (PTRS_PER_PMD == 1)
4013     +/* Non-PAE pgd constructor */
4014     +void pgd_ctor(void *pgd)
4015     {
4016     unsigned long flags;
4017    
4018     - if (PTRS_PER_PMD > 1) {
4019     - if (HAVE_SHARED_KERNEL_PMD)
4020     - clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
4021     - swapper_pg_dir + USER_PTRS_PER_PGD,
4022     - KERNEL_PGD_PTRS);
4023     - } else {
4024     - spin_lock_irqsave(&pgd_lock, flags);
4025     + /* !PAE, no pagetable sharing */
4026     + memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
4027     +
4028     + spin_lock_irqsave(&pgd_lock, flags);
4029     +
4030     + /* must happen under lock */
4031     + clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
4032     + swapper_pg_dir + USER_PTRS_PER_PGD,
4033     + KERNEL_PGD_PTRS);
4034     +
4035     + paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
4036     + __pa(swapper_pg_dir) >> PAGE_SHIFT,
4037     + USER_PTRS_PER_PGD,
4038     + KERNEL_PGD_PTRS);
4039     + pgd_list_add(pgd);
4040     + spin_unlock_irqrestore(&pgd_lock, flags);
4041     +}
4042     +#else /* PTRS_PER_PMD > 1 */
4043     +/* PAE pgd constructor */
4044     +void pgd_ctor(void *pgd)
4045     +{
4046     + /* PAE, kernel PMD may be shared */
4047     +
4048     + if (SHARED_KERNEL_PMD) {
4049     clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
4050     swapper_pg_dir + USER_PTRS_PER_PGD,
4051     KERNEL_PGD_PTRS);
4052     - memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
4053     -
4054     - /* must happen under lock */
4055     - paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
4056     - __pa(swapper_pg_dir) >> PAGE_SHIFT,
4057     - USER_PTRS_PER_PGD, PTRS_PER_PGD - USER_PTRS_PER_PGD);
4058     + } else {
4059     + unsigned long flags;
4060    
4061     + memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
4062     + spin_lock_irqsave(&pgd_lock, flags);
4063     pgd_list_add(pgd);
4064     spin_unlock_irqrestore(&pgd_lock, flags);
4065     }
4066     }
4067     +#endif /* PTRS_PER_PMD */
4068    
4069     -/* never called when PTRS_PER_PMD > 1 */
4070     -void pgd_dtor(void *pgd, struct kmem_cache *cache, unsigned long unused)
4071     +void pgd_dtor(void *pgd)
4072     {
4073     unsigned long flags; /* can be called from interrupt context */
4074    
4075     + if (SHARED_KERNEL_PMD)
4076     + return;
4077     +
4078     paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT);
4079     spin_lock_irqsave(&pgd_lock, flags);
4080     pgd_list_del(pgd);
4081     @@ -278,11 +299,46 @@
4082     pgd_test_and_unpin(pgd);
4083     }
4084    
4085     +#define UNSHARED_PTRS_PER_PGD \
4086     + (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
4087     +
4088     +/* If we allocate a pmd for part of the kernel address space, then
4089     + make sure its initialized with the appropriate kernel mappings.
4090     + Otherwise use a cached zeroed pmd. */
4091     +static pmd_t *pmd_cache_alloc(int idx)
4092     +{
4093     + pmd_t *pmd;
4094     +
4095     + if (idx >= USER_PTRS_PER_PGD) {
4096     + pmd = (pmd_t *)__get_free_page(GFP_KERNEL);
4097     +
4098     +#ifndef CONFIG_XEN
4099     + if (pmd)
4100     + memcpy(pmd,
4101     + (void *)pgd_page_vaddr(swapper_pg_dir[idx]),
4102     + sizeof(pmd_t) * PTRS_PER_PMD);
4103     +#endif
4104     + } else
4105     + pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
4106     +
4107     + return pmd;
4108     +}
4109     +
4110     +static void pmd_cache_free(pmd_t *pmd, int idx)
4111     +{
4112     + if (idx >= USER_PTRS_PER_PGD) {
4113     + make_lowmem_page_writable(pmd, XENFEAT_writable_page_tables);
4114     + memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
4115     + free_page((unsigned long)pmd);
4116     + } else
4117     + kmem_cache_free(pmd_cache, pmd);
4118     +}
4119     +
4120     pgd_t *pgd_alloc(struct mm_struct *mm)
4121     {
4122     int i;
4123     - pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
4124     - pmd_t **pmd;
4125     + pgd_t *pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor);
4126     + pmd_t **pmds = NULL;
4127     unsigned long flags;
4128    
4129     pgd_test_and_unpin(pgd);
4130     @@ -290,37 +346,40 @@
4131     if (PTRS_PER_PMD == 1 || !pgd)
4132     return pgd;
4133    
4134     - if (HAVE_SHARED_KERNEL_PMD) {
4135     - for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
4136     - pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
4137     - if (!pmd)
4138     - goto out_oom;
4139     - paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT);
4140     - set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
4141     +#ifdef CONFIG_XEN
4142     + if (!SHARED_KERNEL_PMD) {
4143     + /*
4144     + * We can race save/restore (if we sleep during a GFP_KERNEL memory
4145     + * allocation). We therefore store virtual addresses of pmds as they
4146     + * do not change across save/restore, and poke the machine addresses
4147     + * into the pgdir under the pgd_lock.
4148     + */
4149     + pmds = kmalloc(PTRS_PER_PGD * sizeof(pmd_t *), GFP_KERNEL);
4150     + if (!pmds) {
4151     + quicklist_free(0, pgd_dtor, pgd);
4152     + return NULL;
4153     }
4154     - return pgd;
4155     - }
4156     -
4157     - /*
4158     - * We can race save/restore (if we sleep during a GFP_KERNEL memory
4159     - * allocation). We therefore store virtual addresses of pmds as they
4160     - * do not change across save/restore, and poke the machine addresses
4161     - * into the pgdir under the pgd_lock.
4162     - */
4163     - pmd = kmalloc(PTRS_PER_PGD * sizeof(pmd_t *), GFP_KERNEL);
4164     - if (!pmd) {
4165     - kmem_cache_free(pgd_cache, pgd);
4166     - return NULL;
4167     }
4168     +#endif
4169    
4170     /* Allocate pmds, remember virtual addresses. */
4171     - for (i = 0; i < PTRS_PER_PGD; ++i) {
4172     - pmd[i] = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
4173     - if (!pmd[i])
4174     + for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
4175     + pmd_t *pmd = pmd_cache_alloc(i);
4176     +
4177     + if (!pmd)
4178     goto out_oom;
4179     +
4180     paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT);
4181     + if (pmds)
4182     + pmds[i] = pmd;
4183     + else
4184     + set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
4185     }
4186    
4187     +#ifdef CONFIG_XEN
4188     + if (SHARED_KERNEL_PMD)
4189     + return pgd;
4190     +
4191     spin_lock_irqsave(&pgd_lock, flags);
4192    
4193     /* Protect against save/restore: move below 4GB under pgd_lock. */
4194     @@ -335,44 +394,40 @@
4195    
4196     /* Copy kernel pmd contents and write-protect the new pmds. */
4197     for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
4198     - unsigned long v = (unsigned long)i << PGDIR_SHIFT;
4199     - pgd_t *kpgd = pgd_offset_k(v);
4200     - pud_t *kpud = pud_offset(kpgd, v);
4201     - pmd_t *kpmd = pmd_offset(kpud, v);
4202     - memcpy(pmd[i], kpmd, PAGE_SIZE);
4203     + memcpy(pmds[i],
4204     + (void *)pgd_page_vaddr(swapper_pg_dir[i]),
4205     + sizeof(pmd_t) * PTRS_PER_PMD);
4206     make_lowmem_page_readonly(
4207     - pmd[i], XENFEAT_writable_page_tables);
4208     + pmds[i], XENFEAT_writable_page_tables);
4209     }
4210    
4211     /* It is safe to poke machine addresses of pmds under the pmd_lock. */
4212     for (i = 0; i < PTRS_PER_PGD; i++)
4213     - set_pgd(&pgd[i], __pgd(1 + __pa(pmd[i])));
4214     -
4215     - /* Ensure this pgd gets picked up and pinned on save/restore. */
4216     - pgd_list_add(pgd);
4217     + set_pgd(&pgd[i], __pgd(1 + __pa(pmds[i])));
4218    
4219     spin_unlock_irqrestore(&pgd_lock, flags);
4220    
4221     - kfree(pmd);
4222     + kfree(pmds);
4223     +#endif
4224    
4225     return pgd;
4226    
4227     out_oom:
4228     - if (HAVE_SHARED_KERNEL_PMD) {
4229     + if (!pmds) {
4230     for (i--; i >= 0; i--) {
4231     pgd_t pgdent = pgd[i];
4232     void* pmd = (void *)__va(pgd_val(pgdent)-1);
4233     paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
4234     - kmem_cache_free(pmd_cache, pmd);
4235     + pmd_cache_free(pmd, i);
4236     }
4237     } else {
4238     for (i--; i >= 0; i--) {
4239     - paravirt_release_pd(__pa(pmd[i]) >> PAGE_SHIFT);
4240     - kmem_cache_free(pmd_cache, pmd[i]);
4241     + paravirt_release_pd(__pa(pmds[i]) >> PAGE_SHIFT);
4242     + pmd_cache_free(pmds[i], i);
4243     }
4244     - kfree(pmd);
4245     + kfree(pmds);
4246     }
4247     - kmem_cache_free(pgd_cache, pgd);
4248     + quicklist_free(0, pgd_dtor, pgd);
4249     return NULL;
4250     }
4251    
4252     @@ -392,35 +447,24 @@
4253    
4254     /* in the PAE case user pgd entries are overwritten before usage */
4255     if (PTRS_PER_PMD > 1) {
4256     - for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
4257     + for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
4258     pgd_t pgdent = pgd[i];
4259     void* pmd = (void *)__va(pgd_val(pgdent)-1);
4260     paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
4261     - kmem_cache_free(pmd_cache, pmd);
4262     + pmd_cache_free(pmd, i);
4263     }
4264    
4265     - if (!HAVE_SHARED_KERNEL_PMD) {
4266     - unsigned long flags;
4267     - spin_lock_irqsave(&pgd_lock, flags);
4268     - pgd_list_del(pgd);
4269     - spin_unlock_irqrestore(&pgd_lock, flags);
4270     -
4271     - for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
4272     - pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
4273     - make_lowmem_page_writable(
4274     - pmd, XENFEAT_writable_page_tables);
4275     - memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
4276     - kmem_cache_free(pmd_cache, pmd);
4277     - }
4278     -
4279     - if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
4280     - xen_destroy_contiguous_region(
4281     - (unsigned long)pgd, 0);
4282     - }
4283     + if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
4284     + xen_destroy_contiguous_region((unsigned long)pgd, 0);
4285     }
4286    
4287     /* in the non-PAE case, free_pgtables() clears user pgd entries */
4288     - kmem_cache_free(pgd_cache, pgd);
4289     + quicklist_free(0, pgd_dtor, pgd);
4290     +}
4291     +
4292     +void check_pgt_cache(void)
4293     +{
4294     + quicklist_trim(0, pgd_dtor, 25, 16);
4295     }
4296    
4297     void make_lowmem_page_readonly(void *va, unsigned int feature)
4298     @@ -717,13 +761,13 @@
4299     spin_unlock_irqrestore(&pgd_lock, flags);
4300     }
4301    
4302     -void _arch_dup_mmap(struct mm_struct *mm)
4303     +void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
4304     {
4305     if (!test_bit(PG_pinned, &virt_to_page(mm->pgd)->flags))
4306     mm_pin(mm);
4307     }
4308    
4309     -void _arch_exit_mmap(struct mm_struct *mm)
4310     +void arch_exit_mmap(struct mm_struct *mm)
4311     {
4312     struct task_struct *tsk = current;
4313    
4314     --- a/drivers/char/tpm/tpm_xen.c
4315     +++ b/drivers/char/tpm/tpm_xen.c
4316     @@ -463,7 +463,7 @@
4317     tp->backend_id = domid;
4318    
4319     err = bind_listening_port_to_irqhandler(
4320     - domid, tpmif_int, SA_SAMPLE_RANDOM, "tpmif", tp);
4321     + domid, tpmif_int, IRQF_SAMPLE_RANDOM, "tpmif", tp);
4322     if (err <= 0) {
4323     WPRINTK("bind_listening_port_to_irqhandler failed "
4324     "(err=%d)\n", err);
4325     --- a/drivers/xen/blkfront/blkfront.c
4326     +++ b/drivers/xen/blkfront/blkfront.c
4327     @@ -236,7 +236,7 @@
4328     info->ring_ref = err;
4329    
4330     err = bind_listening_port_to_irqhandler(
4331     - dev->otherend_id, blkif_int, SA_SAMPLE_RANDOM, "blkif", info);
4332     + dev->otherend_id, blkif_int, IRQF_SAMPLE_RANDOM, "blkif", info);
4333     if (err <= 0) {
4334     xenbus_dev_fatal(dev, err,
4335     "bind_listening_port_to_irqhandler");
4336     --- a/drivers/xen/char/mem.c
4337     +++ b/drivers/xen/char/mem.c
4338     @@ -18,7 +18,6 @@
4339     #include <linux/raw.h>
4340     #include <linux/tty.h>
4341     #include <linux/capability.h>
4342     -#include <linux/smp_lock.h>
4343     #include <linux/ptrace.h>
4344     #include <linux/device.h>
4345     #include <asm/pgalloc.h>
4346     --- a/drivers/xen/core/hypervisor_sysfs.c
4347     +++ b/drivers/xen/core/hypervisor_sysfs.c
4348     @@ -50,7 +50,7 @@
4349     if (!is_running_on_xen())
4350     return -ENODEV;
4351    
4352     - hypervisor_subsys.kset.kobj.ktype = &hyp_sysfs_kobj_type;
4353     + hypervisor_subsys.kobj.ktype = &hyp_sysfs_kobj_type;
4354     return 0;
4355     }
4356    
4357     --- a/drivers/xen/core/smpboot.c
4358     +++ b/drivers/xen/core/smpboot.c
4359     @@ -121,7 +121,7 @@
4360     rc = bind_ipi_to_irqhandler(RESCHEDULE_VECTOR,
4361     cpu,
4362     smp_reschedule_interrupt,
4363     - SA_INTERRUPT,
4364     + IRQF_DISABLED,
4365     resched_name[cpu],
4366     NULL);
4367     if (rc < 0)
4368     @@ -132,7 +132,7 @@
4369     rc = bind_ipi_to_irqhandler(CALL_FUNCTION_VECTOR,
4370     cpu,
4371     smp_call_function_interrupt,
4372     - SA_INTERRUPT,
4373     + IRQF_DISABLED,
4374     callfunc_name[cpu],
4375     NULL);
4376     if (rc < 0)
4377     @@ -165,13 +165,12 @@
4378    
4379     void __cpuinit cpu_bringup(void)
4380     {
4381     + cpu_init();
4382     #ifdef __i386__
4383     - cpu_set_gdt(current_thread_info()->cpu);
4384     - secondary_cpu_init();
4385     + identify_secondary_cpu(cpu_data + smp_processor_id());
4386     #else
4387     - cpu_init();
4388     -#endif
4389     identify_cpu(cpu_data + smp_processor_id());
4390     +#endif
4391     touch_softlockup_watchdog();
4392     preempt_disable();
4393     local_irq_enable();
4394     @@ -191,11 +190,6 @@
4395     static DEFINE_SPINLOCK(ctxt_lock);
4396    
4397     struct task_struct *idle = idle_task(cpu);
4398     -#ifdef __x86_64__
4399     - struct desc_ptr *gdt_descr = &cpu_gdt_descr[cpu];
4400     -#else
4401     - struct Xgt_desc_struct *gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
4402     -#endif
4403    
4404     if (cpu_test_and_set(cpu, cpu_initialized_map))
4405     return;
4406     @@ -218,11 +212,11 @@
4407     smp_trap_init(ctxt.trap_ctxt);
4408    
4409     ctxt.ldt_ents = 0;
4410     -
4411     - ctxt.gdt_frames[0] = virt_to_mfn(gdt_descr->address);
4412     - ctxt.gdt_ents = gdt_descr->size / 8;
4413     + ctxt.gdt_ents = GDT_SIZE / 8;
4414    
4415     #ifdef __i386__
4416     + ctxt.gdt_frames[0] = virt_to_mfn(get_cpu_gdt_table(cpu));
4417     +
4418     ctxt.user_regs.cs = __KERNEL_CS;
4419     ctxt.user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs);
4420    
4421     @@ -235,7 +229,11 @@
4422     ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
4423    
4424     ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
4425     +
4426     + ctxt.user_regs.fs = __KERNEL_PERCPU;
4427     #else /* __x86_64__ */
4428     + ctxt.gdt_frames[0] = virt_to_mfn(cpu_gdt_descr[cpu].address);
4429     +
4430     ctxt.user_regs.cs = __KERNEL_CS;
4431     ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs);
4432    
4433     @@ -265,9 +263,8 @@
4434     struct vcpu_get_physid cpu_id;
4435     #ifdef __x86_64__
4436     struct desc_ptr *gdt_descr;
4437     -#else
4438     - struct Xgt_desc_struct *gdt_descr;
4439     #endif
4440     + void *gdt_addr;
4441    
4442     apicid = 0;
4443     if (HYPERVISOR_vcpu_op(VCPUOP_get_physid, 0, &cpu_id) == 0)
4444     @@ -317,14 +314,12 @@
4445     }
4446     gdt_descr->size = GDT_SIZE;
4447     memcpy((void *)gdt_descr->address, cpu_gdt_table, GDT_SIZE);
4448     + gdt_addr = (void *)gdt_descr->address;
4449     #else
4450     - if (unlikely(!init_gdt(cpu, idle)))
4451     - continue;
4452     - gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
4453     + init_gdt(cpu);
4454     + gdt_addr = get_cpu_gdt_table(cpu);
4455     #endif
4456     - make_page_readonly(
4457     - (void *)gdt_descr->address,
4458     - XENFEAT_writable_descriptor_tables);
4459     + make_page_readonly(gdt_addr, XENFEAT_writable_descriptor_tables);
4460    
4461     apicid = cpu;
4462     if (HYPERVISOR_vcpu_op(VCPUOP_get_physid, cpu, &cpu_id) == 0)
4463     @@ -338,7 +333,9 @@
4464     #ifdef __x86_64__
4465     cpu_pda(cpu)->pcurrent = idle;
4466     cpu_pda(cpu)->cpunumber = cpu;
4467     - clear_ti_thread_flag(idle->thread_info, TIF_FORK);
4468     + clear_ti_thread_flag(task_thread_info(idle), TIF_FORK);
4469     +#else
4470     + per_cpu(current_task, cpu) = idle;
4471     #endif
4472    
4473     irq_ctx_init(cpu);
4474     @@ -363,8 +360,12 @@
4475     #endif
4476     }
4477    
4478     -void __devinit smp_prepare_boot_cpu(void)
4479     +void __init smp_prepare_boot_cpu(void)
4480     {
4481     +#ifdef __i386__
4482     + init_gdt(smp_processor_id());
4483     + switch_to_new_gdt();
4484     +#endif
4485     prefill_possible_map();
4486     }
4487    
4488     --- a/drivers/xen/core/xen_sysfs.c
4489     +++ b/drivers/xen/core/xen_sysfs.c
4490     @@ -28,12 +28,12 @@
4491    
4492     static int __init xen_sysfs_type_init(void)
4493     {
4494     - return sysfs_create_file(&hypervisor_subsys.kset.kobj, &type_attr.attr);
4495     + return sysfs_create_file(&hypervisor_subsys.kobj, &type_attr.attr);
4496     }
4497    
4498     static void xen_sysfs_type_destroy(void)
4499     {
4500     - sysfs_remove_file(&hypervisor_subsys.kset.kobj, &type_attr.attr);
4501     + sysfs_remove_file(&hypervisor_subsys.kobj, &type_attr.attr);
4502     }
4503    
4504     /* xen version attributes */
4505     @@ -89,13 +89,13 @@
4506    
4507     static int __init xen_sysfs_version_init(void)
4508     {
4509     - return sysfs_create_group(&hypervisor_subsys.kset.kobj,
4510     + return sysfs_create_group(&hypervisor_subsys.kobj,
4511     &version_group);
4512     }
4513    
4514     static void xen_sysfs_version_destroy(void)
4515     {
4516     - sysfs_remove_group(&hypervisor_subsys.kset.kobj, &version_group);
4517     + sysfs_remove_group(&hypervisor_subsys.kobj, &version_group);
4518     }
4519    
4520     /* UUID */
4521     @@ -125,12 +125,12 @@
4522    
4523     static int __init xen_sysfs_uuid_init(void)
4524     {
4525     - return sysfs_create_file(&hypervisor_subsys.kset.kobj, &uuid_attr.attr);
4526     + return sysfs_create_file(&hypervisor_subsys.kobj, &uuid_attr.attr);
4527     }
4528    
4529     static void xen_sysfs_uuid_destroy(void)
4530     {
4531     - sysfs_remove_file(&hypervisor_subsys.kset.kobj, &uuid_attr.attr);
4532     + sysfs_remove_file(&hypervisor_subsys.kobj, &uuid_attr.attr);
4533     }
4534    
4535     /* xen compilation attributes */
4536     @@ -203,13 +203,13 @@
4537    
4538     int __init static xen_compilation_init(void)
4539     {
4540     - return sysfs_create_group(&hypervisor_subsys.kset.kobj,
4541     + return sysfs_create_group(&hypervisor_subsys.kobj,
4542     &xen_compilation_group);
4543     }
4544    
4545     static void xen_compilation_destroy(void)
4546     {
4547     - sysfs_remove_group(&hypervisor_subsys.kset.kobj,
4548     + sysfs_remove_group(&hypervisor_subsys.kobj,
4549     &xen_compilation_group);
4550     }
4551    
4552     @@ -324,13 +324,13 @@
4553    
4554     static int __init xen_properties_init(void)
4555     {
4556     - return sysfs_create_group(&hypervisor_subsys.kset.kobj,
4557     + return sysfs_create_group(&hypervisor_subsys.kobj,
4558     &xen_properties_group);
4559     }
4560    
4561     static void xen_properties_destroy(void)
4562     {
4563     - sysfs_remove_group(&hypervisor_subsys.kset.kobj,
4564     + sysfs_remove_group(&hypervisor_subsys.kobj,
4565     &xen_properties_group);
4566     }
4567    
4568     --- a/drivers/xen/netback/netback.c
4569     +++ b/drivers/xen/netback/netback.c
4570     @@ -180,7 +180,7 @@
4571     goto err;
4572    
4573     skb_reserve(nskb, 16 + NET_IP_ALIGN);
4574     - headlen = nskb->end - nskb->data;
4575     + headlen = skb_end_pointer(nskb) - nskb->data;
4576     if (headlen > skb_headlen(skb))
4577     headlen = skb_headlen(skb);
4578     ret = skb_copy_bits(skb, 0, __skb_put(nskb, headlen), headlen);
4579     @@ -226,11 +226,15 @@
4580     len -= copy;
4581     }
4582    
4583     +#ifdef NET_SKBUFF_DATA_USES_OFFSET
4584     + offset = 0;
4585     +#else
4586     offset = nskb->data - skb->data;
4587     +#endif
4588    
4589     - nskb->h.raw = skb->h.raw + offset;
4590     - nskb->nh.raw = skb->nh.raw + offset;
4591     - nskb->mac.raw = skb->mac.raw + offset;
4592     + nskb->transport_header = skb->transport_header + offset;
4593     + nskb->network_header = skb->network_header + offset;
4594     + nskb->mac_header = skb->mac_header + offset;
4595    
4596     return nskb;
4597    
4598     @@ -1601,7 +1605,7 @@
4599     (void)bind_virq_to_irqhandler(VIRQ_DEBUG,
4600     0,
4601     netif_be_dbg,
4602     - SA_SHIRQ,
4603     + IRQF_SHARED,
4604     "net-be-dbg",
4605     &netif_be_dbg);
4606     #endif
4607     --- a/drivers/xen/netfront/netfront.c
4608     +++ b/drivers/xen/netfront/netfront.c
4609     @@ -513,7 +513,7 @@
4610     memcpy(netdev->dev_addr, info->mac, ETH_ALEN);
4611    
4612     err = bind_listening_port_to_irqhandler(
4613     - dev->otherend_id, netif_int, SA_SAMPLE_RANDOM, netdev->name,
4614     + dev->otherend_id, netif_int, IRQF_SAMPLE_RANDOM, netdev->name,
4615     netdev);
4616     if (err < 0)
4617     goto fail;
4618     --- a/drivers/xen/pciback/xenbus.c
4619     +++ b/drivers/xen/pciback/xenbus.c
4620     @@ -86,7 +86,7 @@
4621    
4622     err = bind_interdomain_evtchn_to_irqhandler(
4623     pdev->xdev->otherend_id, remote_evtchn, pciback_handle_event,
4624     - SA_SAMPLE_RANDOM, "pciback", pdev);
4625     + IRQF_SAMPLE_RANDOM, "pciback", pdev);
4626     if (err < 0) {
4627     xenbus_dev_fatal(pdev->xdev, err,
4628     "Error binding event channel to IRQ");
4629     --- a/drivers/xen/pcifront/xenbus.c
4630     +++ b/drivers/xen/pcifront/xenbus.c
4631     @@ -10,10 +10,6 @@
4632     #include <xen/gnttab.h>
4633     #include "pcifront.h"
4634    
4635     -#ifndef __init_refok
4636     -#define __init_refok
4637     -#endif
4638     -
4639     #define INVALID_GRANT_REF (0)
4640     #define INVALID_EVTCHN (-1)
4641    
4642     --- a/drivers/xen/sfc_netback/accel_fwd.c
4643     +++ b/drivers/xen/sfc_netback/accel_fwd.c
4644     @@ -308,7 +308,7 @@
4645     static inline int packet_is_arp_reply(struct sk_buff *skb)
4646     {
4647     return skb->protocol == ntohs(ETH_P_ARP)
4648     - && skb->nh.arph->ar_op == ntohs(ARPOP_REPLY);
4649     + && arp_hdr(skb)->ar_op == ntohs(ARPOP_REPLY);
4650     }
4651    
4652    
4653     @@ -392,12 +392,13 @@
4654    
4655     BUG_ON(fwd_priv == NULL);
4656    
4657     - if (is_broadcast_ether_addr(skb->mac.raw) && packet_is_arp_reply(skb)) {
4658     + if (is_broadcast_ether_addr(skb_mac_header(skb))
4659     + && packet_is_arp_reply(skb)) {
4660     /*
4661     * update our fast path forwarding to reflect this
4662     * gratuitous ARP
4663     */
4664     - mac = skb->mac.raw+ETH_ALEN;
4665     + mac = skb_mac_header(skb)+ETH_ALEN;
4666    
4667     DPRINTK("%s: found gratuitous ARP for " MAC_FMT "\n",
4668     __FUNCTION__, MAC_ARG(mac));
4669     --- a/drivers/xen/sfc_netback/accel_solarflare.c
4670     +++ b/drivers/xen/sfc_netback/accel_solarflare.c
4671     @@ -114,7 +114,7 @@
4672     BUG_ON(port == NULL);
4673    
4674     NETBACK_ACCEL_STATS_OP(global_stats.dl_tx_packets++);
4675     - if (skb->mac.raw != NULL)
4676     + if (skb_mac_header_was_set(skb))
4677     netback_accel_tx_packet(skb, port->fwd_priv);
4678     else {
4679     DPRINTK("Ignoring packet with missing mac address\n");
4680     --- a/drivers/xen/sfc_netfront/accel_tso.c
4681     +++ b/drivers/xen/sfc_netfront/accel_tso.c
4682     @@ -33,10 +33,9 @@
4683    
4684     #include "accel_tso.h"
4685    
4686     -#define PTR_DIFF(p1, p2) ((u8*)(p1) - (u8*)(p2))
4687     -#define ETH_HDR_LEN(skb) ((skb)->nh.raw - (skb)->data)
4688     -#define SKB_TCP_OFF(skb) PTR_DIFF ((skb)->h.th, (skb)->data)
4689     -#define SKB_IP_OFF(skb) PTR_DIFF ((skb)->nh.iph, (skb)->data)
4690     +#define ETH_HDR_LEN(skb) skb_network_offset(skb)
4691     +#define SKB_TCP_OFF(skb) skb_transport_offset(skb)
4692     +#define SKB_IP_OFF(skb) skb_network_offset(skb)
4693    
4694     /*
4695     * Set a maximum number of buffers in each output packet to make life
4696     @@ -114,9 +113,8 @@
4697     static inline void tso_check_safe(struct sk_buff *skb) {
4698     EPRINTK_ON(skb->protocol != htons (ETH_P_IP));
4699     EPRINTK_ON(((struct ethhdr*) skb->data)->h_proto != htons (ETH_P_IP));
4700     - EPRINTK_ON(skb->nh.iph->protocol != IPPROTO_TCP);
4701     - EPRINTK_ON((SKB_TCP_OFF(skb)
4702     - + (skb->h.th->doff << 2u)) > skb_headlen(skb));
4703     + EPRINTK_ON(ip_hdr(skb)->protocol != IPPROTO_TCP);
4704     + EPRINTK_ON((SKB_TCP_OFF(skb) + tcp_hdrlen(skb)) > skb_headlen(skb));
4705     }
4706    
4707    
4708     @@ -129,17 +127,17 @@
4709     * All ethernet/IP/TCP headers combined size is TCP header size
4710     * plus offset of TCP header relative to start of packet.
4711     */
4712     - st->p.header_length = (skb->h.th->doff << 2u) + SKB_TCP_OFF(skb);
4713     + st->p.header_length = tcp_hdrlen(skb) + SKB_TCP_OFF(skb);
4714     st->p.full_packet_size = (st->p.header_length
4715     + skb_shinfo(skb)->gso_size);
4716     st->p.gso_size = skb_shinfo(skb)->gso_size;
4717    
4718     - st->p.ip_id = htons(skb->nh.iph->id);
4719     - st->seqnum = ntohl(skb->h.th->seq);
4720     + st->p.ip_id = htons(ip_hdr(skb)->id);
4721     + st->seqnum = ntohl(tcp_hdr(skb)->seq);
4722    
4723     - EPRINTK_ON(skb->h.th->urg);
4724     - EPRINTK_ON(skb->h.th->syn);
4725     - EPRINTK_ON(skb->h.th->rst);
4726     + EPRINTK_ON(tcp_hdr(skb)->urg);
4727     + EPRINTK_ON(tcp_hdr(skb)->syn);
4728     + EPRINTK_ON(tcp_hdr(skb)->rst);
4729    
4730     st->remaining_len = skb->len - st->p.header_length;
4731    
4732     @@ -258,8 +256,8 @@
4733     /* This packet will be the last in the TSO burst. */
4734     ip_length = (st->p.header_length - ETH_HDR_LEN(skb)
4735     + st->remaining_len);
4736     - tsoh_th->fin = skb->h.th->fin;
4737     - tsoh_th->psh = skb->h.th->psh;
4738     + tsoh_th->fin = tcp_hdr(skb)->fin;
4739     + tsoh_th->psh = tcp_hdr(skb)->psh;
4740     }
4741    
4742     tsoh_iph->tot_len = htons(ip_length);
4743     --- a/drivers/xen/sfc_netfront/accel_vi.c
4744     +++ b/drivers/xen/sfc_netfront/accel_vi.c
4745     @@ -463,7 +463,7 @@
4746    
4747     if (skb->ip_summed == CHECKSUM_PARTIAL) {
4748     /* Set to zero to encourage falcon to work it out for us */
4749     - *(u16*)(skb->h.raw + skb->csum_offset) = 0;
4750     + *(u16*)(skb->head + skb->csum_start + skb->csum_offset) = 0;
4751     }
4752    
4753     if (multi_post_start_new_buffer(vnic, &state)) {
4754     @@ -582,7 +582,7 @@
4755    
4756     if (skb->ip_summed == CHECKSUM_PARTIAL) {
4757     /* Set to zero to encourage falcon to work it out for us */
4758     - *(u16*)(skb->h.raw + skb->csum_offset) = 0;
4759     + *(u16*)(skb->head + skb->csum_start + skb->csum_offset) = 0;
4760     }
4761     NETFRONT_ACCEL_PKTBUFF_FOR_EACH_FRAGMENT
4762     (skb, idx, frag_data, frag_len, {
4763     --- a/drivers/xen/sfc_netfront/accel_xenbus.c
4764     +++ b/drivers/xen/sfc_netfront/accel_xenbus.c
4765     @@ -356,7 +356,7 @@
4766     /* Create xenbus msg event channel */
4767     err = bind_listening_port_to_irqhandler
4768     (dev->otherend_id, netfront_accel_msg_channel_irq_from_bend,
4769     - SA_SAMPLE_RANDOM, "vnicctrl", vnic);
4770     + IRQF_SAMPLE_RANDOM, "vnicctrl", vnic);
4771     if (err < 0) {
4772     EPRINTK("Couldn't bind msg event channel\n");
4773     goto fail_msg_irq;
4774     @@ -367,7 +367,7 @@
4775     /* Create xenbus net event channel */
4776     err = bind_listening_port_to_irqhandler
4777     (dev->otherend_id, netfront_accel_net_channel_irq_from_bend,
4778     - SA_SAMPLE_RANDOM, "vnicfront", vnic);
4779     + IRQF_SAMPLE_RANDOM, "vnicfront", vnic);
4780     if (err < 0) {
4781     EPRINTK("Couldn't bind net event channel\n");
4782     goto fail_net_irq;
4783     --- a/drivers/xen/xenoprof/xenoprofile.c
4784     +++ b/drivers/xen/xenoprof/xenoprofile.c
4785     @@ -236,7 +236,7 @@
4786     result = bind_virq_to_irqhandler(VIRQ_XENOPROF,
4787     i,
4788     xenoprof_ovf_interrupt,
4789     - SA_INTERRUPT,
4790     + IRQF_DISABLED,
4791     "xenoprof",
4792     NULL);
4793    
4794     --- a/fs/aio.c
4795     +++ b/fs/aio.c
4796     @@ -38,7 +38,7 @@
4797    
4798     #ifdef CONFIG_EPOLL
4799     #include <linux/poll.h>
4800     -#include <linux/eventpoll.h>
4801     +#include <linux/anon_inodes.h>
4802     #endif
4803    
4804     #if DEBUG > 1
4805     @@ -1308,7 +1308,7 @@
4806    
4807     /* make_aio_fd:
4808     * Create a file descriptor that can be used to poll the event queue.
4809     - * Based and piggybacked on the excellent epoll code.
4810     + * Based on the excellent epoll code.
4811     */
4812    
4813     static int make_aio_fd(struct kioctx *ioctx)
4814     @@ -1317,7 +1317,8 @@
4815     struct inode *inode;
4816     struct file *file;
4817    
4818     - error = ep_getfd(&fd, &inode, &file, NULL, &aioq_fops);
4819     + error = anon_inode_getfd(&fd, &inode, &file, "[aioq]",
4820     + &aioq_fops, ioctx);
4821     if (error)
4822     return error;
4823    
4824     --- a/include/asm-x86/mach-xen/asm/desc_32.h
4825     +++ b/include/asm-x86/mach-xen/asm/desc_32.h
4826     @@ -11,23 +11,24 @@
4827    
4828     #include <asm/mmu.h>
4829    
4830     -extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
4831     -
4832     struct Xgt_desc_struct {
4833     unsigned short size;
4834     unsigned long address __attribute__((packed));
4835     unsigned short pad;
4836     } __attribute__ ((packed));
4837    
4838     -extern struct Xgt_desc_struct idt_descr;
4839     -DECLARE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
4840     -extern struct Xgt_desc_struct early_gdt_descr;
4841     +struct gdt_page
4842     +{
4843     + struct desc_struct gdt[GDT_ENTRIES];
4844     +} __attribute__((aligned(PAGE_SIZE)));
4845     +DECLARE_PER_CPU(struct gdt_page, gdt_page);
4846    
4847     static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
4848     {
4849     - return (struct desc_struct *)per_cpu(cpu_gdt_descr, cpu).address;
4850     + return per_cpu(gdt_page, cpu).gdt;
4851     }
4852    
4853     +extern struct Xgt_desc_struct idt_descr;
4854     extern struct desc_struct idt_table[];
4855     extern void set_intr_gate(unsigned int irq, void * addr);
4856    
4857     @@ -55,53 +56,32 @@
4858     #define DESCTYPE_S 0x10 /* !system */
4859    
4860     #ifndef CONFIG_XEN
4861     -#define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8))
4862     -
4863     -#define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
4864     -#define load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr))
4865     +#define load_TR_desc() native_load_tr_desc()
4866     +#define load_gdt(dtr) native_load_gdt(dtr)
4867     +#define load_idt(dtr) native_load_idt(dtr)
4868     #define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr))
4869     #define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt))
4870    
4871     -#define store_gdt(dtr) __asm__ ("sgdt %0":"=m" (*dtr))
4872     -#define store_idt(dtr) __asm__ ("sidt %0":"=m" (*dtr))
4873     -#define store_tr(tr) __asm__ ("str %0":"=m" (tr))
4874     +#define store_gdt(dtr) native_store_gdt(dtr)
4875     +#define store_idt(dtr) native_store_idt(dtr)
4876     +#define store_tr(tr) (tr = native_store_tr())
4877     #define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt))
4878     -#endif
4879    
4880     -#if TLS_SIZE != 24
4881     -# error update this code.
4882     -#endif
4883     -
4884     -static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
4885     -{
4886     -#define C(i) if (HYPERVISOR_update_descriptor(virt_to_machine(&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]), \
4887     - *(u64 *)&t->tls_array[i]) \
4888     - BUG()
4889     - C(0); C(1); C(2);
4890     -#undef C
4891     -}
4892     +#define load_TLS(t, cpu) native_load_tls(t, cpu)
4893     +#define set_ldt native_set_ldt
4894    
4895     -#ifndef CONFIG_XEN
4896     #define write_ldt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
4897     #define write_gdt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
4898     #define write_idt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
4899    
4900     -static inline void write_dt_entry(void *dt, int entry, __u32 entry_a, __u32 entry_b)
4901     +static inline void write_dt_entry(struct desc_struct *dt,
4902     + int entry, u32 entry_low, u32 entry_high)
4903     {
4904     - __u32 *lp = (__u32 *)((char *)dt + entry*8);
4905     - *lp = entry_a;
4906     - *(lp+1) = entry_b;
4907     + dt[entry].a = entry_low;
4908     + dt[entry].b = entry_high;
4909     }
4910     -#define set_ldt native_set_ldt
4911     -#else
4912     -extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b);
4913     -extern int write_gdt_entry(void *gdt, int entry, __u32 entry_a, __u32 entry_b);
4914     -#define set_ldt xen_set_ldt
4915     -#endif
4916    
4917     -#ifndef CONFIG_XEN
4918     -static inline fastcall void native_set_ldt(const void *addr,
4919     - unsigned int entries)
4920     +static inline void native_set_ldt(const void *addr, unsigned int entries)
4921     {
4922     if (likely(entries == 0))
4923     __asm__ __volatile__("lldt %w0"::"q" (0));
4924     @@ -116,6 +96,65 @@
4925     __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
4926     }
4927     }
4928     +
4929     +
4930     +static inline void native_load_tr_desc(void)
4931     +{
4932     + asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
4933     +}
4934     +
4935     +static inline void native_load_gdt(const struct Xgt_desc_struct *dtr)
4936     +{
4937     + asm volatile("lgdt %0"::"m" (*dtr));
4938     +}
4939     +
4940     +static inline void native_load_idt(const struct Xgt_desc_struct *dtr)
4941     +{
4942     + asm volatile("lidt %0"::"m" (*dtr));
4943     +}
4944     +
4945     +static inline void native_store_gdt(struct Xgt_desc_struct *dtr)
4946     +{
4947     + asm ("sgdt %0":"=m" (*dtr));
4948     +}
4949     +
4950     +static inline void native_store_idt(struct Xgt_desc_struct *dtr)
4951     +{
4952     + asm ("sidt %0":"=m" (*dtr));
4953     +}
4954     +
4955     +static inline unsigned long native_store_tr(void)
4956     +{
4957     + unsigned long tr;
4958     + asm ("str %0":"=r" (tr));
4959     + return tr;
4960     +}
4961     +
4962     +static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
4963     +{
4964     + unsigned int i;
4965     + struct desc_struct *gdt = get_cpu_gdt_table(cpu);
4966     +
4967     + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
4968     + gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
4969     +}
4970     +#else
4971     +#define load_TLS(t, cpu) xen_load_tls(t, cpu)
4972     +#define set_ldt xen_set_ldt
4973     +
4974     +extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b);
4975     +extern int write_gdt_entry(void *gdt, int entry, __u32 entry_a, __u32 entry_b);
4976     +
4977     +static inline void xen_load_tls(struct thread_struct *t, unsigned int cpu)
4978     +{
4979     + unsigned int i;
4980     + struct desc_struct *gdt = get_cpu_gdt_table(cpu) + GDT_ENTRY_TLS_MIN;
4981     +
4982     + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
4983     + if (HYPERVISOR_update_descriptor(virt_to_machine(&gdt[i]),
4984     + *(u64 *)&t->tls_array[i]))
4985     + BUG();
4986     +}
4987     #endif
4988    
4989     #ifndef CONFIG_X86_NO_IDT
4990     --- a/include/asm-x86/mach-xen/asm/desc_64.h
4991     +++ b/include/asm-x86/mach-xen/asm/desc_64.h
4992     @@ -127,16 +127,6 @@
4993     DESC_LDT, size * 8 - 1);
4994     }
4995    
4996     -static inline void set_seg_base(unsigned cpu, int entry, void *base)
4997     -{
4998     - struct desc_struct *d = &cpu_gdt(cpu)[entry];
4999     - u32 addr = (u32)(u64)base;
5000     - BUG_ON((u64)base >> 32);
5001     - d->base0 = addr & 0xffff;
5002     - d->base1 = (addr >> 16) & 0xff;
5003     - d->base2 = (addr >> 24) & 0xff;
5004     -}
5005     -
5006     #define LDT_entry_a(info) \
5007     ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
5008     /* Don't allow setting of the lm bit. It is useless anyways because
5009     @@ -165,25 +155,15 @@
5010     (info)->useable == 0 && \
5011     (info)->lm == 0)
5012    
5013     -#if TLS_SIZE != 24
5014     -# error update this code.
5015     -#endif
5016     -
5017     static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
5018     {
5019     -#if 0
5020     + unsigned int i;
5021     u64 *gdt = (u64 *)(cpu_gdt(cpu) + GDT_ENTRY_TLS_MIN);
5022     - gdt[0] = t->tls_array[0];
5023     - gdt[1] = t->tls_array[1];
5024     - gdt[2] = t->tls_array[2];
5025     -#endif
5026     -#define C(i) \
5027     - if (HYPERVISOR_update_descriptor(virt_to_machine(&cpu_gdt(cpu)[GDT_ENTRY_TLS_MIN + i]), \
5028     - t->tls_array[i])) \
5029     - BUG();
5030    
5031     - C(0); C(1); C(2);
5032     -#undef C
5033     + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
5034     + if (HYPERVISOR_update_descriptor(virt_to_machine(&gdt[i]),
5035     + t->tls_array[i]))
5036     + BUG();
5037     }
5038    
5039     /*
5040     --- a/include/asm-x86/mach-xen/asm/dma-mapping_64.h
5041     +++ b/include/asm-x86/mach-xen/asm/dma-mapping_64.h
5042     @@ -51,7 +51,7 @@
5043     };
5044    
5045     extern dma_addr_t bad_dma_address;
5046     -extern struct dma_mapping_ops* dma_ops;
5047     +extern const struct dma_mapping_ops* dma_ops;
5048     extern int iommu_merge;
5049    
5050     #if 0
5051     --- a/include/asm-x86/mach-xen/asm/fixmap_32.h
5052     +++ b/include/asm-x86/mach-xen/asm/fixmap_32.h
5053     @@ -19,10 +19,8 @@
5054     * the start of the fixmap.
5055     */
5056     extern unsigned long __FIXADDR_TOP;
5057     -#ifdef CONFIG_COMPAT_VDSO
5058     -#define FIXADDR_USER_START __fix_to_virt(FIX_VDSO)
5059     -#define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1)
5060     -#endif
5061     +#define FIXADDR_USER_START __fix_to_virt(FIX_VDSO)
5062     +#define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1)
5063    
5064     #ifndef __ASSEMBLY__
5065     #include <linux/kernel.h>
5066     @@ -85,6 +83,9 @@
5067     #ifdef CONFIG_PCI_MMCONFIG
5068     FIX_PCIE_MCFG,
5069     #endif
5070     +#ifdef CONFIG_PARAVIRT
5071     + FIX_PARAVIRT_BOOTMAP,
5072     +#endif
5073     FIX_SHARED_INFO,
5074     #define NR_FIX_ISAMAPS 256
5075     FIX_ISAMAP_END,
5076     --- a/include/asm-x86/mach-xen/asm/fixmap_64.h
5077     +++ b/include/asm-x86/mach-xen/asm/fixmap_64.h
5078     @@ -15,7 +15,6 @@
5079     #include <asm/apicdef.h>
5080     #include <asm/page.h>
5081     #include <asm/vsyscall.h>
5082     -#include <asm/vsyscall32.h>
5083     #include <asm/acpi.h>
5084    
5085     /*
5086     --- a/include/asm-x86/mach-xen/asm/highmem.h
5087     +++ b/include/asm-x86/mach-xen/asm/highmem.h
5088     @@ -67,12 +67,18 @@
5089    
5090     void *kmap(struct page *page);
5091     void kunmap(struct page *page);
5092     +void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot);
5093     void *kmap_atomic(struct page *page, enum km_type type);
5094     void *kmap_atomic_pte(struct page *page, enum km_type type);
5095     void kunmap_atomic(void *kvaddr, enum km_type type);
5096     void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
5097     struct page *kmap_atomic_to_page(void *ptr);
5098    
5099     +#define kmap_atomic_pte(page, type) \
5100     + kmap_atomic_prot(page, type, \
5101     + test_bit(PG_pinned, &(page)->flags) \
5102     + ? PAGE_KERNEL_RO : kmap_prot)
5103     +
5104     #define flush_cache_kmaps() do { } while (0)
5105    
5106     #endif /* __KERNEL__ */
5107     --- a/include/asm-x86/mach-xen/asm/io_32.h
5108     +++ b/include/asm-x86/mach-xen/asm/io_32.h
5109     @@ -263,15 +263,18 @@
5110    
5111     #endif /* __KERNEL__ */
5112    
5113     -#define __SLOW_DOWN_IO "outb %%al,$0x80;"
5114     +static inline void xen_io_delay(void)
5115     +{
5116     + asm volatile("outb %%al,$0x80" : : : "memory");
5117     +}
5118    
5119     static inline void slow_down_io(void) {
5120     - __asm__ __volatile__(
5121     - __SLOW_DOWN_IO
5122     + xen_io_delay();
5123     #ifdef REALLY_SLOW_IO
5124     - __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO
5125     + xen_io_delay();
5126     + xen_io_delay();
5127     + xen_io_delay();
5128     #endif
5129     - : : );
5130     }
5131    
5132     #ifdef CONFIG_X86_NUMAQ
5133     --- a/include/asm-x86/mach-xen/asm/irqflags_32.h
5134     +++ b/include/asm-x86/mach-xen/asm/irqflags_32.h
5135     @@ -11,6 +11,43 @@
5136     #define _ASM_IRQFLAGS_H
5137    
5138     #ifndef __ASSEMBLY__
5139     +#define xen_save_fl(void) (current_vcpu_info()->evtchn_upcall_mask)
5140     +
5141     +#define xen_restore_fl(f) \
5142     +do { \
5143     + vcpu_info_t *_vcpu; \
5144     + barrier(); \
5145     + _vcpu = current_vcpu_info(); \
5146     + if ((_vcpu->evtchn_upcall_mask = (f)) == 0) { \
5147     + barrier(); /* unmask then check (avoid races) */\
5148     + if (unlikely(_vcpu->evtchn_upcall_pending)) \
5149     + force_evtchn_callback(); \
5150     + } \
5151     +} while (0)
5152     +
5153     +#define xen_irq_disable() \
5154     +do { \
5155     + current_vcpu_info()->evtchn_upcall_mask = 1; \
5156     + barrier(); \
5157     +} while (0)
5158     +
5159     +#define xen_irq_enable() \
5160     +do { \
5161     + vcpu_info_t *_vcpu; \
5162     + barrier(); \
5163     + _vcpu = current_vcpu_info(); \
5164     + _vcpu->evtchn_upcall_mask = 0; \
5165     + barrier(); /* unmask then check (avoid races) */ \
5166     + if (unlikely(_vcpu->evtchn_upcall_pending)) \
5167     + force_evtchn_callback(); \
5168     +} while (0)
5169     +
5170     +void xen_safe_halt(void);
5171     +
5172     +void xen_halt(void);
5173     +#endif /* __ASSEMBLY__ */
5174     +
5175     +#ifndef __ASSEMBLY__
5176    
5177     /*
5178     * The use of 'barrier' in the following reflects their use as local-lock
5179     @@ -20,48 +57,31 @@
5180     * includes these barriers, for example.
5181     */
5182    
5183     -#define __raw_local_save_flags() (current_vcpu_info()->evtchn_upcall_mask)
5184     +#define __raw_local_save_flags(void) xen_save_fl()
5185    
5186     -#define raw_local_irq_restore(x) \
5187     -do { \
5188     - vcpu_info_t *_vcpu; \
5189     - barrier(); \
5190     - _vcpu = current_vcpu_info(); \
5191     - if ((_vcpu->evtchn_upcall_mask = (x)) == 0) { \
5192     - barrier(); /* unmask then check (avoid races) */ \
5193     - if (unlikely(_vcpu->evtchn_upcall_pending)) \
5194     - force_evtchn_callback(); \
5195     - } \
5196     -} while (0)
5197     +#define raw_local_irq_restore(flags) xen_restore_fl(flags)
5198    
5199     -#define raw_local_irq_disable() \
5200     -do { \
5201     - current_vcpu_info()->evtchn_upcall_mask = 1; \
5202     - barrier(); \
5203     -} while (0)
5204     +#define raw_local_irq_disable() xen_irq_disable()
5205    
5206     -#define raw_local_irq_enable() \
5207     -do { \
5208     - vcpu_info_t *_vcpu; \
5209     - barrier(); \
5210     - _vcpu = current_vcpu_info(); \
5211     - _vcpu->evtchn_upcall_mask = 0; \
5212     - barrier(); /* unmask then check (avoid races) */ \
5213     - if (unlikely(_vcpu->evtchn_upcall_pending)) \
5214     - force_evtchn_callback(); \
5215     -} while (0)
5216     +#define raw_local_irq_enable() xen_irq_enable()
5217    
5218     /*
5219     * Used in the idle loop; sti takes one instruction cycle
5220     * to complete:
5221     */
5222     -void raw_safe_halt(void);
5223     +static inline void raw_safe_halt(void)
5224     +{
5225     + xen_safe_halt();
5226     +}
5227    
5228     /*
5229     * Used when interrupts are already enabled or to
5230     * shutdown the processor:
5231     */
5232     -void halt(void);
5233     +static inline void halt(void)
5234     +{
5235     + xen_halt();
5236     +}
5237    
5238     /*
5239     * For spinlocks, etc:
5240     --- a/include/asm-x86/mach-xen/asm/irqflags_64.h
5241     +++ b/include/asm-x86/mach-xen/asm/irqflags_64.h
5242     @@ -9,6 +9,7 @@
5243     */
5244     #ifndef _ASM_IRQFLAGS_H
5245     #define _ASM_IRQFLAGS_H
5246     +#include <asm/processor-flags.h>
5247    
5248     #ifndef __ASSEMBLY__
5249     /*
5250     @@ -50,19 +51,19 @@
5251     {
5252     unsigned long flags = __raw_local_save_flags();
5253    
5254     - raw_local_irq_restore((flags & ~(1 << 9)) | (1 << 18));
5255     + raw_local_irq_restore((flags & ~X86_EFLAGS_IF) | X86_EFLAGS_AC);
5256     }
5257    
5258     static inline void raw_local_irq_enable(void)
5259     {
5260     unsigned long flags = __raw_local_save_flags();
5261    
5262     - raw_local_irq_restore((flags | (1 << 9)) & ~(1 << 18));
5263     + raw_local_irq_restore((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC));
5264     }
5265    
5266     static inline int raw_irqs_disabled_flags(unsigned long flags)
5267     {
5268     - return !(flags & (1<<9)) || (flags & (1 << 18));
5269     + return !(flags & X86_EFLAGS_IF) || (flags & X86_EFLAGS_AC);
5270     }
5271    
5272     #else /* CONFIG_X86_VSMP */
5273     @@ -118,13 +119,21 @@
5274     * Used in the idle loop; sti takes one instruction cycle
5275     * to complete:
5276     */
5277     -void raw_safe_halt(void);
5278     +void xen_safe_halt(void);
5279     +static inline void raw_safe_halt(void)
5280     +{
5281     + xen_safe_halt();
5282     +}
5283    
5284     /*
5285     * Used when interrupts are already enabled or to
5286     * shutdown the processor:
5287     */
5288     -void halt(void);
5289     +void xen_halt(void);
5290     +static inline void halt(void)
5291     +{
5292     + xen_halt();
5293     +}
5294    
5295     #else /* __ASSEMBLY__: */
5296     # ifdef CONFIG_TRACE_IRQFLAGS
5297     --- a/include/asm-x86/mach-xen/asm/mmu.h
5298     +++ b/include/asm-x86/mach-xen/asm/mmu.h
5299     @@ -18,12 +18,4 @@
5300     #endif
5301     } mm_context_t;
5302    
5303     -/* mm/memory.c:exit_mmap hook */
5304     -extern void _arch_exit_mmap(struct mm_struct *mm);
5305     -#define arch_exit_mmap(_mm) _arch_exit_mmap(_mm)
5306     -
5307     -/* kernel/fork.c:dup_mmap hook */
5308     -extern void _arch_dup_mmap(struct mm_struct *mm);
5309     -#define arch_dup_mmap(mm, oldmm) ((void)(oldmm), _arch_dup_mmap(mm))
5310     -
5311     #endif
5312     --- a/include/asm-x86/mach-xen/asm/mmu_64.h
5313     +++ b/include/asm-x86/mach-xen/asm/mmu_64.h
5314     @@ -25,14 +25,6 @@
5315     #ifdef CONFIG_XEN
5316     extern struct list_head mm_unpinned;
5317     extern spinlock_t mm_unpinned_lock;
5318     -
5319     -/* mm/memory.c:exit_mmap hook */
5320     -extern void _arch_exit_mmap(struct mm_struct *mm);
5321     -#define arch_exit_mmap(_mm) _arch_exit_mmap(_mm)
5322     -
5323     -/* kernel/fork.c:dup_mmap hook */
5324     -extern void _arch_dup_mmap(struct mm_struct *mm);
5325     -#define arch_dup_mmap(mm, oldmm) ((void)(oldmm), _arch_dup_mmap(mm))
5326     #endif
5327    
5328     #endif
5329     --- a/include/asm-x86/mach-xen/asm/mmu_context_32.h
5330     +++ b/include/asm-x86/mach-xen/asm/mmu_context_32.h
5331     @@ -6,6 +6,20 @@
5332     #include <asm/pgalloc.h>
5333     #include <asm/tlbflush.h>
5334    
5335     +void arch_exit_mmap(struct mm_struct *mm);
5336     +void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
5337     +
5338     +void mm_pin(struct mm_struct *mm);
5339     +void mm_unpin(struct mm_struct *mm);
5340     +void mm_pin_all(void);
5341     +
5342     +static inline void xen_activate_mm(struct mm_struct *prev,
5343     + struct mm_struct *next)
5344     +{
5345     + if (!test_bit(PG_pinned, &virt_to_page(next->pgd)->flags))
5346     + mm_pin(next);
5347     +}
5348     +
5349     /*
5350     * Used for LDT copy/destruction.
5351     */
5352     @@ -37,10 +51,6 @@
5353     : : "r" (0) );
5354     }
5355    
5356     -extern void mm_pin(struct mm_struct *mm);
5357     -extern void mm_unpin(struct mm_struct *mm);
5358     -void mm_pin_all(void);
5359     -
5360     static inline void switch_mm(struct mm_struct *prev,
5361     struct mm_struct *next,
5362     struct task_struct *tsk)
5363     @@ -97,11 +107,10 @@
5364     #define deactivate_mm(tsk, mm) \
5365     asm("movl %0,%%gs": :"r" (0));
5366    
5367     -static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
5368     -{
5369     - if (!test_bit(PG_pinned, &virt_to_page(next->pgd)->flags))
5370     - mm_pin(next);
5371     - switch_mm(prev, next, NULL);
5372     -}
5373     +#define activate_mm(prev, next) \
5374     + do { \
5375     + xen_activate_mm(prev, next); \
5376     + switch_mm((prev),(next),NULL); \
5377     + } while(0)
5378    
5379     #endif
5380     --- a/include/asm-x86/mach-xen/asm/mmu_context_64.h
5381     +++ b/include/asm-x86/mach-xen/asm/mmu_context_64.h
5382     @@ -9,6 +9,9 @@
5383     #include <asm/pgtable.h>
5384     #include <asm/tlbflush.h>
5385    
5386     +void arch_exit_mmap(struct mm_struct *mm);
5387     +void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
5388     +
5389     /*
5390     * possibly do the LDT unload here?
5391     */
5392     --- a/include/asm-x86/mach-xen/asm/page_64.h
5393     +++ b/include/asm-x86/mach-xen/asm/page_64.h
5394     @@ -7,6 +7,7 @@
5395     #include <linux/types.h>
5396     #include <asm/bug.h>
5397     #endif
5398     +#include <linux/const.h>
5399     #include <xen/interface/xen.h>
5400    
5401     /*
5402     @@ -19,18 +20,14 @@
5403    
5404     /* PAGE_SHIFT determines the page size */
5405     #define PAGE_SHIFT 12
5406     -#ifdef __ASSEMBLY__
5407     -#define PAGE_SIZE (0x1 << PAGE_SHIFT)
5408     -#else
5409     -#define PAGE_SIZE (1UL << PAGE_SHIFT)
5410     -#endif
5411     +#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
5412     #define PAGE_MASK (~(PAGE_SIZE-1))
5413    
5414     /* See Documentation/x86_64/mm.txt for a description of the memory map. */
5415     #define __PHYSICAL_MASK_SHIFT 46
5416     -#define __PHYSICAL_MASK ((1UL << __PHYSICAL_MASK_SHIFT) - 1)
5417     +#define __PHYSICAL_MASK ((_AC(1,UL) << __PHYSICAL_MASK_SHIFT) - 1)
5418     #define __VIRTUAL_MASK_SHIFT 48
5419     -#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
5420     +#define __VIRTUAL_MASK ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - 1)
5421    
5422     #define PHYSICAL_PAGE_MASK (~(PAGE_SIZE-1) & __PHYSICAL_MASK)
5423    
5424     @@ -55,10 +52,10 @@
5425     #define N_EXCEPTION_STACKS 5 /* hw limit: 7 */
5426    
5427     #define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1))
5428     -#define LARGE_PAGE_SIZE (1UL << PMD_SHIFT)
5429     +#define LARGE_PAGE_SIZE (_AC(1,UL) << PMD_SHIFT)
5430    
5431     #define HPAGE_SHIFT PMD_SHIFT
5432     -#define HPAGE_SIZE ((1UL) << HPAGE_SHIFT)
5433     +#define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT)
5434     #define HPAGE_MASK (~(HPAGE_SIZE - 1))
5435     #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
5436    
5437     @@ -152,17 +149,23 @@
5438    
5439     #define __pgprot(x) ((pgprot_t) { (x) } )
5440    
5441     -#define __PHYSICAL_START ((unsigned long)CONFIG_PHYSICAL_START)
5442     -#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
5443     -#define __START_KERNEL_map 0xffffffff80000000UL
5444     -#define __PAGE_OFFSET 0xffff880000000000UL
5445     +#endif /* !__ASSEMBLY__ */
5446    
5447     -#else
5448     #define __PHYSICAL_START CONFIG_PHYSICAL_START
5449     +#define __KERNEL_ALIGN 0x200000
5450     +
5451     +/*
5452     + * Make sure kernel is aligned to 2MB address. Catching it at compile
5453     + * time is better. Change your config file and compile the kernel
5454     + * for a 2MB aligned address (CONFIG_PHYSICAL_START)
5455     + */
5456     +#if (CONFIG_PHYSICAL_START % __KERNEL_ALIGN) != 0
5457     +#error "CONFIG_PHYSICAL_START must be a multiple of 2MB"
5458     +#endif
5459     +
5460     #define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
5461     -#define __START_KERNEL_map 0xffffffff80000000
5462     -#define __PAGE_OFFSET 0xffff880000000000
5463     -#endif /* !__ASSEMBLY__ */
5464     +#define __START_KERNEL_map _AC(0xffffffff80000000, UL)
5465     +#define __PAGE_OFFSET _AC(0xffff880000000000, UL)
5466    
5467     #if CONFIG_XEN_COMPAT <= 0x030002
5468     #undef LOAD_OFFSET
5469     @@ -172,20 +175,20 @@
5470     /* to align the pointer to the (next) page boundary */
5471     #define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
5472    
5473     -#define KERNEL_TEXT_SIZE (40UL*1024*1024)
5474     -#define KERNEL_TEXT_START 0xffffffff80000000UL
5475     +#define KERNEL_TEXT_SIZE (40*1024*1024)
5476     +#define KERNEL_TEXT_START _AC(0xffffffff80000000, UL)
5477     +
5478     +#define PAGE_OFFSET __PAGE_OFFSET
5479    
5480     -#define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET)
5481     +#ifndef __ASSEMBLY__
5482     +static inline unsigned long __phys_addr(unsigned long x)
5483     +{
5484     + return x - (x >= __START_KERNEL_map ? __START_KERNEL_map : PAGE_OFFSET);
5485     +}
5486     +#endif
5487    
5488     -/* Note: __pa(&symbol_visible_to_c) should be always replaced with __pa_symbol.
5489     - Otherwise you risk miscompilation. */
5490     -#define __pa(x) (((unsigned long)(x)>=__START_KERNEL_map)?(unsigned long)(x) - (unsigned long)__START_KERNEL_map:(unsigned long)(x) - PAGE_OFFSET)
5491     -/* __pa_symbol should be used for C visible symbols.
5492     - This seems to be the official gcc blessed way to do such arithmetic. */
5493     -#define __pa_symbol(x) \
5494     - ({unsigned long v; \
5495     - asm("" : "=r" (v) : "0" (x)); \
5496     - __pa(v); })
5497     +#define __pa(x) __phys_addr((unsigned long)(x))
5498     +#define __pa_symbol(x) __phys_addr((unsigned long)(x))
5499    
5500     #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
5501     #define __boot_va(x) __va(x)
5502     --- a/include/asm-x86/mach-xen/asm/pgalloc_32.h
5503     +++ b/include/asm-x86/mach-xen/asm/pgalloc_32.h
5504     @@ -1,7 +1,6 @@
5505     #ifndef _I386_PGALLOC_H
5506     #define _I386_PGALLOC_H
5507    
5508     -#include <asm/fixmap.h>
5509     #include <linux/threads.h>
5510     #include <linux/mm.h> /* for struct page */
5511     #include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
5512     @@ -69,6 +68,4 @@
5513     #define pud_populate(mm, pmd, pte) BUG()
5514     #endif
5515    
5516     -#define check_pgt_cache() do { } while (0)
5517     -
5518     #endif /* _I386_PGALLOC_H */
5519     --- a/include/asm-x86/mach-xen/asm/pgalloc_64.h
5520     +++ b/include/asm-x86/mach-xen/asm/pgalloc_64.h
5521     @@ -1,7 +1,6 @@
5522     #ifndef _X86_64_PGALLOC_H
5523     #define _X86_64_PGALLOC_H
5524    
5525     -#include <asm/fixmap.h>
5526     #include <asm/pda.h>
5527     #include <linux/threads.h>
5528     #include <linux/mm.h>
5529     @@ -100,24 +99,16 @@
5530     struct page *page = virt_to_page(pgd);
5531    
5532     spin_lock(&pgd_lock);
5533     - page->index = (pgoff_t)pgd_list;
5534     - if (pgd_list)
5535     - pgd_list->private = (unsigned long)&page->index;
5536     - pgd_list = page;
5537     - page->private = (unsigned long)&pgd_list;
5538     + list_add(&page->lru, &pgd_list);
5539     spin_unlock(&pgd_lock);
5540     }
5541    
5542     static inline void pgd_list_del(pgd_t *pgd)
5543     {
5544     - struct page *next, **pprev, *page = virt_to_page(pgd);
5545     + struct page *page = virt_to_page(pgd);
5546    
5547     spin_lock(&pgd_lock);
5548     - next = (struct page *)page->index;
5549     - pprev = (struct page **)page->private;
5550     - *pprev = next;
5551     - if (next)
5552     - next->private = (unsigned long)pprev;
5553     + list_del(&page->lru);
5554     spin_unlock(&pgd_lock);
5555     }
5556    
5557     --- a/include/asm-x86/mach-xen/asm/pgtable-2level.h
5558     +++ b/include/asm-x86/mach-xen/asm/pgtable-2level.h
5559     @@ -13,22 +13,43 @@
5560     * within a page table are directly modified. Thus, the following
5561     * hook is made available.
5562     */
5563     -#define set_pte(pteptr, pteval) (*(pteptr) = pteval)
5564     -
5565     -#define set_pte_at(_mm,addr,ptep,pteval) do { \
5566     - if (((_mm) != current->mm && (_mm) != &init_mm) || \
5567     - HYPERVISOR_update_va_mapping((addr), (pteval), 0)) \
5568     - set_pte((ptep), (pteval)); \
5569     -} while (0)
5570     -
5571     -#define set_pmd(pmdptr, pmdval) xen_l2_entry_update((pmdptr), (pmdval))
5572     +static inline void xen_set_pte(pte_t *ptep , pte_t pte)
5573     +{
5574     + *ptep = pte;
5575     +}
5576     +static inline void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
5577     + pte_t *ptep , pte_t pte)
5578     +{
5579     + if ((mm != current->mm && mm != &init_mm) ||
5580     + HYPERVISOR_update_va_mapping(addr, pte, 0))
5581     + xen_set_pte(ptep, pte);
5582     +}
5583     +static inline void xen_set_pmd(pmd_t *pmdp, pmd_t pmd)
5584     +{
5585     + xen_l2_entry_update(pmdp, pmd);
5586     +}
5587     +#define set_pte(pteptr, pteval) xen_set_pte(pteptr, pteval)
5588     +#define set_pte_at(mm,addr,ptep,pteval) xen_set_pte_at(mm, addr, ptep, pteval)
5589     +#define set_pmd(pmdptr, pmdval) xen_set_pmd(pmdptr, pmdval)
5590    
5591     #define set_pte_atomic(pteptr, pteval) set_pte(pteptr,pteval)
5592    
5593     #define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
5594     #define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
5595    
5596     -#define raw_ptep_get_and_clear(xp, pte) __pte_ma(xchg(&(xp)->pte_low, 0))
5597     +static inline void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *xp)
5598     +{
5599     + xen_set_pte_at(mm, addr, xp, __pte(0));
5600     +}
5601     +
5602     +#ifdef CONFIG_SMP
5603     +static inline pte_t xen_ptep_get_and_clear(pte_t *xp, pte_t res)
5604     +{
5605     + return __pte_ma(xchg(&xp->pte_low, 0));
5606     +}
5607     +#else
5608     +#define xen_ptep_get_and_clear(xp, res) xen_local_ptep_get_and_clear(xp, res)
5609     +#endif
5610    
5611     #define __HAVE_ARCH_PTEP_CLEAR_FLUSH
5612     #define ptep_clear_flush(vma, addr, ptep) \
5613     @@ -95,6 +116,4 @@
5614     #define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low })
5615     #define __swp_entry_to_pte(x) ((pte_t) { (x).val })
5616    
5617     -void vmalloc_sync_all(void);
5618     -
5619     #endif /* _I386_PGTABLE_2LEVEL_H */
5620     --- a/include/asm-x86/mach-xen/asm/pgtable-3level-defs.h
5621     +++ b/include/asm-x86/mach-xen/asm/pgtable-3level-defs.h
5622     @@ -1,7 +1,7 @@
5623     #ifndef _I386_PGTABLE_3LEVEL_DEFS_H
5624     #define _I386_PGTABLE_3LEVEL_DEFS_H
5625    
5626     -#define HAVE_SHARED_KERNEL_PMD 0
5627     +#define SHARED_KERNEL_PMD 0
5628    
5629     /*
5630     * PGDIR_SHIFT determines what a top-level page table entry can map
5631     --- a/include/asm-x86/mach-xen/asm/pgtable-3level.h
5632     +++ b/include/asm-x86/mach-xen/asm/pgtable-3level.h
5633     @@ -52,32 +52,40 @@
5634     * value and then use set_pte to update it. -ben
5635     */
5636    
5637     -static inline void set_pte(pte_t *ptep, pte_t pte)
5638     +static inline void xen_set_pte(pte_t *ptep, pte_t pte)
5639     {
5640     ptep->pte_high = pte.pte_high;
5641     smp_wmb();
5642     ptep->pte_low = pte.pte_low;
5643     }
5644     -#define set_pte_atomic(pteptr,pteval) \
5645     - set_64bit((unsigned long long *)(pteptr),__pte_val(pteval))
5646    
5647     -#define set_pte_at(_mm,addr,ptep,pteval) do { \
5648     - if (((_mm) != current->mm && (_mm) != &init_mm) || \
5649     - HYPERVISOR_update_va_mapping((addr), (pteval), 0)) \
5650     - set_pte((ptep), (pteval)); \
5651     -} while (0)
5652     -
5653     -#define set_pmd(pmdptr,pmdval) \
5654     - xen_l2_entry_update((pmdptr), (pmdval))
5655     -#define set_pud(pudptr,pudval) \
5656     - xen_l3_entry_update((pudptr), (pudval))
5657     +static inline void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
5658     + pte_t *ptep , pte_t pte)
5659     +{
5660     + if ((mm != current->mm && mm != &init_mm) ||
5661     + HYPERVISOR_update_va_mapping(addr, pte, 0))
5662     + xen_set_pte(ptep, pte);
5663     +}
5664     +
5665     +static inline void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
5666     +{
5667     + set_64bit((unsigned long long *)(ptep),__pte_val(pte));
5668     +}
5669     +static inline void xen_set_pmd(pmd_t *pmdp, pmd_t pmd)
5670     +{
5671     + xen_l2_entry_update(pmdp, pmd);
5672     +}
5673     +static inline void xen_set_pud(pud_t *pudp, pud_t pud)
5674     +{
5675     + xen_l3_entry_update(pudp, pud);
5676     +}
5677    
5678     /*
5679     * For PTEs and PDEs, we must clear the P-bit first when clearing a page table
5680     * entry, so clear the bottom half first and enforce ordering with a compiler
5681     * barrier.
5682     */
5683     -static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
5684     +static inline void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
5685     {
5686     if ((mm != current->mm && mm != &init_mm)
5687     || HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) {
5688     @@ -87,7 +95,18 @@
5689     }
5690     }
5691    
5692     -#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
5693     +static inline void xen_pmd_clear(pmd_t *pmd)
5694     +{
5695     + xen_l2_entry_update(pmd, __pmd(0));
5696     +}
5697     +
5698     +#define set_pte(ptep, pte) xen_set_pte(ptep, pte)
5699     +#define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte)
5700     +#define set_pte_atomic(ptep, pte) xen_set_pte_atomic(ptep, pte)
5701     +#define set_pmd(pmdp, pmd) xen_set_pmd(pmdp, pmd)
5702     +#define set_pud(pudp, pud) xen_set_pud(pudp, pud)
5703     +#define pte_clear(mm, addr, ptep) xen_pte_clear(mm, addr, ptep)
5704     +#define pmd_clear(pmd) xen_pmd_clear(pmd)
5705    
5706     /*
5707     * Pentium-II erratum A13: in PAE mode we explicitly have to flush
5708     @@ -108,7 +127,8 @@
5709     #define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \
5710     pmd_index(address))
5711    
5712     -static inline pte_t raw_ptep_get_and_clear(pte_t *ptep, pte_t res)
5713     +#ifdef CONFIG_SMP
5714     +static inline pte_t xen_ptep_get_and_clear(pte_t *ptep, pte_t res)
5715     {
5716     uint64_t val = __pte_val(res);
5717     if (__cmpxchg64(ptep, val, 0) != val) {
5718     @@ -119,6 +139,9 @@
5719     }
5720     return res;
5721     }
5722     +#else
5723     +#define xen_ptep_get_and_clear(xp, pte) xen_local_ptep_get_and_clear(xp, pte)
5724     +#endif
5725    
5726     #define __HAVE_ARCH_PTEP_CLEAR_FLUSH
5727     #define ptep_clear_flush(vma, addr, ptep) \
5728     @@ -165,13 +188,13 @@
5729     static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
5730     {
5731     return __pte((((unsigned long long)page_nr << PAGE_SHIFT) |
5732     - pgprot_val(pgprot)) & __supported_pte_mask);
5733     + pgprot_val(pgprot)) & __supported_pte_mask);
5734     }
5735    
5736     static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
5737     {
5738     return __pmd((((unsigned long long)page_nr << PAGE_SHIFT) |
5739     - pgprot_val(pgprot)) & __supported_pte_mask);
5740     + pgprot_val(pgprot)) & __supported_pte_mask);
5741     }
5742    
5743     /*
5744     @@ -191,6 +214,4 @@
5745    
5746     #define __pmd_free_tlb(tlb, x) do { } while (0)
5747    
5748     -void vmalloc_sync_all(void);
5749     -
5750     #endif /* _I386_PGTABLE_3LEVEL_H */
5751     --- a/include/asm-x86/mach-xen/asm/pgtable_32.h
5752     +++ b/include/asm-x86/mach-xen/asm/pgtable_32.h
5753     @@ -24,11 +24,11 @@
5754     #include <linux/slab.h>
5755     #include <linux/list.h>
5756     #include <linux/spinlock.h>
5757     +#include <linux/sched.h>
5758    
5759     /* Is this pagetable pinned? */
5760     #define PG_pinned PG_arch_1
5761    
5762     -struct mm_struct;
5763     struct vm_area_struct;
5764    
5765     /*
5766     @@ -38,17 +38,16 @@
5767     #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
5768     extern unsigned long empty_zero_page[1024];
5769     extern pgd_t *swapper_pg_dir;
5770     -extern struct kmem_cache *pgd_cache;
5771     extern struct kmem_cache *pmd_cache;
5772     extern spinlock_t pgd_lock;
5773     extern struct page *pgd_list;
5774     +void check_pgt_cache(void);
5775    
5776     void pmd_ctor(void *, struct kmem_cache *, unsigned long);
5777     -void pgd_ctor(void *, struct kmem_cache *, unsigned long);
5778     -void pgd_dtor(void *, struct kmem_cache *, unsigned long);
5779     void pgtable_cache_init(void);
5780     void paging_init(void);
5781    
5782     +
5783     /*
5784     * The Linux x86 paging architecture is 'compile-time dual-mode', it
5785     * implements both the traditional 2-level x86 page tables and the
5786     @@ -165,6 +164,7 @@
5787    
5788     extern unsigned long long __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
5789     #define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
5790     +#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
5791     #define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD)
5792     #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
5793     #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
5794     @@ -172,6 +172,7 @@
5795     #define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
5796     #define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)
5797     #define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC)
5798     +#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX)
5799     #define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE)
5800     #define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE)
5801     #define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
5802     @@ -275,7 +276,13 @@
5803     */
5804     #define pte_update(mm, addr, ptep) do { } while (0)
5805     #define pte_update_defer(mm, addr, ptep) do { } while (0)
5806     -#define paravirt_map_pt_hook(slot, va, pfn) do { } while (0)
5807     +
5808     +/* local pte updates need not use xchg for locking */
5809     +static inline pte_t xen_local_ptep_get_and_clear(pte_t *ptep, pte_t res)
5810     +{
5811     + xen_set_pte(ptep, __pte(0));
5812     + return res;
5813     +}
5814    
5815     /*
5816     * We only update the dirty/accessed state if we set
5817     @@ -286,17 +293,34 @@
5818     */
5819     #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
5820     #define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
5821     -do { \
5822     - if (dirty) \
5823     +({ \
5824     + int __changed = !pte_same(*(ptep), entry); \
5825     + if (__changed && (dirty)) \
5826     ptep_establish(vma, address, ptep, entry); \
5827     -} while (0)
5828     + __changed; \
5829     +})
5830    
5831     -/*
5832     - * We don't actually have these, but we want to advertise them so that
5833     - * we can encompass the flush here.
5834     - */
5835     #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
5836     +#define ptep_test_and_clear_dirty(vma, addr, ptep) ({ \
5837     + int __ret = 0; \
5838     + if (pte_dirty(*(ptep))) \
5839     + __ret = test_and_clear_bit(_PAGE_BIT_DIRTY, \
5840     + &(ptep)->pte_low); \
5841     + if (__ret) \
5842     + pte_update((vma)->vm_mm, addr, ptep); \
5843     + __ret; \
5844     +})
5845     +
5846     #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
5847     +#define ptep_test_and_clear_young(vma, addr, ptep) ({ \
5848     + int __ret = 0; \
5849     + if (pte_young(*(ptep))) \
5850     + __ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, \
5851     + &(ptep)->pte_low); \
5852     + if (__ret) \
5853     + pte_update((vma)->vm_mm, addr, ptep); \
5854     + __ret; \
5855     +})
5856    
5857     /*
5858     * Rules for using ptep_establish: the pte MUST be a user pte, and
5859     @@ -323,7 +347,7 @@
5860     int __dirty = pte_dirty(__pte); \
5861     __pte = pte_mkclean(__pte); \
5862     if (test_bit(PG_pinned, &virt_to_page((vma)->vm_mm->pgd)->flags)) \
5863     - ptep_set_access_flags(vma, address, ptep, __pte, __dirty); \
5864     + (void)ptep_set_access_flags(vma, address, ptep, __pte, __dirty); \
5865     else if (__dirty) \
5866     (ptep)->pte_low = __pte.pte_low; \
5867     __dirty; \
5868     @@ -336,7 +360,7 @@
5869     int __young = pte_young(__pte); \
5870     __pte = pte_mkold(__pte); \
5871     if (test_bit(PG_pinned, &virt_to_page((vma)->vm_mm->pgd)->flags)) \
5872     - ptep_set_access_flags(vma, address, ptep, __pte, __young); \
5873     + (void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \
5874     else if (__young) \
5875     (ptep)->pte_low = __pte.pte_low; \
5876     __young; \
5877     @@ -349,7 +373,7 @@
5878     if (!pte_none(pte)
5879     && (mm != &init_mm
5880     || HYPERVISOR_update_va_mapping(addr, __pte(0), 0))) {
5881     - pte = raw_ptep_get_and_clear(ptep, pte);
5882     + pte = xen_ptep_get_and_clear(ptep, pte);
5883     pte_update(mm, addr, ptep);
5884     }
5885     return pte;
5886     @@ -491,24 +515,10 @@
5887     #endif
5888    
5889     #if defined(CONFIG_HIGHPTE)
5890     -#define pte_offset_map(dir, address) \
5891     -({ \
5892     - pte_t *__ptep; \
5893     - unsigned pfn = pmd_val(*(dir)) >> PAGE_SHIFT; \
5894     - __ptep = (pte_t *)kmap_atomic_pte(pfn_to_page(pfn),KM_PTE0); \
5895     - paravirt_map_pt_hook(KM_PTE0,__ptep, pfn); \
5896     - __ptep = __ptep + pte_index(address); \
5897     - __ptep; \
5898     -})
5899     -#define pte_offset_map_nested(dir, address) \
5900     -({ \
5901     - pte_t *__ptep; \
5902     - unsigned pfn = pmd_val(*(dir)) >> PAGE_SHIFT; \
5903     - __ptep = (pte_t *)kmap_atomic_pte(pfn_to_page(pfn),KM_PTE1); \
5904     - paravirt_map_pt_hook(KM_PTE1,__ptep, pfn); \
5905     - __ptep = __ptep + pte_index(address); \
5906     - __ptep; \
5907     -})
5908     +#define pte_offset_map(dir, address) \
5909     + ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + pte_index(address))
5910     +#define pte_offset_map_nested(dir, address) \
5911     + ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE1) + pte_index(address))
5912     #define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0)
5913     #define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1)
5914     #else
5915     @@ -587,10 +597,6 @@
5916     #define io_remap_pfn_range(vma,from,pfn,size,prot) \
5917     direct_remap_pfn_range(vma,from,pfn,size,prot,DOMID_IO)
5918    
5919     -#define MK_IOSPACE_PFN(space, pfn) (pfn)
5920     -#define GET_IOSPACE(pfn) 0
5921     -#define GET_PFN(pfn) (pfn)
5922     -
5923     #include <asm-generic/pgtable.h>
5924    
5925     #endif /* _I386_PGTABLE_H */
5926     --- a/include/asm-x86/mach-xen/asm/pgtable_64.h
5927     +++ b/include/asm-x86/mach-xen/asm/pgtable_64.h
5928     @@ -1,12 +1,14 @@
5929     #ifndef _X86_64_PGTABLE_H
5930     #define _X86_64_PGTABLE_H
5931    
5932     +#include <linux/const.h>
5933     +#ifndef __ASSEMBLY__
5934     +
5935     /*
5936     * This file contains the functions and defines necessary to modify and use
5937     * the x86-64 page table tree.
5938     */
5939     #include <asm/processor.h>
5940     -#include <asm/fixmap.h>
5941     #include <asm/bitops.h>
5942     #include <linux/threads.h>
5943     #include <linux/sched.h>
5944     @@ -34,11 +36,9 @@
5945     #endif
5946    
5947     extern pud_t level3_kernel_pgt[512];
5948     -extern pud_t level3_physmem_pgt[512];
5949     extern pud_t level3_ident_pgt[512];
5950     extern pmd_t level2_kernel_pgt[512];
5951     extern pgd_t init_level4_pgt[];
5952     -extern pgd_t boot_level4_pgt[];
5953     extern unsigned long __supported_pte_mask;
5954    
5955     #define swapper_pg_dir init_level4_pgt
5956     @@ -53,6 +53,8 @@
5957     extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
5958     #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
5959    
5960     +#endif /* !__ASSEMBLY__ */
5961     +
5962     /*
5963     * PGDIR_SHIFT determines what a top-level page table entry can map
5964     */
5965     @@ -77,6 +79,8 @@
5966     */
5967     #define PTRS_PER_PTE 512
5968    
5969     +#ifndef __ASSEMBLY__
5970     +
5971     #define pte_ERROR(e) \
5972     printk("%s:%d: bad pte %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \
5973     &(e), __pte_val(e), pte_pfn(e))
5974     @@ -119,22 +123,23 @@
5975    
5976     #define pte_pgprot(a) (__pgprot((a).pte & ~PHYSICAL_PAGE_MASK))
5977    
5978     -#define PMD_SIZE (1UL << PMD_SHIFT)
5979     +#endif /* !__ASSEMBLY__ */
5980     +
5981     +#define PMD_SIZE (_AC(1,UL) << PMD_SHIFT)
5982     #define PMD_MASK (~(PMD_SIZE-1))
5983     -#define PUD_SIZE (1UL << PUD_SHIFT)
5984     +#define PUD_SIZE (_AC(1,UL) << PUD_SHIFT)
5985     #define PUD_MASK (~(PUD_SIZE-1))
5986     -#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
5987     +#define PGDIR_SIZE (_AC(1,UL) << PGDIR_SHIFT)
5988     #define PGDIR_MASK (~(PGDIR_SIZE-1))
5989    
5990     #define USER_PTRS_PER_PGD ((TASK_SIZE-1)/PGDIR_SIZE+1)
5991     #define FIRST_USER_ADDRESS 0
5992    
5993     -#ifndef __ASSEMBLY__
5994     -#define MAXMEM 0x3fffffffffffUL
5995     -#define VMALLOC_START 0xffffc20000000000UL
5996     -#define VMALLOC_END 0xffffe1ffffffffffUL
5997     -#define MODULES_VADDR 0xffffffff88000000UL
5998     -#define MODULES_END 0xfffffffffff00000UL
5999     +#define MAXMEM _AC(0x3fffffffffff, UL)
6000     +#define VMALLOC_START _AC(0xffffc20000000000, UL)
6001     +#define VMALLOC_END _AC(0xffffe1ffffffffff, UL)
6002     +#define MODULES_VADDR _AC(0xffffffff88000000, UL)
6003     +#define MODULES_END _AC(0xfffffffffff00000, UL)
6004     #define MODULES_LEN (MODULES_END - MODULES_VADDR)
6005    
6006     #define _PAGE_BIT_PRESENT 0
6007     @@ -160,16 +165,18 @@
6008     #define _PAGE_GLOBAL 0x100 /* Global TLB entry */
6009    
6010     #define _PAGE_PROTNONE 0x080 /* If not present */
6011     -#define _PAGE_NX (1UL<<_PAGE_BIT_NX)
6012     +#define _PAGE_NX (_AC(1,UL)<<_PAGE_BIT_NX)
6013    
6014     /* Mapped page is I/O or foreign and has no associated page struct. */
6015     #define _PAGE_IO 0x200
6016    
6017     +#ifndef __ASSEMBLY__
6018     #if CONFIG_XEN_COMPAT <= 0x030002
6019     extern unsigned int __kernel_page_user;
6020     #else
6021     #define __kernel_page_user 0
6022     #endif
6023     +#endif
6024    
6025     #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
6026     #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | __kernel_page_user)
6027     @@ -234,6 +241,8 @@
6028     #define __S110 PAGE_SHARED_EXEC
6029     #define __S111 PAGE_SHARED_EXEC
6030    
6031     +#ifndef __ASSEMBLY__
6032     +
6033     static inline unsigned long pgd_bad(pgd_t pgd)
6034     {
6035     return __pgd_val(pgd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
6036     @@ -345,6 +354,20 @@
6037     static inline pte_t pte_mkhuge(pte_t pte) { __pte_val(pte) |= _PAGE_PSE; return pte; }
6038     static inline pte_t pte_clrhuge(pte_t pte) { __pte_val(pte) &= ~_PAGE_PSE; return pte; }
6039    
6040     +static inline int ptep_test_and_clear_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
6041     +{
6042     + if (!pte_dirty(*ptep))
6043     + return 0;
6044     + return test_and_clear_bit(_PAGE_BIT_DIRTY, &ptep->pte);
6045     +}
6046     +
6047     +static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
6048     +{
6049     + if (!pte_young(*ptep))
6050     + return 0;
6051     + return test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte);
6052     +}
6053     +
6054     static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
6055     {
6056     pte_t pte = *ptep;
6057     @@ -470,18 +493,12 @@
6058     * bit at the same time. */
6059     #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
6060     #define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
6061     - do { \
6062     - if (dirty) \
6063     - ptep_establish(vma, address, ptep, entry); \
6064     - } while (0)
6065     -
6066     -
6067     -/*
6068     - * i386 says: We don't actually have these, but we want to advertise
6069     - * them so that we can encompass the flush here.
6070     - */
6071     -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
6072     -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
6073     +({ \
6074     + int __changed = !pte_same(*(ptep), entry); \
6075     + if (__changed && (dirty)) \
6076     + ptep_establish(vma, address, ptep, entry); \
6077     + __changed; \
6078     +})
6079    
6080     #define __HAVE_ARCH_PTEP_CLEAR_DIRTY_FLUSH
6081     #define ptep_clear_flush_dirty(vma, address, ptep) \
6082     @@ -490,7 +507,7 @@
6083     int __dirty = pte_dirty(__pte); \
6084     __pte = pte_mkclean(__pte); \
6085     if ((vma)->vm_mm->context.pinned) \
6086     - ptep_set_access_flags(vma, address, ptep, __pte, __dirty); \
6087     + (void)ptep_set_access_flags(vma, address, ptep, __pte, __dirty); \
6088     else if (__dirty) \
6089     set_pte(ptep, __pte); \
6090     __dirty; \
6091     @@ -503,7 +520,7 @@
6092     int __young = pte_young(__pte); \
6093     __pte = pte_mkold(__pte); \
6094     if ((vma)->vm_mm->context.pinned) \
6095     - ptep_set_access_flags(vma, address, ptep, __pte, __young); \
6096     + (void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \
6097     else if (__young) \
6098     set_pte(ptep, __pte); \
6099     __young; \
6100     @@ -517,10 +534,7 @@
6101     #define __swp_entry_to_pte(x) ((pte_t) { (x).val })
6102    
6103     extern spinlock_t pgd_lock;
6104     -extern struct page *pgd_list;
6105     -void vmalloc_sync_all(void);
6106     -
6107     -#endif /* !__ASSEMBLY__ */
6108     +extern struct list_head pgd_list;
6109    
6110     extern int kern_addr_valid(unsigned long addr);
6111    
6112     @@ -559,10 +573,6 @@
6113     #define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
6114     direct_remap_pfn_range(vma,vaddr,pfn,size,prot,DOMID_IO)
6115    
6116     -#define MK_IOSPACE_PFN(space, pfn) (pfn)
6117     -#define GET_IOSPACE(pfn) 0
6118     -#define GET_PFN(pfn) (pfn)
6119     -
6120     #define HAVE_ARCH_UNMAPPED_AREA
6121    
6122     #define pgtable_cache_init() do { } while (0)
6123     @@ -576,11 +586,14 @@
6124     #define kc_offset_to_vaddr(o) \
6125     (((o) & (1UL << (__VIRTUAL_MASK_SHIFT-1))) ? ((o) | (~__VIRTUAL_MASK)) : (o))
6126    
6127     +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
6128     +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
6129     #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
6130     #define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
6131     #define __HAVE_ARCH_PTEP_CLEAR_FLUSH
6132     #define __HAVE_ARCH_PTEP_SET_WRPROTECT
6133     #define __HAVE_ARCH_PTE_SAME
6134     #include <asm-generic/pgtable.h>
6135     +#endif /* !__ASSEMBLY__ */
6136    
6137     #endif /* _X86_64_PGTABLE_H */
6138     --- a/include/asm-x86/mach-xen/asm/processor_32.h
6139     +++ b/include/asm-x86/mach-xen/asm/processor_32.h
6140     @@ -21,6 +21,7 @@
6141     #include <asm/percpu.h>
6142     #include <linux/cpumask.h>
6143     #include <linux/init.h>
6144     +#include <asm/processor-flags.h>
6145     #include <xen/interface/physdev.h>
6146    
6147     /* flag for disabling the tsc */
6148     @@ -118,7 +119,8 @@
6149    
6150     void __init cpu_detect(struct cpuinfo_x86 *c);
6151    
6152     -extern void identify_cpu(struct cpuinfo_x86 *);
6153     +extern void identify_boot_cpu(void);
6154     +extern void identify_secondary_cpu(struct cpuinfo_x86 *);
6155     extern void print_cpu_info(struct cpuinfo_x86 *);
6156     extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
6157     extern unsigned short num_cache_leaves;
6158     @@ -129,29 +131,8 @@
6159     static inline void detect_ht(struct cpuinfo_x86 *c) {}
6160     #endif
6161    
6162     -/*
6163     - * EFLAGS bits
6164     - */
6165     -#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */
6166     -#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */
6167     -#define X86_EFLAGS_AF 0x00000010 /* Auxillary carry Flag */
6168     -#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */
6169     -#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */
6170     -#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */
6171     -#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */
6172     -#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */
6173     -#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */
6174     -#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */
6175     -#define X86_EFLAGS_NT 0x00004000 /* Nested Task */
6176     -#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */
6177     -#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */
6178     -#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */
6179     -#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */
6180     -#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */
6181     -#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */
6182     -
6183     -static inline fastcall void xen_cpuid(unsigned int *eax, unsigned int *ebx,
6184     - unsigned int *ecx, unsigned int *edx)
6185     +static inline void xen_cpuid(unsigned int *eax, unsigned int *ebx,
6186     + unsigned int *ecx, unsigned int *edx)
6187     {
6188     /* ecx is often an input as well as an output. */
6189     __asm__(XEN_CPUID
6190     @@ -165,21 +146,6 @@
6191     #define load_cr3(pgdir) write_cr3(__pa(pgdir))
6192    
6193     /*
6194     - * Intel CPU features in CR4
6195     - */
6196     -#define X86_CR4_VME 0x0001 /* enable vm86 extensions */
6197     -#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */
6198     -#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */
6199     -#define X86_CR4_DE 0x0008 /* enable debugging extensions */
6200     -#define X86_CR4_PSE 0x0010 /* enable page size extensions */
6201     -#define X86_CR4_PAE 0x0020 /* enable physical address extensions */
6202     -#define X86_CR4_MCE 0x0040 /* Machine check enable */
6203     -#define X86_CR4_PGE 0x0080 /* enable global pages */
6204     -#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */
6205     -#define X86_CR4_OSFXSR 0x0200 /* enable fast FPU save and restore */
6206     -#define X86_CR4_OSXMMEXCPT 0x0400 /* enable unmasked SSE exceptions */
6207     -
6208     -/*
6209     * Save the cr4 feature set we're using (ie
6210     * Pentium 4MB enable and PPro Global page
6211     * enable), so that any CPU's that boot up
6212     @@ -206,26 +172,6 @@
6213     }
6214    
6215     /*
6216     - * NSC/Cyrix CPU configuration register indexes
6217     - */
6218     -
6219     -#define CX86_PCR0 0x20
6220     -#define CX86_GCR 0xb8
6221     -#define CX86_CCR0 0xc0
6222     -#define CX86_CCR1 0xc1
6223     -#define CX86_CCR2 0xc2
6224     -#define CX86_CCR3 0xc3
6225     -#define CX86_CCR4 0xe8
6226     -#define CX86_CCR5 0xe9
6227     -#define CX86_CCR6 0xea
6228     -#define CX86_CCR7 0xeb
6229     -#define CX86_PCR1 0xf0
6230     -#define CX86_DIR0 0xfe
6231     -#define CX86_DIR1 0xff
6232     -#define CX86_ARR_BASE 0xc4
6233     -#define CX86_RCR_BASE 0xdc
6234     -
6235     -/*
6236     * NSC/Cyrix CPU indexed register access macros
6237     */
6238    
6239     @@ -351,7 +297,8 @@
6240     struct thread_struct;
6241    
6242     #ifndef CONFIG_X86_NO_TSS
6243     -struct tss_struct {
6244     +/* This is the TSS defined by the hardware. */
6245     +struct i386_hw_tss {
6246     unsigned short back_link,__blh;
6247     unsigned long esp0;
6248     unsigned short ss0,__ss0h;
6249     @@ -375,6 +322,11 @@
6250     unsigned short gs, __gsh;
6251     unsigned short ldt, __ldth;
6252     unsigned short trace, io_bitmap_base;
6253     +} __attribute__((packed));
6254     +
6255     +struct tss_struct {
6256     + struct i386_hw_tss x86_tss;
6257     +
6258     /*
6259     * The extra 1 is there because the CPU will access an
6260     * additional byte beyond the end of the IO permission
6261     @@ -428,10 +380,11 @@
6262     };
6263    
6264     #define INIT_THREAD { \
6265     + .esp0 = sizeof(init_stack) + (long)&init_stack, \
6266     .vm86_info = NULL, \
6267     .sysenter_cs = __KERNEL_CS, \
6268     .io_bitmap_ptr = NULL, \
6269     - .fs = __KERNEL_PDA, \
6270     + .fs = __KERNEL_PERCPU, \
6271     }
6272    
6273     /*
6274     @@ -441,10 +394,12 @@
6275     * be within the limit.
6276     */
6277     #define INIT_TSS { \
6278     - .esp0 = sizeof(init_stack) + (long)&init_stack, \
6279     - .ss0 = __KERNEL_DS, \
6280     - .ss1 = __KERNEL_CS, \
6281     - .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
6282     + .x86_tss = { \
6283     + .esp0 = sizeof(init_stack) + (long)&init_stack, \
6284     + .ss0 = __KERNEL_DS, \
6285     + .ss1 = __KERNEL_CS, \
6286     + .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
6287     + }, \
6288     .io_bitmap = { [ 0 ... IO_BITMAP_LONGS] = ~0 }, \
6289     }
6290    
6291     @@ -551,38 +506,33 @@
6292    
6293     #define cpu_relax() rep_nop()
6294    
6295     -#define paravirt_enabled() 0
6296     -#define __cpuid xen_cpuid
6297     -
6298     #ifndef CONFIG_X86_NO_TSS
6299     -static inline void __load_esp0(struct tss_struct *tss, struct thread_struct *thread)
6300     +static inline void native_load_esp0(struct tss_struct *tss, struct thread_struct *thread)
6301     {
6302     - tss->esp0 = thread->esp0;
6303     + tss->x86_tss.esp0 = thread->esp0;
6304     /* This can only happen when SEP is enabled, no need to test "SEP"arately */
6305     - if (unlikely(tss->ss1 != thread->sysenter_cs)) {
6306     - tss->ss1 = thread->sysenter_cs;
6307     + if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
6308     + tss->x86_tss.ss1 = thread->sysenter_cs;
6309     wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
6310     }
6311     }
6312     -#define load_esp0(tss, thread) \
6313     - __load_esp0(tss, thread)
6314     #else
6315     -#define load_esp0(tss, thread) do { \
6316     +#define xen_load_esp0(tss, thread) do { \
6317     if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->esp0)) \
6318     BUG(); \
6319     } while (0)
6320     #endif
6321    
6322    
6323     -/*
6324     - * These special macros can be used to get or set a debugging register
6325     - */
6326     -#define get_debugreg(var, register) \
6327     - (var) = HYPERVISOR_get_debugreg(register)
6328     -#define set_debugreg(value, register) \
6329     - WARN_ON(HYPERVISOR_set_debugreg(register, value))
6330     +static inline unsigned long xen_get_debugreg(int regno)
6331     +{
6332     + return HYPERVISOR_get_debugreg(regno);
6333     +}
6334    
6335     -#define set_iopl_mask xen_set_iopl_mask
6336     +static inline void xen_set_debugreg(int regno, unsigned long value)
6337     +{
6338     + WARN_ON(HYPERVISOR_set_debugreg(regno, value));
6339     +}
6340    
6341     /*
6342     * Set IOPL bits in EFLAGS from given mask
6343     @@ -597,6 +547,21 @@
6344     }
6345    
6346    
6347     +#define paravirt_enabled() 0
6348     +#define __cpuid xen_cpuid
6349     +
6350     +#define load_esp0 xen_load_esp0
6351     +
6352     +/*
6353     + * These special macros can be used to get or set a debugging register
6354     + */
6355     +#define get_debugreg(var, register) \
6356     + (var) = xen_get_debugreg(register)
6357     +#define set_debugreg(value, register) \
6358     + xen_set_debugreg(register, value)
6359     +
6360     +#define set_iopl_mask xen_set_iopl_mask
6361     +
6362     /*
6363     * Generic CPUID function
6364     * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
6365     @@ -749,8 +714,14 @@
6366     extern void enable_sep_cpu(void);
6367     extern int sysenter_setup(void);
6368    
6369     -extern int init_gdt(int cpu, struct task_struct *idle);
6370     +/* Defined in head.S */
6371     +extern struct Xgt_desc_struct early_gdt_descr;
6372     +
6373     extern void cpu_set_gdt(int);
6374     -extern void secondary_cpu_init(void);
6375     +extern void switch_to_new_gdt(void);
6376     +extern void cpu_init(void);
6377     +extern void init_gdt(int cpu);
6378     +
6379     +extern int force_mwait;
6380    
6381     #endif /* __ASM_I386_PROCESSOR_H */
6382     --- a/include/asm-x86/mach-xen/asm/processor_64.h
6383     +++ b/include/asm-x86/mach-xen/asm/processor_64.h
6384     @@ -20,6 +20,7 @@
6385     #include <asm/percpu.h>
6386     #include <linux/personality.h>
6387     #include <linux/cpumask.h>
6388     +#include <asm/processor-flags.h>
6389    
6390     #define TF_MASK 0x00000100
6391     #define IF_MASK 0x00000200
6392     @@ -103,42 +104,6 @@
6393     extern unsigned short num_cache_leaves;
6394    
6395     /*
6396     - * EFLAGS bits
6397     - */
6398     -#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */
6399     -#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */
6400     -#define X86_EFLAGS_AF 0x00000010 /* Auxillary carry Flag */
6401     -#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */
6402     -#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */
6403     -#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */
6404     -#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */
6405     -#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */
6406     -#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */
6407     -#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */
6408     -#define X86_EFLAGS_NT 0x00004000 /* Nested Task */
6409     -#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */
6410     -#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */
6411     -#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */
6412     -#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */
6413     -#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */
6414     -#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */
6415     -
6416     -/*
6417     - * Intel CPU features in CR4
6418     - */
6419     -#define X86_CR4_VME 0x0001 /* enable vm86 extensions */
6420     -#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */
6421     -#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */
6422     -#define X86_CR4_DE 0x0008 /* enable debugging extensions */
6423     -#define X86_CR4_PSE 0x0010 /* enable page size extensions */
6424     -#define X86_CR4_PAE 0x0020 /* enable physical address extensions */
6425     -#define X86_CR4_MCE 0x0040 /* Machine check enable */
6426     -#define X86_CR4_PGE 0x0080 /* enable global pages */
6427     -#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */
6428     -#define X86_CR4_OSFXSR 0x0200 /* enable fast FPU save and restore */
6429     -#define X86_CR4_OSXMMEXCPT 0x0400 /* enable unmasked SSE exceptions */
6430     -
6431     -/*
6432     * Save the cr4 feature set we're using (ie
6433     * Pentium 4MB enable and PPro Global page
6434     * enable), so that any CPU's that boot up
6435     @@ -203,7 +168,7 @@
6436     u32 mxcsr;
6437     u32 mxcsr_mask;
6438     u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
6439     - u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 128 bytes */
6440     + u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
6441     u32 padding[24];
6442     } __attribute__ ((aligned (16)));
6443    
6444     @@ -436,22 +401,6 @@
6445     #define cpu_relax() rep_nop()
6446    
6447     /*
6448     - * NSC/Cyrix CPU configuration register indexes
6449     - */
6450     -#define CX86_CCR0 0xc0
6451     -#define CX86_CCR1 0xc1
6452     -#define CX86_CCR2 0xc2
6453     -#define CX86_CCR3 0xc3
6454     -#define CX86_CCR4 0xe8
6455     -#define CX86_CCR5 0xe9
6456     -#define CX86_CCR6 0xea
6457     -#define CX86_CCR7 0xeb
6458     -#define CX86_DIR0 0xfe
6459     -#define CX86_DIR1 0xff
6460     -#define CX86_ARR_BASE 0xc4
6461     -#define CX86_RCR_BASE 0xdc
6462     -
6463     -/*
6464     * NSC/Cyrix CPU indexed register access macros
6465     */
6466    
6467     --- a/include/asm-x86/mach-xen/asm/scatterlist_32.h
6468     +++ b/include/asm-x86/mach-xen/asm/scatterlist_32.h
6469     @@ -1,6 +1,8 @@
6470     #ifndef _I386_SCATTERLIST_H
6471     #define _I386_SCATTERLIST_H
6472    
6473     +#include <asm/types.h>
6474     +
6475     struct scatterlist {
6476     struct page *page;
6477     unsigned int offset;
6478     --- a/include/asm-x86/mach-xen/asm/segment_32.h
6479     +++ b/include/asm-x86/mach-xen/asm/segment_32.h
6480     @@ -39,7 +39,7 @@
6481     * 25 - APM BIOS support
6482     *
6483     * 26 - ESPFIX small SS
6484     - * 27 - PDA [ per-cpu private data area ]
6485     + * 27 - per-cpu [ offset to per-cpu data area ]
6486     * 28 - unused
6487     * 29 - unused
6488     * 30 - unused
6489     @@ -74,8 +74,12 @@
6490     #define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14)
6491     #define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
6492    
6493     -#define GDT_ENTRY_PDA (GDT_ENTRY_KERNEL_BASE + 15)
6494     -#define __KERNEL_PDA (GDT_ENTRY_PDA * 8)
6495     +#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE + 15)
6496     +#ifdef CONFIG_SMP
6497     +#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8)
6498     +#else
6499     +#define __KERNEL_PERCPU 0
6500     +#endif
6501    
6502     #define GDT_ENTRY_DOUBLEFAULT_TSS 31
6503    
6504     --- a/include/asm-x86/mach-xen/asm/smp_32.h
6505     +++ b/include/asm-x86/mach-xen/asm/smp_32.h
6506     @@ -8,19 +8,15 @@
6507     #include <linux/kernel.h>
6508     #include <linux/threads.h>
6509     #include <linux/cpumask.h>
6510     -#include <asm/pda.h>
6511     #endif
6512    
6513     -#ifdef CONFIG_X86_LOCAL_APIC
6514     -#ifndef __ASSEMBLY__
6515     -#include <asm/fixmap.h>
6516     +#if defined(CONFIG_X86_LOCAL_APIC) && !defined(__ASSEMBLY__)
6517     #include <asm/bitops.h>
6518     #include <asm/mpspec.h>
6519     +#include <asm/apic.h>
6520     #ifdef CONFIG_X86_IO_APIC
6521     #include <asm/io_apic.h>
6522     #endif
6523     -#include <asm/apic.h>
6524     -#endif
6525     #endif
6526    
6527     #define BAD_APICID 0xFFu
6528     @@ -52,9 +48,76 @@
6529     extern void cpu_uninit(void);
6530     #endif
6531    
6532     -#ifndef CONFIG_PARAVIRT
6533     +#ifndef CONFIG_XEN
6534     +struct smp_ops
6535     +{
6536     + void (*smp_prepare_boot_cpu)(void);
6537     + void (*smp_prepare_cpus)(unsigned max_cpus);
6538     + int (*cpu_up)(unsigned cpu);
6539     + void (*smp_cpus_done)(unsigned max_cpus);
6540     +
6541     + void (*smp_send_stop)(void);
6542     + void (*smp_send_reschedule)(int cpu);
6543     + int (*smp_call_function_mask)(cpumask_t mask,
6544     + void (*func)(void *info), void *info,
6545     + int wait);
6546     +};
6547     +
6548     +extern struct smp_ops smp_ops;
6549     +
6550     +static inline void smp_prepare_boot_cpu(void)
6551     +{
6552     + smp_ops.smp_prepare_boot_cpu();
6553     +}
6554     +static inline void smp_prepare_cpus(unsigned int max_cpus)
6555     +{
6556     + smp_ops.smp_prepare_cpus(max_cpus);
6557     +}
6558     +static inline int __cpu_up(unsigned int cpu)
6559     +{
6560     + return smp_ops.cpu_up(cpu);
6561     +}
6562     +static inline void smp_cpus_done(unsigned int max_cpus)
6563     +{
6564     + smp_ops.smp_cpus_done(max_cpus);
6565     +}
6566     +
6567     +static inline void smp_send_stop(void)
6568     +{
6569     + smp_ops.smp_send_stop();
6570     +}
6571     +static inline void smp_send_reschedule(int cpu)
6572     +{
6573     + smp_ops.smp_send_reschedule(cpu);
6574     +}
6575     +static inline int smp_call_function_mask(cpumask_t mask,
6576     + void (*func) (void *info), void *info,
6577     + int wait)
6578     +{
6579     + return smp_ops.smp_call_function_mask(mask, func, info, wait);
6580     +}
6581     +
6582     +void native_smp_prepare_boot_cpu(void);
6583     +void native_smp_prepare_cpus(unsigned int max_cpus);
6584     +int native_cpu_up(unsigned int cpunum);
6585     +void native_smp_cpus_done(unsigned int max_cpus);
6586     +
6587     #define startup_ipi_hook(phys_apicid, start_eip, start_esp) \
6588     do { } while (0)
6589     +
6590     +#else
6591     +
6592     +
6593     +void xen_smp_send_stop(void);
6594     +void xen_smp_send_reschedule(int cpu);
6595     +int xen_smp_call_function_mask(cpumask_t mask,
6596     + void (*func) (void *info), void *info,
6597     + int wait);
6598     +
6599     +#define smp_send_stop xen_smp_send_stop
6600     +#define smp_send_reschedule xen_smp_send_reschedule
6601     +#define smp_call_function_mask xen_smp_call_function_mask
6602     +
6603     #endif
6604    
6605     /*
6606     @@ -62,7 +125,8 @@
6607     * from the initial startup. We map APIC_BASE very early in page_setup(),
6608     * so this is correct in the x86 case.
6609     */
6610     -#define raw_smp_processor_id() (read_pda(cpu_number))
6611     +DECLARE_PER_CPU(int, cpu_number);
6612     +#define raw_smp_processor_id() (x86_read_percpu(cpu_number))
6613    
6614     extern cpumask_t cpu_possible_map;
6615     #define cpu_callin_map cpu_possible_map
6616     @@ -73,20 +137,6 @@
6617     return cpus_weight(cpu_possible_map);
6618     }
6619    
6620     -#ifdef CONFIG_X86_LOCAL_APIC
6621     -
6622     -#ifdef APIC_DEFINITION
6623     -extern int hard_smp_processor_id(void);
6624     -#else
6625     -#include <mach_apicdef.h>
6626     -static inline int hard_smp_processor_id(void)
6627     -{
6628     - /* we don't want to mark this access volatile - bad code generation */
6629     - return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID));
6630     -}
6631     -#endif
6632     -#endif
6633     -
6634     #define safe_smp_processor_id() smp_processor_id()
6635     extern int __cpu_disable(void);
6636     extern void __cpu_die(unsigned int cpu);
6637     @@ -102,10 +152,31 @@
6638    
6639     #define NO_PROC_ID 0xFF /* No processor magic marker */
6640    
6641     -#endif
6642     +#endif /* CONFIG_SMP */
6643    
6644     #ifndef __ASSEMBLY__
6645    
6646     +#ifdef CONFIG_X86_LOCAL_APIC
6647     +
6648     +#ifdef APIC_DEFINITION
6649     +extern int hard_smp_processor_id(void);
6650     +#else
6651     +#include <mach_apicdef.h>
6652     +static inline int hard_smp_processor_id(void)
6653     +{
6654     + /* we don't want to mark this access volatile - bad code generation */
6655     + return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID));
6656     +}
6657     +#endif /* APIC_DEFINITION */
6658     +
6659     +#else /* CONFIG_X86_LOCAL_APIC */
6660     +
6661     +#ifndef CONFIG_SMP
6662     +#define hard_smp_processor_id() 0
6663     +#endif
6664     +
6665     +#endif /* CONFIG_X86_LOCAL_APIC */
6666     +
6667     extern u8 apicid_2_node[];
6668    
6669     #ifdef CONFIG_X86_LOCAL_APIC
6670     --- a/include/asm-x86/mach-xen/asm/smp_64.h
6671     +++ b/include/asm-x86/mach-xen/asm/smp_64.h
6672     @@ -11,12 +11,11 @@
6673     extern int disable_apic;
6674    
6675     #ifdef CONFIG_X86_LOCAL_APIC
6676     -#include <asm/fixmap.h>
6677     #include <asm/mpspec.h>
6678     +#include <asm/apic.h>
6679     #ifdef CONFIG_X86_IO_APIC
6680     #include <asm/io_apic.h>
6681     #endif
6682     -#include <asm/apic.h>
6683     #include <asm/thread_info.h>
6684     #endif
6685    
6686     @@ -41,7 +40,6 @@
6687     extern void unlock_ipi_call_lock(void);
6688     extern int smp_num_siblings;
6689     extern void smp_send_reschedule(int cpu);
6690     -void smp_stop_cpu(void);
6691    
6692     extern cpumask_t cpu_sibling_map[NR_CPUS];
6693     extern cpumask_t cpu_core_map[NR_CPUS];
6694     @@ -62,14 +60,6 @@
6695    
6696     #define raw_smp_processor_id() read_pda(cpunumber)
6697    
6698     -#ifdef CONFIG_X86_LOCAL_APIC
6699     -static inline int hard_smp_processor_id(void)
6700     -{
6701     - /* we don't want to mark this access volatile - bad code generation */
6702     - return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID));
6703     -}
6704     -#endif
6705     -
6706     extern int __cpu_disable(void);
6707     extern void __cpu_die(unsigned int cpu);
6708     extern void prefill_possible_map(void);
6709     @@ -78,6 +68,14 @@
6710    
6711     #define NO_PROC_ID 0xFF /* No processor magic marker */
6712    
6713     +#endif /* CONFIG_SMP */
6714     +
6715     +#ifdef CONFIG_X86_LOCAL_APIC
6716     +static inline int hard_smp_processor_id(void)
6717     +{
6718     + /* we don't want to mark this access volatile - bad code generation */
6719     + return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID));
6720     +}
6721     #endif
6722    
6723     /*
6724     --- a/include/asm-x86/mach-xen/asm/system_32.h
6725     +++ b/include/asm-x86/mach-xen/asm/system_32.h
6726     @@ -4,7 +4,7 @@
6727     #include <linux/kernel.h>
6728     #include <asm/segment.h>
6729     #include <asm/cpufeature.h>
6730     -#include <linux/bitops.h> /* for LOCK_PREFIX */
6731     +#include <asm/cmpxchg.h>
6732     #include <asm/synch_bitops.h>
6733     #include <asm/hypervisor.h>
6734    
6735     @@ -90,308 +90,102 @@
6736     #define savesegment(seg, value) \
6737     asm volatile("mov %%" #seg ",%0":"=rm" (value))
6738    
6739     -#define read_cr0() ({ \
6740     - unsigned int __dummy; \
6741     - __asm__ __volatile__( \
6742     - "movl %%cr0,%0\n\t" \
6743     - :"=r" (__dummy)); \
6744     - __dummy; \
6745     -})
6746     -#define write_cr0(x) \
6747     - __asm__ __volatile__("movl %0,%%cr0": :"r" (x))
6748     -
6749     -#define read_cr2() (current_vcpu_info()->arch.cr2)
6750     -#define write_cr2(x) \
6751     - __asm__ __volatile__("movl %0,%%cr2": :"r" (x))
6752     -
6753     -#define read_cr3() ({ \
6754     - unsigned int __dummy; \
6755     - __asm__ ( \
6756     - "movl %%cr3,%0\n\t" \
6757     - :"=r" (__dummy)); \
6758     - __dummy = xen_cr3_to_pfn(__dummy); \
6759     - mfn_to_pfn(__dummy) << PAGE_SHIFT; \
6760     -})
6761     -#define write_cr3(x) ({ \
6762     - unsigned int __dummy = pfn_to_mfn((x) >> PAGE_SHIFT); \
6763     - __dummy = xen_pfn_to_cr3(__dummy); \
6764     - __asm__ __volatile__("movl %0,%%cr3": :"r" (__dummy)); \
6765     -})
6766     -#define read_cr4() ({ \
6767     - unsigned int __dummy; \
6768     - __asm__( \
6769     - "movl %%cr4,%0\n\t" \
6770     - :"=r" (__dummy)); \
6771     - __dummy; \
6772     -})
6773     -#define read_cr4_safe() ({ \
6774     - unsigned int __dummy; \
6775     - /* This could fault if %cr4 does not exist */ \
6776     - __asm__("1: movl %%cr4, %0 \n" \
6777     - "2: \n" \
6778     - ".section __ex_table,\"a\" \n" \
6779     - ".long 1b,2b \n" \
6780     - ".previous \n" \
6781     - : "=r" (__dummy): "0" (0)); \
6782     - __dummy; \
6783     -})
6784     -
6785     -#define write_cr4(x) \
6786     - __asm__ __volatile__("movl %0,%%cr4": :"r" (x))
6787     -
6788     -#define wbinvd() \
6789     - __asm__ __volatile__ ("wbinvd": : :"memory")
6790     -
6791     -/* Clear the 'TS' bit */
6792     -#define clts() (HYPERVISOR_fpu_taskswitch(0))
6793     -
6794     -/* Set the 'TS' bit */
6795     -#define stts() (HYPERVISOR_fpu_taskswitch(1))
6796     -
6797     -#endif /* __KERNEL__ */
6798     -
6799     -static inline unsigned long get_limit(unsigned long segment)
6800     +static inline void xen_clts(void)
6801     {
6802     - unsigned long __limit;
6803     - __asm__("lsll %1,%0"
6804     - :"=r" (__limit):"r" (segment));
6805     - return __limit+1;
6806     + HYPERVISOR_fpu_taskswitch(0);
6807     }
6808    
6809     -#define nop() __asm__ __volatile__ ("nop")
6810     -
6811     -#define xchg(ptr,v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr))))
6812     -
6813     -#define tas(ptr) (xchg((ptr),1))
6814     -
6815     -struct __xchg_dummy { unsigned long a[100]; };
6816     -#define __xg(x) ((struct __xchg_dummy *)(x))
6817     +static inline unsigned long xen_read_cr0(void)
6818     +{
6819     + unsigned long val;
6820     + asm volatile("movl %%cr0,%0\n\t" :"=r" (val));
6821     + return val;
6822     +}
6823    
6824     +static inline void xen_write_cr0(unsigned long val)
6825     +{
6826     + asm volatile("movl %0,%%cr0": :"r" (val));
6827     +}
6828    
6829     -#ifdef CONFIG_X86_CMPXCHG64
6830     +#define xen_read_cr2() (current_vcpu_info()->arch.cr2)
6831    
6832     -/*
6833     - * The semantics of XCHGCMP8B are a bit strange, this is why
6834     - * there is a loop and the loading of %%eax and %%edx has to
6835     - * be inside. This inlines well in most cases, the cached
6836     - * cost is around ~38 cycles. (in the future we might want
6837     - * to do an SIMD/3DNOW!/MMX/FPU 64-bit store here, but that
6838     - * might have an implicit FPU-save as a cost, so it's not
6839     - * clear which path to go.)
6840     - *
6841     - * cmpxchg8b must be used with the lock prefix here to allow
6842     - * the instruction to be executed atomically, see page 3-102
6843     - * of the instruction set reference 24319102.pdf. We need
6844     - * the reader side to see the coherent 64bit value.
6845     - */
6846     -static inline void __set_64bit (unsigned long long * ptr,
6847     - unsigned int low, unsigned int high)
6848     +static inline void xen_write_cr2(unsigned long val)
6849     {
6850     - __asm__ __volatile__ (
6851     - "\n1:\t"
6852     - "movl (%0), %%eax\n\t"
6853     - "movl 4(%0), %%edx\n\t"
6854     - "lock cmpxchg8b (%0)\n\t"
6855     - "jnz 1b"
6856     - : /* no outputs */
6857     - : "D"(ptr),
6858     - "b"(low),
6859     - "c"(high)
6860     - : "ax","dx","memory");
6861     + asm volatile("movl %0,%%cr2": :"r" (val));
6862     }
6863    
6864     -static inline void __set_64bit_constant (unsigned long long *ptr,
6865     - unsigned long long value)
6866     +static inline unsigned long xen_read_cr3(void)
6867     {
6868     - __set_64bit(ptr,(unsigned int)(value), (unsigned int)((value)>>32ULL));
6869     + unsigned long val;
6870     + asm volatile("movl %%cr3,%0\n\t" :"=r" (val));
6871     + return mfn_to_pfn(xen_cr3_to_pfn(val)) << PAGE_SHIFT;
6872     }
6873     -#define ll_low(x) *(((unsigned int*)&(x))+0)
6874     -#define ll_high(x) *(((unsigned int*)&(x))+1)
6875    
6876     -static inline void __set_64bit_var (unsigned long long *ptr,
6877     - unsigned long long value)
6878     +static inline void xen_write_cr3(unsigned long val)
6879     {
6880     - __set_64bit(ptr,ll_low(value), ll_high(value));
6881     + val = xen_pfn_to_cr3(pfn_to_mfn(val >> PAGE_SHIFT));
6882     + asm volatile("movl %0,%%cr3": :"r" (val));
6883     }
6884    
6885     -#define set_64bit(ptr,value) \
6886     -(__builtin_constant_p(value) ? \
6887     - __set_64bit_constant(ptr, value) : \
6888     - __set_64bit_var(ptr, value) )
6889     +static inline unsigned long xen_read_cr4(void)
6890     +{
6891     + unsigned long val;
6892     + asm volatile("movl %%cr4,%0\n\t" :"=r" (val));
6893     + return val;
6894     +}
6895    
6896     -#define _set_64bit(ptr,value) \
6897     -(__builtin_constant_p(value) ? \
6898     - __set_64bit(ptr, (unsigned int)(value), (unsigned int)((value)>>32ULL) ) : \
6899     - __set_64bit(ptr, ll_low(value), ll_high(value)) )
6900     +static inline unsigned long xen_read_cr4_safe(void)
6901     +{
6902     + unsigned long val;
6903     + /* This could fault if %cr4 does not exist */
6904     + asm("1: movl %%cr4, %0 \n"
6905     + "2: \n"
6906     + ".section __ex_table,\"a\" \n"
6907     + ".long 1b,2b \n"
6908     + ".previous \n"
6909     + : "=r" (val): "0" (0));
6910     + return val;
6911     +}
6912    
6913     -#endif
6914     +static inline void xen_write_cr4(unsigned long val)
6915     +{
6916     + asm volatile("movl %0,%%cr4": :"r" (val));
6917     +}
6918    
6919     -/*
6920     - * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
6921     - * Note 2: xchg has side effect, so that attribute volatile is necessary,
6922     - * but generally the primitive is invalid, *ptr is output argument. --ANK
6923     - */
6924     -static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size)
6925     +static inline void xen_wbinvd(void)
6926     {
6927     - switch (size) {
6928     - case 1:
6929     - __asm__ __volatile__("xchgb %b0,%1"
6930     - :"=q" (x)
6931     - :"m" (*__xg(ptr)), "0" (x)
6932     - :"memory");
6933     - break;
6934     - case 2:
6935     - __asm__ __volatile__("xchgw %w0,%1"
6936     - :"=r" (x)
6937     - :"m" (*__xg(ptr)), "0" (x)
6938     - :"memory");
6939     - break;
6940     - case 4:
6941     - __asm__ __volatile__("xchgl %0,%1"
6942     - :"=r" (x)
6943     - :"m" (*__xg(ptr)), "0" (x)
6944     - :"memory");
6945     - break;
6946     - }
6947     - return x;
6948     + asm volatile("wbinvd": : :"memory");
6949     }
6950    
6951     -/*
6952     - * Atomic compare and exchange. Compare OLD with MEM, if identical,
6953     - * store NEW in MEM. Return the initial value in MEM. Success is
6954     - * indicated by comparing RETURN with OLD.
6955     - */
6956     +#define read_cr0() (xen_read_cr0())
6957     +#define write_cr0(x) (xen_write_cr0(x))
6958     +#define read_cr2() (xen_read_cr2())
6959     +#define write_cr2(x) (xen_write_cr2(x))
6960     +#define read_cr3() (xen_read_cr3())
6961     +#define write_cr3(x) (xen_write_cr3(x))
6962     +#define read_cr4() (xen_read_cr4())
6963     +#define read_cr4_safe() (xen_read_cr4_safe())
6964     +#define write_cr4(x) (xen_write_cr4(x))
6965     +#define wbinvd() (xen_wbinvd())
6966    
6967     -#ifdef CONFIG_X86_CMPXCHG
6968     -#define __HAVE_ARCH_CMPXCHG 1
6969     -#define cmpxchg(ptr,o,n)\
6970     - ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
6971     - (unsigned long)(n),sizeof(*(ptr))))
6972     -#define sync_cmpxchg(ptr,o,n)\
6973     - ((__typeof__(*(ptr)))__sync_cmpxchg((ptr),(unsigned long)(o),\
6974     - (unsigned long)(n),sizeof(*(ptr))))
6975     -#endif
6976     -
6977     -static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
6978     - unsigned long new, int size)
6979     -{
6980     - unsigned long prev;
6981     - switch (size) {
6982     - case 1:
6983     - __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
6984     - : "=a"(prev)
6985     - : "q"(new), "m"(*__xg(ptr)), "0"(old)
6986     - : "memory");
6987     - return prev;
6988     - case 2:
6989     - __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
6990     - : "=a"(prev)
6991     - : "r"(new), "m"(*__xg(ptr)), "0"(old)
6992     - : "memory");
6993     - return prev;
6994     - case 4:
6995     - __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %1,%2"
6996     - : "=a"(prev)
6997     - : "r"(new), "m"(*__xg(ptr)), "0"(old)
6998     - : "memory");
6999     - return prev;
7000     - }
7001     - return old;
7002     -}
7003     +/* Clear the 'TS' bit */
7004     +#define clts() (xen_clts())
7005    
7006     -/*
7007     - * Always use locked operations when touching memory shared with a
7008     - * hypervisor, since the system may be SMP even if the guest kernel
7009     - * isn't.
7010     - */
7011     -static inline unsigned long __sync_cmpxchg(volatile void *ptr,
7012     - unsigned long old,
7013     - unsigned long new, int size)
7014     -{
7015     - unsigned long prev;
7016     - switch (size) {
7017     - case 1:
7018     - __asm__ __volatile__("lock; cmpxchgb %b1,%2"
7019     - : "=a"(prev)
7020     - : "q"(new), "m"(*__xg(ptr)), "0"(old)
7021     - : "memory");
7022     - return prev;
7023     - case 2:
7024     - __asm__ __volatile__("lock; cmpxchgw %w1,%2"
7025     - : "=a"(prev)
7026     - : "r"(new), "m"(*__xg(ptr)), "0"(old)
7027     - : "memory");
7028     - return prev;
7029     - case 4:
7030     - __asm__ __volatile__("lock; cmpxchgl %1,%2"
7031     - : "=a"(prev)
7032     - : "r"(new), "m"(*__xg(ptr)), "0"(old)
7033     - : "memory");
7034     - return prev;
7035     - }
7036     - return old;
7037     -}
7038     +/* Set the 'TS' bit */
7039     +#define stts() (HYPERVISOR_fpu_taskswitch(1))
7040    
7041     -#ifndef CONFIG_X86_CMPXCHG
7042     -/*
7043     - * Building a kernel capable running on 80386. It may be necessary to
7044     - * simulate the cmpxchg on the 80386 CPU. For that purpose we define
7045     - * a function for each of the sizes we support.
7046     - */
7047     +#endif /* __KERNEL__ */
7048    
7049     -extern unsigned long cmpxchg_386_u8(volatile void *, u8, u8);
7050     -extern unsigned long cmpxchg_386_u16(volatile void *, u16, u16);
7051     -extern unsigned long cmpxchg_386_u32(volatile void *, u32, u32);
7052     -
7053     -static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old,
7054     - unsigned long new, int size)
7055     -{
7056     - switch (size) {
7057     - case 1:
7058     - return cmpxchg_386_u8(ptr, old, new);
7059     - case 2:
7060     - return cmpxchg_386_u16(ptr, old, new);
7061     - case 4:
7062     - return cmpxchg_386_u32(ptr, old, new);
7063     - }
7064     - return old;
7065     -}
7066     -
7067     -#define cmpxchg(ptr,o,n) \
7068     -({ \
7069     - __typeof__(*(ptr)) __ret; \
7070     - if (likely(boot_cpu_data.x86 > 3)) \
7071     - __ret = __cmpxchg((ptr), (unsigned long)(o), \
7072     - (unsigned long)(n), sizeof(*(ptr))); \
7073     - else \
7074     - __ret = cmpxchg_386((ptr), (unsigned long)(o), \
7075     - (unsigned long)(n), sizeof(*(ptr))); \
7076     - __ret; \
7077     -})
7078     -#endif
7079     -
7080     -#ifdef CONFIG_X86_CMPXCHG64
7081     -
7082     -static inline unsigned long long __cmpxchg64(volatile void *ptr, unsigned long long old,
7083     - unsigned long long new)
7084     -{
7085     - unsigned long long prev;
7086     - __asm__ __volatile__(LOCK_PREFIX "cmpxchg8b %3"
7087     - : "=A"(prev)
7088     - : "b"((unsigned long)new),
7089     - "c"((unsigned long)(new >> 32)),
7090     - "m"(*__xg(ptr)),
7091     - "0"(old)
7092     - : "memory");
7093     - return prev;
7094     -}
7095     -
7096     -#define cmpxchg64(ptr,o,n)\
7097     - ((__typeof__(*(ptr)))__cmpxchg64((ptr),(unsigned long long)(o),\
7098     - (unsigned long long)(n)))
7099     +static inline unsigned long get_limit(unsigned long segment)
7100     +{
7101     + unsigned long __limit;
7102     + __asm__("lsll %1,%0"
7103     + :"=r" (__limit):"r" (segment));
7104     + return __limit+1;
7105     +}
7106     +
7107     +#define nop() __asm__ __volatile__ ("nop")
7108    
7109     -#endif
7110     -
7111     /*
7112     * Force strict CPU ordering.
7113     * And yes, this is required on UP too when we're talking
7114     --- a/include/asm-x86/mach-xen/asm/system_64.h
7115     +++ b/include/asm-x86/mach-xen/asm/system_64.h
7116     @@ -3,7 +3,7 @@
7117    
7118     #include <linux/kernel.h>
7119     #include <asm/segment.h>
7120     -#include <asm/alternative.h>
7121     +#include <asm/cmpxchg.h>
7122    
7123     #include <asm/synch_bitops.h>
7124     #include <asm/hypervisor.h>
7125     @@ -43,7 +43,7 @@
7126     [threadrsp] "i" (offsetof(struct task_struct, thread.rsp)), \
7127     [ti_flags] "i" (offsetof(struct thread_info, flags)),\
7128     [tif_fork] "i" (TIF_FORK), \
7129     - [thread_info] "i" (offsetof(struct task_struct, thread_info)), \
7130     + [thread_info] "i" (offsetof(struct task_struct, stack)), \
7131     [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \
7132     : "memory", "cc" __EXTRA_CLOBBER)
7133    
7134     @@ -92,6 +92,12 @@
7135     machine_to_phys(__dummy); \
7136     })
7137    
7138     +static inline void write_cr3(unsigned long val)
7139     +{
7140     + val = phys_to_machine(val);
7141     + asm volatile("movq %0,%%cr3" :: "r" (val) : "memory");
7142     +}
7143     +
7144     static inline unsigned long read_cr4(void)
7145     {
7146     unsigned long cr4;
7147     @@ -101,7 +107,7 @@
7148    
7149     static inline void write_cr4(unsigned long val)
7150     {
7151     - asm volatile("movq %0,%%cr4" :: "r" (val));
7152     + asm volatile("movq %0,%%cr4" :: "r" (val) : "memory");
7153     }
7154    
7155     #define stts() (HYPERVISOR_fpu_taskswitch(1))
7156     @@ -122,100 +128,6 @@
7157    
7158     #define nop() __asm__ __volatile__ ("nop")
7159    
7160     -#define xchg(ptr,v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr))))
7161     -
7162     -#define tas(ptr) (xchg((ptr),1))
7163     -
7164     -#define __xg(x) ((volatile long *)(x))
7165     -
7166     -static inline void set_64bit(volatile unsigned long *ptr, unsigned long val)
7167     -{
7168     - *ptr = val;
7169     -}
7170     -
7171     -#define _set_64bit set_64bit
7172     -
7173     -/*
7174     - * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
7175     - * Note 2: xchg has side effect, so that attribute volatile is necessary,
7176     - * but generally the primitive is invalid, *ptr is output argument. --ANK
7177     - */
7178     -static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size)
7179     -{
7180     - switch (size) {
7181     - case 1:
7182     - __asm__ __volatile__("xchgb %b0,%1"
7183     - :"=q" (x)
7184     - :"m" (*__xg(ptr)), "0" (x)
7185     - :"memory");
7186     - break;
7187     - case 2:
7188     - __asm__ __volatile__("xchgw %w0,%1"
7189     - :"=r" (x)
7190     - :"m" (*__xg(ptr)), "0" (x)
7191     - :"memory");
7192     - break;
7193     - case 4:
7194     - __asm__ __volatile__("xchgl %k0,%1"
7195     - :"=r" (x)
7196     - :"m" (*__xg(ptr)), "0" (x)
7197     - :"memory");
7198     - break;
7199     - case 8:
7200     - __asm__ __volatile__("xchgq %0,%1"
7201     - :"=r" (x)
7202     - :"m" (*__xg(ptr)), "0" (x)
7203     - :"memory");
7204     - break;
7205     - }
7206     - return x;
7207     -}
7208     -
7209     -/*
7210     - * Atomic compare and exchange. Compare OLD with MEM, if identical,
7211     - * store NEW in MEM. Return the initial value in MEM. Success is
7212     - * indicated by comparing RETURN with OLD.
7213     - */
7214     -
7215     -#define __HAVE_ARCH_CMPXCHG 1
7216     -
7217     -static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
7218     - unsigned long new, int size)
7219     -{
7220     - unsigned long prev;
7221     - switch (size) {
7222     - case 1:
7223     - __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
7224     - : "=a"(prev)
7225     - : "q"(new), "m"(*__xg(ptr)), "0"(old)
7226     - : "memory");
7227     - return prev;
7228     - case 2:
7229     - __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
7230     - : "=a"(prev)
7231     - : "r"(new), "m"(*__xg(ptr)), "0"(old)
7232     - : "memory");
7233     - return prev;
7234     - case 4:
7235     - __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %k1,%2"
7236     - : "=a"(prev)
7237     - : "r"(new), "m"(*__xg(ptr)), "0"(old)
7238     - : "memory");
7239     - return prev;
7240     - case 8:
7241     - __asm__ __volatile__(LOCK_PREFIX "cmpxchgq %1,%2"
7242     - : "=a"(prev)
7243     - : "r"(new), "m"(*__xg(ptr)), "0"(old)
7244     - : "memory");
7245     - return prev;
7246     - }
7247     - return old;
7248     -}
7249     -
7250     -#define cmpxchg(ptr,o,n)\
7251     - ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
7252     - (unsigned long)(n),sizeof(*(ptr))))
7253     -
7254     #ifdef CONFIG_SMP
7255     #define smp_mb() mb()
7256     #define smp_rmb() rmb()
7257     --- a/include/asm-x86/mach-xen/asm/tlbflush_32.h
7258     +++ b/include/asm-x86/mach-xen/asm/tlbflush_32.h
7259     @@ -29,8 +29,13 @@
7260     * and page-granular flushes are available only on i486 and up.
7261     */
7262    
7263     +#define TLB_FLUSH_ALL 0xffffffff
7264     +
7265     +
7266     #ifndef CONFIG_SMP
7267    
7268     +#include <linux/sched.h>
7269     +
7270     #define flush_tlb() __flush_tlb()
7271     #define flush_tlb_all() __flush_tlb_all()
7272     #define local_flush_tlb() __flush_tlb()
7273     @@ -55,7 +60,7 @@
7274     __flush_tlb();
7275     }
7276    
7277     -#else
7278     +#else /* SMP */
7279    
7280     #include <asm/smp.h>
7281    
7282     @@ -84,9 +89,7 @@
7283     char __cacheline_padding[L1_CACHE_BYTES-8];
7284     };
7285     DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
7286     -
7287     -
7288     -#endif
7289     +#endif /* SMP */
7290    
7291     #define flush_tlb_kernel_range(start, end) flush_tlb_all()
7292    
7293     --- a/include/asm-x86/mach-xen/asm/tlbflush_64.h
7294     +++ b/include/asm-x86/mach-xen/asm/tlbflush_64.h
7295     @@ -2,7 +2,9 @@
7296     #define _X8664_TLBFLUSH_H
7297    
7298     #include <linux/mm.h>
7299     +#include <linux/sched.h>
7300     #include <asm/processor.h>
7301     +#include <asm/system.h>
7302    
7303     #define __flush_tlb() xen_tlb_flush()
7304    
7305     --- a/lib/swiotlb-xen.c
7306     +++ b/lib/swiotlb-xen.c
7307     @@ -729,7 +729,6 @@
7308     return (mask >= ((1UL << dma_bits) - 1));
7309     }
7310    
7311     -EXPORT_SYMBOL(swiotlb_init);
7312     EXPORT_SYMBOL(swiotlb_map_single);
7313     EXPORT_SYMBOL(swiotlb_unmap_single);
7314     EXPORT_SYMBOL(swiotlb_map_sg);
7315     --- a/net/core/dev.c
7316     +++ b/net/core/dev.c
7317     @@ -1590,12 +1590,17 @@
7318     inline int skb_checksum_setup(struct sk_buff *skb)
7319     {
7320     if (skb->proto_csum_blank) {
7321     + struct iphdr *iph;
7322     + unsigned char *th;
7323     +
7324     if (skb->protocol != htons(ETH_P_IP))
7325     goto out;
7326     - skb->h.raw = (unsigned char *)skb->nh.iph + 4*skb->nh.iph->ihl;
7327     - if (skb->h.raw >= skb->tail)
7328     + iph = ip_hdr(skb);
7329     + th = skb_network_header(skb) + 4 * iph->ihl;
7330     + if (th >= skb_tail_pointer(skb))
7331     goto out;
7332     - switch (skb->nh.iph->protocol) {
7333     + skb->csum_start = th - skb->head;
7334     + switch (iph->protocol) {
7335     case IPPROTO_TCP:
7336     skb->csum_offset = offsetof(struct tcphdr, check);
7337     break;
7338     @@ -1606,10 +1611,10 @@
7339     if (net_ratelimit())
7340     printk(KERN_ERR "Attempting to checksum a non-"
7341     "TCP/UDP packet, dropping a protocol"
7342     - " %d packet", skb->nh.iph->protocol);
7343     + " %d packet", iph->protocol);
7344     goto out;
7345     }
7346     - if ((skb->h.raw + skb->csum_offset + 2) > skb->tail)
7347     + if ((th + skb->csum_offset + 2) > skb_tail_pointer(skb))
7348     goto out;
7349     skb->ip_summed = CHECKSUM_PARTIAL;
7350     skb->proto_csum_blank = 0;
7351     --- a/scripts/Makefile.xen.awk
7352     +++ b/scripts/Makefile.xen.awk
7353     @@ -13,7 +13,7 @@
7354     next
7355     }
7356    
7357     -/:[[:space:]]*%\.[cS][[:space:]]/ {
7358     +/:[[:space:]]*\$\(src\)\/%\.[cS][[:space:]]/ {
7359     line = gensub(/%.([cS])/, "%-xen.\\1", "g", $0)
7360     line = gensub(/(single-used-m)/, "xen-\\1", "g", line)
7361     print line