Magellan Linux

Contents of /trunk/kernel26-xen/patches-2.6.25-r1/1023-2.6.25-xen-patch-2.6.22.patch



Revision 609
Fri May 23 17:35:37 2008 UTC by niro
File size: 212197 bytes
-using opensuse xen patchset, updated kernel configs

1 From: www.kernel.org
2 Subject: Update to 2.6.22
3 Patch-mainline: 2.6.22
4
5 Automatically created from "patches.kernel.org/patch-2.6.22" by xen-port-patches.py
6
7 Acked-by: jbeulich@novell.com
8
9 ---
10 arch/x86/Kconfig | 5
11 arch/x86/ia32/ia32entry-xen.S | 18 -
12 arch/x86/kernel/Makefile | 2
13 arch/x86/kernel/acpi/sleep_64-xen.c | 26 -
14 arch/x86/kernel/apic_32-xen.c | 1
15 arch/x86/kernel/apic_64-xen.c | 1
16 arch/x86/kernel/cpu/common-xen.c | 224 ++++---------
17 arch/x86/kernel/cpu/mtrr/main-xen.c | 2
18 arch/x86/kernel/e820_32-xen.c | 46 +-
19 arch/x86/kernel/e820_64-xen.c | 28 -
20 arch/x86/kernel/early_printk-xen.c | 27 -
21 arch/x86/kernel/entry_32-xen.S | 30 -
22 arch/x86/kernel/entry_64-xen.S | 7
23 arch/x86/kernel/genapic_64-xen.c | 106 +-----
24 arch/x86/kernel/genapic_xen_64.c | 3
25 arch/x86/kernel/head64-xen.c | 32 +
26 arch/x86/kernel/head_32-xen.S | 101 ------
27 arch/x86/kernel/head_64-xen.S | 37 --
28 arch/x86/kernel/io_apic_32-xen.c | 43 --
29 arch/x86/kernel/io_apic_64-xen.c | 39 --
30 arch/x86/kernel/ioport_32-xen.c | 2
31 arch/x86/kernel/ioport_64-xen.c | 2
32 arch/x86/kernel/irq_32-xen.c | 3
33 arch/x86/kernel/irq_64-xen.c | 34 +-
34 arch/x86/kernel/ldt_32-xen.c | 1
35 arch/x86/kernel/ldt_64-xen.c | 1
36 arch/x86/kernel/microcode-xen.c | 2
37 arch/x86/kernel/mpparse_32-xen.c | 3
38 arch/x86/kernel/mpparse_64-xen.c | 3
39 arch/x86/kernel/pci-dma_32-xen.c | 29 +
40 arch/x86/kernel/pci-swiotlb_64-xen.c | 2
41 arch/x86/kernel/process_32-xen.c | 27 +
42 arch/x86/kernel/process_64-xen.c | 16
43 arch/x86/kernel/quirks-xen.c | 63 ---
44 arch/x86/kernel/setup64-xen.c | 17 -
45 arch/x86/kernel/setup_64-xen.c | 30 -
46 arch/x86/kernel/smp_32-xen.c | 191 ++++-------
47 arch/x86/kernel/smp_64-xen.c | 29 -
48 arch/x86/kernel/time_32-xen.c | 62 +--
49 arch/x86/kernel/traps_32-xen.c | 46 +-
50 arch/x86/kernel/traps_64-xen.c | 55 +--
51 arch/x86/kernel/vsyscall_64-xen.c | 73 +++-
52 arch/x86/mm/fault_32-xen.c | 42 +-
53 arch/x86/mm/fault_64-xen.c | 15
54 arch/x86/mm/highmem_32-xen.c | 14
55 arch/x86/mm/init_32-xen.c | 157 ++++++---
56 arch/x86/mm/init_64-xen.c | 132 ++++---
57 arch/x86/mm/ioremap_32-xen.c | 1
58 arch/x86/mm/pageattr_64-xen.c | 27 +
59 arch/x86/mm/pgtable_32-xen.c | 210 +++++++-----
60 drivers/char/tpm/tpm_xen.c | 2
61 drivers/xen/blkfront/blkfront.c | 2
62 drivers/xen/char/mem.c | 1
63 drivers/xen/core/hypervisor_sysfs.c | 2
64 drivers/xen/core/smpboot.c | 49 +-
65 drivers/xen/core/xen_sysfs.c | 20 -
66 drivers/xen/netback/netback.c | 14
67 drivers/xen/netfront/netfront.c | 2
68 drivers/xen/pciback/xenbus.c | 2
69 drivers/xen/pcifront/xenbus.c | 4
70 drivers/xen/sfc_netback/accel_fwd.c | 7
71 drivers/xen/sfc_netback/accel_solarflare.c | 2
72 drivers/xen/sfc_netfront/accel_tso.c | 28 -
73 drivers/xen/sfc_netfront/accel_vi.c | 4
74 drivers/xen/sfc_netfront/accel_xenbus.c | 4
75 drivers/xen/xenoprof/xenoprofile.c | 2
76 fs/aio.c | 7
77 include/asm-x86/mach-xen/asm/desc_32.h | 119 ++++---
78 include/asm-x86/mach-xen/asm/desc_64.h | 30 -
79 include/asm-x86/mach-xen/asm/dma-mapping_64.h | 2
80 include/asm-x86/mach-xen/asm/fixmap_32.h | 9
81 include/asm-x86/mach-xen/asm/fixmap_64.h | 1
82 include/asm-x86/mach-xen/asm/highmem.h | 6
83 include/asm-x86/mach-xen/asm/io_32.h | 13
84 include/asm-x86/mach-xen/asm/irqflags_32.h | 78 ++--
85 include/asm-x86/mach-xen/asm/irqflags_64.h | 19 -
86 include/asm-x86/mach-xen/asm/mmu.h | 8
87 include/asm-x86/mach-xen/asm/mmu_64.h | 8
88 include/asm-x86/mach-xen/asm/mmu_context_32.h | 29 +
89 include/asm-x86/mach-xen/asm/mmu_context_64.h | 3
90 include/asm-x86/mach-xen/asm/page_64.h | 61 +--
91 include/asm-x86/mach-xen/asm/pgalloc_32.h | 3
92 include/asm-x86/mach-xen/asm/pgalloc_64.h | 15
93 include/asm-x86/mach-xen/asm/pgtable-2level.h | 43 +-
94 include/asm-x86/mach-xen/asm/pgtable-3level-defs.h | 2
95 include/asm-x86/mach-xen/asm/pgtable-3level.h | 61 ++-
96 include/asm-x86/mach-xen/asm/pgtable_32.h | 80 ++--
97 include/asm-x86/mach-xen/asm/pgtable_64.h | 83 ++---
98 include/asm-x86/mach-xen/asm/processor_32.h | 141 +++-----
99 include/asm-x86/mach-xen/asm/processor_64.h | 55 ---
100 include/asm-x86/mach-xen/asm/scatterlist_32.h | 2
101 include/asm-x86/mach-xen/asm/segment_32.h | 10
102 include/asm-x86/mach-xen/asm/smp_32.h | 117 +++++--
103 include/asm-x86/mach-xen/asm/smp_64.h | 20 -
104 include/asm-x86/mach-xen/asm/system_32.h | 348 ++++-----------------
105 include/asm-x86/mach-xen/asm/system_64.h | 106 ------
106 include/asm-x86/mach-xen/asm/tlbflush_32.h | 11
107 include/asm-x86/mach-xen/asm/tlbflush_64.h | 2
108 lib/swiotlb-xen.c | 1
109 net/core/dev.c | 15
110 scripts/Makefile.xen.awk | 2
111 101 files changed, 1642 insertions(+), 2080 deletions(-)
112
113 --- a/arch/x86/Kconfig
114 +++ b/arch/x86/Kconfig
115 @@ -1222,7 +1222,7 @@
116
117 config RELOCATABLE
118 bool "Build a relocatable kernel (EXPERIMENTAL)"
119 - depends on EXPERIMENTAL && !X86_XEN
120 + depends on EXPERIMENTAL && !X86_XEN && !X86_64_XEN
121 help
122 This builds a kernel image that retains relocation information
123 so it can be loaded someplace besides the default 1MB.
124 @@ -1276,7 +1276,6 @@
125 def_bool y
126 prompt "Compat VDSO support"
127 depends on X86_32 || IA32_EMULATION
128 - depends on !X86_XEN
129 help
130 Map the 32-bit VDSO to the predictable old-style address too.
131 ---help---
132 @@ -1453,7 +1452,7 @@
133 bool "PCI support" if !X86_VISWS
134 depends on !X86_VOYAGER
135 default y
136 - select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC)
137 + select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC && !X86_XEN && !X86_64_XEN)
138 help
139 Find out whether you have a PCI motherboard. PCI is the name of a
140 bus system, i.e. the way the CPU talks to the other stuff inside
141 --- a/arch/x86/ia32/ia32entry-xen.S
142 +++ b/arch/x86/ia32/ia32entry-xen.S
143 @@ -431,11 +431,7 @@
144 .quad sys_symlink
145 .quad sys_lstat
146 .quad sys_readlink /* 85 */
147 -#ifdef CONFIG_IA32_AOUT
148 .quad sys_uselib
149 -#else
150 - .quad quiet_ni_syscall
151 -#endif
152 .quad sys_swapon
153 .quad sys_reboot
154 .quad compat_sys_old_readdir
155 @@ -574,7 +570,7 @@
156 .quad quiet_ni_syscall /* tux */
157 .quad quiet_ni_syscall /* security */
158 .quad sys_gettid
159 - .quad sys_readahead /* 225 */
160 + .quad sys32_readahead /* 225 */
161 .quad sys_setxattr
162 .quad sys_lsetxattr
163 .quad sys_fsetxattr
164 @@ -599,7 +595,7 @@
165 .quad compat_sys_io_getevents
166 .quad compat_sys_io_submit
167 .quad sys_io_cancel
168 - .quad sys_fadvise64 /* 250 */
169 + .quad sys32_fadvise64 /* 250 */
170 .quad quiet_ni_syscall /* free_huge_pages */
171 .quad sys_exit_group
172 .quad sys32_lookup_dcookie
173 @@ -663,10 +659,14 @@
174 .quad compat_sys_set_robust_list
175 .quad compat_sys_get_robust_list
176 .quad sys_splice
177 - .quad sys_sync_file_range
178 - .quad sys_tee
179 + .quad sys32_sync_file_range
180 + .quad sys_tee /* 315 */
181 .quad compat_sys_vmsplice
182 .quad compat_sys_move_pages
183 .quad sys_getcpu
184 .quad sys_epoll_pwait
185 -ia32_syscall_end:
186 + .quad compat_sys_utimensat /* 320 */
187 + .quad compat_sys_signalfd
188 + .quad compat_sys_timerfd
189 + .quad sys_eventfd
190 +ia32_syscall_end:
191 --- a/arch/x86/kernel/Makefile
192 +++ b/arch/x86/kernel/Makefile
193 @@ -106,4 +106,4 @@
194
195 disabled-obj-$(CONFIG_XEN) := early-quirks.o hpet.o i8253.o i8259_$(BITS).o reboot.o \
196 smpboot_$(BITS).o tsc_$(BITS).o tsc_sync.o
197 -%/head_$(BITS).o %/head_$(BITS).s: $(if $(CONFIG_XEN),EXTRA_AFLAGS,dummy) :=
198 +%/head_64.o %/head_64.s: $(if $(CONFIG_XEN),EXTRA_AFLAGS,dummy) :=
199 --- a/arch/x86/kernel/acpi/sleep_64-xen.c
200 +++ b/arch/x86/kernel/acpi/sleep_64-xen.c
201 @@ -60,19 +60,6 @@
202 extern char wakeup_start, wakeup_end;
203
204 extern unsigned long acpi_copy_wakeup_routine(unsigned long);
205 -
206 -static pgd_t low_ptr;
207 -
208 -static void init_low_mapping(void)
209 -{
210 - pgd_t *slot0 = pgd_offset(current->mm, 0UL);
211 - low_ptr = *slot0;
212 - /* FIXME: We're playing with the current task's page tables here, which
213 - * is potentially dangerous on SMP systems.
214 - */
215 - set_pgd(slot0, *pgd_offset(current->mm, PAGE_OFFSET));
216 - local_flush_tlb();
217 -}
218 #endif
219
220 /**
221 @@ -84,8 +71,6 @@
222 int acpi_save_state_mem(void)
223 {
224 #ifndef CONFIG_ACPI_PV_SLEEP
225 - init_low_mapping();
226 -
227 memcpy((void *)acpi_wakeup_address, &wakeup_start,
228 &wakeup_end - &wakeup_start);
229 acpi_copy_wakeup_routine(acpi_wakeup_address);
230 @@ -98,10 +83,6 @@
231 */
232 void acpi_restore_state_mem(void)
233 {
234 -#ifndef CONFIG_ACPI_PV_SLEEP
235 - set_pgd(pgd_offset(current->mm, 0UL), low_ptr);
236 - local_flush_tlb();
237 -#endif
238 }
239
240 /**
241 @@ -115,10 +96,11 @@
242 void __init acpi_reserve_bootmem(void)
243 {
244 #ifndef CONFIG_ACPI_PV_SLEEP
245 - acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE);
246 - if ((&wakeup_end - &wakeup_start) > PAGE_SIZE)
247 + acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2);
248 + if ((&wakeup_end - &wakeup_start) > (PAGE_SIZE*2))
249 printk(KERN_CRIT
250 - "ACPI: Wakeup code way too big, will crash on attempt to suspend\n");
251 + "ACPI: Wakeup code way too big, will crash on attempt"
252 + " to suspend\n");
253 #endif
254 }
255
256 --- a/arch/x86/kernel/apic_32-xen.c
257 +++ b/arch/x86/kernel/apic_32-xen.c
258 @@ -19,7 +19,6 @@
259 #include <linux/mm.h>
260 #include <linux/delay.h>
261 #include <linux/bootmem.h>
262 -#include <linux/smp_lock.h>
263 #include <linux/interrupt.h>
264 #include <linux/mc146818rtc.h>
265 #include <linux/kernel_stat.h>
266 --- a/arch/x86/kernel/apic_64-xen.c
267 +++ b/arch/x86/kernel/apic_64-xen.c
268 @@ -19,7 +19,6 @@
269 #include <linux/mm.h>
270 #include <linux/delay.h>
271 #include <linux/bootmem.h>
272 -#include <linux/smp_lock.h>
273 #include <linux/interrupt.h>
274 #include <linux/mc146818rtc.h>
275 #include <linux/kernel_stat.h>
276 --- a/arch/x86/kernel/cpu/common-xen.c
277 +++ b/arch/x86/kernel/cpu/common-xen.c
278 @@ -22,16 +22,40 @@
279 #define phys_pkg_id(a,b) a
280 #endif
281 #endif
282 -#include <asm/pda.h>
283 #include <asm/hypervisor.h>
284
285 #include "cpu.h"
286
287 -DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
288 -EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr);
289 +DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
290 + [GDT_ENTRY_KERNEL_CS] = { 0x0000ffff, 0x00cf9a00 },
291 + [GDT_ENTRY_KERNEL_DS] = { 0x0000ffff, 0x00cf9200 },
292 + [GDT_ENTRY_DEFAULT_USER_CS] = { 0x0000ffff, 0x00cffa00 },
293 + [GDT_ENTRY_DEFAULT_USER_DS] = { 0x0000ffff, 0x00cff200 },
294 +#ifndef CONFIG_XEN
295 + /*
296 + * Segments used for calling PnP BIOS have byte granularity.
297 + * They code segments and data segments have fixed 64k limits,
298 + * the transfer segment sizes are set at run time.
299 + */
300 + [GDT_ENTRY_PNPBIOS_CS32] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
301 + [GDT_ENTRY_PNPBIOS_CS16] = { 0x0000ffff, 0x00009a00 },/* 16-bit code */
302 + [GDT_ENTRY_PNPBIOS_DS] = { 0x0000ffff, 0x00009200 }, /* 16-bit data */
303 + [GDT_ENTRY_PNPBIOS_TS1] = { 0x00000000, 0x00009200 },/* 16-bit data */
304 + [GDT_ENTRY_PNPBIOS_TS2] = { 0x00000000, 0x00009200 },/* 16-bit data */
305 + /*
306 + * The APM segments have byte granularity and their bases
307 + * are set at run time. All have 64k limits.
308 + */
309 + [GDT_ENTRY_APMBIOS_BASE] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
310 + /* 16-bit code */
311 + [GDT_ENTRY_APMBIOS_BASE+1] = { 0x0000ffff, 0x00009a00 },
312 + [GDT_ENTRY_APMBIOS_BASE+2] = { 0x0000ffff, 0x00409200 }, /* data */
313
314 -struct i386_pda *_cpu_pda[NR_CPUS] __read_mostly;
315 -EXPORT_SYMBOL(_cpu_pda);
316 + [GDT_ENTRY_ESPFIX_SS] = { 0x00000000, 0x00c09200 },
317 +#endif
318 + [GDT_ENTRY_PERCPU] = { 0x00000000, 0x00000000 },
319 +} };
320 +EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
321
322 static int cachesize_override __cpuinitdata = -1;
323 static int disable_x86_fxsr __cpuinitdata;
324 @@ -373,7 +397,7 @@
325 /*
326 * This does the hard work of actually picking apart the CPU stuff...
327 */
328 -void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
329 +static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
330 {
331 int i;
332
333 @@ -484,15 +508,22 @@
334
335 /* Init Machine Check Exception if available. */
336 mcheck_init(c);
337 +}
338
339 - if (c == &boot_cpu_data)
340 - sysenter_setup();
341 +void __init identify_boot_cpu(void)
342 +{
343 + identify_cpu(&boot_cpu_data);
344 + sysenter_setup();
345 enable_sep_cpu();
346 + mtrr_bp_init();
347 +}
348
349 - if (c == &boot_cpu_data)
350 - mtrr_bp_init();
351 - else
352 - mtrr_ap_init();
353 +void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
354 +{
355 + BUG_ON(c == &boot_cpu_data);
356 + identify_cpu(c);
357 + enable_sep_cpu();
358 + mtrr_ap_init();
359 }
360
361 #ifdef CONFIG_X86_HT
362 @@ -606,136 +637,47 @@
363 #endif
364 }
365
366 -/* Make sure %gs is initialized properly in idle threads */
367 +/* Make sure %fs is initialized properly in idle threads */
368 struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
369 {
370 memset(regs, 0, sizeof(struct pt_regs));
371 - regs->xfs = __KERNEL_PDA;
372 + regs->xfs = __KERNEL_PERCPU;
373 return regs;
374 }
375
376 -static __cpuinit int alloc_gdt(int cpu)
377 +/* Current gdt points %fs at the "master" per-cpu area: after this,
378 + * it's on the real one. */
379 +void switch_to_new_gdt(void)
380 {
381 - struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
382 - struct desc_struct *gdt;
383 - struct i386_pda *pda;
384 -
385 - gdt = (struct desc_struct *)cpu_gdt_descr->address;
386 - pda = cpu_pda(cpu);
387 -
388 - /*
389 - * This is a horrible hack to allocate the GDT. The problem
390 - * is that cpu_init() is called really early for the boot CPU
391 - * (and hence needs bootmem) but much later for the secondary
392 - * CPUs, when bootmem will have gone away
393 - */
394 - if (NODE_DATA(0)->bdata->node_bootmem_map) {
395 - BUG_ON(gdt != NULL || pda != NULL);
396 -
397 - gdt = alloc_bootmem_pages(PAGE_SIZE);
398 - pda = alloc_bootmem(sizeof(*pda));
399 - /* alloc_bootmem(_pages) panics on failure, so no check */
400 -
401 - memset(gdt, 0, PAGE_SIZE);
402 - memset(pda, 0, sizeof(*pda));
403 - } else {
404 - /* GDT and PDA might already have been allocated if
405 - this is a CPU hotplug re-insertion. */
406 - if (gdt == NULL)
407 - gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL);
408 -
409 - if (pda == NULL)
410 - pda = kmalloc_node(sizeof(*pda), GFP_KERNEL, cpu_to_node(cpu));
411 -
412 - if (unlikely(!gdt || !pda)) {
413 - free_pages((unsigned long)gdt, 0);
414 - kfree(pda);
415 - return 0;
416 - }
417 - }
418 -
419 - cpu_gdt_descr->address = (unsigned long)gdt;
420 - cpu_pda(cpu) = pda;
421 -
422 - return 1;
423 -}
424 -
425 -/* Initial PDA used by boot CPU */
426 -struct i386_pda boot_pda = {
427 - ._pda = &boot_pda,
428 - .cpu_number = 0,
429 - .pcurrent = &init_task,
430 -};
431 -
432 -static inline void set_kernel_fs(void)
433 -{
434 - /* Set %fs for this CPU's PDA. Memory clobber is to create a
435 - barrier with respect to any PDA operations, so the compiler
436 - doesn't move any before here. */
437 - asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_PDA) : "memory");
438 -}
439 -
440 -/* Initialize the CPU's GDT and PDA. The boot CPU does this for
441 - itself, but secondaries find this done for them. */
442 -__cpuinit int init_gdt(int cpu, struct task_struct *idle)
443 -{
444 - struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
445 - struct desc_struct *gdt;
446 - struct i386_pda *pda;
447 -
448 - /* For non-boot CPUs, the GDT and PDA should already have been
449 - allocated. */
450 - if (!alloc_gdt(cpu)) {
451 - printk(KERN_CRIT "CPU%d failed to allocate GDT or PDA\n", cpu);
452 - return 0;
453 - }
454 -
455 - gdt = (struct desc_struct *)cpu_gdt_descr->address;
456 - pda = cpu_pda(cpu);
457 -
458 - BUG_ON(gdt == NULL || pda == NULL);
459 -
460 - /*
461 - * Initialize the per-CPU GDT with the boot GDT,
462 - * and set up the GDT descriptor:
463 - */
464 - memcpy(gdt, cpu_gdt_table, GDT_SIZE);
465 - cpu_gdt_descr->size = GDT_SIZE - 1;
466 -
467 - pack_descriptor((u32 *)&gdt[GDT_ENTRY_PDA].a,
468 - (u32 *)&gdt[GDT_ENTRY_PDA].b,
469 - (unsigned long)pda, sizeof(*pda) - 1,
470 - 0x80 | DESCTYPE_S | 0x2, 0); /* present read-write data segment */
471 -
472 - memset(pda, 0, sizeof(*pda));
473 - pda->_pda = pda;
474 - pda->cpu_number = cpu;
475 - pda->pcurrent = idle;
476 -
477 - return 1;
478 -}
479 -
480 -void __cpuinit cpu_set_gdt(int cpu)
481 -{
482 - struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
483 + struct Xgt_desc_struct gdt_descr;
484 unsigned long va, frames[16];
485 int f;
486
487 - for (va = cpu_gdt_descr->address, f = 0;
488 - va < cpu_gdt_descr->address + cpu_gdt_descr->size;
489 + gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
490 + gdt_descr.size = GDT_SIZE - 1;
491 +
492 + for (va = gdt_descr.address, f = 0;
493 + va < gdt_descr.address + gdt_descr.size;
494 va += PAGE_SIZE, f++) {
495 frames[f] = virt_to_mfn(va);
496 make_lowmem_page_readonly(
497 (void *)va, XENFEAT_writable_descriptor_tables);
498 }
499 - BUG_ON(HYPERVISOR_set_gdt(frames, (cpu_gdt_descr->size + 1) / 8));
500 -
501 - set_kernel_fs();
502 + if (HYPERVISOR_set_gdt(frames, (gdt_descr.size + 1) / 8))
503 + BUG();
504 + asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
505 }
506
507 -/* Common CPU init for both boot and secondary CPUs */
508 -static void __cpuinit _cpu_init(int cpu, struct task_struct *curr)
509 +/*
510 + * cpu_init() initializes state that is per-CPU. Some data is already
511 + * initialized (naturally) in the bootstrap process, such as the GDT
512 + * and IDT. We reload them nevertheless, this function acts as a
513 + * 'CPU state barrier', nothing should get across.
514 + */
515 +void __cpuinit cpu_init(void)
516 {
517 + int cpu = smp_processor_id();
518 + struct task_struct *curr = current;
519 #ifndef CONFIG_X86_NO_TSS
520 struct tss_struct * t = &per_cpu(init_tss, cpu);
521 #endif
522 @@ -757,6 +699,8 @@
523 set_in_cr4(X86_CR4_TSD);
524 }
525
526 + switch_to_new_gdt();
527 +
528 /*
529 * Set up and load the per-CPU TSS and LDT
530 */
531 @@ -794,38 +738,6 @@
532 mxcsr_feature_mask_init();
533 }
534
535 -/* Entrypoint to initialize secondary CPU */
536 -void __cpuinit secondary_cpu_init(void)
537 -{
538 - int cpu = smp_processor_id();
539 - struct task_struct *curr = current;
540 -
541 - _cpu_init(cpu, curr);
542 -}
543 -
544 -/*
545 - * cpu_init() initializes state that is per-CPU. Some data is already
546 - * initialized (naturally) in the bootstrap process, such as the GDT
547 - * and IDT. We reload them nevertheless, this function acts as a
548 - * 'CPU state barrier', nothing should get across.
549 - */
550 -void __cpuinit cpu_init(void)
551 -{
552 - int cpu = smp_processor_id();
553 - struct task_struct *curr = current;
554 -
555 - /* Set up the real GDT and PDA, so we can transition from the
556 - boot versions. */
557 - if (!init_gdt(cpu, curr)) {
558 - /* failed to allocate something; not much we can do... */
559 - for (;;)
560 - local_irq_enable();
561 - }
562 -
563 - cpu_set_gdt(cpu);
564 - _cpu_init(cpu, curr);
565 -}
566 -
567 #ifdef CONFIG_HOTPLUG_CPU
568 void __cpuinit cpu_uninit(void)
569 {
570 --- a/arch/x86/kernel/cpu/mtrr/main-xen.c
571 +++ b/arch/x86/kernel/cpu/mtrr/main-xen.c
572 @@ -167,7 +167,7 @@
573 EXPORT_SYMBOL(mtrr_add);
574 EXPORT_SYMBOL(mtrr_del);
575
576 -void __init mtrr_bp_init(void)
577 +__init void mtrr_bp_init(void)
578 {
579 }
580
581 --- a/arch/x86/kernel/e820_32-xen.c
582 +++ b/arch/x86/kernel/e820_32-xen.c
583 @@ -162,26 +162,27 @@
584
585 static int __init romsignature(const unsigned char *rom)
586 {
587 + const unsigned short * const ptr = (const unsigned short *)rom;
588 unsigned short sig;
589
590 - return probe_kernel_address((const unsigned short *)rom, sig) == 0 &&
591 - sig == ROMSIGNATURE;
592 + return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE;
593 }
594
595 -static int __init romchecksum(unsigned char *rom, unsigned long length)
596 +static int __init romchecksum(const unsigned char *rom, unsigned long length)
597 {
598 - unsigned char sum;
599 + unsigned char sum, c;
600
601 - for (sum = 0; length; length--)
602 - sum += *rom++;
603 - return sum == 0;
604 + for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--)
605 + sum += c;
606 + return !length && !sum;
607 }
608
609 static void __init probe_roms(void)
610 {
611 + const unsigned char *rom;
612 unsigned long start, length, upper;
613 - unsigned char *rom;
614 - int i;
615 + unsigned char c;
616 + int i;
617
618 #ifdef CONFIG_XEN
619 /* Nothing to do if not running in dom0. */
620 @@ -198,8 +199,11 @@
621
622 video_rom_resource.start = start;
623
624 + if (probe_kernel_address(rom + 2, c) != 0)
625 + continue;
626 +
627 /* 0 < length <= 0x7f * 512, historically */
628 - length = rom[2] * 512;
629 + length = c * 512;
630
631 /* if checksum okay, trust length byte */
632 if (length && romchecksum(rom, length))
633 @@ -233,8 +237,11 @@
634 if (!romsignature(rom))
635 continue;
636
637 + if (probe_kernel_address(rom + 2, c) != 0)
638 + continue;
639 +
640 /* 0 < length <= 0x7f * 512, historically */
641 - length = rom[2] * 512;
642 + length = c * 512;
643
644 /* but accept any length that fits if checksum okay */
645 if (!length || start + length > upper || !romchecksum(rom, length))
646 @@ -249,7 +256,7 @@
647 }
648
649 #ifdef CONFIG_XEN
650 -static struct e820map machine_e820 __initdata;
651 +static struct e820map machine_e820;
652 #define e820 machine_e820
653 #endif
654
655 @@ -409,10 +416,8 @@
656 ____________________33__
657 ______________________4_
658 */
659 - printk("sanitize start\n");
660 /* if there's only one memory region, don't bother */
661 if (*pnr_map < 2) {
662 - printk("sanitize bail 0\n");
663 return -1;
664 }
665
666 @@ -421,7 +426,6 @@
667 /* bail out if we find any unreasonable addresses in bios map */
668 for (i=0; i<old_nr; i++)
669 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) {
670 - printk("sanitize bail 1\n");
671 return -1;
672 }
673
674 @@ -517,7 +521,6 @@
675 memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
676 *pnr_map = new_nr;
677
678 - printk("sanitize end\n");
679 return 0;
680 }
681
682 @@ -552,7 +555,6 @@
683 unsigned long long size = biosmap->size;
684 unsigned long long end = start + size;
685 unsigned long type = biosmap->type;
686 - printk("copy_e820_map() start: %016Lx size: %016Lx end: %016Lx type: %ld\n", start, size, end, type);
687
688 /* Overflow in 64 bits? Ignore the memory map. */
689 if (start > end)
690 @@ -564,17 +566,11 @@
691 * Not right. Fix it up.
692 */
693 if (type == E820_RAM) {
694 - printk("copy_e820_map() type is E820_RAM\n");
695 if (start < 0x100000ULL && end > 0xA0000ULL) {
696 - printk("copy_e820_map() lies in range...\n");
697 - if (start < 0xA0000ULL) {
698 - printk("copy_e820_map() start < 0xA0000ULL\n");
699 + if (start < 0xA0000ULL)
700 add_memory_region(start, 0xA0000ULL-start, type);
701 - }
702 - if (end <= 0x100000ULL) {
703 - printk("copy_e820_map() end <= 0x100000ULL\n");
704 + if (end <= 0x100000ULL)
705 continue;
706 - }
707 start = 0x100000ULL;
708 size = end - start;
709 }
710 --- a/arch/x86/kernel/e820_64-xen.c
711 +++ b/arch/x86/kernel/e820_64-xen.c
712 @@ -17,6 +17,8 @@
713 #include <linux/kexec.h>
714 #include <linux/module.h>
715 #include <linux/mm.h>
716 +#include <linux/suspend.h>
717 +#include <linux/pfn.h>
718
719 #include <asm/pgtable.h>
720 #include <asm/page.h>
721 @@ -28,7 +30,7 @@
722
723 struct e820map e820 __initdata;
724 #ifdef CONFIG_XEN
725 -struct e820map machine_e820 __initdata;
726 +struct e820map machine_e820;
727 #endif
728
729 /*
730 @@ -293,22 +295,6 @@
731 }
732
733 #ifndef CONFIG_XEN
734 -/* Mark pages corresponding to given address range as nosave */
735 -static void __init
736 -e820_mark_nosave_range(unsigned long start, unsigned long end)
737 -{
738 - unsigned long pfn, max_pfn;
739 -
740 - if (start >= end)
741 - return;
742 -
743 - printk("Nosave address range: %016lx - %016lx\n", start, end);
744 - max_pfn = end >> PAGE_SHIFT;
745 - for (pfn = start >> PAGE_SHIFT; pfn < max_pfn; pfn++)
746 - if (pfn_valid(pfn))
747 - SetPageNosave(pfn_to_page(pfn));
748 -}
749 -
750 /*
751 * Find the ranges of physical addresses that do not correspond to
752 * e820 RAM areas and mark the corresponding pages as nosave for software
753 @@ -327,13 +313,13 @@
754 struct e820entry *ei = &e820.map[i];
755
756 if (paddr < ei->addr)
757 - e820_mark_nosave_range(paddr,
758 - round_up(ei->addr, PAGE_SIZE));
759 + register_nosave_region(PFN_DOWN(paddr),
760 + PFN_UP(ei->addr));
761
762 paddr = round_down(ei->addr + ei->size, PAGE_SIZE);
763 if (ei->type != E820_RAM)
764 - e820_mark_nosave_range(round_up(ei->addr, PAGE_SIZE),
765 - paddr);
766 + register_nosave_region(PFN_UP(ei->addr),
767 + PFN_DOWN(paddr));
768
769 if (paddr >= (end_pfn << PAGE_SHIFT))
770 break;
771 --- a/arch/x86/kernel/early_printk-xen.c
772 +++ b/arch/x86/kernel/early_printk-xen.c
773 @@ -11,11 +11,10 @@
774
775 #ifdef __i386__
776 #include <asm/setup.h>
777 -#define VGABASE (__ISA_IO_base + 0xb8000)
778 #else
779 #include <asm/bootsetup.h>
780 -#define VGABASE ((void __iomem *)0xffffffff800b8000UL)
781 #endif
782 +#define VGABASE (__ISA_IO_base + 0xb8000)
783
784 #ifndef CONFIG_XEN
785 static int max_ypos = 25, max_xpos = 80;
786 @@ -93,9 +92,9 @@
787 static void early_serial_write(struct console *con, const char *s, unsigned n)
788 {
789 while (*s && n-- > 0) {
790 - early_serial_putc(*s);
791 if (*s == '\n')
792 early_serial_putc('\r');
793 + early_serial_putc(*s);
794 s++;
795 }
796 }
797 @@ -205,7 +204,7 @@
798 return ret;
799 }
800
801 -void __init simnow_init(char *str)
802 +static void __init simnow_init(char *str)
803 {
804 char *fn = "klog";
805 if (*str == '=')
806 @@ -277,22 +276,12 @@
807 early_console = &simnow_console;
808 keep_early = 1;
809 }
810 +
811 + if (keep_early)
812 + early_console->flags &= ~CON_BOOT;
813 + else
814 + early_console->flags |= CON_BOOT;
815 register_console(early_console);
816 return 0;
817 }
818 -
819 early_param("earlyprintk", setup_early_printk);
820 -
821 -void __init disable_early_printk(void)
822 -{
823 - if (!early_console_initialized || !early_console)
824 - return;
825 - if (!keep_early) {
826 - printk("disabling early console\n");
827 - unregister_console(early_console);
828 - early_console_initialized = 0;
829 - } else {
830 - printk("keeping early console\n");
831 - }
832 -}
833 -
834 --- a/arch/x86/kernel/entry_32-xen.S
835 +++ b/arch/x86/kernel/entry_32-xen.S
836 @@ -15,7 +15,7 @@
837 * I changed all the .align's to 4 (16 byte alignment), as that's faster
838 * on a 486.
839 *
840 - * Stack layout in 'ret_from_system_call':
841 + * Stack layout in 'syscall_exit':
842 * ptrace needs to have all regs on the stack.
843 * if the order here is changed, it needs to be
844 * updated in fork.c:copy_process, signal.c:do_signal,
845 @@ -135,7 +135,7 @@
846 movl $(__USER_DS), %edx; \
847 movl %edx, %ds; \
848 movl %edx, %es; \
849 - movl $(__KERNEL_PDA), %edx; \
850 + movl $(__KERNEL_PERCPU), %edx; \
851 movl %edx, %fs
852
853 #define RESTORE_INT_REGS \
854 @@ -308,16 +308,12 @@
855 pushl $(__USER_CS)
856 CFI_ADJUST_CFA_OFFSET 4
857 /*CFI_REL_OFFSET cs, 0*/
858 -#ifndef CONFIG_COMPAT_VDSO
859 /*
860 * Push current_thread_info()->sysenter_return to the stack.
861 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
862 * pushed above; +8 corresponds to copy_thread's esp0 setting.
863 */
864 pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
865 -#else
866 - pushl $SYSENTER_RETURN
867 -#endif
868 CFI_ADJUST_CFA_OFFSET 4
869 CFI_REL_OFFSET eip, 0
870
871 @@ -345,7 +341,7 @@
872 jae syscall_badsys
873 call *sys_call_table(,%eax,4)
874 movl %eax,PT_EAX(%esp)
875 - DISABLE_INTERRUPTS(CLBR_ECX|CLBR_EDX)
876 + DISABLE_INTERRUPTS(CLBR_ANY)
877 TRACE_IRQS_OFF
878 movl TI_flags(%ebp), %ecx
879 testw $_TIF_ALLWORK_MASK, %cx
880 @@ -400,10 +396,6 @@
881 CFI_ADJUST_CFA_OFFSET 4
882 SAVE_ALL
883 GET_THREAD_INFO(%ebp)
884 - testl $TF_MASK,PT_EFLAGS(%esp)
885 - jz no_singlestep
886 - orl $_TIF_SINGLESTEP,TI_flags(%ebp)
887 -no_singlestep:
888 # system call tracing in operation / emulation
889 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
890 testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
891 @@ -418,6 +410,10 @@
892 # setting need_resched or sigpending
893 # between sampling and the iret
894 TRACE_IRQS_OFF
895 + testl $TF_MASK,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit
896 + jz no_singlestep
897 + orl $_TIF_SINGLESTEP,TI_flags(%ebp)
898 +no_singlestep:
899 movl TI_flags(%ebp), %ecx
900 testw $_TIF_ALLWORK_MASK, %cx # current->work
901 jne syscall_exit_work
902 @@ -635,9 +631,7 @@
903 #ifndef CONFIG_XEN
904 #define FIXUP_ESPFIX_STACK \
905 /* since we are on a wrong stack, we cant make it a C code :( */ \
906 - movl %fs:PDA_cpu, %ebx; \
907 - PER_CPU(cpu_gdt_descr, %ebx); \
908 - movl GDS_address(%ebx), %ebx; \
909 + PER_CPU(gdt_page, %ebx); \
910 GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \
911 addl %esp, %eax; \
912 pushl $__KERNEL_DS; \
913 @@ -710,7 +704,7 @@
914 SAVE_ALL; \
915 TRACE_IRQS_OFF \
916 movl %esp,%eax; \
917 - call smp_/**/name; \
918 + call smp_##name; \
919 jmp ret_from_intr; \
920 CFI_ENDPROC; \
921 ENDPROC(name)
922 @@ -718,10 +712,6 @@
923 /* The include is where all of the SMP etc. interrupts come from */
924 #include "entry_arch.h"
925
926 -/* This alternate entry is needed because we hijack the apic LVTT */
927 -#if defined(CONFIG_VMI) && defined(CONFIG_X86_LOCAL_APIC)
928 -BUILD_INTERRUPT(apic_vmi_timer_interrupt,LOCAL_TIMER_VECTOR)
929 -#endif
930 #else
931 #define UNWIND_ESPFIX_STACK
932 #endif
933 @@ -764,7 +754,7 @@
934 pushl %fs
935 CFI_ADJUST_CFA_OFFSET 4
936 /*CFI_REL_OFFSET fs, 0*/
937 - movl $(__KERNEL_PDA), %ecx
938 + movl $(__KERNEL_PERCPU), %ecx
939 movl %ecx, %fs
940 UNWIND_ESPFIX_STACK
941 popl %ecx
942 --- a/arch/x86/kernel/entry_64-xen.S
943 +++ b/arch/x86/kernel/entry_64-xen.S
944 @@ -1254,3 +1254,10 @@
945 ret
946 CFI_ENDPROC
947 ENDPROC(call_softirq)
948 +
949 +KPROBE_ENTRY(ignore_sysret)
950 + CFI_STARTPROC
951 + mov $-ENOSYS,%eax
952 + HYPERVISOR_IRET 0
953 + CFI_ENDPROC
954 +ENDPROC(ignore_sysret)
955 --- a/arch/x86/kernel/genapic_64-xen.c
956 +++ b/arch/x86/kernel/genapic_64-xen.c
957 @@ -11,123 +11,57 @@
958 #include <linux/threads.h>
959 #include <linux/cpumask.h>
960 #include <linux/string.h>
961 +#include <linux/module.h>
962 #include <linux/kernel.h>
963 #include <linux/ctype.h>
964 #include <linux/init.h>
965 -#include <linux/module.h>
966
967 #include <asm/smp.h>
968 #include <asm/ipi.h>
969 +#include <asm/genapic.h>
970
971 -#if defined(CONFIG_ACPI)
972 +#ifdef CONFIG_ACPI
973 #include <acpi/acpi_bus.h>
974 #endif
975
976 /* which logical CPU number maps to which CPU (physical APIC ID) */
977 -u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
978 +u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly
979 + = { [0 ... NR_CPUS-1] = BAD_APICID };
980 EXPORT_SYMBOL(x86_cpu_to_apicid);
981 -u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
982
983 -extern struct genapic apic_cluster;
984 -extern struct genapic apic_flat;
985 -extern struct genapic apic_physflat;
986 +u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
987
988 #ifndef CONFIG_XEN
989 -struct genapic *genapic = &apic_flat;
990 -struct genapic *genapic_force;
991 +struct genapic __read_mostly *genapic = &apic_flat;
992 #else
993 extern struct genapic apic_xen;
994 -struct genapic *genapic = &apic_xen;
995 +struct genapic __read_mostly *genapic = &apic_xen;
996 #endif
997
998
999 /*
1000 * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
1001 */
1002 -void __init clustered_apic_check(void)
1003 +void __init setup_apic_routing(void)
1004 {
1005 #ifndef CONFIG_XEN
1006 - long i;
1007 - u8 clusters, max_cluster;
1008 - u8 id;
1009 - u8 cluster_cnt[NUM_APIC_CLUSTERS];
1010 - int max_apic = 0;
1011 -
1012 - /* genapic selection can be forced because of certain quirks.
1013 - */
1014 - if (genapic_force) {
1015 - genapic = genapic_force;
1016 - goto print;
1017 - }
1018 -
1019 -#if defined(CONFIG_ACPI)
1020 +#ifdef CONFIG_ACPI
1021 /*
1022 - * Some x86_64 machines use physical APIC mode regardless of how many
1023 - * procs/clusters are present (x86_64 ES7000 is an example).
1024 + * Quirk: some x86_64 machines can only use physical APIC mode
1025 + * regardless of how many processors are present (x86_64 ES7000
1026 + * is an example).
1027 */
1028 - if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID)
1029 - if (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL) {
1030 - genapic = &apic_cluster;
1031 - goto print;
1032 - }
1033 -#endif
1034 -
1035 - memset(cluster_cnt, 0, sizeof(cluster_cnt));
1036 - for (i = 0; i < NR_CPUS; i++) {
1037 - id = bios_cpu_apicid[i];
1038 - if (id == BAD_APICID)
1039 - continue;
1040 - if (id > max_apic)
1041 - max_apic = id;
1042 - cluster_cnt[APIC_CLUSTERID(id)]++;
1043 - }
1044 -
1045 - /* Don't use clustered mode on AMD platforms. */
1046 - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
1047 + if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
1048 + (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL))
1049 genapic = &apic_physflat;
1050 -#ifndef CONFIG_HOTPLUG_CPU
1051 - /* In the CPU hotplug case we cannot use broadcast mode
1052 - because that opens a race when a CPU is removed.
1053 - Stay at physflat mode in this case.
1054 - It is bad to do this unconditionally though. Once
1055 - we have ACPI platform support for CPU hotplug
1056 - we should detect hotplug capablity from ACPI tables and
1057 - only do this when really needed. -AK */
1058 - if (max_apic <= 8)
1059 - genapic = &apic_flat;
1060 + else
1061 #endif
1062 - goto print;
1063 - }
1064
1065 - clusters = 0;
1066 - max_cluster = 0;
1067 -
1068 - for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
1069 - if (cluster_cnt[i] > 0) {
1070 - ++clusters;
1071 - if (cluster_cnt[i] > max_cluster)
1072 - max_cluster = cluster_cnt[i];
1073 - }
1074 - }
1075 -
1076 - /*
1077 - * If we have clusters <= 1 and CPUs <= 8 in cluster 0, then flat mode,
1078 - * else if max_cluster <= 4 and cluster_cnt[15] == 0, clustered logical
1079 - * else physical mode.
1080 - * (We don't use lowest priority delivery + HW APIC IRQ steering, so
1081 - * can ignore the clustered logical case and go straight to physical.)
1082 - */
1083 - if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster) {
1084 -#ifdef CONFIG_HOTPLUG_CPU
1085 - /* Don't use APIC shortcuts in CPU hotplug to avoid races */
1086 - genapic = &apic_physflat;
1087 -#else
1088 + if (cpus_weight(cpu_possible_map) <= 8)
1089 genapic = &apic_flat;
1090 -#endif
1091 - } else
1092 - genapic = &apic_cluster;
1093 + else
1094 + genapic = &apic_physflat;
1095
1096 -print:
1097 #else
1098 /* hardcode to xen apic functions */
1099 genapic = &apic_xen;
1100 @@ -135,7 +69,7 @@
1101 printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
1102 }
1103
1104 -/* Same for both flat and clustered. */
1105 +/* Same for both flat and physical. */
1106
1107 #ifdef CONFIG_XEN
1108 extern void xen_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest);
1109 --- a/arch/x86/kernel/genapic_xen_64.c
1110 +++ b/arch/x86/kernel/genapic_xen_64.c
1111 @@ -21,9 +21,8 @@
1112 #include <asm/ipi.h>
1113 #else
1114 #include <asm/apic.h>
1115 -#include <asm/apicdef.h>
1116 -#include <asm/genapic.h>
1117 #endif
1118 +#include <asm/genapic.h>
1119 #include <xen/evtchn.h>
1120
1121 DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
1122 --- a/arch/x86/kernel/head64-xen.c
1123 +++ b/arch/x86/kernel/head64-xen.c
1124 @@ -22,13 +22,21 @@
1125 #include <asm/setup.h>
1126 #include <asm/desc.h>
1127 #include <asm/pgtable.h>
1128 +#include <asm/tlbflush.h>
1129 #include <asm/sections.h>
1130
1131 unsigned long start_pfn;
1132
1133 +#ifndef CONFIG_XEN
1134 +static void __init zap_identity_mappings(void)
1135 +{
1136 + pgd_t *pgd = pgd_offset_k(0UL);
1137 + pgd_clear(pgd);
1138 + __flush_tlb();
1139 +}
1140 +
1141 /* Don't add a printk in there. printk relies on the PDA which is not initialized
1142 yet. */
1143 -#if 0
1144 static void __init clear_bss(void)
1145 {
1146 memset(__bss_start, 0,
1147 @@ -37,26 +45,25 @@
1148 #endif
1149
1150 #define NEW_CL_POINTER 0x228 /* Relative to real mode data */
1151 -#define OLD_CL_MAGIC_ADDR 0x90020
1152 +#define OLD_CL_MAGIC_ADDR 0x20
1153 #define OLD_CL_MAGIC 0xA33F
1154 -#define OLD_CL_BASE_ADDR 0x90000
1155 -#define OLD_CL_OFFSET 0x90022
1156 +#define OLD_CL_OFFSET 0x22
1157
1158 static void __init copy_bootdata(char *real_mode_data)
1159 {
1160 #ifndef CONFIG_XEN
1161 - int new_data;
1162 + unsigned long new_data;
1163 char * command_line;
1164
1165 memcpy(x86_boot_params, real_mode_data, BOOT_PARAM_SIZE);
1166 - new_data = *(int *) (x86_boot_params + NEW_CL_POINTER);
1167 + new_data = *(u32 *) (x86_boot_params + NEW_CL_POINTER);
1168 if (!new_data) {
1169 - if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) {
1170 + if (OLD_CL_MAGIC != *(u16 *)(real_mode_data + OLD_CL_MAGIC_ADDR)) {
1171 return;
1172 }
1173 - new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET;
1174 + new_data = __pa(real_mode_data) + *(u16 *)(real_mode_data + OLD_CL_OFFSET);
1175 }
1176 - command_line = (char *) ((u64)(new_data));
1177 + command_line = __va(new_data);
1178 memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE);
1179 #else
1180 int max_cmdline;
1181 @@ -98,10 +105,13 @@
1182 while ((1UL << machine_to_phys_order) < machine_to_phys_nr_ents )
1183 machine_to_phys_order++;
1184
1185 -#if 0
1186 +#ifndef CONFIG_XEN
1187 /* clear bss before set_intr_gate with early_idt_handler */
1188 clear_bss();
1189
1190 + /* Make NULL pointers segfault */
1191 + zap_identity_mappings();
1192 +
1193 for (i = 0; i < IDT_ENTRIES; i++)
1194 set_intr_gate(i, early_idt_handler);
1195 asm volatile("lidt %0" :: "m" (idt_descr));
1196 @@ -113,7 +123,7 @@
1197 cpu_pda(i) = &boot_cpu_pda[i];
1198
1199 pda_init(0);
1200 - copy_bootdata(real_mode_data);
1201 + copy_bootdata(__va(real_mode_data));
1202 #ifdef CONFIG_SMP
1203 cpu_set(0, cpu_online_map);
1204 #endif
1205 --- a/arch/x86/kernel/head_32-xen.S
1206 +++ b/arch/x86/kernel/head_32-xen.S
1207 @@ -37,7 +37,8 @@
1208 /* Set up the stack pointer */
1209 movl $(init_thread_union+THREAD_SIZE),%esp
1210
1211 - call setup_pda
1212 + movl %ss,%eax
1213 + movl %eax,%fs # gets reset once there's real percpu
1214
1215 /* get vendor info */
1216 xorl %eax,%eax # call CPUID with 0 -> return vendor ID
1217 @@ -64,55 +65,11 @@
1218 xorl %eax,%eax # Clear GS
1219 movl %eax,%gs
1220
1221 - movl $(__KERNEL_PDA),%eax
1222 - mov %eax,%fs
1223 -
1224 cld # gcc2 wants the direction flag cleared at all times
1225
1226 pushl $0 # fake return address for unwinder
1227 jmp start_kernel
1228
1229 -/*
1230 - * Point the GDT at this CPU's PDA. This will be
1231 - * cpu_gdt_table and boot_pda.
1232 - */
1233 -ENTRY(setup_pda)
1234 - /* get the PDA pointer */
1235 - movl $boot_pda, %eax
1236 -
1237 - /* slot the PDA address into the GDT */
1238 - mov $cpu_gdt_table, %ecx
1239 - mov %ax, (__KERNEL_PDA+0+2)(%ecx) /* base & 0x0000ffff */
1240 - shr $16, %eax
1241 - mov %al, (__KERNEL_PDA+4+0)(%ecx) /* base & 0x00ff0000 */
1242 - mov %ah, (__KERNEL_PDA+4+3)(%ecx) /* base & 0xff000000 */
1243 -
1244 - # %esi still points to start_info, and no registers
1245 - # need to be preserved.
1246 -
1247 - movl XEN_START_mfn_list(%esi), %ebx
1248 - movl $(cpu_gdt_table - __PAGE_OFFSET), %eax
1249 - shrl $PAGE_SHIFT, %eax
1250 - movl (%ebx,%eax,4), %ecx
1251 - pushl %ecx # frame number for set_gdt below
1252 -
1253 - xorl %esi, %esi
1254 - xorl %edx, %edx
1255 - shldl $PAGE_SHIFT, %ecx, %edx
1256 - shll $PAGE_SHIFT, %ecx
1257 - orl $0x61, %ecx
1258 - movl $cpu_gdt_table, %ebx
1259 - movl $__HYPERVISOR_update_va_mapping, %eax
1260 - int $0x82
1261 -
1262 - movl $(PAGE_SIZE_asm / 8), %ecx
1263 - movl %esp, %ebx
1264 - movl $__HYPERVISOR_set_gdt, %eax
1265 - int $0x82
1266 -
1267 - popl %ecx
1268 - ret
1269 -
1270 #define HYPERCALL_PAGE_OFFSET 0x1000
1271 .org HYPERCALL_PAGE_OFFSET
1272 ENTRY(hypercall_page)
1273 @@ -138,60 +95,6 @@
1274 */
1275 .data
1276
1277 -/*
1278 - * The Global Descriptor Table contains 28 quadwords, per-CPU.
1279 - */
1280 - .section .data.page_aligned, "aw"
1281 - .align PAGE_SIZE_asm
1282 -ENTRY(cpu_gdt_table)
1283 - .quad 0x0000000000000000 /* NULL descriptor */
1284 - .quad 0x0000000000000000 /* 0x0b reserved */
1285 - .quad 0x0000000000000000 /* 0x13 reserved */
1286 - .quad 0x0000000000000000 /* 0x1b reserved */
1287 - .quad 0x0000000000000000 /* 0x20 unused */
1288 - .quad 0x0000000000000000 /* 0x28 unused */
1289 - .quad 0x0000000000000000 /* 0x33 TLS entry 1 */
1290 - .quad 0x0000000000000000 /* 0x3b TLS entry 2 */
1291 - .quad 0x0000000000000000 /* 0x43 TLS entry 3 */
1292 - .quad 0x0000000000000000 /* 0x4b reserved */
1293 - .quad 0x0000000000000000 /* 0x53 reserved */
1294 - .quad 0x0000000000000000 /* 0x5b reserved */
1295 -
1296 - .quad 0x00cf9a000000ffff /* 0x60 kernel 4GB code at 0x00000000 */
1297 - .quad 0x00cf92000000ffff /* 0x68 kernel 4GB data at 0x00000000 */
1298 - .quad 0x00cffa000000ffff /* 0x73 user 4GB code at 0x00000000 */
1299 - .quad 0x00cff2000000ffff /* 0x7b user 4GB data at 0x00000000 */
1300 -
1301 - .quad 0x0000000000000000 /* 0x80 TSS descriptor */
1302 - .quad 0x0000000000000000 /* 0x88 LDT descriptor */
1303 -
1304 - /*
1305 - * Segments used for calling PnP BIOS have byte granularity.
1306 - * They code segments and data segments have fixed 64k limits,
1307 - * the transfer segment sizes are set at run time.
1308 - */
1309 - .quad 0x0000000000000000 /* 0x90 32-bit code */
1310 - .quad 0x0000000000000000 /* 0x98 16-bit code */
1311 - .quad 0x0000000000000000 /* 0xa0 16-bit data */
1312 - .quad 0x0000000000000000 /* 0xa8 16-bit data */
1313 - .quad 0x0000000000000000 /* 0xb0 16-bit data */
1314 -
1315 - /*
1316 - * The APM segments have byte granularity and their bases
1317 - * are set at run time. All have 64k limits.
1318 - */
1319 - .quad 0x0000000000000000 /* 0xb8 APM CS code */
1320 - .quad 0x0000000000000000 /* 0xc0 APM CS 16 code (16 bit) */
1321 - .quad 0x0000000000000000 /* 0xc8 APM DS data */
1322 -
1323 - .quad 0x0000000000000000 /* 0xd0 - ESPFIX SS */
1324 - .quad 0x00cf92000000ffff /* 0xd8 - PDA */
1325 - .quad 0x0000000000000000 /* 0xe0 - unused */
1326 - .quad 0x0000000000000000 /* 0xe8 - unused */
1327 - .quad 0x0000000000000000 /* 0xf0 - unused */
1328 - .quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */
1329 - .align PAGE_SIZE_asm
1330 -
1331 #if CONFIG_XEN_COMPAT <= 0x030002
1332 /*
1333 * __xen_guest information
1334 --- a/arch/x86/kernel/head_64-xen.S
1335 +++ b/arch/x86/kernel/head_64-xen.S
1336 @@ -5,6 +5,7 @@
1337 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
1338 * Copyright (C) 2000 Karsten Keil <kkeil@suse.de>
1339 * Copyright (C) 2001,2002 Andi Kleen <ak@suse.de>
1340 + * Copyright (C) 2005 Eric Biederman <ebiederm@xmission.com>
1341 * Jun Nakajima <jun.nakajima@intel.com>
1342 * Modified for Xen
1343 */
1344 @@ -41,18 +42,15 @@
1345 .word gdt_end-cpu_gdt_table-1
1346 .long cpu_gdt_table-__START_KERNEL_map
1347 #endif
1348 -ENTRY(stext)
1349 -ENTRY(_stext)
1350
1351 - $page = 0
1352 +.balign PAGE_SIZE
1353 +
1354 #define NEXT_PAGE(name) \
1355 - $page = $page + 1; \
1356 - .org $page * 0x1000; \
1357 - phys_##name = $page * 0x1000 + __PHYSICAL_START; \
1358 + .balign PAGE_SIZE; \
1359 + phys_##name = . - .bootstrap.text; \
1360 ENTRY(name)
1361
1362 NEXT_PAGE(init_level4_pgt)
1363 - /* This gets initialized in x86_64_start_kernel */
1364 .fill 512,8,0
1365 NEXT_PAGE(init_level4_user_pgt)
1366 /*
1367 @@ -136,13 +134,13 @@
1368
1369 ENTRY(cpu_gdt_table)
1370 .quad 0x0000000000000000 /* NULL descriptor */
1371 + .quad 0x00cf9b000000ffff /* __KERNEL32_CS */
1372 + .quad 0x00af9b000000ffff /* __KERNEL_CS */
1373 + .quad 0x00cf93000000ffff /* __KERNEL_DS */
1374 + .quad 0x00cffb000000ffff /* __USER32_CS */
1375 + .quad 0x00cff3000000ffff /* __USER_DS, __USER32_DS */
1376 + .quad 0x00affb000000ffff /* __USER_CS */
1377 .quad 0x0 /* unused */
1378 - .quad 0x00af9a000000ffff /* __KERNEL_CS */
1379 - .quad 0x00cf92000000ffff /* __KERNEL_DS */
1380 - .quad 0x00cffa000000ffff /* __USER32_CS */
1381 - .quad 0x00cff2000000ffff /* __USER_DS, __USER32_DS */
1382 - .quad 0x00affa000000ffff /* __USER_CS */
1383 - .quad 0x00cf9a000000ffff /* __KERNEL32_CS */
1384 .quad 0,0 /* TSS */
1385 .quad 0,0 /* LDT */
1386 .quad 0,0,0 /* three TLS descriptors */
1387 @@ -165,14 +163,11 @@
1388 * __xen_guest information
1389 */
1390 .macro utoh value
1391 - .if (\value) < 0 || (\value) >= 0x10
1392 - utoh (((\value)>>4)&0x0fffffffffffffff)
1393 - .endif
1394 - .if ((\value) & 0xf) < 10
1395 - .byte '0' + ((\value) & 0xf)
1396 - .else
1397 - .byte 'A' + ((\value) & 0xf) - 10
1398 - .endif
1399 + i = 64
1400 + .rept 16
1401 + i = i - 4
1402 + .byte '0' + ((((\value) >> i) & 0xf) > 9) * ('0' - 'A' + 10) + (((\value) >> i) & 0xf)
1403 + .endr
1404 .endm
1405
1406 .section __xen_guest
1407 --- a/arch/x86/kernel/io_apic_32-xen.c
1408 +++ b/arch/x86/kernel/io_apic_32-xen.c
1409 @@ -25,7 +25,6 @@
1410 #include <linux/init.h>
1411 #include <linux/delay.h>
1412 #include <linux/sched.h>
1413 -#include <linux/smp_lock.h>
1414 #include <linux/mc146818rtc.h>
1415 #include <linux/compiler.h>
1416 #include <linux/acpi.h>
1417 @@ -35,6 +34,7 @@
1418 #include <linux/msi.h>
1419 #include <linux/htirq.h>
1420 #include <linux/freezer.h>
1421 +#include <linux/kthread.h>
1422
1423 #include <asm/io.h>
1424 #include <asm/smp.h>
1425 @@ -705,8 +705,6 @@
1426 unsigned long prev_balance_time = jiffies;
1427 long time_remaining = balanced_irq_interval;
1428
1429 - daemonize("kirqd");
1430 -
1431 /* push everything to CPU 0 to give us a starting point. */
1432 for (i = 0 ; i < NR_IRQS ; i++) {
1433 irq_desc[i].pending_mask = cpumask_of_cpu(0);
1434 @@ -766,10 +764,9 @@
1435 }
1436
1437 printk(KERN_INFO "Starting balanced_irq\n");
1438 - if (kernel_thread(balanced_irq, NULL, CLONE_KERNEL) >= 0)
1439 + if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd")))
1440 return 0;
1441 - else
1442 - printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
1443 + printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
1444 failed:
1445 for_each_possible_cpu(i) {
1446 kfree(irq_cpu_data[i].irq_delta);
1447 @@ -1445,10 +1442,6 @@
1448 enable_8259A_irq(0);
1449 }
1450
1451 -static inline void UNEXPECTED_IO_APIC(void)
1452 -{
1453 -}
1454 -
1455 void __init print_IO_APIC(void)
1456 {
1457 int apic, i;
1458 @@ -1488,34 +1481,12 @@
1459 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
1460 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
1461 printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
1462 - if (reg_00.bits.ID >= get_physical_broadcast())
1463 - UNEXPECTED_IO_APIC();
1464 - if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2)
1465 - UNEXPECTED_IO_APIC();
1466
1467 printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw);
1468 printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
1469 - if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */
1470 - (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */
1471 - (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */
1472 - (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */
1473 - (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */
1474 - (reg_01.bits.entries != 0x2E) &&
1475 - (reg_01.bits.entries != 0x3F)
1476 - )
1477 - UNEXPECTED_IO_APIC();
1478
1479 printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
1480 printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
1481 - if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */
1482 - (reg_01.bits.version != 0x10) && /* oldest IO-APICs */
1483 - (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */
1484 - (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */
1485 - (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */
1486 - )
1487 - UNEXPECTED_IO_APIC();
1488 - if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2)
1489 - UNEXPECTED_IO_APIC();
1490
1491 /*
1492 * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
1493 @@ -1525,8 +1496,6 @@
1494 if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
1495 printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
1496 printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
1497 - if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2)
1498 - UNEXPECTED_IO_APIC();
1499 }
1500
1501 /*
1502 @@ -1538,8 +1507,6 @@
1503 reg_03.raw != reg_01.raw) {
1504 printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
1505 printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT);
1506 - if (reg_03.bits.__reserved_1)
1507 - UNEXPECTED_IO_APIC();
1508 }
1509
1510 printk(KERN_DEBUG ".... IRQ redirection table:\n");
1511 @@ -2670,19 +2637,19 @@
1512 if (irq < 0)
1513 return irq;
1514
1515 - set_irq_msi(irq, desc);
1516 ret = msi_compose_msg(dev, irq, &msg);
1517 if (ret < 0) {
1518 destroy_irq(irq);
1519 return ret;
1520 }
1521
1522 + set_irq_msi(irq, desc);
1523 write_msi_msg(irq, &msg);
1524
1525 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq,
1526 "edge");
1527
1528 - return irq;
1529 + return 0;
1530 }
1531
1532 void arch_teardown_msi_irq(unsigned int irq)
1533 --- a/arch/x86/kernel/io_apic_64-xen.c
1534 +++ b/arch/x86/kernel/io_apic_64-xen.c
1535 @@ -25,7 +25,6 @@
1536 #include <linux/init.h>
1537 #include <linux/delay.h>
1538 #include <linux/sched.h>
1539 -#include <linux/smp_lock.h>
1540 #include <linux/pci.h>
1541 #include <linux/mc146818rtc.h>
1542 #include <linux/acpi.h>
1543 @@ -897,10 +896,6 @@
1544 enable_8259A_irq(0);
1545 }
1546
1547 -void __init UNEXPECTED_IO_APIC(void)
1548 -{
1549 -}
1550 -
1551 void __apicdebuginit print_IO_APIC(void)
1552 {
1553 int apic, i;
1554 @@ -936,40 +931,16 @@
1555 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
1556 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
1557 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
1558 - if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2)
1559 - UNEXPECTED_IO_APIC();
1560
1561 printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
1562 printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
1563 - if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */
1564 - (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */
1565 - (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */
1566 - (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */
1567 - (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */
1568 - (reg_01.bits.entries != 0x2E) &&
1569 - (reg_01.bits.entries != 0x3F) &&
1570 - (reg_01.bits.entries != 0x03)
1571 - )
1572 - UNEXPECTED_IO_APIC();
1573
1574 printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
1575 printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
1576 - if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */
1577 - (reg_01.bits.version != 0x02) && /* 82801BA IO-APICs (ICH2) */
1578 - (reg_01.bits.version != 0x10) && /* oldest IO-APICs */
1579 - (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */
1580 - (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */
1581 - (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */
1582 - )
1583 - UNEXPECTED_IO_APIC();
1584 - if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2)
1585 - UNEXPECTED_IO_APIC();
1586
1587 if (reg_01.bits.version >= 0x10) {
1588 printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
1589 printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
1590 - if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2)
1591 - UNEXPECTED_IO_APIC();
1592 }
1593
1594 printk(KERN_DEBUG ".... IRQ redirection table:\n");
1595 @@ -1401,8 +1372,7 @@
1596
1597 vector = ~get_irq_regs()->orig_rax;
1598 me = smp_processor_id();
1599 - if ((vector == cfg->vector) &&
1600 - cpu_isset(smp_processor_id(), cfg->domain)) {
1601 + if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
1602 cpumask_t cleanup_mask;
1603
1604 cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
1605 @@ -1437,7 +1407,7 @@
1606
1607 /*
1608 * We must acknowledge the irq before we move it or the acknowledge will
1609 - * not propogate properly.
1610 + * not propagate properly.
1611 */
1612 ack_APIC_irq();
1613
1614 @@ -1520,6 +1490,7 @@
1615 static void end_lapic_irq (unsigned int i) { /* nothing */ }
1616
1617 static struct hw_interrupt_type lapic_irq_type __read_mostly = {
1618 + .name = "local-APIC",
1619 .typename = "local-APIC-edge",
1620 .startup = NULL, /* startup_irq() not used for IRQ0 */
1621 .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
1622 @@ -1989,18 +1960,18 @@
1623 if (irq < 0)
1624 return irq;
1625
1626 - set_irq_msi(irq, desc);
1627 ret = msi_compose_msg(dev, irq, &msg);
1628 if (ret < 0) {
1629 destroy_irq(irq);
1630 return ret;
1631 }
1632
1633 + set_irq_msi(irq, desc);
1634 write_msi_msg(irq, &msg);
1635
1636 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
1637
1638 - return irq;
1639 + return 0;
1640 }
1641
1642 void arch_teardown_msi_irq(unsigned int irq)
1643 --- a/arch/x86/kernel/ioport_32-xen.c
1644 +++ b/arch/x86/kernel/ioport_32-xen.c
1645 @@ -12,10 +12,10 @@
1646 #include <linux/types.h>
1647 #include <linux/ioport.h>
1648 #include <linux/smp.h>
1649 -#include <linux/smp_lock.h>
1650 #include <linux/stddef.h>
1651 #include <linux/slab.h>
1652 #include <linux/thread_info.h>
1653 +#include <linux/syscalls.h>
1654 #include <xen/interface/physdev.h>
1655
1656 /* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
1657 --- a/arch/x86/kernel/ioport_64-xen.c
1658 +++ b/arch/x86/kernel/ioport_64-xen.c
1659 @@ -13,10 +13,10 @@
1660 #include <linux/ioport.h>
1661 #include <linux/mm.h>
1662 #include <linux/smp.h>
1663 -#include <linux/smp_lock.h>
1664 #include <linux/stddef.h>
1665 #include <linux/slab.h>
1666 #include <linux/thread_info.h>
1667 +#include <linux/syscalls.h>
1668 #include <xen/interface/physdev.h>
1669
1670 /* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
1671 --- a/arch/x86/kernel/irq_32-xen.c
1672 +++ b/arch/x86/kernel/irq_32-xen.c
1673 @@ -24,6 +24,9 @@
1674 DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp;
1675 EXPORT_PER_CPU_SYMBOL(irq_stat);
1676
1677 +DEFINE_PER_CPU(struct pt_regs *, irq_regs);
1678 +EXPORT_PER_CPU_SYMBOL(irq_regs);
1679 +
1680 /*
1681 * 'what should we do if we get a hw irq event on an illegal vector'.
1682 * each architecture has to answer this themselves.
1683 --- a/arch/x86/kernel/irq_64-xen.c
1684 +++ b/arch/x86/kernel/irq_64-xen.c
1685 @@ -32,7 +32,7 @@
1686 */
1687 static inline void stack_overflow_check(struct pt_regs *regs)
1688 {
1689 - u64 curbase = (u64) current->thread_info;
1690 + u64 curbase = (u64)task_stack_page(current);
1691 static unsigned long warned = -60*HZ;
1692
1693 if (regs->rsp >= curbase && regs->rsp <= curbase + THREAD_SIZE &&
1694 @@ -145,17 +145,43 @@
1695
1696 for (irq = 0; irq < NR_IRQS; irq++) {
1697 cpumask_t mask;
1698 + int break_affinity = 0;
1699 + int set_affinity = 1;
1700 +
1701 if (irq == 2)
1702 continue;
1703
1704 + /* interrupt's are disabled at this point */
1705 + spin_lock(&irq_desc[irq].lock);
1706 +
1707 + if (!irq_has_action(irq) ||
1708 + cpus_equal(irq_desc[irq].affinity, map)) {
1709 + spin_unlock(&irq_desc[irq].lock);
1710 + continue;
1711 + }
1712 +
1713 cpus_and(mask, irq_desc[irq].affinity, map);
1714 - if (any_online_cpu(mask) == NR_CPUS) {
1715 - printk("Breaking affinity for irq %i\n", irq);
1716 + if (cpus_empty(mask)) {
1717 + break_affinity = 1;
1718 mask = map;
1719 }
1720 +
1721 + if (irq_desc[irq].chip->mask)
1722 + irq_desc[irq].chip->mask(irq);
1723 +
1724 if (irq_desc[irq].chip->set_affinity)
1725 irq_desc[irq].chip->set_affinity(irq, mask);
1726 - else if (irq_desc[irq].action && !(warned++))
1727 + else if (!(warned++))
1728 + set_affinity = 0;
1729 +
1730 + if (irq_desc[irq].chip->unmask)
1731 + irq_desc[irq].chip->unmask(irq);
1732 +
1733 + spin_unlock(&irq_desc[irq].lock);
1734 +
1735 + if (break_affinity && set_affinity)
1736 + printk("Broke affinity for irq %i\n", irq);
1737 + else if (!set_affinity)
1738 printk("Cannot set affinity for irq %i\n", irq);
1739 }
1740
1741 --- a/arch/x86/kernel/ldt_32-xen.c
1742 +++ b/arch/x86/kernel/ldt_32-xen.c
1743 @@ -10,7 +10,6 @@
1744 #include <linux/string.h>
1745 #include <linux/mm.h>
1746 #include <linux/smp.h>
1747 -#include <linux/smp_lock.h>
1748 #include <linux/vmalloc.h>
1749 #include <linux/slab.h>
1750
1751 --- a/arch/x86/kernel/ldt_64-xen.c
1752 +++ b/arch/x86/kernel/ldt_64-xen.c
1753 @@ -13,7 +13,6 @@
1754 #include <linux/string.h>
1755 #include <linux/mm.h>
1756 #include <linux/smp.h>
1757 -#include <linux/smp_lock.h>
1758 #include <linux/vmalloc.h>
1759 #include <linux/slab.h>
1760
1761 --- a/arch/x86/kernel/microcode-xen.c
1762 +++ b/arch/x86/kernel/microcode-xen.c
1763 @@ -135,7 +135,7 @@
1764 return 0;
1765 }
1766
1767 -static void __exit microcode_dev_exit (void)
1768 +static void microcode_dev_exit (void)
1769 {
1770 misc_deregister(&microcode_dev);
1771 }
1772 --- a/arch/x86/kernel/mpparse_32-xen.c
1773 +++ b/arch/x86/kernel/mpparse_32-xen.c
1774 @@ -18,7 +18,6 @@
1775 #include <linux/acpi.h>
1776 #include <linux/delay.h>
1777 #include <linux/bootmem.h>
1778 -#include <linux/smp_lock.h>
1779 #include <linux/kernel_stat.h>
1780 #include <linux/mc146818rtc.h>
1781 #include <linux/bitops.h>
1782 @@ -484,7 +483,7 @@
1783 }
1784 ++mpc_record;
1785 }
1786 - clustered_apic_check();
1787 + setup_apic_routing();
1788 if (!num_processors)
1789 printk(KERN_ERR "SMP mptable: no processors registered!\n");
1790 return num_processors;
1791 --- a/arch/x86/kernel/mpparse_64-xen.c
1792 +++ b/arch/x86/kernel/mpparse_64-xen.c
1793 @@ -17,7 +17,6 @@
1794 #include <linux/init.h>
1795 #include <linux/delay.h>
1796 #include <linux/bootmem.h>
1797 -#include <linux/smp_lock.h>
1798 #include <linux/kernel_stat.h>
1799 #include <linux/mc146818rtc.h>
1800 #include <linux/acpi.h>
1801 @@ -307,7 +306,7 @@
1802 }
1803 }
1804 }
1805 - clustered_apic_check();
1806 + setup_apic_routing();
1807 if (!num_processors)
1808 printk(KERN_ERR "MPTABLE: no processors registered!\n");
1809 return num_processors;
1810 --- a/arch/x86/kernel/pci-dma_32-xen.c
1811 +++ b/arch/x86/kernel/pci-dma_32-xen.c
1812 @@ -13,6 +13,7 @@
1813 #include <linux/pci.h>
1814 #include <linux/module.h>
1815 #include <linux/version.h>
1816 +#include <linux/pci.h>
1817 #include <asm/io.h>
1818 #include <xen/balloon.h>
1819 #include <xen/gnttab.h>
1820 @@ -284,7 +285,7 @@
1821 {
1822 void __iomem *mem_base = NULL;
1823 int pages = size >> PAGE_SHIFT;
1824 - int bitmap_size = (pages + 31)/32;
1825 + int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
1826
1827 if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
1828 goto out;
1829 @@ -357,6 +358,32 @@
1830 EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
1831 #endif /* ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY */
1832
1833 +#if defined(CONFIG_PCI) && !defined(CONFIG_XEN)
1834 +/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
1835 +
1836 +int forbid_dac;
1837 +EXPORT_SYMBOL(forbid_dac);
1838 +
1839 +static __devinit void via_no_dac(struct pci_dev *dev)
1840 +{
1841 + if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
1842 + printk(KERN_INFO "PCI: VIA PCI bridge detected. Disabling DAC.\n");
1843 + forbid_dac = 1;
1844 + }
1845 +}
1846 +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
1847 +
1848 +static int check_iommu(char *s)
1849 +{
1850 + if (!strcmp(s, "usedac")) {
1851 + forbid_dac = -1;
1852 + return 1;
1853 + }
1854 + return 0;
1855 +}
1856 +__setup("iommu=", check_iommu);
1857 +#endif
1858 +
1859 dma_addr_t
1860 dma_map_single(struct device *dev, void *ptr, size_t size,
1861 enum dma_data_direction direction)
1862 --- a/arch/x86/kernel/pci-swiotlb_64-xen.c
1863 +++ b/arch/x86/kernel/pci-swiotlb_64-xen.c
1864 @@ -16,7 +16,7 @@
1865
1866 void swiotlb_init(void);
1867
1868 -struct dma_mapping_ops swiotlb_dma_ops = {
1869 +const struct dma_mapping_ops swiotlb_dma_ops = {
1870 #if 0
1871 .mapping_error = swiotlb_dma_mapping_error,
1872 .alloc_coherent = swiotlb_alloc_coherent,
1873 --- a/arch/x86/kernel/process_32-xen.c
1874 +++ b/arch/x86/kernel/process_32-xen.c
1875 @@ -21,7 +21,6 @@
1876 #include <linux/mm.h>
1877 #include <linux/elfcore.h>
1878 #include <linux/smp.h>
1879 -#include <linux/smp_lock.h>
1880 #include <linux/stddef.h>
1881 #include <linux/slab.h>
1882 #include <linux/vmalloc.h>
1883 @@ -39,6 +38,7 @@
1884 #include <linux/random.h>
1885 #include <linux/personality.h>
1886 #include <linux/tick.h>
1887 +#include <linux/percpu.h>
1888
1889 #include <asm/uaccess.h>
1890 #include <asm/pgtable.h>
1891 @@ -61,7 +61,6 @@
1892
1893 #include <asm/tlbflush.h>
1894 #include <asm/cpu.h>
1895 -#include <asm/pda.h>
1896
1897 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
1898
1899 @@ -70,6 +69,12 @@
1900 unsigned long boot_option_idle_override = 0;
1901 EXPORT_SYMBOL(boot_option_idle_override);
1902
1903 +DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
1904 +EXPORT_PER_CPU_SYMBOL(current_task);
1905 +
1906 +DEFINE_PER_CPU(int, cpu_number);
1907 +EXPORT_PER_CPU_SYMBOL(cpu_number);
1908 +
1909 /*
1910 * Return saved PC of a blocked thread.
1911 */
1912 @@ -168,6 +173,7 @@
1913 if (__get_cpu_var(cpu_idle_state))
1914 __get_cpu_var(cpu_idle_state) = 0;
1915
1916 + check_pgt_cache();
1917 rmb();
1918 idle = xen_idle; /* no alternatives */
1919
1920 @@ -218,18 +224,19 @@
1921 {
1922 }
1923
1924 -static int __init idle_setup (char *str)
1925 +static int __init idle_setup(char *str)
1926 {
1927 - if (!strncmp(str, "poll", 4)) {
1928 + if (!strcmp(str, "poll")) {
1929 printk("using polling idle threads.\n");
1930 pm_idle = poll_idle;
1931 }
1932 + else
1933 + return -1;
1934
1935 boot_option_idle_override = 1;
1936 - return 1;
1937 + return 0;
1938 }
1939 -
1940 -__setup("idle=", idle_setup);
1941 +early_param("idle", idle_setup);
1942
1943 void show_regs(struct pt_regs * regs)
1944 {
1945 @@ -282,7 +289,7 @@
1946
1947 regs.xds = __USER_DS;
1948 regs.xes = __USER_DS;
1949 - regs.xfs = __KERNEL_PDA;
1950 + regs.xfs = __KERNEL_PERCPU;
1951 regs.orig_eax = -1;
1952 regs.eip = (unsigned long) kernel_thread_helper;
1953 regs.xcs = __KERNEL_CS | get_kernel_rpl();
1954 @@ -556,7 +563,7 @@
1955 * multicall to indicate FPU task switch, rather than
1956 * synchronously trapping to Xen.
1957 */
1958 - if (prev_p->thread_info->status & TS_USEDFPU) {
1959 + if (task_thread_info(prev_p)->status & TS_USEDFPU) {
1960 __save_init_fpu(prev_p); /* _not_ save_init_fpu() */
1961 mcl->op = __HYPERVISOR_fpu_taskswitch;
1962 mcl->args[0] = 1;
1963 @@ -648,7 +655,7 @@
1964 if (prev->gs | next->gs)
1965 loadsegment(gs, next->gs);
1966
1967 - write_pda(pcurrent, next_p);
1968 + x86_write_percpu(current_task, next_p);
1969
1970 return prev_p;
1971 }
1972 --- a/arch/x86/kernel/process_64-xen.c
1973 +++ b/arch/x86/kernel/process_64-xen.c
1974 @@ -39,6 +39,7 @@
1975 #include <linux/random.h>
1976 #include <linux/notifier.h>
1977 #include <linux/kprobes.h>
1978 +#include <linux/kdebug.h>
1979
1980 #include <asm/uaccess.h>
1981 #include <asm/pgtable.h>
1982 @@ -49,7 +50,6 @@
1983 #include <asm/mmu_context.h>
1984 #include <asm/pda.h>
1985 #include <asm/prctl.h>
1986 -#include <asm/kdebug.h>
1987 #include <xen/interface/platform.h>
1988 #include <xen/interface/physdev.h>
1989 #include <xen/interface/vcpu.h>
1990 @@ -232,16 +232,18 @@
1991
1992 static int __init idle_setup (char *str)
1993 {
1994 - if (!strncmp(str, "poll", 4)) {
1995 + if (!strcmp(str, "poll")) {
1996 printk("using polling idle threads.\n");
1997 pm_idle = poll_idle;
1998 - }
1999 + } else if (!strcmp(str, "mwait"))
2000 + force_mwait = 1;
2001 + else
2002 + return -1;
2003
2004 boot_option_idle_override = 1;
2005 - return 1;
2006 + return 0;
2007 }
2008 -
2009 -__setup("idle=", idle_setup);
2010 +early_param("idle", idle_setup);
2011
2012 /* Prints also some state that isn't saved in the pt_regs */
2013 void __show_regs(struct pt_regs * regs)
2014 @@ -540,7 +542,7 @@
2015 * The AMD workaround requires it to be after DS reload, or
2016 * after DS has been cleared, which we do in __prepare_arch_switch.
2017 */
2018 - if (prev_p->thread_info->status & TS_USEDFPU) {
2019 + if (task_thread_info(prev_p)->status & TS_USEDFPU) {
2020 __save_init_fpu(prev_p); /* _not_ save_init_fpu() */
2021 mcl->op = __HYPERVISOR_fpu_taskswitch;
2022 mcl->args[0] = 1;
2023 --- a/arch/x86/kernel/quirks-xen.c
2024 +++ b/arch/x86/kernel/quirks-xen.c
2025 @@ -3,12 +3,10 @@
2026 */
2027 #include <linux/pci.h>
2028 #include <linux/irq.h>
2029 -#include <asm/pci-direct.h>
2030 -#include <asm/genapic.h>
2031 -#include <asm/cpu.h>
2032
2033 #if defined(CONFIG_X86_IO_APIC) && (defined(CONFIG_SMP) || defined(CONFIG_XEN)) && defined(CONFIG_PCI)
2034 -static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev)
2035 +
2036 +static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
2037 {
2038 u8 config, rev;
2039 u32 word;
2040 @@ -16,7 +14,7 @@
2041 /* BIOS may enable hardware IRQ balancing for
2042 * E7520/E7320/E7525(revision ID 0x9 and below)
2043 * based platforms.
2044 - * For those platforms, make sure that the genapic is set to 'flat'
2045 + * Disable SW irqbalance/affinity on those platforms.
2046 */
2047 pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
2048 if (rev > 0x9)
2049 @@ -30,59 +28,20 @@
2050 raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
2051
2052 if (!(word & (1 << 13))) {
2053 -#ifndef CONFIG_XEN
2054 -#ifdef CONFIG_X86_64
2055 - if (genapic != &apic_flat)
2056 - panic("APIC mode must be flat on this system\n");
2057 -#elif defined(CONFIG_X86_GENERICARCH)
2058 - if (genapic != &apic_default)
2059 - panic("APIC mode must be default(flat) on this system. Use apic=default\n");
2060 -#endif
2061 -#endif
2062 - }
2063 -
2064 - /* put back the original value for config space*/
2065 - if (!(config & 0x2))
2066 - pci_write_config_byte(dev, 0xf4, config);
2067 -}
2068 -
2069 -void __init quirk_intel_irqbalance(void)
2070 -{
2071 - u8 config, rev;
2072 - u32 word;
2073 -
2074 - /* BIOS may enable hardware IRQ balancing for
2075 - * E7520/E7320/E7525(revision ID 0x9 and below)
2076 - * based platforms.
2077 - * Disable SW irqbalance/affinity on those platforms.
2078 - */
2079 - rev = read_pci_config_byte(0, 0, 0, PCI_CLASS_REVISION);
2080 - if (rev > 0x9)
2081 - return;
2082 -
2083 - printk(KERN_INFO "Intel E7520/7320/7525 detected.");
2084 -
2085 - /* enable access to config space */
2086 - config = read_pci_config_byte(0, 0, 0, 0xf4);
2087 - write_pci_config_byte(0, 0, 0, 0xf4, config|0x2);
2088 -
2089 - /* read xTPR register */
2090 - word = read_pci_config_16(0, 0, 0x40, 0x4c);
2091 -
2092 - if (!(word & (1 << 13))) {
2093 struct xen_platform_op op;
2094 - printk(KERN_INFO "Disabling irq balancing and affinity\n");
2095 +
2096 + printk(KERN_INFO "Intel E7520/7320/7525 detected. "
2097 + "Disabling irq balancing and affinity\n");
2098 op.cmd = XENPF_platform_quirk;
2099 op.u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING;
2100 WARN_ON(HYPERVISOR_platform_op(&op));
2101 }
2102
2103 - /* put back the original value for config space */
2104 + /* put back the original value for config space*/
2105 if (!(config & 0x2))
2106 - write_pci_config_byte(0, 0, 0, 0xf4, config);
2107 + pci_write_config_byte(dev, 0xf4, config);
2108 }
2109 -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, verify_quirk_intel_irqbalance);
2110 -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, verify_quirk_intel_irqbalance);
2111 -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, verify_quirk_intel_irqbalance);
2112 -
2113 +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance);
2114 +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance);
2115 +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance);
2116 #endif
2117 --- a/arch/x86/kernel/setup64-xen.c
2118 +++ b/arch/x86/kernel/setup64-xen.c
2119 @@ -113,9 +113,9 @@
2120 if (!NODE_DATA(cpu_to_node(i))) {
2121 printk("cpu with no node %d, num_online_nodes %d\n",
2122 i, num_online_nodes());
2123 - ptr = alloc_bootmem(size);
2124 + ptr = alloc_bootmem_pages(size);
2125 } else {
2126 - ptr = alloc_bootmem_node(NODE_DATA(cpu_to_node(i)), size);
2127 + ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(i)), size);
2128 }
2129 if (!ptr)
2130 panic("Cannot allocate cpu data for CPU %d\n", i);
2131 @@ -208,6 +208,8 @@
2132 __attribute__((section(".bss.page_aligned")));
2133 #endif
2134
2135 +extern asmlinkage void ignore_sysret(void);
2136 +
2137 /* May not be marked __init: used by software suspend */
2138 void syscall_init(void)
2139 {
2140 @@ -219,12 +221,22 @@
2141 */
2142 wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
2143 wrmsrl(MSR_LSTAR, system_call);
2144 + wrmsrl(MSR_CSTAR, ignore_sysret);
2145
2146 /* Flags to clear on syscall */
2147 wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000);
2148 #endif
2149 #ifdef CONFIG_IA32_EMULATION
2150 syscall32_cpu_init ();
2151 +#else
2152 + {
2153 + static const struct callback_register cstar = {
2154 + .type = CALLBACKTYPE_syscall32,
2155 + .address = (unsigned long)ignore_sysret
2156 + };
2157 + if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar))
2158 + printk(KERN_WARNING "Unable to register CSTAR callback\n");
2159 + }
2160 #endif
2161 }
2162
2163 @@ -262,7 +274,6 @@
2164 /* CPU 0 is initialised in head64.c */
2165 if (cpu != 0) {
2166 pda_init(cpu);
2167 - zap_low_mappings(cpu);
2168 }
2169 #ifndef CONFIG_X86_NO_TSS
2170 else
2171 --- a/arch/x86/kernel/setup_64-xen.c
2172 +++ b/arch/x86/kernel/setup_64-xen.c
2173 @@ -123,6 +123,8 @@
2174
2175 unsigned long saved_video_mode;
2176
2177 +int force_mwait __cpuinitdata;
2178 +
2179 /*
2180 * Early DMI memory
2181 */
2182 @@ -256,10 +258,10 @@
2183 * there is a real-mode segmented pointer pointing to the
2184 * 4K EBDA area at 0x40E
2185 */
2186 - ebda_addr = *(unsigned short *)EBDA_ADDR_POINTER;
2187 + ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER);
2188 ebda_addr <<= 4;
2189
2190 - ebda_size = *(unsigned short *)(unsigned long)ebda_addr;
2191 + ebda_size = *(unsigned short *)__va(ebda_addr);
2192
2193 /* Round EBDA up to pages */
2194 if (ebda_size == 0)
2195 @@ -413,15 +415,8 @@
2196 #endif
2197
2198 #ifdef CONFIG_SMP
2199 - /*
2200 - * But first pinch a few for the stack/trampoline stuff
2201 - * FIXME: Don't need the extra page at 4K, but need to fix
2202 - * trampoline before removing it. (see the GDT stuff)
2203 - */
2204 - reserve_bootmem_generic(PAGE_SIZE, PAGE_SIZE);
2205 -
2206 /* Reserve SMP trampoline */
2207 - reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, PAGE_SIZE);
2208 + reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, 2*PAGE_SIZE);
2209 #endif
2210 #endif
2211
2212 @@ -573,8 +568,6 @@
2213 early_quirks();
2214 #endif
2215
2216 - zap_low_mappings(0);
2217 -
2218 /*
2219 * set this early, so we dont allocate cpu0
2220 * if MADT list doesnt list BSP first
2221 @@ -877,6 +870,10 @@
2222
2223 /* RDTSC can be speculated around */
2224 clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
2225 +
2226 + /* Family 10 doesn't support C states in MWAIT so don't use it */
2227 + if (c->x86 == 0x10 && !force_mwait)
2228 + clear_bit(X86_FEATURE_MWAIT, &c->x86_capability);
2229 }
2230
2231 static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
2232 @@ -1159,9 +1156,7 @@
2233 #ifdef CONFIG_X86_MCE
2234 mcheck_init(c);
2235 #endif
2236 - if (c == &boot_cpu_data)
2237 - mtrr_bp_init();
2238 - else
2239 + if (c != &boot_cpu_data)
2240 mtrr_ap_init();
2241 #ifdef CONFIG_NUMA
2242 numa_add_cpu(smp_processor_id());
2243 @@ -1252,9 +1247,8 @@
2244 "stc",
2245 "100mhzsteps",
2246 "hwpstate",
2247 - NULL, /* tsc invariant mapped to constant_tsc */
2248 - NULL,
2249 - /* nothing */ /* constant_tsc - moved to flags */
2250 + "", /* tsc invariant mapped to constant_tsc */
2251 + /* nothing */
2252 };
2253
2254
2255 --- a/arch/x86/kernel/smp_32-xen.c
2256 +++ b/arch/x86/kernel/smp_32-xen.c
2257 @@ -13,7 +13,6 @@
2258 #include <linux/mm.h>
2259 #include <linux/delay.h>
2260 #include <linux/spinlock.h>
2261 -#include <linux/smp_lock.h>
2262 #include <linux/kernel_stat.h>
2263 #include <linux/mc146818rtc.h>
2264 #include <linux/cache.h>
2265 @@ -216,7 +215,6 @@
2266 static struct mm_struct * flush_mm;
2267 static unsigned long flush_va;
2268 static DEFINE_SPINLOCK(tlbstate_lock);
2269 -#define FLUSH_ALL 0xffffffff
2270
2271 /*
2272 * We cannot call mmdrop() because we are in interrupt context,
2273 @@ -298,7 +296,7 @@
2274
2275 if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
2276 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
2277 - if (flush_va == FLUSH_ALL)
2278 + if (flush_va == TLB_FLUSH_ALL)
2279 local_flush_tlb();
2280 else
2281 __flush_tlb_one(flush_va);
2282 @@ -314,9 +312,11 @@
2283 return IRQ_HANDLED;
2284 }
2285
2286 -static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
2287 - unsigned long va)
2288 +void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
2289 + unsigned long va)
2290 {
2291 + cpumask_t cpumask = *cpumaskp;
2292 +
2293 /*
2294 * A couple of (to be removed) sanity checks:
2295 *
2296 @@ -327,10 +327,12 @@
2297 BUG_ON(cpu_isset(smp_processor_id(), cpumask));
2298 BUG_ON(!mm);
2299
2300 +#ifdef CONFIG_HOTPLUG_CPU
2301 /* If a CPU which we ran on has gone down, OK. */
2302 cpus_and(cpumask, cpumask, cpu_online_map);
2303 - if (cpus_empty(cpumask))
2304 + if (unlikely(cpus_empty(cpumask)))
2305 return;
2306 +#endif
2307
2308 /*
2309 * i'm not happy about this global shared spinlock in the
2310 @@ -341,17 +343,7 @@
2311
2312 flush_mm = mm;
2313 flush_va = va;
2314 -#if NR_CPUS <= BITS_PER_LONG
2315 - atomic_set_mask(cpumask, &flush_cpumask);
2316 -#else
2317 - {
2318 - int k;
2319 - unsigned long *flush_mask = (unsigned long *)&flush_cpumask;
2320 - unsigned long *cpu_mask = (unsigned long *)&cpumask;
2321 - for (k = 0; k < BITS_TO_LONGS(NR_CPUS); ++k)
2322 - atomic_set_mask(cpu_mask[k], &flush_mask[k]);
2323 - }
2324 -#endif
2325 + cpus_or(flush_cpumask, cpumask, flush_cpumask);
2326 /*
2327 * We have to send the IPI only to
2328 * CPUs affected.
2329 @@ -378,7 +370,7 @@
2330
2331 local_flush_tlb();
2332 if (!cpus_empty(cpu_mask))
2333 - flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
2334 + flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
2335 preempt_enable();
2336 }
2337
2338 @@ -397,7 +389,7 @@
2339 leave_mm(smp_processor_id());
2340 }
2341 if (!cpus_empty(cpu_mask))
2342 - flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
2343 + flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
2344
2345 preempt_enable();
2346 }
2347 @@ -446,7 +438,7 @@
2348 * it goes straight through and wastes no time serializing
2349 * anything. Worst case is that we lose a reschedule ...
2350 */
2351 -void smp_send_reschedule(int cpu)
2352 +void xen_smp_send_reschedule(int cpu)
2353 {
2354 WARN_ON(cpu_is_offline(cpu));
2355 send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
2356 @@ -478,36 +470,79 @@
2357
2358 static struct call_data_struct *call_data;
2359
2360 +static void __smp_call_function(void (*func) (void *info), void *info,
2361 + int nonatomic, int wait)
2362 +{
2363 + struct call_data_struct data;
2364 + int cpus = num_online_cpus() - 1;
2365 +
2366 + if (!cpus)
2367 + return;
2368 +
2369 + data.func = func;
2370 + data.info = info;
2371 + atomic_set(&data.started, 0);
2372 + data.wait = wait;
2373 + if (wait)
2374 + atomic_set(&data.finished, 0);
2375 +
2376 + call_data = &data;
2377 + mb();
2378 +
2379 + /* Send a message to all other CPUs and wait for them to respond */
2380 + send_IPI_allbutself(CALL_FUNCTION_VECTOR);
2381 +
2382 + /* Wait for response */
2383 + while (atomic_read(&data.started) != cpus)
2384 + cpu_relax();
2385 +
2386 + if (wait)
2387 + while (atomic_read(&data.finished) != cpus)
2388 + cpu_relax();
2389 +}
2390 +
2391 +
2392 /**
2393 - * smp_call_function(): Run a function on all other CPUs.
2394 + * smp_call_function_mask(): Run a function on a set of other CPUs.
2395 + * @mask: The set of cpus to run on. Must not include the current cpu.
2396 * @func: The function to run. This must be fast and non-blocking.
2397 * @info: An arbitrary pointer to pass to the function.
2398 - * @nonatomic: currently unused.
2399 * @wait: If true, wait (atomically) until function has completed on other CPUs.
2400 *
2401 - * Returns 0 on success, else a negative status code. Does not return until
2402 - * remote CPUs are nearly ready to execute <<func>> or are or have executed.
2403 + * Returns 0 on success, else a negative status code.
2404 + *
2405 + * If @wait is true, then returns once @func has returned; otherwise
2406 + * it returns just before the target cpu calls @func.
2407 *
2408 * You must not call this function with disabled interrupts or from a
2409 * hardware interrupt handler or from a bottom half handler.
2410 */
2411 -int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
2412 - int wait)
2413 +int
2414 +xen_smp_call_function_mask(cpumask_t mask,
2415 + void (*func)(void *), void *info,
2416 + int wait)
2417 {
2418 struct call_data_struct data;
2419 + cpumask_t allbutself;
2420 int cpus;
2421
2422 + /* Can deadlock when called with interrupts disabled */
2423 + WARN_ON(irqs_disabled());
2424 +
2425 /* Holding any lock stops cpus from going down. */
2426 spin_lock(&call_lock);
2427 - cpus = num_online_cpus() - 1;
2428 +
2429 + allbutself = cpu_online_map;
2430 + cpu_clear(smp_processor_id(), allbutself);
2431 +
2432 + cpus_and(mask, mask, allbutself);
2433 + cpus = cpus_weight(mask);
2434 +
2435 if (!cpus) {
2436 spin_unlock(&call_lock);
2437 return 0;
2438 }
2439
2440 - /* Can deadlock when called with interrupts disabled */
2441 - WARN_ON(irqs_disabled());
2442 -
2443 data.func = func;
2444 data.info = info;
2445 atomic_set(&data.started, 0);
2446 @@ -517,9 +552,12 @@
2447
2448 call_data = &data;
2449 mb();
2450 -
2451 - /* Send a message to all other CPUs and wait for them to respond */
2452 - send_IPI_allbutself(CALL_FUNCTION_VECTOR);
2453 +
2454 + /* Send a message to other CPUs */
2455 + if (cpus_equal(mask, allbutself))
2456 + send_IPI_allbutself(CALL_FUNCTION_VECTOR);
2457 + else
2458 + send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
2459
2460 /* Wait for response */
2461 while (atomic_read(&data.started) != cpus)
2462 @@ -532,15 +570,14 @@
2463
2464 return 0;
2465 }
2466 -EXPORT_SYMBOL(smp_call_function);
2467
2468 static void stop_this_cpu (void * dummy)
2469 {
2470 + local_irq_disable();
2471 /*
2472 * Remove this CPU:
2473 */
2474 cpu_clear(smp_processor_id(), cpu_online_map);
2475 - local_irq_disable();
2476 disable_all_local_evtchn();
2477 if (cpu_data[smp_processor_id()].hlt_works_ok)
2478 for(;;) halt();
2479 @@ -551,13 +588,18 @@
2480 * this function calls the 'stop' function on all other CPUs in the system.
2481 */
2482
2483 -void smp_send_stop(void)
2484 +void xen_smp_send_stop(void)
2485 {
2486 - smp_call_function(stop_this_cpu, NULL, 1, 0);
2487 + /* Don't deadlock on the call lock in panic */
2488 + int nolock = !spin_trylock(&call_lock);
2489 + unsigned long flags;
2490
2491 - local_irq_disable();
2492 + local_irq_save(flags);
2493 + __smp_call_function(stop_this_cpu, NULL, 0, 0);
2494 + if (!nolock)
2495 + spin_unlock(&call_lock);
2496 disable_all_local_evtchn();
2497 - local_irq_enable();
2498 + local_irq_restore(flags);
2499 }
2500
2501 /*
2502 @@ -598,74 +640,3 @@
2503
2504 return IRQ_HANDLED;
2505 }
2506 -
2507 -/*
2508 - * this function sends a 'generic call function' IPI to one other CPU
2509 - * in the system.
2510 - *
2511 - * cpu is a standard Linux logical CPU number.
2512 - */
2513 -static void
2514 -__smp_call_function_single(int cpu, void (*func) (void *info), void *info,
2515 - int nonatomic, int wait)
2516 -{
2517 - struct call_data_struct data;
2518 - int cpus = 1;
2519 -
2520 - data.func = func;
2521 - data.info = info;
2522 - atomic_set(&data.started, 0);
2523 - data.wait = wait;
2524 - if (wait)
2525 - atomic_set(&data.finished, 0);
2526 -
2527 - call_data = &data;
2528 - wmb();
2529 - /* Send a message to all other CPUs and wait for them to respond */
2530 - send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR);
2531 -
2532 - /* Wait for response */
2533 - while (atomic_read(&data.started) != cpus)
2534 - cpu_relax();
2535 -
2536 - if (!wait)
2537 - return;
2538 -
2539 - while (atomic_read(&data.finished) != cpus)
2540 - cpu_relax();
2541 -}
2542 -
2543 -/*
2544 - * smp_call_function_single - Run a function on another CPU
2545 - * @func: The function to run. This must be fast and non-blocking.
2546 - * @info: An arbitrary pointer to pass to the function.
2547 - * @nonatomic: Currently unused.
2548 - * @wait: If true, wait until function has completed on other CPUs.
2549 - *
2550 - * Retrurns 0 on success, else a negative status code.
2551 - *
2552 - * Does not return until the remote CPU is nearly ready to execute <func>
2553 - * or is or has executed.
2554 - */
2555 -
2556 -int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
2557 - int nonatomic, int wait)
2558 -{
2559 - /* prevent preemption and reschedule on another processor */
2560 - int me = get_cpu();
2561 - if (cpu == me) {
2562 - WARN_ON(1);
2563 - put_cpu();
2564 - return -EBUSY;
2565 - }
2566 -
2567 - /* Can deadlock when called with interrupts disabled */
2568 - WARN_ON(irqs_disabled());
2569 -
2570 - spin_lock_bh(&call_lock);
2571 - __smp_call_function_single(cpu, func, info, nonatomic, wait);
2572 - spin_unlock_bh(&call_lock);
2573 - put_cpu();
2574 - return 0;
2575 -}
2576 -EXPORT_SYMBOL(smp_call_function_single);
2577 --- a/arch/x86/kernel/smp_64-xen.c
2578 +++ b/arch/x86/kernel/smp_64-xen.c
2579 @@ -14,7 +14,6 @@
2580 #include <linux/mm.h>
2581 #include <linux/delay.h>
2582 #include <linux/spinlock.h>
2583 -#include <linux/smp_lock.h>
2584 #include <linux/smp.h>
2585 #include <linux/kernel_stat.h>
2586 #include <linux/mc146818rtc.h>
2587 @@ -457,44 +456,36 @@
2588 }
2589 EXPORT_SYMBOL(smp_call_function);
2590
2591 -void smp_stop_cpu(void)
2592 +static void stop_this_cpu(void *dummy)
2593 {
2594 - unsigned long flags;
2595 + local_irq_disable();
2596 /*
2597 * Remove this CPU:
2598 */
2599 cpu_clear(smp_processor_id(), cpu_online_map);
2600 - local_irq_save(flags);
2601 disable_all_local_evtchn();
2602 - local_irq_restore(flags);
2603 -}
2604 -
2605 -static void smp_really_stop_cpu(void *dummy)
2606 -{
2607 - smp_stop_cpu();
2608 for (;;)
2609 halt();
2610 }
2611
2612 void smp_send_stop(void)
2613 {
2614 - int nolock = 0;
2615 + int nolock;
2616 + unsigned long flags;
2617 +
2618 #ifndef CONFIG_XEN
2619 if (reboot_force)
2620 return;
2621 #endif
2622 +
2623 /* Don't deadlock on the call lock in panic */
2624 - if (!spin_trylock(&call_lock)) {
2625 - /* ignore locking because we have panicked anyways */
2626 - nolock = 1;
2627 - }
2628 - __smp_call_function(smp_really_stop_cpu, NULL, 0, 0);
2629 + nolock = !spin_trylock(&call_lock);
2630 + local_irq_save(flags);
2631 + __smp_call_function(stop_this_cpu, NULL, 0, 0);
2632 if (!nolock)
2633 spin_unlock(&call_lock);
2634 -
2635 - local_irq_disable();
2636 disable_all_local_evtchn();
2637 - local_irq_enable();
2638 + local_irq_restore(flags);
2639 }
2640
2641 /*
2642 --- a/arch/x86/kernel/time_32-xen.c
2643 +++ b/arch/x86/kernel/time_32-xen.c
2644 @@ -80,7 +80,6 @@
2645 #include <asm/i8253.h>
2646 DEFINE_SPINLOCK(i8253_lock);
2647 EXPORT_SYMBOL(i8253_lock);
2648 -int pit_latch_buggy; /* extern */
2649 #else
2650 volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
2651 #endif
2652 @@ -589,7 +588,7 @@
2653 return IRQ_HANDLED;
2654 }
2655
2656 -void mark_tsc_unstable(void)
2657 +void mark_tsc_unstable(char *reason)
2658 {
2659 #ifndef CONFIG_XEN /* XXX Should tell the hypervisor about this fact. */
2660 tsc_unstable = 1;
2661 @@ -597,17 +596,18 @@
2662 }
2663 EXPORT_SYMBOL_GPL(mark_tsc_unstable);
2664
2665 +static cycle_t cs_last;
2666 +
2667 static cycle_t xen_clocksource_read(void)
2668 {
2669 cycle_t ret = sched_clock();
2670
2671 #ifdef CONFIG_SMP
2672 for (;;) {
2673 - static cycle_t last_ret;
2674 #ifndef CONFIG_64BIT
2675 - cycle_t last = cmpxchg64(&last_ret, 0, 0);
2676 + cycle_t last = cmpxchg64(&cs_last, 0, 0);
2677 #else
2678 - cycle_t last = last_ret;
2679 + cycle_t last = cs_last;
2680 #define cmpxchg64 cmpxchg
2681 #endif
2682
2683 @@ -627,7 +627,7 @@
2684 }
2685 ret = last;
2686 }
2687 - if (cmpxchg64(&last_ret, last, ret) == last)
2688 + if (cmpxchg64(&cs_last, last, ret) == last)
2689 break;
2690 }
2691 #endif
2692 @@ -635,6 +635,14 @@
2693 return ret;
2694 }
2695
2696 +static void xen_clocksource_resume(void)
2697 +{
2698 + extern void time_resume(void);
2699 +
2700 + time_resume();
2701 + cs_last = sched_clock();
2702 +}
2703 +
2704 static struct clocksource clocksource_xen = {
2705 .name = "xen",
2706 .rating = 400,
2707 @@ -643,6 +651,7 @@
2708 .mult = 1 << XEN_SHIFT, /* time directly in nanoseconds */
2709 .shift = XEN_SHIFT,
2710 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
2711 + .resume = xen_clocksource_resume,
2712 };
2713
2714 static void init_missing_ticks_accounting(unsigned int cpu)
2715 @@ -731,35 +740,6 @@
2716 mod_timer(&sync_xen_wallclock_timer, jiffies + 1);
2717 }
2718
2719 -static int timer_resume(struct sys_device *dev)
2720 -{
2721 - extern void time_resume(void);
2722 - time_resume();
2723 - return 0;
2724 -}
2725 -
2726 -static struct sysdev_class timer_sysclass = {
2727 - .resume = timer_resume,
2728 - set_kset_name("timer"),
2729 -};
2730 -
2731 -
2732 -/* XXX this driverfs stuff should probably go elsewhere later -john */
2733 -static struct sys_device device_timer = {
2734 - .id = 0,
2735 - .cls = &timer_sysclass,
2736 -};
2737 -
2738 -static int time_init_device(void)
2739 -{
2740 - int error = sysdev_class_register(&timer_sysclass);
2741 - if (!error)
2742 - error = sysdev_register(&device_timer);
2743 - return error;
2744 -}
2745 -
2746 -device_initcall(time_init_device);
2747 -
2748 extern void (*late_time_init)(void);
2749
2750 /* Dynamically-mapped IRQ. */
2751 @@ -772,7 +752,7 @@
2752 VIRQ_TIMER,
2753 0,
2754 timer_interrupt,
2755 - SA_INTERRUPT,
2756 + IRQF_DISABLED,
2757 "timer0",
2758 NULL);
2759 BUG_ON(per_cpu(timer_irq, 0) < 0);
2760 @@ -890,21 +870,21 @@
2761 cpu_clear(smp_processor_id(), nohz_cpu_mask);
2762 }
2763
2764 -void raw_safe_halt(void)
2765 +void xen_safe_halt(void)
2766 {
2767 stop_hz_timer();
2768 /* Blocking includes an implicit local_irq_enable(). */
2769 HYPERVISOR_block();
2770 start_hz_timer();
2771 }
2772 -EXPORT_SYMBOL(raw_safe_halt);
2773 +EXPORT_SYMBOL(xen_safe_halt);
2774
2775 -void halt(void)
2776 +void xen_halt(void)
2777 {
2778 if (irqs_disabled())
2779 VOID(HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL));
2780 }
2781 -EXPORT_SYMBOL(halt);
2782 +EXPORT_SYMBOL(xen_halt);
2783
2784 /* No locking required. Interrupts are disabled on all CPUs. */
2785 void time_resume(void)
2786 @@ -967,7 +947,7 @@
2787 irq = bind_virq_to_irqhandler(VIRQ_TIMER,
2788 cpu,
2789 timer_interrupt,
2790 - SA_INTERRUPT,
2791 + IRQF_DISABLED,
2792 timer_name[cpu],
2793 NULL);
2794 if (irq < 0)
2795 --- a/arch/x86/kernel/traps_32-xen.c
2796 +++ b/arch/x86/kernel/traps_32-xen.c
2797 @@ -52,7 +52,7 @@
2798 #include <asm/unwind.h>
2799 #include <asm/smp.h>
2800 #include <asm/arch_hooks.h>
2801 -#include <asm/kdebug.h>
2802 +#include <linux/kdebug.h>
2803 #include <asm/stacktrace.h>
2804
2805 #include <linux/module.h>
2806 @@ -101,20 +101,6 @@
2807
2808 int kstack_depth_to_print = 24;
2809 static unsigned int code_bytes = 64;
2810 -ATOMIC_NOTIFIER_HEAD(i386die_chain);
2811 -
2812 -int register_die_notifier(struct notifier_block *nb)
2813 -{
2814 - vmalloc_sync_all();
2815 - return atomic_notifier_chain_register(&i386die_chain, nb);
2816 -}
2817 -EXPORT_SYMBOL(register_die_notifier); /* used modular by kdb */
2818 -
2819 -int unregister_die_notifier(struct notifier_block *nb)
2820 -{
2821 - return atomic_notifier_chain_unregister(&i386die_chain, nb);
2822 -}
2823 -EXPORT_SYMBOL(unregister_die_notifier); /* used modular by kdb */
2824
2825 static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
2826 {
2827 @@ -325,7 +311,7 @@
2828 regs->xds & 0xffff, regs->xes & 0xffff, regs->xfs & 0xffff, gs, ss);
2829 printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)",
2830 TASK_COMM_LEN, current->comm, current->pid,
2831 - current_thread_info(), current, current->thread_info);
2832 + current_thread_info(), current, task_thread_info(current));
2833 /*
2834 * When in-kernel, we also print out the stack and code at the
2835 * time of the fault..
2836 @@ -482,8 +468,6 @@
2837 siginfo_t *info)
2838 {
2839 struct task_struct *tsk = current;
2840 - tsk->thread.error_code = error_code;
2841 - tsk->thread.trap_no = trapnr;
2842
2843 if (regs->eflags & VM_MASK) {
2844 if (vm86)
2845 @@ -495,6 +479,18 @@
2846 goto kernel_trap;
2847
2848 trap_signal: {
2849 + /*
2850 + * We want error_code and trap_no set for userspace faults and
2851 + * kernelspace faults which result in die(), but not
2852 + * kernelspace faults which are fixed up. die() gives the
2853 + * process no chance to handle the signal and notice the
2854 + * kernel fault information, so that won't result in polluting
2855 + * the information about previously queued, but not yet
2856 + * delivered, faults. See also do_general_protection below.
2857 + */
2858 + tsk->thread.error_code = error_code;
2859 + tsk->thread.trap_no = trapnr;
2860 +
2861 if (info)
2862 force_sig_info(signr, info, tsk);
2863 else
2864 @@ -503,8 +499,11 @@
2865 }
2866
2867 kernel_trap: {
2868 - if (!fixup_exception(regs))
2869 + if (!fixup_exception(regs)) {
2870 + tsk->thread.error_code = error_code;
2871 + tsk->thread.trap_no = trapnr;
2872 die(str, regs, error_code);
2873 + }
2874 return;
2875 }
2876
2877 @@ -578,9 +577,6 @@
2878 fastcall void __kprobes do_general_protection(struct pt_regs * regs,
2879 long error_code)
2880 {
2881 - current->thread.error_code = error_code;
2882 - current->thread.trap_no = 13;
2883 -
2884 if (regs->eflags & VM_MASK)
2885 goto gp_in_vm86;
2886
2887 @@ -599,6 +595,8 @@
2888
2889 gp_in_kernel:
2890 if (!fixup_exception(regs)) {
2891 + current->thread.error_code = error_code;
2892 + current->thread.trap_no = 13;
2893 if (notify_die(DIE_GPF, "general protection fault", regs,
2894 error_code, 13, SIGSEGV) == NOTIFY_STOP)
2895 return;
2896 @@ -987,9 +985,7 @@
2897 fastcall unsigned long patch_espfix_desc(unsigned long uesp,
2898 unsigned long kesp)
2899 {
2900 - int cpu = smp_processor_id();
2901 - struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
2902 - struct desc_struct *gdt = (struct desc_struct *)cpu_gdt_descr->address;
2903 + struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
2904 unsigned long base = (kesp - uesp) & -THREAD_SIZE;
2905 unsigned long new_kesp = kesp - base;
2906 unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
2907 --- a/arch/x86/kernel/traps_64-xen.c
2908 +++ b/arch/x86/kernel/traps_64-xen.c
2909 @@ -32,6 +32,7 @@
2910 #include <linux/unwind.h>
2911 #include <linux/uaccess.h>
2912 #include <linux/bug.h>
2913 +#include <linux/kdebug.h>
2914
2915 #include <asm/system.h>
2916 #include <asm/io.h>
2917 @@ -39,7 +40,6 @@
2918 #include <asm/debugreg.h>
2919 #include <asm/desc.h>
2920 #include <asm/i387.h>
2921 -#include <asm/kdebug.h>
2922 #include <asm/processor.h>
2923 #include <asm/unwind.h>
2924 #include <asm/smp.h>
2925 @@ -71,22 +71,6 @@
2926 asmlinkage void machine_check(void);
2927 asmlinkage void spurious_interrupt_bug(void);
2928
2929 -ATOMIC_NOTIFIER_HEAD(die_chain);
2930 -EXPORT_SYMBOL(die_chain);
2931 -
2932 -int register_die_notifier(struct notifier_block *nb)
2933 -{
2934 - vmalloc_sync_all();
2935 - return atomic_notifier_chain_register(&die_chain, nb);
2936 -}
2937 -EXPORT_SYMBOL(register_die_notifier); /* used modular by kdb */
2938 -
2939 -int unregister_die_notifier(struct notifier_block *nb)
2940 -{
2941 - return atomic_notifier_chain_unregister(&die_chain, nb);
2942 -}
2943 -EXPORT_SYMBOL(unregister_die_notifier); /* used modular by kdb */
2944 -
2945 static inline void conditional_sti(struct pt_regs *regs)
2946 {
2947 if (regs->eflags & X86_EFLAGS_IF)
2948 @@ -428,8 +412,7 @@
2949 const int cpu = smp_processor_id();
2950 struct task_struct *cur = cpu_pda(cpu)->pcurrent;
2951
2952 - rsp = regs->rsp;
2953 -
2954 + rsp = regs->rsp;
2955 printk("CPU %d ", cpu);
2956 __show_regs(regs);
2957 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
2958 @@ -440,7 +423,6 @@
2959 * time of the fault..
2960 */
2961 if (in_kernel) {
2962 -
2963 printk("Stack: ");
2964 _show_stack(NULL, regs, (unsigned long*)rsp);
2965
2966 @@ -485,13 +467,14 @@
2967
2968 unsigned __kprobes long oops_begin(void)
2969 {
2970 - int cpu = smp_processor_id();
2971 + int cpu;
2972 unsigned long flags;
2973
2974 oops_enter();
2975
2976 /* racy, but better than risking deadlock. */
2977 local_irq_save(flags);
2978 + cpu = smp_processor_id();
2979 if (!spin_trylock(&die_lock)) {
2980 if (cpu == die_owner)
2981 /* nested oops. should stop eventually */;
2982 @@ -585,10 +568,20 @@
2983 {
2984 struct task_struct *tsk = current;
2985
2986 - tsk->thread.error_code = error_code;
2987 - tsk->thread.trap_no = trapnr;
2988 -
2989 if (user_mode(regs)) {
2990 + /*
2991 + * We want error_code and trap_no set for userspace
2992 + * faults and kernelspace faults which result in
2993 + * die(), but not kernelspace faults which are fixed
2994 + * up. die() gives the process no chance to handle
2995 + * the signal and notice the kernel fault information,
2996 + * so that won't result in polluting the information
2997 + * about previously queued, but not yet delivered,
2998 + * faults. See also do_general_protection below.
2999 + */
3000 + tsk->thread.error_code = error_code;
3001 + tsk->thread.trap_no = trapnr;
3002 +
3003 if (exception_trace && unhandled_signal(tsk, signr))
3004 printk(KERN_INFO
3005 "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n",
3006 @@ -609,8 +602,11 @@
3007 fixup = search_exception_tables(regs->rip);
3008 if (fixup)
3009 regs->rip = fixup->fixup;
3010 - else
3011 + else {
3012 + tsk->thread.error_code = error_code;
3013 + tsk->thread.trap_no = trapnr;
3014 die(str, regs, error_code);
3015 + }
3016 return;
3017 }
3018 }
3019 @@ -686,10 +682,10 @@
3020
3021 conditional_sti(regs);
3022
3023 - tsk->thread.error_code = error_code;
3024 - tsk->thread.trap_no = 13;
3025 -
3026 if (user_mode(regs)) {
3027 + tsk->thread.error_code = error_code;
3028 + tsk->thread.trap_no = 13;
3029 +
3030 if (exception_trace && unhandled_signal(tsk, SIGSEGV))
3031 printk(KERN_INFO
3032 "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n",
3033 @@ -708,6 +704,9 @@
3034 regs->rip = fixup->fixup;
3035 return;
3036 }
3037 +
3038 + tsk->thread.error_code = error_code;
3039 + tsk->thread.trap_no = 13;
3040 if (notify_die(DIE_GPF, "general protection fault", regs,
3041 error_code, 13, SIGSEGV) == NOTIFY_STOP)
3042 return;
3043 --- a/arch/x86/kernel/vsyscall_64-xen.c
3044 +++ b/arch/x86/kernel/vsyscall_64-xen.c
3045 @@ -45,14 +45,34 @@
3046
3047 #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
3048 #define __syscall_clobber "r11","rcx","memory"
3049 +#define __pa_vsymbol(x) \
3050 + ({unsigned long v; \
3051 + extern char __vsyscall_0; \
3052 + asm("" : "=r" (v) : "0" (x)); \
3053 + ((v - VSYSCALL_FIRST_PAGE) + __pa_symbol(&__vsyscall_0)); })
3054
3055 +/*
3056 + * vsyscall_gtod_data contains data that is :
3057 + * - readonly from vsyscalls
3058 + * - written by timer interrupt or sysctl (/proc/sys/kernel/vsyscall64)
3059 + * Try to keep this structure as small as possible to avoid cache line ping pongs
3060 + */
3061 struct vsyscall_gtod_data_t {
3062 - seqlock_t lock;
3063 - int sysctl_enabled;
3064 - struct timeval wall_time_tv;
3065 + seqlock_t lock;
3066 +
3067 + /* open coded 'struct timespec' */
3068 + time_t wall_time_sec;
3069 + u32 wall_time_nsec;
3070 +
3071 + int sysctl_enabled;
3072 struct timezone sys_tz;
3073 - cycle_t offset_base;
3074 - struct clocksource clock;
3075 + struct { /* extract of a clocksource struct */
3076 + cycle_t (*vread)(void);
3077 + cycle_t cycle_last;
3078 + cycle_t mask;
3079 + u32 mult;
3080 + u32 shift;
3081 + } clock;
3082 };
3083 int __vgetcpu_mode __section_vgetcpu_mode;
3084
3085 @@ -68,9 +88,13 @@
3086
3087 write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
3088 /* copy vsyscall data */
3089 - vsyscall_gtod_data.clock = *clock;
3090 - vsyscall_gtod_data.wall_time_tv.tv_sec = wall_time->tv_sec;
3091 - vsyscall_gtod_data.wall_time_tv.tv_usec = wall_time->tv_nsec/1000;
3092 + vsyscall_gtod_data.clock.vread = clock->vread;
3093 + vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
3094 + vsyscall_gtod_data.clock.mask = clock->mask;
3095 + vsyscall_gtod_data.clock.mult = clock->mult;
3096 + vsyscall_gtod_data.clock.shift = clock->shift;
3097 + vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
3098 + vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
3099 vsyscall_gtod_data.sys_tz = sys_tz;
3100 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
3101 }
3102 @@ -105,7 +129,8 @@
3103 static __always_inline void do_vgettimeofday(struct timeval * tv)
3104 {
3105 cycle_t now, base, mask, cycle_delta;
3106 - unsigned long seq, mult, shift, nsec_delta;
3107 + unsigned seq;
3108 + unsigned long mult, shift, nsec;
3109 cycle_t (*vread)(void);
3110 do {
3111 seq = read_seqbegin(&__vsyscall_gtod_data.lock);
3112 @@ -121,21 +146,20 @@
3113 mult = __vsyscall_gtod_data.clock.mult;
3114 shift = __vsyscall_gtod_data.clock.shift;
3115
3116 - *tv = __vsyscall_gtod_data.wall_time_tv;
3117 -
3118 + tv->tv_sec = __vsyscall_gtod_data.wall_time_sec;
3119 + nsec = __vsyscall_gtod_data.wall_time_nsec;
3120 } while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
3121
3122 /* calculate interval: */
3123 cycle_delta = (now - base) & mask;
3124 /* convert to nsecs: */
3125 - nsec_delta = (cycle_delta * mult) >> shift;
3126 + nsec += (cycle_delta * mult) >> shift;
3127
3128 - /* convert to usecs and add to timespec: */
3129 - tv->tv_usec += nsec_delta / NSEC_PER_USEC;
3130 - while (tv->tv_usec > USEC_PER_SEC) {
3131 + while (nsec >= NSEC_PER_SEC) {
3132 tv->tv_sec += 1;
3133 - tv->tv_usec -= USEC_PER_SEC;
3134 + nsec -= NSEC_PER_SEC;
3135 }
3136 + tv->tv_usec = nsec / NSEC_PER_USEC;
3137 }
3138
3139 int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
3140 @@ -151,11 +175,16 @@
3141 * unlikely */
3142 time_t __vsyscall(1) vtime(time_t *t)
3143 {
3144 + struct timeval tv;
3145 + time_t result;
3146 if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
3147 return time_syscall(t);
3148 - else if (t)
3149 - *t = __vsyscall_gtod_data.wall_time_tv.tv_sec;
3150 - return __vsyscall_gtod_data.wall_time_tv.tv_sec;
3151 +
3152 + vgettimeofday(&tv, 0);
3153 + result = tv.tv_sec;
3154 + if (t)
3155 + *t = result;
3156 + return result;
3157 }
3158
3159 /* Fast way to get current CPU and node.
3160 @@ -224,10 +253,10 @@
3161 return ret;
3162 /* gcc has some trouble with __va(__pa()), so just do it this
3163 way. */
3164 - map1 = ioremap(__pa_symbol(&vsysc1), 2);
3165 + map1 = ioremap(__pa_vsymbol(&vsysc1), 2);
3166 if (!map1)
3167 return -ENOMEM;
3168 - map2 = ioremap(__pa_symbol(&vsysc2), 2);
3169 + map2 = ioremap(__pa_vsymbol(&vsysc2), 2);
3170 if (!map2) {
3171 ret = -ENOMEM;
3172 goto out;
3173 @@ -304,7 +333,7 @@
3174 cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
3175 {
3176 long cpu = (long)arg;
3177 - if (action == CPU_ONLINE)
3178 + if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
3179 smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1);
3180 return NOTIFY_DONE;
3181 }
3182 --- a/arch/x86/mm/fault_32-xen.c
3183 +++ b/arch/x86/mm/fault_32-xen.c
3184 @@ -14,19 +14,20 @@
3185 #include <linux/mman.h>
3186 #include <linux/mm.h>
3187 #include <linux/smp.h>
3188 -#include <linux/smp_lock.h>
3189 #include <linux/interrupt.h>
3190 #include <linux/init.h>
3191 #include <linux/tty.h>
3192 #include <linux/vt_kern.h> /* For unblank_screen() */
3193 #include <linux/highmem.h>
3194 +#include <linux/bootmem.h> /* for max_low_pfn */
3195 +#include <linux/vmalloc.h>
3196 #include <linux/module.h>
3197 #include <linux/kprobes.h>
3198 #include <linux/uaccess.h>
3199 +#include <linux/kdebug.h>
3200
3201 #include <asm/system.h>
3202 #include <asm/desc.h>
3203 -#include <asm/kdebug.h>
3204 #include <asm/segment.h>
3205
3206 extern void die(const char *,struct pt_regs *,long);
3207 @@ -259,25 +260,20 @@
3208 unsigned long page;
3209
3210 page = read_cr3();
3211 - page = ((unsigned long *) __va(page))[address >> 22];
3212 - if (oops_may_print())
3213 - printk(KERN_ALERT "*pde = ma %08lx pa %08lx\n", page,
3214 - machine_to_phys(page));
3215 + page = ((unsigned long *) __va(page))[address >> PGDIR_SHIFT];
3216 + printk(KERN_ALERT "*pde = ma %08lx pa %08lx\n", page,
3217 + machine_to_phys(page));
3218 /*
3219 * We must not directly access the pte in the highpte
3220 * case if the page table is located in highmem.
3221 * And lets rather not kmap-atomic the pte, just in case
3222 * it's allocated already.
3223 */
3224 -#ifdef CONFIG_HIGHPTE
3225 - if ((page >> PAGE_SHIFT) >= highstart_pfn)
3226 - return;
3227 -#endif
3228 - if ((page & 1) && oops_may_print()) {
3229 - page &= PAGE_MASK;
3230 - address &= 0x003ff000;
3231 - page = machine_to_phys(page);
3232 - page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT];
3233 + if ((machine_to_phys(page) >> PAGE_SHIFT) < max_low_pfn
3234 + && (page & _PAGE_PRESENT)) {
3235 + page = machine_to_phys(page & PAGE_MASK);
3236 + page = ((unsigned long *) __va(page))[(address >> PAGE_SHIFT)
3237 + & (PTRS_PER_PTE - 1)];
3238 printk(KERN_ALERT "*pte = ma %08lx pa %08lx\n", page,
3239 machine_to_phys(page));
3240 }
3241 @@ -581,6 +577,11 @@
3242 bad_area_nosemaphore:
3243 /* User mode accesses just cause a SIGSEGV */
3244 if (error_code & 4) {
3245 + /*
3246 + * It's possible to have interrupts off here.
3247 + */
3248 + local_irq_enable();
3249 +
3250 /*
3251 * Valid to do another page fault here because this one came
3252 * from user space.
3253 @@ -633,7 +634,7 @@
3254 bust_spinlocks(1);
3255
3256 if (oops_may_print()) {
3257 - #ifdef CONFIG_X86_PAE
3258 +#ifdef CONFIG_X86_PAE
3259 if (error_code & 16) {
3260 pte_t *pte = lookup_address(address);
3261
3262 @@ -642,7 +643,7 @@
3263 "NX-protected page - exploit attempt? "
3264 "(uid: %d)\n", current->uid);
3265 }
3266 - #endif
3267 +#endif
3268 if (address < PAGE_SIZE)
3269 printk(KERN_ALERT "BUG: unable to handle kernel NULL "
3270 "pointer dereference");
3271 @@ -652,8 +653,8 @@
3272 printk(" at virtual address %08lx\n",address);
3273 printk(KERN_ALERT " printing eip:\n");
3274 printk("%08lx\n", regs->eip);
3275 + dump_fault_path(address);
3276 }
3277 - dump_fault_path(address);
3278 tsk->thread.cr2 = address;
3279 tsk->thread.trap_no = 14;
3280 tsk->thread.error_code = error_code;
3281 @@ -694,7 +695,6 @@
3282 force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
3283 }
3284
3285 -#if !HAVE_SHARED_KERNEL_PMD
3286 void vmalloc_sync_all(void)
3287 {
3288 /*
3289 @@ -710,6 +710,9 @@
3290 static unsigned long start = TASK_SIZE;
3291 unsigned long address;
3292
3293 + if (SHARED_KERNEL_PMD)
3294 + return;
3295 +
3296 BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
3297 for (address = start;
3298 address >= TASK_SIZE && address < hypervisor_virt_start;
3299 @@ -739,4 +742,3 @@
3300 start = address + (1UL << PMD_SHIFT);
3301 }
3302 }
3303 -#endif
3304 --- a/arch/x86/mm/fault_64-xen.c
3305 +++ b/arch/x86/mm/fault_64-xen.c
3306 @@ -15,22 +15,22 @@
3307 #include <linux/mman.h>
3308 #include <linux/mm.h>
3309 #include <linux/smp.h>
3310 -#include <linux/smp_lock.h>
3311 #include <linux/interrupt.h>
3312 #include <linux/init.h>
3313 #include <linux/tty.h>
3314 #include <linux/vt_kern.h> /* For unblank_screen() */
3315 #include <linux/compiler.h>
3316 +#include <linux/vmalloc.h>
3317 #include <linux/module.h>
3318 #include <linux/kprobes.h>
3319 #include <linux/uaccess.h>
3320 +#include <linux/kdebug.h>
3321
3322 #include <asm/system.h>
3323 #include <asm/pgalloc.h>
3324 #include <asm/smp.h>
3325 #include <asm/tlbflush.h>
3326 #include <asm/proto.h>
3327 -#include <asm/kdebug.h>
3328 #include <asm-generic/sections.h>
3329
3330 /* Page fault error code bits */
3331 @@ -537,6 +537,12 @@
3332 bad_area_nosemaphore:
3333 /* User mode accesses just cause a SIGSEGV */
3334 if (error_code & PF_USER) {
3335 +
3336 + /*
3337 + * It's possible to have interrupts off here.
3338 + */
3339 + local_irq_enable();
3340 +
3341 if (is_prefetch(regs, address, error_code))
3342 return;
3343
3344 @@ -646,7 +652,7 @@
3345 }
3346
3347 DEFINE_SPINLOCK(pgd_lock);
3348 -struct page *pgd_list;
3349 +LIST_HEAD(pgd_list);
3350
3351 void vmalloc_sync_all(void)
3352 {
3353 @@ -666,8 +672,7 @@
3354 if (pgd_none(*pgd_ref))
3355 continue;
3356 spin_lock(&pgd_lock);
3357 - for (page = pgd_list; page;
3358 - page = (struct page *)page->index) {
3359 + list_for_each_entry(page, &pgd_list, lru) {
3360 pgd_t *pgd;
3361 pgd = (pgd_t *)page_address(page) + pgd_index(address);
3362 if (pgd_none(*pgd))
3363 --- a/arch/x86/mm/highmem_32-xen.c
3364 +++ b/arch/x86/mm/highmem_32-xen.c
3365 @@ -26,7 +26,7 @@
3366 * However when holding an atomic kmap is is not legal to sleep, so atomic
3367 * kmaps are appropriate for short, tight code paths only.
3368 */
3369 -static void *__kmap_atomic(struct page *page, enum km_type type, pgprot_t prot)
3370 +void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
3371 {
3372 enum fixed_addresses idx;
3373 unsigned long vaddr;
3374 @@ -49,15 +49,7 @@
3375
3376 void *kmap_atomic(struct page *page, enum km_type type)
3377 {
3378 - return __kmap_atomic(page, type, kmap_prot);
3379 -}
3380 -
3381 -/* Same as kmap_atomic but with PAGE_KERNEL_RO page protection. */
3382 -void *kmap_atomic_pte(struct page *page, enum km_type type)
3383 -{
3384 - return __kmap_atomic(page, type,
3385 - test_bit(PG_pinned, &page->flags)
3386 - ? PAGE_KERNEL_RO : kmap_prot);
3387 + return kmap_atomic_prot(page, type, kmap_prot);
3388 }
3389
3390 void kunmap_atomic(void *kvaddr, enum km_type type)
3391 @@ -80,6 +72,7 @@
3392 #endif
3393 }
3394
3395 + arch_flush_lazy_mmu_mode();
3396 pagefault_enable();
3397 }
3398
3399 @@ -117,6 +110,5 @@
3400 EXPORT_SYMBOL(kmap);
3401 EXPORT_SYMBOL(kunmap);
3402 EXPORT_SYMBOL(kmap_atomic);
3403 -EXPORT_SYMBOL(kmap_atomic_pte);
3404 EXPORT_SYMBOL(kunmap_atomic);
3405 EXPORT_SYMBOL(kmap_atomic_to_page);
3406 --- a/arch/x86/mm/init_32-xen.c
3407 +++ b/arch/x86/mm/init_32-xen.c
3408 @@ -22,6 +22,7 @@
3409 #include <linux/init.h>
3410 #include <linux/highmem.h>
3411 #include <linux/pagemap.h>
3412 +#include <linux/pfn.h>
3413 #include <linux/poison.h>
3414 #include <linux/bootmem.h>
3415 #include <linux/slab.h>
3416 @@ -67,17 +68,19 @@
3417 pmd_t *pmd_table;
3418
3419 #ifdef CONFIG_X86_PAE
3420 - pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
3421 - paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT);
3422 - make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
3423 - set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
3424 - pud = pud_offset(pgd, 0);
3425 - if (pmd_table != pmd_offset(pud, 0))
3426 - BUG();
3427 -#else
3428 + if (!(__pgd_val(*pgd) & _PAGE_PRESENT)) {
3429 + pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
3430 +
3431 + paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT);
3432 + make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
3433 + set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
3434 + pud = pud_offset(pgd, 0);
3435 + if (pmd_table != pmd_offset(pud, 0))
3436 + BUG();
3437 + }
3438 +#endif
3439 pud = pud_offset(pgd, 0);
3440 pmd_table = pmd_offset(pud, 0);
3441 -#endif
3442
3443 return pmd_table;
3444 }
3445 @@ -88,16 +91,18 @@
3446 */
3447 static pte_t * __init one_page_table_init(pmd_t *pmd)
3448 {
3449 +#if CONFIG_XEN_COMPAT <= 0x030002
3450 if (pmd_none(*pmd)) {
3451 +#else
3452 + if (!(__pmd_val(*pmd) & _PAGE_PRESENT)) {
3453 +#endif
3454 pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
3455 +
3456 paravirt_alloc_pt(__pa(page_table) >> PAGE_SHIFT);
3457 make_lowmem_page_readonly(page_table,
3458 XENFEAT_writable_page_tables);
3459 set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
3460 - if (page_table != pte_offset_kernel(pmd, 0))
3461 - BUG();
3462 -
3463 - return page_table;
3464 + BUG_ON(page_table != pte_offset_kernel(pmd, 0));
3465 }
3466
3467 return pte_offset_kernel(pmd, 0);
3468 @@ -117,7 +122,6 @@
3469 static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base)
3470 {
3471 pgd_t *pgd;
3472 - pud_t *pud;
3473 pmd_t *pmd;
3474 int pgd_idx, pmd_idx;
3475 unsigned long vaddr;
3476 @@ -128,12 +132,10 @@
3477 pgd = pgd_base + pgd_idx;
3478
3479 for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
3480 - if (pgd_none(*pgd))
3481 - one_md_table_init(pgd);
3482 - pud = pud_offset(pgd, vaddr);
3483 - pmd = pmd_offset(pud, vaddr);
3484 + pmd = one_md_table_init(pgd);
3485 + pmd = pmd + pmd_index(vaddr);
3486 for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) {
3487 - if (vaddr < hypervisor_virt_start && pmd_none(*pmd))
3488 + if (vaddr < hypervisor_virt_start)
3489 one_page_table_init(pmd);
3490
3491 vaddr += PMD_SIZE;
3492 @@ -196,24 +198,25 @@
3493 /* Map with big pages if possible, otherwise create normal page tables. */
3494 if (cpu_has_pse) {
3495 unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1;
3496 -
3497 if (is_kernel_text(address) || is_kernel_text(address2))
3498 set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
3499 else
3500 set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
3501 +
3502 pfn += PTRS_PER_PTE;
3503 } else {
3504 pte = one_page_table_init(pmd);
3505
3506 - pte += pte_ofs;
3507 - for (; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) {
3508 - /* XEN: Only map initial RAM allocation. */
3509 - if ((pfn >= max_ram_pfn) || pte_present(*pte))
3510 - continue;
3511 - if (is_kernel_text(address))
3512 - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
3513 - else
3514 - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
3515 + for (pte += pte_ofs;
3516 + pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
3517 + pte++, pfn++, pte_ofs++, address += PAGE_SIZE) {
3518 + /* XEN: Only map initial RAM allocation. */
3519 + if ((pfn >= max_ram_pfn) || pte_present(*pte))
3520 + continue;
3521 + if (is_kernel_text(address))
3522 + set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
3523 + else
3524 + set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
3525 }
3526 pte_ofs = 0;
3527 }
3528 @@ -383,15 +386,44 @@
3529
3530 pgd_t *swapper_pg_dir;
3531
3532 +static void __init xen_pagetable_setup_start(pgd_t *base)
3533 +{
3534 +}
3535 +
3536 +static void __init xen_pagetable_setup_done(pgd_t *base)
3537 +{
3538 +}
3539 +
3540 +/*
3541 + * Build a proper pagetable for the kernel mappings. Up until this
3542 + * point, we've been running on some set of pagetables constructed by
3543 + * the boot process.
3544 + *
3545 + * If we're booting on native hardware, this will be a pagetable
3546 + * constructed in arch/i386/kernel/head.S, and not running in PAE mode
3547 + * (even if we'll end up running in PAE). The root of the pagetable
3548 + * will be swapper_pg_dir.
3549 + *
3550 + * If we're booting paravirtualized under a hypervisor, then there are
3551 + * more options: we may already be running PAE, and the pagetable may
3552 + * or may not be based in swapper_pg_dir. In any case,
3553 + * paravirt_pagetable_setup_start() will set up swapper_pg_dir
3554 + * appropriately for the rest of the initialization to work.
3555 + *
3556 + * In general, pagetable_init() assumes that the pagetable may already
3557 + * be partially populated, and so it avoids stomping on any existing
3558 + * mappings.
3559 + */
3560 static void __init pagetable_init (void)
3561 {
3562 - unsigned long vaddr;
3563 + unsigned long vaddr, end;
3564 pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base;
3565
3566 + xen_pagetable_setup_start(pgd_base);
3567 +
3568 /* Enable PSE if available */
3569 - if (cpu_has_pse) {
3570 + if (cpu_has_pse)
3571 set_in_cr4(X86_CR4_PSE);
3572 - }
3573
3574 /* Enable PGE if available */
3575 if (cpu_has_pge) {
3576 @@ -408,9 +440,12 @@
3577 * created - mappings will be set by set_fixmap():
3578 */
3579 vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
3580 - page_table_range_init(vaddr, hypervisor_virt_start, pgd_base);
3581 + end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
3582 + page_table_range_init(vaddr, end, pgd_base);
3583
3584 permanent_kmaps_init(pgd_base);
3585 +
3586 + xen_pagetable_setup_done(pgd_base);
3587 }
3588
3589 #if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_ACPI_SLEEP)
3590 @@ -757,34 +792,29 @@
3591 EXPORT_SYMBOL_GPL(remove_memory);
3592 #endif
3593
3594 -struct kmem_cache *pgd_cache;
3595 struct kmem_cache *pmd_cache;
3596
3597 void __init pgtable_cache_init(void)
3598 {
3599 + size_t pgd_size = PTRS_PER_PGD*sizeof(pgd_t);
3600 +
3601 if (PTRS_PER_PMD > 1) {
3602 pmd_cache = kmem_cache_create("pmd",
3603 PTRS_PER_PMD*sizeof(pmd_t),
3604 PTRS_PER_PMD*sizeof(pmd_t),
3605 - 0,
3606 + SLAB_PANIC,
3607 pmd_ctor,
3608 NULL);
3609 - if (!pmd_cache)
3610 - panic("pgtable_cache_init(): cannot create pmd cache");
3611 + if (!SHARED_KERNEL_PMD) {
3612 + /* If we're in PAE mode and have a non-shared
3613 + kernel pmd, then the pgd size must be a
3614 + page size. This is because the pgd_list
3615 + links through the page structure, so there
3616 + can only be one pgd per page for this to
3617 + work. */
3618 + pgd_size = PAGE_SIZE;
3619 + }
3620 }
3621 - pgd_cache = kmem_cache_create("pgd",
3622 -#ifndef CONFIG_XEN
3623 - PTRS_PER_PGD*sizeof(pgd_t),
3624 - PTRS_PER_PGD*sizeof(pgd_t),
3625 -#else
3626 - PAGE_SIZE,
3627 - PAGE_SIZE,
3628 -#endif
3629 - 0,
3630 - pgd_ctor,
3631 - PTRS_PER_PMD == 1 ? pgd_dtor : NULL);
3632 - if (!pgd_cache)
3633 - panic("pgtable_cache_init(): Cannot create pgd cache");
3634 }
3635
3636 /*
3637 @@ -818,13 +848,26 @@
3638
3639 void mark_rodata_ro(void)
3640 {
3641 - unsigned long addr = (unsigned long)__start_rodata;
3642 -
3643 - for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE)
3644 - change_page_attr(virt_to_page(addr), 1, PAGE_KERNEL_RO);
3645 + unsigned long start = PFN_ALIGN(_text);
3646 + unsigned long size = PFN_ALIGN(_etext) - start;
3647
3648 - printk("Write protecting the kernel read-only data: %uk\n",
3649 - (__end_rodata - __start_rodata) >> 10);
3650 +#ifndef CONFIG_KPROBES
3651 +#ifdef CONFIG_HOTPLUG_CPU
3652 + /* It must still be possible to apply SMP alternatives. */
3653 + if (num_possible_cpus() <= 1)
3654 +#endif
3655 + {
3656 + change_page_attr(virt_to_page(start),
3657 + size >> PAGE_SHIFT, PAGE_KERNEL_RX);
3658 + printk("Write protecting the kernel text: %luk\n", size >> 10);
3659 + }
3660 +#endif
3661 + start += size;
3662 + size = (unsigned long)__end_rodata - start;
3663 + change_page_attr(virt_to_page(start),
3664 + size >> PAGE_SHIFT, PAGE_KERNEL_RO);
3665 + printk("Write protecting the kernel read-only data: %luk\n",
3666 + size >> 10);
3667
3668 /*
3669 * change_page_attr() requires a global_flush_tlb() call after it.
3670 @@ -847,7 +890,7 @@
3671 free_page(addr);
3672 totalram_pages++;
3673 }
3674 - printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
3675 + printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
3676 }
3677
3678 void free_initmem(void)
3679 --- a/arch/x86/mm/init_64-xen.c
3680 +++ b/arch/x86/mm/init_64-xen.c
3681 @@ -25,10 +25,12 @@
3682 #include <linux/bootmem.h>
3683 #include <linux/proc_fs.h>
3684 #include <linux/pci.h>
3685 +#include <linux/pfn.h>
3686 #include <linux/poison.h>
3687 #include <linux/dma-mapping.h>
3688 #include <linux/module.h>
3689 #include <linux/memory_hotplug.h>
3690 +#include <linux/nmi.h>
3691
3692 #include <asm/processor.h>
3693 #include <asm/system.h>
3694 @@ -51,7 +53,7 @@
3695 #define Dprintk(x...)
3696 #endif
3697
3698 -struct dma_mapping_ops* dma_ops;
3699 +const struct dma_mapping_ops* dma_ops;
3700 EXPORT_SYMBOL(dma_ops);
3701
3702 #if CONFIG_XEN_COMPAT <= 0x030002
3703 @@ -189,6 +191,13 @@
3704
3705 for_each_online_pgdat(pgdat) {
3706 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
3707 + /* this loop can take a while with 256 GB and 4k pages
3708 + so update the NMI watchdog */
3709 + if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) {
3710 + touch_nmi_watchdog();
3711 + }
3712 + if (!pfn_valid(pgdat->node_start_pfn + i))
3713 + continue;
3714 page = pfn_to_page(pgdat->node_start_pfn + i);
3715 total++;
3716 if (PageReserved(page))
3717 @@ -350,7 +359,7 @@
3718 }
3719 }
3720
3721 -unsigned long __initdata table_start, table_end;
3722 +unsigned long __meminitdata table_start, table_end;
3723
3724 static __meminit void *alloc_static_page(unsigned long *phys)
3725 {
3726 @@ -367,7 +376,7 @@
3727 start_pfn++;
3728 memset((void *)va, 0, PAGE_SIZE);
3729 return (void *)va;
3730 -}
3731 +}
3732
3733 #define PTE_SIZE PAGE_SIZE
3734
3735 @@ -408,28 +417,46 @@
3736
3737 #ifndef CONFIG_XEN
3738 /* Must run before zap_low_mappings */
3739 -__init void *early_ioremap(unsigned long addr, unsigned long size)
3740 +__meminit void *early_ioremap(unsigned long addr, unsigned long size)
3741 {
3742 - unsigned long map = round_down(addr, LARGE_PAGE_SIZE);
3743 -
3744 - /* actually usually some more */
3745 - if (size >= LARGE_PAGE_SIZE) {
3746 - return NULL;
3747 + unsigned long vaddr;
3748 + pmd_t *pmd, *last_pmd;
3749 + int i, pmds;
3750 +
3751 + pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
3752 + vaddr = __START_KERNEL_map;
3753 + pmd = level2_kernel_pgt;
3754 + last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
3755 + for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
3756 + for (i = 0; i < pmds; i++) {
3757 + if (pmd_present(pmd[i]))
3758 + goto next;
3759 + }
3760 + vaddr += addr & ~PMD_MASK;
3761 + addr &= PMD_MASK;
3762 + for (i = 0; i < pmds; i++, addr += PMD_SIZE)
3763 + set_pmd(pmd + i,__pmd(addr | _KERNPG_TABLE | _PAGE_PSE));
3764 + __flush_tlb();
3765 + return (void *)vaddr;
3766 + next:
3767 + ;
3768 }
3769 - set_pmd(temp_mappings[0].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
3770 - map += LARGE_PAGE_SIZE;
3771 - set_pmd(temp_mappings[1].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
3772 - __flush_tlb();
3773 - return temp_mappings[0].address + (addr & (LARGE_PAGE_SIZE-1));
3774 + printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
3775 + return NULL;
3776 }
3777
3778 /* To avoid virtual aliases later */
3779 -__init void early_iounmap(void *addr, unsigned long size)
3780 +__meminit void early_iounmap(void *addr, unsigned long size)
3781 {
3782 - if ((void *)round_down((unsigned long)addr, LARGE_PAGE_SIZE) != temp_mappings[0].address)
3783 - printk("early_iounmap: bad address %p\n", addr);
3784 - set_pmd(temp_mappings[0].pmd, __pmd(0));
3785 - set_pmd(temp_mappings[1].pmd, __pmd(0));
3786 + unsigned long vaddr;
3787 + pmd_t *pmd;
3788 + int i, pmds;
3789 +
3790 + vaddr = (unsigned long)addr;
3791 + pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
3792 + pmd = level2_kernel_pgt + pmd_index(vaddr);
3793 + for (i = 0; i < pmds; i++)
3794 + pmd_clear(pmd + i);
3795 __flush_tlb();
3796 }
3797 #endif
3798 @@ -763,14 +790,6 @@
3799 __flush_tlb_all();
3800 }
3801
3802 -void __cpuinit zap_low_mappings(int cpu)
3803 -{
3804 - /* this is not required for Xen */
3805 -#if 0
3806 - swap_low_mappings();
3807 -#endif
3808 -}
3809 -
3810 #ifndef CONFIG_NUMA
3811 void __init paging_init(void)
3812 {
3813 @@ -961,17 +980,6 @@
3814 reservedpages << (PAGE_SHIFT-10),
3815 datasize >> 10,
3816 initsize >> 10);
3817 -
3818 -#ifndef CONFIG_XEN
3819 -#ifdef CONFIG_SMP
3820 - /*
3821 - * Sync boot_level4_pgt mappings with the init_level4_pgt
3822 - * except for the low identity mappings which are already zapped
3823 - * in init_level4_pgt. This sync-up is essential for AP's bringup
3824 - */
3825 - memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t));
3826 -#endif
3827 -#endif
3828 }
3829
3830 void free_init_pages(char *what, unsigned long begin, unsigned long end)
3831 @@ -981,7 +989,7 @@
3832 if (begin >= end)
3833 return;
3834
3835 - printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
3836 + printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
3837 for (addr = begin; addr < end; addr += PAGE_SIZE) {
3838 ClearPageReserved(virt_to_page(addr));
3839 init_page_count(virt_to_page(addr));
3840 @@ -990,24 +998,17 @@
3841 if (addr >= __START_KERNEL_map) {
3842 /* make_readonly() reports all kernel addresses. */
3843 __make_page_writable(__va(__pa(addr)));
3844 - if (HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) {
3845 - pgd_t *pgd = pgd_offset_k(addr);
3846 - pud_t *pud = pud_offset(pgd, addr);
3847 - pmd_t *pmd = pmd_offset(pud, addr);
3848 - pte_t *pte = pte_offset_kernel(pmd, addr);
3849 -
3850 - xen_l1_entry_update(pte, __pte(0)); /* fallback */
3851 - }
3852 + change_page_attr_addr(addr, 1, __pgprot(0));
3853 }
3854 free_page(addr);
3855 totalram_pages++;
3856 }
3857 + if (addr > __START_KERNEL_map)
3858 + global_flush_tlb();
3859 }
3860
3861 void free_initmem(void)
3862 {
3863 - memset(__initdata_begin, POISON_FREE_INITDATA,
3864 - __initdata_end - __initdata_begin);
3865 free_init_pages("unused kernel memory",
3866 (unsigned long)(&__init_begin),
3867 (unsigned long)(&__init_end));
3868 @@ -1017,13 +1018,28 @@
3869
3870 void mark_rodata_ro(void)
3871 {
3872 - unsigned long addr = (unsigned long)__start_rodata;
3873 + unsigned long start = (unsigned long)_stext, end;
3874 +
3875 +#ifdef CONFIG_HOTPLUG_CPU
3876 + /* It must still be possible to apply SMP alternatives. */
3877 + if (num_possible_cpus() > 1)
3878 + start = (unsigned long)_etext;
3879 +#endif
3880 +
3881 +#ifdef CONFIG_KPROBES
3882 + start = (unsigned long)__start_rodata;
3883 +#endif
3884 +
3885 + end = (unsigned long)__end_rodata;
3886 + start = (start + PAGE_SIZE - 1) & PAGE_MASK;
3887 + end &= PAGE_MASK;
3888 + if (end <= start)
3889 + return;
3890
3891 - for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE)
3892 - change_page_attr_addr(addr, 1, PAGE_KERNEL_RO);
3893 + change_page_attr_addr(start, (end - start) >> PAGE_SHIFT, PAGE_KERNEL_RO);
3894
3895 - printk ("Write protecting the kernel read-only data: %luk\n",
3896 - (__end_rodata - __start_rodata) >> 10);
3897 + printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
3898 + (end - start) >> 10);
3899
3900 /*
3901 * change_page_attr_addr() requires a global_flush_tlb() call after it.
3902 @@ -1176,3 +1192,11 @@
3903 {
3904 return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
3905 }
3906 +
3907 +#ifndef CONFIG_XEN
3908 +void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
3909 +{
3910 + return __alloc_bootmem_core(pgdat->bdata, size,
3911 + SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0);
3912 +}
3913 +#endif
3914 --- a/arch/x86/mm/ioremap_32-xen.c
3915 +++ b/arch/x86/mm/ioremap_32-xen.c
3916 @@ -13,6 +13,7 @@
3917 #include <linux/slab.h>
3918 #include <linux/module.h>
3919 #include <linux/io.h>
3920 +#include <linux/sched.h>
3921 #include <asm/fixmap.h>
3922 #include <asm/cacheflush.h>
3923 #include <asm/tlbflush.h>
3924 --- a/arch/x86/mm/pageattr_64-xen.c
3925 +++ b/arch/x86/mm/pageattr_64-xen.c
3926 @@ -215,13 +215,13 @@
3927 preempt_enable();
3928 }
3929
3930 -void _arch_dup_mmap(struct mm_struct *mm)
3931 +void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
3932 {
3933 if (!mm->context.pinned)
3934 mm_pin(mm);
3935 }
3936
3937 -void _arch_exit_mmap(struct mm_struct *mm)
3938 +void arch_exit_mmap(struct mm_struct *mm)
3939 {
3940 struct task_struct *tsk = current;
3941
3942 @@ -337,10 +337,11 @@
3943 struct page *pg;
3944
3945 /* When clflush is available always use it because it is
3946 - much cheaper than WBINVD */
3947 - if (!cpu_has_clflush)
3948 + much cheaper than WBINVD. Disable clflush for now because
3949 + the high level code is not ready yet */
3950 + if (1 || !cpu_has_clflush)
3951 asm volatile("wbinvd" ::: "memory");
3952 - list_for_each_entry(pg, l, lru) {
3953 + else list_for_each_entry(pg, l, lru) {
3954 void *adr = page_address(pg);
3955 if (cpu_has_clflush)
3956 cache_flush_page(adr);
3957 @@ -454,16 +455,24 @@
3958 */
3959 int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
3960 {
3961 - int err = 0;
3962 + int err = 0, kernel_map = 0;
3963 int i;
3964
3965 + if (address >= __START_KERNEL_map
3966 + && address < __START_KERNEL_map + KERNEL_TEXT_SIZE) {
3967 + address = (unsigned long)__va(__pa(address));
3968 + kernel_map = 1;
3969 + }
3970 +
3971 down_write(&init_mm.mmap_sem);
3972 for (i = 0; i < numpages; i++, address += PAGE_SIZE) {
3973 unsigned long pfn = __pa(address) >> PAGE_SHIFT;
3974
3975 - err = __change_page_attr(address, pfn, prot, PAGE_KERNEL);
3976 - if (err)
3977 - break;
3978 + if (!kernel_map || pte_present(pfn_pte(0, prot))) {
3979 + err = __change_page_attr(address, pfn, prot, PAGE_KERNEL);
3980 + if (err)
3981 + break;
3982 + }
3983 /* Handle kernel mapping too which aliases part of the
3984 * lowmem */
3985 if (__pa(address) < KERNEL_TEXT_SIZE) {
3986 --- a/arch/x86/mm/pgtable_32-xen.c
3987 +++ b/arch/x86/mm/pgtable_32-xen.c
3988 @@ -13,6 +13,7 @@
3989 #include <linux/pagemap.h>
3990 #include <linux/spinlock.h>
3991 #include <linux/module.h>
3992 +#include <linux/quicklist.h>
3993
3994 #include <asm/system.h>
3995 #include <asm/pgtable.h>
3996 @@ -212,8 +213,6 @@
3997 * against pageattr.c; it is the unique case in which a valid change
3998 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
3999 * vmalloc faults work because attached pagetables are never freed.
4000 - * The locking scheme was chosen on the basis of manfred's
4001 - * recommendations and having no core impact whatsoever.
4002 * -- wli
4003 */
4004 DEFINE_SPINLOCK(pgd_lock);
4005 @@ -239,37 +238,59 @@
4006 set_page_private(next, (unsigned long)pprev);
4007 }
4008
4009 -void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused)
4010 +
4011 +
4012 +#if (PTRS_PER_PMD == 1)
4013 +/* Non-PAE pgd constructor */
4014 +void pgd_ctor(void *pgd)
4015 {
4016 unsigned long flags;
4017
4018 - if (PTRS_PER_PMD > 1) {
4019 - if (HAVE_SHARED_KERNEL_PMD)
4020 - clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
4021 - swapper_pg_dir + USER_PTRS_PER_PGD,
4022 - KERNEL_PGD_PTRS);
4023 - } else {
4024 - spin_lock_irqsave(&pgd_lock, flags);
4025 + /* !PAE, no pagetable sharing */
4026 + memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
4027 +
4028 + spin_lock_irqsave(&pgd_lock, flags);
4029 +
4030 + /* must happen under lock */
4031 + clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
4032 + swapper_pg_dir + USER_PTRS_PER_PGD,
4033 + KERNEL_PGD_PTRS);
4034 +
4035 + paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
4036 + __pa(swapper_pg_dir) >> PAGE_SHIFT,
4037 + USER_PTRS_PER_PGD,
4038 + KERNEL_PGD_PTRS);
4039 + pgd_list_add(pgd);
4040 + spin_unlock_irqrestore(&pgd_lock, flags);
4041 +}
4042 +#else /* PTRS_PER_PMD > 1 */
4043 +/* PAE pgd constructor */
4044 +void pgd_ctor(void *pgd)
4045 +{
4046 + /* PAE, kernel PMD may be shared */
4047 +
4048 + if (SHARED_KERNEL_PMD) {
4049 clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
4050 swapper_pg_dir + USER_PTRS_PER_PGD,
4051 KERNEL_PGD_PTRS);
4052 - memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
4053 -
4054 - /* must happen under lock */
4055 - paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
4056 - __pa(swapper_pg_dir) >> PAGE_SHIFT,
4057 - USER_PTRS_PER_PGD, PTRS_PER_PGD - USER_PTRS_PER_PGD);
4058 + } else {
4059 + unsigned long flags;
4060
4061 + memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
4062 + spin_lock_irqsave(&pgd_lock, flags);
4063 pgd_list_add(pgd);
4064 spin_unlock_irqrestore(&pgd_lock, flags);
4065 }
4066 }
4067 +#endif /* PTRS_PER_PMD */
4068
4069 -/* never called when PTRS_PER_PMD > 1 */
4070 -void pgd_dtor(void *pgd, struct kmem_cache *cache, unsigned long unused)
4071 +void pgd_dtor(void *pgd)
4072 {
4073 unsigned long flags; /* can be called from interrupt context */
4074
4075 + if (SHARED_KERNEL_PMD)
4076 + return;
4077 +
4078 paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT);
4079 spin_lock_irqsave(&pgd_lock, flags);
4080 pgd_list_del(pgd);
4081 @@ -278,11 +299,46 @@
4082 pgd_test_and_unpin(pgd);
4083 }
4084
4085 +#define UNSHARED_PTRS_PER_PGD \
4086 + (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
4087 +
4088 +/* If we allocate a pmd for part of the kernel address space, then
4089 + make sure its initialized with the appropriate kernel mappings.
4090 + Otherwise use a cached zeroed pmd. */
4091 +static pmd_t *pmd_cache_alloc(int idx)
4092 +{
4093 + pmd_t *pmd;
4094 +
4095 + if (idx >= USER_PTRS_PER_PGD) {
4096 + pmd = (pmd_t *)__get_free_page(GFP_KERNEL);
4097 +
4098 +#ifndef CONFIG_XEN
4099 + if (pmd)
4100 + memcpy(pmd,
4101 + (void *)pgd_page_vaddr(swapper_pg_dir[idx]),
4102 + sizeof(pmd_t) * PTRS_PER_PMD);
4103 +#endif
4104 + } else
4105 + pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
4106 +
4107 + return pmd;
4108 +}
4109 +
4110 +static void pmd_cache_free(pmd_t *pmd, int idx)
4111 +{
4112 + if (idx >= USER_PTRS_PER_PGD) {
4113 + make_lowmem_page_writable(pmd, XENFEAT_writable_page_tables);
4114 + memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
4115 + free_page((unsigned long)pmd);
4116 + } else
4117 + kmem_cache_free(pmd_cache, pmd);
4118 +}
4119 +
4120 pgd_t *pgd_alloc(struct mm_struct *mm)
4121 {
4122 int i;
4123 - pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
4124 - pmd_t **pmd;
4125 + pgd_t *pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor);
4126 + pmd_t **pmds = NULL;
4127 unsigned long flags;
4128
4129 pgd_test_and_unpin(pgd);
4130 @@ -290,37 +346,40 @@
4131 if (PTRS_PER_PMD == 1 || !pgd)
4132 return pgd;
4133
4134 - if (HAVE_SHARED_KERNEL_PMD) {
4135 - for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
4136 - pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
4137 - if (!pmd)
4138 - goto out_oom;
4139 - paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT);
4140 - set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
4141 +#ifdef CONFIG_XEN
4142 + if (!SHARED_KERNEL_PMD) {
4143 + /*
4144 + * We can race save/restore (if we sleep during a GFP_KERNEL memory
4145 + * allocation). We therefore store virtual addresses of pmds as they
4146 + * do not change across save/restore, and poke the machine addresses
4147 + * into the pgdir under the pgd_lock.
4148 + */
4149 + pmds = kmalloc(PTRS_PER_PGD * sizeof(pmd_t *), GFP_KERNEL);
4150 + if (!pmds) {
4151 + quicklist_free(0, pgd_dtor, pgd);
4152 + return NULL;
4153 }
4154 - return pgd;
4155 - }
4156 -
4157 - /*
4158 - * We can race save/restore (if we sleep during a GFP_KERNEL memory
4159 - * allocation). We therefore store virtual addresses of pmds as they
4160 - * do not change across save/restore, and poke the machine addresses
4161 - * into the pgdir under the pgd_lock.
4162 - */
4163 - pmd = kmalloc(PTRS_PER_PGD * sizeof(pmd_t *), GFP_KERNEL);
4164 - if (!pmd) {
4165 - kmem_cache_free(pgd_cache, pgd);
4166 - return NULL;
4167 }
4168 +#endif
4169
4170 /* Allocate pmds, remember virtual addresses. */
4171 - for (i = 0; i < PTRS_PER_PGD; ++i) {
4172 - pmd[i] = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
4173 - if (!pmd[i])
4174 + for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
4175 + pmd_t *pmd = pmd_cache_alloc(i);
4176 +
4177 + if (!pmd)
4178 goto out_oom;
4179 +
4180 paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT);
4181 + if (pmds)
4182 + pmds[i] = pmd;
4183 + else
4184 + set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
4185 }
4186
4187 +#ifdef CONFIG_XEN
4188 + if (SHARED_KERNEL_PMD)
4189 + return pgd;
4190 +
4191 spin_lock_irqsave(&pgd_lock, flags);
4192
4193 /* Protect against save/restore: move below 4GB under pgd_lock. */
4194 @@ -335,44 +394,40 @@
4195
4196 /* Copy kernel pmd contents and write-protect the new pmds. */
4197 for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
4198 - unsigned long v = (unsigned long)i << PGDIR_SHIFT;
4199 - pgd_t *kpgd = pgd_offset_k(v);
4200 - pud_t *kpud = pud_offset(kpgd, v);
4201 - pmd_t *kpmd = pmd_offset(kpud, v);
4202 - memcpy(pmd[i], kpmd, PAGE_SIZE);
4203 + memcpy(pmds[i],
4204 + (void *)pgd_page_vaddr(swapper_pg_dir[i]),
4205 + sizeof(pmd_t) * PTRS_PER_PMD);
4206 make_lowmem_page_readonly(
4207 - pmd[i], XENFEAT_writable_page_tables);
4208 + pmds[i], XENFEAT_writable_page_tables);
4209 }
4210
4211 /* It is safe to poke machine addresses of pmds under the pmd_lock. */
4212 for (i = 0; i < PTRS_PER_PGD; i++)
4213 - set_pgd(&pgd[i], __pgd(1 + __pa(pmd[i])));
4214 -
4215 - /* Ensure this pgd gets picked up and pinned on save/restore. */
4216 - pgd_list_add(pgd);
4217 + set_pgd(&pgd[i], __pgd(1 + __pa(pmds[i])));
4218
4219 spin_unlock_irqrestore(&pgd_lock, flags);
4220
4221 - kfree(pmd);
4222 + kfree(pmds);
4223 +#endif
4224
4225 return pgd;
4226
4227 out_oom:
4228 - if (HAVE_SHARED_KERNEL_PMD) {
4229 + if (!pmds) {
4230 for (i--; i >= 0; i--) {
4231 pgd_t pgdent = pgd[i];
4232 void* pmd = (void *)__va(pgd_val(pgdent)-1);
4233 paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
4234 - kmem_cache_free(pmd_cache, pmd);
4235 + pmd_cache_free(pmd, i);
4236 }
4237 } else {
4238 for (i--; i >= 0; i--) {
4239 - paravirt_release_pd(__pa(pmd[i]) >> PAGE_SHIFT);
4240 - kmem_cache_free(pmd_cache, pmd[i]);
4241 + paravirt_release_pd(__pa(pmds[i]) >> PAGE_SHIFT);
4242 + pmd_cache_free(pmds[i], i);
4243 }
4244 - kfree(pmd);
4245 + kfree(pmds);
4246 }
4247 - kmem_cache_free(pgd_cache, pgd);
4248 + quicklist_free(0, pgd_dtor, pgd);
4249 return NULL;
4250 }
4251
4252 @@ -392,35 +447,24 @@
4253
4254 /* in the PAE case user pgd entries are overwritten before usage */
4255 if (PTRS_PER_PMD > 1) {
4256 - for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
4257 + for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
4258 pgd_t pgdent = pgd[i];
4259 void* pmd = (void *)__va(pgd_val(pgdent)-1);
4260 paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
4261 - kmem_cache_free(pmd_cache, pmd);
4262 + pmd_cache_free(pmd, i);
4263 }
4264
4265 - if (!HAVE_SHARED_KERNEL_PMD) {
4266 - unsigned long flags;
4267 - spin_lock_irqsave(&pgd_lock, flags);
4268 - pgd_list_del(pgd);
4269 - spin_unlock_irqrestore(&pgd_lock, flags);
4270 -
4271 - for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
4272 - pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
4273 - make_lowmem_page_writable(
4274 - pmd, XENFEAT_writable_page_tables);
4275 - memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
4276 - kmem_cache_free(pmd_cache, pmd);
4277 - }
4278 -
4279 - if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
4280 - xen_destroy_contiguous_region(
4281 - (unsigned long)pgd, 0);
4282 - }
4283 + if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
4284 + xen_destroy_contiguous_region((unsigned long)pgd, 0);
4285 }
4286
4287 /* in the non-PAE case, free_pgtables() clears user pgd entries */
4288 - kmem_cache_free(pgd_cache, pgd);
4289 + quicklist_free(0, pgd_dtor, pgd);
4290 +}
4291 +
4292 +void check_pgt_cache(void)
4293 +{
4294 + quicklist_trim(0, pgd_dtor, 25, 16);
4295 }
4296
4297 void make_lowmem_page_readonly(void *va, unsigned int feature)
4298 @@ -717,13 +761,13 @@
4299 spin_unlock_irqrestore(&pgd_lock, flags);
4300 }
4301
4302 -void _arch_dup_mmap(struct mm_struct *mm)
4303 +void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
4304 {
4305 if (!test_bit(PG_pinned, &virt_to_page(mm->pgd)->flags))
4306 mm_pin(mm);
4307 }
4308
4309 -void _arch_exit_mmap(struct mm_struct *mm)
4310 +void arch_exit_mmap(struct mm_struct *mm)
4311 {
4312 struct task_struct *tsk = current;
4313
4314 --- a/drivers/char/tpm/tpm_xen.c
4315 +++ b/drivers/char/tpm/tpm_xen.c
4316 @@ -463,7 +463,7 @@
4317 tp->backend_id = domid;
4318
4319 err = bind_listening_port_to_irqhandler(
4320 - domid, tpmif_int, SA_SAMPLE_RANDOM, "tpmif", tp);
4321 + domid, tpmif_int, IRQF_SAMPLE_RANDOM, "tpmif", tp);
4322 if (err <= 0) {
4323 WPRINTK("bind_listening_port_to_irqhandler failed "
4324 "(err=%d)\n", err);
4325 --- a/drivers/xen/blkfront/blkfront.c
4326 +++ b/drivers/xen/blkfront/blkfront.c
4327 @@ -236,7 +236,7 @@
4328 info->ring_ref = err;
4329
4330 err = bind_listening_port_to_irqhandler(
4331 - dev->otherend_id, blkif_int, SA_SAMPLE_RANDOM, "blkif", info);
4332 + dev->otherend_id, blkif_int, IRQF_SAMPLE_RANDOM, "blkif", info);
4333 if (err <= 0) {
4334 xenbus_dev_fatal(dev, err,
4335 "bind_listening_port_to_irqhandler");
4336 --- a/drivers/xen/char/mem.c
4337 +++ b/drivers/xen/char/mem.c
4338 @@ -18,7 +18,6 @@
4339 #include <linux/raw.h>
4340 #include <linux/tty.h>
4341 #include <linux/capability.h>
4342 -#include <linux/smp_lock.h>
4343 #include <linux/ptrace.h>
4344 #include <linux/device.h>
4345 #include <asm/pgalloc.h>
4346 --- a/drivers/xen/core/hypervisor_sysfs.c
4347 +++ b/drivers/xen/core/hypervisor_sysfs.c
4348 @@ -50,7 +50,7 @@
4349 if (!is_running_on_xen())
4350 return -ENODEV;
4351
4352 - hypervisor_subsys.kset.kobj.ktype = &hyp_sysfs_kobj_type;
4353 + hypervisor_subsys.kobj.ktype = &hyp_sysfs_kobj_type;
4354 return 0;
4355 }
4356
4357 --- a/drivers/xen/core/smpboot.c
4358 +++ b/drivers/xen/core/smpboot.c
4359 @@ -121,7 +121,7 @@
4360 rc = bind_ipi_to_irqhandler(RESCHEDULE_VECTOR,
4361 cpu,
4362 smp_reschedule_interrupt,
4363 - SA_INTERRUPT,
4364 + IRQF_DISABLED,
4365 resched_name[cpu],
4366 NULL);
4367 if (rc < 0)
4368 @@ -132,7 +132,7 @@
4369 rc = bind_ipi_to_irqhandler(CALL_FUNCTION_VECTOR,
4370 cpu,
4371 smp_call_function_interrupt,
4372 - SA_INTERRUPT,
4373 + IRQF_DISABLED,
4374 callfunc_name[cpu],
4375 NULL);
4376 if (rc < 0)
4377 @@ -165,13 +165,12 @@
4378
4379 void __cpuinit cpu_bringup(void)
4380 {
4381 + cpu_init();
4382 #ifdef __i386__
4383 - cpu_set_gdt(current_thread_info()->cpu);
4384 - secondary_cpu_init();
4385 + identify_secondary_cpu(cpu_data + smp_processor_id());
4386 #else
4387 - cpu_init();
4388 -#endif
4389 identify_cpu(cpu_data + smp_processor_id());
4390 +#endif
4391 touch_softlockup_watchdog();
4392 preempt_disable();
4393 local_irq_enable();
4394 @@ -191,11 +190,6 @@
4395 static DEFINE_SPINLOCK(ctxt_lock);
4396
4397 struct task_struct *idle = idle_task(cpu);
4398 -#ifdef __x86_64__
4399 - struct desc_ptr *gdt_descr = &cpu_gdt_descr[cpu];
4400 -#else
4401 - struct Xgt_desc_struct *gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
4402 -#endif
4403
4404 if (cpu_test_and_set(cpu, cpu_initialized_map))
4405 return;
4406 @@ -218,11 +212,11 @@
4407 smp_trap_init(ctxt.trap_ctxt);
4408
4409 ctxt.ldt_ents = 0;
4410 -
4411 - ctxt.gdt_frames[0] = virt_to_mfn(gdt_descr->address);
4412 - ctxt.gdt_ents = gdt_descr->size / 8;
4413 + ctxt.gdt_ents = GDT_SIZE / 8;
4414
4415 #ifdef __i386__
4416 + ctxt.gdt_frames[0] = virt_to_mfn(get_cpu_gdt_table(cpu));
4417 +
4418 ctxt.user_regs.cs = __KERNEL_CS;
4419 ctxt.user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs);
4420
4421 @@ -235,7 +229,11 @@
4422 ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
4423
4424 ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
4425 +
4426 + ctxt.user_regs.fs = __KERNEL_PERCPU;
4427 #else /* __x86_64__ */
4428 + ctxt.gdt_frames[0] = virt_to_mfn(cpu_gdt_descr[cpu].address);
4429 +
4430 ctxt.user_regs.cs = __KERNEL_CS;
4431 ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs);
4432
4433 @@ -265,9 +263,8 @@
4434 struct vcpu_get_physid cpu_id;
4435 #ifdef __x86_64__
4436 struct desc_ptr *gdt_descr;
4437 -#else
4438 - struct Xgt_desc_struct *gdt_descr;
4439 #endif
4440 + void *gdt_addr;
4441
4442 apicid = 0;
4443 if (HYPERVISOR_vcpu_op(VCPUOP_get_physid, 0, &cpu_id) == 0)
4444 @@ -317,14 +314,12 @@
4445 }
4446 gdt_descr->size = GDT_SIZE;
4447 memcpy((void *)gdt_descr->address, cpu_gdt_table, GDT_SIZE);
4448 + gdt_addr = (void *)gdt_descr->address;
4449 #else
4450 - if (unlikely(!init_gdt(cpu, idle)))
4451 - continue;
4452 - gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
4453 + init_gdt(cpu);
4454 + gdt_addr = get_cpu_gdt_table(cpu);
4455 #endif
4456 - make_page_readonly(
4457 - (void *)gdt_descr->address,
4458 - XENFEAT_writable_descriptor_tables);
4459 + make_page_readonly(gdt_addr, XENFEAT_writable_descriptor_tables);
4460
4461 apicid = cpu;
4462 if (HYPERVISOR_vcpu_op(VCPUOP_get_physid, cpu, &cpu_id) == 0)
4463 @@ -338,7 +333,9 @@
4464 #ifdef __x86_64__
4465 cpu_pda(cpu)->pcurrent = idle;
4466 cpu_pda(cpu)->cpunumber = cpu;
4467 - clear_ti_thread_flag(idle->thread_info, TIF_FORK);
4468 + clear_ti_thread_flag(task_thread_info(idle), TIF_FORK);
4469 +#else
4470 + per_cpu(current_task, cpu) = idle;
4471 #endif
4472
4473 irq_ctx_init(cpu);
4474 @@ -363,8 +360,12 @@
4475 #endif
4476 }
4477
4478 -void __devinit smp_prepare_boot_cpu(void)
4479 +void __init smp_prepare_boot_cpu(void)
4480 {
4481 +#ifdef __i386__
4482 + init_gdt(smp_processor_id());
4483 + switch_to_new_gdt();
4484 +#endif
4485 prefill_possible_map();
4486 }
4487
4488 --- a/drivers/xen/core/xen_sysfs.c
4489 +++ b/drivers/xen/core/xen_sysfs.c
4490 @@ -28,12 +28,12 @@
4491
4492 static int __init xen_sysfs_type_init(void)
4493 {
4494 - return sysfs_create_file(&hypervisor_subsys.kset.kobj, &type_attr.attr);
4495 + return sysfs_create_file(&hypervisor_subsys.kobj, &type_attr.attr);
4496 }
4497
4498 static void xen_sysfs_type_destroy(void)
4499 {
4500 - sysfs_remove_file(&hypervisor_subsys.kset.kobj, &type_attr.attr);
4501 + sysfs_remove_file(&hypervisor_subsys.kobj, &type_attr.attr);
4502 }
4503
4504 /* xen version attributes */
4505 @@ -89,13 +89,13 @@
4506
4507 static int __init xen_sysfs_version_init(void)
4508 {
4509 - return sysfs_create_group(&hypervisor_subsys.kset.kobj,
4510 + return sysfs_create_group(&hypervisor_subsys.kobj,
4511 &version_group);
4512 }
4513
4514 static void xen_sysfs_version_destroy(void)
4515 {
4516 - sysfs_remove_group(&hypervisor_subsys.kset.kobj, &version_group);
4517 + sysfs_remove_group(&hypervisor_subsys.kobj, &version_group);
4518 }
4519
4520 /* UUID */
4521 @@ -125,12 +125,12 @@
4522
4523 static int __init xen_sysfs_uuid_init(void)
4524 {
4525 - return sysfs_create_file(&hypervisor_subsys.kset.kobj, &uuid_attr.attr);
4526 + return sysfs_create_file(&hypervisor_subsys.kobj, &uuid_attr.attr);
4527 }
4528
4529 static void xen_sysfs_uuid_destroy(void)
4530 {
4531 - sysfs_remove_file(&hypervisor_subsys.kset.kobj, &uuid_attr.attr);
4532 + sysfs_remove_file(&hypervisor_subsys.kobj, &uuid_attr.attr);
4533 }
4534
4535 /* xen compilation attributes */
4536 @@ -203,13 +203,13 @@
4537
4538 int __init static xen_compilation_init(void)
4539 {
4540 - return sysfs_create_group(&hypervisor_subsys.kset.kobj,
4541 + return sysfs_create_group(&hypervisor_subsys.kobj,
4542 &xen_compilation_group);
4543 }
4544
4545 static void xen_compilation_destroy(void)
4546 {
4547 - sysfs_remove_group(&hypervisor_subsys.kset.kobj,
4548 + sysfs_remove_group(&hypervisor_subsys.kobj,
4549 &xen_compilation_group);
4550 }
4551
4552 @@ -324,13 +324,13 @@
4553
4554 static int __init xen_properties_init(void)
4555 {
4556 - return sysfs_create_group(&hypervisor_subsys.kset.kobj,
4557 + return sysfs_create_group(&hypervisor_subsys.kobj,
4558 &xen_properties_group);
4559 }
4560
4561 static void xen_properties_destroy(void)
4562 {
4563 - sysfs_remove_group(&hypervisor_subsys.kset.kobj,
4564 + sysfs_remove_group(&hypervisor_subsys.kobj,
4565 &xen_properties_group);
4566 }
4567
4568 --- a/drivers/xen/netback/netback.c
4569 +++ b/drivers/xen/netback/netback.c
4570 @@ -180,7 +180,7 @@
4571 goto err;
4572
4573 skb_reserve(nskb, 16 + NET_IP_ALIGN);
4574 - headlen = nskb->end - nskb->data;
4575 + headlen = skb_end_pointer(nskb) - nskb->data;
4576 if (headlen > skb_headlen(skb))
4577 headlen = skb_headlen(skb);
4578 ret = skb_copy_bits(skb, 0, __skb_put(nskb, headlen), headlen);
4579 @@ -226,11 +226,15 @@
4580 len -= copy;
4581 }
4582
4583 +#ifdef NET_SKBUFF_DATA_USES_OFFSET
4584 + offset = 0;
4585 +#else
4586 offset = nskb->data - skb->data;
4587 +#endif
4588
4589 - nskb->h.raw = skb->h.raw + offset;
4590 - nskb->nh.raw = skb->nh.raw + offset;
4591 - nskb->mac.raw = skb->mac.raw + offset;
4592 + nskb->transport_header = skb->transport_header + offset;
4593 + nskb->network_header = skb->network_header + offset;
4594 + nskb->mac_header = skb->mac_header + offset;
4595
4596 return nskb;
4597
4598 @@ -1601,7 +1605,7 @@
4599 (void)bind_virq_to_irqhandler(VIRQ_DEBUG,
4600 0,
4601 netif_be_dbg,
4602 - SA_SHIRQ,
4603 + IRQF_SHARED,
4604 "net-be-dbg",
4605 &netif_be_dbg);
4606 #endif
4607 --- a/drivers/xen/netfront/netfront.c
4608 +++ b/drivers/xen/netfront/netfront.c
4609 @@ -513,7 +513,7 @@
4610 memcpy(netdev->dev_addr, info->mac, ETH_ALEN);
4611
4612 err = bind_listening_port_to_irqhandler(
4613 - dev->otherend_id, netif_int, SA_SAMPLE_RANDOM, netdev->name,
4614 + dev->otherend_id, netif_int, IRQF_SAMPLE_RANDOM, netdev->name,
4615 netdev);
4616 if (err < 0)
4617 goto fail;
4618 --- a/drivers/xen/pciback/xenbus.c
4619 +++ b/drivers/xen/pciback/xenbus.c
4620 @@ -86,7 +86,7 @@
4621
4622 err = bind_interdomain_evtchn_to_irqhandler(
4623 pdev->xdev->otherend_id, remote_evtchn, pciback_handle_event,
4624 - SA_SAMPLE_RANDOM, "pciback", pdev);
4625 + IRQF_SAMPLE_RANDOM, "pciback", pdev);
4626 if (err < 0) {
4627 xenbus_dev_fatal(pdev->xdev, err,
4628 "Error binding event channel to IRQ");
4629 --- a/drivers/xen/pcifront/xenbus.c
4630 +++ b/drivers/xen/pcifront/xenbus.c
4631 @@ -10,10 +10,6 @@
4632 #include <xen/gnttab.h>
4633 #include "pcifront.h"
4634
4635 -#ifndef __init_refok
4636 -#define __init_refok
4637 -#endif
4638 -
4639 #define INVALID_GRANT_REF (0)
4640 #define INVALID_EVTCHN (-1)
4641
4642 --- a/drivers/xen/sfc_netback/accel_fwd.c
4643 +++ b/drivers/xen/sfc_netback/accel_fwd.c
4644 @@ -308,7 +308,7 @@
4645 static inline int packet_is_arp_reply(struct sk_buff *skb)
4646 {
4647 return skb->protocol == ntohs(ETH_P_ARP)
4648 - && skb->nh.arph->ar_op == ntohs(ARPOP_REPLY);
4649 + && arp_hdr(skb)->ar_op == ntohs(ARPOP_REPLY);
4650 }
4651
4652
4653 @@ -392,12 +392,13 @@
4654
4655 BUG_ON(fwd_priv == NULL);
4656
4657 - if (is_broadcast_ether_addr(skb->mac.raw) && packet_is_arp_reply(skb)) {
4658 + if (is_broadcast_ether_addr(skb_mac_header(skb))
4659 + && packet_is_arp_reply(skb)) {
4660 /*
4661 * update our fast path forwarding to reflect this
4662 * gratuitous ARP
4663 */
4664 - mac = skb->mac.raw+ETH_ALEN;
4665 + mac = skb_mac_header(skb)+ETH_ALEN;
4666
4667 DPRINTK("%s: found gratuitous ARP for " MAC_FMT "\n",
4668 __FUNCTION__, MAC_ARG(mac));
4669 --- a/drivers/xen/sfc_netback/accel_solarflare.c
4670 +++ b/drivers/xen/sfc_netback/accel_solarflare.c
4671 @@ -114,7 +114,7 @@
4672 BUG_ON(port == NULL);
4673
4674 NETBACK_ACCEL_STATS_OP(global_stats.dl_tx_packets++);
4675 - if (skb->mac.raw != NULL)
4676 + if (skb_mac_header_was_set(skb))
4677 netback_accel_tx_packet(skb, port->fwd_priv);
4678 else {
4679 DPRINTK("Ignoring packet with missing mac address\n");
4680 --- a/drivers/xen/sfc_netfront/accel_tso.c
4681 +++ b/drivers/xen/sfc_netfront/accel_tso.c
4682 @@ -33,10 +33,9 @@
4683
4684 #include "accel_tso.h"
4685
4686 -#define PTR_DIFF(p1, p2) ((u8*)(p1) - (u8*)(p2))
4687 -#define ETH_HDR_LEN(skb) ((skb)->nh.raw - (skb)->data)
4688 -#define SKB_TCP_OFF(skb) PTR_DIFF ((skb)->h.th, (skb)->data)
4689 -#define SKB_IP_OFF(skb) PTR_DIFF ((skb)->nh.iph, (skb)->data)
4690 +#define ETH_HDR_LEN(skb) skb_network_offset(skb)
4691 +#define SKB_TCP_OFF(skb) skb_transport_offset(skb)
4692 +#define SKB_IP_OFF(skb) skb_network_offset(skb)
4693
4694 /*
4695 * Set a maximum number of buffers in each output packet to make life
4696 @@ -114,9 +113,8 @@
4697 static inline void tso_check_safe(struct sk_buff *skb) {
4698 EPRINTK_ON(skb->protocol != htons (ETH_P_IP));
4699 EPRINTK_ON(((struct ethhdr*) skb->data)->h_proto != htons (ETH_P_IP));
4700 - EPRINTK_ON(skb->nh.iph->protocol != IPPROTO_TCP);
4701 - EPRINTK_ON((SKB_TCP_OFF(skb)
4702 - + (skb->h.th->doff << 2u)) > skb_headlen(skb));
4703 + EPRINTK_ON(ip_hdr(skb)->protocol != IPPROTO_TCP);
4704 + EPRINTK_ON((SKB_TCP_OFF(skb) + tcp_hdrlen(skb)) > skb_headlen(skb));
4705 }
4706
4707
4708 @@ -129,17 +127,17 @@
4709 * All ethernet/IP/TCP headers combined size is TCP header size
4710 * plus offset of TCP header relative to start of packet.
4711 */
4712 - st->p.header_length = (skb->h.th->doff << 2u) + SKB_TCP_OFF(skb);
4713 + st->p.header_length = tcp_hdrlen(skb) + SKB_TCP_OFF(skb);
4714 st->p.full_packet_size = (st->p.header_length
4715 + skb_shinfo(skb)->gso_size);
4716 st->p.gso_size = skb_shinfo(skb)->gso_size;
4717
4718 - st->p.ip_id = htons(skb->nh.iph->id);
4719 - st->seqnum = ntohl(skb->h.th->seq);
4720 + st->p.ip_id = htons(ip_hdr(skb)->id);
4721 + st->seqnum = ntohl(tcp_hdr(skb)->seq);
4722
4723 - EPRINTK_ON(skb->h.th->urg);
4724 - EPRINTK_ON(skb->h.th->syn);
4725 - EPRINTK_ON(skb->h.th->rst);
4726 + EPRINTK_ON(tcp_hdr(skb)->urg);
4727 + EPRINTK_ON(tcp_hdr(skb)->syn);
4728 + EPRINTK_ON(tcp_hdr(skb)->rst);
4729
4730 st->remaining_len = skb->len - st->p.header_length;
4731
4732 @@ -258,8 +256,8 @@
4733 /* This packet will be the last in the TSO burst. */
4734 ip_length = (st->p.header_length - ETH_HDR_LEN(skb)
4735 + st->remaining_len);
4736 - tsoh_th->fin = skb->h.th->fin;
4737 - tsoh_th->psh = skb->h.th->psh;
4738 + tsoh_th->fin = tcp_hdr(skb)->fin;
4739 + tsoh_th->psh = tcp_hdr(skb)->psh;
4740 }
4741
4742 tsoh_iph->tot_len = htons(ip_length);
4743 --- a/drivers/xen/sfc_netfront/accel_vi.c
4744 +++ b/drivers/xen/sfc_netfront/accel_vi.c
4745 @@ -463,7 +463,7 @@
4746
4747 if (skb->ip_summed == CHECKSUM_PARTIAL) {
4748 /* Set to zero to encourage falcon to work it out for us */
4749 - *(u16*)(skb->h.raw + skb->csum_offset) = 0;
4750 + *(u16*)(skb->head + skb->csum_start + skb->csum_offset) = 0;
4751 }
4752
4753 if (multi_post_start_new_buffer(vnic, &state)) {
4754 @@ -582,7 +582,7 @@
4755
4756 if (skb->ip_summed == CHECKSUM_PARTIAL) {
4757 /* Set to zero to encourage falcon to work it out for us */
4758 - *(u16*)(skb->h.raw + skb->csum_offset) = 0;
4759 + *(u16*)(skb->head + skb->csum_start + skb->csum_offset) = 0;
4760 }
4761 NETFRONT_ACCEL_PKTBUFF_FOR_EACH_FRAGMENT
4762 (skb, idx, frag_data, frag_len, {
4763 --- a/drivers/xen/sfc_netfront/accel_xenbus.c
4764 +++ b/drivers/xen/sfc_netfront/accel_xenbus.c
4765 @@ -356,7 +356,7 @@
4766 /* Create xenbus msg event channel */
4767 err = bind_listening_port_to_irqhandler
4768 (dev->otherend_id, netfront_accel_msg_channel_irq_from_bend,
4769 - SA_SAMPLE_RANDOM, "vnicctrl", vnic);
4770 + IRQF_SAMPLE_RANDOM, "vnicctrl", vnic);
4771 if (err < 0) {
4772 EPRINTK("Couldn't bind msg event channel\n");
4773 goto fail_msg_irq;
4774 @@ -367,7 +367,7 @@
4775 /* Create xenbus net event channel */
4776 err = bind_listening_port_to_irqhandler
4777 (dev->otherend_id, netfront_accel_net_channel_irq_from_bend,
4778 - SA_SAMPLE_RANDOM, "vnicfront", vnic);
4779 + IRQF_SAMPLE_RANDOM, "vnicfront", vnic);
4780 if (err < 0) {
4781 EPRINTK("Couldn't bind net event channel\n");
4782 goto fail_net_irq;
4783 --- a/drivers/xen/xenoprof/xenoprofile.c
4784 +++ b/drivers/xen/xenoprof/xenoprofile.c
4785 @@ -236,7 +236,7 @@
4786 result = bind_virq_to_irqhandler(VIRQ_XENOPROF,
4787 i,
4788 xenoprof_ovf_interrupt,
4789 - SA_INTERRUPT,
4790 + IRQF_DISABLED,
4791 "xenoprof",
4792 NULL);
4793
4794 --- a/fs/aio.c
4795 +++ b/fs/aio.c
4796 @@ -38,7 +38,7 @@
4797
4798 #ifdef CONFIG_EPOLL
4799 #include <linux/poll.h>
4800 -#include <linux/eventpoll.h>
4801 +#include <linux/anon_inodes.h>
4802 #endif
4803
4804 #if DEBUG > 1
4805 @@ -1308,7 +1308,7 @@
4806
4807 /* make_aio_fd:
4808 * Create a file descriptor that can be used to poll the event queue.
4809 - * Based and piggybacked on the excellent epoll code.
4810 + * Based on the excellent epoll code.
4811 */
4812
4813 static int make_aio_fd(struct kioctx *ioctx)
4814 @@ -1317,7 +1317,8 @@
4815 struct inode *inode;
4816 struct file *file;
4817
4818 - error = ep_getfd(&fd, &inode, &file, NULL, &aioq_fops);
4819 + error = anon_inode_getfd(&fd, &inode, &file, "[aioq]",
4820 + &aioq_fops, ioctx);
4821 if (error)
4822 return error;
4823
4824 --- a/include/asm-x86/mach-xen/asm/desc_32.h
4825 +++ b/include/asm-x86/mach-xen/asm/desc_32.h
4826 @@ -11,23 +11,24 @@
4827
4828 #include <asm/mmu.h>
4829
4830 -extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
4831 -
4832 struct Xgt_desc_struct {
4833 unsigned short size;
4834 unsigned long address __attribute__((packed));
4835 unsigned short pad;
4836 } __attribute__ ((packed));
4837
4838 -extern struct Xgt_desc_struct idt_descr;
4839 -DECLARE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
4840 -extern struct Xgt_desc_struct early_gdt_descr;
4841 +struct gdt_page
4842 +{
4843 + struct desc_struct gdt[GDT_ENTRIES];
4844 +} __attribute__((aligned(PAGE_SIZE)));
4845 +DECLARE_PER_CPU(struct gdt_page, gdt_page);
4846
4847 static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
4848 {
4849 - return (struct desc_struct *)per_cpu(cpu_gdt_descr, cpu).address;
4850 + return per_cpu(gdt_page, cpu).gdt;
4851 }
4852
4853 +extern struct Xgt_desc_struct idt_descr;
4854 extern struct desc_struct idt_table[];
4855 extern void set_intr_gate(unsigned int irq, void * addr);
4856
4857 @@ -55,53 +56,32 @@
4858 #define DESCTYPE_S 0x10 /* !system */
4859
4860 #ifndef CONFIG_XEN
4861 -#define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8))
4862 -
4863 -#define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
4864 -#define load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr))
4865 +#define load_TR_desc() native_load_tr_desc()
4866 +#define load_gdt(dtr) native_load_gdt(dtr)
4867 +#define load_idt(dtr) native_load_idt(dtr)
4868 #define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr))
4869 #define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt))
4870
4871 -#define store_gdt(dtr) __asm__ ("sgdt %0":"=m" (*dtr))
4872 -#define store_idt(dtr) __asm__ ("sidt %0":"=m" (*dtr))
4873 -#define store_tr(tr) __asm__ ("str %0":"=m" (tr))
4874 +#define store_gdt(dtr) native_store_gdt(dtr)
4875 +#define store_idt(dtr) native_store_idt(dtr)
4876 +#define store_tr(tr) (tr = native_store_tr())
4877 #define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt))
4878 -#endif
4879
4880 -#if TLS_SIZE != 24
4881 -# error update this code.
4882 -#endif
4883 -
4884 -static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
4885 -{
4886 -#define C(i) if (HYPERVISOR_update_descriptor(virt_to_machine(&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]), \
4887 - *(u64 *)&t->tls_array[i]) \
4888 - BUG()
4889 - C(0); C(1); C(2);
4890 -#undef C
4891 -}
4892 +#define load_TLS(t, cpu) native_load_tls(t, cpu)
4893 +#define set_ldt native_set_ldt
4894
4895 -#ifndef CONFIG_XEN
4896 #define write_ldt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
4897 #define write_gdt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
4898 #define write_idt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
4899
4900 -static inline void write_dt_entry(void *dt, int entry, __u32 entry_a, __u32 entry_b)
4901 +static inline void write_dt_entry(struct desc_struct *dt,
4902 + int entry, u32 entry_low, u32 entry_high)
4903 {
4904 - __u32 *lp = (__u32 *)((char *)dt + entry*8);
4905 - *lp = entry_a;
4906 - *(lp+1) = entry_b;
4907 + dt[entry].a = entry_low;
4908 + dt[entry].b = entry_high;
4909 }
4910 -#define set_ldt native_set_ldt
4911 -#else
4912 -extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b);
4913 -extern int write_gdt_entry(void *gdt, int entry, __u32 entry_a, __u32 entry_b);
4914 -#define set_ldt xen_set_ldt
4915 -#endif
4916
4917 -#ifndef CONFIG_XEN
4918 -static inline fastcall void native_set_ldt(const void *addr,
4919 - unsigned int entries)
4920 +static inline void native_set_ldt(const void *addr, unsigned int entries)
4921 {
4922 if (likely(entries == 0))
4923 __asm__ __volatile__("lldt %w0"::"q" (0));
4924 @@ -116,6 +96,65 @@
4925 __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
4926 }
4927 }
4928 +
4929 +
4930 +static inline void native_load_tr_desc(void)
4931 +{
4932 + asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
4933 +}
4934 +
4935 +static inline void native_load_gdt(const struct Xgt_desc_struct *dtr)
4936 +{
4937 + asm volatile("lgdt %0"::"m" (*dtr));
4938 +}
4939 +
4940 +static inline void native_load_idt(const struct Xgt_desc_struct *dtr)
4941 +{
4942 + asm volatile("lidt %0"::"m" (*dtr));
4943 +}
4944 +
4945 +static inline void native_store_gdt(struct Xgt_desc_struct *dtr)
4946 +{
4947 + asm ("sgdt %0":"=m" (*dtr));
4948 +}
4949 +
4950 +static inline void native_store_idt(struct Xgt_desc_struct *dtr)
4951 +{
4952 + asm ("sidt %0":"=m" (*dtr));
4953 +}
4954 +
4955 +static inline unsigned long native_store_tr(void)
4956 +{
4957 + unsigned long tr;
4958 + asm ("str %0":"=r" (tr));
4959 + return tr;
4960 +}
4961 +
4962 +static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
4963 +{
4964 + unsigned int i;
4965 + struct desc_struct *gdt = get_cpu_gdt_table(cpu);
4966 +
4967 + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
4968 + gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
4969 +}
4970 +#else
4971 +#define load_TLS(t, cpu) xen_load_tls(t, cpu)
4972 +#define set_ldt xen_set_ldt
4973 +
4974 +extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b);
4975 +extern int write_gdt_entry(void *gdt, int entry, __u32 entry_a, __u32 entry_b);
4976 +
4977 +static inline void xen_load_tls(struct thread_struct *t, unsigned int cpu)
4978 +{
4979 + unsigned int i;
4980 + struct desc_struct *gdt = get_cpu_gdt_table(cpu) + GDT_ENTRY_TLS_MIN;
4981 +
4982 + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
4983 + if (HYPERVISOR_update_descriptor(virt_to_machine(&gdt[i]),
4984 + *(u64 *)&t->tls_array[i]))
4985 + BUG();
4986 +}
4987 #endif
4988
4989 #ifndef CONFIG_X86_NO_IDT
4990 --- a/include/asm-x86/mach-xen/asm/desc_64.h
4991 +++ b/include/asm-x86/mach-xen/asm/desc_64.h
4992 @@ -127,16 +127,6 @@
4993 DESC_LDT, size * 8 - 1);
4994 }
4995
4996 -static inline void set_seg_base(unsigned cpu, int entry, void *base)
4997 -{
4998 - struct desc_struct *d = &cpu_gdt(cpu)[entry];
4999 - u32 addr = (u32)(u64)base;
5000 - BUG_ON((u64)base >> 32);
5001 - d->base0 = addr & 0xffff;
5002 - d->base1 = (addr >> 16) & 0xff;
5003 - d->base2 = (addr >> 24) & 0xff;
5004 -}
5005 -
5006 #define LDT_entry_a(info) \
5007 ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
5008 /* Don't allow setting of the lm bit. It is useless anyways because
5009 @@ -165,25 +155,15 @@
5010 (info)->useable == 0 && \
5011 (info)->lm == 0)
5012
5013 -#if TLS_SIZE != 24
5014 -# error update this code.
5015 -#endif
5016 -
5017 static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
5018 {
5019 -#if 0
5020 + unsigned int i;
5021 u64 *gdt = (u64 *)(cpu_gdt(cpu) + GDT_ENTRY_TLS_MIN);
5022 - gdt[0] = t->tls_array[0];
5023 - gdt[1] = t->tls_array[1];
5024 - gdt[2] = t->tls_array[2];
5025 -#endif
5026 -#define C(i) \
5027 - if (HYPERVISOR_update_descriptor(virt_to_machine(&cpu_gdt(cpu)[GDT_ENTRY_TLS_MIN + i]), \
5028 - t->tls_array[i])) \
5029 - BUG();
5030
5031 - C(0); C(1); C(2);
5032 -#undef C
5033 + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
5034 + if (HYPERVISOR_update_descriptor(virt_to_machine(&gdt[i]),
5035 + t->tls_array[i]))
5036 + BUG();
5037 }
5038
5039 /*
5040 --- a/include/asm-x86/mach-xen/asm/dma-mapping_64.h
5041 +++ b/include/asm-x86/mach-xen/asm/dma-mapping_64.h
5042 @@ -51,7 +51,7 @@
5043 };
5044
5045 extern dma_addr_t bad_dma_address;
5046 -extern struct dma_mapping_ops* dma_ops;
5047 +extern const struct dma_mapping_ops* dma_ops;
5048 extern int iommu_merge;
5049
5050 #if 0
5051 --- a/include/asm-x86/mach-xen/asm/fixmap_32.h
5052 +++ b/include/asm-x86/mach-xen/asm/fixmap_32.h
5053 @@ -19,10 +19,8 @@
5054 * the start of the fixmap.
5055 */
5056 extern unsigned long __FIXADDR_TOP;
5057 -#ifdef CONFIG_COMPAT_VDSO
5058 -#define FIXADDR_USER_START __fix_to_virt(FIX_VDSO)
5059 -#define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1)
5060 -#endif
5061 +#define FIXADDR_USER_START __fix_to_virt(FIX_VDSO)
5062 +#define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1)
5063
5064 #ifndef __ASSEMBLY__
5065 #include <linux/kernel.h>
5066 @@ -85,6 +83,9 @@
5067 #ifdef CONFIG_PCI_MMCONFIG
5068 FIX_PCIE_MCFG,
5069 #endif
5070 +#ifdef CONFIG_PARAVIRT
5071 + FIX_PARAVIRT_BOOTMAP,
5072 +#endif
5073 FIX_SHARED_INFO,
5074 #define NR_FIX_ISAMAPS 256
5075 FIX_ISAMAP_END,
5076 --- a/include/asm-x86/mach-xen/asm/fixmap_64.h
5077 +++ b/include/asm-x86/mach-xen/asm/fixmap_64.h
5078 @@ -15,7 +15,6 @@
5079 #include <asm/apicdef.h>
5080 #include <asm/page.h>
5081 #include <asm/vsyscall.h>
5082 -#include <asm/vsyscall32.h>
5083 #include <asm/acpi.h>
5084
5085 /*
5086 --- a/include/asm-x86/mach-xen/asm/highmem.h
5087 +++ b/include/asm-x86/mach-xen/asm/highmem.h
5088 @@ -67,12 +67,18 @@
5089
5090 void *kmap(struct page *page);
5091 void kunmap(struct page *page);
5092 +void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot);
5093 void *kmap_atomic(struct page *page, enum km_type type);
5094 void *kmap_atomic_pte(struct page *page, enum km_type type);
5095 void kunmap_atomic(void *kvaddr, enum km_type type);
5096 void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
5097 struct page *kmap_atomic_to_page(void *ptr);
5098
5099 +#define kmap_atomic_pte(page, type) \
5100 + kmap_atomic_prot(page, type, \
5101 + test_bit(PG_pinned, &(page)->flags) \
5102 + ? PAGE_KERNEL_RO : kmap_prot)
5103 +
5104 #define flush_cache_kmaps() do { } while (0)
5105
5106 #endif /* __KERNEL__ */
5107 --- a/include/asm-x86/mach-xen/asm/io_32.h
5108 +++ b/include/asm-x86/mach-xen/asm/io_32.h
5109 @@ -263,15 +263,18 @@
5110
5111 #endif /* __KERNEL__ */
5112
5113 -#define __SLOW_DOWN_IO "outb %%al,$0x80;"
5114 +static inline void xen_io_delay(void)
5115 +{
5116 + asm volatile("outb %%al,$0x80" : : : "memory");
5117 +}
5118
5119 static inline void slow_down_io(void) {
5120 - __asm__ __volatile__(
5121 - __SLOW_DOWN_IO
5122 + xen_io_delay();
5123 #ifdef REALLY_SLOW_IO
5124 - __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO
5125 + xen_io_delay();
5126 + xen_io_delay();
5127 + xen_io_delay();
5128 #endif
5129 - : : );
5130 }
5131
5132 #ifdef CONFIG_X86_NUMAQ
5133 --- a/include/asm-x86/mach-xen/asm/irqflags_32.h
5134 +++ b/include/asm-x86/mach-xen/asm/irqflags_32.h
5135 @@ -11,6 +11,43 @@
5136 #define _ASM_IRQFLAGS_H
5137
5138 #ifndef __ASSEMBLY__
5139 +#define xen_save_fl(void) (current_vcpu_info()->evtchn_upcall_mask)
5140 +
5141 +#define xen_restore_fl(f) \
5142 +do { \
5143 + vcpu_info_t *_vcpu; \
5144 + barrier(); \
5145 + _vcpu = current_vcpu_info(); \
5146 + if ((_vcpu->evtchn_upcall_mask = (f)) == 0) { \
5147 + barrier(); /* unmask then check (avoid races) */\
5148 + if (unlikely(_vcpu->evtchn_upcall_pending)) \
5149 + force_evtchn_callback(); \
5150 + } \
5151 +} while (0)
5152 +
5153 +#define xen_irq_disable() \
5154 +do { \
5155 + current_vcpu_info()->evtchn_upcall_mask = 1; \
5156 + barrier(); \
5157 +} while (0)
5158 +
5159 +#define xen_irq_enable() \
5160 +do { \
5161 + vcpu_info_t *_vcpu; \
5162 + barrier(); \
5163 + _vcpu = current_vcpu_info(); \
5164 + _vcpu->evtchn_upcall_mask = 0; \
5165 + barrier(); /* unmask then check (avoid races) */ \
5166 + if (unlikely(_vcpu->evtchn_upcall_pending)) \
5167 + force_evtchn_callback(); \
5168 +} while (0)
5169 +
5170 +void xen_safe_halt(void);
5171 +
5172 +void xen_halt(void);
5173 +#endif /* __ASSEMBLY__ */
5174 +
5175 +#ifndef __ASSEMBLY__
5176
5177 /*
5178 * The use of 'barrier' in the following reflects their use as local-lock
5179 @@ -20,48 +57,31 @@
5180 * includes these barriers, for example.
5181 */
5182
5183 -#define __raw_local_save_flags() (current_vcpu_info()->evtchn_upcall_mask)
5184 +#define __raw_local_save_flags(void) xen_save_fl()
5185
5186 -#define raw_local_irq_restore(x) \
5187 -do { \
5188 - vcpu_info_t *_vcpu; \
5189 - barrier(); \
5190 - _vcpu = current_vcpu_info(); \
5191 - if ((_vcpu->evtchn_upcall_mask = (x)) == 0) { \
5192 - barrier(); /* unmask then check (avoid races) */ \
5193 - if (unlikely(_vcpu->evtchn_upcall_pending)) \
5194 - force_evtchn_callback(); \
5195 - } \
5196 -} while (0)
5197 +#define raw_local_irq_restore(flags) xen_restore_fl(flags)
5198
5199 -#define raw_local_irq_disable() \
5200 -do { \
5201 - current_vcpu_info()->evtchn_upcall_mask = 1; \
5202 - barrier(); \
5203 -} while (0)
5204 +#define raw_local_irq_disable() xen_irq_disable()
5205
5206 -#define raw_local_irq_enable() \
5207 -do { \
5208 - vcpu_info_t *_vcpu; \
5209 - barrier(); \
5210 - _vcpu = current_vcpu_info(); \
5211 - _vcpu->evtchn_upcall_mask = 0; \
5212 - barrier(); /* unmask then check (avoid races) */ \
5213 - if (unlikely(_vcpu->evtchn_upcall_pending)) \
5214 - force_evtchn_callback(); \
5215 -} while (0)
5216 +#define raw_local_irq_enable() xen_irq_enable()
5217
5218 /*
5219 * Used in the idle loop; sti takes one instruction cycle
5220 * to complete:
5221 */
5222 -void raw_safe_halt(void);
5223 +static inline void raw_safe_halt(void)
5224 +{
5225 + xen_safe_halt();
5226 +}
5227
5228 /*
5229 * Used when interrupts are already enabled or to
5230 * shutdown the processor:
5231 */
5232 -void halt(void);
5233 +static inline void halt(void)
5234 +{
5235 + xen_halt();
5236 +}
5237
5238 /*
5239 * For spinlocks, etc:
5240 --- a/include/asm-x86/mach-xen/asm/irqflags_64.h
5241 +++ b/include/asm-x86/mach-xen/asm/irqflags_64.h
5242 @@ -9,6 +9,7 @@
5243 */
5244 #ifndef _ASM_IRQFLAGS_H
5245 #define _ASM_IRQFLAGS_H
5246 +#include <asm/processor-flags.h>
5247
5248 #ifndef __ASSEMBLY__
5249 /*
5250 @@ -50,19 +51,19 @@
5251 {
5252 unsigned long flags = __raw_local_save_flags();
5253
5254 - raw_local_irq_restore((flags & ~(1 << 9)) | (1 << 18));
5255 + raw_local_irq_restore((flags & ~X86_EFLAGS_IF) | X86_EFLAGS_AC);
5256 }
5257
5258 static inline void raw_local_irq_enable(void)
5259 {
5260 unsigned long flags = __raw_local_save_flags();
5261
5262 - raw_local_irq_restore((flags | (1 << 9)) & ~(1 << 18));
5263 + raw_local_irq_restore((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC));
5264 }
5265
5266 static inline int raw_irqs_disabled_flags(unsigned long flags)
5267 {
5268 - return !(flags & (1<<9)) || (flags & (1 << 18));
5269 + return !(flags & X86_EFLAGS_IF) || (flags & X86_EFLAGS_AC);
5270 }
5271
5272 #else /* CONFIG_X86_VSMP */
5273 @@ -118,13 +119,21 @@
5274 * Used in the idle loop; sti takes one instruction cycle
5275 * to complete:
5276 */
5277 -void raw_safe_halt(void);
5278 +void xen_safe_halt(void);
5279 +static inline void raw_safe_halt(void)
5280 +{
5281 + xen_safe_halt();
5282 +}
5283
5284 /*
5285 * Used when interrupts are already enabled or to
5286 * shutdown the processor:
5287 */
5288 -void halt(void);
5289 +void xen_halt(void);
5290 +static inline void halt(void)
5291 +{
5292 + xen_halt();
5293 +}
5294
5295 #else /* __ASSEMBLY__: */
5296 # ifdef CONFIG_TRACE_IRQFLAGS
5297 --- a/include/asm-x86/mach-xen/asm/mmu.h
5298 +++ b/include/asm-x86/mach-xen/asm/mmu.h
5299 @@ -18,12 +18,4 @@
5300 #endif
5301 } mm_context_t;
5302
5303 -/* mm/memory.c:exit_mmap hook */
5304 -extern void _arch_exit_mmap(struct mm_struct *mm);
5305 -#define arch_exit_mmap(_mm) _arch_exit_mmap(_mm)
5306 -
5307 -/* kernel/fork.c:dup_mmap hook */
5308 -extern void _arch_dup_mmap(struct mm_struct *mm);
5309 -#define arch_dup_mmap(mm, oldmm) ((void)(oldmm), _arch_dup_mmap(mm))
5310 -
5311 #endif
5312 --- a/include/asm-x86/mach-xen/asm/mmu_64.h
5313 +++ b/include/asm-x86/mach-xen/asm/mmu_64.h
5314 @@ -25,14 +25,6 @@
5315 #ifdef CONFIG_XEN
5316 extern struct list_head mm_unpinned;
5317 extern spinlock_t mm_unpinned_lock;
5318 -
5319 -/* mm/memory.c:exit_mmap hook */
5320 -extern void _arch_exit_mmap(struct mm_struct *mm);
5321 -#define arch_exit_mmap(_mm) _arch_exit_mmap(_mm)
5322 -
5323 -/* kernel/fork.c:dup_mmap hook */
5324 -extern void _arch_dup_mmap(struct mm_struct *mm);
5325 -#define arch_dup_mmap(mm, oldmm) ((void)(oldmm), _arch_dup_mmap(mm))
5326 #endif
5327
5328 #endif
5329 --- a/include/asm-x86/mach-xen/asm/mmu_context_32.h
5330 +++ b/include/asm-x86/mach-xen/asm/mmu_context_32.h
5331 @@ -6,6 +6,20 @@
5332 #include <asm/pgalloc.h>
5333 #include <asm/tlbflush.h>
5334
5335 +void arch_exit_mmap(struct mm_struct *mm);
5336 +void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
5337 +
5338 +void mm_pin(struct mm_struct *mm);
5339 +void mm_unpin(struct mm_struct *mm);
5340 +void mm_pin_all(void);
5341 +
5342 +static inline void xen_activate_mm(struct mm_struct *prev,
5343 + struct mm_struct *next)
5344 +{
5345 + if (!test_bit(PG_pinned, &virt_to_page(next->pgd)->flags))
5346 + mm_pin(next);
5347 +}
5348 +
5349 /*
5350 * Used for LDT copy/destruction.
5351 */
5352 @@ -37,10 +51,6 @@
5353 : : "r" (0) );
5354 }
5355
5356 -extern void mm_pin(struct mm_struct *mm);
5357 -extern void mm_unpin(struct mm_struct *mm);
5358 -void mm_pin_all(void);
5359 -
5360 static inline void switch_mm(struct mm_struct *prev,
5361 struct mm_struct *next,
5362 struct task_struct *tsk)
5363 @@ -97,11 +107,10 @@
5364 #define deactivate_mm(tsk, mm) \
5365 asm("movl %0,%%gs": :"r" (0));
5366
5367 -static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
5368 -{
5369 - if (!test_bit(PG_pinned, &virt_to_page(next->pgd)->flags))
5370 - mm_pin(next);
5371 - switch_mm(prev, next, NULL);
5372 -}
5373 +#define activate_mm(prev, next) \
5374 + do { \
5375 + xen_activate_mm(prev, next); \
5376 + switch_mm((prev),(next),NULL); \
5377 + } while(0)
5378
5379 #endif
5380 --- a/include/asm-x86/mach-xen/asm/mmu_context_64.h
5381 +++ b/include/asm-x86/mach-xen/asm/mmu_context_64.h
5382 @@ -9,6 +9,9 @@
5383 #include <asm/pgtable.h>
5384 #include <asm/tlbflush.h>
5385
5386 +void arch_exit_mmap(struct mm_struct *mm);
5387 +void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
5388 +
5389 /*
5390 * possibly do the LDT unload here?
5391 */
5392 --- a/include/asm-x86/mach-xen/asm/page_64.h
5393 +++ b/include/asm-x86/mach-xen/asm/page_64.h
5394 @@ -7,6 +7,7 @@
5395 #include <linux/types.h>
5396 #include <asm/bug.h>
5397 #endif
5398 +#include <linux/const.h>
5399 #include <xen/interface/xen.h>
5400
5401 /*
5402 @@ -19,18 +20,14 @@
5403
5404 /* PAGE_SHIFT determines the page size */
5405 #define PAGE_SHIFT 12
5406 -#ifdef __ASSEMBLY__
5407 -#define PAGE_SIZE (0x1 << PAGE_SHIFT)
5408 -#else
5409 -#define PAGE_SIZE (1UL << PAGE_SHIFT)
5410 -#endif
5411 +#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
5412 #define PAGE_MASK (~(PAGE_SIZE-1))
5413
5414 /* See Documentation/x86_64/mm.txt for a description of the memory map. */
5415 #define __PHYSICAL_MASK_SHIFT 46
5416 -#define __PHYSICAL_MASK ((1UL << __PHYSICAL_MASK_SHIFT) - 1)
5417 +#define __PHYSICAL_MASK ((_AC(1,UL) << __PHYSICAL_MASK_SHIFT) - 1)
5418 #define __VIRTUAL_MASK_SHIFT 48
5419 -#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
5420 +#define __VIRTUAL_MASK ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - 1)
5421
5422 #define PHYSICAL_PAGE_MASK (~(PAGE_SIZE-1) & __PHYSICAL_MASK)
5423
5424 @@ -55,10 +52,10 @@
5425 #define N_EXCEPTION_STACKS 5 /* hw limit: 7 */
5426
5427 #define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1))
5428 -#define LARGE_PAGE_SIZE (1UL << PMD_SHIFT)
5429 +#define LARGE_PAGE_SIZE (_AC(1,UL) << PMD_SHIFT)
5430
5431 #define HPAGE_SHIFT PMD_SHIFT
5432 -#define HPAGE_SIZE ((1UL) << HPAGE_SHIFT)
5433 +#define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT)
5434 #define HPAGE_MASK (~(HPAGE_SIZE - 1))
5435 #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
5436
5437 @@ -152,17 +149,23 @@
5438
5439 #define __pgprot(x) ((pgprot_t) { (x) } )
5440
5441 -#define __PHYSICAL_START ((unsigned long)CONFIG_PHYSICAL_START)
5442 -#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
5443 -#define __START_KERNEL_map 0xffffffff80000000UL
5444 -#define __PAGE_OFFSET 0xffff880000000000UL
5445 +#endif /* !__ASSEMBLY__ */
5446
5447 -#else
5448 #define __PHYSICAL_START CONFIG_PHYSICAL_START
5449 +#define __KERNEL_ALIGN 0x200000
5450 +
5451 +/*
5452 + * Make sure kernel is aligned to 2MB address. Catching it at compile
5453 + * time is better. Change your config file and compile the kernel
5454 + * for a 2MB aligned address (CONFIG_PHYSICAL_START)
5455 + */
5456 +#if (CONFIG_PHYSICAL_START % __KERNEL_ALIGN) != 0
5457 +#error "CONFIG_PHYSICAL_START must be a multiple of 2MB"
5458 +#endif
5459 +
5460 #define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
5461 -#define __START_KERNEL_map 0xffffffff80000000
5462 -#define __PAGE_OFFSET 0xffff880000000000
5463 -#endif /* !__ASSEMBLY__ */
5464 +#define __START_KERNEL_map _AC(0xffffffff80000000, UL)
5465 +#define __PAGE_OFFSET _AC(0xffff880000000000, UL)
5466
5467 #if CONFIG_XEN_COMPAT <= 0x030002
5468 #undef LOAD_OFFSET
5469 @@ -172,20 +175,20 @@
5470 /* to align the pointer to the (next) page boundary */
5471 #define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
5472
5473 -#define KERNEL_TEXT_SIZE (40UL*1024*1024)
5474 -#define KERNEL_TEXT_START 0xffffffff80000000UL
5475 +#define KERNEL_TEXT_SIZE (40*1024*1024)
5476 +#define KERNEL_TEXT_START _AC(0xffffffff80000000, UL)
5477 +
5478 +#define PAGE_OFFSET __PAGE_OFFSET
5479
5480 -#define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET)
5481 +#ifndef __ASSEMBLY__
5482 +static inline unsigned long __phys_addr(unsigned long x)
5483 +{
5484 + return x - (x >= __START_KERNEL_map ? __START_KERNEL_map : PAGE_OFFSET);
5485 +}
5486 +#endif
5487
5488 -/* Note: __pa(&symbol_visible_to_c) should be always replaced with __pa_symbol.
5489 - Otherwise you risk miscompilation. */
5490 -#define __pa(x) (((unsigned long)(x)>=__START_KERNEL_map)?(unsigned long)(x) - (unsigned long)__START_KERNEL_map:(unsigned long)(x) - PAGE_OFFSET)
5491 -/* __pa_symbol should be used for C visible symbols.
5492 - This seems to be the official gcc blessed way to do such arithmetic. */
5493 -#define __pa_symbol(x) \
5494 - ({unsigned long v; \
5495 - asm("" : "=r" (v) : "0" (x)); \
5496 - __pa(v); })
5497 +#define __pa(x) __phys_addr((unsigned long)(x))
5498 +#define __pa_symbol(x) __phys_addr((unsigned long)(x))
5499
5500 #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
5501 #define __boot_va(x) __va(x)
5502 --- a/include/asm-x86/mach-xen/asm/pgalloc_32.h
5503 +++ b/include/asm-x86/mach-xen/asm/pgalloc_32.h
5504 @@ -1,7 +1,6 @@
5505 #ifndef _I386_PGALLOC_H
5506 #define _I386_PGALLOC_H
5507
5508 -#include <asm/fixmap.h>
5509 #include <linux/threads.h>
5510 #include <linux/mm.h> /* for struct page */
5511 #include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
5512 @@ -69,6 +68,4 @@
5513 #define pud_populate(mm, pmd, pte) BUG()
5514 #endif
5515
5516 -#define check_pgt_cache() do { } while (0)
5517 -
5518 #endif /* _I386_PGALLOC_H */
5519 --- a/include/asm-x86/mach-xen/asm/pgalloc_64.h
5520 +++ b/include/asm-x86/mach-xen/asm/pgalloc_64.h
5521 @@ -1,7 +1,6 @@
5522 #ifndef _X86_64_PGALLOC_H
5523 #define _X86_64_PGALLOC_H
5524
5525 -#include <asm/fixmap.h>
5526 #include <asm/pda.h>
5527 #include <linux/threads.h>
5528 #include <linux/mm.h>
5529 @@ -100,24 +99,16 @@
5530 struct page *page = virt_to_page(pgd);
5531
5532 spin_lock(&pgd_lock);
5533 - page->index = (pgoff_t)pgd_list;
5534 - if (pgd_list)
5535 - pgd_list->private = (unsigned long)&page->index;
5536 - pgd_list = page;
5537 - page->private = (unsigned long)&pgd_list;
5538 + list_add(&page->lru, &pgd_list);
5539 spin_unlock(&pgd_lock);
5540 }
5541
5542 static inline void pgd_list_del(pgd_t *pgd)
5543 {
5544 - struct page *next, **pprev, *page = virt_to_page(pgd);
5545 + struct page *page = virt_to_page(pgd);
5546
5547 spin_lock(&pgd_lock);
5548 - next = (struct page *)page->index;
5549 - pprev = (struct page **)page->private;
5550 - *pprev = next;
5551 - if (next)
5552 - next->private = (unsigned long)pprev;
5553 + list_del(&page->lru);
5554 spin_unlock(&pgd_lock);
5555 }
5556
5557 --- a/include/asm-x86/mach-xen/asm/pgtable-2level.h
5558 +++ b/include/asm-x86/mach-xen/asm/pgtable-2level.h
5559 @@ -13,22 +13,43 @@
5560 * within a page table are directly modified. Thus, the following
5561 * hook is made available.
5562 */
5563 -#define set_pte(pteptr, pteval) (*(pteptr) = pteval)
5564 -
5565 -#define set_pte_at(_mm,addr,ptep,pteval) do { \
5566 - if (((_mm) != current->mm && (_mm) != &init_mm) || \
5567 - HYPERVISOR_update_va_mapping((addr), (pteval), 0)) \
5568 - set_pte((ptep), (pteval)); \
5569 -} while (0)
5570 -
5571 -#define set_pmd(pmdptr, pmdval) xen_l2_entry_update((pmdptr), (pmdval))
5572 +static inline void xen_set_pte(pte_t *ptep , pte_t pte)
5573 +{
5574 + *ptep = pte;
5575 +}
5576 +static inline void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
5577 + pte_t *ptep , pte_t pte)
5578 +{
5579 + if ((mm != current->mm && mm != &init_mm) ||
5580 + HYPERVISOR_update_va_mapping(addr, pte, 0))
5581 + xen_set_pte(ptep, pte);
5582 +}
5583 +static inline void xen_set_pmd(pmd_t *pmdp, pmd_t pmd)
5584 +{
5585 + xen_l2_entry_update(pmdp, pmd);
5586 +}
5587 +#define set_pte(pteptr, pteval) xen_set_pte(pteptr, pteval)
5588 +#define set_pte_at(mm,addr,ptep,pteval) xen_set_pte_at(mm, addr, ptep, pteval)
5589 +#define set_pmd(pmdptr, pmdval) xen_set_pmd(pmdptr, pmdval)
5590
5591 #define set_pte_atomic(pteptr, pteval) set_pte(pteptr,pteval)
5592
5593 #define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
5594 #define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
5595
5596 -#define raw_ptep_get_and_clear(xp, pte) __pte_ma(xchg(&(xp)->pte_low, 0))
5597 +static inline void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *xp)
5598 +{
5599 + xen_set_pte_at(mm, addr, xp, __pte(0));
5600 +}
5601 +
5602 +#ifdef CONFIG_SMP
5603 +static inline pte_t xen_ptep_get_and_clear(pte_t *xp, pte_t res)
5604 +{
5605 + return __pte_ma(xchg(&xp->pte_low, 0));
5606 +}
5607 +#else
5608 +#define xen_ptep_get_and_clear(xp, res) xen_local_ptep_get_and_clear(xp, res)
5609 +#endif
5610
5611 #define __HAVE_ARCH_PTEP_CLEAR_FLUSH
5612 #define ptep_clear_flush(vma, addr, ptep) \
5613 @@ -95,6 +116,4 @@
5614 #define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low })
5615 #define __swp_entry_to_pte(x) ((pte_t) { (x).val })
5616
5617 -void vmalloc_sync_all(void);
5618 -
5619 #endif /* _I386_PGTABLE_2LEVEL_H */
5620 --- a/include/asm-x86/mach-xen/asm/pgtable-3level-defs.h
5621 +++ b/include/asm-x86/mach-xen/asm/pgtable-3level-defs.h
5622 @@ -1,7 +1,7 @@
5623 #ifndef _I386_PGTABLE_3LEVEL_DEFS_H
5624 #define _I386_PGTABLE_3LEVEL_DEFS_H
5625
5626 -#define HAVE_SHARED_KERNEL_PMD 0
5627 +#define SHARED_KERNEL_PMD 0
5628
5629 /*
5630 * PGDIR_SHIFT determines what a top-level page table entry can map
5631 --- a/include/asm-x86/mach-xen/asm/pgtable-3level.h
5632 +++ b/include/asm-x86/mach-xen/asm/pgtable-3level.h
5633 @@ -52,32 +52,40 @@
5634 * value and then use set_pte to update it. -ben
5635 */
5636
5637 -static inline void set_pte(pte_t *ptep, pte_t pte)
5638 +static inline void xen_set_pte(pte_t *ptep, pte_t pte)
5639 {
5640 ptep->pte_high = pte.pte_high;
5641 smp_wmb();
5642 ptep->pte_low = pte.pte_low;
5643 }
5644 -#define set_pte_atomic(pteptr,pteval) \
5645 - set_64bit((unsigned long long *)(pteptr),__pte_val(pteval))
5646
5647 -#define set_pte_at(_mm,addr,ptep,pteval) do { \
5648 - if (((_mm) != current->mm && (_mm) != &init_mm) || \
5649 - HYPERVISOR_update_va_mapping((addr), (pteval), 0)) \
5650 - set_pte((ptep), (pteval)); \
5651 -} while (0)
5652 -
5653 -#define set_pmd(pmdptr,pmdval) \
5654 - xen_l2_entry_update((pmdptr), (pmdval))
5655 -#define set_pud(pudptr,pudval) \
5656 - xen_l3_entry_update((pudptr), (pudval))
5657 +static inline void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
5658 + pte_t *ptep , pte_t pte)
5659 +{
5660 + if ((mm != current->mm && mm != &init_mm) ||
5661 + HYPERVISOR_update_va_mapping(addr, pte, 0))
5662 + xen_set_pte(ptep, pte);
5663 +}
5664 +
5665 +static inline void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
5666 +{
5667 + set_64bit((unsigned long long *)(ptep),__pte_val(pte));
5668 +}
5669 +static inline void xen_set_pmd(pmd_t *pmdp, pmd_t pmd)
5670 +{
5671 + xen_l2_entry_update(pmdp, pmd);
5672 +}
5673 +static inline void xen_set_pud(pud_t *pudp, pud_t pud)
5674 +{
5675 + xen_l3_entry_update(pudp, pud);
5676 +}
5677
5678 /*
5679 * For PTEs and PDEs, we must clear the P-bit first when clearing a page table
5680 * entry, so clear the bottom half first and enforce ordering with a compiler
5681 * barrier.
5682 */
5683 -static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
5684 +static inline void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
5685 {
5686 if ((mm != current->mm && mm != &init_mm)
5687 || HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) {
5688 @@ -87,7 +95,18 @@
5689 }
5690 }
5691
5692 -#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
5693 +static inline void xen_pmd_clear(pmd_t *pmd)
5694 +{
5695 + xen_l2_entry_update(pmd, __pmd(0));
5696 +}
5697 +
5698 +#define set_pte(ptep, pte) xen_set_pte(ptep, pte)
5699 +#define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte)
5700 +#define set_pte_atomic(ptep, pte) xen_set_pte_atomic(ptep, pte)
5701 +#define set_pmd(pmdp, pmd) xen_set_pmd(pmdp, pmd)
5702 +#define set_pud(pudp, pud) xen_set_pud(pudp, pud)
5703 +#define pte_clear(mm, addr, ptep) xen_pte_clear(mm, addr, ptep)
5704 +#define pmd_clear(pmd) xen_pmd_clear(pmd)
5705
5706 /*
5707 * Pentium-II erratum A13: in PAE mode we explicitly have to flush
5708 @@ -108,7 +127,8 @@
5709 #define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \
5710 pmd_index(address))
5711
5712 -static inline pte_t raw_ptep_get_and_clear(pte_t *ptep, pte_t res)
5713 +#ifdef CONFIG_SMP
5714 +static inline pte_t xen_ptep_get_and_clear(pte_t *ptep, pte_t res)
5715 {
5716 uint64_t val = __pte_val(res);
5717 if (__cmpxchg64(ptep, val, 0) != val) {
5718 @@ -119,6 +139,9 @@
5719 }
5720 return res;
5721 }
5722 +#else
5723 +#define xen_ptep_get_and_clear(xp, pte) xen_local_ptep_get_and_clear(xp, pte)
5724 +#endif
5725
5726 #define __HAVE_ARCH_PTEP_CLEAR_FLUSH
5727 #define ptep_clear_flush(vma, addr, ptep) \
5728 @@ -165,13 +188,13 @@
5729 static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
5730 {
5731 return __pte((((unsigned long long)page_nr << PAGE_SHIFT) |
5732 - pgprot_val(pgprot)) & __supported_pte_mask);
5733 + pgprot_val(pgprot)) & __supported_pte_mask);
5734 }
5735
5736 static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
5737 {
5738 return __pmd((((unsigned long long)page_nr << PAGE_SHIFT) |
5739 - pgprot_val(pgprot)) & __supported_pte_mask);
5740 + pgprot_val(pgprot)) & __supported_pte_mask);
5741 }
5742
5743 /*
5744 @@ -191,6 +214,4 @@
5745
5746 #define __pmd_free_tlb(tlb, x) do { } while (0)
5747
5748 -void vmalloc_sync_all(void);
5749 -
5750 #endif /* _I386_PGTABLE_3LEVEL_H */
5751 --- a/include/asm-x86/mach-xen/asm/pgtable_32.h
5752 +++ b/include/asm-x86/mach-xen/asm/pgtable_32.h
5753 @@ -24,11 +24,11 @@
5754 #include <linux/slab.h>
5755 #include <linux/list.h>
5756 #include <linux/spinlock.h>
5757 +#include <linux/sched.h>
5758
5759 /* Is this pagetable pinned? */
5760 #define PG_pinned PG_arch_1
5761
5762 -struct mm_struct;
5763 struct vm_area_struct;
5764
5765 /*
5766 @@ -38,17 +38,16 @@
5767 #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
5768 extern unsigned long empty_zero_page[1024];
5769 extern pgd_t *swapper_pg_dir;
5770 -extern struct kmem_cache *pgd_cache;
5771 extern struct kmem_cache *pmd_cache;
5772 extern spinlock_t pgd_lock;
5773 extern struct page *pgd_list;
5774 +void check_pgt_cache(void);
5775
5776 void pmd_ctor(void *, struct kmem_cache *, unsigned long);
5777 -void pgd_ctor(void *, struct kmem_cache *, unsigned long);
5778 -void pgd_dtor(void *, struct kmem_cache *, unsigned long);
5779 void pgtable_cache_init(void);
5780 void paging_init(void);
5781
5782 +
5783 /*
5784 * The Linux x86 paging architecture is 'compile-time dual-mode', it
5785 * implements both the traditional 2-level x86 page tables and the
5786 @@ -165,6 +164,7 @@
5787
5788 extern unsigned long long __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
5789 #define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
5790 +#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
5791 #define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD)
5792 #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
5793 #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
5794 @@ -172,6 +172,7 @@
5795 #define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
5796 #define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)
5797 #define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC)
5798 +#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX)
5799 #define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE)
5800 #define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE)
5801 #define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
5802 @@ -275,7 +276,13 @@
5803 */
5804 #define pte_update(mm, addr, ptep) do { } while (0)
5805 #define pte_update_defer(mm, addr, ptep) do { } while (0)
5806 -#define paravirt_map_pt_hook(slot, va, pfn) do { } while (0)
5807 +
5808 +/* local pte updates need not use xchg for locking */
5809 +static inline pte_t xen_local_ptep_get_and_clear(pte_t *ptep, pte_t res)
5810 +{
5811 + xen_set_pte(ptep, __pte(0));
5812 + return res;
5813 +}
5814
5815 /*
5816 * We only update the dirty/accessed state if we set
5817 @@ -286,17 +293,34 @@
5818 */
5819 #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
5820 #define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
5821 -do { \
5822 - if (dirty) \
5823 +({ \
5824 + int __changed = !pte_same(*(ptep), entry); \
5825 + if (__changed && (dirty)) \
5826 ptep_establish(vma, address, ptep, entry); \
5827 -} while (0)
5828 + __changed; \
5829 +})
5830
5831 -/*
5832 - * We don't actually have these, but we want to advertise them so that
5833 - * we can encompass the flush here.
5834 - */
5835 #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
5836 +#define ptep_test_and_clear_dirty(vma, addr, ptep) ({ \
5837 + int __ret = 0; \
5838 + if (pte_dirty(*(ptep))) \
5839 + __ret = test_and_clear_bit(_PAGE_BIT_DIRTY, \
5840 + &(ptep)->pte_low); \
5841 + if (__ret) \
5842 + pte_update((vma)->vm_mm, addr, ptep); \
5843 + __ret; \
5844 +})
5845 +
5846 #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
5847 +#define ptep_test_and_clear_young(vma, addr, ptep) ({ \
5848 + int __ret = 0; \
5849 + if (pte_young(*(ptep))) \
5850 + __ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, \
5851 + &(ptep)->pte_low); \
5852 + if (__ret) \
5853 + pte_update((vma)->vm_mm, addr, ptep); \
5854 + __ret; \
5855 +})
5856
5857 /*
5858 * Rules for using ptep_establish: the pte MUST be a user pte, and
5859 @@ -323,7 +347,7 @@
5860 int __dirty = pte_dirty(__pte); \
5861 __pte = pte_mkclean(__pte); \
5862 if (test_bit(PG_pinned, &virt_to_page((vma)->vm_mm->pgd)->flags)) \
5863 - ptep_set_access_flags(vma, address, ptep, __pte, __dirty); \
5864 + (void)ptep_set_access_flags(vma, address, ptep, __pte, __dirty); \
5865 else if (__dirty) \
5866 (ptep)->pte_low = __pte.pte_low; \
5867 __dirty; \
5868 @@ -336,7 +360,7 @@
5869 int __young = pte_young(__pte); \
5870 __pte = pte_mkold(__pte); \
5871 if (test_bit(PG_pinned, &virt_to_page((vma)->vm_mm->pgd)->flags)) \
5872 - ptep_set_access_flags(vma, address, ptep, __pte, __young); \
5873 + (void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \
5874 else if (__young) \
5875 (ptep)->pte_low = __pte.pte_low; \
5876 __young; \
5877 @@ -349,7 +373,7 @@
5878 if (!pte_none(pte)
5879 && (mm != &init_mm
5880 || HYPERVISOR_update_va_mapping(addr, __pte(0), 0))) {
5881 - pte = raw_ptep_get_and_clear(ptep, pte);
5882 + pte = xen_ptep_get_and_clear(ptep, pte);
5883 pte_update(mm, addr, ptep);
5884 }
5885 return pte;
5886 @@ -491,24 +515,10 @@
5887 #endif
5888
5889 #if defined(CONFIG_HIGHPTE)
5890 -#define pte_offset_map(dir, address) \
5891 -({ \
5892 - pte_t *__ptep; \
5893 - unsigned pfn = pmd_val(*(dir)) >> PAGE_SHIFT; \
5894 - __ptep = (pte_t *)kmap_atomic_pte(pfn_to_page(pfn),KM_PTE0); \
5895 - paravirt_map_pt_hook(KM_PTE0,__ptep, pfn); \
5896 - __ptep = __ptep + pte_index(address); \
5897 - __ptep; \
5898 -})
5899 -#define pte_offset_map_nested(dir, address) \
5900 -({ \
5901 - pte_t *__ptep; \
5902 - unsigned pfn = pmd_val(*(dir)) >> PAGE_SHIFT; \
5903 - __ptep = (pte_t *)kmap_atomic_pte(pfn_to_page(pfn),KM_PTE1); \
5904 - paravirt_map_pt_hook(KM_PTE1,__ptep, pfn); \
5905 - __ptep = __ptep + pte_index(address); \
5906 - __ptep; \
5907 -})
5908 +#define pte_offset_map(dir, address) \
5909 + ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + pte_index(address))
5910 +#define pte_offset_map_nested(dir, address) \
5911 + ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE1) + pte_index(address))
5912 #define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0)
5913 #define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1)
5914 #else
5915 @@ -587,10 +597,6 @@
5916 #define io_remap_pfn_range(vma,from,pfn,size,prot) \
5917 direct_remap_pfn_range(vma,from,pfn,size,prot,DOMID_IO)
5918
5919 -#define MK_IOSPACE_PFN(space, pfn) (pfn)
5920 -#define GET_IOSPACE(pfn) 0
5921 -#define GET_PFN(pfn) (pfn)
5922 -
5923 #include <asm-generic/pgtable.h>
5924
5925 #endif /* _I386_PGTABLE_H */
5926 --- a/include/asm-x86/mach-xen/asm/pgtable_64.h
5927 +++ b/include/asm-x86/mach-xen/asm/pgtable_64.h
5928 @@ -1,12 +1,14 @@
5929 #ifndef _X86_64_PGTABLE_H
5930 #define _X86_64_PGTABLE_H
5931
5932 +#include <linux/const.h>
5933 +#ifndef __ASSEMBLY__
5934 +
5935 /*
5936 * This file contains the functions and defines necessary to modify and use
5937 * the x86-64 page table tree.
5938 */
5939 #include <asm/processor.h>
5940 -#include <asm/fixmap.h>
5941 #include <asm/bitops.h>
5942 #include <linux/threads.h>
5943 #include <linux/sched.h>
5944 @@ -34,11 +36,9 @@
5945 #endif
5946
5947 extern pud_t level3_kernel_pgt[512];
5948 -extern pud_t level3_physmem_pgt[512];
5949 extern pud_t level3_ident_pgt[512];
5950 extern pmd_t level2_kernel_pgt[512];
5951 extern pgd_t init_level4_pgt[];
5952 -extern pgd_t boot_level4_pgt[];
5953 extern unsigned long __supported_pte_mask;
5954
5955 #define swapper_pg_dir init_level4_pgt
5956 @@ -53,6 +53,8 @@
5957 extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
5958 #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
5959
5960 +#endif /* !__ASSEMBLY__ */
5961 +
5962 /*
5963 * PGDIR_SHIFT determines what a top-level page table entry can map
5964 */
5965 @@ -77,6 +79,8 @@
5966 */
5967 #define PTRS_PER_PTE 512
5968
5969 +#ifndef __ASSEMBLY__
5970 +
5971 #define pte_ERROR(e) \
5972 printk("%s:%d: bad pte %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \
5973 &(e), __pte_val(e), pte_pfn(e))
5974 @@ -119,22 +123,23 @@
5975
5976 #define pte_pgprot(a) (__pgprot((a).pte & ~PHYSICAL_PAGE_MASK))
5977
5978 -#define PMD_SIZE (1UL << PMD_SHIFT)
5979 +#endif /* !__ASSEMBLY__ */
5980 +
5981 +#define PMD_SIZE (_AC(1,UL) << PMD_SHIFT)
5982 #define PMD_MASK (~(PMD_SIZE-1))
5983 -#define PUD_SIZE (1UL << PUD_SHIFT)
5984 +#define PUD_SIZE (_AC(1,UL) << PUD_SHIFT)
5985 #define PUD_MASK (~(PUD_SIZE-1))
5986 -#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
5987 +#define PGDIR_SIZE (_AC(1,UL) << PGDIR_SHIFT)
5988 #define PGDIR_MASK (~(PGDIR_SIZE-1))
5989
5990 #define USER_PTRS_PER_PGD ((TASK_SIZE-1)/PGDIR_SIZE+1)
5991 #define FIRST_USER_ADDRESS 0
5992
5993 -#ifndef __ASSEMBLY__
5994 -#define MAXMEM 0x3fffffffffffUL
5995 -#define VMALLOC_START 0xffffc20000000000UL
5996 -#define VMALLOC_END 0xffffe1ffffffffffUL
5997 -#define MODULES_VADDR 0xffffffff88000000UL
5998 -#define MODULES_END 0xfffffffffff00000UL
5999 +#define MAXMEM _AC(0x3fffffffffff, UL)
6000 +#define VMALLOC_START _AC(0xffffc20000000000, UL)
6001 +#define VMALLOC_END _AC(0xffffe1ffffffffff, UL)
6002 +#define MODULES_VADDR _AC(0xffffffff88000000, UL)
6003 +#define MODULES_END _AC(0xfffffffffff00000, UL)
6004 #define MODULES_LEN (MODULES_END - MODULES_VADDR)
6005
6006 #define _PAGE_BIT_PRESENT 0
6007 @@ -160,16 +165,18 @@
6008 #define _PAGE_GLOBAL 0x100 /* Global TLB entry */
6009
6010 #define _PAGE_PROTNONE 0x080 /* If not present */
6011 -#define _PAGE_NX (1UL<<_PAGE_BIT_NX)
6012 +#define _PAGE_NX (_AC(1,UL)<<_PAGE_BIT_NX)
6013
6014 /* Mapped page is I/O or foreign and has no associated page struct. */
6015 #define _PAGE_IO 0x200
6016
6017 +#ifndef __ASSEMBLY__
6018 #if CONFIG_XEN_COMPAT <= 0x030002
6019 extern unsigned int __kernel_page_user;
6020 #else
6021 #define __kernel_page_user 0
6022 #endif
6023 +#endif
6024
6025 #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
6026 #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | __kernel_page_user)
6027 @@ -234,6 +241,8 @@
6028 #define __S110 PAGE_SHARED_EXEC
6029 #define __S111 PAGE_SHARED_EXEC
6030
6031 +#ifndef __ASSEMBLY__
6032 +
6033 static inline unsigned long pgd_bad(pgd_t pgd)
6034 {
6035 return __pgd_val(pgd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
6036 @@ -345,6 +354,20 @@
6037 static inline pte_t pte_mkhuge(pte_t pte) { __pte_val(pte) |= _PAGE_PSE; return pte; }
6038 static inline pte_t pte_clrhuge(pte_t pte) { __pte_val(pte) &= ~_PAGE_PSE; return pte; }
6039
6040 +static inline int ptep_test_and_clear_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
6041 +{
6042 + if (!pte_dirty(*ptep))
6043 + return 0;
6044 + return test_and_clear_bit(_PAGE_BIT_DIRTY, &ptep->pte);
6045 +}
6046 +
6047 +static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
6048 +{
6049 + if (!pte_young(*ptep))
6050 + return 0;
6051 + return test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte);
6052 +}
6053 +
6054 static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
6055 {
6056 pte_t pte = *ptep;
6057 @@ -470,18 +493,12 @@
6058 * bit at the same time. */
6059 #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
6060 #define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
6061 - do { \
6062 - if (dirty) \
6063 - ptep_establish(vma, address, ptep, entry); \
6064 - } while (0)
6065 -
6066 -
6067 -/*
6068 - * i386 says: We don't actually have these, but we want to advertise
6069 - * them so that we can encompass the flush here.
6070 - */
6071 -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
6072 -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
6073 +({ \
6074 + int __changed = !pte_same(*(ptep), entry); \
6075 + if (__changed && (dirty)) \
6076 + ptep_establish(vma, address, ptep, entry); \
6077 + __changed; \
6078 +})
6079
6080 #define __HAVE_ARCH_PTEP_CLEAR_DIRTY_FLUSH
6081 #define ptep_clear_flush_dirty(vma, address, ptep) \
6082 @@ -490,7 +507,7 @@
6083 int __dirty = pte_dirty(__pte); \
6084 __pte = pte_mkclean(__pte); \
6085 if ((vma)->vm_mm->context.pinned) \
6086 - ptep_set_access_flags(vma, address, ptep, __pte, __dirty); \
6087 + (void)ptep_set_access_flags(vma, address, ptep, __pte, __dirty); \
6088 else if (__dirty) \
6089 set_pte(ptep, __pte); \
6090 __dirty; \
6091 @@ -503,7 +520,7 @@
6092 int __young = pte_young(__pte); \
6093 __pte = pte_mkold(__pte); \
6094 if ((vma)->vm_mm->context.pinned) \
6095 - ptep_set_access_flags(vma, address, ptep, __pte, __young); \
6096 + (void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \
6097 else if (__young) \
6098 set_pte(ptep, __pte); \
6099 __young; \
6100 @@ -517,10 +534,7 @@
6101 #define __swp_entry_to_pte(x) ((pte_t) { (x).val })
6102
6103 extern spinlock_t pgd_lock;
6104 -extern struct page *pgd_list;
6105 -void vmalloc_sync_all(void);
6106 -
6107 -#endif /* !__ASSEMBLY__ */
6108 +extern struct list_head pgd_list;
6109
6110 extern int kern_addr_valid(unsigned long addr);
6111
6112 @@ -559,10 +573,6 @@
6113 #define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
6114 direct_remap_pfn_range(vma,vaddr,pfn,size,prot,DOMID_IO)
6115
6116 -#define MK_IOSPACE_PFN(space, pfn) (pfn)
6117 -#define GET_IOSPACE(pfn) 0
6118 -#define GET_PFN(pfn) (pfn)
6119 -
6120 #define HAVE_ARCH_UNMAPPED_AREA
6121
6122 #define pgtable_cache_init() do { } while (0)
6123 @@ -576,11 +586,14 @@
6124 #define kc_offset_to_vaddr(o) \
6125 (((o) & (1UL << (__VIRTUAL_MASK_SHIFT-1))) ? ((o) | (~__VIRTUAL_MASK)) : (o))
6126
6127 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
6128 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
6129 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
6130 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
6131 #define __HAVE_ARCH_PTEP_CLEAR_FLUSH
6132 #define __HAVE_ARCH_PTEP_SET_WRPROTECT
6133 #define __HAVE_ARCH_PTE_SAME
6134 #include <asm-generic/pgtable.h>
6135 +#endif /* !__ASSEMBLY__ */
6136
6137 #endif /* _X86_64_PGTABLE_H */
6138 --- a/include/asm-x86/mach-xen/asm/processor_32.h
6139 +++ b/include/asm-x86/mach-xen/asm/processor_32.h
6140 @@ -21,6 +21,7 @@
6141 #include <asm/percpu.h>
6142 #include <linux/cpumask.h>
6143 #include <linux/init.h>
6144 +#include <asm/processor-flags.h>
6145 #include <xen/interface/physdev.h>
6146
6147 /* flag for disabling the tsc */
6148 @@ -118,7 +119,8 @@
6149
6150 void __init cpu_detect(struct cpuinfo_x86 *c);
6151
6152 -extern void identify_cpu(struct cpuinfo_x86 *);
6153 +extern void identify_boot_cpu(void);
6154 +extern void identify_secondary_cpu(struct cpuinfo_x86 *);
6155 extern void print_cpu_info(struct cpuinfo_x86 *);
6156 extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
6157 extern unsigned short num_cache_leaves;
6158 @@ -129,29 +131,8 @@
6159 static inline void detect_ht(struct cpuinfo_x86 *c) {}
6160 #endif
6161
6162 -/*
6163 - * EFLAGS bits
6164 - */
6165 -#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */
6166 -#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */
6167 -#define X86_EFLAGS_AF 0x00000010 /* Auxillary carry Flag */
6168 -#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */
6169 -#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */
6170 -#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */
6171 -#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */
6172 -#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */
6173 -#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */
6174 -#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */
6175 -#define X86_EFLAGS_NT 0x00004000 /* Nested Task */
6176 -#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */
6177 -#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */
6178 -#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */
6179 -#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */
6180 -#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */
6181 -#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */
6182 -
6183 -static inline fastcall void xen_cpuid(unsigned int *eax, unsigned int *ebx,
6184 - unsigned int *ecx, unsigned int *edx)
6185 +static inline void xen_cpuid(unsigned int *eax, unsigned int *ebx,
6186 + unsigned int *ecx, unsigned int *edx)
6187 {
6188 /* ecx is often an input as well as an output. */
6189 __asm__(XEN_CPUID
6190 @@ -165,21 +146,6 @@
6191 #define load_cr3(pgdir) write_cr3(__pa(pgdir))
6192
6193 /*
6194 - * Intel CPU features in CR4
6195 - */
6196 -#define X86_CR4_VME 0x0001 /* enable vm86 extensions */
6197 -#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */
6198 -#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */
6199 -#define X86_CR4_DE 0x0008 /* enable debugging extensions */
6200 -#define X86_CR4_PSE 0x0010 /* enable page size extensions */
6201 -#define X86_CR4_PAE 0x0020 /* enable physical address extensions */
6202 -#define X86_CR4_MCE 0x0040 /* Machine check enable */
6203 -#define X86_CR4_PGE 0x0080 /* enable global pages */
6204 -#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */
6205 -#define X86_CR4_OSFXSR 0x0200 /* enable fast FPU save and restore */
6206 -#define X86_CR4_OSXMMEXCPT 0x0400 /* enable unmasked SSE exceptions */
6207 -
6208 -/*
6209 * Save the cr4 feature set we're using (ie
6210 * Pentium 4MB enable and PPro Global page
6211 * enable), so that any CPU's that boot up
6212 @@ -206,26 +172,6 @@
6213 }
6214
6215 /*
6216 - * NSC/Cyrix CPU configuration register indexes
6217 - */
6218 -
6219 -#define CX86_PCR0 0x20
6220 -#define CX86_GCR 0xb8
6221 -#define CX86_CCR0 0xc0
6222 -#define CX86_CCR1 0xc1
6223 -#define CX86_CCR2 0xc2
6224 -#define CX86_CCR3 0xc3
6225 -#define CX86_CCR4 0xe8
6226 -#define CX86_CCR5 0xe9
6227 -#define CX86_CCR6 0xea
6228 -#define CX86_CCR7 0xeb
6229 -#define CX86_PCR1 0xf0
6230 -#define CX86_DIR0 0xfe
6231 -#define CX86_DIR1 0xff
6232 -#define CX86_ARR_BASE 0xc4
6233 -#define CX86_RCR_BASE 0xdc
6234 -
6235 -/*
6236 * NSC/Cyrix CPU indexed register access macros
6237 */
6238
6239 @@ -351,7 +297,8 @@
6240 struct thread_struct;
6241
6242 #ifndef CONFIG_X86_NO_TSS
6243 -struct tss_struct {
6244 +/* This is the TSS defined by the hardware. */
6245 +struct i386_hw_tss {
6246 unsigned short back_link,__blh;
6247 unsigned long esp0;
6248 unsigned short ss0,__ss0h;
6249 @@ -375,6 +322,11 @@
6250 unsigned short gs, __gsh;
6251 unsigned short ldt, __ldth;
6252 unsigned short trace, io_bitmap_base;
6253 +} __attribute__((packed));
6254 +
6255 +struct tss_struct {
6256 + struct i386_hw_tss x86_tss;
6257 +
6258 /*
6259 * The extra 1 is there because the CPU will access an
6260 * additional byte beyond the end of the IO permission
6261 @@ -428,10 +380,11 @@
6262 };
6263
6264 #define INIT_THREAD { \
6265 + .esp0 = sizeof(init_stack) + (long)&init_stack, \
6266 .vm86_info = NULL, \
6267 .sysenter_cs = __KERNEL_CS, \
6268 .io_bitmap_ptr = NULL, \
6269 - .fs = __KERNEL_PDA, \
6270 + .fs = __KERNEL_PERCPU, \
6271 }
6272
6273 /*
6274 @@ -441,10 +394,12 @@
6275 * be within the limit.
6276 */
6277 #define INIT_TSS { \
6278 - .esp0 = sizeof(init_stack) + (long)&init_stack, \
6279 - .ss0 = __KERNEL_DS, \
6280 - .ss1 = __KERNEL_CS, \
6281 - .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
6282 + .x86_tss = { \
6283 + .esp0 = sizeof(init_stack) + (long)&init_stack, \
6284 + .ss0 = __KERNEL_DS, \
6285 + .ss1 = __KERNEL_CS, \
6286 + .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
6287 + }, \
6288 .io_bitmap = { [ 0 ... IO_BITMAP_LONGS] = ~0 }, \
6289 }
6290
6291 @@ -551,38 +506,33 @@
6292
6293 #define cpu_relax() rep_nop()
6294
6295 -#define paravirt_enabled() 0
6296 -#define __cpuid xen_cpuid
6297 -
6298 #ifndef CONFIG_X86_NO_TSS
6299 -static inline void __load_esp0(struct tss_struct *tss, struct thread_struct *thread)
6300 +static inline void native_load_esp0(struct tss_struct *tss, struct thread_struct *thread)
6301 {
6302 - tss->esp0 = thread->esp0;
6303 + tss->x86_tss.esp0 = thread->esp0;
6304 /* This can only happen when SEP is enabled, no need to test "SEP"arately */
6305 - if (unlikely(tss->ss1 != thread->sysenter_cs)) {
6306 - tss->ss1 = thread->sysenter_cs;
6307 + if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
6308 + tss->x86_tss.ss1 = thread->sysenter_cs;
6309 wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
6310 }
6311 }
6312 -#define load_esp0(tss, thread) \
6313 - __load_esp0(tss, thread)
6314 #else
6315 -#define load_esp0(tss, thread) do { \
6316 +#define xen_load_esp0(tss, thread) do { \
6317 if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->esp0)) \
6318 BUG(); \
6319 } while (0)
6320 #endif
6321
6322
6323 -/*
6324 - * These special macros can be used to get or set a debugging register
6325 - */
6326 -#define get_debugreg(var, register) \
6327 - (var) = HYPERVISOR_get_debugreg(register)
6328 -#define set_debugreg(value, register) \
6329 - WARN_ON(HYPERVISOR_set_debugreg(register, value))
6330 +static inline unsigned long xen_get_debugreg(int regno)
6331 +{
6332 + return HYPERVISOR_get_debugreg(regno);
6333 +}
6334
6335 -#define set_iopl_mask xen_set_iopl_mask
6336 +static inline void xen_set_debugreg(int regno, unsigned long value)
6337 +{
6338 + WARN_ON(HYPERVISOR_set_debugreg(regno, value));
6339 +}
6340
6341 /*
6342 * Set IOPL bits in EFLAGS from given mask
6343 @@ -597,6 +547,21 @@
6344 }
6345
6346
6347 +#define paravirt_enabled() 0
6348 +#define __cpuid xen_cpuid
6349 +
6350 +#define load_esp0 xen_load_esp0
6351 +
6352 +/*
6353 + * These special macros can be used to get or set a debugging register
6354 + */
6355 +#define get_debugreg(var, register) \
6356 + (var) = xen_get_debugreg(register)
6357 +#define set_debugreg(value, register) \
6358 + xen_set_debugreg(register, value)
6359 +
6360 +#define set_iopl_mask xen_set_iopl_mask
6361 +
6362 /*
6363 * Generic CPUID function
6364 * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
6365 @@ -749,8 +714,14 @@
6366 extern void enable_sep_cpu(void);
6367 extern int sysenter_setup(void);
6368
6369 -extern int init_gdt(int cpu, struct task_struct *idle);
6370 +/* Defined in head.S */
6371 +extern struct Xgt_desc_struct early_gdt_descr;
6372 +
6373 extern void cpu_set_gdt(int);
6374 -extern void secondary_cpu_init(void);
6375 +extern void switch_to_new_gdt(void);
6376 +extern void cpu_init(void);
6377 +extern void init_gdt(int cpu);
6378 +
6379 +extern int force_mwait;
6380
6381 #endif /* __ASM_I386_PROCESSOR_H */
6382 --- a/include/asm-x86/mach-xen/asm/processor_64.h
6383 +++ b/include/asm-x86/mach-xen/asm/processor_64.h
6384 @@ -20,6 +20,7 @@
6385 #include <asm/percpu.h>
6386 #include <linux/personality.h>
6387 #include <linux/cpumask.h>
6388 +#include <asm/processor-flags.h>
6389
6390 #define TF_MASK 0x00000100
6391 #define IF_MASK 0x00000200
6392 @@ -103,42 +104,6 @@
6393 extern unsigned short num_cache_leaves;
6394
6395 /*
6396 - * EFLAGS bits
6397 - */
6398 -#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */
6399 -#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */
6400 -#define X86_EFLAGS_AF 0x00000010 /* Auxillary carry Flag */
6401 -#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */
6402 -#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */
6403 -#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */
6404 -#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */
6405 -#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */
6406 -#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */
6407 -#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */
6408 -#define X86_EFLAGS_NT 0x00004000 /* Nested Task */
6409 -#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */
6410 -#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */
6411 -#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */
6412 -#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */
6413 -#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */
6414 -#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */
6415 -
6416 -/*
6417 - * Intel CPU features in CR4
6418 - */
6419 -#define X86_CR4_VME 0x0001 /* enable vm86 extensions */
6420 -#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */
6421 -#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */
6422 -#define X86_CR4_DE 0x0008 /* enable debugging extensions */
6423 -#define X86_CR4_PSE 0x0010 /* enable page size extensions */
6424 -#define X86_CR4_PAE 0x0020 /* enable physical address extensions */
6425 -#define X86_CR4_MCE 0x0040 /* Machine check enable */
6426 -#define X86_CR4_PGE 0x0080 /* enable global pages */
6427 -#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */
6428 -#define X86_CR4_OSFXSR 0x0200 /* enable fast FPU save and restore */
6429 -#define X86_CR4_OSXMMEXCPT 0x0400 /* enable unmasked SSE exceptions */
6430 -
6431 -/*
6432 * Save the cr4 feature set we're using (ie
6433 * Pentium 4MB enable and PPro Global page
6434 * enable), so that any CPU's that boot up
6435 @@ -203,7 +168,7 @@
6436 u32 mxcsr;
6437 u32 mxcsr_mask;
6438 u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
6439 - u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 128 bytes */
6440 + u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
6441 u32 padding[24];
6442 } __attribute__ ((aligned (16)));
6443
6444 @@ -436,22 +401,6 @@
6445 #define cpu_relax() rep_nop()
6446
6447 /*
6448 - * NSC/Cyrix CPU configuration register indexes
6449 - */
6450 -#define CX86_CCR0 0xc0
6451 -#define CX86_CCR1 0xc1
6452 -#define CX86_CCR2 0xc2
6453 -#define CX86_CCR3 0xc3
6454 -#define CX86_CCR4 0xe8
6455 -#define CX86_CCR5 0xe9
6456 -#define CX86_CCR6 0xea
6457 -#define CX86_CCR7 0xeb
6458 -#define CX86_DIR0 0xfe
6459 -#define CX86_DIR1 0xff
6460 -#define CX86_ARR_BASE 0xc4
6461 -#define CX86_RCR_BASE 0xdc
6462 -
6463 -/*
6464 * NSC/Cyrix CPU indexed register access macros
6465 */
6466
6467 --- a/include/asm-x86/mach-xen/asm/scatterlist_32.h
6468 +++ b/include/asm-x86/mach-xen/asm/scatterlist_32.h
6469 @@ -1,6 +1,8 @@
6470 #ifndef _I386_SCATTERLIST_H
6471 #define _I386_SCATTERLIST_H
6472
6473 +#include <asm/types.h>
6474 +
6475 struct scatterlist {
6476 struct page *page;
6477 unsigned int offset;
6478 --- a/include/asm-x86/mach-xen/asm/segment_32.h
6479 +++ b/include/asm-x86/mach-xen/asm/segment_32.h
6480 @@ -39,7 +39,7 @@
6481 * 25 - APM BIOS support
6482 *
6483 * 26 - ESPFIX small SS
6484 - * 27 - PDA [ per-cpu private data area ]
6485 + * 27 - per-cpu [ offset to per-cpu data area ]
6486 * 28 - unused
6487 * 29 - unused
6488 * 30 - unused
6489 @@ -74,8 +74,12 @@
6490 #define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14)
6491 #define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
6492
6493 -#define GDT_ENTRY_PDA (GDT_ENTRY_KERNEL_BASE + 15)
6494 -#define __KERNEL_PDA (GDT_ENTRY_PDA * 8)
6495 +#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE + 15)
6496 +#ifdef CONFIG_SMP
6497 +#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8)
6498 +#else
6499 +#define __KERNEL_PERCPU 0
6500 +#endif
6501
6502 #define GDT_ENTRY_DOUBLEFAULT_TSS 31
6503
6504 --- a/include/asm-x86/mach-xen/asm/smp_32.h
6505 +++ b/include/asm-x86/mach-xen/asm/smp_32.h
6506 @@ -8,19 +8,15 @@
6507 #include <linux/kernel.h>
6508 #include <linux/threads.h>
6509 #include <linux/cpumask.h>
6510 -#include <asm/pda.h>
6511 #endif
6512
6513 -#ifdef CONFIG_X86_LOCAL_APIC
6514 -#ifndef __ASSEMBLY__
6515 -#include <asm/fixmap.h>
6516 +#if defined(CONFIG_X86_LOCAL_APIC) && !defined(__ASSEMBLY__)
6517 #include <asm/bitops.h>
6518 #include <asm/mpspec.h>
6519 +#include <asm/apic.h>
6520 #ifdef CONFIG_X86_IO_APIC
6521 #include <asm/io_apic.h>
6522 #endif
6523 -#include <asm/apic.h>
6524 -#endif
6525 #endif
6526
6527 #define BAD_APICID 0xFFu
6528 @@ -52,9 +48,76 @@
6529 extern void cpu_uninit(void);
6530 #endif
6531
6532 -#ifndef CONFIG_PARAVIRT
6533 +#ifndef CONFIG_XEN
6534 +struct smp_ops
6535 +{
6536 + void (*smp_prepare_boot_cpu)(void);
6537 + void (*smp_prepare_cpus)(unsigned max_cpus);
6538 + int (*cpu_up)(unsigned cpu);
6539 + void (*smp_cpus_done)(unsigned max_cpus);
6540 +
6541 + void (*smp_send_stop)(void);
6542 + void (*smp_send_reschedule)(int cpu);
6543 + int (*smp_call_function_mask)(cpumask_t mask,
6544 + void (*func)(void *info), void *info,
6545 + int wait);
6546 +};
6547 +
6548 +extern struct smp_ops smp_ops;
6549 +
6550 +static inline void smp_prepare_boot_cpu(void)
6551 +{
6552 + smp_ops.smp_prepare_boot_cpu();
6553 +}
6554 +static inline void smp_prepare_cpus(unsigned int max_cpus)
6555 +{
6556 + smp_ops.smp_prepare_cpus(max_cpus);
6557 +}
6558 +static inline int __cpu_up(unsigned int cpu)
6559 +{
6560 + return smp_ops.cpu_up(cpu);
6561 +}
6562 +static inline void smp_cpus_done(unsigned int max_cpus)
6563 +{
6564 + smp_ops.smp_cpus_done(max_cpus);
6565 +}
6566 +
6567 +static inline void smp_send_stop(void)
6568 +{
6569 + smp_ops.smp_send_stop();
6570 +}
6571 +static inline void smp_send_reschedule(int cpu)
6572 +{
6573 + smp_ops.smp_send_reschedule(cpu);
6574 +}
6575 +static inline int smp_call_function_mask(cpumask_t mask,
6576 + void (*func) (void *info), void *info,
6577 + int wait)
6578 +{
6579 + return smp_ops.smp_call_function_mask(mask, func, info, wait);
6580 +}
6581 +
6582 +void native_smp_prepare_boot_cpu(void);
6583 +void native_smp_prepare_cpus(unsigned int max_cpus);
6584 +int native_cpu_up(unsigned int cpunum);
6585 +void native_smp_cpus_done(unsigned int max_cpus);
6586 +
6587 #define startup_ipi_hook(phys_apicid, start_eip, start_esp) \
6588 do { } while (0)
6589 +
6590 +#else
6591 +
6592 +
6593 +void xen_smp_send_stop(void);
6594 +void xen_smp_send_reschedule(int cpu);
6595 +int xen_smp_call_function_mask(cpumask_t mask,
6596 + void (*func) (void *info), void *info,
6597 + int wait);
6598 +
6599 +#define smp_send_stop xen_smp_send_stop
6600 +#define smp_send_reschedule xen_smp_send_reschedule
6601 +#define smp_call_function_mask xen_smp_call_function_mask
6602 +
6603 #endif
6604
6605 /*
6606 @@ -62,7 +125,8 @@
6607 * from the initial startup. We map APIC_BASE very early in page_setup(),
6608 * so this is correct in the x86 case.
6609 */
6610 -#define raw_smp_processor_id() (read_pda(cpu_number))
6611 +DECLARE_PER_CPU(int, cpu_number);
6612 +#define raw_smp_processor_id() (x86_read_percpu(cpu_number))
6613
6614 extern cpumask_t cpu_possible_map;
6615 #define cpu_callin_map cpu_possible_map
6616 @@ -73,20 +137,6 @@
6617 return cpus_weight(cpu_possible_map);
6618 }
6619
6620 -#ifdef CONFIG_X86_LOCAL_APIC
6621 -
6622 -#ifdef APIC_DEFINITION
6623 -extern int hard_smp_processor_id(void);
6624 -#else
6625 -#include <mach_apicdef.h>
6626 -static inline int hard_smp_processor_id(void)
6627 -{
6628 - /* we don't want to mark this access volatile - bad code generation */
6629 - return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID));
6630 -}
6631 -#endif
6632 -#endif
6633 -
6634 #define safe_smp_processor_id() smp_processor_id()
6635 extern int __cpu_disable(void);
6636 extern void __cpu_die(unsigned int cpu);
6637 @@ -102,10 +152,31 @@
6638
6639 #define NO_PROC_ID 0xFF /* No processor magic marker */
6640
6641 -#endif
6642 +#endif /* CONFIG_SMP */
6643
6644 #ifndef __ASSEMBLY__
6645
6646 +#ifdef CONFIG_X86_LOCAL_APIC
6647 +
6648 +#ifdef APIC_DEFINITION
6649 +extern int hard_smp_processor_id(void);
6650 +#else
6651 +#include <mach_apicdef.h>
6652 +static inline int hard_smp_processor_id(void)
6653 +{
6654 + /* we don't want to mark this access volatile - bad code generation */
6655 + return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID));
6656 +}
6657 +#endif /* APIC_DEFINITION */
6658 +
6659 +#else /* CONFIG_X86_LOCAL_APIC */
6660 +
6661 +#ifndef CONFIG_SMP
6662 +#define hard_smp_processor_id() 0
6663 +#endif
6664 +
6665 +#endif /* CONFIG_X86_LOCAL_APIC */
6666 +
6667 extern u8 apicid_2_node[];
6668
6669 #ifdef CONFIG_X86_LOCAL_APIC
6670 --- a/include/asm-x86/mach-xen/asm/smp_64.h
6671 +++ b/include/asm-x86/mach-xen/asm/smp_64.h
6672 @@ -11,12 +11,11 @@
6673 extern int disable_apic;
6674
6675 #ifdef CONFIG_X86_LOCAL_APIC
6676 -#include <asm/fixmap.h>
6677 #include <asm/mpspec.h>
6678 +#include <asm/apic.h>
6679 #ifdef CONFIG_X86_IO_APIC
6680 #include <asm/io_apic.h>
6681 #endif
6682 -#include <asm/apic.h>
6683 #include <asm/thread_info.h>
6684 #endif
6685
6686 @@ -41,7 +40,6 @@
6687 extern void unlock_ipi_call_lock(void);
6688 extern int smp_num_siblings;
6689 extern void smp_send_reschedule(int cpu);
6690 -void smp_stop_cpu(void);
6691
6692 extern cpumask_t cpu_sibling_map[NR_CPUS];
6693 extern cpumask_t cpu_core_map[NR_CPUS];
6694 @@ -62,14 +60,6 @@
6695
6696 #define raw_smp_processor_id() read_pda(cpunumber)
6697
6698 -#ifdef CONFIG_X86_LOCAL_APIC
6699 -static inline int hard_smp_processor_id(void)
6700 -{
6701 - /* we don't want to mark this access volatile - bad code generation */
6702 - return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID));
6703 -}
6704 -#endif
6705 -
6706 extern int __cpu_disable(void);
6707 extern void __cpu_die(unsigned int cpu);
6708 extern void prefill_possible_map(void);
6709 @@ -78,6 +68,14 @@
6710
6711 #define NO_PROC_ID 0xFF /* No processor magic marker */
6712
6713 +#endif /* CONFIG_SMP */
6714 +
6715 +#ifdef CONFIG_X86_LOCAL_APIC
6716 +static inline int hard_smp_processor_id(void)
6717 +{
6718 + /* we don't want to mark this access volatile - bad code generation */
6719 + return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID));
6720 +}
6721 #endif
6722
6723 /*
6724 --- a/include/asm-x86/mach-xen/asm/system_32.h
6725 +++ b/include/asm-x86/mach-xen/asm/system_32.h
6726 @@ -4,7 +4,7 @@
6727 #include <linux/kernel.h>
6728 #include <asm/segment.h>
6729 #include <asm/cpufeature.h>
6730 -#include <linux/bitops.h> /* for LOCK_PREFIX */
6731 +#include <asm/cmpxchg.h>
6732 #include <asm/synch_bitops.h>
6733 #include <asm/hypervisor.h>
6734
6735 @@ -90,308 +90,102 @@
6736 #define savesegment(seg, value) \
6737 asm volatile("mov %%" #seg ",%0":"=rm" (value))
6738
6739 -#define read_cr0() ({ \
6740 - unsigned int __dummy; \
6741 - __asm__ __volatile__( \
6742 - "movl %%cr0,%0\n\t" \
6743 - :"=r" (__dummy)); \
6744 - __dummy; \
6745 -})
6746 -#define write_cr0(x) \
6747 - __asm__ __volatile__("movl %0,%%cr0": :"r" (x))
6748 -
6749 -#define read_cr2() (current_vcpu_info()->arch.cr2)
6750 -#define write_cr2(x) \
6751 - __asm__ __volatile__("movl %0,%%cr2": :"r" (x))
6752 -
6753 -#define read_cr3() ({ \
6754 - unsigned int __dummy; \
6755 - __asm__ ( \
6756 - "movl %%cr3,%0\n\t" \
6757 - :"=r" (__dummy)); \
6758 - __dummy = xen_cr3_to_pfn(__dummy); \
6759 - mfn_to_pfn(__dummy) << PAGE_SHIFT; \
6760 -})
6761 -#define write_cr3(x) ({ \
6762 - unsigned int __dummy = pfn_to_mfn((x) >> PAGE_SHIFT); \
6763 - __dummy = xen_pfn_to_cr3(__dummy); \
6764 - __asm__ __volatile__("movl %0,%%cr3": :"r" (__dummy)); \
6765 -})
6766 -#define read_cr4() ({ \
6767 - unsigned int __dummy; \
6768 - __asm__( \
6769 - "movl %%cr4,%0\n\t" \
6770 - :"=r" (__dummy)); \
6771 - __dummy; \
6772 -})
6773 -#define read_cr4_safe() ({ \
6774 - unsigned int __dummy; \
6775 - /* This could fault if %cr4 does not exist */ \
6776 - __asm__("1: movl %%cr4, %0 \n" \
6777 - "2: \n" \
6778 - ".section __ex_table,\"a\" \n" \
6779 - ".long 1b,2b \n" \
6780 - ".previous \n" \
6781 - : "=r" (__dummy): "0" (0)); \
6782 - __dummy; \
6783 -})
6784 -
6785 -#define write_cr4(x) \
6786 - __asm__ __volatile__("movl %0,%%cr4": :"r" (x))
6787 -
6788 -#define wbinvd() \
6789 - __asm__ __volatile__ ("wbinvd": : :"memory")
6790 -
6791 -/* Clear the 'TS' bit */
6792 -#define clts() (HYPERVISOR_fpu_taskswitch(0))
6793 -
6794 -/* Set the 'TS' bit */
6795 -#define stts() (HYPERVISOR_fpu_taskswitch(1))
6796 -
6797 -#endif /* __KERNEL__ */
6798 -
6799 -static inline unsigned long get_limit(unsigned long segment)
6800 +static inline void xen_clts(void)
6801 {
6802 - unsigned long __limit;
6803 - __asm__("lsll %1,%0"
6804 - :"=r" (__limit):"r" (segment));
6805 - return __limit+1;
6806 + HYPERVISOR_fpu_taskswitch(0);
6807 }
6808
6809 -#define nop() __asm__ __volatile__ ("nop")
6810 -
6811 -#define xchg(ptr,v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr))))
6812 -
6813 -#define tas(ptr) (xchg((ptr),1))
6814 -
6815 -struct __xchg_dummy { unsigned long a[100]; };
6816 -#define __xg(x) ((struct __xchg_dummy *)(x))
6817 +static inline unsigned long xen_read_cr0(void)
6818 +{
6819 + unsigned long val;
6820 + asm volatile("movl %%cr0,%0\n\t" :"=r" (val));
6821 + return val;
6822 +}
6823
6824 +static inline void xen_write_cr0(unsigned long val)
6825 +{
6826 + asm volatile("movl %0,%%cr0": :"r" (val));
6827 +}
6828
6829 -#ifdef CONFIG_X86_CMPXCHG64
6830 +#define xen_read_cr2() (current_vcpu_info()->arch.cr2)
6831
6832 -/*
6833 - * The semantics of XCHGCMP8B are a bit strange, this is why
6834 - * there is a loop and the loading of %%eax and %%edx has to
6835 - * be inside. This inlines well in most cases, the cached
6836 - * cost is around ~38 cycles. (in the future we might want
6837 - * to do an SIMD/3DNOW!/MMX/FPU 64-bit store here, but that
6838 - * might have an implicit FPU-save as a cost, so it's not
6839 - * clear which path to go.)
6840 - *
6841 - * cmpxchg8b must be used with the lock prefix here to allow
6842 - * the instruction to be executed atomically, see page 3-102
6843 - * of the instruction set reference 24319102.pdf. We need
6844 - * the reader side to see the coherent 64bit value.
6845 - */
6846 -static inline void __set_64bit (unsigned long long * ptr,
6847 - unsigned int low, unsigned int high)
6848 +static inline void xen_write_cr2(unsigned long val)
6849 {
6850 - __asm__ __volatile__ (
6851 - "\n1:\t"
6852 - "movl (%0), %%eax\n\t"
6853 - "movl 4(%0), %%edx\n\t"
6854 - "lock cmpxchg8b (%0)\n\t"
6855 - "jnz 1b"
6856 - : /* no outputs */
6857 - : "D"(ptr),
6858 - "b"(low),
6859 - "c"(high)
6860 - : "ax","dx","memory");
6861 + asm volatile("movl %0,%%cr2": :"r" (val));
6862 }
6863
6864 -static inline void __set_64bit_constant (unsigned long long *ptr,
6865 - unsigned long long value)
6866 +static inline unsigned long xen_read_cr3(void)
6867 {
6868 - __set_64bit(ptr,(unsigned int)(value), (unsigned int)((value)>>32ULL));
6869 + unsigned long val;
6870 + asm volatile("movl %%cr3,%0\n\t" :"=r" (val));
6871 + return mfn_to_pfn(xen_cr3_to_pfn(val)) << PAGE_SHIFT;
6872 }
6873 -#define ll_low(x) *(((unsigned int*)&(x))+0)
6874 -#define ll_high(x) *(((unsigned int*)&(x))+1)
6875
6876 -static inline void __set_64bit_var (unsigned long long *ptr,
6877 - unsigned long long value)
6878 +static inline void xen_write_cr3(unsigned long val)
6879 {
6880 - __set_64bit(ptr,ll_low(value), ll_high(value));
6881 + val = xen_pfn_to_cr3(pfn_to_mfn(val >> PAGE_SHIFT));
6882 + asm volatile("movl %0,%%cr3": :"r" (val));
6883 }
6884
6885 -#define set_64bit(ptr,value) \
6886 -(__builtin_constant_p(value) ? \
6887 - __set_64bit_constant(ptr, value) : \
6888 - __set_64bit_var(ptr, value) )
6889 +static inline unsigned long xen_read_cr4(void)
6890 +{
6891 + unsigned long val;
6892 + asm volatile("movl %%cr4,%0\n\t" :"=r" (val));
6893 + return val;
6894 +}
6895
6896 -#define _set_64bit(ptr,value) \
6897 -(__builtin_constant_p(value) ? \
6898 - __set_64bit(ptr, (unsigned int)(value), (unsigned int)((value)>>32ULL) ) : \
6899 - __set_64bit(ptr, ll_low(value), ll_high(value)) )
6900 +static inline unsigned long xen_read_cr4_safe(void)
6901 +{
6902 + unsigned long val;
6903 + /* This could fault if %cr4 does not exist */
6904 + asm("1: movl %%cr4, %0 \n"
6905 + "2: \n"
6906 + ".section __ex_table,\"a\" \n"
6907 + ".long 1b,2b \n"
6908 + ".previous \n"
6909 + : "=r" (val): "0" (0));
6910 + return val;
6911 +}
6912
6913 -#endif
6914 +static inline void xen_write_cr4(unsigned long val)
6915 +{
6916 + asm volatile("movl %0,%%cr4": :"r" (val));
6917 +}
6918
6919 -/*
6920 - * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
6921 - * Note 2: xchg has side effect, so that attribute volatile is necessary,
6922 - * but generally the primitive is invalid, *ptr is output argument. --ANK
6923 - */
6924 -static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size)
6925 +static inline void xen_wbinvd(void)
6926 {
6927 - switch (size) {
6928 - case 1:
6929 - __asm__ __volatile__("xchgb %b0,%1"
6930 - :"=q" (x)
6931 - :"m" (*__xg(ptr)), "0" (x)
6932 - :"memory");
6933 - break;
6934 - case 2:
6935 - __asm__ __volatile__("xchgw %w0,%1"
6936 - :"=r" (x)
6937 - :"m" (*__xg(ptr)), "0" (x)
6938 - :"memory");
6939 - break;
6940 - case 4:
6941 - __asm__ __volatile__("xchgl %0,%1"
6942 - :"=r" (x)
6943 - :"m" (*__xg(ptr)), "0" (x)
6944 - :"memory");
6945 - break;
6946 - }
6947 - return x;
6948 + asm volatile("wbinvd": : :"memory");
6949 }
6950
6951 -/*
6952 - * Atomic compare and exchange. Compare OLD with MEM, if identical,
6953 - * store NEW in MEM. Return the initial value in MEM. Success is
6954 - * indicated by comparing RETURN with OLD.
6955 - */
6956 +#define read_cr0() (xen_read_cr0())
6957 +#define write_cr0(x) (xen_write_cr0(x))
6958 +#define read_cr2() (xen_read_cr2())
6959 +#define write_cr2(x) (xen_write_cr2(x))
6960 +#define read_cr3() (xen_read_cr3())
6961 +#define write_cr3(x) (xen_write_cr3(x))
6962 +#define read_cr4() (xen_read_cr4())
6963 +#define read_cr4_safe() (xen_read_cr4_safe())
6964 +#define write_cr4(x) (xen_write_cr4(x))
6965 +#define wbinvd() (xen_wbinvd())
6966
6967 -#ifdef CONFIG_X86_CMPXCHG
6968 -#define __HAVE_ARCH_CMPXCHG 1
6969 -#define cmpxchg(ptr,o,n)\
6970 - ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
6971 - (unsigned long)(n),sizeof(*(ptr))))
6972 -#define sync_cmpxchg(ptr,o,n)\
6973 - ((__typeof__(*(ptr)))__sync_cmpxchg((ptr),(unsigned long)(o),\
6974 - (unsigned long)(n),sizeof(*(ptr))))
6975 -#endif
6976 -
6977 -static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
6978 - unsigned long new, int size)
6979 -{
6980 - unsigned long prev;
6981 - switch (size) {
6982 - case 1:
6983 - __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
6984 - : "=a"(prev)
6985 - : "q"(new), "m"(*__xg(ptr)), "0"(old)
6986 - : "memory");
6987 - return prev;
6988 - case 2:
6989 - __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
6990 - : "=a"(prev)
6991 - : "r"(new), "m"(*__xg(ptr)), "0"(old)
6992 - : "memory");
6993 - return prev;
6994 - case 4:
6995 - __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %1,%2"
6996 - : "=a"(prev)
6997 - : "r"(new), "m"(*__xg(ptr)), "0"(old)
6998 - : "memory");
6999 - return prev;
7000 - }
7001 - return old;
7002 -}
7003 +/* Clear the 'TS' bit */
7004 +#define clts() (xen_clts())
7005
7006 -/*
7007 - * Always use locked operations when touching memory shared with a
7008 - * hypervisor, since the system may be SMP even if the guest kernel
7009 - * isn't.
7010 - */
7011 -static inline unsigned long __sync_cmpxchg(volatile void *ptr,
7012 - unsigned long old,
7013 - unsigned long new, int size)
7014 -{
7015 - unsigned long prev;
7016 - switch (size) {
7017 - case 1:
7018 - __asm__ __volatile__("lock; cmpxchgb %b1,%2"
7019 - : "=a"(prev)
7020 - : "q"(new), "m"(*__xg(ptr)), "0"(old)
7021 - : "memory");
7022 - return prev;
7023 - case 2:
7024 - __asm__ __volatile__("lock; cmpxchgw %w1,%2"
7025 - : "=a"(prev)
7026 - : "r"(new), "m"(*__xg(ptr)), "0"(old)
7027 - : "memory");
7028 - return prev;
7029 - case 4:
7030 - __asm__ __volatile__("lock; cmpxchgl %1,%2"
7031 - : "=a"(prev)
7032 - : "r"(new), "m"(*__xg(ptr)), "0"(old)
7033 - : "memory");
7034 - return prev;
7035 - }
7036 - return old;
7037 -}
7038 +/* Set the 'TS' bit */
7039 +#define stts() (HYPERVISOR_fpu_taskswitch(1))
7040
7041 -#ifndef CONFIG_X86_CMPXCHG
7042 -/*
7043 - * Building a kernel capable running on 80386. It may be necessary to
7044 - * simulate the cmpxchg on the 80386 CPU. For that purpose we define
7045 - * a function for each of the sizes we support.
7046 - */
7047 +#endif /* __KERNEL__ */
7048
7049 -extern unsigned long cmpxchg_386_u8(volatile void *, u8, u8);
7050 -extern unsigned long cmpxchg_386_u16(volatile void *, u16, u16);
7051 -extern unsigned long cmpxchg_386_u32(volatile void *, u32, u32);
7052 -
7053 -static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old,
7054 - unsigned long new, int size)
7055 -{
7056 - switch (size) {
7057 - case 1:
7058 - return cmpxchg_386_u8(ptr, old, new);
7059 - case 2:
7060 - return cmpxchg_386_u16(ptr, old, new);
7061 - case 4:
7062 - return cmpxchg_386_u32(ptr, old, new);
7063 - }
7064 - return old;
7065 -}
7066 -
7067 -#define cmpxchg(ptr,o,n) \
7068 -({ \
7069 - __typeof__(*(ptr)) __ret; \
7070 - if (likely(boot_cpu_data.x86 > 3)) \
7071 - __ret = __cmpxchg((ptr), (unsigned long)(o), \
7072 - (unsigned long)(n), sizeof(*(ptr))); \
7073 - else \
7074 - __ret = cmpxchg_386((ptr), (unsigned long)(o), \
7075 - (unsigned long)(n), sizeof(*(ptr))); \
7076 - __ret; \
7077 -})
7078 -#endif
7079 -
7080 -#ifdef CONFIG_X86_CMPXCHG64
7081 -
7082 -static inline unsigned long long __cmpxchg64(volatile void *ptr, unsigned long long old,
7083 - unsigned long long new)
7084 -{
7085 - unsigned long long prev;
7086 - __asm__ __volatile__(LOCK_PREFIX "cmpxchg8b %3"
7087 - : "=A"(prev)
7088 - : "b"((unsigned long)new),
7089 - "c"((unsigned long)(new >> 32)),
7090 - "m"(*__xg(ptr)),
7091 - "0"(old)
7092 - : "memory");
7093 - return prev;
7094 -}
7095 -
7096 -#define cmpxchg64(ptr,o,n)\
7097 - ((__typeof__(*(ptr)))__cmpxchg64((ptr),(unsigned long long)(o),\
7098 - (unsigned long long)(n)))
7099 +static inline unsigned long get_limit(unsigned long segment)
7100 +{
7101 + unsigned long __limit;
7102 + __asm__("lsll %1,%0"
7103 + :"=r" (__limit):"r" (segment));
7104 + return __limit+1;
7105 +}
7106 +
7107 +#define nop() __asm__ __volatile__ ("nop")
7108
7109 -#endif
7110 -
7111 /*
7112 * Force strict CPU ordering.
7113 * And yes, this is required on UP too when we're talking
7114 --- a/include/asm-x86/mach-xen/asm/system_64.h
7115 +++ b/include/asm-x86/mach-xen/asm/system_64.h
7116 @@ -3,7 +3,7 @@
7117
7118 #include <linux/kernel.h>
7119 #include <asm/segment.h>
7120 -#include <asm/alternative.h>
7121 +#include <asm/cmpxchg.h>
7122
7123 #include <asm/synch_bitops.h>
7124 #include <asm/hypervisor.h>
7125 @@ -43,7 +43,7 @@
7126 [threadrsp] "i" (offsetof(struct task_struct, thread.rsp)), \
7127 [ti_flags] "i" (offsetof(struct thread_info, flags)),\
7128 [tif_fork] "i" (TIF_FORK), \
7129 - [thread_info] "i" (offsetof(struct task_struct, thread_info)), \
7130 + [thread_info] "i" (offsetof(struct task_struct, stack)), \
7131 [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \
7132 : "memory", "cc" __EXTRA_CLOBBER)
7133
7134 @@ -92,6 +92,12 @@
7135 machine_to_phys(__dummy); \
7136 })
7137
7138 +static inline void write_cr3(unsigned long val)
7139 +{
7140 + val = phys_to_machine(val);
7141 + asm volatile("movq %0,%%cr3" :: "r" (val) : "memory");
7142 +}
7143 +
7144 static inline unsigned long read_cr4(void)
7145 {
7146 unsigned long cr4;
7147 @@ -101,7 +107,7 @@
7148
7149 static inline void write_cr4(unsigned long val)
7150 {
7151 - asm volatile("movq %0,%%cr4" :: "r" (val));
7152 + asm volatile("movq %0,%%cr4" :: "r" (val) : "memory");
7153 }
7154
7155 #define stts() (HYPERVISOR_fpu_taskswitch(1))
7156 @@ -122,100 +128,6 @@
7157
7158 #define nop() __asm__ __volatile__ ("nop")
7159
7160 -#define xchg(ptr,v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr))))
7161 -
7162 -#define tas(ptr) (xchg((ptr),1))
7163 -
7164 -#define __xg(x) ((volatile long *)(x))
7165 -
7166 -static inline void set_64bit(volatile unsigned long *ptr, unsigned long val)
7167 -{
7168 - *ptr = val;
7169 -}
7170 -
7171 -#define _set_64bit set_64bit
7172 -
7173 -/*
7174 - * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
7175 - * Note 2: xchg has side effect, so that attribute volatile is necessary,
7176 - * but generally the primitive is invalid, *ptr is output argument. --ANK
7177 - */
7178 -static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size)
7179 -{
7180 - switch (size) {
7181 - case 1:
7182 - __asm__ __volatile__("xchgb %b0,%1"
7183 - :"=q" (x)
7184 - :"m" (*__xg(ptr)), "0" (x)
7185 - :"memory");
7186 - break;
7187 - case 2:
7188 - __asm__ __volatile__("xchgw %w0,%1"
7189 - :"=r" (x)
7190 - :"m" (*__xg(ptr)), "0" (x)
7191 - :"memory");
7192 - break;
7193 - case 4:
7194 - __asm__ __volatile__("xchgl %k0,%1"
7195 - :"=r" (x)
7196 - :"m" (*__xg(ptr)), "0" (x)
7197 - :"memory");
7198 - break;
7199 - case 8:
7200 - __asm__ __volatile__("xchgq %0,%1"
7201 - :"=r" (x)
7202 - :"m" (*__xg(ptr)), "0" (x)
7203 - :"memory");
7204 - break;
7205 - }
7206 - return x;
7207 -}
7208 -
7209 -/*
7210 - * Atomic compare and exchange. Compare OLD with MEM, if identical,
7211 - * store NEW in MEM. Return the initial value in MEM. Success is
7212 - * indicated by comparing RETURN with OLD.
7213 - */
7214 -
7215 -#define __HAVE_ARCH_CMPXCHG 1
7216 -
7217 -static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
7218 - unsigned long new, int size)
7219 -{
7220 - unsigned long prev;
7221 - switch (size) {
7222 - case 1:
7223 - __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
7224 - : "=a"(prev)
7225 - : "q"(new), "m"(*__xg(ptr)), "0"(old)
7226 - : "memory");
7227 - return prev;
7228 - case 2:
7229 - __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
7230 - : "=a"(prev)
7231 - : "r"(new), "m"(*__xg(ptr)), "0"(old)
7232 - : "memory");
7233 - return prev;
7234 - case 4:
7235 - __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %k1,%2"
7236 - : "=a"(prev)
7237 - : "r"(new), "m"(*__xg(ptr)), "0"(old)
7238 - : "memory");
7239 - return prev;
7240 - case 8:
7241 - __asm__ __volatile__(LOCK_PREFIX "cmpxchgq %1,%2"
7242 - : "=a"(prev)
7243 - : "r"(new), "m"(*__xg(ptr)), "0"(old)
7244 - : "memory");
7245 - return prev;
7246 - }
7247 - return old;
7248 -}
7249 -
7250 -#define cmpxchg(ptr,o,n)\
7251 - ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
7252 - (unsigned long)(n),sizeof(*(ptr))))
7253 -
7254 #ifdef CONFIG_SMP
7255 #define smp_mb() mb()
7256 #define smp_rmb() rmb()
7257 --- a/include/asm-x86/mach-xen/asm/tlbflush_32.h
7258 +++ b/include/asm-x86/mach-xen/asm/tlbflush_32.h
7259 @@ -29,8 +29,13 @@
7260 * and page-granular flushes are available only on i486 and up.
7261 */
7262
7263 +#define TLB_FLUSH_ALL 0xffffffff
7264 +
7265 +
7266 #ifndef CONFIG_SMP
7267
7268 +#include <linux/sched.h>
7269 +
7270 #define flush_tlb() __flush_tlb()
7271 #define flush_tlb_all() __flush_tlb_all()
7272 #define local_flush_tlb() __flush_tlb()
7273 @@ -55,7 +60,7 @@
7274 __flush_tlb();
7275 }
7276
7277 -#else
7278 +#else /* SMP */
7279
7280 #include <asm/smp.h>
7281
7282 @@ -84,9 +89,7 @@
7283 char __cacheline_padding[L1_CACHE_BYTES-8];
7284 };
7285 DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
7286 -
7287 -
7288 -#endif
7289 +#endif /* SMP */
7290
7291 #define flush_tlb_kernel_range(start, end) flush_tlb_all()
7292
7293 --- a/include/asm-x86/mach-xen/asm/tlbflush_64.h
7294 +++ b/include/asm-x86/mach-xen/asm/tlbflush_64.h
7295 @@ -2,7 +2,9 @@
7296 #define _X8664_TLBFLUSH_H
7297
7298 #include <linux/mm.h>
7299 +#include <linux/sched.h>
7300 #include <asm/processor.h>
7301 +#include <asm/system.h>
7302
7303 #define __flush_tlb() xen_tlb_flush()
7304
7305 --- a/lib/swiotlb-xen.c
7306 +++ b/lib/swiotlb-xen.c
7307 @@ -729,7 +729,6 @@
7308 return (mask >= ((1UL << dma_bits) - 1));
7309 }
7310
7311 -EXPORT_SYMBOL(swiotlb_init);
7312 EXPORT_SYMBOL(swiotlb_map_single);
7313 EXPORT_SYMBOL(swiotlb_unmap_single);
7314 EXPORT_SYMBOL(swiotlb_map_sg);
7315 --- a/net/core/dev.c
7316 +++ b/net/core/dev.c
7317 @@ -1590,12 +1590,17 @@
7318 inline int skb_checksum_setup(struct sk_buff *skb)
7319 {
7320 if (skb->proto_csum_blank) {
7321 + struct iphdr *iph;
7322 + unsigned char *th;
7323 +
7324 if (skb->protocol != htons(ETH_P_IP))
7325 goto out;
7326 - skb->h.raw = (unsigned char *)skb->nh.iph + 4*skb->nh.iph->ihl;
7327 - if (skb->h.raw >= skb->tail)
7328 + iph = ip_hdr(skb);
7329 + th = skb_network_header(skb) + 4 * iph->ihl;
7330 + if (th >= skb_tail_pointer(skb))
7331 goto out;
7332 - switch (skb->nh.iph->protocol) {
7333 + skb->csum_start = th - skb->head;
7334 + switch (iph->protocol) {
7335 case IPPROTO_TCP:
7336 skb->csum_offset = offsetof(struct tcphdr, check);
7337 break;
7338 @@ -1606,10 +1611,10 @@
7339 if (net_ratelimit())
7340 printk(KERN_ERR "Attempting to checksum a non-"
7341 "TCP/UDP packet, dropping a protocol"
7342 - " %d packet", skb->nh.iph->protocol);
7343 + " %d packet", iph->protocol);
7344 goto out;
7345 }
7346 - if ((skb->h.raw + skb->csum_offset + 2) > skb->tail)
7347 + if ((th + skb->csum_offset + 2) > skb_tail_pointer(skb))
7348 goto out;
7349 skb->ip_summed = CHECKSUM_PARTIAL;
7350 skb->proto_csum_blank = 0;
7351 --- a/scripts/Makefile.xen.awk
7352 +++ b/scripts/Makefile.xen.awk
7353 @@ -13,7 +13,7 @@
7354 next
7355 }
7356
7357 -/:[[:space:]]*%\.[cS][[:space:]]/ {
7358 +/:[[:space:]]*\$\(src\)\/%\.[cS][[:space:]]/ {
7359 line = gensub(/%.([cS])/, "%-xen.\\1", "g", $0)
7360 line = gensub(/(single-used-m)/, "xen-\\1", "g", line)
7361 print line