Magellan Linux

Contents of /trunk/kernel26-xen/patches-2.6.25-r1/1023-2.6.25-xen-patch-2.6.22.patch


Revision 613
Sat May 24 01:13:37 2008 UTC by niro
File size: 211855 bytes
-fixed patch

1 From: www.kernel.org
2 Subject: Update to 2.6.22
3 Patch-mainline: 2.6.22
4
5 Automatically created from "patches.kernel.org/patch-2.6.22" by xen-port-patches.py
6
7 Acked-by: jbeulich@novell.com
8
9 ---
10 arch/x86/Kconfig | 5
11 arch/x86/ia32/ia32entry-xen.S | 18 -
12 arch/x86/kernel/Makefile | 2
13 arch/x86/kernel/acpi/sleep_64-xen.c | 26 -
14 arch/x86/kernel/apic_32-xen.c | 1
15 arch/x86/kernel/apic_64-xen.c | 1
16 arch/x86/kernel/cpu/common-xen.c | 224 ++++---------
17 arch/x86/kernel/cpu/mtrr/main-xen.c | 2
18 arch/x86/kernel/e820_32-xen.c | 46 +-
19 arch/x86/kernel/e820_64-xen.c | 28 -
20 arch/x86/kernel/early_printk-xen.c | 27 -
21 arch/x86/kernel/entry_32-xen.S | 30 -
22 arch/x86/kernel/entry_64-xen.S | 7
23 arch/x86/kernel/genapic_64-xen.c | 106 +-----
24 arch/x86/kernel/genapic_xen_64.c | 3
25 arch/x86/kernel/head64-xen.c | 32 +
26 arch/x86/kernel/head_32-xen.S | 101 ------
27 arch/x86/kernel/head_64-xen.S | 37 --
28 arch/x86/kernel/io_apic_32-xen.c | 43 --
29 arch/x86/kernel/io_apic_64-xen.c | 39 --
30 arch/x86/kernel/ioport_32-xen.c | 2
31 arch/x86/kernel/ioport_64-xen.c | 2
32 arch/x86/kernel/irq_32-xen.c | 3
33 arch/x86/kernel/irq_64-xen.c | 34 +-
34 arch/x86/kernel/ldt_32-xen.c | 1
35 arch/x86/kernel/ldt_64-xen.c | 1
36 arch/x86/kernel/microcode-xen.c | 2
37 arch/x86/kernel/mpparse_32-xen.c | 3
38 arch/x86/kernel/mpparse_64-xen.c | 3
39 arch/x86/kernel/pci-dma_32-xen.c | 29 +
40 arch/x86/kernel/pci-swiotlb_64-xen.c | 2
41 arch/x86/kernel/process_32-xen.c | 27 +
42 arch/x86/kernel/process_64-xen.c | 16
43 arch/x86/kernel/quirks-xen.c | 63 ---
44 arch/x86/kernel/setup64-xen.c | 17 -
45 arch/x86/kernel/setup_64-xen.c | 30 -
46 arch/x86/kernel/smp_32-xen.c | 191 ++++-------
47 arch/x86/kernel/smp_64-xen.c | 29 -
48 arch/x86/kernel/time_32-xen.c | 62 +--
49 arch/x86/kernel/traps_32-xen.c | 46 +-
50 arch/x86/kernel/traps_64-xen.c | 55 +--
51 arch/x86/kernel/vsyscall_64-xen.c | 73 +++-
52 arch/x86/mm/fault_32-xen.c | 42 +-
53 arch/x86/mm/fault_64-xen.c | 15
54 arch/x86/mm/highmem_32-xen.c | 14
55 arch/x86/mm/init_32-xen.c | 157 ++++++---
56 arch/x86/mm/init_64-xen.c | 132 ++++---
57 arch/x86/mm/ioremap_32-xen.c | 1
58 arch/x86/mm/pageattr_64-xen.c | 27 +
59 arch/x86/mm/pgtable_32-xen.c | 210 +++++++-----
60 drivers/char/tpm/tpm_xen.c | 2
61 drivers/xen/blkfront/blkfront.c | 2
62 drivers/xen/char/mem.c | 1
63 drivers/xen/core/hypervisor_sysfs.c | 2
64 drivers/xen/core/smpboot.c | 49 +-
65 drivers/xen/core/xen_sysfs.c | 20 -
66 drivers/xen/netback/netback.c | 14
67 drivers/xen/netfront/netfront.c | 2
68 drivers/xen/pciback/xenbus.c | 2
69 drivers/xen/pcifront/xenbus.c | 4
70 drivers/xen/sfc_netback/accel_fwd.c | 7
71 drivers/xen/sfc_netback/accel_solarflare.c | 2
72 drivers/xen/sfc_netfront/accel_tso.c | 28 -
73 drivers/xen/sfc_netfront/accel_vi.c | 4
74 drivers/xen/sfc_netfront/accel_xenbus.c | 4
75 drivers/xen/xenoprof/xenoprofile.c | 2
76 fs/aio.c | 7
77 include/asm-x86/mach-xen/asm/desc_32.h | 119 ++++---
78 include/asm-x86/mach-xen/asm/desc_64.h | 30 -
79 include/asm-x86/mach-xen/asm/dma-mapping_64.h | 2
80 include/asm-x86/mach-xen/asm/fixmap_32.h | 9
81 include/asm-x86/mach-xen/asm/fixmap_64.h | 1
82 include/asm-x86/mach-xen/asm/highmem.h | 6
83 include/asm-x86/mach-xen/asm/io_32.h | 13
84 include/asm-x86/mach-xen/asm/irqflags_32.h | 78 ++--
85 include/asm-x86/mach-xen/asm/irqflags_64.h | 19 -
86 include/asm-x86/mach-xen/asm/mmu.h | 8
87 include/asm-x86/mach-xen/asm/mmu_64.h | 8
88 include/asm-x86/mach-xen/asm/mmu_context_32.h | 29 +
89 include/asm-x86/mach-xen/asm/mmu_context_64.h | 3
90 include/asm-x86/mach-xen/asm/page_64.h | 61 +--
91 include/asm-x86/mach-xen/asm/pgalloc_32.h | 3
92 include/asm-x86/mach-xen/asm/pgalloc_64.h | 15
93 include/asm-x86/mach-xen/asm/pgtable-2level.h | 43 +-
94 include/asm-x86/mach-xen/asm/pgtable-3level-defs.h | 2
95 include/asm-x86/mach-xen/asm/pgtable-3level.h | 61 ++-
96 include/asm-x86/mach-xen/asm/pgtable_32.h | 80 ++--
97 include/asm-x86/mach-xen/asm/pgtable_64.h | 83 ++---
98 include/asm-x86/mach-xen/asm/processor_32.h | 141 +++-----
99 include/asm-x86/mach-xen/asm/processor_64.h | 55 ---
100 include/asm-x86/mach-xen/asm/scatterlist_32.h | 2
101 include/asm-x86/mach-xen/asm/segment_32.h | 10
102 include/asm-x86/mach-xen/asm/smp_32.h | 117 +++++--
103 include/asm-x86/mach-xen/asm/smp_64.h | 20 -
104 include/asm-x86/mach-xen/asm/system_32.h | 348 ++++-----------------
105 include/asm-x86/mach-xen/asm/system_64.h | 106 ------
106 include/asm-x86/mach-xen/asm/tlbflush_32.h | 11
107 include/asm-x86/mach-xen/asm/tlbflush_64.h | 2
108 lib/swiotlb-xen.c | 1
109 net/core/dev.c | 15
110 scripts/Makefile.xen.awk | 2
111 101 files changed, 1642 insertions(+), 2080 deletions(-)
112
113 --- a/arch/x86/Kconfig
114 +++ b/arch/x86/Kconfig
115 @@ -1222,7 +1222,7 @@
116
117 config RELOCATABLE
118 bool "Build a relocatable kernel (EXPERIMENTAL)"
119 - depends on EXPERIMENTAL && !X86_XEN
120 + depends on EXPERIMENTAL && !X86_XEN && !X86_64_XEN
121 help
122 This builds a kernel image that retains relocation information
123 so it can be loaded someplace besides the default 1MB.
124 @@ -1276,7 +1276,6 @@
125 def_bool y
126 prompt "Compat VDSO support"
127 depends on X86_32 || IA32_EMULATION
128 - depends on !X86_XEN
129 help
130 Map the 32-bit VDSO to the predictable old-style address too.
131 ---help---
132 @@ -1453,7 +1452,7 @@
133 bool "PCI support" if !X86_VISWS
134 depends on !X86_VOYAGER
135 default y
136 - select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC)
137 + select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC && !X86_XEN && !X86_64_XEN)
138 help
139 Find out whether you have a PCI motherboard. PCI is the name of a
140 bus system, i.e. the way the CPU talks to the other stuff inside
141 --- a/arch/x86/ia32/ia32entry-xen.S
142 +++ b/arch/x86/ia32/ia32entry-xen.S
143 @@ -431,11 +431,7 @@
144 .quad sys_symlink
145 .quad sys_lstat
146 .quad sys_readlink /* 85 */
147 -#ifdef CONFIG_IA32_AOUT
148 .quad sys_uselib
149 -#else
150 - .quad quiet_ni_syscall
151 -#endif
152 .quad sys_swapon
153 .quad sys_reboot
154 .quad compat_sys_old_readdir
155 @@ -574,7 +570,7 @@
156 .quad quiet_ni_syscall /* tux */
157 .quad quiet_ni_syscall /* security */
158 .quad sys_gettid
159 - .quad sys_readahead /* 225 */
160 + .quad sys32_readahead /* 225 */
161 .quad sys_setxattr
162 .quad sys_lsetxattr
163 .quad sys_fsetxattr
164 @@ -599,7 +595,7 @@
165 .quad compat_sys_io_getevents
166 .quad compat_sys_io_submit
167 .quad sys_io_cancel
168 - .quad sys_fadvise64 /* 250 */
169 + .quad sys32_fadvise64 /* 250 */
170 .quad quiet_ni_syscall /* free_huge_pages */
171 .quad sys_exit_group
172 .quad sys32_lookup_dcookie
173 @@ -663,10 +659,14 @@
174 .quad compat_sys_set_robust_list
175 .quad compat_sys_get_robust_list
176 .quad sys_splice
177 - .quad sys_sync_file_range
178 - .quad sys_tee
179 + .quad sys32_sync_file_range
180 + .quad sys_tee /* 315 */
181 .quad compat_sys_vmsplice
182 .quad compat_sys_move_pages
183 .quad sys_getcpu
184 .quad sys_epoll_pwait
185 -ia32_syscall_end:
186 + .quad compat_sys_utimensat /* 320 */
187 + .quad compat_sys_signalfd
188 + .quad compat_sys_timerfd
189 + .quad sys_eventfd
190 +ia32_syscall_end:
191 --- a/arch/x86/kernel/Makefile
192 +++ b/arch/x86/kernel/Makefile
193 @@ -106,4 +106,4 @@
194
195 disabled-obj-$(CONFIG_XEN) := early-quirks.o hpet.o i8253.o i8259_$(BITS).o reboot.o \
196 smpboot_$(BITS).o tsc_$(BITS).o tsc_sync.o
197 -%/head_$(BITS).o %/head_$(BITS).s: $(if $(CONFIG_XEN),EXTRA_AFLAGS,dummy) :=
198 +%/head_64.o %/head_64.s: $(if $(CONFIG_XEN),EXTRA_AFLAGS,dummy) :=
199 --- a/arch/x86/kernel/acpi/sleep_64-xen.c
200 +++ b/arch/x86/kernel/acpi/sleep_64-xen.c
201 @@ -60,19 +60,6 @@
202 extern char wakeup_start, wakeup_end;
203
204 extern unsigned long acpi_copy_wakeup_routine(unsigned long);
205 -
206 -static pgd_t low_ptr;
207 -
208 -static void init_low_mapping(void)
209 -{
210 - pgd_t *slot0 = pgd_offset(current->mm, 0UL);
211 - low_ptr = *slot0;
212 - /* FIXME: We're playing with the current task's page tables here, which
213 - * is potentially dangerous on SMP systems.
214 - */
215 - set_pgd(slot0, *pgd_offset(current->mm, PAGE_OFFSET));
216 - local_flush_tlb();
217 -}
218 #endif
219
220 /**
221 @@ -84,8 +71,6 @@
222 int acpi_save_state_mem(void)
223 {
224 #ifndef CONFIG_ACPI_PV_SLEEP
225 - init_low_mapping();
226 -
227 memcpy((void *)acpi_wakeup_address, &wakeup_start,
228 &wakeup_end - &wakeup_start);
229 acpi_copy_wakeup_routine(acpi_wakeup_address);
230 @@ -98,10 +83,6 @@
231 */
232 void acpi_restore_state_mem(void)
233 {
234 -#ifndef CONFIG_ACPI_PV_SLEEP
235 - set_pgd(pgd_offset(current->mm, 0UL), low_ptr);
236 - local_flush_tlb();
237 -#endif
238 }
239
240 /**
241 @@ -115,10 +96,11 @@
242 void __init acpi_reserve_bootmem(void)
243 {
244 #ifndef CONFIG_ACPI_PV_SLEEP
245 - acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE);
246 - if ((&wakeup_end - &wakeup_start) > PAGE_SIZE)
247 + acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2);
248 + if ((&wakeup_end - &wakeup_start) > (PAGE_SIZE*2))
249 printk(KERN_CRIT
250 - "ACPI: Wakeup code way too big, will crash on attempt to suspend\n");
251 + "ACPI: Wakeup code way too big, will crash on attempt"
252 + " to suspend\n");
253 #endif
254 }
255
256 --- a/arch/x86/kernel/apic_32-xen.c
257 +++ b/arch/x86/kernel/apic_32-xen.c
258 @@ -19,7 +19,6 @@
259 #include <linux/mm.h>
260 #include <linux/delay.h>
261 #include <linux/bootmem.h>
262 -#include <linux/smp_lock.h>
263 #include <linux/interrupt.h>
264 #include <linux/mc146818rtc.h>
265 #include <linux/kernel_stat.h>
266 --- a/arch/x86/kernel/apic_64-xen.c
267 +++ b/arch/x86/kernel/apic_64-xen.c
268 @@ -19,7 +19,6 @@
269 #include <linux/mm.h>
270 #include <linux/delay.h>
271 #include <linux/bootmem.h>
272 -#include <linux/smp_lock.h>
273 #include <linux/interrupt.h>
274 #include <linux/mc146818rtc.h>
275 #include <linux/kernel_stat.h>
276 --- a/arch/x86/kernel/cpu/common-xen.c
277 +++ b/arch/x86/kernel/cpu/common-xen.c
278 @@ -22,16 +22,40 @@
279 #define phys_pkg_id(a,b) a
280 #endif
281 #endif
282 -#include <asm/pda.h>
283 #include <asm/hypervisor.h>
284
285 #include "cpu.h"
286
287 -DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
288 -EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr);
289 +DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
290 + [GDT_ENTRY_KERNEL_CS] = { 0x0000ffff, 0x00cf9a00 },
291 + [GDT_ENTRY_KERNEL_DS] = { 0x0000ffff, 0x00cf9200 },
292 + [GDT_ENTRY_DEFAULT_USER_CS] = { 0x0000ffff, 0x00cffa00 },
293 + [GDT_ENTRY_DEFAULT_USER_DS] = { 0x0000ffff, 0x00cff200 },
294 +#ifndef CONFIG_XEN
295 + /*
296 + * Segments used for calling PnP BIOS have byte granularity.
297 + * They code segments and data segments have fixed 64k limits,
298 + * the transfer segment sizes are set at run time.
299 + */
300 + [GDT_ENTRY_PNPBIOS_CS32] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
301 + [GDT_ENTRY_PNPBIOS_CS16] = { 0x0000ffff, 0x00009a00 },/* 16-bit code */
302 + [GDT_ENTRY_PNPBIOS_DS] = { 0x0000ffff, 0x00009200 }, /* 16-bit data */
303 + [GDT_ENTRY_PNPBIOS_TS1] = { 0x00000000, 0x00009200 },/* 16-bit data */
304 + [GDT_ENTRY_PNPBIOS_TS2] = { 0x00000000, 0x00009200 },/* 16-bit data */
305 + /*
306 + * The APM segments have byte granularity and their bases
307 + * are set at run time. All have 64k limits.
308 + */
309 + [GDT_ENTRY_APMBIOS_BASE] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
310 + /* 16-bit code */
311 + [GDT_ENTRY_APMBIOS_BASE+1] = { 0x0000ffff, 0x00009a00 },
312 + [GDT_ENTRY_APMBIOS_BASE+2] = { 0x0000ffff, 0x00409200 }, /* data */
313
314 -struct i386_pda *_cpu_pda[NR_CPUS] __read_mostly;
315 -EXPORT_SYMBOL(_cpu_pda);
316 + [GDT_ENTRY_ESPFIX_SS] = { 0x00000000, 0x00c09200 },
317 +#endif
318 + [GDT_ENTRY_PERCPU] = { 0x00000000, 0x00000000 },
319 +} };
320 +EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
321
322 static int cachesize_override __cpuinitdata = -1;
323 static int disable_x86_fxsr __cpuinitdata;
324 @@ -373,7 +397,7 @@
325 /*
326 * This does the hard work of actually picking apart the CPU stuff...
327 */
328 -void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
329 +static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
330 {
331 int i;
332
333 @@ -484,15 +508,22 @@
334
335 /* Init Machine Check Exception if available. */
336 mcheck_init(c);
337 +}
338
339 - if (c == &boot_cpu_data)
340 - sysenter_setup();
341 +void __init identify_boot_cpu(void)
342 +{
343 + identify_cpu(&boot_cpu_data);
344 + sysenter_setup();
345 enable_sep_cpu();
346 + mtrr_bp_init();
347 +}
348
349 - if (c == &boot_cpu_data)
350 - mtrr_bp_init();
351 - else
352 - mtrr_ap_init();
353 +void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
354 +{
355 + BUG_ON(c == &boot_cpu_data);
356 + identify_cpu(c);
357 + enable_sep_cpu();
358 + mtrr_ap_init();
359 }
360
361 #ifdef CONFIG_X86_HT
362 @@ -606,136 +637,47 @@
363 #endif
364 }
365
366 -/* Make sure %gs is initialized properly in idle threads */
367 +/* Make sure %fs is initialized properly in idle threads */
368 struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
369 {
370 memset(regs, 0, sizeof(struct pt_regs));
371 - regs->xfs = __KERNEL_PDA;
372 + regs->xfs = __KERNEL_PERCPU;
373 return regs;
374 }
375
376 -static __cpuinit int alloc_gdt(int cpu)
377 +/* Current gdt points %fs at the "master" per-cpu area: after this,
378 + * it's on the real one. */
379 +void switch_to_new_gdt(void)
380 {
381 - struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
382 - struct desc_struct *gdt;
383 - struct i386_pda *pda;
384 -
385 - gdt = (struct desc_struct *)cpu_gdt_descr->address;
386 - pda = cpu_pda(cpu);
387 -
388 - /*
389 - * This is a horrible hack to allocate the GDT. The problem
390 - * is that cpu_init() is called really early for the boot CPU
391 - * (and hence needs bootmem) but much later for the secondary
392 - * CPUs, when bootmem will have gone away
393 - */
394 - if (NODE_DATA(0)->bdata->node_bootmem_map) {
395 - BUG_ON(gdt != NULL || pda != NULL);
396 -
397 - gdt = alloc_bootmem_pages(PAGE_SIZE);
398 - pda = alloc_bootmem(sizeof(*pda));
399 - /* alloc_bootmem(_pages) panics on failure, so no check */
400 -
401 - memset(gdt, 0, PAGE_SIZE);
402 - memset(pda, 0, sizeof(*pda));
403 - } else {
404 - /* GDT and PDA might already have been allocated if
405 - this is a CPU hotplug re-insertion. */
406 - if (gdt == NULL)
407 - gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL);
408 -
409 - if (pda == NULL)
410 - pda = kmalloc_node(sizeof(*pda), GFP_KERNEL, cpu_to_node(cpu));
411 -
412 - if (unlikely(!gdt || !pda)) {
413 - free_pages((unsigned long)gdt, 0);
414 - kfree(pda);
415 - return 0;
416 - }
417 - }
418 -
419 - cpu_gdt_descr->address = (unsigned long)gdt;
420 - cpu_pda(cpu) = pda;
421 -
422 - return 1;
423 -}
424 -
425 -/* Initial PDA used by boot CPU */
426 -struct i386_pda boot_pda = {
427 - ._pda = &boot_pda,
428 - .cpu_number = 0,
429 - .pcurrent = &init_task,
430 -};
431 -
432 -static inline void set_kernel_fs(void)
433 -{
434 - /* Set %fs for this CPU's PDA. Memory clobber is to create a
435 - barrier with respect to any PDA operations, so the compiler
436 - doesn't move any before here. */
437 - asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_PDA) : "memory");
438 -}
439 -
440 -/* Initialize the CPU's GDT and PDA. The boot CPU does this for
441 - itself, but secondaries find this done for them. */
442 -__cpuinit int init_gdt(int cpu, struct task_struct *idle)
443 -{
444 - struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
445 - struct desc_struct *gdt;
446 - struct i386_pda *pda;
447 -
448 - /* For non-boot CPUs, the GDT and PDA should already have been
449 - allocated. */
450 - if (!alloc_gdt(cpu)) {
451 - printk(KERN_CRIT "CPU%d failed to allocate GDT or PDA\n", cpu);
452 - return 0;
453 - }
454 -
455 - gdt = (struct desc_struct *)cpu_gdt_descr->address;
456 - pda = cpu_pda(cpu);
457 -
458 - BUG_ON(gdt == NULL || pda == NULL);
459 -
460 - /*
461 - * Initialize the per-CPU GDT with the boot GDT,
462 - * and set up the GDT descriptor:
463 - */
464 - memcpy(gdt, cpu_gdt_table, GDT_SIZE);
465 - cpu_gdt_descr->size = GDT_SIZE - 1;
466 -
467 - pack_descriptor((u32 *)&gdt[GDT_ENTRY_PDA].a,
468 - (u32 *)&gdt[GDT_ENTRY_PDA].b,
469 - (unsigned long)pda, sizeof(*pda) - 1,
470 - 0x80 | DESCTYPE_S | 0x2, 0); /* present read-write data segment */
471 -
472 - memset(pda, 0, sizeof(*pda));
473 - pda->_pda = pda;
474 - pda->cpu_number = cpu;
475 - pda->pcurrent = idle;
476 -
477 - return 1;
478 -}
479 -
480 -void __cpuinit cpu_set_gdt(int cpu)
481 -{
482 - struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
483 + struct Xgt_desc_struct gdt_descr;
484 unsigned long va, frames[16];
485 int f;
486
487 - for (va = cpu_gdt_descr->address, f = 0;
488 - va < cpu_gdt_descr->address + cpu_gdt_descr->size;
489 + gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
490 + gdt_descr.size = GDT_SIZE - 1;
491 +
492 + for (va = gdt_descr.address, f = 0;
493 + va < gdt_descr.address + gdt_descr.size;
494 va += PAGE_SIZE, f++) {
495 frames[f] = virt_to_mfn(va);
496 make_lowmem_page_readonly(
497 (void *)va, XENFEAT_writable_descriptor_tables);
498 }
499 - BUG_ON(HYPERVISOR_set_gdt(frames, (cpu_gdt_descr->size + 1) / 8));
500 -
501 - set_kernel_fs();
502 + if (HYPERVISOR_set_gdt(frames, (gdt_descr.size + 1) / 8))
503 + BUG();
504 + asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
505 }
506
507 -/* Common CPU init for both boot and secondary CPUs */
508 -static void __cpuinit _cpu_init(int cpu, struct task_struct *curr)
509 +/*
510 + * cpu_init() initializes state that is per-CPU. Some data is already
511 + * initialized (naturally) in the bootstrap process, such as the GDT
512 + * and IDT. We reload them nevertheless, this function acts as a
513 + * 'CPU state barrier', nothing should get across.
514 + */
515 +void __cpuinit cpu_init(void)
516 {
517 + int cpu = smp_processor_id();
518 + struct task_struct *curr = current;
519 #ifndef CONFIG_X86_NO_TSS
520 struct tss_struct * t = &per_cpu(init_tss, cpu);
521 #endif
522 @@ -757,6 +699,8 @@
523 set_in_cr4(X86_CR4_TSD);
524 }
525
526 + switch_to_new_gdt();
527 +
528 /*
529 * Set up and load the per-CPU TSS and LDT
530 */
531 @@ -794,38 +738,6 @@
532 mxcsr_feature_mask_init();
533 }
534
535 -/* Entrypoint to initialize secondary CPU */
536 -void __cpuinit secondary_cpu_init(void)
537 -{
538 - int cpu = smp_processor_id();
539 - struct task_struct *curr = current;
540 -
541 - _cpu_init(cpu, curr);
542 -}
543 -
544 -/*
545 - * cpu_init() initializes state that is per-CPU. Some data is already
546 - * initialized (naturally) in the bootstrap process, such as the GDT
547 - * and IDT. We reload them nevertheless, this function acts as a
548 - * 'CPU state barrier', nothing should get across.
549 - */
550 -void __cpuinit cpu_init(void)
551 -{
552 - int cpu = smp_processor_id();
553 - struct task_struct *curr = current;
554 -
555 - /* Set up the real GDT and PDA, so we can transition from the
556 - boot versions. */
557 - if (!init_gdt(cpu, curr)) {
558 - /* failed to allocate something; not much we can do... */
559 - for (;;)
560 - local_irq_enable();
561 - }
562 -
563 - cpu_set_gdt(cpu);
564 - _cpu_init(cpu, curr);
565 -}
566 -
567 #ifdef CONFIG_HOTPLUG_CPU
568 void __cpuinit cpu_uninit(void)
569 {
570 --- a/arch/x86/kernel/cpu/mtrr/main-xen.c
571 +++ b/arch/x86/kernel/cpu/mtrr/main-xen.c
572 @@ -167,7 +167,7 @@
573 EXPORT_SYMBOL(mtrr_add);
574 EXPORT_SYMBOL(mtrr_del);
575
576 -void __init mtrr_bp_init(void)
577 +__init void mtrr_bp_init(void)
578 {
579 }
580
581 --- a/arch/x86/kernel/e820_32-xen.c
582 +++ b/arch/x86/kernel/e820_32-xen.c
583 @@ -162,26 +162,27 @@
584
585 static int __init romsignature(const unsigned char *rom)
586 {
587 + const unsigned short * const ptr = (const unsigned short *)rom;
588 unsigned short sig;
589
590 - return probe_kernel_address((const unsigned short *)rom, sig) == 0 &&
591 - sig == ROMSIGNATURE;
592 + return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE;
593 }
594
595 -static int __init romchecksum(unsigned char *rom, unsigned long length)
596 +static int __init romchecksum(const unsigned char *rom, unsigned long length)
597 {
598 - unsigned char sum;
599 + unsigned char sum, c;
600
601 - for (sum = 0; length; length--)
602 - sum += *rom++;
603 - return sum == 0;
604 + for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--)
605 + sum += c;
606 + return !length && !sum;
607 }
608
609 static void __init probe_roms(void)
610 {
611 + const unsigned char *rom;
612 unsigned long start, length, upper;
613 - unsigned char *rom;
614 - int i;
615 + unsigned char c;
616 + int i;
617
618 #ifdef CONFIG_XEN
619 /* Nothing to do if not running in dom0. */
620 @@ -198,8 +199,11 @@
621
622 video_rom_resource.start = start;
623
624 + if (probe_kernel_address(rom + 2, c) != 0)
625 + continue;
626 +
627 /* 0 < length <= 0x7f * 512, historically */
628 - length = rom[2] * 512;
629 + length = c * 512;
630
631 /* if checksum okay, trust length byte */
632 if (length && romchecksum(rom, length))
633 @@ -233,8 +237,11 @@
634 if (!romsignature(rom))
635 continue;
636
637 + if (probe_kernel_address(rom + 2, c) != 0)
638 + continue;
639 +
640 /* 0 < length <= 0x7f * 512, historically */
641 - length = rom[2] * 512;
642 + length = c * 512;
643
644 /* but accept any length that fits if checksum okay */
645 if (!length || start + length > upper || !romchecksum(rom, length))
646 @@ -249,7 +256,7 @@
647 }
648
649 #ifdef CONFIG_XEN
650 -static struct e820map machine_e820 __initdata;
651 +static struct e820map machine_e820;
652 #define e820 machine_e820
653 #endif
654
655 @@ -409,10 +416,8 @@
656 ____________________33__
657 ______________________4_
658 */
659 - printk("sanitize start\n");
660 /* if there's only one memory region, don't bother */
661 if (*pnr_map < 2) {
662 - printk("sanitize bail 0\n");
663 return -1;
664 }
665
666 @@ -421,7 +426,6 @@
667 /* bail out if we find any unreasonable addresses in bios map */
668 for (i=0; i<old_nr; i++)
669 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) {
670 - printk("sanitize bail 1\n");
671 return -1;
672 }
673
674 @@ -517,7 +521,6 @@
675 memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
676 *pnr_map = new_nr;
677
678 - printk("sanitize end\n");
679 return 0;
680 }
681
682 @@ -552,7 +555,6 @@
683 unsigned long long size = biosmap->size;
684 unsigned long long end = start + size;
685 unsigned long type = biosmap->type;
686 - printk("copy_e820_map() start: %016Lx size: %016Lx end: %016Lx type: %ld\n", start, size, end, type);
687
688 /* Overflow in 64 bits? Ignore the memory map. */
689 if (start > end)
690 @@ -564,17 +566,11 @@
691 * Not right. Fix it up.
692 */
693 if (type == E820_RAM) {
694 - printk("copy_e820_map() type is E820_RAM\n");
695 if (start < 0x100000ULL && end > 0xA0000ULL) {
696 - printk("copy_e820_map() lies in range...\n");
697 - if (start < 0xA0000ULL) {
698 - printk("copy_e820_map() start < 0xA0000ULL\n");
699 + if (start < 0xA0000ULL)
700 add_memory_region(start, 0xA0000ULL-start, type);
701 - }
702 - if (end <= 0x100000ULL) {
703 - printk("copy_e820_map() end <= 0x100000ULL\n");
704 + if (end <= 0x100000ULL)
705 continue;
706 - }
707 start = 0x100000ULL;
708 size = end - start;
709 }
710 --- a/arch/x86/kernel/e820_64-xen.c
711 +++ b/arch/x86/kernel/e820_64-xen.c
712 @@ -17,6 +17,8 @@
713 #include <linux/kexec.h>
714 #include <linux/module.h>
715 #include <linux/mm.h>
716 +#include <linux/suspend.h>
717 +#include <linux/pfn.h>
718
719 #include <asm/pgtable.h>
720 #include <asm/page.h>
721 @@ -28,7 +30,7 @@
722
723 struct e820map e820 __initdata;
724 #ifdef CONFIG_XEN
725 -struct e820map machine_e820 __initdata;
726 +struct e820map machine_e820;
727 #endif
728
729 /*
730 @@ -293,22 +295,6 @@
731 }
732
733 #ifndef CONFIG_XEN
734 -/* Mark pages corresponding to given address range as nosave */
735 -static void __init
736 -e820_mark_nosave_range(unsigned long start, unsigned long end)
737 -{
738 - unsigned long pfn, max_pfn;
739 -
740 - if (start >= end)
741 - return;
742 -
743 - printk("Nosave address range: %016lx - %016lx\n", start, end);
744 - max_pfn = end >> PAGE_SHIFT;
745 - for (pfn = start >> PAGE_SHIFT; pfn < max_pfn; pfn++)
746 - if (pfn_valid(pfn))
747 - SetPageNosave(pfn_to_page(pfn));
748 -}
749 -
750 /*
751 * Find the ranges of physical addresses that do not correspond to
752 * e820 RAM areas and mark the corresponding pages as nosave for software
753 @@ -327,13 +313,13 @@
754 struct e820entry *ei = &e820.map[i];
755
756 if (paddr < ei->addr)
757 - e820_mark_nosave_range(paddr,
758 - round_up(ei->addr, PAGE_SIZE));
759 + register_nosave_region(PFN_DOWN(paddr),
760 + PFN_UP(ei->addr));
761
762 paddr = round_down(ei->addr + ei->size, PAGE_SIZE);
763 if (ei->type != E820_RAM)
764 - e820_mark_nosave_range(round_up(ei->addr, PAGE_SIZE),
765 - paddr);
766 + register_nosave_region(PFN_UP(ei->addr),
767 + PFN_DOWN(paddr));
768
769 if (paddr >= (end_pfn << PAGE_SHIFT))
770 break;
771 --- a/arch/x86/kernel/early_printk-xen.c
772 +++ b/arch/x86/kernel/early_printk-xen.c
773 @@ -11,11 +11,10 @@
774
775 #ifdef __i386__
776 #include <asm/setup.h>
777 -#define VGABASE (__ISA_IO_base + 0xb8000)
778 #else
779 #include <asm/bootsetup.h>
780 -#define VGABASE ((void __iomem *)0xffffffff800b8000UL)
781 #endif
782 +#define VGABASE (__ISA_IO_base + 0xb8000)
783
784 #ifndef CONFIG_XEN
785 static int max_ypos = 25, max_xpos = 80;
786 @@ -93,9 +92,9 @@
787 static void early_serial_write(struct console *con, const char *s, unsigned n)
788 {
789 while (*s && n-- > 0) {
790 - early_serial_putc(*s);
791 if (*s == '\n')
792 early_serial_putc('\r');
793 + early_serial_putc(*s);
794 s++;
795 }
796 }
797 @@ -205,7 +204,7 @@
798 return ret;
799 }
800
801 -void __init simnow_init(char *str)
802 +static void __init simnow_init(char *str)
803 {
804 char *fn = "klog";
805 if (*str == '=')
806 @@ -277,22 +276,12 @@
807 early_console = &simnow_console;
808 keep_early = 1;
809 }
810 +
811 + if (keep_early)
812 + early_console->flags &= ~CON_BOOT;
813 + else
814 + early_console->flags |= CON_BOOT;
815 register_console(early_console);
816 return 0;
817 }
818 -
819 early_param("earlyprintk", setup_early_printk);
820 -
821 -void __init disable_early_printk(void)
822 -{
823 - if (!early_console_initialized || !early_console)
824 - return;
825 - if (!keep_early) {
826 - printk("disabling early console\n");
827 - unregister_console(early_console);
828 - early_console_initialized = 0;
829 - } else {
830 - printk("keeping early console\n");
831 - }
832 -}
833 -
834 --- a/arch/x86/kernel/entry_32-xen.S
835 +++ b/arch/x86/kernel/entry_32-xen.S
836 @@ -15,7 +15,7 @@
837 * I changed all the .align's to 4 (16 byte alignment), as that's faster
838 * on a 486.
839 *
840 - * Stack layout in 'ret_from_system_call':
841 + * Stack layout in 'syscall_exit':
842 * ptrace needs to have all regs on the stack.
843 * if the order here is changed, it needs to be
844 * updated in fork.c:copy_process, signal.c:do_signal,
845 @@ -135,7 +135,7 @@
846 movl $(__USER_DS), %edx; \
847 movl %edx, %ds; \
848 movl %edx, %es; \
849 - movl $(__KERNEL_PDA), %edx; \
850 + movl $(__KERNEL_PERCPU), %edx; \
851 movl %edx, %fs
852
853 #define RESTORE_INT_REGS \
854 @@ -308,16 +308,12 @@
855 pushl $(__USER_CS)
856 CFI_ADJUST_CFA_OFFSET 4
857 /*CFI_REL_OFFSET cs, 0*/
858 -#ifndef CONFIG_COMPAT_VDSO
859 /*
860 * Push current_thread_info()->sysenter_return to the stack.
861 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
862 * pushed above; +8 corresponds to copy_thread's esp0 setting.
863 */
864 pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
865 -#else
866 - pushl $SYSENTER_RETURN
867 -#endif
868 CFI_ADJUST_CFA_OFFSET 4
869 CFI_REL_OFFSET eip, 0
870
871 @@ -345,7 +341,7 @@
872 jae syscall_badsys
873 call *sys_call_table(,%eax,4)
874 movl %eax,PT_EAX(%esp)
875 - DISABLE_INTERRUPTS(CLBR_ECX|CLBR_EDX)
876 + DISABLE_INTERRUPTS(CLBR_ANY)
877 TRACE_IRQS_OFF
878 movl TI_flags(%ebp), %ecx
879 testw $_TIF_ALLWORK_MASK, %cx
880 @@ -400,10 +396,6 @@
881 CFI_ADJUST_CFA_OFFSET 4
882 SAVE_ALL
883 GET_THREAD_INFO(%ebp)
884 - testl $TF_MASK,PT_EFLAGS(%esp)
885 - jz no_singlestep
886 - orl $_TIF_SINGLESTEP,TI_flags(%ebp)
887 -no_singlestep:
888 # system call tracing in operation / emulation
889 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
890 testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
891 @@ -418,6 +410,10 @@
892 # setting need_resched or sigpending
893 # between sampling and the iret
894 TRACE_IRQS_OFF
895 + testl $TF_MASK,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit
896 + jz no_singlestep
897 + orl $_TIF_SINGLESTEP,TI_flags(%ebp)
898 +no_singlestep:
899 movl TI_flags(%ebp), %ecx
900 testw $_TIF_ALLWORK_MASK, %cx # current->work
901 jne syscall_exit_work
902 @@ -635,9 +631,7 @@
903 #ifndef CONFIG_XEN
904 #define FIXUP_ESPFIX_STACK \
905 /* since we are on a wrong stack, we cant make it a C code :( */ \
906 - movl %fs:PDA_cpu, %ebx; \
907 - PER_CPU(cpu_gdt_descr, %ebx); \
908 - movl GDS_address(%ebx), %ebx; \
909 + PER_CPU(gdt_page, %ebx); \
910 GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \
911 addl %esp, %eax; \
912 pushl $__KERNEL_DS; \
913 @@ -710,7 +704,7 @@
914 SAVE_ALL; \
915 TRACE_IRQS_OFF \
916 movl %esp,%eax; \
917 - call smp_/**/name; \
918 + call smp_##name; \
919 jmp ret_from_intr; \
920 CFI_ENDPROC; \
921 ENDPROC(name)
922 @@ -718,10 +712,6 @@
923 /* The include is where all of the SMP etc. interrupts come from */
924 #include "entry_arch.h"
925
926 -/* This alternate entry is needed because we hijack the apic LVTT */
927 -#if defined(CONFIG_VMI) && defined(CONFIG_X86_LOCAL_APIC)
928 -BUILD_INTERRUPT(apic_vmi_timer_interrupt,LOCAL_TIMER_VECTOR)
929 -#endif
930 #else
931 #define UNWIND_ESPFIX_STACK
932 #endif
933 @@ -764,7 +754,7 @@
934 pushl %fs
935 CFI_ADJUST_CFA_OFFSET 4
936 /*CFI_REL_OFFSET fs, 0*/
937 - movl $(__KERNEL_PDA), %ecx
938 + movl $(__KERNEL_PERCPU), %ecx
939 movl %ecx, %fs
940 UNWIND_ESPFIX_STACK
941 popl %ecx
942 --- a/arch/x86/kernel/entry_64-xen.S
943 +++ b/arch/x86/kernel/entry_64-xen.S
944 @@ -1254,3 +1254,10 @@
945 ret
946 CFI_ENDPROC
947 ENDPROC(call_softirq)
948 +
949 +KPROBE_ENTRY(ignore_sysret)
950 + CFI_STARTPROC
951 + mov $-ENOSYS,%eax
952 + HYPERVISOR_IRET 0
953 + CFI_ENDPROC
954 +ENDPROC(ignore_sysret)
955 --- a/arch/x86/kernel/genapic_64-xen.c
956 +++ b/arch/x86/kernel/genapic_64-xen.c
957 @@ -11,123 +11,57 @@
958 #include <linux/threads.h>
959 #include <linux/cpumask.h>
960 #include <linux/string.h>
961 +#include <linux/module.h>
962 #include <linux/kernel.h>
963 #include <linux/ctype.h>
964 #include <linux/init.h>
965 -#include <linux/module.h>
966
967 #include <asm/smp.h>
968 #include <asm/ipi.h>
969 +#include <asm/genapic.h>
970
971 -#if defined(CONFIG_ACPI)
972 +#ifdef CONFIG_ACPI
973 #include <acpi/acpi_bus.h>
974 #endif
975
976 /* which logical CPU number maps to which CPU (physical APIC ID) */
977 -u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
978 +u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly
979 + = { [0 ... NR_CPUS-1] = BAD_APICID };
980 EXPORT_SYMBOL(x86_cpu_to_apicid);
981 -u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
982
983 -extern struct genapic apic_cluster;
984 -extern struct genapic apic_flat;
985 -extern struct genapic apic_physflat;
986 +u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
987
988 #ifndef CONFIG_XEN
989 -struct genapic *genapic = &apic_flat;
990 -struct genapic *genapic_force;
991 +struct genapic __read_mostly *genapic = &apic_flat;
992 #else
993 extern struct genapic apic_xen;
994 -struct genapic *genapic = &apic_xen;
995 +struct genapic __read_mostly *genapic = &apic_xen;
996 #endif
997
998
999 /*
1000 * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
1001 */
1002 -void __init clustered_apic_check(void)
1003 +void __init setup_apic_routing(void)
1004 {
1005 #ifndef CONFIG_XEN
1006 - long i;
1007 - u8 clusters, max_cluster;
1008 - u8 id;
1009 - u8 cluster_cnt[NUM_APIC_CLUSTERS];
1010 - int max_apic = 0;
1011 -
1012 - /* genapic selection can be forced because of certain quirks.
1013 - */
1014 - if (genapic_force) {
1015 - genapic = genapic_force;
1016 - goto print;
1017 - }
1018 -
1019 -#if defined(CONFIG_ACPI)
1020 +#ifdef CONFIG_ACPI
1021 /*
1022 - * Some x86_64 machines use physical APIC mode regardless of how many
1023 - * procs/clusters are present (x86_64 ES7000 is an example).
1024 + * Quirk: some x86_64 machines can only use physical APIC mode
1025 + * regardless of how many processors are present (x86_64 ES7000
1026 + * is an example).
1027 */
1028 - if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID)
1029 - if (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL) {
1030 - genapic = &apic_cluster;
1031 - goto print;
1032 - }
1033 -#endif
1034 -
1035 - memset(cluster_cnt, 0, sizeof(cluster_cnt));
1036 - for (i = 0; i < NR_CPUS; i++) {
1037 - id = bios_cpu_apicid[i];
1038 - if (id == BAD_APICID)
1039 - continue;
1040 - if (id > max_apic)
1041 - max_apic = id;
1042 - cluster_cnt[APIC_CLUSTERID(id)]++;
1043 - }
1044 -
1045 - /* Don't use clustered mode on AMD platforms. */
1046 - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
1047 + if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
1048 + (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL))
1049 genapic = &apic_physflat;
1050 -#ifndef CONFIG_HOTPLUG_CPU
1051 - /* In the CPU hotplug case we cannot use broadcast mode
1052 - because that opens a race when a CPU is removed.
1053 - Stay at physflat mode in this case.
1054 - It is bad to do this unconditionally though. Once
1055 - we have ACPI platform support for CPU hotplug
1056 - we should detect hotplug capablity from ACPI tables and
1057 - only do this when really needed. -AK */
1058 - if (max_apic <= 8)
1059 - genapic = &apic_flat;
1060 + else
1061 #endif
1062 - goto print;
1063 - }
1064
1065 - clusters = 0;
1066 - max_cluster = 0;
1067 -
1068 - for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
1069 - if (cluster_cnt[i] > 0) {
1070 - ++clusters;
1071 - if (cluster_cnt[i] > max_cluster)
1072 - max_cluster = cluster_cnt[i];
1073 - }
1074 - }
1075 -
1076 - /*
1077 - * If we have clusters <= 1 and CPUs <= 8 in cluster 0, then flat mode,
1078 - * else if max_cluster <= 4 and cluster_cnt[15] == 0, clustered logical
1079 - * else physical mode.
1080 - * (We don't use lowest priority delivery + HW APIC IRQ steering, so
1081 - * can ignore the clustered logical case and go straight to physical.)
1082 - */
1083 - if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster) {
1084 -#ifdef CONFIG_HOTPLUG_CPU
1085 - /* Don't use APIC shortcuts in CPU hotplug to avoid races */
1086 - genapic = &apic_physflat;
1087 -#else
1088 + if (cpus_weight(cpu_possible_map) <= 8)
1089 genapic = &apic_flat;
1090 -#endif
1091 - } else
1092 - genapic = &apic_cluster;
1093 + else
1094 + genapic = &apic_physflat;
1095
1096 -print:
1097 #else
1098 /* hardcode to xen apic functions */
1099 genapic = &apic_xen;
1100 @@ -135,7 +69,7 @@
1101 printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
1102 }
1103
1104 -/* Same for both flat and clustered. */
1105 +/* Same for both flat and physical. */
1106
1107 #ifdef CONFIG_XEN
1108 extern void xen_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest);
1109 --- a/arch/x86/kernel/genapic_xen_64.c
1110 +++ b/arch/x86/kernel/genapic_xen_64.c
1111 @@ -21,9 +21,8 @@
1112 #include <asm/ipi.h>
1113 #else
1114 #include <asm/apic.h>
1115 -#include <asm/apicdef.h>
1116 -#include <asm/genapic.h>
1117 #endif
1118 +#include <asm/genapic.h>
1119 #include <xen/evtchn.h>
1120
1121 DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]);
1122 --- a/arch/x86/kernel/head64-xen.c
1123 +++ b/arch/x86/kernel/head64-xen.c
1124 @@ -22,13 +22,21 @@
1125 #include <asm/setup.h>
1126 #include <asm/desc.h>
1127 #include <asm/pgtable.h>
1128 +#include <asm/tlbflush.h>
1129 #include <asm/sections.h>
1130
1131 unsigned long start_pfn;
1132
1133 +#ifndef CONFIG_XEN
1134 +static void __init zap_identity_mappings(void)
1135 +{
1136 + pgd_t *pgd = pgd_offset_k(0UL);
1137 + pgd_clear(pgd);
1138 + __flush_tlb();
1139 +}
1140 +
1141 /* Don't add a printk in there. printk relies on the PDA which is not initialized
1142 yet. */
1143 -#if 0
1144 static void __init clear_bss(void)
1145 {
1146 memset(__bss_start, 0,
1147 @@ -37,26 +45,25 @@
1148 #endif
1149
1150 #define NEW_CL_POINTER 0x228 /* Relative to real mode data */
1151 -#define OLD_CL_MAGIC_ADDR 0x90020
1152 +#define OLD_CL_MAGIC_ADDR 0x20
1153 #define OLD_CL_MAGIC 0xA33F
1154 -#define OLD_CL_BASE_ADDR 0x90000
1155 -#define OLD_CL_OFFSET 0x90022
1156 +#define OLD_CL_OFFSET 0x22
1157
1158 static void __init copy_bootdata(char *real_mode_data)
1159 {
1160 #ifndef CONFIG_XEN
1161 - int new_data;
1162 + unsigned long new_data;
1163 char * command_line;
1164
1165 memcpy(x86_boot_params, real_mode_data, BOOT_PARAM_SIZE);
1166 - new_data = *(int *) (x86_boot_params + NEW_CL_POINTER);
1167 + new_data = *(u32 *) (x86_boot_params + NEW_CL_POINTER);
1168 if (!new_data) {
1169 - if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) {
1170 + if (OLD_CL_MAGIC != *(u16 *)(real_mode_data + OLD_CL_MAGIC_ADDR)) {
1171 return;
1172 }
1173 - new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET;
1174 + new_data = __pa(real_mode_data) + *(u16 *)(real_mode_data + OLD_CL_OFFSET);
1175 }
1176 - command_line = (char *) ((u64)(new_data));
1177 + command_line = __va(new_data);
1178 memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE);
1179 #else
1180 int max_cmdline;
1181 @@ -98,10 +105,13 @@
1182 while ((1UL << machine_to_phys_order) < machine_to_phys_nr_ents )
1183 machine_to_phys_order++;
1184
1185 -#if 0
1186 +#ifndef CONFIG_XEN
1187 /* clear bss before set_intr_gate with early_idt_handler */
1188 clear_bss();
1189
1190 + /* Make NULL pointers segfault */
1191 + zap_identity_mappings();
1192 +
1193 for (i = 0; i < IDT_ENTRIES; i++)
1194 set_intr_gate(i, early_idt_handler);
1195 asm volatile("lidt %0" :: "m" (idt_descr));
1196 @@ -113,7 +123,7 @@
1197 cpu_pda(i) = &boot_cpu_pda[i];
1198
1199 pda_init(0);
1200 - copy_bootdata(real_mode_data);
1201 + copy_bootdata(__va(real_mode_data));
1202 #ifdef CONFIG_SMP
1203 cpu_set(0, cpu_online_map);
1204 #endif
1205 --- a/arch/x86/kernel/head_32-xen.S
1206 +++ b/arch/x86/kernel/head_32-xen.S
1207 @@ -37,7 +37,8 @@
1208 /* Set up the stack pointer */
1209 movl $(init_thread_union+THREAD_SIZE),%esp
1210
1211 - call setup_pda
1212 + movl %ss,%eax
1213 + movl %eax,%fs # gets reset once there's real percpu
1214
1215 /* get vendor info */
1216 xorl %eax,%eax # call CPUID with 0 -> return vendor ID
1217 @@ -64,55 +65,11 @@
1218 xorl %eax,%eax # Clear GS
1219 movl %eax,%gs
1220
1221 - movl $(__KERNEL_PDA),%eax
1222 - mov %eax,%fs
1223 -
1224 cld # gcc2 wants the direction flag cleared at all times
1225
1226 pushl $0 # fake return address for unwinder
1227 jmp start_kernel
1228
1229 -/*
1230 - * Point the GDT at this CPU's PDA. This will be
1231 - * cpu_gdt_table and boot_pda.
1232 - */
1233 -ENTRY(setup_pda)
1234 - /* get the PDA pointer */
1235 - movl $boot_pda, %eax
1236 -
1237 - /* slot the PDA address into the GDT */
1238 - mov $cpu_gdt_table, %ecx
1239 - mov %ax, (__KERNEL_PDA+0+2)(%ecx) /* base & 0x0000ffff */
1240 - shr $16, %eax
1241 - mov %al, (__KERNEL_PDA+4+0)(%ecx) /* base & 0x00ff0000 */
1242 - mov %ah, (__KERNEL_PDA+4+3)(%ecx) /* base & 0xff000000 */
1243 -
1244 - # %esi still points to start_info, and no registers
1245 - # need to be preserved.
1246 -
1247 - movl XEN_START_mfn_list(%esi), %ebx
1248 - movl $(cpu_gdt_table - __PAGE_OFFSET), %eax
1249 - shrl $PAGE_SHIFT, %eax
1250 - movl (%ebx,%eax,4), %ecx
1251 - pushl %ecx # frame number for set_gdt below
1252 -
1253 - xorl %esi, %esi
1254 - xorl %edx, %edx
1255 - shldl $PAGE_SHIFT, %ecx, %edx
1256 - shll $PAGE_SHIFT, %ecx
1257 - orl $0x61, %ecx
1258 - movl $cpu_gdt_table, %ebx
1259 - movl $__HYPERVISOR_update_va_mapping, %eax
1260 - int $0x82
1261 -
1262 - movl $(PAGE_SIZE_asm / 8), %ecx
1263 - movl %esp, %ebx
1264 - movl $__HYPERVISOR_set_gdt, %eax
1265 - int $0x82
1266 -
1267 - popl %ecx
1268 - ret
1269 -
1270 #define HYPERCALL_PAGE_OFFSET 0x1000
1271 .org HYPERCALL_PAGE_OFFSET
1272 ENTRY(hypercall_page)
1273 @@ -138,60 +95,6 @@
1274 */
1275 .data
1276
1277 -/*
1278 - * The Global Descriptor Table contains 28 quadwords, per-CPU.
1279 - */
1280 - .section .data.page_aligned, "aw"
1281 - .align PAGE_SIZE_asm
1282 -ENTRY(cpu_gdt_table)
1283 - .quad 0x0000000000000000 /* NULL descriptor */
1284 - .quad 0x0000000000000000 /* 0x0b reserved */
1285 - .quad 0x0000000000000000 /* 0x13 reserved */
1286 - .quad 0x0000000000000000 /* 0x1b reserved */
1287 - .quad 0x0000000000000000 /* 0x20 unused */
1288 - .quad 0x0000000000000000 /* 0x28 unused */
1289 - .quad 0x0000000000000000 /* 0x33 TLS entry 1 */
1290 - .quad 0x0000000000000000 /* 0x3b TLS entry 2 */
1291 - .quad 0x0000000000000000 /* 0x43 TLS entry 3 */
1292 - .quad 0x0000000000000000 /* 0x4b reserved */
1293 - .quad 0x0000000000000000 /* 0x53 reserved */
1294 - .quad 0x0000000000000000 /* 0x5b reserved */
1295 -
1296 - .quad 0x00cf9a000000ffff /* 0x60 kernel 4GB code at 0x00000000 */
1297 - .quad 0x00cf92000000ffff /* 0x68 kernel 4GB data at 0x00000000 */
1298 - .quad 0x00cffa000000ffff /* 0x73 user 4GB code at 0x00000000 */
1299 - .quad 0x00cff2000000ffff /* 0x7b user 4GB data at 0x00000000 */
1300 -
1301 - .quad 0x0000000000000000 /* 0x80 TSS descriptor */
1302 - .quad 0x0000000000000000 /* 0x88 LDT descriptor */
1303 -
1304 - /*
1305 - * Segments used for calling PnP BIOS have byte granularity.
1306 - * They code segments and data segments have fixed 64k limits,
1307 - * the transfer segment sizes are set at run time.
1308 - */
1309 - .quad 0x0000000000000000 /* 0x90 32-bit code */
1310 - .quad 0x0000000000000000 /* 0x98 16-bit code */
1311 - .quad 0x0000000000000000 /* 0xa0 16-bit data */
1312 - .quad 0x0000000000000000 /* 0xa8 16-bit data */
1313 - .quad 0x0000000000000000 /* 0xb0 16-bit data */
1314 -
1315 - /*
1316 - * The APM segments have byte granularity and their bases
1317 - * are set at run time. All have 64k limits.
1318 - */
1319 - .quad 0x0000000000000000 /* 0xb8 APM CS code */
1320 - .quad 0x0000000000000000 /* 0xc0 APM CS 16 code (16 bit) */
1321 - .quad 0x0000000000000000 /* 0xc8 APM DS data */
1322 -
1323 - .quad 0x0000000000000000 /* 0xd0 - ESPFIX SS */
1324 - .quad 0x00cf92000000ffff /* 0xd8 - PDA */
1325 - .quad 0x0000000000000000 /* 0xe0 - unused */
1326 - .quad 0x0000000000000000 /* 0xe8 - unused */
1327 - .quad 0x0000000000000000 /* 0xf0 - unused */
1328 - .quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */
1329 - .align PAGE_SIZE_asm
1330 -
1331 #if CONFIG_XEN_COMPAT <= 0x030002
1332 /*
1333 * __xen_guest information
1334 --- a/arch/x86/kernel/head_64-xen.S
1335 +++ b/arch/x86/kernel/head_64-xen.S
1336 @@ -41,18 +42,15 @@
1337 .word gdt_end-cpu_gdt_table-1
1338 .long cpu_gdt_table-__START_KERNEL_map
1339 #endif
1340 -ENTRY(stext)
1341 -ENTRY(_stext)
1342
1343 - $page = 0
1344 +.balign PAGE_SIZE
1345 +
1346 #define NEXT_PAGE(name) \
1347 - $page = $page + 1; \
1348 - .org $page * 0x1000; \
1349 - phys_##name = $page * 0x1000 + __PHYSICAL_START; \
1350 + .balign PAGE_SIZE; \
1351 + phys_##name = . - .bootstrap.text; \
1352 ENTRY(name)
1353
1354 NEXT_PAGE(init_level4_pgt)
1355 - /* This gets initialized in x86_64_start_kernel */
1356 .fill 512,8,0
1357 NEXT_PAGE(init_level4_user_pgt)
1358 /*
1359 @@ -136,13 +134,13 @@
1360
1361 ENTRY(cpu_gdt_table)
1362 .quad 0x0000000000000000 /* NULL descriptor */
1363 + .quad 0x00cf9b000000ffff /* __KERNEL32_CS */
1364 + .quad 0x00af9b000000ffff /* __KERNEL_CS */
1365 + .quad 0x00cf93000000ffff /* __KERNEL_DS */
1366 + .quad 0x00cffb000000ffff /* __USER32_CS */
1367 + .quad 0x00cff3000000ffff /* __USER_DS, __USER32_DS */
1368 + .quad 0x00affb000000ffff /* __USER_CS */
1369 .quad 0x0 /* unused */
1370 - .quad 0x00af9a000000ffff /* __KERNEL_CS */
1371 - .quad 0x00cf92000000ffff /* __KERNEL_DS */
1372 - .quad 0x00cffa000000ffff /* __USER32_CS */
1373 - .quad 0x00cff2000000ffff /* __USER_DS, __USER32_DS */
1374 - .quad 0x00affa000000ffff /* __USER_CS */
1375 - .quad 0x00cf9a000000ffff /* __KERNEL32_CS */
1376 .quad 0,0 /* TSS */
1377 .quad 0,0 /* LDT */
1378 .quad 0,0,0 /* three TLS descriptors */
1379 @@ -165,14 +163,11 @@
1380 * __xen_guest information
1381 */
1382 .macro utoh value
1383 - .if (\value) < 0 || (\value) >= 0x10
1384 - utoh (((\value)>>4)&0x0fffffffffffffff)
1385 - .endif
1386 - .if ((\value) & 0xf) < 10
1387 - .byte '0' + ((\value) & 0xf)
1388 - .else
1389 - .byte 'A' + ((\value) & 0xf) - 10
1390 - .endif
1391 + i = 64
1392 + .rept 16
1393 + i = i - 4
1394 + .byte '0' + ((((\value) >> i) & 0xf) > 9) * ('0' - 'A' + 10) + (((\value) >> i) & 0xf)
1395 + .endr
1396 .endm
1397
1398 .section __xen_guest
1399 --- a/arch/x86/kernel/io_apic_32-xen.c
1400 +++ b/arch/x86/kernel/io_apic_32-xen.c
1401 @@ -25,7 +25,6 @@
1402 #include <linux/init.h>
1403 #include <linux/delay.h>
1404 #include <linux/sched.h>
1405 -#include <linux/smp_lock.h>
1406 #include <linux/mc146818rtc.h>
1407 #include <linux/compiler.h>
1408 #include <linux/acpi.h>
1409 @@ -35,6 +34,7 @@
1410 #include <linux/msi.h>
1411 #include <linux/htirq.h>
1412 #include <linux/freezer.h>
1413 +#include <linux/kthread.h>
1414
1415 #include <asm/io.h>
1416 #include <asm/smp.h>
1417 @@ -705,8 +705,6 @@
1418 unsigned long prev_balance_time = jiffies;
1419 long time_remaining = balanced_irq_interval;
1420
1421 - daemonize("kirqd");
1422 -
1423 /* push everything to CPU 0 to give us a starting point. */
1424 for (i = 0 ; i < NR_IRQS ; i++) {
1425 irq_desc[i].pending_mask = cpumask_of_cpu(0);
1426 @@ -766,10 +764,9 @@
1427 }
1428
1429 printk(KERN_INFO "Starting balanced_irq\n");
1430 - if (kernel_thread(balanced_irq, NULL, CLONE_KERNEL) >= 0)
1431 + if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd")))
1432 return 0;
1433 - else
1434 - printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
1435 + printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
1436 failed:
1437 for_each_possible_cpu(i) {
1438 kfree(irq_cpu_data[i].irq_delta);
1439 @@ -1445,10 +1442,6 @@
1440 enable_8259A_irq(0);
1441 }
1442
1443 -static inline void UNEXPECTED_IO_APIC(void)
1444 -{
1445 -}
1446 -
1447 void __init print_IO_APIC(void)
1448 {
1449 int apic, i;
1450 @@ -1488,34 +1481,12 @@
1451 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
1452 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
1453 printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
1454 - if (reg_00.bits.ID >= get_physical_broadcast())
1455 - UNEXPECTED_IO_APIC();
1456 - if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2)
1457 - UNEXPECTED_IO_APIC();
1458
1459 printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw);
1460 printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
1461 - if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */
1462 - (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */
1463 - (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */
1464 - (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */
1465 - (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */
1466 - (reg_01.bits.entries != 0x2E) &&
1467 - (reg_01.bits.entries != 0x3F)
1468 - )
1469 - UNEXPECTED_IO_APIC();
1470
1471 printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
1472 printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
1473 - if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */
1474 - (reg_01.bits.version != 0x10) && /* oldest IO-APICs */
1475 - (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */
1476 - (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */
1477 - (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */
1478 - )
1479 - UNEXPECTED_IO_APIC();
1480 - if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2)
1481 - UNEXPECTED_IO_APIC();
1482
1483 /*
1484 * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
1485 @@ -1525,8 +1496,6 @@
1486 if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
1487 printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
1488 printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
1489 - if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2)
1490 - UNEXPECTED_IO_APIC();
1491 }
1492
1493 /*
1494 @@ -1538,8 +1507,6 @@
1495 reg_03.raw != reg_01.raw) {
1496 printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
1497 printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT);
1498 - if (reg_03.bits.__reserved_1)
1499 - UNEXPECTED_IO_APIC();
1500 }
1501
1502 printk(KERN_DEBUG ".... IRQ redirection table:\n");
1503 @@ -2670,19 +2637,19 @@
1504 if (irq < 0)
1505 return irq;
1506
1507 - set_irq_msi(irq, desc);
1508 ret = msi_compose_msg(dev, irq, &msg);
1509 if (ret < 0) {
1510 destroy_irq(irq);
1511 return ret;
1512 }
1513
1514 + set_irq_msi(irq, desc);
1515 write_msi_msg(irq, &msg);
1516
1517 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq,
1518 "edge");
1519
1520 - return irq;
1521 + return 0;
1522 }
1523
1524 void arch_teardown_msi_irq(unsigned int irq)
1525 --- a/arch/x86/kernel/io_apic_64-xen.c
1526 +++ b/arch/x86/kernel/io_apic_64-xen.c
1527 @@ -25,7 +25,6 @@
1528 #include <linux/init.h>
1529 #include <linux/delay.h>
1530 #include <linux/sched.h>
1531 -#include <linux/smp_lock.h>
1532 #include <linux/pci.h>
1533 #include <linux/mc146818rtc.h>
1534 #include <linux/acpi.h>
1535 @@ -897,10 +896,6 @@
1536 enable_8259A_irq(0);
1537 }
1538
1539 -void __init UNEXPECTED_IO_APIC(void)
1540 -{
1541 -}
1542 -
1543 void __apicdebuginit print_IO_APIC(void)
1544 {
1545 int apic, i;
1546 @@ -936,40 +931,16 @@
1547 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
1548 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
1549 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
1550 - if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2)
1551 - UNEXPECTED_IO_APIC();
1552
1553 printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
1554 printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
1555 - if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */
1556 - (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */
1557 - (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */
1558 - (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */
1559 - (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */
1560 - (reg_01.bits.entries != 0x2E) &&
1561 - (reg_01.bits.entries != 0x3F) &&
1562 - (reg_01.bits.entries != 0x03)
1563 - )
1564 - UNEXPECTED_IO_APIC();
1565
1566 printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
1567 printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
1568 - if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */
1569 - (reg_01.bits.version != 0x02) && /* 82801BA IO-APICs (ICH2) */
1570 - (reg_01.bits.version != 0x10) && /* oldest IO-APICs */
1571 - (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */
1572 - (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */
1573 - (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */
1574 - )
1575 - UNEXPECTED_IO_APIC();
1576 - if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2)
1577 - UNEXPECTED_IO_APIC();
1578
1579 if (reg_01.bits.version >= 0x10) {
1580 printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
1581 printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
1582 - if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2)
1583 - UNEXPECTED_IO_APIC();
1584 }
1585
1586 printk(KERN_DEBUG ".... IRQ redirection table:\n");
1587 @@ -1401,8 +1372,7 @@
1588
1589 vector = ~get_irq_regs()->orig_rax;
1590 me = smp_processor_id();
1591 - if ((vector == cfg->vector) &&
1592 - cpu_isset(smp_processor_id(), cfg->domain)) {
1593 + if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
1594 cpumask_t cleanup_mask;
1595
1596 cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
1597 @@ -1437,7 +1407,7 @@
1598
1599 /*
1600 * We must acknowledge the irq before we move it or the acknowledge will
1601 - * not propogate properly.
1602 + * not propagate properly.
1603 */
1604 ack_APIC_irq();
1605
1606 @@ -1520,6 +1490,7 @@
1607 static void end_lapic_irq (unsigned int i) { /* nothing */ }
1608
1609 static struct hw_interrupt_type lapic_irq_type __read_mostly = {
1610 + .name = "local-APIC",
1611 .typename = "local-APIC-edge",
1612 .startup = NULL, /* startup_irq() not used for IRQ0 */
1613 .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
1614 @@ -1989,18 +1960,18 @@
1615 if (irq < 0)
1616 return irq;
1617
1618 - set_irq_msi(irq, desc);
1619 ret = msi_compose_msg(dev, irq, &msg);
1620 if (ret < 0) {
1621 destroy_irq(irq);
1622 return ret;
1623 }
1624
1625 + set_irq_msi(irq, desc);
1626 write_msi_msg(irq, &msg);
1627
1628 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
1629
1630 - return irq;
1631 + return 0;
1632 }
1633
1634 void arch_teardown_msi_irq(unsigned int irq)
1635 --- a/arch/x86/kernel/ioport_32-xen.c
1636 +++ b/arch/x86/kernel/ioport_32-xen.c
1637 @@ -12,10 +12,10 @@
1638 #include <linux/types.h>
1639 #include <linux/ioport.h>
1640 #include <linux/smp.h>
1641 -#include <linux/smp_lock.h>
1642 #include <linux/stddef.h>
1643 #include <linux/slab.h>
1644 #include <linux/thread_info.h>
1645 +#include <linux/syscalls.h>
1646 #include <xen/interface/physdev.h>
1647
1648 /* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
1649 --- a/arch/x86/kernel/ioport_64-xen.c
1650 +++ b/arch/x86/kernel/ioport_64-xen.c
1651 @@ -13,10 +13,10 @@
1652 #include <linux/ioport.h>
1653 #include <linux/mm.h>
1654 #include <linux/smp.h>
1655 -#include <linux/smp_lock.h>
1656 #include <linux/stddef.h>
1657 #include <linux/slab.h>
1658 #include <linux/thread_info.h>
1659 +#include <linux/syscalls.h>
1660 #include <xen/interface/physdev.h>
1661
1662 /* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
1663 --- a/arch/x86/kernel/irq_32-xen.c
1664 +++ b/arch/x86/kernel/irq_32-xen.c
1665 @@ -24,6 +24,9 @@
1666 DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp;
1667 EXPORT_PER_CPU_SYMBOL(irq_stat);
1668
1669 +DEFINE_PER_CPU(struct pt_regs *, irq_regs);
1670 +EXPORT_PER_CPU_SYMBOL(irq_regs);
1671 +
1672 /*
1673 * 'what should we do if we get a hw irq event on an illegal vector'.
1674 * each architecture has to answer this themselves.
1675 --- a/arch/x86/kernel/irq_64-xen.c
1676 +++ b/arch/x86/kernel/irq_64-xen.c
1677 @@ -32,7 +32,7 @@
1678 */
1679 static inline void stack_overflow_check(struct pt_regs *regs)
1680 {
1681 - u64 curbase = (u64) current->thread_info;
1682 + u64 curbase = (u64)task_stack_page(current);
1683 static unsigned long warned = -60*HZ;
1684
1685 if (regs->rsp >= curbase && regs->rsp <= curbase + THREAD_SIZE &&
1686 @@ -145,17 +145,43 @@
1687
1688 for (irq = 0; irq < NR_IRQS; irq++) {
1689 cpumask_t mask;
1690 + int break_affinity = 0;
1691 + int set_affinity = 1;
1692 +
1693 if (irq == 2)
1694 continue;
1695
1696 + /* interrupt's are disabled at this point */
1697 + spin_lock(&irq_desc[irq].lock);
1698 +
1699 + if (!irq_has_action(irq) ||
1700 + cpus_equal(irq_desc[irq].affinity, map)) {
1701 + spin_unlock(&irq_desc[irq].lock);
1702 + continue;
1703 + }
1704 +
1705 cpus_and(mask, irq_desc[irq].affinity, map);
1706 - if (any_online_cpu(mask) == NR_CPUS) {
1707 - printk("Breaking affinity for irq %i\n", irq);
1708 + if (cpus_empty(mask)) {
1709 + break_affinity = 1;
1710 mask = map;
1711 }
1712 +
1713 + if (irq_desc[irq].chip->mask)
1714 + irq_desc[irq].chip->mask(irq);
1715 +
1716 if (irq_desc[irq].chip->set_affinity)
1717 irq_desc[irq].chip->set_affinity(irq, mask);
1718 - else if (irq_desc[irq].action && !(warned++))
1719 + else if (!(warned++))
1720 + set_affinity = 0;
1721 +
1722 + if (irq_desc[irq].chip->unmask)
1723 + irq_desc[irq].chip->unmask(irq);
1724 +
1725 + spin_unlock(&irq_desc[irq].lock);
1726 +
1727 + if (break_affinity && set_affinity)
1728 + printk("Broke affinity for irq %i\n", irq);
1729 + else if (!set_affinity)
1730 printk("Cannot set affinity for irq %i\n", irq);
1731 }
1732
1733 --- a/arch/x86/kernel/ldt_32-xen.c
1734 +++ b/arch/x86/kernel/ldt_32-xen.c
1735 @@ -10,7 +10,6 @@
1736 #include <linux/string.h>
1737 #include <linux/mm.h>
1738 #include <linux/smp.h>
1739 -#include <linux/smp_lock.h>
1740 #include <linux/vmalloc.h>
1741 #include <linux/slab.h>
1742
1743 --- a/arch/x86/kernel/ldt_64-xen.c
1744 +++ b/arch/x86/kernel/ldt_64-xen.c
1745 @@ -13,7 +13,6 @@
1746 #include <linux/string.h>
1747 #include <linux/mm.h>
1748 #include <linux/smp.h>
1749 -#include <linux/smp_lock.h>
1750 #include <linux/vmalloc.h>
1751 #include <linux/slab.h>
1752
1753 --- a/arch/x86/kernel/microcode-xen.c
1754 +++ b/arch/x86/kernel/microcode-xen.c
1755 @@ -135,7 +135,7 @@
1756 return 0;
1757 }
1758
1759 -static void __exit microcode_dev_exit (void)
1760 +static void microcode_dev_exit (void)
1761 {
1762 misc_deregister(&microcode_dev);
1763 }
1764 --- a/arch/x86/kernel/mpparse_32-xen.c
1765 +++ b/arch/x86/kernel/mpparse_32-xen.c
1766 @@ -18,7 +18,6 @@
1767 #include <linux/acpi.h>
1768 #include <linux/delay.h>
1769 #include <linux/bootmem.h>
1770 -#include <linux/smp_lock.h>
1771 #include <linux/kernel_stat.h>
1772 #include <linux/mc146818rtc.h>
1773 #include <linux/bitops.h>
1774 @@ -484,7 +483,7 @@
1775 }
1776 ++mpc_record;
1777 }
1778 - clustered_apic_check();
1779 + setup_apic_routing();
1780 if (!num_processors)
1781 printk(KERN_ERR "SMP mptable: no processors registered!\n");
1782 return num_processors;
1783 --- a/arch/x86/kernel/mpparse_64-xen.c
1784 +++ b/arch/x86/kernel/mpparse_64-xen.c
1785 @@ -17,7 +17,6 @@
1786 #include <linux/init.h>
1787 #include <linux/delay.h>
1788 #include <linux/bootmem.h>
1789 -#include <linux/smp_lock.h>
1790 #include <linux/kernel_stat.h>
1791 #include <linux/mc146818rtc.h>
1792 #include <linux/acpi.h>
1793 @@ -307,7 +306,7 @@
1794 }
1795 }
1796 }
1797 - clustered_apic_check();
1798 + setup_apic_routing();
1799 if (!num_processors)
1800 printk(KERN_ERR "MPTABLE: no processors registered!\n");
1801 return num_processors;
1802 --- a/arch/x86/kernel/pci-dma_32-xen.c
1803 +++ b/arch/x86/kernel/pci-dma_32-xen.c
1804 @@ -13,6 +13,7 @@
1805 #include <linux/pci.h>
1806 #include <linux/module.h>
1807 #include <linux/version.h>
1808 +#include <linux/pci.h>
1809 #include <asm/io.h>
1810 #include <xen/balloon.h>
1811 #include <xen/gnttab.h>
1812 @@ -284,7 +285,7 @@
1813 {
1814 void __iomem *mem_base = NULL;
1815 int pages = size >> PAGE_SHIFT;
1816 - int bitmap_size = (pages + 31)/32;
1817 + int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
1818
1819 if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
1820 goto out;
1821 @@ -357,6 +358,32 @@
1822 EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
1823 #endif /* ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY */
1824
1825 +#if defined(CONFIG_PCI) && !defined(CONFIG_XEN)
1826 +/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
1827 +
1828 +int forbid_dac;
1829 +EXPORT_SYMBOL(forbid_dac);
1830 +
1831 +static __devinit void via_no_dac(struct pci_dev *dev)
1832 +{
1833 + if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
1834 + printk(KERN_INFO "PCI: VIA PCI bridge detected. Disabling DAC.\n");
1835 + forbid_dac = 1;
1836 + }
1837 +}
1838 +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
1839 +
1840 +static int check_iommu(char *s)
1841 +{
1842 + if (!strcmp(s, "usedac")) {
1843 + forbid_dac = -1;
1844 + return 1;
1845 + }
1846 + return 0;
1847 +}
1848 +__setup("iommu=", check_iommu);
1849 +#endif
1850 +
1851 dma_addr_t
1852 dma_map_single(struct device *dev, void *ptr, size_t size,
1853 enum dma_data_direction direction)
1854 --- a/arch/x86/kernel/pci-swiotlb_64-xen.c
1855 +++ b/arch/x86/kernel/pci-swiotlb_64-xen.c
1856 @@ -16,7 +16,7 @@
1857
1858 void swiotlb_init(void);
1859
1860 -struct dma_mapping_ops swiotlb_dma_ops = {
1861 +const struct dma_mapping_ops swiotlb_dma_ops = {
1862 #if 0
1863 .mapping_error = swiotlb_dma_mapping_error,
1864 .alloc_coherent = swiotlb_alloc_coherent,
1865 --- a/arch/x86/kernel/process_32-xen.c
1866 +++ b/arch/x86/kernel/process_32-xen.c
1867 @@ -21,7 +21,6 @@
1868 #include <linux/mm.h>
1869 #include <linux/elfcore.h>
1870 #include <linux/smp.h>
1871 -#include <linux/smp_lock.h>
1872 #include <linux/stddef.h>
1873 #include <linux/slab.h>
1874 #include <linux/vmalloc.h>
1875 @@ -39,6 +38,7 @@
1876 #include <linux/random.h>
1877 #include <linux/personality.h>
1878 #include <linux/tick.h>
1879 +#include <linux/percpu.h>
1880
1881 #include <asm/uaccess.h>
1882 #include <asm/pgtable.h>
1883 @@ -61,7 +61,6 @@
1884
1885 #include <asm/tlbflush.h>
1886 #include <asm/cpu.h>
1887 -#include <asm/pda.h>
1888
1889 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
1890
1891 @@ -70,6 +69,12 @@
1892 unsigned long boot_option_idle_override = 0;
1893 EXPORT_SYMBOL(boot_option_idle_override);
1894
1895 +DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
1896 +EXPORT_PER_CPU_SYMBOL(current_task);
1897 +
1898 +DEFINE_PER_CPU(int, cpu_number);
1899 +EXPORT_PER_CPU_SYMBOL(cpu_number);
1900 +
1901 /*
1902 * Return saved PC of a blocked thread.
1903 */
1904 @@ -168,6 +173,7 @@
1905 if (__get_cpu_var(cpu_idle_state))
1906 __get_cpu_var(cpu_idle_state) = 0;
1907
1908 + check_pgt_cache();
1909 rmb();
1910 idle = xen_idle; /* no alternatives */
1911
1912 @@ -218,18 +224,19 @@
1913 {
1914 }
1915
1916 -static int __init idle_setup (char *str)
1917 +static int __init idle_setup(char *str)
1918 {
1919 - if (!strncmp(str, "poll", 4)) {
1920 + if (!strcmp(str, "poll")) {
1921 printk("using polling idle threads.\n");
1922 pm_idle = poll_idle;
1923 }
1924 + else
1925 + return -1;
1926
1927 boot_option_idle_override = 1;
1928 - return 1;
1929 + return 0;
1930 }
1931 -
1932 -__setup("idle=", idle_setup);
1933 +early_param("idle", idle_setup);
1934
1935 void show_regs(struct pt_regs * regs)
1936 {
1937 @@ -282,7 +289,7 @@
1938
1939 regs.xds = __USER_DS;
1940 regs.xes = __USER_DS;
1941 - regs.xfs = __KERNEL_PDA;
1942 + regs.xfs = __KERNEL_PERCPU;
1943 regs.orig_eax = -1;
1944 regs.eip = (unsigned long) kernel_thread_helper;
1945 regs.xcs = __KERNEL_CS | get_kernel_rpl();
1946 @@ -556,7 +563,7 @@
1947 * multicall to indicate FPU task switch, rather than
1948 * synchronously trapping to Xen.
1949 */
1950 - if (prev_p->thread_info->status & TS_USEDFPU) {
1951 + if (task_thread_info(prev_p)->status & TS_USEDFPU) {
1952 __save_init_fpu(prev_p); /* _not_ save_init_fpu() */
1953 mcl->op = __HYPERVISOR_fpu_taskswitch;
1954 mcl->args[0] = 1;
1955 @@ -648,7 +655,7 @@
1956 if (prev->gs | next->gs)
1957 loadsegment(gs, next->gs);
1958
1959 - write_pda(pcurrent, next_p);
1960 + x86_write_percpu(current_task, next_p);
1961
1962 return prev_p;
1963 }
1964 --- a/arch/x86/kernel/process_64-xen.c
1965 +++ b/arch/x86/kernel/process_64-xen.c
1966 @@ -39,6 +39,7 @@
1967 #include <linux/random.h>
1968 #include <linux/notifier.h>
1969 #include <linux/kprobes.h>
1970 +#include <linux/kdebug.h>
1971
1972 #include <asm/uaccess.h>
1973 #include <asm/pgtable.h>
1974 @@ -49,7 +50,6 @@
1975 #include <asm/mmu_context.h>
1976 #include <asm/pda.h>
1977 #include <asm/prctl.h>
1978 -#include <asm/kdebug.h>
1979 #include <xen/interface/platform.h>
1980 #include <xen/interface/physdev.h>
1981 #include <xen/interface/vcpu.h>
1982 @@ -232,16 +232,18 @@
1983
1984 static int __init idle_setup (char *str)
1985 {
1986 - if (!strncmp(str, "poll", 4)) {
1987 + if (!strcmp(str, "poll")) {
1988 printk("using polling idle threads.\n");
1989 pm_idle = poll_idle;
1990 - }
1991 + } else if (!strcmp(str, "mwait"))
1992 + force_mwait = 1;
1993 + else
1994 + return -1;
1995
1996 boot_option_idle_override = 1;
1997 - return 1;
1998 + return 0;
1999 }
2000 -
2001 -__setup("idle=", idle_setup);
2002 +early_param("idle", idle_setup);
2003
2004 /* Prints also some state that isn't saved in the pt_regs */
2005 void __show_regs(struct pt_regs * regs)
2006 @@ -540,7 +542,7 @@
2007 * The AMD workaround requires it to be after DS reload, or
2008 * after DS has been cleared, which we do in __prepare_arch_switch.
2009 */
2010 - if (prev_p->thread_info->status & TS_USEDFPU) {
2011 + if (task_thread_info(prev_p)->status & TS_USEDFPU) {
2012 __save_init_fpu(prev_p); /* _not_ save_init_fpu() */
2013 mcl->op = __HYPERVISOR_fpu_taskswitch;
2014 mcl->args[0] = 1;
2015 --- a/arch/x86/kernel/quirks-xen.c
2016 +++ b/arch/x86/kernel/quirks-xen.c
2017 @@ -3,12 +3,10 @@
2018 */
2019 #include <linux/pci.h>
2020 #include <linux/irq.h>
2021 -#include <asm/pci-direct.h>
2022 -#include <asm/genapic.h>
2023 -#include <asm/cpu.h>
2024
2025 #if defined(CONFIG_X86_IO_APIC) && (defined(CONFIG_SMP) || defined(CONFIG_XEN)) && defined(CONFIG_PCI)
2026 -static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev)
2027 +
2028 +static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
2029 {
2030 u8 config, rev;
2031 u32 word;
2032 @@ -16,7 +14,7 @@
2033 /* BIOS may enable hardware IRQ balancing for
2034 * E7520/E7320/E7525(revision ID 0x9 and below)
2035 * based platforms.
2036 - * For those platforms, make sure that the genapic is set to 'flat'
2037 + * Disable SW irqbalance/affinity on those platforms.
2038 */
2039 pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
2040 if (rev > 0x9)
2041 @@ -30,59 +28,20 @@
2042 raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
2043
2044 if (!(word & (1 << 13))) {
2045 -#ifndef CONFIG_XEN
2046 -#ifdef CONFIG_X86_64
2047 - if (genapic != &apic_flat)
2048 - panic("APIC mode must be flat on this system\n");
2049 -#elif defined(CONFIG_X86_GENERICARCH)
2050 - if (genapic != &apic_default)
2051 - panic("APIC mode must be default(flat) on this system. Use apic=default\n");
2052 -#endif
2053 -#endif
2054 - }
2055 -
2056 - /* put back the original value for config space*/
2057 - if (!(config & 0x2))
2058 - pci_write_config_byte(dev, 0xf4, config);
2059 -}
2060 -
2061 -void __init quirk_intel_irqbalance(void)
2062 -{
2063 - u8 config, rev;
2064 - u32 word;
2065 -
2066 - /* BIOS may enable hardware IRQ balancing for
2067 - * E7520/E7320/E7525(revision ID 0x9 and below)
2068 - * based platforms.
2069 - * Disable SW irqbalance/affinity on those platforms.
2070 - */
2071 - rev = read_pci_config_byte(0, 0, 0, PCI_CLASS_REVISION);
2072 - if (rev > 0x9)
2073 - return;
2074 -
2075 - printk(KERN_INFO "Intel E7520/7320/7525 detected.");
2076 -
2077 - /* enable access to config space */
2078 - config = read_pci_config_byte(0, 0, 0, 0xf4);
2079 - write_pci_config_byte(0, 0, 0, 0xf4, config|0x2);
2080 -
2081 - /* read xTPR register */
2082 - word = read_pci_config_16(0, 0, 0x40, 0x4c);
2083 -
2084 - if (!(word & (1 << 13))) {
2085 struct xen_platform_op op;
2086 - printk(KERN_INFO "Disabling irq balancing and affinity\n");
2087 +
2088 + printk(KERN_INFO "Intel E7520/7320/7525 detected. "
2089 + "Disabling irq balancing and affinity\n");
2090 op.cmd = XENPF_platform_quirk;
2091 op.u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING;
2092 WARN_ON(HYPERVISOR_platform_op(&op));
2093 }
2094
2095 - /* put back the original value for config space */
2096 + /* put back the original value for config space*/
2097 if (!(config & 0x2))
2098 - write_pci_config_byte(0, 0, 0, 0xf4, config);
2099 + pci_write_config_byte(dev, 0xf4, config);
2100 }
2101 -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, verify_quirk_intel_irqbalance);
2102 -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, verify_quirk_intel_irqbalance);
2103 -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, verify_quirk_intel_irqbalance);
2104 -
2105 +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance);
2106 +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance);
2107 +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance);
2108 #endif
2109 --- a/arch/x86/kernel/setup64-xen.c
2110 +++ b/arch/x86/kernel/setup64-xen.c
2111 @@ -113,9 +113,9 @@
2112 if (!NODE_DATA(cpu_to_node(i))) {
2113 printk("cpu with no node %d, num_online_nodes %d\n",
2114 i, num_online_nodes());
2115 - ptr = alloc_bootmem(size);
2116 + ptr = alloc_bootmem_pages(size);
2117 } else {
2118 - ptr = alloc_bootmem_node(NODE_DATA(cpu_to_node(i)), size);
2119 + ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(i)), size);
2120 }
2121 if (!ptr)
2122 panic("Cannot allocate cpu data for CPU %d\n", i);
2123 @@ -208,6 +208,8 @@
2124 __attribute__((section(".bss.page_aligned")));
2125 #endif
2126
2127 +extern asmlinkage void ignore_sysret(void);
2128 +
2129 /* May not be marked __init: used by software suspend */
2130 void syscall_init(void)
2131 {
2132 @@ -219,12 +221,22 @@
2133 */
2134 wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
2135 wrmsrl(MSR_LSTAR, system_call);
2136 + wrmsrl(MSR_CSTAR, ignore_sysret);
2137
2138 /* Flags to clear on syscall */
2139 wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000);
2140 #endif
2141 #ifdef CONFIG_IA32_EMULATION
2142 syscall32_cpu_init ();
2143 +#else
2144 + {
2145 + static const struct callback_register cstar = {
2146 + .type = CALLBACKTYPE_syscall32,
2147 + .address = (unsigned long)ignore_sysret
2148 + };
2149 + if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar))
2150 +			printk(KERN_WARNING "Unable to register CSTAR callback\n");
2151 + }
2152 #endif
2153 }
2154
2155 @@ -262,7 +274,6 @@
2156 /* CPU 0 is initialised in head64.c */
2157 if (cpu != 0) {
2158 pda_init(cpu);
2159 - zap_low_mappings(cpu);
2160 }
2161 #ifndef CONFIG_X86_NO_TSS
2162 else
2163 --- a/arch/x86/kernel/setup_64-xen.c
2164 +++ b/arch/x86/kernel/setup_64-xen.c
2165 @@ -123,6 +123,8 @@
2166
2167 unsigned long saved_video_mode;
2168
2169 +int force_mwait __cpuinitdata;
2170 +
2171 /*
2172 * Early DMI memory
2173 */
2174 @@ -256,10 +258,10 @@
2175 * there is a real-mode segmented pointer pointing to the
2176 * 4K EBDA area at 0x40E
2177 */
2178 - ebda_addr = *(unsigned short *)EBDA_ADDR_POINTER;
2179 + ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER);
2180 ebda_addr <<= 4;
2181
2182 - ebda_size = *(unsigned short *)(unsigned long)ebda_addr;
2183 + ebda_size = *(unsigned short *)__va(ebda_addr);
2184
2185 /* Round EBDA up to pages */
2186 if (ebda_size == 0)
2187 @@ -413,15 +415,8 @@
2188 #endif
2189
2190 #ifdef CONFIG_SMP
2191 - /*
2192 - * But first pinch a few for the stack/trampoline stuff
2193 - * FIXME: Don't need the extra page at 4K, but need to fix
2194 - * trampoline before removing it. (see the GDT stuff)
2195 - */
2196 - reserve_bootmem_generic(PAGE_SIZE, PAGE_SIZE);
2197 -
2198 /* Reserve SMP trampoline */
2199 - reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, PAGE_SIZE);
2200 + reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, 2*PAGE_SIZE);
2201 #endif
2202 #endif
2203
2204 @@ -573,8 +568,6 @@
2205 early_quirks();
2206 #endif
2207
2208 - zap_low_mappings(0);
2209 -
2210 /*
2211 * set this early, so we dont allocate cpu0
2212 * if MADT list doesnt list BSP first
2213 @@ -877,6 +870,10 @@
2214
2215 /* RDTSC can be speculated around */
2216 clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
2217 +
2218 + /* Family 10 doesn't support C states in MWAIT so don't use it */
2219 + if (c->x86 == 0x10 && !force_mwait)
2220 + clear_bit(X86_FEATURE_MWAIT, &c->x86_capability);
2221 }
2222
2223 static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
2224 @@ -1159,9 +1156,7 @@
2225 #ifdef CONFIG_X86_MCE
2226 mcheck_init(c);
2227 #endif
2228 - if (c == &boot_cpu_data)
2229 - mtrr_bp_init();
2230 - else
2231 + if (c != &boot_cpu_data)
2232 mtrr_ap_init();
2233 #ifdef CONFIG_NUMA
2234 numa_add_cpu(smp_processor_id());
2235 @@ -1252,9 +1247,8 @@
2236 "stc",
2237 "100mhzsteps",
2238 "hwpstate",
2239 - NULL, /* tsc invariant mapped to constant_tsc */
2240 - NULL,
2241 - /* nothing */ /* constant_tsc - moved to flags */
2242 + "", /* tsc invariant mapped to constant_tsc */
2243 + /* nothing */
2244 };
2245
2246
2247 --- a/arch/x86/kernel/smp_32-xen.c
2248 +++ b/arch/x86/kernel/smp_32-xen.c
2249 @@ -13,7 +13,6 @@
2250 #include <linux/mm.h>
2251 #include <linux/delay.h>
2252 #include <linux/spinlock.h>
2253 -#include <linux/smp_lock.h>
2254 #include <linux/kernel_stat.h>
2255 #include <linux/mc146818rtc.h>
2256 #include <linux/cache.h>
2257 @@ -216,7 +215,6 @@
2258 static struct mm_struct * flush_mm;
2259 static unsigned long flush_va;
2260 static DEFINE_SPINLOCK(tlbstate_lock);
2261 -#define FLUSH_ALL 0xffffffff
2262
2263 /*
2264 * We cannot call mmdrop() because we are in interrupt context,
2265 @@ -298,7 +296,7 @@
2266
2267 if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
2268 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
2269 - if (flush_va == FLUSH_ALL)
2270 + if (flush_va == TLB_FLUSH_ALL)
2271 local_flush_tlb();
2272 else
2273 __flush_tlb_one(flush_va);
2274 @@ -314,9 +312,11 @@
2275 return IRQ_HANDLED;
2276 }
2277
2278 -static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
2279 - unsigned long va)
2280 +void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
2281 + unsigned long va)
2282 {
2283 + cpumask_t cpumask = *cpumaskp;
2284 +
2285 /*
2286 * A couple of (to be removed) sanity checks:
2287 *
2288 @@ -327,10 +327,12 @@
2289 BUG_ON(cpu_isset(smp_processor_id(), cpumask));
2290 BUG_ON(!mm);
2291
2292 +#ifdef CONFIG_HOTPLUG_CPU
2293 /* If a CPU which we ran on has gone down, OK. */
2294 cpus_and(cpumask, cpumask, cpu_online_map);
2295 - if (cpus_empty(cpumask))
2296 + if (unlikely(cpus_empty(cpumask)))
2297 return;
2298 +#endif
2299
2300 /*
2301 * i'm not happy about this global shared spinlock in the
2302 @@ -341,17 +343,7 @@
2303
2304 flush_mm = mm;
2305 flush_va = va;
2306 -#if NR_CPUS <= BITS_PER_LONG
2307 - atomic_set_mask(cpumask, &flush_cpumask);
2308 -#else
2309 - {
2310 - int k;
2311 - unsigned long *flush_mask = (unsigned long *)&flush_cpumask;
2312 - unsigned long *cpu_mask = (unsigned long *)&cpumask;
2313 - for (k = 0; k < BITS_TO_LONGS(NR_CPUS); ++k)
2314 - atomic_set_mask(cpu_mask[k], &flush_mask[k]);
2315 - }
2316 -#endif
2317 + cpus_or(flush_cpumask, cpumask, flush_cpumask);
2318 /*
2319 * We have to send the IPI only to
2320 * CPUs affected.
2321 @@ -378,7 +370,7 @@
2322
2323 local_flush_tlb();
2324 if (!cpus_empty(cpu_mask))
2325 - flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
2326 + flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
2327 preempt_enable();
2328 }
2329
2330 @@ -397,7 +389,7 @@
2331 leave_mm(smp_processor_id());
2332 }
2333 if (!cpus_empty(cpu_mask))
2334 - flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
2335 + flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
2336
2337 preempt_enable();
2338 }
2339 @@ -446,7 +438,7 @@
2340 * it goes straight through and wastes no time serializing
2341 * anything. Worst case is that we lose a reschedule ...
2342 */
2343 -void smp_send_reschedule(int cpu)
2344 +void xen_smp_send_reschedule(int cpu)
2345 {
2346 WARN_ON(cpu_is_offline(cpu));
2347 send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
2348 @@ -478,36 +470,79 @@
2349
2350 static struct call_data_struct *call_data;
2351
2352 +static void __smp_call_function(void (*func) (void *info), void *info,
2353 + int nonatomic, int wait)
2354 +{
2355 + struct call_data_struct data;
2356 + int cpus = num_online_cpus() - 1;
2357 +
2358 + if (!cpus)
2359 + return;
2360 +
2361 + data.func = func;
2362 + data.info = info;
2363 + atomic_set(&data.started, 0);
2364 + data.wait = wait;
2365 + if (wait)
2366 + atomic_set(&data.finished, 0);
2367 +
2368 + call_data = &data;
2369 + mb();
2370 +
2371 + /* Send a message to all other CPUs and wait for them to respond */
2372 + send_IPI_allbutself(CALL_FUNCTION_VECTOR);
2373 +
2374 + /* Wait for response */
2375 + while (atomic_read(&data.started) != cpus)
2376 + cpu_relax();
2377 +
2378 + if (wait)
2379 + while (atomic_read(&data.finished) != cpus)
2380 + cpu_relax();
2381 +}
2382 +
2383 +
2384 /**
2385 - * smp_call_function(): Run a function on all other CPUs.
2386 + * smp_call_function_mask(): Run a function on a set of other CPUs.
2387 + * @mask: The set of cpus to run on. Must not include the current cpu.
2388 * @func: The function to run. This must be fast and non-blocking.
2389 * @info: An arbitrary pointer to pass to the function.
2390 - * @nonatomic: currently unused.
2391 * @wait: If true, wait (atomically) until function has completed on other CPUs.
2392 *
2393 - * Returns 0 on success, else a negative status code. Does not return until
2394 - * remote CPUs are nearly ready to execute <<func>> or are or have executed.
2395 + * Returns 0 on success, else a negative status code.
2396 + *
2397 + * If @wait is true, then returns once @func has returned; otherwise
2398 + * it returns just before the target cpu calls @func.
2399 *
2400 * You must not call this function with disabled interrupts or from a
2401 * hardware interrupt handler or from a bottom half handler.
2402 */
2403 -int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
2404 - int wait)
2405 +int
2406 +xen_smp_call_function_mask(cpumask_t mask,
2407 + void (*func)(void *), void *info,
2408 + int wait)
2409 {
2410 struct call_data_struct data;
2411 + cpumask_t allbutself;
2412 int cpus;
2413
2414 + /* Can deadlock when called with interrupts disabled */
2415 + WARN_ON(irqs_disabled());
2416 +
2417 /* Holding any lock stops cpus from going down. */
2418 spin_lock(&call_lock);
2419 - cpus = num_online_cpus() - 1;
2420 +
2421 + allbutself = cpu_online_map;
2422 + cpu_clear(smp_processor_id(), allbutself);
2423 +
2424 + cpus_and(mask, mask, allbutself);
2425 + cpus = cpus_weight(mask);
2426 +
2427 if (!cpus) {
2428 spin_unlock(&call_lock);
2429 return 0;
2430 }
2431
2432 - /* Can deadlock when called with interrupts disabled */
2433 - WARN_ON(irqs_disabled());
2434 -
2435 data.func = func;
2436 data.info = info;
2437 atomic_set(&data.started, 0);
2438 @@ -517,9 +552,12 @@
2439
2440 call_data = &data;
2441 mb();
2442 -
2443 - /* Send a message to all other CPUs and wait for them to respond */
2444 - send_IPI_allbutself(CALL_FUNCTION_VECTOR);
2445 +
2446 + /* Send a message to other CPUs */
2447 + if (cpus_equal(mask, allbutself))
2448 + send_IPI_allbutself(CALL_FUNCTION_VECTOR);
2449 + else
2450 + send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
2451
2452 /* Wait for response */
2453 while (atomic_read(&data.started) != cpus)
2454 @@ -532,15 +570,14 @@
2455
2456 return 0;
2457 }
2458 -EXPORT_SYMBOL(smp_call_function);
2459
2460 static void stop_this_cpu (void * dummy)
2461 {
2462 + local_irq_disable();
2463 /*
2464 * Remove this CPU:
2465 */
2466 cpu_clear(smp_processor_id(), cpu_online_map);
2467 - local_irq_disable();
2468 disable_all_local_evtchn();
2469 if (cpu_data[smp_processor_id()].hlt_works_ok)
2470 for(;;) halt();
2471 @@ -551,13 +588,18 @@
2472 * this function calls the 'stop' function on all other CPUs in the system.
2473 */
2474
2475 -void smp_send_stop(void)
2476 +void xen_smp_send_stop(void)
2477 {
2478 - smp_call_function(stop_this_cpu, NULL, 1, 0);
2479 + /* Don't deadlock on the call lock in panic */
2480 + int nolock = !spin_trylock(&call_lock);
2481 + unsigned long flags;
2482
2483 - local_irq_disable();
2484 + local_irq_save(flags);
2485 + __smp_call_function(stop_this_cpu, NULL, 0, 0);
2486 + if (!nolock)
2487 + spin_unlock(&call_lock);
2488 disable_all_local_evtchn();
2489 - local_irq_enable();
2490 + local_irq_restore(flags);
2491 }
2492
2493 /*
2494 @@ -598,74 +640,3 @@
2495
2496 return IRQ_HANDLED;
2497 }
2498 -
2499 -/*
2500 - * this function sends a 'generic call function' IPI to one other CPU
2501 - * in the system.
2502 - *
2503 - * cpu is a standard Linux logical CPU number.
2504 - */
2505 -static void
2506 -__smp_call_function_single(int cpu, void (*func) (void *info), void *info,
2507 - int nonatomic, int wait)
2508 -{
2509 - struct call_data_struct data;
2510 - int cpus = 1;
2511 -
2512 - data.func = func;
2513 - data.info = info;
2514 - atomic_set(&data.started, 0);
2515 - data.wait = wait;
2516 - if (wait)
2517 - atomic_set(&data.finished, 0);
2518 -
2519 - call_data = &data;
2520 - wmb();
2521 - /* Send a message to all other CPUs and wait for them to respond */
2522 - send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR);
2523 -
2524 - /* Wait for response */
2525 - while (atomic_read(&data.started) != cpus)
2526 - cpu_relax();
2527 -
2528 - if (!wait)
2529 - return;
2530 -
2531 - while (atomic_read(&data.finished) != cpus)
2532 - cpu_relax();
2533 -}
2534 -
2535 -/*
2536 - * smp_call_function_single - Run a function on another CPU
2537 - * @func: The function to run. This must be fast and non-blocking.
2538 - * @info: An arbitrary pointer to pass to the function.
2539 - * @nonatomic: Currently unused.
2540 - * @wait: If true, wait until function has completed on other CPUs.
2541 - *
2542 - * Retrurns 0 on success, else a negative status code.
2543 - *
2544 - * Does not return until the remote CPU is nearly ready to execute <func>
2545 - * or is or has executed.
2546 - */
2547 -
2548 -int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
2549 - int nonatomic, int wait)
2550 -{
2551 - /* prevent preemption and reschedule on another processor */
2552 - int me = get_cpu();
2553 - if (cpu == me) {
2554 - WARN_ON(1);
2555 - put_cpu();
2556 - return -EBUSY;
2557 - }
2558 -
2559 - /* Can deadlock when called with interrupts disabled */
2560 - WARN_ON(irqs_disabled());
2561 -
2562 - spin_lock_bh(&call_lock);
2563 - __smp_call_function_single(cpu, func, info, nonatomic, wait);
2564 - spin_unlock_bh(&call_lock);
2565 - put_cpu();
2566 - return 0;
2567 -}
2568 -EXPORT_SYMBOL(smp_call_function_single);
2569 --- a/arch/x86/kernel/smp_64-xen.c
2570 +++ b/arch/x86/kernel/smp_64-xen.c
2571 @@ -14,7 +14,6 @@
2572 #include <linux/mm.h>
2573 #include <linux/delay.h>
2574 #include <linux/spinlock.h>
2575 -#include <linux/smp_lock.h>
2576 #include <linux/smp.h>
2577 #include <linux/kernel_stat.h>
2578 #include <linux/mc146818rtc.h>
2579 @@ -457,44 +456,36 @@
2580 }
2581 EXPORT_SYMBOL(smp_call_function);
2582
2583 -void smp_stop_cpu(void)
2584 +static void stop_this_cpu(void *dummy)
2585 {
2586 - unsigned long flags;
2587 + local_irq_disable();
2588 /*
2589 * Remove this CPU:
2590 */
2591 cpu_clear(smp_processor_id(), cpu_online_map);
2592 - local_irq_save(flags);
2593 disable_all_local_evtchn();
2594 - local_irq_restore(flags);
2595 -}
2596 -
2597 -static void smp_really_stop_cpu(void *dummy)
2598 -{
2599 - smp_stop_cpu();
2600 for (;;)
2601 halt();
2602 }
2603
2604 void smp_send_stop(void)
2605 {
2606 - int nolock = 0;
2607 + int nolock;
2608 + unsigned long flags;
2609 +
2610 #ifndef CONFIG_XEN
2611 if (reboot_force)
2612 return;
2613 #endif
2614 +
2615 /* Don't deadlock on the call lock in panic */
2616 - if (!spin_trylock(&call_lock)) {
2617 - /* ignore locking because we have panicked anyways */
2618 - nolock = 1;
2619 - }
2620 - __smp_call_function(smp_really_stop_cpu, NULL, 0, 0);
2621 + nolock = !spin_trylock(&call_lock);
2622 + local_irq_save(flags);
2623 + __smp_call_function(stop_this_cpu, NULL, 0, 0);
2624 if (!nolock)
2625 spin_unlock(&call_lock);
2626 -
2627 - local_irq_disable();
2628 disable_all_local_evtchn();
2629 - local_irq_enable();
2630 + local_irq_restore(flags);
2631 }
2632
2633 /*
2634 --- a/arch/x86/kernel/time_32-xen.c
2635 +++ b/arch/x86/kernel/time_32-xen.c
2636 @@ -80,7 +80,6 @@
2637 #include <asm/i8253.h>
2638 DEFINE_SPINLOCK(i8253_lock);
2639 EXPORT_SYMBOL(i8253_lock);
2640 -int pit_latch_buggy; /* extern */
2641 #else
2642 volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
2643 #endif
2644 @@ -589,7 +588,7 @@
2645 return IRQ_HANDLED;
2646 }
2647
2648 -void mark_tsc_unstable(void)
2649 +void mark_tsc_unstable(char *reason)
2650 {
2651 #ifndef CONFIG_XEN /* XXX Should tell the hypervisor about this fact. */
2652 tsc_unstable = 1;
2653 @@ -597,17 +596,18 @@
2654 }
2655 EXPORT_SYMBOL_GPL(mark_tsc_unstable);
2656
2657 +static cycle_t cs_last;
2658 +
2659 static cycle_t xen_clocksource_read(void)
2660 {
2661 cycle_t ret = sched_clock();
2662
2663 #ifdef CONFIG_SMP
2664 for (;;) {
2665 - static cycle_t last_ret;
2666 #ifndef CONFIG_64BIT
2667 - cycle_t last = cmpxchg64(&last_ret, 0, 0);
2668 + cycle_t last = cmpxchg64(&cs_last, 0, 0);
2669 #else
2670 - cycle_t last = last_ret;
2671 + cycle_t last = cs_last;
2672 #define cmpxchg64 cmpxchg
2673 #endif
2674
2675 @@ -627,7 +627,7 @@
2676 }
2677 ret = last;
2678 }
2679 - if (cmpxchg64(&last_ret, last, ret) == last)
2680 + if (cmpxchg64(&cs_last, last, ret) == last)
2681 break;
2682 }
2683 #endif
2684 @@ -635,6 +635,14 @@
2685 return ret;
2686 }
2687
2688 +static void xen_clocksource_resume(void)
2689 +{
2690 + extern void time_resume(void);
2691 +
2692 + time_resume();
2693 + cs_last = sched_clock();
2694 +}
2695 +
2696 static struct clocksource clocksource_xen = {
2697 .name = "xen",
2698 .rating = 400,
2699 @@ -643,6 +651,7 @@
2700 .mult = 1 << XEN_SHIFT, /* time directly in nanoseconds */
2701 .shift = XEN_SHIFT,
2702 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
2703 + .resume = xen_clocksource_resume,
2704 };
2705
2706 static void init_missing_ticks_accounting(unsigned int cpu)
2707 @@ -731,35 +740,6 @@
2708 mod_timer(&sync_xen_wallclock_timer, jiffies + 1);
2709 }
2710
2711 -static int timer_resume(struct sys_device *dev)
2712 -{
2713 - extern void time_resume(void);
2714 - time_resume();
2715 - return 0;
2716 -}
2717 -
2718 -static struct sysdev_class timer_sysclass = {
2719 - .resume = timer_resume,
2720 - set_kset_name("timer"),
2721 -};
2722 -
2723 -
2724 -/* XXX this driverfs stuff should probably go elsewhere later -john */
2725 -static struct sys_device device_timer = {
2726 - .id = 0,
2727 - .cls = &timer_sysclass,
2728 -};
2729 -
2730 -static int time_init_device(void)
2731 -{
2732 - int error = sysdev_class_register(&timer_sysclass);
2733 - if (!error)
2734 - error = sysdev_register(&device_timer);
2735 - return error;
2736 -}
2737 -
2738 -device_initcall(time_init_device);
2739 -
2740 extern void (*late_time_init)(void);
2741
2742 /* Dynamically-mapped IRQ. */
2743 @@ -772,7 +752,7 @@
2744 VIRQ_TIMER,
2745 0,
2746 timer_interrupt,
2747 - SA_INTERRUPT,
2748 + IRQF_DISABLED,
2749 "timer0",
2750 NULL);
2751 BUG_ON(per_cpu(timer_irq, 0) < 0);
2752 @@ -890,21 +870,21 @@
2753 cpu_clear(smp_processor_id(), nohz_cpu_mask);
2754 }
2755
2756 -void raw_safe_halt(void)
2757 +void xen_safe_halt(void)
2758 {
2759 stop_hz_timer();
2760 /* Blocking includes an implicit local_irq_enable(). */
2761 HYPERVISOR_block();
2762 start_hz_timer();
2763 }
2764 -EXPORT_SYMBOL(raw_safe_halt);
2765 +EXPORT_SYMBOL(xen_safe_halt);
2766
2767 -void halt(void)
2768 +void xen_halt(void)
2769 {
2770 if (irqs_disabled())
2771 VOID(HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL));
2772 }
2773 -EXPORT_SYMBOL(halt);
2774 +EXPORT_SYMBOL(xen_halt);
2775
2776 /* No locking required. Interrupts are disabled on all CPUs. */
2777 void time_resume(void)
2778 @@ -967,7 +947,7 @@
2779 irq = bind_virq_to_irqhandler(VIRQ_TIMER,
2780 cpu,
2781 timer_interrupt,
2782 - SA_INTERRUPT,
2783 + IRQF_DISABLED,
2784 timer_name[cpu],
2785 NULL);
2786 if (irq < 0)
2787 --- a/arch/x86/kernel/traps_32-xen.c
2788 +++ b/arch/x86/kernel/traps_32-xen.c
2789 @@ -52,7 +52,7 @@
2790 #include <asm/unwind.h>
2791 #include <asm/smp.h>
2792 #include <asm/arch_hooks.h>
2793 -#include <asm/kdebug.h>
2794 +#include <linux/kdebug.h>
2795 #include <asm/stacktrace.h>
2796
2797 #include <linux/module.h>
2798 @@ -101,20 +101,6 @@
2799
2800 int kstack_depth_to_print = 24;
2801 static unsigned int code_bytes = 64;
2802 -ATOMIC_NOTIFIER_HEAD(i386die_chain);
2803 -
2804 -int register_die_notifier(struct notifier_block *nb)
2805 -{
2806 - vmalloc_sync_all();
2807 - return atomic_notifier_chain_register(&i386die_chain, nb);
2808 -}
2809 -EXPORT_SYMBOL(register_die_notifier); /* used modular by kdb */
2810 -
2811 -int unregister_die_notifier(struct notifier_block *nb)
2812 -{
2813 - return atomic_notifier_chain_unregister(&i386die_chain, nb);
2814 -}
2815 -EXPORT_SYMBOL(unregister_die_notifier); /* used modular by kdb */
2816
2817 static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
2818 {
2819 @@ -325,7 +311,7 @@
2820 regs->xds & 0xffff, regs->xes & 0xffff, regs->xfs & 0xffff, gs, ss);
2821 printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)",
2822 TASK_COMM_LEN, current->comm, current->pid,
2823 - current_thread_info(), current, current->thread_info);
2824 + current_thread_info(), current, task_thread_info(current));
2825 /*
2826 * When in-kernel, we also print out the stack and code at the
2827 * time of the fault..
2828 @@ -482,8 +468,6 @@
2829 siginfo_t *info)
2830 {
2831 struct task_struct *tsk = current;
2832 - tsk->thread.error_code = error_code;
2833 - tsk->thread.trap_no = trapnr;
2834
2835 if (regs->eflags & VM_MASK) {
2836 if (vm86)
2837 @@ -495,6 +479,18 @@
2838 goto kernel_trap;
2839
2840 trap_signal: {
2841 + /*
2842 + * We want error_code and trap_no set for userspace faults and
2843 + * kernelspace faults which result in die(), but not
2844 + * kernelspace faults which are fixed up. die() gives the
2845 + * process no chance to handle the signal and notice the
2846 + * kernel fault information, so that won't result in polluting
2847 + * the information about previously queued, but not yet
2848 + * delivered, faults. See also do_general_protection below.
2849 + */
2850 + tsk->thread.error_code = error_code;
2851 + tsk->thread.trap_no = trapnr;
2852 +
2853 if (info)
2854 force_sig_info(signr, info, tsk);
2855 else
2856 @@ -503,8 +499,11 @@
2857 }
2858
2859 kernel_trap: {
2860 - if (!fixup_exception(regs))
2861 + if (!fixup_exception(regs)) {
2862 + tsk->thread.error_code = error_code;
2863 + tsk->thread.trap_no = trapnr;
2864 die(str, regs, error_code);
2865 + }
2866 return;
2867 }
2868
2869 @@ -578,9 +577,6 @@
2870 fastcall void __kprobes do_general_protection(struct pt_regs * regs,
2871 long error_code)
2872 {
2873 - current->thread.error_code = error_code;
2874 - current->thread.trap_no = 13;
2875 -
2876 if (regs->eflags & VM_MASK)
2877 goto gp_in_vm86;
2878
2879 @@ -599,6 +595,8 @@
2880
2881 gp_in_kernel:
2882 if (!fixup_exception(regs)) {
2883 + current->thread.error_code = error_code;
2884 + current->thread.trap_no = 13;
2885 if (notify_die(DIE_GPF, "general protection fault", regs,
2886 error_code, 13, SIGSEGV) == NOTIFY_STOP)
2887 return;
2888 @@ -987,9 +985,7 @@
2889 fastcall unsigned long patch_espfix_desc(unsigned long uesp,
2890 unsigned long kesp)
2891 {
2892 - int cpu = smp_processor_id();
2893 - struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
2894 - struct desc_struct *gdt = (struct desc_struct *)cpu_gdt_descr->address;
2895 + struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
2896 unsigned long base = (kesp - uesp) & -THREAD_SIZE;
2897 unsigned long new_kesp = kesp - base;
2898 unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
2899 --- a/arch/x86/kernel/traps_64-xen.c
2900 +++ b/arch/x86/kernel/traps_64-xen.c
2901 @@ -32,6 +32,7 @@
2902 #include <linux/unwind.h>
2903 #include <linux/uaccess.h>
2904 #include <linux/bug.h>
2905 +#include <linux/kdebug.h>
2906
2907 #include <asm/system.h>
2908 #include <asm/io.h>
2909 @@ -39,7 +40,6 @@
2910 #include <asm/debugreg.h>
2911 #include <asm/desc.h>
2912 #include <asm/i387.h>
2913 -#include <asm/kdebug.h>
2914 #include <asm/processor.h>
2915 #include <asm/unwind.h>
2916 #include <asm/smp.h>
2917 @@ -71,22 +71,6 @@
2918 asmlinkage void machine_check(void);
2919 asmlinkage void spurious_interrupt_bug(void);
2920
2921 -ATOMIC_NOTIFIER_HEAD(die_chain);
2922 -EXPORT_SYMBOL(die_chain);
2923 -
2924 -int register_die_notifier(struct notifier_block *nb)
2925 -{
2926 - vmalloc_sync_all();
2927 - return atomic_notifier_chain_register(&die_chain, nb);
2928 -}
2929 -EXPORT_SYMBOL(register_die_notifier); /* used modular by kdb */
2930 -
2931 -int unregister_die_notifier(struct notifier_block *nb)
2932 -{
2933 - return atomic_notifier_chain_unregister(&die_chain, nb);
2934 -}
2935 -EXPORT_SYMBOL(unregister_die_notifier); /* used modular by kdb */
2936 -
2937 static inline void conditional_sti(struct pt_regs *regs)
2938 {
2939 if (regs->eflags & X86_EFLAGS_IF)
2940 @@ -428,8 +412,7 @@
2941 const int cpu = smp_processor_id();
2942 struct task_struct *cur = cpu_pda(cpu)->pcurrent;
2943
2944 - rsp = regs->rsp;
2945 -
2946 + rsp = regs->rsp;
2947 printk("CPU %d ", cpu);
2948 __show_regs(regs);
2949 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
2950 @@ -440,7 +423,6 @@
2951 * time of the fault..
2952 */
2953 if (in_kernel) {
2954 -
2955 printk("Stack: ");
2956 _show_stack(NULL, regs, (unsigned long*)rsp);
2957
2958 @@ -485,13 +467,14 @@
2959
2960 unsigned __kprobes long oops_begin(void)
2961 {
2962 - int cpu = smp_processor_id();
2963 + int cpu;
2964 unsigned long flags;
2965
2966 oops_enter();
2967
2968 /* racy, but better than risking deadlock. */
2969 local_irq_save(flags);
2970 + cpu = smp_processor_id();
2971 if (!spin_trylock(&die_lock)) {
2972 if (cpu == die_owner)
2973 /* nested oops. should stop eventually */;
2974 @@ -585,10 +568,20 @@
2975 {
2976 struct task_struct *tsk = current;
2977
2978 - tsk->thread.error_code = error_code;
2979 - tsk->thread.trap_no = trapnr;
2980 -
2981 if (user_mode(regs)) {
2982 + /*
2983 + * We want error_code and trap_no set for userspace
2984 + * faults and kernelspace faults which result in
2985 + * die(), but not kernelspace faults which are fixed
2986 + * up. die() gives the process no chance to handle
2987 + * the signal and notice the kernel fault information,
2988 + * so that won't result in polluting the information
2989 + * about previously queued, but not yet delivered,
2990 + * faults. See also do_general_protection below.
2991 + */
2992 + tsk->thread.error_code = error_code;
2993 + tsk->thread.trap_no = trapnr;
2994 +
2995 if (exception_trace && unhandled_signal(tsk, signr))
2996 printk(KERN_INFO
2997 "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n",
2998 @@ -609,8 +602,11 @@
2999 fixup = search_exception_tables(regs->rip);
3000 if (fixup)
3001 regs->rip = fixup->fixup;
3002 - else
3003 + else {
3004 + tsk->thread.error_code = error_code;
3005 + tsk->thread.trap_no = trapnr;
3006 die(str, regs, error_code);
3007 + }
3008 return;
3009 }
3010 }
3011 @@ -686,10 +682,10 @@
3012
3013 conditional_sti(regs);
3014
3015 - tsk->thread.error_code = error_code;
3016 - tsk->thread.trap_no = 13;
3017 -
3018 if (user_mode(regs)) {
3019 + tsk->thread.error_code = error_code;
3020 + tsk->thread.trap_no = 13;
3021 +
3022 if (exception_trace && unhandled_signal(tsk, SIGSEGV))
3023 printk(KERN_INFO
3024 "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n",
3025 @@ -708,6 +704,9 @@
3026 regs->rip = fixup->fixup;
3027 return;
3028 }
3029 +
3030 + tsk->thread.error_code = error_code;
3031 + tsk->thread.trap_no = 13;
3032 if (notify_die(DIE_GPF, "general protection fault", regs,
3033 error_code, 13, SIGSEGV) == NOTIFY_STOP)
3034 return;
3035 --- a/arch/x86/kernel/vsyscall_64-xen.c
3036 +++ b/arch/x86/kernel/vsyscall_64-xen.c
3037 @@ -45,14 +45,34 @@
3038
3039 #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
3040 #define __syscall_clobber "r11","rcx","memory"
3041 +#define __pa_vsymbol(x) \
3042 + ({unsigned long v; \
3043 + extern char __vsyscall_0; \
3044 + asm("" : "=r" (v) : "0" (x)); \
3045 + ((v - VSYSCALL_FIRST_PAGE) + __pa_symbol(&__vsyscall_0)); })
3046
3047 +/*
3048 + * vsyscall_gtod_data contains data that is :
3049 + * - readonly from vsyscalls
3050 + * - written by timer interrupt or sysctl (/proc/sys/kernel/vsyscall64)
3051 + * Try to keep this structure as small as possible to avoid cache line ping pongs
3052 + */
3053 struct vsyscall_gtod_data_t {
3054 - seqlock_t lock;
3055 - int sysctl_enabled;
3056 - struct timeval wall_time_tv;
3057 + seqlock_t lock;
3058 +
3059 + /* open coded 'struct timespec' */
3060 + time_t wall_time_sec;
3061 + u32 wall_time_nsec;
3062 +
3063 + int sysctl_enabled;
3064 struct timezone sys_tz;
3065 - cycle_t offset_base;
3066 - struct clocksource clock;
3067 + struct { /* extract of a clocksource struct */
3068 + cycle_t (*vread)(void);
3069 + cycle_t cycle_last;
3070 + cycle_t mask;
3071 + u32 mult;
3072 + u32 shift;
3073 + } clock;
3074 };
3075 int __vgetcpu_mode __section_vgetcpu_mode;
3076
3077 @@ -68,9 +88,13 @@
3078
3079 write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
3080 /* copy vsyscall data */
3081 - vsyscall_gtod_data.clock = *clock;
3082 - vsyscall_gtod_data.wall_time_tv.tv_sec = wall_time->tv_sec;
3083 - vsyscall_gtod_data.wall_time_tv.tv_usec = wall_time->tv_nsec/1000;
3084 + vsyscall_gtod_data.clock.vread = clock->vread;
3085 + vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
3086 + vsyscall_gtod_data.clock.mask = clock->mask;
3087 + vsyscall_gtod_data.clock.mult = clock->mult;
3088 + vsyscall_gtod_data.clock.shift = clock->shift;
3089 + vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
3090 + vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
3091 vsyscall_gtod_data.sys_tz = sys_tz;
3092 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
3093 }
3094 @@ -105,7 +129,8 @@
3095 static __always_inline void do_vgettimeofday(struct timeval * tv)
3096 {
3097 cycle_t now, base, mask, cycle_delta;
3098 - unsigned long seq, mult, shift, nsec_delta;
3099 + unsigned seq;
3100 + unsigned long mult, shift, nsec;
3101 cycle_t (*vread)(void);
3102 do {
3103 seq = read_seqbegin(&__vsyscall_gtod_data.lock);
3104 @@ -121,21 +146,20 @@
3105 mult = __vsyscall_gtod_data.clock.mult;
3106 shift = __vsyscall_gtod_data.clock.shift;
3107
3108 - *tv = __vsyscall_gtod_data.wall_time_tv;
3109 -
3110 + tv->tv_sec = __vsyscall_gtod_data.wall_time_sec;
3111 + nsec = __vsyscall_gtod_data.wall_time_nsec;
3112 } while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
3113
3114 /* calculate interval: */
3115 cycle_delta = (now - base) & mask;
3116 /* convert to nsecs: */
3117 - nsec_delta = (cycle_delta * mult) >> shift;
3118 + nsec += (cycle_delta * mult) >> shift;
3119
3120 - /* convert to usecs and add to timespec: */
3121 - tv->tv_usec += nsec_delta / NSEC_PER_USEC;
3122 - while (tv->tv_usec > USEC_PER_SEC) {
3123 + while (nsec >= NSEC_PER_SEC) {
3124 tv->tv_sec += 1;
3125 - tv->tv_usec -= USEC_PER_SEC;
3126 + nsec -= NSEC_PER_SEC;
3127 }
3128 + tv->tv_usec = nsec / NSEC_PER_USEC;
3129 }
3130
3131 int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
3132 @@ -151,11 +175,16 @@
3133 * unlikely */
3134 time_t __vsyscall(1) vtime(time_t *t)
3135 {
3136 + struct timeval tv;
3137 + time_t result;
3138 if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
3139 return time_syscall(t);
3140 - else if (t)
3141 - *t = __vsyscall_gtod_data.wall_time_tv.tv_sec;
3142 - return __vsyscall_gtod_data.wall_time_tv.tv_sec;
3143 +
3144 + vgettimeofday(&tv, 0);
3145 + result = tv.tv_sec;
3146 + if (t)
3147 + *t = result;
3148 + return result;
3149 }
3150
3151 /* Fast way to get current CPU and node.
3152 @@ -224,10 +253,10 @@
3153 return ret;
3154 /* gcc has some trouble with __va(__pa()), so just do it this
3155 way. */
3156 - map1 = ioremap(__pa_symbol(&vsysc1), 2);
3157 + map1 = ioremap(__pa_vsymbol(&vsysc1), 2);
3158 if (!map1)
3159 return -ENOMEM;
3160 - map2 = ioremap(__pa_symbol(&vsysc2), 2);
3161 + map2 = ioremap(__pa_vsymbol(&vsysc2), 2);
3162 if (!map2) {
3163 ret = -ENOMEM;
3164 goto out;
3165 @@ -304,7 +333,7 @@
3166 cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
3167 {
3168 long cpu = (long)arg;
3169 - if (action == CPU_ONLINE)
3170 + if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
3171 smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1);
3172 return NOTIFY_DONE;
3173 }
3174 --- a/arch/x86/mm/fault_32-xen.c
3175 +++ b/arch/x86/mm/fault_32-xen.c
3176 @@ -14,19 +14,20 @@
3177 #include <linux/mman.h>
3178 #include <linux/mm.h>
3179 #include <linux/smp.h>
3180 -#include <linux/smp_lock.h>
3181 #include <linux/interrupt.h>
3182 #include <linux/init.h>
3183 #include <linux/tty.h>
3184 #include <linux/vt_kern.h> /* For unblank_screen() */
3185 #include <linux/highmem.h>
3186 +#include <linux/bootmem.h> /* for max_low_pfn */
3187 +#include <linux/vmalloc.h>
3188 #include <linux/module.h>
3189 #include <linux/kprobes.h>
3190 #include <linux/uaccess.h>
3191 +#include <linux/kdebug.h>
3192
3193 #include <asm/system.h>
3194 #include <asm/desc.h>
3195 -#include <asm/kdebug.h>
3196 #include <asm/segment.h>
3197
3198 extern void die(const char *,struct pt_regs *,long);
3199 @@ -259,25 +260,20 @@
3200 unsigned long page;
3201
3202 page = read_cr3();
3203 - page = ((unsigned long *) __va(page))[address >> 22];
3204 - if (oops_may_print())
3205 - printk(KERN_ALERT "*pde = ma %08lx pa %08lx\n", page,
3206 - machine_to_phys(page));
3207 + page = ((unsigned long *) __va(page))[address >> PGDIR_SHIFT];
3208 + printk(KERN_ALERT "*pde = ma %08lx pa %08lx\n", page,
3209 + machine_to_phys(page));
3210 /*
3211 * We must not directly access the pte in the highpte
3212 * case if the page table is located in highmem.
3213 * And lets rather not kmap-atomic the pte, just in case
3214 * it's allocated already.
3215 */
3216 -#ifdef CONFIG_HIGHPTE
3217 - if ((page >> PAGE_SHIFT) >= highstart_pfn)
3218 - return;
3219 -#endif
3220 - if ((page & 1) && oops_may_print()) {
3221 - page &= PAGE_MASK;
3222 - address &= 0x003ff000;
3223 - page = machine_to_phys(page);
3224 - page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT];
3225 + if ((machine_to_phys(page) >> PAGE_SHIFT) < max_low_pfn
3226 + && (page & _PAGE_PRESENT)) {
3227 + page = machine_to_phys(page & PAGE_MASK);
3228 + page = ((unsigned long *) __va(page))[(address >> PAGE_SHIFT)
3229 + & (PTRS_PER_PTE - 1)];
3230 printk(KERN_ALERT "*pte = ma %08lx pa %08lx\n", page,
3231 machine_to_phys(page));
3232 }
3233 @@ -581,6 +577,11 @@
3234 bad_area_nosemaphore:
3235 /* User mode accesses just cause a SIGSEGV */
3236 if (error_code & 4) {
3237 + /*
3238 + * It's possible to have interrupts off here.
3239 + */
3240 + local_irq_enable();
3241 +
3242 /*
3243 * Valid to do another page fault here because this one came
3244 * from user space.
3245 @@ -633,7 +634,7 @@
3246 bust_spinlocks(1);
3247
3248 if (oops_may_print()) {
3249 - #ifdef CONFIG_X86_PAE
3250 +#ifdef CONFIG_X86_PAE
3251 if (error_code & 16) {
3252 pte_t *pte = lookup_address(address);
3253
3254 @@ -642,7 +643,7 @@
3255 "NX-protected page - exploit attempt? "
3256 "(uid: %d)\n", current->uid);
3257 }
3258 - #endif
3259 +#endif
3260 if (address < PAGE_SIZE)
3261 printk(KERN_ALERT "BUG: unable to handle kernel NULL "
3262 "pointer dereference");
3263 @@ -652,8 +653,8 @@
3264 printk(" at virtual address %08lx\n",address);
3265 printk(KERN_ALERT " printing eip:\n");
3266 printk("%08lx\n", regs->eip);
3267 + dump_fault_path(address);
3268 }
3269 - dump_fault_path(address);
3270 tsk->thread.cr2 = address;
3271 tsk->thread.trap_no = 14;
3272 tsk->thread.error_code = error_code;
3273 @@ -694,7 +695,6 @@
3274 force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
3275 }
3276
3277 -#if !HAVE_SHARED_KERNEL_PMD
3278 void vmalloc_sync_all(void)
3279 {
3280 /*
3281 @@ -710,6 +710,9 @@
3282 static unsigned long start = TASK_SIZE;
3283 unsigned long address;
3284
3285 + if (SHARED_KERNEL_PMD)
3286 + return;
3287 +
3288 BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
3289 for (address = start;
3290 address >= TASK_SIZE && address < hypervisor_virt_start;
3291 @@ -739,4 +742,3 @@
3292 start = address + (1UL << PMD_SHIFT);
3293 }
3294 }
3295 -#endif
3296 --- a/arch/x86/mm/fault_64-xen.c
3297 +++ b/arch/x86/mm/fault_64-xen.c
3298 @@ -15,22 +15,22 @@
3299 #include <linux/mman.h>
3300 #include <linux/mm.h>
3301 #include <linux/smp.h>
3302 -#include <linux/smp_lock.h>
3303 #include <linux/interrupt.h>
3304 #include <linux/init.h>
3305 #include <linux/tty.h>
3306 #include <linux/vt_kern.h> /* For unblank_screen() */
3307 #include <linux/compiler.h>
3308 +#include <linux/vmalloc.h>
3309 #include <linux/module.h>
3310 #include <linux/kprobes.h>
3311 #include <linux/uaccess.h>
3312 +#include <linux/kdebug.h>
3313
3314 #include <asm/system.h>
3315 #include <asm/pgalloc.h>
3316 #include <asm/smp.h>
3317 #include <asm/tlbflush.h>
3318 #include <asm/proto.h>
3319 -#include <asm/kdebug.h>
3320 #include <asm-generic/sections.h>
3321
3322 /* Page fault error code bits */
3323 @@ -537,6 +537,12 @@
3324 bad_area_nosemaphore:
3325 /* User mode accesses just cause a SIGSEGV */
3326 if (error_code & PF_USER) {
3327 +
3328 + /*
3329 + * It's possible to have interrupts off here.
3330 + */
3331 + local_irq_enable();
3332 +
3333 if (is_prefetch(regs, address, error_code))
3334 return;
3335
3336 @@ -646,7 +652,7 @@
3337 }
3338
3339 DEFINE_SPINLOCK(pgd_lock);
3340 -struct page *pgd_list;
3341 +LIST_HEAD(pgd_list);
3342
3343 void vmalloc_sync_all(void)
3344 {
3345 @@ -666,8 +672,7 @@
3346 if (pgd_none(*pgd_ref))
3347 continue;
3348 spin_lock(&pgd_lock);
3349 - for (page = pgd_list; page;
3350 - page = (struct page *)page->index) {
3351 + list_for_each_entry(page, &pgd_list, lru) {
3352 pgd_t *pgd;
3353 pgd = (pgd_t *)page_address(page) + pgd_index(address);
3354 if (pgd_none(*pgd))
3355 --- a/arch/x86/mm/highmem_32-xen.c
3356 +++ b/arch/x86/mm/highmem_32-xen.c
3357 @@ -26,7 +26,7 @@
3358 * However when holding an atomic kmap it is not legal to sleep, so atomic
3359 * kmaps are appropriate for short, tight code paths only.
3360 */
3361 -static void *__kmap_atomic(struct page *page, enum km_type type, pgprot_t prot)
3362 +void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
3363 {
3364 enum fixed_addresses idx;
3365 unsigned long vaddr;
3366 @@ -49,15 +49,7 @@
3367
3368 void *kmap_atomic(struct page *page, enum km_type type)
3369 {
3370 - return __kmap_atomic(page, type, kmap_prot);
3371 -}
3372 -
3373 -/* Same as kmap_atomic but with PAGE_KERNEL_RO page protection. */
3374 -void *kmap_atomic_pte(struct page *page, enum km_type type)
3375 -{
3376 - return __kmap_atomic(page, type,
3377 - test_bit(PG_pinned, &page->flags)
3378 - ? PAGE_KERNEL_RO : kmap_prot);
3379 + return kmap_atomic_prot(page, type, kmap_prot);
3380 }
3381
3382 void kunmap_atomic(void *kvaddr, enum km_type type)
3383 @@ -80,6 +72,7 @@
3384 #endif
3385 }
3386
3387 + arch_flush_lazy_mmu_mode();
3388 pagefault_enable();
3389 }
3390
3391 @@ -117,6 +110,5 @@
3392 EXPORT_SYMBOL(kmap);
3393 EXPORT_SYMBOL(kunmap);
3394 EXPORT_SYMBOL(kmap_atomic);
3395 -EXPORT_SYMBOL(kmap_atomic_pte);
3396 EXPORT_SYMBOL(kunmap_atomic);
3397 EXPORT_SYMBOL(kmap_atomic_to_page);
3398 --- a/arch/x86/mm/init_32-xen.c
3399 +++ b/arch/x86/mm/init_32-xen.c
3400 @@ -22,6 +22,7 @@
3401 #include <linux/init.h>
3402 #include <linux/highmem.h>
3403 #include <linux/pagemap.h>
3404 +#include <linux/pfn.h>
3405 #include <linux/poison.h>
3406 #include <linux/bootmem.h>
3407 #include <linux/slab.h>
3408 @@ -67,17 +68,19 @@
3409 pmd_t *pmd_table;
3410
3411 #ifdef CONFIG_X86_PAE
3412 - pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
3413 - paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT);
3414 - make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
3415 - set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
3416 - pud = pud_offset(pgd, 0);
3417 - if (pmd_table != pmd_offset(pud, 0))
3418 - BUG();
3419 -#else
3420 + if (!(__pgd_val(*pgd) & _PAGE_PRESENT)) {
3421 + pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
3422 +
3423 + paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT);
3424 + make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables);
3425 + set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
3426 + pud = pud_offset(pgd, 0);
3427 + if (pmd_table != pmd_offset(pud, 0))
3428 + BUG();
3429 + }
3430 +#endif
3431 pud = pud_offset(pgd, 0);
3432 pmd_table = pmd_offset(pud, 0);
3433 -#endif
3434
3435 return pmd_table;
3436 }
3437 @@ -88,16 +91,18 @@
3438 */
3439 static pte_t * __init one_page_table_init(pmd_t *pmd)
3440 {
3441 +#if CONFIG_XEN_COMPAT <= 0x030002
3442 if (pmd_none(*pmd)) {
3443 +#else
3444 + if (!(__pmd_val(*pmd) & _PAGE_PRESENT)) {
3445 +#endif
3446 pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
3447 +
3448 paravirt_alloc_pt(__pa(page_table) >> PAGE_SHIFT);
3449 make_lowmem_page_readonly(page_table,
3450 XENFEAT_writable_page_tables);
3451 set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
3452 - if (page_table != pte_offset_kernel(pmd, 0))
3453 - BUG();
3454 -
3455 - return page_table;
3456 + BUG_ON(page_table != pte_offset_kernel(pmd, 0));
3457 }
3458
3459 return pte_offset_kernel(pmd, 0);
3460 @@ -117,7 +122,6 @@
3461 static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base)
3462 {
3463 pgd_t *pgd;
3464 - pud_t *pud;
3465 pmd_t *pmd;
3466 int pgd_idx, pmd_idx;
3467 unsigned long vaddr;
3468 @@ -128,12 +132,10 @@
3469 pgd = pgd_base + pgd_idx;
3470
3471 for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
3472 - if (pgd_none(*pgd))
3473 - one_md_table_init(pgd);
3474 - pud = pud_offset(pgd, vaddr);
3475 - pmd = pmd_offset(pud, vaddr);
3476 + pmd = one_md_table_init(pgd);
3477 + pmd = pmd + pmd_index(vaddr);
3478 for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) {
3479 - if (vaddr < hypervisor_virt_start && pmd_none(*pmd))
3480 + if (vaddr < hypervisor_virt_start)
3481 one_page_table_init(pmd);
3482
3483 vaddr += PMD_SIZE;
3484 @@ -196,24 +198,25 @@
3485 /* Map with big pages if possible, otherwise create normal page tables. */
3486 if (cpu_has_pse) {
3487 unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1;
3488 -
3489 if (is_kernel_text(address) || is_kernel_text(address2))
3490 set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC));
3491 else
3492 set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE));
3493 +
3494 pfn += PTRS_PER_PTE;
3495 } else {
3496 pte = one_page_table_init(pmd);
3497
3498 - pte += pte_ofs;
3499 - for (; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) {
3500 - /* XEN: Only map initial RAM allocation. */
3501 - if ((pfn >= max_ram_pfn) || pte_present(*pte))
3502 - continue;
3503 - if (is_kernel_text(address))
3504 - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
3505 - else
3506 - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
3507 + for (pte += pte_ofs;
3508 + pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
3509 + pte++, pfn++, pte_ofs++, address += PAGE_SIZE) {
3510 + /* XEN: Only map initial RAM allocation. */
3511 + if ((pfn >= max_ram_pfn) || pte_present(*pte))
3512 + continue;
3513 + if (is_kernel_text(address))
3514 + set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
3515 + else
3516 + set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
3517 }
3518 pte_ofs = 0;
3519 }
3520 @@ -383,15 +386,44 @@
3521
3522 pgd_t *swapper_pg_dir;
3523
3524 +static void __init xen_pagetable_setup_start(pgd_t *base)
3525 +{
3526 +}
3527 +
3528 +static void __init xen_pagetable_setup_done(pgd_t *base)
3529 +{
3530 +}
3531 +
3532 +/*
3533 + * Build a proper pagetable for the kernel mappings. Up until this
3534 + * point, we've been running on some set of pagetables constructed by
3535 + * the boot process.
3536 + *
3537 + * If we're booting on native hardware, this will be a pagetable
3538 + * constructed in arch/i386/kernel/head.S, and not running in PAE mode
3539 + * (even if we'll end up running in PAE). The root of the pagetable
3540 + * will be swapper_pg_dir.
3541 + *
3542 + * If we're booting paravirtualized under a hypervisor, then there are
3543 + * more options: we may already be running PAE, and the pagetable may
3544 + * or may not be based in swapper_pg_dir. In any case,
3545 + * paravirt_pagetable_setup_start() will set up swapper_pg_dir
3546 + * appropriately for the rest of the initialization to work.
3547 + *
3548 + * In general, pagetable_init() assumes that the pagetable may already
3549 + * be partially populated, and so it avoids stomping on any existing
3550 + * mappings.
3551 + */
3552 static void __init pagetable_init (void)
3553 {
3554 - unsigned long vaddr;
3555 + unsigned long vaddr, end;
3556 pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base;
3557
3558 + xen_pagetable_setup_start(pgd_base);
3559 +
3560 /* Enable PSE if available */
3561 - if (cpu_has_pse) {
3562 + if (cpu_has_pse)
3563 set_in_cr4(X86_CR4_PSE);
3564 - }
3565
3566 /* Enable PGE if available */
3567 if (cpu_has_pge) {
3568 @@ -408,9 +440,12 @@
3569 * created - mappings will be set by set_fixmap():
3570 */
3571 vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
3572 - page_table_range_init(vaddr, hypervisor_virt_start, pgd_base);
3573 + end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
3574 + page_table_range_init(vaddr, end, pgd_base);
3575
3576 permanent_kmaps_init(pgd_base);
3577 +
3578 + xen_pagetable_setup_done(pgd_base);
3579 }
3580
3581 #if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_ACPI_SLEEP)
3582 @@ -757,34 +792,29 @@
3583 EXPORT_SYMBOL_GPL(remove_memory);
3584 #endif
3585
3586 -struct kmem_cache *pgd_cache;
3587 struct kmem_cache *pmd_cache;
3588
3589 void __init pgtable_cache_init(void)
3590 {
3591 + size_t pgd_size = PTRS_PER_PGD*sizeof(pgd_t);
3592 +
3593 if (PTRS_PER_PMD > 1) {
3594 pmd_cache = kmem_cache_create("pmd",
3595 PTRS_PER_PMD*sizeof(pmd_t),
3596 PTRS_PER_PMD*sizeof(pmd_t),
3597 - 0,
3598 + SLAB_PANIC,
3599 pmd_ctor,
3600 NULL);
3601 - if (!pmd_cache)
3602 - panic("pgtable_cache_init(): cannot create pmd cache");
3603 + if (!SHARED_KERNEL_PMD) {
3604 + /* If we're in PAE mode and have a non-shared
3605 + kernel pmd, then the pgd size must be a
3606 + page size. This is because the pgd_list
3607 + links through the page structure, so there
3608 + can only be one pgd per page for this to
3609 + work. */
3610 + pgd_size = PAGE_SIZE;
3611 + }
3612 }
3613 - pgd_cache = kmem_cache_create("pgd",
3614 -#ifndef CONFIG_XEN
3615 - PTRS_PER_PGD*sizeof(pgd_t),
3616 - PTRS_PER_PGD*sizeof(pgd_t),
3617 -#else
3618 - PAGE_SIZE,
3619 - PAGE_SIZE,
3620 -#endif
3621 - 0,
3622 - pgd_ctor,
3623 - PTRS_PER_PMD == 1 ? pgd_dtor : NULL);
3624 - if (!pgd_cache)
3625 - panic("pgtable_cache_init(): Cannot create pgd cache");
3626 }
3627
3628 /*
3629 @@ -818,13 +848,26 @@
3630
3631 void mark_rodata_ro(void)
3632 {
3633 - unsigned long addr = (unsigned long)__start_rodata;
3634 -
3635 - for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE)
3636 - change_page_attr(virt_to_page(addr), 1, PAGE_KERNEL_RO);
3637 + unsigned long start = PFN_ALIGN(_text);
3638 + unsigned long size = PFN_ALIGN(_etext) - start;
3639
3640 - printk("Write protecting the kernel read-only data: %uk\n",
3641 - (__end_rodata - __start_rodata) >> 10);
3642 +#ifndef CONFIG_KPROBES
3643 +#ifdef CONFIG_HOTPLUG_CPU
3644 + /* It must still be possible to apply SMP alternatives. */
3645 + if (num_possible_cpus() <= 1)
3646 +#endif
3647 + {
3648 + change_page_attr(virt_to_page(start),
3649 + size >> PAGE_SHIFT, PAGE_KERNEL_RX);
3650 + printk("Write protecting the kernel text: %luk\n", size >> 10);
3651 + }
3652 +#endif
3653 + start += size;
3654 + size = (unsigned long)__end_rodata - start;
3655 + change_page_attr(virt_to_page(start),
3656 + size >> PAGE_SHIFT, PAGE_KERNEL_RO);
3657 + printk("Write protecting the kernel read-only data: %luk\n",
3658 + size >> 10);
3659
3660 /*
3661 * change_page_attr() requires a global_flush_tlb() call after it.
3662 @@ -847,7 +890,7 @@
3663 free_page(addr);
3664 totalram_pages++;
3665 }
3666 - printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
3667 + printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
3668 }
3669
3670 void free_initmem(void)
3671 --- a/arch/x86/mm/init_64-xen.c
3672 +++ b/arch/x86/mm/init_64-xen.c
3673 @@ -25,10 +25,12 @@
3674 #include <linux/bootmem.h>
3675 #include <linux/proc_fs.h>
3676 #include <linux/pci.h>
3677 +#include <linux/pfn.h>
3678 #include <linux/poison.h>
3679 #include <linux/dma-mapping.h>
3680 #include <linux/module.h>
3681 #include <linux/memory_hotplug.h>
3682 +#include <linux/nmi.h>
3683
3684 #include <asm/processor.h>
3685 #include <asm/system.h>
3686 @@ -51,7 +53,7 @@
3687 #define Dprintk(x...)
3688 #endif
3689
3690 -struct dma_mapping_ops* dma_ops;
3691 +const struct dma_mapping_ops* dma_ops;
3692 EXPORT_SYMBOL(dma_ops);
3693
3694 #if CONFIG_XEN_COMPAT <= 0x030002
3695 @@ -189,6 +191,13 @@
3696
3697 for_each_online_pgdat(pgdat) {
3698 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
3699 + /* this loop can take a while with 256 GB and 4k pages
3700 + so update the NMI watchdog */
3701 + if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) {
3702 + touch_nmi_watchdog();
3703 + }
3704 + if (!pfn_valid(pgdat->node_start_pfn + i))
3705 + continue;
3706 page = pfn_to_page(pgdat->node_start_pfn + i);
3707 total++;
3708 if (PageReserved(page))
3709 @@ -350,7 +359,7 @@
3710 }
3711 }
3712
3713 -unsigned long __initdata table_start, table_end;
3714 +unsigned long __meminitdata table_start, table_end;
3715
3716 static __meminit void *alloc_static_page(unsigned long *phys)
3717 {
3718 @@ -367,7 +376,7 @@
3719 start_pfn++;
3720 memset((void *)va, 0, PAGE_SIZE);
3721 return (void *)va;
3722 -}
3723 +}
3724
3725 #define PTE_SIZE PAGE_SIZE
3726
3727 @@ -408,28 +417,46 @@
3728
3729 #ifndef CONFIG_XEN
3730 /* Must run before zap_low_mappings */
3731 -__init void *early_ioremap(unsigned long addr, unsigned long size)
3732 +__meminit void *early_ioremap(unsigned long addr, unsigned long size)
3733 {
3734 - unsigned long map = round_down(addr, LARGE_PAGE_SIZE);
3735 -
3736 - /* actually usually some more */
3737 - if (size >= LARGE_PAGE_SIZE) {
3738 - return NULL;
3739 + unsigned long vaddr;
3740 + pmd_t *pmd, *last_pmd;
3741 + int i, pmds;
3742 +
3743 + pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
3744 + vaddr = __START_KERNEL_map;
3745 + pmd = level2_kernel_pgt;
3746 + last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
3747 + for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
3748 + for (i = 0; i < pmds; i++) {
3749 + if (pmd_present(pmd[i]))
3750 + goto next;
3751 + }
3752 + vaddr += addr & ~PMD_MASK;
3753 + addr &= PMD_MASK;
3754 + for (i = 0; i < pmds; i++, addr += PMD_SIZE)
3755 + set_pmd(pmd + i,__pmd(addr | _KERNPG_TABLE | _PAGE_PSE));
3756 + __flush_tlb();
3757 + return (void *)vaddr;
3758 + next:
3759 + ;
3760 }
3761 - set_pmd(temp_mappings[0].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
3762 - map += LARGE_PAGE_SIZE;
3763 - set_pmd(temp_mappings[1].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
3764 - __flush_tlb();
3765 - return temp_mappings[0].address + (addr & (LARGE_PAGE_SIZE-1));
3766 + printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
3767 + return NULL;
3768 }
3769
3770 /* To avoid virtual aliases later */
3771 -__init void early_iounmap(void *addr, unsigned long size)
3772 +__meminit void early_iounmap(void *addr, unsigned long size)
3773 {
3774 - if ((void *)round_down((unsigned long)addr, LARGE_PAGE_SIZE) != temp_mappings[0].address)
3775 - printk("early_iounmap: bad address %p\n", addr);
3776 - set_pmd(temp_mappings[0].pmd, __pmd(0));
3777 - set_pmd(temp_mappings[1].pmd, __pmd(0));
3778 + unsigned long vaddr;
3779 + pmd_t *pmd;
3780 + int i, pmds;
3781 +
3782 + vaddr = (unsigned long)addr;
3783 + pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
3784 + pmd = level2_kernel_pgt + pmd_index(vaddr);
3785 + for (i = 0; i < pmds; i++)
3786 + pmd_clear(pmd + i);
3787 __flush_tlb();
3788 }
3789 #endif
3790 @@ -763,14 +790,6 @@
3791 __flush_tlb_all();
3792 }
3793
3794 -void __cpuinit zap_low_mappings(int cpu)
3795 -{
3796 - /* this is not required for Xen */
3797 -#if 0
3798 - swap_low_mappings();
3799 -#endif
3800 -}
3801 -
3802 #ifndef CONFIG_NUMA
3803 void __init paging_init(void)
3804 {
3805 @@ -961,17 +980,6 @@
3806 reservedpages << (PAGE_SHIFT-10),
3807 datasize >> 10,
3808 initsize >> 10);
3809 -
3810 -#ifndef CONFIG_XEN
3811 -#ifdef CONFIG_SMP
3812 - /*
3813 - * Sync boot_level4_pgt mappings with the init_level4_pgt
3814 - * except for the low identity mappings which are already zapped
3815 - * in init_level4_pgt. This sync-up is essential for AP's bringup
3816 - */
3817 - memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t));
3818 -#endif
3819 -#endif
3820 }
3821
3822 void free_init_pages(char *what, unsigned long begin, unsigned long end)
3823 @@ -981,7 +989,7 @@
3824 if (begin >= end)
3825 return;
3826
3827 - printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
3828 + printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
3829 for (addr = begin; addr < end; addr += PAGE_SIZE) {
3830 ClearPageReserved(virt_to_page(addr));
3831 init_page_count(virt_to_page(addr));
3832 @@ -990,24 +998,17 @@
3833 if (addr >= __START_KERNEL_map) {
3834 /* make_readonly() reports all kernel addresses. */
3835 __make_page_writable(__va(__pa(addr)));
3836 - if (HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) {
3837 - pgd_t *pgd = pgd_offset_k(addr);
3838 - pud_t *pud = pud_offset(pgd, addr);
3839 - pmd_t *pmd = pmd_offset(pud, addr);
3840 - pte_t *pte = pte_offset_kernel(pmd, addr);
3841 -
3842 - xen_l1_entry_update(pte, __pte(0)); /* fallback */
3843 - }
3844 + change_page_attr_addr(addr, 1, __pgprot(0));
3845 }
3846 free_page(addr);
3847 totalram_pages++;
3848 }
3849 + if (addr > __START_KERNEL_map)
3850 + global_flush_tlb();
3851 }
3852
3853 void free_initmem(void)
3854 {
3855 - memset(__initdata_begin, POISON_FREE_INITDATA,
3856 - __initdata_end - __initdata_begin);
3857 free_init_pages("unused kernel memory",
3858 (unsigned long)(&__init_begin),
3859 (unsigned long)(&__init_end));
3860 @@ -1017,13 +1018,28 @@
3861
3862 void mark_rodata_ro(void)
3863 {
3864 - unsigned long addr = (unsigned long)__start_rodata;
3865 + unsigned long start = (unsigned long)_stext, end;
3866 +
3867 +#ifdef CONFIG_HOTPLUG_CPU
3868 + /* It must still be possible to apply SMP alternatives. */
3869 + if (num_possible_cpus() > 1)
3870 + start = (unsigned long)_etext;
3871 +#endif
3872 +
3873 +#ifdef CONFIG_KPROBES
3874 + start = (unsigned long)__start_rodata;
3875 +#endif
3876 +
3877 + end = (unsigned long)__end_rodata;
3878 + start = (start + PAGE_SIZE - 1) & PAGE_MASK;
3879 + end &= PAGE_MASK;
3880 + if (end <= start)
3881 + return;
3882
3883 - for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE)
3884 - change_page_attr_addr(addr, 1, PAGE_KERNEL_RO);
3885 + change_page_attr_addr(start, (end - start) >> PAGE_SHIFT, PAGE_KERNEL_RO);
3886
3887 - printk ("Write protecting the kernel read-only data: %luk\n",
3888 - (__end_rodata - __start_rodata) >> 10);
3889 + printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
3890 + (end - start) >> 10);
3891
3892 /*
3893 * change_page_attr_addr() requires a global_flush_tlb() call after it.
3894 @@ -1176,3 +1192,11 @@
3895 {
3896 return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
3897 }
3898 +
3899 +#ifndef CONFIG_XEN
3900 +void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
3901 +{
3902 + return __alloc_bootmem_core(pgdat->bdata, size,
3903 + SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0);
3904 +}
3905 +#endif
3906 --- a/arch/x86/mm/ioremap_32-xen.c
3907 +++ b/arch/x86/mm/ioremap_32-xen.c
3908 @@ -13,6 +13,7 @@
3909 #include <linux/slab.h>
3910 #include <linux/module.h>
3911 #include <linux/io.h>
3912 +#include <linux/sched.h>
3913 #include <asm/fixmap.h>
3914 #include <asm/cacheflush.h>
3915 #include <asm/tlbflush.h>
3916 --- a/arch/x86/mm/pageattr_64-xen.c
3917 +++ b/arch/x86/mm/pageattr_64-xen.c
3918 @@ -215,13 +215,13 @@
3919 preempt_enable();
3920 }
3921
3922 -void _arch_dup_mmap(struct mm_struct *mm)
3923 +void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
3924 {
3925 if (!mm->context.pinned)
3926 mm_pin(mm);
3927 }
3928
3929 -void _arch_exit_mmap(struct mm_struct *mm)
3930 +void arch_exit_mmap(struct mm_struct *mm)
3931 {
3932 struct task_struct *tsk = current;
3933
3934 @@ -337,10 +337,11 @@
3935 struct page *pg;
3936
3937 /* When clflush is available always use it because it is
3938 - much cheaper than WBINVD */
3939 - if (!cpu_has_clflush)
3940 + much cheaper than WBINVD. Disable clflush for now because
3941 + the high level code is not ready yet */
3942 + if (1 || !cpu_has_clflush)
3943 asm volatile("wbinvd" ::: "memory");
3944 - list_for_each_entry(pg, l, lru) {
3945 + else list_for_each_entry(pg, l, lru) {
3946 void *adr = page_address(pg);
3947 if (cpu_has_clflush)
3948 cache_flush_page(adr);
3949 @@ -454,16 +455,24 @@
3950 */
3951 int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
3952 {
3953 - int err = 0;
3954 + int err = 0, kernel_map = 0;
3955 int i;
3956
3957 + if (address >= __START_KERNEL_map
3958 + && address < __START_KERNEL_map + KERNEL_TEXT_SIZE) {
3959 + address = (unsigned long)__va(__pa(address));
3960 + kernel_map = 1;
3961 + }
3962 +
3963 down_write(&init_mm.mmap_sem);
3964 for (i = 0; i < numpages; i++, address += PAGE_SIZE) {
3965 unsigned long pfn = __pa(address) >> PAGE_SHIFT;
3966
3967 - err = __change_page_attr(address, pfn, prot, PAGE_KERNEL);
3968 - if (err)
3969 - break;
3970 + if (!kernel_map || pte_present(pfn_pte(0, prot))) {
3971 + err = __change_page_attr(address, pfn, prot, PAGE_KERNEL);
3972 + if (err)
3973 + break;
3974 + }
3975 /* Handle kernel mapping too which aliases part of the
3976 * lowmem */
3977 if (__pa(address) < KERNEL_TEXT_SIZE) {
3978 --- a/arch/x86/mm/pgtable_32-xen.c
3979 +++ b/arch/x86/mm/pgtable_32-xen.c
3980 @@ -13,6 +13,7 @@
3981 #include <linux/pagemap.h>
3982 #include <linux/spinlock.h>
3983 #include <linux/module.h>
3984 +#include <linux/quicklist.h>
3985
3986 #include <asm/system.h>
3987 #include <asm/pgtable.h>
3988 @@ -212,8 +213,6 @@
3989 * against pageattr.c; it is the unique case in which a valid change
3990 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
3991 * vmalloc faults work because attached pagetables are never freed.
3992 - * The locking scheme was chosen on the basis of manfred's
3993 - * recommendations and having no core impact whatsoever.
3994 * -- wli
3995 */
3996 DEFINE_SPINLOCK(pgd_lock);
3997 @@ -239,37 +238,59 @@
3998 set_page_private(next, (unsigned long)pprev);
3999 }
4000
4001 -void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused)
4002 +
4003 +
4004 +#if (PTRS_PER_PMD == 1)
4005 +/* Non-PAE pgd constructor */
4006 +void pgd_ctor(void *pgd)
4007 {
4008 unsigned long flags;
4009
4010 - if (PTRS_PER_PMD > 1) {
4011 - if (HAVE_SHARED_KERNEL_PMD)
4012 - clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
4013 - swapper_pg_dir + USER_PTRS_PER_PGD,
4014 - KERNEL_PGD_PTRS);
4015 - } else {
4016 - spin_lock_irqsave(&pgd_lock, flags);
4017 + /* !PAE, no pagetable sharing */
4018 + memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
4019 +
4020 + spin_lock_irqsave(&pgd_lock, flags);
4021 +
4022 + /* must happen under lock */
4023 + clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
4024 + swapper_pg_dir + USER_PTRS_PER_PGD,
4025 + KERNEL_PGD_PTRS);
4026 +
4027 + paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
4028 + __pa(swapper_pg_dir) >> PAGE_SHIFT,
4029 + USER_PTRS_PER_PGD,
4030 + KERNEL_PGD_PTRS);
4031 + pgd_list_add(pgd);
4032 + spin_unlock_irqrestore(&pgd_lock, flags);
4033 +}
4034 +#else /* PTRS_PER_PMD > 1 */
4035 +/* PAE pgd constructor */
4036 +void pgd_ctor(void *pgd)
4037 +{
4038 + /* PAE, kernel PMD may be shared */
4039 +
4040 + if (SHARED_KERNEL_PMD) {
4041 clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
4042 swapper_pg_dir + USER_PTRS_PER_PGD,
4043 KERNEL_PGD_PTRS);
4044 - memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
4045 -
4046 - /* must happen under lock */
4047 - paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
4048 - __pa(swapper_pg_dir) >> PAGE_SHIFT,
4049 - USER_PTRS_PER_PGD, PTRS_PER_PGD - USER_PTRS_PER_PGD);
4050 + } else {
4051 + unsigned long flags;
4052
4053 + memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
4054 + spin_lock_irqsave(&pgd_lock, flags);
4055 pgd_list_add(pgd);
4056 spin_unlock_irqrestore(&pgd_lock, flags);
4057 }
4058 }
4059 +#endif /* PTRS_PER_PMD */
4060
4061 -/* never called when PTRS_PER_PMD > 1 */
4062 -void pgd_dtor(void *pgd, struct kmem_cache *cache, unsigned long unused)
4063 +void pgd_dtor(void *pgd)
4064 {
4065 unsigned long flags; /* can be called from interrupt context */
4066
4067 + if (SHARED_KERNEL_PMD)
4068 + return;
4069 +
4070 paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT);
4071 spin_lock_irqsave(&pgd_lock, flags);
4072 pgd_list_del(pgd);
4073 @@ -278,11 +299,46 @@
4074 pgd_test_and_unpin(pgd);
4075 }
4076
4077 +#define UNSHARED_PTRS_PER_PGD \
4078 + (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
4079 +
4080 +/* If we allocate a pmd for part of the kernel address space, then
4081 + make sure its initialized with the appropriate kernel mappings.
4082 + Otherwise use a cached zeroed pmd. */
4083 +static pmd_t *pmd_cache_alloc(int idx)
4084 +{
4085 + pmd_t *pmd;
4086 +
4087 + if (idx >= USER_PTRS_PER_PGD) {
4088 + pmd = (pmd_t *)__get_free_page(GFP_KERNEL);
4089 +
4090 +#ifndef CONFIG_XEN
4091 + if (pmd)
4092 + memcpy(pmd,
4093 + (void *)pgd_page_vaddr(swapper_pg_dir[idx]),
4094 + sizeof(pmd_t) * PTRS_PER_PMD);
4095 +#endif
4096 + } else
4097 + pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
4098 +
4099 + return pmd;
4100 +}
4101 +
4102 +static void pmd_cache_free(pmd_t *pmd, int idx)
4103 +{
4104 + if (idx >= USER_PTRS_PER_PGD) {
4105 + make_lowmem_page_writable(pmd, XENFEAT_writable_page_tables);
4106 + memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
4107 + free_page((unsigned long)pmd);
4108 + } else
4109 + kmem_cache_free(pmd_cache, pmd);
4110 +}
4111 +
4112 pgd_t *pgd_alloc(struct mm_struct *mm)
4113 {
4114 int i;
4115 - pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
4116 - pmd_t **pmd;
4117 + pgd_t *pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor);
4118 + pmd_t **pmds = NULL;
4119 unsigned long flags;
4120
4121 pgd_test_and_unpin(pgd);
4122 @@ -290,37 +346,40 @@
4123 if (PTRS_PER_PMD == 1 || !pgd)
4124 return pgd;
4125
4126 - if (HAVE_SHARED_KERNEL_PMD) {
4127 - for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
4128 - pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
4129 - if (!pmd)
4130 - goto out_oom;
4131 - paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT);
4132 - set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
4133 +#ifdef CONFIG_XEN
4134 + if (!SHARED_KERNEL_PMD) {
4135 + /*
4136 + * We can race save/restore (if we sleep during a GFP_KERNEL memory
4137 + * allocation). We therefore store virtual addresses of pmds as they
4138 + * do not change across save/restore, and poke the machine addresses
4139 + * into the pgdir under the pgd_lock.
4140 + */
4141 + pmds = kmalloc(PTRS_PER_PGD * sizeof(pmd_t *), GFP_KERNEL);
4142 + if (!pmds) {
4143 + quicklist_free(0, pgd_dtor, pgd);
4144 + return NULL;
4145 }
4146 - return pgd;
4147 - }
4148 -
4149 - /*
4150 - * We can race save/restore (if we sleep during a GFP_KERNEL memory
4151 - * allocation). We therefore store virtual addresses of pmds as they
4152 - * do not change across save/restore, and poke the machine addresses
4153 - * into the pgdir under the pgd_lock.
4154 - */
4155 - pmd = kmalloc(PTRS_PER_PGD * sizeof(pmd_t *), GFP_KERNEL);
4156 - if (!pmd) {
4157 - kmem_cache_free(pgd_cache, pgd);
4158 - return NULL;
4159 }
4160 +#endif
4161
4162 /* Allocate pmds, remember virtual addresses. */
4163 - for (i = 0; i < PTRS_PER_PGD; ++i) {
4164 - pmd[i] = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
4165 - if (!pmd[i])
4166 + for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
4167 + pmd_t *pmd = pmd_cache_alloc(i);
4168 +
4169 + if (!pmd)
4170 goto out_oom;
4171 +
4172 paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT);
4173 + if (pmds)
4174 + pmds[i] = pmd;
4175 + else
4176 + set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
4177 }
4178
4179 +#ifdef CONFIG_XEN
4180 + if (SHARED_KERNEL_PMD)
4181 + return pgd;
4182 +
4183 spin_lock_irqsave(&pgd_lock, flags);
4184
4185 /* Protect against save/restore: move below 4GB under pgd_lock. */
4186 @@ -335,44 +394,40 @@
4187
4188 /* Copy kernel pmd contents and write-protect the new pmds. */
4189 for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
4190 - unsigned long v = (unsigned long)i << PGDIR_SHIFT;
4191 - pgd_t *kpgd = pgd_offset_k(v);
4192 - pud_t *kpud = pud_offset(kpgd, v);
4193 - pmd_t *kpmd = pmd_offset(kpud, v);
4194 - memcpy(pmd[i], kpmd, PAGE_SIZE);
4195 + memcpy(pmds[i],
4196 + (void *)pgd_page_vaddr(swapper_pg_dir[i]),
4197 + sizeof(pmd_t) * PTRS_PER_PMD);
4198 make_lowmem_page_readonly(
4199 - pmd[i], XENFEAT_writable_page_tables);
4200 + pmds[i], XENFEAT_writable_page_tables);
4201 }
4202
4203 /* It is safe to poke machine addresses of pmds under the pmd_lock. */
4204 for (i = 0; i < PTRS_PER_PGD; i++)
4205 - set_pgd(&pgd[i], __pgd(1 + __pa(pmd[i])));
4206 -
4207 - /* Ensure this pgd gets picked up and pinned on save/restore. */
4208 - pgd_list_add(pgd);
4209 + set_pgd(&pgd[i], __pgd(1 + __pa(pmds[i])));
4210
4211 spin_unlock_irqrestore(&pgd_lock, flags);
4212
4213 - kfree(pmd);
4214 + kfree(pmds);
4215 +#endif
4216
4217 return pgd;
4218
4219 out_oom:
4220 - if (HAVE_SHARED_KERNEL_PMD) {
4221 + if (!pmds) {
4222 for (i--; i >= 0; i--) {
4223 pgd_t pgdent = pgd[i];
4224 void* pmd = (void *)__va(pgd_val(pgdent)-1);
4225 paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
4226 - kmem_cache_free(pmd_cache, pmd);
4227 + pmd_cache_free(pmd, i);
4228 }
4229 } else {
4230 for (i--; i >= 0; i--) {
4231 - paravirt_release_pd(__pa(pmd[i]) >> PAGE_SHIFT);
4232 - kmem_cache_free(pmd_cache, pmd[i]);
4233 + paravirt_release_pd(__pa(pmds[i]) >> PAGE_SHIFT);
4234 + pmd_cache_free(pmds[i], i);
4235 }
4236 - kfree(pmd);
4237 + kfree(pmds);
4238 }
4239 - kmem_cache_free(pgd_cache, pgd);
4240 + quicklist_free(0, pgd_dtor, pgd);
4241 return NULL;
4242 }
4243
4244 @@ -392,35 +447,24 @@
4245
4246 /* in the PAE case user pgd entries are overwritten before usage */
4247 if (PTRS_PER_PMD > 1) {
4248 - for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
4249 + for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) {
4250 pgd_t pgdent = pgd[i];
4251 void* pmd = (void *)__va(pgd_val(pgdent)-1);
4252 paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
4253 - kmem_cache_free(pmd_cache, pmd);
4254 + pmd_cache_free(pmd, i);
4255 }
4256
4257 - if (!HAVE_SHARED_KERNEL_PMD) {
4258 - unsigned long flags;
4259 - spin_lock_irqsave(&pgd_lock, flags);
4260 - pgd_list_del(pgd);
4261 - spin_unlock_irqrestore(&pgd_lock, flags);
4262 -
4263 - for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
4264 - pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
4265 - make_lowmem_page_writable(
4266 - pmd, XENFEAT_writable_page_tables);
4267 - memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
4268 - kmem_cache_free(pmd_cache, pmd);
4269 - }
4270 -
4271 - if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
4272 - xen_destroy_contiguous_region(
4273 - (unsigned long)pgd, 0);
4274 - }
4275 + if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
4276 + xen_destroy_contiguous_region((unsigned long)pgd, 0);
4277 }
4278
4279 /* in the non-PAE case, free_pgtables() clears user pgd entries */
4280 - kmem_cache_free(pgd_cache, pgd);
4281 + quicklist_free(0, pgd_dtor, pgd);
4282 +}
4283 +
4284 +void check_pgt_cache(void)
4285 +{
4286 + quicklist_trim(0, pgd_dtor, 25, 16);
4287 }
4288
4289 void make_lowmem_page_readonly(void *va, unsigned int feature)
4290 @@ -717,13 +761,13 @@
4291 spin_unlock_irqrestore(&pgd_lock, flags);
4292 }
4293
4294 -void _arch_dup_mmap(struct mm_struct *mm)
4295 +void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
4296 {
4297 if (!test_bit(PG_pinned, &virt_to_page(mm->pgd)->flags))
4298 mm_pin(mm);
4299 }
4300
4301 -void _arch_exit_mmap(struct mm_struct *mm)
4302 +void arch_exit_mmap(struct mm_struct *mm)
4303 {
4304 struct task_struct *tsk = current;
4305
4306 --- a/drivers/char/tpm/tpm_xen.c
4307 +++ b/drivers/char/tpm/tpm_xen.c
4308 @@ -463,7 +463,7 @@
4309 tp->backend_id = domid;
4310
4311 err = bind_listening_port_to_irqhandler(
4312 - domid, tpmif_int, SA_SAMPLE_RANDOM, "tpmif", tp);
4313 + domid, tpmif_int, IRQF_SAMPLE_RANDOM, "tpmif", tp);
4314 if (err <= 0) {
4315 WPRINTK("bind_listening_port_to_irqhandler failed "
4316 "(err=%d)\n", err);
4317 --- a/drivers/xen/blkfront/blkfront.c
4318 +++ b/drivers/xen/blkfront/blkfront.c
4319 @@ -236,7 +236,7 @@
4320 info->ring_ref = err;
4321
4322 err = bind_listening_port_to_irqhandler(
4323 - dev->otherend_id, blkif_int, SA_SAMPLE_RANDOM, "blkif", info);
4324 + dev->otherend_id, blkif_int, IRQF_SAMPLE_RANDOM, "blkif", info);
4325 if (err <= 0) {
4326 xenbus_dev_fatal(dev, err,
4327 "bind_listening_port_to_irqhandler");
4328 --- a/drivers/xen/char/mem.c
4329 +++ b/drivers/xen/char/mem.c
4330 @@ -18,7 +18,6 @@
4331 #include <linux/raw.h>
4332 #include <linux/tty.h>
4333 #include <linux/capability.h>
4334 -#include <linux/smp_lock.h>
4335 #include <linux/ptrace.h>
4336 #include <linux/device.h>
4337 #include <asm/pgalloc.h>
4338 --- a/drivers/xen/core/hypervisor_sysfs.c
4339 +++ b/drivers/xen/core/hypervisor_sysfs.c
4340 @@ -50,7 +50,7 @@
4341 if (!is_running_on_xen())
4342 return -ENODEV;
4343
4344 - hypervisor_subsys.kset.kobj.ktype = &hyp_sysfs_kobj_type;
4345 + hypervisor_subsys.kobj.ktype = &hyp_sysfs_kobj_type;
4346 return 0;
4347 }
4348
4349 --- a/drivers/xen/core/smpboot.c
4350 +++ b/drivers/xen/core/smpboot.c
4351 @@ -121,7 +121,7 @@
4352 rc = bind_ipi_to_irqhandler(RESCHEDULE_VECTOR,
4353 cpu,
4354 smp_reschedule_interrupt,
4355 - SA_INTERRUPT,
4356 + IRQF_DISABLED,
4357 resched_name[cpu],
4358 NULL);
4359 if (rc < 0)
4360 @@ -132,7 +132,7 @@
4361 rc = bind_ipi_to_irqhandler(CALL_FUNCTION_VECTOR,
4362 cpu,
4363 smp_call_function_interrupt,
4364 - SA_INTERRUPT,
4365 + IRQF_DISABLED,
4366 callfunc_name[cpu],
4367 NULL);
4368 if (rc < 0)
4369 @@ -165,13 +165,12 @@
4370
4371 void __cpuinit cpu_bringup(void)
4372 {
4373 + cpu_init();
4374 #ifdef __i386__
4375 - cpu_set_gdt(current_thread_info()->cpu);
4376 - secondary_cpu_init();
4377 + identify_secondary_cpu(cpu_data + smp_processor_id());
4378 #else
4379 - cpu_init();
4380 -#endif
4381 identify_cpu(cpu_data + smp_processor_id());
4382 +#endif
4383 touch_softlockup_watchdog();
4384 preempt_disable();
4385 local_irq_enable();
4386 @@ -191,11 +190,6 @@
4387 static DEFINE_SPINLOCK(ctxt_lock);
4388
4389 struct task_struct *idle = idle_task(cpu);
4390 -#ifdef __x86_64__
4391 - struct desc_ptr *gdt_descr = &cpu_gdt_descr[cpu];
4392 -#else
4393 - struct Xgt_desc_struct *gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
4394 -#endif
4395
4396 if (cpu_test_and_set(cpu, cpu_initialized_map))
4397 return;
4398 @@ -218,11 +212,11 @@
4399 smp_trap_init(ctxt.trap_ctxt);
4400
4401 ctxt.ldt_ents = 0;
4402 -
4403 - ctxt.gdt_frames[0] = virt_to_mfn(gdt_descr->address);
4404 - ctxt.gdt_ents = gdt_descr->size / 8;
4405 + ctxt.gdt_ents = GDT_SIZE / 8;
4406
4407 #ifdef __i386__
4408 + ctxt.gdt_frames[0] = virt_to_mfn(get_cpu_gdt_table(cpu));
4409 +
4410 ctxt.user_regs.cs = __KERNEL_CS;
4411 ctxt.user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs);
4412
4413 @@ -235,7 +229,11 @@
4414 ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
4415
4416 ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
4417 +
4418 + ctxt.user_regs.fs = __KERNEL_PERCPU;
4419 #else /* __x86_64__ */
4420 + ctxt.gdt_frames[0] = virt_to_mfn(cpu_gdt_descr[cpu].address);
4421 +
4422 ctxt.user_regs.cs = __KERNEL_CS;
4423 ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs);
4424
4425 @@ -265,9 +263,8 @@
4426 struct vcpu_get_physid cpu_id;
4427 #ifdef __x86_64__
4428 struct desc_ptr *gdt_descr;
4429 -#else
4430 - struct Xgt_desc_struct *gdt_descr;
4431 #endif
4432 + void *gdt_addr;
4433
4434 apicid = 0;
4435 if (HYPERVISOR_vcpu_op(VCPUOP_get_physid, 0, &cpu_id) == 0)
4436 @@ -317,14 +314,12 @@
4437 }
4438 gdt_descr->size = GDT_SIZE;
4439 memcpy((void *)gdt_descr->address, cpu_gdt_table, GDT_SIZE);
4440 + gdt_addr = (void *)gdt_descr->address;
4441 #else
4442 - if (unlikely(!init_gdt(cpu, idle)))
4443 - continue;
4444 - gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
4445 + init_gdt(cpu);
4446 + gdt_addr = get_cpu_gdt_table(cpu);
4447 #endif
4448 - make_page_readonly(
4449 - (void *)gdt_descr->address,
4450 - XENFEAT_writable_descriptor_tables);
4451 + make_page_readonly(gdt_addr, XENFEAT_writable_descriptor_tables);
4452
4453 apicid = cpu;
4454 if (HYPERVISOR_vcpu_op(VCPUOP_get_physid, cpu, &cpu_id) == 0)
4455 @@ -338,7 +333,9 @@
4456 #ifdef __x86_64__
4457 cpu_pda(cpu)->pcurrent = idle;
4458 cpu_pda(cpu)->cpunumber = cpu;
4459 - clear_ti_thread_flag(idle->thread_info, TIF_FORK);
4460 + clear_ti_thread_flag(task_thread_info(idle), TIF_FORK);
4461 +#else
4462 + per_cpu(current_task, cpu) = idle;
4463 #endif
4464
4465 irq_ctx_init(cpu);
4466 @@ -363,8 +360,12 @@
4467 #endif
4468 }
4469
4470 -void __devinit smp_prepare_boot_cpu(void)
4471 +void __init smp_prepare_boot_cpu(void)
4472 {
4473 +#ifdef __i386__
4474 + init_gdt(smp_processor_id());
4475 + switch_to_new_gdt();
4476 +#endif
4477 prefill_possible_map();
4478 }
4479
4480 --- a/drivers/xen/core/xen_sysfs.c
4481 +++ b/drivers/xen/core/xen_sysfs.c
4482 @@ -28,12 +28,12 @@
4483
4484 static int __init xen_sysfs_type_init(void)
4485 {
4486 - return sysfs_create_file(&hypervisor_subsys.kset.kobj, &type_attr.attr);
4487 + return sysfs_create_file(&hypervisor_subsys.kobj, &type_attr.attr);
4488 }
4489
4490 static void xen_sysfs_type_destroy(void)
4491 {
4492 - sysfs_remove_file(&hypervisor_subsys.kset.kobj, &type_attr.attr);
4493 + sysfs_remove_file(&hypervisor_subsys.kobj, &type_attr.attr);
4494 }
4495
4496 /* xen version attributes */
4497 @@ -89,13 +89,13 @@
4498
4499 static int __init xen_sysfs_version_init(void)
4500 {
4501 - return sysfs_create_group(&hypervisor_subsys.kset.kobj,
4502 + return sysfs_create_group(&hypervisor_subsys.kobj,
4503 &version_group);
4504 }
4505
4506 static void xen_sysfs_version_destroy(void)
4507 {
4508 - sysfs_remove_group(&hypervisor_subsys.kset.kobj, &version_group);
4509 + sysfs_remove_group(&hypervisor_subsys.kobj, &version_group);
4510 }
4511
4512 /* UUID */
4513 @@ -125,12 +125,12 @@
4514
4515 static int __init xen_sysfs_uuid_init(void)
4516 {
4517 - return sysfs_create_file(&hypervisor_subsys.kset.kobj, &uuid_attr.attr);
4518 + return sysfs_create_file(&hypervisor_subsys.kobj, &uuid_attr.attr);
4519 }
4520
4521 static void xen_sysfs_uuid_destroy(void)
4522 {
4523 - sysfs_remove_file(&hypervisor_subsys.kset.kobj, &uuid_attr.attr);
4524 + sysfs_remove_file(&hypervisor_subsys.kobj, &uuid_attr.attr);
4525 }
4526
4527 /* xen compilation attributes */
4528 @@ -203,13 +203,13 @@
4529
4530 int __init static xen_compilation_init(void)
4531 {
4532 - return sysfs_create_group(&hypervisor_subsys.kset.kobj,
4533 + return sysfs_create_group(&hypervisor_subsys.kobj,
4534 &xen_compilation_group);
4535 }
4536
4537 static void xen_compilation_destroy(void)
4538 {
4539 - sysfs_remove_group(&hypervisor_subsys.kset.kobj,
4540 + sysfs_remove_group(&hypervisor_subsys.kobj,
4541 &xen_compilation_group);
4542 }
4543
4544 @@ -324,13 +324,13 @@
4545
4546 static int __init xen_properties_init(void)
4547 {
4548 - return sysfs_create_group(&hypervisor_subsys.kset.kobj,
4549 + return sysfs_create_group(&hypervisor_subsys.kobj,
4550 &xen_properties_group);
4551 }
4552
4553 static void xen_properties_destroy(void)
4554 {
4555 - sysfs_remove_group(&hypervisor_subsys.kset.kobj,
4556 + sysfs_remove_group(&hypervisor_subsys.kobj,
4557 &xen_properties_group);
4558 }
4559
4560 --- a/drivers/xen/netback/netback.c
4561 +++ b/drivers/xen/netback/netback.c
4562 @@ -180,7 +180,7 @@
4563 goto err;
4564
4565 skb_reserve(nskb, 16 + NET_IP_ALIGN);
4566 - headlen = nskb->end - nskb->data;
4567 + headlen = skb_end_pointer(nskb) - nskb->data;
4568 if (headlen > skb_headlen(skb))
4569 headlen = skb_headlen(skb);
4570 ret = skb_copy_bits(skb, 0, __skb_put(nskb, headlen), headlen);
4571 @@ -226,11 +226,15 @@
4572 len -= copy;
4573 }
4574
4575 +#ifdef NET_SKBUFF_DATA_USES_OFFSET
4576 + offset = 0;
4577 +#else
4578 offset = nskb->data - skb->data;
4579 +#endif
4580
4581 - nskb->h.raw = skb->h.raw + offset;
4582 - nskb->nh.raw = skb->nh.raw + offset;
4583 - nskb->mac.raw = skb->mac.raw + offset;
4584 + nskb->transport_header = skb->transport_header + offset;
4585 + nskb->network_header = skb->network_header + offset;
4586 + nskb->mac_header = skb->mac_header + offset;
4587
4588 return nskb;
4589
4590 @@ -1601,7 +1605,7 @@
4591 (void)bind_virq_to_irqhandler(VIRQ_DEBUG,
4592 0,
4593 netif_be_dbg,
4594 - SA_SHIRQ,
4595 + IRQF_SHARED,
4596 "net-be-dbg",
4597 &netif_be_dbg);
4598 #endif
4599 --- a/drivers/xen/netfront/netfront.c
4600 +++ b/drivers/xen/netfront/netfront.c
4601 @@ -513,7 +513,7 @@
4602 memcpy(netdev->dev_addr, info->mac, ETH_ALEN);
4603
4604 err = bind_listening_port_to_irqhandler(
4605 - dev->otherend_id, netif_int, SA_SAMPLE_RANDOM, netdev->name,
4606 + dev->otherend_id, netif_int, IRQF_SAMPLE_RANDOM, netdev->name,
4607 netdev);
4608 if (err < 0)
4609 goto fail;
4610 --- a/drivers/xen/pciback/xenbus.c
4611 +++ b/drivers/xen/pciback/xenbus.c
4612 @@ -86,7 +86,7 @@
4613
4614 err = bind_interdomain_evtchn_to_irqhandler(
4615 pdev->xdev->otherend_id, remote_evtchn, pciback_handle_event,
4616 - SA_SAMPLE_RANDOM, "pciback", pdev);
4617 + IRQF_SAMPLE_RANDOM, "pciback", pdev);
4618 if (err < 0) {
4619 xenbus_dev_fatal(pdev->xdev, err,
4620 "Error binding event channel to IRQ");
4621 --- a/drivers/xen/pcifront/xenbus.c
4622 +++ b/drivers/xen/pcifront/xenbus.c
4623 @@ -10,10 +10,6 @@
4624 #include <xen/gnttab.h>
4625 #include "pcifront.h"
4626
4627 -#ifndef __init_refok
4628 -#define __init_refok
4629 -#endif
4630 -
4631 #define INVALID_GRANT_REF (0)
4632 #define INVALID_EVTCHN (-1)
4633
4634 --- a/drivers/xen/sfc_netback/accel_fwd.c
4635 +++ b/drivers/xen/sfc_netback/accel_fwd.c
4636 @@ -308,7 +308,7 @@
4637 static inline int packet_is_arp_reply(struct sk_buff *skb)
4638 {
4639 return skb->protocol == ntohs(ETH_P_ARP)
4640 - && skb->nh.arph->ar_op == ntohs(ARPOP_REPLY);
4641 + && arp_hdr(skb)->ar_op == ntohs(ARPOP_REPLY);
4642 }
4643
4644
4645 @@ -392,12 +392,13 @@
4646
4647 BUG_ON(fwd_priv == NULL);
4648
4649 - if (is_broadcast_ether_addr(skb->mac.raw) && packet_is_arp_reply(skb)) {
4650 + if (is_broadcast_ether_addr(skb_mac_header(skb))
4651 + && packet_is_arp_reply(skb)) {
4652 /*
4653 * update our fast path forwarding to reflect this
4654 * gratuitous ARP
4655 */
4656 - mac = skb->mac.raw+ETH_ALEN;
4657 + mac = skb_mac_header(skb)+ETH_ALEN;
4658
4659 DPRINTK("%s: found gratuitous ARP for " MAC_FMT "\n",
4660 __FUNCTION__, MAC_ARG(mac));
4661 --- a/drivers/xen/sfc_netback/accel_solarflare.c
4662 +++ b/drivers/xen/sfc_netback/accel_solarflare.c
4663 @@ -114,7 +114,7 @@
4664 BUG_ON(port == NULL);
4665
4666 NETBACK_ACCEL_STATS_OP(global_stats.dl_tx_packets++);
4667 - if (skb->mac.raw != NULL)
4668 + if (skb_mac_header_was_set(skb))
4669 netback_accel_tx_packet(skb, port->fwd_priv);
4670 else {
4671 DPRINTK("Ignoring packet with missing mac address\n");
4672 --- a/drivers/xen/sfc_netfront/accel_tso.c
4673 +++ b/drivers/xen/sfc_netfront/accel_tso.c
4674 @@ -33,10 +33,9 @@
4675
4676 #include "accel_tso.h"
4677
4678 -#define PTR_DIFF(p1, p2) ((u8*)(p1) - (u8*)(p2))
4679 -#define ETH_HDR_LEN(skb) ((skb)->nh.raw - (skb)->data)
4680 -#define SKB_TCP_OFF(skb) PTR_DIFF ((skb)->h.th, (skb)->data)
4681 -#define SKB_IP_OFF(skb) PTR_DIFF ((skb)->nh.iph, (skb)->data)
4682 +#define ETH_HDR_LEN(skb) skb_network_offset(skb)
4683 +#define SKB_TCP_OFF(skb) skb_transport_offset(skb)
4684 +#define SKB_IP_OFF(skb) skb_network_offset(skb)
4685
4686 /*
4687 * Set a maximum number of buffers in each output packet to make life
4688 @@ -114,9 +113,8 @@
4689 static inline void tso_check_safe(struct sk_buff *skb) {
4690 EPRINTK_ON(skb->protocol != htons (ETH_P_IP));
4691 EPRINTK_ON(((struct ethhdr*) skb->data)->h_proto != htons (ETH_P_IP));
4692 - EPRINTK_ON(skb->nh.iph->protocol != IPPROTO_TCP);
4693 - EPRINTK_ON((SKB_TCP_OFF(skb)
4694 - + (skb->h.th->doff << 2u)) > skb_headlen(skb));
4695 + EPRINTK_ON(ip_hdr(skb)->protocol != IPPROTO_TCP);
4696 + EPRINTK_ON((SKB_TCP_OFF(skb) + tcp_hdrlen(skb)) > skb_headlen(skb));
4697 }
4698
4699
4700 @@ -129,17 +127,17 @@
4701 * All ethernet/IP/TCP headers combined size is TCP header size
4702 * plus offset of TCP header relative to start of packet.
4703 */
4704 - st->p.header_length = (skb->h.th->doff << 2u) + SKB_TCP_OFF(skb);
4705 + st->p.header_length = tcp_hdrlen(skb) + SKB_TCP_OFF(skb);
4706 st->p.full_packet_size = (st->p.header_length
4707 + skb_shinfo(skb)->gso_size);
4708 st->p.gso_size = skb_shinfo(skb)->gso_size;
4709
4710 - st->p.ip_id = htons(skb->nh.iph->id);
4711 - st->seqnum = ntohl(skb->h.th->seq);
4712 + st->p.ip_id = htons(ip_hdr(skb)->id);
4713 + st->seqnum = ntohl(tcp_hdr(skb)->seq);
4714
4715 - EPRINTK_ON(skb->h.th->urg);
4716 - EPRINTK_ON(skb->h.th->syn);
4717 - EPRINTK_ON(skb->h.th->rst);
4718 + EPRINTK_ON(tcp_hdr(skb)->urg);
4719 + EPRINTK_ON(tcp_hdr(skb)->syn);
4720 + EPRINTK_ON(tcp_hdr(skb)->rst);
4721
4722 st->remaining_len = skb->len - st->p.header_length;
4723
4724 @@ -258,8 +256,8 @@
4725 /* This packet will be the last in the TSO burst. */
4726 ip_length = (st->p.header_length - ETH_HDR_LEN(skb)
4727 + st->remaining_len);
4728 - tsoh_th->fin = skb->h.th->fin;
4729 - tsoh_th->psh = skb->h.th->psh;
4730 + tsoh_th->fin = tcp_hdr(skb)->fin;
4731 + tsoh_th->psh = tcp_hdr(skb)->psh;
4732 }
4733
4734 tsoh_iph->tot_len = htons(ip_length);
4735 --- a/drivers/xen/sfc_netfront/accel_vi.c
4736 +++ b/drivers/xen/sfc_netfront/accel_vi.c
4737 @@ -463,7 +463,7 @@
4738
4739 if (skb->ip_summed == CHECKSUM_PARTIAL) {
4740 /* Set to zero to encourage falcon to work it out for us */
4741 - *(u16*)(skb->h.raw + skb->csum_offset) = 0;
4742 + *(u16*)(skb->head + skb->csum_start + skb->csum_offset) = 0;
4743 }
4744
4745 if (multi_post_start_new_buffer(vnic, &state)) {
4746 @@ -582,7 +582,7 @@
4747
4748 if (skb->ip_summed == CHECKSUM_PARTIAL) {
4749 /* Set to zero to encourage falcon to work it out for us */
4750 - *(u16*)(skb->h.raw + skb->csum_offset) = 0;
4751 + *(u16*)(skb->head + skb->csum_start + skb->csum_offset) = 0;
4752 }
4753 NETFRONT_ACCEL_PKTBUFF_FOR_EACH_FRAGMENT
4754 (skb, idx, frag_data, frag_len, {
4755 --- a/drivers/xen/sfc_netfront/accel_xenbus.c
4756 +++ b/drivers/xen/sfc_netfront/accel_xenbus.c
4757 @@ -356,7 +356,7 @@
4758 /* Create xenbus msg event channel */
4759 err = bind_listening_port_to_irqhandler
4760 (dev->otherend_id, netfront_accel_msg_channel_irq_from_bend,
4761 - SA_SAMPLE_RANDOM, "vnicctrl", vnic);
4762 + IRQF_SAMPLE_RANDOM, "vnicctrl", vnic);
4763 if (err < 0) {
4764 EPRINTK("Couldn't bind msg event channel\n");
4765 goto fail_msg_irq;
4766 @@ -367,7 +367,7 @@
4767 /* Create xenbus net event channel */
4768 err = bind_listening_port_to_irqhandler
4769 (dev->otherend_id, netfront_accel_net_channel_irq_from_bend,
4770 - SA_SAMPLE_RANDOM, "vnicfront", vnic);
4771 + IRQF_SAMPLE_RANDOM, "vnicfront", vnic);
4772 if (err < 0) {
4773 EPRINTK("Couldn't bind net event channel\n");
4774 goto fail_net_irq;
4775 --- a/drivers/xen/xenoprof/xenoprofile.c
4776 +++ b/drivers/xen/xenoprof/xenoprofile.c
4777 @@ -236,7 +236,7 @@
4778 result = bind_virq_to_irqhandler(VIRQ_XENOPROF,
4779 i,
4780 xenoprof_ovf_interrupt,
4781 - SA_INTERRUPT,
4782 + IRQF_DISABLED,
4783 "xenoprof",
4784 NULL);
4785
4786 --- a/fs/aio.c
4787 +++ b/fs/aio.c
4788 @@ -38,7 +38,7 @@
4789
4790 #ifdef CONFIG_EPOLL
4791 #include <linux/poll.h>
4792 -#include <linux/eventpoll.h>
4793 +#include <linux/anon_inodes.h>
4794 #endif
4795
4796 #if DEBUG > 1
4797 @@ -1308,7 +1308,7 @@
4798
4799 /* make_aio_fd:
4800 * Create a file descriptor that can be used to poll the event queue.
4801 - * Based and piggybacked on the excellent epoll code.
4802 + * Based on the excellent epoll code.
4803 */
4804
4805 static int make_aio_fd(struct kioctx *ioctx)
4806 @@ -1317,7 +1317,8 @@
4807 struct inode *inode;
4808 struct file *file;
4809
4810 - error = ep_getfd(&fd, &inode, &file, NULL, &aioq_fops);
4811 + error = anon_inode_getfd(&fd, &inode, &file, "[aioq]",
4812 + &aioq_fops, ioctx);
4813 if (error)
4814 return error;
4815
4816 --- a/include/asm-x86/mach-xen/asm/desc_32.h
4817 +++ b/include/asm-x86/mach-xen/asm/desc_32.h
4818 @@ -11,23 +11,24 @@
4819
4820 #include <asm/mmu.h>
4821
4822 -extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
4823 -
4824 struct Xgt_desc_struct {
4825 unsigned short size;
4826 unsigned long address __attribute__((packed));
4827 unsigned short pad;
4828 } __attribute__ ((packed));
4829
4830 -extern struct Xgt_desc_struct idt_descr;
4831 -DECLARE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
4832 -extern struct Xgt_desc_struct early_gdt_descr;
4833 +struct gdt_page
4834 +{
4835 + struct desc_struct gdt[GDT_ENTRIES];
4836 +} __attribute__((aligned(PAGE_SIZE)));
4837 +DECLARE_PER_CPU(struct gdt_page, gdt_page);
4838
4839 static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
4840 {
4841 - return (struct desc_struct *)per_cpu(cpu_gdt_descr, cpu).address;
4842 + return per_cpu(gdt_page, cpu).gdt;
4843 }
4844
4845 +extern struct Xgt_desc_struct idt_descr;
4846 extern struct desc_struct idt_table[];
4847 extern void set_intr_gate(unsigned int irq, void * addr);
4848
4849 @@ -55,53 +56,32 @@
4850 #define DESCTYPE_S 0x10 /* !system */
4851
4852 #ifndef CONFIG_XEN
4853 -#define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8))
4854 -
4855 -#define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
4856 -#define load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr))
4857 +#define load_TR_desc() native_load_tr_desc()
4858 +#define load_gdt(dtr) native_load_gdt(dtr)
4859 +#define load_idt(dtr) native_load_idt(dtr)
4860 #define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr))
4861 #define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt))
4862
4863 -#define store_gdt(dtr) __asm__ ("sgdt %0":"=m" (*dtr))
4864 -#define store_idt(dtr) __asm__ ("sidt %0":"=m" (*dtr))
4865 -#define store_tr(tr) __asm__ ("str %0":"=m" (tr))
4866 +#define store_gdt(dtr) native_store_gdt(dtr)
4867 +#define store_idt(dtr) native_store_idt(dtr)
4868 +#define store_tr(tr) (tr = native_store_tr())
4869 #define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt))
4870 -#endif
4871
4872 -#if TLS_SIZE != 24
4873 -# error update this code.
4874 -#endif
4875 -
4876 -static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
4877 -{
4878 -#define C(i) if (HYPERVISOR_update_descriptor(virt_to_machine(&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]), \
4879 - *(u64 *)&t->tls_array[i]) \
4880 - BUG()
4881 - C(0); C(1); C(2);
4882 -#undef C
4883 -}
4884 +#define load_TLS(t, cpu) native_load_tls(t, cpu)
4885 +#define set_ldt native_set_ldt
4886
4887 -#ifndef CONFIG_XEN
4888 #define write_ldt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
4889 #define write_gdt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
4890 #define write_idt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
4891
4892 -static inline void write_dt_entry(void *dt, int entry, __u32 entry_a, __u32 entry_b)
4893 +static inline void write_dt_entry(struct desc_struct *dt,
4894 + int entry, u32 entry_low, u32 entry_high)
4895 {
4896 - __u32 *lp = (__u32 *)((char *)dt + entry*8);
4897 - *lp = entry_a;
4898 - *(lp+1) = entry_b;
4899 + dt[entry].a = entry_low;
4900 + dt[entry].b = entry_high;
4901 }
4902 -#define set_ldt native_set_ldt
4903 -#else
4904 -extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b);
4905 -extern int write_gdt_entry(void *gdt, int entry, __u32 entry_a, __u32 entry_b);
4906 -#define set_ldt xen_set_ldt
4907 -#endif
4908
4909 -#ifndef CONFIG_XEN
4910 -static inline fastcall void native_set_ldt(const void *addr,
4911 - unsigned int entries)
4912 +static inline void native_set_ldt(const void *addr, unsigned int entries)
4913 {
4914 if (likely(entries == 0))
4915 __asm__ __volatile__("lldt %w0"::"q" (0));
4916 @@ -116,6 +96,65 @@
4917 __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
4918 }
4919 }
4920 +
4921 +
4922 +static inline void native_load_tr_desc(void)
4923 +{
4924 + asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
4925 +}
4926 +
4927 +static inline void native_load_gdt(const struct Xgt_desc_struct *dtr)
4928 +{
4929 + asm volatile("lgdt %0"::"m" (*dtr));
4930 +}
4931 +
4932 +static inline void native_load_idt(const struct Xgt_desc_struct *dtr)
4933 +{
4934 + asm volatile("lidt %0"::"m" (*dtr));
4935 +}
4936 +
4937 +static inline void native_store_gdt(struct Xgt_desc_struct *dtr)
4938 +{
4939 + asm ("sgdt %0":"=m" (*dtr));
4940 +}
4941 +
4942 +static inline void native_store_idt(struct Xgt_desc_struct *dtr)
4943 +{
4944 + asm ("sidt %0":"=m" (*dtr));
4945 +}
4946 +
4947 +static inline unsigned long native_store_tr(void)
4948 +{
4949 + unsigned long tr;
4950 + asm ("str %0":"=r" (tr));
4951 + return tr;
4952 +}
4953 +
4954 +static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
4955 +{
4956 + unsigned int i;
4957 + struct desc_struct *gdt = get_cpu_gdt_table(cpu);
4958 +
4959 + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
4960 + gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
4961 +}
4962 +#else
4963 +#define load_TLS(t, cpu) xen_load_tls(t, cpu)
4964 +#define set_ldt xen_set_ldt
4965 +
4966 +extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b);
4967 +extern int write_gdt_entry(void *gdt, int entry, __u32 entry_a, __u32 entry_b);
4968 +
4969 +static inline void xen_load_tls(struct thread_struct *t, unsigned int cpu)
4970 +{
4971 + unsigned int i;
4972 + struct desc_struct *gdt = get_cpu_gdt_table(cpu) + GDT_ENTRY_TLS_MIN;
4973 +
4974 + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
4975 + if (HYPERVISOR_update_descriptor(virt_to_machine(&gdt[i]),
4976 + *(u64 *)&t->tls_array[i]))
4977 + BUG();
4978 +}
4979 #endif
4980
4981 #ifndef CONFIG_X86_NO_IDT
4982 --- a/include/asm-x86/mach-xen/asm/desc_64.h
4983 +++ b/include/asm-x86/mach-xen/asm/desc_64.h
4984 @@ -127,16 +127,6 @@
4985 DESC_LDT, size * 8 - 1);
4986 }
4987
4988 -static inline void set_seg_base(unsigned cpu, int entry, void *base)
4989 -{
4990 - struct desc_struct *d = &cpu_gdt(cpu)[entry];
4991 - u32 addr = (u32)(u64)base;
4992 - BUG_ON((u64)base >> 32);
4993 - d->base0 = addr & 0xffff;
4994 - d->base1 = (addr >> 16) & 0xff;
4995 - d->base2 = (addr >> 24) & 0xff;
4996 -}
4997 -
4998 #define LDT_entry_a(info) \
4999 ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
5000 /* Don't allow setting of the lm bit. It is useless anyways because
5001 @@ -165,25 +155,15 @@
5002 (info)->useable == 0 && \
5003 (info)->lm == 0)
5004
5005 -#if TLS_SIZE != 24
5006 -# error update this code.
5007 -#endif
5008 -
5009 static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
5010 {
5011 -#if 0
5012 + unsigned int i;
5013 u64 *gdt = (u64 *)(cpu_gdt(cpu) + GDT_ENTRY_TLS_MIN);
5014 - gdt[0] = t->tls_array[0];
5015 - gdt[1] = t->tls_array[1];
5016 - gdt[2] = t->tls_array[2];
5017 -#endif
5018 -#define C(i) \
5019 - if (HYPERVISOR_update_descriptor(virt_to_machine(&cpu_gdt(cpu)[GDT_ENTRY_TLS_MIN + i]), \
5020 - t->tls_array[i])) \
5021 - BUG();
5022
5023 - C(0); C(1); C(2);
5024 -#undef C
5025 + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
5026 + if (HYPERVISOR_update_descriptor(virt_to_machine(&gdt[i]),
5027 + t->tls_array[i]))
5028 + BUG();
5029 }
5030
5031 /*
5032 --- a/include/asm-x86/mach-xen/asm/dma-mapping_64.h
5033 +++ b/include/asm-x86/mach-xen/asm/dma-mapping_64.h
5034 @@ -51,7 +51,7 @@
5035 };
5036
5037 extern dma_addr_t bad_dma_address;
5038 -extern struct dma_mapping_ops* dma_ops;
5039 +extern const struct dma_mapping_ops* dma_ops;
5040 extern int iommu_merge;
5041
5042 #if 0
5043 --- a/include/asm-x86/mach-xen/asm/fixmap_32.h
5044 +++ b/include/asm-x86/mach-xen/asm/fixmap_32.h
5045 @@ -19,10 +19,8 @@
5046 * the start of the fixmap.
5047 */
5048 extern unsigned long __FIXADDR_TOP;
5049 -#ifdef CONFIG_COMPAT_VDSO
5050 -#define FIXADDR_USER_START __fix_to_virt(FIX_VDSO)
5051 -#define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1)
5052 -#endif
5053 +#define FIXADDR_USER_START __fix_to_virt(FIX_VDSO)
5054 +#define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1)
5055
5056 #ifndef __ASSEMBLY__
5057 #include <linux/kernel.h>
5058 @@ -85,6 +83,9 @@
5059 #ifdef CONFIG_PCI_MMCONFIG
5060 FIX_PCIE_MCFG,
5061 #endif
5062 +#ifdef CONFIG_PARAVIRT
5063 + FIX_PARAVIRT_BOOTMAP,
5064 +#endif
5065 FIX_SHARED_INFO,
5066 #define NR_FIX_ISAMAPS 256
5067 FIX_ISAMAP_END,
5068 --- a/include/asm-x86/mach-xen/asm/fixmap_64.h
5069 +++ b/include/asm-x86/mach-xen/asm/fixmap_64.h
5070 @@ -15,7 +15,6 @@
5071 #include <asm/apicdef.h>
5072 #include <asm/page.h>
5073 #include <asm/vsyscall.h>
5074 -#include <asm/vsyscall32.h>
5075 #include <asm/acpi.h>
5076
5077 /*
5078 --- a/include/asm-x86/mach-xen/asm/highmem.h
5079 +++ b/include/asm-x86/mach-xen/asm/highmem.h
5080 @@ -67,12 +67,18 @@
5081
5082 void *kmap(struct page *page);
5083 void kunmap(struct page *page);
5084 +void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot);
5085 void *kmap_atomic(struct page *page, enum km_type type);
5086 void *kmap_atomic_pte(struct page *page, enum km_type type);
5087 void kunmap_atomic(void *kvaddr, enum km_type type);
5088 void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
5089 struct page *kmap_atomic_to_page(void *ptr);
5090
5091 +#define kmap_atomic_pte(page, type) \
5092 + kmap_atomic_prot(page, type, \
5093 + test_bit(PG_pinned, &(page)->flags) \
5094 + ? PAGE_KERNEL_RO : kmap_prot)
5095 +
5096 #define flush_cache_kmaps() do { } while (0)
5097
5098 #endif /* __KERNEL__ */
5099 --- a/include/asm-x86/mach-xen/asm/io_32.h
5100 +++ b/include/asm-x86/mach-xen/asm/io_32.h
5101 @@ -263,15 +263,18 @@
5102
5103 #endif /* __KERNEL__ */
5104
5105 -#define __SLOW_DOWN_IO "outb %%al,$0x80;"
5106 +static inline void xen_io_delay(void)
5107 +{
5108 + asm volatile("outb %%al,$0x80" : : : "memory");
5109 +}
5110
5111 static inline void slow_down_io(void) {
5112 - __asm__ __volatile__(
5113 - __SLOW_DOWN_IO
5114 + xen_io_delay();
5115 #ifdef REALLY_SLOW_IO
5116 - __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO
5117 + xen_io_delay();
5118 + xen_io_delay();
5119 + xen_io_delay();
5120 #endif
5121 - : : );
5122 }
5123
5124 #ifdef CONFIG_X86_NUMAQ
5125 --- a/include/asm-x86/mach-xen/asm/irqflags_32.h
5126 +++ b/include/asm-x86/mach-xen/asm/irqflags_32.h
5127 @@ -11,6 +11,43 @@
5128 #define _ASM_IRQFLAGS_H
5129
5130 #ifndef __ASSEMBLY__
5131 +#define xen_save_fl(void) (current_vcpu_info()->evtchn_upcall_mask)
5132 +
5133 +#define xen_restore_fl(f) \
5134 +do { \
5135 + vcpu_info_t *_vcpu; \
5136 + barrier(); \
5137 + _vcpu = current_vcpu_info(); \
5138 + if ((_vcpu->evtchn_upcall_mask = (f)) == 0) { \
5139 + barrier(); /* unmask then check (avoid races) */\
5140 + if (unlikely(_vcpu->evtchn_upcall_pending)) \
5141 + force_evtchn_callback(); \
5142 + } \
5143 +} while (0)
5144 +
5145 +#define xen_irq_disable() \
5146 +do { \
5147 + current_vcpu_info()->evtchn_upcall_mask = 1; \
5148 + barrier(); \
5149 +} while (0)
5150 +
5151 +#define xen_irq_enable() \
5152 +do { \
5153 + vcpu_info_t *_vcpu; \
5154 + barrier(); \
5155 + _vcpu = current_vcpu_info(); \
5156 + _vcpu->evtchn_upcall_mask = 0; \
5157 + barrier(); /* unmask then check (avoid races) */ \
5158 + if (unlikely(_vcpu->evtchn_upcall_pending)) \
5159 + force_evtchn_callback(); \
5160 +} while (0)
5161 +
5162 +void xen_safe_halt(void);
5163 +
5164 +void xen_halt(void);
5165 +#endif /* __ASSEMBLY__ */
5166 +
5167 +#ifndef __ASSEMBLY__
5168
5169 /*
5170 * The use of 'barrier' in the following reflects their use as local-lock
5171 @@ -20,48 +57,31 @@
5172 * includes these barriers, for example.
5173 */
5174
5175 -#define __raw_local_save_flags() (current_vcpu_info()->evtchn_upcall_mask)
5176 +#define __raw_local_save_flags(void) xen_save_fl()
5177
5178 -#define raw_local_irq_restore(x) \
5179 -do { \
5180 - vcpu_info_t *_vcpu; \
5181 - barrier(); \
5182 - _vcpu = current_vcpu_info(); \
5183 - if ((_vcpu->evtchn_upcall_mask = (x)) == 0) { \
5184 - barrier(); /* unmask then check (avoid races) */ \
5185 - if (unlikely(_vcpu->evtchn_upcall_pending)) \
5186 - force_evtchn_callback(); \
5187 - } \
5188 -} while (0)
5189 +#define raw_local_irq_restore(flags) xen_restore_fl(flags)
5190
5191 -#define raw_local_irq_disable() \
5192 -do { \
5193 - current_vcpu_info()->evtchn_upcall_mask = 1; \
5194 - barrier(); \
5195 -} while (0)
5196 +#define raw_local_irq_disable() xen_irq_disable()
5197
5198 -#define raw_local_irq_enable() \
5199 -do { \
5200 - vcpu_info_t *_vcpu; \
5201 - barrier(); \
5202 - _vcpu = current_vcpu_info(); \
5203 - _vcpu->evtchn_upcall_mask = 0; \
5204 - barrier(); /* unmask then check (avoid races) */ \
5205 - if (unlikely(_vcpu->evtchn_upcall_pending)) \
5206 - force_evtchn_callback(); \
5207 -} while (0)
5208 +#define raw_local_irq_enable() xen_irq_enable()
5209
5210 /*
5211 * Used in the idle loop; sti takes one instruction cycle
5212 * to complete:
5213 */
5214 -void raw_safe_halt(void);
5215 +static inline void raw_safe_halt(void)
5216 +{
5217 + xen_safe_halt();
5218 +}
5219
5220 /*
5221 * Used when interrupts are already enabled or to
5222 * shutdown the processor:
5223 */
5224 -void halt(void);
5225 +static inline void halt(void)
5226 +{
5227 + xen_halt();
5228 +}
5229
5230 /*
5231 * For spinlocks, etc:
5232 --- a/include/asm-x86/mach-xen/asm/irqflags_64.h
5233 +++ b/include/asm-x86/mach-xen/asm/irqflags_64.h
5234 @@ -9,6 +9,7 @@
5235 */
5236 #ifndef _ASM_IRQFLAGS_H
5237 #define _ASM_IRQFLAGS_H
5238 +#include <asm/processor-flags.h>
5239
5240 #ifndef __ASSEMBLY__
5241 /*
5242 @@ -50,19 +51,19 @@
5243 {
5244 unsigned long flags = __raw_local_save_flags();
5245
5246 - raw_local_irq_restore((flags & ~(1 << 9)) | (1 << 18));
5247 + raw_local_irq_restore((flags & ~X86_EFLAGS_IF) | X86_EFLAGS_AC);
5248 }
5249
5250 static inline void raw_local_irq_enable(void)
5251 {
5252 unsigned long flags = __raw_local_save_flags();
5253
5254 - raw_local_irq_restore((flags | (1 << 9)) & ~(1 << 18));
5255 + raw_local_irq_restore((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC));
5256 }
5257
5258 static inline int raw_irqs_disabled_flags(unsigned long flags)
5259 {
5260 - return !(flags & (1<<9)) || (flags & (1 << 18));
5261 + return !(flags & X86_EFLAGS_IF) || (flags & X86_EFLAGS_AC);
5262 }
5263
5264 #else /* CONFIG_X86_VSMP */
5265 @@ -118,13 +119,21 @@
5266 * Used in the idle loop; sti takes one instruction cycle
5267 * to complete:
5268 */
5269 -void raw_safe_halt(void);
5270 +void xen_safe_halt(void);
5271 +static inline void raw_safe_halt(void)
5272 +{
5273 + xen_safe_halt();
5274 +}
5275
5276 /*
5277 * Used when interrupts are already enabled or to
5278 * shutdown the processor:
5279 */
5280 -void halt(void);
5281 +void xen_halt(void);
5282 +static inline void halt(void)
5283 +{
5284 + xen_halt();
5285 +}
5286
5287 #else /* __ASSEMBLY__: */
5288 # ifdef CONFIG_TRACE_IRQFLAGS
5289 --- a/include/asm-x86/mach-xen/asm/mmu.h
5290 +++ b/include/asm-x86/mach-xen/asm/mmu.h
5291 @@ -18,12 +18,4 @@
5292 #endif
5293 } mm_context_t;
5294
5295 -/* mm/memory.c:exit_mmap hook */
5296 -extern void _arch_exit_mmap(struct mm_struct *mm);
5297 -#define arch_exit_mmap(_mm) _arch_exit_mmap(_mm)
5298 -
5299 -/* kernel/fork.c:dup_mmap hook */
5300 -extern void _arch_dup_mmap(struct mm_struct *mm);
5301 -#define arch_dup_mmap(mm, oldmm) ((void)(oldmm), _arch_dup_mmap(mm))
5302 -
5303 #endif
5304 --- a/include/asm-x86/mach-xen/asm/mmu_64.h
5305 +++ b/include/asm-x86/mach-xen/asm/mmu_64.h
5306 @@ -25,14 +25,6 @@
5307 #ifdef CONFIG_XEN
5308 extern struct list_head mm_unpinned;
5309 extern spinlock_t mm_unpinned_lock;
5310 -
5311 -/* mm/memory.c:exit_mmap hook */
5312 -extern void _arch_exit_mmap(struct mm_struct *mm);
5313 -#define arch_exit_mmap(_mm) _arch_exit_mmap(_mm)
5314 -
5315 -/* kernel/fork.c:dup_mmap hook */
5316 -extern void _arch_dup_mmap(struct mm_struct *mm);
5317 -#define arch_dup_mmap(mm, oldmm) ((void)(oldmm), _arch_dup_mmap(mm))
5318 #endif
5319
5320 #endif
5321 --- a/include/asm-x86/mach-xen/asm/mmu_context_32.h
5322 +++ b/include/asm-x86/mach-xen/asm/mmu_context_32.h
5323 @@ -6,6 +6,20 @@
5324 #include <asm/pgalloc.h>
5325 #include <asm/tlbflush.h>
5326
5327 +void arch_exit_mmap(struct mm_struct *mm);
5328 +void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
5329 +
5330 +void mm_pin(struct mm_struct *mm);
5331 +void mm_unpin(struct mm_struct *mm);
5332 +void mm_pin_all(void);
5333 +
5334 +static inline void xen_activate_mm(struct mm_struct *prev,
5335 + struct mm_struct *next)
5336 +{
5337 + if (!test_bit(PG_pinned, &virt_to_page(next->pgd)->flags))
5338 + mm_pin(next);
5339 +}
5340 +
5341 /*
5342 * Used for LDT copy/destruction.
5343 */
5344 @@ -37,10 +51,6 @@
5345 : : "r" (0) );
5346 }
5347
5348 -extern void mm_pin(struct mm_struct *mm);
5349 -extern void mm_unpin(struct mm_struct *mm);
5350 -void mm_pin_all(void);
5351 -
5352 static inline void switch_mm(struct mm_struct *prev,
5353 struct mm_struct *next,
5354 struct task_struct *tsk)
5355 @@ -97,11 +107,10 @@
5356 #define deactivate_mm(tsk, mm) \
5357 asm("movl %0,%%gs": :"r" (0));
5358
5359 -static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
5360 -{
5361 - if (!test_bit(PG_pinned, &virt_to_page(next->pgd)->flags))
5362 - mm_pin(next);
5363 - switch_mm(prev, next, NULL);
5364 -}
5365 +#define activate_mm(prev, next) \
5366 + do { \
5367 + xen_activate_mm(prev, next); \
5368 + switch_mm((prev),(next),NULL); \
5369 + } while(0)
5370
5371 #endif
5372 --- a/include/asm-x86/mach-xen/asm/mmu_context_64.h
5373 +++ b/include/asm-x86/mach-xen/asm/mmu_context_64.h
5374 @@ -9,6 +9,9 @@
5375 #include <asm/pgtable.h>
5376 #include <asm/tlbflush.h>
5377
5378 +void arch_exit_mmap(struct mm_struct *mm);
5379 +void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
5380 +
5381 /*
5382 * possibly do the LDT unload here?
5383 */
5384 --- a/include/asm-x86/mach-xen/asm/page_64.h
5385 +++ b/include/asm-x86/mach-xen/asm/page_64.h
5386 @@ -7,6 +7,7 @@
5387 #include <linux/types.h>
5388 #include <asm/bug.h>
5389 #endif
5390 +#include <linux/const.h>
5391 #include <xen/interface/xen.h>
5392
5393 /*
5394 @@ -19,18 +20,14 @@
5395
5396 /* PAGE_SHIFT determines the page size */
5397 #define PAGE_SHIFT 12
5398 -#ifdef __ASSEMBLY__
5399 -#define PAGE_SIZE (0x1 << PAGE_SHIFT)
5400 -#else
5401 -#define PAGE_SIZE (1UL << PAGE_SHIFT)
5402 -#endif
5403 +#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
5404 #define PAGE_MASK (~(PAGE_SIZE-1))
5405
5406 /* See Documentation/x86_64/mm.txt for a description of the memory map. */
5407 #define __PHYSICAL_MASK_SHIFT 46
5408 -#define __PHYSICAL_MASK ((1UL << __PHYSICAL_MASK_SHIFT) - 1)
5409 +#define __PHYSICAL_MASK ((_AC(1,UL) << __PHYSICAL_MASK_SHIFT) - 1)
5410 #define __VIRTUAL_MASK_SHIFT 48
5411 -#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
5412 +#define __VIRTUAL_MASK ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - 1)
5413
5414 #define PHYSICAL_PAGE_MASK (~(PAGE_SIZE-1) & __PHYSICAL_MASK)
5415
5416 @@ -55,10 +52,10 @@
5417 #define N_EXCEPTION_STACKS 5 /* hw limit: 7 */
5418
5419 #define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1))
5420 -#define LARGE_PAGE_SIZE (1UL << PMD_SHIFT)
5421 +#define LARGE_PAGE_SIZE (_AC(1,UL) << PMD_SHIFT)
5422
5423 #define HPAGE_SHIFT PMD_SHIFT
5424 -#define HPAGE_SIZE ((1UL) << HPAGE_SHIFT)
5425 +#define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT)
5426 #define HPAGE_MASK (~(HPAGE_SIZE - 1))
5427 #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
5428
5429 @@ -152,17 +149,23 @@
5430
5431 #define __pgprot(x) ((pgprot_t) { (x) } )
5432
5433 -#define __PHYSICAL_START ((unsigned long)CONFIG_PHYSICAL_START)
5434 -#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
5435 -#define __START_KERNEL_map 0xffffffff80000000UL
5436 -#define __PAGE_OFFSET 0xffff880000000000UL
5437 +#endif /* !__ASSEMBLY__ */
5438
5439 -#else
5440 #define __PHYSICAL_START CONFIG_PHYSICAL_START
5441 +#define __KERNEL_ALIGN 0x200000
5442 +
5443 +/*
5444 + * Make sure kernel is aligned to 2MB address. Catching it at compile
5445 + * time is better. Change your config file and compile the kernel
5446 + * for a 2MB aligned address (CONFIG_PHYSICAL_START)
5447 + */
5448 +#if (CONFIG_PHYSICAL_START % __KERNEL_ALIGN) != 0
5449 +#error "CONFIG_PHYSICAL_START must be a multiple of 2MB"
5450 +#endif
5451 +
5452 #define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
5453 -#define __START_KERNEL_map 0xffffffff80000000
5454 -#define __PAGE_OFFSET 0xffff880000000000
5455 -#endif /* !__ASSEMBLY__ */
5456 +#define __START_KERNEL_map _AC(0xffffffff80000000, UL)
5457 +#define __PAGE_OFFSET _AC(0xffff880000000000, UL)
5458
5459 #if CONFIG_XEN_COMPAT <= 0x030002
5460 #undef LOAD_OFFSET
5461 @@ -172,20 +175,20 @@
5462 /* to align the pointer to the (next) page boundary */
5463 #define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK)
5464
5465 -#define KERNEL_TEXT_SIZE (40UL*1024*1024)
5466 -#define KERNEL_TEXT_START 0xffffffff80000000UL
5467 +#define KERNEL_TEXT_SIZE (40*1024*1024)
5468 +#define KERNEL_TEXT_START _AC(0xffffffff80000000, UL)
5469 +
5470 +#define PAGE_OFFSET __PAGE_OFFSET
5471
5472 -#define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET)
5473 +#ifndef __ASSEMBLY__
5474 +static inline unsigned long __phys_addr(unsigned long x)
5475 +{
5476 + return x - (x >= __START_KERNEL_map ? __START_KERNEL_map : PAGE_OFFSET);
5477 +}
5478 +#endif
5479
5480 -/* Note: __pa(&symbol_visible_to_c) should be always replaced with __pa_symbol.
5481 - Otherwise you risk miscompilation. */
5482 -#define __pa(x) (((unsigned long)(x)>=__START_KERNEL_map)?(unsigned long)(x) - (unsigned long)__START_KERNEL_map:(unsigned long)(x) - PAGE_OFFSET)
5483 -/* __pa_symbol should be used for C visible symbols.
5484 - This seems to be the official gcc blessed way to do such arithmetic. */
5485 -#define __pa_symbol(x) \
5486 - ({unsigned long v; \
5487 - asm("" : "=r" (v) : "0" (x)); \
5488 - __pa(v); })
5489 +#define __pa(x) __phys_addr((unsigned long)(x))
5490 +#define __pa_symbol(x) __phys_addr((unsigned long)(x))
5491
5492 #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
5493 #define __boot_va(x) __va(x)
5494 --- a/include/asm-x86/mach-xen/asm/pgalloc_32.h
5495 +++ b/include/asm-x86/mach-xen/asm/pgalloc_32.h
5496 @@ -1,7 +1,6 @@
5497 #ifndef _I386_PGALLOC_H
5498 #define _I386_PGALLOC_H
5499
5500 -#include <asm/fixmap.h>
5501 #include <linux/threads.h>
5502 #include <linux/mm.h> /* for struct page */
5503 #include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */
5504 @@ -69,6 +68,4 @@
5505 #define pud_populate(mm, pmd, pte) BUG()
5506 #endif
5507
5508 -#define check_pgt_cache() do { } while (0)
5509 -
5510 #endif /* _I386_PGALLOC_H */
5511 --- a/include/asm-x86/mach-xen/asm/pgalloc_64.h
5512 +++ b/include/asm-x86/mach-xen/asm/pgalloc_64.h
5513 @@ -1,7 +1,6 @@
5514 #ifndef _X86_64_PGALLOC_H
5515 #define _X86_64_PGALLOC_H
5516
5517 -#include <asm/fixmap.h>
5518 #include <asm/pda.h>
5519 #include <linux/threads.h>
5520 #include <linux/mm.h>
5521 @@ -100,24 +99,16 @@
5522 struct page *page = virt_to_page(pgd);
5523
5524 spin_lock(&pgd_lock);
5525 - page->index = (pgoff_t)pgd_list;
5526 - if (pgd_list)
5527 - pgd_list->private = (unsigned long)&page->index;
5528 - pgd_list = page;
5529 - page->private = (unsigned long)&pgd_list;
5530 + list_add(&page->lru, &pgd_list);
5531 spin_unlock(&pgd_lock);
5532 }
5533
5534 static inline void pgd_list_del(pgd_t *pgd)
5535 {
5536 - struct page *next, **pprev, *page = virt_to_page(pgd);
5537 + struct page *page = virt_to_page(pgd);
5538
5539 spin_lock(&pgd_lock);
5540 - next = (struct page *)page->index;
5541 - pprev = (struct page **)page->private;
5542 - *pprev = next;
5543 - if (next)
5544 - next->private = (unsigned long)pprev;
5545 + list_del(&page->lru);
5546 spin_unlock(&pgd_lock);
5547 }
5548
5549 --- a/include/asm-x86/mach-xen/asm/pgtable-2level.h
5550 +++ b/include/asm-x86/mach-xen/asm/pgtable-2level.h
5551 @@ -13,22 +13,43 @@
5552 * within a page table are directly modified. Thus, the following
5553 * hook is made available.
5554 */
5555 -#define set_pte(pteptr, pteval) (*(pteptr) = pteval)
5556 -
5557 -#define set_pte_at(_mm,addr,ptep,pteval) do { \
5558 - if (((_mm) != current->mm && (_mm) != &init_mm) || \
5559 - HYPERVISOR_update_va_mapping((addr), (pteval), 0)) \
5560 - set_pte((ptep), (pteval)); \
5561 -} while (0)
5562 -
5563 -#define set_pmd(pmdptr, pmdval) xen_l2_entry_update((pmdptr), (pmdval))
5564 +static inline void xen_set_pte(pte_t *ptep , pte_t pte)
5565 +{
5566 + *ptep = pte;
5567 +}
5568 +static inline void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
5569 + pte_t *ptep , pte_t pte)
5570 +{
5571 + if ((mm != current->mm && mm != &init_mm) ||
5572 + HYPERVISOR_update_va_mapping(addr, pte, 0))
5573 + xen_set_pte(ptep, pte);
5574 +}
5575 +static inline void xen_set_pmd(pmd_t *pmdp, pmd_t pmd)
5576 +{
5577 + xen_l2_entry_update(pmdp, pmd);
5578 +}
5579 +#define set_pte(pteptr, pteval) xen_set_pte(pteptr, pteval)
5580 +#define set_pte_at(mm,addr,ptep,pteval) xen_set_pte_at(mm, addr, ptep, pteval)
5581 +#define set_pmd(pmdptr, pmdval) xen_set_pmd(pmdptr, pmdval)
5582
5583 #define set_pte_atomic(pteptr, pteval) set_pte(pteptr,pteval)
5584
5585 #define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
5586 #define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
5587
5588 -#define raw_ptep_get_and_clear(xp, pte) __pte_ma(xchg(&(xp)->pte_low, 0))
5589 +static inline void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *xp)
5590 +{
5591 + xen_set_pte_at(mm, addr, xp, __pte(0));
5592 +}
5593 +
5594 +#ifdef CONFIG_SMP
5595 +static inline pte_t xen_ptep_get_and_clear(pte_t *xp, pte_t res)
5596 +{
5597 + return __pte_ma(xchg(&xp->pte_low, 0));
5598 +}
5599 +#else
5600 +#define xen_ptep_get_and_clear(xp, res) xen_local_ptep_get_and_clear(xp, res)
5601 +#endif
5602
5603 #define __HAVE_ARCH_PTEP_CLEAR_FLUSH
5604 #define ptep_clear_flush(vma, addr, ptep) \
5605 @@ -95,6 +116,4 @@
5606 #define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low })
5607 #define __swp_entry_to_pte(x) ((pte_t) { (x).val })
5608
5609 -void vmalloc_sync_all(void);
5610 -
5611 #endif /* _I386_PGTABLE_2LEVEL_H */
5612 --- a/include/asm-x86/mach-xen/asm/pgtable-3level-defs.h
5613 +++ b/include/asm-x86/mach-xen/asm/pgtable-3level-defs.h
5614 @@ -1,7 +1,7 @@
5615 #ifndef _I386_PGTABLE_3LEVEL_DEFS_H
5616 #define _I386_PGTABLE_3LEVEL_DEFS_H
5617
5618 -#define HAVE_SHARED_KERNEL_PMD 0
5619 +#define SHARED_KERNEL_PMD 0
5620
5621 /*
5622 * PGDIR_SHIFT determines what a top-level page table entry can map
5623 --- a/include/asm-x86/mach-xen/asm/pgtable-3level.h
5624 +++ b/include/asm-x86/mach-xen/asm/pgtable-3level.h
5625 @@ -52,32 +52,40 @@
5626 * value and then use set_pte to update it. -ben
5627 */
5628
5629 -static inline void set_pte(pte_t *ptep, pte_t pte)
5630 +static inline void xen_set_pte(pte_t *ptep, pte_t pte)
5631 {
5632 ptep->pte_high = pte.pte_high;
5633 smp_wmb();
5634 ptep->pte_low = pte.pte_low;
5635 }
5636 -#define set_pte_atomic(pteptr,pteval) \
5637 - set_64bit((unsigned long long *)(pteptr),__pte_val(pteval))
5638
5639 -#define set_pte_at(_mm,addr,ptep,pteval) do { \
5640 - if (((_mm) != current->mm && (_mm) != &init_mm) || \
5641 - HYPERVISOR_update_va_mapping((addr), (pteval), 0)) \
5642 - set_pte((ptep), (pteval)); \
5643 -} while (0)
5644 -
5645 -#define set_pmd(pmdptr,pmdval) \
5646 - xen_l2_entry_update((pmdptr), (pmdval))
5647 -#define set_pud(pudptr,pudval) \
5648 - xen_l3_entry_update((pudptr), (pudval))
5649 +static inline void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
5650 + pte_t *ptep , pte_t pte)
5651 +{
5652 + if ((mm != current->mm && mm != &init_mm) ||
5653 + HYPERVISOR_update_va_mapping(addr, pte, 0))
5654 + xen_set_pte(ptep, pte);
5655 +}
5656 +
5657 +static inline void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
5658 +{
5659 + set_64bit((unsigned long long *)(ptep),__pte_val(pte));
5660 +}
5661 +static inline void xen_set_pmd(pmd_t *pmdp, pmd_t pmd)
5662 +{
5663 + xen_l2_entry_update(pmdp, pmd);
5664 +}
5665 +static inline void xen_set_pud(pud_t *pudp, pud_t pud)
5666 +{
5667 + xen_l3_entry_update(pudp, pud);
5668 +}
5669
5670 /*
5671 * For PTEs and PDEs, we must clear the P-bit first when clearing a page table
5672 * entry, so clear the bottom half first and enforce ordering with a compiler
5673 * barrier.
5674 */
5675 -static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
5676 +static inline void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
5677 {
5678 if ((mm != current->mm && mm != &init_mm)
5679 || HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) {
5680 @@ -87,7 +95,18 @@
5681 }
5682 }
5683
5684 -#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
5685 +static inline void xen_pmd_clear(pmd_t *pmd)
5686 +{
5687 + xen_l2_entry_update(pmd, __pmd(0));
5688 +}
5689 +
5690 +#define set_pte(ptep, pte) xen_set_pte(ptep, pte)
5691 +#define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte)
5692 +#define set_pte_atomic(ptep, pte) xen_set_pte_atomic(ptep, pte)
5693 +#define set_pmd(pmdp, pmd) xen_set_pmd(pmdp, pmd)
5694 +#define set_pud(pudp, pud) xen_set_pud(pudp, pud)
5695 +#define pte_clear(mm, addr, ptep) xen_pte_clear(mm, addr, ptep)
5696 +#define pmd_clear(pmd) xen_pmd_clear(pmd)
5697
5698 /*
5699 * Pentium-II erratum A13: in PAE mode we explicitly have to flush
5700 @@ -108,7 +127,8 @@
5701 #define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \
5702 pmd_index(address))
5703
5704 -static inline pte_t raw_ptep_get_and_clear(pte_t *ptep, pte_t res)
5705 +#ifdef CONFIG_SMP
5706 +static inline pte_t xen_ptep_get_and_clear(pte_t *ptep, pte_t res)
5707 {
5708 uint64_t val = __pte_val(res);
5709 if (__cmpxchg64(ptep, val, 0) != val) {
5710 @@ -119,6 +139,9 @@
5711 }
5712 return res;
5713 }
5714 +#else
5715 +#define xen_ptep_get_and_clear(xp, pte) xen_local_ptep_get_and_clear(xp, pte)
5716 +#endif
5717
5718 #define __HAVE_ARCH_PTEP_CLEAR_FLUSH
5719 #define ptep_clear_flush(vma, addr, ptep) \
5720 @@ -165,13 +188,13 @@
5721 static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
5722 {
5723 return __pte((((unsigned long long)page_nr << PAGE_SHIFT) |
5724 - pgprot_val(pgprot)) & __supported_pte_mask);
5725 + pgprot_val(pgprot)) & __supported_pte_mask);
5726 }
5727
5728 static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
5729 {
5730 return __pmd((((unsigned long long)page_nr << PAGE_SHIFT) |
5731 - pgprot_val(pgprot)) & __supported_pte_mask);
5732 + pgprot_val(pgprot)) & __supported_pte_mask);
5733 }
5734
5735 /*
5736 @@ -191,6 +214,4 @@
5737
5738 #define __pmd_free_tlb(tlb, x) do { } while (0)
5739
5740 -void vmalloc_sync_all(void);
5741 -
5742 #endif /* _I386_PGTABLE_3LEVEL_H */
5743 --- a/include/asm-x86/mach-xen/asm/pgtable_32.h
5744 +++ b/include/asm-x86/mach-xen/asm/pgtable_32.h
5745 @@ -24,11 +24,11 @@
5746 #include <linux/slab.h>
5747 #include <linux/list.h>
5748 #include <linux/spinlock.h>
5749 +#include <linux/sched.h>
5750
5751 /* Is this pagetable pinned? */
5752 #define PG_pinned PG_arch_1
5753
5754 -struct mm_struct;
5755 struct vm_area_struct;
5756
5757 /*
5758 @@ -38,17 +38,16 @@
5759 #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
5760 extern unsigned long empty_zero_page[1024];
5761 extern pgd_t *swapper_pg_dir;
5762 -extern struct kmem_cache *pgd_cache;
5763 extern struct kmem_cache *pmd_cache;
5764 extern spinlock_t pgd_lock;
5765 extern struct page *pgd_list;
5766 +void check_pgt_cache(void);
5767
5768 void pmd_ctor(void *, struct kmem_cache *, unsigned long);
5769 -void pgd_ctor(void *, struct kmem_cache *, unsigned long);
5770 -void pgd_dtor(void *, struct kmem_cache *, unsigned long);
5771 void pgtable_cache_init(void);
5772 void paging_init(void);
5773
5774 +
5775 /*
5776 * The Linux x86 paging architecture is 'compile-time dual-mode', it
5777 * implements both the traditional 2-level x86 page tables and the
5778 @@ -165,6 +164,7 @@
5779
5780 extern unsigned long long __PAGE_KERNEL, __PAGE_KERNEL_EXEC;
5781 #define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
5782 +#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
5783 #define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD)
5784 #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
5785 #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
5786 @@ -172,6 +172,7 @@
5787 #define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
5788 #define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)
5789 #define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC)
5790 +#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX)
5791 #define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE)
5792 #define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE)
5793 #define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
5794 @@ -275,7 +276,13 @@
5795 */
5796 #define pte_update(mm, addr, ptep) do { } while (0)
5797 #define pte_update_defer(mm, addr, ptep) do { } while (0)
5798 -#define paravirt_map_pt_hook(slot, va, pfn) do { } while (0)
5799 +
5800 +/* local pte updates need not use xchg for locking */
5801 +static inline pte_t xen_local_ptep_get_and_clear(pte_t *ptep, pte_t res)
5802 +{
5803 + xen_set_pte(ptep, __pte(0));
5804 + return res;
5805 +}
5806
5807 /*
5808 * We only update the dirty/accessed state if we set
5809 @@ -286,17 +293,34 @@
5810 */
5811 #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
5812 #define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
5813 -do { \
5814 - if (dirty) \
5815 +({ \
5816 + int __changed = !pte_same(*(ptep), entry); \
5817 + if (__changed && (dirty)) \
5818 ptep_establish(vma, address, ptep, entry); \
5819 -} while (0)
5820 + __changed; \
5821 +})
5822
5823 -/*
5824 - * We don't actually have these, but we want to advertise them so that
5825 - * we can encompass the flush here.
5826 - */
5827 #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
5828 +#define ptep_test_and_clear_dirty(vma, addr, ptep) ({ \
5829 + int __ret = 0; \
5830 + if (pte_dirty(*(ptep))) \
5831 + __ret = test_and_clear_bit(_PAGE_BIT_DIRTY, \
5832 + &(ptep)->pte_low); \
5833 + if (__ret) \
5834 + pte_update((vma)->vm_mm, addr, ptep); \
5835 + __ret; \
5836 +})
5837 +
5838 #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
5839 +#define ptep_test_and_clear_young(vma, addr, ptep) ({ \
5840 + int __ret = 0; \
5841 + if (pte_young(*(ptep))) \
5842 + __ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, \
5843 + &(ptep)->pte_low); \
5844 + if (__ret) \
5845 + pte_update((vma)->vm_mm, addr, ptep); \
5846 + __ret; \
5847 +})
5848
5849 /*
5850 * Rules for using ptep_establish: the pte MUST be a user pte, and
5851 @@ -323,7 +347,7 @@
5852 int __dirty = pte_dirty(__pte); \
5853 __pte = pte_mkclean(__pte); \
5854 if (test_bit(PG_pinned, &virt_to_page((vma)->vm_mm->pgd)->flags)) \
5855 - ptep_set_access_flags(vma, address, ptep, __pte, __dirty); \
5856 + (void)ptep_set_access_flags(vma, address, ptep, __pte, __dirty); \
5857 else if (__dirty) \
5858 (ptep)->pte_low = __pte.pte_low; \
5859 __dirty; \
5860 @@ -336,7 +360,7 @@
5861 int __young = pte_young(__pte); \
5862 __pte = pte_mkold(__pte); \
5863 if (test_bit(PG_pinned, &virt_to_page((vma)->vm_mm->pgd)->flags)) \
5864 - ptep_set_access_flags(vma, address, ptep, __pte, __young); \
5865 + (void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \
5866 else if (__young) \
5867 (ptep)->pte_low = __pte.pte_low; \
5868 __young; \
5869 @@ -349,7 +373,7 @@
5870 if (!pte_none(pte)
5871 && (mm != &init_mm
5872 || HYPERVISOR_update_va_mapping(addr, __pte(0), 0))) {
5873 - pte = raw_ptep_get_and_clear(ptep, pte);
5874 + pte = xen_ptep_get_and_clear(ptep, pte);
5875 pte_update(mm, addr, ptep);
5876 }
5877 return pte;
5878 @@ -491,24 +515,10 @@
5879 #endif
5880
5881 #if defined(CONFIG_HIGHPTE)
5882 -#define pte_offset_map(dir, address) \
5883 -({ \
5884 - pte_t *__ptep; \
5885 - unsigned pfn = pmd_val(*(dir)) >> PAGE_SHIFT; \
5886 - __ptep = (pte_t *)kmap_atomic_pte(pfn_to_page(pfn),KM_PTE0); \
5887 - paravirt_map_pt_hook(KM_PTE0,__ptep, pfn); \
5888 - __ptep = __ptep + pte_index(address); \
5889 - __ptep; \
5890 -})
5891 -#define pte_offset_map_nested(dir, address) \
5892 -({ \
5893 - pte_t *__ptep; \
5894 - unsigned pfn = pmd_val(*(dir)) >> PAGE_SHIFT; \
5895 - __ptep = (pte_t *)kmap_atomic_pte(pfn_to_page(pfn),KM_PTE1); \
5896 - paravirt_map_pt_hook(KM_PTE1,__ptep, pfn); \
5897 - __ptep = __ptep + pte_index(address); \
5898 - __ptep; \
5899 -})
5900 +#define pte_offset_map(dir, address) \
5901 + ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + pte_index(address))
5902 +#define pte_offset_map_nested(dir, address) \
5903 + ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE1) + pte_index(address))
5904 #define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0)
5905 #define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1)
5906 #else
5907 @@ -587,10 +597,6 @@
5908 #define io_remap_pfn_range(vma,from,pfn,size,prot) \
5909 direct_remap_pfn_range(vma,from,pfn,size,prot,DOMID_IO)
5910
5911 -#define MK_IOSPACE_PFN(space, pfn) (pfn)
5912 -#define GET_IOSPACE(pfn) 0
5913 -#define GET_PFN(pfn) (pfn)
5914 -
5915 #include <asm-generic/pgtable.h>
5916
5917 #endif /* _I386_PGTABLE_H */
5918 --- a/include/asm-x86/mach-xen/asm/pgtable_64.h
5919 +++ b/include/asm-x86/mach-xen/asm/pgtable_64.h
5920 @@ -1,12 +1,14 @@
5921 #ifndef _X86_64_PGTABLE_H
5922 #define _X86_64_PGTABLE_H
5923
5924 +#include <linux/const.h>
5925 +#ifndef __ASSEMBLY__
5926 +
5927 /*
5928 * This file contains the functions and defines necessary to modify and use
5929 * the x86-64 page table tree.
5930 */
5931 #include <asm/processor.h>
5932 -#include <asm/fixmap.h>
5933 #include <asm/bitops.h>
5934 #include <linux/threads.h>
5935 #include <linux/sched.h>
5936 @@ -34,11 +36,9 @@
5937 #endif
5938
5939 extern pud_t level3_kernel_pgt[512];
5940 -extern pud_t level3_physmem_pgt[512];
5941 extern pud_t level3_ident_pgt[512];
5942 extern pmd_t level2_kernel_pgt[512];
5943 extern pgd_t init_level4_pgt[];
5944 -extern pgd_t boot_level4_pgt[];
5945 extern unsigned long __supported_pte_mask;
5946
5947 #define swapper_pg_dir init_level4_pgt
5948 @@ -53,6 +53,8 @@
5949 extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
5950 #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
5951
5952 +#endif /* !__ASSEMBLY__ */
5953 +
5954 /*
5955 * PGDIR_SHIFT determines what a top-level page table entry can map
5956 */
5957 @@ -77,6 +79,8 @@
5958 */
5959 #define PTRS_PER_PTE 512
5960
5961 +#ifndef __ASSEMBLY__
5962 +
5963 #define pte_ERROR(e) \
5964 printk("%s:%d: bad pte %p(%016lx pfn %010lx).\n", __FILE__, __LINE__, \
5965 &(e), __pte_val(e), pte_pfn(e))
5966 @@ -119,22 +123,23 @@
5967
5968 #define pte_pgprot(a) (__pgprot((a).pte & ~PHYSICAL_PAGE_MASK))
5969
5970 -#define PMD_SIZE (1UL << PMD_SHIFT)
5971 +#endif /* !__ASSEMBLY__ */
5972 +
5973 +#define PMD_SIZE (_AC(1,UL) << PMD_SHIFT)
5974 #define PMD_MASK (~(PMD_SIZE-1))
5975 -#define PUD_SIZE (1UL << PUD_SHIFT)
5976 +#define PUD_SIZE (_AC(1,UL) << PUD_SHIFT)
5977 #define PUD_MASK (~(PUD_SIZE-1))
5978 -#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
5979 +#define PGDIR_SIZE (_AC(1,UL) << PGDIR_SHIFT)
5980 #define PGDIR_MASK (~(PGDIR_SIZE-1))
5981
5982 #define USER_PTRS_PER_PGD ((TASK_SIZE-1)/PGDIR_SIZE+1)
5983 #define FIRST_USER_ADDRESS 0
5984
5985 -#ifndef __ASSEMBLY__
5986 -#define MAXMEM 0x3fffffffffffUL
5987 -#define VMALLOC_START 0xffffc20000000000UL
5988 -#define VMALLOC_END 0xffffe1ffffffffffUL
5989 -#define MODULES_VADDR 0xffffffff88000000UL
5990 -#define MODULES_END 0xfffffffffff00000UL
5991 +#define MAXMEM _AC(0x3fffffffffff, UL)
5992 +#define VMALLOC_START _AC(0xffffc20000000000, UL)
5993 +#define VMALLOC_END _AC(0xffffe1ffffffffff, UL)
5994 +#define MODULES_VADDR _AC(0xffffffff88000000, UL)
5995 +#define MODULES_END _AC(0xfffffffffff00000, UL)
5996 #define MODULES_LEN (MODULES_END - MODULES_VADDR)
5997
5998 #define _PAGE_BIT_PRESENT 0
5999 @@ -160,16 +165,18 @@
6000 #define _PAGE_GLOBAL 0x100 /* Global TLB entry */
6001
6002 #define _PAGE_PROTNONE 0x080 /* If not present */
6003 -#define _PAGE_NX (1UL<<_PAGE_BIT_NX)
6004 +#define _PAGE_NX (_AC(1,UL)<<_PAGE_BIT_NX)
6005
6006 /* Mapped page is I/O or foreign and has no associated page struct. */
6007 #define _PAGE_IO 0x200
6008
6009 +#ifndef __ASSEMBLY__
6010 #if CONFIG_XEN_COMPAT <= 0x030002
6011 extern unsigned int __kernel_page_user;
6012 #else
6013 #define __kernel_page_user 0
6014 #endif
6015 +#endif
6016
6017 #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
6018 #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | __kernel_page_user)
6019 @@ -234,6 +241,8 @@
6020 #define __S110 PAGE_SHARED_EXEC
6021 #define __S111 PAGE_SHARED_EXEC
6022
6023 +#ifndef __ASSEMBLY__
6024 +
6025 static inline unsigned long pgd_bad(pgd_t pgd)
6026 {
6027 return __pgd_val(pgd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
6028 @@ -345,6 +354,20 @@
6029 static inline pte_t pte_mkhuge(pte_t pte) { __pte_val(pte) |= _PAGE_PSE; return pte; }
6030 static inline pte_t pte_clrhuge(pte_t pte) { __pte_val(pte) &= ~_PAGE_PSE; return pte; }
6031
6032 +static inline int ptep_test_and_clear_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
6033 +{
6034 + if (!pte_dirty(*ptep))
6035 + return 0;
6036 + return test_and_clear_bit(_PAGE_BIT_DIRTY, &ptep->pte);
6037 +}
6038 +
6039 +static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
6040 +{
6041 + if (!pte_young(*ptep))
6042 + return 0;
6043 + return test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte);
6044 +}
6045 +
6046 static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
6047 {
6048 pte_t pte = *ptep;
6049 @@ -470,18 +493,12 @@
6050 * bit at the same time. */
6051 #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
6052 #define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
6053 - do { \
6054 - if (dirty) \
6055 - ptep_establish(vma, address, ptep, entry); \
6056 - } while (0)
6057 -
6058 -
6059 -/*
6060 - * i386 says: We don't actually have these, but we want to advertise
6061 - * them so that we can encompass the flush here.
6062 - */
6063 -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
6064 -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
6065 +({ \
6066 + int __changed = !pte_same(*(ptep), entry); \
6067 + if (__changed && (dirty)) \
6068 + ptep_establish(vma, address, ptep, entry); \
6069 + __changed; \
6070 +})
6071
6072 #define __HAVE_ARCH_PTEP_CLEAR_DIRTY_FLUSH
6073 #define ptep_clear_flush_dirty(vma, address, ptep) \
6074 @@ -490,7 +507,7 @@
6075 int __dirty = pte_dirty(__pte); \
6076 __pte = pte_mkclean(__pte); \
6077 if ((vma)->vm_mm->context.pinned) \
6078 - ptep_set_access_flags(vma, address, ptep, __pte, __dirty); \
6079 + (void)ptep_set_access_flags(vma, address, ptep, __pte, __dirty); \
6080 else if (__dirty) \
6081 set_pte(ptep, __pte); \
6082 __dirty; \
6083 @@ -503,7 +520,7 @@
6084 int __young = pte_young(__pte); \
6085 __pte = pte_mkold(__pte); \
6086 if ((vma)->vm_mm->context.pinned) \
6087 - ptep_set_access_flags(vma, address, ptep, __pte, __young); \
6088 + (void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \
6089 else if (__young) \
6090 set_pte(ptep, __pte); \
6091 __young; \
6092 @@ -517,10 +534,7 @@
6093 #define __swp_entry_to_pte(x) ((pte_t) { (x).val })
6094
6095 extern spinlock_t pgd_lock;
6096 -extern struct page *pgd_list;
6097 -void vmalloc_sync_all(void);
6098 -
6099 -#endif /* !__ASSEMBLY__ */
6100 +extern struct list_head pgd_list;
6101
6102 extern int kern_addr_valid(unsigned long addr);
6103
6104 @@ -559,10 +573,6 @@
6105 #define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
6106 direct_remap_pfn_range(vma,vaddr,pfn,size,prot,DOMID_IO)
6107
6108 -#define MK_IOSPACE_PFN(space, pfn) (pfn)
6109 -#define GET_IOSPACE(pfn) 0
6110 -#define GET_PFN(pfn) (pfn)
6111 -
6112 #define HAVE_ARCH_UNMAPPED_AREA
6113
6114 #define pgtable_cache_init() do { } while (0)
6115 @@ -576,11 +586,14 @@
6116 #define kc_offset_to_vaddr(o) \
6117 (((o) & (1UL << (__VIRTUAL_MASK_SHIFT-1))) ? ((o) | (~__VIRTUAL_MASK)) : (o))
6118
6119 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
6120 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
6121 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
6122 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
6123 #define __HAVE_ARCH_PTEP_CLEAR_FLUSH
6124 #define __HAVE_ARCH_PTEP_SET_WRPROTECT
6125 #define __HAVE_ARCH_PTE_SAME
6126 #include <asm-generic/pgtable.h>
6127 +#endif /* !__ASSEMBLY__ */
6128
6129 #endif /* _X86_64_PGTABLE_H */
6130 --- a/include/asm-x86/mach-xen/asm/processor_32.h
6131 +++ b/include/asm-x86/mach-xen/asm/processor_32.h
6132 @@ -21,6 +21,7 @@
6133 #include <asm/percpu.h>
6134 #include <linux/cpumask.h>
6135 #include <linux/init.h>
6136 +#include <asm/processor-flags.h>
6137 #include <xen/interface/physdev.h>
6138
6139 /* flag for disabling the tsc */
6140 @@ -118,7 +119,8 @@
6141
6142 void __init cpu_detect(struct cpuinfo_x86 *c);
6143
6144 -extern void identify_cpu(struct cpuinfo_x86 *);
6145 +extern void identify_boot_cpu(void);
6146 +extern void identify_secondary_cpu(struct cpuinfo_x86 *);
6147 extern void print_cpu_info(struct cpuinfo_x86 *);
6148 extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
6149 extern unsigned short num_cache_leaves;
6150 @@ -129,29 +131,8 @@
6151 static inline void detect_ht(struct cpuinfo_x86 *c) {}
6152 #endif
6153
6154 -/*
6155 - * EFLAGS bits
6156 - */
6157 -#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */
6158 -#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */
6159 -#define X86_EFLAGS_AF 0x00000010 /* Auxillary carry Flag */
6160 -#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */
6161 -#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */
6162 -#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */
6163 -#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */
6164 -#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */
6165 -#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */
6166 -#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */
6167 -#define X86_EFLAGS_NT 0x00004000 /* Nested Task */
6168 -#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */
6169 -#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */
6170 -#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */
6171 -#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */
6172 -#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */
6173 -#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */
6174 -
6175 -static inline fastcall void xen_cpuid(unsigned int *eax, unsigned int *ebx,
6176 - unsigned int *ecx, unsigned int *edx)
6177 +static inline void xen_cpuid(unsigned int *eax, unsigned int *ebx,
6178 + unsigned int *ecx, unsigned int *edx)
6179 {
6180 /* ecx is often an input as well as an output. */
6181 __asm__(XEN_CPUID
6182 @@ -165,21 +146,6 @@
6183 #define load_cr3(pgdir) write_cr3(__pa(pgdir))
6184
6185 /*
6186 - * Intel CPU features in CR4
6187 - */
6188 -#define X86_CR4_VME 0x0001 /* enable vm86 extensions */
6189 -#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */
6190 -#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */
6191 -#define X86_CR4_DE 0x0008 /* enable debugging extensions */
6192 -#define X86_CR4_PSE 0x0010 /* enable page size extensions */
6193 -#define X86_CR4_PAE 0x0020 /* enable physical address extensions */
6194 -#define X86_CR4_MCE 0x0040 /* Machine check enable */
6195 -#define X86_CR4_PGE 0x0080 /* enable global pages */
6196 -#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */
6197 -#define X86_CR4_OSFXSR 0x0200 /* enable fast FPU save and restore */
6198 -#define X86_CR4_OSXMMEXCPT 0x0400 /* enable unmasked SSE exceptions */
6199 -
6200 -/*
6201 * Save the cr4 feature set we're using (ie
6202 * Pentium 4MB enable and PPro Global page
6203 * enable), so that any CPU's that boot up
6204 @@ -206,26 +172,6 @@
6205 }
6206
6207 /*
6208 - * NSC/Cyrix CPU configuration register indexes
6209 - */
6210 -
6211 -#define CX86_PCR0 0x20
6212 -#define CX86_GCR 0xb8
6213 -#define CX86_CCR0 0xc0
6214 -#define CX86_CCR1 0xc1
6215 -#define CX86_CCR2 0xc2
6216 -#define CX86_CCR3 0xc3
6217 -#define CX86_CCR4 0xe8
6218 -#define CX86_CCR5 0xe9
6219 -#define CX86_CCR6 0xea
6220 -#define CX86_CCR7 0xeb
6221 -#define CX86_PCR1 0xf0
6222 -#define CX86_DIR0 0xfe
6223 -#define CX86_DIR1 0xff
6224 -#define CX86_ARR_BASE 0xc4
6225 -#define CX86_RCR_BASE 0xdc
6226 -
6227 -/*
6228 * NSC/Cyrix CPU indexed register access macros
6229 */
6230
6231 @@ -351,7 +297,8 @@
6232 struct thread_struct;
6233
6234 #ifndef CONFIG_X86_NO_TSS
6235 -struct tss_struct {
6236 +/* This is the TSS defined by the hardware. */
6237 +struct i386_hw_tss {
6238 unsigned short back_link,__blh;
6239 unsigned long esp0;
6240 unsigned short ss0,__ss0h;
6241 @@ -375,6 +322,11 @@
6242 unsigned short gs, __gsh;
6243 unsigned short ldt, __ldth;
6244 unsigned short trace, io_bitmap_base;
6245 +} __attribute__((packed));
6246 +
6247 +struct tss_struct {
6248 + struct i386_hw_tss x86_tss;
6249 +
6250 /*
6251 * The extra 1 is there because the CPU will access an
6252 * additional byte beyond the end of the IO permission
6253 @@ -428,10 +380,11 @@
6254 };
6255
6256 #define INIT_THREAD { \
6257 + .esp0 = sizeof(init_stack) + (long)&init_stack, \
6258 .vm86_info = NULL, \
6259 .sysenter_cs = __KERNEL_CS, \
6260 .io_bitmap_ptr = NULL, \
6261 - .fs = __KERNEL_PDA, \
6262 + .fs = __KERNEL_PERCPU, \
6263 }
6264
6265 /*
6266 @@ -441,10 +394,12 @@
6267 * be within the limit.
6268 */
6269 #define INIT_TSS { \
6270 - .esp0 = sizeof(init_stack) + (long)&init_stack, \
6271 - .ss0 = __KERNEL_DS, \
6272 - .ss1 = __KERNEL_CS, \
6273 - .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
6274 + .x86_tss = { \
6275 + .esp0 = sizeof(init_stack) + (long)&init_stack, \
6276 + .ss0 = __KERNEL_DS, \
6277 + .ss1 = __KERNEL_CS, \
6278 + .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
6279 + }, \
6280 .io_bitmap = { [ 0 ... IO_BITMAP_LONGS] = ~0 }, \
6281 }
6282
6283 @@ -551,38 +506,33 @@
6284
6285 #define cpu_relax() rep_nop()
6286
6287 -#define paravirt_enabled() 0
6288 -#define __cpuid xen_cpuid
6289 -
6290 #ifndef CONFIG_X86_NO_TSS
6291 -static inline void __load_esp0(struct tss_struct *tss, struct thread_struct *thread)
6292 +static inline void native_load_esp0(struct tss_struct *tss, struct thread_struct *thread)
6293 {
6294 - tss->esp0 = thread->esp0;
6295 + tss->x86_tss.esp0 = thread->esp0;
6296 /* This can only happen when SEP is enabled, no need to test "SEP"arately */
6297 - if (unlikely(tss->ss1 != thread->sysenter_cs)) {
6298 - tss->ss1 = thread->sysenter_cs;
6299 + if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
6300 + tss->x86_tss.ss1 = thread->sysenter_cs;
6301 wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
6302 }
6303 }
6304 -#define load_esp0(tss, thread) \
6305 - __load_esp0(tss, thread)
6306 #else
6307 -#define load_esp0(tss, thread) do { \
6308 +#define xen_load_esp0(tss, thread) do { \
6309 if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->esp0)) \
6310 BUG(); \
6311 } while (0)
6312 #endif
6313
6314
6315 -/*
6316 - * These special macros can be used to get or set a debugging register
6317 - */
6318 -#define get_debugreg(var, register) \
6319 - (var) = HYPERVISOR_get_debugreg(register)
6320 -#define set_debugreg(value, register) \
6321 - WARN_ON(HYPERVISOR_set_debugreg(register, value))
6322 +static inline unsigned long xen_get_debugreg(int regno)
6323 +{
6324 + return HYPERVISOR_get_debugreg(regno);
6325 +}
6326
6327 -#define set_iopl_mask xen_set_iopl_mask
6328 +static inline void xen_set_debugreg(int regno, unsigned long value)
6329 +{
6330 + WARN_ON(HYPERVISOR_set_debugreg(regno, value));
6331 +}
6332
6333 /*
6334 * Set IOPL bits in EFLAGS from given mask
6335 @@ -597,6 +547,21 @@
6336 }
6337
6338
6339 +#define paravirt_enabled() 0
6340 +#define __cpuid xen_cpuid
6341 +
6342 +#define load_esp0 xen_load_esp0
6343 +
6344 +/*
6345 + * These special macros can be used to get or set a debugging register
6346 + */
6347 +#define get_debugreg(var, register) \
6348 + (var) = xen_get_debugreg(register)
6349 +#define set_debugreg(value, register) \
6350 + xen_set_debugreg(register, value)
6351 +
6352 +#define set_iopl_mask xen_set_iopl_mask
6353 +
6354 /*
6355 * Generic CPUID function
6356 * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
6357 @@ -749,8 +714,14 @@
6358 extern void enable_sep_cpu(void);
6359 extern int sysenter_setup(void);
6360
6361 -extern int init_gdt(int cpu, struct task_struct *idle);
6362 +/* Defined in head.S */
6363 +extern struct Xgt_desc_struct early_gdt_descr;
6364 +
6365 extern void cpu_set_gdt(int);
6366 -extern void secondary_cpu_init(void);
6367 +extern void switch_to_new_gdt(void);
6368 +extern void cpu_init(void);
6369 +extern void init_gdt(int cpu);
6370 +
6371 +extern int force_mwait;
6372
6373 #endif /* __ASM_I386_PROCESSOR_H */
6374 --- a/include/asm-x86/mach-xen/asm/processor_64.h
6375 +++ b/include/asm-x86/mach-xen/asm/processor_64.h
6376 @@ -20,6 +20,7 @@
6377 #include <asm/percpu.h>
6378 #include <linux/personality.h>
6379 #include <linux/cpumask.h>
6380 +#include <asm/processor-flags.h>
6381
6382 #define TF_MASK 0x00000100
6383 #define IF_MASK 0x00000200
6384 @@ -103,42 +104,6 @@
6385 extern unsigned short num_cache_leaves;
6386
6387 /*
6388 - * EFLAGS bits
6389 - */
6390 -#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */
6391 -#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */
6392 -#define X86_EFLAGS_AF 0x00000010 /* Auxillary carry Flag */
6393 -#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */
6394 -#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */
6395 -#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */
6396 -#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */
6397 -#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */
6398 -#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */
6399 -#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */
6400 -#define X86_EFLAGS_NT 0x00004000 /* Nested Task */
6401 -#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */
6402 -#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */
6403 -#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */
6404 -#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */
6405 -#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */
6406 -#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */
6407 -
6408 -/*
6409 - * Intel CPU features in CR4
6410 - */
6411 -#define X86_CR4_VME 0x0001 /* enable vm86 extensions */
6412 -#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */
6413 -#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */
6414 -#define X86_CR4_DE 0x0008 /* enable debugging extensions */
6415 -#define X86_CR4_PSE 0x0010 /* enable page size extensions */
6416 -#define X86_CR4_PAE 0x0020 /* enable physical address extensions */
6417 -#define X86_CR4_MCE 0x0040 /* Machine check enable */
6418 -#define X86_CR4_PGE 0x0080 /* enable global pages */
6419 -#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */
6420 -#define X86_CR4_OSFXSR 0x0200 /* enable fast FPU save and restore */
6421 -#define X86_CR4_OSXMMEXCPT 0x0400 /* enable unmasked SSE exceptions */
6422 -
6423 -/*
6424 * Save the cr4 feature set we're using (ie
6425 * Pentium 4MB enable and PPro Global page
6426 * enable), so that any CPU's that boot up
6427 @@ -203,7 +168,7 @@
6428 u32 mxcsr;
6429 u32 mxcsr_mask;
6430 u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
6431 - u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 128 bytes */
6432 + u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
6433 u32 padding[24];
6434 } __attribute__ ((aligned (16)));
6435
6436 @@ -436,22 +401,6 @@
6437 #define cpu_relax() rep_nop()
6438
6439 /*
6440 - * NSC/Cyrix CPU configuration register indexes
6441 - */
6442 -#define CX86_CCR0 0xc0
6443 -#define CX86_CCR1 0xc1
6444 -#define CX86_CCR2 0xc2
6445 -#define CX86_CCR3 0xc3
6446 -#define CX86_CCR4 0xe8
6447 -#define CX86_CCR5 0xe9
6448 -#define CX86_CCR6 0xea
6449 -#define CX86_CCR7 0xeb
6450 -#define CX86_DIR0 0xfe
6451 -#define CX86_DIR1 0xff
6452 -#define CX86_ARR_BASE 0xc4
6453 -#define CX86_RCR_BASE 0xdc
6454 -
6455 -/*
6456 * NSC/Cyrix CPU indexed register access macros
6457 */
6458
6459 --- a/include/asm-x86/mach-xen/asm/scatterlist_32.h
6460 +++ b/include/asm-x86/mach-xen/asm/scatterlist_32.h
6461 @@ -1,6 +1,8 @@
6462 #ifndef _I386_SCATTERLIST_H
6463 #define _I386_SCATTERLIST_H
6464
6465 +#include <asm/types.h>
6466 +
6467 struct scatterlist {
6468 struct page *page;
6469 unsigned int offset;
6470 --- a/include/asm-x86/mach-xen/asm/segment_32.h
6471 +++ b/include/asm-x86/mach-xen/asm/segment_32.h
6472 @@ -39,7 +39,7 @@
6473 * 25 - APM BIOS support
6474 *
6475 * 26 - ESPFIX small SS
6476 - * 27 - PDA [ per-cpu private data area ]
6477 + * 27 - per-cpu [ offset to per-cpu data area ]
6478 * 28 - unused
6479 * 29 - unused
6480 * 30 - unused
6481 @@ -74,8 +74,12 @@
6482 #define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14)
6483 #define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
6484
6485 -#define GDT_ENTRY_PDA (GDT_ENTRY_KERNEL_BASE + 15)
6486 -#define __KERNEL_PDA (GDT_ENTRY_PDA * 8)
6487 +#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE + 15)
6488 +#ifdef CONFIG_SMP
6489 +#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8)
6490 +#else
6491 +#define __KERNEL_PERCPU 0
6492 +#endif
6493
6494 #define GDT_ENTRY_DOUBLEFAULT_TSS 31
6495
6496 --- a/include/asm-x86/mach-xen/asm/smp_32.h
6497 +++ b/include/asm-x86/mach-xen/asm/smp_32.h
6498 @@ -8,19 +8,15 @@
6499 #include <linux/kernel.h>
6500 #include <linux/threads.h>
6501 #include <linux/cpumask.h>
6502 -#include <asm/pda.h>
6503 #endif
6504
6505 -#ifdef CONFIG_X86_LOCAL_APIC
6506 -#ifndef __ASSEMBLY__
6507 -#include <asm/fixmap.h>
6508 +#if defined(CONFIG_X86_LOCAL_APIC) && !defined(__ASSEMBLY__)
6509 #include <asm/bitops.h>
6510 #include <asm/mpspec.h>
6511 +#include <asm/apic.h>
6512 #ifdef CONFIG_X86_IO_APIC
6513 #include <asm/io_apic.h>
6514 #endif
6515 -#include <asm/apic.h>
6516 -#endif
6517 #endif
6518
6519 #define BAD_APICID 0xFFu
6520 @@ -52,9 +48,76 @@
6521 extern void cpu_uninit(void);
6522 #endif
6523
6524 -#ifndef CONFIG_PARAVIRT
6525 +#ifndef CONFIG_XEN
6526 +struct smp_ops
6527 +{
6528 + void (*smp_prepare_boot_cpu)(void);
6529 + void (*smp_prepare_cpus)(unsigned max_cpus);
6530 + int (*cpu_up)(unsigned cpu);
6531 + void (*smp_cpus_done)(unsigned max_cpus);
6532 +
6533 + void (*smp_send_stop)(void);
6534 + void (*smp_send_reschedule)(int cpu);
6535 + int (*smp_call_function_mask)(cpumask_t mask,
6536 + void (*func)(void *info), void *info,
6537 + int wait);
6538 +};
6539 +
6540 +extern struct smp_ops smp_ops;
6541 +
6542 +static inline void smp_prepare_boot_cpu(void)
6543 +{
6544 + smp_ops.smp_prepare_boot_cpu();
6545 +}
6546 +static inline void smp_prepare_cpus(unsigned int max_cpus)
6547 +{
6548 + smp_ops.smp_prepare_cpus(max_cpus);
6549 +}
6550 +static inline int __cpu_up(unsigned int cpu)
6551 +{
6552 + return smp_ops.cpu_up(cpu);
6553 +}
6554 +static inline void smp_cpus_done(unsigned int max_cpus)
6555 +{
6556 + smp_ops.smp_cpus_done(max_cpus);
6557 +}
6558 +
6559 +static inline void smp_send_stop(void)
6560 +{
6561 + smp_ops.smp_send_stop();
6562 +}
6563 +static inline void smp_send_reschedule(int cpu)
6564 +{
6565 + smp_ops.smp_send_reschedule(cpu);
6566 +}
6567 +static inline int smp_call_function_mask(cpumask_t mask,
6568 + void (*func) (void *info), void *info,
6569 + int wait)
6570 +{
6571 + return smp_ops.smp_call_function_mask(mask, func, info, wait);
6572 +}
6573 +
6574 +void native_smp_prepare_boot_cpu(void);
6575 +void native_smp_prepare_cpus(unsigned int max_cpus);
6576 +int native_cpu_up(unsigned int cpunum);
6577 +void native_smp_cpus_done(unsigned int max_cpus);
6578 +
6579 #define startup_ipi_hook(phys_apicid, start_eip, start_esp) \
6580 do { } while (0)
6581 +
6582 +#else
6583 +
6584 +
6585 +void xen_smp_send_stop(void);
6586 +void xen_smp_send_reschedule(int cpu);
6587 +int xen_smp_call_function_mask(cpumask_t mask,
6588 + void (*func) (void *info), void *info,
6589 + int wait);
6590 +
6591 +#define smp_send_stop xen_smp_send_stop
6592 +#define smp_send_reschedule xen_smp_send_reschedule
6593 +#define smp_call_function_mask xen_smp_call_function_mask
6594 +
6595 #endif
6596
6597 /*
6598 @@ -62,7 +125,8 @@
6599 * from the initial startup. We map APIC_BASE very early in page_setup(),
6600 * so this is correct in the x86 case.
6601 */
6602 -#define raw_smp_processor_id() (read_pda(cpu_number))
6603 +DECLARE_PER_CPU(int, cpu_number);
6604 +#define raw_smp_processor_id() (x86_read_percpu(cpu_number))
6605
6606 extern cpumask_t cpu_possible_map;
6607 #define cpu_callin_map cpu_possible_map
6608 @@ -73,20 +137,6 @@
6609 return cpus_weight(cpu_possible_map);
6610 }
6611
6612 -#ifdef CONFIG_X86_LOCAL_APIC
6613 -
6614 -#ifdef APIC_DEFINITION
6615 -extern int hard_smp_processor_id(void);
6616 -#else
6617 -#include <mach_apicdef.h>
6618 -static inline int hard_smp_processor_id(void)
6619 -{
6620 - /* we don't want to mark this access volatile - bad code generation */
6621 - return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID));
6622 -}
6623 -#endif
6624 -#endif
6625 -
6626 #define safe_smp_processor_id() smp_processor_id()
6627 extern int __cpu_disable(void);
6628 extern void __cpu_die(unsigned int cpu);
6629 @@ -102,10 +152,31 @@
6630
6631 #define NO_PROC_ID 0xFF /* No processor magic marker */
6632
6633 -#endif
6634 +#endif /* CONFIG_SMP */
6635
6636 #ifndef __ASSEMBLY__
6637
6638 +#ifdef CONFIG_X86_LOCAL_APIC
6639 +
6640 +#ifdef APIC_DEFINITION
6641 +extern int hard_smp_processor_id(void);
6642 +#else
6643 +#include <mach_apicdef.h>
6644 +static inline int hard_smp_processor_id(void)
6645 +{
6646 + /* we don't want to mark this access volatile - bad code generation */
6647 + return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID));
6648 +}
6649 +#endif /* APIC_DEFINITION */
6650 +
6651 +#else /* CONFIG_X86_LOCAL_APIC */
6652 +
6653 +#ifndef CONFIG_SMP
6654 +#define hard_smp_processor_id() 0
6655 +#endif
6656 +
6657 +#endif /* CONFIG_X86_LOCAL_APIC */
6658 +
6659 extern u8 apicid_2_node[];
6660
6661 #ifdef CONFIG_X86_LOCAL_APIC
6662 --- a/include/asm-x86/mach-xen/asm/smp_64.h
6663 +++ b/include/asm-x86/mach-xen/asm/smp_64.h
6664 @@ -11,12 +11,11 @@
6665 extern int disable_apic;
6666
6667 #ifdef CONFIG_X86_LOCAL_APIC
6668 -#include <asm/fixmap.h>
6669 #include <asm/mpspec.h>
6670 +#include <asm/apic.h>
6671 #ifdef CONFIG_X86_IO_APIC
6672 #include <asm/io_apic.h>
6673 #endif
6674 -#include <asm/apic.h>
6675 #include <asm/thread_info.h>
6676 #endif
6677
6678 @@ -41,7 +40,6 @@
6679 extern void unlock_ipi_call_lock(void);
6680 extern int smp_num_siblings;
6681 extern void smp_send_reschedule(int cpu);
6682 -void smp_stop_cpu(void);
6683
6684 extern cpumask_t cpu_sibling_map[NR_CPUS];
6685 extern cpumask_t cpu_core_map[NR_CPUS];
6686 @@ -62,14 +60,6 @@
6687
6688 #define raw_smp_processor_id() read_pda(cpunumber)
6689
6690 -#ifdef CONFIG_X86_LOCAL_APIC
6691 -static inline int hard_smp_processor_id(void)
6692 -{
6693 - /* we don't want to mark this access volatile - bad code generation */
6694 - return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID));
6695 -}
6696 -#endif
6697 -
6698 extern int __cpu_disable(void);
6699 extern void __cpu_die(unsigned int cpu);
6700 extern void prefill_possible_map(void);
6701 @@ -78,6 +68,14 @@
6702
6703 #define NO_PROC_ID 0xFF /* No processor magic marker */
6704
6705 +#endif /* CONFIG_SMP */
6706 +
6707 +#ifdef CONFIG_X86_LOCAL_APIC
6708 +static inline int hard_smp_processor_id(void)
6709 +{
6710 + /* we don't want to mark this access volatile - bad code generation */
6711 + return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID));
6712 +}
6713 #endif
6714
6715 /*
6716 --- a/include/asm-x86/mach-xen/asm/system_32.h
6717 +++ b/include/asm-x86/mach-xen/asm/system_32.h
6718 @@ -4,7 +4,7 @@
6719 #include <linux/kernel.h>
6720 #include <asm/segment.h>
6721 #include <asm/cpufeature.h>
6722 -#include <linux/bitops.h> /* for LOCK_PREFIX */
6723 +#include <asm/cmpxchg.h>
6724 #include <asm/synch_bitops.h>
6725 #include <asm/hypervisor.h>
6726
6727 @@ -90,308 +90,102 @@
6728 #define savesegment(seg, value) \
6729 asm volatile("mov %%" #seg ",%0":"=rm" (value))
6730
6731 -#define read_cr0() ({ \
6732 - unsigned int __dummy; \
6733 - __asm__ __volatile__( \
6734 - "movl %%cr0,%0\n\t" \
6735 - :"=r" (__dummy)); \
6736 - __dummy; \
6737 -})
6738 -#define write_cr0(x) \
6739 - __asm__ __volatile__("movl %0,%%cr0": :"r" (x))
6740 -
6741 -#define read_cr2() (current_vcpu_info()->arch.cr2)
6742 -#define write_cr2(x) \
6743 - __asm__ __volatile__("movl %0,%%cr2": :"r" (x))
6744 -
6745 -#define read_cr3() ({ \
6746 - unsigned int __dummy; \
6747 - __asm__ ( \
6748 - "movl %%cr3,%0\n\t" \
6749 - :"=r" (__dummy)); \
6750 - __dummy = xen_cr3_to_pfn(__dummy); \
6751 - mfn_to_pfn(__dummy) << PAGE_SHIFT; \
6752 -})
6753 -#define write_cr3(x) ({ \
6754 - unsigned int __dummy = pfn_to_mfn((x) >> PAGE_SHIFT); \
6755 - __dummy = xen_pfn_to_cr3(__dummy); \
6756 - __asm__ __volatile__("movl %0,%%cr3": :"r" (__dummy)); \
6757 -})
6758 -#define read_cr4() ({ \
6759 - unsigned int __dummy; \
6760 - __asm__( \
6761 - "movl %%cr4,%0\n\t" \
6762 - :"=r" (__dummy)); \
6763 - __dummy; \
6764 -})
6765 -#define read_cr4_safe() ({ \
6766 - unsigned int __dummy; \
6767 - /* This could fault if %cr4 does not exist */ \
6768 - __asm__("1: movl %%cr4, %0 \n" \
6769 - "2: \n" \
6770 - ".section __ex_table,\"a\" \n" \
6771 - ".long 1b,2b \n" \
6772 - ".previous \n" \
6773 - : "=r" (__dummy): "0" (0)); \
6774 - __dummy; \
6775 -})
6776 -
6777 -#define write_cr4(x) \
6778 - __asm__ __volatile__("movl %0,%%cr4": :"r" (x))
6779 -
6780 -#define wbinvd() \
6781 - __asm__ __volatile__ ("wbinvd": : :"memory")
6782 -
6783 -/* Clear the 'TS' bit */
6784 -#define clts() (HYPERVISOR_fpu_taskswitch(0))
6785 -
6786 -/* Set the 'TS' bit */
6787 -#define stts() (HYPERVISOR_fpu_taskswitch(1))
6788 -
6789 -#endif /* __KERNEL__ */
6790 -
6791 -static inline unsigned long get_limit(unsigned long segment)
6792 +static inline void xen_clts(void)
6793 {
6794 - unsigned long __limit;
6795 - __asm__("lsll %1,%0"
6796 - :"=r" (__limit):"r" (segment));
6797 - return __limit+1;
6798 + HYPERVISOR_fpu_taskswitch(0);
6799 }
6800
6801 -#define nop() __asm__ __volatile__ ("nop")
6802 -
6803 -#define xchg(ptr,v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr))))
6804 -
6805 -#define tas(ptr) (xchg((ptr),1))
6806 -
6807 -struct __xchg_dummy { unsigned long a[100]; };
6808 -#define __xg(x) ((struct __xchg_dummy *)(x))
6809 +static inline unsigned long xen_read_cr0(void)
6810 +{
6811 + unsigned long val;
6812 + asm volatile("movl %%cr0,%0\n\t" :"=r" (val));
6813 + return val;
6814 +}
6815
6816 +static inline void xen_write_cr0(unsigned long val)
6817 +{
6818 + asm volatile("movl %0,%%cr0": :"r" (val));
6819 +}
6820
6821 -#ifdef CONFIG_X86_CMPXCHG64
6822 +#define xen_read_cr2() (current_vcpu_info()->arch.cr2)
6823
6824 -/*
6825 - * The semantics of XCHGCMP8B are a bit strange, this is why
6826 - * there is a loop and the loading of %%eax and %%edx has to
6827 - * be inside. This inlines well in most cases, the cached
6828 - * cost is around ~38 cycles. (in the future we might want
6829 - * to do an SIMD/3DNOW!/MMX/FPU 64-bit store here, but that
6830 - * might have an implicit FPU-save as a cost, so it's not
6831 - * clear which path to go.)
6832 - *
6833 - * cmpxchg8b must be used with the lock prefix here to allow
6834 - * the instruction to be executed atomically, see page 3-102
6835 - * of the instruction set reference 24319102.pdf. We need
6836 - * the reader side to see the coherent 64bit value.
6837 - */
6838 -static inline void __set_64bit (unsigned long long * ptr,
6839 - unsigned int low, unsigned int high)
6840 +static inline void xen_write_cr2(unsigned long val)
6841 {
6842 - __asm__ __volatile__ (
6843 - "\n1:\t"
6844 - "movl (%0), %%eax\n\t"
6845 - "movl 4(%0), %%edx\n\t"
6846 - "lock cmpxchg8b (%0)\n\t"
6847 - "jnz 1b"
6848 - : /* no outputs */
6849 - : "D"(ptr),
6850 - "b"(low),
6851 - "c"(high)
6852 - : "ax","dx","memory");
6853 + asm volatile("movl %0,%%cr2": :"r" (val));
6854 }
6855
6856 -static inline void __set_64bit_constant (unsigned long long *ptr,
6857 - unsigned long long value)
6858 +static inline unsigned long xen_read_cr3(void)
6859 {
6860 - __set_64bit(ptr,(unsigned int)(value), (unsigned int)((value)>>32ULL));
6861 + unsigned long val;
6862 + asm volatile("movl %%cr3,%0\n\t" :"=r" (val));
6863 + return mfn_to_pfn(xen_cr3_to_pfn(val)) << PAGE_SHIFT;
6864 }
6865 -#define ll_low(x) *(((unsigned int*)&(x))+0)
6866 -#define ll_high(x) *(((unsigned int*)&(x))+1)
6867
6868 -static inline void __set_64bit_var (unsigned long long *ptr,
6869 - unsigned long long value)
6870 +static inline void xen_write_cr3(unsigned long val)
6871 {
6872 - __set_64bit(ptr,ll_low(value), ll_high(value));
6873 + val = xen_pfn_to_cr3(pfn_to_mfn(val >> PAGE_SHIFT));
6874 + asm volatile("movl %0,%%cr3": :"r" (val));
6875 }
6876
6877 -#define set_64bit(ptr,value) \
6878 -(__builtin_constant_p(value) ? \
6879 - __set_64bit_constant(ptr, value) : \
6880 - __set_64bit_var(ptr, value) )
6881 +static inline unsigned long xen_read_cr4(void)
6882 +{
6883 + unsigned long val;
6884 + asm volatile("movl %%cr4,%0\n\t" :"=r" (val));
6885 + return val;
6886 +}
6887
6888 -#define _set_64bit(ptr,value) \
6889 -(__builtin_constant_p(value) ? \
6890 - __set_64bit(ptr, (unsigned int)(value), (unsigned int)((value)>>32ULL) ) : \
6891 - __set_64bit(ptr, ll_low(value), ll_high(value)) )
6892 +static inline unsigned long xen_read_cr4_safe(void)
6893 +{
6894 + unsigned long val;
6895 + /* This could fault if %cr4 does not exist */
6896 + asm("1: movl %%cr4, %0 \n"
6897 + "2: \n"
6898 + ".section __ex_table,\"a\" \n"
6899 + ".long 1b,2b \n"
6900 + ".previous \n"
6901 + : "=r" (val): "0" (0));
6902 + return val;
6903 +}
6904
6905 -#endif
6906 +static inline void xen_write_cr4(unsigned long val)
6907 +{
6908 + asm volatile("movl %0,%%cr4": :"r" (val));
6909 +}
6910
6911 -/*
6912 - * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
6913 - * Note 2: xchg has side effect, so that attribute volatile is necessary,
6914 - * but generally the primitive is invalid, *ptr is output argument. --ANK
6915 - */
6916 -static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size)
6917 +static inline void xen_wbinvd(void)
6918 {
6919 - switch (size) {
6920 - case 1:
6921 - __asm__ __volatile__("xchgb %b0,%1"
6922 - :"=q" (x)
6923 - :"m" (*__xg(ptr)), "0" (x)
6924 - :"memory");
6925 - break;
6926 - case 2:
6927 - __asm__ __volatile__("xchgw %w0,%1"
6928 - :"=r" (x)
6929 - :"m" (*__xg(ptr)), "0" (x)
6930 - :"memory");
6931 - break;
6932 - case 4:
6933 - __asm__ __volatile__("xchgl %0,%1"
6934 - :"=r" (x)
6935 - :"m" (*__xg(ptr)), "0" (x)
6936 - :"memory");
6937 - break;
6938 - }
6939 - return x;
6940 + asm volatile("wbinvd": : :"memory");
6941 }
6942
6943 -/*
6944 - * Atomic compare and exchange. Compare OLD with MEM, if identical,
6945 - * store NEW in MEM. Return the initial value in MEM. Success is
6946 - * indicated by comparing RETURN with OLD.
6947 - */
6948 +#define read_cr0() (xen_read_cr0())
6949 +#define write_cr0(x) (xen_write_cr0(x))
6950 +#define read_cr2() (xen_read_cr2())
6951 +#define write_cr2(x) (xen_write_cr2(x))
6952 +#define read_cr3() (xen_read_cr3())
6953 +#define write_cr3(x) (xen_write_cr3(x))
6954 +#define read_cr4() (xen_read_cr4())
6955 +#define read_cr4_safe() (xen_read_cr4_safe())
6956 +#define write_cr4(x) (xen_write_cr4(x))
6957 +#define wbinvd() (xen_wbinvd())
6958
6959 -#ifdef CONFIG_X86_CMPXCHG
6960 -#define __HAVE_ARCH_CMPXCHG 1
6961 -#define cmpxchg(ptr,o,n)\
6962 - ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
6963 - (unsigned long)(n),sizeof(*(ptr))))
6964 -#define sync_cmpxchg(ptr,o,n)\
6965 - ((__typeof__(*(ptr)))__sync_cmpxchg((ptr),(unsigned long)(o),\
6966 - (unsigned long)(n),sizeof(*(ptr))))
6967 -#endif
6968 -
6969 -static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
6970 - unsigned long new, int size)
6971 -{
6972 - unsigned long prev;
6973 - switch (size) {
6974 - case 1:
6975 - __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
6976 - : "=a"(prev)
6977 - : "q"(new), "m"(*__xg(ptr)), "0"(old)
6978 - : "memory");
6979 - return prev;
6980 - case 2:
6981 - __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
6982 - : "=a"(prev)
6983 - : "r"(new), "m"(*__xg(ptr)), "0"(old)
6984 - : "memory");
6985 - return prev;
6986 - case 4:
6987 - __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %1,%2"
6988 - : "=a"(prev)
6989 - : "r"(new), "m"(*__xg(ptr)), "0"(old)
6990 - : "memory");
6991 - return prev;
6992 - }
6993 - return old;
6994 -}
6995 +/* Clear the 'TS' bit */
6996 +#define clts() (xen_clts())
6997
6998 -/*
6999 - * Always use locked operations when touching memory shared with a
7000 - * hypervisor, since the system may be SMP even if the guest kernel
7001 - * isn't.
7002 - */
7003 -static inline unsigned long __sync_cmpxchg(volatile void *ptr,
7004 - unsigned long old,
7005 - unsigned long new, int size)
7006 -{
7007 - unsigned long prev;
7008 - switch (size) {
7009 - case 1:
7010 - __asm__ __volatile__("lock; cmpxchgb %b1,%2"
7011 - : "=a"(prev)
7012 - : "q"(new), "m"(*__xg(ptr)), "0"(old)
7013 - : "memory");
7014 - return prev;
7015 - case 2:
7016 - __asm__ __volatile__("lock; cmpxchgw %w1,%2"
7017 - : "=a"(prev)
7018 - : "r"(new), "m"(*__xg(ptr)), "0"(old)
7019 - : "memory");
7020 - return prev;
7021 - case 4:
7022 - __asm__ __volatile__("lock; cmpxchgl %1,%2"
7023 - : "=a"(prev)
7024 - : "r"(new), "m"(*__xg(ptr)), "0"(old)
7025 - : "memory");
7026 - return prev;
7027 - }
7028 - return old;
7029 -}
7030 +/* Set the 'TS' bit */
7031 +#define stts() (HYPERVISOR_fpu_taskswitch(1))
7032
7033 -#ifndef CONFIG_X86_CMPXCHG
7034 -/*
7035 - * Building a kernel capable running on 80386. It may be necessary to
7036 - * simulate the cmpxchg on the 80386 CPU. For that purpose we define
7037 - * a function for each of the sizes we support.
7038 - */
7039 +#endif /* __KERNEL__ */
7040
7041 -extern unsigned long cmpxchg_386_u8(volatile void *, u8, u8);
7042 -extern unsigned long cmpxchg_386_u16(volatile void *, u16, u16);
7043 -extern unsigned long cmpxchg_386_u32(volatile void *, u32, u32);
7044 -
7045 -static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old,
7046 - unsigned long new, int size)
7047 -{
7048 - switch (size) {
7049 - case 1:
7050 - return cmpxchg_386_u8(ptr, old, new);
7051 - case 2:
7052 - return cmpxchg_386_u16(ptr, old, new);
7053 - case 4:
7054 - return cmpxchg_386_u32(ptr, old, new);
7055 - }
7056 - return old;
7057 -}
7058 -
7059 -#define cmpxchg(ptr,o,n) \
7060 -({ \
7061 - __typeof__(*(ptr)) __ret; \
7062 - if (likely(boot_cpu_data.x86 > 3)) \
7063 - __ret = __cmpxchg((ptr), (unsigned long)(o), \
7064 - (unsigned long)(n), sizeof(*(ptr))); \
7065 - else \
7066 - __ret = cmpxchg_386((ptr), (unsigned long)(o), \
7067 - (unsigned long)(n), sizeof(*(ptr))); \
7068 - __ret; \
7069 -})
7070 -#endif
7071 -
7072 -#ifdef CONFIG_X86_CMPXCHG64
7073 -
7074 -static inline unsigned long long __cmpxchg64(volatile void *ptr, unsigned long long old,
7075 - unsigned long long new)
7076 -{
7077 - unsigned long long prev;
7078 - __asm__ __volatile__(LOCK_PREFIX "cmpxchg8b %3"
7079 - : "=A"(prev)
7080 - : "b"((unsigned long)new),
7081 - "c"((unsigned long)(new >> 32)),
7082 - "m"(*__xg(ptr)),
7083 - "0"(old)
7084 - : "memory");
7085 - return prev;
7086 -}
7087 -
7088 -#define cmpxchg64(ptr,o,n)\
7089 - ((__typeof__(*(ptr)))__cmpxchg64((ptr),(unsigned long long)(o),\
7090 - (unsigned long long)(n)))
7091 +static inline unsigned long get_limit(unsigned long segment)
7092 +{
7093 + unsigned long __limit;
7094 + __asm__("lsll %1,%0"
7095 + :"=r" (__limit):"r" (segment));
7096 + return __limit+1;
7097 +}
7098 +
7099 +#define nop() __asm__ __volatile__ ("nop")
7100
7101 -#endif
7102 -
7103 /*
7104 * Force strict CPU ordering.
7105 * And yes, this is required on UP too when we're talking
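The system_32.h hunks above replace direct control-register access and clts/wbinvd with xen_* wrappers, so a paravirtualised guest never executes the privileged instructions itself; the macro simply forwards to a helper that ends up in a hypercall. The standalone C sketch below is only an analogue of that forwarding pattern, not kernel code: the fake_cr3 variable and hypercall_mmu_update_cr3() are invented purely for illustration.

/*
 * Standalone analogue of the accessor pattern shown above: callers use
 * read_cr3()/write_cr3(), the macro forwards to a xen_* helper, and the
 * helper performs a (here: faked) hypercall instead of touching %cr3.
 */
#include <stdio.h>

static unsigned long fake_cr3;           /* stands in for the real %cr3 */

/* Pretend hypercall: a real guest would trap into the hypervisor here. */
static void hypercall_mmu_update_cr3(unsigned long val)
{
	printf("hypercall: switch cr3 to %#lx\n", val);
	fake_cr3 = val;
}

static inline unsigned long xen_read_cr3(void)
{
	return fake_cr3;                 /* a real helper reads a shadowed value */
}

static inline void xen_write_cr3(unsigned long val)
{
	hypercall_mmu_update_cr3(val);
}

/* The header-level macros in the patch reduce to this kind of forwarding. */
#define read_cr3()      (xen_read_cr3())
#define write_cr3(x)    (xen_write_cr3(x))

int main(void)
{
	write_cr3(0x1000);
	printf("cr3 is now %#lx\n", read_cr3());
	return 0;
}

The point of the indirection is that the same call sites work whether the backend is a native instruction or a hypervisor call; the patch only swaps the backend.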
7106 --- a/include/asm-x86/mach-xen/asm/system_64.h
7107 +++ b/include/asm-x86/mach-xen/asm/system_64.h
7108 @@ -3,7 +3,7 @@
7109
7110 #include <linux/kernel.h>
7111 #include <asm/segment.h>
7112 -#include <asm/alternative.h>
7113 +#include <asm/cmpxchg.h>
7114
7115 #include <asm/synch_bitops.h>
7116 #include <asm/hypervisor.h>
7117 @@ -43,7 +43,7 @@
7118 [threadrsp] "i" (offsetof(struct task_struct, thread.rsp)), \
7119 [ti_flags] "i" (offsetof(struct thread_info, flags)),\
7120 [tif_fork] "i" (TIF_FORK), \
7121 - [thread_info] "i" (offsetof(struct task_struct, thread_info)), \
7122 + [thread_info] "i" (offsetof(struct task_struct, stack)), \
7123 [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \
7124 : "memory", "cc" __EXTRA_CLOBBER)
7125
7126 @@ -92,6 +92,12 @@
7127 machine_to_phys(__dummy); \
7128 })
7129
7130 +static inline void write_cr3(unsigned long val)
7131 +{
7132 + val = phys_to_machine(val);
7133 + asm volatile("movq %0,%%cr3" :: "r" (val) : "memory");
7134 +}
7135 +
7136 static inline unsigned long read_cr4(void)
7137 {
7138 unsigned long cr4;
7139 @@ -101,7 +107,7 @@
7140
7141 static inline void write_cr4(unsigned long val)
7142 {
7143 - asm volatile("movq %0,%%cr4" :: "r" (val));
7144 + asm volatile("movq %0,%%cr4" :: "r" (val) : "memory");
7145 }
7146
7147 #define stts() (HYPERVISOR_fpu_taskswitch(1))
7148 @@ -122,100 +128,6 @@
7149
7150 #define nop() __asm__ __volatile__ ("nop")
7151
7152 -#define xchg(ptr,v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr))))
7153 -
7154 -#define tas(ptr) (xchg((ptr),1))
7155 -
7156 -#define __xg(x) ((volatile long *)(x))
7157 -
7158 -static inline void set_64bit(volatile unsigned long *ptr, unsigned long val)
7159 -{
7160 - *ptr = val;
7161 -}
7162 -
7163 -#define _set_64bit set_64bit
7164 -
7165 -/*
7166 - * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
7167 - * Note 2: xchg has side effect, so that attribute volatile is necessary,
7168 - * but generally the primitive is invalid, *ptr is output argument. --ANK
7169 - */
7170 -static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size)
7171 -{
7172 - switch (size) {
7173 - case 1:
7174 - __asm__ __volatile__("xchgb %b0,%1"
7175 - :"=q" (x)
7176 - :"m" (*__xg(ptr)), "0" (x)
7177 - :"memory");
7178 - break;
7179 - case 2:
7180 - __asm__ __volatile__("xchgw %w0,%1"
7181 - :"=r" (x)
7182 - :"m" (*__xg(ptr)), "0" (x)
7183 - :"memory");
7184 - break;
7185 - case 4:
7186 - __asm__ __volatile__("xchgl %k0,%1"
7187 - :"=r" (x)
7188 - :"m" (*__xg(ptr)), "0" (x)
7189 - :"memory");
7190 - break;
7191 - case 8:
7192 - __asm__ __volatile__("xchgq %0,%1"
7193 - :"=r" (x)
7194 - :"m" (*__xg(ptr)), "0" (x)
7195 - :"memory");
7196 - break;
7197 - }
7198 - return x;
7199 -}
7200 -
7201 -/*
7202 - * Atomic compare and exchange. Compare OLD with MEM, if identical,
7203 - * store NEW in MEM. Return the initial value in MEM. Success is
7204 - * indicated by comparing RETURN with OLD.
7205 - */
7206 -
7207 -#define __HAVE_ARCH_CMPXCHG 1
7208 -
7209 -static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
7210 - unsigned long new, int size)
7211 -{
7212 - unsigned long prev;
7213 - switch (size) {
7214 - case 1:
7215 - __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
7216 - : "=a"(prev)
7217 - : "q"(new), "m"(*__xg(ptr)), "0"(old)
7218 - : "memory");
7219 - return prev;
7220 - case 2:
7221 - __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
7222 - : "=a"(prev)
7223 - : "r"(new), "m"(*__xg(ptr)), "0"(old)
7224 - : "memory");
7225 - return prev;
7226 - case 4:
7227 - __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %k1,%2"
7228 - : "=a"(prev)
7229 - : "r"(new), "m"(*__xg(ptr)), "0"(old)
7230 - : "memory");
7231 - return prev;
7232 - case 8:
7233 - __asm__ __volatile__(LOCK_PREFIX "cmpxchgq %1,%2"
7234 - : "=a"(prev)
7235 - : "r"(new), "m"(*__xg(ptr)), "0"(old)
7236 - : "memory");
7237 - return prev;
7238 - }
7239 - return old;
7240 -}
7241 -
7242 -#define cmpxchg(ptr,o,n)\
7243 - ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
7244 - (unsigned long)(n),sizeof(*(ptr))))
7245 -
7246 #ifdef CONFIG_SMP
7247 #define smp_mb() mb()
7248 #define smp_rmb() rmb()
7249 --- a/include/asm-x86/mach-xen/asm/tlbflush_32.h
7250 +++ b/include/asm-x86/mach-xen/asm/tlbflush_32.h
7251 @@ -29,8 +29,13 @@
7252 * and page-granular flushes are available only on i486 and up.
7253 */
7254
7255 +#define TLB_FLUSH_ALL 0xffffffff
7256 +
7257 +
7258 #ifndef CONFIG_SMP
7259
7260 +#include <linux/sched.h>
7261 +
7262 #define flush_tlb() __flush_tlb()
7263 #define flush_tlb_all() __flush_tlb_all()
7264 #define local_flush_tlb() __flush_tlb()
7265 @@ -55,7 +60,7 @@
7266 __flush_tlb();
7267 }
7268
7269 -#else
7270 +#else /* SMP */
7271
7272 #include <asm/smp.h>
7273
7274 @@ -84,9 +89,7 @@
7275 char __cacheline_padding[L1_CACHE_BYTES-8];
7276 };
7277 DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
7278 -
7279 -
7280 -#endif
7281 +#endif /* SMP */
7282
7283 #define flush_tlb_kernel_range(start, end) flush_tlb_all()
7284
7285 --- a/include/asm-x86/mach-xen/asm/tlbflush_64.h
7286 +++ b/include/asm-x86/mach-xen/asm/tlbflush_64.h
7287 @@ -2,7 +2,9 @@
7288 #define _X8664_TLBFLUSH_H
7289
7290 #include <linux/mm.h>
7291 +#include <linux/sched.h>
7292 #include <asm/processor.h>
7293 +#include <asm/system.h>
7294
7295 #define __flush_tlb() xen_tlb_flush()
7296
7297 --- a/lib/swiotlb-xen.c
7298 +++ b/lib/swiotlb-xen.c
7299 @@ -729,7 +729,6 @@
7300 return (mask >= ((1UL << dma_bits) - 1));
7301 }
7302
7303 -EXPORT_SYMBOL(swiotlb_init);
7304 EXPORT_SYMBOL(swiotlb_map_single);
7305 EXPORT_SYMBOL(swiotlb_unmap_single);
7306 EXPORT_SYMBOL(swiotlb_map_sg);
7307 --- a/net/core/dev.c
7308 +++ b/net/core/dev.c
7309 @@ -1590,12 +1590,17 @@
7310 inline int skb_checksum_setup(struct sk_buff *skb)
7311 {
7312 if (skb->proto_csum_blank) {
7313 + struct iphdr *iph;
7314 + unsigned char *th;
7315 +
7316 if (skb->protocol != htons(ETH_P_IP))
7317 goto out;
7318 - skb->h.raw = (unsigned char *)skb->nh.iph + 4*skb->nh.iph->ihl;
7319 - if (skb->h.raw >= skb->tail)
7320 + iph = ip_hdr(skb);
7321 + th = skb_network_header(skb) + 4 * iph->ihl;
7322 + if (th >= skb_tail_pointer(skb))
7323 goto out;
7324 - switch (skb->nh.iph->protocol) {
7325 + skb->csum_start = th - skb->head;
7326 + switch (iph->protocol) {
7327 case IPPROTO_TCP:
7328 skb->csum_offset = offsetof(struct tcphdr, check);
7329 break;
7330 @@ -1606,10 +1611,10 @@
7331 if (net_ratelimit())
7332 printk(KERN_ERR "Attempting to checksum a non-"
7333 "TCP/UDP packet, dropping a protocol"
7334 - " %d packet", skb->nh.iph->protocol);
7335 + " %d packet", iph->protocol);
7336 goto out;
7337 }
7338 - if ((skb->h.raw + skb->csum_offset + 2) > skb->tail)
7339 + if ((th + skb->csum_offset + 2) > skb_tail_pointer(skb))
7340 goto out;
7341 skb->ip_summed = CHECKSUM_PARTIAL;
7342 skb->proto_csum_blank = 0;
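The dev.c hunk above rewrites skb_checksum_setup() for the 2.6.22 sk_buff layout: the transport header is located as the network header plus 4 * ihl, bounds-checked against the tail pointer, and recorded as csum_start relative to skb->head. The standalone C sketch below mimics only that pointer arithmetic; the fake_iphdr structure, the buffer layout, and the offsets used are invented for illustration and are not the kernel's structures.

/*
 * Standalone analogue of the offset/bounds logic above: find the transport
 * header from the IP header length, check it fits in the buffer, and store
 * the checksum start as an offset from the buffer head.
 */
#include <stdio.h>
#include <stddef.h>
#include <string.h>

struct fake_iphdr {
	unsigned char ihl;      /* header length in 32-bit words (the real iphdr packs this in 4 bits) */
	unsigned char protocol; /* e.g. 6 = TCP */
};

int main(void)
{
	unsigned char buf[64];
	unsigned char *head = buf;
	unsigned char *tail = buf + sizeof(buf);   /* analogue of skb_tail_pointer() */
	unsigned char *network_header = buf + 14;  /* pretend an Ethernet header precedes IP */

	struct fake_iphdr iph = { .ihl = 5, .protocol = 6 };
	memcpy(network_header, &iph, sizeof(iph));

	/* th = skb_network_header(skb) + 4 * iph->ihl; */
	unsigned char *th = network_header + 4 * iph.ihl;
	if (th >= tail) {
		puts("transport header outside buffer, giving up");
		return 1;
	}

	/* skb->csum_start = th - skb->head; */
	ptrdiff_t csum_start = th - head;
	size_t csum_offset = 16;                   /* offset of the check field in a TCP header */

	/* analogue of the (th + skb->csum_offset + 2) > skb_tail_pointer(skb) check */
	if (th + csum_offset + 2 > tail) {
		puts("checksum field would overrun the buffer, giving up");
		return 1;
	}

	printf("csum_start=%td csum_offset=%zu\n", csum_start, csum_offset);
	return 0;
}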
7343 --- a/scripts/Makefile.xen.awk
7344 +++ b/scripts/Makefile.xen.awk
7345 @@ -13,7 +13,7 @@
7346 next
7347 }
7348
7349 -/:[[:space:]]*%\.[cS][[:space:]]/ {
7350 +/:[[:space:]]*\$\(src\)\/%\.[cS][[:space:]]/ {
7351 line = gensub(/%.([cS])/, "%-xen.\\1", "g", $0)
7352 line = gensub(/(single-used-m)/, "xen-\\1", "g", line)
7353 print line