Magellan Linux

Contents of /trunk/kernel26-xen/patches-2.6.25-r1/1020-2.6.25-xen-patch-2.6.19.patch

Revision 609
Fri May 23 17:35:37 2008 UTC by niro
File size: 319559 bytes
-using opensuse xen patchset, updated kernel configs

1 From: www.kernel.org
2 Subject: Linux 2.6.19
3 Patch-mainline: 2.6.19
4
5 Automatically created from "patches.kernel.org/patch-2.6.19" by xen-port-patches.py
6
7 Acked-by: jbeulich@novell.com
8
9 ---
10 arch/x86/Kconfig | 1
11 arch/x86/ia32/ia32entry-xen.S | 9
12 arch/x86/kernel/Makefile | 5
13 arch/x86/kernel/apic_32-xen.c | 9
14 arch/x86/kernel/apic_64-xen.c | 20
15 arch/x86/kernel/cpu/common-xen.c | 20
16 arch/x86/kernel/e820_64-xen.c | 320 +++---
17 arch/x86/kernel/early_printk-xen.c | 20
18 arch/x86/kernel/entry_32-xen.S | 139 +-
19 arch/x86/kernel/entry_64-xen.S | 106 --
20 arch/x86/kernel/genapic_xen_64.c | 9
21 arch/x86/kernel/head64-xen.c | 44
22 arch/x86/kernel/head_32-xen.S | 2
23 arch/x86/kernel/head_64-xen.S | 5
24 arch/x86/kernel/io_apic_32-xen.c | 750 +++++++++------
25 arch/x86/kernel/io_apic_64-xen.c | 1250 +++++++++++---------------
26 arch/x86/kernel/ioport_64-xen.c | 1
27 arch/x86/kernel/irq_32-xen.c | 19
28 arch/x86/kernel/irq_64-xen.c | 35
29 arch/x86/kernel/ldt_32-xen.c | 2
30 arch/x86/kernel/microcode-xen.c | 85 +
31 arch/x86/kernel/mpparse_32-xen.c | 70 -
32 arch/x86/kernel/mpparse_64-xen.c | 313 +-----
33 arch/x86/kernel/pci-dma_32-xen.c | 16
34 arch/x86/kernel/pci-swiotlb_64-xen.c | 3
35 arch/x86/kernel/process_32-xen.c | 29
36 arch/x86/kernel/process_64-xen.c | 90 +
37 arch/x86/kernel/setup64-xen.c | 41
38 arch/x86/kernel/setup_32-xen.c | 430 +++-----
39 arch/x86/kernel/setup_64-xen.c | 271 +----
40 arch/x86/kernel/smp_32-xen.c | 75 +
41 arch/x86/kernel/smp_64-xen.c | 35
42 arch/x86/kernel/time_32-xen.c | 86 -
43 arch/x86/kernel/traps_32-xen.c | 238 +++-
44 arch/x86/kernel/traps_64-xen.c | 220 +++-
45 arch/x86/kernel/vsyscall_64-xen.c | 117 ++
46 arch/x86/mach-xen/setup.c | 6
47 arch/x86/mm/fault_32-xen.c | 29
48 arch/x86/mm/fault_64-xen.c | 34
49 arch/x86/mm/highmem_32-xen.c | 31
50 arch/x86/mm/hypervisor.c | 9
51 arch/x86/mm/init_32-xen.c | 89 +
52 arch/x86/mm/init_64-xen.c | 184 +--
53 arch/x86/mm/ioremap_32-xen.c | 10
54 arch/x86/mm/pageattr_64-xen.c | 24
55 arch/x86/mm/pgtable_32-xen.c | 31
56 arch/x86/pci/irq-xen.c | 38
57 drivers/char/tpm/tpm_xen.c | 5
58 drivers/pci/Kconfig | 2
59 drivers/xen/Kconfig | 3
60 drivers/xen/balloon/balloon.c | 2
61 drivers/xen/blkback/blkback.c | 2
62 drivers/xen/blkback/common.h | 2
63 drivers/xen/blkfront/blkfront.c | 4
64 drivers/xen/blktap/blktap.c | 2
65 drivers/xen/blktap/common.h | 2
66 drivers/xen/console/console.c | 10
67 drivers/xen/console/xencons_ring.c | 4
68 drivers/xen/core/evtchn.c | 50 -
69 drivers/xen/core/reboot.c | 3
70 drivers/xen/core/smpboot.c | 6
71 drivers/xen/fbfront/xenfb.c | 3
72 drivers/xen/fbfront/xenkbd.c | 2
73 drivers/xen/gntdev/gntdev.c | 11
74 drivers/xen/netback/accel.c | 2
75 drivers/xen/netback/common.h | 2
76 drivers/xen/netback/loopback.c | 2
77 drivers/xen/netback/netback.c | 6
78 drivers/xen/netfront/netfront.c | 8
79 drivers/xen/pciback/pciback.h | 2
80 drivers/xen/pciback/pciback_ops.c | 2
81 drivers/xen/pcifront/pci_op.c | 8
82 drivers/xen/privcmd/compat_privcmd.c | 1
83 drivers/xen/privcmd/privcmd.c | 2
84 drivers/xen/sfc_netback/accel_xenbus.c | 6
85 drivers/xen/sfc_netfront/accel.h | 6
86 drivers/xen/sfc_netfront/accel_msg.c | 6
87 drivers/xen/sfc_netfront/accel_tso.c | 2
88 drivers/xen/sfc_netfront/accel_vi.c | 4
89 drivers/xen/tpmback/common.h | 2
90 drivers/xen/tpmback/tpmback.c | 4
91 drivers/xen/xenbus/xenbus_comms.c | 2
92 drivers/xen/xenoprof/xenoprofile.c | 2
93 include/asm-generic/pgtable.h | 2
94 include/asm-x86/mach-xen/asm/desc_32.h | 127 +-
95 include/asm-x86/mach-xen/asm/dma-mapping_64.h | 7
96 include/asm-x86/mach-xen/asm/e820_64.h | 15
97 include/asm-x86/mach-xen/asm/fixmap_32.h | 5
98 include/asm-x86/mach-xen/asm/fixmap_64.h | 2
99 include/asm-x86/mach-xen/asm/hw_irq_32.h | 8
100 include/asm-x86/mach-xen/asm/hw_irq_64.h | 10
101 include/asm-x86/mach-xen/asm/io_32.h | 27
102 include/asm-x86/mach-xen/asm/io_64.h | 27
103 include/asm-x86/mach-xen/asm/pgtable-2level.h | 12
104 include/asm-x86/mach-xen/asm/pgtable-3level.h | 14
105 include/asm-x86/mach-xen/asm/pgtable_32.h | 143 +-
106 include/asm-x86/mach-xen/asm/pgtable_64.h | 86 +
107 include/asm-x86/mach-xen/asm/processor_32.h | 62 -
108 include/asm-x86/mach-xen/asm/processor_64.h | 2
109 include/asm-x86/mach-xen/asm/segment_32.h | 19
110 include/asm-x86/mach-xen/asm/smp_32.h | 25
111 include/asm-x86/mach-xen/asm/smp_64.h | 27
112 include/asm-x86/mach-xen/asm/system_32.h | 36
113 include/asm-x86/mach-xen/asm/system_64.h | 1
114 include/asm-x86/mach-xen/asm/tlbflush_32.h | 2
115 include/asm-x86/mach-xen/asm/tlbflush_64.h | 3
116 include/asm-x86/thread_info_64.h | 4
117 include/linux/skbuff.h | 7
118 include/xen/evtchn.h | 10
119 include/xen/xencons.h | 2
120 mm/mprotect.c | 2
121 net/core/dev.c | 8
122 112 files changed, 3102 insertions(+), 3145 deletions(-)
123
124 --- a/arch/x86/Kconfig
125 +++ b/arch/x86/Kconfig
126 @@ -390,6 +390,7 @@
127
128 menuconfig PARAVIRT_GUEST
129 bool "Paravirtualized guest support"
130 + depends on !X86_XEN && !X86_64_XEN
131 help
132 Say Y here to get to see options related to running Linux under
133 various hypervisors. This option alone does not add any kernel code.
134 --- a/arch/x86/ia32/ia32entry-xen.S
135 +++ b/arch/x86/ia32/ia32entry-xen.S
136 @@ -83,6 +83,7 @@
137 */
138 ENTRY(ia32_sysenter_target)
139 CFI_STARTPROC32 simple
140 + CFI_SIGNAL_FRAME
141 CFI_DEF_CFA rsp,SS+8-RIP+16
142 /*CFI_REL_OFFSET ss,SS-RIP+16*/
143 CFI_REL_OFFSET rsp,RSP-RIP+16
144 @@ -164,6 +165,7 @@
145 */
146 ENTRY(ia32_cstar_target)
147 CFI_STARTPROC32 simple
148 + CFI_SIGNAL_FRAME
149 CFI_DEF_CFA rsp,SS+8-RIP+16
150 /*CFI_REL_OFFSET ss,SS-RIP+16*/
151 CFI_REL_OFFSET rsp,RSP-RIP+16
152 @@ -243,6 +245,7 @@
153
154 ENTRY(ia32_syscall)
155 CFI_STARTPROC simple
156 + CFI_SIGNAL_FRAME
157 CFI_DEF_CFA rsp,SS+8-RIP+16
158 /*CFI_REL_OFFSET ss,SS-RIP+16*/
159 CFI_REL_OFFSET rsp,RSP-RIP+16
160 @@ -320,6 +323,7 @@
161 popq %r11
162 CFI_ENDPROC
163 CFI_STARTPROC32 simple
164 + CFI_SIGNAL_FRAME
165 CFI_DEF_CFA rsp,SS+8-ARGOFFSET
166 CFI_REL_OFFSET rax,RAX-ARGOFFSET
167 CFI_REL_OFFSET rcx,RCX-ARGOFFSET
168 @@ -653,8 +657,8 @@
169 .quad sys_readlinkat /* 305 */
170 .quad sys_fchmodat
171 .quad sys_faccessat
172 - .quad quiet_ni_syscall /* pselect6 for now */
173 - .quad quiet_ni_syscall /* ppoll for now */
174 + .quad compat_sys_pselect6
175 + .quad compat_sys_ppoll
176 .quad sys_unshare /* 310 */
177 .quad compat_sys_set_robust_list
178 .quad compat_sys_get_robust_list
179 @@ -663,4 +667,5 @@
180 .quad sys_tee
181 .quad compat_sys_vmsplice
182 .quad compat_sys_move_pages
183 + .quad sys_getcpu
184 ia32_syscall_end:
185 --- a/arch/x86/kernel/Makefile
186 +++ b/arch/x86/kernel/Makefile
187 @@ -91,7 +91,7 @@
188 ###
189 # 64 bit specific files
190 ifeq ($(CONFIG_X86_64),y)
191 - obj-y += genapic_64.o genapic_flat_64.o
192 + obj-$(CONFIG_X86_LOCAL_APIC) += genapic_64.o genapic_flat_64.o
193 obj-$(CONFIG_X86_XEN_GENAPIC) += genapic_64.o genapic_xen_64.o
194 obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
195 obj-$(CONFIG_AUDIT) += audit_64.o
196 @@ -104,5 +104,6 @@
197 pci-dma_64-$(CONFIG_XEN) += pci-dma_32.o
198 endif
199
200 -disabled-obj-$(CONFIG_XEN) := i8253.o i8259_$(BITS).o reboot.o smpboot_$(BITS).o tsc_$(BITS).o
201 +disabled-obj-$(CONFIG_XEN) := early-quirks.o i8253.o i8259_$(BITS).o reboot.o \
202 + smpboot_$(BITS).o tsc_$(BITS).o
203 %/head_$(BITS).o %/head_$(BITS).s: $(if $(CONFIG_XEN),EXTRA_AFLAGS,dummy) :=
204 --- a/arch/x86/kernel/apic_32-xen.c
205 +++ b/arch/x86/kernel/apic_32-xen.c
206 @@ -54,7 +54,6 @@
207 /*
208 * Knob to control our willingness to enable the local APIC.
209 */
210 -int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */
211
212 /*
213 * Debug level
214 @@ -102,7 +101,7 @@
215
216 #ifndef CONFIG_XEN
217 #ifndef CONFIG_SMP
218 -static void up_apic_timer_interrupt_call(struct pt_regs *regs)
219 +static void up_apic_timer_interrupt_call(void)
220 {
221 int cpu = smp_processor_id();
222
223 @@ -111,11 +110,11 @@
224 */
225 per_cpu(irq_stat, cpu).apic_timer_irqs++;
226
227 - smp_local_timer_interrupt(regs);
228 + smp_local_timer_interrupt();
229 }
230 #endif
231
232 -void smp_send_timer_broadcast_ipi(struct pt_regs *regs)
233 +void smp_send_timer_broadcast_ipi(void)
234 {
235 cpumask_t mask;
236
237 @@ -128,7 +127,7 @@
238 * We can directly call the apic timer interrupt handler
239 * in UP case. Minus all irq related functions
240 */
241 - up_apic_timer_interrupt_call(regs);
242 + up_apic_timer_interrupt_call();
243 #endif
244 }
245 }
246 --- a/arch/x86/kernel/apic_64-xen.c
247 +++ b/arch/x86/kernel/apic_64-xen.c
248 @@ -43,7 +43,7 @@
249 */
250 void ack_bad_irq(unsigned int irq)
251 {
252 - printk("unexpected IRQ trap at vector %02x\n", irq);
253 + printk("unexpected IRQ trap at irq %02x\n", irq);
254 /*
255 * Currently unexpected vectors happen only on SMP and APIC.
256 * We _must_ ack these because every local APIC has only N
257 @@ -62,19 +62,19 @@
258 return -EINVAL;
259 }
260
261 -void smp_local_timer_interrupt(struct pt_regs *regs)
262 +void smp_local_timer_interrupt(void)
263 {
264 - profile_tick(CPU_PROFILING, regs);
265 + profile_tick(CPU_PROFILING);
266 #ifndef CONFIG_XEN
267 #ifdef CONFIG_SMP
268 - update_process_times(user_mode(regs));
269 + update_process_times(user_mode(get_irq_regs()));
270 #endif
271 #endif
272 /*
273 * We take the 'long' return path, and there every subsystem
274 * grabs the appropriate locks (kernel lock/ irq lock).
275 *
276 - * we might want to decouple profiling from the 'long path',
277 + * We might want to decouple profiling from the 'long path',
278 * and do the profiling totally in assembly.
279 *
280 * Currently this isn't too much of an issue (performance wise),
281 @@ -92,6 +92,8 @@
282 */
283 void smp_apic_timer_interrupt(struct pt_regs *regs)
284 {
285 + struct pt_regs *old_regs = set_irq_regs(regs);
286 +
287 /*
288 * the NMI deadlock-detector uses this.
289 */
290 @@ -109,8 +111,9 @@
291 */
292 exit_idle();
293 irq_enter();
294 - smp_local_timer_interrupt(regs);
295 + smp_local_timer_interrupt();
296 irq_exit();
297 + set_irq_regs(old_regs);
298 }
299
300 /*
301 @@ -188,9 +191,8 @@
302 int __init APIC_init_uniprocessor (void)
303 {
304 #ifdef CONFIG_X86_IO_APIC
305 - if (smp_found_config)
306 - if (!skip_ioapic_setup && nr_ioapics)
307 - setup_IO_APIC();
308 + if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
309 + setup_IO_APIC();
310 #endif
311
312 return 1;
313 --- a/arch/x86/kernel/cpu/common-xen.c
314 +++ b/arch/x86/kernel/cpu/common-xen.c
315 @@ -43,7 +43,7 @@
316
317 extern int disable_pse;
318
319 -static void default_init(struct cpuinfo_x86 * c)
320 +static void __cpuinit default_init(struct cpuinfo_x86 * c)
321 {
322 /* Not much we can do here... */
323 /* Check if at least it has cpuid */
324 @@ -56,7 +56,7 @@
325 }
326 }
327
328 -static struct cpu_dev default_cpu = {
329 +static struct cpu_dev __cpuinitdata default_cpu = {
330 .c_init = default_init,
331 .c_vendor = "Unknown",
332 };
333 @@ -191,7 +191,16 @@
334
335 static int __init x86_fxsr_setup(char * s)
336 {
337 + /* Tell all the other CPU's to not use it... */
338 disable_x86_fxsr = 1;
339 +
340 + /*
341 + * ... and clear the bits early in the boot_cpu_data
342 + * so that the bootup process doesn't try to do this
343 + * either.
344 + */
345 + clear_bit(X86_FEATURE_FXSR, boot_cpu_data.x86_capability);
346 + clear_bit(X86_FEATURE_XMM, boot_cpu_data.x86_capability);
347 return 1;
348 }
349 __setup("nofxsr", x86_fxsr_setup);
350 @@ -272,7 +281,7 @@
351 }
352 }
353
354 -void __cpuinit generic_identify(struct cpuinfo_x86 * c)
355 +static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
356 {
357 u32 tfms, xlvl;
358 int ebx;
359 @@ -698,8 +707,7 @@
360 */
361 atomic_inc(&init_mm.mm_count);
362 current->active_mm = &init_mm;
363 - if (current->mm)
364 - BUG();
365 + BUG_ON(current->mm);
366 enter_lazy_tlb(&init_mm, current);
367
368 load_esp0(t, thread);
369 @@ -712,7 +720,7 @@
370 #endif
371
372 /* Clear %fs and %gs. */
373 - asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs");
374 + asm volatile ("movl %0, %%fs; movl %0, %%gs" : : "r" (0));
375
376 /* Clear all 6 debug registers: */
377 set_debugreg(0, 0);
378 --- a/arch/x86/kernel/e820_64-xen.c
379 +++ b/arch/x86/kernel/e820_64-xen.c
380 @@ -16,6 +16,7 @@
381 #include <linux/string.h>
382 #include <linux/kexec.h>
383 #include <linux/module.h>
384 +#include <linux/mm.h>
385
386 #include <asm/pgtable.h>
387 #include <asm/page.h>
388 @@ -25,6 +26,11 @@
389 #include <asm/sections.h>
390 #include <xen/interface/memory.h>
391
392 +struct e820map e820 __initdata;
393 +#ifdef CONFIG_XEN
394 +struct e820map machine_e820 __initdata;
395 +#endif
396 +
397 /*
398 * PFN of last memory page.
399 */
400 @@ -41,7 +47,7 @@
401 /*
402 * Last pfn which the user wants to use.
403 */
404 -unsigned long end_user_pfn = MAXMEM>>PAGE_SHIFT;
405 +static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
406
407 extern struct resource code_resource, data_resource;
408
409 @@ -53,13 +59,13 @@
410 #ifndef CONFIG_XEN
411 /* various gunk below that needed for SMP startup */
412 if (addr < 0x8000) {
413 - *addrp = 0x8000;
414 + *addrp = PAGE_ALIGN(0x8000);
415 return 1;
416 }
417
418 /* direct mapping tables of the kernel */
419 if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) {
420 - *addrp = table_end << PAGE_SHIFT;
421 + *addrp = PAGE_ALIGN(table_end << PAGE_SHIFT);
422 return 1;
423 }
424
425 @@ -67,23 +73,18 @@
426 #ifdef CONFIG_BLK_DEV_INITRD
427 if (LOADER_TYPE && INITRD_START && last >= INITRD_START &&
428 addr < INITRD_START+INITRD_SIZE) {
429 - *addrp = INITRD_START + INITRD_SIZE;
430 + *addrp = PAGE_ALIGN(INITRD_START + INITRD_SIZE);
431 return 1;
432 }
433 #endif
434 - /* kernel code + 640k memory hole (later should not be needed, but
435 - be paranoid for now) */
436 - if (last >= 640*1024 && addr < 1024*1024) {
437 - *addrp = 1024*1024;
438 - return 1;
439 - }
440 - if (last >= __pa_symbol(&_text) && last < __pa_symbol(&_end)) {
441 - *addrp = __pa_symbol(&_end);
442 + /* kernel code */
443 + if (last >= __pa_symbol(&_text) && addr < __pa_symbol(&_end)) {
444 + *addrp = PAGE_ALIGN(__pa_symbol(&_end));
445 return 1;
446 }
447
448 if (last >= ebda_addr && addr < ebda_addr + ebda_size) {
449 - *addrp = ebda_addr + ebda_size;
450 + *addrp = PAGE_ALIGN(ebda_addr + ebda_size);
451 return 1;
452 }
453
454 @@ -141,8 +142,6 @@
455 for (i = 0; i < e820.nr_map; i++) {
456 struct e820entry *ei = &e820.map[i];
457 #else
458 - extern struct e820map machine_e820;
459 -
460 if (!is_initial_xendomain())
461 return 0;
462 for (i = 0; i < machine_e820.nr_map; i++) {
463 @@ -184,7 +183,7 @@
464 continue;
465 while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size)
466 ;
467 - last = addr + size;
468 + last = PAGE_ALIGN(addr) + size;
469 if (last > ei->addr + ei->size)
470 continue;
471 if (last > end)
472 @@ -194,59 +193,14 @@
473 return -1UL;
474 }
475
476 -/*
477 - * Free bootmem based on the e820 table for a node.
478 - */
479 -void __init e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end)
480 -{
481 - int i;
482 - for (i = 0; i < e820.nr_map; i++) {
483 - struct e820entry *ei = &e820.map[i];
484 - unsigned long last, addr;
485 -
486 - if (ei->type != E820_RAM ||
487 - ei->addr+ei->size <= start ||
488 - ei->addr >= end)
489 - continue;
490 -
491 - addr = round_up(ei->addr, PAGE_SIZE);
492 - if (addr < start)
493 - addr = start;
494 -
495 - last = round_down(ei->addr + ei->size, PAGE_SIZE);
496 - if (last >= end)
497 - last = end;
498 -
499 - if (last > addr && last-addr >= PAGE_SIZE)
500 - free_bootmem_node(pgdat, addr, last-addr);
501 - }
502 -}
503 -
504 /*
505 * Find the highest page frame number we have available
506 */
507 unsigned long __init e820_end_of_ram(void)
508 {
509 - int i;
510 unsigned long end_pfn = 0;
511 + end_pfn = find_max_pfn_with_active_regions();
512
513 - for (i = 0; i < e820.nr_map; i++) {
514 - struct e820entry *ei = &e820.map[i];
515 - unsigned long start, end;
516 -
517 - start = round_up(ei->addr, PAGE_SIZE);
518 - end = round_down(ei->addr + ei->size, PAGE_SIZE);
519 - if (start >= end)
520 - continue;
521 - if (ei->type == E820_RAM) {
522 - if (end > end_pfn<<PAGE_SHIFT)
523 - end_pfn = end>>PAGE_SHIFT;
524 - } else {
525 - if (end > end_pfn_map<<PAGE_SHIFT)
526 - end_pfn_map = end>>PAGE_SHIFT;
527 - }
528 - }
529 -
530 if (end_pfn > end_pfn_map)
531 end_pfn_map = end_pfn;
532 if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
533 @@ -256,43 +210,10 @@
534 if (end_pfn > end_pfn_map)
535 end_pfn = end_pfn_map;
536
537 + printk("end_pfn_map = %lu\n", end_pfn_map);
538 return end_pfn;
539 }
540
541 -/*
542 - * Compute how much memory is missing in a range.
543 - * Unlike the other functions in this file the arguments are in page numbers.
544 - */
545 -unsigned long __init
546 -e820_hole_size(unsigned long start_pfn, unsigned long end_pfn)
547 -{
548 - unsigned long ram = 0;
549 - unsigned long start = start_pfn << PAGE_SHIFT;
550 - unsigned long end = end_pfn << PAGE_SHIFT;
551 - int i;
552 - for (i = 0; i < e820.nr_map; i++) {
553 - struct e820entry *ei = &e820.map[i];
554 - unsigned long last, addr;
555 -
556 - if (ei->type != E820_RAM ||
557 - ei->addr+ei->size <= start ||
558 - ei->addr >= end)
559 - continue;
560 -
561 - addr = round_up(ei->addr, PAGE_SIZE);
562 - if (addr < start)
563 - addr = start;
564 -
565 - last = round_down(ei->addr + ei->size, PAGE_SIZE);
566 - if (last >= end)
567 - last = end;
568 -
569 - if (last > addr)
570 - ram += last - addr;
571 - }
572 - return ((end - start) - ram) >> PAGE_SHIFT;
573 -}
574 -
575 /*
576 * Mark e820 reserved areas as busy for the resource manager.
577 */
578 @@ -333,6 +254,98 @@
579 }
580 }
581
582 +#ifndef CONFIG_XEN
583 +/* Mark pages corresponding to given address range as nosave */
584 +static void __init
585 +e820_mark_nosave_range(unsigned long start, unsigned long end)
586 +{
587 + unsigned long pfn, max_pfn;
588 +
589 + if (start >= end)
590 + return;
591 +
592 + printk("Nosave address range: %016lx - %016lx\n", start, end);
593 + max_pfn = end >> PAGE_SHIFT;
594 + for (pfn = start >> PAGE_SHIFT; pfn < max_pfn; pfn++)
595 + if (pfn_valid(pfn))
596 + SetPageNosave(pfn_to_page(pfn));
597 +}
598 +
599 +/*
600 + * Find the ranges of physical addresses that do not correspond to
601 + * e820 RAM areas and mark the corresponding pages as nosave for software
602 + * suspend and suspend to RAM.
603 + *
604 + * This function requires the e820 map to be sorted and without any
605 + * overlapping entries and assumes the first e820 area to be RAM.
606 + */
607 +void __init e820_mark_nosave_regions(void)
608 +{
609 + int i;
610 + unsigned long paddr;
611 +
612 + paddr = round_down(e820.map[0].addr + e820.map[0].size, PAGE_SIZE);
613 + for (i = 1; i < e820.nr_map; i++) {
614 + struct e820entry *ei = &e820.map[i];
615 +
616 + if (paddr < ei->addr)
617 + e820_mark_nosave_range(paddr,
618 + round_up(ei->addr, PAGE_SIZE));
619 +
620 + paddr = round_down(ei->addr + ei->size, PAGE_SIZE);
621 + if (ei->type != E820_RAM)
622 + e820_mark_nosave_range(round_up(ei->addr, PAGE_SIZE),
623 + paddr);
624 +
625 + if (paddr >= (end_pfn << PAGE_SHIFT))
626 + break;
627 + }
628 +}
629 +#endif
630 +
631 +/* Walk the e820 map and register active regions within a node */
632 +void __init
633 +e820_register_active_regions(int nid, unsigned long start_pfn,
634 + unsigned long end_pfn)
635 +{
636 + int i;
637 + unsigned long ei_startpfn, ei_endpfn;
638 + for (i = 0; i < e820.nr_map; i++) {
639 + struct e820entry *ei = &e820.map[i];
640 + ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT;
641 + ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE)
642 + >> PAGE_SHIFT;
643 +
644 + /* Skip map entries smaller than a page */
645 + if (ei_startpfn >= ei_endpfn)
646 + continue;
647 +
648 + /* Check if end_pfn_map should be updated */
649 + if (ei->type != E820_RAM && ei_endpfn > end_pfn_map)
650 + end_pfn_map = ei_endpfn;
651 +
652 + /* Skip if map is outside the node */
653 + if (ei->type != E820_RAM ||
654 + ei_endpfn <= start_pfn ||
655 + ei_startpfn >= end_pfn)
656 + continue;
657 +
658 + /* Check for overlaps */
659 + if (ei_startpfn < start_pfn)
660 + ei_startpfn = start_pfn;
661 + if (ei_endpfn > end_pfn)
662 + ei_endpfn = end_pfn;
663 +
664 + /* Obey end_user_pfn to save on memmap */
665 + if (ei_startpfn >= end_user_pfn)
666 + continue;
667 + if (ei_endpfn > end_user_pfn)
668 + ei_endpfn = end_user_pfn;
669 +
670 + add_active_range(nid, ei_startpfn, ei_endpfn);
671 + }
672 +}
673 +
674 /*
675 * Add a memory region to the kernel e820 map.
676 */
677 @@ -553,13 +566,6 @@
678 * If we're lucky and live on a modern system, the setup code
679 * will have given us a memory map that we can use to properly
680 * set up memory. If we aren't, we'll fake a memory map.
681 - *
682 - * We check to see that the memory map contains at least 2 elements
683 - * before we'll use it, because the detection code in setup.S may
684 - * not be perfect and most every PC known to man has two memory
685 - * regions: one from 0 to 640k, and one from 1mb up. (The IBM
686 - * thinkpad 560x, for example, does not cooperate with the memory
687 - * detection code.)
688 */
689 static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
690 {
691 @@ -581,37 +587,20 @@
692 if (start > end)
693 return -1;
694
695 -#ifndef CONFIG_XEN
696 - /*
697 - * Some BIOSes claim RAM in the 640k - 1M region.
698 - * Not right. Fix it up.
699 - *
700 - * This should be removed on Hammer which is supposed to not
701 - * have non e820 covered ISA mappings there, but I had some strange
702 - * problems so it stays for now. -AK
703 - */
704 - if (type == E820_RAM) {
705 - if (start < 0x100000ULL && end > 0xA0000ULL) {
706 - if (start < 0xA0000ULL)
707 - add_memory_region(start, 0xA0000ULL-start, type);
708 - if (end <= 0x100000ULL)
709 - continue;
710 - start = 0x100000ULL;
711 - size = end - start;
712 - }
713 - }
714 -#endif
715 -
716 add_memory_region(start, size, type);
717 } while (biosmap++,--nr_map);
718 return 0;
719 }
720
721 +void early_panic(char *msg)
722 +{
723 + early_printk(msg);
724 + panic(msg);
725 +}
726 +
727 #ifndef CONFIG_XEN
728 void __init setup_memory_region(void)
729 {
730 - char *who = "BIOS-e820";
731 -
732 /*
733 * Try to copy the BIOS-supplied E820-map.
734 *
735 @@ -619,24 +608,10 @@
736 * the next section from 1mb->appropriate_mem_k
737 */
738 sanitize_e820_map(E820_MAP, &E820_MAP_NR);
739 - if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) {
740 - unsigned long mem_size;
741 -
742 - /* compare results from other methods and take the greater */
743 - if (ALT_MEM_K < EXT_MEM_K) {
744 - mem_size = EXT_MEM_K;
745 - who = "BIOS-88";
746 - } else {
747 - mem_size = ALT_MEM_K;
748 - who = "BIOS-e801";
749 - }
750 -
751 - e820.nr_map = 0;
752 - add_memory_region(0, LOWMEMSIZE(), E820_RAM);
753 - add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
754 - }
755 + if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0)
756 + early_panic("Cannot find a valid memory map");
757 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
758 - e820_print_map(who);
759 + e820_print_map("BIOS-e820");
760 }
761
762 #else /* CONFIG_XEN */
763 @@ -668,20 +643,23 @@
764
765 sanitize_e820_map(map, (char *)&memmap.nr_entries);
766
767 - BUG_ON(copy_e820_map(map, (char)memmap.nr_entries) < 0);
768 + if (copy_e820_map(map, (char)memmap.nr_entries) < 0)
769 + early_panic("Cannot find a valid memory map");
770
771 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
772 e820_print_map("Xen");
773 }
774 #endif
775
776 -void __init parse_memopt(char *p, char **from)
777 -{
778 +static int __init parse_memopt(char *p)
779 +{
780 int i;
781 unsigned long current_end;
782 unsigned long end;
783
784 - end_user_pfn = memparse(p, from);
785 + if (!p)
786 + return -EINVAL;
787 + end_user_pfn = memparse(p, &p);
788 end_user_pfn >>= PAGE_SHIFT;
789
790 end = end_user_pfn<<PAGE_SHIFT;
791 @@ -698,27 +676,61 @@
792 else
793 add_memory_region(current_end, end - current_end, E820_RAM);
794 }
795 +
796 + return 0;
797 }
798 +early_param("mem", parse_memopt);
799 +
800 +static int userdef __initdata;
801
802 -void __init parse_memmapopt(char *p, char **from)
803 +static int __init parse_memmap_opt(char *p)
804 {
805 + char *oldp;
806 unsigned long long start_at, mem_size;
807
808 - mem_size = memparse(p, from);
809 - p = *from;
810 + if (!strcmp(p, "exactmap")) {
811 +#ifdef CONFIG_CRASH_DUMP
812 + /* If we are doing a crash dump, we
813 + * still need to know the real mem
814 + * size before original memory map is
815 + * reset.
816 + */
817 + e820_register_active_regions(0, 0, -1UL);
818 + saved_max_pfn = e820_end_of_ram();
819 + remove_all_active_ranges();
820 +#endif
821 + end_pfn_map = 0;
822 + e820.nr_map = 0;
823 + userdef = 1;
824 + return 0;
825 + }
826 +
827 + oldp = p;
828 + mem_size = memparse(p, &p);
829 + if (p == oldp)
830 + return -EINVAL;
831 if (*p == '@') {
832 - start_at = memparse(p+1, from);
833 + start_at = memparse(p+1, &p);
834 add_memory_region(start_at, mem_size, E820_RAM);
835 } else if (*p == '#') {
836 - start_at = memparse(p+1, from);
837 + start_at = memparse(p+1, &p);
838 add_memory_region(start_at, mem_size, E820_ACPI);
839 } else if (*p == '$') {
840 - start_at = memparse(p+1, from);
841 + start_at = memparse(p+1, &p);
842 add_memory_region(start_at, mem_size, E820_RESERVED);
843 } else {
844 end_user_pfn = (mem_size >> PAGE_SHIFT);
845 }
846 - p = *from;
847 + return *p == '\0' ? 0 : -EINVAL;
848 +}
849 +early_param("memmap", parse_memmap_opt);
850 +
851 +void finish_e820_parsing(void)
852 +{
853 + if (userdef) {
854 + printk(KERN_INFO "user-defined physical RAM map:\n");
855 + e820_print_map("user");
856 + }
857 }
858
859 unsigned long pci_mem_start = 0xaeedbabe;
860 --- a/arch/x86/kernel/early_printk-xen.c
861 +++ b/arch/x86/kernel/early_printk-xen.c
862 @@ -244,20 +244,16 @@
863
864 static int __initdata keep_early;
865
866 -int __init setup_early_printk(char *opt)
867 +static int __init setup_early_printk(char *buf)
868 {
869 - char *space;
870 - char buf[256];
871 + if (!buf)
872 + return 0;
873
874 if (early_console_initialized)
875 - return 1;
876 -
877 - strlcpy(buf,opt,sizeof(buf));
878 - space = strchr(buf, ' ');
879 - if (space)
880 - *space = 0;
881 + return 0;
882 + early_console_initialized = 1;
883
884 - if (strstr(buf,"keep"))
885 + if (strstr(buf, "keep"))
886 keep_early = 1;
887
888 if (!strncmp(buf, "serial", 6)) {
889 @@ -281,11 +277,12 @@
890 early_console = &simnow_console;
891 keep_early = 1;
892 }
893 - early_console_initialized = 1;
894 register_console(early_console);
895 return 0;
896 }
897
898 +early_param("earlyprintk", setup_early_printk);
899 +
900 void __init disable_early_printk(void)
901 {
902 if (!early_console_initialized || !early_console)
903 @@ -299,4 +296,3 @@
904 }
905 }
906
907 -__setup("earlyprintk=", setup_early_printk);
908 --- a/arch/x86/kernel/entry_32-xen.S
909 +++ b/arch/x86/kernel/entry_32-xen.S
910 @@ -80,8 +80,12 @@
911 NMI_MASK = 0x80000000
912
913 #ifndef CONFIG_XEN
914 -#define DISABLE_INTERRUPTS cli
915 -#define ENABLE_INTERRUPTS sti
916 +/* These are replaces for paravirtualization */
917 +#define DISABLE_INTERRUPTS cli
918 +#define ENABLE_INTERRUPTS sti
919 +#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit
920 +#define INTERRUPT_RETURN iret
921 +#define GET_CR0_INTO_EAX movl %cr0, %eax
922 #else
923 /* Offsets into shared_info_t. */
924 #define evtchn_upcall_pending /* 0 */
925 @@ -99,15 +103,29 @@
926
927 #define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(%esi)
928 #define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(%esi)
929 +#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(%esi)
930 #define DISABLE_INTERRUPTS GET_VCPU_INFO ; \
931 __DISABLE_INTERRUPTS
932 #define ENABLE_INTERRUPTS GET_VCPU_INFO ; \
933 __ENABLE_INTERRUPTS
934 -#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(%esi)
935 +#define ENABLE_INTERRUPTS_SYSEXIT __ENABLE_INTERRUPTS ; \
936 +sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ ; \
937 + __TEST_PENDING ; \
938 + jnz 14f # process more events if necessary... ; \
939 + movl ESI(%esp), %esi ; \
940 + sysexit ; \
941 +14: __DISABLE_INTERRUPTS ; \
942 + TRACE_IRQS_OFF ; \
943 +sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/ ; \
944 + push %esp ; \
945 + call evtchn_do_upcall ; \
946 + add $4,%esp ; \
947 + jmp ret_from_intr
948 +#define INTERRUPT_RETURN iret
949 #endif
950
951 #ifdef CONFIG_PREEMPT
952 -#define preempt_stop cli; TRACE_IRQS_OFF
953 +#define preempt_stop DISABLE_INTERRUPTS; TRACE_IRQS_OFF
954 #else
955 #define preempt_stop
956 #define resume_kernel restore_nocheck
957 @@ -206,18 +224,21 @@
958
959 #define RING0_INT_FRAME \
960 CFI_STARTPROC simple;\
961 + CFI_SIGNAL_FRAME;\
962 CFI_DEF_CFA esp, 3*4;\
963 /*CFI_OFFSET cs, -2*4;*/\
964 CFI_OFFSET eip, -3*4
965
966 #define RING0_EC_FRAME \
967 CFI_STARTPROC simple;\
968 + CFI_SIGNAL_FRAME;\
969 CFI_DEF_CFA esp, 4*4;\
970 /*CFI_OFFSET cs, -2*4;*/\
971 CFI_OFFSET eip, -3*4
972
973 #define RING0_PTREGS_FRAME \
974 CFI_STARTPROC simple;\
975 + CFI_SIGNAL_FRAME;\
976 CFI_DEF_CFA esp, OLDESP-EBX;\
977 /*CFI_OFFSET cs, CS-OLDESP;*/\
978 CFI_OFFSET eip, EIP-OLDESP;\
979 @@ -263,8 +284,9 @@
980 check_userspace:
981 movl EFLAGS(%esp), %eax # mix EFLAGS and CS
982 movb CS(%esp), %al
983 - testl $(VM_MASK | 2), %eax
984 - jz resume_kernel
985 + andl $(VM_MASK | SEGMENT_RPL_MASK), %eax
986 + cmpl $USER_RPL, %eax
987 + jb resume_kernel # not returning to v8086 or userspace
988 ENTRY(resume_userspace)
989 DISABLE_INTERRUPTS # make sure we don't miss an interrupt
990 # setting need_resched or sigpending
991 @@ -277,7 +299,7 @@
992
993 #ifdef CONFIG_PREEMPT
994 ENTRY(resume_kernel)
995 - cli
996 + DISABLE_INTERRUPTS
997 cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
998 jnz restore_nocheck
999 need_resched:
1000 @@ -297,6 +319,7 @@
1001 # sysenter call handler stub
1002 ENTRY(sysenter_entry)
1003 CFI_STARTPROC simple
1004 + CFI_SIGNAL_FRAME
1005 CFI_DEF_CFA esp, 0
1006 CFI_REGISTER esp, ebp
1007 movl SYSENTER_stack_esp0(%esp),%esp
1008 @@ -305,7 +328,7 @@
1009 * No need to follow this irqs on/off section: the syscall
1010 * disabled irqs and here we enable it straight after entry:
1011 */
1012 - sti
1013 + ENABLE_INTERRUPTS
1014 pushl $(__USER_DS)
1015 CFI_ADJUST_CFA_OFFSET 4
1016 /*CFI_REL_OFFSET ss, 0*/
1017 @@ -359,26 +382,8 @@
1018 movl EIP(%esp), %edx
1019 movl OLDESP(%esp), %ecx
1020 xorl %ebp,%ebp
1021 -#ifdef CONFIG_XEN
1022 TRACE_IRQS_ON
1023 - __ENABLE_INTERRUPTS
1024 -sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/
1025 - __TEST_PENDING
1026 - jnz 14f # process more events if necessary...
1027 - movl ESI(%esp), %esi
1028 - sysexit
1029 -14: __DISABLE_INTERRUPTS
1030 - TRACE_IRQS_OFF
1031 -sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/
1032 - push %esp
1033 - call evtchn_do_upcall
1034 - add $4,%esp
1035 - jmp ret_from_intr
1036 -#else
1037 - TRACE_IRQS_ON
1038 - sti
1039 - sysexit
1040 -#endif /* !CONFIG_XEN */
1041 + ENABLE_INTERRUPTS_SYSEXIT
1042 CFI_ENDPROC
1043
1044 # pv sysenter call handler stub
1045 @@ -444,8 +449,8 @@
1046 # See comments in process.c:copy_thread() for details.
1047 movb OLDSS(%esp), %ah
1048 movb CS(%esp), %al
1049 - andl $(VM_MASK | (4 << 8) | 3), %eax
1050 - cmpl $((4 << 8) | 3), %eax
1051 + andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
1052 + cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
1053 CFI_REMEMBER_STATE
1054 je ldt_ss # returning to user-space with LDT SS
1055 restore_nocheck:
1056 @@ -467,12 +472,11 @@
1057 RESTORE_REGS
1058 addl $4, %esp
1059 CFI_ADJUST_CFA_OFFSET -4
1060 -1: iret
1061 +1: INTERRUPT_RETURN
1062 .section .fixup,"ax"
1063 iret_exc:
1064 #ifndef CONFIG_XEN
1065 - TRACE_IRQS_ON
1066 - sti
1067 + ENABLE_INTERRUPTS
1068 #endif
1069 pushl $0 # no error code
1070 pushl $do_iret_error
1071 @@ -498,7 +502,7 @@
1072 * dosemu and wine happy. */
1073 subl $8, %esp # reserve space for switch16 pointer
1074 CFI_ADJUST_CFA_OFFSET 8
1075 - cli
1076 + DISABLE_INTERRUPTS
1077 TRACE_IRQS_OFF
1078 movl %esp, %eax
1079 /* Set up the 16bit stack frame with switch32 pointer on top,
1080 @@ -508,7 +512,7 @@
1081 TRACE_IRQS_IRET
1082 RESTORE_REGS
1083 lss 20+4(%esp), %esp # switch to 16bit stack
1084 -1: iret
1085 +1: INTERRUPT_RETURN
1086 .section __ex_table,"a"
1087 .align 4
1088 .long 1b,iret_exc
1089 @@ -524,7 +528,7 @@
1090 RESTORE_REGS
1091 addl $4, %esp
1092 CFI_ADJUST_CFA_OFFSET -4
1093 -1: iret
1094 +1: INTERRUPT_RETURN
1095 .section __ex_table,"a"
1096 .align 4
1097 .long 1b,iret_exc
1098 @@ -713,11 +717,9 @@
1099 #define UNWIND_ESPFIX_STACK
1100 #endif
1101
1102 -ENTRY(divide_error)
1103 - RING0_INT_FRAME
1104 - pushl $0 # no error code
1105 - CFI_ADJUST_CFA_OFFSET 4
1106 - pushl $do_divide_error
1107 +KPROBE_ENTRY(page_fault)
1108 + RING0_EC_FRAME
1109 + pushl $do_page_fault
1110 CFI_ADJUST_CFA_OFFSET 4
1111 ALIGN
1112 error_code:
1113 @@ -767,6 +769,7 @@
1114 call *%edi
1115 jmp ret_from_exception
1116 CFI_ENDPROC
1117 +KPROBE_END(page_fault)
1118
1119 #ifdef CONFIG_XEN
1120 # A note on the "critical region" in our callback handler.
1121 @@ -926,7 +929,7 @@
1122 CFI_ADJUST_CFA_OFFSET 4
1123 SAVE_ALL
1124 #ifndef CONFIG_XEN
1125 - movl %cr0, %eax
1126 + GET_CR0_INTO_EAX
1127 testl $0x4, %eax # EM (math emulation bit)
1128 je device_available_emulate
1129 pushl $0 # temporary storage for ORIG_EIP
1130 @@ -961,9 +964,15 @@
1131 jne ok; \
1132 label: \
1133 movl SYSENTER_stack_esp0+offset(%esp),%esp; \
1134 + CFI_DEF_CFA esp, 0; \
1135 + CFI_UNDEFINED eip; \
1136 pushfl; \
1137 + CFI_ADJUST_CFA_OFFSET 4; \
1138 pushl $__KERNEL_CS; \
1139 - pushl $sysenter_past_esp
1140 + CFI_ADJUST_CFA_OFFSET 4; \
1141 + pushl $sysenter_past_esp; \
1142 + CFI_ADJUST_CFA_OFFSET 4; \
1143 + CFI_REL_OFFSET eip, 0
1144 #endif /* CONFIG_XEN */
1145
1146 KPROBE_ENTRY(debug)
1147 @@ -982,7 +991,8 @@
1148 call do_debug
1149 jmp ret_from_exception
1150 CFI_ENDPROC
1151 - .previous .text
1152 +KPROBE_END(debug)
1153 +
1154 #ifndef CONFIG_XEN
1155 /*
1156 * NMI is doubly nasty. It can happen _while_ we're handling
1157 @@ -992,7 +1002,7 @@
1158 * check whether we got an NMI on the debug path where the debug
1159 * fault happened on the sysenter path.
1160 */
1161 -ENTRY(nmi)
1162 +KPROBE_ENTRY(nmi)
1163 RING0_INT_FRAME
1164 pushl %eax
1165 CFI_ADJUST_CFA_OFFSET 4
1166 @@ -1017,6 +1027,7 @@
1167 cmpl $sysenter_entry,12(%esp)
1168 je nmi_debug_stack_check
1169 nmi_stack_correct:
1170 + /* We have a RING0_INT_FRAME here */
1171 pushl %eax
1172 CFI_ADJUST_CFA_OFFSET 4
1173 SAVE_ALL
1174 @@ -1027,9 +1038,12 @@
1175 CFI_ENDPROC
1176
1177 nmi_stack_fixup:
1178 + RING0_INT_FRAME
1179 FIX_STACK(12,nmi_stack_correct, 1)
1180 jmp nmi_stack_correct
1181 +
1182 nmi_debug_stack_check:
1183 + /* We have a RING0_INT_FRAME here */
1184 cmpw $__KERNEL_CS,16(%esp)
1185 jne nmi_stack_correct
1186 cmpl $debug,(%esp)
1187 @@ -1040,8 +1054,10 @@
1188 jmp nmi_stack_correct
1189
1190 nmi_16bit_stack:
1191 - RING0_INT_FRAME
1192 - /* create the pointer to lss back */
1193 + /* We have a RING0_INT_FRAME here.
1194 + *
1195 + * create the pointer to lss back
1196 + */
1197 pushl %ss
1198 CFI_ADJUST_CFA_OFFSET 4
1199 pushl %esp
1200 @@ -1062,14 +1078,14 @@
1201 call do_nmi
1202 RESTORE_REGS
1203 lss 12+4(%esp), %esp # back to 16bit stack
1204 -1: iret
1205 +1: INTERRUPT_RETURN
1206 CFI_ENDPROC
1207 .section __ex_table,"a"
1208 .align 4
1209 .long 1b,iret_exc
1210 .previous
1211 #else
1212 -ENTRY(nmi)
1213 +KPROBE_ENTRY(nmi)
1214 RING0_INT_FRAME
1215 pushl %eax
1216 CFI_ADJUST_CFA_OFFSET 4
1217 @@ -1081,6 +1097,7 @@
1218 jmp restore_all
1219 CFI_ENDPROC
1220 #endif
1221 +KPROBE_END(nmi)
1222
1223 KPROBE_ENTRY(int3)
1224 RING0_INT_FRAME
1225 @@ -1092,7 +1109,7 @@
1226 call do_int3
1227 jmp ret_from_exception
1228 CFI_ENDPROC
1229 - .previous .text
1230 +KPROBE_END(int3)
1231
1232 ENTRY(overflow)
1233 RING0_INT_FRAME
1234 @@ -1157,7 +1174,7 @@
1235 CFI_ADJUST_CFA_OFFSET 4
1236 jmp error_code
1237 CFI_ENDPROC
1238 - .previous .text
1239 +KPROBE_END(general_protection)
1240
1241 ENTRY(alignment_check)
1242 RING0_EC_FRAME
1243 @@ -1166,13 +1183,14 @@
1244 jmp error_code
1245 CFI_ENDPROC
1246
1247 -KPROBE_ENTRY(page_fault)
1248 - RING0_EC_FRAME
1249 - pushl $do_page_fault
1250 +ENTRY(divide_error)
1251 + RING0_INT_FRAME
1252 + pushl $0 # no error code
1253 + CFI_ADJUST_CFA_OFFSET 4
1254 + pushl $do_divide_error
1255 CFI_ADJUST_CFA_OFFSET 4
1256 jmp error_code
1257 CFI_ENDPROC
1258 - .previous .text
1259
1260 #ifdef CONFIG_X86_MCE
1261 ENTRY(machine_check)
1262 @@ -1234,6 +1252,19 @@
1263 jmp error_code
1264 CFI_ENDPROC
1265
1266 +ENTRY(kernel_thread_helper)
1267 + pushl $0 # fake return address for unwinder
1268 + CFI_STARTPROC
1269 + movl %edx,%eax
1270 + push %edx
1271 + CFI_ADJUST_CFA_OFFSET 4
1272 + call *%ebx
1273 + push %eax
1274 + CFI_ADJUST_CFA_OFFSET 4
1275 + call do_exit
1276 + CFI_ENDPROC
1277 +ENDPROC(kernel_thread_helper)
1278 +
1279 .section .rodata,"a"
1280 #include "syscall_table.S"
1281
1282 --- a/arch/x86/kernel/entry_64-xen.S
1283 +++ b/arch/x86/kernel/entry_64-xen.S
1284 @@ -4,9 +4,6 @@
1285 * Copyright (C) 1991, 1992 Linus Torvalds
1286 * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
1287 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
1288 - *
1289 - * $Id: 1020-2.6.25-xen-patch-2.6.19.patch,v 1.1 2008-05-23 17:35:36 niro Exp $
1290 - *
1291 * Jun Nakajima <jun.nakajima@intel.com>
1292 * Asit Mallick <asit.k.mallick@intel.com>
1293 * Modified for Xen
1294 @@ -26,15 +23,25 @@
1295 * at the top of the kernel process stack.
1296 * - partial stack frame: partially saved registers upto R11.
1297 * - full stack frame: Like partial stack frame, but all register saved.
1298 - *
1299 - * TODO:
1300 - * - schedule it carefully for the final hardware.
1301 + *
1302 + * Some macro usage:
1303 + * - CFI macros are used to generate dwarf2 unwind information for better
1304 + * backtraces. They don't change any code.
1305 + * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
1306 + * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
1307 + * There are unfortunately lots of special cases where some registers
1308 + * not touched. The macro is a big mess that should be cleaned up.
1309 + * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
1310 + * Gives a full stack frame.
1311 + * - ENTRY/END Define functions in the symbol table.
1312 + * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
1313 + * frame that is otherwise undefined after a SYSCALL
1314 + * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
1315 + * - errorentry/paranoidentry/zeroentry - Define exception entry points.
1316 */
1317
1318 -#define ASSEMBLY 1
1319 #include <linux/linkage.h>
1320 #include <asm/segment.h>
1321 -#include <asm/smp.h>
1322 #include <asm/cache.h>
1323 #include <asm/errno.h>
1324 #include <asm/dwarf2.h>
1325 @@ -117,6 +124,7 @@
1326 .macro CFI_DEFAULT_STACK start=1,adj=0
1327 .if \start
1328 CFI_STARTPROC simple
1329 + CFI_SIGNAL_FRAME
1330 CFI_DEF_CFA rsp,SS+8-(\adj*ARGOFFSET)
1331 .else
1332 CFI_DEF_CFA_OFFSET SS+8-(\adj*ARGOFFSET)
1333 @@ -207,6 +215,7 @@
1334 */
1335 .macro _frame ref
1336 CFI_STARTPROC simple
1337 + CFI_SIGNAL_FRAME
1338 CFI_DEF_CFA rsp,SS+8-\ref
1339 /*CFI_REL_OFFSET ss,SS-\ref*/
1340 CFI_REL_OFFSET rsp,RSP-\ref
1341 @@ -334,6 +343,8 @@
1342 LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
1343 RESTORE_REST
1344 cmpq $__NR_syscall_max,%rax
1345 + movq $-ENOSYS,%rcx
1346 + cmova %rcx,%rax
1347 ja 1f
1348 movq %r10,%rcx /* fixup for C */
1349 call *sys_call_table(,%rax,8)
1350 @@ -349,6 +360,7 @@
1351 */
1352 ENTRY(int_ret_from_sys_call)
1353 CFI_STARTPROC simple
1354 + CFI_SIGNAL_FRAME
1355 CFI_DEF_CFA rsp,SS+8-ARGOFFSET
1356 /*CFI_REL_OFFSET ss,SS-ARGOFFSET*/
1357 CFI_REL_OFFSET rsp,RSP-ARGOFFSET
1358 @@ -583,8 +595,7 @@
1359 #ifdef CONFIG_PREEMPT
1360 /* Returning to kernel space. Check if we need preemption */
1361 /* rcx: threadinfo. interrupts off. */
1362 - .p2align
1363 -retint_kernel:
1364 +ENTRY(retint_kernel)
1365 cmpl $0,threadinfo_preempt_count(%rcx)
1366 jnz retint_restore_args
1367 bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx)
1368 @@ -644,7 +655,6 @@
1369 END(call_function_interrupt)
1370 #endif
1371
1372 -#ifdef CONFIG_X86_LOCAL_APIC
1373 ENTRY(apic_timer_interrupt)
1374 apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
1375 END(apic_timer_interrupt)
1376 @@ -656,7 +666,6 @@
1377 ENTRY(spurious_interrupt)
1378 apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
1379 END(spurious_interrupt)
1380 -#endif
1381 #endif /* !CONFIG_XEN */
1382
1383 /*
1384 @@ -755,7 +764,9 @@
1385 testl $3,CS(%rsp)
1386 jnz paranoid_userspace\trace
1387 paranoid_swapgs\trace:
1388 + .if \trace
1389 TRACE_IRQS_IRETQ 0
1390 + .endif
1391 swapgs
1392 paranoid_restore\trace:
1393 RESTORE_ALL 8
1394 @@ -802,7 +813,7 @@
1395 * Exception entry point. This expects an error code/orig_rax on the stack
1396 * and the exception handler in %rax.
1397 */
1398 -ENTRY(error_entry)
1399 +KPROBE_ENTRY(error_entry)
1400 _frame RDI
1401 CFI_REL_OFFSET rax,0
1402 /* rdi slot contains rax, oldrax contains error code */
1403 @@ -896,7 +907,7 @@
1404 jmp error_sti
1405 #endif
1406 CFI_ENDPROC
1407 -END(error_entry)
1408 +KPROBE_END(error_entry)
1409
1410 ENTRY(hypervisor_callback)
1411 zeroentry do_hypervisor_callback
1412 @@ -936,26 +947,6 @@
1413 CFI_ENDPROC
1414 END(do_hypervisor_callback)
1415
1416 -#ifdef CONFIG_X86_LOCAL_APIC
1417 -KPROBE_ENTRY(nmi)
1418 - zeroentry do_nmi_callback
1419 -ENTRY(do_nmi_callback)
1420 - CFI_STARTPROC
1421 - addq $8, %rsp
1422 - CFI_ENDPROC
1423 - CFI_DEFAULT_STACK
1424 - call do_nmi
1425 - orl $NMI_MASK,EFLAGS(%rsp)
1426 - RESTORE_REST
1427 - XEN_BLOCK_EVENTS(%rsi)
1428 - TRACE_IRQS_OFF
1429 - GET_THREAD_INFO(%rcx)
1430 - jmp retint_restore_args
1431 - CFI_ENDPROC
1432 - .previous .text
1433 -END(nmi)
1434 -#endif
1435 -
1436 ALIGN
1437 restore_all_enable_events:
1438 CFI_DEFAULT_STACK adj=1
1439 @@ -1121,7 +1112,7 @@
1440 * do_sys_execve asm fallback arguments:
1441 * rdi: name, rsi: argv, rdx: envp, fake frame on the stack
1442 */
1443 -ENTRY(execve)
1444 +ENTRY(kernel_execve)
1445 CFI_STARTPROC
1446 FAKE_STACK_FRAME $0
1447 SAVE_ALL
1448 @@ -1135,12 +1126,11 @@
1449 UNFAKE_STACK_FRAME
1450 ret
1451 CFI_ENDPROC
1452 -ENDPROC(execve)
1453 +ENDPROC(kernel_execve)
1454
1455 KPROBE_ENTRY(page_fault)
1456 errorentry do_page_fault
1457 -END(page_fault)
1458 - .previous .text
1459 +KPROBE_END(page_fault)
1460
1461 ENTRY(coprocessor_error)
1462 zeroentry do_coprocessor_error
1463 @@ -1162,25 +1152,25 @@
1464 zeroentry do_debug
1465 /* paranoidexit
1466 CFI_ENDPROC */
1467 -END(debug)
1468 - .previous .text
1469 +KPROBE_END(debug)
1470
1471 -#if 0
1472 - /* runs on exception stack */
1473 KPROBE_ENTRY(nmi)
1474 - INTR_FRAME
1475 - pushq $-1
1476 - CFI_ADJUST_CFA_OFFSET 8
1477 - paranoidentry do_nmi, 0, 0
1478 -#ifdef CONFIG_TRACE_IRQFLAGS
1479 - paranoidexit 0
1480 -#else
1481 - jmp paranoid_exit1
1482 - CFI_ENDPROC
1483 -#endif
1484 -END(nmi)
1485 - .previous .text
1486 -#endif
1487 + zeroentry do_nmi_callback
1488 +KPROBE_END(nmi)
1489 +do_nmi_callback:
1490 + CFI_STARTPROC
1491 + addq $8, %rsp
1492 + CFI_ENDPROC
1493 + CFI_DEFAULT_STACK
1494 + call do_nmi
1495 + orl $NMI_MASK,EFLAGS(%rsp)
1496 + RESTORE_REST
1497 + XEN_BLOCK_EVENTS(%rsi)
1498 + TRACE_IRQS_OFF
1499 + GET_THREAD_INFO(%rcx)
1500 + jmp retint_restore_args
1501 + CFI_ENDPROC
1502 +END(do_nmi_callback)
1503
1504 KPROBE_ENTRY(int3)
1505 /* INTR_FRAME
1506 @@ -1189,8 +1179,7 @@
1507 zeroentry do_int3
1508 /* jmp paranoid_exit1
1509 CFI_ENDPROC */
1510 -END(int3)
1511 - .previous .text
1512 +KPROBE_END(int3)
1513
1514 ENTRY(overflow)
1515 zeroentry do_overflow
1516 @@ -1241,8 +1230,7 @@
1517
1518 KPROBE_ENTRY(general_protection)
1519 errorentry do_general_protection
1520 -END(general_protection)
1521 - .previous .text
1522 +KPROBE_END(general_protection)
1523
1524 ENTRY(alignment_check)
1525 errorentry do_alignment_check
1526 --- a/arch/x86/kernel/genapic_xen_64.c
1527 +++ b/arch/x86/kernel/genapic_xen_64.c
1528 @@ -71,6 +71,13 @@
1529 return cpu_online_map;
1530 }
1531
1532 +static cpumask_t xen_vector_allocation_domain(int cpu)
1533 +{
1534 + cpumask_t domain = CPU_MASK_NONE;
1535 + cpu_set(cpu, domain);
1536 + return domain;
1537 +}
1538 +
1539 /*
1540 * Set up the logical destination ID.
1541 * Do nothing, not called now.
1542 @@ -147,8 +154,8 @@
1543 .int_delivery_mode = dest_LowestPrio,
1544 #endif
1545 .int_dest_mode = (APIC_DEST_LOGICAL != 0),
1546 - .int_delivery_dest = APIC_DEST_LOGICAL | APIC_DM_LOWEST,
1547 .target_cpus = xen_target_cpus,
1548 + .vector_allocation_domain = xen_vector_allocation_domain,
1549 #ifdef CONFIG_XEN_PRIVILEGED_GUEST
1550 .apic_id_registered = xen_apic_id_registered,
1551 #endif
1552 --- a/arch/x86/kernel/head64-xen.c
1553 +++ b/arch/x86/kernel/head64-xen.c
1554 @@ -54,11 +54,9 @@
1555 new_data = *(int *) (x86_boot_params + NEW_CL_POINTER);
1556 if (!new_data) {
1557 if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) {
1558 - printk("so old bootloader that it does not support commandline?!\n");
1559 return;
1560 }
1561 new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET;
1562 - printk("old bootloader convention, maybe loadlin?\n");
1563 }
1564 command_line = (char *) ((u64)(new_data));
1565 memcpy(saved_command_line, command_line, COMMAND_LINE_SIZE);
1566 @@ -70,25 +68,6 @@
1567 memcpy(saved_command_line, xen_start_info->cmd_line, max_cmdline);
1568 saved_command_line[max_cmdline-1] = '\0';
1569 #endif
1570 - printk("Bootdata ok (command line is %s)\n", saved_command_line);
1571 -}
1572 -
1573 -static void __init setup_boot_cpu_data(void)
1574 -{
1575 - unsigned int dummy, eax;
1576 -
1577 - /* get vendor info */
1578 - cpuid(0, (unsigned int *)&boot_cpu_data.cpuid_level,
1579 - (unsigned int *)&boot_cpu_data.x86_vendor_id[0],
1580 - (unsigned int *)&boot_cpu_data.x86_vendor_id[8],
1581 - (unsigned int *)&boot_cpu_data.x86_vendor_id[4]);
1582 -
1583 - /* get cpu type */
1584 - cpuid(1, &eax, &dummy, &dummy,
1585 - (unsigned int *) &boot_cpu_data.x86_capability);
1586 - boot_cpu_data.x86 = (eax >> 8) & 0xf;
1587 - boot_cpu_data.x86_model = (eax >> 4) & 0xf;
1588 - boot_cpu_data.x86_mask = eax & 0xf;
1589 }
1590
1591 #include <xen/interface/memory.h>
1592 @@ -101,7 +80,6 @@
1593 {
1594 struct xen_machphys_mapping mapping;
1595 unsigned long machine_to_phys_nr_ents;
1596 - char *s;
1597 int i;
1598
1599 setup_xen_features();
1600 @@ -128,10 +106,7 @@
1601 asm volatile("lidt %0" :: "m" (idt_descr));
1602 #endif
1603
1604 - /*
1605 - * This must be called really, really early:
1606 - */
1607 - lockdep_init();
1608 + early_printk("Kernel alive\n");
1609
1610 for (i = 0; i < NR_CPUS; i++)
1611 cpu_pda(i) = &boot_cpu_pda[i];
1612 @@ -141,22 +116,5 @@
1613 #ifdef CONFIG_SMP
1614 cpu_set(0, cpu_online_map);
1615 #endif
1616 - s = strstr(saved_command_line, "earlyprintk=");
1617 - if (s != NULL)
1618 - setup_early_printk(strchr(s, '=') + 1);
1619 -#ifdef CONFIG_NUMA
1620 - s = strstr(saved_command_line, "numa=");
1621 - if (s != NULL)
1622 - numa_setup(s+5);
1623 -#endif
1624 -#ifdef CONFIG_X86_IO_APIC
1625 - if (strstr(saved_command_line, "disableapic"))
1626 - disable_apic = 1;
1627 -#endif
1628 - /* You need early console to see that */
1629 - if (__pa_symbol(&_end) >= KERNEL_TEXT_SIZE)
1630 - panic("Kernel too big for kernel mapping\n");
1631 -
1632 - setup_boot_cpu_data();
1633 start_kernel();
1634 }
1635 --- a/arch/x86/kernel/head_32-xen.S
1636 +++ b/arch/x86/kernel/head_32-xen.S
1637 @@ -62,7 +62,7 @@
1638 movl %eax,%gs
1639 cld # gcc2 wants the direction flag cleared at all times
1640
1641 - pushl %eax # fake return address
1642 + pushl $0 # fake return address for unwinder
1643 jmp start_kernel
1644
1645 #define HYPERCALL_PAGE_OFFSET 0x1000
1646 --- a/arch/x86/kernel/head_64-xen.S
1647 +++ b/arch/x86/kernel/head_64-xen.S
1648 @@ -5,9 +5,6 @@
1649 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
1650 * Copyright (C) 2000 Karsten Keil <kkeil@suse.de>
1651 * Copyright (C) 2001,2002 Andi Kleen <ak@suse.de>
1652 - *
1653 - * $Id: 1020-2.6.25-xen-patch-2.6.19.patch,v 1.1 2008-05-23 17:35:36 niro Exp $
1654 - *
1655 * Jun Nakajima <jun.nakajima@intel.com>
1656 * Modified for Xen
1657 */
1658 @@ -149,7 +146,7 @@
1659 .quad 0,0 /* TSS */
1660 .quad 0,0 /* LDT */
1661 .quad 0,0,0 /* three TLS descriptors */
1662 - .quad 0 /* unused */
1663 + .quad 0x0000f40000000000 /* node/CPU stored in limit */
1664 gdt_end:
1665 /* asm/segment.h:GDT_ENTRIES must match this */
1666 /* This should be a multiple of the cache line size */
1667 --- a/arch/x86/kernel/io_apic_32-xen.c
1668 +++ b/arch/x86/kernel/io_apic_32-xen.c
1669 @@ -31,6 +31,9 @@
1670 #include <linux/acpi.h>
1671 #include <linux/module.h>
1672 #include <linux/sysdev.h>
1673 +#include <linux/pci.h>
1674 +#include <linux/msi.h>
1675 +#include <linux/htirq.h>
1676
1677 #include <asm/io.h>
1678 #include <asm/smp.h>
1679 @@ -38,13 +41,15 @@
1680 #include <asm/timer.h>
1681 #include <asm/i8259.h>
1682 #include <asm/nmi.h>
1683 +#include <asm/msidef.h>
1684 +#include <asm/hypertransport.h>
1685
1686 #include <mach_apic.h>
1687 +#include <mach_apicdef.h>
1688
1689 #include "io_ports.h"
1690
1691 #ifdef CONFIG_XEN
1692 -
1693 #include <xen/interface/xen.h>
1694 #include <xen/interface/physdev.h>
1695
1696 @@ -55,32 +60,7 @@
1697
1698 unsigned long io_apic_irqs;
1699
1700 -static inline unsigned int xen_io_apic_read(unsigned int apic, unsigned int reg)
1701 -{
1702 - struct physdev_apic apic_op;
1703 - int ret;
1704 -
1705 - apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
1706 - apic_op.reg = reg;
1707 - ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
1708 - if (ret)
1709 - return ret;
1710 - return apic_op.value;
1711 -}
1712 -
1713 -static inline void xen_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
1714 -{
1715 - struct physdev_apic apic_op;
1716 -
1717 - apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
1718 - apic_op.reg = reg;
1719 - apic_op.value = value;
1720 - WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
1721 -}
1722 -
1723 -#define io_apic_read(a,r) xen_io_apic_read(a,r)
1724 -#define io_apic_write(a,r,v) xen_io_apic_write(a,r,v)
1725 -
1726 +#define clear_IO_APIC() ((void)0)
1727 #endif /* CONFIG_XEN */
1728
1729 int (*ioapic_renumber_irq)(int ioapic, int irq);
1730 @@ -105,7 +85,7 @@
1731 */
1732 int nr_ioapic_registers[MAX_IO_APICS];
1733
1734 -int disable_timer_pin_1 __initdata;
1735 +static int disable_timer_pin_1 __initdata;
1736
1737 /*
1738 * Rough estimation of how many shared IRQs there are, can
1739 @@ -125,12 +105,122 @@
1740 int apic, pin, next;
1741 } irq_2_pin[PIN_MAP_SIZE];
1742
1743 -int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1};
1744 -#ifdef CONFIG_PCI_MSI
1745 -#define vector_to_irq(vector) \
1746 - (platform_legacy_irq(vector) ? vector : vector_irq[vector])
1747 +#ifndef CONFIG_XEN
1748 +struct io_apic {
1749 + unsigned int index;
1750 + unsigned int unused[3];
1751 + unsigned int data;
1752 +};
1753 +
1754 +static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
1755 +{
1756 + return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
1757 + + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
1758 +}
1759 +#endif
1760 +
1761 +static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
1762 +{
1763 +#ifndef CONFIG_XEN
1764 + struct io_apic __iomem *io_apic = io_apic_base(apic);
1765 + writel(reg, &io_apic->index);
1766 + return readl(&io_apic->data);
1767 +#else
1768 + struct physdev_apic apic_op;
1769 + int ret;
1770 +
1771 + apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
1772 + apic_op.reg = reg;
1773 + ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
1774 + if (ret)
1775 + return ret;
1776 + return apic_op.value;
1777 +#endif
1778 +}
1779 +
1780 +static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
1781 +{
1782 +#ifndef CONFIG_XEN
1783 + struct io_apic __iomem *io_apic = io_apic_base(apic);
1784 + writel(reg, &io_apic->index);
1785 + writel(value, &io_apic->data);
1786 +#else
1787 + struct physdev_apic apic_op;
1788 +
1789 + apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
1790 + apic_op.reg = reg;
1791 + apic_op.value = value;
1792 + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
1793 +#endif
1794 +}
1795 +
1796 +#ifndef CONFIG_XEN
1797 +/*
1798 + * Re-write a value: to be used for read-modify-write
1799 + * cycles where the read already set up the index register.
1800 + *
1801 + * Older SiS APIC requires we rewrite the index register
1802 + */
1803 +static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
1804 +{
1805 + volatile struct io_apic *io_apic = io_apic_base(apic);
1806 + if (sis_apic_bug)
1807 + writel(reg, &io_apic->index);
1808 + writel(value, &io_apic->data);
1809 +}
1810 #else
1811 -#define vector_to_irq(vector) (vector)
1812 +#define io_apic_modify io_apic_write
1813 +#endif
1814 +
1815 +union entry_union {
1816 + struct { u32 w1, w2; };
1817 + struct IO_APIC_route_entry entry;
1818 +};
1819 +
1820 +static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
1821 +{
1822 + union entry_union eu;
1823 + unsigned long flags;
1824 + spin_lock_irqsave(&ioapic_lock, flags);
1825 + eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
1826 + eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
1827 + spin_unlock_irqrestore(&ioapic_lock, flags);
1828 + return eu.entry;
1829 +}
1830 +
1831 +/*
1832 + * When we write a new IO APIC routing entry, we need to write the high
1833 + * word first! If the mask bit in the low word is clear, we will enable
1834 + * the interrupt, and we need to make sure the entry is fully populated
1835 + * before that happens.
1836 + */
1837 +static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
1838 +{
1839 + unsigned long flags;
1840 + union entry_union eu;
1841 + eu.entry = e;
1842 + spin_lock_irqsave(&ioapic_lock, flags);
1843 + io_apic_write(apic, 0x11 + 2*pin, eu.w2);
1844 + io_apic_write(apic, 0x10 + 2*pin, eu.w1);
1845 + spin_unlock_irqrestore(&ioapic_lock, flags);
1846 +}
1847 +
1848 +#ifndef CONFIG_XEN
1849 +/*
1850 + * When we mask an IO APIC routing entry, we need to write the low
1851 + * word first, in order to set the mask bit before we change the
1852 + * high bits!
1853 + */
1854 +static void ioapic_mask_entry(int apic, int pin)
1855 +{
1856 + unsigned long flags;
1857 + union entry_union eu = { .entry.mask = 1 };
1858 +
1859 + spin_lock_irqsave(&ioapic_lock, flags);
1860 + io_apic_write(apic, 0x10 + 2*pin, eu.w1);
1861 + io_apic_write(apic, 0x11 + 2*pin, eu.w2);
1862 + spin_unlock_irqrestore(&ioapic_lock, flags);
1863 +}
1864 #endif
1865
1866 /*
1867 @@ -156,9 +246,7 @@
1868 entry->pin = pin;
1869 }
1870
1871 -#ifdef CONFIG_XEN
1872 -#define clear_IO_APIC() ((void)0)
1873 -#else
1874 +#ifndef CONFIG_XEN
1875 /*
1876 * Reroute an IRQ to a different pin.
1877 */
1878 @@ -243,25 +331,16 @@
1879 static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
1880 {
1881 struct IO_APIC_route_entry entry;
1882 - unsigned long flags;
1883
1884 /* Check delivery_mode to be sure we're not clearing an SMI pin */
1885 - spin_lock_irqsave(&ioapic_lock, flags);
1886 - *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
1887 - *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
1888 - spin_unlock_irqrestore(&ioapic_lock, flags);
1889 + entry = ioapic_read_entry(apic, pin);
1890 if (entry.delivery_mode == dest_SMI)
1891 return;
1892
1893 /*
1894 * Disable it in the IO-APIC irq-routing table:
1895 */
1896 - memset(&entry, 0, sizeof(entry));
1897 - entry.mask = 1;
1898 - spin_lock_irqsave(&ioapic_lock, flags);
1899 - io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
1900 - io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
1901 - spin_unlock_irqrestore(&ioapic_lock, flags);
1902 + ioapic_mask_entry(apic, pin);
1903 }
1904
1905 static void clear_IO_APIC (void)
1906 @@ -301,7 +380,7 @@
1907 break;
1908 entry = irq_2_pin + entry->next;
1909 }
1910 - set_irq_info(irq, cpumask);
1911 + set_native_irq_info(irq, cpumask);
1912 spin_unlock_irqrestore(&ioapic_lock, flags);
1913 }
1914
1915 @@ -1207,40 +1286,40 @@
1916 /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
1917 u8 irq_vector[NR_IRQ_VECTORS] __read_mostly; /* = { FIRST_DEVICE_VECTOR , 0 }; */
1918
1919 -int assign_irq_vector(int irq)
1920 +static int __assign_irq_vector(int irq)
1921 {
1922 - unsigned long flags;
1923 int vector;
1924 struct physdev_irq irq_op;
1925
1926 - BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS);
1927 -
1928 - spin_lock_irqsave(&vector_lock, flags);
1929 + BUG_ON((unsigned)irq >= NR_IRQ_VECTORS);
1930
1931 - if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) {
1932 - spin_unlock_irqrestore(&vector_lock, flags);
1933 - return IO_APIC_VECTOR(irq);
1934 - }
1935 + if (irq_vector[irq] > 0)
1936 + return irq_vector[irq];
1937
1938 irq_op.irq = irq;
1939 - if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) {
1940 - spin_unlock_irqrestore(&vector_lock, flags);
1941 + if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op))
1942 return -ENOSPC;
1943 - }
1944
1945 vector = irq_op.vector;
1946 - vector_irq[vector] = irq;
1947 - if (irq != AUTO_ASSIGN)
1948 - IO_APIC_VECTOR(irq) = vector;
1949 + irq_vector[irq] = vector;
1950 +
1951 + return vector;
1952 +}
1953
1954 +static int assign_irq_vector(int irq)
1955 +{
1956 + unsigned long flags;
1957 + int vector;
1958 +
1959 + spin_lock_irqsave(&vector_lock, flags);
1960 + vector = __assign_irq_vector(irq);
1961 spin_unlock_irqrestore(&vector_lock, flags);
1962
1963 return vector;
1964 }
1965
1966 #ifndef CONFIG_XEN
1967 -static struct hw_interrupt_type ioapic_level_type;
1968 -static struct hw_interrupt_type ioapic_edge_type;
1969 +static struct irq_chip ioapic_chip;
1970
1971 #define IOAPIC_AUTO -1
1972 #define IOAPIC_EDGE 0
1973 @@ -1248,16 +1327,16 @@
1974
1975 static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
1976 {
1977 - unsigned idx;
1978 -
1979 - idx = use_pci_vector() && !platform_legacy_irq(irq) ? vector : irq;
1980 -
1981 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
1982 trigger == IOAPIC_LEVEL)
1983 - irq_desc[idx].chip = &ioapic_level_type;
1984 - else
1985 - irq_desc[idx].chip = &ioapic_edge_type;
1986 - set_intr_gate(vector, interrupt[idx]);
1987 + set_irq_chip_and_handler_name(irq, &ioapic_chip,
1988 + handle_fasteoi_irq, "fasteoi");
1989 + else {
1990 + irq_desc[irq].status |= IRQ_DELAYED_DISABLE;
1991 + set_irq_chip_and_handler_name(irq, &ioapic_chip,
1992 + handle_edge_irq, "edge");
1993 + }
1994 + set_intr_gate(vector, interrupt[irq]);
1995 }
1996 #else
1997 #define ioapic_register_intr(_irq,_vector,_trigger) ((void)0)
1998 @@ -1328,9 +1407,8 @@
1999 if (!apic && (irq < 16))
2000 disable_8259A_irq(irq);
2001 }
2002 + ioapic_write_entry(apic, pin, entry);
2003 spin_lock_irqsave(&ioapic_lock, flags);
2004 - io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
2005 - io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
2006 set_native_irq_info(irq, TARGET_CPUS);
2007 spin_unlock_irqrestore(&ioapic_lock, flags);
2008 }
2009 @@ -1347,7 +1425,6 @@
2010 static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
2011 {
2012 struct IO_APIC_route_entry entry;
2013 - unsigned long flags;
2014
2015 memset(&entry,0,sizeof(entry));
2016
2017 @@ -1372,15 +1449,13 @@
2018 * The timer IRQ doesn't have to know that behind the
2019 * scene we have a 8259A-master in AEOI mode ...
2020 */
2021 - irq_desc[0].chip = &ioapic_edge_type;
2022 + irq_desc[0].chip = &ioapic_chip;
2023 + set_irq_handler(0, handle_edge_irq);
2024
2025 /*
2026 * Add it to the IO-APIC irq-routing table:
2027 */
2028 - spin_lock_irqsave(&ioapic_lock, flags);
2029 - io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
2030 - io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
2031 - spin_unlock_irqrestore(&ioapic_lock, flags);
2032 + ioapic_write_entry(apic, pin, entry);
2033
2034 enable_8259A_irq(0);
2035 }
2036 @@ -1490,10 +1565,7 @@
2037 for (i = 0; i <= reg_01.bits.entries; i++) {
2038 struct IO_APIC_route_entry entry;
2039
2040 - spin_lock_irqsave(&ioapic_lock, flags);
2041 - *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2);
2042 - *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2);
2043 - spin_unlock_irqrestore(&ioapic_lock, flags);
2044 + entry = ioapic_read_entry(apic, i);
2045
2046 printk(KERN_DEBUG " %02x %03X %02X ",
2047 i,
2048 @@ -1513,17 +1585,12 @@
2049 );
2050 }
2051 }
2052 - if (use_pci_vector())
2053 - printk(KERN_INFO "Using vector-based indexing\n");
2054 printk(KERN_DEBUG "IRQ to pin mappings:\n");
2055 for (i = 0; i < NR_IRQS; i++) {
2056 struct irq_pin_list *entry = irq_2_pin + i;
2057 if (entry->pin < 0)
2058 continue;
2059 - if (use_pci_vector() && !platform_legacy_irq(i))
2060 - printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i));
2061 - else
2062 - printk(KERN_DEBUG "IRQ%d ", i);
2063 + printk(KERN_DEBUG "IRQ%d ", i);
2064 for (;;) {
2065 printk("-> %d:%d", entry->apic, entry->pin);
2066 if (!entry->next)
2067 @@ -1709,10 +1776,7 @@
2068 /* See if any of the pins is in ExtINT mode */
2069 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
2070 struct IO_APIC_route_entry entry;
2071 - spin_lock_irqsave(&ioapic_lock, flags);
2072 - *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
2073 - *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
2074 - spin_unlock_irqrestore(&ioapic_lock, flags);
2075 + entry = ioapic_read_entry(apic, pin);
2076
2077
2078 /* If the interrupt line is enabled and in ExtInt mode
2079 @@ -1770,7 +1834,6 @@
2080 */
2081 if (ioapic_i8259.pin != -1) {
2082 struct IO_APIC_route_entry entry;
2083 - unsigned long flags;
2084
2085 memset(&entry, 0, sizeof(entry));
2086 entry.mask = 0; /* Enabled */
2087 @@ -1787,12 +1850,7 @@
2088 /*
2089 * Add it to the IO-APIC irq-routing table:
2090 */
2091 - spin_lock_irqsave(&ioapic_lock, flags);
2092 - io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin,
2093 - *(((int *)&entry)+1));
2094 - io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin,
2095 - *(((int *)&entry)+0));
2096 - spin_unlock_irqrestore(&ioapic_lock, flags);
2097 + ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
2098 }
2099 disconnect_bsp_APIC(ioapic_i8259.pin != -1);
2100 #endif
2101 @@ -1959,6 +2017,8 @@
2102 */
2103
2104 /*
2105 + * Startup quirk:
2106 + *
2107 * Starting up a edge-triggered IO-APIC interrupt is
2108 * nasty - we need to make sure that we get the edge.
2109 * If it is already asserted for some reason, we need
2110 @@ -1966,8 +2026,10 @@
2111 *
2112 * This is not complete - we should be able to fake
2113 * an edge even if it isn't on the 8259A...
2114 + *
2115 + * (We do this for level-triggered IRQs too - it cannot hurt.)
2116 */
2117 -static unsigned int startup_edge_ioapic_irq(unsigned int irq)
2118 +static unsigned int startup_ioapic_irq(unsigned int irq)
2119 {
2120 int was_pending = 0;
2121 unsigned long flags;
2122 @@ -1984,47 +2046,18 @@
2123 return was_pending;
2124 }
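(Illustrative aside, not part of the diff: the body of the renamed startup_ioapic_irq() is elided by the hunk boundary above. Under the quirk described in the comment, such a handler typically catches an edge already latched in the 8259A before unmasking the IO-APIC pin. The sketch below is a reconstruction under that assumption; the example_* name and the lock-free __unmask_IO_APIC_irq() helper are assumptions, not quoted from the patch.)

    static unsigned int example_startup_ioapic_irq(unsigned int irq)
    {
    	int was_pending = 0;
    	unsigned long flags;

    	spin_lock_irqsave(&ioapic_lock, flags);
    	if (irq < 16) {
    		disable_8259A_irq(irq);
    		if (i8259A_irq_pending(irq))	/* edge already asserted?   */
    			was_pending = 1;
    	}
    	__unmask_IO_APIC_irq(irq);		/* assumed lock-free unmask */
    	spin_unlock_irqrestore(&ioapic_lock, flags);

    	return was_pending;			/* core replays it if set   */
    }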
2125
2126 -/*
2127 - * Once we have recorded IRQ_PENDING already, we can mask the
2128 - * interrupt for real. This prevents IRQ storms from unhandled
2129 - * devices.
2130 - */
2131 -static void ack_edge_ioapic_irq(unsigned int irq)
2132 -{
2133 - move_irq(irq);
2134 - if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
2135 - == (IRQ_PENDING | IRQ_DISABLED))
2136 - mask_IO_APIC_irq(irq);
2137 - ack_APIC_irq();
2138 -}
2139 -
2140 -/*
2141 - * Level triggered interrupts can just be masked,
2142 - * and shutting down and starting up the interrupt
2143 - * is the same as enabling and disabling them -- except
2144 - * with a startup need to return a "was pending" value.
2145 - *
2146 - * Level triggered interrupts are special because we
2147 - * do not touch any IO-APIC register while handling
2148 - * them. We ack the APIC in the end-IRQ handler, not
2149 - * in the start-IRQ-handler. Protection against reentrance
2150 - * from the same interrupt is still provided, both by the
2151 - * generic IRQ layer and by the fact that an unacked local
2152 - * APIC does not accept IRQs.
2153 - */
2154 -static unsigned int startup_level_ioapic_irq (unsigned int irq)
2155 +static void ack_ioapic_irq(unsigned int irq)
2156 {
2157 - unmask_IO_APIC_irq(irq);
2158 -
2159 - return 0; /* don't check for pending */
2160 + move_native_irq(irq);
2161 + ack_APIC_irq();
2162 }
2163
2164 -static void end_level_ioapic_irq (unsigned int irq)
2165 +static void ack_ioapic_quirk_irq(unsigned int irq)
2166 {
2167 unsigned long v;
2168 int i;
2169
2170 - move_irq(irq);
2171 + move_native_irq(irq);
2172 /*
2173 * It appears there is an erratum which affects at least version 0x11
2174 * of I/O APIC (that's the 82093AA and cores integrated into various
2175 @@ -2044,7 +2077,7 @@
2176 * operation to prevent an edge-triggered interrupt escaping meanwhile.
2177 * The idea is from Manfred Spraul. --macro
2178 */
2179 - i = IO_APIC_VECTOR(irq);
2180 + i = irq_vector[irq];
2181
2182 v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
2183
2184 @@ -2059,104 +2092,24 @@
2185 }
2186 }
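(Illustrative aside, not part of the diff: the erratum handling between the APIC_TMR read above and the closing braces is elided by the hunk boundary. The usual workaround, when the TMR bit shows the local APIC latched the level IRQ as an edge, is to briefly force the pin to masked/edge and back to unmasked/level under ioapic_lock. The sketch below shows that shape; the __mask_and_edge_IO_APIC_irq()/__unmask_and_level_IO_APIC_irq() helpers are assumptions, not quoted from this patch.)

    /* Sketch only: EOI path for the 82093AA level-trigger erratum. */
    static void example_eoi_quirk(unsigned int irq)
    {
    	int i = irq_vector[irq];
    	unsigned long v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));

    	ack_APIC_irq();
    	if (!(v & (1UL << (i & 0x1f)))) {	/* latched as edge: erratum hit */
    		spin_lock(&ioapic_lock);
    		__mask_and_edge_IO_APIC_irq(irq);	/* assumed helper */
    		__unmask_and_level_IO_APIC_irq(irq);	/* assumed helper */
    		spin_unlock(&ioapic_lock);
    	}
    }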
2187
2188 -#ifdef CONFIG_PCI_MSI
2189 -static unsigned int startup_edge_ioapic_vector(unsigned int vector)
2190 -{
2191 - int irq = vector_to_irq(vector);
2192 -
2193 - return startup_edge_ioapic_irq(irq);
2194 -}
2195 -
2196 -static void ack_edge_ioapic_vector(unsigned int vector)
2197 -{
2198 - int irq = vector_to_irq(vector);
2199 -
2200 - move_native_irq(vector);
2201 - ack_edge_ioapic_irq(irq);
2202 -}
2203 -
2204 -static unsigned int startup_level_ioapic_vector (unsigned int vector)
2205 -{
2206 - int irq = vector_to_irq(vector);
2207 -
2208 - return startup_level_ioapic_irq (irq);
2209 -}
2210 -
2211 -static void end_level_ioapic_vector (unsigned int vector)
2212 -{
2213 - int irq = vector_to_irq(vector);
2214 -
2215 - move_native_irq(vector);
2216 - end_level_ioapic_irq(irq);
2217 -}
2218 -
2219 -static void mask_IO_APIC_vector (unsigned int vector)
2220 -{
2221 - int irq = vector_to_irq(vector);
2222 -
2223 - mask_IO_APIC_irq(irq);
2224 -}
2225 -
2226 -static void unmask_IO_APIC_vector (unsigned int vector)
2227 -{
2228 - int irq = vector_to_irq(vector);
2229 -
2230 - unmask_IO_APIC_irq(irq);
2231 -}
2232 -
2233 -#ifdef CONFIG_SMP
2234 -static void set_ioapic_affinity_vector (unsigned int vector,
2235 - cpumask_t cpu_mask)
2236 -{
2237 - int irq = vector_to_irq(vector);
2238 -
2239 - set_native_irq_info(vector, cpu_mask);
2240 - set_ioapic_affinity_irq(irq, cpu_mask);
2241 -}
2242 -#endif
2243 -#endif
2244 -
2245 -static int ioapic_retrigger(unsigned int irq)
2246 +static int ioapic_retrigger_irq(unsigned int irq)
2247 {
2248 - send_IPI_self(IO_APIC_VECTOR(irq));
2249 + send_IPI_self(irq_vector[irq]);
2250
2251 return 1;
2252 }
2253
2254 -/*
2255 - * Level and edge triggered IO-APIC interrupts need different handling,
2256 - * so we use two separate IRQ descriptors. Edge triggered IRQs can be
2257 - * handled with the level-triggered descriptor, but that one has slightly
2258 - * more overhead. Level-triggered interrupts cannot be handled with the
2259 - * edge-triggered handler, without risking IRQ storms and other ugly
2260 - * races.
2261 - */
2262 -static struct hw_interrupt_type ioapic_edge_type __read_mostly = {
2263 - .typename = "IO-APIC-edge",
2264 - .startup = startup_edge_ioapic,
2265 - .shutdown = shutdown_edge_ioapic,
2266 - .enable = enable_edge_ioapic,
2267 - .disable = disable_edge_ioapic,
2268 - .ack = ack_edge_ioapic,
2269 - .end = end_edge_ioapic,
2270 -#ifdef CONFIG_SMP
2271 - .set_affinity = set_ioapic_affinity,
2272 -#endif
2273 - .retrigger = ioapic_retrigger,
2274 -};
2275 -
2276 -static struct hw_interrupt_type ioapic_level_type __read_mostly = {
2277 - .typename = "IO-APIC-level",
2278 - .startup = startup_level_ioapic,
2279 - .shutdown = shutdown_level_ioapic,
2280 - .enable = enable_level_ioapic,
2281 - .disable = disable_level_ioapic,
2282 - .ack = mask_and_ack_level_ioapic,
2283 - .end = end_level_ioapic,
2284 +static struct irq_chip ioapic_chip __read_mostly = {
2285 + .name = "IO-APIC",
2286 + .startup = startup_ioapic_irq,
2287 + .mask = mask_IO_APIC_irq,
2288 + .unmask = unmask_IO_APIC_irq,
2289 + .ack = ack_ioapic_irq,
2290 + .eoi = ack_ioapic_quirk_irq,
2291 #ifdef CONFIG_SMP
2292 - .set_affinity = set_ioapic_affinity,
2293 + .set_affinity = set_ioapic_affinity_irq,
2294 #endif
2295 - .retrigger = ioapic_retrigger,
2296 + .retrigger = ioapic_retrigger_irq,
2297 };
2298 #endif /* !CONFIG_XEN */
2299
2300 @@ -2177,12 +2130,7 @@
2301 */
2302 for (irq = 0; irq < NR_IRQS ; irq++) {
2303 int tmp = irq;
2304 - if (use_pci_vector()) {
2305 - if (!platform_legacy_irq(tmp))
2306 - if ((tmp = vector_to_irq(tmp)) == -1)
2307 - continue;
2308 - }
2309 - if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) {
2310 + if (IO_APIC_IRQ(tmp) && !irq_vector[tmp]) {
2311 /*
2312 * Hmm.. We don't have an entry for this,
2313 * so default to an old-fashioned 8259
2314 @@ -2193,22 +2141,23 @@
2315 #ifndef CONFIG_XEN
2316 else
2317 /* Strange. Oh, well.. */
2318 - irq_desc[irq].chip = &no_irq_type;
2319 + irq_desc[irq].chip = &no_irq_chip;
2320 #endif
2321 }
2322 }
2323 }
2324
2325 #ifndef CONFIG_XEN
2326 -static void enable_lapic_irq (unsigned int irq)
2327 -{
2328 - unsigned long v;
2329 +/*
2330 + * The local APIC irq-chip implementation:
2331 + */
2332
2333 - v = apic_read(APIC_LVT0);
2334 - apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
2335 +static void ack_apic(unsigned int irq)
2336 +{
2337 + ack_APIC_irq();
2338 }
2339
2340 -static void disable_lapic_irq (unsigned int irq)
2341 +static void mask_lapic_irq (unsigned int irq)
2342 {
2343 unsigned long v;
2344
2345 @@ -2216,21 +2165,19 @@
2346 apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
2347 }
2348
2349 -static void ack_lapic_irq (unsigned int irq)
2350 +static void unmask_lapic_irq (unsigned int irq)
2351 {
2352 - ack_APIC_irq();
2353 -}
2354 + unsigned long v;
2355
2356 -static void end_lapic_irq (unsigned int i) { /* nothing */ }
2357 + v = apic_read(APIC_LVT0);
2358 + apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
2359 +}
2360
2361 -static struct hw_interrupt_type lapic_irq_type __read_mostly = {
2362 - .typename = "local-APIC-edge",
2363 - .startup = NULL, /* startup_irq() not used for IRQ0 */
2364 - .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
2365 - .enable = enable_lapic_irq,
2366 - .disable = disable_lapic_irq,
2367 - .ack = ack_lapic_irq,
2368 - .end = end_lapic_irq
2369 +static struct irq_chip lapic_chip __read_mostly = {
2370 + .name = "local-APIC-edge",
2371 + .mask = mask_lapic_irq,
2372 + .unmask = unmask_lapic_irq,
2373 + .eoi = ack_apic,
2374 };
2375
2376 static void setup_nmi (void)
2377 @@ -2263,17 +2210,13 @@
2378 int apic, pin, i;
2379 struct IO_APIC_route_entry entry0, entry1;
2380 unsigned char save_control, save_freq_select;
2381 - unsigned long flags;
2382
2383 pin = find_isa_irq_pin(8, mp_INT);
2384 apic = find_isa_irq_apic(8, mp_INT);
2385 if (pin == -1)
2386 return;
2387
2388 - spin_lock_irqsave(&ioapic_lock, flags);
2389 - *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
2390 - *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
2391 - spin_unlock_irqrestore(&ioapic_lock, flags);
2392 + entry0 = ioapic_read_entry(apic, pin);
2393 clear_IO_APIC_pin(apic, pin);
2394
2395 memset(&entry1, 0, sizeof(entry1));
2396 @@ -2286,10 +2229,7 @@
2397 entry1.trigger = 0;
2398 entry1.vector = 0;
2399
2400 - spin_lock_irqsave(&ioapic_lock, flags);
2401 - io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
2402 - io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
2403 - spin_unlock_irqrestore(&ioapic_lock, flags);
2404 + ioapic_write_entry(apic, pin, entry1);
2405
2406 save_control = CMOS_READ(RTC_CONTROL);
2407 save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
2408 @@ -2308,10 +2248,7 @@
2409 CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
2410 clear_IO_APIC_pin(apic, pin);
2411
2412 - spin_lock_irqsave(&ioapic_lock, flags);
2413 - io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
2414 - io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
2415 - spin_unlock_irqrestore(&ioapic_lock, flags);
2416 + ioapic_write_entry(apic, pin, entry0);
2417 }
2418
2419 int timer_uses_ioapic_pin_0;
2420 @@ -2411,7 +2348,8 @@
2421 printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
2422
2423 disable_8259A_irq(0);
2424 - irq_desc[0].chip = &lapic_irq_type;
2425 + set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq,
2426 + 				      "fasteoi");
2427 apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
2428 enable_8259A_irq(0);
2429
2430 @@ -2523,17 +2461,12 @@
2431 {
2432 struct IO_APIC_route_entry *entry;
2433 struct sysfs_ioapic_data *data;
2434 - unsigned long flags;
2435 int i;
2436
2437 data = container_of(dev, struct sysfs_ioapic_data, dev);
2438 entry = data->entry;
2439 - spin_lock_irqsave(&ioapic_lock, flags);
2440 - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
2441 - *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i);
2442 - *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i);
2443 - }
2444 - spin_unlock_irqrestore(&ioapic_lock, flags);
2445 + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
2446 + entry[i] = ioapic_read_entry(dev->id, i);
2447
2448 return 0;
2449 }
2450 @@ -2555,11 +2488,9 @@
2451 reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
2452 io_apic_write(dev->id, 0, reg_00.raw);
2453 }
2454 - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
2455 - io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1));
2456 - io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0));
2457 - }
2458 spin_unlock_irqrestore(&ioapic_lock, flags);
2459 + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
2460 + ioapic_write_entry(dev->id, i, entry[i]);
2461
2462 return 0;
2463 }
2464 @@ -2605,6 +2536,240 @@
2465
2466 device_initcall(ioapic_init_sysfs);
2467
2468 +#ifndef CONFIG_XEN
2469 +/*
2470 + * Dynamic irq allocation and deallocation
2471 + */
2472 +int create_irq(void)
2473 +{
2474 + /* Allocate an unused irq */
2475 + int irq, new, vector;
2476 + unsigned long flags;
2477 +
2478 + irq = -ENOSPC;
2479 + spin_lock_irqsave(&vector_lock, flags);
2480 + for (new = (NR_IRQS - 1); new >= 0; new--) {
2481 + if (platform_legacy_irq(new))
2482 + continue;
2483 + if (irq_vector[new] != 0)
2484 + continue;
2485 + vector = __assign_irq_vector(new);
2486 + if (likely(vector > 0))
2487 + irq = new;
2488 + break;
2489 + }
2490 + spin_unlock_irqrestore(&vector_lock, flags);
2491 +
2492 + if (irq >= 0) {
2493 + set_intr_gate(vector, interrupt[irq]);
2494 + dynamic_irq_init(irq);
2495 + }
2496 + return irq;
2497 +}
2498 +
2499 +void destroy_irq(unsigned int irq)
2500 +{
2501 + unsigned long flags;
2502 +
2503 + dynamic_irq_cleanup(irq);
2504 +
2505 + spin_lock_irqsave(&vector_lock, flags);
2506 + irq_vector[irq] = 0;
2507 + spin_unlock_irqrestore(&vector_lock, flags);
2508 +}
2509 +#endif
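(Illustrative aside, not part of the diff: create_irq()/destroy_irq() give MSI-style users a dynamically allocated IRQ plus vector. A hypothetical driver-side consumer, using the 2.6.19 request_irq() handler signature, would look roughly as follows; all example_* names are made up for illustration.)

    static irqreturn_t example_handler(int irq, void *dev_id)
    {
    	return IRQ_HANDLED;
    }

    static int example_attach(void)
    {
    	int irq = create_irq();			/* reserve an IRQ and a vector */

    	if (irq < 0)
    		return irq;
    	if (request_irq(irq, example_handler, 0, "example", NULL)) {
    		destroy_irq(irq);		/* hand the vector back        */
    		return -EBUSY;
    	}
    	return irq;
    }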
2510 +
2511 +/*
2512 + * MSI message composition
2513 + */
2514 +#ifdef CONFIG_PCI_MSI
2515 +static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
2516 +{
2517 + int vector;
2518 + unsigned dest;
2519 +
2520 + vector = assign_irq_vector(irq);
2521 + if (vector >= 0) {
2522 + dest = cpu_mask_to_apicid(TARGET_CPUS);
2523 +
2524 + msg->address_hi = MSI_ADDR_BASE_HI;
2525 + msg->address_lo =
2526 + MSI_ADDR_BASE_LO |
2527 + ((INT_DEST_MODE == 0) ?
2528 + MSI_ADDR_DEST_MODE_PHYSICAL:
2529 + MSI_ADDR_DEST_MODE_LOGICAL) |
2530 + ((INT_DELIVERY_MODE != dest_LowestPrio) ?
2531 + MSI_ADDR_REDIRECTION_CPU:
2532 + MSI_ADDR_REDIRECTION_LOWPRI) |
2533 + MSI_ADDR_DEST_ID(dest);
2534 +
2535 + msg->data =
2536 + MSI_DATA_TRIGGER_EDGE |
2537 + MSI_DATA_LEVEL_ASSERT |
2538 + ((INT_DELIVERY_MODE != dest_LowestPrio) ?
2539 + MSI_DATA_DELIVERY_FIXED:
2540 + MSI_DATA_DELIVERY_LOWPRI) |
2541 + MSI_DATA_VECTOR(vector);
2542 + }
2543 + return vector;
2544 +}
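(Illustrative aside, not part of the diff: as a worked example of the composition above, assuming the standard x86 MSI layout with physical destination mode, fixed delivery, destination APIC ID 1 and vector 0x31, msi_compose_msg() would produce roughly:

    msg.address_hi = 0x00000000;   /* MSI_ADDR_BASE_HI                     */
    msg.address_lo = 0xfee01000;   /* 0xfee00000 | dest ID 1 in bits 19:12 */
    msg.data       = 0x00004031;   /* level-assert bit | vector 0x31       */

i.e. the destination APIC ID lands in address bits 19:12 and the vector in data bits 7:0.)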
2545 +
2546 +#ifdef CONFIG_SMP
2547 +static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
2548 +{
2549 + struct msi_msg msg;
2550 + unsigned int dest;
2551 + cpumask_t tmp;
2552 + int vector;
2553 +
2554 + cpus_and(tmp, mask, cpu_online_map);
2555 + if (cpus_empty(tmp))
2556 + tmp = TARGET_CPUS;
2557 +
2558 + vector = assign_irq_vector(irq);
2559 + if (vector < 0)
2560 + return;
2561 +
2562 + dest = cpu_mask_to_apicid(mask);
2563 +
2564 + read_msi_msg(irq, &msg);
2565 +
2566 + msg.data &= ~MSI_DATA_VECTOR_MASK;
2567 + msg.data |= MSI_DATA_VECTOR(vector);
2568 + msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
2569 + msg.address_lo |= MSI_ADDR_DEST_ID(dest);
2570 +
2571 + write_msi_msg(irq, &msg);
2572 + set_native_irq_info(irq, mask);
2573 +}
2574 +#endif /* CONFIG_SMP */
2575 +
2576 +/*
2577 + * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
2578 + * which implement the MSI or MSI-X Capability Structure.
2579 + */
2580 +static struct irq_chip msi_chip = {
2581 + .name = "PCI-MSI",
2582 + .unmask = unmask_msi_irq,
2583 + .mask = mask_msi_irq,
2584 + .ack = ack_ioapic_irq,
2585 +#ifdef CONFIG_SMP
2586 + .set_affinity = set_msi_irq_affinity,
2587 +#endif
2588 + .retrigger = ioapic_retrigger_irq,
2589 +};
2590 +
2591 +int arch_setup_msi_irq(unsigned int irq, struct pci_dev *dev)
2592 +{
2593 + struct msi_msg msg;
2594 + int ret;
2595 + ret = msi_compose_msg(dev, irq, &msg);
2596 + if (ret < 0)
2597 + return ret;
2598 +
2599 + write_msi_msg(irq, &msg);
2600 +
2601 + set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq,
2602 + "edge");
2603 +
2604 + return 0;
2605 +}
2606 +
2607 +void arch_teardown_msi_irq(unsigned int irq)
2608 +{
2609 + return;
2610 +}
2611 +
2612 +#endif /* CONFIG_PCI_MSI */
2613 +
2614 +/*
2615 + * Hypertransport interrupt support
2616 + */
2617 +#ifdef CONFIG_HT_IRQ
2618 +
2619 +#ifdef CONFIG_SMP
2620 +
2621 +static void target_ht_irq(unsigned int irq, unsigned int dest)
2622 +{
2623 + struct ht_irq_msg msg;
2624 + fetch_ht_irq_msg(irq, &msg);
2625 +
2626 + msg.address_lo &= ~(HT_IRQ_LOW_DEST_ID_MASK);
2627 + msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
2628 +
2629 + msg.address_lo |= HT_IRQ_LOW_DEST_ID(dest);
2630 + msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
2631 +
2632 + write_ht_irq_msg(irq, &msg);
2633 +}
2634 +
2635 +static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
2636 +{
2637 + unsigned int dest;
2638 + cpumask_t tmp;
2639 +
2640 + cpus_and(tmp, mask, cpu_online_map);
2641 + if (cpus_empty(tmp))
2642 + tmp = TARGET_CPUS;
2643 +
2644 + cpus_and(mask, tmp, CPU_MASK_ALL);
2645 +
2646 + dest = cpu_mask_to_apicid(mask);
2647 +
2648 + target_ht_irq(irq, dest);
2649 + set_native_irq_info(irq, mask);
2650 +}
2651 +#endif
2652 +
2653 +static struct irq_chip ht_irq_chip = {
2654 + .name = "PCI-HT",
2655 + .mask = mask_ht_irq,
2656 + .unmask = unmask_ht_irq,
2657 + .ack = ack_ioapic_irq,
2658 +#ifdef CONFIG_SMP
2659 + .set_affinity = set_ht_irq_affinity,
2660 +#endif
2661 + .retrigger = ioapic_retrigger_irq,
2662 +};
2663 +
2664 +int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
2665 +{
2666 + int vector;
2667 +
2668 + vector = assign_irq_vector(irq);
2669 + if (vector >= 0) {
2670 + struct ht_irq_msg msg;
2671 + unsigned dest;
2672 + cpumask_t tmp;
2673 +
2674 + cpus_clear(tmp);
2675 + cpu_set(vector >> 8, tmp);
2676 + dest = cpu_mask_to_apicid(tmp);
2677 +
2678 + msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
2679 +
2680 + msg.address_lo =
2681 + HT_IRQ_LOW_BASE |
2682 + HT_IRQ_LOW_DEST_ID(dest) |
2683 + HT_IRQ_LOW_VECTOR(vector) |
2684 + ((INT_DEST_MODE == 0) ?
2685 + HT_IRQ_LOW_DM_PHYSICAL :
2686 + HT_IRQ_LOW_DM_LOGICAL) |
2687 + HT_IRQ_LOW_RQEOI_EDGE |
2688 + ((INT_DELIVERY_MODE != dest_LowestPrio) ?
2689 + HT_IRQ_LOW_MT_FIXED :
2690 + HT_IRQ_LOW_MT_ARBITRATED) |
2691 + HT_IRQ_LOW_IRQ_MASKED;
2692 +
2693 + write_ht_irq_msg(irq, &msg);
2694 +
2695 + set_irq_chip_and_handler_name(irq, &ht_irq_chip,
2696 + handle_edge_irq, "edge");
2697 + }
2698 + return vector;
2699 +}
2700 +#endif /* CONFIG_HT_IRQ */
2701 +
2702 /* --------------------------------------------------------------------------
2703 ACPI-based IOAPIC Configuration
2704 -------------------------------------------------------------------------- */
2705 @@ -2758,13 +2923,34 @@
2706 if (!ioapic && (irq < 16))
2707 disable_8259A_irq(irq);
2708
2709 + ioapic_write_entry(ioapic, pin, entry);
2710 spin_lock_irqsave(&ioapic_lock, flags);
2711 - io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
2712 - io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
2713 - set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS);
2714 + set_native_irq_info(irq, TARGET_CPUS);
2715 spin_unlock_irqrestore(&ioapic_lock, flags);
2716
2717 return 0;
2718 }
2719
2720 #endif /* CONFIG_ACPI */
2721 +
2722 +static int __init parse_disable_timer_pin_1(char *arg)
2723 +{
2724 + disable_timer_pin_1 = 1;
2725 + return 0;
2726 +}
2727 +early_param("disable_timer_pin_1", parse_disable_timer_pin_1);
2728 +
2729 +static int __init parse_enable_timer_pin_1(char *arg)
2730 +{
2731 + disable_timer_pin_1 = -1;
2732 + return 0;
2733 +}
2734 +early_param("enable_timer_pin_1", parse_enable_timer_pin_1);
2735 +
2736 +static int __init parse_noapic(char *arg)
2737 +{
2738 + /* disable IO-APIC */
2739 + disable_ioapic_setup();
2740 + return 0;
2741 +}
2742 +early_param("noapic", parse_noapic);
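(Illustrative aside, not part of the diff: these early_param() hooks are consumed from the kernel command line before regular __setup() parsing. For example, booting with

    noapic

skips IO-APIC setup entirely, while

    disable_timer_pin_1

keeps the IO-APIC but tells the timer setup code not to use pin 1 of the first IO-APIC; enable_timer_pin_1 forces the opposite on boards where the automatic quirk gets it wrong.)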
2743 --- a/arch/x86/kernel/io_apic_64-xen.c
2744 +++ b/arch/x86/kernel/io_apic_64-xen.c
2745 @@ -26,9 +26,12 @@
2746 #include <linux/delay.h>
2747 #include <linux/sched.h>
2748 #include <linux/smp_lock.h>
2749 +#include <linux/pci.h>
2750 #include <linux/mc146818rtc.h>
2751 #include <linux/acpi.h>
2752 #include <linux/sysdev.h>
2753 +#include <linux/msi.h>
2754 +#include <linux/htirq.h>
2755 #ifdef CONFIG_ACPI
2756 #include <acpi/acpi_bus.h>
2757 #endif
2758 @@ -41,6 +44,10 @@
2759 #include <asm/acpi.h>
2760 #include <asm/dma.h>
2761 #include <asm/nmi.h>
2762 +#include <asm/msidef.h>
2763 +#include <asm/hypertransport.h>
2764 +
2765 +static int assign_irq_vector(int irq, cpumask_t mask, cpumask_t *result);
2766
2767 #define __apicdebuginit __init
2768
2769 @@ -48,17 +55,29 @@
2770
2771 static int no_timer_check;
2772
2773 -int disable_timer_pin_1 __initdata;
2774 +static int disable_timer_pin_1 __initdata;
2775
2776 -#ifndef CONFIG_XEN
2777 -int timer_over_8254 __initdata = 0;
2778 +#ifdef CONFIG_XEN
2779 +#include <xen/interface/xen.h>
2780 +#include <xen/interface/physdev.h>
2781 +
2782 +/* Fake i8259 */
2783 +#define make_8259A_irq(_irq) (io_apic_irqs &= ~(1UL<<(_irq)))
2784 +#define disable_8259A_irq(_irq) ((void)0)
2785 +#define i8259A_irq_pending(_irq) (0)
2786 +
2787 +unsigned long io_apic_irqs;
2788 +
2789 +#define clear_IO_APIC() ((void)0)
2790 +#else
2791 +int timer_over_8254 __initdata = 1;
2792
2793 /* Where if anywhere is the i8259 connect in external int mode */
2794 static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
2795 #endif
2796
2797 static DEFINE_SPINLOCK(ioapic_lock);
2798 -static DEFINE_SPINLOCK(vector_lock);
2799 +DEFINE_SPINLOCK(vector_lock);
2800
2801 /*
2802 * # of IRQ routing registers
2803 @@ -83,28 +102,27 @@
2804 short apic, pin, next;
2805 } irq_2_pin[PIN_MAP_SIZE];
2806
2807 -int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1};
2808 -#ifdef CONFIG_PCI_MSI
2809 -#define vector_to_irq(vector) \
2810 - (platform_legacy_irq(vector) ? vector : vector_irq[vector])
2811 -#else
2812 -#define vector_to_irq(vector) (vector)
2813 -#endif
2814 -
2815 -#ifdef CONFIG_XEN
2816 -
2817 -#include <xen/interface/xen.h>
2818 -#include <xen/interface/physdev.h>
2819 -
2820 -/* Fake i8259 */
2821 -#define make_8259A_irq(_irq) (io_apic_irqs &= ~(1UL<<(_irq)))
2822 -#define disable_8259A_irq(_irq) ((void)0)
2823 -#define i8259A_irq_pending(_irq) (0)
2824 +#ifndef CONFIG_XEN
2825 +struct io_apic {
2826 + unsigned int index;
2827 + unsigned int unused[3];
2828 + unsigned int data;
2829 +};
2830
2831 -unsigned long io_apic_irqs;
2832 +static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
2833 +{
2834 + return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
2835 + + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
2836 +}
2837 +#endif
2838
2839 -static inline unsigned int xen_io_apic_read(unsigned int apic, unsigned int reg)
2840 +static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
2841 {
2842 +#ifndef CONFIG_XEN
2843 + struct io_apic __iomem *io_apic = io_apic_base(apic);
2844 + writel(reg, &io_apic->index);
2845 + return readl(&io_apic->data);
2846 +#else
2847 struct physdev_apic apic_op;
2848 int ret;
2849
2850 @@ -114,31 +132,131 @@
2851 if (ret)
2852 return ret;
2853 return apic_op.value;
2854 +#endif
2855 }
2856
2857 -static inline void xen_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
2858 +static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
2859 {
2860 +#ifndef CONFIG_XEN
2861 + struct io_apic __iomem *io_apic = io_apic_base(apic);
2862 + writel(reg, &io_apic->index);
2863 + writel(value, &io_apic->data);
2864 +#else
2865 struct physdev_apic apic_op;
2866
2867 apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
2868 apic_op.reg = reg;
2869 apic_op.value = value;
2870 WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
2871 +#endif
2872 }
2873
2874 -#define io_apic_read(a,r) xen_io_apic_read(a,r)
2875 -#define io_apic_write(a,r,v) xen_io_apic_write(a,r,v)
2876 +#ifndef CONFIG_XEN
2877 +/*
2878 + * Re-write a value: to be used for read-modify-write
2879 + * cycles where the read already set up the index register.
2880 + */
2881 +static inline void io_apic_modify(unsigned int apic, unsigned int value)
2882 +{
2883 + struct io_apic __iomem *io_apic = io_apic_base(apic);
2884 + writel(value, &io_apic->data);
2885 +}
2886 +#else
2887 +#define io_apic_modify io_apic_write
2888 +#endif
2889
2890 -#define clear_IO_APIC() ((void)0)
2891 +/*
2892 + * Synchronize the IO-APIC and the CPU by doing
2893 + * a dummy read from the IO-APIC
2894 + */
2895 +static inline void io_apic_sync(unsigned int apic)
2896 +{
2897 +#ifndef CONFIG_XEN
2898 + struct io_apic __iomem *io_apic = io_apic_base(apic);
2899 + readl(&io_apic->data);
2900 +#endif
2901 +}
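(Illustrative aside, not part of the diff: on the native (!CONFIG_XEN) path, io_apic_modify() reuses the index latched by a preceding io_apic_read(), and io_apic_sync() flushes the write with a dummy data read. A minimal sketch of that read-modify-write cycle, assuming the mask is bit 16 of the low redirection word and example_mask_pin() is hypothetical:)

    static void example_mask_pin(unsigned int apic, unsigned int pin)
    {
    	unsigned int reg;

    	reg = io_apic_read(apic, 0x10 + 2 * pin);	/* read latches the index       */
    	io_apic_modify(apic, reg | 0x00010000);		/* set the mask bit (bit 16)    */
    	io_apic_sync(apic);				/* dummy read flushes the write */
    }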
2902
2903 -#else
2904 +union entry_union {
2905 + struct { u32 w1, w2; };
2906 + struct IO_APIC_route_entry entry;
2907 +};
2908 +
2909 +static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
2910 +{
2911 + union entry_union eu;
2912 + unsigned long flags;
2913 + spin_lock_irqsave(&ioapic_lock, flags);
2914 + eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
2915 + eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
2916 + spin_unlock_irqrestore(&ioapic_lock, flags);
2917 + return eu.entry;
2918 +}
2919 +
2920 +/*
2921 + * When we write a new IO APIC routing entry, we need to write the high
2922 + * word first! If the mask bit in the low word is clear, we will enable
2923 + * the interrupt, and we need to make sure the entry is fully populated
2924 + * before that happens.
2925 + */
2926 +static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
2927 +{
2928 + unsigned long flags;
2929 + union entry_union eu;
2930 + eu.entry = e;
2931 + spin_lock_irqsave(&ioapic_lock, flags);
2932 + io_apic_write(apic, 0x11 + 2*pin, eu.w2);
2933 + io_apic_write(apic, 0x10 + 2*pin, eu.w1);
2934 + spin_unlock_irqrestore(&ioapic_lock, flags);
2935 +}
2936 +
2937 +#ifndef CONFIG_XEN
2938 +/*
2939 + * When we mask an IO APIC routing entry, we need to write the low
2940 + * word first, in order to set the mask bit before we change the
2941 + * high bits!
2942 + */
2943 +static void ioapic_mask_entry(int apic, int pin)
2944 +{
2945 + unsigned long flags;
2946 + union entry_union eu = { .entry.mask = 1 };
2947 +
2948 + spin_lock_irqsave(&ioapic_lock, flags);
2949 + io_apic_write(apic, 0x10 + 2*pin, eu.w1);
2950 + io_apic_write(apic, 0x11 + 2*pin, eu.w2);
2951 + spin_unlock_irqrestore(&ioapic_lock, flags);
2952 +}
2953
2954 #ifdef CONFIG_SMP
2955 +static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
2956 +{
2957 + int apic, pin;
2958 + struct irq_pin_list *entry = irq_2_pin + irq;
2959 +
2960 + BUG_ON(irq >= NR_IRQS);
2961 + for (;;) {
2962 + unsigned int reg;
2963 + apic = entry->apic;
2964 + pin = entry->pin;
2965 + if (pin == -1)
2966 + break;
2967 + io_apic_write(apic, 0x11 + pin*2, dest);
2968 + reg = io_apic_read(apic, 0x10 + pin*2);
2969 + reg &= ~0x000000ff;
2970 + reg |= vector;
2971 + io_apic_modify(apic, reg);
2972 + if (!entry->next)
2973 + break;
2974 + entry = irq_2_pin + entry->next;
2975 + }
2976 +}
2977 +
2978 static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
2979 {
2980 unsigned long flags;
2981 unsigned int dest;
2982 cpumask_t tmp;
2983 + int vector;
2984
2985 cpus_and(tmp, mask, cpu_online_map);
2986 if (cpus_empty(tmp))
2987 @@ -146,7 +264,11 @@
2988
2989 cpus_and(mask, tmp, CPU_MASK_ALL);
2990
2991 - dest = cpu_mask_to_apicid(mask);
2992 + vector = assign_irq_vector(irq, mask, &tmp);
2993 + if (vector < 0)
2994 + return;
2995 +
2996 + dest = cpu_mask_to_apicid(tmp);
2997
2998 /*
2999 * Only the high 8 bits are valid.
3000 @@ -154,13 +276,12 @@
3001 dest = SET_APIC_LOGICAL_ID(dest);
3002
3003 spin_lock_irqsave(&ioapic_lock, flags);
3004 - __DO_ACTION(1, = dest, )
3005 - set_irq_info(irq, mask);
3006 + __target_IO_APIC_irq(irq, dest, vector);
3007 + set_native_irq_info(irq, mask);
3008 spin_unlock_irqrestore(&ioapic_lock, flags);
3009 }
3010 #endif
3011 -
3012 -#endif /* !CONFIG_XEN */
3013 +#endif
3014
3015 /*
3016 * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
3017 @@ -240,24 +361,15 @@
3018 static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
3019 {
3020 struct IO_APIC_route_entry entry;
3021 - unsigned long flags;
3022
3023 /* Check delivery_mode to be sure we're not clearing an SMI pin */
3024 - spin_lock_irqsave(&ioapic_lock, flags);
3025 - *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
3026 - *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
3027 - spin_unlock_irqrestore(&ioapic_lock, flags);
3028 + entry = ioapic_read_entry(apic, pin);
3029 if (entry.delivery_mode == dest_SMI)
3030 return;
3031 /*
3032 * Disable it in the IO-APIC irq-routing table:
3033 */
3034 - memset(&entry, 0, sizeof(entry));
3035 - entry.mask = 1;
3036 - spin_lock_irqsave(&ioapic_lock, flags);
3037 - io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
3038 - io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
3039 - spin_unlock_irqrestore(&ioapic_lock, flags);
3040 + ioapic_mask_entry(apic, pin);
3041 }
3042
3043 static void clear_IO_APIC (void)
3044 @@ -271,16 +383,6 @@
3045
3046 #endif /* !CONFIG_XEN */
3047
3048 -static u8 gsi_2_irq[NR_IRQ_VECTORS] = { [0 ... NR_IRQ_VECTORS-1] = 0xFF };
3049 -
3050 -/*
3051 - * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
3052 - * specific CPU-side IRQs.
3053 - */
3054 -
3055 -#define MAX_PIRQS 8
3056 -static int pirq_entries [MAX_PIRQS];
3057 -static int pirqs_enabled;
3058 int skip_ioapic_setup;
3059 int ioapic_force;
3060
3061 @@ -289,18 +391,17 @@
3062 static int __init disable_ioapic_setup(char *str)
3063 {
3064 skip_ioapic_setup = 1;
3065 - return 1;
3066 + return 0;
3067 }
3068 +early_param("noapic", disable_ioapic_setup);
3069
3070 -static int __init enable_ioapic_setup(char *str)
3071 +/* Actually the next is obsolete, but keep it for paranoid reasons -AK */
3072 +static int __init disable_timer_pin_setup(char *arg)
3073 {
3074 - ioapic_force = 1;
3075 - skip_ioapic_setup = 0;
3076 + disable_timer_pin_1 = 1;
3077 return 1;
3078 }
3079 -
3080 -__setup("noapic", disable_ioapic_setup);
3081 -__setup("apic", enable_ioapic_setup);
3082 +__setup("disable_timer_pin_1", disable_timer_pin_setup);
3083
3084 #ifndef CONFIG_XEN
3085 static int __init setup_disable_8254_timer(char *s)
3086 @@ -318,137 +419,6 @@
3087 __setup("enable_8254_timer", setup_enable_8254_timer);
3088 #endif /* !CONFIG_XEN */
3089
3090 -#include <asm/pci-direct.h>
3091 -#include <linux/pci_ids.h>
3092 -#include <linux/pci.h>
3093 -
3094 -
3095 -#ifdef CONFIG_ACPI
3096 -
3097 -static int nvidia_hpet_detected __initdata;
3098 -
3099 -static int __init nvidia_hpet_check(unsigned long phys, unsigned long size)
3100 -{
3101 - nvidia_hpet_detected = 1;
3102 - return 0;
3103 -}
3104 -#endif
3105 -
3106 -/* Temporary Hack. Nvidia and VIA boards currently only work with IO-APIC
3107 - off. Check for an Nvidia or VIA PCI bridge and turn it off.
3108 - Use pci direct infrastructure because this runs before the PCI subsystem.
3109 -
3110 - Can be overwritten with "apic"
3111 -
3112 - And another hack to disable the IOMMU on VIA chipsets.
3113 -
3114 - ... and others. Really should move this somewhere else.
3115 -
3116 - Kludge-O-Rama. */
3117 -void __init check_ioapic(void)
3118 -{
3119 - int num,slot,func;
3120 - /* Poor man's PCI discovery */
3121 - for (num = 0; num < 32; num++) {
3122 - for (slot = 0; slot < 32; slot++) {
3123 - for (func = 0; func < 8; func++) {
3124 - u32 class;
3125 - u32 vendor;
3126 - u8 type;
3127 - class = read_pci_config(num,slot,func,
3128 - PCI_CLASS_REVISION);
3129 - if (class == 0xffffffff)
3130 - break;
3131 -
3132 - if ((class >> 16) != PCI_CLASS_BRIDGE_PCI)
3133 - continue;
3134 -
3135 - vendor = read_pci_config(num, slot, func,
3136 - PCI_VENDOR_ID);
3137 - vendor &= 0xffff;
3138 - switch (vendor) {
3139 - case PCI_VENDOR_ID_VIA:
3140 -#ifdef CONFIG_IOMMU
3141 - if ((end_pfn > MAX_DMA32_PFN ||
3142 - force_iommu) &&
3143 - !iommu_aperture_allowed) {
3144 - printk(KERN_INFO
3145 - "Looks like a VIA chipset. Disabling IOMMU. Override with \"iommu=allowed\"\n");
3146 - iommu_aperture_disabled = 1;
3147 - }
3148 -#endif
3149 - return;
3150 - case PCI_VENDOR_ID_NVIDIA:
3151 -#ifdef CONFIG_ACPI
3152 - /*
3153 - * All timer overrides on Nvidia are
3154 - * wrong unless HPET is enabled.
3155 - */
3156 - nvidia_hpet_detected = 0;
3157 - acpi_table_parse(ACPI_HPET,
3158 - nvidia_hpet_check);
3159 - if (nvidia_hpet_detected == 0) {
3160 - acpi_skip_timer_override = 1;
3161 - printk(KERN_INFO "Nvidia board "
3162 - "detected. Ignoring ACPI "
3163 - "timer override.\n");
3164 - }
3165 -#endif
3166 - /* RED-PEN skip them on mptables too? */
3167 - return;
3168 - case PCI_VENDOR_ID_ATI:
3169 -
3170 - /* This should be actually default, but
3171 - for 2.6.16 let's do it for ATI only where
3172 - it's really needed. */
3173 -#ifndef CONFIG_XEN
3174 - if (timer_over_8254 == 1) {
3175 - timer_over_8254 = 0;
3176 - printk(KERN_INFO
3177 - "ATI board detected. Disabling timer routing over 8254.\n");
3178 - }
3179 -#endif
3180 - return;
3181 - }
3182 -
3183 -
3184 - /* No multi-function device? */
3185 - type = read_pci_config_byte(num,slot,func,
3186 - PCI_HEADER_TYPE);
3187 - if (!(type & 0x80))
3188 - break;
3189 - }
3190 - }
3191 - }
3192 -}
3193 -
3194 -static int __init ioapic_pirq_setup(char *str)
3195 -{
3196 - int i, max;
3197 - int ints[MAX_PIRQS+1];
3198 -
3199 - get_options(str, ARRAY_SIZE(ints), ints);
3200 -
3201 - for (i = 0; i < MAX_PIRQS; i++)
3202 - pirq_entries[i] = -1;
3203 -
3204 - pirqs_enabled = 1;
3205 - apic_printk(APIC_VERBOSE, "PIRQ redirection, working around broken MP-BIOS.\n");
3206 - max = MAX_PIRQS;
3207 - if (ints[0] < MAX_PIRQS)
3208 - max = ints[0];
3209 -
3210 - for (i = 0; i < max; i++) {
3211 - apic_printk(APIC_VERBOSE, "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
3212 - /*
3213 - * PIRQs are mapped upside down, usually.
3214 - */
3215 - pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
3216 - }
3217 - return 1;
3218 -}
3219 -
3220 -__setup("pirq=", ioapic_pirq_setup);
3221
3222 /*
3223 * Find the IRQ entry number of a certain pin.
3224 @@ -478,9 +448,7 @@
3225 for (i = 0; i < mp_irq_entries; i++) {
3226 int lbus = mp_irqs[i].mpc_srcbus;
3227
3228 - if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
3229 - mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
3230 - mp_bus_id_to_type[lbus] == MP_BUS_MCA) &&
3231 + if (test_bit(lbus, mp_bus_not_pci) &&
3232 (mp_irqs[i].mpc_irqtype == type) &&
3233 (mp_irqs[i].mpc_srcbusirq == irq))
3234
3235 @@ -496,9 +464,7 @@
3236 for (i = 0; i < mp_irq_entries; i++) {
3237 int lbus = mp_irqs[i].mpc_srcbus;
3238
3239 - if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
3240 - mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
3241 - mp_bus_id_to_type[lbus] == MP_BUS_MCA) &&
3242 + if (test_bit(lbus, mp_bus_not_pci) &&
3243 (mp_irqs[i].mpc_irqtype == type) &&
3244 (mp_irqs[i].mpc_srcbusirq == irq))
3245 break;
3246 @@ -539,7 +505,7 @@
3247 mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
3248 break;
3249
3250 - if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
3251 + if (!test_bit(lbus, mp_bus_not_pci) &&
3252 !mp_irqs[i].mpc_irqtype &&
3253 (bus == lbus) &&
3254 (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
3255 @@ -562,27 +528,6 @@
3256 return best_guess;
3257 }
3258
3259 -/*
3260 - * EISA Edge/Level control register, ELCR
3261 - */
3262 -static int EISA_ELCR(unsigned int irq)
3263 -{
3264 - if (irq < 16) {
3265 - unsigned int port = 0x4d0 + (irq >> 3);
3266 - return (inb(port) >> (irq & 7)) & 1;
3267 - }
3268 - apic_printk(APIC_VERBOSE, "Broken MPtable reports ISA irq %d\n", irq);
3269 - return 0;
3270 -}
3271 -
3272 -/* EISA interrupts are always polarity zero and can be edge or level
3273 - * trigger depending on the ELCR value. If an interrupt is listed as
3274 - * EISA conforming in the MP table, that means its trigger type must
3275 - * be read in from the ELCR */
3276 -
3277 -#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
3278 -#define default_EISA_polarity(idx) (0)
3279 -
3280 /* ISA interrupts are always polarity zero edge triggered,
3281 * when listed as conforming in the MP table. */
3282
3283 @@ -595,12 +540,6 @@
3284 #define default_PCI_trigger(idx) (1)
3285 #define default_PCI_polarity(idx) (1)
3286
3287 -/* MCA interrupts are always polarity zero level triggered,
3288 - * when listed as conforming in the MP table. */
3289 -
3290 -#define default_MCA_trigger(idx) (1)
3291 -#define default_MCA_polarity(idx) (0)
3292 -
3293 static int __init MPBIOS_polarity(int idx)
3294 {
3295 int bus = mp_irqs[idx].mpc_srcbus;
3296 @@ -612,38 +551,11 @@
3297 switch (mp_irqs[idx].mpc_irqflag & 3)
3298 {
3299 case 0: /* conforms, ie. bus-type dependent polarity */
3300 - {
3301 - switch (mp_bus_id_to_type[bus])
3302 - {
3303 - case MP_BUS_ISA: /* ISA pin */
3304 - {
3305 - polarity = default_ISA_polarity(idx);
3306 - break;
3307 - }
3308 - case MP_BUS_EISA: /* EISA pin */
3309 - {
3310 - polarity = default_EISA_polarity(idx);
3311 - break;
3312 - }
3313 - case MP_BUS_PCI: /* PCI pin */
3314 - {
3315 - polarity = default_PCI_polarity(idx);
3316 - break;
3317 - }
3318 - case MP_BUS_MCA: /* MCA pin */
3319 - {
3320 - polarity = default_MCA_polarity(idx);
3321 - break;
3322 - }
3323 - default:
3324 - {
3325 - printk(KERN_WARNING "broken BIOS!!\n");
3326 - polarity = 1;
3327 - break;
3328 - }
3329 - }
3330 + if (test_bit(bus, mp_bus_not_pci))
3331 + polarity = default_ISA_polarity(idx);
3332 + else
3333 + polarity = default_PCI_polarity(idx);
3334 break;
3335 - }
3336 case 1: /* high active */
3337 {
3338 polarity = 0;
3339 @@ -681,38 +593,11 @@
3340 switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
3341 {
3342 case 0: /* conforms, ie. bus-type dependent */
3343 - {
3344 - switch (mp_bus_id_to_type[bus])
3345 - {
3346 - case MP_BUS_ISA: /* ISA pin */
3347 - {
3348 - trigger = default_ISA_trigger(idx);
3349 - break;
3350 - }
3351 - case MP_BUS_EISA: /* EISA pin */
3352 - {
3353 - trigger = default_EISA_trigger(idx);
3354 - break;
3355 - }
3356 - case MP_BUS_PCI: /* PCI pin */
3357 - {
3358 - trigger = default_PCI_trigger(idx);
3359 - break;
3360 - }
3361 - case MP_BUS_MCA: /* MCA pin */
3362 - {
3363 - trigger = default_MCA_trigger(idx);
3364 - break;
3365 - }
3366 - default:
3367 - {
3368 - printk(KERN_WARNING "broken BIOS!!\n");
3369 - trigger = 1;
3370 - break;
3371 - }
3372 - }
3373 + if (test_bit(bus, mp_bus_not_pci))
3374 + trigger = default_ISA_trigger(idx);
3375 + else
3376 + trigger = default_PCI_trigger(idx);
3377 break;
3378 - }
3379 case 1: /* edge */
3380 {
3381 trigger = 0;
3382 @@ -749,64 +634,6 @@
3383 return MPBIOS_trigger(idx);
3384 }
3385
3386 -static int next_irq = 16;
3387 -
3388 -/*
3389 - * gsi_irq_sharing -- Name overload! "irq" can be either a legacy IRQ
3390 - * in the range 0-15, a linux IRQ in the range 0-223, or a GSI number
3391 - * from ACPI, which can reach 800 in large boxen.
3392 - *
3393 - * Compact the sparse GSI space into a sequential IRQ series and reuse
3394 - * vectors if possible.
3395 - */
3396 -int gsi_irq_sharing(int gsi)
3397 -{
3398 - int i, tries, vector;
3399 -
3400 - BUG_ON(gsi >= NR_IRQ_VECTORS);
3401 -
3402 - if (platform_legacy_irq(gsi))
3403 - return gsi;
3404 -
3405 - if (gsi_2_irq[gsi] != 0xFF)
3406 - return (int)gsi_2_irq[gsi];
3407 -
3408 - tries = NR_IRQS;
3409 - try_again:
3410 - vector = assign_irq_vector(gsi);
3411 -
3412 - /*
3413 - * Sharing vectors means sharing IRQs, so scan irq_vectors for previous
3414 - * use of vector and if found, return that IRQ. However, we never want
3415 - * to share legacy IRQs, which usually have a different trigger mode
3416 - * than PCI.
3417 - */
3418 - for (i = 0; i < NR_IRQS; i++)
3419 - if (IO_APIC_VECTOR(i) == vector)
3420 - break;
3421 - if (platform_legacy_irq(i)) {
3422 - if (--tries >= 0) {
3423 - IO_APIC_VECTOR(i) = 0;
3424 - goto try_again;
3425 - }
3426 - panic("gsi_irq_sharing: didn't find an IRQ using vector 0x%02X for GSI %d", vector, gsi);
3427 - }
3428 - if (i < NR_IRQS) {
3429 - gsi_2_irq[gsi] = i;
3430 - printk(KERN_INFO "GSI %d sharing vector 0x%02X and IRQ %d\n",
3431 - gsi, vector, i);
3432 - return i;
3433 - }
3434 -
3435 - i = next_irq++;
3436 - BUG_ON(i >= NR_IRQS);
3437 - gsi_2_irq[gsi] = i;
3438 - IO_APIC_VECTOR(i) = vector;
3439 - printk(KERN_INFO "GSI %d assigned vector 0x%02X and IRQ %d\n",
3440 - gsi, vector, i);
3441 - return i;
3442 -}
3443 -
3444 static int pin_2_irq(int idx, int apic, int pin)
3445 {
3446 int irq, i;
3447 @@ -818,49 +645,16 @@
3448 if (mp_irqs[idx].mpc_dstirq != pin)
3449 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
3450
3451 - switch (mp_bus_id_to_type[bus])
3452 - {
3453 - case MP_BUS_ISA: /* ISA pin */
3454 - case MP_BUS_EISA:
3455 - case MP_BUS_MCA:
3456 - {
3457 - irq = mp_irqs[idx].mpc_srcbusirq;
3458 - break;
3459 - }
3460 - case MP_BUS_PCI: /* PCI pin */
3461 - {
3462 - /*
3463 - * PCI IRQs are mapped in order
3464 - */
3465 - i = irq = 0;
3466 - while (i < apic)
3467 - irq += nr_ioapic_registers[i++];
3468 - irq += pin;
3469 - irq = gsi_irq_sharing(irq);
3470 - break;
3471 - }
3472 - default:
3473 - {
3474 - printk(KERN_ERR "unknown bus type %d.\n",bus);
3475 - irq = 0;
3476 - break;
3477 - }
3478 - }
3479 - BUG_ON(irq >= NR_IRQS);
3480 -
3481 - /*
3482 - * PCI IRQ command line redirection. Yes, limits are hardcoded.
3483 - */
3484 - if ((pin >= 16) && (pin <= 23)) {
3485 - if (pirq_entries[pin-16] != -1) {
3486 - if (!pirq_entries[pin-16]) {
3487 - apic_printk(APIC_VERBOSE, "disabling PIRQ%d\n", pin-16);
3488 - } else {
3489 - irq = pirq_entries[pin-16];
3490 - apic_printk(APIC_VERBOSE, "using PIRQ%d -> IRQ %d\n",
3491 - pin-16, irq);
3492 - }
3493 - }
3494 + if (test_bit(bus, mp_bus_not_pci)) {
3495 + irq = mp_irqs[idx].mpc_srcbusirq;
3496 + } else {
3497 + /*
3498 + * PCI IRQs are mapped in order
3499 + */
3500 + i = irq = 0;
3501 + while (i < apic)
3502 + irq += nr_ioapic_registers[i++];
3503 + irq += pin;
3504 }
3505 BUG_ON(irq >= NR_IRQS);
3506 return irq;
3507 @@ -884,43 +678,68 @@
3508 }
3509
3510 /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
3511 -u8 irq_vector[NR_IRQ_VECTORS] __read_mostly;
3512 +static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly;
3513
3514 -int assign_irq_vector(int irq)
3515 +static int __assign_irq_vector(int irq, cpumask_t mask, cpumask_t *result)
3516 {
3517 - unsigned long flags;
3518 int vector;
3519 struct physdev_irq irq_op;
3520
3521 - BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS);
3522 + BUG_ON((unsigned)irq >= NR_IRQ_VECTORS);
3523
3524 - spin_lock_irqsave(&vector_lock, flags);
3525 + cpus_and(*result, mask, cpu_online_map);
3526
3527 - if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) {
3528 - spin_unlock_irqrestore(&vector_lock, flags);
3529 - return IO_APIC_VECTOR(irq);
3530 - }
3531 + if (irq_vector[irq] > 0)
3532 + return irq_vector[irq];
3533
3534 irq_op.irq = irq;
3535 - if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) {
3536 - spin_unlock_irqrestore(&vector_lock, flags);
3537 + if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op))
3538 return -ENOSPC;
3539 - }
3540
3541 vector = irq_op.vector;
3542 - vector_irq[vector] = irq;
3543 - if (irq != AUTO_ASSIGN)
3544 - IO_APIC_VECTOR(irq) = vector;
3545 + irq_vector[irq] = vector;
3546
3547 - spin_unlock_irqrestore(&vector_lock, flags);
3548 + return vector;
3549 +}
3550
3551 +static int assign_irq_vector(int irq, cpumask_t mask, cpumask_t *result)
3552 +{
3553 + int vector;
3554 + unsigned long flags;
3555 +
3556 + spin_lock_irqsave(&vector_lock, flags);
3557 + vector = __assign_irq_vector(irq, mask, result);
3558 + spin_unlock_irqrestore(&vector_lock, flags);
3559 return vector;
3560 }
3561
3562 -extern void (*interrupt[NR_IRQS])(void);
3563 #ifndef CONFIG_XEN
3564 -static struct hw_interrupt_type ioapic_level_type;
3565 -static struct hw_interrupt_type ioapic_edge_type;
3566 +void __setup_vector_irq(int cpu)
3567 +{
3568 + /* Initialize vector_irq on a new cpu */
3569 + /* This function must be called with vector_lock held */
3570 + int irq, vector;
3571 +
3572 + /* Mark the inuse vectors */
3573 + for (irq = 0; irq < NR_IRQ_VECTORS; ++irq) {
3574 + if (!cpu_isset(cpu, irq_domain[irq]))
3575 + continue;
3576 + vector = irq_vector[irq];
3577 + per_cpu(vector_irq, cpu)[vector] = irq;
3578 + }
3579 + /* Mark the free vectors */
3580 + for (vector = 0; vector < NR_VECTORS; ++vector) {
3581 + irq = per_cpu(vector_irq, cpu)[vector];
3582 + if (irq < 0)
3583 + continue;
3584 + if (!cpu_isset(cpu, irq_domain[irq]))
3585 + per_cpu(vector_irq, cpu)[vector] = -1;
3586 + }
3587 +}
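(Illustrative aside, not part of the diff: as the comment states, __setup_vector_irq() must run with vector_lock held; the expected call site is the secondary-CPU bring-up path. A minimal sketch of such a caller, with the example_* name assumed:)

    static void example_cpu_online_vectors(void)
    {
    	spin_lock(&vector_lock);
    	__setup_vector_irq(smp_processor_id());	/* fill this CPU's vector_irq[] */
    	spin_unlock(&vector_lock);
    }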
3588 +
3589 +extern void (*interrupt[NR_IRQS])(void);
3590 +
3591 +static struct irq_chip ioapic_chip;
3592
3593 #define IOAPIC_AUTO -1
3594 #define IOAPIC_EDGE 0
3595 @@ -928,16 +747,15 @@
3596
3597 static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
3598 {
3599 - unsigned idx;
3600 -
3601 - idx = use_pci_vector() && !platform_legacy_irq(irq) ? vector : irq;
3602 -
3603 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
3604 trigger == IOAPIC_LEVEL)
3605 - irq_desc[idx].chip = &ioapic_level_type;
3606 - else
3607 - irq_desc[idx].chip = &ioapic_edge_type;
3608 - set_intr_gate(vector, interrupt[idx]);
3609 + set_irq_chip_and_handler_name(irq, &ioapic_chip,
3610 + handle_fasteoi_irq, "fasteoi");
3611 + else {
3612 + irq_desc[irq].status |= IRQ_DELAYED_DISABLE;
3613 + set_irq_chip_and_handler_name(irq, &ioapic_chip,
3614 + handle_edge_irq, "edge");
3615 + }
3616 }
3617 #else
3618 #define ioapic_register_intr(_irq,_vector,_trigger) ((void)0)
3619 @@ -990,16 +808,21 @@
3620 continue;
3621
3622 if (IO_APIC_IRQ(irq)) {
3623 - vector = assign_irq_vector(irq);
3624 + cpumask_t mask;
3625 + vector = assign_irq_vector(irq, TARGET_CPUS, &mask);
3626 + if (vector < 0)
3627 + continue;
3628 +
3629 + entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask);
3630 entry.vector = vector;
3631
3632 ioapic_register_intr(irq, vector, IOAPIC_AUTO);
3633 if (!apic && (irq < 16))
3634 disable_8259A_irq(irq);
3635 }
3636 + ioapic_write_entry(apic, pin, entry);
3637 +
3638 spin_lock_irqsave(&ioapic_lock, flags);
3639 - io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
3640 - io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
3641 set_native_irq_info(irq, TARGET_CPUS);
3642 spin_unlock_irqrestore(&ioapic_lock, flags);
3643 }
3644 @@ -1042,7 +865,7 @@
3645 * The timer IRQ doesn't have to know that behind the
3646 * scene we have a 8259A-master in AEOI mode ...
3647 */
3648 - irq_desc[0].chip = &ioapic_edge_type;
3649 + set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
3650
3651 /*
3652 * Add it to the IO-APIC irq-routing table:
3653 @@ -1138,10 +961,7 @@
3654 for (i = 0; i <= reg_01.bits.entries; i++) {
3655 struct IO_APIC_route_entry entry;
3656
3657 - spin_lock_irqsave(&ioapic_lock, flags);
3658 - *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2);
3659 - *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2);
3660 - spin_unlock_irqrestore(&ioapic_lock, flags);
3661 + entry = ioapic_read_entry(apic, i);
3662
3663 printk(KERN_DEBUG " %02x %03X %02X ",
3664 i,
3665 @@ -1161,17 +981,12 @@
3666 );
3667 }
3668 }
3669 - if (use_pci_vector())
3670 - printk(KERN_INFO "Using vector-based indexing\n");
3671 printk(KERN_DEBUG "IRQ to pin mappings:\n");
3672 for (i = 0; i < NR_IRQS; i++) {
3673 struct irq_pin_list *entry = irq_2_pin + i;
3674 if (entry->pin < 0)
3675 continue;
3676 - if (use_pci_vector() && !platform_legacy_irq(i))
3677 - printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i));
3678 - else
3679 - printk(KERN_DEBUG "IRQ%d ", i);
3680 + printk(KERN_DEBUG "IRQ%d ", i);
3681 for (;;) {
3682 printk("-> %d:%d", entry->apic, entry->pin);
3683 if (!entry->next)
3684 @@ -1335,9 +1150,6 @@
3685 irq_2_pin[i].pin = -1;
3686 irq_2_pin[i].next = 0;
3687 }
3688 - if (!pirqs_enabled)
3689 - for (i = 0; i < MAX_PIRQS; i++)
3690 - pirq_entries[i] = -1;
3691
3692 /*
3693 * The number of IO-APIC IRQ registers (== #pins):
3694 @@ -1354,11 +1166,7 @@
3695 /* See if any of the pins is in ExtINT mode */
3696 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
3697 struct IO_APIC_route_entry entry;
3698 - spin_lock_irqsave(&ioapic_lock, flags);
3699 - *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
3700 - *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
3701 - spin_unlock_irqrestore(&ioapic_lock, flags);
3702 -
3703 + entry = ioapic_read_entry(apic, pin);
3704
3705 /* If the interrupt line is enabled and in ExtInt mode
3706 * I have found the pin where the i8259 is connected.
3707 @@ -1412,7 +1220,6 @@
3708 */
3709 if (ioapic_i8259.pin != -1) {
3710 struct IO_APIC_route_entry entry;
3711 - unsigned long flags;
3712
3713 memset(&entry, 0, sizeof(entry));
3714 entry.mask = 0; /* Enabled */
3715 @@ -1429,12 +1236,7 @@
3716 /*
3717 * Add it to the IO-APIC irq-routing table:
3718 */
3719 - spin_lock_irqsave(&ioapic_lock, flags);
3720 - io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin,
3721 - *(((int *)&entry)+1));
3722 - io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin,
3723 - *(((int *)&entry)+0));
3724 - spin_unlock_irqrestore(&ioapic_lock, flags);
3725 + ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
3726 }
3727
3728 disconnect_bsp_APIC(ioapic_i8259.pin != -1);
3729 @@ -1442,76 +1244,6 @@
3730 }
3731
3732 /*
3733 - * function to set the IO-APIC physical IDs based on the
3734 - * values stored in the MPC table.
3735 - *
3736 - * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
3737 - */
3738 -
3739 -#ifndef CONFIG_XEN
3740 -static void __init setup_ioapic_ids_from_mpc (void)
3741 -{
3742 - union IO_APIC_reg_00 reg_00;
3743 - int apic;
3744 - int i;
3745 - unsigned char old_id;
3746 - unsigned long flags;
3747 -
3748 - /*
3749 - * Set the IOAPIC ID to the value stored in the MPC table.
3750 - */
3751 - for (apic = 0; apic < nr_ioapics; apic++) {
3752 -
3753 - /* Read the register 0 value */
3754 - spin_lock_irqsave(&ioapic_lock, flags);
3755 - reg_00.raw = io_apic_read(apic, 0);
3756 - spin_unlock_irqrestore(&ioapic_lock, flags);
3757 -
3758 - old_id = mp_ioapics[apic].mpc_apicid;
3759 -
3760 -
3761 - printk(KERN_INFO "Using IO-APIC %d\n", mp_ioapics[apic].mpc_apicid);
3762 -
3763 -
3764 - /*
3765 - * We need to adjust the IRQ routing table
3766 - * if the ID changed.
3767 - */
3768 - if (old_id != mp_ioapics[apic].mpc_apicid)
3769 - for (i = 0; i < mp_irq_entries; i++)
3770 - if (mp_irqs[i].mpc_dstapic == old_id)
3771 - mp_irqs[i].mpc_dstapic
3772 - = mp_ioapics[apic].mpc_apicid;
3773 -
3774 - /*
3775 - * Read the right value from the MPC table and
3776 - * write it into the ID register.
3777 - */
3778 - apic_printk(APIC_VERBOSE,KERN_INFO "...changing IO-APIC physical APIC ID to %d ...",
3779 - mp_ioapics[apic].mpc_apicid);
3780 -
3781 - reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
3782 - spin_lock_irqsave(&ioapic_lock, flags);
3783 - io_apic_write(apic, 0, reg_00.raw);
3784 - spin_unlock_irqrestore(&ioapic_lock, flags);
3785 -
3786 - /*
3787 - * Sanity check
3788 - */
3789 - spin_lock_irqsave(&ioapic_lock, flags);
3790 - reg_00.raw = io_apic_read(apic, 0);
3791 - spin_unlock_irqrestore(&ioapic_lock, flags);
3792 - if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
3793 - printk("could not set ID!\n");
3794 - else
3795 - apic_printk(APIC_VERBOSE," ok.\n");
3796 - }
3797 -}
3798 -#else
3799 -static void __init setup_ioapic_ids_from_mpc(void) { }
3800 -#endif
3801 -
3802 -/*
3803 * There is a nasty bug in some older SMP boards, their mptable lies
3804 * about the timer IRQ. We do the following to work around the situation:
3805 *
3806 @@ -1565,7 +1297,7 @@
3807 * an edge even if it isn't on the 8259A...
3808 */
3809
3810 -static unsigned int startup_edge_ioapic_irq(unsigned int irq)
3811 +static unsigned int startup_ioapic_irq(unsigned int irq)
3812 {
3813 int was_pending = 0;
3814 unsigned long flags;
3815 @@ -1582,107 +1314,19 @@
3816 return was_pending;
3817 }
3818
3819 -/*
3820 - * Once we have recorded IRQ_PENDING already, we can mask the
3821 - * interrupt for real. This prevents IRQ storms from unhandled
3822 - * devices.
3823 - */
3824 -static void ack_edge_ioapic_irq(unsigned int irq)
3825 -{
3826 - move_irq(irq);
3827 - if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
3828 - == (IRQ_PENDING | IRQ_DISABLED))
3829 - mask_IO_APIC_irq(irq);
3830 - ack_APIC_irq();
3831 -}
3832 -
3833 -/*
3834 - * Level triggered interrupts can just be masked,
3835 - * and shutting down and starting up the interrupt
3836 - * is the same as enabling and disabling them -- except
3837 - * with a startup need to return a "was pending" value.
3838 - *
3839 - * Level triggered interrupts are special because we
3840 - * do not touch any IO-APIC register while handling
3841 - * them. We ack the APIC in the end-IRQ handler, not
3842 - * in the start-IRQ-handler. Protection against reentrance
3843 - * from the same interrupt is still provided, both by the
3844 - * generic IRQ layer and by the fact that an unacked local
3845 - * APIC does not accept IRQs.
3846 - */
3847 -static unsigned int startup_level_ioapic_irq (unsigned int irq)
3848 -{
3849 - unmask_IO_APIC_irq(irq);
3850 -
3851 - return 0; /* don't check for pending */
3852 -}
3853 -
3854 -static void end_level_ioapic_irq (unsigned int irq)
3855 -{
3856 - move_irq(irq);
3857 - ack_APIC_irq();
3858 -}
3859 -
3860 -#ifdef CONFIG_PCI_MSI
3861 -static unsigned int startup_edge_ioapic_vector(unsigned int vector)
3862 -{
3863 - int irq = vector_to_irq(vector);
3864 -
3865 - return startup_edge_ioapic_irq(irq);
3866 -}
3867 -
3868 -static void ack_edge_ioapic_vector(unsigned int vector)
3869 -{
3870 - int irq = vector_to_irq(vector);
3871 -
3872 - move_native_irq(vector);
3873 - ack_edge_ioapic_irq(irq);
3874 -}
3875 -
3876 -static unsigned int startup_level_ioapic_vector (unsigned int vector)
3877 -{
3878 - int irq = vector_to_irq(vector);
3879 -
3880 - return startup_level_ioapic_irq (irq);
3881 -}
3882 -
3883 -static void end_level_ioapic_vector (unsigned int vector)
3884 -{
3885 - int irq = vector_to_irq(vector);
3886 -
3887 - move_native_irq(vector);
3888 - end_level_ioapic_irq(irq);
3889 -}
3890 -
3891 -static void mask_IO_APIC_vector (unsigned int vector)
3892 -{
3893 - int irq = vector_to_irq(vector);
3894 -
3895 - mask_IO_APIC_irq(irq);
3896 -}
3897 -
3898 -static void unmask_IO_APIC_vector (unsigned int vector)
3899 -{
3900 - int irq = vector_to_irq(vector);
3901 -
3902 - unmask_IO_APIC_irq(irq);
3903 -}
3904 -
3905 -#ifdef CONFIG_SMP
3906 -static void set_ioapic_affinity_vector (unsigned int vector,
3907 - cpumask_t cpu_mask)
3908 +static int ioapic_retrigger_irq(unsigned int irq)
3909 {
3910 - int irq = vector_to_irq(vector);
3911 + cpumask_t mask;
3912 + unsigned vector;
3913 + unsigned long flags;
3914
3915 - set_native_irq_info(vector, cpu_mask);
3916 - set_ioapic_affinity_irq(irq, cpu_mask);
3917 -}
3918 -#endif // CONFIG_SMP
3919 -#endif // CONFIG_PCI_MSI
3920 + spin_lock_irqsave(&vector_lock, flags);
3921 + vector = irq_vector[irq];
3922 + cpus_clear(mask);
3923 + cpu_set(first_cpu(irq_domain[irq]), mask);
3924
3925 -static int ioapic_retrigger(unsigned int irq)
3926 -{
3927 - send_IPI_self(IO_APIC_VECTOR(irq));
3928 + send_IPI_mask(mask, vector);
3929 + spin_unlock_irqrestore(&vector_lock, flags);
3930
3931 return 1;
3932 }
3933 @@ -1696,32 +1340,47 @@
3934 * races.
3935 */
3936
3937 -static struct hw_interrupt_type ioapic_edge_type __read_mostly = {
3938 - .typename = "IO-APIC-edge",
3939 - .startup = startup_edge_ioapic,
3940 - .shutdown = shutdown_edge_ioapic,
3941 - .enable = enable_edge_ioapic,
3942 - .disable = disable_edge_ioapic,
3943 - .ack = ack_edge_ioapic,
3944 - .end = end_edge_ioapic,
3945 -#ifdef CONFIG_SMP
3946 - .set_affinity = set_ioapic_affinity,
3947 +static void ack_apic_edge(unsigned int irq)
3948 +{
3949 + move_native_irq(irq);
3950 + ack_APIC_irq();
3951 +}
3952 +
3953 +static void ack_apic_level(unsigned int irq)
3954 +{
3955 + int do_unmask_irq = 0;
3956 +
3957 +#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)
3958 + /* If we are moving the irq we need to mask it */
3959 + if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) {
3960 + do_unmask_irq = 1;
3961 + mask_IO_APIC_irq(irq);
3962 + }
3963 #endif
3964 - .retrigger = ioapic_retrigger,
3965 -};
3966
3967 -static struct hw_interrupt_type ioapic_level_type __read_mostly = {
3968 - .typename = "IO-APIC-level",
3969 - .startup = startup_level_ioapic,
3970 - .shutdown = shutdown_level_ioapic,
3971 - .enable = enable_level_ioapic,
3972 - .disable = disable_level_ioapic,
3973 - .ack = mask_and_ack_level_ioapic,
3974 - .end = end_level_ioapic,
3975 + /*
3976 + * We must acknowledge the irq before we move it or the acknowledge will
3977 + * not propagate properly.
3978 + */
3979 + ack_APIC_irq();
3980 +
3981 + /* Now we can move and re-enable the irq */
3982 + move_masked_irq(irq);
3983 + if (unlikely(do_unmask_irq))
3984 + unmask_IO_APIC_irq(irq);
3985 +}
3986 +
3987 +static struct irq_chip ioapic_chip __read_mostly = {
3988 + .name = "IO-APIC",
3989 + .startup = startup_ioapic_irq,
3990 + .mask = mask_IO_APIC_irq,
3991 + .unmask = unmask_IO_APIC_irq,
3992 + .ack = ack_apic_edge,
3993 + .eoi = ack_apic_level,
3994 #ifdef CONFIG_SMP
3995 - .set_affinity = set_ioapic_affinity,
3996 + .set_affinity = set_ioapic_affinity_irq,
3997 #endif
3998 - .retrigger = ioapic_retrigger,
3999 + .retrigger = ioapic_retrigger_irq,
4000 };
4001 #endif /* !CONFIG_XEN */
4002
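The edge/level split that previously needed two hw_interrupt_type structures is now carried by a single irq_chip plus the generic flow handlers: edge pins are acked through .ack (ack_apic_edge) by handle_edge_irq, level pins are EOIed through .eoi (ack_apic_level) by handle_fasteoi_irq. A minimal sketch of how a pin gets wired to those handlers, assuming the registration helper elsewhere in io_apic_64-xen.c follows the mainline 2.6.19 pattern (the function name here is illustrative only):

static void example_register_pin(unsigned int irq, int level_triggered)
{
	if (level_triggered)
		set_irq_chip_and_handler_name(irq, &ioapic_chip,
					      handle_fasteoi_irq, "fasteoi");
	else
		set_irq_chip_and_handler_name(irq, &ioapic_chip,
					      handle_edge_irq, "edge");
}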
4003 @@ -1742,12 +1401,7 @@
4004 */
4005 for (irq = 0; irq < NR_IRQS ; irq++) {
4006 int tmp = irq;
4007 - if (use_pci_vector()) {
4008 - if (!platform_legacy_irq(tmp))
4009 - if ((tmp = vector_to_irq(tmp)) == -1)
4010 - continue;
4011 - }
4012 - if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) {
4013 + if (IO_APIC_IRQ(tmp) && !irq_vector[tmp]) {
4014 /*
4015 * Hmm.. We don't have an entry for this,
4016 * so default to an old-fashioned 8259
4017 @@ -1758,7 +1412,7 @@
4018 #ifndef CONFIG_XEN
4019 else
4020 /* Strange. Oh, well.. */
4021 - irq_desc[irq].chip = &no_irq_type;
4022 + irq_desc[irq].chip = &no_irq_chip;
4023 #endif
4024 }
4025 }
4026 @@ -1879,8 +1533,6 @@
4027 spin_unlock_irqrestore(&ioapic_lock, flags);
4028 }
4029
4030 -int timer_uses_ioapic_pin_0;
4031 -
4032 /*
4033 * This code may look a bit paranoid, but it's supposed to cooperate with
4034 * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
4035 @@ -1893,13 +1545,13 @@
4036 {
4037 int apic1, pin1, apic2, pin2;
4038 int vector;
4039 + cpumask_t mask;
4040
4041 /*
4042 * get/set the timer IRQ vector:
4043 */
4044 disable_8259A_irq(0);
4045 - vector = assign_irq_vector(0);
4046 - set_intr_gate(vector, interrupt[0]);
4047 + vector = assign_irq_vector(0, TARGET_CPUS, &mask);
4048
4049 /*
4050 * Subtle, code in do_timer_interrupt() expects an AEOI
4051 @@ -1918,9 +1570,6 @@
4052 pin2 = ioapic_i8259.pin;
4053 apic2 = ioapic_i8259.apic;
4054
4055 - if (pin1 == 0)
4056 - timer_uses_ioapic_pin_0 = 1;
4057 -
4058 apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
4059 vector, apic1, pin1, apic2, pin2);
4060
4061 @@ -2035,11 +1684,6 @@
4062
4063 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
4064
4065 - /*
4066 - * Set up the IO-APIC IRQ routing table.
4067 - */
4068 - if (!acpi_ioapic)
4069 - setup_ioapic_ids_from_mpc();
4070 #ifndef CONFIG_XEN
4071 sync_Arb_IDs();
4072 #endif /* !CONFIG_XEN */
4073 @@ -2060,17 +1704,12 @@
4074 {
4075 struct IO_APIC_route_entry *entry;
4076 struct sysfs_ioapic_data *data;
4077 - unsigned long flags;
4078 int i;
4079
4080 data = container_of(dev, struct sysfs_ioapic_data, dev);
4081 entry = data->entry;
4082 - spin_lock_irqsave(&ioapic_lock, flags);
4083 - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
4084 - *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i);
4085 - *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i);
4086 - }
4087 - spin_unlock_irqrestore(&ioapic_lock, flags);
4088 + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ )
4089 + *entry = ioapic_read_entry(dev->id, i);
4090
4091 return 0;
4092 }
4093 @@ -2092,11 +1731,9 @@
4094 reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
4095 io_apic_write(dev->id, 0, reg_00.raw);
4096 }
4097 - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
4098 - io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1));
4099 - io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0));
4100 - }
4101 spin_unlock_irqrestore(&ioapic_lock, flags);
4102 + for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
4103 + ioapic_write_entry(dev->id, i, entry[i]);
4104
4105 return 0;
4106 }
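Both the ExtINT probe earlier in this file and the suspend/resume pair above now go through ioapic_read_entry()/ioapic_write_entry() instead of open-coding two io_apic_read()/io_apic_write() calls under ioapic_lock. A hedged sketch of what those accessors look like, assuming they keep the same locking and word order as the code they replace (the real helpers are added by an earlier hunk and are not shown here):

static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
{
	union {
		struct IO_APIC_route_entry entry;
		u32 w[2];
	} eu;
	unsigned long flags;

	spin_lock_irqsave(&ioapic_lock, flags);
	eu.w[0] = io_apic_read(apic, 0x10 + 2 * pin);
	eu.w[1] = io_apic_read(apic, 0x11 + 2 * pin);
	spin_unlock_irqrestore(&ioapic_lock, flags);
	return eu.entry;
}

static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
{
	union {
		struct IO_APIC_route_entry entry;
		u32 w[2];
	} eu = { .entry = e };
	unsigned long flags;

	spin_lock_irqsave(&ioapic_lock, flags);
	io_apic_write(apic, 0x11 + 2 * pin, eu.w[1]);	/* high word first */
	io_apic_write(apic, 0x10 + 2 * pin, eu.w[0]);	/* mask bit lands last */
	spin_unlock_irqrestore(&ioapic_lock, flags);
}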
4107 @@ -2142,26 +1779,254 @@
4108
4109 device_initcall(ioapic_init_sysfs);
4110
4111 -/* --------------------------------------------------------------------------
4112 - ACPI-based IOAPIC Configuration
4113 - -------------------------------------------------------------------------- */
4114 +#ifndef CONFIG_XEN
4115 +/*
4116 + * Dynamic irq allocate and deallocation
4117 + */
4118 +int create_irq(void)
4119 +{
4120 + /* Allocate an unused irq */
4121 + int irq;
4122 + int new;
4123 + int vector = 0;
4124 + unsigned long flags;
4125 + cpumask_t mask;
4126
4127 -#ifdef CONFIG_ACPI
4128 + irq = -ENOSPC;
4129 + spin_lock_irqsave(&vector_lock, flags);
4130 + for (new = (NR_IRQS - 1); new >= 0; new--) {
4131 + if (platform_legacy_irq(new))
4132 + continue;
4133 + if (irq_vector[new] != 0)
4134 + continue;
4135 + vector = __assign_irq_vector(new, TARGET_CPUS, &mask);
4136 + if (likely(vector > 0))
4137 + irq = new;
4138 + break;
4139 + }
4140 + spin_unlock_irqrestore(&vector_lock, flags);
4141
4142 -#define IO_APIC_MAX_ID 0xFE
4143 + if (irq >= 0) {
4144 + dynamic_irq_init(irq);
4145 + }
4146 + return irq;
4147 +}
4148
4149 -int __init io_apic_get_version (int ioapic)
4150 +void destroy_irq(unsigned int irq)
4151 {
4152 - union IO_APIC_reg_01 reg_01;
4153 unsigned long flags;
4154
4155 - spin_lock_irqsave(&ioapic_lock, flags);
4156 - reg_01.raw = io_apic_read(ioapic, 1);
4157 - spin_unlock_irqrestore(&ioapic_lock, flags);
4158 + dynamic_irq_cleanup(irq);
4159 +
4160 + spin_lock_irqsave(&vector_lock, flags);
4161 + irq_vector[irq] = 0;
4162 + spin_unlock_irqrestore(&vector_lock, flags);
4163 +}
4164 +#endif
4165 +
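create_irq() scans down from NR_IRQS - 1 for an irq that has no vector assigned, reserves a vector for it under vector_lock and initializes it as a dynamic irq; destroy_irq() undoes both steps. A hedged usage sketch, with illustrative example_* names that are not part of the patch:

static int example_claim_dynamic_irq(void)
{
	int irq = create_irq();		/* picks a free irq and assigns a vector */

	if (irq < 0)
		return irq;		/* -ENOSPC when nothing is available */
	/* the caller then attaches its own irq_chip, as the MSI code below
	 * does with set_irq_chip_and_handler_name(..., handle_edge_irq, "edge") */
	return irq;
}

static void example_release_dynamic_irq(unsigned int irq)
{
	destroy_irq(irq);		/* dynamic_irq_cleanup() plus vector release */
}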
4166 +/*
4167 + * MSI message composition
4168 + */
4169 +#ifdef CONFIG_PCI_MSI
4170 +static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
4171 +{
4172 + int vector;
4173 + unsigned dest;
4174 + cpumask_t tmp;
4175 +
4176 + vector = assign_irq_vector(irq, TARGET_CPUS, &tmp);
4177 + if (vector >= 0) {
4178 + dest = cpu_mask_to_apicid(tmp);
4179 +
4180 + msg->address_hi = MSI_ADDR_BASE_HI;
4181 + msg->address_lo =
4182 + MSI_ADDR_BASE_LO |
4183 + ((INT_DEST_MODE == 0) ?
4184 + MSI_ADDR_DEST_MODE_PHYSICAL:
4185 + MSI_ADDR_DEST_MODE_LOGICAL) |
4186 + ((INT_DELIVERY_MODE != dest_LowestPrio) ?
4187 + MSI_ADDR_REDIRECTION_CPU:
4188 + MSI_ADDR_REDIRECTION_LOWPRI) |
4189 + MSI_ADDR_DEST_ID(dest);
4190 +
4191 + msg->data =
4192 + MSI_DATA_TRIGGER_EDGE |
4193 + MSI_DATA_LEVEL_ASSERT |
4194 + ((INT_DELIVERY_MODE != dest_LowestPrio) ?
4195 + MSI_DATA_DELIVERY_FIXED:
4196 + MSI_DATA_DELIVERY_LOWPRI) |
4197 + MSI_DATA_VECTOR(vector);
4198 + }
4199 + return vector;
4200 +}
4201 +
4202 +#ifdef CONFIG_SMP
4203 +static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
4204 +{
4205 + struct msi_msg msg;
4206 + unsigned int dest;
4207 + cpumask_t tmp;
4208 + int vector;
4209 +
4210 + cpus_and(tmp, mask, cpu_online_map);
4211 + if (cpus_empty(tmp))
4212 + tmp = TARGET_CPUS;
4213 +
4214 + cpus_and(mask, tmp, CPU_MASK_ALL);
4215 +
4216 + vector = assign_irq_vector(irq, mask, &tmp);
4217 + if (vector < 0)
4218 + return;
4219 +
4220 + dest = cpu_mask_to_apicid(tmp);
4221 +
4222 + read_msi_msg(irq, &msg);
4223 +
4224 + msg.data &= ~MSI_DATA_VECTOR_MASK;
4225 + msg.data |= MSI_DATA_VECTOR(vector);
4226 + msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
4227 + msg.address_lo |= MSI_ADDR_DEST_ID(dest);
4228 +
4229 + write_msi_msg(irq, &msg);
4230 + set_native_irq_info(irq, mask);
4231 +}
4232 +#endif /* CONFIG_SMP */
4233 +
4234 +/*
4235 + * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
4236 + * which implement the MSI or MSI-X Capability Structure.
4237 + */
4238 +static struct irq_chip msi_chip = {
4239 + .name = "PCI-MSI",
4240 + .unmask = unmask_msi_irq,
4241 + .mask = mask_msi_irq,
4242 + .ack = ack_apic_edge,
4243 +#ifdef CONFIG_SMP
4244 + .set_affinity = set_msi_irq_affinity,
4245 +#endif
4246 + .retrigger = ioapic_retrigger_irq,
4247 +};
4248 +
4249 +int arch_setup_msi_irq(unsigned int irq, struct pci_dev *dev)
4250 +{
4251 + struct msi_msg msg;
4252 + int ret;
4253 + ret = msi_compose_msg(dev, irq, &msg);
4254 + if (ret < 0)
4255 + return ret;
4256 +
4257 + write_msi_msg(irq, &msg);
4258 +
4259 + set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
4260 +
4261 + return 0;
4262 +}
4263 +
4264 +void arch_teardown_msi_irq(unsigned int irq)
4265 +{
4266 + return;
4267 +}
4268 +
4269 +#endif /* CONFIG_PCI_MSI */
4270 +
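The arch_setup_msi_irq()/arch_teardown_msi_irq() hooks above are reached through the generic PCI MSI layer, not called by drivers directly; from a driver the path remains the usual pci_enable_msi() plus request_irq(). A hedged sketch, with the device name and handler purely illustrative:

static int example_enable_msi(struct pci_dev *pdev, irq_handler_t handler, void *ctx)
{
	int rc = pci_enable_msi(pdev);	/* ends up in arch_setup_msi_irq() above */

	if (rc)
		return rc;
	return request_irq(pdev->irq, handler, 0, "example-msi", ctx);
}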
4271 +/*
4272 + * Hypertransport interrupt support
4273 + */
4274 +#ifdef CONFIG_HT_IRQ
4275 +
4276 +#ifdef CONFIG_SMP
4277 +
4278 +static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
4279 +{
4280 + struct ht_irq_msg msg;
4281 + fetch_ht_irq_msg(irq, &msg);
4282 +
4283 + msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK);
4284 + msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
4285
4286 - return reg_01.bits.version;
4287 + msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest);
4288 + msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
4289 +
4290 + write_ht_irq_msg(irq, &msg);
4291 }
4292
4293 +static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
4294 +{
4295 + unsigned int dest;
4296 + cpumask_t tmp;
4297 + int vector;
4298 +
4299 + cpus_and(tmp, mask, cpu_online_map);
4300 + if (cpus_empty(tmp))
4301 + tmp = TARGET_CPUS;
4302 +
4303 + cpus_and(mask, tmp, CPU_MASK_ALL);
4304 +
4305 + vector = assign_irq_vector(irq, mask, &tmp);
4306 + if (vector < 0)
4307 + return;
4308 +
4309 + dest = cpu_mask_to_apicid(tmp);
4310 +
4311 + target_ht_irq(irq, dest, vector);
4312 + set_native_irq_info(irq, mask);
4313 +}
4314 +#endif
4315 +
4316 +static struct irq_chip ht_irq_chip = {
4317 + .name = "PCI-HT",
4318 + .mask = mask_ht_irq,
4319 + .unmask = unmask_ht_irq,
4320 + .ack = ack_apic_edge,
4321 +#ifdef CONFIG_SMP
4322 + .set_affinity = set_ht_irq_affinity,
4323 +#endif
4324 + .retrigger = ioapic_retrigger_irq,
4325 +};
4326 +
4327 +int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
4328 +{
4329 + int vector;
4330 + cpumask_t tmp;
4331 +
4332 + vector = assign_irq_vector(irq, TARGET_CPUS, &tmp);
4333 + if (vector >= 0) {
4334 + struct ht_irq_msg msg;
4335 + unsigned dest;
4336 +
4337 + dest = cpu_mask_to_apicid(tmp);
4338 +
4339 + msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
4340 +
4341 + msg.address_lo =
4342 + HT_IRQ_LOW_BASE |
4343 + HT_IRQ_LOW_DEST_ID(dest) |
4344 + HT_IRQ_LOW_VECTOR(vector) |
4345 + ((INT_DEST_MODE == 0) ?
4346 + HT_IRQ_LOW_DM_PHYSICAL :
4347 + HT_IRQ_LOW_DM_LOGICAL) |
4348 + HT_IRQ_LOW_RQEOI_EDGE |
4349 + ((INT_DELIVERY_MODE != dest_LowestPrio) ?
4350 + HT_IRQ_LOW_MT_FIXED :
4351 + HT_IRQ_LOW_MT_ARBITRATED) |
4352 + HT_IRQ_LOW_IRQ_MASKED;
4353 +
4354 + write_ht_irq_msg(irq, &msg);
4355 +
4356 + set_irq_chip_and_handler_name(irq, &ht_irq_chip,
4357 + handle_edge_irq, "edge");
4358 + }
4359 + return vector;
4360 +}
4361 +#endif /* CONFIG_HT_IRQ */
4362 +
4363 +/* --------------------------------------------------------------------------
4364 + ACPI-based IOAPIC Configuration
4365 + -------------------------------------------------------------------------- */
4366 +
4367 +#ifdef CONFIG_ACPI
4368 +
4369 +#define IO_APIC_MAX_ID 0xFE
4370
4371 int __init io_apic_get_redir_entries (int ioapic)
4372 {
4373 @@ -2180,6 +2045,8 @@
4374 {
4375 struct IO_APIC_route_entry entry;
4376 unsigned long flags;
4377 + int vector;
4378 + cpumask_t mask;
4379
4380 if (!IO_APIC_IRQ(irq)) {
4381 apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
4382 @@ -2188,6 +2055,17 @@
4383 }
4384
4385 /*
4386 + * IRQs < 16 are already in the irq_2_pin[] map
4387 + */
4388 + if (irq >= 16)
4389 + add_pin_to_irq(irq, ioapic, pin);
4390 +
4391 +
4392 + vector = assign_irq_vector(irq, TARGET_CPUS, &mask);
4393 + if (vector < 0)
4394 + return vector;
4395 +
4396 + /*
4397 * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
4398 * Note that we mask (disable) IRQs now -- these get enabled when the
4399 * corresponding device driver registers for this IRQ.
4400 @@ -2197,19 +2075,11 @@
4401
4402 entry.delivery_mode = INT_DELIVERY_MODE;
4403 entry.dest_mode = INT_DEST_MODE;
4404 - entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
4405 + entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask);
4406 entry.trigger = edge_level;
4407 entry.polarity = active_high_low;
4408 entry.mask = 1; /* Disabled (masked) */
4409 -
4410 - irq = gsi_irq_sharing(irq);
4411 - /*
4412 - * IRQs < 16 are already in the irq_2_pin[] map
4413 - */
4414 - if (irq >= 16)
4415 - add_pin_to_irq(irq, ioapic, pin);
4416 -
4417 - entry.vector = assign_irq_vector(irq);
4418 + entry.vector = vector & 0xff;
4419
4420 apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry (%d-%d -> 0x%x -> "
4421 "IRQ %d Mode:%i Active:%i)\n", ioapic,
4422 @@ -2221,10 +2091,10 @@
4423 if (!ioapic && (irq < 16))
4424 disable_8259A_irq(irq);
4425
4426 + ioapic_write_entry(ioapic, pin, entry);
4427 +
4428 spin_lock_irqsave(&ioapic_lock, flags);
4429 - io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
4430 - io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
4431 - set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS);
4432 + set_native_irq_info(irq, TARGET_CPUS);
4433 spin_unlock_irqrestore(&ioapic_lock, flags);
4434
4435 return 0;
4436 --- a/arch/x86/kernel/ioport_64-xen.c
4437 +++ b/arch/x86/kernel/ioport_64-xen.c
4438 @@ -58,6 +58,7 @@
4439
4440 memset(bitmap, 0xff, IO_BITMAP_BYTES);
4441 t->io_bitmap_ptr = bitmap;
4442 + set_thread_flag(TIF_IO_BITMAP);
4443
4444 set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap);
4445 set_iobitmap.nr_ports = IO_BITMAP_BITS;
4446 --- a/arch/x86/kernel/irq_32-xen.c
4447 +++ b/arch/x86/kernel/irq_32-xen.c
4448 @@ -53,8 +53,10 @@
4449 */
4450 fastcall unsigned int do_IRQ(struct pt_regs *regs)
4451 {
4452 + struct pt_regs *old_regs;
4453 /* high bit used in ret_from_ code */
4454 int irq = ~regs->orig_eax;
4455 + struct irq_desc *desc = irq_desc + irq;
4456 #ifdef CONFIG_4KSTACKS
4457 union irq_ctx *curctx, *irqctx;
4458 u32 *isp;
4459 @@ -66,6 +68,7 @@
4460 BUG();
4461 }
4462
4463 + old_regs = set_irq_regs(regs);
4464 irq_enter();
4465 #ifdef CONFIG_DEBUG_STACKOVERFLOW
4466 /* Debugging check for stack overflow: is there less than 1KB free? */
4467 @@ -110,19 +113,20 @@
4468 (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
4469
4470 asm volatile(
4471 - " xchgl %%ebx,%%esp \n"
4472 - " call __do_IRQ \n"
4473 + " xchgl %%ebx,%%esp \n"
4474 + " call *%%edi \n"
4475 " movl %%ebx,%%esp \n"
4476 : "=a" (arg1), "=d" (arg2), "=b" (ebx)
4477 - : "0" (irq), "1" (regs), "2" (isp)
4478 - : "memory", "cc", "ecx"
4479 + : "0" (irq), "1" (desc), "2" (isp),
4480 + "D" (desc->handle_irq)
4481 + : "memory", "cc"
4482 );
4483 } else
4484 #endif
4485 - __do_IRQ(irq, regs);
4486 + desc->handle_irq(irq, desc);
4487
4488 irq_exit();
4489 -
4490 + set_irq_regs(old_regs);
4491 return 1;
4492 }
4493
4494 @@ -253,7 +257,8 @@
4495 for_each_online_cpu(j)
4496 seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
4497 #endif
4498 - seq_printf(p, " %14s", irq_desc[i].chip->typename);
4499 + seq_printf(p, " %8s", irq_desc[i].chip->name);
4500 + seq_printf(p, "-%-8s", irq_desc[i].name);
4501 seq_printf(p, " %s", action->name);
4502
4503 for (action=action->next; action; action = action->next)
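With do_IRQ() now stashing the register snapshot via set_irq_regs() and dispatching through desc->handle_irq() (and, in the 64-bit variant below, generic_handle_irq()), handlers no longer receive a struct pt_regs * argument; the few that still need it fetch it themselves. A hedged sketch of a handler under the new convention:

static irqreturn_t example_handler(int irq, void *dev_id)
{
	struct pt_regs *regs = get_irq_regs();	/* snapshot stored by do_IRQ() */

	/* use regs only where really needed, e.g. for profiling or backtraces */
	(void)regs;
	return IRQ_HANDLED;
}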
4504 --- a/arch/x86/kernel/irq_64-xen.c
4505 +++ b/arch/x86/kernel/irq_64-xen.c
4506 @@ -20,11 +20,6 @@
4507 #include <asm/idle.h>
4508
4509 atomic_t irq_err_count;
4510 -#ifdef CONFIG_X86_IO_APIC
4511 -#ifdef APIC_MISMATCH_DEBUG
4512 -atomic_t irq_mis_count;
4513 -#endif
4514 -#endif
4515
4516 #ifdef CONFIG_DEBUG_STACKOVERFLOW
4517 /*
4518 @@ -79,7 +74,8 @@
4519 for_each_online_cpu(j)
4520 seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
4521 #endif
4522 - seq_printf(p, " %14s", irq_desc[i].chip->typename);
4523 + seq_printf(p, " %8s", irq_desc[i].chip->name);
4524 + seq_printf(p, "-%-8s", irq_desc[i].name);
4525
4526 seq_printf(p, " %s", action->name);
4527 for (action=action->next; action; action = action->next)
4528 @@ -99,11 +95,6 @@
4529 seq_putc(p, '\n');
4530 #endif
4531 seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
4532 -#ifdef CONFIG_X86_IO_APIC
4533 -#ifdef APIC_MISMATCH_DEBUG
4534 - seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
4535 -#endif
4536 -#endif
4537 }
4538 return 0;
4539 }
4540 @@ -114,24 +105,28 @@
4541 * handlers).
4542 */
4543 asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
4544 -{
4545 +{
4546 + struct pt_regs *old_regs = set_irq_regs(regs);
4547 +
4548 /* high bit used in ret_from_ code */
4549 unsigned irq = ~regs->orig_rax;
4550
4551 - if (unlikely(irq >= NR_IRQS)) {
4552 - printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
4553 - __FUNCTION__, irq);
4554 - BUG();
4555 - }
4556 -
4557 exit_idle();
4558 irq_enter();
4559 +
4560 #ifdef CONFIG_DEBUG_STACKOVERFLOW
4561 stack_overflow_check(regs);
4562 #endif
4563 - __do_IRQ(irq, regs);
4564 +
4565 + if (likely(irq < NR_IRQS))
4566 + generic_handle_irq(irq);
4567 + else
4568 + printk(KERN_EMERG "%s: %d.%d No irq handler for irq\n",
4569 + __func__, smp_processor_id(), irq);
4570 +
4571 irq_exit();
4572
4573 + set_irq_regs(old_regs);
4574 return 1;
4575 }
4576
4577 @@ -192,6 +187,6 @@
4578 */
4579 void ack_bad_irq(unsigned int irq)
4580 {
4581 - printk("unexpected IRQ trap at vector %02x\n", irq);
4582 + printk("unexpected IRQ trap at irq %02x\n", irq);
4583 }
4584 #endif
4585 --- a/arch/x86/kernel/ldt_32-xen.c
4586 +++ b/arch/x86/kernel/ldt_32-xen.c
4587 @@ -1,5 +1,5 @@
4588 /*
4589 - * linux/kernel/ldt.c
4590 + * linux/arch/i386/kernel/ldt.c
4591 *
4592 * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
4593 * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
4594 --- a/arch/x86/kernel/microcode-xen.c
4595 +++ b/arch/x86/kernel/microcode-xen.c
4596 @@ -2,6 +2,7 @@
4597 * Intel CPU Microcode Update Driver for Linux
4598 *
4599 * Copyright (C) 2000-2004 Tigran Aivazian
4600 + * 2006 Shaohua Li <shaohua.li@intel.com>
4601 *
4602 * This driver allows to upgrade microcode on Intel processors
4603 * belonging to IA-32 family - PentiumPro, Pentium II,
4604 @@ -33,7 +34,9 @@
4605 #include <linux/spinlock.h>
4606 #include <linux/mm.h>
4607 #include <linux/mutex.h>
4608 -#include <linux/syscalls.h>
4609 +#include <linux/cpu.h>
4610 +#include <linux/firmware.h>
4611 +#include <linux/platform_device.h>
4612
4613 #include <asm/msr.h>
4614 #include <asm/uaccess.h>
4615 @@ -55,12 +58,7 @@
4616 /* no concurrent ->write()s are allowed on /dev/cpu/microcode */
4617 static DEFINE_MUTEX(microcode_mutex);
4618
4619 -static int microcode_open (struct inode *unused1, struct file *unused2)
4620 -{
4621 - return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
4622 -}
4623 -
4624 -
4625 +#ifdef CONFIG_MICROCODE_OLD_INTERFACE
4626 static int do_microcode_update (const void __user *ubuf, size_t len)
4627 {
4628 int err;
4629 @@ -85,6 +83,11 @@
4630 return err;
4631 }
4632
4633 +static int microcode_open (struct inode *unused1, struct file *unused2)
4634 +{
4635 + return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
4636 +}
4637 +
4638 static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos)
4639 {
4640 ssize_t ret;
4641 @@ -117,7 +120,7 @@
4642 .fops = &microcode_fops,
4643 };
4644
4645 -static int __init microcode_init (void)
4646 +static int __init microcode_dev_init (void)
4647 {
4648 int error;
4649
4650 @@ -129,6 +132,68 @@
4651 return error;
4652 }
4653
4654 + return 0;
4655 +}
4656 +
4657 +static void __exit microcode_dev_exit (void)
4658 +{
4659 + misc_deregister(&microcode_dev);
4660 +}
4661 +
4662 +MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
4663 +#else
4664 +#define microcode_dev_init() 0
4665 +#define microcode_dev_exit() do { } while(0)
4666 +#endif
4667 +
4668 +/* fake device for request_firmware */
4669 +static struct platform_device *microcode_pdev;
4670 +
4671 +static int request_microcode(void)
4672 +{
4673 + char name[30];
4674 + const struct cpuinfo_x86 *c = &boot_cpu_data;
4675 + const struct firmware *firmware;
4676 + int error;
4677 + struct xen_platform_op op;
4678 +
4679 + sprintf(name,"intel-ucode/%02x-%02x-%02x",
4680 + c->x86, c->x86_model, c->x86_mask);
4681 + error = request_firmware(&firmware, name, &microcode_pdev->dev);
4682 + if (error) {
4683 + pr_debug("ucode data file %s load failed\n", name);
4684 + return error;
4685 + }
4686 +
4687 + op.cmd = XENPF_microcode_update;
4688 + set_xen_guest_handle(op.u.microcode.data, (void *)firmware->data);
4689 + op.u.microcode.length = firmware->size;
4690 + error = HYPERVISOR_platform_op(&op);
4691 +
4692 + release_firmware(firmware);
4693 +
4694 + if (error)
4695 + pr_debug("ucode load failed\n");
4696 +
4697 + return error;
4698 +}
4699 +
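For reference, the firmware name above is built from family/model/stepping as two-digit hex; a CPU reporting family 6, model 15, stepping 2 would, under that scheme, request the blob below from the firmware loader before it is handed to the XENPF_microcode_update hypercall (the concrete values are illustrative):

	char name[30];

	sprintf(name, "intel-ucode/%02x-%02x-%02x", 6, 15, 2);
	/* name is now "intel-ucode/06-0f-02" */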
4700 +static int __init microcode_init (void)
4701 +{
4702 + int error;
4703 +
4704 + error = microcode_dev_init();
4705 + if (error)
4706 + return error;
4707 + microcode_pdev = platform_device_register_simple("microcode", -1,
4708 + NULL, 0);
4709 + if (IS_ERR(microcode_pdev)) {
4710 + microcode_dev_exit();
4711 + return PTR_ERR(microcode_pdev);
4712 + }
4713 +
4714 + request_microcode();
4715 +
4716 printk(KERN_INFO
4717 "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@veritas.com>\n");
4718 return 0;
4719 @@ -136,9 +201,9 @@
4720
4721 static void __exit microcode_exit (void)
4722 {
4723 - misc_deregister(&microcode_dev);
4724 + microcode_dev_exit();
4725 + platform_device_unregister(microcode_pdev);
4726 }
4727
4728 module_init(microcode_init)
4729 module_exit(microcode_exit)
4730 -MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
4731 --- a/arch/x86/kernel/mpparse_32-xen.c
4732 +++ b/arch/x86/kernel/mpparse_32-xen.c
4733 @@ -30,6 +30,7 @@
4734 #include <asm/io_apic.h>
4735
4736 #include <mach_apic.h>
4737 +#include <mach_apicdef.h>
4738 #include <mach_mpparse.h>
4739 #include <bios_ebda.h>
4740
4741 @@ -68,7 +69,7 @@
4742 /* Processor that is doing the boot up */
4743 unsigned int boot_cpu_physical_apicid = -1U;
4744 /* Internal processor count */
4745 -static unsigned int __devinitdata num_processors;
4746 +unsigned int __cpuinitdata num_processors;
4747
4748 /* Bitmask of physically existing CPUs */
4749 physid_mask_t phys_cpu_present_map;
4750 @@ -235,12 +236,14 @@
4751
4752 mpc_oem_bus_info(m, str, translation_table[mpc_record]);
4753
4754 +#if MAX_MP_BUSSES < 256
4755 if (m->mpc_busid >= MAX_MP_BUSSES) {
4756 printk(KERN_WARNING "MP table busid value (%d) for bustype %s "
4757 " is too large, max. supported is %d\n",
4758 m->mpc_busid, str, MAX_MP_BUSSES - 1);
4759 return;
4760 }
4761 +#endif
4762
4763 if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) {
4764 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
4765 @@ -300,19 +303,6 @@
4766 m->mpc_irqtype, m->mpc_irqflag & 3,
4767 (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
4768 m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
4769 - /*
4770 - * Well it seems all SMP boards in existence
4771 - * use ExtINT/LVT1 == LINT0 and
4772 - * NMI/LVT2 == LINT1 - the following check
4773 - * will show us if this assumptions is false.
4774 - * Until then we do not have to add baggage.
4775 - */
4776 - if ((m->mpc_irqtype == mp_ExtINT) &&
4777 - (m->mpc_destapiclint != 0))
4778 - BUG();
4779 - if ((m->mpc_irqtype == mp_NMI) &&
4780 - (m->mpc_destapiclint != 1))
4781 - BUG();
4782 }
4783
4784 #ifdef CONFIG_X86_NUMAQ
4785 @@ -838,8 +828,7 @@
4786
4787 #ifdef CONFIG_ACPI
4788
4789 -void __init mp_register_lapic_address (
4790 - u64 address)
4791 +void __init mp_register_lapic_address(u64 address)
4792 {
4793 #ifndef CONFIG_XEN
4794 mp_lapic_addr = (unsigned long) address;
4795 @@ -853,13 +842,10 @@
4796 #endif
4797 }
4798
4799 -
4800 -void __devinit mp_register_lapic (
4801 - u8 id,
4802 - u8 enabled)
4803 +void __devinit mp_register_lapic (u8 id, u8 enabled)
4804 {
4805 struct mpc_config_processor processor;
4806 - int boot_cpu = 0;
4807 + int boot_cpu = 0;
4808
4809 if (MAX_APICS - id <= 0) {
4810 printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
4811 @@ -898,11 +884,9 @@
4812 u32 pin_programmed[4];
4813 } mp_ioapic_routing[MAX_IO_APICS];
4814
4815 -
4816 -static int mp_find_ioapic (
4817 - int gsi)
4818 +static int mp_find_ioapic (int gsi)
4819 {
4820 - int i = 0;
4821 + int i = 0;
4822
4823 /* Find the IOAPIC that manages this GSI. */
4824 for (i = 0; i < nr_ioapics; i++) {
4825 @@ -915,15 +899,11 @@
4826
4827 return -1;
4828 }
4829 -
4830
4831 -void __init mp_register_ioapic (
4832 - u8 id,
4833 - u32 address,
4834 - u32 gsi_base)
4835 +void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
4836 {
4837 - int idx = 0;
4838 - int tmpid;
4839 + int idx = 0;
4840 + int tmpid;
4841
4842 if (nr_ioapics >= MAX_IO_APICS) {
4843 printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
4844 @@ -971,16 +951,10 @@
4845 mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
4846 mp_ioapic_routing[idx].gsi_base,
4847 mp_ioapic_routing[idx].gsi_end);
4848 -
4849 - return;
4850 }
4851
4852 -
4853 -void __init mp_override_legacy_irq (
4854 - u8 bus_irq,
4855 - u8 polarity,
4856 - u8 trigger,
4857 - u32 gsi)
4858 +void __init
4859 +mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
4860 {
4861 struct mpc_config_intsrc intsrc;
4862 int ioapic = -1;
4863 @@ -1018,15 +992,13 @@
4864 mp_irqs[mp_irq_entries] = intsrc;
4865 if (++mp_irq_entries == MAX_IRQ_SOURCES)
4866 panic("Max # of irq sources exceeded!\n");
4867 -
4868 - return;
4869 }
4870
4871 void __init mp_config_acpi_legacy_irqs (void)
4872 {
4873 struct mpc_config_intsrc intsrc;
4874 - int i = 0;
4875 - int ioapic = -1;
4876 + int i = 0;
4877 + int ioapic = -1;
4878
4879 /*
4880 * Fabricate the legacy ISA bus (bus #31).
4881 @@ -1095,12 +1067,12 @@
4882
4883 #define MAX_GSI_NUM 4096
4884
4885 -int mp_register_gsi (u32 gsi, int triggering, int polarity)
4886 +int mp_register_gsi(u32 gsi, int triggering, int polarity)
4887 {
4888 - int ioapic = -1;
4889 - int ioapic_pin = 0;
4890 - int idx, bit = 0;
4891 - static int pci_irq = 16;
4892 + int ioapic = -1;
4893 + int ioapic_pin = 0;
4894 + int idx, bit = 0;
4895 + static int pci_irq = 16;
4896 /*
4897 * Mapping between Global System Interrups, which
4898 * represent all possible interrupts, and IRQs
4899 --- a/arch/x86/kernel/mpparse_64-xen.c
4900 +++ b/arch/x86/kernel/mpparse_64-xen.c
4901 @@ -41,8 +41,7 @@
4902 * Various Linux-internal data structures created from the
4903 * MP-table.
4904 */
4905 -unsigned char apic_version [MAX_APICS];
4906 -unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
4907 +DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
4908 int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
4909
4910 static int mp_current_pci_id = 0;
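On 64-bit the per-bus type array (ISA/EISA/MCA/PCI) collapses into one bitmap where a set bit means "not PCI". Call sites change roughly as sketched below; "bus" is a placeholder index and the comment bodies stand in for the existing ISA-style IRQ decoding:

	/* before this patch: array of bus types */
	if (mp_bus_id_to_type[bus] == MP_BUS_ISA) {
		/* ... ISA-style IRQ decoding ... */
	}

	/* after: a single "not PCI" bitmap */
	if (test_bit(bus, mp_bus_not_pci)) {
		/* ... same ISA-style path ... */
	}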
4911 @@ -56,7 +55,6 @@
4912 int mp_irq_entries;
4913
4914 int nr_ioapics;
4915 -int pic_mode;
4916 unsigned long mp_lapic_addr = 0;
4917
4918
4919 @@ -71,19 +69,6 @@
4920 /* Bitmask of physically existing CPUs */
4921 physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
4922
4923 -/* ACPI MADT entry parsing functions */
4924 -#ifdef CONFIG_ACPI
4925 -extern struct acpi_boot_flags acpi_boot;
4926 -#ifdef CONFIG_X86_LOCAL_APIC
4927 -extern int acpi_parse_lapic (acpi_table_entry_header *header);
4928 -extern int acpi_parse_lapic_addr_ovr (acpi_table_entry_header *header);
4929 -extern int acpi_parse_lapic_nmi (acpi_table_entry_header *header);
4930 -#endif /*CONFIG_X86_LOCAL_APIC*/
4931 -#ifdef CONFIG_X86_IO_APIC
4932 -extern int acpi_parse_ioapic (acpi_table_entry_header *header);
4933 -#endif /*CONFIG_X86_IO_APIC*/
4934 -#endif /*CONFIG_ACPI*/
4935 -
4936 u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
4937
4938
4939 @@ -109,24 +94,20 @@
4940 static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
4941 {
4942 int cpu;
4943 - unsigned char ver;
4944 cpumask_t tmp_map;
4945 + char *bootup_cpu = "";
4946
4947 if (!(m->mpc_cpuflag & CPU_ENABLED)) {
4948 disabled_cpus++;
4949 return;
4950 }
4951 -
4952 - printk(KERN_INFO "Processor #%d %d:%d APIC version %d\n",
4953 - m->mpc_apicid,
4954 - (m->mpc_cpufeature & CPU_FAMILY_MASK)>>8,
4955 - (m->mpc_cpufeature & CPU_MODEL_MASK)>>4,
4956 - m->mpc_apicver);
4957 -
4958 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
4959 - Dprintk(" Bootup CPU\n");
4960 + bootup_cpu = " (Bootup-CPU)";
4961 boot_cpu_id = m->mpc_apicid;
4962 }
4963 +
4964 + printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu);
4965 +
4966 if (num_processors >= NR_CPUS) {
4967 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
4968 " Processor ignored.\n", NR_CPUS);
4969 @@ -137,24 +118,7 @@
4970 cpus_complement(tmp_map, cpu_present_map);
4971 cpu = first_cpu(tmp_map);
4972
4973 -#if MAX_APICS < 255
4974 - if ((int)m->mpc_apicid > MAX_APICS) {
4975 - printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
4976 - m->mpc_apicid, MAX_APICS);
4977 - return;
4978 - }
4979 -#endif
4980 - ver = m->mpc_apicver;
4981 -
4982 physid_set(m->mpc_apicid, phys_cpu_present_map);
4983 - /*
4984 - * Validate version
4985 - */
4986 - if (ver == 0x0) {
4987 - printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! fixing up to 0x10. (tell your hw vendor)\n", m->mpc_apicid);
4988 - ver = 0x10;
4989 - }
4990 - apic_version[m->mpc_apicid] = ver;
4991 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
4992 /*
4993 * bios_cpu_apicid is required to have processors listed
4994 @@ -185,37 +149,42 @@
4995 Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
4996
4997 if (strncmp(str, "ISA", 3) == 0) {
4998 - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
4999 - } else if (strncmp(str, "EISA", 4) == 0) {
5000 - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
5001 + set_bit(m->mpc_busid, mp_bus_not_pci);
5002 } else if (strncmp(str, "PCI", 3) == 0) {
5003 - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
5004 + clear_bit(m->mpc_busid, mp_bus_not_pci);
5005 mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
5006 mp_current_pci_id++;
5007 - } else if (strncmp(str, "MCA", 3) == 0) {
5008 - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
5009 } else {
5010 printk(KERN_ERR "Unknown bustype %s\n", str);
5011 }
5012 }
5013
5014 +static int bad_ioapic(unsigned long address)
5015 +{
5016 + if (nr_ioapics >= MAX_IO_APICS) {
5017 + printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
5018 + "(found %d)\n", MAX_IO_APICS, nr_ioapics);
5019 + panic("Recompile kernel with bigger MAX_IO_APICS!\n");
5020 + }
5021 + if (!address) {
5022 + printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
5023 + " found in table, skipping!\n");
5024 + return 1;
5025 + }
5026 + return 0;
5027 +}
5028 +
5029 static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
5030 {
5031 if (!(m->mpc_flags & MPC_APIC_USABLE))
5032 return;
5033
5034 - printk("I/O APIC #%d Version %d at 0x%X.\n",
5035 - m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
5036 - if (nr_ioapics >= MAX_IO_APICS) {
5037 - printk(KERN_ERR "Max # of I/O APICs (%d) exceeded (found %d).\n",
5038 - MAX_IO_APICS, nr_ioapics);
5039 - panic("Recompile kernel with bigger MAX_IO_APICS!.\n");
5040 - }
5041 - if (!m->mpc_apicaddr) {
5042 - printk(KERN_ERR "WARNING: bogus zero I/O APIC address"
5043 - " found in MP table, skipping!\n");
5044 + printk("I/O APIC #%d at 0x%X.\n",
5045 + m->mpc_apicid, m->mpc_apicaddr);
5046 +
5047 + if (bad_ioapic(m->mpc_apicaddr))
5048 return;
5049 - }
5050 +
5051 mp_ioapics[nr_ioapics] = *m;
5052 nr_ioapics++;
5053 }
5054 @@ -239,19 +208,6 @@
5055 m->mpc_irqtype, m->mpc_irqflag & 3,
5056 (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
5057 m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
5058 - /*
5059 - * Well it seems all SMP boards in existence
5060 - * use ExtINT/LVT1 == LINT0 and
5061 - * NMI/LVT2 == LINT1 - the following check
5062 - * will show us if this assumptions is false.
5063 - * Until then we do not have to add baggage.
5064 - */
5065 - if ((m->mpc_irqtype == mp_ExtINT) &&
5066 - (m->mpc_destapiclint != 0))
5067 - BUG();
5068 - if ((m->mpc_irqtype == mp_NMI) &&
5069 - (m->mpc_destapiclint != 1))
5070 - BUG();
5071 }
5072
5073 /*
5074 @@ -265,7 +221,7 @@
5075 unsigned char *mpt=((unsigned char *)mpc)+count;
5076
5077 if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
5078 - printk("SMP mptable: bad signature [%c%c%c%c]!\n",
5079 + printk("MPTABLE: bad signature [%c%c%c%c]!\n",
5080 mpc->mpc_signature[0],
5081 mpc->mpc_signature[1],
5082 mpc->mpc_signature[2],
5083 @@ -273,31 +229,31 @@
5084 return 0;
5085 }
5086 if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
5087 - printk("SMP mptable: checksum error!\n");
5088 + printk("MPTABLE: checksum error!\n");
5089 return 0;
5090 }
5091 if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
5092 - printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n",
5093 + printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n",
5094 mpc->mpc_spec);
5095 return 0;
5096 }
5097 if (!mpc->mpc_lapic) {
5098 - printk(KERN_ERR "SMP mptable: null local APIC address!\n");
5099 + printk(KERN_ERR "MPTABLE: null local APIC address!\n");
5100 return 0;
5101 }
5102 memcpy(str,mpc->mpc_oem,8);
5103 - str[8]=0;
5104 - printk(KERN_INFO "OEM ID: %s ",str);
5105 + str[8] = 0;
5106 + printk(KERN_INFO "MPTABLE: OEM ID: %s ",str);
5107
5108 memcpy(str,mpc->mpc_productid,12);
5109 - str[12]=0;
5110 - printk("Product ID: %s ",str);
5111 + str[12] = 0;
5112 + printk("MPTABLE: Product ID: %s ",str);
5113
5114 - printk("APIC at: 0x%X\n",mpc->mpc_lapic);
5115 + printk("MPTABLE: APIC at: 0x%X\n",mpc->mpc_lapic);
5116
5117 /* save the local APIC address, it might be non-default */
5118 if (!acpi_lapic)
5119 - mp_lapic_addr = mpc->mpc_lapic;
5120 + mp_lapic_addr = mpc->mpc_lapic;
5121
5122 /*
5123 * Now process the configuration blocks.
5124 @@ -309,7 +265,7 @@
5125 struct mpc_config_processor *m=
5126 (struct mpc_config_processor *)mpt;
5127 if (!acpi_lapic)
5128 - MP_processor_info(m);
5129 + MP_processor_info(m);
5130 mpt += sizeof(*m);
5131 count += sizeof(*m);
5132 break;
5133 @@ -328,8 +284,8 @@
5134 struct mpc_config_ioapic *m=
5135 (struct mpc_config_ioapic *)mpt;
5136 MP_ioapic_info(m);
5137 - mpt+=sizeof(*m);
5138 - count+=sizeof(*m);
5139 + mpt += sizeof(*m);
5140 + count += sizeof(*m);
5141 break;
5142 }
5143 case MP_INTSRC:
5144 @@ -338,8 +294,8 @@
5145 (struct mpc_config_intsrc *)mpt;
5146
5147 MP_intsrc_info(m);
5148 - mpt+=sizeof(*m);
5149 - count+=sizeof(*m);
5150 + mpt += sizeof(*m);
5151 + count += sizeof(*m);
5152 break;
5153 }
5154 case MP_LINTSRC:
5155 @@ -347,15 +303,15 @@
5156 struct mpc_config_lintsrc *m=
5157 (struct mpc_config_lintsrc *)mpt;
5158 MP_lintsrc_info(m);
5159 - mpt+=sizeof(*m);
5160 - count+=sizeof(*m);
5161 + mpt += sizeof(*m);
5162 + count += sizeof(*m);
5163 break;
5164 }
5165 }
5166 }
5167 clustered_apic_check();
5168 if (!num_processors)
5169 - printk(KERN_ERR "SMP mptable: no processors registered!\n");
5170 + printk(KERN_ERR "MPTABLE: no processors registered!\n");
5171 return num_processors;
5172 }
5173
5174 @@ -451,13 +407,10 @@
5175 * 2 CPUs, numbered 0 & 1.
5176 */
5177 processor.mpc_type = MP_PROCESSOR;
5178 - /* Either an integrated APIC or a discrete 82489DX. */
5179 - processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
5180 + processor.mpc_apicver = 0;
5181 processor.mpc_cpuflag = CPU_ENABLED;
5182 - processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
5183 - (boot_cpu_data.x86_model << 4) |
5184 - boot_cpu_data.x86_mask;
5185 - processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
5186 + processor.mpc_cpufeature = 0;
5187 + processor.mpc_featureflag = 0;
5188 processor.mpc_reserved[0] = 0;
5189 processor.mpc_reserved[1] = 0;
5190 for (i = 0; i < 2; i++) {
5191 @@ -476,14 +429,6 @@
5192 case 5:
5193 memcpy(bus.mpc_bustype, "ISA ", 6);
5194 break;
5195 - case 2:
5196 - case 6:
5197 - case 3:
5198 - memcpy(bus.mpc_bustype, "EISA ", 6);
5199 - break;
5200 - case 4:
5201 - case 7:
5202 - memcpy(bus.mpc_bustype, "MCA ", 6);
5203 }
5204 MP_bus_info(&bus);
5205 if (mpc_default_type > 4) {
5206 @@ -494,7 +439,7 @@
5207
5208 ioapic.mpc_type = MP_IOAPIC;
5209 ioapic.mpc_apicid = 2;
5210 - ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
5211 + ioapic.mpc_apicver = 0;
5212 ioapic.mpc_flags = MPC_APIC_USABLE;
5213 ioapic.mpc_apicaddr = 0xFEC00000;
5214 MP_ioapic_info(&ioapic);
5215 @@ -537,13 +482,6 @@
5216 printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
5217
5218 printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
5219 - if (mpf->mpf_feature2 & (1<<7)) {
5220 - printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
5221 - pic_mode = 1;
5222 - } else {
5223 - printk(KERN_INFO " Virtual Wire compatibility mode.\n");
5224 - pic_mode = 0;
5225 - }
5226
5227 /*
5228 * Now see if we need to read further.
5229 @@ -620,7 +558,7 @@
5230 return 0;
5231 }
5232
5233 -void __init find_intel_smp (void)
5234 +void __init find_smp_config(void)
5235 {
5236 unsigned int address;
5237
5238 @@ -637,9 +575,7 @@
5239 smp_scan_config(0xF0000,0x10000))
5240 return;
5241 /*
5242 - * If it is an SMP machine we should know now, unless the
5243 - * configuration is in an EISA/MCA bus machine with an
5244 - * extended bios data area.
5245 + * If it is an SMP machine we should know now.
5246 *
5247 * there is a real-mode segmented pointer pointing to the
5248 * 4K EBDA area at 0x40E, calculate and scan it here.
5249 @@ -660,64 +596,38 @@
5250 printk(KERN_INFO "No mptable found.\n");
5251 }
5252
5253 -/*
5254 - * - Intel MP Configuration Table
5255 - */
5256 -void __init find_smp_config (void)
5257 -{
5258 -#ifdef CONFIG_X86_LOCAL_APIC
5259 - find_intel_smp();
5260 -#endif
5261 -}
5262 -
5263 -
5264 /* --------------------------------------------------------------------------
5265 ACPI-based MP Configuration
5266 -------------------------------------------------------------------------- */
5267
5268 #ifdef CONFIG_ACPI
5269
5270 -void __init mp_register_lapic_address (
5271 - u64 address)
5272 +void __init mp_register_lapic_address(u64 address)
5273 {
5274 #ifndef CONFIG_XEN
5275 mp_lapic_addr = (unsigned long) address;
5276 -
5277 set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
5278 -
5279 if (boot_cpu_id == -1U)
5280 boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
5281 -
5282 - Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
5283 #endif
5284 }
5285
5286 -
5287 -void __cpuinit mp_register_lapic (
5288 - u8 id,
5289 - u8 enabled)
5290 +void __cpuinit mp_register_lapic (u8 id, u8 enabled)
5291 {
5292 struct mpc_config_processor processor;
5293 int boot_cpu = 0;
5294
5295 - if (id >= MAX_APICS) {
5296 - printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
5297 - id, MAX_APICS);
5298 - return;
5299 - }
5300 -
5301 - if (id == boot_cpu_physical_apicid)
5302 + if (id == boot_cpu_id)
5303 boot_cpu = 1;
5304
5305 #ifndef CONFIG_XEN
5306 processor.mpc_type = MP_PROCESSOR;
5307 processor.mpc_apicid = id;
5308 - processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR));
5309 + processor.mpc_apicver = 0;
5310 processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
5311 processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
5312 - processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
5313 - (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
5314 - processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
5315 + processor.mpc_cpufeature = 0;
5316 + processor.mpc_featureflag = 0;
5317 processor.mpc_reserved[0] = 0;
5318 processor.mpc_reserved[1] = 0;
5319 #endif
5320 @@ -725,8 +635,6 @@
5321 MP_processor_info(&processor);
5322 }
5323
5324 -#ifdef CONFIG_X86_IO_APIC
5325 -
5326 #define MP_ISA_BUS 0
5327 #define MP_MAX_IOAPIC_PIN 127
5328
5329 @@ -737,11 +645,9 @@
5330 u32 pin_programmed[4];
5331 } mp_ioapic_routing[MAX_IO_APICS];
5332
5333 -
5334 -static int mp_find_ioapic (
5335 - int gsi)
5336 +static int mp_find_ioapic(int gsi)
5337 {
5338 - int i = 0;
5339 + int i = 0;
5340
5341 /* Find the IOAPIC that manages this GSI. */
5342 for (i = 0; i < nr_ioapics; i++) {
5343 @@ -751,28 +657,15 @@
5344 }
5345
5346 printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
5347 -
5348 return -1;
5349 }
5350 -
5351
5352 -void __init mp_register_ioapic (
5353 - u8 id,
5354 - u32 address,
5355 - u32 gsi_base)
5356 +void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
5357 {
5358 - int idx = 0;
5359 + int idx = 0;
5360
5361 - if (nr_ioapics >= MAX_IO_APICS) {
5362 - printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
5363 - "(found %d)\n", MAX_IO_APICS, nr_ioapics);
5364 - panic("Recompile kernel with bigger MAX_IO_APICS!\n");
5365 - }
5366 - if (!address) {
5367 - printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
5368 - " found in MADT table, skipping!\n");
5369 + if (bad_ioapic(address))
5370 return;
5371 - }
5372
5373 idx = nr_ioapics++;
5374
5375 @@ -784,7 +677,7 @@
5376 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
5377 #endif
5378 mp_ioapics[idx].mpc_apicid = id;
5379 - mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
5380 + mp_ioapics[idx].mpc_apicver = 0;
5381
5382 /*
5383 * Build basic IRQ lookup table to facilitate gsi->io_apic lookups
5384 @@ -795,21 +688,15 @@
5385 mp_ioapic_routing[idx].gsi_end = gsi_base +
5386 io_apic_get_redir_entries(idx);
5387
5388 - printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
5389 + printk(KERN_INFO "IOAPIC[%d]: apic_id %d, address 0x%x, "
5390 "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
5391 - mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
5392 + mp_ioapics[idx].mpc_apicaddr,
5393 mp_ioapic_routing[idx].gsi_start,
5394 mp_ioapic_routing[idx].gsi_end);
5395 -
5396 - return;
5397 }
5398
5399 -
5400 -void __init mp_override_legacy_irq (
5401 - u8 bus_irq,
5402 - u8 polarity,
5403 - u8 trigger,
5404 - u32 gsi)
5405 +void __init
5406 +mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
5407 {
5408 struct mpc_config_intsrc intsrc;
5409 int ioapic = -1;
5410 @@ -847,22 +734,18 @@
5411 mp_irqs[mp_irq_entries] = intsrc;
5412 if (++mp_irq_entries == MAX_IRQ_SOURCES)
5413 panic("Max # of irq sources exceeded!\n");
5414 -
5415 - return;
5416 }
5417
5418 -
5419 -void __init mp_config_acpi_legacy_irqs (void)
5420 +void __init mp_config_acpi_legacy_irqs(void)
5421 {
5422 struct mpc_config_intsrc intsrc;
5423 - int i = 0;
5424 - int ioapic = -1;
5425 + int i = 0;
5426 + int ioapic = -1;
5427
5428 /*
5429 * Fabricate the legacy ISA bus (bus #31).
5430 */
5431 - mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
5432 - Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
5433 + set_bit(MP_ISA_BUS, mp_bus_not_pci);
5434
5435 /*
5436 * Locate the IOAPIC that manages the ISA IRQs (0-15).
5437 @@ -915,24 +798,13 @@
5438 if (++mp_irq_entries == MAX_IRQ_SOURCES)
5439 panic("Max # of irq sources exceeded!\n");
5440 }
5441 -
5442 - return;
5443 }
5444
5445 -#define MAX_GSI_NUM 4096
5446 -
5447 int mp_register_gsi(u32 gsi, int triggering, int polarity)
5448 {
5449 - int ioapic = -1;
5450 - int ioapic_pin = 0;
5451 - int idx, bit = 0;
5452 - static int pci_irq = 16;
5453 - /*
5454 - * Mapping between Global System Interrupts, which
5455 - * represent all possible interrupts, to the IRQs
5456 - * assigned to actual devices.
5457 - */
5458 - static int gsi_to_irq[MAX_GSI_NUM];
5459 + int ioapic = -1;
5460 + int ioapic_pin = 0;
5461 + int idx, bit = 0;
5462
5463 if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
5464 return gsi;
5465 @@ -965,47 +837,14 @@
5466 if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
5467 Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
5468 mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
5469 - return gsi_to_irq[gsi];
5470 + return gsi;
5471 }
5472
5473 mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
5474
5475 - if (triggering == ACPI_LEVEL_SENSITIVE) {
5476 - /*
5477 - * For PCI devices assign IRQs in order, avoiding gaps
5478 - * due to unused I/O APIC pins.
5479 - */
5480 - int irq = gsi;
5481 - if (gsi < MAX_GSI_NUM) {
5482 - /*
5483 - * Retain the VIA chipset work-around (gsi > 15), but
5484 - * avoid a problem where the 8254 timer (IRQ0) is setup
5485 - * via an override (so it's not on pin 0 of the ioapic),
5486 - * and at the same time, the pin 0 interrupt is a PCI
5487 - * type. The gsi > 15 test could cause these two pins
5488 - * to be shared as IRQ0, and they are not shareable.
5489 - * So test for this condition, and if necessary, avoid
5490 - * the pin collision.
5491 - */
5492 - if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0))
5493 - gsi = pci_irq++;
5494 - /*
5495 - * Don't assign IRQ used by ACPI SCI
5496 - */
5497 - if (gsi == acpi_fadt.sci_int)
5498 - gsi = pci_irq++;
5499 - gsi_to_irq[irq] = gsi;
5500 - } else {
5501 - printk(KERN_ERR "GSI %u is too high\n", gsi);
5502 - return gsi;
5503 - }
5504 - }
5505 -
5506 io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
5507 triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
5508 polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
5509 return gsi;
5510 }
5511 -
5512 -#endif /*CONFIG_X86_IO_APIC*/
5513 #endif /*CONFIG_ACPI*/
5514 --- a/arch/x86/kernel/pci-dma_32-xen.c
5515 +++ b/arch/x86/kernel/pci-dma_32-xen.c
5516 @@ -116,8 +116,7 @@
5517 {
5518 int i, rc;
5519
5520 - if (direction == DMA_NONE)
5521 - BUG();
5522 + BUG_ON(!valid_dma_direction(direction));
5523 WARN_ON(nents == 0 || sg[0].length == 0);
5524
5525 if (swiotlb) {
5526 @@ -148,7 +147,7 @@
5527 {
5528 int i;
5529
5530 - BUG_ON(direction == DMA_NONE);
5531 + BUG_ON(!valid_dma_direction(direction));
5532 if (swiotlb)
5533 swiotlb_unmap_sg(hwdev, sg, nents, direction);
5534 else {
5535 @@ -165,8 +164,7 @@
5536 {
5537 dma_addr_t dma_addr;
5538
5539 - BUG_ON(direction == DMA_NONE);
5540 -
5541 + BUG_ON(!valid_dma_direction(direction));
5542 if (swiotlb) {
5543 dma_addr = swiotlb_map_page(
5544 dev, page, offset, size, direction);
5545 @@ -183,7 +181,7 @@
5546 dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
5547 enum dma_data_direction direction)
5548 {
5549 - BUG_ON(direction == DMA_NONE);
5550 + BUG_ON(!valid_dma_direction(direction));
5551 if (swiotlb)
5552 swiotlb_unmap_page(dev, dma_address, size, direction);
5553 else
5554 @@ -365,8 +363,7 @@
5555 {
5556 dma_addr_t dma;
5557
5558 - if (direction == DMA_NONE)
5559 - BUG();
5560 + BUG_ON(!valid_dma_direction(direction));
5561 WARN_ON(size == 0);
5562
5563 if (swiotlb) {
5564 @@ -387,8 +384,7 @@
5565 dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
5566 enum dma_data_direction direction)
5567 {
5568 - if (direction == DMA_NONE)
5569 - BUG();
5570 + BUG_ON(!valid_dma_direction(direction));
5571 if (swiotlb)
5572 swiotlb_unmap_single(dev, dma_addr, size, direction);
5573 else
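The open-coded direction == DMA_NONE checks in this file become BUG_ON(!valid_dma_direction(direction)), which also rejects values outside the enum. For reference, the generic helper lives in the dma-mapping headers and, assuming the stock 2.6.19 definition, amounts to:

static inline int valid_dma_direction(int dma_direction)
{
	return ((dma_direction == DMA_BIDIRECTIONAL) ||
		(dma_direction == DMA_TO_DEVICE) ||
		(dma_direction == DMA_FROM_DEVICE));
}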
5574 --- a/arch/x86/kernel/pci-swiotlb_64-xen.c
5575 +++ b/arch/x86/kernel/pci-swiotlb_64-xen.c
5576 @@ -3,7 +3,8 @@
5577 #include <linux/pci.h>
5578 #include <linux/cache.h>
5579 #include <linux/module.h>
5580 -#include <asm/dma-mapping.h>
5581 +#include <linux/dma-mapping.h>
5582 +
5583 #include <asm/proto.h>
5584 #include <asm/swiotlb.h>
5585 #include <asm/dma.h>
5586 --- a/arch/x86/kernel/process_32-xen.c
5587 +++ b/arch/x86/kernel/process_32-xen.c
5588 @@ -37,6 +37,7 @@
5589 #include <linux/kallsyms.h>
5590 #include <linux/ptrace.h>
5591 #include <linux/random.h>
5592 +#include <linux/personality.h>
5593
5594 #include <asm/uaccess.h>
5595 #include <asm/pgtable.h>
5596 @@ -186,7 +187,7 @@
5597 void cpu_idle_wait(void)
5598 {
5599 unsigned int cpu, this_cpu = get_cpu();
5600 - cpumask_t map;
5601 + cpumask_t map, tmp = current->cpus_allowed;
5602
5603 set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
5604 put_cpu();
5605 @@ -208,6 +209,8 @@
5606 }
5607 cpus_and(map, map, cpu_online_map);
5608 } while (!cpus_empty(map));
5609 +
5610 + set_cpus_allowed(current, tmp);
5611 }
5612 EXPORT_SYMBOL_GPL(cpu_idle_wait);
5613
5614 @@ -240,9 +243,9 @@
5615 if (user_mode_vm(regs))
5616 printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp);
5617 printk(" EFLAGS: %08lx %s (%s %.*s)\n",
5618 - regs->eflags, print_tainted(), system_utsname.release,
5619 - (int)strcspn(system_utsname.version, " "),
5620 - system_utsname.version);
5621 + regs->eflags, print_tainted(), init_utsname()->release,
5622 + (int)strcspn(init_utsname()->version, " "),
5623 + init_utsname()->version);
5624 printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
5625 regs->eax,regs->ebx,regs->ecx,regs->edx);
5626 printk("ESI: %08lx EDI: %08lx EBP: %08lx",
5627 @@ -264,15 +267,6 @@
5628 * the "args".
5629 */
5630 extern void kernel_thread_helper(void);
5631 -__asm__(".section .text\n"
5632 - ".align 4\n"
5633 - "kernel_thread_helper:\n\t"
5634 - "movl %edx,%eax\n\t"
5635 - "pushl %edx\n\t"
5636 - "call *%ebx\n\t"
5637 - "pushl %eax\n\t"
5638 - "call do_exit\n"
5639 - ".previous");
5640
5641 /*
5642 * Create a kernel thread
5643 @@ -290,7 +284,7 @@
5644 regs.xes = __USER_DS;
5645 regs.orig_eax = -1;
5646 regs.eip = (unsigned long) kernel_thread_helper;
5647 - regs.xcs = GET_KERNEL_CS();
5648 + regs.xcs = __KERNEL_CS | get_kernel_rpl();
5649 regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
5650
5651 /* Ok, create the new process.. */
5652 @@ -369,13 +363,12 @@
5653
5654 tsk = current;
5655 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
5656 - p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
5657 + p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
5658 + IO_BITMAP_BYTES, GFP_KERNEL);
5659 if (!p->thread.io_bitmap_ptr) {
5660 p->thread.io_bitmap_max = 0;
5661 return -ENOMEM;
5662 }
5663 - memcpy(p->thread.io_bitmap_ptr, tsk->thread.io_bitmap_ptr,
5664 - IO_BITMAP_BYTES);
5665 set_tsk_thread_flag(p, TIF_IO_BITMAP);
5666 }
5667
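kmemdup(), new in 2.6.19, folds the kmalloc() plus memcpy() pair replaced above into one call. Assuming the stock definition, it behaves like:

void *kmemdup(const void *src, size_t len, gfp_t gfp)
{
	void *p = kmalloc(len, gfp);	/* NULL on allocation failure */

	if (p)
		memcpy(p, src, len);
	return p;
}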
5668 @@ -850,7 +843,7 @@
5669
5670 unsigned long arch_align_stack(unsigned long sp)
5671 {
5672 - if (randomize_va_space)
5673 + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
5674 sp -= get_random_int() % 8192;
5675 return sp & ~0xf;
5676 }
5677 --- a/arch/x86/kernel/process_64-xen.c
5678 +++ b/arch/x86/kernel/process_64-xen.c
5679 @@ -89,25 +89,24 @@
5680 }
5681 EXPORT_SYMBOL(idle_notifier_unregister);
5682
5683 -enum idle_state { CPU_IDLE, CPU_NOT_IDLE };
5684 -static DEFINE_PER_CPU(enum idle_state, idle_state) = CPU_NOT_IDLE;
5685 -
5686 void enter_idle(void)
5687 {
5688 - __get_cpu_var(idle_state) = CPU_IDLE;
5689 + write_pda(isidle, 1);
5690 atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
5691 }
5692
5693 static void __exit_idle(void)
5694 {
5695 - __get_cpu_var(idle_state) = CPU_NOT_IDLE;
5696 + if (test_and_clear_bit_pda(0, isidle) == 0)
5697 + return;
5698 atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
5699 }
5700
5701 /* Called from interrupts to signify idle end */
5702 void exit_idle(void)
5703 {
5704 - if (current->pid | read_pda(irqcount))
5705 + /* idle loop has pid 0 */
5706 + if (current->pid)
5707 return;
5708 __exit_idle();
5709 }
5710 @@ -184,6 +183,9 @@
5711 play_dead();
5712 enter_idle();
5713 idle();
5714 + /* In many cases the interrupt that ended idle
5715 + has already called exit_idle. But some idle
5716 + loops can be woken up without interrupt. */
5717 __exit_idle();
5718 }
5719
5720 @@ -196,7 +198,7 @@
5721 void cpu_idle_wait(void)
5722 {
5723 unsigned int cpu, this_cpu = get_cpu();
5724 - cpumask_t map;
5725 + cpumask_t map, tmp = current->cpus_allowed;
5726
5727 set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
5728 put_cpu();
5729 @@ -219,6 +221,8 @@
5730 }
5731 cpus_and(map, map, cpu_online_map);
5732 } while (!cpus_empty(map));
5733 +
5734 + set_cpus_allowed(current, tmp);
5735 }
5736 EXPORT_SYMBOL_GPL(cpu_idle_wait);
5737
5738 @@ -250,9 +254,9 @@
5739 print_modules();
5740 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
5741 current->pid, current->comm, print_tainted(),
5742 - system_utsname.release,
5743 - (int)strcspn(system_utsname.version, " "),
5744 - system_utsname.version);
5745 + init_utsname()->release,
5746 + (int)strcspn(init_utsname()->version, " "),
5747 + init_utsname()->version);
5748 printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
5749 printk_address(regs->rip);
5750 printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
5751 @@ -310,6 +314,7 @@
5752
5753 kfree(t->io_bitmap_ptr);
5754 t->io_bitmap_ptr = NULL;
5755 + clear_thread_flag(TIF_IO_BITMAP);
5756 /*
5757 * Careful, clear this in the TSS too:
5758 */
5759 @@ -340,6 +345,7 @@
5760 if (t->flags & _TIF_IA32)
5761 current_thread_info()->status |= TS_COMPAT;
5762 }
5763 + t->flags &= ~_TIF_DEBUG;
5764
5765 tsk->thread.debugreg0 = 0;
5766 tsk->thread.debugreg1 = 0;
5767 @@ -432,7 +438,7 @@
5768 asm("mov %%es,%0" : "=m" (p->thread.es));
5769 asm("mov %%ds,%0" : "=m" (p->thread.ds));
5770
5771 - if (unlikely(me->thread.io_bitmap_ptr != NULL)) {
5772 + if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
5773 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
5774 if (!p->thread.io_bitmap_ptr) {
5775 p->thread.io_bitmap_max = 0;
5776 @@ -440,6 +446,7 @@
5777 }
5778 memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
5779 IO_BITMAP_BYTES);
5780 + set_tsk_thread_flag(p, TIF_IO_BITMAP);
5781 }
5782
5783 /*
5784 @@ -474,6 +481,30 @@
5785 }
5786
5787 /*
5788 + * This special macro can be used to load a debugging register
5789 + */
5790 +#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r)
5791 +
5792 +static inline void __switch_to_xtra(struct task_struct *prev_p,
5793 + struct task_struct *next_p)
5794 +{
5795 + struct thread_struct *prev, *next;
5796 +
5797 + prev = &prev_p->thread,
5798 + next = &next_p->thread;
5799 +
5800 + if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
5801 + loaddebug(next, 0);
5802 + loaddebug(next, 1);
5803 + loaddebug(next, 2);
5804 + loaddebug(next, 3);
5805 + /* no 4 and 5 */
5806 + loaddebug(next, 6);
5807 + loaddebug(next, 7);
5808 + }
5809 +}
5810 +
5811 +/*
5812 * switch_to(x,y) should switch tasks from x to y.
5813 *
5814 * This could still be optimized:
5815 @@ -495,6 +526,10 @@
5816 struct physdev_set_iobitmap iobmp_op;
5817 multicall_entry_t _mcl[8], *mcl = _mcl;
5818
5819 + /* we're going to use this soon, after a few expensive things */
5820 + if (next_p->fpu_counter>5)
5821 + prefetch(&next->i387.fxsave);
5822 +
5823 /*
5824 * This is basically '__unlazy_fpu', except that we queue a
5825 * multicall to indicate FPU task switch, rather than
5826 @@ -507,7 +542,8 @@
5827 mcl->op = __HYPERVISOR_fpu_taskswitch;
5828 mcl->args[0] = 1;
5829 mcl++;
5830 - }
5831 + } else
5832 + prev_p->fpu_counter = 0;
5833
5834 /*
5835 * Reload esp0, LDT and the page table pointer:
5836 @@ -587,21 +623,29 @@
5837 write_pda(oldrsp, next->userrsp);
5838 write_pda(pcurrent, next_p);
5839 write_pda(kernelstack,
5840 - task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
5841 + (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
5842 +#ifdef CONFIG_CC_STACKPROTECTOR
5843 + write_pda(stack_canary, next_p->stack_canary);
5844 +
5845 + /*
5846 + * Build time only check to make sure the stack_canary is at
5847 + * offset 40 in the pda; this is a gcc ABI requirement
5848 + */
5849 + BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
5850 +#endif
5851
5852 /*
5853 * Now maybe reload the debug registers
5854 */
5855 - if (unlikely(next->debugreg7)) {
5856 - set_debugreg(next->debugreg0, 0);
5857 - set_debugreg(next->debugreg1, 1);
5858 - set_debugreg(next->debugreg2, 2);
5859 - set_debugreg(next->debugreg3, 3);
5860 - /* no 4 and 5 */
5861 - set_debugreg(next->debugreg6, 6);
5862 - set_debugreg(next->debugreg7, 7);
5863 - }
5864 + if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
5865 + __switch_to_xtra(prev_p, next_p);
5866
5867 + /* If the task has used fpu the last 5 timeslices, just do a full
5868 + * restore of the math state immediately to avoid the trap; the
5869 + * chances of needing FPU soon are obviously high now
5870 + */
5871 + if (next_p->fpu_counter>5)
5872 + math_state_restore();
5873 return prev_p;
5874 }
5875
5876 @@ -821,7 +865,7 @@
5877
5878 unsigned long arch_align_stack(unsigned long sp)
5879 {
5880 - if (randomize_va_space)
5881 + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
5882 sp -= get_random_int() % 8192;
5883 return sp & ~0xf;
5884 }
5885 --- a/arch/x86/kernel/setup64-xen.c
5886 +++ b/arch/x86/kernel/setup64-xen.c
5887 @@ -31,7 +31,7 @@
5888 #include <asm/hypervisor.h>
5889 #endif
5890
5891 -char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,};
5892 +char x86_boot_params[BOOT_PARAM_SIZE] __initdata;
5893
5894 cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
5895
5896 @@ -55,8 +55,10 @@
5897 on Enable(default)
5898 off Disable
5899 */
5900 -int __init nonx_setup(char *str)
5901 +static int __init nonx_setup(char *str)
5902 {
5903 + if (!str)
5904 + return -EINVAL;
5905 if (!strncmp(str, "on", 2)) {
5906 __supported_pte_mask |= _PAGE_NX;
5907 do_not_nx = 0;
5908 @@ -64,9 +66,9 @@
5909 do_not_nx = 1;
5910 __supported_pte_mask &= ~_PAGE_NX;
5911 }
5912 - return 1;
5913 + return 0;
5914 }
5915 -__setup("noexec=", nonx_setup); /* parsed early actually */
5916 +early_param("noexec", nonx_setup);
5917
5918 int force_personality32 = 0;
5919
5920 @@ -102,12 +104,9 @@
5921 #endif
5922
5923 /* Copy section for each CPU (we discard the original) */
5924 - size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES);
5925 -#ifdef CONFIG_MODULES
5926 - if (size < PERCPU_ENOUGH_ROOM)
5927 - size = PERCPU_ENOUGH_ROOM;
5928 -#endif
5929 + size = PERCPU_ENOUGH_ROOM;
5930
5931 + printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size);
5932 for_each_cpu_mask (i, cpu_possible_map) {
5933 char *ptr;
5934
5935 @@ -169,7 +168,10 @@
5936 /* Setup up data that may be needed in __get_free_pages early */
5937 asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
5938 #ifndef CONFIG_XEN
5939 + /* Memory clobbers used to order PDA accessed */
5940 + mb();
5941 wrmsrl(MSR_GS_BASE, pda);
5942 + mb();
5943 #else
5944 if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL,
5945 (unsigned long)pda))
5946 @@ -302,28 +304,17 @@
5947 * set up and load the per-CPU TSS
5948 */
5949 for (v = 0; v < N_EXCEPTION_STACKS; v++) {
5950 + static const unsigned int order[N_EXCEPTION_STACKS] = {
5951 + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
5952 + [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
5953 + };
5954 if (cpu) {
5955 - static const unsigned int order[N_EXCEPTION_STACKS] = {
5956 - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
5957 - [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
5958 - };
5959 -
5960 estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
5961 if (!estacks)
5962 panic("Cannot allocate exception stack %ld %d\n",
5963 v, cpu);
5964 }
5965 - switch (v + 1) {
5966 -#if DEBUG_STKSZ > EXCEPTION_STKSZ
5967 - case DEBUG_STACK:
5968 - cpu_pda(cpu)->debugstack = (unsigned long)estacks;
5969 - estacks += DEBUG_STKSZ;
5970 - break;
5971 -#endif
5972 - default:
5973 - estacks += EXCEPTION_STKSZ;
5974 - break;
5975 - }
5976 + estacks += PAGE_SIZE << order[v];
5977 orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks;
5978 }
5979
5980 --- a/arch/x86/kernel/setup_32-xen.c
5981 +++ b/arch/x86/kernel/setup_32-xen.c
5982 @@ -56,6 +56,7 @@
5983 #include <asm/apic.h>
5984 #include <asm/e820.h>
5985 #include <asm/mpspec.h>
5986 +#include <asm/mmzone.h>
5987 #include <asm/setup.h>
5988 #include <asm/arch_hooks.h>
5989 #include <asm/sections.h>
5990 @@ -105,18 +106,6 @@
5991
5992 unsigned long mmu_cr4_features;
5993
5994 -#ifdef CONFIG_ACPI
5995 - int acpi_disabled = 0;
5996 -#else
5997 - int acpi_disabled = 1;
5998 -#endif
5999 -EXPORT_SYMBOL(acpi_disabled);
6000 -
6001 -#ifdef CONFIG_ACPI
6002 -int __initdata acpi_force = 0;
6003 -extern acpi_interrupt_flags acpi_sci_flags;
6004 -#endif
6005 -
6006 /* for MCA, but anyone else can use it if they want */
6007 unsigned int machine_id;
6008 #ifdef CONFIG_MCA
6009 @@ -170,7 +159,6 @@
6010 #endif
6011
6012 extern void early_cpu_init(void);
6013 -extern void generic_apic_probe(char *);
6014 extern int root_mountflags;
6015
6016 unsigned long saved_videomode;
6017 @@ -243,9 +231,6 @@
6018 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
6019 } };
6020
6021 -#define ADAPTER_ROM_RESOURCES \
6022 - (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0])
6023 -
6024 static struct resource video_rom_resource = {
6025 .name = "Video ROM",
6026 .start = 0xc0000,
6027 @@ -307,9 +292,6 @@
6028 .flags = IORESOURCE_BUSY | IORESOURCE_IO
6029 } };
6030
6031 -#define STANDARD_IO_RESOURCES \
6032 - (sizeof standard_io_resources / sizeof standard_io_resources[0])
6033 -
6034 #define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
6035
6036 static int __init romchecksum(unsigned char *rom, unsigned long length)
6037 @@ -372,7 +354,7 @@
6038 }
6039
6040 /* check for adapter roms on 2k boundaries */
6041 - for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) {
6042 + for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
6043 rom = isa_bus_to_virt(start);
6044 if (!romsignature(rom))
6045 continue;
6046 @@ -764,246 +746,152 @@
6047 }
6048 #endif
6049
6050 -static void __init parse_cmdline_early (char ** cmdline_p)
6051 +static int __initdata user_defined_memmap = 0;
6052 +
6053 +/*
6054 + * "mem=nopentium" disables the 4MB page tables.
6055 + * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
6056 + * to <mem>, overriding the bios size.
6057 + * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
6058 + * <start> to <start>+<mem>, overriding the bios size.
6059 + *
6060 + * HPA tells me bootloaders need to parse mem=, so no new
6061 + * option should be mem= [also see Documentation/i386/boot.txt]
6062 + */
6063 +static int __init parse_mem(char *arg)
6064 {
6065 - char c = ' ', *to = command_line, *from = saved_command_line;
6066 - int len = 0, max_cmdline;
6067 - int userdef = 0;
6068 -
6069 - if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
6070 - max_cmdline = COMMAND_LINE_SIZE;
6071 - memcpy(saved_command_line, xen_start_info->cmd_line, max_cmdline);
6072 - /* Save unparsed command line copy for /proc/cmdline */
6073 - saved_command_line[max_cmdline-1] = '\0';
6074 -
6075 - for (;;) {
6076 - if (c != ' ')
6077 - goto next_char;
6078 - /*
6079 - * "mem=nopentium" disables the 4MB page tables.
6080 - * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
6081 - * to <mem>, overriding the bios size.
6082 - * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
6083 - * <start> to <start>+<mem>, overriding the bios size.
6084 - *
6085 - * HPA tells me bootloaders need to parse mem=, so no new
6086 - * option should be mem= [also see Documentation/i386/boot.txt]
6087 - */
6088 - if (!memcmp(from, "mem=", 4)) {
6089 - if (to != command_line)
6090 - to--;
6091 - if (!memcmp(from+4, "nopentium", 9)) {
6092 - from += 9+4;
6093 - clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
6094 - disable_pse = 1;
6095 - } else {
6096 - /* If the user specifies memory size, we
6097 - * limit the BIOS-provided memory map to
6098 - * that size. exactmap can be used to specify
6099 - * the exact map. mem=number can be used to
6100 - * trim the existing memory map.
6101 - */
6102 - unsigned long long mem_size;
6103 -
6104 - mem_size = memparse(from+4, &from);
6105 - limit_regions(mem_size);
6106 - userdef=1;
6107 - }
6108 - }
6109 + if (!arg)
6110 + return -EINVAL;
6111
6112 - else if (!memcmp(from, "memmap=", 7)) {
6113 - if (to != command_line)
6114 - to--;
6115 - if (!memcmp(from+7, "exactmap", 8)) {
6116 -#ifdef CONFIG_CRASH_DUMP
6117 - /* If we are doing a crash dump, we
6118 - * still need to know the real mem
6119 - * size before original memory map is
6120 - * reset.
6121 - */
6122 - find_max_pfn();
6123 - saved_max_pfn = max_pfn;
6124 -#endif
6125 - from += 8+7;
6126 - e820.nr_map = 0;
6127 - userdef = 1;
6128 - } else {
6129 - /* If the user specifies memory size, we
6130 - * limit the BIOS-provided memory map to
6131 - * that size. exactmap can be used to specify
6132 - * the exact map. mem=number can be used to
6133 - * trim the existing memory map.
6134 - */
6135 - unsigned long long start_at, mem_size;
6136 + if (strcmp(arg, "nopentium") == 0) {
6137 + clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
6138 + disable_pse = 1;
6139 + } else {
6140 + /* If the user specifies memory size, we
6141 + * limit the BIOS-provided memory map to
6142 + * that size. exactmap can be used to specify
6143 + * the exact map. mem=number can be used to
6144 + * trim the existing memory map.
6145 + */
6146 + unsigned long long mem_size;
6147
6148 - mem_size = memparse(from+7, &from);
6149 - if (*from == '@') {
6150 - start_at = memparse(from+1, &from);
6151 - add_memory_region(start_at, mem_size, E820_RAM);
6152 - } else if (*from == '#') {
6153 - start_at = memparse(from+1, &from);
6154 - add_memory_region(start_at, mem_size, E820_ACPI);
6155 - } else if (*from == '$') {
6156 - start_at = memparse(from+1, &from);
6157 - add_memory_region(start_at, mem_size, E820_RESERVED);
6158 - } else {
6159 - limit_regions(mem_size);
6160 - userdef=1;
6161 - }
6162 - }
6163 - }
6164 -
6165 - else if (!memcmp(from, "noexec=", 7))
6166 - noexec_setup(from + 7);
6167 + mem_size = memparse(arg, &arg);
6168 + limit_regions(mem_size);
6169 + user_defined_memmap = 1;
6170 + }
6171 + return 0;
6172 +}
6173 +early_param("mem", parse_mem);
6174
6175 +static int __init parse_memmap(char *arg)
6176 +{
6177 + if (!arg)
6178 + return -EINVAL;
6179
6180 -#ifdef CONFIG_X86_MPPARSE
6181 - /*
6182 - * If the BIOS enumerates physical processors before logical,
6183 - * maxcpus=N at enumeration-time can be used to disable HT.
6184 + if (strcmp(arg, "exactmap") == 0) {
6185 +#ifdef CONFIG_CRASH_DUMP
6186 + /* If we are doing a crash dump, we
6187 + * still need to know the real mem
6188 + * size before original memory map is
6189 + * reset.
6190 */
6191 - else if (!memcmp(from, "maxcpus=", 8)) {
6192 - extern unsigned int maxcpus;
6193 -
6194 - maxcpus = simple_strtoul(from + 8, NULL, 0);
6195 - }
6196 + find_max_pfn();
6197 + saved_max_pfn = max_pfn;
6198 #endif
6199 + e820.nr_map = 0;
6200 + user_defined_memmap = 1;
6201 + } else {
6202 + /* If the user specifies memory size, we
6203 + * limit the BIOS-provided memory map to
6204 + * that size. exactmap can be used to specify
6205 + * the exact map. mem=number can be used to
6206 + * trim the existing memory map.
6207 + */
6208 + unsigned long long start_at, mem_size;
6209
6210 -#ifdef CONFIG_ACPI
6211 - /* "acpi=off" disables both ACPI table parsing and interpreter */
6212 - else if (!memcmp(from, "acpi=off", 8)) {
6213 - disable_acpi();
6214 - }
6215 -
6216 - /* acpi=force to over-ride black-list */
6217 - else if (!memcmp(from, "acpi=force", 10)) {
6218 - acpi_force = 1;
6219 - acpi_ht = 1;
6220 - acpi_disabled = 0;
6221 - }
6222 -
6223 - /* acpi=strict disables out-of-spec workarounds */
6224 - else if (!memcmp(from, "acpi=strict", 11)) {
6225 - acpi_strict = 1;
6226 - }
6227 -
6228 - /* Limit ACPI just to boot-time to enable HT */
6229 - else if (!memcmp(from, "acpi=ht", 7)) {
6230 - if (!acpi_force)
6231 - disable_acpi();
6232 - acpi_ht = 1;
6233 - }
6234 -
6235 - /* "pci=noacpi" disable ACPI IRQ routing and PCI scan */
6236 - else if (!memcmp(from, "pci=noacpi", 10)) {
6237 - acpi_disable_pci();
6238 - }
6239 - /* "acpi=noirq" disables ACPI interrupt routing */
6240 - else if (!memcmp(from, "acpi=noirq", 10)) {
6241 - acpi_noirq_set();
6242 + mem_size = memparse(arg, &arg);
6243 + if (*arg == '@') {
6244 + start_at = memparse(arg+1, &arg);
6245 + add_memory_region(start_at, mem_size, E820_RAM);
6246 + } else if (*arg == '#') {
6247 + start_at = memparse(arg+1, &arg);
6248 + add_memory_region(start_at, mem_size, E820_ACPI);
6249 + } else if (*arg == '$') {
6250 + start_at = memparse(arg+1, &arg);
6251 + add_memory_region(start_at, mem_size, E820_RESERVED);
6252 + } else {
6253 + limit_regions(mem_size);
6254 + user_defined_memmap = 1;
6255 }
6256 + }
6257 + return 0;
6258 +}
6259 +early_param("memmap", parse_memmap);
6260
6261 - else if (!memcmp(from, "acpi_sci=edge", 13))
6262 - acpi_sci_flags.trigger = 1;
6263 +#ifdef CONFIG_PROC_VMCORE
6264 +/* elfcorehdr= specifies the location of elf core header
6265 + * stored by the crashed kernel.
6266 + */
6267 +static int __init parse_elfcorehdr(char *arg)
6268 +{
6269 + if (!arg)
6270 + return -EINVAL;
6271
6272 - else if (!memcmp(from, "acpi_sci=level", 14))
6273 - acpi_sci_flags.trigger = 3;
6274 + elfcorehdr_addr = memparse(arg, &arg);
6275 + return 0;
6276 +}
6277 +early_param("elfcorehdr", parse_elfcorehdr);
6278 +#endif /* CONFIG_PROC_VMCORE */
6279
6280 - else if (!memcmp(from, "acpi_sci=high", 13))
6281 - acpi_sci_flags.polarity = 1;
6282 +/*
6283 + * highmem=size forces highmem to be exactly 'size' bytes.
6284 + * This works even on boxes that have no highmem otherwise.
6285 + * This also works to reduce highmem size on bigger boxes.
6286 + */
6287 +static int __init parse_highmem(char *arg)
6288 +{
6289 + if (!arg)
6290 + return -EINVAL;
6291
6292 - else if (!memcmp(from, "acpi_sci=low", 12))
6293 - acpi_sci_flags.polarity = 3;
6294 + highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
6295 + return 0;
6296 +}
6297 +early_param("highmem", parse_highmem);
6298
6299 -#ifdef CONFIG_X86_IO_APIC
6300 - else if (!memcmp(from, "acpi_skip_timer_override", 24))
6301 - acpi_skip_timer_override = 1;
6302 +/*
6303 + * vmalloc=size forces the vmalloc area to be exactly 'size'
6304 + * bytes. This can be used to increase (or decrease) the
6305 + * vmalloc area - the default is 128m.
6306 + */
6307 +static int __init parse_vmalloc(char *arg)
6308 +{
6309 + if (!arg)
6310 + return -EINVAL;
6311
6312 - if (!memcmp(from, "disable_timer_pin_1", 19))
6313 - disable_timer_pin_1 = 1;
6314 - if (!memcmp(from, "enable_timer_pin_1", 18))
6315 - disable_timer_pin_1 = -1;
6316 -
6317 - /* disable IO-APIC */
6318 - else if (!memcmp(from, "noapic", 6))
6319 - disable_ioapic_setup();
6320 -#endif /* CONFIG_X86_IO_APIC */
6321 -#endif /* CONFIG_ACPI */
6322 -
6323 -#ifdef CONFIG_X86_LOCAL_APIC
6324 - /* enable local APIC */
6325 - else if (!memcmp(from, "lapic", 5))
6326 - lapic_enable();
6327 -
6328 - /* disable local APIC */
6329 - else if (!memcmp(from, "nolapic", 6))
6330 - lapic_disable();
6331 -#endif /* CONFIG_X86_LOCAL_APIC */
6332 + __VMALLOC_RESERVE = memparse(arg, &arg);
6333 + return 0;
6334 +}
6335 +early_param("vmalloc", parse_vmalloc);
6336
6337 -#ifdef CONFIG_KEXEC
6338 - /* crashkernel=size@addr specifies the location to reserve for
6339 - * a crash kernel. By reserving this memory we guarantee
6340 - * that linux never set's it up as a DMA target.
6341 - * Useful for holding code to do something appropriate
6342 - * after a kernel panic.
6343 - */
6344 - else if (!memcmp(from, "crashkernel=", 12)) {
6345 #ifndef CONFIG_XEN
6346 - unsigned long size, base;
6347 - size = memparse(from+12, &from);
6348 - if (*from == '@') {
6349 - base = memparse(from+1, &from);
6350 - /* FIXME: Do I want a sanity check
6351 - * to validate the memory range?
6352 - */
6353 - crashk_res.start = base;
6354 - crashk_res.end = base + size - 1;
6355 - }
6356 -#else
6357 - printk("Ignoring crashkernel command line, "
6358 - "parameter will be supplied by xen\n");
6359 -#endif
6360 - }
6361 -#endif
6362 -#ifdef CONFIG_PROC_VMCORE
6363 - /* elfcorehdr= specifies the location of elf core header
6364 - * stored by the crashed kernel.
6365 - */
6366 - else if (!memcmp(from, "elfcorehdr=", 11))
6367 - elfcorehdr_addr = memparse(from+11, &from);
6368 -#endif
6369 +/*
6370 + * reservetop=size reserves a hole at the top of the kernel address space which
6371 + * a hypervisor can load into later. Needed for dynamically loaded hypervisors,
6372 + * so relocating the fixmap can be done before paging initialization.
6373 + */
6374 +static int __init parse_reservetop(char *arg)
6375 +{
6376 + unsigned long address;
6377
6378 - /*
6379 - * highmem=size forces highmem to be exactly 'size' bytes.
6380 - * This works even on boxes that have no highmem otherwise.
6381 - * This also works to reduce highmem size on bigger boxes.
6382 - */
6383 - else if (!memcmp(from, "highmem=", 8))
6384 - highmem_pages = memparse(from+8, &from) >> PAGE_SHIFT;
6385 -
6386 - /*
6387 - * vmalloc=size forces the vmalloc area to be exactly 'size'
6388 - * bytes. This can be used to increase (or decrease) the
6389 - * vmalloc area - the default is 128m.
6390 - */
6391 - else if (!memcmp(from, "vmalloc=", 8))
6392 - __VMALLOC_RESERVE = memparse(from+8, &from);
6393 + if (!arg)
6394 + return -EINVAL;
6395
6396 - next_char:
6397 - c = *(from++);
6398 - if (!c)
6399 - break;
6400 - if (COMMAND_LINE_SIZE <= ++len)
6401 - break;
6402 - *(to++) = c;
6403 - }
6404 - *to = '\0';
6405 - *cmdline_p = command_line;
6406 - if (userdef) {
6407 - printk(KERN_INFO "user-defined physical RAM map:\n");
6408 - print_memory_map("user");
6409 - }
6410 + address = memparse(arg, &arg);
6411 + reserve_top_address(address);
6412 + return 0;
6413 }
6414 +early_param("reservetop", parse_reservetop);
6415 +#endif
6416
6417 /*
6418 * Callback for efi_memory_walk.
6419 @@ -1024,7 +912,7 @@
6420 static int __init
6421 efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
6422 {
6423 - memory_present(0, start, end);
6424 + memory_present(0, PFN_UP(start), PFN_DOWN(end));
6425 return 0;
6426 }
6427
6428 @@ -1291,6 +1179,14 @@
6429 }
6430 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
6431 pages_to_mb(highend_pfn - highstart_pfn));
6432 + num_physpages = highend_pfn;
6433 + high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
6434 +#else
6435 + num_physpages = max_low_pfn;
6436 + high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
6437 +#endif
6438 +#ifdef CONFIG_FLATMEM
6439 + max_mapnr = num_physpages;
6440 #endif
6441 printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
6442 pages_to_mb(max_low_pfn));
6443 @@ -1302,22 +1198,19 @@
6444
6445 void __init zone_sizes_init(void)
6446 {
6447 - unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
6448 - unsigned int max_dma, low;
6449 -
6450 - max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
6451 - low = max_low_pfn;
6452 -
6453 - if (low < max_dma)
6454 - zones_size[ZONE_DMA] = low;
6455 - else {
6456 - zones_size[ZONE_DMA] = max_dma;
6457 - zones_size[ZONE_NORMAL] = low - max_dma;
6458 + unsigned long max_zone_pfns[MAX_NR_ZONES];
6459 + memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
6460 + max_zone_pfns[ZONE_DMA] =
6461 + virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
6462 + max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
6463 #ifdef CONFIG_HIGHMEM
6464 - zones_size[ZONE_HIGHMEM] = highend_pfn - low;
6465 + max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
6466 + add_active_range(0, 0, highend_pfn);
6467 +#else
6468 + add_active_range(0, 0, max_low_pfn);
6469 #endif
6470 - }
6471 - free_area_init(zones_size);
6472 +
6473 + free_area_init_nodes(max_zone_pfns);
6474 }
6475 #else
6476 extern unsigned long __init setup_memory(void);
6477 @@ -1374,6 +1267,7 @@
6478 */
6479 acpi_reserve_bootmem();
6480 #endif
6481 + numa_kva_reserve();
6482 #endif /* !CONFIG_XEN */
6483
6484 #ifdef CONFIG_BLK_DEV_INITRD
6485 @@ -1559,7 +1453,7 @@
6486 request_resource(&iomem_resource, &video_ram_resource);
6487
6488 /* request I/O space for devices used on all i[345]86 PCs */
6489 - for (i = 0; i < STANDARD_IO_RESOURCES; i++)
6490 + for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
6491 request_resource(&ioport_resource, &standard_io_resources[i]);
6492 return 0;
6493 }
6494 @@ -1700,17 +1594,19 @@
6495 data_resource.start = virt_to_phys(_etext);
6496 data_resource.end = virt_to_phys(_edata)-1;
6497
6498 - parse_cmdline_early(cmdline_p);
6499 + if ((i = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
6500 + i = COMMAND_LINE_SIZE;
6501 + memcpy(saved_command_line, xen_start_info->cmd_line, i);
6502 + saved_command_line[i - 1] = '\0';
6503 + parse_early_param();
6504
6505 -#ifdef CONFIG_EARLY_PRINTK
6506 - {
6507 - char *s = strstr(*cmdline_p, "earlyprintk=");
6508 - if (s) {
6509 - setup_early_printk(strchr(s, '=') + 1);
6510 - printk("early console enabled\n");
6511 - }
6512 + if (user_defined_memmap) {
6513 + printk(KERN_INFO "user-defined physical RAM map:\n");
6514 + print_memory_map("user");
6515 }
6516 -#endif
6517 +
6518 + strlcpy(command_line, saved_command_line, COMMAND_LINE_SIZE);
6519 + *cmdline_p = command_line;
6520
6521 max_low_pfn = setup_memory();
6522
6523 @@ -1817,7 +1713,7 @@
6524 dmi_scan_machine();
6525
6526 #ifdef CONFIG_X86_GENERICARCH
6527 - generic_apic_probe(*cmdline_p);
6528 + generic_apic_probe();
6529 #endif
6530 if (efi_enabled)
6531 efi_map_memmap();
6532 @@ -1838,9 +1734,11 @@
6533 acpi_boot_table_init();
6534 #endif
6535
6536 +#ifdef CONFIG_PCI
6537 #ifdef CONFIG_X86_IO_APIC
6538 check_acpi_pci(); /* Checks more than just ACPI actually */
6539 #endif
6540 +#endif
6541
6542 #ifdef CONFIG_ACPI
6543 acpi_boot_init();
6544 --- a/arch/x86/kernel/setup_64-xen.c
6545 +++ b/arch/x86/kernel/setup_64-xen.c
6546 @@ -118,16 +118,6 @@
6547
6548 unsigned long mmu_cr4_features;
6549
6550 -int acpi_disabled;
6551 -EXPORT_SYMBOL(acpi_disabled);
6552 -#ifdef CONFIG_ACPI
6553 -extern int __initdata acpi_ht;
6554 -extern acpi_interrupt_flags acpi_sci_flags;
6555 -int __initdata acpi_force = 0;
6556 -#endif
6557 -
6558 -int acpi_numa __initdata;
6559 -
6560 /* Boot loader ID as an integer, for the benefit of proc_dointvec */
6561 int bootloader_type;
6562
6563 @@ -151,10 +141,6 @@
6564
6565 struct edid_info edid_info;
6566 EXPORT_SYMBOL_GPL(edid_info);
6567 -struct e820map e820;
6568 -#ifdef CONFIG_XEN
6569 -struct e820map machine_e820;
6570 -#endif
6571
6572 extern int root_mountflags;
6573
6574 @@ -181,9 +167,6 @@
6575 .flags = IORESOURCE_BUSY | IORESOURCE_IO }
6576 };
6577
6578 -#define STANDARD_IO_RESOURCES \
6579 - (sizeof standard_io_resources / sizeof standard_io_resources[0])
6580 -
6581 #define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
6582
6583 struct resource data_resource = {
6584 @@ -230,9 +213,6 @@
6585 .flags = IORESOURCE_ROM }
6586 };
6587
6588 -#define ADAPTER_ROM_RESOURCES \
6589 - (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0])
6590 -
6591 static struct resource video_rom_resource = {
6592 .name = "Video ROM",
6593 .start = 0xc0000,
6594 @@ -309,7 +289,8 @@
6595 }
6596
6597 /* check for adapter roms on 2k boundaries */
6598 - for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) {
6599 + for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper;
6600 + start += 2048) {
6601 rom = isa_bus_to_virt(start);
6602 if (!romsignature(rom))
6603 continue;
6604 @@ -329,187 +310,22 @@
6605 }
6606 }
6607
6608 -/* Check for full argument with no trailing characters */
6609 -static int fullarg(char *p, char *arg)
6610 +#ifdef CONFIG_PROC_VMCORE
6611 +/* elfcorehdr= specifies the location of elf core header
6612 + * stored by the crashed kernel. This option will be passed
6613 + * by kexec loader to the capture kernel.
6614 + */
6615 +static int __init setup_elfcorehdr(char *arg)
6616 {
6617 - int l = strlen(arg);
6618 - return !memcmp(p, arg, l) && (p[l] == 0 || isspace(p[l]));
6619 + char *end;
6620 + if (!arg)
6621 + return -EINVAL;
6622 + elfcorehdr_addr = memparse(arg, &end);
6623 + return end > arg ? 0 : -EINVAL;
6624 }
6625 -
6626 -static __init void parse_cmdline_early (char ** cmdline_p)
6627 -{
6628 - char c = ' ', *to = command_line, *from = COMMAND_LINE;
6629 - int len = 0;
6630 - int userdef = 0;
6631 -
6632 - for (;;) {
6633 - if (c != ' ')
6634 - goto next_char;
6635 -
6636 -#ifdef CONFIG_SMP
6637 - /*
6638 - * If the BIOS enumerates physical processors before logical,
6639 - * maxcpus=N at enumeration-time can be used to disable HT.
6640 - */
6641 - else if (!memcmp(from, "maxcpus=", 8)) {
6642 - extern unsigned int maxcpus;
6643 -
6644 - maxcpus = simple_strtoul(from + 8, NULL, 0);
6645 - }
6646 -#endif
6647 -#ifdef CONFIG_ACPI
6648 - /* "acpi=off" disables both ACPI table parsing and interpreter init */
6649 - if (fullarg(from,"acpi=off"))
6650 - disable_acpi();
6651 -
6652 - if (fullarg(from, "acpi=force")) {
6653 - /* add later when we do DMI horrors: */
6654 - acpi_force = 1;
6655 - acpi_disabled = 0;
6656 - }
6657 -
6658 - /* acpi=ht just means: do ACPI MADT parsing
6659 - at bootup, but don't enable the full ACPI interpreter */
6660 - if (fullarg(from, "acpi=ht")) {
6661 - if (!acpi_force)
6662 - disable_acpi();
6663 - acpi_ht = 1;
6664 - }
6665 - else if (fullarg(from, "pci=noacpi"))
6666 - acpi_disable_pci();
6667 - else if (fullarg(from, "acpi=noirq"))
6668 - acpi_noirq_set();
6669 -
6670 - else if (fullarg(from, "acpi_sci=edge"))
6671 - acpi_sci_flags.trigger = 1;
6672 - else if (fullarg(from, "acpi_sci=level"))
6673 - acpi_sci_flags.trigger = 3;
6674 - else if (fullarg(from, "acpi_sci=high"))
6675 - acpi_sci_flags.polarity = 1;
6676 - else if (fullarg(from, "acpi_sci=low"))
6677 - acpi_sci_flags.polarity = 3;
6678 -
6679 - /* acpi=strict disables out-of-spec workarounds */
6680 - else if (fullarg(from, "acpi=strict")) {
6681 - acpi_strict = 1;
6682 - }
6683 -#ifdef CONFIG_X86_IO_APIC
6684 - else if (fullarg(from, "acpi_skip_timer_override"))
6685 - acpi_skip_timer_override = 1;
6686 -#endif
6687 -#endif
6688 -
6689 -#ifndef CONFIG_XEN
6690 - if (fullarg(from, "nolapic") || fullarg(from, "disableapic")) {
6691 - clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
6692 - disable_apic = 1;
6693 - }
6694 -
6695 - if (fullarg(from, "noapic"))
6696 - skip_ioapic_setup = 1;
6697 -
6698 - if (fullarg(from,"apic")) {
6699 - skip_ioapic_setup = 0;
6700 - ioapic_force = 1;
6701 - }
6702 -#endif
6703 -
6704 - if (!memcmp(from, "mem=", 4))
6705 - parse_memopt(from+4, &from);
6706 -
6707 - if (!memcmp(from, "memmap=", 7)) {
6708 - /* exactmap option is for used defined memory */
6709 - if (!memcmp(from+7, "exactmap", 8)) {
6710 -#ifdef CONFIG_CRASH_DUMP
6711 - /* If we are doing a crash dump, we
6712 - * still need to know the real mem
6713 - * size before original memory map is
6714 - * reset.
6715 - */
6716 - saved_max_pfn = e820_end_of_ram();
6717 -#endif
6718 - from += 8+7;
6719 - end_pfn_map = 0;
6720 - e820.nr_map = 0;
6721 - userdef = 1;
6722 - }
6723 - else {
6724 - parse_memmapopt(from+7, &from);
6725 - userdef = 1;
6726 - }
6727 - }
6728 -
6729 -#ifdef CONFIG_NUMA
6730 - if (!memcmp(from, "numa=", 5))
6731 - numa_setup(from+5);
6732 +early_param("elfcorehdr", setup_elfcorehdr);
6733 #endif
6734
6735 - if (!memcmp(from,"iommu=",6)) {
6736 - iommu_setup(from+6);
6737 - }
6738 -
6739 - if (fullarg(from,"oops=panic"))
6740 - panic_on_oops = 1;
6741 -
6742 - if (!memcmp(from, "noexec=", 7))
6743 - nonx_setup(from + 7);
6744 -
6745 -#ifdef CONFIG_KEXEC
6746 - /* crashkernel=size@addr specifies the location to reserve for
6747 - * a crash kernel. By reserving this memory we guarantee
6748 - * that linux never set's it up as a DMA target.
6749 - * Useful for holding code to do something appropriate
6750 - * after a kernel panic.
6751 - */
6752 - else if (!memcmp(from, "crashkernel=", 12)) {
6753 -#ifndef CONFIG_XEN
6754 - unsigned long size, base;
6755 - size = memparse(from+12, &from);
6756 - if (*from == '@') {
6757 - base = memparse(from+1, &from);
6758 - /* FIXME: Do I want a sanity check
6759 - * to validate the memory range?
6760 - */
6761 - crashk_res.start = base;
6762 - crashk_res.end = base + size - 1;
6763 - }
6764 -#else
6765 - printk("Ignoring crashkernel command line, "
6766 - "parameter will be supplied by xen\n");
6767 -#endif
6768 - }
6769 -#endif
6770 -
6771 -#ifdef CONFIG_PROC_VMCORE
6772 - /* elfcorehdr= specifies the location of elf core header
6773 - * stored by the crashed kernel. This option will be passed
6774 - * by kexec loader to the capture kernel.
6775 - */
6776 - else if(!memcmp(from, "elfcorehdr=", 11))
6777 - elfcorehdr_addr = memparse(from+11, &from);
6778 -#endif
6779 -
6780 -#if defined(CONFIG_HOTPLUG_CPU) && !defined(CONFIG_XEN)
6781 - else if (!memcmp(from, "additional_cpus=", 16))
6782 - setup_additional_cpus(from+16);
6783 -#endif
6784 -
6785 - next_char:
6786 - c = *(from++);
6787 - if (!c)
6788 - break;
6789 - if (COMMAND_LINE_SIZE <= ++len)
6790 - break;
6791 - *(to++) = c;
6792 - }
6793 - if (userdef) {
6794 - printk(KERN_INFO "user-defined physical RAM map:\n");
6795 - e820_print_map("user");
6796 - }
6797 - *to = '\0';
6798 - *cmdline_p = command_line;
6799 -}
6800 -
6801 #ifndef CONFIG_NUMA
6802 static void __init
6803 contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
6804 @@ -521,10 +337,11 @@
6805 if (bootmap == -1L)
6806 panic("Cannot find bootmem map of size %ld\n",bootmap_size);
6807 bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
6808 + e820_register_active_regions(0, start_pfn, end_pfn);
6809 #ifdef CONFIG_XEN
6810 - e820_bootmem_free(NODE_DATA(0), 0, xen_start_info->nr_pages<<PAGE_SHIFT);
6811 + free_bootmem_with_active_regions(0, xen_start_info->nr_pages);
6812 #else
6813 - e820_bootmem_free(NODE_DATA(0), 0, end_pfn << PAGE_SHIFT);
6814 + free_bootmem_with_active_regions(0, end_pfn);
6815 #endif
6816 reserve_bootmem(bootmap, bootmap_size);
6817 }
6818 @@ -587,6 +404,10 @@
6819 void __init setup_arch(char **cmdline_p)
6820 {
6821 #ifdef CONFIG_XEN
6822 + extern struct e820map machine_e820;
6823 +
6824 + printk(KERN_INFO "Command line: %s\n", saved_command_line);
6825 +
6826 /* Register a call for panic conditions. */
6827 atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
6828
6829 @@ -612,6 +433,8 @@
6830
6831 ARCH_SETUP
6832 #else
6833 + printk(KERN_INFO "Command line: %s\n", saved_command_line);
6834 +
6835 ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
6836 screen_info = SCREEN_INFO;
6837 edid_info = EDID_INFO;
6838 @@ -639,16 +462,22 @@
6839 data_resource.start = virt_to_phys(&_etext);
6840 data_resource.end = virt_to_phys(&_edata)-1;
6841
6842 - parse_cmdline_early(cmdline_p);
6843 -
6844 early_identify_cpu(&boot_cpu_data);
6845
6846 + strlcpy(command_line, saved_command_line, COMMAND_LINE_SIZE);
6847 + *cmdline_p = command_line;
6848 +
6849 + parse_early_param();
6850 +
6851 + finish_e820_parsing();
6852 +
6853 + e820_register_active_regions(0, 0, -1UL);
6854 /*
6855 * partially used pages are not usable - thus
6856 * we are rounding upwards:
6857 */
6858 end_pfn = e820_end_of_ram();
6859 - num_physpages = end_pfn; /* for pfn_valid */
6860 + num_physpages = end_pfn;
6861
6862 check_efer();
6863
6864 @@ -659,6 +488,14 @@
6865 if (is_initial_xendomain())
6866 dmi_scan_machine();
6867
6868 + /* How many end-of-memory variables you have, grandma! */
6869 + max_low_pfn = end_pfn;
6870 + max_pfn = end_pfn;
6871 + high_memory = (void *)__va(end_pfn * PAGE_SIZE - 1) + 1;
6872 +
6873 + /* Remove active ranges so rediscovery with NUMA-awareness happens */
6874 + remove_all_active_ranges();
6875 +
6876 #ifdef CONFIG_ACPI_NUMA
6877 /*
6878 * Parse SRAT to discover nodes.
6879 @@ -848,16 +685,16 @@
6880 BUG();
6881 }
6882
6883 +#ifdef CONFIG_ACPI
6884 if (!is_initial_xendomain()) {
6885 acpi_disabled = 1;
6886 -#ifdef CONFIG_ACPI
6887 acpi_ht = 0;
6888 -#endif
6889 }
6890 #endif
6891 +#endif
6892
6893 -#ifndef CONFIG_XEN
6894 - check_ioapic();
6895 +#if defined(CONFIG_PCI) && !defined(CONFIG_XEN)
6896 + early_quirks();
6897 #endif
6898
6899 zap_low_mappings(0);
6900 @@ -917,6 +754,7 @@
6901 }
6902 #else
6903 e820_reserve_resources(e820.map, e820.nr_map);
6904 + e820_mark_nosave_regions();
6905 #endif
6906
6907 request_resource(&iomem_resource, &video_ram_resource);
6908 @@ -924,7 +762,7 @@
6909 {
6910 unsigned i;
6911 /* request I/O space for devices used on all i[345]86 PCs */
6912 - for (i = 0; i < STANDARD_IO_RESOURCES; i++)
6913 + for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
6914 request_resource(&ioport_resource, &standard_io_resources[i]);
6915 }
6916
6917 @@ -1108,7 +946,7 @@
6918 #endif
6919 }
6920
6921 -static void __init init_amd(struct cpuinfo_x86 *c)
6922 +static void __cpuinit init_amd(struct cpuinfo_x86 *c)
6923 {
6924 unsigned level;
6925
6926 @@ -1164,6 +1002,12 @@
6927
6928 /* Fix cpuid4 emulation for more */
6929 num_cache_leaves = 3;
6930 +
6931 + /* When there is only one core no need to synchronize RDTSC */
6932 + if (num_possible_cpus() == 1)
6933 + set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
6934 + else
6935 + clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
6936 }
6937
6938 static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
6939 @@ -1245,8 +1089,7 @@
6940 node = first_node(node_online_map);
6941 numa_set_node(cpu, node);
6942
6943 - if (acpi_numa > 0)
6944 - printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
6945 + printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
6946 #endif
6947 }
6948
6949 @@ -1280,6 +1123,8 @@
6950 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
6951 (c->x86 == 0x6 && c->x86_model >= 0x0e))
6952 set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
6953 + if (c->x86 == 6)
6954 + set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
6955 set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
6956 c->x86_max_cores = intel_num_cpu_cores(c);
6957
6958 @@ -1498,8 +1343,8 @@
6959
6960 /* Intel-defined (#2) */
6961 "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
6962 - "tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL,
6963 - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
6964 + "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
6965 + NULL, NULL, "dca", NULL, NULL, NULL, NULL, NULL,
6966 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
6967
6968 /* VIA/Cyrix/Centaur-defined */
6969 --- a/arch/x86/kernel/smp_32-xen.c
6970 +++ b/arch/x86/kernel/smp_32-xen.c
6971 @@ -279,8 +279,7 @@
6972 * 2) Leave the mm if we are in the lazy tlb mode.
6973 */
6974
6975 -irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id,
6976 - struct pt_regs *regs)
6977 +irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id)
6978 {
6979 unsigned long cpu;
6980
6981 @@ -567,16 +566,14 @@
6982 * all the work is done automatically when
6983 * we return from the interrupt.
6984 */
6985 -irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id,
6986 - struct pt_regs *regs)
6987 +irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id)
6988 {
6989
6990 return IRQ_HANDLED;
6991 }
6992
6993 #include <linux/kallsyms.h>
6994 -irqreturn_t smp_call_function_interrupt(int irq, void *dev_id,
6995 - struct pt_regs *regs)
6996 +irqreturn_t smp_call_function_interrupt(int irq, void *dev_id)
6997 {
6998 void (*func) (void *info) = call_data->func;
6999 void *info = call_data->info;
7000 @@ -603,3 +600,69 @@
7001 return IRQ_HANDLED;
7002 }
7003
7004 +/*
7005 + * this function sends a 'generic call function' IPI to one other CPU
7006 + * in the system.
7007 + *
7008 + * cpu is a standard Linux logical CPU number.
7009 + */
7010 +static void
7011 +__smp_call_function_single(int cpu, void (*func) (void *info), void *info,
7012 + int nonatomic, int wait)
7013 +{
7014 + struct call_data_struct data;
7015 + int cpus = 1;
7016 +
7017 + data.func = func;
7018 + data.info = info;
7019 + atomic_set(&data.started, 0);
7020 + data.wait = wait;
7021 + if (wait)
7022 + atomic_set(&data.finished, 0);
7023 +
7024 + call_data = &data;
7025 + wmb();
7026 + /* Send a message to all other CPUs and wait for them to respond */
7027 + send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR);
7028 +
7029 + /* Wait for response */
7030 + while (atomic_read(&data.started) != cpus)
7031 + cpu_relax();
7032 +
7033 + if (!wait)
7034 + return;
7035 +
7036 + while (atomic_read(&data.finished) != cpus)
7037 + cpu_relax();
7038 +}
7039 +
7040 +/*
7041 + * smp_call_function_single - Run a function on another CPU
7042 + * @func: The function to run. This must be fast and non-blocking.
7043 + * @info: An arbitrary pointer to pass to the function.
7044 + * @nonatomic: Currently unused.
7045 + * @wait: If true, wait until function has completed on other CPUs.
7046 + *
7047 + * Retrurns 0 on success, else a negative status code.
7048 + *
7049 + * Does not return until the remote CPU is nearly ready to execute <func>
7050 + * or is or has executed.
7051 + */
7052 +
7053 +int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
7054 + int nonatomic, int wait)
7055 +{
7056 + /* prevent preemption and reschedule on another processor */
7057 + int me = get_cpu();
7058 + if (cpu == me) {
7059 + WARN_ON(1);
7060 + put_cpu();
7061 + return -EBUSY;
7062 + }
7063 + spin_lock_bh(&call_lock);
7064 + __smp_call_function_single(cpu, func, info, nonatomic, wait);
7065 + spin_unlock_bh(&call_lock);
7066 + put_cpu();
7067 + return 0;
7068 +}
7069 +EXPORT_SYMBOL(smp_call_function_single);
7070 --- a/arch/x86/kernel/smp_64-xen.c
7071 +++ b/arch/x86/kernel/smp_64-xen.c
7072 @@ -381,9 +381,8 @@
7073 /* prevent preemption and reschedule on another processor */
7074 int me = get_cpu();
7075 if (cpu == me) {
7076 - WARN_ON(1);
7077 put_cpu();
7078 - return -EBUSY;
7079 + return 0;
7080 }
7081 spin_lock_bh(&call_lock);
7082 __smp_call_function_single(cpu, func, info, nonatomic, wait);
7083 @@ -501,7 +500,7 @@
7084 #ifndef CONFIG_XEN
7085 asmlinkage void smp_reschedule_interrupt(void)
7086 #else
7087 -asmlinkage irqreturn_t smp_reschedule_interrupt(void)
7088 +asmlinkage irqreturn_t smp_reschedule_interrupt(int irq, void *ctx)
7089 #endif
7090 {
7091 #ifndef CONFIG_XEN
7092 @@ -514,7 +513,7 @@
7093 #ifndef CONFIG_XEN
7094 asmlinkage void smp_call_function_interrupt(void)
7095 #else
7096 -asmlinkage irqreturn_t smp_call_function_interrupt(void)
7097 +asmlinkage irqreturn_t smp_call_function_interrupt(int irq, void *ctx)
7098 #endif
7099 {
7100 void (*func) (void *info) = call_data->func;
7101 @@ -545,31 +544,3 @@
7102 return IRQ_HANDLED;
7103 #endif
7104 }
7105 -
7106 -int safe_smp_processor_id(void)
7107 -{
7108 -#ifdef CONFIG_XEN
7109 - return smp_processor_id();
7110 -#else
7111 - unsigned apicid, i;
7112 -
7113 - if (disable_apic)
7114 - return 0;
7115 -
7116 - apicid = hard_smp_processor_id();
7117 - if (apicid < NR_CPUS && x86_cpu_to_apicid[apicid] == apicid)
7118 - return apicid;
7119 -
7120 - for (i = 0; i < NR_CPUS; ++i) {
7121 - if (x86_cpu_to_apicid[i] == apicid)
7122 - return i;
7123 - }
7124 -
7125 - /* No entries in x86_cpu_to_apicid? Either no MPS|ACPI,
7126 - * or called too early. Either way, we must be CPU 0. */
7127 - if (x86_cpu_to_apicid[0] == BAD_APICID)
7128 - return 0;
7129 -
7130 - return 0; /* Should not happen */
7131 -#endif
7132 -}
7133 --- a/arch/x86/kernel/time_32-xen.c
7134 +++ b/arch/x86/kernel/time_32-xen.c
7135 @@ -89,7 +89,6 @@
7136 unsigned long vxtime_hz = PIT_TICK_RATE;
7137 struct vxtime_data __vxtime __section_vxtime; /* for vsyscalls */
7138 volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
7139 -unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES;
7140 struct timespec __xtime __section_xtime;
7141 struct timezone __sys_tz __section_sys_tz;
7142 #endif
7143 @@ -97,8 +96,6 @@
7144 unsigned int cpu_khz; /* Detected as we calibrate the TSC */
7145 EXPORT_SYMBOL(cpu_khz);
7146
7147 -extern unsigned long wall_jiffies;
7148 -
7149 DEFINE_SPINLOCK(rtc_lock);
7150 EXPORT_SYMBOL(rtc_lock);
7151
7152 @@ -265,11 +262,10 @@
7153 time_t wtm_sec, xtime_sec;
7154 u64 tmp, wc_nsec;
7155
7156 - /* Adjust wall-clock time base based on wall_jiffies ticks. */
7157 + /* Adjust wall-clock time base. */
7158 wc_nsec = processed_system_time;
7159 wc_nsec += sec * (u64)NSEC_PER_SEC;
7160 wc_nsec += nsec;
7161 - wc_nsec -= (jiffies - wall_jiffies) * (u64)NS_PER_TICK;
7162
7163 /* Split wallclock base into seconds and nanoseconds. */
7164 tmp = wc_nsec;
7165 @@ -387,16 +383,10 @@
7166 shadow = &per_cpu(shadow_time, cpu);
7167
7168 do {
7169 - unsigned long lost;
7170 -
7171 local_time_version = shadow->version;
7172 seq = read_seqbegin(&xtime_lock);
7173
7174 usec = get_usec_offset(shadow);
7175 - lost = jiffies - wall_jiffies;
7176 -
7177 - if (unlikely(lost))
7178 - usec += lost * (USEC_PER_SEC / HZ);
7179
7180 sec = xtime.tv_sec;
7181 usec += (xtime.tv_nsec / NSEC_PER_USEC);
7182 @@ -519,7 +509,7 @@
7183 write_seqlock_irq(&xtime_lock);
7184
7185 sec = xtime.tv_sec;
7186 - nsec = xtime.tv_nsec + ((jiffies - wall_jiffies) * (u64)NS_PER_TICK);
7187 + nsec = xtime.tv_nsec;
7188 __normalize_time(&sec, &nsec);
7189
7190 op.cmd = XENPF_settime;
7191 @@ -593,42 +583,49 @@
7192 }
7193 #endif
7194
7195 -#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
7196 unsigned long profile_pc(struct pt_regs *regs)
7197 {
7198 unsigned long pc = instruction_pointer(regs);
7199
7200 -#ifdef __x86_64__
7201 - /* Assume the lock function has either no stack frame or only a single word.
7202 - This checks if the address on the stack looks like a kernel text address.
7203 - There is a small window for false hits, but in that case the tick
7204 - is just accounted to the spinlock function.
7205 - Better would be to write these functions in assembler again
7206 - and check exactly. */
7207 +#if defined(CONFIG_SMP) || defined(__x86_64__)
7208 if (!user_mode_vm(regs) && in_lock_functions(pc)) {
7209 - char *v = *(char **)regs->rsp;
7210 - if ((v >= _stext && v <= _etext) ||
7211 - (v >= _sinittext && v <= _einittext) ||
7212 - (v >= (char *)MODULES_VADDR && v <= (char *)MODULES_END))
7213 - return (unsigned long)v;
7214 - return ((unsigned long *)regs->rsp)[1];
7215 +# ifdef CONFIG_FRAME_POINTER
7216 +# ifdef __i386__
7217 + return ((unsigned long *)regs->ebp)[1];
7218 +# else
7219 + return ((unsigned long *)regs->rbp)[1];
7220 +# endif
7221 +# else
7222 +# ifdef __i386__
7223 + unsigned long *sp;
7224 + if ((regs->xcs & 2) == 0)
7225 + sp = (unsigned long *)&regs->esp;
7226 + else
7227 + sp = (unsigned long *)regs->esp;
7228 +# else
7229 + unsigned long *sp = (unsigned long *)regs->rsp;
7230 +# endif
7231 + /* Return address is either directly at stack pointer
7232 + or above a saved eflags. Eflags has bits 22-31 zero,
7233 + kernel addresses don't. */
7234 + if (sp[0] >> 22)
7235 + return sp[0];
7236 + if (sp[1] >> 22)
7237 + return sp[1];
7238 +# endif
7239 }
7240 -#else
7241 - if (!user_mode_vm(regs) && in_lock_functions(pc))
7242 - return *(unsigned long *)(regs->ebp + 4);
7243 #endif
7244
7245 return pc;
7246 }
7247 EXPORT_SYMBOL(profile_pc);
7248 -#endif
7249
7250 /*
7251 * This is the same as the above, except we _also_ save the current
7252 * Time Stamp Counter value at the time of the timer interrupt, so that
7253 * we later on can estimate the time of day more exactly.
7254 */
7255 -irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
7256 +irqreturn_t timer_interrupt(int irq, void *dev_id)
7257 {
7258 s64 delta, delta_cpu, stolen, blocked;
7259 u64 sched_time;
7260 @@ -686,10 +683,14 @@
7261 }
7262
7263 /* System-wide jiffy work. */
7264 - while (delta >= NS_PER_TICK) {
7265 - delta -= NS_PER_TICK;
7266 - processed_system_time += NS_PER_TICK;
7267 - do_timer(regs);
7268 + if (delta >= NS_PER_TICK) {
7269 + do_div(delta, NS_PER_TICK);
7270 + processed_system_time += delta * NS_PER_TICK;
7271 + while (delta > HZ) {
7272 + do_timer(HZ);
7273 + delta -= HZ;
7274 + }
7275 + do_timer(delta);
7276 }
7277
7278 if (shadow_tv_version != HYPERVISOR_shared_info->wc_version) {
7279 @@ -734,7 +735,7 @@
7280 if (delta_cpu > 0) {
7281 do_div(delta_cpu, NS_PER_TICK);
7282 per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK;
7283 - if (user_mode_vm(regs))
7284 + if (user_mode_vm(get_irq_regs()))
7285 account_user_time(current, (cputime_t)delta_cpu);
7286 else
7287 account_system_time(current, HARDIRQ_OFFSET,
7288 @@ -748,10 +749,10 @@
7289 /* Local timer processing (see update_process_times()). */
7290 run_local_timers();
7291 if (rcu_pending(cpu))
7292 - rcu_check_callbacks(cpu, user_mode_vm(regs));
7293 + rcu_check_callbacks(cpu, user_mode_vm(get_irq_regs()));
7294 scheduler_tick();
7295 run_posix_cpu_timers(current);
7296 - profile_tick(CPU_PROFILING, regs);
7297 + profile_tick(CPU_PROFILING);
7298
7299 return IRQ_HANDLED;
7300 }
7301 @@ -959,10 +960,11 @@
7302 /* Duplicate of time_init() below, with hpet_enable part added */
7303 static void __init hpet_time_init(void)
7304 {
7305 - xtime.tv_sec = get_cmos_time();
7306 - xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
7307 - set_normalized_timespec(&wall_to_monotonic,
7308 - -xtime.tv_sec, -xtime.tv_nsec);
7309 + struct timespec ts;
7310 + ts.tv_sec = get_cmos_time();
7311 + ts.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
7312 +
7313 + do_settimeofday(&ts);
7314
7315 if ((hpet_enable() >= 0) && hpet_use_timer) {
7316 printk("Using HPET for base-timer\n");
7317 --- a/arch/x86/kernel/traps_32-xen.c
7318 +++ b/arch/x86/kernel/traps_32-xen.c
7319 @@ -28,6 +28,7 @@
7320 #include <linux/kprobes.h>
7321 #include <linux/kexec.h>
7322 #include <linux/unwind.h>
7323 +#include <linux/uaccess.h>
7324
7325 #ifdef CONFIG_EISA
7326 #include <linux/ioport.h>
7327 @@ -40,7 +41,6 @@
7328
7329 #include <asm/processor.h>
7330 #include <asm/system.h>
7331 -#include <asm/uaccess.h>
7332 #include <asm/io.h>
7333 #include <asm/atomic.h>
7334 #include <asm/debugreg.h>
7335 @@ -51,11 +51,14 @@
7336 #include <asm/smp.h>
7337 #include <asm/arch_hooks.h>
7338 #include <asm/kdebug.h>
7339 +#include <asm/stacktrace.h>
7340
7341 #include <linux/module.h>
7342
7343 #include "mach_traps.h"
7344
7345 +int panic_on_unrecovered_nmi;
7346 +
7347 asmlinkage int system_call(void);
7348
7349 struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 },
7350 @@ -124,62 +127,63 @@
7351 p < (void *)tinfo + THREAD_SIZE - 3;
7352 }
7353
7354 -/*
7355 - * Print one address/symbol entries per line.
7356 - */
7357 -static inline void print_addr_and_symbol(unsigned long addr, char *log_lvl)
7358 -{
7359 - printk(" [<%08lx>] ", addr);
7360 -
7361 - print_symbol("%s\n", addr);
7362 -}
7363 -
7364 static inline unsigned long print_context_stack(struct thread_info *tinfo,
7365 unsigned long *stack, unsigned long ebp,
7366 - char *log_lvl)
7367 + struct stacktrace_ops *ops, void *data)
7368 {
7369 unsigned long addr;
7370
7371 #ifdef CONFIG_FRAME_POINTER
7372 while (valid_stack_ptr(tinfo, (void *)ebp)) {
7373 + unsigned long new_ebp;
7374 addr = *(unsigned long *)(ebp + 4);
7375 - print_addr_and_symbol(addr, log_lvl);
7376 + ops->address(data, addr);
7377 /*
7378 * break out of recursive entries (such as
7379 - * end_of_stack_stop_unwind_function):
7380 + * end_of_stack_stop_unwind_function). Also,
7381 + * we can never allow a frame pointer to
7382 + * move downwards!
7383 */
7384 - if (ebp == *(unsigned long *)ebp)
7385 + new_ebp = *(unsigned long *)ebp;
7386 + if (new_ebp <= ebp)
7387 break;
7388 - ebp = *(unsigned long *)ebp;
7389 + ebp = new_ebp;
7390 }
7391 #else
7392 while (valid_stack_ptr(tinfo, stack)) {
7393 addr = *stack++;
7394 if (__kernel_text_address(addr))
7395 - print_addr_and_symbol(addr, log_lvl);
7396 + ops->address(data, addr);
7397 }
7398 #endif
7399 return ebp;
7400 }
7401
7402 +struct ops_and_data {
7403 + struct stacktrace_ops *ops;
7404 + void *data;
7405 +};
7406 +
7407 static asmlinkage int
7408 -show_trace_unwind(struct unwind_frame_info *info, void *log_lvl)
7409 +dump_trace_unwind(struct unwind_frame_info *info, void *data)
7410 {
7411 + struct ops_and_data *oad = (struct ops_and_data *)data;
7412 int n = 0;
7413
7414 while (unwind(info) == 0 && UNW_PC(info)) {
7415 n++;
7416 - print_addr_and_symbol(UNW_PC(info), log_lvl);
7417 + oad->ops->address(oad->data, UNW_PC(info));
7418 if (arch_unw_user_mode(info))
7419 break;
7420 }
7421 return n;
7422 }
7423
7424 -static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
7425 - unsigned long *stack, char *log_lvl)
7426 +void dump_trace(struct task_struct *task, struct pt_regs *regs,
7427 + unsigned long *stack,
7428 + struct stacktrace_ops *ops, void *data)
7429 {
7430 - unsigned long ebp;
7431 + unsigned long ebp = 0;
7432
7433 if (!task)
7434 task = current;
7435 @@ -187,54 +191,116 @@
7436 if (call_trace >= 0) {
7437 int unw_ret = 0;
7438 struct unwind_frame_info info;
7439 + struct ops_and_data oad = { .ops = ops, .data = data };
7440
7441 if (regs) {
7442 if (unwind_init_frame_info(&info, task, regs) == 0)
7443 - unw_ret = show_trace_unwind(&info, log_lvl);
7444 + unw_ret = dump_trace_unwind(&info, &oad);
7445 } else if (task == current)
7446 - unw_ret = unwind_init_running(&info, show_trace_unwind, log_lvl);
7447 + unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad);
7448 else {
7449 if (unwind_init_blocked(&info, task) == 0)
7450 - unw_ret = show_trace_unwind(&info, log_lvl);
7451 + unw_ret = dump_trace_unwind(&info, &oad);
7452 }
7453 if (unw_ret > 0) {
7454 if (call_trace == 1 && !arch_unw_user_mode(&info)) {
7455 - print_symbol("DWARF2 unwinder stuck at %s\n",
7456 + ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n",
7457 UNW_PC(&info));
7458 if (UNW_SP(&info) >= PAGE_OFFSET) {
7459 - printk("Leftover inexact backtrace:\n");
7460 + ops->warning(data, "Leftover inexact backtrace:\n");
7461 stack = (void *)UNW_SP(&info);
7462 + if (!stack)
7463 + return;
7464 + ebp = UNW_FP(&info);
7465 } else
7466 - printk("Full inexact backtrace again:\n");
7467 + ops->warning(data, "Full inexact backtrace again:\n");
7468 } else if (call_trace >= 1)
7469 return;
7470 else
7471 - printk("Full inexact backtrace again:\n");
7472 + ops->warning(data, "Full inexact backtrace again:\n");
7473 } else
7474 - printk("Inexact backtrace:\n");
7475 + ops->warning(data, "Inexact backtrace:\n");
7476 }
7477 -
7478 - if (task == current) {
7479 - /* Grab ebp right from our regs */
7480 - asm ("movl %%ebp, %0" : "=r" (ebp) : );
7481 - } else {
7482 - /* ebp is the last reg pushed by switch_to */
7483 - ebp = *(unsigned long *) task->thread.esp;
7484 + if (!stack) {
7485 + unsigned long dummy;
7486 + stack = &dummy;
7487 + if (task && task != current)
7488 + stack = (unsigned long *)task->thread.esp;
7489 + }
7490 +
7491 +#ifdef CONFIG_FRAME_POINTER
7492 + if (!ebp) {
7493 + if (task == current) {
7494 + /* Grab ebp right from our regs */
7495 + asm ("movl %%ebp, %0" : "=r" (ebp) : );
7496 + } else {
7497 + /* ebp is the last reg pushed by switch_to */
7498 + ebp = *(unsigned long *) task->thread.esp;
7499 + }
7500 }
7501 +#endif
7502
7503 while (1) {
7504 struct thread_info *context;
7505 context = (struct thread_info *)
7506 ((unsigned long)stack & (~(THREAD_SIZE - 1)));
7507 - ebp = print_context_stack(context, stack, ebp, log_lvl);
7508 + ebp = print_context_stack(context, stack, ebp, ops, data);
7509 + /* Should be after the line below, but somewhere
7510 + in early boot context comes out corrupted and we
7511 + can't reference it -AK */
7512 + if (ops->stack(data, "IRQ") < 0)
7513 + break;
7514 stack = (unsigned long*)context->previous_esp;
7515 if (!stack)
7516 break;
7517 - printk("%s =======================\n", log_lvl);
7518 }
7519 }
7520 +EXPORT_SYMBOL(dump_trace);
7521
7522 -void show_trace(struct task_struct *task, struct pt_regs *regs, unsigned long * stack)
7523 +static void
7524 +print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
7525 +{
7526 + printk(data);
7527 + print_symbol(msg, symbol);
7528 + printk("\n");
7529 +}
7530 +
7531 +static void print_trace_warning(void *data, char *msg)
7532 +{
7533 + printk("%s%s\n", (char *)data, msg);
7534 +}
7535 +
7536 +static int print_trace_stack(void *data, char *name)
7537 +{
7538 + return 0;
7539 +}
7540 +
7541 +/*
7542 + * Print one address/symbol entries per line.
7543 + */
7544 +static void print_trace_address(void *data, unsigned long addr)
7545 +{
7546 + printk("%s [<%08lx>] ", (char *)data, addr);
7547 + print_symbol("%s\n", addr);
7548 +}
7549 +
7550 +static struct stacktrace_ops print_trace_ops = {
7551 + .warning = print_trace_warning,
7552 + .warning_symbol = print_trace_warning_symbol,
7553 + .stack = print_trace_stack,
7554 + .address = print_trace_address,
7555 +};
7556 +
7557 +static void
7558 +show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
7559 + unsigned long * stack, char *log_lvl)
7560 +{
7561 + dump_trace(task, regs, stack, &print_trace_ops, log_lvl);
7562 + printk("%s =======================\n", log_lvl);
7563 +}
7564 +
7565 +void show_trace(struct task_struct *task, struct pt_regs *regs,
7566 + unsigned long * stack)
7567 {
7568 show_trace_log_lvl(task, regs, stack, "");
7569 }
7570 @@ -297,12 +363,13 @@
7571 ss = regs->xss & 0xffff;
7572 }
7573 print_modules();
7574 - printk(KERN_EMERG "CPU: %d\nEIP: %04x:[<%08lx>] %s VLI\n"
7575 - "EFLAGS: %08lx (%s %.*s) \n",
7576 + printk(KERN_EMERG "CPU: %d\n"
7577 + KERN_EMERG "EIP: %04x:[<%08lx>] %s VLI\n"
7578 + KERN_EMERG "EFLAGS: %08lx (%s %.*s)\n",
7579 smp_processor_id(), 0xffff & regs->xcs, regs->eip,
7580 - print_tainted(), regs->eflags, system_utsname.release,
7581 - (int)strcspn(system_utsname.version, " "),
7582 - system_utsname.version);
7583 + print_tainted(), regs->eflags, init_utsname()->release,
7584 + (int)strcspn(init_utsname()->version, " "),
7585 + init_utsname()->version);
7586 print_symbol(KERN_EMERG "EIP is at %s\n", regs->eip);
7587 printk(KERN_EMERG "eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n",
7588 regs->eax, regs->ebx, regs->ecx, regs->edx);
7589 @@ -319,6 +386,8 @@
7590 */
7591 if (in_kernel) {
7592 u8 __user *eip;
7593 + int code_bytes = 64;
7594 + unsigned char c;
7595
7596 printk("\n" KERN_EMERG "Stack: ");
7597 show_stack_log_lvl(NULL, regs, (unsigned long *)esp, KERN_EMERG);
7598 @@ -326,9 +395,12 @@
7599 printk(KERN_EMERG "Code: ");
7600
7601 eip = (u8 __user *)regs->eip - 43;
7602 - for (i = 0; i < 64; i++, eip++) {
7603 - unsigned char c;
7604 -
7605 + if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) {
7606 + /* try starting at EIP */
7607 + eip = (u8 __user *)regs->eip;
7608 + code_bytes = 32;
7609 + }
7610 + for (i = 0; i < code_bytes; i++, eip++) {
7611 if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) {
7612 printk(" Bad EIP value.");
7613 break;
7614 @@ -349,7 +421,7 @@
7615
7616 if (eip < PAGE_OFFSET)
7617 return;
7618 - if (__get_user(ud2, (unsigned short __user *)eip))
7619 + if (probe_kernel_address((unsigned short __user *)eip, ud2))
7620 return;
7621 if (ud2 != 0x0b0f)
7622 return;
7623 @@ -362,7 +434,8 @@
7624 char *file;
7625 char c;
7626
7627 - if (__get_user(line, (unsigned short __user *)(eip + 2)))
7628 + if (probe_kernel_address((unsigned short __user *)(eip + 2),
7629 + line))
7630 break;
7631 if (__get_user(file, (char * __user *)(eip + 4)) ||
7632 (unsigned long)file < PAGE_OFFSET || __get_user(c, file))
7633 @@ -604,18 +677,24 @@
7634 }
7635 }
7636
7637 -static void mem_parity_error(unsigned char reason, struct pt_regs * regs)
7638 +static __kprobes void
7639 +mem_parity_error(unsigned char reason, struct pt_regs * regs)
7640 {
7641 - printk(KERN_EMERG "Uhhuh. NMI received. Dazed and confused, but trying "
7642 - "to continue\n");
7643 + printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
7644 + "CPU %d.\n", reason, smp_processor_id());
7645 printk(KERN_EMERG "You probably have a hardware problem with your RAM "
7646 "chips\n");
7647 + if (panic_on_unrecovered_nmi)
7648 + panic("NMI: Not continuing");
7649 +
7650 + printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
7651
7652 /* Clear and disable the memory parity error line. */
7653 clear_mem_error(reason);
7654 }
7655
7656 -static void io_check_error(unsigned char reason, struct pt_regs * regs)
7657 +static __kprobes void
7658 +io_check_error(unsigned char reason, struct pt_regs * regs)
7659 {
7660 printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
7661 show_registers(regs);
7662 @@ -624,7 +703,8 @@
7663 clear_io_check_error(reason);
7664 }
7665
7666 -static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
7667 +static __kprobes void
7668 +unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
7669 {
7670 #ifdef CONFIG_MCA
7671 /* Might actually be able to figure out what the guilty party
7672 @@ -634,15 +714,18 @@
7673 return;
7674 }
7675 #endif
7676 - printk("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
7677 - reason, smp_processor_id());
7678 - printk("Dazed and confused, but trying to continue\n");
7679 - printk("Do you have a strange power saving mode enabled?\n");
7680 + printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
7681 + "CPU %d.\n", reason, smp_processor_id());
7682 + printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
7683 + if (panic_on_unrecovered_nmi)
7684 + panic("NMI: Not continuing");
7685 +
7686 + printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
7687 }
7688
7689 static DEFINE_SPINLOCK(nmi_print_lock);
7690
7691 -void die_nmi (struct pt_regs *regs, const char *msg)
7692 +void __kprobes die_nmi(struct pt_regs *regs, const char *msg)
7693 {
7694 if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) ==
7695 NOTIFY_STOP)
7696 @@ -674,7 +757,7 @@
7697 do_exit(SIGSEGV);
7698 }
7699
7700 -static void default_do_nmi(struct pt_regs * regs)
7701 +static __kprobes void default_do_nmi(struct pt_regs * regs)
7702 {
7703 unsigned char reason = 0;
7704
7705 @@ -691,12 +774,12 @@
7706 * Ok, so this is none of the documented NMI sources,
7707 * so it must be the NMI watchdog.
7708 */
7709 - if (nmi_watchdog) {
7710 - nmi_watchdog_tick(regs);
7711 + if (nmi_watchdog_tick(regs, reason))
7712 return;
7713 - }
7714 + if (!do_nmi_callback(regs, smp_processor_id()))
7715 #endif
7716 - unknown_nmi_error(reason, regs);
7717 + unknown_nmi_error(reason, regs);
7718 +
7719 return;
7720 }
7721 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
7722 @@ -712,14 +795,7 @@
7723 reassert_nmi();
7724 }
7725
7726 -static int dummy_nmi_callback(struct pt_regs * regs, int cpu)
7727 -{
7728 - return 0;
7729 -}
7730 -
7731 -static nmi_callback_t nmi_callback = dummy_nmi_callback;
7732 -
7733 -fastcall void do_nmi(struct pt_regs * regs, long error_code)
7734 +fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code)
7735 {
7736 int cpu;
7737
7738 @@ -729,25 +805,11 @@
7739
7740 ++nmi_count(cpu);
7741
7742 - if (!rcu_dereference(nmi_callback)(regs, cpu))
7743 - default_do_nmi(regs);
7744 + default_do_nmi(regs);
7745
7746 nmi_exit();
7747 }
7748
7749 -void set_nmi_callback(nmi_callback_t callback)
7750 -{
7751 - vmalloc_sync_all();
7752 - rcu_assign_pointer(nmi_callback, callback);
7753 -}
7754 -EXPORT_SYMBOL_GPL(set_nmi_callback);
7755 -
7756 -void unset_nmi_callback(void)
7757 -{
7758 - nmi_callback = dummy_nmi_callback;
7759 -}
7760 -EXPORT_SYMBOL_GPL(unset_nmi_callback);
7761 -
7762 #ifdef CONFIG_KPROBES
7763 fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code)
7764 {
7765 --- a/arch/x86/kernel/traps_64-xen.c
7766 +++ b/arch/x86/kernel/traps_64-xen.c
7767 @@ -23,6 +23,7 @@
7768 #include <linux/delay.h>
7769 #include <linux/spinlock.h>
7770 #include <linux/interrupt.h>
7771 +#include <linux/kallsyms.h>
7772 #include <linux/module.h>
7773 #include <linux/moduleparam.h>
7774 #include <linux/nmi.h>
7775 @@ -45,6 +46,7 @@
7776 #include <asm/pda.h>
7777 #include <asm/proto.h>
7778 #include <asm/nmi.h>
7779 +#include <asm/stacktrace.h>
7780
7781 asmlinkage void divide_error(void);
7782 asmlinkage void debug(void);
7783 @@ -114,7 +116,6 @@
7784 #endif
7785
7786 #ifdef CONFIG_KALLSYMS
7787 -# include <linux/kallsyms.h>
7788 void printk_address(unsigned long address)
7789 {
7790 unsigned long offset = 0, symsize;
7791 @@ -142,7 +143,7 @@
7792 #endif
7793
7794 static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
7795 - unsigned *usedp, const char **idp)
7796 + unsigned *usedp, char **idp)
7797 {
7798 #ifndef CONFIG_X86_NO_TSS
7799 static char ids[][8] = {
7800 @@ -162,26 +163,7 @@
7801 * 'stack' is in one of them:
7802 */
7803 for (k = 0; k < N_EXCEPTION_STACKS; k++) {
7804 - unsigned long end;
7805 -
7806 - /*
7807 - * set 'end' to the end of the exception stack.
7808 - */
7809 - switch (k + 1) {
7810 - /*
7811 - * TODO: this block is not needed i think, because
7812 - * setup64.c:cpu_init() sets up t->ist[DEBUG_STACK]
7813 - * properly too.
7814 - */
7815 -#if DEBUG_STKSZ > EXCEPTION_STKSZ
7816 - case DEBUG_STACK:
7817 - end = cpu_pda(cpu)->debugstack + DEBUG_STKSZ;
7818 - break;
7819 -#endif
7820 - default:
7821 - end = per_cpu(orig_ist, cpu).ist[k];
7822 - break;
7823 - }
7824 + unsigned long end = per_cpu(orig_ist, cpu).ist[k];
7825 /*
7826 * Is 'stack' above this exception frame's end?
7827 * If yes then skip to the next frame.
7828 @@ -236,13 +218,19 @@
7829 return NULL;
7830 }
7831
7832 -static int show_trace_unwind(struct unwind_frame_info *info, void *context)
7833 +struct ops_and_data {
7834 + struct stacktrace_ops *ops;
7835 + void *data;
7836 +};
7837 +
7838 +static int dump_trace_unwind(struct unwind_frame_info *info, void *context)
7839 {
7840 + struct ops_and_data *oad = (struct ops_and_data *)context;
7841 int n = 0;
7842
7843 while (unwind(info) == 0 && UNW_PC(info)) {
7844 n++;
7845 - printk_address(UNW_PC(info));
7846 + oad->ops->address(oad->data, UNW_PC(info));
7847 if (arch_unw_user_mode(info))
7848 break;
7849 }
7850 @@ -256,13 +244,19 @@
7851 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
7852 */
7853
7854 -void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack)
7855 +static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
7856 {
7857 - const unsigned cpu = safe_smp_processor_id();
7858 + void *t = (void *)tinfo;
7859 + return p > t && p < t + THREAD_SIZE - 3;
7860 +}
7861 +
7862 +void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack,
7863 + struct stacktrace_ops *ops, void *data)
7864 +{
7865 + const unsigned cpu = smp_processor_id();
7866 unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
7867 unsigned used = 0;
7868 -
7869 - printk("\nCall Trace:\n");
7870 + struct thread_info *tinfo;
7871
7872 if (!tsk)
7873 tsk = current;
7874 @@ -270,32 +264,47 @@
7875 if (call_trace >= 0) {
7876 int unw_ret = 0;
7877 struct unwind_frame_info info;
7878 + struct ops_and_data oad = { .ops = ops, .data = data };
7879
7880 if (regs) {
7881 if (unwind_init_frame_info(&info, tsk, regs) == 0)
7882 - unw_ret = show_trace_unwind(&info, NULL);
7883 + unw_ret = dump_trace_unwind(&info, &oad);
7884 } else if (tsk == current)
7885 - unw_ret = unwind_init_running(&info, show_trace_unwind, NULL);
7886 + unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad);
7887 else {
7888 if (unwind_init_blocked(&info, tsk) == 0)
7889 - unw_ret = show_trace_unwind(&info, NULL);
7890 + unw_ret = dump_trace_unwind(&info, &oad);
7891 }
7892 if (unw_ret > 0) {
7893 if (call_trace == 1 && !arch_unw_user_mode(&info)) {
7894 - print_symbol("DWARF2 unwinder stuck at %s\n",
7895 + ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n",
7896 UNW_PC(&info));
7897 if ((long)UNW_SP(&info) < 0) {
7898 - printk("Leftover inexact backtrace:\n");
7899 + ops->warning(data, "Leftover inexact backtrace:\n");
7900 stack = (unsigned long *)UNW_SP(&info);
7901 + if (!stack)
7902 + return;
7903 } else
7904 - printk("Full inexact backtrace again:\n");
7905 + ops->warning(data, "Full inexact backtrace again:\n");
7906 } else if (call_trace >= 1)
7907 return;
7908 else
7909 - printk("Full inexact backtrace again:\n");
7910 + ops->warning(data, "Full inexact backtrace again:\n");
7911 } else
7912 - printk("Inexact backtrace:\n");
7913 + ops->warning(data, "Inexact backtrace:\n");
7914 + }
7915 + if (!stack) {
7916 + unsigned long dummy;
7917 + stack = &dummy;
7918 + if (tsk && tsk != current)
7919 + stack = (unsigned long *)tsk->thread.rsp;
7920 }
7921 + /*
7922 + * Align the stack pointer on word boundary, later loops
7923 + * rely on that (and corruption / debug info bugs can cause
7924 + * unaligned values here):
7925 + */
7926 + stack = (unsigned long *)((unsigned long)stack & ~(sizeof(long)-1));
7927
7928 /*
7929 * Print function call entries within a stack. 'cond' is the
7930 @@ -305,7 +314,9 @@
7931 #define HANDLE_STACK(cond) \
7932 do while (cond) { \
7933 unsigned long addr = *stack++; \
7934 - if (kernel_text_address(addr)) { \
7935 + if (oops_in_progress ? \
7936 + __kernel_text_address(addr) : \
7937 + kernel_text_address(addr)) { \
7938 /* \
7939 * If the address is either in the text segment of the \
7940 * kernel, or in the region which contains vmalloc'ed \
7941 @@ -314,7 +325,7 @@
7942 * down the cause of the crash will be able to figure \
7943 * out the call path that was taken. \
7944 */ \
7945 - printk_address(addr); \
7946 + ops->address(data, addr); \
7947 } \
7948 } while (0)
7949
7950 @@ -323,16 +334,17 @@
7951 * current stack address. If the stacks consist of nested
7952 * exceptions
7953 */
7954 - for ( ; ; ) {
7955 - const char *id;
7956 + for (;;) {
7957 + char *id;
7958 unsigned long *estack_end;
7959 estack_end = in_exception_stack(cpu, (unsigned long)stack,
7960 &used, &id);
7961
7962 if (estack_end) {
7963 - printk(" <%s>", id);
7964 + if (ops->stack(data, id) < 0)
7965 + break;
7966 HANDLE_STACK (stack < estack_end);
7967 - printk(" <EOE>");
7968 + ops->stack(data, "<EOE>");
7969 /*
7970 * We link to the next stack via the
7971 * second-to-last pointer (index -2 to end) in the
7972 @@ -347,7 +359,8 @@
7973 (IRQSTACKSIZE - 64) / sizeof(*irqstack);
7974
7975 if (stack >= irqstack && stack < irqstack_end) {
7976 - printk(" <IRQ>");
7977 + if (ops->stack(data, "IRQ") < 0)
7978 + break;
7979 HANDLE_STACK (stack < irqstack_end);
7980 /*
7981 * We link to the next stack (which would be
7982 @@ -356,7 +369,7 @@
7983 */
7984 stack = (unsigned long *) (irqstack_end[-1]);
7985 irqstack_end = NULL;
7986 - printk(" <EOI>");
7987 + ops->stack(data, "EOI");
7988 continue;
7989 }
7990 }
7991 @@ -364,19 +377,58 @@
7992 }
7993
7994 /*
7995 - * This prints the process stack:
7996 + * This handles the process stack:
7997 */
7998 - HANDLE_STACK (((long) stack & (THREAD_SIZE-1)) != 0);
7999 + tinfo = current_thread_info();
8000 + HANDLE_STACK (valid_stack_ptr(tinfo, stack));
8001 #undef HANDLE_STACK
8002 +}
8003 +EXPORT_SYMBOL(dump_trace);
8004 +
8005 +static void
8006 +print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
8007 +{
8008 + print_symbol(msg, symbol);
8009 + printk("\n");
8010 +}
8011 +
8012 +static void print_trace_warning(void *data, char *msg)
8013 +{
8014 + printk("%s\n", msg);
8015 +}
8016 +
8017 +static int print_trace_stack(void *data, char *name)
8018 +{
8019 + printk(" <%s> ", name);
8020 + return 0;
8021 +}
8022 +
8023 +static void print_trace_address(void *data, unsigned long addr)
8024 +{
8025 + printk_address(addr);
8026 +}
8027 +
8028 +static struct stacktrace_ops print_trace_ops = {
8029 + .warning = print_trace_warning,
8030 + .warning_symbol = print_trace_warning_symbol,
8031 + .stack = print_trace_stack,
8032 + .address = print_trace_address,
8033 +};
8034
8035 +void
8036 +show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack)
8037 +{
8038 + printk("\nCall Trace:\n");
8039 + dump_trace(tsk, regs, stack, &print_trace_ops, NULL);
8040 printk("\n");
8041 }
8042
8043 -static void _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long * rsp)
8044 +static void
8045 +_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp)
8046 {
8047 unsigned long *stack;
8048 int i;
8049 - const int cpu = safe_smp_processor_id();
8050 + const int cpu = smp_processor_id();
8051 unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr);
8052 unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
8053
8054 @@ -430,7 +482,7 @@
8055 int i;
8056 int in_kernel = !user_mode(regs);
8057 unsigned long rsp;
8058 - const int cpu = safe_smp_processor_id();
8059 + const int cpu = smp_processor_id();
8060 struct task_struct *cur = cpu_pda(cpu)->pcurrent;
8061
8062 rsp = regs->rsp;
8063 @@ -505,9 +557,11 @@
8064
8065 unsigned __kprobes long oops_begin(void)
8066 {
8067 - int cpu = safe_smp_processor_id();
8068 + int cpu = smp_processor_id();
8069 unsigned long flags;
8070
8071 + oops_enter();
8072 +
8073 /* racy, but better than risking deadlock. */
8074 local_irq_save(flags);
8075 if (!spin_trylock(&die_lock)) {
8076 @@ -536,6 +590,7 @@
8077 spin_unlock_irqrestore(&die_lock, flags);
8078 if (panic_on_oops)
8079 panic("Fatal exception");
8080 + oops_exit();
8081 }
8082
8083 void __kprobes __die(const char * str, struct pt_regs * regs, long err)
8084 @@ -573,7 +628,7 @@
8085 }
8086
8087 #ifdef CONFIG_X86_LOCAL_APIC
8088 -void __kprobes die_nmi(char *str, struct pt_regs *regs)
8089 +void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
8090 {
8091 unsigned long flags = oops_begin();
8092
8093 @@ -581,13 +636,12 @@
8094 * We are in trouble anyway, lets at least try
8095 * to get a message out.
8096 */
8097 - printk(str, safe_smp_processor_id());
8098 + printk(str, smp_processor_id());
8099 show_registers(regs);
8100 if (kexec_should_crash(current))
8101 crash_kexec(regs);
8102 - if (panic_on_timeout || panic_on_oops)
8103 - panic("nmi watchdog");
8104 - printk("console shuts up ...\n");
8105 + if (do_panic || panic_on_oops)
8106 + panic("Non maskable interrupt");
8107 oops_end(flags);
8108 nmi_exit();
8109 local_irq_enable();
8110 @@ -734,8 +788,15 @@
8111 static __kprobes void
8112 mem_parity_error(unsigned char reason, struct pt_regs * regs)
8113 {
8114 - printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n");
8115 - printk("You probably have a hardware problem with your RAM chips\n");
8116 + printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
8117 + reason);
8118 + printk(KERN_EMERG "You probably have a hardware problem with your "
8119 + "RAM chips\n");
8120 +
8121 + if (panic_on_unrecovered_nmi)
8122 + panic("NMI: Not continuing");
8123 +
8124 + printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
8125
8126 #if 0 /* XEN */
8127 /* Clear and disable the memory parity error line. */
8128 @@ -762,9 +823,15 @@
8129
8130 static __kprobes void
8131 unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
8132 -{ printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
8133 - printk("Dazed and confused, but trying to continue\n");
8134 - printk("Do you have a strange power saving mode enabled?\n");
8135 +{
8136 + printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
8137 + reason);
8138 + printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
8139 +
8140 + if (panic_on_unrecovered_nmi)
8141 + panic("NMI: Not continuing");
8142 +
8143 + printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
8144 }
8145
8146 /* Runs on IST stack. This code must keep interrupts off all the time.
8147 @@ -789,12 +856,12 @@
8148 * Ok, so this is none of the documented NMI sources,
8149 * so it must be the NMI watchdog.
8150 */
8151 - if (nmi_watchdog > 0) {
8152 - nmi_watchdog_tick(regs,reason);
8153 + if (nmi_watchdog_tick(regs,reason))
8154 return;
8155 - }
8156 #endif
8157 - unknown_nmi_error(reason, regs);
8158 + if (!do_nmi_callback(regs,cpu))
8159 + unknown_nmi_error(reason, regs);
8160 +
8161 return;
8162 }
8163 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
8164 @@ -1081,6 +1148,7 @@
8165 init_fpu(me);
8166 restore_fpu_checking(&me->thread.i387.fxsave);
8167 task_thread_info(me)->status |= TS_USEDFPU;
8168 + me->fpu_counter++;
8169 }
8170
8171
8172 @@ -1141,24 +1209,30 @@
8173 }
8174
8175
8176 -/* Actual parsing is done early in setup.c. */
8177 -static int __init oops_dummy(char *s)
8178 +static int __init oops_setup(char *s)
8179 {
8180 - panic_on_oops = 1;
8181 - return 1;
8182 + if (!s)
8183 + return -EINVAL;
8184 + if (!strcmp(s, "panic"))
8185 + panic_on_oops = 1;
8186 + return 0;
8187 }
8188 -__setup("oops=", oops_dummy);
8189 +early_param("oops", oops_setup);
8190
8191 static int __init kstack_setup(char *s)
8192 {
8193 + if (!s)
8194 + return -EINVAL;
8195 kstack_depth_to_print = simple_strtoul(s,NULL,0);
8196 - return 1;
8197 + return 0;
8198 }
8199 -__setup("kstack=", kstack_setup);
8200 +early_param("kstack", kstack_setup);
8201
8202 #ifdef CONFIG_STACK_UNWIND
8203 static int __init call_trace_setup(char *s)
8204 {
8205 + if (!s)
8206 + return -EINVAL;
8207 if (strcmp(s, "old") == 0)
8208 call_trace = -1;
8209 else if (strcmp(s, "both") == 0)
8210 @@ -1167,7 +1241,7 @@
8211 call_trace = 1;
8212 else if (strcmp(s, "new") == 0)
8213 call_trace = 2;
8214 - return 1;
8215 + return 0;
8216 }
8217 -__setup("call_trace=", call_trace_setup);
8218 +early_param("call_trace", call_trace_setup);
8219 #endif
8220 --- a/arch/x86/kernel/vsyscall_64-xen.c
8221 +++ b/arch/x86/kernel/vsyscall_64-xen.c
8222 @@ -26,6 +26,10 @@
8223 #include <linux/seqlock.h>
8224 #include <linux/jiffies.h>
8225 #include <linux/sysctl.h>
8226 +#include <linux/getcpu.h>
8227 +#include <linux/cpu.h>
8228 +#include <linux/smp.h>
8229 +#include <linux/notifier.h>
8230
8231 #include <asm/vsyscall.h>
8232 #include <asm/pgtable.h>
8233 @@ -33,11 +37,15 @@
8234 #include <asm/fixmap.h>
8235 #include <asm/errno.h>
8236 #include <asm/io.h>
8237 +#include <asm/segment.h>
8238 +#include <asm/desc.h>
8239 +#include <asm/topology.h>
8240
8241 #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
8242
8243 int __sysctl_vsyscall __section_sysctl_vsyscall = 1;
8244 seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED;
8245 +int __vgetcpu_mode __section_vgetcpu_mode;
8246
8247 #include <asm/unistd.h>
8248
8249 @@ -61,8 +69,7 @@
8250 sequence = read_seqbegin(&__xtime_lock);
8251
8252 sec = __xtime.tv_sec;
8253 - usec = (__xtime.tv_nsec / 1000) +
8254 - (__jiffies - __wall_jiffies) * (1000000 / HZ);
8255 + usec = __xtime.tv_nsec / 1000;
8256
8257 if (__vxtime.mode != VXTIME_HPET) {
8258 t = get_cycles_sync();
8259 @@ -72,7 +79,8 @@
8260 __vxtime.tsc_quot) >> 32;
8261 /* See comment in x86_64 do_gettimeofday. */
8262 } else {
8263 - usec += ((readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0) -
8264 + usec += ((readl((void __iomem *)
8265 + fix_to_virt(VSYSCALL_HPET) + 0xf0) -
8266 __vxtime.last) * __vxtime.quot) >> 32;
8267 }
8268 } while (read_seqretry(&__xtime_lock, sequence));
8269 @@ -127,9 +135,46 @@
8270 return __xtime.tv_sec;
8271 }
8272
8273 -long __vsyscall(2) venosys_0(void)
8274 -{
8275 - return -ENOSYS;
8276 +/* Fast way to get current CPU and node.
8277 + This helps to do per node and per CPU caches in user space.
8278 + The result is not guaranteed without CPU affinity, but usually
8279 + works out because the scheduler tries to keep a thread on the same
8280 + CPU.
8281 +
8282 + tcache must point to a two element sized long array.
8283 + All arguments can be NULL. */
8284 +long __vsyscall(2)
8285 +vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
8286 +{
8287 + unsigned int dummy, p;
8288 + unsigned long j = 0;
8289 +
8290 + /* Fast cache - only recompute value once per jiffies and avoid
8291 + relatively costly rdtscp/cpuid otherwise.
8292 + This works because the scheduler usually keeps the process
8293 + on the same CPU and this syscall doesn't guarantee its
8294 + results anyways.
8295 + We do this here because otherwise user space would do it on
8296 + its own in a likely inferior way (no access to jiffies).
8297 + If you don't like it pass NULL. */
8298 + if (tcache && tcache->blob[0] == (j = __jiffies)) {
8299 + p = tcache->blob[1];
8300 + } else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
8301 + /* Load per CPU data from RDTSCP */
8302 + rdtscp(dummy, dummy, p);
8303 + } else {
8304 + /* Load per CPU data from GDT */
8305 + asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
8306 + }
8307 + if (tcache) {
8308 + tcache->blob[0] = j;
8309 + tcache->blob[1] = p;
8310 + }
8311 + if (cpu)
8312 + *cpu = p & 0xfff;
8313 + if (node)
8314 + *node = p >> 12;
8315 + return 0;
8316 }
8317
8318 long __vsyscall(3) venosys_1(void)
8319 @@ -149,7 +194,8 @@
8320 void __user *buffer, size_t *lenp, loff_t *ppos)
8321 {
8322 extern u16 vsysc1, vsysc2;
8323 - u16 *map1, *map2;
8324 + u16 __iomem *map1;
8325 + u16 __iomem *map2;
8326 int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
8327 if (!write)
8328 return ret;
8329 @@ -164,11 +210,11 @@
8330 goto out;
8331 }
8332 if (!sysctl_vsyscall) {
8333 - *map1 = SYSCALL;
8334 - *map2 = SYSCALL;
8335 + writew(SYSCALL, map1);
8336 + writew(SYSCALL, map2);
8337 } else {
8338 - *map1 = NOP2;
8339 - *map2 = NOP2;
8340 + writew(NOP2, map1);
8341 + writew(NOP2, map2);
8342 }
8343 iounmap(map2);
8344 out:
8345 @@ -200,6 +246,48 @@
8346
8347 #endif
8348
8349 +/* Assume __initcall executes before all user space. Hopefully kmod
8350 + doesn't violate that. We'll find out if it does. */
8351 +static void __cpuinit vsyscall_set_cpu(int cpu)
8352 +{
8353 + unsigned long d;
8354 + unsigned long node = 0;
8355 +#ifdef CONFIG_NUMA
8356 + node = cpu_to_node[cpu];
8357 +#endif
8358 + if (cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP))
8359 + write_rdtscp_aux((node << 12) | cpu);
8360 +
8361 + /* Store cpu number in limit so that it can be loaded quickly
8362 + in user space in vgetcpu.
8363 + 12 bits for the CPU and 8 bits for the node. */
8364 + d = 0x0f40000000000ULL;
8365 + d |= cpu;
8366 + d |= (node & 0xf) << 12;
8367 + d |= (node >> 4) << 48;
8368 + if (HYPERVISOR_update_descriptor(virt_to_machine(cpu_gdt(cpu)
8369 + + GDT_ENTRY_PER_CPU),
8370 + d))
8371 + BUG();
8372 +}
8373 +
8374 +static void __cpuinit cpu_vsyscall_init(void *arg)
8375 +{
8376 + /* preemption should be already off */
8377 + vsyscall_set_cpu(raw_smp_processor_id());
8378 +}
8379 +
8380 +#ifdef CONFIG_HOTPLUG_CPU
8381 +static int __cpuinit
8382 +cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
8383 +{
8384 + long cpu = (long)arg;
8385 + if (action == CPU_ONLINE)
8386 + smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1);
8387 + return NOTIFY_DONE;
8388 +}
8389 +#endif
8390 +
8391 static void __init map_vsyscall(void)
8392 {
8393 extern char __vsyscall_0;
8394 @@ -214,13 +302,20 @@
8395 VSYSCALL_ADDR(__NR_vgettimeofday)));
8396 BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
8397 BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
8398 + BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
8399 map_vsyscall();
8400 #ifdef CONFIG_XEN
8401 sysctl_vsyscall = 0; /* disable vgettimeofay() */
8402 + if (boot_cpu_has(X86_FEATURE_RDTSCP))
8403 + vgetcpu_mode = VGETCPU_RDTSCP;
8404 + else
8405 + vgetcpu_mode = VGETCPU_LSL;
8406 #endif
8407 #ifdef CONFIG_SYSCTL
8408 register_sysctl_table(kernel_root_table2, 0);
8409 #endif
8410 + on_each_cpu(cpu_vsyscall_init, NULL, 0, 1);
8411 + hotcpu_notifier(cpu_vsyscall_notifier, 0);
8412 return 0;
8413 }
8414
8415 --- a/arch/x86/mach-xen/setup.c
8416 +++ b/arch/x86/mach-xen/setup.c
8417 @@ -103,8 +103,10 @@
8418
8419 setup_xen_features();
8420
8421 - if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
8422 - set_fixaddr_top(pp.virt_start);
8423 + if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) {
8424 + hypervisor_virt_start = pp.virt_start;
8425 + reserve_top_address(0UL - pp.virt_start);
8426 + }
8427
8428 if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
8429 machine_to_phys_mapping = (unsigned long *)mapping.v_start;
8430 --- a/arch/x86/mm/fault_32-xen.c
8431 +++ b/arch/x86/mm/fault_32-xen.c
8432 @@ -27,21 +27,24 @@
8433 #include <asm/uaccess.h>
8434 #include <asm/desc.h>
8435 #include <asm/kdebug.h>
8436 +#include <asm/segment.h>
8437
8438 extern void die(const char *,struct pt_regs *,long);
8439
8440 -#ifdef CONFIG_KPROBES
8441 -ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
8442 +static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
8443 +
8444 int register_page_fault_notifier(struct notifier_block *nb)
8445 {
8446 vmalloc_sync_all();
8447 return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
8448 }
8449 +EXPORT_SYMBOL_GPL(register_page_fault_notifier);
8450
8451 int unregister_page_fault_notifier(struct notifier_block *nb)
8452 {
8453 return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
8454 }
8455 +EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);
8456
8457 static inline int notify_page_fault(enum die_val val, const char *str,
8458 struct pt_regs *regs, long err, int trap, int sig)
8459 @@ -55,14 +58,6 @@
8460 };
8461 return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args);
8462 }
8463 -#else
8464 -static inline int notify_page_fault(enum die_val val, const char *str,
8465 - struct pt_regs *regs, long err, int trap, int sig)
8466 -{
8467 - return NOTIFY_DONE;
8468 -}
8469 -#endif
8470 -
8471
8472 /*
8473 * Unlock any spinlocks which will prevent us from getting the
8474 @@ -119,10 +114,10 @@
8475 }
8476
8477 /* The standard kernel/user address space limit. */
8478 - *eip_limit = (seg & 2) ? USER_DS.seg : KERNEL_DS.seg;
8479 + *eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg;
8480
8481 /* By far the most common cases. */
8482 - if (likely(seg == __USER_CS || seg == GET_KERNEL_CS()))
8483 + if (likely(SEGMENT_IS_FLAT_CODE(seg)))
8484 return eip;
8485
8486 /* Check the segment exists, is within the current LDT/GDT size,
8487 @@ -559,11 +554,7 @@
8488 write = 0;
8489 switch (error_code & 3) {
8490 default: /* 3: write, present */
8491 -#ifdef TEST_VERIFY_AREA
8492 - if (regs->cs == GET_KERNEL_CS())
8493 - printk("WP fault at %08lx\n", regs->eip);
8494 -#endif
8495 - /* fall through */
8496 + /* fall through */
8497 case 2: /* write, not present */
8498 if (!(vma->vm_flags & VM_WRITE))
8499 goto bad_area;
8500 @@ -572,7 +563,7 @@
8501 case 1: /* read, present */
8502 goto bad_area;
8503 case 0: /* read, not present */
8504 - if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
8505 + if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
8506 goto bad_area;
8507 }
8508
8509 @@ -704,7 +695,7 @@
8510 */
8511 out_of_memory:
8512 up_read(&mm->mmap_sem);
8513 - if (tsk->pid == 1) {
8514 + if (is_init(tsk)) {
8515 yield();
8516 down_read(&mm->mmap_sem);
8517 goto survive;
8518 --- a/arch/x86/mm/fault_64-xen.c
8519 +++ b/arch/x86/mm/fault_64-xen.c
8520 @@ -40,8 +40,7 @@
8521 #define PF_RSVD (1<<3)
8522 #define PF_INSTR (1<<4)
8523
8524 -#ifdef CONFIG_KPROBES
8525 -ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
8526 +static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
8527
8528 /* Hook to register for page fault notifications */
8529 int register_page_fault_notifier(struct notifier_block *nb)
8530 @@ -49,11 +48,13 @@
8531 vmalloc_sync_all();
8532 return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
8533 }
8534 +EXPORT_SYMBOL_GPL(register_page_fault_notifier);
8535
8536 int unregister_page_fault_notifier(struct notifier_block *nb)
8537 {
8538 return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
8539 }
8540 +EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);
8541
8542 static inline int notify_page_fault(enum die_val val, const char *str,
8543 struct pt_regs *regs, long err, int trap, int sig)
8544 @@ -67,13 +68,6 @@
8545 };
8546 return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args);
8547 }
8548 -#else
8549 -static inline int notify_page_fault(enum die_val val, const char *str,
8550 - struct pt_regs *regs, long err, int trap, int sig)
8551 -{
8552 - return NOTIFY_DONE;
8553 -}
8554 -#endif
8555
8556 void bust_spinlocks(int yes)
8557 {
8558 @@ -102,7 +96,7 @@
8559 static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
8560 unsigned long error_code)
8561 {
8562 - unsigned char *instr;
8563 + unsigned char __user *instr;
8564 int scan_more = 1;
8565 int prefetch = 0;
8566 unsigned char *max_instr;
8567 @@ -111,7 +105,7 @@
8568 if (error_code & PF_INSTR)
8569 return 0;
8570
8571 - instr = (unsigned char *)convert_rip_to_linear(current, regs);
8572 + instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
8573 max_instr = instr + 15;
8574
8575 if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
8576 @@ -122,7 +116,7 @@
8577 unsigned char instr_hi;
8578 unsigned char instr_lo;
8579
8580 - if (__get_user(opcode, instr))
8581 + if (__get_user(opcode, (char __user *)instr))
8582 break;
8583
8584 instr_hi = opcode & 0xf0;
8585 @@ -160,7 +154,7 @@
8586 case 0x00:
8587 /* Prefetch instruction is 0x0F0D or 0x0F18 */
8588 scan_more = 0;
8589 - if (__get_user(opcode, instr))
8590 + if (__get_user(opcode, (char __user *)instr))
8591 break;
8592 prefetch = (instr_lo == 0xF) &&
8593 (opcode == 0x0D || opcode == 0x18);
8594 @@ -176,7 +170,7 @@
8595 static int bad_address(void *p)
8596 {
8597 unsigned long dummy;
8598 - return __get_user(dummy, (unsigned long *)p);
8599 + return __get_user(dummy, (unsigned long __user *)p);
8600 }
8601
8602 void dump_pagetable(unsigned long address)
8603 @@ -248,7 +242,7 @@
8604
8605 int unhandled_signal(struct task_struct *tsk, int sig)
8606 {
8607 - if (tsk->pid == 1)
8608 + if (is_init(tsk))
8609 return 1;
8610 if (tsk->ptrace & PT_PTRACED)
8611 return 0;
8612 @@ -300,7 +294,7 @@
8613 if (pgd_none(*pgd))
8614 set_pgd(pgd, *pgd_ref);
8615 else
8616 - BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref));
8617 + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
8618
8619 /* Below here mismatches are bugs because these lower tables
8620 are shared */
8621 @@ -309,7 +303,7 @@
8622 pud_ref = pud_offset(pgd_ref, address);
8623 if (pud_none(*pud_ref))
8624 return -1;
8625 - if (pud_none(*pud) || pud_page(*pud) != pud_page(*pud_ref))
8626 + if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
8627 BUG();
8628 pmd = pmd_offset(pud, address);
8629 pmd_ref = pmd_offset(pud_ref, address);
8630 @@ -531,7 +525,7 @@
8631 case PF_PROT: /* read, present */
8632 goto bad_area;
8633 case 0: /* read, not present */
8634 - if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
8635 + if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
8636 goto bad_area;
8637 }
8638
8639 @@ -647,7 +641,7 @@
8640 */
8641 out_of_memory:
8642 up_read(&mm->mmap_sem);
8643 - if (current->pid == 1) {
8644 + if (is_init(current)) {
8645 yield();
8646 goto again;
8647 }
8648 @@ -702,7 +696,7 @@
8649 if (pgd_none(*pgd))
8650 set_pgd(pgd, *pgd_ref);
8651 else
8652 - BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref));
8653 + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
8654 }
8655 spin_unlock(&pgd_lock);
8656 set_bit(pgd_index(address), insync);
8657 --- a/arch/x86/mm/highmem_32-xen.c
8658 +++ b/arch/x86/mm/highmem_32-xen.c
8659 @@ -38,11 +38,9 @@
8660
8661 idx = type + KM_TYPE_NR*smp_processor_id();
8662 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
8663 -#ifdef CONFIG_DEBUG_HIGHMEM
8664 if (!pte_none(*(kmap_pte-idx)))
8665 BUG();
8666 -#endif
8667 - set_pte_at_sync(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot));
8668 + set_pte_at(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot));
8669
8670 return (void*) vaddr;
8671 }
8672 @@ -62,36 +60,26 @@
8673
8674 void kunmap_atomic(void *kvaddr, enum km_type type)
8675 {
8676 -#if defined(CONFIG_DEBUG_HIGHMEM) || defined(CONFIG_XEN)
8677 unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
8678 enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
8679
8680 - if (vaddr < FIXADDR_START) { // FIXME
8681 +#ifdef CONFIG_DEBUG_HIGHMEM
8682 + if (vaddr >= PAGE_OFFSET && vaddr < (unsigned long)high_memory) {
8683 dec_preempt_count();
8684 preempt_check_resched();
8685 return;
8686 }
8687 -#endif
8688
8689 -#if defined(CONFIG_DEBUG_HIGHMEM)
8690 if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx))
8691 BUG();
8692 -
8693 - /*
8694 - * force other mappings to Oops if they'll try to access
8695 - * this pte without first remap it
8696 - */
8697 - pte_clear(&init_mm, vaddr, kmap_pte-idx);
8698 - __flush_tlb_one(vaddr);
8699 -#elif defined(CONFIG_XEN)
8700 +#endif
8701 /*
8702 - * We must ensure there are no dangling pagetable references when
8703 - * returning memory to Xen (decrease_reservation).
8704 - * XXX TODO: We could make this faster by only zapping when
8705 - * kmap_flush_unused is called but that is trickier and more invasive.
8706 + * Force other mappings to Oops if they'll try to access this pte
8707 + * without first remap it. Keeping stale mappings around is a bad idea
8708 + * also, in case the page changes cacheability attributes or becomes
8709 + * a protected page in a hypervisor.
8710 */
8711 - pte_clear(&init_mm, vaddr, kmap_pte-idx);
8712 -#endif
8713 + kpte_clear_flush(kmap_pte-idx, vaddr);
8714
8715 dec_preempt_count();
8716 preempt_check_resched();
8717 @@ -110,7 +98,6 @@
8718 idx = type + KM_TYPE_NR*smp_processor_id();
8719 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
8720 set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot));
8721 - __flush_tlb_one(vaddr);
8722
8723 return (void*) vaddr;
8724 }
8725 --- a/arch/x86/mm/hypervisor.c
8726 +++ b/arch/x86/mm/hypervisor.c
8727 @@ -569,7 +569,8 @@
8728 #define MAX_BATCHED_FULL_PTES 32
8729
8730 int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
8731 - unsigned long addr, unsigned long end, pgprot_t newprot)
8732 + unsigned long addr, unsigned long end, pgprot_t newprot,
8733 + int dirty_accountable)
8734 {
8735 int rc = 0, i = 0;
8736 mmu_update_t u[MAX_BATCHED_FULL_PTES];
8737 @@ -582,10 +583,14 @@
8738 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
8739 do {
8740 if (pte_present(*pte)) {
8741 + pte_t ptent = pte_modify(*pte, newprot);
8742 +
8743 + if (dirty_accountable && pte_dirty(ptent))
8744 + ptent = pte_mkwrite(ptent);
8745 u[i].ptr = (__pmd_val(*pmd) & PHYSICAL_PAGE_MASK)
8746 | ((unsigned long)pte & ~PAGE_MASK)
8747 | MMU_PT_UPDATE_PRESERVE_AD;
8748 - u[i].val = __pte_val(pte_modify(*pte, newprot));
8749 + u[i].val = __pte_val(ptent);
8750 if (++i == MAX_BATCHED_FULL_PTES) {
8751 if ((rc = HYPERVISOR_mmu_update(
8752 &u[0], i, NULL, DOMID_SELF)) != 0)
8753 --- a/arch/x86/mm/init_32-xen.c
8754 +++ b/arch/x86/mm/init_32-xen.c
8755 @@ -464,16 +464,22 @@
8756 * on Enable
8757 * off Disable
8758 */
8759 -void __init noexec_setup(const char *str)
8760 +static int __init noexec_setup(char *str)
8761 {
8762 - if (!strncmp(str, "on",2) && cpu_has_nx) {
8763 - __supported_pte_mask |= _PAGE_NX;
8764 - disable_nx = 0;
8765 - } else if (!strncmp(str,"off",3)) {
8766 + if (!str || !strcmp(str, "on")) {
8767 + if (cpu_has_nx) {
8768 + __supported_pte_mask |= _PAGE_NX;
8769 + disable_nx = 0;
8770 + }
8771 + } else if (!strcmp(str,"off")) {
8772 disable_nx = 1;
8773 __supported_pte_mask &= ~_PAGE_NX;
8774 - }
8775 + } else
8776 + return -EINVAL;
8777 +
8778 + return 0;
8779 }
8780 +early_param("noexec", noexec_setup);
8781
8782 int nx_enabled = 0;
8783 #ifdef CONFIG_X86_PAE
8784 @@ -516,6 +522,7 @@
8785 pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
8786 else
8787 pte->pte_high |= 1 << (_PAGE_BIT_NX - 32);
8788 + pte_update_defer(&init_mm, vaddr, pte);
8789 __flush_tlb_all();
8790 out:
8791 return ret;
8792 @@ -598,18 +605,6 @@
8793 }
8794 }
8795
8796 -static void __init set_max_mapnr_init(void)
8797 -{
8798 -#ifdef CONFIG_HIGHMEM
8799 - num_physpages = highend_pfn;
8800 -#else
8801 - num_physpages = max_low_pfn;
8802 -#endif
8803 -#ifdef CONFIG_FLATMEM
8804 - max_mapnr = num_physpages;
8805 -#endif
8806 -}
8807 -
8808 static struct kcore_list kcore_mem, kcore_vmalloc;
8809
8810 void __init mem_init(void)
8811 @@ -630,8 +625,7 @@
8812 #endif
8813
8814 #ifdef CONFIG_FLATMEM
8815 - if (!mem_map)
8816 - BUG();
8817 + BUG_ON(!mem_map);
8818 #endif
8819
8820 bad_ppro = ppro_with_ram_bug();
8821 @@ -646,17 +640,6 @@
8822 }
8823 #endif
8824
8825 - set_max_mapnr_init();
8826 -
8827 -#ifdef CONFIG_HIGHMEM
8828 - high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
8829 -#else
8830 - high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
8831 -#endif
8832 - printk("vmalloc area: %lx-%lx, maxmem %lx\n",
8833 - VMALLOC_START,VMALLOC_END,MAXMEM);
8834 - BUG_ON(VMALLOC_START > VMALLOC_END);
8835 -
8836 /* this will put all low memory onto the freelists */
8837 totalram_pages += free_all_bootmem();
8838 /* XEN: init and count low-mem pages outside initial allocation. */
8839 @@ -694,6 +677,48 @@
8840 (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
8841 );
8842
8843 +#if 1 /* double-sanity-check paranoia */
8844 + printk("virtual kernel memory layout:\n"
8845 + " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
8846 +#ifdef CONFIG_HIGHMEM
8847 + " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
8848 +#endif
8849 + " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n"
8850 + " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n"
8851 + " .init : 0x%08lx - 0x%08lx (%4ld kB)\n"
8852 + " .data : 0x%08lx - 0x%08lx (%4ld kB)\n"
8853 + " .text : 0x%08lx - 0x%08lx (%4ld kB)\n",
8854 + FIXADDR_START, FIXADDR_TOP,
8855 + (FIXADDR_TOP - FIXADDR_START) >> 10,
8856 +
8857 +#ifdef CONFIG_HIGHMEM
8858 + PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
8859 + (LAST_PKMAP*PAGE_SIZE) >> 10,
8860 +#endif
8861 +
8862 + VMALLOC_START, VMALLOC_END,
8863 + (VMALLOC_END - VMALLOC_START) >> 20,
8864 +
8865 + (unsigned long)__va(0), (unsigned long)high_memory,
8866 + ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
8867 +
8868 + (unsigned long)&__init_begin, (unsigned long)&__init_end,
8869 + ((unsigned long)&__init_end - (unsigned long)&__init_begin) >> 10,
8870 +
8871 + (unsigned long)&_etext, (unsigned long)&_edata,
8872 + ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
8873 +
8874 + (unsigned long)&_text, (unsigned long)&_etext,
8875 + ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
8876 +
8877 +#ifdef CONFIG_HIGHMEM
8878 + BUG_ON(PKMAP_BASE+LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
8879 + BUG_ON(VMALLOC_END > PKMAP_BASE);
8880 +#endif
8881 + BUG_ON(VMALLOC_START > VMALLOC_END);
8882 + BUG_ON((unsigned long)high_memory > VMALLOC_START);
8883 +#endif /* double-sanity-check paranoia */
8884 +
8885 #ifdef CONFIG_X86_PAE
8886 if (!cpu_has_pae)
8887 panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!");
8888 @@ -724,7 +749,7 @@
8889 int arch_add_memory(int nid, u64 start, u64 size)
8890 {
8891 struct pglist_data *pgdata = &contig_page_data;
8892 - struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1;
8893 + struct zone *zone = pgdata->node_zones + ZONE_HIGHMEM;
8894 unsigned long start_pfn = start >> PAGE_SHIFT;
8895 unsigned long nr_pages = size >> PAGE_SHIFT;
8896
8897 --- a/arch/x86/mm/init_64-xen.c
8898 +++ b/arch/x86/mm/init_64-xen.c
8899 @@ -61,8 +61,6 @@
8900
8901 extern unsigned long *contiguous_bitmap;
8902
8903 -static unsigned long dma_reserve __initdata;
8904 -
8905 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
8906 extern unsigned long start_pfn;
8907
8908 @@ -416,7 +414,6 @@
8909
8910 /* actually usually some more */
8911 if (size >= LARGE_PAGE_SIZE) {
8912 - printk("SMBIOS area too long %lu\n", size);
8913 return NULL;
8914 }
8915 set_pmd(temp_mappings[0].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
8916 @@ -438,13 +435,15 @@
8917 #endif
8918
8919 static void __meminit
8920 -phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end)
8921 +phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
8922 {
8923 - int i, k;
8924 + int i = pmd_index(address);
8925
8926 - for (i = 0; i < PTRS_PER_PMD; pmd++, i++) {
8927 + for (; i < PTRS_PER_PMD; i++) {
8928 unsigned long pte_phys;
8929 + pmd_t *pmd = pmd_page + i;
8930 pte_t *pte, *pte_save;
8931 + int k;
8932
8933 if (address >= end) {
8934 if (!after_bootmem)
8935 @@ -452,6 +451,12 @@
8936 set_pmd(pmd, __pmd(0));
8937 break;
8938 }
8939 +
8940 + if (__pmd_val(*pmd)) {
8941 + address += PMD_SIZE;
8942 + continue;
8943 + }
8944 +
8945 pte = alloc_static_page(&pte_phys);
8946 pte_save = pte;
8947 for (k = 0; k < PTRS_PER_PTE; pte++, k++, address += PTE_SIZE) {
8948 @@ -474,40 +479,35 @@
8949 static void __meminit
8950 phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
8951 {
8952 - pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address));
8953 -
8954 - if (pmd_none(*pmd)) {
8955 - spin_lock(&init_mm.page_table_lock);
8956 - phys_pmd_init(pmd, address, end);
8957 - spin_unlock(&init_mm.page_table_lock);
8958 - __flush_tlb_all();
8959 - }
8960 + pmd_t *pmd = pmd_offset(pud,0);
8961 + spin_lock(&init_mm.page_table_lock);
8962 + phys_pmd_init(pmd, address, end);
8963 + spin_unlock(&init_mm.page_table_lock);
8964 + __flush_tlb_all();
8965 }
8966
8967 -static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
8968 +static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
8969 {
8970 - long i = pud_index(address);
8971 -
8972 - pud = pud + i;
8973 -
8974 - if (after_bootmem && pud_val(*pud)) {
8975 - phys_pmd_update(pud, address, end);
8976 - return;
8977 - }
8978 + int i = pud_index(addr);
8979
8980 - for (; i < PTRS_PER_PUD; pud++, i++) {
8981 - unsigned long paddr, pmd_phys;
8982 + for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) {
8983 + unsigned long pmd_phys;
8984 + pud_t *pud = pud_page + pud_index(addr);
8985 pmd_t *pmd;
8986
8987 - paddr = (address & PGDIR_MASK) + i*PUD_SIZE;
8988 - if (paddr >= end)
8989 + if (addr >= end)
8990 break;
8991
8992 + if (__pud_val(*pud)) {
8993 + phys_pmd_update(pud, addr, end);
8994 + continue;
8995 + }
8996 +
8997 pmd = alloc_static_page(&pmd_phys);
8998 early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
8999 spin_lock(&init_mm.page_table_lock);
9000 set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
9001 - phys_pmd_init(pmd, paddr, end);
9002 + phys_pmd_init(pmd, addr, end);
9003 spin_unlock(&init_mm.page_table_lock);
9004 }
9005 __flush_tlb();
9006 @@ -771,69 +771,18 @@
9007 #endif
9008 }
9009
9010 -/* Compute zone sizes for the DMA and DMA32 zones in a node. */
9011 -__init void
9012 -size_zones(unsigned long *z, unsigned long *h,
9013 - unsigned long start_pfn, unsigned long end_pfn)
9014 -{
9015 - int i;
9016 - unsigned long w;
9017 -
9018 - for (i = 0; i < MAX_NR_ZONES; i++)
9019 - z[i] = 0;
9020 -
9021 - if (start_pfn < MAX_DMA_PFN)
9022 - z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
9023 - if (start_pfn < MAX_DMA32_PFN) {
9024 - unsigned long dma32_pfn = MAX_DMA32_PFN;
9025 - if (dma32_pfn > end_pfn)
9026 - dma32_pfn = end_pfn;
9027 - z[ZONE_DMA32] = dma32_pfn - start_pfn;
9028 - }
9029 - z[ZONE_NORMAL] = end_pfn - start_pfn;
9030 -
9031 - /* Remove lower zones from higher ones. */
9032 - w = 0;
9033 - for (i = 0; i < MAX_NR_ZONES; i++) {
9034 - if (z[i])
9035 - z[i] -= w;
9036 - w += z[i];
9037 - }
9038 -
9039 - /* Compute holes */
9040 - w = start_pfn;
9041 - for (i = 0; i < MAX_NR_ZONES; i++) {
9042 - unsigned long s = w;
9043 - w += z[i];
9044 - h[i] = e820_hole_size(s, w);
9045 - }
9046 -
9047 - /* Add the space pace needed for mem_map to the holes too. */
9048 - for (i = 0; i < MAX_NR_ZONES; i++)
9049 - h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE;
9050 -
9051 - /* The 16MB DMA zone has the kernel and other misc mappings.
9052 - Account them too */
9053 - if (h[ZONE_DMA]) {
9054 - h[ZONE_DMA] += dma_reserve;
9055 - if (h[ZONE_DMA] >= z[ZONE_DMA]) {
9056 - printk(KERN_WARNING
9057 - "Kernel too large and filling up ZONE_DMA?\n");
9058 - h[ZONE_DMA] = z[ZONE_DMA];
9059 - }
9060 - }
9061 -}
9062 -
9063 #ifndef CONFIG_NUMA
9064 void __init paging_init(void)
9065 {
9066 - unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];
9067 + unsigned long max_zone_pfns[MAX_NR_ZONES];
9068 + memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
9069 + max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
9070 + max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
9071 + max_zone_pfns[ZONE_NORMAL] = end_pfn;
9072
9073 memory_present(0, 0, end_pfn);
9074 sparse_init();
9075 - size_zones(zones, holes, 0, end_pfn);
9076 - free_area_init_node(0, NODE_DATA(0), zones,
9077 - __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
9078 + free_area_init_nodes(max_zone_pfns);
9079
9080 init_mm.context.pinned = 1;
9081 }
9082 @@ -887,36 +836,23 @@
9083
9084 #ifdef CONFIG_MEMORY_HOTPLUG
9085 /*
9086 - * XXX: memory_add_physaddr_to_nid() is to find node id from physical address
9087 - * via probe interface of sysfs. If acpi notifies hot-add event, then it
9088 - * can tell node id by searching dsdt. But, probe interface doesn't have
9089 - * node id. So, return 0 as node id at this time.
9090 - */
9091 -#ifdef CONFIG_NUMA
9092 -int memory_add_physaddr_to_nid(u64 start)
9093 -{
9094 - return 0;
9095 -}
9096 -#endif
9097 -
9098 -/*
9099 * Memory is added always to NORMAL zone. This means you will never get
9100 * additional DMA/DMA32 memory.
9101 */
9102 int arch_add_memory(int nid, u64 start, u64 size)
9103 {
9104 struct pglist_data *pgdat = NODE_DATA(nid);
9105 - struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-2;
9106 + struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
9107 unsigned long start_pfn = start >> PAGE_SHIFT;
9108 unsigned long nr_pages = size >> PAGE_SHIFT;
9109 int ret;
9110
9111 + init_memory_mapping(start, (start + size -1));
9112 +
9113 ret = __add_pages(zone, start_pfn, nr_pages);
9114 if (ret)
9115 goto error;
9116
9117 - init_memory_mapping(start, (start + size -1));
9118 -
9119 return ret;
9120 error:
9121 printk("%s: Problem encountered in __add_pages!\n", __func__);
9122 @@ -930,7 +866,17 @@
9123 }
9124 EXPORT_SYMBOL_GPL(remove_memory);
9125
9126 -#else /* CONFIG_MEMORY_HOTPLUG */
9127 +#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
9128 +int memory_add_physaddr_to_nid(u64 start)
9129 +{
9130 + return 0;
9131 +}
9132 +EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
9133 +#endif
9134 +
9135 +#endif /* CONFIG_MEMORY_HOTPLUG */
9136 +
9137 +#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
9138 /*
9139 * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
9140 * just online the pages.
9141 @@ -956,7 +902,7 @@
9142 }
9143 return err;
9144 }
9145 -#endif /* CONFIG_MEMORY_HOTPLUG */
9146 +#endif
9147
9148 static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
9149 kcore_vsyscall;
9150 @@ -973,12 +919,6 @@
9151
9152 pci_iommu_alloc();
9153
9154 - /* How many end-of-memory variables you have, grandma! */
9155 - max_low_pfn = end_pfn;
9156 - max_pfn = end_pfn;
9157 - num_physpages = end_pfn;
9158 - high_memory = (void *) __va(end_pfn * PAGE_SIZE);
9159 -
9160 /* clear the zero-page */
9161 memset(empty_zero_page, 0, PAGE_SIZE);
9162
9163 @@ -996,7 +936,8 @@
9164 init_page_count(pfn_to_page(pfn));
9165 totalram_pages++;
9166 }
9167 - reservedpages = end_pfn - totalram_pages - e820_hole_size(0, end_pfn);
9168 + reservedpages = end_pfn - totalram_pages -
9169 + absent_pages_in_range(0, end_pfn);
9170
9171 after_bootmem = 1;
9172
9173 @@ -1103,15 +1044,34 @@
9174
9175 void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
9176 {
9177 - /* Should check here against the e820 map to avoid double free */
9178 #ifdef CONFIG_NUMA
9179 int nid = phys_to_nid(phys);
9180 +#endif
9181 + unsigned long pfn = phys >> PAGE_SHIFT;
9182 + if (pfn >= end_pfn) {
9183 + /* This can happen with kdump kernels when accessing firmware
9184 + tables. */
9185 + if (pfn < end_pfn_map)
9186 + return;
9187 + printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
9188 + phys, len);
9189 + return;
9190 + }
9191 +
9192 + /* Should check here against the e820 map to avoid double free */
9193 +#ifdef CONFIG_NUMA
9194 reserve_bootmem_node(NODE_DATA(nid), phys, len);
9195 #else
9196 reserve_bootmem(phys, len);
9197 #endif
9198 - if (phys+len <= MAX_DMA_PFN*PAGE_SIZE)
9199 +#ifndef CONFIG_XEN
9200 + if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
9201 + static unsigned long dma_reserve __initdata;
9202 +
9203 dma_reserve += len / PAGE_SIZE;
9204 + set_dma_reserve(dma_reserve);
9205 + }
9206 +#endif
9207 }
9208
9209 int kern_addr_valid(unsigned long addr)
9210 --- a/arch/x86/mm/ioremap_32-xen.c
9211 +++ b/arch/x86/mm/ioremap_32-xen.c
9212 @@ -12,7 +12,7 @@
9213 #include <linux/init.h>
9214 #include <linux/slab.h>
9215 #include <linux/module.h>
9216 -#include <asm/io.h>
9217 +#include <linux/io.h>
9218 #include <asm/fixmap.h>
9219 #include <asm/cacheflush.h>
9220 #include <asm/tlbflush.h>
9221 @@ -118,7 +118,7 @@
9222 if (domid == DOMID_SELF)
9223 return -EINVAL;
9224
9225 - vma->vm_flags |= VM_IO | VM_RESERVED;
9226 + vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
9227
9228 vma->vm_mm->context.has_foreign_mappings = 1;
9229
9230 @@ -203,6 +203,7 @@
9231 void __iomem * addr;
9232 struct vm_struct * area;
9233 unsigned long offset, last_addr;
9234 + pgprot_t prot;
9235 domid_t domid = DOMID_IO;
9236
9237 /* Don't allow wraparound or zero size */
9238 @@ -234,6 +235,8 @@
9239 domid = DOMID_SELF;
9240 }
9241
9242 + prot = __pgprot(_KERNPG_TABLE | flags);
9243 +
9244 /*
9245 * Mappings have to be page-aligned
9246 */
9247 @@ -249,10 +252,9 @@
9248 return NULL;
9249 area->phys_addr = phys_addr;
9250 addr = (void __iomem *) area->addr;
9251 - flags |= _KERNPG_TABLE;
9252 if (__direct_remap_pfn_range(&init_mm, (unsigned long)addr,
9253 phys_addr>>PAGE_SHIFT,
9254 - size, __pgprot(flags), domid)) {
9255 + size, prot, domid)) {
9256 vunmap((void __force *) addr);
9257 return NULL;
9258 }
9259 --- a/arch/x86/mm/pageattr_64-xen.c
9260 +++ b/arch/x86/mm/pageattr_64-xen.c
9261 @@ -371,8 +371,8 @@
9262 BUG_ON(pud_none(*pud));
9263 pmd = pmd_offset(pud, address);
9264 BUG_ON(__pmd_val(*pmd) & _PAGE_PSE);
9265 - pgprot_val(ref_prot) |= _PAGE_PSE;
9266 large_pte = mk_pte_phys(__pa(address) & LARGE_PAGE_MASK, ref_prot);
9267 + large_pte = pte_mkhuge(large_pte);
9268 set_pte((pte_t *)pmd, large_pte);
9269 }
9270
9271 @@ -382,32 +382,28 @@
9272 {
9273 pte_t *kpte;
9274 struct page *kpte_page;
9275 - unsigned kpte_flags;
9276 pgprot_t ref_prot2;
9277 kpte = lookup_address(address);
9278 if (!kpte) return 0;
9279 kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
9280 - kpte_flags = pte_val(*kpte);
9281 if (pgprot_val(prot) != pgprot_val(ref_prot)) {
9282 - if ((kpte_flags & _PAGE_PSE) == 0) {
9283 + if (!pte_huge(*kpte)) {
9284 set_pte(kpte, pfn_pte(pfn, prot));
9285 } else {
9286 /*
9287 * split_large_page will take the reference for this
9288 * change_page_attr on the split page.
9289 */
9290 -
9291 struct page *split;
9292 - ref_prot2 = __pgprot(pgprot_val(pte_pgprot(*lookup_address(address))) & ~(1<<_PAGE_BIT_PSE));
9293 -
9294 + ref_prot2 = pte_pgprot(pte_clrhuge(*kpte));
9295 split = split_large_page(address, prot, ref_prot2);
9296 if (!split)
9297 return -ENOMEM;
9298 - set_pte(kpte,mk_pte(split, ref_prot2));
9299 + set_pte(kpte, mk_pte(split, ref_prot2));
9300 kpte_page = split;
9301 - }
9302 + }
9303 page_private(kpte_page)++;
9304 - } else if ((kpte_flags & _PAGE_PSE) == 0) {
9305 + } else if (!pte_huge(*kpte)) {
9306 set_pte(kpte, pfn_pte(pfn, ref_prot));
9307 BUG_ON(page_private(kpte_page) == 0);
9308 page_private(kpte_page)--;
9309 @@ -464,10 +460,12 @@
9310 * lowmem */
9311 if (__pa(address) < KERNEL_TEXT_SIZE) {
9312 unsigned long addr2;
9313 - pgprot_t prot2 = prot;
9314 + pgprot_t prot2;
9315 addr2 = __START_KERNEL_map + __pa(address);
9316 - pgprot_val(prot2) &= ~_PAGE_NX;
9317 - err = __change_page_attr(addr2, pfn, prot2, PAGE_KERNEL_EXEC);
9318 + /* Make sure the kernel mappings stay executable */
9319 + prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot)));
9320 + err = __change_page_attr(addr2, pfn, prot2,
9321 + PAGE_KERNEL_EXEC);
9322 }
9323 }
9324 up_write(&init_mm.mmap_sem);
9325 --- a/arch/x86/mm/pgtable_32-xen.c
9326 +++ b/arch/x86/mm/pgtable_32-xen.c
9327 @@ -68,7 +68,9 @@
9328 printk(KERN_INFO "%lu pages writeback\n",
9329 global_page_state(NR_WRITEBACK));
9330 printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
9331 - printk(KERN_INFO "%lu pages slab\n", global_page_state(NR_SLAB));
9332 + printk(KERN_INFO "%lu pages slab\n",
9333 + global_page_state(NR_SLAB_RECLAIMABLE) +
9334 + global_page_state(NR_SLAB_UNRECLAIMABLE));
9335 printk(KERN_INFO "%lu pages pagetables\n",
9336 global_page_state(NR_PAGETABLE));
9337 }
9338 @@ -108,18 +110,11 @@
9339 __flush_tlb_one(vaddr);
9340 }
9341
9342 -static int nr_fixmaps = 0;
9343 +static int fixmaps;
9344 unsigned long hypervisor_virt_start = HYPERVISOR_VIRT_START;
9345 -unsigned long __FIXADDR_TOP = (HYPERVISOR_VIRT_START - 2 * PAGE_SIZE);
9346 +unsigned long __FIXADDR_TOP = (HYPERVISOR_VIRT_START - PAGE_SIZE);
9347 EXPORT_SYMBOL(__FIXADDR_TOP);
9348
9349 -void __init set_fixaddr_top(unsigned long top)
9350 -{
9351 - BUG_ON(nr_fixmaps > 0);
9352 - hypervisor_virt_start = top;
9353 - __FIXADDR_TOP = hypervisor_virt_start - 2 * PAGE_SIZE;
9354 -}
9355 -
9356 void __set_fixmap (enum fixed_addresses idx, maddr_t phys, pgprot_t flags)
9357 {
9358 unsigned long address = __fix_to_virt(idx);
9359 @@ -141,7 +136,21 @@
9360 if (HYPERVISOR_update_va_mapping(address, pte,
9361 UVMF_INVLPG|UVMF_ALL))
9362 BUG();
9363 - nr_fixmaps++;
9364 + fixmaps++;
9365 +}
9366 +
9367 +/**
9368 + * reserve_top_address - reserves a hole in the top of kernel address space
9369 + * @reserve - size of hole to reserve
9370 + *
9371 + * Can be used to relocate the fixmap area and poke a hole in the top
9372 + * of kernel address space to make room for a hypervisor.
9373 + */
9374 +void __init reserve_top_address(unsigned long reserve)
9375 +{
9376 + BUG_ON(fixmaps > 0);
9377 + __FIXADDR_TOP = -reserve - PAGE_SIZE;
9378 + __VMALLOC_RESERVE += reserve;
9379 }
9380
9381 pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
9382 --- a/arch/x86/pci/irq-xen.c
9383 +++ b/arch/x86/pci/irq-xen.c
9384 @@ -991,10 +991,6 @@
9385 pci_name(bridge), 'A' + pin, irq);
9386 }
9387 if (irq >= 0) {
9388 - if (use_pci_vector() &&
9389 - !platform_legacy_irq(irq))
9390 - irq = IO_APIC_VECTOR(irq);
9391 -
9392 printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
9393 pci_name(dev), 'A' + pin, irq);
9394 dev->irq = irq;
9395 @@ -1155,10 +1151,6 @@
9396 }
9397 dev = temp_dev;
9398 if (irq >= 0) {
9399 -#ifdef CONFIG_PCI_MSI
9400 - if (!platform_legacy_irq(irq))
9401 - irq = IO_APIC_VECTOR(irq);
9402 -#endif
9403 printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
9404 pci_name(dev), 'A' + pin, irq);
9405 dev->irq = irq;
9406 @@ -1179,33 +1171,3 @@
9407 }
9408 return 0;
9409 }
9410 -
9411 -int pci_vector_resources(int last, int nr_released)
9412 -{
9413 - int count = nr_released;
9414 -
9415 - int next = last;
9416 - int offset = (last % 8);
9417 -
9418 - while (next < FIRST_SYSTEM_VECTOR) {
9419 - next += 8;
9420 -#ifdef CONFIG_X86_64
9421 - if (next == IA32_SYSCALL_VECTOR)
9422 - continue;
9423 -#else
9424 - if (next == SYSCALL_VECTOR)
9425 - continue;
9426 -#endif
9427 - count++;
9428 - if (next >= FIRST_SYSTEM_VECTOR) {
9429 - if (offset%8) {
9430 - next = FIRST_DEVICE_VECTOR + offset;
9431 - offset++;
9432 - continue;
9433 - }
9434 - count--;
9435 - }
9436 - }
9437 -
9438 - return count;
9439 -}
9440 --- a/drivers/char/tpm/tpm_xen.c
9441 +++ b/drivers/char/tpm/tpm_xen.c
9442 @@ -85,8 +85,7 @@
9443
9444 /* local function prototypes */
9445 static irqreturn_t tpmif_int(int irq,
9446 - void *tpm_priv,
9447 - struct pt_regs *ptregs);
9448 + void *tpm_priv);
9449 static void tpmif_rx_action(unsigned long unused);
9450 static int tpmif_connect(struct xenbus_device *dev,
9451 struct tpm_private *tp,
9452 @@ -559,7 +558,7 @@
9453 }
9454
9455
9456 -static irqreturn_t tpmif_int(int irq, void *tpm_priv, struct pt_regs *ptregs)
9457 +static irqreturn_t tpmif_int(int irq, void *tpm_priv)
9458 {
9459 struct tpm_private *tp = tpm_priv;
9460 unsigned long flags;
9461 --- a/drivers/pci/Kconfig
9462 +++ b/drivers/pci/Kconfig
9463 @@ -45,7 +45,7 @@
9464 config HT_IRQ
9465 bool "Interrupts on hypertransport devices"
9466 default y
9467 - depends on PCI && X86_LOCAL_APIC && X86_IO_APIC
9468 + depends on PCI && X86_LOCAL_APIC && X86_IO_APIC && !XEN
9469 help
9470 This allows native hypertransport devices to use interrupts.
9471
9472 --- a/drivers/xen/Kconfig
9473 +++ b/drivers/xen/Kconfig
9474 @@ -278,6 +278,9 @@
9475 config HAVE_IRQ_IGNORE_UNHANDLED
9476 def_bool y
9477
9478 +config GENERIC_HARDIRQS_NO__DO_IRQ
9479 + def_bool y
9480 +
9481 config NO_IDLE_HZ
9482 def_bool y
9483
9484 --- a/drivers/xen/balloon/balloon.c
9485 +++ b/drivers/xen/balloon/balloon.c
9486 @@ -84,7 +84,7 @@
9487 /* VM /proc information for memory */
9488 extern unsigned long totalram_pages;
9489
9490 -#ifndef MODULE
9491 +#if !defined(MODULE) && defined(CONFIG_HIGHMEM)
9492 extern unsigned long totalhigh_pages;
9493 #define inc_totalhigh_pages() (totalhigh_pages++)
9494 #define dec_totalhigh_pages() (totalhigh_pages--)
9495 --- a/drivers/xen/blkback/blkback.c
9496 +++ b/drivers/xen/blkback/blkback.c
9497 @@ -288,7 +288,7 @@
9498 wake_up(&blkif->wq);
9499 }
9500
9501 -irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
9502 +irqreturn_t blkif_be_int(int irq, void *dev_id)
9503 {
9504 blkif_notify_work(dev_id);
9505 return IRQ_HANDLED;
9506 --- a/drivers/xen/blkback/common.h
9507 +++ b/drivers/xen/blkback/common.h
9508 @@ -130,7 +130,7 @@
9509
9510 void blkif_xenbus_init(void);
9511
9512 -irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs);
9513 +irqreturn_t blkif_be_int(int irq, void *dev_id);
9514 int blkif_schedule(void *arg);
9515
9516 int blkback_barrier(struct xenbus_transaction xbt,
9517 --- a/drivers/xen/blkfront/blkfront.c
9518 +++ b/drivers/xen/blkfront/blkfront.c
9519 @@ -69,7 +69,7 @@
9520
9521 static void kick_pending_request_queues(struct blkfront_info *);
9522
9523 -static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs);
9524 +static irqreturn_t blkif_int(int irq, void *dev_id);
9525 static void blkif_restart_queue(void *arg);
9526 static void blkif_recover(struct blkfront_info *);
9527 static void blkif_completion(struct blk_shadow *);
9528 @@ -698,7 +698,7 @@
9529 }
9530
9531
9532 -static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs)
9533 +static irqreturn_t blkif_int(int irq, void *dev_id)
9534 {
9535 struct request *req;
9536 blkif_response_t *bret;
9537 --- a/drivers/xen/blktap/blktap.c
9538 +++ b/drivers/xen/blktap/blktap.c
9539 @@ -1175,7 +1175,7 @@
9540 wake_up(&blkif->wq);
9541 }
9542
9543 -irqreturn_t tap_blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
9544 +irqreturn_t tap_blkif_be_int(int irq, void *dev_id)
9545 {
9546 blkif_notify_work(dev_id);
9547 return IRQ_HANDLED;
9548 --- a/drivers/xen/blktap/common.h
9549 +++ b/drivers/xen/blktap/common.h
9550 @@ -112,7 +112,7 @@
9551
9552 void tap_blkif_xenbus_init(void);
9553
9554 -irqreturn_t tap_blkif_be_int(int irq, void *dev_id, struct pt_regs *regs);
9555 +irqreturn_t tap_blkif_be_int(int irq, void *dev_id);
9556 int tap_blkif_schedule(void *arg);
9557
9558 int dom_to_devid(domid_t domid, int xenbus_id, blkif_t *blkif);
9559 --- a/drivers/xen/console/console.c
9560 +++ b/drivers/xen/console/console.c
9561 @@ -345,7 +345,7 @@
9562 static int xencons_priv_irq;
9563 static char x_char;
9564
9565 -void xencons_rx(char *buf, unsigned len, struct pt_regs *regs)
9566 +void xencons_rx(char *buf, unsigned len)
9567 {
9568 int i;
9569 unsigned long flags;
9570 @@ -370,8 +370,7 @@
9571 if (time_before(jiffies, sysrq_timeout)) {
9572 spin_unlock_irqrestore(
9573 &xencons_lock, flags);
9574 - handle_sysrq(
9575 - buf[i], regs, xencons_tty);
9576 + handle_sysrq(buf[i], xencons_tty);
9577 spin_lock_irqsave(
9578 &xencons_lock, flags);
9579 continue;
9580 @@ -436,14 +435,13 @@
9581 }
9582
9583 /* Privileged receive callback and transmit kicker. */
9584 -static irqreturn_t xencons_priv_interrupt(int irq, void *dev_id,
9585 - struct pt_regs *regs)
9586 +static irqreturn_t xencons_priv_interrupt(int irq, void *dev_id)
9587 {
9588 static char rbuf[16];
9589 int l;
9590
9591 while ((l = HYPERVISOR_console_io(CONSOLEIO_read, 16, rbuf)) > 0)
9592 - xencons_rx(rbuf, l, regs);
9593 + xencons_rx(rbuf, l);
9594
9595 xencons_tx();
9596
9597 --- a/drivers/xen/console/xencons_ring.c
9598 +++ b/drivers/xen/console/xencons_ring.c
9599 @@ -83,7 +83,7 @@
9600 return sent;
9601 }
9602
9603 -static irqreturn_t handle_input(int irq, void *unused, struct pt_regs *regs)
9604 +static irqreturn_t handle_input(int irq, void *unused)
9605 {
9606 struct xencons_interface *intf = xencons_interface();
9607 XENCONS_RING_IDX cons, prod;
9608 @@ -94,7 +94,7 @@
9609 BUG_ON((prod - cons) > sizeof(intf->in));
9610
9611 while (cons != prod) {
9612 - xencons_rx(intf->in+MASK_XENCONS_IDX(cons,intf->in), 1, regs);
9613 + xencons_rx(intf->in+MASK_XENCONS_IDX(cons,intf->in), 1);
9614 cons++;
9615 }
9616
9617 --- a/drivers/xen/core/evtchn.c
9618 +++ b/drivers/xen/core/evtchn.c
9619 @@ -507,7 +507,7 @@
9620
9621 int bind_caller_port_to_irqhandler(
9622 unsigned int caller_port,
9623 - irqreturn_t (*handler)(int, void *, struct pt_regs *),
9624 + irq_handler_t handler,
9625 unsigned long irqflags,
9626 const char *devname,
9627 void *dev_id)
9628 @@ -530,7 +530,7 @@
9629
9630 int bind_listening_port_to_irqhandler(
9631 unsigned int remote_domain,
9632 - irqreturn_t (*handler)(int, void *, struct pt_regs *),
9633 + irq_handler_t handler,
9634 unsigned long irqflags,
9635 const char *devname,
9636 void *dev_id)
9637 @@ -554,7 +554,7 @@
9638 int bind_interdomain_evtchn_to_irqhandler(
9639 unsigned int remote_domain,
9640 unsigned int remote_port,
9641 - irqreturn_t (*handler)(int, void *, struct pt_regs *),
9642 + irq_handler_t handler,
9643 unsigned long irqflags,
9644 const char *devname,
9645 void *dev_id)
9646 @@ -578,7 +578,7 @@
9647 int bind_virq_to_irqhandler(
9648 unsigned int virq,
9649 unsigned int cpu,
9650 - irqreturn_t (*handler)(int, void *, struct pt_regs *),
9651 + irq_handler_t handler,
9652 unsigned long irqflags,
9653 const char *devname,
9654 void *dev_id)
9655 @@ -602,7 +602,7 @@
9656 int bind_ipi_to_irqhandler(
9657 unsigned int ipi,
9658 unsigned int cpu,
9659 - irqreturn_t (*handler)(int, void *, struct pt_regs *),
9660 + irq_handler_t handler,
9661 unsigned long irqflags,
9662 const char *devname,
9663 void *dev_id)
9664 @@ -687,15 +687,7 @@
9665 return 0;
9666 }
9667
9668 -static void shutdown_dynirq(unsigned int irq)
9669 -{
9670 - int evtchn = evtchn_from_irq(irq);
9671 -
9672 - if (VALID_EVTCHN(evtchn))
9673 - mask_evtchn(evtchn);
9674 -}
9675 -
9676 -static void enable_dynirq(unsigned int irq)
9677 +static void unmask_dynirq(unsigned int irq)
9678 {
9679 int evtchn = evtchn_from_irq(irq);
9680
9681 @@ -703,7 +695,7 @@
9682 unmask_evtchn(evtchn);
9683 }
9684
9685 -static void disable_dynirq(unsigned int irq)
9686 +static void mask_dynirq(unsigned int irq)
9687 {
9688 int evtchn = evtchn_from_irq(irq);
9689
9690 @@ -731,12 +723,12 @@
9691 unmask_evtchn(evtchn);
9692 }
9693
9694 -static struct hw_interrupt_type dynirq_type = {
9695 - .typename = "Dynamic-irq",
9696 +static struct irq_chip dynirq_chip = {
9697 + .name = "Dynamic-irq",
9698 .startup = startup_dynirq,
9699 - .shutdown = shutdown_dynirq,
9700 - .enable = enable_dynirq,
9701 - .disable = disable_dynirq,
9702 + .mask = mask_dynirq,
9703 + .unmask = unmask_dynirq,
9704 + .mask_ack = ack_dynirq,
9705 .ack = ack_dynirq,
9706 .end = end_dynirq,
9707 #ifdef CONFIG_SMP
9708 @@ -820,12 +812,12 @@
9709 irq_info[irq] = IRQ_UNBOUND;
9710 }
9711
9712 -static void enable_pirq(unsigned int irq)
9713 +static void unmask_pirq(unsigned int irq)
9714 {
9715 startup_pirq(irq);
9716 }
9717
9718 -static void disable_pirq(unsigned int irq)
9719 +static void mask_pirq(unsigned int irq)
9720 {
9721 }
9722
9723 @@ -854,12 +846,14 @@
9724 }
9725 }
9726
9727 -static struct hw_interrupt_type pirq_type = {
9728 +static struct irq_chip pirq_chip = {
9729 + .name = "Phys-irq",
9730 .typename = "Phys-irq",
9731 .startup = startup_pirq,
9732 .shutdown = shutdown_pirq,
9733 - .enable = enable_pirq,
9734 - .disable = disable_pirq,
9735 + .mask = mask_pirq,
9736 + .unmask = unmask_pirq,
9737 + .mask_ack = ack_pirq,
9738 .ack = ack_pirq,
9739 .end = end_pirq,
9740 #ifdef CONFIG_SMP
9741 @@ -1043,7 +1037,8 @@
9742 irq_desc[dynirq_to_irq(i)].status = IRQ_DISABLED;
9743 irq_desc[dynirq_to_irq(i)].action = NULL;
9744 irq_desc[dynirq_to_irq(i)].depth = 1;
9745 - irq_desc[dynirq_to_irq(i)].chip = &dynirq_type;
9746 + set_irq_chip_and_handler_name(dynirq_to_irq(i), &dynirq_chip,
9747 + handle_level_irq, "level");
9748 }
9749
9750 /* Phys IRQ space is statically bound (1:1 mapping). Nail refcnts. */
9751 @@ -1059,6 +1054,7 @@
9752 irq_desc[pirq_to_irq(i)].status = IRQ_DISABLED;
9753 irq_desc[pirq_to_irq(i)].action = NULL;
9754 irq_desc[pirq_to_irq(i)].depth = 1;
9755 - irq_desc[pirq_to_irq(i)].chip = &pirq_type;
9756 + set_irq_chip_and_handler_name(pirq_to_irq(i), &pirq_chip,
9757 + handle_level_irq, "level");
9758 }
9759 }
9760 --- a/drivers/xen/core/reboot.c
9761 +++ b/drivers/xen/core/reboot.c
9762 @@ -13,6 +13,7 @@
9763
9764 #ifdef HAVE_XEN_PLATFORM_COMPAT_H
9765 #include <xen/platform-compat.h>
9766 +#undef handle_sysrq
9767 #endif
9768
9769 MODULE_LICENSE("Dual BSD/GPL");
9770 @@ -203,7 +204,7 @@
9771
9772 #ifdef CONFIG_MAGIC_SYSRQ
9773 if (sysrq_key != '\0')
9774 - handle_sysrq(sysrq_key, NULL, NULL);
9775 + handle_sysrq(sysrq_key, NULL);
9776 #endif
9777 }
9778
9779 --- a/drivers/xen/core/smpboot.c
9780 +++ b/drivers/xen/core/smpboot.c
9781 @@ -25,8 +25,8 @@
9782 #include <xen/cpu_hotplug.h>
9783 #include <xen/xenbus.h>
9784
9785 -extern irqreturn_t smp_reschedule_interrupt(int, void *, struct pt_regs *);
9786 -extern irqreturn_t smp_call_function_interrupt(int, void *, struct pt_regs *);
9787 +extern irqreturn_t smp_reschedule_interrupt(int, void *);
9788 +extern irqreturn_t smp_call_function_interrupt(int, void *);
9789
9790 extern int local_setup_timer(unsigned int cpu);
9791 extern void local_teardown_timer(unsigned int cpu);
9792 @@ -66,8 +66,6 @@
9793 #if defined(__i386__)
9794 u8 x86_cpu_to_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = 0xff };
9795 EXPORT_SYMBOL(x86_cpu_to_apicid);
9796 -#elif !defined(CONFIG_X86_IO_APIC)
9797 -unsigned int maxcpus = NR_CPUS;
9798 #endif
9799
9800 void __init prefill_possible_map(void)
9801 --- a/drivers/xen/fbfront/xenfb.c
9802 +++ b/drivers/xen/fbfront/xenfb.c
9803 @@ -523,8 +523,7 @@
9804 .fb_set_par = xenfb_set_par,
9805 };
9806
9807 -static irqreturn_t xenfb_event_handler(int rq, void *dev_id,
9808 - struct pt_regs *regs)
9809 +static irqreturn_t xenfb_event_handler(int rq, void *dev_id)
9810 {
9811 /*
9812 * No in events recognized, simply ignore them all.
9813 --- a/drivers/xen/fbfront/xenkbd.c
9814 +++ b/drivers/xen/fbfront/xenkbd.c
9815 @@ -46,7 +46,7 @@
9816 * to do that.
9817 */
9818
9819 -static irqreturn_t input_handler(int rq, void *dev_id, struct pt_regs *regs)
9820 +static irqreturn_t input_handler(int rq, void *dev_id)
9821 {
9822 struct xenkbd_info *info = dev_id;
9823 struct xenkbd_page *page = info->page;
9824 --- a/drivers/xen/gntdev/gntdev.c
9825 +++ b/drivers/xen/gntdev/gntdev.c
9826 @@ -755,9 +755,6 @@
9827 BUG();
9828 }
9829
9830 - /* Copy the existing value of the PTE for returning. */
9831 - copy = *ptep;
9832 -
9833 /* Calculate the grant relating to this PTE. */
9834 slot_index = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
9835
9836 @@ -772,6 +769,10 @@
9837 GNTDEV_INVALID_HANDLE &&
9838 !xen_feature(XENFEAT_auto_translated_physmap)) {
9839 /* NOT USING SHADOW PAGE TABLES. */
9840 +
9841 + /* Copy the existing value of the PTE for returning. */
9842 + copy = *ptep;
9843 +
9844 gnttab_set_unmap_op(&op, virt_to_machine(ptep),
9845 GNTMAP_contains_pte,
9846 private_data->grants[slot_index]
9847 @@ -784,7 +785,7 @@
9848 op.status);
9849 } else {
9850 /* USING SHADOW PAGE TABLES. */
9851 - pte_clear_full(vma->vm_mm, addr, ptep, is_fullmm);
9852 + copy = ptep_get_and_clear_full(vma->vm_mm, addr, ptep, is_fullmm);
9853 }
9854
9855 /* Finally, we unmap the grant from kernel space. */
9856 @@ -812,7 +813,7 @@
9857 >> PAGE_SHIFT, INVALID_P2M_ENTRY);
9858
9859 } else {
9860 - pte_clear_full(vma->vm_mm, addr, ptep, is_fullmm);
9861 + copy = ptep_get_and_clear_full(vma->vm_mm, addr, ptep, is_fullmm);
9862 }
9863
9864 return copy;
9865 --- a/drivers/xen/netback/accel.c
9866 +++ b/drivers/xen/netback/accel.c
9867 @@ -65,7 +65,7 @@
9868
9869 if (IS_ERR(eth_name)) {
9870 /* Probably means not present */
9871 - DPRINTK("%s: no match due to xenbus_read accel error %d\n",
9872 + DPRINTK("%s: no match due to xenbus_read accel error %ld\n",
9873 __FUNCTION__, PTR_ERR(eth_name));
9874 return 0;
9875 } else {
9876 --- a/drivers/xen/netback/common.h
9877 +++ b/drivers/xen/netback/common.h
9878 @@ -200,7 +200,7 @@
9879
9880 int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev);
9881 struct net_device_stats *netif_be_get_stats(struct net_device *dev);
9882 -irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs);
9883 +irqreturn_t netif_be_int(int irq, void *dev_id);
9884
9885 static inline int netbk_can_queue(struct net_device *dev)
9886 {
9887 --- a/drivers/xen/netback/loopback.c
9888 +++ b/drivers/xen/netback/loopback.c
9889 @@ -151,7 +151,7 @@
9890 np->stats.rx_bytes += skb->len;
9891 np->stats.rx_packets++;
9892
9893 - if (skb->ip_summed == CHECKSUM_HW) {
9894 + if (skb->ip_summed == CHECKSUM_PARTIAL) {
9895 /* Defer checksum calculation. */
9896 skb->proto_csum_blank = 1;
9897 /* Must be a local packet: assert its integrity. */
9898 --- a/drivers/xen/netback/netback.c
9899 +++ b/drivers/xen/netback/netback.c
9900 @@ -677,7 +677,7 @@
9901 id = meta[npo.meta_cons].id;
9902 flags = nr_frags ? NETRXF_more_data : 0;
9903
9904 - if (skb->ip_summed == CHECKSUM_HW) /* local packet? */
9905 + if (skb->ip_summed == CHECKSUM_PARTIAL) /* local packet? */
9906 flags |= NETRXF_csum_blank | NETRXF_data_validated;
9907 else if (skb->proto_data_valid) /* remote but checksummed? */
9908 flags |= NETRXF_data_validated;
9909 @@ -1441,7 +1441,7 @@
9910 netif_idx_release(netif_page_index(page));
9911 }
9912
9913 -irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs)
9914 +irqreturn_t netif_be_int(int irq, void *dev_id)
9915 {
9916 netif_t *netif = dev_id;
9917
9918 @@ -1508,7 +1508,7 @@
9919 }
9920
9921 #ifdef NETBE_DEBUG_INTERRUPT
9922 -static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs)
9923 +static irqreturn_t netif_be_dbg(int irq, void *dev_id)
9924 {
9925 struct list_head *ent;
9926 netif_t *netif;
9927 --- a/drivers/xen/netfront/netfront.c
9928 +++ b/drivers/xen/netfront/netfront.c
9929 @@ -136,7 +136,7 @@
9930 {
9931 return skb_is_gso(skb) &&
9932 (!skb_gso_ok(skb, dev->features) ||
9933 - unlikely(skb->ip_summed != CHECKSUM_HW));
9934 + unlikely(skb->ip_summed != CHECKSUM_PARTIAL));
9935 }
9936 #else
9937 #define HAVE_GSO 0
9938 @@ -222,7 +222,7 @@
9939 static void network_alloc_rx_buffers(struct net_device *);
9940 static void send_fake_arp(struct net_device *);
9941
9942 -static irqreturn_t netif_int(int irq, void *dev_id, struct pt_regs *ptregs);
9943 +static irqreturn_t netif_int(int irq, void *dev_id);
9944
9945 #ifdef CONFIG_SYSFS
9946 static int xennet_sysfs_addif(struct net_device *netdev);
9947 @@ -992,7 +992,7 @@
9948 tx->flags = 0;
9949 extra = NULL;
9950
9951 - if (skb->ip_summed == CHECKSUM_HW) /* local packet? */
9952 + if (skb->ip_summed == CHECKSUM_PARTIAL) /* local packet? */
9953 tx->flags |= NETTXF_csum_blank | NETTXF_data_validated;
9954 #ifdef CONFIG_XEN
9955 if (skb->proto_data_valid) /* remote but checksummed? */
9956 @@ -1049,7 +1049,7 @@
9957 return 0;
9958 }
9959
9960 -static irqreturn_t netif_int(int irq, void *dev_id, struct pt_regs *ptregs)
9961 +static irqreturn_t netif_int(int irq, void *dev_id)
9962 {
9963 struct net_device *dev = dev_id;
9964 struct netfront_info *np = netdev_priv(dev);
9965 --- a/drivers/xen/pciback/pciback.h
9966 +++ b/drivers/xen/pciback/pciback.h
9967 @@ -87,7 +87,7 @@
9968 void pciback_release_devices(struct pciback_device *pdev);
9969
9970 /* Handles events from front-end */
9971 -irqreturn_t pciback_handle_event(int irq, void *dev_id, struct pt_regs *regs);
9972 +irqreturn_t pciback_handle_event(int irq, void *dev_id);
9973 void pciback_do_op(void *data);
9974
9975 int pciback_xenbus_register(void);
9976 --- a/drivers/xen/pciback/pciback_ops.c
9977 +++ b/drivers/xen/pciback/pciback_ops.c
9978 @@ -85,7 +85,7 @@
9979 test_and_schedule_op(pdev);
9980 }
9981
9982 -irqreturn_t pciback_handle_event(int irq, void *dev_id, struct pt_regs *regs)
9983 +irqreturn_t pciback_handle_event(int irq, void *dev_id)
9984 {
9985 struct pciback_device *pdev = dev_id;
9986
9987 --- a/drivers/xen/pcifront/pci_op.c
9988 +++ b/drivers/xen/pcifront/pci_op.c
9989 @@ -392,10 +392,16 @@
9990
9991 d = pci_scan_single_device(b, devfn);
9992 if (d) {
9993 + int err;
9994 +
9995 dev_info(&pdev->xdev->dev, "New device on "
9996 "%04x:%02x:%02x.%02x found.\n", domain, bus,
9997 PCI_SLOT(devfn), PCI_FUNC(devfn));
9998 - pci_bus_add_device(d);
9999 + err = pci_bus_add_device(d);
10000 + if (err)
10001 + dev_err(&pdev->xdev->dev,
10002 + "error %d adding device, continuing.\n",
10003 + err);
10004 }
10005 }
10006
10007 --- a/drivers/xen/privcmd/compat_privcmd.c
10008 +++ b/drivers/xen/privcmd/compat_privcmd.c
10009 @@ -18,7 +18,6 @@
10010 * Authors: Jimi Xenidis <jimix@watson.ibm.com>
10011 */
10012
10013 -#include <linux/config.h>
10014 #include <linux/compat.h>
10015 #include <linux/ioctl.h>
10016 #include <linux/syscalls.h>
10017 --- a/drivers/xen/privcmd/privcmd.c
10018 +++ b/drivers/xen/privcmd/privcmd.c
10019 @@ -236,7 +236,7 @@
10020 #endif
10021
10022 /* DONTCOPY is essential for Xen as copy_page_range is broken. */
10023 - vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY;
10024 + vma->vm_flags |= VM_RESERVED | VM_IO | VM_PFNMAP | VM_DONTCOPY;
10025 vma->vm_ops = &privcmd_vm_ops;
10026 vma->vm_private_data = NULL;
10027
10028 --- a/drivers/xen/sfc_netback/accel_xenbus.c
10029 +++ b/drivers/xen/sfc_netback/accel_xenbus.c
10030 @@ -68,8 +68,7 @@
10031
10032
10033 /* Demultiplex a message IRQ from the frontend driver. */
10034 -static irqreturn_t msgirq_from_frontend(int irq, void *context,
10035 - struct pt_regs *unused)
10036 +static irqreturn_t msgirq_from_frontend(int irq, void *context)
10037 {
10038 struct xenbus_device *dev = context;
10039 struct netback_accel *bend = NETBACK_ACCEL_FROM_XENBUS_DEVICE(dev);
10040 @@ -84,8 +83,7 @@
10041 * functionally, but we need it to pass to the bind function, and may
10042 * get called spuriously
10043 */
10044 -static irqreturn_t netirq_from_frontend(int irq, void *context,
10045 - struct pt_regs *unused)
10046 +static irqreturn_t netirq_from_frontend(int irq, void *context)
10047 {
10048 VPRINTK("netirq %d from device %s\n", irq,
10049 ((struct xenbus_device *)context)->nodename);
10050 --- a/drivers/xen/sfc_netfront/accel.h
10051 +++ b/drivers/xen/sfc_netfront/accel.h
10052 @@ -449,10 +449,8 @@
10053 u32 ip, u16 port, u8 protocol);
10054
10055 /* Process an IRQ received from back end driver */
10056 -irqreturn_t netfront_accel_msg_channel_irq_from_bend(int irq, void *context,
10057 - struct pt_regs *unused);
10058 -irqreturn_t netfront_accel_net_channel_irq_from_bend(int irq, void *context,
10059 - struct pt_regs *unused);
10060 +irqreturn_t netfront_accel_msg_channel_irq_from_bend(int irq, void *context);
10061 +irqreturn_t netfront_accel_net_channel_irq_from_bend(int irq, void *context);
10062
10063 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
10064 extern void netfront_accel_msg_from_bend(struct work_struct *context);
10065 --- a/drivers/xen/sfc_netfront/accel_msg.c
10066 +++ b/drivers/xen/sfc_netfront/accel_msg.c
10067 @@ -490,8 +490,7 @@
10068 }
10069
10070
10071 -irqreturn_t netfront_accel_msg_channel_irq_from_bend(int irq, void *context,
10072 - struct pt_regs *unused)
10073 +irqreturn_t netfront_accel_msg_channel_irq_from_bend(int irq, void *context)
10074 {
10075 netfront_accel_vnic *vnic = (netfront_accel_vnic *)context;
10076 VPRINTK("irq %d from device %s\n", irq, vnic->dev->nodename);
10077 @@ -502,8 +501,7 @@
10078 }
10079
10080 /* Process an interrupt received from the NIC via backend */
10081 -irqreturn_t netfront_accel_net_channel_irq_from_bend(int irq, void *context,
10082 - struct pt_regs *unused)
10083 +irqreturn_t netfront_accel_net_channel_irq_from_bend(int irq, void *context)
10084 {
10085 netfront_accel_vnic *vnic = (netfront_accel_vnic *)context;
10086 struct net_device *net_dev = vnic->net_dev;
10087 --- a/drivers/xen/sfc_netfront/accel_tso.c
10088 +++ b/drivers/xen/sfc_netfront/accel_tso.c
10089 @@ -363,7 +363,7 @@
10090
10091 tso_check_safe(skb);
10092
10093 - if (skb->ip_summed != CHECKSUM_HW)
10094 + if (skb->ip_summed != CHECKSUM_PARTIAL)
10095 EPRINTK("Trying to TSO send a packet without HW checksum\n");
10096
10097 tso_start(&state, skb);
10098 --- a/drivers/xen/sfc_netfront/accel_vi.c
10099 +++ b/drivers/xen/sfc_netfront/accel_vi.c
10100 @@ -461,7 +461,7 @@
10101
10102 frag_i = -1;
10103
10104 - if (skb->ip_summed == CHECKSUM_HW) {
10105 + if (skb->ip_summed == CHECKSUM_PARTIAL) {
10106 /* Set to zero to encourage falcon to work it out for us */
10107 *(u16*)(skb->h.raw + skb->csum) = 0;
10108 }
10109 @@ -580,7 +580,7 @@
10110
10111 kva = buf->pkt_kva;
10112
10113 - if (skb->ip_summed == CHECKSUM_HW) {
10114 + if (skb->ip_summed == CHECKSUM_PARTIAL) {
10115 /* Set to zero to encourage falcon to work it out for us */
10116 *(u16*)(skb->h.raw + skb->csum) = 0;
10117 }
10118 --- a/drivers/xen/tpmback/common.h
10119 +++ b/drivers/xen/tpmback/common.h
10120 @@ -61,7 +61,7 @@
10121 void tpmif_xenbus_init(void);
10122 void tpmif_xenbus_exit(void);
10123 int tpmif_map(tpmif_t *tpmif, unsigned long shared_page, unsigned int evtchn);
10124 -irqreturn_t tpmif_be_int(int irq, void *dev_id, struct pt_regs *regs);
10125 +irqreturn_t tpmif_be_int(int irq, void *dev_id);
10126
10127 long int tpmback_get_instance(struct backend_info *bi);
10128
10129 --- a/drivers/xen/tpmback/tpmback.c
10130 +++ b/drivers/xen/tpmback/tpmback.c
10131 @@ -502,7 +502,7 @@
10132 list_del(&pak->next);
10133 write_unlock_irqrestore(&dataex.pak_lock, flags);
10134
10135 - DPRINTK("size given by app: %d, available: %d\n", size, left);
10136 + DPRINTK("size given by app: %zu, available: %u\n", size, left);
10137
10138 ret_size = min_t(size_t, size, left);
10139
10140 @@ -899,7 +899,7 @@
10141 }
10142 }
10143
10144 -irqreturn_t tpmif_be_int(int irq, void *dev_id, struct pt_regs *regs)
10145 +irqreturn_t tpmif_be_int(int irq, void *dev_id)
10146 {
10147 tpmif_t *tpmif = (tpmif_t *) dev_id;
10148
10149 --- a/drivers/xen/xenbus/xenbus_comms.c
10150 +++ b/drivers/xen/xenbus/xenbus_comms.c
10151 @@ -55,7 +55,7 @@
10152
10153 static DECLARE_WAIT_QUEUE_HEAD(xb_waitq);
10154
10155 -static irqreturn_t wake_waiting(int irq, void *unused, struct pt_regs *regs)
10156 +static irqreturn_t wake_waiting(int irq, void *unused)
10157 {
10158 if (unlikely(xenstored_ready == 0)) {
10159 xenstored_ready = 1;
10160 --- a/drivers/xen/xenoprof/xenoprofile.c
10161 +++ b/drivers/xen/xenoprof/xenoprofile.c
10162 @@ -195,7 +195,7 @@
10163 }
10164
10165 static irqreturn_t
10166 -xenoprof_ovf_interrupt(int irq, void * dev_id, struct pt_regs * regs)
10167 +xenoprof_ovf_interrupt(int irq, void * dev_id)
10168 {
10169 struct xenoprof_buf * buf;
10170 static unsigned long flag;
10171 --- a/include/asm-generic/pgtable.h
10172 +++ b/include/asm-generic/pgtable.h
10173 @@ -100,7 +100,7 @@
10174 #endif
10175
10176 #ifndef arch_change_pte_range
10177 -#define arch_change_pte_range(mm, pmd, addr, end, newprot) 0
10178 +#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) 0
10179 #endif
10180
10181 #ifndef __HAVE_ARCH_PTE_SAME
10182 --- a/include/asm-x86/mach-xen/asm/desc_32.h
10183 +++ b/include/asm-x86/mach-xen/asm/desc_32.h
10184 @@ -32,52 +32,110 @@
10185 return (struct desc_struct *)per_cpu(cpu_gdt_descr, cpu).address;
10186 }
10187
10188 +/*
10189 + * This is the ldt that every process will get unless we need
10190 + * something other than this.
10191 + */
10192 +extern struct desc_struct default_ldt[];
10193 +extern struct desc_struct idt_table[];
10194 +extern void set_intr_gate(unsigned int irq, void * addr);
10195 +
10196 +static inline void pack_descriptor(__u32 *a, __u32 *b,
10197 + unsigned long base, unsigned long limit, unsigned char type, unsigned char flags)
10198 +{
10199 + *a = ((base & 0xffff) << 16) | (limit & 0xffff);
10200 + *b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
10201 + (limit & 0x000f0000) | ((type & 0xff) << 8) | ((flags & 0xf) << 20);
10202 +}
10203 +
10204 +static inline void pack_gate(__u32 *a, __u32 *b,
10205 + unsigned long base, unsigned short seg, unsigned char type, unsigned char flags)
10206 +{
10207 + *a = (seg << 16) | (base & 0xffff);
10208 + *b = (base & 0xffff0000) | ((type & 0xff) << 8) | (flags & 0xff);
10209 +}
10210 +
10211 +#define DESCTYPE_LDT 0x82 /* present, system, DPL-0, LDT */
10212 +#define DESCTYPE_TSS 0x89 /* present, system, DPL-0, 32-bit TSS */
10213 +#define DESCTYPE_TASK 0x85 /* present, system, DPL-0, task gate */
10214 +#define DESCTYPE_INT 0x8e /* present, system, DPL-0, interrupt gate */
10215 +#define DESCTYPE_TRAP 0x8f /* present, system, DPL-0, trap gate */
10216 +#define DESCTYPE_DPL3 0x60 /* DPL-3 */
10217 +#define DESCTYPE_S 0x10 /* !system */
10218 +
10219 #define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8))
10220 #define load_LDT_desc() __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8))
10221
10222 #define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
10223 #define load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr))
10224 -#define load_tr(tr) __asm__ __volatile("ltr %0"::"mr" (tr))
10225 -#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"mr" (ldt))
10226 +#define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr))
10227 +#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt))
10228
10229 #define store_gdt(dtr) __asm__ ("sgdt %0":"=m" (*dtr))
10230 #define store_idt(dtr) __asm__ ("sidt %0":"=m" (*dtr))
10231 -#define store_tr(tr) __asm__ ("str %0":"=mr" (tr))
10232 -#define store_ldt(ldt) __asm__ ("sldt %0":"=mr" (ldt))
10233 +#define store_tr(tr) __asm__ ("str %0":"=m" (tr))
10234 +#define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt))
10235
10236 -/*
10237 - * This is the ldt that every process will get unless we need
10238 - * something other than this.
10239 - */
10240 -extern struct desc_struct default_ldt[];
10241 -extern void set_intr_gate(unsigned int irq, void * addr);
10242 +#if TLS_SIZE != 24
10243 +# error update this code.
10244 +#endif
10245 +
10246 +static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
10247 +{
10248 +#define C(i) if (HYPERVISOR_update_descriptor(virt_to_machine(&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]), \
10249 + *(u64 *)&t->tls_array[i])) \
10250 + BUG();
10251 + C(0); C(1); C(2);
10252 +#undef C
10253 +}
10254
10255 -#define _set_tssldt_desc(n,addr,limit,type) \
10256 -__asm__ __volatile__ ("movw %w3,0(%2)\n\t" \
10257 - "movw %w1,2(%2)\n\t" \
10258 - "rorl $16,%1\n\t" \
10259 - "movb %b1,4(%2)\n\t" \
10260 - "movb %4,5(%2)\n\t" \
10261 - "movb $0,6(%2)\n\t" \
10262 - "movb %h1,7(%2)\n\t" \
10263 - "rorl $16,%1" \
10264 - : "=m"(*(n)) : "q" (addr), "r"(n), "ir"(limit), "i"(type))
10265 +#ifndef CONFIG_XEN
10266 +static inline void write_dt_entry(void *dt, int entry, __u32 entry_a, __u32 entry_b)
10267 +{
10268 + __u32 *lp = (__u32 *)((char *)dt + entry*8);
10269 + *lp = entry_a;
10270 + *(lp+1) = entry_b;
10271 +}
10272
10273 -#ifndef CONFIG_X86_NO_TSS
10274 -static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, void *addr)
10275 +#define write_ldt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
10276 +#define write_gdt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
10277 +#else
10278 +extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b);
10279 +extern int write_gdt_entry(void *gdt, int entry, __u32 entry_a, __u32 entry_b);
10280 +#endif
10281 +#ifndef CONFIG_X86_NO_IDT
10282 +#define write_idt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
10283 +
10284 +static inline void _set_gate(int gate, unsigned int type, void *addr, unsigned short seg)
10285 {
10286 - _set_tssldt_desc(&get_cpu_gdt_table(cpu)[entry], (int)addr,
10287 - offsetof(struct tss_struct, __cacheline_filler) - 1, 0x89);
10288 + __u32 a, b;
10289 + pack_gate(&a, &b, (unsigned long)addr, seg, type, 0);
10290 + write_idt_entry(idt_table, gate, a, b);
10291 }
10292 +#endif
10293
10294 -#define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
10295 +#ifndef CONFIG_X86_NO_TSS
10296 +static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, const void *addr)
10297 +{
10298 + __u32 a, b;
10299 + pack_descriptor(&a, &b, (unsigned long)addr,
10300 + offsetof(struct tss_struct, __cacheline_filler) - 1,
10301 + DESCTYPE_TSS, 0);
10302 + write_gdt_entry(get_cpu_gdt_table(cpu), entry, a, b);
10303 +}
10304 #endif
10305
10306 -static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int size)
10307 +static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int entries)
10308 {
10309 - _set_tssldt_desc(&get_cpu_gdt_table(cpu)[GDT_ENTRY_LDT], (int)addr, ((size << 3)-1), 0x82);
10310 + __u32 a, b;
10311 + pack_descriptor(&a, &b, (unsigned long)addr,
10312 + entries * sizeof(struct desc_struct) - 1,
10313 + DESCTYPE_LDT, 0);
10314 + write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, a, b);
10315 }
10316
10317 +#define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
10318 +
10319 #define LDT_entry_a(info) \
10320 ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
10321
10322 @@ -103,21 +161,6 @@
10323 (info)->seg_not_present == 1 && \
10324 (info)->useable == 0 )
10325
10326 -extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b);
10327 -
10328 -#if TLS_SIZE != 24
10329 -# error update this code.
10330 -#endif
10331 -
10332 -static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
10333 -{
10334 -#define C(i) if (HYPERVISOR_update_descriptor(virt_to_machine(&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]), \
10335 - *(u64 *)&t->tls_array[i])) \
10336 - BUG();
10337 - C(0); C(1); C(2);
10338 -#undef C
10339 -}
10340 -
10341 static inline void clear_LDT(void)
10342 {
10343 int cpu = get_cpu();
10344 --- a/include/asm-x86/mach-xen/asm/dma-mapping_64.h
10345 +++ b/include/asm-x86/mach-xen/asm/dma-mapping_64.h
10346 @@ -55,13 +55,6 @@
10347 extern struct dma_mapping_ops* dma_ops;
10348 extern int iommu_merge;
10349
10350 -static inline int valid_dma_direction(int dma_direction)
10351 -{
10352 - return ((dma_direction == DMA_BIDIRECTIONAL) ||
10353 - (dma_direction == DMA_TO_DEVICE) ||
10354 - (dma_direction == DMA_FROM_DEVICE));
10355 -}
10356 -
10357 #if 0
10358 static inline int dma_mapping_error(dma_addr_t dma_addr)
10359 {
10360 --- a/include/asm-x86/mach-xen/asm/e820_64.h
10361 +++ b/include/asm-x86/mach-xen/asm/e820_64.h
10362 @@ -19,13 +19,9 @@
10363
10364 #define E820_RAM 1
10365 #define E820_RESERVED 2
10366 -#define E820_ACPI 3 /* usable as RAM once ACPI tables have been read */
10367 +#define E820_ACPI 3
10368 #define E820_NVS 4
10369
10370 -#define HIGH_MEMORY (1024*1024)
10371 -
10372 -#define LOWMEMSIZE() (0x9f000)
10373 -
10374 #ifndef __ASSEMBLY__
10375 struct e820entry {
10376 u64 addr; /* start of memory segment */
10377 @@ -46,17 +42,16 @@
10378 extern void contig_e820_setup(void);
10379 extern unsigned long e820_end_of_ram(void);
10380 extern void e820_reserve_resources(struct e820entry *e820, int nr_map);
10381 +extern void e820_mark_nosave_regions(void);
10382 extern void e820_print_map(char *who);
10383 extern int e820_any_mapped(unsigned long start, unsigned long end, unsigned type);
10384 extern int e820_all_mapped(unsigned long start, unsigned long end, unsigned type);
10385
10386 -extern void e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end);
10387 extern void e820_setup_gap(struct e820entry *e820, int nr_map);
10388 -extern unsigned long e820_hole_size(unsigned long start_pfn,
10389 - unsigned long end_pfn);
10390 +extern void e820_register_active_regions(int nid,
10391 + unsigned long start_pfn, unsigned long end_pfn);
10392
10393 -extern void __init parse_memopt(char *p, char **end);
10394 -extern void __init parse_memmapopt(char *p, char **end);
10395 +extern void finish_e820_parsing(void);
10396
10397 extern struct e820map e820;
10398
10399 --- a/include/asm-x86/mach-xen/asm/fixmap_32.h
10400 +++ b/include/asm-x86/mach-xen/asm/fixmap_32.h
10401 @@ -55,7 +55,7 @@
10402 #ifdef CONFIG_X86_LOCAL_APIC
10403 FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
10404 #endif
10405 -#ifdef CONFIG_X86_IO_APIC
10406 +#if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_XEN)
10407 FIX_IO_APIC_BASE_0,
10408 FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
10409 #endif
10410 @@ -95,10 +95,9 @@
10411 __end_of_fixed_addresses
10412 };
10413
10414 -extern void set_fixaddr_top(unsigned long top);
10415 -
10416 extern void __set_fixmap(enum fixed_addresses idx,
10417 maddr_t phys, pgprot_t flags);
10418 +extern void reserve_top_address(unsigned long reserve);
10419
10420 #define set_fixmap(idx, phys) \
10421 __set_fixmap(idx, phys, PAGE_KERNEL)
10422 --- a/include/asm-x86/mach-xen/asm/fixmap_64.h
10423 +++ b/include/asm-x86/mach-xen/asm/fixmap_64.h
10424 @@ -41,7 +41,7 @@
10425 #ifdef CONFIG_X86_LOCAL_APIC
10426 FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
10427 #endif
10428 -#ifdef CONFIG_X86_IO_APIC
10429 +#ifndef CONFIG_XEN
10430 FIX_IO_APIC_BASE_0,
10431 FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
10432 #endif
10433 --- a/include/asm-x86/mach-xen/asm/hw_irq_32.h
10434 +++ b/include/asm-x86/mach-xen/asm/hw_irq_32.h
10435 @@ -17,8 +17,6 @@
10436 #include <asm/irq.h>
10437 #include <asm/sections.h>
10438
10439 -struct hw_interrupt_type;
10440 -
10441 #define NMI_VECTOR 0x02
10442
10443 /*
10444 @@ -28,10 +26,6 @@
10445 * Interrupt entry/exit code at both C and assembly level
10446 */
10447
10448 -extern u8 irq_vector[NR_IRQ_VECTORS];
10449 -#define IO_APIC_VECTOR(irq) (irq_vector[irq])
10450 -#define AUTO_ASSIGN -1
10451 -
10452 extern void (*interrupt[NR_IRQS])(void);
10453
10454 #ifdef CONFIG_SMP
10455 @@ -44,7 +38,7 @@
10456 fastcall void apic_timer_interrupt(void);
10457 fastcall void error_interrupt(void);
10458 fastcall void spurious_interrupt(void);
10459 -fastcall void thermal_interrupt(struct pt_regs *);
10460 +fastcall void thermal_interrupt(void);
10461 #define platform_legacy_irq(irq) ((irq) < 16)
10462 #endif
10463
10464 --- a/include/asm-x86/mach-xen/asm/hw_irq_64.h
10465 +++ b/include/asm-x86/mach-xen/asm/hw_irq_64.h
10466 @@ -19,8 +19,7 @@
10467 #include <asm/irq.h>
10468 #include <linux/profile.h>
10469 #include <linux/smp.h>
10470 -
10471 -struct hw_interrupt_type;
10472 +#include <linux/percpu.h>
10473 #endif
10474
10475 #define NMI_VECTOR 0x02
10476 @@ -77,9 +76,10 @@
10477
10478
10479 #ifndef __ASSEMBLY__
10480 -extern u8 irq_vector[NR_IRQ_VECTORS];
10481 -#define IO_APIC_VECTOR(irq) (irq_vector[irq])
10482 -#define AUTO_ASSIGN -1
10483 +typedef int vector_irq_t[NR_VECTORS];
10484 +DECLARE_PER_CPU(vector_irq_t, vector_irq);
10485 +extern void __setup_vector_irq(int cpu);
10486 +extern spinlock_t vector_lock;
10487
10488 /*
10489 * Various low-level irq details needed by irq.c, process.c,
10490 --- a/include/asm-x86/mach-xen/asm/io_32.h
10491 +++ b/include/asm-x86/mach-xen/asm/io_32.h
10492 @@ -237,33 +237,6 @@
10493
10494 #define eth_io_copy_and_sum(a,b,c,d) eth_copy_and_sum((a),(void __force *)(b),(c),(d))
10495
10496 -/**
10497 - * check_signature - find BIOS signatures
10498 - * @io_addr: mmio address to check
10499 - * @signature: signature block
10500 - * @length: length of signature
10501 - *
10502 - * Perform a signature comparison with the mmio address io_addr. This
10503 - * address should have been obtained by ioremap.
10504 - * Returns 1 on a match.
10505 - */
10506 -
10507 -static inline int check_signature(volatile void __iomem * io_addr,
10508 - const unsigned char *signature, int length)
10509 -{
10510 - int retval = 0;
10511 - do {
10512 - if (readb(io_addr) != *signature)
10513 - goto out;
10514 - io_addr++;
10515 - signature++;
10516 - length--;
10517 - } while (length);
10518 - retval = 1;
10519 -out:
10520 - return retval;
10521 -}
10522 -
10523 /*
10524 * Cache management
10525 *
10526 --- a/include/asm-x86/mach-xen/asm/io_64.h
10527 +++ b/include/asm-x86/mach-xen/asm/io_64.h
10528 @@ -273,33 +273,6 @@
10529
10530 #define eth_io_copy_and_sum(a,b,c,d) eth_copy_and_sum((a),(void *)(b),(c),(d))
10531
10532 -/**
10533 - * check_signature - find BIOS signatures
10534 - * @io_addr: mmio address to check
10535 - * @signature: signature block
10536 - * @length: length of signature
10537 - *
10538 - * Perform a signature comparison with the mmio address io_addr. This
10539 - * address should have been obtained by ioremap.
10540 - * Returns 1 on a match.
10541 - */
10542 -
10543 -static inline int check_signature(void __iomem *io_addr,
10544 - const unsigned char *signature, int length)
10545 -{
10546 - int retval = 0;
10547 - do {
10548 - if (readb(io_addr) != *signature)
10549 - goto out;
10550 - io_addr++;
10551 - signature++;
10552 - length--;
10553 - } while (length);
10554 - retval = 1;
10555 -out:
10556 - return retval;
10557 -}
10558 -
10559 /* Nothing to do */
10560
10561 #define dma_cache_inv(_start,_size) do { } while (0)
10562 --- a/include/asm-x86/mach-xen/asm/pgtable-2level.h
10563 +++ b/include/asm-x86/mach-xen/asm/pgtable-2level.h
10564 @@ -23,14 +23,6 @@
10565 set_pte((ptep), (pteval)); \
10566 } while (0)
10567
10568 -#define set_pte_at_sync(_mm,addr,ptep,pteval) do { \
10569 - if (((_mm) != current->mm && (_mm) != &init_mm) || \
10570 - HYPERVISOR_update_va_mapping((addr), (pteval), UVMF_INVLPG)) { \
10571 - set_pte((ptep), (pteval)); \
10572 - xen_invlpg((addr)); \
10573 - } \
10574 -} while (0)
10575 -
10576 #define set_pte_atomic(pteptr, pteval) set_pte(pteptr,pteval)
10577
10578 #define set_pmd(pmdptr, pmdval) xen_l2_entry_update((pmdptr), (pmdval))
10579 @@ -40,6 +32,7 @@
10580
10581 #define pte_none(x) (!(x).pte_low)
10582
10583 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
10584 static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
10585 {
10586 pte_t pte = *ptep;
10587 @@ -51,6 +44,7 @@
10588 return pte;
10589 }
10590
10591 +#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
10592 #define ptep_clear_flush(vma, addr, ptep) \
10593 ({ \
10594 pte_t *__ptep = (ptep); \
10595 @@ -66,8 +60,6 @@
10596 __res; \
10597 })
10598
10599 -#define pte_same(a, b) ((a).pte_low == (b).pte_low)
10600 -
10601 #define __pte_mfn(_pte) ((_pte).pte_low >> PAGE_SHIFT)
10602 #define pte_mfn(_pte) ((_pte).pte_low & _PAGE_PRESENT ? \
10603 __pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte)))
10604 --- a/include/asm-x86/mach-xen/asm/pgtable-3level.h
10605 +++ b/include/asm-x86/mach-xen/asm/pgtable-3level.h
10606 @@ -53,7 +53,6 @@
10607 * not possible, use pte_get_and_clear to obtain the old pte
10608 * value and then use set_pte to update it. -ben
10609 */
10610 -#define __HAVE_ARCH_SET_PTE_ATOMIC
10611
10612 static inline void set_pte(pte_t *ptep, pte_t pte)
10613 {
10614 @@ -70,14 +69,6 @@
10615 set_pte((ptep), (pteval)); \
10616 } while (0)
10617
10618 -#define set_pte_at_sync(_mm,addr,ptep,pteval) do { \
10619 - if (((_mm) != current->mm && (_mm) != &init_mm) || \
10620 - HYPERVISOR_update_va_mapping((addr), (pteval), UVMF_INVLPG)) { \
10621 - set_pte((ptep), (pteval)); \
10622 - xen_invlpg((addr)); \
10623 - } \
10624 -} while (0)
10625 -
10626 #define set_pmd(pmdptr,pmdval) \
10627 xen_l2_entry_update((pmdptr), (pmdval))
10628 #define set_pud(pudptr,pudval) \
10629 @@ -94,7 +85,7 @@
10630 #define pud_page(pud) \
10631 ((struct page *) __va(pud_val(pud) & PAGE_MASK))
10632
10633 -#define pud_page_kernel(pud) \
10634 +#define pud_page_vaddr(pud) \
10635 ((unsigned long) __va(pud_val(pud) & PAGE_MASK))
10636
10637
10638 @@ -124,6 +115,7 @@
10639
10640 #define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
10641
10642 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
10643 static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
10644 {
10645 pte_t pte = *ptep;
10646 @@ -142,6 +134,7 @@
10647 return pte;
10648 }
10649
10650 +#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
10651 #define ptep_clear_flush(vma, addr, ptep) \
10652 ({ \
10653 pte_t *__ptep = (ptep); \
10654 @@ -159,6 +152,7 @@
10655 __res; \
10656 })
10657
10658 +#define __HAVE_ARCH_PTE_SAME
10659 static inline int pte_same(pte_t a, pte_t b)
10660 {
10661 return a.pte_low == b.pte_low && a.pte_high == b.pte_high;
10662 --- a/include/asm-x86/mach-xen/asm/pgtable_32.h
10663 +++ b/include/asm-x86/mach-xen/asm/pgtable_32.h
10664 @@ -260,31 +260,89 @@
10665 # include <asm/pgtable-2level.h>
10666 #endif
10667
10668 -#define ptep_test_and_clear_dirty(vma, addr, ptep) \
10669 +/*
10670 + * Rules for using pte_update - it must be called after any PTE update which
10671 + * has not been done using the set_pte / clear_pte interfaces. It is used by
10672 + * shadow mode hypervisors to resynchronize the shadow page tables. Kernel PTE
10673 + * updates should either be sets, clears, or set_pte_atomic for P->P
10674 + * transitions, which means this hook should only be called for user PTEs.
10675 + * This hook implies a P->P protection or access change has taken place, which
10676 + * requires a subsequent TLB flush. The notification can optionally be delayed
10677 + * until the TLB flush event by using the pte_update_defer form of the
10678 + * interface, but care must be taken to assure that the flush happens while
10679 + * still holding the same page table lock so that the shadow and primary pages
10680 + * do not become out of sync on SMP.
10681 + */
10682 +#define pte_update(mm, addr, ptep) do { } while (0)
10683 +#define pte_update_defer(mm, addr, ptep) do { } while (0)
10684 +
10685 +
10686 +/*
10687 + * We only update the dirty/accessed state if we set
10688 + * the dirty bit by hand in the kernel, since the hardware
10689 + * will do the accessed bit for us, and we don't want to
10690 + * race with other CPU's that might be updating the dirty
10691 + * bit at the same time.
10692 + */
10693 +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
10694 +#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
10695 +do { \
10696 + if (dirty) \
10697 + ptep_establish(vma, address, ptep, entry); \
10698 +} while (0)
10699 +
10700 +/*
10701 + * We don't actually have these, but we want to advertise them so that
10702 + * we can encompass the flush here.
10703 + */
10704 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
10705 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
10706 +
10707 +/*
10708 + * Rules for using ptep_establish: the pte MUST be a user pte, and
10709 + * must be a present->present transition.
10710 + */
10711 +#define __HAVE_ARCH_PTEP_ESTABLISH
10712 +#define ptep_establish(vma, address, ptep, pteval) \
10713 +do { \
10714 + if ( likely((vma)->vm_mm == current->mm) ) { \
10715 + BUG_ON(HYPERVISOR_update_va_mapping(address, \
10716 + pteval, \
10717 + (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
10718 + UVMF_INVLPG|UVMF_MULTI)); \
10719 + } else { \
10720 + xen_l1_entry_update(ptep, pteval); \
10721 + flush_tlb_page(vma, address); \
10722 + } \
10723 +} while (0)
10724 +
10725 +#define __HAVE_ARCH_PTEP_CLEAR_DIRTY_FLUSH
10726 +#define ptep_clear_flush_dirty(vma, address, ptep) \
10727 ({ \
10728 pte_t __pte = *(ptep); \
10729 - int __ret = pte_dirty(__pte); \
10730 - if (__ret) { \
10731 - __pte = pte_mkclean(__pte); \
10732 - if ((vma)->vm_mm != current->mm || \
10733 - HYPERVISOR_update_va_mapping(addr, __pte, 0)) \
10734 - (ptep)->pte_low = __pte.pte_low; \
10735 - } \
10736 - __ret; \
10737 + int __dirty = pte_dirty(__pte); \
10738 + __pte = pte_mkclean(__pte); \
10739 + if (test_bit(PG_pinned, &virt_to_page((vma)->vm_mm->pgd)->flags)) \
10740 + ptep_set_access_flags(vma, address, ptep, __pte, __dirty); \
10741 + else if (__dirty) \
10742 + (ptep)->pte_low = __pte.pte_low; \
10743 + __dirty; \
10744 })
10745
10746 -#define ptep_test_and_clear_young(vma, addr, ptep) \
10747 +#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
10748 +#define ptep_clear_flush_young(vma, address, ptep) \
10749 ({ \
10750 pte_t __pte = *(ptep); \
10751 - int __ret = pte_young(__pte); \
10752 - if (__ret) \
10753 - __pte = pte_mkold(__pte); \
10754 - if ((vma)->vm_mm != current->mm || \
10755 - HYPERVISOR_update_va_mapping(addr, __pte, 0)) \
10756 - (ptep)->pte_low = __pte.pte_low; \
10757 - __ret; \
10758 + int __young = pte_young(__pte); \
10759 + __pte = pte_mkold(__pte); \
10760 + if (test_bit(PG_pinned, &virt_to_page((vma)->vm_mm->pgd)->flags)) \
10761 + ptep_set_access_flags(vma, address, ptep, __pte, __young); \
10762 + else if (__young) \
10763 + (ptep)->pte_low = __pte.pte_low; \
10764 + __young; \
10765 })
10766
10767 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
10768 #define ptep_get_and_clear_full(mm, addr, ptep, full) \
10769 ((full) ? ({ \
10770 pte_t __res = *(ptep); \
10771 @@ -296,6 +354,7 @@
10772 }) : \
10773 ptep_get_and_clear(mm, addr, ptep))
10774
10775 +#define __HAVE_ARCH_PTEP_SET_WRPROTECT
10776 static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
10777 {
10778 pte_t pte = *ptep;
10779 @@ -391,11 +450,11 @@
10780 #define pte_index(address) \
10781 (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
10782 #define pte_offset_kernel(dir, address) \
10783 - ((pte_t *) pmd_page_kernel(*(dir)) + pte_index(address))
10784 + ((pte_t *) pmd_page_vaddr(*(dir)) + pte_index(address))
10785
10786 #define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
10787
10788 -#define pmd_page_kernel(pmd) \
10789 +#define pmd_page_vaddr(pmd) \
10790 ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
10791
10792 /*
10793 @@ -418,8 +477,6 @@
10794 static inline int set_kernel_exec(unsigned long vaddr, int enable) { return 0;}
10795 #endif
10796
10797 -extern void noexec_setup(const char *str);
10798 -
10799 #if defined(CONFIG_HIGHPTE)
10800 #define pte_offset_map(dir, address) \
10801 ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + \
10802 @@ -437,37 +494,17 @@
10803 #define pte_unmap_nested(pte) do { } while (0)
10804 #endif
10805
10806 -#define __HAVE_ARCH_PTEP_ESTABLISH
10807 -#define ptep_establish(vma, address, ptep, pteval) \
10808 - do { \
10809 - if ( likely((vma)->vm_mm == current->mm) ) { \
10810 - BUG_ON(HYPERVISOR_update_va_mapping(address, \
10811 - pteval, \
10812 - (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
10813 - UVMF_INVLPG|UVMF_MULTI)); \
10814 - } else { \
10815 - xen_l1_entry_update(ptep, pteval); \
10816 - flush_tlb_page(vma, address); \
10817 - } \
10818 - } while (0)
10819 +/* Clear a kernel PTE and flush it from the TLB */
10820 +#define kpte_clear_flush(ptep, vaddr) do { \
10821 + if (HYPERVISOR_update_va_mapping(vaddr, __pte(0), UVMF_INVLPG)) \
10822 + BUG(); \
10823 +} while (0)
10824
10825 /*
10826 * The i386 doesn't have any external MMU info: the kernel page
10827 * tables contain all the necessary information.
10828 - *
10829 - * Also, we only update the dirty/accessed state if we set
10830 - * the dirty bit by hand in the kernel, since the hardware
10831 - * will do the accessed bit for us, and we don't want to
10832 - * race with other CPU's that might be updating the dirty
10833 - * bit at the same time.
10834 */
10835 #define update_mmu_cache(vma,address,pte) do { } while (0)
10836 -#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
10837 -#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
10838 - do { \
10839 - if (dirty) \
10840 - ptep_establish(vma, address, ptep, entry); \
10841 - } while (0)
10842
10843 #include <xen/features.h>
10844 void make_lowmem_page_readonly(void *va, unsigned int feature);
10845 @@ -516,10 +553,11 @@
10846 unsigned long size);
10847
10848 int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
10849 - unsigned long addr, unsigned long end, pgprot_t newprot);
10850 + unsigned long addr, unsigned long end, pgprot_t newprot,
10851 + int dirty_accountable);
10852
10853 -#define arch_change_pte_range(mm, pmd, addr, end, newprot) \
10854 - xen_change_pte_range(mm, pmd, addr, end, newprot)
10855 +#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
10856 + xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
10857
10858 #define io_remap_pfn_range(vma,from,pfn,size,prot) \
10859 direct_remap_pfn_range(vma,from,pfn,size,prot,DOMID_IO)
10860 @@ -528,13 +566,6 @@
10861 #define GET_IOSPACE(pfn) 0
10862 #define GET_PFN(pfn) (pfn)
10863
10864 -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
10865 -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
10866 -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
10867 -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
10868 -#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
10869 -#define __HAVE_ARCH_PTEP_SET_WRPROTECT
10870 -#define __HAVE_ARCH_PTE_SAME
10871 #include <asm-generic/pgtable.h>
10872
10873 #endif /* _I386_PGTABLE_H */
10874 --- a/include/asm-x86/mach-xen/asm/pgtable_64.h
10875 +++ b/include/asm-x86/mach-xen/asm/pgtable_64.h
10876 @@ -43,12 +43,9 @@
10877
10878 #define swapper_pg_dir init_level4_pgt
10879
10880 -extern int nonx_setup(char *str);
10881 extern void paging_init(void);
10882 extern void clear_kernel_mapping(unsigned long addr, unsigned long size);
10883
10884 -extern unsigned long pgkern_mask;
10885 -
10886 /*
10887 * ZERO_PAGE is a global shared page that is always zero: used
10888 * for zero-mapped memory areas etc..
10889 @@ -118,9 +115,6 @@
10890 set_pgd(__user_pgd(pgd), __pgd(0));
10891 }
10892
10893 -#define pud_page(pud) \
10894 - ((unsigned long) __va(pud_val(pud) & PHYSICAL_PAGE_MASK))
10895 -
10896 #define pte_same(a, b) ((a).pte == (b).pte)
10897
10898 #define pte_pgprot(a) (__pgprot((a).pte & ~PHYSICAL_PAGE_MASK))
10899 @@ -332,7 +326,7 @@
10900 #define __LARGE_PTE (_PAGE_PSE|_PAGE_PRESENT)
10901 static inline int pte_user(pte_t pte) { return __pte_val(pte) & _PAGE_USER; }
10902 static inline int pte_read(pte_t pte) { return __pte_val(pte) & _PAGE_USER; }
10903 -static inline int pte_exec(pte_t pte) { return __pte_val(pte) & _PAGE_USER; }
10904 +static inline int pte_exec(pte_t pte) { return !(__pte_val(pte) & _PAGE_NX); }
10905 static inline int pte_dirty(pte_t pte) { return __pte_val(pte) & _PAGE_DIRTY; }
10906 static inline int pte_young(pte_t pte) { return __pte_val(pte) & _PAGE_ACCESSED; }
10907 static inline int pte_write(pte_t pte) { return __pte_val(pte) & _PAGE_RW; }
10908 @@ -345,29 +339,12 @@
10909 static inline pte_t pte_mkold(pte_t pte) { __pte_val(pte) &= ~_PAGE_ACCESSED; return pte; }
10910 static inline pte_t pte_wrprotect(pte_t pte) { __pte_val(pte) &= ~_PAGE_RW; return pte; }
10911 static inline pte_t pte_mkread(pte_t pte) { __pte_val(pte) |= _PAGE_USER; return pte; }
10912 -static inline pte_t pte_mkexec(pte_t pte) { __pte_val(pte) |= _PAGE_USER; return pte; }
10913 +static inline pte_t pte_mkexec(pte_t pte) { __pte_val(pte) &= ~_PAGE_NX; return pte; }
10914 static inline pte_t pte_mkdirty(pte_t pte) { __pte_val(pte) |= _PAGE_DIRTY; return pte; }
10915 static inline pte_t pte_mkyoung(pte_t pte) { __pte_val(pte) |= _PAGE_ACCESSED; return pte; }
10916 static inline pte_t pte_mkwrite(pte_t pte) { __pte_val(pte) |= _PAGE_RW; return pte; }
10917 static inline pte_t pte_mkhuge(pte_t pte) { __pte_val(pte) |= _PAGE_PSE; return pte; }
10918 -
10919 -#define ptep_test_and_clear_dirty(vma, addr, ptep) \
10920 -({ \
10921 - pte_t __pte = *(ptep); \
10922 - int __ret = pte_dirty(__pte); \
10923 - if (__ret) \
10924 - set_pte_at((vma)->vm_mm, addr, ptep, pte_mkclean(__pte)); \
10925 - __ret; \
10926 -})
10927 -
10928 -#define ptep_test_and_clear_young(vma, addr, ptep) \
10929 -({ \
10930 - pte_t __pte = *(ptep); \
10931 - int __ret = pte_young(__pte); \
10932 - if (__ret) \
10933 - set_pte_at((vma)->vm_mm, addr, ptep, pte_mkold(__pte)); \
10934 - __ret; \
10935 -})
10936 +static inline pte_t pte_clrhuge(pte_t pte) { __pte_val(pte) &= ~_PAGE_PSE; return pte; }
10937
10938 static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
10939 {
10940 @@ -395,7 +372,8 @@
10941 * Level 4 access.
10942 * Never use these in the common code.
10943 */
10944 -#define pgd_page(pgd) ((unsigned long) __va(pgd_val(pgd) & PTE_MASK))
10945 +#define pgd_page_vaddr(pgd) ((unsigned long) __va(pgd_val(pgd) & PTE_MASK))
10946 +#define pgd_page(pgd) (pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT))
10947 #define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
10948 #define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr))
10949 #define pgd_offset_k(address) (init_level4_pgt + pgd_index(address))
10950 @@ -404,16 +382,18 @@
10951
10952 /* PUD - Level3 access */
10953 /* to find an entry in a page-table-directory. */
10954 +#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PHYSICAL_PAGE_MASK))
10955 +#define pud_page(pud) (pfn_to_page(pud_val(pud) >> PAGE_SHIFT))
10956 #define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
10957 -#define pud_offset(pgd, address) ((pud_t *) pgd_page(*(pgd)) + pud_index(address))
10958 +#define pud_offset(pgd, address) ((pud_t *) pgd_page_vaddr(*(pgd)) + pud_index(address))
10959 #define pud_present(pud) (__pud_val(pud) & _PAGE_PRESENT)
10960
10961 /* PMD - Level 2 access */
10962 -#define pmd_page_kernel(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK))
10963 +#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK))
10964 #define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
10965
10966 #define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
10967 -#define pmd_offset(dir, address) ((pmd_t *) pud_page(*(dir)) + \
10968 +#define pmd_offset(dir, address) ((pmd_t *) pud_page_vaddr(*(dir)) + \
10969 pmd_index(address))
10970 #define pmd_none(x) (!__pmd_val(x))
10971 #if CONFIG_XEN_COMPAT <= 0x030002
10972 @@ -444,6 +424,7 @@
10973 {
10974 unsigned long pteval;
10975 pteval = physpage | pgprot_val(pgprot);
10976 + pteval &= __supported_pte_mask;
10977 return __pte(pteval);
10978 }
10979
10980 @@ -465,7 +446,7 @@
10981
10982 #define pte_index(address) \
10983 (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
10984 -#define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_kernel(*(dir)) + \
10985 +#define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_vaddr(*(dir)) + \
10986 pte_index(address))
10987
10988 /* x86-64 always has all page tables mapped. */
10989 @@ -506,6 +487,40 @@
10990 ptep_establish(vma, address, ptep, entry); \
10991 } while (0)
10992
10993 +
10994 +/*
10995 + * i386 says: We don't actually have these, but we want to advertise
10996 + * them so that we can encompass the flush here.
10997 + */
10998 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
10999 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
11000 +
11001 +#define __HAVE_ARCH_PTEP_CLEAR_DIRTY_FLUSH
11002 +#define ptep_clear_flush_dirty(vma, address, ptep) \
11003 +({ \
11004 + pte_t __pte = *(ptep); \
11005 + int __dirty = pte_dirty(__pte); \
11006 + __pte = pte_mkclean(__pte); \
11007 + if ((vma)->vm_mm->context.pinned) \
11008 + ptep_set_access_flags(vma, address, ptep, __pte, __dirty); \
11009 + else if (__dirty) \
11010 + set_pte(ptep, __pte); \
11011 + __dirty; \
11012 +})
11013 +
11014 +#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
11015 +#define ptep_clear_flush_young(vma, address, ptep) \
11016 +({ \
11017 + pte_t __pte = *(ptep); \
11018 + int __young = pte_young(__pte); \
11019 + __pte = pte_mkold(__pte); \
11020 + if ((vma)->vm_mm->context.pinned) \
11021 + ptep_set_access_flags(vma, address, ptep, __pte, __young); \
11022 + else if (__young) \
11023 + set_pte(ptep, __pte); \
11024 + __young; \
11025 +})
11026 +
11027 /* Encode and de-code a swap entry */
11028 #define __swp_type(x) (((x).val >> 1) & 0x3f)
11029 #define __swp_offset(x) ((x).val >> 8)
11030 @@ -547,10 +562,11 @@
11031 unsigned long size);
11032
11033 int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
11034 - unsigned long addr, unsigned long end, pgprot_t newprot);
11035 + unsigned long addr, unsigned long end, pgprot_t newprot,
11036 + int dirty_accountable);
11037
11038 -#define arch_change_pte_range(mm, pmd, addr, end, newprot) \
11039 - xen_change_pte_range(mm, pmd, addr, end, newprot)
11040 +#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
11041 + xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
11042
11043 #define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
11044 direct_remap_pfn_range(vma,vaddr,pfn,size,prot,DOMID_IO)
11045 @@ -572,8 +588,6 @@
11046 #define kc_offset_to_vaddr(o) \
11047 (((o) & (1UL << (__VIRTUAL_MASK_SHIFT-1))) ? ((o) | (~__VIRTUAL_MASK)) : (o))
11048
11049 -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
11050 -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
11051 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
11052 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
11053 #define __HAVE_ARCH_PTEP_CLEAR_FLUSH
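[Editorial note, not part of the patch: the hunk above renames the "virtual address of the next level" accessors to *_page_vaddr() while *_page() now yields a struct page. A rough, hypothetical illustration of how the renamed macros compose when walking an address down to its pte; the helper name and error checks are made up, and huge pages are ignored:]

/* Hypothetical helper: resolve the pte mapping 'address' in 'mm',
 * composing the accessors declared in pgtable_64.h above. */
static pte_t *lookup_pte(struct mm_struct *mm, unsigned long address)
{
	pgd_t *pgd = pgd_offset(mm, address);
	pud_t *pud;
	pmd_t *pmd;

	if (pgd_none(*pgd))
		return NULL;
	pud = pud_offset(pgd, address);          /* uses pgd_page_vaddr() */
	if (!pud_present(*pud))
		return NULL;
	pmd = pmd_offset(pud, address);          /* uses pud_page_vaddr() */
	if (pmd_none(*pmd))
		return NULL;
	return pte_offset_kernel(pmd, address);  /* uses pmd_page_vaddr() */
}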
11054 --- a/include/asm-x86/mach-xen/asm/processor_32.h
11055 +++ b/include/asm-x86/mach-xen/asm/processor_32.h
11056 @@ -146,6 +146,18 @@
11057 #define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */
11058 #define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */
11059
11060 +static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
11061 + unsigned int *ecx, unsigned int *edx)
11062 +{
11063 + /* ecx is often an input as well as an output. */
11064 + __asm__(XEN_CPUID
11065 + : "=a" (*eax),
11066 + "=b" (*ebx),
11067 + "=c" (*ecx),
11068 + "=d" (*edx)
11069 + : "0" (*eax), "2" (*ecx));
11070 +}
11071 +
11072 /*
11073 * Generic CPUID function
11074 * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
11075 @@ -153,24 +165,18 @@
11076 */
11077 static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx)
11078 {
11079 - __asm__(XEN_CPUID
11080 - : "=a" (*eax),
11081 - "=b" (*ebx),
11082 - "=c" (*ecx),
11083 - "=d" (*edx)
11084 - : "0" (op), "c"(0));
11085 + *eax = op;
11086 + *ecx = 0;
11087 + __cpuid(eax, ebx, ecx, edx);
11088 }
11089
11090 /* Some CPUID calls want 'count' to be placed in ecx */
11091 static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx,
11092 - int *edx)
11093 + int *edx)
11094 {
11095 - __asm__(XEN_CPUID
11096 - : "=a" (*eax),
11097 - "=b" (*ebx),
11098 - "=c" (*ecx),
11099 - "=d" (*edx)
11100 - : "0" (op), "c" (count));
11101 + *eax = op;
11102 + *ecx = count;
11103 + __cpuid(eax, ebx, ecx, edx);
11104 }
11105
11106 /*
11107 @@ -178,42 +184,30 @@
11108 */
11109 static inline unsigned int cpuid_eax(unsigned int op)
11110 {
11111 - unsigned int eax;
11112 + unsigned int eax, ebx, ecx, edx;
11113
11114 - __asm__(XEN_CPUID
11115 - : "=a" (eax)
11116 - : "0" (op)
11117 - : "bx", "cx", "dx");
11118 + cpuid(op, &eax, &ebx, &ecx, &edx);
11119 return eax;
11120 }
11121 static inline unsigned int cpuid_ebx(unsigned int op)
11122 {
11123 - unsigned int eax, ebx;
11124 + unsigned int eax, ebx, ecx, edx;
11125
11126 - __asm__(XEN_CPUID
11127 - : "=a" (eax), "=b" (ebx)
11128 - : "0" (op)
11129 - : "cx", "dx" );
11130 + cpuid(op, &eax, &ebx, &ecx, &edx);
11131 return ebx;
11132 }
11133 static inline unsigned int cpuid_ecx(unsigned int op)
11134 {
11135 - unsigned int eax, ecx;
11136 + unsigned int eax, ebx, ecx, edx;
11137
11138 - __asm__(XEN_CPUID
11139 - : "=a" (eax), "=c" (ecx)
11140 - : "0" (op)
11141 - : "bx", "dx" );
11142 + cpuid(op, &eax, &ebx, &ecx, &edx);
11143 return ecx;
11144 }
11145 static inline unsigned int cpuid_edx(unsigned int op)
11146 {
11147 - unsigned int eax, edx;
11148 + unsigned int eax, ebx, ecx, edx;
11149
11150 - __asm__(XEN_CPUID
11151 - : "=a" (eax), "=d" (edx)
11152 - : "0" (op)
11153 - : "bx", "cx");
11154 + cpuid(op, &eax, &ebx, &ecx, &edx);
11155 return edx;
11156 }
11157
11158 @@ -315,6 +309,8 @@
11159 : :"a" (eax), "c" (ecx));
11160 }
11161
11162 +extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
11163 +
11164 /* from system description table in BIOS. Mostly for MCA use, but
11165 others may find it useful. */
11166 extern unsigned int machine_id;
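[Editorial note, not part of the patch: the hunk above funnels every cpuid helper through one __cpuid() that takes %eax and %ecx as both inputs and outputs. A stand-alone user-space sketch of the same calling convention, using the raw cpuid instruction rather than XEN_CPUID, so it only approximates the paravirtualized helper:]

#include <stdio.h>
#include <string.h>

/* Same in/out convention as the kernel's __cpuid() above, but with the
 * plain cpuid instruction instead of XEN_CPUID. */
static void my_cpuid(unsigned int *eax, unsigned int *ebx,
                     unsigned int *ecx, unsigned int *edx)
{
	/* ecx is often an input as well as an output. */
	__asm__("cpuid"
	        : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
	        : "0" (*eax), "2" (*ecx));
}

int main(void)
{
	unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
	char vendor[13];

	my_cpuid(&eax, &ebx, &ecx, &edx);	/* leaf 0, subleaf 0 */
	memcpy(vendor + 0, &ebx, 4);
	memcpy(vendor + 4, &edx, 4);
	memcpy(vendor + 8, &ecx, 4);
	vendor[12] = '\0';
	printf("max leaf %u, vendor %s\n", eax, vendor);
	return 0;
}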
11167 --- a/include/asm-x86/mach-xen/asm/processor_64.h
11168 +++ b/include/asm-x86/mach-xen/asm/processor_64.h
11169 @@ -484,6 +484,8 @@
11170 : :"a" (eax), "c" (ecx));
11171 }
11172
11173 +extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
11174 +
11175 #define stack_current() \
11176 ({ \
11177 struct thread_info *ti; \
11178 --- a/include/asm-x86/mach-xen/asm/segment_32.h
11179 +++ b/include/asm-x86/mach-xen/asm/segment_32.h
11180 @@ -61,11 +61,9 @@
11181
11182 #define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0)
11183 #define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
11184 -#define GET_KERNEL_CS() (__KERNEL_CS | (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1) )
11185
11186 #define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1)
11187 #define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
11188 -#define GET_KERNEL_DS() (__KERNEL_DS | (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1) )
11189
11190 #define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4)
11191 #define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5)
11192 @@ -85,6 +83,11 @@
11193
11194 #define GDT_SIZE (GDT_ENTRIES * 8)
11195
11196 +/* Matches __KERNEL_CS and __USER_CS (they must be 2 entries apart) */
11197 +#define SEGMENT_IS_FLAT_CODE(x) (((x) & 0xec) == GDT_ENTRY_KERNEL_CS * 8)
11198 +/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */
11199 +#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8)
11200 +
11201 /* Simple and small GDT entries for booting only */
11202
11203 #define GDT_ENTRY_BOOT_CS 2
11204 @@ -114,4 +117,16 @@
11205 */
11206 #define IDT_ENTRIES 256
11207
11208 +/* Bottom two bits of selector give the ring privilege level */
11209 +#define SEGMENT_RPL_MASK 0x3
11210 +/* Bit 2 is table indicator (LDT/GDT) */
11211 +#define SEGMENT_TI_MASK 0x4
11212 +
11213 +/* User mode is privilege level 3 */
11214 +#define USER_RPL 0x3
11215 +/* LDT segment has TI set, GDT has it cleared */
11216 +#define SEGMENT_LDT 0x4
11217 +#define SEGMENT_GDT 0x0
11218 +
11219 +#define get_kernel_rpl() (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1)
11220 #endif
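[Editorial note, not part of the patch: the RPL/TI masks added above let code classify a saved selector instead of comparing it against a hard-coded ring-0 value, which matters because a Xen guest kernel runs at the ring returned by get_kernel_rpl(). An illustrative sketch of the kind of test these constants enable; the helper names are made up:]

/* Illustrative only: decide whether a saved %cs selector came from
 * user space, given that the kernel itself may run at RPL 1 under Xen. */
static inline int selector_is_user(unsigned int cs)
{
	return (cs & SEGMENT_RPL_MASK) == USER_RPL;
}

static inline int selector_is_ldt(unsigned int sel)
{
	return (sel & SEGMENT_TI_MASK) == SEGMENT_LDT;
}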
11221 --- a/include/asm-x86/mach-xen/asm/smp_32.h
11222 +++ b/include/asm-x86/mach-xen/asm/smp_32.h
11223 @@ -79,25 +79,36 @@
11224 return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID));
11225 }
11226 #endif
11227 -
11228 -static __inline int logical_smp_processor_id(void)
11229 -{
11230 - /* we don't want to mark this access volatile - bad code generation */
11231 - return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
11232 -}
11233 -
11234 #endif
11235
11236 +#define safe_smp_processor_id() smp_processor_id()
11237 extern int __cpu_disable(void);
11238 extern void __cpu_die(unsigned int cpu);
11239 extern void prefill_possible_map(void);
11240 +extern unsigned int num_processors;
11241 +
11242 #endif /* !__ASSEMBLY__ */
11243
11244 #else /* CONFIG_SMP */
11245
11246 +#define safe_smp_processor_id() 0
11247 #define cpu_physical_id(cpu) boot_cpu_physical_apicid
11248
11249 #define NO_PROC_ID 0xFF /* No processor magic marker */
11250
11251 #endif
11252 +
11253 +#ifndef __ASSEMBLY__
11254 +
11255 +extern u8 apicid_2_node[];
11256 +
11257 +#ifdef CONFIG_X86_LOCAL_APIC
11258 +static __inline int logical_smp_processor_id(void)
11259 +{
11260 + /* we don't want to mark this access volatile - bad code generation */
11261 + return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
11262 +}
11263 +#endif
11264 +#endif
11265 +
11266 #endif
11267 --- a/include/asm-x86/mach-xen/asm/smp_64.h
11268 +++ b/include/asm-x86/mach-xen/asm/smp_64.h
11269 @@ -4,15 +4,12 @@
11270 /*
11271 * We need the APIC definitions automatically as part of 'smp.h'
11272 */
11273 -#ifndef __ASSEMBLY__
11274 #include <linux/threads.h>
11275 #include <linux/cpumask.h>
11276 #include <linux/bitops.h>
11277 extern int disable_apic;
11278 -#endif
11279
11280 #ifdef CONFIG_X86_LOCAL_APIC
11281 -#ifndef __ASSEMBLY__
11282 #include <asm/fixmap.h>
11283 #include <asm/mpspec.h>
11284 #ifdef CONFIG_X86_IO_APIC
11285 @@ -21,10 +18,8 @@
11286 #include <asm/apic.h>
11287 #include <asm/thread_info.h>
11288 #endif
11289 -#endif
11290
11291 #ifdef CONFIG_SMP
11292 -#ifndef ASSEMBLY
11293
11294 #include <asm/pda.h>
11295
11296 @@ -41,14 +36,11 @@
11297
11298 extern void smp_alloc_memory(void);
11299 extern volatile unsigned long smp_invalidate_needed;
11300 -extern int pic_mode;
11301 extern void lock_ipi_call_lock(void);
11302 extern void unlock_ipi_call_lock(void);
11303 extern int smp_num_siblings;
11304 extern void smp_send_reschedule(int cpu);
11305 void smp_stop_cpu(void);
11306 -extern int smp_call_function_single(int cpuid, void (*func) (void *info),
11307 - void *info, int retry, int wait);
11308
11309 extern cpumask_t cpu_sibling_map[NR_CPUS];
11310 extern cpumask_t cpu_core_map[NR_CPUS];
11311 @@ -77,20 +69,16 @@
11312 }
11313 #endif
11314
11315 -extern int safe_smp_processor_id(void);
11316 extern int __cpu_disable(void);
11317 extern void __cpu_die(unsigned int cpu);
11318 extern void prefill_possible_map(void);
11319 extern unsigned num_processors;
11320 extern unsigned disabled_cpus;
11321
11322 -#endif /* !ASSEMBLY */
11323 -
11324 #define NO_PROC_ID 0xFF /* No processor magic marker */
11325
11326 #endif
11327
11328 -#ifndef ASSEMBLY
11329 /*
11330 * Some lowlevel functions might want to know about
11331 * the real APIC ID <-> CPU # mapping.
11332 @@ -114,11 +102,8 @@
11333 }
11334 #endif
11335
11336 -#endif /* !ASSEMBLY */
11337 -
11338 #ifndef CONFIG_SMP
11339 #define stack_smp_processor_id() 0
11340 -#define safe_smp_processor_id() 0
11341 #define cpu_logical_map(x) (x)
11342 #else
11343 #include <asm/thread_info.h>
11344 @@ -130,7 +115,6 @@
11345 })
11346 #endif
11347
11348 -#ifndef __ASSEMBLY__
11349 #ifdef CONFIG_X86_LOCAL_APIC
11350 static __inline int logical_smp_processor_id(void)
11351 {
11352 @@ -138,13 +122,18 @@
11353 return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
11354 }
11355 #endif
11356 -#endif
11357
11358 #ifdef CONFIG_SMP
11359 #define cpu_physical_id(cpu) x86_cpu_to_apicid[cpu]
11360 #else
11361 #define cpu_physical_id(cpu) boot_cpu_id
11362 -#endif
11363 -
11364 +static inline int smp_call_function_single(int cpuid, void (*func) (void *info),
11365 + void *info, int retry, int wait)
11366 +{
11367 + /* Disable interrupts here? */
11368 + func(info);
11369 + return 0;
11370 +}
11371 +#endif /* !CONFIG_SMP */
11372 #endif
11373
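[Editorial note, not part of the patch: on !CONFIG_SMP builds the inline smp_call_function_single() added above simply runs the function on the local CPU, so callers keep one code path regardless of configuration. A minimal caller sketch with made-up function names:]

/* Hypothetical example: ask CPU 'cpu' to run per-CPU work.
 * On !CONFIG_SMP this collapses to a direct flush_local_state(NULL) call. */
static void flush_local_state(void *unused)
{
	/* ... per-CPU work ... */
}

static int flush_cpu_state(int cpu)
{
	return smp_call_function_single(cpu, flush_local_state, NULL, 0, 1);
}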
11374 --- a/include/asm-x86/mach-xen/asm/system_32.h
11375 +++ b/include/asm-x86/mach-xen/asm/system_32.h
11376 @@ -267,6 +267,9 @@
11377 #define cmpxchg(ptr,o,n)\
11378 ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
11379 (unsigned long)(n),sizeof(*(ptr))))
11380 +#define sync_cmpxchg(ptr,o,n)\
11381 + ((__typeof__(*(ptr)))__sync_cmpxchg((ptr),(unsigned long)(o),\
11382 + (unsigned long)(n),sizeof(*(ptr))))
11383 #endif
11384
11385 static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
11386 @@ -291,6 +294,39 @@
11387 : "=a"(prev)
11388 : "r"(new), "m"(*__xg(ptr)), "0"(old)
11389 : "memory");
11390 + return prev;
11391 + }
11392 + return old;
11393 +}
11394 +
11395 +/*
11396 + * Always use locked operations when touching memory shared with a
11397 + * hypervisor, since the system may be SMP even if the guest kernel
11398 + * isn't.
11399 + */
11400 +static inline unsigned long __sync_cmpxchg(volatile void *ptr,
11401 + unsigned long old,
11402 + unsigned long new, int size)
11403 +{
11404 + unsigned long prev;
11405 + switch (size) {
11406 + case 1:
11407 + __asm__ __volatile__("lock; cmpxchgb %b1,%2"
11408 + : "=a"(prev)
11409 + : "q"(new), "m"(*__xg(ptr)), "0"(old)
11410 + : "memory");
11411 + return prev;
11412 + case 2:
11413 + __asm__ __volatile__("lock; cmpxchgw %w1,%2"
11414 + : "=a"(prev)
11415 + : "r"(new), "m"(*__xg(ptr)), "0"(old)
11416 + : "memory");
11417 + return prev;
11418 + case 4:
11419 + __asm__ __volatile__("lock; cmpxchgl %1,%2"
11420 + : "=a"(prev)
11421 + : "r"(new), "m"(*__xg(ptr)), "0"(old)
11422 + : "memory");
11423 return prev;
11424 }
11425 return old;
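[Editorial note, not part of the patch: sync_cmpxchg()/__sync_cmpxchg() added above always emit the lock prefix because memory shared with the hypervisor (grant tables, ring buffers) can be written by other physical CPUs even when the guest kernel is built UP. A user-space sketch of the same idea using the GCC builtin rather than the kernel macro; the names are illustrative:]

#include <stdio.h>

/* A flag word notionally shared with another agent (here just a global).
 * __sync_val_compare_and_swap always emits a locked cmpxchg, mirroring
 * what sync_cmpxchg() guarantees for hypervisor-shared memory. */
static volatile unsigned int shared_flag;

static int try_claim(volatile unsigned int *flag)
{
	/* Returns 1 if we flipped 0 -> 1, 0 if someone else got there first. */
	return __sync_val_compare_and_swap(flag, 0u, 1u) == 0u;
}

int main(void)
{
	printf("first claim: %d\n", try_claim(&shared_flag));	/* 1 */
	printf("second claim: %d\n", try_claim(&shared_flag));	/* 0 */
	return 0;
}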
11426 --- a/include/asm-x86/mach-xen/asm/system_64.h
11427 +++ b/include/asm-x86/mach-xen/asm/system_64.h
11428 @@ -24,6 +24,7 @@
11429 #define __EXTRA_CLOBBER \
11430 ,"rcx","rbx","rdx","r8","r9","r10","r11","r12","r13","r14","r15"
11431
11432 +/* Save restore flags to clear handle leaking NT */
11433 #define switch_to(prev,next,last) \
11434 asm volatile(SAVE_CONTEXT \
11435 "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
11436 --- a/include/asm-x86/mach-xen/asm/tlbflush_32.h
11437 +++ b/include/asm-x86/mach-xen/asm/tlbflush_32.h
11438 @@ -8,8 +8,6 @@
11439 #define __flush_tlb_global() xen_tlb_flush()
11440 #define __flush_tlb_all() xen_tlb_flush()
11441
11442 -extern unsigned long pgkern_mask;
11443 -
11444 #define cpu_has_invlpg (boot_cpu_data.x86 > 3)
11445
11446 #define __flush_tlb_single(addr) xen_invlpg(addr)
11447 --- a/include/asm-x86/mach-xen/asm/tlbflush_64.h
11448 +++ b/include/asm-x86/mach-xen/asm/tlbflush_64.h
11449 @@ -12,9 +12,6 @@
11450 */
11451 #define __flush_tlb_global() xen_tlb_flush()
11452
11453 -
11454 -extern unsigned long pgkern_mask;
11455 -
11456 #define __flush_tlb_all() __flush_tlb_global()
11457
11458 #define __flush_tlb_one(addr) xen_invlpg((unsigned long)addr)
11459 --- a/include/asm-x86/thread_info_64.h
11460 +++ b/include/asm-x86/thread_info_64.h
11461 @@ -157,10 +157,14 @@
11462 (_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY|_TIF_HRTICK_RESCHED)
11463
11464 /* flags to check in __switch_to() */
11465 +#ifndef CONFIG_XEN
11466 #define _TIF_WORK_CTXSW \
11467 (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_BTS_TRACE_TS)
11468 #define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
11469 #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
11470 +#else
11471 +#define _TIF_WORK_CTXSW _TIF_DEBUG
11472 +#endif
11473
11474 #define PREEMPT_ACTIVE 0x10000000
11475
11476 --- a/include/linux/skbuff.h
11477 +++ b/include/linux/skbuff.h
11478 @@ -1821,5 +1821,12 @@
11479 }
11480
11481 bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off);
11482 +
11483 +#ifdef CONFIG_XEN
11484 +int skb_checksum_setup(struct sk_buff *skb);
11485 +#else
11486 +static inline int skb_checksum_setup(struct sk_buff *skb) { return 0; }
11487 +#endif
11488 +
11489 #endif /* __KERNEL__ */
11490 #endif /* _LINUX_SKBUFF_H */
11491 --- a/include/xen/evtchn.h
11492 +++ b/include/xen/evtchn.h
11493 @@ -54,34 +54,34 @@
11494 */
11495 int bind_caller_port_to_irqhandler(
11496 unsigned int caller_port,
11497 - irqreturn_t (*handler)(int, void *, struct pt_regs *),
11498 + irq_handler_t handler,
11499 unsigned long irqflags,
11500 const char *devname,
11501 void *dev_id);
11502 int bind_listening_port_to_irqhandler(
11503 unsigned int remote_domain,
11504 - irqreturn_t (*handler)(int, void *, struct pt_regs *),
11505 + irq_handler_t handler,
11506 unsigned long irqflags,
11507 const char *devname,
11508 void *dev_id);
11509 int bind_interdomain_evtchn_to_irqhandler(
11510 unsigned int remote_domain,
11511 unsigned int remote_port,
11512 - irqreturn_t (*handler)(int, void *, struct pt_regs *),
11513 + irq_handler_t handler,
11514 unsigned long irqflags,
11515 const char *devname,
11516 void *dev_id);
11517 int bind_virq_to_irqhandler(
11518 unsigned int virq,
11519 unsigned int cpu,
11520 - irqreturn_t (*handler)(int, void *, struct pt_regs *),
11521 + irq_handler_t handler,
11522 unsigned long irqflags,
11523 const char *devname,
11524 void *dev_id);
11525 int bind_ipi_to_irqhandler(
11526 unsigned int ipi,
11527 unsigned int cpu,
11528 - irqreturn_t (*handler)(int, void *, struct pt_regs *),
11529 + irq_handler_t handler,
11530 unsigned long irqflags,
11531 const char *devname,
11532 void *dev_id);
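[Editorial note, not part of the patch: the binding functions above now take irq_handler_t, i.e. the 2.6.19 two-argument handler prototype without struct pt_regs *. A hedged sketch of a caller adapted to the new signature; the device name and handler body are made up:]

/* Illustrative only: handler with the post-2.6.19 prototype, bound to a
 * Xen virtual IRQ via the declaration above (irqreturn_t and IRQ_HANDLED
 * come from <linux/interrupt.h>). */
static irqreturn_t example_virq_handler(int irq, void *dev_id)
{
	/* acknowledge / process the event for dev_id */
	return IRQ_HANDLED;
}

static int example_bind(unsigned int virq, unsigned int cpu, void *dev)
{
	return bind_virq_to_irqhandler(virq, cpu, example_virq_handler,
				       0, "example-virq", dev);
}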
11533 --- a/include/xen/xencons.h
11534 +++ b/include/xen/xencons.h
11535 @@ -8,7 +8,7 @@
11536 void xencons_resume(void);
11537
11538 /* Interrupt work hooks. Receive data, or kick data out. */
11539 -void xencons_rx(char *buf, unsigned len, struct pt_regs *regs);
11540 +void xencons_rx(char *buf, unsigned len);
11541 void xencons_tx(void);
11542
11543 int xencons_ring_init(void);
11544 --- a/mm/mprotect.c
11545 +++ b/mm/mprotect.c
11546 @@ -86,7 +86,7 @@
11547 next = pmd_addr_end(addr, end);
11548 if (pmd_none_or_clear_bad(pmd))
11549 continue;
11550 - if (arch_change_pte_range(mm, pmd, addr, next, newprot))
11551 + if (arch_change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable))
11552 continue;
11553 change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable);
11554 } while (pmd++, addr = next, addr != end);
11555 --- a/net/core/dev.c
11556 +++ b/net/core/dev.c
11557 @@ -1611,15 +1611,14 @@
11558 }
11559 if ((skb->h.raw + skb->csum + 2) > skb->tail)
11560 goto out;
11561 - skb->ip_summed = CHECKSUM_HW;
11562 + skb->ip_summed = CHECKSUM_PARTIAL;
11563 skb->proto_csum_blank = 0;
11564 }
11565 return 0;
11566 out:
11567 return -EPROTO;
11568 }
11569 -#else
11570 -inline int skb_checksum_setup(struct sk_buff *skb) { return 0; }
11571 +EXPORT_SYMBOL(skb_checksum_setup);
11572 #endif
11573
11574 /**
11575 @@ -2115,7 +2114,7 @@
11576 case CHECKSUM_UNNECESSARY:
11577 skb->proto_data_valid = 1;
11578 break;
11579 - case CHECKSUM_HW:
11580 + case CHECKSUM_PARTIAL:
11581 /* XXX Implement me. */
11582 default:
11583 skb->proto_data_valid = 0;
11584 @@ -4648,7 +4647,6 @@
11585 EXPORT_SYMBOL(net_enable_timestamp);
11586 EXPORT_SYMBOL(net_disable_timestamp);
11587 EXPORT_SYMBOL(dev_get_flags);
11588 -EXPORT_SYMBOL(skb_checksum_setup);
11589
11590 #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
11591 EXPORT_SYMBOL(br_handle_frame_hook);