Magellan Linux

Contents of /trunk/kernel26-xen/patches-2.6.25-r1/1020-2.6.25-xen-patch-2.6.19.patch



Revision 612
Sat May 24 01:03:50 2008 UTC by niro
File size: 318811 bytes
-fixed patch again

1 From: www.kernel.org
2 Subject: Linux 2.6.19
3 Patch-mainline: 2.6.19
4
5 Automatically created from "patches.kernel.org/patch-2.6.19" by xen-port-patches.py
6
7 Acked-by: jbeulich@novell.com
8
9 ---
10 arch/x86/Kconfig | 1
11 arch/x86/ia32/ia32entry-xen.S | 9
12 arch/x86/kernel/Makefile | 5
13 arch/x86/kernel/apic_32-xen.c | 9
14 arch/x86/kernel/apic_64-xen.c | 20
15 arch/x86/kernel/cpu/common-xen.c | 20
16 arch/x86/kernel/e820_64-xen.c | 320 +++---
17 arch/x86/kernel/early_printk-xen.c | 20
18 arch/x86/kernel/entry_32-xen.S | 139 +-
19 arch/x86/kernel/entry_64-xen.S | 106 --
20 arch/x86/kernel/genapic_xen_64.c | 9
21 arch/x86/kernel/head64-xen.c | 44
22 arch/x86/kernel/head_32-xen.S | 2
23 arch/x86/kernel/head_64-xen.S | 5
24 arch/x86/kernel/io_apic_32-xen.c | 750 +++++++++------
25 arch/x86/kernel/io_apic_64-xen.c | 1250 +++++++++++---------------
26 arch/x86/kernel/ioport_64-xen.c | 1
27 arch/x86/kernel/irq_32-xen.c | 19
28 arch/x86/kernel/irq_64-xen.c | 35
29 arch/x86/kernel/ldt_32-xen.c | 2
30 arch/x86/kernel/microcode-xen.c | 85 +
31 arch/x86/kernel/mpparse_32-xen.c | 70 -
32 arch/x86/kernel/mpparse_64-xen.c | 313 +-----
33 arch/x86/kernel/pci-dma_32-xen.c | 16
34 arch/x86/kernel/pci-swiotlb_64-xen.c | 3
35 arch/x86/kernel/process_32-xen.c | 29
36 arch/x86/kernel/process_64-xen.c | 90 +
37 arch/x86/kernel/setup64-xen.c | 41
38 arch/x86/kernel/setup_32-xen.c | 430 +++-----
39 arch/x86/kernel/setup_64-xen.c | 271 +----
40 arch/x86/kernel/smp_32-xen.c | 75 +
41 arch/x86/kernel/smp_64-xen.c | 35
42 arch/x86/kernel/time_32-xen.c | 86 -
43 arch/x86/kernel/traps_32-xen.c | 238 +++-
44 arch/x86/kernel/traps_64-xen.c | 220 +++-
45 arch/x86/kernel/vsyscall_64-xen.c | 117 ++
46 arch/x86/mach-xen/setup.c | 6
47 arch/x86/mm/fault_32-xen.c | 29
48 arch/x86/mm/fault_64-xen.c | 34
49 arch/x86/mm/highmem_32-xen.c | 31
50 arch/x86/mm/hypervisor.c | 9
51 arch/x86/mm/init_32-xen.c | 89 +
52 arch/x86/mm/init_64-xen.c | 184 +--
53 arch/x86/mm/ioremap_32-xen.c | 10
54 arch/x86/mm/pageattr_64-xen.c | 24
55 arch/x86/mm/pgtable_32-xen.c | 31
56 arch/x86/pci/irq-xen.c | 38
57 drivers/char/tpm/tpm_xen.c | 5
58 drivers/pci/Kconfig | 2
59 drivers/xen/Kconfig | 3
60 drivers/xen/balloon/balloon.c | 2
61 drivers/xen/blkback/blkback.c | 2
62 drivers/xen/blkback/common.h | 2
63 drivers/xen/blkfront/blkfront.c | 4
64 drivers/xen/blktap/blktap.c | 2
65 drivers/xen/blktap/common.h | 2
66 drivers/xen/console/console.c | 10
67 drivers/xen/console/xencons_ring.c | 4
68 drivers/xen/core/evtchn.c | 50 -
69 drivers/xen/core/reboot.c | 3
70 drivers/xen/core/smpboot.c | 6
71 drivers/xen/fbfront/xenfb.c | 3
72 drivers/xen/fbfront/xenkbd.c | 2
73 drivers/xen/gntdev/gntdev.c | 11
74 drivers/xen/netback/accel.c | 2
75 drivers/xen/netback/common.h | 2
76 drivers/xen/netback/loopback.c | 2
77 drivers/xen/netback/netback.c | 6
78 drivers/xen/netfront/netfront.c | 8
79 drivers/xen/pciback/pciback.h | 2
80 drivers/xen/pciback/pciback_ops.c | 2
81 drivers/xen/pcifront/pci_op.c | 8
82 drivers/xen/privcmd/compat_privcmd.c | 1
83 drivers/xen/privcmd/privcmd.c | 2
84 drivers/xen/sfc_netback/accel_xenbus.c | 6
85 drivers/xen/sfc_netfront/accel.h | 6
86 drivers/xen/sfc_netfront/accel_msg.c | 6
87 drivers/xen/sfc_netfront/accel_tso.c | 2
88 drivers/xen/sfc_netfront/accel_vi.c | 4
89 drivers/xen/tpmback/common.h | 2
90 drivers/xen/tpmback/tpmback.c | 4
91 drivers/xen/xenbus/xenbus_comms.c | 2
92 drivers/xen/xenoprof/xenoprofile.c | 2
93 include/asm-generic/pgtable.h | 2
94 include/asm-x86/mach-xen/asm/desc_32.h | 127 +-
95 include/asm-x86/mach-xen/asm/dma-mapping_64.h | 7
96 include/asm-x86/mach-xen/asm/e820_64.h | 15
97 include/asm-x86/mach-xen/asm/fixmap_32.h | 5
98 include/asm-x86/mach-xen/asm/fixmap_64.h | 2
99 include/asm-x86/mach-xen/asm/hw_irq_32.h | 8
100 include/asm-x86/mach-xen/asm/hw_irq_64.h | 10
101 include/asm-x86/mach-xen/asm/io_32.h | 27
102 include/asm-x86/mach-xen/asm/io_64.h | 27
103 include/asm-x86/mach-xen/asm/pgtable-2level.h | 12
104 include/asm-x86/mach-xen/asm/pgtable-3level.h | 14
105 include/asm-x86/mach-xen/asm/pgtable_32.h | 143 +-
106 include/asm-x86/mach-xen/asm/pgtable_64.h | 86 +
107 include/asm-x86/mach-xen/asm/processor_32.h | 62 -
108 include/asm-x86/mach-xen/asm/processor_64.h | 2
109 include/asm-x86/mach-xen/asm/segment_32.h | 19
110 include/asm-x86/mach-xen/asm/smp_32.h | 25
111 include/asm-x86/mach-xen/asm/smp_64.h | 27
112 include/asm-x86/mach-xen/asm/system_32.h | 36
113 include/asm-x86/mach-xen/asm/system_64.h | 1
114 include/asm-x86/mach-xen/asm/tlbflush_32.h | 2
115 include/asm-x86/mach-xen/asm/tlbflush_64.h | 3
116 include/asm-x86/thread_info_64.h | 4
117 include/linux/skbuff.h | 7
118 include/xen/evtchn.h | 10
119 include/xen/xencons.h | 2
120 mm/mprotect.c | 2
121 net/core/dev.c | 8
122 112 files changed, 3102 insertions(+), 3145 deletions(-)
123
124 --- a/arch/x86/Kconfig
125 +++ b/arch/x86/Kconfig
126 @@ -390,6 +390,7 @@
127
128 menuconfig PARAVIRT_GUEST
129 bool "Paravirtualized guest support"
130 + depends on !X86_XEN && !X86_64_XEN
131 help
132 Say Y here to get to see options related to running Linux under
133 various hypervisors. This option alone does not add any kernel code.
134 --- a/arch/x86/ia32/ia32entry-xen.S
135 +++ b/arch/x86/ia32/ia32entry-xen.S
136 @@ -83,6 +83,7 @@
137 */
138 ENTRY(ia32_sysenter_target)
139 CFI_STARTPROC32 simple
140 + CFI_SIGNAL_FRAME
141 CFI_DEF_CFA rsp,SS+8-RIP+16
142 /*CFI_REL_OFFSET ss,SS-RIP+16*/
143 CFI_REL_OFFSET rsp,RSP-RIP+16
144 @@ -164,6 +165,7 @@
145 */
146 ENTRY(ia32_cstar_target)
147 CFI_STARTPROC32 simple
148 + CFI_SIGNAL_FRAME
149 CFI_DEF_CFA rsp,SS+8-RIP+16
150 /*CFI_REL_OFFSET ss,SS-RIP+16*/
151 CFI_REL_OFFSET rsp,RSP-RIP+16
152 @@ -243,6 +245,7 @@
153
154 ENTRY(ia32_syscall)
155 CFI_STARTPROC simple
156 + CFI_SIGNAL_FRAME
157 CFI_DEF_CFA rsp,SS+8-RIP+16
158 /*CFI_REL_OFFSET ss,SS-RIP+16*/
159 CFI_REL_OFFSET rsp,RSP-RIP+16
160 @@ -320,6 +323,7 @@
161 popq %r11
162 CFI_ENDPROC
163 CFI_STARTPROC32 simple
164 + CFI_SIGNAL_FRAME
165 CFI_DEF_CFA rsp,SS+8-ARGOFFSET
166 CFI_REL_OFFSET rax,RAX-ARGOFFSET
167 CFI_REL_OFFSET rcx,RCX-ARGOFFSET
168 @@ -653,8 +657,8 @@
169 .quad sys_readlinkat /* 305 */
170 .quad sys_fchmodat
171 .quad sys_faccessat
172 - .quad quiet_ni_syscall /* pselect6 for now */
173 - .quad quiet_ni_syscall /* ppoll for now */
174 + .quad compat_sys_pselect6
175 + .quad compat_sys_ppoll
176 .quad sys_unshare /* 310 */
177 .quad compat_sys_set_robust_list
178 .quad compat_sys_get_robust_list
179 @@ -663,4 +667,5 @@
180 .quad sys_tee
181 .quad compat_sys_vmsplice
182 .quad compat_sys_move_pages
183 + .quad sys_getcpu
184 ia32_syscall_end:
185 --- a/arch/x86/kernel/Makefile
186 +++ b/arch/x86/kernel/Makefile
187 @@ -91,7 +91,7 @@
188 ###
189 # 64 bit specific files
190 ifeq ($(CONFIG_X86_64),y)
191 - obj-y += genapic_64.o genapic_flat_64.o
192 + obj-$(CONFIG_X86_LOCAL_APIC) += genapic_64.o genapic_flat_64.o
193 obj-$(CONFIG_X86_XEN_GENAPIC) += genapic_64.o genapic_xen_64.o
194 obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
195 obj-$(CONFIG_AUDIT) += audit_64.o
196 @@ -104,5 +104,6 @@
197 pci-dma_64-$(CONFIG_XEN) += pci-dma_32.o
198 endif
199
200 -disabled-obj-$(CONFIG_XEN) := i8253.o i8259_$(BITS).o reboot.o smpboot_$(BITS).o tsc_$(BITS).o
201 +disabled-obj-$(CONFIG_XEN) := early-quirks.o i8253.o i8259_$(BITS).o reboot.o \
202 + smpboot_$(BITS).o tsc_$(BITS).o
203 %/head_$(BITS).o %/head_$(BITS).s: $(if $(CONFIG_XEN),EXTRA_AFLAGS,dummy) :=
204 --- a/arch/x86/kernel/apic_32-xen.c
205 +++ b/arch/x86/kernel/apic_32-xen.c
206 @@ -54,7 +54,6 @@
207 /*
208 * Knob to control our willingness to enable the local APIC.
209 */
210 -int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */
211
212 /*
213 * Debug level
214 @@ -102,7 +101,7 @@
215
216 #ifndef CONFIG_XEN
217 #ifndef CONFIG_SMP
218 -static void up_apic_timer_interrupt_call(struct pt_regs *regs)
219 +static void up_apic_timer_interrupt_call(void)
220 {
221 int cpu = smp_processor_id();
222
223 @@ -111,11 +110,11 @@
224 */
225 per_cpu(irq_stat, cpu).apic_timer_irqs++;
226
227 - smp_local_timer_interrupt(regs);
228 + smp_local_timer_interrupt();
229 }
230 #endif
231
232 -void smp_send_timer_broadcast_ipi(struct pt_regs *regs)
233 +void smp_send_timer_broadcast_ipi(void)
234 {
235 cpumask_t mask;
236
237 @@ -128,7 +127,7 @@
238 * We can directly call the apic timer interrupt handler
239 * in UP case. Minus all irq related functions
240 */
241 - up_apic_timer_interrupt_call(regs);
242 + up_apic_timer_interrupt_call();
243 #endif
244 }
245 }
246 --- a/arch/x86/kernel/apic_64-xen.c
247 +++ b/arch/x86/kernel/apic_64-xen.c
248 @@ -43,7 +43,7 @@
249 */
250 void ack_bad_irq(unsigned int irq)
251 {
252 - printk("unexpected IRQ trap at vector %02x\n", irq);
253 + printk("unexpected IRQ trap at irq %02x\n", irq);
254 /*
255 * Currently unexpected vectors happen only on SMP and APIC.
256 * We _must_ ack these because every local APIC has only N
257 @@ -62,19 +62,19 @@
258 return -EINVAL;
259 }
260
261 -void smp_local_timer_interrupt(struct pt_regs *regs)
262 +void smp_local_timer_interrupt(void)
263 {
264 - profile_tick(CPU_PROFILING, regs);
265 + profile_tick(CPU_PROFILING);
266 #ifndef CONFIG_XEN
267 #ifdef CONFIG_SMP
268 - update_process_times(user_mode(regs));
269 + update_process_times(user_mode(get_irq_regs()));
270 #endif
271 #endif
272 /*
273 * We take the 'long' return path, and there every subsystem
274 * grabs the appropriate locks (kernel lock/ irq lock).
275 *
276 - * we might want to decouple profiling from the 'long path',
277 + * We might want to decouple profiling from the 'long path',
278 * and do the profiling totally in assembly.
279 *
280 * Currently this isn't too much of an issue (performance wise),
281 @@ -92,6 +92,8 @@
282 */
283 void smp_apic_timer_interrupt(struct pt_regs *regs)
284 {
285 + struct pt_regs *old_regs = set_irq_regs(regs);
286 +
287 /*
288 * the NMI deadlock-detector uses this.
289 */
290 @@ -109,8 +111,9 @@
291 */
292 exit_idle();
293 irq_enter();
294 - smp_local_timer_interrupt(regs);
295 + smp_local_timer_interrupt();
296 irq_exit();
297 + set_irq_regs(old_regs);
298 }
299
300 /*
301 @@ -188,9 +191,8 @@
302 int __init APIC_init_uniprocessor (void)
303 {
304 #ifdef CONFIG_X86_IO_APIC
305 - if (smp_found_config)
306 - if (!skip_ioapic_setup && nr_ioapics)
307 - setup_IO_APIC();
308 + if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
309 + setup_IO_APIC();
310 #endif
311
312 return 1;
313 --- a/arch/x86/kernel/cpu/common-xen.c
314 +++ b/arch/x86/kernel/cpu/common-xen.c
315 @@ -43,7 +43,7 @@
316
317 extern int disable_pse;
318
319 -static void default_init(struct cpuinfo_x86 * c)
320 +static void __cpuinit default_init(struct cpuinfo_x86 * c)
321 {
322 /* Not much we can do here... */
323 /* Check if at least it has cpuid */
324 @@ -56,7 +56,7 @@
325 }
326 }
327
328 -static struct cpu_dev default_cpu = {
329 +static struct cpu_dev __cpuinitdata default_cpu = {
330 .c_init = default_init,
331 .c_vendor = "Unknown",
332 };
333 @@ -191,7 +191,16 @@
334
335 static int __init x86_fxsr_setup(char * s)
336 {
337 + /* Tell all the other CPU's to not use it... */
338 disable_x86_fxsr = 1;
339 +
340 + /*
341 + * ... and clear the bits early in the boot_cpu_data
342 + * so that the bootup process doesn't try to do this
343 + * either.
344 + */
345 + clear_bit(X86_FEATURE_FXSR, boot_cpu_data.x86_capability);
346 + clear_bit(X86_FEATURE_XMM, boot_cpu_data.x86_capability);
347 return 1;
348 }
349 __setup("nofxsr", x86_fxsr_setup);
350 @@ -272,7 +281,7 @@
351 }
352 }
353
354 -void __cpuinit generic_identify(struct cpuinfo_x86 * c)
355 +static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
356 {
357 u32 tfms, xlvl;
358 int ebx;
359 @@ -698,8 +707,7 @@
360 */
361 atomic_inc(&init_mm.mm_count);
362 current->active_mm = &init_mm;
363 - if (current->mm)
364 - BUG();
365 + BUG_ON(current->mm);
366 enter_lazy_tlb(&init_mm, current);
367
368 load_esp0(t, thread);
369 @@ -712,7 +720,7 @@
370 #endif
371
372 /* Clear %fs and %gs. */
373 - asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs");
374 + asm volatile ("movl %0, %%fs; movl %0, %%gs" : : "r" (0));
375
376 /* Clear all 6 debug registers: */
377 set_debugreg(0, 0);
378 --- a/arch/x86/kernel/e820_64-xen.c
379 +++ b/arch/x86/kernel/e820_64-xen.c
380 @@ -16,6 +16,7 @@
381 #include <linux/string.h>
382 #include <linux/kexec.h>
383 #include <linux/module.h>
384 +#include <linux/mm.h>
385
386 #include <asm/pgtable.h>
387 #include <asm/page.h>
388 @@ -25,6 +26,11 @@
389 #include <asm/sections.h>
390 #include <xen/interface/memory.h>
391
392 +struct e820map e820 __initdata;
393 +#ifdef CONFIG_XEN
394 +struct e820map machine_e820 __initdata;
395 +#endif
396 +
397 /*
398 * PFN of last memory page.
399 */
400 @@ -41,7 +47,7 @@
401 /*
402 * Last pfn which the user wants to use.
403 */
404 -unsigned long end_user_pfn = MAXMEM>>PAGE_SHIFT;
405 +static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
406
407 extern struct resource code_resource, data_resource;
408
409 @@ -53,13 +59,13 @@
410 #ifndef CONFIG_XEN
411 /* various gunk below that needed for SMP startup */
412 if (addr < 0x8000) {
413 - *addrp = 0x8000;
414 + *addrp = PAGE_ALIGN(0x8000);
415 return 1;
416 }
417
418 /* direct mapping tables of the kernel */
419 if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) {
420 - *addrp = table_end << PAGE_SHIFT;
421 + *addrp = PAGE_ALIGN(table_end << PAGE_SHIFT);
422 return 1;
423 }
424
425 @@ -67,23 +73,18 @@
426 #ifdef CONFIG_BLK_DEV_INITRD
427 if (LOADER_TYPE && INITRD_START && last >= INITRD_START &&
428 addr < INITRD_START+INITRD_SIZE) {
429 - *addrp = INITRD_START + INITRD_SIZE;
430 + *addrp = PAGE_ALIGN(INITRD_START + INITRD_SIZE);
431 return 1;
432 }
433 #endif
434 - /* kernel code + 640k memory hole (later should not be needed, but
435 - be paranoid for now) */
436 - if (last >= 640*1024 && addr < 1024*1024) {
437 - *addrp = 1024*1024;
438 - return 1;
439 - }
440 - if (last >= __pa_symbol(&_text) && last < __pa_symbol(&_end)) {
441 - *addrp = __pa_symbol(&_end);
442 + /* kernel code */
443 + if (last >= __pa_symbol(&_text) && addr < __pa_symbol(&_end)) {
444 + *addrp = PAGE_ALIGN(__pa_symbol(&_end));
445 return 1;
446 }
447
448 if (last >= ebda_addr && addr < ebda_addr + ebda_size) {
449 - *addrp = ebda_addr + ebda_size;
450 + *addrp = PAGE_ALIGN(ebda_addr + ebda_size);
451 return 1;
452 }
453
454 @@ -141,8 +142,6 @@
455 for (i = 0; i < e820.nr_map; i++) {
456 struct e820entry *ei = &e820.map[i];
457 #else
458 - extern struct e820map machine_e820;
459 -
460 if (!is_initial_xendomain())
461 return 0;
462 for (i = 0; i < machine_e820.nr_map; i++) {
463 @@ -184,7 +183,7 @@
464 continue;
465 while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size)
466 ;
467 - last = addr + size;
468 + last = PAGE_ALIGN(addr) + size;
469 if (last > ei->addr + ei->size)
470 continue;
471 if (last > end)
472 @@ -194,59 +193,14 @@
473 return -1UL;
474 }
475
476 -/*
477 - * Free bootmem based on the e820 table for a node.
478 - */
479 -void __init e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end)
480 -{
481 - int i;
482 - for (i = 0; i < e820.nr_map; i++) {
483 - struct e820entry *ei = &e820.map[i];
484 - unsigned long last, addr;
485 -
486 - if (ei->type != E820_RAM ||
487 - ei->addr+ei->size <= start ||
488 - ei->addr >= end)
489 - continue;
490 -
491 - addr = round_up(ei->addr, PAGE_SIZE);
492 - if (addr < start)
493 - addr = start;
494 -
495 - last = round_down(ei->addr + ei->size, PAGE_SIZE);
496 - if (last >= end)
497 - last = end;
498 -
499 - if (last > addr && last-addr >= PAGE_SIZE)
500 - free_bootmem_node(pgdat, addr, last-addr);
501 - }
502 -}
503 -
504 /*
505 * Find the highest page frame number we have available
506 */
507 unsigned long __init e820_end_of_ram(void)
508 {
509 - int i;
510 unsigned long end_pfn = 0;
511 + end_pfn = find_max_pfn_with_active_regions();
512
513 - for (i = 0; i < e820.nr_map; i++) {
514 - struct e820entry *ei = &e820.map[i];
515 - unsigned long start, end;
516 -
517 - start = round_up(ei->addr, PAGE_SIZE);
518 - end = round_down(ei->addr + ei->size, PAGE_SIZE);
519 - if (start >= end)
520 - continue;
521 - if (ei->type == E820_RAM) {
522 - if (end > end_pfn<<PAGE_SHIFT)
523 - end_pfn = end>>PAGE_SHIFT;
524 - } else {
525 - if (end > end_pfn_map<<PAGE_SHIFT)
526 - end_pfn_map = end>>PAGE_SHIFT;
527 - }
528 - }
529 -
530 if (end_pfn > end_pfn_map)
531 end_pfn_map = end_pfn;
532 if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
533 @@ -256,43 +210,10 @@
534 if (end_pfn > end_pfn_map)
535 end_pfn = end_pfn_map;
536
537 + printk("end_pfn_map = %lu\n", end_pfn_map);
538 return end_pfn;
539 }
540
541 -/*
542 - * Compute how much memory is missing in a range.
543 - * Unlike the other functions in this file the arguments are in page numbers.
544 - */
545 -unsigned long __init
546 -e820_hole_size(unsigned long start_pfn, unsigned long end_pfn)
547 -{
548 - unsigned long ram = 0;
549 - unsigned long start = start_pfn << PAGE_SHIFT;
550 - unsigned long end = end_pfn << PAGE_SHIFT;
551 - int i;
552 - for (i = 0; i < e820.nr_map; i++) {
553 - struct e820entry *ei = &e820.map[i];
554 - unsigned long last, addr;
555 -
556 - if (ei->type != E820_RAM ||
557 - ei->addr+ei->size <= start ||
558 - ei->addr >= end)
559 - continue;
560 -
561 - addr = round_up(ei->addr, PAGE_SIZE);
562 - if (addr < start)
563 - addr = start;
564 -
565 - last = round_down(ei->addr + ei->size, PAGE_SIZE);
566 - if (last >= end)
567 - last = end;
568 -
569 - if (last > addr)
570 - ram += last - addr;
571 - }
572 - return ((end - start) - ram) >> PAGE_SHIFT;
573 -}
574 -
575 /*
576 * Mark e820 reserved areas as busy for the resource manager.
577 */
578 @@ -333,6 +254,98 @@
579 }
580 }
581
582 +#ifndef CONFIG_XEN
583 +/* Mark pages corresponding to given address range as nosave */
584 +static void __init
585 +e820_mark_nosave_range(unsigned long start, unsigned long end)
586 +{
587 + unsigned long pfn, max_pfn;
588 +
589 + if (start >= end)
590 + return;
591 +
592 + printk("Nosave address range: %016lx - %016lx\n", start, end);
593 + max_pfn = end >> PAGE_SHIFT;
594 + for (pfn = start >> PAGE_SHIFT; pfn < max_pfn; pfn++)
595 + if (pfn_valid(pfn))
596 + SetPageNosave(pfn_to_page(pfn));
597 +}
598 +
599 +/*
600 + * Find the ranges of physical addresses that do not correspond to
601 + * e820 RAM areas and mark the corresponding pages as nosave for software
602 + * suspend and suspend to RAM.
603 + *
604 + * This function requires the e820 map to be sorted and without any
605 + * overlapping entries and assumes the first e820 area to be RAM.
606 + */
607 +void __init e820_mark_nosave_regions(void)
608 +{
609 + int i;
610 + unsigned long paddr;
611 +
612 + paddr = round_down(e820.map[0].addr + e820.map[0].size, PAGE_SIZE);
613 + for (i = 1; i < e820.nr_map; i++) {
614 + struct e820entry *ei = &e820.map[i];
615 +
616 + if (paddr < ei->addr)
617 + e820_mark_nosave_range(paddr,
618 + round_up(ei->addr, PAGE_SIZE));
619 +
620 + paddr = round_down(ei->addr + ei->size, PAGE_SIZE);
621 + if (ei->type != E820_RAM)
622 + e820_mark_nosave_range(round_up(ei->addr, PAGE_SIZE),
623 + paddr);
624 +
625 + if (paddr >= (end_pfn << PAGE_SHIFT))
626 + break;
627 + }
628 +}
629 +#endif
630 +
631 +/* Walk the e820 map and register active regions within a node */
632 +void __init
633 +e820_register_active_regions(int nid, unsigned long start_pfn,
634 + unsigned long end_pfn)
635 +{
636 + int i;
637 + unsigned long ei_startpfn, ei_endpfn;
638 + for (i = 0; i < e820.nr_map; i++) {
639 + struct e820entry *ei = &e820.map[i];
640 + ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT;
641 + ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE)
642 + >> PAGE_SHIFT;
643 +
644 + /* Skip map entries smaller than a page */
645 + if (ei_startpfn >= ei_endpfn)
646 + continue;
647 +
648 + /* Check if end_pfn_map should be updated */
649 + if (ei->type != E820_RAM && ei_endpfn > end_pfn_map)
650 + end_pfn_map = ei_endpfn;
651 +
652 + /* Skip if map is outside the node */
653 + if (ei->type != E820_RAM ||
654 + ei_endpfn <= start_pfn ||
655 + ei_startpfn >= end_pfn)
656 + continue;
657 +
658 + /* Check for overlaps */
659 + if (ei_startpfn < start_pfn)
660 + ei_startpfn = start_pfn;
661 + if (ei_endpfn > end_pfn)
662 + ei_endpfn = end_pfn;
663 +
664 + /* Obey end_user_pfn to save on memmap */
665 + if (ei_startpfn >= end_user_pfn)
666 + continue;
667 + if (ei_endpfn > end_user_pfn)
668 + ei_endpfn = end_user_pfn;
669 +
670 + add_active_range(nid, ei_startpfn, ei_endpfn);
671 + }
672 +}
673 +
674 /*
675 * Add a memory region to the kernel e820 map.
676 */
677 @@ -553,13 +566,6 @@
678 * If we're lucky and live on a modern system, the setup code
679 * will have given us a memory map that we can use to properly
680 * set up memory. If we aren't, we'll fake a memory map.
681 - *
682 - * We check to see that the memory map contains at least 2 elements
683 - * before we'll use it, because the detection code in setup.S may
684 - * not be perfect and most every PC known to man has two memory
685 - * regions: one from 0 to 640k, and one from 1mb up. (The IBM
686 - * thinkpad 560x, for example, does not cooperate with the memory
687 - * detection code.)
688 */
689 static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
690 {
691 @@ -581,37 +587,20 @@
692 if (start > end)
693 return -1;
694
695 -#ifndef CONFIG_XEN
696 - /*
697 - * Some BIOSes claim RAM in the 640k - 1M region.
698 - * Not right. Fix it up.
699 - *
700 - * This should be removed on Hammer which is supposed to not
701 - * have non e820 covered ISA mappings there, but I had some strange
702 - * problems so it stays for now. -AK
703 - */
704 - if (type == E820_RAM) {
705 - if (start < 0x100000ULL && end > 0xA0000ULL) {
706 - if (start < 0xA0000ULL)
707 - add_memory_region(start, 0xA0000ULL-start, type);
708 - if (end <= 0x100000ULL)
709 - continue;
710 - start = 0x100000ULL;
711 - size = end - start;
712 - }
713 - }
714 -#endif
715 -
716 add_memory_region(start, size, type);
717 } while (biosmap++,--nr_map);
718 return 0;
719 }
720
721 +void early_panic(char *msg)
722 +{
723 + early_printk(msg);
724 + panic(msg);
725 +}
726 +
727 #ifndef CONFIG_XEN
728 void __init setup_memory_region(void)
729 {
730 - char *who = "BIOS-e820";
731 -
732 /*
733 * Try to copy the BIOS-supplied E820-map.
734 *
735 @@ -619,24 +608,10 @@
736 * the next section from 1mb->appropriate_mem_k
737 */
738 sanitize_e820_map(E820_MAP, &E820_MAP_NR);
739 - if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) {
740 - unsigned long mem_size;
741 -
742 - /* compare results from other methods and take the greater */
743 - if (ALT_MEM_K < EXT_MEM_K) {
744 - mem_size = EXT_MEM_K;
745 - who = "BIOS-88";
746 - } else {
747 - mem_size = ALT_MEM_K;
748 - who = "BIOS-e801";
749 - }
750 -
751 - e820.nr_map = 0;
752 - add_memory_region(0, LOWMEMSIZE(), E820_RAM);
753 - add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
754 - }
755 + if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0)
756 + early_panic("Cannot find a valid memory map");
757 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
758 - e820_print_map(who);
759 + e820_print_map("BIOS-e820");
760 }
761
762 #else /* CONFIG_XEN */
763 @@ -668,20 +643,23 @@
764
765 sanitize_e820_map(map, (char *)&memmap.nr_entries);
766
767 - BUG_ON(copy_e820_map(map, (char)memmap.nr_entries) < 0);
768 + if (copy_e820_map(map, (char)memmap.nr_entries) < 0)
769 + early_panic("Cannot find a valid memory map");
770
771 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
772 e820_print_map("Xen");
773 }
774 #endif
775
776 -void __init parse_memopt(char *p, char **from)
777 -{
778 +static int __init parse_memopt(char *p)
779 +{
780 int i;
781 unsigned long current_end;
782 unsigned long end;
783
784 - end_user_pfn = memparse(p, from);
785 + if (!p)
786 + return -EINVAL;
787 + end_user_pfn = memparse(p, &p);
788 end_user_pfn >>= PAGE_SHIFT;
789
790 end = end_user_pfn<<PAGE_SHIFT;
791 @@ -698,27 +676,61 @@
792 else
793 add_memory_region(current_end, end - current_end, E820_RAM);
794 }
795 +
796 + return 0;
797 }
798 +early_param("mem", parse_memopt);
799 +
800 +static int userdef __initdata;
801
802 -void __init parse_memmapopt(char *p, char **from)
803 +static int __init parse_memmap_opt(char *p)
804 {
805 + char *oldp;
806 unsigned long long start_at, mem_size;
807
808 - mem_size = memparse(p, from);
809 - p = *from;
810 + if (!strcmp(p, "exactmap")) {
811 +#ifdef CONFIG_CRASH_DUMP
812 + /* If we are doing a crash dump, we
813 + * still need to know the real mem
814 + * size before original memory map is
815 + * reset.
816 + */
817 + e820_register_active_regions(0, 0, -1UL);
818 + saved_max_pfn = e820_end_of_ram();
819 + remove_all_active_ranges();
820 +#endif
821 + end_pfn_map = 0;
822 + e820.nr_map = 0;
823 + userdef = 1;
824 + return 0;
825 + }
826 +
827 + oldp = p;
828 + mem_size = memparse(p, &p);
829 + if (p == oldp)
830 + return -EINVAL;
831 if (*p == '@') {
832 - start_at = memparse(p+1, from);
833 + start_at = memparse(p+1, &p);
834 add_memory_region(start_at, mem_size, E820_RAM);
835 } else if (*p == '#') {
836 - start_at = memparse(p+1, from);
837 + start_at = memparse(p+1, &p);
838 add_memory_region(start_at, mem_size, E820_ACPI);
839 } else if (*p == '$') {
840 - start_at = memparse(p+1, from);
841 + start_at = memparse(p+1, &p);
842 add_memory_region(start_at, mem_size, E820_RESERVED);
843 } else {
844 end_user_pfn = (mem_size >> PAGE_SHIFT);
845 }
846 - p = *from;
847 + return *p == '\0' ? 0 : -EINVAL;
848 +}
849 +early_param("memmap", parse_memmap_opt);
850 +
851 +void finish_e820_parsing(void)
852 +{
853 + if (userdef) {
854 + printk(KERN_INFO "user-defined physical RAM map:\n");
855 + e820_print_map("user");
856 + }
857 }
858
859 unsigned long pci_mem_start = 0xaeedbabe;
860 --- a/arch/x86/kernel/early_printk-xen.c
861 +++ b/arch/x86/kernel/early_printk-xen.c
862 @@ -244,20 +244,16 @@
863
864 static int __initdata keep_early;
865
866 -int __init setup_early_printk(char *opt)
867 +static int __init setup_early_printk(char *buf)
868 {
869 - char *space;
870 - char buf[256];
871 + if (!buf)
872 + return 0;
873
874 if (early_console_initialized)
875 - return 1;
876 -
877 - strlcpy(buf,opt,sizeof(buf));
878 - space = strchr(buf, ' ');
879 - if (space)
880 - *space = 0;
881 + return 0;
882 + early_console_initialized = 1;
883
884 - if (strstr(buf,"keep"))
885 + if (strstr(buf, "keep"))
886 keep_early = 1;
887
888 if (!strncmp(buf, "serial", 6)) {
889 @@ -281,11 +277,12 @@
890 early_console = &simnow_console;
891 keep_early = 1;
892 }
893 - early_console_initialized = 1;
894 register_console(early_console);
895 return 0;
896 }
897
898 +early_param("earlyprintk", setup_early_printk);
899 +
900 void __init disable_early_printk(void)
901 {
902 if (!early_console_initialized || !early_console)
903 @@ -299,4 +296,3 @@
904 }
905 }
906
907 -__setup("earlyprintk=", setup_early_printk);
908 --- a/arch/x86/kernel/entry_32-xen.S
909 +++ b/arch/x86/kernel/entry_32-xen.S
910 @@ -80,8 +80,12 @@
911 NMI_MASK = 0x80000000
912
913 #ifndef CONFIG_XEN
914 -#define DISABLE_INTERRUPTS cli
915 -#define ENABLE_INTERRUPTS sti
916 +/* These are replaces for paravirtualization */
917 +#define DISABLE_INTERRUPTS cli
918 +#define ENABLE_INTERRUPTS sti
919 +#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit
920 +#define INTERRUPT_RETURN iret
921 +#define GET_CR0_INTO_EAX movl %cr0, %eax
922 #else
923 /* Offsets into shared_info_t. */
924 #define evtchn_upcall_pending /* 0 */
925 @@ -99,15 +103,29 @@
926
927 #define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(%esi)
928 #define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(%esi)
929 +#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(%esi)
930 #define DISABLE_INTERRUPTS GET_VCPU_INFO ; \
931 __DISABLE_INTERRUPTS
932 #define ENABLE_INTERRUPTS GET_VCPU_INFO ; \
933 __ENABLE_INTERRUPTS
934 -#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(%esi)
935 +#define ENABLE_INTERRUPTS_SYSEXIT __ENABLE_INTERRUPTS ; \
936 +sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ ; \
937 + __TEST_PENDING ; \
938 + jnz 14f # process more events if necessary... ; \
939 + movl ESI(%esp), %esi ; \
940 + sysexit ; \
941 +14: __DISABLE_INTERRUPTS ; \
942 + TRACE_IRQS_OFF ; \
943 +sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/ ; \
944 + push %esp ; \
945 + call evtchn_do_upcall ; \
946 + add $4,%esp ; \
947 + jmp ret_from_intr
948 +#define INTERRUPT_RETURN iret
949 #endif
950
951 #ifdef CONFIG_PREEMPT
952 -#define preempt_stop cli; TRACE_IRQS_OFF
953 +#define preempt_stop DISABLE_INTERRUPTS; TRACE_IRQS_OFF
954 #else
955 #define preempt_stop
956 #define resume_kernel restore_nocheck
957 @@ -206,18 +224,21 @@
958
959 #define RING0_INT_FRAME \
960 CFI_STARTPROC simple;\
961 + CFI_SIGNAL_FRAME;\
962 CFI_DEF_CFA esp, 3*4;\
963 /*CFI_OFFSET cs, -2*4;*/\
964 CFI_OFFSET eip, -3*4
965
966 #define RING0_EC_FRAME \
967 CFI_STARTPROC simple;\
968 + CFI_SIGNAL_FRAME;\
969 CFI_DEF_CFA esp, 4*4;\
970 /*CFI_OFFSET cs, -2*4;*/\
971 CFI_OFFSET eip, -3*4
972
973 #define RING0_PTREGS_FRAME \
974 CFI_STARTPROC simple;\
975 + CFI_SIGNAL_FRAME;\
976 CFI_DEF_CFA esp, OLDESP-EBX;\
977 /*CFI_OFFSET cs, CS-OLDESP;*/\
978 CFI_OFFSET eip, EIP-OLDESP;\
979 @@ -263,8 +284,9 @@
980 check_userspace:
981 movl EFLAGS(%esp), %eax # mix EFLAGS and CS
982 movb CS(%esp), %al
983 - testl $(VM_MASK | 2), %eax
984 - jz resume_kernel
985 + andl $(VM_MASK | SEGMENT_RPL_MASK), %eax
986 + cmpl $USER_RPL, %eax
987 + jb resume_kernel # not returning to v8086 or userspace
988 ENTRY(resume_userspace)
989 DISABLE_INTERRUPTS # make sure we don't miss an interrupt
990 # setting need_resched or sigpending
991 @@ -277,7 +299,7 @@
992
993 #ifdef CONFIG_PREEMPT
994 ENTRY(resume_kernel)
995 - cli
996 + DISABLE_INTERRUPTS
997 cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
998 jnz restore_nocheck
999 need_resched:
1000 @@ -297,6 +319,7 @@
1001 # sysenter call handler stub
1002 ENTRY(sysenter_entry)
1003 CFI_STARTPROC simple
1004 + CFI_SIGNAL_FRAME
1005 CFI_DEF_CFA esp, 0
1006 CFI_REGISTER esp, ebp
1007 movl SYSENTER_stack_esp0(%esp),%esp
1008 @@ -305,7 +328,7 @@
1009 * No need to follow this irqs on/off section: the syscall
1010 * disabled irqs and here we enable it straight after entry:
1011 */
1012 - sti
1013 + ENABLE_INTERRUPTS
1014 pushl $(__USER_DS)
1015 CFI_ADJUST_CFA_OFFSET 4
1016 /*CFI_REL_OFFSET ss, 0*/
1017 @@ -359,26 +382,8 @@
1018 movl EIP(%esp), %edx
1019 movl OLDESP(%esp), %ecx
1020 xorl %ebp,%ebp
1021 -#ifdef CONFIG_XEN
1022 TRACE_IRQS_ON
1023 - __ENABLE_INTERRUPTS
1024 -sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/
1025 - __TEST_PENDING
1026 - jnz 14f # process more events if necessary...
1027 - movl ESI(%esp), %esi
1028 - sysexit
1029 -14: __DISABLE_INTERRUPTS
1030 - TRACE_IRQS_OFF
1031 -sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/
1032 - push %esp
1033 - call evtchn_do_upcall
1034 - add $4,%esp
1035 - jmp ret_from_intr
1036 -#else
1037 - TRACE_IRQS_ON
1038 - sti
1039 - sysexit
1040 -#endif /* !CONFIG_XEN */
1041 + ENABLE_INTERRUPTS_SYSEXIT
1042 CFI_ENDPROC
1043
1044 # pv sysenter call handler stub
1045 @@ -444,8 +449,8 @@
1046 # See comments in process.c:copy_thread() for details.
1047 movb OLDSS(%esp), %ah
1048 movb CS(%esp), %al
1049 - andl $(VM_MASK | (4 << 8) | 3), %eax
1050 - cmpl $((4 << 8) | 3), %eax
1051 + andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
1052 + cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
1053 CFI_REMEMBER_STATE
1054 je ldt_ss # returning to user-space with LDT SS
1055 restore_nocheck:
1056 @@ -467,12 +472,11 @@
1057 RESTORE_REGS
1058 addl $4, %esp
1059 CFI_ADJUST_CFA_OFFSET -4
1060 -1: iret
1061 +1: INTERRUPT_RETURN
1062 .section .fixup,"ax"
1063 iret_exc:
1064 #ifndef CONFIG_XEN
1065 - TRACE_IRQS_ON
1066 - sti
1067 + ENABLE_INTERRUPTS
1068 #endif
1069 pushl $0 # no error code
1070 pushl $do_iret_error
1071 @@ -498,7 +502,7 @@
1072 * dosemu and wine happy. */
1073 subl $8, %esp # reserve space for switch16 pointer
1074 CFI_ADJUST_CFA_OFFSET 8
1075 - cli
1076 + DISABLE_INTERRUPTS
1077 TRACE_IRQS_OFF
1078 movl %esp, %eax
1079 /* Set up the 16bit stack frame with switch32 pointer on top,
1080 @@ -508,7 +512,7 @@
1081 TRACE_IRQS_IRET
1082 RESTORE_REGS
1083 lss 20+4(%esp), %esp # switch to 16bit stack
1084 -1: iret
1085 +1: INTERRUPT_RETURN
1086 .section __ex_table,"a"
1087 .align 4
1088 .long 1b,iret_exc
1089 @@ -524,7 +528,7 @@
1090 RESTORE_REGS
1091 addl $4, %esp
1092 CFI_ADJUST_CFA_OFFSET -4
1093 -1: iret
1094 +1: INTERRUPT_RETURN
1095 .section __ex_table,"a"
1096 .align 4
1097 .long 1b,iret_exc
1098 @@ -713,11 +717,9 @@
1099 #define UNWIND_ESPFIX_STACK
1100 #endif
1101
1102 -ENTRY(divide_error)
1103 - RING0_INT_FRAME
1104 - pushl $0 # no error code
1105 - CFI_ADJUST_CFA_OFFSET 4
1106 - pushl $do_divide_error
1107 +KPROBE_ENTRY(page_fault)
1108 + RING0_EC_FRAME
1109 + pushl $do_page_fault
1110 CFI_ADJUST_CFA_OFFSET 4
1111 ALIGN
1112 error_code:
1113 @@ -767,6 +769,7 @@
1114 call *%edi
1115 jmp ret_from_exception
1116 CFI_ENDPROC
1117 +KPROBE_END(page_fault)
1118
1119 #ifdef CONFIG_XEN
1120 # A note on the "critical region" in our callback handler.
1121 @@ -926,7 +929,7 @@
1122 CFI_ADJUST_CFA_OFFSET 4
1123 SAVE_ALL
1124 #ifndef CONFIG_XEN
1125 - movl %cr0, %eax
1126 + GET_CR0_INTO_EAX
1127 testl $0x4, %eax # EM (math emulation bit)
1128 je device_available_emulate
1129 pushl $0 # temporary storage for ORIG_EIP
1130 @@ -961,9 +964,15 @@
1131 jne ok; \
1132 label: \
1133 movl SYSENTER_stack_esp0+offset(%esp),%esp; \
1134 + CFI_DEF_CFA esp, 0; \
1135 + CFI_UNDEFINED eip; \
1136 pushfl; \
1137 + CFI_ADJUST_CFA_OFFSET 4; \
1138 pushl $__KERNEL_CS; \
1139 - pushl $sysenter_past_esp
1140 + CFI_ADJUST_CFA_OFFSET 4; \
1141 + pushl $sysenter_past_esp; \
1142 + CFI_ADJUST_CFA_OFFSET 4; \
1143 + CFI_REL_OFFSET eip, 0
1144 #endif /* CONFIG_XEN */
1145
1146 KPROBE_ENTRY(debug)
1147 @@ -982,7 +991,8 @@
1148 call do_debug
1149 jmp ret_from_exception
1150 CFI_ENDPROC
1151 - .previous .text
1152 +KPROBE_END(debug)
1153 +
1154 #ifndef CONFIG_XEN
1155 /*
1156 * NMI is doubly nasty. It can happen _while_ we're handling
1157 @@ -992,7 +1002,7 @@
1158 * check whether we got an NMI on the debug path where the debug
1159 * fault happened on the sysenter path.
1160 */
1161 -ENTRY(nmi)
1162 +KPROBE_ENTRY(nmi)
1163 RING0_INT_FRAME
1164 pushl %eax
1165 CFI_ADJUST_CFA_OFFSET 4
1166 @@ -1017,6 +1027,7 @@
1167 cmpl $sysenter_entry,12(%esp)
1168 je nmi_debug_stack_check
1169 nmi_stack_correct:
1170 + /* We have a RING0_INT_FRAME here */
1171 pushl %eax
1172 CFI_ADJUST_CFA_OFFSET 4
1173 SAVE_ALL
1174 @@ -1027,9 +1038,12 @@
1175 CFI_ENDPROC
1176
1177 nmi_stack_fixup:
1178 + RING0_INT_FRAME
1179 FIX_STACK(12,nmi_stack_correct, 1)
1180 jmp nmi_stack_correct
1181 +
1182 nmi_debug_stack_check:
1183 + /* We have a RING0_INT_FRAME here */
1184 cmpw $__KERNEL_CS,16(%esp)
1185 jne nmi_stack_correct
1186 cmpl $debug,(%esp)
1187 @@ -1040,8 +1054,10 @@
1188 jmp nmi_stack_correct
1189
1190 nmi_16bit_stack:
1191 - RING0_INT_FRAME
1192 - /* create the pointer to lss back */
1193 + /* We have a RING0_INT_FRAME here.
1194 + *
1195 + * create the pointer to lss back
1196 + */
1197 pushl %ss
1198 CFI_ADJUST_CFA_OFFSET 4
1199 pushl %esp
1200 @@ -1062,14 +1078,14 @@
1201 call do_nmi
1202 RESTORE_REGS
1203 lss 12+4(%esp), %esp # back to 16bit stack
1204 -1: iret
1205 +1: INTERRUPT_RETURN
1206 CFI_ENDPROC
1207 .section __ex_table,"a"
1208 .align 4
1209 .long 1b,iret_exc
1210 .previous
1211 #else
1212 -ENTRY(nmi)
1213 +KPROBE_ENTRY(nmi)
1214 RING0_INT_FRAME
1215 pushl %eax
1216 CFI_ADJUST_CFA_OFFSET 4
1217 @@ -1081,6 +1097,7 @@
1218 jmp restore_all
1219 CFI_ENDPROC
1220 #endif
1221 +KPROBE_END(nmi)
1222
1223 KPROBE_ENTRY(int3)
1224 RING0_INT_FRAME
1225 @@ -1092,7 +1109,7 @@
1226 call do_int3
1227 jmp ret_from_exception
1228 CFI_ENDPROC
1229 - .previous .text
1230 +KPROBE_END(int3)
1231
1232 ENTRY(overflow)
1233 RING0_INT_FRAME
1234 @@ -1157,7 +1174,7 @@
1235 CFI_ADJUST_CFA_OFFSET 4
1236 jmp error_code
1237 CFI_ENDPROC
1238 - .previous .text
1239 +KPROBE_END(general_protection)
1240
1241 ENTRY(alignment_check)
1242 RING0_EC_FRAME
1243 @@ -1166,13 +1183,14 @@
1244 jmp error_code
1245 CFI_ENDPROC
1246
1247 -KPROBE_ENTRY(page_fault)
1248 - RING0_EC_FRAME
1249 - pushl $do_page_fault
1250 +ENTRY(divide_error)
1251 + RING0_INT_FRAME
1252 + pushl $0 # no error code
1253 + CFI_ADJUST_CFA_OFFSET 4
1254 + pushl $do_divide_error
1255 CFI_ADJUST_CFA_OFFSET 4
1256 jmp error_code
1257 CFI_ENDPROC
1258 - .previous .text
1259
1260 #ifdef CONFIG_X86_MCE
1261 ENTRY(machine_check)
1262 @@ -1234,6 +1252,19 @@
1263 jmp error_code
1264 CFI_ENDPROC
1265
1266 +ENTRY(kernel_thread_helper)
1267 + pushl $0 # fake return address for unwinder
1268 + CFI_STARTPROC
1269 + movl %edx,%eax
1270 + push %edx
1271 + CFI_ADJUST_CFA_OFFSET 4
1272 + call *%ebx
1273 + push %eax
1274 + CFI_ADJUST_CFA_OFFSET 4
1275 + call do_exit
1276 + CFI_ENDPROC
1277 +ENDPROC(kernel_thread_helper)
1278 +
1279 .section .rodata,"a"
1280 #include "syscall_table.S"
1281
1282 --- a/arch/x86/kernel/entry_64-xen.S
1283 +++ b/arch/x86/kernel/entry_64-xen.S
1284 @@ -26,15 +23,25 @@
1285 * at the top of the kernel process stack.
1286 * - partial stack frame: partially saved registers upto R11.
1287 * - full stack frame: Like partial stack frame, but all register saved.
1288 - *
1289 - * TODO:
1290 - * - schedule it carefully for the final hardware.
1291 + *
1292 + * Some macro usage:
1293 + * - CFI macros are used to generate dwarf2 unwind information for better
1294 + * backtraces. They don't change any code.
1295 + * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
1296 + * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
1297 + * There are unfortunately lots of special cases where some registers
1298 + * not touched. The macro is a big mess that should be cleaned up.
1299 + * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
1300 + * Gives a full stack frame.
1301 + * - ENTRY/END Define functions in the symbol table.
1302 + * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
1303 + * frame that is otherwise undefined after a SYSCALL
1304 + * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
1305 + * - errorentry/paranoidentry/zeroentry - Define exception entry points.
1306 */
1307
1308 -#define ASSEMBLY 1
1309 #include <linux/linkage.h>
1310 #include <asm/segment.h>
1311 -#include <asm/smp.h>
1312 #include <asm/cache.h>
1313 #include <asm/errno.h>
1314 #include <asm/dwarf2.h>
1315 @@ -117,6 +124,7 @@
1316 .macro CFI_DEFAULT_STACK start=1,adj=0
1317 .if \start
1318 CFI_STARTPROC simple
1319 + CFI_SIGNAL_FRAME
1320 CFI_DEF_CFA rsp,SS+8-(\adj*ARGOFFSET)
1321 .else
1322 CFI_DEF_CFA_OFFSET SS+8-(\adj*ARGOFFSET)
1323 @@ -207,6 +215,7 @@
1324 */
1325 .macro _frame ref
1326 CFI_STARTPROC simple
1327 + CFI_SIGNAL_FRAME
1328 CFI_DEF_CFA rsp,SS+8-\ref
1329 /*CFI_REL_OFFSET ss,SS-\ref*/
1330 CFI_REL_OFFSET rsp,RSP-\ref
1331 @@ -334,6 +343,8 @@
1332 LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
1333 RESTORE_REST
1334 cmpq $__NR_syscall_max,%rax
1335 + movq $-ENOSYS,%rcx
1336 + cmova %rcx,%rax
1337 ja 1f
1338 movq %r10,%rcx /* fixup for C */
1339 call *sys_call_table(,%rax,8)
1340 @@ -349,6 +360,7 @@
1341 */
1342 ENTRY(int_ret_from_sys_call)
1343 CFI_STARTPROC simple
1344 + CFI_SIGNAL_FRAME
1345 CFI_DEF_CFA rsp,SS+8-ARGOFFSET
1346 /*CFI_REL_OFFSET ss,SS-ARGOFFSET*/
1347 CFI_REL_OFFSET rsp,RSP-ARGOFFSET
1348 @@ -583,8 +595,7 @@
1349 #ifdef CONFIG_PREEMPT
1350 /* Returning to kernel space. Check if we need preemption */
1351 /* rcx: threadinfo. interrupts off. */
1352 - .p2align
1353 -retint_kernel:
1354 +ENTRY(retint_kernel)
1355 cmpl $0,threadinfo_preempt_count(%rcx)
1356 jnz retint_restore_args
1357 bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx)
1358 @@ -644,7 +655,6 @@
1359 END(call_function_interrupt)
1360 #endif
1361
1362 -#ifdef CONFIG_X86_LOCAL_APIC
1363 ENTRY(apic_timer_interrupt)
1364 apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
1365 END(apic_timer_interrupt)
1366 @@ -656,7 +666,6 @@
1367 ENTRY(spurious_interrupt)
1368 apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
1369 END(spurious_interrupt)
1370 -#endif
1371 #endif /* !CONFIG_XEN */
1372
1373 /*
1374 @@ -755,7 +764,9 @@
1375 testl $3,CS(%rsp)
1376 jnz paranoid_userspace\trace
1377 paranoid_swapgs\trace:
1378 + .if \trace
1379 TRACE_IRQS_IRETQ 0
1380 + .endif
1381 swapgs
1382 paranoid_restore\trace:
1383 RESTORE_ALL 8
1384 @@ -802,7 +813,7 @@
1385 * Exception entry point. This expects an error code/orig_rax on the stack
1386 * and the exception handler in %rax.
1387 */
1388 -ENTRY(error_entry)
1389 +KPROBE_ENTRY(error_entry)
1390 _frame RDI
1391 CFI_REL_OFFSET rax,0
1392 /* rdi slot contains rax, oldrax contains error code */
1393 @@ -896,7 +907,7 @@
1394 jmp error_sti
1395 #endif
1396 CFI_ENDPROC
1397 -END(error_entry)
1398 +KPROBE_END(error_entry)
1399
1400 ENTRY(hypervisor_callback)
1401 zeroentry do_hypervisor_callback
1402 @@ -936,26 +947,6 @@
1403 CFI_ENDPROC
1404 END(do_hypervisor_callback)
1405
1406 -#ifdef CONFIG_X86_LOCAL_APIC
1407 -KPROBE_ENTRY(nmi)
1408 - zeroentry do_nmi_callback
1409 -ENTRY(do_nmi_callback)
1410 - CFI_STARTPROC
1411 - addq $8, %rsp
1412 - CFI_ENDPROC
1413 - CFI_DEFAULT_STACK
1414 - call do_nmi
1415 - orl $NMI_MASK,EFLAGS(%rsp)
1416 - RESTORE_REST
1417 - XEN_BLOCK_EVENTS(%rsi)
1418 - TRACE_IRQS_OFF
1419 - GET_THREAD_INFO(%rcx)
1420 - jmp retint_restore_args
1421 - CFI_ENDPROC
1422 - .previous .text
1423 -END(nmi)
1424 -#endif
1425 -
1426 ALIGN
1427 restore_all_enable_events:
1428 CFI_DEFAULT_STACK adj=1
1429 @@ -1121,7 +1112,7 @@
1430 * do_sys_execve asm fallback arguments:
1431 * rdi: name, rsi: argv, rdx: envp, fake frame on the stack
1432 */
1433 -ENTRY(execve)
1434 +ENTRY(kernel_execve)
1435 CFI_STARTPROC
1436 FAKE_STACK_FRAME $0
1437 SAVE_ALL
1438 @@ -1135,12 +1126,11 @@
1439 UNFAKE_STACK_FRAME
1440 ret
1441 CFI_ENDPROC
1442 -ENDPROC(execve)
1443 +ENDPROC(kernel_execve)
1444
1445 KPROBE_ENTRY(page_fault)
1446 errorentry do_page_fault
1447 -END(page_fault)
1448 - .previous .text
1449 +KPROBE_END(page_fault)
1450
1451 ENTRY(coprocessor_error)
1452 zeroentry do_coprocessor_error
1453 @@ -1162,25 +1152,25 @@
1454 zeroentry do_debug
1455 /* paranoidexit
1456 CFI_ENDPROC */
1457 -END(debug)
1458 - .previous .text
1459 +KPROBE_END(debug)
1460
1461 -#if 0
1462 - /* runs on exception stack */
1463 KPROBE_ENTRY(nmi)
1464 - INTR_FRAME
1465 - pushq $-1
1466 - CFI_ADJUST_CFA_OFFSET 8
1467 - paranoidentry do_nmi, 0, 0
1468 -#ifdef CONFIG_TRACE_IRQFLAGS
1469 - paranoidexit 0
1470 -#else
1471 - jmp paranoid_exit1
1472 - CFI_ENDPROC
1473 -#endif
1474 -END(nmi)
1475 - .previous .text
1476 -#endif
1477 + zeroentry do_nmi_callback
1478 +KPROBE_END(nmi)
1479 +do_nmi_callback:
1480 + CFI_STARTPROC
1481 + addq $8, %rsp
1482 + CFI_ENDPROC
1483 + CFI_DEFAULT_STACK
1484 + call do_nmi
1485 + orl $NMI_MASK,EFLAGS(%rsp)
1486 + RESTORE_REST
1487 + XEN_BLOCK_EVENTS(%rsi)
1488 + TRACE_IRQS_OFF
1489 + GET_THREAD_INFO(%rcx)
1490 + jmp retint_restore_args
1491 + CFI_ENDPROC
1492 +END(do_nmi_callback)
1493
1494 KPROBE_ENTRY(int3)
1495 /* INTR_FRAME
1496 @@ -1189,8 +1179,7 @@
1497 zeroentry do_int3
1498 /* jmp paranoid_exit1
1499 CFI_ENDPROC */
1500 -END(int3)
1501 - .previous .text
1502 +KPROBE_END(int3)
1503
1504 ENTRY(overflow)
1505 zeroentry do_overflow
1506 @@ -1241,8 +1230,7 @@
1507
1508 KPROBE_ENTRY(general_protection)
1509 errorentry do_general_protection
1510 -END(general_protection)
1511 - .previous .text
1512 +KPROBE_END(general_protection)
1513
1514 ENTRY(alignment_check)
1515 errorentry do_alignment_check
1516 --- a/arch/x86/kernel/genapic_xen_64.c
1517 +++ b/arch/x86/kernel/genapic_xen_64.c
1518 @@ -71,6 +71,13 @@
1519 return cpu_online_map;
1520 }
1521
1522 +static cpumask_t xen_vector_allocation_domain(int cpu)
1523 +{
1524 + cpumask_t domain = CPU_MASK_NONE;
1525 + cpu_set(cpu, domain);
1526 + return domain;
1527 +}
1528 +
1529 /*
1530 * Set up the logical destination ID.
1531 * Do nothing, not called now.
1532 @@ -147,8 +154,8 @@
1533 .int_delivery_mode = dest_LowestPrio,
1534 #endif
1535 .int_dest_mode = (APIC_DEST_LOGICAL != 0),
1536 - .int_delivery_dest = APIC_DEST_LOGICAL | APIC_DM_LOWEST,
1537 .target_cpus = xen_target_cpus,
1538 + .vector_allocation_domain = xen_vector_allocation_domain,
1539 #ifdef CONFIG_XEN_PRIVILEGED_GUEST
1540 .apic_id_registered = xen_apic_id_registered,
1541 #endif
1542 --- a/arch/x86/kernel/head64-xen.c
1543 +++ b/arch/x86/kernel/head64-xen.c
1544 @@ -54,11 +54,9 @@
1545 new_data = *(int *) (x86_boot_params + NEW_CL_POINTER);
1546 if (!new_data) {
1547 if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) {
1548 - printk("so old bootloader that it does not support commandline?!\n");
1549 return;
1550 }
1551 new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET;
1552 - printk("old bootloader convention, maybe loadlin?\n");
1553 }
1554 command_line = (char *) ((u64)(new_data));
1555 memcpy(saved_command_line, command_line, COMMAND_LINE_SIZE);
1556 @@ -70,25 +68,6 @@
1557 memcpy(saved_command_line, xen_start_info->cmd_line, max_cmdline);
1558 saved_command_line[max_cmdline-1] = '\0';
1559 #endif
1560 - printk("Bootdata ok (command line is %s)\n", saved_command_line);
1561 -}
1562 -
1563 -static void __init setup_boot_cpu_data(void)
1564 -{
1565 - unsigned int dummy, eax;
1566 -
1567 - /* get vendor info */
1568 - cpuid(0, (unsigned int *)&boot_cpu_data.cpuid_level,
1569 - (unsigned int *)&boot_cpu_data.x86_vendor_id[0],
1570 - (unsigned int *)&boot_cpu_data.x86_vendor_id[8],
1571 - (unsigned int *)&boot_cpu_data.x86_vendor_id[4]);
1572 -
1573 - /* get cpu type */
1574 - cpuid(1, &eax, &dummy, &dummy,
1575 - (unsigned int *) &boot_cpu_data.x86_capability);
1576 - boot_cpu_data.x86 = (eax >> 8) & 0xf;
1577 - boot_cpu_data.x86_model = (eax >> 4) & 0xf;
1578 - boot_cpu_data.x86_mask = eax & 0xf;
1579 }
1580
1581 #include <xen/interface/memory.h>
1582 @@ -101,7 +80,6 @@
1583 {
1584 struct xen_machphys_mapping mapping;
1585 unsigned long machine_to_phys_nr_ents;
1586 - char *s;
1587 int i;
1588
1589 setup_xen_features();
1590 @@ -128,10 +106,7 @@
1591 asm volatile("lidt %0" :: "m" (idt_descr));
1592 #endif
1593
1594 - /*
1595 - * This must be called really, really early:
1596 - */
1597 - lockdep_init();
1598 + early_printk("Kernel alive\n");
1599
1600 for (i = 0; i < NR_CPUS; i++)
1601 cpu_pda(i) = &boot_cpu_pda[i];
1602 @@ -141,22 +116,5 @@
1603 #ifdef CONFIG_SMP
1604 cpu_set(0, cpu_online_map);
1605 #endif
1606 - s = strstr(saved_command_line, "earlyprintk=");
1607 - if (s != NULL)
1608 - setup_early_printk(strchr(s, '=') + 1);
1609 -#ifdef CONFIG_NUMA
1610 - s = strstr(saved_command_line, "numa=");
1611 - if (s != NULL)
1612 - numa_setup(s+5);
1613 -#endif
1614 -#ifdef CONFIG_X86_IO_APIC
1615 - if (strstr(saved_command_line, "disableapic"))
1616 - disable_apic = 1;
1617 -#endif
1618 - /* You need early console to see that */
1619 - if (__pa_symbol(&_end) >= KERNEL_TEXT_SIZE)
1620 - panic("Kernel too big for kernel mapping\n");
1621 -
1622 - setup_boot_cpu_data();
1623 start_kernel();
1624 }
1625 --- a/arch/x86/kernel/head_32-xen.S
1626 +++ b/arch/x86/kernel/head_32-xen.S
1627 @@ -62,7 +62,7 @@
1628 movl %eax,%gs
1629 cld # gcc2 wants the direction flag cleared at all times
1630
1631 - pushl %eax # fake return address
1632 + pushl $0 # fake return address for unwinder
1633 jmp start_kernel
1634
1635 #define HYPERCALL_PAGE_OFFSET 0x1000
1636 --- a/arch/x86/kernel/head_64-xen.S
1637 +++ b/arch/x86/kernel/head_64-xen.S
1638 @@ -149,7 +146,7 @@
1639 .quad 0,0 /* TSS */
1640 .quad 0,0 /* LDT */
1641 .quad 0,0,0 /* three TLS descriptors */
1642 - .quad 0 /* unused */
1643 + .quad 0x0000f40000000000 /* node/CPU stored in limit */
1644 gdt_end:
1645 /* asm/segment.h:GDT_ENTRIES must match this */
1646 /* This should be a multiple of the cache line size */
1647 --- a/arch/x86/kernel/io_apic_32-xen.c
1648 +++ b/arch/x86/kernel/io_apic_32-xen.c
1649 @@ -31,6 +31,9 @@
1650 #include <linux/acpi.h>
1651 #include <linux/module.h>
1652 #include <linux/sysdev.h>
1653 +#include <linux/pci.h>
1654 +#include <linux/msi.h>
1655 +#include <linux/htirq.h>
1656
1657 #include <asm/io.h>
1658 #include <asm/smp.h>
1659 @@ -38,13 +41,15 @@
1660 #include <asm/timer.h>
1661 #include <asm/i8259.h>
1662 #include <asm/nmi.h>
1663 +#include <asm/msidef.h>
1664 +#include <asm/hypertransport.h>
1665
1666 #include <mach_apic.h>
1667 +#include <mach_apicdef.h>
1668
1669 #include "io_ports.h"
1670
1671 #ifdef CONFIG_XEN
1672 -
1673 #include <xen/interface/xen.h>
1674 #include <xen/interface/physdev.h>
1675
1676 @@ -55,32 +60,7 @@
1677
1678 unsigned long io_apic_irqs;
1679
1680 -static inline unsigned int xen_io_apic_read(unsigned int apic, unsigned int reg)
1681 -{
1682 - struct physdev_apic apic_op;
1683 - int ret;
1684 -
1685 - apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
1686 - apic_op.reg = reg;
1687 - ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
1688 - if (ret)
1689 - return ret;
1690 - return apic_op.value;
1691 -}
1692 -
1693 -static inline void xen_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
1694 -{
1695 - struct physdev_apic apic_op;
1696 -
1697 - apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
1698 - apic_op.reg = reg;
1699 - apic_op.value = value;
1700 - WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
1701 -}
1702 -
1703 -#define io_apic_read(a,r) xen_io_apic_read(a,r)
1704 -#define io_apic_write(a,r,v) xen_io_apic_write(a,r,v)
1705 -
1706 +#define clear_IO_APIC() ((void)0)
1707 #endif /* CONFIG_XEN */
1708
1709 int (*ioapic_renumber_irq)(int ioapic, int irq);
1710 @@ -105,7 +85,7 @@
1711 */
1712 int nr_ioapic_registers[MAX_IO_APICS];
1713
1714 -int disable_timer_pin_1 __initdata;
1715 +static int disable_timer_pin_1 __initdata;
1716
1717 /*
1718 * Rough estimation of how many shared IRQs there are, can
1719 @@ -125,12 +105,122 @@
1720 int apic, pin, next;
1721 } irq_2_pin[PIN_MAP_SIZE];
1722
1723 -int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1};
1724 -#ifdef CONFIG_PCI_MSI
1725 -#define vector_to_irq(vector) \
1726 - (platform_legacy_irq(vector) ? vector : vector_irq[vector])
1727 +#ifndef CONFIG_XEN
1728 +struct io_apic {
1729 + unsigned int index;
1730 + unsigned int unused[3];
1731 + unsigned int data;
1732 +};
1733 +
1734 +static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
1735 +{
1736 + return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
1737 + + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
1738 +}
1739 +#endif
1740 +
1741 +static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
1742 +{
1743 +#ifndef CONFIG_XEN
1744 + struct io_apic __iomem *io_apic = io_apic_base(apic);
1745 + writel(reg, &io_apic->index);
1746 + return readl(&io_apic->data);
1747 +#else
1748 + struct physdev_apic apic_op;
1749 + int ret;
1750 +
1751 + apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
1752 + apic_op.reg = reg;
1753 + ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
1754 + if (ret)
1755 + return ret;
1756 + return apic_op.value;
1757 +#endif
1758 +}
1759 +
1760 +static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
1761 +{
1762 +#ifndef CONFIG_XEN
1763 + struct io_apic __iomem *io_apic = io_apic_base(apic);
1764 + writel(reg, &io_apic->index);
1765 + writel(value, &io_apic->data);
1766 +#else
1767 + struct physdev_apic apic_op;
1768 +
1769 + apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
1770 + apic_op.reg = reg;
1771 + apic_op.value = value;
1772 + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
1773 +#endif
1774 +}
1775 +
1776 +#ifndef CONFIG_XEN
1777 +/*
1778 + * Re-write a value: to be used for read-modify-write
1779 + * cycles where the read already set up the index register.
1780 + *
1781 + * Older SiS APIC requires we rewrite the index register
1782 + */
1783 +static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
1784 +{
1785 + volatile struct io_apic *io_apic = io_apic_base(apic);
1786 + if (sis_apic_bug)
1787 + writel(reg, &io_apic->index);
1788 + writel(value, &io_apic->data);
1789 +}
1790 #else
1791 -#define vector_to_irq(vector) (vector)
1792 +#define io_apic_modify io_apic_write
1793 +#endif
1794 +
1795 +union entry_union {
1796 + struct { u32 w1, w2; };
1797 + struct IO_APIC_route_entry entry;
1798 +};
1799 +
1800 +static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
1801 +{
1802 + union entry_union eu;
1803 + unsigned long flags;
1804 + spin_lock_irqsave(&ioapic_lock, flags);
1805 + eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
1806 + eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
1807 + spin_unlock_irqrestore(&ioapic_lock, flags);
1808 + return eu.entry;
1809 +}
1810 +
1811 +/*
1812 + * When we write a new IO APIC routing entry, we need to write the high
1813 + * word first! If the mask bit in the low word is clear, we will enable
1814 + * the interrupt, and we need to make sure the entry is fully populated
1815 + * before that happens.
1816 + */
1817 +static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
1818 +{
1819 + unsigned long flags;
1820 + union entry_union eu;
1821 + eu.entry = e;
1822 + spin_lock_irqsave(&ioapic_lock, flags);
1823 + io_apic_write(apic, 0x11 + 2*pin, eu.w2);
1824 + io_apic_write(apic, 0x10 + 2*pin, eu.w1);
1825 + spin_unlock_irqrestore(&ioapic_lock, flags);
1826 +}
1827 +
1828 +#ifndef CONFIG_XEN
1829 +/*
1830 + * When we mask an IO APIC routing entry, we need to write the low
1831 + * word first, in order to set the mask bit before we change the
1832 + * high bits!
1833 + */
1834 +static void ioapic_mask_entry(int apic, int pin)
1835 +{
1836 + unsigned long flags;
1837 + union entry_union eu = { .entry.mask = 1 };
1838 +
1839 + spin_lock_irqsave(&ioapic_lock, flags);
1840 + io_apic_write(apic, 0x10 + 2*pin, eu.w1);
1841 + io_apic_write(apic, 0x11 + 2*pin, eu.w2);
1842 + spin_unlock_irqrestore(&ioapic_lock, flags);
1843 +}
1844 #endif
1845
1846 /*
1847 @@ -156,9 +246,7 @@
1848 entry->pin = pin;
1849 }
1850
1851 -#ifdef CONFIG_XEN
1852 -#define clear_IO_APIC() ((void)0)
1853 -#else
1854 +#ifndef CONFIG_XEN
1855 /*
1856 * Reroute an IRQ to a different pin.
1857 */
1858 @@ -243,25 +331,16 @@
1859 static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
1860 {
1861 struct IO_APIC_route_entry entry;
1862 - unsigned long flags;
1863
1864 /* Check delivery_mode to be sure we're not clearing an SMI pin */
1865 - spin_lock_irqsave(&ioapic_lock, flags);
1866 - *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
1867 - *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
1868 - spin_unlock_irqrestore(&ioapic_lock, flags);
1869 + entry = ioapic_read_entry(apic, pin);
1870 if (entry.delivery_mode == dest_SMI)
1871 return;
1872
1873 /*
1874 * Disable it in the IO-APIC irq-routing table:
1875 */
1876 - memset(&entry, 0, sizeof(entry));
1877 - entry.mask = 1;
1878 - spin_lock_irqsave(&ioapic_lock, flags);
1879 - io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
1880 - io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
1881 - spin_unlock_irqrestore(&ioapic_lock, flags);
1882 + ioapic_mask_entry(apic, pin);
1883 }
1884
1885 static void clear_IO_APIC (void)
1886 @@ -301,7 +380,7 @@
1887 break;
1888 entry = irq_2_pin + entry->next;
1889 }
1890 - set_irq_info(irq, cpumask);
1891 + set_native_irq_info(irq, cpumask);
1892 spin_unlock_irqrestore(&ioapic_lock, flags);
1893 }
1894
1895 @@ -1207,40 +1286,40 @@
1896 /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
1897 u8 irq_vector[NR_IRQ_VECTORS] __read_mostly; /* = { FIRST_DEVICE_VECTOR , 0 }; */
1898
1899 -int assign_irq_vector(int irq)
1900 +static int __assign_irq_vector(int irq)
1901 {
1902 - unsigned long flags;
1903 int vector;
1904 struct physdev_irq irq_op;
1905
1906 - BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS);
1907 -
1908 - spin_lock_irqsave(&vector_lock, flags);
1909 + BUG_ON((unsigned)irq >= NR_IRQ_VECTORS);
1910
1911 - if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) {
1912 - spin_unlock_irqrestore(&vector_lock, flags);
1913 - return IO_APIC_VECTOR(irq);
1914 - }
1915 + if (irq_vector[irq] > 0)
1916 + return irq_vector[irq];
1917
1918 irq_op.irq = irq;
1919 - if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) {
1920 - spin_unlock_irqrestore(&vector_lock, flags);
1921 + if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op))
1922 return -ENOSPC;
1923 - }
1924
1925 vector = irq_op.vector;
1926 - vector_irq[vector] = irq;
1927 - if (irq != AUTO_ASSIGN)
1928 - IO_APIC_VECTOR(irq) = vector;
1929 + irq_vector[irq] = vector;
1930 +
1931 + return vector;
1932 +}
1933
1934 +static int assign_irq_vector(int irq)
1935 +{
1936 + unsigned long flags;
1937 + int vector;
1938 +
1939 + spin_lock_irqsave(&vector_lock, flags);
1940 + vector = __assign_irq_vector(irq);
1941 spin_unlock_irqrestore(&vector_lock, flags);
1942
1943 return vector;
1944 }
1945
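
Splitting assign_irq_vector() into a lock-taking wrapper around __assign_irq_vector() is the usual kernel idiom for letting code that already holds vector_lock (such as create_irq() later in this file) reuse the same allocator without deadlocking. A small userspace sketch of that wrapper pattern, with a pthread mutex standing in for the spinlock and a trivial counter standing in for the PHYSDEVOP_alloc_irq_vector hypercall:

#include <pthread.h>
#include <stdio.h>

#define NR_IRQ_VECTORS 256

static pthread_mutex_t vector_lock = PTHREAD_MUTEX_INITIALIZER;
static int irq_vector[NR_IRQ_VECTORS];	/* 0 = no vector assigned yet */
static int next_vector = 0x31;		/* toy allocator in place of the hypercall */

/* Caller must hold vector_lock. */
static int __assign_irq_vector(int irq)
{
	if (irq_vector[irq] > 0)	/* already assigned: hand back the cached vector */
		return irq_vector[irq];
	irq_vector[irq] = next_vector++;
	return irq_vector[irq];
}

/* Locking wrapper for callers that do not hold the lock themselves. */
static int assign_irq_vector(int irq)
{
	int vector;

	pthread_mutex_lock(&vector_lock);
	vector = __assign_irq_vector(irq);
	pthread_mutex_unlock(&vector_lock);
	return vector;
}

int main(void)
{
	printf("irq 9 -> vector 0x%x\n", assign_irq_vector(9));
	printf("irq 9 -> vector 0x%x (cached)\n", assign_irq_vector(9));
	return 0;
}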
1946 #ifndef CONFIG_XEN
1947 -static struct hw_interrupt_type ioapic_level_type;
1948 -static struct hw_interrupt_type ioapic_edge_type;
1949 +static struct irq_chip ioapic_chip;
1950
1951 #define IOAPIC_AUTO -1
1952 #define IOAPIC_EDGE 0
1953 @@ -1248,16 +1327,16 @@
1954
1955 static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
1956 {
1957 - unsigned idx;
1958 -
1959 - idx = use_pci_vector() && !platform_legacy_irq(irq) ? vector : irq;
1960 -
1961 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
1962 trigger == IOAPIC_LEVEL)
1963 - irq_desc[idx].chip = &ioapic_level_type;
1964 - else
1965 - irq_desc[idx].chip = &ioapic_edge_type;
1966 - set_intr_gate(vector, interrupt[idx]);
1967 + set_irq_chip_and_handler_name(irq, &ioapic_chip,
1968 + handle_fasteoi_irq, "fasteoi");
1969 + else {
1970 + irq_desc[irq].status |= IRQ_DELAYED_DISABLE;
1971 + set_irq_chip_and_handler_name(irq, &ioapic_chip,
1972 + handle_edge_irq, "edge");
1973 + }
1974 + set_intr_gate(vector, interrupt[irq]);
1975 }
1976 #else
1977 #define ioapic_register_intr(_irq,_vector,_trigger) ((void)0)
1978 @@ -1328,9 +1407,8 @@
1979 if (!apic && (irq < 16))
1980 disable_8259A_irq(irq);
1981 }
1982 + ioapic_write_entry(apic, pin, entry);
1983 spin_lock_irqsave(&ioapic_lock, flags);
1984 - io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
1985 - io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
1986 set_native_irq_info(irq, TARGET_CPUS);
1987 spin_unlock_irqrestore(&ioapic_lock, flags);
1988 }
1989 @@ -1347,7 +1425,6 @@
1990 static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
1991 {
1992 struct IO_APIC_route_entry entry;
1993 - unsigned long flags;
1994
1995 memset(&entry,0,sizeof(entry));
1996
1997 @@ -1372,15 +1449,13 @@
1998 * The timer IRQ doesn't have to know that behind the
1999 * scene we have a 8259A-master in AEOI mode ...
2000 */
2001 - irq_desc[0].chip = &ioapic_edge_type;
2002 + irq_desc[0].chip = &ioapic_chip;
2003 + set_irq_handler(0, handle_edge_irq);
2004
2005 /*
2006 * Add it to the IO-APIC irq-routing table:
2007 */
2008 - spin_lock_irqsave(&ioapic_lock, flags);
2009 - io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
2010 - io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
2011 - spin_unlock_irqrestore(&ioapic_lock, flags);
2012 + ioapic_write_entry(apic, pin, entry);
2013
2014 enable_8259A_irq(0);
2015 }
2016 @@ -1490,10 +1565,7 @@
2017 for (i = 0; i <= reg_01.bits.entries; i++) {
2018 struct IO_APIC_route_entry entry;
2019
2020 - spin_lock_irqsave(&ioapic_lock, flags);
2021 - *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2);
2022 - *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2);
2023 - spin_unlock_irqrestore(&ioapic_lock, flags);
2024 + entry = ioapic_read_entry(apic, i);
2025
2026 printk(KERN_DEBUG " %02x %03X %02X ",
2027 i,
2028 @@ -1513,17 +1585,12 @@
2029 );
2030 }
2031 }
2032 - if (use_pci_vector())
2033 - printk(KERN_INFO "Using vector-based indexing\n");
2034 printk(KERN_DEBUG "IRQ to pin mappings:\n");
2035 for (i = 0; i < NR_IRQS; i++) {
2036 struct irq_pin_list *entry = irq_2_pin + i;
2037 if (entry->pin < 0)
2038 continue;
2039 - if (use_pci_vector() && !platform_legacy_irq(i))
2040 - printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i));
2041 - else
2042 - printk(KERN_DEBUG "IRQ%d ", i);
2043 + printk(KERN_DEBUG "IRQ%d ", i);
2044 for (;;) {
2045 printk("-> %d:%d", entry->apic, entry->pin);
2046 if (!entry->next)
2047 @@ -1709,10 +1776,7 @@
2048 /* See if any of the pins is in ExtINT mode */
2049 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
2050 struct IO_APIC_route_entry entry;
2051 - spin_lock_irqsave(&ioapic_lock, flags);
2052 - *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
2053 - *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
2054 - spin_unlock_irqrestore(&ioapic_lock, flags);
2055 + entry = ioapic_read_entry(apic, pin);
2056
2057
2058 /* If the interrupt line is enabled and in ExtInt mode
2059 @@ -1770,7 +1834,6 @@
2060 */
2061 if (ioapic_i8259.pin != -1) {
2062 struct IO_APIC_route_entry entry;
2063 - unsigned long flags;
2064
2065 memset(&entry, 0, sizeof(entry));
2066 entry.mask = 0; /* Enabled */
2067 @@ -1787,12 +1850,7 @@
2068 /*
2069 * Add it to the IO-APIC irq-routing table:
2070 */
2071 - spin_lock_irqsave(&ioapic_lock, flags);
2072 - io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin,
2073 - *(((int *)&entry)+1));
2074 - io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin,
2075 - *(((int *)&entry)+0));
2076 - spin_unlock_irqrestore(&ioapic_lock, flags);
2077 + ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
2078 }
2079 disconnect_bsp_APIC(ioapic_i8259.pin != -1);
2080 #endif
2081 @@ -1959,6 +2017,8 @@
2082 */
2083
2084 /*
2085 + * Startup quirk:
2086 + *
2087 * Starting up a edge-triggered IO-APIC interrupt is
2088 * nasty - we need to make sure that we get the edge.
2089 * If it is already asserted for some reason, we need
2090 @@ -1966,8 +2026,10 @@
2091 *
2092 * This is not complete - we should be able to fake
2093 * an edge even if it isn't on the 8259A...
2094 + *
2095 + * (We do this for level-triggered IRQs too - it cannot hurt.)
2096 */
2097 -static unsigned int startup_edge_ioapic_irq(unsigned int irq)
2098 +static unsigned int startup_ioapic_irq(unsigned int irq)
2099 {
2100 int was_pending = 0;
2101 unsigned long flags;
2102 @@ -1984,47 +2046,18 @@
2103 return was_pending;
2104 }
2105
2106 -/*
2107 - * Once we have recorded IRQ_PENDING already, we can mask the
2108 - * interrupt for real. This prevents IRQ storms from unhandled
2109 - * devices.
2110 - */
2111 -static void ack_edge_ioapic_irq(unsigned int irq)
2112 -{
2113 - move_irq(irq);
2114 - if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
2115 - == (IRQ_PENDING | IRQ_DISABLED))
2116 - mask_IO_APIC_irq(irq);
2117 - ack_APIC_irq();
2118 -}
2119 -
2120 -/*
2121 - * Level triggered interrupts can just be masked,
2122 - * and shutting down and starting up the interrupt
2123 - * is the same as enabling and disabling them -- except
2124 - * with a startup need to return a "was pending" value.
2125 - *
2126 - * Level triggered interrupts are special because we
2127 - * do not touch any IO-APIC register while handling
2128 - * them. We ack the APIC in the end-IRQ handler, not
2129 - * in the start-IRQ-handler. Protection against reentrance
2130 - * from the same interrupt is still provided, both by the
2131 - * generic IRQ layer and by the fact that an unacked local
2132 - * APIC does not accept IRQs.
2133 - */
2134 -static unsigned int startup_level_ioapic_irq (unsigned int irq)
2135 +static void ack_ioapic_irq(unsigned int irq)
2136 {
2137 - unmask_IO_APIC_irq(irq);
2138 -
2139 - return 0; /* don't check for pending */
2140 + move_native_irq(irq);
2141 + ack_APIC_irq();
2142 }
2143
2144 -static void end_level_ioapic_irq (unsigned int irq)
2145 +static void ack_ioapic_quirk_irq(unsigned int irq)
2146 {
2147 unsigned long v;
2148 int i;
2149
2150 - move_irq(irq);
2151 + move_native_irq(irq);
2152 /*
2153 * It appears there is an erratum which affects at least version 0x11
2154 * of I/O APIC (that's the 82093AA and cores integrated into various
2155 @@ -2044,7 +2077,7 @@
2156 * operation to prevent an edge-triggered interrupt escaping meanwhile.
2157 * The idea is from Manfred Spraul. --macro
2158 */
2159 - i = IO_APIC_VECTOR(irq);
2160 + i = irq_vector[irq];
2161
2162 v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
2163
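
The (i & ~0x1f) >> 1 expression above turns a vector number into a byte offset into the local APIC's Trigger Mode Register array: APIC registers are 32 bits wide but spaced 16 bytes apart, so vector i lives in register i/32, at byte offset (i/32)*16, bit i%32. A tiny standalone check of that mapping (the vector values are arbitrary):

#include <stdio.h>

int main(void)
{
	unsigned int vectors[] = { 0x31, 0x59, 0xef };
	unsigned int n;

	for (n = 0; n < sizeof(vectors) / sizeof(vectors[0]); n++) {
		unsigned int i = vectors[n];
		unsigned int byte_off = (i & ~0x1fu) >> 1;	/* (i / 32) * 16 */
		unsigned int bit      = i & 0x1f;		/* bit within that register */

		printf("vector 0x%02x -> TMR register offset +0x%02x, bit %u\n",
		       i, byte_off, bit);
	}
	return 0;
}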
2164 @@ -2059,104 +2092,24 @@
2165 }
2166 }
2167
2168 -#ifdef CONFIG_PCI_MSI
2169 -static unsigned int startup_edge_ioapic_vector(unsigned int vector)
2170 -{
2171 - int irq = vector_to_irq(vector);
2172 -
2173 - return startup_edge_ioapic_irq(irq);
2174 -}
2175 -
2176 -static void ack_edge_ioapic_vector(unsigned int vector)
2177 -{
2178 - int irq = vector_to_irq(vector);
2179 -
2180 - move_native_irq(vector);
2181 - ack_edge_ioapic_irq(irq);
2182 -}
2183 -
2184 -static unsigned int startup_level_ioapic_vector (unsigned int vector)
2185 -{
2186 - int irq = vector_to_irq(vector);
2187 -
2188 - return startup_level_ioapic_irq (irq);
2189 -}
2190 -
2191 -static void end_level_ioapic_vector (unsigned int vector)
2192 -{
2193 - int irq = vector_to_irq(vector);
2194 -
2195 - move_native_irq(vector);
2196 - end_level_ioapic_irq(irq);
2197 -}
2198 -
2199 -static void mask_IO_APIC_vector (unsigned int vector)
2200 -{
2201 - int irq = vector_to_irq(vector);
2202 -
2203 - mask_IO_APIC_irq(irq);
2204 -}
2205 -
2206 -static void unmask_IO_APIC_vector (unsigned int vector)
2207 -{
2208 - int irq = vector_to_irq(vector);
2209 -
2210 - unmask_IO_APIC_irq(irq);
2211 -}
2212 -
2213 -#ifdef CONFIG_SMP
2214 -static void set_ioapic_affinity_vector (unsigned int vector,
2215 - cpumask_t cpu_mask)
2216 -{
2217 - int irq = vector_to_irq(vector);
2218 -
2219 - set_native_irq_info(vector, cpu_mask);
2220 - set_ioapic_affinity_irq(irq, cpu_mask);
2221 -}
2222 -#endif
2223 -#endif
2224 -
2225 -static int ioapic_retrigger(unsigned int irq)
2226 +static int ioapic_retrigger_irq(unsigned int irq)
2227 {
2228 - send_IPI_self(IO_APIC_VECTOR(irq));
2229 + send_IPI_self(irq_vector[irq]);
2230
2231 return 1;
2232 }
2233
2234 -/*
2235 - * Level and edge triggered IO-APIC interrupts need different handling,
2236 - * so we use two separate IRQ descriptors. Edge triggered IRQs can be
2237 - * handled with the level-triggered descriptor, but that one has slightly
2238 - * more overhead. Level-triggered interrupts cannot be handled with the
2239 - * edge-triggered handler, without risking IRQ storms and other ugly
2240 - * races.
2241 - */
2242 -static struct hw_interrupt_type ioapic_edge_type __read_mostly = {
2243 - .typename = "IO-APIC-edge",
2244 - .startup = startup_edge_ioapic,
2245 - .shutdown = shutdown_edge_ioapic,
2246 - .enable = enable_edge_ioapic,
2247 - .disable = disable_edge_ioapic,
2248 - .ack = ack_edge_ioapic,
2249 - .end = end_edge_ioapic,
2250 -#ifdef CONFIG_SMP
2251 - .set_affinity = set_ioapic_affinity,
2252 -#endif
2253 - .retrigger = ioapic_retrigger,
2254 -};
2255 -
2256 -static struct hw_interrupt_type ioapic_level_type __read_mostly = {
2257 - .typename = "IO-APIC-level",
2258 - .startup = startup_level_ioapic,
2259 - .shutdown = shutdown_level_ioapic,
2260 - .enable = enable_level_ioapic,
2261 - .disable = disable_level_ioapic,
2262 - .ack = mask_and_ack_level_ioapic,
2263 - .end = end_level_ioapic,
2264 +static struct irq_chip ioapic_chip __read_mostly = {
2265 + .name = "IO-APIC",
2266 + .startup = startup_ioapic_irq,
2267 + .mask = mask_IO_APIC_irq,
2268 + .unmask = unmask_IO_APIC_irq,
2269 + .ack = ack_ioapic_irq,
2270 + .eoi = ack_ioapic_quirk_irq,
2271 #ifdef CONFIG_SMP
2272 - .set_affinity = set_ioapic_affinity,
2273 + .set_affinity = set_ioapic_affinity_irq,
2274 #endif
2275 - .retrigger = ioapic_retrigger,
2276 + .retrigger = ioapic_retrigger_irq,
2277 };
2278 #endif /* !CONFIG_XEN */
2279
2280 @@ -2177,12 +2130,7 @@
2281 */
2282 for (irq = 0; irq < NR_IRQS ; irq++) {
2283 int tmp = irq;
2284 - if (use_pci_vector()) {
2285 - if (!platform_legacy_irq(tmp))
2286 - if ((tmp = vector_to_irq(tmp)) == -1)
2287 - continue;
2288 - }
2289 - if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) {
2290 + if (IO_APIC_IRQ(tmp) && !irq_vector[tmp]) {
2291 /*
2292 * Hmm.. We don't have an entry for this,
2293 * so default to an old-fashioned 8259
2294 @@ -2193,22 +2141,23 @@
2295 #ifndef CONFIG_XEN
2296 else
2297 /* Strange. Oh, well.. */
2298 - irq_desc[irq].chip = &no_irq_type;
2299 + irq_desc[irq].chip = &no_irq_chip;
2300 #endif
2301 }
2302 }
2303 }
2304
2305 #ifndef CONFIG_XEN
2306 -static void enable_lapic_irq (unsigned int irq)
2307 -{
2308 - unsigned long v;
2309 +/*
2310 + * The local APIC irq-chip implementation:
2311 + */
2312
2313 - v = apic_read(APIC_LVT0);
2314 - apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
2315 +static void ack_apic(unsigned int irq)
2316 +{
2317 + ack_APIC_irq();
2318 }
2319
2320 -static void disable_lapic_irq (unsigned int irq)
2321 +static void mask_lapic_irq (unsigned int irq)
2322 {
2323 unsigned long v;
2324
2325 @@ -2216,21 +2165,19 @@
2326 apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
2327 }
2328
2329 -static void ack_lapic_irq (unsigned int irq)
2330 +static void unmask_lapic_irq (unsigned int irq)
2331 {
2332 - ack_APIC_irq();
2333 -}
2334 + unsigned long v;
2335
2336 -static void end_lapic_irq (unsigned int i) { /* nothing */ }
2337 + v = apic_read(APIC_LVT0);
2338 + apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
2339 +}
2340
2341 -static struct hw_interrupt_type lapic_irq_type __read_mostly = {
2342 - .typename = "local-APIC-edge",
2343 - .startup = NULL, /* startup_irq() not used for IRQ0 */
2344 - .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
2345 - .enable = enable_lapic_irq,
2346 - .disable = disable_lapic_irq,
2347 - .ack = ack_lapic_irq,
2348 - .end = end_lapic_irq
2349 +static struct irq_chip lapic_chip __read_mostly = {
2350 + .name = "local-APIC-edge",
2351 + .mask = mask_lapic_irq,
2352 + .unmask = unmask_lapic_irq,
2353 + .eoi = ack_apic,
2354 };
2355
2356 static void setup_nmi (void)
2357 @@ -2263,17 +2210,13 @@
2358 int apic, pin, i;
2359 struct IO_APIC_route_entry entry0, entry1;
2360 unsigned char save_control, save_freq_select;
2361 - unsigned long flags;
2362
2363 pin = find_isa_irq_pin(8, mp_INT);
2364 apic = find_isa_irq_apic(8, mp_INT);
2365 if (pin == -1)
2366 return;
2367
2368 - spin_lock_irqsave(&ioapic_lock, flags);
2369 - *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
2370 - *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
2371 - spin_unlock_irqrestore(&ioapic_lock, flags);
2372 + entry0 = ioapic_read_entry(apic, pin);
2373 clear_IO_APIC_pin(apic, pin);
2374
2375 memset(&entry1, 0, sizeof(entry1));
2376 @@ -2286,10 +2229,7 @@
2377 entry1.trigger = 0;
2378 entry1.vector = 0;
2379
2380 - spin_lock_irqsave(&ioapic_lock, flags);
2381 - io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
2382 - io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
2383 - spin_unlock_irqrestore(&ioapic_lock, flags);
2384 + ioapic_write_entry(apic, pin, entry1);
2385
2386 save_control = CMOS_READ(RTC_CONTROL);
2387 save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
2388 @@ -2308,10 +2248,7 @@
2389 CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
2390 clear_IO_APIC_pin(apic, pin);
2391
2392 - spin_lock_irqsave(&ioapic_lock, flags);
2393 - io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
2394 - io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
2395 - spin_unlock_irqrestore(&ioapic_lock, flags);
2396 + ioapic_write_entry(apic, pin, entry0);
2397 }
2398
2399 int timer_uses_ioapic_pin_0;
2400 @@ -2411,7 +2348,8 @@
2401 printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
2402
2403 disable_8259A_irq(0);
2404 - irq_desc[0].chip = &lapic_irq_type;
2405 + set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq,
2406 + "fasteio");
2407 apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
2408 enable_8259A_irq(0);
2409
2410 @@ -2523,17 +2461,12 @@
2411 {
2412 struct IO_APIC_route_entry *entry;
2413 struct sysfs_ioapic_data *data;
2414 - unsigned long flags;
2415 int i;
2416
2417 data = container_of(dev, struct sysfs_ioapic_data, dev);
2418 entry = data->entry;
2419 - spin_lock_irqsave(&ioapic_lock, flags);
2420 - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
2421 - *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i);
2422 - *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i);
2423 - }
2424 - spin_unlock_irqrestore(&ioapic_lock, flags);
2425 + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
2426 + entry[i] = ioapic_read_entry(dev->id, i);
2427
2428 return 0;
2429 }
2430 @@ -2555,11 +2488,9 @@
2431 reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
2432 io_apic_write(dev->id, 0, reg_00.raw);
2433 }
2434 - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
2435 - io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1));
2436 - io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0));
2437 - }
2438 spin_unlock_irqrestore(&ioapic_lock, flags);
2439 + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
2440 + ioapic_write_entry(dev->id, i, entry[i]);
2441
2442 return 0;
2443 }
2444 @@ -2605,6 +2536,240 @@
2445
2446 device_initcall(ioapic_init_sysfs);
2447
2448 +#ifndef CONFIG_XEN
2449 +/*
2450 + * Dynamic irq allocation and deallocation
2451 + */
2452 +int create_irq(void)
2453 +{
2454 + /* Allocate an unused irq */
2455 + int irq, new, vector;
2456 + unsigned long flags;
2457 +
2458 + irq = -ENOSPC;
2459 + spin_lock_irqsave(&vector_lock, flags);
2460 + for (new = (NR_IRQS - 1); new >= 0; new--) {
2461 + if (platform_legacy_irq(new))
2462 + continue;
2463 + if (irq_vector[new] != 0)
2464 + continue;
2465 + vector = __assign_irq_vector(new);
2466 + if (likely(vector > 0))
2467 + irq = new;
2468 + break;
2469 + }
2470 + spin_unlock_irqrestore(&vector_lock, flags);
2471 +
2472 + if (irq >= 0) {
2473 + set_intr_gate(vector, interrupt[irq]);
2474 + dynamic_irq_init(irq);
2475 + }
2476 + return irq;
2477 +}
2478 +
2479 +void destroy_irq(unsigned int irq)
2480 +{
2481 + unsigned long flags;
2482 +
2483 + dynamic_irq_cleanup(irq);
2484 +
2485 + spin_lock_irqsave(&vector_lock, flags);
2486 + irq_vector[irq] = 0;
2487 + spin_unlock_irqrestore(&vector_lock, flags);
2488 +}
2489 +#endif
2490 +
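
create_irq() above hands out dynamic IRQ numbers by scanning downward from NR_IRQS - 1, skipping legacy IRQs and anything that already owns a vector, so MSI/HT interrupts allocated later do not collide with statically routed device IRQs. The same top-down free-slot scan, reduced to a runnable toy (the legacy-IRQ cutoff is a stand-in for platform_legacy_irq()):

#include <stdio.h>

#define NR_IRQS		224
#define NR_LEGACY_IRQS	 16	/* illustrative stand-in for platform_legacy_irq() */

static int irq_in_use[NR_IRQS];	/* nonzero = already has a vector */

static int create_irq(void)
{
	int new;

	/* Scan from the top so dynamic IRQs stay clear of device IRQs
	 * that were enumerated bottom-up. */
	for (new = NR_IRQS - 1; new >= 0; new--) {
		if (new < NR_LEGACY_IRQS)	/* never hand out ISA IRQs */
			continue;
		if (irq_in_use[new])
			continue;
		irq_in_use[new] = 1;
		return new;
	}
	return -1;	/* -ENOSPC in the real code */
}

int main(void)
{
	printf("first dynamic irq:  %d\n", create_irq());
	printf("second dynamic irq: %d\n", create_irq());
	return 0;
}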
2491 +/*
2492 + * MSI message composition
2493 + */
2494 +#ifdef CONFIG_PCI_MSI
2495 +static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
2496 +{
2497 + int vector;
2498 + unsigned dest;
2499 +
2500 + vector = assign_irq_vector(irq);
2501 + if (vector >= 0) {
2502 + dest = cpu_mask_to_apicid(TARGET_CPUS);
2503 +
2504 + msg->address_hi = MSI_ADDR_BASE_HI;
2505 + msg->address_lo =
2506 + MSI_ADDR_BASE_LO |
2507 + ((INT_DEST_MODE == 0) ?
2508 + MSI_ADDR_DEST_MODE_PHYSICAL:
2509 + MSI_ADDR_DEST_MODE_LOGICAL) |
2510 + ((INT_DELIVERY_MODE != dest_LowestPrio) ?
2511 + MSI_ADDR_REDIRECTION_CPU:
2512 + MSI_ADDR_REDIRECTION_LOWPRI) |
2513 + MSI_ADDR_DEST_ID(dest);
2514 +
2515 + msg->data =
2516 + MSI_DATA_TRIGGER_EDGE |
2517 + MSI_DATA_LEVEL_ASSERT |
2518 + ((INT_DELIVERY_MODE != dest_LowestPrio) ?
2519 + MSI_DATA_DELIVERY_FIXED:
2520 + MSI_DATA_DELIVERY_LOWPRI) |
2521 + MSI_DATA_VECTOR(vector);
2522 + }
2523 + return vector;
2524 +}
2525 +
2526 +#ifdef CONFIG_SMP
2527 +static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
2528 +{
2529 + struct msi_msg msg;
2530 + unsigned int dest;
2531 + cpumask_t tmp;
2532 + int vector;
2533 +
2534 + cpus_and(tmp, mask, cpu_online_map);
2535 + if (cpus_empty(tmp))
2536 + tmp = TARGET_CPUS;
2537 +
2538 + vector = assign_irq_vector(irq);
2539 + if (vector < 0)
2540 + return;
2541 +
2542 + dest = cpu_mask_to_apicid(mask);
2543 +
2544 + read_msi_msg(irq, &msg);
2545 +
2546 + msg.data &= ~MSI_DATA_VECTOR_MASK;
2547 + msg.data |= MSI_DATA_VECTOR(vector);
2548 + msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
2549 + msg.address_lo |= MSI_ADDR_DEST_ID(dest);
2550 +
2551 + write_msi_msg(irq, &msg);
2552 + set_native_irq_info(irq, mask);
2553 +}
2554 +#endif /* CONFIG_SMP */
2555 +
2556 +/*
2557 + * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
2558 + * which implement the MSI or MSI-X Capability Structure.
2559 + */
2560 +static struct irq_chip msi_chip = {
2561 + .name = "PCI-MSI",
2562 + .unmask = unmask_msi_irq,
2563 + .mask = mask_msi_irq,
2564 + .ack = ack_ioapic_irq,
2565 +#ifdef CONFIG_SMP
2566 + .set_affinity = set_msi_irq_affinity,
2567 +#endif
2568 + .retrigger = ioapic_retrigger_irq,
2569 +};
2570 +
2571 +int arch_setup_msi_irq(unsigned int irq, struct pci_dev *dev)
2572 +{
2573 + struct msi_msg msg;
2574 + int ret;
2575 + ret = msi_compose_msg(dev, irq, &msg);
2576 + if (ret < 0)
2577 + return ret;
2578 +
2579 + write_msi_msg(irq, &msg);
2580 +
2581 + set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq,
2582 + "edge");
2583 +
2584 + return 0;
2585 +}
2586 +
2587 +void arch_teardown_msi_irq(unsigned int irq)
2588 +{
2589 + return;
2590 +}
2591 +
2592 +#endif /* CONFIG_PCI_MSI */
2593 +
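
msi_compose_msg() and set_msi_irq_affinity() treat the MSI address/data pair as the routing state: the destination APIC ID sits in the address word and the vector in the data word, so retargeting an interrupt is a read-modify-write of those two fields. A userspace illustration of that compose/retarget split, assuming the standard x86 MSI layout (0xFEE00000 base, destination ID in address bits 19:12, vector in data bits 7:0); the macro names echo <asm/msidef.h> but the definitions here are simplified local stand-ins:

#include <stdint.h>
#include <stdio.h>

#define MSI_ADDR_BASE		0xfee00000u
#define MSI_ADDR_DEST_ID(d)	(((uint32_t)(d) & 0xff) << 12)
#define MSI_ADDR_DEST_ID_MASK	0x000ff000u
#define MSI_DATA_VECTOR(v)	((uint32_t)(v) & 0xff)
#define MSI_DATA_VECTOR_MASK	0x000000ffu

struct msi_msg { uint32_t address_lo, address_hi, data; };

static void compose(struct msi_msg *msg, unsigned int dest, unsigned int vector)
{
	msg->address_hi = 0;
	msg->address_lo = MSI_ADDR_BASE | MSI_ADDR_DEST_ID(dest);
	msg->data       = MSI_DATA_VECTOR(vector);
}

/* Retargeting = read-modify-write of the destination and vector fields,
 * the same shape as set_msi_irq_affinity() above. */
static void retarget(struct msi_msg *msg, unsigned int dest, unsigned int vector)
{
	msg->address_lo &= ~MSI_ADDR_DEST_ID_MASK;
	msg->address_lo |= MSI_ADDR_DEST_ID(dest);
	msg->data       &= ~MSI_DATA_VECTOR_MASK;
	msg->data       |= MSI_DATA_VECTOR(vector);
}

int main(void)
{
	struct msi_msg msg;

	compose(&msg, 0x01, 0x39);
	printf("addr %08x data %08x\n", msg.address_lo, msg.data);
	retarget(&msg, 0x02, 0x41);
	printf("addr %08x data %08x\n", msg.address_lo, msg.data);
	return 0;
}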
2594 +/*
2595 + * Hypertransport interrupt support
2596 + */
2597 +#ifdef CONFIG_HT_IRQ
2598 +
2599 +#ifdef CONFIG_SMP
2600 +
2601 +static void target_ht_irq(unsigned int irq, unsigned int dest)
2602 +{
2603 + struct ht_irq_msg msg;
2604 + fetch_ht_irq_msg(irq, &msg);
2605 +
2606 + msg.address_lo &= ~(HT_IRQ_LOW_DEST_ID_MASK);
2607 + msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
2608 +
2609 + msg.address_lo |= HT_IRQ_LOW_DEST_ID(dest);
2610 + msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
2611 +
2612 + write_ht_irq_msg(irq, &msg);
2613 +}
2614 +
2615 +static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
2616 +{
2617 + unsigned int dest;
2618 + cpumask_t tmp;
2619 +
2620 + cpus_and(tmp, mask, cpu_online_map);
2621 + if (cpus_empty(tmp))
2622 + tmp = TARGET_CPUS;
2623 +
2624 + cpus_and(mask, tmp, CPU_MASK_ALL);
2625 +
2626 + dest = cpu_mask_to_apicid(mask);
2627 +
2628 + target_ht_irq(irq, dest);
2629 + set_native_irq_info(irq, mask);
2630 +}
2631 +#endif
2632 +
2633 +static struct irq_chip ht_irq_chip = {
2634 + .name = "PCI-HT",
2635 + .mask = mask_ht_irq,
2636 + .unmask = unmask_ht_irq,
2637 + .ack = ack_ioapic_irq,
2638 +#ifdef CONFIG_SMP
2639 + .set_affinity = set_ht_irq_affinity,
2640 +#endif
2641 + .retrigger = ioapic_retrigger_irq,
2642 +};
2643 +
2644 +int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
2645 +{
2646 + int vector;
2647 +
2648 + vector = assign_irq_vector(irq);
2649 + if (vector >= 0) {
2650 + struct ht_irq_msg msg;
2651 + unsigned dest;
2652 + cpumask_t tmp;
2653 +
2654 + cpus_clear(tmp);
2655 + cpu_set(vector >> 8, tmp);
2656 + dest = cpu_mask_to_apicid(tmp);
2657 +
2658 + msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
2659 +
2660 + msg.address_lo =
2661 + HT_IRQ_LOW_BASE |
2662 + HT_IRQ_LOW_DEST_ID(dest) |
2663 + HT_IRQ_LOW_VECTOR(vector) |
2664 + ((INT_DEST_MODE == 0) ?
2665 + HT_IRQ_LOW_DM_PHYSICAL :
2666 + HT_IRQ_LOW_DM_LOGICAL) |
2667 + HT_IRQ_LOW_RQEOI_EDGE |
2668 + ((INT_DELIVERY_MODE != dest_LowestPrio) ?
2669 + HT_IRQ_LOW_MT_FIXED :
2670 + HT_IRQ_LOW_MT_ARBITRATED) |
2671 + HT_IRQ_LOW_IRQ_MASKED;
2672 +
2673 + write_ht_irq_msg(irq, &msg);
2674 +
2675 + set_irq_chip_and_handler_name(irq, &ht_irq_chip,
2676 + handle_edge_irq, "edge");
2677 + }
2678 + return vector;
2679 +}
2680 +#endif /* CONFIG_HT_IRQ */
2681 +
2682 /* --------------------------------------------------------------------------
2683 ACPI-based IOAPIC Configuration
2684 -------------------------------------------------------------------------- */
2685 @@ -2758,13 +2923,34 @@
2686 if (!ioapic && (irq < 16))
2687 disable_8259A_irq(irq);
2688
2689 + ioapic_write_entry(ioapic, pin, entry);
2690 spin_lock_irqsave(&ioapic_lock, flags);
2691 - io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
2692 - io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
2693 - set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS);
2694 + set_native_irq_info(irq, TARGET_CPUS);
2695 spin_unlock_irqrestore(&ioapic_lock, flags);
2696
2697 return 0;
2698 }
2699
2700 #endif /* CONFIG_ACPI */
2701 +
2702 +static int __init parse_disable_timer_pin_1(char *arg)
2703 +{
2704 + disable_timer_pin_1 = 1;
2705 + return 0;
2706 +}
2707 +early_param("disable_timer_pin_1", parse_disable_timer_pin_1);
2708 +
2709 +static int __init parse_enable_timer_pin_1(char *arg)
2710 +{
2711 + disable_timer_pin_1 = -1;
2712 + return 0;
2713 +}
2714 +early_param("enable_timer_pin_1", parse_enable_timer_pin_1);
2715 +
2716 +static int __init parse_noapic(char *arg)
2717 +{
2718 + /* disable IO-APIC */
2719 + disable_ioapic_setup();
2720 + return 0;
2721 +}
2722 +early_param("noapic", parse_noapic);
2723 --- a/arch/x86/kernel/io_apic_64-xen.c
2724 +++ b/arch/x86/kernel/io_apic_64-xen.c
2725 @@ -26,9 +26,12 @@
2726 #include <linux/delay.h>
2727 #include <linux/sched.h>
2728 #include <linux/smp_lock.h>
2729 +#include <linux/pci.h>
2730 #include <linux/mc146818rtc.h>
2731 #include <linux/acpi.h>
2732 #include <linux/sysdev.h>
2733 +#include <linux/msi.h>
2734 +#include <linux/htirq.h>
2735 #ifdef CONFIG_ACPI
2736 #include <acpi/acpi_bus.h>
2737 #endif
2738 @@ -41,6 +44,10 @@
2739 #include <asm/acpi.h>
2740 #include <asm/dma.h>
2741 #include <asm/nmi.h>
2742 +#include <asm/msidef.h>
2743 +#include <asm/hypertransport.h>
2744 +
2745 +static int assign_irq_vector(int irq, cpumask_t mask, cpumask_t *result);
2746
2747 #define __apicdebuginit __init
2748
2749 @@ -48,17 +55,29 @@
2750
2751 static int no_timer_check;
2752
2753 -int disable_timer_pin_1 __initdata;
2754 +static int disable_timer_pin_1 __initdata;
2755
2756 -#ifndef CONFIG_XEN
2757 -int timer_over_8254 __initdata = 0;
2758 +#ifdef CONFIG_XEN
2759 +#include <xen/interface/xen.h>
2760 +#include <xen/interface/physdev.h>
2761 +
2762 +/* Fake i8259 */
2763 +#define make_8259A_irq(_irq) (io_apic_irqs &= ~(1UL<<(_irq)))
2764 +#define disable_8259A_irq(_irq) ((void)0)
2765 +#define i8259A_irq_pending(_irq) (0)
2766 +
2767 +unsigned long io_apic_irqs;
2768 +
2769 +#define clear_IO_APIC() ((void)0)
2770 +#else
2771 +int timer_over_8254 __initdata = 1;
2772
2773 /* Where if anywhere is the i8259 connect in external int mode */
2774 static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
2775 #endif
2776
2777 static DEFINE_SPINLOCK(ioapic_lock);
2778 -static DEFINE_SPINLOCK(vector_lock);
2779 +DEFINE_SPINLOCK(vector_lock);
2780
2781 /*
2782 * # of IRQ routing registers
2783 @@ -83,28 +102,27 @@
2784 short apic, pin, next;
2785 } irq_2_pin[PIN_MAP_SIZE];
2786
2787 -int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1};
2788 -#ifdef CONFIG_PCI_MSI
2789 -#define vector_to_irq(vector) \
2790 - (platform_legacy_irq(vector) ? vector : vector_irq[vector])
2791 -#else
2792 -#define vector_to_irq(vector) (vector)
2793 -#endif
2794 -
2795 -#ifdef CONFIG_XEN
2796 -
2797 -#include <xen/interface/xen.h>
2798 -#include <xen/interface/physdev.h>
2799 -
2800 -/* Fake i8259 */
2801 -#define make_8259A_irq(_irq) (io_apic_irqs &= ~(1UL<<(_irq)))
2802 -#define disable_8259A_irq(_irq) ((void)0)
2803 -#define i8259A_irq_pending(_irq) (0)
2804 +#ifndef CONFIG_XEN
2805 +struct io_apic {
2806 + unsigned int index;
2807 + unsigned int unused[3];
2808 + unsigned int data;
2809 +};
2810
2811 -unsigned long io_apic_irqs;
2812 +static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
2813 +{
2814 + return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
2815 + + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
2816 +}
2817 +#endif
2818
2819 -static inline unsigned int xen_io_apic_read(unsigned int apic, unsigned int reg)
2820 +static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
2821 {
2822 +#ifndef CONFIG_XEN
2823 + struct io_apic __iomem *io_apic = io_apic_base(apic);
2824 + writel(reg, &io_apic->index);
2825 + return readl(&io_apic->data);
2826 +#else
2827 struct physdev_apic apic_op;
2828 int ret;
2829
2830 @@ -114,31 +132,131 @@
2831 if (ret)
2832 return ret;
2833 return apic_op.value;
2834 +#endif
2835 }
2836
2837 -static inline void xen_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
2838 +static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
2839 {
2840 +#ifndef CONFIG_XEN
2841 + struct io_apic __iomem *io_apic = io_apic_base(apic);
2842 + writel(reg, &io_apic->index);
2843 + writel(value, &io_apic->data);
2844 +#else
2845 struct physdev_apic apic_op;
2846
2847 apic_op.apic_physbase = mp_ioapics[apic].mpc_apicaddr;
2848 apic_op.reg = reg;
2849 apic_op.value = value;
2850 WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op));
2851 +#endif
2852 }
2853
2854 -#define io_apic_read(a,r) xen_io_apic_read(a,r)
2855 -#define io_apic_write(a,r,v) xen_io_apic_write(a,r,v)
2856 +#ifndef CONFIG_XEN
2857 +/*
2858 + * Re-write a value: to be used for read-modify-write
2859 + * cycles where the read already set up the index register.
2860 + */
2861 +static inline void io_apic_modify(unsigned int apic, unsigned int value)
2862 +{
2863 + struct io_apic __iomem *io_apic = io_apic_base(apic);
2864 + writel(value, &io_apic->data);
2865 +}
2866 +#else
2867 +#define io_apic_modify io_apic_write
2868 +#endif
2869
2870 -#define clear_IO_APIC() ((void)0)
2871 +/*
2872 + * Synchronize the IO-APIC and the CPU by doing
2873 + * a dummy read from the IO-APIC
2874 + */
2875 +static inline void io_apic_sync(unsigned int apic)
2876 +{
2877 +#ifndef CONFIG_XEN
2878 + struct io_apic __iomem *io_apic = io_apic_base(apic);
2879 + readl(&io_apic->data);
2880 +#endif
2881 +}
2882
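
io_apic_read(), io_apic_write() and io_apic_modify() in the non-Xen branch above all go through the IO-APIC's indirect register window: the register number is written to the index register, the payload is then read from or written to the data register, and io_apic_modify() may skip re-writing the index because the preceding read already selected it. A userspace model of that index/data pattern, with an ordinary array standing in for the device's internal registers:

#include <stdint.h>
#include <stdio.h>

/* Toy device: an index/data window in front of 64 internal registers. */
static struct {
	uint32_t index;
	uint32_t data;		/* accesses are steered to internal_regs[index] */
} win;
static uint32_t internal_regs[64];

static uint32_t dev_read(unsigned int reg)
{
	win.index = reg;
	return internal_regs[win.index];
}

static void dev_write(unsigned int reg, uint32_t value)
{
	win.index = reg;
	internal_regs[win.index] = value;
}

/* Re-write without touching the index: only valid straight after a read
 * that already selected the register (the io_apic_modify() trick). */
static void dev_modify(uint32_t value)
{
	internal_regs[win.index] = value;
}

int main(void)
{
	uint32_t reg;

	dev_write(0x10, 0x00010000);		/* masked entry */
	reg = dev_read(0x10);
	dev_modify(reg & ~0x00010000);		/* read-modify-write: unmask */
	printf("reg 0x10 = %08x\n", internal_regs[0x10]);
	return 0;
}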
2883 -#else
2884 +union entry_union {
2885 + struct { u32 w1, w2; };
2886 + struct IO_APIC_route_entry entry;
2887 +};
2888 +
2889 +static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
2890 +{
2891 + union entry_union eu;
2892 + unsigned long flags;
2893 + spin_lock_irqsave(&ioapic_lock, flags);
2894 + eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
2895 + eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
2896 + spin_unlock_irqrestore(&ioapic_lock, flags);
2897 + return eu.entry;
2898 +}
2899 +
2900 +/*
2901 + * When we write a new IO APIC routing entry, we need to write the high
2902 + * word first! If the mask bit in the low word is clear, we will enable
2903 + * the interrupt, and we need to make sure the entry is fully populated
2904 + * before that happens.
2905 + */
2906 +static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
2907 +{
2908 + unsigned long flags;
2909 + union entry_union eu;
2910 + eu.entry = e;
2911 + spin_lock_irqsave(&ioapic_lock, flags);
2912 + io_apic_write(apic, 0x11 + 2*pin, eu.w2);
2913 + io_apic_write(apic, 0x10 + 2*pin, eu.w1);
2914 + spin_unlock_irqrestore(&ioapic_lock, flags);
2915 +}
2916 +
2917 +#ifndef CONFIG_XEN
2918 +/*
2919 + * When we mask an IO APIC routing entry, we need to write the low
2920 + * word first, in order to set the mask bit before we change the
2921 + * high bits!
2922 + */
2923 +static void ioapic_mask_entry(int apic, int pin)
2924 +{
2925 + unsigned long flags;
2926 + union entry_union eu = { .entry.mask = 1 };
2927 +
2928 + spin_lock_irqsave(&ioapic_lock, flags);
2929 + io_apic_write(apic, 0x10 + 2*pin, eu.w1);
2930 + io_apic_write(apic, 0x11 + 2*pin, eu.w2);
2931 + spin_unlock_irqrestore(&ioapic_lock, flags);
2932 +}
2933
2934 #ifdef CONFIG_SMP
2935 +static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
2936 +{
2937 + int apic, pin;
2938 + struct irq_pin_list *entry = irq_2_pin + irq;
2939 +
2940 + BUG_ON(irq >= NR_IRQS);
2941 + for (;;) {
2942 + unsigned int reg;
2943 + apic = entry->apic;
2944 + pin = entry->pin;
2945 + if (pin == -1)
2946 + break;
2947 + io_apic_write(apic, 0x11 + pin*2, dest);
2948 + reg = io_apic_read(apic, 0x10 + pin*2);
2949 + reg &= ~0x000000ff;
2950 + reg |= vector;
2951 + io_apic_modify(apic, reg);
2952 + if (!entry->next)
2953 + break;
2954 + entry = irq_2_pin + entry->next;
2955 + }
2956 +}
2957 +
2958 static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
2959 {
2960 unsigned long flags;
2961 unsigned int dest;
2962 cpumask_t tmp;
2963 + int vector;
2964
2965 cpus_and(tmp, mask, cpu_online_map);
2966 if (cpus_empty(tmp))
2967 @@ -146,7 +264,11 @@
2968
2969 cpus_and(mask, tmp, CPU_MASK_ALL);
2970
2971 - dest = cpu_mask_to_apicid(mask);
2972 + vector = assign_irq_vector(irq, mask, &tmp);
2973 + if (vector < 0)
2974 + return;
2975 +
2976 + dest = cpu_mask_to_apicid(tmp);
2977
2978 /*
2979 * Only the high 8 bits are valid.
2980 @@ -154,13 +276,12 @@
2981 dest = SET_APIC_LOGICAL_ID(dest);
2982
2983 spin_lock_irqsave(&ioapic_lock, flags);
2984 - __DO_ACTION(1, = dest, )
2985 - set_irq_info(irq, mask);
2986 + __target_IO_APIC_irq(irq, dest, vector);
2987 + set_native_irq_info(irq, mask);
2988 spin_unlock_irqrestore(&ioapic_lock, flags);
2989 }
2990 #endif
2991 -
2992 -#endif /* !CONFIG_XEN */
2993 +#endif
2994
2995 /*
2996 * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
2997 @@ -240,24 +361,15 @@
2998 static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
2999 {
3000 struct IO_APIC_route_entry entry;
3001 - unsigned long flags;
3002
3003 /* Check delivery_mode to be sure we're not clearing an SMI pin */
3004 - spin_lock_irqsave(&ioapic_lock, flags);
3005 - *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
3006 - *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
3007 - spin_unlock_irqrestore(&ioapic_lock, flags);
3008 + entry = ioapic_read_entry(apic, pin);
3009 if (entry.delivery_mode == dest_SMI)
3010 return;
3011 /*
3012 * Disable it in the IO-APIC irq-routing table:
3013 */
3014 - memset(&entry, 0, sizeof(entry));
3015 - entry.mask = 1;
3016 - spin_lock_irqsave(&ioapic_lock, flags);
3017 - io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
3018 - io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
3019 - spin_unlock_irqrestore(&ioapic_lock, flags);
3020 + ioapic_mask_entry(apic, pin);
3021 }
3022
3023 static void clear_IO_APIC (void)
3024 @@ -271,16 +383,6 @@
3025
3026 #endif /* !CONFIG_XEN */
3027
3028 -static u8 gsi_2_irq[NR_IRQ_VECTORS] = { [0 ... NR_IRQ_VECTORS-1] = 0xFF };
3029 -
3030 -/*
3031 - * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
3032 - * specific CPU-side IRQs.
3033 - */
3034 -
3035 -#define MAX_PIRQS 8
3036 -static int pirq_entries [MAX_PIRQS];
3037 -static int pirqs_enabled;
3038 int skip_ioapic_setup;
3039 int ioapic_force;
3040
3041 @@ -289,18 +391,17 @@
3042 static int __init disable_ioapic_setup(char *str)
3043 {
3044 skip_ioapic_setup = 1;
3045 - return 1;
3046 + return 0;
3047 }
3048 +early_param("noapic", disable_ioapic_setup);
3049
3050 -static int __init enable_ioapic_setup(char *str)
3051 +/* Actually the next is obsolete, but keep it for paranoid reasons -AK */
3052 +static int __init disable_timer_pin_setup(char *arg)
3053 {
3054 - ioapic_force = 1;
3055 - skip_ioapic_setup = 0;
3056 + disable_timer_pin_1 = 1;
3057 return 1;
3058 }
3059 -
3060 -__setup("noapic", disable_ioapic_setup);
3061 -__setup("apic", enable_ioapic_setup);
3062 +__setup("disable_timer_pin_1", disable_timer_pin_setup);
3063
3064 #ifndef CONFIG_XEN
3065 static int __init setup_disable_8254_timer(char *s)
3066 @@ -318,137 +419,6 @@
3067 __setup("enable_8254_timer", setup_enable_8254_timer);
3068 #endif /* !CONFIG_XEN */
3069
3070 -#include <asm/pci-direct.h>
3071 -#include <linux/pci_ids.h>
3072 -#include <linux/pci.h>
3073 -
3074 -
3075 -#ifdef CONFIG_ACPI
3076 -
3077 -static int nvidia_hpet_detected __initdata;
3078 -
3079 -static int __init nvidia_hpet_check(unsigned long phys, unsigned long size)
3080 -{
3081 - nvidia_hpet_detected = 1;
3082 - return 0;
3083 -}
3084 -#endif
3085 -
3086 -/* Temporary Hack. Nvidia and VIA boards currently only work with IO-APIC
3087 - off. Check for an Nvidia or VIA PCI bridge and turn it off.
3088 - Use pci direct infrastructure because this runs before the PCI subsystem.
3089 -
3090 - Can be overwritten with "apic"
3091 -
3092 - And another hack to disable the IOMMU on VIA chipsets.
3093 -
3094 - ... and others. Really should move this somewhere else.
3095 -
3096 - Kludge-O-Rama. */
3097 -void __init check_ioapic(void)
3098 -{
3099 - int num,slot,func;
3100 - /* Poor man's PCI discovery */
3101 - for (num = 0; num < 32; num++) {
3102 - for (slot = 0; slot < 32; slot++) {
3103 - for (func = 0; func < 8; func++) {
3104 - u32 class;
3105 - u32 vendor;
3106 - u8 type;
3107 - class = read_pci_config(num,slot,func,
3108 - PCI_CLASS_REVISION);
3109 - if (class == 0xffffffff)
3110 - break;
3111 -
3112 - if ((class >> 16) != PCI_CLASS_BRIDGE_PCI)
3113 - continue;
3114 -
3115 - vendor = read_pci_config(num, slot, func,
3116 - PCI_VENDOR_ID);
3117 - vendor &= 0xffff;
3118 - switch (vendor) {
3119 - case PCI_VENDOR_ID_VIA:
3120 -#ifdef CONFIG_IOMMU
3121 - if ((end_pfn > MAX_DMA32_PFN ||
3122 - force_iommu) &&
3123 - !iommu_aperture_allowed) {
3124 - printk(KERN_INFO
3125 - "Looks like a VIA chipset. Disabling IOMMU. Override with \"iommu=allowed\"\n");
3126 - iommu_aperture_disabled = 1;
3127 - }
3128 -#endif
3129 - return;
3130 - case PCI_VENDOR_ID_NVIDIA:
3131 -#ifdef CONFIG_ACPI
3132 - /*
3133 - * All timer overrides on Nvidia are
3134 - * wrong unless HPET is enabled.
3135 - */
3136 - nvidia_hpet_detected = 0;
3137 - acpi_table_parse(ACPI_HPET,
3138 - nvidia_hpet_check);
3139 - if (nvidia_hpet_detected == 0) {
3140 - acpi_skip_timer_override = 1;
3141 - printk(KERN_INFO "Nvidia board "
3142 - "detected. Ignoring ACPI "
3143 - "timer override.\n");
3144 - }
3145 -#endif
3146 - /* RED-PEN skip them on mptables too? */
3147 - return;
3148 - case PCI_VENDOR_ID_ATI:
3149 -
3150 - /* This should be actually default, but
3151 - for 2.6.16 let's do it for ATI only where
3152 - it's really needed. */
3153 -#ifndef CONFIG_XEN
3154 - if (timer_over_8254 == 1) {
3155 - timer_over_8254 = 0;
3156 - printk(KERN_INFO
3157 - "ATI board detected. Disabling timer routing over 8254.\n");
3158 - }
3159 -#endif
3160 - return;
3161 - }
3162 -
3163 -
3164 - /* No multi-function device? */
3165 - type = read_pci_config_byte(num,slot,func,
3166 - PCI_HEADER_TYPE);
3167 - if (!(type & 0x80))
3168 - break;
3169 - }
3170 - }
3171 - }
3172 -}
3173 -
3174 -static int __init ioapic_pirq_setup(char *str)
3175 -{
3176 - int i, max;
3177 - int ints[MAX_PIRQS+1];
3178 -
3179 - get_options(str, ARRAY_SIZE(ints), ints);
3180 -
3181 - for (i = 0; i < MAX_PIRQS; i++)
3182 - pirq_entries[i] = -1;
3183 -
3184 - pirqs_enabled = 1;
3185 - apic_printk(APIC_VERBOSE, "PIRQ redirection, working around broken MP-BIOS.\n");
3186 - max = MAX_PIRQS;
3187 - if (ints[0] < MAX_PIRQS)
3188 - max = ints[0];
3189 -
3190 - for (i = 0; i < max; i++) {
3191 - apic_printk(APIC_VERBOSE, "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
3192 - /*
3193 - * PIRQs are mapped upside down, usually.
3194 - */
3195 - pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
3196 - }
3197 - return 1;
3198 -}
3199 -
3200 -__setup("pirq=", ioapic_pirq_setup);
3201
3202 /*
3203 * Find the IRQ entry number of a certain pin.
3204 @@ -478,9 +448,7 @@
3205 for (i = 0; i < mp_irq_entries; i++) {
3206 int lbus = mp_irqs[i].mpc_srcbus;
3207
3208 - if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
3209 - mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
3210 - mp_bus_id_to_type[lbus] == MP_BUS_MCA) &&
3211 + if (test_bit(lbus, mp_bus_not_pci) &&
3212 (mp_irqs[i].mpc_irqtype == type) &&
3213 (mp_irqs[i].mpc_srcbusirq == irq))
3214
3215 @@ -496,9 +464,7 @@
3216 for (i = 0; i < mp_irq_entries; i++) {
3217 int lbus = mp_irqs[i].mpc_srcbus;
3218
3219 - if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
3220 - mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
3221 - mp_bus_id_to_type[lbus] == MP_BUS_MCA) &&
3222 + if (test_bit(lbus, mp_bus_not_pci) &&
3223 (mp_irqs[i].mpc_irqtype == type) &&
3224 (mp_irqs[i].mpc_srcbusirq == irq))
3225 break;
3226 @@ -539,7 +505,7 @@
3227 mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
3228 break;
3229
3230 - if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
3231 + if (!test_bit(lbus, mp_bus_not_pci) &&
3232 !mp_irqs[i].mpc_irqtype &&
3233 (bus == lbus) &&
3234 (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
3235 @@ -562,27 +528,6 @@
3236 return best_guess;
3237 }
3238
3239 -/*
3240 - * EISA Edge/Level control register, ELCR
3241 - */
3242 -static int EISA_ELCR(unsigned int irq)
3243 -{
3244 - if (irq < 16) {
3245 - unsigned int port = 0x4d0 + (irq >> 3);
3246 - return (inb(port) >> (irq & 7)) & 1;
3247 - }
3248 - apic_printk(APIC_VERBOSE, "Broken MPtable reports ISA irq %d\n", irq);
3249 - return 0;
3250 -}
3251 -
3252 -/* EISA interrupts are always polarity zero and can be edge or level
3253 - * trigger depending on the ELCR value. If an interrupt is listed as
3254 - * EISA conforming in the MP table, that means its trigger type must
3255 - * be read in from the ELCR */
3256 -
3257 -#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
3258 -#define default_EISA_polarity(idx) (0)
3259 -
3260 /* ISA interrupts are always polarity zero edge triggered,
3261 * when listed as conforming in the MP table. */
3262
3263 @@ -595,12 +540,6 @@
3264 #define default_PCI_trigger(idx) (1)
3265 #define default_PCI_polarity(idx) (1)
3266
3267 -/* MCA interrupts are always polarity zero level triggered,
3268 - * when listed as conforming in the MP table. */
3269 -
3270 -#define default_MCA_trigger(idx) (1)
3271 -#define default_MCA_polarity(idx) (0)
3272 -
3273 static int __init MPBIOS_polarity(int idx)
3274 {
3275 int bus = mp_irqs[idx].mpc_srcbus;
3276 @@ -612,38 +551,11 @@
3277 switch (mp_irqs[idx].mpc_irqflag & 3)
3278 {
3279 case 0: /* conforms, ie. bus-type dependent polarity */
3280 - {
3281 - switch (mp_bus_id_to_type[bus])
3282 - {
3283 - case MP_BUS_ISA: /* ISA pin */
3284 - {
3285 - polarity = default_ISA_polarity(idx);
3286 - break;
3287 - }
3288 - case MP_BUS_EISA: /* EISA pin */
3289 - {
3290 - polarity = default_EISA_polarity(idx);
3291 - break;
3292 - }
3293 - case MP_BUS_PCI: /* PCI pin */
3294 - {
3295 - polarity = default_PCI_polarity(idx);
3296 - break;
3297 - }
3298 - case MP_BUS_MCA: /* MCA pin */
3299 - {
3300 - polarity = default_MCA_polarity(idx);
3301 - break;
3302 - }
3303 - default:
3304 - {
3305 - printk(KERN_WARNING "broken BIOS!!\n");
3306 - polarity = 1;
3307 - break;
3308 - }
3309 - }
3310 + if (test_bit(bus, mp_bus_not_pci))
3311 + polarity = default_ISA_polarity(idx);
3312 + else
3313 + polarity = default_PCI_polarity(idx);
3314 break;
3315 - }
3316 case 1: /* high active */
3317 {
3318 polarity = 0;
3319 @@ -681,38 +593,11 @@
3320 switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
3321 {
3322 case 0: /* conforms, ie. bus-type dependent */
3323 - {
3324 - switch (mp_bus_id_to_type[bus])
3325 - {
3326 - case MP_BUS_ISA: /* ISA pin */
3327 - {
3328 - trigger = default_ISA_trigger(idx);
3329 - break;
3330 - }
3331 - case MP_BUS_EISA: /* EISA pin */
3332 - {
3333 - trigger = default_EISA_trigger(idx);
3334 - break;
3335 - }
3336 - case MP_BUS_PCI: /* PCI pin */
3337 - {
3338 - trigger = default_PCI_trigger(idx);
3339 - break;
3340 - }
3341 - case MP_BUS_MCA: /* MCA pin */
3342 - {
3343 - trigger = default_MCA_trigger(idx);
3344 - break;
3345 - }
3346 - default:
3347 - {
3348 - printk(KERN_WARNING "broken BIOS!!\n");
3349 - trigger = 1;
3350 - break;
3351 - }
3352 - }
3353 + if (test_bit(bus, mp_bus_not_pci))
3354 + trigger = default_ISA_trigger(idx);
3355 + else
3356 + trigger = default_PCI_trigger(idx);
3357 break;
3358 - }
3359 case 1: /* edge */
3360 {
3361 trigger = 0;
3362 @@ -749,64 +634,6 @@
3363 return MPBIOS_trigger(idx);
3364 }
3365
3366 -static int next_irq = 16;
3367 -
3368 -/*
3369 - * gsi_irq_sharing -- Name overload! "irq" can be either a legacy IRQ
3370 - * in the range 0-15, a linux IRQ in the range 0-223, or a GSI number
3371 - * from ACPI, which can reach 800 in large boxen.
3372 - *
3373 - * Compact the sparse GSI space into a sequential IRQ series and reuse
3374 - * vectors if possible.
3375 - */
3376 -int gsi_irq_sharing(int gsi)
3377 -{
3378 - int i, tries, vector;
3379 -
3380 - BUG_ON(gsi >= NR_IRQ_VECTORS);
3381 -
3382 - if (platform_legacy_irq(gsi))
3383 - return gsi;
3384 -
3385 - if (gsi_2_irq[gsi] != 0xFF)
3386 - return (int)gsi_2_irq[gsi];
3387 -
3388 - tries = NR_IRQS;
3389 - try_again:
3390 - vector = assign_irq_vector(gsi);
3391 -
3392 - /*
3393 - * Sharing vectors means sharing IRQs, so scan irq_vectors for previous
3394 - * use of vector and if found, return that IRQ. However, we never want
3395 - * to share legacy IRQs, which usually have a different trigger mode
3396 - * than PCI.
3397 - */
3398 - for (i = 0; i < NR_IRQS; i++)
3399 - if (IO_APIC_VECTOR(i) == vector)
3400 - break;
3401 - if (platform_legacy_irq(i)) {
3402 - if (--tries >= 0) {
3403 - IO_APIC_VECTOR(i) = 0;
3404 - goto try_again;
3405 - }
3406 - panic("gsi_irq_sharing: didn't find an IRQ using vector 0x%02X for GSI %d", vector, gsi);
3407 - }
3408 - if (i < NR_IRQS) {
3409 - gsi_2_irq[gsi] = i;
3410 - printk(KERN_INFO "GSI %d sharing vector 0x%02X and IRQ %d\n",
3411 - gsi, vector, i);
3412 - return i;
3413 - }
3414 -
3415 - i = next_irq++;
3416 - BUG_ON(i >= NR_IRQS);
3417 - gsi_2_irq[gsi] = i;
3418 - IO_APIC_VECTOR(i) = vector;
3419 - printk(KERN_INFO "GSI %d assigned vector 0x%02X and IRQ %d\n",
3420 - gsi, vector, i);
3421 - return i;
3422 -}
3423 -
3424 static int pin_2_irq(int idx, int apic, int pin)
3425 {
3426 int irq, i;
3427 @@ -818,49 +645,16 @@
3428 if (mp_irqs[idx].mpc_dstirq != pin)
3429 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
3430
3431 - switch (mp_bus_id_to_type[bus])
3432 - {
3433 - case MP_BUS_ISA: /* ISA pin */
3434 - case MP_BUS_EISA:
3435 - case MP_BUS_MCA:
3436 - {
3437 - irq = mp_irqs[idx].mpc_srcbusirq;
3438 - break;
3439 - }
3440 - case MP_BUS_PCI: /* PCI pin */
3441 - {
3442 - /*
3443 - * PCI IRQs are mapped in order
3444 - */
3445 - i = irq = 0;
3446 - while (i < apic)
3447 - irq += nr_ioapic_registers[i++];
3448 - irq += pin;
3449 - irq = gsi_irq_sharing(irq);
3450 - break;
3451 - }
3452 - default:
3453 - {
3454 - printk(KERN_ERR "unknown bus type %d.\n",bus);
3455 - irq = 0;
3456 - break;
3457 - }
3458 - }
3459 - BUG_ON(irq >= NR_IRQS);
3460 -
3461 - /*
3462 - * PCI IRQ command line redirection. Yes, limits are hardcoded.
3463 - */
3464 - if ((pin >= 16) && (pin <= 23)) {
3465 - if (pirq_entries[pin-16] != -1) {
3466 - if (!pirq_entries[pin-16]) {
3467 - apic_printk(APIC_VERBOSE, "disabling PIRQ%d\n", pin-16);
3468 - } else {
3469 - irq = pirq_entries[pin-16];
3470 - apic_printk(APIC_VERBOSE, "using PIRQ%d -> IRQ %d\n",
3471 - pin-16, irq);
3472 - }
3473 - }
3474 + if (test_bit(bus, mp_bus_not_pci)) {
3475 + irq = mp_irqs[idx].mpc_srcbusirq;
3476 + } else {
3477 + /*
3478 + * PCI IRQs are mapped in order
3479 + */
3480 + i = irq = 0;
3481 + while (i < apic)
3482 + irq += nr_ioapic_registers[i++];
3483 + irq += pin;
3484 }
3485 BUG_ON(irq >= NR_IRQS);
3486 return irq;
3487 @@ -884,43 +678,68 @@
3488 }
3489
3490 /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
3491 -u8 irq_vector[NR_IRQ_VECTORS] __read_mostly;
3492 +static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly;
3493
3494 -int assign_irq_vector(int irq)
3495 +static int __assign_irq_vector(int irq, cpumask_t mask, cpumask_t *result)
3496 {
3497 - unsigned long flags;
3498 int vector;
3499 struct physdev_irq irq_op;
3500
3501 - BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS);
3502 + BUG_ON((unsigned)irq >= NR_IRQ_VECTORS);
3503
3504 - spin_lock_irqsave(&vector_lock, flags);
3505 + cpus_and(*result, mask, cpu_online_map);
3506
3507 - if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) {
3508 - spin_unlock_irqrestore(&vector_lock, flags);
3509 - return IO_APIC_VECTOR(irq);
3510 - }
3511 + if (irq_vector[irq] > 0)
3512 + return irq_vector[irq];
3513
3514 irq_op.irq = irq;
3515 - if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) {
3516 - spin_unlock_irqrestore(&vector_lock, flags);
3517 + if (HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op))
3518 return -ENOSPC;
3519 - }
3520
3521 vector = irq_op.vector;
3522 - vector_irq[vector] = irq;
3523 - if (irq != AUTO_ASSIGN)
3524 - IO_APIC_VECTOR(irq) = vector;
3525 + irq_vector[irq] = vector;
3526
3527 - spin_unlock_irqrestore(&vector_lock, flags);
3528 + return vector;
3529 +}
3530
3531 +static int assign_irq_vector(int irq, cpumask_t mask, cpumask_t *result)
3532 +{
3533 + int vector;
3534 + unsigned long flags;
3535 +
3536 + spin_lock_irqsave(&vector_lock, flags);
3537 + vector = __assign_irq_vector(irq, mask, result);
3538 + spin_unlock_irqrestore(&vector_lock, flags);
3539 return vector;
3540 }
3541
3542 -extern void (*interrupt[NR_IRQS])(void);
3543 #ifndef CONFIG_XEN
3544 -static struct hw_interrupt_type ioapic_level_type;
3545 -static struct hw_interrupt_type ioapic_edge_type;
3546 +void __setup_vector_irq(int cpu)
3547 +{
3548 + /* Initialize vector_irq on a new cpu */
3549 + /* This function must be called with vector_lock held */
3550 + int irq, vector;
3551 +
3552 + /* Mark the inuse vectors */
3553 + for (irq = 0; irq < NR_IRQ_VECTORS; ++irq) {
3554 + if (!cpu_isset(cpu, irq_domain[irq]))
3555 + continue;
3556 + vector = irq_vector[irq];
3557 + per_cpu(vector_irq, cpu)[vector] = irq;
3558 + }
3559 + /* Mark the free vectors */
3560 + for (vector = 0; vector < NR_VECTORS; ++vector) {
3561 + irq = per_cpu(vector_irq, cpu)[vector];
3562 + if (irq < 0)
3563 + continue;
3564 + if (!cpu_isset(cpu, irq_domain[irq]))
3565 + per_cpu(vector_irq, cpu)[vector] = -1;
3566 + }
3567 +}
3568 +
3569 +extern void (*interrupt[NR_IRQS])(void);
3570 +
3571 +static struct irq_chip ioapic_chip;
3572
3573 #define IOAPIC_AUTO -1
3574 #define IOAPIC_EDGE 0
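
__setup_vector_irq() above rebuilds a CPU's per-CPU vector_irq[] translation table from the global irq_vector[] and irq_domain[] state: one pass marks the vectors whose IRQ is routed to this CPU, a second pass clears stale entries for IRQs that no longer are. A single-process toy with the per-CPU array and the cpumask test collapsed to plain variables:

#include <stdio.h>

#define NR_VECTORS	256
#define NR_IRQS		 32	/* reduced for the example */

static int irq_vector[NR_IRQS];		/* irq -> vector */
static int irq_on_this_cpu[NR_IRQS];	/* stand-in for cpu_isset(cpu, irq_domain[irq]) */
static int vector_irq[NR_VECTORS];	/* this CPU's vector -> irq table */

static void setup_vector_irq(void)
{
	int irq, vector;

	/* Pass 1: mark the in-use vectors that belong to this CPU. */
	for (irq = 0; irq < NR_IRQS; irq++) {
		if (!irq_on_this_cpu[irq])
			continue;
		vector_irq[irq_vector[irq]] = irq;
	}
	/* Pass 2: clear stale entries for IRQs no longer routed here. */
	for (vector = 0; vector < NR_VECTORS; vector++) {
		irq = vector_irq[vector];
		if (irq < 0)
			continue;
		if (!irq_on_this_cpu[irq])
			vector_irq[vector] = -1;
	}
}

int main(void)
{
	int v;

	for (v = 0; v < NR_VECTORS; v++)
		vector_irq[v] = -1;
	irq_vector[5] = 0x31;
	irq_on_this_cpu[5] = 1;
	setup_vector_irq();
	printf("vector 0x31 -> irq %d\n", vector_irq[0x31]);
	return 0;
}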
3575 @@ -928,16 +747,15 @@
3576
3577 static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
3578 {
3579 - unsigned idx;
3580 -
3581 - idx = use_pci_vector() && !platform_legacy_irq(irq) ? vector : irq;
3582 -
3583 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
3584 trigger == IOAPIC_LEVEL)
3585 - irq_desc[idx].chip = &ioapic_level_type;
3586 - else
3587 - irq_desc[idx].chip = &ioapic_edge_type;
3588 - set_intr_gate(vector, interrupt[idx]);
3589 + set_irq_chip_and_handler_name(irq, &ioapic_chip,
3590 + handle_fasteoi_irq, "fasteoi");
3591 + else {
3592 + irq_desc[irq].status |= IRQ_DELAYED_DISABLE;
3593 + set_irq_chip_and_handler_name(irq, &ioapic_chip,
3594 + handle_edge_irq, "edge");
3595 + }
3596 }
3597 #else
3598 #define ioapic_register_intr(_irq,_vector,_trigger) ((void)0)
3599 @@ -990,16 +808,21 @@
3600 continue;
3601
3602 if (IO_APIC_IRQ(irq)) {
3603 - vector = assign_irq_vector(irq);
3604 + cpumask_t mask;
3605 + vector = assign_irq_vector(irq, TARGET_CPUS, &mask);
3606 + if (vector < 0)
3607 + continue;
3608 +
3609 + entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask);
3610 entry.vector = vector;
3611
3612 ioapic_register_intr(irq, vector, IOAPIC_AUTO);
3613 if (!apic && (irq < 16))
3614 disable_8259A_irq(irq);
3615 }
3616 + ioapic_write_entry(apic, pin, entry);
3617 +
3618 spin_lock_irqsave(&ioapic_lock, flags);
3619 - io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
3620 - io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
3621 set_native_irq_info(irq, TARGET_CPUS);
3622 spin_unlock_irqrestore(&ioapic_lock, flags);
3623 }
3624 @@ -1042,7 +865,7 @@
3625 * The timer IRQ doesn't have to know that behind the
3626 * scene we have a 8259A-master in AEOI mode ...
3627 */
3628 - irq_desc[0].chip = &ioapic_edge_type;
3629 + set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
3630
3631 /*
3632 * Add it to the IO-APIC irq-routing table:
3633 @@ -1138,10 +961,7 @@
3634 for (i = 0; i <= reg_01.bits.entries; i++) {
3635 struct IO_APIC_route_entry entry;
3636
3637 - spin_lock_irqsave(&ioapic_lock, flags);
3638 - *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2);
3639 - *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2);
3640 - spin_unlock_irqrestore(&ioapic_lock, flags);
3641 + entry = ioapic_read_entry(apic, i);
3642
3643 printk(KERN_DEBUG " %02x %03X %02X ",
3644 i,
3645 @@ -1161,17 +981,12 @@
3646 );
3647 }
3648 }
3649 - if (use_pci_vector())
3650 - printk(KERN_INFO "Using vector-based indexing\n");
3651 printk(KERN_DEBUG "IRQ to pin mappings:\n");
3652 for (i = 0; i < NR_IRQS; i++) {
3653 struct irq_pin_list *entry = irq_2_pin + i;
3654 if (entry->pin < 0)
3655 continue;
3656 - if (use_pci_vector() && !platform_legacy_irq(i))
3657 - printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i));
3658 - else
3659 - printk(KERN_DEBUG "IRQ%d ", i);
3660 + printk(KERN_DEBUG "IRQ%d ", i);
3661 for (;;) {
3662 printk("-> %d:%d", entry->apic, entry->pin);
3663 if (!entry->next)
3664 @@ -1335,9 +1150,6 @@
3665 irq_2_pin[i].pin = -1;
3666 irq_2_pin[i].next = 0;
3667 }
3668 - if (!pirqs_enabled)
3669 - for (i = 0; i < MAX_PIRQS; i++)
3670 - pirq_entries[i] = -1;
3671
3672 /*
3673 * The number of IO-APIC IRQ registers (== #pins):
3674 @@ -1354,11 +1166,7 @@
3675 /* See if any of the pins is in ExtINT mode */
3676 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
3677 struct IO_APIC_route_entry entry;
3678 - spin_lock_irqsave(&ioapic_lock, flags);
3679 - *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
3680 - *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
3681 - spin_unlock_irqrestore(&ioapic_lock, flags);
3682 -
3683 + entry = ioapic_read_entry(apic, pin);
3684
3685 /* If the interrupt line is enabled and in ExtInt mode
3686 * I have found the pin where the i8259 is connected.
3687 @@ -1412,7 +1220,6 @@
3688 */
3689 if (ioapic_i8259.pin != -1) {
3690 struct IO_APIC_route_entry entry;
3691 - unsigned long flags;
3692
3693 memset(&entry, 0, sizeof(entry));
3694 entry.mask = 0; /* Enabled */
3695 @@ -1429,12 +1236,7 @@
3696 /*
3697 * Add it to the IO-APIC irq-routing table:
3698 */
3699 - spin_lock_irqsave(&ioapic_lock, flags);
3700 - io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin,
3701 - *(((int *)&entry)+1));
3702 - io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin,
3703 - *(((int *)&entry)+0));
3704 - spin_unlock_irqrestore(&ioapic_lock, flags);
3705 + ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
3706 }
3707
3708 disconnect_bsp_APIC(ioapic_i8259.pin != -1);
3709 @@ -1442,76 +1244,6 @@
3710 }
3711
3712 /*
3713 - * function to set the IO-APIC physical IDs based on the
3714 - * values stored in the MPC table.
3715 - *
3716 - * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
3717 - */
3718 -
3719 -#ifndef CONFIG_XEN
3720 -static void __init setup_ioapic_ids_from_mpc (void)
3721 -{
3722 - union IO_APIC_reg_00 reg_00;
3723 - int apic;
3724 - int i;
3725 - unsigned char old_id;
3726 - unsigned long flags;
3727 -
3728 - /*
3729 - * Set the IOAPIC ID to the value stored in the MPC table.
3730 - */
3731 - for (apic = 0; apic < nr_ioapics; apic++) {
3732 -
3733 - /* Read the register 0 value */
3734 - spin_lock_irqsave(&ioapic_lock, flags);
3735 - reg_00.raw = io_apic_read(apic, 0);
3736 - spin_unlock_irqrestore(&ioapic_lock, flags);
3737 -
3738 - old_id = mp_ioapics[apic].mpc_apicid;
3739 -
3740 -
3741 - printk(KERN_INFO "Using IO-APIC %d\n", mp_ioapics[apic].mpc_apicid);
3742 -
3743 -
3744 - /*
3745 - * We need to adjust the IRQ routing table
3746 - * if the ID changed.
3747 - */
3748 - if (old_id != mp_ioapics[apic].mpc_apicid)
3749 - for (i = 0; i < mp_irq_entries; i++)
3750 - if (mp_irqs[i].mpc_dstapic == old_id)
3751 - mp_irqs[i].mpc_dstapic
3752 - = mp_ioapics[apic].mpc_apicid;
3753 -
3754 - /*
3755 - * Read the right value from the MPC table and
3756 - * write it into the ID register.
3757 - */
3758 - apic_printk(APIC_VERBOSE,KERN_INFO "...changing IO-APIC physical APIC ID to %d ...",
3759 - mp_ioapics[apic].mpc_apicid);
3760 -
3761 - reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
3762 - spin_lock_irqsave(&ioapic_lock, flags);
3763 - io_apic_write(apic, 0, reg_00.raw);
3764 - spin_unlock_irqrestore(&ioapic_lock, flags);
3765 -
3766 - /*
3767 - * Sanity check
3768 - */
3769 - spin_lock_irqsave(&ioapic_lock, flags);
3770 - reg_00.raw = io_apic_read(apic, 0);
3771 - spin_unlock_irqrestore(&ioapic_lock, flags);
3772 - if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
3773 - printk("could not set ID!\n");
3774 - else
3775 - apic_printk(APIC_VERBOSE," ok.\n");
3776 - }
3777 -}
3778 -#else
3779 -static void __init setup_ioapic_ids_from_mpc(void) { }
3780 -#endif
3781 -
3782 -/*
3783 * There is a nasty bug in some older SMP boards, their mptable lies
3784 * about the timer IRQ. We do the following to work around the situation:
3785 *
3786 @@ -1565,7 +1297,7 @@
3787 * an edge even if it isn't on the 8259A...
3788 */
3789
3790 -static unsigned int startup_edge_ioapic_irq(unsigned int irq)
3791 +static unsigned int startup_ioapic_irq(unsigned int irq)
3792 {
3793 int was_pending = 0;
3794 unsigned long flags;
3795 @@ -1582,107 +1314,19 @@
3796 return was_pending;
3797 }
3798
3799 -/*
3800 - * Once we have recorded IRQ_PENDING already, we can mask the
3801 - * interrupt for real. This prevents IRQ storms from unhandled
3802 - * devices.
3803 - */
3804 -static void ack_edge_ioapic_irq(unsigned int irq)
3805 -{
3806 - move_irq(irq);
3807 - if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
3808 - == (IRQ_PENDING | IRQ_DISABLED))
3809 - mask_IO_APIC_irq(irq);
3810 - ack_APIC_irq();
3811 -}
3812 -
3813 -/*
3814 - * Level triggered interrupts can just be masked,
3815 - * and shutting down and starting up the interrupt
3816 - * is the same as enabling and disabling them -- except
3817 - * with a startup need to return a "was pending" value.
3818 - *
3819 - * Level triggered interrupts are special because we
3820 - * do not touch any IO-APIC register while handling
3821 - * them. We ack the APIC in the end-IRQ handler, not
3822 - * in the start-IRQ-handler. Protection against reentrance
3823 - * from the same interrupt is still provided, both by the
3824 - * generic IRQ layer and by the fact that an unacked local
3825 - * APIC does not accept IRQs.
3826 - */
3827 -static unsigned int startup_level_ioapic_irq (unsigned int irq)
3828 -{
3829 - unmask_IO_APIC_irq(irq);
3830 -
3831 - return 0; /* don't check for pending */
3832 -}
3833 -
3834 -static void end_level_ioapic_irq (unsigned int irq)
3835 -{
3836 - move_irq(irq);
3837 - ack_APIC_irq();
3838 -}
3839 -
3840 -#ifdef CONFIG_PCI_MSI
3841 -static unsigned int startup_edge_ioapic_vector(unsigned int vector)
3842 -{
3843 - int irq = vector_to_irq(vector);
3844 -
3845 - return startup_edge_ioapic_irq(irq);
3846 -}
3847 -
3848 -static void ack_edge_ioapic_vector(unsigned int vector)
3849 -{
3850 - int irq = vector_to_irq(vector);
3851 -
3852 - move_native_irq(vector);
3853 - ack_edge_ioapic_irq(irq);
3854 -}
3855 -
3856 -static unsigned int startup_level_ioapic_vector (unsigned int vector)
3857 -{
3858 - int irq = vector_to_irq(vector);
3859 -
3860 - return startup_level_ioapic_irq (irq);
3861 -}
3862 -
3863 -static void end_level_ioapic_vector (unsigned int vector)
3864 -{
3865 - int irq = vector_to_irq(vector);
3866 -
3867 - move_native_irq(vector);
3868 - end_level_ioapic_irq(irq);
3869 -}
3870 -
3871 -static void mask_IO_APIC_vector (unsigned int vector)
3872 -{
3873 - int irq = vector_to_irq(vector);
3874 -
3875 - mask_IO_APIC_irq(irq);
3876 -}
3877 -
3878 -static void unmask_IO_APIC_vector (unsigned int vector)
3879 -{
3880 - int irq = vector_to_irq(vector);
3881 -
3882 - unmask_IO_APIC_irq(irq);
3883 -}
3884 -
3885 -#ifdef CONFIG_SMP
3886 -static void set_ioapic_affinity_vector (unsigned int vector,
3887 - cpumask_t cpu_mask)
3888 +static int ioapic_retrigger_irq(unsigned int irq)
3889 {
3890 - int irq = vector_to_irq(vector);
3891 + cpumask_t mask;
3892 + unsigned vector;
3893 + unsigned long flags;
3894
3895 - set_native_irq_info(vector, cpu_mask);
3896 - set_ioapic_affinity_irq(irq, cpu_mask);
3897 -}
3898 -#endif // CONFIG_SMP
3899 -#endif // CONFIG_PCI_MSI
3900 + spin_lock_irqsave(&vector_lock, flags);
3901 + vector = irq_vector[irq];
3902 + cpus_clear(mask);
3903 + cpu_set(first_cpu(irq_domain[irq]), mask);
3904
3905 -static int ioapic_retrigger(unsigned int irq)
3906 -{
3907 - send_IPI_self(IO_APIC_VECTOR(irq));
3908 + send_IPI_mask(mask, vector);
3909 + spin_unlock_irqrestore(&vector_lock, flags);
3910
3911 return 1;
3912 }
3913 @@ -1696,32 +1340,47 @@
3914 * races.
3915 */
3916
3917 -static struct hw_interrupt_type ioapic_edge_type __read_mostly = {
3918 - .typename = "IO-APIC-edge",
3919 - .startup = startup_edge_ioapic,
3920 - .shutdown = shutdown_edge_ioapic,
3921 - .enable = enable_edge_ioapic,
3922 - .disable = disable_edge_ioapic,
3923 - .ack = ack_edge_ioapic,
3924 - .end = end_edge_ioapic,
3925 -#ifdef CONFIG_SMP
3926 - .set_affinity = set_ioapic_affinity,
3927 +static void ack_apic_edge(unsigned int irq)
3928 +{
3929 + move_native_irq(irq);
3930 + ack_APIC_irq();
3931 +}
3932 +
3933 +static void ack_apic_level(unsigned int irq)
3934 +{
3935 + int do_unmask_irq = 0;
3936 +
3937 +#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)
3938 + /* If we are moving the irq we need to mask it */
3939 + if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) {
3940 + do_unmask_irq = 1;
3941 + mask_IO_APIC_irq(irq);
3942 + }
3943 #endif
3944 - .retrigger = ioapic_retrigger,
3945 -};
3946
3947 -static struct hw_interrupt_type ioapic_level_type __read_mostly = {
3948 - .typename = "IO-APIC-level",
3949 - .startup = startup_level_ioapic,
3950 - .shutdown = shutdown_level_ioapic,
3951 - .enable = enable_level_ioapic,
3952 - .disable = disable_level_ioapic,
3953 - .ack = mask_and_ack_level_ioapic,
3954 - .end = end_level_ioapic,
3955 + /*
3956 + * We must acknowledge the irq before we move it or the acknowledge will
3957 + * not propogate properly.
3958 + * not propagate properly.
3959 + ack_APIC_irq();
3960 +
3961 + /* Now we can move and re-enable the irq */
3962 + move_masked_irq(irq);
3963 + if (unlikely(do_unmask_irq))
3964 + unmask_IO_APIC_irq(irq);
3965 +}
3966 +
3967 +static struct irq_chip ioapic_chip __read_mostly = {
3968 + .name = "IO-APIC",
3969 + .startup = startup_ioapic_irq,
3970 + .mask = mask_IO_APIC_irq,
3971 + .unmask = unmask_IO_APIC_irq,
3972 + .ack = ack_apic_edge,
3973 + .eoi = ack_apic_level,
3974 #ifdef CONFIG_SMP
3975 - .set_affinity = set_ioapic_affinity,
3976 + .set_affinity = set_ioapic_affinity_irq,
3977 #endif
3978 - .retrigger = ioapic_retrigger,
3979 + .retrigger = ioapic_retrigger_irq,
3980 };
3981 #endif /* !CONFIG_XEN */
3982
3983 @@ -1742,12 +1401,7 @@
3984 */
3985 for (irq = 0; irq < NR_IRQS ; irq++) {
3986 int tmp = irq;
3987 - if (use_pci_vector()) {
3988 - if (!platform_legacy_irq(tmp))
3989 - if ((tmp = vector_to_irq(tmp)) == -1)
3990 - continue;
3991 - }
3992 - if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) {
3993 + if (IO_APIC_IRQ(tmp) && !irq_vector[tmp]) {
3994 /*
3995 * Hmm.. We don't have an entry for this,
3996 * so default to an old-fashioned 8259
3997 @@ -1758,7 +1412,7 @@
3998 #ifndef CONFIG_XEN
3999 else
4000 /* Strange. Oh, well.. */
4001 - irq_desc[irq].chip = &no_irq_type;
4002 + irq_desc[irq].chip = &no_irq_chip;
4003 #endif
4004 }
4005 }
4006 @@ -1879,8 +1533,6 @@
4007 spin_unlock_irqrestore(&ioapic_lock, flags);
4008 }
4009
4010 -int timer_uses_ioapic_pin_0;
4011 -
4012 /*
4013 * This code may look a bit paranoid, but it's supposed to cooperate with
4014 * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
4015 @@ -1893,13 +1545,13 @@
4016 {
4017 int apic1, pin1, apic2, pin2;
4018 int vector;
4019 + cpumask_t mask;
4020
4021 /*
4022 * get/set the timer IRQ vector:
4023 */
4024 disable_8259A_irq(0);
4025 - vector = assign_irq_vector(0);
4026 - set_intr_gate(vector, interrupt[0]);
4027 + vector = assign_irq_vector(0, TARGET_CPUS, &mask);
4028
4029 /*
4030 * Subtle, code in do_timer_interrupt() expects an AEOI
4031 @@ -1918,9 +1570,6 @@
4032 pin2 = ioapic_i8259.pin;
4033 apic2 = ioapic_i8259.apic;
4034
4035 - if (pin1 == 0)
4036 - timer_uses_ioapic_pin_0 = 1;
4037 -
4038 apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
4039 vector, apic1, pin1, apic2, pin2);
4040
4041 @@ -2035,11 +1684,6 @@
4042
4043 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
4044
4045 - /*
4046 - * Set up the IO-APIC IRQ routing table.
4047 - */
4048 - if (!acpi_ioapic)
4049 - setup_ioapic_ids_from_mpc();
4050 #ifndef CONFIG_XEN
4051 sync_Arb_IDs();
4052 #endif /* !CONFIG_XEN */
4053 @@ -2060,17 +1704,12 @@
4054 {
4055 struct IO_APIC_route_entry *entry;
4056 struct sysfs_ioapic_data *data;
4057 - unsigned long flags;
4058 int i;
4059
4060 data = container_of(dev, struct sysfs_ioapic_data, dev);
4061 entry = data->entry;
4062 - spin_lock_irqsave(&ioapic_lock, flags);
4063 - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
4064 - *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i);
4065 - *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i);
4066 - }
4067 - spin_unlock_irqrestore(&ioapic_lock, flags);
4068 + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ )
4069 + *entry = ioapic_read_entry(dev->id, i);
4070
4071 return 0;
4072 }
4073 @@ -2092,11 +1731,9 @@
4074 reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
4075 io_apic_write(dev->id, 0, reg_00.raw);
4076 }
4077 - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
4078 - io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1));
4079 - io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0));
4080 - }
4081 spin_unlock_irqrestore(&ioapic_lock, flags);
4082 + for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
4083 + ioapic_write_entry(dev->id, i, entry[i]);
4084
4085 return 0;
4086 }
4087 @@ -2142,26 +1779,254 @@
4088
4089 device_initcall(ioapic_init_sysfs);
4090
4091 -/* --------------------------------------------------------------------------
4092 - ACPI-based IOAPIC Configuration
4093 - -------------------------------------------------------------------------- */
4094 +#ifndef CONFIG_XEN
4095 +/*
4096 + * Dynamic irq allocate and deallocation
4097 + */
4098 +int create_irq(void)
4099 +{
4100 + /* Allocate an unused irq */
4101 + int irq;
4102 + int new;
4103 + int vector = 0;
4104 + unsigned long flags;
4105 + cpumask_t mask;
4106
4107 -#ifdef CONFIG_ACPI
4108 + irq = -ENOSPC;
4109 + spin_lock_irqsave(&vector_lock, flags);
4110 + for (new = (NR_IRQS - 1); new >= 0; new--) {
4111 + if (platform_legacy_irq(new))
4112 + continue;
4113 + if (irq_vector[new] != 0)
4114 + continue;
4115 + vector = __assign_irq_vector(new, TARGET_CPUS, &mask);
4116 + if (likely(vector > 0))
4117 + irq = new;
4118 + break;
4119 + }
4120 + spin_unlock_irqrestore(&vector_lock, flags);
4121
4122 -#define IO_APIC_MAX_ID 0xFE
4123 + if (irq >= 0) {
4124 + dynamic_irq_init(irq);
4125 + }
4126 + return irq;
4127 +}
4128
4129 -int __init io_apic_get_version (int ioapic)
4130 +void destroy_irq(unsigned int irq)
4131 {
4132 - union IO_APIC_reg_01 reg_01;
4133 unsigned long flags;
4134
4135 - spin_lock_irqsave(&ioapic_lock, flags);
4136 - reg_01.raw = io_apic_read(ioapic, 1);
4137 - spin_unlock_irqrestore(&ioapic_lock, flags);
4138 + dynamic_irq_cleanup(irq);
4139 +
4140 + spin_lock_irqsave(&vector_lock, flags);
4141 + irq_vector[irq] = 0;
4142 + spin_unlock_irqrestore(&vector_lock, flags);
4143 +}
4144 +#endif
4145 +
4146 +/*
4147 + * MSI message composition
4148 + */
4149 +#ifdef CONFIG_PCI_MSI
4150 +static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
4151 +{
4152 + int vector;
4153 + unsigned dest;
4154 + cpumask_t tmp;
4155 +
4156 + vector = assign_irq_vector(irq, TARGET_CPUS, &tmp);
4157 + if (vector >= 0) {
4158 + dest = cpu_mask_to_apicid(tmp);
4159 +
4160 + msg->address_hi = MSI_ADDR_BASE_HI;
4161 + msg->address_lo =
4162 + MSI_ADDR_BASE_LO |
4163 + ((INT_DEST_MODE == 0) ?
4164 + MSI_ADDR_DEST_MODE_PHYSICAL:
4165 + MSI_ADDR_DEST_MODE_LOGICAL) |
4166 + ((INT_DELIVERY_MODE != dest_LowestPrio) ?
4167 + MSI_ADDR_REDIRECTION_CPU:
4168 + MSI_ADDR_REDIRECTION_LOWPRI) |
4169 + MSI_ADDR_DEST_ID(dest);
4170 +
4171 + msg->data =
4172 + MSI_DATA_TRIGGER_EDGE |
4173 + MSI_DATA_LEVEL_ASSERT |
4174 + ((INT_DELIVERY_MODE != dest_LowestPrio) ?
4175 + MSI_DATA_DELIVERY_FIXED:
4176 + MSI_DATA_DELIVERY_LOWPRI) |
4177 + MSI_DATA_VECTOR(vector);
4178 + }
4179 + return vector;
4180 +}
4181 +
4182 +#ifdef CONFIG_SMP
4183 +static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
4184 +{
4185 + struct msi_msg msg;
4186 + unsigned int dest;
4187 + cpumask_t tmp;
4188 + int vector;
4189 +
4190 + cpus_and(tmp, mask, cpu_online_map);
4191 + if (cpus_empty(tmp))
4192 + tmp = TARGET_CPUS;
4193 +
4194 + cpus_and(mask, tmp, CPU_MASK_ALL);
4195 +
4196 + vector = assign_irq_vector(irq, mask, &tmp);
4197 + if (vector < 0)
4198 + return;
4199 +
4200 + dest = cpu_mask_to_apicid(tmp);
4201 +
4202 + read_msi_msg(irq, &msg);
4203 +
4204 + msg.data &= ~MSI_DATA_VECTOR_MASK;
4205 + msg.data |= MSI_DATA_VECTOR(vector);
4206 + msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
4207 + msg.address_lo |= MSI_ADDR_DEST_ID(dest);
4208 +
4209 + write_msi_msg(irq, &msg);
4210 + set_native_irq_info(irq, mask);
4211 +}
4212 +#endif /* CONFIG_SMP */
4213 +
4214 +/*
4215 + * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
4216 + * which implement the MSI or MSI-X Capability Structure.
4217 + */
4218 +static struct irq_chip msi_chip = {
4219 + .name = "PCI-MSI",
4220 + .unmask = unmask_msi_irq,
4221 + .mask = mask_msi_irq,
4222 + .ack = ack_apic_edge,
4223 +#ifdef CONFIG_SMP
4224 + .set_affinity = set_msi_irq_affinity,
4225 +#endif
4226 + .retrigger = ioapic_retrigger_irq,
4227 +};
4228 +
4229 +int arch_setup_msi_irq(unsigned int irq, struct pci_dev *dev)
4230 +{
4231 + struct msi_msg msg;
4232 + int ret;
4233 + ret = msi_compose_msg(dev, irq, &msg);
4234 + if (ret < 0)
4235 + return ret;
4236 +
4237 + write_msi_msg(irq, &msg);
4238 +
4239 + set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
4240 +
4241 + return 0;
4242 +}
4243 +
4244 +void arch_teardown_msi_irq(unsigned int irq)
4245 +{
4246 + return;
4247 +}
4248 +
4249 +#endif /* CONFIG_PCI_MSI */
4250 +
4251 +/*
4252 + * Hypertransport interrupt support
4253 + */
4254 +#ifdef CONFIG_HT_IRQ
4255 +
4256 +#ifdef CONFIG_SMP
4257 +
4258 +static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
4259 +{
4260 + struct ht_irq_msg msg;
4261 + fetch_ht_irq_msg(irq, &msg);
4262 +
4263 + msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK);
4264 + msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
4265
4266 - return reg_01.bits.version;
4267 + msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest);
4268 + msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
4269 +
4270 + write_ht_irq_msg(irq, &msg);
4271 }
4272
4273 +static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
4274 +{
4275 + unsigned int dest;
4276 + cpumask_t tmp;
4277 + int vector;
4278 +
4279 + cpus_and(tmp, mask, cpu_online_map);
4280 + if (cpus_empty(tmp))
4281 + tmp = TARGET_CPUS;
4282 +
4283 + cpus_and(mask, tmp, CPU_MASK_ALL);
4284 +
4285 + vector = assign_irq_vector(irq, mask, &tmp);
4286 + if (vector < 0)
4287 + return;
4288 +
4289 + dest = cpu_mask_to_apicid(tmp);
4290 +
4291 + target_ht_irq(irq, dest, vector);
4292 + set_native_irq_info(irq, mask);
4293 +}
4294 +#endif
4295 +
4296 +static struct irq_chip ht_irq_chip = {
4297 + .name = "PCI-HT",
4298 + .mask = mask_ht_irq,
4299 + .unmask = unmask_ht_irq,
4300 + .ack = ack_apic_edge,
4301 +#ifdef CONFIG_SMP
4302 + .set_affinity = set_ht_irq_affinity,
4303 +#endif
4304 + .retrigger = ioapic_retrigger_irq,
4305 +};
4306 +
4307 +int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
4308 +{
4309 + int vector;
4310 + cpumask_t tmp;
4311 +
4312 + vector = assign_irq_vector(irq, TARGET_CPUS, &tmp);
4313 + if (vector >= 0) {
4314 + struct ht_irq_msg msg;
4315 + unsigned dest;
4316 +
4317 + dest = cpu_mask_to_apicid(tmp);
4318 +
4319 + msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
4320 +
4321 + msg.address_lo =
4322 + HT_IRQ_LOW_BASE |
4323 + HT_IRQ_LOW_DEST_ID(dest) |
4324 + HT_IRQ_LOW_VECTOR(vector) |
4325 + ((INT_DEST_MODE == 0) ?
4326 + HT_IRQ_LOW_DM_PHYSICAL :
4327 + HT_IRQ_LOW_DM_LOGICAL) |
4328 + HT_IRQ_LOW_RQEOI_EDGE |
4329 + ((INT_DELIVERY_MODE != dest_LowestPrio) ?
4330 + HT_IRQ_LOW_MT_FIXED :
4331 + HT_IRQ_LOW_MT_ARBITRATED) |
4332 + HT_IRQ_LOW_IRQ_MASKED;
4333 +
4334 + write_ht_irq_msg(irq, &msg);
4335 +
4336 + set_irq_chip_and_handler_name(irq, &ht_irq_chip,
4337 + handle_edge_irq, "edge");
4338 + }
4339 + return vector;
4340 +}
4341 +#endif /* CONFIG_HT_IRQ */
4342 +
4343 +/* --------------------------------------------------------------------------
4344 + ACPI-based IOAPIC Configuration
4345 + -------------------------------------------------------------------------- */
4346 +
4347 +#ifdef CONFIG_ACPI
4348 +
4349 +#define IO_APIC_MAX_ID 0xFE
4350
4351 int __init io_apic_get_redir_entries (int ioapic)
4352 {
4353 @@ -2180,6 +2045,8 @@
4354 {
4355 struct IO_APIC_route_entry entry;
4356 unsigned long flags;
4357 + int vector;
4358 + cpumask_t mask;
4359
4360 if (!IO_APIC_IRQ(irq)) {
4361 apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
4362 @@ -2188,6 +2055,17 @@
4363 }
4364
4365 /*
4366 + * IRQs < 16 are already in the irq_2_pin[] map
4367 + */
4368 + if (irq >= 16)
4369 + add_pin_to_irq(irq, ioapic, pin);
4370 +
4371 +
4372 + vector = assign_irq_vector(irq, TARGET_CPUS, &mask);
4373 + if (vector < 0)
4374 + return vector;
4375 +
4376 + /*
4377 * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
4378 * Note that we mask (disable) IRQs now -- these get enabled when the
4379 * corresponding device driver registers for this IRQ.
4380 @@ -2197,19 +2075,11 @@
4381
4382 entry.delivery_mode = INT_DELIVERY_MODE;
4383 entry.dest_mode = INT_DEST_MODE;
4384 - entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
4385 + entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask);
4386 entry.trigger = edge_level;
4387 entry.polarity = active_high_low;
4388 entry.mask = 1; /* Disabled (masked) */
4389 -
4390 - irq = gsi_irq_sharing(irq);
4391 - /*
4392 - * IRQs < 16 are already in the irq_2_pin[] map
4393 - */
4394 - if (irq >= 16)
4395 - add_pin_to_irq(irq, ioapic, pin);
4396 -
4397 - entry.vector = assign_irq_vector(irq);
4398 + entry.vector = vector & 0xff;
4399
4400 apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry (%d-%d -> 0x%x -> "
4401 "IRQ %d Mode:%i Active:%i)\n", ioapic,
4402 @@ -2221,10 +2091,10 @@
4403 if (!ioapic && (irq < 16))
4404 disable_8259A_irq(irq);
4405
4406 + ioapic_write_entry(ioapic, pin, entry);
4407 +
4408 spin_lock_irqsave(&ioapic_lock, flags);
4409 - io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
4410 - io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
4411 - set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS);
4412 + set_native_irq_info(irq, TARGET_CPUS);
4413 spin_unlock_irqrestore(&ioapic_lock, flags);
4414
4415 return 0;
4416 --- a/arch/x86/kernel/ioport_64-xen.c
4417 +++ b/arch/x86/kernel/ioport_64-xen.c
4418 @@ -58,6 +58,7 @@
4419
4420 memset(bitmap, 0xff, IO_BITMAP_BYTES);
4421 t->io_bitmap_ptr = bitmap;
4422 + set_thread_flag(TIF_IO_BITMAP);
4423
4424 set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap);
4425 set_iobitmap.nr_ports = IO_BITMAP_BITS;
4426 --- a/arch/x86/kernel/irq_32-xen.c
4427 +++ b/arch/x86/kernel/irq_32-xen.c
4428 @@ -53,8 +53,10 @@
4429 */
4430 fastcall unsigned int do_IRQ(struct pt_regs *regs)
4431 {
4432 + struct pt_regs *old_regs;
4433 /* high bit used in ret_from_ code */
4434 int irq = ~regs->orig_eax;
4435 + struct irq_desc *desc = irq_desc + irq;
4436 #ifdef CONFIG_4KSTACKS
4437 union irq_ctx *curctx, *irqctx;
4438 u32 *isp;
4439 @@ -66,6 +68,7 @@
4440 BUG();
4441 }
4442
4443 + old_regs = set_irq_regs(regs);
4444 irq_enter();
4445 #ifdef CONFIG_DEBUG_STACKOVERFLOW
4446 /* Debugging check for stack overflow: is there less than 1KB free? */
4447 @@ -110,19 +113,20 @@
4448 (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
4449
4450 asm volatile(
4451 - " xchgl %%ebx,%%esp \n"
4452 - " call __do_IRQ \n"
4453 + " xchgl %%ebx,%%esp \n"
4454 + " call *%%edi \n"
4455 " movl %%ebx,%%esp \n"
4456 : "=a" (arg1), "=d" (arg2), "=b" (ebx)
4457 - : "0" (irq), "1" (regs), "2" (isp)
4458 - : "memory", "cc", "ecx"
4459 + : "0" (irq), "1" (desc), "2" (isp),
4460 + "D" (desc->handle_irq)
4461 + : "memory", "cc"
4462 );
4463 } else
4464 #endif
4465 - __do_IRQ(irq, regs);
4466 + desc->handle_irq(irq, desc);
4467
4468 irq_exit();
4469 -
4470 + set_irq_regs(old_regs);
4471 return 1;
4472 }
4473
4474 @@ -253,7 +257,8 @@
4475 for_each_online_cpu(j)
4476 seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
4477 #endif
4478 - seq_printf(p, " %14s", irq_desc[i].chip->typename);
4479 + seq_printf(p, " %8s", irq_desc[i].chip->name);
4480 + seq_printf(p, "-%-8s", irq_desc[i].name);
4481 seq_printf(p, " %s", action->name);
4482
4483 for (action=action->next; action; action = action->next)
4484 --- a/arch/x86/kernel/irq_64-xen.c
4485 +++ b/arch/x86/kernel/irq_64-xen.c
4486 @@ -20,11 +20,6 @@
4487 #include <asm/idle.h>
4488
4489 atomic_t irq_err_count;
4490 -#ifdef CONFIG_X86_IO_APIC
4491 -#ifdef APIC_MISMATCH_DEBUG
4492 -atomic_t irq_mis_count;
4493 -#endif
4494 -#endif
4495
4496 #ifdef CONFIG_DEBUG_STACKOVERFLOW
4497 /*
4498 @@ -79,7 +74,8 @@
4499 for_each_online_cpu(j)
4500 seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
4501 #endif
4502 - seq_printf(p, " %14s", irq_desc[i].chip->typename);
4503 + seq_printf(p, " %8s", irq_desc[i].chip->name);
4504 + seq_printf(p, "-%-8s", irq_desc[i].name);
4505
4506 seq_printf(p, " %s", action->name);
4507 for (action=action->next; action; action = action->next)
4508 @@ -99,11 +95,6 @@
4509 seq_putc(p, '\n');
4510 #endif
4511 seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
4512 -#ifdef CONFIG_X86_IO_APIC
4513 -#ifdef APIC_MISMATCH_DEBUG
4514 - seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
4515 -#endif
4516 -#endif
4517 }
4518 return 0;
4519 }
4520 @@ -114,24 +105,28 @@
4521 * handlers).
4522 */
4523 asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
4524 -{
4525 +{
4526 + struct pt_regs *old_regs = set_irq_regs(regs);
4527 +
4528 /* high bit used in ret_from_ code */
4529 unsigned irq = ~regs->orig_rax;
4530
4531 - if (unlikely(irq >= NR_IRQS)) {
4532 - printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
4533 - __FUNCTION__, irq);
4534 - BUG();
4535 - }
4536 -
4537 exit_idle();
4538 irq_enter();
4539 +
4540 #ifdef CONFIG_DEBUG_STACKOVERFLOW
4541 stack_overflow_check(regs);
4542 #endif
4543 - __do_IRQ(irq, regs);
4544 +
4545 + if (likely(irq < NR_IRQS))
4546 + generic_handle_irq(irq);
4547 + else
4548 + printk(KERN_EMERG "%s: %d.%d No irq handler for irq\n",
4549 + __func__, smp_processor_id(), irq);
4550 +
4551 irq_exit();
4552
4553 + set_irq_regs(old_regs);
4554 return 1;
4555 }
4556
4557 @@ -192,6 +187,6 @@
4558 */
4559 void ack_bad_irq(unsigned int irq)
4560 {
4561 - printk("unexpected IRQ trap at vector %02x\n", irq);
4562 + printk("unexpected IRQ trap at irq %02x\n", irq);
4563 }
4564 #endif
4565 --- a/arch/x86/kernel/ldt_32-xen.c
4566 +++ b/arch/x86/kernel/ldt_32-xen.c
4567 @@ -1,5 +1,5 @@
4568 /*
4569 - * linux/kernel/ldt.c
4570 + * linux/arch/i386/kernel/ldt.c
4571 *
4572 * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
4573 * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
4574 --- a/arch/x86/kernel/microcode-xen.c
4575 +++ b/arch/x86/kernel/microcode-xen.c
4576 @@ -2,6 +2,7 @@
4577 * Intel CPU Microcode Update Driver for Linux
4578 *
4579 * Copyright (C) 2000-2004 Tigran Aivazian
4580 + * 2006 Shaohua Li <shaohua.li@intel.com>
4581 *
4582 * This driver allows to upgrade microcode on Intel processors
4583 * belonging to IA-32 family - PentiumPro, Pentium II,
4584 @@ -33,7 +34,9 @@
4585 #include <linux/spinlock.h>
4586 #include <linux/mm.h>
4587 #include <linux/mutex.h>
4588 -#include <linux/syscalls.h>
4589 +#include <linux/cpu.h>
4590 +#include <linux/firmware.h>
4591 +#include <linux/platform_device.h>
4592
4593 #include <asm/msr.h>
4594 #include <asm/uaccess.h>
4595 @@ -55,12 +58,7 @@
4596 /* no concurrent ->write()s are allowed on /dev/cpu/microcode */
4597 static DEFINE_MUTEX(microcode_mutex);
4598
4599 -static int microcode_open (struct inode *unused1, struct file *unused2)
4600 -{
4601 - return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
4602 -}
4603 -
4604 -
4605 +#ifdef CONFIG_MICROCODE_OLD_INTERFACE
4606 static int do_microcode_update (const void __user *ubuf, size_t len)
4607 {
4608 int err;
4609 @@ -85,6 +83,11 @@
4610 return err;
4611 }
4612
4613 +static int microcode_open (struct inode *unused1, struct file *unused2)
4614 +{
4615 + return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
4616 +}
4617 +
4618 static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos)
4619 {
4620 ssize_t ret;
4621 @@ -117,7 +120,7 @@
4622 .fops = &microcode_fops,
4623 };
4624
4625 -static int __init microcode_init (void)
4626 +static int __init microcode_dev_init (void)
4627 {
4628 int error;
4629
4630 @@ -129,6 +132,68 @@
4631 return error;
4632 }
4633
4634 + return 0;
4635 +}
4636 +
4637 +static void __exit microcode_dev_exit (void)
4638 +{
4639 + misc_deregister(&microcode_dev);
4640 +}
4641 +
4642 +MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
4643 +#else
4644 +#define microcode_dev_init() 0
4645 +#define microcode_dev_exit() do { } while(0)
4646 +#endif
4647 +
4648 +/* fake device for request_firmware */
4649 +static struct platform_device *microcode_pdev;
4650 +
4651 +static int request_microcode(void)
4652 +{
4653 + char name[30];
4654 + const struct cpuinfo_x86 *c = &boot_cpu_data;
4655 + const struct firmware *firmware;
4656 + int error;
4657 + struct xen_platform_op op;
4658 +
4659 + sprintf(name,"intel-ucode/%02x-%02x-%02x",
4660 + c->x86, c->x86_model, c->x86_mask);
4661 + error = request_firmware(&firmware, name, &microcode_pdev->dev);
4662 + if (error) {
4663 + pr_debug("ucode data file %s load failed\n", name);
4664 + return error;
4665 + }
4666 +
4667 + op.cmd = XENPF_microcode_update;
4668 + set_xen_guest_handle(op.u.microcode.data, (void *)firmware->data);
4669 + op.u.microcode.length = firmware->size;
4670 + error = HYPERVISOR_platform_op(&op);
4671 +
4672 + release_firmware(firmware);
4673 +
4674 + if (error)
4675 + pr_debug("ucode load failed\n");
4676 +
4677 + return error;
4678 +}
4679 +
4680 +static int __init microcode_init (void)
4681 +{
4682 + int error;
4683 +
4684 + error = microcode_dev_init();
4685 + if (error)
4686 + return error;
4687 + microcode_pdev = platform_device_register_simple("microcode", -1,
4688 + NULL, 0);
4689 + if (IS_ERR(microcode_pdev)) {
4690 + microcode_dev_exit();
4691 + return PTR_ERR(microcode_pdev);
4692 + }
4693 +
4694 + request_microcode();
4695 +
4696 printk(KERN_INFO
4697 "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@veritas.com>\n");
4698 return 0;
4699 @@ -136,9 +201,9 @@
4700
4701 static void __exit microcode_exit (void)
4702 {
4703 - misc_deregister(&microcode_dev);
4704 + microcode_dev_exit();
4705 + platform_device_unregister(microcode_pdev);
4706 }
4707
4708 module_init(microcode_init)
4709 module_exit(microcode_exit)
4710 -MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
4711 --- a/arch/x86/kernel/mpparse_32-xen.c
4712 +++ b/arch/x86/kernel/mpparse_32-xen.c
4713 @@ -30,6 +30,7 @@
4714 #include <asm/io_apic.h>
4715
4716 #include <mach_apic.h>
4717 +#include <mach_apicdef.h>
4718 #include <mach_mpparse.h>
4719 #include <bios_ebda.h>
4720
4721 @@ -68,7 +69,7 @@
4722 /* Processor that is doing the boot up */
4723 unsigned int boot_cpu_physical_apicid = -1U;
4724 /* Internal processor count */
4725 -static unsigned int __devinitdata num_processors;
4726 +unsigned int __cpuinitdata num_processors;
4727
4728 /* Bitmask of physically existing CPUs */
4729 physid_mask_t phys_cpu_present_map;
4730 @@ -235,12 +236,14 @@
4731
4732 mpc_oem_bus_info(m, str, translation_table[mpc_record]);
4733
4734 +#if MAX_MP_BUSSES < 256
4735 if (m->mpc_busid >= MAX_MP_BUSSES) {
4736 printk(KERN_WARNING "MP table busid value (%d) for bustype %s "
4737 " is too large, max. supported is %d\n",
4738 m->mpc_busid, str, MAX_MP_BUSSES - 1);
4739 return;
4740 }
4741 +#endif
4742
4743 if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) {
4744 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
4745 @@ -300,19 +303,6 @@
4746 m->mpc_irqtype, m->mpc_irqflag & 3,
4747 (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
4748 m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
4749 - /*
4750 - * Well it seems all SMP boards in existence
4751 - * use ExtINT/LVT1 == LINT0 and
4752 - * NMI/LVT2 == LINT1 - the following check
4753 - * will show us if this assumptions is false.
4754 - * Until then we do not have to add baggage.
4755 - */
4756 - if ((m->mpc_irqtype == mp_ExtINT) &&
4757 - (m->mpc_destapiclint != 0))
4758 - BUG();
4759 - if ((m->mpc_irqtype == mp_NMI) &&
4760 - (m->mpc_destapiclint != 1))
4761 - BUG();
4762 }
4763
4764 #ifdef CONFIG_X86_NUMAQ
4765 @@ -838,8 +828,7 @@
4766
4767 #ifdef CONFIG_ACPI
4768
4769 -void __init mp_register_lapic_address (
4770 - u64 address)
4771 +void __init mp_register_lapic_address(u64 address)
4772 {
4773 #ifndef CONFIG_XEN
4774 mp_lapic_addr = (unsigned long) address;
4775 @@ -853,13 +842,10 @@
4776 #endif
4777 }
4778
4779 -
4780 -void __devinit mp_register_lapic (
4781 - u8 id,
4782 - u8 enabled)
4783 +void __devinit mp_register_lapic (u8 id, u8 enabled)
4784 {
4785 struct mpc_config_processor processor;
4786 - int boot_cpu = 0;
4787 + int boot_cpu = 0;
4788
4789 if (MAX_APICS - id <= 0) {
4790 printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
4791 @@ -898,11 +884,9 @@
4792 u32 pin_programmed[4];
4793 } mp_ioapic_routing[MAX_IO_APICS];
4794
4795 -
4796 -static int mp_find_ioapic (
4797 - int gsi)
4798 +static int mp_find_ioapic (int gsi)
4799 {
4800 - int i = 0;
4801 + int i = 0;
4802
4803 /* Find the IOAPIC that manages this GSI. */
4804 for (i = 0; i < nr_ioapics; i++) {
4805 @@ -915,15 +899,11 @@
4806
4807 return -1;
4808 }
4809 -
4810
4811 -void __init mp_register_ioapic (
4812 - u8 id,
4813 - u32 address,
4814 - u32 gsi_base)
4815 +void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
4816 {
4817 - int idx = 0;
4818 - int tmpid;
4819 + int idx = 0;
4820 + int tmpid;
4821
4822 if (nr_ioapics >= MAX_IO_APICS) {
4823 printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
4824 @@ -971,16 +951,10 @@
4825 mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
4826 mp_ioapic_routing[idx].gsi_base,
4827 mp_ioapic_routing[idx].gsi_end);
4828 -
4829 - return;
4830 }
4831
4832 -
4833 -void __init mp_override_legacy_irq (
4834 - u8 bus_irq,
4835 - u8 polarity,
4836 - u8 trigger,
4837 - u32 gsi)
4838 +void __init
4839 +mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
4840 {
4841 struct mpc_config_intsrc intsrc;
4842 int ioapic = -1;
4843 @@ -1018,15 +992,13 @@
4844 mp_irqs[mp_irq_entries] = intsrc;
4845 if (++mp_irq_entries == MAX_IRQ_SOURCES)
4846 panic("Max # of irq sources exceeded!\n");
4847 -
4848 - return;
4849 }
4850
4851 void __init mp_config_acpi_legacy_irqs (void)
4852 {
4853 struct mpc_config_intsrc intsrc;
4854 - int i = 0;
4855 - int ioapic = -1;
4856 + int i = 0;
4857 + int ioapic = -1;
4858
4859 /*
4860 * Fabricate the legacy ISA bus (bus #31).
4861 @@ -1095,12 +1067,12 @@
4862
4863 #define MAX_GSI_NUM 4096
4864
4865 -int mp_register_gsi (u32 gsi, int triggering, int polarity)
4866 +int mp_register_gsi(u32 gsi, int triggering, int polarity)
4867 {
4868 - int ioapic = -1;
4869 - int ioapic_pin = 0;
4870 - int idx, bit = 0;
4871 - static int pci_irq = 16;
4872 + int ioapic = -1;
4873 + int ioapic_pin = 0;
4874 + int idx, bit = 0;
4875 + static int pci_irq = 16;
4876 /*
4877 * Mapping between Global System Interrups, which
4878 * represent all possible interrupts, and IRQs
4879 --- a/arch/x86/kernel/mpparse_64-xen.c
4880 +++ b/arch/x86/kernel/mpparse_64-xen.c
4881 @@ -41,8 +41,7 @@
4882 * Various Linux-internal data structures created from the
4883 * MP-table.
4884 */
4885 -unsigned char apic_version [MAX_APICS];
4886 -unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
4887 +DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
4888 int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
4889
4890 static int mp_current_pci_id = 0;
4891 @@ -56,7 +55,6 @@
4892 int mp_irq_entries;
4893
4894 int nr_ioapics;
4895 -int pic_mode;
4896 unsigned long mp_lapic_addr = 0;
4897
4898
4899 @@ -71,19 +69,6 @@
4900 /* Bitmask of physically existing CPUs */
4901 physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
4902
4903 -/* ACPI MADT entry parsing functions */
4904 -#ifdef CONFIG_ACPI
4905 -extern struct acpi_boot_flags acpi_boot;
4906 -#ifdef CONFIG_X86_LOCAL_APIC
4907 -extern int acpi_parse_lapic (acpi_table_entry_header *header);
4908 -extern int acpi_parse_lapic_addr_ovr (acpi_table_entry_header *header);
4909 -extern int acpi_parse_lapic_nmi (acpi_table_entry_header *header);
4910 -#endif /*CONFIG_X86_LOCAL_APIC*/
4911 -#ifdef CONFIG_X86_IO_APIC
4912 -extern int acpi_parse_ioapic (acpi_table_entry_header *header);
4913 -#endif /*CONFIG_X86_IO_APIC*/
4914 -#endif /*CONFIG_ACPI*/
4915 -
4916 u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
4917
4918
4919 @@ -109,24 +94,20 @@
4920 static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
4921 {
4922 int cpu;
4923 - unsigned char ver;
4924 cpumask_t tmp_map;
4925 + char *bootup_cpu = "";
4926
4927 if (!(m->mpc_cpuflag & CPU_ENABLED)) {
4928 disabled_cpus++;
4929 return;
4930 }
4931 -
4932 - printk(KERN_INFO "Processor #%d %d:%d APIC version %d\n",
4933 - m->mpc_apicid,
4934 - (m->mpc_cpufeature & CPU_FAMILY_MASK)>>8,
4935 - (m->mpc_cpufeature & CPU_MODEL_MASK)>>4,
4936 - m->mpc_apicver);
4937 -
4938 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
4939 - Dprintk(" Bootup CPU\n");
4940 + bootup_cpu = " (Bootup-CPU)";
4941 boot_cpu_id = m->mpc_apicid;
4942 }
4943 +
4944 + printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu);
4945 +
4946 if (num_processors >= NR_CPUS) {
4947 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
4948 " Processor ignored.\n", NR_CPUS);
4949 @@ -137,24 +118,7 @@
4950 cpus_complement(tmp_map, cpu_present_map);
4951 cpu = first_cpu(tmp_map);
4952
4953 -#if MAX_APICS < 255
4954 - if ((int)m->mpc_apicid > MAX_APICS) {
4955 - printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
4956 - m->mpc_apicid, MAX_APICS);
4957 - return;
4958 - }
4959 -#endif
4960 - ver = m->mpc_apicver;
4961 -
4962 physid_set(m->mpc_apicid, phys_cpu_present_map);
4963 - /*
4964 - * Validate version
4965 - */
4966 - if (ver == 0x0) {
4967 - printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! fixing up to 0x10. (tell your hw vendor)\n", m->mpc_apicid);
4968 - ver = 0x10;
4969 - }
4970 - apic_version[m->mpc_apicid] = ver;
4971 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
4972 /*
4973 * bios_cpu_apicid is required to have processors listed
4974 @@ -185,37 +149,42 @@
4975 Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
4976
4977 if (strncmp(str, "ISA", 3) == 0) {
4978 - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
4979 - } else if (strncmp(str, "EISA", 4) == 0) {
4980 - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
4981 + set_bit(m->mpc_busid, mp_bus_not_pci);
4982 } else if (strncmp(str, "PCI", 3) == 0) {
4983 - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
4984 + clear_bit(m->mpc_busid, mp_bus_not_pci);
4985 mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
4986 mp_current_pci_id++;
4987 - } else if (strncmp(str, "MCA", 3) == 0) {
4988 - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
4989 } else {
4990 printk(KERN_ERR "Unknown bustype %s\n", str);
4991 }
4992 }
4993
4994 +static int bad_ioapic(unsigned long address)
4995 +{
4996 + if (nr_ioapics >= MAX_IO_APICS) {
4997 + printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
4998 + "(found %d)\n", MAX_IO_APICS, nr_ioapics);
4999 + panic("Recompile kernel with bigger MAX_IO_APICS!\n");
5000 + }
5001 + if (!address) {
5002 + printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
5003 + " found in table, skipping!\n");
5004 + return 1;
5005 + }
5006 + return 0;
5007 +}
5008 +
5009 static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
5010 {
5011 if (!(m->mpc_flags & MPC_APIC_USABLE))
5012 return;
5013
5014 - printk("I/O APIC #%d Version %d at 0x%X.\n",
5015 - m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
5016 - if (nr_ioapics >= MAX_IO_APICS) {
5017 - printk(KERN_ERR "Max # of I/O APICs (%d) exceeded (found %d).\n",
5018 - MAX_IO_APICS, nr_ioapics);
5019 - panic("Recompile kernel with bigger MAX_IO_APICS!.\n");
5020 - }
5021 - if (!m->mpc_apicaddr) {
5022 - printk(KERN_ERR "WARNING: bogus zero I/O APIC address"
5023 - " found in MP table, skipping!\n");
5024 + printk("I/O APIC #%d at 0x%X.\n",
5025 + m->mpc_apicid, m->mpc_apicaddr);
5026 +
5027 + if (bad_ioapic(m->mpc_apicaddr))
5028 return;
5029 - }
5030 +
5031 mp_ioapics[nr_ioapics] = *m;
5032 nr_ioapics++;
5033 }
5034 @@ -239,19 +208,6 @@
5035 m->mpc_irqtype, m->mpc_irqflag & 3,
5036 (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
5037 m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
5038 - /*
5039 - * Well it seems all SMP boards in existence
5040 - * use ExtINT/LVT1 == LINT0 and
5041 - * NMI/LVT2 == LINT1 - the following check
5042 - * will show us if this assumptions is false.
5043 - * Until then we do not have to add baggage.
5044 - */
5045 - if ((m->mpc_irqtype == mp_ExtINT) &&
5046 - (m->mpc_destapiclint != 0))
5047 - BUG();
5048 - if ((m->mpc_irqtype == mp_NMI) &&
5049 - (m->mpc_destapiclint != 1))
5050 - BUG();
5051 }
5052
5053 /*
5054 @@ -265,7 +221,7 @@
5055 unsigned char *mpt=((unsigned char *)mpc)+count;
5056
5057 if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
5058 - printk("SMP mptable: bad signature [%c%c%c%c]!\n",
5059 + printk("MPTABLE: bad signature [%c%c%c%c]!\n",
5060 mpc->mpc_signature[0],
5061 mpc->mpc_signature[1],
5062 mpc->mpc_signature[2],
5063 @@ -273,31 +229,31 @@
5064 return 0;
5065 }
5066 if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
5067 - printk("SMP mptable: checksum error!\n");
5068 + printk("MPTABLE: checksum error!\n");
5069 return 0;
5070 }
5071 if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
5072 - printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n",
5073 + printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n",
5074 mpc->mpc_spec);
5075 return 0;
5076 }
5077 if (!mpc->mpc_lapic) {
5078 - printk(KERN_ERR "SMP mptable: null local APIC address!\n");
5079 + printk(KERN_ERR "MPTABLE: null local APIC address!\n");
5080 return 0;
5081 }
5082 memcpy(str,mpc->mpc_oem,8);
5083 - str[8]=0;
5084 - printk(KERN_INFO "OEM ID: %s ",str);
5085 + str[8] = 0;
5086 + printk(KERN_INFO "MPTABLE: OEM ID: %s ",str);
5087
5088 memcpy(str,mpc->mpc_productid,12);
5089 - str[12]=0;
5090 - printk("Product ID: %s ",str);
5091 + str[12] = 0;
5092 + printk("MPTABLE: Product ID: %s ",str);
5093
5094 - printk("APIC at: 0x%X\n",mpc->mpc_lapic);
5095 + printk("MPTABLE: APIC at: 0x%X\n",mpc->mpc_lapic);
5096
5097 /* save the local APIC address, it might be non-default */
5098 if (!acpi_lapic)
5099 - mp_lapic_addr = mpc->mpc_lapic;
5100 + mp_lapic_addr = mpc->mpc_lapic;
5101
5102 /*
5103 * Now process the configuration blocks.
5104 @@ -309,7 +265,7 @@
5105 struct mpc_config_processor *m=
5106 (struct mpc_config_processor *)mpt;
5107 if (!acpi_lapic)
5108 - MP_processor_info(m);
5109 + MP_processor_info(m);
5110 mpt += sizeof(*m);
5111 count += sizeof(*m);
5112 break;
5113 @@ -328,8 +284,8 @@
5114 struct mpc_config_ioapic *m=
5115 (struct mpc_config_ioapic *)mpt;
5116 MP_ioapic_info(m);
5117 - mpt+=sizeof(*m);
5118 - count+=sizeof(*m);
5119 + mpt += sizeof(*m);
5120 + count += sizeof(*m);
5121 break;
5122 }
5123 case MP_INTSRC:
5124 @@ -338,8 +294,8 @@
5125 (struct mpc_config_intsrc *)mpt;
5126
5127 MP_intsrc_info(m);
5128 - mpt+=sizeof(*m);
5129 - count+=sizeof(*m);
5130 + mpt += sizeof(*m);
5131 + count += sizeof(*m);
5132 break;
5133 }
5134 case MP_LINTSRC:
5135 @@ -347,15 +303,15 @@
5136 struct mpc_config_lintsrc *m=
5137 (struct mpc_config_lintsrc *)mpt;
5138 MP_lintsrc_info(m);
5139 - mpt+=sizeof(*m);
5140 - count+=sizeof(*m);
5141 + mpt += sizeof(*m);
5142 + count += sizeof(*m);
5143 break;
5144 }
5145 }
5146 }
5147 clustered_apic_check();
5148 if (!num_processors)
5149 - printk(KERN_ERR "SMP mptable: no processors registered!\n");
5150 + printk(KERN_ERR "MPTABLE: no processors registered!\n");
5151 return num_processors;
5152 }
5153
5154 @@ -451,13 +407,10 @@
5155 * 2 CPUs, numbered 0 & 1.
5156 */
5157 processor.mpc_type = MP_PROCESSOR;
5158 - /* Either an integrated APIC or a discrete 82489DX. */
5159 - processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
5160 + processor.mpc_apicver = 0;
5161 processor.mpc_cpuflag = CPU_ENABLED;
5162 - processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
5163 - (boot_cpu_data.x86_model << 4) |
5164 - boot_cpu_data.x86_mask;
5165 - processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
5166 + processor.mpc_cpufeature = 0;
5167 + processor.mpc_featureflag = 0;
5168 processor.mpc_reserved[0] = 0;
5169 processor.mpc_reserved[1] = 0;
5170 for (i = 0; i < 2; i++) {
5171 @@ -476,14 +429,6 @@
5172 case 5:
5173 memcpy(bus.mpc_bustype, "ISA ", 6);
5174 break;
5175 - case 2:
5176 - case 6:
5177 - case 3:
5178 - memcpy(bus.mpc_bustype, "EISA ", 6);
5179 - break;
5180 - case 4:
5181 - case 7:
5182 - memcpy(bus.mpc_bustype, "MCA ", 6);
5183 }
5184 MP_bus_info(&bus);
5185 if (mpc_default_type > 4) {
5186 @@ -494,7 +439,7 @@
5187
5188 ioapic.mpc_type = MP_IOAPIC;
5189 ioapic.mpc_apicid = 2;
5190 - ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
5191 + ioapic.mpc_apicver = 0;
5192 ioapic.mpc_flags = MPC_APIC_USABLE;
5193 ioapic.mpc_apicaddr = 0xFEC00000;
5194 MP_ioapic_info(&ioapic);
5195 @@ -537,13 +482,6 @@
5196 printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
5197
5198 printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
5199 - if (mpf->mpf_feature2 & (1<<7)) {
5200 - printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
5201 - pic_mode = 1;
5202 - } else {
5203 - printk(KERN_INFO " Virtual Wire compatibility mode.\n");
5204 - pic_mode = 0;
5205 - }
5206
5207 /*
5208 * Now see if we need to read further.
5209 @@ -620,7 +558,7 @@
5210 return 0;
5211 }
5212
5213 -void __init find_intel_smp (void)
5214 +void __init find_smp_config(void)
5215 {
5216 unsigned int address;
5217
5218 @@ -637,9 +575,7 @@
5219 smp_scan_config(0xF0000,0x10000))
5220 return;
5221 /*
5222 - * If it is an SMP machine we should know now, unless the
5223 - * configuration is in an EISA/MCA bus machine with an
5224 - * extended bios data area.
5225 + * If it is an SMP machine we should know now.
5226 *
5227 * there is a real-mode segmented pointer pointing to the
5228 * 4K EBDA area at 0x40E, calculate and scan it here.
5229 @@ -660,64 +596,38 @@
5230 printk(KERN_INFO "No mptable found.\n");
5231 }
5232
5233 -/*
5234 - * - Intel MP Configuration Table
5235 - */
5236 -void __init find_smp_config (void)
5237 -{
5238 -#ifdef CONFIG_X86_LOCAL_APIC
5239 - find_intel_smp();
5240 -#endif
5241 -}
5242 -
5243 -
5244 /* --------------------------------------------------------------------------
5245 ACPI-based MP Configuration
5246 -------------------------------------------------------------------------- */
5247
5248 #ifdef CONFIG_ACPI
5249
5250 -void __init mp_register_lapic_address (
5251 - u64 address)
5252 +void __init mp_register_lapic_address(u64 address)
5253 {
5254 #ifndef CONFIG_XEN
5255 mp_lapic_addr = (unsigned long) address;
5256 -
5257 set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
5258 -
5259 if (boot_cpu_id == -1U)
5260 boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
5261 -
5262 - Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
5263 #endif
5264 }
5265
5266 -
5267 -void __cpuinit mp_register_lapic (
5268 - u8 id,
5269 - u8 enabled)
5270 +void __cpuinit mp_register_lapic (u8 id, u8 enabled)
5271 {
5272 struct mpc_config_processor processor;
5273 int boot_cpu = 0;
5274
5275 - if (id >= MAX_APICS) {
5276 - printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
5277 - id, MAX_APICS);
5278 - return;
5279 - }
5280 -
5281 - if (id == boot_cpu_physical_apicid)
5282 + if (id == boot_cpu_id)
5283 boot_cpu = 1;
5284
5285 #ifndef CONFIG_XEN
5286 processor.mpc_type = MP_PROCESSOR;
5287 processor.mpc_apicid = id;
5288 - processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR));
5289 + processor.mpc_apicver = 0;
5290 processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
5291 processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
5292 - processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
5293 - (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
5294 - processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
5295 + processor.mpc_cpufeature = 0;
5296 + processor.mpc_featureflag = 0;
5297 processor.mpc_reserved[0] = 0;
5298 processor.mpc_reserved[1] = 0;
5299 #endif
5300 @@ -725,8 +635,6 @@
5301 MP_processor_info(&processor);
5302 }
5303
5304 -#ifdef CONFIG_X86_IO_APIC
5305 -
5306 #define MP_ISA_BUS 0
5307 #define MP_MAX_IOAPIC_PIN 127
5308
5309 @@ -737,11 +645,9 @@
5310 u32 pin_programmed[4];
5311 } mp_ioapic_routing[MAX_IO_APICS];
5312
5313 -
5314 -static int mp_find_ioapic (
5315 - int gsi)
5316 +static int mp_find_ioapic(int gsi)
5317 {
5318 - int i = 0;
5319 + int i = 0;
5320
5321 /* Find the IOAPIC that manages this GSI. */
5322 for (i = 0; i < nr_ioapics; i++) {
5323 @@ -751,28 +657,15 @@
5324 }
5325
5326 printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
5327 -
5328 return -1;
5329 }
5330 -
5331
5332 -void __init mp_register_ioapic (
5333 - u8 id,
5334 - u32 address,
5335 - u32 gsi_base)
5336 +void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
5337 {
5338 - int idx = 0;
5339 + int idx = 0;
5340
5341 - if (nr_ioapics >= MAX_IO_APICS) {
5342 - printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
5343 - "(found %d)\n", MAX_IO_APICS, nr_ioapics);
5344 - panic("Recompile kernel with bigger MAX_IO_APICS!\n");
5345 - }
5346 - if (!address) {
5347 - printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
5348 - " found in MADT table, skipping!\n");
5349 + if (bad_ioapic(address))
5350 return;
5351 - }
5352
5353 idx = nr_ioapics++;
5354
5355 @@ -784,7 +677,7 @@
5356 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
5357 #endif
5358 mp_ioapics[idx].mpc_apicid = id;
5359 - mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
5360 + mp_ioapics[idx].mpc_apicver = 0;
5361
5362 /*
5363 * Build basic IRQ lookup table to facilitate gsi->io_apic lookups
5364 @@ -795,21 +688,15 @@
5365 mp_ioapic_routing[idx].gsi_end = gsi_base +
5366 io_apic_get_redir_entries(idx);
5367
5368 - printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
5369 + printk(KERN_INFO "IOAPIC[%d]: apic_id %d, address 0x%x, "
5370 "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
5371 - mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
5372 + mp_ioapics[idx].mpc_apicaddr,
5373 mp_ioapic_routing[idx].gsi_start,
5374 mp_ioapic_routing[idx].gsi_end);
5375 -
5376 - return;
5377 }
5378
5379 -
5380 -void __init mp_override_legacy_irq (
5381 - u8 bus_irq,
5382 - u8 polarity,
5383 - u8 trigger,
5384 - u32 gsi)
5385 +void __init
5386 +mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
5387 {
5388 struct mpc_config_intsrc intsrc;
5389 int ioapic = -1;
5390 @@ -847,22 +734,18 @@
5391 mp_irqs[mp_irq_entries] = intsrc;
5392 if (++mp_irq_entries == MAX_IRQ_SOURCES)
5393 panic("Max # of irq sources exceeded!\n");
5394 -
5395 - return;
5396 }
5397
5398 -
5399 -void __init mp_config_acpi_legacy_irqs (void)
5400 +void __init mp_config_acpi_legacy_irqs(void)
5401 {
5402 struct mpc_config_intsrc intsrc;
5403 - int i = 0;
5404 - int ioapic = -1;
5405 + int i = 0;
5406 + int ioapic = -1;
5407
5408 /*
5409 * Fabricate the legacy ISA bus (bus #31).
5410 */
5411 - mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
5412 - Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
5413 + set_bit(MP_ISA_BUS, mp_bus_not_pci);
5414
5415 /*
5416 * Locate the IOAPIC that manages the ISA IRQs (0-15).
5417 @@ -915,24 +798,13 @@
5418 if (++mp_irq_entries == MAX_IRQ_SOURCES)
5419 panic("Max # of irq sources exceeded!\n");
5420 }
5421 -
5422 - return;
5423 }
5424
5425 -#define MAX_GSI_NUM 4096
5426 -
5427 int mp_register_gsi(u32 gsi, int triggering, int polarity)
5428 {
5429 - int ioapic = -1;
5430 - int ioapic_pin = 0;
5431 - int idx, bit = 0;
5432 - static int pci_irq = 16;
5433 - /*
5434 - * Mapping between Global System Interrupts, which
5435 - * represent all possible interrupts, to the IRQs
5436 - * assigned to actual devices.
5437 - */
5438 - static int gsi_to_irq[MAX_GSI_NUM];
5439 + int ioapic = -1;
5440 + int ioapic_pin = 0;
5441 + int idx, bit = 0;
5442
5443 if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
5444 return gsi;
5445 @@ -965,47 +837,14 @@
5446 if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
5447 Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
5448 mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
5449 - return gsi_to_irq[gsi];
5450 + return gsi;
5451 }
5452
5453 mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
5454
5455 - if (triggering == ACPI_LEVEL_SENSITIVE) {
5456 - /*
5457 - * For PCI devices assign IRQs in order, avoiding gaps
5458 - * due to unused I/O APIC pins.
5459 - */
5460 - int irq = gsi;
5461 - if (gsi < MAX_GSI_NUM) {
5462 - /*
5463 - * Retain the VIA chipset work-around (gsi > 15), but
5464 - * avoid a problem where the 8254 timer (IRQ0) is setup
5465 - * via an override (so it's not on pin 0 of the ioapic),
5466 - * and at the same time, the pin 0 interrupt is a PCI
5467 - * type. The gsi > 15 test could cause these two pins
5468 - * to be shared as IRQ0, and they are not shareable.
5469 - * So test for this condition, and if necessary, avoid
5470 - * the pin collision.
5471 - */
5472 - if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0))
5473 - gsi = pci_irq++;
5474 - /*
5475 - * Don't assign IRQ used by ACPI SCI
5476 - */
5477 - if (gsi == acpi_fadt.sci_int)
5478 - gsi = pci_irq++;
5479 - gsi_to_irq[irq] = gsi;
5480 - } else {
5481 - printk(KERN_ERR "GSI %u is too high\n", gsi);
5482 - return gsi;
5483 - }
5484 - }
5485 -
5486 io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
5487 triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
5488 polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
5489 return gsi;
5490 }
5491 -
5492 -#endif /*CONFIG_X86_IO_APIC*/
5493 #endif /*CONFIG_ACPI*/
5494 --- a/arch/x86/kernel/pci-dma_32-xen.c
5495 +++ b/arch/x86/kernel/pci-dma_32-xen.c
5496 @@ -116,8 +116,7 @@
5497 {
5498 int i, rc;
5499
5500 - if (direction == DMA_NONE)
5501 - BUG();
5502 + BUG_ON(!valid_dma_direction(direction));
5503 WARN_ON(nents == 0 || sg[0].length == 0);
5504
5505 if (swiotlb) {
5506 @@ -148,7 +147,7 @@
5507 {
5508 int i;
5509
5510 - BUG_ON(direction == DMA_NONE);
5511 + BUG_ON(!valid_dma_direction(direction));
5512 if (swiotlb)
5513 swiotlb_unmap_sg(hwdev, sg, nents, direction);
5514 else {
5515 @@ -165,8 +164,7 @@
5516 {
5517 dma_addr_t dma_addr;
5518
5519 - BUG_ON(direction == DMA_NONE);
5520 -
5521 + BUG_ON(!valid_dma_direction(direction));
5522 if (swiotlb) {
5523 dma_addr = swiotlb_map_page(
5524 dev, page, offset, size, direction);
5525 @@ -183,7 +181,7 @@
5526 dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
5527 enum dma_data_direction direction)
5528 {
5529 - BUG_ON(direction == DMA_NONE);
5530 + BUG_ON(!valid_dma_direction(direction));
5531 if (swiotlb)
5532 swiotlb_unmap_page(dev, dma_address, size, direction);
5533 else
5534 @@ -365,8 +363,7 @@
5535 {
5536 dma_addr_t dma;
5537
5538 - if (direction == DMA_NONE)
5539 - BUG();
5540 + BUG_ON(!valid_dma_direction(direction));
5541 WARN_ON(size == 0);
5542
5543 if (swiotlb) {
5544 @@ -387,8 +384,7 @@
5545 dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
5546 enum dma_data_direction direction)
5547 {
5548 - if (direction == DMA_NONE)
5549 - BUG();
5550 + BUG_ON(!valid_dma_direction(direction));
5551 if (swiotlb)
5552 swiotlb_unmap_single(dev, dma_addr, size, direction);
5553 else
5554 --- a/arch/x86/kernel/pci-swiotlb_64-xen.c
5555 +++ b/arch/x86/kernel/pci-swiotlb_64-xen.c
5556 @@ -3,7 +3,8 @@
5557 #include <linux/pci.h>
5558 #include <linux/cache.h>
5559 #include <linux/module.h>
5560 -#include <asm/dma-mapping.h>
5561 +#include <linux/dma-mapping.h>
5562 +
5563 #include <asm/proto.h>
5564 #include <asm/swiotlb.h>
5565 #include <asm/dma.h>
5566 --- a/arch/x86/kernel/process_32-xen.c
5567 +++ b/arch/x86/kernel/process_32-xen.c
5568 @@ -37,6 +37,7 @@
5569 #include <linux/kallsyms.h>
5570 #include <linux/ptrace.h>
5571 #include <linux/random.h>
5572 +#include <linux/personality.h>
5573
5574 #include <asm/uaccess.h>
5575 #include <asm/pgtable.h>
5576 @@ -186,7 +187,7 @@
5577 void cpu_idle_wait(void)
5578 {
5579 unsigned int cpu, this_cpu = get_cpu();
5580 - cpumask_t map;
5581 + cpumask_t map, tmp = current->cpus_allowed;
5582
5583 set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
5584 put_cpu();
5585 @@ -208,6 +209,8 @@
5586 }
5587 cpus_and(map, map, cpu_online_map);
5588 } while (!cpus_empty(map));
5589 +
5590 + set_cpus_allowed(current, tmp);
5591 }
5592 EXPORT_SYMBOL_GPL(cpu_idle_wait);
5593
5594 @@ -240,9 +243,9 @@
5595 if (user_mode_vm(regs))
5596 printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp);
5597 printk(" EFLAGS: %08lx %s (%s %.*s)\n",
5598 - regs->eflags, print_tainted(), system_utsname.release,
5599 - (int)strcspn(system_utsname.version, " "),
5600 - system_utsname.version);
5601 + regs->eflags, print_tainted(), init_utsname()->release,
5602 + (int)strcspn(init_utsname()->version, " "),
5603 + init_utsname()->version);
5604 printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
5605 regs->eax,regs->ebx,regs->ecx,regs->edx);
5606 printk("ESI: %08lx EDI: %08lx EBP: %08lx",
5607 @@ -264,15 +267,6 @@
5608 * the "args".
5609 */
5610 extern void kernel_thread_helper(void);
5611 -__asm__(".section .text\n"
5612 - ".align 4\n"
5613 - "kernel_thread_helper:\n\t"
5614 - "movl %edx,%eax\n\t"
5615 - "pushl %edx\n\t"
5616 - "call *%ebx\n\t"
5617 - "pushl %eax\n\t"
5618 - "call do_exit\n"
5619 - ".previous");
5620
5621 /*
5622 * Create a kernel thread
5623 @@ -290,7 +284,7 @@
5624 regs.xes = __USER_DS;
5625 regs.orig_eax = -1;
5626 regs.eip = (unsigned long) kernel_thread_helper;
5627 - regs.xcs = GET_KERNEL_CS();
5628 + regs.xcs = __KERNEL_CS | get_kernel_rpl();
5629 regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
5630
5631 /* Ok, create the new process.. */
5632 @@ -369,13 +363,12 @@
5633
5634 tsk = current;
5635 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
5636 - p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
5637 + p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
5638 + IO_BITMAP_BYTES, GFP_KERNEL);
5639 if (!p->thread.io_bitmap_ptr) {
5640 p->thread.io_bitmap_max = 0;
5641 return -ENOMEM;
5642 }
5643 - memcpy(p->thread.io_bitmap_ptr, tsk->thread.io_bitmap_ptr,
5644 - IO_BITMAP_BYTES);
5645 set_tsk_thread_flag(p, TIF_IO_BITMAP);
5646 }
5647
5648 @@ -850,7 +843,7 @@
5649
5650 unsigned long arch_align_stack(unsigned long sp)
5651 {
5652 - if (randomize_va_space)
5653 + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
5654 sp -= get_random_int() % 8192;
5655 return sp & ~0xf;
5656 }
5657 --- a/arch/x86/kernel/process_64-xen.c
5658 +++ b/arch/x86/kernel/process_64-xen.c
5659 @@ -89,25 +89,24 @@
5660 }
5661 EXPORT_SYMBOL(idle_notifier_unregister);
5662
5663 -enum idle_state { CPU_IDLE, CPU_NOT_IDLE };
5664 -static DEFINE_PER_CPU(enum idle_state, idle_state) = CPU_NOT_IDLE;
5665 -
5666 void enter_idle(void)
5667 {
5668 - __get_cpu_var(idle_state) = CPU_IDLE;
5669 + write_pda(isidle, 1);
5670 atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
5671 }
5672
5673 static void __exit_idle(void)
5674 {
5675 - __get_cpu_var(idle_state) = CPU_NOT_IDLE;
5676 + if (test_and_clear_bit_pda(0, isidle) == 0)
5677 + return;
5678 atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
5679 }
5680
5681 /* Called from interrupts to signify idle end */
5682 void exit_idle(void)
5683 {
5684 - if (current->pid | read_pda(irqcount))
5685 + /* idle loop has pid 0 */
5686 + if (current->pid)
5687 return;
5688 __exit_idle();
5689 }
5690 @@ -184,6 +183,9 @@
5691 play_dead();
5692 enter_idle();
5693 idle();
5694 + /* In many cases the interrupt that ended idle
5695 + has already called exit_idle. But some idle
5696 + loops can be woken up without interrupt. */
5697 __exit_idle();
5698 }
5699
5700 @@ -196,7 +198,7 @@
5701 void cpu_idle_wait(void)
5702 {
5703 unsigned int cpu, this_cpu = get_cpu();
5704 - cpumask_t map;
5705 + cpumask_t map, tmp = current->cpus_allowed;
5706
5707 set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
5708 put_cpu();
5709 @@ -219,6 +221,8 @@
5710 }
5711 cpus_and(map, map, cpu_online_map);
5712 } while (!cpus_empty(map));
5713 +
5714 + set_cpus_allowed(current, tmp);
5715 }
5716 EXPORT_SYMBOL_GPL(cpu_idle_wait);
5717
5718 @@ -250,9 +254,9 @@
5719 print_modules();
5720 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
5721 current->pid, current->comm, print_tainted(),
5722 - system_utsname.release,
5723 - (int)strcspn(system_utsname.version, " "),
5724 - system_utsname.version);
5725 + init_utsname()->release,
5726 + (int)strcspn(init_utsname()->version, " "),
5727 + init_utsname()->version);
5728 printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
5729 printk_address(regs->rip);
5730 printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
5731 @@ -310,6 +314,7 @@
5732
5733 kfree(t->io_bitmap_ptr);
5734 t->io_bitmap_ptr = NULL;
5735 + clear_thread_flag(TIF_IO_BITMAP);
5736 /*
5737 * Careful, clear this in the TSS too:
5738 */
5739 @@ -340,6 +345,7 @@
5740 if (t->flags & _TIF_IA32)
5741 current_thread_info()->status |= TS_COMPAT;
5742 }
5743 + t->flags &= ~_TIF_DEBUG;
5744
5745 tsk->thread.debugreg0 = 0;
5746 tsk->thread.debugreg1 = 0;
5747 @@ -432,7 +438,7 @@
5748 asm("mov %%es,%0" : "=m" (p->thread.es));
5749 asm("mov %%ds,%0" : "=m" (p->thread.ds));
5750
5751 - if (unlikely(me->thread.io_bitmap_ptr != NULL)) {
5752 + if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
5753 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
5754 if (!p->thread.io_bitmap_ptr) {
5755 p->thread.io_bitmap_max = 0;
5756 @@ -440,6 +446,7 @@
5757 }
5758 memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
5759 IO_BITMAP_BYTES);
5760 + set_tsk_thread_flag(p, TIF_IO_BITMAP);
5761 }
5762
5763 /*
5764 @@ -474,6 +481,30 @@
5765 }
5766
5767 /*
5768 + * This special macro can be used to load a debugging register
5769 + */
5770 +#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r)
5771 +
5772 +static inline void __switch_to_xtra(struct task_struct *prev_p,
5773 + struct task_struct *next_p)
5774 +{
5775 + struct thread_struct *prev, *next;
5776 +
5777 + prev = &prev_p->thread,
5778 + next = &next_p->thread;
5779 +
5780 + if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
5781 + loaddebug(next, 0);
5782 + loaddebug(next, 1);
5783 + loaddebug(next, 2);
5784 + loaddebug(next, 3);
5785 + /* no 4 and 5 */
5786 + loaddebug(next, 6);
5787 + loaddebug(next, 7);
5788 + }
5789 +}
5790 +
5791 +/*
5792 * switch_to(x,y) should switch tasks from x to y.
5793 *
5794 * This could still be optimized:
5795 @@ -495,6 +526,10 @@
5796 struct physdev_set_iobitmap iobmp_op;
5797 multicall_entry_t _mcl[8], *mcl = _mcl;
5798
5799 + /* we're going to use this soon, after a few expensive things */
5800 + if (next_p->fpu_counter>5)
5801 + prefetch(&next->i387.fxsave);
5802 +
5803 /*
5804 * This is basically '__unlazy_fpu', except that we queue a
5805 * multicall to indicate FPU task switch, rather than
5806 @@ -507,7 +542,8 @@
5807 mcl->op = __HYPERVISOR_fpu_taskswitch;
5808 mcl->args[0] = 1;
5809 mcl++;
5810 - }
5811 + } else
5812 + prev_p->fpu_counter = 0;
5813
5814 /*
5815 * Reload esp0, LDT and the page table pointer:
5816 @@ -587,21 +623,29 @@
5817 write_pda(oldrsp, next->userrsp);
5818 write_pda(pcurrent, next_p);
5819 write_pda(kernelstack,
5820 - task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
5821 + (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
5822 +#ifdef CONFIG_CC_STACKPROTECTOR
5823 + write_pda(stack_canary, next_p->stack_canary);
5824 +
5825 + /*
5826 + * Build time only check to make sure the stack_canary is at
5827 + * offset 40 in the pda; this is a gcc ABI requirement
5828 + */
5829 + BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
5830 +#endif
5831
5832 /*
5833 * Now maybe reload the debug registers
5834 */
5835 - if (unlikely(next->debugreg7)) {
5836 - set_debugreg(next->debugreg0, 0);
5837 - set_debugreg(next->debugreg1, 1);
5838 - set_debugreg(next->debugreg2, 2);
5839 - set_debugreg(next->debugreg3, 3);
5840 - /* no 4 and 5 */
5841 - set_debugreg(next->debugreg6, 6);
5842 - set_debugreg(next->debugreg7, 7);
5843 - }
5844 + if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
5845 + __switch_to_xtra(prev_p, next_p);
5846
5847 + /* If the task has used fpu the last 5 timeslices, just do a full
5848 + * restore of the math state immediately to avoid the trap; the
5849 + * chances of needing FPU soon are obviously high now
5850 + */
5851 + if (next_p->fpu_counter>5)
5852 + math_state_restore();
5853 return prev_p;
5854 }
5855
5856 @@ -821,7 +865,7 @@
5857
5858 unsigned long arch_align_stack(unsigned long sp)
5859 {
5860 - if (randomize_va_space)
5861 + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
5862 sp -= get_random_int() % 8192;
5863 return sp & ~0xf;
5864 }
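
Illustration, not part of the patch: the loaddebug() macro added above picks a numbered debugreg field via preprocessor token pasting, so one macro serves DR0-DR3, DR6 and DR7. A self-contained sketch of the same ## token-pasting idiom with made-up names:

#include <stdio.h>

struct demo_thread {
	unsigned long debugreg0, debugreg1, debugreg6, debugreg7;
};

/* "debugreg" ## r pastes into debugreg0, debugreg7, ... at compile time */
#define loaddemo(thread, r)	printf("dr%d = %lx\n", r, (thread)->debugreg ## r)

int main(void)
{
	struct demo_thread t = { .debugreg0 = 0x1, .debugreg7 = 0x401 };

	loaddemo(&t, 0);
	loaddemo(&t, 7);
	return 0;
}
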
5865 --- a/arch/x86/kernel/setup64-xen.c
5866 +++ b/arch/x86/kernel/setup64-xen.c
5867 @@ -31,7 +31,7 @@
5868 #include <asm/hypervisor.h>
5869 #endif
5870
5871 -char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,};
5872 +char x86_boot_params[BOOT_PARAM_SIZE] __initdata;
5873
5874 cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
5875
5876 @@ -55,8 +55,10 @@
5877 on Enable(default)
5878 off Disable
5879 */
5880 -int __init nonx_setup(char *str)
5881 +static int __init nonx_setup(char *str)
5882 {
5883 + if (!str)
5884 + return -EINVAL;
5885 if (!strncmp(str, "on", 2)) {
5886 __supported_pte_mask |= _PAGE_NX;
5887 do_not_nx = 0;
5888 @@ -64,9 +66,9 @@
5889 do_not_nx = 1;
5890 __supported_pte_mask &= ~_PAGE_NX;
5891 }
5892 - return 1;
5893 + return 0;
5894 }
5895 -__setup("noexec=", nonx_setup); /* parsed early actually */
5896 +early_param("noexec", nonx_setup);
5897
5898 int force_personality32 = 0;
5899
5900 @@ -102,12 +104,9 @@
5901 #endif
5902
5903 /* Copy section for each CPU (we discard the original) */
5904 - size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES);
5905 -#ifdef CONFIG_MODULES
5906 - if (size < PERCPU_ENOUGH_ROOM)
5907 - size = PERCPU_ENOUGH_ROOM;
5908 -#endif
5909 + size = PERCPU_ENOUGH_ROOM;
5910
5911 + printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size);
5912 for_each_cpu_mask (i, cpu_possible_map) {
5913 char *ptr;
5914
5915 @@ -169,7 +168,10 @@
5916 /* Setup up data that may be needed in __get_free_pages early */
5917 asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
5918 #ifndef CONFIG_XEN
5919 + /* Memory clobbers used to order PDA accesses */
5920 + mb();
5921 wrmsrl(MSR_GS_BASE, pda);
5922 + mb();
5923 #else
5924 if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL,
5925 (unsigned long)pda))
5926 @@ -302,28 +304,17 @@
5927 * set up and load the per-CPU TSS
5928 */
5929 for (v = 0; v < N_EXCEPTION_STACKS; v++) {
5930 + static const unsigned int order[N_EXCEPTION_STACKS] = {
5931 + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
5932 + [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
5933 + };
5934 if (cpu) {
5935 - static const unsigned int order[N_EXCEPTION_STACKS] = {
5936 - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
5937 - [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
5938 - };
5939 -
5940 estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
5941 if (!estacks)
5942 panic("Cannot allocate exception stack %ld %d\n",
5943 v, cpu);
5944 }
5945 - switch (v + 1) {
5946 -#if DEBUG_STKSZ > EXCEPTION_STKSZ
5947 - case DEBUG_STACK:
5948 - cpu_pda(cpu)->debugstack = (unsigned long)estacks;
5949 - estacks += DEBUG_STKSZ;
5950 - break;
5951 -#endif
5952 - default:
5953 - estacks += EXCEPTION_STKSZ;
5954 - break;
5955 - }
5956 + estacks += PAGE_SIZE << order[v];
5957 orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks;
5958 }
5959
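
Illustration, not part of the patch: the noexec= handler is converted from __setup() to early_param(), so it runs from parse_early_param() early in boot, must tolerate a NULL argument, and reports success or failure as 0/-EINVAL rather than the __setup() 1/0 convention. A hedged sketch of that pattern with a made-up option ("demo_opt"):

#include <linux/init.h>
#include <linux/string.h>
#include <linux/errno.h>

static int demo_enabled __initdata;

static int __init parse_demo_opt(char *arg)
{
	if (!arg)
		return -EINVAL;		/* "demo_opt" given with no value */
	if (!strncmp(arg, "on", 2))
		demo_enabled = 1;
	else if (!strncmp(arg, "off", 3))
		demo_enabled = 0;
	return 0;
}
early_param("demo_opt", parse_demo_opt);
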
5960 --- a/arch/x86/kernel/setup_32-xen.c
5961 +++ b/arch/x86/kernel/setup_32-xen.c
5962 @@ -56,6 +56,7 @@
5963 #include <asm/apic.h>
5964 #include <asm/e820.h>
5965 #include <asm/mpspec.h>
5966 +#include <asm/mmzone.h>
5967 #include <asm/setup.h>
5968 #include <asm/arch_hooks.h>
5969 #include <asm/sections.h>
5970 @@ -105,18 +106,6 @@
5971
5972 unsigned long mmu_cr4_features;
5973
5974 -#ifdef CONFIG_ACPI
5975 - int acpi_disabled = 0;
5976 -#else
5977 - int acpi_disabled = 1;
5978 -#endif
5979 -EXPORT_SYMBOL(acpi_disabled);
5980 -
5981 -#ifdef CONFIG_ACPI
5982 -int __initdata acpi_force = 0;
5983 -extern acpi_interrupt_flags acpi_sci_flags;
5984 -#endif
5985 -
5986 /* for MCA, but anyone else can use it if they want */
5987 unsigned int machine_id;
5988 #ifdef CONFIG_MCA
5989 @@ -170,7 +159,6 @@
5990 #endif
5991
5992 extern void early_cpu_init(void);
5993 -extern void generic_apic_probe(char *);
5994 extern int root_mountflags;
5995
5996 unsigned long saved_videomode;
5997 @@ -243,9 +231,6 @@
5998 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
5999 } };
6000
6001 -#define ADAPTER_ROM_RESOURCES \
6002 - (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0])
6003 -
6004 static struct resource video_rom_resource = {
6005 .name = "Video ROM",
6006 .start = 0xc0000,
6007 @@ -307,9 +292,6 @@
6008 .flags = IORESOURCE_BUSY | IORESOURCE_IO
6009 } };
6010
6011 -#define STANDARD_IO_RESOURCES \
6012 - (sizeof standard_io_resources / sizeof standard_io_resources[0])
6013 -
6014 #define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
6015
6016 static int __init romchecksum(unsigned char *rom, unsigned long length)
6017 @@ -372,7 +354,7 @@
6018 }
6019
6020 /* check for adapter roms on 2k boundaries */
6021 - for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) {
6022 + for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
6023 rom = isa_bus_to_virt(start);
6024 if (!romsignature(rom))
6025 continue;
6026 @@ -764,246 +746,152 @@
6027 }
6028 #endif
6029
6030 -static void __init parse_cmdline_early (char ** cmdline_p)
6031 +static int __initdata user_defined_memmap = 0;
6032 +
6033 +/*
6034 + * "mem=nopentium" disables the 4MB page tables.
6035 + * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
6036 + * to <mem>, overriding the bios size.
6037 + * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
6038 + * <start> to <start>+<mem>, overriding the bios size.
6039 + *
6040 + * HPA tells me bootloaders need to parse mem=, so no new
6041 + * option should be mem= [also see Documentation/i386/boot.txt]
6042 + */
6043 +static int __init parse_mem(char *arg)
6044 {
6045 - char c = ' ', *to = command_line, *from = saved_command_line;
6046 - int len = 0, max_cmdline;
6047 - int userdef = 0;
6048 -
6049 - if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
6050 - max_cmdline = COMMAND_LINE_SIZE;
6051 - memcpy(saved_command_line, xen_start_info->cmd_line, max_cmdline);
6052 - /* Save unparsed command line copy for /proc/cmdline */
6053 - saved_command_line[max_cmdline-1] = '\0';
6054 -
6055 - for (;;) {
6056 - if (c != ' ')
6057 - goto next_char;
6058 - /*
6059 - * "mem=nopentium" disables the 4MB page tables.
6060 - * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
6061 - * to <mem>, overriding the bios size.
6062 - * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
6063 - * <start> to <start>+<mem>, overriding the bios size.
6064 - *
6065 - * HPA tells me bootloaders need to parse mem=, so no new
6066 - * option should be mem= [also see Documentation/i386/boot.txt]
6067 - */
6068 - if (!memcmp(from, "mem=", 4)) {
6069 - if (to != command_line)
6070 - to--;
6071 - if (!memcmp(from+4, "nopentium", 9)) {
6072 - from += 9+4;
6073 - clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
6074 - disable_pse = 1;
6075 - } else {
6076 - /* If the user specifies memory size, we
6077 - * limit the BIOS-provided memory map to
6078 - * that size. exactmap can be used to specify
6079 - * the exact map. mem=number can be used to
6080 - * trim the existing memory map.
6081 - */
6082 - unsigned long long mem_size;
6083 -
6084 - mem_size = memparse(from+4, &from);
6085 - limit_regions(mem_size);
6086 - userdef=1;
6087 - }
6088 - }
6089 + if (!arg)
6090 + return -EINVAL;
6091
6092 - else if (!memcmp(from, "memmap=", 7)) {
6093 - if (to != command_line)
6094 - to--;
6095 - if (!memcmp(from+7, "exactmap", 8)) {
6096 -#ifdef CONFIG_CRASH_DUMP
6097 - /* If we are doing a crash dump, we
6098 - * still need to know the real mem
6099 - * size before original memory map is
6100 - * reset.
6101 - */
6102 - find_max_pfn();
6103 - saved_max_pfn = max_pfn;
6104 -#endif
6105 - from += 8+7;
6106 - e820.nr_map = 0;
6107 - userdef = 1;
6108 - } else {
6109 - /* If the user specifies memory size, we
6110 - * limit the BIOS-provided memory map to
6111 - * that size. exactmap can be used to specify
6112 - * the exact map. mem=number can be used to
6113 - * trim the existing memory map.
6114 - */
6115 - unsigned long long start_at, mem_size;
6116 + if (strcmp(arg, "nopentium") == 0) {
6117 + clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
6118 + disable_pse = 1;
6119 + } else {
6120 + /* If the user specifies memory size, we
6121 + * limit the BIOS-provided memory map to
6122 + * that size. exactmap can be used to specify
6123 + * the exact map. mem=number can be used to
6124 + * trim the existing memory map.
6125 + */
6126 + unsigned long long mem_size;
6127
6128 - mem_size = memparse(from+7, &from);
6129 - if (*from == '@') {
6130 - start_at = memparse(from+1, &from);
6131 - add_memory_region(start_at, mem_size, E820_RAM);
6132 - } else if (*from == '#') {
6133 - start_at = memparse(from+1, &from);
6134 - add_memory_region(start_at, mem_size, E820_ACPI);
6135 - } else if (*from == '$') {
6136 - start_at = memparse(from+1, &from);
6137 - add_memory_region(start_at, mem_size, E820_RESERVED);
6138 - } else {
6139 - limit_regions(mem_size);
6140 - userdef=1;
6141 - }
6142 - }
6143 - }
6144 -
6145 - else if (!memcmp(from, "noexec=", 7))
6146 - noexec_setup(from + 7);
6147 + mem_size = memparse(arg, &arg);
6148 + limit_regions(mem_size);
6149 + user_defined_memmap = 1;
6150 + }
6151 + return 0;
6152 +}
6153 +early_param("mem", parse_mem);
6154
6155 +static int __init parse_memmap(char *arg)
6156 +{
6157 + if (!arg)
6158 + return -EINVAL;
6159
6160 -#ifdef CONFIG_X86_MPPARSE
6161 - /*
6162 - * If the BIOS enumerates physical processors before logical,
6163 - * maxcpus=N at enumeration-time can be used to disable HT.
6164 + if (strcmp(arg, "exactmap") == 0) {
6165 +#ifdef CONFIG_CRASH_DUMP
6166 + /* If we are doing a crash dump, we
6167 + * still need to know the real mem
6168 + * size before original memory map is
6169 + * reset.
6170 */
6171 - else if (!memcmp(from, "maxcpus=", 8)) {
6172 - extern unsigned int maxcpus;
6173 -
6174 - maxcpus = simple_strtoul(from + 8, NULL, 0);
6175 - }
6176 + find_max_pfn();
6177 + saved_max_pfn = max_pfn;
6178 #endif
6179 + e820.nr_map = 0;
6180 + user_defined_memmap = 1;
6181 + } else {
6182 + /* If the user specifies memory size, we
6183 + * limit the BIOS-provided memory map to
6184 + * that size. exactmap can be used to specify
6185 + * the exact map. mem=number can be used to
6186 + * trim the existing memory map.
6187 + */
6188 + unsigned long long start_at, mem_size;
6189
6190 -#ifdef CONFIG_ACPI
6191 - /* "acpi=off" disables both ACPI table parsing and interpreter */
6192 - else if (!memcmp(from, "acpi=off", 8)) {
6193 - disable_acpi();
6194 - }
6195 -
6196 - /* acpi=force to over-ride black-list */
6197 - else if (!memcmp(from, "acpi=force", 10)) {
6198 - acpi_force = 1;
6199 - acpi_ht = 1;
6200 - acpi_disabled = 0;
6201 - }
6202 -
6203 - /* acpi=strict disables out-of-spec workarounds */
6204 - else if (!memcmp(from, "acpi=strict", 11)) {
6205 - acpi_strict = 1;
6206 - }
6207 -
6208 - /* Limit ACPI just to boot-time to enable HT */
6209 - else if (!memcmp(from, "acpi=ht", 7)) {
6210 - if (!acpi_force)
6211 - disable_acpi();
6212 - acpi_ht = 1;
6213 - }
6214 -
6215 - /* "pci=noacpi" disable ACPI IRQ routing and PCI scan */
6216 - else if (!memcmp(from, "pci=noacpi", 10)) {
6217 - acpi_disable_pci();
6218 - }
6219 - /* "acpi=noirq" disables ACPI interrupt routing */
6220 - else if (!memcmp(from, "acpi=noirq", 10)) {
6221 - acpi_noirq_set();
6222 + mem_size = memparse(arg, &arg);
6223 + if (*arg == '@') {
6224 + start_at = memparse(arg+1, &arg);
6225 + add_memory_region(start_at, mem_size, E820_RAM);
6226 + } else if (*arg == '#') {
6227 + start_at = memparse(arg+1, &arg);
6228 + add_memory_region(start_at, mem_size, E820_ACPI);
6229 + } else if (*arg == '$') {
6230 + start_at = memparse(arg+1, &arg);
6231 + add_memory_region(start_at, mem_size, E820_RESERVED);
6232 + } else {
6233 + limit_regions(mem_size);
6234 + user_defined_memmap = 1;
6235 }
6236 + }
6237 + return 0;
6238 +}
6239 +early_param("memmap", parse_memmap);
6240
6241 - else if (!memcmp(from, "acpi_sci=edge", 13))
6242 - acpi_sci_flags.trigger = 1;
6243 +#ifdef CONFIG_PROC_VMCORE
6244 +/* elfcorehdr= specifies the location of elf core header
6245 + * stored by the crashed kernel.
6246 + */
6247 +static int __init parse_elfcorehdr(char *arg)
6248 +{
6249 + if (!arg)
6250 + return -EINVAL;
6251
6252 - else if (!memcmp(from, "acpi_sci=level", 14))
6253 - acpi_sci_flags.trigger = 3;
6254 + elfcorehdr_addr = memparse(arg, &arg);
6255 + return 0;
6256 +}
6257 +early_param("elfcorehdr", parse_elfcorehdr);
6258 +#endif /* CONFIG_PROC_VMCORE */
6259
6260 - else if (!memcmp(from, "acpi_sci=high", 13))
6261 - acpi_sci_flags.polarity = 1;
6262 +/*
6263 + * highmem=size forces highmem to be exactly 'size' bytes.
6264 + * This works even on boxes that have no highmem otherwise.
6265 + * This also works to reduce highmem size on bigger boxes.
6266 + */
6267 +static int __init parse_highmem(char *arg)
6268 +{
6269 + if (!arg)
6270 + return -EINVAL;
6271
6272 - else if (!memcmp(from, "acpi_sci=low", 12))
6273 - acpi_sci_flags.polarity = 3;
6274 + highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
6275 + return 0;
6276 +}
6277 +early_param("highmem", parse_highmem);
6278
6279 -#ifdef CONFIG_X86_IO_APIC
6280 - else if (!memcmp(from, "acpi_skip_timer_override", 24))
6281 - acpi_skip_timer_override = 1;
6282 +/*
6283 + * vmalloc=size forces the vmalloc area to be exactly 'size'
6284 + * bytes. This can be used to increase (or decrease) the
6285 + * vmalloc area - the default is 128m.
6286 + */
6287 +static int __init parse_vmalloc(char *arg)
6288 +{
6289 + if (!arg)
6290 + return -EINVAL;
6291
6292 - if (!memcmp(from, "disable_timer_pin_1", 19))
6293 - disable_timer_pin_1 = 1;
6294 - if (!memcmp(from, "enable_timer_pin_1", 18))
6295 - disable_timer_pin_1 = -1;
6296 -
6297 - /* disable IO-APIC */
6298 - else if (!memcmp(from, "noapic", 6))
6299 - disable_ioapic_setup();
6300 -#endif /* CONFIG_X86_IO_APIC */
6301 -#endif /* CONFIG_ACPI */
6302 -
6303 -#ifdef CONFIG_X86_LOCAL_APIC
6304 - /* enable local APIC */
6305 - else if (!memcmp(from, "lapic", 5))
6306 - lapic_enable();
6307 -
6308 - /* disable local APIC */
6309 - else if (!memcmp(from, "nolapic", 6))
6310 - lapic_disable();
6311 -#endif /* CONFIG_X86_LOCAL_APIC */
6312 + __VMALLOC_RESERVE = memparse(arg, &arg);
6313 + return 0;
6314 +}
6315 +early_param("vmalloc", parse_vmalloc);
6316
6317 -#ifdef CONFIG_KEXEC
6318 - /* crashkernel=size@addr specifies the location to reserve for
6319 - * a crash kernel. By reserving this memory we guarantee
6320 - * that linux never set's it up as a DMA target.
6321 - * Useful for holding code to do something appropriate
6322 - * after a kernel panic.
6323 - */
6324 - else if (!memcmp(from, "crashkernel=", 12)) {
6325 #ifndef CONFIG_XEN
6326 - unsigned long size, base;
6327 - size = memparse(from+12, &from);
6328 - if (*from == '@') {
6329 - base = memparse(from+1, &from);
6330 - /* FIXME: Do I want a sanity check
6331 - * to validate the memory range?
6332 - */
6333 - crashk_res.start = base;
6334 - crashk_res.end = base + size - 1;
6335 - }
6336 -#else
6337 - printk("Ignoring crashkernel command line, "
6338 - "parameter will be supplied by xen\n");
6339 -#endif
6340 - }
6341 -#endif
6342 -#ifdef CONFIG_PROC_VMCORE
6343 - /* elfcorehdr= specifies the location of elf core header
6344 - * stored by the crashed kernel.
6345 - */
6346 - else if (!memcmp(from, "elfcorehdr=", 11))
6347 - elfcorehdr_addr = memparse(from+11, &from);
6348 -#endif
6349 +/*
6350 + * reservetop=size reserves a hole at the top of the kernel address space which
6351 + * a hypervisor can load into later. Needed for dynamically loaded hypervisors,
6352 + * so relocating the fixmap can be done before paging initialization.
6353 + */
6354 +static int __init parse_reservetop(char *arg)
6355 +{
6356 + unsigned long address;
6357
6358 - /*
6359 - * highmem=size forces highmem to be exactly 'size' bytes.
6360 - * This works even on boxes that have no highmem otherwise.
6361 - * This also works to reduce highmem size on bigger boxes.
6362 - */
6363 - else if (!memcmp(from, "highmem=", 8))
6364 - highmem_pages = memparse(from+8, &from) >> PAGE_SHIFT;
6365 -
6366 - /*
6367 - * vmalloc=size forces the vmalloc area to be exactly 'size'
6368 - * bytes. This can be used to increase (or decrease) the
6369 - * vmalloc area - the default is 128m.
6370 - */
6371 - else if (!memcmp(from, "vmalloc=", 8))
6372 - __VMALLOC_RESERVE = memparse(from+8, &from);
6373 + if (!arg)
6374 + return -EINVAL;
6375
6376 - next_char:
6377 - c = *(from++);
6378 - if (!c)
6379 - break;
6380 - if (COMMAND_LINE_SIZE <= ++len)
6381 - break;
6382 - *(to++) = c;
6383 - }
6384 - *to = '\0';
6385 - *cmdline_p = command_line;
6386 - if (userdef) {
6387 - printk(KERN_INFO "user-defined physical RAM map:\n");
6388 - print_memory_map("user");
6389 - }
6390 + address = memparse(arg, &arg);
6391 + reserve_top_address(address);
6392 + return 0;
6393 }
6394 +early_param("reservetop", parse_reservetop);
6395 +#endif
6396
6397 /*
6398 * Callback for efi_memory_walk.
6399 @@ -1024,7 +912,7 @@
6400 static int __init
6401 efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
6402 {
6403 - memory_present(0, start, end);
6404 + memory_present(0, PFN_UP(start), PFN_DOWN(end));
6405 return 0;
6406 }
6407
6408 @@ -1291,6 +1179,14 @@
6409 }
6410 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
6411 pages_to_mb(highend_pfn - highstart_pfn));
6412 + num_physpages = highend_pfn;
6413 + high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
6414 +#else
6415 + num_physpages = max_low_pfn;
6416 + high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
6417 +#endif
6418 +#ifdef CONFIG_FLATMEM
6419 + max_mapnr = num_physpages;
6420 #endif
6421 printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
6422 pages_to_mb(max_low_pfn));
6423 @@ -1302,22 +1198,19 @@
6424
6425 void __init zone_sizes_init(void)
6426 {
6427 - unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
6428 - unsigned int max_dma, low;
6429 -
6430 - max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
6431 - low = max_low_pfn;
6432 -
6433 - if (low < max_dma)
6434 - zones_size[ZONE_DMA] = low;
6435 - else {
6436 - zones_size[ZONE_DMA] = max_dma;
6437 - zones_size[ZONE_NORMAL] = low - max_dma;
6438 + unsigned long max_zone_pfns[MAX_NR_ZONES];
6439 + memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
6440 + max_zone_pfns[ZONE_DMA] =
6441 + virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
6442 + max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
6443 #ifdef CONFIG_HIGHMEM
6444 - zones_size[ZONE_HIGHMEM] = highend_pfn - low;
6445 + max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
6446 + add_active_range(0, 0, highend_pfn);
6447 +#else
6448 + add_active_range(0, 0, max_low_pfn);
6449 #endif
6450 - }
6451 - free_area_init(zones_size);
6452 +
6453 + free_area_init_nodes(max_zone_pfns);
6454 }
6455 #else
6456 extern unsigned long __init setup_memory(void);
6457 @@ -1374,6 +1267,7 @@
6458 */
6459 acpi_reserve_bootmem();
6460 #endif
6461 + numa_kva_reserve();
6462 #endif /* !CONFIG_XEN */
6463
6464 #ifdef CONFIG_BLK_DEV_INITRD
6465 @@ -1559,7 +1453,7 @@
6466 request_resource(&iomem_resource, &video_ram_resource);
6467
6468 /* request I/O space for devices used on all i[345]86 PCs */
6469 - for (i = 0; i < STANDARD_IO_RESOURCES; i++)
6470 + for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
6471 request_resource(&ioport_resource, &standard_io_resources[i]);
6472 return 0;
6473 }
6474 @@ -1700,17 +1594,19 @@
6475 data_resource.start = virt_to_phys(_etext);
6476 data_resource.end = virt_to_phys(_edata)-1;
6477
6478 - parse_cmdline_early(cmdline_p);
6479 + if ((i = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
6480 + i = COMMAND_LINE_SIZE;
6481 + memcpy(saved_command_line, xen_start_info->cmd_line, i);
6482 + saved_command_line[i - 1] = '\0';
6483 + parse_early_param();
6484
6485 -#ifdef CONFIG_EARLY_PRINTK
6486 - {
6487 - char *s = strstr(*cmdline_p, "earlyprintk=");
6488 - if (s) {
6489 - setup_early_printk(strchr(s, '=') + 1);
6490 - printk("early console enabled\n");
6491 - }
6492 + if (user_defined_memmap) {
6493 + printk(KERN_INFO "user-defined physical RAM map:\n");
6494 + print_memory_map("user");
6495 }
6496 -#endif
6497 +
6498 + strlcpy(command_line, saved_command_line, COMMAND_LINE_SIZE);
6499 + *cmdline_p = command_line;
6500
6501 max_low_pfn = setup_memory();
6502
6503 @@ -1817,7 +1713,7 @@
6504 dmi_scan_machine();
6505
6506 #ifdef CONFIG_X86_GENERICARCH
6507 - generic_apic_probe(*cmdline_p);
6508 + generic_apic_probe();
6509 #endif
6510 if (efi_enabled)
6511 efi_map_memmap();
6512 @@ -1838,9 +1734,11 @@
6513 acpi_boot_table_init();
6514 #endif
6515
6516 +#ifdef CONFIG_PCI
6517 #ifdef CONFIG_X86_IO_APIC
6518 check_acpi_pci(); /* Checks more than just ACPI actually */
6519 #endif
6520 +#endif
6521
6522 #ifdef CONFIG_ACPI
6523 acpi_boot_init();
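
Illustration, not part of the patch: the rewritten mem=/memmap= handlers lean on memparse(), which reads a number with an optional K/M/G suffix and advances the cursor, so a value such as "64M@16M" splits into a size and a start address with the separator handled by the caller. A hedged sketch (parse_demo_span() and demo_add_region() are made up):

#include <linux/init.h>
#include <linux/kernel.h>	/* memparse() */
#include <linux/errno.h>

static void __init demo_add_region(unsigned long long start,
				   unsigned long long size)
{
	/* a real handler would call add_memory_region(start, size, type) */
}

static int __init parse_demo_span(char *arg)
{
	unsigned long long size, start;

	if (!arg)
		return -EINVAL;
	size = memparse(arg, &arg);		/* "64M", "4096K", "1G", ... */
	if (*arg == '@') {
		start = memparse(arg + 1, &arg);
		demo_add_region(start, size);
	}
	return 0;
}
early_param("demospan", parse_demo_span);
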
6524 --- a/arch/x86/kernel/setup_64-xen.c
6525 +++ b/arch/x86/kernel/setup_64-xen.c
6526 @@ -118,16 +118,6 @@
6527
6528 unsigned long mmu_cr4_features;
6529
6530 -int acpi_disabled;
6531 -EXPORT_SYMBOL(acpi_disabled);
6532 -#ifdef CONFIG_ACPI
6533 -extern int __initdata acpi_ht;
6534 -extern acpi_interrupt_flags acpi_sci_flags;
6535 -int __initdata acpi_force = 0;
6536 -#endif
6537 -
6538 -int acpi_numa __initdata;
6539 -
6540 /* Boot loader ID as an integer, for the benefit of proc_dointvec */
6541 int bootloader_type;
6542
6543 @@ -151,10 +141,6 @@
6544
6545 struct edid_info edid_info;
6546 EXPORT_SYMBOL_GPL(edid_info);
6547 -struct e820map e820;
6548 -#ifdef CONFIG_XEN
6549 -struct e820map machine_e820;
6550 -#endif
6551
6552 extern int root_mountflags;
6553
6554 @@ -181,9 +167,6 @@
6555 .flags = IORESOURCE_BUSY | IORESOURCE_IO }
6556 };
6557
6558 -#define STANDARD_IO_RESOURCES \
6559 - (sizeof standard_io_resources / sizeof standard_io_resources[0])
6560 -
6561 #define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
6562
6563 struct resource data_resource = {
6564 @@ -230,9 +213,6 @@
6565 .flags = IORESOURCE_ROM }
6566 };
6567
6568 -#define ADAPTER_ROM_RESOURCES \
6569 - (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0])
6570 -
6571 static struct resource video_rom_resource = {
6572 .name = "Video ROM",
6573 .start = 0xc0000,
6574 @@ -309,7 +289,8 @@
6575 }
6576
6577 /* check for adapter roms on 2k boundaries */
6578 - for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) {
6579 + for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper;
6580 + start += 2048) {
6581 rom = isa_bus_to_virt(start);
6582 if (!romsignature(rom))
6583 continue;
6584 @@ -329,187 +310,22 @@
6585 }
6586 }
6587
6588 -/* Check for full argument with no trailing characters */
6589 -static int fullarg(char *p, char *arg)
6590 +#ifdef CONFIG_PROC_VMCORE
6591 +/* elfcorehdr= specifies the location of elf core header
6592 + * stored by the crashed kernel. This option will be passed
6593 + * by kexec loader to the capture kernel.
6594 + */
6595 +static int __init setup_elfcorehdr(char *arg)
6596 {
6597 - int l = strlen(arg);
6598 - return !memcmp(p, arg, l) && (p[l] == 0 || isspace(p[l]));
6599 + char *end;
6600 + if (!arg)
6601 + return -EINVAL;
6602 + elfcorehdr_addr = memparse(arg, &end);
6603 + return end > arg ? 0 : -EINVAL;
6604 }
6605 -
6606 -static __init void parse_cmdline_early (char ** cmdline_p)
6607 -{
6608 - char c = ' ', *to = command_line, *from = COMMAND_LINE;
6609 - int len = 0;
6610 - int userdef = 0;
6611 -
6612 - for (;;) {
6613 - if (c != ' ')
6614 - goto next_char;
6615 -
6616 -#ifdef CONFIG_SMP
6617 - /*
6618 - * If the BIOS enumerates physical processors before logical,
6619 - * maxcpus=N at enumeration-time can be used to disable HT.
6620 - */
6621 - else if (!memcmp(from, "maxcpus=", 8)) {
6622 - extern unsigned int maxcpus;
6623 -
6624 - maxcpus = simple_strtoul(from + 8, NULL, 0);
6625 - }
6626 -#endif
6627 -#ifdef CONFIG_ACPI
6628 - /* "acpi=off" disables both ACPI table parsing and interpreter init */
6629 - if (fullarg(from,"acpi=off"))
6630 - disable_acpi();
6631 -
6632 - if (fullarg(from, "acpi=force")) {
6633 - /* add later when we do DMI horrors: */
6634 - acpi_force = 1;
6635 - acpi_disabled = 0;
6636 - }
6637 -
6638 - /* acpi=ht just means: do ACPI MADT parsing
6639 - at bootup, but don't enable the full ACPI interpreter */
6640 - if (fullarg(from, "acpi=ht")) {
6641 - if (!acpi_force)
6642 - disable_acpi();
6643 - acpi_ht = 1;
6644 - }
6645 - else if (fullarg(from, "pci=noacpi"))
6646 - acpi_disable_pci();
6647 - else if (fullarg(from, "acpi=noirq"))
6648 - acpi_noirq_set();
6649 -
6650 - else if (fullarg(from, "acpi_sci=edge"))
6651 - acpi_sci_flags.trigger = 1;
6652 - else if (fullarg(from, "acpi_sci=level"))
6653 - acpi_sci_flags.trigger = 3;
6654 - else if (fullarg(from, "acpi_sci=high"))
6655 - acpi_sci_flags.polarity = 1;
6656 - else if (fullarg(from, "acpi_sci=low"))
6657 - acpi_sci_flags.polarity = 3;
6658 -
6659 - /* acpi=strict disables out-of-spec workarounds */
6660 - else if (fullarg(from, "acpi=strict")) {
6661 - acpi_strict = 1;
6662 - }
6663 -#ifdef CONFIG_X86_IO_APIC
6664 - else if (fullarg(from, "acpi_skip_timer_override"))
6665 - acpi_skip_timer_override = 1;
6666 -#endif
6667 -#endif
6668 -
6669 -#ifndef CONFIG_XEN
6670 - if (fullarg(from, "nolapic") || fullarg(from, "disableapic")) {
6671 - clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
6672 - disable_apic = 1;
6673 - }
6674 -
6675 - if (fullarg(from, "noapic"))
6676 - skip_ioapic_setup = 1;
6677 -
6678 - if (fullarg(from,"apic")) {
6679 - skip_ioapic_setup = 0;
6680 - ioapic_force = 1;
6681 - }
6682 -#endif
6683 -
6684 - if (!memcmp(from, "mem=", 4))
6685 - parse_memopt(from+4, &from);
6686 -
6687 - if (!memcmp(from, "memmap=", 7)) {
6688 - /* exactmap option is for used defined memory */
6689 - if (!memcmp(from+7, "exactmap", 8)) {
6690 -#ifdef CONFIG_CRASH_DUMP
6691 - /* If we are doing a crash dump, we
6692 - * still need to know the real mem
6693 - * size before original memory map is
6694 - * reset.
6695 - */
6696 - saved_max_pfn = e820_end_of_ram();
6697 -#endif
6698 - from += 8+7;
6699 - end_pfn_map = 0;
6700 - e820.nr_map = 0;
6701 - userdef = 1;
6702 - }
6703 - else {
6704 - parse_memmapopt(from+7, &from);
6705 - userdef = 1;
6706 - }
6707 - }
6708 -
6709 -#ifdef CONFIG_NUMA
6710 - if (!memcmp(from, "numa=", 5))
6711 - numa_setup(from+5);
6712 +early_param("elfcorehdr", setup_elfcorehdr);
6713 #endif
6714
6715 - if (!memcmp(from,"iommu=",6)) {
6716 - iommu_setup(from+6);
6717 - }
6718 -
6719 - if (fullarg(from,"oops=panic"))
6720 - panic_on_oops = 1;
6721 -
6722 - if (!memcmp(from, "noexec=", 7))
6723 - nonx_setup(from + 7);
6724 -
6725 -#ifdef CONFIG_KEXEC
6726 - /* crashkernel=size@addr specifies the location to reserve for
6727 - * a crash kernel. By reserving this memory we guarantee
6728 - * that linux never set's it up as a DMA target.
6729 - * Useful for holding code to do something appropriate
6730 - * after a kernel panic.
6731 - */
6732 - else if (!memcmp(from, "crashkernel=", 12)) {
6733 -#ifndef CONFIG_XEN
6734 - unsigned long size, base;
6735 - size = memparse(from+12, &from);
6736 - if (*from == '@') {
6737 - base = memparse(from+1, &from);
6738 - /* FIXME: Do I want a sanity check
6739 - * to validate the memory range?
6740 - */
6741 - crashk_res.start = base;
6742 - crashk_res.end = base + size - 1;
6743 - }
6744 -#else
6745 - printk("Ignoring crashkernel command line, "
6746 - "parameter will be supplied by xen\n");
6747 -#endif
6748 - }
6749 -#endif
6750 -
6751 -#ifdef CONFIG_PROC_VMCORE
6752 - /* elfcorehdr= specifies the location of elf core header
6753 - * stored by the crashed kernel. This option will be passed
6754 - * by kexec loader to the capture kernel.
6755 - */
6756 - else if(!memcmp(from, "elfcorehdr=", 11))
6757 - elfcorehdr_addr = memparse(from+11, &from);
6758 -#endif
6759 -
6760 -#if defined(CONFIG_HOTPLUG_CPU) && !defined(CONFIG_XEN)
6761 - else if (!memcmp(from, "additional_cpus=", 16))
6762 - setup_additional_cpus(from+16);
6763 -#endif
6764 -
6765 - next_char:
6766 - c = *(from++);
6767 - if (!c)
6768 - break;
6769 - if (COMMAND_LINE_SIZE <= ++len)
6770 - break;
6771 - *(to++) = c;
6772 - }
6773 - if (userdef) {
6774 - printk(KERN_INFO "user-defined physical RAM map:\n");
6775 - e820_print_map("user");
6776 - }
6777 - *to = '\0';
6778 - *cmdline_p = command_line;
6779 -}
6780 -
6781 #ifndef CONFIG_NUMA
6782 static void __init
6783 contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
6784 @@ -521,10 +337,11 @@
6785 if (bootmap == -1L)
6786 panic("Cannot find bootmem map of size %ld\n",bootmap_size);
6787 bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
6788 + e820_register_active_regions(0, start_pfn, end_pfn);
6789 #ifdef CONFIG_XEN
6790 - e820_bootmem_free(NODE_DATA(0), 0, xen_start_info->nr_pages<<PAGE_SHIFT);
6791 + free_bootmem_with_active_regions(0, xen_start_info->nr_pages);
6792 #else
6793 - e820_bootmem_free(NODE_DATA(0), 0, end_pfn << PAGE_SHIFT);
6794 + free_bootmem_with_active_regions(0, end_pfn);
6795 #endif
6796 reserve_bootmem(bootmap, bootmap_size);
6797 }
6798 @@ -587,6 +404,10 @@
6799 void __init setup_arch(char **cmdline_p)
6800 {
6801 #ifdef CONFIG_XEN
6802 + extern struct e820map machine_e820;
6803 +
6804 + printk(KERN_INFO "Command line: %s\n", saved_command_line);
6805 +
6806 /* Register a call for panic conditions. */
6807 atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
6808
6809 @@ -612,6 +433,8 @@
6810
6811 ARCH_SETUP
6812 #else
6813 + printk(KERN_INFO "Command line: %s\n", saved_command_line);
6814 +
6815 ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
6816 screen_info = SCREEN_INFO;
6817 edid_info = EDID_INFO;
6818 @@ -639,16 +462,22 @@
6819 data_resource.start = virt_to_phys(&_etext);
6820 data_resource.end = virt_to_phys(&_edata)-1;
6821
6822 - parse_cmdline_early(cmdline_p);
6823 -
6824 early_identify_cpu(&boot_cpu_data);
6825
6826 + strlcpy(command_line, saved_command_line, COMMAND_LINE_SIZE);
6827 + *cmdline_p = command_line;
6828 +
6829 + parse_early_param();
6830 +
6831 + finish_e820_parsing();
6832 +
6833 + e820_register_active_regions(0, 0, -1UL);
6834 /*
6835 * partially used pages are not usable - thus
6836 * we are rounding upwards:
6837 */
6838 end_pfn = e820_end_of_ram();
6839 - num_physpages = end_pfn; /* for pfn_valid */
6840 + num_physpages = end_pfn;
6841
6842 check_efer();
6843
6844 @@ -659,6 +488,14 @@
6845 if (is_initial_xendomain())
6846 dmi_scan_machine();
6847
6848 + /* How many end-of-memory variables you have, grandma! */
6849 + max_low_pfn = end_pfn;
6850 + max_pfn = end_pfn;
6851 + high_memory = (void *)__va(end_pfn * PAGE_SIZE - 1) + 1;
6852 +
6853 + /* Remove active ranges so rediscovery with NUMA-awareness happens */
6854 + remove_all_active_ranges();
6855 +
6856 #ifdef CONFIG_ACPI_NUMA
6857 /*
6858 * Parse SRAT to discover nodes.
6859 @@ -848,16 +685,16 @@
6860 BUG();
6861 }
6862
6863 +#ifdef CONFIG_ACPI
6864 if (!is_initial_xendomain()) {
6865 acpi_disabled = 1;
6866 -#ifdef CONFIG_ACPI
6867 acpi_ht = 0;
6868 -#endif
6869 }
6870 #endif
6871 +#endif
6872
6873 -#ifndef CONFIG_XEN
6874 - check_ioapic();
6875 +#if defined(CONFIG_PCI) && !defined(CONFIG_XEN)
6876 + early_quirks();
6877 #endif
6878
6879 zap_low_mappings(0);
6880 @@ -917,6 +754,7 @@
6881 }
6882 #else
6883 e820_reserve_resources(e820.map, e820.nr_map);
6884 + e820_mark_nosave_regions();
6885 #endif
6886
6887 request_resource(&iomem_resource, &video_ram_resource);
6888 @@ -924,7 +762,7 @@
6889 {
6890 unsigned i;
6891 /* request I/O space for devices used on all i[345]86 PCs */
6892 - for (i = 0; i < STANDARD_IO_RESOURCES; i++)
6893 + for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
6894 request_resource(&ioport_resource, &standard_io_resources[i]);
6895 }
6896
6897 @@ -1108,7 +946,7 @@
6898 #endif
6899 }
6900
6901 -static void __init init_amd(struct cpuinfo_x86 *c)
6902 +static void __cpuinit init_amd(struct cpuinfo_x86 *c)
6903 {
6904 unsigned level;
6905
6906 @@ -1164,6 +1002,12 @@
6907
6908 /* Fix cpuid4 emulation for more */
6909 num_cache_leaves = 3;
6910 +
6911 + /* When there is only one core no need to synchronize RDTSC */
6912 + if (num_possible_cpus() == 1)
6913 + set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
6914 + else
6915 + clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
6916 }
6917
6918 static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
6919 @@ -1245,8 +1089,7 @@
6920 node = first_node(node_online_map);
6921 numa_set_node(cpu, node);
6922
6923 - if (acpi_numa > 0)
6924 - printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
6925 + printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
6926 #endif
6927 }
6928
6929 @@ -1280,6 +1123,8 @@
6930 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
6931 (c->x86 == 0x6 && c->x86_model >= 0x0e))
6932 set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
6933 + if (c->x86 == 6)
6934 + set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
6935 set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
6936 c->x86_max_cores = intel_num_cpu_cores(c);
6937
6938 @@ -1498,8 +1343,8 @@
6939
6940 /* Intel-defined (#2) */
6941 "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
6942 - "tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL,
6943 - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
6944 + "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
6945 + NULL, NULL, "dca", NULL, NULL, NULL, NULL, NULL,
6946 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
6947
6948 /* VIA/Cyrix/Centaur-defined */
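
Illustration, not part of the patch: both setup files drop the hand-counted STANDARD_IO_RESOURCES / ADAPTER_ROM_RESOURCES macros in favour of ARRAY_SIZE(), so the loop bound tracks the initializer automatically. A self-contained sketch (the table contents are made up; the macro shown matches the kernel's definition of that era):

#include <stdio.h>

#define ARRAY_SIZE(arr)	(sizeof(arr) / sizeof((arr)[0]))

static const char *demo_resources[] = { "dma1", "pic1", "timer", "keyboard" };

int main(void)
{
	unsigned int i;

	/* adding or removing table entries needs no other change */
	for (i = 0; i < ARRAY_SIZE(demo_resources); i++)
		printf("requesting %s\n", demo_resources[i]);
	return 0;
}
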
6949 --- a/arch/x86/kernel/smp_32-xen.c
6950 +++ b/arch/x86/kernel/smp_32-xen.c
6951 @@ -279,8 +279,7 @@
6952 * 2) Leave the mm if we are in the lazy tlb mode.
6953 */
6954
6955 -irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id,
6956 - struct pt_regs *regs)
6957 +irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id)
6958 {
6959 unsigned long cpu;
6960
6961 @@ -567,16 +566,14 @@
6962 * all the work is done automatically when
6963 * we return from the interrupt.
6964 */
6965 -irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id,
6966 - struct pt_regs *regs)
6967 +irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id)
6968 {
6969
6970 return IRQ_HANDLED;
6971 }
6972
6973 #include <linux/kallsyms.h>
6974 -irqreturn_t smp_call_function_interrupt(int irq, void *dev_id,
6975 - struct pt_regs *regs)
6976 +irqreturn_t smp_call_function_interrupt(int irq, void *dev_id)
6977 {
6978 void (*func) (void *info) = call_data->func;
6979 void *info = call_data->info;
6980 @@ -603,3 +600,69 @@
6981 return IRQ_HANDLED;
6982 }
6983
6984 +/*
6985 + * this function sends a 'generic call function' IPI to one other CPU
6986 + * in the system.
6987 + *
6988 + * cpu is a standard Linux logical CPU number.
6989 + */
6990 +static void
6991 +__smp_call_function_single(int cpu, void (*func) (void *info), void *info,
6992 + int nonatomic, int wait)
6993 +{
6994 + struct call_data_struct data;
6995 + int cpus = 1;
6996 +
6997 + data.func = func;
6998 + data.info = info;
6999 + atomic_set(&data.started, 0);
7000 + data.wait = wait;
7001 + if (wait)
7002 + atomic_set(&data.finished, 0);
7003 +
7004 + call_data = &data;
7005 + wmb();
7006 + /* Send a message to all other CPUs and wait for them to respond */
7007 + send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR);
7008 +
7009 + /* Wait for response */
7010 + while (atomic_read(&data.started) != cpus)
7011 + cpu_relax();
7012 +
7013 + if (!wait)
7014 + return;
7015 +
7016 + while (atomic_read(&data.finished) != cpus)
7017 + cpu_relax();
7018 +}
7019 +
7020 +/*
7021 + * smp_call_function_single - Run a function on another CPU
7022 + * @func: The function to run. This must be fast and non-blocking.
7023 + * @info: An arbitrary pointer to pass to the function.
7024 + * @nonatomic: Currently unused.
7025 + * @wait: If true, wait until function has completed on other CPUs.
7026 + *
7027 + * Returns 0 on success, else a negative status code.
7028 + *
7029 + * Does not return until the remote CPU is nearly ready to execute <func>,
7030 + * is executing it, or has already executed it.
7031 + */
7032 +
7033 +int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
7034 + int nonatomic, int wait)
7035 +{
7036 + /* prevent preemption and reschedule on another processor */
7037 + int me = get_cpu();
7038 + if (cpu == me) {
7039 + WARN_ON(1);
7040 + put_cpu();
7041 + return -EBUSY;
7042 + }
7043 + spin_lock_bh(&call_lock);
7044 + __smp_call_function_single(cpu, func, info, nonatomic, wait);
7045 + spin_unlock_bh(&call_lock);
7046 + put_cpu();
7047 + return 0;
7048 +}
7049 +EXPORT_SYMBOL(smp_call_function_single);
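
Illustration, not part of the patch: a sketch of a caller for the smp_call_function_single() routine added above. do_demo_work() and demo_kick_cpu() are made up; the remote function runs in interrupt context, and targeting the calling CPU returns -EBUSY in this 32-bit version:

static void do_demo_work(void *info)
{
	/* executed on the chosen CPU from the call-function IPI */
}

static int demo_kick_cpu(int cpu)
{
	return smp_call_function_single(cpu, do_demo_work, NULL,
					0 /* nonatomic, unused */,
					1 /* wait for completion */);
}
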
7050 --- a/arch/x86/kernel/smp_64-xen.c
7051 +++ b/arch/x86/kernel/smp_64-xen.c
7052 @@ -381,9 +381,8 @@
7053 /* prevent preemption and reschedule on another processor */
7054 int me = get_cpu();
7055 if (cpu == me) {
7056 - WARN_ON(1);
7057 put_cpu();
7058 - return -EBUSY;
7059 + return 0;
7060 }
7061 spin_lock_bh(&call_lock);
7062 __smp_call_function_single(cpu, func, info, nonatomic, wait);
7063 @@ -501,7 +500,7 @@
7064 #ifndef CONFIG_XEN
7065 asmlinkage void smp_reschedule_interrupt(void)
7066 #else
7067 -asmlinkage irqreturn_t smp_reschedule_interrupt(void)
7068 +asmlinkage irqreturn_t smp_reschedule_interrupt(int irq, void *ctx)
7069 #endif
7070 {
7071 #ifndef CONFIG_XEN
7072 @@ -514,7 +513,7 @@
7073 #ifndef CONFIG_XEN
7074 asmlinkage void smp_call_function_interrupt(void)
7075 #else
7076 -asmlinkage irqreturn_t smp_call_function_interrupt(void)
7077 +asmlinkage irqreturn_t smp_call_function_interrupt(int irq, void *ctx)
7078 #endif
7079 {
7080 void (*func) (void *info) = call_data->func;
7081 @@ -545,31 +544,3 @@
7082 return IRQ_HANDLED;
7083 #endif
7084 }
7085 -
7086 -int safe_smp_processor_id(void)
7087 -{
7088 -#ifdef CONFIG_XEN
7089 - return smp_processor_id();
7090 -#else
7091 - unsigned apicid, i;
7092 -
7093 - if (disable_apic)
7094 - return 0;
7095 -
7096 - apicid = hard_smp_processor_id();
7097 - if (apicid < NR_CPUS && x86_cpu_to_apicid[apicid] == apicid)
7098 - return apicid;
7099 -
7100 - for (i = 0; i < NR_CPUS; ++i) {
7101 - if (x86_cpu_to_apicid[i] == apicid)
7102 - return i;
7103 - }
7104 -
7105 - /* No entries in x86_cpu_to_apicid? Either no MPS|ACPI,
7106 - * or called too early. Either way, we must be CPU 0. */
7107 - if (x86_cpu_to_apicid[0] == BAD_APICID)
7108 - return 0;
7109 -
7110 - return 0; /* Should not happen */
7111 -#endif
7112 -}
7113 --- a/arch/x86/kernel/time_32-xen.c
7114 +++ b/arch/x86/kernel/time_32-xen.c
7115 @@ -89,7 +89,6 @@
7116 unsigned long vxtime_hz = PIT_TICK_RATE;
7117 struct vxtime_data __vxtime __section_vxtime; /* for vsyscalls */
7118 volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
7119 -unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES;
7120 struct timespec __xtime __section_xtime;
7121 struct timezone __sys_tz __section_sys_tz;
7122 #endif
7123 @@ -97,8 +96,6 @@
7124 unsigned int cpu_khz; /* Detected as we calibrate the TSC */
7125 EXPORT_SYMBOL(cpu_khz);
7126
7127 -extern unsigned long wall_jiffies;
7128 -
7129 DEFINE_SPINLOCK(rtc_lock);
7130 EXPORT_SYMBOL(rtc_lock);
7131
7132 @@ -265,11 +262,10 @@
7133 time_t wtm_sec, xtime_sec;
7134 u64 tmp, wc_nsec;
7135
7136 - /* Adjust wall-clock time base based on wall_jiffies ticks. */
7137 + /* Adjust wall-clock time base. */
7138 wc_nsec = processed_system_time;
7139 wc_nsec += sec * (u64)NSEC_PER_SEC;
7140 wc_nsec += nsec;
7141 - wc_nsec -= (jiffies - wall_jiffies) * (u64)NS_PER_TICK;
7142
7143 /* Split wallclock base into seconds and nanoseconds. */
7144 tmp = wc_nsec;
7145 @@ -387,16 +383,10 @@
7146 shadow = &per_cpu(shadow_time, cpu);
7147
7148 do {
7149 - unsigned long lost;
7150 -
7151 local_time_version = shadow->version;
7152 seq = read_seqbegin(&xtime_lock);
7153
7154 usec = get_usec_offset(shadow);
7155 - lost = jiffies - wall_jiffies;
7156 -
7157 - if (unlikely(lost))
7158 - usec += lost * (USEC_PER_SEC / HZ);
7159
7160 sec = xtime.tv_sec;
7161 usec += (xtime.tv_nsec / NSEC_PER_USEC);
7162 @@ -519,7 +509,7 @@
7163 write_seqlock_irq(&xtime_lock);
7164
7165 sec = xtime.tv_sec;
7166 - nsec = xtime.tv_nsec + ((jiffies - wall_jiffies) * (u64)NS_PER_TICK);
7167 + nsec = xtime.tv_nsec;
7168 __normalize_time(&sec, &nsec);
7169
7170 op.cmd = XENPF_settime;
7171 @@ -593,42 +583,49 @@
7172 }
7173 #endif
7174
7175 -#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
7176 unsigned long profile_pc(struct pt_regs *regs)
7177 {
7178 unsigned long pc = instruction_pointer(regs);
7179
7180 -#ifdef __x86_64__
7181 - /* Assume the lock function has either no stack frame or only a single word.
7182 - This checks if the address on the stack looks like a kernel text address.
7183 - There is a small window for false hits, but in that case the tick
7184 - is just accounted to the spinlock function.
7185 - Better would be to write these functions in assembler again
7186 - and check exactly. */
7187 +#if defined(CONFIG_SMP) || defined(__x86_64__)
7188 if (!user_mode_vm(regs) && in_lock_functions(pc)) {
7189 - char *v = *(char **)regs->rsp;
7190 - if ((v >= _stext && v <= _etext) ||
7191 - (v >= _sinittext && v <= _einittext) ||
7192 - (v >= (char *)MODULES_VADDR && v <= (char *)MODULES_END))
7193 - return (unsigned long)v;
7194 - return ((unsigned long *)regs->rsp)[1];
7195 +# ifdef CONFIG_FRAME_POINTER
7196 +# ifdef __i386__
7197 + return ((unsigned long *)regs->ebp)[1];
7198 +# else
7199 + return ((unsigned long *)regs->rbp)[1];
7200 +# endif
7201 +# else
7202 +# ifdef __i386__
7203 + unsigned long *sp;
7204 + if ((regs->xcs & 2) == 0)
7205 + sp = (unsigned long *)&regs->esp;
7206 + else
7207 + sp = (unsigned long *)regs->esp;
7208 +# else
7209 + unsigned long *sp = (unsigned long *)regs->rsp;
7210 +# endif
7211 + /* Return address is either directly at stack pointer
7212 + or above a saved eflags. Eflags has bits 22-31 zero,
7213 + kernel addresses don't. */
7214 + if (sp[0] >> 22)
7215 + return sp[0];
7216 + if (sp[1] >> 22)
7217 + return sp[1];
7218 +# endif
7219 }
7220 -#else
7221 - if (!user_mode_vm(regs) && in_lock_functions(pc))
7222 - return *(unsigned long *)(regs->ebp + 4);
7223 #endif
7224
7225 return pc;
7226 }
7227 EXPORT_SYMBOL(profile_pc);
7228 -#endif
7229
7230 /*
7231 * This is the same as the above, except we _also_ save the current
7232 * Time Stamp Counter value at the time of the timer interrupt, so that
7233 * we later on can estimate the time of day more exactly.
7234 */
7235 -irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
7236 +irqreturn_t timer_interrupt(int irq, void *dev_id)
7237 {
7238 s64 delta, delta_cpu, stolen, blocked;
7239 u64 sched_time;
7240 @@ -686,10 +683,14 @@
7241 }
7242
7243 /* System-wide jiffy work. */
7244 - while (delta >= NS_PER_TICK) {
7245 - delta -= NS_PER_TICK;
7246 - processed_system_time += NS_PER_TICK;
7247 - do_timer(regs);
7248 + if (delta >= NS_PER_TICK) {
7249 + do_div(delta, NS_PER_TICK);
7250 + processed_system_time += delta * NS_PER_TICK;
7251 + while (delta > HZ) {
7252 + do_timer(HZ);
7253 + delta -= HZ;
7254 + }
7255 + do_timer(delta);
7256 }
7257
7258 if (shadow_tv_version != HYPERVISOR_shared_info->wc_version) {
7259 @@ -734,7 +735,7 @@
7260 if (delta_cpu > 0) {
7261 do_div(delta_cpu, NS_PER_TICK);
7262 per_cpu(processed_system_time, cpu) += delta_cpu * NS_PER_TICK;
7263 - if (user_mode_vm(regs))
7264 + if (user_mode_vm(get_irq_regs()))
7265 account_user_time(current, (cputime_t)delta_cpu);
7266 else
7267 account_system_time(current, HARDIRQ_OFFSET,
7268 @@ -748,10 +749,10 @@
7269 /* Local timer processing (see update_process_times()). */
7270 run_local_timers();
7271 if (rcu_pending(cpu))
7272 - rcu_check_callbacks(cpu, user_mode_vm(regs));
7273 + rcu_check_callbacks(cpu, user_mode_vm(get_irq_regs()));
7274 scheduler_tick();
7275 run_posix_cpu_timers(current);
7276 - profile_tick(CPU_PROFILING, regs);
7277 + profile_tick(CPU_PROFILING);
7278
7279 return IRQ_HANDLED;
7280 }
7281 @@ -959,10 +960,11 @@
7282 /* Duplicate of time_init() below, with hpet_enable part added */
7283 static void __init hpet_time_init(void)
7284 {
7285 - xtime.tv_sec = get_cmos_time();
7286 - xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
7287 - set_normalized_timespec(&wall_to_monotonic,
7288 - -xtime.tv_sec, -xtime.tv_nsec);
7289 + struct timespec ts;
7290 + ts.tv_sec = get_cmos_time();
7291 + ts.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
7292 +
7293 + do_settimeofday(&ts);
7294
7295 if ((hpet_enable() >= 0) && hpet_use_timer) {
7296 printk("Using HPET for base-timer\n");
7297 --- a/arch/x86/kernel/traps_32-xen.c
7298 +++ b/arch/x86/kernel/traps_32-xen.c
7299 @@ -28,6 +28,7 @@
7300 #include <linux/kprobes.h>
7301 #include <linux/kexec.h>
7302 #include <linux/unwind.h>
7303 +#include <linux/uaccess.h>
7304
7305 #ifdef CONFIG_EISA
7306 #include <linux/ioport.h>
7307 @@ -40,7 +41,6 @@
7308
7309 #include <asm/processor.h>
7310 #include <asm/system.h>
7311 -#include <asm/uaccess.h>
7312 #include <asm/io.h>
7313 #include <asm/atomic.h>
7314 #include <asm/debugreg.h>
7315 @@ -51,11 +51,14 @@
7316 #include <asm/smp.h>
7317 #include <asm/arch_hooks.h>
7318 #include <asm/kdebug.h>
7319 +#include <asm/stacktrace.h>
7320
7321 #include <linux/module.h>
7322
7323 #include "mach_traps.h"
7324
7325 +int panic_on_unrecovered_nmi;
7326 +
7327 asmlinkage int system_call(void);
7328
7329 struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 },
7330 @@ -124,62 +127,63 @@
7331 p < (void *)tinfo + THREAD_SIZE - 3;
7332 }
7333
7334 -/*
7335 - * Print one address/symbol entries per line.
7336 - */
7337 -static inline void print_addr_and_symbol(unsigned long addr, char *log_lvl)
7338 -{
7339 - printk(" [<%08lx>] ", addr);
7340 -
7341 - print_symbol("%s\n", addr);
7342 -}
7343 -
7344 static inline unsigned long print_context_stack(struct thread_info *tinfo,
7345 unsigned long *stack, unsigned long ebp,
7346 - char *log_lvl)
7347 + struct stacktrace_ops *ops, void *data)
7348 {
7349 unsigned long addr;
7350
7351 #ifdef CONFIG_FRAME_POINTER
7352 while (valid_stack_ptr(tinfo, (void *)ebp)) {
7353 + unsigned long new_ebp;
7354 addr = *(unsigned long *)(ebp + 4);
7355 - print_addr_and_symbol(addr, log_lvl);
7356 + ops->address(data, addr);
7357 /*
7358 * break out of recursive entries (such as
7359 - * end_of_stack_stop_unwind_function):
7360 + * end_of_stack_stop_unwind_function). Also,
7361 + * we can never allow a frame pointer to
7362 + * move downwards!
7363 */
7364 - if (ebp == *(unsigned long *)ebp)
7365 + new_ebp = *(unsigned long *)ebp;
7366 + if (new_ebp <= ebp)
7367 break;
7368 - ebp = *(unsigned long *)ebp;
7369 + ebp = new_ebp;
7370 }
7371 #else
7372 while (valid_stack_ptr(tinfo, stack)) {
7373 addr = *stack++;
7374 if (__kernel_text_address(addr))
7375 - print_addr_and_symbol(addr, log_lvl);
7376 + ops->address(data, addr);
7377 }
7378 #endif
7379 return ebp;
7380 }
7381
7382 +struct ops_and_data {
7383 + struct stacktrace_ops *ops;
7384 + void *data;
7385 +};
7386 +
7387 static asmlinkage int
7388 -show_trace_unwind(struct unwind_frame_info *info, void *log_lvl)
7389 +dump_trace_unwind(struct unwind_frame_info *info, void *data)
7390 {
7391 + struct ops_and_data *oad = (struct ops_and_data *)data;
7392 int n = 0;
7393
7394 while (unwind(info) == 0 && UNW_PC(info)) {
7395 n++;
7396 - print_addr_and_symbol(UNW_PC(info), log_lvl);
7397 + oad->ops->address(oad->data, UNW_PC(info));
7398 if (arch_unw_user_mode(info))
7399 break;
7400 }
7401 return n;
7402 }
7403
7404 -static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
7405 - unsigned long *stack, char *log_lvl)
7406 +void dump_trace(struct task_struct *task, struct pt_regs *regs,
7407 + unsigned long *stack,
7408 + struct stacktrace_ops *ops, void *data)
7409 {
7410 - unsigned long ebp;
7411 + unsigned long ebp = 0;
7412
7413 if (!task)
7414 task = current;
7415 @@ -187,54 +191,116 @@
7416 if (call_trace >= 0) {
7417 int unw_ret = 0;
7418 struct unwind_frame_info info;
7419 + struct ops_and_data oad = { .ops = ops, .data = data };
7420
7421 if (regs) {
7422 if (unwind_init_frame_info(&info, task, regs) == 0)
7423 - unw_ret = show_trace_unwind(&info, log_lvl);
7424 + unw_ret = dump_trace_unwind(&info, &oad);
7425 } else if (task == current)
7426 - unw_ret = unwind_init_running(&info, show_trace_unwind, log_lvl);
7427 + unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad);
7428 else {
7429 if (unwind_init_blocked(&info, task) == 0)
7430 - unw_ret = show_trace_unwind(&info, log_lvl);
7431 + unw_ret = dump_trace_unwind(&info, &oad);
7432 }
7433 if (unw_ret > 0) {
7434 if (call_trace == 1 && !arch_unw_user_mode(&info)) {
7435 - print_symbol("DWARF2 unwinder stuck at %s\n",
7436 + ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n",
7437 UNW_PC(&info));
7438 if (UNW_SP(&info) >= PAGE_OFFSET) {
7439 - printk("Leftover inexact backtrace:\n");
7440 + ops->warning(data, "Leftover inexact backtrace:\n");
7441 stack = (void *)UNW_SP(&info);
7442 + if (!stack)
7443 + return;
7444 + ebp = UNW_FP(&info);
7445 } else
7446 - printk("Full inexact backtrace again:\n");
7447 + ops->warning(data, "Full inexact backtrace again:\n");
7448 } else if (call_trace >= 1)
7449 return;
7450 else
7451 - printk("Full inexact backtrace again:\n");
7452 + ops->warning(data, "Full inexact backtrace again:\n");
7453 } else
7454 - printk("Inexact backtrace:\n");
7455 + ops->warning(data, "Inexact backtrace:\n");
7456 }
7457 -
7458 - if (task == current) {
7459 - /* Grab ebp right from our regs */
7460 - asm ("movl %%ebp, %0" : "=r" (ebp) : );
7461 - } else {
7462 - /* ebp is the last reg pushed by switch_to */
7463 - ebp = *(unsigned long *) task->thread.esp;
7464 + if (!stack) {
7465 + unsigned long dummy;
7466 + stack = &dummy;
7467 + if (task && task != current)
7468 + stack = (unsigned long *)task->thread.esp;
7469 + }
7470 +
7471 +#ifdef CONFIG_FRAME_POINTER
7472 + if (!ebp) {
7473 + if (task == current) {
7474 + /* Grab ebp right from our regs */
7475 + asm ("movl %%ebp, %0" : "=r" (ebp) : );
7476 + } else {
7477 + /* ebp is the last reg pushed by switch_to */
7478 + ebp = *(unsigned long *) task->thread.esp;
7479 + }
7480 }
7481 +#endif
7482
7483 while (1) {
7484 struct thread_info *context;
7485 context = (struct thread_info *)
7486 ((unsigned long)stack & (~(THREAD_SIZE - 1)));
7487 - ebp = print_context_stack(context, stack, ebp, log_lvl);
7488 + ebp = print_context_stack(context, stack, ebp, ops, data);
7489 + /* Should be after the line below, but somewhere
7490 + in early boot context comes out corrupted and we
7491 + can't reference it -AK */
7492 + if (ops->stack(data, "IRQ") < 0)
7493 + break;
7494 stack = (unsigned long*)context->previous_esp;
7495 if (!stack)
7496 break;
7497 - printk("%s =======================\n", log_lvl);
7498 }
7499 }
7500 +EXPORT_SYMBOL(dump_trace);
7501
7502 -void show_trace(struct task_struct *task, struct pt_regs *regs, unsigned long * stack)
7503 +static void
7504 +print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
7505 +{
7506 + printk(data);
7507 + print_symbol(msg, symbol);
7508 + printk("\n");
7509 +}
7510 +
7511 +static void print_trace_warning(void *data, char *msg)
7512 +{
7513 + printk("%s%s\n", (char *)data, msg);
7514 +}
7515 +
7516 +static int print_trace_stack(void *data, char *name)
7517 +{
7518 + return 0;
7519 +}
7520 +
7521 +/*
7522 + * Print one address/symbol entries per line.
7523 + */
7524 +static void print_trace_address(void *data, unsigned long addr)
7525 +{
7526 + printk("%s [<%08lx>] ", (char *)data, addr);
7527 + print_symbol("%s\n", addr);
7528 +}
7529 +
7530 +static struct stacktrace_ops print_trace_ops = {
7531 + .warning = print_trace_warning,
7532 + .warning_symbol = print_trace_warning_symbol,
7533 + .stack = print_trace_stack,
7534 + .address = print_trace_address,
7535 +};
7536 +
7537 +static void
7538 +show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
7539 + unsigned long * stack, char *log_lvl)
7540 +{
7541 + dump_trace(task, regs, stack, &print_trace_ops, log_lvl);
7542 + printk("%s =======================\n", log_lvl);
7543 +}
7544 +
7545 +void show_trace(struct task_struct *task, struct pt_regs *regs,
7546 + unsigned long * stack)
7547 {
7548 show_trace_log_lvl(task, regs, stack, "");
7549 }
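
Illustration, not part of the patch: show_trace_log_lvl() is now just one consumer of the new dump_trace()/stacktrace_ops interface. A hedged sketch of another consumer that only counts text addresses (count_address(), demo_trace_depth() and the no-op callbacks are made up; passing NULL for regs and stack lets dump_trace() locate the current stack itself, as the rewritten code above allows):

#include <linux/sched.h>
#include <asm/stacktrace.h>

static void demo_warning(void *data, char *msg) { }
static void demo_warning_symbol(void *data, char *msg, unsigned long sym) { }
static int demo_stack(void *data, char *name) { return 0; }

static void count_address(void *data, unsigned long addr)
{
	++*(unsigned int *)data;	/* one hit per kernel text address */
}

static struct stacktrace_ops count_ops = {
	.warning	= demo_warning,
	.warning_symbol	= demo_warning_symbol,
	.stack		= demo_stack,
	.address	= count_address,
};

static unsigned int demo_trace_depth(void)
{
	unsigned int depth = 0;

	dump_trace(current, NULL, NULL, &count_ops, &depth);
	return depth;
}
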
7550 @@ -297,12 +363,13 @@
7551 ss = regs->xss & 0xffff;
7552 }
7553 print_modules();
7554 - printk(KERN_EMERG "CPU: %d\nEIP: %04x:[<%08lx>] %s VLI\n"
7555 - "EFLAGS: %08lx (%s %.*s) \n",
7556 + printk(KERN_EMERG "CPU: %d\n"
7557 + KERN_EMERG "EIP: %04x:[<%08lx>] %s VLI\n"
7558 + KERN_EMERG "EFLAGS: %08lx (%s %.*s)\n",
7559 smp_processor_id(), 0xffff & regs->xcs, regs->eip,
7560 - print_tainted(), regs->eflags, system_utsname.release,
7561 - (int)strcspn(system_utsname.version, " "),
7562 - system_utsname.version);
7563 + print_tainted(), regs->eflags, init_utsname()->release,
7564 + (int)strcspn(init_utsname()->version, " "),
7565 + init_utsname()->version);
7566 print_symbol(KERN_EMERG "EIP is at %s\n", regs->eip);
7567 printk(KERN_EMERG "eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n",
7568 regs->eax, regs->ebx, regs->ecx, regs->edx);
7569 @@ -319,6 +386,8 @@
7570 */
7571 if (in_kernel) {
7572 u8 __user *eip;
7573 + int code_bytes = 64;
7574 + unsigned char c;
7575
7576 printk("\n" KERN_EMERG "Stack: ");
7577 show_stack_log_lvl(NULL, regs, (unsigned long *)esp, KERN_EMERG);
7578 @@ -326,9 +395,12 @@
7579 printk(KERN_EMERG "Code: ");
7580
7581 eip = (u8 __user *)regs->eip - 43;
7582 - for (i = 0; i < 64; i++, eip++) {
7583 - unsigned char c;
7584 -
7585 + if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) {
7586 + /* try starting at EIP */
7587 + eip = (u8 __user *)regs->eip;
7588 + code_bytes = 32;
7589 + }
7590 + for (i = 0; i < code_bytes; i++, eip++) {
7591 if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) {
7592 printk(" Bad EIP value.");
7593 break;
7594 @@ -349,7 +421,7 @@
7595
7596 if (eip < PAGE_OFFSET)
7597 return;
7598 - if (__get_user(ud2, (unsigned short __user *)eip))
7599 + if (probe_kernel_address((unsigned short __user *)eip, ud2))
7600 return;
7601 if (ud2 != 0x0b0f)
7602 return;
7603 @@ -362,7 +434,8 @@
7604 char *file;
7605 char c;
7606
7607 - if (__get_user(line, (unsigned short __user *)(eip + 2)))
7608 + if (probe_kernel_address((unsigned short __user *)(eip + 2),
7609 + line))
7610 break;
7611 if (__get_user(file, (char * __user *)(eip + 4)) ||
7612 (unsigned long)file < PAGE_OFFSET || __get_user(c, file))
7613 @@ -604,18 +677,24 @@
7614 }
7615 }
7616
7617 -static void mem_parity_error(unsigned char reason, struct pt_regs * regs)
7618 +static __kprobes void
7619 +mem_parity_error(unsigned char reason, struct pt_regs * regs)
7620 {
7621 - printk(KERN_EMERG "Uhhuh. NMI received. Dazed and confused, but trying "
7622 - "to continue\n");
7623 + printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
7624 + "CPU %d.\n", reason, smp_processor_id());
7625 printk(KERN_EMERG "You probably have a hardware problem with your RAM "
7626 "chips\n");
7627 + if (panic_on_unrecovered_nmi)
7628 + panic("NMI: Not continuing");
7629 +
7630 + printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
7631
7632 /* Clear and disable the memory parity error line. */
7633 clear_mem_error(reason);
7634 }
7635
7636 -static void io_check_error(unsigned char reason, struct pt_regs * regs)
7637 +static __kprobes void
7638 +io_check_error(unsigned char reason, struct pt_regs * regs)
7639 {
7640 printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
7641 show_registers(regs);
7642 @@ -624,7 +703,8 @@
7643 clear_io_check_error(reason);
7644 }
7645
7646 -static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
7647 +static __kprobes void
7648 +unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
7649 {
7650 #ifdef CONFIG_MCA
7651 /* Might actually be able to figure out what the guilty party
7652 @@ -634,15 +714,18 @@
7653 return;
7654 }
7655 #endif
7656 - printk("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
7657 - reason, smp_processor_id());
7658 - printk("Dazed and confused, but trying to continue\n");
7659 - printk("Do you have a strange power saving mode enabled?\n");
7660 + printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
7661 + "CPU %d.\n", reason, smp_processor_id());
7662 + printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
7663 + if (panic_on_unrecovered_nmi)
7664 + panic("NMI: Not continuing");
7665 +
7666 + printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
7667 }
7668
7669 static DEFINE_SPINLOCK(nmi_print_lock);
7670
7671 -void die_nmi (struct pt_regs *regs, const char *msg)
7672 +void __kprobes die_nmi(struct pt_regs *regs, const char *msg)
7673 {
7674 if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) ==
7675 NOTIFY_STOP)
7676 @@ -674,7 +757,7 @@
7677 do_exit(SIGSEGV);
7678 }
7679
7680 -static void default_do_nmi(struct pt_regs * regs)
7681 +static __kprobes void default_do_nmi(struct pt_regs * regs)
7682 {
7683 unsigned char reason = 0;
7684
7685 @@ -691,12 +774,12 @@
7686 * Ok, so this is none of the documented NMI sources,
7687 * so it must be the NMI watchdog.
7688 */
7689 - if (nmi_watchdog) {
7690 - nmi_watchdog_tick(regs);
7691 + if (nmi_watchdog_tick(regs, reason))
7692 return;
7693 - }
7694 + if (!do_nmi_callback(regs, smp_processor_id()))
7695 #endif
7696 - unknown_nmi_error(reason, regs);
7697 + unknown_nmi_error(reason, regs);
7698 +
7699 return;
7700 }
7701 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
7702 @@ -712,14 +795,7 @@
7703 reassert_nmi();
7704 }
7705
7706 -static int dummy_nmi_callback(struct pt_regs * regs, int cpu)
7707 -{
7708 - return 0;
7709 -}
7710 -
7711 -static nmi_callback_t nmi_callback = dummy_nmi_callback;
7712 -
7713 -fastcall void do_nmi(struct pt_regs * regs, long error_code)
7714 +fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code)
7715 {
7716 int cpu;
7717
7718 @@ -729,25 +805,11 @@
7719
7720 ++nmi_count(cpu);
7721
7722 - if (!rcu_dereference(nmi_callback)(regs, cpu))
7723 - default_do_nmi(regs);
7724 + default_do_nmi(regs);
7725
7726 nmi_exit();
7727 }
7728
7729 -void set_nmi_callback(nmi_callback_t callback)
7730 -{
7731 - vmalloc_sync_all();
7732 - rcu_assign_pointer(nmi_callback, callback);
7733 -}
7734 -EXPORT_SYMBOL_GPL(set_nmi_callback);
7735 -
7736 -void unset_nmi_callback(void)
7737 -{
7738 - nmi_callback = dummy_nmi_callback;
7739 -}
7740 -EXPORT_SYMBOL_GPL(unset_nmi_callback);
7741 -
7742 #ifdef CONFIG_KPROBES
7743 fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code)
7744 {
7745 --- a/arch/x86/kernel/traps_64-xen.c
7746 +++ b/arch/x86/kernel/traps_64-xen.c
7747 @@ -23,6 +23,7 @@
7748 #include <linux/delay.h>
7749 #include <linux/spinlock.h>
7750 #include <linux/interrupt.h>
7751 +#include <linux/kallsyms.h>
7752 #include <linux/module.h>
7753 #include <linux/moduleparam.h>
7754 #include <linux/nmi.h>
7755 @@ -45,6 +46,7 @@
7756 #include <asm/pda.h>
7757 #include <asm/proto.h>
7758 #include <asm/nmi.h>
7759 +#include <asm/stacktrace.h>
7760
7761 asmlinkage void divide_error(void);
7762 asmlinkage void debug(void);
7763 @@ -114,7 +116,6 @@
7764 #endif
7765
7766 #ifdef CONFIG_KALLSYMS
7767 -# include <linux/kallsyms.h>
7768 void printk_address(unsigned long address)
7769 {
7770 unsigned long offset = 0, symsize;
7771 @@ -142,7 +143,7 @@
7772 #endif
7773
7774 static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
7775 - unsigned *usedp, const char **idp)
7776 + unsigned *usedp, char **idp)
7777 {
7778 #ifndef CONFIG_X86_NO_TSS
7779 static char ids[][8] = {
7780 @@ -162,26 +163,7 @@
7781 * 'stack' is in one of them:
7782 */
7783 for (k = 0; k < N_EXCEPTION_STACKS; k++) {
7784 - unsigned long end;
7785 -
7786 - /*
7787 - * set 'end' to the end of the exception stack.
7788 - */
7789 - switch (k + 1) {
7790 - /*
7791 - * TODO: this block is not needed i think, because
7792 - * setup64.c:cpu_init() sets up t->ist[DEBUG_STACK]
7793 - * properly too.
7794 - */
7795 -#if DEBUG_STKSZ > EXCEPTION_STKSZ
7796 - case DEBUG_STACK:
7797 - end = cpu_pda(cpu)->debugstack + DEBUG_STKSZ;
7798 - break;
7799 -#endif
7800 - default:
7801 - end = per_cpu(orig_ist, cpu).ist[k];
7802 - break;
7803 - }
7804 + unsigned long end = per_cpu(orig_ist, cpu).ist[k];
7805 /*
7806 * Is 'stack' above this exception frame's end?
7807 * If yes then skip to the next frame.
7808 @@ -236,13 +218,19 @@
7809 return NULL;
7810 }
7811
7812 -static int show_trace_unwind(struct unwind_frame_info *info, void *context)
7813 +struct ops_and_data {
7814 + struct stacktrace_ops *ops;
7815 + void *data;
7816 +};
7817 +
7818 +static int dump_trace_unwind(struct unwind_frame_info *info, void *context)
7819 {
7820 + struct ops_and_data *oad = (struct ops_and_data *)context;
7821 int n = 0;
7822
7823 while (unwind(info) == 0 && UNW_PC(info)) {
7824 n++;
7825 - printk_address(UNW_PC(info));
7826 + oad->ops->address(oad->data, UNW_PC(info));
7827 if (arch_unw_user_mode(info))
7828 break;
7829 }
7830 @@ -256,13 +244,19 @@
7831 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
7832 */
7833
7834 -void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack)
7835 +static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
7836 {
7837 - const unsigned cpu = safe_smp_processor_id();
7838 + void *t = (void *)tinfo;
7839 + return p > t && p < t + THREAD_SIZE - 3;
7840 +}
7841 +
7842 +void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack,
7843 + struct stacktrace_ops *ops, void *data)
7844 +{
7845 + const unsigned cpu = smp_processor_id();
7846 unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
7847 unsigned used = 0;
7848 -
7849 - printk("\nCall Trace:\n");
7850 + struct thread_info *tinfo;
7851
7852 if (!tsk)
7853 tsk = current;
7854 @@ -270,32 +264,47 @@
7855 if (call_trace >= 0) {
7856 int unw_ret = 0;
7857 struct unwind_frame_info info;
7858 + struct ops_and_data oad = { .ops = ops, .data = data };
7859
7860 if (regs) {
7861 if (unwind_init_frame_info(&info, tsk, regs) == 0)
7862 - unw_ret = show_trace_unwind(&info, NULL);
7863 + unw_ret = dump_trace_unwind(&info, &oad);
7864 } else if (tsk == current)
7865 - unw_ret = unwind_init_running(&info, show_trace_unwind, NULL);
7866 + unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad);
7867 else {
7868 if (unwind_init_blocked(&info, tsk) == 0)
7869 - unw_ret = show_trace_unwind(&info, NULL);
7870 + unw_ret = dump_trace_unwind(&info, &oad);
7871 }
7872 if (unw_ret > 0) {
7873 if (call_trace == 1 && !arch_unw_user_mode(&info)) {
7874 - print_symbol("DWARF2 unwinder stuck at %s\n",
7875 + ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n",
7876 UNW_PC(&info));
7877 if ((long)UNW_SP(&info) < 0) {
7878 - printk("Leftover inexact backtrace:\n");
7879 + ops->warning(data, "Leftover inexact backtrace:\n");
7880 stack = (unsigned long *)UNW_SP(&info);
7881 + if (!stack)
7882 + return;
7883 } else
7884 - printk("Full inexact backtrace again:\n");
7885 + ops->warning(data, "Full inexact backtrace again:\n");
7886 } else if (call_trace >= 1)
7887 return;
7888 else
7889 - printk("Full inexact backtrace again:\n");
7890 + ops->warning(data, "Full inexact backtrace again:\n");
7891 } else
7892 - printk("Inexact backtrace:\n");
7893 + ops->warning(data, "Inexact backtrace:\n");
7894 + }
7895 + if (!stack) {
7896 + unsigned long dummy;
7897 + stack = &dummy;
7898 + if (tsk && tsk != current)
7899 + stack = (unsigned long *)tsk->thread.rsp;
7900 }
7901 + /*
7902 + * Align the stack pointer on word boundary, later loops
7903 + * rely on that (and corruption / debug info bugs can cause
7904 + * unaligned values here):
7905 + */
7906 + stack = (unsigned long *)((unsigned long)stack & ~(sizeof(long)-1));
7907
7908 /*
7909 * Print function call entries within a stack. 'cond' is the
7910 @@ -305,7 +314,9 @@
7911 #define HANDLE_STACK(cond) \
7912 do while (cond) { \
7913 unsigned long addr = *stack++; \
7914 - if (kernel_text_address(addr)) { \
7915 + if (oops_in_progress ? \
7916 + __kernel_text_address(addr) : \
7917 + kernel_text_address(addr)) { \
7918 /* \
7919 * If the address is either in the text segment of the \
7920 * kernel, or in the region which contains vmalloc'ed \
7921 @@ -314,7 +325,7 @@
7922 * down the cause of the crash will be able to figure \
7923 * out the call path that was taken. \
7924 */ \
7925 - printk_address(addr); \
7926 + ops->address(data, addr); \
7927 } \
7928 } while (0)
7929
7930 @@ -323,16 +334,17 @@
7931 * current stack address. If the stacks consist of nested
7932 * exceptions
7933 */
7934 - for ( ; ; ) {
7935 - const char *id;
7936 + for (;;) {
7937 + char *id;
7938 unsigned long *estack_end;
7939 estack_end = in_exception_stack(cpu, (unsigned long)stack,
7940 &used, &id);
7941
7942 if (estack_end) {
7943 - printk(" <%s>", id);
7944 + if (ops->stack(data, id) < 0)
7945 + break;
7946 HANDLE_STACK (stack < estack_end);
7947 - printk(" <EOE>");
7948 + ops->stack(data, "<EOE>");
7949 /*
7950 * We link to the next stack via the
7951 * second-to-last pointer (index -2 to end) in the
7952 @@ -347,7 +359,8 @@
7953 (IRQSTACKSIZE - 64) / sizeof(*irqstack);
7954
7955 if (stack >= irqstack && stack < irqstack_end) {
7956 - printk(" <IRQ>");
7957 + if (ops->stack(data, "IRQ") < 0)
7958 + break;
7959 HANDLE_STACK (stack < irqstack_end);
7960 /*
7961 * We link to the next stack (which would be
7962 @@ -356,7 +369,7 @@
7963 */
7964 stack = (unsigned long *) (irqstack_end[-1]);
7965 irqstack_end = NULL;
7966 - printk(" <EOI>");
7967 + ops->stack(data, "EOI");
7968 continue;
7969 }
7970 }
7971 @@ -364,19 +377,58 @@
7972 }
7973
7974 /*
7975 - * This prints the process stack:
7976 + * This handles the process stack:
7977 */
7978 - HANDLE_STACK (((long) stack & (THREAD_SIZE-1)) != 0);
7979 + tinfo = current_thread_info();
7980 + HANDLE_STACK (valid_stack_ptr(tinfo, stack));
7981 #undef HANDLE_STACK
7982 +}
7983 +EXPORT_SYMBOL(dump_trace);
7984 +
7985 +static void
7986 +print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
7987 +{
7988 + print_symbol(msg, symbol);
7989 + printk("\n");
7990 +}
7991 +
7992 +static void print_trace_warning(void *data, char *msg)
7993 +{
7994 + printk("%s\n", msg);
7995 +}
7996 +
7997 +static int print_trace_stack(void *data, char *name)
7998 +{
7999 + printk(" <%s> ", name);
8000 + return 0;
8001 +}
8002 +
8003 +static void print_trace_address(void *data, unsigned long addr)
8004 +{
8005 + printk_address(addr);
8006 +}
8007 +
8008 +static struct stacktrace_ops print_trace_ops = {
8009 + .warning = print_trace_warning,
8010 + .warning_symbol = print_trace_warning_symbol,
8011 + .stack = print_trace_stack,
8012 + .address = print_trace_address,
8013 +};
8014
8015 +void
8016 +show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack)
8017 +{
8018 + printk("\nCall Trace:\n");
8019 + dump_trace(tsk, regs, stack, &print_trace_ops, NULL);
8020 printk("\n");
8021 }
8022
8023 -static void _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long * rsp)
8024 +static void
8025 +_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp)
8026 {
8027 unsigned long *stack;
8028 int i;
8029 - const int cpu = safe_smp_processor_id();
8030 + const int cpu = smp_processor_id();
8031 unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr);
8032 unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
8033
8034 @@ -430,7 +482,7 @@
8035 int i;
8036 int in_kernel = !user_mode(regs);
8037 unsigned long rsp;
8038 - const int cpu = safe_smp_processor_id();
8039 + const int cpu = smp_processor_id();
8040 struct task_struct *cur = cpu_pda(cpu)->pcurrent;
8041
8042 rsp = regs->rsp;
8043 @@ -505,9 +557,11 @@
8044
8045 unsigned __kprobes long oops_begin(void)
8046 {
8047 - int cpu = safe_smp_processor_id();
8048 + int cpu = smp_processor_id();
8049 unsigned long flags;
8050
8051 + oops_enter();
8052 +
8053 /* racy, but better than risking deadlock. */
8054 local_irq_save(flags);
8055 if (!spin_trylock(&die_lock)) {
8056 @@ -536,6 +590,7 @@
8057 spin_unlock_irqrestore(&die_lock, flags);
8058 if (panic_on_oops)
8059 panic("Fatal exception");
8060 + oops_exit();
8061 }
8062
8063 void __kprobes __die(const char * str, struct pt_regs * regs, long err)
8064 @@ -573,7 +628,7 @@
8065 }
8066
8067 #ifdef CONFIG_X86_LOCAL_APIC
8068 -void __kprobes die_nmi(char *str, struct pt_regs *regs)
8069 +void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
8070 {
8071 unsigned long flags = oops_begin();
8072
8073 @@ -581,13 +636,12 @@
8074 * We are in trouble anyway, lets at least try
8075 * to get a message out.
8076 */
8077 - printk(str, safe_smp_processor_id());
8078 + printk(str, smp_processor_id());
8079 show_registers(regs);
8080 if (kexec_should_crash(current))
8081 crash_kexec(regs);
8082 - if (panic_on_timeout || panic_on_oops)
8083 - panic("nmi watchdog");
8084 - printk("console shuts up ...\n");
8085 + if (do_panic || panic_on_oops)
8086 + panic("Non maskable interrupt");
8087 oops_end(flags);
8088 nmi_exit();
8089 local_irq_enable();
8090 @@ -734,8 +788,15 @@
8091 static __kprobes void
8092 mem_parity_error(unsigned char reason, struct pt_regs * regs)
8093 {
8094 - printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n");
8095 - printk("You probably have a hardware problem with your RAM chips\n");
8096 + printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
8097 + reason);
8098 + printk(KERN_EMERG "You probably have a hardware problem with your "
8099 + "RAM chips\n");
8100 +
8101 + if (panic_on_unrecovered_nmi)
8102 + panic("NMI: Not continuing");
8103 +
8104 + printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
8105
8106 #if 0 /* XEN */
8107 /* Clear and disable the memory parity error line. */
8108 @@ -762,9 +823,15 @@
8109
8110 static __kprobes void
8111 unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
8112 -{ printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
8113 - printk("Dazed and confused, but trying to continue\n");
8114 - printk("Do you have a strange power saving mode enabled?\n");
8115 +{
8116 + printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
8117 + reason);
8118 + printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
8119 +
8120 + if (panic_on_unrecovered_nmi)
8121 + panic("NMI: Not continuing");
8122 +
8123 + printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
8124 }
8125
8126 /* Runs on IST stack. This code must keep interrupts off all the time.
8127 @@ -789,12 +856,12 @@
8128 * Ok, so this is none of the documented NMI sources,
8129 * so it must be the NMI watchdog.
8130 */
8131 - if (nmi_watchdog > 0) {
8132 - nmi_watchdog_tick(regs,reason);
8133 + if (nmi_watchdog_tick(regs,reason))
8134 return;
8135 - }
8136 #endif
8137 - unknown_nmi_error(reason, regs);
8138 + if (!do_nmi_callback(regs,cpu))
8139 + unknown_nmi_error(reason, regs);
8140 +
8141 return;
8142 }
8143 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
8144 @@ -1081,6 +1148,7 @@
8145 init_fpu(me);
8146 restore_fpu_checking(&me->thread.i387.fxsave);
8147 task_thread_info(me)->status |= TS_USEDFPU;
8148 + me->fpu_counter++;
8149 }
8150
8151
8152 @@ -1141,24 +1209,30 @@
8153 }
8154
8155
8156 -/* Actual parsing is done early in setup.c. */
8157 -static int __init oops_dummy(char *s)
8158 +static int __init oops_setup(char *s)
8159 {
8160 - panic_on_oops = 1;
8161 - return 1;
8162 + if (!s)
8163 + return -EINVAL;
8164 + if (!strcmp(s, "panic"))
8165 + panic_on_oops = 1;
8166 + return 0;
8167 }
8168 -__setup("oops=", oops_dummy);
8169 +early_param("oops", oops_setup);
8170
8171 static int __init kstack_setup(char *s)
8172 {
8173 + if (!s)
8174 + return -EINVAL;
8175 kstack_depth_to_print = simple_strtoul(s,NULL,0);
8176 - return 1;
8177 + return 0;
8178 }
8179 -__setup("kstack=", kstack_setup);
8180 +early_param("kstack", kstack_setup);
8181
8182 #ifdef CONFIG_STACK_UNWIND
8183 static int __init call_trace_setup(char *s)
8184 {
8185 + if (!s)
8186 + return -EINVAL;
8187 if (strcmp(s, "old") == 0)
8188 call_trace = -1;
8189 else if (strcmp(s, "both") == 0)
8190 @@ -1167,7 +1241,7 @@
8191 call_trace = 1;
8192 else if (strcmp(s, "new") == 0)
8193 call_trace = 2;
8194 - return 1;
8195 + return 0;
8196 }
8197 -__setup("call_trace=", call_trace_setup);
8198 +early_param("call_trace", call_trace_setup);
8199 #endif
8200 --- a/arch/x86/kernel/vsyscall_64-xen.c
8201 +++ b/arch/x86/kernel/vsyscall_64-xen.c
8202 @@ -26,6 +26,10 @@
8203 #include <linux/seqlock.h>
8204 #include <linux/jiffies.h>
8205 #include <linux/sysctl.h>
8206 +#include <linux/getcpu.h>
8207 +#include <linux/cpu.h>
8208 +#include <linux/smp.h>
8209 +#include <linux/notifier.h>
8210
8211 #include <asm/vsyscall.h>
8212 #include <asm/pgtable.h>
8213 @@ -33,11 +37,15 @@
8214 #include <asm/fixmap.h>
8215 #include <asm/errno.h>
8216 #include <asm/io.h>
8217 +#include <asm/segment.h>
8218 +#include <asm/desc.h>
8219 +#include <asm/topology.h>
8220
8221 #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
8222
8223 int __sysctl_vsyscall __section_sysctl_vsyscall = 1;
8224 seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED;
8225 +int __vgetcpu_mode __section_vgetcpu_mode;
8226
8227 #include <asm/unistd.h>
8228
8229 @@ -61,8 +69,7 @@
8230 sequence = read_seqbegin(&__xtime_lock);
8231
8232 sec = __xtime.tv_sec;
8233 - usec = (__xtime.tv_nsec / 1000) +
8234 - (__jiffies - __wall_jiffies) * (1000000 / HZ);
8235 + usec = __xtime.tv_nsec / 1000;
8236
8237 if (__vxtime.mode != VXTIME_HPET) {
8238 t = get_cycles_sync();
8239 @@ -72,7 +79,8 @@
8240 __vxtime.tsc_quot) >> 32;
8241 /* See comment in x86_64 do_gettimeofday. */
8242 } else {
8243 - usec += ((readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0) -
8244 + usec += ((readl((void __iomem *)
8245 + fix_to_virt(VSYSCALL_HPET) + 0xf0) -
8246 __vxtime.last) * __vxtime.quot) >> 32;
8247 }
8248 } while (read_seqretry(&__xtime_lock, sequence));
8249 @@ -127,9 +135,46 @@
8250 return __xtime.tv_sec;
8251 }
8252
8253 -long __vsyscall(2) venosys_0(void)
8254 -{
8255 - return -ENOSYS;
8256 +/* Fast way to get current CPU and node.
8257 + This helps to do per node and per CPU caches in user space.
8258 + The result is not guaranteed without CPU affinity, but usually
8259 + works out because the scheduler tries to keep a thread on the same
8260 + CPU.
8261 +
8262 + tcache must point to a two element sized long array.
8263 + All arguments can be NULL. */
8264 +long __vsyscall(2)
8265 +vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
8266 +{
8267 + unsigned int dummy, p;
8268 + unsigned long j = 0;
8269 +
8270 + /* Fast cache - only recompute value once per jiffies and avoid
8271 + relatively costly rdtscp/cpuid otherwise.
8272 + This works because the scheduler usually keeps the process
8273 + on the same CPU and this syscall doesn't guarantee its
8274 + results anyways.
8275 + We do this here because otherwise user space would do it on
8276 + its own in a likely inferior way (no access to jiffies).
8277 + If you don't like it pass NULL. */
8278 + if (tcache && tcache->blob[0] == (j = __jiffies)) {
8279 + p = tcache->blob[1];
8280 + } else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
8281 + /* Load per CPU data from RDTSCP */
8282 + rdtscp(dummy, dummy, p);
8283 + } else {
8284 + /* Load per CPU data from GDT */
8285 + asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
8286 + }
8287 + if (tcache) {
8288 + tcache->blob[0] = j;
8289 + tcache->blob[1] = p;
8290 + }
8291 + if (cpu)
8292 + *cpu = p & 0xfff;
8293 + if (node)
8294 + *node = p >> 12;
8295 + return 0;
8296 }
8297
8298 long __vsyscall(3) venosys_1(void)
8299 @@ -149,7 +194,8 @@
8300 void __user *buffer, size_t *lenp, loff_t *ppos)
8301 {
8302 extern u16 vsysc1, vsysc2;
8303 - u16 *map1, *map2;
8304 + u16 __iomem *map1;
8305 + u16 __iomem *map2;
8306 int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
8307 if (!write)
8308 return ret;
8309 @@ -164,11 +210,11 @@
8310 goto out;
8311 }
8312 if (!sysctl_vsyscall) {
8313 - *map1 = SYSCALL;
8314 - *map2 = SYSCALL;
8315 + writew(SYSCALL, map1);
8316 + writew(SYSCALL, map2);
8317 } else {
8318 - *map1 = NOP2;
8319 - *map2 = NOP2;
8320 + writew(NOP2, map1);
8321 + writew(NOP2, map2);
8322 }
8323 iounmap(map2);
8324 out:
8325 @@ -200,6 +246,48 @@
8326
8327 #endif
8328
8329 +/* Assume __initcall executes before all user space. Hopefully kmod
8330 + doesn't violate that. We'll find out if it does. */
8331 +static void __cpuinit vsyscall_set_cpu(int cpu)
8332 +{
8333 + unsigned long d;
8334 + unsigned long node = 0;
8335 +#ifdef CONFIG_NUMA
8336 + node = cpu_to_node[cpu];
8337 +#endif
8338 + if (cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP))
8339 + write_rdtscp_aux((node << 12) | cpu);
8340 +
8341 + /* Store cpu number in limit so that it can be loaded quickly
8342 + in user space in vgetcpu.
8343 + 12 bits for the CPU and 8 bits for the node. */
8344 + d = 0x0f40000000000ULL;
8345 + d |= cpu;
8346 + d |= (node & 0xf) << 12;
8347 + d |= (node >> 4) << 48;
8348 + if (HYPERVISOR_update_descriptor(virt_to_machine(cpu_gdt(cpu)
8349 + + GDT_ENTRY_PER_CPU),
8350 + d))
8351 + BUG();
8352 +}
8353 +
8354 +static void __cpuinit cpu_vsyscall_init(void *arg)
8355 +{
8356 + /* preemption should be already off */
8357 + vsyscall_set_cpu(raw_smp_processor_id());
8358 +}
8359 +
8360 +#ifdef CONFIG_HOTPLUG_CPU
8361 +static int __cpuinit
8362 +cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
8363 +{
8364 + long cpu = (long)arg;
8365 + if (action == CPU_ONLINE)
8366 + smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1);
8367 + return NOTIFY_DONE;
8368 +}
8369 +#endif
8370 +
8371 static void __init map_vsyscall(void)
8372 {
8373 extern char __vsyscall_0;
8374 @@ -214,13 +302,20 @@
8375 VSYSCALL_ADDR(__NR_vgettimeofday)));
8376 BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
8377 BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
8378 + BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
8379 map_vsyscall();
8380 #ifdef CONFIG_XEN
8381 sysctl_vsyscall = 0; /* disable vgettimeofay() */
8382 + if (boot_cpu_has(X86_FEATURE_RDTSCP))
8383 + vgetcpu_mode = VGETCPU_RDTSCP;
8384 + else
8385 + vgetcpu_mode = VGETCPU_LSL;
8386 #endif
8387 #ifdef CONFIG_SYSCTL
8388 register_sysctl_table(kernel_root_table2, 0);
8389 #endif
8390 + on_each_cpu(cpu_vsyscall_init, NULL, 0, 1);
8391 + hotcpu_notifier(cpu_vsyscall_notifier, 0);
8392 return 0;
8393 }
8394
8395 --- a/arch/x86/mach-xen/setup.c
8396 +++ b/arch/x86/mach-xen/setup.c
8397 @@ -103,8 +103,10 @@
8398
8399 setup_xen_features();
8400
8401 - if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
8402 - set_fixaddr_top(pp.virt_start);
8403 + if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) {
8404 + hypervisor_virt_start = pp.virt_start;
8405 + reserve_top_address(0UL - pp.virt_start);
8406 + }
8407
8408 if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
8409 machine_to_phys_mapping = (unsigned long *)mapping.v_start;
8410 --- a/arch/x86/mm/fault_32-xen.c
8411 +++ b/arch/x86/mm/fault_32-xen.c
8412 @@ -27,21 +27,24 @@
8413 #include <asm/uaccess.h>
8414 #include <asm/desc.h>
8415 #include <asm/kdebug.h>
8416 +#include <asm/segment.h>
8417
8418 extern void die(const char *,struct pt_regs *,long);
8419
8420 -#ifdef CONFIG_KPROBES
8421 -ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
8422 +static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
8423 +
8424 int register_page_fault_notifier(struct notifier_block *nb)
8425 {
8426 vmalloc_sync_all();
8427 return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
8428 }
8429 +EXPORT_SYMBOL_GPL(register_page_fault_notifier);
8430
8431 int unregister_page_fault_notifier(struct notifier_block *nb)
8432 {
8433 return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
8434 }
8435 +EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);
8436
8437 static inline int notify_page_fault(enum die_val val, const char *str,
8438 struct pt_regs *regs, long err, int trap, int sig)
8439 @@ -55,14 +58,6 @@
8440 };
8441 return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args);
8442 }
8443 -#else
8444 -static inline int notify_page_fault(enum die_val val, const char *str,
8445 - struct pt_regs *regs, long err, int trap, int sig)
8446 -{
8447 - return NOTIFY_DONE;
8448 -}
8449 -#endif
8450 -
8451
8452 /*
8453 * Unlock any spinlocks which will prevent us from getting the
8454 @@ -119,10 +114,10 @@
8455 }
8456
8457 /* The standard kernel/user address space limit. */
8458 - *eip_limit = (seg & 2) ? USER_DS.seg : KERNEL_DS.seg;
8459 + *eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg;
8460
8461 /* By far the most common cases. */
8462 - if (likely(seg == __USER_CS || seg == GET_KERNEL_CS()))
8463 + if (likely(SEGMENT_IS_FLAT_CODE(seg)))
8464 return eip;
8465
8466 /* Check the segment exists, is within the current LDT/GDT size,
8467 @@ -559,11 +554,7 @@
8468 write = 0;
8469 switch (error_code & 3) {
8470 default: /* 3: write, present */
8471 -#ifdef TEST_VERIFY_AREA
8472 - if (regs->cs == GET_KERNEL_CS())
8473 - printk("WP fault at %08lx\n", regs->eip);
8474 -#endif
8475 - /* fall through */
8476 + /* fall through */
8477 case 2: /* write, not present */
8478 if (!(vma->vm_flags & VM_WRITE))
8479 goto bad_area;
8480 @@ -572,7 +563,7 @@
8481 case 1: /* read, present */
8482 goto bad_area;
8483 case 0: /* read, not present */
8484 - if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
8485 + if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
8486 goto bad_area;
8487 }
8488
8489 @@ -704,7 +695,7 @@
8490 */
8491 out_of_memory:
8492 up_read(&mm->mmap_sem);
8493 - if (tsk->pid == 1) {
8494 + if (is_init(tsk)) {
8495 yield();
8496 down_read(&mm->mmap_sem);
8497 goto survive;
8498 --- a/arch/x86/mm/fault_64-xen.c
8499 +++ b/arch/x86/mm/fault_64-xen.c
8500 @@ -40,8 +40,7 @@
8501 #define PF_RSVD (1<<3)
8502 #define PF_INSTR (1<<4)
8503
8504 -#ifdef CONFIG_KPROBES
8505 -ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
8506 +static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
8507
8508 /* Hook to register for page fault notifications */
8509 int register_page_fault_notifier(struct notifier_block *nb)
8510 @@ -49,11 +48,13 @@
8511 vmalloc_sync_all();
8512 return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
8513 }
8514 +EXPORT_SYMBOL_GPL(register_page_fault_notifier);
8515
8516 int unregister_page_fault_notifier(struct notifier_block *nb)
8517 {
8518 return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
8519 }
8520 +EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);
8521
8522 static inline int notify_page_fault(enum die_val val, const char *str,
8523 struct pt_regs *regs, long err, int trap, int sig)
8524 @@ -67,13 +68,6 @@
8525 };
8526 return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args);
8527 }
8528 -#else
8529 -static inline int notify_page_fault(enum die_val val, const char *str,
8530 - struct pt_regs *regs, long err, int trap, int sig)
8531 -{
8532 - return NOTIFY_DONE;
8533 -}
8534 -#endif
8535
8536 void bust_spinlocks(int yes)
8537 {
8538 @@ -102,7 +96,7 @@
8539 static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
8540 unsigned long error_code)
8541 {
8542 - unsigned char *instr;
8543 + unsigned char __user *instr;
8544 int scan_more = 1;
8545 int prefetch = 0;
8546 unsigned char *max_instr;
8547 @@ -111,7 +105,7 @@
8548 if (error_code & PF_INSTR)
8549 return 0;
8550
8551 - instr = (unsigned char *)convert_rip_to_linear(current, regs);
8552 + instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
8553 max_instr = instr + 15;
8554
8555 if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
8556 @@ -122,7 +116,7 @@
8557 unsigned char instr_hi;
8558 unsigned char instr_lo;
8559
8560 - if (__get_user(opcode, instr))
8561 + if (__get_user(opcode, (char __user *)instr))
8562 break;
8563
8564 instr_hi = opcode & 0xf0;
8565 @@ -160,7 +154,7 @@
8566 case 0x00:
8567 /* Prefetch instruction is 0x0F0D or 0x0F18 */
8568 scan_more = 0;
8569 - if (__get_user(opcode, instr))
8570 + if (__get_user(opcode, (char __user *)instr))
8571 break;
8572 prefetch = (instr_lo == 0xF) &&
8573 (opcode == 0x0D || opcode == 0x18);
8574 @@ -176,7 +170,7 @@
8575 static int bad_address(void *p)
8576 {
8577 unsigned long dummy;
8578 - return __get_user(dummy, (unsigned long *)p);
8579 + return __get_user(dummy, (unsigned long __user *)p);
8580 }
8581
8582 void dump_pagetable(unsigned long address)
8583 @@ -248,7 +242,7 @@
8584
8585 int unhandled_signal(struct task_struct *tsk, int sig)
8586 {
8587 - if (tsk->pid == 1)
8588 + if (is_init(tsk))
8589 return 1;
8590 if (tsk->ptrace & PT_PTRACED)
8591 return 0;
8592 @@ -300,7 +294,7 @@
8593 if (pgd_none(*pgd))
8594 set_pgd(pgd, *pgd_ref);
8595 else
8596 - BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref));
8597 + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
8598
8599 /* Below here mismatches are bugs because these lower tables
8600 are shared */
8601 @@ -309,7 +303,7 @@
8602 pud_ref = pud_offset(pgd_ref, address);
8603 if (pud_none(*pud_ref))
8604 return -1;
8605 - if (pud_none(*pud) || pud_page(*pud) != pud_page(*pud_ref))
8606 + if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
8607 BUG();
8608 pmd = pmd_offset(pud, address);
8609 pmd_ref = pmd_offset(pud_ref, address);
8610 @@ -531,7 +525,7 @@
8611 case PF_PROT: /* read, present */
8612 goto bad_area;
8613 case 0: /* read, not present */
8614 - if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
8615 + if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
8616 goto bad_area;
8617 }
8618
8619 @@ -647,7 +641,7 @@
8620 */
8621 out_of_memory:
8622 up_read(&mm->mmap_sem);
8623 - if (current->pid == 1) {
8624 + if (is_init(current)) {
8625 yield();
8626 goto again;
8627 }
8628 @@ -702,7 +696,7 @@
8629 if (pgd_none(*pgd))
8630 set_pgd(pgd, *pgd_ref);
8631 else
8632 - BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref));
8633 + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
8634 }
8635 spin_unlock(&pgd_lock);
8636 set_bit(pgd_index(address), insync);
8637 --- a/arch/x86/mm/highmem_32-xen.c
8638 +++ b/arch/x86/mm/highmem_32-xen.c
8639 @@ -38,11 +38,9 @@
8640
8641 idx = type + KM_TYPE_NR*smp_processor_id();
8642 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
8643 -#ifdef CONFIG_DEBUG_HIGHMEM
8644 if (!pte_none(*(kmap_pte-idx)))
8645 BUG();
8646 -#endif
8647 - set_pte_at_sync(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot));
8648 + set_pte_at(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot));
8649
8650 return (void*) vaddr;
8651 }
8652 @@ -62,36 +60,26 @@
8653
8654 void kunmap_atomic(void *kvaddr, enum km_type type)
8655 {
8656 -#if defined(CONFIG_DEBUG_HIGHMEM) || defined(CONFIG_XEN)
8657 unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
8658 enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
8659
8660 - if (vaddr < FIXADDR_START) { // FIXME
8661 +#ifdef CONFIG_DEBUG_HIGHMEM
8662 + if (vaddr >= PAGE_OFFSET && vaddr < (unsigned long)high_memory) {
8663 dec_preempt_count();
8664 preempt_check_resched();
8665 return;
8666 }
8667 -#endif
8668
8669 -#if defined(CONFIG_DEBUG_HIGHMEM)
8670 if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx))
8671 BUG();
8672 -
8673 - /*
8674 - * force other mappings to Oops if they'll try to access
8675 - * this pte without first remap it
8676 - */
8677 - pte_clear(&init_mm, vaddr, kmap_pte-idx);
8678 - __flush_tlb_one(vaddr);
8679 -#elif defined(CONFIG_XEN)
8680 +#endif
8681 /*
8682 - * We must ensure there are no dangling pagetable references when
8683 - * returning memory to Xen (decrease_reservation).
8684 - * XXX TODO: We could make this faster by only zapping when
8685 - * kmap_flush_unused is called but that is trickier and more invasive.
8686 + * Force other mappings to Oops if they'll try to access this pte
8687 + * without first remap it. Keeping stale mappings around is a bad idea
8688 + * also, in case the page changes cacheability attributes or becomes
8689 + * a protected page in a hypervisor.
8690 */
8691 - pte_clear(&init_mm, vaddr, kmap_pte-idx);
8692 -#endif
8693 + kpte_clear_flush(kmap_pte-idx, vaddr);
8694
8695 dec_preempt_count();
8696 preempt_check_resched();
8697 @@ -110,7 +98,6 @@
8698 idx = type + KM_TYPE_NR*smp_processor_id();
8699 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
8700 set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot));
8701 - __flush_tlb_one(vaddr);
8702
8703 return (void*) vaddr;
8704 }
8705 --- a/arch/x86/mm/hypervisor.c
8706 +++ b/arch/x86/mm/hypervisor.c
8707 @@ -569,7 +569,8 @@
8708 #define MAX_BATCHED_FULL_PTES 32
8709
8710 int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
8711 - unsigned long addr, unsigned long end, pgprot_t newprot)
8712 + unsigned long addr, unsigned long end, pgprot_t newprot,
8713 + int dirty_accountable)
8714 {
8715 int rc = 0, i = 0;
8716 mmu_update_t u[MAX_BATCHED_FULL_PTES];
8717 @@ -582,10 +583,14 @@
8718 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
8719 do {
8720 if (pte_present(*pte)) {
8721 + pte_t ptent = pte_modify(*pte, newprot);
8722 +
8723 + if (dirty_accountable && pte_dirty(ptent))
8724 + ptent = pte_mkwrite(ptent);
8725 u[i].ptr = (__pmd_val(*pmd) & PHYSICAL_PAGE_MASK)
8726 | ((unsigned long)pte & ~PAGE_MASK)
8727 | MMU_PT_UPDATE_PRESERVE_AD;
8728 - u[i].val = __pte_val(pte_modify(*pte, newprot));
8729 + u[i].val = __pte_val(ptent);
8730 if (++i == MAX_BATCHED_FULL_PTES) {
8731 if ((rc = HYPERVISOR_mmu_update(
8732 &u[0], i, NULL, DOMID_SELF)) != 0)
8733 --- a/arch/x86/mm/init_32-xen.c
8734 +++ b/arch/x86/mm/init_32-xen.c
8735 @@ -464,16 +464,22 @@
8736 * on Enable
8737 * off Disable
8738 */
8739 -void __init noexec_setup(const char *str)
8740 +static int __init noexec_setup(char *str)
8741 {
8742 - if (!strncmp(str, "on",2) && cpu_has_nx) {
8743 - __supported_pte_mask |= _PAGE_NX;
8744 - disable_nx = 0;
8745 - } else if (!strncmp(str,"off",3)) {
8746 + if (!str || !strcmp(str, "on")) {
8747 + if (cpu_has_nx) {
8748 + __supported_pte_mask |= _PAGE_NX;
8749 + disable_nx = 0;
8750 + }
8751 + } else if (!strcmp(str,"off")) {
8752 disable_nx = 1;
8753 __supported_pte_mask &= ~_PAGE_NX;
8754 - }
8755 + } else
8756 + return -EINVAL;
8757 +
8758 + return 0;
8759 }
8760 +early_param("noexec", noexec_setup);
8761
8762 int nx_enabled = 0;
8763 #ifdef CONFIG_X86_PAE
8764 @@ -516,6 +522,7 @@
8765 pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
8766 else
8767 pte->pte_high |= 1 << (_PAGE_BIT_NX - 32);
8768 + pte_update_defer(&init_mm, vaddr, pte);
8769 __flush_tlb_all();
8770 out:
8771 return ret;
8772 @@ -598,18 +605,6 @@
8773 }
8774 }
8775
8776 -static void __init set_max_mapnr_init(void)
8777 -{
8778 -#ifdef CONFIG_HIGHMEM
8779 - num_physpages = highend_pfn;
8780 -#else
8781 - num_physpages = max_low_pfn;
8782 -#endif
8783 -#ifdef CONFIG_FLATMEM
8784 - max_mapnr = num_physpages;
8785 -#endif
8786 -}
8787 -
8788 static struct kcore_list kcore_mem, kcore_vmalloc;
8789
8790 void __init mem_init(void)
8791 @@ -630,8 +625,7 @@
8792 #endif
8793
8794 #ifdef CONFIG_FLATMEM
8795 - if (!mem_map)
8796 - BUG();
8797 + BUG_ON(!mem_map);
8798 #endif
8799
8800 bad_ppro = ppro_with_ram_bug();
8801 @@ -646,17 +640,6 @@
8802 }
8803 #endif
8804
8805 - set_max_mapnr_init();
8806 -
8807 -#ifdef CONFIG_HIGHMEM
8808 - high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
8809 -#else
8810 - high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
8811 -#endif
8812 - printk("vmalloc area: %lx-%lx, maxmem %lx\n",
8813 - VMALLOC_START,VMALLOC_END,MAXMEM);
8814 - BUG_ON(VMALLOC_START > VMALLOC_END);
8815 -
8816 /* this will put all low memory onto the freelists */
8817 totalram_pages += free_all_bootmem();
8818 /* XEN: init and count low-mem pages outside initial allocation. */
8819 @@ -694,6 +677,48 @@
8820 (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
8821 );
8822
8823 +#if 1 /* double-sanity-check paranoia */
8824 + printk("virtual kernel memory layout:\n"
8825 + " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
8826 +#ifdef CONFIG_HIGHMEM
8827 + " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
8828 +#endif
8829 + " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n"
8830 + " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n"
8831 + " .init : 0x%08lx - 0x%08lx (%4ld kB)\n"
8832 + " .data : 0x%08lx - 0x%08lx (%4ld kB)\n"
8833 + " .text : 0x%08lx - 0x%08lx (%4ld kB)\n",
8834 + FIXADDR_START, FIXADDR_TOP,
8835 + (FIXADDR_TOP - FIXADDR_START) >> 10,
8836 +
8837 +#ifdef CONFIG_HIGHMEM
8838 + PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
8839 + (LAST_PKMAP*PAGE_SIZE) >> 10,
8840 +#endif
8841 +
8842 + VMALLOC_START, VMALLOC_END,
8843 + (VMALLOC_END - VMALLOC_START) >> 20,
8844 +
8845 + (unsigned long)__va(0), (unsigned long)high_memory,
8846 + ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
8847 +
8848 + (unsigned long)&__init_begin, (unsigned long)&__init_end,
8849 + ((unsigned long)&__init_end - (unsigned long)&__init_begin) >> 10,
8850 +
8851 + (unsigned long)&_etext, (unsigned long)&_edata,
8852 + ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
8853 +
8854 + (unsigned long)&_text, (unsigned long)&_etext,
8855 + ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
8856 +
8857 +#ifdef CONFIG_HIGHMEM
8858 + BUG_ON(PKMAP_BASE+LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
8859 + BUG_ON(VMALLOC_END > PKMAP_BASE);
8860 +#endif
8861 + BUG_ON(VMALLOC_START > VMALLOC_END);
8862 + BUG_ON((unsigned long)high_memory > VMALLOC_START);
8863 +#endif /* double-sanity-check paranoia */
8864 +
8865 #ifdef CONFIG_X86_PAE
8866 if (!cpu_has_pae)
8867 panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!");
8868 @@ -724,7 +749,7 @@
8869 int arch_add_memory(int nid, u64 start, u64 size)
8870 {
8871 struct pglist_data *pgdata = &contig_page_data;
8872 - struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1;
8873 + struct zone *zone = pgdata->node_zones + ZONE_HIGHMEM;
8874 unsigned long start_pfn = start >> PAGE_SHIFT;
8875 unsigned long nr_pages = size >> PAGE_SHIFT;
8876
8877 --- a/arch/x86/mm/init_64-xen.c
8878 +++ b/arch/x86/mm/init_64-xen.c
8879 @@ -61,8 +61,6 @@
8880
8881 extern unsigned long *contiguous_bitmap;
8882
8883 -static unsigned long dma_reserve __initdata;
8884 -
8885 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
8886 extern unsigned long start_pfn;
8887
8888 @@ -416,7 +414,6 @@
8889
8890 /* actually usually some more */
8891 if (size >= LARGE_PAGE_SIZE) {
8892 - printk("SMBIOS area too long %lu\n", size);
8893 return NULL;
8894 }
8895 set_pmd(temp_mappings[0].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
8896 @@ -438,13 +435,15 @@
8897 #endif
8898
8899 static void __meminit
8900 -phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end)
8901 +phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
8902 {
8903 - int i, k;
8904 + int i = pmd_index(address);
8905
8906 - for (i = 0; i < PTRS_PER_PMD; pmd++, i++) {
8907 + for (; i < PTRS_PER_PMD; i++) {
8908 unsigned long pte_phys;
8909 + pmd_t *pmd = pmd_page + i;
8910 pte_t *pte, *pte_save;
8911 + int k;
8912
8913 if (address >= end) {
8914 if (!after_bootmem)
8915 @@ -452,6 +451,12 @@
8916 set_pmd(pmd, __pmd(0));
8917 break;
8918 }
8919 +
8920 + if (__pmd_val(*pmd)) {
8921 + address += PMD_SIZE;
8922 + continue;
8923 + }
8924 +
8925 pte = alloc_static_page(&pte_phys);
8926 pte_save = pte;
8927 for (k = 0; k < PTRS_PER_PTE; pte++, k++, address += PTE_SIZE) {
8928 @@ -474,40 +479,35 @@
8929 static void __meminit
8930 phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
8931 {
8932 - pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address));
8933 -
8934 - if (pmd_none(*pmd)) {
8935 - spin_lock(&init_mm.page_table_lock);
8936 - phys_pmd_init(pmd, address, end);
8937 - spin_unlock(&init_mm.page_table_lock);
8938 - __flush_tlb_all();
8939 - }
8940 + pmd_t *pmd = pmd_offset(pud,0);
8941 + spin_lock(&init_mm.page_table_lock);
8942 + phys_pmd_init(pmd, address, end);
8943 + spin_unlock(&init_mm.page_table_lock);
8944 + __flush_tlb_all();
8945 }
8946
8947 -static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
8948 +static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
8949 {
8950 - long i = pud_index(address);
8951 -
8952 - pud = pud + i;
8953 -
8954 - if (after_bootmem && pud_val(*pud)) {
8955 - phys_pmd_update(pud, address, end);
8956 - return;
8957 - }
8958 + int i = pud_index(addr);
8959
8960 - for (; i < PTRS_PER_PUD; pud++, i++) {
8961 - unsigned long paddr, pmd_phys;
8962 + for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) {
8963 + unsigned long pmd_phys;
8964 + pud_t *pud = pud_page + pud_index(addr);
8965 pmd_t *pmd;
8966
8967 - paddr = (address & PGDIR_MASK) + i*PUD_SIZE;
8968 - if (paddr >= end)
8969 + if (addr >= end)
8970 break;
8971
8972 + if (__pud_val(*pud)) {
8973 + phys_pmd_update(pud, addr, end);
8974 + continue;
8975 + }
8976 +
8977 pmd = alloc_static_page(&pmd_phys);
8978 early_make_page_readonly(pmd, XENFEAT_writable_page_tables);
8979 spin_lock(&init_mm.page_table_lock);
8980 set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
8981 - phys_pmd_init(pmd, paddr, end);
8982 + phys_pmd_init(pmd, addr, end);
8983 spin_unlock(&init_mm.page_table_lock);
8984 }
8985 __flush_tlb();
8986 @@ -771,69 +771,18 @@
8987 #endif
8988 }
8989
8990 -/* Compute zone sizes for the DMA and DMA32 zones in a node. */
8991 -__init void
8992 -size_zones(unsigned long *z, unsigned long *h,
8993 - unsigned long start_pfn, unsigned long end_pfn)
8994 -{
8995 - int i;
8996 - unsigned long w;
8997 -
8998 - for (i = 0; i < MAX_NR_ZONES; i++)
8999 - z[i] = 0;
9000 -
9001 - if (start_pfn < MAX_DMA_PFN)
9002 - z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
9003 - if (start_pfn < MAX_DMA32_PFN) {
9004 - unsigned long dma32_pfn = MAX_DMA32_PFN;
9005 - if (dma32_pfn > end_pfn)
9006 - dma32_pfn = end_pfn;
9007 - z[ZONE_DMA32] = dma32_pfn - start_pfn;
9008 - }
9009 - z[ZONE_NORMAL] = end_pfn - start_pfn;
9010 -
9011 - /* Remove lower zones from higher ones. */
9012 - w = 0;
9013 - for (i = 0; i < MAX_NR_ZONES; i++) {
9014 - if (z[i])
9015 - z[i] -= w;
9016 - w += z[i];
9017 - }
9018 -
9019 - /* Compute holes */
9020 - w = start_pfn;
9021 - for (i = 0; i < MAX_NR_ZONES; i++) {
9022 - unsigned long s = w;
9023 - w += z[i];
9024 - h[i] = e820_hole_size(s, w);
9025 - }
9026 -
9027 - /* Add the space pace needed for mem_map to the holes too. */
9028 - for (i = 0; i < MAX_NR_ZONES; i++)
9029 - h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE;
9030 -
9031 - /* The 16MB DMA zone has the kernel and other misc mappings.
9032 - Account them too */
9033 - if (h[ZONE_DMA]) {
9034 - h[ZONE_DMA] += dma_reserve;
9035 - if (h[ZONE_DMA] >= z[ZONE_DMA]) {
9036 - printk(KERN_WARNING
9037 - "Kernel too large and filling up ZONE_DMA?\n");
9038 - h[ZONE_DMA] = z[ZONE_DMA];
9039 - }
9040 - }
9041 -}
9042 -
9043 #ifndef CONFIG_NUMA
9044 void __init paging_init(void)
9045 {
9046 - unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];
9047 + unsigned long max_zone_pfns[MAX_NR_ZONES];
9048 + memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
9049 + max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
9050 + max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
9051 + max_zone_pfns[ZONE_NORMAL] = end_pfn;
9052
9053 memory_present(0, 0, end_pfn);
9054 sparse_init();
9055 - size_zones(zones, holes, 0, end_pfn);
9056 - free_area_init_node(0, NODE_DATA(0), zones,
9057 - __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
9058 + free_area_init_nodes(max_zone_pfns);
9059
9060 init_mm.context.pinned = 1;
9061 }
9062 @@ -887,36 +836,23 @@
9063
9064 #ifdef CONFIG_MEMORY_HOTPLUG
9065 /*
9066 - * XXX: memory_add_physaddr_to_nid() is to find node id from physical address
9067 - * via probe interface of sysfs. If acpi notifies hot-add event, then it
9068 - * can tell node id by searching dsdt. But, probe interface doesn't have
9069 - * node id. So, return 0 as node id at this time.
9070 - */
9071 -#ifdef CONFIG_NUMA
9072 -int memory_add_physaddr_to_nid(u64 start)
9073 -{
9074 - return 0;
9075 -}
9076 -#endif
9077 -
9078 -/*
9079 * Memory is added always to NORMAL zone. This means you will never get
9080 * additional DMA/DMA32 memory.
9081 */
9082 int arch_add_memory(int nid, u64 start, u64 size)
9083 {
9084 struct pglist_data *pgdat = NODE_DATA(nid);
9085 - struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-2;
9086 + struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
9087 unsigned long start_pfn = start >> PAGE_SHIFT;
9088 unsigned long nr_pages = size >> PAGE_SHIFT;
9089 int ret;
9090
9091 + init_memory_mapping(start, (start + size -1));
9092 +
9093 ret = __add_pages(zone, start_pfn, nr_pages);
9094 if (ret)
9095 goto error;
9096
9097 - init_memory_mapping(start, (start + size -1));
9098 -
9099 return ret;
9100 error:
9101 printk("%s: Problem encountered in __add_pages!\n", __func__);
9102 @@ -930,7 +866,17 @@
9103 }
9104 EXPORT_SYMBOL_GPL(remove_memory);
9105
9106 -#else /* CONFIG_MEMORY_HOTPLUG */
9107 +#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
9108 +int memory_add_physaddr_to_nid(u64 start)
9109 +{
9110 + return 0;
9111 +}
9112 +EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
9113 +#endif
9114 +
9115 +#endif /* CONFIG_MEMORY_HOTPLUG */
9116 +
9117 +#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
9118 /*
9119 * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
9120 * just online the pages.
9121 @@ -956,7 +902,7 @@
9122 }
9123 return err;
9124 }
9125 -#endif /* CONFIG_MEMORY_HOTPLUG */
9126 +#endif
9127
9128 static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
9129 kcore_vsyscall;
9130 @@ -973,12 +919,6 @@
9131
9132 pci_iommu_alloc();
9133
9134 - /* How many end-of-memory variables you have, grandma! */
9135 - max_low_pfn = end_pfn;
9136 - max_pfn = end_pfn;
9137 - num_physpages = end_pfn;
9138 - high_memory = (void *) __va(end_pfn * PAGE_SIZE);
9139 -
9140 /* clear the zero-page */
9141 memset(empty_zero_page, 0, PAGE_SIZE);
9142
9143 @@ -996,7 +936,8 @@
9144 init_page_count(pfn_to_page(pfn));
9145 totalram_pages++;
9146 }
9147 - reservedpages = end_pfn - totalram_pages - e820_hole_size(0, end_pfn);
9148 + reservedpages = end_pfn - totalram_pages -
9149 + absent_pages_in_range(0, end_pfn);
9150
9151 after_bootmem = 1;
9152
9153 @@ -1103,15 +1044,34 @@
9154
9155 void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
9156 {
9157 - /* Should check here against the e820 map to avoid double free */
9158 #ifdef CONFIG_NUMA
9159 int nid = phys_to_nid(phys);
9160 +#endif
9161 + unsigned long pfn = phys >> PAGE_SHIFT;
9162 + if (pfn >= end_pfn) {
9163 + /* This can happen with kdump kernels when accessing firmware
9164 + tables. */
9165 + if (pfn < end_pfn_map)
9166 + return;
9167 + printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
9168 + phys, len);
9169 + return;
9170 + }
9171 +
9172 + /* Should check here against the e820 map to avoid double free */
9173 +#ifdef CONFIG_NUMA
9174 reserve_bootmem_node(NODE_DATA(nid), phys, len);
9175 #else
9176 reserve_bootmem(phys, len);
9177 #endif
9178 - if (phys+len <= MAX_DMA_PFN*PAGE_SIZE)
9179 +#ifndef CONFIG_XEN
9180 + if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
9181 + static unsigned long dma_reserve __initdata;
9182 +
9183 dma_reserve += len / PAGE_SIZE;
9184 + set_dma_reserve(dma_reserve);
9185 + }
9186 +#endif
9187 }
9188
9189 int kern_addr_valid(unsigned long addr)
9190 --- a/arch/x86/mm/ioremap_32-xen.c
9191 +++ b/arch/x86/mm/ioremap_32-xen.c
9192 @@ -12,7 +12,7 @@
9193 #include <linux/init.h>
9194 #include <linux/slab.h>
9195 #include <linux/module.h>
9196 -#include <asm/io.h>
9197 +#include <linux/io.h>
9198 #include <asm/fixmap.h>
9199 #include <asm/cacheflush.h>
9200 #include <asm/tlbflush.h>
9201 @@ -118,7 +118,7 @@
9202 if (domid == DOMID_SELF)
9203 return -EINVAL;
9204
9205 - vma->vm_flags |= VM_IO | VM_RESERVED;
9206 + vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
9207
9208 vma->vm_mm->context.has_foreign_mappings = 1;
9209
9210 @@ -203,6 +203,7 @@
9211 void __iomem * addr;
9212 struct vm_struct * area;
9213 unsigned long offset, last_addr;
9214 + pgprot_t prot;
9215 domid_t domid = DOMID_IO;
9216
9217 /* Don't allow wraparound or zero size */
9218 @@ -234,6 +235,8 @@
9219 domid = DOMID_SELF;
9220 }
9221
9222 + prot = __pgprot(_KERNPG_TABLE | flags);
9223 +
9224 /*
9225 * Mappings have to be page-aligned
9226 */
9227 @@ -249,10 +252,9 @@
9228 return NULL;
9229 area->phys_addr = phys_addr;
9230 addr = (void __iomem *) area->addr;
9231 - flags |= _KERNPG_TABLE;
9232 if (__direct_remap_pfn_range(&init_mm, (unsigned long)addr,
9233 phys_addr>>PAGE_SHIFT,
9234 - size, __pgprot(flags), domid)) {
9235 + size, prot, domid)) {
9236 vunmap((void __force *) addr);
9237 return NULL;
9238 }
9239 --- a/arch/x86/mm/pageattr_64-xen.c
9240 +++ b/arch/x86/mm/pageattr_64-xen.c
9241 @@ -371,8 +371,8 @@
9242 BUG_ON(pud_none(*pud));
9243 pmd = pmd_offset(pud, address);
9244 BUG_ON(__pmd_val(*pmd) & _PAGE_PSE);
9245 - pgprot_val(ref_prot) |= _PAGE_PSE;
9246 large_pte = mk_pte_phys(__pa(address) & LARGE_PAGE_MASK, ref_prot);
9247 + large_pte = pte_mkhuge(large_pte);
9248 set_pte((pte_t *)pmd, large_pte);
9249 }
9250
9251 @@ -382,32 +382,28 @@
9252 {
9253 pte_t *kpte;
9254 struct page *kpte_page;
9255 - unsigned kpte_flags;
9256 pgprot_t ref_prot2;
9257 kpte = lookup_address(address);
9258 if (!kpte) return 0;
9259 kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
9260 - kpte_flags = pte_val(*kpte);
9261 if (pgprot_val(prot) != pgprot_val(ref_prot)) {
9262 - if ((kpte_flags & _PAGE_PSE) == 0) {
9263 + if (!pte_huge(*kpte)) {
9264 set_pte(kpte, pfn_pte(pfn, prot));
9265 } else {
9266 /*
9267 * split_large_page will take the reference for this
9268 * change_page_attr on the split page.
9269 */
9270 -
9271 struct page *split;
9272 - ref_prot2 = __pgprot(pgprot_val(pte_pgprot(*lookup_address(address))) & ~(1<<_PAGE_BIT_PSE));
9273 -
9274 + ref_prot2 = pte_pgprot(pte_clrhuge(*kpte));
9275 split = split_large_page(address, prot, ref_prot2);
9276 if (!split)
9277 return -ENOMEM;
9278 - set_pte(kpte,mk_pte(split, ref_prot2));
9279 + set_pte(kpte, mk_pte(split, ref_prot2));
9280 kpte_page = split;
9281 - }
9282 + }
9283 page_private(kpte_page)++;
9284 - } else if ((kpte_flags & _PAGE_PSE) == 0) {
9285 + } else if (!pte_huge(*kpte)) {
9286 set_pte(kpte, pfn_pte(pfn, ref_prot));
9287 BUG_ON(page_private(kpte_page) == 0);
9288 page_private(kpte_page)--;
9289 @@ -464,10 +460,12 @@
9290 * lowmem */
9291 if (__pa(address) < KERNEL_TEXT_SIZE) {
9292 unsigned long addr2;
9293 - pgprot_t prot2 = prot;
9294 + pgprot_t prot2;
9295 addr2 = __START_KERNEL_map + __pa(address);
9296 - pgprot_val(prot2) &= ~_PAGE_NX;
9297 - err = __change_page_attr(addr2, pfn, prot2, PAGE_KERNEL_EXEC);
9298 + /* Make sure the kernel mappings stay executable */
9299 + prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot)));
9300 + err = __change_page_attr(addr2, pfn, prot2,
9301 + PAGE_KERNEL_EXEC);
9302 }
9303 }
9304 up_write(&init_mm.mmap_sem);
9305 --- a/arch/x86/mm/pgtable_32-xen.c
9306 +++ b/arch/x86/mm/pgtable_32-xen.c
9307 @@ -68,7 +68,9 @@
9308 printk(KERN_INFO "%lu pages writeback\n",
9309 global_page_state(NR_WRITEBACK));
9310 printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
9311 - printk(KERN_INFO "%lu pages slab\n", global_page_state(NR_SLAB));
9312 + printk(KERN_INFO "%lu pages slab\n",
9313 + global_page_state(NR_SLAB_RECLAIMABLE) +
9314 + global_page_state(NR_SLAB_UNRECLAIMABLE));
9315 printk(KERN_INFO "%lu pages pagetables\n",
9316 global_page_state(NR_PAGETABLE));
9317 }
9318 @@ -108,18 +110,11 @@
9319 __flush_tlb_one(vaddr);
9320 }
9321
9322 -static int nr_fixmaps = 0;
9323 +static int fixmaps;
9324 unsigned long hypervisor_virt_start = HYPERVISOR_VIRT_START;
9325 -unsigned long __FIXADDR_TOP = (HYPERVISOR_VIRT_START - 2 * PAGE_SIZE);
9326 +unsigned long __FIXADDR_TOP = (HYPERVISOR_VIRT_START - PAGE_SIZE);
9327 EXPORT_SYMBOL(__FIXADDR_TOP);
9328
9329 -void __init set_fixaddr_top(unsigned long top)
9330 -{
9331 - BUG_ON(nr_fixmaps > 0);
9332 - hypervisor_virt_start = top;
9333 - __FIXADDR_TOP = hypervisor_virt_start - 2 * PAGE_SIZE;
9334 -}
9335 -
9336 void __set_fixmap (enum fixed_addresses idx, maddr_t phys, pgprot_t flags)
9337 {
9338 unsigned long address = __fix_to_virt(idx);
9339 @@ -141,7 +136,21 @@
9340 if (HYPERVISOR_update_va_mapping(address, pte,
9341 UVMF_INVLPG|UVMF_ALL))
9342 BUG();
9343 - nr_fixmaps++;
9344 + fixmaps++;
9345 +}
9346 +
9347 +/**
9348 + * reserve_top_address - reserves a hole in the top of kernel address space
9349 + * @reserve - size of hole to reserve
9350 + *
9351 + * Can be used to relocate the fixmap area and poke a hole in the top
9352 + * of kernel address space to make room for a hypervisor.
9353 + */
9354 +void __init reserve_top_address(unsigned long reserve)
9355 +{
9356 + BUG_ON(fixmaps > 0);
9357 + __FIXADDR_TOP = -reserve - PAGE_SIZE;
9358 + __VMALLOC_RESERVE += reserve;
9359 }
9360
9361 pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
9362 --- a/arch/x86/pci/irq-xen.c
9363 +++ b/arch/x86/pci/irq-xen.c
9364 @@ -991,10 +991,6 @@
9365 pci_name(bridge), 'A' + pin, irq);
9366 }
9367 if (irq >= 0) {
9368 - if (use_pci_vector() &&
9369 - !platform_legacy_irq(irq))
9370 - irq = IO_APIC_VECTOR(irq);
9371 -
9372 printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
9373 pci_name(dev), 'A' + pin, irq);
9374 dev->irq = irq;
9375 @@ -1155,10 +1151,6 @@
9376 }
9377 dev = temp_dev;
9378 if (irq >= 0) {
9379 -#ifdef CONFIG_PCI_MSI
9380 - if (!platform_legacy_irq(irq))
9381 - irq = IO_APIC_VECTOR(irq);
9382 -#endif
9383 printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n",
9384 pci_name(dev), 'A' + pin, irq);
9385 dev->irq = irq;
9386 @@ -1179,33 +1171,3 @@
9387 }
9388 return 0;
9389 }
9390 -
9391 -int pci_vector_resources(int last, int nr_released)
9392 -{
9393 - int count = nr_released;
9394 -
9395 - int next = last;
9396 - int offset = (last % 8);
9397 -
9398 - while (next < FIRST_SYSTEM_VECTOR) {
9399 - next += 8;
9400 -#ifdef CONFIG_X86_64
9401 - if (next == IA32_SYSCALL_VECTOR)
9402 - continue;
9403 -#else
9404 - if (next == SYSCALL_VECTOR)
9405 - continue;
9406 -#endif
9407 - count++;
9408 - if (next >= FIRST_SYSTEM_VECTOR) {
9409 - if (offset%8) {
9410 - next = FIRST_DEVICE_VECTOR + offset;
9411 - offset++;
9412 - continue;
9413 - }
9414 - count--;
9415 - }
9416 - }
9417 -
9418 - return count;
9419 -}
9420 --- a/drivers/char/tpm/tpm_xen.c
9421 +++ b/drivers/char/tpm/tpm_xen.c
9422 @@ -85,8 +85,7 @@
9423
9424 /* local function prototypes */
9425 static irqreturn_t tpmif_int(int irq,
9426 - void *tpm_priv,
9427 - struct pt_regs *ptregs);
9428 + void *tpm_priv);
9429 static void tpmif_rx_action(unsigned long unused);
9430 static int tpmif_connect(struct xenbus_device *dev,
9431 struct tpm_private *tp,
9432 @@ -559,7 +558,7 @@
9433 }
9434
9435
9436 -static irqreturn_t tpmif_int(int irq, void *tpm_priv, struct pt_regs *ptregs)
9437 +static irqreturn_t tpmif_int(int irq, void *tpm_priv)
9438 {
9439 struct tpm_private *tp = tpm_priv;
9440 unsigned long flags;
9441 --- a/drivers/pci/Kconfig
9442 +++ b/drivers/pci/Kconfig
9443 @@ -45,7 +45,7 @@
9444 config HT_IRQ
9445 bool "Interrupts on hypertransport devices"
9446 default y
9447 - depends on PCI && X86_LOCAL_APIC && X86_IO_APIC
9448 + depends on PCI && X86_LOCAL_APIC && X86_IO_APIC && !XEN
9449 help
9450 This allows native hypertransport devices to use interrupts.
9451
9452 --- a/drivers/xen/Kconfig
9453 +++ b/drivers/xen/Kconfig
9454 @@ -278,6 +278,9 @@
9455 config HAVE_IRQ_IGNORE_UNHANDLED
9456 def_bool y
9457
9458 +config GENERIC_HARDIRQS_NO__DO_IRQ
9459 + def_bool y
9460 +
9461 config NO_IDLE_HZ
9462 def_bool y
9463
9464 --- a/drivers/xen/balloon/balloon.c
9465 +++ b/drivers/xen/balloon/balloon.c
9466 @@ -84,7 +84,7 @@
9467 /* VM /proc information for memory */
9468 extern unsigned long totalram_pages;
9469
9470 -#ifndef MODULE
9471 +#if !defined(MODULE) && defined(CONFIG_HIGHMEM)
9472 extern unsigned long totalhigh_pages;
9473 #define inc_totalhigh_pages() (totalhigh_pages++)
9474 #define dec_totalhigh_pages() (totalhigh_pages--)
9475 --- a/drivers/xen/blkback/blkback.c
9476 +++ b/drivers/xen/blkback/blkback.c
9477 @@ -288,7 +288,7 @@
9478 wake_up(&blkif->wq);
9479 }
9480
9481 -irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
9482 +irqreturn_t blkif_be_int(int irq, void *dev_id)
9483 {
9484 blkif_notify_work(dev_id);
9485 return IRQ_HANDLED;
9486 --- a/drivers/xen/blkback/common.h
9487 +++ b/drivers/xen/blkback/common.h
9488 @@ -130,7 +130,7 @@
9489
9490 void blkif_xenbus_init(void);
9491
9492 -irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs);
9493 +irqreturn_t blkif_be_int(int irq, void *dev_id);
9494 int blkif_schedule(void *arg);
9495
9496 int blkback_barrier(struct xenbus_transaction xbt,
9497 --- a/drivers/xen/blkfront/blkfront.c
9498 +++ b/drivers/xen/blkfront/blkfront.c
9499 @@ -69,7 +69,7 @@
9500
9501 static void kick_pending_request_queues(struct blkfront_info *);
9502
9503 -static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs);
9504 +static irqreturn_t blkif_int(int irq, void *dev_id);
9505 static void blkif_restart_queue(void *arg);
9506 static void blkif_recover(struct blkfront_info *);
9507 static void blkif_completion(struct blk_shadow *);
9508 @@ -698,7 +698,7 @@
9509 }
9510
9511
9512 -static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs)
9513 +static irqreturn_t blkif_int(int irq, void *dev_id)
9514 {
9515 struct request *req;
9516 blkif_response_t *bret;
9517 --- a/drivers/xen/blktap/blktap.c
9518 +++ b/drivers/xen/blktap/blktap.c
9519 @@ -1175,7 +1175,7 @@
9520 wake_up(&blkif->wq);
9521 }
9522
9523 -irqreturn_t tap_blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
9524 +irqreturn_t tap_blkif_be_int(int irq, void *dev_id)
9525 {
9526 blkif_notify_work(dev_id);
9527 return IRQ_HANDLED;
9528 --- a/drivers/xen/blktap/common.h
9529 +++ b/drivers/xen/blktap/common.h
9530 @@ -112,7 +112,7 @@
9531
9532 void tap_blkif_xenbus_init(void);
9533
9534 -irqreturn_t tap_blkif_be_int(int irq, void *dev_id, struct pt_regs *regs);
9535 +irqreturn_t tap_blkif_be_int(int irq, void *dev_id);
9536 int tap_blkif_schedule(void *arg);
9537
9538 int dom_to_devid(domid_t domid, int xenbus_id, blkif_t *blkif);
9539 --- a/drivers/xen/console/console.c
9540 +++ b/drivers/xen/console/console.c
9541 @@ -345,7 +345,7 @@
9542 static int xencons_priv_irq;
9543 static char x_char;
9544
9545 -void xencons_rx(char *buf, unsigned len, struct pt_regs *regs)
9546 +void xencons_rx(char *buf, unsigned len)
9547 {
9548 int i;
9549 unsigned long flags;
9550 @@ -370,8 +370,7 @@
9551 if (time_before(jiffies, sysrq_timeout)) {
9552 spin_unlock_irqrestore(
9553 &xencons_lock, flags);
9554 - handle_sysrq(
9555 - buf[i], regs, xencons_tty);
9556 + handle_sysrq(buf[i], xencons_tty);
9557 spin_lock_irqsave(
9558 &xencons_lock, flags);
9559 continue;
9560 @@ -436,14 +435,13 @@
9561 }
9562
9563 /* Privileged receive callback and transmit kicker. */
9564 -static irqreturn_t xencons_priv_interrupt(int irq, void *dev_id,
9565 - struct pt_regs *regs)
9566 +static irqreturn_t xencons_priv_interrupt(int irq, void *dev_id)
9567 {
9568 static char rbuf[16];
9569 int l;
9570
9571 while ((l = HYPERVISOR_console_io(CONSOLEIO_read, 16, rbuf)) > 0)
9572 - xencons_rx(rbuf, l, regs);
9573 + xencons_rx(rbuf, l);
9574
9575 xencons_tx();
9576
9577 --- a/drivers/xen/console/xencons_ring.c
9578 +++ b/drivers/xen/console/xencons_ring.c
9579 @@ -83,7 +83,7 @@
9580 return sent;
9581 }
9582
9583 -static irqreturn_t handle_input(int irq, void *unused, struct pt_regs *regs)
9584 +static irqreturn_t handle_input(int irq, void *unused)
9585 {
9586 struct xencons_interface *intf = xencons_interface();
9587 XENCONS_RING_IDX cons, prod;
9588 @@ -94,7 +94,7 @@
9589 BUG_ON((prod - cons) > sizeof(intf->in));
9590
9591 while (cons != prod) {
9592 - xencons_rx(intf->in+MASK_XENCONS_IDX(cons,intf->in), 1, regs);
9593 + xencons_rx(intf->in+MASK_XENCONS_IDX(cons,intf->in), 1);
9594 cons++;
9595 }
9596
9597 --- a/drivers/xen/core/evtchn.c
9598 +++ b/drivers/xen/core/evtchn.c
9599 @@ -507,7 +507,7 @@
9600
9601 int bind_caller_port_to_irqhandler(
9602 unsigned int caller_port,
9603 - irqreturn_t (*handler)(int, void *, struct pt_regs *),
9604 + irq_handler_t handler,
9605 unsigned long irqflags,
9606 const char *devname,
9607 void *dev_id)
9608 @@ -530,7 +530,7 @@
9609
9610 int bind_listening_port_to_irqhandler(
9611 unsigned int remote_domain,
9612 - irqreturn_t (*handler)(int, void *, struct pt_regs *),
9613 + irq_handler_t handler,
9614 unsigned long irqflags,
9615 const char *devname,
9616 void *dev_id)
9617 @@ -554,7 +554,7 @@
9618 int bind_interdomain_evtchn_to_irqhandler(
9619 unsigned int remote_domain,
9620 unsigned int remote_port,
9621 - irqreturn_t (*handler)(int, void *, struct pt_regs *),
9622 + irq_handler_t handler,
9623 unsigned long irqflags,
9624 const char *devname,
9625 void *dev_id)
9626 @@ -578,7 +578,7 @@
9627 int bind_virq_to_irqhandler(
9628 unsigned int virq,
9629 unsigned int cpu,
9630 - irqreturn_t (*handler)(int, void *, struct pt_regs *),
9631 + irq_handler_t handler,
9632 unsigned long irqflags,
9633 const char *devname,
9634 void *dev_id)
9635 @@ -602,7 +602,7 @@
9636 int bind_ipi_to_irqhandler(
9637 unsigned int ipi,
9638 unsigned int cpu,
9639 - irqreturn_t (*handler)(int, void *, struct pt_regs *),
9640 + irq_handler_t handler,
9641 unsigned long irqflags,
9642 const char *devname,
9643 void *dev_id)
9644 @@ -687,15 +687,7 @@
9645 return 0;
9646 }
9647
9648 -static void shutdown_dynirq(unsigned int irq)
9649 -{
9650 - int evtchn = evtchn_from_irq(irq);
9651 -
9652 - if (VALID_EVTCHN(evtchn))
9653 - mask_evtchn(evtchn);
9654 -}
9655 -
9656 -static void enable_dynirq(unsigned int irq)
9657 +static void unmask_dynirq(unsigned int irq)
9658 {
9659 int evtchn = evtchn_from_irq(irq);
9660
9661 @@ -703,7 +695,7 @@
9662 unmask_evtchn(evtchn);
9663 }
9664
9665 -static void disable_dynirq(unsigned int irq)
9666 +static void mask_dynirq(unsigned int irq)
9667 {
9668 int evtchn = evtchn_from_irq(irq);
9669
9670 @@ -731,12 +723,12 @@
9671 unmask_evtchn(evtchn);
9672 }
9673
9674 -static struct hw_interrupt_type dynirq_type = {
9675 - .typename = "Dynamic-irq",
9676 +static struct irq_chip dynirq_chip = {
9677 + .name = "Dynamic-irq",
9678 .startup = startup_dynirq,
9679 - .shutdown = shutdown_dynirq,
9680 - .enable = enable_dynirq,
9681 - .disable = disable_dynirq,
9682 + .mask = mask_dynirq,
9683 + .unmask = unmask_dynirq,
9684 + .mask_ack = ack_dynirq,
9685 .ack = ack_dynirq,
9686 .end = end_dynirq,
9687 #ifdef CONFIG_SMP
9688 @@ -820,12 +812,12 @@
9689 irq_info[irq] = IRQ_UNBOUND;
9690 }
9691
9692 -static void enable_pirq(unsigned int irq)
9693 +static void unmask_pirq(unsigned int irq)
9694 {
9695 startup_pirq(irq);
9696 }
9697
9698 -static void disable_pirq(unsigned int irq)
9699 +static void mask_pirq(unsigned int irq)
9700 {
9701 }
9702
9703 @@ -854,12 +846,14 @@
9704 }
9705 }
9706
9707 -static struct hw_interrupt_type pirq_type = {
9708 +static struct irq_chip pirq_chip = {
9709 + .name = "Phys-irq",
9710 .typename = "Phys-irq",
9711 .startup = startup_pirq,
9712 .shutdown = shutdown_pirq,
9713 - .enable = enable_pirq,
9714 - .disable = disable_pirq,
9715 + .mask = mask_pirq,
9716 + .unmask = unmask_pirq,
9717 + .mask_ack = ack_pirq,
9718 .ack = ack_pirq,
9719 .end = end_pirq,
9720 #ifdef CONFIG_SMP
9721 @@ -1043,7 +1037,8 @@
9722 irq_desc[dynirq_to_irq(i)].status = IRQ_DISABLED;
9723 irq_desc[dynirq_to_irq(i)].action = NULL;
9724 irq_desc[dynirq_to_irq(i)].depth = 1;
9725 - irq_desc[dynirq_to_irq(i)].chip = &dynirq_type;
9726 + set_irq_chip_and_handler_name(dynirq_to_irq(i), &dynirq_chip,
9727 + handle_level_irq, "level");
9728 }
9729
9730 /* Phys IRQ space is statically bound (1:1 mapping). Nail refcnts. */
9731 @@ -1059,6 +1054,7 @@
9732 irq_desc[pirq_to_irq(i)].status = IRQ_DISABLED;
9733 irq_desc[pirq_to_irq(i)].action = NULL;
9734 irq_desc[pirq_to_irq(i)].depth = 1;
9735 - irq_desc[pirq_to_irq(i)].chip = &pirq_type;
9736 + set_irq_chip_and_handler_name(pirq_to_irq(i), &pirq_chip,
9737 + handle_level_irq, "level");
9738 }
9739 }
9740 --- a/drivers/xen/core/reboot.c
9741 +++ b/drivers/xen/core/reboot.c
9742 @@ -13,6 +13,7 @@
9743
9744 #ifdef HAVE_XEN_PLATFORM_COMPAT_H
9745 #include <xen/platform-compat.h>
9746 +#undef handle_sysrq
9747 #endif
9748
9749 MODULE_LICENSE("Dual BSD/GPL");
9750 @@ -203,7 +204,7 @@
9751
9752 #ifdef CONFIG_MAGIC_SYSRQ
9753 if (sysrq_key != '\0')
9754 - handle_sysrq(sysrq_key, NULL, NULL);
9755 + handle_sysrq(sysrq_key, NULL);
9756 #endif
9757 }
9758
9759 --- a/drivers/xen/core/smpboot.c
9760 +++ b/drivers/xen/core/smpboot.c
9761 @@ -25,8 +25,8 @@
9762 #include <xen/cpu_hotplug.h>
9763 #include <xen/xenbus.h>
9764
9765 -extern irqreturn_t smp_reschedule_interrupt(int, void *, struct pt_regs *);
9766 -extern irqreturn_t smp_call_function_interrupt(int, void *, struct pt_regs *);
9767 +extern irqreturn_t smp_reschedule_interrupt(int, void *);
9768 +extern irqreturn_t smp_call_function_interrupt(int, void *);
9769
9770 extern int local_setup_timer(unsigned int cpu);
9771 extern void local_teardown_timer(unsigned int cpu);
9772 @@ -66,8 +66,6 @@
9773 #if defined(__i386__)
9774 u8 x86_cpu_to_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = 0xff };
9775 EXPORT_SYMBOL(x86_cpu_to_apicid);
9776 -#elif !defined(CONFIG_X86_IO_APIC)
9777 -unsigned int maxcpus = NR_CPUS;
9778 #endif
9779
9780 void __init prefill_possible_map(void)
9781 --- a/drivers/xen/fbfront/xenfb.c
9782 +++ b/drivers/xen/fbfront/xenfb.c
9783 @@ -523,8 +523,7 @@
9784 .fb_set_par = xenfb_set_par,
9785 };
9786
9787 -static irqreturn_t xenfb_event_handler(int rq, void *dev_id,
9788 - struct pt_regs *regs)
9789 +static irqreturn_t xenfb_event_handler(int rq, void *dev_id)
9790 {
9791 /*
9792 * No in events recognized, simply ignore them all.
9793 --- a/drivers/xen/fbfront/xenkbd.c
9794 +++ b/drivers/xen/fbfront/xenkbd.c
9795 @@ -46,7 +46,7 @@
9796 * to do that.
9797 */
9798
9799 -static irqreturn_t input_handler(int rq, void *dev_id, struct pt_regs *regs)
9800 +static irqreturn_t input_handler(int rq, void *dev_id)
9801 {
9802 struct xenkbd_info *info = dev_id;
9803 struct xenkbd_page *page = info->page;
9804 --- a/drivers/xen/gntdev/gntdev.c
9805 +++ b/drivers/xen/gntdev/gntdev.c
9806 @@ -755,9 +755,6 @@
9807 BUG();
9808 }
9809
9810 - /* Copy the existing value of the PTE for returning. */
9811 - copy = *ptep;
9812 -
9813 /* Calculate the grant relating to this PTE. */
9814 slot_index = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
9815
9816 @@ -772,6 +769,10 @@
9817 GNTDEV_INVALID_HANDLE &&
9818 !xen_feature(XENFEAT_auto_translated_physmap)) {
9819 /* NOT USING SHADOW PAGE TABLES. */
9820 +
9821 + /* Copy the existing value of the PTE for returning. */
9822 + copy = *ptep;
9823 +
9824 gnttab_set_unmap_op(&op, virt_to_machine(ptep),
9825 GNTMAP_contains_pte,
9826 private_data->grants[slot_index]
9827 @@ -784,7 +785,7 @@
9828 op.status);
9829 } else {
9830 /* USING SHADOW PAGE TABLES. */
9831 - pte_clear_full(vma->vm_mm, addr, ptep, is_fullmm);
9832 + copy = ptep_get_and_clear_full(vma->vm_mm, addr, ptep, is_fullmm);
9833 }
9834
9835 /* Finally, we unmap the grant from kernel space. */
9836 @@ -812,7 +813,7 @@
9837 >> PAGE_SHIFT, INVALID_P2M_ENTRY);
9838
9839 } else {
9840 - pte_clear_full(vma->vm_mm, addr, ptep, is_fullmm);
9841 + copy = ptep_get_and_clear_full(vma->vm_mm, addr, ptep, is_fullmm);
9842 }
9843
9844 return copy;
9845 --- a/drivers/xen/netback/accel.c
9846 +++ b/drivers/xen/netback/accel.c
9847 @@ -65,7 +65,7 @@
9848
9849 if (IS_ERR(eth_name)) {
9850 /* Probably means not present */
9851 - DPRINTK("%s: no match due to xenbus_read accel error %d\n",
9852 + DPRINTK("%s: no match due to xenbus_read accel error %ld\n",
9853 __FUNCTION__, PTR_ERR(eth_name));
9854 return 0;
9855 } else {
9856 --- a/drivers/xen/netback/common.h
9857 +++ b/drivers/xen/netback/common.h
9858 @@ -200,7 +200,7 @@
9859
9860 int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev);
9861 struct net_device_stats *netif_be_get_stats(struct net_device *dev);
9862 -irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs);
9863 +irqreturn_t netif_be_int(int irq, void *dev_id);
9864
9865 static inline int netbk_can_queue(struct net_device *dev)
9866 {
9867 --- a/drivers/xen/netback/loopback.c
9868 +++ b/drivers/xen/netback/loopback.c
9869 @@ -151,7 +151,7 @@
9870 np->stats.rx_bytes += skb->len;
9871 np->stats.rx_packets++;
9872
9873 - if (skb->ip_summed == CHECKSUM_HW) {
9874 + if (skb->ip_summed == CHECKSUM_PARTIAL) {
9875 /* Defer checksum calculation. */
9876 skb->proto_csum_blank = 1;
9877 /* Must be a local packet: assert its integrity. */
9878 --- a/drivers/xen/netback/netback.c
9879 +++ b/drivers/xen/netback/netback.c
9880 @@ -677,7 +677,7 @@
9881 id = meta[npo.meta_cons].id;
9882 flags = nr_frags ? NETRXF_more_data : 0;
9883
9884 - if (skb->ip_summed == CHECKSUM_HW) /* local packet? */
9885 + if (skb->ip_summed == CHECKSUM_PARTIAL) /* local packet? */
9886 flags |= NETRXF_csum_blank | NETRXF_data_validated;
9887 else if (skb->proto_data_valid) /* remote but checksummed? */
9888 flags |= NETRXF_data_validated;
9889 @@ -1441,7 +1441,7 @@
9890 netif_idx_release(netif_page_index(page));
9891 }
9892
9893 -irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs)
9894 +irqreturn_t netif_be_int(int irq, void *dev_id)
9895 {
9896 netif_t *netif = dev_id;
9897
9898 @@ -1508,7 +1508,7 @@
9899 }
9900
9901 #ifdef NETBE_DEBUG_INTERRUPT
9902 -static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs)
9903 +static irqreturn_t netif_be_dbg(int irq, void *dev_id)
9904 {
9905 struct list_head *ent;
9906 netif_t *netif;
9907 --- a/drivers/xen/netfront/netfront.c
9908 +++ b/drivers/xen/netfront/netfront.c
9909 @@ -136,7 +136,7 @@
9910 {
9911 return skb_is_gso(skb) &&
9912 (!skb_gso_ok(skb, dev->features) ||
9913 - unlikely(skb->ip_summed != CHECKSUM_HW));
9914 + unlikely(skb->ip_summed != CHECKSUM_PARTIAL));
9915 }
9916 #else
9917 #define HAVE_GSO 0
9918 @@ -222,7 +222,7 @@
9919 static void network_alloc_rx_buffers(struct net_device *);
9920 static void send_fake_arp(struct net_device *);
9921
9922 -static irqreturn_t netif_int(int irq, void *dev_id, struct pt_regs *ptregs);
9923 +static irqreturn_t netif_int(int irq, void *dev_id);
9924
9925 #ifdef CONFIG_SYSFS
9926 static int xennet_sysfs_addif(struct net_device *netdev);
9927 @@ -992,7 +992,7 @@
9928 tx->flags = 0;
9929 extra = NULL;
9930
9931 - if (skb->ip_summed == CHECKSUM_HW) /* local packet? */
9932 + if (skb->ip_summed == CHECKSUM_PARTIAL) /* local packet? */
9933 tx->flags |= NETTXF_csum_blank | NETTXF_data_validated;
9934 #ifdef CONFIG_XEN
9935 if (skb->proto_data_valid) /* remote but checksummed? */
9936 @@ -1049,7 +1049,7 @@
9937 return 0;
9938 }
9939
9940 -static irqreturn_t netif_int(int irq, void *dev_id, struct pt_regs *ptregs)
9941 +static irqreturn_t netif_int(int irq, void *dev_id)
9942 {
9943 struct net_device *dev = dev_id;
9944 struct netfront_info *np = netdev_priv(dev);
9945 --- a/drivers/xen/pciback/pciback.h
9946 +++ b/drivers/xen/pciback/pciback.h
9947 @@ -87,7 +87,7 @@
9948 void pciback_release_devices(struct pciback_device *pdev);
9949
9950 /* Handles events from front-end */
9951 -irqreturn_t pciback_handle_event(int irq, void *dev_id, struct pt_regs *regs);
9952 +irqreturn_t pciback_handle_event(int irq, void *dev_id);
9953 void pciback_do_op(void *data);
9954
9955 int pciback_xenbus_register(void);
9956 --- a/drivers/xen/pciback/pciback_ops.c
9957 +++ b/drivers/xen/pciback/pciback_ops.c
9958 @@ -85,7 +85,7 @@
9959 test_and_schedule_op(pdev);
9960 }
9961
9962 -irqreturn_t pciback_handle_event(int irq, void *dev_id, struct pt_regs *regs)
9963 +irqreturn_t pciback_handle_event(int irq, void *dev_id)
9964 {
9965 struct pciback_device *pdev = dev_id;
9966
9967 --- a/drivers/xen/pcifront/pci_op.c
9968 +++ b/drivers/xen/pcifront/pci_op.c
9969 @@ -392,10 +392,16 @@
9970
9971 d = pci_scan_single_device(b, devfn);
9972 if (d) {
9973 + int err;
9974 +
9975 dev_info(&pdev->xdev->dev, "New device on "
9976 "%04x:%02x:%02x.%02x found.\n", domain, bus,
9977 PCI_SLOT(devfn), PCI_FUNC(devfn));
9978 - pci_bus_add_device(d);
9979 + err = pci_bus_add_device(d);
9980 + if (err)
9981 + dev_err(&pdev->xdev->dev,
9982 + "error %d adding device, continuing.\n",
9983 + err);
9984 }
9985 }
9986
9987 --- a/drivers/xen/privcmd/compat_privcmd.c
9988 +++ b/drivers/xen/privcmd/compat_privcmd.c
9989 @@ -18,7 +18,6 @@
9990 * Authors: Jimi Xenidis <jimix@watson.ibm.com>
9991 */
9992
9993 -#include <linux/config.h>
9994 #include <linux/compat.h>
9995 #include <linux/ioctl.h>
9996 #include <linux/syscalls.h>
9997 --- a/drivers/xen/privcmd/privcmd.c
9998 +++ b/drivers/xen/privcmd/privcmd.c
9999 @@ -236,7 +236,7 @@
10000 #endif
10001
10002 /* DONTCOPY is essential for Xen as copy_page_range is broken. */
10003 - vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY;
10004 + vma->vm_flags |= VM_RESERVED | VM_IO | VM_PFNMAP | VM_DONTCOPY;
10005 vma->vm_ops = &privcmd_vm_ops;
10006 vma->vm_private_data = NULL;
10007
10008 --- a/drivers/xen/sfc_netback/accel_xenbus.c
10009 +++ b/drivers/xen/sfc_netback/accel_xenbus.c
10010 @@ -68,8 +68,7 @@
10011
10012
10013 /* Demultiplex a message IRQ from the frontend driver. */
10014 -static irqreturn_t msgirq_from_frontend(int irq, void *context,
10015 - struct pt_regs *unused)
10016 +static irqreturn_t msgirq_from_frontend(int irq, void *context)
10017 {
10018 struct xenbus_device *dev = context;
10019 struct netback_accel *bend = NETBACK_ACCEL_FROM_XENBUS_DEVICE(dev);
10020 @@ -84,8 +83,7 @@
10021 * functionally, but we need it to pass to the bind function, and may
10022 * get called spuriously
10023 */
10024 -static irqreturn_t netirq_from_frontend(int irq, void *context,
10025 - struct pt_regs *unused)
10026 +static irqreturn_t netirq_from_frontend(int irq, void *context)
10027 {
10028 VPRINTK("netirq %d from device %s\n", irq,
10029 ((struct xenbus_device *)context)->nodename);
10030 --- a/drivers/xen/sfc_netfront/accel.h
10031 +++ b/drivers/xen/sfc_netfront/accel.h
10032 @@ -449,10 +449,8 @@
10033 u32 ip, u16 port, u8 protocol);
10034
10035 /* Process an IRQ received from back end driver */
10036 -irqreturn_t netfront_accel_msg_channel_irq_from_bend(int irq, void *context,
10037 - struct pt_regs *unused);
10038 -irqreturn_t netfront_accel_net_channel_irq_from_bend(int irq, void *context,
10039 - struct pt_regs *unused);
10040 +irqreturn_t netfront_accel_msg_channel_irq_from_bend(int irq, void *context);
10041 +irqreturn_t netfront_accel_net_channel_irq_from_bend(int irq, void *context);
10042
10043 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
10044 extern void netfront_accel_msg_from_bend(struct work_struct *context);
10045 --- a/drivers/xen/sfc_netfront/accel_msg.c
10046 +++ b/drivers/xen/sfc_netfront/accel_msg.c
10047 @@ -490,8 +490,7 @@
10048 }
10049
10050
10051 -irqreturn_t netfront_accel_msg_channel_irq_from_bend(int irq, void *context,
10052 - struct pt_regs *unused)
10053 +irqreturn_t netfront_accel_msg_channel_irq_from_bend(int irq, void *context)
10054 {
10055 netfront_accel_vnic *vnic = (netfront_accel_vnic *)context;
10056 VPRINTK("irq %d from device %s\n", irq, vnic->dev->nodename);
10057 @@ -502,8 +501,7 @@
10058 }
10059
10060 /* Process an interrupt received from the NIC via backend */
10061 -irqreturn_t netfront_accel_net_channel_irq_from_bend(int irq, void *context,
10062 - struct pt_regs *unused)
10063 +irqreturn_t netfront_accel_net_channel_irq_from_bend(int irq, void *context)
10064 {
10065 netfront_accel_vnic *vnic = (netfront_accel_vnic *)context;
10066 struct net_device *net_dev = vnic->net_dev;
10067 --- a/drivers/xen/sfc_netfront/accel_tso.c
10068 +++ b/drivers/xen/sfc_netfront/accel_tso.c
10069 @@ -363,7 +363,7 @@
10070
10071 tso_check_safe(skb);
10072
10073 - if (skb->ip_summed != CHECKSUM_HW)
10074 + if (skb->ip_summed != CHECKSUM_PARTIAL)
10075 EPRINTK("Trying to TSO send a packet without HW checksum\n");
10076
10077 tso_start(&state, skb);
10078 --- a/drivers/xen/sfc_netfront/accel_vi.c
10079 +++ b/drivers/xen/sfc_netfront/accel_vi.c
10080 @@ -461,7 +461,7 @@
10081
10082 frag_i = -1;
10083
10084 - if (skb->ip_summed == CHECKSUM_HW) {
10085 + if (skb->ip_summed == CHECKSUM_PARTIAL) {
10086 /* Set to zero to encourage falcon to work it out for us */
10087 *(u16*)(skb->h.raw + skb->csum) = 0;
10088 }
10089 @@ -580,7 +580,7 @@
10090
10091 kva = buf->pkt_kva;
10092
10093 - if (skb->ip_summed == CHECKSUM_HW) {
10094 + if (skb->ip_summed == CHECKSUM_PARTIAL) {
10095 /* Set to zero to encourage falcon to work it out for us */
10096 *(u16*)(skb->h.raw + skb->csum) = 0;
10097 }
10098 --- a/drivers/xen/tpmback/common.h
10099 +++ b/drivers/xen/tpmback/common.h
10100 @@ -61,7 +61,7 @@
10101 void tpmif_xenbus_init(void);
10102 void tpmif_xenbus_exit(void);
10103 int tpmif_map(tpmif_t *tpmif, unsigned long shared_page, unsigned int evtchn);
10104 -irqreturn_t tpmif_be_int(int irq, void *dev_id, struct pt_regs *regs);
10105 +irqreturn_t tpmif_be_int(int irq, void *dev_id);
10106
10107 long int tpmback_get_instance(struct backend_info *bi);
10108
10109 --- a/drivers/xen/tpmback/tpmback.c
10110 +++ b/drivers/xen/tpmback/tpmback.c
10111 @@ -502,7 +502,7 @@
10112 list_del(&pak->next);
10113 write_unlock_irqrestore(&dataex.pak_lock, flags);
10114
10115 - DPRINTK("size given by app: %d, available: %d\n", size, left);
10116 + DPRINTK("size given by app: %zu, available: %u\n", size, left);
10117
10118 ret_size = min_t(size_t, size, left);
10119
10120 @@ -899,7 +899,7 @@
10121 }
10122 }
10123
10124 -irqreturn_t tpmif_be_int(int irq, void *dev_id, struct pt_regs *regs)
10125 +irqreturn_t tpmif_be_int(int irq, void *dev_id)
10126 {
10127 tpmif_t *tpmif = (tpmif_t *) dev_id;
10128
10129 --- a/drivers/xen/xenbus/xenbus_comms.c
10130 +++ b/drivers/xen/xenbus/xenbus_comms.c
10131 @@ -55,7 +55,7 @@
10132
10133 static DECLARE_WAIT_QUEUE_HEAD(xb_waitq);
10134
10135 -static irqreturn_t wake_waiting(int irq, void *unused, struct pt_regs *regs)
10136 +static irqreturn_t wake_waiting(int irq, void *unused)
10137 {
10138 if (unlikely(xenstored_ready == 0)) {
10139 xenstored_ready = 1;
10140 --- a/drivers/xen/xenoprof/xenoprofile.c
10141 +++ b/drivers/xen/xenoprof/xenoprofile.c
10142 @@ -195,7 +195,7 @@
10143 }
10144
10145 static irqreturn_t
10146 -xenoprof_ovf_interrupt(int irq, void * dev_id, struct pt_regs * regs)
10147 +xenoprof_ovf_interrupt(int irq, void * dev_id)
10148 {
10149 struct xenoprof_buf * buf;
10150 static unsigned long flag;
10151 --- a/include/asm-generic/pgtable.h
10152 +++ b/include/asm-generic/pgtable.h
10153 @@ -100,7 +100,7 @@
10154 #endif
10155
10156 #ifndef arch_change_pte_range
10157 -#define arch_change_pte_range(mm, pmd, addr, end, newprot) 0
10158 +#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) 0
10159 #endif
10160
10161 #ifndef __HAVE_ARCH_PTE_SAME
10162 --- a/include/asm-x86/mach-xen/asm/desc_32.h
10163 +++ b/include/asm-x86/mach-xen/asm/desc_32.h
10164 @@ -32,52 +32,110 @@
10165 return (struct desc_struct *)per_cpu(cpu_gdt_descr, cpu).address;
10166 }
10167
10168 +/*
10169 + * This is the ldt that every process will get unless we need
10170 + * something other than this.
10171 + */
10172 +extern struct desc_struct default_ldt[];
10173 +extern struct desc_struct idt_table[];
10174 +extern void set_intr_gate(unsigned int irq, void * addr);
10175 +
10176 +static inline void pack_descriptor(__u32 *a, __u32 *b,
10177 + unsigned long base, unsigned long limit, unsigned char type, unsigned char flags)
10178 +{
10179 + *a = ((base & 0xffff) << 16) | (limit & 0xffff);
10180 + *b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
10181 + (limit & 0x000f0000) | ((type & 0xff) << 8) | ((flags & 0xf) << 20);
10182 +}
10183 +
10184 +static inline void pack_gate(__u32 *a, __u32 *b,
10185 + unsigned long base, unsigned short seg, unsigned char type, unsigned char flags)
10186 +{
10187 + *a = (seg << 16) | (base & 0xffff);
10188 + *b = (base & 0xffff0000) | ((type & 0xff) << 8) | (flags & 0xff);
10189 +}
10190 +
10191 +#define DESCTYPE_LDT 0x82 /* present, system, DPL-0, LDT */
10192 +#define DESCTYPE_TSS 0x89 /* present, system, DPL-0, 32-bit TSS */
10193 +#define DESCTYPE_TASK 0x85 /* present, system, DPL-0, task gate */
10194 +#define DESCTYPE_INT 0x8e /* present, system, DPL-0, interrupt gate */
10195 +#define DESCTYPE_TRAP 0x8f /* present, system, DPL-0, trap gate */
10196 +#define DESCTYPE_DPL3 0x60 /* DPL-3 */
10197 +#define DESCTYPE_S 0x10 /* !system */
10198 +
10199 #define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8))
10200 #define load_LDT_desc() __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8))
10201
10202 #define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
10203 #define load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr))
10204 -#define load_tr(tr) __asm__ __volatile("ltr %0"::"mr" (tr))
10205 -#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"mr" (ldt))
10206 +#define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr))
10207 +#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt))
10208
10209 #define store_gdt(dtr) __asm__ ("sgdt %0":"=m" (*dtr))
10210 #define store_idt(dtr) __asm__ ("sidt %0":"=m" (*dtr))
10211 -#define store_tr(tr) __asm__ ("str %0":"=mr" (tr))
10212 -#define store_ldt(ldt) __asm__ ("sldt %0":"=mr" (ldt))
10213 +#define store_tr(tr) __asm__ ("str %0":"=m" (tr))
10214 +#define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt))
10215
10216 -/*
10217 - * This is the ldt that every process will get unless we need
10218 - * something other than this.
10219 - */
10220 -extern struct desc_struct default_ldt[];
10221 -extern void set_intr_gate(unsigned int irq, void * addr);
10222 +#if TLS_SIZE != 24
10223 +# error update this code.
10224 +#endif
10225 +
10226 +static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
10227 +{
10228 +#define C(i) if (HYPERVISOR_update_descriptor(virt_to_machine(&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]), \
10229 + *(u64 *)&t->tls_array[i])) \
10230 + BUG()
10231 + C(0); C(1); C(2);
10232 +#undef C
10233 +}
10234
10235 -#define _set_tssldt_desc(n,addr,limit,type) \
10236 -__asm__ __volatile__ ("movw %w3,0(%2)\n\t" \
10237 - "movw %w1,2(%2)\n\t" \
10238 - "rorl $16,%1\n\t" \
10239 - "movb %b1,4(%2)\n\t" \
10240 - "movb %4,5(%2)\n\t" \
10241 - "movb $0,6(%2)\n\t" \
10242 - "movb %h1,7(%2)\n\t" \
10243 - "rorl $16,%1" \
10244 - : "=m"(*(n)) : "q" (addr), "r"(n), "ir"(limit), "i"(type))
10245 +#ifndef CONFIG_XEN
10246 +static inline void write_dt_entry(void *dt, int entry, __u32 entry_a, __u32 entry_b)
10247 +{
10248 + __u32 *lp = (__u32 *)((char *)dt + entry*8);
10249 + *lp = entry_a;
10250 + *(lp+1) = entry_b;
10251 +}
10252
10253 -#ifndef CONFIG_X86_NO_TSS
10254 -static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, void *addr)
10255 +#define write_ldt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
10256 +#define write_gdt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
10257 +#else
10258 +extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b);
10259 +extern int write_gdt_entry(void *gdt, int entry, __u32 entry_a, __u32 entry_b);
10260 +#endif
10261 +#ifndef CONFIG_X86_NO_IDT
10262 +#define write_idt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
10263 +
10264 +static inline void _set_gate(int gate, unsigned int type, void *addr, unsigned short seg)
10265 {
10266 - _set_tssldt_desc(&get_cpu_gdt_table(cpu)[entry], (int)addr,
10267 - offsetof(struct tss_struct, __cacheline_filler) - 1, 0x89);
10268 + __u32 a, b;
10269 + pack_gate(&a, &b, (unsigned long)addr, seg, type, 0);
10270 + write_idt_entry(idt_table, gate, a, b);
10271 }
10272 +#endif
10273
10274 -#define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
10275 +#ifndef CONFIG_X86_NO_TSS
10276 +static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, const void *addr)
10277 +{
10278 + __u32 a, b;
10279 + pack_descriptor(&a, &b, (unsigned long)addr,
10280 + offsetof(struct tss_struct, __cacheline_filler) - 1,
10281 + DESCTYPE_TSS, 0);
10282 + write_gdt_entry(get_cpu_gdt_table(cpu), entry, a, b);
10283 +}
10284 #endif
10285
10286 -static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int size)
10287 +static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int entries)
10288 {
10289 - _set_tssldt_desc(&get_cpu_gdt_table(cpu)[GDT_ENTRY_LDT], (int)addr, ((size << 3)-1), 0x82);
10290 + __u32 a, b;
10291 + pack_descriptor(&a, &b, (unsigned long)addr,
10292 + entries * sizeof(struct desc_struct) - 1,
10293 + DESCTYPE_LDT, 0);
10294 + write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, a, b);
10295 }
10296
10297 +#define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
10298 +
10299 #define LDT_entry_a(info) \
10300 ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
10301
10302 @@ -103,21 +161,6 @@
10303 (info)->seg_not_present == 1 && \
10304 (info)->useable == 0 )
10305
10306 -extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b);
10307 -
10308 -#if TLS_SIZE != 24
10309 -# error update this code.
10310 -#endif
10311 -
10312 -static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
10313 -{
10314 -#define C(i) if (HYPERVISOR_update_descriptor(virt_to_machine(&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]), \
10315 - *(u64 *)&t->tls_array[i])) \
10316 - BUG();
10317 - C(0); C(1); C(2);
10318 -#undef C
10319 -}
10320 -
10321 static inline void clear_LDT(void)
10322 {
10323 int cpu = get_cpu();
10324 --- a/include/asm-x86/mach-xen/asm/dma-mapping_64.h
10325 +++ b/include/asm-x86/mach-xen/asm/dma-mapping_64.h
10326 @@ -55,13 +55,6 @@
10327 extern struct dma_mapping_ops* dma_ops;
10328 extern int iommu_merge;
10329
10330 -static inline int valid_dma_direction(int dma_direction)
10331 -{
10332 - return ((dma_direction == DMA_BIDIRECTIONAL) ||
10333 - (dma_direction == DMA_TO_DEVICE) ||
10334 - (dma_direction == DMA_FROM_DEVICE));
10335 -}
10336 -
10337 #if 0
10338 static inline int dma_mapping_error(dma_addr_t dma_addr)
10339 {
10340 --- a/include/asm-x86/mach-xen/asm/e820_64.h
10341 +++ b/include/asm-x86/mach-xen/asm/e820_64.h
10342 @@ -19,13 +19,9 @@
10343
10344 #define E820_RAM 1
10345 #define E820_RESERVED 2
10346 -#define E820_ACPI 3 /* usable as RAM once ACPI tables have been read */
10347 +#define E820_ACPI 3
10348 #define E820_NVS 4
10349
10350 -#define HIGH_MEMORY (1024*1024)
10351 -
10352 -#define LOWMEMSIZE() (0x9f000)
10353 -
10354 #ifndef __ASSEMBLY__
10355 struct e820entry {
10356 u64 addr; /* start of memory segment */
10357 @@ -46,17 +42,16 @@
10358 extern void contig_e820_setup(void);
10359 extern unsigned long e820_end_of_ram(void);
10360 extern void e820_reserve_resources(struct e820entry *e820, int nr_map);
10361 +extern void e820_mark_nosave_regions(void);
10362 extern void e820_print_map(char *who);
10363 extern int e820_any_mapped(unsigned long start, unsigned long end, unsigned type);
10364 extern int e820_all_mapped(unsigned long start, unsigned long end, unsigned type);
10365
10366 -extern void e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end);
10367 extern void e820_setup_gap(struct e820entry *e820, int nr_map);
10368 -extern unsigned long e820_hole_size(unsigned long start_pfn,
10369 - unsigned long end_pfn);
10370 +extern void e820_register_active_regions(int nid,
10371 + unsigned long start_pfn, unsigned long end_pfn);
10372
10373 -extern void __init parse_memopt(char *p, char **end);
10374 -extern void __init parse_memmapopt(char *p, char **end);
10375 +extern void finish_e820_parsing(void);
10376
10377 extern struct e820map e820;
10378
10379 --- a/include/asm-x86/mach-xen/asm/fixmap_32.h
10380 +++ b/include/asm-x86/mach-xen/asm/fixmap_32.h
10381 @@ -55,7 +55,7 @@
10382 #ifdef CONFIG_X86_LOCAL_APIC
10383 FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
10384 #endif
10385 -#ifdef CONFIG_X86_IO_APIC
10386 +#if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_XEN)
10387 FIX_IO_APIC_BASE_0,
10388 FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
10389 #endif
10390 @@ -95,10 +95,9 @@
10391 __end_of_fixed_addresses
10392 };
10393
10394 -extern void set_fixaddr_top(unsigned long top);
10395 -
10396 extern void __set_fixmap(enum fixed_addresses idx,
10397 maddr_t phys, pgprot_t flags);
10398 +extern void reserve_top_address(unsigned long reserve);
10399
10400 #define set_fixmap(idx, phys) \
10401 __set_fixmap(idx, phys, PAGE_KERNEL)
10402 --- a/include/asm-x86/mach-xen/asm/fixmap_64.h
10403 +++ b/include/asm-x86/mach-xen/asm/fixmap_64.h
10404 @@ -41,7 +41,7 @@
10405 #ifdef CONFIG_X86_LOCAL_APIC
10406 FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
10407 #endif
10408 -#ifdef CONFIG_X86_IO_APIC
10409 +#ifndef CONFIG_XEN
10410 FIX_IO_APIC_BASE_0,
10411 FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
10412 #endif
10413 --- a/include/asm-x86/mach-xen/asm/hw_irq_32.h
10414 +++ b/include/asm-x86/mach-xen/asm/hw_irq_32.h
10415 @@ -17,8 +17,6 @@
10416 #include <asm/irq.h>
10417 #include <asm/sections.h>
10418
10419 -struct hw_interrupt_type;
10420 -
10421 #define NMI_VECTOR 0x02
10422
10423 /*
10424 @@ -28,10 +26,6 @@
10425 * Interrupt entry/exit code at both C and assembly level
10426 */
10427
10428 -extern u8 irq_vector[NR_IRQ_VECTORS];
10429 -#define IO_APIC_VECTOR(irq) (irq_vector[irq])
10430 -#define AUTO_ASSIGN -1
10431 -
10432 extern void (*interrupt[NR_IRQS])(void);
10433
10434 #ifdef CONFIG_SMP
10435 @@ -44,7 +38,7 @@
10436 fastcall void apic_timer_interrupt(void);
10437 fastcall void error_interrupt(void);
10438 fastcall void spurious_interrupt(void);
10439 -fastcall void thermal_interrupt(struct pt_regs *);
10440 +fastcall void thermal_interrupt(void);
10441 #define platform_legacy_irq(irq) ((irq) < 16)
10442 #endif
10443
10444 --- a/include/asm-x86/mach-xen/asm/hw_irq_64.h
10445 +++ b/include/asm-x86/mach-xen/asm/hw_irq_64.h
10446 @@ -19,8 +19,7 @@
10447 #include <asm/irq.h>
10448 #include <linux/profile.h>
10449 #include <linux/smp.h>
10450 -
10451 -struct hw_interrupt_type;
10452 +#include <linux/percpu.h>
10453 #endif
10454
10455 #define NMI_VECTOR 0x02
10456 @@ -77,9 +76,10 @@
10457
10458
10459 #ifndef __ASSEMBLY__
10460 -extern u8 irq_vector[NR_IRQ_VECTORS];
10461 -#define IO_APIC_VECTOR(irq) (irq_vector[irq])
10462 -#define AUTO_ASSIGN -1
10463 +typedef int vector_irq_t[NR_VECTORS];
10464 +DECLARE_PER_CPU(vector_irq_t, vector_irq);
10465 +extern void __setup_vector_irq(int cpu);
10466 +extern spinlock_t vector_lock;
10467
10468 /*
10469 * Various low-level irq details needed by irq.c, process.c,
10470 --- a/include/asm-x86/mach-xen/asm/io_32.h
10471 +++ b/include/asm-x86/mach-xen/asm/io_32.h
10472 @@ -237,33 +237,6 @@
10473
10474 #define eth_io_copy_and_sum(a,b,c,d) eth_copy_and_sum((a),(void __force *)(b),(c),(d))
10475
10476 -/**
10477 - * check_signature - find BIOS signatures
10478 - * @io_addr: mmio address to check
10479 - * @signature: signature block
10480 - * @length: length of signature
10481 - *
10482 - * Perform a signature comparison with the mmio address io_addr. This
10483 - * address should have been obtained by ioremap.
10484 - * Returns 1 on a match.
10485 - */
10486 -
10487 -static inline int check_signature(volatile void __iomem * io_addr,
10488 - const unsigned char *signature, int length)
10489 -{
10490 - int retval = 0;
10491 - do {
10492 - if (readb(io_addr) != *signature)
10493 - goto out;
10494 - io_addr++;
10495 - signature++;
10496 - length--;
10497 - } while (length);
10498 - retval = 1;
10499 -out:
10500 - return retval;
10501 -}
10502 -
10503 /*
10504 * Cache management
10505 *
10506 --- a/include/asm-x86/mach-xen/asm/io_64.h
10507 +++ b/include/asm-x86/mach-xen/asm/io_64.h
10508 @@ -273,33 +273,6 @@
10509
10510 #define eth_io_copy_and_sum(a,b,c,d) eth_copy_and_sum((a),(void *)(b),(c),(d))
10511
10512 -/**
10513 - * check_signature - find BIOS signatures
10514 - * @io_addr: mmio address to check
10515 - * @signature: signature block
10516 - * @length: length of signature
10517 - *
10518 - * Perform a signature comparison with the mmio address io_addr. This
10519 - * address should have been obtained by ioremap.
10520 - * Returns 1 on a match.
10521 - */
10522 -
10523 -static inline int check_signature(void __iomem *io_addr,
10524 - const unsigned char *signature, int length)
10525 -{
10526 - int retval = 0;
10527 - do {
10528 - if (readb(io_addr) != *signature)
10529 - goto out;
10530 - io_addr++;
10531 - signature++;
10532 - length--;
10533 - } while (length);
10534 - retval = 1;
10535 -out:
10536 - return retval;
10537 -}
10538 -
10539 /* Nothing to do */
10540
10541 #define dma_cache_inv(_start,_size) do { } while (0)
10542 --- a/include/asm-x86/mach-xen/asm/pgtable-2level.h
10543 +++ b/include/asm-x86/mach-xen/asm/pgtable-2level.h
10544 @@ -23,14 +23,6 @@
10545 set_pte((ptep), (pteval)); \
10546 } while (0)
10547
10548 -#define set_pte_at_sync(_mm,addr,ptep,pteval) do { \
10549 - if (((_mm) != current->mm && (_mm) != &init_mm) || \
10550 - HYPERVISOR_update_va_mapping((addr), (pteval), UVMF_INVLPG)) { \
10551 - set_pte((ptep), (pteval)); \
10552 - xen_invlpg((addr)); \
10553 - } \
10554 -} while (0)
10555 -
10556 #define set_pte_atomic(pteptr, pteval) set_pte(pteptr,pteval)
10557
10558 #define set_pmd(pmdptr, pmdval) xen_l2_entry_update((pmdptr), (pmdval))
10559 @@ -40,6 +32,7 @@
10560
10561 #define pte_none(x) (!(x).pte_low)
10562
10563 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
10564 static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
10565 {
10566 pte_t pte = *ptep;
10567 @@ -51,6 +44,7 @@
10568 return pte;
10569 }
10570
10571 +#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
10572 #define ptep_clear_flush(vma, addr, ptep) \
10573 ({ \
10574 pte_t *__ptep = (ptep); \
10575 @@ -66,8 +60,6 @@
10576 __res; \
10577 })
10578
10579 -#define pte_same(a, b) ((a).pte_low == (b).pte_low)
10580 -
10581 #define __pte_mfn(_pte) ((_pte).pte_low >> PAGE_SHIFT)
10582 #define pte_mfn(_pte) ((_pte).pte_low & _PAGE_PRESENT ? \
10583 __pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte)))
10584 --- a/include/asm-x86/mach-xen/asm/pgtable-3level.h
10585 +++ b/include/asm-x86/mach-xen/asm/pgtable-3level.h
10586 @@ -53,7 +53,6 @@
10587 * not possible, use pte_get_and_clear to obtain the old pte
10588 * value and then use set_pte to update it. -ben
10589 */
10590 -#define __HAVE_ARCH_SET_PTE_ATOMIC
10591
10592 static inline void set_pte(pte_t *ptep, pte_t pte)
10593 {
10594 @@ -70,14 +69,6 @@
10595 set_pte((ptep), (pteval)); \
10596 } while (0)
10597
10598 -#define set_pte_at_sync(_mm,addr,ptep,pteval) do { \
10599 - if (((_mm) != current->mm && (_mm) != &init_mm) || \
10600 - HYPERVISOR_update_va_mapping((addr), (pteval), UVMF_INVLPG)) { \
10601 - set_pte((ptep), (pteval)); \
10602 - xen_invlpg((addr)); \
10603 - } \
10604 -} while (0)
10605 -
10606 #define set_pmd(pmdptr,pmdval) \
10607 xen_l2_entry_update((pmdptr), (pmdval))
10608 #define set_pud(pudptr,pudval) \
10609 @@ -94,7 +85,7 @@
10610 #define pud_page(pud) \
10611 ((struct page *) __va(pud_val(pud) & PAGE_MASK))
10612
10613 -#define pud_page_kernel(pud) \
10614 +#define pud_page_vaddr(pud) \
10615 ((unsigned long) __va(pud_val(pud) & PAGE_MASK))
10616
10617
10618 @@ -124,6 +115,7 @@
10619
10620 #define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
10621
10622 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
10623 static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
10624 {
10625 pte_t pte = *ptep;
10626 @@ -142,6 +134,7 @@
10627 return pte;
10628 }
10629
10630 +#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
10631 #define ptep_clear_flush(vma, addr, ptep) \
10632 ({ \
10633 pte_t *__ptep = (ptep); \
10634 @@ -159,6 +152,7 @@
10635 __res; \
10636 })
10637
10638 +#define __HAVE_ARCH_PTE_SAME
10639 static inline int pte_same(pte_t a, pte_t b)
10640 {
10641 return a.pte_low == b.pte_low && a.pte_high == b.pte_high;
10642 --- a/include/asm-x86/mach-xen/asm/pgtable_32.h
10643 +++ b/include/asm-x86/mach-xen/asm/pgtable_32.h
10644 @@ -260,31 +260,89 @@
10645 # include <asm/pgtable-2level.h>
10646 #endif
10647
10648 -#define ptep_test_and_clear_dirty(vma, addr, ptep) \
10649 +/*
10650 + * Rules for using pte_update - it must be called after any PTE update which
10651 + * has not been done using the set_pte / clear_pte interfaces. It is used by
10652 + * shadow mode hypervisors to resynchronize the shadow page tables. Kernel PTE
10653 + * updates should either be sets, clears, or set_pte_atomic for P->P
10654 + * transitions, which means this hook should only be called for user PTEs.
10655 + * This hook implies a P->P protection or access change has taken place, which
10656 + * requires a subsequent TLB flush. The notification can optionally be delayed
10657 + * until the TLB flush event by using the pte_update_defer form of the
10658 + * interface, but care must be taken to assure that the flush happens while
10659 + * still holding the same page table lock so that the shadow and primary pages
10660 + * do not become out of sync on SMP.
10661 + */
10662 +#define pte_update(mm, addr, ptep) do { } while (0)
10663 +#define pte_update_defer(mm, addr, ptep) do { } while (0)
10664 +
10665 +
10666 +/*
10667 + * We only update the dirty/accessed state if we set
10668 + * the dirty bit by hand in the kernel, since the hardware
10669 + * will do the accessed bit for us, and we don't want to
10670 + * race with other CPU's that might be updating the dirty
10671 + * bit at the same time.
10672 + */
10673 +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
10674 +#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
10675 +do { \
10676 + if (dirty) \
10677 + ptep_establish(vma, address, ptep, entry); \
10678 +} while (0)
10679 +
10680 +/*
10681 + * We don't actually have these, but we want to advertise them so that
10682 + * we can encompass the flush here.
10683 + */
10684 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
10685 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
10686 +
10687 +/*
10688 + * Rules for using ptep_establish: the pte MUST be a user pte, and
10689 + * must be a present->present transition.
10690 + */
10691 +#define __HAVE_ARCH_PTEP_ESTABLISH
10692 +#define ptep_establish(vma, address, ptep, pteval) \
10693 +do { \
10694 + if ( likely((vma)->vm_mm == current->mm) ) { \
10695 + BUG_ON(HYPERVISOR_update_va_mapping(address, \
10696 + pteval, \
10697 + (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
10698 + UVMF_INVLPG|UVMF_MULTI)); \
10699 + } else { \
10700 + xen_l1_entry_update(ptep, pteval); \
10701 + flush_tlb_page(vma, address); \
10702 + } \
10703 +} while (0)
10704 +
10705 +#define __HAVE_ARCH_PTEP_CLEAR_DIRTY_FLUSH
10706 +#define ptep_clear_flush_dirty(vma, address, ptep) \
10707 ({ \
10708 pte_t __pte = *(ptep); \
10709 - int __ret = pte_dirty(__pte); \
10710 - if (__ret) { \
10711 - __pte = pte_mkclean(__pte); \
10712 - if ((vma)->vm_mm != current->mm || \
10713 - HYPERVISOR_update_va_mapping(addr, __pte, 0)) \
10714 - (ptep)->pte_low = __pte.pte_low; \
10715 - } \
10716 - __ret; \
10717 + int __dirty = pte_dirty(__pte); \
10718 + __pte = pte_mkclean(__pte); \
10719 + if (test_bit(PG_pinned, &virt_to_page((vma)->vm_mm->pgd)->flags)) \
10720 + ptep_set_access_flags(vma, address, ptep, __pte, __dirty); \
10721 + else if (__dirty) \
10722 + (ptep)->pte_low = __pte.pte_low; \
10723 + __dirty; \
10724 })
10725
10726 -#define ptep_test_and_clear_young(vma, addr, ptep) \
10727 +#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
10728 +#define ptep_clear_flush_young(vma, address, ptep) \
10729 ({ \
10730 pte_t __pte = *(ptep); \
10731 - int __ret = pte_young(__pte); \
10732 - if (__ret) \
10733 - __pte = pte_mkold(__pte); \
10734 - if ((vma)->vm_mm != current->mm || \
10735 - HYPERVISOR_update_va_mapping(addr, __pte, 0)) \
10736 - (ptep)->pte_low = __pte.pte_low; \
10737 - __ret; \
10738 + int __young = pte_young(__pte); \
10739 + __pte = pte_mkold(__pte); \
10740 + if (test_bit(PG_pinned, &virt_to_page((vma)->vm_mm->pgd)->flags)) \
10741 + ptep_set_access_flags(vma, address, ptep, __pte, __young); \
10742 + else if (__young) \
10743 + (ptep)->pte_low = __pte.pte_low; \
10744 + __young; \
10745 })
10746
10747 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
10748 #define ptep_get_and_clear_full(mm, addr, ptep, full) \
10749 ((full) ? ({ \
10750 pte_t __res = *(ptep); \
10751 @@ -296,6 +354,7 @@
10752 }) : \
10753 ptep_get_and_clear(mm, addr, ptep))
10754
10755 +#define __HAVE_ARCH_PTEP_SET_WRPROTECT
10756 static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
10757 {
10758 pte_t pte = *ptep;
10759 @@ -391,11 +450,11 @@
10760 #define pte_index(address) \
10761 (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
10762 #define pte_offset_kernel(dir, address) \
10763 - ((pte_t *) pmd_page_kernel(*(dir)) + pte_index(address))
10764 + ((pte_t *) pmd_page_vaddr(*(dir)) + pte_index(address))
10765
10766 #define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
10767
10768 -#define pmd_page_kernel(pmd) \
10769 +#define pmd_page_vaddr(pmd) \
10770 ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
10771
10772 /*
10773 @@ -418,8 +477,6 @@
10774 static inline int set_kernel_exec(unsigned long vaddr, int enable) { return 0;}
10775 #endif
10776
10777 -extern void noexec_setup(const char *str);
10778 -
10779 #if defined(CONFIG_HIGHPTE)
10780 #define pte_offset_map(dir, address) \
10781 ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + \
10782 @@ -437,37 +494,17 @@
10783 #define pte_unmap_nested(pte) do { } while (0)
10784 #endif
10785
10786 -#define __HAVE_ARCH_PTEP_ESTABLISH
10787 -#define ptep_establish(vma, address, ptep, pteval) \
10788 - do { \
10789 - if ( likely((vma)->vm_mm == current->mm) ) { \
10790 - BUG_ON(HYPERVISOR_update_va_mapping(address, \
10791 - pteval, \
10792 - (unsigned long)(vma)->vm_mm->cpu_vm_mask.bits| \
10793 - UVMF_INVLPG|UVMF_MULTI)); \
10794 - } else { \
10795 - xen_l1_entry_update(ptep, pteval); \
10796 - flush_tlb_page(vma, address); \
10797 - } \
10798 - } while (0)
10799 +/* Clear a kernel PTE and flush it from the TLB */
10800 +#define kpte_clear_flush(ptep, vaddr) do { \
10801 + if (HYPERVISOR_update_va_mapping(vaddr, __pte(0), UVMF_INVLPG)) \
10802 + BUG(); \
10803 +} while (0)
10804
10805 /*
10806 * The i386 doesn't have any external MMU info: the kernel page
10807 * tables contain all the necessary information.
10808 - *
10809 - * Also, we only update the dirty/accessed state if we set
10810 - * the dirty bit by hand in the kernel, since the hardware
10811 - * will do the accessed bit for us, and we don't want to
10812 - * race with other CPU's that might be updating the dirty
10813 - * bit at the same time.
10814 */
10815 #define update_mmu_cache(vma,address,pte) do { } while (0)
10816 -#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
10817 -#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
10818 - do { \
10819 - if (dirty) \
10820 - ptep_establish(vma, address, ptep, entry); \
10821 - } while (0)
10822
10823 #include <xen/features.h>
10824 void make_lowmem_page_readonly(void *va, unsigned int feature);
10825 @@ -516,10 +553,11 @@
10826 unsigned long size);
10827
10828 int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
10829 - unsigned long addr, unsigned long end, pgprot_t newprot);
10830 + unsigned long addr, unsigned long end, pgprot_t newprot,
10831 + int dirty_accountable);
10832
10833 -#define arch_change_pte_range(mm, pmd, addr, end, newprot) \
10834 - xen_change_pte_range(mm, pmd, addr, end, newprot)
10835 +#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
10836 + xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
10837
10838 #define io_remap_pfn_range(vma,from,pfn,size,prot) \
10839 direct_remap_pfn_range(vma,from,pfn,size,prot,DOMID_IO)
10840 @@ -528,13 +566,6 @@
10841 #define GET_IOSPACE(pfn) 0
10842 #define GET_PFN(pfn) (pfn)
10843
10844 -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
10845 -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
10846 -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
10847 -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
10848 -#define __HAVE_ARCH_PTEP_CLEAR_FLUSH
10849 -#define __HAVE_ARCH_PTEP_SET_WRPROTECT
10850 -#define __HAVE_ARCH_PTE_SAME
10851 #include <asm-generic/pgtable.h>
10852
10853 #endif /* _I386_PGTABLE_H */
10854 --- a/include/asm-x86/mach-xen/asm/pgtable_64.h
10855 +++ b/include/asm-x86/mach-xen/asm/pgtable_64.h
10856 @@ -43,12 +43,9 @@
10857
10858 #define swapper_pg_dir init_level4_pgt
10859
10860 -extern int nonx_setup(char *str);
10861 extern void paging_init(void);
10862 extern void clear_kernel_mapping(unsigned long addr, unsigned long size);
10863
10864 -extern unsigned long pgkern_mask;
10865 -
10866 /*
10867 * ZERO_PAGE is a global shared page that is always zero: used
10868 * for zero-mapped memory areas etc..
10869 @@ -118,9 +115,6 @@
10870 set_pgd(__user_pgd(pgd), __pgd(0));
10871 }
10872
10873 -#define pud_page(pud) \
10874 - ((unsigned long) __va(pud_val(pud) & PHYSICAL_PAGE_MASK))
10875 -
10876 #define pte_same(a, b) ((a).pte == (b).pte)
10877
10878 #define pte_pgprot(a) (__pgprot((a).pte & ~PHYSICAL_PAGE_MASK))
10879 @@ -332,7 +326,7 @@
10880 #define __LARGE_PTE (_PAGE_PSE|_PAGE_PRESENT)
10881 static inline int pte_user(pte_t pte) { return __pte_val(pte) & _PAGE_USER; }
10882 static inline int pte_read(pte_t pte) { return __pte_val(pte) & _PAGE_USER; }
10883 -static inline int pte_exec(pte_t pte) { return __pte_val(pte) & _PAGE_USER; }
10884 +static inline int pte_exec(pte_t pte) { return !(__pte_val(pte) & _PAGE_NX); }
10885 static inline int pte_dirty(pte_t pte) { return __pte_val(pte) & _PAGE_DIRTY; }
10886 static inline int pte_young(pte_t pte) { return __pte_val(pte) & _PAGE_ACCESSED; }
10887 static inline int pte_write(pte_t pte) { return __pte_val(pte) & _PAGE_RW; }
10888 @@ -345,29 +339,12 @@
10889 static inline pte_t pte_mkold(pte_t pte) { __pte_val(pte) &= ~_PAGE_ACCESSED; return pte; }
10890 static inline pte_t pte_wrprotect(pte_t pte) { __pte_val(pte) &= ~_PAGE_RW; return pte; }
10891 static inline pte_t pte_mkread(pte_t pte) { __pte_val(pte) |= _PAGE_USER; return pte; }
10892 -static inline pte_t pte_mkexec(pte_t pte) { __pte_val(pte) |= _PAGE_USER; return pte; }
10893 +static inline pte_t pte_mkexec(pte_t pte) { __pte_val(pte) &= ~_PAGE_NX; return pte; }
10894 static inline pte_t pte_mkdirty(pte_t pte) { __pte_val(pte) |= _PAGE_DIRTY; return pte; }
10895 static inline pte_t pte_mkyoung(pte_t pte) { __pte_val(pte) |= _PAGE_ACCESSED; return pte; }
10896 static inline pte_t pte_mkwrite(pte_t pte) { __pte_val(pte) |= _PAGE_RW; return pte; }
10897 static inline pte_t pte_mkhuge(pte_t pte) { __pte_val(pte) |= _PAGE_PSE; return pte; }
10898 -
10899 -#define ptep_test_and_clear_dirty(vma, addr, ptep) \
10900 -({ \
10901 - pte_t __pte = *(ptep); \
10902 - int __ret = pte_dirty(__pte); \
10903 - if (__ret) \
10904 - set_pte_at((vma)->vm_mm, addr, ptep, pte_mkclean(__pte)); \
10905 - __ret; \
10906 -})
10907 -
10908 -#define ptep_test_and_clear_young(vma, addr, ptep) \
10909 -({ \
10910 - pte_t __pte = *(ptep); \
10911 - int __ret = pte_young(__pte); \
10912 - if (__ret) \
10913 - set_pte_at((vma)->vm_mm, addr, ptep, pte_mkold(__pte)); \
10914 - __ret; \
10915 -})
10916 +static inline pte_t pte_clrhuge(pte_t pte) { __pte_val(pte) &= ~_PAGE_PSE; return pte; }
10917
10918 static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
10919 {
10920 @@ -395,7 +372,8 @@
10921 * Level 4 access.
10922 * Never use these in the common code.
10923 */
10924 -#define pgd_page(pgd) ((unsigned long) __va(pgd_val(pgd) & PTE_MASK))
10925 +#define pgd_page_vaddr(pgd) ((unsigned long) __va(pgd_val(pgd) & PTE_MASK))
10926 +#define pgd_page(pgd) (pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT))
10927 #define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
10928 #define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr))
10929 #define pgd_offset_k(address) (init_level4_pgt + pgd_index(address))
10930 @@ -404,16 +382,18 @@
10931
10932 /* PUD - Level3 access */
10933 /* to find an entry in a page-table-directory. */
10934 +#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PHYSICAL_PAGE_MASK))
10935 +#define pud_page(pud) (pfn_to_page(pud_val(pud) >> PAGE_SHIFT))
10936 #define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
10937 -#define pud_offset(pgd, address) ((pud_t *) pgd_page(*(pgd)) + pud_index(address))
10938 +#define pud_offset(pgd, address) ((pud_t *) pgd_page_vaddr(*(pgd)) + pud_index(address))
10939 #define pud_present(pud) (__pud_val(pud) & _PAGE_PRESENT)
10940
10941 /* PMD - Level 2 access */
10942 -#define pmd_page_kernel(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK))
10943 +#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK))
10944 #define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
10945
10946 #define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1))
10947 -#define pmd_offset(dir, address) ((pmd_t *) pud_page(*(dir)) + \
10948 +#define pmd_offset(dir, address) ((pmd_t *) pud_page_vaddr(*(dir)) + \
10949 pmd_index(address))
10950 #define pmd_none(x) (!__pmd_val(x))
10951 #if CONFIG_XEN_COMPAT <= 0x030002
10952 @@ -444,6 +424,7 @@
10953 {
10954 unsigned long pteval;
10955 pteval = physpage | pgprot_val(pgprot);
10956 + pteval &= __supported_pte_mask;
10957 return __pte(pteval);
10958 }
10959
10960 @@ -465,7 +446,7 @@
10961
10962 #define pte_index(address) \
10963 (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
10964 -#define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_kernel(*(dir)) + \
10965 +#define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_vaddr(*(dir)) + \
10966 pte_index(address))
10967
10968 /* x86-64 always has all page tables mapped. */
10969 @@ -506,6 +487,40 @@
10970 ptep_establish(vma, address, ptep, entry); \
10971 } while (0)
10972
10973 +
10974 +/*
10975 + * i386 says: We don't actually have these, but we want to advertise
10976 + * them so that we can encompass the flush here.
10977 + */
10978 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
10979 +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
10980 +
10981 +#define __HAVE_ARCH_PTEP_CLEAR_DIRTY_FLUSH
10982 +#define ptep_clear_flush_dirty(vma, address, ptep) \
10983 +({ \
10984 + pte_t __pte = *(ptep); \
10985 + int __dirty = pte_dirty(__pte); \
10986 + __pte = pte_mkclean(__pte); \
10987 + if ((vma)->vm_mm->context.pinned) \
10988 + ptep_set_access_flags(vma, address, ptep, __pte, __dirty); \
10989 + else if (__dirty) \
10990 + set_pte(ptep, __pte); \
10991 + __dirty; \
10992 +})
10993 +
10994 +#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
10995 +#define ptep_clear_flush_young(vma, address, ptep) \
10996 +({ \
10997 + pte_t __pte = *(ptep); \
10998 + int __young = pte_young(__pte); \
10999 + __pte = pte_mkold(__pte); \
11000 + if ((vma)->vm_mm->context.pinned) \
11001 + ptep_set_access_flags(vma, address, ptep, __pte, __young); \
11002 + else if (__young) \
11003 + set_pte(ptep, __pte); \
11004 + __young; \
11005 +})
11006 +
11007 /* Encode and de-code a swap entry */
11008 #define __swp_type(x) (((x).val >> 1) & 0x3f)
11009 #define __swp_offset(x) ((x).val >> 8)
11010 @@ -547,10 +562,11 @@
11011 unsigned long size);
11012
11013 int xen_change_pte_range(struct mm_struct *mm, pmd_t *pmd,
11014 - unsigned long addr, unsigned long end, pgprot_t newprot);
11015 + unsigned long addr, unsigned long end, pgprot_t newprot,
11016 + int dirty_accountable);
11017
11018 -#define arch_change_pte_range(mm, pmd, addr, end, newprot) \
11019 - xen_change_pte_range(mm, pmd, addr, end, newprot)
11020 +#define arch_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable) \
11021 + xen_change_pte_range(mm, pmd, addr, end, newprot, dirty_accountable)
11022
11023 #define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
11024 direct_remap_pfn_range(vma,vaddr,pfn,size,prot,DOMID_IO)
11025 @@ -572,8 +588,6 @@
11026 #define kc_offset_to_vaddr(o) \
11027 (((o) & (1UL << (__VIRTUAL_MASK_SHIFT-1))) ? ((o) | (~__VIRTUAL_MASK)) : (o))
11028
11029 -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
11030 -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY
11031 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
11032 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
11033 #define __HAVE_ARCH_PTEP_CLEAR_FLUSH
11034 --- a/include/asm-x86/mach-xen/asm/processor_32.h
11035 +++ b/include/asm-x86/mach-xen/asm/processor_32.h
11036 @@ -146,6 +146,18 @@
11037 #define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */
11038 #define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */
11039
11040 +static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
11041 + unsigned int *ecx, unsigned int *edx)
11042 +{
11043 + /* ecx is often an input as well as an output. */
11044 + __asm__(XEN_CPUID
11045 + : "=a" (*eax),
11046 + "=b" (*ebx),
11047 + "=c" (*ecx),
11048 + "=d" (*edx)
11049 + : "0" (*eax), "2" (*ecx));
11050 +}
11051 +
11052 /*
11053 * Generic CPUID function
11054 * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
11055 @@ -153,24 +165,18 @@
11056 */
11057 static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx)
11058 {
11059 - __asm__(XEN_CPUID
11060 - : "=a" (*eax),
11061 - "=b" (*ebx),
11062 - "=c" (*ecx),
11063 - "=d" (*edx)
11064 - : "0" (op), "c"(0));
11065 + *eax = op;
11066 + *ecx = 0;
11067 + __cpuid(eax, ebx, ecx, edx);
11068 }
11069
11070 /* Some CPUID calls want 'count' to be placed in ecx */
11071 static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx,
11072 - int *edx)
11073 + int *edx)
11074 {
11075 - __asm__(XEN_CPUID
11076 - : "=a" (*eax),
11077 - "=b" (*ebx),
11078 - "=c" (*ecx),
11079 - "=d" (*edx)
11080 - : "0" (op), "c" (count));
11081 + *eax = op;
11082 + *ecx = count;
11083 + __cpuid(eax, ebx, ecx, edx);
11084 }
11085
11086 /*
11087 @@ -178,42 +184,30 @@
11088 */
11089 static inline unsigned int cpuid_eax(unsigned int op)
11090 {
11091 - unsigned int eax;
11092 + unsigned int eax, ebx, ecx, edx;
11093
11094 - __asm__(XEN_CPUID
11095 - : "=a" (eax)
11096 - : "0" (op)
11097 - : "bx", "cx", "dx");
11098 + cpuid(op, &eax, &ebx, &ecx, &edx);
11099 return eax;
11100 }
11101 static inline unsigned int cpuid_ebx(unsigned int op)
11102 {
11103 - unsigned int eax, ebx;
11104 + unsigned int eax, ebx, ecx, edx;
11105
11106 - __asm__(XEN_CPUID
11107 - : "=a" (eax), "=b" (ebx)
11108 - : "0" (op)
11109 - : "cx", "dx" );
11110 + cpuid(op, &eax, &ebx, &ecx, &edx);
11111 return ebx;
11112 }
11113 static inline unsigned int cpuid_ecx(unsigned int op)
11114 {
11115 - unsigned int eax, ecx;
11116 + unsigned int eax, ebx, ecx, edx;
11117
11118 - __asm__(XEN_CPUID
11119 - : "=a" (eax), "=c" (ecx)
11120 - : "0" (op)
11121 - : "bx", "dx" );
11122 + cpuid(op, &eax, &ebx, &ecx, &edx);
11123 return ecx;
11124 }
11125 static inline unsigned int cpuid_edx(unsigned int op)
11126 {
11127 - unsigned int eax, edx;
11128 + unsigned int eax, ebx, ecx, edx;
11129
11130 - __asm__(XEN_CPUID
11131 - : "=a" (eax), "=d" (edx)
11132 - : "0" (op)
11133 - : "bx", "cx");
11134 + cpuid(op, &eax, &ebx, &ecx, &edx);
11135 return edx;
11136 }
11137
11138 @@ -315,6 +309,8 @@
11139 : :"a" (eax), "c" (ecx));
11140 }
11141
11142 +extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
11143 +
11144 /* from system description table in BIOS. Mostly for MCA use, but
11145 others may find it useful. */
11146 extern unsigned int machine_id;
11147 --- a/include/asm-x86/mach-xen/asm/processor_64.h
11148 +++ b/include/asm-x86/mach-xen/asm/processor_64.h
11149 @@ -484,6 +484,8 @@
11150 : :"a" (eax), "c" (ecx));
11151 }
11152
11153 +extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
11154 +
11155 #define stack_current() \
11156 ({ \
11157 struct thread_info *ti; \
11158 --- a/include/asm-x86/mach-xen/asm/segment_32.h
11159 +++ b/include/asm-x86/mach-xen/asm/segment_32.h
11160 @@ -61,11 +61,9 @@
11161
11162 #define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0)
11163 #define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
11164 -#define GET_KERNEL_CS() (__KERNEL_CS | (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1) )
11165
11166 #define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1)
11167 #define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
11168 -#define GET_KERNEL_DS() (__KERNEL_DS | (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1) )
11169
11170 #define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4)
11171 #define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5)
11172 @@ -85,6 +83,11 @@
11173
11174 #define GDT_SIZE (GDT_ENTRIES * 8)
11175
11176 +/* Matches __KERNEL_CS and __USER_CS (they must be 2 entries apart) */
11177 +#define SEGMENT_IS_FLAT_CODE(x) (((x) & 0xec) == GDT_ENTRY_KERNEL_CS * 8)
11178 +/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */
11179 +#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8)
11180 +
11181 /* Simple and small GDT entries for booting only */
11182
11183 #define GDT_ENTRY_BOOT_CS 2
11184 @@ -114,4 +117,16 @@
11185 */
11186 #define IDT_ENTRIES 256
11187
11188 +/* Bottom two bits of selector give the ring privilege level */
11189 +#define SEGMENT_RPL_MASK 0x3
11190 +/* Bit 2 is table indicator (LDT/GDT) */
11191 +#define SEGMENT_TI_MASK 0x4
11192 +
11193 +/* User mode is privilege level 3 */
11194 +#define USER_RPL 0x3
11195 +/* LDT segment has TI set, GDT has it cleared */
11196 +#define SEGMENT_LDT 0x4
11197 +#define SEGMENT_GDT 0x0
11198 +
11199 +#define get_kernel_rpl() (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1)
11200 #endif
11201 --- a/include/asm-x86/mach-xen/asm/smp_32.h
11202 +++ b/include/asm-x86/mach-xen/asm/smp_32.h
11203 @@ -79,25 +79,36 @@
11204 return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID));
11205 }
11206 #endif
11207 -
11208 -static __inline int logical_smp_processor_id(void)
11209 -{
11210 - /* we don't want to mark this access volatile - bad code generation */
11211 - return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
11212 -}
11213 -
11214 #endif
11215
11216 +#define safe_smp_processor_id() smp_processor_id()
11217 extern int __cpu_disable(void);
11218 extern void __cpu_die(unsigned int cpu);
11219 extern void prefill_possible_map(void);
11220 +extern unsigned int num_processors;
11221 +
11222 #endif /* !__ASSEMBLY__ */
11223
11224 #else /* CONFIG_SMP */
11225
11226 +#define safe_smp_processor_id() 0
11227 #define cpu_physical_id(cpu) boot_cpu_physical_apicid
11228
11229 #define NO_PROC_ID 0xFF /* No processor magic marker */
11230
11231 #endif
11232 +
11233 +#ifndef __ASSEMBLY__
11234 +
11235 +extern u8 apicid_2_node[];
11236 +
11237 +#ifdef CONFIG_X86_LOCAL_APIC
11238 +static __inline int logical_smp_processor_id(void)
11239 +{
11240 + /* we don't want to mark this access volatile - bad code generation */
11241 + return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
11242 +}
11243 +#endif
11244 +#endif
11245 +
11246 #endif
11247 --- a/include/asm-x86/mach-xen/asm/smp_64.h
11248 +++ b/include/asm-x86/mach-xen/asm/smp_64.h
11249 @@ -4,15 +4,12 @@
11250 /*
11251 * We need the APIC definitions automatically as part of 'smp.h'
11252 */
11253 -#ifndef __ASSEMBLY__
11254 #include <linux/threads.h>
11255 #include <linux/cpumask.h>
11256 #include <linux/bitops.h>
11257 extern int disable_apic;
11258 -#endif
11259
11260 #ifdef CONFIG_X86_LOCAL_APIC
11261 -#ifndef __ASSEMBLY__
11262 #include <asm/fixmap.h>
11263 #include <asm/mpspec.h>
11264 #ifdef CONFIG_X86_IO_APIC
11265 @@ -21,10 +18,8 @@
11266 #include <asm/apic.h>
11267 #include <asm/thread_info.h>
11268 #endif
11269 -#endif
11270
11271 #ifdef CONFIG_SMP
11272 -#ifndef ASSEMBLY
11273
11274 #include <asm/pda.h>
11275
11276 @@ -41,14 +36,11 @@
11277
11278 extern void smp_alloc_memory(void);
11279 extern volatile unsigned long smp_invalidate_needed;
11280 -extern int pic_mode;
11281 extern void lock_ipi_call_lock(void);
11282 extern void unlock_ipi_call_lock(void);
11283 extern int smp_num_siblings;
11284 extern void smp_send_reschedule(int cpu);
11285 void smp_stop_cpu(void);
11286 -extern int smp_call_function_single(int cpuid, void (*func) (void *info),
11287 - void *info, int retry, int wait);
11288
11289 extern cpumask_t cpu_sibling_map[NR_CPUS];
11290 extern cpumask_t cpu_core_map[NR_CPUS];
11291 @@ -77,20 +69,16 @@
11292 }
11293 #endif
11294
11295 -extern int safe_smp_processor_id(void);
11296 extern int __cpu_disable(void);
11297 extern void __cpu_die(unsigned int cpu);
11298 extern void prefill_possible_map(void);
11299 extern unsigned num_processors;
11300 extern unsigned disabled_cpus;
11301
11302 -#endif /* !ASSEMBLY */
11303 -
11304 #define NO_PROC_ID 0xFF /* No processor magic marker */
11305
11306 #endif
11307
11308 -#ifndef ASSEMBLY
11309 /*
11310 * Some lowlevel functions might want to know about
11311 * the real APIC ID <-> CPU # mapping.
11312 @@ -114,11 +102,8 @@
11313 }
11314 #endif
11315
11316 -#endif /* !ASSEMBLY */
11317 -
11318 #ifndef CONFIG_SMP
11319 #define stack_smp_processor_id() 0
11320 -#define safe_smp_processor_id() 0
11321 #define cpu_logical_map(x) (x)
11322 #else
11323 #include <asm/thread_info.h>
11324 @@ -130,7 +115,6 @@
11325 })
11326 #endif
11327
11328 -#ifndef __ASSEMBLY__
11329 #ifdef CONFIG_X86_LOCAL_APIC
11330 static __inline int logical_smp_processor_id(void)
11331 {
11332 @@ -138,13 +122,18 @@
11333 return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR));
11334 }
11335 #endif
11336 -#endif
11337
11338 #ifdef CONFIG_SMP
11339 #define cpu_physical_id(cpu) x86_cpu_to_apicid[cpu]
11340 #else
11341 #define cpu_physical_id(cpu) boot_cpu_id
11342 -#endif
11343 -
11344 +static inline int smp_call_function_single(int cpuid, void (*func) (void *info),
11345 + void *info, int retry, int wait)
11346 +{
11347 + /* Disable interrupts here? */
11348 + func(info);
11349 + return 0;
11350 +}
11351 +#endif /* !CONFIG_SMP */
11352 #endif
11353
11354 --- a/include/asm-x86/mach-xen/asm/system_32.h
11355 +++ b/include/asm-x86/mach-xen/asm/system_32.h
11356 @@ -267,6 +267,9 @@
11357 #define cmpxchg(ptr,o,n)\
11358 ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
11359 (unsigned long)(n),sizeof(*(ptr))))
11360 +#define sync_cmpxchg(ptr,o,n)\
11361 + ((__typeof__(*(ptr)))__sync_cmpxchg((ptr),(unsigned long)(o),\
11362 + (unsigned long)(n),sizeof(*(ptr))))
11363 #endif
11364
11365 static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
11366 @@ -291,6 +294,39 @@
11367 : "=a"(prev)
11368 : "r"(new), "m"(*__xg(ptr)), "0"(old)
11369 : "memory");
11370 + return prev;
11371 + }
11372 + return old;
11373 +}
11374 +
11375 +/*
11376 + * Always use locked operations when touching memory shared with a
11377 + * hypervisor, since the system may be SMP even if the guest kernel
11378 + * isn't.
11379 + */
11380 +static inline unsigned long __sync_cmpxchg(volatile void *ptr,
11381 + unsigned long old,
11382 + unsigned long new, int size)
11383 +{
11384 + unsigned long prev;
11385 + switch (size) {
11386 + case 1:
11387 + __asm__ __volatile__("lock; cmpxchgb %b1,%2"
11388 + : "=a"(prev)
11389 + : "q"(new), "m"(*__xg(ptr)), "0"(old)
11390 + : "memory");
11391 + return prev;
11392 + case 2:
11393 + __asm__ __volatile__("lock; cmpxchgw %w1,%2"
11394 + : "=a"(prev)
11395 + : "r"(new), "m"(*__xg(ptr)), "0"(old)
11396 + : "memory");
11397 + return prev;
11398 + case 4:
11399 + __asm__ __volatile__("lock; cmpxchgl %1,%2"
11400 + : "=a"(prev)
11401 + : "r"(new), "m"(*__xg(ptr)), "0"(old)
11402 + : "memory");
11403 return prev;
11404 }
11405 return old;
11406 --- a/include/asm-x86/mach-xen/asm/system_64.h
11407 +++ b/include/asm-x86/mach-xen/asm/system_64.h
11408 @@ -24,6 +24,7 @@
11409 #define __EXTRA_CLOBBER \
11410 ,"rcx","rbx","rdx","r8","r9","r10","r11","r12","r13","r14","r15"
11411
11412 +/* Save restore flags to clear handle leaking NT */
11413 #define switch_to(prev,next,last) \
11414 asm volatile(SAVE_CONTEXT \
11415 "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
11416 --- a/include/asm-x86/mach-xen/asm/tlbflush_32.h
11417 +++ b/include/asm-x86/mach-xen/asm/tlbflush_32.h
11418 @@ -8,8 +8,6 @@
11419 #define __flush_tlb_global() xen_tlb_flush()
11420 #define __flush_tlb_all() xen_tlb_flush()
11421
11422 -extern unsigned long pgkern_mask;
11423 -
11424 #define cpu_has_invlpg (boot_cpu_data.x86 > 3)
11425
11426 #define __flush_tlb_single(addr) xen_invlpg(addr)
11427 --- a/include/asm-x86/mach-xen/asm/tlbflush_64.h
11428 +++ b/include/asm-x86/mach-xen/asm/tlbflush_64.h
11429 @@ -12,9 +12,6 @@
11430 */
11431 #define __flush_tlb_global() xen_tlb_flush()
11432
11433 -
11434 -extern unsigned long pgkern_mask;
11435 -
11436 #define __flush_tlb_all() __flush_tlb_global()
11437
11438 #define __flush_tlb_one(addr) xen_invlpg((unsigned long)addr)
11439 --- a/include/asm-x86/thread_info_64.h
11440 +++ b/include/asm-x86/thread_info_64.h
11441 @@ -157,10 +157,14 @@
11442 (_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY|_TIF_HRTICK_RESCHED)
11443
11444 /* flags to check in __switch_to() */
11445 +#ifndef CONFIG_XEN
11446 #define _TIF_WORK_CTXSW \
11447 (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_BTS_TRACE_TS)
11448 #define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
11449 #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
11450 +#else
11451 +#define _TIF_WORK_CTXSW _TIF_DEBUG
11452 +#endif
11453
11454 #define PREEMPT_ACTIVE 0x10000000
11455
11456 --- a/include/linux/skbuff.h
11457 +++ b/include/linux/skbuff.h
11458 @@ -1821,5 +1821,12 @@
11459 }
11460
11461 bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off);
11462 +
11463 +#ifdef CONFIG_XEN
11464 +int skb_checksum_setup(struct sk_buff *skb);
11465 +#else
11466 +static inline int skb_checksum_setup(struct sk_buff *skb) { return 0; }
11467 +#endif
11468 +
11469 #endif /* __KERNEL__ */
11470 #endif /* _LINUX_SKBUFF_H */
11471 --- a/include/xen/evtchn.h
11472 +++ b/include/xen/evtchn.h
11473 @@ -54,34 +54,34 @@
11474 */
11475 int bind_caller_port_to_irqhandler(
11476 unsigned int caller_port,
11477 - irqreturn_t (*handler)(int, void *, struct pt_regs *),
11478 + irq_handler_t handler,
11479 unsigned long irqflags,
11480 const char *devname,
11481 void *dev_id);
11482 int bind_listening_port_to_irqhandler(
11483 unsigned int remote_domain,
11484 - irqreturn_t (*handler)(int, void *, struct pt_regs *),
11485 + irq_handler_t handler,
11486 unsigned long irqflags,
11487 const char *devname,
11488 void *dev_id);
11489 int bind_interdomain_evtchn_to_irqhandler(
11490 unsigned int remote_domain,
11491 unsigned int remote_port,
11492 - irqreturn_t (*handler)(int, void *, struct pt_regs *),
11493 + irq_handler_t handler,
11494 unsigned long irqflags,
11495 const char *devname,
11496 void *dev_id);
11497 int bind_virq_to_irqhandler(
11498 unsigned int virq,
11499 unsigned int cpu,
11500 - irqreturn_t (*handler)(int, void *, struct pt_regs *),
11501 + irq_handler_t handler,
11502 unsigned long irqflags,
11503 const char *devname,
11504 void *dev_id);
11505 int bind_ipi_to_irqhandler(
11506 unsigned int ipi,
11507 unsigned int cpu,
11508 - irqreturn_t (*handler)(int, void *, struct pt_regs *),
11509 + irq_handler_t handler,
11510 unsigned long irqflags,
11511 const char *devname,
11512 void *dev_id);
11513 --- a/include/xen/xencons.h
11514 +++ b/include/xen/xencons.h
11515 @@ -8,7 +8,7 @@
11516 void xencons_resume(void);
11517
11518 /* Interrupt work hooks. Receive data, or kick data out. */
11519 -void xencons_rx(char *buf, unsigned len, struct pt_regs *regs);
11520 +void xencons_rx(char *buf, unsigned len);
11521 void xencons_tx(void);
11522
11523 int xencons_ring_init(void);
11524 --- a/mm/mprotect.c
11525 +++ b/mm/mprotect.c
11526 @@ -86,7 +86,7 @@
11527 next = pmd_addr_end(addr, end);
11528 if (pmd_none_or_clear_bad(pmd))
11529 continue;
11530 - if (arch_change_pte_range(mm, pmd, addr, next, newprot))
11531 + if (arch_change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable))
11532 continue;
11533 change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable);
11534 } while (pmd++, addr = next, addr != end);
11535 --- a/net/core/dev.c
11536 +++ b/net/core/dev.c
11537 @@ -1611,15 +1611,14 @@
11538 }
11539 if ((skb->h.raw + skb->csum + 2) > skb->tail)
11540 goto out;
11541 - skb->ip_summed = CHECKSUM_HW;
11542 + skb->ip_summed = CHECKSUM_PARTIAL;
11543 skb->proto_csum_blank = 0;
11544 }
11545 return 0;
11546 out:
11547 return -EPROTO;
11548 }
11549 -#else
11550 -inline int skb_checksum_setup(struct sk_buff *skb) { return 0; }
11551 +EXPORT_SYMBOL(skb_checksum_setup);
11552 #endif
11553
11554 /**
11555 @@ -2115,7 +2114,7 @@
11556 case CHECKSUM_UNNECESSARY:
11557 skb->proto_data_valid = 1;
11558 break;
11559 - case CHECKSUM_HW:
11560 + case CHECKSUM_PARTIAL:
11561 /* XXX Implement me. */
11562 default:
11563 skb->proto_data_valid = 0;
11564 @@ -4648,7 +4647,6 @@
11565 EXPORT_SYMBOL(net_enable_timestamp);
11566 EXPORT_SYMBOL(net_disable_timestamp);
11567 EXPORT_SYMBOL(dev_get_flags);
11568 -EXPORT_SYMBOL(skb_checksum_setup);
11569
11570 #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
11571 EXPORT_SYMBOL(br_handle_frame_hook);