Magellan Linux

Annotation of /trunk/kernel26-xen/patches-2.6.25-r1/1021-2.6.25-xen-patch-2.6.20.patch

Revision 609
Fri May 23 17:35:37 2008 UTC by niro
File size: 199019 bytes
-using opensuse xen patchset, updated kernel configs

1 niro 609 From: www.kernel.org
2     Subject: Linux 2.6.20
3     Patch-mainline: 2.6.20
4    
5     Automatically created from "patches.kernel.org/patch-2.6.20" by xen-port-patches.py
6    
7     Acked-by: jbeulich@novell.com
8    
9     ---
10     arch/x86/Kconfig | 2
11     arch/x86/kernel/asm-offsets_32.c | 6
12     arch/x86/kernel/cpu/common-xen.c | 286 ++++---
13     arch/x86/kernel/cpu/mtrr/main-xen.c | 5
14     arch/x86/kernel/e820_32-xen.c | 1000 ++++++++++++++++++++++++++
15     arch/x86/kernel/entry_32-xen.S | 387 ++++------
16     arch/x86/kernel/entry_64-xen.S | 69 -
17     arch/x86/kernel/genapic_64-xen.c | 8
18     arch/x86/kernel/head64-xen.c | 5
19     arch/x86/kernel/head_32-xen.S | 63 +
20     arch/x86/kernel/io_apic_32-xen.c | 68 -
21     arch/x86/kernel/io_apic_64-xen.c | 133 ++-
22     arch/x86/kernel/irq_64-xen.c | 2
23     arch/x86/kernel/ldt_32-xen.c | 4
24     arch/x86/kernel/microcode-xen.c | 6
25     arch/x86/kernel/mpparse_32-xen.c | 12
26     arch/x86/kernel/mpparse_64-xen.c | 2
27     arch/x86/kernel/pci-dma_32-xen.c | 10
28     arch/x86/kernel/process_32-xen.c | 56 -
29     arch/x86/kernel/process_64-xen.c | 34
30     arch/x86/kernel/quirks-xen.c | 61 +
31     arch/x86/kernel/setup_32-xen.c | 974 -------------------------
32     arch/x86/kernel/setup_64-xen.c | 24
33     arch/x86/kernel/smp_32-xen.c | 4
34     arch/x86/kernel/smp_64-xen.c | 5
35     arch/x86/kernel/time_32-xen.c | 17
36     arch/x86/kernel/traps_32-xen.c | 204 +----
37     arch/x86/kernel/traps_64-xen.c | 139 ---
38     arch/x86/kernel/vmlinux_32.lds.S | 6
39     arch/x86/kernel/vsyscall_64-xen.c | 7
40     arch/x86/kvm/Kconfig | 1
41     arch/x86/mm/fault_32-xen.c | 12
42     arch/x86/mm/fault_64-xen.c | 10
43     arch/x86/mm/highmem_32-xen.c | 26
44     arch/x86/mm/init_32-xen.c | 20
45     arch/x86/mm/init_64-xen.c | 7
46     arch/x86/mm/pageattr_64-xen.c | 58 -
47     arch/x86/mm/pgtable_32-xen.c | 6
48     arch/x86/pci/irq-xen.c | 4
49     drivers/xen/balloon/balloon.c | 6
50     drivers/xen/blkback/blkback.c | 1
51     drivers/xen/blkback/interface.c | 2
52     drivers/xen/blkfront/blkfront.c | 8
53     drivers/xen/blktap/blktap.c | 1
54     drivers/xen/blktap/interface.c | 2
55     drivers/xen/char/mem.c | 4
56     drivers/xen/console/console.c | 13
57     drivers/xen/core/reboot.c | 10
58     drivers/xen/core/smpboot.c | 21
59     drivers/xen/fbfront/xenfb.c | 1
60     drivers/xen/netback/loopback.c | 1
61     drivers/xen/pciback/conf_space_header.c | 4
62     drivers/xen/pciback/pciback.h | 2
63     drivers/xen/pciback/pciback_ops.c | 6
64     drivers/xen/pciback/xenbus.c | 3
65     drivers/xen/sfc_netfront/accel_vi.c | 4
66     drivers/xen/tpmback/interface.c | 2
67     drivers/xen/xenbus/xenbus_comms.c | 4
68     drivers/xen/xenbus/xenbus_probe.c | 2
69     include/asm-x86/mach-xen/asm/desc_32.h | 100 +-
70     include/asm-x86/mach-xen/asm/desc_64.h | 53 -
71     include/asm-x86/mach-xen/asm/dma-mapping_32.h | 4
72     include/asm-x86/mach-xen/asm/dma-mapping_64.h | 8
73     include/asm-x86/mach-xen/asm/fixmap_32.h | 5
74     include/asm-x86/mach-xen/asm/hypervisor.h | 9
75     include/asm-x86/mach-xen/asm/io_32.h | 4
76     include/asm-x86/mach-xen/asm/irqflags_32.h | 68 +
77     include/asm-x86/mach-xen/asm/mmu_context_32.h | 19
78     include/asm-x86/mach-xen/asm/pgtable-2level.h | 21
79     include/asm-x86/mach-xen/asm/pgtable-3level.h | 67 -
80     include/asm-x86/mach-xen/asm/pgtable_32.h | 24
81     include/asm-x86/mach-xen/asm/pgtable_64.h | 23
82     include/asm-x86/mach-xen/asm/processor_32.h | 207 ++---
83     include/asm-x86/mach-xen/asm/processor_64.h | 8
84     include/asm-x86/mach-xen/asm/segment_32.h | 5
85     include/asm-x86/mach-xen/asm/smp_32.h | 3
86     include/asm-x86/mach-xen/asm/smp_64.h | 12
87     include/asm-x86/mach-xen/asm/system_32.h | 12
88     kernel/kexec.c | 2
89     net/core/dev.c | 6
90     80 files changed, 2263 insertions(+), 2237 deletions(-)
91    
92     --- a/arch/x86/Kconfig
93     +++ b/arch/x86/Kconfig
94     @@ -1220,7 +1220,7 @@
95    
96     config RELOCATABLE
97     bool "Build a relocatable kernel (EXPERIMENTAL)"
98     - depends on EXPERIMENTAL
99     + depends on EXPERIMENTAL && !X86_XEN
100     help
101     This builds a kernel image that retains relocation information
102     so it can be loaded someplace besides the default 1MB.
103     --- a/arch/x86/kernel/asm-offsets_32.c
104     +++ b/arch/x86/kernel/asm-offsets_32.c
105     @@ -61,6 +61,7 @@
106     OFFSET(TI_exec_domain, thread_info, exec_domain);
107     OFFSET(TI_flags, thread_info, flags);
108     OFFSET(TI_status, thread_info, status);
109     + OFFSET(TI_cpu, thread_info, cpu);
110     OFFSET(TI_preempt_count, thread_info, preempt_count);
111     OFFSET(TI_addr_limit, thread_info, addr_limit);
112     OFFSET(TI_restart_block, thread_info, restart_block);
113     @@ -115,6 +116,11 @@
114    
115     OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
116    
117     +#ifdef CONFIG_XEN
118     + BLANK();
119     + OFFSET(XEN_START_mfn_list, start_info, mfn_list);
120     +#endif
121     +
122     #ifdef CONFIG_PARAVIRT
123     BLANK();
124     OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
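
For context on the hunk above: asm-offsets_32.c exists so that assembly sources such as entry_32-xen.S can learn C structure layouts. The file is compiled to assembly, and the markers emitted by the OFFSET()/DEFINE() macros are scraped by the build into a generated header of #define constants, so TI_cpu above becomes the byte offset of thread_info.cpu. A minimal sketch of the mechanism, using a simplified stand-in for thread_info (illustration only, not part of the patch):

/* sketch-asm-offsets.c: compile with "gcc -S sketch-asm-offsets.c" and the
 * assembly output contains a marker line like "->TI_cpu $4 ...", which the
 * kernel build rewrites into "#define TI_cpu 4" for use from .S files. */
#include <stddef.h>

struct thread_info {            /* simplified stand-in for the real struct */
        unsigned long flags;
        int cpu;                /* the field exported as TI_cpu above */
};

#define DEFINE(sym, val) \
        asm volatile("\n->" #sym " %0 " #val : : "i" (val))
#define OFFSET(sym, str, mem) DEFINE(sym, offsetof(struct str, mem))

void foo(void)
{
        OFFSET(TI_cpu, thread_info, cpu);
}

The generated constant is what makes TI_cpu(%ebp)-style addressing possible in the entry assembly.
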
125     --- a/arch/x86/kernel/cpu/common-xen.c
126     +++ b/arch/x86/kernel/cpu/common-xen.c
127     @@ -22,6 +22,7 @@
128     #define phys_pkg_id(a,b) a
129     #endif
130     #endif
131     +#include <asm/pda.h>
132     #include <asm/hypervisor.h>
133    
134     #include "cpu.h"
135     @@ -29,10 +30,8 @@
136     DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
137     EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr);
138    
139     -#ifndef CONFIG_XEN
140     -DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
141     -EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack);
142     -#endif
143     +struct i386_pda *_cpu_pda[NR_CPUS] __read_mostly;
144     +EXPORT_SYMBOL(_cpu_pda);
145    
146     static int cachesize_override __cpuinitdata = -1;
147     static int disable_x86_fxsr __cpuinitdata;
148     @@ -60,7 +59,7 @@
149     .c_init = default_init,
150     .c_vendor = "Unknown",
151     };
152     -static struct cpu_dev * this_cpu = &default_cpu;
153     +static struct cpu_dev * this_cpu __cpuinitdata = &default_cpu;
154    
155     static int __init cachesize_setup(char *str)
156     {
157     @@ -242,29 +241,14 @@
158     return flag_is_changeable_p(X86_EFLAGS_ID);
159     }
160    
161     -/* Do minimum CPU detection early.
162     - Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment.
163     - The others are not touched to avoid unwanted side effects.
164     -
165     - WARNING: this function is only called on the BP. Don't add code here
166     - that is supposed to run on all CPUs. */
167     -static void __init early_cpu_detect(void)
168     +void __init cpu_detect(struct cpuinfo_x86 *c)
169     {
170     - struct cpuinfo_x86 *c = &boot_cpu_data;
171     -
172     - c->x86_cache_alignment = 32;
173     -
174     - if (!have_cpuid_p())
175     - return;
176     -
177     /* Get vendor name */
178     cpuid(0x00000000, &c->cpuid_level,
179     (int *)&c->x86_vendor_id[0],
180     (int *)&c->x86_vendor_id[8],
181     (int *)&c->x86_vendor_id[4]);
182    
183     - get_cpu_vendor(c, 1);
184     -
185     c->x86 = 4;
186     if (c->cpuid_level >= 0x00000001) {
187     u32 junk, tfms, cap0, misc;
188     @@ -281,6 +265,26 @@
189     }
190     }
191    
192     +/* Do minimum CPU detection early.
193     + Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment.
194     + The others are not touched to avoid unwanted side effects.
195     +
196     + WARNING: this function is only called on the BP. Don't add code here
197     + that is supposed to run on all CPUs. */
198     +static void __init early_cpu_detect(void)
199     +{
200     + struct cpuinfo_x86 *c = &boot_cpu_data;
201     +
202     + c->x86_cache_alignment = 32;
203     +
204     + if (!have_cpuid_p())
205     + return;
206     +
207     + cpu_detect(c);
208     +
209     + get_cpu_vendor(c, 1);
210     +}
211     +
212     static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
213     {
214     u32 tfms, xlvl;
215     @@ -315,6 +319,8 @@
216     #else
217     c->apicid = (ebx >> 24) & 0xFF;
218     #endif
219     + if (c->x86_capability[0] & (1<<19))
220     + c->x86_clflush_size = ((ebx >> 8) & 0xff) * 8;
221     } else {
222     /* Have CPUID level 0 only - unheard of */
223     c->x86 = 4;
224     @@ -379,6 +385,7 @@
225     c->x86_vendor_id[0] = '\0'; /* Unset */
226     c->x86_model_id[0] = '\0'; /* Unset */
227     c->x86_max_cores = 1;
228     + c->x86_clflush_size = 32;
229     memset(&c->x86_capability, 0, sizeof c->x86_capability);
230    
231     if (!have_cpuid_p()) {
232     @@ -599,61 +606,23 @@
233     #endif
234     }
235    
236     -static void __cpuinit cpu_gdt_init(const struct Xgt_desc_struct *gdt_descr)
237     +/* Make sure %gs is initialized properly in idle threads */
238     +struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
239     {
240     - unsigned long frames[16];
241     - unsigned long va;
242     - int f;
243     -
244     - for (va = gdt_descr->address, f = 0;
245     - va < gdt_descr->address + gdt_descr->size;
246     - va += PAGE_SIZE, f++) {
247     - frames[f] = virt_to_mfn(va);
248     - make_lowmem_page_readonly(
249     - (void *)va, XENFEAT_writable_descriptor_tables);
250     - }
251     - if (HYPERVISOR_set_gdt(frames, (gdt_descr->size + 1) / 8))
252     - BUG();
253     + memset(regs, 0, sizeof(struct pt_regs));
254     + regs->xgs = __KERNEL_PDA;
255     + return regs;
256     }
257    
258     -/*
259     - * cpu_init() initializes state that is per-CPU. Some data is already
260     - * initialized (naturally) in the bootstrap process, such as the GDT
261     - * and IDT. We reload them nevertheless, this function acts as a
262     - * 'CPU state barrier', nothing should get across.
263     - */
264     -void __cpuinit cpu_init(void)
265     +static __cpuinit int alloc_gdt(int cpu)
266     {
267     - int cpu = smp_processor_id();
268     -#ifndef CONFIG_X86_NO_TSS
269     - struct tss_struct * t = &per_cpu(init_tss, cpu);
270     -#endif
271     - struct thread_struct *thread = &current->thread;
272     - struct desc_struct *gdt;
273     struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
274     + struct desc_struct *gdt;
275     + struct i386_pda *pda;
276    
277     - if (cpu_test_and_set(cpu, cpu_initialized)) {
278     - printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
279     - for (;;) local_irq_enable();
280     - }
281     - printk(KERN_INFO "Initializing CPU#%d\n", cpu);
282     -
283     - if (cpu_has_vme || cpu_has_de)
284     - clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
285     - if (tsc_disable && cpu_has_tsc) {
286     - printk(KERN_NOTICE "Disabling TSC...\n");
287     - /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
288     - clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
289     - set_in_cr4(X86_CR4_TSD);
290     - }
291     + gdt = (struct desc_struct *)cpu_gdt_descr->address;
292     + pda = cpu_pda(cpu);
293    
294     -#ifndef CONFIG_XEN
295     - /* The CPU hotplug case */
296     - if (cpu_gdt_descr->address) {
297     - gdt = (struct desc_struct *)cpu_gdt_descr->address;
298     - memset(gdt, 0, PAGE_SIZE);
299     - goto old_gdt;
300     - }
301     /*
302     * This is a horrible hack to allocate the GDT. The problem
303     * is that cpu_init() is called really early for the boot CPU
304     @@ -661,54 +630,141 @@
305     * CPUs, when bootmem will have gone away
306     */
307     if (NODE_DATA(0)->bdata->node_bootmem_map) {
308     - gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE);
309     - /* alloc_bootmem_pages panics on failure, so no check */
310     + BUG_ON(gdt != NULL || pda != NULL);
311     +
312     + gdt = alloc_bootmem_pages(PAGE_SIZE);
313     + pda = alloc_bootmem(sizeof(*pda));
314     + /* alloc_bootmem(_pages) panics on failure, so no check */
315     +
316     memset(gdt, 0, PAGE_SIZE);
317     + memset(pda, 0, sizeof(*pda));
318     } else {
319     - gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL);
320     - if (unlikely(!gdt)) {
321     - printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
322     - for (;;)
323     - local_irq_enable();
324     + /* GDT and PDA might already have been allocated if
325     + this is a CPU hotplug re-insertion. */
326     + if (gdt == NULL)
327     + gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL);
328     +
329     + if (pda == NULL)
330     + pda = kmalloc_node(sizeof(*pda), GFP_KERNEL, cpu_to_node(cpu));
331     +
332     + if (unlikely(!gdt || !pda)) {
333     + free_pages((unsigned long)gdt, 0);
334     + kfree(pda);
335     + return 0;
336     }
337     }
338     -old_gdt:
339     +
340     + cpu_gdt_descr->address = (unsigned long)gdt;
341     + cpu_pda(cpu) = pda;
342     +
343     + return 1;
344     +}
345     +
346     +/* Initial PDA used by boot CPU */
347     +struct i386_pda boot_pda = {
348     + ._pda = &boot_pda,
349     + .cpu_number = 0,
350     + .pcurrent = &init_task,
351     +};
352     +
353     +static inline void set_kernel_gs(void)
354     +{
355     + /* Set %gs for this CPU's PDA. Memory clobber is to create a
356     + barrier with respect to any PDA operations, so the compiler
357     + doesn't move any before here. */
358     + asm volatile ("mov %0, %%gs" : : "r" (__KERNEL_PDA) : "memory");
359     +}
360     +
361     +/* Initialize the CPU's GDT and PDA. The boot CPU does this for
362     + itself, but secondaries find this done for them. */
363     +__cpuinit int init_gdt(int cpu, struct task_struct *idle)
364     +{
365     + struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
366     + struct desc_struct *gdt;
367     + struct i386_pda *pda;
368     +
369     + /* For non-boot CPUs, the GDT and PDA should already have been
370     + allocated. */
371     + if (!alloc_gdt(cpu)) {
372     + printk(KERN_CRIT "CPU%d failed to allocate GDT or PDA\n", cpu);
373     + return 0;
374     + }
375     +
376     + gdt = (struct desc_struct *)cpu_gdt_descr->address;
377     + pda = cpu_pda(cpu);
378     +
379     + BUG_ON(gdt == NULL || pda == NULL);
380     +
381     /*
382     * Initialize the per-CPU GDT with the boot GDT,
383     * and set up the GDT descriptor:
384     */
385     memcpy(gdt, cpu_gdt_table, GDT_SIZE);
386     + cpu_gdt_descr->size = GDT_SIZE - 1;
387    
388     - /* Set up GDT entry for 16bit stack */
389     - *(__u64 *)(&gdt[GDT_ENTRY_ESPFIX_SS]) |=
390     - ((((__u64)stk16_off) << 16) & 0x000000ffffff0000ULL) |
391     - ((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) |
392     - (CPU_16BIT_STACK_SIZE - 1);
393     + pack_descriptor((u32 *)&gdt[GDT_ENTRY_PDA].a,
394     + (u32 *)&gdt[GDT_ENTRY_PDA].b,
395     + (unsigned long)pda, sizeof(*pda) - 1,
396     + 0x80 | DESCTYPE_S | 0x2, 0); /* present read-write data segment */
397     +
398     + memset(pda, 0, sizeof(*pda));
399     + pda->_pda = pda;
400     + pda->cpu_number = cpu;
401     + pda->pcurrent = idle;
402    
403     - cpu_gdt_descr->size = GDT_SIZE - 1;
404     - cpu_gdt_descr->address = (unsigned long)gdt;
405     -#else
406     - if (cpu == 0 && cpu_gdt_descr->address == 0) {
407     - gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE);
408     - /* alloc_bootmem_pages panics on failure, so no check */
409     - memset(gdt, 0, PAGE_SIZE);
410     + return 1;
411     +}
412    
413     - memcpy(gdt, cpu_gdt_table, GDT_SIZE);
414     -
415     - cpu_gdt_descr->size = GDT_SIZE;
416     - cpu_gdt_descr->address = (unsigned long)gdt;
417     +void __cpuinit cpu_set_gdt(int cpu)
418     +{
419     + struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
420     + unsigned long va, frames[16];
421     + int f;
422     +
423     + for (va = cpu_gdt_descr->address, f = 0;
424     + va < cpu_gdt_descr->address + cpu_gdt_descr->size;
425     + va += PAGE_SIZE, f++) {
426     + frames[f] = virt_to_mfn(va);
427     + make_lowmem_page_readonly(
428     + (void *)va, XENFEAT_writable_descriptor_tables);
429     }
430     + BUG_ON(HYPERVISOR_set_gdt(frames, (cpu_gdt_descr->size + 1) / 8));
431     +
432     + set_kernel_gs();
433     +}
434     +
435     +/* Common CPU init for both boot and secondary CPUs */
436     +static void __cpuinit _cpu_init(int cpu, struct task_struct *curr)
437     +{
438     +#ifndef CONFIG_X86_NO_TSS
439     + struct tss_struct * t = &per_cpu(init_tss, cpu);
440     #endif
441     + struct thread_struct *thread = &curr->thread;
442     +
443     + if (cpu_test_and_set(cpu, cpu_initialized)) {
444     + printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
445     + for (;;) local_irq_enable();
446     + }
447    
448     - cpu_gdt_init(cpu_gdt_descr);
449     + printk(KERN_INFO "Initializing CPU#%d\n", cpu);
450     +
451     + if (cpu_has_vme || cpu_has_de)
452     + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
453     + if (tsc_disable && cpu_has_tsc) {
454     + printk(KERN_NOTICE "Disabling TSC...\n");
455     + /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
456     + clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
457     + set_in_cr4(X86_CR4_TSD);
458     + }
459    
460     /*
461     * Set up and load the per-CPU TSS and LDT
462     */
463     atomic_inc(&init_mm.mm_count);
464     - current->active_mm = &init_mm;
465     - BUG_ON(current->mm);
466     - enter_lazy_tlb(&init_mm, current);
467     + curr->active_mm = &init_mm;
468     + if (curr->mm)
469     + BUG();
470     + enter_lazy_tlb(&init_mm, curr);
471    
472     load_esp0(t, thread);
473    
474     @@ -719,8 +775,8 @@
475     __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
476     #endif
477    
478     - /* Clear %fs and %gs. */
479     - asm volatile ("movl %0, %%fs; movl %0, %%gs" : : "r" (0));
480     + /* Clear %fs. */
481     + asm volatile ("mov %0, %%fs" : : "r" (0));
482    
483     /* Clear all 6 debug registers: */
484     set_debugreg(0, 0);
485     @@ -738,6 +794,38 @@
486     mxcsr_feature_mask_init();
487     }
488    
489     +/* Entrypoint to initialize secondary CPU */
490     +void __cpuinit secondary_cpu_init(void)
491     +{
492     + int cpu = smp_processor_id();
493     + struct task_struct *curr = current;
494     +
495     + _cpu_init(cpu, curr);
496     +}
497     +
498     +/*
499     + * cpu_init() initializes state that is per-CPU. Some data is already
500     + * initialized (naturally) in the bootstrap process, such as the GDT
501     + * and IDT. We reload them nevertheless, this function acts as a
502     + * 'CPU state barrier', nothing should get across.
503     + */
504     +void __cpuinit cpu_init(void)
505     +{
506     + int cpu = smp_processor_id();
507     + struct task_struct *curr = current;
508     +
509     + /* Set up the real GDT and PDA, so we can transition from the
510     + boot versions. */
511     + if (!init_gdt(cpu, curr)) {
512     + /* failed to allocate something; not much we can do... */
513     + for (;;)
514     + local_irq_enable();
515     + }
516     +
517     + cpu_set_gdt(cpu);
518     + _cpu_init(cpu, curr);
519     +}
520     +
521     #ifdef CONFIG_HOTPLUG_CPU
522     void __cpuinit cpu_uninit(void)
523     {
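
A note on the shape of the common-xen.c changes above: the patch replaces the per-CPU 16-bit stack with the 2.6.20-era i386 PDA (per-processor data area). init_gdt() points the GDT_ENTRY_PDA descriptor at the CPU's i386_pda, cpu_set_gdt()/set_kernel_gs() load %gs with __KERNEL_PDA, and from then on PDA fields are reached with %gs-relative loads. A rough sketch of the accessor side, modeled loosely on the 2.6.20 include/asm-i386/pda.h (illustration only, not part of the patch):

/* Illustration only: reading a PDA field through the %gs segment once
 * set_kernel_gs() has run.  "%c1" prints the constant offset without the
 * usual '$' prefix so it forms a %gs-relative memory operand. */
#include <stddef.h>

struct task_struct;                    /* opaque here */

struct i386_pda {
        struct i386_pda *_pda;         /* self pointer */
        int cpu_number;                /* logical CPU id */
        struct task_struct *pcurrent;  /* current task on this CPU */
};

#define read_pda_int(field)                                            \
({                                                                      \
        int ret__;                                                      \
        asm("movl %%gs:%c1, %0"                                         \
            : "=r" (ret__)                                              \
            : "i" (offsetof(struct i386_pda, field)));                  \
        ret__;                                                          \
})

static inline int sketch_smp_processor_id(void)
{
        return read_pda_int(cpu_number);
}

The _pda self-pointer set up by init_gdt() above lets code recover the PDA's own linear address through the same %gs-relative mechanism.
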
524     --- a/arch/x86/kernel/cpu/mtrr/main-xen.c
525     +++ b/arch/x86/kernel/cpu/mtrr/main-xen.c
526     @@ -12,7 +12,7 @@
527     static DEFINE_MUTEX(mtrr_mutex);
528    
529     void generic_get_mtrr(unsigned int reg, unsigned long *base,
530     - unsigned int *size, mtrr_type * type)
531     + unsigned long *size, mtrr_type * type)
532     {
533     struct xen_platform_op op;
534    
535     @@ -115,8 +115,7 @@
536     {
537     unsigned i;
538     mtrr_type ltype;
539     - unsigned long lbase;
540     - unsigned int lsize;
541     + unsigned long lbase, lsize;
542     int error = -EINVAL;
543     struct xen_platform_op op;
544    
545     --- /dev/null
546     +++ b/arch/x86/kernel/e820_32-xen.c
547     @@ -0,0 +1,1000 @@
548     +#include <linux/kernel.h>
549     +#include <linux/types.h>
550     +#include <linux/init.h>
551     +#include <linux/bootmem.h>
552     +#include <linux/ioport.h>
553     +#include <linux/string.h>
554     +#include <linux/kexec.h>
555     +#include <linux/module.h>
556     +#include <linux/mm.h>
557     +#include <linux/efi.h>
558     +#include <linux/pfn.h>
559     +#include <linux/uaccess.h>
560     +
561     +#include <asm/pgtable.h>
562     +#include <asm/page.h>
563     +#include <asm/e820.h>
564     +#include <xen/interface/memory.h>
565     +
566     +#ifdef CONFIG_EFI
567     +int efi_enabled = 0;
568     +EXPORT_SYMBOL(efi_enabled);
569     +#endif
570     +
571     +struct e820map e820;
572     +struct change_member {
573     + struct e820entry *pbios; /* pointer to original bios entry */
574     + unsigned long long addr; /* address for this change point */
575     +};
576     +static struct change_member change_point_list[2*E820MAX] __initdata;
577     +static struct change_member *change_point[2*E820MAX] __initdata;
578     +static struct e820entry *overlap_list[E820MAX] __initdata;
579     +static struct e820entry new_bios[E820MAX] __initdata;
580     +/* For PCI or other memory-mapped resources */
581     +unsigned long pci_mem_start = 0x10000000;
582     +#ifdef CONFIG_PCI
583     +EXPORT_SYMBOL(pci_mem_start);
584     +#endif
585     +extern int user_defined_memmap;
586     +struct resource data_resource = {
587     + .name = "Kernel data",
588     + .start = 0,
589     + .end = 0,
590     + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
591     +};
592     +
593     +struct resource code_resource = {
594     + .name = "Kernel code",
595     + .start = 0,
596     + .end = 0,
597     + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
598     +};
599     +
600     +static struct resource system_rom_resource = {
601     + .name = "System ROM",
602     + .start = 0xf0000,
603     + .end = 0xfffff,
604     + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
605     +};
606     +
607     +static struct resource extension_rom_resource = {
608     + .name = "Extension ROM",
609     + .start = 0xe0000,
610     + .end = 0xeffff,
611     + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
612     +};
613     +
614     +static struct resource adapter_rom_resources[] = { {
615     + .name = "Adapter ROM",
616     + .start = 0xc8000,
617     + .end = 0,
618     + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
619     +}, {
620     + .name = "Adapter ROM",
621     + .start = 0,
622     + .end = 0,
623     + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
624     +}, {
625     + .name = "Adapter ROM",
626     + .start = 0,
627     + .end = 0,
628     + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
629     +}, {
630     + .name = "Adapter ROM",
631     + .start = 0,
632     + .end = 0,
633     + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
634     +}, {
635     + .name = "Adapter ROM",
636     + .start = 0,
637     + .end = 0,
638     + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
639     +}, {
640     + .name = "Adapter ROM",
641     + .start = 0,
642     + .end = 0,
643     + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
644     +} };
645     +
646     +static struct resource video_rom_resource = {
647     + .name = "Video ROM",
648     + .start = 0xc0000,
649     + .end = 0xc7fff,
650     + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
651     +};
652     +
653     +static struct resource video_ram_resource = {
654     + .name = "Video RAM area",
655     + .start = 0xa0000,
656     + .end = 0xbffff,
657     + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
658     +};
659     +
660     +static struct resource standard_io_resources[] = { {
661     + .name = "dma1",
662     + .start = 0x0000,
663     + .end = 0x001f,
664     + .flags = IORESOURCE_BUSY | IORESOURCE_IO
665     +}, {
666     + .name = "pic1",
667     + .start = 0x0020,
668     + .end = 0x0021,
669     + .flags = IORESOURCE_BUSY | IORESOURCE_IO
670     +}, {
671     + .name = "timer0",
672     + .start = 0x0040,
673     + .end = 0x0043,
674     + .flags = IORESOURCE_BUSY | IORESOURCE_IO
675     +}, {
676     + .name = "timer1",
677     + .start = 0x0050,
678     + .end = 0x0053,
679     + .flags = IORESOURCE_BUSY | IORESOURCE_IO
680     +}, {
681     + .name = "keyboard",
682     + .start = 0x0060,
683     + .end = 0x006f,
684     + .flags = IORESOURCE_BUSY | IORESOURCE_IO
685     +}, {
686     + .name = "dma page reg",
687     + .start = 0x0080,
688     + .end = 0x008f,
689     + .flags = IORESOURCE_BUSY | IORESOURCE_IO
690     +}, {
691     + .name = "pic2",
692     + .start = 0x00a0,
693     + .end = 0x00a1,
694     + .flags = IORESOURCE_BUSY | IORESOURCE_IO
695     +}, {
696     + .name = "dma2",
697     + .start = 0x00c0,
698     + .end = 0x00df,
699     + .flags = IORESOURCE_BUSY | IORESOURCE_IO
700     +}, {
701     + .name = "fpu",
702     + .start = 0x00f0,
703     + .end = 0x00ff,
704     + .flags = IORESOURCE_BUSY | IORESOURCE_IO
705     +} };
706     +
707     +static int romsignature(const unsigned char *x)
708     +{
709     + unsigned short sig;
710     + int ret = 0;
711     + if (probe_kernel_address((const unsigned short *)x, sig) == 0)
712     + ret = (sig == 0xaa55);
713     + return ret;
714     +}
715     +
716     +static int __init romchecksum(unsigned char *rom, unsigned long length)
717     +{
718     + unsigned char *p, sum = 0;
719     +
720     + for (p = rom; p < rom + length; p++)
721     + sum += *p;
722     + return sum == 0;
723     +}
724     +
725     +static void __init probe_roms(void)
726     +{
727     + unsigned long start, length, upper;
728     + unsigned char *rom;
729     + int i;
730     +
731     +#ifdef CONFIG_XEN
732     + /* Nothing to do if not running in dom0. */
733     + if (!is_initial_xendomain())
734     + return;
735     +#endif
736     +
737     + /* video rom */
738     + upper = adapter_rom_resources[0].start;
739     + for (start = video_rom_resource.start; start < upper; start += 2048) {
740     + rom = isa_bus_to_virt(start);
741     + if (!romsignature(rom))
742     + continue;
743     +
744     + video_rom_resource.start = start;
745     +
746     + /* 0 < length <= 0x7f * 512, historically */
747     + length = rom[2] * 512;
748     +
749     + /* if checksum okay, trust length byte */
750     + if (length && romchecksum(rom, length))
751     + video_rom_resource.end = start + length - 1;
752     +
753     + request_resource(&iomem_resource, &video_rom_resource);
754     + break;
755     + }
756     +
757     + start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
758     + if (start < upper)
759     + start = upper;
760     +
761     + /* system rom */
762     + request_resource(&iomem_resource, &system_rom_resource);
763     + upper = system_rom_resource.start;
764     +
765     + /* check for extension rom (ignore length byte!) */
766     + rom = isa_bus_to_virt((unsigned long)extension_rom_resource.start);
767     + if (romsignature(rom)) {
768     + length = extension_rom_resource.end - extension_rom_resource.start + 1;
769     + if (romchecksum(rom, length)) {
770     + request_resource(&iomem_resource, &extension_rom_resource);
771     + upper = extension_rom_resource.start;
772     + }
773     + }
774     +
775     + /* check for adapter roms on 2k boundaries */
776     + for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
777     + rom = isa_bus_to_virt(start);
778     + if (!romsignature(rom))
779     + continue;
780     +
781     + /* 0 < length <= 0x7f * 512, historically */
782     + length = rom[2] * 512;
783     +
784     + /* but accept any length that fits if checksum okay */
785     + if (!length || start + length > upper || !romchecksum(rom, length))
786     + continue;
787     +
788     + adapter_rom_resources[i].start = start;
789     + adapter_rom_resources[i].end = start + length - 1;
790     + request_resource(&iomem_resource, &adapter_rom_resources[i]);
791     +
792     + start = adapter_rom_resources[i++].end & ~2047UL;
793     + }
794     +}
795     +
796     +#ifdef CONFIG_XEN
797     +static struct e820map machine_e820 __initdata;
798     +#define e820 machine_e820
799     +#endif
800     +
801     +/*
802     + * Request address space for all standard RAM and ROM resources
803     + * and also for regions reported as reserved by the e820.
804     + */
805     +static void __init
806     +legacy_init_iomem_resources(struct resource *code_resource, struct resource *data_resource)
807     +{
808     + int i;
809     +
810     + probe_roms();
811     + for (i = 0; i < e820.nr_map; i++) {
812     + struct resource *res;
813     +#ifndef CONFIG_RESOURCES_64BIT
814     + if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
815     + continue;
816     +#endif
817     + res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
818     + switch (e820.map[i].type) {
819     + case E820_RAM: res->name = "System RAM"; break;
820     + case E820_ACPI: res->name = "ACPI Tables"; break;
821     + case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
822     + default: res->name = "reserved";
823     + }
824     + res->start = e820.map[i].addr;
825     + res->end = res->start + e820.map[i].size - 1;
826     + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
827     + if (request_resource(&iomem_resource, res)) {
828     + kfree(res);
829     + continue;
830     + }
831     + if (e820.map[i].type == E820_RAM) {
832     + /*
833     + * We don't know which RAM region contains kernel data,
834     + * so we try it repeatedly and let the resource manager
835     + * test it.
836     + */
837     +#ifndef CONFIG_XEN
838     + request_resource(res, code_resource);
839     + request_resource(res, data_resource);
840     +#endif
841     +#ifdef CONFIG_KEXEC
842     + request_resource(res, &crashk_res);
843     +#ifdef CONFIG_XEN
844     + xen_machine_kexec_register_resources(res);
845     +#endif
846     +#endif
847     + }
848     + }
849     +}
850     +
851     +#undef e820
852     +
853     +/*
854     + * Request address space for all standard resources
855     + *
856     + * This is called just before pcibios_init(), which is also a
857     + * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
858     + */
859     +static int __init request_standard_resources(void)
860     +{
861     + int i;
862     +
863     + /* Nothing to do if not running in dom0. */
864     + if (!is_initial_xendomain())
865     + return 0;
866     +
867     + printk("Setting up standard PCI resources\n");
868     + if (efi_enabled)
869     + efi_initialize_iomem_resources(&code_resource, &data_resource);
870     + else
871     + legacy_init_iomem_resources(&code_resource, &data_resource);
872     +
873     + /* EFI systems may still have VGA */
874     + request_resource(&iomem_resource, &video_ram_resource);
875     +
876     + /* request I/O space for devices used on all i[345]86 PCs */
877     + for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
878     + request_resource(&ioport_resource, &standard_io_resources[i]);
879     + return 0;
880     +}
881     +
882     +subsys_initcall(request_standard_resources);
883     +
884     +void __init add_memory_region(unsigned long long start,
885     + unsigned long long size, int type)
886     +{
887     + int x;
888     +
889     + if (!efi_enabled) {
890     + x = e820.nr_map;
891     +
892     + if (x == E820MAX) {
893     + printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
894     + return;
895     + }
896     +
897     + e820.map[x].addr = start;
898     + e820.map[x].size = size;
899     + e820.map[x].type = type;
900     + e820.nr_map++;
901     + }
902     +} /* add_memory_region */
903     +
904     +/*
905     + * Sanitize the BIOS e820 map.
906     + *
907     + * Some e820 responses include overlapping entries. The following
908     + * replaces the original e820 map with a new one, removing overlaps.
909     + *
910     + */
911     +int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
912     +{
913     + struct change_member *change_tmp;
914     + unsigned long current_type, last_type;
915     + unsigned long long last_addr;
916     + int chgidx, still_changing;
917     + int overlap_entries;
918     + int new_bios_entry;
919     + int old_nr, new_nr, chg_nr;
920     + int i;
921     +
922     + /*
923     + Visually we're performing the following (1,2,3,4 = memory types)...
924     +
925     + Sample memory map (w/overlaps):
926     + ____22__________________
927     + ______________________4_
928     + ____1111________________
929     + _44_____________________
930     + 11111111________________
931     + ____________________33__
932     + ___________44___________
933     + __________33333_________
934     + ______________22________
935     + ___________________2222_
936     + _________111111111______
937     + _____________________11_
938     + _________________4______
939     +
940     + Sanitized equivalent (no overlap):
941     + 1_______________________
942     + _44_____________________
943     + ___1____________________
944     + ____22__________________
945     + ______11________________
946     + _________1______________
947     + __________3_____________
948     + ___________44___________
949     + _____________33_________
950     + _______________2________
951     + ________________1_______
952     + _________________4______
953     + ___________________2____
954     + ____________________33__
955     + ______________________4_
956     + */
957     + printk("sanitize start\n");
958     + /* if there's only one memory region, don't bother */
959     + if (*pnr_map < 2) {
960     + printk("sanitize bail 0\n");
961     + return -1;
962     + }
963     +
964     + old_nr = *pnr_map;
965     +
966     + /* bail out if we find any unreasonable addresses in bios map */
967     + for (i=0; i<old_nr; i++)
968     + if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) {
969     + printk("sanitize bail 1\n");
970     + return -1;
971     + }
972     +
973     + /* create pointers for initial change-point information (for sorting) */
974     + for (i=0; i < 2*old_nr; i++)
975     + change_point[i] = &change_point_list[i];
976     +
977     + /* record all known change-points (starting and ending addresses),
978     + omitting those that are for empty memory regions */
979     + chgidx = 0;
980     + for (i=0; i < old_nr; i++) {
981     + if (biosmap[i].size != 0) {
982     + change_point[chgidx]->addr = biosmap[i].addr;
983     + change_point[chgidx++]->pbios = &biosmap[i];
984     + change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
985     + change_point[chgidx++]->pbios = &biosmap[i];
986     + }
987     + }
988     + chg_nr = chgidx; /* true number of change-points */
989     +
990     + /* sort change-point list by memory addresses (low -> high) */
991     + still_changing = 1;
992     + while (still_changing) {
993     + still_changing = 0;
994     + for (i=1; i < chg_nr; i++) {
995     + /* if <current_addr> > <last_addr>, swap */
996     + /* or, if current=<start_addr> & last=<end_addr>, swap */
997     + if ((change_point[i]->addr < change_point[i-1]->addr) ||
998     + ((change_point[i]->addr == change_point[i-1]->addr) &&
999     + (change_point[i]->addr == change_point[i]->pbios->addr) &&
1000     + (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
1001     + )
1002     + {
1003     + change_tmp = change_point[i];
1004     + change_point[i] = change_point[i-1];
1005     + change_point[i-1] = change_tmp;
1006     + still_changing=1;
1007     + }
1008     + }
1009     + }
1010     +
1011     + /* create a new bios memory map, removing overlaps */
1012     + overlap_entries=0; /* number of entries in the overlap table */
1013     + new_bios_entry=0; /* index for creating new bios map entries */
1014     + last_type = 0; /* start with undefined memory type */
1015     + last_addr = 0; /* start with 0 as last starting address */
1016     + /* loop through change-points, determining effect on the new bios map */
1017     + for (chgidx=0; chgidx < chg_nr; chgidx++)
1018     + {
1019     + /* keep track of all overlapping bios entries */
1020     + if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
1021     + {
1022     + /* add map entry to overlap list (> 1 entry implies an overlap) */
1023     + overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
1024     + }
1025     + else
1026     + {
1027     + /* remove entry from list (order independent, so swap with last) */
1028     + for (i=0; i<overlap_entries; i++)
1029     + {
1030     + if (overlap_list[i] == change_point[chgidx]->pbios)
1031     + overlap_list[i] = overlap_list[overlap_entries-1];
1032     + }
1033     + overlap_entries--;
1034     + }
1035     + /* if there are overlapping entries, decide which "type" to use */
1036     + /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
1037     + current_type = 0;
1038     + for (i=0; i<overlap_entries; i++)
1039     + if (overlap_list[i]->type > current_type)
1040     + current_type = overlap_list[i]->type;
1041     + /* continue building up new bios map based on this information */
1042     + if (current_type != last_type) {
1043     + if (last_type != 0) {
1044     + new_bios[new_bios_entry].size =
1045     + change_point[chgidx]->addr - last_addr;
1046     + /* move forward only if the new size was non-zero */
1047     + if (new_bios[new_bios_entry].size != 0)
1048     + if (++new_bios_entry >= E820MAX)
1049     + break; /* no more space left for new bios entries */
1050     + }
1051     + if (current_type != 0) {
1052     + new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
1053     + new_bios[new_bios_entry].type = current_type;
1054     + last_addr=change_point[chgidx]->addr;
1055     + }
1056     + last_type = current_type;
1057     + }
1058     + }
1059     + new_nr = new_bios_entry; /* retain count for new bios entries */
1060     +
1061     + /* copy new bios mapping into original location */
1062     + memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
1063     + *pnr_map = new_nr;
1064     +
1065     + printk("sanitize end\n");
1066     + return 0;
1067     +}
1068     +
1069     +/*
1070     + * Copy the BIOS e820 map into a safe place.
1071     + *
1072     + * Sanity-check it while we're at it..
1073     + *
1074     + * If we're lucky and live on a modern system, the setup code
1075     + * will have given us a memory map that we can use to properly
1076     + * set up memory. If we aren't, we'll fake a memory map.
1077     + *
1078     + * We check to see that the memory map contains at least 2 elements
1079     + * before we'll use it, because the detection code in setup.S may
1080     + * not be perfect and most every PC known to man has two memory
1081     + * regions: one from 0 to 640k, and one from 1mb up. (The IBM
1082     + * thinkpad 560x, for example, does not cooperate with the memory
1083     + * detection code.)
1084     + */
1085     +int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
1086     +{
1087     +#ifndef CONFIG_XEN
1088     + /* Only one memory region (or negative)? Ignore it */
1089     + if (nr_map < 2)
1090     + return -1;
1091     +#else
1092     + BUG_ON(nr_map < 1);
1093     +#endif
1094     +
1095     + do {
1096     + unsigned long long start = biosmap->addr;
1097     + unsigned long long size = biosmap->size;
1098     + unsigned long long end = start + size;
1099     + unsigned long type = biosmap->type;
1100     + printk("copy_e820_map() start: %016Lx size: %016Lx end: %016Lx type: %ld\n", start, size, end, type);
1101     +
1102     + /* Overflow in 64 bits? Ignore the memory map. */
1103     + if (start > end)
1104     + return -1;
1105     +
1106     +#ifndef CONFIG_XEN
1107     + /*
1108     + * Some BIOSes claim RAM in the 640k - 1M region.
1109     + * Not right. Fix it up.
1110     + */
1111     + if (type == E820_RAM) {
1112     + printk("copy_e820_map() type is E820_RAM\n");
1113     + if (start < 0x100000ULL && end > 0xA0000ULL) {
1114     + printk("copy_e820_map() lies in range...\n");
1115     + if (start < 0xA0000ULL) {
1116     + printk("copy_e820_map() start < 0xA0000ULL\n");
1117     + add_memory_region(start, 0xA0000ULL-start, type);
1118     + }
1119     + if (end <= 0x100000ULL) {
1120     + printk("copy_e820_map() end <= 0x100000ULL\n");
1121     + continue;
1122     + }
1123     + start = 0x100000ULL;
1124     + size = end - start;
1125     + }
1126     + }
1127     +#endif
1128     + add_memory_region(start, size, type);
1129     + } while (biosmap++,--nr_map);
1130     + return 0;
1131     +}
1132     +
1133     +/*
1134     + * Callback for efi_memory_walk.
1135     + */
1136     +static int __init
1137     +efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
1138     +{
1139     + unsigned long *max_pfn = arg, pfn;
1140     +
1141     + if (start < end) {
1142     + pfn = PFN_UP(end -1);
1143     + if (pfn > *max_pfn)
1144     + *max_pfn = pfn;
1145     + }
1146     + return 0;
1147     +}
1148     +
1149     +static int __init
1150     +efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
1151     +{
1152     + memory_present(0, PFN_UP(start), PFN_DOWN(end));
1153     + return 0;
1154     +}
1155     +
1156     +/*
1157     + * Find the highest page frame number we have available
1158     + */
1159     +void __init find_max_pfn(void)
1160     +{
1161     + int i;
1162     +
1163     + max_pfn = 0;
1164     + if (efi_enabled) {
1165     + efi_memmap_walk(efi_find_max_pfn, &max_pfn);
1166     + efi_memmap_walk(efi_memory_present_wrapper, NULL);
1167     + return;
1168     + }
1169     +
1170     + for (i = 0; i < e820.nr_map; i++) {
1171     + unsigned long start, end;
1172     + /* RAM? */
1173     + if (e820.map[i].type != E820_RAM)
1174     + continue;
1175     + start = PFN_UP(e820.map[i].addr);
1176     + end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
1177     + if (start >= end)
1178     + continue;
1179     + if (end > max_pfn)
1180     + max_pfn = end;
1181     + memory_present(0, start, end);
1182     + }
1183     +}
1184     +
1185     +/*
1186     + * Free all available memory for boot time allocation. Used
1187     + * as a callback function by efi_memory_walk()
1188     + */
1189     +
1190     +static int __init
1191     +free_available_memory(unsigned long start, unsigned long end, void *arg)
1192     +{
1193     + /* check max_low_pfn */
1194     + if (start >= (max_low_pfn << PAGE_SHIFT))
1195     + return 0;
1196     + if (end >= (max_low_pfn << PAGE_SHIFT))
1197     + end = max_low_pfn << PAGE_SHIFT;
1198     + if (start < end)
1199     + free_bootmem(start, end - start);
1200     +
1201     + return 0;
1202     +}
1203     +/*
1204     + * Register fully available low RAM pages with the bootmem allocator.
1205     + */
1206     +void __init register_bootmem_low_pages(unsigned long max_low_pfn)
1207     +{
1208     + int i;
1209     +
1210     + if (efi_enabled) {
1211     + efi_memmap_walk(free_available_memory, NULL);
1212     + return;
1213     + }
1214     + for (i = 0; i < e820.nr_map; i++) {
1215     + unsigned long curr_pfn, last_pfn, size;
1216     + /*
1217     + * Reserve usable low memory
1218     + */
1219     + if (e820.map[i].type != E820_RAM)
1220     + continue;
1221     + /*
1222     + * We are rounding up the start address of usable memory:
1223     + */
1224     + curr_pfn = PFN_UP(e820.map[i].addr);
1225     + if (curr_pfn >= max_low_pfn)
1226     + continue;
1227     + /*
1228     + * ... and at the end of the usable range downwards:
1229     + */
1230     + last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
1231     +
1232     +#ifdef CONFIG_XEN
1233     + /*
1234     + * Truncate to the number of actual pages currently
1235     + * present.
1236     + */
1237     + if (last_pfn > xen_start_info->nr_pages)
1238     + last_pfn = xen_start_info->nr_pages;
1239     +#endif
1240     +
1241     + if (last_pfn > max_low_pfn)
1242     + last_pfn = max_low_pfn;
1243     +
1244     + /*
1245     + * .. finally, did all the rounding and playing
1246     + * around just make the area go away?
1247     + */
1248     + if (last_pfn <= curr_pfn)
1249     + continue;
1250     +
1251     + size = last_pfn - curr_pfn;
1252     + free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
1253     + }
1254     +}
1255     +
1256     +void __init e820_register_memory(void)
1257     +{
1258     + unsigned long gapstart, gapsize, round;
1259     + unsigned long long last;
1260     + int i;
1261     +
1262     +#ifdef CONFIG_XEN
1263     + if (is_initial_xendomain()) {
1264     + struct xen_memory_map memmap;
1265     +
1266     + memmap.nr_entries = E820MAX;
1267     + set_xen_guest_handle(memmap.buffer, machine_e820.map);
1268     +
1269     + if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
1270     + BUG();
1271     + machine_e820.nr_map = memmap.nr_entries;
1272     + }
1273     + else
1274     + machine_e820 = e820;
1275     +#define e820 machine_e820
1276     +#endif
1277     +
1278     + /*
1279     + * Search for the biggest gap in the low 32 bits of the e820
1280     + * memory space.
1281     + */
1282     + last = 0x100000000ull;
1283     + gapstart = 0x10000000;
1284     + gapsize = 0x400000;
1285     + i = e820.nr_map;
1286     + while (--i >= 0) {
1287     + unsigned long long start = e820.map[i].addr;
1288     + unsigned long long end = start + e820.map[i].size;
1289     +
1290     + /*
1291     + * Since "last" is at most 4GB, we know we'll
1292     + * fit in 32 bits if this condition is true
1293     + */
1294     + if (last > end) {
1295     + unsigned long gap = last - end;
1296     +
1297     + if (gap > gapsize) {
1298     + gapsize = gap;
1299     + gapstart = end;
1300     + }
1301     + }
1302     + if (start < last)
1303     + last = start;
1304     + }
1305     +#undef e820
1306     +
1307     + /*
1308     + * See how much we want to round up: start off with
1309     + * rounding to the next 1MB area.
1310     + */
1311     + round = 0x100000;
1312     + while ((gapsize >> 4) > round)
1313     + round += round;
1314     + /* Fun with two's complement */
1315     + pci_mem_start = (gapstart + round) & -round;
1316     +
1317     + printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
1318     + pci_mem_start, gapstart, gapsize);
1319     +}
1320     +
1321     +void __init print_memory_map(char *who)
1322     +{
1323     + int i;
1324     +
1325     + for (i = 0; i < e820.nr_map; i++) {
1326     + printk(" %s: %016Lx - %016Lx ", who,
1327     + e820.map[i].addr,
1328     + e820.map[i].addr + e820.map[i].size);
1329     + switch (e820.map[i].type) {
1330     + case E820_RAM: printk("(usable)\n");
1331     + break;
1332     + case E820_RESERVED:
1333     + printk("(reserved)\n");
1334     + break;
1335     + case E820_ACPI:
1336     + printk("(ACPI data)\n");
1337     + break;
1338     + case E820_NVS:
1339     + printk("(ACPI NVS)\n");
1340     + break;
1341     + default: printk("type %lu\n", e820.map[i].type);
1342     + break;
1343     + }
1344     + }
1345     +}
1346     +
1347     +static __init __always_inline void efi_limit_regions(unsigned long long size)
1348     +{
1349     + unsigned long long current_addr = 0;
1350     + efi_memory_desc_t *md, *next_md;
1351     + void *p, *p1;
1352     + int i, j;
1353     +
1354     + j = 0;
1355     + p1 = memmap.map;
1356     + for (p = p1, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) {
1357     + md = p;
1358     + next_md = p1;
1359     + current_addr = md->phys_addr +
1360     + PFN_PHYS(md->num_pages);
1361     + if (is_available_memory(md)) {
1362     + if (md->phys_addr >= size) continue;
1363     + memcpy(next_md, md, memmap.desc_size);
1364     + if (current_addr >= size) {
1365     + next_md->num_pages -=
1366     + PFN_UP(current_addr-size);
1367     + }
1368     + p1 += memmap.desc_size;
1369     + next_md = p1;
1370     + j++;
1371     + } else if ((md->attribute & EFI_MEMORY_RUNTIME) ==
1372     + EFI_MEMORY_RUNTIME) {
1373     + /* In order to make runtime services
1374     + * available we have to include runtime
1375     + * memory regions in memory map */
1376     + memcpy(next_md, md, memmap.desc_size);
1377     + p1 += memmap.desc_size;
1378     + next_md = p1;
1379     + j++;
1380     + }
1381     + }
1382     + memmap.nr_map = j;
1383     + memmap.map_end = memmap.map +
1384     + (memmap.nr_map * memmap.desc_size);
1385     +}
1386     +
1387     +void __init limit_regions(unsigned long long size)
1388     +{
1389     + unsigned long long current_addr = 0;
1390     + int i;
1391     +
1392     + print_memory_map("limit_regions start");
1393     + if (efi_enabled) {
1394     + efi_limit_regions(size);
1395     + return;
1396     + }
1397     + for (i = 0; i < e820.nr_map; i++) {
1398     + current_addr = e820.map[i].addr + e820.map[i].size;
1399     + if (current_addr < size)
1400     + continue;
1401     +
1402     + if (e820.map[i].type != E820_RAM)
1403     + continue;
1404     +
1405     + if (e820.map[i].addr >= size) {
1406     + /*
1407     + * This region starts past the end of the
1408     + * requested size, skip it completely.
1409     + */
1410     + e820.nr_map = i;
1411     + } else {
1412     + e820.nr_map = i + 1;
1413     + e820.map[i].size -= current_addr - size;
1414     + }
1415     + print_memory_map("limit_regions endfor");
1416     + return;
1417     + }
1418     +#ifdef CONFIG_XEN
1419     + if (current_addr < size) {
1420     + /*
1421     + * The e820 map finished before our requested size so
1422     + * extend the final entry to the requested address.
1423     + */
1424     + --i;
1425     + if (e820.map[i].type == E820_RAM)
1426     + e820.map[i].size -= current_addr - size;
1427     + else
1428     + add_memory_region(current_addr, size - current_addr, E820_RAM);
1429     + }
1430     +#endif
1431     + print_memory_map("limit_regions endfunc");
1432     +}
1433     +
1434     +/*
1435     + * This function checks if any part of the range <start,end> is mapped
1436     + * with type.
1437     + */
1438     +int
1439     +e820_any_mapped(u64 start, u64 end, unsigned type)
1440     +{
1441     + int i;
1442     +
1443     +#ifndef CONFIG_XEN
1444     + for (i = 0; i < e820.nr_map; i++) {
1445     + const struct e820entry *ei = &e820.map[i];
1446     +#else
1447     + if (!is_initial_xendomain())
1448     + return 0;
1449     + for (i = 0; i < machine_e820.nr_map; ++i) {
1450     + const struct e820entry *ei = &machine_e820.map[i];
1451     +#endif
1452     +
1453     + if (type && ei->type != type)
1454     + continue;
1455     + if (ei->addr >= end || ei->addr + ei->size <= start)
1456     + continue;
1457     + return 1;
1458     + }
1459     + return 0;
1460     +}
1461     +EXPORT_SYMBOL_GPL(e820_any_mapped);
1462     +
1463     + /*
1464     + * This function checks if the entire range <start,end> is mapped with type.
1465     + *
1466     + * Note: this function only works correctly if the e820 table is sorted and
1467     + * non-overlapping, which is the case
1468     + */
1469     +int __init
1470     +e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
1471     +{
1472     + u64 start = s;
1473     + u64 end = e;
1474     + int i;
1475     +
1476     +#ifndef CONFIG_XEN
1477     + for (i = 0; i < e820.nr_map; i++) {
1478     + struct e820entry *ei = &e820.map[i];
1479     +#else
1480     + if (!is_initial_xendomain())
1481     + return 0;
1482     + for (i = 0; i < machine_e820.nr_map; ++i) {
1483     + const struct e820entry *ei = &machine_e820.map[i];
1484     +#endif
1485     +
1486     + if (type && ei->type != type)
1487     + continue;
1488     + /* is the region (part) in overlap with the current region ?*/
1489     + if (ei->addr >= end || ei->addr + ei->size <= start)
1490     + continue;
1491     + /* if the region is at the beginning of <start,end> we move
1492     + * start to the end of the region since it's ok until there
1493     + */
1494     + if (ei->addr <= start)
1495     + start = ei->addr + ei->size;
1496     + /* if start is now at or beyond end, we're done, full
1497     + * coverage */
1498     + if (start >= end)
1499     + return 1; /* we're done */
1500     + }
1501     + return 0;
1502     +}
1503     +
1504     +static int __init parse_memmap(char *arg)
1505     +{
1506     + if (!arg)
1507     + return -EINVAL;
1508     +
1509     + if (strcmp(arg, "exactmap") == 0) {
1510     +#ifdef CONFIG_CRASH_DUMP
1511     + /* If we are doing a crash dump, we
1512     + * still need to know the real mem
1513     + * size before original memory map is
1514     + * reset.
1515     + */
1516     + find_max_pfn();
1517     + saved_max_pfn = max_pfn;
1518     +#endif
1519     + e820.nr_map = 0;
1520     + user_defined_memmap = 1;
1521     + } else {
1522     + /* If the user specifies memory size, we
1523     + * limit the BIOS-provided memory map to
1524     + * that size. exactmap can be used to specify
1525     + * the exact map. mem=number can be used to
1526     + * trim the existing memory map.
1527     + */
1528     + unsigned long long start_at, mem_size;
1529     +
1530     + mem_size = memparse(arg, &arg);
1531     + if (*arg == '@') {
1532     + start_at = memparse(arg+1, &arg);
1533     + add_memory_region(start_at, mem_size, E820_RAM);
1534     + } else if (*arg == '#') {
1535     + start_at = memparse(arg+1, &arg);
1536     + add_memory_region(start_at, mem_size, E820_ACPI);
1537     + } else if (*arg == '$') {
1538     + start_at = memparse(arg+1, &arg);
1539     + add_memory_region(start_at, mem_size, E820_RESERVED);
1540     + } else {
1541     + limit_regions(mem_size);
1542     + user_defined_memmap = 1;
1543     + }
1544     + }
1545     + return 0;
1546     +}
1547     +early_param("memmap", parse_memmap);
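
The sanitize_e820_map() routine in the new file above is a change-point sweep: each BIOS entry contributes a start point and an end point, the points are sorted by address, and a single pass over them tracks which entry types are currently active, with the largest type value winning where entries overlap. A toy, self-contained version of the same idea (illustration only, not kernel code; simplified sorting and fixed type precedence assumed):

/* sketch-e820-sweep.c: toy demonstration of the change-point algorithm
 * behind sanitize_e820_map() above. */
#include <stdio.h>
#include <stdlib.h>

struct region { unsigned long long addr, size; int type; };
struct event  { unsigned long long addr; int type, delta; };

static int cmp(const void *a, const void *b)
{
        const struct event *x = a, *y = b;
        return x->addr < y->addr ? -1 : x->addr > y->addr;
}

int main(void)
{
        /* Overlapping input: RAM 0-1M with a reserved hole at 640K-768K. */
        struct region in[] = {
                { 0x00000, 0x100000, 1 },       /* E820_RAM */
                { 0xa0000, 0x20000,  2 },       /* E820_RESERVED, wins */
        };
        enum { N = sizeof(in) / sizeof(in[0]) };
        struct event ev[2 * N];
        int active[8] = { 0 };
        unsigned long long last_addr = 0;
        int i, t, cur, last = 0;

        for (i = 0; i < N; i++) {       /* two change points per region */
                ev[2*i]   = (struct event){ in[i].addr,              in[i].type, +1 };
                ev[2*i+1] = (struct event){ in[i].addr + in[i].size, in[i].type, -1 };
        }
        qsort(ev, 2 * N, sizeof(ev[0]), cmp);

        for (i = 0; i < 2 * N; i++) {
                active[ev[i].type] += ev[i].delta;
                for (cur = 0, t = 7; t >= 1; t--)       /* highest active type */
                        if (active[t]) { cur = t; break; }
                if (cur != last) {                      /* type change: emit */
                        if (last && ev[i].addr > last_addr)
                                printf("%#09llx - %#09llx type %d\n",
                                       last_addr, ev[i].addr, last);
                        last_addr = ev[i].addr;
                        last = cur;
                }
        }
        return 0;
}

The same file also wires up the memmap= boot parameter shown just above: memmap=exactmap resets the map, while memmap=SIZE@START, memmap=SIZE#START, and memmap=SIZE$START add RAM, ACPI, and reserved regions respectively; a bare memmap=SIZE trims the BIOS-provided map via limit_regions().
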
1548     --- a/arch/x86/kernel/entry_32-xen.S
1549     +++ b/arch/x86/kernel/entry_32-xen.S
1550     @@ -30,12 +30,13 @@
1551     * 18(%esp) - %eax
1552     * 1C(%esp) - %ds
1553     * 20(%esp) - %es
1554     - * 24(%esp) - orig_eax
1555     - * 28(%esp) - %eip
1556     - * 2C(%esp) - %cs
1557     - * 30(%esp) - %eflags
1558     - * 34(%esp) - %oldesp
1559     - * 38(%esp) - %oldss
1560     + * 24(%esp) - %gs
1561     + * 28(%esp) - orig_eax
1562     + * 2C(%esp) - %eip
1563     + * 30(%esp) - %cs
1564     + * 34(%esp) - %eflags
1565     + * 38(%esp) - %oldesp
1566     + * 3C(%esp) - %oldss
1567     *
1568     * "current" is in register %ebx during any slow entries.
1569     */
1570     @@ -48,27 +49,25 @@
1571     #include <asm/smp.h>
1572     #include <asm/page.h>
1573     #include <asm/desc.h>
1574     +#include <asm/percpu.h>
1575     #include <asm/dwarf2.h>
1576     #include "irq_vectors.h"
1577     #include <xen/interface/xen.h>
1578    
1579     -#define nr_syscalls ((syscall_table_size)/4)
1580     +/*
1581     + * We use macros for low-level operations which need to be overridden
1582     + * for paravirtualization. The following will never clobber any registers:
1583     + * INTERRUPT_RETURN (aka. "iret")
1584     + * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
1585     + * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
1586     + *
1587     + * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
1588     + * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
1589     + * Allowing a register to be clobbered can shrink the paravirt replacement
1590     + * enough to patch inline, increasing performance.
1591     + */
1592    
1593     -EBX = 0x00
1594     -ECX = 0x04
1595     -EDX = 0x08
1596     -ESI = 0x0C
1597     -EDI = 0x10
1598     -EBP = 0x14
1599     -EAX = 0x18
1600     -DS = 0x1C
1601     -ES = 0x20
1602     -ORIG_EAX = 0x24
1603     -EIP = 0x28
1604     -CS = 0x2C
1605     -EFLAGS = 0x30
1606     -OLDESP = 0x34
1607     -OLDSS = 0x38
1608     +#define nr_syscalls ((syscall_table_size)/4)
1609    
1610     CF_MASK = 0x00000001
1611     TF_MASK = 0x00000100
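
The comment block introduced above states the calling convention this patch moves entry_32-xen.S to: every DISABLE_INTERRUPTS/ENABLE_INTERRUPTS site declares which registers the paravirtualized replacement may clobber, and granting clobbers lets short replacements be patched inline rather than called. The trade-off can be sketched with C inline asm (illustration only; these macro names are made up, not the kernel's):

/* Illustration only: the clobber list a call site grants determines how
 * much state a patched-in replacement must preserve.  A site that allows
 * CLBR_EAX gives the replacement a scratch register; CLBR_NONE does not. */
#define DISABLE_INTERRUPTS_CLBR_NONE() \
        asm volatile("cli" : : : "memory")          /* preserve all registers */
#define DISABLE_INTERRUPTS_CLBR_EAX() \
        asm volatile("cli" : : : "eax", "memory")   /* %eax may be trashed */
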
1612     @@ -79,61 +78,16 @@
1613     /* Pseudo-eflags. */
1614     NMI_MASK = 0x80000000
1615    
1616     -#ifndef CONFIG_XEN
1617     -/* These are replaces for paravirtualization */
1618     -#define DISABLE_INTERRUPTS cli
1619     -#define ENABLE_INTERRUPTS sti
1620     -#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit
1621     -#define INTERRUPT_RETURN iret
1622     -#define GET_CR0_INTO_EAX movl %cr0, %eax
1623     -#else
1624     -/* Offsets into shared_info_t. */
1625     -#define evtchn_upcall_pending /* 0 */
1626     -#define evtchn_upcall_mask 1
1627     -
1628     -#define sizeof_vcpu_shift 6
1629     -
1630     -#ifdef CONFIG_SMP
1631     -#define GET_VCPU_INFO movl TI_cpu(%ebp),%esi ; \
1632     - shl $sizeof_vcpu_shift,%esi ; \
1633     - addl HYPERVISOR_shared_info,%esi
1634     -#else
1635     -#define GET_VCPU_INFO movl HYPERVISOR_shared_info,%esi
1636     -#endif
1637     -
1638     -#define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(%esi)
1639     -#define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(%esi)
1640     -#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(%esi)
1641     -#define DISABLE_INTERRUPTS GET_VCPU_INFO ; \
1642     - __DISABLE_INTERRUPTS
1643     -#define ENABLE_INTERRUPTS GET_VCPU_INFO ; \
1644     - __ENABLE_INTERRUPTS
1645     -#define ENABLE_INTERRUPTS_SYSEXIT __ENABLE_INTERRUPTS ; \
1646     -sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ ; \
1647     - __TEST_PENDING ; \
1648     - jnz 14f # process more events if necessary... ; \
1649     - movl ESI(%esp), %esi ; \
1650     - sysexit ; \
1651     -14: __DISABLE_INTERRUPTS ; \
1652     - TRACE_IRQS_OFF ; \
1653     -sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/ ; \
1654     - push %esp ; \
1655     - call evtchn_do_upcall ; \
1656     - add $4,%esp ; \
1657     - jmp ret_from_intr
1658     -#define INTERRUPT_RETURN iret
1659     -#endif
1660     -
1661     #ifdef CONFIG_PREEMPT
1662     -#define preempt_stop DISABLE_INTERRUPTS; TRACE_IRQS_OFF
1663     +#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
1664     #else
1665     -#define preempt_stop
1666     +#define preempt_stop(clobbers)
1667     #define resume_kernel restore_nocheck
1668     #endif
1669    
1670     .macro TRACE_IRQS_IRET
1671     #ifdef CONFIG_TRACE_IRQFLAGS
1672     - testl $IF_MASK,EFLAGS(%esp) # interrupts off?
1673     + testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off?
1674     jz 1f
1675     TRACE_IRQS_ON
1676     1:
1677     @@ -148,6 +102,9 @@
1678    
1679     #define SAVE_ALL \
1680     cld; \
1681     + pushl %gs; \
1682     + CFI_ADJUST_CFA_OFFSET 4;\
1683     + /*CFI_REL_OFFSET gs, 0;*/\
1684     pushl %es; \
1685     CFI_ADJUST_CFA_OFFSET 4;\
1686     /*CFI_REL_OFFSET es, 0;*/\
1687     @@ -177,7 +134,9 @@
1688     CFI_REL_OFFSET ebx, 0;\
1689     movl $(__USER_DS), %edx; \
1690     movl %edx, %ds; \
1691     - movl %edx, %es;
1692     + movl %edx, %es; \
1693     + movl $(__KERNEL_PDA), %edx; \
1694     + movl %edx, %gs
1695    
1696     #define RESTORE_INT_REGS \
1697     popl %ebx; \
1698     @@ -210,17 +169,22 @@
1699     2: popl %es; \
1700     CFI_ADJUST_CFA_OFFSET -4;\
1701     /*CFI_RESTORE es;*/\
1702     -.section .fixup,"ax"; \
1703     -3: movl $0,(%esp); \
1704     - jmp 1b; \
1705     +3: popl %gs; \
1706     + CFI_ADJUST_CFA_OFFSET -4;\
1707     + /*CFI_RESTORE gs;*/\
1708     +.pushsection .fixup,"ax"; \
1709     4: movl $0,(%esp); \
1710     + jmp 1b; \
1711     +5: movl $0,(%esp); \
1712     jmp 2b; \
1713     -.previous; \
1714     +6: movl $0,(%esp); \
1715     + jmp 3b; \
1716     .section __ex_table,"a";\
1717     .align 4; \
1718     - .long 1b,3b; \
1719     - .long 2b,4b; \
1720     -.previous
1721     + .long 1b,4b; \
1722     + .long 2b,5b; \
1723     + .long 3b,6b; \
1724     +.popsection
1725    
1726     #define RING0_INT_FRAME \
1727     CFI_STARTPROC simple;\
1728     @@ -239,18 +203,18 @@
1729     #define RING0_PTREGS_FRAME \
1730     CFI_STARTPROC simple;\
1731     CFI_SIGNAL_FRAME;\
1732     - CFI_DEF_CFA esp, OLDESP-EBX;\
1733     - /*CFI_OFFSET cs, CS-OLDESP;*/\
1734     - CFI_OFFSET eip, EIP-OLDESP;\
1735     - /*CFI_OFFSET es, ES-OLDESP;*/\
1736     - /*CFI_OFFSET ds, DS-OLDESP;*/\
1737     - CFI_OFFSET eax, EAX-OLDESP;\
1738     - CFI_OFFSET ebp, EBP-OLDESP;\
1739     - CFI_OFFSET edi, EDI-OLDESP;\
1740     - CFI_OFFSET esi, ESI-OLDESP;\
1741     - CFI_OFFSET edx, EDX-OLDESP;\
1742     - CFI_OFFSET ecx, ECX-OLDESP;\
1743     - CFI_OFFSET ebx, EBX-OLDESP
1744     + CFI_DEF_CFA esp, PT_OLDESP-PT_EBX;\
1745     + /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/\
1746     + CFI_OFFSET eip, PT_EIP-PT_OLDESP;\
1747     + /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/\
1748     + /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/\
1749     + CFI_OFFSET eax, PT_EAX-PT_OLDESP;\
1750     + CFI_OFFSET ebp, PT_EBP-PT_OLDESP;\
1751     + CFI_OFFSET edi, PT_EDI-PT_OLDESP;\
1752     + CFI_OFFSET esi, PT_ESI-PT_OLDESP;\
1753     + CFI_OFFSET edx, PT_EDX-PT_OLDESP;\
1754     + CFI_OFFSET ecx, PT_ECX-PT_OLDESP;\
1755     + CFI_OFFSET ebx, PT_EBX-PT_OLDESP
1756    
1757     ENTRY(ret_from_fork)
1758     CFI_STARTPROC
1759     @@ -278,17 +242,18 @@
1760     ALIGN
1761     RING0_PTREGS_FRAME
1762     ret_from_exception:
1763     - preempt_stop
1764     + preempt_stop(CLBR_ANY)
1765     ret_from_intr:
1766     GET_THREAD_INFO(%ebp)
1767     check_userspace:
1768     - movl EFLAGS(%esp), %eax # mix EFLAGS and CS
1769     - movb CS(%esp), %al
1770     + movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
1771     + movb PT_CS(%esp), %al
1772     andl $(VM_MASK | SEGMENT_RPL_MASK), %eax
1773     cmpl $USER_RPL, %eax
1774     jb resume_kernel # not returning to v8086 or userspace
1775     +
1776     ENTRY(resume_userspace)
1777     - DISABLE_INTERRUPTS # make sure we don't miss an interrupt
1778     + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
1779     # setting need_resched or sigpending
1780     # between sampling and the iret
1781     movl TI_flags(%ebp), %ecx
1782     @@ -299,14 +264,14 @@
1783    
1784     #ifdef CONFIG_PREEMPT
1785     ENTRY(resume_kernel)
1786     - DISABLE_INTERRUPTS
1787     + DISABLE_INTERRUPTS(CLBR_ANY)
1788     cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
1789     jnz restore_nocheck
1790     need_resched:
1791     movl TI_flags(%ebp), %ecx # need_resched set ?
1792     testb $_TIF_NEED_RESCHED, %cl
1793     jz restore_all
1794     - testl $IF_MASK,EFLAGS(%esp) # interrupts off (exception path) ?
1795     + testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off (exception path) ?
1796     jz restore_all
1797     call preempt_schedule_irq
1798     jmp need_resched
1799     @@ -328,7 +293,7 @@
1800     * No need to follow this irqs on/off section: the syscall
1801     * disabled irqs and here we enable it straight after entry:
1802     */
1803     - ENABLE_INTERRUPTS
1804     + ENABLE_INTERRUPTS(CLBR_NONE)
1805     pushl $(__USER_DS)
1806     CFI_ADJUST_CFA_OFFSET 4
1807     /*CFI_REL_OFFSET ss, 0*/
1808     @@ -340,12 +305,16 @@
1809     pushl $(__USER_CS)
1810     CFI_ADJUST_CFA_OFFSET 4
1811     /*CFI_REL_OFFSET cs, 0*/
1812     +#ifndef CONFIG_COMPAT_VDSO
1813     /*
1814     * Push current_thread_info()->sysenter_return to the stack.
1815     * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
1816     * pushed above; +8 corresponds to copy_thread's esp0 setting.
1817     */
1818     pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
1819     +#else
1820     + pushl $SYSENTER_RETURN
1821     +#endif
1822     CFI_ADJUST_CFA_OFFSET 4
1823     CFI_REL_OFFSET eip, 0
1824    
1825     @@ -372,19 +341,27 @@
1826     cmpl $(nr_syscalls), %eax
1827     jae syscall_badsys
1828     call *sys_call_table(,%eax,4)
1829     - movl %eax,EAX(%esp)
1830     - DISABLE_INTERRUPTS
1831     + movl %eax,PT_EAX(%esp)
1832     + DISABLE_INTERRUPTS(CLBR_ECX|CLBR_EDX)
1833     TRACE_IRQS_OFF
1834     movl TI_flags(%ebp), %ecx
1835     testw $_TIF_ALLWORK_MASK, %cx
1836     jne syscall_exit_work
1837     /* if something modifies registers it must also disable sysexit */
1838     - movl EIP(%esp), %edx
1839     - movl OLDESP(%esp), %ecx
1840     + movl PT_EIP(%esp), %edx
1841     + movl PT_OLDESP(%esp), %ecx
1842     xorl %ebp,%ebp
1843     TRACE_IRQS_ON
1844     +1: mov PT_GS(%esp), %gs
1845     ENABLE_INTERRUPTS_SYSEXIT
1846     CFI_ENDPROC
1847     +.pushsection .fixup,"ax"
1848     +2: movl $0,PT_GS(%esp)
1849     + jmp 1b
1850     +.section __ex_table,"a"
1851     + .align 4
1852     + .long 1b,2b
1853     +.popsection
1854    
1855     # pv sysenter call handler stub
1856     ENTRY(sysenter_entry_pv)
1857     @@ -419,7 +396,7 @@
1858     CFI_ADJUST_CFA_OFFSET 4
1859     SAVE_ALL
1860     GET_THREAD_INFO(%ebp)
1861     - testl $TF_MASK,EFLAGS(%esp)
1862     + testl $TF_MASK,PT_EFLAGS(%esp)
1863     jz no_singlestep
1864     orl $_TIF_SINGLESTEP,TI_flags(%ebp)
1865     no_singlestep:
1866     @@ -431,9 +408,9 @@
1867     jae syscall_badsys
1868     syscall_call:
1869     call *sys_call_table(,%eax,4)
1870     - movl %eax,EAX(%esp) # store the return value
1871     + movl %eax,PT_EAX(%esp) # store the return value
1872     syscall_exit:
1873     - DISABLE_INTERRUPTS # make sure we don't miss an interrupt
1874     + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
1875     # setting need_resched or sigpending
1876     # between sampling and the iret
1877     TRACE_IRQS_OFF
1878     @@ -443,12 +420,12 @@
1879    
1880     restore_all:
1881     #ifndef CONFIG_XEN
1882     - movl EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
1883     - # Warning: OLDSS(%esp) contains the wrong/random values if we
1884     + movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
1885     + # Warning: PT_OLDSS(%esp) contains the wrong/random values if we
1886     # are returning to the kernel.
1887     # See comments in process.c:copy_thread() for details.
1888     - movb OLDSS(%esp), %ah
1889     - movb CS(%esp), %al
1890     + movb PT_OLDSS(%esp), %ah
1891     + movb PT_CS(%esp), %al
1892     andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
1893     cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
1894     CFI_REMEMBER_STATE
1895     @@ -456,7 +433,7 @@
1896     restore_nocheck:
1897     #else
1898     restore_nocheck:
1899     - movl EFLAGS(%esp), %eax
1900     + movl PT_EFLAGS(%esp), %eax
1901     testl $(VM_MASK|NMI_MASK), %eax
1902     CFI_REMEMBER_STATE
1903     jnz hypervisor_iret
1904     @@ -470,13 +447,13 @@
1905     TRACE_IRQS_IRET
1906     restore_nocheck_notrace:
1907     RESTORE_REGS
1908     - addl $4, %esp
1909     + addl $4, %esp # skip orig_eax/error_code
1910     CFI_ADJUST_CFA_OFFSET -4
1911     1: INTERRUPT_RETURN
1912     .section .fixup,"ax"
1913     iret_exc:
1914     #ifndef CONFIG_XEN
1915     - ENABLE_INTERRUPTS
1916     + ENABLE_INTERRUPTS(CLBR_NONE)
1917     #endif
1918     pushl $0 # no error code
1919     pushl $do_iret_error
1920     @@ -490,33 +467,42 @@
1921     CFI_RESTORE_STATE
1922     #ifndef CONFIG_XEN
1923     ldt_ss:
1924     - larl OLDSS(%esp), %eax
1925     + larl PT_OLDSS(%esp), %eax
1926     jnz restore_nocheck
1927     testl $0x00400000, %eax # returning to 32bit stack?
1928     jnz restore_nocheck # allright, normal return
1929     +
1930     +#ifdef CONFIG_PARAVIRT
1931     + /*
1932     + * The kernel can't run on a non-flat stack if paravirt mode
1933     + * is active. Rather than try to fixup the high bits of
1934     + * ESP, bypass this code entirely. This may break DOSemu
1935     + * and/or Wine support in a paravirt VM, although the option
1936     + * is still available to implement the setting of the high
1937     + * 16-bits in the INTERRUPT_RETURN paravirt-op.
1938     + */
1939     + cmpl $0, paravirt_ops+PARAVIRT_enabled
1940     + jne restore_nocheck
1941     +#endif
1942     +
1943     /* If returning to userspace with 16bit stack,
1944     * try to fix the higher word of ESP, as the CPU
1945     * won't restore it.
1946     * This is an "official" bug of all the x86-compatible
1947     * CPUs, which we can try to work around to make
1948     * dosemu and wine happy. */
1949     - subl $8, %esp # reserve space for switch16 pointer
1950     - CFI_ADJUST_CFA_OFFSET 8
1951     - DISABLE_INTERRUPTS
1952     + movl PT_OLDESP(%esp), %eax
1953     + movl %esp, %edx
1954     + call patch_espfix_desc
1955     + pushl $__ESPFIX_SS
1956     + CFI_ADJUST_CFA_OFFSET 4
1957     + pushl %eax
1958     + CFI_ADJUST_CFA_OFFSET 4
1959     + DISABLE_INTERRUPTS(CLBR_EAX)
1960     TRACE_IRQS_OFF
1961     - movl %esp, %eax
1962     - /* Set up the 16bit stack frame with switch32 pointer on top,
1963     - * and a switch16 pointer on top of the current frame. */
1964     - call setup_x86_bogus_stack
1965     - CFI_ADJUST_CFA_OFFSET -8 # frame has moved
1966     - TRACE_IRQS_IRET
1967     - RESTORE_REGS
1968     - lss 20+4(%esp), %esp # switch to 16bit stack
1969     -1: INTERRUPT_RETURN
1970     -.section __ex_table,"a"
1971     - .align 4
1972     - .long 1b,iret_exc
1973     -.previous
1974     + lss (%esp), %esp
1975     + CFI_ADJUST_CFA_OFFSET -8
1976     + jmp restore_nocheck
1977     #else
1978     ALIGN
1979     restore_all_enable_events:
1980     @@ -540,7 +526,7 @@
1981    
1982     CFI_RESTORE_STATE
1983     hypervisor_iret:
1984     - andl $~NMI_MASK, EFLAGS(%esp)
1985     + andl $~NMI_MASK, PT_EFLAGS(%esp)
1986     RESTORE_REGS
1987     addl $4, %esp
1988     CFI_ADJUST_CFA_OFFSET -4
1989     @@ -556,7 +542,7 @@
1990     jz work_notifysig
1991     work_resched:
1992     call schedule
1993     - DISABLE_INTERRUPTS # make sure we don't miss an interrupt
1994     + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
1995     # setting need_resched or sigpending
1996     # between sampling and the iret
1997     TRACE_IRQS_OFF
1998     @@ -569,7 +555,8 @@
1999    
2000     work_notifysig: # deal with pending signals and
2001     # notify-resume requests
2002     - testl $VM_MASK, EFLAGS(%esp)
2003     +#ifdef CONFIG_VM86
2004     + testl $VM_MASK, PT_EFLAGS(%esp)
2005     movl %esp, %eax
2006     jne work_notifysig_v86 # returning to kernel-space or
2007     # vm86-space
2008     @@ -579,29 +566,30 @@
2009    
2010     ALIGN
2011     work_notifysig_v86:
2012     -#ifdef CONFIG_VM86
2013     pushl %ecx # save ti_flags for do_notify_resume
2014     CFI_ADJUST_CFA_OFFSET 4
2015     call save_v86_state # %eax contains pt_regs pointer
2016     popl %ecx
2017     CFI_ADJUST_CFA_OFFSET -4
2018     movl %eax, %esp
2019     +#else
2020     + movl %esp, %eax
2021     +#endif
2022     xorl %edx, %edx
2023     call do_notify_resume
2024     jmp resume_userspace_sig
2025     -#endif
2026    
2027     # perform syscall exit tracing
2028     ALIGN
2029     syscall_trace_entry:
2030     - movl $-ENOSYS,EAX(%esp)
2031     + movl $-ENOSYS,PT_EAX(%esp)
2032     movl %esp, %eax
2033     xorl %edx,%edx
2034     call do_syscall_trace
2035     cmpl $0, %eax
2036     jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
2037     # so must skip actual syscall
2038     - movl ORIG_EAX(%esp), %eax
2039     + movl PT_ORIG_EAX(%esp), %eax
2040     cmpl $(nr_syscalls), %eax
2041     jnae syscall_call
2042     jmp syscall_exit
2043     @@ -612,7 +600,7 @@
2044     testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
2045     jz work_pending
2046     TRACE_IRQS_ON
2047     - ENABLE_INTERRUPTS # could let do_syscall_trace() call
2048     + ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call
2049     # schedule() instead
2050     movl %esp, %eax
2051     movl $1, %edx
2052     @@ -626,40 +614,39 @@
2053     CFI_ADJUST_CFA_OFFSET 4
2054     SAVE_ALL
2055     GET_THREAD_INFO(%ebp)
2056     - movl $-EFAULT,EAX(%esp)
2057     + movl $-EFAULT,PT_EAX(%esp)
2058     jmp resume_userspace
2059    
2060     syscall_badsys:
2061     - movl $-ENOSYS,EAX(%esp)
2062     + movl $-ENOSYS,PT_EAX(%esp)
2063     jmp resume_userspace
2064     CFI_ENDPROC
2065    
2066     #ifndef CONFIG_XEN
2067     #define FIXUP_ESPFIX_STACK \
2068     - movl %esp, %eax; \
2069     - /* switch to 32bit stack using the pointer on top of 16bit stack */ \
2070     - lss %ss:CPU_16BIT_STACK_SIZE-8, %esp; \
2071     - /* copy data from 16bit stack to 32bit stack */ \
2072     - call fixup_x86_bogus_stack; \
2073     - /* put ESP to the proper location */ \
2074     - movl %eax, %esp;
2075     -#define UNWIND_ESPFIX_STACK \
2076     + /* since we are on a wrong stack, we cant make it a C code :( */ \
2077     + movl %gs:PDA_cpu, %ebx; \
2078     + PER_CPU(cpu_gdt_descr, %ebx); \
2079     + movl GDS_address(%ebx), %ebx; \
2080     + GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \
2081     + addl %esp, %eax; \
2082     + pushl $__KERNEL_DS; \
2083     + CFI_ADJUST_CFA_OFFSET 4; \
2084     pushl %eax; \
2085     CFI_ADJUST_CFA_OFFSET 4; \
2086     + lss (%esp), %esp; \
2087     + CFI_ADJUST_CFA_OFFSET -8;
2088     +#define UNWIND_ESPFIX_STACK \
2089     movl %ss, %eax; \
2090     - /* see if on 16bit stack */ \
2091     + /* see if on espfix stack */ \
2092     cmpw $__ESPFIX_SS, %ax; \
2093     - je 28f; \
2094     -27: popl %eax; \
2095     - CFI_ADJUST_CFA_OFFSET -4; \
2096     -.section .fixup,"ax"; \
2097     -28: movl $__KERNEL_DS, %eax; \
2098     + jne 27f; \
2099     + movl $__KERNEL_DS, %eax; \
2100     movl %eax, %ds; \
2101     movl %eax, %es; \
2102     - /* switch to 32bit stack */ \
2103     + /* switch to normal stack */ \
2104     FIXUP_ESPFIX_STACK; \
2105     - jmp 27b; \
2106     -.previous
2107     +27:;
2108    
2109     /*
2110     * Build the entry stubs and pointer table with
2111     @@ -723,13 +710,16 @@
2112     CFI_ADJUST_CFA_OFFSET 4
2113     ALIGN
2114     error_code:
2115     + /* the function address is in %gs's slot on the stack */
2116     + pushl %es
2117     + CFI_ADJUST_CFA_OFFSET 4
2118     + /*CFI_REL_OFFSET es, 0*/
2119     pushl %ds
2120     CFI_ADJUST_CFA_OFFSET 4
2121     /*CFI_REL_OFFSET ds, 0*/
2122     pushl %eax
2123     CFI_ADJUST_CFA_OFFSET 4
2124     CFI_REL_OFFSET eax, 0
2125     - xorl %eax, %eax
2126     pushl %ebp
2127     CFI_ADJUST_CFA_OFFSET 4
2128     CFI_REL_OFFSET ebp, 0
2129     @@ -742,7 +732,6 @@
2130     pushl %edx
2131     CFI_ADJUST_CFA_OFFSET 4
2132     CFI_REL_OFFSET edx, 0
2133     - decl %eax # eax = -1
2134     pushl %ecx
2135     CFI_ADJUST_CFA_OFFSET 4
2136     CFI_REL_OFFSET ecx, 0
2137     @@ -750,18 +739,20 @@
2138     CFI_ADJUST_CFA_OFFSET 4
2139     CFI_REL_OFFSET ebx, 0
2140     cld
2141     - pushl %es
2142     + pushl %gs
2143     CFI_ADJUST_CFA_OFFSET 4
2144     - /*CFI_REL_OFFSET es, 0*/
2145     + /*CFI_REL_OFFSET gs, 0*/
2146     + movl $(__KERNEL_PDA), %ecx
2147     + movl %ecx, %gs
2148     UNWIND_ESPFIX_STACK
2149     popl %ecx
2150     CFI_ADJUST_CFA_OFFSET -4
2151     /*CFI_REGISTER es, ecx*/
2152     - movl ES(%esp), %edi # get the function address
2153     - movl ORIG_EAX(%esp), %edx # get the error code
2154     - movl %eax, ORIG_EAX(%esp)
2155     - movl %ecx, ES(%esp)
2156     - /*CFI_REL_OFFSET es, ES*/
2157     + movl PT_GS(%esp), %edi # get the function address
2158     + movl PT_ORIG_EAX(%esp), %edx # get the error code
2159     + movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
2160     + mov %ecx, PT_GS(%esp)
2161     + /*CFI_REL_OFFSET gs, ES*/
2162     movl $(__USER_DS), %ecx
2163     movl %ecx, %ds
2164     movl %ecx, %es
2165     @@ -793,7 +784,7 @@
2166     pushl %eax
2167     CFI_ADJUST_CFA_OFFSET 4
2168     SAVE_ALL
2169     - movl EIP(%esp),%eax
2170     + movl PT_EIP(%esp),%eax
2171     cmpl $scrit,%eax
2172     jb 11f
2173     cmpl $ecrit,%eax
2174     @@ -802,7 +793,7 @@
2175     jb 11f
2176     cmpl $sysexit_ecrit,%eax
2177     ja 11f
2178     - addl $OLDESP,%esp # Remove eflags...ebx from stack frame.
2179     + addl $PT_OLDESP,%esp # Remove eflags...ebx from stack frame.
2180     11: push %esp
2181     CFI_ADJUST_CFA_OFFSET 4
2182     call evtchn_do_upcall
2183     @@ -824,7 +815,7 @@
2184     jne 15f
2185     xorl %ecx,%ecx
2186     15: leal (%esp,%ecx),%esi # %esi points at end of src region
2187     - leal OLDESP(%esp),%edi # %edi points at end of dst region
2188     + leal PT_OLDESP(%esp),%edi # %edi points at end of dst region
2189     shrl $2,%ecx # convert words to bytes
2190     je 17f # skip loop if nothing to copy
2191     16: subl $4,%esi # pre-decrementing copy loop
2192     @@ -848,8 +839,9 @@
2193     .byte 0x18 # pop %eax
2194     .byte 0x1c # pop %ds
2195     .byte 0x20 # pop %es
2196     - .byte 0x24,0x24,0x24 # add $4,%esp
2197     - .byte 0x28 # iret
2198     + .byte 0x24,0x24 # pop %gs
2199     + .byte 0x28,0x28,0x28 # add $4,%esp
2200     + .byte 0x2c # iret
2201     .byte 0xff,0xff,0xff,0xff # movb $1,1(%esi)
2202     .byte 0x00,0x00 # jmp 11b
2203     .previous
2204     @@ -940,7 +932,7 @@
2205     jmp ret_from_exception
2206     device_available_emulate:
2207     #endif
2208     - preempt_stop
2209     + preempt_stop(CLBR_ANY)
2210     call math_state_restore
2211     jmp ret_from_exception
2212     CFI_ENDPROC
2213     @@ -1010,7 +1002,7 @@
2214     cmpw $__ESPFIX_SS, %ax
2215     popl %eax
2216     CFI_ADJUST_CFA_OFFSET -4
2217     - je nmi_16bit_stack
2218     + je nmi_espfix_stack
2219     cmpl $sysenter_entry,(%esp)
2220     je nmi_stack_fixup
2221     pushl %eax
2222     @@ -1053,7 +1045,7 @@
2223     FIX_STACK(24,nmi_stack_correct, 1)
2224     jmp nmi_stack_correct
2225    
2226     -nmi_16bit_stack:
2227     +nmi_espfix_stack:
2228     /* We have a RING0_INT_FRAME here.
2229     *
2230     * create the pointer to lss back
2231     @@ -1062,7 +1054,6 @@
2232     CFI_ADJUST_CFA_OFFSET 4
2233     pushl %esp
2234     CFI_ADJUST_CFA_OFFSET 4
2235     - movzwl %sp, %esp
2236     addw $4, (%esp)
2237     /* copy the iret frame of 12 bytes */
2238     .rept 3
2239     @@ -1073,11 +1064,11 @@
2240     CFI_ADJUST_CFA_OFFSET 4
2241     SAVE_ALL
2242     FIXUP_ESPFIX_STACK # %eax == %esp
2243     - CFI_ADJUST_CFA_OFFSET -20 # the frame has now moved
2244     xorl %edx,%edx # zero error code
2245     call do_nmi
2246     RESTORE_REGS
2247     - lss 12+4(%esp), %esp # back to 16bit stack
2248     + lss 12+4(%esp), %esp # back to espfix stack
2249     + CFI_ADJUST_CFA_OFFSET -24
2250     1: INTERRUPT_RETURN
2251     CFI_ENDPROC
2252     .section __ex_table,"a"
2253     @@ -1093,12 +1084,25 @@
2254     xorl %edx,%edx # zero error code
2255     movl %esp,%eax # pt_regs pointer
2256     call do_nmi
2257     - orl $NMI_MASK, EFLAGS(%esp)
2258     + orl $NMI_MASK, PT_EFLAGS(%esp)
2259     jmp restore_all
2260     CFI_ENDPROC
2261     #endif
2262     KPROBE_END(nmi)
2263    
2264     +#ifdef CONFIG_PARAVIRT
2265     +ENTRY(native_iret)
2266     +1: iret
2267     +.section __ex_table,"a"
2268     + .align 4
2269     + .long 1b,iret_exc
2270     +.previous
2271     +
2272     +ENTRY(native_irq_enable_sysexit)
2273     + sti
2274     + sysexit
2275     +#endif
2276     +
2277     KPROBE_ENTRY(int3)
2278     RING0_INT_FRAME
2279     pushl $-1 # mark this as an int
2280     @@ -1214,37 +1218,6 @@
2281     CFI_ENDPROC
2282     #endif /* !CONFIG_XEN */
2283    
2284     -#ifdef CONFIG_STACK_UNWIND
2285     -ENTRY(arch_unwind_init_running)
2286     - CFI_STARTPROC
2287     - movl 4(%esp), %edx
2288     - movl (%esp), %ecx
2289     - leal 4(%esp), %eax
2290     - movl %ebx, EBX(%edx)
2291     - xorl %ebx, %ebx
2292     - movl %ebx, ECX(%edx)
2293     - movl %ebx, EDX(%edx)
2294     - movl %esi, ESI(%edx)
2295     - movl %edi, EDI(%edx)
2296     - movl %ebp, EBP(%edx)
2297     - movl %ebx, EAX(%edx)
2298     - movl $__USER_DS, DS(%edx)
2299     - movl $__USER_DS, ES(%edx)
2300     - movl %ebx, ORIG_EAX(%edx)
2301     - movl %ecx, EIP(%edx)
2302     - movl 12(%esp), %ecx
2303     - movl $__KERNEL_CS, CS(%edx)
2304     - movl %ebx, EFLAGS(%edx)
2305     - movl %eax, OLDESP(%edx)
2306     - movl 8(%esp), %eax
2307     - movl %ecx, 8(%esp)
2308     - movl EBX(%edx), %ebx
2309     - movl $__KERNEL_DS, OLDSS(%edx)
2310     - jmpl *%eax
2311     - CFI_ENDPROC
2312     -ENDPROC(arch_unwind_init_running)
2313     -#endif
2314     -
2315     ENTRY(fixup_4gb_segment)
2316     RING0_EC_FRAME
2317     pushl $do_fixup_4gb_segment
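
A theme running through the entry_32 rework above: the hand-maintained EBX..OLDSS frame offsets deleted near the top are replaced by PT_* constants that the build generates from struct pt_regs, so the new %gs slot (which shifts orig_eax and everything after it up by 4) cannot silently desynchronize the assembly from the C structure. The generator is asm-offsets_32.c, also touched by this patch; the entries below are a sketch based on the mainline 2.6.20 file.

    #include <linux/stddef.h>       /* offsetof() */
    #include <asm/ptrace.h>         /* struct pt_regs */

    /* Each DEFINE() makes the compiler emit "->PT_xxx $N" into its
     * assembly output, which Kbuild post-processes into asm-offsets.h
     * for the .S files to include. */
    #define DEFINE(sym, val) \
            asm volatile("\n->" #sym " %0 " #val : : "i" (val))
    #define OFFSET(sym, str, mem) DEFINE(sym, offsetof(struct str, mem))

    void foo(void)
    {
            OFFSET(PT_EBX, pt_regs, ebx);           /* 0x00, as before */
            OFFSET(PT_GS, pt_regs, xgs);            /* the new 0x24 slot */
            OFFSET(PT_ORIG_EAX, pt_regs, orig_eax); /* now 0x28 */
            OFFSET(PT_EIP, pt_regs, eip);
            OFFSET(PT_OLDESP, pt_regs, esp);        /* user %esp on ring transition */
    }
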
2318     --- a/arch/x86/kernel/entry_64-xen.S
2319     +++ b/arch/x86/kernel/entry_64-xen.S
2320     @@ -261,7 +261,6 @@
2321     movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
2322     GET_THREAD_INFO(%rcx)
2323     testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
2324     - CFI_REMEMBER_STATE
2325     jnz tracesys
2326     cmpq $__NR_syscall_max,%rax
2327     ja badsys
2328     @@ -272,7 +271,6 @@
2329     * Syscall return path ending with SYSRET (fast path)
2330     * Has incomplete stack frame and undefined top of stack.
2331     */
2332     - .globl ret_from_sys_call
2333     ret_from_sys_call:
2334     movl $_TIF_ALLWORK_MASK,%edi
2335     /* edi: flagmask */
2336     @@ -282,8 +280,8 @@
2337     TRACE_IRQS_OFF
2338     movl threadinfo_flags(%rcx),%edx
2339     andl %edi,%edx
2340     - CFI_REMEMBER_STATE
2341     jnz sysret_careful
2342     + CFI_REMEMBER_STATE
2343     /*
2344     * sysretq will re-enable interrupts:
2345     */
2346     @@ -292,10 +290,10 @@
2347     RESTORE_ARGS 0,8,0
2348     HYPERVISOR_IRET VGCF_IN_SYSCALL
2349    
2350     + CFI_RESTORE_STATE
2351     /* Handle reschedules */
2352     /* edx: work, edi: workmask */
2353     sysret_careful:
2354     - CFI_RESTORE_STATE
2355     bt $TIF_NEED_RESCHED,%edx
2356     jnc sysret_signal
2357     TRACE_IRQS_ON
2358     @@ -334,7 +332,6 @@
2359    
2360     /* Do syscall tracing */
2361     tracesys:
2362     - CFI_RESTORE_STATE
2363     SAVE_REST
2364     movq $-ENOSYS,RAX(%rsp)
2365     FIXUP_TOP_OF_STACK %rdi
2366     @@ -350,32 +347,13 @@
2367     call *sys_call_table(,%rax,8)
2368     1: movq %rax,RAX-ARGOFFSET(%rsp)
2369     /* Use IRET because user could have changed frame */
2370     - jmp int_ret_from_sys_call
2371     - CFI_ENDPROC
2372     -END(system_call)
2373    
2374     /*
2375     * Syscall return path ending with IRET.
2376     * Has correct top of stack, but partial stack frame.
2377     - */
2378     -ENTRY(int_ret_from_sys_call)
2379     - CFI_STARTPROC simple
2380     - CFI_SIGNAL_FRAME
2381     - CFI_DEF_CFA rsp,SS+8-ARGOFFSET
2382     - /*CFI_REL_OFFSET ss,SS-ARGOFFSET*/
2383     - CFI_REL_OFFSET rsp,RSP-ARGOFFSET
2384     - /*CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
2385     - /*CFI_REL_OFFSET cs,CS-ARGOFFSET*/
2386     - CFI_REL_OFFSET rip,RIP-ARGOFFSET
2387     - CFI_REL_OFFSET rdx,RDX-ARGOFFSET
2388     - CFI_REL_OFFSET rcx,RCX-ARGOFFSET
2389     - CFI_REL_OFFSET rax,RAX-ARGOFFSET
2390     - CFI_REL_OFFSET rdi,RDI-ARGOFFSET
2391     - CFI_REL_OFFSET rsi,RSI-ARGOFFSET
2392     - CFI_REL_OFFSET r8,R8-ARGOFFSET
2393     - CFI_REL_OFFSET r9,R9-ARGOFFSET
2394     - CFI_REL_OFFSET r10,R10-ARGOFFSET
2395     - CFI_REL_OFFSET r11,R11-ARGOFFSET
2396     + */
2397     + .globl int_ret_from_sys_call
2398     +int_ret_from_sys_call:
2399     XEN_BLOCK_EVENTS(%rsi)
2400     TRACE_IRQS_OFF
2401     testb $3,CS-ARGOFFSET(%rsp)
2402     @@ -428,8 +406,6 @@
2403     popq %rdi
2404     CFI_ADJUST_CFA_OFFSET -8
2405     andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi
2406     - XEN_BLOCK_EVENTS(%rsi)
2407     - TRACE_IRQS_OFF
2408     jmp int_restore_rest
2409    
2410     int_signal:
2411     @@ -445,7 +421,7 @@
2412     TRACE_IRQS_OFF
2413     jmp int_with_check
2414     CFI_ENDPROC
2415     -END(int_ret_from_sys_call)
2416     +END(system_call)
2417    
2418     /*
2419     * Certain special system calls that need to save a complete full stack frame.
2420     @@ -1275,36 +1251,3 @@
2421     ret
2422     CFI_ENDPROC
2423     ENDPROC(call_softirq)
2424     -
2425     -#ifdef CONFIG_STACK_UNWIND
2426     -ENTRY(arch_unwind_init_running)
2427     - CFI_STARTPROC
2428     - movq %r15, R15(%rdi)
2429     - movq %r14, R14(%rdi)
2430     - xchgq %rsi, %rdx
2431     - movq %r13, R13(%rdi)
2432     - movq %r12, R12(%rdi)
2433     - xorl %eax, %eax
2434     - movq %rbp, RBP(%rdi)
2435     - movq %rbx, RBX(%rdi)
2436     - movq (%rsp), %rcx
2437     - movq %rax, R11(%rdi)
2438     - movq %rax, R10(%rdi)
2439     - movq %rax, R9(%rdi)
2440     - movq %rax, R8(%rdi)
2441     - movq %rax, RAX(%rdi)
2442     - movq %rax, RCX(%rdi)
2443     - movq %rax, RDX(%rdi)
2444     - movq %rax, RSI(%rdi)
2445     - movq %rax, RDI(%rdi)
2446     - movq %rax, ORIG_RAX(%rdi)
2447     - movq %rcx, RIP(%rdi)
2448     - leaq 8(%rsp), %rcx
2449     - movq $__KERNEL_CS, CS(%rdi)
2450     - movq %rax, EFLAGS(%rdi)
2451     - movq %rcx, RSP(%rdi)
2452     - movq $__KERNEL_DS, SS(%rdi)
2453     - jmpq *%rdx
2454     - CFI_ENDPROC
2455     -ENDPROC(arch_unwind_init_running)
2456     -#endif
2457     --- a/arch/x86/kernel/genapic_64-xen.c
2458     +++ b/arch/x86/kernel/genapic_64-xen.c
2459     @@ -34,6 +34,7 @@
2460    
2461     #ifndef CONFIG_XEN
2462     struct genapic *genapic = &apic_flat;
2463     +struct genapic *genapic_force;
2464     #else
2465     extern struct genapic apic_xen;
2466     struct genapic *genapic = &apic_xen;
2467     @@ -52,6 +53,13 @@
2468     u8 cluster_cnt[NUM_APIC_CLUSTERS];
2469     int max_apic = 0;
2470    
2471     + /* genapic selection can be forced because of certain quirks.
2472     + */
2473     + if (genapic_force) {
2474     + genapic = genapic_force;
2475     + goto print;
2476     + }
2477     +
2478     #if defined(CONFIG_ACPI)
2479     /*
2480     * Some x86_64 machines use physical APIC mode regardless of how many
2481     --- a/arch/x86/kernel/head64-xen.c
2482     +++ b/arch/x86/kernel/head64-xen.c
2483     @@ -101,7 +101,10 @@
2484     machine_to_phys_order++;
2485    
2486     #if 0
2487     - for (i = 0; i < 256; i++)
2488     + /* clear bss before set_intr_gate with early_idt_handler */
2489     + clear_bss();
2490     +
2491     + for (i = 0; i < IDT_ENTRIES; i++)
2492     set_intr_gate(i, early_idt_handler);
2493     asm volatile("lidt %0" :: "m" (idt_descr));
2494     #endif
2495     --- a/arch/x86/kernel/head_32-xen.S
2496     +++ b/arch/x86/kernel/head_32-xen.S
2497     @@ -9,6 +9,7 @@
2498     #include <asm/cache.h>
2499     #include <asm/thread_info.h>
2500     #include <asm/asm-offsets.h>
2501     +#include <asm/boot.h>
2502     #include <asm/dwarf2.h>
2503     #include <xen/interface/xen.h>
2504     #include <xen/interface/elfnote.h>
2505     @@ -35,6 +36,8 @@
2506     /* Set up the stack pointer */
2507     movl $(init_thread_union+THREAD_SIZE),%esp
2508    
2509     + call setup_pda
2510     +
2511     /* get vendor info */
2512     xorl %eax,%eax # call CPUID with 0 -> return vendor ID
2513     XEN_CPUID
2514     @@ -57,14 +60,58 @@
2515    
2516     movb $1,X86_HARD_MATH
2517    
2518     - xorl %eax,%eax # Clear FS/GS and LDT
2519     + xorl %eax,%eax # Clear FS
2520     movl %eax,%fs
2521     - movl %eax,%gs
2522     +
2523     + movl $(__KERNEL_PDA),%eax
2524     + mov %eax,%gs
2525     +
2526     cld # gcc2 wants the direction flag cleared at all times
2527    
2528     pushl $0 # fake return address for unwinder
2529     jmp start_kernel
2530    
2531     +/*
2532     + * Point the GDT at this CPU's PDA. This will be
2533     + * cpu_gdt_table and boot_pda.
2534     + */
2535     +setup_pda:
2536     + /* get the PDA pointer */
2537     + movl $boot_pda, %eax
2538     +
2539     + /* slot the PDA address into the GDT */
2540     + mov $cpu_gdt_table, %ecx
2541     + mov %ax, (__KERNEL_PDA+0+2)(%ecx) /* base & 0x0000ffff */
2542     + shr $16, %eax
2543     + mov %al, (__KERNEL_PDA+4+0)(%ecx) /* base & 0x00ff0000 */
2544     + mov %ah, (__KERNEL_PDA+4+3)(%ecx) /* base & 0xff000000 */
2545     +
2546     + # %esi still points to start_info, and no registers
2547     + # need to be preserved.
2548     +
2549     + movl XEN_START_mfn_list(%esi), %ebx
2550     + movl $(cpu_gdt_table - __PAGE_OFFSET), %eax
2551     + shrl $PAGE_SHIFT, %eax
2552     + movl (%ebx,%eax,4), %ecx
2553     + pushl %ecx # frame number for set_gdt below
2554     +
2555     + xorl %esi, %esi
2556     + xorl %edx, %edx
2557     + shldl $PAGE_SHIFT, %ecx, %edx
2558     + shll $PAGE_SHIFT, %ecx
2559     + orl $0x61, %ecx
2560     + movl $cpu_gdt_table, %ebx
2561     + movl $__HYPERVISOR_update_va_mapping, %eax
2562     + int $0x82
2563     +
2564     + movl $(PAGE_SIZE_asm / 8), %ecx
2565     + movl %esp, %ebx
2566     + movl $__HYPERVISOR_set_gdt, %eax
2567     + int $0x82
2568     +
2569     + popl %ecx
2570     + ret
2571     +
2572     #define HYPERCALL_PAGE_OFFSET 0x1000
2573     .org HYPERCALL_PAGE_OFFSET
2574     ENTRY(hypercall_page)
2575     @@ -93,7 +140,8 @@
2576     /*
2577     * The Global Descriptor Table contains 28 quadwords, per-CPU.
2578     */
2579     - .align L1_CACHE_BYTES
2580     + .section .data.page_aligned, "aw"
2581     + .align PAGE_SIZE_asm
2582     ENTRY(cpu_gdt_table)
2583     .quad 0x0000000000000000 /* NULL descriptor */
2584     .quad 0x0000000000000000 /* 0x0b reserved */
2585     @@ -135,12 +183,13 @@
2586     .quad 0x0000000000000000 /* 0xc0 APM CS 16 code (16 bit) */
2587     .quad 0x0000000000000000 /* 0xc8 APM DS data */
2588    
2589     - .quad 0x0000000000000000 /* 0xd0 - ESPFIX 16-bit SS */
2590     - .quad 0x0000000000000000 /* 0xd8 - unused */
2591     + .quad 0x0000000000000000 /* 0xd0 - ESPFIX SS */
2592     + .quad 0x00cf92000000ffff /* 0xd8 - PDA */
2593     .quad 0x0000000000000000 /* 0xe0 - unused */
2594     .quad 0x0000000000000000 /* 0xe8 - unused */
2595     .quad 0x0000000000000000 /* 0xf0 - unused */
2596     .quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */
2597     + .align PAGE_SIZE_asm
2598    
2599     #if CONFIG_XEN_COMPAT <= 0x030002
2600     /*
2601     @@ -165,9 +214,9 @@
2602     .ascii ",ELF_PADDR_OFFSET=0x"
2603     utoa __PAGE_OFFSET
2604     .ascii ",VIRT_ENTRY=0x"
2605     - utoa (__PAGE_OFFSET + __PHYSICAL_START + VIRT_ENTRY_OFFSET)
2606     + utoa (__PAGE_OFFSET + LOAD_PHYSICAL_ADDR + VIRT_ENTRY_OFFSET)
2607     .ascii ",HYPERCALL_PAGE=0x"
2608     - utoa ((__PHYSICAL_START+HYPERCALL_PAGE_OFFSET)>>PAGE_SHIFT)
2609     + utoa ((LOAD_PHYSICAL_ADDR+HYPERCALL_PAGE_OFFSET)>>PAGE_SHIFT)
2610     .ascii ",FEATURES=writable_page_tables"
2611     .ascii "|writable_descriptor_tables"
2612     .ascii "|auto_translated_physmap"
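
The three scattered mov instructions in setup_pda above exist because an x86 segment descriptor splits its 32-bit base across non-contiguous byte fields: bits 0-15 at byte offset 2, bits 16-23 at offset 4, bits 24-31 at offset 7. The same packing in C, as a sketch -- the limit/type bits of the PDA entry are already supplied by the 0x00cf92000000ffff template in the GDT above.

    #include <stdint.h>

    /* Patch only the base-address bytes of an existing 8-byte GDT
     * entry; limit and access bits are assumed to be filled in. */
    static void gdt_set_base(uint8_t desc[8], uint32_t base)
    {
            desc[2] = base & 0xff;          /* bits  7:0  - low half of the 16-bit mov */
            desc[3] = (base >> 8) & 0xff;   /* bits 15:8  - high half of that mov     */
            desc[4] = (base >> 16) & 0xff;  /* bits 23:16 - the %al store at +4       */
            desc[7] = (base >> 24) & 0xff;  /* bits 31:24 - the %ah store at +4+3     */
    }
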
2613     --- a/arch/x86/kernel/io_apic_32-xen.c
2614     +++ b/arch/x86/kernel/io_apic_32-xen.c
2615     @@ -34,6 +34,7 @@
2616     #include <linux/pci.h>
2617     #include <linux/msi.h>
2618     #include <linux/htirq.h>
2619     +#include <linux/freezer.h>
2620    
2621     #include <asm/io.h>
2622     #include <asm/smp.h>
2623     @@ -194,14 +195,20 @@
2624     * the interrupt, and we need to make sure the entry is fully populated
2625     * before that happens.
2626     */
2627     -static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
2628     +static void
2629     +__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
2630     {
2631     - unsigned long flags;
2632     union entry_union eu;
2633     eu.entry = e;
2634     - spin_lock_irqsave(&ioapic_lock, flags);
2635     io_apic_write(apic, 0x11 + 2*pin, eu.w2);
2636     io_apic_write(apic, 0x10 + 2*pin, eu.w1);
2637     +}
2638     +
2639     +static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
2640     +{
2641     + unsigned long flags;
2642     + spin_lock_irqsave(&ioapic_lock, flags);
2643     + __ioapic_write_entry(apic, pin, e);
2644     spin_unlock_irqrestore(&ioapic_lock, flags);
2645     }
2646    
2647     @@ -883,8 +890,7 @@
2648    
2649     if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
2650     mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
2651     - mp_bus_id_to_type[lbus] == MP_BUS_MCA ||
2652     - mp_bus_id_to_type[lbus] == MP_BUS_NEC98
2653     + mp_bus_id_to_type[lbus] == MP_BUS_MCA
2654     ) &&
2655     (mp_irqs[i].mpc_irqtype == type) &&
2656     (mp_irqs[i].mpc_srcbusirq == irq))
2657     @@ -903,8 +909,7 @@
2658    
2659     if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
2660     mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
2661     - mp_bus_id_to_type[lbus] == MP_BUS_MCA ||
2662     - mp_bus_id_to_type[lbus] == MP_BUS_NEC98
2663     + mp_bus_id_to_type[lbus] == MP_BUS_MCA
2664     ) &&
2665     (mp_irqs[i].mpc_irqtype == type) &&
2666     (mp_irqs[i].mpc_srcbusirq == irq))
2667     @@ -1036,12 +1041,6 @@
2668     #define default_MCA_trigger(idx) (1)
2669     #define default_MCA_polarity(idx) (0)
2670    
2671     -/* NEC98 interrupts are always polarity zero edge triggered,
2672     - * when listed as conforming in the MP table. */
2673     -
2674     -#define default_NEC98_trigger(idx) (0)
2675     -#define default_NEC98_polarity(idx) (0)
2676     -
2677     static int __init MPBIOS_polarity(int idx)
2678     {
2679     int bus = mp_irqs[idx].mpc_srcbus;
2680     @@ -1076,11 +1075,6 @@
2681     polarity = default_MCA_polarity(idx);
2682     break;
2683     }
2684     - case MP_BUS_NEC98: /* NEC 98 pin */
2685     - {
2686     - polarity = default_NEC98_polarity(idx);
2687     - break;
2688     - }
2689     default:
2690     {
2691     printk(KERN_WARNING "broken BIOS!!\n");
2692     @@ -1150,11 +1144,6 @@
2693     trigger = default_MCA_trigger(idx);
2694     break;
2695     }
2696     - case MP_BUS_NEC98: /* NEC 98 pin */
2697     - {
2698     - trigger = default_NEC98_trigger(idx);
2699     - break;
2700     - }
2701     default:
2702     {
2703     printk(KERN_WARNING "broken BIOS!!\n");
2704     @@ -1216,7 +1205,6 @@
2705     case MP_BUS_ISA: /* ISA pin */
2706     case MP_BUS_EISA:
2707     case MP_BUS_MCA:
2708     - case MP_BUS_NEC98:
2709     {
2710     irq = mp_irqs[idx].mpc_srcbusirq;
2711     break;
2712     @@ -1284,7 +1272,7 @@
2713     }
2714    
2715     /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
2716     -u8 irq_vector[NR_IRQ_VECTORS] __read_mostly; /* = { FIRST_DEVICE_VECTOR , 0 }; */
2717     +static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly; /* = { FIRST_DEVICE_VECTOR , 0 }; */
2718    
2719     static int __assign_irq_vector(int irq)
2720     {
2721     @@ -1407,8 +1395,8 @@
2722     if (!apic && (irq < 16))
2723     disable_8259A_irq(irq);
2724     }
2725     - ioapic_write_entry(apic, pin, entry);
2726     spin_lock_irqsave(&ioapic_lock, flags);
2727     + __ioapic_write_entry(apic, pin, entry);
2728     set_native_irq_info(irq, TARGET_CPUS);
2729     spin_unlock_irqrestore(&ioapic_lock, flags);
2730     }
2731     @@ -1974,6 +1962,15 @@
2732     #endif
2733    
2734     #ifndef CONFIG_XEN
2735     +static int no_timer_check __initdata;
2736     +
2737     +static int __init notimercheck(char *s)
2738     +{
2739     + no_timer_check = 1;
2740     + return 1;
2741     +}
2742     +__setup("no_timer_check", notimercheck);
2743     +
2744     /*
2745     * There is a nasty bug in some older SMP boards, their mptable lies
2746     * about the timer IRQ. We do the following to work around the situation:
2747     @@ -1982,10 +1979,13 @@
2748     * - if this function detects that timer IRQs are defunct, then we fall
2749     * back to ISA timer IRQs
2750     */
2751     -static int __init timer_irq_works(void)
2752     +int __init timer_irq_works(void)
2753     {
2754     unsigned long t1 = jiffies;
2755    
2756     + if (no_timer_check)
2757     + return 1;
2758     +
2759     local_irq_enable();
2760     /* Let ten ticks pass... */
2761     mdelay((10 * 1000) / HZ);
2762     @@ -2212,9 +2212,15 @@
2763     unsigned char save_control, save_freq_select;
2764    
2765     pin = find_isa_irq_pin(8, mp_INT);
2766     + if (pin == -1) {
2767     + WARN_ON_ONCE(1);
2768     + return;
2769     + }
2770     apic = find_isa_irq_apic(8, mp_INT);
2771     - if (pin == -1)
2772     + if (apic == -1) {
2773     + WARN_ON_ONCE(1);
2774     return;
2775     + }
2776    
2777     entry0 = ioapic_read_entry(apic, pin);
2778     clear_IO_APIC_pin(apic, pin);
2779     @@ -2259,7 +2265,7 @@
2780     * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
2781     * fanatically on his truly buggy board.
2782     */
2783     -static inline void check_timer(void)
2784     +static inline void __init check_timer(void)
2785     {
2786     int apic1, pin1, apic2, pin2;
2787     int vector;
2788     @@ -2543,7 +2549,7 @@
2789     int create_irq(void)
2790     {
2791     /* Allocate an unused irq */
2792     - int irq, new, vector;
2793     + int irq, new, vector = 0;
2794     unsigned long flags;
2795    
2796     irq = -ENOSPC;
2797     @@ -2923,8 +2929,8 @@
2798     if (!ioapic && (irq < 16))
2799     disable_8259A_irq(irq);
2800    
2801     - ioapic_write_entry(ioapic, pin, entry);
2802     spin_lock_irqsave(&ioapic_lock, flags);
2803     + __ioapic_write_entry(ioapic, pin, entry);
2804     set_native_irq_info(irq, TARGET_CPUS);
2805     spin_unlock_irqrestore(&ioapic_lock, flags);
2806    
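
The recurring refactor in this file (and its 64-bit twin below) is the classic locked-wrapper split: a double-underscore helper that assumes ioapic_lock is already held, plus a thin wrapper that takes it. Call sites that also need set_native_irq_info() under the same lock switch to the __ variant inside one critical section; calling the locking version there would deadlock, since the spinlock is not recursive. Generic shape of the idiom, with placeholder names rather than the kernel's:

    #include <linux/spinlock.h>

    struct regfile {
            spinlock_t lock;
            unsigned int reg;
    };

    /* Caller must hold rf->lock. */
    static void __regfile_write(struct regfile *rf, unsigned int val)
    {
            rf->reg = val;
    }

    static void regfile_write(struct regfile *rf, unsigned int val)
    {
            unsigned long flags;

            spin_lock_irqsave(&rf->lock, flags);
            __regfile_write(rf, val);
            spin_unlock_irqrestore(&rf->lock, flags);
    }
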
2807     --- a/arch/x86/kernel/io_apic_64-xen.c
2808     +++ b/arch/x86/kernel/io_apic_64-xen.c
2809     @@ -199,14 +199,20 @@
2810     * the interrupt, and we need to make sure the entry is fully populated
2811     * before that happens.
2812     */
2813     -static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
2814     +static void
2815     +__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
2816     {
2817     - unsigned long flags;
2818     union entry_union eu;
2819     eu.entry = e;
2820     - spin_lock_irqsave(&ioapic_lock, flags);
2821     io_apic_write(apic, 0x11 + 2*pin, eu.w2);
2822     io_apic_write(apic, 0x10 + 2*pin, eu.w1);
2823     +}
2824     +
2825     +static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
2826     +{
2827     + unsigned long flags;
2828     + spin_lock_irqsave(&ioapic_lock, flags);
2829     + __ioapic_write_entry(apic, pin, e);
2830     spin_unlock_irqrestore(&ioapic_lock, flags);
2831     }
2832    
2833     @@ -714,6 +720,22 @@
2834     }
2835    
2836     #ifndef CONFIG_XEN
2837     +static void __clear_irq_vector(int irq)
2838     +{
2839     + cpumask_t mask;
2840     + int cpu, vector;
2841     +
2842     + BUG_ON(!irq_vector[irq]);
2843     +
2844     + vector = irq_vector[irq];
2845     + cpus_and(mask, irq_domain[irq], cpu_online_map);
2846     + for_each_cpu_mask(cpu, mask)
2847     + per_cpu(vector_irq, cpu)[vector] = -1;
2848     +
2849     + irq_vector[irq] = 0;
2850     + irq_domain[irq] = CPU_MASK_NONE;
2851     +}
2852     +
2853     void __setup_vector_irq(int cpu)
2854     {
2855     /* Initialize vector_irq on a new cpu */
2856     @@ -761,26 +783,65 @@
2857     #define ioapic_register_intr(_irq,_vector,_trigger) ((void)0)
2858     #endif /* !CONFIG_XEN */
2859    
2860     -static void __init setup_IO_APIC_irqs(void)
2861     +static void __init setup_IO_APIC_irq(int apic, int pin, int idx, int irq)
2862     {
2863     struct IO_APIC_route_entry entry;
2864     - int apic, pin, idx, irq, first_notcon = 1, vector;
2865     + int vector;
2866     unsigned long flags;
2867    
2868     - apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
2869    
2870     - for (apic = 0; apic < nr_ioapics; apic++) {
2871     - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
2872     + /*
2873     + * add it to the IO-APIC irq-routing table:
2874     + */
2875     + memset(&entry,0,sizeof(entry));
2876    
2877     - /*
2878     - * add it to the IO-APIC irq-routing table:
2879     - */
2880     - memset(&entry,0,sizeof(entry));
2881     + entry.delivery_mode = INT_DELIVERY_MODE;
2882     + entry.dest_mode = INT_DEST_MODE;
2883     + entry.mask = 0; /* enable IRQ */
2884     + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
2885    
2886     - entry.delivery_mode = INT_DELIVERY_MODE;
2887     - entry.dest_mode = INT_DEST_MODE;
2888     - entry.mask = 0; /* enable IRQ */
2889     + entry.trigger = irq_trigger(idx);
2890     + entry.polarity = irq_polarity(idx);
2891     +
2892     + if (irq_trigger(idx)) {
2893     + entry.trigger = 1;
2894     + entry.mask = 1;
2895     entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
2896     + }
2897     +
2898     + if (/* !apic && */ !IO_APIC_IRQ(irq))
2899     + return;
2900     +
2901     + if (IO_APIC_IRQ(irq)) {
2902     + cpumask_t mask;
2903     + vector = assign_irq_vector(irq, TARGET_CPUS, &mask);
2904     + if (vector < 0)
2905     + return;
2906     +
2907     + entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask);
2908     + entry.vector = vector;
2909     +
2910     + ioapic_register_intr(irq, vector, IOAPIC_AUTO);
2911     + if (!apic && (irq < 16))
2912     + disable_8259A_irq(irq);
2913     + }
2914     +
2915     + ioapic_write_entry(apic, pin, entry);
2916     +
2917     + spin_lock_irqsave(&ioapic_lock, flags);
2918     + set_native_irq_info(irq, TARGET_CPUS);
2919     + spin_unlock_irqrestore(&ioapic_lock, flags);
2920     +
2921     +}
2922     +
2923     +static void __init setup_IO_APIC_irqs(void)
2924     +{
2925     + int apic, pin, idx, irq, first_notcon = 1;
2926     +
2927     + apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
2928     +
2929     + for (apic = 0; apic < nr_ioapics; apic++) {
2930     + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
2931    
2932     idx = find_irq_entry(apic,pin,mp_INT);
2933     if (idx == -1) {
2934     @@ -792,39 +853,11 @@
2935     continue;
2936     }
2937    
2938     - entry.trigger = irq_trigger(idx);
2939     - entry.polarity = irq_polarity(idx);
2940     -
2941     - if (irq_trigger(idx)) {
2942     - entry.trigger = 1;
2943     - entry.mask = 1;
2944     - entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
2945     - }
2946     -
2947     irq = pin_2_irq(idx, apic, pin);
2948     add_pin_to_irq(irq, apic, pin);
2949    
2950     - if (/* !apic && */ !IO_APIC_IRQ(irq))
2951     - continue;
2952     -
2953     - if (IO_APIC_IRQ(irq)) {
2954     - cpumask_t mask;
2955     - vector = assign_irq_vector(irq, TARGET_CPUS, &mask);
2956     - if (vector < 0)
2957     - continue;
2958     -
2959     - entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask);
2960     - entry.vector = vector;
2961     + setup_IO_APIC_irq(apic, pin, idx, irq);
2962    
2963     - ioapic_register_intr(irq, vector, IOAPIC_AUTO);
2964     - if (!apic && (irq < 16))
2965     - disable_8259A_irq(irq);
2966     - }
2967     - ioapic_write_entry(apic, pin, entry);
2968     -
2969     - spin_lock_irqsave(&ioapic_lock, flags);
2970     - set_native_irq_info(irq, TARGET_CPUS);
2971     - spin_unlock_irqrestore(&ioapic_lock, flags);
2972     }
2973     }
2974    
2975     @@ -1819,7 +1852,7 @@
2976     dynamic_irq_cleanup(irq);
2977    
2978     spin_lock_irqsave(&vector_lock, flags);
2979     - irq_vector[irq] = 0;
2980     + __clear_irq_vector(irq);
2981     spin_unlock_irqrestore(&vector_lock, flags);
2982     }
2983     #endif
2984     @@ -2123,7 +2156,15 @@
2985     if (irq_entry == -1)
2986     continue;
2987     irq = pin_2_irq(irq_entry, ioapic, pin);
2988     - set_ioapic_affinity_irq(irq, TARGET_CPUS);
2989     +
2990     + /* setup_IO_APIC_irqs could fail to get vector for some device
2991     + * when you have too many devices, because at that time only boot
2992     + * cpu is online.
2993     + */
2994     + if(!irq_vector[irq])
2995     + setup_IO_APIC_irq(ioapic, pin, irq_entry, irq);
2996     + else
2997     + set_ioapic_affinity_irq(irq, TARGET_CPUS);
2998     }
2999    
3000     }
3001     --- a/arch/x86/kernel/irq_64-xen.c
3002     +++ b/arch/x86/kernel/irq_64-xen.c
3003     @@ -120,7 +120,7 @@
3004    
3005     if (likely(irq < NR_IRQS))
3006     generic_handle_irq(irq);
3007     - else
3008     + else if (printk_ratelimit())
3009     printk(KERN_EMERG "%s: %d.%d No irq handler for irq\n",
3010     __func__, smp_processor_id(), irq);
3011    
3012     --- a/arch/x86/kernel/ldt_32-xen.c
3013     +++ b/arch/x86/kernel/ldt_32-xen.c
3014     @@ -177,16 +177,14 @@
3015     {
3016     int err;
3017     unsigned long size;
3018     - void *address;
3019    
3020     err = 0;
3021     - address = &default_ldt[0];
3022     size = 5*sizeof(struct desc_struct);
3023     if (size > bytecount)
3024     size = bytecount;
3025    
3026     err = size;
3027     - if (copy_to_user(ptr, address, size))
3028     + if (clear_user(ptr, size))
3029     err = -EFAULT;
3030    
3031     return err;
3032     --- a/arch/x86/kernel/microcode-xen.c
3033     +++ b/arch/x86/kernel/microcode-xen.c
3034     @@ -1,7 +1,7 @@
3035     /*
3036     * Intel CPU Microcode Update Driver for Linux
3037     *
3038     - * Copyright (C) 2000-2004 Tigran Aivazian
3039     + * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
3040     * 2006 Shaohua Li <shaohua.li@intel.com>
3041     *
3042     * This driver allows to upgrade microcode on Intel processors
3043     @@ -43,7 +43,7 @@
3044     #include <asm/processor.h>
3045    
3046     MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver");
3047     -MODULE_AUTHOR("Tigran Aivazian <tigran@veritas.com>");
3048     +MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
3049     MODULE_LICENSE("GPL");
3050    
3051     static int verbose;
3052     @@ -195,7 +195,7 @@
3053     request_microcode();
3054    
3055     printk(KERN_INFO
3056     - "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@veritas.com>\n");
3057     + "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
3058     return 0;
3059     }
3060    
3061     --- a/arch/x86/kernel/mpparse_32-xen.c
3062     +++ b/arch/x86/kernel/mpparse_32-xen.c
3063     @@ -36,7 +36,7 @@
3064    
3065     /* Have we found an MP table */
3066     int smp_found_config;
3067     -unsigned int __initdata maxcpus = NR_CPUS;
3068     +unsigned int __cpuinitdata maxcpus = NR_CPUS;
3069    
3070     /*
3071     * Various Linux-internal data structures created from the
3072     @@ -102,10 +102,10 @@
3073     */
3074    
3075     static int mpc_record;
3076     -static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __initdata;
3077     +static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __cpuinitdata;
3078    
3079     #ifndef CONFIG_XEN
3080     -static void __devinit MP_processor_info (struct mpc_config_processor *m)
3081     +static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
3082     {
3083     int ver, apicid;
3084     physid_mask_t phys_cpu;
3085     @@ -221,7 +221,7 @@
3086     bios_cpu_apicid[num_processors - 1] = m->mpc_apicid;
3087     }
3088     #else
3089     -void __init MP_processor_info (struct mpc_config_processor *m)
3090     +static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
3091     {
3092     num_processors++;
3093     }
3094     @@ -256,8 +256,6 @@
3095     mp_current_pci_id++;
3096     } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) {
3097     mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
3098     - } else if (strncmp(str, BUSTYPE_NEC98, sizeof(BUSTYPE_NEC98)-1) == 0) {
3099     - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_NEC98;
3100     } else {
3101     printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
3102     }
3103     @@ -842,7 +840,7 @@
3104     #endif
3105     }
3106    
3107     -void __devinit mp_register_lapic (u8 id, u8 enabled)
3108     +void __cpuinit mp_register_lapic (u8 id, u8 enabled)
3109     {
3110     struct mpc_config_processor processor;
3111     int boot_cpu = 0;
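
The __initdata-to-__cpuinitdata and __devinit/__init-to-__cpuinit conversions above matter for CPU hotplug: plain __init text and data are freed once boot finishes, but MP_processor_info() and friends can run again when a CPU is brought online later. A simplified form of the include/linux/init.h logic of that era (sketch):

    /* When hotplug is possible the annotations become no-ops and the
     * code stays resident; otherwise they decay to __init/__initdata
     * and are discarded after boot. */
    #ifdef CONFIG_HOTPLUG_CPU
    #define __cpuinit
    #define __cpuinitdata
    #else
    #define __cpuinit       __init
    #define __cpuinitdata   __initdata
    #endif
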
3112     --- a/arch/x86/kernel/mpparse_64-xen.c
3113     +++ b/arch/x86/kernel/mpparse_64-xen.c
3114     @@ -35,8 +35,6 @@
3115     int smp_found_config;
3116     unsigned int __initdata maxcpus = NR_CPUS;
3117    
3118     -int acpi_found_madt;
3119     -
3120     /*
3121     * Various Linux-internal data structures created from the
3122     * MP-table.
3123     --- a/arch/x86/kernel/pci-dma_32-xen.c
3124     +++ b/arch/x86/kernel/pci-dma_32-xen.c
3125     @@ -282,7 +282,7 @@
3126     int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
3127     dma_addr_t device_addr, size_t size, int flags)
3128     {
3129     - void __iomem *mem_base;
3130     + void __iomem *mem_base = NULL;
3131     int pages = size >> PAGE_SHIFT;
3132     int bitmap_size = (pages + 31)/32;
3133    
3134     @@ -299,14 +299,12 @@
3135     if (!mem_base)
3136     goto out;
3137    
3138     - dev->dma_mem = kmalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
3139     + dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
3140     if (!dev->dma_mem)
3141     goto out;
3142     - memset(dev->dma_mem, 0, sizeof(struct dma_coherent_mem));
3143     - dev->dma_mem->bitmap = kmalloc(bitmap_size, GFP_KERNEL);
3144     + dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
3145     if (!dev->dma_mem->bitmap)
3146     goto free1_out;
3147     - memset(dev->dma_mem->bitmap, 0, bitmap_size);
3148    
3149     dev->dma_mem->virt_base = mem_base;
3150     dev->dma_mem->device_base = device_addr;
3151     @@ -321,6 +319,8 @@
3152     free1_out:
3153     kfree(dev->dma_mem->bitmap);
3154     out:
3155     + if (mem_base)
3156     + iounmap(mem_base);
3157     return 0;
3158     }
3159     EXPORT_SYMBOL(dma_declare_coherent_memory);
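
Two independent fixes hide in this hunk: the kmalloc()+memset() pairs collapse into kzalloc(), and the error path stops leaking the ioremap() mapping -- mem_base now starts as NULL and is unmapped at the shared out: label. The general unwind shape, sketched with placeholder names:

    struct ctx { void *a, *b; };

    extern void *acquire_a(struct ctx *c);  /* placeholders, not real APIs */
    extern void *acquire_b(struct ctx *c);
    extern void release_a(void *a);

    /* Initialise every resource to NULL, bail to one label, and
     * release only what was actually acquired. */
    static int setup_two(struct ctx *c)
    {
            void *a = NULL, *b = NULL;

            a = acquire_a(c);
            if (!a)
                    goto out;
            b = acquire_b(c);
            if (!b)
                    goto out;

            c->a = a;
            c->b = b;
            return 0;
    out:
            if (a)
                    release_a(a);
            return -1;  /* the hunk above returns 0 for failure, per that driver's API */
    }
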
3160     --- a/arch/x86/kernel/process_32-xen.c
3161     +++ b/arch/x86/kernel/process_32-xen.c
3162     @@ -60,6 +60,7 @@
3163    
3164     #include <asm/tlbflush.h>
3165     #include <asm/cpu.h>
3166     +#include <asm/pda.h>
3167    
3168     asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
3169    
3170     @@ -104,28 +105,24 @@
3171     */
3172     static void poll_idle (void)
3173     {
3174     - local_irq_enable();
3175     -
3176     - asm volatile(
3177     - "2:"
3178     - "testl %0, %1;"
3179     - "rep; nop;"
3180     - "je 2b;"
3181     - : : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags));
3182     + cpu_relax();
3183     }
3184    
3185     static void xen_idle(void)
3186     {
3187     - local_irq_disable();
3188     + current_thread_info()->status &= ~TS_POLLING;
3189     + /*
3190     + * TS_POLLING-cleared state must be visible before we
3191     + * test NEED_RESCHED:
3192     + */
3193     + smp_mb();
3194    
3195     - if (need_resched())
3196     + local_irq_disable();
3197     + if (!need_resched())
3198     + safe_halt(); /* enables interrupts racelessly */
3199     + else
3200     local_irq_enable();
3201     - else {
3202     - current_thread_info()->status &= ~TS_POLLING;
3203     - smp_mb__after_clear_bit();
3204     - safe_halt();
3205     - current_thread_info()->status |= TS_POLLING;
3206     - }
3207     + current_thread_info()->status |= TS_POLLING;
3208     }
3209     #ifdef CONFIG_APM_MODULE
3210     EXPORT_SYMBOL(default_idle);
3211     @@ -250,8 +247,8 @@
3212     regs->eax,regs->ebx,regs->ecx,regs->edx);
3213     printk("ESI: %08lx EDI: %08lx EBP: %08lx",
3214     regs->esi, regs->edi, regs->ebp);
3215     - printk(" DS: %04x ES: %04x\n",
3216     - 0xffff & regs->xds,0xffff & regs->xes);
3217     + printk(" DS: %04x ES: %04x GS: %04x\n",
3218     + 0xffff & regs->xds,0xffff & regs->xes, 0xffff & regs->xgs);
3219    
3220     cr0 = read_cr0();
3221     cr2 = read_cr2();
3222     @@ -282,6 +279,7 @@
3223    
3224     regs.xds = __USER_DS;
3225     regs.xes = __USER_DS;
3226     + regs.xgs = __KERNEL_PDA;
3227     regs.orig_eax = -1;
3228     regs.eip = (unsigned long) kernel_thread_helper;
3229     regs.xcs = __KERNEL_CS | get_kernel_rpl();
3230     @@ -359,7 +357,6 @@
3231     p->thread.eip = (unsigned long) ret_from_fork;
3232    
3233     savesegment(fs,p->thread.fs);
3234     - savesegment(gs,p->thread.gs);
3235    
3236     tsk = current;
3237     if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
3238     @@ -438,7 +435,7 @@
3239     dump->regs.ds = regs->xds;
3240     dump->regs.es = regs->xes;
3241     savesegment(fs,dump->regs.fs);
3242     - savesegment(gs,dump->regs.gs);
3243     + dump->regs.gs = regs->xgs;
3244     dump->regs.orig_eax = regs->orig_eax;
3245     dump->regs.eip = regs->eip;
3246     dump->regs.cs = regs->xcs;
3247     @@ -614,17 +611,19 @@
3248     if (unlikely(HYPERVISOR_multicall_check(_mcl, mcl - _mcl, NULL)))
3249     BUG();
3250    
3251     + /* we're going to use this soon, after a few expensive things */
3252     + if (next_p->fpu_counter > 5)
3253     + prefetch(&next->i387.fxsave);
3254     +
3255     /*
3256     - * Restore %fs and %gs if needed.
3257     + * Restore %fs if needed.
3258     *
3259     - * Glibc normally makes %fs be zero, and %gs is one of
3260     - * the TLS segments.
3261     + * Glibc normally makes %fs be zero.
3262     */
3263     if (unlikely(next->fs))
3264     loadsegment(fs, next->fs);
3265    
3266     - if (next->gs)
3267     - loadsegment(gs, next->gs);
3268     + write_pda(pcurrent, next_p);
3269    
3270     /*
3271     * Now maybe handle debug registers
3272     @@ -634,6 +633,13 @@
3273    
3274     disable_tsc(prev_p, next_p);
3275    
3276     + /* If the task has used fpu the last 5 timeslices, just do a full
3277     + * restore of the math state immediately to avoid the trap; the
3278     + * chances of needing FPU soon are obviously high now
3279     + */
3280     + if (next_p->fpu_counter > 5)
3281     + math_state_restore();
3282     +
3283     return prev_p;
3284     }
3285    
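The idle rework above hinges on ordering: xen_idle() clears TS_POLLING *before* testing need_resched(), with smp_mb() in between, so that a remote CPU which sets TIF_NEED_RESCHED and then checks TS_POLLING can never see stale polling state and skip the wake-up IPI while this CPU halts. A minimal userspace sketch of that store-fence-load pattern, using C11 atomics as stand-ins for the kernel primitives (the variable and function names here are illustrative, not the kernel's):

#include <stdatomic.h>
#include <stdbool.h>

static atomic_bool polling = true;        /* stands in for TS_POLLING */
static atomic_bool need_resched_flag;     /* stands in for TIF_NEED_RESCHED */

static void idle_cpu(void)
{
        atomic_store(&polling, false);             /* status &= ~TS_POLLING */
        atomic_thread_fence(memory_order_seq_cst); /* smp_mb() */
        if (!atomic_load(&need_resched_flag)) {
                /* safe to halt: any setter running after the fence sees
                 * polling == false and sends a wake-up interrupt instead
                 * of assuming this CPU is still watching the flag */
        }
        atomic_store(&polling, true);
}

static void waking_cpu(void)
{
        atomic_store(&need_resched_flag, true);
        atomic_thread_fence(memory_order_seq_cst);
        if (!atomic_load(&polling)) {
                /* a real kernel would send the IPI here */
        }
}

int main(void)
{
        idle_cpu();
        waking_cpu();
        return 0;
}

Without the fence, the idle CPU's load of the flag could be ordered before its store clearing the polling bit, and both sides could conclude no wake-up is needed.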
3286     --- a/arch/x86/kernel/process_64-xen.c
3287     +++ b/arch/x86/kernel/process_64-xen.c
3288     @@ -119,29 +119,23 @@
3289     static void poll_idle (void)
3290     {
3291     local_irq_enable();
3292     -
3293     - asm volatile(
3294     - "2:"
3295     - "testl %0,%1;"
3296     - "rep; nop;"
3297     - "je 2b;"
3298     - : :
3299     - "i" (_TIF_NEED_RESCHED),
3300     - "m" (current_thread_info()->flags));
3301     + cpu_relax();
3302     }
3303    
3304     static void xen_idle(void)
3305     {
3306     + current_thread_info()->status &= ~TS_POLLING;
3307     + /*
3308     + * TS_POLLING-cleared state must be visible before we
3309     + * test NEED_RESCHED:
3310     + */
3311     + smp_mb();
3312     local_irq_disable();
3313     -
3314     - if (need_resched())
3315     - local_irq_enable();
3316     - else {
3317     - current_thread_info()->status &= ~TS_POLLING;
3318     - smp_mb__after_clear_bit();
3319     + if (!need_resched())
3320     safe_halt();
3321     - current_thread_info()->status |= TS_POLLING;
3322     - }
3323     + else
3324     + local_irq_enable();
3325     + current_thread_info()->status |= TS_POLLING;
3326     }
3327    
3328     #ifdef CONFIG_HOTPLUG_CPU
3329     @@ -181,6 +175,12 @@
3330     idle = xen_idle; /* no alternatives */
3331     if (cpu_is_offline(smp_processor_id()))
3332     play_dead();
3333     + /*
3334     + * Idle routines should keep interrupts disabled
3335     + * from here on, until they go to idle.
3336     + * Otherwise, idle callbacks can misfire.
3337     + */
3338     + local_irq_disable();
3339     enter_idle();
3340     idle();
3341     /* In many cases the interrupt that ended idle
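The hunk above also tightens the idle loop's contract: interrupts are now disabled before the idle routine is entered, and the routine either halts with interrupts re-enabled atomically (safe_halt(), i.e. sti;hlt on bare x86, the event-channel equivalent under Xen) or re-enables them explicitly. A sketch of the resulting shape, not the literal kernel code:

        for (;;) {
                local_irq_disable();    /* the line this hunk adds */
                if (!need_resched())
                        safe_halt();    /* re-enables IRQs atomically with the halt */
                else
                        local_irq_enable();
        }

This closes the window in which an interrupt arriving between the need_resched() test and the halt would otherwise go unnoticed until the next wake-up, which is what the "idle callbacks can misfire" comment refers to.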
3342     --- a/arch/x86/kernel/quirks-xen.c
3343     +++ b/arch/x86/kernel/quirks-xen.c
3344     @@ -3,10 +3,12 @@
3345     */
3346     #include <linux/pci.h>
3347     #include <linux/irq.h>
3348     +#include <asm/pci-direct.h>
3349     +#include <asm/genapic.h>
3350     +#include <asm/cpu.h>
3351    
3352     #if defined(CONFIG_X86_IO_APIC) && (defined(CONFIG_SMP) || defined(CONFIG_XEN)) && defined(CONFIG_PCI)
3353     -
3354     -static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
3355     +static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev)
3356     {
3357     u8 config, rev;
3358     u32 word;
3359     @@ -14,14 +16,12 @@
3360     /* BIOS may enable hardware IRQ balancing for
3361     * E7520/E7320/E7525(revision ID 0x9 and below)
3362     * based platforms.
3363     - * Disable SW irqbalance/affinity on those platforms.
3364     + * For those platforms, make sure that the genapic is set to 'flat'
3365     */
3366     pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
3367     if (rev > 0x9)
3368     return;
3369    
3370     - printk(KERN_INFO "Intel E7520/7320/7525 detected.");
3371     -
3372     /* enable access to config space*/
3373     pci_read_config_byte(dev, 0xf4, &config);
3374     pci_write_config_byte(dev, 0xf4, config|0x2);
3375     @@ -30,6 +30,46 @@
3376     raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
3377    
3378     if (!(word & (1 << 13))) {
3379     +#ifndef CONFIG_XEN
3380     +#ifdef CONFIG_X86_64
3381     + if (genapic != &apic_flat)
3382     + panic("APIC mode must be flat on this system\n");
3383     +#elif defined(CONFIG_X86_GENERICARCH)
3384     + if (genapic != &apic_default)
3385     + panic("APIC mode must be default(flat) on this system. Use apic=default\n");
3386     +#endif
3387     +#endif
3388     + }
3389     +
3390     + /* put back the original value for config space*/
3391     + if (!(config & 0x2))
3392     + pci_write_config_byte(dev, 0xf4, config);
3393     +}
3394     +
3395     +void __init quirk_intel_irqbalance(void)
3396     +{
3397     + u8 config, rev;
3398     + u32 word;
3399     +
3400     + /* BIOS may enable hardware IRQ balancing for
3401     + * E7520/E7320/E7525(revision ID 0x9 and below)
3402     + * based platforms.
3403     + * Disable SW irqbalance/affinity on those platforms.
3404     + */
3405     + rev = read_pci_config_byte(0, 0, 0, PCI_CLASS_REVISION);
3406     + if (rev > 0x9)
3407     + return;
3408     +
3409     + printk(KERN_INFO "Intel E7520/7320/7525 detected.");
3410     +
3411     + /* enable access to config space */
3412     + config = read_pci_config_byte(0, 0, 0, 0xf4);
3413     + write_pci_config_byte(0, 0, 0, 0xf4, config|0x2);
3414     +
3415     + /* read xTPR register */
3416     + word = read_pci_config_16(0, 0, 0x40, 0x4c);
3417     +
3418     + if (!(word & (1 << 13))) {
3419     struct xen_platform_op op;
3420     printk(KERN_INFO "Disabling irq balancing and affinity\n");
3421     op.cmd = XENPF_platform_quirk;
3422     @@ -37,11 +77,12 @@
3423     WARN_ON(HYPERVISOR_platform_op(&op));
3424     }
3425    
3426     - /* put back the original value for config space*/
3427     + /* put back the original value for config space */
3428     if (!(config & 0x2))
3429     - pci_write_config_byte(dev, 0xf4, config);
3430     + write_pci_config_byte(0, 0, 0, 0xf4, config);
3431     }
3432     -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance);
3433     -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance);
3434     -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance);
3435     +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, verify_quirk_intel_irqbalance);
3436     +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, verify_quirk_intel_irqbalance);
3437     +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, verify_quirk_intel_irqbalance);
3438     +
3439     #endif
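The quirk is split in two above because part of it now has to run before the PCI subsystem exists: the __init quirk_intel_irqbalance() uses the raw type-1 config accessors from <asm/pci-direct.h> (read_pci_config_byte() and friends), while the PCI fixup that used to do the work is reduced to verifying the genapic mode. Either way the decision rests on bit 13 of the xTPR word at config offset 0x4c. A standalone sketch of just that test; the helper name is mine, not the kernel's:

#include <stdint.h>
#include <stdio.h>

/* Bit 13 set => the chipset suppresses hardware IRQ balancing, nothing
 * to do; bit 13 clear => the quirk disables SW irq balancing/affinity
 * via the XENPF_platform_quirk hypercall. */
static int needs_irqbalance_quirk(uint16_t xtpr_word)
{
        return !(xtpr_word & (1u << 13));
}

int main(void)
{
        printf("%d\n", needs_irqbalance_quirk(0x0000));   /* 1: quirk applies */
        printf("%d\n", needs_irqbalance_quirk(1u << 13)); /* 0: nothing to do */
        return 0;
}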
3440     --- a/arch/x86/kernel/setup_32-xen.c
3441     +++ b/arch/x86/kernel/setup_32-xen.c
3442     @@ -76,9 +76,6 @@
3443     #include <xen/interface/kexec.h>
3444     #endif
3445    
3446     -/* Forward Declaration. */
3447     -void __init find_max_pfn(void);
3448     -
3449     static int xen_panic_event(struct notifier_block *, unsigned long, void *);
3450     static struct notifier_block xen_panic_block = {
3451     xen_panic_event, NULL, 0 /* try to go last */
3452     @@ -92,14 +89,11 @@
3453     /*
3454     * Machine setup..
3455     */
3456     -
3457     -#ifdef CONFIG_EFI
3458     -int efi_enabled = 0;
3459     -EXPORT_SYMBOL(efi_enabled);
3460     -#endif
3461     +extern struct resource code_resource;
3462     +extern struct resource data_resource;
3463    
3464     /* cpu data as detected by the assembly code in head.S */
3465     -struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
3466     +struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
3467     /* common cpu data for all cpus */
3468     struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
3469     EXPORT_SYMBOL(boot_cpu_data);
3470     @@ -115,12 +109,6 @@
3471     unsigned int BIOS_revision;
3472     unsigned int mca_pentium_flag;
3473    
3474     -/* For PCI or other memory-mapped resources */
3475     -unsigned long pci_mem_start = 0x10000000;
3476     -#ifdef CONFIG_PCI
3477     -EXPORT_SYMBOL(pci_mem_start);
3478     -#endif
3479     -
3480     /* Boot loader ID as an integer, for the benefit of proc_dointvec */
3481     int bootloader_type;
3482    
3483     @@ -153,10 +141,6 @@
3484     defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
3485     EXPORT_SYMBOL(ist_info);
3486     #endif
3487     -struct e820map e820;
3488     -#ifdef CONFIG_XEN
3489     -struct e820map machine_e820;
3490     -#endif
3491    
3492     extern void early_cpu_init(void);
3493     extern int root_mountflags;
3494     @@ -171,209 +155,6 @@
3495    
3496     unsigned char __initdata boot_params[PARAM_SIZE];
3497    
3498     -static struct resource data_resource = {
3499     - .name = "Kernel data",
3500     - .start = 0,
3501     - .end = 0,
3502     - .flags = IORESOURCE_BUSY | IORESOURCE_MEM
3503     -};
3504     -
3505     -static struct resource code_resource = {
3506     - .name = "Kernel code",
3507     - .start = 0,
3508     - .end = 0,
3509     - .flags = IORESOURCE_BUSY | IORESOURCE_MEM
3510     -};
3511     -
3512     -static struct resource system_rom_resource = {
3513     - .name = "System ROM",
3514     - .start = 0xf0000,
3515     - .end = 0xfffff,
3516     - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
3517     -};
3518     -
3519     -static struct resource extension_rom_resource = {
3520     - .name = "Extension ROM",
3521     - .start = 0xe0000,
3522     - .end = 0xeffff,
3523     - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
3524     -};
3525     -
3526     -static struct resource adapter_rom_resources[] = { {
3527     - .name = "Adapter ROM",
3528     - .start = 0xc8000,
3529     - .end = 0,
3530     - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
3531     -}, {
3532     - .name = "Adapter ROM",
3533     - .start = 0,
3534     - .end = 0,
3535     - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
3536     -}, {
3537     - .name = "Adapter ROM",
3538     - .start = 0,
3539     - .end = 0,
3540     - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
3541     -}, {
3542     - .name = "Adapter ROM",
3543     - .start = 0,
3544     - .end = 0,
3545     - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
3546     -}, {
3547     - .name = "Adapter ROM",
3548     - .start = 0,
3549     - .end = 0,
3550     - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
3551     -}, {
3552     - .name = "Adapter ROM",
3553     - .start = 0,
3554     - .end = 0,
3555     - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
3556     -} };
3557     -
3558     -static struct resource video_rom_resource = {
3559     - .name = "Video ROM",
3560     - .start = 0xc0000,
3561     - .end = 0xc7fff,
3562     - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
3563     -};
3564     -
3565     -static struct resource video_ram_resource = {
3566     - .name = "Video RAM area",
3567     - .start = 0xa0000,
3568     - .end = 0xbffff,
3569     - .flags = IORESOURCE_BUSY | IORESOURCE_MEM
3570     -};
3571     -
3572     -static struct resource standard_io_resources[] = { {
3573     - .name = "dma1",
3574     - .start = 0x0000,
3575     - .end = 0x001f,
3576     - .flags = IORESOURCE_BUSY | IORESOURCE_IO
3577     -}, {
3578     - .name = "pic1",
3579     - .start = 0x0020,
3580     - .end = 0x0021,
3581     - .flags = IORESOURCE_BUSY | IORESOURCE_IO
3582     -}, {
3583     - .name = "timer0",
3584     - .start = 0x0040,
3585     - .end = 0x0043,
3586     - .flags = IORESOURCE_BUSY | IORESOURCE_IO
3587     -}, {
3588     - .name = "timer1",
3589     - .start = 0x0050,
3590     - .end = 0x0053,
3591     - .flags = IORESOURCE_BUSY | IORESOURCE_IO
3592     -}, {
3593     - .name = "keyboard",
3594     - .start = 0x0060,
3595     - .end = 0x006f,
3596     - .flags = IORESOURCE_BUSY | IORESOURCE_IO
3597     -}, {
3598     - .name = "dma page reg",
3599     - .start = 0x0080,
3600     - .end = 0x008f,
3601     - .flags = IORESOURCE_BUSY | IORESOURCE_IO
3602     -}, {
3603     - .name = "pic2",
3604     - .start = 0x00a0,
3605     - .end = 0x00a1,
3606     - .flags = IORESOURCE_BUSY | IORESOURCE_IO
3607     -}, {
3608     - .name = "dma2",
3609     - .start = 0x00c0,
3610     - .end = 0x00df,
3611     - .flags = IORESOURCE_BUSY | IORESOURCE_IO
3612     -}, {
3613     - .name = "fpu",
3614     - .start = 0x00f0,
3615     - .end = 0x00ff,
3616     - .flags = IORESOURCE_BUSY | IORESOURCE_IO
3617     -} };
3618     -
3619     -#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
3620     -
3621     -static int __init romchecksum(unsigned char *rom, unsigned long length)
3622     -{
3623     - unsigned char *p, sum = 0;
3624     -
3625     - for (p = rom; p < rom + length; p++)
3626     - sum += *p;
3627     - return sum == 0;
3628     -}
3629     -
3630     -static void __init probe_roms(void)
3631     -{
3632     - unsigned long start, length, upper;
3633     - unsigned char *rom;
3634     - int i;
3635     -
3636     -#ifdef CONFIG_XEN
3637     - /* Nothing to do if not running in dom0. */
3638     - if (!is_initial_xendomain())
3639     - return;
3640     -#endif
3641     -
3642     - /* video rom */
3643     - upper = adapter_rom_resources[0].start;
3644     - for (start = video_rom_resource.start; start < upper; start += 2048) {
3645     - rom = isa_bus_to_virt(start);
3646     - if (!romsignature(rom))
3647     - continue;
3648     -
3649     - video_rom_resource.start = start;
3650     -
3651     - /* 0 < length <= 0x7f * 512, historically */
3652     - length = rom[2] * 512;
3653     -
3654     - /* if checksum okay, trust length byte */
3655     - if (length && romchecksum(rom, length))
3656     - video_rom_resource.end = start + length - 1;
3657     -
3658     - request_resource(&iomem_resource, &video_rom_resource);
3659     - break;
3660     - }
3661     -
3662     - start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
3663     - if (start < upper)
3664     - start = upper;
3665     -
3666     - /* system rom */
3667     - request_resource(&iomem_resource, &system_rom_resource);
3668     - upper = system_rom_resource.start;
3669     -
3670     - /* check for extension rom (ignore length byte!) */
3671     - rom = isa_bus_to_virt(extension_rom_resource.start);
3672     - if (romsignature(rom)) {
3673     - length = extension_rom_resource.end - extension_rom_resource.start + 1;
3674     - if (romchecksum(rom, length)) {
3675     - request_resource(&iomem_resource, &extension_rom_resource);
3676     - upper = extension_rom_resource.start;
3677     - }
3678     - }
3679     -
3680     - /* check for adapter roms on 2k boundaries */
3681     - for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
3682     - rom = isa_bus_to_virt(start);
3683     - if (!romsignature(rom))
3684     - continue;
3685     -
3686     - /* 0 < length <= 0x7f * 512, historically */
3687     - length = rom[2] * 512;
3688     -
3689     - /* but accept any length that fits if checksum okay */
3690     - if (!length || start + length > upper || !romchecksum(rom, length))
3691     - continue;
3692     -
3693     - adapter_rom_resources[i].start = start;
3694     - adapter_rom_resources[i].end = start + length - 1;
3695     - request_resource(&iomem_resource, &adapter_rom_resources[i]);
3696     -
3697     - start = adapter_rom_resources[i++].end & ~2047UL;
3698     - }
3699     -}
3700     -
3701     /*
3702     * Point at the empty zero page to start with. We map the real shared_info
3703     * page as soon as fixmap is up and running.
3704     @@ -389,338 +170,6 @@
3705     start_info_t *xen_start_info;
3706     EXPORT_SYMBOL(xen_start_info);
3707    
3708     -void __init add_memory_region(unsigned long long start,
3709     - unsigned long long size, int type)
3710     -{
3711     - int x;
3712     -
3713     - if (!efi_enabled) {
3714     - x = e820.nr_map;
3715     -
3716     - if (x == E820MAX) {
3717     - printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
3718     - return;
3719     - }
3720     -
3721     - e820.map[x].addr = start;
3722     - e820.map[x].size = size;
3723     - e820.map[x].type = type;
3724     - e820.nr_map++;
3725     - }
3726     -} /* add_memory_region */
3727     -
3728     -static void __init limit_regions(unsigned long long size)
3729     -{
3730     - unsigned long long current_addr = 0;
3731     - int i;
3732     -
3733     - if (efi_enabled) {
3734     - efi_memory_desc_t *md;
3735     - void *p;
3736     -
3737     - for (p = memmap.map, i = 0; p < memmap.map_end;
3738     - p += memmap.desc_size, i++) {
3739     - md = p;
3740     - current_addr = md->phys_addr + (md->num_pages << 12);
3741     - if (md->type == EFI_CONVENTIONAL_MEMORY) {
3742     - if (current_addr >= size) {
3743     - md->num_pages -=
3744     - (((current_addr-size) + PAGE_SIZE-1) >> PAGE_SHIFT);
3745     - memmap.nr_map = i + 1;
3746     - return;
3747     - }
3748     - }
3749     - }
3750     - }
3751     - for (i = 0; i < e820.nr_map; i++) {
3752     - current_addr = e820.map[i].addr + e820.map[i].size;
3753     - if (current_addr < size)
3754     - continue;
3755     -
3756     - if (e820.map[i].type != E820_RAM)
3757     - continue;
3758     -
3759     - if (e820.map[i].addr >= size) {
3760     - /*
3761     - * This region starts past the end of the
3762     - * requested size, skip it completely.
3763     - */
3764     - e820.nr_map = i;
3765     - } else {
3766     - e820.nr_map = i + 1;
3767     - e820.map[i].size -= current_addr - size;
3768     - }
3769     - return;
3770     - }
3771     -#ifdef CONFIG_XEN
3772     - if (i==e820.nr_map && current_addr < size) {
3773     - /*
3774     - * The e820 map finished before our requested size so
3775     - * extend the final entry to the requested address.
3776     - */
3777     - --i;
3778     - if (e820.map[i].type == E820_RAM)
3779     - e820.map[i].size -= current_addr - size;
3780     - else
3781     - add_memory_region(current_addr, size - current_addr, E820_RAM);
3782     - }
3783     -#endif
3784     -}
3785     -
3786     -#define E820_DEBUG 1
3787     -
3788     -static void __init print_memory_map(char *who)
3789     -{
3790     - int i;
3791     -
3792     - for (i = 0; i < e820.nr_map; i++) {
3793     - printk(" %s: %016Lx - %016Lx ", who,
3794     - e820.map[i].addr,
3795     - e820.map[i].addr + e820.map[i].size);
3796     - switch (e820.map[i].type) {
3797     - case E820_RAM: printk("(usable)\n");
3798     - break;
3799     - case E820_RESERVED:
3800     - printk("(reserved)\n");
3801     - break;
3802     - case E820_ACPI:
3803     - printk("(ACPI data)\n");
3804     - break;
3805     - case E820_NVS:
3806     - printk("(ACPI NVS)\n");
3807     - break;
3808     - default: printk("type %lu\n", e820.map[i].type);
3809     - break;
3810     - }
3811     - }
3812     -}
3813     -
3814     -/*
3815     - * Sanitize the BIOS e820 map.
3816     - *
3817     - * Some e820 responses include overlapping entries. The following
3818     - * replaces the original e820 map with a new one, removing overlaps.
3819     - *
3820     - */
3821     -struct change_member {
3822     - struct e820entry *pbios; /* pointer to original bios entry */
3823     - unsigned long long addr; /* address for this change point */
3824     -};
3825     -static struct change_member change_point_list[2*E820MAX] __initdata;
3826     -static struct change_member *change_point[2*E820MAX] __initdata;
3827     -static struct e820entry *overlap_list[E820MAX] __initdata;
3828     -static struct e820entry new_bios[E820MAX] __initdata;
3829     -
3830     -int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
3831     -{
3832     - struct change_member *change_tmp;
3833     - unsigned long current_type, last_type;
3834     - unsigned long long last_addr;
3835     - int chgidx, still_changing;
3836     - int overlap_entries;
3837     - int new_bios_entry;
3838     - int old_nr, new_nr, chg_nr;
3839     - int i;
3840     -
3841     - /*
3842     - Visually we're performing the following (1,2,3,4 = memory types)...
3843     -
3844     - Sample memory map (w/overlaps):
3845     - ____22__________________
3846     - ______________________4_
3847     - ____1111________________
3848     - _44_____________________
3849     - 11111111________________
3850     - ____________________33__
3851     - ___________44___________
3852     - __________33333_________
3853     - ______________22________
3854     - ___________________2222_
3855     - _________111111111______
3856     - _____________________11_
3857     - _________________4______
3858     -
3859     - Sanitized equivalent (no overlap):
3860     - 1_______________________
3861     - _44_____________________
3862     - ___1____________________
3863     - ____22__________________
3864     - ______11________________
3865     - _________1______________
3866     - __________3_____________
3867     - ___________44___________
3868     - _____________33_________
3869     - _______________2________
3870     - ________________1_______
3871     - _________________4______
3872     - ___________________2____
3873     - ____________________33__
3874     - ______________________4_
3875     - */
3876     -
3877     - /* if there's only one memory region, don't bother */
3878     - if (*pnr_map < 2)
3879     - return -1;
3880     -
3881     - old_nr = *pnr_map;
3882     -
3883     - /* bail out if we find any unreasonable addresses in bios map */
3884     - for (i=0; i<old_nr; i++)
3885     - if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
3886     - return -1;
3887     -
3888     - /* create pointers for initial change-point information (for sorting) */
3889     - for (i=0; i < 2*old_nr; i++)
3890     - change_point[i] = &change_point_list[i];
3891     -
3892     - /* record all known change-points (starting and ending addresses),
3893     - omitting those that are for empty memory regions */
3894     - chgidx = 0;
3895     - for (i=0; i < old_nr; i++) {
3896     - if (biosmap[i].size != 0) {
3897     - change_point[chgidx]->addr = biosmap[i].addr;
3898     - change_point[chgidx++]->pbios = &biosmap[i];
3899     - change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
3900     - change_point[chgidx++]->pbios = &biosmap[i];
3901     - }
3902     - }
3903     - chg_nr = chgidx; /* true number of change-points */
3904     -
3905     - /* sort change-point list by memory addresses (low -> high) */
3906     - still_changing = 1;
3907     - while (still_changing) {
3908     - still_changing = 0;
3909     - for (i=1; i < chg_nr; i++) {
3910     - /* if <current_addr> > <last_addr>, swap */
3911     - /* or, if current=<start_addr> & last=<end_addr>, swap */
3912     - if ((change_point[i]->addr < change_point[i-1]->addr) ||
3913     - ((change_point[i]->addr == change_point[i-1]->addr) &&
3914     - (change_point[i]->addr == change_point[i]->pbios->addr) &&
3915     - (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
3916     - )
3917     - {
3918     - change_tmp = change_point[i];
3919     - change_point[i] = change_point[i-1];
3920     - change_point[i-1] = change_tmp;
3921     - still_changing=1;
3922     - }
3923     - }
3924     - }
3925     -
3926     - /* create a new bios memory map, removing overlaps */
3927     - overlap_entries=0; /* number of entries in the overlap table */
3928     - new_bios_entry=0; /* index for creating new bios map entries */
3929     - last_type = 0; /* start with undefined memory type */
3930     - last_addr = 0; /* start with 0 as last starting address */
3931     - /* loop through change-points, determining affect on the new bios map */
3932     - for (chgidx=0; chgidx < chg_nr; chgidx++)
3933     - {
3934     - /* keep track of all overlapping bios entries */
3935     - if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
3936     - {
3937     - /* add map entry to overlap list (> 1 entry implies an overlap) */
3938     - overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
3939     - }
3940     - else
3941     - {
3942     - /* remove entry from list (order independent, so swap with last) */
3943     - for (i=0; i<overlap_entries; i++)
3944     - {
3945     - if (overlap_list[i] == change_point[chgidx]->pbios)
3946     - overlap_list[i] = overlap_list[overlap_entries-1];
3947     - }
3948     - overlap_entries--;
3949     - }
3950     - /* if there are overlapping entries, decide which "type" to use */
3951     - /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
3952     - current_type = 0;
3953     - for (i=0; i<overlap_entries; i++)
3954     - if (overlap_list[i]->type > current_type)
3955     - current_type = overlap_list[i]->type;
3956     - /* continue building up new bios map based on this information */
3957     - if (current_type != last_type) {
3958     - if (last_type != 0) {
3959     - new_bios[new_bios_entry].size =
3960     - change_point[chgidx]->addr - last_addr;
3961     - /* move forward only if the new size was non-zero */
3962     - if (new_bios[new_bios_entry].size != 0)
3963     - if (++new_bios_entry >= E820MAX)
3964     - break; /* no more space left for new bios entries */
3965     - }
3966     - if (current_type != 0) {
3967     - new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
3968     - new_bios[new_bios_entry].type = current_type;
3969     - last_addr=change_point[chgidx]->addr;
3970     - }
3971     - last_type = current_type;
3972     - }
3973     - }
3974     - new_nr = new_bios_entry; /* retain count for new bios entries */
3975     -
3976     - /* copy new bios mapping into original location */
3977     - memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
3978     - *pnr_map = new_nr;
3979     -
3980     - return 0;
3981     -}
3982     -
3983     -/*
3984     - * Copy the BIOS e820 map into a safe place.
3985     - *
3986     - * Sanity-check it while we're at it..
3987     - *
3988     - * If we're lucky and live on a modern system, the setup code
3989     - * will have given us a memory map that we can use to properly
3990     - * set up memory. If we aren't, we'll fake a memory map.
3991     - *
3992     - * We check to see that the memory map contains at least 2 elements
3993     - * before we'll use it, because the detection code in setup.S may
3994     - * not be perfect and most every PC known to man has two memory
3995     - * regions: one from 0 to 640k, and one from 1mb up. (The IBM
3996     - * thinkpad 560x, for example, does not cooperate with the memory
3997     - * detection code.)
3998     - */
3999     -int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
4000     -{
4001     -#ifndef CONFIG_XEN
4002     - /* Only one memory region (or negative)? Ignore it */
4003     - if (nr_map < 2)
4004     - return -1;
4005     -#else
4006     - BUG_ON(nr_map < 1);
4007     -#endif
4008     -
4009     - do {
4010     - unsigned long long start = biosmap->addr;
4011     - unsigned long long size = biosmap->size;
4012     - unsigned long long end = start + size;
4013     - unsigned long type = biosmap->type;
4014     -
4015     - /* Overflow in 64 bits? Ignore the memory map. */
4016     - if (start > end)
4017     - return -1;
4018     -
4019     -#ifndef CONFIG_XEN
4020     - /*
4021     - * Some BIOSes claim RAM in the 640k - 1M region.
4022     - * Not right. Fix it up.
4023     - */
4024     - if (type == E820_RAM) {
4025     - if (start < 0x100000ULL && end > 0xA0000ULL) {
4026     - if (start < 0xA0000ULL)
4027     - add_memory_region(start, 0xA0000ULL-start, type);
4028     - if (end <= 0x100000ULL)
4029     - continue;
4030     - start = 0x100000ULL;
4031     - size = end - start;
4032     - }
4033     - }
4034     -#endif
4035     - add_memory_region(start, size, type);
4036     - } while (biosmap++,--nr_map);
4037     - return 0;
4038     -}
4039     -
4040     #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
4041     struct edd edd;
4042     #ifdef CONFIG_EDD_MODULE
4043     @@ -746,7 +195,7 @@
4044     }
4045     #endif
4046    
4047     -static int __initdata user_defined_memmap = 0;
4048     +int __initdata user_defined_memmap = 0;
4049    
4050     /*
4051     * "mem=nopentium" disables the 4MB page tables.
4052     @@ -783,51 +232,6 @@
4053     }
4054     early_param("mem", parse_mem);
4055    
4056     -static int __init parse_memmap(char *arg)
4057     -{
4058     - if (!arg)
4059     - return -EINVAL;
4060     -
4061     - if (strcmp(arg, "exactmap") == 0) {
4062     -#ifdef CONFIG_CRASH_DUMP
4063     - /* If we are doing a crash dump, we
4064     - * still need to know the real mem
4065     - * size before original memory map is
4066     - * reset.
4067     - */
4068     - find_max_pfn();
4069     - saved_max_pfn = max_pfn;
4070     -#endif
4071     - e820.nr_map = 0;
4072     - user_defined_memmap = 1;
4073     - } else {
4074     - /* If the user specifies memory size, we
4075     - * limit the BIOS-provided memory map to
4076     - * that size. exactmap can be used to specify
4077     - * the exact map. mem=number can be used to
4078     - * trim the existing memory map.
4079     - */
4080     - unsigned long long start_at, mem_size;
4081     -
4082     - mem_size = memparse(arg, &arg);
4083     - if (*arg == '@') {
4084     - start_at = memparse(arg+1, &arg);
4085     - add_memory_region(start_at, mem_size, E820_RAM);
4086     - } else if (*arg == '#') {
4087     - start_at = memparse(arg+1, &arg);
4088     - add_memory_region(start_at, mem_size, E820_ACPI);
4089     - } else if (*arg == '$') {
4090     - start_at = memparse(arg+1, &arg);
4091     - add_memory_region(start_at, mem_size, E820_RESERVED);
4092     - } else {
4093     - limit_regions(mem_size);
4094     - user_defined_memmap = 1;
4095     - }
4096     - }
4097     - return 0;
4098     -}
4099     -early_param("memmap", parse_memmap);
4100     -
4101     #ifdef CONFIG_PROC_VMCORE
4102     /* elfcorehdr= specifies the location of elf core header
4103     * stored by the crashed kernel.
4104     @@ -894,127 +298,6 @@
4105     #endif
4106    
4107     /*
4108     - * Callback for efi_memory_walk.
4109     - */
4110     -static int __init
4111     -efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
4112     -{
4113     - unsigned long *max_pfn = arg, pfn;
4114     -
4115     - if (start < end) {
4116     - pfn = PFN_UP(end -1);
4117     - if (pfn > *max_pfn)
4118     - *max_pfn = pfn;
4119     - }
4120     - return 0;
4121     -}
4122     -
4123     -static int __init
4124     -efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
4125     -{
4126     - memory_present(0, PFN_UP(start), PFN_DOWN(end));
4127     - return 0;
4128     -}
4129     -
4130     -/*
4131     - * This function checks if any part of the range <start,end> is mapped
4132     - * with type.
4133     - */
4134     -int
4135     -e820_any_mapped(u64 start, u64 end, unsigned type)
4136     -{
4137     - int i;
4138     -
4139     -#ifndef CONFIG_XEN
4140     - for (i = 0; i < e820.nr_map; i++) {
4141     - const struct e820entry *ei = &e820.map[i];
4142     -#else
4143     - if (!is_initial_xendomain())
4144     - return 0;
4145     - for (i = 0; i < machine_e820.nr_map; ++i) {
4146     - const struct e820entry *ei = &machine_e820.map[i];
4147     -#endif
4148     -
4149     - if (type && ei->type != type)
4150     - continue;
4151     - if (ei->addr >= end || ei->addr + ei->size <= start)
4152     - continue;
4153     - return 1;
4154     - }
4155     - return 0;
4156     -}
4157     -EXPORT_SYMBOL_GPL(e820_any_mapped);
4158     -
4159     - /*
4160     - * This function checks if the entire range <start,end> is mapped with type.
4161     - *
4162     - * Note: this function only works correct if the e820 table is sorted and
4163     - * not-overlapping, which is the case
4164     - */
4165     -int __init
4166     -e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
4167     -{
4168     - u64 start = s;
4169     - u64 end = e;
4170     - int i;
4171     -
4172     -#ifndef CONFIG_XEN
4173     - for (i = 0; i < e820.nr_map; i++) {
4174     - struct e820entry *ei = &e820.map[i];
4175     -#else
4176     - if (!is_initial_xendomain())
4177     - return 0;
4178     - for (i = 0; i < machine_e820.nr_map; ++i) {
4179     - const struct e820entry *ei = &machine_e820.map[i];
4180     -#endif
4181     - if (type && ei->type != type)
4182     - continue;
4183     - /* is the region (part) in overlap with the current region ?*/
4184     - if (ei->addr >= end || ei->addr + ei->size <= start)
4185     - continue;
4186     - /* if the region is at the beginning of <start,end> we move
4187     - * start to the end of the region since it's ok until there
4188     - */
4189     - if (ei->addr <= start)
4190     - start = ei->addr + ei->size;
4191     - /* if start is now at or beyond end, we're done, full
4192     - * coverage */
4193     - if (start >= end)
4194     - return 1; /* we're done */
4195     - }
4196     - return 0;
4197     -}
4198     -
4199     -/*
4200     - * Find the highest page frame number we have available
4201     - */
4202     -void __init find_max_pfn(void)
4203     -{
4204     - int i;
4205     -
4206     - max_pfn = 0;
4207     - if (efi_enabled) {
4208     - efi_memmap_walk(efi_find_max_pfn, &max_pfn);
4209     - efi_memmap_walk(efi_memory_present_wrapper, NULL);
4210     - return;
4211     - }
4212     -
4213     - for (i = 0; i < e820.nr_map; i++) {
4214     - unsigned long start, end;
4215     - /* RAM? */
4216     - if (e820.map[i].type != E820_RAM)
4217     - continue;
4218     - start = PFN_UP(e820.map[i].addr);
4219     - end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
4220     - if (start >= end)
4221     - continue;
4222     - if (end > max_pfn)
4223     - max_pfn = end;
4224     - memory_present(0, start, end);
4225     - }
4226     -}
4227     -
4228     -/*
4229     * Determine low and high memory ranges:
4230     */
4231     unsigned long __init find_max_low_pfn(void)
4232     @@ -1073,77 +356,6 @@
4233     return max_low_pfn;
4234     }
4235    
4236     -/*
4237     - * Free all available memory for boot time allocation. Used
4238     - * as a callback function by efi_memory_walk()
4239     - */
4240     -
4241     -static int __init
4242     -free_available_memory(unsigned long start, unsigned long end, void *arg)
4243     -{
4244     - /* check max_low_pfn */
4245     - if (start >= (max_low_pfn << PAGE_SHIFT))
4246     - return 0;
4247     - if (end >= (max_low_pfn << PAGE_SHIFT))
4248     - end = max_low_pfn << PAGE_SHIFT;
4249     - if (start < end)
4250     - free_bootmem(start, end - start);
4251     -
4252     - return 0;
4253     -}
4254     -/*
4255     - * Register fully available low RAM pages with the bootmem allocator.
4256     - */
4257     -static void __init register_bootmem_low_pages(unsigned long max_low_pfn)
4258     -{
4259     - int i;
4260     -
4261     - if (efi_enabled) {
4262     - efi_memmap_walk(free_available_memory, NULL);
4263     - return;
4264     - }
4265     - for (i = 0; i < e820.nr_map; i++) {
4266     - unsigned long curr_pfn, last_pfn, size;
4267     - /*
4268     - * Reserve usable low memory
4269     - */
4270     - if (e820.map[i].type != E820_RAM)
4271     - continue;
4272     - /*
4273     - * We are rounding up the start address of usable memory:
4274     - */
4275     - curr_pfn = PFN_UP(e820.map[i].addr);
4276     - if (curr_pfn >= max_low_pfn)
4277     - continue;
4278     - /*
4279     - * ... and at the end of the usable range downwards:
4280     - */
4281     - last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
4282     -
4283     -#ifdef CONFIG_XEN
4284     - /*
4285     - * Truncate to the number of actual pages currently
4286     - * present.
4287     - */
4288     - if (last_pfn > xen_start_info->nr_pages)
4289     - last_pfn = xen_start_info->nr_pages;
4290     -#endif
4291     -
4292     - if (last_pfn > max_low_pfn)
4293     - last_pfn = max_low_pfn;
4294     -
4295     - /*
4296     - * .. finally, did all the rounding and playing
4297     - * around just make the area go away?
4298     - */
4299     - if (last_pfn <= curr_pfn)
4300     - continue;
4301     -
4302     - size = last_pfn - curr_pfn;
4303     - free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
4304     - }
4305     -}
4306     -
4307     #ifndef CONFIG_XEN
4308     /*
4309     * workaround for Dell systems that neglect to reserve EBDA
4310     @@ -1233,8 +445,8 @@
4311     * the (very unlikely) case of us accidentally initializing the
4312     * bootmem allocator with an invalid RAM area.
4313     */
4314     - reserve_bootmem(__PHYSICAL_START, (PFN_PHYS(min_low_pfn) +
4315     - bootmap_size + PAGE_SIZE-1) - (__PHYSICAL_START));
4316     + reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
4317     + bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text));
4318    
4319     #ifndef CONFIG_XEN
4320     /*
4321     @@ -1316,170 +528,6 @@
4322     }
4323     }
4324    
4325     -/*
4326     - * Request address space for all standard RAM and ROM resources
4327     - * and also for regions reported as reserved by the e820.
4328     - */
4329     -static void __init
4330     -legacy_init_iomem_resources(struct e820entry *e820, int nr_map,
4331     - struct resource *code_resource,
4332     - struct resource *data_resource)
4333     -{
4334     - int i;
4335     -
4336     - probe_roms();
4337     -
4338     - for (i = 0; i < nr_map; i++) {
4339     - struct resource *res;
4340     -#ifndef CONFIG_RESOURCES_64BIT
4341     - if (e820[i].addr + e820[i].size > 0x100000000ULL)
4342     - continue;
4343     -#endif
4344     - res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
4345     - switch (e820[i].type) {
4346     - case E820_RAM: res->name = "System RAM"; break;
4347     - case E820_ACPI: res->name = "ACPI Tables"; break;
4348     - case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
4349     - default: res->name = "reserved";
4350     - }
4351     - res->start = e820[i].addr;
4352     - res->end = res->start + e820[i].size - 1;
4353     - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
4354     - if (request_resource(&iomem_resource, res)) {
4355     - kfree(res);
4356     - continue;
4357     - }
4358     - if (e820[i].type == E820_RAM) {
4359     - /*
4360     - * We don't know which RAM region contains kernel data,
4361     - * so we try it repeatedly and let the resource manager
4362     - * test it.
4363     - */
4364     -#ifndef CONFIG_XEN
4365     - request_resource(res, code_resource);
4366     - request_resource(res, data_resource);
4367     -#endif
4368     -#ifdef CONFIG_KEXEC
4369     - if (crashk_res.start != crashk_res.end)
4370     - request_resource(res, &crashk_res);
4371     -#ifdef CONFIG_XEN
4372     - xen_machine_kexec_register_resources(res);
4373     -#endif
4374     -#endif
4375     - }
4376     - }
4377     -}
4378     -
4379     -/*
4380     - * Locate a unused range of the physical address space below 4G which
4381     - * can be used for PCI mappings.
4382     - */
4383     -static void __init
4384     -e820_setup_gap(struct e820entry *e820, int nr_map)
4385     -{
4386     - unsigned long gapstart, gapsize, round;
4387     - unsigned long long last;
4388     - int i;
4389     -
4390     - /*
4391     - * Search for the bigest gap in the low 32 bits of the e820
4392     - * memory space.
4393     - */
4394     - last = 0x100000000ull;
4395     - gapstart = 0x10000000;
4396     - gapsize = 0x400000;
4397     - i = nr_map;
4398     - while (--i >= 0) {
4399     - unsigned long long start = e820[i].addr;
4400     - unsigned long long end = start + e820[i].size;
4401     -
4402     - /*
4403     - * Since "last" is at most 4GB, we know we'll
4404     - * fit in 32 bits if this condition is true
4405     - */
4406     - if (last > end) {
4407     - unsigned long gap = last - end;
4408     -
4409     - if (gap > gapsize) {
4410     - gapsize = gap;
4411     - gapstart = end;
4412     - }
4413     - }
4414     - if (start < last)
4415     - last = start;
4416     - }
4417     -
4418     - /*
4419     - * See how much we want to round up: start off with
4420     - * rounding to the next 1MB area.
4421     - */
4422     - round = 0x100000;
4423     - while ((gapsize >> 4) > round)
4424     - round += round;
4425     - /* Fun with two's complement */
4426     - pci_mem_start = (gapstart + round) & -round;
4427     -
4428     - printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
4429     - pci_mem_start, gapstart, gapsize);
4430     -}
4431     -
4432     -/*
4433     - * Request address space for all standard resources
4434     - *
4435     - * This is called just before pcibios_init(), which is also a
4436     - * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
4437     - */
4438     -static int __init request_standard_resources(void)
4439     -{
4440     - int i;
4441     -
4442     - /* Nothing to do if not running in dom0. */
4443     - if (!is_initial_xendomain())
4444     - return 0;
4445     -
4446     - printk("Setting up standard PCI resources\n");
4447     -#ifdef CONFIG_XEN
4448     - legacy_init_iomem_resources(machine_e820.map, machine_e820.nr_map,
4449     - &code_resource, &data_resource);
4450     -#else
4451     - if (efi_enabled)
4452     - efi_initialize_iomem_resources(&code_resource, &data_resource);
4453     - else
4454     - legacy_init_iomem_resources(e820.map, e820.nr_map,
4455     - &code_resource, &data_resource);
4456     -#endif
4457     -
4458     - /* EFI systems may still have VGA */
4459     - request_resource(&iomem_resource, &video_ram_resource);
4460     -
4461     - /* request I/O space for devices used on all i[345]86 PCs */
4462     - for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
4463     - request_resource(&ioport_resource, &standard_io_resources[i]);
4464     - return 0;
4465     -}
4466     -
4467     -subsys_initcall(request_standard_resources);
4468     -
4469     -static void __init register_memory(void)
4470     -{
4471     -#ifdef CONFIG_XEN
4472     - if (is_initial_xendomain()) {
4473     - struct xen_memory_map memmap;
4474     -
4475     - memmap.nr_entries = E820MAX;
4476     - set_xen_guest_handle(memmap.buffer, machine_e820.map);
4477     -
4478     - if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
4479     - BUG();
4480     -
4481     - machine_e820.nr_map = memmap.nr_entries;
4482     - e820_setup_gap(machine_e820.map, machine_e820.nr_map);
4483     - }
4484     - else
4485     -#endif
4486     - e820_setup_gap(e820.map, e820.nr_map);
4487     -}
4488     -
4489     #ifdef CONFIG_MCA
4490     static void set_mca_bus(int x)
4491     {
4492     @@ -1489,6 +537,12 @@
4493     static void set_mca_bus(int x) { }
4494     #endif
4495    
4496     +/* Overridden in paravirt.c if CONFIG_PARAVIRT */
4497     +char * __init __attribute__((weak)) memory_setup(void)
4498     +{
4499     + return machine_specific_memory_setup();
4500     +}
4501     +
4502     /*
4503     * Determine if we were loaded by an EFI loader. If so, then we have also been
4504     * passed the efi memmap, systab, etc., so we should use these data structures
4505     @@ -1576,7 +630,7 @@
4506     efi_init();
4507     else {
4508     printk(KERN_INFO "BIOS-provided physical RAM map:\n");
4509     - print_memory_map(machine_specific_memory_setup());
4510     + print_memory_map(memory_setup());
4511     }
4512    
4513     copy_edd();
4514     @@ -1755,7 +809,7 @@
4515     get_smp_config();
4516     #endif
4517    
4518     - register_memory();
4519     + e820_register_memory();
4520    
4521     if (is_initial_xendomain()) {
4522     #ifdef CONFIG_VT
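Almost everything deleted from setup_32-xen.c above is a move, not a removal: the e820 machinery reappears in the new arch/x86/kernel/e820_32-xen.c (the roughly thousand-line addition in the diffstat). The removed sanitize_e820_map() deserves a gloss, since the ASCII diagram only hints at the algorithm: every region contributes two change points (start and end), the points are sorted, and at each point the highest-numbered type among the currently open regions wins, which is exactly the "larger value takes precedence" rule. A self-contained sketch of the same sweep, with made-up input regions:

#include <stdio.h>
#include <stdlib.h>

struct region { unsigned long long start, end; int type; }; /* end exclusive */
struct point  { unsigned long long addr; int type; int is_start; };

static int cmp_point(const void *a, const void *b)
{
        const struct point *p = a, *q = b;
        if (p->addr != q->addr)
                return p->addr < q->addr ? -1 : 1;
        return q->is_start - p->is_start;   /* starts sort before ends */
}

static void sweep(const struct region *in, int n)
{
        struct point *pts = malloc(2 * n * sizeof(*pts));
        int count[5] = { 0 };               /* open regions per e820 type */
        int i, npts = 0, last_type = 0;
        unsigned long long last_addr = 0;

        for (i = 0; i < n; i++) {
                pts[npts++] = (struct point){ in[i].start, in[i].type, 1 };
                pts[npts++] = (struct point){ in[i].end,   in[i].type, 0 };
        }
        qsort(pts, npts, sizeof(*pts), cmp_point);

        for (i = 0; i < npts; i++) {
                int t, cur = 0;
                if (pts[i].is_start)
                        count[pts[i].type]++;
                else
                        count[pts[i].type]--;
                for (t = 4; t > 0; t--)     /* highest open type wins */
                        if (count[t]) { cur = t; break; }
                if (cur != last_type) {
                        if (last_type && pts[i].addr != last_addr)
                                printf("  %#llx-%#llx type %d\n",
                                       last_addr, pts[i].addr, last_type);
                        last_addr = pts[i].addr;
                        last_type = cur;
                }
        }
        free(pts);
}

int main(void)
{
        struct region map[] = {
                { 0x0,     0xa0000,  1 },   /* RAM */
                { 0x9f000, 0x100000, 2 },   /* reserved, overlaps RAM tail */
        };
        sweep(map, 2);                      /* RAM is truncated at 0x9f000 */
        return 0;
}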
4523     --- a/arch/x86/kernel/setup_64-xen.c
4524     +++ b/arch/x86/kernel/setup_64-xen.c
4525     @@ -576,8 +576,7 @@
4526     if (LOADER_TYPE && INITRD_START) {
4527     if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) {
4528     reserve_bootmem_generic(INITRD_START, INITRD_SIZE);
4529     - initrd_start =
4530     - INITRD_START ? INITRD_START + PAGE_OFFSET : 0;
4531     + initrd_start = INITRD_START + PAGE_OFFSET;
4532     initrd_end = initrd_start+INITRD_SIZE;
4533     }
4534     else {
4535     @@ -1003,11 +1002,8 @@
4536     /* Fix cpuid4 emulation for more */
4537     num_cache_leaves = 3;
4538    
4539     - /* When there is only one core no need to synchronize RDTSC */
4540     - if (num_possible_cpus() == 1)
4541     - set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
4542     - else
4543     - clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
4544     + /* RDTSC can be speculated around */
4545     + clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
4546     }
4547    
4548     static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
4549     @@ -1106,6 +1102,15 @@
4550     set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability);
4551     }
4552    
4553     + if (cpu_has_ds) {
4554     + unsigned int l1, l2;
4555     + rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
4556     + if (!(l1 & (1<<11)))
4557     + set_bit(X86_FEATURE_BTS, c->x86_capability);
4558     + if (!(l1 & (1<<12)))
4559     + set_bit(X86_FEATURE_PEBS, c->x86_capability);
4560     + }
4561     +
4562     n = c->extended_cpuid_level;
4563     if (n >= 0x80000008) {
4564     unsigned eax = cpuid_eax(0x80000008);
4565     @@ -1125,7 +1130,10 @@
4566     set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
4567     if (c->x86 == 6)
4568     set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
4569     - set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
4570     + if (c->x86 == 15)
4571     + set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
4572     + else
4573     + clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
4574     c->x86_max_cores = intel_num_cpu_cores(c);
4575    
4576     srat_detect_node();
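The 64-bit setup changes above read MSR_IA32_MISC_ENABLE once and advertise the Debug Store features only when their disable bits are clear: bit 11 set means BTS (branch trace store) is unavailable, bit 12 set means PEBS is. The same file also narrows X86_FEATURE_SYNC_RDTSC to family-15 Intel CPUs, since newer cores can speculate around RDTSC, as the replaced comment notes. A small sketch of the MSR decode; the result structure is hypothetical, only the bit numbers come from the patch:

#include <stdint.h>
#include <stdio.h>

struct ds_feat { int bts; int pebs; };

static struct ds_feat decode_misc_enable(uint32_t l1)
{
        struct ds_feat f;
        f.bts  = !(l1 & (1u << 11));   /* bit 11 clear => BTS usable */
        f.pebs = !(l1 & (1u << 12));   /* bit 12 clear => PEBS usable */
        return f;
}

int main(void)
{
        struct ds_feat f = decode_misc_enable(1u << 12); /* PEBS fused off */
        printf("BTS=%d PEBS=%d\n", f.bts, f.pebs);       /* BTS=1 PEBS=0 */
        return 0;
}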
4577     --- a/arch/x86/kernel/smp_32-xen.c
4578     +++ b/arch/x86/kernel/smp_32-xen.c
4579     @@ -659,6 +659,10 @@
4580     put_cpu();
4581     return -EBUSY;
4582     }
4583     +
4584     + /* Can deadlock when called with interrupts disabled */
4585     + WARN_ON(irqs_disabled());
4586     +
4587     spin_lock_bh(&call_lock);
4588     __smp_call_function_single(cpu, func, info, nonatomic, wait);
4589     spin_unlock_bh(&call_lock);
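The WARN_ON added above guards against a classic IPI deadlock: smp_call_function_single() may spin until the target CPU runs the function, and the target can only do that from its IPI handler. If two CPUs call into each other with interrupts disabled, neither handler ever runs. An illustrative interleaving, as a pseudo-trace rather than code from the tree:

/*
 *   CPU0: local_irq_disable();
 *   CPU1: local_irq_disable();
 *   CPU0: smp_call_function_single(1, ...)  - sends IPI, spins for ack
 *   CPU1: smp_call_function_single(0, ...)  - sends IPI, spins for ack
 *
 * Neither CPU can service the other's IPI, so both spin forever;
 * the WARN_ON flags such callers before they can hang the machine.
 */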
4590     --- a/arch/x86/kernel/smp_64-xen.c
4591     +++ b/arch/x86/kernel/smp_64-xen.c
4592     @@ -384,12 +384,17 @@
4593     put_cpu();
4594     return 0;
4595     }
4596     +
4597     + /* Can deadlock when called with interrupts disabled */
4598     + WARN_ON(irqs_disabled());
4599     +
4600     spin_lock_bh(&call_lock);
4601     __smp_call_function_single(cpu, func, info, nonatomic, wait);
4602     spin_unlock_bh(&call_lock);
4603     put_cpu();
4604     return 0;
4605     }
4606     +EXPORT_SYMBOL(smp_call_function_single);
4607    
4608     /*
4609     * this function sends a 'generic call function' IPI to all other CPUs
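Alongside the same deadlock check, the 64-bit file gains EXPORT_SYMBOL(smp_call_function_single), letting modules target one CPU instead of broadcasting with smp_call_function(). A hypothetical module-side use, sketched against the five-argument signature visible in this patch (cpu, func, info, nonatomic, wait); read_local_counter and the per-CPU value are invented for illustration:

static void read_local_counter(void *info)
{
        /* runs on the chosen CPU, in interrupt context */
        *(u64 *)info = this_cpu_counter;   /* hypothetical per-CPU value */
}

        u64 val;
        /* returns nonzero if the call could not be made */
        if (smp_call_function_single(cpu, read_local_counter, &val, 0, 1))
                return -EIO;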
4610     --- a/arch/x86/kernel/time_32-xen.c
4611     +++ b/arch/x86/kernel/time_32-xen.c
4612     @@ -61,6 +61,7 @@
4613     #include <asm/uaccess.h>
4614     #include <asm/processor.h>
4615     #include <asm/timer.h>
4616     +#include <asm/time.h>
4617     #include <asm/sections.h>
4618    
4619     #include "mach_time.h"
4620     @@ -129,11 +130,11 @@
4621     /* Must be signed, as it's compared with s64 quantities which can be -ve. */
4622     #define NS_PER_TICK (1000000000LL/HZ)
4623    
4624     -static void __clock_was_set(void *unused)
4625     +static void __clock_was_set(struct work_struct *unused)
4626     {
4627     clock_was_set();
4628     }
4629     -static DECLARE_WORK(clock_was_set_work, __clock_was_set, NULL);
4630     +static DECLARE_WORK(clock_was_set_work, __clock_was_set);
4631    
4632     static inline void __normalize_time(time_t *sec, s64 *nsec)
4633     {
4634     @@ -537,10 +538,7 @@
4635     /* gets recalled with irq locally disabled */
4636     /* XXX - does irqsave resolve this? -johnstul */
4637     spin_lock_irqsave(&rtc_lock, flags);
4638     - if (efi_enabled)
4639     - retval = efi_set_rtc_mmss(nowtime);
4640     - else
4641     - retval = mach_set_rtc_mmss(nowtime);
4642     + retval = set_wallclock(nowtime);
4643     spin_unlock_irqrestore(&rtc_lock, flags);
4644    
4645     return retval;
4646     @@ -865,10 +863,7 @@
4647    
4648     spin_lock_irqsave(&rtc_lock, flags);
4649    
4650     - if (efi_enabled)
4651     - retval = efi_get_time();
4652     - else
4653     - retval = mach_get_cmos_time();
4654     + retval = get_wallclock();
4655    
4656     spin_unlock_irqrestore(&rtc_lock, flags);
4657    
4658     @@ -970,7 +965,7 @@
4659     printk("Using HPET for base-timer\n");
4660     }
4661    
4662     - time_init_hook();
4663     + do_time_init();
4664     }
4665     #endif
4666    
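The DECLARE_WORK change above is the 2.6.20 workqueue API conversion: handlers now receive the struct work_struct itself instead of an opaque void *data, and DECLARE_WORK drops its third argument. __clock_was_set() needs no context, so its conversion is trivial; handlers that do need context embed the work item in their owning object and recover it with container_of(), roughly as below (struct my_ctx is a hypothetical example, not from this patch):

struct my_ctx {
        int value;
        struct work_struct work;   /* embedded in the owning object */
};

static void my_handler(struct work_struct *work)
{
        struct my_ctx *ctx = container_of(work, struct my_ctx, work);
        /* ... use ctx->value instead of the old void *data ... */
}

/* old API: DECLARE_WORK(w, fn, data);
 * new API: DECLARE_WORK(w, fn) for static work with no context,
 *          or INIT_WORK(&ctx->work, my_handler) for embedded work. */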
4667     --- a/arch/x86/kernel/traps_32-xen.c
4668     +++ b/arch/x86/kernel/traps_32-xen.c
4669     @@ -29,6 +29,8 @@
4670     #include <linux/kexec.h>
4671     #include <linux/unwind.h>
4672     #include <linux/uaccess.h>
4673     +#include <linux/nmi.h>
4674     +#include <linux/bug.h>
4675    
4676     #ifdef CONFIG_EISA
4677     #include <linux/ioport.h>
4678     @@ -61,9 +63,6 @@
4679    
4680     asmlinkage int system_call(void);
4681    
4682     -struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 },
4683     - { 0, 0 }, { 0, 0 } };
4684     -
4685     /* Do we ignore FPU interrupts ? */
4686     char ignore_fpu_irq = 0;
4687    
4688     @@ -100,12 +99,7 @@
4689     #endif
4690     asmlinkage void machine_check(void);
4691    
4692     -static int kstack_depth_to_print = 24;
4693     -#ifdef CONFIG_STACK_UNWIND
4694     -static int call_trace = 1;
4695     -#else
4696     -#define call_trace (-1)
4697     -#endif
4698     +int kstack_depth_to_print = 24;
4699     ATOMIC_NOTIFIER_HEAD(i386die_chain);
4700    
4701     int register_die_notifier(struct notifier_block *nb)
4702     @@ -159,25 +153,7 @@
4703     return ebp;
4704     }
4705    
4706     -struct ops_and_data {
4707     - struct stacktrace_ops *ops;
4708     - void *data;
4709     -};
4710     -
4711     -static asmlinkage int
4712     -dump_trace_unwind(struct unwind_frame_info *info, void *data)
4713     -{
4714     - struct ops_and_data *oad = (struct ops_and_data *)data;
4715     - int n = 0;
4716     -
4717     - while (unwind(info) == 0 && UNW_PC(info)) {
4718     - n++;
4719     - oad->ops->address(oad->data, UNW_PC(info));
4720     - if (arch_unw_user_mode(info))
4721     - break;
4722     - }
4723     - return n;
4724     -}
4725     +#define MSG(msg) ops->warning(data, msg)
4726    
4727     void dump_trace(struct task_struct *task, struct pt_regs *regs,
4728     unsigned long *stack,
4729     @@ -188,39 +164,6 @@
4730     if (!task)
4731     task = current;
4732    
4733     - if (call_trace >= 0) {
4734     - int unw_ret = 0;
4735     - struct unwind_frame_info info;
4736     - struct ops_and_data oad = { .ops = ops, .data = data };
4737     -
4738     - if (regs) {
4739     - if (unwind_init_frame_info(&info, task, regs) == 0)
4740     - unw_ret = dump_trace_unwind(&info, &oad);
4741     - } else if (task == current)
4742     - unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad);
4743     - else {
4744     - if (unwind_init_blocked(&info, task) == 0)
4745     - unw_ret = dump_trace_unwind(&info, &oad);
4746     - }
4747     - if (unw_ret > 0) {
4748     - if (call_trace == 1 && !arch_unw_user_mode(&info)) {
4749     - ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n",
4750     - UNW_PC(&info));
4751     - if (UNW_SP(&info) >= PAGE_OFFSET) {
4752     - ops->warning(data, "Leftover inexact backtrace:\n");
4753     - stack = (void *)UNW_SP(&info);
4754     - if (!stack)
4755     - return;
4756     - ebp = UNW_FP(&info);
4757     - } else
4758     - ops->warning(data, "Full inexact backtrace again:\n");
4759     - } else if (call_trace >= 1)
4760     - return;
4761     - else
4762     - ops->warning(data, "Full inexact backtrace again:\n");
4763     - } else
4764     - ops->warning(data, "Inexact backtrace:\n");
4765     - }
4766     if (!stack) {
4767     unsigned long dummy;
4768     stack = &dummy;
4769     @@ -253,6 +196,7 @@
4770     stack = (unsigned long*)context->previous_esp;
4771     if (!stack)
4772     break;
4773     + touch_nmi_watchdog();
4774     }
4775     }
4776     EXPORT_SYMBOL(dump_trace);
4777     @@ -385,7 +329,7 @@
4778     * time of the fault..
4779     */
4780     if (in_kernel) {
4781     - u8 __user *eip;
4782     + u8 *eip;
4783     int code_bytes = 64;
4784     unsigned char c;
4785    
4786     @@ -394,18 +338,20 @@
4787    
4788     printk(KERN_EMERG "Code: ");
4789    
4790     - eip = (u8 __user *)regs->eip - 43;
4791     - if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) {
4792     + eip = (u8 *)regs->eip - 43;
4793     + if (eip < (u8 *)PAGE_OFFSET ||
4794     + probe_kernel_address(eip, c)) {
4795     /* try starting at EIP */
4796     - eip = (u8 __user *)regs->eip;
4797     + eip = (u8 *)regs->eip;
4798     code_bytes = 32;
4799     }
4800     for (i = 0; i < code_bytes; i++, eip++) {
4801     - if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) {
4802     + if (eip < (u8 *)PAGE_OFFSET ||
4803     + probe_kernel_address(eip, c)) {
4804     printk(" Bad EIP value.");
4805     break;
4806     }
4807     - if (eip == (u8 __user *)regs->eip)
4808     + if (eip == (u8 *)regs->eip)
4809     printk("<%02x> ", c);
4810     else
4811     printk("%02x ", c);
4812     @@ -414,43 +360,22 @@
4813     printk("\n");
4814     }
4815    
4816     -static void handle_BUG(struct pt_regs *regs)
4817     +int is_valid_bugaddr(unsigned long eip)
4818     {
4819     - unsigned long eip = regs->eip;
4820     unsigned short ud2;
4821    
4822     if (eip < PAGE_OFFSET)
4823     - return;
4824     - if (probe_kernel_address((unsigned short __user *)eip, ud2))
4825     - return;
4826     - if (ud2 != 0x0b0f)
4827     - return;
4828     + return 0;
4829     + if (probe_kernel_address((unsigned short *)eip, ud2))
4830     + return 0;
4831    
4832     - printk(KERN_EMERG "------------[ cut here ]------------\n");
4833     -
4834     -#ifdef CONFIG_DEBUG_BUGVERBOSE
4835     - do {
4836     - unsigned short line;
4837     - char *file;
4838     - char c;
4839     -
4840     - if (probe_kernel_address((unsigned short __user *)(eip + 2),
4841     - line))
4842     - break;
4843     - if (__get_user(file, (char * __user *)(eip + 4)) ||
4844     - (unsigned long)file < PAGE_OFFSET || __get_user(c, file))
4845     - file = "<bad filename>";
4846     -
4847     - printk(KERN_EMERG "kernel BUG at %s:%d!\n", file, line);
4848     - return;
4849     - } while (0);
4850     -#endif
4851     - printk(KERN_EMERG "Kernel BUG at [verbose debug info unavailable]\n");
4852     + return ud2 == 0x0b0f;
4853     }
4854    
4855     -/* This is gone through when something in the kernel
4856     - * has done something bad and is about to be terminated.
4857     -*/
4858     +/*
4859     + * This is gone through when something in the kernel has done something bad and
4860     + * is about to be terminated.
4861     + */
4862     void die(const char * str, struct pt_regs * regs, long err)
4863     {
4864     static struct {
4865     @@ -458,7 +383,7 @@
4866     u32 lock_owner;
4867     int lock_owner_depth;
4868     } die = {
4869     - .lock = SPIN_LOCK_UNLOCKED,
4870     + .lock = __SPIN_LOCK_UNLOCKED(die.lock),
4871     .lock_owner = -1,
4872     .lock_owner_depth = 0
4873     };
4874     @@ -482,7 +407,8 @@
4875     unsigned long esp;
4876     unsigned short ss;
4877    
4878     - handle_BUG(regs);
4879     + report_bug(regs->eip);
4880     +
4881     printk(KERN_EMERG "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter);
4882     #ifdef CONFIG_PREEMPT
4883     printk(KERN_EMERG "PREEMPT ");
4884     @@ -682,8 +608,7 @@
4885     {
4886     printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
4887     "CPU %d.\n", reason, smp_processor_id());
4888     - printk(KERN_EMERG "You probably have a hardware problem with your RAM "
4889     - "chips\n");
4890     + printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
4891     if (panic_on_unrecovered_nmi)
4892     panic("NMI: Not continuing");
4893    
4894     @@ -741,7 +666,6 @@
4895     printk(" on CPU%d, eip %08lx, registers:\n",
4896     smp_processor_id(), regs->eip);
4897     show_registers(regs);
4898     - printk(KERN_EMERG "console shuts up ...\n");
4899     console_silent();
4900     spin_unlock(&nmi_print_lock);
4901     bust_spinlocks(0);
4902     @@ -1057,49 +981,24 @@
4903     #endif
4904     }
4905    
4906     -fastcall void setup_x86_bogus_stack(unsigned char * stk)
4907     +fastcall unsigned long patch_espfix_desc(unsigned long uesp,
4908     + unsigned long kesp)
4909     {
4910     - unsigned long *switch16_ptr, *switch32_ptr;
4911     - struct pt_regs *regs;
4912     - unsigned long stack_top, stack_bot;
4913     - unsigned short iret_frame16_off;
4914     - int cpu = smp_processor_id();
4915     - /* reserve the space on 32bit stack for the magic switch16 pointer */
4916     - memmove(stk, stk + 8, sizeof(struct pt_regs));
4917     - switch16_ptr = (unsigned long *)(stk + sizeof(struct pt_regs));
4918     - regs = (struct pt_regs *)stk;
4919     - /* now the switch32 on 16bit stack */
4920     - stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
4921     - stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
4922     - switch32_ptr = (unsigned long *)(stack_top - 8);
4923     - iret_frame16_off = CPU_16BIT_STACK_SIZE - 8 - 20;
4924     - /* copy iret frame on 16bit stack */
4925     - memcpy((void *)(stack_bot + iret_frame16_off), &regs->eip, 20);
4926     - /* fill in the switch pointers */
4927     - switch16_ptr[0] = (regs->esp & 0xffff0000) | iret_frame16_off;
4928     - switch16_ptr[1] = __ESPFIX_SS;
4929     - switch32_ptr[0] = (unsigned long)stk + sizeof(struct pt_regs) +
4930     - 8 - CPU_16BIT_STACK_SIZE;
4931     - switch32_ptr[1] = __KERNEL_DS;
4932     -}
4933     -
4934     -fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp)
4935     -{
4936     - unsigned long *switch32_ptr;
4937     - unsigned char *stack16, *stack32;
4938     - unsigned long stack_top, stack_bot;
4939     - int len;
4940     int cpu = smp_processor_id();
4941     - stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
4942     - stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
4943     - switch32_ptr = (unsigned long *)(stack_top - 8);
4944     - /* copy the data from 16bit stack to 32bit stack */
4945     - len = CPU_16BIT_STACK_SIZE - 8 - sp;
4946     - stack16 = (unsigned char *)(stack_bot + sp);
4947     - stack32 = (unsigned char *)
4948     - (switch32_ptr[0] + CPU_16BIT_STACK_SIZE - 8 - len);
4949     - memcpy(stack32, stack16, len);
4950     - return stack32;
4951     + struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
4952     + struct desc_struct *gdt = (struct desc_struct *)cpu_gdt_descr->address;
4953     + unsigned long base = (kesp - uesp) & -THREAD_SIZE;
4954     + unsigned long new_kesp = kesp - base;
4955     + unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
4956     + __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS];
4957     + /* Set up base for espfix segment */
4958     + desc &= 0x00f0ff0000000000ULL;
4959     + desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) |
4960     + ((((__u64)base) << 32) & 0xff00000000000000ULL) |
4961     + ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) |
4962     + (lim_pages & 0xffff);
4963     + *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc;
4964     + return new_kesp;
4965     }
4966     #endif
4967    
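The new patch_espfix_desc() above packs a per-thread base and limit into the GDT_ENTRY_ESPFIX_SS descriptor, replacing the old 16-bit-stack copying scheme. The masks follow the standard x86 segment-descriptor layout, which scatters base and limit across the 8-byte entry. A minimal stand-alone C sketch of the same packing (illustrative only, mirroring the masks used above):

	#include <stdint.h>

	/* Scatter base/limit into an 8-byte descriptor exactly as
	 * patch_espfix_desc() does above; 'desc' keeps its type/DPL/G bits. */
	static uint64_t pack_base_limit(uint64_t desc, uint32_t base, uint32_t limit)
	{
		desc &= 0x00f0ff0000000000ULL;                           /* keep type/DPL/G   */
		desc |= ((uint64_t)base  << 16) & 0x000000ffffff0000ULL; /* base  0..23 -> bits 16..39 */
		desc |= ((uint64_t)base  << 32) & 0xff00000000000000ULL; /* base 24..31 -> bits 56..63 */
		desc |= ((uint64_t)limit << 32) & 0x000f000000000000ULL; /* limit 16..19 -> bits 48..51 */
		desc |=  (uint64_t)limit        & 0xffffULL;             /* limit  0..15 -> bits  0..15 */
		return desc;
	}
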
4968     @@ -1113,7 +1012,7 @@
4969     * Must be called with kernel preemption disabled (in this case,
4970     * local interrupts are disabled at the call-site in entry.S).
4971     */
4972     -asmlinkage void math_state_restore(struct pt_regs regs)
4973     +asmlinkage void math_state_restore(void)
4974     {
4975     struct thread_info *thread = current_thread_info();
4976     struct task_struct *tsk = thread->task;
4977     @@ -1123,6 +1022,7 @@
4978     init_fpu(tsk);
4979     restore_fpu(tsk);
4980     thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
4981     + tsk->fpu_counter++;
4982     }
4983    
4984     #ifndef CONFIG_MATH_EMULATION
4985     @@ -1234,19 +1134,3 @@
4986     return 1;
4987     }
4988     __setup("kstack=", kstack_setup);
4989     -
4990     -#ifdef CONFIG_STACK_UNWIND
4991     -static int __init call_trace_setup(char *s)
4992     -{
4993     - if (strcmp(s, "old") == 0)
4994     - call_trace = -1;
4995     - else if (strcmp(s, "both") == 0)
4996     - call_trace = 0;
4997     - else if (strcmp(s, "newfallback") == 0)
4998     - call_trace = 1;
4999     - else if (strcmp(s, "new") == 2)
5000     - call_trace = 2;
5001     - return 1;
5002     -}
5003     -__setup("call_trace=", call_trace_setup);
5004     -#endif
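The ud2 test in the new is_valid_bugaddr() works because BUG() on x86 emits the two-byte ud2 opcode 0x0f 0x0b, which reads back as 0x0b0f when fetched as a little-endian 16-bit value; the generic report_bug() in lib/bug.c then takes over the "kernel BUG at file:line" printing that handle_BUG() used to do by hand. (Incidentally, the removed i386 call_trace_setup() above carried a pre-existing strcmp(s, "new") == 2 typo — compare the == 0 in the x86_64 copy below — which the removal makes moot.) A hedged stand-alone sketch of the ud2 check, with probe_kernel_address() replaced by a plain memcpy for illustration:

	#include <stdint.h>
	#include <string.h>

	/* Sketch: does 'ip' point at a BUG() site?  In the kernel the read
	 * goes through probe_kernel_address() so a bad pointer fails safely. */
	static int looks_like_bug(const void *ip)
	{
		uint16_t insn;
		memcpy(&insn, ip, sizeof(insn));
		return insn == 0x0b0f;	/* ud2 = 0x0f 0x0b in memory */
	}
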
5005     --- a/arch/x86/kernel/traps_64-xen.c
5006     +++ b/arch/x86/kernel/traps_64-xen.c
5007     @@ -30,9 +30,10 @@
5008     #include <linux/kprobes.h>
5009     #include <linux/kexec.h>
5010     #include <linux/unwind.h>
5011     +#include <linux/uaccess.h>
5012     +#include <linux/bug.h>
5013    
5014     #include <asm/system.h>
5015     -#include <asm/uaccess.h>
5016     #include <asm/io.h>
5017     #include <asm/atomic.h>
5018     #include <asm/debugreg.h>
5019     @@ -108,12 +109,7 @@
5020     preempt_enable_no_resched();
5021     }
5022    
5023     -static int kstack_depth_to_print = 12;
5024     -#ifdef CONFIG_STACK_UNWIND
5025     -static int call_trace = 1;
5026     -#else
5027     -#define call_trace (-1)
5028     -#endif
5029     +int kstack_depth_to_print = 12;
5030    
5031     #ifdef CONFIG_KALLSYMS
5032     void printk_address(unsigned long address)
5033     @@ -218,24 +214,7 @@
5034     return NULL;
5035     }
5036    
5037     -struct ops_and_data {
5038     - struct stacktrace_ops *ops;
5039     - void *data;
5040     -};
5041     -
5042     -static int dump_trace_unwind(struct unwind_frame_info *info, void *context)
5043     -{
5044     - struct ops_and_data *oad = (struct ops_and_data *)context;
5045     - int n = 0;
5046     -
5047     - while (unwind(info) == 0 && UNW_PC(info)) {
5048     - n++;
5049     - oad->ops->address(oad->data, UNW_PC(info));
5050     - if (arch_unw_user_mode(info))
5051     - break;
5052     - }
5053     - return n;
5054     -}
5055     +#define MSG(txt) ops->warning(data, txt)
5056    
5057     /*
5058     * x86-64 can have upto three kernel stacks:
5059     @@ -250,61 +229,24 @@
5060     return p > t && p < t + THREAD_SIZE - 3;
5061     }
5062    
5063     -void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack,
5064     +void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
5065     + unsigned long *stack,
5066     struct stacktrace_ops *ops, void *data)
5067     {
5068     - const unsigned cpu = smp_processor_id();
5069     - unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
5070     + const unsigned cpu = get_cpu();
5071     + unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr;
5072     unsigned used = 0;
5073     struct thread_info *tinfo;
5074    
5075     if (!tsk)
5076     tsk = current;
5077    
5078     - if (call_trace >= 0) {
5079     - int unw_ret = 0;
5080     - struct unwind_frame_info info;
5081     - struct ops_and_data oad = { .ops = ops, .data = data };
5082     -
5083     - if (regs) {
5084     - if (unwind_init_frame_info(&info, tsk, regs) == 0)
5085     - unw_ret = dump_trace_unwind(&info, &oad);
5086     - } else if (tsk == current)
5087     - unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad);
5088     - else {
5089     - if (unwind_init_blocked(&info, tsk) == 0)
5090     - unw_ret = dump_trace_unwind(&info, &oad);
5091     - }
5092     - if (unw_ret > 0) {
5093     - if (call_trace == 1 && !arch_unw_user_mode(&info)) {
5094     - ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n",
5095     - UNW_PC(&info));
5096     - if ((long)UNW_SP(&info) < 0) {
5097     - ops->warning(data, "Leftover inexact backtrace:\n");
5098     - stack = (unsigned long *)UNW_SP(&info);
5099     - if (!stack)
5100     - return;
5101     - } else
5102     - ops->warning(data, "Full inexact backtrace again:\n");
5103     - } else if (call_trace >= 1)
5104     - return;
5105     - else
5106     - ops->warning(data, "Full inexact backtrace again:\n");
5107     - } else
5108     - ops->warning(data, "Inexact backtrace:\n");
5109     - }
5110     if (!stack) {
5111     unsigned long dummy;
5112     stack = &dummy;
5113     if (tsk && tsk != current)
5114     stack = (unsigned long *)tsk->thread.rsp;
5115     }
5116     - /*
5117     - * Align the stack pointer on word boundary, later loops
5118     - * rely on that (and corruption / debug info bugs can cause
5119     - * unaligned values here):
5120     - */
5121     - stack = (unsigned long *)((unsigned long)stack & ~(sizeof(long)-1));
5122    
5123     /*
5124     * Print function call entries within a stack. 'cond' is the
5125     @@ -314,9 +256,9 @@
5126     #define HANDLE_STACK(cond) \
5127     do while (cond) { \
5128     unsigned long addr = *stack++; \
5129     - if (oops_in_progress ? \
5130     - __kernel_text_address(addr) : \
5131     - kernel_text_address(addr)) { \
5132     + /* Use unlocked access here because, except for NMIs, \
5133     + we should already be protected against module unloads */ \
5134     + if (__kernel_text_address(addr)) { \
5135     /* \
5136     * If the address is either in the text segment of the \
5137     * kernel, or in the region which contains vmalloc'ed \
5138     @@ -379,9 +321,10 @@
5139     /*
5140     * This handles the process stack:
5141     */
5142     - tinfo = current_thread_info();
5143     + tinfo = task_thread_info(tsk);
5144     HANDLE_STACK (valid_stack_ptr(tinfo, stack));
5145     #undef HANDLE_STACK
5146     + put_cpu();
5147     }
5148     EXPORT_SYMBOL(dump_trace);
5149    
5150     @@ -518,30 +461,15 @@
5151     printk("\n");
5152     }
5153    
5154     -void handle_BUG(struct pt_regs *regs)
5155     -{
5156     - struct bug_frame f;
5157     - long len;
5158     - const char *prefix = "";
5159     +int is_valid_bugaddr(unsigned long rip)
5160     +{
5161     + unsigned short ud2;
5162    
5163     - if (user_mode(regs))
5164     - return;
5165     - if (__copy_from_user(&f, (const void __user *) regs->rip,
5166     - sizeof(struct bug_frame)))
5167     - return;
5168     - if (f.filename >= 0 ||
5169     - f.ud2[0] != 0x0f || f.ud2[1] != 0x0b)
5170     - return;
5171     - len = __strnlen_user((char *)(long)f.filename, PATH_MAX) - 1;
5172     - if (len < 0 || len >= PATH_MAX)
5173     - f.filename = (int)(long)"unmapped filename";
5174     - else if (len > 50) {
5175     - f.filename += len - 50;
5176     - prefix = "...";
5177     - }
5178     - printk("----------- [cut here ] --------- [please bite here ] ---------\n");
5179     - printk(KERN_ALERT "Kernel BUG at %s%.50s:%d\n", prefix, (char *)(long)f.filename, f.line);
5180     -}
5181     + if (__copy_from_user(&ud2, (const void __user *) rip, sizeof(ud2)))
5182     + return 0;
5183     +
5184     + return ud2 == 0x0b0f;
5185     +}
5186    
5187     #ifdef CONFIG_BUG
5188     void out_of_line_bug(void)
5189     @@ -621,7 +549,9 @@
5190     {
5191     unsigned long flags = oops_begin();
5192    
5193     - handle_BUG(regs);
5194     + if (!user_mode(regs))
5195     + report_bug(regs->rip);
5196     +
5197     __die(str, regs, err);
5198     oops_end(flags);
5199     do_exit(SIGSEGV);
5200     @@ -790,8 +720,7 @@
5201     {
5202     printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
5203     reason);
5204     - printk(KERN_EMERG "You probably have a hardware problem with your "
5205     - "RAM chips\n");
5206     + printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
5207    
5208     if (panic_on_unrecovered_nmi)
5209     panic("NMI: Not continuing");
5210     @@ -1227,21 +1156,3 @@
5211     return 0;
5212     }
5213     early_param("kstack", kstack_setup);
5214     -
5215     -#ifdef CONFIG_STACK_UNWIND
5216     -static int __init call_trace_setup(char *s)
5217     -{
5218     - if (!s)
5219     - return -EINVAL;
5220     - if (strcmp(s, "old") == 0)
5221     - call_trace = -1;
5222     - else if (strcmp(s, "both") == 0)
5223     - call_trace = 0;
5224     - else if (strcmp(s, "newfallback") == 0)
5225     - call_trace = 1;
5226     - else if (strcmp(s, "new") == 0)
5227     - call_trace = 2;
5228     - return 0;
5229     -}
5230     -early_param("call_trace", call_trace_setup);
5231     -#endif
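One behavioural detail in the dump_trace() hunk above: smp_processor_id() is replaced by a get_cpu()/put_cpu() pair, so preemption stays disabled while the per-CPU irqstack pointer is dereferenced and the stacks are walked. The general pattern (an illustrative sketch, not part of the patch):

	const unsigned cpu = get_cpu();	/* pins us to this CPU (no preemption) */
	unsigned long *irqstack_end =
		(unsigned long *)cpu_pda(cpu)->irqstackptr;
	/* ... walk exception, irq and process stacks ... */
	put_cpu();			/* re-enable preemption */
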
5232     --- a/arch/x86/kernel/vmlinux_32.lds.S
5233     +++ b/arch/x86/kernel/vmlinux_32.lds.S
5234     @@ -29,6 +29,12 @@
5235     SECTIONS
5236     {
5237     . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
5238     +
5239     +#if defined(CONFIG_XEN) && CONFIG_XEN_COMPAT <= 0x030002
5240     +#undef LOAD_OFFSET
5241     +#define LOAD_OFFSET 0
5242     +#endif
5243     +
5244     phys_startup_32 = startup_32 - LOAD_OFFSET;
5245    
5246     .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
5247     --- a/arch/x86/kernel/vsyscall_64-xen.c
5248     +++ b/arch/x86/kernel/vsyscall_64-xen.c
5249     @@ -42,6 +42,7 @@
5250     #include <asm/topology.h>
5251    
5252     #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
5253     +#define __syscall_clobber "r11","rcx","memory"
5254    
5255     int __sysctl_vsyscall __section_sysctl_vsyscall = 1;
5256     seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED;
5257     @@ -224,8 +225,7 @@
5258    
5259     static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen,
5260     void __user *oldval, size_t __user *oldlenp,
5261     - void __user *newval, size_t newlen,
5262     - void **context)
5263     + void __user *newval, size_t newlen)
5264     {
5265     return -ENOSYS;
5266     }
5267     @@ -277,7 +277,6 @@
5268     vsyscall_set_cpu(raw_smp_processor_id());
5269     }
5270    
5271     -#ifdef CONFIG_HOTPLUG_CPU
5272     static int __cpuinit
5273     cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
5274     {
5275     @@ -286,13 +285,13 @@
5276     smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1);
5277     return NOTIFY_DONE;
5278     }
5279     -#endif
5280    
5281     static void __init map_vsyscall(void)
5282     {
5283     extern char __vsyscall_0;
5284     unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
5285    
5286     + /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
5287     __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
5288     }
5289    
5290     --- a/arch/x86/kvm/Kconfig
5291     +++ b/arch/x86/kvm/Kconfig
5292     @@ -7,6 +7,7 @@
5293     menuconfig VIRTUALIZATION
5294     bool "Virtualization"
5295     depends on HAVE_KVM || X86
5296     + depends on !XEN
5297     default y
5298     ---help---
5299     Say Y here to get to see options for using your Linux host to run other
5300     --- a/arch/x86/mm/fault_32-xen.c
5301     +++ b/arch/x86/mm/fault_32-xen.c
5302     @@ -22,9 +22,9 @@
5303     #include <linux/highmem.h>
5304     #include <linux/module.h>
5305     #include <linux/kprobes.h>
5306     +#include <linux/uaccess.h>
5307    
5308     #include <asm/system.h>
5309     -#include <asm/uaccess.h>
5310     #include <asm/desc.h>
5311     #include <asm/kdebug.h>
5312     #include <asm/segment.h>
5313     @@ -167,7 +167,7 @@
5314     static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
5315     {
5316     unsigned long limit;
5317     - unsigned long instr = get_segment_eip (regs, &limit);
5318     + unsigned char *instr = (unsigned char *)get_segment_eip (regs, &limit);
5319     int scan_more = 1;
5320     int prefetch = 0;
5321     int i;
5322     @@ -177,9 +177,9 @@
5323     unsigned char instr_hi;
5324     unsigned char instr_lo;
5325    
5326     - if (instr > limit)
5327     + if (instr > (unsigned char *)limit)
5328     break;
5329     - if (__get_user(opcode, (unsigned char __user *) instr))
5330     + if (probe_kernel_address(instr, opcode))
5331     break;
5332    
5333     instr_hi = opcode & 0xf0;
5334     @@ -204,9 +204,9 @@
5335     case 0x00:
5336     /* Prefetch instruction is 0x0F0D or 0x0F18 */
5337     scan_more = 0;
5338     - if (instr > limit)
5339     + if (instr > (unsigned char *)limit)
5340     break;
5341     - if (__get_user(opcode, (unsigned char __user *) instr))
5342     + if (probe_kernel_address(instr, opcode))
5343     break;
5344     prefetch = (instr_lo == 0xF) &&
5345     (opcode == 0x0D || opcode == 0x18);
5346     --- a/arch/x86/mm/fault_64-xen.c
5347     +++ b/arch/x86/mm/fault_64-xen.c
5348     @@ -23,9 +23,9 @@
5349     #include <linux/compiler.h>
5350     #include <linux/module.h>
5351     #include <linux/kprobes.h>
5352     +#include <linux/uaccess.h>
5353    
5354     #include <asm/system.h>
5355     -#include <asm/uaccess.h>
5356     #include <asm/pgalloc.h>
5357     #include <asm/smp.h>
5358     #include <asm/tlbflush.h>
5359     @@ -96,7 +96,7 @@
5360     static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
5361     unsigned long error_code)
5362     {
5363     - unsigned char __user *instr;
5364     + unsigned char *instr;
5365     int scan_more = 1;
5366     int prefetch = 0;
5367     unsigned char *max_instr;
5368     @@ -116,7 +116,7 @@
5369     unsigned char instr_hi;
5370     unsigned char instr_lo;
5371    
5372     - if (__get_user(opcode, (char __user *)instr))
5373     + if (probe_kernel_address(instr, opcode))
5374     break;
5375    
5376     instr_hi = opcode & 0xf0;
5377     @@ -154,7 +154,7 @@
5378     case 0x00:
5379     /* Prefetch instruction is 0x0F0D or 0x0F18 */
5380     scan_more = 0;
5381     - if (__get_user(opcode, (char __user *)instr))
5382     + if (probe_kernel_address(instr, opcode))
5383     break;
5384     prefetch = (instr_lo == 0xF) &&
5385     (opcode == 0x0D || opcode == 0x18);
5386     @@ -170,7 +170,7 @@
5387     static int bad_address(void *p)
5388     {
5389     unsigned long dummy;
5390     - return __get_user(dummy, (unsigned long __user *)p);
5391     + return probe_kernel_address((unsigned long *)p, dummy);
5392     }
5393    
5394     void dump_pagetable(unsigned long address)
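Both fault-path files above make the same substitution: __get_user() on a kernel pointer becomes probe_kernel_address(), which wraps the access in pagefault_disable()/pagefault_enable() and returns nonzero if the address faults, instead of pretending the pointer is a userspace one. The calling convention, as used in the hunks above (sketch):

	unsigned char opcode;

	/* probe_kernel_address(addr, var): safe read of a kernel address;
	 * 0 on success, nonzero if the pointer is unreadable. */
	if (probe_kernel_address(instr, opcode))
		break;	/* unreadable: stop scanning for a prefetch insn */
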
5395     --- a/arch/x86/mm/highmem_32-xen.c
5396     +++ b/arch/x86/mm/highmem_32-xen.c
5397     @@ -32,7 +32,7 @@
5398     unsigned long vaddr;
5399    
5400     /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
5401     - inc_preempt_count();
5402     + pagefault_disable();
5403     if (!PageHighMem(page))
5404     return page_address(page);
5405    
5406     @@ -63,26 +63,22 @@
5407     unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
5408     enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
5409    
5410     -#ifdef CONFIG_DEBUG_HIGHMEM
5411     - if (vaddr >= PAGE_OFFSET && vaddr < (unsigned long)high_memory) {
5412     - dec_preempt_count();
5413     - preempt_check_resched();
5414     - return;
5415     - }
5416     -
5417     - if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx))
5418     - BUG();
5419     -#endif
5420     /*
5421     * Force other mappings to Oops if they'll try to access this pte
5422     * without first remap it. Keeping stale mappings around is a bad idea
5423     * also, in case the page changes cacheability attributes or becomes
5424     * a protected page in a hypervisor.
5425     */
5426     - kpte_clear_flush(kmap_pte-idx, vaddr);
5427     + if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx))
5428     + kpte_clear_flush(kmap_pte-idx, vaddr);
5429     + else {
5430     +#ifdef CONFIG_DEBUG_HIGHMEM
5431     + BUG_ON(vaddr < PAGE_OFFSET);
5432     + BUG_ON(vaddr >= (unsigned long)high_memory);
5433     +#endif
5434     + }
5435    
5436     - dec_preempt_count();
5437     - preempt_check_resched();
5438     + pagefault_enable();
5439     }
5440    
5441     /* This is the same as kmap_atomic() but can map memory that doesn't
5442     @@ -93,7 +89,7 @@
5443     enum fixed_addresses idx;
5444     unsigned long vaddr;
5445    
5446     - inc_preempt_count();
5447     + pagefault_disable();
5448    
5449     idx = type + KM_TYPE_NR*smp_processor_id();
5450     vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
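The kmap_atomic()/kunmap_atomic() paths above move to the 2.6.20 pagefault helpers instead of touching the preempt count directly; the old open-coded sequences map onto them roughly as follows (sketch):

	pagefault_disable();		/* replaces inc_preempt_count() */
	/* ... use the temporary fixmap mapping ... */
	pagefault_enable();		/* replaces dec_preempt_count() +
					 * preempt_check_resched() */
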
5451     --- a/arch/x86/mm/init_32-xen.c
5452     +++ b/arch/x86/mm/init_32-xen.c
5453     @@ -235,8 +235,6 @@
5454    
5455     #endif
5456    
5457     -extern int is_available_memory(efi_memory_desc_t *);
5458     -
5459     int page_is_ram(unsigned long pagenr)
5460     {
5461     int i;
5462     @@ -329,7 +327,7 @@
5463     SetPageReserved(page);
5464     }
5465    
5466     -static int add_one_highpage_hotplug(struct page *page, unsigned long pfn)
5467     +static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long pfn)
5468     {
5469     free_new_highpage(page, pfn);
5470     totalram_pages++;
5471     @@ -346,7 +344,7 @@
5472     * has been added dynamically that would be
5473     * onlined here is in HIGHMEM
5474     */
5475     -void online_page(struct page *page)
5476     +void __meminit online_page(struct page *page)
5477     {
5478     ClearPageReserved(page);
5479     add_one_highpage_hotplug(page, page_to_pfn(page));
5480     @@ -739,16 +737,10 @@
5481     set_bit(PG_pinned, &virt_to_page(init_mm.pgd)->flags);
5482     }
5483    
5484     -/*
5485     - * this is for the non-NUMA, single node SMP system case.
5486     - * Specifically, in the case of x86, we will always add
5487     - * memory to the highmem for now.
5488     - */
5489     #ifdef CONFIG_MEMORY_HOTPLUG
5490     -#ifndef CONFIG_NEED_MULTIPLE_NODES
5491     int arch_add_memory(int nid, u64 start, u64 size)
5492     {
5493     - struct pglist_data *pgdata = &contig_page_data;
5494     + struct pglist_data *pgdata = NODE_DATA(nid);
5495     struct zone *zone = pgdata->node_zones + ZONE_HIGHMEM;
5496     unsigned long start_pfn = start >> PAGE_SHIFT;
5497     unsigned long nr_pages = size >> PAGE_SHIFT;
5498     @@ -760,11 +752,11 @@
5499     {
5500     return -EINVAL;
5501     }
5502     -#endif
5503     +EXPORT_SYMBOL_GPL(remove_memory);
5504     #endif
5505    
5506     -kmem_cache_t *pgd_cache;
5507     -kmem_cache_t *pmd_cache;
5508     +struct kmem_cache *pgd_cache;
5509     +struct kmem_cache *pmd_cache;
5510    
5511     void __init pgtable_cache_init(void)
5512     {
5513     --- a/arch/x86/mm/init_64-xen.c
5514     +++ b/arch/x86/mm/init_64-xen.c
5515     @@ -1130,14 +1130,15 @@
5516     __initcall(x8664_sysctl_init);
5517     #endif
5518    
5519     -/* A pseudo VMAs to allow ptrace access for the vsyscall page. This only
5520     +/* A pseudo VMA to allow ptrace access for the vsyscall page. This only
5521     covers the 64bit vsyscall page now. 32bit has a real VMA now and does
5522     not need special handling anymore. */
5523    
5524     static struct vm_area_struct gate_vma = {
5525     .vm_start = VSYSCALL_START,
5526     - .vm_end = VSYSCALL_END,
5527     - .vm_page_prot = PAGE_READONLY
5528     + .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT),
5529     + .vm_page_prot = PAGE_READONLY_EXEC,
5530     + .vm_flags = VM_READ | VM_EXEC
5531     };
5532    
5533     struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
5534     --- a/arch/x86/mm/pageattr_64-xen.c
5535     +++ b/arch/x86/mm/pageattr_64-xen.c
5536     @@ -324,34 +324,40 @@
5537     return base;
5538     }
5539    
5540     -
5541     -static void flush_kernel_map(void *address)
5542     +static void cache_flush_page(void *adr)
5543     {
5544     - if (0 && address && cpu_has_clflush) {
5545     - /* is this worth it? */
5546     - int i;
5547     - for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size)
5548     - asm volatile("clflush (%0)" :: "r" (address + i));
5549     - } else
5550     - asm volatile("wbinvd":::"memory");
5551     - if (address)
5552     - __flush_tlb_one(address);
5553     - else
5554     - __flush_tlb_all();
5555     + int i;
5556     + for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size)
5557     + asm volatile("clflush (%0)" :: "r" (adr + i));
5558     }
5559    
5560     +static void flush_kernel_map(void *arg)
5561     +{
5562     + struct list_head *l = (struct list_head *)arg;
5563     + struct page *pg;
5564    
5565     -static inline void flush_map(unsigned long address)
5566     + /* When clflush is available always use it because it is
5567     + /* When clflush is available, always use it because it is
5568     + if (!cpu_has_clflush)
5569     + asm volatile("wbinvd" ::: "memory");
5570     + list_for_each_entry(pg, l, lru) {
5571     + void *adr = page_address(pg);
5572     + if (cpu_has_clflush)
5573     + cache_flush_page(adr);
5574     + __flush_tlb_one(adr);
5575     + }
5576     +}
5577     +
5578     +static inline void flush_map(struct list_head *l)
5579     {
5580     - on_each_cpu(flush_kernel_map, (void *)address, 1, 1);
5581     + on_each_cpu(flush_kernel_map, l, 1, 1);
5582     }
5583    
5584     -static struct page *deferred_pages; /* protected by init_mm.mmap_sem */
5585     +static LIST_HEAD(deferred_pages); /* protected by init_mm.mmap_sem */
5586    
5587     static inline void save_page(struct page *fpage)
5588     {
5589     - fpage->lru.next = (struct list_head *)deferred_pages;
5590     - deferred_pages = fpage;
5591     + list_add(&fpage->lru, &deferred_pages);
5592     }
5593    
5594     /*
5595     @@ -481,18 +487,18 @@
5596    
5597     void global_flush_tlb(void)
5598     {
5599     - struct page *dpage;
5600     + struct page *pg, *next;
5601     + struct list_head l;
5602    
5603     down_read(&init_mm.mmap_sem);
5604     - dpage = xchg(&deferred_pages, NULL);
5605     + list_replace_init(&deferred_pages, &l);
5606     up_read(&init_mm.mmap_sem);
5607    
5608     - flush_map((dpage && !dpage->lru.next) ? (unsigned long)page_address(dpage) : 0);
5609     - while (dpage) {
5610     - struct page *tmp = dpage;
5611     - dpage = (struct page *)dpage->lru.next;
5612     - ClearPagePrivate(tmp);
5613     - __free_page(tmp);
5614     + flush_map(&l);
5615     +
5616     + list_for_each_entry_safe(pg, next, &l, lru) {
5617     + ClearPagePrivate(pg);
5618     + __free_page(pg);
5619     }
5620     }
5621    
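The global_flush_tlb() rewrite above adopts a common detach-then-drain list pattern: list_replace_init() empties the shared deferred_pages list while the semaphore is held, and the private copy is then flushed and freed without the lock. A condensed sketch of the same pattern, using the patch's own symbols:

	struct page *pg, *next;
	LIST_HEAD(l);

	down_read(&init_mm.mmap_sem);
	list_replace_init(&deferred_pages, &l);	/* deferred_pages is now empty */
	up_read(&init_mm.mmap_sem);

	flush_map(&l);				/* flush while entries are still live */
	list_for_each_entry_safe(pg, next, &l, lru) {	/* _safe: we free as we go */
		ClearPagePrivate(pg);
		__free_page(pg);
	}
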
5622     --- a/arch/x86/mm/pgtable_32-xen.c
5623     +++ b/arch/x86/mm/pgtable_32-xen.c
5624     @@ -197,7 +197,7 @@
5625     __free_page(pte);
5626     }
5627    
5628     -void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags)
5629     +void pmd_ctor(void *pmd, struct kmem_cache *cache, unsigned long flags)
5630     {
5631     memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
5632     }
5633     @@ -237,7 +237,7 @@
5634     set_page_private(next, (unsigned long)pprev);
5635     }
5636    
5637     -void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
5638     +void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused)
5639     {
5640     unsigned long flags;
5641    
5642     @@ -258,7 +258,7 @@
5643     }
5644    
5645     /* never called when PTRS_PER_PMD > 1 */
5646     -void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused)
5647     +void pgd_dtor(void *pgd, struct kmem_cache *cache, unsigned long unused)
5648     {
5649     unsigned long flags; /* can be called from interrupt context */
5650    
5651     --- a/arch/x86/pci/irq-xen.c
5652     +++ b/arch/x86/pci/irq-xen.c
5653     @@ -768,7 +768,7 @@
5654     DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n",
5655     rt->rtr_vendor, rt->rtr_device);
5656    
5657     - pirq_router_dev = pci_find_slot(rt->rtr_bus, rt->rtr_devfn);
5658     + pirq_router_dev = pci_get_bus_and_slot(rt->rtr_bus, rt->rtr_devfn);
5659     if (!pirq_router_dev) {
5660     DBG(KERN_DEBUG "PCI: Interrupt router not found at "
5661     "%02x:%02x\n", rt->rtr_bus, rt->rtr_devfn);
5662     @@ -788,6 +788,8 @@
5663     pirq_router_dev->vendor,
5664     pirq_router_dev->device,
5665     pci_name(pirq_router_dev));
5666     +
5667     + /* The device remains referenced for the kernel lifetime */
5668     }
5669    
5670     static struct irq_info *pirq_get_info(struct pci_dev *dev)
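pci_find_slot() is replaced above by pci_get_bus_and_slot() because the latter takes a reference on the device it returns. Callers that do not keep the device normally drop that reference again; here the router device is deliberately held for the kernel's lifetime, as the added comment notes. The usual shape (illustrative):

	struct pci_dev *dev = pci_get_bus_and_slot(bus, devfn);
	if (dev) {
		/* ... use dev ... */
		pci_dev_put(dev);	/* drop the reference when done */
	}
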
5671     --- a/drivers/xen/balloon/balloon.c
5672     +++ b/drivers/xen/balloon/balloon.c
5673     @@ -97,8 +97,8 @@
5674     static LIST_HEAD(ballooned_pages);
5675    
5676     /* Main work function, always executed in process context. */
5677     -static void balloon_process(void *unused);
5678     -static DECLARE_WORK(balloon_worker, balloon_process, NULL);
5679     +static void balloon_process(struct work_struct *unused);
5680     +static DECLARE_WORK(balloon_worker, balloon_process);
5681     static struct timer_list balloon_timer;
5682    
5683     /* When ballooning out (allocating memory to return to Xen) we don't really
5684     @@ -387,7 +387,7 @@
5685     * by the balloon lock), or with changes to the Xen hard limit, but we will
5686     * recover from these in time.
5687     */
5688     -static void balloon_process(void *unused)
5689     +static void balloon_process(struct work_struct *unused)
5690     {
5691     int need_sleep = 0;
5692     long credit;
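This balloon.c hunk is the first of many 2.6.20 workqueue conversions in this patch (blkfront, pciback, reboot and xenbus below follow the same shape): the callback now receives the work_struct itself rather than a void *data argument, and where it needs its enclosing object it recovers it with container_of(). A generic sketch of the pattern, with hypothetical struct and function names:

	struct foo {
		struct work_struct work;
		int state;
	};

	static void foo_work(struct work_struct *w)
	{
		/* recover the object the work_struct is embedded in */
		struct foo *f = container_of(w, struct foo, work);
		f->state++;
	}

	/* setup/queueing:
	 *	INIT_WORK(&f->work, foo_work);
	 *	schedule_work(&f->work);
	 */
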
5693     --- a/drivers/xen/blkback/blkback.c
5694     +++ b/drivers/xen/blkback/blkback.c
5695     @@ -37,6 +37,7 @@
5696    
5697     #include <linux/spinlock.h>
5698     #include <linux/kthread.h>
5699     +#include <linux/freezer.h>
5700     #include <linux/list.h>
5701     #include <linux/delay.h>
5702     #include <xen/balloon.h>
5703     --- a/drivers/xen/blkback/interface.c
5704     +++ b/drivers/xen/blkback/interface.c
5705     @@ -34,7 +34,7 @@
5706     #include <xen/evtchn.h>
5707     #include <linux/kthread.h>
5708    
5709     -static kmem_cache_t *blkif_cachep;
5710     +static struct kmem_cache *blkif_cachep;
5711    
5712     blkif_t *blkif_alloc(domid_t domid)
5713     {
5714     --- a/drivers/xen/blkfront/blkfront.c
5715     +++ b/drivers/xen/blkfront/blkfront.c
5716     @@ -70,7 +70,7 @@
5717     static void kick_pending_request_queues(struct blkfront_info *);
5718    
5719     static irqreturn_t blkif_int(int irq, void *dev_id);
5720     -static void blkif_restart_queue(void *arg);
5721     +static void blkif_restart_queue(struct work_struct *arg);
5722     static void blkif_recover(struct blkfront_info *);
5723     static void blkif_completion(struct blk_shadow *);
5724     static void blkif_free(struct blkfront_info *, int);
5725     @@ -105,7 +105,7 @@
5726     info->xbdev = dev;
5727     info->vdevice = vdevice;
5728     info->connected = BLKIF_STATE_DISCONNECTED;
5729     - INIT_WORK(&info->work, blkif_restart_queue, (void *)info);
5730     + INIT_WORK(&info->work, blkif_restart_queue);
5731    
5732     for (i = 0; i < BLK_RING_SIZE; i++)
5733     info->shadow[i].req.id = i+1;
5734     @@ -445,9 +445,9 @@
5735     }
5736     }
5737    
5738     -static void blkif_restart_queue(void *arg)
5739     +static void blkif_restart_queue(struct work_struct *arg)
5740     {
5741     - struct blkfront_info *info = (struct blkfront_info *)arg;
5742     + struct blkfront_info *info = container_of(arg, struct blkfront_info, work);
5743     spin_lock_irq(&blkif_io_lock);
5744     if (info->connected == BLKIF_STATE_CONNECTED)
5745     kick_pending_request_queues(info);
5746     --- a/drivers/xen/blktap/blktap.c
5747     +++ b/drivers/xen/blktap/blktap.c
5748     @@ -40,6 +40,7 @@
5749    
5750     #include <linux/spinlock.h>
5751     #include <linux/kthread.h>
5752     +#include <linux/freezer.h>
5753     #include <linux/list.h>
5754     #include <asm/hypervisor.h>
5755     #include "common.h"
5756     --- a/drivers/xen/blktap/interface.c
5757     +++ b/drivers/xen/blktap/interface.c
5758     @@ -34,7 +34,7 @@
5759     #include "common.h"
5760     #include <xen/evtchn.h>
5761    
5762     -static kmem_cache_t *blkif_cachep;
5763     +static struct kmem_cache *blkif_cachep;
5764    
5765     blkif_t *tap_alloc_blkif(domid_t domid)
5766     {
5767     --- a/drivers/xen/char/mem.c
5768     +++ b/drivers/xen/char/mem.c
5769     @@ -157,7 +157,7 @@
5770     {
5771     loff_t ret;
5772    
5773     - mutex_lock(&file->f_dentry->d_inode->i_mutex);
5774     + mutex_lock(&file->f_path.dentry->d_inode->i_mutex);
5775     switch (orig) {
5776     case 0:
5777     file->f_pos = offset;
5778     @@ -172,7 +172,7 @@
5779     default:
5780     ret = -EINVAL;
5781     }
5782     - mutex_unlock(&file->f_dentry->d_inode->i_mutex);
5783     + mutex_unlock(&file->f_path.dentry->d_inode->i_mutex);
5784     return ret;
5785     }
5786    
5787     --- a/drivers/xen/console/console.c
5788     +++ b/drivers/xen/console/console.c
5789     @@ -80,11 +80,6 @@
5790     #define XEN_XVC_MAJOR 204
5791     #define XEN_XVC_MINOR 191
5792    
5793     -#ifdef CONFIG_MAGIC_SYSRQ
5794     -static unsigned long sysrq_requested;
5795     -extern int sysrq_enabled;
5796     -#endif
5797     -
5798     static int __init xencons_setup(char *str)
5799     {
5800     char *q;
5801     @@ -339,8 +334,8 @@
5802     #define DUMMY_TTY(_tty) ((xc_mode == XC_TTY) && \
5803     ((_tty)->index != (xc_num - 1)))
5804    
5805     -static struct termios *xencons_termios[MAX_NR_CONSOLES];
5806     -static struct termios *xencons_termios_locked[MAX_NR_CONSOLES];
5807     +static struct ktermios *xencons_termios[MAX_NR_CONSOLES];
5808     +static struct ktermios *xencons_termios_locked[MAX_NR_CONSOLES];
5809     static struct tty_struct *xencons_tty;
5810     static int xencons_priv_irq;
5811     static char x_char;
5812     @@ -356,7 +351,9 @@
5813    
5814     for (i = 0; i < len; i++) {
5815     #ifdef CONFIG_MAGIC_SYSRQ
5816     - if (sysrq_enabled) {
5817     + if (sysrq_on()) {
5818     + static unsigned long sysrq_requested;
5819     +
5820     if (buf[i] == '\x0f') { /* ^O */
5821     if (!sysrq_requested) {
5822     sysrq_requested = jiffies;
5823     --- a/drivers/xen/core/reboot.c
5824     +++ b/drivers/xen/core/reboot.c
5825     @@ -30,8 +30,8 @@
5826     /* Can we leave APs online when we suspend? */
5827     static int fast_suspend;
5828    
5829     -static void __shutdown_handler(void *unused);
5830     -static DECLARE_WORK(shutdown_work, __shutdown_handler, NULL);
5831     +static void __shutdown_handler(struct work_struct *unused);
5832     +static DECLARE_DELAYED_WORK(shutdown_work, __shutdown_handler);
5833    
5834     int __xen_suspend(int fast_suspend, void (*resume_notifier)(void));
5835    
5836     @@ -96,7 +96,7 @@
5837     case SHUTDOWN_RESUMING:
5838     break;
5839     default:
5840     - schedule_work(&shutdown_work);
5841     + schedule_delayed_work(&shutdown_work, 0);
5842     break;
5843     }
5844    
5845     @@ -108,7 +108,7 @@
5846     return 0;
5847     }
5848    
5849     -static void __shutdown_handler(void *unused)
5850     +static void __shutdown_handler(struct work_struct *unused)
5851     {
5852     int err;
5853    
5854     @@ -169,7 +169,7 @@
5855     if (new_state != SHUTDOWN_INVALID) {
5856     old_state = xchg(&shutting_down, new_state);
5857     if (old_state == SHUTDOWN_INVALID)
5858     - schedule_work(&shutdown_work);
5859     + schedule_delayed_work(&shutdown_work, 0);
5860     else
5861     BUG_ON(old_state != SHUTDOWN_RESUMING);
5862     }
5863     --- a/drivers/xen/core/smpboot.c
5864     +++ b/drivers/xen/core/smpboot.c
5865     @@ -165,7 +165,12 @@
5866    
5867     void __cpuinit cpu_bringup(void)
5868     {
5869     +#ifdef __i386__
5870     + cpu_set_gdt(current_thread_info()->cpu);
5871     + secondary_cpu_init();
5872     +#else
5873     cpu_init();
5874     +#endif
5875     identify_cpu(cpu_data + smp_processor_id());
5876     touch_softlockup_watchdog();
5877     preempt_disable();
5878     @@ -304,11 +309,12 @@
5879     if (cpu == 0)
5880     continue;
5881    
5882     + idle = fork_idle(cpu);
5883     + if (IS_ERR(idle))
5884     + panic("failed fork for CPU %d", cpu);
5885     +
5886     #ifdef __x86_64__
5887     gdt_descr = &cpu_gdt_descr[cpu];
5888     -#else
5889     - gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
5890     -#endif
5891     gdt_descr->address = get_zeroed_page(GFP_KERNEL);
5892     if (unlikely(!gdt_descr->address)) {
5893     printk(KERN_CRIT "CPU%d failed to allocate GDT\n",
5894     @@ -317,6 +323,11 @@
5895     }
5896     gdt_descr->size = GDT_SIZE;
5897     memcpy((void *)gdt_descr->address, cpu_gdt_table, GDT_SIZE);
5898     +#else
5899     + if (unlikely(!init_gdt(cpu, idle)))
5900     + continue;
5901     + gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
5902     +#endif
5903     make_page_readonly(
5904     (void *)gdt_descr->address,
5905     XENFEAT_writable_descriptor_tables);
5906     @@ -336,10 +347,6 @@
5907     cpu_2_logical_apicid[cpu] = apicid;
5908     x86_cpu_to_apicid[cpu] = apicid;
5909    
5910     - idle = fork_idle(cpu);
5911     - if (IS_ERR(idle))
5912     - panic("failed fork for CPU %d", cpu);
5913     -
5914     #ifdef __x86_64__
5915     cpu_pda(cpu)->pcurrent = idle;
5916     cpu_pda(cpu)->cpunumber = cpu;
5917     --- a/drivers/xen/fbfront/xenfb.c
5918     +++ b/drivers/xen/fbfront/xenfb.c
5919     @@ -25,6 +25,7 @@
5920     #include <linux/vmalloc.h>
5921     #include <linux/mm.h>
5922     #include <linux/mutex.h>
5923     +#include <linux/freezer.h>
5924     #include <asm/hypervisor.h>
5925     #include <xen/evtchn.h>
5926     #include <xen/interface/io/fbif.h>
5927     --- a/drivers/xen/netback/loopback.c
5928     +++ b/drivers/xen/netback/loopback.c
5929     @@ -54,6 +54,7 @@
5930     #include <net/dst.h>
5931     #include <net/xfrm.h> /* secpath_reset() */
5932     #include <asm/hypervisor.h> /* is_initial_xendomain() */
5933     +#include <../net/core/kmap_skb.h> /* k{,un}map_skb_frag() */
5934    
5935     static int nloopbacks = -1;
5936     module_param(nloopbacks, int, 0);
5937     --- a/drivers/xen/pciback/conf_space_header.c
5938     +++ b/drivers/xen/pciback/conf_space_header.c
5939     @@ -22,14 +22,14 @@
5940     {
5941     int err;
5942    
5943     - if (!dev->is_enabled && is_enable_cmd(value)) {
5944     + if (!atomic_read(&dev->enable_cnt) && is_enable_cmd(value)) {
5945     if (unlikely(verbose_request))
5946     printk(KERN_DEBUG "pciback: %s: enable\n",
5947     pci_name(dev));
5948     err = pci_enable_device(dev);
5949     if (err)
5950     return err;
5951     - } else if (dev->is_enabled && !is_enable_cmd(value)) {
5952     + } else if (atomic_read(&dev->enable_cnt) && !is_enable_cmd(value)) {
5953     if (unlikely(verbose_request))
5954     printk(KERN_DEBUG "pciback: %s: disable\n",
5955     pci_name(dev));
5956     --- a/drivers/xen/pciback/pciback.h
5957     +++ b/drivers/xen/pciback/pciback.h
5958     @@ -88,7 +88,7 @@
5959    
5960     /* Handles events from front-end */
5961     irqreturn_t pciback_handle_event(int irq, void *dev_id);
5962     -void pciback_do_op(void *data);
5963     +void pciback_do_op(struct work_struct *work);
5964    
5965     int pciback_xenbus_register(void);
5966     void pciback_xenbus_unregister(void);
5967     --- a/drivers/xen/pciback/pciback_ops.c
5968     +++ b/drivers/xen/pciback/pciback_ops.c
5969     @@ -25,7 +25,7 @@
5970    
5971     pci_write_config_word(dev, PCI_COMMAND, 0);
5972    
5973     - dev->is_enabled = 0;
5974     + atomic_set(&dev->enable_cnt, 0);
5975     dev->is_busmaster = 0;
5976     } else {
5977     pci_read_config_word(dev, PCI_COMMAND, &cmd);
5978     @@ -51,9 +51,9 @@
5979     * context because some of the pci_* functions can sleep (mostly due to ACPI
5980     * use of semaphores). This function is intended to be called from a work
5981     * queue in process context taking a struct pciback_device as a parameter */
5982     -void pciback_do_op(void *data)
5983     +void pciback_do_op(struct work_struct *work)
5984     {
5985     - struct pciback_device *pdev = data;
5986     + struct pciback_device *pdev = container_of(work, struct pciback_device, op_work);
5987     struct pci_dev *dev;
5988     struct xen_pci_op *op = &pdev->sh_info->op;
5989    
5990     --- a/drivers/xen/pciback/xenbus.c
5991     +++ b/drivers/xen/pciback/xenbus.c
5992     @@ -32,7 +32,7 @@
5993     pdev->evtchn_irq = INVALID_EVTCHN_IRQ;
5994     pdev->be_watching = 0;
5995    
5996     - INIT_WORK(&pdev->op_work, pciback_do_op, pdev);
5997     + INIT_WORK(&pdev->op_work, pciback_do_op);
5998    
5999     if (pciback_init_devices(pdev)) {
6000     kfree(pdev);
6001     @@ -53,7 +53,6 @@
6002    
6003     /* If the driver domain started an op, make sure we complete it or
6004     * delete it before releasing the shared memory */
6005     - cancel_delayed_work(&pdev->op_work);
6006     flush_scheduled_work();
6007    
6008     if (pdev->sh_info)
6009     --- a/drivers/xen/sfc_netfront/accel_vi.c
6010     +++ b/drivers/xen/sfc_netfront/accel_vi.c
6011     @@ -463,7 +463,7 @@
6012    
6013     if (skb->ip_summed == CHECKSUM_PARTIAL) {
6014     /* Set to zero to encourage falcon to work it out for us */
6015     - *(u16*)(skb->h.raw + skb->csum) = 0;
6016     + *(u16*)(skb->h.raw + skb->csum_offset) = 0;
6017     }
6018    
6019     if (multi_post_start_new_buffer(vnic, &state)) {
6020     @@ -582,7 +582,7 @@
6021    
6022     if (skb->ip_summed == CHECKSUM_PARTIAL) {
6023     /* Set to zero to encourage falcon to work it out for us */
6024     - *(u16*)(skb->h.raw + skb->csum) = 0;
6025     + *(u16*)(skb->h.raw + skb->csum_offset) = 0;
6026     }
6027     NETFRONT_ACCEL_PKTBUFF_FOR_EACH_FRAGMENT
6028     (skb, idx, frag_data, frag_len, {
6029     --- a/drivers/xen/tpmback/interface.c
6030     +++ b/drivers/xen/tpmback/interface.c
6031     @@ -15,7 +15,7 @@
6032     #include <xen/balloon.h>
6033     #include <xen/gnttab.h>
6034    
6035     -static kmem_cache_t *tpmif_cachep;
6036     +static struct kmem_cache *tpmif_cachep;
6037     int num_frontends = 0;
6038    
6039     LIST_HEAD(tpmif_list);
6040     --- a/drivers/xen/xenbus/xenbus_comms.c
6041     +++ b/drivers/xen/xenbus/xenbus_comms.c
6042     @@ -49,9 +49,9 @@
6043    
6044     static int xenbus_irq;
6045    
6046     -extern void xenbus_probe(void *);
6047     +extern void xenbus_probe(struct work_struct *);
6048     extern int xenstored_ready;
6049     -static DECLARE_WORK(probe_work, xenbus_probe, NULL);
6050     +static DECLARE_WORK(probe_work, xenbus_probe);
6051    
6052     static DECLARE_WAIT_QUEUE_HEAD(xb_waitq);
6053    
6054     --- a/drivers/xen/xenbus/xenbus_probe.c
6055     +++ b/drivers/xen/xenbus/xenbus_probe.c
6056     @@ -840,7 +840,7 @@
6057     EXPORT_SYMBOL_GPL(unregister_xenstore_notifier);
6058    
6059    
6060     -void xenbus_probe(void *unused)
6061     +void xenbus_probe(struct work_struct *unused)
6062     {
6063     BUG_ON((xenstored_ready <= 0));
6064    
6065     --- a/include/asm-x86/mach-xen/asm/desc_32.h
6066     +++ b/include/asm-x86/mach-xen/asm/desc_32.h
6067     @@ -4,8 +4,6 @@
6068     #include <asm/ldt.h>
6069     #include <asm/segment.h>
6070    
6071     -#define CPU_16BIT_STACK_SIZE 1024
6072     -
6073     #ifndef __ASSEMBLY__
6074    
6075     #include <linux/preempt.h>
6076     @@ -15,8 +13,6 @@
6077    
6078     extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
6079    
6080     -DECLARE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
6081     -
6082     struct Xgt_desc_struct {
6083     unsigned short size;
6084     unsigned long address __attribute__((packed));
6085     @@ -32,11 +28,6 @@
6086     return (struct desc_struct *)per_cpu(cpu_gdt_descr, cpu).address;
6087     }
6088    
6089     -/*
6090     - * This is the ldt that every process will get unless we need
6091     - * something other than this.
6092     - */
6093     -extern struct desc_struct default_ldt[];
6094     extern struct desc_struct idt_table[];
6095     extern void set_intr_gate(unsigned int irq, void * addr);
6096    
6097     @@ -63,8 +54,8 @@
6098     #define DESCTYPE_DPL3 0x60 /* DPL-3 */
6099     #define DESCTYPE_S 0x10 /* !system */
6100    
6101     +#ifndef CONFIG_XEN
6102     #define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8))
6103     -#define load_LDT_desc() __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8))
6104    
6105     #define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
6106     #define load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr))
6107     @@ -75,6 +66,7 @@
6108     #define store_idt(dtr) __asm__ ("sidt %0":"=m" (*dtr))
6109     #define store_tr(tr) __asm__ ("str %0":"=m" (tr))
6110     #define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt))
6111     +#endif
6112    
6113     #if TLS_SIZE != 24
6114     # error update this code.
6115     @@ -90,22 +82,43 @@
6116     }
6117    
6118     #ifndef CONFIG_XEN
6119     +#define write_ldt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
6120     +#define write_gdt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
6121     +#define write_idt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
6122     +
6123     static inline void write_dt_entry(void *dt, int entry, __u32 entry_a, __u32 entry_b)
6124     {
6125     __u32 *lp = (__u32 *)((char *)dt + entry*8);
6126     *lp = entry_a;
6127     *(lp+1) = entry_b;
6128     }
6129     -
6130     -#define write_ldt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
6131     -#define write_gdt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
6132     +#define set_ldt native_set_ldt
6133     #else
6134     extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b);
6135     extern int write_gdt_entry(void *gdt, int entry, __u32 entry_a, __u32 entry_b);
6136     +#define set_ldt xen_set_ldt
6137     +#endif
6138     +
6139     +#ifndef CONFIG_XEN
6140     +static inline fastcall void native_set_ldt(const void *addr,
6141     + unsigned int entries)
6142     +{
6143     + if (likely(entries == 0))
6144     + __asm__ __volatile__("lldt %w0"::"q" (0));
6145     + else {
6146     + unsigned cpu = smp_processor_id();
6147     + __u32 a, b;
6148     +
6149     + pack_descriptor(&a, &b, (unsigned long)addr,
6150     + entries * sizeof(struct desc_struct) - 1,
6151     + DESCTYPE_LDT, 0);
6152     + write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, a, b);
6153     + __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
6154     + }
6155     +}
6156     #endif
6157     -#ifndef CONFIG_X86_NO_IDT
6158     -#define write_idt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
6159    
6160     +#ifndef CONFIG_X86_NO_IDT
6161     static inline void _set_gate(int gate, unsigned int type, void *addr, unsigned short seg)
6162     {
6163     __u32 a, b;
6164     @@ -125,14 +138,6 @@
6165     }
6166     #endif
6167    
6168     -static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int entries)
6169     -{
6170     - __u32 a, b;
6171     - pack_descriptor(&a, &b, (unsigned long)addr,
6172     - entries * sizeof(struct desc_struct) - 1,
6173     - DESCTYPE_LDT, 0);
6174     - write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, a, b);
6175     -}
6176    
6177     #define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
6178    
6179     @@ -163,36 +168,22 @@
6180    
6181     static inline void clear_LDT(void)
6182     {
6183     - int cpu = get_cpu();
6184     -
6185     - /*
6186     - * NB. We load the default_ldt for lcall7/27 handling on demand, as
6187     - * it slows down context switching. Noone uses it anyway.
6188     - */
6189     - cpu = cpu; /* XXX avoid compiler warning */
6190     - xen_set_ldt(NULL, 0);
6191     - put_cpu();
6192     + set_ldt(NULL, 0);
6193     }
6194    
6195     /*
6196     * load one particular LDT into the current CPU
6197     */
6198     -static inline void load_LDT_nolock(mm_context_t *pc, int cpu)
6199     +static inline void load_LDT_nolock(mm_context_t *pc)
6200     {
6201     - void *segments = pc->ldt;
6202     - int count = pc->size;
6203     -
6204     - if (likely(!count))
6205     - segments = NULL;
6206     -
6207     - xen_set_ldt(segments, count);
6208     + set_ldt(pc->ldt, pc->size);
6209     }
6210    
6211     static inline void load_LDT(mm_context_t *pc)
6212     {
6213     - int cpu = get_cpu();
6214     - load_LDT_nolock(pc, cpu);
6215     - put_cpu();
6216     + preempt_disable();
6217     + load_LDT_nolock(pc);
6218     + preempt_enable();
6219     }
6220    
6221     static inline unsigned long get_desc_base(unsigned long *desc)
6222     @@ -204,6 +195,29 @@
6223     return base;
6224     }
6225    
6226     +#else /* __ASSEMBLY__ */
6227     +
6228     +/*
6229     + * GET_DESC_BASE reads the descriptor base of the specified segment.
6230     + *
6231     + * Args:
6232     + * idx - descriptor index
6233     + * gdt - GDT pointer
6234     + * base - 32bit register to which the base will be written
6235     + * lo_w - lo word of the "base" register
6236     + * lo_b - lo byte of the "base" register
6237     + * hi_b - hi byte of the low word of the "base" register
6238     + *
6239     + * Example:
6240     + * GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah)
6241     + * Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax.
6242     + */
6243     +#define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \
6244     + movb idx*8+4(gdt), lo_b; \
6245     + movb idx*8+7(gdt), hi_b; \
6246     + shll $16, base; \
6247     + movw idx*8+2(gdt), lo_w;
6248     +
6249     #endif /* !__ASSEMBLY__ */
6250    
6251     #endif
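The GET_DESC_BASE assembly macro added above reassembles the 32-bit segment base from the three places a descriptor scatters it, matching the C get_desc_base() earlier in the header. The same computation in C, keyed to the byte offsets the macro reads (illustrative):

	/* base = d[7]<<24 | d[4]<<16 | d[3]<<8 | d[2] */
	static unsigned long desc_base(const unsigned char *d)
	{
		return ((unsigned long)d[7] << 24) |	/* base 31..24 */
		       ((unsigned long)d[4] << 16) |	/* base 23..16 */
		       ((unsigned long)d[3] <<  8) |	/* base 15..8  */
			(unsigned long)d[2];		/* base  7..0  */
	}
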
6252     --- a/include/asm-x86/mach-xen/asm/desc_64.h
6253     +++ b/include/asm-x86/mach-xen/asm/desc_64.h
6254     @@ -9,62 +9,11 @@
6255    
6256     #include <linux/string.h>
6257     #include <linux/smp.h>
6258     +#include <asm/desc_defs.h>
6259    
6260     #include <asm/segment.h>
6261     #include <asm/mmu.h>
6262    
6263     -// 8 byte segment descriptor
6264     -struct desc_struct {
6265     - u16 limit0;
6266     - u16 base0;
6267     - unsigned base1 : 8, type : 4, s : 1, dpl : 2, p : 1;
6268     - unsigned limit : 4, avl : 1, l : 1, d : 1, g : 1, base2 : 8;
6269     -} __attribute__((packed));
6270     -
6271     -struct n_desc_struct {
6272     - unsigned int a,b;
6273     -};
6274     -
6275     -enum {
6276     - GATE_INTERRUPT = 0xE,
6277     - GATE_TRAP = 0xF,
6278     - GATE_CALL = 0xC,
6279     -};
6280     -
6281     -// 16byte gate
6282     -struct gate_struct {
6283     - u16 offset_low;
6284     - u16 segment;
6285     - unsigned ist : 3, zero0 : 5, type : 5, dpl : 2, p : 1;
6286     - u16 offset_middle;
6287     - u32 offset_high;
6288     - u32 zero1;
6289     -} __attribute__((packed));
6290     -
6291     -#define PTR_LOW(x) ((unsigned long)(x) & 0xFFFF)
6292     -#define PTR_MIDDLE(x) (((unsigned long)(x) >> 16) & 0xFFFF)
6293     -#define PTR_HIGH(x) ((unsigned long)(x) >> 32)
6294     -
6295     -enum {
6296     - DESC_TSS = 0x9,
6297     - DESC_LDT = 0x2,
6298     -};
6299     -
6300     -// LDT or TSS descriptor in the GDT. 16 bytes.
6301     -struct ldttss_desc {
6302     - u16 limit0;
6303     - u16 base0;
6304     - unsigned base1 : 8, type : 5, dpl : 2, p : 1;
6305     - unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
6306     - u32 base3;
6307     - u32 zero1;
6308     -} __attribute__((packed));
6309     -
6310     -struct desc_ptr {
6311     - unsigned short size;
6312     - unsigned long address;
6313     -} __attribute__((packed)) ;
6314     -
6315     extern struct desc_ptr idt_descr, cpu_gdt_descr[NR_CPUS];
6316    
6317     extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
6318     --- a/include/asm-x86/mach-xen/asm/dma-mapping_32.h
6319     +++ b/include/asm-x86/mach-xen/asm/dma-mapping_32.h
6320     @@ -127,10 +127,10 @@
6321     return (1 << INTERNODE_CACHE_SHIFT);
6322     }
6323    
6324     -#define dma_is_consistent(d) (1)
6325     +#define dma_is_consistent(d, h) (1)
6326    
6327     static inline void
6328     -dma_cache_sync(void *vaddr, size_t size,
6329     +dma_cache_sync(struct device *dev, void *vaddr, size_t size,
6330     enum dma_data_direction direction)
6331     {
6332     flush_write_buffers();
6333     --- a/include/asm-x86/mach-xen/asm/dma-mapping_64.h
6334     +++ b/include/asm-x86/mach-xen/asm/dma-mapping_64.h
6335     @@ -64,6 +64,9 @@
6336     return (dma_addr == bad_dma_address);
6337     }
6338    
6339     +#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
6340     +#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
6341     +
6342     extern void *dma_alloc_coherent(struct device *dev, size_t size,
6343     dma_addr_t *dma_handle, gfp_t gfp);
6344     extern void dma_free_coherent(struct device *dev, size_t size, void *vaddr,
6345     @@ -181,12 +184,13 @@
6346     return boot_cpu_data.x86_clflush_size;
6347     }
6348    
6349     -#define dma_is_consistent(h) 1
6350     +#define dma_is_consistent(d, h) 1
6351    
6352     extern int dma_set_mask(struct device *dev, u64 mask);
6353    
6354     static inline void
6355     -dma_cache_sync(void *vaddr, size_t size, enum dma_data_direction dir)
6356     +dma_cache_sync(struct device *dev, void *vaddr, size_t size,
6357     + enum dma_data_direction dir)
6358     {
6359     flush_write_buffers();
6360     }
6361     --- a/include/asm-x86/mach-xen/asm/fixmap_32.h
6362     +++ b/include/asm-x86/mach-xen/asm/fixmap_32.h
6363     @@ -13,13 +13,16 @@
6364     #ifndef _ASM_FIXMAP_H
6365     #define _ASM_FIXMAP_H
6366    
6367     -
6368     /* used by vmalloc.c, vsyscall.lds.S.
6369     *
6370     * Leave one empty page between vmalloc'ed areas and
6371     * the start of the fixmap.
6372     */
6373     extern unsigned long __FIXADDR_TOP;
6374     +#ifdef CONFIG_COMPAT_VDSO
6375     +#define FIXADDR_USER_START __fix_to_virt(FIX_VDSO)
6376     +#define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1)
6377     +#endif
6378    
6379     #ifndef __ASSEMBLY__
6380     #include <linux/kernel.h>
6381     --- a/include/asm-x86/mach-xen/asm/hypervisor.h
6382     +++ b/include/asm-x86/mach-xen/asm/hypervisor.h
6383     @@ -45,15 +45,6 @@
6384     #include <xen/interface/nmi.h>
6385     #include <asm/ptrace.h>
6386     #include <asm/page.h>
6387     -#if defined(__i386__)
6388     -# ifdef CONFIG_X86_PAE
6389     -# include <asm-generic/pgtable-nopud.h>
6390     -# else
6391     -# include <asm-generic/pgtable-nopmd.h>
-# endif
-#elif defined(__x86_64__) && LINUX_VERSION_CODE < KERNEL_VERSION(2,6,11)
-# include <asm-generic/pgtable-nopud.h>
-#endif
 
 extern shared_info_t *HYPERVISOR_shared_info;
 
--- a/include/asm-x86/mach-xen/asm/io_32.h
+++ b/include/asm-x86/mach-xen/asm/io_32.h
@@ -269,11 +269,7 @@
 
 #endif /* __KERNEL__ */
 
-#ifdef SLOW_IO_BY_JUMPING
-#define __SLOW_DOWN_IO "jmp 1f; 1: jmp 1f; 1:"
-#else
 #define __SLOW_DOWN_IO "outb %%al,$0x80;"
-#endif
 
 static inline void slow_down_io(void) {
 	__asm__ __volatile__(
--- a/include/asm-x86/mach-xen/asm/irqflags_32.h
+++ b/include/asm-x86/mach-xen/asm/irqflags_32.h
@@ -22,9 +22,6 @@
 
 #define __raw_local_save_flags() (current_vcpu_info()->evtchn_upcall_mask)
 
-#define raw_local_save_flags(flags) \
-	do { (flags) = __raw_local_save_flags(); } while (0)
-
 #define raw_local_irq_restore(x) \
 do { \
 	vcpu_info_t *_vcpu; \
@@ -66,18 +63,6 @@
  */
 void halt(void);
 
-static inline int raw_irqs_disabled_flags(unsigned long flags)
-{
-	return (flags != 0);
-}
-
-#define raw_irqs_disabled() \
-({ \
-	unsigned long flags = __raw_local_save_flags(); \
- \
-	raw_irqs_disabled_flags(flags); \
-})
-
 /*
  * For spinlocks, etc:
  */
@@ -90,9 +75,62 @@
 	flags; \
 })
 
+#else
+/* Offsets into shared_info_t. */
+#define evtchn_upcall_pending /* 0 */
+#define evtchn_upcall_mask 1
+
+#define sizeof_vcpu_shift 6
+
+#ifdef CONFIG_SMP
+#define GET_VCPU_INFO	movl TI_cpu(%ebp),%esi ; \
+			shl $sizeof_vcpu_shift,%esi ; \
+			addl HYPERVISOR_shared_info,%esi
+#else
+#define GET_VCPU_INFO	movl HYPERVISOR_shared_info,%esi
+#endif
+
+#define __DISABLE_INTERRUPTS	movb $1,evtchn_upcall_mask(%esi)
+#define __ENABLE_INTERRUPTS	movb $0,evtchn_upcall_mask(%esi)
+#define __TEST_PENDING		testb $0xFF,evtchn_upcall_pending(%esi)
+#define DISABLE_INTERRUPTS(clb)	GET_VCPU_INFO ; \
+				__DISABLE_INTERRUPTS
+#define ENABLE_INTERRUPTS(clb)	GET_VCPU_INFO ; \
+				__ENABLE_INTERRUPTS
+#define ENABLE_INTERRUPTS_SYSEXIT	__ENABLE_INTERRUPTS ; \
+sysexit_scrit:	/**** START OF SYSEXIT CRITICAL REGION ****/ ; \
+	__TEST_PENDING ; \
+	jnz  14f	/* process more events if necessary... */ ; \
+	movl PT_ESI(%esp), %esi ; \
+	sysexit ; \
+14:	__DISABLE_INTERRUPTS ; \
+	TRACE_IRQS_OFF ; \
+sysexit_ecrit:	/**** END OF SYSEXIT CRITICAL REGION ****/ ; \
+	push %esp ; \
+	call evtchn_do_upcall ; \
+	add  $4,%esp ; \
+	jmp  ret_from_intr
+#define INTERRUPT_RETURN	iret
+#endif /* __ASSEMBLY__ */
+
+#ifndef __ASSEMBLY__
+#define raw_local_save_flags(flags) \
+	do { (flags) = __raw_local_save_flags(); } while (0)
+
 #define raw_local_irq_save(flags) \
 	do { (flags) = __raw_local_irq_save(); } while (0)
 
+static inline int raw_irqs_disabled_flags(unsigned long flags)
+{
+	return (flags != 0);
+}
+
+#define raw_irqs_disabled() \
+({ \
+	unsigned long flags = __raw_local_save_flags(); \
+ \
+	raw_irqs_disabled_flags(flags); \
+})
 #endif /* __ASSEMBLY__ */
 
 /*
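
The reshuffle above moves the C-level helpers below the new assembly macros so each sits under the proper __ASSEMBLY__ guard; the semantics are unchanged: under Xen the virtual "interrupt flag" is the vcpu's evtchn_upcall_mask byte, and a non-zero saved value means event delivery was masked. A minimal C sketch of those semantics, using a hypothetical cut-down vcpu structure rather than the real shared-info layout:

	/* Sketch only: models the evtchn-mask flag handling above. The real
	 * raw_local_irq_restore() additionally checks evtchn_upcall_pending
	 * and forces an upcall when unmasking. */
	struct sketch_vcpu_info { unsigned char evtchn_upcall_mask; };
	static struct sketch_vcpu_info vcpu;

	static unsigned long sketch_save_flags(void)
	{
		return vcpu.evtchn_upcall_mask;	/* non-zero == "IRQs off" */
	}

	static unsigned long sketch_irq_save(void)
	{
		unsigned long flags = sketch_save_flags();
		vcpu.evtchn_upcall_mask = 1;	/* mask event-channel delivery */
		return flags;
	}

	static void sketch_irq_restore(unsigned long flags)
	{
		vcpu.evtchn_upcall_mask = flags;	/* unmask only if it was 0 */
	}
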
--- a/include/asm-x86/mach-xen/asm/mmu_context_32.h
+++ b/include/asm-x86/mach-xen/asm/mmu_context_32.h
@@ -27,14 +27,13 @@
 static inline void __prepare_arch_switch(void)
 {
 	/*
-	 * Save away %fs and %gs. No need to save %es and %ds, as those
-	 * are always kernel segments while inside the kernel. Must
-	 * happen before reload of cr3/ldt (i.e., not in __switch_to).
+	 * Save away %fs. No need to save %gs, as it was saved on the
+	 * stack on entry. No need to save %es and %ds, as those are
+	 * always kernel segments while inside the kernel.
 	 */
-	asm volatile ( "mov %%fs,%0 ; mov %%gs,%1"
-		: "=m" (current->thread.fs),
-		  "=m" (current->thread.gs));
-	asm volatile ( "movl %0,%%fs ; movl %0,%%gs"
+	asm volatile ( "mov %%fs,%0"
+		: "=m" (current->thread.fs));
+	asm volatile ( "movl %0,%%fs"
 		: : "r" (0) );
 }
 
@@ -89,14 +88,14 @@
 		 * tlb flush IPI delivery. We must reload %cr3.
 		 */
 		load_cr3(next->pgd);
-		load_LDT_nolock(&next->context, cpu);
+		load_LDT_nolock(&next->context);
 	}
 }
 #endif
 }
 
-#define deactivate_mm(tsk, mm) \
-	asm("movl %0,%%fs ; movl %0,%%gs": :"r" (0))
+#define deactivate_mm(tsk, mm) \
+	asm("movl %0,%%fs": :"r" (0));
 
 static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
 {
--- a/include/asm-x86/mach-xen/asm/pgtable-2level.h
+++ b/include/asm-x86/mach-xen/asm/pgtable-2level.h
@@ -1,8 +1,6 @@
 #ifndef _I386_PGTABLE_2LEVEL_H
 #define _I386_PGTABLE_2LEVEL_H
 
-#include <asm-generic/pgtable-nopmd.h>
-
 #define pte_ERROR(e) \
 	printk("%s:%d: bad pte %08lx (pfn %05lx).\n", __FILE__, __LINE__, \
 	       __pte_val(e), pte_pfn(e))
@@ -23,26 +21,14 @@
 		set_pte((ptep), (pteval)); \
 } while (0)
 
-#define set_pte_atomic(pteptr, pteval) set_pte(pteptr,pteval)
-
 #define set_pmd(pmdptr, pmdval) xen_l2_entry_update((pmdptr), (pmdval))
 
+#define set_pte_atomic(pteptr, pteval) set_pte(pteptr,pteval)
+
 #define pte_clear(mm,addr,xp)	do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
 #define pmd_clear(xp)	do { set_pmd(xp, __pmd(0)); } while (0)
 
-#define pte_none(x) (!(x).pte_low)
-
-#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
-static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
-	pte_t pte = *ptep;
-	if (!pte_none(pte)) {
-		if ((mm != &init_mm) ||
-		    HYPERVISOR_update_va_mapping(addr, __pte(0), 0))
-			pte = __pte_ma(xchg(&ptep->pte_low, 0));
-	}
-	return pte;
-}
+#define raw_ptep_get_and_clear(xp, pte) __pte_ma(xchg(&(xp)->pte_low, 0))
 
 #define __HAVE_ARCH_PTEP_CLEAR_FLUSH
 #define ptep_clear_flush(vma, addr, ptep) \
@@ -69,6 +55,7 @@
 		__pte_mfn(_pte))
 
 #define pte_page(_pte) pfn_to_page(pte_pfn(_pte))
+#define pte_none(x) (!(x).pte_low)
 
 #define pfn_pte(pfn, prot)	__pte(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
 #define pfn_pmd(pfn, prot)	__pmd(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
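
With two-level page tables a PTE is a single 32-bit word, so the new raw_ptep_get_and_clear() above reduces to one atomic exchange of pte_low; the pte_none() test and the HYPERVISOR_update_va_mapping() fast path that the removed ptep_get_and_clear() used to perform move into a common wrapper in pgtable_32.h (see that hunk below). A sketch of why the exchange must be atomic, with illustrative names:

	/* Sketch: the CPU can set the accessed/dirty bits in a PTE behind
	 * the kernel's back, so a separate read-then-write could lose them;
	 * a single xchg both fetches the final value and clears the entry. */
	static inline unsigned int sketch_get_and_clear(unsigned int *pte_low)
	{
		unsigned int old;
		__asm__ __volatile__("xchgl %0,%1"
				     : "=r" (old), "+m" (*pte_low)
				     : "0" (0));	/* swap in 0, get old PTE */
		return old;
	}
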
--- a/include/asm-x86/mach-xen/asm/pgtable-3level.h
+++ b/include/asm-x86/mach-xen/asm/pgtable-3level.h
@@ -1,8 +1,6 @@
 #ifndef _I386_PGTABLE_3LEVEL_H
 #define _I386_PGTABLE_3LEVEL_H
 
-#include <asm-generic/pgtable-nopud.h>
-
 /*
  * Intel Physical Address Extension (PAE) Mode - three-level page
  * tables on PPro+ CPUs.
@@ -75,6 +73,23 @@
 		xen_l3_entry_update((pudptr), (pudval))
 
 /*
+ * For PTEs and PDEs, we must clear the P-bit first when clearing a page table
+ * entry, so clear the bottom half first and enforce ordering with a compiler
+ * barrier.
+ */
+static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+	if ((mm != current->mm && mm != &init_mm)
+	    || HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) {
+		ptep->pte_low = 0;
+		smp_wmb();
+		ptep->pte_high = 0;
+	}
+}
+
+#define pmd_clear(xp)	do { set_pmd(xp, __pmd(0)); } while (0)
+
+/*
  * Pentium-II erratum A13: in PAE mode we explicitly have to flush
  * the TLB via cr3 if the top-level pgd is changed...
  * We do not let the generic code free and clear pgd entries due to
@@ -93,45 +108,16 @@
 #define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \
 			pmd_index(address))
 
-static inline int pte_none(pte_t pte)
-{
-	return !(pte.pte_low | pte.pte_high);
-}
-
-/*
- * For PTEs and PDEs, we must clear the P-bit first when clearing a page table
- * entry, so clear the bottom half first and enforce ordering with a compiler
- * barrier.
- */
-static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+static inline pte_t raw_ptep_get_and_clear(pte_t *ptep, pte_t res)
 {
-	if ((mm != current->mm && mm != &init_mm)
-	    || HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) {
-		ptep->pte_low = 0;
-		smp_wmb();
+	uint64_t val = __pte_val(res);
+	if (__cmpxchg64(ptep, val, 0) != val) {
+		/* xchg acts as a barrier before the setting of the high bits */
+		res.pte_low = xchg(&ptep->pte_low, 0);
+		res.pte_high = ptep->pte_high;
 		ptep->pte_high = 0;
 	}
-}
-
-#define pmd_clear(xp)	do { set_pmd(xp, __pmd(0)); } while (0)
-
-#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
-static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
-	pte_t pte = *ptep;
-	if (!pte_none(pte)) {
-		if ((mm != &init_mm) ||
-		    HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) {
-			uint64_t val = __pte_val(pte);
-			if (__cmpxchg64(ptep, val, 0) != val) {
-				/* xchg acts as a barrier before the setting of the high bits */
-				pte.pte_low = xchg(&ptep->pte_low, 0);
-				pte.pte_high = ptep->pte_high;
-				ptep->pte_high = 0;
-			}
-		}
-	}
-	return pte;
+	return res;
 }
 
 #define __HAVE_ARCH_PTEP_CLEAR_FLUSH
@@ -160,6 +146,11 @@
 
 #define pte_page(x)	pfn_to_page(pte_pfn(x))
 
+static inline int pte_none(pte_t pte)
+{
+	return !(pte.pte_low | pte.pte_high);
+}
+
 #define __pte_mfn(_pte)	(((_pte).pte_low >> PAGE_SHIFT) | \
 			 ((_pte).pte_high << (32-PAGE_SHIFT)))
 #define pte_mfn(_pte)	((_pte).pte_low & _PAGE_PRESENT ? \
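
Under PAE an entry is 64 bits spread over two 32-bit words, which shapes both helpers above: pte_clear() must zero pte_low (home of _PAGE_PRESENT) first and fence with smp_wmb() so no walker ever sees a present entry with stale high bits, and raw_ptep_get_and_clear() first attempts a cmpxchg8b of the full expected value, falling back to an xchg of the low word when the entry changed underneath (e.g. hardware set the dirty bit). A compact sketch of the ordering hazard, with illustrative names:

	/* Sketch: why the low word goes first. If pte_high were zeroed
	 * before pte_low, another CPU could momentarily observe an entry
	 * still marked present but whose upper physical-address bits are
	 * already gone - a bogus mapping. */
	struct sketch_pae_pte { unsigned int pte_low, pte_high; };

	static void sketch_clear(volatile struct sketch_pae_pte *p)
	{
		p->pte_low = 0;		/* _PAGE_PRESENT vanishes here */
		__asm__ __volatile__("" ::: "memory");	/* what smp_wmb() reduces to on x86 */
		p->pte_high = 0;	/* now invisible to page walks */
	}
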
--- a/include/asm-x86/mach-xen/asm/pgtable_32.h
+++ b/include/asm-x86/mach-xen/asm/pgtable_32.h
@@ -38,14 +38,14 @@
 #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
 extern unsigned long empty_zero_page[1024];
 extern pgd_t *swapper_pg_dir;
-extern kmem_cache_t *pgd_cache;
-extern kmem_cache_t *pmd_cache;
+extern struct kmem_cache *pgd_cache;
+extern struct kmem_cache *pmd_cache;
 extern spinlock_t pgd_lock;
 extern struct page *pgd_list;
 
-void pmd_ctor(void *, kmem_cache_t *, unsigned long);
-void pgd_ctor(void *, kmem_cache_t *, unsigned long);
-void pgd_dtor(void *, kmem_cache_t *, unsigned long);
+void pmd_ctor(void *, struct kmem_cache *, unsigned long);
+void pgd_ctor(void *, struct kmem_cache *, unsigned long);
+void pgd_dtor(void *, struct kmem_cache *, unsigned long);
 void pgtable_cache_init(void);
 void paging_init(void);
 
@@ -276,7 +276,6 @@
 #define pte_update(mm, addr, ptep)		do { } while (0)
 #define pte_update_defer(mm, addr, ptep)	do { } while (0)
 
-
 /*
  * We only update the dirty/accessed state if we set
  * the dirty bit by hand in the kernel, since the hardware
@@ -342,6 +341,19 @@
 	__young;							\
 })
 
+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
+static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+	pte_t pte = *ptep;
+	if (!pte_none(pte)
+	    && (mm != &init_mm
+	        || HYPERVISOR_update_va_mapping(addr, __pte(0), 0))) {
+		pte = raw_ptep_get_and_clear(ptep, pte);
+		pte_update(mm, addr, ptep);
+	}
+	return pte;
+}
+
 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
 #define ptep_get_and_clear_full(mm, addr, ptep, full)			\
 	((full) ? ({							\
--- a/include/asm-x86/mach-xen/asm/pgtable_64.h
+++ b/include/asm-x86/mach-xen/asm/pgtable_64.h
@@ -236,19 +236,18 @@
 
 static inline unsigned long pgd_bad(pgd_t pgd)
 {
-	unsigned long val = __pgd_val(pgd);
-	val &= ~PTE_MASK;
-	val &= ~(_PAGE_USER | _PAGE_DIRTY);
-	return val & ~(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED);
+	return __pgd_val(pgd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
 }
 
-static inline unsigned long pud_bad(pud_t pud)
-{
-	unsigned long val = __pud_val(pud);
-	val &= ~PTE_MASK;
-	val &= ~(_PAGE_USER | _PAGE_DIRTY);
-	return val & ~(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED);
-}
+static inline unsigned long pud_bad(pud_t pud)
+{
+	return __pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
+}
+
+static inline unsigned long pmd_bad(pmd_t pmd)
+{
+	return __pmd_val(pmd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
+}
 
 #define set_pte_at(_mm,addr,ptep,pteval) do {				\
 	if (((_mm) != current->mm && (_mm) != &init_mm) ||		\
@@ -404,8 +403,6 @@
 #define pmd_present(x)	(__pmd_val(x) & _PAGE_PRESENT)
 #endif
 #define	pmd_clear(xp)	do { set_pmd(xp, __pmd(0)); } while (0)
-#define pmd_bad(x) ((__pmd_val(x) & ~(PTE_MASK | _PAGE_USER | _PAGE_PRESENT)) \
-		    != (_KERNPG_TABLE & ~(_PAGE_USER | _PAGE_PRESENT)))
#define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot)))
 #define pmd_pfn(x)  ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)
 
--- a/include/asm-x86/mach-xen/asm/processor_32.h
+++ b/include/asm-x86/mach-xen/asm/processor_32.h
@@ -20,6 +20,7 @@
 #include <linux/threads.h>
 #include <asm/percpu.h>
 #include <linux/cpumask.h>
+#include <linux/init.h>
 #include <xen/interface/physdev.h>
 
 /* flag for disabling the tsc */
@@ -73,6 +74,7 @@
 #endif
 	unsigned char x86_max_cores;	/* cpuid returned max cores value */
 	unsigned char apicid;
+	unsigned short x86_clflush_size;
 #ifdef CONFIG_SMP
 	unsigned char booted_cores;	/* number of cores as seen by OS */
 	__u8 phys_proc_id;		/* Physical processor id. */
@@ -114,6 +116,8 @@
 extern int cpu_llc_id[NR_CPUS];
 extern char ignore_fpu_irq;
 
+void __init cpu_detect(struct cpuinfo_x86 *c);
+
 extern void identify_cpu(struct cpuinfo_x86 *);
 extern void print_cpu_info(struct cpuinfo_x86 *);
 extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
@@ -146,8 +150,8 @@
 #define X86_EFLAGS_VIP	0x00100000 /* Virtual Interrupt Pending */
 #define X86_EFLAGS_ID	0x00200000 /* CPUID detection flag */
 
-static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
-			   unsigned int *ecx, unsigned int *edx)
+static inline fastcall void xen_cpuid(unsigned int *eax, unsigned int *ebx,
+				      unsigned int *ecx, unsigned int *edx)
 {
 	/* ecx is often an input as well as an output. */
 	__asm__(XEN_CPUID
@@ -158,59 +162,6 @@
 		: "0" (*eax), "2" (*ecx));
 }
 
-/*
- * Generic CPUID function
- * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
- * resulting in stale register contents being returned.
- */
-static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx)
-{
-	*eax = op;
-	*ecx = 0;
-	__cpuid(eax, ebx, ecx, edx);
-}
-
-/* Some CPUID calls want 'count' to be placed in ecx */
-static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx,
-			       int *edx)
-{
-	*eax = op;
-	*ecx = count;
-	__cpuid(eax, ebx, ecx, edx);
-}
-
-/*
- * CPUID functions returning a single datum
- */
-static inline unsigned int cpuid_eax(unsigned int op)
-{
-	unsigned int eax, ebx, ecx, edx;
-
-	cpuid(op, &eax, &ebx, &ecx, &edx);
-	return eax;
-}
-static inline unsigned int cpuid_ebx(unsigned int op)
-{
-	unsigned int eax, ebx, ecx, edx;
-
-	cpuid(op, &eax, &ebx, &ecx, &edx);
-	return ebx;
-}
-static inline unsigned int cpuid_ecx(unsigned int op)
-{
-	unsigned int eax, ebx, ecx, edx;
-
-	cpuid(op, &eax, &ebx, &ecx, &edx);
-	return ecx;
-}
-static inline unsigned int cpuid_edx(unsigned int op)
-{
-	unsigned int eax, ebx, ecx, edx;
-
-	cpuid(op, &eax, &ebx, &ecx, &edx);
-	return edx;
-}
-
 #define load_cr3(pgdir) write_cr3(__pa(pgdir))
 
 /*
@@ -480,9 +431,9 @@
 	.vm86_info = NULL,						\
 	.sysenter_cs = __KERNEL_CS,					\
 	.io_bitmap_ptr = NULL,						\
+	.gs = __KERNEL_PDA,						\
 }
 
-#ifndef CONFIG_X86_NO_TSS
 /*
  * Note that the .io_bitmap member must be extra-big. This is because
  * the CPU will access an additional byte beyond the end of the IO
@@ -497,26 +448,9 @@
 	.io_bitmap	= { [ 0 ... IO_BITMAP_LONGS] = ~0 },		\
 }
 
-static inline void __load_esp0(struct tss_struct *tss, struct thread_struct *thread)
-{
-	tss->esp0 = thread->esp0;
-	/* This can only happen when SEP is enabled, no need to test "SEP"arately */
-	if (unlikely(tss->ss1 != thread->sysenter_cs)) {
-		tss->ss1 = thread->sysenter_cs;
-		wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
-	}
-}
-#define load_esp0(tss, thread) \
-	__load_esp0(tss, thread)
-#else
-#define load_esp0(tss, thread) do { \
-	if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->esp0)) \
-		BUG(); \
-} while (0)
-#endif
-
 #define start_thread(regs, new_eip, new_esp) do {		\
-	__asm__("movl %0,%%fs ; movl %0,%%gs": :"r" (0));	\
+	__asm__("movl %0,%%fs": :"r" (0));			\
+	regs->xgs = 0;						\
 	set_fs(USER_DS);					\
 	regs->xds = __USER_DS;					\
 	regs->xes = __USER_DS;					\
@@ -526,26 +460,6 @@
 	regs->esp = new_esp;					\
 } while (0)
 
-/*
- * These special macros can be used to get or set a debugging register
- */
-#define get_debugreg(var, register)				\
-		(var) = HYPERVISOR_get_debugreg((register))
-#define set_debugreg(value, register)			\
-		WARN_ON(HYPERVISOR_set_debugreg((register), (value)))
-
-/*
- * Set IOPL bits in EFLAGS from given mask
- */
-static inline void set_iopl_mask(unsigned mask)
-{
-	struct physdev_set_iopl set_iopl;
-
-	/* Force the change at ring 0. */
-	set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
-	WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
-}
-
 /* Forward declaration, a strange C thing */
 struct task_struct;
 struct mm_struct;
@@ -637,6 +551,105 @@
 
 #define cpu_relax()	rep_nop()
 
+#define paravirt_enabled() 0
+#define __cpuid xen_cpuid
+
+#ifndef CONFIG_X86_NO_TSS
+static inline void __load_esp0(struct tss_struct *tss, struct thread_struct *thread)
+{
+	tss->esp0 = thread->esp0;
+	/* This can only happen when SEP is enabled, no need to test "SEP"arately */
+	if (unlikely(tss->ss1 != thread->sysenter_cs)) {
+		tss->ss1 = thread->sysenter_cs;
+		wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
+	}
+}
+#define load_esp0(tss, thread) \
+	__load_esp0(tss, thread)
+#else
+#define load_esp0(tss, thread) do { \
+	if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->esp0)) \
+		BUG(); \
+} while (0)
+#endif
+
+
+/*
+ * These special macros can be used to get or set a debugging register
+ */
+#define get_debugreg(var, register)				\
+		(var) = HYPERVISOR_get_debugreg(register)
+#define set_debugreg(value, register)			\
+		WARN_ON(HYPERVISOR_set_debugreg(register, value))
+
+#define set_iopl_mask xen_set_iopl_mask
+
+/*
+ * Set IOPL bits in EFLAGS from given mask
+ */
+static inline void xen_set_iopl_mask(unsigned mask)
+{
+	struct physdev_set_iopl set_iopl;
+
+	/* Force the change at ring 0. */
+	set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
+	WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
+}
+
+
+/*
+ * Generic CPUID function
+ * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
+ * resulting in stale register contents being returned.
+ */
+static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx)
+{
+	*eax = op;
+	*ecx = 0;
+	__cpuid(eax, ebx, ecx, edx);
+}
+
+/* Some CPUID calls want 'count' to be placed in ecx */
+static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx,
+			       int *edx)
+{
+	*eax = op;
+	*ecx = count;
+	__cpuid(eax, ebx, ecx, edx);
+}
+
+/*
+ * CPUID functions returning a single datum
+ */
+static inline unsigned int cpuid_eax(unsigned int op)
+{
+	unsigned int eax, ebx, ecx, edx;
+
+	cpuid(op, &eax, &ebx, &ecx, &edx);
+	return eax;
+}
+static inline unsigned int cpuid_ebx(unsigned int op)
+{
+	unsigned int eax, ebx, ecx, edx;
+
+	cpuid(op, &eax, &ebx, &ecx, &edx);
+	return ebx;
+}
+static inline unsigned int cpuid_ecx(unsigned int op)
+{
+	unsigned int eax, ebx, ecx, edx;
+
+	cpuid(op, &eax, &ebx, &ecx, &edx);
+	return ecx;
+}
+static inline unsigned int cpuid_edx(unsigned int op)
+{
+	unsigned int eax, ebx, ecx, edx;
+
+	cpuid(op, &eax, &ebx, &ecx, &edx);
+	return edx;
+}
+
 /* generic versions from gas */
 #define GENERIC_NOP1	".byte 0x90\n"
 #define GENERIC_NOP2	".byte 0x89,0xf6\n"
@@ -736,4 +749,8 @@
 extern void enable_sep_cpu(void);
 extern int sysenter_setup(void);
 
+extern int init_gdt(int cpu, struct task_struct *idle);
+extern void cpu_set_gdt(int);
+extern void secondary_cpu_init(void);
+
 #endif /* __ASM_I386_PROCESSOR_H */
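
Moving the generic cpuid()/cpuid_*() helpers below allows __cpuid to first be defined as xen_cpuid, mirroring the paravirt-ops layout of the native 2.6.20 header (paravirt_enabled() is stubbed to 0 here). Callers are unaffected; feature tests still read the leaf-1 registers as before. A small usage sketch (on x86, CPUID leaf 1 EDX bit 4 is the TSC feature bit):

	/* Sketch: a feature probe through the helpers above; under Xen the
	 * call bottoms out in xen_cpuid(), i.e. the XEN_CPUID sequence. */
	static inline int sketch_cpu_has_tsc(void)
	{
		return (cpuid_edx(1) >> 4) & 1;
	}
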
--- a/include/asm-x86/mach-xen/asm/processor_64.h
+++ b/include/asm-x86/mach-xen/asm/processor_64.h
@@ -484,6 +484,14 @@
 		: :"a" (eax), "c" (ecx));
 }
 
+static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
+{
+	/* "mwait %eax,%ecx;" */
+	asm volatile(
+		"sti; .byte 0x0f,0x01,0xc9;"
+		: :"a" (eax), "c" (ecx));
+}
+
 extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
 
 #define stack_current() \
--- a/include/asm-x86/mach-xen/asm/segment_32.h
+++ b/include/asm-x86/mach-xen/asm/segment_32.h
@@ -39,7 +39,7 @@
  *  25 - APM BIOS support
  *
  *  26 - ESPFIX small SS
- *  27 - unused
+ *  27 - PDA				[ per-cpu private data area ]
 *  28 - unused
 *  29 - unused
 *  30 - unused
@@ -74,6 +74,9 @@
 #define GDT_ENTRY_ESPFIX_SS		(GDT_ENTRY_KERNEL_BASE + 14)
 #define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
 
+#define GDT_ENTRY_PDA			(GDT_ENTRY_KERNEL_BASE + 15)
+#define __KERNEL_PDA (GDT_ENTRY_PDA * 8)
+
 #define GDT_ENTRY_DOUBLEFAULT_TSS	31
 
 /*
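
The selector arithmetic follows the slot listing above: ESPFIX at GDT_ENTRY_KERNEL_BASE + 14 occupying slot 26 implies the kernel base is 12, so GDT_ENTRY_PDA lands in slot 27 and the selector is 27 * 8 = 0xd8 (index << 3, TI = 0 for the GDT, RPL = 0). The same computation with hypothetical names:

	/* Sketch: assuming GDT_ENTRY_KERNEL_BASE == 12, as the slot map implies. */
	#define SKETCH_GDT_ENTRY_PDA	(12 + 15)			/* slot 27 */
	#define SKETCH_KERNEL_PDA	(SKETCH_GDT_ENTRY_PDA * 8)	/* 0xd8 */
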
--- a/include/asm-x86/mach-xen/asm/smp_32.h
+++ b/include/asm-x86/mach-xen/asm/smp_32.h
@@ -8,6 +8,7 @@
 #include <linux/kernel.h>
 #include <linux/threads.h>
 #include <linux/cpumask.h>
+#include <asm/pda.h>
 #endif
 
 #ifdef CONFIG_X86_LOCAL_APIC
@@ -56,7 +57,7 @@
 * from the initial startup. We map APIC_BASE very early in page_setup(),
 * so this is correct in the x86 case.
 */
-#define raw_smp_processor_id() (current_thread_info()->cpu)
+#define raw_smp_processor_id() (read_pda(cpu_number))
 
 extern cpumask_t cpu_possible_map;
 #define cpu_callin_map cpu_possible_map
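
raw_smp_processor_id() now comes from the per-CPU PDA rather than thread_info: each CPU's %gs holds the __KERNEL_PDA selector (see the segment_32.h hunk above), so a %gs-relative load fetches that CPU's cpu_number without first needing to locate the current stack. A rough sketch of what read_pda(cpu_number) expands to; the exact pda.h macro uses typeof-based dispatch, so treat this as illustrative:

	/* Sketch: %gs-relative access to the per-cpu PDA. The segment base
	 * differs per CPU, so the same offset reads a different copy. */
	static inline int sketch_smp_processor_id(void)
	{
		int cpu;
		__asm__("movl %%gs:%c1,%0"
			: "=r" (cpu)
			: "i" (offsetof(struct i386_pda, cpu_number)));
		return cpu;
	}
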
--- a/include/asm-x86/mach-xen/asm/smp_64.h
+++ b/include/asm-x86/mach-xen/asm/smp_64.h
@@ -88,11 +88,6 @@
 extern u8 bios_cpu_apicid[];
 
 #ifdef CONFIG_X86_LOCAL_APIC
-static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
-{
-	return cpus_addr(cpumask)[0];
-}
-
 static inline int cpu_present_to_apicid(int mps_cpu)
 {
 	if (mps_cpu < NR_CPUS)
@@ -127,13 +122,6 @@
 #define cpu_physical_id(cpu)	x86_cpu_to_apicid[cpu]
 #else
 #define cpu_physical_id(cpu)	boot_cpu_id
-static inline int smp_call_function_single(int cpuid, void (*func) (void *info),
-					   void *info, int retry, int wait)
-{
-	/* Disable interrupts here? */
-	func(info);
-	return 0;
-}
 #endif /* !CONFIG_SMP */
 #endif
 
--- a/include/asm-x86/mach-xen/asm/system_32.h
+++ b/include/asm-x86/mach-xen/asm/system_32.h
@@ -139,17 +139,17 @@
 #define write_cr4(x) \
	__asm__ __volatile__("movl %0,%%cr4": :"r" (x))

-/*
- * Clear and set 'TS' bit respectively
- */
+#define wbinvd() \
+	__asm__ __volatile__ ("wbinvd": : :"memory")
+
+/* Clear the 'TS' bit */
 #define clts() (HYPERVISOR_fpu_taskswitch(0))
+
+/* Set the 'TS' bit */
 #define stts() (HYPERVISOR_fpu_taskswitch(1))
 
 #endif	/* __KERNEL__ */
 
-#define wbinvd() \
-	__asm__ __volatile__ ("wbinvd": : :"memory")
-
 static inline unsigned long get_limit(unsigned long segment)
 {
 	unsigned long __limit;
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -353,7 +353,7 @@
 	if (limit == ~0UL)
 		address_bits = BITS_PER_LONG;
 	else
-		address_bits = long_log2(limit);
+		address_bits = ilog2(limit);
 
 	if (xen_limit_pages_to_max_mfn(pages, order, address_bits) < 0) {
 		__free_pages(pages, order);
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1597,10 +1597,10 @@
 		goto out;
 	switch (skb->nh.iph->protocol) {
 	case IPPROTO_TCP:
-		skb->csum = offsetof(struct tcphdr, check);
+		skb->csum_offset = offsetof(struct tcphdr, check);
 		break;
 	case IPPROTO_UDP:
-		skb->csum = offsetof(struct udphdr, check);
+		skb->csum_offset = offsetof(struct udphdr, check);
 		break;
 	default:
 		if (net_ratelimit())
@@ -1609,7 +1609,7 @@
 			       " %d packet", skb->nh.iph->protocol);
 		goto out;
 	}
-	if ((skb->h.raw + skb->csum + 2) > skb->tail)
+	if ((skb->h.raw + skb->csum_offset + 2) > skb->tail)
 		goto out;
 	skb->ip_summed = CHECKSUM_PARTIAL;
 	skb->proto_csum_blank = 0;
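
This last hunk tracks the 2.6.20 split of skb->csum into a union with csum_offset: for CHECKSUM_PARTIAL packets the field now names where, relative to skb->h.raw, the 16-bit checksum must be written, instead of overloading skb->csum itself. A sketch of the consumer side under that convention, using the 2.6.20 field names and the stock skb_checksum()/csum_fold() helpers:

	/* Sketch: software completion of a CHECKSUM_PARTIAL skb, roughly
	 * what skb_checksum_help() does in this kernel. */
	static void sketch_finish_checksum(struct sk_buff *skb)
	{
		int off = skb->h.raw - skb->data;	/* start of transport header */
		__wsum sum = skb_checksum(skb, off, skb->len - off, 0);

		*(__sum16 *)(skb->h.raw + skb->csum_offset) = csum_fold(sum);
	}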