Magellan Linux

Contents of /trunk/kernel26-xen/patches-2.6.25-r1/1021-2.6.25-xen-patch-2.6.20.patch



Revision 609
Fri May 23 17:35:37 2008 UTC by niro
File size: 199019 bytes
-using opensuse xen patchset, updated kernel configs

1 From: www.kernel.org
2 Subject: Linux 2.6.20
3 Patch-mainline: 2.6.20
4
5 Automatically created from "patches.kernel.org/patch-2.6.20" by xen-port-patches.py
6
7 Acked-by: jbeulich@novell.com
8
9 ---
10 arch/x86/Kconfig | 2
11 arch/x86/kernel/asm-offsets_32.c | 6
12 arch/x86/kernel/cpu/common-xen.c | 286 ++++---
13 arch/x86/kernel/cpu/mtrr/main-xen.c | 5
14 arch/x86/kernel/e820_32-xen.c | 1000 ++++++++++++++++++++++++++
15 arch/x86/kernel/entry_32-xen.S | 387 ++++------
16 arch/x86/kernel/entry_64-xen.S | 69 -
17 arch/x86/kernel/genapic_64-xen.c | 8
18 arch/x86/kernel/head64-xen.c | 5
19 arch/x86/kernel/head_32-xen.S | 63 +
20 arch/x86/kernel/io_apic_32-xen.c | 68 -
21 arch/x86/kernel/io_apic_64-xen.c | 133 ++-
22 arch/x86/kernel/irq_64-xen.c | 2
23 arch/x86/kernel/ldt_32-xen.c | 4
24 arch/x86/kernel/microcode-xen.c | 6
25 arch/x86/kernel/mpparse_32-xen.c | 12
26 arch/x86/kernel/mpparse_64-xen.c | 2
27 arch/x86/kernel/pci-dma_32-xen.c | 10
28 arch/x86/kernel/process_32-xen.c | 56 -
29 arch/x86/kernel/process_64-xen.c | 34
30 arch/x86/kernel/quirks-xen.c | 61 +
31 arch/x86/kernel/setup_32-xen.c | 974 -------------------------
32 arch/x86/kernel/setup_64-xen.c | 24
33 arch/x86/kernel/smp_32-xen.c | 4
34 arch/x86/kernel/smp_64-xen.c | 5
35 arch/x86/kernel/time_32-xen.c | 17
36 arch/x86/kernel/traps_32-xen.c | 204 +----
37 arch/x86/kernel/traps_64-xen.c | 139 ---
38 arch/x86/kernel/vmlinux_32.lds.S | 6
39 arch/x86/kernel/vsyscall_64-xen.c | 7
40 arch/x86/kvm/Kconfig | 1
41 arch/x86/mm/fault_32-xen.c | 12
42 arch/x86/mm/fault_64-xen.c | 10
43 arch/x86/mm/highmem_32-xen.c | 26
44 arch/x86/mm/init_32-xen.c | 20
45 arch/x86/mm/init_64-xen.c | 7
46 arch/x86/mm/pageattr_64-xen.c | 58 -
47 arch/x86/mm/pgtable_32-xen.c | 6
48 arch/x86/pci/irq-xen.c | 4
49 drivers/xen/balloon/balloon.c | 6
50 drivers/xen/blkback/blkback.c | 1
51 drivers/xen/blkback/interface.c | 2
52 drivers/xen/blkfront/blkfront.c | 8
53 drivers/xen/blktap/blktap.c | 1
54 drivers/xen/blktap/interface.c | 2
55 drivers/xen/char/mem.c | 4
56 drivers/xen/console/console.c | 13
57 drivers/xen/core/reboot.c | 10
58 drivers/xen/core/smpboot.c | 21
59 drivers/xen/fbfront/xenfb.c | 1
60 drivers/xen/netback/loopback.c | 1
61 drivers/xen/pciback/conf_space_header.c | 4
62 drivers/xen/pciback/pciback.h | 2
63 drivers/xen/pciback/pciback_ops.c | 6
64 drivers/xen/pciback/xenbus.c | 3
65 drivers/xen/sfc_netfront/accel_vi.c | 4
66 drivers/xen/tpmback/interface.c | 2
67 drivers/xen/xenbus/xenbus_comms.c | 4
68 drivers/xen/xenbus/xenbus_probe.c | 2
69 include/asm-x86/mach-xen/asm/desc_32.h | 100 +-
70 include/asm-x86/mach-xen/asm/desc_64.h | 53 -
71 include/asm-x86/mach-xen/asm/dma-mapping_32.h | 4
72 include/asm-x86/mach-xen/asm/dma-mapping_64.h | 8
73 include/asm-x86/mach-xen/asm/fixmap_32.h | 5
74 include/asm-x86/mach-xen/asm/hypervisor.h | 9
75 include/asm-x86/mach-xen/asm/io_32.h | 4
76 include/asm-x86/mach-xen/asm/irqflags_32.h | 68 +
77 include/asm-x86/mach-xen/asm/mmu_context_32.h | 19
78 include/asm-x86/mach-xen/asm/pgtable-2level.h | 21
79 include/asm-x86/mach-xen/asm/pgtable-3level.h | 67 -
80 include/asm-x86/mach-xen/asm/pgtable_32.h | 24
81 include/asm-x86/mach-xen/asm/pgtable_64.h | 23
82 include/asm-x86/mach-xen/asm/processor_32.h | 207 ++---
83 include/asm-x86/mach-xen/asm/processor_64.h | 8
84 include/asm-x86/mach-xen/asm/segment_32.h | 5
85 include/asm-x86/mach-xen/asm/smp_32.h | 3
86 include/asm-x86/mach-xen/asm/smp_64.h | 12
87 include/asm-x86/mach-xen/asm/system_32.h | 12
88 kernel/kexec.c | 2
89 net/core/dev.c | 6
90 80 files changed, 2263 insertions(+), 2237 deletions(-)
91
92 --- a/arch/x86/Kconfig
93 +++ b/arch/x86/Kconfig
94 @@ -1220,7 +1220,7 @@
95
96 config RELOCATABLE
97 bool "Build a relocatable kernel (EXPERIMENTAL)"
98 - depends on EXPERIMENTAL
99 + depends on EXPERIMENTAL && !X86_XEN
100 help
101 This builds a kernel image that retains relocation information
102 so it can be loaded someplace besides the default 1MB.
103 --- a/arch/x86/kernel/asm-offsets_32.c
104 +++ b/arch/x86/kernel/asm-offsets_32.c
105 @@ -61,6 +61,7 @@
106 OFFSET(TI_exec_domain, thread_info, exec_domain);
107 OFFSET(TI_flags, thread_info, flags);
108 OFFSET(TI_status, thread_info, status);
109 + OFFSET(TI_cpu, thread_info, cpu);
110 OFFSET(TI_preempt_count, thread_info, preempt_count);
111 OFFSET(TI_addr_limit, thread_info, addr_limit);
112 OFFSET(TI_restart_block, thread_info, restart_block);
113 @@ -115,6 +116,11 @@
114
115 OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
116
117 +#ifdef CONFIG_XEN
118 + BLANK();
119 + OFFSET(XEN_START_mfn_list, start_info, mfn_list);
120 +#endif
121 +
122 #ifdef CONFIG_PARAVIRT
123 BLANK();
124 OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
125 --- a/arch/x86/kernel/cpu/common-xen.c
126 +++ b/arch/x86/kernel/cpu/common-xen.c
127 @@ -22,6 +22,7 @@
128 #define phys_pkg_id(a,b) a
129 #endif
130 #endif
131 +#include <asm/pda.h>
132 #include <asm/hypervisor.h>
133
134 #include "cpu.h"
135 @@ -29,10 +30,8 @@
136 DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
137 EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr);
138
139 -#ifndef CONFIG_XEN
140 -DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
141 -EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack);
142 -#endif
143 +struct i386_pda *_cpu_pda[NR_CPUS] __read_mostly;
144 +EXPORT_SYMBOL(_cpu_pda);
145
146 static int cachesize_override __cpuinitdata = -1;
147 static int disable_x86_fxsr __cpuinitdata;
148 @@ -60,7 +59,7 @@
149 .c_init = default_init,
150 .c_vendor = "Unknown",
151 };
152 -static struct cpu_dev * this_cpu = &default_cpu;
153 +static struct cpu_dev * this_cpu __cpuinitdata = &default_cpu;
154
155 static int __init cachesize_setup(char *str)
156 {
157 @@ -242,29 +241,14 @@
158 return flag_is_changeable_p(X86_EFLAGS_ID);
159 }
160
161 -/* Do minimum CPU detection early.
162 - Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment.
163 - The others are not touched to avoid unwanted side effects.
164 -
165 - WARNING: this function is only called on the BP. Don't add code here
166 - that is supposed to run on all CPUs. */
167 -static void __init early_cpu_detect(void)
168 +void __init cpu_detect(struct cpuinfo_x86 *c)
169 {
170 - struct cpuinfo_x86 *c = &boot_cpu_data;
171 -
172 - c->x86_cache_alignment = 32;
173 -
174 - if (!have_cpuid_p())
175 - return;
176 -
177 /* Get vendor name */
178 cpuid(0x00000000, &c->cpuid_level,
179 (int *)&c->x86_vendor_id[0],
180 (int *)&c->x86_vendor_id[8],
181 (int *)&c->x86_vendor_id[4]);
182
183 - get_cpu_vendor(c, 1);
184 -
185 c->x86 = 4;
186 if (c->cpuid_level >= 0x00000001) {
187 u32 junk, tfms, cap0, misc;
188 @@ -281,6 +265,26 @@
189 }
190 }
191
192 +/* Do minimum CPU detection early.
193 + Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment.
194 + The others are not touched to avoid unwanted side effects.
195 +
196 + WARNING: this function is only called on the BP. Don't add code here
197 + that is supposed to run on all CPUs. */
198 +static void __init early_cpu_detect(void)
199 +{
200 + struct cpuinfo_x86 *c = &boot_cpu_data;
201 +
202 + c->x86_cache_alignment = 32;
203 +
204 + if (!have_cpuid_p())
205 + return;
206 +
207 + cpu_detect(c);
208 +
209 + get_cpu_vendor(c, 1);
210 +}
211 +
212 static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
213 {
214 u32 tfms, xlvl;
215 @@ -315,6 +319,8 @@
216 #else
217 c->apicid = (ebx >> 24) & 0xFF;
218 #endif
219 + if (c->x86_capability[0] & (1<<19))
220 + c->x86_clflush_size = ((ebx >> 8) & 0xff) * 8;
221 } else {
222 /* Have CPUID level 0 only - unheard of */
223 c->x86 = 4;
224 @@ -379,6 +385,7 @@
225 c->x86_vendor_id[0] = '\0'; /* Unset */
226 c->x86_model_id[0] = '\0'; /* Unset */
227 c->x86_max_cores = 1;
228 + c->x86_clflush_size = 32;
229 memset(&c->x86_capability, 0, sizeof c->x86_capability);
230
231 if (!have_cpuid_p()) {
232 @@ -599,61 +606,23 @@
233 #endif
234 }
235
236 -static void __cpuinit cpu_gdt_init(const struct Xgt_desc_struct *gdt_descr)
237 +/* Make sure %gs is initialized properly in idle threads */
238 +struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
239 {
240 - unsigned long frames[16];
241 - unsigned long va;
242 - int f;
243 -
244 - for (va = gdt_descr->address, f = 0;
245 - va < gdt_descr->address + gdt_descr->size;
246 - va += PAGE_SIZE, f++) {
247 - frames[f] = virt_to_mfn(va);
248 - make_lowmem_page_readonly(
249 - (void *)va, XENFEAT_writable_descriptor_tables);
250 - }
251 - if (HYPERVISOR_set_gdt(frames, (gdt_descr->size + 1) / 8))
252 - BUG();
253 + memset(regs, 0, sizeof(struct pt_regs));
254 + regs->xgs = __KERNEL_PDA;
255 + return regs;
256 }
257
258 -/*
259 - * cpu_init() initializes state that is per-CPU. Some data is already
260 - * initialized (naturally) in the bootstrap process, such as the GDT
261 - * and IDT. We reload them nevertheless, this function acts as a
262 - * 'CPU state barrier', nothing should get across.
263 - */
264 -void __cpuinit cpu_init(void)
265 +static __cpuinit int alloc_gdt(int cpu)
266 {
267 - int cpu = smp_processor_id();
268 -#ifndef CONFIG_X86_NO_TSS
269 - struct tss_struct * t = &per_cpu(init_tss, cpu);
270 -#endif
271 - struct thread_struct *thread = &current->thread;
272 - struct desc_struct *gdt;
273 struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
274 + struct desc_struct *gdt;
275 + struct i386_pda *pda;
276
277 - if (cpu_test_and_set(cpu, cpu_initialized)) {
278 - printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
279 - for (;;) local_irq_enable();
280 - }
281 - printk(KERN_INFO "Initializing CPU#%d\n", cpu);
282 -
283 - if (cpu_has_vme || cpu_has_de)
284 - clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
285 - if (tsc_disable && cpu_has_tsc) {
286 - printk(KERN_NOTICE "Disabling TSC...\n");
287 - /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
288 - clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
289 - set_in_cr4(X86_CR4_TSD);
290 - }
291 + gdt = (struct desc_struct *)cpu_gdt_descr->address;
292 + pda = cpu_pda(cpu);
293
294 -#ifndef CONFIG_XEN
295 - /* The CPU hotplug case */
296 - if (cpu_gdt_descr->address) {
297 - gdt = (struct desc_struct *)cpu_gdt_descr->address;
298 - memset(gdt, 0, PAGE_SIZE);
299 - goto old_gdt;
300 - }
301 /*
302 * This is a horrible hack to allocate the GDT. The problem
303 * is that cpu_init() is called really early for the boot CPU
304 @@ -661,54 +630,141 @@
305 * CPUs, when bootmem will have gone away
306 */
307 if (NODE_DATA(0)->bdata->node_bootmem_map) {
308 - gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE);
309 - /* alloc_bootmem_pages panics on failure, so no check */
310 + BUG_ON(gdt != NULL || pda != NULL);
311 +
312 + gdt = alloc_bootmem_pages(PAGE_SIZE);
313 + pda = alloc_bootmem(sizeof(*pda));
314 + /* alloc_bootmem(_pages) panics on failure, so no check */
315 +
316 memset(gdt, 0, PAGE_SIZE);
317 + memset(pda, 0, sizeof(*pda));
318 } else {
319 - gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL);
320 - if (unlikely(!gdt)) {
321 - printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
322 - for (;;)
323 - local_irq_enable();
324 + /* GDT and PDA might already have been allocated if
325 + this is a CPU hotplug re-insertion. */
326 + if (gdt == NULL)
327 + gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL);
328 +
329 + if (pda == NULL)
330 + pda = kmalloc_node(sizeof(*pda), GFP_KERNEL, cpu_to_node(cpu));
331 +
332 + if (unlikely(!gdt || !pda)) {
333 + free_pages((unsigned long)gdt, 0);
334 + kfree(pda);
335 + return 0;
336 }
337 }
338 -old_gdt:
339 +
340 + cpu_gdt_descr->address = (unsigned long)gdt;
341 + cpu_pda(cpu) = pda;
342 +
343 + return 1;
344 +}
345 +
346 +/* Initial PDA used by boot CPU */
347 +struct i386_pda boot_pda = {
348 + ._pda = &boot_pda,
349 + .cpu_number = 0,
350 + .pcurrent = &init_task,
351 +};
352 +
353 +static inline void set_kernel_gs(void)
354 +{
355 + /* Set %gs for this CPU's PDA. Memory clobber is to create a
356 + barrier with respect to any PDA operations, so the compiler
357 + doesn't move any before here. */
358 + asm volatile ("mov %0, %%gs" : : "r" (__KERNEL_PDA) : "memory");
359 +}
360 +
361 +/* Initialize the CPU's GDT and PDA. The boot CPU does this for
362 + itself, but secondaries find this done for them. */
363 +__cpuinit int init_gdt(int cpu, struct task_struct *idle)
364 +{
365 + struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
366 + struct desc_struct *gdt;
367 + struct i386_pda *pda;
368 +
369 + /* For non-boot CPUs, the GDT and PDA should already have been
370 + allocated. */
371 + if (!alloc_gdt(cpu)) {
372 + printk(KERN_CRIT "CPU%d failed to allocate GDT or PDA\n", cpu);
373 + return 0;
374 + }
375 +
376 + gdt = (struct desc_struct *)cpu_gdt_descr->address;
377 + pda = cpu_pda(cpu);
378 +
379 + BUG_ON(gdt == NULL || pda == NULL);
380 +
381 /*
382 * Initialize the per-CPU GDT with the boot GDT,
383 * and set up the GDT descriptor:
384 */
385 memcpy(gdt, cpu_gdt_table, GDT_SIZE);
386 + cpu_gdt_descr->size = GDT_SIZE - 1;
387
388 - /* Set up GDT entry for 16bit stack */
389 - *(__u64 *)(&gdt[GDT_ENTRY_ESPFIX_SS]) |=
390 - ((((__u64)stk16_off) << 16) & 0x000000ffffff0000ULL) |
391 - ((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) |
392 - (CPU_16BIT_STACK_SIZE - 1);
393 + pack_descriptor((u32 *)&gdt[GDT_ENTRY_PDA].a,
394 + (u32 *)&gdt[GDT_ENTRY_PDA].b,
395 + (unsigned long)pda, sizeof(*pda) - 1,
396 + 0x80 | DESCTYPE_S | 0x2, 0); /* present read-write data segment */
397 +
398 + memset(pda, 0, sizeof(*pda));
399 + pda->_pda = pda;
400 + pda->cpu_number = cpu;
401 + pda->pcurrent = idle;
402
403 - cpu_gdt_descr->size = GDT_SIZE - 1;
404 - cpu_gdt_descr->address = (unsigned long)gdt;
405 -#else
406 - if (cpu == 0 && cpu_gdt_descr->address == 0) {
407 - gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE);
408 - /* alloc_bootmem_pages panics on failure, so no check */
409 - memset(gdt, 0, PAGE_SIZE);
410 + return 1;
411 +}
412
413 - memcpy(gdt, cpu_gdt_table, GDT_SIZE);
414 -
415 - cpu_gdt_descr->size = GDT_SIZE;
416 - cpu_gdt_descr->address = (unsigned long)gdt;
417 +void __cpuinit cpu_set_gdt(int cpu)
418 +{
419 + struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
420 + unsigned long va, frames[16];
421 + int f;
422 +
423 + for (va = cpu_gdt_descr->address, f = 0;
424 + va < cpu_gdt_descr->address + cpu_gdt_descr->size;
425 + va += PAGE_SIZE, f++) {
426 + frames[f] = virt_to_mfn(va);
427 + make_lowmem_page_readonly(
428 + (void *)va, XENFEAT_writable_descriptor_tables);
429 }
430 + BUG_ON(HYPERVISOR_set_gdt(frames, (cpu_gdt_descr->size + 1) / 8));
431 +
432 + set_kernel_gs();
433 +}
434 +
435 +/* Common CPU init for both boot and secondary CPUs */
436 +static void __cpuinit _cpu_init(int cpu, struct task_struct *curr)
437 +{
438 +#ifndef CONFIG_X86_NO_TSS
439 + struct tss_struct * t = &per_cpu(init_tss, cpu);
440 #endif
441 + struct thread_struct *thread = &curr->thread;
442 +
443 + if (cpu_test_and_set(cpu, cpu_initialized)) {
444 + printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
445 + for (;;) local_irq_enable();
446 + }
447
448 - cpu_gdt_init(cpu_gdt_descr);
449 + printk(KERN_INFO "Initializing CPU#%d\n", cpu);
450 +
451 + if (cpu_has_vme || cpu_has_de)
452 + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
453 + if (tsc_disable && cpu_has_tsc) {
454 + printk(KERN_NOTICE "Disabling TSC...\n");
455 + /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
456 + clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
457 + set_in_cr4(X86_CR4_TSD);
458 + }
459
460 /*
461 * Set up and load the per-CPU TSS and LDT
462 */
463 atomic_inc(&init_mm.mm_count);
464 - current->active_mm = &init_mm;
465 - BUG_ON(current->mm);
466 - enter_lazy_tlb(&init_mm, current);
467 + curr->active_mm = &init_mm;
468 + if (curr->mm)
469 + BUG();
470 + enter_lazy_tlb(&init_mm, curr);
471
472 load_esp0(t, thread);
473
474 @@ -719,8 +775,8 @@
475 __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
476 #endif
477
478 - /* Clear %fs and %gs. */
479 - asm volatile ("movl %0, %%fs; movl %0, %%gs" : : "r" (0));
480 + /* Clear %fs. */
481 + asm volatile ("mov %0, %%fs" : : "r" (0));
482
483 /* Clear all 6 debug registers: */
484 set_debugreg(0, 0);
485 @@ -738,6 +794,38 @@
486 mxcsr_feature_mask_init();
487 }
488
489 +/* Entrypoint to initialize secondary CPU */
490 +void __cpuinit secondary_cpu_init(void)
491 +{
492 + int cpu = smp_processor_id();
493 + struct task_struct *curr = current;
494 +
495 + _cpu_init(cpu, curr);
496 +}
497 +
498 +/*
499 + * cpu_init() initializes state that is per-CPU. Some data is already
500 + * initialized (naturally) in the bootstrap process, such as the GDT
501 + * and IDT. We reload them nevertheless, this function acts as a
502 + * 'CPU state barrier', nothing should get across.
503 + */
504 +void __cpuinit cpu_init(void)
505 +{
506 + int cpu = smp_processor_id();
507 + struct task_struct *curr = current;
508 +
509 + /* Set up the real GDT and PDA, so we can transition from the
510 + boot versions. */
511 + if (!init_gdt(cpu, curr)) {
512 + /* failed to allocate something; not much we can do... */
513 + for (;;)
514 + local_irq_enable();
515 + }
516 +
517 + cpu_set_gdt(cpu);
518 + _cpu_init(cpu, curr);
519 +}
520 +
521 #ifdef CONFIG_HOTPLUG_CPU
522 void __cpuinit cpu_uninit(void)
523 {
524 --- a/arch/x86/kernel/cpu/mtrr/main-xen.c
525 +++ b/arch/x86/kernel/cpu/mtrr/main-xen.c
526 @@ -12,7 +12,7 @@
527 static DEFINE_MUTEX(mtrr_mutex);
528
529 void generic_get_mtrr(unsigned int reg, unsigned long *base,
530 - unsigned int *size, mtrr_type * type)
531 + unsigned long *size, mtrr_type * type)
532 {
533 struct xen_platform_op op;
534
535 @@ -115,8 +115,7 @@
536 {
537 unsigned i;
538 mtrr_type ltype;
539 - unsigned long lbase;
540 - unsigned int lsize;
541 + unsigned long lbase, lsize;
542 int error = -EINVAL;
543 struct xen_platform_op op;
544
545 --- /dev/null
546 +++ b/arch/x86/kernel/e820_32-xen.c
547 @@ -0,0 +1,1000 @@
548 +#include <linux/kernel.h>
549 +#include <linux/types.h>
550 +#include <linux/init.h>
551 +#include <linux/bootmem.h>
552 +#include <linux/ioport.h>
553 +#include <linux/string.h>
554 +#include <linux/kexec.h>
555 +#include <linux/module.h>
556 +#include <linux/mm.h>
557 +#include <linux/efi.h>
558 +#include <linux/pfn.h>
559 +#include <linux/uaccess.h>
560 +
561 +#include <asm/pgtable.h>
562 +#include <asm/page.h>
563 +#include <asm/e820.h>
564 +#include <xen/interface/memory.h>
565 +
566 +#ifdef CONFIG_EFI
567 +int efi_enabled = 0;
568 +EXPORT_SYMBOL(efi_enabled);
569 +#endif
570 +
571 +struct e820map e820;
572 +struct change_member {
573 + struct e820entry *pbios; /* pointer to original bios entry */
574 + unsigned long long addr; /* address for this change point */
575 +};
576 +static struct change_member change_point_list[2*E820MAX] __initdata;
577 +static struct change_member *change_point[2*E820MAX] __initdata;
578 +static struct e820entry *overlap_list[E820MAX] __initdata;
579 +static struct e820entry new_bios[E820MAX] __initdata;
580 +/* For PCI or other memory-mapped resources */
581 +unsigned long pci_mem_start = 0x10000000;
582 +#ifdef CONFIG_PCI
583 +EXPORT_SYMBOL(pci_mem_start);
584 +#endif
585 +extern int user_defined_memmap;
586 +struct resource data_resource = {
587 + .name = "Kernel data",
588 + .start = 0,
589 + .end = 0,
590 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
591 +};
592 +
593 +struct resource code_resource = {
594 + .name = "Kernel code",
595 + .start = 0,
596 + .end = 0,
597 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
598 +};
599 +
600 +static struct resource system_rom_resource = {
601 + .name = "System ROM",
602 + .start = 0xf0000,
603 + .end = 0xfffff,
604 + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
605 +};
606 +
607 +static struct resource extension_rom_resource = {
608 + .name = "Extension ROM",
609 + .start = 0xe0000,
610 + .end = 0xeffff,
611 + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
612 +};
613 +
614 +static struct resource adapter_rom_resources[] = { {
615 + .name = "Adapter ROM",
616 + .start = 0xc8000,
617 + .end = 0,
618 + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
619 +}, {
620 + .name = "Adapter ROM",
621 + .start = 0,
622 + .end = 0,
623 + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
624 +}, {
625 + .name = "Adapter ROM",
626 + .start = 0,
627 + .end = 0,
628 + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
629 +}, {
630 + .name = "Adapter ROM",
631 + .start = 0,
632 + .end = 0,
633 + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
634 +}, {
635 + .name = "Adapter ROM",
636 + .start = 0,
637 + .end = 0,
638 + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
639 +}, {
640 + .name = "Adapter ROM",
641 + .start = 0,
642 + .end = 0,
643 + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
644 +} };
645 +
646 +static struct resource video_rom_resource = {
647 + .name = "Video ROM",
648 + .start = 0xc0000,
649 + .end = 0xc7fff,
650 + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
651 +};
652 +
653 +static struct resource video_ram_resource = {
654 + .name = "Video RAM area",
655 + .start = 0xa0000,
656 + .end = 0xbffff,
657 + .flags = IORESOURCE_BUSY | IORESOURCE_MEM
658 +};
659 +
660 +static struct resource standard_io_resources[] = { {
661 + .name = "dma1",
662 + .start = 0x0000,
663 + .end = 0x001f,
664 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
665 +}, {
666 + .name = "pic1",
667 + .start = 0x0020,
668 + .end = 0x0021,
669 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
670 +}, {
671 + .name = "timer0",
672 + .start = 0x0040,
673 + .end = 0x0043,
674 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
675 +}, {
676 + .name = "timer1",
677 + .start = 0x0050,
678 + .end = 0x0053,
679 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
680 +}, {
681 + .name = "keyboard",
682 + .start = 0x0060,
683 + .end = 0x006f,
684 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
685 +}, {
686 + .name = "dma page reg",
687 + .start = 0x0080,
688 + .end = 0x008f,
689 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
690 +}, {
691 + .name = "pic2",
692 + .start = 0x00a0,
693 + .end = 0x00a1,
694 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
695 +}, {
696 + .name = "dma2",
697 + .start = 0x00c0,
698 + .end = 0x00df,
699 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
700 +}, {
701 + .name = "fpu",
702 + .start = 0x00f0,
703 + .end = 0x00ff,
704 + .flags = IORESOURCE_BUSY | IORESOURCE_IO
705 +} };
706 +
707 +static int romsignature(const unsigned char *x)
708 +{
709 + unsigned short sig;
710 + int ret = 0;
711 + if (probe_kernel_address((const unsigned short *)x, sig) == 0)
712 + ret = (sig == 0xaa55);
713 + return ret;
714 +}
715 +
716 +static int __init romchecksum(unsigned char *rom, unsigned long length)
717 +{
718 + unsigned char *p, sum = 0;
719 +
720 + for (p = rom; p < rom + length; p++)
721 + sum += *p;
722 + return sum == 0;
723 +}
724 +
725 +static void __init probe_roms(void)
726 +{
727 + unsigned long start, length, upper;
728 + unsigned char *rom;
729 + int i;
730 +
731 +#ifdef CONFIG_XEN
732 + /* Nothing to do if not running in dom0. */
733 + if (!is_initial_xendomain())
734 + return;
735 +#endif
736 +
737 + /* video rom */
738 + upper = adapter_rom_resources[0].start;
739 + for (start = video_rom_resource.start; start < upper; start += 2048) {
740 + rom = isa_bus_to_virt(start);
741 + if (!romsignature(rom))
742 + continue;
743 +
744 + video_rom_resource.start = start;
745 +
746 + /* 0 < length <= 0x7f * 512, historically */
747 + length = rom[2] * 512;
748 +
749 + /* if checksum okay, trust length byte */
750 + if (length && romchecksum(rom, length))
751 + video_rom_resource.end = start + length - 1;
752 +
753 + request_resource(&iomem_resource, &video_rom_resource);
754 + break;
755 + }
756 +
757 + start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
758 + if (start < upper)
759 + start = upper;
760 +
761 + /* system rom */
762 + request_resource(&iomem_resource, &system_rom_resource);
763 + upper = system_rom_resource.start;
764 +
765 + /* check for extension rom (ignore length byte!) */
766 + rom = isa_bus_to_virt((unsigned long)extension_rom_resource.start);
767 + if (romsignature(rom)) {
768 + length = extension_rom_resource.end - extension_rom_resource.start + 1;
769 + if (romchecksum(rom, length)) {
770 + request_resource(&iomem_resource, &extension_rom_resource);
771 + upper = extension_rom_resource.start;
772 + }
773 + }
774 +
775 + /* check for adapter roms on 2k boundaries */
776 + for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
777 + rom = isa_bus_to_virt(start);
778 + if (!romsignature(rom))
779 + continue;
780 +
781 + /* 0 < length <= 0x7f * 512, historically */
782 + length = rom[2] * 512;
783 +
784 + /* but accept any length that fits if checksum okay */
785 + if (!length || start + length > upper || !romchecksum(rom, length))
786 + continue;
787 +
788 + adapter_rom_resources[i].start = start;
789 + adapter_rom_resources[i].end = start + length - 1;
790 + request_resource(&iomem_resource, &adapter_rom_resources[i]);
791 +
792 + start = adapter_rom_resources[i++].end & ~2047UL;
793 + }
794 +}
795 +
796 +#ifdef CONFIG_XEN
797 +static struct e820map machine_e820 __initdata;
798 +#define e820 machine_e820
799 +#endif
800 +
801 +/*
802 + * Request address space for all standard RAM and ROM resources
803 + * and also for regions reported as reserved by the e820.
804 + */
805 +static void __init
806 +legacy_init_iomem_resources(struct resource *code_resource, struct resource *data_resource)
807 +{
808 + int i;
809 +
810 + probe_roms();
811 + for (i = 0; i < e820.nr_map; i++) {
812 + struct resource *res;
813 +#ifndef CONFIG_RESOURCES_64BIT
814 + if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
815 + continue;
816 +#endif
817 + res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
818 + switch (e820.map[i].type) {
819 + case E820_RAM: res->name = "System RAM"; break;
820 + case E820_ACPI: res->name = "ACPI Tables"; break;
821 + case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
822 + default: res->name = "reserved";
823 + }
824 + res->start = e820.map[i].addr;
825 + res->end = res->start + e820.map[i].size - 1;
826 + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
827 + if (request_resource(&iomem_resource, res)) {
828 + kfree(res);
829 + continue;
830 + }
831 + if (e820.map[i].type == E820_RAM) {
832 + /*
833 + * We don't know which RAM region contains kernel data,
834 + * so we try it repeatedly and let the resource manager
835 + * test it.
836 + */
837 +#ifndef CONFIG_XEN
838 + request_resource(res, code_resource);
839 + request_resource(res, data_resource);
840 +#endif
841 +#ifdef CONFIG_KEXEC
842 + request_resource(res, &crashk_res);
843 +#ifdef CONFIG_XEN
844 + xen_machine_kexec_register_resources(res);
845 +#endif
846 +#endif
847 + }
848 + }
849 +}
850 +
851 +#undef e820
852 +
853 +/*
854 + * Request address space for all standard resources
855 + *
856 + * This is called just before pcibios_init(), which is also a
857 + * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
858 + */
859 +static int __init request_standard_resources(void)
860 +{
861 + int i;
862 +
863 + /* Nothing to do if not running in dom0. */
864 + if (!is_initial_xendomain())
865 + return 0;
866 +
867 + printk("Setting up standard PCI resources\n");
868 + if (efi_enabled)
869 + efi_initialize_iomem_resources(&code_resource, &data_resource);
870 + else
871 + legacy_init_iomem_resources(&code_resource, &data_resource);
872 +
873 + /* EFI systems may still have VGA */
874 + request_resource(&iomem_resource, &video_ram_resource);
875 +
876 + /* request I/O space for devices used on all i[345]86 PCs */
877 + for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
878 + request_resource(&ioport_resource, &standard_io_resources[i]);
879 + return 0;
880 +}
881 +
882 +subsys_initcall(request_standard_resources);
883 +
884 +void __init add_memory_region(unsigned long long start,
885 + unsigned long long size, int type)
886 +{
887 + int x;
888 +
889 + if (!efi_enabled) {
890 + x = e820.nr_map;
891 +
892 + if (x == E820MAX) {
893 + printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
894 + return;
895 + }
896 +
897 + e820.map[x].addr = start;
898 + e820.map[x].size = size;
899 + e820.map[x].type = type;
900 + e820.nr_map++;
901 + }
902 +} /* add_memory_region */
903 +
904 +/*
905 + * Sanitize the BIOS e820 map.
906 + *
907 + * Some e820 responses include overlapping entries. The following
908 + * replaces the original e820 map with a new one, removing overlaps.
909 + *
910 + */
911 +int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
912 +{
913 + struct change_member *change_tmp;
914 + unsigned long current_type, last_type;
915 + unsigned long long last_addr;
916 + int chgidx, still_changing;
917 + int overlap_entries;
918 + int new_bios_entry;
919 + int old_nr, new_nr, chg_nr;
920 + int i;
921 +
922 + /*
923 + Visually we're performing the following (1,2,3,4 = memory types)...
924 +
925 + Sample memory map (w/overlaps):
926 + ____22__________________
927 + ______________________4_
928 + ____1111________________
929 + _44_____________________
930 + 11111111________________
931 + ____________________33__
932 + ___________44___________
933 + __________33333_________
934 + ______________22________
935 + ___________________2222_
936 + _________111111111______
937 + _____________________11_
938 + _________________4______
939 +
940 + Sanitized equivalent (no overlap):
941 + 1_______________________
942 + _44_____________________
943 + ___1____________________
944 + ____22__________________
945 + ______11________________
946 + _________1______________
947 + __________3_____________
948 + ___________44___________
949 + _____________33_________
950 + _______________2________
951 + ________________1_______
952 + _________________4______
953 + ___________________2____
954 + ____________________33__
955 + ______________________4_
956 + */
957 + printk("sanitize start\n");
958 + /* if there's only one memory region, don't bother */
959 + if (*pnr_map < 2) {
960 + printk("sanitize bail 0\n");
961 + return -1;
962 + }
963 +
964 + old_nr = *pnr_map;
965 +
966 + /* bail out if we find any unreasonable addresses in bios map */
967 + for (i=0; i<old_nr; i++)
968 + if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) {
969 + printk("sanitize bail 1\n");
970 + return -1;
971 + }
972 +
973 + /* create pointers for initial change-point information (for sorting) */
974 + for (i=0; i < 2*old_nr; i++)
975 + change_point[i] = &change_point_list[i];
976 +
977 + /* record all known change-points (starting and ending addresses),
978 + omitting those that are for empty memory regions */
979 + chgidx = 0;
980 + for (i=0; i < old_nr; i++) {
981 + if (biosmap[i].size != 0) {
982 + change_point[chgidx]->addr = biosmap[i].addr;
983 + change_point[chgidx++]->pbios = &biosmap[i];
984 + change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
985 + change_point[chgidx++]->pbios = &biosmap[i];
986 + }
987 + }
988 + chg_nr = chgidx; /* true number of change-points */
989 +
990 + /* sort change-point list by memory addresses (low -> high) */
991 + still_changing = 1;
992 + while (still_changing) {
993 + still_changing = 0;
994 + for (i=1; i < chg_nr; i++) {
995 + /* if <current_addr> > <last_addr>, swap */
996 + /* or, if current=<start_addr> & last=<end_addr>, swap */
997 + if ((change_point[i]->addr < change_point[i-1]->addr) ||
998 + ((change_point[i]->addr == change_point[i-1]->addr) &&
999 + (change_point[i]->addr == change_point[i]->pbios->addr) &&
1000 + (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
1001 + )
1002 + {
1003 + change_tmp = change_point[i];
1004 + change_point[i] = change_point[i-1];
1005 + change_point[i-1] = change_tmp;
1006 + still_changing=1;
1007 + }
1008 + }
1009 + }
1010 +
1011 + /* create a new bios memory map, removing overlaps */
1012 + overlap_entries=0; /* number of entries in the overlap table */
1013 + new_bios_entry=0; /* index for creating new bios map entries */
1014 + last_type = 0; /* start with undefined memory type */
1015 + last_addr = 0; /* start with 0 as last starting address */
1016 + /* loop through change-points, determining affect on the new bios map */
1017 + for (chgidx=0; chgidx < chg_nr; chgidx++)
1018 + {
1019 + /* keep track of all overlapping bios entries */
1020 + if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
1021 + {
1022 + /* add map entry to overlap list (> 1 entry implies an overlap) */
1023 + overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
1024 + }
1025 + else
1026 + {
1027 + /* remove entry from list (order independent, so swap with last) */
1028 + for (i=0; i<overlap_entries; i++)
1029 + {
1030 + if (overlap_list[i] == change_point[chgidx]->pbios)
1031 + overlap_list[i] = overlap_list[overlap_entries-1];
1032 + }
1033 + overlap_entries--;
1034 + }
1035 + /* if there are overlapping entries, decide which "type" to use */
1036 + /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
1037 + current_type = 0;
1038 + for (i=0; i<overlap_entries; i++)
1039 + if (overlap_list[i]->type > current_type)
1040 + current_type = overlap_list[i]->type;
1041 + /* continue building up new bios map based on this information */
1042 + if (current_type != last_type) {
1043 + if (last_type != 0) {
1044 + new_bios[new_bios_entry].size =
1045 + change_point[chgidx]->addr - last_addr;
1046 + /* move forward only if the new size was non-zero */
1047 + if (new_bios[new_bios_entry].size != 0)
1048 + if (++new_bios_entry >= E820MAX)
1049 + break; /* no more space left for new bios entries */
1050 + }
1051 + if (current_type != 0) {
1052 + new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
1053 + new_bios[new_bios_entry].type = current_type;
1054 + last_addr=change_point[chgidx]->addr;
1055 + }
1056 + last_type = current_type;
1057 + }
1058 + }
1059 + new_nr = new_bios_entry; /* retain count for new bios entries */
1060 +
1061 + /* copy new bios mapping into original location */
1062 + memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
1063 + *pnr_map = new_nr;
1064 +
1065 + printk("sanitize end\n");
1066 + return 0;
1067 +}
1068 +
1069 +/*
1070 + * Copy the BIOS e820 map into a safe place.
1071 + *
1072 + * Sanity-check it while we're at it..
1073 + *
1074 + * If we're lucky and live on a modern system, the setup code
1075 + * will have given us a memory map that we can use to properly
1076 + * set up memory. If we aren't, we'll fake a memory map.
1077 + *
1078 + * We check to see that the memory map contains at least 2 elements
1079 + * before we'll use it, because the detection code in setup.S may
1080 + * not be perfect and most every PC known to man has two memory
1081 + * regions: one from 0 to 640k, and one from 1mb up. (The IBM
1082 + * thinkpad 560x, for example, does not cooperate with the memory
1083 + * detection code.)
1084 + */
1085 +int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
1086 +{
1087 +#ifndef CONFIG_XEN
1088 + /* Only one memory region (or negative)? Ignore it */
1089 + if (nr_map < 2)
1090 + return -1;
1091 +#else
1092 + BUG_ON(nr_map < 1);
1093 +#endif
1094 +
1095 + do {
1096 + unsigned long long start = biosmap->addr;
1097 + unsigned long long size = biosmap->size;
1098 + unsigned long long end = start + size;
1099 + unsigned long type = biosmap->type;
1100 + printk("copy_e820_map() start: %016Lx size: %016Lx end: %016Lx type: %ld\n", start, size, end, type);
1101 +
1102 + /* Overflow in 64 bits? Ignore the memory map. */
1103 + if (start > end)
1104 + return -1;
1105 +
1106 +#ifndef CONFIG_XEN
1107 + /*
1108 + * Some BIOSes claim RAM in the 640k - 1M region.
1109 + * Not right. Fix it up.
1110 + */
1111 + if (type == E820_RAM) {
1112 + printk("copy_e820_map() type is E820_RAM\n");
1113 + if (start < 0x100000ULL && end > 0xA0000ULL) {
1114 + printk("copy_e820_map() lies in range...\n");
1115 + if (start < 0xA0000ULL) {
1116 + printk("copy_e820_map() start < 0xA0000ULL\n");
1117 + add_memory_region(start, 0xA0000ULL-start, type);
1118 + }
1119 + if (end <= 0x100000ULL) {
1120 + printk("copy_e820_map() end <= 0x100000ULL\n");
1121 + continue;
1122 + }
1123 + start = 0x100000ULL;
1124 + size = end - start;
1125 + }
1126 + }
1127 +#endif
1128 + add_memory_region(start, size, type);
1129 + } while (biosmap++,--nr_map);
1130 + return 0;
1131 +}
1132 +
1133 +/*
1134 + * Callback for efi_memory_walk.
1135 + */
1136 +static int __init
1137 +efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
1138 +{
1139 + unsigned long *max_pfn = arg, pfn;
1140 +
1141 + if (start < end) {
1142 + pfn = PFN_UP(end -1);
1143 + if (pfn > *max_pfn)
1144 + *max_pfn = pfn;
1145 + }
1146 + return 0;
1147 +}
1148 +
1149 +static int __init
1150 +efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
1151 +{
1152 + memory_present(0, PFN_UP(start), PFN_DOWN(end));
1153 + return 0;
1154 +}
1155 +
1156 +/*
1157 + * Find the highest page frame number we have available
1158 + */
1159 +void __init find_max_pfn(void)
1160 +{
1161 + int i;
1162 +
1163 + max_pfn = 0;
1164 + if (efi_enabled) {
1165 + efi_memmap_walk(efi_find_max_pfn, &max_pfn);
1166 + efi_memmap_walk(efi_memory_present_wrapper, NULL);
1167 + return;
1168 + }
1169 +
1170 + for (i = 0; i < e820.nr_map; i++) {
1171 + unsigned long start, end;
1172 + /* RAM? */
1173 + if (e820.map[i].type != E820_RAM)
1174 + continue;
1175 + start = PFN_UP(e820.map[i].addr);
1176 + end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
1177 + if (start >= end)
1178 + continue;
1179 + if (end > max_pfn)
1180 + max_pfn = end;
1181 + memory_present(0, start, end);
1182 + }
1183 +}
1184 +
1185 +/*
1186 + * Free all available memory for boot time allocation. Used
1187 + * as a callback function by efi_memory_walk()
1188 + */
1189 +
1190 +static int __init
1191 +free_available_memory(unsigned long start, unsigned long end, void *arg)
1192 +{
1193 + /* check max_low_pfn */
1194 + if (start >= (max_low_pfn << PAGE_SHIFT))
1195 + return 0;
1196 + if (end >= (max_low_pfn << PAGE_SHIFT))
1197 + end = max_low_pfn << PAGE_SHIFT;
1198 + if (start < end)
1199 + free_bootmem(start, end - start);
1200 +
1201 + return 0;
1202 +}
1203 +/*
1204 + * Register fully available low RAM pages with the bootmem allocator.
1205 + */
1206 +void __init register_bootmem_low_pages(unsigned long max_low_pfn)
1207 +{
1208 + int i;
1209 +
1210 + if (efi_enabled) {
1211 + efi_memmap_walk(free_available_memory, NULL);
1212 + return;
1213 + }
1214 + for (i = 0; i < e820.nr_map; i++) {
1215 + unsigned long curr_pfn, last_pfn, size;
1216 + /*
1217 + * Reserve usable low memory
1218 + */
1219 + if (e820.map[i].type != E820_RAM)
1220 + continue;
1221 + /*
1222 + * We are rounding up the start address of usable memory:
1223 + */
1224 + curr_pfn = PFN_UP(e820.map[i].addr);
1225 + if (curr_pfn >= max_low_pfn)
1226 + continue;
1227 + /*
1228 + * ... and at the end of the usable range downwards:
1229 + */
1230 + last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
1231 +
1232 +#ifdef CONFIG_XEN
1233 + /*
1234 + * Truncate to the number of actual pages currently
1235 + * present.
1236 + */
1237 + if (last_pfn > xen_start_info->nr_pages)
1238 + last_pfn = xen_start_info->nr_pages;
1239 +#endif
1240 +
1241 + if (last_pfn > max_low_pfn)
1242 + last_pfn = max_low_pfn;
1243 +
1244 + /*
1245 + * .. finally, did all the rounding and playing
1246 + * around just make the area go away?
1247 + */
1248 + if (last_pfn <= curr_pfn)
1249 + continue;
1250 +
1251 + size = last_pfn - curr_pfn;
1252 + free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
1253 + }
1254 +}
1255 +
1256 +void __init e820_register_memory(void)
1257 +{
1258 + unsigned long gapstart, gapsize, round;
1259 + unsigned long long last;
1260 + int i;
1261 +
1262 +#ifdef CONFIG_XEN
1263 + if (is_initial_xendomain()) {
1264 + struct xen_memory_map memmap;
1265 +
1266 + memmap.nr_entries = E820MAX;
1267 + set_xen_guest_handle(memmap.buffer, machine_e820.map);
1268 +
1269 + if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
1270 + BUG();
1271 + machine_e820.nr_map = memmap.nr_entries;
1272 + }
1273 + else
1274 + machine_e820 = e820;
1275 +#define e820 machine_e820
1276 +#endif
1277 +
1278 + /*
1279 + * Search for the bigest gap in the low 32 bits of the e820
1280 + * memory space.
1281 + */
1282 + last = 0x100000000ull;
1283 + gapstart = 0x10000000;
1284 + gapsize = 0x400000;
1285 + i = e820.nr_map;
1286 + while (--i >= 0) {
1287 + unsigned long long start = e820.map[i].addr;
1288 + unsigned long long end = start + e820.map[i].size;
1289 +
1290 + /*
1291 + * Since "last" is at most 4GB, we know we'll
1292 + * fit in 32 bits if this condition is true
1293 + */
1294 + if (last > end) {
1295 + unsigned long gap = last - end;
1296 +
1297 + if (gap > gapsize) {
1298 + gapsize = gap;
1299 + gapstart = end;
1300 + }
1301 + }
1302 + if (start < last)
1303 + last = start;
1304 + }
1305 +#undef e820
1306 +
1307 + /*
1308 + * See how much we want to round up: start off with
1309 + * rounding to the next 1MB area.
1310 + */
1311 + round = 0x100000;
1312 + while ((gapsize >> 4) > round)
1313 + round += round;
1314 + /* Fun with two's complement */
1315 + pci_mem_start = (gapstart + round) & -round;
1316 +
1317 + printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
1318 + pci_mem_start, gapstart, gapsize);
1319 +}
1320 +
1321 +void __init print_memory_map(char *who)
1322 +{
1323 + int i;
1324 +
1325 + for (i = 0; i < e820.nr_map; i++) {
1326 + printk(" %s: %016Lx - %016Lx ", who,
1327 + e820.map[i].addr,
1328 + e820.map[i].addr + e820.map[i].size);
1329 + switch (e820.map[i].type) {
1330 + case E820_RAM: printk("(usable)\n");
1331 + break;
1332 + case E820_RESERVED:
1333 + printk("(reserved)\n");
1334 + break;
1335 + case E820_ACPI:
1336 + printk("(ACPI data)\n");
1337 + break;
1338 + case E820_NVS:
1339 + printk("(ACPI NVS)\n");
1340 + break;
1341 + default: printk("type %lu\n", e820.map[i].type);
1342 + break;
1343 + }
1344 + }
1345 +}
1346 +
1347 +static __init __always_inline void efi_limit_regions(unsigned long long size)
1348 +{
1349 + unsigned long long current_addr = 0;
1350 + efi_memory_desc_t *md, *next_md;
1351 + void *p, *p1;
1352 + int i, j;
1353 +
1354 + j = 0;
1355 + p1 = memmap.map;
1356 + for (p = p1, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) {
1357 + md = p;
1358 + next_md = p1;
1359 + current_addr = md->phys_addr +
1360 + PFN_PHYS(md->num_pages);
1361 + if (is_available_memory(md)) {
1362 + if (md->phys_addr >= size) continue;
1363 + memcpy(next_md, md, memmap.desc_size);
1364 + if (current_addr >= size) {
1365 + next_md->num_pages -=
1366 + PFN_UP(current_addr-size);
1367 + }
1368 + p1 += memmap.desc_size;
1369 + next_md = p1;
1370 + j++;
1371 + } else if ((md->attribute & EFI_MEMORY_RUNTIME) ==
1372 + EFI_MEMORY_RUNTIME) {
1373 + /* In order to make runtime services
1374 + * available we have to include runtime
1375 + * memory regions in memory map */
1376 + memcpy(next_md, md, memmap.desc_size);
1377 + p1 += memmap.desc_size;
1378 + next_md = p1;
1379 + j++;
1380 + }
1381 + }
1382 + memmap.nr_map = j;
1383 + memmap.map_end = memmap.map +
1384 + (memmap.nr_map * memmap.desc_size);
1385 +}
1386 +
1387 +void __init limit_regions(unsigned long long size)
1388 +{
1389 + unsigned long long current_addr = 0;
1390 + int i;
1391 +
1392 + print_memory_map("limit_regions start");
1393 + if (efi_enabled) {
1394 + efi_limit_regions(size);
1395 + return;
1396 + }
1397 + for (i = 0; i < e820.nr_map; i++) {
1398 + current_addr = e820.map[i].addr + e820.map[i].size;
1399 + if (current_addr < size)
1400 + continue;
1401 +
1402 + if (e820.map[i].type != E820_RAM)
1403 + continue;
1404 +
1405 + if (e820.map[i].addr >= size) {
1406 + /*
1407 + * This region starts past the end of the
1408 + * requested size, skip it completely.
1409 + */
1410 + e820.nr_map = i;
1411 + } else {
1412 + e820.nr_map = i + 1;
1413 + e820.map[i].size -= current_addr - size;
1414 + }
1415 + print_memory_map("limit_regions endfor");
1416 + return;
1417 + }
1418 +#ifdef CONFIG_XEN
1419 + if (current_addr < size) {
1420 + /*
1421 + * The e820 map finished before our requested size so
1422 + * extend the final entry to the requested address.
1423 + */
1424 + --i;
1425 + if (e820.map[i].type == E820_RAM)
1426 + e820.map[i].size -= current_addr - size;
1427 + else
1428 + add_memory_region(current_addr, size - current_addr, E820_RAM);
1429 + }
1430 +#endif
1431 + print_memory_map("limit_regions endfunc");
1432 +}
1433 +
1434 +/*
1435 + * This function checks if any part of the range <start,end> is mapped
1436 + * with type.
1437 + */
1438 +int
1439 +e820_any_mapped(u64 start, u64 end, unsigned type)
1440 +{
1441 + int i;
1442 +
1443 +#ifndef CONFIG_XEN
1444 + for (i = 0; i < e820.nr_map; i++) {
1445 + const struct e820entry *ei = &e820.map[i];
1446 +#else
1447 + if (!is_initial_xendomain())
1448 + return 0;
1449 + for (i = 0; i < machine_e820.nr_map; ++i) {
1450 + const struct e820entry *ei = &machine_e820.map[i];
1451 +#endif
1452 +
1453 + if (type && ei->type != type)
1454 + continue;
1455 + if (ei->addr >= end || ei->addr + ei->size <= start)
1456 + continue;
1457 + return 1;
1458 + }
1459 + return 0;
1460 +}
1461 +EXPORT_SYMBOL_GPL(e820_any_mapped);
1462 +
1463 + /*
1464 + * This function checks if the entire range <start,end> is mapped with type.
1465 + *
1466 + * Note: this function only works correct if the e820 table is sorted and
1467 + * not-overlapping, which is the case
1468 + */
1469 +int __init
1470 +e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
1471 +{
1472 + u64 start = s;
1473 + u64 end = e;
1474 + int i;
1475 +
1476 +#ifndef CONFIG_XEN
1477 + for (i = 0; i < e820.nr_map; i++) {
1478 + struct e820entry *ei = &e820.map[i];
1479 +#else
1480 + if (!is_initial_xendomain())
1481 + return 0;
1482 + for (i = 0; i < machine_e820.nr_map; ++i) {
1483 + const struct e820entry *ei = &machine_e820.map[i];
1484 +#endif
1485 +
1486 + if (type && ei->type != type)
1487 + continue;
1488 + /* is the region (part) in overlap with the current region ?*/
1489 + if (ei->addr >= end || ei->addr + ei->size <= start)
1490 + continue;
1491 + /* if the region is at the beginning of <start,end> we move
1492 + * start to the end of the region since it's ok until there
1493 + */
1494 + if (ei->addr <= start)
1495 + start = ei->addr + ei->size;
1496 + /* if start is now at or beyond end, we're done, full
1497 + * coverage */
1498 + if (start >= end)
1499 + return 1; /* we're done */
1500 + }
1501 + return 0;
1502 +}
1503 +
1504 +static int __init parse_memmap(char *arg)
1505 +{
1506 + if (!arg)
1507 + return -EINVAL;
1508 +
1509 + if (strcmp(arg, "exactmap") == 0) {
1510 +#ifdef CONFIG_CRASH_DUMP
1511 + /* If we are doing a crash dump, we
1512 + * still need to know the real mem
1513 + * size before original memory map is
1514 + * reset.
1515 + */
1516 + find_max_pfn();
1517 + saved_max_pfn = max_pfn;
1518 +#endif
1519 + e820.nr_map = 0;
1520 + user_defined_memmap = 1;
1521 + } else {
1522 + /* If the user specifies memory size, we
1523 + * limit the BIOS-provided memory map to
1524 + * that size. exactmap can be used to specify
1525 + * the exact map. mem=number can be used to
1526 + * trim the existing memory map.
1527 + */
1528 + unsigned long long start_at, mem_size;
1529 +
1530 + mem_size = memparse(arg, &arg);
1531 + if (*arg == '@') {
1532 + start_at = memparse(arg+1, &arg);
1533 + add_memory_region(start_at, mem_size, E820_RAM);
1534 + } else if (*arg == '#') {
1535 + start_at = memparse(arg+1, &arg);
1536 + add_memory_region(start_at, mem_size, E820_ACPI);
1537 + } else if (*arg == '$') {
1538 + start_at = memparse(arg+1, &arg);
1539 + add_memory_region(start_at, mem_size, E820_RESERVED);
1540 + } else {
1541 + limit_regions(mem_size);
1542 + user_defined_memmap = 1;
1543 + }
1544 + }
1545 + return 0;
1546 +}
1547 +early_param("memmap", parse_memmap);
1548 --- a/arch/x86/kernel/entry_32-xen.S
1549 +++ b/arch/x86/kernel/entry_32-xen.S
1550 @@ -30,12 +30,13 @@
1551 * 18(%esp) - %eax
1552 * 1C(%esp) - %ds
1553 * 20(%esp) - %es
1554 - * 24(%esp) - orig_eax
1555 - * 28(%esp) - %eip
1556 - * 2C(%esp) - %cs
1557 - * 30(%esp) - %eflags
1558 - * 34(%esp) - %oldesp
1559 - * 38(%esp) - %oldss
1560 + * 24(%esp) - %gs
1561 + * 28(%esp) - orig_eax
1562 + * 2C(%esp) - %eip
1563 + * 30(%esp) - %cs
1564 + * 34(%esp) - %eflags
1565 + * 38(%esp) - %oldesp
1566 + * 3C(%esp) - %oldss
1567 *
1568 * "current" is in register %ebx during any slow entries.
1569 */
1570 @@ -48,27 +49,25 @@
1571 #include <asm/smp.h>
1572 #include <asm/page.h>
1573 #include <asm/desc.h>
1574 +#include <asm/percpu.h>
1575 #include <asm/dwarf2.h>
1576 #include "irq_vectors.h"
1577 #include <xen/interface/xen.h>
1578
1579 -#define nr_syscalls ((syscall_table_size)/4)
1580 +/*
1581 + * We use macros for low-level operations which need to be overridden
1582 + * for paravirtualization. The following will never clobber any registers:
1583 + * INTERRUPT_RETURN (aka. "iret")
1584 + * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
1585 + * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
1586 + *
1587 + * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
1588 + * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
1589 + * Allowing a register to be clobbered can shrink the paravirt replacement
1590 + * enough to patch inline, increasing performance.
1591 + */
1592
1593 -EBX = 0x00
1594 -ECX = 0x04
1595 -EDX = 0x08
1596 -ESI = 0x0C
1597 -EDI = 0x10
1598 -EBP = 0x14
1599 -EAX = 0x18
1600 -DS = 0x1C
1601 -ES = 0x20
1602 -ORIG_EAX = 0x24
1603 -EIP = 0x28
1604 -CS = 0x2C
1605 -EFLAGS = 0x30
1606 -OLDESP = 0x34
1607 -OLDSS = 0x38
1608 +#define nr_syscalls ((syscall_table_size)/4)
1609
1610 CF_MASK = 0x00000001
1611 TF_MASK = 0x00000100
1612 @@ -79,61 +78,16 @@
1613 /* Pseudo-eflags. */
1614 NMI_MASK = 0x80000000
1615
1616 -#ifndef CONFIG_XEN
1617 -/* These are replaces for paravirtualization */
1618 -#define DISABLE_INTERRUPTS cli
1619 -#define ENABLE_INTERRUPTS sti
1620 -#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit
1621 -#define INTERRUPT_RETURN iret
1622 -#define GET_CR0_INTO_EAX movl %cr0, %eax
1623 -#else
1624 -/* Offsets into shared_info_t. */
1625 -#define evtchn_upcall_pending /* 0 */
1626 -#define evtchn_upcall_mask 1
1627 -
1628 -#define sizeof_vcpu_shift 6
1629 -
1630 -#ifdef CONFIG_SMP
1631 -#define GET_VCPU_INFO movl TI_cpu(%ebp),%esi ; \
1632 - shl $sizeof_vcpu_shift,%esi ; \
1633 - addl HYPERVISOR_shared_info,%esi
1634 -#else
1635 -#define GET_VCPU_INFO movl HYPERVISOR_shared_info,%esi
1636 -#endif
1637 -
1638 -#define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(%esi)
1639 -#define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(%esi)
1640 -#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(%esi)
1641 -#define DISABLE_INTERRUPTS GET_VCPU_INFO ; \
1642 - __DISABLE_INTERRUPTS
1643 -#define ENABLE_INTERRUPTS GET_VCPU_INFO ; \
1644 - __ENABLE_INTERRUPTS
1645 -#define ENABLE_INTERRUPTS_SYSEXIT __ENABLE_INTERRUPTS ; \
1646 -sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ ; \
1647 - __TEST_PENDING ; \
1648 - jnz 14f # process more events if necessary... ; \
1649 - movl ESI(%esp), %esi ; \
1650 - sysexit ; \
1651 -14: __DISABLE_INTERRUPTS ; \
1652 - TRACE_IRQS_OFF ; \
1653 -sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/ ; \
1654 - push %esp ; \
1655 - call evtchn_do_upcall ; \
1656 - add $4,%esp ; \
1657 - jmp ret_from_intr
1658 -#define INTERRUPT_RETURN iret
1659 -#endif
1660 -
1661 #ifdef CONFIG_PREEMPT
1662 -#define preempt_stop DISABLE_INTERRUPTS; TRACE_IRQS_OFF
1663 +#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
1664 #else
1665 -#define preempt_stop
1666 +#define preempt_stop(clobbers)
1667 #define resume_kernel restore_nocheck
1668 #endif
1669
1670 .macro TRACE_IRQS_IRET
1671 #ifdef CONFIG_TRACE_IRQFLAGS
1672 - testl $IF_MASK,EFLAGS(%esp) # interrupts off?
1673 + testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off?
1674 jz 1f
1675 TRACE_IRQS_ON
1676 1:
1677 @@ -148,6 +102,9 @@
1678
1679 #define SAVE_ALL \
1680 cld; \
1681 + pushl %gs; \
1682 + CFI_ADJUST_CFA_OFFSET 4;\
1683 + /*CFI_REL_OFFSET gs, 0;*/\
1684 pushl %es; \
1685 CFI_ADJUST_CFA_OFFSET 4;\
1686 /*CFI_REL_OFFSET es, 0;*/\
1687 @@ -177,7 +134,9 @@
1688 CFI_REL_OFFSET ebx, 0;\
1689 movl $(__USER_DS), %edx; \
1690 movl %edx, %ds; \
1691 - movl %edx, %es;
1692 + movl %edx, %es; \
1693 + movl $(__KERNEL_PDA), %edx; \
1694 + movl %edx, %gs
1695
1696 #define RESTORE_INT_REGS \
1697 popl %ebx; \
1698 @@ -210,17 +169,22 @@
1699 2: popl %es; \
1700 CFI_ADJUST_CFA_OFFSET -4;\
1701 /*CFI_RESTORE es;*/\
1702 -.section .fixup,"ax"; \
1703 -3: movl $0,(%esp); \
1704 - jmp 1b; \
1705 +3: popl %gs; \
1706 + CFI_ADJUST_CFA_OFFSET -4;\
1707 + /*CFI_RESTORE gs;*/\
1708 +.pushsection .fixup,"ax"; \
1709 4: movl $0,(%esp); \
1710 + jmp 1b; \
1711 +5: movl $0,(%esp); \
1712 jmp 2b; \
1713 -.previous; \
1714 +6: movl $0,(%esp); \
1715 + jmp 3b; \
1716 .section __ex_table,"a";\
1717 .align 4; \
1718 - .long 1b,3b; \
1719 - .long 2b,4b; \
1720 -.previous
1721 + .long 1b,4b; \
1722 + .long 2b,5b; \
1723 + .long 3b,6b; \
1724 +.popsection
1725
1726 #define RING0_INT_FRAME \
1727 CFI_STARTPROC simple;\
1728 @@ -239,18 +203,18 @@
1729 #define RING0_PTREGS_FRAME \
1730 CFI_STARTPROC simple;\
1731 CFI_SIGNAL_FRAME;\
1732 - CFI_DEF_CFA esp, OLDESP-EBX;\
1733 - /*CFI_OFFSET cs, CS-OLDESP;*/\
1734 - CFI_OFFSET eip, EIP-OLDESP;\
1735 - /*CFI_OFFSET es, ES-OLDESP;*/\
1736 - /*CFI_OFFSET ds, DS-OLDESP;*/\
1737 - CFI_OFFSET eax, EAX-OLDESP;\
1738 - CFI_OFFSET ebp, EBP-OLDESP;\
1739 - CFI_OFFSET edi, EDI-OLDESP;\
1740 - CFI_OFFSET esi, ESI-OLDESP;\
1741 - CFI_OFFSET edx, EDX-OLDESP;\
1742 - CFI_OFFSET ecx, ECX-OLDESP;\
1743 - CFI_OFFSET ebx, EBX-OLDESP
1744 + CFI_DEF_CFA esp, PT_OLDESP-PT_EBX;\
1745 + /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/\
1746 + CFI_OFFSET eip, PT_EIP-PT_OLDESP;\
1747 + /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/\
1748 + /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/\
1749 + CFI_OFFSET eax, PT_EAX-PT_OLDESP;\
1750 + CFI_OFFSET ebp, PT_EBP-PT_OLDESP;\
1751 + CFI_OFFSET edi, PT_EDI-PT_OLDESP;\
1752 + CFI_OFFSET esi, PT_ESI-PT_OLDESP;\
1753 + CFI_OFFSET edx, PT_EDX-PT_OLDESP;\
1754 + CFI_OFFSET ecx, PT_ECX-PT_OLDESP;\
1755 + CFI_OFFSET ebx, PT_EBX-PT_OLDESP
1756
1757 ENTRY(ret_from_fork)
1758 CFI_STARTPROC
1759 @@ -278,17 +242,18 @@
1760 ALIGN
1761 RING0_PTREGS_FRAME
1762 ret_from_exception:
1763 - preempt_stop
1764 + preempt_stop(CLBR_ANY)
1765 ret_from_intr:
1766 GET_THREAD_INFO(%ebp)
1767 check_userspace:
1768 - movl EFLAGS(%esp), %eax # mix EFLAGS and CS
1769 - movb CS(%esp), %al
1770 + movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
1771 + movb PT_CS(%esp), %al
1772 andl $(VM_MASK | SEGMENT_RPL_MASK), %eax
1773 cmpl $USER_RPL, %eax
1774 jb resume_kernel # not returning to v8086 or userspace
1775 +
1776 ENTRY(resume_userspace)
1777 - DISABLE_INTERRUPTS # make sure we don't miss an interrupt
1778 + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
1779 # setting need_resched or sigpending
1780 # between sampling and the iret
1781 movl TI_flags(%ebp), %ecx
1782 @@ -299,14 +264,14 @@
1783
1784 #ifdef CONFIG_PREEMPT
1785 ENTRY(resume_kernel)
1786 - DISABLE_INTERRUPTS
1787 + DISABLE_INTERRUPTS(CLBR_ANY)
1788 cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
1789 jnz restore_nocheck
1790 need_resched:
1791 movl TI_flags(%ebp), %ecx # need_resched set ?
1792 testb $_TIF_NEED_RESCHED, %cl
1793 jz restore_all
1794 - testl $IF_MASK,EFLAGS(%esp) # interrupts off (exception path) ?
1795 + testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off (exception path) ?
1796 jz restore_all
1797 call preempt_schedule_irq
1798 jmp need_resched
1799 @@ -328,7 +293,7 @@
1800 * No need to follow this irqs on/off section: the syscall
1801 * disabled irqs and here we enable it straight after entry:
1802 */
1803 - ENABLE_INTERRUPTS
1804 + ENABLE_INTERRUPTS(CLBR_NONE)
1805 pushl $(__USER_DS)
1806 CFI_ADJUST_CFA_OFFSET 4
1807 /*CFI_REL_OFFSET ss, 0*/
1808 @@ -340,12 +305,16 @@
1809 pushl $(__USER_CS)
1810 CFI_ADJUST_CFA_OFFSET 4
1811 /*CFI_REL_OFFSET cs, 0*/
1812 +#ifndef CONFIG_COMPAT_VDSO
1813 /*
1814 * Push current_thread_info()->sysenter_return to the stack.
1815 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
1816 * pushed above; +8 corresponds to copy_thread's esp0 setting.
1817 */
1818 pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
1819 +#else
1820 + pushl $SYSENTER_RETURN
1821 +#endif
1822 CFI_ADJUST_CFA_OFFSET 4
1823 CFI_REL_OFFSET eip, 0
1824
1825 @@ -372,19 +341,27 @@
1826 cmpl $(nr_syscalls), %eax
1827 jae syscall_badsys
1828 call *sys_call_table(,%eax,4)
1829 - movl %eax,EAX(%esp)
1830 - DISABLE_INTERRUPTS
1831 + movl %eax,PT_EAX(%esp)
1832 + DISABLE_INTERRUPTS(CLBR_ECX|CLBR_EDX)
1833 TRACE_IRQS_OFF
1834 movl TI_flags(%ebp), %ecx
1835 testw $_TIF_ALLWORK_MASK, %cx
1836 jne syscall_exit_work
1837 /* if something modifies registers it must also disable sysexit */
1838 - movl EIP(%esp), %edx
1839 - movl OLDESP(%esp), %ecx
1840 + movl PT_EIP(%esp), %edx
1841 + movl PT_OLDESP(%esp), %ecx
1842 xorl %ebp,%ebp
1843 TRACE_IRQS_ON
1844 +1: mov PT_GS(%esp), %gs
1845 ENABLE_INTERRUPTS_SYSEXIT
1846 CFI_ENDPROC
1847 +.pushsection .fixup,"ax"
1848 +2: movl $0,PT_GS(%esp)
1849 + jmp 1b
1850 +.section __ex_table,"a"
1851 + .align 4
1852 + .long 1b,2b
1853 +.popsection
1854
1855 # pv sysenter call handler stub
1856 ENTRY(sysenter_entry_pv)
1857 @@ -419,7 +396,7 @@
1858 CFI_ADJUST_CFA_OFFSET 4
1859 SAVE_ALL
1860 GET_THREAD_INFO(%ebp)
1861 - testl $TF_MASK,EFLAGS(%esp)
1862 + testl $TF_MASK,PT_EFLAGS(%esp)
1863 jz no_singlestep
1864 orl $_TIF_SINGLESTEP,TI_flags(%ebp)
1865 no_singlestep:
1866 @@ -431,9 +408,9 @@
1867 jae syscall_badsys
1868 syscall_call:
1869 call *sys_call_table(,%eax,4)
1870 - movl %eax,EAX(%esp) # store the return value
1871 + movl %eax,PT_EAX(%esp) # store the return value
1872 syscall_exit:
1873 - DISABLE_INTERRUPTS # make sure we don't miss an interrupt
1874 + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
1875 # setting need_resched or sigpending
1876 # between sampling and the iret
1877 TRACE_IRQS_OFF
1878 @@ -443,12 +420,12 @@
1879
1880 restore_all:
1881 #ifndef CONFIG_XEN
1882 - movl EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
1883 - # Warning: OLDSS(%esp) contains the wrong/random values if we
1884 + movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
1885 + # Warning: PT_OLDSS(%esp) contains the wrong/random values if we
1886 # are returning to the kernel.
1887 # See comments in process.c:copy_thread() for details.
1888 - movb OLDSS(%esp), %ah
1889 - movb CS(%esp), %al
1890 + movb PT_OLDSS(%esp), %ah
1891 + movb PT_CS(%esp), %al
1892 andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
1893 cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
1894 CFI_REMEMBER_STATE
1895 @@ -456,7 +433,7 @@
1896 restore_nocheck:
1897 #else
1898 restore_nocheck:
1899 - movl EFLAGS(%esp), %eax
1900 + movl PT_EFLAGS(%esp), %eax
1901 testl $(VM_MASK|NMI_MASK), %eax
1902 CFI_REMEMBER_STATE
1903 jnz hypervisor_iret
1904 @@ -470,13 +447,13 @@
1905 TRACE_IRQS_IRET
1906 restore_nocheck_notrace:
1907 RESTORE_REGS
1908 - addl $4, %esp
1909 + addl $4, %esp # skip orig_eax/error_code
1910 CFI_ADJUST_CFA_OFFSET -4
1911 1: INTERRUPT_RETURN
1912 .section .fixup,"ax"
1913 iret_exc:
1914 #ifndef CONFIG_XEN
1915 - ENABLE_INTERRUPTS
1916 + ENABLE_INTERRUPTS(CLBR_NONE)
1917 #endif
1918 pushl $0 # no error code
1919 pushl $do_iret_error
1920 @@ -490,33 +467,42 @@
1921 CFI_RESTORE_STATE
1922 #ifndef CONFIG_XEN
1923 ldt_ss:
1924 - larl OLDSS(%esp), %eax
1925 + larl PT_OLDSS(%esp), %eax
1926 jnz restore_nocheck
1927 testl $0x00400000, %eax # returning to 32bit stack?
1928 jnz restore_nocheck # allright, normal return
1929 +
1930 +#ifdef CONFIG_PARAVIRT
1931 + /*
1932 + * The kernel can't run on a non-flat stack if paravirt mode
1933 + * is active. Rather than try to fixup the high bits of
1934 + * ESP, bypass this code entirely. This may break DOSemu
1935 + * and/or Wine support in a paravirt VM, although the option
1936 + * is still available to implement the setting of the high
1937 + * 16-bits in the INTERRUPT_RETURN paravirt-op.
1938 + */
1939 + cmpl $0, paravirt_ops+PARAVIRT_enabled
1940 + jne restore_nocheck
1941 +#endif
1942 +
1943 /* If returning to userspace with 16bit stack,
1944 * try to fix the higher word of ESP, as the CPU
1945 * won't restore it.
1946 * This is an "official" bug of all the x86-compatible
1947 * CPUs, which we can try to work around to make
1948 * dosemu and wine happy. */
1949 - subl $8, %esp # reserve space for switch16 pointer
1950 - CFI_ADJUST_CFA_OFFSET 8
1951 - DISABLE_INTERRUPTS
1952 + movl PT_OLDESP(%esp), %eax
1953 + movl %esp, %edx
1954 + call patch_espfix_desc
1955 + pushl $__ESPFIX_SS
1956 + CFI_ADJUST_CFA_OFFSET 4
1957 + pushl %eax
1958 + CFI_ADJUST_CFA_OFFSET 4
1959 + DISABLE_INTERRUPTS(CLBR_EAX)
1960 TRACE_IRQS_OFF
1961 - movl %esp, %eax
1962 - /* Set up the 16bit stack frame with switch32 pointer on top,
1963 - * and a switch16 pointer on top of the current frame. */
1964 - call setup_x86_bogus_stack
1965 - CFI_ADJUST_CFA_OFFSET -8 # frame has moved
1966 - TRACE_IRQS_IRET
1967 - RESTORE_REGS
1968 - lss 20+4(%esp), %esp # switch to 16bit stack
1969 -1: INTERRUPT_RETURN
1970 -.section __ex_table,"a"
1971 - .align 4
1972 - .long 1b,iret_exc
1973 -.previous
1974 + lss (%esp), %esp
1975 + CFI_ADJUST_CFA_OFFSET -8
1976 + jmp restore_nocheck
1977 #else
1978 ALIGN
1979 restore_all_enable_events:
1980 @@ -540,7 +526,7 @@
1981
1982 CFI_RESTORE_STATE
1983 hypervisor_iret:
1984 - andl $~NMI_MASK, EFLAGS(%esp)
1985 + andl $~NMI_MASK, PT_EFLAGS(%esp)
1986 RESTORE_REGS
1987 addl $4, %esp
1988 CFI_ADJUST_CFA_OFFSET -4
1989 @@ -556,7 +542,7 @@
1990 jz work_notifysig
1991 work_resched:
1992 call schedule
1993 - DISABLE_INTERRUPTS # make sure we don't miss an interrupt
1994 + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
1995 # setting need_resched or sigpending
1996 # between sampling and the iret
1997 TRACE_IRQS_OFF
1998 @@ -569,7 +555,8 @@
1999
2000 work_notifysig: # deal with pending signals and
2001 # notify-resume requests
2002 - testl $VM_MASK, EFLAGS(%esp)
2003 +#ifdef CONFIG_VM86
2004 + testl $VM_MASK, PT_EFLAGS(%esp)
2005 movl %esp, %eax
2006 jne work_notifysig_v86 # returning to kernel-space or
2007 # vm86-space
2008 @@ -579,29 +566,30 @@
2009
2010 ALIGN
2011 work_notifysig_v86:
2012 -#ifdef CONFIG_VM86
2013 pushl %ecx # save ti_flags for do_notify_resume
2014 CFI_ADJUST_CFA_OFFSET 4
2015 call save_v86_state # %eax contains pt_regs pointer
2016 popl %ecx
2017 CFI_ADJUST_CFA_OFFSET -4
2018 movl %eax, %esp
2019 +#else
2020 + movl %esp, %eax
2021 +#endif
2022 xorl %edx, %edx
2023 call do_notify_resume
2024 jmp resume_userspace_sig
2025 -#endif
2026
2027 # perform syscall exit tracing
2028 ALIGN
2029 syscall_trace_entry:
2030 - movl $-ENOSYS,EAX(%esp)
2031 + movl $-ENOSYS,PT_EAX(%esp)
2032 movl %esp, %eax
2033 xorl %edx,%edx
2034 call do_syscall_trace
2035 cmpl $0, %eax
2036 jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
2037 # so must skip actual syscall
2038 - movl ORIG_EAX(%esp), %eax
2039 + movl PT_ORIG_EAX(%esp), %eax
2040 cmpl $(nr_syscalls), %eax
2041 jnae syscall_call
2042 jmp syscall_exit
2043 @@ -612,7 +600,7 @@
2044 testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
2045 jz work_pending
2046 TRACE_IRQS_ON
2047 - ENABLE_INTERRUPTS # could let do_syscall_trace() call
2048 + ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call
2049 # schedule() instead
2050 movl %esp, %eax
2051 movl $1, %edx
2052 @@ -626,40 +614,39 @@
2053 CFI_ADJUST_CFA_OFFSET 4
2054 SAVE_ALL
2055 GET_THREAD_INFO(%ebp)
2056 - movl $-EFAULT,EAX(%esp)
2057 + movl $-EFAULT,PT_EAX(%esp)
2058 jmp resume_userspace
2059
2060 syscall_badsys:
2061 - movl $-ENOSYS,EAX(%esp)
2062 + movl $-ENOSYS,PT_EAX(%esp)
2063 jmp resume_userspace
2064 CFI_ENDPROC
2065
2066 #ifndef CONFIG_XEN
2067 #define FIXUP_ESPFIX_STACK \
2068 - movl %esp, %eax; \
2069 - /* switch to 32bit stack using the pointer on top of 16bit stack */ \
2070 - lss %ss:CPU_16BIT_STACK_SIZE-8, %esp; \
2071 - /* copy data from 16bit stack to 32bit stack */ \
2072 - call fixup_x86_bogus_stack; \
2073 - /* put ESP to the proper location */ \
2074 - movl %eax, %esp;
2075 -#define UNWIND_ESPFIX_STACK \
2076 + /* since we are on a wrong stack, we cant make it a C code :( */ \
2077 + movl %gs:PDA_cpu, %ebx; \
2078 + PER_CPU(cpu_gdt_descr, %ebx); \
2079 + movl GDS_address(%ebx), %ebx; \
2080 + GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \
2081 + addl %esp, %eax; \
2082 + pushl $__KERNEL_DS; \
2083 + CFI_ADJUST_CFA_OFFSET 4; \
2084 pushl %eax; \
2085 CFI_ADJUST_CFA_OFFSET 4; \
2086 + lss (%esp), %esp; \
2087 + CFI_ADJUST_CFA_OFFSET -8;
2088 +#define UNWIND_ESPFIX_STACK \
2089 movl %ss, %eax; \
2090 - /* see if on 16bit stack */ \
2091 + /* see if on espfix stack */ \
2092 cmpw $__ESPFIX_SS, %ax; \
2093 - je 28f; \
2094 -27: popl %eax; \
2095 - CFI_ADJUST_CFA_OFFSET -4; \
2096 -.section .fixup,"ax"; \
2097 -28: movl $__KERNEL_DS, %eax; \
2098 + jne 27f; \
2099 + movl $__KERNEL_DS, %eax; \
2100 movl %eax, %ds; \
2101 movl %eax, %es; \
2102 - /* switch to 32bit stack */ \
2103 + /* switch to normal stack */ \
2104 FIXUP_ESPFIX_STACK; \
2105 - jmp 27b; \
2106 -.previous
2107 +27:;
2108
2109 /*
2110 * Build the entry stubs and pointer table with
2111 @@ -723,13 +710,16 @@
2112 CFI_ADJUST_CFA_OFFSET 4
2113 ALIGN
2114 error_code:
2115 + /* the function address is in %gs's slot on the stack */
2116 + pushl %es
2117 + CFI_ADJUST_CFA_OFFSET 4
2118 + /*CFI_REL_OFFSET es, 0*/
2119 pushl %ds
2120 CFI_ADJUST_CFA_OFFSET 4
2121 /*CFI_REL_OFFSET ds, 0*/
2122 pushl %eax
2123 CFI_ADJUST_CFA_OFFSET 4
2124 CFI_REL_OFFSET eax, 0
2125 - xorl %eax, %eax
2126 pushl %ebp
2127 CFI_ADJUST_CFA_OFFSET 4
2128 CFI_REL_OFFSET ebp, 0
2129 @@ -742,7 +732,6 @@
2130 pushl %edx
2131 CFI_ADJUST_CFA_OFFSET 4
2132 CFI_REL_OFFSET edx, 0
2133 - decl %eax # eax = -1
2134 pushl %ecx
2135 CFI_ADJUST_CFA_OFFSET 4
2136 CFI_REL_OFFSET ecx, 0
2137 @@ -750,18 +739,20 @@
2138 CFI_ADJUST_CFA_OFFSET 4
2139 CFI_REL_OFFSET ebx, 0
2140 cld
2141 - pushl %es
2142 + pushl %gs
2143 CFI_ADJUST_CFA_OFFSET 4
2144 - /*CFI_REL_OFFSET es, 0*/
2145 + /*CFI_REL_OFFSET gs, 0*/
2146 + movl $(__KERNEL_PDA), %ecx
2147 + movl %ecx, %gs
2148 UNWIND_ESPFIX_STACK
2149 popl %ecx
2150 CFI_ADJUST_CFA_OFFSET -4
2151 /*CFI_REGISTER es, ecx*/
2152 - movl ES(%esp), %edi # get the function address
2153 - movl ORIG_EAX(%esp), %edx # get the error code
2154 - movl %eax, ORIG_EAX(%esp)
2155 - movl %ecx, ES(%esp)
2156 - /*CFI_REL_OFFSET es, ES*/
2157 + movl PT_GS(%esp), %edi # get the function address
2158 + movl PT_ORIG_EAX(%esp), %edx # get the error code
2159 + movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
2160 + mov %ecx, PT_GS(%esp)
2161 + /*CFI_REL_OFFSET gs, ES*/
2162 movl $(__USER_DS), %ecx
2163 movl %ecx, %ds
2164 movl %ecx, %es
2165 @@ -793,7 +784,7 @@
2166 pushl %eax
2167 CFI_ADJUST_CFA_OFFSET 4
2168 SAVE_ALL
2169 - movl EIP(%esp),%eax
2170 + movl PT_EIP(%esp),%eax
2171 cmpl $scrit,%eax
2172 jb 11f
2173 cmpl $ecrit,%eax
2174 @@ -802,7 +793,7 @@
2175 jb 11f
2176 cmpl $sysexit_ecrit,%eax
2177 ja 11f
2178 - addl $OLDESP,%esp # Remove eflags...ebx from stack frame.
2179 + addl $PT_OLDESP,%esp # Remove eflags...ebx from stack frame.
2180 11: push %esp
2181 CFI_ADJUST_CFA_OFFSET 4
2182 call evtchn_do_upcall
2183 @@ -824,7 +815,7 @@
2184 jne 15f
2185 xorl %ecx,%ecx
2186 15: leal (%esp,%ecx),%esi # %esi points at end of src region
2187 - leal OLDESP(%esp),%edi # %edi points at end of dst region
2188 + leal PT_OLDESP(%esp),%edi # %edi points at end of dst region
2189 shrl $2,%ecx # convert words to bytes
2190 je 17f # skip loop if nothing to copy
2191 16: subl $4,%esi # pre-decrementing copy loop
2192 @@ -848,8 +839,9 @@
2193 .byte 0x18 # pop %eax
2194 .byte 0x1c # pop %ds
2195 .byte 0x20 # pop %es
2196 - .byte 0x24,0x24,0x24 # add $4,%esp
2197 - .byte 0x28 # iret
2198 + .byte 0x24,0x24 # pop %gs
2199 + .byte 0x28,0x28,0x28 # add $4,%esp
2200 + .byte 0x2c # iret
2201 .byte 0xff,0xff,0xff,0xff # movb $1,1(%esi)
2202 .byte 0x00,0x00 # jmp 11b
2203 .previous
2204 @@ -940,7 +932,7 @@
2205 jmp ret_from_exception
2206 device_available_emulate:
2207 #endif
2208 - preempt_stop
2209 + preempt_stop(CLBR_ANY)
2210 call math_state_restore
2211 jmp ret_from_exception
2212 CFI_ENDPROC
2213 @@ -1010,7 +1002,7 @@
2214 cmpw $__ESPFIX_SS, %ax
2215 popl %eax
2216 CFI_ADJUST_CFA_OFFSET -4
2217 - je nmi_16bit_stack
2218 + je nmi_espfix_stack
2219 cmpl $sysenter_entry,(%esp)
2220 je nmi_stack_fixup
2221 pushl %eax
2222 @@ -1053,7 +1045,7 @@
2223 FIX_STACK(24,nmi_stack_correct, 1)
2224 jmp nmi_stack_correct
2225
2226 -nmi_16bit_stack:
2227 +nmi_espfix_stack:
2228 /* We have a RING0_INT_FRAME here.
2229 *
2230 * create the pointer to lss back
2231 @@ -1062,7 +1054,6 @@
2232 CFI_ADJUST_CFA_OFFSET 4
2233 pushl %esp
2234 CFI_ADJUST_CFA_OFFSET 4
2235 - movzwl %sp, %esp
2236 addw $4, (%esp)
2237 /* copy the iret frame of 12 bytes */
2238 .rept 3
2239 @@ -1073,11 +1064,11 @@
2240 CFI_ADJUST_CFA_OFFSET 4
2241 SAVE_ALL
2242 FIXUP_ESPFIX_STACK # %eax == %esp
2243 - CFI_ADJUST_CFA_OFFSET -20 # the frame has now moved
2244 xorl %edx,%edx # zero error code
2245 call do_nmi
2246 RESTORE_REGS
2247 - lss 12+4(%esp), %esp # back to 16bit stack
2248 + lss 12+4(%esp), %esp # back to espfix stack
2249 + CFI_ADJUST_CFA_OFFSET -24
2250 1: INTERRUPT_RETURN
2251 CFI_ENDPROC
2252 .section __ex_table,"a"
2253 @@ -1093,12 +1084,25 @@
2254 xorl %edx,%edx # zero error code
2255 movl %esp,%eax # pt_regs pointer
2256 call do_nmi
2257 - orl $NMI_MASK, EFLAGS(%esp)
2258 + orl $NMI_MASK, PT_EFLAGS(%esp)
2259 jmp restore_all
2260 CFI_ENDPROC
2261 #endif
2262 KPROBE_END(nmi)
2263
2264 +#ifdef CONFIG_PARAVIRT
2265 +ENTRY(native_iret)
2266 +1: iret
2267 +.section __ex_table,"a"
2268 + .align 4
2269 + .long 1b,iret_exc
2270 +.previous
2271 +
2272 +ENTRY(native_irq_enable_sysexit)
2273 + sti
2274 + sysexit
2275 +#endif
2276 +
2277 KPROBE_ENTRY(int3)
2278 RING0_INT_FRAME
2279 pushl $-1 # mark this as an int
2280 @@ -1214,37 +1218,6 @@
2281 CFI_ENDPROC
2282 #endif /* !CONFIG_XEN */
2283
2284 -#ifdef CONFIG_STACK_UNWIND
2285 -ENTRY(arch_unwind_init_running)
2286 - CFI_STARTPROC
2287 - movl 4(%esp), %edx
2288 - movl (%esp), %ecx
2289 - leal 4(%esp), %eax
2290 - movl %ebx, EBX(%edx)
2291 - xorl %ebx, %ebx
2292 - movl %ebx, ECX(%edx)
2293 - movl %ebx, EDX(%edx)
2294 - movl %esi, ESI(%edx)
2295 - movl %edi, EDI(%edx)
2296 - movl %ebp, EBP(%edx)
2297 - movl %ebx, EAX(%edx)
2298 - movl $__USER_DS, DS(%edx)
2299 - movl $__USER_DS, ES(%edx)
2300 - movl %ebx, ORIG_EAX(%edx)
2301 - movl %ecx, EIP(%edx)
2302 - movl 12(%esp), %ecx
2303 - movl $__KERNEL_CS, CS(%edx)
2304 - movl %ebx, EFLAGS(%edx)
2305 - movl %eax, OLDESP(%edx)
2306 - movl 8(%esp), %eax
2307 - movl %ecx, 8(%esp)
2308 - movl EBX(%edx), %ebx
2309 - movl $__KERNEL_DS, OLDSS(%edx)
2310 - jmpl *%eax
2311 - CFI_ENDPROC
2312 -ENDPROC(arch_unwind_init_running)
2313 -#endif
2314 -
2315 ENTRY(fixup_4gb_segment)
2316 RING0_EC_FRAME
2317 pushl $do_fixup_4gb_segment
2318 --- a/arch/x86/kernel/entry_64-xen.S
2319 +++ b/arch/x86/kernel/entry_64-xen.S
2320 @@ -261,7 +261,6 @@
2321 movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
2322 GET_THREAD_INFO(%rcx)
2323 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
2324 - CFI_REMEMBER_STATE
2325 jnz tracesys
2326 cmpq $__NR_syscall_max,%rax
2327 ja badsys
2328 @@ -272,7 +271,6 @@
2329 * Syscall return path ending with SYSRET (fast path)
2330 * Has incomplete stack frame and undefined top of stack.
2331 */
2332 - .globl ret_from_sys_call
2333 ret_from_sys_call:
2334 movl $_TIF_ALLWORK_MASK,%edi
2335 /* edi: flagmask */
2336 @@ -282,8 +280,8 @@
2337 TRACE_IRQS_OFF
2338 movl threadinfo_flags(%rcx),%edx
2339 andl %edi,%edx
2340 - CFI_REMEMBER_STATE
2341 jnz sysret_careful
2342 + CFI_REMEMBER_STATE
2343 /*
2344 * sysretq will re-enable interrupts:
2345 */
2346 @@ -292,10 +290,10 @@
2347 RESTORE_ARGS 0,8,0
2348 HYPERVISOR_IRET VGCF_IN_SYSCALL
2349
2350 + CFI_RESTORE_STATE
2351 /* Handle reschedules */
2352 /* edx: work, edi: workmask */
2353 sysret_careful:
2354 - CFI_RESTORE_STATE
2355 bt $TIF_NEED_RESCHED,%edx
2356 jnc sysret_signal
2357 TRACE_IRQS_ON
2358 @@ -334,7 +332,6 @@
2359
2360 /* Do syscall tracing */
2361 tracesys:
2362 - CFI_RESTORE_STATE
2363 SAVE_REST
2364 movq $-ENOSYS,RAX(%rsp)
2365 FIXUP_TOP_OF_STACK %rdi
2366 @@ -350,32 +347,13 @@
2367 call *sys_call_table(,%rax,8)
2368 1: movq %rax,RAX-ARGOFFSET(%rsp)
2369 /* Use IRET because user could have changed frame */
2370 - jmp int_ret_from_sys_call
2371 - CFI_ENDPROC
2372 -END(system_call)
2373
2374 /*
2375 * Syscall return path ending with IRET.
2376 * Has correct top of stack, but partial stack frame.
2377 - */
2378 -ENTRY(int_ret_from_sys_call)
2379 - CFI_STARTPROC simple
2380 - CFI_SIGNAL_FRAME
2381 - CFI_DEF_CFA rsp,SS+8-ARGOFFSET
2382 - /*CFI_REL_OFFSET ss,SS-ARGOFFSET*/
2383 - CFI_REL_OFFSET rsp,RSP-ARGOFFSET
2384 - /*CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
2385 - /*CFI_REL_OFFSET cs,CS-ARGOFFSET*/
2386 - CFI_REL_OFFSET rip,RIP-ARGOFFSET
2387 - CFI_REL_OFFSET rdx,RDX-ARGOFFSET
2388 - CFI_REL_OFFSET rcx,RCX-ARGOFFSET
2389 - CFI_REL_OFFSET rax,RAX-ARGOFFSET
2390 - CFI_REL_OFFSET rdi,RDI-ARGOFFSET
2391 - CFI_REL_OFFSET rsi,RSI-ARGOFFSET
2392 - CFI_REL_OFFSET r8,R8-ARGOFFSET
2393 - CFI_REL_OFFSET r9,R9-ARGOFFSET
2394 - CFI_REL_OFFSET r10,R10-ARGOFFSET
2395 - CFI_REL_OFFSET r11,R11-ARGOFFSET
2396 + */
2397 + .globl int_ret_from_sys_call
2398 +int_ret_from_sys_call:
2399 XEN_BLOCK_EVENTS(%rsi)
2400 TRACE_IRQS_OFF
2401 testb $3,CS-ARGOFFSET(%rsp)
2402 @@ -428,8 +406,6 @@
2403 popq %rdi
2404 CFI_ADJUST_CFA_OFFSET -8
2405 andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi
2406 - XEN_BLOCK_EVENTS(%rsi)
2407 - TRACE_IRQS_OFF
2408 jmp int_restore_rest
2409
2410 int_signal:
2411 @@ -445,7 +421,7 @@
2412 TRACE_IRQS_OFF
2413 jmp int_with_check
2414 CFI_ENDPROC
2415 -END(int_ret_from_sys_call)
2416 +END(system_call)
2417
2418 /*
2419 * Certain special system calls that need to save a complete full stack frame.
2420 @@ -1275,36 +1251,3 @@
2421 ret
2422 CFI_ENDPROC
2423 ENDPROC(call_softirq)
2424 -
2425 -#ifdef CONFIG_STACK_UNWIND
2426 -ENTRY(arch_unwind_init_running)
2427 - CFI_STARTPROC
2428 - movq %r15, R15(%rdi)
2429 - movq %r14, R14(%rdi)
2430 - xchgq %rsi, %rdx
2431 - movq %r13, R13(%rdi)
2432 - movq %r12, R12(%rdi)
2433 - xorl %eax, %eax
2434 - movq %rbp, RBP(%rdi)
2435 - movq %rbx, RBX(%rdi)
2436 - movq (%rsp), %rcx
2437 - movq %rax, R11(%rdi)
2438 - movq %rax, R10(%rdi)
2439 - movq %rax, R9(%rdi)
2440 - movq %rax, R8(%rdi)
2441 - movq %rax, RAX(%rdi)
2442 - movq %rax, RCX(%rdi)
2443 - movq %rax, RDX(%rdi)
2444 - movq %rax, RSI(%rdi)
2445 - movq %rax, RDI(%rdi)
2446 - movq %rax, ORIG_RAX(%rdi)
2447 - movq %rcx, RIP(%rdi)
2448 - leaq 8(%rsp), %rcx
2449 - movq $__KERNEL_CS, CS(%rdi)
2450 - movq %rax, EFLAGS(%rdi)
2451 - movq %rcx, RSP(%rdi)
2452 - movq $__KERNEL_DS, SS(%rdi)
2453 - jmpq *%rdx
2454 - CFI_ENDPROC
2455 -ENDPROC(arch_unwind_init_running)
2456 -#endif
2457 --- a/arch/x86/kernel/genapic_64-xen.c
2458 +++ b/arch/x86/kernel/genapic_64-xen.c
2459 @@ -34,6 +34,7 @@
2460
2461 #ifndef CONFIG_XEN
2462 struct genapic *genapic = &apic_flat;
2463 +struct genapic *genapic_force;
2464 #else
2465 extern struct genapic apic_xen;
2466 struct genapic *genapic = &apic_xen;
2467 @@ -52,6 +53,13 @@
2468 u8 cluster_cnt[NUM_APIC_CLUSTERS];
2469 int max_apic = 0;
2470
2471 + /* genapic selection can be forced because of certain quirks.
2472 + */
2473 + if (genapic_force) {
2474 + genapic = genapic_force;
2475 + goto print;
2476 + }
2477 +
2478 #if defined(CONFIG_ACPI)
2479 /*
2480 * Some x86_64 machines use physical APIC mode regardless of how many
2481 --- a/arch/x86/kernel/head64-xen.c
2482 +++ b/arch/x86/kernel/head64-xen.c
2483 @@ -101,7 +101,10 @@
2484 machine_to_phys_order++;
2485
2486 #if 0
2487 - for (i = 0; i < 256; i++)
2488 + /* clear bss before set_intr_gate with early_idt_handler */
2489 + clear_bss();
2490 +
2491 + for (i = 0; i < IDT_ENTRIES; i++)
2492 set_intr_gate(i, early_idt_handler);
2493 asm volatile("lidt %0" :: "m" (idt_descr));
2494 #endif
2495 --- a/arch/x86/kernel/head_32-xen.S
2496 +++ b/arch/x86/kernel/head_32-xen.S
2497 @@ -9,6 +9,7 @@
2498 #include <asm/cache.h>
2499 #include <asm/thread_info.h>
2500 #include <asm/asm-offsets.h>
2501 +#include <asm/boot.h>
2502 #include <asm/dwarf2.h>
2503 #include <xen/interface/xen.h>
2504 #include <xen/interface/elfnote.h>
2505 @@ -35,6 +36,8 @@
2506 /* Set up the stack pointer */
2507 movl $(init_thread_union+THREAD_SIZE),%esp
2508
2509 + call setup_pda
2510 +
2511 /* get vendor info */
2512 xorl %eax,%eax # call CPUID with 0 -> return vendor ID
2513 XEN_CPUID
2514 @@ -57,14 +60,58 @@
2515
2516 movb $1,X86_HARD_MATH
2517
2518 - xorl %eax,%eax # Clear FS/GS and LDT
2519 + xorl %eax,%eax # Clear FS
2520 movl %eax,%fs
2521 - movl %eax,%gs
2522 +
2523 + movl $(__KERNEL_PDA),%eax
2524 + mov %eax,%gs
2525 +
2526 cld # gcc2 wants the direction flag cleared at all times
2527
2528 pushl $0 # fake return address for unwinder
2529 jmp start_kernel
2530
2531 +/*
2532 + * Point the GDT at this CPU's PDA. This will be
2533 + * cpu_gdt_table and boot_pda.
2534 + */
2535 +setup_pda:
2536 + /* get the PDA pointer */
2537 + movl $boot_pda, %eax
2538 +
2539 + /* slot the PDA address into the GDT */
2540 + mov $cpu_gdt_table, %ecx
2541 + mov %ax, (__KERNEL_PDA+0+2)(%ecx) /* base & 0x0000ffff */
2542 + shr $16, %eax
2543 + mov %al, (__KERNEL_PDA+4+0)(%ecx) /* base & 0x00ff0000 */
2544 + mov %ah, (__KERNEL_PDA+4+3)(%ecx) /* base & 0xff000000 */
2545 +
2546 + # %esi still points to start_info, and no registers
2547 + # need to be preserved.
2548 +
2549 + movl XEN_START_mfn_list(%esi), %ebx
2550 + movl $(cpu_gdt_table - __PAGE_OFFSET), %eax
2551 + shrl $PAGE_SHIFT, %eax
2552 + movl (%ebx,%eax,4), %ecx
2553 + pushl %ecx # frame number for set_gdt below
2554 +
2555 + xorl %esi, %esi
2556 + xorl %edx, %edx
2557 + shldl $PAGE_SHIFT, %ecx, %edx
2558 + shll $PAGE_SHIFT, %ecx
2559 + orl $0x61, %ecx
2560 + movl $cpu_gdt_table, %ebx
2561 + movl $__HYPERVISOR_update_va_mapping, %eax
2562 + int $0x82
2563 +
2564 + movl $(PAGE_SIZE_asm / 8), %ecx
2565 + movl %esp, %ebx
2566 + movl $__HYPERVISOR_set_gdt, %eax
2567 + int $0x82
2568 +
2569 + popl %ecx
2570 + ret
2571 +
2572 #define HYPERCALL_PAGE_OFFSET 0x1000
2573 .org HYPERCALL_PAGE_OFFSET
2574 ENTRY(hypercall_page)
2575 @@ -93,7 +140,8 @@
2576 /*
2577 * The Global Descriptor Table contains 28 quadwords, per-CPU.
2578 */
2579 - .align L1_CACHE_BYTES
2580 + .section .data.page_aligned, "aw"
2581 + .align PAGE_SIZE_asm
2582 ENTRY(cpu_gdt_table)
2583 .quad 0x0000000000000000 /* NULL descriptor */
2584 .quad 0x0000000000000000 /* 0x0b reserved */
2585 @@ -135,12 +183,13 @@
2586 .quad 0x0000000000000000 /* 0xc0 APM CS 16 code (16 bit) */
2587 .quad 0x0000000000000000 /* 0xc8 APM DS data */
2588
2589 - .quad 0x0000000000000000 /* 0xd0 - ESPFIX 16-bit SS */
2590 - .quad 0x0000000000000000 /* 0xd8 - unused */
2591 + .quad 0x0000000000000000 /* 0xd0 - ESPFIX SS */
2592 + .quad 0x00cf92000000ffff /* 0xd8 - PDA */
2593 .quad 0x0000000000000000 /* 0xe0 - unused */
2594 .quad 0x0000000000000000 /* 0xe8 - unused */
2595 .quad 0x0000000000000000 /* 0xf0 - unused */
2596 .quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */
2597 + .align PAGE_SIZE_asm
2598
2599 #if CONFIG_XEN_COMPAT <= 0x030002
2600 /*
2601 @@ -165,9 +214,9 @@
2602 .ascii ",ELF_PADDR_OFFSET=0x"
2603 utoa __PAGE_OFFSET
2604 .ascii ",VIRT_ENTRY=0x"
2605 - utoa (__PAGE_OFFSET + __PHYSICAL_START + VIRT_ENTRY_OFFSET)
2606 + utoa (__PAGE_OFFSET + LOAD_PHYSICAL_ADDR + VIRT_ENTRY_OFFSET)
2607 .ascii ",HYPERCALL_PAGE=0x"
2608 - utoa ((__PHYSICAL_START+HYPERCALL_PAGE_OFFSET)>>PAGE_SHIFT)
2609 + utoa ((LOAD_PHYSICAL_ADDR+HYPERCALL_PAGE_OFFSET)>>PAGE_SHIFT)
2610 .ascii ",FEATURES=writable_page_tables"
2611 .ascii "|writable_descriptor_tables"
2612 .ascii "|auto_translated_physmap"
2613 --- a/arch/x86/kernel/io_apic_32-xen.c
2614 +++ b/arch/x86/kernel/io_apic_32-xen.c
2615 @@ -34,6 +34,7 @@
2616 #include <linux/pci.h>
2617 #include <linux/msi.h>
2618 #include <linux/htirq.h>
2619 +#include <linux/freezer.h>
2620
2621 #include <asm/io.h>
2622 #include <asm/smp.h>
2623 @@ -194,14 +195,20 @@
2624 * the interrupt, and we need to make sure the entry is fully populated
2625 * before that happens.
2626 */
2627 -static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
2628 +static void
2629 +__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
2630 {
2631 - unsigned long flags;
2632 union entry_union eu;
2633 eu.entry = e;
2634 - spin_lock_irqsave(&ioapic_lock, flags);
2635 io_apic_write(apic, 0x11 + 2*pin, eu.w2);
2636 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
2637 +}
2638 +
2639 +static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
2640 +{
2641 + unsigned long flags;
2642 + spin_lock_irqsave(&ioapic_lock, flags);
2643 + __ioapic_write_entry(apic, pin, e);
2644 spin_unlock_irqrestore(&ioapic_lock, flags);
2645 }
2646
2647 @@ -883,8 +890,7 @@
2648
2649 if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
2650 mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
2651 - mp_bus_id_to_type[lbus] == MP_BUS_MCA ||
2652 - mp_bus_id_to_type[lbus] == MP_BUS_NEC98
2653 + mp_bus_id_to_type[lbus] == MP_BUS_MCA
2654 ) &&
2655 (mp_irqs[i].mpc_irqtype == type) &&
2656 (mp_irqs[i].mpc_srcbusirq == irq))
2657 @@ -903,8 +909,7 @@
2658
2659 if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
2660 mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
2661 - mp_bus_id_to_type[lbus] == MP_BUS_MCA ||
2662 - mp_bus_id_to_type[lbus] == MP_BUS_NEC98
2663 + mp_bus_id_to_type[lbus] == MP_BUS_MCA
2664 ) &&
2665 (mp_irqs[i].mpc_irqtype == type) &&
2666 (mp_irqs[i].mpc_srcbusirq == irq))
2667 @@ -1036,12 +1041,6 @@
2668 #define default_MCA_trigger(idx) (1)
2669 #define default_MCA_polarity(idx) (0)
2670
2671 -/* NEC98 interrupts are always polarity zero edge triggered,
2672 - * when listed as conforming in the MP table. */
2673 -
2674 -#define default_NEC98_trigger(idx) (0)
2675 -#define default_NEC98_polarity(idx) (0)
2676 -
2677 static int __init MPBIOS_polarity(int idx)
2678 {
2679 int bus = mp_irqs[idx].mpc_srcbus;
2680 @@ -1076,11 +1075,6 @@
2681 polarity = default_MCA_polarity(idx);
2682 break;
2683 }
2684 - case MP_BUS_NEC98: /* NEC 98 pin */
2685 - {
2686 - polarity = default_NEC98_polarity(idx);
2687 - break;
2688 - }
2689 default:
2690 {
2691 printk(KERN_WARNING "broken BIOS!!\n");
2692 @@ -1150,11 +1144,6 @@
2693 trigger = default_MCA_trigger(idx);
2694 break;
2695 }
2696 - case MP_BUS_NEC98: /* NEC 98 pin */
2697 - {
2698 - trigger = default_NEC98_trigger(idx);
2699 - break;
2700 - }
2701 default:
2702 {
2703 printk(KERN_WARNING "broken BIOS!!\n");
2704 @@ -1216,7 +1205,6 @@
2705 case MP_BUS_ISA: /* ISA pin */
2706 case MP_BUS_EISA:
2707 case MP_BUS_MCA:
2708 - case MP_BUS_NEC98:
2709 {
2710 irq = mp_irqs[idx].mpc_srcbusirq;
2711 break;
2712 @@ -1284,7 +1272,7 @@
2713 }
2714
2715 /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
2716 -u8 irq_vector[NR_IRQ_VECTORS] __read_mostly; /* = { FIRST_DEVICE_VECTOR , 0 }; */
2717 +static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly; /* = { FIRST_DEVICE_VECTOR , 0 }; */
2718
2719 static int __assign_irq_vector(int irq)
2720 {
2721 @@ -1407,8 +1395,8 @@
2722 if (!apic && (irq < 16))
2723 disable_8259A_irq(irq);
2724 }
2725 - ioapic_write_entry(apic, pin, entry);
2726 spin_lock_irqsave(&ioapic_lock, flags);
2727 + __ioapic_write_entry(apic, pin, entry);
2728 set_native_irq_info(irq, TARGET_CPUS);
2729 spin_unlock_irqrestore(&ioapic_lock, flags);
2730 }
2731 @@ -1974,6 +1962,15 @@
2732 #endif
2733
2734 #ifndef CONFIG_XEN
2735 +static int no_timer_check __initdata;
2736 +
2737 +static int __init notimercheck(char *s)
2738 +{
2739 + no_timer_check = 1;
2740 + return 1;
2741 +}
2742 +__setup("no_timer_check", notimercheck);
2743 +
2744 /*
2745 * There is a nasty bug in some older SMP boards, their mptable lies
2746 * about the timer IRQ. We do the following to work around the situation:
2747 @@ -1982,10 +1979,13 @@
2748 * - if this function detects that timer IRQs are defunct, then we fall
2749 * back to ISA timer IRQs
2750 */
2751 -static int __init timer_irq_works(void)
2752 +int __init timer_irq_works(void)
2753 {
2754 unsigned long t1 = jiffies;
2755
2756 + if (no_timer_check)
2757 + return 1;
2758 +
2759 local_irq_enable();
2760 /* Let ten ticks pass... */
2761 mdelay((10 * 1000) / HZ);
2762 @@ -2212,9 +2212,15 @@
2763 unsigned char save_control, save_freq_select;
2764
2765 pin = find_isa_irq_pin(8, mp_INT);
2766 + if (pin == -1) {
2767 + WARN_ON_ONCE(1);
2768 + return;
2769 + }
2770 apic = find_isa_irq_apic(8, mp_INT);
2771 - if (pin == -1)
2772 + if (apic == -1) {
2773 + WARN_ON_ONCE(1);
2774 return;
2775 + }
2776
2777 entry0 = ioapic_read_entry(apic, pin);
2778 clear_IO_APIC_pin(apic, pin);
2779 @@ -2259,7 +2265,7 @@
2780 * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
2781 * fanatically on his truly buggy board.
2782 */
2783 -static inline void check_timer(void)
2784 +static inline void __init check_timer(void)
2785 {
2786 int apic1, pin1, apic2, pin2;
2787 int vector;
2788 @@ -2543,7 +2549,7 @@
2789 int create_irq(void)
2790 {
2791 /* Allocate an unused irq */
2792 - int irq, new, vector;
2793 + int irq, new, vector = 0;
2794 unsigned long flags;
2795
2796 irq = -ENOSPC;
2797 @@ -2923,8 +2929,8 @@
2798 if (!ioapic && (irq < 16))
2799 disable_8259A_irq(irq);
2800
2801 - ioapic_write_entry(ioapic, pin, entry);
2802 spin_lock_irqsave(&ioapic_lock, flags);
2803 + __ioapic_write_entry(ioapic, pin, entry);
2804 set_native_irq_info(irq, TARGET_CPUS);
2805 spin_unlock_irqrestore(&ioapic_lock, flags);
2806
2807 --- a/arch/x86/kernel/io_apic_64-xen.c
2808 +++ b/arch/x86/kernel/io_apic_64-xen.c
2809 @@ -199,14 +199,20 @@
2810 * the interrupt, and we need to make sure the entry is fully populated
2811 * before that happens.
2812 */
2813 -static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
2814 +static void
2815 +__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
2816 {
2817 - unsigned long flags;
2818 union entry_union eu;
2819 eu.entry = e;
2820 - spin_lock_irqsave(&ioapic_lock, flags);
2821 io_apic_write(apic, 0x11 + 2*pin, eu.w2);
2822 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
2823 +}
2824 +
2825 +static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
2826 +{
2827 + unsigned long flags;
2828 + spin_lock_irqsave(&ioapic_lock, flags);
2829 + __ioapic_write_entry(apic, pin, e);
2830 spin_unlock_irqrestore(&ioapic_lock, flags);
2831 }
2832
2833 @@ -714,6 +720,22 @@
2834 }
2835
2836 #ifndef CONFIG_XEN
2837 +static void __clear_irq_vector(int irq)
2838 +{
2839 + cpumask_t mask;
2840 + int cpu, vector;
2841 +
2842 + BUG_ON(!irq_vector[irq]);
2843 +
2844 + vector = irq_vector[irq];
2845 + cpus_and(mask, irq_domain[irq], cpu_online_map);
2846 + for_each_cpu_mask(cpu, mask)
2847 + per_cpu(vector_irq, cpu)[vector] = -1;
2848 +
2849 + irq_vector[irq] = 0;
2850 + irq_domain[irq] = CPU_MASK_NONE;
2851 +}
2852 +
2853 void __setup_vector_irq(int cpu)
2854 {
2855 /* Initialize vector_irq on a new cpu */
2856 @@ -761,26 +783,65 @@
2857 #define ioapic_register_intr(_irq,_vector,_trigger) ((void)0)
2858 #endif /* !CONFIG_XEN */
2859
2860 -static void __init setup_IO_APIC_irqs(void)
2861 +static void __init setup_IO_APIC_irq(int apic, int pin, int idx, int irq)
2862 {
2863 struct IO_APIC_route_entry entry;
2864 - int apic, pin, idx, irq, first_notcon = 1, vector;
2865 + int vector;
2866 unsigned long flags;
2867
2868 - apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
2869
2870 - for (apic = 0; apic < nr_ioapics; apic++) {
2871 - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
2872 + /*
2873 + * add it to the IO-APIC irq-routing table:
2874 + */
2875 + memset(&entry,0,sizeof(entry));
2876
2877 - /*
2878 - * add it to the IO-APIC irq-routing table:
2879 - */
2880 - memset(&entry,0,sizeof(entry));
2881 + entry.delivery_mode = INT_DELIVERY_MODE;
2882 + entry.dest_mode = INT_DEST_MODE;
2883 + entry.mask = 0; /* enable IRQ */
2884 + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
2885
2886 - entry.delivery_mode = INT_DELIVERY_MODE;
2887 - entry.dest_mode = INT_DEST_MODE;
2888 - entry.mask = 0; /* enable IRQ */
2889 + entry.trigger = irq_trigger(idx);
2890 + entry.polarity = irq_polarity(idx);
2891 +
2892 + if (irq_trigger(idx)) {
2893 + entry.trigger = 1;
2894 + entry.mask = 1;
2895 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
2896 + }
2897 +
2898 + if (/* !apic && */ !IO_APIC_IRQ(irq))
2899 + return;
2900 +
2901 + if (IO_APIC_IRQ(irq)) {
2902 + cpumask_t mask;
2903 + vector = assign_irq_vector(irq, TARGET_CPUS, &mask);
2904 + if (vector < 0)
2905 + return;
2906 +
2907 + entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask);
2908 + entry.vector = vector;
2909 +
2910 + ioapic_register_intr(irq, vector, IOAPIC_AUTO);
2911 + if (!apic && (irq < 16))
2912 + disable_8259A_irq(irq);
2913 + }
2914 +
2915 + ioapic_write_entry(apic, pin, entry);
2916 +
2917 + spin_lock_irqsave(&ioapic_lock, flags);
2918 + set_native_irq_info(irq, TARGET_CPUS);
2919 + spin_unlock_irqrestore(&ioapic_lock, flags);
2920 +
2921 +}
2922 +
2923 +static void __init setup_IO_APIC_irqs(void)
2924 +{
2925 + int apic, pin, idx, irq, first_notcon = 1;
2926 +
2927 + apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
2928 +
2929 + for (apic = 0; apic < nr_ioapics; apic++) {
2930 + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
2931
2932 idx = find_irq_entry(apic,pin,mp_INT);
2933 if (idx == -1) {
2934 @@ -792,39 +853,11 @@
2935 continue;
2936 }
2937
2938 - entry.trigger = irq_trigger(idx);
2939 - entry.polarity = irq_polarity(idx);
2940 -
2941 - if (irq_trigger(idx)) {
2942 - entry.trigger = 1;
2943 - entry.mask = 1;
2944 - entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
2945 - }
2946 -
2947 irq = pin_2_irq(idx, apic, pin);
2948 add_pin_to_irq(irq, apic, pin);
2949
2950 - if (/* !apic && */ !IO_APIC_IRQ(irq))
2951 - continue;
2952 -
2953 - if (IO_APIC_IRQ(irq)) {
2954 - cpumask_t mask;
2955 - vector = assign_irq_vector(irq, TARGET_CPUS, &mask);
2956 - if (vector < 0)
2957 - continue;
2958 -
2959 - entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask);
2960 - entry.vector = vector;
2961 + setup_IO_APIC_irq(apic, pin, idx, irq);
2962
2963 - ioapic_register_intr(irq, vector, IOAPIC_AUTO);
2964 - if (!apic && (irq < 16))
2965 - disable_8259A_irq(irq);
2966 - }
2967 - ioapic_write_entry(apic, pin, entry);
2968 -
2969 - spin_lock_irqsave(&ioapic_lock, flags);
2970 - set_native_irq_info(irq, TARGET_CPUS);
2971 - spin_unlock_irqrestore(&ioapic_lock, flags);
2972 }
2973 }
2974
2975 @@ -1819,7 +1852,7 @@
2976 dynamic_irq_cleanup(irq);
2977
2978 spin_lock_irqsave(&vector_lock, flags);
2979 - irq_vector[irq] = 0;
2980 + __clear_irq_vector(irq);
2981 spin_unlock_irqrestore(&vector_lock, flags);
2982 }
2983 #endif
2984 @@ -2123,7 +2156,15 @@
2985 if (irq_entry == -1)
2986 continue;
2987 irq = pin_2_irq(irq_entry, ioapic, pin);
2988 - set_ioapic_affinity_irq(irq, TARGET_CPUS);
2989 +
2990 + /* setup_IO_APIC_irqs could fail to get vector for some device
2991 + * when you have too many devices, because at that time only boot
2992 + * cpu is online.
2993 + */
2994 + if(!irq_vector[irq])
2995 + setup_IO_APIC_irq(ioapic, pin, irq_entry, irq);
2996 + else
2997 + set_ioapic_affinity_irq(irq, TARGET_CPUS);
2998 }
2999
3000 }
3001 --- a/arch/x86/kernel/irq_64-xen.c
3002 +++ b/arch/x86/kernel/irq_64-xen.c
3003 @@ -120,7 +120,7 @@
3004
3005 if (likely(irq < NR_IRQS))
3006 generic_handle_irq(irq);
3007 - else
3008 + else if (printk_ratelimit())
3009 printk(KERN_EMERG "%s: %d.%d No irq handler for irq\n",
3010 __func__, smp_processor_id(), irq);
3011
3012 --- a/arch/x86/kernel/ldt_32-xen.c
3013 +++ b/arch/x86/kernel/ldt_32-xen.c
3014 @@ -177,16 +177,14 @@
3015 {
3016 int err;
3017 unsigned long size;
3018 - void *address;
3019
3020 err = 0;
3021 - address = &default_ldt[0];
3022 size = 5*sizeof(struct desc_struct);
3023 if (size > bytecount)
3024 size = bytecount;
3025
3026 err = size;
3027 - if (copy_to_user(ptr, address, size))
3028 + if (clear_user(ptr, size))
3029 err = -EFAULT;
3030
3031 return err;
3032 --- a/arch/x86/kernel/microcode-xen.c
3033 +++ b/arch/x86/kernel/microcode-xen.c
3034 @@ -1,7 +1,7 @@
3035 /*
3036 * Intel CPU Microcode Update Driver for Linux
3037 *
3038 - * Copyright (C) 2000-2004 Tigran Aivazian
3039 + * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
3040 * 2006 Shaohua Li <shaohua.li@intel.com>
3041 *
3042 * This driver allows to upgrade microcode on Intel processors
3043 @@ -43,7 +43,7 @@
3044 #include <asm/processor.h>
3045
3046 MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver");
3047 -MODULE_AUTHOR("Tigran Aivazian <tigran@veritas.com>");
3048 +MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
3049 MODULE_LICENSE("GPL");
3050
3051 static int verbose;
3052 @@ -195,7 +195,7 @@
3053 request_microcode();
3054
3055 printk(KERN_INFO
3056 - "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@veritas.com>\n");
3057 + "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
3058 return 0;
3059 }
3060
3061 --- a/arch/x86/kernel/mpparse_32-xen.c
3062 +++ b/arch/x86/kernel/mpparse_32-xen.c
3063 @@ -36,7 +36,7 @@
3064
3065 /* Have we found an MP table */
3066 int smp_found_config;
3067 -unsigned int __initdata maxcpus = NR_CPUS;
3068 +unsigned int __cpuinitdata maxcpus = NR_CPUS;
3069
3070 /*
3071 * Various Linux-internal data structures created from the
3072 @@ -102,10 +102,10 @@
3073 */
3074
3075 static int mpc_record;
3076 -static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __initdata;
3077 +static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __cpuinitdata;
3078
3079 #ifndef CONFIG_XEN
3080 -static void __devinit MP_processor_info (struct mpc_config_processor *m)
3081 +static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
3082 {
3083 int ver, apicid;
3084 physid_mask_t phys_cpu;
3085 @@ -221,7 +221,7 @@
3086 bios_cpu_apicid[num_processors - 1] = m->mpc_apicid;
3087 }
3088 #else
3089 -void __init MP_processor_info (struct mpc_config_processor *m)
3090 +static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
3091 {
3092 num_processors++;
3093 }
3094 @@ -256,8 +256,6 @@
3095 mp_current_pci_id++;
3096 } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) {
3097 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
3098 - } else if (strncmp(str, BUSTYPE_NEC98, sizeof(BUSTYPE_NEC98)-1) == 0) {
3099 - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_NEC98;
3100 } else {
3101 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
3102 }
3103 @@ -842,7 +840,7 @@
3104 #endif
3105 }
3106
3107 -void __devinit mp_register_lapic (u8 id, u8 enabled)
3108 +void __cpuinit mp_register_lapic (u8 id, u8 enabled)
3109 {
3110 struct mpc_config_processor processor;
3111 int boot_cpu = 0;
3112 --- a/arch/x86/kernel/mpparse_64-xen.c
3113 +++ b/arch/x86/kernel/mpparse_64-xen.c
3114 @@ -35,8 +35,6 @@
3115 int smp_found_config;
3116 unsigned int __initdata maxcpus = NR_CPUS;
3117
3118 -int acpi_found_madt;
3119 -
3120 /*
3121 * Various Linux-internal data structures created from the
3122 * MP-table.
3123 --- a/arch/x86/kernel/pci-dma_32-xen.c
3124 +++ b/arch/x86/kernel/pci-dma_32-xen.c
3125 @@ -282,7 +282,7 @@
3126 int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
3127 dma_addr_t device_addr, size_t size, int flags)
3128 {
3129 - void __iomem *mem_base;
3130 + void __iomem *mem_base = NULL;
3131 int pages = size >> PAGE_SHIFT;
3132 int bitmap_size = (pages + 31)/32;
3133
3134 @@ -299,14 +299,12 @@
3135 if (!mem_base)
3136 goto out;
3137
3138 - dev->dma_mem = kmalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
3139 + dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
3140 if (!dev->dma_mem)
3141 goto out;
3142 - memset(dev->dma_mem, 0, sizeof(struct dma_coherent_mem));
3143 - dev->dma_mem->bitmap = kmalloc(bitmap_size, GFP_KERNEL);
3144 + dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
3145 if (!dev->dma_mem->bitmap)
3146 goto free1_out;
3147 - memset(dev->dma_mem->bitmap, 0, bitmap_size);
3148
3149 dev->dma_mem->virt_base = mem_base;
3150 dev->dma_mem->device_base = device_addr;
3151 @@ -321,6 +319,8 @@
3152 free1_out:
3153 kfree(dev->dma_mem->bitmap);
3154 out:
3155 + if (mem_base)
3156 + iounmap(mem_base);
3157 return 0;
3158 }
3159 EXPORT_SYMBOL(dma_declare_coherent_memory);
3160 --- a/arch/x86/kernel/process_32-xen.c
3161 +++ b/arch/x86/kernel/process_32-xen.c
3162 @@ -60,6 +60,7 @@
3163
3164 #include <asm/tlbflush.h>
3165 #include <asm/cpu.h>
3166 +#include <asm/pda.h>
3167
3168 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
3169
3170 @@ -104,28 +105,24 @@
3171 */
3172 static void poll_idle (void)
3173 {
3174 - local_irq_enable();
3175 -
3176 - asm volatile(
3177 - "2:"
3178 - "testl %0, %1;"
3179 - "rep; nop;"
3180 - "je 2b;"
3181 - : : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags));
3182 + cpu_relax();
3183 }
3184
3185 static void xen_idle(void)
3186 {
3187 - local_irq_disable();
3188 + current_thread_info()->status &= ~TS_POLLING;
3189 + /*
3190 + * TS_POLLING-cleared state must be visible before we
3191 + * test NEED_RESCHED:
3192 + */
3193 + smp_mb();
3194
3195 - if (need_resched())
3196 + local_irq_disable();
3197 + if (!need_resched())
3198 + safe_halt(); /* enables interrupts racelessly */
3199 + else
3200 local_irq_enable();
3201 - else {
3202 - current_thread_info()->status &= ~TS_POLLING;
3203 - smp_mb__after_clear_bit();
3204 - safe_halt();
3205 - current_thread_info()->status |= TS_POLLING;
3206 - }
3207 + current_thread_info()->status |= TS_POLLING;
3208 }
3209 #ifdef CONFIG_APM_MODULE
3210 EXPORT_SYMBOL(default_idle);
3211 @@ -250,8 +247,8 @@
3212 regs->eax,regs->ebx,regs->ecx,regs->edx);
3213 printk("ESI: %08lx EDI: %08lx EBP: %08lx",
3214 regs->esi, regs->edi, regs->ebp);
3215 - printk(" DS: %04x ES: %04x\n",
3216 - 0xffff & regs->xds,0xffff & regs->xes);
3217 + printk(" DS: %04x ES: %04x GS: %04x\n",
3218 + 0xffff & regs->xds,0xffff & regs->xes, 0xffff & regs->xgs);
3219
3220 cr0 = read_cr0();
3221 cr2 = read_cr2();
3222 @@ -282,6 +279,7 @@
3223
3224 regs.xds = __USER_DS;
3225 regs.xes = __USER_DS;
3226 + regs.xgs = __KERNEL_PDA;
3227 regs.orig_eax = -1;
3228 regs.eip = (unsigned long) kernel_thread_helper;
3229 regs.xcs = __KERNEL_CS | get_kernel_rpl();
3230 @@ -359,7 +357,6 @@
3231 p->thread.eip = (unsigned long) ret_from_fork;
3232
3233 savesegment(fs,p->thread.fs);
3234 - savesegment(gs,p->thread.gs);
3235
3236 tsk = current;
3237 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
3238 @@ -438,7 +435,7 @@
3239 dump->regs.ds = regs->xds;
3240 dump->regs.es = regs->xes;
3241 savesegment(fs,dump->regs.fs);
3242 - savesegment(gs,dump->regs.gs);
3243 + dump->regs.gs = regs->xgs;
3244 dump->regs.orig_eax = regs->orig_eax;
3245 dump->regs.eip = regs->eip;
3246 dump->regs.cs = regs->xcs;
3247 @@ -614,17 +611,19 @@
3248 if (unlikely(HYPERVISOR_multicall_check(_mcl, mcl - _mcl, NULL)))
3249 BUG();
3250
3251 + /* we're going to use this soon, after a few expensive things */
3252 + if (next_p->fpu_counter > 5)
3253 + prefetch(&next->i387.fxsave);
3254 +
3255 /*
3256 - * Restore %fs and %gs if needed.
3257 + * Restore %fs if needed.
3258 *
3259 - * Glibc normally makes %fs be zero, and %gs is one of
3260 - * the TLS segments.
3261 + * Glibc normally makes %fs be zero.
3262 */
3263 if (unlikely(next->fs))
3264 loadsegment(fs, next->fs);
3265
3266 - if (next->gs)
3267 - loadsegment(gs, next->gs);
3268 + write_pda(pcurrent, next_p);
3269
3270 /*
3271 * Now maybe handle debug registers
3272 @@ -634,6 +633,13 @@
3273
3274 disable_tsc(prev_p, next_p);
3275
3276 + /* If the task has used fpu the last 5 timeslices, just do a full
3277 + * restore of the math state immediately to avoid the trap; the
3278 + * chances of needing FPU soon are obviously high now
3279 + */
3280 + if (next_p->fpu_counter > 5)
3281 + math_state_restore();
3282 +
3283 return prev_p;
3284 }
3285
3286 --- a/arch/x86/kernel/process_64-xen.c
3287 +++ b/arch/x86/kernel/process_64-xen.c
3288 @@ -119,29 +119,23 @@
3289 static void poll_idle (void)
3290 {
3291 local_irq_enable();
3292 -
3293 - asm volatile(
3294 - "2:"
3295 - "testl %0,%1;"
3296 - "rep; nop;"
3297 - "je 2b;"
3298 - : :
3299 - "i" (_TIF_NEED_RESCHED),
3300 - "m" (current_thread_info()->flags));
3301 + cpu_relax();
3302 }
3303
3304 static void xen_idle(void)
3305 {
3306 + current_thread_info()->status &= ~TS_POLLING;
3307 + /*
3308 + * TS_POLLING-cleared state must be visible before we
3309 + * test NEED_RESCHED:
3310 + */
3311 + smp_mb();
3312 local_irq_disable();
3313 -
3314 - if (need_resched())
3315 - local_irq_enable();
3316 - else {
3317 - current_thread_info()->status &= ~TS_POLLING;
3318 - smp_mb__after_clear_bit();
3319 + if (!need_resched())
3320 safe_halt();
3321 - current_thread_info()->status |= TS_POLLING;
3322 - }
3323 + else
3324 + local_irq_enable();
3325 + current_thread_info()->status |= TS_POLLING;
3326 }
3327
3328 #ifdef CONFIG_HOTPLUG_CPU
3329 @@ -181,6 +175,12 @@
3330 idle = xen_idle; /* no alternatives */
3331 if (cpu_is_offline(smp_processor_id()))
3332 play_dead();
3333 + /*
3334 + * Idle routines should keep interrupts disabled
3335 + * from here on, until they go to idle.
3336 + * Otherwise, idle callbacks can misfire.
3337 + */
3338 + local_irq_disable();
3339 enter_idle();
3340 idle();
3341 /* In many cases the interrupt that ended idle
3342 --- a/arch/x86/kernel/quirks-xen.c
3343 +++ b/arch/x86/kernel/quirks-xen.c
3344 @@ -3,10 +3,12 @@
3345 */
3346 #include <linux/pci.h>
3347 #include <linux/irq.h>
3348 +#include <asm/pci-direct.h>
3349 +#include <asm/genapic.h>
3350 +#include <asm/cpu.h>
3351
3352 #if defined(CONFIG_X86_IO_APIC) && (defined(CONFIG_SMP) || defined(CONFIG_XEN)) && defined(CONFIG_PCI)
3353 -
3354 -static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
3355 +static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev)
3356 {
3357 u8 config, rev;
3358 u32 word;
3359 @@ -14,14 +16,12 @@
3360 /* BIOS may enable hardware IRQ balancing for
3361 * E7520/E7320/E7525(revision ID 0x9 and below)
3362 * based platforms.
3363 - * Disable SW irqbalance/affinity on those platforms.
3364 + * For those platforms, make sure that the genapic is set to 'flat'
3365 */
3366 pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
3367 if (rev > 0x9)
3368 return;
3369
3370 - printk(KERN_INFO "Intel E7520/7320/7525 detected.");
3371 -
3372 /* enable access to config space*/
3373 pci_read_config_byte(dev, 0xf4, &config);
3374 pci_write_config_byte(dev, 0xf4, config|0x2);
3375 @@ -30,6 +30,46 @@
3376 raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
3377
3378 if (!(word & (1 << 13))) {
3379 +#ifndef CONFIG_XEN
3380 +#ifdef CONFIG_X86_64
3381 + if (genapic != &apic_flat)
3382 + panic("APIC mode must be flat on this system\n");
3383 +#elif defined(CONFIG_X86_GENERICARCH)
3384 + if (genapic != &apic_default)
3385 + panic("APIC mode must be default(flat) on this system. Use apic=default\n");
3386 +#endif
3387 +#endif
3388 + }
3389 +
3390 + /* put back the original value for config space*/
3391 + if (!(config & 0x2))
3392 + pci_write_config_byte(dev, 0xf4, config);
3393 +}
3394 +
3395 +void __init quirk_intel_irqbalance(void)
3396 +{
3397 + u8 config, rev;
3398 + u32 word;
3399 +
3400 + /* BIOS may enable hardware IRQ balancing for
3401 + * E7520/E7320/E7525(revision ID 0x9 and below)
3402 + * based platforms.
3403 + * Disable SW irqbalance/affinity on those platforms.
3404 + */
3405 + rev = read_pci_config_byte(0, 0, 0, PCI_CLASS_REVISION);
3406 + if (rev > 0x9)
3407 + return;
3408 +
3409 + printk(KERN_INFO "Intel E7520/7320/7525 detected.");
3410 +
3411 + /* enable access to config space */
3412 + config = read_pci_config_byte(0, 0, 0, 0xf4);
3413 + write_pci_config_byte(0, 0, 0, 0xf4, config|0x2);
3414 +
3415 + /* read xTPR register */
3416 + word = read_pci_config_16(0, 0, 0x40, 0x4c);
3417 +
3418 + if (!(word & (1 << 13))) {
3419 struct xen_platform_op op;
3420 printk(KERN_INFO "Disabling irq balancing and affinity\n");
3421 op.cmd = XENPF_platform_quirk;
3422 @@ -37,11 +77,12 @@
3423 WARN_ON(HYPERVISOR_platform_op(&op));
3424 }
3425
3426 - /* put back the original value for config space*/
3427 + /* put back the original value for config space */
3428 if (!(config & 0x2))
3429 - pci_write_config_byte(dev, 0xf4, config);
3430 + write_pci_config_byte(0, 0, 0, 0xf4, config);
3431 }
3432 -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance);
3433 -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance);
3434 -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance);
3435 +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, verify_quirk_intel_irqbalance);
3436 +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, verify_quirk_intel_irqbalance);
3437 +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, verify_quirk_intel_irqbalance);
3438 +
3439 #endif
3440 --- a/arch/x86/kernel/setup_32-xen.c
3441 +++ b/arch/x86/kernel/setup_32-xen.c
3442 @@ -76,9 +76,6 @@
3443 #include <xen/interface/kexec.h>
3444 #endif
3445
3446 -/* Forward Declaration. */
3447 -void __init find_max_pfn(void);
3448 -
3449 static int xen_panic_event(struct notifier_block *, unsigned long, void *);
3450 static struct notifier_block xen_panic_block = {
3451 xen_panic_event, NULL, 0 /* try to go last */
3452 @@ -92,14 +89,11 @@
3453 /*
3454 * Machine setup..
3455 */
3456 -
3457 -#ifdef CONFIG_EFI
3458 -int efi_enabled = 0;
3459 -EXPORT_SYMBOL(efi_enabled);
3460 -#endif
3461 +extern struct resource code_resource;
3462 +extern struct resource data_resource;
3463
3464 /* cpu data as detected by the assembly code in head.S */
3465 -struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
3466 +struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
3467 /* common cpu data for all cpus */
3468 struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
3469 EXPORT_SYMBOL(boot_cpu_data);
3470 @@ -115,12 +109,6 @@
3471 unsigned int BIOS_revision;
3472 unsigned int mca_pentium_flag;
3473
3474 -/* For PCI or other memory-mapped resources */
3475 -unsigned long pci_mem_start = 0x10000000;
3476 -#ifdef CONFIG_PCI
3477 -EXPORT_SYMBOL(pci_mem_start);
3478 -#endif
3479 -
3480 /* Boot loader ID as an integer, for the benefit of proc_dointvec */
3481 int bootloader_type;
3482
3483 @@ -153,10 +141,6 @@
3484 defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
3485 EXPORT_SYMBOL(ist_info);
3486 #endif
3487 -struct e820map e820;
3488 -#ifdef CONFIG_XEN
3489 -struct e820map machine_e820;
3490 -#endif
3491
3492 extern void early_cpu_init(void);
3493 extern int root_mountflags;
3494 @@ -171,209 +155,6 @@
3495
3496 unsigned char __initdata boot_params[PARAM_SIZE];
3497
3498 -static struct resource data_resource = {
3499 - .name = "Kernel data",
3500 - .start = 0,
3501 - .end = 0,
3502 - .flags = IORESOURCE_BUSY | IORESOURCE_MEM
3503 -};
3504 -
3505 -static struct resource code_resource = {
3506 - .name = "Kernel code",
3507 - .start = 0,
3508 - .end = 0,
3509 - .flags = IORESOURCE_BUSY | IORESOURCE_MEM
3510 -};
3511 -
3512 -static struct resource system_rom_resource = {
3513 - .name = "System ROM",
3514 - .start = 0xf0000,
3515 - .end = 0xfffff,
3516 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
3517 -};
3518 -
3519 -static struct resource extension_rom_resource = {
3520 - .name = "Extension ROM",
3521 - .start = 0xe0000,
3522 - .end = 0xeffff,
3523 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
3524 -};
3525 -
3526 -static struct resource adapter_rom_resources[] = { {
3527 - .name = "Adapter ROM",
3528 - .start = 0xc8000,
3529 - .end = 0,
3530 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
3531 -}, {
3532 - .name = "Adapter ROM",
3533 - .start = 0,
3534 - .end = 0,
3535 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
3536 -}, {
3537 - .name = "Adapter ROM",
3538 - .start = 0,
3539 - .end = 0,
3540 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
3541 -}, {
3542 - .name = "Adapter ROM",
3543 - .start = 0,
3544 - .end = 0,
3545 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
3546 -}, {
3547 - .name = "Adapter ROM",
3548 - .start = 0,
3549 - .end = 0,
3550 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
3551 -}, {
3552 - .name = "Adapter ROM",
3553 - .start = 0,
3554 - .end = 0,
3555 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
3556 -} };
3557 -
3558 -static struct resource video_rom_resource = {
3559 - .name = "Video ROM",
3560 - .start = 0xc0000,
3561 - .end = 0xc7fff,
3562 - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
3563 -};
3564 -
3565 -static struct resource video_ram_resource = {
3566 - .name = "Video RAM area",
3567 - .start = 0xa0000,
3568 - .end = 0xbffff,
3569 - .flags = IORESOURCE_BUSY | IORESOURCE_MEM
3570 -};
3571 -
3572 -static struct resource standard_io_resources[] = { {
3573 - .name = "dma1",
3574 - .start = 0x0000,
3575 - .end = 0x001f,
3576 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
3577 -}, {
3578 - .name = "pic1",
3579 - .start = 0x0020,
3580 - .end = 0x0021,
3581 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
3582 -}, {
3583 - .name = "timer0",
3584 - .start = 0x0040,
3585 - .end = 0x0043,
3586 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
3587 -}, {
3588 - .name = "timer1",
3589 - .start = 0x0050,
3590 - .end = 0x0053,
3591 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
3592 -}, {
3593 - .name = "keyboard",
3594 - .start = 0x0060,
3595 - .end = 0x006f,
3596 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
3597 -}, {
3598 - .name = "dma page reg",
3599 - .start = 0x0080,
3600 - .end = 0x008f,
3601 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
3602 -}, {
3603 - .name = "pic2",
3604 - .start = 0x00a0,
3605 - .end = 0x00a1,
3606 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
3607 -}, {
3608 - .name = "dma2",
3609 - .start = 0x00c0,
3610 - .end = 0x00df,
3611 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
3612 -}, {
3613 - .name = "fpu",
3614 - .start = 0x00f0,
3615 - .end = 0x00ff,
3616 - .flags = IORESOURCE_BUSY | IORESOURCE_IO
3617 -} };
3618 -
3619 -#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
3620 -
3621 -static int __init romchecksum(unsigned char *rom, unsigned long length)
3622 -{
3623 - unsigned char *p, sum = 0;
3624 -
3625 - for (p = rom; p < rom + length; p++)
3626 - sum += *p;
3627 - return sum == 0;
3628 -}
3629 -
3630 -static void __init probe_roms(void)
3631 -{
3632 - unsigned long start, length, upper;
3633 - unsigned char *rom;
3634 - int i;
3635 -
3636 -#ifdef CONFIG_XEN
3637 - /* Nothing to do if not running in dom0. */
3638 - if (!is_initial_xendomain())
3639 - return;
3640 -#endif
3641 -
3642 - /* video rom */
3643 - upper = adapter_rom_resources[0].start;
3644 - for (start = video_rom_resource.start; start < upper; start += 2048) {
3645 - rom = isa_bus_to_virt(start);
3646 - if (!romsignature(rom))
3647 - continue;
3648 -
3649 - video_rom_resource.start = start;
3650 -
3651 - /* 0 < length <= 0x7f * 512, historically */
3652 - length = rom[2] * 512;
3653 -
3654 - /* if checksum okay, trust length byte */
3655 - if (length && romchecksum(rom, length))
3656 - video_rom_resource.end = start + length - 1;
3657 -
3658 - request_resource(&iomem_resource, &video_rom_resource);
3659 - break;
3660 - }
3661 -
3662 - start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
3663 - if (start < upper)
3664 - start = upper;
3665 -
3666 - /* system rom */
3667 - request_resource(&iomem_resource, &system_rom_resource);
3668 - upper = system_rom_resource.start;
3669 -
3670 - /* check for extension rom (ignore length byte!) */
3671 - rom = isa_bus_to_virt(extension_rom_resource.start);
3672 - if (romsignature(rom)) {
3673 - length = extension_rom_resource.end - extension_rom_resource.start + 1;
3674 - if (romchecksum(rom, length)) {
3675 - request_resource(&iomem_resource, &extension_rom_resource);
3676 - upper = extension_rom_resource.start;
3677 - }
3678 - }
3679 -
3680 - /* check for adapter roms on 2k boundaries */
3681 - for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
3682 - rom = isa_bus_to_virt(start);
3683 - if (!romsignature(rom))
3684 - continue;
3685 -
3686 - /* 0 < length <= 0x7f * 512, historically */
3687 - length = rom[2] * 512;
3688 -
3689 - /* but accept any length that fits if checksum okay */
3690 - if (!length || start + length > upper || !romchecksum(rom, length))
3691 - continue;
3692 -
3693 - adapter_rom_resources[i].start = start;
3694 - adapter_rom_resources[i].end = start + length - 1;
3695 - request_resource(&iomem_resource, &adapter_rom_resources[i]);
3696 -
3697 - start = adapter_rom_resources[i++].end & ~2047UL;
3698 - }
3699 -}
3700 -
3701 /*
3702 * Point at the empty zero page to start with. We map the real shared_info
3703 * page as soon as fixmap is up and running.
3704 @@ -389,338 +170,6 @@
3705 start_info_t *xen_start_info;
3706 EXPORT_SYMBOL(xen_start_info);
3707
3708 -void __init add_memory_region(unsigned long long start,
3709 - unsigned long long size, int type)
3710 -{
3711 - int x;
3712 -
3713 - if (!efi_enabled) {
3714 - x = e820.nr_map;
3715 -
3716 - if (x == E820MAX) {
3717 - printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
3718 - return;
3719 - }
3720 -
3721 - e820.map[x].addr = start;
3722 - e820.map[x].size = size;
3723 - e820.map[x].type = type;
3724 - e820.nr_map++;
3725 - }
3726 -} /* add_memory_region */
3727 -
3728 -static void __init limit_regions(unsigned long long size)
3729 -{
3730 - unsigned long long current_addr = 0;
3731 - int i;
3732 -
3733 - if (efi_enabled) {
3734 - efi_memory_desc_t *md;
3735 - void *p;
3736 -
3737 - for (p = memmap.map, i = 0; p < memmap.map_end;
3738 - p += memmap.desc_size, i++) {
3739 - md = p;
3740 - current_addr = md->phys_addr + (md->num_pages << 12);
3741 - if (md->type == EFI_CONVENTIONAL_MEMORY) {
3742 - if (current_addr >= size) {
3743 - md->num_pages -=
3744 - (((current_addr-size) + PAGE_SIZE-1) >> PAGE_SHIFT);
3745 - memmap.nr_map = i + 1;
3746 - return;
3747 - }
3748 - }
3749 - }
3750 - }
3751 - for (i = 0; i < e820.nr_map; i++) {
3752 - current_addr = e820.map[i].addr + e820.map[i].size;
3753 - if (current_addr < size)
3754 - continue;
3755 -
3756 - if (e820.map[i].type != E820_RAM)
3757 - continue;
3758 -
3759 - if (e820.map[i].addr >= size) {
3760 - /*
3761 - * This region starts past the end of the
3762 - * requested size, skip it completely.
3763 - */
3764 - e820.nr_map = i;
3765 - } else {
3766 - e820.nr_map = i + 1;
3767 - e820.map[i].size -= current_addr - size;
3768 - }
3769 - return;
3770 - }
3771 -#ifdef CONFIG_XEN
3772 - if (i==e820.nr_map && current_addr < size) {
3773 - /*
3774 - * The e820 map finished before our requested size so
3775 - * extend the final entry to the requested address.
3776 - */
3777 - --i;
3778 - if (e820.map[i].type == E820_RAM)
3779 - e820.map[i].size -= current_addr - size;
3780 - else
3781 - add_memory_region(current_addr, size - current_addr, E820_RAM);
3782 - }
3783 -#endif
3784 -}
3785 -
3786 -#define E820_DEBUG 1
3787 -
3788 -static void __init print_memory_map(char *who)
3789 -{
3790 - int i;
3791 -
3792 - for (i = 0; i < e820.nr_map; i++) {
3793 - printk(" %s: %016Lx - %016Lx ", who,
3794 - e820.map[i].addr,
3795 - e820.map[i].addr + e820.map[i].size);
3796 - switch (e820.map[i].type) {
3797 - case E820_RAM: printk("(usable)\n");
3798 - break;
3799 - case E820_RESERVED:
3800 - printk("(reserved)\n");
3801 - break;
3802 - case E820_ACPI:
3803 - printk("(ACPI data)\n");
3804 - break;
3805 - case E820_NVS:
3806 - printk("(ACPI NVS)\n");
3807 - break;
3808 - default: printk("type %lu\n", e820.map[i].type);
3809 - break;
3810 - }
3811 - }
3812 -}
3813 -
3814 -/*
3815 - * Sanitize the BIOS e820 map.
3816 - *
3817 - * Some e820 responses include overlapping entries. The following
3818 - * replaces the original e820 map with a new one, removing overlaps.
3819 - *
3820 - */
3821 -struct change_member {
3822 - struct e820entry *pbios; /* pointer to original bios entry */
3823 - unsigned long long addr; /* address for this change point */
3824 -};
3825 -static struct change_member change_point_list[2*E820MAX] __initdata;
3826 -static struct change_member *change_point[2*E820MAX] __initdata;
3827 -static struct e820entry *overlap_list[E820MAX] __initdata;
3828 -static struct e820entry new_bios[E820MAX] __initdata;
3829 -
3830 -int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
3831 -{
3832 - struct change_member *change_tmp;
3833 - unsigned long current_type, last_type;
3834 - unsigned long long last_addr;
3835 - int chgidx, still_changing;
3836 - int overlap_entries;
3837 - int new_bios_entry;
3838 - int old_nr, new_nr, chg_nr;
3839 - int i;
3840 -
3841 - /*
3842 - Visually we're performing the following (1,2,3,4 = memory types)...
3843 -
3844 - Sample memory map (w/overlaps):
3845 - ____22__________________
3846 - ______________________4_
3847 - ____1111________________
3848 - _44_____________________
3849 - 11111111________________
3850 - ____________________33__
3851 - ___________44___________
3852 - __________33333_________
3853 - ______________22________
3854 - ___________________2222_
3855 - _________111111111______
3856 - _____________________11_
3857 - _________________4______
3858 -
3859 - Sanitized equivalent (no overlap):
3860 - 1_______________________
3861 - _44_____________________
3862 - ___1____________________
3863 - ____22__________________
3864 - ______11________________
3865 - _________1______________
3866 - __________3_____________
3867 - ___________44___________
3868 - _____________33_________
3869 - _______________2________
3870 - ________________1_______
3871 - _________________4______
3872 - ___________________2____
3873 - ____________________33__
3874 - ______________________4_
3875 - */
3876 -
3877 - /* if there's only one memory region, don't bother */
3878 - if (*pnr_map < 2)
3879 - return -1;
3880 -
3881 - old_nr = *pnr_map;
3882 -
3883 - /* bail out if we find any unreasonable addresses in bios map */
3884 - for (i=0; i<old_nr; i++)
3885 - if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
3886 - return -1;
3887 -
3888 - /* create pointers for initial change-point information (for sorting) */
3889 - for (i=0; i < 2*old_nr; i++)
3890 - change_point[i] = &change_point_list[i];
3891 -
3892 - /* record all known change-points (starting and ending addresses),
3893 - omitting those that are for empty memory regions */
3894 - chgidx = 0;
3895 - for (i=0; i < old_nr; i++) {
3896 - if (biosmap[i].size != 0) {
3897 - change_point[chgidx]->addr = biosmap[i].addr;
3898 - change_point[chgidx++]->pbios = &biosmap[i];
3899 - change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
3900 - change_point[chgidx++]->pbios = &biosmap[i];
3901 - }
3902 - }
3903 - chg_nr = chgidx; /* true number of change-points */
3904 -
3905 - /* sort change-point list by memory addresses (low -> high) */
3906 - still_changing = 1;
3907 - while (still_changing) {
3908 - still_changing = 0;
3909 - for (i=1; i < chg_nr; i++) {
3910 - /* if <current_addr> > <last_addr>, swap */
3911 - /* or, if current=<start_addr> & last=<end_addr>, swap */
3912 - if ((change_point[i]->addr < change_point[i-1]->addr) ||
3913 - ((change_point[i]->addr == change_point[i-1]->addr) &&
3914 - (change_point[i]->addr == change_point[i]->pbios->addr) &&
3915 - (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
3916 - )
3917 - {
3918 - change_tmp = change_point[i];
3919 - change_point[i] = change_point[i-1];
3920 - change_point[i-1] = change_tmp;
3921 - still_changing=1;
3922 - }
3923 - }
3924 - }
3925 -
3926 - /* create a new bios memory map, removing overlaps */
3927 - overlap_entries=0; /* number of entries in the overlap table */
3928 - new_bios_entry=0; /* index for creating new bios map entries */
3929 - last_type = 0; /* start with undefined memory type */
3930 - last_addr = 0; /* start with 0 as last starting address */
3931 - /* loop through change-points, determining affect on the new bios map */
3932 - for (chgidx=0; chgidx < chg_nr; chgidx++)
3933 - {
3934 - /* keep track of all overlapping bios entries */
3935 - if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
3936 - {
3937 - /* add map entry to overlap list (> 1 entry implies an overlap) */
3938 - overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
3939 - }
3940 - else
3941 - {
3942 - /* remove entry from list (order independent, so swap with last) */
3943 - for (i=0; i<overlap_entries; i++)
3944 - {
3945 - if (overlap_list[i] == change_point[chgidx]->pbios)
3946 - overlap_list[i] = overlap_list[overlap_entries-1];
3947 - }
3948 - overlap_entries--;
3949 - }
3950 - /* if there are overlapping entries, decide which "type" to use */
3951 - /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
3952 - current_type = 0;
3953 - for (i=0; i<overlap_entries; i++)
3954 - if (overlap_list[i]->type > current_type)
3955 - current_type = overlap_list[i]->type;
3956 - /* continue building up new bios map based on this information */
3957 - if (current_type != last_type) {
3958 - if (last_type != 0) {
3959 - new_bios[new_bios_entry].size =
3960 - change_point[chgidx]->addr - last_addr;
3961 - /* move forward only if the new size was non-zero */
3962 - if (new_bios[new_bios_entry].size != 0)
3963 - if (++new_bios_entry >= E820MAX)
3964 - break; /* no more space left for new bios entries */
3965 - }
3966 - if (current_type != 0) {
3967 - new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
3968 - new_bios[new_bios_entry].type = current_type;
3969 - last_addr=change_point[chgidx]->addr;
3970 - }
3971 - last_type = current_type;
3972 - }
3973 - }
3974 - new_nr = new_bios_entry; /* retain count for new bios entries */
3975 -
3976 - /* copy new bios mapping into original location */
3977 - memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
3978 - *pnr_map = new_nr;
3979 -
3980 - return 0;
3981 -}
3982 -
3983 -/*
3984 - * Copy the BIOS e820 map into a safe place.
3985 - *
3986 - * Sanity-check it while we're at it..
3987 - *
3988 - * If we're lucky and live on a modern system, the setup code
3989 - * will have given us a memory map that we can use to properly
3990 - * set up memory. If we aren't, we'll fake a memory map.
3991 - *
3992 - * We check to see that the memory map contains at least 2 elements
3993 - * before we'll use it, because the detection code in setup.S may
3994 - * not be perfect and most every PC known to man has two memory
3995 - * regions: one from 0 to 640k, and one from 1mb up. (The IBM
3996 - * thinkpad 560x, for example, does not cooperate with the memory
3997 - * detection code.)
3998 - */
3999 -int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
4000 -{
4001 -#ifndef CONFIG_XEN
4002 - /* Only one memory region (or negative)? Ignore it */
4003 - if (nr_map < 2)
4004 - return -1;
4005 -#else
4006 - BUG_ON(nr_map < 1);
4007 -#endif
4008 -
4009 - do {
4010 - unsigned long long start = biosmap->addr;
4011 - unsigned long long size = biosmap->size;
4012 - unsigned long long end = start + size;
4013 - unsigned long type = biosmap->type;
4014 -
4015 - /* Overflow in 64 bits? Ignore the memory map. */
4016 - if (start > end)
4017 - return -1;
4018 -
4019 -#ifndef CONFIG_XEN
4020 - /*
4021 - * Some BIOSes claim RAM in the 640k - 1M region.
4022 - * Not right. Fix it up.
4023 - */
4024 - if (type == E820_RAM) {
4025 - if (start < 0x100000ULL && end > 0xA0000ULL) {
4026 - if (start < 0xA0000ULL)
4027 - add_memory_region(start, 0xA0000ULL-start, type);
4028 - if (end <= 0x100000ULL)
4029 - continue;
4030 - start = 0x100000ULL;
4031 - size = end - start;
4032 - }
4033 - }
4034 -#endif
4035 - add_memory_region(start, size, type);
4036 - } while (biosmap++,--nr_map);
4037 - return 0;
4038 -}
4039 -
4040 #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
4041 struct edd edd;
4042 #ifdef CONFIG_EDD_MODULE
4043 @@ -746,7 +195,7 @@
4044 }
4045 #endif
4046
4047 -static int __initdata user_defined_memmap = 0;
4048 +int __initdata user_defined_memmap = 0;
4049
4050 /*
4051 * "mem=nopentium" disables the 4MB page tables.
4052 @@ -783,51 +232,6 @@
4053 }
4054 early_param("mem", parse_mem);
4055
4056 -static int __init parse_memmap(char *arg)
4057 -{
4058 - if (!arg)
4059 - return -EINVAL;
4060 -
4061 - if (strcmp(arg, "exactmap") == 0) {
4062 -#ifdef CONFIG_CRASH_DUMP
4063 - /* If we are doing a crash dump, we
4064 - * still need to know the real mem
4065 - * size before original memory map is
4066 - * reset.
4067 - */
4068 - find_max_pfn();
4069 - saved_max_pfn = max_pfn;
4070 -#endif
4071 - e820.nr_map = 0;
4072 - user_defined_memmap = 1;
4073 - } else {
4074 - /* If the user specifies memory size, we
4075 - * limit the BIOS-provided memory map to
4076 - * that size. exactmap can be used to specify
4077 - * the exact map. mem=number can be used to
4078 - * trim the existing memory map.
4079 - */
4080 - unsigned long long start_at, mem_size;
4081 -
4082 - mem_size = memparse(arg, &arg);
4083 - if (*arg == '@') {
4084 - start_at = memparse(arg+1, &arg);
4085 - add_memory_region(start_at, mem_size, E820_RAM);
4086 - } else if (*arg == '#') {
4087 - start_at = memparse(arg+1, &arg);
4088 - add_memory_region(start_at, mem_size, E820_ACPI);
4089 - } else if (*arg == '$') {
4090 - start_at = memparse(arg+1, &arg);
4091 - add_memory_region(start_at, mem_size, E820_RESERVED);
4092 - } else {
4093 - limit_regions(mem_size);
4094 - user_defined_memmap = 1;
4095 - }
4096 - }
4097 - return 0;
4098 -}
4099 -early_param("memmap", parse_memmap);
4100 -
4101 #ifdef CONFIG_PROC_VMCORE
4102 /* elfcorehdr= specifies the location of elf core header
4103 * stored by the crashed kernel.
4104 @@ -894,127 +298,6 @@
4105 #endif
4106
4107 /*
4108 - * Callback for efi_memory_walk.
4109 - */
4110 -static int __init
4111 -efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
4112 -{
4113 - unsigned long *max_pfn = arg, pfn;
4114 -
4115 - if (start < end) {
4116 - pfn = PFN_UP(end -1);
4117 - if (pfn > *max_pfn)
4118 - *max_pfn = pfn;
4119 - }
4120 - return 0;
4121 -}
4122 -
4123 -static int __init
4124 -efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
4125 -{
4126 - memory_present(0, PFN_UP(start), PFN_DOWN(end));
4127 - return 0;
4128 -}
4129 -
4130 -/*
4131 - * This function checks if any part of the range <start,end> is mapped
4132 - * with type.
4133 - */
4134 -int
4135 -e820_any_mapped(u64 start, u64 end, unsigned type)
4136 -{
4137 - int i;
4138 -
4139 -#ifndef CONFIG_XEN
4140 - for (i = 0; i < e820.nr_map; i++) {
4141 - const struct e820entry *ei = &e820.map[i];
4142 -#else
4143 - if (!is_initial_xendomain())
4144 - return 0;
4145 - for (i = 0; i < machine_e820.nr_map; ++i) {
4146 - const struct e820entry *ei = &machine_e820.map[i];
4147 -#endif
4148 -
4149 - if (type && ei->type != type)
4150 - continue;
4151 - if (ei->addr >= end || ei->addr + ei->size <= start)
4152 - continue;
4153 - return 1;
4154 - }
4155 - return 0;
4156 -}
4157 -EXPORT_SYMBOL_GPL(e820_any_mapped);
4158 -
4159 - /*
4160 - * This function checks if the entire range <start,end> is mapped with type.
4161 - *
4162 - * Note: this function only works correct if the e820 table is sorted and
4163 - * not-overlapping, which is the case
4164 - */
4165 -int __init
4166 -e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
4167 -{
4168 - u64 start = s;
4169 - u64 end = e;
4170 - int i;
4171 -
4172 -#ifndef CONFIG_XEN
4173 - for (i = 0; i < e820.nr_map; i++) {
4174 - struct e820entry *ei = &e820.map[i];
4175 -#else
4176 - if (!is_initial_xendomain())
4177 - return 0;
4178 - for (i = 0; i < machine_e820.nr_map; ++i) {
4179 - const struct e820entry *ei = &machine_e820.map[i];
4180 -#endif
4181 - if (type && ei->type != type)
4182 - continue;
4183 - /* is the region (part) in overlap with the current region ?*/
4184 - if (ei->addr >= end || ei->addr + ei->size <= start)
4185 - continue;
4186 - /* if the region is at the beginning of <start,end> we move
4187 - * start to the end of the region since it's ok until there
4188 - */
4189 - if (ei->addr <= start)
4190 - start = ei->addr + ei->size;
4191 - /* if start is now at or beyond end, we're done, full
4192 - * coverage */
4193 - if (start >= end)
4194 - return 1; /* we're done */
4195 - }
4196 - return 0;
4197 -}
4198 -
4199 -/*
4200 - * Find the highest page frame number we have available
4201 - */
4202 -void __init find_max_pfn(void)
4203 -{
4204 - int i;
4205 -
4206 - max_pfn = 0;
4207 - if (efi_enabled) {
4208 - efi_memmap_walk(efi_find_max_pfn, &max_pfn);
4209 - efi_memmap_walk(efi_memory_present_wrapper, NULL);
4210 - return;
4211 - }
4212 -
4213 - for (i = 0; i < e820.nr_map; i++) {
4214 - unsigned long start, end;
4215 - /* RAM? */
4216 - if (e820.map[i].type != E820_RAM)
4217 - continue;
4218 - start = PFN_UP(e820.map[i].addr);
4219 - end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
4220 - if (start >= end)
4221 - continue;
4222 - if (end > max_pfn)
4223 - max_pfn = end;
4224 - memory_present(0, start, end);
4225 - }
4226 -}
4227 -
4228 -/*
4229 * Determine low and high memory ranges:
4230 */
4231 unsigned long __init find_max_low_pfn(void)
4232 @@ -1073,77 +356,6 @@
4233 return max_low_pfn;
4234 }
4235
4236 -/*
4237 - * Free all available memory for boot time allocation. Used
4238 - * as a callback function by efi_memory_walk()
4239 - */
4240 -
4241 -static int __init
4242 -free_available_memory(unsigned long start, unsigned long end, void *arg)
4243 -{
4244 - /* check max_low_pfn */
4245 - if (start >= (max_low_pfn << PAGE_SHIFT))
4246 - return 0;
4247 - if (end >= (max_low_pfn << PAGE_SHIFT))
4248 - end = max_low_pfn << PAGE_SHIFT;
4249 - if (start < end)
4250 - free_bootmem(start, end - start);
4251 -
4252 - return 0;
4253 -}
4254 -/*
4255 - * Register fully available low RAM pages with the bootmem allocator.
4256 - */
4257 -static void __init register_bootmem_low_pages(unsigned long max_low_pfn)
4258 -{
4259 - int i;
4260 -
4261 - if (efi_enabled) {
4262 - efi_memmap_walk(free_available_memory, NULL);
4263 - return;
4264 - }
4265 - for (i = 0; i < e820.nr_map; i++) {
4266 - unsigned long curr_pfn, last_pfn, size;
4267 - /*
4268 - * Reserve usable low memory
4269 - */
4270 - if (e820.map[i].type != E820_RAM)
4271 - continue;
4272 - /*
4273 - * We are rounding up the start address of usable memory:
4274 - */
4275 - curr_pfn = PFN_UP(e820.map[i].addr);
4276 - if (curr_pfn >= max_low_pfn)
4277 - continue;
4278 - /*
4279 - * ... and at the end of the usable range downwards:
4280 - */
4281 - last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
4282 -
4283 -#ifdef CONFIG_XEN
4284 - /*
4285 - * Truncate to the number of actual pages currently
4286 - * present.
4287 - */
4288 - if (last_pfn > xen_start_info->nr_pages)
4289 - last_pfn = xen_start_info->nr_pages;
4290 -#endif
4291 -
4292 - if (last_pfn > max_low_pfn)
4293 - last_pfn = max_low_pfn;
4294 -
4295 - /*
4296 - * .. finally, did all the rounding and playing
4297 - * around just make the area go away?
4298 - */
4299 - if (last_pfn <= curr_pfn)
4300 - continue;
4301 -
4302 - size = last_pfn - curr_pfn;
4303 - free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
4304 - }
4305 -}
4306 -
4307 #ifndef CONFIG_XEN
4308 /*
4309 * workaround for Dell systems that neglect to reserve EBDA
4310 @@ -1233,8 +445,8 @@
4311 * the (very unlikely) case of us accidentally initializing the
4312 * bootmem allocator with an invalid RAM area.
4313 */
4314 - reserve_bootmem(__PHYSICAL_START, (PFN_PHYS(min_low_pfn) +
4315 - bootmap_size + PAGE_SIZE-1) - (__PHYSICAL_START));
4316 + reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
4317 + bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text));
4318
4319 #ifndef CONFIG_XEN
4320 /*
4321 @@ -1316,170 +528,6 @@
4322 }
4323 }
4324
4325 -/*
4326 - * Request address space for all standard RAM and ROM resources
4327 - * and also for regions reported as reserved by the e820.
4328 - */
4329 -static void __init
4330 -legacy_init_iomem_resources(struct e820entry *e820, int nr_map,
4331 - struct resource *code_resource,
4332 - struct resource *data_resource)
4333 -{
4334 - int i;
4335 -
4336 - probe_roms();
4337 -
4338 - for (i = 0; i < nr_map; i++) {
4339 - struct resource *res;
4340 -#ifndef CONFIG_RESOURCES_64BIT
4341 - if (e820[i].addr + e820[i].size > 0x100000000ULL)
4342 - continue;
4343 -#endif
4344 - res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
4345 - switch (e820[i].type) {
4346 - case E820_RAM: res->name = "System RAM"; break;
4347 - case E820_ACPI: res->name = "ACPI Tables"; break;
4348 - case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
4349 - default: res->name = "reserved";
4350 - }
4351 - res->start = e820[i].addr;
4352 - res->end = res->start + e820[i].size - 1;
4353 - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
4354 - if (request_resource(&iomem_resource, res)) {
4355 - kfree(res);
4356 - continue;
4357 - }
4358 - if (e820[i].type == E820_RAM) {
4359 - /*
4360 - * We don't know which RAM region contains kernel data,
4361 - * so we try it repeatedly and let the resource manager
4362 - * test it.
4363 - */
4364 -#ifndef CONFIG_XEN
4365 - request_resource(res, code_resource);
4366 - request_resource(res, data_resource);
4367 -#endif
4368 -#ifdef CONFIG_KEXEC
4369 - if (crashk_res.start != crashk_res.end)
4370 - request_resource(res, &crashk_res);
4371 -#ifdef CONFIG_XEN
4372 - xen_machine_kexec_register_resources(res);
4373 -#endif
4374 -#endif
4375 - }
4376 - }
4377 -}
4378 -
4379 -/*
4380 - * Locate a unused range of the physical address space below 4G which
4381 - * can be used for PCI mappings.
4382 - */
4383 -static void __init
4384 -e820_setup_gap(struct e820entry *e820, int nr_map)
4385 -{
4386 - unsigned long gapstart, gapsize, round;
4387 - unsigned long long last;
4388 - int i;
4389 -
4390 - /*
4391 - * Search for the bigest gap in the low 32 bits of the e820
4392 - * memory space.
4393 - */
4394 - last = 0x100000000ull;
4395 - gapstart = 0x10000000;
4396 - gapsize = 0x400000;
4397 - i = nr_map;
4398 - while (--i >= 0) {
4399 - unsigned long long start = e820[i].addr;
4400 - unsigned long long end = start + e820[i].size;
4401 -
4402 - /*
4403 - * Since "last" is at most 4GB, we know we'll
4404 - * fit in 32 bits if this condition is true
4405 - */
4406 - if (last > end) {
4407 - unsigned long gap = last - end;
4408 -
4409 - if (gap > gapsize) {
4410 - gapsize = gap;
4411 - gapstart = end;
4412 - }
4413 - }
4414 - if (start < last)
4415 - last = start;
4416 - }
4417 -
4418 - /*
4419 - * See how much we want to round up: start off with
4420 - * rounding to the next 1MB area.
4421 - */
4422 - round = 0x100000;
4423 - while ((gapsize >> 4) > round)
4424 - round += round;
4425 - /* Fun with two's complement */
4426 - pci_mem_start = (gapstart + round) & -round;
4427 -
4428 - printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
4429 - pci_mem_start, gapstart, gapsize);
4430 -}
4431 -
4432 -/*
4433 - * Request address space for all standard resources
4434 - *
4435 - * This is called just before pcibios_init(), which is also a
4436 - * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
4437 - */
4438 -static int __init request_standard_resources(void)
4439 -{
4440 - int i;
4441 -
4442 - /* Nothing to do if not running in dom0. */
4443 - if (!is_initial_xendomain())
4444 - return 0;
4445 -
4446 - printk("Setting up standard PCI resources\n");
4447 -#ifdef CONFIG_XEN
4448 - legacy_init_iomem_resources(machine_e820.map, machine_e820.nr_map,
4449 - &code_resource, &data_resource);
4450 -#else
4451 - if (efi_enabled)
4452 - efi_initialize_iomem_resources(&code_resource, &data_resource);
4453 - else
4454 - legacy_init_iomem_resources(e820.map, e820.nr_map,
4455 - &code_resource, &data_resource);
4456 -#endif
4457 -
4458 - /* EFI systems may still have VGA */
4459 - request_resource(&iomem_resource, &video_ram_resource);
4460 -
4461 - /* request I/O space for devices used on all i[345]86 PCs */
4462 - for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
4463 - request_resource(&ioport_resource, &standard_io_resources[i]);
4464 - return 0;
4465 -}
4466 -
4467 -subsys_initcall(request_standard_resources);
4468 -
4469 -static void __init register_memory(void)
4470 -{
4471 -#ifdef CONFIG_XEN
4472 - if (is_initial_xendomain()) {
4473 - struct xen_memory_map memmap;
4474 -
4475 - memmap.nr_entries = E820MAX;
4476 - set_xen_guest_handle(memmap.buffer, machine_e820.map);
4477 -
4478 - if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap))
4479 - BUG();
4480 -
4481 - machine_e820.nr_map = memmap.nr_entries;
4482 - e820_setup_gap(machine_e820.map, machine_e820.nr_map);
4483 - }
4484 - else
4485 -#endif
4486 - e820_setup_gap(e820.map, e820.nr_map);
4487 -}
4488 -
4489 #ifdef CONFIG_MCA
4490 static void set_mca_bus(int x)
4491 {
4492 @@ -1489,6 +537,12 @@
4493 static void set_mca_bus(int x) { }
4494 #endif
4495
4496 +/* Overridden in paravirt.c if CONFIG_PARAVIRT */
4497 +char * __init __attribute__((weak)) memory_setup(void)
4498 +{
4499 + return machine_specific_memory_setup();
4500 +}
4501 +
4502 /*
4503 * Determine if we were loaded by an EFI loader. If so, then we have also been
4504 * passed the efi memmap, systab, etc., so we should use these data structures
4505 @@ -1576,7 +630,7 @@
4506 efi_init();
4507 else {
4508 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
4509 - print_memory_map(machine_specific_memory_setup());
4510 + print_memory_map(memory_setup());
4511 }
4512
4513 copy_edd();
4514 @@ -1755,7 +809,7 @@
4515 get_smp_config();
4516 #endif
4517
4518 - register_memory();
4519 + e820_register_memory();
4520
4521 if (is_initial_xendomain()) {
4522 #ifdef CONFIG_VT
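
Taken together, the hunks above strip the legacy memory-map plumbing out of the 32-bit Xen setup code: the standard I/O port table and ROM probing, add_memory_region()/limit_regions()/print_memory_map(), sanitize_e820_map() and copy_e820_map(), the e820_any_mapped()/e820_all_mapped() helpers, find_max_pfn(), register_bootmem_low_pages(), the legacy iomem/PCI-gap resource setup and register_memory() all disappear, user_defined_memmap loses its static, and the surviving callers switch to the weak memory_setup() hook, print_memory_map(memory_setup()) and e820_register_memory(). In the 2.6.20 layout this logic presumably moves into a dedicated e820 source file elsewhere in the patch. A minimal sketch, not part of the patch, of how the weak hook is meant to be used (the override shown is hypothetical):

	/*
	 * With __attribute__((weak)) the definition added above is only a
	 * fallback; a paravirtualised build can override it simply by
	 * providing a strong definition of the same symbol:
	 *
	 *	char * __init memory_setup(void)
	 *	{
	 *		return some_other_memory_setup();   // hypothetical override
	 *	}
	 *
	 * while every other configuration falls through to
	 * machine_specific_memory_setup(), exactly as in the hunk above.
	 */
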
4523 --- a/arch/x86/kernel/setup_64-xen.c
4524 +++ b/arch/x86/kernel/setup_64-xen.c
4525 @@ -576,8 +576,7 @@
4526 if (LOADER_TYPE && INITRD_START) {
4527 if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) {
4528 reserve_bootmem_generic(INITRD_START, INITRD_SIZE);
4529 - initrd_start =
4530 - INITRD_START ? INITRD_START + PAGE_OFFSET : 0;
4531 + initrd_start = INITRD_START + PAGE_OFFSET;
4532 initrd_end = initrd_start+INITRD_SIZE;
4533 }
4534 else {
4535 @@ -1003,11 +1002,8 @@
4536 /* Fix cpuid4 emulation for more */
4537 num_cache_leaves = 3;
4538
4539 - /* When there is only one core no need to synchronize RDTSC */
4540 - if (num_possible_cpus() == 1)
4541 - set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
4542 - else
4543 - clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
4544 + /* RDTSC can be speculated around */
4545 + clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
4546 }
4547
4548 static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
4549 @@ -1106,6 +1102,15 @@
4550 set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability);
4551 }
4552
4553 + if (cpu_has_ds) {
4554 + unsigned int l1, l2;
4555 + rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
4556 + if (!(l1 & (1<<11)))
4557 + set_bit(X86_FEATURE_BTS, c->x86_capability);
4558 + if (!(l1 & (1<<12)))
4559 + set_bit(X86_FEATURE_PEBS, c->x86_capability);
4560 + }
4561 +
4562 n = c->extended_cpuid_level;
4563 if (n >= 0x80000008) {
4564 unsigned eax = cpuid_eax(0x80000008);
4565 @@ -1125,7 +1130,10 @@
4566 set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
4567 if (c->x86 == 6)
4568 set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
4569 - set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
4570 + if (c->x86 == 15)
4571 + set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
4572 + else
4573 + clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
4574 c->x86_max_cores = intel_num_cpu_cores(c);
4575
4576 srat_detect_node();
4577 --- a/arch/x86/kernel/smp_32-xen.c
4578 +++ b/arch/x86/kernel/smp_32-xen.c
4579 @@ -659,6 +659,10 @@
4580 put_cpu();
4581 return -EBUSY;
4582 }
4583 +
4584 + /* Can deadlock when called with interrupts disabled */
4585 + WARN_ON(irqs_disabled());
4586 +
4587 spin_lock_bh(&call_lock);
4588 __smp_call_function_single(cpu, func, info, nonatomic, wait);
4589 spin_unlock_bh(&call_lock);
4590 --- a/arch/x86/kernel/smp_64-xen.c
4591 +++ b/arch/x86/kernel/smp_64-xen.c
4592 @@ -384,12 +384,17 @@
4593 put_cpu();
4594 return 0;
4595 }
4596 +
4597 + /* Can deadlock when called with interrupts disabled */
4598 + WARN_ON(irqs_disabled());
4599 +
4600 spin_lock_bh(&call_lock);
4601 __smp_call_function_single(cpu, func, info, nonatomic, wait);
4602 spin_unlock_bh(&call_lock);
4603 put_cpu();
4604 return 0;
4605 }
4606 +EXPORT_SYMBOL(smp_call_function_single);
4607
4608 /*
4609 * this function sends a 'generic call function' IPI to all other CPUs
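
Both the smp_32-xen.c and smp_64-xen.c hunks above add a WARN_ON(irqs_disabled()) ahead of taking call_lock in smp_call_function_single(), and the 64-bit file now also exports the symbol to modules. A comment-only sketch, not part of the patch, of the deadlock the warning is meant to catch (assuming the usual cross-call IPI semantics):

	/*
	 * Hypothetical caller:
	 *
	 *	local_irq_disable();
	 *	smp_call_function_single(cpu, fn, NULL, 0, 1);	/* wait == 1 */
	 *
	 * If the target CPU is itself spinning with interrupts off waiting on
	 * this CPU (for example in another cross-call), it can never take the
	 * IPI and run fn(), so the waiting caller spins forever.  Warning as
	 * soon as interrupts are found disabled makes such callers obvious.
	 */
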
4610 --- a/arch/x86/kernel/time_32-xen.c
4611 +++ b/arch/x86/kernel/time_32-xen.c
4612 @@ -61,6 +61,7 @@
4613 #include <asm/uaccess.h>
4614 #include <asm/processor.h>
4615 #include <asm/timer.h>
4616 +#include <asm/time.h>
4617 #include <asm/sections.h>
4618
4619 #include "mach_time.h"
4620 @@ -129,11 +130,11 @@
4621 /* Must be signed, as it's compared with s64 quantities which can be -ve. */
4622 #define NS_PER_TICK (1000000000LL/HZ)
4623
4624 -static void __clock_was_set(void *unused)
4625 +static void __clock_was_set(struct work_struct *unused)
4626 {
4627 clock_was_set();
4628 }
4629 -static DECLARE_WORK(clock_was_set_work, __clock_was_set, NULL);
4630 +static DECLARE_WORK(clock_was_set_work, __clock_was_set);
4631
4632 static inline void __normalize_time(time_t *sec, s64 *nsec)
4633 {
4634 @@ -537,10 +538,7 @@
4635 /* gets recalled with irq locally disabled */
4636 /* XXX - does irqsave resolve this? -johnstul */
4637 spin_lock_irqsave(&rtc_lock, flags);
4638 - if (efi_enabled)
4639 - retval = efi_set_rtc_mmss(nowtime);
4640 - else
4641 - retval = mach_set_rtc_mmss(nowtime);
4642 + retval = set_wallclock(nowtime);
4643 spin_unlock_irqrestore(&rtc_lock, flags);
4644
4645 return retval;
4646 @@ -865,10 +863,7 @@
4647
4648 spin_lock_irqsave(&rtc_lock, flags);
4649
4650 - if (efi_enabled)
4651 - retval = efi_get_time();
4652 - else
4653 - retval = mach_get_cmos_time();
4654 + retval = get_wallclock();
4655
4656 spin_unlock_irqrestore(&rtc_lock, flags);
4657
4658 @@ -970,7 +965,7 @@
4659 printk("Using HPET for base-timer\n");
4660 }
4661
4662 - time_init_hook();
4663 + do_time_init();
4664 }
4665 #endif
4666
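
The time_32-xen.c hunks above adapt to two 2.6.20 interface changes: work-queue callbacks now receive a struct work_struct * (so DECLARE_WORK() takes only the function), and the RTC and timer-init paths go through set_wallclock(), get_wallclock() and do_time_init() from the newly included <asm/time.h> instead of open-coding the efi_*/mach_* selection. A minimal sketch of the new work-queue convention, assuming the standard 2.6.20 API (names are illustrative, not from the patch):

	/* pre-2.6.20:
	 *	static void my_callback(void *data);
	 *	static DECLARE_WORK(my_work, my_callback, NULL);
	 * 2.6.20 and later: */
	static void my_callback(struct work_struct *work)
	{
		/* per-instance data, if any, comes from
		 * container_of(work, struct my_ctx, work) */
	}
	static DECLARE_WORK(my_work, my_callback);
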
4667 --- a/arch/x86/kernel/traps_32-xen.c
4668 +++ b/arch/x86/kernel/traps_32-xen.c
4669 @@ -29,6 +29,8 @@
4670 #include <linux/kexec.h>
4671 #include <linux/unwind.h>
4672 #include <linux/uaccess.h>
4673 +#include <linux/nmi.h>
4674 +#include <linux/bug.h>
4675
4676 #ifdef CONFIG_EISA
4677 #include <linux/ioport.h>
4678 @@ -61,9 +63,6 @@
4679
4680 asmlinkage int system_call(void);
4681
4682 -struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 },
4683 - { 0, 0 }, { 0, 0 } };
4684 -
4685 /* Do we ignore FPU interrupts ? */
4686 char ignore_fpu_irq = 0;
4687
4688 @@ -100,12 +99,7 @@
4689 #endif
4690 asmlinkage void machine_check(void);
4691
4692 -static int kstack_depth_to_print = 24;
4693 -#ifdef CONFIG_STACK_UNWIND
4694 -static int call_trace = 1;
4695 -#else
4696 -#define call_trace (-1)
4697 -#endif
4698 +int kstack_depth_to_print = 24;
4699 ATOMIC_NOTIFIER_HEAD(i386die_chain);
4700
4701 int register_die_notifier(struct notifier_block *nb)
4702 @@ -159,25 +153,7 @@
4703 return ebp;
4704 }
4705
4706 -struct ops_and_data {
4707 - struct stacktrace_ops *ops;
4708 - void *data;
4709 -};
4710 -
4711 -static asmlinkage int
4712 -dump_trace_unwind(struct unwind_frame_info *info, void *data)
4713 -{
4714 - struct ops_and_data *oad = (struct ops_and_data *)data;
4715 - int n = 0;
4716 -
4717 - while (unwind(info) == 0 && UNW_PC(info)) {
4718 - n++;
4719 - oad->ops->address(oad->data, UNW_PC(info));
4720 - if (arch_unw_user_mode(info))
4721 - break;
4722 - }
4723 - return n;
4724 -}
4725 +#define MSG(msg) ops->warning(data, msg)
4726
4727 void dump_trace(struct task_struct *task, struct pt_regs *regs,
4728 unsigned long *stack,
4729 @@ -188,39 +164,6 @@
4730 if (!task)
4731 task = current;
4732
4733 - if (call_trace >= 0) {
4734 - int unw_ret = 0;
4735 - struct unwind_frame_info info;
4736 - struct ops_and_data oad = { .ops = ops, .data = data };
4737 -
4738 - if (regs) {
4739 - if (unwind_init_frame_info(&info, task, regs) == 0)
4740 - unw_ret = dump_trace_unwind(&info, &oad);
4741 - } else if (task == current)
4742 - unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad);
4743 - else {
4744 - if (unwind_init_blocked(&info, task) == 0)
4745 - unw_ret = dump_trace_unwind(&info, &oad);
4746 - }
4747 - if (unw_ret > 0) {
4748 - if (call_trace == 1 && !arch_unw_user_mode(&info)) {
4749 - ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n",
4750 - UNW_PC(&info));
4751 - if (UNW_SP(&info) >= PAGE_OFFSET) {
4752 - ops->warning(data, "Leftover inexact backtrace:\n");
4753 - stack = (void *)UNW_SP(&info);
4754 - if (!stack)
4755 - return;
4756 - ebp = UNW_FP(&info);
4757 - } else
4758 - ops->warning(data, "Full inexact backtrace again:\n");
4759 - } else if (call_trace >= 1)
4760 - return;
4761 - else
4762 - ops->warning(data, "Full inexact backtrace again:\n");
4763 - } else
4764 - ops->warning(data, "Inexact backtrace:\n");
4765 - }
4766 if (!stack) {
4767 unsigned long dummy;
4768 stack = &dummy;
4769 @@ -253,6 +196,7 @@
4770 stack = (unsigned long*)context->previous_esp;
4771 if (!stack)
4772 break;
4773 + touch_nmi_watchdog();
4774 }
4775 }
4776 EXPORT_SYMBOL(dump_trace);
4777 @@ -385,7 +329,7 @@
4778 * time of the fault..
4779 */
4780 if (in_kernel) {
4781 - u8 __user *eip;
4782 + u8 *eip;
4783 int code_bytes = 64;
4784 unsigned char c;
4785
4786 @@ -394,18 +338,20 @@
4787
4788 printk(KERN_EMERG "Code: ");
4789
4790 - eip = (u8 __user *)regs->eip - 43;
4791 - if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) {
4792 + eip = (u8 *)regs->eip - 43;
4793 + if (eip < (u8 *)PAGE_OFFSET ||
4794 + probe_kernel_address(eip, c)) {
4795 /* try starting at EIP */
4796 - eip = (u8 __user *)regs->eip;
4797 + eip = (u8 *)regs->eip;
4798 code_bytes = 32;
4799 }
4800 for (i = 0; i < code_bytes; i++, eip++) {
4801 - if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) {
4802 + if (eip < (u8 *)PAGE_OFFSET ||
4803 + probe_kernel_address(eip, c)) {
4804 printk(" Bad EIP value.");
4805 break;
4806 }
4807 - if (eip == (u8 __user *)regs->eip)
4808 + if (eip == (u8 *)regs->eip)
4809 printk("<%02x> ", c);
4810 else
4811 printk("%02x ", c);
4812 @@ -414,43 +360,22 @@
4813 printk("\n");
4814 }
4815
4816 -static void handle_BUG(struct pt_regs *regs)
4817 +int is_valid_bugaddr(unsigned long eip)
4818 {
4819 - unsigned long eip = regs->eip;
4820 unsigned short ud2;
4821
4822 if (eip < PAGE_OFFSET)
4823 - return;
4824 - if (probe_kernel_address((unsigned short __user *)eip, ud2))
4825 - return;
4826 - if (ud2 != 0x0b0f)
4827 - return;
4828 + return 0;
4829 + if (probe_kernel_address((unsigned short *)eip, ud2))
4830 + return 0;
4831
4832 - printk(KERN_EMERG "------------[ cut here ]------------\n");
4833 -
4834 -#ifdef CONFIG_DEBUG_BUGVERBOSE
4835 - do {
4836 - unsigned short line;
4837 - char *file;
4838 - char c;
4839 -
4840 - if (probe_kernel_address((unsigned short __user *)(eip + 2),
4841 - line))
4842 - break;
4843 - if (__get_user(file, (char * __user *)(eip + 4)) ||
4844 - (unsigned long)file < PAGE_OFFSET || __get_user(c, file))
4845 - file = "<bad filename>";
4846 -
4847 - printk(KERN_EMERG "kernel BUG at %s:%d!\n", file, line);
4848 - return;
4849 - } while (0);
4850 -#endif
4851 - printk(KERN_EMERG "Kernel BUG at [verbose debug info unavailable]\n");
4852 + return ud2 == 0x0b0f;
4853 }
4854
4855 -/* This is gone through when something in the kernel
4856 - * has done something bad and is about to be terminated.
4857 -*/
4858 +/*
4859 + * This is gone through when something in the kernel has done something bad and
4860 + * is about to be terminated.
4861 + */
4862 void die(const char * str, struct pt_regs * regs, long err)
4863 {
4864 static struct {
4865 @@ -458,7 +383,7 @@
4866 u32 lock_owner;
4867 int lock_owner_depth;
4868 } die = {
4869 - .lock = SPIN_LOCK_UNLOCKED,
4870 + .lock = __SPIN_LOCK_UNLOCKED(die.lock),
4871 .lock_owner = -1,
4872 .lock_owner_depth = 0
4873 };
4874 @@ -482,7 +407,8 @@
4875 unsigned long esp;
4876 unsigned short ss;
4877
4878 - handle_BUG(regs);
4879 + report_bug(regs->eip);
4880 +
4881 printk(KERN_EMERG "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter);
4882 #ifdef CONFIG_PREEMPT
4883 printk(KERN_EMERG "PREEMPT ");
4884 @@ -682,8 +608,7 @@
4885 {
4886 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
4887 "CPU %d.\n", reason, smp_processor_id());
4888 - printk(KERN_EMERG "You probably have a hardware problem with your RAM "
4889 - "chips\n");
4890 + printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
4891 if (panic_on_unrecovered_nmi)
4892 panic("NMI: Not continuing");
4893
4894 @@ -741,7 +666,6 @@
4895 printk(" on CPU%d, eip %08lx, registers:\n",
4896 smp_processor_id(), regs->eip);
4897 show_registers(regs);
4898 - printk(KERN_EMERG "console shuts up ...\n");
4899 console_silent();
4900 spin_unlock(&nmi_print_lock);
4901 bust_spinlocks(0);
4902 @@ -1057,49 +981,24 @@
4903 #endif
4904 }
4905
4906 -fastcall void setup_x86_bogus_stack(unsigned char * stk)
4907 +fastcall unsigned long patch_espfix_desc(unsigned long uesp,
4908 + unsigned long kesp)
4909 {
4910 - unsigned long *switch16_ptr, *switch32_ptr;
4911 - struct pt_regs *regs;
4912 - unsigned long stack_top, stack_bot;
4913 - unsigned short iret_frame16_off;
4914 - int cpu = smp_processor_id();
4915 - /* reserve the space on 32bit stack for the magic switch16 pointer */
4916 - memmove(stk, stk + 8, sizeof(struct pt_regs));
4917 - switch16_ptr = (unsigned long *)(stk + sizeof(struct pt_regs));
4918 - regs = (struct pt_regs *)stk;
4919 - /* now the switch32 on 16bit stack */
4920 - stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
4921 - stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
4922 - switch32_ptr = (unsigned long *)(stack_top - 8);
4923 - iret_frame16_off = CPU_16BIT_STACK_SIZE - 8 - 20;
4924 - /* copy iret frame on 16bit stack */
4925 - memcpy((void *)(stack_bot + iret_frame16_off), &regs->eip, 20);
4926 - /* fill in the switch pointers */
4927 - switch16_ptr[0] = (regs->esp & 0xffff0000) | iret_frame16_off;
4928 - switch16_ptr[1] = __ESPFIX_SS;
4929 - switch32_ptr[0] = (unsigned long)stk + sizeof(struct pt_regs) +
4930 - 8 - CPU_16BIT_STACK_SIZE;
4931 - switch32_ptr[1] = __KERNEL_DS;
4932 -}
4933 -
4934 -fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp)
4935 -{
4936 - unsigned long *switch32_ptr;
4937 - unsigned char *stack16, *stack32;
4938 - unsigned long stack_top, stack_bot;
4939 - int len;
4940 int cpu = smp_processor_id();
4941 - stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
4942 - stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
4943 - switch32_ptr = (unsigned long *)(stack_top - 8);
4944 - /* copy the data from 16bit stack to 32bit stack */
4945 - len = CPU_16BIT_STACK_SIZE - 8 - sp;
4946 - stack16 = (unsigned char *)(stack_bot + sp);
4947 - stack32 = (unsigned char *)
4948 - (switch32_ptr[0] + CPU_16BIT_STACK_SIZE - 8 - len);
4949 - memcpy(stack32, stack16, len);
4950 - return stack32;
4951 + struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
4952 + struct desc_struct *gdt = (struct desc_struct *)cpu_gdt_descr->address;
4953 + unsigned long base = (kesp - uesp) & -THREAD_SIZE;
4954 + unsigned long new_kesp = kesp - base;
4955 + unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
4956 + __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS];
4957 + /* Set up base for espfix segment */
4958 + desc &= 0x00f0ff0000000000ULL;
4959 + desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) |
4960 + ((((__u64)base) << 32) & 0xff00000000000000ULL) |
4961 + ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) |
4962 + (lim_pages & 0xffff);
4963 + *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc;
4964 + return new_kesp;
4965 }
4966 #endif
4967
4968 @@ -1113,7 +1012,7 @@
4969 * Must be called with kernel preemption disabled (in this case,
4970 * local interrupts are disabled at the call-site in entry.S).
4971 */
4972 -asmlinkage void math_state_restore(struct pt_regs regs)
4973 +asmlinkage void math_state_restore(void)
4974 {
4975 struct thread_info *thread = current_thread_info();
4976 struct task_struct *tsk = thread->task;
4977 @@ -1123,6 +1022,7 @@
4978 init_fpu(tsk);
4979 restore_fpu(tsk);
4980 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
4981 + tsk->fpu_counter++;
4982 }
4983
4984 #ifndef CONFIG_MATH_EMULATION
4985 @@ -1234,19 +1134,3 @@
4986 return 1;
4987 }
4988 __setup("kstack=", kstack_setup);
4989 -
4990 -#ifdef CONFIG_STACK_UNWIND
4991 -static int __init call_trace_setup(char *s)
4992 -{
4993 - if (strcmp(s, "old") == 0)
4994 - call_trace = -1;
4995 - else if (strcmp(s, "both") == 0)
4996 - call_trace = 0;
4997 - else if (strcmp(s, "newfallback") == 0)
4998 - call_trace = 1;
4999 - else if (strcmp(s, "new") == 2)
5000 - call_trace = 2;
5001 - return 1;
5002 -}
5003 -__setup("call_trace=", call_trace_setup);
5004 -#endif
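
The traps_32-xen.c hunks above (and the matching traps_64-xen.c hunks below) retire the arch-private handle_BUG() and the DWARF-unwinder fallback controlled by call_trace= in favour of the generic BUG machinery: the arch keeps only is_valid_bugaddr(), which checks for the ud2 opcode at the trapping address, and die() now calls report_bug(). A rough sketch of the resulting flow, assuming 2.6.20's lib/bug.c interface (illustrative, not part of the patch):

	/*
	 *	die(str, regs, err)
	 *	  -> report_bug(regs->eip)	  /* generic code in lib/bug.c */
	 *	       -> is_valid_bugaddr(eip)	  /* arch hook: really a ud2 here? */
	 *	       -> look up the bug table entry and print file:line
	 */
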
5005 --- a/arch/x86/kernel/traps_64-xen.c
5006 +++ b/arch/x86/kernel/traps_64-xen.c
5007 @@ -30,9 +30,10 @@
5008 #include <linux/kprobes.h>
5009 #include <linux/kexec.h>
5010 #include <linux/unwind.h>
5011 +#include <linux/uaccess.h>
5012 +#include <linux/bug.h>
5013
5014 #include <asm/system.h>
5015 -#include <asm/uaccess.h>
5016 #include <asm/io.h>
5017 #include <asm/atomic.h>
5018 #include <asm/debugreg.h>
5019 @@ -108,12 +109,7 @@
5020 preempt_enable_no_resched();
5021 }
5022
5023 -static int kstack_depth_to_print = 12;
5024 -#ifdef CONFIG_STACK_UNWIND
5025 -static int call_trace = 1;
5026 -#else
5027 -#define call_trace (-1)
5028 -#endif
5029 +int kstack_depth_to_print = 12;
5030
5031 #ifdef CONFIG_KALLSYMS
5032 void printk_address(unsigned long address)
5033 @@ -218,24 +214,7 @@
5034 return NULL;
5035 }
5036
5037 -struct ops_and_data {
5038 - struct stacktrace_ops *ops;
5039 - void *data;
5040 -};
5041 -
5042 -static int dump_trace_unwind(struct unwind_frame_info *info, void *context)
5043 -{
5044 - struct ops_and_data *oad = (struct ops_and_data *)context;
5045 - int n = 0;
5046 -
5047 - while (unwind(info) == 0 && UNW_PC(info)) {
5048 - n++;
5049 - oad->ops->address(oad->data, UNW_PC(info));
5050 - if (arch_unw_user_mode(info))
5051 - break;
5052 - }
5053 - return n;
5054 -}
5055 +#define MSG(txt) ops->warning(data, txt)
5056
5057 /*
5058 * x86-64 can have upto three kernel stacks:
5059 @@ -250,61 +229,24 @@
5060 return p > t && p < t + THREAD_SIZE - 3;
5061 }
5062
5063 -void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack,
5064 +void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
5065 + unsigned long *stack,
5066 struct stacktrace_ops *ops, void *data)
5067 {
5068 - const unsigned cpu = smp_processor_id();
5069 - unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
5070 + const unsigned cpu = get_cpu();
5071 + unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr;
5072 unsigned used = 0;
5073 struct thread_info *tinfo;
5074
5075 if (!tsk)
5076 tsk = current;
5077
5078 - if (call_trace >= 0) {
5079 - int unw_ret = 0;
5080 - struct unwind_frame_info info;
5081 - struct ops_and_data oad = { .ops = ops, .data = data };
5082 -
5083 - if (regs) {
5084 - if (unwind_init_frame_info(&info, tsk, regs) == 0)
5085 - unw_ret = dump_trace_unwind(&info, &oad);
5086 - } else if (tsk == current)
5087 - unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad);
5088 - else {
5089 - if (unwind_init_blocked(&info, tsk) == 0)
5090 - unw_ret = dump_trace_unwind(&info, &oad);
5091 - }
5092 - if (unw_ret > 0) {
5093 - if (call_trace == 1 && !arch_unw_user_mode(&info)) {
5094 - ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n",
5095 - UNW_PC(&info));
5096 - if ((long)UNW_SP(&info) < 0) {
5097 - ops->warning(data, "Leftover inexact backtrace:\n");
5098 - stack = (unsigned long *)UNW_SP(&info);
5099 - if (!stack)
5100 - return;
5101 - } else
5102 - ops->warning(data, "Full inexact backtrace again:\n");
5103 - } else if (call_trace >= 1)
5104 - return;
5105 - else
5106 - ops->warning(data, "Full inexact backtrace again:\n");
5107 - } else
5108 - ops->warning(data, "Inexact backtrace:\n");
5109 - }
5110 if (!stack) {
5111 unsigned long dummy;
5112 stack = &dummy;
5113 if (tsk && tsk != current)
5114 stack = (unsigned long *)tsk->thread.rsp;
5115 }
5116 - /*
5117 - * Align the stack pointer on word boundary, later loops
5118 - * rely on that (and corruption / debug info bugs can cause
5119 - * unaligned values here):
5120 - */
5121 - stack = (unsigned long *)((unsigned long)stack & ~(sizeof(long)-1));
5122
5123 /*
5124 * Print function call entries within a stack. 'cond' is the
5125 @@ -314,9 +256,9 @@
5126 #define HANDLE_STACK(cond) \
5127 do while (cond) { \
5128 unsigned long addr = *stack++; \
5129 - if (oops_in_progress ? \
5130 - __kernel_text_address(addr) : \
5131 - kernel_text_address(addr)) { \
5132 + /* Use unlocked access here because except for NMIs \
5133 + we should be already protected against module unloads */ \
5134 + if (__kernel_text_address(addr)) { \
5135 /* \
5136 * If the address is either in the text segment of the \
5137 * kernel, or in the region which contains vmalloc'ed \
5138 @@ -379,9 +321,10 @@
5139 /*
5140 * This handles the process stack:
5141 */
5142 - tinfo = current_thread_info();
5143 + tinfo = task_thread_info(tsk);
5144 HANDLE_STACK (valid_stack_ptr(tinfo, stack));
5145 #undef HANDLE_STACK
5146 + put_cpu();
5147 }
5148 EXPORT_SYMBOL(dump_trace);
5149
5150 @@ -518,30 +461,15 @@
5151 printk("\n");
5152 }
5153
5154 -void handle_BUG(struct pt_regs *regs)
5155 -{
5156 - struct bug_frame f;
5157 - long len;
5158 - const char *prefix = "";
5159 +int is_valid_bugaddr(unsigned long rip)
5160 +{
5161 + unsigned short ud2;
5162
5163 - if (user_mode(regs))
5164 - return;
5165 - if (__copy_from_user(&f, (const void __user *) regs->rip,
5166 - sizeof(struct bug_frame)))
5167 - return;
5168 - if (f.filename >= 0 ||
5169 - f.ud2[0] != 0x0f || f.ud2[1] != 0x0b)
5170 - return;
5171 - len = __strnlen_user((char *)(long)f.filename, PATH_MAX) - 1;
5172 - if (len < 0 || len >= PATH_MAX)
5173 - f.filename = (int)(long)"unmapped filename";
5174 - else if (len > 50) {
5175 - f.filename += len - 50;
5176 - prefix = "...";
5177 - }
5178 - printk("----------- [cut here ] --------- [please bite here ] ---------\n");
5179 - printk(KERN_ALERT "Kernel BUG at %s%.50s:%d\n", prefix, (char *)(long)f.filename, f.line);
5180 -}
5181 + if (__copy_from_user(&ud2, (const void __user *) rip, sizeof(ud2)))
5182 + return 0;
5183 +
5184 + return ud2 == 0x0b0f;
5185 +}
5186
5187 #ifdef CONFIG_BUG
5188 void out_of_line_bug(void)
5189 @@ -621,7 +549,9 @@
5190 {
5191 unsigned long flags = oops_begin();
5192
5193 - handle_BUG(regs);
5194 + if (!user_mode(regs))
5195 + report_bug(regs->rip);
5196 +
5197 __die(str, regs, err);
5198 oops_end(flags);
5199 do_exit(SIGSEGV);
5200 @@ -790,8 +720,7 @@
5201 {
5202 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
5203 reason);
5204 - printk(KERN_EMERG "You probably have a hardware problem with your "
5205 - "RAM chips\n");
5206 + printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
5207
5208 if (panic_on_unrecovered_nmi)
5209 panic("NMI: Not continuing");
5210 @@ -1227,21 +1156,3 @@
5211 return 0;
5212 }
5213 early_param("kstack", kstack_setup);
5214 -
5215 -#ifdef CONFIG_STACK_UNWIND
5216 -static int __init call_trace_setup(char *s)
5217 -{
5218 - if (!s)
5219 - return -EINVAL;
5220 - if (strcmp(s, "old") == 0)
5221 - call_trace = -1;
5222 - else if (strcmp(s, "both") == 0)
5223 - call_trace = 0;
5224 - else if (strcmp(s, "newfallback") == 0)
5225 - call_trace = 1;
5226 - else if (strcmp(s, "new") == 0)
5227 - call_trace = 2;
5228 - return 0;
5229 -}
5230 -early_param("call_trace", call_trace_setup);
5231 -#endif
5232 --- a/arch/x86/kernel/vmlinux_32.lds.S
5233 +++ b/arch/x86/kernel/vmlinux_32.lds.S
5234 @@ -29,6 +29,12 @@
5235 SECTIONS
5236 {
5237 . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
5238 +
5239 +#if defined(CONFIG_XEN) && CONFIG_XEN_COMPAT <= 0x030002
5240 +#undef LOAD_OFFSET
5241 +#define LOAD_OFFSET 0
5242 +#endif
5243 +
5244 phys_startup_32 = startup_32 - LOAD_OFFSET;
5245
5246 .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
5247 --- a/arch/x86/kernel/vsyscall_64-xen.c
5248 +++ b/arch/x86/kernel/vsyscall_64-xen.c
5249 @@ -42,6 +42,7 @@
5250 #include <asm/topology.h>
5251
5252 #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
5253 +#define __syscall_clobber "r11","rcx","memory"
5254
5255 int __sysctl_vsyscall __section_sysctl_vsyscall = 1;
5256 seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED;
5257 @@ -224,8 +225,7 @@
5258
5259 static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen,
5260 void __user *oldval, size_t __user *oldlenp,
5261 - void __user *newval, size_t newlen,
5262 - void **context)
5263 + void __user *newval, size_t newlen)
5264 {
5265 return -ENOSYS;
5266 }
5267 @@ -277,7 +277,6 @@
5268 vsyscall_set_cpu(raw_smp_processor_id());
5269 }
5270
5271 -#ifdef CONFIG_HOTPLUG_CPU
5272 static int __cpuinit
5273 cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
5274 {
5275 @@ -286,13 +285,13 @@
5276 smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1);
5277 return NOTIFY_DONE;
5278 }
5279 -#endif
5280
5281 static void __init map_vsyscall(void)
5282 {
5283 extern char __vsyscall_0;
5284 unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
5285
5286 + /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
5287 __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
5288 }
5289
5290 --- a/arch/x86/kvm/Kconfig
5291 +++ b/arch/x86/kvm/Kconfig
5292 @@ -7,6 +7,7 @@
5293 menuconfig VIRTUALIZATION
5294 bool "Virtualization"
5295 depends on HAVE_KVM || X86
5296 + depends on !XEN
5297 default y
5298 ---help---
5299 Say Y here to get to see options for using your Linux host to run other
5300 --- a/arch/x86/mm/fault_32-xen.c
5301 +++ b/arch/x86/mm/fault_32-xen.c
5302 @@ -22,9 +22,9 @@
5303 #include <linux/highmem.h>
5304 #include <linux/module.h>
5305 #include <linux/kprobes.h>
5306 +#include <linux/uaccess.h>
5307
5308 #include <asm/system.h>
5309 -#include <asm/uaccess.h>
5310 #include <asm/desc.h>
5311 #include <asm/kdebug.h>
5312 #include <asm/segment.h>
5313 @@ -167,7 +167,7 @@
5314 static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
5315 {
5316 unsigned long limit;
5317 - unsigned long instr = get_segment_eip (regs, &limit);
5318 + unsigned char *instr = (unsigned char *)get_segment_eip (regs, &limit);
5319 int scan_more = 1;
5320 int prefetch = 0;
5321 int i;
5322 @@ -177,9 +177,9 @@
5323 unsigned char instr_hi;
5324 unsigned char instr_lo;
5325
5326 - if (instr > limit)
5327 + if (instr > (unsigned char *)limit)
5328 break;
5329 - if (__get_user(opcode, (unsigned char __user *) instr))
5330 + if (probe_kernel_address(instr, opcode))
5331 break;
5332
5333 instr_hi = opcode & 0xf0;
5334 @@ -204,9 +204,9 @@
5335 case 0x00:
5336 /* Prefetch instruction is 0x0F0D or 0x0F18 */
5337 scan_more = 0;
5338 - if (instr > limit)
5339 + if (instr > (unsigned char *)limit)
5340 break;
5341 - if (__get_user(opcode, (unsigned char __user *) instr))
5342 + if (probe_kernel_address(instr, opcode))
5343 break;
5344 prefetch = (instr_lo == 0xF) &&
5345 (opcode == 0x0D || opcode == 0x18);
5346 --- a/arch/x86/mm/fault_64-xen.c
5347 +++ b/arch/x86/mm/fault_64-xen.c
5348 @@ -23,9 +23,9 @@
5349 #include <linux/compiler.h>
5350 #include <linux/module.h>
5351 #include <linux/kprobes.h>
5352 +#include <linux/uaccess.h>
5353
5354 #include <asm/system.h>
5355 -#include <asm/uaccess.h>
5356 #include <asm/pgalloc.h>
5357 #include <asm/smp.h>
5358 #include <asm/tlbflush.h>
5359 @@ -96,7 +96,7 @@
5360 static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
5361 unsigned long error_code)
5362 {
5363 - unsigned char __user *instr;
5364 + unsigned char *instr;
5365 int scan_more = 1;
5366 int prefetch = 0;
5367 unsigned char *max_instr;
5368 @@ -116,7 +116,7 @@
5369 unsigned char instr_hi;
5370 unsigned char instr_lo;
5371
5372 - if (__get_user(opcode, (char __user *)instr))
5373 + if (probe_kernel_address(instr, opcode))
5374 break;
5375
5376 instr_hi = opcode & 0xf0;
5377 @@ -154,7 +154,7 @@
5378 case 0x00:
5379 /* Prefetch instruction is 0x0F0D or 0x0F18 */
5380 scan_more = 0;
5381 - if (__get_user(opcode, (char __user *)instr))
5382 + if (probe_kernel_address(instr, opcode))
5383 break;
5384 prefetch = (instr_lo == 0xF) &&
5385 (opcode == 0x0D || opcode == 0x18);
5386 @@ -170,7 +170,7 @@
5387 static int bad_address(void *p)
5388 {
5389 unsigned long dummy;
5390 - return __get_user(dummy, (unsigned long __user *)p);
5391 + return probe_kernel_address((unsigned long *)p, dummy);
5392 }
5393
5394 void dump_pagetable(unsigned long address)
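
Both fault-handler hunks above replace __get_user() on kernel pointers with probe_kernel_address() from <linux/uaccess.h>, the interface intended for speculatively reading a possibly unmapped kernel address. A minimal usage sketch, mirroring how the hunks use it (illustrative only):

	unsigned char opcode;

	/* returns non-zero if *instr cannot be read; never faults the caller */
	if (probe_kernel_address(instr, opcode))
		return 0;	/* unreadable text, treat as "not a prefetch" */
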
5395 --- a/arch/x86/mm/highmem_32-xen.c
5396 +++ b/arch/x86/mm/highmem_32-xen.c
5397 @@ -32,7 +32,7 @@
5398 unsigned long vaddr;
5399
5400 /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
5401 - inc_preempt_count();
5402 + pagefault_disable();
5403 if (!PageHighMem(page))
5404 return page_address(page);
5405
5406 @@ -63,26 +63,22 @@
5407 unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
5408 enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
5409
5410 -#ifdef CONFIG_DEBUG_HIGHMEM
5411 - if (vaddr >= PAGE_OFFSET && vaddr < (unsigned long)high_memory) {
5412 - dec_preempt_count();
5413 - preempt_check_resched();
5414 - return;
5415 - }
5416 -
5417 - if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx))
5418 - BUG();
5419 -#endif
5420 /*
5421 * Force other mappings to Oops if they'll try to access this pte
5422 * without first remap it. Keeping stale mappings around is a bad idea
5423 * also, in case the page changes cacheability attributes or becomes
5424 * a protected page in a hypervisor.
5425 */
5426 - kpte_clear_flush(kmap_pte-idx, vaddr);
5427 + if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx))
5428 + kpte_clear_flush(kmap_pte-idx, vaddr);
5429 + else {
5430 +#ifdef CONFIG_DEBUG_HIGHMEM
5431 + BUG_ON(vaddr < PAGE_OFFSET);
5432 + BUG_ON(vaddr >= (unsigned long)high_memory);
5433 +#endif
5434 + }
5435
5436 - dec_preempt_count();
5437 - preempt_check_resched();
5438 + pagefault_enable();
5439 }
5440
5441 /* This is the same as kmap_atomic() but can map memory that doesn't
5442 @@ -93,7 +89,7 @@
5443 enum fixed_addresses idx;
5444 unsigned long vaddr;
5445
5446 - inc_preempt_count();
5447 + pagefault_disable();
5448
5449 idx = type + KM_TYPE_NR*smp_processor_id();
5450 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
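
The highmem hunks above move kmap_atomic()/kunmap_atomic() from raw inc_preempt_count()/dec_preempt_count() to pagefault_disable()/pagefault_enable(), which state the real requirement (no page faults inside an atomic kmap) while keeping the preempt accounting the same; kunmap_atomic() now flushes only when the address is the expected fixmap slot and otherwise just asserts, under CONFIG_DEBUG_HIGHMEM, that it was a lowmem address needing no unmap. A short usage sketch under the 2.6.20 API (illustrative; page, buf and len are placeholders):

	void *kaddr = kmap_atomic(page, KM_USER0);	/* pagefault_disable() inside */
	memcpy(kaddr, buf, len);			/* no sleeping, no faulting here */
	kunmap_atomic(kaddr, KM_USER0);			/* pagefault_enable() inside */
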
5451 --- a/arch/x86/mm/init_32-xen.c
5452 +++ b/arch/x86/mm/init_32-xen.c
5453 @@ -235,8 +235,6 @@
5454
5455 #endif
5456
5457 -extern int is_available_memory(efi_memory_desc_t *);
5458 -
5459 int page_is_ram(unsigned long pagenr)
5460 {
5461 int i;
5462 @@ -329,7 +327,7 @@
5463 SetPageReserved(page);
5464 }
5465
5466 -static int add_one_highpage_hotplug(struct page *page, unsigned long pfn)
5467 +static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long pfn)
5468 {
5469 free_new_highpage(page, pfn);
5470 totalram_pages++;
5471 @@ -346,7 +344,7 @@
5472 * has been added dynamically that would be
5473 * onlined here is in HIGHMEM
5474 */
5475 -void online_page(struct page *page)
5476 +void __meminit online_page(struct page *page)
5477 {
5478 ClearPageReserved(page);
5479 add_one_highpage_hotplug(page, page_to_pfn(page));
5480 @@ -739,16 +737,10 @@
5481 set_bit(PG_pinned, &virt_to_page(init_mm.pgd)->flags);
5482 }
5483
5484 -/*
5485 - * this is for the non-NUMA, single node SMP system case.
5486 - * Specifically, in the case of x86, we will always add
5487 - * memory to the highmem for now.
5488 - */
5489 #ifdef CONFIG_MEMORY_HOTPLUG
5490 -#ifndef CONFIG_NEED_MULTIPLE_NODES
5491 int arch_add_memory(int nid, u64 start, u64 size)
5492 {
5493 - struct pglist_data *pgdata = &contig_page_data;
5494 + struct pglist_data *pgdata = NODE_DATA(nid);
5495 struct zone *zone = pgdata->node_zones + ZONE_HIGHMEM;
5496 unsigned long start_pfn = start >> PAGE_SHIFT;
5497 unsigned long nr_pages = size >> PAGE_SHIFT;
5498 @@ -760,11 +752,11 @@
5499 {
5500 return -EINVAL;
5501 }
5502 -#endif
5503 +EXPORT_SYMBOL_GPL(remove_memory);
5504 #endif
5505
5506 -kmem_cache_t *pgd_cache;
5507 -kmem_cache_t *pmd_cache;
5508 +struct kmem_cache *pgd_cache;
5509 +struct kmem_cache *pmd_cache;
5510
5511 void __init pgtable_cache_init(void)
5512 {
5513 --- a/arch/x86/mm/init_64-xen.c
5514 +++ b/arch/x86/mm/init_64-xen.c
5515 @@ -1130,14 +1130,15 @@
5516 __initcall(x8664_sysctl_init);
5517 #endif
5518
5519 -/* A pseudo VMAs to allow ptrace access for the vsyscall page. This only
5520 +/* A pseudo VMA to allow ptrace access for the vsyscall page. This only
5521 covers the 64bit vsyscall page now. 32bit has a real VMA now and does
5522 not need special handling anymore. */
5523
5524 static struct vm_area_struct gate_vma = {
5525 .vm_start = VSYSCALL_START,
5526 - .vm_end = VSYSCALL_END,
5527 - .vm_page_prot = PAGE_READONLY
5528 + .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT),
5529 + .vm_page_prot = PAGE_READONLY_EXEC,
5530 + .vm_flags = VM_READ | VM_EXEC
5531 };
5532
5533 struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
5534 --- a/arch/x86/mm/pageattr_64-xen.c
5535 +++ b/arch/x86/mm/pageattr_64-xen.c
5536 @@ -324,34 +324,40 @@
5537 return base;
5538 }
5539
5540 -
5541 -static void flush_kernel_map(void *address)
5542 +static void cache_flush_page(void *adr)
5543 {
5544 - if (0 && address && cpu_has_clflush) {
5545 - /* is this worth it? */
5546 - int i;
5547 - for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size)
5548 - asm volatile("clflush (%0)" :: "r" (address + i));
5549 - } else
5550 - asm volatile("wbinvd":::"memory");
5551 - if (address)
5552 - __flush_tlb_one(address);
5553 - else
5554 - __flush_tlb_all();
5555 + int i;
5556 + for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size)
5557 + asm volatile("clflush (%0)" :: "r" (adr + i));
5558 }
5559
5560 +static void flush_kernel_map(void *arg)
5561 +{
5562 + struct list_head *l = (struct list_head *)arg;
5563 + struct page *pg;
5564
5565 -static inline void flush_map(unsigned long address)
5566 + /* When clflush is available always use it because it is
5567 + much cheaper than WBINVD */
5568 + if (!cpu_has_clflush)
5569 + asm volatile("wbinvd" ::: "memory");
5570 + list_for_each_entry(pg, l, lru) {
5571 + void *adr = page_address(pg);
5572 + if (cpu_has_clflush)
5573 + cache_flush_page(adr);
5574 + __flush_tlb_one(adr);
5575 + }
5576 +}
5577 +
5578 +static inline void flush_map(struct list_head *l)
5579 {
5580 - on_each_cpu(flush_kernel_map, (void *)address, 1, 1);
5581 + on_each_cpu(flush_kernel_map, l, 1, 1);
5582 }
5583
5584 -static struct page *deferred_pages; /* protected by init_mm.mmap_sem */
5585 +static LIST_HEAD(deferred_pages); /* protected by init_mm.mmap_sem */
5586
5587 static inline void save_page(struct page *fpage)
5588 {
5589 - fpage->lru.next = (struct list_head *)deferred_pages;
5590 - deferred_pages = fpage;
5591 + list_add(&fpage->lru, &deferred_pages);
5592 }
5593
5594 /*
5595 @@ -481,18 +487,18 @@
5596
5597 void global_flush_tlb(void)
5598 {
5599 - struct page *dpage;
5600 + struct page *pg, *next;
5601 + struct list_head l;
5602
5603 down_read(&init_mm.mmap_sem);
5604 - dpage = xchg(&deferred_pages, NULL);
5605 + list_replace_init(&deferred_pages, &l);
5606 up_read(&init_mm.mmap_sem);
5607
5608 - flush_map((dpage && !dpage->lru.next) ? (unsigned long)page_address(dpage) : 0);
5609 - while (dpage) {
5610 - struct page *tmp = dpage;
5611 - dpage = (struct page *)dpage->lru.next;
5612 - ClearPagePrivate(tmp);
5613 - __free_page(tmp);
5614 + flush_map(&l);
5615 +
5616 + list_for_each_entry_safe(pg, next, &l, lru) {
5617 + ClearPagePrivate(pg);
5618 + __free_page(pg);
5619 }
5620 }
5621
5622 --- a/arch/x86/mm/pgtable_32-xen.c
5623 +++ b/arch/x86/mm/pgtable_32-xen.c
5624 @@ -197,7 +197,7 @@
5625 __free_page(pte);
5626 }
5627
5628 -void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags)
5629 +void pmd_ctor(void *pmd, struct kmem_cache *cache, unsigned long flags)
5630 {
5631 memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
5632 }
5633 @@ -237,7 +237,7 @@
5634 set_page_private(next, (unsigned long)pprev);
5635 }
5636
5637 -void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
5638 +void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused)
5639 {
5640 unsigned long flags;
5641
5642 @@ -258,7 +258,7 @@
5643 }
5644
5645 /* never called when PTRS_PER_PMD > 1 */
5646 -void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused)
5647 +void pgd_dtor(void *pgd, struct kmem_cache *cache, unsigned long unused)
5648 {
5649 unsigned long flags; /* can be called from interrupt context */
5650
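
[Editorial note, not part of the patch: these pgtable_32 hunks track the 2.6.20 removal of the kmem_cache_t typedef — cache pointers and constructor/destructor prototypes now spell out struct kmem_cache. A hedged sketch of the pattern with hypothetical names:

        #include <linux/slab.h>
        #include <linux/string.h>
        #include <linux/errno.h>
        #include <linux/init.h>

        static struct kmem_cache *example_cachep;       /* was: kmem_cache_t * */

        /* 2.6.20-era constructor signature: (object, cache, flags) */
        static void example_ctor(void *obj, struct kmem_cache *cache, unsigned long flags)
        {
                memset(obj, 0, 64);
        }

        static int __init example_cache_init(void)
        {
                example_cachep = kmem_cache_create("example_cache", 64, 0, 0,
                                                   example_ctor, NULL);
                return example_cachep ? 0 : -ENOMEM;
        }
]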
5651 --- a/arch/x86/pci/irq-xen.c
5652 +++ b/arch/x86/pci/irq-xen.c
5653 @@ -768,7 +768,7 @@
5654 DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n",
5655 rt->rtr_vendor, rt->rtr_device);
5656
5657 - pirq_router_dev = pci_find_slot(rt->rtr_bus, rt->rtr_devfn);
5658 + pirq_router_dev = pci_get_bus_and_slot(rt->rtr_bus, rt->rtr_devfn);
5659 if (!pirq_router_dev) {
5660 DBG(KERN_DEBUG "PCI: Interrupt router not found at "
5661 "%02x:%02x\n", rt->rtr_bus, rt->rtr_devfn);
5662 @@ -788,6 +788,8 @@
5663 pirq_router_dev->vendor,
5664 pirq_router_dev->device,
5665 pci_name(pirq_router_dev));
5666 +
5667 + /* The device remains referenced for the kernel lifetime */
5668 }
5669
5670 static struct irq_info *pirq_get_info(struct pci_dev *dev)
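
[Editorial note, not part of the patch: pci_find_slot() is replaced above by pci_get_bus_and_slot(), which returns a referenced struct pci_dev; the IRQ router device is deliberately kept referenced for the kernel's lifetime, as the added comment says. A caller that only needs the device temporarily would drop the reference, roughly as sketched here (bus/slot numbers are illustrative):

        #include <linux/pci.h>
        #include <linux/errno.h>

        /* Sketch: look up device 1, function 0 on bus 0 and release it when done. */
        static int example_poke_device(void)
        {
                struct pci_dev *dev = pci_get_bus_and_slot(0, PCI_DEVFN(1, 0));

                if (!dev)
                        return -ENODEV;
                /* ... use dev ... */
                pci_dev_put(dev);       /* drop the reference taken by the lookup */
                return 0;
        }
]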
5671 --- a/drivers/xen/balloon/balloon.c
5672 +++ b/drivers/xen/balloon/balloon.c
5673 @@ -97,8 +97,8 @@
5674 static LIST_HEAD(ballooned_pages);
5675
5676 /* Main work function, always executed in process context. */
5677 -static void balloon_process(void *unused);
5678 -static DECLARE_WORK(balloon_worker, balloon_process, NULL);
5679 +static void balloon_process(struct work_struct *unused);
5680 +static DECLARE_WORK(balloon_worker, balloon_process);
5681 static struct timer_list balloon_timer;
5682
5683 /* When ballooning out (allocating memory to return to Xen) we don't really
5684 @@ -387,7 +387,7 @@
5685 * by the balloon lock), or with changes to the Xen hard limit, but we will
5686 * recover from these in time.
5687 */
5688 -static void balloon_process(void *unused)
5689 +static void balloon_process(struct work_struct *unused)
5690 {
5691 int need_sleep = 0;
5692 long credit;
5693 --- a/drivers/xen/blkback/blkback.c
5694 +++ b/drivers/xen/blkback/blkback.c
5695 @@ -37,6 +37,7 @@
5696
5697 #include <linux/spinlock.h>
5698 #include <linux/kthread.h>
5699 +#include <linux/freezer.h>
5700 #include <linux/list.h>
5701 #include <linux/delay.h>
5702 #include <xen/balloon.h>
5703 --- a/drivers/xen/blkback/interface.c
5704 +++ b/drivers/xen/blkback/interface.c
5705 @@ -34,7 +34,7 @@
5706 #include <xen/evtchn.h>
5707 #include <linux/kthread.h>
5708
5709 -static kmem_cache_t *blkif_cachep;
5710 +static struct kmem_cache *blkif_cachep;
5711
5712 blkif_t *blkif_alloc(domid_t domid)
5713 {
5714 --- a/drivers/xen/blkfront/blkfront.c
5715 +++ b/drivers/xen/blkfront/blkfront.c
5716 @@ -70,7 +70,7 @@
5717 static void kick_pending_request_queues(struct blkfront_info *);
5718
5719 static irqreturn_t blkif_int(int irq, void *dev_id);
5720 -static void blkif_restart_queue(void *arg);
5721 +static void blkif_restart_queue(struct work_struct *arg);
5722 static void blkif_recover(struct blkfront_info *);
5723 static void blkif_completion(struct blk_shadow *);
5724 static void blkif_free(struct blkfront_info *, int);
5725 @@ -105,7 +105,7 @@
5726 info->xbdev = dev;
5727 info->vdevice = vdevice;
5728 info->connected = BLKIF_STATE_DISCONNECTED;
5729 - INIT_WORK(&info->work, blkif_restart_queue, (void *)info);
5730 + INIT_WORK(&info->work, blkif_restart_queue);
5731
5732 for (i = 0; i < BLK_RING_SIZE; i++)
5733 info->shadow[i].req.id = i+1;
5734 @@ -445,9 +445,9 @@
5735 }
5736 }
5737
5738 -static void blkif_restart_queue(void *arg)
5739 +static void blkif_restart_queue(struct work_struct *arg)
5740 {
5741 - struct blkfront_info *info = (struct blkfront_info *)arg;
5742 + struct blkfront_info *info = container_of(arg, struct blkfront_info, work);
5743 spin_lock_irq(&blkif_io_lock);
5744 if (info->connected == BLKIF_STATE_CONNECTED)
5745 kick_pending_request_queues(info);
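
[Editorial note, not part of the patch: the balloon, blkfront and later driver hunks follow the 2.6.20 workqueue rework — work functions now receive the struct work_struct pointer itself, INIT_WORK()/DECLARE_WORK() drop their data argument, and the containing object is recovered with container_of(). A minimal sketch of the pattern, with hypothetical names:

        #include <linux/workqueue.h>

        struct example_info {
                int pending;
                struct work_struct work;
        };

        /* New-style handler: the work_struct, not a void *, is passed in. */
        static void example_restart(struct work_struct *work)
        {
                struct example_info *info = container_of(work, struct example_info, work);

                info->pending = 0;
        }

        static void example_setup(struct example_info *info)
        {
                INIT_WORK(&info->work, example_restart);        /* no data argument */
                schedule_work(&info->work);
        }
]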
5746 --- a/drivers/xen/blktap/blktap.c
5747 +++ b/drivers/xen/blktap/blktap.c
5748 @@ -40,6 +40,7 @@
5749
5750 #include <linux/spinlock.h>
5751 #include <linux/kthread.h>
5752 +#include <linux/freezer.h>
5753 #include <linux/list.h>
5754 #include <asm/hypervisor.h>
5755 #include "common.h"
5756 --- a/drivers/xen/blktap/interface.c
5757 +++ b/drivers/xen/blktap/interface.c
5758 @@ -34,7 +34,7 @@
5759 #include "common.h"
5760 #include <xen/evtchn.h>
5761
5762 -static kmem_cache_t *blkif_cachep;
5763 +static struct kmem_cache *blkif_cachep;
5764
5765 blkif_t *tap_alloc_blkif(domid_t domid)
5766 {
5767 --- a/drivers/xen/char/mem.c
5768 +++ b/drivers/xen/char/mem.c
5769 @@ -157,7 +157,7 @@
5770 {
5771 loff_t ret;
5772
5773 - mutex_lock(&file->f_dentry->d_inode->i_mutex);
5774 + mutex_lock(&file->f_path.dentry->d_inode->i_mutex);
5775 switch (orig) {
5776 case 0:
5777 file->f_pos = offset;
5778 @@ -172,7 +172,7 @@
5779 default:
5780 ret = -EINVAL;
5781 }
5782 - mutex_unlock(&file->f_dentry->d_inode->i_mutex);
5783 + mutex_unlock(&file->f_path.dentry->d_inode->i_mutex);
5784 return ret;
5785 }
5786
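
[Editorial note, not part of the patch: in 2.6.20 struct file embeds a struct path, so f_dentry/f_vfsmnt accesses become f_path.dentry/f_path.mnt, as in the mem.c llseek hunk above. A hedged sketch of the same f_pos locking pattern, with a hypothetical function name:

        #include <linux/fs.h>
        #include <linux/mutex.h>

        /* Sketch: serialize an f_pos update against the backing inode. */
        static loff_t example_llseek(struct file *file, loff_t offset)
        {
                struct inode *inode = file->f_path.dentry->d_inode;    /* was file->f_dentry */

                mutex_lock(&inode->i_mutex);
                file->f_pos = offset;
                mutex_unlock(&inode->i_mutex);
                return offset;
        }
]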
5787 --- a/drivers/xen/console/console.c
5788 +++ b/drivers/xen/console/console.c
5789 @@ -80,11 +80,6 @@
5790 #define XEN_XVC_MAJOR 204
5791 #define XEN_XVC_MINOR 191
5792
5793 -#ifdef CONFIG_MAGIC_SYSRQ
5794 -static unsigned long sysrq_requested;
5795 -extern int sysrq_enabled;
5796 -#endif
5797 -
5798 static int __init xencons_setup(char *str)
5799 {
5800 char *q;
5801 @@ -339,8 +334,8 @@
5802 #define DUMMY_TTY(_tty) ((xc_mode == XC_TTY) && \
5803 ((_tty)->index != (xc_num - 1)))
5804
5805 -static struct termios *xencons_termios[MAX_NR_CONSOLES];
5806 -static struct termios *xencons_termios_locked[MAX_NR_CONSOLES];
5807 +static struct ktermios *xencons_termios[MAX_NR_CONSOLES];
5808 +static struct ktermios *xencons_termios_locked[MAX_NR_CONSOLES];
5809 static struct tty_struct *xencons_tty;
5810 static int xencons_priv_irq;
5811 static char x_char;
5812 @@ -356,7 +351,9 @@
5813
5814 for (i = 0; i < len; i++) {
5815 #ifdef CONFIG_MAGIC_SYSRQ
5816 - if (sysrq_enabled) {
5817 + if (sysrq_on()) {
5818 + static unsigned long sysrq_requested;
5819 +
5820 if (buf[i] == '\x0f') { /* ^O */
5821 if (!sysrq_requested) {
5822 sysrq_requested = jiffies;
5823 --- a/drivers/xen/core/reboot.c
5824 +++ b/drivers/xen/core/reboot.c
5825 @@ -30,8 +30,8 @@
5826 /* Can we leave APs online when we suspend? */
5827 static int fast_suspend;
5828
5829 -static void __shutdown_handler(void *unused);
5830 -static DECLARE_WORK(shutdown_work, __shutdown_handler, NULL);
5831 +static void __shutdown_handler(struct work_struct *unused);
5832 +static DECLARE_DELAYED_WORK(shutdown_work, __shutdown_handler);
5833
5834 int __xen_suspend(int fast_suspend, void (*resume_notifier)(void));
5835
5836 @@ -96,7 +96,7 @@
5837 case SHUTDOWN_RESUMING:
5838 break;
5839 default:
5840 - schedule_work(&shutdown_work);
5841 + schedule_delayed_work(&shutdown_work, 0);
5842 break;
5843 }
5844
5845 @@ -108,7 +108,7 @@
5846 return 0;
5847 }
5848
5849 -static void __shutdown_handler(void *unused)
5850 +static void __shutdown_handler(struct work_struct *unused)
5851 {
5852 int err;
5853
5854 @@ -169,7 +169,7 @@
5855 if (new_state != SHUTDOWN_INVALID) {
5856 old_state = xchg(&shutting_down, new_state);
5857 if (old_state == SHUTDOWN_INVALID)
5858 - schedule_work(&shutdown_work);
5859 + schedule_delayed_work(&shutdown_work, 0);
5860 else
5861 BUG_ON(old_state != SHUTDOWN_RESUMING);
5862 }
5863 --- a/drivers/xen/core/smpboot.c
5864 +++ b/drivers/xen/core/smpboot.c
5865 @@ -165,7 +165,12 @@
5866
5867 void __cpuinit cpu_bringup(void)
5868 {
5869 +#ifdef __i386__
5870 + cpu_set_gdt(current_thread_info()->cpu);
5871 + secondary_cpu_init();
5872 +#else
5873 cpu_init();
5874 +#endif
5875 identify_cpu(cpu_data + smp_processor_id());
5876 touch_softlockup_watchdog();
5877 preempt_disable();
5878 @@ -304,11 +309,12 @@
5879 if (cpu == 0)
5880 continue;
5881
5882 + idle = fork_idle(cpu);
5883 + if (IS_ERR(idle))
5884 + panic("failed fork for CPU %d", cpu);
5885 +
5886 #ifdef __x86_64__
5887 gdt_descr = &cpu_gdt_descr[cpu];
5888 -#else
5889 - gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
5890 -#endif
5891 gdt_descr->address = get_zeroed_page(GFP_KERNEL);
5892 if (unlikely(!gdt_descr->address)) {
5893 printk(KERN_CRIT "CPU%d failed to allocate GDT\n",
5894 @@ -317,6 +323,11 @@
5895 }
5896 gdt_descr->size = GDT_SIZE;
5897 memcpy((void *)gdt_descr->address, cpu_gdt_table, GDT_SIZE);
5898 +#else
5899 + if (unlikely(!init_gdt(cpu, idle)))
5900 + continue;
5901 + gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
5902 +#endif
5903 make_page_readonly(
5904 (void *)gdt_descr->address,
5905 XENFEAT_writable_descriptor_tables);
5906 @@ -336,10 +347,6 @@
5907 cpu_2_logical_apicid[cpu] = apicid;
5908 x86_cpu_to_apicid[cpu] = apicid;
5909
5910 - idle = fork_idle(cpu);
5911 - if (IS_ERR(idle))
5912 - panic("failed fork for CPU %d", cpu);
5913 -
5914 #ifdef __x86_64__
5915 cpu_pda(cpu)->pcurrent = idle;
5916 cpu_pda(cpu)->cpunumber = cpu;
5917 --- a/drivers/xen/fbfront/xenfb.c
5918 +++ b/drivers/xen/fbfront/xenfb.c
5919 @@ -25,6 +25,7 @@
5920 #include <linux/vmalloc.h>
5921 #include <linux/mm.h>
5922 #include <linux/mutex.h>
5923 +#include <linux/freezer.h>
5924 #include <asm/hypervisor.h>
5925 #include <xen/evtchn.h>
5926 #include <xen/interface/io/fbif.h>
5927 --- a/drivers/xen/netback/loopback.c
5928 +++ b/drivers/xen/netback/loopback.c
5929 @@ -54,6 +54,7 @@
5930 #include <net/dst.h>
5931 #include <net/xfrm.h> /* secpath_reset() */
5932 #include <asm/hypervisor.h> /* is_initial_xendomain() */
5933 +#include <../net/core/kmap_skb.h> /* k{,un}map_skb_frag() */
5934
5935 static int nloopbacks = -1;
5936 module_param(nloopbacks, int, 0);
5937 --- a/drivers/xen/pciback/conf_space_header.c
5938 +++ b/drivers/xen/pciback/conf_space_header.c
5939 @@ -22,14 +22,14 @@
5940 {
5941 int err;
5942
5943 - if (!dev->is_enabled && is_enable_cmd(value)) {
5944 + if (!atomic_read(&dev->enable_cnt) && is_enable_cmd(value)) {
5945 if (unlikely(verbose_request))
5946 printk(KERN_DEBUG "pciback: %s: enable\n",
5947 pci_name(dev));
5948 err = pci_enable_device(dev);
5949 if (err)
5950 return err;
5951 - } else if (dev->is_enabled && !is_enable_cmd(value)) {
5952 + } else if (atomic_read(&dev->enable_cnt) && !is_enable_cmd(value)) {
5953 if (unlikely(verbose_request))
5954 printk(KERN_DEBUG "pciback: %s: disable\n",
5955 pci_name(dev));
5956 --- a/drivers/xen/pciback/pciback.h
5957 +++ b/drivers/xen/pciback/pciback.h
5958 @@ -88,7 +88,7 @@
5959
5960 /* Handles events from front-end */
5961 irqreturn_t pciback_handle_event(int irq, void *dev_id);
5962 -void pciback_do_op(void *data);
5963 +void pciback_do_op(struct work_struct *work);
5964
5965 int pciback_xenbus_register(void);
5966 void pciback_xenbus_unregister(void);
5967 --- a/drivers/xen/pciback/pciback_ops.c
5968 +++ b/drivers/xen/pciback/pciback_ops.c
5969 @@ -25,7 +25,7 @@
5970
5971 pci_write_config_word(dev, PCI_COMMAND, 0);
5972
5973 - dev->is_enabled = 0;
5974 + atomic_set(&dev->enable_cnt, 0);
5975 dev->is_busmaster = 0;
5976 } else {
5977 pci_read_config_word(dev, PCI_COMMAND, &cmd);
5978 @@ -51,9 +51,9 @@
5979 * context because some of the pci_* functions can sleep (mostly due to ACPI
5980 * use of semaphores). This function is intended to be called from a work
5981 * queue in process context taking a struct pciback_device as a parameter */
5982 -void pciback_do_op(void *data)
5983 +void pciback_do_op(struct work_struct *work)
5984 {
5985 - struct pciback_device *pdev = data;
5986 + struct pciback_device *pdev = container_of(work, struct pciback_device, op_work);
5987 struct pci_dev *dev;
5988 struct xen_pci_op *op = &pdev->sh_info->op;
5989
5990 --- a/drivers/xen/pciback/xenbus.c
5991 +++ b/drivers/xen/pciback/xenbus.c
5992 @@ -32,7 +32,7 @@
5993 pdev->evtchn_irq = INVALID_EVTCHN_IRQ;
5994 pdev->be_watching = 0;
5995
5996 - INIT_WORK(&pdev->op_work, pciback_do_op, pdev);
5997 + INIT_WORK(&pdev->op_work, pciback_do_op);
5998
5999 if (pciback_init_devices(pdev)) {
6000 kfree(pdev);
6001 @@ -53,7 +53,6 @@
6002
6003 /* If the driver domain started an op, make sure we complete it or
6004 * delete it before releasing the shared memory */
6005 - cancel_delayed_work(&pdev->op_work);
6006 flush_scheduled_work();
6007
6008 if (pdev->sh_info)
6009 --- a/drivers/xen/sfc_netfront/accel_vi.c
6010 +++ b/drivers/xen/sfc_netfront/accel_vi.c
6011 @@ -463,7 +463,7 @@
6012
6013 if (skb->ip_summed == CHECKSUM_PARTIAL) {
6014 /* Set to zero to encourage falcon to work it out for us */
6015 - *(u16*)(skb->h.raw + skb->csum) = 0;
6016 + *(u16*)(skb->h.raw + skb->csum_offset) = 0;
6017 }
6018
6019 if (multi_post_start_new_buffer(vnic, &state)) {
6020 @@ -582,7 +582,7 @@
6021
6022 if (skb->ip_summed == CHECKSUM_PARTIAL) {
6023 /* Set to zero to encourage falcon to work it out for us */
6024 - *(u16*)(skb->h.raw + skb->csum) = 0;
6025 + *(u16*)(skb->h.raw + skb->csum_offset) = 0;
6026 }
6027 NETFRONT_ACCEL_PKTBUFF_FOR_EACH_FRAGMENT
6028 (skb, idx, frag_data, frag_len, {
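
[Editorial note, not part of the patch: for CHECKSUM_PARTIAL packets 2.6.20 splits the checksum bookkeeping so that skb->csum_offset holds the offset of the checksum field within the transport header, which is why the accel_vi.c hunks above (and the net/core/dev.c hunks at the end of this patch) switch from skb->csum to skb->csum_offset. A rough sketch of the driver-side use, assuming the 2.6.20 skb layout used in this patch:

        #include <linux/skbuff.h>

        /* Sketch: zero the checksum field of a CHECKSUM_PARTIAL skb so the
         * hardware can fill it in, mirroring the accel_vi.c change above. */
        static void example_clear_csum(struct sk_buff *skb)
        {
                if (skb->ip_summed == CHECKSUM_PARTIAL)
                        *(u16 *)(skb->h.raw + skb->csum_offset) = 0;
        }
]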
6029 --- a/drivers/xen/tpmback/interface.c
6030 +++ b/drivers/xen/tpmback/interface.c
6031 @@ -15,7 +15,7 @@
6032 #include <xen/balloon.h>
6033 #include <xen/gnttab.h>
6034
6035 -static kmem_cache_t *tpmif_cachep;
6036 +static struct kmem_cache *tpmif_cachep;
6037 int num_frontends = 0;
6038
6039 LIST_HEAD(tpmif_list);
6040 --- a/drivers/xen/xenbus/xenbus_comms.c
6041 +++ b/drivers/xen/xenbus/xenbus_comms.c
6042 @@ -49,9 +49,9 @@
6043
6044 static int xenbus_irq;
6045
6046 -extern void xenbus_probe(void *);
6047 +extern void xenbus_probe(struct work_struct *);
6048 extern int xenstored_ready;
6049 -static DECLARE_WORK(probe_work, xenbus_probe, NULL);
6050 +static DECLARE_WORK(probe_work, xenbus_probe);
6051
6052 static DECLARE_WAIT_QUEUE_HEAD(xb_waitq);
6053
6054 --- a/drivers/xen/xenbus/xenbus_probe.c
6055 +++ b/drivers/xen/xenbus/xenbus_probe.c
6056 @@ -840,7 +840,7 @@
6057 EXPORT_SYMBOL_GPL(unregister_xenstore_notifier);
6058
6059
6060 -void xenbus_probe(void *unused)
6061 +void xenbus_probe(struct work_struct *unused)
6062 {
6063 BUG_ON((xenstored_ready <= 0));
6064
6065 --- a/include/asm-x86/mach-xen/asm/desc_32.h
6066 +++ b/include/asm-x86/mach-xen/asm/desc_32.h
6067 @@ -4,8 +4,6 @@
6068 #include <asm/ldt.h>
6069 #include <asm/segment.h>
6070
6071 -#define CPU_16BIT_STACK_SIZE 1024
6072 -
6073 #ifndef __ASSEMBLY__
6074
6075 #include <linux/preempt.h>
6076 @@ -15,8 +13,6 @@
6077
6078 extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
6079
6080 -DECLARE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
6081 -
6082 struct Xgt_desc_struct {
6083 unsigned short size;
6084 unsigned long address __attribute__((packed));
6085 @@ -32,11 +28,6 @@
6086 return (struct desc_struct *)per_cpu(cpu_gdt_descr, cpu).address;
6087 }
6088
6089 -/*
6090 - * This is the ldt that every process will get unless we need
6091 - * something other than this.
6092 - */
6093 -extern struct desc_struct default_ldt[];
6094 extern struct desc_struct idt_table[];
6095 extern void set_intr_gate(unsigned int irq, void * addr);
6096
6097 @@ -63,8 +54,8 @@
6098 #define DESCTYPE_DPL3 0x60 /* DPL-3 */
6099 #define DESCTYPE_S 0x10 /* !system */
6100
6101 +#ifndef CONFIG_XEN
6102 #define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8))
6103 -#define load_LDT_desc() __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8))
6104
6105 #define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
6106 #define load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr))
6107 @@ -75,6 +66,7 @@
6108 #define store_idt(dtr) __asm__ ("sidt %0":"=m" (*dtr))
6109 #define store_tr(tr) __asm__ ("str %0":"=m" (tr))
6110 #define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt))
6111 +#endif
6112
6113 #if TLS_SIZE != 24
6114 # error update this code.
6115 @@ -90,22 +82,43 @@
6116 }
6117
6118 #ifndef CONFIG_XEN
6119 +#define write_ldt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
6120 +#define write_gdt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
6121 +#define write_idt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
6122 +
6123 static inline void write_dt_entry(void *dt, int entry, __u32 entry_a, __u32 entry_b)
6124 {
6125 __u32 *lp = (__u32 *)((char *)dt + entry*8);
6126 *lp = entry_a;
6127 *(lp+1) = entry_b;
6128 }
6129 -
6130 -#define write_ldt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
6131 -#define write_gdt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
6132 +#define set_ldt native_set_ldt
6133 #else
6134 extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b);
6135 extern int write_gdt_entry(void *gdt, int entry, __u32 entry_a, __u32 entry_b);
6136 +#define set_ldt xen_set_ldt
6137 +#endif
6138 +
6139 +#ifndef CONFIG_XEN
6140 +static inline fastcall void native_set_ldt(const void *addr,
6141 + unsigned int entries)
6142 +{
6143 + if (likely(entries == 0))
6144 + __asm__ __volatile__("lldt %w0"::"q" (0));
6145 + else {
6146 + unsigned cpu = smp_processor_id();
6147 + __u32 a, b;
6148 +
6149 + pack_descriptor(&a, &b, (unsigned long)addr,
6150 + entries * sizeof(struct desc_struct) - 1,
6151 + DESCTYPE_LDT, 0);
6152 + write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, a, b);
6153 + __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
6154 + }
6155 +}
6156 #endif
6157 -#ifndef CONFIG_X86_NO_IDT
6158 -#define write_idt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
6159
6160 +#ifndef CONFIG_X86_NO_IDT
6161 static inline void _set_gate(int gate, unsigned int type, void *addr, unsigned short seg)
6162 {
6163 __u32 a, b;
6164 @@ -125,14 +138,6 @@
6165 }
6166 #endif
6167
6168 -static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int entries)
6169 -{
6170 - __u32 a, b;
6171 - pack_descriptor(&a, &b, (unsigned long)addr,
6172 - entries * sizeof(struct desc_struct) - 1,
6173 - DESCTYPE_LDT, 0);
6174 - write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, a, b);
6175 -}
6176
6177 #define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
6178
6179 @@ -163,36 +168,22 @@
6180
6181 static inline void clear_LDT(void)
6182 {
6183 - int cpu = get_cpu();
6184 -
6185 - /*
6186 - * NB. We load the default_ldt for lcall7/27 handling on demand, as
6187 - * it slows down context switching. Noone uses it anyway.
6188 - */
6189 - cpu = cpu; /* XXX avoid compiler warning */
6190 - xen_set_ldt(NULL, 0);
6191 - put_cpu();
6192 + set_ldt(NULL, 0);
6193 }
6194
6195 /*
6196 * load one particular LDT into the current CPU
6197 */
6198 -static inline void load_LDT_nolock(mm_context_t *pc, int cpu)
6199 +static inline void load_LDT_nolock(mm_context_t *pc)
6200 {
6201 - void *segments = pc->ldt;
6202 - int count = pc->size;
6203 -
6204 - if (likely(!count))
6205 - segments = NULL;
6206 -
6207 - xen_set_ldt(segments, count);
6208 + set_ldt(pc->ldt, pc->size);
6209 }
6210
6211 static inline void load_LDT(mm_context_t *pc)
6212 {
6213 - int cpu = get_cpu();
6214 - load_LDT_nolock(pc, cpu);
6215 - put_cpu();
6216 + preempt_disable();
6217 + load_LDT_nolock(pc);
6218 + preempt_enable();
6219 }
6220
6221 static inline unsigned long get_desc_base(unsigned long *desc)
6222 @@ -204,6 +195,29 @@
6223 return base;
6224 }
6225
6226 +#else /* __ASSEMBLY__ */
6227 +
6228 +/*
6229 + * GET_DESC_BASE reads the descriptor base of the specified segment.
6230 + *
6231 + * Args:
6232 + * idx - descriptor index
6233 + * gdt - GDT pointer
6234 + * base - 32bit register to which the base will be written
6235 + * lo_w - lo word of the "base" register
6236 + * lo_b - lo byte of the "base" register
6237 + * hi_b - hi byte of the low word of the "base" register
6238 + *
6239 + * Example:
6240 + * GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah)
6241 + * Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax.
6242 + */
6243 +#define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \
6244 + movb idx*8+4(gdt), lo_b; \
6245 + movb idx*8+7(gdt), hi_b; \
6246 + shll $16, base; \
6247 + movw idx*8+2(gdt), lo_w;
6248 +
6249 #endif /* !__ASSEMBLY__ */
6250
6251 #endif
6252 --- a/include/asm-x86/mach-xen/asm/desc_64.h
6253 +++ b/include/asm-x86/mach-xen/asm/desc_64.h
6254 @@ -9,62 +9,11 @@
6255
6256 #include <linux/string.h>
6257 #include <linux/smp.h>
6258 +#include <asm/desc_defs.h>
6259
6260 #include <asm/segment.h>
6261 #include <asm/mmu.h>
6262
6263 -// 8 byte segment descriptor
6264 -struct desc_struct {
6265 - u16 limit0;
6266 - u16 base0;
6267 - unsigned base1 : 8, type : 4, s : 1, dpl : 2, p : 1;
6268 - unsigned limit : 4, avl : 1, l : 1, d : 1, g : 1, base2 : 8;
6269 -} __attribute__((packed));
6270 -
6271 -struct n_desc_struct {
6272 - unsigned int a,b;
6273 -};
6274 -
6275 -enum {
6276 - GATE_INTERRUPT = 0xE,
6277 - GATE_TRAP = 0xF,
6278 - GATE_CALL = 0xC,
6279 -};
6280 -
6281 -// 16byte gate
6282 -struct gate_struct {
6283 - u16 offset_low;
6284 - u16 segment;
6285 - unsigned ist : 3, zero0 : 5, type : 5, dpl : 2, p : 1;
6286 - u16 offset_middle;
6287 - u32 offset_high;
6288 - u32 zero1;
6289 -} __attribute__((packed));
6290 -
6291 -#define PTR_LOW(x) ((unsigned long)(x) & 0xFFFF)
6292 -#define PTR_MIDDLE(x) (((unsigned long)(x) >> 16) & 0xFFFF)
6293 -#define PTR_HIGH(x) ((unsigned long)(x) >> 32)
6294 -
6295 -enum {
6296 - DESC_TSS = 0x9,
6297 - DESC_LDT = 0x2,
6298 -};
6299 -
6300 -// LDT or TSS descriptor in the GDT. 16 bytes.
6301 -struct ldttss_desc {
6302 - u16 limit0;
6303 - u16 base0;
6304 - unsigned base1 : 8, type : 5, dpl : 2, p : 1;
6305 - unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
6306 - u32 base3;
6307 - u32 zero1;
6308 -} __attribute__((packed));
6309 -
6310 -struct desc_ptr {
6311 - unsigned short size;
6312 - unsigned long address;
6313 -} __attribute__((packed)) ;
6314 -
6315 extern struct desc_ptr idt_descr, cpu_gdt_descr[NR_CPUS];
6316
6317 extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
6318 --- a/include/asm-x86/mach-xen/asm/dma-mapping_32.h
6319 +++ b/include/asm-x86/mach-xen/asm/dma-mapping_32.h
6320 @@ -127,10 +127,10 @@
6321 return (1 << INTERNODE_CACHE_SHIFT);
6322 }
6323
6324 -#define dma_is_consistent(d) (1)
6325 +#define dma_is_consistent(d, h) (1)
6326
6327 static inline void
6328 -dma_cache_sync(void *vaddr, size_t size,
6329 +dma_cache_sync(struct device *dev, void *vaddr, size_t size,
6330 enum dma_data_direction direction)
6331 {
6332 flush_write_buffers();
6333 --- a/include/asm-x86/mach-xen/asm/dma-mapping_64.h
6334 +++ b/include/asm-x86/mach-xen/asm/dma-mapping_64.h
6335 @@ -64,6 +64,9 @@
6336 return (dma_addr == bad_dma_address);
6337 }
6338
6339 +#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
6340 +#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
6341 +
6342 extern void *dma_alloc_coherent(struct device *dev, size_t size,
6343 dma_addr_t *dma_handle, gfp_t gfp);
6344 extern void dma_free_coherent(struct device *dev, size_t size, void *vaddr,
6345 @@ -181,12 +184,13 @@
6346 return boot_cpu_data.x86_clflush_size;
6347 }
6348
6349 -#define dma_is_consistent(h) 1
6350 +#define dma_is_consistent(d, h) 1
6351
6352 extern int dma_set_mask(struct device *dev, u64 mask);
6353
6354 static inline void
6355 -dma_cache_sync(void *vaddr, size_t size, enum dma_data_direction dir)
6356 +dma_cache_sync(struct device *dev, void *vaddr, size_t size,
6357 + enum dma_data_direction dir)
6358 {
6359 flush_write_buffers();
6360 }
6361 --- a/include/asm-x86/mach-xen/asm/fixmap_32.h
6362 +++ b/include/asm-x86/mach-xen/asm/fixmap_32.h
6363 @@ -13,13 +13,16 @@
6364 #ifndef _ASM_FIXMAP_H
6365 #define _ASM_FIXMAP_H
6366
6367 -
6368 /* used by vmalloc.c, vsyscall.lds.S.
6369 *
6370 * Leave one empty page between vmalloc'ed areas and
6371 * the start of the fixmap.
6372 */
6373 extern unsigned long __FIXADDR_TOP;
6374 +#ifdef CONFIG_COMPAT_VDSO
6375 +#define FIXADDR_USER_START __fix_to_virt(FIX_VDSO)
6376 +#define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1)
6377 +#endif
6378
6379 #ifndef __ASSEMBLY__
6380 #include <linux/kernel.h>
6381 --- a/include/asm-x86/mach-xen/asm/hypervisor.h
6382 +++ b/include/asm-x86/mach-xen/asm/hypervisor.h
6383 @@ -45,15 +45,6 @@
6384 #include <xen/interface/nmi.h>
6385 #include <asm/ptrace.h>
6386 #include <asm/page.h>
6387 -#if defined(__i386__)
6388 -# ifdef CONFIG_X86_PAE
6389 -# include <asm-generic/pgtable-nopud.h>
6390 -# else
6391 -# include <asm-generic/pgtable-nopmd.h>
6392 -# endif
6393 -#elif defined(__x86_64__) && LINUX_VERSION_CODE < KERNEL_VERSION(2,6,11)
6394 -# include <asm-generic/pgtable-nopud.h>
6395 -#endif
6396
6397 extern shared_info_t *HYPERVISOR_shared_info;
6398
6399 --- a/include/asm-x86/mach-xen/asm/io_32.h
6400 +++ b/include/asm-x86/mach-xen/asm/io_32.h
6401 @@ -269,11 +269,7 @@
6402
6403 #endif /* __KERNEL__ */
6404
6405 -#ifdef SLOW_IO_BY_JUMPING
6406 -#define __SLOW_DOWN_IO "jmp 1f; 1: jmp 1f; 1:"
6407 -#else
6408 #define __SLOW_DOWN_IO "outb %%al,$0x80;"
6409 -#endif
6410
6411 static inline void slow_down_io(void) {
6412 __asm__ __volatile__(
6413 --- a/include/asm-x86/mach-xen/asm/irqflags_32.h
6414 +++ b/include/asm-x86/mach-xen/asm/irqflags_32.h
6415 @@ -22,9 +22,6 @@
6416
6417 #define __raw_local_save_flags() (current_vcpu_info()->evtchn_upcall_mask)
6418
6419 -#define raw_local_save_flags(flags) \
6420 - do { (flags) = __raw_local_save_flags(); } while (0)
6421 -
6422 #define raw_local_irq_restore(x) \
6423 do { \
6424 vcpu_info_t *_vcpu; \
6425 @@ -66,18 +63,6 @@
6426 */
6427 void halt(void);
6428
6429 -static inline int raw_irqs_disabled_flags(unsigned long flags)
6430 -{
6431 - return (flags != 0);
6432 -}
6433 -
6434 -#define raw_irqs_disabled() \
6435 -({ \
6436 - unsigned long flags = __raw_local_save_flags(); \
6437 - \
6438 - raw_irqs_disabled_flags(flags); \
6439 -})
6440 -
6441 /*
6442 * For spinlocks, etc:
6443 */
6444 @@ -90,9 +75,62 @@
6445 flags; \
6446 })
6447
6448 +#else
6449 +/* Offsets into shared_info_t. */
6450 +#define evtchn_upcall_pending /* 0 */
6451 +#define evtchn_upcall_mask 1
6452 +
6453 +#define sizeof_vcpu_shift 6
6454 +
6455 +#ifdef CONFIG_SMP
6456 +#define GET_VCPU_INFO movl TI_cpu(%ebp),%esi ; \
6457 + shl $sizeof_vcpu_shift,%esi ; \
6458 + addl HYPERVISOR_shared_info,%esi
6459 +#else
6460 +#define GET_VCPU_INFO movl HYPERVISOR_shared_info,%esi
6461 +#endif
6462 +
6463 +#define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(%esi)
6464 +#define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(%esi)
6465 +#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(%esi)
6466 +#define DISABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \
6467 + __DISABLE_INTERRUPTS
6468 +#define ENABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \
6469 + __ENABLE_INTERRUPTS
6470 +#define ENABLE_INTERRUPTS_SYSEXIT __ENABLE_INTERRUPTS ; \
6471 +sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ ; \
6472 + __TEST_PENDING ; \
6473 + jnz 14f /* process more events if necessary... */ ; \
6474 + movl PT_ESI(%esp), %esi ; \
6475 + sysexit ; \
6476 +14: __DISABLE_INTERRUPTS ; \
6477 + TRACE_IRQS_OFF ; \
6478 +sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/ ; \
6479 + push %esp ; \
6480 + call evtchn_do_upcall ; \
6481 + add $4,%esp ; \
6482 + jmp ret_from_intr
6483 +#define INTERRUPT_RETURN iret
6484 +#endif /* __ASSEMBLY__ */
6485 +
6486 +#ifndef __ASSEMBLY__
6487 +#define raw_local_save_flags(flags) \
6488 + do { (flags) = __raw_local_save_flags(); } while (0)
6489 +
6490 #define raw_local_irq_save(flags) \
6491 do { (flags) = __raw_local_irq_save(); } while (0)
6492
6493 +static inline int raw_irqs_disabled_flags(unsigned long flags)
6494 +{
6495 + return (flags != 0);
6496 +}
6497 +
6498 +#define raw_irqs_disabled() \
6499 +({ \
6500 + unsigned long flags = __raw_local_save_flags(); \
6501 + \
6502 + raw_irqs_disabled_flags(flags); \
6503 +})
6504 #endif /* __ASSEMBLY__ */
6505
6506 /*
6507 --- a/include/asm-x86/mach-xen/asm/mmu_context_32.h
6508 +++ b/include/asm-x86/mach-xen/asm/mmu_context_32.h
6509 @@ -27,14 +27,13 @@
6510 static inline void __prepare_arch_switch(void)
6511 {
6512 /*
6513 - * Save away %fs and %gs. No need to save %es and %ds, as those
6514 - * are always kernel segments while inside the kernel. Must
6515 - * happen before reload of cr3/ldt (i.e., not in __switch_to).
6516 + * Save away %fs. No need to save %gs, as it was saved on the
6517 + * stack on entry. No need to save %es and %ds, as those are
6518 + * always kernel segments while inside the kernel.
6519 */
6520 - asm volatile ( "mov %%fs,%0 ; mov %%gs,%1"
6521 - : "=m" (current->thread.fs),
6522 - "=m" (current->thread.gs));
6523 - asm volatile ( "movl %0,%%fs ; movl %0,%%gs"
6524 + asm volatile ( "mov %%fs,%0"
6525 + : "=m" (current->thread.fs));
6526 + asm volatile ( "movl %0,%%fs"
6527 : : "r" (0) );
6528 }
6529
6530 @@ -89,14 +88,14 @@
6531 * tlb flush IPI delivery. We must reload %cr3.
6532 */
6533 load_cr3(next->pgd);
6534 - load_LDT_nolock(&next->context, cpu);
6535 + load_LDT_nolock(&next->context);
6536 }
6537 }
6538 #endif
6539 }
6540
6541 -#define deactivate_mm(tsk, mm) \
6542 - asm("movl %0,%%fs ; movl %0,%%gs": :"r" (0))
6543 +#define deactivate_mm(tsk, mm) \
6544 + asm("movl %0,%%fs": :"r" (0));
6545
6546 static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
6547 {
6548 --- a/include/asm-x86/mach-xen/asm/pgtable-2level.h
6549 +++ b/include/asm-x86/mach-xen/asm/pgtable-2level.h
6550 @@ -1,8 +1,6 @@
6551 #ifndef _I386_PGTABLE_2LEVEL_H
6552 #define _I386_PGTABLE_2LEVEL_H
6553
6554 -#include <asm-generic/pgtable-nopmd.h>
6555 -
6556 #define pte_ERROR(e) \
6557 printk("%s:%d: bad pte %08lx (pfn %05lx).\n", __FILE__, __LINE__, \
6558 __pte_val(e), pte_pfn(e))
6559 @@ -23,26 +21,14 @@
6560 set_pte((ptep), (pteval)); \
6561 } while (0)
6562
6563 -#define set_pte_atomic(pteptr, pteval) set_pte(pteptr,pteval)
6564 -
6565 #define set_pmd(pmdptr, pmdval) xen_l2_entry_update((pmdptr), (pmdval))
6566
6567 +#define set_pte_atomic(pteptr, pteval) set_pte(pteptr,pteval)
6568 +
6569 #define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
6570 #define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
6571
6572 -#define pte_none(x) (!(x).pte_low)
6573 -
6574 -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
6575 -static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
6576 -{
6577 - pte_t pte = *ptep;
6578 - if (!pte_none(pte)) {
6579 - if ((mm != &init_mm) ||
6580 - HYPERVISOR_update_va_mapping(addr, __pte(0), 0))
6581 - pte = __pte_ma(xchg(&ptep->pte_low, 0));
6582 - }
6583 - return pte;
6584 -}
6585 +#define raw_ptep_get_and_clear(xp, pte) __pte_ma(xchg(&(xp)->pte_low, 0))
6586
6587 #define __HAVE_ARCH_PTEP_CLEAR_FLUSH
6588 #define ptep_clear_flush(vma, addr, ptep) \
6589 @@ -69,6 +55,7 @@
6590 __pte_mfn(_pte))
6591
6592 #define pte_page(_pte) pfn_to_page(pte_pfn(_pte))
6593 +#define pte_none(x) (!(x).pte_low)
6594
6595 #define pfn_pte(pfn, prot) __pte(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
6596 #define pfn_pmd(pfn, prot) __pmd(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
6597 --- a/include/asm-x86/mach-xen/asm/pgtable-3level.h
6598 +++ b/include/asm-x86/mach-xen/asm/pgtable-3level.h
6599 @@ -1,8 +1,6 @@
6600 #ifndef _I386_PGTABLE_3LEVEL_H
6601 #define _I386_PGTABLE_3LEVEL_H
6602
6603 -#include <asm-generic/pgtable-nopud.h>
6604 -
6605 /*
6606 * Intel Physical Address Extension (PAE) Mode - three-level page
6607 * tables on PPro+ CPUs.
6608 @@ -75,6 +73,23 @@
6609 xen_l3_entry_update((pudptr), (pudval))
6610
6611 /*
6612 + * For PTEs and PDEs, we must clear the P-bit first when clearing a page table
6613 + * entry, so clear the bottom half first and enforce ordering with a compiler
6614 + * barrier.
6615 + */
6616 +static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
6617 +{
6618 + if ((mm != current->mm && mm != &init_mm)
6619 + || HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) {
6620 + ptep->pte_low = 0;
6621 + smp_wmb();
6622 + ptep->pte_high = 0;
6623 + }
6624 +}
6625 +
6626 +#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
6627 +
6628 +/*
6629 * Pentium-II erratum A13: in PAE mode we explicitly have to flush
6630 * the TLB via cr3 if the top-level pgd is changed...
6631 * We do not let the generic code free and clear pgd entries due to
6632 @@ -93,45 +108,16 @@
6633 #define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \
6634 pmd_index(address))
6635
6636 -static inline int pte_none(pte_t pte)
6637 -{
6638 - return !(pte.pte_low | pte.pte_high);
6639 -}
6640 -
6641 -/*
6642 - * For PTEs and PDEs, we must clear the P-bit first when clearing a page table
6643 - * entry, so clear the bottom half first and enforce ordering with a compiler
6644 - * barrier.
6645 - */
6646 -static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
6647 +static inline pte_t raw_ptep_get_and_clear(pte_t *ptep, pte_t res)
6648 {
6649 - if ((mm != current->mm && mm != &init_mm)
6650 - || HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) {
6651 - ptep->pte_low = 0;
6652 - smp_wmb();
6653 + uint64_t val = __pte_val(res);
6654 + if (__cmpxchg64(ptep, val, 0) != val) {
6655 + /* xchg acts as a barrier before the setting of the high bits */
6656 + res.pte_low = xchg(&ptep->pte_low, 0);
6657 + res.pte_high = ptep->pte_high;
6658 ptep->pte_high = 0;
6659 }
6660 -}
6661 -
6662 -#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
6663 -
6664 -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
6665 -static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
6666 -{
6667 - pte_t pte = *ptep;
6668 - if (!pte_none(pte)) {
6669 - if ((mm != &init_mm) ||
6670 - HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) {
6671 - uint64_t val = __pte_val(pte);
6672 - if (__cmpxchg64(ptep, val, 0) != val) {
6673 - /* xchg acts as a barrier before the setting of the high bits */
6674 - pte.pte_low = xchg(&ptep->pte_low, 0);
6675 - pte.pte_high = ptep->pte_high;
6676 - ptep->pte_high = 0;
6677 - }
6678 - }
6679 - }
6680 - return pte;
6681 + return res;
6682 }
6683
6684 #define __HAVE_ARCH_PTEP_CLEAR_FLUSH
6685 @@ -160,6 +146,11 @@
6686
6687 #define pte_page(x) pfn_to_page(pte_pfn(x))
6688
6689 +static inline int pte_none(pte_t pte)
6690 +{
6691 + return !(pte.pte_low | pte.pte_high);
6692 +}
6693 +
6694 #define __pte_mfn(_pte) (((_pte).pte_low >> PAGE_SHIFT) | \
6695 ((_pte).pte_high << (32-PAGE_SHIFT)))
6696 #define pte_mfn(_pte) ((_pte).pte_low & _PAGE_PRESENT ? \
6697 --- a/include/asm-x86/mach-xen/asm/pgtable_32.h
6698 +++ b/include/asm-x86/mach-xen/asm/pgtable_32.h
6699 @@ -38,14 +38,14 @@
6700 #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
6701 extern unsigned long empty_zero_page[1024];
6702 extern pgd_t *swapper_pg_dir;
6703 -extern kmem_cache_t *pgd_cache;
6704 -extern kmem_cache_t *pmd_cache;
6705 +extern struct kmem_cache *pgd_cache;
6706 +extern struct kmem_cache *pmd_cache;
6707 extern spinlock_t pgd_lock;
6708 extern struct page *pgd_list;
6709
6710 -void pmd_ctor(void *, kmem_cache_t *, unsigned long);
6711 -void pgd_ctor(void *, kmem_cache_t *, unsigned long);
6712 -void pgd_dtor(void *, kmem_cache_t *, unsigned long);
6713 +void pmd_ctor(void *, struct kmem_cache *, unsigned long);
6714 +void pgd_ctor(void *, struct kmem_cache *, unsigned long);
6715 +void pgd_dtor(void *, struct kmem_cache *, unsigned long);
6716 void pgtable_cache_init(void);
6717 void paging_init(void);
6718
6719 @@ -276,7 +276,6 @@
6720 #define pte_update(mm, addr, ptep) do { } while (0)
6721 #define pte_update_defer(mm, addr, ptep) do { } while (0)
6722
6723 -
6724 /*
6725 * We only update the dirty/accessed state if we set
6726 * the dirty bit by hand in the kernel, since the hardware
6727 @@ -342,6 +341,19 @@
6728 __young; \
6729 })
6730
6731 +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
6732 +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
6733 +{
6734 + pte_t pte = *ptep;
6735 + if (!pte_none(pte)
6736 + && (mm != &init_mm
6737 + || HYPERVISOR_update_va_mapping(addr, __pte(0), 0))) {
6738 + pte = raw_ptep_get_and_clear(ptep, pte);
6739 + pte_update(mm, addr, ptep);
6740 + }
6741 + return pte;
6742 +}
6743 +
6744 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
6745 #define ptep_get_and_clear_full(mm, addr, ptep, full) \
6746 ((full) ? ({ \
6747 --- a/include/asm-x86/mach-xen/asm/pgtable_64.h
6748 +++ b/include/asm-x86/mach-xen/asm/pgtable_64.h
6749 @@ -236,19 +236,18 @@
6750
6751 static inline unsigned long pgd_bad(pgd_t pgd)
6752 {
6753 - unsigned long val = __pgd_val(pgd);
6754 - val &= ~PTE_MASK;
6755 - val &= ~(_PAGE_USER | _PAGE_DIRTY);
6756 - return val & ~(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED);
6757 + return __pgd_val(pgd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
6758 }
6759
6760 -static inline unsigned long pud_bad(pud_t pud)
6761 -{
6762 - unsigned long val = __pud_val(pud);
6763 - val &= ~PTE_MASK;
6764 - val &= ~(_PAGE_USER | _PAGE_DIRTY);
6765 - return val & ~(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED);
6766 -}
6767 +static inline unsigned long pud_bad(pud_t pud)
6768 +{
6769 + return __pud_val(pud) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
6770 +}
6771 +
6772 +static inline unsigned long pmd_bad(pmd_t pmd)
6773 +{
6774 + return __pmd_val(pmd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER);
6775 +}
6776
6777 #define set_pte_at(_mm,addr,ptep,pteval) do { \
6778 if (((_mm) != current->mm && (_mm) != &init_mm) || \
6779 @@ -404,8 +403,6 @@
6780 #define pmd_present(x) (__pmd_val(x) & _PAGE_PRESENT)
6781 #endif
6782 #define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
6783 -#define pmd_bad(x) ((__pmd_val(x) & ~(PTE_MASK | _PAGE_USER | _PAGE_PRESENT)) \
6784 - != (_KERNPG_TABLE & ~(_PAGE_USER | _PAGE_PRESENT)))
6785 #define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot)))
6786 #define pmd_pfn(x) ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)
6787
6788 --- a/include/asm-x86/mach-xen/asm/processor_32.h
6789 +++ b/include/asm-x86/mach-xen/asm/processor_32.h
6790 @@ -20,6 +20,7 @@
6791 #include <linux/threads.h>
6792 #include <asm/percpu.h>
6793 #include <linux/cpumask.h>
6794 +#include <linux/init.h>
6795 #include <xen/interface/physdev.h>
6796
6797 /* flag for disabling the tsc */
6798 @@ -73,6 +74,7 @@
6799 #endif
6800 unsigned char x86_max_cores; /* cpuid returned max cores value */
6801 unsigned char apicid;
6802 + unsigned short x86_clflush_size;
6803 #ifdef CONFIG_SMP
6804 unsigned char booted_cores; /* number of cores as seen by OS */
6805 __u8 phys_proc_id; /* Physical processor id. */
6806 @@ -114,6 +116,8 @@
6807 extern int cpu_llc_id[NR_CPUS];
6808 extern char ignore_fpu_irq;
6809
6810 +void __init cpu_detect(struct cpuinfo_x86 *c);
6811 +
6812 extern void identify_cpu(struct cpuinfo_x86 *);
6813 extern void print_cpu_info(struct cpuinfo_x86 *);
6814 extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
6815 @@ -146,8 +150,8 @@
6816 #define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */
6817 #define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */
6818
6819 -static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
6820 - unsigned int *ecx, unsigned int *edx)
6821 +static inline fastcall void xen_cpuid(unsigned int *eax, unsigned int *ebx,
6822 + unsigned int *ecx, unsigned int *edx)
6823 {
6824 /* ecx is often an input as well as an output. */
6825 __asm__(XEN_CPUID
6826 @@ -158,59 +162,6 @@
6827 : "0" (*eax), "2" (*ecx));
6828 }
6829
6830 -/*
6831 - * Generic CPUID function
6832 - * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
6833 - * resulting in stale register contents being returned.
6834 - */
6835 -static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx)
6836 -{
6837 - *eax = op;
6838 - *ecx = 0;
6839 - __cpuid(eax, ebx, ecx, edx);
6840 -}
6841 -
6842 -/* Some CPUID calls want 'count' to be placed in ecx */
6843 -static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx,
6844 - int *edx)
6845 -{
6846 - *eax = op;
6847 - *ecx = count;
6848 - __cpuid(eax, ebx, ecx, edx);
6849 -}
6850 -
6851 -/*
6852 - * CPUID functions returning a single datum
6853 - */
6854 -static inline unsigned int cpuid_eax(unsigned int op)
6855 -{
6856 - unsigned int eax, ebx, ecx, edx;
6857 -
6858 - cpuid(op, &eax, &ebx, &ecx, &edx);
6859 - return eax;
6860 -}
6861 -static inline unsigned int cpuid_ebx(unsigned int op)
6862 -{
6863 - unsigned int eax, ebx, ecx, edx;
6864 -
6865 - cpuid(op, &eax, &ebx, &ecx, &edx);
6866 - return ebx;
6867 -}
6868 -static inline unsigned int cpuid_ecx(unsigned int op)
6869 -{
6870 - unsigned int eax, ebx, ecx, edx;
6871 -
6872 - cpuid(op, &eax, &ebx, &ecx, &edx);
6873 - return ecx;
6874 -}
6875 -static inline unsigned int cpuid_edx(unsigned int op)
6876 -{
6877 - unsigned int eax, ebx, ecx, edx;
6878 -
6879 - cpuid(op, &eax, &ebx, &ecx, &edx);
6880 - return edx;
6881 -}
6882 -
6883 #define load_cr3(pgdir) write_cr3(__pa(pgdir))
6884
6885 /*
6886 @@ -480,9 +431,9 @@
6887 .vm86_info = NULL, \
6888 .sysenter_cs = __KERNEL_CS, \
6889 .io_bitmap_ptr = NULL, \
6890 + .gs = __KERNEL_PDA, \
6891 }
6892
6893 -#ifndef CONFIG_X86_NO_TSS
6894 /*
6895 * Note that the .io_bitmap member must be extra-big. This is because
6896 * the CPU will access an additional byte beyond the end of the IO
6897 @@ -497,26 +448,9 @@
6898 .io_bitmap = { [ 0 ... IO_BITMAP_LONGS] = ~0 }, \
6899 }
6900
6901 -static inline void __load_esp0(struct tss_struct *tss, struct thread_struct *thread)
6902 -{
6903 - tss->esp0 = thread->esp0;
6904 - /* This can only happen when SEP is enabled, no need to test "SEP"arately */
6905 - if (unlikely(tss->ss1 != thread->sysenter_cs)) {
6906 - tss->ss1 = thread->sysenter_cs;
6907 - wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
6908 - }
6909 -}
6910 -#define load_esp0(tss, thread) \
6911 - __load_esp0(tss, thread)
6912 -#else
6913 -#define load_esp0(tss, thread) do { \
6914 - if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->esp0)) \
6915 - BUG(); \
6916 -} while (0)
6917 -#endif
6918 -
6919 #define start_thread(regs, new_eip, new_esp) do { \
6920 - __asm__("movl %0,%%fs ; movl %0,%%gs": :"r" (0)); \
6921 + __asm__("movl %0,%%fs": :"r" (0)); \
6922 + regs->xgs = 0; \
6923 set_fs(USER_DS); \
6924 regs->xds = __USER_DS; \
6925 regs->xes = __USER_DS; \
6926 @@ -526,26 +460,6 @@
6927 regs->esp = new_esp; \
6928 } while (0)
6929
6930 -/*
6931 - * These special macros can be used to get or set a debugging register
6932 - */
6933 -#define get_debugreg(var, register) \
6934 - (var) = HYPERVISOR_get_debugreg((register))
6935 -#define set_debugreg(value, register) \
6936 - WARN_ON(HYPERVISOR_set_debugreg((register), (value)))
6937 -
6938 -/*
6939 - * Set IOPL bits in EFLAGS from given mask
6940 - */
6941 -static inline void set_iopl_mask(unsigned mask)
6942 -{
6943 - struct physdev_set_iopl set_iopl;
6944 -
6945 - /* Force the change at ring 0. */
6946 - set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
6947 - WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
6948 -}
6949 -
6950 /* Forward declaration, a strange C thing */
6951 struct task_struct;
6952 struct mm_struct;
6953 @@ -637,6 +551,105 @@
6954
6955 #define cpu_relax() rep_nop()
6956
6957 +#define paravirt_enabled() 0
6958 +#define __cpuid xen_cpuid
6959 +
6960 +#ifndef CONFIG_X86_NO_TSS
6961 +static inline void __load_esp0(struct tss_struct *tss, struct thread_struct *thread)
6962 +{
6963 + tss->esp0 = thread->esp0;
6964 + /* This can only happen when SEP is enabled, no need to test "SEP"arately */
6965 + if (unlikely(tss->ss1 != thread->sysenter_cs)) {
6966 + tss->ss1 = thread->sysenter_cs;
6967 + wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
6968 + }
6969 +}
6970 +#define load_esp0(tss, thread) \
6971 + __load_esp0(tss, thread)
6972 +#else
6973 +#define load_esp0(tss, thread) do { \
6974 + if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->esp0)) \
6975 + BUG(); \
6976 +} while (0)
6977 +#endif
6978 +
6979 +
6980 +/*
6981 + * These special macros can be used to get or set a debugging register
6982 + */
6983 +#define get_debugreg(var, register) \
6984 + (var) = HYPERVISOR_get_debugreg(register)
6985 +#define set_debugreg(value, register) \
6986 + WARN_ON(HYPERVISOR_set_debugreg(register, value))
6987 +
6988 +#define set_iopl_mask xen_set_iopl_mask
6989 +
6990 +/*
6991 + * Set IOPL bits in EFLAGS from given mask
6992 + */
6993 +static inline void xen_set_iopl_mask(unsigned mask)
6994 +{
6995 + struct physdev_set_iopl set_iopl;
6996 +
6997 + /* Force the change at ring 0. */
6998 + set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
6999 + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl));
7000 +}
7001 +
7002 +
7003 +/*
7004 + * Generic CPUID function
7005 + * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
7006 + * resulting in stale register contents being returned.
7007 + */
7008 +static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx)
7009 +{
7010 + *eax = op;
7011 + *ecx = 0;
7012 + __cpuid(eax, ebx, ecx, edx);
7013 +}
7014 +
7015 +/* Some CPUID calls want 'count' to be placed in ecx */
7016 +static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx,
7017 + int *edx)
7018 +{
7019 + *eax = op;
7020 + *ecx = count;
7021 + __cpuid(eax, ebx, ecx, edx);
7022 +}
7023 +
7024 +/*
7025 + * CPUID functions returning a single datum
7026 + */
7027 +static inline unsigned int cpuid_eax(unsigned int op)
7028 +{
7029 + unsigned int eax, ebx, ecx, edx;
7030 +
7031 + cpuid(op, &eax, &ebx, &ecx, &edx);
7032 + return eax;
7033 +}
7034 +static inline unsigned int cpuid_ebx(unsigned int op)
7035 +{
7036 + unsigned int eax, ebx, ecx, edx;
7037 +
7038 + cpuid(op, &eax, &ebx, &ecx, &edx);
7039 + return ebx;
7040 +}
7041 +static inline unsigned int cpuid_ecx(unsigned int op)
7042 +{
7043 + unsigned int eax, ebx, ecx, edx;
7044 +
7045 + cpuid(op, &eax, &ebx, &ecx, &edx);
7046 + return ecx;
7047 +}
7048 +static inline unsigned int cpuid_edx(unsigned int op)
7049 +{
7050 + unsigned int eax, ebx, ecx, edx;
7051 +
7052 + cpuid(op, &eax, &ebx, &ecx, &edx);
7053 + return edx;
7054 +}
7055 +
7056 /* generic versions from gas */
7057 #define GENERIC_NOP1 ".byte 0x90\n"
7058 #define GENERIC_NOP2 ".byte 0x89,0xf6\n"
7059 @@ -736,4 +749,8 @@
7060 extern void enable_sep_cpu(void);
7061 extern int sysenter_setup(void);
7062
7063 +extern int init_gdt(int cpu, struct task_struct *idle);
7064 +extern void cpu_set_gdt(int);
7065 +extern void secondary_cpu_init(void);
7066 +
7067 #endif /* __ASM_I386_PROCESSOR_H */
7068 --- a/include/asm-x86/mach-xen/asm/processor_64.h
7069 +++ b/include/asm-x86/mach-xen/asm/processor_64.h
7070 @@ -484,6 +484,14 @@
7071 : :"a" (eax), "c" (ecx));
7072 }
7073
7074 +static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
7075 +{
7076 + /* "mwait %eax,%ecx;" */
7077 + asm volatile(
7078 + "sti; .byte 0x0f,0x01,0xc9;"
7079 + : :"a" (eax), "c" (ecx));
7080 +}
7081 +
7082 extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
7083
7084 #define stack_current() \
7085 --- a/include/asm-x86/mach-xen/asm/segment_32.h
7086 +++ b/include/asm-x86/mach-xen/asm/segment_32.h
7087 @@ -39,7 +39,7 @@
7088 * 25 - APM BIOS support
7089 *
7090 * 26 - ESPFIX small SS
7091 - * 27 - unused
7092 + * 27 - PDA [ per-cpu private data area ]
7093 * 28 - unused
7094 * 29 - unused
7095 * 30 - unused
7096 @@ -74,6 +74,9 @@
7097 #define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14)
7098 #define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
7099
7100 +#define GDT_ENTRY_PDA (GDT_ENTRY_KERNEL_BASE + 15)
7101 +#define __KERNEL_PDA (GDT_ENTRY_PDA * 8)
7102 +
7103 #define GDT_ENTRY_DOUBLEFAULT_TSS 31
7104
7105 /*
7106 --- a/include/asm-x86/mach-xen/asm/smp_32.h
7107 +++ b/include/asm-x86/mach-xen/asm/smp_32.h
7108 @@ -8,6 +8,7 @@
7109 #include <linux/kernel.h>
7110 #include <linux/threads.h>
7111 #include <linux/cpumask.h>
7112 +#include <asm/pda.h>
7113 #endif
7114
7115 #ifdef CONFIG_X86_LOCAL_APIC
7116 @@ -56,7 +57,7 @@
7117 * from the initial startup. We map APIC_BASE very early in page_setup(),
7118 * so this is correct in the x86 case.
7119 */
7120 -#define raw_smp_processor_id() (current_thread_info()->cpu)
7121 +#define raw_smp_processor_id() (read_pda(cpu_number))
7122
7123 extern cpumask_t cpu_possible_map;
7124 #define cpu_callin_map cpu_possible_map
7125 --- a/include/asm-x86/mach-xen/asm/smp_64.h
7126 +++ b/include/asm-x86/mach-xen/asm/smp_64.h
7127 @@ -88,11 +88,6 @@
7128 extern u8 bios_cpu_apicid[];
7129
7130 #ifdef CONFIG_X86_LOCAL_APIC
7131 -static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
7132 -{
7133 - return cpus_addr(cpumask)[0];
7134 -}
7135 -
7136 static inline int cpu_present_to_apicid(int mps_cpu)
7137 {
7138 if (mps_cpu < NR_CPUS)
7139 @@ -127,13 +122,6 @@
7140 #define cpu_physical_id(cpu) x86_cpu_to_apicid[cpu]
7141 #else
7142 #define cpu_physical_id(cpu) boot_cpu_id
7143 -static inline int smp_call_function_single(int cpuid, void (*func) (void *info),
7144 - void *info, int retry, int wait)
7145 -{
7146 - /* Disable interrupts here? */
7147 - func(info);
7148 - return 0;
7149 -}
7150 #endif /* !CONFIG_SMP */
7151 #endif
7152
7153 --- a/include/asm-x86/mach-xen/asm/system_32.h
7154 +++ b/include/asm-x86/mach-xen/asm/system_32.h
7155 @@ -139,17 +139,17 @@
7156 #define write_cr4(x) \
7157 __asm__ __volatile__("movl %0,%%cr4": :"r" (x))
7158
7159 -/*
7160 - * Clear and set 'TS' bit respectively
7161 - */
7162 +#define wbinvd() \
7163 + __asm__ __volatile__ ("wbinvd": : :"memory")
7164 +
7165 +/* Clear the 'TS' bit */
7166 #define clts() (HYPERVISOR_fpu_taskswitch(0))
7167 +
7168 +/* Set the 'TS' bit */
7169 #define stts() (HYPERVISOR_fpu_taskswitch(1))
7170
7171 #endif /* __KERNEL__ */
7172
7173 -#define wbinvd() \
7174 - __asm__ __volatile__ ("wbinvd": : :"memory")
7175 -
7176 static inline unsigned long get_limit(unsigned long segment)
7177 {
7178 unsigned long __limit;
7179 --- a/kernel/kexec.c
7180 +++ b/kernel/kexec.c
7181 @@ -353,7 +353,7 @@
7182 if (limit == ~0UL)
7183 address_bits = BITS_PER_LONG;
7184 else
7185 - address_bits = long_log2(limit);
7186 + address_bits = ilog2(limit);
7187
7188 if (xen_limit_pages_to_max_mfn(pages, order, address_bits) < 0) {
7189 __free_pages(pages, order);
7190 --- a/net/core/dev.c
7191 +++ b/net/core/dev.c
7192 @@ -1597,10 +1597,10 @@
7193 goto out;
7194 switch (skb->nh.iph->protocol) {
7195 case IPPROTO_TCP:
7196 - skb->csum = offsetof(struct tcphdr, check);
7197 + skb->csum_offset = offsetof(struct tcphdr, check);
7198 break;
7199 case IPPROTO_UDP:
7200 - skb->csum = offsetof(struct udphdr, check);
7201 + skb->csum_offset = offsetof(struct udphdr, check);
7202 break;
7203 default:
7204 if (net_ratelimit())
7205 @@ -1609,7 +1609,7 @@
7206 " %d packet", skb->nh.iph->protocol);
7207 goto out;
7208 }
7209 - if ((skb->h.raw + skb->csum + 2) > skb->tail)
7210 + if ((skb->h.raw + skb->csum_offset + 2) > skb->tail)
7211 goto out;
7212 skb->ip_summed = CHECKSUM_PARTIAL;
7213 skb->proto_csum_blank = 0;