Magellan Linux

Contents of /trunk/kernel26-xen/patches-2.6.25-r1/1046-2.6.25-xen-Chainsaw-party-SPLITME.patch

Revision 606
Thu May 22 23:13:13 2008 UTC by niro
File size: 29109 byte(s)
- ver bump to 2.6.25-magellan-r1:
- linux-2.6.25.4
- fbcondecor-0.9.4
- squashfs-3.3
- unionfs-2.3.3
- tuxonice-3.0-rc7
- linux-phc-0.3.0
- acpi-dstd-0.9a
- reiser4
- xen-3.2.0
- ipw3945-1.2.2

1 From 1355d1f32472a6617ed41fe7c4407592a96e944d Mon Sep 17 00:00:00 2001
2 From: Eduardo Habkost <ehabkost@redhat.com>
3 Date: Mon, 3 Dec 2007 17:28:38 -0200
4 Subject: [PATCH] Chainsaw party (SPLITME)
5
6 :D
7
8 Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
9 ---
10 arch/x86/kernel/asm-offsets_64.c | 15 +++
11 arch/x86/kernel/entry_32.S | 83 +---------------
12 arch/x86/kernel/entry_64.S | 4 +
13 arch/x86/kernel/head_64.S | 3 +
14 arch/x86/kernel/smpboot_64.c | 4 +-
15 arch/x86/xen/enlighten.c | 3 +
16 arch/x86/xen/entry.S | 5 +
17 arch/x86/xen/entry_32.S | 81 +++++++++++++++
18 arch/x86/xen/entry_64.S | 1 +
19 arch/x86/xen/smp.c | 16 +++
20 arch/x86/xen/xen-asm.S | 210 +-------------------------------------
21 arch/x86/xen/xen-asm_32.S | 184 +++++++++++++++++++++++++++++++++
22 arch/x86/xen/xen-asm_64.S | 8 ++
23 arch/x86/xen/xen-head.S | 20 +++-
24 include/asm-x86/asm-hack.h | 27 +++++
25 include/asm-x86/smp_64.h | 3 +
26 include/linux/elfnote.h | 2 +-
27 include/xen/interface/elfnote.h | 16 +++
28 18 files changed, 390 insertions(+), 295 deletions(-)
29 create mode 100644 arch/x86/xen/entry.S
30 create mode 100644 arch/x86/xen/entry_32.S
31 create mode 100644 arch/x86/xen/entry_64.S
32 create mode 100644 arch/x86/xen/xen-asm_32.S
33 create mode 100644 arch/x86/xen/xen-asm_64.S
34 create mode 100644 include/asm-x86/asm-hack.h
35
36 diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
37 index 494e1e0..d0fabfd 100644
38 --- a/arch/x86/kernel/asm-offsets_64.c
39 +++ b/arch/x86/kernel/asm-offsets_64.c
40 @@ -25,6 +25,8 @@
41 #define OFFSET(sym, str, mem) \
42 DEFINE(sym, offsetof(struct str, mem))
43
44 +#include <xen/interface/xen.h>
45 +
46 #define __NO_STUBS 1
47 #undef __SYSCALL
48 #undef _ASM_X86_64_UNISTD_H_
49 @@ -92,6 +94,13 @@ int main(void)
50 offsetof (struct rt_sigframe32, uc.uc_mcontext));
51 BLANK();
52 #endif
53 +
54 +#ifdef CONFIG_XEN
55 + BLANK();
56 + OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
57 + OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
58 +#endif
59 +
60 DEFINE(pbe_address, offsetof(struct pbe, address));
61 DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address));
62 DEFINE(pbe_next, offsetof(struct pbe, next));
63 @@ -130,6 +139,12 @@ int main(void)
64 BLANK();
65 DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
66
67 + DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
68 + DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT);
69 + DEFINE(PTRS_PER_PTE, PTRS_PER_PTE);
70 + DEFINE(PTRS_PER_PMD, PTRS_PER_PMD);
71 + DEFINE(PTRS_PER_PGD, PTRS_PER_PGD);
72 +
73 BLANK();
74 OFFSET(BP_scratch, boot_params, scratch);
75 OFFSET(BP_loadflags, boot_params, hdr.loadflags);
76 diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
77 index 4b87c32..4ef3881 100644
78 --- a/arch/x86/kernel/entry_32.S
79 +++ b/arch/x86/kernel/entry_32.S
80 @@ -1023,88 +1023,7 @@ ENTRY(kernel_thread_helper)
81 ENDPROC(kernel_thread_helper)
82
83 #ifdef CONFIG_XEN
84 -ENTRY(xen_hypervisor_callback)
85 - CFI_STARTPROC
86 - pushl $0
87 - CFI_ADJUST_CFA_OFFSET 4
88 - SAVE_ALL
89 - TRACE_IRQS_OFF
90 -
91 - /* Check to see if we got the event in the critical
92 - region in xen_iret_direct, after we've reenabled
93 - events and checked for pending events. This simulates
94 - iret instruction's behaviour where it delivers a
95 - pending interrupt when enabling interrupts. */
96 - movl PT_EIP(%esp),%eax
97 - cmpl $xen_iret_start_crit,%eax
98 - jb 1f
99 - cmpl $xen_iret_end_crit,%eax
100 - jae 1f
101 -
102 - call xen_iret_crit_fixup
103 -
104 -1: mov %esp, %eax
105 - call xen_evtchn_do_upcall
106 - jmp ret_from_intr
107 - CFI_ENDPROC
108 -ENDPROC(xen_hypervisor_callback)
109 -
110 -# Hypervisor uses this for application faults while it executes.
111 -# We get here for two reasons:
112 -# 1. Fault while reloading DS, ES, FS or GS
113 -# 2. Fault while executing IRET
114 -# Category 1 we fix up by reattempting the load, and zeroing the segment
115 -# register if the load fails.
116 -# Category 2 we fix up by jumping to do_iret_error. We cannot use the
117 -# normal Linux return path in this case because if we use the IRET hypercall
118 -# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
119 -# We distinguish between categories by maintaining a status value in EAX.
120 -ENTRY(xen_failsafe_callback)
121 - CFI_STARTPROC
122 - pushl %eax
123 - CFI_ADJUST_CFA_OFFSET 4
124 - movl $1,%eax
125 -1: mov 4(%esp),%ds
126 -2: mov 8(%esp),%es
127 -3: mov 12(%esp),%fs
128 -4: mov 16(%esp),%gs
129 - testl %eax,%eax
130 - popl %eax
131 - CFI_ADJUST_CFA_OFFSET -4
132 - lea 16(%esp),%esp
133 - CFI_ADJUST_CFA_OFFSET -16
134 - jz 5f
135 - addl $16,%esp
136 - jmp iret_exc # EAX != 0 => Category 2 (Bad IRET)
137 -5: pushl $0 # EAX == 0 => Category 1 (Bad segment)
138 - CFI_ADJUST_CFA_OFFSET 4
139 - SAVE_ALL
140 - jmp ret_from_exception
141 - CFI_ENDPROC
142 -
143 -.section .fixup,"ax"
144 -6: xorl %eax,%eax
145 - movl %eax,4(%esp)
146 - jmp 1b
147 -7: xorl %eax,%eax
148 - movl %eax,8(%esp)
149 - jmp 2b
150 -8: xorl %eax,%eax
151 - movl %eax,12(%esp)
152 - jmp 3b
153 -9: xorl %eax,%eax
154 - movl %eax,16(%esp)
155 - jmp 4b
156 -.previous
157 -.section __ex_table,"a"
158 - .align 4
159 - .long 1b,6b
160 - .long 2b,7b
161 - .long 3b,8b
162 - .long 4b,9b
163 -.previous
164 -ENDPROC(xen_failsafe_callback)
165 -
166 +#include "../xen/entry_32.S"
167 #endif /* CONFIG_XEN */
168
169 .section .rodata,"a"
170 diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
171 index 556a8df..3fedbd6 100644
172 --- a/arch/x86/kernel/entry_64.S
173 +++ b/arch/x86/kernel/entry_64.S
174 @@ -1202,3 +1202,7 @@ KPROBE_ENTRY(ignore_sysret)
175 sysret
176 CFI_ENDPROC
177 ENDPROC(ignore_sysret)
178 +
179 +#ifdef CONFIG_XEN
180 +#include "../xen/entry_64.S"
181 +#endif /* CONFIG_XEN */
182 diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
183 index a007454..923ad56 100644
184 --- a/arch/x86/kernel/head_64.S
185 +++ b/arch/x86/kernel/head_64.S
186 @@ -419,6 +419,9 @@ ENTRY(phys_base)
187 /* This must match the first entry in level2_kernel_pgt */
188 .quad 0x0000000000000000
189
190 +#include "../../x86/xen/xen-head.S"
191 +
192 +
193 /* We need valid kernel segments for data and code in long mode too
194 * IRET will check the segment types kkeil 2000/10/28
195 * Also sysret mandates a special GDT layout
196 diff --git a/arch/x86/kernel/smpboot_64.c b/arch/x86/kernel/smpboot_64.c
197 index a47f973..08bbcad 100644
198 --- a/arch/x86/kernel/smpboot_64.c
199 +++ b/arch/x86/kernel/smpboot_64.c
200 @@ -144,7 +144,7 @@ static unsigned long __cpuinit setup_trampoline(void)
201 * a given CPU
202 */
203
204 -static void __cpuinit smp_store_cpu_info(int id)
205 +void __cpuinit smp_store_cpu_info(int id)
206 {
207 struct cpuinfo_x86 *c = &cpu_data(id);
208
209 @@ -261,7 +261,7 @@ cpumask_t cpu_coregroup_map(int cpu)
210 /* representing cpus for which sibling maps can be computed */
211 static cpumask_t cpu_sibling_setup_map;
212
213 -static inline void set_cpu_sibling_map(int cpu)
214 +inline void set_cpu_sibling_map(int cpu)
215 {
216 int i;
217 struct cpuinfo_x86 *c = &cpu_data(cpu);
218 diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
219 index 4b8ebb8..48a9e35 100644
220 --- a/arch/x86/xen/enlighten.c
221 +++ b/arch/x86/xen/enlighten.c
222 @@ -1238,7 +1238,10 @@ asmlinkage void __init xen_start_kernel(void)
223 pv_apic_ops = xen_apic_ops;
224 pv_mmu_ops = xen_mmu_ops;
225
226 +#ifdef CONFIG_X86_32
227 + /*FIXME: implement me! */
228 machine_ops = xen_machine_ops;
229 +#endif
230
231 #ifdef CONFIG_SMP
232 smp_ops = xen_smp_ops;
233 diff --git a/arch/x86/xen/entry.S b/arch/x86/xen/entry.S
234 new file mode 100644
235 index 0000000..1e7551e
236 --- /dev/null
237 +++ b/arch/x86/xen/entry.S
238 @@ -0,0 +1,5 @@
239 +#ifdef CONFIG_X86_64
240 +# include "entry_64.S"
241 +#else
242 +# include "entry_32.S"
243 +#endif
244 diff --git a/arch/x86/xen/entry_32.S b/arch/x86/xen/entry_32.S
245 new file mode 100644
246 index 0000000..89109a8
247 --- /dev/null
248 +++ b/arch/x86/xen/entry_32.S
249 @@ -0,0 +1,81 @@
250 +ENTRY(xen_hypervisor_callback)
251 + CFI_STARTPROC
252 + pushl $0
253 + CFI_ADJUST_CFA_OFFSET 4
254 + SAVE_ALL
255 + TRACE_IRQS_OFF
256 +
257 + /* Check to see if we got the event in the critical
258 + region in xen_iret_direct, after we've reenabled
259 + events and checked for pending events. This simulates
260 + iret instruction's behaviour where it delivers a
261 + pending interrupt when enabling interrupts. */
262 + movl PT_EIP(%esp),%eax
263 + cmpl $xen_iret_start_crit,%eax
264 + jb 1f
265 + cmpl $xen_iret_end_crit,%eax
266 + jae 1f
267 +
268 + call xen_iret_crit_fixup
269 +
270 +1: mov %esp, %eax
271 + call xen_evtchn_do_upcall
272 + jmp ret_from_intr
273 + CFI_ENDPROC
274 +ENDPROC(xen_hypervisor_callback)
275 +
276 +# Hypervisor uses this for application faults while it executes.
277 +# We get here for two reasons:
278 +# 1. Fault while reloading DS, ES, FS or GS
279 +# 2. Fault while executing IRET
280 +# Category 1 we fix up by reattempting the load, and zeroing the segment
281 +# register if the load fails.
282 +# Category 2 we fix up by jumping to do_iret_error. We cannot use the
283 +# normal Linux return path in this case because if we use the IRET hypercall
284 +# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
285 +# We distinguish between categories by maintaining a status value in EAX.
286 +ENTRY(xen_failsafe_callback)
287 + CFI_STARTPROC
288 + pushl %eax
289 + CFI_ADJUST_CFA_OFFSET 4
290 + movl $1,%eax
291 +1: mov 4(%esp),%ds
292 +2: mov 8(%esp),%es
293 +3: mov 12(%esp),%fs
294 +4: mov 16(%esp),%gs
295 + testl %eax,%eax
296 + popl %eax
297 + CFI_ADJUST_CFA_OFFSET -4
298 + lea 16(%esp),%esp
299 + CFI_ADJUST_CFA_OFFSET -16
300 + jz 5f
301 + addl $16,%esp
302 + jmp iret_exc # EAX != 0 => Category 2 (Bad IRET)
303 +5: pushl $0 # EAX == 0 => Category 1 (Bad segment)
304 + CFI_ADJUST_CFA_OFFSET 4
305 + SAVE_ALL
306 + jmp ret_from_exception
307 + CFI_ENDPROC
308 +
309 +.section .fixup,"ax"
310 +6: xorl %eax,%eax
311 + movl %eax,4(%esp)
312 + jmp 1b
313 +7: xorl %eax,%eax
314 + movl %eax,8(%esp)
315 + jmp 2b
316 +8: xorl %eax,%eax
317 + movl %eax,12(%esp)
318 + jmp 3b
319 +9: xorl %eax,%eax
320 + movl %eax,16(%esp)
321 + jmp 4b
322 +.previous
323 +.section __ex_table,"a"
324 + .align 4
325 + .long 1b,6b
326 + .long 2b,7b
327 + .long 3b,8b
328 + .long 4b,9b
329 +.previous
330 +ENDPROC(xen_failsafe_callback)
331 diff --git a/arch/x86/xen/entry_64.S b/arch/x86/xen/entry_64.S
332 new file mode 100644
333 index 0000000..c8c1473
334 --- /dev/null
335 +++ b/arch/x86/xen/entry_64.S
336 @@ -0,0 +1 @@
337 +#error foo
338 diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
339 index aafc544..03580bf 100644
340 --- a/arch/x86/xen/smp.c
341 +++ b/arch/x86/xen/smp.c
342 @@ -144,7 +144,10 @@ void __init xen_smp_prepare_boot_cpu(void)
343
344 /* We've switched to the "real" per-cpu gdt, so make sure the
345 old memory can be recycled */
346 +#ifdef CONFIG_X86_32
347 + //FIXME: implement this on 64-bit
348 make_lowmem_page_readwrite(&per_cpu__gdt_page);
349 +#endif
350
351 for_each_possible_cpu(cpu) {
352 cpus_clear(per_cpu(cpu_sibling_map, cpu));
353 @@ -207,6 +210,8 @@ void __init xen_smp_prepare_cpus(unsigned int max_cpus)
354 static __cpuinit int
355 cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
356 {
357 +/*FIXME: implement me */
358 +#ifdef CONFIG_X86_32
359 struct vcpu_guest_context *ctxt;
360 struct gdt_page *gdt = &per_cpu(gdt_page, cpu);
361
362 @@ -256,11 +261,14 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
363 BUG();
364
365 kfree(ctxt);
366 +#endif
367 return 0;
368 }
369
370 int __cpuinit xen_cpu_up(unsigned int cpu)
371 {
372 +//FIXME: implement me!
373 +#ifdef CONFIG_X86_32
374 struct task_struct *idle = idle_task(cpu);
375 int rc;
376
377 @@ -299,6 +307,7 @@ int __cpuinit xen_cpu_up(unsigned int cpu)
378 rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
379 BUG_ON(rc);
380
381 +#endif
382 return 0;
383 }
384
385 @@ -308,6 +317,8 @@ void xen_smp_cpus_done(unsigned int max_cpus)
386
387 static void stop_self(void *v)
388 {
389 +//FIXME: implement me!
390 +#ifdef CONFIG_X86_32
391 int cpu = smp_processor_id();
392
393 /* make sure we're not pinning something down */
394 @@ -316,6 +327,7 @@ static void stop_self(void *v)
395
396 HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL);
397 BUG();
398 +#endif
399 }
400
401 void xen_smp_send_stop(void)
402 @@ -356,7 +368,11 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
403 */
404 irq_enter();
405 (*func)(info);
406 +#ifdef CONFIG_X86_32
407 __get_cpu_var(irq_stat).irq_call_count++;
408 +#else
409 + add_pda(irq_call_count, 1);
410 +#endif
411 irq_exit();
412
413 if (wait) {
414 diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
415 index af6abfa..1c3c0ac 100644
416 --- a/arch/x86/xen/xen-asm.S
417 +++ b/arch/x86/xen/xen-asm.S
418 @@ -18,33 +18,13 @@
419 #include <asm/percpu.h>
420 #include <asm/processor-flags.h>
421 #include <asm/segment.h>
422 +#include <asm/asm-hack.h>
423
424 #include <xen/interface/xen.h>
425
426 #define RELOC(x, v) .globl x##_reloc; x##_reloc=v
427 #define ENDPATCH(x) .globl x##_end; x##_end=.
428
429 -#ifdef CONFIG_X86_64
430 -# define SUFFIX q
431 -# define REGPREF r
432 -#else
433 -# define SUFFIX l
434 -# define REGPREF e
435 -#endif
436 -
437 -#define __REG(pref, reg) %pref##reg
438 -#define _REG(pref, reg) __REG(pref, reg)
439 -#define REG(reg) _REG(REGPREF, reg)
440 -
441 -#define __INSN(in, suff) in##suff
442 -#define _INSN(in, suff) __INSN(in, suff)
443 -#define INSN(in) _INSN(in, SUFFIX)
444 -
445 -#define rAX REG(ax)
446 -#define rSP REG(sp)
447 -#define MOV INSN(mov)
448 -#define AND INSN(and)
449 -
450
451 /* Pseudo-flag used for virtual NMI, which we don't implement yet */
452 #define XEN_EFLAGS_NMI 0x80000000
453 @@ -159,191 +139,9 @@ ENDPATCH(xen_restore_fl_direct)
454 ENDPROC(xen_restore_fl_direct)
455 RELOC(xen_restore_fl_direct, 2b+1)
456
457 -/*
458 - This is run where a normal iret would be run, with the same stack setup:
459 - 8: eflags
460 - 4: cs
461 - esp-> 0: eip
462 -
463 - This attempts to make sure that any pending events are dealt
464 - with on return to usermode, but there is a small window in
465 - which an event can happen just before entering usermode. If
466 - the nested interrupt ends up setting one of the TIF_WORK_MASK
467 - pending work flags, they will not be tested again before
468 - returning to usermode. This means that a process can end up
469 - with pending work, which will be unprocessed until the process
470 - enters and leaves the kernel again, which could be an
471 - unbounded amount of time. This means that a pending signal or
472 - reschedule event could be indefinitely delayed.
473 -
474 - The fix is to notice a nested interrupt in the critical
475 - window, and if one occurs, then fold the nested interrupt into
476 - the current interrupt stack frame, and re-process it
477 - iteratively rather than recursively. This means that it will
478 - exit via the normal path, and all pending work will be dealt
479 - with appropriately.
480 -
481 - Because the nested interrupt handler needs to deal with the
482 - current stack state in whatever form its in, we keep things
483 - simple by only using a single register which is pushed/popped
484 - on the stack.
485
486 - Non-direct iret could be done in the same way, but it would
487 - require an annoying amount of code duplication. We'll assume
488 - that direct mode will be the common case once the hypervisor
489 - support becomes commonplace.
490 - */
491 -ENTRY(xen_iret_direct)
492 - /* test eflags for special cases */
493 - /*FIXME: use right offset for rFLAGS */
494 - testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(rSP)
495 - jnz hyper_iret
496 -
497 - push %eax
498 - ESP_OFFSET=4 # bytes pushed onto stack
499 -
500 - /* Store vcpu_info pointer for easy access. Do it this
501 - way to avoid having to reload %fs */
502 -#ifdef CONFIG_SMP
503 - GET_THREAD_INFO(%eax)
504 - movl TI_cpu(%eax),%eax
505 - movl __per_cpu_offset(,%eax,4),%eax
506 - lea per_cpu__xen_vcpu_info(%eax),%eax
507 +#ifdef CONFIG_X86_64
508 +#include "xen-asm_64.S"
509 #else
510 - movl $per_cpu__xen_vcpu_info, %eax
511 +#include "xen-asm_32.S"
512 #endif
513 -
514 - /* check IF state we're restoring */
515 - testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)
516 -
517 - /* Maybe enable events. Once this happens we could get a
518 - recursive event, so the critical region starts immediately
519 - afterwards. However, if that happens we don't end up
520 - resuming the code, so we don't have to be worried about
521 - being preempted to another CPU. */
522 - setz XEN_vcpu_info_mask(%eax)
523 -xen_iret_start_crit:
524 -
525 - /* check for unmasked and pending */
526 - cmpw $0x0001, XEN_vcpu_info_pending(%eax)
527 -
528 - /* If there's something pending, mask events again so we
529 - can jump back into xen_hypervisor_callback */
530 - sete XEN_vcpu_info_mask(%eax)
531 -
532 - popl %eax
533 -
534 - /* From this point on the registers are restored and the stack
535 - updated, so we don't need to worry about it if we're preempted */
536 -iret_restore_end:
537 -
538 - /* Jump to hypervisor_callback after fixing up the stack.
539 - Events are masked, so jumping out of the critical
540 - region is OK. */
541 - je xen_hypervisor_callback
542 -
543 - iret
544 -xen_iret_end_crit:
545 -
546 -hyper_iret:
547 - /* put this out of line since its very rarely used */
548 - jmp hypercall_page + __HYPERVISOR_iret * 32
549 -
550 - .globl xen_iret_start_crit, xen_iret_end_crit
551 -
552 -/*
553 - This is called by xen_hypervisor_callback in entry.S when it sees
554 - that the EIP at the time of interrupt was between xen_iret_start_crit
555 - and xen_iret_end_crit. We're passed the EIP in %eax so we can do
556 - a more refined determination of what to do.
557 -
558 - The stack format at this point is:
559 - ----------------
560 - ss : (ss/esp may be present if we came from usermode)
561 - esp :
562 - eflags } outer exception info
563 - cs }
564 - eip }
565 - ---------------- <- edi (copy dest)
566 - eax : outer eax if it hasn't been restored
567 - ----------------
568 - eflags } nested exception info
569 - cs } (no ss/esp because we're nested
570 - eip } from the same ring)
571 - orig_eax }<- esi (copy src)
572 - - - - - - - - -
573 - fs }
574 - es }
575 - ds } SAVE_ALL state
576 - eax }
577 - : :
578 - ebx }
579 - ----------------
580 - return addr <- esp
581 - ----------------
582 -
583 - In order to deliver the nested exception properly, we need to shift
584 - everything from the return addr up to the error code so it
585 - sits just under the outer exception info. This means that when we
586 - handle the exception, we do it in the context of the outer exception
587 - rather than starting a new one.
588 -
589 - The only caveat is that if the outer eax hasn't been
590 - restored yet (ie, it's still on stack), we need to insert
591 - its value into the SAVE_ALL state before going on, since
592 - it's usermode state which we eventually need to restore.
593 - */
594 -ENTRY(xen_iret_crit_fixup)
595 - /* offsets +4 for return address */
596 -
597 - /*
598 - Paranoia: Make sure we're really coming from userspace.
599 - One could imagine a case where userspace jumps into the
600 - critical range address, but just before the CPU delivers a GP,
601 - it decides to deliver an interrupt instead. Unlikely?
602 - Definitely. Easy to avoid? Yes. The Intel documents
603 - explicitly say that the reported EIP for a bad jump is the
604 - jump instruction itself, not the destination, but some virtual
605 - environments get this wrong.
606 - */
607 - movl PT_CS+4(%esp), %ecx
608 - andl $SEGMENT_RPL_MASK, %ecx
609 - cmpl $USER_RPL, %ecx
610 - je 2f
611 -
612 - lea PT_ORIG_EAX+4(%esp), %esi
613 - lea PT_EFLAGS+4(%esp), %edi
614 -
615 - /* If eip is before iret_restore_end then stack
616 - hasn't been restored yet. */
617 - cmp $iret_restore_end, %eax
618 - jae 1f
619 -
620 - movl 0+4(%edi),%eax /* copy EAX */
621 - movl %eax, PT_EAX+4(%esp)
622 -
623 - lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */
624 -
625 - /* set up the copy */
626 -1: std
627 - mov $(PT_EIP+4) / 4, %ecx /* copy ret+saved regs up to orig_eax */
628 - rep movsl
629 - cld
630 -
631 - lea 4(%edi),%esp /* point esp to new frame */
632 -2: ret
633 -
634 -
635 -/*
636 - Force an event check by making a hypercall,
637 - but preserve regs before making the call.
638 - */
639 -check_events:
640 - push %eax
641 - push %ecx
642 - push %edx
643 - call force_evtchn_callback
644 - pop %edx
645 - pop %ecx
646 - pop %eax
647 - ret
648 diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
649 new file mode 100644
650 index 0000000..1340296
651 --- /dev/null
652 +++ b/arch/x86/xen/xen-asm_32.S
653 @@ -0,0 +1,184 @@
654 +/*
655 + This is run where a normal iret would be run, with the same stack setup:
656 + 8: eflags
657 + 4: cs
658 + esp-> 0: eip
659 +
660 + This attempts to make sure that any pending events are dealt
661 + with on return to usermode, but there is a small window in
662 + which an event can happen just before entering usermode. If
663 + the nested interrupt ends up setting one of the TIF_WORK_MASK
664 + pending work flags, they will not be tested again before
665 + returning to usermode. This means that a process can end up
666 + with pending work, which will be unprocessed until the process
667 + enters and leaves the kernel again, which could be an
668 + unbounded amount of time. This means that a pending signal or
669 + reschedule event could be indefinitely delayed.
670 +
671 + The fix is to notice a nested interrupt in the critical
672 + window, and if one occurs, then fold the nested interrupt into
673 + the current interrupt stack frame, and re-process it
674 + iteratively rather than recursively. This means that it will
675 + exit via the normal path, and all pending work will be dealt
676 + with appropriately.
677 +
678 + Because the nested interrupt handler needs to deal with the
679 + current stack state in whatever form its in, we keep things
680 + simple by only using a single register which is pushed/popped
681 + on the stack.
682 +
683 + Non-direct iret could be done in the same way, but it would
684 + require an annoying amount of code duplication. We'll assume
685 + that direct mode will be the common case once the hypervisor
686 + support becomes commonplace.
687 + */
688 +ENTRY(xen_iret_direct)
689 + /* test eflags for special cases */
690 + /*FIXME: use right offset for rFLAGS */
691 + testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(rSP)
692 + jnz hyper_iret
693 +
694 + push rAX
695 + ESP_OFFSET=4 # bytes pushed onto stack
696 +
697 + /* Store vcpu_info pointer for easy access. Do it this
698 + way to avoid having to reload %fs */
699 + PER_CPU(xen_vcpu_info, rAX)
700 +
701 + /* check IF state we're restoring */
702 + /*FIXME: fix ESP offset */
703 + /*FIXME: check WTF the magic numbers below mean */
704 + testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(rSP)
705 +
706 + /* Maybe enable events. Once this happens we could get a
707 + recursive event, so the critical region starts immediately
708 + afterwards. However, if that happens we don't end up
709 + resuming the code, so we don't have to be worried about
710 + being preempted to another CPU. */
711 + setz XEN_vcpu_info_mask(rAX)
712 +xen_iret_start_crit:
713 +
714 + /* check for unmasked and pending */
715 + cmpw $0x0001, XEN_vcpu_info_pending(%eax)
716 +
717 + /* If there's something pending, mask events again so we
718 + can jump back into xen_hypervisor_callback */
719 + sete XEN_vcpu_info_mask(%eax)
720 +
721 + popl %eax
722 +
723 + /* From this point on the registers are restored and the stack
724 + updated, so we don't need to worry about it if we're preempted */
725 +iret_restore_end:
726 +
727 + /* Jump to hypervisor_callback after fixing up the stack.
728 + Events are masked, so jumping out of the critical
729 + region is OK. */
730 + je xen_hypervisor_callback
731 +
732 + iret
733 +xen_iret_end_crit:
734 +
735 +hyper_iret:
736 + /* put this out of line since its very rarely used */
737 + jmp hypercall_page + __HYPERVISOR_iret * 32
738 +
739 + .globl xen_iret_start_crit, xen_iret_end_crit
740 +
741 +/*
742 + This is called by xen_hypervisor_callback in entry.S when it sees
743 + that the EIP at the time of interrupt was between xen_iret_start_crit
744 + and xen_iret_end_crit. We're passed the EIP in %eax so we can do
745 + a more refined determination of what to do.
746 +
747 + The stack format at this point is:
748 + ----------------
749 + ss : (ss/esp may be present if we came from usermode)
750 + esp :
751 + eflags } outer exception info
752 + cs }
753 + eip }
754 + ---------------- <- edi (copy dest)
755 + eax : outer eax if it hasn't been restored
756 + ----------------
757 + eflags } nested exception info
758 + cs } (no ss/esp because we're nested
759 + eip } from the same ring)
760 + orig_eax }<- esi (copy src)
761 + - - - - - - - -
762 + fs }
763 + es }
764 + ds } SAVE_ALL state
765 + eax }
766 + : :
767 + ebx }
768 + ----------------
769 + return addr <- esp
770 + ----------------
771 +
772 + In order to deliver the nested exception properly, we need to shift
773 + everything from the return addr up to the error code so it
774 + sits just under the outer exception info. This means that when we
775 + handle the exception, we do it in the context of the outer exception
776 + rather than starting a new one.
777 +
778 + The only caveat is that if the outer eax hasn't been
779 + restored yet (ie, it's still on stack), we need to insert
780 + its value into the SAVE_ALL state before going on, since
781 + it's usermode state which we eventually need to restore.
782 + */
783 +ENTRY(xen_iret_crit_fixup)
784 + /* offsets +4 for return address */
785 +
786 + /*
787 + Paranoia: Make sure we're really coming from userspace.
788 + One could imagine a case where userspace jumps into the
789 + critical range address, but just before the CPU delivers a GP,
790 + it decides to deliver an interrupt instead. Unlikely?
791 + Definitely. Easy to avoid? Yes. The Intel documents
792 + explicitly say that the reported EIP for a bad jump is the
793 + jump instruction itself, not the destination, but some virtual
794 + environments get this wrong.
795 + */
796 + movl PT_CS+4(%esp), %ecx
797 + andl $SEGMENT_RPL_MASK, %ecx
798 + cmpl $USER_RPL, %ecx
799 + je 2f
800 +
801 + lea PT_ORIG_EAX+4(%esp), %esi
802 + lea PT_EFLAGS+4(%esp), %edi
803 +
804 + /* If eip is before iret_restore_end then stack
805 + hasn't been restored yet. */
806 + cmp $iret_restore_end, %eax
807 + jae 1f
808 +
809 + movl 0+4(%edi),%eax /* copy EAX */
810 + movl %eax, PT_EAX+4(%esp)
811 +
812 + lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */
813 +
814 + /* set up the copy */
815 +1: std
816 + mov $(PT_EIP+4) / 4, %ecx /* copy ret+saved regs up to orig_eax */
817 + rep movsl
818 + cld
819 +
820 + lea 4(%edi),%esp /* point esp to new frame */
821 +2: ret
822 +
823 +
824 +/*
825 + Force an event check by making a hypercall,
826 + but preserve regs before making the call.
827 + */
828 +check_events:
829 + push %eax
830 + push %ecx
831 + push %edx
832 + call force_evtchn_callback
833 + pop %edx
834 + pop %ecx
835 + pop %eax
836 + ret
837 +
838 diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
839 new file mode 100644
840 index 0000000..38443b8
841 --- /dev/null
842 +++ b/arch/x86/xen/xen-asm_64.S
843 @@ -0,0 +1,8 @@
844 +check_events:
845 + /*FIXME: implement me! */
846 + ud2a
847 +
848 +
849 +ENTRY(xen_iret_direct)
850 + /*FIXME: implement me! */
851 + ud2a
852 diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
853 index 288d587..ec5d622 100644
854 --- a/arch/x86/xen/xen-head.S
855 +++ b/arch/x86/xen/xen-head.S
856 @@ -7,12 +7,14 @@
857 #include <linux/init.h>
858 #include <asm/boot.h>
859 #include <xen/interface/elfnote.h>
860 +#include <asm/asm-hack.h>
861 +
862
863 __INIT
864 ENTRY(startup_xen)
865 - movl %esi,xen_start_info
866 + MOV rSI,xen_start_info
867 cld
868 - movl $(init_thread_union+THREAD_SIZE),%esp
869 + MOV $(init_thread_union+THREAD_SIZE),rSP
870 jmp xen_start_kernel
871
872 __FINIT
873 @@ -26,14 +28,24 @@ ENTRY(hypercall_page)
874 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
875 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6")
876 ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0")
877 +#ifdef CONFIG_X86_32
878 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET)
879 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen)
880 ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page)
881 +#else
882 + ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .quad __START_KERNEL_map)
883 + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .quad 0)
884 + ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .quad startup_64)
885 + ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad _PAGE_PRESENT,_PAGE_PRESENT)
886 +#endif
887 +
888 ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
889 -#ifdef CONFIG_X86_PAE
890 +#ifdef CONFIG_X86_32
891 +# ifdef CONFIG_X86_PAE
892 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
893 -#else
894 +# else
895 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "no")
896 +# endif
897 #endif
898 ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
899
900 diff --git a/include/asm-x86/asm-hack.h b/include/asm-x86/asm-hack.h
901 new file mode 100644
902 index 0000000..b7c2a66
903 --- /dev/null
904 +++ b/include/asm-x86/asm-hack.h
905 @@ -0,0 +1,27 @@
906 +#ifndef __ASM_ASM_HACK_H
907 +
908 +#ifdef CONFIG_X86_64
909 +# define SUFFIX q
910 +# define REGPREF r
911 +#else
912 +# define SUFFIX l
913 +# define REGPREF e
914 +#endif
915 +
916 +#define __REG(pref, reg) %pref##reg
917 +#define _REG(pref, reg) __REG(pref, reg)
918 +#define REG(reg) _REG(REGPREF, reg)
919 +
920 +#define __INSN(in, suff) in##suff
921 +#define _INSN(in, suff) __INSN(in, suff)
922 +#define INSN(in) _INSN(in, SUFFIX)
923 +
924 +#define rAX REG(ax)
925 +#define rSP REG(sp)
926 +#define rSI REG(si)
927 +#define rDI REG(di)
928 +
929 +#define MOV INSN(mov)
930 +#define AND INSN(and)
931 +
932 +#endif /* __ASM_ASM_HACK_H */
933 diff --git a/include/asm-x86/smp_64.h b/include/asm-x86/smp_64.h
934 index e0a7551..48630c1 100644
935 --- a/include/asm-x86/smp_64.h
936 +++ b/include/asm-x86/smp_64.h
937 @@ -97,5 +97,8 @@ static inline int hard_smp_processor_id(void)
938 return GET_APIC_ID(*(u32 *)(APIC_BASE + APIC_ID));
939 }
940
941 +extern void smp_store_cpu_info(int id);
942 +extern void set_cpu_sibling_map(int cpu);
943 +
944 #endif
945
946 diff --git a/include/linux/elfnote.h b/include/linux/elfnote.h
947 index 278e3ef..9f9816a 100644
948 --- a/include/linux/elfnote.h
949 +++ b/include/linux/elfnote.h
950 @@ -52,7 +52,7 @@
951 4484:.balign 4 ; \
952 .popsection ;
953
954 -#define ELFNOTE(name, type, desc) \
955 +#define ELFNOTE(name, type, desc...) \
956 ELFNOTE_START(name, type, "") \
957 desc ; \
958 ELFNOTE_END
959 diff --git a/include/xen/interface/elfnote.h b/include/xen/interface/elfnote.h
960 index a64d3df..ee5501d 100644
961 --- a/include/xen/interface/elfnote.h
962 +++ b/include/xen/interface/elfnote.h
963 @@ -120,6 +120,22 @@
964 */
965 #define XEN_ELFNOTE_BSD_SYMTAB 11
966
967 +/*
968 + * The lowest address the hypervisor hole can begin at (numeric).
969 + *
970 + * This must not be set higher than HYPERVISOR_VIRT_START. Its presence
971 + * also indicates to the hypervisor that the kernel can deal with the
972 + * hole starting at a higher address.
973 + */
974 +#define XEN_ELFNOTE_HV_START_LOW 12
975 +
976 +/*
977 + * List of maddr_t-sized mask/value pairs describing how to recognize
978 + * (non-present) L1 page table entries carrying valid MFNs (numeric).
979 + */
980 +#define XEN_ELFNOTE_L1_MFN_VALID 13
981 +
982 +
983 #endif /* __XEN_PUBLIC_ELFNOTE_H__ */
984
985 /*
986 --
987 1.5.4.1
988