Contents of /tags/kernel26-xen-2_6_25_r1-fedora9-patches/kernel26-xen/patches-2.6.25-r1/1046-2.6.25-xen-Chainsaw-party-SPLITME.patch
Revision 608
Fri May 23 12:17:32 2008 UTC (16 years, 1 month ago) by (unknown author)
File size: 29109 byte(s)
This commit was manufactured by cvs2svn to create tag 'kernel26-xen-2_6_25_r1-fedora9-patches'.
From 1355d1f32472a6617ed41fe7c4407592a96e944d Mon Sep 17 00:00:00 2001
From: Eduardo Habkost <ehabkost@redhat.com>
Date: Mon, 3 Dec 2007 17:28:38 -0200
Subject: [PATCH] Chainsaw party (SPLITME)

:D

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
---
 arch/x86/kernel/asm-offsets_64.c | 15 +++
 arch/x86/kernel/entry_32.S | 83 +---------------
 arch/x86/kernel/entry_64.S | 4 +
 arch/x86/kernel/head_64.S | 3 +
 arch/x86/kernel/smpboot_64.c | 4 +-
 arch/x86/xen/enlighten.c | 3 +
 arch/x86/xen/entry.S | 5 +
 arch/x86/xen/entry_32.S | 81 +++++++++++++++
 arch/x86/xen/entry_64.S | 1 +
 arch/x86/xen/smp.c | 16 +++
 arch/x86/xen/xen-asm.S | 210 +------------------------------------
 arch/x86/xen/xen-asm_32.S | 184 +++++++++++++++++++++++++++++++++
 arch/x86/xen/xen-asm_64.S | 8 ++
 arch/x86/xen/xen-head.S | 20 +++-
 include/asm-x86/asm-hack.h | 27 +++++
 include/asm-x86/smp_64.h | 3 +
 include/linux/elfnote.h | 2 +-
 include/xen/interface/elfnote.h | 16 +++
 18 files changed, 390 insertions(+), 295 deletions(-)
 create mode 100644 arch/x86/xen/entry.S
 create mode 100644 arch/x86/xen/entry_32.S
 create mode 100644 arch/x86/xen/entry_64.S
 create mode 100644 arch/x86/xen/xen-asm_32.S
 create mode 100644 arch/x86/xen/xen-asm_64.S
 create mode 100644 include/asm-x86/asm-hack.h

diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 494e1e0..d0fabfd 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -25,6 +25,8 @@
 #define OFFSET(sym, str, mem) \
 DEFINE(sym, offsetof(struct str, mem))

+#include <xen/interface/xen.h>
+
 #define __NO_STUBS 1
 #undef __SYSCALL
 #undef _ASM_X86_64_UNISTD_H_
@@ -92,6 +94,13 @@ int main(void)
 offsetof (struct rt_sigframe32, uc.uc_mcontext));
 BLANK();
 #endif
+
+#ifdef CONFIG_XEN
+ BLANK();
+ OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
+ OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
+#endif
+
 DEFINE(pbe_address, offsetof(struct pbe, address));
 DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address));
 DEFINE(pbe_next, offsetof(struct pbe, next));
@@ -130,6 +139,12 @@ int main(void)
 BLANK();
 DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);

+ DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
+ DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT);
+ DEFINE(PTRS_PER_PTE, PTRS_PER_PTE);
+ DEFINE(PTRS_PER_PMD, PTRS_PER_PMD);
+ DEFINE(PTRS_PER_PGD, PTRS_PER_PGD);
+
 BLANK();
 OFFSET(BP_scratch, boot_params, scratch);
 OFFSET(BP_loadflags, boot_params, hdr.loadflags);
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 4b87c32..4ef3881 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1023,88 +1023,7 @@ ENTRY(kernel_thread_helper)
 ENDPROC(kernel_thread_helper)

 #ifdef CONFIG_XEN
-ENTRY(xen_hypervisor_callback)
- CFI_STARTPROC
- pushl $0
- CFI_ADJUST_CFA_OFFSET 4
- SAVE_ALL
- TRACE_IRQS_OFF
-
- /* Check to see if we got the event in the critical
- region in xen_iret_direct, after we've reenabled
- events and checked for pending events. This simulates
- iret instruction's behaviour where it delivers a
- pending interrupt when enabling interrupts. */
- movl PT_EIP(%esp),%eax
- cmpl $xen_iret_start_crit,%eax
- jb 1f
- cmpl $xen_iret_end_crit,%eax
- jae 1f
-
- call xen_iret_crit_fixup
-
-1: mov %esp, %eax
- call xen_evtchn_do_upcall
- jmp ret_from_intr
- CFI_ENDPROC
-ENDPROC(xen_hypervisor_callback)
-
-# Hypervisor uses this for application faults while it executes.
-# We get here for two reasons:
-# 1. Fault while reloading DS, ES, FS or GS
-# 2. Fault while executing IRET
-# Category 1 we fix up by reattempting the load, and zeroing the segment
-# register if the load fails.
-# Category 2 we fix up by jumping to do_iret_error. We cannot use the
-# normal Linux return path in this case because if we use the IRET hypercall
-# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
-# We distinguish between categories by maintaining a status value in EAX.
-ENTRY(xen_failsafe_callback)
- CFI_STARTPROC
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
- movl $1,%eax
-1: mov 4(%esp),%ds
-2: mov 8(%esp),%es
-3: mov 12(%esp),%fs
-4: mov 16(%esp),%gs
- testl %eax,%eax
- popl %eax
- CFI_ADJUST_CFA_OFFSET -4
- lea 16(%esp),%esp
- CFI_ADJUST_CFA_OFFSET -16
- jz 5f
- addl $16,%esp
- jmp iret_exc # EAX != 0 => Category 2 (Bad IRET)
-5: pushl $0 # EAX == 0 => Category 1 (Bad segment)
- CFI_ADJUST_CFA_OFFSET 4
- SAVE_ALL
- jmp ret_from_exception
- CFI_ENDPROC
-
-.section .fixup,"ax"
-6: xorl %eax,%eax
- movl %eax,4(%esp)
- jmp 1b
-7: xorl %eax,%eax
- movl %eax,8(%esp)
- jmp 2b
-8: xorl %eax,%eax
- movl %eax,12(%esp)
- jmp 3b
-9: xorl %eax,%eax
- movl %eax,16(%esp)
- jmp 4b
-.previous
-.section __ex_table,"a"
- .align 4
- .long 1b,6b
- .long 2b,7b
- .long 3b,8b
- .long 4b,9b
-.previous
-ENDPROC(xen_failsafe_callback)
-
+#include "../xen/entry_32.S"
 #endif /* CONFIG_XEN */

 .section .rodata,"a"
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 556a8df..3fedbd6 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1202,3 +1202,7 @@ KPROBE_ENTRY(ignore_sysret)
 sysret
 CFI_ENDPROC
ENDPROC(ignore_sysret)
+
+#ifdef CONFIG_XEN
+#include "../xen/entry_64.S"
+#endif /* CONFIG_XEN */
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index a007454..923ad56 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -419,6 +419,9 @@ ENTRY(phys_base)
 /* This must match the first entry in level2_kernel_pgt */
 .quad 0x0000000000000000

+#include "../../x86/xen/xen-head.S"
+
+
 /* We need valid kernel segments for data and code in long mode too
 * IRET will check the segment types kkeil 2000/10/28
 * Also sysret mandates a special GDT layout
diff --git a/arch/x86/kernel/smpboot_64.c b/arch/x86/kernel/smpboot_64.c
index a47f973..08bbcad 100644
--- a/arch/x86/kernel/smpboot_64.c
+++ b/arch/x86/kernel/smpboot_64.c
@@ -144,7 +144,7 @@ static unsigned long __cpuinit setup_trampoline(void)
 * a given CPU
 */

-static void __cpuinit smp_store_cpu_info(int id)
+void __cpuinit smp_store_cpu_info(int id)
 {
 struct cpuinfo_x86 *c = &cpu_data(id);

@@ -261,7 +261,7 @@ cpumask_t cpu_coregroup_map(int cpu)
 /* representing cpus for which sibling maps can be computed */
 static cpumask_t cpu_sibling_setup_map;

-static inline void set_cpu_sibling_map(int cpu)
+inline void set_cpu_sibling_map(int cpu)
 {
 int i;
 struct cpuinfo_x86 *c = &cpu_data(cpu);
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 4b8ebb8..48a9e35 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1238,7 +1238,10 @@ asmlinkage void __init xen_start_kernel(void)
 pv_apic_ops = xen_apic_ops;
 pv_mmu_ops = xen_mmu_ops;

+#ifdef CONFIG_X86_32
+ /*FIXME: implement me! */
 machine_ops = xen_machine_ops;
+#endif

 #ifdef CONFIG_SMP
 smp_ops = xen_smp_ops;
diff --git a/arch/x86/xen/entry.S b/arch/x86/xen/entry.S
new file mode 100644
index 0000000..1e7551e
--- /dev/null
+++ b/arch/x86/xen/entry.S
@@ -0,0 +1,5 @@
+#ifdef CONFIG_X86_64
+# include "entry_64.S"
+#else
+# include "entry_32.S"
+#endif
diff --git a/arch/x86/xen/entry_32.S b/arch/x86/xen/entry_32.S
new file mode 100644
index 0000000..89109a8
--- /dev/null
+++ b/arch/x86/xen/entry_32.S
@@ -0,0 +1,81 @@
+ENTRY(xen_hypervisor_callback)
+ CFI_STARTPROC
+ pushl $0
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ TRACE_IRQS_OFF
+
+ /* Check to see if we got the event in the critical
+ region in xen_iret_direct, after we've reenabled
+ events and checked for pending events. This simulates
+ iret instruction's behaviour where it delivers a
+ pending interrupt when enabling interrupts. */
+ movl PT_EIP(%esp),%eax
+ cmpl $xen_iret_start_crit,%eax
+ jb 1f
+ cmpl $xen_iret_end_crit,%eax
+ jae 1f
+
+ call xen_iret_crit_fixup
+
+1: mov %esp, %eax
+ call xen_evtchn_do_upcall
+ jmp ret_from_intr
+ CFI_ENDPROC
+ENDPROC(xen_hypervisor_callback)
+
+# Hypervisor uses this for application faults while it executes.
+# We get here for two reasons:
+# 1. Fault while reloading DS, ES, FS or GS
+# 2. Fault while executing IRET
+# Category 1 we fix up by reattempting the load, and zeroing the segment
+# register if the load fails.
+# Category 2 we fix up by jumping to do_iret_error. We cannot use the
+# normal Linux return path in this case because if we use the IRET hypercall
+# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
+# We distinguish between categories by maintaining a status value in EAX.
+ENTRY(xen_failsafe_callback)
+ CFI_STARTPROC
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ movl $1,%eax
+1: mov 4(%esp),%ds
+2: mov 8(%esp),%es
+3: mov 12(%esp),%fs
+4: mov 16(%esp),%gs
+ testl %eax,%eax
+ popl %eax
+ CFI_ADJUST_CFA_OFFSET -4
+ lea 16(%esp),%esp
+ CFI_ADJUST_CFA_OFFSET -16
+ jz 5f
+ addl $16,%esp
+ jmp iret_exc # EAX != 0 => Category 2 (Bad IRET)
+5: pushl $0 # EAX == 0 => Category 1 (Bad segment)
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ jmp ret_from_exception
+ CFI_ENDPROC
+
+.section .fixup,"ax"
+6: xorl %eax,%eax
+ movl %eax,4(%esp)
+ jmp 1b
+7: xorl %eax,%eax
+ movl %eax,8(%esp)
+ jmp 2b
+8: xorl %eax,%eax
+ movl %eax,12(%esp)
+ jmp 3b
+9: xorl %eax,%eax
+ movl %eax,16(%esp)
+ jmp 4b
+.previous
+.section __ex_table,"a"
+ .align 4
+ .long 1b,6b
+ .long 2b,7b
+ .long 3b,8b
+ .long 4b,9b
+.previous
+ENDPROC(xen_failsafe_callback)
diff --git a/arch/x86/xen/entry_64.S b/arch/x86/xen/entry_64.S
new file mode 100644
index 0000000..c8c1473
--- /dev/null
+++ b/arch/x86/xen/entry_64.S
@@ -0,0 +1 @@
+#error foo
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index aafc544..03580bf 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -144,7 +144,10 @@ void __init xen_smp_prepare_boot_cpu(void)

 /* We've switched to the "real" per-cpu gdt, so make sure the
 old memory can be recycled */
+#ifdef CONFIG_X86_32
+ //FIXME: implement this on 64-bit
 make_lowmem_page_readwrite(&per_cpu__gdt_page);
+#endif

 for_each_possible_cpu(cpu) {
 cpus_clear(per_cpu(cpu_sibling_map, cpu));
@@ -207,6 +210,8 @@ void __init xen_smp_prepare_cpus(unsigned int max_cpus)
 static __cpuinit int
 cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 {
+/*FIXME: implement me */
+#ifdef CONFIG_X86_32
 struct vcpu_guest_context *ctxt;
 struct gdt_page *gdt = &per_cpu(gdt_page, cpu);

@@ -256,11 +261,14 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 BUG();

 kfree(ctxt);
+#endif
 return 0;
 }

 int __cpuinit xen_cpu_up(unsigned int cpu)
 {
+//FIXME: implement me!
+#ifdef CONFIG_X86_32
 struct task_struct *idle = idle_task(cpu);
 int rc;

@@ -299,6 +307,7 @@ int __cpuinit xen_cpu_up(unsigned int cpu)
 rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
 BUG_ON(rc);

+#endif
 return 0;
 }

@@ -308,6 +317,8 @@ void xen_smp_cpus_done(unsigned int max_cpus)

 static void stop_self(void *v)
 {
+//FIXME: implement me!
+#ifdef CONFIG_X86_32
 int cpu = smp_processor_id();

 /* make sure we're not pinning something down */
@@ -316,6 +327,7 @@ static void stop_self(void *v)

 HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL);
 BUG();
+#endif
 }

 void xen_smp_send_stop(void)
@@ -356,7 +368,11 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
 */
 irq_enter();
 (*func)(info);
+#ifdef CONFIG_X86_32
 __get_cpu_var(irq_stat).irq_call_count++;
+#else
+ add_pda(irq_call_count, 1);
+#endif
 irq_exit();

 if (wait) {
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
index af6abfa..1c3c0ac 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -18,33 +18,13 @@
 #include <asm/percpu.h>
 #include <asm/processor-flags.h>
 #include <asm/segment.h>
+#include <asm/asm-hack.h>

 #include <xen/interface/xen.h>

 #define RELOC(x, v) .globl x##_reloc; x##_reloc=v
 #define ENDPATCH(x) .globl x##_end; x##_end=.

-#ifdef CONFIG_X86_64
-# define SUFFIX q
-# define REGPREF r
-#else
-# define SUFFIX l
-# define REGPREF e
-#endif
-
-#define __REG(pref, reg) %pref##reg
-#define _REG(pref, reg) __REG(pref, reg)
-#define REG(reg) _REG(REGPREF, reg)
-
-#define __INSN(in, suff) in##suff
-#define _INSN(in, suff) __INSN(in, suff)
-#define INSN(in) _INSN(in, SUFFIX)
-
-#define rAX REG(ax)
-#define rSP REG(sp)
-#define MOV INSN(mov)
-#define AND INSN(and)
-

 /* Pseudo-flag used for virtual NMI, which we don't implement yet */
 #define XEN_EFLAGS_NMI 0x80000000
@@ -159,191 +139,9 @@ ENDPATCH(xen_restore_fl_direct)
 ENDPROC(xen_restore_fl_direct)
 RELOC(xen_restore_fl_direct, 2b+1)

-/*
- This is run where a normal iret would be run, with the same stack setup:
- 8: eflags
- 4: cs
- esp-> 0: eip
-
- This attempts to make sure that any pending events are dealt
- with on return to usermode, but there is a small window in
- which an event can happen just before entering usermode. If
- the nested interrupt ends up setting one of the TIF_WORK_MASK
- pending work flags, they will not be tested again before
- returning to usermode. This means that a process can end up
- with pending work, which will be unprocessed until the process
- enters and leaves the kernel again, which could be an
- unbounded amount of time. This means that a pending signal or
- reschedule event could be indefinitely delayed.
-
- The fix is to notice a nested interrupt in the critical
- window, and if one occurs, then fold the nested interrupt into
- the current interrupt stack frame, and re-process it
- iteratively rather than recursively. This means that it will
- exit via the normal path, and all pending work will be dealt
- with appropriately.
-
- Because the nested interrupt handler needs to deal with the
- current stack state in whatever form its in, we keep things
- simple by only using a single register which is pushed/popped
- on the stack.

- Non-direct iret could be done in the same way, but it would
- require an annoying amount of code duplication. We'll assume
- that direct mode will be the common case once the hypervisor
- support becomes commonplace.
- */
-ENTRY(xen_iret_direct)
- /* test eflags for special cases */
- /*FIXME: use right offset for rFLAGS */
- testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(rSP)
- jnz hyper_iret
-
- push %eax
- ESP_OFFSET=4 # bytes pushed onto stack
-
- /* Store vcpu_info pointer for easy access. Do it this
- way to avoid having to reload %fs */
-#ifdef CONFIG_SMP
- GET_THREAD_INFO(%eax)
- movl TI_cpu(%eax),%eax
- movl __per_cpu_offset(,%eax,4),%eax
- lea per_cpu__xen_vcpu_info(%eax),%eax
+#ifdef CONFIG_X86_64
+#include "xen-asm_64.S"
 #else
- movl $per_cpu__xen_vcpu_info, %eax
+#include "xen-asm_32.S"
 #endif
-
- /* check IF state we're restoring */
- testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)
-
- /* Maybe enable events. Once this happens we could get a
- recursive event, so the critical region starts immediately
- afterwards. However, if that happens we don't end up
- resuming the code, so we don't have to be worried about
- being preempted to another CPU. */
- setz XEN_vcpu_info_mask(%eax)
-xen_iret_start_crit:
-
- /* check for unmasked and pending */
- cmpw $0x0001, XEN_vcpu_info_pending(%eax)
-
- /* If there's something pending, mask events again so we
- can jump back into xen_hypervisor_callback */
- sete XEN_vcpu_info_mask(%eax)
-
- popl %eax
-
- /* From this point on the registers are restored and the stack
- updated, so we don't need to worry about it if we're preempted */
-iret_restore_end:
-
- /* Jump to hypervisor_callback after fixing up the stack.
- Events are masked, so jumping out of the critical
- region is OK. */
- je xen_hypervisor_callback
-
- iret
-xen_iret_end_crit:
-
-hyper_iret:
- /* put this out of line since its very rarely used */
- jmp hypercall_page + __HYPERVISOR_iret * 32
-
- .globl xen_iret_start_crit, xen_iret_end_crit
-
-/*
- This is called by xen_hypervisor_callback in entry.S when it sees
- that the EIP at the time of interrupt was between xen_iret_start_crit
- and xen_iret_end_crit. We're passed the EIP in %eax so we can do
- a more refined determination of what to do.
-
- The stack format at this point is:
- ----------------
- ss : (ss/esp may be present if we came from usermode)
- esp :
- eflags } outer exception info
- cs }
- eip }
- ---------------- <- edi (copy dest)
- eax : outer eax if it hasn't been restored
- ----------------
- eflags } nested exception info
- cs } (no ss/esp because we're nested
- eip } from the same ring)
- orig_eax }<- esi (copy src)
- - - - - - - - -
- fs }
- es }
- ds } SAVE_ALL state
- eax }
- : :
- ebx }
- ----------------
- return addr <- esp
- ----------------
-
- In order to deliver the nested exception properly, we need to shift
- everything from the return addr up to the error code so it
- sits just under the outer exception info. This means that when we
- handle the exception, we do it in the context of the outer exception
- rather than starting a new one.
-
- The only caveat is that if the outer eax hasn't been
- restored yet (ie, it's still on stack), we need to insert
- its value into the SAVE_ALL state before going on, since
- it's usermode state which we eventually need to restore.
- */
-ENTRY(xen_iret_crit_fixup)
- /* offsets +4 for return address */
-
- /*
- Paranoia: Make sure we're really coming from userspace.
- One could imagine a case where userspace jumps into the
- critical range address, but just before the CPU delivers a GP,
- it decides to deliver an interrupt instead. Unlikely?
- Definitely. Easy to avoid? Yes. The Intel documents
- explicitly say that the reported EIP for a bad jump is the
- jump instruction itself, not the destination, but some virtual
- environments get this wrong.
- */
- movl PT_CS+4(%esp), %ecx
- andl $SEGMENT_RPL_MASK, %ecx
- cmpl $USER_RPL, %ecx
- je 2f
-
- lea PT_ORIG_EAX+4(%esp), %esi
- lea PT_EFLAGS+4(%esp), %edi
-
- /* If eip is before iret_restore_end then stack
- hasn't been restored yet. */
- cmp $iret_restore_end, %eax
- jae 1f
-
- movl 0+4(%edi),%eax /* copy EAX */
- movl %eax, PT_EAX+4(%esp)
-
- lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */
-
- /* set up the copy */
-1: std
- mov $(PT_EIP+4) / 4, %ecx /* copy ret+saved regs up to orig_eax */
- rep movsl
- cld
-
- lea 4(%edi),%esp /* point esp to new frame */
-2: ret
-
-
-/*
- Force an event check by making a hypercall,
- but preserve regs before making the call.
- */
-check_events:
- push %eax
- push %ecx
- push %edx
- call force_evtchn_callback
- pop %edx
- pop %ecx
- pop %eax
- ret
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
new file mode 100644
index 0000000..1340296
--- /dev/null
+++ b/arch/x86/xen/xen-asm_32.S
@@ -0,0 +1,184 @@
+/*
+ This is run where a normal iret would be run, with the same stack setup:
+ 8: eflags
+ 4: cs
+ esp-> 0: eip
+
+ This attempts to make sure that any pending events are dealt
+ with on return to usermode, but there is a small window in
+ which an event can happen just before entering usermode. If
+ the nested interrupt ends up setting one of the TIF_WORK_MASK
+ pending work flags, they will not be tested again before
+ returning to usermode. This means that a process can end up
+ with pending work, which will be unprocessed until the process
+ enters and leaves the kernel again, which could be an
+ unbounded amount of time. This means that a pending signal or
+ reschedule event could be indefinitely delayed.
+
+ The fix is to notice a nested interrupt in the critical
+ window, and if one occurs, then fold the nested interrupt into
+ the current interrupt stack frame, and re-process it
+ iteratively rather than recursively. This means that it will
+ exit via the normal path, and all pending work will be dealt
+ with appropriately.
+
+ Because the nested interrupt handler needs to deal with the
+ current stack state in whatever form its in, we keep things
+ simple by only using a single register which is pushed/popped
+ on the stack.
+
+ Non-direct iret could be done in the same way, but it would
+ require an annoying amount of code duplication. We'll assume
+ that direct mode will be the common case once the hypervisor
+ support becomes commonplace.
+ */
+ENTRY(xen_iret_direct)
+ /* test eflags for special cases */
+ /*FIXME: use right offset for rFLAGS */
+ testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(rSP)
+ jnz hyper_iret
+
+ push rAX
+ ESP_OFFSET=4 # bytes pushed onto stack
+
+ /* Store vcpu_info pointer for easy access. Do it this
+ way to avoid having to reload %fs */
+ PER_CPU(xen_vcpu_info, rAX)
+
+ /* check IF state we're restoring */
+ /*FIXME: fix ESP offset */
+ /*FIXME: check WTF the magic numbers below mean */
+ testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(rSP)
+
+ /* Maybe enable events. Once this happens we could get a
+ recursive event, so the critical region starts immediately
+ afterwards. However, if that happens we don't end up
+ resuming the code, so we don't have to be worried about
+ being preempted to another CPU. */
+ setz XEN_vcpu_info_mask(rAX)
+xen_iret_start_crit:
+
+ /* check for unmasked and pending */
+ cmpw $0x0001, XEN_vcpu_info_pending(%eax)
+
+ /* If there's something pending, mask events again so we
+ can jump back into xen_hypervisor_callback */
+ sete XEN_vcpu_info_mask(%eax)
+
+ popl %eax
+
+ /* From this point on the registers are restored and the stack
+ updated, so we don't need to worry about it if we're preempted */
+iret_restore_end:
+
+ /* Jump to hypervisor_callback after fixing up the stack.
+ Events are masked, so jumping out of the critical
+ region is OK. */
+ je xen_hypervisor_callback
+
+ iret
+xen_iret_end_crit:
+
+hyper_iret:
+ /* put this out of line since its very rarely used */
+ jmp hypercall_page + __HYPERVISOR_iret * 32
+
+ .globl xen_iret_start_crit, xen_iret_end_crit
+
+/*
+ This is called by xen_hypervisor_callback in entry.S when it sees
+ that the EIP at the time of interrupt was between xen_iret_start_crit
+ and xen_iret_end_crit. We're passed the EIP in %eax so we can do
+ a more refined determination of what to do.
+
+ The stack format at this point is:
+ ----------------
+ ss : (ss/esp may be present if we came from usermode)
+ esp :
+ eflags } outer exception info
+ cs }
+ eip }
+ ---------------- <- edi (copy dest)
+ eax : outer eax if it hasn't been restored
+ ----------------
+ eflags } nested exception info
+ cs } (no ss/esp because we're nested
+ eip } from the same ring)
+ orig_eax }<- esi (copy src)
+ - - - - - - - -
+ fs }
+ es }
+ ds } SAVE_ALL state
+ eax }
+ : :
+ ebx }
+ ----------------
+ return addr <- esp
+ ----------------
+
+ In order to deliver the nested exception properly, we need to shift
+ everything from the return addr up to the error code so it
+ sits just under the outer exception info. This means that when we
+ handle the exception, we do it in the context of the outer exception
+ rather than starting a new one.
+
+ The only caveat is that if the outer eax hasn't been
+ restored yet (ie, it's still on stack), we need to insert
+ its value into the SAVE_ALL state before going on, since
+ it's usermode state which we eventually need to restore.
+ */
+ENTRY(xen_iret_crit_fixup)
+ /* offsets +4 for return address */
+
+ /*
+ Paranoia: Make sure we're really coming from userspace.
+ One could imagine a case where userspace jumps into the
+ critical range address, but just before the CPU delivers a GP,
+ it decides to deliver an interrupt instead. Unlikely?
+ Definitely. Easy to avoid? Yes. The Intel documents
+ explicitly say that the reported EIP for a bad jump is the
+ jump instruction itself, not the destination, but some virtual
+ environments get this wrong.
+ */
+ movl PT_CS+4(%esp), %ecx
+ andl $SEGMENT_RPL_MASK, %ecx
+ cmpl $USER_RPL, %ecx
+ je 2f
+
+ lea PT_ORIG_EAX+4(%esp), %esi
+ lea PT_EFLAGS+4(%esp), %edi
+
+ /* If eip is before iret_restore_end then stack
+ hasn't been restored yet. */
+ cmp $iret_restore_end, %eax
+ jae 1f
+
+ movl 0+4(%edi),%eax /* copy EAX */
+ movl %eax, PT_EAX+4(%esp)
+
+ lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */
+
+ /* set up the copy */
+1: std
+ mov $(PT_EIP+4) / 4, %ecx /* copy ret+saved regs up to orig_eax */
+ rep movsl
+ cld
+
+ lea 4(%edi),%esp /* point esp to new frame */
+2: ret
+
+
+/*
+ Force an event check by making a hypercall,
+ but preserve regs before making the call.
+ */
+check_events:
+ push %eax
+ push %ecx
+ push %edx
+ call force_evtchn_callback
+ pop %edx
+ pop %ecx
+ pop %eax
+ ret
+
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
new file mode 100644
index 0000000..38443b8
--- /dev/null
+++ b/arch/x86/xen/xen-asm_64.S
@@ -0,0 +1,8 @@
+check_events:
+ /*FIXME: implement me! */
+ ud2a
+
+
+ENTRY(xen_iret_direct)
+ /*FIXME: implement me! */
+ ud2a
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 288d587..ec5d622 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -7,12 +7,14 @@
 #include <linux/init.h>
 #include <asm/boot.h>
 #include <xen/interface/elfnote.h>
+#include <asm/asm-hack.h>
+

 __INIT
 ENTRY(startup_xen)
- movl %esi,xen_start_info
+ MOV rSI,xen_start_info
 cld
- movl $(init_thread_union+THREAD_SIZE),%esp
+ MOV $(init_thread_union+THREAD_SIZE),rSP
 jmp xen_start_kernel

 __FINIT
@@ -26,14 +28,24 @@ ENTRY(hypercall_page)
 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
 ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6")
 ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0")
+#ifdef CONFIG_X86_32
 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET)
 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen)
 ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page)
+#else
+ ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .quad __START_KERNEL_map)
+ ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .quad 0)
+ ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .quad startup_64)
+ ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad _PAGE_PRESENT,_PAGE_PRESENT)
+#endif
+
 ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
-#ifdef CONFIG_X86_PAE
+#ifdef CONFIG_X86_32
+# ifdef CONFIG_X86_PAE
 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
-#else
+# else
 ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "no")
+# endif
 #endif
 ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")

diff --git a/include/asm-x86/asm-hack.h b/include/asm-x86/asm-hack.h
new file mode 100644
index 0000000..b7c2a66
--- /dev/null
+++ b/include/asm-x86/asm-hack.h
@@ -0,0 +1,27 @@
+#ifndef __ASM_ASM_HACK_H
+
+#ifdef CONFIG_X86_64
+# define SUFFIX q
+# define REGPREF r
+#else
+# define SUFFIX l
+# define REGPREF e
+#endif
+
+#define __REG(pref, reg) %pref##reg
+#define _REG(pref, reg) __REG(pref, reg)
+#define REG(reg) _REG(REGPREF, reg)
+
+#define __INSN(in, suff) in##suff
+#define _INSN(in, suff) __INSN(in, suff)
+#define INSN(in) _INSN(in, SUFFIX)
+
+#define rAX REG(ax)
+#define rSP REG(sp)
+#define rSI REG(si)
+#define rDI REG(di)
+
+#define MOV INSN(mov)
+#define AND INSN(and)
+
+#endif /* __ASM_ASM_HACK_H */
diff --git a/include/asm-x86/smp_64.h b/include/asm-x86/smp_64.h
index e0a7551..48630c1 100644
--- a/include/asm-x86/smp_64.h
+++ b/include/asm-x86/smp_64.h
@@ -97,5 +97,8 @@ static inline int hard_smp_processor_id(void)
 return GET_APIC_ID(*(u32 *)(APIC_BASE + APIC_ID));
 }

+extern void smp_store_cpu_info(int id);
+extern void set_cpu_sibling_map(int cpu);
+
 #endif

diff --git a/include/linux/elfnote.h b/include/linux/elfnote.h
index 278e3ef..9f9816a 100644
--- a/include/linux/elfnote.h
+++ b/include/linux/elfnote.h
@@ -52,7 +52,7 @@
 4484:.balign 4 ; \
 .popsection ;

-#define ELFNOTE(name, type, desc) \
+#define ELFNOTE(name, type, desc...) \
 ELFNOTE_START(name, type, "") \
 desc ; \
 ELFNOTE_END
diff --git a/include/xen/interface/elfnote.h b/include/xen/interface/elfnote.h
index a64d3df..ee5501d 100644
--- a/include/xen/interface/elfnote.h
+++ b/include/xen/interface/elfnote.h
@@ -120,6 +120,22 @@
 */
 #define XEN_ELFNOTE_BSD_SYMTAB 11

+/*
+ * The lowest address the hypervisor hole can begin at (numeric).
+ *
+ * This must not be set higher than HYPERVISOR_VIRT_START. Its presence
+ * also indicates to the hypervisor that the kernel can deal with the
+ * hole starting at a higher address.
+ */
+#define XEN_ELFNOTE_HV_START_LOW 12
+
+/*
+ * List of maddr_t-sized mask/value pairs describing how to recognize
+ * (non-present) L1 page table entries carrying valid MFNs (numeric).
+ */
+#define XEN_ELFNOTE_L1_MFN_VALID 13
+
+
 #endif /* __XEN_PUBLIC_ELFNOTE_H__ */

 /*
--
1.5.4.1
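
A note on the include/asm-x86/asm-hack.h header that this patch introduces: it uses the standard two-level C-preprocessor expansion trick, where REG() and INSN() route through _REG()/__REG() and _INSN()/__INSN() so that REGPREF and SUFFIX are expanded to their values before ## pastes the tokens together, yielding %eax/movl on 32-bit and %rax/movq on 64-bit builds. The stand-alone user-space sketch below is not part of the patch; it mirrors those macros in isolation, with STR()/STR2() stringification helpers and a printf driver added purely to make the expansions visible:

/* Illustration of the asm-hack.h token-pasting macros (assumption:
 * copied from the header added above; the STR helpers and main() are
 * extra scaffolding for printing, not kernel code).
 * Build 32-bit style: cc demo.c && ./a.out
 * Build 64-bit style: cc -DCONFIG_X86_64 demo.c && ./a.out */
#include <stdio.h>

#ifdef CONFIG_X86_64
# define SUFFIX q
# define REGPREF r
#else
# define SUFFIX l
# define REGPREF e
#endif

/* Two-level expansion: the inner macro pastes, the outer level
 * ensures REGPREF/SUFFIX are expanded to their values first. */
#define __REG(pref, reg) %pref##reg
#define _REG(pref, reg) __REG(pref, reg)
#define REG(reg) _REG(REGPREF, reg)

#define __INSN(in, suff) in##suff
#define _INSN(in, suff) __INSN(in, suff)
#define INSN(in) _INSN(in, SUFFIX)

#define rAX REG(ax)
#define rSP REG(sp)
#define MOV INSN(mov)

/* Stringify the fully expanded token sequence so printf can show it. */
#define STR2(x) #x
#define STR(x) STR2(x)

int main(void)
{
    printf("rAX -> %s\n", STR(rAX)); /* %eax or %rax */
    printf("rSP -> %s\n", STR(rSP)); /* %esp or %rsp */
    printf("MOV -> %s\n", STR(MOV)); /* movl or movq */
    return 0;
}

Compiled plain, this prints %eax/%esp/movl; compiled with -DCONFIG_X86_64 it prints %rax/%rsp/movq, which is exactly how the shared xen-asm.S and xen-head.S code above assembles to the right width on both architectures.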