Annotation of /trunk/kernel-alx/patches-4.9/0174-4.9.75-all-fixes.patch

Revision 3063 - Wed Jan 10 10:33:48 2018 UTC (6 years, 8 months ago) by niro
File size: 79490 byte(s)

-linux-4.9.75
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 152ec4e87b57..5d2676d043de 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2763,6 +2763,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 
 	nojitter	[IA-64] Disables jitter checking for ITC timers.
 
+	nopti		[X86-64] Disable KAISER isolation of kernel from user.
+
 	no-kvmclock	[X86,KVM] Disable paravirtualized KVM clock driver
 
 	no-kvmapf	[X86,KVM] Disable paravirtualized asynchronous page
@@ -3325,6 +3327,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 	pt.		[PARIDE]
 			See Documentation/blockdev/paride.txt.
 
+	pti=		[X86_64]
+			Control KAISER user/kernel address space isolation:
+			on - enable
+			off - disable
+			auto - default setting
+
 	pty.legacy_count=
 			[KNL] Number of legacy pty's. Overwrites compiled-in
 			default number.
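
The pti= and nopti options documented above are consumed at boot by kaiser_check_boottime_disable(), which is declared later in this patch; its body lives in arch/x86/mm/kaiser.c and is not part of this excerpt. A rough sketch of how such a check can be built on the cmdline_find_option()/cmdline_find_option_bool() helpers this patch adds -- illustrative only, not the verbatim kernel code:

    /* Illustrative sketch; the real kaiser_check_boottime_disable()
     * is in arch/x86/mm/kaiser.c and is not shown in this excerpt. */
    static void __init pti_cmdline_check(void)
    {
            char arg[5];
            int ret;

            if (cmdline_find_option_bool(boot_command_line, "nopti"))
                    goto disable;

            ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
            if (ret == 3 && !strncmp(arg, "off", 3))
                    goto disable;
            return;         /* "on" and "auto" keep isolation enabled */

    disable:
            kaiser_enabled = 0;
    }
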
diff --git a/Makefile b/Makefile
index 075e429732e7..acbc1b032db2 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 VERSION = 4
 PATCHLEVEL = 9
-SUBLEVEL = 74
+SUBLEVEL = 75
 EXTRAVERSION =
 NAME = Roaring Lionus
 
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 766a5211f827..2728e1b7e4a6 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -9,6 +9,7 @@
  */
 #undef CONFIG_PARAVIRT
 #undef CONFIG_PARAVIRT_SPINLOCKS
+#undef CONFIG_PAGE_TABLE_ISOLATION
 #undef CONFIG_KASAN
 
 #include <linux/linkage.h>
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index e7b0e7ff4c58..af4e58132d91 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -36,6 +36,7 @@
 #include <asm/smap.h>
 #include <asm/pgtable_types.h>
 #include <asm/export.h>
+#include <asm/kaiser.h>
 #include <linux/err.h>
 
 /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
@@ -146,6 +147,7 @@ ENTRY(entry_SYSCALL_64)
 	 * it is too small to ever cause noticeable irq latency.
	 */
	SWAPGS_UNSAFE_STACK
+	SWITCH_KERNEL_CR3_NO_STACK
	/*
	 * A hypervisor implementation might want to use a label
	 * after the swapgs, so that it can do the swapgs
@@ -228,6 +230,14 @@ entry_SYSCALL_64_fastpath:
	movq	RIP(%rsp), %rcx
	movq	EFLAGS(%rsp), %r11
	RESTORE_C_REGS_EXCEPT_RCX_R11
+	/*
+	 * This opens a window where we have a user CR3, but are
+	 * running in the kernel. This makes using the CS
+	 * register useless for telling whether or not we need to
+	 * switch CR3 in NMIs. Normal interrupts are OK because
+	 * they are off here.
+	 */
+	SWITCH_USER_CR3
	movq	RSP(%rsp), %rsp
	USERGS_SYSRET64
 
@@ -323,10 +333,26 @@ return_from_SYSCALL_64:
 syscall_return_via_sysret:
	/* rcx and r11 are already restored (see code above) */
	RESTORE_C_REGS_EXCEPT_RCX_R11
+	/*
+	 * This opens a window where we have a user CR3, but are
+	 * running in the kernel. This makes using the CS
+	 * register useless for telling whether or not we need to
+	 * switch CR3 in NMIs. Normal interrupts are OK because
+	 * they are off here.
+	 */
+	SWITCH_USER_CR3
	movq	RSP(%rsp), %rsp
	USERGS_SYSRET64
 
 opportunistic_sysret_failed:
+	/*
+	 * This opens a window where we have a user CR3, but are
+	 * running in the kernel. This makes using the CS
+	 * register useless for telling whether or not we need to
+	 * switch CR3 in NMIs. Normal interrupts are OK because
+	 * they are off here.
+	 */
+	SWITCH_USER_CR3
	SWAPGS
	jmp	restore_c_regs_and_iret
 END(entry_SYSCALL_64)
@@ -424,6 +450,7 @@ ENTRY(ret_from_fork)
	movq	%rsp, %rdi
	call	syscall_return_slowpath	/* returns with IRQs disabled */
	TRACE_IRQS_ON			/* user mode is traced as IRQS on */
+	SWITCH_USER_CR3
	SWAPGS
	jmp	restore_regs_and_iret
 
@@ -478,6 +505,7 @@ END(irq_entries_start)
	 * tracking that we're in kernel mode.
	 */
	SWAPGS
+	SWITCH_KERNEL_CR3
 
	/*
	 * We need to tell lockdep that IRQs are off. We can't do this until
@@ -535,6 +563,7 @@ GLOBAL(retint_user)
	mov	%rsp,%rdi
	call	prepare_exit_to_usermode
	TRACE_IRQS_IRETQ
+	SWITCH_USER_CR3
	SWAPGS
	jmp	restore_regs_and_iret
 
@@ -612,6 +641,7 @@ native_irq_return_ldt:
 
	pushq	%rdi				/* Stash user RDI */
	SWAPGS
+	SWITCH_KERNEL_CR3
	movq	PER_CPU_VAR(espfix_waddr), %rdi
	movq	%rax, (0*8)(%rdi)		/* user RAX */
	movq	(1*8)(%rsp), %rax		/* user RIP */
@@ -638,6 +668,7 @@ native_irq_return_ldt:
	 * still points to an RO alias of the ESPFIX stack.
	 */
	orq	PER_CPU_VAR(espfix_stack), %rax
+	SWITCH_USER_CR3
	SWAPGS
	movq	%rax, %rsp
 
@@ -1022,7 +1053,11 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vec
 /*
  * Save all registers in pt_regs, and switch gs if needed.
  * Use slow, but surefire "are we in kernel?" check.
- * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
+ *
+ * Return: ebx=0: needs swapgs but not SWITCH_USER_CR3 in paranoid_exit
+ *         ebx=1: needs neither swapgs nor SWITCH_USER_CR3 in paranoid_exit
+ *         ebx=2: needs both swapgs and SWITCH_USER_CR3 in paranoid_exit
+ *         ebx=3: needs SWITCH_USER_CR3 but not swapgs in paranoid_exit
  */
 ENTRY(paranoid_entry)
	cld
@@ -1035,7 +1070,26 @@ ENTRY(paranoid_entry)
	js	1f				/* negative -> in kernel */
	SWAPGS
	xorl	%ebx, %ebx
-1:	ret
+1:
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	/*
+	 * We might have come in between a swapgs and a SWITCH_KERNEL_CR3
+	 * on entry, or between a SWITCH_USER_CR3 and a swapgs on exit.
+	 * Do a conditional SWITCH_KERNEL_CR3: this could safely be done
+	 * unconditionally, but we need to find out whether the reverse
+	 * should be done on return (conveyed to paranoid_exit in %ebx).
+	 */
+	ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
+	testl	$KAISER_SHADOW_PGD_OFFSET, %eax
+	jz	2f
+	orl	$2, %ebx
+	andq	$(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
+	/* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */
+	ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
+	movq	%rax, %cr3
+2:
+#endif
+	ret
 END(paranoid_entry)
 
 /*
@@ -1048,19 +1102,26 @@ END(paranoid_entry)
  * be complicated. Fortunately, there's no good reason
  * to try to handle preemption here.
  *
- * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
+ * On entry: ebx=0: needs swapgs but not SWITCH_USER_CR3
+ *           ebx=1: needs neither swapgs nor SWITCH_USER_CR3
+ *           ebx=2: needs both swapgs and SWITCH_USER_CR3
+ *           ebx=3: needs SWITCH_USER_CR3 but not swapgs
  */
 ENTRY(paranoid_exit)
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF_DEBUG
-	testl	%ebx, %ebx			/* swapgs needed? */
+	TRACE_IRQS_IRETQ_DEBUG
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	/* No ALTERNATIVE for X86_FEATURE_KAISER: paranoid_entry sets %ebx */
+	testl	$2, %ebx			/* SWITCH_USER_CR3 needed? */
+	jz	paranoid_exit_no_switch
+	SWITCH_USER_CR3
+paranoid_exit_no_switch:
+#endif
+	testl	$1, %ebx			/* swapgs needed? */
	jnz	paranoid_exit_no_swapgs
-	TRACE_IRQS_IRETQ
	SWAPGS_UNSAFE_STACK
-	jmp	paranoid_exit_restore
 paranoid_exit_no_swapgs:
-	TRACE_IRQS_IRETQ_DEBUG
-paranoid_exit_restore:
	RESTORE_EXTRA_REGS
	RESTORE_C_REGS
	REMOVE_PT_GPREGS_FROM_STACK 8
@@ -1075,6 +1136,13 @@ ENTRY(error_entry)
	cld
	SAVE_C_REGS 8
	SAVE_EXTRA_REGS 8
+	/*
+	 * error_entry() always returns with a kernel gsbase and
+	 * CR3. We must also have a kernel CR3/gsbase before
+	 * calling TRACE_IRQS_*. Just unconditionally switch to
+	 * the kernel CR3 here.
+	 */
+	SWITCH_KERNEL_CR3
	xorl	%ebx, %ebx
	testb	$3, CS+8(%rsp)
	jz	.Lerror_kernelspace
@@ -1235,6 +1303,10 @@ ENTRY(nmi)
	 */
 
	SWAPGS_UNSAFE_STACK
+	/*
+	 * percpu variables are mapped with user CR3, so no need
+	 * to switch CR3 here.
+	 */
	cld
	movq	%rsp, %rdx
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
@@ -1268,12 +1340,34 @@ ENTRY(nmi)
 
	movq	%rsp, %rdi
	movq	$-1, %rsi
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	/* Unconditionally use kernel CR3 for do_nmi() */
+	/* %rax is saved above, so OK to clobber here */
+	ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
+	/* If PCID enabled, NOFLUSH now and NOFLUSH on return */
+	ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
+	pushq	%rax
+	/* mask off "user" bit of pgd address and 12 PCID bits: */
+	andq	$(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
+	movq	%rax, %cr3
+2:
+#endif
	call	do_nmi
 
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	/*
+	 * Unconditionally restore CR3. I know we return to
+	 * kernel code that needs user CR3, but do we ever return
+	 * to "user mode" where we need the kernel CR3?
+	 */
+	ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER
+#endif
+
	/*
	 * Return back to user mode. We must *not* do the normal exit
-	 * work, because we don't want to enable interrupts. Fortunately,
-	 * do_nmi doesn't modify pt_regs.
+	 * work, because we don't want to enable interrupts. Do not
+	 * switch to user CR3: we might be going back to kernel code
+	 * that had a user CR3 set.
	 */
	SWAPGS
	jmp	restore_c_regs_and_iret
@@ -1470,22 +1564,55 @@ end_repeat_nmi:
	ALLOC_PT_GPREGS_ON_STACK
 
	/*
-	 * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
-	 * as we should not be calling schedule in NMI context.
-	 * Even with normal interrupts enabled. An NMI should not be
-	 * setting NEED_RESCHED or anything that normal interrupts and
-	 * exceptions might do.
+	 * Use the same approach as paranoid_entry to handle SWAPGS, but
+	 * without CR3 handling since we do that differently in NMIs. No
+	 * need to use paranoid_exit as we should not be calling schedule
+	 * in NMI context. Even with normal interrupts enabled. An NMI
+	 * should not be setting NEED_RESCHED or anything that normal
+	 * interrupts and exceptions might do.
	 */
-	call	paranoid_entry
-
-	/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
+	cld
+	SAVE_C_REGS
+	SAVE_EXTRA_REGS
+	movl	$1, %ebx
+	movl	$MSR_GS_BASE, %ecx
+	rdmsr
+	testl	%edx, %edx
+	js	1f				/* negative -> in kernel */
+	SWAPGS
+	xorl	%ebx, %ebx
+1:
	movq	%rsp, %rdi
	movq	$-1, %rsi
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	/* Unconditionally use kernel CR3 for do_nmi() */
+	/* %rax is saved above, so OK to clobber here */
+	ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
+	/* If PCID enabled, NOFLUSH now and NOFLUSH on return */
+	ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
+	pushq	%rax
+	/* mask off "user" bit of pgd address and 12 PCID bits: */
+	andq	$(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
+	movq	%rax, %cr3
+2:
+#endif
+
+	/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
	call	do_nmi
 
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	/*
+	 * Unconditionally restore CR3. We might be returning to
+	 * kernel code that needs user CR3, like just before
+	 * a sysret.
+	 */
+	ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER
+#endif
+
	testl	%ebx, %ebx			/* swapgs needed? */
	jnz	nmi_restore
 nmi_swapgs:
+	/* We fixed up CR3 above, so no need to switch it here */
	SWAPGS_UNSAFE_STACK
 nmi_restore:
	RESTORE_EXTRA_REGS
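
The four-way %ebx protocol shared by paranoid_entry and paranoid_exit above packs two independent facts into two bits: bit 0 set means SWAPGS must be skipped on exit, bit 1 set means the entry path switched CR3 and SWITCH_USER_CR3 must undo it. A C model of the encoding, purely illustrative (the real state lives in the %ebx register across the assembly paths above):

    /* Illustrative C model of the paranoid_entry/paranoid_exit %ebx flags. */
    #define PARANOID_SKIP_SWAPGS    0x1     /* bit 0: GS base was already the kernel's */
    #define PARANOID_RESTORE_CR3    0x2     /* bit 1: entry did SWITCH_KERNEL_CR3 */

    static unsigned int paranoid_flags(int gs_was_kernel, int cr3_was_user)
    {
            unsigned int ebx = 0;

            if (gs_was_kernel)
                    ebx |= PARANOID_SKIP_SWAPGS;    /* ebx=1: no swapgs on exit */
            if (cr3_was_user)
                    ebx |= PARANOID_RESTORE_CR3;    /* ebx|=2: SWITCH_USER_CR3 on exit */

            return ebx;     /* 0..3, the four cases listed in the comments above */
    }
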
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index e1721dafbcb1..d76a97653980 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -13,6 +13,8 @@
 #include <asm/irqflags.h>
 #include <asm/asm.h>
 #include <asm/smap.h>
+#include <asm/pgtable_types.h>
+#include <asm/kaiser.h>
 #include <linux/linkage.h>
 #include <linux/err.h>
 
@@ -48,6 +50,7 @@
 ENTRY(entry_SYSENTER_compat)
	/* Interrupts are off on entry. */
	SWAPGS_UNSAFE_STACK
+	SWITCH_KERNEL_CR3_NO_STACK
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
	/*
@@ -184,6 +187,7 @@ ENDPROC(entry_SYSENTER_compat)
 ENTRY(entry_SYSCALL_compat)
	/* Interrupts are off on entry. */
	SWAPGS_UNSAFE_STACK
+	SWITCH_KERNEL_CR3_NO_STACK
 
	/* Stash user ESP and switch to the kernel stack. */
	movl	%esp, %r8d
@@ -259,6 +263,7 @@ sysret32_from_system_call:
	xorq	%r8, %r8
	xorq	%r9, %r9
	xorq	%r10, %r10
+	SWITCH_USER_CR3
	movq	RSP-ORIG_RAX(%rsp), %rsp
	swapgs
	sysretl
@@ -297,7 +302,7 @@ ENTRY(entry_INT80_compat)
	PARAVIRT_ADJUST_EXCEPTION_FRAME
	ASM_CLAC			/* Do this early to minimize exposure */
	SWAPGS
-
+	SWITCH_KERNEL_CR3_NO_STACK
	/*
	 * User tracing code (ptrace or signal handlers) might assume that
	 * the saved RAX contains a 32-bit number when we're invoking a 32-bit
@@ -338,6 +343,7 @@ ENTRY(entry_INT80_compat)
 
	/* Go back to user mode. */
	TRACE_IRQS_ON
+	SWITCH_USER_CR3
	SWAPGS
	jmp	restore_regs_and_iret
 END(entry_INT80_compat)
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 9dfeeeca0ea8..8e7a3f1df3a5 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -2,11 +2,15 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 
+#include <asm/kaiser.h>
 #include <asm/perf_event.h>
 #include <asm/insn.h>
 
 #include "../perf_event.h"
 
+static
+DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct debug_store, cpu_debug_store);
+
 /* The size of a BTS record in bytes: */
 #define BTS_RECORD_SIZE		24
 
@@ -268,6 +272,39 @@ void fini_debug_store_on_cpu(int cpu)
 
 static DEFINE_PER_CPU(void *, insn_buffer);
 
+static void *dsalloc(size_t size, gfp_t flags, int node)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	unsigned int order = get_order(size);
+	struct page *page;
+	unsigned long addr;
+
+	page = __alloc_pages_node(node, flags | __GFP_ZERO, order);
+	if (!page)
+		return NULL;
+	addr = (unsigned long)page_address(page);
+	if (kaiser_add_mapping(addr, size, __PAGE_KERNEL) < 0) {
+		__free_pages(page, order);
+		addr = 0;
+	}
+	return (void *)addr;
+#else
+	return kmalloc_node(size, flags | __GFP_ZERO, node);
+#endif
+}
+
+static void dsfree(const void *buffer, size_t size)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	if (!buffer)
+		return;
+	kaiser_remove_mapping((unsigned long)buffer, size);
+	free_pages((unsigned long)buffer, get_order(size));
+#else
+	kfree(buffer);
+#endif
+}
+
 static int alloc_pebs_buffer(int cpu)
 {
	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
@@ -278,7 +315,7 @@ static int alloc_pebs_buffer(int cpu)
	if (!x86_pmu.pebs)
		return 0;
 
-	buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
+	buffer = dsalloc(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
	if (unlikely(!buffer))
		return -ENOMEM;
 
@@ -289,7 +326,7 @@ static int alloc_pebs_buffer(int cpu)
	if (x86_pmu.intel_cap.pebs_format < 2) {
		ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
		if (!ibuffer) {
-			kfree(buffer);
+			dsfree(buffer, x86_pmu.pebs_buffer_size);
			return -ENOMEM;
		}
		per_cpu(insn_buffer, cpu) = ibuffer;
@@ -315,7 +352,8 @@ static void release_pebs_buffer(int cpu)
	kfree(per_cpu(insn_buffer, cpu));
	per_cpu(insn_buffer, cpu) = NULL;
 
-	kfree((void *)(unsigned long)ds->pebs_buffer_base);
+	dsfree((void *)(unsigned long)ds->pebs_buffer_base,
+			x86_pmu.pebs_buffer_size);
	ds->pebs_buffer_base = 0;
 }
 
@@ -329,7 +367,7 @@ static int alloc_bts_buffer(int cpu)
	if (!x86_pmu.bts)
		return 0;
 
-	buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
+	buffer = dsalloc(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
	if (unlikely(!buffer)) {
		WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
		return -ENOMEM;
@@ -355,19 +393,15 @@ static void release_bts_buffer(int cpu)
	if (!ds || !x86_pmu.bts)
		return;
 
-	kfree((void *)(unsigned long)ds->bts_buffer_base);
+	dsfree((void *)(unsigned long)ds->bts_buffer_base, BTS_BUFFER_SIZE);
	ds->bts_buffer_base = 0;
 }
 
 static int alloc_ds_buffer(int cpu)
 {
-	int node = cpu_to_node(cpu);
-	struct debug_store *ds;
-
-	ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
-	if (unlikely(!ds))
-		return -ENOMEM;
+	struct debug_store *ds = per_cpu_ptr(&cpu_debug_store, cpu);
 
+	memset(ds, 0, sizeof(*ds));
	per_cpu(cpu_hw_events, cpu).ds = ds;
 
	return 0;
@@ -381,7 +415,6 @@ static void release_ds_buffer(int cpu)
		return;
 
	per_cpu(cpu_hw_events, cpu).ds = NULL;
-	kfree(ds);
 }
 
 void release_ds_buffers(void)
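
The dsalloc()/dsfree() pair above captures the general recipe for any buffer the CPU must still be able to write while the user CR3 is live: allocate whole zeroed pages, mirror them into the shadow page tables with kaiser_add_mapping(), and tear the mapping down before freeing. A hedged sketch of the same pattern for a hypothetical buffer (my_buf and MY_BUF_SIZE are invented names for illustration, not part of this patch):

    /* Hypothetical use of the dsalloc()/dsfree() pattern above. */
    #define MY_BUF_SIZE PAGE_SIZE

    static void *my_buf;

    static int my_buf_init(int node)
    {
            /* Pages come back zeroed and already present in the shadow tables. */
            my_buf = dsalloc(MY_BUF_SIZE, GFP_KERNEL, node);
            return my_buf ? 0 : -ENOMEM;
    }

    static void my_buf_exit(void)
    {
            dsfree(my_buf, MY_BUF_SIZE);    /* also removes the shadow mapping */
            my_buf = NULL;
    }
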
diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h
index e01f7f7ccb0c..84ae170bc3d0 100644
--- a/arch/x86/include/asm/cmdline.h
+++ b/arch/x86/include/asm/cmdline.h
@@ -2,5 +2,7 @@
 #define _ASM_X86_CMDLINE_H
 
 int cmdline_find_option_bool(const char *cmdline_ptr, const char *option);
+int cmdline_find_option(const char *cmdline_ptr, const char *option,
+			char *buffer, int bufsize);
 
 #endif /* _ASM_X86_CMDLINE_H */
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index ed10b5bf9b93..454a37adb823 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -189,6 +189,7 @@
 
 #define X86_FEATURE_CPB		( 7*32+ 2) /* AMD Core Performance Boost */
 #define X86_FEATURE_EPB		( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
+#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 4) /* Effectively INVPCID && CR4.PCIDE=1 */
 
 #define X86_FEATURE_HW_PSTATE	( 7*32+ 8) /* AMD HW-PState */
 #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
@@ -197,6 +198,9 @@
 #define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */
 #define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */
 
+/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */
+#define X86_FEATURE_KAISER	( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */
+
 /* Virtualization flags: Linux defined, word 8 */
 #define X86_FEATURE_TPR_SHADOW  ( 8*32+ 0) /* Intel TPR Shadow */
 #define X86_FEATURE_VNMI        ( 8*32+ 1) /* Intel Virtual NMI */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 12080d87da3b..2ed5a2b3f8f7 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -43,7 +43,7 @@ struct gdt_page {
	struct desc_struct gdt[GDT_ENTRIES];
 } __attribute__((aligned(PAGE_SIZE)));
 
-DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
+DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page);
 
 static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
 {
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index b90e1053049b..0817d63bce41 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -178,7 +178,7 @@ extern char irq_entries_start[];
 #define VECTOR_RETRIGGERED	((void *)~0UL)
 
 typedef struct irq_desc* vector_irq_t[NR_VECTORS];
-DECLARE_PER_CPU(vector_irq_t, vector_irq);
+DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq);
 
 #endif /* !ASSEMBLY_ */
 
diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h
new file mode 100644
index 000000000000..802bbbdfe143
--- /dev/null
+++ b/arch/x86/include/asm/kaiser.h
@@ -0,0 +1,141 @@
+#ifndef _ASM_X86_KAISER_H
+#define _ASM_X86_KAISER_H
+
+#include <uapi/asm/processor-flags.h> /* For PCID constants */
+
+/*
+ * This file includes the definitions for the KAISER feature.
+ * KAISER is a counter measure against x86_64 side channel attacks on
+ * the kernel virtual memory. It has a shadow pgd for every process: the
+ * shadow pgd has a minimalistic kernel-set mapped, but includes the whole
+ * user memory. Within a kernel context switch, or when an interrupt is handled,
+ * the pgd is switched to the normal one. When the system switches to user mode,
+ * the shadow pgd is enabled. By this, the virtual memory caches are freed,
+ * and the user may not attack the whole kernel memory.
+ *
+ * A minimalistic kernel mapping holds the parts needed to be mapped in user
+ * mode, such as the entry/exit functions of the user space, or the stacks.
+ */
+
+#define KAISER_SHADOW_PGD_OFFSET 0x1000
+
+#ifdef __ASSEMBLY__
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+
+.macro _SWITCH_TO_KERNEL_CR3 reg
+movq %cr3, \reg
+andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg
+/* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */
+ALTERNATIVE "", "bts $63, \reg", X86_FEATURE_PCID
+movq \reg, %cr3
+.endm
+
+.macro _SWITCH_TO_USER_CR3 reg regb
+/*
+ * regb must be the low byte portion of reg: because we have arranged
+ * for the low byte of the user PCID to serve as the high byte of NOFLUSH
+ * (0x80 for each when PCID is enabled, or 0x00 when PCID and NOFLUSH are
+ * not enabled): so that the one register can update both memory and cr3.
+ */
+movq %cr3, \reg
+orq  PER_CPU_VAR(x86_cr3_pcid_user), \reg
+js   9f
+/* If PCID enabled, FLUSH this time, reset to NOFLUSH for next time */
+movb \regb, PER_CPU_VAR(x86_cr3_pcid_user+7)
+9:
+movq \reg, %cr3
+.endm
+
+.macro SWITCH_KERNEL_CR3
+ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER
+_SWITCH_TO_KERNEL_CR3 %rax
+popq %rax
+8:
+.endm
+
+.macro SWITCH_USER_CR3
+ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER
+_SWITCH_TO_USER_CR3 %rax %al
+popq %rax
+8:
+.endm
+
+.macro SWITCH_KERNEL_CR3_NO_STACK
+ALTERNATIVE "jmp 8f", \
+	__stringify(movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)), \
+	X86_FEATURE_KAISER
+_SWITCH_TO_KERNEL_CR3 %rax
+movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
+8:
+.endm
+
+#else /* CONFIG_PAGE_TABLE_ISOLATION */
+
+.macro SWITCH_KERNEL_CR3
+.endm
+.macro SWITCH_USER_CR3
+.endm
+.macro SWITCH_KERNEL_CR3_NO_STACK
+.endm
+
+#endif /* CONFIG_PAGE_TABLE_ISOLATION */
+
+#else /* __ASSEMBLY__ */
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+/*
+ * Upon kernel/user mode switch, it may happen that the address
+ * space has to be switched before the registers have been
+ * stored. To change the address space, another register is
+ * needed. A register therefore has to be stored/restored.
+*/
+DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+
+DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user);
+
+extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
+
+extern int kaiser_enabled;
+extern void __init kaiser_check_boottime_disable(void);
+#else
+#define kaiser_enabled	0
+static inline void __init kaiser_check_boottime_disable(void) {}
+#endif /* CONFIG_PAGE_TABLE_ISOLATION */
+
+/*
+ * Kaiser function prototypes are needed even when CONFIG_PAGE_TABLE_ISOLATION is not set,
+ * so as to build with tests on kaiser_enabled instead of #ifdefs.
+ */
+
+/**
+ * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
+ * @addr: the start address of the range
+ * @size: the size of the range
+ * @flags: The mapping flags of the pages
+ *
+ * The mapping is done on a global scope, so no further
+ * synchronization has to be done. The pages have to be
+ * manually unmapped again when they are not needed any longer.
+ */
+extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
+
+/**
+ * kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping
+ * @addr: the start address of the range
+ * @size: the size of the range
+ */
+extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
+
+/**
+ * kaiser_init - Initialize the shadow mapping
+ *
+ * Most parts of the shadow mapping can be mapped upon boot
+ * time. Only per-process things like the thread stacks
+ * or a new LDT have to be mapped at runtime. These boot-
+ * time mappings are permanent and never unmapped.
+ */
+extern void kaiser_init(void);
+
+#endif /* __ASSEMBLY */
+
+#endif /* _ASM_X86_KAISER_H */
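
The kaiser.h macros above encode everything in the CR3 value itself: the kernel and shadow pgd pages form an 8k-aligned pair, so bit 12 (KAISER_SHADOW_PGD_OFFSET, 0x1000) selects the user copy, the low 12 bits carry the PCID, and bit 63 is the NOFLUSH hint. A C model of the same bit manipulation, illustrative only (the real switches are the assembly macros above; the EX_ names are invented for the example):

    /* C model of _SWITCH_TO_KERNEL_CR3 / _SWITCH_TO_USER_CR3; illustrative. */
    #include <stdint.h>

    #define EX_SHADOW_PGD_OFFSET    0x1000ULL       /* bit 12 selects the user pgd page */
    #define EX_PCID_ASID_MASK       0xfffULL        /* low 12 bits carry the PCID */
    #define EX_PCID_NOFLUSH         (1ULL << 63)    /* "keep TLB entries" hint */

    static uint64_t ex_to_kernel_cr3(uint64_t cr3, int have_pcid)
    {
            cr3 &= ~(EX_PCID_ASID_MASK | EX_SHADOW_PGD_OFFSET); /* kernel pgd, ASID 0 */
            if (have_pcid)
                    cr3 |= EX_PCID_NOFLUSH;         /* matches the "bts $63" alternative */
            return cr3;
    }

    static uint64_t ex_to_user_cr3(uint64_t cr3, uint64_t x86_cr3_pcid_user)
    {
            /* x86_cr3_pcid_user = shadow offset | user ASID | (maybe) NOFLUSH */
            return cr3 | x86_cr3_pcid_user;
    }
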
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 437feb436efa..2536f90cd30c 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -18,6 +18,12 @@
 #ifndef __ASSEMBLY__
 #include <asm/x86_init.h>
 
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+extern int kaiser_enabled;
+#else
+#define kaiser_enabled 0
+#endif
+
 void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
 void ptdump_walk_pgd_level_checkwx(void);
 
@@ -690,7 +696,17 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)
 
 static inline int pgd_bad(pgd_t pgd)
 {
-	return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
+	pgdval_t ignore_flags = _PAGE_USER;
+	/*
+	 * We set NX on KAISER pgds that map userspace memory so
+	 * that userspace can not meaningfully use the kernel
+	 * page table by accident; it will fault on the first
+	 * instruction it tries to run. See native_set_pgd().
+	 */
+	if (kaiser_enabled)
+		ignore_flags |= _PAGE_NX;
+
+	return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
 }
 
 static inline int pgd_none(pgd_t pgd)
@@ -903,7 +919,15 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
 */
 static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
 {
-	memcpy(dst, src, count * sizeof(pgd_t));
+	memcpy(dst, src, count * sizeof(pgd_t));
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	if (kaiser_enabled) {
+		/* Clone the shadow pgd part as well */
+		memcpy(native_get_shadow_pgd(dst),
+			native_get_shadow_pgd(src),
+			count * sizeof(pgd_t));
+	}
+#endif
 }
 
 #define PTE_SHIFT ilog2(PTRS_PER_PTE)
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 1cc82ece9ac1..ce97c8c6a310 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -106,9 +106,32 @@ static inline void native_pud_clear(pud_t *pud)
	native_set_pud(pud, native_make_pud(0));
 }
 
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd);
+
+static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
+{
+#ifdef CONFIG_DEBUG_VM
+	/* linux/mmdebug.h may not have been included at this point */
+	BUG_ON(!kaiser_enabled);
+#endif
+	return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE);
+}
+#else
+static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+	return pgd;
+}
+static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
+{
+	BUILD_BUG_ON(1);
+	return NULL;
+}
+#endif /* CONFIG_PAGE_TABLE_ISOLATION */
+
 static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
 {
-	*pgdp = pgd;
+	*pgdp = kaiser_set_shadow_pgd(pgdp, pgd);
 }
 
 static inline void native_pgd_clear(pgd_t *pgd)
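
native_get_shadow_pgd() above relies on the 8k-aligned pgd pair set up in head_64.S later in this patch: ORing PAGE_SIZE into an entry's address lands on the same slot in the user copy, exactly one page higher. A small illustrative demonstration (not kernel code; the EX_ names are invented):

    /* Demonstrates the "pgdp | PAGE_SIZE" trick; illustrative only. */
    #include <assert.h>
    #include <stdint.h>

    #define EX_PAGE_SIZE 4096UL

    static uintptr_t shadow_slot(uintptr_t pgdp)
    {
            /* Valid because each pgd pair is 2*PAGE_SIZE aligned and 8k long,
             * so bit 12 is always clear in a kernel-pgd entry address. */
            return pgdp | EX_PAGE_SIZE;
    }

    int main(void)
    {
            uintptr_t kernel_pgd = 0xffff880001000000UL;    /* assume 8k-aligned */
            uintptr_t slot5 = kernel_pgd + 5 * 8;           /* sixth 8-byte entry */

            /* The shadow copy of the same slot is exactly one page higher. */
            assert(shadow_slot(slot5) == slot5 + EX_PAGE_SIZE);
            return 0;
    }
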
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 8b4de22d6429..f1c8ac468292 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -119,7 +119,7 @@
 #define _PAGE_DEVMAP	(_AT(pteval_t, 0))
 #endif
 
-#define _PAGE_PROTNONE	(_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
+#define _PAGE_PROTNONE  (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
 
 #define _PAGE_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |	\
			 _PAGE_ACCESSED | _PAGE_DIRTY)
@@ -137,6 +137,33 @@
			 _PAGE_SOFT_DIRTY)
 #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
 
+/* The ASID is the lower 12 bits of CR3 */
+#define X86_CR3_PCID_ASID_MASK  (_AC((1<<12)-1,UL))
+
+/* Mask for all the PCID-related bits in CR3: */
+#define X86_CR3_PCID_MASK       (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK)
+#define X86_CR3_PCID_ASID_KERN  (_AC(0x0,UL))
+
+#if defined(CONFIG_PAGE_TABLE_ISOLATION) && defined(CONFIG_X86_64)
+/* Let X86_CR3_PCID_ASID_USER be usable for the X86_CR3_PCID_NOFLUSH bit */
+#define X86_CR3_PCID_ASID_USER	(_AC(0x80,UL))
+
+#define X86_CR3_PCID_KERN_FLUSH		(X86_CR3_PCID_ASID_KERN)
+#define X86_CR3_PCID_USER_FLUSH		(X86_CR3_PCID_ASID_USER)
+#define X86_CR3_PCID_KERN_NOFLUSH	(X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_KERN)
+#define X86_CR3_PCID_USER_NOFLUSH	(X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_USER)
+#else
+#define X86_CR3_PCID_ASID_USER	(_AC(0x0,UL))
+/*
+ * PCIDs are unsupported on 32-bit and none of these bits can be
+ * set in CR3:
+ */
+#define X86_CR3_PCID_KERN_FLUSH		(0)
+#define X86_CR3_PCID_USER_FLUSH		(0)
+#define X86_CR3_PCID_KERN_NOFLUSH	(0)
+#define X86_CR3_PCID_USER_NOFLUSH	(0)
+#endif
+
 /*
  * The cache modes defined here are used to translate between pure SW usage
  * and the HW defined cache mode bits and/or PAT entries.
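
Spelled out, the constants above make the kernel and user address spaces differ only in bit 7 of the PCID plus, optionally, the NOFLUSH bit 63 of CR3. A compile-time sanity check of that layout, illustrative only (EX_ names invented for the example):

    /* Compile-time check of the CR3 PCID bit layout defined above. */
    #include <stdint.h>

    #define EX_CR3_PCID_NOFLUSH     (1ULL << 63)
    #define EX_CR3_PCID_ASID_KERN   0x00ULL
    #define EX_CR3_PCID_ASID_USER   0x80ULL /* low byte doubles as NOFLUSH's top byte */

    _Static_assert((EX_CR3_PCID_NOFLUSH | EX_CR3_PCID_ASID_USER) ==
                   0x8000000000000080ULL, "USER_NOFLUSH value");
    _Static_assert((EX_CR3_PCID_ASID_USER & ~0xfffULL) == 0,
                   "user ASID fits in the 12-bit PCID field");
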
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 83db0eae9979..8cb52ee3ade6 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -308,7 +308,7 @@ struct tss_struct {
 
 } ____cacheline_aligned;
 
-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
+DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss);
 
 #ifdef CONFIG_X86_32
 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 7d2ea6b1f7d9..94146f665a3c 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -132,6 +132,24 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
	cr4_set_bits(mask);
 }
 
+/*
+ * Declare a couple of kaiser interfaces here for convenience,
+ * to avoid the need for asm/kaiser.h in unexpected places.
+ */
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+extern int kaiser_enabled;
+extern void kaiser_setup_pcid(void);
+extern void kaiser_flush_tlb_on_return_to_user(void);
+#else
+#define kaiser_enabled 0
+static inline void kaiser_setup_pcid(void)
+{
+}
+static inline void kaiser_flush_tlb_on_return_to_user(void)
+{
+}
+#endif
+
 static inline void __native_flush_tlb(void)
 {
	/*
@@ -140,6 +158,8 @@ static inline void __native_flush_tlb(void)
	 * back:
	 */
	preempt_disable();
+	if (kaiser_enabled)
+		kaiser_flush_tlb_on_return_to_user();
	native_write_cr3(native_read_cr3());
	preempt_enable();
 }
@@ -149,20 +169,27 @@ static inline void __native_flush_tlb_global_irq_disabled(void)
	unsigned long cr4;
 
	cr4 = this_cpu_read(cpu_tlbstate.cr4);
-	/* clear PGE */
-	native_write_cr4(cr4 & ~X86_CR4_PGE);
-	/* write old PGE again and flush TLBs */
-	native_write_cr4(cr4);
+	if (cr4 & X86_CR4_PGE) {
+		/* clear PGE and flush TLB of all entries */
+		native_write_cr4(cr4 & ~X86_CR4_PGE);
+		/* restore PGE as it was before */
+		native_write_cr4(cr4);
+	} else {
+		/* do it with cr3, letting kaiser flush user PCID */
+		__native_flush_tlb();
+	}
 }
 
 static inline void __native_flush_tlb_global(void)
 {
	unsigned long flags;
 
-	if (static_cpu_has(X86_FEATURE_INVPCID)) {
+	if (this_cpu_has(X86_FEATURE_INVPCID)) {
		/*
		 * Using INVPCID is considerably faster than a pair of writes
		 * to CR4 sandwiched inside an IRQ flag save/restore.
+		 *
+		 * Note, this works with CR4.PCIDE=0 or 1.
		 */
		invpcid_flush_all();
		return;
@@ -174,24 +201,45 @@ static inline void __native_flush_tlb_global(void)
	 * be called from deep inside debugging code.)
	 */
	raw_local_irq_save(flags);
-
	__native_flush_tlb_global_irq_disabled();
-
	raw_local_irq_restore(flags);
 }
 
 static inline void __native_flush_tlb_single(unsigned long addr)
 {
-	asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+	/*
+	 * SIMICS #GP's if you run INVPCID with type 2/3
+	 * and X86_CR4_PCIDE clear. Shame!
+	 *
+	 * The ASIDs used below are hard-coded. But, we must not
+	 * call invpcid(type=1/2) before CR4.PCIDE=1. Just call
+	 * invlpg in the case we are called early.
+	 */
+
+	if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) {
+		if (kaiser_enabled)
+			kaiser_flush_tlb_on_return_to_user();
+		asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+		return;
+	}
+	/* Flush the address out of both PCIDs. */
+	/*
+	 * An optimization here might be to determine addresses
+	 * that are only kernel-mapped and only flush the kernel
+	 * ASID. But, userspace flushes are probably much more
+	 * important performance-wise.
+	 *
+	 * Make sure to do only a single invpcid when KAISER is
+	 * disabled and we have only a single ASID.
+	 */
+	if (kaiser_enabled)
+		invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr);
+	invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr);
 }
 
 static inline void __flush_tlb_all(void)
 {
-	if (boot_cpu_has(X86_FEATURE_PGE))
-		__flush_tlb_global();
-	else
-		__flush_tlb();
-
+	__flush_tlb_global();
	/*
	 * Note: if we somehow had PCID but not PGE, then this wouldn't work --
	 * we'd end up flushing kernel translations for the current ASID but
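
When single-address INVPCID is available, __native_flush_tlb_single() above must issue one invalidation per ASID, since the same kernel address is mapped under both the kernel PCID (0) and the user PCID (0x80). A C model of the control flow, illustrative only (ex_invpcid_one() is a hypothetical stand-in for the kernel's invpcid_flush_one(), not a real symbol):

    /* Model of the dual-ASID flush in __native_flush_tlb_single(). */
    void ex_invpcid_one(unsigned long pcid, unsigned long addr);  /* assumed helper */

    static void ex_flush_tlb_single(unsigned long addr, int kaiser_enabled)
    {
            if (kaiser_enabled)
                    ex_invpcid_one(0x80, addr);     /* X86_CR3_PCID_ASID_USER */
            ex_invpcid_one(0x0, addr);              /* X86_CR3_PCID_ASID_KERN */
    }
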
diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
index 567de50a4c2a..6768d1321016 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -77,7 +77,8 @@
 #define X86_CR3_PWT		_BITUL(X86_CR3_PWT_BIT)
 #define X86_CR3_PCD_BIT		4 /* Page Cache Disable */
 #define X86_CR3_PCD		_BITUL(X86_CR3_PCD_BIT)
-#define X86_CR3_PCID_MASK	_AC(0x00000fff,UL) /* PCID Mask */
+#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
+#define X86_CR3_PCID_NOFLUSH	_BITULL(X86_CR3_PCID_NOFLUSH_BIT)
 
 /*
  * Intel CPU features in CR4
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 91588be529b9..918e44772b04 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -93,7 +93,7 @@ static const struct cpu_dev default_cpu = {
 
 static const struct cpu_dev *this_cpu = &default_cpu;
 
-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
+DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = {
 #ifdef CONFIG_X86_64
	/*
	 * We need valid kernel segments for data and code in long mode too
@@ -327,8 +327,21 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
 static void setup_pcid(struct cpuinfo_x86 *c)
 {
	if (cpu_has(c, X86_FEATURE_PCID)) {
-		if (cpu_has(c, X86_FEATURE_PGE)) {
+		if (cpu_has(c, X86_FEATURE_PGE) || kaiser_enabled) {
			cr4_set_bits(X86_CR4_PCIDE);
+			/*
+			 * INVPCID has two "groups" of types:
+			 * 1/2: Invalidate an individual address
+			 * 3/4: Invalidate all contexts
+			 *
+			 * 1/2 take a PCID, but 3/4 do not. So, 3/4
+			 * ignore the PCID argument in the descriptor.
+			 * But, we have to be careful not to call 1/2
+			 * with an actual non-zero PCID in them before
+			 * we do the above cr4_set_bits().
+			 */
+			if (cpu_has(c, X86_FEATURE_INVPCID))
+				set_cpu_cap(c, X86_FEATURE_INVPCID_SINGLE);
		} else {
			/*
			 * flush_tlb_all(), as currently implemented, won't
@@ -341,6 +354,7 @@ static void setup_pcid(struct cpuinfo_x86 *c)
			clear_cpu_cap(c, X86_FEATURE_PCID);
		}
	}
+	kaiser_setup_pcid();
 }
 
 /*
@@ -1365,7 +1379,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
	  [DEBUG_STACK - 1]			= DEBUG_STKSZ
 };
 
-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
+DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks
	[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
 
 /* May not be marked __init: used by software suspend */
@@ -1523,6 +1537,14 @@ void cpu_init(void)
	 * try to read it.
	 */
	cr4_init_shadow();
+	if (!kaiser_enabled) {
+		/*
+		 * secondary_startup_64() deferred setting PGE in cr4:
+		 * probe_page_size_mask() sets it on the boot cpu,
+		 * but it needs to be set on each secondary cpu.
+		 */
+		cr4_set_bits(X86_CR4_PGE);
+	}
 
	/*
	 * Load microcode on this cpu if a valid microcode is available.
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 04f89caef9c4..e33b38541be3 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -41,6 +41,7 @@
 #include <asm/pgalloc.h>
 #include <asm/setup.h>
 #include <asm/espfix.h>
+#include <asm/kaiser.h>
 
 /*
  * Note: we only need 6*8 = 48 bytes for the espfix stack, but round
@@ -126,6 +127,15 @@ void __init init_espfix_bsp(void)
	/* Install the espfix pud into the kernel page directory */
	pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
	pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
+	/*
+	 * Just copy the top-level PGD that is mapping the espfix
+	 * area to ensure it is mapped into the shadow user page
+	 * tables.
+	 */
+	if (kaiser_enabled) {
+		set_pgd(native_get_shadow_pgd(pgd_p),
+			__pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page)));
+	}
 
	/* Randomize the locations */
	init_espfix_random();
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index b4421cc191b0..67cd7c1b99da 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -190,8 +190,8 @@ ENTRY(secondary_startup_64)
	movq	$(init_level4_pgt - __START_KERNEL_map), %rax
 1:
 
-	/* Enable PAE mode and PGE */
-	movl	$(X86_CR4_PAE | X86_CR4_PGE), %ecx
+	/* Enable PAE and PSE, but defer PGE until kaiser_enabled is decided */
+	movl	$(X86_CR4_PAE | X86_CR4_PSE), %ecx
	movq	%rcx, %cr4
 
	/* Setup early boot stage 4 level pagetables. */
@@ -405,6 +405,27 @@ GLOBAL(early_recursion_flag)
	.balign	PAGE_SIZE; \
 GLOBAL(name)
 
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+/*
+ * Each PGD needs to be 8k long and 8k aligned. We do not
+ * ever go out to userspace with these, so we do not
+ * strictly *need* the second page, but this allows us to
+ * have a single set_pgd() implementation that does not
+ * need to worry about whether it has 4k or 8k to work
+ * with.
+ *
+ * This ensures PGDs are 8k long:
+ */
+#define KAISER_USER_PGD_FILL	512
+/* This ensures they are 8k-aligned: */
+#define NEXT_PGD_PAGE(name) \
+	.balign 2 * PAGE_SIZE; \
+GLOBAL(name)
+#else
+#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
+#define KAISER_USER_PGD_FILL	0
+#endif
+
 /* Automate the creation of 1 to 1 mapping pmd entries */
 #define PMDS(START, PERM, COUNT)			\
	i = 0 ;						\
@@ -414,9 +435,10 @@ GLOBAL(name)
	.endr
 
	__INITDATA
-NEXT_PAGE(early_level4_pgt)
+NEXT_PGD_PAGE(early_level4_pgt)
	.fill	511,8,0
	.quad	level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+	.fill	KAISER_USER_PGD_FILL,8,0
 
 NEXT_PAGE(early_dynamic_pgts)
	.fill	512*EARLY_DYNAMIC_PAGE_TABLES,8,0
@@ -424,16 +446,18 @@ NEXT_PAGE(early_dynamic_pgts)
	.data
 
 #ifndef CONFIG_XEN
-NEXT_PAGE(init_level4_pgt)
+NEXT_PGD_PAGE(init_level4_pgt)
	.fill	512,8,0
+	.fill	KAISER_USER_PGD_FILL,8,0
 #else
-NEXT_PAGE(init_level4_pgt)
+NEXT_PGD_PAGE(init_level4_pgt)
	.quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
	.org    init_level4_pgt + L4_PAGE_OFFSET*8, 0
	.quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
	.org    init_level4_pgt + L4_START_KERNEL*8, 0
	/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
	.quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+	.fill	KAISER_USER_PGD_FILL,8,0
 
 NEXT_PAGE(level3_ident_pgt)
	.quad	level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
@@ -444,6 +468,7 @@ NEXT_PAGE(level2_ident_pgt)
	 */
	PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
 #endif
+	.fill	KAISER_USER_PGD_FILL,8,0
 
 NEXT_PAGE(level3_kernel_pgt)
	.fill	L3_START_KERNEL,8,0
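
KAISER_USER_PGD_FILL above is 512 entries of 8 bytes, i.e. exactly one extra 4k page appended to each top-level page table; together with the 2*PAGE_SIZE alignment this produces the 8k pgd pair that native_get_shadow_pgd() relies on. The arithmetic, as an illustrative compile-time check (EX_ names invented, not part of the patch):

    /* The second-page arithmetic behind KAISER_USER_PGD_FILL; illustrative. */
    #define EX_PAGE_SIZE        4096
    #define EX_USER_PGD_FILL    512     /* entries of 8 bytes each */

    _Static_assert(EX_USER_PGD_FILL * 8 == EX_PAGE_SIZE,
                   "the fill is exactly one page: the shadow (user) pgd");
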
1212 | diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c | ||
1213 | index 1423ab1b0312..f480b38a03c3 100644 | ||
1214 | --- a/arch/x86/kernel/irqinit.c | ||
1215 | +++ b/arch/x86/kernel/irqinit.c | ||
1216 | @@ -51,7 +51,7 @@ static struct irqaction irq2 = { | ||
1217 | .flags = IRQF_NO_THREAD, | ||
1218 | }; | ||
1219 | |||
1220 | -DEFINE_PER_CPU(vector_irq_t, vector_irq) = { | ||
1221 | +DEFINE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq) = { | ||
1222 | [0 ... NR_VECTORS - 1] = VECTOR_UNUSED, | ||
1223 | }; | ||
1224 | |||
1225 | diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c | ||
1226 | index 5f70014ca602..8bc68cfc0d33 100644 | ||
1227 | --- a/arch/x86/kernel/ldt.c | ||
1228 | +++ b/arch/x86/kernel/ldt.c | ||
1229 | @@ -16,6 +16,7 @@ | ||
1230 | #include <linux/slab.h> | ||
1231 | #include <linux/vmalloc.h> | ||
1232 | #include <linux/uaccess.h> | ||
1233 | +#include <linux/kaiser.h> | ||
1234 | |||
1235 | #include <asm/ldt.h> | ||
1236 | #include <asm/desc.h> | ||
1237 | @@ -34,11 +35,21 @@ static void flush_ldt(void *current_mm) | ||
1238 | set_ldt(pc->ldt->entries, pc->ldt->size); | ||
1239 | } | ||
1240 | |||
1241 | +static void __free_ldt_struct(struct ldt_struct *ldt) | ||
1242 | +{ | ||
1243 | + if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) | ||
1244 | + vfree(ldt->entries); | ||
1245 | + else | ||
1246 | + free_page((unsigned long)ldt->entries); | ||
1247 | + kfree(ldt); | ||
1248 | +} | ||
1249 | + | ||
1250 | /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */ | ||
1251 | static struct ldt_struct *alloc_ldt_struct(int size) | ||
1252 | { | ||
1253 | struct ldt_struct *new_ldt; | ||
1254 | int alloc_size; | ||
1255 | + int ret; | ||
1256 | |||
1257 | if (size > LDT_ENTRIES) | ||
1258 | return NULL; | ||
1259 | @@ -66,7 +77,13 @@ static struct ldt_struct *alloc_ldt_struct(int size) | ||
1260 | return NULL; | ||
1261 | } | ||
1262 | |||
1263 | + ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size, | ||
1264 | + __PAGE_KERNEL); | ||
1265 | new_ldt->size = size; | ||
1266 | + if (ret) { | ||
1267 | + __free_ldt_struct(new_ldt); | ||
1268 | + return NULL; | ||
1269 | + } | ||
1270 | return new_ldt; | ||
1271 | } | ||
1272 | |||
1273 | @@ -92,12 +109,10 @@ static void free_ldt_struct(struct ldt_struct *ldt) | ||
1274 | if (likely(!ldt)) | ||
1275 | return; | ||
1276 | |||
1277 | + kaiser_remove_mapping((unsigned long)ldt->entries, | ||
1278 | + ldt->size * LDT_ENTRY_SIZE); | ||
1279 | paravirt_free_ldt(ldt->entries, ldt->size); | ||
1280 | - if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) | ||
1281 | - vfree(ldt->entries); | ||
1282 | - else | ||
1283 | - free_page((unsigned long)ldt->entries); | ||
1284 | - kfree(ldt); | ||
1285 | + __free_ldt_struct(ldt); | ||
1286 | } | ||
1287 | |||
1288 | /* | ||
1289 | diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c | ||
1290 | index bb3840cedb4f..ee43b36075c7 100644 | ||
1291 | --- a/arch/x86/kernel/paravirt_patch_64.c | ||
1292 | +++ b/arch/x86/kernel/paravirt_patch_64.c | ||
1293 | @@ -9,7 +9,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax"); | ||
1294 | DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax"); | ||
1295 | DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax"); | ||
1296 | DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3"); | ||
1297 | -DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)"); | ||
1298 | DEF_NATIVE(pv_cpu_ops, clts, "clts"); | ||
1299 | DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd"); | ||
1300 | |||
1301 | @@ -59,7 +58,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, | ||
1302 | PATCH_SITE(pv_mmu_ops, read_cr3); | ||
1303 | PATCH_SITE(pv_mmu_ops, write_cr3); | ||
1304 | PATCH_SITE(pv_cpu_ops, clts); | ||
1305 | - PATCH_SITE(pv_mmu_ops, flush_tlb_single); | ||
1306 | PATCH_SITE(pv_cpu_ops, wbinvd); | ||
1307 | #if defined(CONFIG_PARAVIRT_SPINLOCKS) | ||
1308 | case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): | ||
1309 | diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c | ||
1310 | index 8e10e72bf6ee..a55b32007785 100644 | ||
1311 | --- a/arch/x86/kernel/process.c | ||
1312 | +++ b/arch/x86/kernel/process.c | ||
1313 | @@ -41,7 +41,7 @@ | ||
1314 | * section. Since TSS's are completely CPU-local, we want them | ||
1315 | * on exact cacheline boundaries, to eliminate cacheline ping-pong. | ||
1316 | */ | ||
1317 | -__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { | ||
1318 | +__visible DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss) = { | ||
1319 | .x86_tss = { | ||
1320 | .sp0 = TOP_OF_INIT_STACK, | ||
1321 | #ifdef CONFIG_X86_32 | ||
1322 | diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c | ||
1323 | index feaab07fa124..6b55012d02a3 100644 | ||
1324 | --- a/arch/x86/kernel/setup.c | ||
1325 | +++ b/arch/x86/kernel/setup.c | ||
1326 | @@ -114,6 +114,7 @@ | ||
1327 | #include <asm/microcode.h> | ||
1328 | #include <asm/mmu_context.h> | ||
1329 | #include <asm/kaslr.h> | ||
1330 | +#include <asm/kaiser.h> | ||
1331 | |||
1332 | /* | ||
1333 | * max_low_pfn_mapped: highest direct mapped pfn under 4GB | ||
1334 | @@ -1019,6 +1020,12 @@ void __init setup_arch(char **cmdline_p) | ||
1335 | */ | ||
1336 | init_hypervisor_platform(); | ||
1337 | |||
1338 | + /* | ||
1339 | + * This needs to happen right after XENPV is set on xen and | ||
1340 | + * kaiser_enabled is checked below in cleanup_highmap(). | ||
1341 | + */ | ||
1342 | + kaiser_check_boottime_disable(); | ||
1343 | + | ||
1344 | x86_init.resources.probe_roms(); | ||
1345 | |||
1346 | /* after parse_early_param, so could debug it */ | ||
1347 | diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c | ||
1348 | index 1c113db9ed57..2bb5ee464df3 100644 | ||
1349 | --- a/arch/x86/kernel/tracepoint.c | ||
1350 | +++ b/arch/x86/kernel/tracepoint.c | ||
1351 | @@ -9,10 +9,12 @@ | ||
1352 | #include <linux/atomic.h> | ||
1353 | |||
1354 | atomic_t trace_idt_ctr = ATOMIC_INIT(0); | ||
1355 | +__aligned(PAGE_SIZE) | ||
1356 | struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1, | ||
1357 | (unsigned long) trace_idt_table }; | ||
1358 | |||
1359 | /* No need to be aligned, but done to keep all IDTs defined the same way. */ | ||
1360 | +__aligned(PAGE_SIZE) | ||
1361 | gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss; | ||
1362 | |||
1363 | static int trace_irq_vector_refcount; | ||
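
The __aligned(PAGE_SIZE) additions matter because kaiser_add_user_map(), added below in this patch, works at page granularity: it rounds the start down with & PAGE_MASK and the end up with PAGE_ALIGN(). A worked note on what the alignment buys here:

	/* Without alignment, trace_idt_descr could share its page with
	 * unrelated kernel data, and mapping it would expose that whole
	 * page on the user CR3:
	 *	address  = start_addr & PAGE_MASK;	  (rounds down)
	 *	end_addr = PAGE_ALIGN(start_addr + size); (rounds up)
	 * With __aligned(PAGE_SIZE), each table starts on its own page. */
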
1364 | diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c | ||
1365 | index 7e28e6c877d9..73304b1a03cc 100644 | ||
1366 | --- a/arch/x86/kvm/x86.c | ||
1367 | +++ b/arch/x86/kvm/x86.c | ||
1368 | @@ -773,7 +773,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) | ||
1369 | return 1; | ||
1370 | |||
1371 | /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */ | ||
1372 | - if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu)) | ||
1373 | + if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_ASID_MASK) || | ||
1374 | + !is_long_mode(vcpu)) | ||
1375 | return 1; | ||
1376 | } | ||
1377 | |||
1378 | diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c | ||
1379 | index 5cc78bf57232..3261abb21ef4 100644 | ||
1380 | --- a/arch/x86/lib/cmdline.c | ||
1381 | +++ b/arch/x86/lib/cmdline.c | ||
1382 | @@ -104,7 +104,112 @@ __cmdline_find_option_bool(const char *cmdline, int max_cmdline_size, | ||
1383 | return 0; /* Buffer overrun */ | ||
1384 | } | ||
1385 | |||
1386 | +/* | ||
1387 | + * Find a non-boolean option (i.e. option=argument). In accordance with | ||
1388 | + * standard Linux practice, if this option is repeated, this returns the | ||
1389 | + * last instance on the command line. | ||
1390 | + * | ||
1391 | + * @cmdline: the cmdline string | ||
1392 | + * @max_cmdline_size: the maximum size of cmdline | ||
1393 | + * @option: option string to look for | ||
1394 | + * @buffer: memory buffer to return the option argument | ||
1395 | + * @bufsize: size of the supplied memory buffer | ||
1396 | + * | ||
1397 | + * Returns the length of the argument (regardless of whether it was | ||
1398 | + * truncated to fit in the buffer), or -1 if not found. | ||
1399 | + */ | ||
1400 | +static int | ||
1401 | +__cmdline_find_option(const char *cmdline, int max_cmdline_size, | ||
1402 | + const char *option, char *buffer, int bufsize) | ||
1403 | +{ | ||
1404 | + char c; | ||
1405 | + int pos = 0, len = -1; | ||
1406 | + const char *opptr = NULL; | ||
1407 | + char *bufptr = buffer; | ||
1408 | + enum { | ||
1409 | + st_wordstart = 0, /* Start of word/after whitespace */ | ||
1410 | + st_wordcmp, /* Comparing this word */ | ||
1411 | + st_wordskip, /* Miscompare, skip */ | ||
1412 | + st_bufcpy, /* Copying this to buffer */ | ||
1413 | + } state = st_wordstart; | ||
1414 | + | ||
1415 | + if (!cmdline) | ||
1416 | + return -1; /* No command line */ | ||
1417 | + | ||
1418 | + /* | ||
1419 | + * This 'pos' check ensures we do not overrun | ||
1420 | + * a non-NULL-terminated 'cmdline' | ||
1421 | + */ | ||
1422 | + while (pos++ < max_cmdline_size) { | ||
1423 | + c = *(char *)cmdline++; | ||
1424 | + if (!c) | ||
1425 | + break; | ||
1426 | + | ||
1427 | + switch (state) { | ||
1428 | + case st_wordstart: | ||
1429 | + if (myisspace(c)) | ||
1430 | + break; | ||
1431 | + | ||
1432 | + state = st_wordcmp; | ||
1433 | + opptr = option; | ||
1434 | + /* fall through */ | ||
1435 | + | ||
1436 | + case st_wordcmp: | ||
1437 | + if ((c == '=') && !*opptr) { | ||
1438 | + /* | ||
1439 | + * We matched all the way to the end of the | ||
1440 | + * option we were looking for, prepare to | ||
1441 | + * copy the argument. | ||
1442 | + */ | ||
1443 | + len = 0; | ||
1444 | + bufptr = buffer; | ||
1445 | + state = st_bufcpy; | ||
1446 | + break; | ||
1447 | + } else if (c == *opptr++) { | ||
1448 | + /* | ||
1449 | + * We are currently matching, so continue | ||
1450 | + * to the next character on the cmdline. | ||
1451 | + */ | ||
1452 | + break; | ||
1453 | + } | ||
1454 | + state = st_wordskip; | ||
1455 | + /* fall through */ | ||
1456 | + | ||
1457 | + case st_wordskip: | ||
1458 | + if (myisspace(c)) | ||
1459 | + state = st_wordstart; | ||
1460 | + break; | ||
1461 | + | ||
1462 | + case st_bufcpy: | ||
1463 | + if (myisspace(c)) { | ||
1464 | + state = st_wordstart; | ||
1465 | + } else { | ||
1466 | + /* | ||
1467 | + * Increment len, but don't overrun the | ||
1468 | + * supplied buffer and leave room for the | ||
1469 | + * NULL terminator. | ||
1470 | + */ | ||
1471 | + if (++len < bufsize) | ||
1472 | + *bufptr++ = c; | ||
1473 | + } | ||
1474 | + break; | ||
1475 | + } | ||
1476 | + } | ||
1477 | + | ||
1478 | + if (bufsize) | ||
1479 | + *bufptr = '\0'; | ||
1480 | + | ||
1481 | + return len; | ||
1482 | +} | ||
1483 | + | ||
1484 | int cmdline_find_option_bool(const char *cmdline, const char *option) | ||
1485 | { | ||
1486 | return __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option); | ||
1487 | } | ||
1488 | + | ||
1489 | +int cmdline_find_option(const char *cmdline, const char *option, char *buffer, | ||
1490 | + int bufsize) | ||
1491 | +{ | ||
1492 | + return __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option, | ||
1493 | + buffer, bufsize); | ||
1494 | +} | ||
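
A short usage sketch, mirroring how kaiser_check_boottime_disable() (added below in arch/x86/mm/kaiser.c) consumes this helper to parse pti=; disable_isolation() is a hypothetical handler:

	char arg[5];	/* long enough for "auto" plus the NUL */
	int len;

	len = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
	if (len == 3 && !strncmp(arg, "off", 3))
		disable_isolation();	/* hypothetical handler */

Note that the return value counts the full argument even when it is truncated, so a caller can detect an over-long argument by comparing the result against bufsize.
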
1495 | diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile | ||
1496 | index 96d2b847e09e..c548b46100cb 100644 | ||
1497 | --- a/arch/x86/mm/Makefile | ||
1498 | +++ b/arch/x86/mm/Makefile | ||
1499 | @@ -37,5 +37,5 @@ obj-$(CONFIG_NUMA_EMU) += numa_emulation.o | ||
1500 | |||
1501 | obj-$(CONFIG_X86_INTEL_MPX) += mpx.o | ||
1502 | obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o | ||
1503 | -obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o | ||
1504 | - | ||
1505 | +obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o | ||
1506 | +obj-$(CONFIG_PAGE_TABLE_ISOLATION) += kaiser.o | ||
1507 | diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c | ||
1508 | index 0381638168d1..1e779bca4f3e 100644 | ||
1509 | --- a/arch/x86/mm/init.c | ||
1510 | +++ b/arch/x86/mm/init.c | ||
1511 | @@ -177,7 +177,7 @@ static void __init probe_page_size_mask(void) | ||
1512 | cr4_set_bits_and_update_boot(X86_CR4_PSE); | ||
1513 | |||
1514 | /* Enable PGE if available */ | ||
1515 | - if (boot_cpu_has(X86_FEATURE_PGE)) { | ||
1516 | + if (boot_cpu_has(X86_FEATURE_PGE) && !kaiser_enabled) { | ||
1517 | cr4_set_bits_and_update_boot(X86_CR4_PGE); | ||
1518 | __supported_pte_mask |= _PAGE_GLOBAL; | ||
1519 | } else | ||
1520 | diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c | ||
1521 | index 3e27ded6ac65..7df8e3a79dc0 100644 | ||
1522 | --- a/arch/x86/mm/init_64.c | ||
1523 | +++ b/arch/x86/mm/init_64.c | ||
1524 | @@ -324,6 +324,16 @@ void __init cleanup_highmap(void) | ||
1525 | continue; | ||
1526 | if (vaddr < (unsigned long) _text || vaddr > end) | ||
1527 | set_pmd(pmd, __pmd(0)); | ||
1528 | + else if (kaiser_enabled) { | ||
1529 | + /* | ||
1530 | + * level2_kernel_pgt is initialized with _PAGE_GLOBAL: | ||
1531 | + * clear that now. This is not important, so long as | ||
1532 | + * CR4.PGE remains clear, but it removes an anomaly. | ||
1533 | + * Physical mapping setup below avoids _PAGE_GLOBAL | ||
1534 | + * by use of massage_pgprot() inside pfn_pte() etc. | ||
1535 | + */ | ||
1536 | + set_pmd(pmd, pmd_clear_flags(*pmd, _PAGE_GLOBAL)); | ||
1537 | + } | ||
1538 | } | ||
1539 | } | ||
1540 | |||
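
The two mm/ hunks above enforce one property: no kernel PTE may be _PAGE_GLOBAL while KAISER is on. Global TLB entries survive CR3 writes whenever CR4.PGE is set, so a global kernel mapping would remain usable from the user CR3 and defeat the isolation. Hence probe_page_size_mask() keeps CR4.PGE clear and _PAGE_GLOBAL out of __supported_pte_mask, and cleanup_highmap() scrubs the flag from the early level2_kernel_pgt entries. The invariant, as a hedged sketch:

	/* Sketch of the invariant the hunks above establish: */
	if (kaiser_enabled) {
		/* no translation may outlive a CR3 write... */
		BUG_ON(__read_cr4() & X86_CR4_PGE);
		/* ...and no new kernel PTE may carry _PAGE_GLOBAL */
		BUG_ON(__supported_pte_mask & _PAGE_GLOBAL);
	}
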
1541 | diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c | ||
1542 | new file mode 100644 | ||
1543 | index 000000000000..d8376b4ad9f0 | ||
1544 | --- /dev/null | ||
1545 | +++ b/arch/x86/mm/kaiser.c | ||
1546 | @@ -0,0 +1,455 @@ | ||
1547 | +#include <linux/bug.h> | ||
1548 | +#include <linux/kernel.h> | ||
1549 | +#include <linux/errno.h> | ||
1550 | +#include <linux/string.h> | ||
1551 | +#include <linux/types.h> | ||
1553 | +#include <linux/init.h> | ||
1554 | +#include <linux/interrupt.h> | ||
1555 | +#include <linux/spinlock.h> | ||
1556 | +#include <linux/mm.h> | ||
1557 | +#include <linux/uaccess.h> | ||
1558 | + | ||
1559 | +#undef pr_fmt | ||
1560 | +#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt | ||
1561 | + | ||
1562 | +#include <asm/kaiser.h> | ||
1563 | +#include <asm/tlbflush.h> /* to verify its kaiser declarations */ | ||
1564 | +#include <asm/pgtable.h> | ||
1565 | +#include <asm/pgalloc.h> | ||
1566 | +#include <asm/desc.h> | ||
1567 | +#include <asm/cmdline.h> | ||
1568 | + | ||
1569 | +int kaiser_enabled __read_mostly = 1; | ||
1570 | +EXPORT_SYMBOL(kaiser_enabled); /* for inlined TLB flush functions */ | ||
1571 | + | ||
1572 | +__visible | ||
1573 | +DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); | ||
1574 | + | ||
1575 | +/* | ||
1576 | + * These can have bit 63 set, so we can not just use a plain "or" | ||
1577 | + * instruction to get their value or'd into CR3. It would take | ||
1578 | + * another register. So, we use a memory reference to these instead. | ||
1579 | + * | ||
1580 | + * This is also handy because systems that do not support PCIDs | ||
1581 | + * just end up or'ing a 0 into their CR3, which does no harm. | ||
1582 | + */ | ||
1583 | +DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user); | ||
1584 | + | ||
1585 | +/* | ||
1586 | + * At runtime, the only things we map are some things for CPU | ||
1587 | + * hotplug, and stacks for new processes. No two CPUs will ever | ||
1588 | + * be populating the same addresses, so we only need to ensure | ||
1589 | + * that we protect between two CPUs trying to allocate and | ||
1590 | + * populate the same page table page. | ||
1591 | + * | ||
1592 | + * Only take this lock when doing a set_p[4um]d(), but it is not | ||
1593 | + * needed for doing a set_pte(). We assume that only the *owner* | ||
1594 | + * of a given allocation will be doing this for _their_ | ||
1595 | + * allocation. | ||
1596 | + * | ||
1597 | + * This ensures that once a system has been running for a while | ||
1598 | + * and there have been stacks all over and these page tables | ||
1599 | + * are fully populated, there will be no further acquisitions of | ||
1600 | + * this lock. | ||
1601 | + */ | ||
1602 | +static DEFINE_SPINLOCK(shadow_table_allocation_lock); | ||
1603 | + | ||
1604 | +/* | ||
1605 | + * Returns the physical address on success, or -1 (as unsigned long) on error. | ||
1606 | + */ | ||
1607 | +static inline unsigned long get_pa_from_mapping(unsigned long vaddr) | ||
1608 | +{ | ||
1609 | + pgd_t *pgd; | ||
1610 | + pud_t *pud; | ||
1611 | + pmd_t *pmd; | ||
1612 | + pte_t *pte; | ||
1613 | + | ||
1614 | + pgd = pgd_offset_k(vaddr); | ||
1615 | + /* | ||
1616 | + * We made all the kernel PGDs present in kaiser_init(). | ||
1617 | + * We expect them to stay that way. | ||
1618 | + */ | ||
1619 | + BUG_ON(pgd_none(*pgd)); | ||
1620 | + /* | ||
1621 | + * PGDs are either 512GB or 128TB on all x86_64 | ||
1622 | + * configurations. We don't handle these. | ||
1623 | + */ | ||
1624 | + BUG_ON(pgd_large(*pgd)); | ||
1625 | + | ||
1626 | + pud = pud_offset(pgd, vaddr); | ||
1627 | + if (pud_none(*pud)) { | ||
1628 | + WARN_ON_ONCE(1); | ||
1629 | + return -1; | ||
1630 | + } | ||
1631 | + | ||
1632 | + if (pud_large(*pud)) | ||
1633 | + return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK); | ||
1634 | + | ||
1635 | + pmd = pmd_offset(pud, vaddr); | ||
1636 | + if (pmd_none(*pmd)) { | ||
1637 | + WARN_ON_ONCE(1); | ||
1638 | + return -1; | ||
1639 | + } | ||
1640 | + | ||
1641 | + if (pmd_large(*pmd)) | ||
1642 | + return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK); | ||
1643 | + | ||
1644 | + pte = pte_offset_kernel(pmd, vaddr); | ||
1645 | + if (pte_none(*pte)) { | ||
1646 | + WARN_ON_ONCE(1); | ||
1647 | + return -1; | ||
1648 | + } | ||
1649 | + | ||
1650 | + return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK); | ||
1651 | +} | ||
1652 | + | ||
1653 | +/* | ||
1654 | + * This is a relatively normal page table walk, except that it | ||
1655 | + * also tries to allocate page tables pages along the way. | ||
1656 | + * | ||
1657 | + * Returns a pointer to a PTE on success, or NULL on failure. | ||
1658 | + */ | ||
1659 | +static pte_t *kaiser_pagetable_walk(unsigned long address) | ||
1660 | +{ | ||
1661 | + pmd_t *pmd; | ||
1662 | + pud_t *pud; | ||
1663 | + pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address)); | ||
1664 | + gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); | ||
1665 | + | ||
1666 | + if (pgd_none(*pgd)) { | ||
1667 | + WARN_ONCE(1, "All shadow pgds should have been populated"); | ||
1668 | + return NULL; | ||
1669 | + } | ||
1670 | + BUILD_BUG_ON(pgd_large(*pgd) != 0); | ||
1671 | + | ||
1672 | + pud = pud_offset(pgd, address); | ||
1673 | + /* The shadow page tables do not use large mappings: */ | ||
1674 | + if (pud_large(*pud)) { | ||
1675 | + WARN_ON(1); | ||
1676 | + return NULL; | ||
1677 | + } | ||
1678 | + if (pud_none(*pud)) { | ||
1679 | + unsigned long new_pmd_page = __get_free_page(gfp); | ||
1680 | + if (!new_pmd_page) | ||
1681 | + return NULL; | ||
1682 | + spin_lock(&shadow_table_allocation_lock); | ||
1683 | + if (pud_none(*pud)) { | ||
1684 | + set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); | ||
1685 | + __inc_zone_page_state(virt_to_page((void *) | ||
1686 | + new_pmd_page), NR_KAISERTABLE); | ||
1687 | + } else | ||
1688 | + free_page(new_pmd_page); | ||
1689 | + spin_unlock(&shadow_table_allocation_lock); | ||
1690 | + } | ||
1691 | + | ||
1692 | + pmd = pmd_offset(pud, address); | ||
1693 | + /* The shadow page tables do not use large mappings: */ | ||
1694 | + if (pmd_large(*pmd)) { | ||
1695 | + WARN_ON(1); | ||
1696 | + return NULL; | ||
1697 | + } | ||
1698 | + if (pmd_none(*pmd)) { | ||
1699 | + unsigned long new_pte_page = __get_free_page(gfp); | ||
1700 | + if (!new_pte_page) | ||
1701 | + return NULL; | ||
1702 | + spin_lock(&shadow_table_allocation_lock); | ||
1703 | + if (pmd_none(*pmd)) { | ||
1704 | + set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); | ||
1705 | + __inc_zone_page_state(virt_to_page((void *) | ||
1706 | + new_pte_page), NR_KAISERTABLE); | ||
1707 | + } else | ||
1708 | + free_page(new_pte_page); | ||
1709 | + spin_unlock(&shadow_table_allocation_lock); | ||
1710 | + } | ||
1711 | + | ||
1712 | + return pte_offset_kernel(pmd, address); | ||
1713 | +} | ||
1714 | + | ||
1715 | +static int kaiser_add_user_map(const void *__start_addr, unsigned long size, | ||
1716 | + unsigned long flags) | ||
1717 | +{ | ||
1718 | + int ret = 0; | ||
1719 | + pte_t *pte; | ||
1720 | + unsigned long start_addr = (unsigned long)__start_addr; | ||
1721 | + unsigned long address = start_addr & PAGE_MASK; | ||
1722 | + unsigned long end_addr = PAGE_ALIGN(start_addr + size); | ||
1723 | + unsigned long target_address; | ||
1724 | + | ||
1725 | + /* | ||
1726 | + * It is convenient for callers to pass in __PAGE_KERNEL etc, | ||
1727 | + * and there is no actual harm from setting _PAGE_GLOBAL, so | ||
1728 | + * long as CR4.PGE is not set. But it is nonetheless troubling | ||
1729 | + * to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser" | ||
1730 | + * requires that not to be #defined to 0): so mask it off here. | ||
1731 | + */ | ||
1732 | + flags &= ~_PAGE_GLOBAL; | ||
1733 | + | ||
1734 | + for (; address < end_addr; address += PAGE_SIZE) { | ||
1735 | + target_address = get_pa_from_mapping(address); | ||
1736 | + if (target_address == -1) { | ||
1737 | + ret = -EIO; | ||
1738 | + break; | ||
1739 | + } | ||
1740 | + pte = kaiser_pagetable_walk(address); | ||
1741 | + if (!pte) { | ||
1742 | + ret = -ENOMEM; | ||
1743 | + break; | ||
1744 | + } | ||
1745 | + if (pte_none(*pte)) { | ||
1746 | + set_pte(pte, __pte(flags | target_address)); | ||
1747 | + } else { | ||
1748 | + pte_t tmp; | ||
1749 | + set_pte(&tmp, __pte(flags | target_address)); | ||
1750 | + WARN_ON_ONCE(!pte_same(*pte, tmp)); | ||
1751 | + } | ||
1752 | + } | ||
1753 | + return ret; | ||
1754 | +} | ||
1755 | + | ||
1756 | +static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags) | ||
1757 | +{ | ||
1758 | + unsigned long size = end - start; | ||
1759 | + | ||
1760 | + return kaiser_add_user_map(start, size, flags); | ||
1761 | +} | ||
1762 | + | ||
1763 | +/* | ||
1764 | + * Ensure that the top level of the (shadow) page tables are | ||
1765 | + * entirely populated. This ensures that all processes that get | ||
1766 | + * forked have the same entries. This way, we do not have to | ||
1767 | + * ever go set up new entries in older processes. | ||
1768 | + * | ||
1769 | + * Note: we never free these, so there are no updates to them | ||
1770 | + * after this. | ||
1771 | + */ | ||
1772 | +static void __init kaiser_init_all_pgds(void) | ||
1773 | +{ | ||
1774 | + pgd_t *pgd; | ||
1775 | + int i = 0; | ||
1776 | + | ||
1777 | + pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long)0)); | ||
1778 | + for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) { | ||
1779 | + pgd_t new_pgd; | ||
1780 | + pud_t *pud = pud_alloc_one(&init_mm, | ||
1781 | + PAGE_OFFSET + i * PGDIR_SIZE); | ||
1782 | + if (!pud) { | ||
1783 | + WARN_ON(1); | ||
1784 | + break; | ||
1785 | + } | ||
1786 | + inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE); | ||
1787 | + new_pgd = __pgd(_KERNPG_TABLE | __pa(pud)); | ||
1788 | + /* | ||
1789 | + * Make sure not to stomp on some other pgd entry. | ||
1790 | + */ | ||
1791 | + if (!pgd_none(pgd[i])) { | ||
1792 | + WARN_ON(1); | ||
1793 | + continue; | ||
1794 | + } | ||
1795 | + set_pgd(pgd + i, new_pgd); | ||
1796 | + } | ||
1797 | +} | ||
1798 | + | ||
1799 | +#define kaiser_add_user_map_early(start, size, flags) do { \ | ||
1800 | + int __ret = kaiser_add_user_map(start, size, flags); \ | ||
1801 | + WARN_ON(__ret); \ | ||
1802 | +} while (0) | ||
1803 | + | ||
1804 | +#define kaiser_add_user_map_ptrs_early(start, end, flags) do { \ | ||
1805 | + int __ret = kaiser_add_user_map_ptrs(start, end, flags); \ | ||
1806 | + WARN_ON(__ret); \ | ||
1807 | +} while (0) | ||
1808 | + | ||
1809 | +void __init kaiser_check_boottime_disable(void) | ||
1810 | +{ | ||
1811 | + bool enable = true; | ||
1812 | + char arg[5]; | ||
1813 | + int ret; | ||
1814 | + | ||
1815 | + if (boot_cpu_has(X86_FEATURE_XENPV)) | ||
1816 | + goto silent_disable; | ||
1817 | + | ||
1818 | + ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg)); | ||
1819 | + if (ret > 0) { | ||
1820 | + if (!strncmp(arg, "on", 2)) | ||
1821 | + goto enable; | ||
1822 | + | ||
1823 | + if (!strncmp(arg, "off", 3)) | ||
1824 | + goto disable; | ||
1825 | + | ||
1826 | + if (!strncmp(arg, "auto", 4)) | ||
1827 | + goto skip; | ||
1828 | + } | ||
1829 | + | ||
1830 | + if (cmdline_find_option_bool(boot_command_line, "nopti")) | ||
1831 | + goto disable; | ||
1832 | + | ||
1833 | +skip: | ||
1834 | + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) | ||
1835 | + goto disable; | ||
1836 | + | ||
1837 | +enable: | ||
1838 | + if (enable) | ||
1839 | + setup_force_cpu_cap(X86_FEATURE_KAISER); | ||
1840 | + | ||
1841 | + return; | ||
1842 | + | ||
1843 | +disable: | ||
1844 | + pr_info("disabled\n"); | ||
1845 | + | ||
1846 | +silent_disable: | ||
1847 | + kaiser_enabled = 0; | ||
1848 | + setup_clear_cpu_cap(X86_FEATURE_KAISER); | ||
1849 | +} | ||
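
The label flow above is compact and easy to misread: note that skip: falls through into enable:. Spelled out as a decision table (a summary of the code above, not new behavior):

	/*
	 * Xen PV guest            -> silent_disable (no message printed)
	 * pti=on                  -> enable, even on AMD CPUs
	 * pti=off, or nopti       -> disable ("disabled" is printed)
	 * pti=auto, or no option  -> AMD: disable; otherwise: enable
	 */
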
1850 | + | ||
1851 | +/* | ||
1852 | + * If anything in here fails, we will likely die on one of the | ||
1853 | + * first kernel->user transitions and init will die. But, we | ||
1854 | + * will have most of the kernel up by then and should be able to | ||
1855 | + * get a clean warning out of it. If we BUG_ON() here, we run | ||
1856 | + * the risk of crashing before we have good console output. | ||
1857 | + */ | ||
1858 | +void __init kaiser_init(void) | ||
1859 | +{ | ||
1860 | + int cpu; | ||
1861 | + | ||
1862 | + if (!kaiser_enabled) | ||
1863 | + return; | ||
1864 | + | ||
1865 | + kaiser_init_all_pgds(); | ||
1866 | + | ||
1867 | + for_each_possible_cpu(cpu) { | ||
1868 | + void *percpu_vaddr = __per_cpu_user_mapped_start + | ||
1869 | + per_cpu_offset(cpu); | ||
1870 | + unsigned long percpu_sz = __per_cpu_user_mapped_end - | ||
1871 | + __per_cpu_user_mapped_start; | ||
1872 | + kaiser_add_user_map_early(percpu_vaddr, percpu_sz, | ||
1873 | + __PAGE_KERNEL); | ||
1874 | + } | ||
1875 | + | ||
1876 | + /* | ||
1877 | + * Map the entry/exit text section, which is needed at | ||
1878 | + * switches from user to and from kernel. | ||
1879 | + */ | ||
1880 | + kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end, | ||
1881 | + __PAGE_KERNEL_RX); | ||
1882 | + | ||
1883 | +#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) | ||
1884 | + kaiser_add_user_map_ptrs_early(__irqentry_text_start, | ||
1885 | + __irqentry_text_end, | ||
1886 | + __PAGE_KERNEL_RX); | ||
1887 | +#endif | ||
1888 | + kaiser_add_user_map_early((void *)idt_descr.address, | ||
1889 | + sizeof(gate_desc) * NR_VECTORS, | ||
1890 | + __PAGE_KERNEL_RO); | ||
1891 | +#ifdef CONFIG_TRACING | ||
1892 | + kaiser_add_user_map_early(&trace_idt_descr, | ||
1893 | + sizeof(trace_idt_descr), | ||
1894 | + __PAGE_KERNEL); | ||
1895 | + kaiser_add_user_map_early(&trace_idt_table, | ||
1896 | + sizeof(gate_desc) * NR_VECTORS, | ||
1897 | + __PAGE_KERNEL); | ||
1898 | +#endif | ||
1899 | + kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr), | ||
1900 | + __PAGE_KERNEL); | ||
1901 | + kaiser_add_user_map_early(&debug_idt_table, | ||
1902 | + sizeof(gate_desc) * NR_VECTORS, | ||
1903 | + __PAGE_KERNEL); | ||
1904 | + | ||
1905 | + pr_info("enabled\n"); | ||
1906 | +} | ||
1907 | + | ||
1908 | +/* Add a mapping to the shadow mapping, and synchronize the mappings */ | ||
1909 | +int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) | ||
1910 | +{ | ||
1911 | + if (!kaiser_enabled) | ||
1912 | + return 0; | ||
1913 | + return kaiser_add_user_map((const void *)addr, size, flags); | ||
1914 | +} | ||
1915 | + | ||
1916 | +void kaiser_remove_mapping(unsigned long start, unsigned long size) | ||
1917 | +{ | ||
1918 | + extern void unmap_pud_range_nofree(pgd_t *pgd, | ||
1919 | + unsigned long start, unsigned long end); | ||
1920 | + unsigned long end = start + size; | ||
1921 | + unsigned long addr, next; | ||
1922 | + pgd_t *pgd; | ||
1923 | + | ||
1924 | + if (!kaiser_enabled) | ||
1925 | + return; | ||
1926 | + pgd = native_get_shadow_pgd(pgd_offset_k(start)); | ||
1927 | + for (addr = start; addr < end; pgd++, addr = next) { | ||
1928 | + next = pgd_addr_end(addr, end); | ||
1929 | + unmap_pud_range_nofree(pgd, addr, next); | ||
1930 | + } | ||
1931 | +} | ||
1932 | + | ||
1933 | +/* | ||
1934 | + * Page table pages are page-aligned. The lower half of the top | ||
1935 | + * level is used for userspace and the top half for the kernel. | ||
1936 | + * This returns true for user pages that need to get copied into | ||
1937 | + * both the user and kernel copies of the page tables, and false | ||
1938 | + * for kernel pages that should only be in the kernel copy. | ||
1939 | + */ | ||
1940 | +static inline bool is_userspace_pgd(pgd_t *pgdp) | ||
1941 | +{ | ||
1942 | + return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2); | ||
1943 | +} | ||
1944 | + | ||
1945 | +pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) | ||
1946 | +{ | ||
1947 | + if (!kaiser_enabled) | ||
1948 | + return pgd; | ||
1949 | + /* | ||
1950 | + * Do we need to also populate the shadow pgd? Check _PAGE_USER to | ||
1951 | + * skip cases like kexec and EFI which make temporary low mappings. | ||
1952 | + */ | ||
1953 | + if (pgd.pgd & _PAGE_USER) { | ||
1954 | + if (is_userspace_pgd(pgdp)) { | ||
1955 | + native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; | ||
1956 | + /* | ||
1957 | + * Even if the entry is *mapping* userspace, ensure | ||
1958 | + * that userspace can not use it. This way, if we | ||
1959 | + * get out to userspace running on the kernel CR3, | ||
1960 | + * userspace will crash instead of running. | ||
1961 | + */ | ||
1962 | + if (__supported_pte_mask & _PAGE_NX) | ||
1963 | + pgd.pgd |= _PAGE_NX; | ||
1964 | + } | ||
1965 | + } else if (!pgd.pgd) { | ||
1966 | + /* | ||
1967 | + * pgd_clear() cannot check _PAGE_USER, and is even used to | ||
1968 | + * clear corrupted pgd entries: so just rely on cases like | ||
1969 | + * kexec and EFI never to be using pgd_clear(). | ||
1970 | + */ | ||
1971 | + if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) && | ||
1972 | + is_userspace_pgd(pgdp)) | ||
1973 | + native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; | ||
1974 | + } | ||
1975 | + return pgd; | ||
1976 | +} | ||
1977 | + | ||
1978 | +void kaiser_setup_pcid(void) | ||
1979 | +{ | ||
1980 | + unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET; | ||
1981 | + | ||
1982 | + if (this_cpu_has(X86_FEATURE_PCID)) | ||
1983 | + user_cr3 |= X86_CR3_PCID_USER_NOFLUSH; | ||
1984 | + /* | ||
1985 | + * These variables are used by the entry/exit | ||
1986 | + * code to change PCID and pgd and TLB flushing. | ||
1987 | + */ | ||
1988 | + this_cpu_write(x86_cr3_pcid_user, user_cr3); | ||
1989 | +} | ||
1990 | + | ||
1991 | +/* | ||
1992 | + * Make a note that this cpu will need to flush USER tlb on return to user. | ||
1993 | + * If cpu does not have PCID, then the NOFLUSH bit will never have been set. | ||
1994 | + */ | ||
1995 | +void kaiser_flush_tlb_on_return_to_user(void) | ||
1996 | +{ | ||
1997 | + if (this_cpu_has(X86_FEATURE_PCID)) | ||
1998 | + this_cpu_write(x86_cr3_pcid_user, | ||
1999 | + X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET); | ||
2000 | +} | ||
2001 | +EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user); | ||
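
kaiser_setup_pcid() and kaiser_flush_tlb_on_return_to_user() only stage a per-cpu value; the consumer is the SWITCH_USER_CR3 assembly in asm/kaiser.h, added earlier in this patch but not shown in this excerpt. Roughly, in C (a sketch only; the real exit path is assembly that must run without a usable stack):

	unsigned long cr3 = native_read_cr3();	/* current kernel CR3 */

	/* Drop any PCID/offset bits, then fold in the staged value:
	 * the shadow-pgd offset (bit 12), the user PCID, and NOFLUSH
	 * (bit 63) unless a user-TLB flush is pending. */
	cr3 &= ~(unsigned long)(X86_CR3_PCID_ASID_MASK |
				KAISER_SHADOW_PGD_OFFSET);
	cr3 |= this_cpu_read(x86_cr3_pcid_user);
	native_write_cr3(cr3);
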
2002 | diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c | ||
2003 | index aed206475aa7..319183d93602 100644 | ||
2004 | --- a/arch/x86/mm/kaslr.c | ||
2005 | +++ b/arch/x86/mm/kaslr.c | ||
2006 | @@ -189,6 +189,6 @@ void __meminit init_trampoline(void) | ||
2007 | *pud_tramp = *pud; | ||
2008 | } | ||
2009 | |||
2010 | - set_pgd(&trampoline_pgd_entry, | ||
2011 | - __pgd(_KERNPG_TABLE | __pa(pud_page_tramp))); | ||
2012 | + /* Avoid set_pgd(), in case it's complicated by CONFIG_PAGE_TABLE_ISOLATION */ | ||
2013 | + trampoline_pgd_entry = __pgd(_KERNPG_TABLE | __pa(pud_page_tramp)); | ||
2014 | } | ||
2015 | diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c | ||
2016 | index e3353c97d086..73dcb0e18c1b 100644 | ||
2017 | --- a/arch/x86/mm/pageattr.c | ||
2018 | +++ b/arch/x86/mm/pageattr.c | ||
2019 | @@ -52,6 +52,7 @@ static DEFINE_SPINLOCK(cpa_lock); | ||
2020 | #define CPA_FLUSHTLB 1 | ||
2021 | #define CPA_ARRAY 2 | ||
2022 | #define CPA_PAGES_ARRAY 4 | ||
2023 | +#define CPA_FREE_PAGETABLES 8 | ||
2024 | |||
2025 | #ifdef CONFIG_PROC_FS | ||
2026 | static unsigned long direct_pages_count[PG_LEVEL_NUM]; | ||
2027 | @@ -729,10 +730,13 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte, | ||
2028 | return 0; | ||
2029 | } | ||
2030 | |||
2031 | -static bool try_to_free_pte_page(pte_t *pte) | ||
2032 | +static bool try_to_free_pte_page(struct cpa_data *cpa, pte_t *pte) | ||
2033 | { | ||
2034 | int i; | ||
2035 | |||
2036 | + if (!(cpa->flags & CPA_FREE_PAGETABLES)) | ||
2037 | + return false; | ||
2038 | + | ||
2039 | for (i = 0; i < PTRS_PER_PTE; i++) | ||
2040 | if (!pte_none(pte[i])) | ||
2041 | return false; | ||
2042 | @@ -741,10 +745,13 @@ static bool try_to_free_pte_page(pte_t *pte) | ||
2043 | return true; | ||
2044 | } | ||
2045 | |||
2046 | -static bool try_to_free_pmd_page(pmd_t *pmd) | ||
2047 | +static bool try_to_free_pmd_page(struct cpa_data *cpa, pmd_t *pmd) | ||
2048 | { | ||
2049 | int i; | ||
2050 | |||
2051 | + if (!(cpa->flags & CPA_FREE_PAGETABLES)) | ||
2052 | + return false; | ||
2053 | + | ||
2054 | for (i = 0; i < PTRS_PER_PMD; i++) | ||
2055 | if (!pmd_none(pmd[i])) | ||
2056 | return false; | ||
2057 | @@ -753,7 +760,9 @@ static bool try_to_free_pmd_page(pmd_t *pmd) | ||
2058 | return true; | ||
2059 | } | ||
2060 | |||
2061 | -static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end) | ||
2062 | +static bool unmap_pte_range(struct cpa_data *cpa, pmd_t *pmd, | ||
2063 | + unsigned long start, | ||
2064 | + unsigned long end) | ||
2065 | { | ||
2066 | pte_t *pte = pte_offset_kernel(pmd, start); | ||
2067 | |||
2068 | @@ -764,22 +773,23 @@ static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end) | ||
2069 | pte++; | ||
2070 | } | ||
2071 | |||
2072 | - if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) { | ||
2073 | + if (try_to_free_pte_page(cpa, (pte_t *)pmd_page_vaddr(*pmd))) { | ||
2074 | pmd_clear(pmd); | ||
2075 | return true; | ||
2076 | } | ||
2077 | return false; | ||
2078 | } | ||
2079 | |||
2080 | -static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd, | ||
2081 | +static void __unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, pmd_t *pmd, | ||
2082 | unsigned long start, unsigned long end) | ||
2083 | { | ||
2084 | - if (unmap_pte_range(pmd, start, end)) | ||
2085 | - if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) | ||
2086 | + if (unmap_pte_range(cpa, pmd, start, end)) | ||
2087 | + if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud))) | ||
2088 | pud_clear(pud); | ||
2089 | } | ||
2090 | |||
2091 | -static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) | ||
2092 | +static void unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, | ||
2093 | + unsigned long start, unsigned long end) | ||
2094 | { | ||
2095 | pmd_t *pmd = pmd_offset(pud, start); | ||
2096 | |||
2097 | @@ -790,7 +800,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) | ||
2098 | unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; | ||
2099 | unsigned long pre_end = min_t(unsigned long, end, next_page); | ||
2100 | |||
2101 | - __unmap_pmd_range(pud, pmd, start, pre_end); | ||
2102 | + __unmap_pmd_range(cpa, pud, pmd, start, pre_end); | ||
2103 | |||
2104 | start = pre_end; | ||
2105 | pmd++; | ||
2106 | @@ -803,7 +813,8 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) | ||
2107 | if (pmd_large(*pmd)) | ||
2108 | pmd_clear(pmd); | ||
2109 | else | ||
2110 | - __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE); | ||
2111 | + __unmap_pmd_range(cpa, pud, pmd, | ||
2112 | + start, start + PMD_SIZE); | ||
2113 | |||
2114 | start += PMD_SIZE; | ||
2115 | pmd++; | ||
2116 | @@ -813,17 +824,19 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) | ||
2117 | * 4K leftovers? | ||
2118 | */ | ||
2119 | if (start < end) | ||
2120 | - return __unmap_pmd_range(pud, pmd, start, end); | ||
2121 | + return __unmap_pmd_range(cpa, pud, pmd, start, end); | ||
2122 | |||
2123 | /* | ||
2124 | * Try again to free the PMD page if haven't succeeded above. | ||
2125 | */ | ||
2126 | if (!pud_none(*pud)) | ||
2127 | - if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) | ||
2128 | + if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud))) | ||
2129 | pud_clear(pud); | ||
2130 | } | ||
2131 | |||
2132 | -static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) | ||
2133 | +static void __unmap_pud_range(struct cpa_data *cpa, pgd_t *pgd, | ||
2134 | + unsigned long start, | ||
2135 | + unsigned long end) | ||
2136 | { | ||
2137 | pud_t *pud = pud_offset(pgd, start); | ||
2138 | |||
2139 | @@ -834,7 +847,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) | ||
2140 | unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; | ||
2141 | unsigned long pre_end = min_t(unsigned long, end, next_page); | ||
2142 | |||
2143 | - unmap_pmd_range(pud, start, pre_end); | ||
2144 | + unmap_pmd_range(cpa, pud, start, pre_end); | ||
2145 | |||
2146 | start = pre_end; | ||
2147 | pud++; | ||
2148 | @@ -848,7 +861,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) | ||
2149 | if (pud_large(*pud)) | ||
2150 | pud_clear(pud); | ||
2151 | else | ||
2152 | - unmap_pmd_range(pud, start, start + PUD_SIZE); | ||
2153 | + unmap_pmd_range(cpa, pud, start, start + PUD_SIZE); | ||
2154 | |||
2155 | start += PUD_SIZE; | ||
2156 | pud++; | ||
2157 | @@ -858,7 +871,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) | ||
2158 | * 2M leftovers? | ||
2159 | */ | ||
2160 | if (start < end) | ||
2161 | - unmap_pmd_range(pud, start, end); | ||
2162 | + unmap_pmd_range(cpa, pud, start, end); | ||
2163 | |||
2164 | /* | ||
2165 | * No need to try to free the PUD page because we'll free it in | ||
2166 | @@ -866,6 +879,24 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) | ||
2167 | */ | ||
2168 | } | ||
2169 | |||
2170 | +static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) | ||
2171 | +{ | ||
2172 | + struct cpa_data cpa = { | ||
2173 | + .flags = CPA_FREE_PAGETABLES, | ||
2174 | + }; | ||
2175 | + | ||
2176 | + __unmap_pud_range(&cpa, pgd, start, end); | ||
2177 | +} | ||
2178 | + | ||
2179 | +void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end) | ||
2180 | +{ | ||
2181 | + struct cpa_data cpa = { | ||
2182 | + .flags = 0, | ||
2183 | + }; | ||
2184 | + | ||
2185 | + __unmap_pud_range(&cpa, pgd, start, end); | ||
2186 | +} | ||
2187 | + | ||
2188 | static int alloc_pte_page(pmd_t *pmd) | ||
2189 | { | ||
2190 | pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); | ||
2191 | diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c | ||
2192 | index 3feec5af4e67..5aaec8effc5f 100644 | ||
2193 | --- a/arch/x86/mm/pgtable.c | ||
2194 | +++ b/arch/x86/mm/pgtable.c | ||
2195 | @@ -344,14 +344,22 @@ static inline void _pgd_free(pgd_t *pgd) | ||
2196 | kmem_cache_free(pgd_cache, pgd); | ||
2197 | } | ||
2198 | #else | ||
2199 | + | ||
2200 | +/* | ||
2201 | + * Instead of one pgd, Kaiser acquires two pgds. Being order-1, it is | ||
2202 | + * both 8k in size and 8k-aligned. That lets us just flip bit 12 | ||
2203 | + * in a pointer to swap between the two 4k halves. | ||
2204 | + */ | ||
2205 | +#define PGD_ALLOCATION_ORDER kaiser_enabled | ||
2206 | + | ||
2207 | static inline pgd_t *_pgd_alloc(void) | ||
2208 | { | ||
2209 | - return (pgd_t *)__get_free_page(PGALLOC_GFP); | ||
2210 | + return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER); | ||
2211 | } | ||
2212 | |||
2213 | static inline void _pgd_free(pgd_t *pgd) | ||
2214 | { | ||
2215 | - free_page((unsigned long)pgd); | ||
2216 | + free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER); | ||
2217 | } | ||
2218 | #endif /* CONFIG_X86_PAE */ | ||
2219 | |||
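
The order-1, 8k-aligned allocation is what makes the shadow lookup free, and since PGD_ALLOCATION_ORDER is literally kaiser_enabled, the extra page per mm is only paid when isolation is on. A sketch of the resulting helper (the real native_get_shadow_pgd() lives in asm/kaiser.h, not in this hunk):

	/* The kernel pgd occupies the low 4k of the 8k pair and the
	 * user/shadow pgd the high 4k, so selecting between them is a
	 * single bit-12 flip. */
	static inline pgd_t *shadow_pgd_sketch(pgd_t *kernel_pgdp)
	{
		return (pgd_t *)((unsigned long)kernel_pgdp | PAGE_SIZE);
	}
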
2220 | diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c | ||
2221 | index 53b72fb4e781..41205de487e7 100644 | ||
2222 | --- a/arch/x86/mm/tlb.c | ||
2223 | +++ b/arch/x86/mm/tlb.c | ||
2224 | @@ -6,13 +6,14 @@ | ||
2225 | #include <linux/interrupt.h> | ||
2226 | #include <linux/export.h> | ||
2227 | #include <linux/cpu.h> | ||
2228 | +#include <linux/debugfs.h> | ||
2229 | |||
2230 | #include <asm/tlbflush.h> | ||
2231 | #include <asm/mmu_context.h> | ||
2232 | #include <asm/cache.h> | ||
2233 | #include <asm/apic.h> | ||
2234 | #include <asm/uv/uv.h> | ||
2235 | -#include <linux/debugfs.h> | ||
2236 | +#include <asm/kaiser.h> | ||
2237 | |||
2238 | /* | ||
2239 | * TLB flushing, formerly SMP-only | ||
2240 | @@ -34,6 +35,36 @@ struct flush_tlb_info { | ||
2241 | unsigned long flush_end; | ||
2242 | }; | ||
2243 | |||
2244 | +static void load_new_mm_cr3(pgd_t *pgdir) | ||
2245 | +{ | ||
2246 | + unsigned long new_mm_cr3 = __pa(pgdir); | ||
2247 | + | ||
2248 | + if (kaiser_enabled) { | ||
2249 | + /* | ||
2250 | + * We reuse the same PCID for different tasks, so we must | ||
2251 | + * flush all the entries for the PCID out when we change tasks. | ||
2252 | + * Flush KERN below, flush USER when returning to userspace in | ||
2253 | + * kaiser's SWITCH_USER_CR3 (_SWITCH_TO_USER_CR3) macro. | ||
2254 | + * | ||
2255 | + * invpcid_flush_single_context(X86_CR3_PCID_ASID_USER) could | ||
2256 | + * do it here, but can only be used if X86_FEATURE_INVPCID is | ||
2257 | + * available - and many machines support pcid without invpcid. | ||
2258 | + * | ||
2259 | + * If X86_CR3_PCID_KERN_FLUSH actually added something, then it | ||
2260 | + * would be needed in the write_cr3() below - if PCIDs enabled. | ||
2261 | + */ | ||
2262 | + BUILD_BUG_ON(X86_CR3_PCID_KERN_FLUSH); | ||
2263 | + kaiser_flush_tlb_on_return_to_user(); | ||
2264 | + } | ||
2265 | + | ||
2266 | + /* | ||
2267 | + * Caution: many callers of this function expect | ||
2268 | + * that load_cr3() is serializing and orders TLB | ||
2269 | + * fills with respect to the mm_cpumask writes. | ||
2270 | + */ | ||
2271 | + write_cr3(new_mm_cr3); | ||
2272 | +} | ||
2273 | + | ||
2274 | /* | ||
2275 | * We cannot call mmdrop() because we are in interrupt context, | ||
2276 | * instead update mm->cpu_vm_mask. | ||
2277 | @@ -45,7 +76,7 @@ void leave_mm(int cpu) | ||
2278 | BUG(); | ||
2279 | if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) { | ||
2280 | cpumask_clear_cpu(cpu, mm_cpumask(active_mm)); | ||
2281 | - load_cr3(swapper_pg_dir); | ||
2282 | + load_new_mm_cr3(swapper_pg_dir); | ||
2283 | /* | ||
2284 | * This gets called in the idle path where RCU | ||
2285 | * functions differently. Tracing normally | ||
2286 | @@ -120,7 +151,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, | ||
2287 | * ordering guarantee we need. | ||
2288 | * | ||
2289 | */ | ||
2290 | - load_cr3(next->pgd); | ||
2291 | + load_new_mm_cr3(next->pgd); | ||
2292 | |||
2293 | trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); | ||
2294 | |||
2295 | @@ -167,7 +198,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, | ||
2296 | * As above, load_cr3() is serializing and orders TLB | ||
2297 | * fills with respect to the mm_cpumask write. | ||
2298 | */ | ||
2299 | - load_cr3(next->pgd); | ||
2300 | + load_new_mm_cr3(next->pgd); | ||
2301 | trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); | ||
2302 | load_mm_cr4(next); | ||
2303 | load_mm_ldt(next); | ||
2304 | diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h | ||
2305 | index dc81e5287ebf..2e6000a4eb2c 100644 | ||
2306 | --- a/include/asm-generic/vmlinux.lds.h | ||
2307 | +++ b/include/asm-generic/vmlinux.lds.h | ||
2308 | @@ -778,7 +778,14 @@ | ||
2309 | */ | ||
2310 | #define PERCPU_INPUT(cacheline) \ | ||
2311 | VMLINUX_SYMBOL(__per_cpu_start) = .; \ | ||
2312 | + VMLINUX_SYMBOL(__per_cpu_user_mapped_start) = .; \ | ||
2313 | *(.data..percpu..first) \ | ||
2314 | + . = ALIGN(cacheline); \ | ||
2315 | + *(.data..percpu..user_mapped) \ | ||
2316 | + *(.data..percpu..user_mapped..shared_aligned) \ | ||
2317 | + . = ALIGN(PAGE_SIZE); \ | ||
2318 | + *(.data..percpu..user_mapped..page_aligned) \ | ||
2319 | + VMLINUX_SYMBOL(__per_cpu_user_mapped_end) = .; \ | ||
2320 | . = ALIGN(PAGE_SIZE); \ | ||
2321 | *(.data..percpu..page_aligned) \ | ||
2322 | . = ALIGN(cacheline); \ | ||
2323 | diff --git a/include/linux/kaiser.h b/include/linux/kaiser.h | ||
2324 | new file mode 100644 | ||
2325 | index 000000000000..58c55b1589d0 | ||
2326 | --- /dev/null | ||
2327 | +++ b/include/linux/kaiser.h | ||
2328 | @@ -0,0 +1,52 @@ | ||
2329 | +#ifndef _LINUX_KAISER_H | ||
2330 | +#define _LINUX_KAISER_H | ||
2331 | + | ||
2332 | +#ifdef CONFIG_PAGE_TABLE_ISOLATION | ||
2333 | +#include <asm/kaiser.h> | ||
2334 | + | ||
2335 | +static inline int kaiser_map_thread_stack(void *stack) | ||
2336 | +{ | ||
2337 | + /* | ||
2338 | + * Map that page of kernel stack on which we enter from user context. | ||
2339 | + */ | ||
2340 | + return kaiser_add_mapping((unsigned long)stack + | ||
2341 | + THREAD_SIZE - PAGE_SIZE, PAGE_SIZE, __PAGE_KERNEL); | ||
2342 | +} | ||
2343 | + | ||
2344 | +static inline void kaiser_unmap_thread_stack(void *stack) | ||
2345 | +{ | ||
2346 | + /* | ||
2347 | + * Note: may be called even when kaiser_map_thread_stack() failed. | ||
2348 | + */ | ||
2349 | + kaiser_remove_mapping((unsigned long)stack + | ||
2350 | + THREAD_SIZE - PAGE_SIZE, PAGE_SIZE); | ||
2351 | +} | ||
2352 | +#else | ||
2353 | + | ||
2354 | +/* | ||
2355 | + * These stubs are used whenever CONFIG_PAGE_TABLE_ISOLATION is off, which | ||
2356 | + * includes architectures that support KAISER, but have it disabled. | ||
2357 | + */ | ||
2358 | + | ||
2359 | +static inline void kaiser_init(void) | ||
2360 | +{ | ||
2361 | +} | ||
2362 | +static inline int kaiser_add_mapping(unsigned long addr, | ||
2363 | + unsigned long size, unsigned long flags) | ||
2364 | +{ | ||
2365 | + return 0; | ||
2366 | +} | ||
2367 | +static inline void kaiser_remove_mapping(unsigned long start, | ||
2368 | + unsigned long size) | ||
2369 | +{ | ||
2370 | +} | ||
2371 | +static inline int kaiser_map_thread_stack(void *stack) | ||
2372 | +{ | ||
2373 | + return 0; | ||
2374 | +} | ||
2375 | +static inline void kaiser_unmap_thread_stack(void *stack) | ||
2376 | +{ | ||
2377 | +} | ||
2378 | + | ||
2379 | +#endif /* !CONFIG_PAGE_TABLE_ISOLATION */ | ||
2380 | +#endif /* _LINUX_KAISER_H */ | ||
2381 | diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h | ||
2382 | index fff21a82780c..490f5a83f947 100644 | ||
2383 | --- a/include/linux/mmzone.h | ||
2384 | +++ b/include/linux/mmzone.h | ||
2385 | @@ -124,8 +124,9 @@ enum zone_stat_item { | ||
2386 | NR_SLAB_UNRECLAIMABLE, | ||
2387 | NR_PAGETABLE, /* used for pagetables */ | ||
2388 | NR_KERNEL_STACK_KB, /* measured in KiB */ | ||
2389 | - /* Second 128 byte cacheline */ | ||
2390 | + NR_KAISERTABLE, | ||
2391 | NR_BOUNCE, | ||
2392 | + /* Second 128 byte cacheline */ | ||
2393 | #if IS_ENABLED(CONFIG_ZSMALLOC) | ||
2394 | NR_ZSPAGES, /* allocated in zsmalloc */ | ||
2395 | #endif | ||
2396 | diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h | ||
2397 | index 8f16299ca068..8902f23bb770 100644 | ||
2398 | --- a/include/linux/percpu-defs.h | ||
2399 | +++ b/include/linux/percpu-defs.h | ||
2400 | @@ -35,6 +35,12 @@ | ||
2401 | |||
2402 | #endif | ||
2403 | |||
2404 | +#ifdef CONFIG_PAGE_TABLE_ISOLATION | ||
2405 | +#define USER_MAPPED_SECTION "..user_mapped" | ||
2406 | +#else | ||
2407 | +#define USER_MAPPED_SECTION "" | ||
2408 | +#endif | ||
2409 | + | ||
2410 | /* | ||
2411 | * Base implementations of per-CPU variable declarations and definitions, where | ||
2412 | * the section in which the variable is to be placed is provided by the | ||
2413 | @@ -115,6 +121,12 @@ | ||
2414 | #define DEFINE_PER_CPU(type, name) \ | ||
2415 | DEFINE_PER_CPU_SECTION(type, name, "") | ||
2416 | |||
2417 | +#define DECLARE_PER_CPU_USER_MAPPED(type, name) \ | ||
2418 | + DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION) | ||
2419 | + | ||
2420 | +#define DEFINE_PER_CPU_USER_MAPPED(type, name) \ | ||
2421 | + DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION) | ||
2422 | + | ||
2423 | /* | ||
2424 | * Declaration/definition used for per-CPU variables that must come first in | ||
2425 | * the set of variables. | ||
2426 | @@ -144,6 +156,14 @@ | ||
2427 | DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \ | ||
2428 | ____cacheline_aligned_in_smp | ||
2429 | |||
2430 | +#define DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \ | ||
2431 | + DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \ | ||
2432 | + ____cacheline_aligned_in_smp | ||
2433 | + | ||
2434 | +#define DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \ | ||
2435 | + DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \ | ||
2436 | + ____cacheline_aligned_in_smp | ||
2437 | + | ||
2438 | #define DECLARE_PER_CPU_ALIGNED(type, name) \ | ||
2439 | DECLARE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION) \ | ||
2440 | ____cacheline_aligned | ||
2441 | @@ -162,11 +182,21 @@ | ||
2442 | #define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \ | ||
2443 | DEFINE_PER_CPU_SECTION(type, name, "..page_aligned") \ | ||
2444 | __aligned(PAGE_SIZE) | ||
2445 | +/* | ||
2446 | + * Declaration/definition used for per-CPU variables that must be page aligned and need to be mapped in user mode. | ||
2447 | + */ | ||
2448 | +#define DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \ | ||
2449 | + DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \ | ||
2450 | + __aligned(PAGE_SIZE) | ||
2451 | + | ||
2452 | +#define DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \ | ||
2453 | + DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \ | ||
2454 | + __aligned(PAGE_SIZE) | ||
2455 | |||
2456 | /* | ||
2457 | * Declaration/definition used for per-CPU variables that must be read mostly. | ||
2458 | */ | ||
2459 | -#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \ | ||
2460 | +#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \ | ||
2461 | DECLARE_PER_CPU_SECTION(type, name, "..read_mostly") | ||
2462 | |||
2463 | #define DEFINE_PER_CPU_READ_MOSTLY(type, name) \ | ||
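
Together with the vmlinux.lds.h change above, these macros let a per-cpu variable opt into the window that kaiser_init() maps for every possible cpu. A usage sketch (the variable name is hypothetical):

	/* Lands in .data..percpu..user_mapped, inside the
	 * [__per_cpu_user_mapped_start, __per_cpu_user_mapped_end)
	 * window, so the entry code can reach it on the user CR3. */
	DEFINE_PER_CPU_USER_MAPPED(unsigned long, example_entry_scratch);
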
2464 | diff --git a/init/main.c b/init/main.c | ||
2465 | index 25bac88bc66e..99f026565608 100644 | ||
2466 | --- a/init/main.c | ||
2467 | +++ b/init/main.c | ||
2468 | @@ -80,6 +80,7 @@ | ||
2469 | #include <linux/integrity.h> | ||
2470 | #include <linux/proc_ns.h> | ||
2471 | #include <linux/io.h> | ||
2472 | +#include <linux/kaiser.h> | ||
2473 | |||
2474 | #include <asm/io.h> | ||
2475 | #include <asm/bugs.h> | ||
2476 | @@ -473,6 +474,7 @@ static void __init mm_init(void) | ||
2477 | pgtable_init(); | ||
2478 | vmalloc_init(); | ||
2479 | ioremap_huge_init(); | ||
2480 | + kaiser_init(); | ||
2481 | } | ||
2482 | |||
2483 | asmlinkage __visible void __init start_kernel(void) | ||
2484 | diff --git a/kernel/fork.c b/kernel/fork.c | ||
2485 | index 9321b1ad3335..70e10cb49be0 100644 | ||
2486 | --- a/kernel/fork.c | ||
2487 | +++ b/kernel/fork.c | ||
2488 | @@ -58,6 +58,7 @@ | ||
2489 | #include <linux/tsacct_kern.h> | ||
2490 | #include <linux/cn_proc.h> | ||
2491 | #include <linux/freezer.h> | ||
2492 | +#include <linux/kaiser.h> | ||
2493 | #include <linux/delayacct.h> | ||
2494 | #include <linux/taskstats_kern.h> | ||
2495 | #include <linux/random.h> | ||
2496 | @@ -213,6 +214,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) | ||
2497 | |||
2498 | static inline void free_thread_stack(struct task_struct *tsk) | ||
2499 | { | ||
2500 | + kaiser_unmap_thread_stack(tsk->stack); | ||
2501 | #ifdef CONFIG_VMAP_STACK | ||
2502 | if (task_stack_vm_area(tsk)) { | ||
2503 | unsigned long flags; | ||
2504 | @@ -495,6 +497,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) | ||
2505 | * functions again. | ||
2506 | */ | ||
2507 | tsk->stack = stack; | ||
2508 | + | ||
2509 | + err = kaiser_map_thread_stack(tsk->stack); | ||
2510 | + if (err) | ||
2511 | + goto free_stack; | ||
2512 | #ifdef CONFIG_VMAP_STACK | ||
2513 | tsk->stack_vm_area = stack_vm_area; | ||
2514 | #endif | ||
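
Only the top page of each kernel stack gets mirrored, because that is the page the CPU touches on entry from user space before the entry code can switch CR3; with the stubs from include/linux/kaiser.h above, all of this compiles away when isolation is off. A worked note (assuming 4k pages and the 16k x86_64 stacks of this kernel):

	/* kaiser_map_thread_stack(stack) maps exactly one page:
	 *	start = (unsigned long)stack + THREAD_SIZE - PAGE_SIZE;
	 * i.e. bytes [12k, 16k) of the stack, where the initial
	 * pt_regs frame is built at kernel entry. */
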
2515 | diff --git a/mm/vmstat.c b/mm/vmstat.c | ||
2516 | index 604f26a4f696..6a088df04b29 100644 | ||
2517 | --- a/mm/vmstat.c | ||
2518 | +++ b/mm/vmstat.c | ||
2519 | @@ -932,6 +932,7 @@ const char * const vmstat_text[] = { | ||
2520 | "nr_slab_unreclaimable", | ||
2521 | "nr_page_table_pages", | ||
2522 | "nr_kernel_stack", | ||
2523 | + "nr_overhead", | ||
2524 | "nr_bounce", | ||
2525 | #if IS_ENABLED(CONFIG_ZSMALLOC) | ||
2526 | "nr_zspages", | ||
2527 | diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c | ||
2528 | index 97f9cac98348..e86a34fd5484 100644 | ||
2529 | --- a/net/ipv4/tcp_bbr.c | ||
2530 | +++ b/net/ipv4/tcp_bbr.c | ||
2531 | @@ -843,6 +843,11 @@ static u32 bbr_sndbuf_expand(struct sock *sk) | ||
2532 | */ | ||
2533 | static u32 bbr_undo_cwnd(struct sock *sk) | ||
2534 | { | ||
2535 | + struct bbr *bbr = inet_csk_ca(sk); | ||
2536 | + | ||
2537 | + bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ | ||
2538 | + bbr->full_bw_cnt = 0; | ||
2539 | + bbr_reset_lt_bw_sampling(sk); | ||
2540 | return tcp_sk(sk)->snd_cwnd; | ||
2541 | } | ||
2542 | |||
2543 | diff --git a/security/Kconfig b/security/Kconfig | ||
2544 | index 118f4549404e..32f36b40e9f0 100644 | ||
2545 | --- a/security/Kconfig | ||
2546 | +++ b/security/Kconfig | ||
2547 | @@ -31,6 +31,16 @@ config SECURITY | ||
2548 | |||
2549 | If you are unsure how to answer this question, answer N. | ||
2550 | |||
2551 | +config PAGE_TABLE_ISOLATION | ||
2552 | + bool "Remove the kernel mapping in user mode" | ||
2553 | + default y | ||
2554 | + depends on X86_64 && SMP | ||
2555 | + help | ||
2556 | + This enforces strict isolation between kernel and user address spaces, | ||
2557 | + in order to close hardware side channels on kernel address information. | ||
2558 | + | ||
2559 | + If you are unsure how to answer this question, answer Y. | ||
2560 | + | ||
2561 | config SECURITYFS | ||
2562 | bool "Enable the securityfs filesystem" | ||
2563 | help | ||
2564 | diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h | ||
2565 | index a39629206864..f79669a38c0c 100644 | ||
2566 | --- a/tools/arch/x86/include/asm/cpufeatures.h | ||
2567 | +++ b/tools/arch/x86/include/asm/cpufeatures.h | ||
2568 | @@ -197,6 +197,9 @@ | ||
2569 | #define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */ | ||
2570 | #define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */ | ||
2571 | |||
2572 | +/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */ | ||
2573 | +#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */ | ||
2574 | + | ||
2575 | /* Virtualization flags: Linux defined, word 8 */ | ||
2576 | #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ | ||
2577 | #define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ |