Contents of /trunk/kernel-alx/patches-4.9/0174-4.9.75-all-fixes.patch
Revision 3063
Wed Jan 10 10:33:48 2018 UTC by niro
File size: 79490 byte(s)
-linux-4.9.75
1 | diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt |
2 | index 152ec4e87b57..5d2676d043de 100644 |
3 | --- a/Documentation/kernel-parameters.txt |
4 | +++ b/Documentation/kernel-parameters.txt |
5 | @@ -2763,6 +2763,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted. |
6 | |
7 | nojitter [IA-64] Disables jitter checking for ITC timers. |
8 | |
9 | + nopti [X86-64] Disable KAISER isolation of kernel from user. |
10 | + |
11 | no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver |
12 | |
13 | no-kvmapf [X86,KVM] Disable paravirtualized asynchronous page |
14 | @@ -3325,6 +3327,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. |
15 | pt. [PARIDE] |
16 | See Documentation/blockdev/paride.txt. |
17 | |
18 | + pti= [X86_64] |
19 | + Control KAISER user/kernel address space isolation: |
20 | + on - enable |
21 | + off - disable |
22 | + auto - default setting |
23 | + |
24 | pty.legacy_count= |
25 | [KNL] Number of legacy pty's. Overwrites compiled-in |
26 | default number. |
27 | diff --git a/Makefile b/Makefile |
28 | index 075e429732e7..acbc1b032db2 100644 |
29 | --- a/Makefile |
30 | +++ b/Makefile |
31 | @@ -1,6 +1,6 @@ |
32 | VERSION = 4 |
33 | PATCHLEVEL = 9 |
34 | -SUBLEVEL = 74 |
35 | +SUBLEVEL = 75 |
36 | EXTRAVERSION = |
37 | NAME = Roaring Lionus |
38 | |
39 | diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h |
40 | index 766a5211f827..2728e1b7e4a6 100644 |
41 | --- a/arch/x86/boot/compressed/misc.h |
42 | +++ b/arch/x86/boot/compressed/misc.h |
43 | @@ -9,6 +9,7 @@ |
44 | */ |
45 | #undef CONFIG_PARAVIRT |
46 | #undef CONFIG_PARAVIRT_SPINLOCKS |
47 | +#undef CONFIG_PAGE_TABLE_ISOLATION |
48 | #undef CONFIG_KASAN |
49 | |
50 | #include <linux/linkage.h> |
51 | diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S |
52 | index e7b0e7ff4c58..af4e58132d91 100644 |
53 | --- a/arch/x86/entry/entry_64.S |
54 | +++ b/arch/x86/entry/entry_64.S |
55 | @@ -36,6 +36,7 @@ |
56 | #include <asm/smap.h> |
57 | #include <asm/pgtable_types.h> |
58 | #include <asm/export.h> |
59 | +#include <asm/kaiser.h> |
60 | #include <linux/err.h> |
61 | |
62 | /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ |
63 | @@ -146,6 +147,7 @@ ENTRY(entry_SYSCALL_64) |
64 | * it is too small to ever cause noticeable irq latency. |
65 | */ |
66 | SWAPGS_UNSAFE_STACK |
67 | + SWITCH_KERNEL_CR3_NO_STACK |
68 | /* |
69 | * A hypervisor implementation might want to use a label |
70 | * after the swapgs, so that it can do the swapgs |
71 | @@ -228,6 +230,14 @@ entry_SYSCALL_64_fastpath: |
72 | movq RIP(%rsp), %rcx |
73 | movq EFLAGS(%rsp), %r11 |
74 | RESTORE_C_REGS_EXCEPT_RCX_R11 |
75 | + /* |
76 | + * This opens a window where we have a user CR3, but are |
77 | + * running in the kernel. This makes using the CS |
78 | + * register useless for telling whether or not we need to |
79 | + * switch CR3 in NMIs. Normal interrupts are OK because |
80 | + * they are off here. |
81 | + */ |
82 | + SWITCH_USER_CR3 |
83 | movq RSP(%rsp), %rsp |
84 | USERGS_SYSRET64 |
85 | |
86 | @@ -323,10 +333,26 @@ return_from_SYSCALL_64: |
87 | syscall_return_via_sysret: |
88 | /* rcx and r11 are already restored (see code above) */ |
89 | RESTORE_C_REGS_EXCEPT_RCX_R11 |
90 | + /* |
91 | + * This opens a window where we have a user CR3, but are |
92 | + * running in the kernel. This makes using the CS |
93 | + * register useless for telling whether or not we need to |
94 | + * switch CR3 in NMIs. Normal interrupts are OK because |
95 | + * they are off here. |
96 | + */ |
97 | + SWITCH_USER_CR3 |
98 | movq RSP(%rsp), %rsp |
99 | USERGS_SYSRET64 |
100 | |
101 | opportunistic_sysret_failed: |
102 | + /* |
103 | + * This opens a window where we have a user CR3, but are |
104 | + * running in the kernel. This makes using the CS |
105 | + * register useless for telling whether or not we need to |
106 | + * switch CR3 in NMIs. Normal interrupts are OK because |
107 | + * they are off here. |
108 | + */ |
109 | + SWITCH_USER_CR3 |
110 | SWAPGS |
111 | jmp restore_c_regs_and_iret |
112 | END(entry_SYSCALL_64) |
113 | @@ -424,6 +450,7 @@ ENTRY(ret_from_fork) |
114 | movq %rsp, %rdi |
115 | call syscall_return_slowpath /* returns with IRQs disabled */ |
116 | TRACE_IRQS_ON /* user mode is traced as IRQS on */ |
117 | + SWITCH_USER_CR3 |
118 | SWAPGS |
119 | jmp restore_regs_and_iret |
120 | |
121 | @@ -478,6 +505,7 @@ END(irq_entries_start) |
122 | * tracking that we're in kernel mode. |
123 | */ |
124 | SWAPGS |
125 | + SWITCH_KERNEL_CR3 |
126 | |
127 | /* |
128 | * We need to tell lockdep that IRQs are off. We can't do this until |
129 | @@ -535,6 +563,7 @@ GLOBAL(retint_user) |
130 | mov %rsp,%rdi |
131 | call prepare_exit_to_usermode |
132 | TRACE_IRQS_IRETQ |
133 | + SWITCH_USER_CR3 |
134 | SWAPGS |
135 | jmp restore_regs_and_iret |
136 | |
137 | @@ -612,6 +641,7 @@ native_irq_return_ldt: |
138 | |
139 | pushq %rdi /* Stash user RDI */ |
140 | SWAPGS |
141 | + SWITCH_KERNEL_CR3 |
142 | movq PER_CPU_VAR(espfix_waddr), %rdi |
143 | movq %rax, (0*8)(%rdi) /* user RAX */ |
144 | movq (1*8)(%rsp), %rax /* user RIP */ |
145 | @@ -638,6 +668,7 @@ native_irq_return_ldt: |
146 | * still points to an RO alias of the ESPFIX stack. |
147 | */ |
148 | orq PER_CPU_VAR(espfix_stack), %rax |
149 | + SWITCH_USER_CR3 |
150 | SWAPGS |
151 | movq %rax, %rsp |
152 | |
153 | @@ -1022,7 +1053,11 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vec |
154 | /* |
155 | * Save all registers in pt_regs, and switch gs if needed. |
156 | * Use slow, but surefire "are we in kernel?" check. |
157 | - * Return: ebx=0: need swapgs on exit, ebx=1: otherwise |
158 | + * |
159 | + * Return: ebx=0: needs swapgs but not SWITCH_USER_CR3 in paranoid_exit |
160 | + * ebx=1: needs neither swapgs nor SWITCH_USER_CR3 in paranoid_exit |
161 | + * ebx=2: needs both swapgs and SWITCH_USER_CR3 in paranoid_exit |
162 | + * ebx=3: needs SWITCH_USER_CR3 but not swapgs in paranoid_exit |
163 | */ |
164 | ENTRY(paranoid_entry) |
165 | cld |
166 | @@ -1035,7 +1070,26 @@ ENTRY(paranoid_entry) |
167 | js 1f /* negative -> in kernel */ |
168 | SWAPGS |
169 | xorl %ebx, %ebx |
170 | -1: ret |
171 | +1: |
172 | +#ifdef CONFIG_PAGE_TABLE_ISOLATION |
173 | + /* |
174 | + * We might have come in between a swapgs and a SWITCH_KERNEL_CR3 |
175 | + * on entry, or between a SWITCH_USER_CR3 and a swapgs on exit. |
176 | + * Do a conditional SWITCH_KERNEL_CR3: this could safely be done |
177 | + * unconditionally, but we need to find out whether the reverse |
178 | + * should be done on return (conveyed to paranoid_exit in %ebx). |
179 | + */ |
180 | + ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER |
181 | + testl $KAISER_SHADOW_PGD_OFFSET, %eax |
182 | + jz 2f |
183 | + orl $2, %ebx |
184 | + andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax |
185 | + /* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */ |
186 | + ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID |
187 | + movq %rax, %cr3 |
188 | +2: |
189 | +#endif |
190 | + ret |
191 | END(paranoid_entry) |
192 | |
193 | /* |
194 | @@ -1048,19 +1102,26 @@ END(paranoid_entry) |
195 | * be complicated. Fortunately, there's no good reason |
196 | * to try to handle preemption here. |
197 | * |
198 | - * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) |
199 | + * On entry: ebx=0: needs swapgs but not SWITCH_USER_CR3 |
200 | + * ebx=1: needs neither swapgs nor SWITCH_USER_CR3 |
201 | + * ebx=2: needs both swapgs and SWITCH_USER_CR3 |
202 | + * ebx=3: needs SWITCH_USER_CR3 but not swapgs |
203 | */ |
204 | ENTRY(paranoid_exit) |
205 | DISABLE_INTERRUPTS(CLBR_NONE) |
206 | TRACE_IRQS_OFF_DEBUG |
207 | - testl %ebx, %ebx /* swapgs needed? */ |
208 | + TRACE_IRQS_IRETQ_DEBUG |
209 | +#ifdef CONFIG_PAGE_TABLE_ISOLATION |
210 | + /* No ALTERNATIVE for X86_FEATURE_KAISER: paranoid_entry sets %ebx */ |
211 | + testl $2, %ebx /* SWITCH_USER_CR3 needed? */ |
212 | + jz paranoid_exit_no_switch |
213 | + SWITCH_USER_CR3 |
214 | +paranoid_exit_no_switch: |
215 | +#endif |
216 | + testl $1, %ebx /* swapgs needed? */ |
217 | jnz paranoid_exit_no_swapgs |
218 | - TRACE_IRQS_IRETQ |
219 | SWAPGS_UNSAFE_STACK |
220 | - jmp paranoid_exit_restore |
221 | paranoid_exit_no_swapgs: |
222 | - TRACE_IRQS_IRETQ_DEBUG |
223 | -paranoid_exit_restore: |
224 | RESTORE_EXTRA_REGS |
225 | RESTORE_C_REGS |
226 | REMOVE_PT_GPREGS_FROM_STACK 8 |
227 | @@ -1075,6 +1136,13 @@ ENTRY(error_entry) |
228 | cld |
229 | SAVE_C_REGS 8 |
230 | SAVE_EXTRA_REGS 8 |
231 | + /* |
232 | + * error_entry() always returns with a kernel gsbase and |
233 | + * CR3. We must also have a kernel CR3/gsbase before |
234 | + * calling TRACE_IRQS_*. Just unconditionally switch to |
235 | + * the kernel CR3 here. |
236 | + */ |
237 | + SWITCH_KERNEL_CR3 |
238 | xorl %ebx, %ebx |
239 | testb $3, CS+8(%rsp) |
240 | jz .Lerror_kernelspace |
241 | @@ -1235,6 +1303,10 @@ ENTRY(nmi) |
242 | */ |
243 | |
244 | SWAPGS_UNSAFE_STACK |
245 | + /* |
246 | + * percpu variables are mapped with user CR3, so no need |
247 | + * to switch CR3 here. |
248 | + */ |
249 | cld |
250 | movq %rsp, %rdx |
251 | movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp |
252 | @@ -1268,12 +1340,34 @@ ENTRY(nmi) |
253 | |
254 | movq %rsp, %rdi |
255 | movq $-1, %rsi |
256 | +#ifdef CONFIG_PAGE_TABLE_ISOLATION |
257 | + /* Unconditionally use kernel CR3 for do_nmi() */ |
258 | + /* %rax is saved above, so OK to clobber here */ |
259 | + ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER |
260 | + /* If PCID enabled, NOFLUSH now and NOFLUSH on return */ |
261 | + ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID |
262 | + pushq %rax |
263 | + /* mask off "user" bit of pgd address and 12 PCID bits: */ |
264 | + andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax |
265 | + movq %rax, %cr3 |
266 | +2: |
267 | +#endif |
268 | call do_nmi |
269 | |
270 | +#ifdef CONFIG_PAGE_TABLE_ISOLATION |
271 | + /* |
272 | + * Unconditionally restore CR3. I know we return to |
273 | + * kernel code that needs user CR3, but do we ever return |
274 | + * to "user mode" where we need the kernel CR3? |
275 | + */ |
276 | + ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER |
277 | +#endif |
278 | + |
279 | /* |
280 | * Return back to user mode. We must *not* do the normal exit |
281 | - * work, because we don't want to enable interrupts. Fortunately, |
282 | - * do_nmi doesn't modify pt_regs. |
283 | + * work, because we don't want to enable interrupts. Do not |
284 | + * switch to user CR3: we might be going back to kernel code |
285 | + * that had a user CR3 set. |
286 | */ |
287 | SWAPGS |
288 | jmp restore_c_regs_and_iret |
289 | @@ -1470,22 +1564,55 @@ end_repeat_nmi: |
290 | ALLOC_PT_GPREGS_ON_STACK |
291 | |
292 | /* |
293 | - * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit |
294 | - * as we should not be calling schedule in NMI context. |
295 | - * Even with normal interrupts enabled. An NMI should not be |
296 | - * setting NEED_RESCHED or anything that normal interrupts and |
297 | - * exceptions might do. |
298 | + * Use the same approach as paranoid_entry to handle SWAPGS, but |
299 | + * without CR3 handling since we do that differently in NMIs. No |
300 | + * need to use paranoid_exit as we should not be calling schedule |
301 | + * in NMI context. Even with normal interrupts enabled. An NMI |
302 | + * should not be setting NEED_RESCHED or anything that normal |
303 | + * interrupts and exceptions might do. |
304 | */ |
305 | - call paranoid_entry |
306 | - |
307 | - /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ |
308 | + cld |
309 | + SAVE_C_REGS |
310 | + SAVE_EXTRA_REGS |
311 | + movl $1, %ebx |
312 | + movl $MSR_GS_BASE, %ecx |
313 | + rdmsr |
314 | + testl %edx, %edx |
315 | + js 1f /* negative -> in kernel */ |
316 | + SWAPGS |
317 | + xorl %ebx, %ebx |
318 | +1: |
319 | movq %rsp, %rdi |
320 | movq $-1, %rsi |
321 | +#ifdef CONFIG_PAGE_TABLE_ISOLATION |
322 | + /* Unconditionally use kernel CR3 for do_nmi() */ |
323 | + /* %rax is saved above, so OK to clobber here */ |
324 | + ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER |
325 | + /* If PCID enabled, NOFLUSH now and NOFLUSH on return */ |
326 | + ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID |
327 | + pushq %rax |
328 | + /* mask off "user" bit of pgd address and 12 PCID bits: */ |
329 | + andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax |
330 | + movq %rax, %cr3 |
331 | +2: |
332 | +#endif |
333 | + |
334 | + /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ |
335 | call do_nmi |
336 | |
337 | +#ifdef CONFIG_PAGE_TABLE_ISOLATION |
338 | + /* |
339 | + * Unconditionally restore CR3. We might be returning to |
340 | + * kernel code that needs user CR3, like just before |
341 | + * a sysret. |
342 | + */ |
343 | + ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER |
344 | +#endif |
345 | + |
346 | testl %ebx, %ebx /* swapgs needed? */ |
347 | jnz nmi_restore |
348 | nmi_swapgs: |
349 | + /* We fixed up CR3 above, so no need to switch it here */ |
350 | SWAPGS_UNSAFE_STACK |
351 | nmi_restore: |
352 | RESTORE_EXTRA_REGS |
353 | diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S |
354 | index e1721dafbcb1..d76a97653980 100644 |
355 | --- a/arch/x86/entry/entry_64_compat.S |
356 | +++ b/arch/x86/entry/entry_64_compat.S |
357 | @@ -13,6 +13,8 @@ |
358 | #include <asm/irqflags.h> |
359 | #include <asm/asm.h> |
360 | #include <asm/smap.h> |
361 | +#include <asm/pgtable_types.h> |
362 | +#include <asm/kaiser.h> |
363 | #include <linux/linkage.h> |
364 | #include <linux/err.h> |
365 | |
366 | @@ -48,6 +50,7 @@ |
367 | ENTRY(entry_SYSENTER_compat) |
368 | /* Interrupts are off on entry. */ |
369 | SWAPGS_UNSAFE_STACK |
370 | + SWITCH_KERNEL_CR3_NO_STACK |
371 | movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp |
372 | |
373 | /* |
374 | @@ -184,6 +187,7 @@ ENDPROC(entry_SYSENTER_compat) |
375 | ENTRY(entry_SYSCALL_compat) |
376 | /* Interrupts are off on entry. */ |
377 | SWAPGS_UNSAFE_STACK |
378 | + SWITCH_KERNEL_CR3_NO_STACK |
379 | |
380 | /* Stash user ESP and switch to the kernel stack. */ |
381 | movl %esp, %r8d |
382 | @@ -259,6 +263,7 @@ sysret32_from_system_call: |
383 | xorq %r8, %r8 |
384 | xorq %r9, %r9 |
385 | xorq %r10, %r10 |
386 | + SWITCH_USER_CR3 |
387 | movq RSP-ORIG_RAX(%rsp), %rsp |
388 | swapgs |
389 | sysretl |
390 | @@ -297,7 +302,7 @@ ENTRY(entry_INT80_compat) |
391 | PARAVIRT_ADJUST_EXCEPTION_FRAME |
392 | ASM_CLAC /* Do this early to minimize exposure */ |
393 | SWAPGS |
394 | - |
395 | + SWITCH_KERNEL_CR3_NO_STACK |
396 | /* |
397 | * User tracing code (ptrace or signal handlers) might assume that |
398 | * the saved RAX contains a 32-bit number when we're invoking a 32-bit |
399 | @@ -338,6 +343,7 @@ ENTRY(entry_INT80_compat) |
400 | |
401 | /* Go back to user mode. */ |
402 | TRACE_IRQS_ON |
403 | + SWITCH_USER_CR3 |
404 | SWAPGS |
405 | jmp restore_regs_and_iret |
406 | END(entry_INT80_compat) |
407 | diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c |
408 | index 9dfeeeca0ea8..8e7a3f1df3a5 100644 |
409 | --- a/arch/x86/events/intel/ds.c |
410 | +++ b/arch/x86/events/intel/ds.c |
411 | @@ -2,11 +2,15 @@ |
412 | #include <linux/types.h> |
413 | #include <linux/slab.h> |
414 | |
415 | +#include <asm/kaiser.h> |
416 | #include <asm/perf_event.h> |
417 | #include <asm/insn.h> |
418 | |
419 | #include "../perf_event.h" |
420 | |
421 | +static |
422 | +DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct debug_store, cpu_debug_store); |
423 | + |
424 | /* The size of a BTS record in bytes: */ |
425 | #define BTS_RECORD_SIZE 24 |
426 | |
427 | @@ -268,6 +272,39 @@ void fini_debug_store_on_cpu(int cpu) |
428 | |
429 | static DEFINE_PER_CPU(void *, insn_buffer); |
430 | |
431 | +static void *dsalloc(size_t size, gfp_t flags, int node) |
432 | +{ |
433 | +#ifdef CONFIG_PAGE_TABLE_ISOLATION |
434 | + unsigned int order = get_order(size); |
435 | + struct page *page; |
436 | + unsigned long addr; |
437 | + |
438 | + page = __alloc_pages_node(node, flags | __GFP_ZERO, order); |
439 | + if (!page) |
440 | + return NULL; |
441 | + addr = (unsigned long)page_address(page); |
442 | + if (kaiser_add_mapping(addr, size, __PAGE_KERNEL) < 0) { |
443 | + __free_pages(page, order); |
444 | + addr = 0; |
445 | + } |
446 | + return (void *)addr; |
447 | +#else |
448 | + return kmalloc_node(size, flags | __GFP_ZERO, node); |
449 | +#endif |
450 | +} |
451 | + |
452 | +static void dsfree(const void *buffer, size_t size) |
453 | +{ |
454 | +#ifdef CONFIG_PAGE_TABLE_ISOLATION |
455 | + if (!buffer) |
456 | + return; |
457 | + kaiser_remove_mapping((unsigned long)buffer, size); |
458 | + free_pages((unsigned long)buffer, get_order(size)); |
459 | +#else |
460 | + kfree(buffer); |
461 | +#endif |
462 | +} |
463 | + |
464 | static int alloc_pebs_buffer(int cpu) |
465 | { |
466 | struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; |
467 | @@ -278,7 +315,7 @@ static int alloc_pebs_buffer(int cpu) |
468 | if (!x86_pmu.pebs) |
469 | return 0; |
470 | |
471 | - buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node); |
472 | + buffer = dsalloc(x86_pmu.pebs_buffer_size, GFP_KERNEL, node); |
473 | if (unlikely(!buffer)) |
474 | return -ENOMEM; |
475 | |
476 | @@ -289,7 +326,7 @@ static int alloc_pebs_buffer(int cpu) |
477 | if (x86_pmu.intel_cap.pebs_format < 2) { |
478 | ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node); |
479 | if (!ibuffer) { |
480 | - kfree(buffer); |
481 | + dsfree(buffer, x86_pmu.pebs_buffer_size); |
482 | return -ENOMEM; |
483 | } |
484 | per_cpu(insn_buffer, cpu) = ibuffer; |
485 | @@ -315,7 +352,8 @@ static void release_pebs_buffer(int cpu) |
486 | kfree(per_cpu(insn_buffer, cpu)); |
487 | per_cpu(insn_buffer, cpu) = NULL; |
488 | |
489 | - kfree((void *)(unsigned long)ds->pebs_buffer_base); |
490 | + dsfree((void *)(unsigned long)ds->pebs_buffer_base, |
491 | + x86_pmu.pebs_buffer_size); |
492 | ds->pebs_buffer_base = 0; |
493 | } |
494 | |
495 | @@ -329,7 +367,7 @@ static int alloc_bts_buffer(int cpu) |
496 | if (!x86_pmu.bts) |
497 | return 0; |
498 | |
499 | - buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node); |
500 | + buffer = dsalloc(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node); |
501 | if (unlikely(!buffer)) { |
502 | WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__); |
503 | return -ENOMEM; |
504 | @@ -355,19 +393,15 @@ static void release_bts_buffer(int cpu) |
505 | if (!ds || !x86_pmu.bts) |
506 | return; |
507 | |
508 | - kfree((void *)(unsigned long)ds->bts_buffer_base); |
509 | + dsfree((void *)(unsigned long)ds->bts_buffer_base, BTS_BUFFER_SIZE); |
510 | ds->bts_buffer_base = 0; |
511 | } |
512 | |
513 | static int alloc_ds_buffer(int cpu) |
514 | { |
515 | - int node = cpu_to_node(cpu); |
516 | - struct debug_store *ds; |
517 | - |
518 | - ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node); |
519 | - if (unlikely(!ds)) |
520 | - return -ENOMEM; |
521 | + struct debug_store *ds = per_cpu_ptr(&cpu_debug_store, cpu); |
522 | |
523 | + memset(ds, 0, sizeof(*ds)); |
524 | per_cpu(cpu_hw_events, cpu).ds = ds; |
525 | |
526 | return 0; |
527 | @@ -381,7 +415,6 @@ static void release_ds_buffer(int cpu) |
528 | return; |
529 | |
530 | per_cpu(cpu_hw_events, cpu).ds = NULL; |
531 | - kfree(ds); |
532 | } |
533 | |
534 | void release_ds_buffers(void) |
535 | diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h |
536 | index e01f7f7ccb0c..84ae170bc3d0 100644 |
537 | --- a/arch/x86/include/asm/cmdline.h |
538 | +++ b/arch/x86/include/asm/cmdline.h |
539 | @@ -2,5 +2,7 @@ |
540 | #define _ASM_X86_CMDLINE_H |
541 | |
542 | int cmdline_find_option_bool(const char *cmdline_ptr, const char *option); |
543 | +int cmdline_find_option(const char *cmdline_ptr, const char *option, |
544 | + char *buffer, int bufsize); |
545 | |
546 | #endif /* _ASM_X86_CMDLINE_H */ |
547 | diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h |
548 | index ed10b5bf9b93..454a37adb823 100644 |
549 | --- a/arch/x86/include/asm/cpufeatures.h |
550 | +++ b/arch/x86/include/asm/cpufeatures.h |
551 | @@ -189,6 +189,7 @@ |
552 | |
553 | #define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ |
554 | #define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ |
555 | +#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 4) /* Effectively INVPCID && CR4.PCIDE=1 */ |
556 | |
557 | #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ |
558 | #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ |
559 | @@ -197,6 +198,9 @@ |
560 | #define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */ |
561 | #define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */ |
562 | |
563 | +/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */ |
564 | +#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */ |
565 | + |
566 | /* Virtualization flags: Linux defined, word 8 */ |
567 | #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ |
568 | #define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ |
569 | diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h |
570 | index 12080d87da3b..2ed5a2b3f8f7 100644 |
571 | --- a/arch/x86/include/asm/desc.h |
572 | +++ b/arch/x86/include/asm/desc.h |
573 | @@ -43,7 +43,7 @@ struct gdt_page { |
574 | struct desc_struct gdt[GDT_ENTRIES]; |
575 | } __attribute__((aligned(PAGE_SIZE))); |
576 | |
577 | -DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page); |
578 | +DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page); |
579 | |
580 | static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) |
581 | { |
582 | diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h |
583 | index b90e1053049b..0817d63bce41 100644 |
584 | --- a/arch/x86/include/asm/hw_irq.h |
585 | +++ b/arch/x86/include/asm/hw_irq.h |
586 | @@ -178,7 +178,7 @@ extern char irq_entries_start[]; |
587 | #define VECTOR_RETRIGGERED ((void *)~0UL) |
588 | |
589 | typedef struct irq_desc* vector_irq_t[NR_VECTORS]; |
590 | -DECLARE_PER_CPU(vector_irq_t, vector_irq); |
591 | +DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq); |
592 | |
593 | #endif /* !ASSEMBLY_ */ |
594 | |
595 | diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h |
596 | new file mode 100644 |
597 | index 000000000000..802bbbdfe143 |
598 | --- /dev/null |
599 | +++ b/arch/x86/include/asm/kaiser.h |
600 | @@ -0,0 +1,141 @@ |
601 | +#ifndef _ASM_X86_KAISER_H |
602 | +#define _ASM_X86_KAISER_H |
603 | + |
604 | +#include <uapi/asm/processor-flags.h> /* For PCID constants */ |
605 | + |
606 | +/* |
607 | + * This file includes the definitions for the KAISER feature. |
608 | + * KAISER is a counter measure against x86_64 side channel attacks on |
609 | + * the kernel virtual memory. It has a shadow pgd for every process: the |
610 | + * shadow pgd has a minimalistic kernel-set mapped, but includes the whole |
611 | + * user memory. Within a kernel context switch, or when an interrupt is handled, |
612 | + * the pgd is switched to the normal one. When the system switches to user mode, |
613 | + * the shadow pgd is enabled. By this, the virtual memory caches are freed, |
614 | + * and the user may not attack the whole kernel memory. |
615 | + * |
616 | + * A minimalistic kernel mapping holds the parts needed to be mapped in user |
617 | + * mode, such as the entry/exit functions of the user space, or the stacks. |
618 | + */ |
619 | + |
620 | +#define KAISER_SHADOW_PGD_OFFSET 0x1000 |
621 | + |
622 | +#ifdef __ASSEMBLY__ |
623 | +#ifdef CONFIG_PAGE_TABLE_ISOLATION |
624 | + |
625 | +.macro _SWITCH_TO_KERNEL_CR3 reg |
626 | +movq %cr3, \reg |
627 | +andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg |
628 | +/* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */ |
629 | +ALTERNATIVE "", "bts $63, \reg", X86_FEATURE_PCID |
630 | +movq \reg, %cr3 |
631 | +.endm |
632 | + |
633 | +.macro _SWITCH_TO_USER_CR3 reg regb |
634 | +/* |
635 | + * regb must be the low byte portion of reg: because we have arranged |
636 | + * for the low byte of the user PCID to serve as the high byte of NOFLUSH |
637 | + * (0x80 for each when PCID is enabled, or 0x00 when PCID and NOFLUSH are |
638 | + * not enabled): so that the one register can update both memory and cr3. |
639 | + */ |
640 | +movq %cr3, \reg |
641 | +orq PER_CPU_VAR(x86_cr3_pcid_user), \reg |
642 | +js 9f |
643 | +/* If PCID enabled, FLUSH this time, reset to NOFLUSH for next time */ |
644 | +movb \regb, PER_CPU_VAR(x86_cr3_pcid_user+7) |
645 | +9: |
646 | +movq \reg, %cr3 |
647 | +.endm |
648 | + |
649 | +.macro SWITCH_KERNEL_CR3 |
650 | +ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER |
651 | +_SWITCH_TO_KERNEL_CR3 %rax |
652 | +popq %rax |
653 | +8: |
654 | +.endm |
655 | + |
656 | +.macro SWITCH_USER_CR3 |
657 | +ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER |
658 | +_SWITCH_TO_USER_CR3 %rax %al |
659 | +popq %rax |
660 | +8: |
661 | +.endm |
662 | + |
663 | +.macro SWITCH_KERNEL_CR3_NO_STACK |
664 | +ALTERNATIVE "jmp 8f", \ |
665 | + __stringify(movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)), \ |
666 | + X86_FEATURE_KAISER |
667 | +_SWITCH_TO_KERNEL_CR3 %rax |
668 | +movq PER_CPU_VAR(unsafe_stack_register_backup), %rax |
669 | +8: |
670 | +.endm |
671 | + |
672 | +#else /* CONFIG_PAGE_TABLE_ISOLATION */ |
673 | + |
674 | +.macro SWITCH_KERNEL_CR3 |
675 | +.endm |
676 | +.macro SWITCH_USER_CR3 |
677 | +.endm |
678 | +.macro SWITCH_KERNEL_CR3_NO_STACK |
679 | +.endm |
680 | + |
681 | +#endif /* CONFIG_PAGE_TABLE_ISOLATION */ |
682 | + |
683 | +#else /* __ASSEMBLY__ */ |
684 | + |
685 | +#ifdef CONFIG_PAGE_TABLE_ISOLATION |
686 | +/* |
687 | + * Upon kernel/user mode switch, it may happen that the address |
688 | + * space has to be switched before the registers have been |
689 | + * stored. To change the address space, another register is |
690 | + * needed. A register therefore has to be stored/restored. |
691 | +*/ |
692 | +DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); |
693 | + |
694 | +DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user); |
695 | + |
696 | +extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; |
697 | + |
698 | +extern int kaiser_enabled; |
699 | +extern void __init kaiser_check_boottime_disable(void); |
700 | +#else |
701 | +#define kaiser_enabled 0 |
702 | +static inline void __init kaiser_check_boottime_disable(void) {} |
703 | +#endif /* CONFIG_PAGE_TABLE_ISOLATION */ |
704 | + |
705 | +/* |
706 | + * Kaiser function prototypes are needed even when CONFIG_PAGE_TABLE_ISOLATION is not set, |
707 | + * so as to build with tests on kaiser_enabled instead of #ifdefs. |
708 | + */ |
709 | + |
710 | +/** |
711 | + * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping |
712 | + * @addr: the start address of the range |
713 | + * @size: the size of the range |
714 | + * @flags: The mapping flags of the pages |
715 | + * |
716 | + * The mapping is done on a global scope, so no bigger |
717 | + * synchronization has to be done. the pages have to be |
718 | + * manually unmapped again when they are not needed any longer. |
719 | + */ |
720 | +extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags); |
721 | + |
722 | +/** |
723 | + * kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping |
724 | + * @addr: the start address of the range |
725 | + * @size: the size of the range |
726 | + */ |
727 | +extern void kaiser_remove_mapping(unsigned long start, unsigned long size); |
728 | + |
729 | +/** |
730 | + * kaiser_init - Initialize the shadow mapping |
731 | + * |
732 | + * Most parts of the shadow mapping can be mapped upon boot |
733 | + * time. Only per-process things like the thread stacks |
734 | + * or a new LDT have to be mapped at runtime. These boot- |
735 | + * time mappings are permanent and never unmapped. |
736 | + */ |
737 | +extern void kaiser_init(void); |
738 | + |
739 | +#endif /* __ASSEMBLY */ |
740 | + |
741 | +#endif /* _ASM_X86_KAISER_H */ |
742 | diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h |
743 | index 437feb436efa..2536f90cd30c 100644 |
744 | --- a/arch/x86/include/asm/pgtable.h |
745 | +++ b/arch/x86/include/asm/pgtable.h |
746 | @@ -18,6 +18,12 @@ |
747 | #ifndef __ASSEMBLY__ |
748 | #include <asm/x86_init.h> |
749 | |
750 | +#ifdef CONFIG_PAGE_TABLE_ISOLATION |
751 | +extern int kaiser_enabled; |
752 | +#else |
753 | +#define kaiser_enabled 0 |
754 | +#endif |
755 | + |
756 | void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd); |
757 | void ptdump_walk_pgd_level_checkwx(void); |
758 | |
759 | @@ -690,7 +696,17 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address) |
760 | |
761 | static inline int pgd_bad(pgd_t pgd) |
762 | { |
763 | - return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE; |
764 | + pgdval_t ignore_flags = _PAGE_USER; |
765 | + /* |
766 | + * We set NX on KAISER pgds that map userspace memory so |
767 | + * that userspace can not meaningfully use the kernel |
768 | + * page table by accident; it will fault on the first |
769 | + * instruction it tries to run. See native_set_pgd(). |
770 | + */ |
771 | + if (kaiser_enabled) |
772 | + ignore_flags |= _PAGE_NX; |
773 | + |
774 | + return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE; |
775 | } |
776 | |
777 | static inline int pgd_none(pgd_t pgd) |
778 | @@ -903,7 +919,15 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, |
779 | */ |
780 | static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) |
781 | { |
782 | - memcpy(dst, src, count * sizeof(pgd_t)); |
783 | + memcpy(dst, src, count * sizeof(pgd_t)); |
784 | +#ifdef CONFIG_PAGE_TABLE_ISOLATION |
785 | + if (kaiser_enabled) { |
786 | + /* Clone the shadow pgd part as well */ |
787 | + memcpy(native_get_shadow_pgd(dst), |
788 | + native_get_shadow_pgd(src), |
789 | + count * sizeof(pgd_t)); |
790 | + } |
791 | +#endif |
792 | } |
793 | |
794 | #define PTE_SHIFT ilog2(PTRS_PER_PTE) |
795 | diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h |
796 | index 1cc82ece9ac1..ce97c8c6a310 100644 |
797 | --- a/arch/x86/include/asm/pgtable_64.h |
798 | +++ b/arch/x86/include/asm/pgtable_64.h |
799 | @@ -106,9 +106,32 @@ static inline void native_pud_clear(pud_t *pud) |
800 | native_set_pud(pud, native_make_pud(0)); |
801 | } |
802 | |
803 | +#ifdef CONFIG_PAGE_TABLE_ISOLATION |
804 | +extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd); |
805 | + |
806 | +static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) |
807 | +{ |
808 | +#ifdef CONFIG_DEBUG_VM |
809 | + /* linux/mmdebug.h may not have been included at this point */ |
810 | + BUG_ON(!kaiser_enabled); |
811 | +#endif |
812 | + return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE); |
813 | +} |
814 | +#else |
815 | +static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) |
816 | +{ |
817 | + return pgd; |
818 | +} |
819 | +static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) |
820 | +{ |
821 | + BUILD_BUG_ON(1); |
822 | + return NULL; |
823 | +} |
824 | +#endif /* CONFIG_PAGE_TABLE_ISOLATION */ |
825 | + |
826 | static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) |
827 | { |
828 | - *pgdp = pgd; |
829 | + *pgdp = kaiser_set_shadow_pgd(pgdp, pgd); |
830 | } |
831 | |
832 | static inline void native_pgd_clear(pgd_t *pgd) |
833 | diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h |
834 | index 8b4de22d6429..f1c8ac468292 100644 |
835 | --- a/arch/x86/include/asm/pgtable_types.h |
836 | +++ b/arch/x86/include/asm/pgtable_types.h |
837 | @@ -119,7 +119,7 @@ |
838 | #define _PAGE_DEVMAP (_AT(pteval_t, 0)) |
839 | #endif |
840 | |
841 | -#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) |
842 | +#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) |
843 | |
844 | #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ |
845 | _PAGE_ACCESSED | _PAGE_DIRTY) |
846 | @@ -137,6 +137,33 @@ |
847 | _PAGE_SOFT_DIRTY) |
848 | #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) |
849 | |
850 | +/* The ASID is the lower 12 bits of CR3 */ |
851 | +#define X86_CR3_PCID_ASID_MASK (_AC((1<<12)-1,UL)) |
852 | + |
853 | +/* Mask for all the PCID-related bits in CR3: */ |
854 | +#define X86_CR3_PCID_MASK (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK) |
855 | +#define X86_CR3_PCID_ASID_KERN (_AC(0x0,UL)) |
856 | + |
857 | +#if defined(CONFIG_PAGE_TABLE_ISOLATION) && defined(CONFIG_X86_64) |
858 | +/* Let X86_CR3_PCID_ASID_USER be usable for the X86_CR3_PCID_NOFLUSH bit */ |
859 | +#define X86_CR3_PCID_ASID_USER (_AC(0x80,UL)) |
860 | + |
861 | +#define X86_CR3_PCID_KERN_FLUSH (X86_CR3_PCID_ASID_KERN) |
862 | +#define X86_CR3_PCID_USER_FLUSH (X86_CR3_PCID_ASID_USER) |
863 | +#define X86_CR3_PCID_KERN_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_KERN) |
864 | +#define X86_CR3_PCID_USER_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_USER) |
865 | +#else |
866 | +#define X86_CR3_PCID_ASID_USER (_AC(0x0,UL)) |
867 | +/* |
868 | + * PCIDs are unsupported on 32-bit and none of these bits can be |
869 | + * set in CR3: |
870 | + */ |
871 | +#define X86_CR3_PCID_KERN_FLUSH (0) |
872 | +#define X86_CR3_PCID_USER_FLUSH (0) |
873 | +#define X86_CR3_PCID_KERN_NOFLUSH (0) |
874 | +#define X86_CR3_PCID_USER_NOFLUSH (0) |
875 | +#endif |
876 | + |
877 | /* |
878 | * The cache modes defined here are used to translate between pure SW usage |
879 | * and the HW defined cache mode bits and/or PAT entries. |
880 | diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h |
881 | index 83db0eae9979..8cb52ee3ade6 100644 |
882 | --- a/arch/x86/include/asm/processor.h |
883 | +++ b/arch/x86/include/asm/processor.h |
884 | @@ -308,7 +308,7 @@ struct tss_struct { |
885 | |
886 | } ____cacheline_aligned; |
887 | |
888 | -DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss); |
889 | +DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss); |
890 | |
891 | #ifdef CONFIG_X86_32 |
892 | DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); |
893 | diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h |
894 | index 7d2ea6b1f7d9..94146f665a3c 100644 |
895 | --- a/arch/x86/include/asm/tlbflush.h |
896 | +++ b/arch/x86/include/asm/tlbflush.h |
897 | @@ -132,6 +132,24 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask) |
898 | cr4_set_bits(mask); |
899 | } |
900 | |
901 | +/* |
902 | + * Declare a couple of kaiser interfaces here for convenience, |
903 | + * to avoid the need for asm/kaiser.h in unexpected places. |
904 | + */ |
905 | +#ifdef CONFIG_PAGE_TABLE_ISOLATION |
906 | +extern int kaiser_enabled; |
907 | +extern void kaiser_setup_pcid(void); |
908 | +extern void kaiser_flush_tlb_on_return_to_user(void); |
909 | +#else |
910 | +#define kaiser_enabled 0 |
911 | +static inline void kaiser_setup_pcid(void) |
912 | +{ |
913 | +} |
914 | +static inline void kaiser_flush_tlb_on_return_to_user(void) |
915 | +{ |
916 | +} |
917 | +#endif |
918 | + |
919 | static inline void __native_flush_tlb(void) |
920 | { |
921 | /* |
922 | @@ -140,6 +158,8 @@ static inline void __native_flush_tlb(void) |
923 | * back: |
924 | */ |
925 | preempt_disable(); |
926 | + if (kaiser_enabled) |
927 | + kaiser_flush_tlb_on_return_to_user(); |
928 | native_write_cr3(native_read_cr3()); |
929 | preempt_enable(); |
930 | } |
931 | @@ -149,20 +169,27 @@ static inline void __native_flush_tlb_global_irq_disabled(void) |
932 | unsigned long cr4; |
933 | |
934 | cr4 = this_cpu_read(cpu_tlbstate.cr4); |
935 | - /* clear PGE */ |
936 | - native_write_cr4(cr4 & ~X86_CR4_PGE); |
937 | - /* write old PGE again and flush TLBs */ |
938 | - native_write_cr4(cr4); |
939 | + if (cr4 & X86_CR4_PGE) { |
940 | + /* clear PGE and flush TLB of all entries */ |
941 | + native_write_cr4(cr4 & ~X86_CR4_PGE); |
942 | + /* restore PGE as it was before */ |
943 | + native_write_cr4(cr4); |
944 | + } else { |
945 | + /* do it with cr3, letting kaiser flush user PCID */ |
946 | + __native_flush_tlb(); |
947 | + } |
948 | } |
949 | |
950 | static inline void __native_flush_tlb_global(void) |
951 | { |
952 | unsigned long flags; |
953 | |
954 | - if (static_cpu_has(X86_FEATURE_INVPCID)) { |
955 | + if (this_cpu_has(X86_FEATURE_INVPCID)) { |
956 | /* |
957 | * Using INVPCID is considerably faster than a pair of writes |
958 | * to CR4 sandwiched inside an IRQ flag save/restore. |
959 | + * |
960 | + * Note, this works with CR4.PCIDE=0 or 1. |
961 | */ |
962 | invpcid_flush_all(); |
963 | return; |
964 | @@ -174,24 +201,45 @@ static inline void __native_flush_tlb_global(void) |
965 | * be called from deep inside debugging code.) |
966 | */ |
967 | raw_local_irq_save(flags); |
968 | - |
969 | __native_flush_tlb_global_irq_disabled(); |
970 | - |
971 | raw_local_irq_restore(flags); |
972 | } |
973 | |
974 | static inline void __native_flush_tlb_single(unsigned long addr) |
975 | { |
976 | - asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); |
977 | + /* |
978 | + * SIMICS #GP's if you run INVPCID with type 2/3 |
979 | + * and X86_CR4_PCIDE clear. Shame! |
980 | + * |
981 | + * The ASIDs used below are hard-coded. But, we must not |
982 | + * call invpcid(type=1/2) before CR4.PCIDE=1. Just call |
983 | + * invlpg in the case we are called early. |
984 | + */ |
985 | + |
986 | + if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) { |
987 | + if (kaiser_enabled) |
988 | + kaiser_flush_tlb_on_return_to_user(); |
989 | + asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); |
990 | + return; |
991 | + } |
992 | + /* Flush the address out of both PCIDs. */ |
993 | + /* |
994 | + * An optimization here might be to determine addresses |
995 | + * that are only kernel-mapped and only flush the kernel |
996 | + * ASID. But, userspace flushes are probably much more |
997 | + * important performance-wise. |
998 | + * |
999 | + * Make sure to do only a single invpcid when KAISER is |
1000 | + * disabled and we have only a single ASID. |
1001 | + */ |
1002 | + if (kaiser_enabled) |
1003 | + invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr); |
1004 | + invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr); |
1005 | } |
1006 | |
1007 | static inline void __flush_tlb_all(void) |
1008 | { |
1009 | - if (boot_cpu_has(X86_FEATURE_PGE)) |
1010 | - __flush_tlb_global(); |
1011 | - else |
1012 | - __flush_tlb(); |
1013 | - |
1014 | + __flush_tlb_global(); |
1015 | /* |
1016 | * Note: if we somehow had PCID but not PGE, then this wouldn't work -- |
1017 | * we'd end up flushing kernel translations for the current ASID but |
1018 | diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h |
1019 | index 567de50a4c2a..6768d1321016 100644 |
1020 | --- a/arch/x86/include/uapi/asm/processor-flags.h |
1021 | +++ b/arch/x86/include/uapi/asm/processor-flags.h |
1022 | @@ -77,7 +77,8 @@ |
1023 | #define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT) |
1024 | #define X86_CR3_PCD_BIT 4 /* Page Cache Disable */ |
1025 | #define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT) |
1026 | -#define X86_CR3_PCID_MASK _AC(0x00000fff,UL) /* PCID Mask */ |
1027 | +#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */ |
1028 | +#define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT) |
1029 | |
1030 | /* |
1031 | * Intel CPU features in CR4 |
1032 | diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c |
1033 | index 91588be529b9..918e44772b04 100644 |
1034 | --- a/arch/x86/kernel/cpu/common.c |
1035 | +++ b/arch/x86/kernel/cpu/common.c |
1036 | @@ -93,7 +93,7 @@ static const struct cpu_dev default_cpu = { |
1037 | |
1038 | static const struct cpu_dev *this_cpu = &default_cpu; |
1039 | |
1040 | -DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { |
1041 | +DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = { |
1042 | #ifdef CONFIG_X86_64 |
1043 | /* |
1044 | * We need valid kernel segments for data and code in long mode too |
1045 | @@ -327,8 +327,21 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) |
1046 | static void setup_pcid(struct cpuinfo_x86 *c) |
1047 | { |
1048 | if (cpu_has(c, X86_FEATURE_PCID)) { |
1049 | - if (cpu_has(c, X86_FEATURE_PGE)) { |
1050 | + if (cpu_has(c, X86_FEATURE_PGE) || kaiser_enabled) { |
1051 | cr4_set_bits(X86_CR4_PCIDE); |
1052 | + /* |
1053 | + * INVPCID has two "groups" of types: |
1054 | + * 1/2: Invalidate an individual address |
1055 | + * 3/4: Invalidate all contexts |
1056 | + * |
1057 | + * 1/2 take a PCID, but 3/4 do not. So, 3/4 |
1058 | + * ignore the PCID argument in the descriptor. |
1059 | + * But, we have to be careful not to call 1/2 |
1060 | + * with an actual non-zero PCID in them before |
1061 | + * we do the above cr4_set_bits(). |
1062 | + */ |
1063 | + if (cpu_has(c, X86_FEATURE_INVPCID)) |
1064 | + set_cpu_cap(c, X86_FEATURE_INVPCID_SINGLE); |
1065 | } else { |
1066 | /* |
1067 | * flush_tlb_all(), as currently implemented, won't |
1068 | @@ -341,6 +354,7 @@ static void setup_pcid(struct cpuinfo_x86 *c) |
1069 | clear_cpu_cap(c, X86_FEATURE_PCID); |
1070 | } |
1071 | } |
1072 | + kaiser_setup_pcid(); |
1073 | } |
1074 | |
1075 | /* |
1076 | @@ -1365,7 +1379,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { |
1077 | [DEBUG_STACK - 1] = DEBUG_STKSZ |
1078 | }; |
1079 | |
1080 | -static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks |
1081 | +DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks |
1082 | [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); |
1083 | |
1084 | /* May not be marked __init: used by software suspend */ |
1085 | @@ -1523,6 +1537,14 @@ void cpu_init(void) |
1086 | * try to read it. |
1087 | */ |
1088 | cr4_init_shadow(); |
1089 | + if (!kaiser_enabled) { |
1090 | + /* |
1091 | + * secondary_startup_64() deferred setting PGE in cr4: |
1092 | + * probe_page_size_mask() sets it on the boot cpu, |
1093 | + * but it needs to be set on each secondary cpu. |
1094 | + */ |
1095 | + cr4_set_bits(X86_CR4_PGE); |
1096 | + } |
1097 | |
1098 | /* |
1099 | * Load microcode on this cpu if a valid microcode is available. |
1100 | diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c |
1101 | index 04f89caef9c4..e33b38541be3 100644 |
1102 | --- a/arch/x86/kernel/espfix_64.c |
1103 | +++ b/arch/x86/kernel/espfix_64.c |
1104 | @@ -41,6 +41,7 @@ |
1105 | #include <asm/pgalloc.h> |
1106 | #include <asm/setup.h> |
1107 | #include <asm/espfix.h> |
1108 | +#include <asm/kaiser.h> |
1109 | |
1110 | /* |
1111 | * Note: we only need 6*8 = 48 bytes for the espfix stack, but round |
1112 | @@ -126,6 +127,15 @@ void __init init_espfix_bsp(void) |
1113 | /* Install the espfix pud into the kernel page directory */ |
1114 | pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; |
1115 | pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page); |
1116 | + /* |
1117 | + * Just copy the top-level PGD that is mapping the espfix |
1118 | + * area to ensure it is mapped into the shadow user page |
1119 | + * tables. |
1120 | + */ |
1121 | + if (kaiser_enabled) { |
1122 | + set_pgd(native_get_shadow_pgd(pgd_p), |
1123 | + __pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page))); |
1124 | + } |
1125 | |
1126 | /* Randomize the locations */ |
1127 | init_espfix_random(); |
1128 | diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S |
1129 | index b4421cc191b0..67cd7c1b99da 100644 |
1130 | --- a/arch/x86/kernel/head_64.S |
1131 | +++ b/arch/x86/kernel/head_64.S |
1132 | @@ -190,8 +190,8 @@ ENTRY(secondary_startup_64) |
1133 | movq $(init_level4_pgt - __START_KERNEL_map), %rax |
1134 | 1: |
1135 | |
1136 | - /* Enable PAE mode and PGE */ |
1137 | - movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx |
1138 | + /* Enable PAE and PSE, but defer PGE until kaiser_enabled is decided */ |
1139 | + movl $(X86_CR4_PAE | X86_CR4_PSE), %ecx |
1140 | movq %rcx, %cr4 |
1141 | |
1142 | /* Setup early boot stage 4 level pagetables. */ |
1143 | @@ -405,6 +405,27 @@ GLOBAL(early_recursion_flag) |
1144 | .balign PAGE_SIZE; \ |
1145 | GLOBAL(name) |
1146 | |
1147 | +#ifdef CONFIG_PAGE_TABLE_ISOLATION |
1148 | +/* |
1149 | + * Each PGD needs to be 8k long and 8k aligned. We do not |
1150 | + * ever go out to userspace with these, so we do not |
1151 | + * strictly *need* the second page, but this allows us to |
1152 | + * have a single set_pgd() implementation that does not |
1153 | + * need to worry about whether it has 4k or 8k to work |
1154 | + * with. |
1155 | + * |
1156 | + * This ensures PGDs are 8k long: |
1157 | + */ |
1158 | +#define KAISER_USER_PGD_FILL 512 |
1159 | +/* This ensures they are 8k-aligned: */ |
1160 | +#define NEXT_PGD_PAGE(name) \ |
1161 | + .balign 2 * PAGE_SIZE; \ |
1162 | +GLOBAL(name) |
1163 | +#else |
1164 | +#define NEXT_PGD_PAGE(name) NEXT_PAGE(name) |
1165 | +#define KAISER_USER_PGD_FILL 0 |
1166 | +#endif |
1167 | + |
1168 | /* Automate the creation of 1 to 1 mapping pmd entries */ |
1169 | #define PMDS(START, PERM, COUNT) \ |
1170 | i = 0 ; \ |
1171 | @@ -414,9 +435,10 @@ GLOBAL(name) |
1172 | .endr |
1173 | |
1174 | __INITDATA |
1175 | -NEXT_PAGE(early_level4_pgt) |
1176 | +NEXT_PGD_PAGE(early_level4_pgt) |
1177 | .fill 511,8,0 |
1178 | .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE |
1179 | + .fill KAISER_USER_PGD_FILL,8,0 |
1180 | |
1181 | NEXT_PAGE(early_dynamic_pgts) |
1182 | .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 |
1183 | @@ -424,16 +446,18 @@ NEXT_PAGE(early_dynamic_pgts) |
1184 | .data |
1185 | |
1186 | #ifndef CONFIG_XEN |
1187 | -NEXT_PAGE(init_level4_pgt) |
1188 | +NEXT_PGD_PAGE(init_level4_pgt) |
1189 | .fill 512,8,0 |
1190 | + .fill KAISER_USER_PGD_FILL,8,0 |
1191 | #else |
1192 | -NEXT_PAGE(init_level4_pgt) |
1193 | +NEXT_PGD_PAGE(init_level4_pgt) |
1194 | .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE |
1195 | .org init_level4_pgt + L4_PAGE_OFFSET*8, 0 |
1196 | .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE |
1197 | .org init_level4_pgt + L4_START_KERNEL*8, 0 |
1198 | /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ |
1199 | .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE |
1200 | + .fill KAISER_USER_PGD_FILL,8,0 |
1201 | |
1202 | NEXT_PAGE(level3_ident_pgt) |
1203 | .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE |
1204 | @@ -444,6 +468,7 @@ NEXT_PAGE(level2_ident_pgt) |
1205 | */ |
1206 | PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) |
1207 | #endif |
1208 | + .fill KAISER_USER_PGD_FILL,8,0 |
1209 | |
1210 | NEXT_PAGE(level3_kernel_pgt) |
1211 | .fill L3_START_KERNEL,8,0 |
1212 | diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c |
1213 | index 1423ab1b0312..f480b38a03c3 100644 |
1214 | --- a/arch/x86/kernel/irqinit.c |
1215 | +++ b/arch/x86/kernel/irqinit.c |
1216 | @@ -51,7 +51,7 @@ static struct irqaction irq2 = { |
1217 | .flags = IRQF_NO_THREAD, |
1218 | }; |
1219 | |
1220 | -DEFINE_PER_CPU(vector_irq_t, vector_irq) = { |
1221 | +DEFINE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq) = { |
1222 | [0 ... NR_VECTORS - 1] = VECTOR_UNUSED, |
1223 | }; |
1224 | |
1225 | diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c |
1226 | index 5f70014ca602..8bc68cfc0d33 100644 |
1227 | --- a/arch/x86/kernel/ldt.c |
1228 | +++ b/arch/x86/kernel/ldt.c |
1229 | @@ -16,6 +16,7 @@ |
1230 | #include <linux/slab.h> |
1231 | #include <linux/vmalloc.h> |
1232 | #include <linux/uaccess.h> |
1233 | +#include <linux/kaiser.h> |
1234 | |
1235 | #include <asm/ldt.h> |
1236 | #include <asm/desc.h> |
1237 | @@ -34,11 +35,21 @@ static void flush_ldt(void *current_mm) |
1238 | set_ldt(pc->ldt->entries, pc->ldt->size); |
1239 | } |
1240 | |
1241 | +static void __free_ldt_struct(struct ldt_struct *ldt) |
1242 | +{ |
1243 | + if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) |
1244 | + vfree(ldt->entries); |
1245 | + else |
1246 | + free_page((unsigned long)ldt->entries); |
1247 | + kfree(ldt); |
1248 | +} |
1249 | + |
1250 | /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */ |
1251 | static struct ldt_struct *alloc_ldt_struct(int size) |
1252 | { |
1253 | struct ldt_struct *new_ldt; |
1254 | int alloc_size; |
1255 | + int ret; |
1256 | |
1257 | if (size > LDT_ENTRIES) |
1258 | return NULL; |
1259 | @@ -66,7 +77,13 @@ static struct ldt_struct *alloc_ldt_struct(int size) |
1260 | return NULL; |
1261 | } |
1262 | |
1263 | + ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size, |
1264 | + __PAGE_KERNEL); |
1265 | new_ldt->size = size; |
1266 | + if (ret) { |
1267 | + __free_ldt_struct(new_ldt); |
1268 | + return NULL; |
1269 | + } |
1270 | return new_ldt; |
1271 | } |
1272 | |
1273 | @@ -92,12 +109,10 @@ static void free_ldt_struct(struct ldt_struct *ldt) |
1274 | if (likely(!ldt)) |
1275 | return; |
1276 | |
1277 | + kaiser_remove_mapping((unsigned long)ldt->entries, |
1278 | + ldt->size * LDT_ENTRY_SIZE); |
1279 | paravirt_free_ldt(ldt->entries, ldt->size); |
1280 | - if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) |
1281 | - vfree(ldt->entries); |
1282 | - else |
1283 | - free_page((unsigned long)ldt->entries); |
1284 | - kfree(ldt); |
1285 | + __free_ldt_struct(ldt); |
1286 | } |
1287 | |
1288 | /* |
1289 | diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c |
1290 | index bb3840cedb4f..ee43b36075c7 100644 |
1291 | --- a/arch/x86/kernel/paravirt_patch_64.c |
1292 | +++ b/arch/x86/kernel/paravirt_patch_64.c |
1293 | @@ -9,7 +9,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax"); |
1294 | DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax"); |
1295 | DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax"); |
1296 | DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3"); |
1297 | -DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)"); |
1298 | DEF_NATIVE(pv_cpu_ops, clts, "clts"); |
1299 | DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd"); |
1300 | |
1301 | @@ -59,7 +58,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, |
1302 | PATCH_SITE(pv_mmu_ops, read_cr3); |
1303 | PATCH_SITE(pv_mmu_ops, write_cr3); |
1304 | PATCH_SITE(pv_cpu_ops, clts); |
1305 | - PATCH_SITE(pv_mmu_ops, flush_tlb_single); |
1306 | PATCH_SITE(pv_cpu_ops, wbinvd); |
1307 | #if defined(CONFIG_PARAVIRT_SPINLOCKS) |
1308 | case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): |
1309 | diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c |
1310 | index 8e10e72bf6ee..a55b32007785 100644 |
1311 | --- a/arch/x86/kernel/process.c |
1312 | +++ b/arch/x86/kernel/process.c |
1313 | @@ -41,7 +41,7 @@ |
1314 | * section. Since TSS's are completely CPU-local, we want them |
1315 | * on exact cacheline boundaries, to eliminate cacheline ping-pong. |
1316 | */ |
1317 | -__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { |
1318 | +__visible DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss) = { |
1319 | .x86_tss = { |
1320 | .sp0 = TOP_OF_INIT_STACK, |
1321 | #ifdef CONFIG_X86_32 |
1322 | diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c |
1323 | index feaab07fa124..6b55012d02a3 100644 |
1324 | --- a/arch/x86/kernel/setup.c |
1325 | +++ b/arch/x86/kernel/setup.c |
1326 | @@ -114,6 +114,7 @@ |
1327 | #include <asm/microcode.h> |
1328 | #include <asm/mmu_context.h> |
1329 | #include <asm/kaslr.h> |
1330 | +#include <asm/kaiser.h> |
1331 | |
1332 | /* |
1333 | * max_low_pfn_mapped: highest direct mapped pfn under 4GB |
1334 | @@ -1019,6 +1020,12 @@ void __init setup_arch(char **cmdline_p) |
1335 | */ |
1336 | init_hypervisor_platform(); |
1337 | |
1338 | + /* |
1339 | + * This needs to happen right after XENPV is set on xen and |
1340 | + * kaiser_enabled is checked below in cleanup_highmap(). |
1341 | + */ |
1342 | + kaiser_check_boottime_disable(); |
1343 | + |
1344 | x86_init.resources.probe_roms(); |
1345 | |
1346 | /* after parse_early_param, so could debug it */ |
1347 | diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c |
1348 | index 1c113db9ed57..2bb5ee464df3 100644 |
1349 | --- a/arch/x86/kernel/tracepoint.c |
1350 | +++ b/arch/x86/kernel/tracepoint.c |
1351 | @@ -9,10 +9,12 @@ |
1352 | #include <linux/atomic.h> |
1353 | |
1354 | atomic_t trace_idt_ctr = ATOMIC_INIT(0); |
1355 | +__aligned(PAGE_SIZE) |
1356 | struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1, |
1357 | (unsigned long) trace_idt_table }; |
1358 | |
1359 | /* No need to be aligned, but done to keep all IDTs defined the same way. */ |
1360 | +__aligned(PAGE_SIZE) |
1361 | gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss; |
1362 | |
1363 | static int trace_irq_vector_refcount; |
1364 | diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c |
1365 | index 7e28e6c877d9..73304b1a03cc 100644 |
1366 | --- a/arch/x86/kvm/x86.c |
1367 | +++ b/arch/x86/kvm/x86.c |
1368 | @@ -773,7 +773,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) |
1369 | return 1; |
1370 | |
1371 | /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */ |
1372 | - if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu)) |
1373 | + if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_ASID_MASK) || |
1374 | + !is_long_mode(vcpu)) |
1375 | return 1; |
1376 | } |
1377 | |
1378 | diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c |
1379 | index 5cc78bf57232..3261abb21ef4 100644 |
1380 | --- a/arch/x86/lib/cmdline.c |
1381 | +++ b/arch/x86/lib/cmdline.c |
1382 | @@ -104,7 +104,112 @@ __cmdline_find_option_bool(const char *cmdline, int max_cmdline_size, |
1383 | return 0; /* Buffer overrun */ |
1384 | } |
1385 | |
1386 | +/* |
1387 | + * Find a non-boolean option (i.e. option=argument). In accordance with |
1388 | + * standard Linux practice, if this option is repeated, this returns the |
1389 | + * last instance on the command line. |
1390 | + * |
1391 | + * @cmdline: the cmdline string |
1392 | + * @max_cmdline_size: the maximum size of cmdline |
1393 | + * @option: option string to look for |
1394 | + * @buffer: memory buffer to return the option argument |
1395 | + * @bufsize: size of the supplied memory buffer |
1396 | + * |
1397 | + * Returns the length of the argument (regardless of if it was |
1398 | + * truncated to fit in the buffer), or -1 on not found. |
1399 | + */ |
1400 | +static int |
1401 | +__cmdline_find_option(const char *cmdline, int max_cmdline_size, |
1402 | + const char *option, char *buffer, int bufsize) |
1403 | +{ |
1404 | + char c; |
1405 | + int pos = 0, len = -1; |
1406 | + const char *opptr = NULL; |
1407 | + char *bufptr = buffer; |
1408 | + enum { |
1409 | + st_wordstart = 0, /* Start of word/after whitespace */ |
1410 | + st_wordcmp, /* Comparing this word */ |
1411 | + st_wordskip, /* Miscompare, skip */ |
1412 | + st_bufcpy, /* Copying this to buffer */ |
1413 | + } state = st_wordstart; |
1414 | + |
1415 | + if (!cmdline) |
1416 | + return -1; /* No command line */ |
1417 | + |
1418 | + /* |
1419 | + * This 'pos' check ensures we do not overrun |
1420 | + * a non-NULL-terminated 'cmdline' |
1421 | + */ |
1422 | + while (pos++ < max_cmdline_size) { |
1423 | + c = *(char *)cmdline++; |
1424 | + if (!c) |
1425 | + break; |
1426 | + |
1427 | + switch (state) { |
1428 | + case st_wordstart: |
1429 | + if (myisspace(c)) |
1430 | + break; |
1431 | + |
1432 | + state = st_wordcmp; |
1433 | + opptr = option; |
1434 | + /* fall through */ |
1435 | + |
1436 | + case st_wordcmp: |
1437 | + if ((c == '=') && !*opptr) { |
1438 | + /* |
1439 | + * We matched all the way to the end of the |
1440 | + * option we were looking for, prepare to |
1441 | + * copy the argument. |
1442 | + */ |
1443 | + len = 0; |
1444 | + bufptr = buffer; |
1445 | + state = st_bufcpy; |
1446 | + break; |
1447 | + } else if (c == *opptr++) { |
1448 | + /* |
1449 | + * We are currently matching, so continue |
1450 | + * to the next character on the cmdline. |
1451 | + */ |
1452 | + break; |
1453 | + } |
1454 | + state = st_wordskip; |
1455 | + /* fall through */ |
1456 | + |
1457 | + case st_wordskip: |
1458 | + if (myisspace(c)) |
1459 | + state = st_wordstart; |
1460 | + break; |
1461 | + |
1462 | + case st_bufcpy: |
1463 | + if (myisspace(c)) { |
1464 | + state = st_wordstart; |
1465 | + } else { |
1466 | + /* |
1467 | + * Increment len, but don't overrun the |
1468 | + * supplied buffer and leave room for the |
1469 | + * NULL terminator. |
1470 | + */ |
1471 | + if (++len < bufsize) |
1472 | + *bufptr++ = c; |
1473 | + } |
1474 | + break; |
1475 | + } |
1476 | + } |
1477 | + |
1478 | + if (bufsize) |
1479 | + *bufptr = '\0'; |
1480 | + |
1481 | + return len; |
1482 | +} |
1483 | + |
1484 | int cmdline_find_option_bool(const char *cmdline, const char *option) |
1485 | { |
1486 | return __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option); |
1487 | } |
1488 | + |
1489 | +int cmdline_find_option(const char *cmdline, const char *option, char *buffer, |
1490 | + int bufsize) |
1491 | +{ |
1492 | + return __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option, |
1493 | + buffer, bufsize); |
1494 | +} |
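Since __cmdline_find_option() is new in this release, a user-space sketch of its contract may help review: the last "option=value" instance wins, the returned length is the full argument length even when the copy is truncated, and -1 means the option was absent. This harness re-implements the same state machine with isspace() standing in for myisspace() and no max_cmdline_size bound; it is illustrative, not the kernel code:

    #include <assert.h>
    #include <ctype.h>
    #include <stdio.h>
    #include <string.h>

    static int find_option(const char *cmdline, const char *option,
    		       char *buf, int bufsize)
    {
    	const char *opt = option;
    	char *out = buf;
    	int len = -1;
    	char c;
    	enum { START, CMP, SKIP, COPY } st = START;

    	while ((c = *cmdline++) != '\0') {
    		switch (st) {
    		case START:
    			if (isspace((unsigned char)c))
    				break;
    			st = CMP;
    			opt = option;
    			/* fall through */
    		case CMP:
    			if (c == '=' && !*opt) {
    				len = 0;	/* later instances override */
    				out = buf;
    				st = COPY;
    				break;
    			} else if (c == *opt++) {
    				break;
    			}
    			st = SKIP;
    			/* fall through */
    		case SKIP:
    			if (isspace((unsigned char)c))
    				st = START;
    			break;
    		case COPY:
    			if (isspace((unsigned char)c))
    				st = START;
    			else if (++len < bufsize)
    				*out++ = c;	/* keep room for the NUL */
    			break;
    		}
    	}
    	if (bufsize)
    		*out = '\0';
    	return len;
    }

    int main(void)
    {
    	char arg[5];

    	/* The last instance wins, as on a real command line. */
    	assert(find_option("pti=on quiet pti=off", "pti",
    			   arg, sizeof(arg)) == 3);
    	assert(strcmp(arg, "off") == 0);

    	/* Full length is returned even when the copy truncates. */
    	assert(find_option("pti=disabled", "pti", arg, sizeof(arg)) == 8);
    	assert(strcmp(arg, "disa") == 0);

    	printf("ok\n");
    	return 0;
    }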
1495 | diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile |
1496 | index 96d2b847e09e..c548b46100cb 100644 |
1497 | --- a/arch/x86/mm/Makefile |
1498 | +++ b/arch/x86/mm/Makefile |
1499 | @@ -37,5 +37,5 @@ obj-$(CONFIG_NUMA_EMU) += numa_emulation.o |
1500 | |
1501 | obj-$(CONFIG_X86_INTEL_MPX) += mpx.o |
1502 | obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o |
1503 | -obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o |
1504 | - |
1505 | +obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o |
1506 | +obj-$(CONFIG_PAGE_TABLE_ISOLATION) += kaiser.o |
1507 | diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c |
1508 | index 0381638168d1..1e779bca4f3e 100644 |
1509 | --- a/arch/x86/mm/init.c |
1510 | +++ b/arch/x86/mm/init.c |
1511 | @@ -177,7 +177,7 @@ static void __init probe_page_size_mask(void) |
1512 | cr4_set_bits_and_update_boot(X86_CR4_PSE); |
1513 | |
1514 | /* Enable PGE if available */ |
1515 | - if (boot_cpu_has(X86_FEATURE_PGE)) { |
1516 | + if (boot_cpu_has(X86_FEATURE_PGE) && !kaiser_enabled) { |
1517 | cr4_set_bits_and_update_boot(X86_CR4_PGE); |
1518 | __supported_pte_mask |= _PAGE_GLOBAL; |
1519 | } else |
1520 | diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c |
1521 | index 3e27ded6ac65..7df8e3a79dc0 100644 |
1522 | --- a/arch/x86/mm/init_64.c |
1523 | +++ b/arch/x86/mm/init_64.c |
1524 | @@ -324,6 +324,16 @@ void __init cleanup_highmap(void) |
1525 | continue; |
1526 | if (vaddr < (unsigned long) _text || vaddr > end) |
1527 | set_pmd(pmd, __pmd(0)); |
1528 | + else if (kaiser_enabled) { |
1529 | + /* |
1530 | + * level2_kernel_pgt is initialized with _PAGE_GLOBAL: |
1531 | + * clear that now. This is not important, so long as |
1532 | + * CR4.PGE remains clear, but it removes an anomaly. |
1533 | + * Physical mapping setup below avoids _PAGE_GLOBAL |
1534 | + * by use of massage_pgprot() inside pfn_pte() etc. |
1535 | + */ |
1536 | + set_pmd(pmd, pmd_clear_flags(*pmd, _PAGE_GLOBAL)); |
1537 | + } |
1538 | } |
1539 | } |
1540 | |
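For reference, pmd_clear_flags() used above simply rebuilds the pmd without the given bits; roughly (a sketch, not the pgtable_types.h definition):

    static pmd_t pmd_clear_flags_sketch(pmd_t pmd, pmdval_t clear)
    {
    	return __pmd(pmd_val(pmd) & ~clear);
    }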
1541 | diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c |
1542 | new file mode 100644 |
1543 | index 000000000000..d8376b4ad9f0 |
1544 | --- /dev/null |
1545 | +++ b/arch/x86/mm/kaiser.c |
1546 | @@ -0,0 +1,455 @@ |
1547 | +#include <linux/bug.h> |
1548 | +#include <linux/kernel.h> |
1549 | +#include <linux/errno.h> |
1550 | +#include <linux/string.h> |
1551 | +#include <linux/types.h> |
1553 | +#include <linux/init.h> |
1554 | +#include <linux/interrupt.h> |
1555 | +#include <linux/spinlock.h> |
1556 | +#include <linux/mm.h> |
1557 | +#include <linux/uaccess.h> |
1558 | + |
1559 | +#undef pr_fmt |
1560 | +#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt |
1561 | + |
1562 | +#include <asm/kaiser.h> |
1563 | +#include <asm/tlbflush.h> /* to verify its kaiser declarations */ |
1564 | +#include <asm/pgtable.h> |
1565 | +#include <asm/pgalloc.h> |
1566 | +#include <asm/desc.h> |
1567 | +#include <asm/cmdline.h> |
1568 | + |
1569 | +int kaiser_enabled __read_mostly = 1; |
1570 | +EXPORT_SYMBOL(kaiser_enabled); /* for inlined TLB flush functions */ |
1571 | + |
1572 | +__visible |
1573 | +DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); |
1574 | + |
1575 | +/* |
1576 | + * These can have bit 63 set, so we can not just use a plain "or" |
1577 | + * instruction to get their value or'd into CR3. It would take |
1578 | + * another register. So, we use a memory reference to these instead. |
1579 | + * |
1580 | + * This is also handy because systems that do not support PCIDs |
1581 | + * just end up or'ing a 0 into their CR3, which does no harm. |
1582 | + */ |
1583 | +DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user); |
1584 | + |
1585 | +/* |
1586 | + * At runtime, the only things we map are some things for CPU |
1587 | + * hotplug, and stacks for new processes. No two CPUs will ever |
1588 | + * be populating the same addresses, so we only need to ensure |
1589 | + * that we protect between two CPUs trying to allocate and |
1590 | + * populate the same page table page. |
1591 | + * |
1592 | + * Only take this lock when doing a set_p[4um]d(), but it is not |
1593 | + * needed for doing a set_pte(). We assume that only the *owner* |
1594 | + * of a given allocation will be doing this for _their_ |
1595 | + * allocation. |
1596 | + * |
1597 | + * This ensures that once a system has been running for a while |
1598 | + * and there have been stacks all over and these page tables |
1599 | + * are fully populated, there will be no further acquisitions of |
1600 | + * this lock. |
1601 | + */ |
1602 | +static DEFINE_SPINLOCK(shadow_table_allocation_lock); |
1603 | + |
1604 | +/* |
1605 | + * Returns -1 on error. |
1606 | + */ |
1607 | +static inline unsigned long get_pa_from_mapping(unsigned long vaddr) |
1608 | +{ |
1609 | + pgd_t *pgd; |
1610 | + pud_t *pud; |
1611 | + pmd_t *pmd; |
1612 | + pte_t *pte; |
1613 | + |
1614 | + pgd = pgd_offset_k(vaddr); |
1615 | + /* |
1616 | + * We made all the kernel PGDs present in kaiser_init(). |
1617 | + * We expect them to stay that way. |
1618 | + */ |
1619 | + BUG_ON(pgd_none(*pgd)); |
1620 | + /* |
1621 | + * PGD entries map either 512GB or 128TB on all x86_64 |
1622 | + * configurations; huge mappings at this level are not handled. |
1623 | + */ |
1624 | + BUG_ON(pgd_large(*pgd)); |
1625 | + |
1626 | + pud = pud_offset(pgd, vaddr); |
1627 | + if (pud_none(*pud)) { |
1628 | + WARN_ON_ONCE(1); |
1629 | + return -1; |
1630 | + } |
1631 | + |
1632 | + if (pud_large(*pud)) |
1633 | + return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK); |
1634 | + |
1635 | + pmd = pmd_offset(pud, vaddr); |
1636 | + if (pmd_none(*pmd)) { |
1637 | + WARN_ON_ONCE(1); |
1638 | + return -1; |
1639 | + } |
1640 | + |
1641 | + if (pmd_large(*pmd)) |
1642 | + return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK); |
1643 | + |
1644 | + pte = pte_offset_kernel(pmd, vaddr); |
1645 | + if (pte_none(*pte)) { |
1646 | + WARN_ON_ONCE(1); |
1647 | + return -1; |
1648 | + } |
1649 | + |
1650 | + return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK); |
1651 | +} |
1652 | + |
1653 | +/* |
1654 | + * This is a relatively normal page table walk, except that it |
1655 | + * also tries to allocate page table pages along the way. |
1656 | + * |
1657 | + * Returns a pointer to a PTE on success, or NULL on failure. |
1658 | + */ |
1659 | +static pte_t *kaiser_pagetable_walk(unsigned long address) |
1660 | +{ |
1661 | + pmd_t *pmd; |
1662 | + pud_t *pud; |
1663 | + pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address)); |
1664 | + gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); |
1665 | + |
1666 | + if (pgd_none(*pgd)) { |
1667 | + WARN_ONCE(1, "All shadow pgds should have been populated"); |
1668 | + return NULL; |
1669 | + } |
1670 | + BUILD_BUG_ON(pgd_large(*pgd) != 0); |
1671 | + |
1672 | + pud = pud_offset(pgd, address); |
1673 | + /* The shadow page tables do not use large mappings: */ |
1674 | + if (pud_large(*pud)) { |
1675 | + WARN_ON(1); |
1676 | + return NULL; |
1677 | + } |
1678 | + if (pud_none(*pud)) { |
1679 | + unsigned long new_pmd_page = __get_free_page(gfp); |
1680 | + if (!new_pmd_page) |
1681 | + return NULL; |
1682 | + spin_lock(&shadow_table_allocation_lock); |
1683 | + if (pud_none(*pud)) { |
1684 | + set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); |
1685 | + __inc_zone_page_state(virt_to_page((void *) |
1686 | + new_pmd_page), NR_KAISERTABLE); |
1687 | + } else |
1688 | + free_page(new_pmd_page); |
1689 | + spin_unlock(&shadow_table_allocation_lock); |
1690 | + } |
1691 | + |
1692 | + pmd = pmd_offset(pud, address); |
1693 | + /* The shadow page tables do not use large mappings: */ |
1694 | + if (pmd_large(*pmd)) { |
1695 | + WARN_ON(1); |
1696 | + return NULL; |
1697 | + } |
1698 | + if (pmd_none(*pmd)) { |
1699 | + unsigned long new_pte_page = __get_free_page(gfp); |
1700 | + if (!new_pte_page) |
1701 | + return NULL; |
1702 | + spin_lock(&shadow_table_allocation_lock); |
1703 | + if (pmd_none(*pmd)) { |
1704 | + set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); |
1705 | + __inc_zone_page_state(virt_to_page((void *) |
1706 | + new_pte_page), NR_KAISERTABLE); |
1707 | + } else |
1708 | + free_page(new_pte_page); |
1709 | + spin_unlock(&shadow_table_allocation_lock); |
1710 | + } |
1711 | + |
1712 | + return pte_offset_kernel(pmd, address); |
1713 | +} |
1714 | + |
1715 | +static int kaiser_add_user_map(const void *__start_addr, unsigned long size, |
1716 | + unsigned long flags) |
1717 | +{ |
1718 | + int ret = 0; |
1719 | + pte_t *pte; |
1720 | + unsigned long start_addr = (unsigned long)__start_addr; |
1721 | + unsigned long address = start_addr & PAGE_MASK; |
1722 | + unsigned long end_addr = PAGE_ALIGN(start_addr + size); |
1723 | + unsigned long target_address; |
1724 | + |
1725 | + /* |
1726 | + * It is convenient for callers to pass in __PAGE_KERNEL etc, |
1727 | + * and there is no actual harm from setting _PAGE_GLOBAL, so |
1728 | + * long as CR4.PGE is not set. But it is nonetheless troubling |
1729 | + * to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser" |
1730 | + * requires that not to be #defined to 0): so mask it off here. |
1731 | + */ |
1732 | + flags &= ~_PAGE_GLOBAL; |
1733 | + |
1734 | + for (; address < end_addr; address += PAGE_SIZE) { |
1735 | + target_address = get_pa_from_mapping(address); |
1736 | + if (target_address == -1) { |
1737 | + ret = -EIO; |
1738 | + break; |
1739 | + } |
1740 | + pte = kaiser_pagetable_walk(address); |
1741 | + if (!pte) { |
1742 | + ret = -ENOMEM; |
1743 | + break; |
1744 | + } |
1745 | + if (pte_none(*pte)) { |
1746 | + set_pte(pte, __pte(flags | target_address)); |
1747 | + } else { |
1748 | + pte_t tmp; |
1749 | + set_pte(&tmp, __pte(flags | target_address)); |
1750 | + WARN_ON_ONCE(!pte_same(*pte, tmp)); |
1751 | + } |
1752 | + } |
1753 | + return ret; |
1754 | +} |
1755 | + |
1756 | +static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags) |
1757 | +{ |
1758 | + unsigned long size = end - start; |
1759 | + |
1760 | + return kaiser_add_user_map(start, size, flags); |
1761 | +} |
1762 | + |
1763 | +/* |
1764 | + * Ensure that the top level of the (shadow) page tables are |
1765 | + * entirely populated. This ensures that all processes that get |
1766 | + * forked have the same entries. This way, we do not have to |
1767 | + * ever go set up new entries in older processes. |
1768 | + * |
1769 | + * Note: we never free these, so there are no updates to them |
1770 | + * after this. |
1771 | + */ |
1772 | +static void __init kaiser_init_all_pgds(void) |
1773 | +{ |
1774 | + pgd_t *pgd; |
1775 | + int i = 0; |
1776 | + |
1777 | + pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long)0)); |
1778 | + for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) { |
1779 | + pgd_t new_pgd; |
1780 | + pud_t *pud = pud_alloc_one(&init_mm, |
1781 | + PAGE_OFFSET + i * PGDIR_SIZE); |
1782 | + if (!pud) { |
1783 | + WARN_ON(1); |
1784 | + break; |
1785 | + } |
1786 | + inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE); |
1787 | + new_pgd = __pgd(_KERNPG_TABLE | __pa(pud)); |
1788 | + /* |
1789 | + * Make sure not to stomp on some other pgd entry. |
1790 | + */ |
1791 | + if (!pgd_none(pgd[i])) { |
1792 | + WARN_ON(1); |
1793 | + continue; |
1794 | + } |
1795 | + set_pgd(pgd + i, new_pgd); |
1796 | + } |
1797 | +} |
1798 | + |
1799 | +#define kaiser_add_user_map_early(start, size, flags) do { \ |
1800 | + int __ret = kaiser_add_user_map(start, size, flags); \ |
1801 | + WARN_ON(__ret); \ |
1802 | +} while (0) |
1803 | + |
1804 | +#define kaiser_add_user_map_ptrs_early(start, end, flags) do { \ |
1805 | + int __ret = kaiser_add_user_map_ptrs(start, end, flags); \ |
1806 | + WARN_ON(__ret); \ |
1807 | +} while (0) |
1808 | + |
1809 | +void __init kaiser_check_boottime_disable(void) |
1810 | +{ |
1811 | + bool enable = true; |
1812 | + char arg[5]; |
1813 | + int ret; |
1814 | + |
1815 | + if (boot_cpu_has(X86_FEATURE_XENPV)) |
1816 | + goto silent_disable; |
1817 | + |
1818 | + ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg)); |
1819 | + if (ret > 0) { |
1820 | + if (!strncmp(arg, "on", 2)) |
1821 | + goto enable; |
1822 | + |
1823 | + if (!strncmp(arg, "off", 3)) |
1824 | + goto disable; |
1825 | + |
1826 | + if (!strncmp(arg, "auto", 4)) |
1827 | + goto skip; |
1828 | + } |
1829 | + |
1830 | + if (cmdline_find_option_bool(boot_command_line, "nopti")) |
1831 | + goto disable; |
1832 | + |
1833 | +skip: |
1834 | + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) |
1835 | + goto disable; |
1836 | + |
1837 | +enable: |
1838 | + if (enable) |
1839 | + setup_force_cpu_cap(X86_FEATURE_KAISER); |
1840 | + |
1841 | + return; |
1842 | + |
1843 | +disable: |
1844 | + pr_info("disabled\n"); |
1845 | + |
1846 | +silent_disable: |
1847 | + kaiser_enabled = 0; |
1848 | + setup_clear_cpu_cap(X86_FEATURE_KAISER); |
1849 | +} |
1850 | + |
1851 | +/* |
1852 | + * If anything in here fails, we will likely crash on one of the |
1853 | + * first kernel->user transitions, and init will die. But we |
1854 | + * will have most of the kernel up by then and should be able to |
1855 | + * get a clean warning out of it. If we BUG_ON() here, we run |
1856 | + * the risk of dying before we have good console output. |
1857 | + */ |
1858 | +void __init kaiser_init(void) |
1859 | +{ |
1860 | + int cpu; |
1861 | + |
1862 | + if (!kaiser_enabled) |
1863 | + return; |
1864 | + |
1865 | + kaiser_init_all_pgds(); |
1866 | + |
1867 | + for_each_possible_cpu(cpu) { |
1868 | + void *percpu_vaddr = __per_cpu_user_mapped_start + |
1869 | + per_cpu_offset(cpu); |
1870 | + unsigned long percpu_sz = __per_cpu_user_mapped_end - |
1871 | + __per_cpu_user_mapped_start; |
1872 | + kaiser_add_user_map_early(percpu_vaddr, percpu_sz, |
1873 | + __PAGE_KERNEL); |
1874 | + } |
1875 | + |
1876 | + /* |
1877 | + * Map the entry/exit text section, which is needed on |
1878 | + * switches between user and kernel mode. |
1879 | + */ |
1880 | + kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end, |
1881 | + __PAGE_KERNEL_RX); |
1882 | + |
1883 | +#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) |
1884 | + kaiser_add_user_map_ptrs_early(__irqentry_text_start, |
1885 | + __irqentry_text_end, |
1886 | + __PAGE_KERNEL_RX); |
1887 | +#endif |
1888 | + kaiser_add_user_map_early((void *)idt_descr.address, |
1889 | + sizeof(gate_desc) * NR_VECTORS, |
1890 | + __PAGE_KERNEL_RO); |
1891 | +#ifdef CONFIG_TRACING |
1892 | + kaiser_add_user_map_early(&trace_idt_descr, |
1893 | + sizeof(trace_idt_descr), |
1894 | + __PAGE_KERNEL); |
1895 | + kaiser_add_user_map_early(&trace_idt_table, |
1896 | + sizeof(gate_desc) * NR_VECTORS, |
1897 | + __PAGE_KERNEL); |
1898 | +#endif |
1899 | + kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr), |
1900 | + __PAGE_KERNEL); |
1901 | + kaiser_add_user_map_early(&debug_idt_table, |
1902 | + sizeof(gate_desc) * NR_VECTORS, |
1903 | + __PAGE_KERNEL); |
1904 | + |
1905 | + pr_info("enabled\n"); |
1906 | +} |
1907 | + |
1908 | +/* Add a mapping to the shadow mapping, and synchronize the mappings */ |
1909 | +int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) |
1910 | +{ |
1911 | + if (!kaiser_enabled) |
1912 | + return 0; |
1913 | + return kaiser_add_user_map((const void *)addr, size, flags); |
1914 | +} |
1915 | + |
1916 | +void kaiser_remove_mapping(unsigned long start, unsigned long size) |
1917 | +{ |
1918 | + extern void unmap_pud_range_nofree(pgd_t *pgd, |
1919 | + unsigned long start, unsigned long end); |
1920 | + unsigned long end = start + size; |
1921 | + unsigned long addr, next; |
1922 | + pgd_t *pgd; |
1923 | + |
1924 | + if (!kaiser_enabled) |
1925 | + return; |
1926 | + pgd = native_get_shadow_pgd(pgd_offset_k(start)); |
1927 | + for (addr = start; addr < end; pgd++, addr = next) { |
1928 | + next = pgd_addr_end(addr, end); |
1929 | + unmap_pud_range_nofree(pgd, addr, next); |
1930 | + } |
1931 | +} |
1932 | + |
1933 | +/* |
1934 | + * Page table pages are page-aligned. The lower half of the top |
1935 | + * level is used for userspace and the top half for the kernel. |
1936 | + * This returns true for pgd entries that map userspace and must |
1937 | + * be copied into both the user and kernel page tables, and false |
1938 | + * for kernel entries that belong only in the kernel copy. |
1939 | + */ |
1940 | +static inline bool is_userspace_pgd(pgd_t *pgdp) |
1941 | +{ |
1942 | + return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2); |
1943 | +} |
1944 | + |
1945 | +pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) |
1946 | +{ |
1947 | + if (!kaiser_enabled) |
1948 | + return pgd; |
1949 | + /* |
1950 | + * Do we need to also populate the shadow pgd? Check _PAGE_USER to |
1951 | + * skip cases like kexec and EFI which make temporary low mappings. |
1952 | + */ |
1953 | + if (pgd.pgd & _PAGE_USER) { |
1954 | + if (is_userspace_pgd(pgdp)) { |
1955 | + native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; |
1956 | + /* |
1957 | + * Even if the entry is *mapping* userspace, ensure |
1958 | + * that userspace can not use it. This way, if we |
1959 | + * get out to userspace running on the kernel CR3, |
1960 | + * userspace will crash instead of running. |
1961 | + */ |
1962 | + if (__supported_pte_mask & _PAGE_NX) |
1963 | + pgd.pgd |= _PAGE_NX; |
1964 | + } |
1965 | + } else if (!pgd.pgd) { |
1966 | + /* |
1967 | + * pgd_clear() cannot check _PAGE_USER, and is even used to |
1968 | + * clear corrupted pgd entries: so just rely on cases like |
1969 | + * kexec and EFI never to be using pgd_clear(). |
1970 | + */ |
1971 | + if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) && |
1972 | + is_userspace_pgd(pgdp)) |
1973 | + native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; |
1974 | + } |
1975 | + return pgd; |
1976 | +} |
1977 | + |
1978 | +void kaiser_setup_pcid(void) |
1979 | +{ |
1980 | + unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET; |
1981 | + |
1982 | + if (this_cpu_has(X86_FEATURE_PCID)) |
1983 | + user_cr3 |= X86_CR3_PCID_USER_NOFLUSH; |
1984 | + /* |
1985 | + * This variable is used by the entry/exit code to |
1986 | + * switch PCID and pgd, and to control TLB flushing. |
1987 | + */ |
1988 | + this_cpu_write(x86_cr3_pcid_user, user_cr3); |
1989 | +} |
1990 | + |
1991 | +/* |
1992 | + * Make a note that this CPU must flush the user TLB on return to user. |
1993 | + * If the CPU lacks PCID, the NOFLUSH bit will never have been set. |
1994 | + */ |
1995 | +void kaiser_flush_tlb_on_return_to_user(void) |
1996 | +{ |
1997 | + if (this_cpu_has(X86_FEATURE_PCID)) |
1998 | + this_cpu_write(x86_cr3_pcid_user, |
1999 | + X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET); |
2000 | +} |
2001 | +EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user); |
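Taken together, kaiser_setup_pcid() and kaiser_flush_tlb_on_return_to_user() mean that x86_cr3_pcid_user always holds KAISER_SHADOW_PGD_OFFSET plus whatever PCID/NOFLUSH bits apply. A C rendering of what the exit assembly does with it (the real SWITCH_USER_CR3 is a macro in asm/kaiser.h; this sketch only illustrates the arithmetic, and assumes the macro masks the 12-bit PCID field before OR'ing):

    static unsigned long user_cr3_sketch(unsigned long kernel_cr3)
    {
    	unsigned long cr3 = kernel_cr3 & ~0xFFFull; /* drop kernel PCID */

    	/*
    	 * x86_cr3_pcid_user may have bit 63 (NOFLUSH) set, which is
    	 * why the real assembly uses a memory operand rather than an
    	 * immediate.
    	 */
    	return cr3 | this_cpu_read(x86_cr3_pcid_user);
    }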
2002 | diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c |
2003 | index aed206475aa7..319183d93602 100644 |
2004 | --- a/arch/x86/mm/kaslr.c |
2005 | +++ b/arch/x86/mm/kaslr.c |
2006 | @@ -189,6 +189,6 @@ void __meminit init_trampoline(void) |
2007 | *pud_tramp = *pud; |
2008 | } |
2009 | |
2010 | - set_pgd(&trampoline_pgd_entry, |
2011 | - __pgd(_KERNPG_TABLE | __pa(pud_page_tramp))); |
2012 | + /* Avoid set_pgd(), in case it's complicated by CONFIG_PAGE_TABLE_ISOLATION */ |
2013 | + trampoline_pgd_entry = __pgd(_KERNPG_TABLE | __pa(pud_page_tramp)); |
2014 | } |
2015 | diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c |
2016 | index e3353c97d086..73dcb0e18c1b 100644 |
2017 | --- a/arch/x86/mm/pageattr.c |
2018 | +++ b/arch/x86/mm/pageattr.c |
2019 | @@ -52,6 +52,7 @@ static DEFINE_SPINLOCK(cpa_lock); |
2020 | #define CPA_FLUSHTLB 1 |
2021 | #define CPA_ARRAY 2 |
2022 | #define CPA_PAGES_ARRAY 4 |
2023 | +#define CPA_FREE_PAGETABLES 8 |
2024 | |
2025 | #ifdef CONFIG_PROC_FS |
2026 | static unsigned long direct_pages_count[PG_LEVEL_NUM]; |
2027 | @@ -729,10 +730,13 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte, |
2028 | return 0; |
2029 | } |
2030 | |
2031 | -static bool try_to_free_pte_page(pte_t *pte) |
2032 | +static bool try_to_free_pte_page(struct cpa_data *cpa, pte_t *pte) |
2033 | { |
2034 | int i; |
2035 | |
2036 | + if (!(cpa->flags & CPA_FREE_PAGETABLES)) |
2037 | + return false; |
2038 | + |
2039 | for (i = 0; i < PTRS_PER_PTE; i++) |
2040 | if (!pte_none(pte[i])) |
2041 | return false; |
2042 | @@ -741,10 +745,13 @@ static bool try_to_free_pte_page(pte_t *pte) |
2043 | return true; |
2044 | } |
2045 | |
2046 | -static bool try_to_free_pmd_page(pmd_t *pmd) |
2047 | +static bool try_to_free_pmd_page(struct cpa_data *cpa, pmd_t *pmd) |
2048 | { |
2049 | int i; |
2050 | |
2051 | + if (!(cpa->flags & CPA_FREE_PAGETABLES)) |
2052 | + return false; |
2053 | + |
2054 | for (i = 0; i < PTRS_PER_PMD; i++) |
2055 | if (!pmd_none(pmd[i])) |
2056 | return false; |
2057 | @@ -753,7 +760,9 @@ static bool try_to_free_pmd_page(pmd_t *pmd) |
2058 | return true; |
2059 | } |
2060 | |
2061 | -static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end) |
2062 | +static bool unmap_pte_range(struct cpa_data *cpa, pmd_t *pmd, |
2063 | + unsigned long start, |
2064 | + unsigned long end) |
2065 | { |
2066 | pte_t *pte = pte_offset_kernel(pmd, start); |
2067 | |
2068 | @@ -764,22 +773,23 @@ static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end) |
2069 | pte++; |
2070 | } |
2071 | |
2072 | - if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) { |
2073 | + if (try_to_free_pte_page(cpa, (pte_t *)pmd_page_vaddr(*pmd))) { |
2074 | pmd_clear(pmd); |
2075 | return true; |
2076 | } |
2077 | return false; |
2078 | } |
2079 | |
2080 | -static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd, |
2081 | +static void __unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, pmd_t *pmd, |
2082 | unsigned long start, unsigned long end) |
2083 | { |
2084 | - if (unmap_pte_range(pmd, start, end)) |
2085 | - if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) |
2086 | + if (unmap_pte_range(cpa, pmd, start, end)) |
2087 | + if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud))) |
2088 | pud_clear(pud); |
2089 | } |
2090 | |
2091 | -static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) |
2092 | +static void unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, |
2093 | + unsigned long start, unsigned long end) |
2094 | { |
2095 | pmd_t *pmd = pmd_offset(pud, start); |
2096 | |
2097 | @@ -790,7 +800,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) |
2098 | unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; |
2099 | unsigned long pre_end = min_t(unsigned long, end, next_page); |
2100 | |
2101 | - __unmap_pmd_range(pud, pmd, start, pre_end); |
2102 | + __unmap_pmd_range(cpa, pud, pmd, start, pre_end); |
2103 | |
2104 | start = pre_end; |
2105 | pmd++; |
2106 | @@ -803,7 +813,8 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) |
2107 | if (pmd_large(*pmd)) |
2108 | pmd_clear(pmd); |
2109 | else |
2110 | - __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE); |
2111 | + __unmap_pmd_range(cpa, pud, pmd, |
2112 | + start, start + PMD_SIZE); |
2113 | |
2114 | start += PMD_SIZE; |
2115 | pmd++; |
2116 | @@ -813,17 +824,19 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) |
2117 | * 4K leftovers? |
2118 | */ |
2119 | if (start < end) |
2120 | - return __unmap_pmd_range(pud, pmd, start, end); |
2121 | + return __unmap_pmd_range(cpa, pud, pmd, start, end); |
2122 | |
2123 | /* |
2124 | * Try again to free the PMD page if haven't succeeded above. |
2125 | */ |
2126 | if (!pud_none(*pud)) |
2127 | - if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) |
2128 | + if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud))) |
2129 | pud_clear(pud); |
2130 | } |
2131 | |
2132 | -static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) |
2133 | +static void __unmap_pud_range(struct cpa_data *cpa, pgd_t *pgd, |
2134 | + unsigned long start, |
2135 | + unsigned long end) |
2136 | { |
2137 | pud_t *pud = pud_offset(pgd, start); |
2138 | |
2139 | @@ -834,7 +847,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) |
2140 | unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; |
2141 | unsigned long pre_end = min_t(unsigned long, end, next_page); |
2142 | |
2143 | - unmap_pmd_range(pud, start, pre_end); |
2144 | + unmap_pmd_range(cpa, pud, start, pre_end); |
2145 | |
2146 | start = pre_end; |
2147 | pud++; |
2148 | @@ -848,7 +861,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) |
2149 | if (pud_large(*pud)) |
2150 | pud_clear(pud); |
2151 | else |
2152 | - unmap_pmd_range(pud, start, start + PUD_SIZE); |
2153 | + unmap_pmd_range(cpa, pud, start, start + PUD_SIZE); |
2154 | |
2155 | start += PUD_SIZE; |
2156 | pud++; |
2157 | @@ -858,7 +871,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) |
2158 | * 2M leftovers? |
2159 | */ |
2160 | if (start < end) |
2161 | - unmap_pmd_range(pud, start, end); |
2162 | + unmap_pmd_range(cpa, pud, start, end); |
2163 | |
2164 | /* |
2165 | * No need to try to free the PUD page because we'll free it in |
2166 | @@ -866,6 +879,24 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) |
2167 | */ |
2168 | } |
2169 | |
2170 | +static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) |
2171 | +{ |
2172 | + struct cpa_data cpa = { |
2173 | + .flags = CPA_FREE_PAGETABLES, |
2174 | + }; |
2175 | + |
2176 | + __unmap_pud_range(&cpa, pgd, start, end); |
2177 | +} |
2178 | + |
2179 | +void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end) |
2180 | +{ |
2181 | + struct cpa_data cpa = { |
2182 | + .flags = 0, |
2183 | + }; |
2184 | + |
2185 | + __unmap_pud_range(&cpa, pgd, start, end); |
2186 | +} |
2187 | + |
2188 | static int alloc_pte_page(pmd_t *pmd) |
2189 | { |
2190 | pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); |
2191 | diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c |
2192 | index 3feec5af4e67..5aaec8effc5f 100644 |
2193 | --- a/arch/x86/mm/pgtable.c |
2194 | +++ b/arch/x86/mm/pgtable.c |
2195 | @@ -344,14 +344,22 @@ static inline void _pgd_free(pgd_t *pgd) |
2196 | kmem_cache_free(pgd_cache, pgd); |
2197 | } |
2198 | #else |
2199 | + |
2200 | +/* |
2201 | + * Instead of one pgd, Kaiser acquires two pgds. Being order-1, it is |
2202 | + * both 8k in size and 8k-aligned. That lets us just flip bit 12 |
2203 | + * in a pointer to swap between the two 4k halves. |
2204 | + */ |
2205 | +#define PGD_ALLOCATION_ORDER kaiser_enabled |
2206 | + |
2207 | static inline pgd_t *_pgd_alloc(void) |
2208 | { |
2209 | - return (pgd_t *)__get_free_page(PGALLOC_GFP); |
2210 | + return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER); |
2211 | } |
2212 | |
2213 | static inline void _pgd_free(pgd_t *pgd) |
2214 | { |
2215 | - free_page((unsigned long)pgd); |
2216 | + free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER); |
2217 | } |
2218 | #endif /* CONFIG_X86_PAE */ |
2219 | |
2220 | diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c |
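With kaiser_enabled as the allocation order, a pgd is a single 4k page when KAISER is off and an 8k-aligned pair when it is on. The bit-12 flip described above is what native_get_shadow_pgd() (defined earlier in this patch, in asm/pgtable_64.h) relies on; an illustrative sketch, assuming as in KAISER that the kernel copy occupies the low half:

    static pgd_t *shadow_pgd_sketch(pgd_t *kernel_pgdp)
    {
    	/* Kernel half at offset 0, shadow (user) half at +4k. */
    	return (pgd_t *)((unsigned long)kernel_pgdp | PAGE_SIZE);
    }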
2221 | index 53b72fb4e781..41205de487e7 100644 |
2222 | --- a/arch/x86/mm/tlb.c |
2223 | +++ b/arch/x86/mm/tlb.c |
2224 | @@ -6,13 +6,14 @@ |
2225 | #include <linux/interrupt.h> |
2226 | #include <linux/export.h> |
2227 | #include <linux/cpu.h> |
2228 | +#include <linux/debugfs.h> |
2229 | |
2230 | #include <asm/tlbflush.h> |
2231 | #include <asm/mmu_context.h> |
2232 | #include <asm/cache.h> |
2233 | #include <asm/apic.h> |
2234 | #include <asm/uv/uv.h> |
2235 | -#include <linux/debugfs.h> |
2236 | +#include <asm/kaiser.h> |
2237 | |
2238 | /* |
2239 | * TLB flushing, formerly SMP-only |
2240 | @@ -34,6 +35,36 @@ struct flush_tlb_info { |
2241 | unsigned long flush_end; |
2242 | }; |
2243 | |
2244 | +static void load_new_mm_cr3(pgd_t *pgdir) |
2245 | +{ |
2246 | + unsigned long new_mm_cr3 = __pa(pgdir); |
2247 | + |
2248 | + if (kaiser_enabled) { |
2249 | + /* |
2250 | + * We reuse the same PCID for different tasks, so we must |
2251 | + * flush all the entries for the PCID out when we change tasks. |
2252 | + * Flush KERN below, flush USER when returning to userspace in |
2253 | + * kaiser's SWITCH_USER_CR3 (_SWITCH_TO_USER_CR3) macro. |
2254 | + * |
2255 | + * invpcid_flush_single_context(X86_CR3_PCID_ASID_USER) could |
2256 | + * do it here, but can only be used if X86_FEATURE_INVPCID is |
2257 | + * available - and many machines support pcid without invpcid. |
2258 | + * |
2259 | + * If X86_CR3_PCID_KERN_FLUSH actually added something, then it |
2260 | + * would be needed in the write_cr3() below - if PCIDs enabled. |
2261 | + */ |
2262 | + BUILD_BUG_ON(X86_CR3_PCID_KERN_FLUSH); |
2263 | + kaiser_flush_tlb_on_return_to_user(); |
2264 | + } |
2265 | + |
2266 | + /* |
2267 | + * Caution: many callers of this function expect |
2268 | + * that load_cr3() is serializing and orders TLB |
2269 | + * fills with respect to the mm_cpumask writes. |
2270 | + */ |
2271 | + write_cr3(new_mm_cr3); |
2272 | +} |
2273 | + |
2274 | /* |
2275 | * We cannot call mmdrop() because we are in interrupt context, |
2276 | * instead update mm->cpu_vm_mask. |
2277 | @@ -45,7 +76,7 @@ void leave_mm(int cpu) |
2278 | BUG(); |
2279 | if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) { |
2280 | cpumask_clear_cpu(cpu, mm_cpumask(active_mm)); |
2281 | - load_cr3(swapper_pg_dir); |
2282 | + load_new_mm_cr3(swapper_pg_dir); |
2283 | /* |
2284 | * This gets called in the idle path where RCU |
2285 | * functions differently. Tracing normally |
2286 | @@ -120,7 +151,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, |
2287 | * ordering guarantee we need. |
2288 | * |
2289 | */ |
2290 | - load_cr3(next->pgd); |
2291 | + load_new_mm_cr3(next->pgd); |
2292 | |
2293 | trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); |
2294 | |
2295 | @@ -167,7 +198,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, |
2296 | * As above, load_cr3() is serializing and orders TLB |
2297 | * fills with respect to the mm_cpumask write. |
2298 | */ |
2299 | - load_cr3(next->pgd); |
2300 | + load_new_mm_cr3(next->pgd); |
2301 | trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); |
2302 | load_mm_cr4(next); |
2303 | load_mm_ldt(next); |
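The comment in load_new_mm_cr3() points at the INVPCID alternative; spelled out, the trade-off looks roughly like this (illustrative only, and the feature test is an assumption, not part of the patch):

    static void flush_user_pcid_sketch(void)
    {
    	if (this_cpu_has(X86_FEATURE_INVPCID)) {
    		/* Flush the user ASID directly, right now. */
    		invpcid_flush_single_context(X86_CR3_PCID_ASID_USER);
    	} else {
    		/* Defer: the exit path reloads CR3 without NOFLUSH. */
    		kaiser_flush_tlb_on_return_to_user();
    	}
    }

The patch always takes the deferred path, since many PCID-capable machines lack INVPCID and the deferred flush works everywhere.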
2304 | diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h |
2305 | index dc81e5287ebf..2e6000a4eb2c 100644 |
2306 | --- a/include/asm-generic/vmlinux.lds.h |
2307 | +++ b/include/asm-generic/vmlinux.lds.h |
2308 | @@ -778,7 +778,14 @@ |
2309 | */ |
2310 | #define PERCPU_INPUT(cacheline) \ |
2311 | VMLINUX_SYMBOL(__per_cpu_start) = .; \ |
2312 | + VMLINUX_SYMBOL(__per_cpu_user_mapped_start) = .; \ |
2313 | *(.data..percpu..first) \ |
2314 | + . = ALIGN(cacheline); \ |
2315 | + *(.data..percpu..user_mapped) \ |
2316 | + *(.data..percpu..user_mapped..shared_aligned) \ |
2317 | + . = ALIGN(PAGE_SIZE); \ |
2318 | + *(.data..percpu..user_mapped..page_aligned) \ |
2319 | + VMLINUX_SYMBOL(__per_cpu_user_mapped_end) = .; \ |
2320 | . = ALIGN(PAGE_SIZE); \ |
2321 | *(.data..percpu..page_aligned) \ |
2322 | . = ALIGN(cacheline); \ |
2323 | diff --git a/include/linux/kaiser.h b/include/linux/kaiser.h |
2324 | new file mode 100644 |
2325 | index 000000000000..58c55b1589d0 |
2326 | --- /dev/null |
2327 | +++ b/include/linux/kaiser.h |
2328 | @@ -0,0 +1,52 @@ |
2329 | +#ifndef _LINUX_KAISER_H |
2330 | +#define _LINUX_KAISER_H |
2331 | + |
2332 | +#ifdef CONFIG_PAGE_TABLE_ISOLATION |
2333 | +#include <asm/kaiser.h> |
2334 | + |
2335 | +static inline int kaiser_map_thread_stack(void *stack) |
2336 | +{ |
2337 | + /* |
2338 | + * Map that page of kernel stack on which we enter from user context. |
2339 | + */ |
2340 | + return kaiser_add_mapping((unsigned long)stack + |
2341 | + THREAD_SIZE - PAGE_SIZE, PAGE_SIZE, __PAGE_KERNEL); |
2342 | +} |
2343 | + |
2344 | +static inline void kaiser_unmap_thread_stack(void *stack) |
2345 | +{ |
2346 | + /* |
2347 | + * Note: may be called even when kaiser_map_thread_stack() failed. |
2348 | + */ |
2349 | + kaiser_remove_mapping((unsigned long)stack + |
2350 | + THREAD_SIZE - PAGE_SIZE, PAGE_SIZE); |
2351 | +} |
2352 | +#else |
2353 | + |
2354 | +/* |
2355 | + * These stubs are used whenever CONFIG_PAGE_TABLE_ISOLATION is off, which |
2356 | + * includes architectures that support KAISER, but have it disabled. |
2357 | + */ |
2358 | + |
2359 | +static inline void kaiser_init(void) |
2360 | +{ |
2361 | +} |
2362 | +static inline int kaiser_add_mapping(unsigned long addr, |
2363 | + unsigned long size, unsigned long flags) |
2364 | +{ |
2365 | + return 0; |
2366 | +} |
2367 | +static inline void kaiser_remove_mapping(unsigned long start, |
2368 | + unsigned long size) |
2369 | +{ |
2370 | +} |
2371 | +static inline int kaiser_map_thread_stack(void *stack) |
2372 | +{ |
2373 | + return 0; |
2374 | +} |
2375 | +static inline void kaiser_unmap_thread_stack(void *stack) |
2376 | +{ |
2377 | +} |
2378 | + |
2379 | +#endif /* !CONFIG_PAGE_TABLE_ISOLATION */ |
2380 | +#endif /* _LINUX_KAISER_H */ |
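Because the stubs return 0 or do nothing, callers need no #ifdefs; the kernel/fork.c hunk below uses exactly this pattern:

    /* Sketch of the call-site pattern (see dup_task_struct() below). */
    err = kaiser_map_thread_stack(tsk->stack);
    if (err)
    	goto free_stack;	/* can only fail when KAISER is built in */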
2381 | diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h |
2382 | index fff21a82780c..490f5a83f947 100644 |
2383 | --- a/include/linux/mmzone.h |
2384 | +++ b/include/linux/mmzone.h |
2385 | @@ -124,8 +124,9 @@ enum zone_stat_item { |
2386 | NR_SLAB_UNRECLAIMABLE, |
2387 | NR_PAGETABLE, /* used for pagetables */ |
2388 | NR_KERNEL_STACK_KB, /* measured in KiB */ |
2389 | - /* Second 128 byte cacheline */ |
2390 | + NR_KAISERTABLE, |
2391 | NR_BOUNCE, |
2392 | + /* Second 128 byte cacheline */ |
2393 | #if IS_ENABLED(CONFIG_ZSMALLOC) |
2394 | NR_ZSPAGES, /* allocated in zsmalloc */ |
2395 | #endif |
2396 | diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h |
2397 | index 8f16299ca068..8902f23bb770 100644 |
2398 | --- a/include/linux/percpu-defs.h |
2399 | +++ b/include/linux/percpu-defs.h |
2400 | @@ -35,6 +35,12 @@ |
2401 | |
2402 | #endif |
2403 | |
2404 | +#ifdef CONFIG_PAGE_TABLE_ISOLATION |
2405 | +#define USER_MAPPED_SECTION "..user_mapped" |
2406 | +#else |
2407 | +#define USER_MAPPED_SECTION "" |
2408 | +#endif |
2409 | + |
2410 | /* |
2411 | * Base implementations of per-CPU variable declarations and definitions, where |
2412 | * the section in which the variable is to be placed is provided by the |
2413 | @@ -115,6 +121,12 @@ |
2414 | #define DEFINE_PER_CPU(type, name) \ |
2415 | DEFINE_PER_CPU_SECTION(type, name, "") |
2416 | |
2417 | +#define DECLARE_PER_CPU_USER_MAPPED(type, name) \ |
2418 | + DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION) |
2419 | + |
2420 | +#define DEFINE_PER_CPU_USER_MAPPED(type, name) \ |
2421 | + DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION) |
2422 | + |
2423 | /* |
2424 | * Declaration/definition used for per-CPU variables that must come first in |
2425 | * the set of variables. |
2426 | @@ -144,6 +156,14 @@ |
2427 | DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \ |
2428 | ____cacheline_aligned_in_smp |
2429 | |
2430 | +#define DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \ |
2431 | + DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \ |
2432 | + ____cacheline_aligned_in_smp |
2433 | + |
2434 | +#define DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \ |
2435 | + DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \ |
2436 | + ____cacheline_aligned_in_smp |
2437 | + |
2438 | #define DECLARE_PER_CPU_ALIGNED(type, name) \ |
2439 | DECLARE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION) \ |
2440 | ____cacheline_aligned |
2441 | @@ -162,11 +182,21 @@ |
2442 | #define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \ |
2443 | DEFINE_PER_CPU_SECTION(type, name, "..page_aligned") \ |
2444 | __aligned(PAGE_SIZE) |
2445 | +/* |
2446 | + * Declaration/definition used for per-CPU variables that must be page-aligned and mapped in user mode. |
2447 | + */ |
2448 | +#define DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \ |
2449 | + DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \ |
2450 | + __aligned(PAGE_SIZE) |
2451 | + |
2452 | +#define DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \ |
2453 | + DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \ |
2454 | + __aligned(PAGE_SIZE) |
2455 | |
2456 | /* |
2457 | * Declaration/definition used for per-CPU variables that must be read mostly. |
2458 | */ |
2459 | -#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \ |
2460 | +#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \ |
2461 | DECLARE_PER_CPU_SECTION(type, name, "..read_mostly") |
2462 | |
2463 | #define DEFINE_PER_CPU_READ_MOSTLY(type, name) \ |
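A usage sketch of the new macros: any per-cpu datum the entry code must reach while still on the user CR3 is defined this way, and then lands in the __per_cpu_user_mapped window that kaiser_init() maps. The variable name here is hypothetical; kaiser.c's unsafe_stack_register_backup is a real user:

    DEFINE_PER_CPU_USER_MAPPED(unsigned long, exit_scratch);

    static void remember_scratch(unsigned long v)
    {
    	this_cpu_write(exit_scratch, v);
    }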
2464 | diff --git a/init/main.c b/init/main.c |
2465 | index 25bac88bc66e..99f026565608 100644 |
2466 | --- a/init/main.c |
2467 | +++ b/init/main.c |
2468 | @@ -80,6 +80,7 @@ |
2469 | #include <linux/integrity.h> |
2470 | #include <linux/proc_ns.h> |
2471 | #include <linux/io.h> |
2472 | +#include <linux/kaiser.h> |
2473 | |
2474 | #include <asm/io.h> |
2475 | #include <asm/bugs.h> |
2476 | @@ -473,6 +474,7 @@ static void __init mm_init(void) |
2477 | pgtable_init(); |
2478 | vmalloc_init(); |
2479 | ioremap_huge_init(); |
2480 | + kaiser_init(); |
2481 | } |
2482 | |
2483 | asmlinkage __visible void __init start_kernel(void) |
2484 | diff --git a/kernel/fork.c b/kernel/fork.c |
2485 | index 9321b1ad3335..70e10cb49be0 100644 |
2486 | --- a/kernel/fork.c |
2487 | +++ b/kernel/fork.c |
2488 | @@ -58,6 +58,7 @@ |
2489 | #include <linux/tsacct_kern.h> |
2490 | #include <linux/cn_proc.h> |
2491 | #include <linux/freezer.h> |
2492 | +#include <linux/kaiser.h> |
2493 | #include <linux/delayacct.h> |
2494 | #include <linux/taskstats_kern.h> |
2495 | #include <linux/random.h> |
2496 | @@ -213,6 +214,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) |
2497 | |
2498 | static inline void free_thread_stack(struct task_struct *tsk) |
2499 | { |
2500 | + kaiser_unmap_thread_stack(tsk->stack); |
2501 | #ifdef CONFIG_VMAP_STACK |
2502 | if (task_stack_vm_area(tsk)) { |
2503 | unsigned long flags; |
2504 | @@ -495,6 +497,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) |
2505 | * functions again. |
2506 | */ |
2507 | tsk->stack = stack; |
2508 | + |
2509 | + err = kaiser_map_thread_stack(tsk->stack); |
2510 | + if (err) |
2511 | + goto free_stack; |
2512 | #ifdef CONFIG_VMAP_STACK |
2513 | tsk->stack_vm_area = stack_vm_area; |
2514 | #endif |
2515 | diff --git a/mm/vmstat.c b/mm/vmstat.c |
2516 | index 604f26a4f696..6a088df04b29 100644 |
2517 | --- a/mm/vmstat.c |
2518 | +++ b/mm/vmstat.c |
2519 | @@ -932,6 +932,7 @@ const char * const vmstat_text[] = { |
2520 | "nr_slab_unreclaimable", |
2521 | "nr_page_table_pages", |
2522 | "nr_kernel_stack", |
2523 | + "nr_overhead", |
2524 | "nr_bounce", |
2525 | #if IS_ENABLED(CONFIG_ZSMALLOC) |
2526 | "nr_zspages", |
2527 | diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c |
2528 | index 97f9cac98348..e86a34fd5484 100644 |
2529 | --- a/net/ipv4/tcp_bbr.c |
2530 | +++ b/net/ipv4/tcp_bbr.c |
2531 | @@ -843,6 +843,11 @@ static u32 bbr_sndbuf_expand(struct sock *sk) |
2532 | */ |
2533 | static u32 bbr_undo_cwnd(struct sock *sk) |
2534 | { |
2535 | + struct bbr *bbr = inet_csk_ca(sk); |
2536 | + |
2537 | + bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ |
2538 | + bbr->full_bw_cnt = 0; |
2539 | + bbr_reset_lt_bw_sampling(sk); |
2540 | return tcp_sk(sk)->snd_cwnd; |
2541 | } |
2542 | |
2543 | diff --git a/security/Kconfig b/security/Kconfig |
2544 | index 118f4549404e..32f36b40e9f0 100644 |
2545 | --- a/security/Kconfig |
2546 | +++ b/security/Kconfig |
2547 | @@ -31,6 +31,16 @@ config SECURITY |
2548 | |
2549 | If you are unsure how to answer this question, answer N. |
2550 | |
2551 | +config PAGE_TABLE_ISOLATION |
2552 | + bool "Remove the kernel mapping in user mode" |
2553 | + default y |
2554 | + depends on X86_64 && SMP |
2555 | + help |
2556 | + This enforces strict kernel and user space isolation, in order |
2557 | + to close hardware side channels on kernel address information. |
2558 | + |
2559 | + If you are unsure how to answer this question, answer Y. |
2560 | + |
2561 | config SECURITYFS |
2562 | bool "Enable the securityfs filesystem" |
2563 | help |
2564 | diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h |
2565 | index a39629206864..f79669a38c0c 100644 |
2566 | --- a/tools/arch/x86/include/asm/cpufeatures.h |
2567 | +++ b/tools/arch/x86/include/asm/cpufeatures.h |
2568 | @@ -197,6 +197,9 @@ |
2569 | #define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */ |
2570 | #define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */ |
2571 | |
2572 | +/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */ |
2573 | +#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */ |
2574 | + |
2575 | /* Virtualization flags: Linux defined, word 8 */ |
2576 | #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ |
2577 | #define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ |