Magellan Linux

Contents of /trunk/kernel-alx-legacy/patches-4.9/0174-4.9.75-all-fixes.patch

Revision 3608
Fri Aug 14 07:34:29 2020 UTC by niro
File size: 79490 byte(s)
-added kernel-alx-legacy pkg
1 diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
2 index 152ec4e87b57..5d2676d043de 100644
3 --- a/Documentation/kernel-parameters.txt
4 +++ b/Documentation/kernel-parameters.txt
5 @@ -2763,6 +2763,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
6
7 nojitter [IA-64] Disables jitter checking for ITC timers.
8
9 + nopti [X86-64] Disable KAISER isolation of kernel from user.
10 +
11 no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver
12
13 no-kvmapf [X86,KVM] Disable paravirtualized asynchronous page
14 @@ -3325,6 +3327,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
15 pt. [PARIDE]
16 See Documentation/blockdev/paride.txt.
17
18 + pti= [X86_64]
19 + Control KAISER user/kernel address space isolation:
20 + on - enable
21 + off - disable
22 + auto - default setting
23 +
24 pty.legacy_count=
25 [KNL] Number of legacy pty's. Overwrites compiled-in
26 default number.
27 diff --git a/Makefile b/Makefile
28 index 075e429732e7..acbc1b032db2 100644
29 --- a/Makefile
30 +++ b/Makefile
31 @@ -1,6 +1,6 @@
32 VERSION = 4
33 PATCHLEVEL = 9
34 -SUBLEVEL = 74
35 +SUBLEVEL = 75
36 EXTRAVERSION =
37 NAME = Roaring Lionus
38
39 diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
40 index 766a5211f827..2728e1b7e4a6 100644
41 --- a/arch/x86/boot/compressed/misc.h
42 +++ b/arch/x86/boot/compressed/misc.h
43 @@ -9,6 +9,7 @@
44 */
45 #undef CONFIG_PARAVIRT
46 #undef CONFIG_PARAVIRT_SPINLOCKS
47 +#undef CONFIG_PAGE_TABLE_ISOLATION
48 #undef CONFIG_KASAN
49
50 #include <linux/linkage.h>
51 diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
52 index e7b0e7ff4c58..af4e58132d91 100644
53 --- a/arch/x86/entry/entry_64.S
54 +++ b/arch/x86/entry/entry_64.S
55 @@ -36,6 +36,7 @@
56 #include <asm/smap.h>
57 #include <asm/pgtable_types.h>
58 #include <asm/export.h>
59 +#include <asm/kaiser.h>
60 #include <linux/err.h>
61
62 /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
63 @@ -146,6 +147,7 @@ ENTRY(entry_SYSCALL_64)
64 * it is too small to ever cause noticeable irq latency.
65 */
66 SWAPGS_UNSAFE_STACK
67 + SWITCH_KERNEL_CR3_NO_STACK
68 /*
69 * A hypervisor implementation might want to use a label
70 * after the swapgs, so that it can do the swapgs
71 @@ -228,6 +230,14 @@ entry_SYSCALL_64_fastpath:
72 movq RIP(%rsp), %rcx
73 movq EFLAGS(%rsp), %r11
74 RESTORE_C_REGS_EXCEPT_RCX_R11
75 + /*
76 + * This opens a window where we have a user CR3, but are
77 + * running in the kernel. This makes using the CS
78 + * register useless for telling whether or not we need to
79 + * switch CR3 in NMIs. Normal interrupts are OK because
80 + * they are off here.
81 + */
82 + SWITCH_USER_CR3
83 movq RSP(%rsp), %rsp
84 USERGS_SYSRET64
85
86 @@ -323,10 +333,26 @@ return_from_SYSCALL_64:
87 syscall_return_via_sysret:
88 /* rcx and r11 are already restored (see code above) */
89 RESTORE_C_REGS_EXCEPT_RCX_R11
90 + /*
91 + * This opens a window where we have a user CR3, but are
92 + * running in the kernel. This makes using the CS
93 + * register useless for telling whether or not we need to
94 + * switch CR3 in NMIs. Normal interrupts are OK because
95 + * they are off here.
96 + */
97 + SWITCH_USER_CR3
98 movq RSP(%rsp), %rsp
99 USERGS_SYSRET64
100
101 opportunistic_sysret_failed:
102 + /*
103 + * This opens a window where we have a user CR3, but are
104 + * running in the kernel. This makes using the CS
105 + * register useless for telling whether or not we need to
106 + * switch CR3 in NMIs. Normal interrupts are OK because
107 + * they are off here.
108 + */
109 + SWITCH_USER_CR3
110 SWAPGS
111 jmp restore_c_regs_and_iret
112 END(entry_SYSCALL_64)
113 @@ -424,6 +450,7 @@ ENTRY(ret_from_fork)
114 movq %rsp, %rdi
115 call syscall_return_slowpath /* returns with IRQs disabled */
116 TRACE_IRQS_ON /* user mode is traced as IRQS on */
117 + SWITCH_USER_CR3
118 SWAPGS
119 jmp restore_regs_and_iret
120
121 @@ -478,6 +505,7 @@ END(irq_entries_start)
122 * tracking that we're in kernel mode.
123 */
124 SWAPGS
125 + SWITCH_KERNEL_CR3
126
127 /*
128 * We need to tell lockdep that IRQs are off. We can't do this until
129 @@ -535,6 +563,7 @@ GLOBAL(retint_user)
130 mov %rsp,%rdi
131 call prepare_exit_to_usermode
132 TRACE_IRQS_IRETQ
133 + SWITCH_USER_CR3
134 SWAPGS
135 jmp restore_regs_and_iret
136
137 @@ -612,6 +641,7 @@ native_irq_return_ldt:
138
139 pushq %rdi /* Stash user RDI */
140 SWAPGS
141 + SWITCH_KERNEL_CR3
142 movq PER_CPU_VAR(espfix_waddr), %rdi
143 movq %rax, (0*8)(%rdi) /* user RAX */
144 movq (1*8)(%rsp), %rax /* user RIP */
145 @@ -638,6 +668,7 @@ native_irq_return_ldt:
146 * still points to an RO alias of the ESPFIX stack.
147 */
148 orq PER_CPU_VAR(espfix_stack), %rax
149 + SWITCH_USER_CR3
150 SWAPGS
151 movq %rax, %rsp
152
153 @@ -1022,7 +1053,11 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vec
154 /*
155 * Save all registers in pt_regs, and switch gs if needed.
156 * Use slow, but surefire "are we in kernel?" check.
157 - * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
158 + *
159 + * Return: ebx=0: needs swapgs but not SWITCH_USER_CR3 in paranoid_exit
160 + * ebx=1: needs neither swapgs nor SWITCH_USER_CR3 in paranoid_exit
161 + * ebx=2: needs both swapgs and SWITCH_USER_CR3 in paranoid_exit
162 + * ebx=3: needs SWITCH_USER_CR3 but not swapgs in paranoid_exit
163 */
164 ENTRY(paranoid_entry)
165 cld
166 @@ -1035,7 +1070,26 @@ ENTRY(paranoid_entry)
167 js 1f /* negative -> in kernel */
168 SWAPGS
169 xorl %ebx, %ebx
170 -1: ret
171 +1:
172 +#ifdef CONFIG_PAGE_TABLE_ISOLATION
173 + /*
174 + * We might have come in between a swapgs and a SWITCH_KERNEL_CR3
175 + * on entry, or between a SWITCH_USER_CR3 and a swapgs on exit.
176 + * Do a conditional SWITCH_KERNEL_CR3: this could safely be done
177 + * unconditionally, but we need to find out whether the reverse
178 + * should be done on return (conveyed to paranoid_exit in %ebx).
179 + */
180 + ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
181 + testl $KAISER_SHADOW_PGD_OFFSET, %eax
182 + jz 2f
183 + orl $2, %ebx
184 + andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
185 + /* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */
186 + ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
187 + movq %rax, %cr3
188 +2:
189 +#endif
190 + ret
191 END(paranoid_entry)
192
193 /*
194 @@ -1048,19 +1102,26 @@ END(paranoid_entry)
195 * be complicated. Fortunately, we there's no good reason
196 * to try to handle preemption here.
197 *
198 - * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
199 + * On entry: ebx=0: needs swapgs but not SWITCH_USER_CR3
200 + * ebx=1: needs neither swapgs nor SWITCH_USER_CR3
201 + * ebx=2: needs both swapgs and SWITCH_USER_CR3
202 + * ebx=3: needs SWITCH_USER_CR3 but not swapgs
203 */
204 ENTRY(paranoid_exit)
205 DISABLE_INTERRUPTS(CLBR_NONE)
206 TRACE_IRQS_OFF_DEBUG
207 - testl %ebx, %ebx /* swapgs needed? */
208 + TRACE_IRQS_IRETQ_DEBUG
209 +#ifdef CONFIG_PAGE_TABLE_ISOLATION
210 + /* No ALTERNATIVE for X86_FEATURE_KAISER: paranoid_entry sets %ebx */
211 + testl $2, %ebx /* SWITCH_USER_CR3 needed? */
212 + jz paranoid_exit_no_switch
213 + SWITCH_USER_CR3
214 +paranoid_exit_no_switch:
215 +#endif
216 + testl $1, %ebx /* swapgs needed? */
217 jnz paranoid_exit_no_swapgs
218 - TRACE_IRQS_IRETQ
219 SWAPGS_UNSAFE_STACK
220 - jmp paranoid_exit_restore
221 paranoid_exit_no_swapgs:
222 - TRACE_IRQS_IRETQ_DEBUG
223 -paranoid_exit_restore:
224 RESTORE_EXTRA_REGS
225 RESTORE_C_REGS
226 REMOVE_PT_GPREGS_FROM_STACK 8
227 @@ -1075,6 +1136,13 @@ ENTRY(error_entry)
228 cld
229 SAVE_C_REGS 8
230 SAVE_EXTRA_REGS 8
231 + /*
232 + * error_entry() always returns with a kernel gsbase and
233 + * CR3. We must also have a kernel CR3/gsbase before
234 + * calling TRACE_IRQS_*. Just unconditionally switch to
235 + * the kernel CR3 here.
236 + */
237 + SWITCH_KERNEL_CR3
238 xorl %ebx, %ebx
239 testb $3, CS+8(%rsp)
240 jz .Lerror_kernelspace
241 @@ -1235,6 +1303,10 @@ ENTRY(nmi)
242 */
243
244 SWAPGS_UNSAFE_STACK
245 + /*
246 + * percpu variables are mapped with user CR3, so no need
247 + * to switch CR3 here.
248 + */
249 cld
250 movq %rsp, %rdx
251 movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
252 @@ -1268,12 +1340,34 @@ ENTRY(nmi)
253
254 movq %rsp, %rdi
255 movq $-1, %rsi
256 +#ifdef CONFIG_PAGE_TABLE_ISOLATION
257 + /* Unconditionally use kernel CR3 for do_nmi() */
258 + /* %rax is saved above, so OK to clobber here */
259 + ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
260 + /* If PCID enabled, NOFLUSH now and NOFLUSH on return */
261 + ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
262 + pushq %rax
263 + /* mask off "user" bit of pgd address and 12 PCID bits: */
264 + andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
265 + movq %rax, %cr3
266 +2:
267 +#endif
268 call do_nmi
269
270 +#ifdef CONFIG_PAGE_TABLE_ISOLATION
271 + /*
272 + * Unconditionally restore CR3. I know we return to
273 + * kernel code that needs user CR3, but do we ever return
274 + * to "user mode" where we need the kernel CR3?
275 + */
276 + ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER
277 +#endif
278 +
279 /*
280 * Return back to user mode. We must *not* do the normal exit
281 - * work, because we don't want to enable interrupts. Fortunately,
282 - * do_nmi doesn't modify pt_regs.
283 + * work, because we don't want to enable interrupts. Do not
284 + * switch to user CR3: we might be going back to kernel code
285 + * that had a user CR3 set.
286 */
287 SWAPGS
288 jmp restore_c_regs_and_iret
289 @@ -1470,22 +1564,55 @@ end_repeat_nmi:
290 ALLOC_PT_GPREGS_ON_STACK
291
292 /*
293 - * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
294 - * as we should not be calling schedule in NMI context.
295 - * Even with normal interrupts enabled. An NMI should not be
296 - * setting NEED_RESCHED or anything that normal interrupts and
297 - * exceptions might do.
298 + * Use the same approach as paranoid_entry to handle SWAPGS, but
299 + * without CR3 handling since we do that differently in NMIs. No
300 + * need to use paranoid_exit as we should not be calling schedule
301 + * in NMI context. Even with normal interrupts enabled. An NMI
302 + * should not be setting NEED_RESCHED or anything that normal
303 + * interrupts and exceptions might do.
304 */
305 - call paranoid_entry
306 -
307 - /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
308 + cld
309 + SAVE_C_REGS
310 + SAVE_EXTRA_REGS
311 + movl $1, %ebx
312 + movl $MSR_GS_BASE, %ecx
313 + rdmsr
314 + testl %edx, %edx
315 + js 1f /* negative -> in kernel */
316 + SWAPGS
317 + xorl %ebx, %ebx
318 +1:
319 movq %rsp, %rdi
320 movq $-1, %rsi
321 +#ifdef CONFIG_PAGE_TABLE_ISOLATION
322 + /* Unconditionally use kernel CR3 for do_nmi() */
323 + /* %rax is saved above, so OK to clobber here */
324 + ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
325 + /* If PCID enabled, NOFLUSH now and NOFLUSH on return */
326 + ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
327 + pushq %rax
328 + /* mask off "user" bit of pgd address and 12 PCID bits: */
329 + andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
330 + movq %rax, %cr3
331 +2:
332 +#endif
333 +
334 + /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
335 call do_nmi
336
337 +#ifdef CONFIG_PAGE_TABLE_ISOLATION
338 + /*
339 + * Unconditionally restore CR3. We might be returning to
340 + * kernel code that needs user CR3, like just just before
341 + * a sysret.
342 + */
343 + ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER
344 +#endif
345 +
346 testl %ebx, %ebx /* swapgs needed? */
347 jnz nmi_restore
348 nmi_swapgs:
349 + /* We fixed up CR3 above, so no need to switch it here */
350 SWAPGS_UNSAFE_STACK
351 nmi_restore:
352 RESTORE_EXTRA_REGS
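
The reworked paranoid_entry/paranoid_exit above pack two independent facts into %ebx: bit 0 set means SWAPGS is not needed on exit, and bit 1 set means the entry path switched away from a user CR3 and the exit path must switch back. A small user-space C model of that encoding (the flag names are invented for illustration; the patch itself just tests the raw bits):

    #include <stdio.h>

    /* Illustrative names only; the patch tests these bits directly in %ebx. */
    #define PARANOID_NO_SWAPGS 0x1  /* bit 0: gs was already the kernel gs */
    #define PARANOID_USER_CR3  0x2  /* bit 1: entry switched user CR3 -> kernel CR3 */

    static void paranoid_exit_model(unsigned int ebx)
    {
        printf("ebx=%u:%s%s\n", ebx,
               (ebx & PARANOID_USER_CR3) ? " SWITCH_USER_CR3" : "",
               (ebx & PARANOID_NO_SWAPGS) ? "" : " SWAPGS");
    }

    int main(void)
    {
        unsigned int ebx;

        /* 0: swapgs only, 1: neither, 2: both, 3: CR3 switch only */
        for (ebx = 0; ebx <= 3; ebx++)
            paranoid_exit_model(ebx);
        return 0;
    }
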
353 diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
354 index e1721dafbcb1..d76a97653980 100644
355 --- a/arch/x86/entry/entry_64_compat.S
356 +++ b/arch/x86/entry/entry_64_compat.S
357 @@ -13,6 +13,8 @@
358 #include <asm/irqflags.h>
359 #include <asm/asm.h>
360 #include <asm/smap.h>
361 +#include <asm/pgtable_types.h>
362 +#include <asm/kaiser.h>
363 #include <linux/linkage.h>
364 #include <linux/err.h>
365
366 @@ -48,6 +50,7 @@
367 ENTRY(entry_SYSENTER_compat)
368 /* Interrupts are off on entry. */
369 SWAPGS_UNSAFE_STACK
370 + SWITCH_KERNEL_CR3_NO_STACK
371 movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
372
373 /*
374 @@ -184,6 +187,7 @@ ENDPROC(entry_SYSENTER_compat)
375 ENTRY(entry_SYSCALL_compat)
376 /* Interrupts are off on entry. */
377 SWAPGS_UNSAFE_STACK
378 + SWITCH_KERNEL_CR3_NO_STACK
379
380 /* Stash user ESP and switch to the kernel stack. */
381 movl %esp, %r8d
382 @@ -259,6 +263,7 @@ sysret32_from_system_call:
383 xorq %r8, %r8
384 xorq %r9, %r9
385 xorq %r10, %r10
386 + SWITCH_USER_CR3
387 movq RSP-ORIG_RAX(%rsp), %rsp
388 swapgs
389 sysretl
390 @@ -297,7 +302,7 @@ ENTRY(entry_INT80_compat)
391 PARAVIRT_ADJUST_EXCEPTION_FRAME
392 ASM_CLAC /* Do this early to minimize exposure */
393 SWAPGS
394 -
395 + SWITCH_KERNEL_CR3_NO_STACK
396 /*
397 * User tracing code (ptrace or signal handlers) might assume that
398 * the saved RAX contains a 32-bit number when we're invoking a 32-bit
399 @@ -338,6 +343,7 @@ ENTRY(entry_INT80_compat)
400
401 /* Go back to user mode. */
402 TRACE_IRQS_ON
403 + SWITCH_USER_CR3
404 SWAPGS
405 jmp restore_regs_and_iret
406 END(entry_INT80_compat)
407 diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
408 index 9dfeeeca0ea8..8e7a3f1df3a5 100644
409 --- a/arch/x86/events/intel/ds.c
410 +++ b/arch/x86/events/intel/ds.c
411 @@ -2,11 +2,15 @@
412 #include <linux/types.h>
413 #include <linux/slab.h>
414
415 +#include <asm/kaiser.h>
416 #include <asm/perf_event.h>
417 #include <asm/insn.h>
418
419 #include "../perf_event.h"
420
421 +static
422 +DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct debug_store, cpu_debug_store);
423 +
424 /* The size of a BTS record in bytes: */
425 #define BTS_RECORD_SIZE 24
426
427 @@ -268,6 +272,39 @@ void fini_debug_store_on_cpu(int cpu)
428
429 static DEFINE_PER_CPU(void *, insn_buffer);
430
431 +static void *dsalloc(size_t size, gfp_t flags, int node)
432 +{
433 +#ifdef CONFIG_PAGE_TABLE_ISOLATION
434 + unsigned int order = get_order(size);
435 + struct page *page;
436 + unsigned long addr;
437 +
438 + page = __alloc_pages_node(node, flags | __GFP_ZERO, order);
439 + if (!page)
440 + return NULL;
441 + addr = (unsigned long)page_address(page);
442 + if (kaiser_add_mapping(addr, size, __PAGE_KERNEL) < 0) {
443 + __free_pages(page, order);
444 + addr = 0;
445 + }
446 + return (void *)addr;
447 +#else
448 + return kmalloc_node(size, flags | __GFP_ZERO, node);
449 +#endif
450 +}
451 +
452 +static void dsfree(const void *buffer, size_t size)
453 +{
454 +#ifdef CONFIG_PAGE_TABLE_ISOLATION
455 + if (!buffer)
456 + return;
457 + kaiser_remove_mapping((unsigned long)buffer, size);
458 + free_pages((unsigned long)buffer, get_order(size));
459 +#else
460 + kfree(buffer);
461 +#endif
462 +}
463 +
464 static int alloc_pebs_buffer(int cpu)
465 {
466 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
467 @@ -278,7 +315,7 @@ static int alloc_pebs_buffer(int cpu)
468 if (!x86_pmu.pebs)
469 return 0;
470
471 - buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
472 + buffer = dsalloc(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
473 if (unlikely(!buffer))
474 return -ENOMEM;
475
476 @@ -289,7 +326,7 @@ static int alloc_pebs_buffer(int cpu)
477 if (x86_pmu.intel_cap.pebs_format < 2) {
478 ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
479 if (!ibuffer) {
480 - kfree(buffer);
481 + dsfree(buffer, x86_pmu.pebs_buffer_size);
482 return -ENOMEM;
483 }
484 per_cpu(insn_buffer, cpu) = ibuffer;
485 @@ -315,7 +352,8 @@ static void release_pebs_buffer(int cpu)
486 kfree(per_cpu(insn_buffer, cpu));
487 per_cpu(insn_buffer, cpu) = NULL;
488
489 - kfree((void *)(unsigned long)ds->pebs_buffer_base);
490 + dsfree((void *)(unsigned long)ds->pebs_buffer_base,
491 + x86_pmu.pebs_buffer_size);
492 ds->pebs_buffer_base = 0;
493 }
494
495 @@ -329,7 +367,7 @@ static int alloc_bts_buffer(int cpu)
496 if (!x86_pmu.bts)
497 return 0;
498
499 - buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
500 + buffer = dsalloc(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
501 if (unlikely(!buffer)) {
502 WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
503 return -ENOMEM;
504 @@ -355,19 +393,15 @@ static void release_bts_buffer(int cpu)
505 if (!ds || !x86_pmu.bts)
506 return;
507
508 - kfree((void *)(unsigned long)ds->bts_buffer_base);
509 + dsfree((void *)(unsigned long)ds->bts_buffer_base, BTS_BUFFER_SIZE);
510 ds->bts_buffer_base = 0;
511 }
512
513 static int alloc_ds_buffer(int cpu)
514 {
515 - int node = cpu_to_node(cpu);
516 - struct debug_store *ds;
517 -
518 - ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
519 - if (unlikely(!ds))
520 - return -ENOMEM;
521 + struct debug_store *ds = per_cpu_ptr(&cpu_debug_store, cpu);
522
523 + memset(ds, 0, sizeof(*ds));
524 per_cpu(cpu_hw_events, cpu).ds = ds;
525
526 return 0;
527 @@ -381,7 +415,6 @@ static void release_ds_buffer(int cpu)
528 return;
529
530 per_cpu(cpu_hw_events, cpu).ds = NULL;
531 - kfree(ds);
532 }
533
534 void release_ds_buffers(void)
535 diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h
536 index e01f7f7ccb0c..84ae170bc3d0 100644
537 --- a/arch/x86/include/asm/cmdline.h
538 +++ b/arch/x86/include/asm/cmdline.h
539 @@ -2,5 +2,7 @@
540 #define _ASM_X86_CMDLINE_H
541
542 int cmdline_find_option_bool(const char *cmdline_ptr, const char *option);
543 +int cmdline_find_option(const char *cmdline_ptr, const char *option,
544 + char *buffer, int bufsize);
545
546 #endif /* _ASM_X86_CMDLINE_H */
547 diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
548 index ed10b5bf9b93..454a37adb823 100644
549 --- a/arch/x86/include/asm/cpufeatures.h
550 +++ b/arch/x86/include/asm/cpufeatures.h
551 @@ -189,6 +189,7 @@
552
553 #define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */
554 #define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
555 +#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 4) /* Effectively INVPCID && CR4.PCIDE=1 */
556
557 #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
558 #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
559 @@ -197,6 +198,9 @@
560 #define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */
561 #define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */
562
563 +/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */
564 +#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */
565 +
566 /* Virtualization flags: Linux defined, word 8 */
567 #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
568 #define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */
569 diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
570 index 12080d87da3b..2ed5a2b3f8f7 100644
571 --- a/arch/x86/include/asm/desc.h
572 +++ b/arch/x86/include/asm/desc.h
573 @@ -43,7 +43,7 @@ struct gdt_page {
574 struct desc_struct gdt[GDT_ENTRIES];
575 } __attribute__((aligned(PAGE_SIZE)));
576
577 -DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
578 +DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page);
579
580 static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
581 {
582 diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
583 index b90e1053049b..0817d63bce41 100644
584 --- a/arch/x86/include/asm/hw_irq.h
585 +++ b/arch/x86/include/asm/hw_irq.h
586 @@ -178,7 +178,7 @@ extern char irq_entries_start[];
587 #define VECTOR_RETRIGGERED ((void *)~0UL)
588
589 typedef struct irq_desc* vector_irq_t[NR_VECTORS];
590 -DECLARE_PER_CPU(vector_irq_t, vector_irq);
591 +DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq);
592
593 #endif /* !ASSEMBLY_ */
594
595 diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h
596 new file mode 100644
597 index 000000000000..802bbbdfe143
598 --- /dev/null
599 +++ b/arch/x86/include/asm/kaiser.h
600 @@ -0,0 +1,141 @@
601 +#ifndef _ASM_X86_KAISER_H
602 +#define _ASM_X86_KAISER_H
603 +
604 +#include <uapi/asm/processor-flags.h> /* For PCID constants */
605 +
606 +/*
607 + * This file includes the definitions for the KAISER feature.
608 + * KAISER is a counter measure against x86_64 side channel attacks on
609 + * the kernel virtual memory. It has a shadow pgd for every process: the
610 + * shadow pgd has a minimalistic kernel-set mapped, but includes the whole
611 + * user memory. Within a kernel context switch, or when an interrupt is handled,
612 + * the pgd is switched to the normal one. When the system switches to user mode,
613 + * the shadow pgd is enabled. By this, the virtual memory caches are freed,
614 + * and the user may not attack the whole kernel memory.
615 + *
616 + * A minimalistic kernel mapping holds the parts needed to be mapped in user
617 + * mode, such as the entry/exit functions of the user space, or the stacks.
618 + */
619 +
620 +#define KAISER_SHADOW_PGD_OFFSET 0x1000
621 +
622 +#ifdef __ASSEMBLY__
623 +#ifdef CONFIG_PAGE_TABLE_ISOLATION
624 +
625 +.macro _SWITCH_TO_KERNEL_CR3 reg
626 +movq %cr3, \reg
627 +andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg
628 +/* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */
629 +ALTERNATIVE "", "bts $63, \reg", X86_FEATURE_PCID
630 +movq \reg, %cr3
631 +.endm
632 +
633 +.macro _SWITCH_TO_USER_CR3 reg regb
634 +/*
635 + * regb must be the low byte portion of reg: because we have arranged
636 + * for the low byte of the user PCID to serve as the high byte of NOFLUSH
637 + * (0x80 for each when PCID is enabled, or 0x00 when PCID and NOFLUSH are
638 + * not enabled): so that the one register can update both memory and cr3.
639 + */
640 +movq %cr3, \reg
641 +orq PER_CPU_VAR(x86_cr3_pcid_user), \reg
642 +js 9f
643 +/* If PCID enabled, FLUSH this time, reset to NOFLUSH for next time */
644 +movb \regb, PER_CPU_VAR(x86_cr3_pcid_user+7)
645 +9:
646 +movq \reg, %cr3
647 +.endm
648 +
649 +.macro SWITCH_KERNEL_CR3
650 +ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER
651 +_SWITCH_TO_KERNEL_CR3 %rax
652 +popq %rax
653 +8:
654 +.endm
655 +
656 +.macro SWITCH_USER_CR3
657 +ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER
658 +_SWITCH_TO_USER_CR3 %rax %al
659 +popq %rax
660 +8:
661 +.endm
662 +
663 +.macro SWITCH_KERNEL_CR3_NO_STACK
664 +ALTERNATIVE "jmp 8f", \
665 + __stringify(movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)), \
666 + X86_FEATURE_KAISER
667 +_SWITCH_TO_KERNEL_CR3 %rax
668 +movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
669 +8:
670 +.endm
671 +
672 +#else /* CONFIG_PAGE_TABLE_ISOLATION */
673 +
674 +.macro SWITCH_KERNEL_CR3
675 +.endm
676 +.macro SWITCH_USER_CR3
677 +.endm
678 +.macro SWITCH_KERNEL_CR3_NO_STACK
679 +.endm
680 +
681 +#endif /* CONFIG_PAGE_TABLE_ISOLATION */
682 +
683 +#else /* __ASSEMBLY__ */
684 +
685 +#ifdef CONFIG_PAGE_TABLE_ISOLATION
686 +/*
687 + * Upon kernel/user mode switch, it may happen that the address
688 + * space has to be switched before the registers have been
689 + * stored. To change the address space, another register is
690 + * needed. A register therefore has to be stored/restored.
691 +*/
692 +DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
693 +
694 +DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user);
695 +
696 +extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
697 +
698 +extern int kaiser_enabled;
699 +extern void __init kaiser_check_boottime_disable(void);
700 +#else
701 +#define kaiser_enabled 0
702 +static inline void __init kaiser_check_boottime_disable(void) {}
703 +#endif /* CONFIG_PAGE_TABLE_ISOLATION */
704 +
705 +/*
706 + * Kaiser function prototypes are needed even when CONFIG_PAGE_TABLE_ISOLATION is not set,
707 + * so as to build with tests on kaiser_enabled instead of #ifdefs.
708 + */
709 +
710 +/**
711 + * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
712 + * @addr: the start address of the range
713 + * @size: the size of the range
714 + * @flags: The mapping flags of the pages
715 + *
716 + * The mapping is done on a global scope, so no bigger
717 + * synchronization has to be done. the pages have to be
718 + * manually unmapped again when they are not needed any longer.
719 + */
720 +extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
721 +
722 +/**
723 + * kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping
724 + * @addr: the start address of the range
725 + * @size: the size of the range
726 + */
727 +extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
728 +
729 +/**
730 + * kaiser_init - Initialize the shadow mapping
731 + *
732 + * Most parts of the shadow mapping can be mapped upon boot
733 + * time. Only per-process things like the thread stacks
734 + * or a new LDT have to be mapped at runtime. These boot-
735 + * time mappings are permanent and never unmapped.
736 + */
737 +extern void kaiser_init(void);
738 +
739 +#endif /* __ASSEMBLY */
740 +
741 +#endif /* _ASM_X86_KAISER_H */
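
The _SWITCH_TO_USER_CR3 comment above hinges on a layout trick: with PCID in use, the user ASID (0x80, X86_CR3_PCID_ASID_USER as defined later in this patch) is also the top byte of the NOFLUSH pattern, so the low byte of the register being built for CR3 can re-arm NOFLUSH in the per-CPU x86_cr3_pcid_user word. A user-space sketch of that dance follows; the starting value of the per-CPU word and the way a flush gets requested are assumptions based on the comments here, not code taken from the patch:

    #include <stdio.h>
    #include <stdint.h>

    /* KAISER_SHADOW_PGD_OFFSET and X86_CR3_PCID_ASID_USER come from this
     * patch (kaiser.h and pgtable_types.h); the initial value of the
     * per-CPU word below is an assumption for illustration only. */
    #define KAISER_SHADOW_PGD_OFFSET 0x1000ULL
    #define X86_CR3_PCID_ASID_USER   0x80ULL
    #define X86_CR3_PCID_NOFLUSH     (1ULL << 63)

    static uint64_t x86_cr3_pcid_user =
        X86_CR3_PCID_NOFLUSH | KAISER_SHADOW_PGD_OFFSET | X86_CR3_PCID_ASID_USER;

    /* Mirrors _SWITCH_TO_USER_CR3: OR the per-CPU word into the CR3 value;
     * if NOFLUSH (bit 63) came out clear, this switch flushes, and storing
     * the low CR3 byte (the user ASID, 0x80) into the word's top byte sets
     * NOFLUSH again for the next return to user space. */
    static uint64_t switch_to_user_cr3(uint64_t kernel_cr3)
    {
        uint64_t reg = kernel_cr3 | x86_cr3_pcid_user;

        if ((int64_t)reg >= 0)      /* the "js 9f" sign test in the macro */
            x86_cr3_pcid_user = (x86_cr3_pcid_user & 0x00ffffffffffffffULL) |
                                ((uint64_t)(uint8_t)reg << 56);
        return reg;                 /* the value the macro writes to %cr3 */
    }

    int main(void)
    {
        uint64_t kernel_cr3 = 0x12340000ULL;    /* made-up pgd physical address */

        /* Pretend a TLB flush was requested (the job of
         * kaiser_flush_tlb_on_return_to_user(), whose body is not shown in
         * this excerpt): clear NOFLUSH in the per-CPU word. */
        x86_cr3_pcid_user &= ~X86_CR3_PCID_NOFLUSH;

        printf("1st return to user: cr3=%#llx (TLB flushed)\n",
               (unsigned long long)switch_to_user_cr3(kernel_cr3));
        printf("2nd return to user: cr3=%#llx (NOFLUSH set)\n",
               (unsigned long long)switch_to_user_cr3(kernel_cr3));
        return 0;
    }
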
742 diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
743 index 437feb436efa..2536f90cd30c 100644
744 --- a/arch/x86/include/asm/pgtable.h
745 +++ b/arch/x86/include/asm/pgtable.h
746 @@ -18,6 +18,12 @@
747 #ifndef __ASSEMBLY__
748 #include <asm/x86_init.h>
749
750 +#ifdef CONFIG_PAGE_TABLE_ISOLATION
751 +extern int kaiser_enabled;
752 +#else
753 +#define kaiser_enabled 0
754 +#endif
755 +
756 void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
757 void ptdump_walk_pgd_level_checkwx(void);
758
759 @@ -690,7 +696,17 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)
760
761 static inline int pgd_bad(pgd_t pgd)
762 {
763 - return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
764 + pgdval_t ignore_flags = _PAGE_USER;
765 + /*
766 + * We set NX on KAISER pgds that map userspace memory so
767 + * that userspace can not meaningfully use the kernel
768 + * page table by accident; it will fault on the first
769 + * instruction it tries to run. See native_set_pgd().
770 + */
771 + if (kaiser_enabled)
772 + ignore_flags |= _PAGE_NX;
773 +
774 + return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
775 }
776
777 static inline int pgd_none(pgd_t pgd)
778 @@ -903,7 +919,15 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
779 */
780 static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
781 {
782 - memcpy(dst, src, count * sizeof(pgd_t));
783 + memcpy(dst, src, count * sizeof(pgd_t));
784 +#ifdef CONFIG_PAGE_TABLE_ISOLATION
785 + if (kaiser_enabled) {
786 + /* Clone the shadow pgd part as well */
787 + memcpy(native_get_shadow_pgd(dst),
788 + native_get_shadow_pgd(src),
789 + count * sizeof(pgd_t));
790 + }
791 +#endif
792 }
793
794 #define PTE_SHIFT ilog2(PTRS_PER_PTE)
795 diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
796 index 1cc82ece9ac1..ce97c8c6a310 100644
797 --- a/arch/x86/include/asm/pgtable_64.h
798 +++ b/arch/x86/include/asm/pgtable_64.h
799 @@ -106,9 +106,32 @@ static inline void native_pud_clear(pud_t *pud)
800 native_set_pud(pud, native_make_pud(0));
801 }
802
803 +#ifdef CONFIG_PAGE_TABLE_ISOLATION
804 +extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd);
805 +
806 +static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
807 +{
808 +#ifdef CONFIG_DEBUG_VM
809 + /* linux/mmdebug.h may not have been included at this point */
810 + BUG_ON(!kaiser_enabled);
811 +#endif
812 + return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE);
813 +}
814 +#else
815 +static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
816 +{
817 + return pgd;
818 +}
819 +static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
820 +{
821 + BUILD_BUG_ON(1);
822 + return NULL;
823 +}
824 +#endif /* CONFIG_PAGE_TABLE_ISOLATION */
825 +
826 static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
827 {
828 - *pgdp = pgd;
829 + *pgdp = kaiser_set_shadow_pgd(pgdp, pgd);
830 }
831
832 static inline void native_pgd_clear(pgd_t *pgd)
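
native_get_shadow_pgd() above relies on the 8k-aligned, two-page PGD layout that the head_64.S changes later in this patch arrange (NEXT_PGD_PAGE/KAISER_USER_PGD_FILL): the shadow (user) PGD is simply the page after the kernel one, so ORing in PAGE_SIZE selects it, and the same 0x1000 offset (KAISER_SHADOW_PGD_OFFSET) does the job on the physical address loaded into CR3. A minimal user-space illustration of that address arithmetic, with an ordinary allocation standing in for the real PGD pair:

    #include <stdio.h>
    #include <stdlib.h>
    #include <stdint.h>

    #define PAGE_SIZE 4096UL

    int main(void)
    {
        /* Stand-in for the 2*PAGE_SIZE-aligned PGD pair that
         * NEXT_PGD_PAGE()/KAISER_USER_PGD_FILL set up in head_64.S. */
        void *pgd_pair = aligned_alloc(2 * PAGE_SIZE, 2 * PAGE_SIZE);
        uintptr_t kernel_pgd, shadow_pgd;

        if (!pgd_pair)
            return 1;
        kernel_pgd = (uintptr_t)pgd_pair;
        /* Same arithmetic as native_get_shadow_pgd(): the shadow PGD is the
         * second 4k page, reachable by ORing in PAGE_SIZE (0x1000, i.e.
         * KAISER_SHADOW_PGD_OFFSET when applied to the CR3 value). */
        shadow_pgd = kernel_pgd | PAGE_SIZE;

        printf("kernel pgd %#lx, shadow pgd %#lx, offset %#lx\n",
               (unsigned long)kernel_pgd, (unsigned long)shadow_pgd,
               (unsigned long)(shadow_pgd - kernel_pgd));
        free(pgd_pair);
        return 0;
    }
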
833 diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
834 index 8b4de22d6429..f1c8ac468292 100644
835 --- a/arch/x86/include/asm/pgtable_types.h
836 +++ b/arch/x86/include/asm/pgtable_types.h
837 @@ -119,7 +119,7 @@
838 #define _PAGE_DEVMAP (_AT(pteval_t, 0))
839 #endif
840
841 -#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
842 +#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
843
844 #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
845 _PAGE_ACCESSED | _PAGE_DIRTY)
846 @@ -137,6 +137,33 @@
847 _PAGE_SOFT_DIRTY)
848 #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
849
850 +/* The ASID is the lower 12 bits of CR3 */
851 +#define X86_CR3_PCID_ASID_MASK (_AC((1<<12)-1,UL))
852 +
853 +/* Mask for all the PCID-related bits in CR3: */
854 +#define X86_CR3_PCID_MASK (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK)
855 +#define X86_CR3_PCID_ASID_KERN (_AC(0x0,UL))
856 +
857 +#if defined(CONFIG_PAGE_TABLE_ISOLATION) && defined(CONFIG_X86_64)
858 +/* Let X86_CR3_PCID_ASID_USER be usable for the X86_CR3_PCID_NOFLUSH bit */
859 +#define X86_CR3_PCID_ASID_USER (_AC(0x80,UL))
860 +
861 +#define X86_CR3_PCID_KERN_FLUSH (X86_CR3_PCID_ASID_KERN)
862 +#define X86_CR3_PCID_USER_FLUSH (X86_CR3_PCID_ASID_USER)
863 +#define X86_CR3_PCID_KERN_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_KERN)
864 +#define X86_CR3_PCID_USER_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_USER)
865 +#else
866 +#define X86_CR3_PCID_ASID_USER (_AC(0x0,UL))
867 +/*
868 + * PCIDs are unsupported on 32-bit and none of these bits can be
869 + * set in CR3:
870 + */
871 +#define X86_CR3_PCID_KERN_FLUSH (0)
872 +#define X86_CR3_PCID_USER_FLUSH (0)
873 +#define X86_CR3_PCID_KERN_NOFLUSH (0)
874 +#define X86_CR3_PCID_USER_NOFLUSH (0)
875 +#endif
876 +
877 /*
878 * The cache modes defined here are used to translate between pure SW usage
879 * and the HW defined cache mode bits and/or PAT entries.
880 diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
881 index 83db0eae9979..8cb52ee3ade6 100644
882 --- a/arch/x86/include/asm/processor.h
883 +++ b/arch/x86/include/asm/processor.h
884 @@ -308,7 +308,7 @@ struct tss_struct {
885
886 } ____cacheline_aligned;
887
888 -DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
889 +DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss);
890
891 #ifdef CONFIG_X86_32
892 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
893 diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
894 index 7d2ea6b1f7d9..94146f665a3c 100644
895 --- a/arch/x86/include/asm/tlbflush.h
896 +++ b/arch/x86/include/asm/tlbflush.h
897 @@ -132,6 +132,24 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
898 cr4_set_bits(mask);
899 }
900
901 +/*
902 + * Declare a couple of kaiser interfaces here for convenience,
903 + * to avoid the need for asm/kaiser.h in unexpected places.
904 + */
905 +#ifdef CONFIG_PAGE_TABLE_ISOLATION
906 +extern int kaiser_enabled;
907 +extern void kaiser_setup_pcid(void);
908 +extern void kaiser_flush_tlb_on_return_to_user(void);
909 +#else
910 +#define kaiser_enabled 0
911 +static inline void kaiser_setup_pcid(void)
912 +{
913 +}
914 +static inline void kaiser_flush_tlb_on_return_to_user(void)
915 +{
916 +}
917 +#endif
918 +
919 static inline void __native_flush_tlb(void)
920 {
921 /*
922 @@ -140,6 +158,8 @@ static inline void __native_flush_tlb(void)
923 * back:
924 */
925 preempt_disable();
926 + if (kaiser_enabled)
927 + kaiser_flush_tlb_on_return_to_user();
928 native_write_cr3(native_read_cr3());
929 preempt_enable();
930 }
931 @@ -149,20 +169,27 @@ static inline void __native_flush_tlb_global_irq_disabled(void)
932 unsigned long cr4;
933
934 cr4 = this_cpu_read(cpu_tlbstate.cr4);
935 - /* clear PGE */
936 - native_write_cr4(cr4 & ~X86_CR4_PGE);
937 - /* write old PGE again and flush TLBs */
938 - native_write_cr4(cr4);
939 + if (cr4 & X86_CR4_PGE) {
940 + /* clear PGE and flush TLB of all entries */
941 + native_write_cr4(cr4 & ~X86_CR4_PGE);
942 + /* restore PGE as it was before */
943 + native_write_cr4(cr4);
944 + } else {
945 + /* do it with cr3, letting kaiser flush user PCID */
946 + __native_flush_tlb();
947 + }
948 }
949
950 static inline void __native_flush_tlb_global(void)
951 {
952 unsigned long flags;
953
954 - if (static_cpu_has(X86_FEATURE_INVPCID)) {
955 + if (this_cpu_has(X86_FEATURE_INVPCID)) {
956 /*
957 * Using INVPCID is considerably faster than a pair of writes
958 * to CR4 sandwiched inside an IRQ flag save/restore.
959 + *
960 + * Note, this works with CR4.PCIDE=0 or 1.
961 */
962 invpcid_flush_all();
963 return;
964 @@ -174,24 +201,45 @@ static inline void __native_flush_tlb_global(void)
965 * be called from deep inside debugging code.)
966 */
967 raw_local_irq_save(flags);
968 -
969 __native_flush_tlb_global_irq_disabled();
970 -
971 raw_local_irq_restore(flags);
972 }
973
974 static inline void __native_flush_tlb_single(unsigned long addr)
975 {
976 - asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
977 + /*
978 + * SIMICS #GP's if you run INVPCID with type 2/3
979 + * and X86_CR4_PCIDE clear. Shame!
980 + *
981 + * The ASIDs used below are hard-coded. But, we must not
982 + * call invpcid(type=1/2) before CR4.PCIDE=1. Just call
983 + * invlpg in the case we are called early.
984 + */
985 +
986 + if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) {
987 + if (kaiser_enabled)
988 + kaiser_flush_tlb_on_return_to_user();
989 + asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
990 + return;
991 + }
992 + /* Flush the address out of both PCIDs. */
993 + /*
994 + * An optimization here might be to determine addresses
995 + * that are only kernel-mapped and only flush the kernel
996 + * ASID. But, userspace flushes are probably much more
997 + * important performance-wise.
998 + *
999 + * Make sure to do only a single invpcid when KAISER is
1000 + * disabled and we have only a single ASID.
1001 + */
1002 + if (kaiser_enabled)
1003 + invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr);
1004 + invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr);
1005 }
1006
1007 static inline void __flush_tlb_all(void)
1008 {
1009 - if (boot_cpu_has(X86_FEATURE_PGE))
1010 - __flush_tlb_global();
1011 - else
1012 - __flush_tlb();
1013 -
1014 + __flush_tlb_global();
1015 /*
1016 * Note: if we somehow had PCID but not PGE, then this wouldn't work --
1017 * we'd end up flushing kernel translations for the current ASID but
1018 diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
1019 index 567de50a4c2a..6768d1321016 100644
1020 --- a/arch/x86/include/uapi/asm/processor-flags.h
1021 +++ b/arch/x86/include/uapi/asm/processor-flags.h
1022 @@ -77,7 +77,8 @@
1023 #define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT)
1024 #define X86_CR3_PCD_BIT 4 /* Page Cache Disable */
1025 #define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT)
1026 -#define X86_CR3_PCID_MASK _AC(0x00000fff,UL) /* PCID Mask */
1027 +#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
1028 +#define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT)
1029
1030 /*
1031 * Intel CPU features in CR4
1032 diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
1033 index 91588be529b9..918e44772b04 100644
1034 --- a/arch/x86/kernel/cpu/common.c
1035 +++ b/arch/x86/kernel/cpu/common.c
1036 @@ -93,7 +93,7 @@ static const struct cpu_dev default_cpu = {
1037
1038 static const struct cpu_dev *this_cpu = &default_cpu;
1039
1040 -DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
1041 +DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = {
1042 #ifdef CONFIG_X86_64
1043 /*
1044 * We need valid kernel segments for data and code in long mode too
1045 @@ -327,8 +327,21 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
1046 static void setup_pcid(struct cpuinfo_x86 *c)
1047 {
1048 if (cpu_has(c, X86_FEATURE_PCID)) {
1049 - if (cpu_has(c, X86_FEATURE_PGE)) {
1050 + if (cpu_has(c, X86_FEATURE_PGE) || kaiser_enabled) {
1051 cr4_set_bits(X86_CR4_PCIDE);
1052 + /*
1053 + * INVPCID has two "groups" of types:
1054 + * 1/2: Invalidate an individual address
1055 + * 3/4: Invalidate all contexts
1056 + *
1057 + * 1/2 take a PCID, but 3/4 do not. So, 3/4
1058 + * ignore the PCID argument in the descriptor.
1059 + * But, we have to be careful not to call 1/2
1060 + * with an actual non-zero PCID in them before
1061 + * we do the above cr4_set_bits().
1062 + */
1063 + if (cpu_has(c, X86_FEATURE_INVPCID))
1064 + set_cpu_cap(c, X86_FEATURE_INVPCID_SINGLE);
1065 } else {
1066 /*
1067 * flush_tlb_all(), as currently implemented, won't
1068 @@ -341,6 +354,7 @@ static void setup_pcid(struct cpuinfo_x86 *c)
1069 clear_cpu_cap(c, X86_FEATURE_PCID);
1070 }
1071 }
1072 + kaiser_setup_pcid();
1073 }
1074
1075 /*
1076 @@ -1365,7 +1379,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
1077 [DEBUG_STACK - 1] = DEBUG_STKSZ
1078 };
1079
1080 -static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
1081 +DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks
1082 [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
1083
1084 /* May not be marked __init: used by software suspend */
1085 @@ -1523,6 +1537,14 @@ void cpu_init(void)
1086 * try to read it.
1087 */
1088 cr4_init_shadow();
1089 + if (!kaiser_enabled) {
1090 + /*
1091 + * secondary_startup_64() deferred setting PGE in cr4:
1092 + * probe_page_size_mask() sets it on the boot cpu,
1093 + * but it needs to be set on each secondary cpu.
1094 + */
1095 + cr4_set_bits(X86_CR4_PGE);
1096 + }
1097
1098 /*
1099 * Load microcode on this cpu if a valid microcode is available.
1100 diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
1101 index 04f89caef9c4..e33b38541be3 100644
1102 --- a/arch/x86/kernel/espfix_64.c
1103 +++ b/arch/x86/kernel/espfix_64.c
1104 @@ -41,6 +41,7 @@
1105 #include <asm/pgalloc.h>
1106 #include <asm/setup.h>
1107 #include <asm/espfix.h>
1108 +#include <asm/kaiser.h>
1109
1110 /*
1111 * Note: we only need 6*8 = 48 bytes for the espfix stack, but round
1112 @@ -126,6 +127,15 @@ void __init init_espfix_bsp(void)
1113 /* Install the espfix pud into the kernel page directory */
1114 pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
1115 pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
1116 + /*
1117 + * Just copy the top-level PGD that is mapping the espfix
1118 + * area to ensure it is mapped into the shadow user page
1119 + * tables.
1120 + */
1121 + if (kaiser_enabled) {
1122 + set_pgd(native_get_shadow_pgd(pgd_p),
1123 + __pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page)));
1124 + }
1125
1126 /* Randomize the locations */
1127 init_espfix_random();
1128 diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
1129 index b4421cc191b0..67cd7c1b99da 100644
1130 --- a/arch/x86/kernel/head_64.S
1131 +++ b/arch/x86/kernel/head_64.S
1132 @@ -190,8 +190,8 @@ ENTRY(secondary_startup_64)
1133 movq $(init_level4_pgt - __START_KERNEL_map), %rax
1134 1:
1135
1136 - /* Enable PAE mode and PGE */
1137 - movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
1138 + /* Enable PAE and PSE, but defer PGE until kaiser_enabled is decided */
1139 + movl $(X86_CR4_PAE | X86_CR4_PSE), %ecx
1140 movq %rcx, %cr4
1141
1142 /* Setup early boot stage 4 level pagetables. */
1143 @@ -405,6 +405,27 @@ GLOBAL(early_recursion_flag)
1144 .balign PAGE_SIZE; \
1145 GLOBAL(name)
1146
1147 +#ifdef CONFIG_PAGE_TABLE_ISOLATION
1148 +/*
1149 + * Each PGD needs to be 8k long and 8k aligned. We do not
1150 + * ever go out to userspace with these, so we do not
1151 + * strictly *need* the second page, but this allows us to
1152 + * have a single set_pgd() implementation that does not
1153 + * need to worry about whether it has 4k or 8k to work
1154 + * with.
1155 + *
1156 + * This ensures PGDs are 8k long:
1157 + */
1158 +#define KAISER_USER_PGD_FILL 512
1159 +/* This ensures they are 8k-aligned: */
1160 +#define NEXT_PGD_PAGE(name) \
1161 + .balign 2 * PAGE_SIZE; \
1162 +GLOBAL(name)
1163 +#else
1164 +#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
1165 +#define KAISER_USER_PGD_FILL 0
1166 +#endif
1167 +
1168 /* Automate the creation of 1 to 1 mapping pmd entries */
1169 #define PMDS(START, PERM, COUNT) \
1170 i = 0 ; \
1171 @@ -414,9 +435,10 @@ GLOBAL(name)
1172 .endr
1173
1174 __INITDATA
1175 -NEXT_PAGE(early_level4_pgt)
1176 +NEXT_PGD_PAGE(early_level4_pgt)
1177 .fill 511,8,0
1178 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
1179 + .fill KAISER_USER_PGD_FILL,8,0
1180
1181 NEXT_PAGE(early_dynamic_pgts)
1182 .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
1183 @@ -424,16 +446,18 @@ NEXT_PAGE(early_dynamic_pgts)
1184 .data
1185
1186 #ifndef CONFIG_XEN
1187 -NEXT_PAGE(init_level4_pgt)
1188 +NEXT_PGD_PAGE(init_level4_pgt)
1189 .fill 512,8,0
1190 + .fill KAISER_USER_PGD_FILL,8,0
1191 #else
1192 -NEXT_PAGE(init_level4_pgt)
1193 +NEXT_PGD_PAGE(init_level4_pgt)
1194 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
1195 .org init_level4_pgt + L4_PAGE_OFFSET*8, 0
1196 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
1197 .org init_level4_pgt + L4_START_KERNEL*8, 0
1198 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
1199 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
1200 + .fill KAISER_USER_PGD_FILL,8,0
1201
1202 NEXT_PAGE(level3_ident_pgt)
1203 .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
1204 @@ -444,6 +468,7 @@ NEXT_PAGE(level2_ident_pgt)
1205 */
1206 PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
1207 #endif
1208 + .fill KAISER_USER_PGD_FILL,8,0
1209
1210 NEXT_PAGE(level3_kernel_pgt)
1211 .fill L3_START_KERNEL,8,0
1212 diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
1213 index 1423ab1b0312..f480b38a03c3 100644
1214 --- a/arch/x86/kernel/irqinit.c
1215 +++ b/arch/x86/kernel/irqinit.c
1216 @@ -51,7 +51,7 @@ static struct irqaction irq2 = {
1217 .flags = IRQF_NO_THREAD,
1218 };
1219
1220 -DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
1221 +DEFINE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq) = {
1222 [0 ... NR_VECTORS - 1] = VECTOR_UNUSED,
1223 };
1224
1225 diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
1226 index 5f70014ca602..8bc68cfc0d33 100644
1227 --- a/arch/x86/kernel/ldt.c
1228 +++ b/arch/x86/kernel/ldt.c
1229 @@ -16,6 +16,7 @@
1230 #include <linux/slab.h>
1231 #include <linux/vmalloc.h>
1232 #include <linux/uaccess.h>
1233 +#include <linux/kaiser.h>
1234
1235 #include <asm/ldt.h>
1236 #include <asm/desc.h>
1237 @@ -34,11 +35,21 @@ static void flush_ldt(void *current_mm)
1238 set_ldt(pc->ldt->entries, pc->ldt->size);
1239 }
1240
1241 +static void __free_ldt_struct(struct ldt_struct *ldt)
1242 +{
1243 + if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
1244 + vfree(ldt->entries);
1245 + else
1246 + free_page((unsigned long)ldt->entries);
1247 + kfree(ldt);
1248 +}
1249 +
1250 /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
1251 static struct ldt_struct *alloc_ldt_struct(int size)
1252 {
1253 struct ldt_struct *new_ldt;
1254 int alloc_size;
1255 + int ret;
1256
1257 if (size > LDT_ENTRIES)
1258 return NULL;
1259 @@ -66,7 +77,13 @@ static struct ldt_struct *alloc_ldt_struct(int size)
1260 return NULL;
1261 }
1262
1263 + ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size,
1264 + __PAGE_KERNEL);
1265 new_ldt->size = size;
1266 + if (ret) {
1267 + __free_ldt_struct(new_ldt);
1268 + return NULL;
1269 + }
1270 return new_ldt;
1271 }
1272
1273 @@ -92,12 +109,10 @@ static void free_ldt_struct(struct ldt_struct *ldt)
1274 if (likely(!ldt))
1275 return;
1276
1277 + kaiser_remove_mapping((unsigned long)ldt->entries,
1278 + ldt->size * LDT_ENTRY_SIZE);
1279 paravirt_free_ldt(ldt->entries, ldt->size);
1280 - if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
1281 - vfree(ldt->entries);
1282 - else
1283 - free_page((unsigned long)ldt->entries);
1284 - kfree(ldt);
1285 + __free_ldt_struct(ldt);
1286 }
1287
1288 /*
1289 diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
1290 index bb3840cedb4f..ee43b36075c7 100644
1291 --- a/arch/x86/kernel/paravirt_patch_64.c
1292 +++ b/arch/x86/kernel/paravirt_patch_64.c
1293 @@ -9,7 +9,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
1294 DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
1295 DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
1296 DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
1297 -DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
1298 DEF_NATIVE(pv_cpu_ops, clts, "clts");
1299 DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
1300
1301 @@ -59,7 +58,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
1302 PATCH_SITE(pv_mmu_ops, read_cr3);
1303 PATCH_SITE(pv_mmu_ops, write_cr3);
1304 PATCH_SITE(pv_cpu_ops, clts);
1305 - PATCH_SITE(pv_mmu_ops, flush_tlb_single);
1306 PATCH_SITE(pv_cpu_ops, wbinvd);
1307 #if defined(CONFIG_PARAVIRT_SPINLOCKS)
1308 case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
1309 diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
1310 index 8e10e72bf6ee..a55b32007785 100644
1311 --- a/arch/x86/kernel/process.c
1312 +++ b/arch/x86/kernel/process.c
1313 @@ -41,7 +41,7 @@
1314 * section. Since TSS's are completely CPU-local, we want them
1315 * on exact cacheline boundaries, to eliminate cacheline ping-pong.
1316 */
1317 -__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
1318 +__visible DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss) = {
1319 .x86_tss = {
1320 .sp0 = TOP_OF_INIT_STACK,
1321 #ifdef CONFIG_X86_32
1322 diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
1323 index feaab07fa124..6b55012d02a3 100644
1324 --- a/arch/x86/kernel/setup.c
1325 +++ b/arch/x86/kernel/setup.c
1326 @@ -114,6 +114,7 @@
1327 #include <asm/microcode.h>
1328 #include <asm/mmu_context.h>
1329 #include <asm/kaslr.h>
1330 +#include <asm/kaiser.h>
1331
1332 /*
1333 * max_low_pfn_mapped: highest direct mapped pfn under 4GB
1334 @@ -1019,6 +1020,12 @@ void __init setup_arch(char **cmdline_p)
1335 */
1336 init_hypervisor_platform();
1337
1338 + /*
1339 + * This needs to happen right after XENPV is set on xen and
1340 + * kaiser_enabled is checked below in cleanup_highmap().
1341 + */
1342 + kaiser_check_boottime_disable();
1343 +
1344 x86_init.resources.probe_roms();
1345
1346 /* after parse_early_param, so could debug it */
1347 diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c
1348 index 1c113db9ed57..2bb5ee464df3 100644
1349 --- a/arch/x86/kernel/tracepoint.c
1350 +++ b/arch/x86/kernel/tracepoint.c
1351 @@ -9,10 +9,12 @@
1352 #include <linux/atomic.h>
1353
1354 atomic_t trace_idt_ctr = ATOMIC_INIT(0);
1355 +__aligned(PAGE_SIZE)
1356 struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1,
1357 (unsigned long) trace_idt_table };
1358
1359 /* No need to be aligned, but done to keep all IDTs defined the same way. */
1360 +__aligned(PAGE_SIZE)
1361 gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss;
1362
1363 static int trace_irq_vector_refcount;
1364 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
1365 index 7e28e6c877d9..73304b1a03cc 100644
1366 --- a/arch/x86/kvm/x86.c
1367 +++ b/arch/x86/kvm/x86.c
1368 @@ -773,7 +773,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1369 return 1;
1370
1371 /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
1372 - if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
1373 + if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_ASID_MASK) ||
1374 + !is_long_mode(vcpu))
1375 return 1;
1376 }
1377
1378 diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c
1379 index 5cc78bf57232..3261abb21ef4 100644
1380 --- a/arch/x86/lib/cmdline.c
1381 +++ b/arch/x86/lib/cmdline.c
1382 @@ -104,7 +104,112 @@ __cmdline_find_option_bool(const char *cmdline, int max_cmdline_size,
1383 return 0; /* Buffer overrun */
1384 }
1385
1386 +/*
1387 + * Find a non-boolean option (i.e. option=argument). In accordance with
1388 + * standard Linux practice, if this option is repeated, this returns the
1389 + * last instance on the command line.
1390 + *
1391 + * @cmdline: the cmdline string
1392 + * @max_cmdline_size: the maximum size of cmdline
1393 + * @option: option string to look for
1394 + * @buffer: memory buffer to return the option argument
1395 + * @bufsize: size of the supplied memory buffer
1396 + *
1397 + * Returns the length of the argument (regardless of if it was
1398 + * truncated to fit in the buffer), or -1 on not found.
1399 + */
1400 +static int
1401 +__cmdline_find_option(const char *cmdline, int max_cmdline_size,
1402 + const char *option, char *buffer, int bufsize)
1403 +{
1404 + char c;
1405 + int pos = 0, len = -1;
1406 + const char *opptr = NULL;
1407 + char *bufptr = buffer;
1408 + enum {
1409 + st_wordstart = 0, /* Start of word/after whitespace */
1410 + st_wordcmp, /* Comparing this word */
1411 + st_wordskip, /* Miscompare, skip */
1412 + st_bufcpy, /* Copying this to buffer */
1413 + } state = st_wordstart;
1414 +
1415 + if (!cmdline)
1416 + return -1; /* No command line */
1417 +
1418 + /*
1419 + * This 'pos' check ensures we do not overrun
1420 + * a non-NULL-terminated 'cmdline'
1421 + */
1422 + while (pos++ < max_cmdline_size) {
1423 + c = *(char *)cmdline++;
1424 + if (!c)
1425 + break;
1426 +
1427 + switch (state) {
1428 + case st_wordstart:
1429 + if (myisspace(c))
1430 + break;
1431 +
1432 + state = st_wordcmp;
1433 + opptr = option;
1434 + /* fall through */
1435 +
1436 + case st_wordcmp:
1437 + if ((c == '=') && !*opptr) {
1438 + /*
1439 + * We matched all the way to the end of the
1440 + * option we were looking for, prepare to
1441 + * copy the argument.
1442 + */
1443 + len = 0;
1444 + bufptr = buffer;
1445 + state = st_bufcpy;
1446 + break;
1447 + } else if (c == *opptr++) {
1448 + /*
1449 + * We are currently matching, so continue
1450 + * to the next character on the cmdline.
1451 + */
1452 + break;
1453 + }
1454 + state = st_wordskip;
1455 + /* fall through */
1456 +
1457 + case st_wordskip:
1458 + if (myisspace(c))
1459 + state = st_wordstart;
1460 + break;
1461 +
1462 + case st_bufcpy:
1463 + if (myisspace(c)) {
1464 + state = st_wordstart;
1465 + } else {
1466 + /*
1467 + * Increment len, but don't overrun the
1468 + * supplied buffer and leave room for the
1469 + * NULL terminator.
1470 + */
1471 + if (++len < bufsize)
1472 + *bufptr++ = c;
1473 + }
1474 + break;
1475 + }
1476 + }
1477 +
1478 + if (bufsize)
1479 + *bufptr = '\0';
1480 +
1481 + return len;
1482 +}
1483 +
1484 int cmdline_find_option_bool(const char *cmdline, const char *option)
1485 {
1486 return __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option);
1487 }
1488 +
1489 +int cmdline_find_option(const char *cmdline, const char *option, char *buffer,
1490 + int bufsize)
1491 +{
1492 + return __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option,
1493 + buffer, bufsize);
1494 +}
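
cmdline_find_option() above gives early boot code a way to read option=value parameters such as the pti= switch documented at the top of this patch, before the normal early_param() machinery runs. The body of kaiser_check_boottime_disable() (declared in kaiser.h earlier in this patch) is not part of this excerpt, so the sketch below is only a plausible illustration of the documented pti=on/off/auto and nopti semantics, with simplified user-space stand-ins for the two cmdline helpers; the enabled-by-default fallback matches kaiser_enabled = 1 in kaiser.c later in the patch:

    #include <stdio.h>
    #include <string.h>
    #include <stdbool.h>

    /* Simplified stand-ins for cmdline_find_option()/cmdline_find_option_bool();
     * they only handle space-separated options and skip the word-boundary
     * care the real state machine takes, which is enough for this demo. */
    static int find_option(const char *cmdline, const char *opt,
                           char *buf, int bufsize)
    {
        char key[32];
        const char *p, *hit = NULL;
        int len;

        snprintf(key, sizeof(key), "%s=", opt);
        for (p = strstr(cmdline, key); p; p = strstr(p + 1, key))
            hit = p;                    /* last instance wins, as documented */
        if (!hit)
            return -1;
        hit += strlen(key);
        len = (int)strcspn(hit, " ");
        snprintf(buf, (size_t)bufsize, "%.*s", len, hit);
        return len;
    }

    static bool find_option_bool(const char *cmdline, const char *opt)
    {
        char padded[256], key[32];

        snprintf(padded, sizeof(padded), " %s ", cmdline);
        snprintf(key, sizeof(key), " %s ", opt);
        return strstr(padded, key) != NULL;
    }

    /* Plausible decision logic for the documented pti=/nopti semantics;
     * this reflects the documentation hunk, not the kernel's real
     * kaiser_check_boottime_disable() body. */
    static bool pti_enabled(const char *cmdline)
    {
        char arg[8];

        if (find_option(cmdline, "pti", arg, sizeof(arg)) > 0) {
            if (!strcmp(arg, "on"))
                return true;
            if (!strcmp(arg, "off"))
                return false;
            /* "auto" or anything else: fall through to the default */
        }
        if (find_option_bool(cmdline, "nopti"))
            return false;
        return true;                    /* default: isolation enabled */
    }

    int main(void)
    {
        const char *tests[] = {
            "root=/dev/sda1 quiet",
            "root=/dev/sda1 pti=off",
            "root=/dev/sda1 nopti",
            "root=/dev/sda1 pti=auto",
        };
        unsigned int i;

        for (i = 0; i < sizeof(tests) / sizeof(tests[0]); i++)
            printf("%-28s -> page-table isolation %s\n", tests[i],
                   pti_enabled(tests[i]) ? "on" : "off");
        return 0;
    }
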
1495 diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
1496 index 96d2b847e09e..c548b46100cb 100644
1497 --- a/arch/x86/mm/Makefile
1498 +++ b/arch/x86/mm/Makefile
1499 @@ -37,5 +37,5 @@ obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
1500
1501 obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
1502 obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
1503 -obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
1504 -
1505 +obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
1506 +obj-$(CONFIG_PAGE_TABLE_ISOLATION) += kaiser.o
1507 diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
1508 index 0381638168d1..1e779bca4f3e 100644
1509 --- a/arch/x86/mm/init.c
1510 +++ b/arch/x86/mm/init.c
1511 @@ -177,7 +177,7 @@ static void __init probe_page_size_mask(void)
1512 cr4_set_bits_and_update_boot(X86_CR4_PSE);
1513
1514 /* Enable PGE if available */
1515 - if (boot_cpu_has(X86_FEATURE_PGE)) {
1516 + if (boot_cpu_has(X86_FEATURE_PGE) && !kaiser_enabled) {
1517 cr4_set_bits_and_update_boot(X86_CR4_PGE);
1518 __supported_pte_mask |= _PAGE_GLOBAL;
1519 } else
1520 diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
1521 index 3e27ded6ac65..7df8e3a79dc0 100644
1522 --- a/arch/x86/mm/init_64.c
1523 +++ b/arch/x86/mm/init_64.c
1524 @@ -324,6 +324,16 @@ void __init cleanup_highmap(void)
1525 continue;
1526 if (vaddr < (unsigned long) _text || vaddr > end)
1527 set_pmd(pmd, __pmd(0));
1528 + else if (kaiser_enabled) {
1529 + /*
1530 + * level2_kernel_pgt is initialized with _PAGE_GLOBAL:
1531 + * clear that now. This is not important, so long as
1532 + * CR4.PGE remains clear, but it removes an anomaly.
1533 + * Physical mapping setup below avoids _PAGE_GLOBAL
1534 + * by use of massage_pgprot() inside pfn_pte() etc.
1535 + */
1536 + set_pmd(pmd, pmd_clear_flags(*pmd, _PAGE_GLOBAL));
1537 + }
1538 }
1539 }
1540
1541 diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
1542 new file mode 100644
1543 index 000000000000..d8376b4ad9f0
1544 --- /dev/null
1545 +++ b/arch/x86/mm/kaiser.c
1546 @@ -0,0 +1,455 @@
1547 +#include <linux/bug.h>
1548 +#include <linux/kernel.h>
1549 +#include <linux/errno.h>
1550 +#include <linux/string.h>
1551 +#include <linux/types.h>
1552 +#include <linux/bug.h>
1553 +#include <linux/init.h>
1554 +#include <linux/interrupt.h>
1555 +#include <linux/spinlock.h>
1556 +#include <linux/mm.h>
1557 +#include <linux/uaccess.h>
1558 +
1559 +#undef pr_fmt
1560 +#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt
1561 +
1562 +#include <asm/kaiser.h>
1563 +#include <asm/tlbflush.h> /* to verify its kaiser declarations */
1564 +#include <asm/pgtable.h>
1565 +#include <asm/pgalloc.h>
1566 +#include <asm/desc.h>
1567 +#include <asm/cmdline.h>
1568 +
1569 +int kaiser_enabled __read_mostly = 1;
1570 +EXPORT_SYMBOL(kaiser_enabled); /* for inlined TLB flush functions */
1571 +
1572 +__visible
1573 +DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
1574 +
1575 +/*
1576 + * These can have bit 63 set, so we can not just use a plain "or"
1577 + * instruction to get their value or'd into CR3. It would take
1578 + * another register. So, we use a memory reference to these instead.
1579 + *
1580 + * This is also handy because systems that do not support PCIDs
1581 + * just end up or'ing a 0 into their CR3, which does no harm.
1582 + */
1583 +DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user);
1584 +
1585 +/*
1586 + * At runtime, the only things we map are some things for CPU
1587 + * hotplug, and stacks for new processes. No two CPUs will ever
1588 + * be populating the same addresses, so we only need to ensure
1589 + * that we protect between two CPUs trying to allocate and
1590 + * populate the same page table page.
1591 + *
1592 + * Only take this lock when doing a set_p[4um]d(), but it is not
1593 + * needed for doing a set_pte(). We assume that only the *owner*
1594 + * of a given allocation will be doing this for _their_
1595 + * allocation.
1596 + *
1597 + * This ensures that once a system has been running for a while
1598 + * and there have been stacks all over and these page tables
1599 + * are fully populated, there will be no further acquisitions of
1600 + * this lock.
1601 + */
1602 +static DEFINE_SPINLOCK(shadow_table_allocation_lock);
1603 +
1604 +/*
1605 + * Returns -1 on error.
1606 + */
1607 +static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
1608 +{
1609 + pgd_t *pgd;
1610 + pud_t *pud;
1611 + pmd_t *pmd;
1612 + pte_t *pte;
1613 +
1614 + pgd = pgd_offset_k(vaddr);
1615 + /*
1616 + * We made all the kernel PGDs present in kaiser_init().
1617 + * We expect them to stay that way.
1618 + */
1619 + BUG_ON(pgd_none(*pgd));
1620 + /*
1621 + * PGDs are either 512GB or 128TB on all x86_64
1622 + * configurations. We don't handle these.
1623 + */
1624 + BUG_ON(pgd_large(*pgd));
1625 +
1626 + pud = pud_offset(pgd, vaddr);
1627 + if (pud_none(*pud)) {
1628 + WARN_ON_ONCE(1);
1629 + return -1;
1630 + }
1631 +
1632 + if (pud_large(*pud))
1633 + return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);
1634 +
1635 + pmd = pmd_offset(pud, vaddr);
1636 + if (pmd_none(*pmd)) {
1637 + WARN_ON_ONCE(1);
1638 + return -1;
1639 + }
1640 +
1641 + if (pmd_large(*pmd))
1642 + return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);
1643 +
1644 + pte = pte_offset_kernel(pmd, vaddr);
1645 + if (pte_none(*pte)) {
1646 + WARN_ON_ONCE(1);
1647 + return -1;
1648 + }
1649 +
1650 + return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
1651 +}
1652 +
1653 +/*
1654 + * This is a relatively normal page table walk, except that it
1655 + * also tries to allocate page tables pages along the way.
1656 + *
1657 + * Returns a pointer to a PTE on success, or NULL on failure.
1658 + */
1659 +static pte_t *kaiser_pagetable_walk(unsigned long address)
1660 +{
1661 + pmd_t *pmd;
1662 + pud_t *pud;
1663 + pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
1664 + gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
1665 +
1666 + if (pgd_none(*pgd)) {
1667 + WARN_ONCE(1, "All shadow pgds should have been populated");
1668 + return NULL;
1669 + }
1670 + BUILD_BUG_ON(pgd_large(*pgd) != 0);
1671 +
1672 + pud = pud_offset(pgd, address);
1673 + /* The shadow page tables do not use large mappings: */
1674 + if (pud_large(*pud)) {
1675 + WARN_ON(1);
1676 + return NULL;
1677 + }
1678 + if (pud_none(*pud)) {
1679 + unsigned long new_pmd_page = __get_free_page(gfp);
1680 + if (!new_pmd_page)
1681 + return NULL;
1682 + spin_lock(&shadow_table_allocation_lock);
1683 + if (pud_none(*pud)) {
1684 + set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
1685 + __inc_zone_page_state(virt_to_page((void *)
1686 + new_pmd_page), NR_KAISERTABLE);
1687 + } else
1688 + free_page(new_pmd_page);
1689 + spin_unlock(&shadow_table_allocation_lock);
1690 + }
1691 +
1692 + pmd = pmd_offset(pud, address);
1693 + /* The shadow page tables do not use large mappings: */
1694 + if (pmd_large(*pmd)) {
1695 + WARN_ON(1);
1696 + return NULL;
1697 + }
1698 + if (pmd_none(*pmd)) {
1699 + unsigned long new_pte_page = __get_free_page(gfp);
1700 + if (!new_pte_page)
1701 + return NULL;
1702 + spin_lock(&shadow_table_allocation_lock);
1703 + if (pmd_none(*pmd)) {
1704 + set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
1705 + __inc_zone_page_state(virt_to_page((void *)
1706 + new_pte_page), NR_KAISERTABLE);
1707 + } else
1708 + free_page(new_pte_page);
1709 + spin_unlock(&shadow_table_allocation_lock);
1710 + }
1711 +
1712 + return pte_offset_kernel(pmd, address);
1713 +}
1714 +
1715 +static int kaiser_add_user_map(const void *__start_addr, unsigned long size,
1716 + unsigned long flags)
1717 +{
1718 + int ret = 0;
1719 + pte_t *pte;
1720 + unsigned long start_addr = (unsigned long )__start_addr;
1721 + unsigned long address = start_addr & PAGE_MASK;
1722 + unsigned long end_addr = PAGE_ALIGN(start_addr + size);
1723 + unsigned long target_address;
1724 +
1725 + /*
1726 + * It is convenient for callers to pass in __PAGE_KERNEL etc,
1727 + * and there is no actual harm from setting _PAGE_GLOBAL, so
1728 + * long as CR4.PGE is not set. But it is nonetheless troubling
1729 + * to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser"
1730 + * requires that not to be #defined to 0): so mask it off here.
1731 + */
1732 + flags &= ~_PAGE_GLOBAL;
1733 +
1734 + for (; address < end_addr; address += PAGE_SIZE) {
1735 + target_address = get_pa_from_mapping(address);
1736 + if (target_address == -1) {
1737 + ret = -EIO;
1738 + break;
1739 + }
1740 + pte = kaiser_pagetable_walk(address);
1741 + if (!pte) {
1742 + ret = -ENOMEM;
1743 + break;
1744 + }
1745 + if (pte_none(*pte)) {
1746 + set_pte(pte, __pte(flags | target_address));
1747 + } else {
1748 + pte_t tmp;
1749 + set_pte(&tmp, __pte(flags | target_address));
1750 + WARN_ON_ONCE(!pte_same(*pte, tmp));
1751 + }
1752 + }
1753 + return ret;
1754 +}
1755 +
1756 +static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags)
1757 +{
1758 + unsigned long size = end - start;
1759 +
1760 + return kaiser_add_user_map(start, size, flags);
1761 +}
1762 +
1763 +/*
1764 + * Ensure that the top level of the (shadow) page tables are
1765 + * entirely populated. This ensures that all processes that get
1766 + * forked have the same entries. This way, we do not have to
1767 + * ever go set up new entries in older processes.
1768 + *
1769 + * Note: we never free these, so there are no updates to them
1770 + * after this.
1771 + */
1772 +static void __init kaiser_init_all_pgds(void)
1773 +{
1774 + pgd_t *pgd;
1775 + int i = 0;
1776 +
1777 + pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0));
1778 + for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
1779 + pgd_t new_pgd;
1780 + pud_t *pud = pud_alloc_one(&init_mm,
1781 + PAGE_OFFSET + i * PGDIR_SIZE);
1782 + if (!pud) {
1783 + WARN_ON(1);
1784 + break;
1785 + }
1786 + inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE);
1787 + new_pgd = __pgd(_KERNPG_TABLE |__pa(pud));
1788 + /*
1789 + * Make sure not to stomp on some other pgd entry.
1790 + */
1791 + if (!pgd_none(pgd[i])) {
1792 + WARN_ON(1);
1793 + continue;
1794 + }
1795 + set_pgd(pgd + i, new_pgd);
1796 + }
1797 +}
1798 +
1799 +#define kaiser_add_user_map_early(start, size, flags) do { \
1800 + int __ret = kaiser_add_user_map(start, size, flags); \
1801 + WARN_ON(__ret); \
1802 +} while (0)
1803 +
1804 +#define kaiser_add_user_map_ptrs_early(start, end, flags) do { \
1805 + int __ret = kaiser_add_user_map_ptrs(start, end, flags); \
1806 + WARN_ON(__ret); \
1807 +} while (0)
1808 +
1809 +void __init kaiser_check_boottime_disable(void)
1810 +{
1811 + bool enable = true;
1812 + char arg[5];
1813 + int ret;
1814 +
1815 + if (boot_cpu_has(X86_FEATURE_XENPV))
1816 + goto silent_disable;
1817 +
1818 + ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
1819 + if (ret > 0) {
1820 + if (!strncmp(arg, "on", 2))
1821 + goto enable;
1822 +
1823 + if (!strncmp(arg, "off", 3))
1824 + goto disable;
1825 +
1826 + if (!strncmp(arg, "auto", 4))
1827 + goto skip;
1828 + }
1829 +
1830 + if (cmdline_find_option_bool(boot_command_line, "nopti"))
1831 + goto disable;
1832 +
1833 +skip:
1834 + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
1835 + goto disable;
1836 +
1837 +enable:
1838 + if (enable)
1839 + setup_force_cpu_cap(X86_FEATURE_KAISER);
1840 +
1841 + return;
1842 +
1843 +disable:
1844 + pr_info("disabled\n");
1845 +
1846 +silent_disable:
1847 + kaiser_enabled = 0;
1848 + setup_clear_cpu_cap(X86_FEATURE_KAISER);
1849 +}
1850 +
1851 +/*
1852 + * If anything in here fails, we will likely die on one of the
1853 + * first kernel->user transitions and init will die. But, we
1854 + * will have most of the kernel up by then and should be able to
1855 + * get a clean warning out of it. If we BUG_ON() here, we run
1856 + * the risk of crashing before we have good console output.
1857 + */
1858 +void __init kaiser_init(void)
1859 +{
1860 + int cpu;
1861 +
1862 + if (!kaiser_enabled)
1863 + return;
1864 +
1865 + kaiser_init_all_pgds();
1866 +
1867 + for_each_possible_cpu(cpu) {
1868 + void *percpu_vaddr = __per_cpu_user_mapped_start +
1869 + per_cpu_offset(cpu);
1870 + unsigned long percpu_sz = __per_cpu_user_mapped_end -
1871 + __per_cpu_user_mapped_start;
1872 + kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
1873 + __PAGE_KERNEL);
1874 + }
1875 +
1876 + /*
1877 + * Map the entry/exit text section, which is needed at
1878 + * switches from user to and from kernel.
1879 + */
1880 + kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
1881 + __PAGE_KERNEL_RX);
1882 +
1883 +#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
1884 + kaiser_add_user_map_ptrs_early(__irqentry_text_start,
1885 + __irqentry_text_end,
1886 + __PAGE_KERNEL_RX);
1887 +#endif
1888 + kaiser_add_user_map_early((void *)idt_descr.address,
1889 + sizeof(gate_desc) * NR_VECTORS,
1890 + __PAGE_KERNEL_RO);
1891 +#ifdef CONFIG_TRACING
1892 + kaiser_add_user_map_early(&trace_idt_descr,
1893 + sizeof(trace_idt_descr),
1894 + __PAGE_KERNEL);
1895 + kaiser_add_user_map_early(&trace_idt_table,
1896 + sizeof(gate_desc) * NR_VECTORS,
1897 + __PAGE_KERNEL);
1898 +#endif
1899 + kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr),
1900 + __PAGE_KERNEL);
1901 + kaiser_add_user_map_early(&debug_idt_table,
1902 + sizeof(gate_desc) * NR_VECTORS,
1903 + __PAGE_KERNEL);
1904 +
1905 + pr_info("enabled\n");
1906 +}
1907 +
1908 +/* Add a mapping to the shadow mapping, and synchronize the mappings */
1909 +int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
1910 +{
1911 + if (!kaiser_enabled)
1912 + return 0;
1913 + return kaiser_add_user_map((const void *)addr, size, flags);
1914 +}
1915 +
1916 +void kaiser_remove_mapping(unsigned long start, unsigned long size)
1917 +{
1918 + extern void unmap_pud_range_nofree(pgd_t *pgd,
1919 + unsigned long start, unsigned long end);
1920 + unsigned long end = start + size;
1921 + unsigned long addr, next;
1922 + pgd_t *pgd;
1923 +
1924 + if (!kaiser_enabled)
1925 + return;
1926 + pgd = native_get_shadow_pgd(pgd_offset_k(start));
1927 + for (addr = start; addr < end; pgd++, addr = next) {
1928 + next = pgd_addr_end(addr, end);
1929 + unmap_pud_range_nofree(pgd, addr, next);
1930 + }
1931 +}
1932 +
1933 +/*
1934 + * Page table pages are page-aligned. The lower half of the top
1935 + * level is used for userspace and the top half for the kernel.
1936 + * This returns true for user pages that need to get copied into
1937 + * both the user and kernel copies of the page tables, and false
1938 + * for kernel pages that should only be in the kernel copy.
1939 + */
1940 +static inline bool is_userspace_pgd(pgd_t *pgdp)
1941 +{
1942 + return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2);
1943 +}
1944 +
1945 +pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
1946 +{
1947 + if (!kaiser_enabled)
1948 + return pgd;
1949 + /*
1950 + * Do we need to also populate the shadow pgd? Check _PAGE_USER to
1951 + * skip cases like kexec and EFI which make temporary low mappings.
1952 + */
1953 + if (pgd.pgd & _PAGE_USER) {
1954 + if (is_userspace_pgd(pgdp)) {
1955 + native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
1956 + /*
1957 + * Even if the entry is *mapping* userspace, ensure
1958 + * that userspace can not use it. This way, if we
1959 + * get out to userspace running on the kernel CR3,
1960 + * userspace will crash instead of running.
1961 + */
1962 + if (__supported_pte_mask & _PAGE_NX)
1963 + pgd.pgd |= _PAGE_NX;
1964 + }
1965 + } else if (!pgd.pgd) {
1966 + /*
1967 + * pgd_clear() cannot check _PAGE_USER, and is even used to
1968 + * clear corrupted pgd entries: so just rely on cases like
1969 + * kexec and EFI never to be using pgd_clear().
1970 + */
1971 + if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) &&
1972 + is_userspace_pgd(pgdp))
1973 + native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
1974 + }
1975 + return pgd;
1976 +}
1977 +
1978 +void kaiser_setup_pcid(void)
1979 +{
1980 + unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET;
1981 +
1982 + if (this_cpu_has(X86_FEATURE_PCID))
1983 + user_cr3 |= X86_CR3_PCID_USER_NOFLUSH;
1984 + /*
1985 + * These variables are used by the entry/exit
1986 + * code to change PCID and pgd and TLB flushing.
1987 + */
1988 + this_cpu_write(x86_cr3_pcid_user, user_cr3);
1989 +}
1990 +
1991 +/*
1992 + * Make a note that this cpu will need to flush USER tlb on return to user.
1993 + * If cpu does not have PCID, then the NOFLUSH bit will never have been set.
1994 + */
1995 +void kaiser_flush_tlb_on_return_to_user(void)
1996 +{
1997 + if (this_cpu_has(X86_FEATURE_PCID))
1998 + this_cpu_write(x86_cr3_pcid_user,
1999 + X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
2000 +}
2001 +EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);
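
For illustration (not part of the patch): a minimal standalone sketch of the range arithmetic kaiser_add_user_map() uses above, rounding the start down and the end up to page boundaries before mapping one page at a time. The address and the 4 KiB PAGE_SIZE are assumed demo values, and the kernel's shadow page-table walk is replaced by a printout.

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & PAGE_MASK)

/* Visit every page covering [start, start + size), as the mapping
 * loop in kaiser_add_user_map() does. */
static void walk_user_map_range(unsigned long start, unsigned long size)
{
        unsigned long addr = start & PAGE_MASK;
        unsigned long end = PAGE_ALIGN(start + size);

        for (; addr < end; addr += PAGE_SIZE)
                printf("would map page at 0x%lx\n", addr);
}

int main(void)
{
        /* A 256-byte object straddling a page boundary maps two pages. */
        walk_user_map_range(0xffffffff81001f80UL, 0x100);
        return 0;
}
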
2002 diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
2003 index aed206475aa7..319183d93602 100644
2004 --- a/arch/x86/mm/kaslr.c
2005 +++ b/arch/x86/mm/kaslr.c
2006 @@ -189,6 +189,6 @@ void __meminit init_trampoline(void)
2007 *pud_tramp = *pud;
2008 }
2009
2010 - set_pgd(&trampoline_pgd_entry,
2011 - __pgd(_KERNPG_TABLE | __pa(pud_page_tramp)));
2012 + /* Avoid set_pgd(), in case it's complicated by CONFIG_PAGE_TABLE_ISOLATION */
2013 + trampoline_pgd_entry = __pgd(_KERNPG_TABLE | __pa(pud_page_tramp));
2014 }
2015 diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
2016 index e3353c97d086..73dcb0e18c1b 100644
2017 --- a/arch/x86/mm/pageattr.c
2018 +++ b/arch/x86/mm/pageattr.c
2019 @@ -52,6 +52,7 @@ static DEFINE_SPINLOCK(cpa_lock);
2020 #define CPA_FLUSHTLB 1
2021 #define CPA_ARRAY 2
2022 #define CPA_PAGES_ARRAY 4
2023 +#define CPA_FREE_PAGETABLES 8
2024
2025 #ifdef CONFIG_PROC_FS
2026 static unsigned long direct_pages_count[PG_LEVEL_NUM];
2027 @@ -729,10 +730,13 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
2028 return 0;
2029 }
2030
2031 -static bool try_to_free_pte_page(pte_t *pte)
2032 +static bool try_to_free_pte_page(struct cpa_data *cpa, pte_t *pte)
2033 {
2034 int i;
2035
2036 + if (!(cpa->flags & CPA_FREE_PAGETABLES))
2037 + return false;
2038 +
2039 for (i = 0; i < PTRS_PER_PTE; i++)
2040 if (!pte_none(pte[i]))
2041 return false;
2042 @@ -741,10 +745,13 @@ static bool try_to_free_pte_page(pte_t *pte)
2043 return true;
2044 }
2045
2046 -static bool try_to_free_pmd_page(pmd_t *pmd)
2047 +static bool try_to_free_pmd_page(struct cpa_data *cpa, pmd_t *pmd)
2048 {
2049 int i;
2050
2051 + if (!(cpa->flags & CPA_FREE_PAGETABLES))
2052 + return false;
2053 +
2054 for (i = 0; i < PTRS_PER_PMD; i++)
2055 if (!pmd_none(pmd[i]))
2056 return false;
2057 @@ -753,7 +760,9 @@ static bool try_to_free_pmd_page(pmd_t *pmd)
2058 return true;
2059 }
2060
2061 -static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
2062 +static bool unmap_pte_range(struct cpa_data *cpa, pmd_t *pmd,
2063 + unsigned long start,
2064 + unsigned long end)
2065 {
2066 pte_t *pte = pte_offset_kernel(pmd, start);
2067
2068 @@ -764,22 +773,23 @@ static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
2069 pte++;
2070 }
2071
2072 - if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
2073 + if (try_to_free_pte_page(cpa, (pte_t *)pmd_page_vaddr(*pmd))) {
2074 pmd_clear(pmd);
2075 return true;
2076 }
2077 return false;
2078 }
2079
2080 -static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
2081 +static void __unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, pmd_t *pmd,
2082 unsigned long start, unsigned long end)
2083 {
2084 - if (unmap_pte_range(pmd, start, end))
2085 - if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
2086 + if (unmap_pte_range(cpa, pmd, start, end))
2087 + if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
2088 pud_clear(pud);
2089 }
2090
2091 -static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
2092 +static void unmap_pmd_range(struct cpa_data *cpa, pud_t *pud,
2093 + unsigned long start, unsigned long end)
2094 {
2095 pmd_t *pmd = pmd_offset(pud, start);
2096
2097 @@ -790,7 +800,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
2098 unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
2099 unsigned long pre_end = min_t(unsigned long, end, next_page);
2100
2101 - __unmap_pmd_range(pud, pmd, start, pre_end);
2102 + __unmap_pmd_range(cpa, pud, pmd, start, pre_end);
2103
2104 start = pre_end;
2105 pmd++;
2106 @@ -803,7 +813,8 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
2107 if (pmd_large(*pmd))
2108 pmd_clear(pmd);
2109 else
2110 - __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
2111 + __unmap_pmd_range(cpa, pud, pmd,
2112 + start, start + PMD_SIZE);
2113
2114 start += PMD_SIZE;
2115 pmd++;
2116 @@ -813,17 +824,19 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
2117 * 4K leftovers?
2118 */
2119 if (start < end)
2120 - return __unmap_pmd_range(pud, pmd, start, end);
2121 + return __unmap_pmd_range(cpa, pud, pmd, start, end);
2122
2123 /*
2124 * Try again to free the PMD page if haven't succeeded above.
2125 */
2126 if (!pud_none(*pud))
2127 - if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
2128 + if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
2129 pud_clear(pud);
2130 }
2131
2132 -static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
2133 +static void __unmap_pud_range(struct cpa_data *cpa, pgd_t *pgd,
2134 + unsigned long start,
2135 + unsigned long end)
2136 {
2137 pud_t *pud = pud_offset(pgd, start);
2138
2139 @@ -834,7 +847,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
2140 unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
2141 unsigned long pre_end = min_t(unsigned long, end, next_page);
2142
2143 - unmap_pmd_range(pud, start, pre_end);
2144 + unmap_pmd_range(cpa, pud, start, pre_end);
2145
2146 start = pre_end;
2147 pud++;
2148 @@ -848,7 +861,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
2149 if (pud_large(*pud))
2150 pud_clear(pud);
2151 else
2152 - unmap_pmd_range(pud, start, start + PUD_SIZE);
2153 + unmap_pmd_range(cpa, pud, start, start + PUD_SIZE);
2154
2155 start += PUD_SIZE;
2156 pud++;
2157 @@ -858,7 +871,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
2158 * 2M leftovers?
2159 */
2160 if (start < end)
2161 - unmap_pmd_range(pud, start, end);
2162 + unmap_pmd_range(cpa, pud, start, end);
2163
2164 /*
2165 * No need to try to free the PUD page because we'll free it in
2166 @@ -866,6 +879,24 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
2167 */
2168 }
2169
2170 +static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
2171 +{
2172 + struct cpa_data cpa = {
2173 + .flags = CPA_FREE_PAGETABLES,
2174 + };
2175 +
2176 + __unmap_pud_range(&cpa, pgd, start, end);
2177 +}
2178 +
2179 +void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end)
2180 +{
2181 + struct cpa_data cpa = {
2182 + .flags = 0,
2183 + };
2184 +
2185 + __unmap_pud_range(&cpa, pgd, start, end);
2186 +}
2187 +
2188 static int alloc_pte_page(pmd_t *pmd)
2189 {
2190 pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
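
For illustration (not part of the patch): the hunks above thread a CPA_FREE_PAGETABLES flag through the unmap helpers so that unmap_pud_range_nofree(), used by kaiser_remove_mapping(), clears entries but leaves the intermediate table pages in place. A toy standalone sketch of that flag-gated pattern follows; the structures and messages are made up for the demo.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define CPA_FREE_PAGETABLES 8

struct toy_cpa {
        unsigned int flags;
};

/* Like try_to_free_pte_page(): reclaim the table only if the caller
 * allowed it and every slot is already empty. */
static bool try_to_free_table(struct toy_cpa *cpa, void **table, int nslots)
{
        if (!(cpa->flags & CPA_FREE_PAGETABLES))
                return false;
        for (int i = 0; i < nslots; i++)
                if (table[i])
                        return false;
        free(table);
        return true;
}

static void unmap_range(struct toy_cpa *cpa, void **table, int nslots)
{
        for (int i = 0; i < nslots; i++)
                table[i] = NULL;        /* clear the "PTEs" */
        printf("table %s\n",
               try_to_free_table(cpa, table, nslots) ? "freed" : "kept");
}

int main(void)
{
        struct toy_cpa freeing = { .flags = CPA_FREE_PAGETABLES };
        struct toy_cpa nofree = { .flags = 0 };
        void **table;

        table = calloc(4, sizeof(*table));
        if (!table)
                return 1;
        unmap_range(&freeing, table, 4);        /* prints "table freed" */

        table = calloc(4, sizeof(*table));
        if (!table)
                return 1;
        unmap_range(&nofree, table, 4);         /* prints "table kept" */
        free(table);
        return 0;
}
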
2191 diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
2192 index 3feec5af4e67..5aaec8effc5f 100644
2193 --- a/arch/x86/mm/pgtable.c
2194 +++ b/arch/x86/mm/pgtable.c
2195 @@ -344,14 +344,22 @@ static inline void _pgd_free(pgd_t *pgd)
2196 kmem_cache_free(pgd_cache, pgd);
2197 }
2198 #else
2199 +
2200 +/*
2201 + * Instead of one pgd, Kaiser acquires two pgds. Being order-1, it is
2202 + * both 8k in size and 8k-aligned. That lets us just flip bit 12
2203 + * in a pointer to swap between the two 4k halves.
2204 + */
2205 +#define PGD_ALLOCATION_ORDER kaiser_enabled
2206 +
2207 static inline pgd_t *_pgd_alloc(void)
2208 {
2209 - return (pgd_t *)__get_free_page(PGALLOC_GFP);
2210 + return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);
2211 }
2212
2213 static inline void _pgd_free(pgd_t *pgd)
2214 {
2215 - free_page((unsigned long)pgd);
2216 + free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
2217 }
2218 #endif /* CONFIG_X86_PAE */
2219
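
For illustration (not part of the patch): a standalone sketch of the bit-12 trick described in the comment above. aligned_alloc() stands in for the order-1 __get_free_pages() call; which half holds which copy (kernel first, shadow second) follows the comment and is otherwise an assumption of the demo.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096UL

int main(void)
{
        /* Stand-in for the order-1 pgd allocation: 8 KiB, 8 KiB-aligned,
         * kernel copy in the first 4 KiB, shadow copy in the second. */
        uint64_t *kernel_pgd = aligned_alloc(2 * PAGE_SIZE, 2 * PAGE_SIZE);
        uint64_t *shadow_pgd;

        if (!kernel_pgd)
                return 1;

        /* Bit 12 of the kernel copy's address is clear thanks to the
         * 8 KiB alignment, so setting it lands in the shadow half. */
        shadow_pgd = (uint64_t *)((uintptr_t)kernel_pgd | PAGE_SIZE);

        printf("kernel pgd at %p\n", (void *)kernel_pgd);
        printf("shadow pgd at %p (+0x%lx)\n", (void *)shadow_pgd,
               (unsigned long)((uintptr_t)shadow_pgd - (uintptr_t)kernel_pgd));

        free(kernel_pgd);
        return 0;
}
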
2220 diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
2221 index 53b72fb4e781..41205de487e7 100644
2222 --- a/arch/x86/mm/tlb.c
2223 +++ b/arch/x86/mm/tlb.c
2224 @@ -6,13 +6,14 @@
2225 #include <linux/interrupt.h>
2226 #include <linux/export.h>
2227 #include <linux/cpu.h>
2228 +#include <linux/debugfs.h>
2229
2230 #include <asm/tlbflush.h>
2231 #include <asm/mmu_context.h>
2232 #include <asm/cache.h>
2233 #include <asm/apic.h>
2234 #include <asm/uv/uv.h>
2235 -#include <linux/debugfs.h>
2236 +#include <asm/kaiser.h>
2237
2238 /*
2239 * TLB flushing, formerly SMP-only
2240 @@ -34,6 +35,36 @@ struct flush_tlb_info {
2241 unsigned long flush_end;
2242 };
2243
2244 +static void load_new_mm_cr3(pgd_t *pgdir)
2245 +{
2246 + unsigned long new_mm_cr3 = __pa(pgdir);
2247 +
2248 + if (kaiser_enabled) {
2249 + /*
2250 + * We reuse the same PCID for different tasks, so we must
2251 + * flush all the entries for the PCID out when we change tasks.
2252 + * Flush KERN below, flush USER when returning to userspace in
2253 + * kaiser's SWITCH_USER_CR3 (_SWITCH_TO_USER_CR3) macro.
2254 + *
2255 + * invpcid_flush_single_context(X86_CR3_PCID_ASID_USER) could
2256 + * do it here, but can only be used if X86_FEATURE_INVPCID is
2257 + * available - and many machines support pcid without invpcid.
2258 + *
2259 + * If X86_CR3_PCID_KERN_FLUSH actually added something, then it
2260 + * would be needed in the write_cr3() below - if PCIDs enabled.
2261 + */
2262 + BUILD_BUG_ON(X86_CR3_PCID_KERN_FLUSH);
2263 + kaiser_flush_tlb_on_return_to_user();
2264 + }
2265 +
2266 + /*
2267 + * Caution: many callers of this function expect
2268 + * that load_cr3() is serializing and orders TLB
2269 + * fills with respect to the mm_cpumask writes.
2270 + */
2271 + write_cr3(new_mm_cr3);
2272 +}
2273 +
2274 /*
2275 * We cannot call mmdrop() because we are in interrupt context,
2276 * instead update mm->cpu_vm_mask.
2277 @@ -45,7 +76,7 @@ void leave_mm(int cpu)
2278 BUG();
2279 if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
2280 cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
2281 - load_cr3(swapper_pg_dir);
2282 + load_new_mm_cr3(swapper_pg_dir);
2283 /*
2284 * This gets called in the idle path where RCU
2285 * functions differently. Tracing normally
2286 @@ -120,7 +151,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
2287 * ordering guarantee we need.
2288 *
2289 */
2290 - load_cr3(next->pgd);
2291 + load_new_mm_cr3(next->pgd);
2292
2293 trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
2294
2295 @@ -167,7 +198,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
2296 * As above, load_cr3() is serializing and orders TLB
2297 * fills with respect to the mm_cpumask write.
2298 */
2299 - load_cr3(next->pgd);
2300 + load_new_mm_cr3(next->pgd);
2301 trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
2302 load_mm_cr4(next);
2303 load_mm_ldt(next);
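
For illustration (not part of the patch): a standalone sketch of the per-cpu x86_cr3_pcid_user value that the entry code ORs into CR3 on return to user, matching kaiser_setup_pcid() and kaiser_flush_tlb_on_return_to_user() above. Bit 63 of CR3 is the architectural "do not flush this PCID" bit; the shadow-pgd offset and the 0x80 user ASID are assumed illustrative values (the real constants are defined elsewhere in the full patch), as is the split of X86_CR3_PCID_USER_NOFLUSH/_FLUSH into ASID plus the no-flush bit. Because bit 63 cannot be encoded as an immediate, the kernel keeps this value in memory and ORs it in, as the comment near the top of kaiser.c explains.

#include <stdint.h>
#include <stdio.h>

/* Assumed illustrative values; only the bit-63 no-flush semantics
 * are architectural. */
#define SHADOW_PGD_OFFSET 0x1000UL      /* second 4 KiB half of the pgd pair */
#define CR3_PCID_ASID_USER 0x80UL       /* assumed userspace PCID */
#define CR3_PCID_NOFLUSH (1UL << 63)    /* keep the PCID's TLB entries */

/* Value OR'ed into CR3 when switching to the user page tables. */
static uint64_t user_cr3_bits(int have_pcid, int need_flush)
{
        uint64_t bits = SHADOW_PGD_OFFSET;

        if (have_pcid) {
                bits |= CR3_PCID_ASID_USER;
                if (!need_flush)
                        bits |= CR3_PCID_NOFLUSH;
        }
        return bits;
}

int main(void)
{
        printf("no PCID:            %#018lx\n", (unsigned long)user_cr3_bits(0, 1));
        printf("PCID, skip flush:   %#018lx\n", (unsigned long)user_cr3_bits(1, 0));
        printf("PCID, flush needed: %#018lx\n", (unsigned long)user_cr3_bits(1, 1));
        return 0;
}
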
2304 diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
2305 index dc81e5287ebf..2e6000a4eb2c 100644
2306 --- a/include/asm-generic/vmlinux.lds.h
2307 +++ b/include/asm-generic/vmlinux.lds.h
2308 @@ -778,7 +778,14 @@
2309 */
2310 #define PERCPU_INPUT(cacheline) \
2311 VMLINUX_SYMBOL(__per_cpu_start) = .; \
2312 + VMLINUX_SYMBOL(__per_cpu_user_mapped_start) = .; \
2313 *(.data..percpu..first) \
2314 + . = ALIGN(cacheline); \
2315 + *(.data..percpu..user_mapped) \
2316 + *(.data..percpu..user_mapped..shared_aligned) \
2317 + . = ALIGN(PAGE_SIZE); \
2318 + *(.data..percpu..user_mapped..page_aligned) \
2319 + VMLINUX_SYMBOL(__per_cpu_user_mapped_end) = .; \
2320 . = ALIGN(PAGE_SIZE); \
2321 *(.data..percpu..page_aligned) \
2322 . = ALIGN(cacheline); \
2323 diff --git a/include/linux/kaiser.h b/include/linux/kaiser.h
2324 new file mode 100644
2325 index 000000000000..58c55b1589d0
2326 --- /dev/null
2327 +++ b/include/linux/kaiser.h
2328 @@ -0,0 +1,52 @@
2329 +#ifndef _LINUX_KAISER_H
2330 +#define _LINUX_KAISER_H
2331 +
2332 +#ifdef CONFIG_PAGE_TABLE_ISOLATION
2333 +#include <asm/kaiser.h>
2334 +
2335 +static inline int kaiser_map_thread_stack(void *stack)
2336 +{
2337 + /*
2338 + * Map that page of kernel stack on which we enter from user context.
2339 + */
2340 + return kaiser_add_mapping((unsigned long)stack +
2341 + THREAD_SIZE - PAGE_SIZE, PAGE_SIZE, __PAGE_KERNEL);
2342 +}
2343 +
2344 +static inline void kaiser_unmap_thread_stack(void *stack)
2345 +{
2346 + /*
2347 + * Note: may be called even when kaiser_map_thread_stack() failed.
2348 + */
2349 + kaiser_remove_mapping((unsigned long)stack +
2350 + THREAD_SIZE - PAGE_SIZE, PAGE_SIZE);
2351 +}
2352 +#else
2353 +
2354 +/*
2355 + * These stubs are used whenever CONFIG_PAGE_TABLE_ISOLATION is off, which
2356 + * includes architectures that support KAISER, but have it disabled.
2357 + */
2358 +
2359 +static inline void kaiser_init(void)
2360 +{
2361 +}
2362 +static inline int kaiser_add_mapping(unsigned long addr,
2363 + unsigned long size, unsigned long flags)
2364 +{
2365 + return 0;
2366 +}
2367 +static inline void kaiser_remove_mapping(unsigned long start,
2368 + unsigned long size)
2369 +{
2370 +}
2371 +static inline int kaiser_map_thread_stack(void *stack)
2372 +{
2373 + return 0;
2374 +}
2375 +static inline void kaiser_unmap_thread_stack(void *stack)
2376 +{
2377 +}
2378 +
2379 +#endif /* !CONFIG_PAGE_TABLE_ISOLATION */
2380 +#endif /* _LINUX_KAISER_H */
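
For illustration (not part of the patch): kaiser_map_thread_stack() above maps only the top page of the kernel stack, the page the CPU is on when it enters from user context. A standalone sketch of that arithmetic, with an assumed 16 KiB THREAD_SIZE and a made-up stack address:

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define THREAD_SIZE (4UL * PAGE_SIZE)   /* assumed 16 KiB x86_64 stack */

int main(void)
{
        unsigned long stack = 0xffffc90000004000UL;     /* made-up stack base */
        unsigned long mapped = stack + THREAD_SIZE - PAGE_SIZE;

        printf("whole stack: 0x%lx - 0x%lx\n", stack, stack + THREAD_SIZE);
        printf("mapped page: 0x%lx - 0x%lx\n", mapped, mapped + PAGE_SIZE);
        return 0;
}
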
2381 diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
2382 index fff21a82780c..490f5a83f947 100644
2383 --- a/include/linux/mmzone.h
2384 +++ b/include/linux/mmzone.h
2385 @@ -124,8 +124,9 @@ enum zone_stat_item {
2386 NR_SLAB_UNRECLAIMABLE,
2387 NR_PAGETABLE, /* used for pagetables */
2388 NR_KERNEL_STACK_KB, /* measured in KiB */
2389 - /* Second 128 byte cacheline */
2390 + NR_KAISERTABLE,
2391 NR_BOUNCE,
2392 + /* Second 128 byte cacheline */
2393 #if IS_ENABLED(CONFIG_ZSMALLOC)
2394 NR_ZSPAGES, /* allocated in zsmalloc */
2395 #endif
2396 diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
2397 index 8f16299ca068..8902f23bb770 100644
2398 --- a/include/linux/percpu-defs.h
2399 +++ b/include/linux/percpu-defs.h
2400 @@ -35,6 +35,12 @@
2401
2402 #endif
2403
2404 +#ifdef CONFIG_PAGE_TABLE_ISOLATION
2405 +#define USER_MAPPED_SECTION "..user_mapped"
2406 +#else
2407 +#define USER_MAPPED_SECTION ""
2408 +#endif
2409 +
2410 /*
2411 * Base implementations of per-CPU variable declarations and definitions, where
2412 * the section in which the variable is to be placed is provided by the
2413 @@ -115,6 +121,12 @@
2414 #define DEFINE_PER_CPU(type, name) \
2415 DEFINE_PER_CPU_SECTION(type, name, "")
2416
2417 +#define DECLARE_PER_CPU_USER_MAPPED(type, name) \
2418 + DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
2419 +
2420 +#define DEFINE_PER_CPU_USER_MAPPED(type, name) \
2421 + DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
2422 +
2423 /*
2424 * Declaration/definition used for per-CPU variables that must come first in
2425 * the set of variables.
2426 @@ -144,6 +156,14 @@
2427 DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \
2428 ____cacheline_aligned_in_smp
2429
2430 +#define DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \
2431 + DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
2432 + ____cacheline_aligned_in_smp
2433 +
2434 +#define DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \
2435 + DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
2436 + ____cacheline_aligned_in_smp
2437 +
2438 #define DECLARE_PER_CPU_ALIGNED(type, name) \
2439 DECLARE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION) \
2440 ____cacheline_aligned
2441 @@ -162,11 +182,21 @@
2442 #define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \
2443 DEFINE_PER_CPU_SECTION(type, name, "..page_aligned") \
2444 __aligned(PAGE_SIZE)
2445 +/*
2446 + * Declaration/definition used for per-CPU variables that must be page aligned and need to be mapped in user mode.
2447 + */
2448 +#define DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \
2449 + DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
2450 + __aligned(PAGE_SIZE)
2451 +
2452 +#define DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \
2453 + DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
2454 + __aligned(PAGE_SIZE)
2455
2456 /*
2457 * Declaration/definition used for per-CPU variables that must be read mostly.
2458 */
2459 -#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \
2460 +#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \
2461 DECLARE_PER_CPU_SECTION(type, name, "..read_mostly")
2462
2463 #define DEFINE_PER_CPU_READ_MOSTLY(type, name) \
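
For illustration (not part of the patch): the *_USER_MAPPED macros above build their section names by pasting string literals, and the vmlinux.lds.h hunk earlier collects exactly those sections between __per_cpu_user_mapped_start and __per_cpu_user_mapped_end so kaiser_init() can map the whole window. A standalone sketch of the concatenation (the ".data..percpu" base name assumes an SMP build):

#include <stdio.h>

#define PER_CPU_BASE_SECTION ".data..percpu"    /* assumed SMP base name */
#define USER_MAPPED_SECTION "..user_mapped"     /* "" when PTI is off */
#define PER_CPU_SHARED_ALIGNED_SECTION "..shared_aligned"

int main(void)
{
        puts("DEFINE_PER_CPU_USER_MAPPED            -> "
             PER_CPU_BASE_SECTION USER_MAPPED_SECTION);
        puts("..._SHARED_ALIGNED_USER_MAPPED        -> "
             PER_CPU_BASE_SECTION USER_MAPPED_SECTION
             PER_CPU_SHARED_ALIGNED_SECTION);
        puts("..._PAGE_ALIGNED_USER_MAPPED          -> "
             PER_CPU_BASE_SECTION USER_MAPPED_SECTION "..page_aligned");
        return 0;
}
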
2464 diff --git a/init/main.c b/init/main.c
2465 index 25bac88bc66e..99f026565608 100644
2466 --- a/init/main.c
2467 +++ b/init/main.c
2468 @@ -80,6 +80,7 @@
2469 #include <linux/integrity.h>
2470 #include <linux/proc_ns.h>
2471 #include <linux/io.h>
2472 +#include <linux/kaiser.h>
2473
2474 #include <asm/io.h>
2475 #include <asm/bugs.h>
2476 @@ -473,6 +474,7 @@ static void __init mm_init(void)
2477 pgtable_init();
2478 vmalloc_init();
2479 ioremap_huge_init();
2480 + kaiser_init();
2481 }
2482
2483 asmlinkage __visible void __init start_kernel(void)
2484 diff --git a/kernel/fork.c b/kernel/fork.c
2485 index 9321b1ad3335..70e10cb49be0 100644
2486 --- a/kernel/fork.c
2487 +++ b/kernel/fork.c
2488 @@ -58,6 +58,7 @@
2489 #include <linux/tsacct_kern.h>
2490 #include <linux/cn_proc.h>
2491 #include <linux/freezer.h>
2492 +#include <linux/kaiser.h>
2493 #include <linux/delayacct.h>
2494 #include <linux/taskstats_kern.h>
2495 #include <linux/random.h>
2496 @@ -213,6 +214,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
2497
2498 static inline void free_thread_stack(struct task_struct *tsk)
2499 {
2500 + kaiser_unmap_thread_stack(tsk->stack);
2501 #ifdef CONFIG_VMAP_STACK
2502 if (task_stack_vm_area(tsk)) {
2503 unsigned long flags;
2504 @@ -495,6 +497,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
2505 * functions again.
2506 */
2507 tsk->stack = stack;
2508 +
2509 + err = kaiser_map_thread_stack(tsk->stack);
2510 + if (err)
2511 + goto free_stack;
2512 #ifdef CONFIG_VMAP_STACK
2513 tsk->stack_vm_area = stack_vm_area;
2514 #endif
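
For illustration (not part of the patch): dup_task_struct() above maps the new stack right after installing it and jumps to free_stack on failure, while free_thread_stack() always calls kaiser_unmap_thread_stack(), which (as the kaiser.h comment notes) must cope with a map that never happened. A toy standalone sketch of that pairing; the flag-based bookkeeping is invented for the demo and is not how the kernel tracks it.

#include <stdbool.h>
#include <stdio.h>

static bool stack_page_mapped;

static int map_thread_stack(bool simulate_failure)
{
        if (simulate_failure)
                return -1;              /* would be -ENOMEM in the kernel */
        stack_page_mapped = true;
        return 0;
}

/* Called on every free path, even when the map step failed. */
static void unmap_thread_stack(void)
{
        if (!stack_page_mapped)
                return;                 /* nothing to undo */
        stack_page_mapped = false;
}

static int dup_task(bool simulate_failure)
{
        int err = map_thread_stack(simulate_failure);

        if (err)
                goto free_stack;
        printf("task created, stack page mapped\n");
        unmap_thread_stack();           /* task exit would do this later */
        return 0;

free_stack:
        unmap_thread_stack();           /* safe even though mapping failed */
        printf("task creation failed cleanly\n");
        return err;
}

int main(void)
{
        dup_task(false);
        dup_task(true);
        return 0;
}
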
2515 diff --git a/mm/vmstat.c b/mm/vmstat.c
2516 index 604f26a4f696..6a088df04b29 100644
2517 --- a/mm/vmstat.c
2518 +++ b/mm/vmstat.c
2519 @@ -932,6 +932,7 @@ const char * const vmstat_text[] = {
2520 "nr_slab_unreclaimable",
2521 "nr_page_table_pages",
2522 "nr_kernel_stack",
2523 + "nr_overhead",
2524 "nr_bounce",
2525 #if IS_ENABLED(CONFIG_ZSMALLOC)
2526 "nr_zspages",
2527 diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
2528 index 97f9cac98348..e86a34fd5484 100644
2529 --- a/net/ipv4/tcp_bbr.c
2530 +++ b/net/ipv4/tcp_bbr.c
2531 @@ -843,6 +843,11 @@ static u32 bbr_sndbuf_expand(struct sock *sk)
2532 */
2533 static u32 bbr_undo_cwnd(struct sock *sk)
2534 {
2535 + struct bbr *bbr = inet_csk_ca(sk);
2536 +
2537 + bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */
2538 + bbr->full_bw_cnt = 0;
2539 + bbr_reset_lt_bw_sampling(sk);
2540 return tcp_sk(sk)->snd_cwnd;
2541 }
2542
2543 diff --git a/security/Kconfig b/security/Kconfig
2544 index 118f4549404e..32f36b40e9f0 100644
2545 --- a/security/Kconfig
2546 +++ b/security/Kconfig
2547 @@ -31,6 +31,16 @@ config SECURITY
2548
2549 If you are unsure how to answer this question, answer N.
2550
2551 +config PAGE_TABLE_ISOLATION
2552 + bool "Remove the kernel mapping in user mode"
2553 + default y
2554 + depends on X86_64 && SMP
2555 + help
2556 + This enforces a strict kernel and user space isolation, in order
2557 + to close hardware side channels on kernel address information.
2558 +
2559 + If you are unsure how to answer this question, answer Y.
2560 +
2561 config SECURITYFS
2562 bool "Enable the securityfs filesystem"
2563 help
2564 diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h
2565 index a39629206864..f79669a38c0c 100644
2566 --- a/tools/arch/x86/include/asm/cpufeatures.h
2567 +++ b/tools/arch/x86/include/asm/cpufeatures.h
2568 @@ -197,6 +197,9 @@
2569 #define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */
2570 #define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */
2571
2572 +/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */
2573 +#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */
2574 +
2575 /* Virtualization flags: Linux defined, word 8 */
2576 #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
2577 #define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */