Magellan Linux

Annotation of /trunk/kernel-alx-legacy/patches-4.9/0174-4.9.75-all-fixes.patch



Revision 3608
Fri Aug 14 07:34:29 2020 UTC by niro
File size: 79490 byte(s)
-added kernel-alx-legacy pkg
1 niro 3608 diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
2     index 152ec4e87b57..5d2676d043de 100644
3     --- a/Documentation/kernel-parameters.txt
4     +++ b/Documentation/kernel-parameters.txt
5     @@ -2763,6 +2763,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
6    
7     nojitter [IA-64] Disables jitter checking for ITC timers.
8    
9     + nopti [X86-64] Disable KAISER isolation of kernel from user.
10     +
11     no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver
12    
13     no-kvmapf [X86,KVM] Disable paravirtualized asynchronous page
14     @@ -3325,6 +3327,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
15     pt. [PARIDE]
16     See Documentation/blockdev/paride.txt.
17    
18     + pti= [X86_64]
19     + Control KAISER user/kernel address space isolation:
20     + on - enable
21     + off - disable
22     + auto - default setting
23     +
24     pty.legacy_count=
25     [KNL] Number of legacy pty's. Overwrites compiled-in
26     default number.
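
As a rough illustration of the three documented pti= values, the sketch below maps the argument string to an enable decision in plain C. The enum and helper names are invented for the example; only the on/off/auto semantics come from the documentation hunk above.

    #include <stdio.h>
    #include <string.h>

    /* Illustrative only: models the documented pti= settings, not kernel code. */
    enum pti_mode { PTI_AUTO, PTI_FORCE_ON, PTI_FORCE_OFF };

    static enum pti_mode parse_pti_arg(const char *arg)
    {
        if (arg && strcmp(arg, "on") == 0)
            return PTI_FORCE_ON;
        if (arg && strcmp(arg, "off") == 0)
            return PTI_FORCE_OFF;
        return PTI_AUTO;            /* "auto", absent or unrecognized: default */
    }

    int main(void)
    {
        const char *samples[] = { "on", "off", "auto", NULL };

        for (int i = 0; i < 4; i++)
            printf("pti=%s -> mode %d\n",
                   samples[i] ? samples[i] : "(unset)",
                   parse_pti_arg(samples[i]));
        return 0;
    }
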
27     diff --git a/Makefile b/Makefile
28     index 075e429732e7..acbc1b032db2 100644
29     --- a/Makefile
30     +++ b/Makefile
31     @@ -1,6 +1,6 @@
32     VERSION = 4
33     PATCHLEVEL = 9
34     -SUBLEVEL = 74
35     +SUBLEVEL = 75
36     EXTRAVERSION =
37     NAME = Roaring Lionus
38    
39     diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
40     index 766a5211f827..2728e1b7e4a6 100644
41     --- a/arch/x86/boot/compressed/misc.h
42     +++ b/arch/x86/boot/compressed/misc.h
43     @@ -9,6 +9,7 @@
44     */
45     #undef CONFIG_PARAVIRT
46     #undef CONFIG_PARAVIRT_SPINLOCKS
47     +#undef CONFIG_PAGE_TABLE_ISOLATION
48     #undef CONFIG_KASAN
49    
50     #include <linux/linkage.h>
51     diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
52     index e7b0e7ff4c58..af4e58132d91 100644
53     --- a/arch/x86/entry/entry_64.S
54     +++ b/arch/x86/entry/entry_64.S
55     @@ -36,6 +36,7 @@
56     #include <asm/smap.h>
57     #include <asm/pgtable_types.h>
58     #include <asm/export.h>
59     +#include <asm/kaiser.h>
60     #include <linux/err.h>
61    
62     /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
63     @@ -146,6 +147,7 @@ ENTRY(entry_SYSCALL_64)
64     * it is too small to ever cause noticeable irq latency.
65     */
66     SWAPGS_UNSAFE_STACK
67     + SWITCH_KERNEL_CR3_NO_STACK
68     /*
69     * A hypervisor implementation might want to use a label
70     * after the swapgs, so that it can do the swapgs
71     @@ -228,6 +230,14 @@ entry_SYSCALL_64_fastpath:
72     movq RIP(%rsp), %rcx
73     movq EFLAGS(%rsp), %r11
74     RESTORE_C_REGS_EXCEPT_RCX_R11
75     + /*
76     + * This opens a window where we have a user CR3, but are
77     + * running in the kernel. This makes using the CS
78     + * register useless for telling whether or not we need to
79     + * switch CR3 in NMIs. Normal interrupts are OK because
80     + * they are off here.
81     + */
82     + SWITCH_USER_CR3
83     movq RSP(%rsp), %rsp
84     USERGS_SYSRET64
85    
86     @@ -323,10 +333,26 @@ return_from_SYSCALL_64:
87     syscall_return_via_sysret:
88     /* rcx and r11 are already restored (see code above) */
89     RESTORE_C_REGS_EXCEPT_RCX_R11
90     + /*
91     + * This opens a window where we have a user CR3, but are
92     + * running in the kernel. This makes using the CS
93     + * register useless for telling whether or not we need to
94     + * switch CR3 in NMIs. Normal interrupts are OK because
95     + * they are off here.
96     + */
97     + SWITCH_USER_CR3
98     movq RSP(%rsp), %rsp
99     USERGS_SYSRET64
100    
101     opportunistic_sysret_failed:
102     + /*
103     + * This opens a window where we have a user CR3, but are
104     + * running in the kernel. This makes using the CS
105     + * register useless for telling whether or not we need to
106     + * switch CR3 in NMIs. Normal interrupts are OK because
107     + * they are off here.
108     + */
109     + SWITCH_USER_CR3
110     SWAPGS
111     jmp restore_c_regs_and_iret
112     END(entry_SYSCALL_64)
113     @@ -424,6 +450,7 @@ ENTRY(ret_from_fork)
114     movq %rsp, %rdi
115     call syscall_return_slowpath /* returns with IRQs disabled */
116     TRACE_IRQS_ON /* user mode is traced as IRQS on */
117     + SWITCH_USER_CR3
118     SWAPGS
119     jmp restore_regs_and_iret
120    
121     @@ -478,6 +505,7 @@ END(irq_entries_start)
122     * tracking that we're in kernel mode.
123     */
124     SWAPGS
125     + SWITCH_KERNEL_CR3
126    
127     /*
128     * We need to tell lockdep that IRQs are off. We can't do this until
129     @@ -535,6 +563,7 @@ GLOBAL(retint_user)
130     mov %rsp,%rdi
131     call prepare_exit_to_usermode
132     TRACE_IRQS_IRETQ
133     + SWITCH_USER_CR3
134     SWAPGS
135     jmp restore_regs_and_iret
136    
137     @@ -612,6 +641,7 @@ native_irq_return_ldt:
138    
139     pushq %rdi /* Stash user RDI */
140     SWAPGS
141     + SWITCH_KERNEL_CR3
142     movq PER_CPU_VAR(espfix_waddr), %rdi
143     movq %rax, (0*8)(%rdi) /* user RAX */
144     movq (1*8)(%rsp), %rax /* user RIP */
145     @@ -638,6 +668,7 @@ native_irq_return_ldt:
146     * still points to an RO alias of the ESPFIX stack.
147     */
148     orq PER_CPU_VAR(espfix_stack), %rax
149     + SWITCH_USER_CR3
150     SWAPGS
151     movq %rax, %rsp
152    
153     @@ -1022,7 +1053,11 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vec
154     /*
155     * Save all registers in pt_regs, and switch gs if needed.
156     * Use slow, but surefire "are we in kernel?" check.
157     - * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
158     + *
159     + * Return: ebx=0: needs swapgs but not SWITCH_USER_CR3 in paranoid_exit
160     + * ebx=1: needs neither swapgs nor SWITCH_USER_CR3 in paranoid_exit
161     + * ebx=2: needs both swapgs and SWITCH_USER_CR3 in paranoid_exit
162     + * ebx=3: needs SWITCH_USER_CR3 but not swapgs in paranoid_exit
163     */
164     ENTRY(paranoid_entry)
165     cld
166     @@ -1035,7 +1070,26 @@ ENTRY(paranoid_entry)
167     js 1f /* negative -> in kernel */
168     SWAPGS
169     xorl %ebx, %ebx
170     -1: ret
171     +1:
172     +#ifdef CONFIG_PAGE_TABLE_ISOLATION
173     + /*
174     + * We might have come in between a swapgs and a SWITCH_KERNEL_CR3
175     + * on entry, or between a SWITCH_USER_CR3 and a swapgs on exit.
176     + * Do a conditional SWITCH_KERNEL_CR3: this could safely be done
177     + * unconditionally, but we need to find out whether the reverse
178     + * should be done on return (conveyed to paranoid_exit in %ebx).
179     + */
180     + ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
181     + testl $KAISER_SHADOW_PGD_OFFSET, %eax
182     + jz 2f
183     + orl $2, %ebx
184     + andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
185     + /* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */
186     + ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
187     + movq %rax, %cr3
188     +2:
189     +#endif
190     + ret
191     END(paranoid_entry)
192    
193     /*
194     @@ -1048,19 +1102,26 @@ END(paranoid_entry)
195     * be complicated. Fortunately, we there's no good reason
196     * to try to handle preemption here.
197     *
198     - * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
199     + * On entry: ebx=0: needs swapgs but not SWITCH_USER_CR3
200     + * ebx=1: needs neither swapgs nor SWITCH_USER_CR3
201     + * ebx=2: needs both swapgs and SWITCH_USER_CR3
202     + * ebx=3: needs SWITCH_USER_CR3 but not swapgs
203     */
204     ENTRY(paranoid_exit)
205     DISABLE_INTERRUPTS(CLBR_NONE)
206     TRACE_IRQS_OFF_DEBUG
207     - testl %ebx, %ebx /* swapgs needed? */
208     + TRACE_IRQS_IRETQ_DEBUG
209     +#ifdef CONFIG_PAGE_TABLE_ISOLATION
210     + /* No ALTERNATIVE for X86_FEATURE_KAISER: paranoid_entry sets %ebx */
211     + testl $2, %ebx /* SWITCH_USER_CR3 needed? */
212     + jz paranoid_exit_no_switch
213     + SWITCH_USER_CR3
214     +paranoid_exit_no_switch:
215     +#endif
216     + testl $1, %ebx /* swapgs needed? */
217     jnz paranoid_exit_no_swapgs
218     - TRACE_IRQS_IRETQ
219     SWAPGS_UNSAFE_STACK
220     - jmp paranoid_exit_restore
221     paranoid_exit_no_swapgs:
222     - TRACE_IRQS_IRETQ_DEBUG
223     -paranoid_exit_restore:
224     RESTORE_EXTRA_REGS
225     RESTORE_C_REGS
226     REMOVE_PT_GPREGS_FROM_STACK 8
227     @@ -1075,6 +1136,13 @@ ENTRY(error_entry)
228     cld
229     SAVE_C_REGS 8
230     SAVE_EXTRA_REGS 8
231     + /*
232     + * error_entry() always returns with a kernel gsbase and
233     + * CR3. We must also have a kernel CR3/gsbase before
234     + * calling TRACE_IRQS_*. Just unconditionally switch to
235     + * the kernel CR3 here.
236     + */
237     + SWITCH_KERNEL_CR3
238     xorl %ebx, %ebx
239     testb $3, CS+8(%rsp)
240     jz .Lerror_kernelspace
241     @@ -1235,6 +1303,10 @@ ENTRY(nmi)
242     */
243    
244     SWAPGS_UNSAFE_STACK
245     + /*
246     + * percpu variables are mapped with user CR3, so no need
247     + * to switch CR3 here.
248     + */
249     cld
250     movq %rsp, %rdx
251     movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
252     @@ -1268,12 +1340,34 @@ ENTRY(nmi)
253    
254     movq %rsp, %rdi
255     movq $-1, %rsi
256     +#ifdef CONFIG_PAGE_TABLE_ISOLATION
257     + /* Unconditionally use kernel CR3 for do_nmi() */
258     + /* %rax is saved above, so OK to clobber here */
259     + ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
260     + /* If PCID enabled, NOFLUSH now and NOFLUSH on return */
261     + ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
262     + pushq %rax
263     + /* mask off "user" bit of pgd address and 12 PCID bits: */
264     + andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
265     + movq %rax, %cr3
266     +2:
267     +#endif
268     call do_nmi
269    
270     +#ifdef CONFIG_PAGE_TABLE_ISOLATION
271     + /*
272     + * Unconditionally restore CR3. I know we return to
273     + * kernel code that needs user CR3, but do we ever return
274     + * to "user mode" where we need the kernel CR3?
275     + */
276     + ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER
277     +#endif
278     +
279     /*
280     * Return back to user mode. We must *not* do the normal exit
281     - * work, because we don't want to enable interrupts. Fortunately,
282     - * do_nmi doesn't modify pt_regs.
283     + * work, because we don't want to enable interrupts. Do not
284     + * switch to user CR3: we might be going back to kernel code
285     + * that had a user CR3 set.
286     */
287     SWAPGS
288     jmp restore_c_regs_and_iret
289     @@ -1470,22 +1564,55 @@ end_repeat_nmi:
290     ALLOC_PT_GPREGS_ON_STACK
291    
292     /*
293     - * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
294     - * as we should not be calling schedule in NMI context.
295     - * Even with normal interrupts enabled. An NMI should not be
296     - * setting NEED_RESCHED or anything that normal interrupts and
297     - * exceptions might do.
298     + * Use the same approach as paranoid_entry to handle SWAPGS, but
299     + * without CR3 handling since we do that differently in NMIs. No
300     + * need to use paranoid_exit as we should not be calling schedule
301     + * in NMI context. Even with normal interrupts enabled. An NMI
302     + * should not be setting NEED_RESCHED or anything that normal
303     + * interrupts and exceptions might do.
304     */
305     - call paranoid_entry
306     -
307     - /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
308     + cld
309     + SAVE_C_REGS
310     + SAVE_EXTRA_REGS
311     + movl $1, %ebx
312     + movl $MSR_GS_BASE, %ecx
313     + rdmsr
314     + testl %edx, %edx
315     + js 1f /* negative -> in kernel */
316     + SWAPGS
317     + xorl %ebx, %ebx
318     +1:
319     movq %rsp, %rdi
320     movq $-1, %rsi
321     +#ifdef CONFIG_PAGE_TABLE_ISOLATION
322     + /* Unconditionally use kernel CR3 for do_nmi() */
323     + /* %rax is saved above, so OK to clobber here */
324     + ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
325     + /* If PCID enabled, NOFLUSH now and NOFLUSH on return */
326     + ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
327     + pushq %rax
328     + /* mask off "user" bit of pgd address and 12 PCID bits: */
329     + andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
330     + movq %rax, %cr3
331     +2:
332     +#endif
333     +
334     + /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
335     call do_nmi
336    
337     +#ifdef CONFIG_PAGE_TABLE_ISOLATION
338     + /*
339     + * Unconditionally restore CR3. We might be returning to
340     + * kernel code that needs user CR3, like just before
341     + * a sysret.
342     + */
343     + ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER
344     +#endif
345     +
346     testl %ebx, %ebx /* swapgs needed? */
347     jnz nmi_restore
348     nmi_swapgs:
349     + /* We fixed up CR3 above, so no need to switch it here */
350     SWAPGS_UNSAFE_STACK
351     nmi_restore:
352     RESTORE_EXTRA_REGS
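
For reference, the reworked paranoid_entry/paranoid_exit contract above packs two independent decisions into %ebx: bit 0 set means swapgs is not needed on exit, bit 1 set means SWITCH_USER_CR3 is needed, which yields the four documented values 0-3. A small C sketch of that encoding; the macro names are invented for the example, the assembly itself just tests bits 0 and 1.

    #include <stdio.h>

    #define EBX_SKIP_SWAPGS  0x1   /* bit 0 set: do NOT swapgs on exit        */
    #define EBX_SWITCH_CR3   0x2   /* bit 1 set: do SWITCH_USER_CR3 on exit   */

    static void paranoid_exit_decisions(unsigned int ebx)
    {
        printf("ebx=%u: %s, %s\n", ebx,
               (ebx & EBX_SWITCH_CR3) ? "SWITCH_USER_CR3" : "keep CR3",
               (ebx & EBX_SKIP_SWAPGS) ? "no swapgs" : "swapgs");
    }

    int main(void)
    {
        for (unsigned int ebx = 0; ebx < 4; ebx++)
            paranoid_exit_decisions(ebx);
        return 0;
    }
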
353     diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
354     index e1721dafbcb1..d76a97653980 100644
355     --- a/arch/x86/entry/entry_64_compat.S
356     +++ b/arch/x86/entry/entry_64_compat.S
357     @@ -13,6 +13,8 @@
358     #include <asm/irqflags.h>
359     #include <asm/asm.h>
360     #include <asm/smap.h>
361     +#include <asm/pgtable_types.h>
362     +#include <asm/kaiser.h>
363     #include <linux/linkage.h>
364     #include <linux/err.h>
365    
366     @@ -48,6 +50,7 @@
367     ENTRY(entry_SYSENTER_compat)
368     /* Interrupts are off on entry. */
369     SWAPGS_UNSAFE_STACK
370     + SWITCH_KERNEL_CR3_NO_STACK
371     movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
372    
373     /*
374     @@ -184,6 +187,7 @@ ENDPROC(entry_SYSENTER_compat)
375     ENTRY(entry_SYSCALL_compat)
376     /* Interrupts are off on entry. */
377     SWAPGS_UNSAFE_STACK
378     + SWITCH_KERNEL_CR3_NO_STACK
379    
380     /* Stash user ESP and switch to the kernel stack. */
381     movl %esp, %r8d
382     @@ -259,6 +263,7 @@ sysret32_from_system_call:
383     xorq %r8, %r8
384     xorq %r9, %r9
385     xorq %r10, %r10
386     + SWITCH_USER_CR3
387     movq RSP-ORIG_RAX(%rsp), %rsp
388     swapgs
389     sysretl
390     @@ -297,7 +302,7 @@ ENTRY(entry_INT80_compat)
391     PARAVIRT_ADJUST_EXCEPTION_FRAME
392     ASM_CLAC /* Do this early to minimize exposure */
393     SWAPGS
394     -
395     + SWITCH_KERNEL_CR3_NO_STACK
396     /*
397     * User tracing code (ptrace or signal handlers) might assume that
398     * the saved RAX contains a 32-bit number when we're invoking a 32-bit
399     @@ -338,6 +343,7 @@ ENTRY(entry_INT80_compat)
400    
401     /* Go back to user mode. */
402     TRACE_IRQS_ON
403     + SWITCH_USER_CR3
404     SWAPGS
405     jmp restore_regs_and_iret
406     END(entry_INT80_compat)
407     diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
408     index 9dfeeeca0ea8..8e7a3f1df3a5 100644
409     --- a/arch/x86/events/intel/ds.c
410     +++ b/arch/x86/events/intel/ds.c
411     @@ -2,11 +2,15 @@
412     #include <linux/types.h>
413     #include <linux/slab.h>
414    
415     +#include <asm/kaiser.h>
416     #include <asm/perf_event.h>
417     #include <asm/insn.h>
418    
419     #include "../perf_event.h"
420    
421     +static
422     +DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct debug_store, cpu_debug_store);
423     +
424     /* The size of a BTS record in bytes: */
425     #define BTS_RECORD_SIZE 24
426    
427     @@ -268,6 +272,39 @@ void fini_debug_store_on_cpu(int cpu)
428    
429     static DEFINE_PER_CPU(void *, insn_buffer);
430    
431     +static void *dsalloc(size_t size, gfp_t flags, int node)
432     +{
433     +#ifdef CONFIG_PAGE_TABLE_ISOLATION
434     + unsigned int order = get_order(size);
435     + struct page *page;
436     + unsigned long addr;
437     +
438     + page = __alloc_pages_node(node, flags | __GFP_ZERO, order);
439     + if (!page)
440     + return NULL;
441     + addr = (unsigned long)page_address(page);
442     + if (kaiser_add_mapping(addr, size, __PAGE_KERNEL) < 0) {
443     + __free_pages(page, order);
444     + addr = 0;
445     + }
446     + return (void *)addr;
447     +#else
448     + return kmalloc_node(size, flags | __GFP_ZERO, node);
449     +#endif
450     +}
451     +
452     +static void dsfree(const void *buffer, size_t size)
453     +{
454     +#ifdef CONFIG_PAGE_TABLE_ISOLATION
455     + if (!buffer)
456     + return;
457     + kaiser_remove_mapping((unsigned long)buffer, size);
458     + free_pages((unsigned long)buffer, get_order(size));
459     +#else
460     + kfree(buffer);
461     +#endif
462     +}
463     +
464     static int alloc_pebs_buffer(int cpu)
465     {
466     struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
467     @@ -278,7 +315,7 @@ static int alloc_pebs_buffer(int cpu)
468     if (!x86_pmu.pebs)
469     return 0;
470    
471     - buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
472     + buffer = dsalloc(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
473     if (unlikely(!buffer))
474     return -ENOMEM;
475    
476     @@ -289,7 +326,7 @@ static int alloc_pebs_buffer(int cpu)
477     if (x86_pmu.intel_cap.pebs_format < 2) {
478     ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
479     if (!ibuffer) {
480     - kfree(buffer);
481     + dsfree(buffer, x86_pmu.pebs_buffer_size);
482     return -ENOMEM;
483     }
484     per_cpu(insn_buffer, cpu) = ibuffer;
485     @@ -315,7 +352,8 @@ static void release_pebs_buffer(int cpu)
486     kfree(per_cpu(insn_buffer, cpu));
487     per_cpu(insn_buffer, cpu) = NULL;
488    
489     - kfree((void *)(unsigned long)ds->pebs_buffer_base);
490     + dsfree((void *)(unsigned long)ds->pebs_buffer_base,
491     + x86_pmu.pebs_buffer_size);
492     ds->pebs_buffer_base = 0;
493     }
494    
495     @@ -329,7 +367,7 @@ static int alloc_bts_buffer(int cpu)
496     if (!x86_pmu.bts)
497     return 0;
498    
499     - buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
500     + buffer = dsalloc(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
501     if (unlikely(!buffer)) {
502     WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
503     return -ENOMEM;
504     @@ -355,19 +393,15 @@ static void release_bts_buffer(int cpu)
505     if (!ds || !x86_pmu.bts)
506     return;
507    
508     - kfree((void *)(unsigned long)ds->bts_buffer_base);
509     + dsfree((void *)(unsigned long)ds->bts_buffer_base, BTS_BUFFER_SIZE);
510     ds->bts_buffer_base = 0;
511     }
512    
513     static int alloc_ds_buffer(int cpu)
514     {
515     - int node = cpu_to_node(cpu);
516     - struct debug_store *ds;
517     -
518     - ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
519     - if (unlikely(!ds))
520     - return -ENOMEM;
521     + struct debug_store *ds = per_cpu_ptr(&cpu_debug_store, cpu);
522    
523     + memset(ds, 0, sizeof(*ds));
524     per_cpu(cpu_hw_events, cpu).ds = ds;
525    
526     return 0;
527     @@ -381,7 +415,6 @@ static void release_ds_buffer(int cpu)
528     return;
529    
530     per_cpu(cpu_hw_events, cpu).ds = NULL;
531     - kfree(ds);
532     }
533    
534     void release_ds_buffers(void)
535     diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h
536     index e01f7f7ccb0c..84ae170bc3d0 100644
537     --- a/arch/x86/include/asm/cmdline.h
538     +++ b/arch/x86/include/asm/cmdline.h
539     @@ -2,5 +2,7 @@
540     #define _ASM_X86_CMDLINE_H
541    
542     int cmdline_find_option_bool(const char *cmdline_ptr, const char *option);
543     +int cmdline_find_option(const char *cmdline_ptr, const char *option,
544     + char *buffer, int bufsize);
545    
546     #endif /* _ASM_X86_CMDLINE_H */
547     diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
548     index ed10b5bf9b93..454a37adb823 100644
549     --- a/arch/x86/include/asm/cpufeatures.h
550     +++ b/arch/x86/include/asm/cpufeatures.h
551     @@ -189,6 +189,7 @@
552    
553     #define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */
554     #define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
555     +#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 4) /* Effectively INVPCID && CR4.PCIDE=1 */
556    
557     #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
558     #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
559     @@ -197,6 +198,9 @@
560     #define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */
561     #define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */
562    
563     +/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */
564     +#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */
565     +
566     /* Virtualization flags: Linux defined, word 8 */
567     #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
568     #define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */
569     diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
570     index 12080d87da3b..2ed5a2b3f8f7 100644
571     --- a/arch/x86/include/asm/desc.h
572     +++ b/arch/x86/include/asm/desc.h
573     @@ -43,7 +43,7 @@ struct gdt_page {
574     struct desc_struct gdt[GDT_ENTRIES];
575     } __attribute__((aligned(PAGE_SIZE)));
576    
577     -DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
578     +DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page);
579    
580     static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
581     {
582     diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
583     index b90e1053049b..0817d63bce41 100644
584     --- a/arch/x86/include/asm/hw_irq.h
585     +++ b/arch/x86/include/asm/hw_irq.h
586     @@ -178,7 +178,7 @@ extern char irq_entries_start[];
587     #define VECTOR_RETRIGGERED ((void *)~0UL)
588    
589     typedef struct irq_desc* vector_irq_t[NR_VECTORS];
590     -DECLARE_PER_CPU(vector_irq_t, vector_irq);
591     +DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq);
592    
593     #endif /* !ASSEMBLY_ */
594    
595     diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h
596     new file mode 100644
597     index 000000000000..802bbbdfe143
598     --- /dev/null
599     +++ b/arch/x86/include/asm/kaiser.h
600     @@ -0,0 +1,141 @@
601     +#ifndef _ASM_X86_KAISER_H
602     +#define _ASM_X86_KAISER_H
603     +
604     +#include <uapi/asm/processor-flags.h> /* For PCID constants */
605     +
606     +/*
607     + * This file includes the definitions for the KAISER feature.
608     + * KAISER is a counter measure against x86_64 side channel attacks on
609     + * the kernel virtual memory. It has a shadow pgd for every process: the
610     + * shadow pgd has a minimalistic kernel-set mapped, but includes the whole
611     + * user memory. Within a kernel context switch, or when an interrupt is handled,
612     + * the pgd is switched to the normal one. When the system switches to user mode,
613     + * the shadow pgd is enabled. By this, the virtual memory caches are freed,
614     + * and the user may not attack the whole kernel memory.
615     + *
616     + * A minimalistic kernel mapping holds the parts needed to be mapped in user
617     + * mode, such as the entry/exit functions of the user space, or the stacks.
618     + */
619     +
620     +#define KAISER_SHADOW_PGD_OFFSET 0x1000
621     +
622     +#ifdef __ASSEMBLY__
623     +#ifdef CONFIG_PAGE_TABLE_ISOLATION
624     +
625     +.macro _SWITCH_TO_KERNEL_CR3 reg
626     +movq %cr3, \reg
627     +andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg
628     +/* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */
629     +ALTERNATIVE "", "bts $63, \reg", X86_FEATURE_PCID
630     +movq \reg, %cr3
631     +.endm
632     +
633     +.macro _SWITCH_TO_USER_CR3 reg regb
634     +/*
635     + * regb must be the low byte portion of reg: because we have arranged
636     + * for the low byte of the user PCID to serve as the high byte of NOFLUSH
637     + * (0x80 for each when PCID is enabled, or 0x00 when PCID and NOFLUSH are
638     + * not enabled): so that the one register can update both memory and cr3.
639     + */
640     +movq %cr3, \reg
641     +orq PER_CPU_VAR(x86_cr3_pcid_user), \reg
642     +js 9f
643     +/* If PCID enabled, FLUSH this time, reset to NOFLUSH for next time */
644     +movb \regb, PER_CPU_VAR(x86_cr3_pcid_user+7)
645     +9:
646     +movq \reg, %cr3
647     +.endm
648     +
649     +.macro SWITCH_KERNEL_CR3
650     +ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER
651     +_SWITCH_TO_KERNEL_CR3 %rax
652     +popq %rax
653     +8:
654     +.endm
655     +
656     +.macro SWITCH_USER_CR3
657     +ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER
658     +_SWITCH_TO_USER_CR3 %rax %al
659     +popq %rax
660     +8:
661     +.endm
662     +
663     +.macro SWITCH_KERNEL_CR3_NO_STACK
664     +ALTERNATIVE "jmp 8f", \
665     + __stringify(movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)), \
666     + X86_FEATURE_KAISER
667     +_SWITCH_TO_KERNEL_CR3 %rax
668     +movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
669     +8:
670     +.endm
671     +
672     +#else /* CONFIG_PAGE_TABLE_ISOLATION */
673     +
674     +.macro SWITCH_KERNEL_CR3
675     +.endm
676     +.macro SWITCH_USER_CR3
677     +.endm
678     +.macro SWITCH_KERNEL_CR3_NO_STACK
679     +.endm
680     +
681     +#endif /* CONFIG_PAGE_TABLE_ISOLATION */
682     +
683     +#else /* __ASSEMBLY__ */
684     +
685     +#ifdef CONFIG_PAGE_TABLE_ISOLATION
686     +/*
687     + * Upon kernel/user mode switch, it may happen that the address
688     + * space has to be switched before the registers have been
689     + * stored. To change the address space, another register is
690     + * needed. A register therefore has to be stored/restored.
691     +*/
692     +DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
693     +
694     +DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user);
695     +
696     +extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
697     +
698     +extern int kaiser_enabled;
699     +extern void __init kaiser_check_boottime_disable(void);
700     +#else
701     +#define kaiser_enabled 0
702     +static inline void __init kaiser_check_boottime_disable(void) {}
703     +#endif /* CONFIG_PAGE_TABLE_ISOLATION */
704     +
705     +/*
706     + * Kaiser function prototypes are needed even when CONFIG_PAGE_TABLE_ISOLATION is not set,
707     + * so as to build with tests on kaiser_enabled instead of #ifdefs.
708     + */
709     +
710     +/**
711     + * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
712     + * @addr: the start address of the range
713     + * @size: the size of the range
714     + * @flags: The mapping flags of the pages
715     + *
716     + * The mapping is done on a global scope, so no bigger
717     + * synchronization has to be done. The pages have to be
718     + * manually unmapped again when they are not needed any longer.
719     + */
720     +extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
721     +
722     +/**
723     + * kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping
724     + * @addr: the start address of the range
725     + * @size: the size of the range
726     + */
727     +extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
728     +
729     +/**
730     + * kaiser_init - Initialize the shadow mapping
731     + *
732     + * Most parts of the shadow mapping can be mapped upon boot
733     + * time. Only per-process things like the thread stacks
734     + * or a new LDT have to be mapped at runtime. These boot-
735     + * time mappings are permanent and never unmapped.
736     + */
737     +extern void kaiser_init(void);
738     +
739     +#endif /* __ASSEMBLY */
740     +
741     +#endif /* _ASM_X86_KAISER_H */
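
The _SWITCH_TO_USER_CR3 comment above relies on x86-64 being little-endian: storing the low byte of the user PCID (0x80 when PCID is in use) at byte offset 7 of x86_cr3_pcid_user is the same as setting bit 63, the NOFLUSH bit, so the first return to user space after a flush request does a flushing CR3 write and later ones do not. A standalone C sketch of that trick, using constants from this patch and covering only the PCID-enabled case:

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    #define KAISER_SHADOW_PGD_OFFSET 0x1000UL
    #define X86_CR3_PCID_ASID_USER   0x80UL
    #define X86_CR3_PCID_NOFLUSH     (1ULL << 63)

    int main(void)
    {
        /* Value left behind when a flush of the user ASID was requested:
         * shadow-pgd offset + user ASID, NOFLUSH clear.                  */
        uint64_t x86_cr3_pcid_user = KAISER_SHADOW_PGD_OFFSET | X86_CR3_PCID_ASID_USER;

        printf("before: %#llx (NOFLUSH %s)\n",
               (unsigned long long)x86_cr3_pcid_user,
               (x86_cr3_pcid_user & X86_CR3_PCID_NOFLUSH) ? "set" : "clear");

        /* movb %al, x86_cr3_pcid_user+7: the low byte of the OR result
         * (0x80 with PCID on) lands in the top byte, i.e. bit 63.       */
        uint8_t al = X86_CR3_PCID_ASID_USER;
        memcpy((uint8_t *)&x86_cr3_pcid_user + 7, &al, 1);   /* little-endian */

        printf("after:  %#llx (NOFLUSH %s)\n",
               (unsigned long long)x86_cr3_pcid_user,
               (x86_cr3_pcid_user & X86_CR3_PCID_NOFLUSH) ? "set" : "clear");
        return 0;
    }
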
742     diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
743     index 437feb436efa..2536f90cd30c 100644
744     --- a/arch/x86/include/asm/pgtable.h
745     +++ b/arch/x86/include/asm/pgtable.h
746     @@ -18,6 +18,12 @@
747     #ifndef __ASSEMBLY__
748     #include <asm/x86_init.h>
749    
750     +#ifdef CONFIG_PAGE_TABLE_ISOLATION
751     +extern int kaiser_enabled;
752     +#else
753     +#define kaiser_enabled 0
754     +#endif
755     +
756     void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
757     void ptdump_walk_pgd_level_checkwx(void);
758    
759     @@ -690,7 +696,17 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)
760    
761     static inline int pgd_bad(pgd_t pgd)
762     {
763     - return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
764     + pgdval_t ignore_flags = _PAGE_USER;
765     + /*
766     + * We set NX on KAISER pgds that map userspace memory so
767     + * that userspace can not meaningfully use the kernel
768     + * page table by accident; it will fault on the first
769     + * instruction it tries to run. See native_set_pgd().
770     + */
771     + if (kaiser_enabled)
772     + ignore_flags |= _PAGE_NX;
773     +
774     + return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
775     }
776    
777     static inline int pgd_none(pgd_t pgd)
778     @@ -903,7 +919,15 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
779     */
780     static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
781     {
782     - memcpy(dst, src, count * sizeof(pgd_t));
783     + memcpy(dst, src, count * sizeof(pgd_t));
784     +#ifdef CONFIG_PAGE_TABLE_ISOLATION
785     + if (kaiser_enabled) {
786     + /* Clone the shadow pgd part as well */
787     + memcpy(native_get_shadow_pgd(dst),
788     + native_get_shadow_pgd(src),
789     + count * sizeof(pgd_t));
790     + }
791     +#endif
792     }
793    
794     #define PTE_SHIFT ilog2(PTRS_PER_PTE)
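
A userspace sketch of the pgd_bad() change above, using the conventional x86 flag values (_KERNPG_TABLE = present|rw|accessed|dirty): a pgd entry that maps userspace and carries the _PAGE_NX that native_set_pgd() applies under KAISER only passes the check once kaiser_enabled widens the ignore mask.

    #include <stdio.h>
    #include <stdint.h>

    #define _PAGE_PRESENT  0x001ULL
    #define _PAGE_RW       0x002ULL
    #define _PAGE_USER     0x004ULL
    #define _PAGE_ACCESSED 0x020ULL
    #define _PAGE_DIRTY    0x040ULL
    #define _PAGE_NX       (1ULL << 63)
    #define _KERNPG_TABLE  (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)

    static int pgd_bad_demo(uint64_t pgd_flags, int kaiser_enabled)
    {
        uint64_t ignore_flags = _PAGE_USER;

        if (kaiser_enabled)
            ignore_flags |= _PAGE_NX;   /* NX is expected on KAISER pgds */

        return (pgd_flags & ~ignore_flags) != _KERNPG_TABLE;
    }

    int main(void)
    {
        uint64_t kaiser_pgd = _KERNPG_TABLE | _PAGE_USER | _PAGE_NX;

        printf("kaiser_enabled=0: pgd_bad=%d\n", pgd_bad_demo(kaiser_pgd, 0));
        printf("kaiser_enabled=1: pgd_bad=%d\n", pgd_bad_demo(kaiser_pgd, 1));
        return 0;
    }
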
795     diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
796     index 1cc82ece9ac1..ce97c8c6a310 100644
797     --- a/arch/x86/include/asm/pgtable_64.h
798     +++ b/arch/x86/include/asm/pgtable_64.h
799     @@ -106,9 +106,32 @@ static inline void native_pud_clear(pud_t *pud)
800     native_set_pud(pud, native_make_pud(0));
801     }
802    
803     +#ifdef CONFIG_PAGE_TABLE_ISOLATION
804     +extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd);
805     +
806     +static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
807     +{
808     +#ifdef CONFIG_DEBUG_VM
809     + /* linux/mmdebug.h may not have been included at this point */
810     + BUG_ON(!kaiser_enabled);
811     +#endif
812     + return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE);
813     +}
814     +#else
815     +static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
816     +{
817     + return pgd;
818     +}
819     +static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
820     +{
821     + BUILD_BUG_ON(1);
822     + return NULL;
823     +}
824     +#endif /* CONFIG_PAGE_TABLE_ISOLATION */
825     +
826     static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
827     {
828     - *pgdp = pgd;
829     + *pgdp = kaiser_set_shadow_pgd(pgdp, pgd);
830     }
831    
832     static inline void native_pgd_clear(pgd_t *pgd)
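
native_get_shadow_pgd() above works only because each top-level pgd is laid out as an 8 KiB, 8 KiB-aligned pair (see the NEXT_PGD_PAGE changes in head_64.S further down), so OR-ing PAGE_SIZE into the kernel pgd pointer lands on the shadow copy in the second page. A quick userspace illustration of the address arithmetic, with aligned_alloc standing in for the kernel's allocation:

    #include <stdio.h>
    #include <stdint.h>
    #include <stdlib.h>

    #define PAGE_SIZE 4096UL

    int main(void)
    {
        /* Stand-in for the kernel's 8 KiB, 8 KiB-aligned pgd pair. */
        void *pgd = aligned_alloc(2 * PAGE_SIZE, 2 * PAGE_SIZE);
        if (!pgd)
            return 1;

        /* native_get_shadow_pgd(): the shadow (user) pgd is simply the
         * second page of the pair, so OR-ing in PAGE_SIZE finds it.    */
        void *shadow = (void *)((uintptr_t)pgd | PAGE_SIZE);

        printf("kernel pgd  %p\n", pgd);
        printf("shadow pgd  %p (+%lu bytes)\n", shadow,
               (unsigned long)((uintptr_t)shadow - (uintptr_t)pgd));

        free(pgd);
        return 0;
    }
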
833     diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
834     index 8b4de22d6429..f1c8ac468292 100644
835     --- a/arch/x86/include/asm/pgtable_types.h
836     +++ b/arch/x86/include/asm/pgtable_types.h
837     @@ -119,7 +119,7 @@
838     #define _PAGE_DEVMAP (_AT(pteval_t, 0))
839     #endif
840    
841     -#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
842     +#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
843    
844     #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
845     _PAGE_ACCESSED | _PAGE_DIRTY)
846     @@ -137,6 +137,33 @@
847     _PAGE_SOFT_DIRTY)
848     #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
849    
850     +/* The ASID is the lower 12 bits of CR3 */
851     +#define X86_CR3_PCID_ASID_MASK (_AC((1<<12)-1,UL))
852     +
853     +/* Mask for all the PCID-related bits in CR3: */
854     +#define X86_CR3_PCID_MASK (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK)
855     +#define X86_CR3_PCID_ASID_KERN (_AC(0x0,UL))
856     +
857     +#if defined(CONFIG_PAGE_TABLE_ISOLATION) && defined(CONFIG_X86_64)
858     +/* Let X86_CR3_PCID_ASID_USER be usable for the X86_CR3_PCID_NOFLUSH bit */
859     +#define X86_CR3_PCID_ASID_USER (_AC(0x80,UL))
860     +
861     +#define X86_CR3_PCID_KERN_FLUSH (X86_CR3_PCID_ASID_KERN)
862     +#define X86_CR3_PCID_USER_FLUSH (X86_CR3_PCID_ASID_USER)
863     +#define X86_CR3_PCID_KERN_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_KERN)
864     +#define X86_CR3_PCID_USER_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_USER)
865     +#else
866     +#define X86_CR3_PCID_ASID_USER (_AC(0x0,UL))
867     +/*
868     + * PCIDs are unsupported on 32-bit and none of these bits can be
869     + * set in CR3:
870     + */
871     +#define X86_CR3_PCID_KERN_FLUSH (0)
872     +#define X86_CR3_PCID_USER_FLUSH (0)
873     +#define X86_CR3_PCID_KERN_NOFLUSH (0)
874     +#define X86_CR3_PCID_USER_NOFLUSH (0)
875     +#endif
876     +
877     /*
878     * The cache modes defined here are used to translate between pure SW usage
879     * and the HW defined cache mode bits and/or PAT entries.
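
Putting the new constants together: a user-mode CR3 is the kernel pgd's physical address plus KAISER_SHADOW_PGD_OFFSET with ASID 0x80 in the low 12 bits (and possibly the NOFLUSH bit), and _SWITCH_TO_KERNEL_CR3 gets back by clearing the shadow offset and the ASID bits. A small worked example in C using the constants from this patch; the pgd physical address is made up.

    #include <stdio.h>
    #include <stdint.h>

    #define X86_CR3_PCID_ASID_MASK   0xFFFULL        /* low 12 bits: ASID        */
    #define X86_CR3_PCID_ASID_KERN   0x00ULL
    #define X86_CR3_PCID_ASID_USER   0x80ULL
    #define X86_CR3_PCID_NOFLUSH     (1ULL << 63)
    #define KAISER_SHADOW_PGD_OFFSET 0x1000ULL       /* user pgd = kernel pgd + 4 KiB */

    int main(void)
    {
        uint64_t pgd_pa = 0x1a2b4000ULL;  /* hypothetical 8 KiB-aligned pgd pair */

        uint64_t kern_cr3 = pgd_pa | X86_CR3_PCID_ASID_KERN;
        uint64_t user_cr3 = pgd_pa | KAISER_SHADOW_PGD_OFFSET | X86_CR3_PCID_ASID_USER;

        printf("kernel CR3: %#llx\n", (unsigned long long)kern_cr3);
        printf("user   CR3: %#llx (no flush: %#llx)\n",
               (unsigned long long)user_cr3,
               (unsigned long long)(user_cr3 | X86_CR3_PCID_NOFLUSH));

        /* What _SWITCH_TO_KERNEL_CR3 does: clear the shadow offset and ASID. */
        uint64_t back = user_cr3 & ~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET);
        printf("back to kernel CR3: %#llx\n", (unsigned long long)back);
        return 0;
    }
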
880     diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
881     index 83db0eae9979..8cb52ee3ade6 100644
882     --- a/arch/x86/include/asm/processor.h
883     +++ b/arch/x86/include/asm/processor.h
884     @@ -308,7 +308,7 @@ struct tss_struct {
885    
886     } ____cacheline_aligned;
887    
888     -DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
889     +DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss);
890    
891     #ifdef CONFIG_X86_32
892     DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
893     diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
894     index 7d2ea6b1f7d9..94146f665a3c 100644
895     --- a/arch/x86/include/asm/tlbflush.h
896     +++ b/arch/x86/include/asm/tlbflush.h
897     @@ -132,6 +132,24 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
898     cr4_set_bits(mask);
899     }
900    
901     +/*
902     + * Declare a couple of kaiser interfaces here for convenience,
903     + * to avoid the need for asm/kaiser.h in unexpected places.
904     + */
905     +#ifdef CONFIG_PAGE_TABLE_ISOLATION
906     +extern int kaiser_enabled;
907     +extern void kaiser_setup_pcid(void);
908     +extern void kaiser_flush_tlb_on_return_to_user(void);
909     +#else
910     +#define kaiser_enabled 0
911     +static inline void kaiser_setup_pcid(void)
912     +{
913     +}
914     +static inline void kaiser_flush_tlb_on_return_to_user(void)
915     +{
916     +}
917     +#endif
918     +
919     static inline void __native_flush_tlb(void)
920     {
921     /*
922     @@ -140,6 +158,8 @@ static inline void __native_flush_tlb(void)
923     * back:
924     */
925     preempt_disable();
926     + if (kaiser_enabled)
927     + kaiser_flush_tlb_on_return_to_user();
928     native_write_cr3(native_read_cr3());
929     preempt_enable();
930     }
931     @@ -149,20 +169,27 @@ static inline void __native_flush_tlb_global_irq_disabled(void)
932     unsigned long cr4;
933    
934     cr4 = this_cpu_read(cpu_tlbstate.cr4);
935     - /* clear PGE */
936     - native_write_cr4(cr4 & ~X86_CR4_PGE);
937     - /* write old PGE again and flush TLBs */
938     - native_write_cr4(cr4);
939     + if (cr4 & X86_CR4_PGE) {
940     + /* clear PGE and flush TLB of all entries */
941     + native_write_cr4(cr4 & ~X86_CR4_PGE);
942     + /* restore PGE as it was before */
943     + native_write_cr4(cr4);
944     + } else {
945     + /* do it with cr3, letting kaiser flush user PCID */
946     + __native_flush_tlb();
947     + }
948     }
949    
950     static inline void __native_flush_tlb_global(void)
951     {
952     unsigned long flags;
953    
954     - if (static_cpu_has(X86_FEATURE_INVPCID)) {
955     + if (this_cpu_has(X86_FEATURE_INVPCID)) {
956     /*
957     * Using INVPCID is considerably faster than a pair of writes
958     * to CR4 sandwiched inside an IRQ flag save/restore.
959     + *
960     + * Note, this works with CR4.PCIDE=0 or 1.
961     */
962     invpcid_flush_all();
963     return;
964     @@ -174,24 +201,45 @@ static inline void __native_flush_tlb_global(void)
965     * be called from deep inside debugging code.)
966     */
967     raw_local_irq_save(flags);
968     -
969     __native_flush_tlb_global_irq_disabled();
970     -
971     raw_local_irq_restore(flags);
972     }
973    
974     static inline void __native_flush_tlb_single(unsigned long addr)
975     {
976     - asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
977     + /*
978     + * SIMICS #GP's if you run INVPCID with type 2/3
979     + * and X86_CR4_PCIDE clear. Shame!
980     + *
981     + * The ASIDs used below are hard-coded. But, we must not
982     + * call invpcid(type=1/2) before CR4.PCIDE=1. Just call
983     + * invlpg in the case we are called early.
984     + */
985     +
986     + if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) {
987     + if (kaiser_enabled)
988     + kaiser_flush_tlb_on_return_to_user();
989     + asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
990     + return;
991     + }
992     + /* Flush the address out of both PCIDs. */
993     + /*
994     + * An optimization here might be to determine addresses
995     + * that are only kernel-mapped and only flush the kernel
996     + * ASID. But, userspace flushes are probably much more
997     + * important performance-wise.
998     + *
999     + * Make sure to do only a single invpcid when KAISER is
1000     + * disabled and we have only a single ASID.
1001     + */
1002     + if (kaiser_enabled)
1003     + invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr);
1004     + invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr);
1005     }
1006    
1007     static inline void __flush_tlb_all(void)
1008     {
1009     - if (boot_cpu_has(X86_FEATURE_PGE))
1010     - __flush_tlb_global();
1011     - else
1012     - __flush_tlb();
1013     -
1014     + __flush_tlb_global();
1015     /*
1016     * Note: if we somehow had PCID but not PGE, then this wouldn't work --
1017     * we'd end up flushing kernel translations for the current ASID but
1018     diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
1019     index 567de50a4c2a..6768d1321016 100644
1020     --- a/arch/x86/include/uapi/asm/processor-flags.h
1021     +++ b/arch/x86/include/uapi/asm/processor-flags.h
1022     @@ -77,7 +77,8 @@
1023     #define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT)
1024     #define X86_CR3_PCD_BIT 4 /* Page Cache Disable */
1025     #define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT)
1026     -#define X86_CR3_PCID_MASK _AC(0x00000fff,UL) /* PCID Mask */
1027     +#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
1028     +#define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT)
1029    
1030     /*
1031     * Intel CPU features in CR4
1032     diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
1033     index 91588be529b9..918e44772b04 100644
1034     --- a/arch/x86/kernel/cpu/common.c
1035     +++ b/arch/x86/kernel/cpu/common.c
1036     @@ -93,7 +93,7 @@ static const struct cpu_dev default_cpu = {
1037    
1038     static const struct cpu_dev *this_cpu = &default_cpu;
1039    
1040     -DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
1041     +DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = {
1042     #ifdef CONFIG_X86_64
1043     /*
1044     * We need valid kernel segments for data and code in long mode too
1045     @@ -327,8 +327,21 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
1046     static void setup_pcid(struct cpuinfo_x86 *c)
1047     {
1048     if (cpu_has(c, X86_FEATURE_PCID)) {
1049     - if (cpu_has(c, X86_FEATURE_PGE)) {
1050     + if (cpu_has(c, X86_FEATURE_PGE) || kaiser_enabled) {
1051     cr4_set_bits(X86_CR4_PCIDE);
1052     + /*
1053     + * INVPCID has two "groups" of types:
1054     + * 1/2: Invalidate an individual address
1055     + * 3/4: Invalidate all contexts
1056     + *
1057     + * 1/2 take a PCID, but 3/4 do not. So, 3/4
1058     + * ignore the PCID argument in the descriptor.
1059     + * But, we have to be careful not to call 1/2
1060     + * with an actual non-zero PCID in them before
1061     + * we do the above cr4_set_bits().
1062     + */
1063     + if (cpu_has(c, X86_FEATURE_INVPCID))
1064     + set_cpu_cap(c, X86_FEATURE_INVPCID_SINGLE);
1065     } else {
1066     /*
1067     * flush_tlb_all(), as currently implemented, won't
1068     @@ -341,6 +354,7 @@ static void setup_pcid(struct cpuinfo_x86 *c)
1069     clear_cpu_cap(c, X86_FEATURE_PCID);
1070     }
1071     }
1072     + kaiser_setup_pcid();
1073     }
1074    
1075     /*
1076     @@ -1365,7 +1379,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
1077     [DEBUG_STACK - 1] = DEBUG_STKSZ
1078     };
1079    
1080     -static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
1081     +DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks
1082     [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
1083    
1084     /* May not be marked __init: used by software suspend */
1085     @@ -1523,6 +1537,14 @@ void cpu_init(void)
1086     * try to read it.
1087     */
1088     cr4_init_shadow();
1089     + if (!kaiser_enabled) {
1090     + /*
1091     + * secondary_startup_64() deferred setting PGE in cr4:
1092     + * probe_page_size_mask() sets it on the boot cpu,
1093     + * but it needs to be set on each secondary cpu.
1094     + */
1095     + cr4_set_bits(X86_CR4_PGE);
1096     + }
1097    
1098     /*
1099     * Load microcode on this cpu if a valid microcode is available.
1100     diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
1101     index 04f89caef9c4..e33b38541be3 100644
1102     --- a/arch/x86/kernel/espfix_64.c
1103     +++ b/arch/x86/kernel/espfix_64.c
1104     @@ -41,6 +41,7 @@
1105     #include <asm/pgalloc.h>
1106     #include <asm/setup.h>
1107     #include <asm/espfix.h>
1108     +#include <asm/kaiser.h>
1109    
1110     /*
1111     * Note: we only need 6*8 = 48 bytes for the espfix stack, but round
1112     @@ -126,6 +127,15 @@ void __init init_espfix_bsp(void)
1113     /* Install the espfix pud into the kernel page directory */
1114     pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
1115     pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
1116     + /*
1117     + * Just copy the top-level PGD that is mapping the espfix
1118     + * area to ensure it is mapped into the shadow user page
1119     + * tables.
1120     + */
1121     + if (kaiser_enabled) {
1122     + set_pgd(native_get_shadow_pgd(pgd_p),
1123     + __pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page)));
1124     + }
1125    
1126     /* Randomize the locations */
1127     init_espfix_random();
1128     diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
1129     index b4421cc191b0..67cd7c1b99da 100644
1130     --- a/arch/x86/kernel/head_64.S
1131     +++ b/arch/x86/kernel/head_64.S
1132     @@ -190,8 +190,8 @@ ENTRY(secondary_startup_64)
1133     movq $(init_level4_pgt - __START_KERNEL_map), %rax
1134     1:
1135    
1136     - /* Enable PAE mode and PGE */
1137     - movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
1138     + /* Enable PAE and PSE, but defer PGE until kaiser_enabled is decided */
1139     + movl $(X86_CR4_PAE | X86_CR4_PSE), %ecx
1140     movq %rcx, %cr4
1141    
1142     /* Setup early boot stage 4 level pagetables. */
1143     @@ -405,6 +405,27 @@ GLOBAL(early_recursion_flag)
1144     .balign PAGE_SIZE; \
1145     GLOBAL(name)
1146    
1147     +#ifdef CONFIG_PAGE_TABLE_ISOLATION
1148     +/*
1149     + * Each PGD needs to be 8k long and 8k aligned. We do not
1150     + * ever go out to userspace with these, so we do not
1151     + * strictly *need* the second page, but this allows us to
1152     + * have a single set_pgd() implementation that does not
1153     + * need to worry about whether it has 4k or 8k to work
1154     + * with.
1155     + *
1156     + * This ensures PGDs are 8k long:
1157     + */
1158     +#define KAISER_USER_PGD_FILL 512
1159     +/* This ensures they are 8k-aligned: */
1160     +#define NEXT_PGD_PAGE(name) \
1161     + .balign 2 * PAGE_SIZE; \
1162     +GLOBAL(name)
1163     +#else
1164     +#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
1165     +#define KAISER_USER_PGD_FILL 0
1166     +#endif
1167     +
1168     /* Automate the creation of 1 to 1 mapping pmd entries */
1169     #define PMDS(START, PERM, COUNT) \
1170     i = 0 ; \
1171     @@ -414,9 +435,10 @@ GLOBAL(name)
1172     .endr
1173    
1174     __INITDATA
1175     -NEXT_PAGE(early_level4_pgt)
1176     +NEXT_PGD_PAGE(early_level4_pgt)
1177     .fill 511,8,0
1178     .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
1179     + .fill KAISER_USER_PGD_FILL,8,0
1180    
1181     NEXT_PAGE(early_dynamic_pgts)
1182     .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
1183     @@ -424,16 +446,18 @@ NEXT_PAGE(early_dynamic_pgts)
1184     .data
1185    
1186     #ifndef CONFIG_XEN
1187     -NEXT_PAGE(init_level4_pgt)
1188     +NEXT_PGD_PAGE(init_level4_pgt)
1189     .fill 512,8,0
1190     + .fill KAISER_USER_PGD_FILL,8,0
1191     #else
1192     -NEXT_PAGE(init_level4_pgt)
1193     +NEXT_PGD_PAGE(init_level4_pgt)
1194     .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
1195     .org init_level4_pgt + L4_PAGE_OFFSET*8, 0
1196     .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
1197     .org init_level4_pgt + L4_START_KERNEL*8, 0
1198     /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
1199     .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
1200     + .fill KAISER_USER_PGD_FILL,8,0
1201    
1202     NEXT_PAGE(level3_ident_pgt)
1203     .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
1204     @@ -444,6 +468,7 @@ NEXT_PAGE(level2_ident_pgt)
1205     */
1206     PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
1207     #endif
1208     + .fill KAISER_USER_PGD_FILL,8,0
1209    
1210     NEXT_PAGE(level3_kernel_pgt)
1211     .fill L3_START_KERNEL,8,0
1212     diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
1213     index 1423ab1b0312..f480b38a03c3 100644
1214     --- a/arch/x86/kernel/irqinit.c
1215     +++ b/arch/x86/kernel/irqinit.c
1216     @@ -51,7 +51,7 @@ static struct irqaction irq2 = {
1217     .flags = IRQF_NO_THREAD,
1218     };
1219    
1220     -DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
1221     +DEFINE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq) = {
1222     [0 ... NR_VECTORS - 1] = VECTOR_UNUSED,
1223     };
1224    
1225     diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
1226     index 5f70014ca602..8bc68cfc0d33 100644
1227     --- a/arch/x86/kernel/ldt.c
1228     +++ b/arch/x86/kernel/ldt.c
1229     @@ -16,6 +16,7 @@
1230     #include <linux/slab.h>
1231     #include <linux/vmalloc.h>
1232     #include <linux/uaccess.h>
1233     +#include <linux/kaiser.h>
1234    
1235     #include <asm/ldt.h>
1236     #include <asm/desc.h>
1237     @@ -34,11 +35,21 @@ static void flush_ldt(void *current_mm)
1238     set_ldt(pc->ldt->entries, pc->ldt->size);
1239     }
1240    
1241     +static void __free_ldt_struct(struct ldt_struct *ldt)
1242     +{
1243     + if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
1244     + vfree(ldt->entries);
1245     + else
1246     + free_page((unsigned long)ldt->entries);
1247     + kfree(ldt);
1248     +}
1249     +
1250     /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
1251     static struct ldt_struct *alloc_ldt_struct(int size)
1252     {
1253     struct ldt_struct *new_ldt;
1254     int alloc_size;
1255     + int ret;
1256    
1257     if (size > LDT_ENTRIES)
1258     return NULL;
1259     @@ -66,7 +77,13 @@ static struct ldt_struct *alloc_ldt_struct(int size)
1260     return NULL;
1261     }
1262    
1263     + ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size,
1264     + __PAGE_KERNEL);
1265     new_ldt->size = size;
1266     + if (ret) {
1267     + __free_ldt_struct(new_ldt);
1268     + return NULL;
1269     + }
1270     return new_ldt;
1271     }
1272    
1273     @@ -92,12 +109,10 @@ static void free_ldt_struct(struct ldt_struct *ldt)
1274     if (likely(!ldt))
1275     return;
1276    
1277     + kaiser_remove_mapping((unsigned long)ldt->entries,
1278     + ldt->size * LDT_ENTRY_SIZE);
1279     paravirt_free_ldt(ldt->entries, ldt->size);
1280     - if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
1281     - vfree(ldt->entries);
1282     - else
1283     - free_page((unsigned long)ldt->entries);
1284     - kfree(ldt);
1285     + __free_ldt_struct(ldt);
1286     }
1287    
1288     /*
1289     diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
1290     index bb3840cedb4f..ee43b36075c7 100644
1291     --- a/arch/x86/kernel/paravirt_patch_64.c
1292     +++ b/arch/x86/kernel/paravirt_patch_64.c
1293     @@ -9,7 +9,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
1294     DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
1295     DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
1296     DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
1297     -DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
1298     DEF_NATIVE(pv_cpu_ops, clts, "clts");
1299     DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
1300    
1301     @@ -59,7 +58,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
1302     PATCH_SITE(pv_mmu_ops, read_cr3);
1303     PATCH_SITE(pv_mmu_ops, write_cr3);
1304     PATCH_SITE(pv_cpu_ops, clts);
1305     - PATCH_SITE(pv_mmu_ops, flush_tlb_single);
1306     PATCH_SITE(pv_cpu_ops, wbinvd);
1307     #if defined(CONFIG_PARAVIRT_SPINLOCKS)
1308     case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
1309     diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
1310     index 8e10e72bf6ee..a55b32007785 100644
1311     --- a/arch/x86/kernel/process.c
1312     +++ b/arch/x86/kernel/process.c
1313     @@ -41,7 +41,7 @@
1314     * section. Since TSS's are completely CPU-local, we want them
1315     * on exact cacheline boundaries, to eliminate cacheline ping-pong.
1316     */
1317     -__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
1318     +__visible DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss) = {
1319     .x86_tss = {
1320     .sp0 = TOP_OF_INIT_STACK,
1321     #ifdef CONFIG_X86_32
1322     diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
1323     index feaab07fa124..6b55012d02a3 100644
1324     --- a/arch/x86/kernel/setup.c
1325     +++ b/arch/x86/kernel/setup.c
1326     @@ -114,6 +114,7 @@
1327     #include <asm/microcode.h>
1328     #include <asm/mmu_context.h>
1329     #include <asm/kaslr.h>
1330     +#include <asm/kaiser.h>
1331    
1332     /*
1333     * max_low_pfn_mapped: highest direct mapped pfn under 4GB
1334     @@ -1019,6 +1020,12 @@ void __init setup_arch(char **cmdline_p)
1335     */
1336     init_hypervisor_platform();
1337    
1338     + /*
1339     + * This needs to happen right after XENPV is set on xen and
1340     + * kaiser_enabled is checked below in cleanup_highmap().
1341     + */
1342     + kaiser_check_boottime_disable();
1343     +
1344     x86_init.resources.probe_roms();
1345    
1346     /* after parse_early_param, so could debug it */
1347     diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c
1348     index 1c113db9ed57..2bb5ee464df3 100644
1349     --- a/arch/x86/kernel/tracepoint.c
1350     +++ b/arch/x86/kernel/tracepoint.c
1351     @@ -9,10 +9,12 @@
1352     #include <linux/atomic.h>
1353    
1354     atomic_t trace_idt_ctr = ATOMIC_INIT(0);
1355     +__aligned(PAGE_SIZE)
1356     struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1,
1357     (unsigned long) trace_idt_table };
1358    
1359     /* No need to be aligned, but done to keep all IDTs defined the same way. */
1360     +__aligned(PAGE_SIZE)
1361     gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss;
1362    
1363     static int trace_irq_vector_refcount;
1364     diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
1365     index 7e28e6c877d9..73304b1a03cc 100644
1366     --- a/arch/x86/kvm/x86.c
1367     +++ b/arch/x86/kvm/x86.c
1368     @@ -773,7 +773,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1369     return 1;
1370    
1371     /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
1372     - if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
1373     + if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_ASID_MASK) ||
1374     + !is_long_mode(vcpu))
1375     return 1;
1376     }
1377    
1378     diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c
1379     index 5cc78bf57232..3261abb21ef4 100644
1380     --- a/arch/x86/lib/cmdline.c
1381     +++ b/arch/x86/lib/cmdline.c
1382     @@ -104,7 +104,112 @@ __cmdline_find_option_bool(const char *cmdline, int max_cmdline_size,
1383     return 0; /* Buffer overrun */
1384     }
1385    
1386     +/*
1387     + * Find a non-boolean option (i.e. option=argument). In accordance with
1388     + * standard Linux practice, if this option is repeated, this returns the
1389     + * last instance on the command line.
1390     + *
1391     + * @cmdline: the cmdline string
1392     + * @max_cmdline_size: the maximum size of cmdline
1393     + * @option: option string to look for
1394     + * @buffer: memory buffer to return the option argument
1395     + * @bufsize: size of the supplied memory buffer
1396     + *
1397     + * Returns the length of the argument (regardless of if it was
1398     + * truncated to fit in the buffer), or -1 on not found.
1399     + */
1400     +static int
1401     +__cmdline_find_option(const char *cmdline, int max_cmdline_size,
1402     + const char *option, char *buffer, int bufsize)
1403     +{
1404     + char c;
1405     + int pos = 0, len = -1;
1406     + const char *opptr = NULL;
1407     + char *bufptr = buffer;
1408     + enum {
1409     + st_wordstart = 0, /* Start of word/after whitespace */
1410     + st_wordcmp, /* Comparing this word */
1411     + st_wordskip, /* Miscompare, skip */
1412     + st_bufcpy, /* Copying this to buffer */
1413     + } state = st_wordstart;
1414     +
1415     + if (!cmdline)
1416     + return -1; /* No command line */
1417     +
1418     + /*
1419     + * This 'pos' check ensures we do not overrun
1420     + * a non-NULL-terminated 'cmdline'
1421     + */
1422     + while (pos++ < max_cmdline_size) {
1423     + c = *(char *)cmdline++;
1424     + if (!c)
1425     + break;
1426     +
1427     + switch (state) {
1428     + case st_wordstart:
1429     + if (myisspace(c))
1430     + break;
1431     +
1432     + state = st_wordcmp;
1433     + opptr = option;
1434     + /* fall through */
1435     +
1436     + case st_wordcmp:
1437     + if ((c == '=') && !*opptr) {
1438     + /*
1439     + * We matched all the way to the end of the
1440     + * option we were looking for, prepare to
1441     + * copy the argument.
1442     + */
1443     + len = 0;
1444     + bufptr = buffer;
1445     + state = st_bufcpy;
1446     + break;
1447     + } else if (c == *opptr++) {
1448     + /*
1449     + * We are currently matching, so continue
1450     + * to the next character on the cmdline.
1451     + */
1452     + break;
1453     + }
1454     + state = st_wordskip;
1455     + /* fall through */
1456     +
1457     + case st_wordskip:
1458     + if (myisspace(c))
1459     + state = st_wordstart;
1460     + break;
1461     +
1462     + case st_bufcpy:
1463     + if (myisspace(c)) {
1464     + state = st_wordstart;
1465     + } else {
1466     + /*
1467     + * Increment len, but don't overrun the
1468     + * supplied buffer and leave room for the
1469     + * NULL terminator.
1470     + */
1471     + if (++len < bufsize)
1472     + *bufptr++ = c;
1473     + }
1474     + break;
1475     + }
1476     + }
1477     +
1478     + if (bufsize)
1479     + *bufptr = '\0';
1480     +
1481     + return len;
1482     +}
1483     +
1484     int cmdline_find_option_bool(const char *cmdline, const char *option)
1485     {
1486     return __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option);
1487     }
1488     +
1489     +int cmdline_find_option(const char *cmdline, const char *option, char *buffer,
1490     + int bufsize)
1491     +{
1492     + return __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option,
1493     + buffer, bufsize);
1494     +}
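
The new __cmdline_find_option() above is a four-state scanner; the behaviour worth keeping in mind is that it does not stop at the first hit, so the last `option=value` on the command line wins, and the return value is the full length of the argument even when the copy into the caller's buffer had to be truncated. A rough userspace approximation, not the kernel code and without the max_cmdline_size guard, that shows the last-instance behaviour:

#include <stdio.h>
#include <string.h>

/* Simplified stand-in for __cmdline_find_option(): scans the whole
 * string, so a repeated option keeps overwriting the result. */
static int find_option(const char *cmdline, const char *option,
                       char *buf, int bufsize)
{
        int len = -1;
        const char *p = cmdline;
        size_t optlen = strlen(option);

        while (*p) {
                while (*p == ' ')               /* skip word separators */
                        p++;
                if (!strncmp(p, option, optlen) && p[optlen] == '=') {
                        const char *val = p + optlen + 1;

                        len = 0;
                        while (val[len] && val[len] != ' ')
                                len++;          /* full argument length */
                        if (bufsize > 0)
                                snprintf(buf, bufsize, "%.*s", len, val);
                }
                while (*p && *p != ' ')         /* skip rest of this word */
                        p++;
        }
        return len;
}

int main(void)
{
        char arg[5];
        int n = find_option("ro quiet pti=on pti=off", "pti", arg, sizeof(arg));

        printf("len=%d arg=%s\n", n, arg);      /* prints: len=3 arg=off */
        return 0;
}

Compiled on its own, this prints len=3 arg=off, i.e. the later pti=off overrides the earlier pti=on, which is the behaviour kaiser_check_boottime_disable() relies on later in this patch.
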
1495     diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
1496     index 96d2b847e09e..c548b46100cb 100644
1497     --- a/arch/x86/mm/Makefile
1498     +++ b/arch/x86/mm/Makefile
1499     @@ -37,5 +37,5 @@ obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
1500    
1501     obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
1502     obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
1503     -obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
1504     -
1505     +obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
1506     +obj-$(CONFIG_PAGE_TABLE_ISOLATION) += kaiser.o
1507     diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
1508     index 0381638168d1..1e779bca4f3e 100644
1509     --- a/arch/x86/mm/init.c
1510     +++ b/arch/x86/mm/init.c
1511     @@ -177,7 +177,7 @@ static void __init probe_page_size_mask(void)
1512     cr4_set_bits_and_update_boot(X86_CR4_PSE);
1513    
1514     /* Enable PGE if available */
1515     - if (boot_cpu_has(X86_FEATURE_PGE)) {
1516     + if (boot_cpu_has(X86_FEATURE_PGE) && !kaiser_enabled) {
1517     cr4_set_bits_and_update_boot(X86_CR4_PGE);
1518     __supported_pte_mask |= _PAGE_GLOBAL;
1519     } else
1520     diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
1521     index 3e27ded6ac65..7df8e3a79dc0 100644
1522     --- a/arch/x86/mm/init_64.c
1523     +++ b/arch/x86/mm/init_64.c
1524     @@ -324,6 +324,16 @@ void __init cleanup_highmap(void)
1525     continue;
1526     if (vaddr < (unsigned long) _text || vaddr > end)
1527     set_pmd(pmd, __pmd(0));
1528     + else if (kaiser_enabled) {
1529     + /*
1530     + * level2_kernel_pgt is initialized with _PAGE_GLOBAL:
1531     + * clear that now. This is not important, so long as
1532     + * CR4.PGE remains clear, but it removes an anomaly.
1533     + * Physical mapping setup below avoids _PAGE_GLOBAL
1534     + * by use of massage_pgprot() inside pfn_pte() etc.
1535     + */
1536     + set_pmd(pmd, pmd_clear_flags(*pmd, _PAGE_GLOBAL));
1537     + }
1538     }
1539     }
1540    
1541     diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
1542     new file mode 100644
1543     index 000000000000..d8376b4ad9f0
1544     --- /dev/null
1545     +++ b/arch/x86/mm/kaiser.c
1546     @@ -0,0 +1,455 @@
1547     +#include <linux/bug.h>
1548     +#include <linux/kernel.h>
1549     +#include <linux/errno.h>
1550     +#include <linux/string.h>
1551     +#include <linux/types.h>
1552     +#include <linux/bug.h>
1553     +#include <linux/init.h>
1554     +#include <linux/interrupt.h>
1555     +#include <linux/spinlock.h>
1556     +#include <linux/mm.h>
1557     +#include <linux/uaccess.h>
1558     +
1559     +#undef pr_fmt
1560     +#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt
1561     +
1562     +#include <asm/kaiser.h>
1563     +#include <asm/tlbflush.h> /* to verify its kaiser declarations */
1564     +#include <asm/pgtable.h>
1565     +#include <asm/pgalloc.h>
1566     +#include <asm/desc.h>
1567     +#include <asm/cmdline.h>
1568     +
1569     +int kaiser_enabled __read_mostly = 1;
1570     +EXPORT_SYMBOL(kaiser_enabled); /* for inlined TLB flush functions */
1571     +
1572     +__visible
1573     +DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
1574     +
1575     +/*
1576     + * These can have bit 63 set, so we can not just use a plain "or"
1577     + * instruction to get their value or'd into CR3. It would take
1578     + * another register. So, we use a memory reference to these instead.
1579     + *
1580     + * This is also handy because systems that do not support PCIDs
1581     + * just end up or'ing a 0 into their CR3, which does no harm.
1582     + */
1583     +DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user);
1584     +
1585     +/*
1586     + * At runtime, the only things we map are some things for CPU
1587     + * hotplug, and stacks for new processes. No two CPUs will ever
1588     + * be populating the same addresses, so we only need to ensure
1589     + * that we protect between two CPUs trying to allocate and
1590     + * populate the same page table page.
1591     + *
1592     + * Only take this lock when doing a set_p[4um]d(), but it is not
1593     + * needed for doing a set_pte(). We assume that only the *owner*
1594     + * of a given allocation will be doing this for _their_
1595     + * allocation.
1596     + *
1597     + * This ensures that once a system has been running for a while
1598     + * and there have been stacks all over and these page tables
1599     + * are fully populated, there will be no further acquisitions of
1600     + * this lock.
1601     + */
1602     +static DEFINE_SPINLOCK(shadow_table_allocation_lock);
1603     +
1604     +/*
1605     + * Returns -1 on error.
1606     + */
1607     +static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
1608     +{
1609     + pgd_t *pgd;
1610     + pud_t *pud;
1611     + pmd_t *pmd;
1612     + pte_t *pte;
1613     +
1614     + pgd = pgd_offset_k(vaddr);
1615     + /*
1616     + * We made all the kernel PGDs present in kaiser_init().
1617     + * We expect them to stay that way.
1618     + */
1619     + BUG_ON(pgd_none(*pgd));
1620     + /*
1621     + * PGDs are either 512GB or 128TB on all x86_64
1622     + * configurations. We don't handle these.
1623     + */
1624     + BUG_ON(pgd_large(*pgd));
1625     +
1626     + pud = pud_offset(pgd, vaddr);
1627     + if (pud_none(*pud)) {
1628     + WARN_ON_ONCE(1);
1629     + return -1;
1630     + }
1631     +
1632     + if (pud_large(*pud))
1633     + return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);
1634     +
1635     + pmd = pmd_offset(pud, vaddr);
1636     + if (pmd_none(*pmd)) {
1637     + WARN_ON_ONCE(1);
1638     + return -1;
1639     + }
1640     +
1641     + if (pmd_large(*pmd))
1642     + return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);
1643     +
1644     + pte = pte_offset_kernel(pmd, vaddr);
1645     + if (pte_none(*pte)) {
1646     + WARN_ON_ONCE(1);
1647     + return -1;
1648     + }
1649     +
1650     + return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
1651     +}
1652     +
1653     +/*
1654     + * This is a relatively normal page table walk, except that it
1655     + * also tries to allocate page tables pages along the way.
1656     + *
1657     + * Returns a pointer to a PTE on success, or NULL on failure.
1658     + */
1659     +static pte_t *kaiser_pagetable_walk(unsigned long address)
1660     +{
1661     + pmd_t *pmd;
1662     + pud_t *pud;
1663     + pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
1664     + gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
1665     +
1666     + if (pgd_none(*pgd)) {
1667     + WARN_ONCE(1, "All shadow pgds should have been populated");
1668     + return NULL;
1669     + }
1670     + BUILD_BUG_ON(pgd_large(*pgd) != 0);
1671     +
1672     + pud = pud_offset(pgd, address);
1673     + /* The shadow page tables do not use large mappings: */
1674     + if (pud_large(*pud)) {
1675     + WARN_ON(1);
1676     + return NULL;
1677     + }
1678     + if (pud_none(*pud)) {
1679     + unsigned long new_pmd_page = __get_free_page(gfp);
1680     + if (!new_pmd_page)
1681     + return NULL;
1682     + spin_lock(&shadow_table_allocation_lock);
1683     + if (pud_none(*pud)) {
1684     + set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
1685     + __inc_zone_page_state(virt_to_page((void *)
1686     + new_pmd_page), NR_KAISERTABLE);
1687     + } else
1688     + free_page(new_pmd_page);
1689     + spin_unlock(&shadow_table_allocation_lock);
1690     + }
1691     +
1692     + pmd = pmd_offset(pud, address);
1693     + /* The shadow page tables do not use large mappings: */
1694     + if (pmd_large(*pmd)) {
1695     + WARN_ON(1);
1696     + return NULL;
1697     + }
1698     + if (pmd_none(*pmd)) {
1699     + unsigned long new_pte_page = __get_free_page(gfp);
1700     + if (!new_pte_page)
1701     + return NULL;
1702     + spin_lock(&shadow_table_allocation_lock);
1703     + if (pmd_none(*pmd)) {
1704     + set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
1705     + __inc_zone_page_state(virt_to_page((void *)
1706     + new_pte_page), NR_KAISERTABLE);
1707     + } else
1708     + free_page(new_pte_page);
1709     + spin_unlock(&shadow_table_allocation_lock);
1710     + }
1711     +
1712     + return pte_offset_kernel(pmd, address);
1713     +}
1714     +
1715     +static int kaiser_add_user_map(const void *__start_addr, unsigned long size,
1716     + unsigned long flags)
1717     +{
1718     + int ret = 0;
1719     + pte_t *pte;
1720     + unsigned long start_addr = (unsigned long )__start_addr;
1721     + unsigned long address = start_addr & PAGE_MASK;
1722     + unsigned long end_addr = PAGE_ALIGN(start_addr + size);
1723     + unsigned long target_address;
1724     +
1725     + /*
1726     + * It is convenient for callers to pass in __PAGE_KERNEL etc,
1727     + * and there is no actual harm from setting _PAGE_GLOBAL, so
1728     + * long as CR4.PGE is not set. But it is nonetheless troubling
1729     + * to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser"
1730     + * requires that not to be #defined to 0): so mask it off here.
1731     + */
1732     + flags &= ~_PAGE_GLOBAL;
1733     +
1734     + for (; address < end_addr; address += PAGE_SIZE) {
1735     + target_address = get_pa_from_mapping(address);
1736     + if (target_address == -1) {
1737     + ret = -EIO;
1738     + break;
1739     + }
1740     + pte = kaiser_pagetable_walk(address);
1741     + if (!pte) {
1742     + ret = -ENOMEM;
1743     + break;
1744     + }
1745     + if (pte_none(*pte)) {
1746     + set_pte(pte, __pte(flags | target_address));
1747     + } else {
1748     + pte_t tmp;
1749     + set_pte(&tmp, __pte(flags | target_address));
1750     + WARN_ON_ONCE(!pte_same(*pte, tmp));
1751     + }
1752     + }
1753     + return ret;
1754     +}
1755     +
1756     +static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags)
1757     +{
1758     + unsigned long size = end - start;
1759     +
1760     + return kaiser_add_user_map(start, size, flags);
1761     +}
1762     +
1763     +/*
1764     + * Ensure that the top level of the (shadow) page tables are
1765     + * entirely populated. This ensures that all processes that get
1766     + * forked have the same entries. This way, we do not have to
1767     + * ever go set up new entries in older processes.
1768     + *
1769     + * Note: we never free these, so there are no updates to them
1770     + * after this.
1771     + */
1772     +static void __init kaiser_init_all_pgds(void)
1773     +{
1774     + pgd_t *pgd;
1775     + int i = 0;
1776     +
1777     + pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0));
1778     + for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
1779     + pgd_t new_pgd;
1780     + pud_t *pud = pud_alloc_one(&init_mm,
1781     + PAGE_OFFSET + i * PGDIR_SIZE);
1782     + if (!pud) {
1783     + WARN_ON(1);
1784     + break;
1785     + }
1786     + inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE);
1787     + new_pgd = __pgd(_KERNPG_TABLE |__pa(pud));
1788     + /*
1789     + * Make sure not to stomp on some other pgd entry.
1790     + */
1791     + if (!pgd_none(pgd[i])) {
1792     + WARN_ON(1);
1793     + continue;
1794     + }
1795     + set_pgd(pgd + i, new_pgd);
1796     + }
1797     +}
1798     +
1799     +#define kaiser_add_user_map_early(start, size, flags) do { \
1800     + int __ret = kaiser_add_user_map(start, size, flags); \
1801     + WARN_ON(__ret); \
1802     +} while (0)
1803     +
1804     +#define kaiser_add_user_map_ptrs_early(start, end, flags) do { \
1805     + int __ret = kaiser_add_user_map_ptrs(start, end, flags); \
1806     + WARN_ON(__ret); \
1807     +} while (0)
1808     +
1809     +void __init kaiser_check_boottime_disable(void)
1810     +{
1811     + bool enable = true;
1812     + char arg[5];
1813     + int ret;
1814     +
1815     + if (boot_cpu_has(X86_FEATURE_XENPV))
1816     + goto silent_disable;
1817     +
1818     + ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
1819     + if (ret > 0) {
1820     + if (!strncmp(arg, "on", 2))
1821     + goto enable;
1822     +
1823     + if (!strncmp(arg, "off", 3))
1824     + goto disable;
1825     +
1826     + if (!strncmp(arg, "auto", 4))
1827     + goto skip;
1828     + }
1829     +
1830     + if (cmdline_find_option_bool(boot_command_line, "nopti"))
1831     + goto disable;
1832     +
1833     +skip:
1834     + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
1835     + goto disable;
1836     +
1837     +enable:
1838     + if (enable)
1839     + setup_force_cpu_cap(X86_FEATURE_KAISER);
1840     +
1841     + return;
1842     +
1843     +disable:
1844     + pr_info("disabled\n");
1845     +
1846     +silent_disable:
1847     + kaiser_enabled = 0;
1848     + setup_clear_cpu_cap(X86_FEATURE_KAISER);
1849     +}
1850     +
1851     +/*
1852     + * If anything in here fails, we will likely die on one of the
1853     + * first kernel->user transitions and init will die. But, we
1854     + * will have most of the kernel up by then and should be able to
1855     + * get a clean warning out of it. If we BUG_ON() here, we run
1856     + * the risk of being before we have good console output.
1857     + */
1858     +void __init kaiser_init(void)
1859     +{
1860     + int cpu;
1861     +
1862     + if (!kaiser_enabled)
1863     + return;
1864     +
1865     + kaiser_init_all_pgds();
1866     +
1867     + for_each_possible_cpu(cpu) {
1868     + void *percpu_vaddr = __per_cpu_user_mapped_start +
1869     + per_cpu_offset(cpu);
1870     + unsigned long percpu_sz = __per_cpu_user_mapped_end -
1871     + __per_cpu_user_mapped_start;
1872     + kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
1873     + __PAGE_KERNEL);
1874     + }
1875     +
1876     + /*
1877     + * Map the entry/exit text section, which is needed at
1878     + * switches from user to and from kernel.
1879     + */
1880     + kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
1881     + __PAGE_KERNEL_RX);
1882     +
1883     +#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
1884     + kaiser_add_user_map_ptrs_early(__irqentry_text_start,
1885     + __irqentry_text_end,
1886     + __PAGE_KERNEL_RX);
1887     +#endif
1888     + kaiser_add_user_map_early((void *)idt_descr.address,
1889     + sizeof(gate_desc) * NR_VECTORS,
1890     + __PAGE_KERNEL_RO);
1891     +#ifdef CONFIG_TRACING
1892     + kaiser_add_user_map_early(&trace_idt_descr,
1893     + sizeof(trace_idt_descr),
1894     + __PAGE_KERNEL);
1895     + kaiser_add_user_map_early(&trace_idt_table,
1896     + sizeof(gate_desc) * NR_VECTORS,
1897     + __PAGE_KERNEL);
1898     +#endif
1899     + kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr),
1900     + __PAGE_KERNEL);
1901     + kaiser_add_user_map_early(&debug_idt_table,
1902     + sizeof(gate_desc) * NR_VECTORS,
1903     + __PAGE_KERNEL);
1904     +
1905     + pr_info("enabled\n");
1906     +}
1907     +
1908     +/* Add a mapping to the shadow mapping, and synchronize the mappings */
1909     +int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
1910     +{
1911     + if (!kaiser_enabled)
1912     + return 0;
1913     + return kaiser_add_user_map((const void *)addr, size, flags);
1914     +}
1915     +
1916     +void kaiser_remove_mapping(unsigned long start, unsigned long size)
1917     +{
1918     + extern void unmap_pud_range_nofree(pgd_t *pgd,
1919     + unsigned long start, unsigned long end);
1920     + unsigned long end = start + size;
1921     + unsigned long addr, next;
1922     + pgd_t *pgd;
1923     +
1924     + if (!kaiser_enabled)
1925     + return;
1926     + pgd = native_get_shadow_pgd(pgd_offset_k(start));
1927     + for (addr = start; addr < end; pgd++, addr = next) {
1928     + next = pgd_addr_end(addr, end);
1929     + unmap_pud_range_nofree(pgd, addr, next);
1930     + }
1931     +}
1932     +
1933     +/*
1934     + * Page table pages are page-aligned. The lower half of the top
1935     + * level is used for userspace and the top half for the kernel.
1936     + * This returns true for user pages that need to get copied into
1937     + * both the user and kernel copies of the page tables, and false
1938     + * for kernel pages that should only be in the kernel copy.
1939     + */
1940     +static inline bool is_userspace_pgd(pgd_t *pgdp)
1941     +{
1942     + return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2);
1943     +}
1944     +
1945     +pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
1946     +{
1947     + if (!kaiser_enabled)
1948     + return pgd;
1949     + /*
1950     + * Do we need to also populate the shadow pgd? Check _PAGE_USER to
1951     + * skip cases like kexec and EFI which make temporary low mappings.
1952     + */
1953     + if (pgd.pgd & _PAGE_USER) {
1954     + if (is_userspace_pgd(pgdp)) {
1955     + native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
1956     + /*
1957     + * Even if the entry is *mapping* userspace, ensure
1958     + * that userspace can not use it. This way, if we
1959     + * get out to userspace running on the kernel CR3,
1960     + * userspace will crash instead of running.
1961     + */
1962     + if (__supported_pte_mask & _PAGE_NX)
1963     + pgd.pgd |= _PAGE_NX;
1964     + }
1965     + } else if (!pgd.pgd) {
1966     + /*
1967     + * pgd_clear() cannot check _PAGE_USER, and is even used to
1968     + * clear corrupted pgd entries: so just rely on cases like
1969     + * kexec and EFI never to be using pgd_clear().
1970     + */
1971     + if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) &&
1972     + is_userspace_pgd(pgdp))
1973     + native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
1974     + }
1975     + return pgd;
1976     +}
1977     +
1978     +void kaiser_setup_pcid(void)
1979     +{
1980     + unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET;
1981     +
1982     + if (this_cpu_has(X86_FEATURE_PCID))
1983     + user_cr3 |= X86_CR3_PCID_USER_NOFLUSH;
1984     + /*
1985     + * These variables are used by the entry/exit
1986     + * code to change PCID and pgd and TLB flushing.
1987     + */
1988     + this_cpu_write(x86_cr3_pcid_user, user_cr3);
1989     +}
1990     +
1991     +/*
1992     + * Make a note that this cpu will need to flush USER tlb on return to user.
1993     + * If cpu does not have PCID, then the NOFLUSH bit will never have been set.
1994     + */
1995     +void kaiser_flush_tlb_on_return_to_user(void)
1996     +{
1997     + if (this_cpu_has(X86_FEATURE_PCID))
1998     + this_cpu_write(x86_cr3_pcid_user,
1999     + X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
2000     +}
2001     +EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);
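
kaiser_check_boottime_disable() above expresses its precedence with gotos: an explicit pti= value wins, then nopti, then the vendor default (KAISER stays off on AMD, on elsewhere), and a Xen PV guest disables it silently before any parsing happens. A rough userspace restatement of that decision order, illustration only, with the option parsing and the Xen case left out:

#include <stdio.h>
#include <string.h>

/* Same precedence as the patch: pti= beats nopti, nopti beats the
 * vendor default, and "auto" or an unrecognised value falls through
 * to the vendor default (off on AMD, on otherwise). */
static int kaiser_should_enable(const char *pti_arg, int has_nopti, int is_amd)
{
        if (pti_arg) {
                if (!strcmp(pti_arg, "on"))
                        return 1;
                if (!strcmp(pti_arg, "off"))
                        return 0;
                if (!strcmp(pti_arg, "auto"))
                        return !is_amd;
        }
        if (has_nopti)
                return 0;
        return !is_amd;
}

int main(void)
{
        printf("%d\n", kaiser_should_enable("on",  0, 1)); /* 1: pti=on beats AMD default */
        printf("%d\n", kaiser_should_enable(NULL,  1, 0)); /* 0: nopti */
        printf("%d\n", kaiser_should_enable(NULL,  0, 1)); /* 0: AMD default */
        printf("%d\n", kaiser_should_enable(NULL,  0, 0)); /* 1: default on */
        return 0;
}

In the real function the enable path ends in setup_force_cpu_cap(X86_FEATURE_KAISER), while both disable paths clear the feature bit and zero kaiser_enabled.
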
2002     diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
2003     index aed206475aa7..319183d93602 100644
2004     --- a/arch/x86/mm/kaslr.c
2005     +++ b/arch/x86/mm/kaslr.c
2006     @@ -189,6 +189,6 @@ void __meminit init_trampoline(void)
2007     *pud_tramp = *pud;
2008     }
2009    
2010     - set_pgd(&trampoline_pgd_entry,
2011     - __pgd(_KERNPG_TABLE | __pa(pud_page_tramp)));
2012     + /* Avoid set_pgd(), in case it's complicated by CONFIG_PAGE_TABLE_ISOLATION */
2013     + trampoline_pgd_entry = __pgd(_KERNPG_TABLE | __pa(pud_page_tramp));
2014     }
2015     diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
2016     index e3353c97d086..73dcb0e18c1b 100644
2017     --- a/arch/x86/mm/pageattr.c
2018     +++ b/arch/x86/mm/pageattr.c
2019     @@ -52,6 +52,7 @@ static DEFINE_SPINLOCK(cpa_lock);
2020     #define CPA_FLUSHTLB 1
2021     #define CPA_ARRAY 2
2022     #define CPA_PAGES_ARRAY 4
2023     +#define CPA_FREE_PAGETABLES 8
2024    
2025     #ifdef CONFIG_PROC_FS
2026     static unsigned long direct_pages_count[PG_LEVEL_NUM];
2027     @@ -729,10 +730,13 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
2028     return 0;
2029     }
2030    
2031     -static bool try_to_free_pte_page(pte_t *pte)
2032     +static bool try_to_free_pte_page(struct cpa_data *cpa, pte_t *pte)
2033     {
2034     int i;
2035    
2036     + if (!(cpa->flags & CPA_FREE_PAGETABLES))
2037     + return false;
2038     +
2039     for (i = 0; i < PTRS_PER_PTE; i++)
2040     if (!pte_none(pte[i]))
2041     return false;
2042     @@ -741,10 +745,13 @@ static bool try_to_free_pte_page(pte_t *pte)
2043     return true;
2044     }
2045    
2046     -static bool try_to_free_pmd_page(pmd_t *pmd)
2047     +static bool try_to_free_pmd_page(struct cpa_data *cpa, pmd_t *pmd)
2048     {
2049     int i;
2050    
2051     + if (!(cpa->flags & CPA_FREE_PAGETABLES))
2052     + return false;
2053     +
2054     for (i = 0; i < PTRS_PER_PMD; i++)
2055     if (!pmd_none(pmd[i]))
2056     return false;
2057     @@ -753,7 +760,9 @@ static bool try_to_free_pmd_page(pmd_t *pmd)
2058     return true;
2059     }
2060    
2061     -static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
2062     +static bool unmap_pte_range(struct cpa_data *cpa, pmd_t *pmd,
2063     + unsigned long start,
2064     + unsigned long end)
2065     {
2066     pte_t *pte = pte_offset_kernel(pmd, start);
2067    
2068     @@ -764,22 +773,23 @@ static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
2069     pte++;
2070     }
2071    
2072     - if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
2073     + if (try_to_free_pte_page(cpa, (pte_t *)pmd_page_vaddr(*pmd))) {
2074     pmd_clear(pmd);
2075     return true;
2076     }
2077     return false;
2078     }
2079    
2080     -static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
2081     +static void __unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, pmd_t *pmd,
2082     unsigned long start, unsigned long end)
2083     {
2084     - if (unmap_pte_range(pmd, start, end))
2085     - if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
2086     + if (unmap_pte_range(cpa, pmd, start, end))
2087     + if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
2088     pud_clear(pud);
2089     }
2090    
2091     -static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
2092     +static void unmap_pmd_range(struct cpa_data *cpa, pud_t *pud,
2093     + unsigned long start, unsigned long end)
2094     {
2095     pmd_t *pmd = pmd_offset(pud, start);
2096    
2097     @@ -790,7 +800,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
2098     unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
2099     unsigned long pre_end = min_t(unsigned long, end, next_page);
2100    
2101     - __unmap_pmd_range(pud, pmd, start, pre_end);
2102     + __unmap_pmd_range(cpa, pud, pmd, start, pre_end);
2103    
2104     start = pre_end;
2105     pmd++;
2106     @@ -803,7 +813,8 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
2107     if (pmd_large(*pmd))
2108     pmd_clear(pmd);
2109     else
2110     - __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
2111     + __unmap_pmd_range(cpa, pud, pmd,
2112     + start, start + PMD_SIZE);
2113    
2114     start += PMD_SIZE;
2115     pmd++;
2116     @@ -813,17 +824,19 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
2117     * 4K leftovers?
2118     */
2119     if (start < end)
2120     - return __unmap_pmd_range(pud, pmd, start, end);
2121     + return __unmap_pmd_range(cpa, pud, pmd, start, end);
2122    
2123     /*
2124     * Try again to free the PMD page if haven't succeeded above.
2125     */
2126     if (!pud_none(*pud))
2127     - if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
2128     + if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
2129     pud_clear(pud);
2130     }
2131    
2132     -static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
2133     +static void __unmap_pud_range(struct cpa_data *cpa, pgd_t *pgd,
2134     + unsigned long start,
2135     + unsigned long end)
2136     {
2137     pud_t *pud = pud_offset(pgd, start);
2138    
2139     @@ -834,7 +847,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
2140     unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
2141     unsigned long pre_end = min_t(unsigned long, end, next_page);
2142    
2143     - unmap_pmd_range(pud, start, pre_end);
2144     + unmap_pmd_range(cpa, pud, start, pre_end);
2145    
2146     start = pre_end;
2147     pud++;
2148     @@ -848,7 +861,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
2149     if (pud_large(*pud))
2150     pud_clear(pud);
2151     else
2152     - unmap_pmd_range(pud, start, start + PUD_SIZE);
2153     + unmap_pmd_range(cpa, pud, start, start + PUD_SIZE);
2154    
2155     start += PUD_SIZE;
2156     pud++;
2157     @@ -858,7 +871,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
2158     * 2M leftovers?
2159     */
2160     if (start < end)
2161     - unmap_pmd_range(pud, start, end);
2162     + unmap_pmd_range(cpa, pud, start, end);
2163    
2164     /*
2165     * No need to try to free the PUD page because we'll free it in
2166     @@ -866,6 +879,24 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
2167     */
2168     }
2169    
2170     +static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
2171     +{
2172     + struct cpa_data cpa = {
2173     + .flags = CPA_FREE_PAGETABLES,
2174     + };
2175     +
2176     + __unmap_pud_range(&cpa, pgd, start, end);
2177     +}
2178     +
2179     +void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end)
2180     +{
2181     + struct cpa_data cpa = {
2182     + .flags = 0,
2183     + };
2184     +
2185     + __unmap_pud_range(&cpa, pgd, start, end);
2186     +}
2187     +
2188     static int alloc_pte_page(pmd_t *pmd)
2189     {
2190     pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
2191     diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
2192     index 3feec5af4e67..5aaec8effc5f 100644
2193     --- a/arch/x86/mm/pgtable.c
2194     +++ b/arch/x86/mm/pgtable.c
2195     @@ -344,14 +344,22 @@ static inline void _pgd_free(pgd_t *pgd)
2196     kmem_cache_free(pgd_cache, pgd);
2197     }
2198     #else
2199     +
2200     +/*
2201     + * Instead of one pgd, Kaiser acquires two pgds. Being order-1, it is
2202     + * both 8k in size and 8k-aligned. That lets us just flip bit 12
2203     + * in a pointer to swap between the two 4k halves.
2204     + */
2205     +#define PGD_ALLOCATION_ORDER kaiser_enabled
2206     +
2207     static inline pgd_t *_pgd_alloc(void)
2208     {
2209     - return (pgd_t *)__get_free_page(PGALLOC_GFP);
2210     + return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);
2211     }
2212    
2213     static inline void _pgd_free(pgd_t *pgd)
2214     {
2215     - free_page((unsigned long)pgd);
2216     + free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
2217     }
2218     #endif /* CONFIG_X86_PAE */
2219    
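
The PGD_ALLOCATION_ORDER comment above leans on a small address trick: an order-1 allocation is 8 KiB and 8 KiB aligned, so the kernel half and the shadow half of the pgd pair differ only in bit 12, and toggling that bit converts a pointer into one half into a pointer into the other. A short standalone illustration of the arithmetic; the patch's real helper for this lives in asm/kaiser.h and is not shown in this hunk:

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

#define PGD_BYTES 4096UL        /* one 4 KiB page-table page */

int main(void)
{
        /* An 8 KiB, 8 KiB-aligned pair, as an order-1 page allocation gives. */
        void *pair = aligned_alloc(2 * PGD_BYTES, 2 * PGD_BYTES);
        if (!pair)
                return 1;

        uintptr_t kernel_pgd = (uintptr_t)pair;             /* low half  */
        uintptr_t shadow_pgd = kernel_pgd ^ PGD_BYTES;      /* flip bit 12 */

        printf("kernel pgd %#lx\n", (unsigned long)kernel_pgd);
        printf("shadow pgd %#lx (differs only in bit 12)\n",
               (unsigned long)shadow_pgd);
        free(pair);
        return 0;
}
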
2220     diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
2221     index 53b72fb4e781..41205de487e7 100644
2222     --- a/arch/x86/mm/tlb.c
2223     +++ b/arch/x86/mm/tlb.c
2224     @@ -6,13 +6,14 @@
2225     #include <linux/interrupt.h>
2226     #include <linux/export.h>
2227     #include <linux/cpu.h>
2228     +#include <linux/debugfs.h>
2229    
2230     #include <asm/tlbflush.h>
2231     #include <asm/mmu_context.h>
2232     #include <asm/cache.h>
2233     #include <asm/apic.h>
2234     #include <asm/uv/uv.h>
2235     -#include <linux/debugfs.h>
2236     +#include <asm/kaiser.h>
2237    
2238     /*
2239     * TLB flushing, formerly SMP-only
2240     @@ -34,6 +35,36 @@ struct flush_tlb_info {
2241     unsigned long flush_end;
2242     };
2243    
2244     +static void load_new_mm_cr3(pgd_t *pgdir)
2245     +{
2246     + unsigned long new_mm_cr3 = __pa(pgdir);
2247     +
2248     + if (kaiser_enabled) {
2249     + /*
2250     + * We reuse the same PCID for different tasks, so we must
2251     + * flush all the entries for the PCID out when we change tasks.
2252     + * Flush KERN below, flush USER when returning to userspace in
2253     + * kaiser's SWITCH_USER_CR3 (_SWITCH_TO_USER_CR3) macro.
2254     + *
2255     + * invpcid_flush_single_context(X86_CR3_PCID_ASID_USER) could
2256     + * do it here, but can only be used if X86_FEATURE_INVPCID is
2257     + * available - and many machines support pcid without invpcid.
2258     + *
2259     + * If X86_CR3_PCID_KERN_FLUSH actually added something, then it
2260     + * would be needed in the write_cr3() below - if PCIDs enabled.
2261     + */
2262     + BUILD_BUG_ON(X86_CR3_PCID_KERN_FLUSH);
2263     + kaiser_flush_tlb_on_return_to_user();
2264     + }
2265     +
2266     + /*
2267     + * Caution: many callers of this function expect
2268     + * that load_cr3() is serializing and orders TLB
2269     + * fills with respect to the mm_cpumask writes.
2270     + */
2271     + write_cr3(new_mm_cr3);
2272     +}
2273     +
2274     /*
2275     * We cannot call mmdrop() because we are in interrupt context,
2276     * instead update mm->cpu_vm_mask.
2277     @@ -45,7 +76,7 @@ void leave_mm(int cpu)
2278     BUG();
2279     if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
2280     cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
2281     - load_cr3(swapper_pg_dir);
2282     + load_new_mm_cr3(swapper_pg_dir);
2283     /*
2284     * This gets called in the idle path where RCU
2285     * functions differently. Tracing normally
2286     @@ -120,7 +151,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
2287     * ordering guarantee we need.
2288     *
2289     */
2290     - load_cr3(next->pgd);
2291     + load_new_mm_cr3(next->pgd);
2292    
2293     trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
2294    
2295     @@ -167,7 +198,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
2296     * As above, load_cr3() is serializing and orders TLB
2297     * fills with respect to the mm_cpumask write.
2298     */
2299     - load_cr3(next->pgd);
2300     + load_new_mm_cr3(next->pgd);
2301     trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
2302     load_mm_cr4(next);
2303     load_mm_ldt(next);
2304     diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
2305     index dc81e5287ebf..2e6000a4eb2c 100644
2306     --- a/include/asm-generic/vmlinux.lds.h
2307     +++ b/include/asm-generic/vmlinux.lds.h
2308     @@ -778,7 +778,14 @@
2309     */
2310     #define PERCPU_INPUT(cacheline) \
2311     VMLINUX_SYMBOL(__per_cpu_start) = .; \
2312     + VMLINUX_SYMBOL(__per_cpu_user_mapped_start) = .; \
2313     *(.data..percpu..first) \
2314     + . = ALIGN(cacheline); \
2315     + *(.data..percpu..user_mapped) \
2316     + *(.data..percpu..user_mapped..shared_aligned) \
2317     + . = ALIGN(PAGE_SIZE); \
2318     + *(.data..percpu..user_mapped..page_aligned) \
2319     + VMLINUX_SYMBOL(__per_cpu_user_mapped_end) = .; \
2320     . = ALIGN(PAGE_SIZE); \
2321     *(.data..percpu..page_aligned) \
2322     . = ALIGN(cacheline); \
2323     diff --git a/include/linux/kaiser.h b/include/linux/kaiser.h
2324     new file mode 100644
2325     index 000000000000..58c55b1589d0
2326     --- /dev/null
2327     +++ b/include/linux/kaiser.h
2328     @@ -0,0 +1,52 @@
2329     +#ifndef _LINUX_KAISER_H
2330     +#define _LINUX_KAISER_H
2331     +
2332     +#ifdef CONFIG_PAGE_TABLE_ISOLATION
2333     +#include <asm/kaiser.h>
2334     +
2335     +static inline int kaiser_map_thread_stack(void *stack)
2336     +{
2337     + /*
2338     + * Map that page of kernel stack on which we enter from user context.
2339     + */
2340     + return kaiser_add_mapping((unsigned long)stack +
2341     + THREAD_SIZE - PAGE_SIZE, PAGE_SIZE, __PAGE_KERNEL);
2342     +}
2343     +
2344     +static inline void kaiser_unmap_thread_stack(void *stack)
2345     +{
2346     + /*
2347     + * Note: may be called even when kaiser_map_thread_stack() failed.
2348     + */
2349     + kaiser_remove_mapping((unsigned long)stack +
2350     + THREAD_SIZE - PAGE_SIZE, PAGE_SIZE);
2351     +}
2352     +#else
2353     +
2354     +/*
2355     + * These stubs are used whenever CONFIG_PAGE_TABLE_ISOLATION is off, which
2356     + * includes architectures that support KAISER, but have it disabled.
2357     + */
2358     +
2359     +static inline void kaiser_init(void)
2360     +{
2361     +}
2362     +static inline int kaiser_add_mapping(unsigned long addr,
2363     + unsigned long size, unsigned long flags)
2364     +{
2365     + return 0;
2366     +}
2367     +static inline void kaiser_remove_mapping(unsigned long start,
2368     + unsigned long size)
2369     +{
2370     +}
2371     +static inline int kaiser_map_thread_stack(void *stack)
2372     +{
2373     + return 0;
2374     +}
2375     +static inline void kaiser_unmap_thread_stack(void *stack)
2376     +{
2377     +}
2378     +
2379     +#endif /* !CONFIG_PAGE_TABLE_ISOLATION */
2380     +#endif /* _LINUX_KAISER_H */
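
kaiser_map_thread_stack() above maps only the highest page of each kernel stack into the shadow page tables, the page in use when the CPU first enters the kernel from user space. The offset arithmetic, worked through here for 16 KiB stacks; the THREAD_SIZE value is an assumption for illustration, not taken from the patch:

#include <stdio.h>

#define PAGE_SIZE   4096UL
#define THREAD_SIZE (4UL * PAGE_SIZE)   /* assumed 16 KiB kernel stacks */

int main(void)
{
        unsigned long stack  = 0xffffc90000004000UL;   /* hypothetical stack base */
        unsigned long mapped = stack + THREAD_SIZE - PAGE_SIZE;

        printf("stack base       %#lx\n", stack);
        printf("user-mapped page %#lx (top 4 KiB only)\n", mapped);
        return 0;
}
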
2381     diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
2382     index fff21a82780c..490f5a83f947 100644
2383     --- a/include/linux/mmzone.h
2384     +++ b/include/linux/mmzone.h
2385     @@ -124,8 +124,9 @@ enum zone_stat_item {
2386     NR_SLAB_UNRECLAIMABLE,
2387     NR_PAGETABLE, /* used for pagetables */
2388     NR_KERNEL_STACK_KB, /* measured in KiB */
2389     - /* Second 128 byte cacheline */
2390     + NR_KAISERTABLE,
2391     NR_BOUNCE,
2392     + /* Second 128 byte cacheline */
2393     #if IS_ENABLED(CONFIG_ZSMALLOC)
2394     NR_ZSPAGES, /* allocated in zsmalloc */
2395     #endif
2396     diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
2397     index 8f16299ca068..8902f23bb770 100644
2398     --- a/include/linux/percpu-defs.h
2399     +++ b/include/linux/percpu-defs.h
2400     @@ -35,6 +35,12 @@
2401    
2402     #endif
2403    
2404     +#ifdef CONFIG_PAGE_TABLE_ISOLATION
2405     +#define USER_MAPPED_SECTION "..user_mapped"
2406     +#else
2407     +#define USER_MAPPED_SECTION ""
2408     +#endif
2409     +
2410     /*
2411     * Base implementations of per-CPU variable declarations and definitions, where
2412     * the section in which the variable is to be placed is provided by the
2413     @@ -115,6 +121,12 @@
2414     #define DEFINE_PER_CPU(type, name) \
2415     DEFINE_PER_CPU_SECTION(type, name, "")
2416    
2417     +#define DECLARE_PER_CPU_USER_MAPPED(type, name) \
2418     + DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
2419     +
2420     +#define DEFINE_PER_CPU_USER_MAPPED(type, name) \
2421     + DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
2422     +
2423     /*
2424     * Declaration/definition used for per-CPU variables that must come first in
2425     * the set of variables.
2426     @@ -144,6 +156,14 @@
2427     DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \
2428     ____cacheline_aligned_in_smp
2429    
2430     +#define DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \
2431     + DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
2432     + ____cacheline_aligned_in_smp
2433     +
2434     +#define DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \
2435     + DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
2436     + ____cacheline_aligned_in_smp
2437     +
2438     #define DECLARE_PER_CPU_ALIGNED(type, name) \
2439     DECLARE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION) \
2440     ____cacheline_aligned
2441     @@ -162,11 +182,21 @@
2442     #define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \
2443     DEFINE_PER_CPU_SECTION(type, name, "..page_aligned") \
2444     __aligned(PAGE_SIZE)
2445     +/*
2446     + * Declaration/definition used for per-CPU variables that must be page aligned and need to be mapped in user mode.
2447     + */
2448     +#define DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \
2449     + DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
2450     + __aligned(PAGE_SIZE)
2451     +
2452     +#define DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \
2453     + DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
2454     + __aligned(PAGE_SIZE)
2455    
2456     /*
2457     * Declaration/definition used for per-CPU variables that must be read mostly.
2458     */
2459     -#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \
2460     +#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \
2461     DECLARE_PER_CPU_SECTION(type, name, "..read_mostly")
2462    
2463     #define DEFINE_PER_CPU_READ_MOSTLY(type, name) \
2464     diff --git a/init/main.c b/init/main.c
2465     index 25bac88bc66e..99f026565608 100644
2466     --- a/init/main.c
2467     +++ b/init/main.c
2468     @@ -80,6 +80,7 @@
2469     #include <linux/integrity.h>
2470     #include <linux/proc_ns.h>
2471     #include <linux/io.h>
2472     +#include <linux/kaiser.h>
2473    
2474     #include <asm/io.h>
2475     #include <asm/bugs.h>
2476     @@ -473,6 +474,7 @@ static void __init mm_init(void)
2477     pgtable_init();
2478     vmalloc_init();
2479     ioremap_huge_init();
2480     + kaiser_init();
2481     }
2482    
2483     asmlinkage __visible void __init start_kernel(void)
2484     diff --git a/kernel/fork.c b/kernel/fork.c
2485     index 9321b1ad3335..70e10cb49be0 100644
2486     --- a/kernel/fork.c
2487     +++ b/kernel/fork.c
2488     @@ -58,6 +58,7 @@
2489     #include <linux/tsacct_kern.h>
2490     #include <linux/cn_proc.h>
2491     #include <linux/freezer.h>
2492     +#include <linux/kaiser.h>
2493     #include <linux/delayacct.h>
2494     #include <linux/taskstats_kern.h>
2495     #include <linux/random.h>
2496     @@ -213,6 +214,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
2497    
2498     static inline void free_thread_stack(struct task_struct *tsk)
2499     {
2500     + kaiser_unmap_thread_stack(tsk->stack);
2501     #ifdef CONFIG_VMAP_STACK
2502     if (task_stack_vm_area(tsk)) {
2503     unsigned long flags;
2504     @@ -495,6 +497,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
2505     * functions again.
2506     */
2507     tsk->stack = stack;
2508     +
2509     + err= kaiser_map_thread_stack(tsk->stack);
2510     + if (err)
2511     + goto free_stack;
2512     #ifdef CONFIG_VMAP_STACK
2513     tsk->stack_vm_area = stack_vm_area;
2514     #endif
2515     diff --git a/mm/vmstat.c b/mm/vmstat.c
2516     index 604f26a4f696..6a088df04b29 100644
2517     --- a/mm/vmstat.c
2518     +++ b/mm/vmstat.c
2519     @@ -932,6 +932,7 @@ const char * const vmstat_text[] = {
2520     "nr_slab_unreclaimable",
2521     "nr_page_table_pages",
2522     "nr_kernel_stack",
2523     + "nr_overhead",
2524     "nr_bounce",
2525     #if IS_ENABLED(CONFIG_ZSMALLOC)
2526     "nr_zspages",
2527     diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
2528     index 97f9cac98348..e86a34fd5484 100644
2529     --- a/net/ipv4/tcp_bbr.c
2530     +++ b/net/ipv4/tcp_bbr.c
2531     @@ -843,6 +843,11 @@ static u32 bbr_sndbuf_expand(struct sock *sk)
2532     */
2533     static u32 bbr_undo_cwnd(struct sock *sk)
2534     {
2535     + struct bbr *bbr = inet_csk_ca(sk);
2536     +
2537     + bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */
2538     + bbr->full_bw_cnt = 0;
2539     + bbr_reset_lt_bw_sampling(sk);
2540     return tcp_sk(sk)->snd_cwnd;
2541     }
2542    
2543     diff --git a/security/Kconfig b/security/Kconfig
2544     index 118f4549404e..32f36b40e9f0 100644
2545     --- a/security/Kconfig
2546     +++ b/security/Kconfig
2547     @@ -31,6 +31,16 @@ config SECURITY
2548    
2549     If you are unsure how to answer this question, answer N.
2550    
2551     +config PAGE_TABLE_ISOLATION
2552     + bool "Remove the kernel mapping in user mode"
2553     + default y
2554     + depends on X86_64 && SMP
2555     + help
2556     + This enforces a strict kernel and user space isolation, in order
2557     + to close hardware side channels on kernel address information.
2558     +
2559     + If you are unsure how to answer this question, answer Y.
2560     +
2561     config SECURITYFS
2562     bool "Enable the securityfs filesystem"
2563     help
2564     diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h
2565     index a39629206864..f79669a38c0c 100644
2566     --- a/tools/arch/x86/include/asm/cpufeatures.h
2567     +++ b/tools/arch/x86/include/asm/cpufeatures.h
2568     @@ -197,6 +197,9 @@
2569     #define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */
2570     #define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */
2571    
2572     +/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */
2573     +#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */
2574     +
2575     /* Virtualization flags: Linux defined, word 8 */
2576     #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
2577     #define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */