Magellan Linux

Annotation of /trunk/kernel-magellan/patches-4.15/0101-4.15.2-all-fixes.patch



Revision 3085
Wed Mar 21 14:52:15 2018 UTC by niro
File size: 122489 bytes
-linux-4.15.2
1 niro 3085 diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
2     index 46b26bfee27b..1e762c210f1b 100644
3     --- a/Documentation/admin-guide/kernel-parameters.txt
4     +++ b/Documentation/admin-guide/kernel-parameters.txt
5     @@ -2742,8 +2742,6 @@
6     norandmaps Don't use address space randomization. Equivalent to
7     echo 0 > /proc/sys/kernel/randomize_va_space
8    
9     - noreplace-paravirt [X86,IA-64,PV_OPS] Don't patch paravirt_ops
10     -
11     noreplace-smp [X86-32,SMP] Don't replace SMP instructions
12     with UP alternatives
13    
14     diff --git a/Documentation/speculation.txt b/Documentation/speculation.txt
15     new file mode 100644
16     index 000000000000..e9e6cbae2841
17     --- /dev/null
18     +++ b/Documentation/speculation.txt
19     @@ -0,0 +1,90 @@
20     +This document explains potential effects of speculation, and how undesirable
21     +effects can be mitigated portably using common APIs.
22     +
23     +===========
24     +Speculation
25     +===========
26     +
27     +To improve performance and minimize average latencies, many contemporary CPUs
28     +employ speculative execution techniques such as branch prediction, performing
29     +work which may be discarded at a later stage.
30     +
31     +Typically speculative execution cannot be observed from architectural state,
32     +such as the contents of registers. However, in some cases it is possible to
33     +observe its impact on microarchitectural state, such as the presence or
34     +absence of data in caches. Such state may form side-channels which can be
35     +observed to extract secret information.
36     +
37     +For example, in the presence of branch prediction, it is possible for bounds
38     +checks to be ignored by code which is speculatively executed. Consider the
39     +following code:
40     +
41     + int load_array(int *array, unsigned int index)
42     + {
43     + if (index >= MAX_ARRAY_ELEMS)
44     + return 0;
45     + else
46     + return array[index];
47     + }
48     +
49     +Which, on arm64, may be compiled to an assembly sequence such as:
50     +
51     + CMP <index>, #MAX_ARRAY_ELEMS
52     + B.LT less
53     + MOV <returnval>, #0
54     + RET
55     + less:
56     + LDR <returnval>, [<array>, <index>]
57     + RET
58     +
59     +It is possible that a CPU mis-predicts the conditional branch, and
60     +speculatively loads array[index], even if index >= MAX_ARRAY_ELEMS. This
61     +value will subsequently be discarded, but the speculated load may affect
62     +microarchitectural state which can be subsequently measured.
63     +
64     +More complex sequences involving multiple dependent memory accesses may
65     +result in sensitive information being leaked. Consider the following
66     +code, building on the prior example:
67     +
68     + int load_dependent_arrays(int *arr1, int *arr2, int index)
69     + {
70     + int val1, val2;
71     +
72     + val1 = load_array(arr1, index);
73     + val2 = load_array(arr2, val1);
74     +
75     + return val2;
76     + }
77     +
78     +Under speculation, the first call to load_array() may return the value
79     +of an out-of-bounds address, while the second call will influence
80     +microarchitectural state dependent on this value. This may provide an
81     +arbitrary read primitive.
82     +
83     +====================================
84     +Mitigating speculation side-channels
85     +====================================
86     +
87     +The kernel provides a generic API to ensure that bounds checks are
88     +respected even under speculation. Architectures which are affected by
89     +speculation-based side-channels are expected to implement these
90     +primitives.
91     +
92     +The array_index_nospec() helper in <linux/nospec.h> can be used to
93     +prevent information from being leaked via side-channels.
94     +
95     +A call to array_index_nospec(index, size) returns a sanitized index
96     +value that is bounded to [0, size) even under cpu speculation
97     +conditions.
98     +
99     +This can be used to protect the earlier load_array() example:
100     +
101     + int load_array(int *array, unsigned int index)
102     + {
103     + if (index >= MAX_ARRAY_ELEMS)
104     + return 0;
105     + else {
106     + index = array_index_nospec(index, MAX_ARRAY_ELEMS);
107     + return array[index];
108     + }
109     + }
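
[Editor's illustration, not part of the patch: the document above only states the contract of array_index_nospec(). The stand-alone C sketch below shows the underlying idea -- derive a mask that is all ones when the index is in bounds and zero otherwise, then AND it into the index so a speculatively out-of-bounds access collapses to element 0. The helper names are invented; the kernel's real helpers live in include/linux/nospec.h and, for x86, in the asm/barrier.h hunk further down, where the mask is computed without a conditional branch.]

    #include <stdio.h>

    /* Contract of the mask: ~0UL when index < size, 0 otherwise.  The
     * ternary is only for illustration; the kernel computes this without
     * a branch that could be mispredicted. */
    static unsigned long mask_nospec_sketch(unsigned long index, unsigned long size)
    {
            return index < size ? ~0UL : 0UL;
    }

    /* Sketch of array_index_nospec(): clamp the index with the mask. */
    static unsigned long index_nospec_sketch(unsigned long index, unsigned long size)
    {
            return index & mask_nospec_sketch(index, size);
    }

    int main(void)
    {
            printf("%lu %lu\n",
                   index_nospec_sketch(3, 16),     /* in bounds     -> 3 */
                   index_nospec_sketch(100, 16));  /* out of bounds -> 0 */
            return 0;
    }

Build with e.g. "gcc -O2 -o nospec_sketch nospec_sketch.c".
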
110     diff --git a/Makefile b/Makefile
111     index af101b556ba0..54f1bc10b531 100644
112     --- a/Makefile
113     +++ b/Makefile
114     @@ -1,7 +1,7 @@
115     # SPDX-License-Identifier: GPL-2.0
116     VERSION = 4
117     PATCHLEVEL = 15
118     -SUBLEVEL = 1
119     +SUBLEVEL = 2
120     EXTRAVERSION =
121     NAME = Fearless Coyote
122    
123     diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
124     index d7d3cc24baf4..21dbdf0e476b 100644
125     --- a/arch/x86/entry/common.c
126     +++ b/arch/x86/entry/common.c
127     @@ -21,6 +21,7 @@
128     #include <linux/export.h>
129     #include <linux/context_tracking.h>
130     #include <linux/user-return-notifier.h>
131     +#include <linux/nospec.h>
132     #include <linux/uprobes.h>
133     #include <linux/livepatch.h>
134     #include <linux/syscalls.h>
135     @@ -206,7 +207,7 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
136     * special case only applies after poking regs and before the
137     * very next return to user mode.
138     */
139     - current->thread.status &= ~(TS_COMPAT|TS_I386_REGS_POKED);
140     + ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED);
141     #endif
142    
143     user_enter_irqoff();
144     @@ -282,7 +283,8 @@ __visible void do_syscall_64(struct pt_regs *regs)
145     * regs->orig_ax, which changes the behavior of some syscalls.
146     */
147     if (likely((nr & __SYSCALL_MASK) < NR_syscalls)) {
148     - regs->ax = sys_call_table[nr & __SYSCALL_MASK](
149     + nr = array_index_nospec(nr & __SYSCALL_MASK, NR_syscalls);
150     + regs->ax = sys_call_table[nr](
151     regs->di, regs->si, regs->dx,
152     regs->r10, regs->r8, regs->r9);
153     }
154     @@ -304,7 +306,7 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
155     unsigned int nr = (unsigned int)regs->orig_ax;
156    
157     #ifdef CONFIG_IA32_EMULATION
158     - current->thread.status |= TS_COMPAT;
159     + ti->status |= TS_COMPAT;
160     #endif
161    
162     if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) {
163     @@ -318,6 +320,7 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
164     }
165    
166     if (likely(nr < IA32_NR_syscalls)) {
167     + nr = array_index_nospec(nr, IA32_NR_syscalls);
168     /*
169     * It's possible that a 32-bit syscall implementation
170     * takes a 64-bit parameter but nonetheless assumes that
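
[Editor's illustration, not part of the patch: the two hunks above clamp the syscall number with array_index_nospec() before it indexes sys_call_table, so a mispredicted bounds check can no longer steer a speculative load through an attacker-chosen table slot. A hedged user-space sketch of the same pattern for a generic handler table (all names invented), reusing the mask idea sketched earlier:]

    #include <stdio.h>

    typedef long (*handler_t)(long);

    static long h_add1(long x) { return x + 1; }
    static long h_neg(long x)  { return -x; }

    static handler_t table[] = { h_add1, h_neg };
    #define NR_HANDLERS (sizeof(table) / sizeof(table[0]))

    /* Same contract as array_index_nospec(); branch-free in the kernel. */
    static unsigned long clamp_index(unsigned long idx, unsigned long size)
    {
            return idx & (idx < size ? ~0UL : 0UL);
    }

    static long dispatch(unsigned long nr, long arg)
    {
            if (nr >= NR_HANDLERS)
                    return -1;                      /* analogous to -ENOSYS */
            nr = clamp_index(nr, NR_HANDLERS);      /* bounded even under speculation */
            return table[nr](arg);
    }

    int main(void)
    {
            printf("%ld %ld %ld\n", dispatch(0, 41), dispatch(1, 5), dispatch(9, 5));
            return 0;
    }
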
171     diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
172     index 60c4c342316c..2a35b1e0fb90 100644
173     --- a/arch/x86/entry/entry_32.S
174     +++ b/arch/x86/entry/entry_32.S
175     @@ -252,7 +252,8 @@ ENTRY(__switch_to_asm)
176     * exist, overwrite the RSB with entries which capture
177     * speculative execution to prevent attack.
178     */
179     - FILL_RETURN_BUFFER %ebx, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
180     + /* Clobbers %ebx */
181     + FILL_RETURN_BUFFER RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
182     #endif
183    
184     /* restore callee-saved registers */
185     diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
186     index ff6f8022612c..c752abe89d80 100644
187     --- a/arch/x86/entry/entry_64.S
188     +++ b/arch/x86/entry/entry_64.S
189     @@ -236,91 +236,20 @@ GLOBAL(entry_SYSCALL_64_after_hwframe)
190     pushq %r9 /* pt_regs->r9 */
191     pushq %r10 /* pt_regs->r10 */
192     pushq %r11 /* pt_regs->r11 */
193     - sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */
194     - UNWIND_HINT_REGS extra=0
195     -
196     - TRACE_IRQS_OFF
197     -
198     - /*
199     - * If we need to do entry work or if we guess we'll need to do
200     - * exit work, go straight to the slow path.
201     - */
202     - movq PER_CPU_VAR(current_task), %r11
203     - testl $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
204     - jnz entry_SYSCALL64_slow_path
205     -
206     -entry_SYSCALL_64_fastpath:
207     - /*
208     - * Easy case: enable interrupts and issue the syscall. If the syscall
209     - * needs pt_regs, we'll call a stub that disables interrupts again
210     - * and jumps to the slow path.
211     - */
212     - TRACE_IRQS_ON
213     - ENABLE_INTERRUPTS(CLBR_NONE)
214     -#if __SYSCALL_MASK == ~0
215     - cmpq $__NR_syscall_max, %rax
216     -#else
217     - andl $__SYSCALL_MASK, %eax
218     - cmpl $__NR_syscall_max, %eax
219     -#endif
220     - ja 1f /* return -ENOSYS (already in pt_regs->ax) */
221     - movq %r10, %rcx
222     -
223     - /*
224     - * This call instruction is handled specially in stub_ptregs_64.
225     - * It might end up jumping to the slow path. If it jumps, RAX
226     - * and all argument registers are clobbered.
227     - */
228     -#ifdef CONFIG_RETPOLINE
229     - movq sys_call_table(, %rax, 8), %rax
230     - call __x86_indirect_thunk_rax
231     -#else
232     - call *sys_call_table(, %rax, 8)
233     -#endif
234     -.Lentry_SYSCALL_64_after_fastpath_call:
235     -
236     - movq %rax, RAX(%rsp)
237     -1:
238     + pushq %rbx /* pt_regs->rbx */
239     + pushq %rbp /* pt_regs->rbp */
240     + pushq %r12 /* pt_regs->r12 */
241     + pushq %r13 /* pt_regs->r13 */
242     + pushq %r14 /* pt_regs->r14 */
243     + pushq %r15 /* pt_regs->r15 */
244     + UNWIND_HINT_REGS
245    
246     - /*
247     - * If we get here, then we know that pt_regs is clean for SYSRET64.
248     - * If we see that no exit work is required (which we are required
249     - * to check with IRQs off), then we can go straight to SYSRET64.
250     - */
251     - DISABLE_INTERRUPTS(CLBR_ANY)
252     TRACE_IRQS_OFF
253     - movq PER_CPU_VAR(current_task), %r11
254     - testl $_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
255     - jnz 1f
256     -
257     - LOCKDEP_SYS_EXIT
258     - TRACE_IRQS_ON /* user mode is traced as IRQs on */
259     - movq RIP(%rsp), %rcx
260     - movq EFLAGS(%rsp), %r11
261     - addq $6*8, %rsp /* skip extra regs -- they were preserved */
262     - UNWIND_HINT_EMPTY
263     - jmp .Lpop_c_regs_except_rcx_r11_and_sysret
264    
265     -1:
266     - /*
267     - * The fast path looked good when we started, but something changed
268     - * along the way and we need to switch to the slow path. Calling
269     - * raise(3) will trigger this, for example. IRQs are off.
270     - */
271     - TRACE_IRQS_ON
272     - ENABLE_INTERRUPTS(CLBR_ANY)
273     - SAVE_EXTRA_REGS
274     - movq %rsp, %rdi
275     - call syscall_return_slowpath /* returns with IRQs disabled */
276     - jmp return_from_SYSCALL_64
277     -
278     -entry_SYSCALL64_slow_path:
279     /* IRQs are off. */
280     - SAVE_EXTRA_REGS
281     movq %rsp, %rdi
282     call do_syscall_64 /* returns with IRQs disabled */
283    
284     -return_from_SYSCALL_64:
285     TRACE_IRQS_IRETQ /* we're about to change IF */
286    
287     /*
288     @@ -393,7 +322,6 @@ syscall_return_via_sysret:
289     /* rcx and r11 are already restored (see code above) */
290     UNWIND_HINT_EMPTY
291     POP_EXTRA_REGS
292     -.Lpop_c_regs_except_rcx_r11_and_sysret:
293     popq %rsi /* skip r11 */
294     popq %r10
295     popq %r9
296     @@ -424,47 +352,6 @@ syscall_return_via_sysret:
297     USERGS_SYSRET64
298     END(entry_SYSCALL_64)
299    
300     -ENTRY(stub_ptregs_64)
301     - /*
302     - * Syscalls marked as needing ptregs land here.
303     - * If we are on the fast path, we need to save the extra regs,
304     - * which we achieve by trying again on the slow path. If we are on
305     - * the slow path, the extra regs are already saved.
306     - *
307     - * RAX stores a pointer to the C function implementing the syscall.
308     - * IRQs are on.
309     - */
310     - cmpq $.Lentry_SYSCALL_64_after_fastpath_call, (%rsp)
311     - jne 1f
312     -
313     - /*
314     - * Called from fast path -- disable IRQs again, pop return address
315     - * and jump to slow path
316     - */
317     - DISABLE_INTERRUPTS(CLBR_ANY)
318     - TRACE_IRQS_OFF
319     - popq %rax
320     - UNWIND_HINT_REGS extra=0
321     - jmp entry_SYSCALL64_slow_path
322     -
323     -1:
324     - JMP_NOSPEC %rax /* Called from C */
325     -END(stub_ptregs_64)
326     -
327     -.macro ptregs_stub func
328     -ENTRY(ptregs_\func)
329     - UNWIND_HINT_FUNC
330     - leaq \func(%rip), %rax
331     - jmp stub_ptregs_64
332     -END(ptregs_\func)
333     -.endm
334     -
335     -/* Instantiate ptregs_stub for each ptregs-using syscall */
336     -#define __SYSCALL_64_QUAL_(sym)
337     -#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_stub sym
338     -#define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(sym)
339     -#include <asm/syscalls_64.h>
340     -
341     /*
342     * %rdi: prev task
343     * %rsi: next task
344     @@ -499,7 +386,8 @@ ENTRY(__switch_to_asm)
345     * exist, overwrite the RSB with entries which capture
346     * speculative execution to prevent attack.
347     */
348     - FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
349     + /* Clobbers %rbx */
350     + FILL_RETURN_BUFFER RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
351     #endif
352    
353     /* restore callee-saved registers */
354     diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
355     index 9c09775e589d..c176d2fab1da 100644
356     --- a/arch/x86/entry/syscall_64.c
357     +++ b/arch/x86/entry/syscall_64.c
358     @@ -7,14 +7,11 @@
359     #include <asm/asm-offsets.h>
360     #include <asm/syscall.h>
361    
362     -#define __SYSCALL_64_QUAL_(sym) sym
363     -#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_##sym
364     -
365     -#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long __SYSCALL_64_QUAL_##qual(sym)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
366     +#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
367     #include <asm/syscalls_64.h>
368     #undef __SYSCALL_64
369    
370     -#define __SYSCALL_64(nr, sym, qual) [nr] = __SYSCALL_64_QUAL_##qual(sym),
371     +#define __SYSCALL_64(nr, sym, qual) [nr] = sym,
372    
373     extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
374    
375     diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h
376     index 1908214b9125..4d111616524b 100644
377     --- a/arch/x86/include/asm/asm-prototypes.h
378     +++ b/arch/x86/include/asm/asm-prototypes.h
379     @@ -38,4 +38,7 @@ INDIRECT_THUNK(dx)
380     INDIRECT_THUNK(si)
381     INDIRECT_THUNK(di)
382     INDIRECT_THUNK(bp)
383     +asmlinkage void __fill_rsb(void);
384     +asmlinkage void __clear_rsb(void);
385     +
386     #endif /* CONFIG_RETPOLINE */
387     diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
388     index 7fb336210e1b..30d406146016 100644
389     --- a/arch/x86/include/asm/barrier.h
390     +++ b/arch/x86/include/asm/barrier.h
391     @@ -24,6 +24,34 @@
392     #define wmb() asm volatile("sfence" ::: "memory")
393     #endif
394    
395     +/**
396     + * array_index_mask_nospec() - generate a mask that is ~0UL when the
397     + * bounds check succeeds and 0 otherwise
398     + * @index: array element index
399     + * @size: number of elements in array
400     + *
401     + * Returns:
402     + * 0 - (index < size)
403     + */
404     +static inline unsigned long array_index_mask_nospec(unsigned long index,
405     + unsigned long size)
406     +{
407     + unsigned long mask;
408     +
409     + asm ("cmp %1,%2; sbb %0,%0;"
410     + :"=r" (mask)
411     + :"r"(size),"r" (index)
412     + :"cc");
413     + return mask;
414     +}
415     +
416     +/* Override the default implementation from linux/nospec.h. */
417     +#define array_index_mask_nospec array_index_mask_nospec
418     +
419     +/* Prevent speculative execution past this barrier. */
420     +#define barrier_nospec() alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, \
421     + "lfence", X86_FEATURE_LFENCE_RDTSC)
422     +
423     #ifdef CONFIG_X86_PPRO_FENCE
424     #define dma_rmb() rmb()
425     #else
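
[Editor's illustration, not part of the patch: the cmp/sbb pair above works because CMP sets the carry flag exactly when index < size (unsigned borrow), and SBB of a register with itself then evaluates to 0 - CF regardless of the register's previous contents -- ~0UL for a passing bounds check, 0 for a failing one, with no branch to mispredict. A small x86-64 user-space check of that behaviour, assuming GCC/Clang inline-asm syntax:]

    #include <assert.h>
    #include <stdio.h>

    /* Same instruction sequence as the array_index_mask_nospec() added
     * above, lifted into a user-space function so it can be tested. */
    static unsigned long mask_nospec(unsigned long index, unsigned long size)
    {
            unsigned long mask;

            asm ("cmp %1,%2; sbb %0,%0;"
                 : "=r" (mask)
                 : "r" (size), "r" (index)
                 : "cc");
            return mask;
    }

    int main(void)
    {
            assert(mask_nospec(3, 16) == ~0UL);     /* in bounds      */
            assert(mask_nospec(16, 16) == 0);       /* off-by-one out */
            assert(mask_nospec(100, 16) == 0);      /* far out        */
            printf("%lu %lu\n", 3 & mask_nospec(3, 16),      /* 3 */
                                100 & mask_nospec(100, 16)); /* 0 */
            return 0;
    }
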
426     diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
427     index ea9a7dde62e5..70eddb3922ff 100644
428     --- a/arch/x86/include/asm/cpufeature.h
429     +++ b/arch/x86/include/asm/cpufeature.h
430     @@ -29,6 +29,7 @@ enum cpuid_leafs
431     CPUID_8000_000A_EDX,
432     CPUID_7_ECX,
433     CPUID_8000_0007_EBX,
434     + CPUID_7_EDX,
435     };
436    
437     #ifdef CONFIG_X86_FEATURE_NAMES
438     @@ -79,8 +80,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
439     CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 15, feature_bit) || \
440     CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 16, feature_bit) || \
441     CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 17, feature_bit) || \
442     + CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 18, feature_bit) || \
443     REQUIRED_MASK_CHECK || \
444     - BUILD_BUG_ON_ZERO(NCAPINTS != 18))
445     + BUILD_BUG_ON_ZERO(NCAPINTS != 19))
446    
447     #define DISABLED_MASK_BIT_SET(feature_bit) \
448     ( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 0, feature_bit) || \
449     @@ -101,8 +103,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
450     CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 15, feature_bit) || \
451     CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 16, feature_bit) || \
452     CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 17, feature_bit) || \
453     + CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 18, feature_bit) || \
454     DISABLED_MASK_CHECK || \
455     - BUILD_BUG_ON_ZERO(NCAPINTS != 18))
456     + BUILD_BUG_ON_ZERO(NCAPINTS != 19))
457    
458     #define cpu_has(c, bit) \
459     (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
460     diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
461     index 25b9375c1484..73b5fff159a4 100644
462     --- a/arch/x86/include/asm/cpufeatures.h
463     +++ b/arch/x86/include/asm/cpufeatures.h
464     @@ -13,7 +13,7 @@
465     /*
466     * Defines x86 CPU feature bits
467     */
468     -#define NCAPINTS 18 /* N 32-bit words worth of info */
469     +#define NCAPINTS 19 /* N 32-bit words worth of info */
470     #define NBUGINTS 1 /* N 32-bit bug flags */
471    
472     /*
473     @@ -203,14 +203,14 @@
474     #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
475     #define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */
476     #define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */
477     -#define X86_FEATURE_RETPOLINE ( 7*32+12) /* Generic Retpoline mitigation for Spectre variant 2 */
478     -#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* AMD Retpoline mitigation for Spectre variant 2 */
479     +#define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */
480     +#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* "" AMD Retpoline mitigation for Spectre variant 2 */
481     #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */
482     -#define X86_FEATURE_AVX512_4VNNIW ( 7*32+16) /* AVX-512 Neural Network Instructions */
483     -#define X86_FEATURE_AVX512_4FMAPS ( 7*32+17) /* AVX-512 Multiply Accumulation Single precision */
484    
485     #define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */
486     -#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* Fill RSB on context switches */
487     +#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */
488     +
489     +#define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */
490    
491     /* Virtualization flags: Linux defined, word 8 */
492     #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
493     @@ -271,6 +271,9 @@
494     #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */
495     #define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */
496     #define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* Always save/restore FP error pointers */
497     +#define X86_FEATURE_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */
498     +#define X86_FEATURE_IBRS (13*32+14) /* Indirect Branch Restricted Speculation */
499     +#define X86_FEATURE_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */
500    
501     /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */
502     #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */
503     @@ -319,6 +322,13 @@
504     #define X86_FEATURE_SUCCOR (17*32+ 1) /* Uncorrectable error containment and recovery */
505     #define X86_FEATURE_SMCA (17*32+ 3) /* Scalable MCA */
506    
507     +/* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */
508     +#define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */
509     +#define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */
510     +#define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */
511     +#define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */
512     +#define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */
513     +
514     /*
515     * BUG word(s)
516     */
517     diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
518     index b027633e7300..33833d1909af 100644
519     --- a/arch/x86/include/asm/disabled-features.h
520     +++ b/arch/x86/include/asm/disabled-features.h
521     @@ -77,6 +77,7 @@
522     #define DISABLED_MASK15 0
523     #define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP)
524     #define DISABLED_MASK17 0
525     -#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18)
526     +#define DISABLED_MASK18 0
527     +#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19)
528    
529     #endif /* _ASM_X86_DISABLED_FEATURES_H */
530     diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
531     index 64c4a30e0d39..e203169931c7 100644
532     --- a/arch/x86/include/asm/fixmap.h
533     +++ b/arch/x86/include/asm/fixmap.h
534     @@ -137,8 +137,10 @@ enum fixed_addresses {
535    
536     extern void reserve_top_address(unsigned long reserve);
537    
538     -#define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
539     -#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
540     +#define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
541     +#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
542     +#define FIXADDR_TOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
543     +#define FIXADDR_TOT_START (FIXADDR_TOP - FIXADDR_TOT_SIZE)
544    
545     extern int fixmaps_set;
546    
547     diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
548     index e7b983a35506..e520a1e6fc11 100644
549     --- a/arch/x86/include/asm/msr-index.h
550     +++ b/arch/x86/include/asm/msr-index.h
551     @@ -39,6 +39,13 @@
552    
553     /* Intel MSRs. Some also available on other CPUs */
554    
555     +#define MSR_IA32_SPEC_CTRL 0x00000048 /* Speculation Control */
556     +#define SPEC_CTRL_IBRS (1 << 0) /* Indirect Branch Restricted Speculation */
557     +#define SPEC_CTRL_STIBP (1 << 1) /* Single Thread Indirect Branch Predictors */
558     +
559     +#define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */
560     +#define PRED_CMD_IBPB (1 << 0) /* Indirect Branch Prediction Barrier */
561     +
562     #define MSR_PPIN_CTL 0x0000004e
563     #define MSR_PPIN 0x0000004f
564    
565     @@ -57,6 +64,11 @@
566     #define SNB_C3_AUTO_UNDEMOTE (1UL << 28)
567    
568     #define MSR_MTRRcap 0x000000fe
569     +
570     +#define MSR_IA32_ARCH_CAPABILITIES 0x0000010a
571     +#define ARCH_CAP_RDCL_NO (1 << 0) /* Not susceptible to Meltdown */
572     +#define ARCH_CAP_IBRS_ALL (1 << 1) /* Enhanced IBRS support */
573     +
574     #define MSR_IA32_BBL_CR_CTL 0x00000119
575     #define MSR_IA32_BBL_CR_CTL3 0x0000011e
576    
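
[Editor's illustration, not part of the patch: the MSR numbers and bit layouts above (SPEC_CTRL at 0x48 with IBRS/STIBP, PRED_CMD at 0x49 with IBPB, ARCH_CAPABILITIES at 0x10a with RDCL_NO/IBRS_ALL) can be inspected from user space through the standard /dev/cpu/N/msr interface. A hedged sketch -- it needs root, the msr driver loaded, and a CPU whose microcode actually implements the MSR, otherwise the read simply fails:]

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    #define MSR_IA32_SPEC_CTRL 0x48          /* values match the hunk above */
    #define SPEC_CTRL_IBRS     (1 << 0)
    #define SPEC_CTRL_STIBP    (1 << 1)

    int main(void)
    {
            uint64_t val = 0;
            int fd = open("/dev/cpu/0/msr", O_RDONLY);

            /* pread() at the MSR index returns the 64-bit MSR contents. */
            if (fd < 0 || pread(fd, &val, sizeof(val), MSR_IA32_SPEC_CTRL) != sizeof(val)) {
                    perror("rdmsr MSR_IA32_SPEC_CTRL");
                    return 1;
            }
            printf("SPEC_CTRL=%#llx IBRS=%d STIBP=%d\n",
                   (unsigned long long)val,
                   !!(val & SPEC_CTRL_IBRS), !!(val & SPEC_CTRL_STIBP));
            close(fd);
            return 0;
    }
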
577     diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
578     index 07962f5f6fba..30df295f6d94 100644
579     --- a/arch/x86/include/asm/msr.h
580     +++ b/arch/x86/include/asm/msr.h
581     @@ -214,8 +214,7 @@ static __always_inline unsigned long long rdtsc_ordered(void)
582     * that some other imaginary CPU is updating continuously with a
583     * time stamp.
584     */
585     - alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
586     - "lfence", X86_FEATURE_LFENCE_RDTSC);
587     + barrier_nospec();
588     return rdtsc();
589     }
590    
591     diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
592     index 4ad41087ce0e..4d57894635f2 100644
593     --- a/arch/x86/include/asm/nospec-branch.h
594     +++ b/arch/x86/include/asm/nospec-branch.h
595     @@ -1,56 +1,12 @@
596     /* SPDX-License-Identifier: GPL-2.0 */
597    
598     -#ifndef __NOSPEC_BRANCH_H__
599     -#define __NOSPEC_BRANCH_H__
600     +#ifndef _ASM_X86_NOSPEC_BRANCH_H_
601     +#define _ASM_X86_NOSPEC_BRANCH_H_
602    
603     #include <asm/alternative.h>
604     #include <asm/alternative-asm.h>
605     #include <asm/cpufeatures.h>
606    
607     -/*
608     - * Fill the CPU return stack buffer.
609     - *
610     - * Each entry in the RSB, if used for a speculative 'ret', contains an
611     - * infinite 'pause; lfence; jmp' loop to capture speculative execution.
612     - *
613     - * This is required in various cases for retpoline and IBRS-based
614     - * mitigations for the Spectre variant 2 vulnerability. Sometimes to
615     - * eliminate potentially bogus entries from the RSB, and sometimes
616     - * purely to ensure that it doesn't get empty, which on some CPUs would
617     - * allow predictions from other (unwanted!) sources to be used.
618     - *
619     - * We define a CPP macro such that it can be used from both .S files and
620     - * inline assembly. It's possible to do a .macro and then include that
621     - * from C via asm(".include <asm/nospec-branch.h>") but let's not go there.
622     - */
623     -
624     -#define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */
625     -#define RSB_FILL_LOOPS 16 /* To avoid underflow */
626     -
627     -/*
628     - * Google experimented with loop-unrolling and this turned out to be
629     - * the optimal version — two calls, each with their own speculation
630     - * trap should their return address end up getting used, in a loop.
631     - */
632     -#define __FILL_RETURN_BUFFER(reg, nr, sp) \
633     - mov $(nr/2), reg; \
634     -771: \
635     - call 772f; \
636     -773: /* speculation trap */ \
637     - pause; \
638     - lfence; \
639     - jmp 773b; \
640     -772: \
641     - call 774f; \
642     -775: /* speculation trap */ \
643     - pause; \
644     - lfence; \
645     - jmp 775b; \
646     -774: \
647     - dec reg; \
648     - jnz 771b; \
649     - add $(BITS_PER_LONG/8) * nr, sp;
650     -
651     #ifdef __ASSEMBLY__
652    
653     /*
654     @@ -121,17 +77,10 @@
655     #endif
656     .endm
657    
658     - /*
659     - * A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP
660     - * monstrosity above, manually.
661     - */
662     -.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req
663     +/* This clobbers the BX register */
664     +.macro FILL_RETURN_BUFFER nr:req ftr:req
665     #ifdef CONFIG_RETPOLINE
666     - ANNOTATE_NOSPEC_ALTERNATIVE
667     - ALTERNATIVE "jmp .Lskip_rsb_\@", \
668     - __stringify(__FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP)) \
669     - \ftr
670     -.Lskip_rsb_\@:
671     + ALTERNATIVE "", "call __clear_rsb", \ftr
672     #endif
673     .endm
674    
675     @@ -201,22 +150,25 @@ extern char __indirect_thunk_end[];
676     * On VMEXIT we must ensure that no RSB predictions learned in the guest
677     * can be followed in the host, by overwriting the RSB completely. Both
678     * retpoline and IBRS mitigations for Spectre v2 need this; only on future
679     - * CPUs with IBRS_ATT *might* it be avoided.
680     + * CPUs with IBRS_ALL *might* it be avoided.
681     */
682     static inline void vmexit_fill_RSB(void)
683     {
684     #ifdef CONFIG_RETPOLINE
685     - unsigned long loops;
686     -
687     - asm volatile (ANNOTATE_NOSPEC_ALTERNATIVE
688     - ALTERNATIVE("jmp 910f",
689     - __stringify(__FILL_RETURN_BUFFER(%0, RSB_CLEAR_LOOPS, %1)),
690     - X86_FEATURE_RETPOLINE)
691     - "910:"
692     - : "=r" (loops), ASM_CALL_CONSTRAINT
693     - : : "memory" );
694     + alternative_input("",
695     + "call __fill_rsb",
696     + X86_FEATURE_RETPOLINE,
697     + ASM_NO_INPUT_CLOBBER(_ASM_BX, "memory"));
698     #endif
699     }
700    
701     +static inline void indirect_branch_prediction_barrier(void)
702     +{
703     + alternative_input("",
704     + "call __ibp_barrier",
705     + X86_FEATURE_USE_IBPB,
706     + ASM_NO_INPUT_CLOBBER("eax", "ecx", "edx", "memory"));
707     +}
708     +
709     #endif /* __ASSEMBLY__ */
710     -#endif /* __NOSPEC_BRANCH_H__ */
711     +#endif /* _ASM_X86_NOSPEC_BRANCH_H_ */
712     diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h
713     index ce245b0cdfca..0777e18a1d23 100644
714     --- a/arch/x86/include/asm/pgtable_32_types.h
715     +++ b/arch/x86/include/asm/pgtable_32_types.h
716     @@ -44,8 +44,9 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */
717     */
718     #define CPU_ENTRY_AREA_PAGES (NR_CPUS * 40)
719    
720     -#define CPU_ENTRY_AREA_BASE \
721     - ((FIXADDR_START - PAGE_SIZE * (CPU_ENTRY_AREA_PAGES + 1)) & PMD_MASK)
722     +#define CPU_ENTRY_AREA_BASE \
723     + ((FIXADDR_TOT_START - PAGE_SIZE * (CPU_ENTRY_AREA_PAGES + 1)) \
724     + & PMD_MASK)
725    
726     #define PKMAP_BASE \
727     ((CPU_ENTRY_AREA_BASE - PAGE_SIZE) & PMD_MASK)
728     diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
729     index d3a67fba200a..513f9604c192 100644
730     --- a/arch/x86/include/asm/processor.h
731     +++ b/arch/x86/include/asm/processor.h
732     @@ -460,8 +460,6 @@ struct thread_struct {
733     unsigned short gsindex;
734     #endif
735    
736     - u32 status; /* thread synchronous flags */
737     -
738     #ifdef CONFIG_X86_64
739     unsigned long fsbase;
740     unsigned long gsbase;
741     @@ -971,4 +969,7 @@ bool xen_set_default_idle(void);
742    
743     void stop_this_cpu(void *dummy);
744     void df_debug(struct pt_regs *regs, long error_code);
745     +
746     +void __ibp_barrier(void);
747     +
748     #endif /* _ASM_X86_PROCESSOR_H */
749     diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
750     index d91ba04dd007..fb3a6de7440b 100644
751     --- a/arch/x86/include/asm/required-features.h
752     +++ b/arch/x86/include/asm/required-features.h
753     @@ -106,6 +106,7 @@
754     #define REQUIRED_MASK15 0
755     #define REQUIRED_MASK16 (NEED_LA57)
756     #define REQUIRED_MASK17 0
757     -#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18)
758     +#define REQUIRED_MASK18 0
759     +#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19)
760    
761     #endif /* _ASM_X86_REQUIRED_FEATURES_H */
762     diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
763     index e3c95e8e61c5..03eedc21246d 100644
764     --- a/arch/x86/include/asm/syscall.h
765     +++ b/arch/x86/include/asm/syscall.h
766     @@ -60,7 +60,7 @@ static inline long syscall_get_error(struct task_struct *task,
767     * TS_COMPAT is set for 32-bit syscall entries and then
768     * remains set until we return to user mode.
769     */
770     - if (task->thread.status & (TS_COMPAT|TS_I386_REGS_POKED))
771     + if (task->thread_info.status & (TS_COMPAT|TS_I386_REGS_POKED))
772     /*
773     * Sign-extend the value so (int)-EFOO becomes (long)-EFOO
774     * and will match correctly in comparisons.
775     @@ -116,7 +116,7 @@ static inline void syscall_get_arguments(struct task_struct *task,
776     unsigned long *args)
777     {
778     # ifdef CONFIG_IA32_EMULATION
779     - if (task->thread.status & TS_COMPAT)
780     + if (task->thread_info.status & TS_COMPAT)
781     switch (i) {
782     case 0:
783     if (!n--) break;
784     @@ -177,7 +177,7 @@ static inline void syscall_set_arguments(struct task_struct *task,
785     const unsigned long *args)
786     {
787     # ifdef CONFIG_IA32_EMULATION
788     - if (task->thread.status & TS_COMPAT)
789     + if (task->thread_info.status & TS_COMPAT)
790     switch (i) {
791     case 0:
792     if (!n--) break;
793     diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
794     index 00223333821a..eda3b6823ca4 100644
795     --- a/arch/x86/include/asm/thread_info.h
796     +++ b/arch/x86/include/asm/thread_info.h
797     @@ -55,6 +55,7 @@ struct task_struct;
798    
799     struct thread_info {
800     unsigned long flags; /* low level flags */
801     + u32 status; /* thread synchronous flags */
802     };
803    
804     #define INIT_THREAD_INFO(tsk) \
805     @@ -221,7 +222,7 @@ static inline int arch_within_stack_frames(const void * const stack,
806     #define in_ia32_syscall() true
807     #else
808     #define in_ia32_syscall() (IS_ENABLED(CONFIG_IA32_EMULATION) && \
809     - current->thread.status & TS_COMPAT)
810     + current_thread_info()->status & TS_COMPAT)
811     #endif
812    
813     /*
814     diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
815     index d33e4a26dc7e..2b8f18ca5874 100644
816     --- a/arch/x86/include/asm/tlbflush.h
817     +++ b/arch/x86/include/asm/tlbflush.h
818     @@ -174,6 +174,8 @@ struct tlb_state {
819     struct mm_struct *loaded_mm;
820     u16 loaded_mm_asid;
821     u16 next_asid;
822     + /* last user mm's ctx id */
823     + u64 last_ctx_id;
824    
825     /*
826     * We can be in one of several states:
827     diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
828     index 574dff4d2913..aae77eb8491c 100644
829     --- a/arch/x86/include/asm/uaccess.h
830     +++ b/arch/x86/include/asm/uaccess.h
831     @@ -124,6 +124,11 @@ extern int __get_user_bad(void);
832    
833     #define __uaccess_begin() stac()
834     #define __uaccess_end() clac()
835     +#define __uaccess_begin_nospec() \
836     +({ \
837     + stac(); \
838     + barrier_nospec(); \
839     +})
840    
841     /*
842     * This is a type: either unsigned long, if the argument fits into
843     @@ -445,7 +450,7 @@ do { \
844     ({ \
845     int __gu_err; \
846     __inttype(*(ptr)) __gu_val; \
847     - __uaccess_begin(); \
848     + __uaccess_begin_nospec(); \
849     __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \
850     __uaccess_end(); \
851     (x) = (__force __typeof__(*(ptr)))__gu_val; \
852     @@ -487,6 +492,10 @@ struct __large_struct { unsigned long buf[100]; };
853     __uaccess_begin(); \
854     barrier();
855    
856     +#define uaccess_try_nospec do { \
857     + current->thread.uaccess_err = 0; \
858     + __uaccess_begin_nospec(); \
859     +
860     #define uaccess_catch(err) \
861     __uaccess_end(); \
862     (err) |= (current->thread.uaccess_err ? -EFAULT : 0); \
863     @@ -548,7 +557,7 @@ struct __large_struct { unsigned long buf[100]; };
864     * get_user_ex(...);
865     * } get_user_catch(err)
866     */
867     -#define get_user_try uaccess_try
868     +#define get_user_try uaccess_try_nospec
869     #define get_user_catch(err) uaccess_catch(err)
870    
871     #define get_user_ex(x, ptr) do { \
872     @@ -582,7 +591,7 @@ extern void __cmpxchg_wrong_size(void)
873     __typeof__(ptr) __uval = (uval); \
874     __typeof__(*(ptr)) __old = (old); \
875     __typeof__(*(ptr)) __new = (new); \
876     - __uaccess_begin(); \
877     + __uaccess_begin_nospec(); \
878     switch (size) { \
879     case 1: \
880     { \
881     diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
882     index 72950401b223..ba2dc1930630 100644
883     --- a/arch/x86/include/asm/uaccess_32.h
884     +++ b/arch/x86/include/asm/uaccess_32.h
885     @@ -29,21 +29,21 @@ raw_copy_from_user(void *to, const void __user *from, unsigned long n)
886     switch (n) {
887     case 1:
888     ret = 0;
889     - __uaccess_begin();
890     + __uaccess_begin_nospec();
891     __get_user_asm_nozero(*(u8 *)to, from, ret,
892     "b", "b", "=q", 1);
893     __uaccess_end();
894     return ret;
895     case 2:
896     ret = 0;
897     - __uaccess_begin();
898     + __uaccess_begin_nospec();
899     __get_user_asm_nozero(*(u16 *)to, from, ret,
900     "w", "w", "=r", 2);
901     __uaccess_end();
902     return ret;
903     case 4:
904     ret = 0;
905     - __uaccess_begin();
906     + __uaccess_begin_nospec();
907     __get_user_asm_nozero(*(u32 *)to, from, ret,
908     "l", "k", "=r", 4);
909     __uaccess_end();
910     diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
911     index f07ef3c575db..62546b3a398e 100644
912     --- a/arch/x86/include/asm/uaccess_64.h
913     +++ b/arch/x86/include/asm/uaccess_64.h
914     @@ -55,31 +55,31 @@ raw_copy_from_user(void *dst, const void __user *src, unsigned long size)
915     return copy_user_generic(dst, (__force void *)src, size);
916     switch (size) {
917     case 1:
918     - __uaccess_begin();
919     + __uaccess_begin_nospec();
920     __get_user_asm_nozero(*(u8 *)dst, (u8 __user *)src,
921     ret, "b", "b", "=q", 1);
922     __uaccess_end();
923     return ret;
924     case 2:
925     - __uaccess_begin();
926     + __uaccess_begin_nospec();
927     __get_user_asm_nozero(*(u16 *)dst, (u16 __user *)src,
928     ret, "w", "w", "=r", 2);
929     __uaccess_end();
930     return ret;
931     case 4:
932     - __uaccess_begin();
933     + __uaccess_begin_nospec();
934     __get_user_asm_nozero(*(u32 *)dst, (u32 __user *)src,
935     ret, "l", "k", "=r", 4);
936     __uaccess_end();
937     return ret;
938     case 8:
939     - __uaccess_begin();
940     + __uaccess_begin_nospec();
941     __get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src,
942     ret, "q", "", "=r", 8);
943     __uaccess_end();
944     return ret;
945     case 10:
946     - __uaccess_begin();
947     + __uaccess_begin_nospec();
948     __get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src,
949     ret, "q", "", "=r", 10);
950     if (likely(!ret))
951     @@ -89,7 +89,7 @@ raw_copy_from_user(void *dst, const void __user *src, unsigned long size)
952     __uaccess_end();
953     return ret;
954     case 16:
955     - __uaccess_begin();
956     + __uaccess_begin_nospec();
957     __get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src,
958     ret, "q", "", "=r", 16);
959     if (likely(!ret))
960     diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
961     index 4817d743c263..a481763a3776 100644
962     --- a/arch/x86/kernel/alternative.c
963     +++ b/arch/x86/kernel/alternative.c
964     @@ -46,17 +46,6 @@ static int __init setup_noreplace_smp(char *str)
965     }
966     __setup("noreplace-smp", setup_noreplace_smp);
967    
968     -#ifdef CONFIG_PARAVIRT
969     -static int __initdata_or_module noreplace_paravirt = 0;
970     -
971     -static int __init setup_noreplace_paravirt(char *str)
972     -{
973     - noreplace_paravirt = 1;
974     - return 1;
975     -}
976     -__setup("noreplace-paravirt", setup_noreplace_paravirt);
977     -#endif
978     -
979     #define DPRINTK(fmt, args...) \
980     do { \
981     if (debug_alternative) \
982     @@ -298,7 +287,7 @@ recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf)
983     tgt_rip = next_rip + o_dspl;
984     n_dspl = tgt_rip - orig_insn;
985    
986     - DPRINTK("target RIP: %p, new_displ: 0x%x", tgt_rip, n_dspl);
987     + DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl);
988    
989     if (tgt_rip - orig_insn >= 0) {
990     if (n_dspl - 2 <= 127)
991     @@ -355,7 +344,7 @@ static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *ins
992     add_nops(instr + (a->instrlen - a->padlen), a->padlen);
993     local_irq_restore(flags);
994    
995     - DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ",
996     + DUMP_BYTES(instr, a->instrlen, "%px: [%d:%d) optimized NOPs: ",
997     instr, a->instrlen - a->padlen, a->padlen);
998     }
999    
1000     @@ -376,7 +365,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
1001     u8 *instr, *replacement;
1002     u8 insnbuf[MAX_PATCH_LEN];
1003    
1004     - DPRINTK("alt table %p -> %p", start, end);
1005     + DPRINTK("alt table %px, -> %px", start, end);
1006     /*
1007     * The scan order should be from start to end. A later scanned
1008     * alternative code can overwrite previously scanned alternative code.
1009     @@ -400,14 +389,14 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
1010     continue;
1011     }
1012    
1013     - DPRINTK("feat: %d*32+%d, old: (%p, len: %d), repl: (%p, len: %d), pad: %d",
1014     + DPRINTK("feat: %d*32+%d, old: (%px len: %d), repl: (%px, len: %d), pad: %d",
1015     a->cpuid >> 5,
1016     a->cpuid & 0x1f,
1017     instr, a->instrlen,
1018     replacement, a->replacementlen, a->padlen);
1019    
1020     - DUMP_BYTES(instr, a->instrlen, "%p: old_insn: ", instr);
1021     - DUMP_BYTES(replacement, a->replacementlen, "%p: rpl_insn: ", replacement);
1022     + DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr);
1023     + DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
1024    
1025     memcpy(insnbuf, replacement, a->replacementlen);
1026     insnbuf_sz = a->replacementlen;
1027     @@ -433,7 +422,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
1028     a->instrlen - a->replacementlen);
1029     insnbuf_sz += a->instrlen - a->replacementlen;
1030     }
1031     - DUMP_BYTES(insnbuf, insnbuf_sz, "%p: final_insn: ", instr);
1032     + DUMP_BYTES(insnbuf, insnbuf_sz, "%px: final_insn: ", instr);
1033    
1034     text_poke_early(instr, insnbuf, insnbuf_sz);
1035     }
1036     @@ -599,9 +588,6 @@ void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
1037     struct paravirt_patch_site *p;
1038     char insnbuf[MAX_PATCH_LEN];
1039    
1040     - if (noreplace_paravirt)
1041     - return;
1042     -
1043     for (p = start; p < end; p++) {
1044     unsigned int used;
1045    
1046     diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
1047     index 390b3dc3d438..71949bf2de5a 100644
1048     --- a/arch/x86/kernel/cpu/bugs.c
1049     +++ b/arch/x86/kernel/cpu/bugs.c
1050     @@ -11,6 +11,7 @@
1051     #include <linux/init.h>
1052     #include <linux/utsname.h>
1053     #include <linux/cpu.h>
1054     +#include <linux/module.h>
1055    
1056     #include <asm/nospec-branch.h>
1057     #include <asm/cmdline.h>
1058     @@ -90,20 +91,41 @@ static const char *spectre_v2_strings[] = {
1059     };
1060    
1061     #undef pr_fmt
1062     -#define pr_fmt(fmt) "Spectre V2 mitigation: " fmt
1063     +#define pr_fmt(fmt) "Spectre V2 : " fmt
1064    
1065     static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE;
1066    
1067     +#ifdef RETPOLINE
1068     +static bool spectre_v2_bad_module;
1069     +
1070     +bool retpoline_module_ok(bool has_retpoline)
1071     +{
1072     + if (spectre_v2_enabled == SPECTRE_V2_NONE || has_retpoline)
1073     + return true;
1074     +
1075     + pr_err("System may be vulnerable to spectre v2\n");
1076     + spectre_v2_bad_module = true;
1077     + return false;
1078     +}
1079     +
1080     +static inline const char *spectre_v2_module_string(void)
1081     +{
1082     + return spectre_v2_bad_module ? " - vulnerable module loaded" : "";
1083     +}
1084     +#else
1085     +static inline const char *spectre_v2_module_string(void) { return ""; }
1086     +#endif
1087     +
1088     static void __init spec2_print_if_insecure(const char *reason)
1089     {
1090     if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
1091     - pr_info("%s\n", reason);
1092     + pr_info("%s selected on command line.\n", reason);
1093     }
1094    
1095     static void __init spec2_print_if_secure(const char *reason)
1096     {
1097     if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
1098     - pr_info("%s\n", reason);
1099     + pr_info("%s selected on command line.\n", reason);
1100     }
1101    
1102     static inline bool retp_compiler(void)
1103     @@ -118,42 +140,68 @@ static inline bool match_option(const char *arg, int arglen, const char *opt)
1104     return len == arglen && !strncmp(arg, opt, len);
1105     }
1106    
1107     +static const struct {
1108     + const char *option;
1109     + enum spectre_v2_mitigation_cmd cmd;
1110     + bool secure;
1111     +} mitigation_options[] = {
1112     + { "off", SPECTRE_V2_CMD_NONE, false },
1113     + { "on", SPECTRE_V2_CMD_FORCE, true },
1114     + { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false },
1115     + { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_AMD, false },
1116     + { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false },
1117     + { "auto", SPECTRE_V2_CMD_AUTO, false },
1118     +};
1119     +
1120     static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
1121     {
1122     char arg[20];
1123     - int ret;
1124     -
1125     - ret = cmdline_find_option(boot_command_line, "spectre_v2", arg,
1126     - sizeof(arg));
1127     - if (ret > 0) {
1128     - if (match_option(arg, ret, "off")) {
1129     - goto disable;
1130     - } else if (match_option(arg, ret, "on")) {
1131     - spec2_print_if_secure("force enabled on command line.");
1132     - return SPECTRE_V2_CMD_FORCE;
1133     - } else if (match_option(arg, ret, "retpoline")) {
1134     - spec2_print_if_insecure("retpoline selected on command line.");
1135     - return SPECTRE_V2_CMD_RETPOLINE;
1136     - } else if (match_option(arg, ret, "retpoline,amd")) {
1137     - if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
1138     - pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n");
1139     - return SPECTRE_V2_CMD_AUTO;
1140     - }
1141     - spec2_print_if_insecure("AMD retpoline selected on command line.");
1142     - return SPECTRE_V2_CMD_RETPOLINE_AMD;
1143     - } else if (match_option(arg, ret, "retpoline,generic")) {
1144     - spec2_print_if_insecure("generic retpoline selected on command line.");
1145     - return SPECTRE_V2_CMD_RETPOLINE_GENERIC;
1146     - } else if (match_option(arg, ret, "auto")) {
1147     + int ret, i;
1148     + enum spectre_v2_mitigation_cmd cmd = SPECTRE_V2_CMD_AUTO;
1149     +
1150     + if (cmdline_find_option_bool(boot_command_line, "nospectre_v2"))
1151     + return SPECTRE_V2_CMD_NONE;
1152     + else {
1153     + ret = cmdline_find_option(boot_command_line, "spectre_v2", arg,
1154     + sizeof(arg));
1155     + if (ret < 0)
1156     + return SPECTRE_V2_CMD_AUTO;
1157     +
1158     + for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) {
1159     + if (!match_option(arg, ret, mitigation_options[i].option))
1160     + continue;
1161     + cmd = mitigation_options[i].cmd;
1162     + break;
1163     + }
1164     +
1165     + if (i >= ARRAY_SIZE(mitigation_options)) {
1166     + pr_err("unknown option (%s). Switching to AUTO select\n",
1167     + mitigation_options[i].option);
1168     return SPECTRE_V2_CMD_AUTO;
1169     }
1170     }
1171    
1172     - if (!cmdline_find_option_bool(boot_command_line, "nospectre_v2"))
1173     + if ((cmd == SPECTRE_V2_CMD_RETPOLINE ||
1174     + cmd == SPECTRE_V2_CMD_RETPOLINE_AMD ||
1175     + cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC) &&
1176     + !IS_ENABLED(CONFIG_RETPOLINE)) {
1177     + pr_err("%s selected but not compiled in. Switching to AUTO select\n",
1178     + mitigation_options[i].option);
1179     return SPECTRE_V2_CMD_AUTO;
1180     -disable:
1181     - spec2_print_if_insecure("disabled on command line.");
1182     - return SPECTRE_V2_CMD_NONE;
1183     + }
1184     +
1185     + if (cmd == SPECTRE_V2_CMD_RETPOLINE_AMD &&
1186     + boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
1187     + pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n");
1188     + return SPECTRE_V2_CMD_AUTO;
1189     + }
1190     +
1191     + if (mitigation_options[i].secure)
1192     + spec2_print_if_secure(mitigation_options[i].option);
1193     + else
1194     + spec2_print_if_insecure(mitigation_options[i].option);
1195     +
1196     + return cmd;
1197     }
1198    
1199     /* Check for Skylake-like CPUs (for RSB handling) */
1200     @@ -191,10 +239,10 @@ static void __init spectre_v2_select_mitigation(void)
1201     return;
1202    
1203     case SPECTRE_V2_CMD_FORCE:
1204     - /* FALLTRHU */
1205     case SPECTRE_V2_CMD_AUTO:
1206     - goto retpoline_auto;
1207     -
1208     + if (IS_ENABLED(CONFIG_RETPOLINE))
1209     + goto retpoline_auto;
1210     + break;
1211     case SPECTRE_V2_CMD_RETPOLINE_AMD:
1212     if (IS_ENABLED(CONFIG_RETPOLINE))
1213     goto retpoline_amd;
1214     @@ -249,6 +297,12 @@ static void __init spectre_v2_select_mitigation(void)
1215     setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
1216     pr_info("Filling RSB on context switch\n");
1217     }
1218     +
1219     + /* Initialize Indirect Branch Prediction Barrier if supported */
1220     + if (boot_cpu_has(X86_FEATURE_IBPB)) {
1221     + setup_force_cpu_cap(X86_FEATURE_USE_IBPB);
1222     + pr_info("Enabling Indirect Branch Prediction Barrier\n");
1223     + }
1224     }
1225    
1226     #undef pr_fmt
1227     @@ -269,7 +323,7 @@ ssize_t cpu_show_spectre_v1(struct device *dev,
1228     {
1229     if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1))
1230     return sprintf(buf, "Not affected\n");
1231     - return sprintf(buf, "Vulnerable\n");
1232     + return sprintf(buf, "Mitigation: __user pointer sanitization\n");
1233     }
1234    
1235     ssize_t cpu_show_spectre_v2(struct device *dev,
1236     @@ -278,6 +332,14 @@ ssize_t cpu_show_spectre_v2(struct device *dev,
1237     if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
1238     return sprintf(buf, "Not affected\n");
1239    
1240     - return sprintf(buf, "%s\n", spectre_v2_strings[spectre_v2_enabled]);
1241     + return sprintf(buf, "%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
1242     + boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "",
1243     + spectre_v2_module_string());
1244     }
1245     #endif
1246     +
1247     +void __ibp_barrier(void)
1248     +{
1249     + __wrmsr(MSR_IA32_PRED_CMD, PRED_CMD_IBPB, 0);
1250     +}
1251     +EXPORT_SYMBOL_GPL(__ibp_barrier);
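
[Editor's illustration, not part of the patch: cpu_show_spectre_v1() and cpu_show_spectre_v2() above back the files under /sys/devices/system/cpu/vulnerabilities/, so the new "Mitigation: __user pointer sanitization" and ", IBPB" strings become visible there on a patched kernel. A small C sketch that reads those files:]

    #include <stdio.h>

    static void show(const char *name)
    {
            char path[128], line[256];
            FILE *f;

            snprintf(path, sizeof(path),
                     "/sys/devices/system/cpu/vulnerabilities/%s", name);
            f = fopen(path, "r");
            if (f && fgets(line, sizeof(line), f))
                    printf("%-12s %s", name, line);   /* line keeps its newline */
            else
                    printf("%-12s <not exposed by this kernel>\n", name);
            if (f)
                    fclose(f);
    }

    int main(void)
    {
            show("meltdown");
            show("spectre_v1");
            show("spectre_v2");
            return 0;
    }
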
1252     diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
1253     index ef29ad001991..d63f4b5706e4 100644
1254     --- a/arch/x86/kernel/cpu/common.c
1255     +++ b/arch/x86/kernel/cpu/common.c
1256     @@ -47,6 +47,8 @@
1257     #include <asm/pat.h>
1258     #include <asm/microcode.h>
1259     #include <asm/microcode_intel.h>
1260     +#include <asm/intel-family.h>
1261     +#include <asm/cpu_device_id.h>
1262    
1263     #ifdef CONFIG_X86_LOCAL_APIC
1264     #include <asm/uv/uv.h>
1265     @@ -748,6 +750,26 @@ static void apply_forced_caps(struct cpuinfo_x86 *c)
1266     }
1267     }
1268    
1269     +static void init_speculation_control(struct cpuinfo_x86 *c)
1270     +{
1271     + /*
1272     + * The Intel SPEC_CTRL CPUID bit implies IBRS and IBPB support,
1273     + * and they also have a different bit for STIBP support. Also,
1274     + * a hypervisor might have set the individual AMD bits even on
1275     + * Intel CPUs, for finer-grained selection of what's available.
1276     + *
1277     + * We use the AMD bits in 0x8000_0008 EBX as the generic hardware
1278     + * features, which are visible in /proc/cpuinfo and used by the
1279     + * kernel. So set those accordingly from the Intel bits.
1280     + */
1281     + if (cpu_has(c, X86_FEATURE_SPEC_CTRL)) {
1282     + set_cpu_cap(c, X86_FEATURE_IBRS);
1283     + set_cpu_cap(c, X86_FEATURE_IBPB);
1284     + }
1285     + if (cpu_has(c, X86_FEATURE_INTEL_STIBP))
1286     + set_cpu_cap(c, X86_FEATURE_STIBP);
1287     +}
1288     +
1289     void get_cpu_cap(struct cpuinfo_x86 *c)
1290     {
1291     u32 eax, ebx, ecx, edx;
1292     @@ -769,6 +791,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
1293     cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
1294     c->x86_capability[CPUID_7_0_EBX] = ebx;
1295     c->x86_capability[CPUID_7_ECX] = ecx;
1296     + c->x86_capability[CPUID_7_EDX] = edx;
1297     }
1298    
1299     /* Extended state features: level 0x0000000d */
1300     @@ -841,6 +864,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
1301     c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a);
1302    
1303     init_scattered_cpuid_features(c);
1304     + init_speculation_control(c);
1305    
1306     /*
1307     * Clear/Set all flags overridden by options, after probe.
1308     @@ -876,6 +900,41 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
1309     #endif
1310     }
1311    
1312     +static const __initconst struct x86_cpu_id cpu_no_speculation[] = {
1313     + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CEDARVIEW, X86_FEATURE_ANY },
1314     + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CLOVERVIEW, X86_FEATURE_ANY },
1315     + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_LINCROFT, X86_FEATURE_ANY },
1316     + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PENWELL, X86_FEATURE_ANY },
1317     + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PINEVIEW, X86_FEATURE_ANY },
1318     + { X86_VENDOR_CENTAUR, 5 },
1319     + { X86_VENDOR_INTEL, 5 },
1320     + { X86_VENDOR_NSC, 5 },
1321     + { X86_VENDOR_ANY, 4 },
1322     + {}
1323     +};
1324     +
1325     +static const __initconst struct x86_cpu_id cpu_no_meltdown[] = {
1326     + { X86_VENDOR_AMD },
1327     + {}
1328     +};
1329     +
1330     +static bool __init cpu_vulnerable_to_meltdown(struct cpuinfo_x86 *c)
1331     +{
1332     + u64 ia32_cap = 0;
1333     +
1334     + if (x86_match_cpu(cpu_no_meltdown))
1335     + return false;
1336     +
1337     + if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES))
1338     + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
1339     +
1340     + /* Rogue Data Cache Load? No! */
1341     + if (ia32_cap & ARCH_CAP_RDCL_NO)
1342     + return false;
1343     +
1344     + return true;
1345     +}
1346     +
1347     /*
1348     * Do minimum CPU detection early.
1349     * Fields really needed: vendor, cpuid_level, family, model, mask,
1350     @@ -923,11 +982,12 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
1351    
1352     setup_force_cpu_cap(X86_FEATURE_ALWAYS);
1353    
1354     - if (c->x86_vendor != X86_VENDOR_AMD)
1355     - setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
1356     -
1357     - setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
1358     - setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
1359     + if (!x86_match_cpu(cpu_no_speculation)) {
1360     + if (cpu_vulnerable_to_meltdown(c))
1361     + setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
1362     + setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
1363     + setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
1364     + }
1365    
1366     fpu__init_system(c);
1367    
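
[Editor's illustration, not part of the patch: capability word 18 introduced by this patch is CPUID leaf 7, sub-leaf 0, EDX, so X86_FEATURE_SPEC_CTRL, INTEL_STIBP and ARCH_CAPABILITIES correspond to EDX bits 26, 27 and 29; init_speculation_control() above then folds the Intel bits into the AMD-style IBRS/IBPB/STIBP flags. The same leaf can be probed from user space; a sketch using GCC/Clang's <cpuid.h> helper (assumed available):]

    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx;

            /* CPUID leaf 7, sub-leaf 0 -- the source of capability word 18. */
            if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
                    puts("CPUID leaf 7 not supported");
                    return 1;
            }
            printf("SPEC_CTRL (IBRS+IBPB): %d\n", !!(edx & (1u << 26)));
            printf("INTEL_STIBP:           %d\n", !!(edx & (1u << 27)));
            printf("ARCH_CAPABILITIES:     %d\n", !!(edx & (1u << 29)));
            return 0;
    }
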
1368     diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
1369     index b1af22073e28..319bf989fad1 100644
1370     --- a/arch/x86/kernel/cpu/intel.c
1371     +++ b/arch/x86/kernel/cpu/intel.c
1372     @@ -102,6 +102,59 @@ static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c)
1373     ELF_HWCAP2 |= HWCAP2_RING3MWAIT;
1374     }
1375    
1376     +/*
1377     + * Early microcode releases for the Spectre v2 mitigation were broken.
1378     + * Information taken from;
1379     + * - https://newsroom.intel.com/wp-content/uploads/sites/11/2018/01/microcode-update-guidance.pdf
1380     + * - https://kb.vmware.com/s/article/52345
1381     + * - Microcode revisions observed in the wild
1382     + * - Release note from 20180108 microcode release
1383     + */
1384     +struct sku_microcode {
1385     + u8 model;
1386     + u8 stepping;
1387     + u32 microcode;
1388     +};
1389     +static const struct sku_microcode spectre_bad_microcodes[] = {
1390     + { INTEL_FAM6_KABYLAKE_DESKTOP, 0x0B, 0x84 },
1391     + { INTEL_FAM6_KABYLAKE_DESKTOP, 0x0A, 0x84 },
1392     + { INTEL_FAM6_KABYLAKE_DESKTOP, 0x09, 0x84 },
1393     + { INTEL_FAM6_KABYLAKE_MOBILE, 0x0A, 0x84 },
1394     + { INTEL_FAM6_KABYLAKE_MOBILE, 0x09, 0x84 },
1395     + { INTEL_FAM6_SKYLAKE_X, 0x03, 0x0100013e },
1396     + { INTEL_FAM6_SKYLAKE_X, 0x04, 0x0200003c },
1397     + { INTEL_FAM6_SKYLAKE_MOBILE, 0x03, 0xc2 },
1398     + { INTEL_FAM6_SKYLAKE_DESKTOP, 0x03, 0xc2 },
1399     + { INTEL_FAM6_BROADWELL_CORE, 0x04, 0x28 },
1400     + { INTEL_FAM6_BROADWELL_GT3E, 0x01, 0x1b },
1401     + { INTEL_FAM6_BROADWELL_XEON_D, 0x02, 0x14 },
1402     + { INTEL_FAM6_BROADWELL_XEON_D, 0x03, 0x07000011 },
1403     + { INTEL_FAM6_BROADWELL_X, 0x01, 0x0b000025 },
1404     + { INTEL_FAM6_HASWELL_ULT, 0x01, 0x21 },
1405     + { INTEL_FAM6_HASWELL_GT3E, 0x01, 0x18 },
1406     + { INTEL_FAM6_HASWELL_CORE, 0x03, 0x23 },
1407     + { INTEL_FAM6_HASWELL_X, 0x02, 0x3b },
1408     + { INTEL_FAM6_HASWELL_X, 0x04, 0x10 },
1409     + { INTEL_FAM6_IVYBRIDGE_X, 0x04, 0x42a },
1410     + /* Updated in the 20180108 release; blacklist until we know otherwise */
1411     + { INTEL_FAM6_ATOM_GEMINI_LAKE, 0x01, 0x22 },
1412     + /* Observed in the wild */
1413     + { INTEL_FAM6_SANDYBRIDGE_X, 0x06, 0x61b },
1414     + { INTEL_FAM6_SANDYBRIDGE_X, 0x07, 0x712 },
1415     +};
1416     +
1417     +static bool bad_spectre_microcode(struct cpuinfo_x86 *c)
1418     +{
1419     + int i;
1420     +
1421     + for (i = 0; i < ARRAY_SIZE(spectre_bad_microcodes); i++) {
1422     + if (c->x86_model == spectre_bad_microcodes[i].model &&
1423     + c->x86_mask == spectre_bad_microcodes[i].stepping)
1424     + return (c->microcode <= spectre_bad_microcodes[i].microcode);
1425     + }
1426     + return false;
1427     +}
1428     +
1429     static void early_init_intel(struct cpuinfo_x86 *c)
1430     {
1431     u64 misc_enable;
1432     @@ -122,6 +175,19 @@ static void early_init_intel(struct cpuinfo_x86 *c)
1433     if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64))
1434     c->microcode = intel_get_microcode_revision();
1435    
1436     + /* Now if any of them are set, check the blacklist and clear the lot */
1437     + if ((cpu_has(c, X86_FEATURE_SPEC_CTRL) ||
1438     + cpu_has(c, X86_FEATURE_INTEL_STIBP) ||
1439     + cpu_has(c, X86_FEATURE_IBRS) || cpu_has(c, X86_FEATURE_IBPB) ||
1440     + cpu_has(c, X86_FEATURE_STIBP)) && bad_spectre_microcode(c)) {
1441     + pr_warn("Intel Spectre v2 broken microcode detected; disabling Speculation Control\n");
1442     + setup_clear_cpu_cap(X86_FEATURE_IBRS);
1443     + setup_clear_cpu_cap(X86_FEATURE_IBPB);
1444     + setup_clear_cpu_cap(X86_FEATURE_STIBP);
1445     + setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL);
1446     + setup_clear_cpu_cap(X86_FEATURE_INTEL_STIBP);
1447     + }
1448     +
1449     /*
1450     * Atom erratum AAE44/AAF40/AAG38/AAH41:
1451     *
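The intel.c hunk above adds a model/stepping/revision blacklist and, when the running microcode matches, clears every speculation-control feature bit so the kernel never programs a broken IBRS/IBPB interface. A small user-space approximation of the lookup, with a single illustrative table entry (the real list is in the patch):

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    struct sku_microcode { uint8_t model, stepping; uint32_t microcode; };

    /* One example entry only; see the spectre_bad_microcodes table in the patch. */
    static const struct sku_microcode bad[] = {
        { 0x9e, 0x0b, 0x84 },
    };

    static bool bad_spectre_microcode(uint8_t model, uint8_t stepping, uint32_t rev)
    {
        for (size_t i = 0; i < sizeof(bad) / sizeof(bad[0]); i++) {
            if (model == bad[i].model && stepping == bad[i].stepping)
                /* Anything at or below the listed revision is treated as broken. */
                return rev <= bad[i].microcode;
        }
        return false;
    }

    int main(void)
    {
        printf("%d\n", bad_spectre_microcode(0x9e, 0x0b, 0x80)); /* 1: blacklisted */
        printf("%d\n", bad_spectre_microcode(0x9e, 0x0b, 0x8e)); /* 0: newer revision */
        return 0;
    }

Note that the check in early_init_intel() fires only if at least one of SPEC_CTRL/IBRS/IBPB/STIBP was actually enumerated, and on a hit all of them are cleared together.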
1452     diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
1453     index d0e69769abfd..df11f5d604be 100644
1454     --- a/arch/x86/kernel/cpu/scattered.c
1455     +++ b/arch/x86/kernel/cpu/scattered.c
1456     @@ -21,8 +21,6 @@ struct cpuid_bit {
1457     static const struct cpuid_bit cpuid_bits[] = {
1458     { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 },
1459     { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 },
1460     - { X86_FEATURE_AVX512_4VNNIW, CPUID_EDX, 2, 0x00000007, 0 },
1461     - { X86_FEATURE_AVX512_4FMAPS, CPUID_EDX, 3, 0x00000007, 0 },
1462     { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 },
1463     { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 },
1464     { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 },
1465     diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
1466     index c75466232016..9eb448c7859d 100644
1467     --- a/arch/x86/kernel/process_64.c
1468     +++ b/arch/x86/kernel/process_64.c
1469     @@ -557,7 +557,7 @@ static void __set_personality_x32(void)
1470     * Pretend to come from a x32 execve.
1471     */
1472     task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT;
1473     - current->thread.status &= ~TS_COMPAT;
1474     + current_thread_info()->status &= ~TS_COMPAT;
1475     #endif
1476     }
1477    
1478     @@ -571,7 +571,7 @@ static void __set_personality_ia32(void)
1479     current->personality |= force_personality32;
1480     /* Prepare the first "return" to user space */
1481     task_pt_regs(current)->orig_ax = __NR_ia32_execve;
1482     - current->thread.status |= TS_COMPAT;
1483     + current_thread_info()->status |= TS_COMPAT;
1484     #endif
1485     }
1486    
1487     diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
1488     index f37d18124648..ed5c4cdf0a34 100644
1489     --- a/arch/x86/kernel/ptrace.c
1490     +++ b/arch/x86/kernel/ptrace.c
1491     @@ -935,7 +935,7 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value)
1492     */
1493     regs->orig_ax = value;
1494     if (syscall_get_nr(child, regs) >= 0)
1495     - child->thread.status |= TS_I386_REGS_POKED;
1496     + child->thread_info.status |= TS_I386_REGS_POKED;
1497     break;
1498    
1499     case offsetof(struct user32, regs.eflags):
1500     diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
1501     index b9e00e8f1c9b..4cdc0b27ec82 100644
1502     --- a/arch/x86/kernel/signal.c
1503     +++ b/arch/x86/kernel/signal.c
1504     @@ -787,7 +787,7 @@ static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs)
1505     * than the tracee.
1506     */
1507     #ifdef CONFIG_IA32_EMULATION
1508     - if (current->thread.status & (TS_COMPAT|TS_I386_REGS_POKED))
1509     + if (current_thread_info()->status & (TS_COMPAT|TS_I386_REGS_POKED))
1510     return __NR_ia32_restart_syscall;
1511     #endif
1512     #ifdef CONFIG_X86_X32_ABI
1513     diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
1514     index 0099e10eb045..13f5d4217e4f 100644
1515     --- a/arch/x86/kvm/cpuid.c
1516     +++ b/arch/x86/kvm/cpuid.c
1517     @@ -67,9 +67,7 @@ u64 kvm_supported_xcr0(void)
1518    
1519     #define F(x) bit(X86_FEATURE_##x)
1520    
1521     -/* These are scattered features in cpufeatures.h. */
1522     -#define KVM_CPUID_BIT_AVX512_4VNNIW 2
1523     -#define KVM_CPUID_BIT_AVX512_4FMAPS 3
1524     +/* For scattered features from cpufeatures.h; we currently expose none */
1525     #define KF(x) bit(KVM_CPUID_BIT_##x)
1526    
1527     int kvm_update_cpuid(struct kvm_vcpu *vcpu)
1528     @@ -367,6 +365,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1529     F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) |
1530     0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
1531    
1532     + /* cpuid 0x80000008.ebx */
1533     + const u32 kvm_cpuid_8000_0008_ebx_x86_features =
1534     + F(IBPB) | F(IBRS);
1535     +
1536     /* cpuid 0xC0000001.edx */
1537     const u32 kvm_cpuid_C000_0001_edx_x86_features =
1538     F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
1539     @@ -392,7 +394,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1540    
1541     /* cpuid 7.0.edx*/
1542     const u32 kvm_cpuid_7_0_edx_x86_features =
1543     - KF(AVX512_4VNNIW) | KF(AVX512_4FMAPS);
1544     + F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
1545     + F(ARCH_CAPABILITIES);
1546    
1547     /* all calls to cpuid_count() should be made on the same cpu */
1548     get_cpu();
1549     @@ -477,7 +480,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1550     if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
1551     entry->ecx &= ~F(PKU);
1552     entry->edx &= kvm_cpuid_7_0_edx_x86_features;
1553     - entry->edx &= get_scattered_cpuid_leaf(7, 0, CPUID_EDX);
1554     + cpuid_mask(&entry->edx, CPUID_7_EDX);
1555     } else {
1556     entry->ebx = 0;
1557     entry->ecx = 0;
1558     @@ -627,7 +630,14 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1559     if (!g_phys_as)
1560     g_phys_as = phys_as;
1561     entry->eax = g_phys_as | (virt_as << 8);
1562     - entry->ebx = entry->edx = 0;
1563     + entry->edx = 0;
1564     + /* IBRS and IBPB aren't necessarily present in hardware cpuid */
1565     + if (boot_cpu_has(X86_FEATURE_IBPB))
1566     + entry->ebx |= F(IBPB);
1567     + if (boot_cpu_has(X86_FEATURE_IBRS))
1568     + entry->ebx |= F(IBRS);
1569     + entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features;
1570     + cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX);
1571     break;
1572     }
1573     case 0x80000019:
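In the 0x80000008 handling above, IBPB and IBRS are ORed into the guest's EBX from the kernel's own feature flags, because they are not necessarily present in raw hardware CPUID, and the result is then clamped by KVM's allow-list and by cpuid_mask(). A toy model of that assembly order; the bit positions are the ones commonly documented for this leaf and the helper is purely illustrative:

    #include <stdint.h>
    #include <stdio.h>

    #define F_IBPB (1u << 12)   /* CPUID.80000008H:EBX, illustrative position */
    #define F_IBRS (1u << 14)

    static uint32_t build_8000_0008_ebx(int host_has_ibpb, int host_has_ibrs,
                                        uint32_t kvm_allowed, uint32_t host_mask)
    {
        uint32_t ebx = 0;

        if (host_has_ibpb)      /* boot_cpu_has(X86_FEATURE_IBPB) */
            ebx |= F_IBPB;
        if (host_has_ibrs)      /* boot_cpu_has(X86_FEATURE_IBRS) */
            ebx |= F_IBRS;

        ebx &= kvm_allowed;     /* kvm_cpuid_8000_0008_ebx_x86_features */
        ebx &= host_mask;       /* cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX) */
        return ebx;
    }

    int main(void)
    {
        /* Host has both; KVM allows both; host capability word has only IBPB. */
        printf("0x%08x\n", build_8000_0008_ebx(1, 1, F_IBPB | F_IBRS, F_IBPB));
        return 0;
    }

Masking against both the allow-list and the host capability word keeps KVM from advertising a control that the host kernel itself has cleared.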
1574     diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
1575     index c2cea6651279..9a327d5b6d1f 100644
1576     --- a/arch/x86/kvm/cpuid.h
1577     +++ b/arch/x86/kvm/cpuid.h
1578     @@ -54,6 +54,7 @@ static const struct cpuid_reg reverse_cpuid[] = {
1579     [CPUID_8000_000A_EDX] = {0x8000000a, 0, CPUID_EDX},
1580     [CPUID_7_ECX] = { 7, 0, CPUID_ECX},
1581     [CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX},
1582     + [CPUID_7_EDX] = { 7, 0, CPUID_EDX},
1583     };
1584    
1585     static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature)
1586     diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
1587     index b514b2b2845a..290ecf711aec 100644
1588     --- a/arch/x86/kvm/emulate.c
1589     +++ b/arch/x86/kvm/emulate.c
1590     @@ -25,6 +25,7 @@
1591     #include <asm/kvm_emulate.h>
1592     #include <linux/stringify.h>
1593     #include <asm/debugreg.h>
1594     +#include <asm/nospec-branch.h>
1595    
1596     #include "x86.h"
1597     #include "tss.h"
1598     @@ -1021,8 +1022,8 @@ static __always_inline u8 test_cc(unsigned int condition, unsigned long flags)
1599     void (*fop)(void) = (void *)em_setcc + 4 * (condition & 0xf);
1600    
1601     flags = (flags & EFLAGS_MASK) | X86_EFLAGS_IF;
1602     - asm("push %[flags]; popf; call *%[fastop]"
1603     - : "=a"(rc) : [fastop]"r"(fop), [flags]"r"(flags));
1604     + asm("push %[flags]; popf; " CALL_NOSPEC
1605     + : "=a"(rc) : [thunk_target]"r"(fop), [flags]"r"(flags));
1606     return rc;
1607     }
1608    
1609     @@ -5335,9 +5336,9 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *))
1610     if (!(ctxt->d & ByteOp))
1611     fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE;
1612    
1613     - asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n"
1614     + asm("push %[flags]; popf; " CALL_NOSPEC " ; pushf; pop %[flags]\n"
1615     : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags),
1616     - [fastop]"+S"(fop), ASM_CALL_CONSTRAINT
1617     + [thunk_target]"+S"(fop), ASM_CALL_CONSTRAINT
1618     : "c"(ctxt->src2.val));
1619    
1620     ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK);
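The emulate.c change above swaps the raw indirect calls into the fastop stubs for CALL_NOSPEC with a [thunk_target] operand, so that retpoline builds dispatch through a speculation-safe thunk rather than the indirect branch predictor. The snippet below is only a user-space illustration of the same idea: compiled normally, call_op() emits a plain indirect call; compiled with a retpoline option such as GCC's -mindirect-branch=thunk (assumed available in the toolchain), the same call is routed through a thunk.

    #include <stdio.h>

    static int add_one(int x) { return x + 1; }

    /* The indirect call through 'op' is the kind of branch CALL_NOSPEC protects. */
    static int call_op(int (*op)(int), int x)
    {
        return op(x);
    }

    int main(void)
    {
        printf("%d\n", call_op(add_one, 41));   /* prints 42 either way */
        return 0;
    }

Comparing the disassembly with and without the flag shows the call *%reg being replaced by a call to a __x86_indirect_thunk-style helper, which is what the kernel macro does by hand inside inline asm.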
1621     diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
1622     index f40d0da1f1d3..4e3c79530526 100644
1623     --- a/arch/x86/kvm/svm.c
1624     +++ b/arch/x86/kvm/svm.c
1625     @@ -184,6 +184,8 @@ struct vcpu_svm {
1626     u64 gs_base;
1627     } host;
1628    
1629     + u64 spec_ctrl;
1630     +
1631     u32 *msrpm;
1632    
1633     ulong nmi_iret_rip;
1634     @@ -249,6 +251,8 @@ static const struct svm_direct_access_msrs {
1635     { .index = MSR_CSTAR, .always = true },
1636     { .index = MSR_SYSCALL_MASK, .always = true },
1637     #endif
1638     + { .index = MSR_IA32_SPEC_CTRL, .always = false },
1639     + { .index = MSR_IA32_PRED_CMD, .always = false },
1640     { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
1641     { .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
1642     { .index = MSR_IA32_LASTINTFROMIP, .always = false },
1643     @@ -529,6 +533,7 @@ struct svm_cpu_data {
1644     struct kvm_ldttss_desc *tss_desc;
1645    
1646     struct page *save_area;
1647     + struct vmcb *current_vmcb;
1648     };
1649    
1650     static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
1651     @@ -880,6 +885,25 @@ static bool valid_msr_intercept(u32 index)
1652     return false;
1653     }
1654    
1655     +static bool msr_write_intercepted(struct kvm_vcpu *vcpu, unsigned msr)
1656     +{
1657     + u8 bit_write;
1658     + unsigned long tmp;
1659     + u32 offset;
1660     + u32 *msrpm;
1661     +
1662     + msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
1663     + to_svm(vcpu)->msrpm;
1664     +
1665     + offset = svm_msrpm_offset(msr);
1666     + bit_write = 2 * (msr & 0x0f) + 1;
1667     + tmp = msrpm[offset];
1668     +
1669     + BUG_ON(offset == MSR_INVALID);
1670     +
1671     + return !!test_bit(bit_write, &tmp);
1672     +}
1673     +
1674     static void set_msr_interception(u32 *msrpm, unsigned msr,
1675     int read, int write)
1676     {
1677     @@ -1582,6 +1606,8 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
1678     u32 dummy;
1679     u32 eax = 1;
1680    
1681     + svm->spec_ctrl = 0;
1682     +
1683     if (!init_event) {
1684     svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
1685     MSR_IA32_APICBASE_ENABLE;
1686     @@ -1703,11 +1729,17 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
1687     __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
1688     kvm_vcpu_uninit(vcpu);
1689     kmem_cache_free(kvm_vcpu_cache, svm);
1690     + /*
1691     + * The vmcb page can be recycled, causing a false negative in
1692     + * svm_vcpu_load(). So do a full IBPB now.
1693     + */
1694     + indirect_branch_prediction_barrier();
1695     }
1696    
1697     static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1698     {
1699     struct vcpu_svm *svm = to_svm(vcpu);
1700     + struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
1701     int i;
1702    
1703     if (unlikely(cpu != vcpu->cpu)) {
1704     @@ -1736,6 +1768,10 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1705     if (static_cpu_has(X86_FEATURE_RDTSCP))
1706     wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
1707    
1708     + if (sd->current_vmcb != svm->vmcb) {
1709     + sd->current_vmcb = svm->vmcb;
1710     + indirect_branch_prediction_barrier();
1711     + }
1712     avic_vcpu_load(vcpu, cpu);
1713     }
1714    
1715     @@ -3593,6 +3629,13 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
1716     case MSR_VM_CR:
1717     msr_info->data = svm->nested.vm_cr_msr;
1718     break;
1719     + case MSR_IA32_SPEC_CTRL:
1720     + if (!msr_info->host_initiated &&
1721     + !guest_cpuid_has(vcpu, X86_FEATURE_IBRS))
1722     + return 1;
1723     +
1724     + msr_info->data = svm->spec_ctrl;
1725     + break;
1726     case MSR_IA32_UCODE_REV:
1727     msr_info->data = 0x01000065;
1728     break;
1729     @@ -3684,6 +3727,49 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
1730     case MSR_IA32_TSC:
1731     kvm_write_tsc(vcpu, msr);
1732     break;
1733     + case MSR_IA32_SPEC_CTRL:
1734     + if (!msr->host_initiated &&
1735     + !guest_cpuid_has(vcpu, X86_FEATURE_IBRS))
1736     + return 1;
1737     +
1738     + /* The STIBP bit doesn't fault even if it's not advertised */
1739     + if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP))
1740     + return 1;
1741     +
1742     + svm->spec_ctrl = data;
1743     +
1744     + if (!data)
1745     + break;
1746     +
1747     + /*
1748     + * For non-nested:
1749     + * When it's written (to non-zero) for the first time, pass
1750     + * it through.
1751     + *
1752     + * For nested:
1753     + * The handling of the MSR bitmap for L2 guests is done in
1754     + * nested_svm_vmrun_msrpm.
1755     + * We update the L1 MSR bit as well since it will end up
1756     + * touching the MSR anyway now.
1757     + */
1758     + set_msr_interception(svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
1759     + break;
1760     + case MSR_IA32_PRED_CMD:
1761     + if (!msr->host_initiated &&
1762     + !guest_cpuid_has(vcpu, X86_FEATURE_IBPB))
1763     + return 1;
1764     +
1765     + if (data & ~PRED_CMD_IBPB)
1766     + return 1;
1767     +
1768     + if (!data)
1769     + break;
1770     +
1771     + wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
1772     + if (is_guest_mode(vcpu))
1773     + break;
1774     + set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
1775     + break;
1776     case MSR_STAR:
1777     svm->vmcb->save.star = data;
1778     break;
1779     @@ -4936,6 +5022,15 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
1780    
1781     local_irq_enable();
1782    
1783     + /*
1784     + * If this vCPU has touched SPEC_CTRL, restore the guest's value if
1785     + * it's non-zero. Since vmentry is serialising on affected CPUs, there
1786     + * is no need to worry about the conditional branch over the wrmsr
1787     + * being speculatively taken.
1788     + */
1789     + if (svm->spec_ctrl)
1790     + wrmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);
1791     +
1792     asm volatile (
1793     "push %%" _ASM_BP "; \n\t"
1794     "mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t"
1795     @@ -5028,6 +5123,27 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
1796     #endif
1797     );
1798    
1799     + /*
1800     + * We do not use IBRS in the kernel. If this vCPU has used the
1801     + * SPEC_CTRL MSR it may have left it on; save the value and
1802     + * turn it off. This is much more efficient than blindly adding
1803     + * it to the atomic save/restore list. Especially as the former
1804     + * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
1805     + *
1806     + * For non-nested case:
1807     + * If the L01 MSR bitmap does not intercept the MSR, then we need to
1808     + * save it.
1809     + *
1810     + * For nested case:
1811     + * If the L02 MSR bitmap does not intercept the MSR, then we need to
1812     + * save it.
1813     + */
1814     + if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))
1815     + rdmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);
1816     +
1817     + if (svm->spec_ctrl)
1818     + wrmsrl(MSR_IA32_SPEC_CTRL, 0);
1819     +
1820     /* Eliminate branch target predictions from guest mode */
1821     vmexit_fill_RSB();
1822    
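The svm_vcpu_run() hunks above bracket the guest run: before entry the guest's SPEC_CTRL value is restored if it is non-zero, and after exit the value is read back only when writes to the MSR are not intercepted, after which the host is switched back to 0. A stand-alone sketch of that sequence with stubbed helpers; none of these functions are the kernel's, they only model the control flow.

    #include <stdint.h>
    #include <stdio.h>

    #define MSR_IA32_SPEC_CTRL 0x48

    static uint64_t msr_shadow;                                /* stands in for the hardware MSR */
    static void write_msr(uint32_t msr, uint64_t v) { (void)msr; msr_shadow = v; }
    static uint64_t read_msr(uint32_t msr) { (void)msr; return msr_shadow; }
    static void run_guest(void) { msr_shadow = 1; }            /* pretend the guest set IBRS */
    static int write_intercepted(uint32_t msr) { (void)msr; return 0; }  /* passed through */

    int main(void)
    {
        uint64_t guest_spec_ctrl = 0;                          /* svm->spec_ctrl */

        /* vmentry: restore only if the guest ever wrote a non-zero value. */
        if (guest_spec_ctrl)
            write_msr(MSR_IA32_SPEC_CTRL, guest_spec_ctrl);

        run_guest();

        /* vmexit: if writes were passed through, the guest may have changed the
         * MSR; save its value, then put the host back to 0 (the host does not
         * use IBRS). */
        if (!write_intercepted(MSR_IA32_SPEC_CTRL))
            guest_spec_ctrl = read_msr(MSR_IA32_SPEC_CTRL);
        if (guest_spec_ctrl)
            write_msr(MSR_IA32_SPEC_CTRL, 0);

        printf("saved guest SPEC_CTRL = %llu\n", (unsigned long long)guest_spec_ctrl);
        return 0;
    }

Keeping both MSR accesses conditional is the point of the design: guests that never touch SPEC_CTRL pay nothing extra on the entry/exit path, which is why the MSR is not simply added to an unconditional save/restore list.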
1823     diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
1824     index c829d89e2e63..bee4c49f6dd0 100644
1825     --- a/arch/x86/kvm/vmx.c
1826     +++ b/arch/x86/kvm/vmx.c
1827     @@ -34,6 +34,7 @@
1828     #include <linux/tboot.h>
1829     #include <linux/hrtimer.h>
1830     #include <linux/frame.h>
1831     +#include <linux/nospec.h>
1832     #include "kvm_cache_regs.h"
1833     #include "x86.h"
1834    
1835     @@ -111,6 +112,14 @@ static u64 __read_mostly host_xss;
1836     static bool __read_mostly enable_pml = 1;
1837     module_param_named(pml, enable_pml, bool, S_IRUGO);
1838    
1839     +#define MSR_TYPE_R 1
1840     +#define MSR_TYPE_W 2
1841     +#define MSR_TYPE_RW 3
1842     +
1843     +#define MSR_BITMAP_MODE_X2APIC 1
1844     +#define MSR_BITMAP_MODE_X2APIC_APICV 2
1845     +#define MSR_BITMAP_MODE_LM 4
1846     +
1847     #define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL
1848    
1849     /* Guest_tsc -> host_tsc conversion requires 64-bit division. */
1850     @@ -185,7 +194,6 @@ module_param(ple_window_max, int, S_IRUGO);
1851     extern const ulong vmx_return;
1852    
1853     #define NR_AUTOLOAD_MSRS 8
1854     -#define VMCS02_POOL_SIZE 1
1855    
1856     struct vmcs {
1857     u32 revision_id;
1858     @@ -210,6 +218,7 @@ struct loaded_vmcs {
1859     int soft_vnmi_blocked;
1860     ktime_t entry_time;
1861     s64 vnmi_blocked_time;
1862     + unsigned long *msr_bitmap;
1863     struct list_head loaded_vmcss_on_cpu_link;
1864     };
1865    
1866     @@ -226,7 +235,7 @@ struct shared_msr_entry {
1867     * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
1868     * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
1869     * More than one of these structures may exist, if L1 runs multiple L2 guests.
1870     - * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the
1871     + * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the
1872     * underlying hardware which will be used to run L2.
1873     * This structure is packed to ensure that its layout is identical across
1874     * machines (necessary for live migration).
1875     @@ -409,13 +418,6 @@ struct __packed vmcs12 {
1876     */
1877     #define VMCS12_SIZE 0x1000
1878    
1879     -/* Used to remember the last vmcs02 used for some recently used vmcs12s */
1880     -struct vmcs02_list {
1881     - struct list_head list;
1882     - gpa_t vmptr;
1883     - struct loaded_vmcs vmcs02;
1884     -};
1885     -
1886     /*
1887     * The nested_vmx structure is part of vcpu_vmx, and holds information we need
1888     * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
1889     @@ -440,15 +442,15 @@ struct nested_vmx {
1890     */
1891     bool sync_shadow_vmcs;
1892    
1893     - /* vmcs02_list cache of VMCSs recently used to run L2 guests */
1894     - struct list_head vmcs02_pool;
1895     - int vmcs02_num;
1896     bool change_vmcs01_virtual_x2apic_mode;
1897     /* L2 must run next, and mustn't decide to exit to L1. */
1898     bool nested_run_pending;
1899     +
1900     + struct loaded_vmcs vmcs02;
1901     +
1902     /*
1903     - * Guest pages referred to in vmcs02 with host-physical pointers, so
1904     - * we must keep them pinned while L2 runs.
1905     + * Guest pages referred to in the vmcs02 with host-physical
1906     + * pointers, so we must keep them pinned while L2 runs.
1907     */
1908     struct page *apic_access_page;
1909     struct page *virtual_apic_page;
1910     @@ -457,8 +459,6 @@ struct nested_vmx {
1911     bool pi_pending;
1912     u16 posted_intr_nv;
1913    
1914     - unsigned long *msr_bitmap;
1915     -
1916     struct hrtimer preemption_timer;
1917     bool preemption_timer_expired;
1918    
1919     @@ -581,6 +581,7 @@ struct vcpu_vmx {
1920     struct kvm_vcpu vcpu;
1921     unsigned long host_rsp;
1922     u8 fail;
1923     + u8 msr_bitmap_mode;
1924     u32 exit_intr_info;
1925     u32 idt_vectoring_info;
1926     ulong rflags;
1927     @@ -592,6 +593,10 @@ struct vcpu_vmx {
1928     u64 msr_host_kernel_gs_base;
1929     u64 msr_guest_kernel_gs_base;
1930     #endif
1931     +
1932     + u64 arch_capabilities;
1933     + u64 spec_ctrl;
1934     +
1935     u32 vm_entry_controls_shadow;
1936     u32 vm_exit_controls_shadow;
1937     u32 secondary_exec_control;
1938     @@ -898,21 +903,18 @@ static const unsigned short vmcs_field_to_offset_table[] = {
1939    
1940     static inline short vmcs_field_to_offset(unsigned long field)
1941     {
1942     - BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX);
1943     + const size_t size = ARRAY_SIZE(vmcs_field_to_offset_table);
1944     + unsigned short offset;
1945    
1946     - if (field >= ARRAY_SIZE(vmcs_field_to_offset_table))
1947     + BUILD_BUG_ON(size > SHRT_MAX);
1948     + if (field >= size)
1949     return -ENOENT;
1950    
1951     - /*
1952     - * FIXME: Mitigation for CVE-2017-5753. To be replaced with a
1953     - * generic mechanism.
1954     - */
1955     - asm("lfence");
1956     -
1957     - if (vmcs_field_to_offset_table[field] == 0)
1958     + field = array_index_nospec(field, size);
1959     + offset = vmcs_field_to_offset_table[field];
1960     + if (offset == 0)
1961     return -ENOENT;
1962     -
1963     - return vmcs_field_to_offset_table[field];
1964     + return offset;
1965     }
1966    
1967     static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
1968     @@ -935,6 +937,9 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
1969     static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
1970     static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
1971     u16 error_code);
1972     +static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
1973     +static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
1974     + u32 msr, int type);
1975    
1976     static DEFINE_PER_CPU(struct vmcs *, vmxarea);
1977     static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
1978     @@ -954,12 +959,6 @@ static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
1979     enum {
1980     VMX_IO_BITMAP_A,
1981     VMX_IO_BITMAP_B,
1982     - VMX_MSR_BITMAP_LEGACY,
1983     - VMX_MSR_BITMAP_LONGMODE,
1984     - VMX_MSR_BITMAP_LEGACY_X2APIC_APICV,
1985     - VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV,
1986     - VMX_MSR_BITMAP_LEGACY_X2APIC,
1987     - VMX_MSR_BITMAP_LONGMODE_X2APIC,
1988     VMX_VMREAD_BITMAP,
1989     VMX_VMWRITE_BITMAP,
1990     VMX_BITMAP_NR
1991     @@ -969,12 +968,6 @@ static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
1992    
1993     #define vmx_io_bitmap_a (vmx_bitmap[VMX_IO_BITMAP_A])
1994     #define vmx_io_bitmap_b (vmx_bitmap[VMX_IO_BITMAP_B])
1995     -#define vmx_msr_bitmap_legacy (vmx_bitmap[VMX_MSR_BITMAP_LEGACY])
1996     -#define vmx_msr_bitmap_longmode (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE])
1997     -#define vmx_msr_bitmap_legacy_x2apic_apicv (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC_APICV])
1998     -#define vmx_msr_bitmap_longmode_x2apic_apicv (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV])
1999     -#define vmx_msr_bitmap_legacy_x2apic (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC])
2000     -#define vmx_msr_bitmap_longmode_x2apic (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC])
2001     #define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP])
2002     #define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP])
2003    
2004     @@ -1918,6 +1911,52 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
2005     vmcs_write32(EXCEPTION_BITMAP, eb);
2006     }
2007    
2008     +/*
2009     + * Check if MSR is intercepted for currently loaded MSR bitmap.
2010     + */
2011     +static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
2012     +{
2013     + unsigned long *msr_bitmap;
2014     + int f = sizeof(unsigned long);
2015     +
2016     + if (!cpu_has_vmx_msr_bitmap())
2017     + return true;
2018     +
2019     + msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;
2020     +
2021     + if (msr <= 0x1fff) {
2022     + return !!test_bit(msr, msr_bitmap + 0x800 / f);
2023     + } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
2024     + msr &= 0x1fff;
2025     + return !!test_bit(msr, msr_bitmap + 0xc00 / f);
2026     + }
2027     +
2028     + return true;
2029     +}
2030     +
2031     +/*
2032     + * Check if MSR is intercepted for L01 MSR bitmap.
2033     + */
2034     +static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
2035     +{
2036     + unsigned long *msr_bitmap;
2037     + int f = sizeof(unsigned long);
2038     +
2039     + if (!cpu_has_vmx_msr_bitmap())
2040     + return true;
2041     +
2042     + msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
2043     +
2044     + if (msr <= 0x1fff) {
2045     + return !!test_bit(msr, msr_bitmap + 0x800 / f);
2046     + } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
2047     + msr &= 0x1fff;
2048     + return !!test_bit(msr, msr_bitmap + 0xc00 / f);
2049     + }
2050     +
2051     + return true;
2052     +}
2053     +
2054     static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
2055     unsigned long entry, unsigned long exit)
2056     {
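msr_write_intercepted() above probes the 4 KiB VMX MSR bitmap directly: write-intercept bits for MSRs 0x0-0x1fff live at byte offset 0x800 and those for 0xc0000000-0xc0001fff at 0xc00, with reads at 0x000 and 0x400. A self-contained model of the same index arithmetic; the layout constants mirror the patch, everything else is a user-space stand-in for test_bit()/__set_bit().

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define BITS_PER_LONG (8 * sizeof(unsigned long))

    static void set_bit_at(unsigned long *base, size_t byte_off, uint32_t nr)
    {
        base[byte_off / sizeof(unsigned long) + nr / BITS_PER_LONG] |=
            1UL << (nr % BITS_PER_LONG);
    }

    static bool test_bit_at(const unsigned long *base, size_t byte_off, uint32_t nr)
    {
        return (base[byte_off / sizeof(unsigned long) + nr / BITS_PER_LONG] >>
                (nr % BITS_PER_LONG)) & 1;
    }

    static bool write_intercepted(const unsigned long *msr_bitmap, uint32_t msr)
    {
        if (msr <= 0x1fff)
            return test_bit_at(msr_bitmap, 0x800, msr);          /* write-low */
        if (msr >= 0xc0000000 && msr <= 0xc0001fff)
            return test_bit_at(msr_bitmap, 0xc00, msr & 0x1fff); /* write-high */
        return true;                                             /* outside the bitmap */
    }

    int main(void)
    {
        unsigned long bitmap[4096 / sizeof(unsigned long)];

        memset(bitmap, 0, sizeof(bitmap));
        set_bit_at(bitmap, 0x800, 0x48);                    /* intercept writes to SPEC_CTRL */

        printf("%d %d\n", write_intercepted(bitmap, 0x48),  /* 1: intercepted */
                          write_intercepted(bitmap, 0x49)); /* 0: passed through */
        return 0;
    }

The same arithmetic, via __clear_bit()/__set_bit(), is what the vmx_disable_intercept_for_msr()/vmx_enable_intercept_for_msr() pair later in this patch uses when it edits the per-vCPU bitmap.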
2057     @@ -2296,6 +2335,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2058     if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
2059     per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
2060     vmcs_load(vmx->loaded_vmcs->vmcs);
2061     + indirect_branch_prediction_barrier();
2062     }
2063    
2064     if (!already_loaded) {
2065     @@ -2572,36 +2612,6 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
2066     vmx->guest_msrs[from] = tmp;
2067     }
2068    
2069     -static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
2070     -{
2071     - unsigned long *msr_bitmap;
2072     -
2073     - if (is_guest_mode(vcpu))
2074     - msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap;
2075     - else if (cpu_has_secondary_exec_ctrls() &&
2076     - (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
2077     - SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
2078     - if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) {
2079     - if (is_long_mode(vcpu))
2080     - msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv;
2081     - else
2082     - msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv;
2083     - } else {
2084     - if (is_long_mode(vcpu))
2085     - msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
2086     - else
2087     - msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
2088     - }
2089     - } else {
2090     - if (is_long_mode(vcpu))
2091     - msr_bitmap = vmx_msr_bitmap_longmode;
2092     - else
2093     - msr_bitmap = vmx_msr_bitmap_legacy;
2094     - }
2095     -
2096     - vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
2097     -}
2098     -
2099     /*
2100     * Set up the vmcs to automatically save and restore system
2101     * msrs. Don't touch the 64-bit msrs if the guest is in legacy
2102     @@ -2642,7 +2652,7 @@ static void setup_msrs(struct vcpu_vmx *vmx)
2103     vmx->save_nmsrs = save_nmsrs;
2104    
2105     if (cpu_has_vmx_msr_bitmap())
2106     - vmx_set_msr_bitmap(&vmx->vcpu);
2107     + vmx_update_msr_bitmap(&vmx->vcpu);
2108     }
2109    
2110     /*
2111     @@ -3276,6 +3286,20 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2112     case MSR_IA32_TSC:
2113     msr_info->data = guest_read_tsc(vcpu);
2114     break;
2115     + case MSR_IA32_SPEC_CTRL:
2116     + if (!msr_info->host_initiated &&
2117     + !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) &&
2118     + !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
2119     + return 1;
2120     +
2121     + msr_info->data = to_vmx(vcpu)->spec_ctrl;
2122     + break;
2123     + case MSR_IA32_ARCH_CAPABILITIES:
2124     + if (!msr_info->host_initiated &&
2125     + !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
2126     + return 1;
2127     + msr_info->data = to_vmx(vcpu)->arch_capabilities;
2128     + break;
2129     case MSR_IA32_SYSENTER_CS:
2130     msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
2131     break;
2132     @@ -3383,6 +3407,70 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2133     case MSR_IA32_TSC:
2134     kvm_write_tsc(vcpu, msr_info);
2135     break;
2136     + case MSR_IA32_SPEC_CTRL:
2137     + if (!msr_info->host_initiated &&
2138     + !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) &&
2139     + !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
2140     + return 1;
2141     +
2142     + /* The STIBP bit doesn't fault even if it's not advertised */
2143     + if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP))
2144     + return 1;
2145     +
2146     + vmx->spec_ctrl = data;
2147     +
2148     + if (!data)
2149     + break;
2150     +
2151     + /*
2152     + * For non-nested:
2153     + * When it's written (to non-zero) for the first time, pass
2154     + * it through.
2155     + *
2156     + * For nested:
2157     + * The handling of the MSR bitmap for L2 guests is done in
2158     + * nested_vmx_merge_msr_bitmap. We should not touch the
2159     + * vmcs02.msr_bitmap here since it gets completely overwritten
2160     + * in the merging. We update the vmcs01 here for L1 as well
2161     + * since it will end up touching the MSR anyway now.
2162     + */
2163     + vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap,
2164     + MSR_IA32_SPEC_CTRL,
2165     + MSR_TYPE_RW);
2166     + break;
2167     + case MSR_IA32_PRED_CMD:
2168     + if (!msr_info->host_initiated &&
2169     + !guest_cpuid_has(vcpu, X86_FEATURE_IBPB) &&
2170     + !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
2171     + return 1;
2172     +
2173     + if (data & ~PRED_CMD_IBPB)
2174     + return 1;
2175     +
2176     + if (!data)
2177     + break;
2178     +
2179     + wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
2180     +
2181     + /*
2182     + * For non-nested:
2183     + * When it's written (to non-zero) for the first time, pass
2184     + * it through.
2185     + *
2186     + * For nested:
2187     + * The handling of the MSR bitmap for L2 guests is done in
2188     + * nested_vmx_merge_msr_bitmap. We should not touch the
2189     + * vmcs02.msr_bitmap here since it gets completely overwritten
2190     + * in the merging.
2191     + */
2192     + vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
2193     + MSR_TYPE_W);
2194     + break;
2195     + case MSR_IA32_ARCH_CAPABILITIES:
2196     + if (!msr_info->host_initiated)
2197     + return 1;
2198     + vmx->arch_capabilities = data;
2199     + break;
2200     case MSR_IA32_CR_PAT:
2201     if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2202     if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
2203     @@ -3837,11 +3925,6 @@ static struct vmcs *alloc_vmcs_cpu(int cpu)
2204     return vmcs;
2205     }
2206    
2207     -static struct vmcs *alloc_vmcs(void)
2208     -{
2209     - return alloc_vmcs_cpu(raw_smp_processor_id());
2210     -}
2211     -
2212     static void free_vmcs(struct vmcs *vmcs)
2213     {
2214     free_pages((unsigned long)vmcs, vmcs_config.order);
2215     @@ -3857,9 +3940,38 @@ static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
2216     loaded_vmcs_clear(loaded_vmcs);
2217     free_vmcs(loaded_vmcs->vmcs);
2218     loaded_vmcs->vmcs = NULL;
2219     + if (loaded_vmcs->msr_bitmap)
2220     + free_page((unsigned long)loaded_vmcs->msr_bitmap);
2221     WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
2222     }
2223    
2224     +static struct vmcs *alloc_vmcs(void)
2225     +{
2226     + return alloc_vmcs_cpu(raw_smp_processor_id());
2227     +}
2228     +
2229     +static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
2230     +{
2231     + loaded_vmcs->vmcs = alloc_vmcs();
2232     + if (!loaded_vmcs->vmcs)
2233     + return -ENOMEM;
2234     +
2235     + loaded_vmcs->shadow_vmcs = NULL;
2236     + loaded_vmcs_init(loaded_vmcs);
2237     +
2238     + if (cpu_has_vmx_msr_bitmap()) {
2239     + loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
2240     + if (!loaded_vmcs->msr_bitmap)
2241     + goto out_vmcs;
2242     + memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
2243     + }
2244     + return 0;
2245     +
2246     +out_vmcs:
2247     + free_loaded_vmcs(loaded_vmcs);
2248     + return -ENOMEM;
2249     +}
2250     +
2251     static void free_kvm_area(void)
2252     {
2253     int cpu;
2254     @@ -4918,10 +5030,8 @@ static void free_vpid(int vpid)
2255     spin_unlock(&vmx_vpid_lock);
2256     }
2257    
2258     -#define MSR_TYPE_R 1
2259     -#define MSR_TYPE_W 2
2260     -static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
2261     - u32 msr, int type)
2262     +static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
2263     + u32 msr, int type)
2264     {
2265     int f = sizeof(unsigned long);
2266    
2267     @@ -4955,6 +5065,50 @@ static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
2268     }
2269     }
2270    
2271     +static void __always_inline vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
2272     + u32 msr, int type)
2273     +{
2274     + int f = sizeof(unsigned long);
2275     +
2276     + if (!cpu_has_vmx_msr_bitmap())
2277     + return;
2278     +
2279     + /*
2280     + * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
2281     + * have the write-low and read-high bitmap offsets the wrong way round.
2282     + * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
2283     + */
2284     + if (msr <= 0x1fff) {
2285     + if (type & MSR_TYPE_R)
2286     + /* read-low */
2287     + __set_bit(msr, msr_bitmap + 0x000 / f);
2288     +
2289     + if (type & MSR_TYPE_W)
2290     + /* write-low */
2291     + __set_bit(msr, msr_bitmap + 0x800 / f);
2292     +
2293     + } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
2294     + msr &= 0x1fff;
2295     + if (type & MSR_TYPE_R)
2296     + /* read-high */
2297     + __set_bit(msr, msr_bitmap + 0x400 / f);
2298     +
2299     + if (type & MSR_TYPE_W)
2300     + /* write-high */
2301     + __set_bit(msr, msr_bitmap + 0xc00 / f);
2302     +
2303     + }
2304     +}
2305     +
2306     +static void __always_inline vmx_set_intercept_for_msr(unsigned long *msr_bitmap,
2307     + u32 msr, int type, bool value)
2308     +{
2309     + if (value)
2310     + vmx_enable_intercept_for_msr(msr_bitmap, msr, type);
2311     + else
2312     + vmx_disable_intercept_for_msr(msr_bitmap, msr, type);
2313     +}
2314     +
2315     /*
2316     * If a msr is allowed by L0, we should check whether it is allowed by L1.
2317     * The corresponding bit will be cleared unless both of L0 and L1 allow it.
2318     @@ -5001,30 +5155,70 @@ static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
2319     }
2320     }
2321    
2322     -static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
2323     +static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
2324     {
2325     - if (!longmode_only)
2326     - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
2327     - msr, MSR_TYPE_R | MSR_TYPE_W);
2328     - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
2329     - msr, MSR_TYPE_R | MSR_TYPE_W);
2330     + u8 mode = 0;
2331     +
2332     + if (cpu_has_secondary_exec_ctrls() &&
2333     + (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
2334     + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
2335     + mode |= MSR_BITMAP_MODE_X2APIC;
2336     + if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
2337     + mode |= MSR_BITMAP_MODE_X2APIC_APICV;
2338     + }
2339     +
2340     + if (is_long_mode(vcpu))
2341     + mode |= MSR_BITMAP_MODE_LM;
2342     +
2343     + return mode;
2344     }
2345    
2346     -static void vmx_disable_intercept_msr_x2apic(u32 msr, int type, bool apicv_active)
2347     +#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
2348     +
2349     +static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap,
2350     + u8 mode)
2351     {
2352     - if (apicv_active) {
2353     - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv,
2354     - msr, type);
2355     - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv,
2356     - msr, type);
2357     - } else {
2358     - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
2359     - msr, type);
2360     - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
2361     - msr, type);
2362     + int msr;
2363     +
2364     + for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
2365     + unsigned word = msr / BITS_PER_LONG;
2366     + msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
2367     + msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
2368     + }
2369     +
2370     + if (mode & MSR_BITMAP_MODE_X2APIC) {
2371     + /*
2372     + * TPR reads and writes can be virtualized even if virtual interrupt
2373     + * delivery is not in use.
2374     + */
2375     + vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW);
2376     + if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
2377     + vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R);
2378     + vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
2379     + vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
2380     + }
2381     }
2382     }
2383    
2384     +static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
2385     +{
2386     + struct vcpu_vmx *vmx = to_vmx(vcpu);
2387     + unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
2388     + u8 mode = vmx_msr_bitmap_mode(vcpu);
2389     + u8 changed = mode ^ vmx->msr_bitmap_mode;
2390     +
2391     + if (!changed)
2392     + return;
2393     +
2394     + vmx_set_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW,
2395     + !(mode & MSR_BITMAP_MODE_LM));
2396     +
2397     + if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
2398     + vmx_update_msr_bitmap_x2apic(msr_bitmap, mode);
2399     +
2400     + vmx->msr_bitmap_mode = mode;
2401     +}
2402     +
2403     static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu)
2404     {
2405     return enable_apicv;
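vmx_update_msr_bitmap() above condenses the bitmap-relevant vCPU state into a small mode mask (x2APIC, APICv, long mode), XORs it against the cached msr_bitmap_mode and rewrites only what changed. A condensed sketch of that change-detection pattern; the flag values and the puts()/printf() placeholders stand in for the real bitmap edits.

    #include <stdint.h>
    #include <stdio.h>

    #define MODE_X2APIC        (1u << 0)
    #define MODE_X2APIC_APICV  (1u << 1)
    #define MODE_LM            (1u << 2)

    struct vcpu_state {
        uint8_t msr_bitmap_mode;    /* cached mode from the previous update */
    };

    static uint8_t compute_mode(int x2apic, int apicv, int long_mode)
    {
        uint8_t mode = 0;

        if (x2apic) {
            mode |= MODE_X2APIC;
            if (apicv)
                mode |= MODE_X2APIC_APICV;
        }
        if (long_mode)
            mode |= MODE_LM;
        return mode;
    }

    static void update_msr_bitmap(struct vcpu_state *v, int x2apic, int apicv, int long_mode)
    {
        uint8_t mode = compute_mode(x2apic, apicv, long_mode);
        uint8_t changed = mode ^ v->msr_bitmap_mode;

        if (!changed)
            return;                                /* nothing to rewrite */

        /* KERNEL_GS_BASE interception tracks the long-mode bit of the new mode. */
        printf("KERNEL_GS_BASE intercept: %s\n", (mode & MODE_LM) ? "off" : "on");

        if (changed & (MODE_X2APIC | MODE_X2APIC_APICV))
            puts("rewrite x2APIC MSR range 0x800-0x8ff");

        v->msr_bitmap_mode = mode;
    }

    int main(void)
    {
        struct vcpu_state v = { 0 };

        update_msr_bitmap(&v, 1, 1, 1);   /* first call: both updates fire */
        update_msr_bitmap(&v, 1, 1, 1);   /* unchanged mode: nothing is rewritten */
        return 0;
    }

Compared with the removed vmx_set_msr_bitmap(), which selected one of six shared global bitmaps, each vCPU now owns its bitmap and only pays for an update when its own mode actually changes.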
2406     @@ -5274,7 +5468,7 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
2407     }
2408    
2409     if (cpu_has_vmx_msr_bitmap())
2410     - vmx_set_msr_bitmap(vcpu);
2411     + vmx_update_msr_bitmap(vcpu);
2412     }
2413    
2414     static u32 vmx_exec_control(struct vcpu_vmx *vmx)
2415     @@ -5461,7 +5655,7 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
2416     vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
2417     }
2418     if (cpu_has_vmx_msr_bitmap())
2419     - vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy));
2420     + vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
2421    
2422     vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
2423    
2424     @@ -5539,6 +5733,8 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
2425     ++vmx->nmsrs;
2426     }
2427    
2428     + if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
2429     + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, vmx->arch_capabilities);
2430    
2431     vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);
2432    
2433     @@ -5567,6 +5763,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
2434     u64 cr0;
2435    
2436     vmx->rmode.vm86_active = 0;
2437     + vmx->spec_ctrl = 0;
2438    
2439     vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
2440     kvm_set_cr8(vcpu, 0);
2441     @@ -6744,7 +6941,7 @@ void vmx_enable_tdp(void)
2442    
2443     static __init int hardware_setup(void)
2444     {
2445     - int r = -ENOMEM, i, msr;
2446     + int r = -ENOMEM, i;
2447    
2448     rdmsrl_safe(MSR_EFER, &host_efer);
2449    
2450     @@ -6764,9 +6961,6 @@ static __init int hardware_setup(void)
2451    
2452     memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
2453    
2454     - memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
2455     - memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
2456     -
2457     if (setup_vmcs_config(&vmcs_config) < 0) {
2458     r = -EIO;
2459     goto out;
2460     @@ -6835,42 +7029,8 @@ static __init int hardware_setup(void)
2461     kvm_tsc_scaling_ratio_frac_bits = 48;
2462     }
2463    
2464     - vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
2465     - vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
2466     - vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
2467     - vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
2468     - vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
2469     - vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
2470     -
2471     - memcpy(vmx_msr_bitmap_legacy_x2apic_apicv,
2472     - vmx_msr_bitmap_legacy, PAGE_SIZE);
2473     - memcpy(vmx_msr_bitmap_longmode_x2apic_apicv,
2474     - vmx_msr_bitmap_longmode, PAGE_SIZE);
2475     - memcpy(vmx_msr_bitmap_legacy_x2apic,
2476     - vmx_msr_bitmap_legacy, PAGE_SIZE);
2477     - memcpy(vmx_msr_bitmap_longmode_x2apic,
2478     - vmx_msr_bitmap_longmode, PAGE_SIZE);
2479     -
2480     set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
2481    
2482     - for (msr = 0x800; msr <= 0x8ff; msr++) {
2483     - if (msr == 0x839 /* TMCCT */)
2484     - continue;
2485     - vmx_disable_intercept_msr_x2apic(msr, MSR_TYPE_R, true);
2486     - }
2487     -
2488     - /*
2489     - * TPR reads and writes can be virtualized even if virtual interrupt
2490     - * delivery is not in use.
2491     - */
2492     - vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_W, true);
2493     - vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_R | MSR_TYPE_W, false);
2494     -
2495     - /* EOI */
2496     - vmx_disable_intercept_msr_x2apic(0x80b, MSR_TYPE_W, true);
2497     - /* SELF-IPI */
2498     - vmx_disable_intercept_msr_x2apic(0x83f, MSR_TYPE_W, true);
2499     -
2500     if (enable_ept)
2501     vmx_enable_tdp();
2502     else
2503     @@ -6973,94 +7133,6 @@ static int handle_monitor(struct kvm_vcpu *vcpu)
2504     return handle_nop(vcpu);
2505     }
2506    
2507     -/*
2508     - * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.
2509     - * We could reuse a single VMCS for all the L2 guests, but we also want the
2510     - * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this
2511     - * allows keeping them loaded on the processor, and in the future will allow
2512     - * optimizations where prepare_vmcs02 doesn't need to set all the fields on
2513     - * every entry if they never change.
2514     - * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE
2515     - * (>=0) with a vmcs02 for each recently loaded vmcs12s, most recent first.
2516     - *
2517     - * The following functions allocate and free a vmcs02 in this pool.
2518     - */
2519     -
2520     -/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */
2521     -static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
2522     -{
2523     - struct vmcs02_list *item;
2524     - list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
2525     - if (item->vmptr == vmx->nested.current_vmptr) {
2526     - list_move(&item->list, &vmx->nested.vmcs02_pool);
2527     - return &item->vmcs02;
2528     - }
2529     -
2530     - if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
2531     - /* Recycle the least recently used VMCS. */
2532     - item = list_last_entry(&vmx->nested.vmcs02_pool,
2533     - struct vmcs02_list, list);
2534     - item->vmptr = vmx->nested.current_vmptr;
2535     - list_move(&item->list, &vmx->nested.vmcs02_pool);
2536     - return &item->vmcs02;
2537     - }
2538     -
2539     - /* Create a new VMCS */
2540     - item = kzalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
2541     - if (!item)
2542     - return NULL;
2543     - item->vmcs02.vmcs = alloc_vmcs();
2544     - item->vmcs02.shadow_vmcs = NULL;
2545     - if (!item->vmcs02.vmcs) {
2546     - kfree(item);
2547     - return NULL;
2548     - }
2549     - loaded_vmcs_init(&item->vmcs02);
2550     - item->vmptr = vmx->nested.current_vmptr;
2551     - list_add(&(item->list), &(vmx->nested.vmcs02_pool));
2552     - vmx->nested.vmcs02_num++;
2553     - return &item->vmcs02;
2554     -}
2555     -
2556     -/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */
2557     -static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
2558     -{
2559     - struct vmcs02_list *item;
2560     - list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
2561     - if (item->vmptr == vmptr) {
2562     - free_loaded_vmcs(&item->vmcs02);
2563     - list_del(&item->list);
2564     - kfree(item);
2565     - vmx->nested.vmcs02_num--;
2566     - return;
2567     - }
2568     -}
2569     -
2570     -/*
2571     - * Free all VMCSs saved for this vcpu, except the one pointed by
2572     - * vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs
2573     - * must be &vmx->vmcs01.
2574     - */
2575     -static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
2576     -{
2577     - struct vmcs02_list *item, *n;
2578     -
2579     - WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01);
2580     - list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) {
2581     - /*
2582     - * Something will leak if the above WARN triggers. Better than
2583     - * a use-after-free.
2584     - */
2585     - if (vmx->loaded_vmcs == &item->vmcs02)
2586     - continue;
2587     -
2588     - free_loaded_vmcs(&item->vmcs02);
2589     - list_del(&item->list);
2590     - kfree(item);
2591     - vmx->nested.vmcs02_num--;
2592     - }
2593     -}
2594     -
2595     /*
2596     * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
2597     * set the success or error code of an emulated VMX instruction, as specified
2598     @@ -7241,13 +7313,11 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
2599     {
2600     struct vcpu_vmx *vmx = to_vmx(vcpu);
2601     struct vmcs *shadow_vmcs;
2602     + int r;
2603    
2604     - if (cpu_has_vmx_msr_bitmap()) {
2605     - vmx->nested.msr_bitmap =
2606     - (unsigned long *)__get_free_page(GFP_KERNEL);
2607     - if (!vmx->nested.msr_bitmap)
2608     - goto out_msr_bitmap;
2609     - }
2610     + r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
2611     + if (r < 0)
2612     + goto out_vmcs02;
2613    
2614     vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
2615     if (!vmx->nested.cached_vmcs12)
2616     @@ -7264,9 +7334,6 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
2617     vmx->vmcs01.shadow_vmcs = shadow_vmcs;
2618     }
2619    
2620     - INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
2621     - vmx->nested.vmcs02_num = 0;
2622     -
2623     hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
2624     HRTIMER_MODE_REL_PINNED);
2625     vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
2626     @@ -7278,9 +7345,9 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
2627     kfree(vmx->nested.cached_vmcs12);
2628    
2629     out_cached_vmcs12:
2630     - free_page((unsigned long)vmx->nested.msr_bitmap);
2631     + free_loaded_vmcs(&vmx->nested.vmcs02);
2632    
2633     -out_msr_bitmap:
2634     +out_vmcs02:
2635     return -ENOMEM;
2636     }
2637    
2638     @@ -7423,10 +7490,6 @@ static void free_nested(struct vcpu_vmx *vmx)
2639     free_vpid(vmx->nested.vpid02);
2640     vmx->nested.posted_intr_nv = -1;
2641     vmx->nested.current_vmptr = -1ull;
2642     - if (vmx->nested.msr_bitmap) {
2643     - free_page((unsigned long)vmx->nested.msr_bitmap);
2644     - vmx->nested.msr_bitmap = NULL;
2645     - }
2646     if (enable_shadow_vmcs) {
2647     vmx_disable_shadow_vmcs(vmx);
2648     vmcs_clear(vmx->vmcs01.shadow_vmcs);
2649     @@ -7434,7 +7497,7 @@ static void free_nested(struct vcpu_vmx *vmx)
2650     vmx->vmcs01.shadow_vmcs = NULL;
2651     }
2652     kfree(vmx->nested.cached_vmcs12);
2653     - /* Unpin physical memory we referred to in current vmcs02 */
2654     + /* Unpin physical memory we referred to in the vmcs02 */
2655     if (vmx->nested.apic_access_page) {
2656     kvm_release_page_dirty(vmx->nested.apic_access_page);
2657     vmx->nested.apic_access_page = NULL;
2658     @@ -7450,7 +7513,7 @@ static void free_nested(struct vcpu_vmx *vmx)
2659     vmx->nested.pi_desc = NULL;
2660     }
2661    
2662     - nested_free_all_saved_vmcss(vmx);
2663     + free_loaded_vmcs(&vmx->nested.vmcs02);
2664     }
2665    
2666     /* Emulate the VMXOFF instruction */
2667     @@ -7493,8 +7556,6 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
2668     vmptr + offsetof(struct vmcs12, launch_state),
2669     &zero, sizeof(zero));
2670    
2671     - nested_free_vmcs02(vmx, vmptr);
2672     -
2673     nested_vmx_succeed(vcpu);
2674     return kvm_skip_emulated_instruction(vcpu);
2675     }
2676     @@ -8406,10 +8467,11 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
2677    
2678     /*
2679     * The host physical addresses of some pages of guest memory
2680     - * are loaded into VMCS02 (e.g. L1's Virtual APIC Page). The CPU
2681     - * may write to these pages via their host physical address while
2682     - * L2 is running, bypassing any address-translation-based dirty
2683     - * tracking (e.g. EPT write protection).
2684     + * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
2685     + * Page). The CPU may write to these pages via their host
2686     + * physical address while L2 is running, bypassing any
2687     + * address-translation-based dirty tracking (e.g. EPT write
2688     + * protection).
2689     *
2690     * Mark them dirty on every exit from L2 to prevent them from
2691     * getting out of sync with dirty tracking.
2692     @@ -8943,7 +9005,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
2693     }
2694     vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
2695    
2696     - vmx_set_msr_bitmap(vcpu);
2697     + vmx_update_msr_bitmap(vcpu);
2698     }
2699    
2700     static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
2701     @@ -9129,14 +9191,14 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
2702     #endif
2703     "pushf\n\t"
2704     __ASM_SIZE(push) " $%c[cs]\n\t"
2705     - "call *%[entry]\n\t"
2706     + CALL_NOSPEC
2707     :
2708     #ifdef CONFIG_X86_64
2709     [sp]"=&r"(tmp),
2710     #endif
2711     ASM_CALL_CONSTRAINT
2712     :
2713     - [entry]"r"(entry),
2714     + THUNK_TARGET(entry),
2715     [ss]"i"(__KERNEL_DS),
2716     [cs]"i"(__KERNEL_CS)
2717     );
2718     @@ -9373,6 +9435,15 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
2719    
2720     vmx_arm_hv_timer(vcpu);
2721    
2722     + /*
2723     + * If this vCPU has touched SPEC_CTRL, restore the guest's value if
2724     + * it's non-zero. Since vmentry is serialising on affected CPUs, there
2725     + * is no need to worry about the conditional branch over the wrmsr
2726     + * being speculatively taken.
2727     + */
2728     + if (vmx->spec_ctrl)
2729     + wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
2730     +
2731     vmx->__launched = vmx->loaded_vmcs->launched;
2732     asm(
2733     /* Store host registers */
2734     @@ -9491,6 +9562,27 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
2735     #endif
2736     );
2737    
2738     + /*
2739     + * We do not use IBRS in the kernel. If this vCPU has used the
2740     + * SPEC_CTRL MSR it may have left it on; save the value and
2741     + * turn it off. This is much more efficient than blindly adding
2742     + * it to the atomic save/restore list. Especially as the former
2743     + * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
2744     + *
2745     + * For non-nested case:
2746     + * If the L01 MSR bitmap does not intercept the MSR, then we need to
2747     + * save it.
2748     + *
2749     + * For nested case:
2750     + * If the L02 MSR bitmap does not intercept the MSR, then we need to
2751     + * save it.
2752     + */
2753     + if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))
2754     + rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
2755     +
2756     + if (vmx->spec_ctrl)
2757     + wrmsrl(MSR_IA32_SPEC_CTRL, 0);
2758     +
2759     /* Eliminate branch target predictions from guest mode */
2760     vmexit_fill_RSB();
2761    
2762     @@ -9604,6 +9696,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
2763     {
2764     int err;
2765     struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
2766     + unsigned long *msr_bitmap;
2767     int cpu;
2768    
2769     if (!vmx)
2770     @@ -9636,13 +9729,20 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
2771     if (!vmx->guest_msrs)
2772     goto free_pml;
2773    
2774     - vmx->loaded_vmcs = &vmx->vmcs01;
2775     - vmx->loaded_vmcs->vmcs = alloc_vmcs();
2776     - vmx->loaded_vmcs->shadow_vmcs = NULL;
2777     - if (!vmx->loaded_vmcs->vmcs)
2778     + err = alloc_loaded_vmcs(&vmx->vmcs01);
2779     + if (err < 0)
2780     goto free_msrs;
2781     - loaded_vmcs_init(vmx->loaded_vmcs);
2782    
2783     + msr_bitmap = vmx->vmcs01.msr_bitmap;
2784     + vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW);
2785     + vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW);
2786     + vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
2787     + vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
2788     + vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
2789     + vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
2790     + vmx->msr_bitmap_mode = 0;
2791     +
2792     + vmx->loaded_vmcs = &vmx->vmcs01;
2793     cpu = get_cpu();
2794     vmx_vcpu_load(&vmx->vcpu, cpu);
2795     vmx->vcpu.cpu = cpu;
2796     @@ -10105,10 +10205,25 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
2797     int msr;
2798     struct page *page;
2799     unsigned long *msr_bitmap_l1;
2800     - unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap;
2801     + unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
2802     + /*
2803     + * pred_cmd & spec_ctrl are trying to verify two things:
2804     + *
2805     + * 1. L0 gave a permission to L1 to actually passthrough the MSR. This
2806     + * ensures that we do not accidentally generate an L02 MSR bitmap
2807     + * from the L12 MSR bitmap that is too permissive.
2808     + * 2. That L1 or L2s have actually used the MSR. This avoids
2809     + * unnecessarily merging of the bitmap if the MSR is unused. This
2810     + * works properly because we only update the L01 MSR bitmap lazily.
2811     + * So even if L0 should pass L1 these MSRs, the L01 bitmap is only
2812     + * updated to reflect this when L1 (or its L2s) actually write to
2813     + * the MSR.
2814     + */
2815     + bool pred_cmd = msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD);
2816     + bool spec_ctrl = msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL);
2817    
2818     - /* This shortcut is ok because we support only x2APIC MSRs so far. */
2819     - if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
2820     + if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
2821     + !pred_cmd && !spec_ctrl)
2822     return false;
2823    
2824     page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap);
2825     @@ -10141,6 +10256,19 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
2826     MSR_TYPE_W);
2827     }
2828     }
2829     +
2830     + if (spec_ctrl)
2831     + nested_vmx_disable_intercept_for_msr(
2832     + msr_bitmap_l1, msr_bitmap_l0,
2833     + MSR_IA32_SPEC_CTRL,
2834     + MSR_TYPE_R | MSR_TYPE_W);
2835     +
2836     + if (pred_cmd)
2837     + nested_vmx_disable_intercept_for_msr(
2838     + msr_bitmap_l1, msr_bitmap_l0,
2839     + MSR_IA32_PRED_CMD,
2840     + MSR_TYPE_W);
2841     +
2842     kunmap(page);
2843     kvm_release_page_clean(page);
2844    
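The merge rule in the hunk above only lets the merged (L0-for-L2) bitmap pass an MSR through when L1's own bitmap also passes it through, so L1 never loses an exit it asked for. A minimal userspace model of that rule, assuming a toy one-bit-per-MSR layout rather than the real 4 KiB read/write bitmap pages:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Toy bitmap: one bit per MSR, 1 = writes intercepted, 0 = passed through. */
    typedef uint64_t demo_bitmap;

    static bool demo_intercepted(demo_bitmap b, unsigned int msr)
    {
            return (b >> msr) & 1;
    }

    /*
     * Modelled on nested_vmx_disable_intercept_for_msr(): clear the intercept
     * bit in the merged bitmap only if L1 does not intercept the MSR either.
     */
    static void demo_merge(demo_bitmap l1, demo_bitmap *l02, unsigned int msr)
    {
            if (!demo_intercepted(l1, msr))
                    *l02 &= ~(1ULL << msr);
    }

    int main(void)
    {
            demo_bitmap l1  = 1ULL << 3;    /* L1 intercepts MSR 3 only */
            demo_bitmap l02 = ~0ULL;        /* start with everything intercepted */

            demo_merge(l1, &l02, 3);        /* stays intercepted: L1 wants the exit */
            demo_merge(l1, &l02, 5);        /* now passed straight through to L2 */

            printf("MSR 3: %d, MSR 5: %d\n",
                   demo_intercepted(l02, 3), demo_intercepted(l02, 5));
            return 0;
    }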
2845     @@ -10682,6 +10810,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
2846     if (kvm_has_tsc_control)
2847     decache_tsc_multiplier(vmx);
2848    
2849     + if (cpu_has_vmx_msr_bitmap())
2850     + vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
2851     +
2852     if (enable_vpid) {
2853     /*
2854     * There is no direct mapping between vpid02 and vpid12, the
2855     @@ -10903,20 +11034,15 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
2856     {
2857     struct vcpu_vmx *vmx = to_vmx(vcpu);
2858     struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2859     - struct loaded_vmcs *vmcs02;
2860     u32 msr_entry_idx;
2861     u32 exit_qual;
2862    
2863     - vmcs02 = nested_get_current_vmcs02(vmx);
2864     - if (!vmcs02)
2865     - return -ENOMEM;
2866     -
2867     enter_guest_mode(vcpu);
2868    
2869     if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
2870     vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
2871    
2872     - vmx_switch_vmcs(vcpu, vmcs02);
2873     + vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
2874     vmx_segment_cache_clear(vmx);
2875    
2876     if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) {
2877     @@ -11485,7 +11611,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
2878     vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
2879    
2880     if (cpu_has_vmx_msr_bitmap())
2881     - vmx_set_msr_bitmap(vcpu);
2882     + vmx_update_msr_bitmap(vcpu);
2883    
2884     if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
2885     vmcs12->vm_exit_msr_load_count))
2886     @@ -11534,10 +11660,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
2887     vm_exit_controls_reset_shadow(vmx);
2888     vmx_segment_cache_clear(vmx);
2889    
2890     - /* if no vmcs02 cache requested, remove the one we used */
2891     - if (VMCS02_POOL_SIZE == 0)
2892     - nested_free_vmcs02(vmx, vmx->nested.current_vmptr);
2893     -
2894     /* Update any VMCS fields that might have changed while L2 ran */
2895     vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
2896     vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
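The two SPEC_CTRL hunks above implement a lazy save/restore: write the guest's value only when it is non-zero before vmentry, and read it back after vmexit only when the MSR is passed through to the guest. A minimal userspace model of that control flow, with the MSR accessors and vCPU state replaced by stand-ins (fake_rdmsr(), fake_wrmsr() and struct demo_vcpu are illustrative names, not KVM's):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Stand-in for the physical MSR and its accessors. */
    static uint64_t host_spec_ctrl;
    static void     fake_wrmsr(uint64_t v) { host_spec_ctrl = v; }
    static uint64_t fake_rdmsr(void)       { return host_spec_ctrl; }

    struct demo_vcpu {
            uint64_t spec_ctrl;       /* last value the guest wrote */
            bool     msr_passed_thru; /* bitmap says writes are not intercepted */
    };

    static void demo_vcpu_run(struct demo_vcpu *v)
    {
            /* Before entry: only touch the MSR if the guest wants a non-zero value. */
            if (v->spec_ctrl)
                    fake_wrmsr(v->spec_ctrl);

            /* ... guest runs here and may write SPEC_CTRL directly ... */

            /*
             * After exit: if writes were not intercepted the guest may have
             * changed the MSR, so read it back; then force it to 0 for the host.
             */
            if (v->msr_passed_thru)
                    v->spec_ctrl = fake_rdmsr();
            if (v->spec_ctrl)
                    fake_wrmsr(0);
    }

    int main(void)
    {
            struct demo_vcpu v = { .spec_ctrl = 0, .msr_passed_thru = true };

            host_spec_ctrl = 0x1;   /* pretend the guest set IBRS while it ran */
            demo_vcpu_run(&v);
            printf("saved guest value: %#llx, host MSR now: %#llx\n",
                   (unsigned long long)v.spec_ctrl,
                   (unsigned long long)host_spec_ctrl);
            return 0;
    }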
2897     diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
2898     index c53298dfbf50..ac381437c291 100644
2899     --- a/arch/x86/kvm/x86.c
2900     +++ b/arch/x86/kvm/x86.c
2901     @@ -1009,6 +1009,7 @@ static u32 msrs_to_save[] = {
2902     #endif
2903     MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
2904     MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
2905     + MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES
2906     };
2907    
2908     static unsigned num_msrs_to_save;
2909     diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
2910     index f23934bbaf4e..69a473919260 100644
2911     --- a/arch/x86/lib/Makefile
2912     +++ b/arch/x86/lib/Makefile
2913     @@ -27,6 +27,7 @@ lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
2914     lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o insn-eval.o
2915     lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
2916     lib-$(CONFIG_RETPOLINE) += retpoline.o
2917     +OBJECT_FILES_NON_STANDARD_retpoline.o :=y
2918    
2919     obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o
2920    
2921     diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S
2922     index c97d935a29e8..49b167f73215 100644
2923     --- a/arch/x86/lib/getuser.S
2924     +++ b/arch/x86/lib/getuser.S
2925     @@ -40,6 +40,8 @@ ENTRY(__get_user_1)
2926     mov PER_CPU_VAR(current_task), %_ASM_DX
2927     cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
2928     jae bad_get_user
2929     + sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
2930     + and %_ASM_DX, %_ASM_AX
2931     ASM_STAC
2932     1: movzbl (%_ASM_AX),%edx
2933     xor %eax,%eax
2934     @@ -54,6 +56,8 @@ ENTRY(__get_user_2)
2935     mov PER_CPU_VAR(current_task), %_ASM_DX
2936     cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
2937     jae bad_get_user
2938     + sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
2939     + and %_ASM_DX, %_ASM_AX
2940     ASM_STAC
2941     2: movzwl -1(%_ASM_AX),%edx
2942     xor %eax,%eax
2943     @@ -68,6 +72,8 @@ ENTRY(__get_user_4)
2944     mov PER_CPU_VAR(current_task), %_ASM_DX
2945     cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
2946     jae bad_get_user
2947     + sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
2948     + and %_ASM_DX, %_ASM_AX
2949     ASM_STAC
2950     3: movl -3(%_ASM_AX),%edx
2951     xor %eax,%eax
2952     @@ -83,6 +89,8 @@ ENTRY(__get_user_8)
2953     mov PER_CPU_VAR(current_task), %_ASM_DX
2954     cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
2955     jae bad_get_user
2956     + sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
2957     + and %_ASM_DX, %_ASM_AX
2958     ASM_STAC
2959     4: movq -7(%_ASM_AX),%rdx
2960     xor %eax,%eax
2961     @@ -94,6 +102,8 @@ ENTRY(__get_user_8)
2962     mov PER_CPU_VAR(current_task), %_ASM_DX
2963     cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
2964     jae bad_get_user_8
2965     + sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
2966     + and %_ASM_DX, %_ASM_AX
2967     ASM_STAC
2968     4: movl -7(%_ASM_AX),%edx
2969     5: movl -3(%_ASM_AX),%ecx
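The sbb/and pairs added to __get_user_N apply array_index_mask_nospec() to the user pointer: the carry flag from the cmp is turned into an all-ones or all-zeroes mask, so a speculatively executed load after a mispredicted jae can only ever touch address zero. A rough C equivalent of that masking (64-bit build assumed; the asm derives the mask branchlessly from the carry flag via sbb, a compiler is merely free to do the same here):

    #include <stdint.h>
    #include <stdio.h>

    /* Return addr unchanged when it is below limit, 0 otherwise, via a mask. */
    static uintptr_t mask_user_address(uintptr_t addr, uintptr_t limit)
    {
            uintptr_t mask = 0UL - (uintptr_t)(addr < limit); /* ~0 if valid, 0 if not */

            return addr & mask;
    }

    int main(void)
    {
            uintptr_t limit = 0x7ffffffff000UL;   /* illustrative TASK_addr_limit */

            printf("%#lx\n", (unsigned long)mask_user_address(0x1000UL, limit));
            printf("%#lx\n", (unsigned long)mask_user_address(0xffff888000000000UL, limit));
            return 0;
    }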
2970     diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
2971     index c909961e678a..480edc3a5e03 100644
2972     --- a/arch/x86/lib/retpoline.S
2973     +++ b/arch/x86/lib/retpoline.S
2974     @@ -7,6 +7,7 @@
2975     #include <asm/alternative-asm.h>
2976     #include <asm/export.h>
2977     #include <asm/nospec-branch.h>
2978     +#include <asm/bitsperlong.h>
2979    
2980     .macro THUNK reg
2981     .section .text.__x86.indirect_thunk
2982     @@ -46,3 +47,58 @@ GENERATE_THUNK(r13)
2983     GENERATE_THUNK(r14)
2984     GENERATE_THUNK(r15)
2985     #endif
2986     +
2987     +/*
2988     + * Fill the CPU return stack buffer.
2989     + *
2990     + * Each entry in the RSB, if used for a speculative 'ret', contains an
2991     + * infinite 'pause; lfence; jmp' loop to capture speculative execution.
2992     + *
2993     + * This is required in various cases for retpoline and IBRS-based
2994     + * mitigations for the Spectre variant 2 vulnerability. Sometimes to
2995     + * eliminate potentially bogus entries from the RSB, and sometimes
2996     + * purely to ensure that it doesn't get empty, which on some CPUs would
2997     + * allow predictions from other (unwanted!) sources to be used.
2998     + *
2999     + * Google experimented with loop-unrolling and this turned out to be
3000     + * the optimal version - two calls, each with their own speculation
3001     + * trap should their return address end up getting used, in a loop.
3002     + */
3003     +.macro STUFF_RSB nr:req sp:req
3004     + mov $(\nr / 2), %_ASM_BX
3005     + .align 16
3006     +771:
3007     + call 772f
3008     +773: /* speculation trap */
3009     + pause
3010     + lfence
3011     + jmp 773b
3012     + .align 16
3013     +772:
3014     + call 774f
3015     +775: /* speculation trap */
3016     + pause
3017     + lfence
3018     + jmp 775b
3019     + .align 16
3020     +774:
3021     + dec %_ASM_BX
3022     + jnz 771b
3023     + add $((BITS_PER_LONG/8) * \nr), \sp
3024     +.endm
3025     +
3026     +#define RSB_FILL_LOOPS 16 /* To avoid underflow */
3027     +
3028     +ENTRY(__fill_rsb)
3029     + STUFF_RSB RSB_FILL_LOOPS, %_ASM_SP
3030     + ret
3031     +END(__fill_rsb)
3032     +EXPORT_SYMBOL_GPL(__fill_rsb)
3033     +
3034     +#define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */
3035     +
3036     +ENTRY(__clear_rsb)
3037     + STUFF_RSB RSB_CLEAR_LOOPS, %_ASM_SP
3038     + ret
3039     +END(__clear_rsb)
3040     +EXPORT_SYMBOL_GPL(__clear_rsb)
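For reference, the macro's bookkeeping: each of the nr/2 loop iterations executes two calls, so nr return addresses (and hence nr RSB entries) are created and nr words of stack are consumed, which the final add reclaims in one step. A trivial check of that arithmetic for the two users above:

    #include <stdio.h>

    #define BITS_PER_LONG 64   /* x86-64 */

    /* Mirror of STUFF_RSB's accounting: nr/2 iterations, two calls each. */
    static void stuff_rsb_accounting(unsigned int nr)
    {
            unsigned int  loops     = nr / 2;
            unsigned int  entries   = loops * 2;
            unsigned long sp_adjust = (BITS_PER_LONG / 8) * (unsigned long)nr;

            printf("nr=%u: %u iterations, %u RSB entries, add $%lu to sp\n",
                   nr, loops, entries, sp_adjust);
    }

    int main(void)
    {
            stuff_rsb_accounting(16);   /* RSB_FILL_LOOPS  -> add $128 */
            stuff_rsb_accounting(32);   /* RSB_CLEAR_LOOPS -> add $256 */
            return 0;
    }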
3041     diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
3042     index 1b377f734e64..7add8ba06887 100644
3043     --- a/arch/x86/lib/usercopy_32.c
3044     +++ b/arch/x86/lib/usercopy_32.c
3045     @@ -331,12 +331,12 @@ do { \
3046    
3047     unsigned long __copy_user_ll(void *to, const void *from, unsigned long n)
3048     {
3049     - stac();
3050     + __uaccess_begin_nospec();
3051     if (movsl_is_ok(to, from, n))
3052     __copy_user(to, from, n);
3053     else
3054     n = __copy_user_intel(to, from, n);
3055     - clac();
3056     + __uaccess_end();
3057     return n;
3058     }
3059     EXPORT_SYMBOL(__copy_user_ll);
3060     @@ -344,7 +344,7 @@ EXPORT_SYMBOL(__copy_user_ll);
3061     unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from,
3062     unsigned long n)
3063     {
3064     - stac();
3065     + __uaccess_begin_nospec();
3066     #ifdef CONFIG_X86_INTEL_USERCOPY
3067     if (n > 64 && static_cpu_has(X86_FEATURE_XMM2))
3068     n = __copy_user_intel_nocache(to, from, n);
3069     @@ -353,7 +353,7 @@ unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *fr
3070     #else
3071     __copy_user(to, from, n);
3072     #endif
3073     - clac();
3074     + __uaccess_end();
3075     return n;
3076     }
3077     EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero);
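__uaccess_begin_nospec() pairs the usual STAC with a speculation barrier before the user copy starts, while __uaccess_end() is the existing CLAC. The definitions live in the x86 uaccess headers earlier in this series; from memory they are approximately the following, so treat the exact form as an assumption rather than a quotation:

    /* Sketch of the kernel-internal helpers, not standalone code. */
    #define __uaccess_begin_nospec()        \
    ({                                      \
            stac();                         \
            barrier_nospec();               \
    })

    #define __uaccess_end()         clac()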
3078     diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
3079     index 5bfe61a5e8e3..012d02624848 100644
3080     --- a/arch/x86/mm/tlb.c
3081     +++ b/arch/x86/mm/tlb.c
3082     @@ -6,13 +6,14 @@
3083     #include <linux/interrupt.h>
3084     #include <linux/export.h>
3085     #include <linux/cpu.h>
3086     +#include <linux/debugfs.h>
3087    
3088     #include <asm/tlbflush.h>
3089     #include <asm/mmu_context.h>
3090     +#include <asm/nospec-branch.h>
3091     #include <asm/cache.h>
3092     #include <asm/apic.h>
3093     #include <asm/uv/uv.h>
3094     -#include <linux/debugfs.h>
3095    
3096     /*
3097     * TLB flushing, formerly SMP-only
3098     @@ -247,6 +248,27 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
3099     } else {
3100     u16 new_asid;
3101     bool need_flush;
3102     + u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
3103     +
3104     + /*
3105     + * Avoid user/user BTB poisoning by flushing the branch
3106     + * predictor when switching between processes. This stops
3107     + * one process from doing Spectre-v2 attacks on another.
3108     + *
3109     + * As an optimization, flush indirect branches only when
3110     + * switching into processes that disable dumping. This
3111     + * protects high value processes like gpg, without having
3112     + * too high performance overhead. IBPB is *expensive*!
3113     + *
3114     + * This will not flush branches when switching into kernel
3115     + * threads. It will also not flush if we switch to idle
3116     + * thread and back to the same process. It will flush if we
3117     + * switch to a different non-dumpable process.
3118     + */
3119     + if (tsk && tsk->mm &&
3120     + tsk->mm->context.ctx_id != last_ctx_id &&
3121     + get_dumpable(tsk->mm) != SUID_DUMP_USER)
3122     + indirect_branch_prediction_barrier();
3123    
3124     if (IS_ENABLED(CONFIG_VMAP_STACK)) {
3125     /*
3126     @@ -292,6 +314,14 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
3127     trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
3128     }
3129    
3130     + /*
3131     + * Record last user mm's context id, so we can avoid
3132     + * flushing branch buffer with IBPB if we switch back
3133     + * to the same user.
3134     + */
3135     + if (next != &init_mm)
3136     + this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
3137     +
3138     this_cpu_write(cpu_tlbstate.loaded_mm, next);
3139     this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
3140     }
3141     @@ -369,6 +399,7 @@ void initialize_tlbstate_and_flush(void)
3142     write_cr3(build_cr3(mm->pgd, 0));
3143    
3144     /* Reinitialize tlbstate. */
3145     + this_cpu_write(cpu_tlbstate.last_ctx_id, mm->context.ctx_id);
3146     this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
3147     this_cpu_write(cpu_tlbstate.next_asid, 1);
3148     this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
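The barrier policy in the switch_mm_irqs_off() hunk boils down to a three-part predicate: there is a real user mm, it is not the mm we last ran user code from, and its owner has opted out of core dumps. A standalone model of that decision (names are illustrative; SUID_DUMP_USER is the value get_dumpable() returns for an ordinary, dumpable task):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define SUID_DUMP_USER 1

    /* Issue an IBPB only for a switch to a different, non-dumpable user mm. */
    static bool ibpb_needed(bool has_user_mm, uint64_t next_ctx_id,
                            uint64_t last_ctx_id, int dumpable)
    {
            return has_user_mm &&
                   next_ctx_id != last_ctx_id &&
                   dumpable != SUID_DUMP_USER;
    }

    int main(void)
    {
            printf("%d\n", ibpb_needed(true, 42, 42, 0));              /* same mm: no */
            printf("%d\n", ibpb_needed(true, 43, 42, 0));              /* non-dumpable: yes */
            printf("%d\n", ibpb_needed(true, 44, 42, SUID_DUMP_USER)); /* ordinary: no */
            return 0;
    }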
3149     diff --git a/drivers/auxdisplay/img-ascii-lcd.c b/drivers/auxdisplay/img-ascii-lcd.c
3150     index db040b378224..9180b9bd5821 100644
3151     --- a/drivers/auxdisplay/img-ascii-lcd.c
3152     +++ b/drivers/auxdisplay/img-ascii-lcd.c
3153     @@ -441,3 +441,7 @@ static struct platform_driver img_ascii_lcd_driver = {
3154     .remove = img_ascii_lcd_remove,
3155     };
3156     module_platform_driver(img_ascii_lcd_driver);
3157     +
3158     +MODULE_DESCRIPTION("Imagination Technologies ASCII LCD Display");
3159     +MODULE_AUTHOR("Paul Burton <paul.burton@mips.com>");
3160     +MODULE_LICENSE("GPL");
3161     diff --git a/drivers/fpga/fpga-region.c b/drivers/fpga/fpga-region.c
3162     index d9ab7c75b14f..e0c73ceba2ed 100644
3163     --- a/drivers/fpga/fpga-region.c
3164     +++ b/drivers/fpga/fpga-region.c
3165     @@ -147,6 +147,7 @@ static struct fpga_manager *fpga_region_get_manager(struct fpga_region *region)
3166     mgr_node = of_parse_phandle(np, "fpga-mgr", 0);
3167     if (mgr_node) {
3168     mgr = of_fpga_mgr_get(mgr_node);
3169     + of_node_put(mgr_node);
3170     of_node_put(np);
3171     return mgr;
3172     }
3173     @@ -192,10 +193,13 @@ static int fpga_region_get_bridges(struct fpga_region *region,
3174     parent_br = region_np->parent;
3175    
3176     /* If overlay has a list of bridges, use it. */
3177     - if (of_parse_phandle(overlay, "fpga-bridges", 0))
3178     + br = of_parse_phandle(overlay, "fpga-bridges", 0);
3179     + if (br) {
3180     + of_node_put(br);
3181     np = overlay;
3182     - else
3183     + } else {
3184     np = region_np;
3185     + }
3186    
3187     for (i = 0; ; i++) {
3188     br = of_parse_phandle(np, "fpga-bridges", i);
3189     @@ -203,12 +207,15 @@ static int fpga_region_get_bridges(struct fpga_region *region,
3190     break;
3191    
3192     /* If parent bridge is in list, skip it. */
3193     - if (br == parent_br)
3194     + if (br == parent_br) {
3195     + of_node_put(br);
3196     continue;
3197     + }
3198    
3199     /* If node is a bridge, get it and add to list */
3200     ret = fpga_bridge_get_to_list(br, region->info,
3201     &region->bridge_list);
3202     + of_node_put(br);
3203    
3204     /* If any of the bridges are in use, give up */
3205     if (ret == -EBUSY) {
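Each of the fpga-region hunks fixes the same leak: of_parse_phandle() returns a device node with its refcount raised, and every path out of the loop body, including "skip this bridge", must drop it. A minimal sketch of the balanced pattern using the real OF helpers (kernel context only, not standalone; demo_walk_bridges() is an illustrative name):

    #include <linux/of.h>

    /* Iterate a phandle list and keep get/put balanced on every path. */
    static void demo_walk_bridges(struct device_node *np)
    {
            struct device_node *br;
            int i;

            for (i = 0; ; i++) {
                    br = of_parse_phandle(np, "fpga-bridges", i); /* takes a reference */
                    if (!br)
                            break;

                    /* ... use br ... */

                    of_node_put(br);        /* drop it before the next iteration */
            }
    }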
3206     diff --git a/drivers/iio/accel/kxsd9-i2c.c b/drivers/iio/accel/kxsd9-i2c.c
3207     index 98fbb628d5bd..38411e1c155b 100644
3208     --- a/drivers/iio/accel/kxsd9-i2c.c
3209     +++ b/drivers/iio/accel/kxsd9-i2c.c
3210     @@ -63,3 +63,6 @@ static struct i2c_driver kxsd9_i2c_driver = {
3211     .id_table = kxsd9_i2c_id,
3212     };
3213     module_i2c_driver(kxsd9_i2c_driver);
3214     +
3215     +MODULE_LICENSE("GPL v2");
3216     +MODULE_DESCRIPTION("KXSD9 accelerometer I2C interface");
3217     diff --git a/drivers/iio/adc/qcom-vadc-common.c b/drivers/iio/adc/qcom-vadc-common.c
3218     index 47d24ae5462f..fe3d7826783c 100644
3219     --- a/drivers/iio/adc/qcom-vadc-common.c
3220     +++ b/drivers/iio/adc/qcom-vadc-common.c
3221     @@ -5,6 +5,7 @@
3222     #include <linux/math64.h>
3223     #include <linux/log2.h>
3224     #include <linux/err.h>
3225     +#include <linux/module.h>
3226    
3227     #include "qcom-vadc-common.h"
3228    
3229     @@ -229,3 +230,6 @@ int qcom_vadc_decimation_from_dt(u32 value)
3230     return __ffs64(value / VADC_DECIMATION_MIN);
3231     }
3232     EXPORT_SYMBOL(qcom_vadc_decimation_from_dt);
3233     +
3234     +MODULE_LICENSE("GPL v2");
3235     +MODULE_DESCRIPTION("Qualcomm ADC common functionality");
3236     diff --git a/drivers/pinctrl/pxa/pinctrl-pxa2xx.c b/drivers/pinctrl/pxa/pinctrl-pxa2xx.c
3237     index 866aa3ce1ac9..6cf0006d4c8d 100644
3238     --- a/drivers/pinctrl/pxa/pinctrl-pxa2xx.c
3239     +++ b/drivers/pinctrl/pxa/pinctrl-pxa2xx.c
3240     @@ -436,3 +436,7 @@ int pxa2xx_pinctrl_exit(struct platform_device *pdev)
3241     return 0;
3242     }
3243     EXPORT_SYMBOL_GPL(pxa2xx_pinctrl_exit);
3244     +
3245     +MODULE_AUTHOR("Robert Jarzmik <robert.jarzmik@free.fr>");
3246     +MODULE_DESCRIPTION("Marvell PXA2xx pinctrl driver");
3247     +MODULE_LICENSE("GPL v2");
3248     diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
3249     index 854995e1cae7..7e7e6eb95b0a 100644
3250     --- a/drivers/tty/serial/serial_core.c
3251     +++ b/drivers/tty/serial/serial_core.c
3252     @@ -974,6 +974,8 @@ static int uart_set_info(struct tty_struct *tty, struct tty_port *port,
3253     }
3254     } else {
3255     retval = uart_startup(tty, state, 1);
3256     + if (retval == 0)
3257     + tty_port_set_initialized(port, true);
3258     if (retval > 0)
3259     retval = 0;
3260     }
3261     diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
3262     index 1c65817673db..41615f38bcff 100644
3263     --- a/include/linux/fdtable.h
3264     +++ b/include/linux/fdtable.h
3265     @@ -10,6 +10,7 @@
3266     #include <linux/compiler.h>
3267     #include <linux/spinlock.h>
3268     #include <linux/rcupdate.h>
3269     +#include <linux/nospec.h>
3270     #include <linux/types.h>
3271     #include <linux/init.h>
3272     #include <linux/fs.h>
3273     @@ -82,8 +83,10 @@ static inline struct file *__fcheck_files(struct files_struct *files, unsigned i
3274     {
3275     struct fdtable *fdt = rcu_dereference_raw(files->fdt);
3276    
3277     - if (fd < fdt->max_fds)
3278     + if (fd < fdt->max_fds) {
3279     + fd = array_index_nospec(fd, fdt->max_fds);
3280     return rcu_dereference_raw(fdt->fd[fd]);
3281     + }
3282     return NULL;
3283     }
3284    
3285     diff --git a/include/linux/init.h b/include/linux/init.h
3286     index ea1b31101d9e..506a98151131 100644
3287     --- a/include/linux/init.h
3288     +++ b/include/linux/init.h
3289     @@ -5,6 +5,13 @@
3290     #include <linux/compiler.h>
3291     #include <linux/types.h>
3292    
3293     +/* Built-in __init functions needn't be compiled with retpoline */
3294     +#if defined(RETPOLINE) && !defined(MODULE)
3295     +#define __noretpoline __attribute__((indirect_branch("keep")))
3296     +#else
3297     +#define __noretpoline
3298     +#endif
3299     +
3300     /* These macros are used to mark some functions or
3301     * initialized data (doesn't apply to uninitialized data)
3302     * as `initialization' functions. The kernel can take this
3303     @@ -40,7 +47,7 @@
3304    
3305     /* These are for everybody (although not all archs will actually
3306     discard it in modules) */
3307     -#define __init __section(.init.text) __cold __latent_entropy
3308     +#define __init __section(.init.text) __cold __latent_entropy __noretpoline
3309     #define __initdata __section(.init.data)
3310     #define __initconst __section(.init.rodata)
3311     #define __exitdata __section(.exit.data)
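With a retpoline-capable GCC, __noretpoline expands to the indirect_branch("keep") function attribute, so __init code keeps plain indirect calls instead of going through thunks. A small standalone illustration: build it with a compiler that understands -mindirect-branch=thunk and compare the disassembly of the two functions (the demo_* names are made up):

    /* gcc -O2 -mindirect-branch=thunk -c demo.c
     * demo_thunked() calls through __x86_indirect_thunk_*,
     * demo_kept() uses an ordinary indirect call. */
    typedef void (*demo_fn)(void);

    void demo_thunked(demo_fn f)
    {
            f();
    }

    __attribute__((indirect_branch("keep")))
    void demo_kept(demo_fn f)
    {
            f();
    }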
3312     diff --git a/include/linux/module.h b/include/linux/module.h
3313     index c69b49abe877..1d8f245967be 100644
3314     --- a/include/linux/module.h
3315     +++ b/include/linux/module.h
3316     @@ -801,6 +801,15 @@ static inline void module_bug_finalize(const Elf_Ehdr *hdr,
3317     static inline void module_bug_cleanup(struct module *mod) {}
3318     #endif /* CONFIG_GENERIC_BUG */
3319    
3320     +#ifdef RETPOLINE
3321     +extern bool retpoline_module_ok(bool has_retpoline);
3322     +#else
3323     +static inline bool retpoline_module_ok(bool has_retpoline)
3324     +{
3325     + return true;
3326     +}
3327     +#endif
3328     +
3329     #ifdef CONFIG_MODULE_SIG
3330     static inline bool module_sig_ok(struct module *module)
3331     {
3332     diff --git a/include/linux/nospec.h b/include/linux/nospec.h
3333     new file mode 100644
3334     index 000000000000..b99bced39ac2
3335     --- /dev/null
3336     +++ b/include/linux/nospec.h
3337     @@ -0,0 +1,72 @@
3338     +// SPDX-License-Identifier: GPL-2.0
3339     +// Copyright(c) 2018 Linus Torvalds. All rights reserved.
3340     +// Copyright(c) 2018 Alexei Starovoitov. All rights reserved.
3341     +// Copyright(c) 2018 Intel Corporation. All rights reserved.
3342     +
3343     +#ifndef _LINUX_NOSPEC_H
3344     +#define _LINUX_NOSPEC_H
3345     +
3346     +/**
3347     + * array_index_mask_nospec() - generate a ~0 mask when index < size, 0 otherwise
3348     + * @index: array element index
3349     + * @size: number of elements in array
3350     + *
3351     + * When @index is out of bounds (@index >= @size), the sign bit will be
3352     + * set. Extend the sign bit to all bits and invert, giving a result of
3353     + * zero for an out of bounds index, or ~0 if within bounds [0, @size).
3354     + */
3355     +#ifndef array_index_mask_nospec
3356     +static inline unsigned long array_index_mask_nospec(unsigned long index,
3357     + unsigned long size)
3358     +{
3359     + /*
3360     + * Warn developers about inappropriate array_index_nospec() usage.
3361     + *
3362     + * Even if the CPU speculates past the WARN_ONCE branch, the
3363     + * sign bit of @index is taken into account when generating the
3364     + * mask.
3365     + *
3366     + * This warning is compiled out when the compiler can infer that
3367     + * @index and @size are less than LONG_MAX.
3368     + */
3369     + if (WARN_ONCE(index > LONG_MAX || size > LONG_MAX,
3370     + "array_index_nospec() limited to range of [0, LONG_MAX]\n"))
3371     + return 0;
3372     +
3373     + /*
3374     + * Always calculate and emit the mask even if the compiler
3375     + * thinks the mask is not needed. The compiler does not take
3376     + * into account the value of @index under speculation.
3377     + */
3378     + OPTIMIZER_HIDE_VAR(index);
3379     + return ~(long)(index | (size - 1UL - index)) >> (BITS_PER_LONG - 1);
3380     +}
3381     +#endif
3382     +
3383     +/*
3384     + * array_index_nospec - sanitize an array index after a bounds check
3385     + *
3386     + * For a code sequence like:
3387     + *
3388     + * if (index < size) {
3389     + * index = array_index_nospec(index, size);
3390     + * val = array[index];
3391     + * }
3392     + *
3393     + * ...if the CPU speculates past the bounds check then
3394     + * array_index_nospec() will clamp the index within the range of [0,
3395     + * size).
3396     + */
3397     +#define array_index_nospec(index, size) \
3398     +({ \
3399     + typeof(index) _i = (index); \
3400     + typeof(size) _s = (size); \
3401     + unsigned long _mask = array_index_mask_nospec(_i, _s); \
3402     + \
3403     + BUILD_BUG_ON(sizeof(_i) > sizeof(long)); \
3404     + BUILD_BUG_ON(sizeof(_s) > sizeof(long)); \
3405     + \
3406     + _i &= _mask; \
3407     + _i; \
3408     +})
3409     +#endif /* _LINUX_NOSPEC_H */
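The mask expression above is easiest to see with concrete numbers. A userspace copy of just the arithmetic, with the WARN_ONCE, OPTIMIZER_HIDE_VAR and BUILD_BUG_ON plumbing stripped out, showing that in-bounds indices get an all-ones mask while out-of-bounds indices collapse to zero:

    #include <stdio.h>
    #include <limits.h>

    #define BITS_PER_LONG (sizeof(long) * CHAR_BIT)

    /* Same expression as array_index_mask_nospec() above. */
    static unsigned long demo_index_mask(unsigned long index, unsigned long size)
    {
            return ~(long)(index | (size - 1UL - index)) >> (BITS_PER_LONG - 1);
    }

    int main(void)
    {
            unsigned long size = 8;
            unsigned long idx;

            for (idx = 0; idx < 12; idx++)
                    printf("index %2lu: mask %#lx, clamped %lu\n",
                           idx, demo_index_mask(idx, size),
                           idx & demo_index_mask(idx, size));
            return 0;
    }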
3410     diff --git a/kernel/module.c b/kernel/module.c
3411     index dea01ac9cb74..09e48eee4d55 100644
3412     --- a/kernel/module.c
3413     +++ b/kernel/module.c
3414     @@ -2863,6 +2863,15 @@ static int check_modinfo_livepatch(struct module *mod, struct load_info *info)
3415     }
3416     #endif /* CONFIG_LIVEPATCH */
3417    
3418     +static void check_modinfo_retpoline(struct module *mod, struct load_info *info)
3419     +{
3420     + if (retpoline_module_ok(get_modinfo(info, "retpoline")))
3421     + return;
3422     +
3423     + pr_warn("%s: loading module not compiled with retpoline compiler.\n",
3424     + mod->name);
3425     +}
3426     +
3427     /* Sets info->hdr and info->len. */
3428     static int copy_module_from_user(const void __user *umod, unsigned long len,
3429     struct load_info *info)
3430     @@ -3029,6 +3038,8 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
3431     add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
3432     }
3433    
3434     + check_modinfo_retpoline(mod, info);
3435     +
3436     if (get_modinfo(info, "staging")) {
3437     add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
3438     pr_warn("%s: module is from the staging directory, the quality "
3439     diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
3440     index 542a4fc0a8d7..4bbcfc1e2d43 100644
3441     --- a/net/wireless/nl80211.c
3442     +++ b/net/wireless/nl80211.c
3443     @@ -16,6 +16,7 @@
3444     #include <linux/nl80211.h>
3445     #include <linux/rtnetlink.h>
3446     #include <linux/netlink.h>
3447     +#include <linux/nospec.h>
3448     #include <linux/etherdevice.h>
3449     #include <net/net_namespace.h>
3450     #include <net/genetlink.h>
3451     @@ -2056,20 +2057,22 @@ static const struct nla_policy txq_params_policy[NL80211_TXQ_ATTR_MAX + 1] = {
3452     static int parse_txq_params(struct nlattr *tb[],
3453     struct ieee80211_txq_params *txq_params)
3454     {
3455     + u8 ac;
3456     +
3457     if (!tb[NL80211_TXQ_ATTR_AC] || !tb[NL80211_TXQ_ATTR_TXOP] ||
3458     !tb[NL80211_TXQ_ATTR_CWMIN] || !tb[NL80211_TXQ_ATTR_CWMAX] ||
3459     !tb[NL80211_TXQ_ATTR_AIFS])
3460     return -EINVAL;
3461    
3462     - txq_params->ac = nla_get_u8(tb[NL80211_TXQ_ATTR_AC]);
3463     + ac = nla_get_u8(tb[NL80211_TXQ_ATTR_AC]);
3464     txq_params->txop = nla_get_u16(tb[NL80211_TXQ_ATTR_TXOP]);
3465     txq_params->cwmin = nla_get_u16(tb[NL80211_TXQ_ATTR_CWMIN]);
3466     txq_params->cwmax = nla_get_u16(tb[NL80211_TXQ_ATTR_CWMAX]);
3467     txq_params->aifs = nla_get_u8(tb[NL80211_TXQ_ATTR_AIFS]);
3468    
3469     - if (txq_params->ac >= NL80211_NUM_ACS)
3470     + if (ac >= NL80211_NUM_ACS)
3471     return -EINVAL;
3472     -
3473     + txq_params->ac = array_index_nospec(ac, NL80211_NUM_ACS);
3474     return 0;
3475     }
3476    
3477     diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
3478     index f51cf977c65b..6510536c06df 100644
3479     --- a/scripts/mod/modpost.c
3480     +++ b/scripts/mod/modpost.c
3481     @@ -2165,6 +2165,14 @@ static void add_intree_flag(struct buffer *b, int is_intree)
3482     buf_printf(b, "\nMODULE_INFO(intree, \"Y\");\n");
3483     }
3484    
3485     +/* Cannot check for assembler */
3486     +static void add_retpoline(struct buffer *b)
3487     +{
3488     + buf_printf(b, "\n#ifdef RETPOLINE\n");
3489     + buf_printf(b, "MODULE_INFO(retpoline, \"Y\");\n");
3490     + buf_printf(b, "#endif\n");
3491     +}
3492     +
3493     static void add_staging_flag(struct buffer *b, const char *name)
3494     {
3495     static const char *staging_dir = "drivers/staging";
3496     @@ -2506,6 +2514,7 @@ int main(int argc, char **argv)
3497     err |= check_modname_len(mod);
3498     add_header(&buf, mod);
3499     add_intree_flag(&buf, !external_module);
3500     + add_retpoline(&buf);
3501     add_staging_flag(&buf, mod->name);
3502     err |= add_versions(&buf, mod);
3503     add_depends(&buf, mod, modules);
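The net effect of add_retpoline() is one extra, conditionally compiled line in every generated *.mod.c, which check_modinfo_retpoline() in kernel/module.c then looks for at load time. Reconstructed from the buf_printf() calls above, the emitted fragment is:

    #ifdef RETPOLINE
    MODULE_INFO(retpoline, "Y");
    #endif

On a built module the tag can be read back with modinfo -F retpoline <module>.ko.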
3504     diff --git a/sound/soc/codecs/pcm512x-spi.c b/sound/soc/codecs/pcm512x-spi.c
3505     index 25c63510ae15..7cdd2dc4fd79 100644
3506     --- a/sound/soc/codecs/pcm512x-spi.c
3507     +++ b/sound/soc/codecs/pcm512x-spi.c
3508     @@ -70,3 +70,7 @@ static struct spi_driver pcm512x_spi_driver = {
3509     };
3510    
3511     module_spi_driver(pcm512x_spi_driver);
3512     +
3513     +MODULE_DESCRIPTION("ASoC PCM512x codec driver - SPI");
3514     +MODULE_AUTHOR("Mark Brown <broonie@kernel.org>");
3515     +MODULE_LICENSE("GPL v2");
3516     diff --git a/tools/objtool/check.c b/tools/objtool/check.c
3517     index f40d46e24bcc..9cd028aa1509 100644
3518     --- a/tools/objtool/check.c
3519     +++ b/tools/objtool/check.c
3520     @@ -543,18 +543,14 @@ static int add_call_destinations(struct objtool_file *file)
3521     dest_off = insn->offset + insn->len + insn->immediate;
3522     insn->call_dest = find_symbol_by_offset(insn->sec,
3523     dest_off);
3524     - /*
3525     - * FIXME: Thanks to retpolines, it's now considered
3526     - * normal for a function to call within itself. So
3527     - * disable this warning for now.
3528     - */
3529     -#if 0
3530     - if (!insn->call_dest) {
3531     - WARN_FUNC("can't find call dest symbol at offset 0x%lx",
3532     - insn->sec, insn->offset, dest_off);
3533     +
3534     + if (!insn->call_dest && !insn->ignore) {
3535     + WARN_FUNC("unsupported intra-function call",
3536     + insn->sec, insn->offset);
3537     + WARN("If this is a retpoline, please patch it in with alternatives and annotate it with ANNOTATE_NOSPEC_ALTERNATIVE.");
3538     return -1;
3539     }
3540     -#endif
3541     +
3542     } else if (rela->sym->type == STT_SECTION) {
3543     insn->call_dest = find_symbol_by_offset(rela->sym->sec,
3544     rela->addend+4);
3545     @@ -598,7 +594,7 @@ static int handle_group_alt(struct objtool_file *file,
3546     struct instruction *orig_insn,
3547     struct instruction **new_insn)
3548     {
3549     - struct instruction *last_orig_insn, *last_new_insn, *insn, *fake_jump;
3550     + struct instruction *last_orig_insn, *last_new_insn, *insn, *fake_jump = NULL;
3551     unsigned long dest_off;
3552    
3553     last_orig_insn = NULL;
3554     @@ -614,28 +610,30 @@ static int handle_group_alt(struct objtool_file *file,
3555     last_orig_insn = insn;
3556     }
3557    
3558     - if (!next_insn_same_sec(file, last_orig_insn)) {
3559     - WARN("%s: don't know how to handle alternatives at end of section",
3560     - special_alt->orig_sec->name);
3561     - return -1;
3562     - }
3563     -
3564     - fake_jump = malloc(sizeof(*fake_jump));
3565     - if (!fake_jump) {
3566     - WARN("malloc failed");
3567     - return -1;
3568     + if (next_insn_same_sec(file, last_orig_insn)) {
3569     + fake_jump = malloc(sizeof(*fake_jump));
3570     + if (!fake_jump) {
3571     + WARN("malloc failed");
3572     + return -1;
3573     + }
3574     + memset(fake_jump, 0, sizeof(*fake_jump));
3575     + INIT_LIST_HEAD(&fake_jump->alts);
3576     + clear_insn_state(&fake_jump->state);
3577     +
3578     + fake_jump->sec = special_alt->new_sec;
3579     + fake_jump->offset = -1;
3580     + fake_jump->type = INSN_JUMP_UNCONDITIONAL;
3581     + fake_jump->jump_dest = list_next_entry(last_orig_insn, list);
3582     + fake_jump->ignore = true;
3583     }
3584     - memset(fake_jump, 0, sizeof(*fake_jump));
3585     - INIT_LIST_HEAD(&fake_jump->alts);
3586     - clear_insn_state(&fake_jump->state);
3587     -
3588     - fake_jump->sec = special_alt->new_sec;
3589     - fake_jump->offset = -1;
3590     - fake_jump->type = INSN_JUMP_UNCONDITIONAL;
3591     - fake_jump->jump_dest = list_next_entry(last_orig_insn, list);
3592     - fake_jump->ignore = true;
3593    
3594     if (!special_alt->new_len) {
3595     + if (!fake_jump) {
3596     + WARN("%s: empty alternative at end of section",
3597     + special_alt->orig_sec->name);
3598     + return -1;
3599     + }
3600     +
3601     *new_insn = fake_jump;
3602     return 0;
3603     }
3604     @@ -648,6 +646,8 @@ static int handle_group_alt(struct objtool_file *file,
3605    
3606     last_new_insn = insn;
3607    
3608     + insn->ignore = orig_insn->ignore_alts;
3609     +
3610     if (insn->type != INSN_JUMP_CONDITIONAL &&
3611     insn->type != INSN_JUMP_UNCONDITIONAL)
3612     continue;
3613     @@ -656,8 +656,14 @@ static int handle_group_alt(struct objtool_file *file,
3614     continue;
3615    
3616     dest_off = insn->offset + insn->len + insn->immediate;
3617     - if (dest_off == special_alt->new_off + special_alt->new_len)
3618     + if (dest_off == special_alt->new_off + special_alt->new_len) {
3619     + if (!fake_jump) {
3620     + WARN("%s: alternative jump to end of section",
3621     + special_alt->orig_sec->name);
3622     + return -1;
3623     + }
3624     insn->jump_dest = fake_jump;
3625     + }
3626    
3627     if (!insn->jump_dest) {
3628     WARN_FUNC("can't find alternative jump destination",
3629     @@ -672,7 +678,8 @@ static int handle_group_alt(struct objtool_file *file,
3630     return -1;
3631     }
3632    
3633     - list_add(&fake_jump->list, &last_new_insn->list);
3634     + if (fake_jump)
3635     + list_add(&fake_jump->list, &last_new_insn->list);
3636    
3637     return 0;
3638     }
3639     @@ -729,10 +736,6 @@ static int add_special_section_alts(struct objtool_file *file)
3640     goto out;
3641     }
3642    
3643     - /* Ignore retpoline alternatives. */
3644     - if (orig_insn->ignore_alts)
3645     - continue;
3646     -
3647     new_insn = NULL;
3648     if (!special_alt->group || special_alt->new_len) {
3649     new_insn = find_insn(file, special_alt->new_sec,
3650     @@ -1089,11 +1092,11 @@ static int decode_sections(struct objtool_file *file)
3651     if (ret)
3652     return ret;
3653    
3654     - ret = add_call_destinations(file);
3655     + ret = add_special_section_alts(file);
3656     if (ret)
3657     return ret;
3658    
3659     - ret = add_special_section_alts(file);
3660     + ret = add_call_destinations(file);
3661     if (ret)
3662     return ret;
3663    
3664     @@ -1720,10 +1723,12 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
3665    
3666     insn->visited = true;
3667    
3668     - list_for_each_entry(alt, &insn->alts, list) {
3669     - ret = validate_branch(file, alt->insn, state);
3670     - if (ret)
3671     - return 1;
3672     + if (!insn->ignore_alts) {
3673     + list_for_each_entry(alt, &insn->alts, list) {
3674     + ret = validate_branch(file, alt->insn, state);
3675     + if (ret)
3676     + return 1;
3677     + }
3678     }
3679    
3680     switch (insn->type) {
3681     diff --git a/tools/objtool/orc_gen.c b/tools/objtool/orc_gen.c
3682     index e61fe703197b..18384d9be4e1 100644
3683     --- a/tools/objtool/orc_gen.c
3684     +++ b/tools/objtool/orc_gen.c
3685     @@ -98,6 +98,11 @@ static int create_orc_entry(struct section *u_sec, struct section *ip_relasec,
3686     struct orc_entry *orc;
3687     struct rela *rela;
3688    
3689     + if (!insn_sec->sym) {
3690     + WARN("missing symbol for section %s", insn_sec->name);
3691     + return -1;
3692     + }
3693     +
3694     /* populate ORC data */
3695     orc = (struct orc_entry *)u_sec->data->d_buf + idx;
3696     memcpy(orc, o, sizeof(*orc));