Magellan Linux

Annotation of /trunk/kernel-alx/patches-4.14/0117-4.14.18-all-fixes.patch



Revision 3238
Fri Nov 9 12:14:58 2018 UTC by niro
File size: 125071 byte(s)
-added up to patches-4.14.79
1 niro 3238 diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
2     index 8122b5f98ea1..c76afdcafbef 100644
3     --- a/Documentation/admin-guide/kernel-parameters.txt
4     +++ b/Documentation/admin-guide/kernel-parameters.txt
5     @@ -2718,8 +2718,6 @@
6     norandmaps Don't use address space randomization. Equivalent to
7     echo 0 > /proc/sys/kernel/randomize_va_space
8    
9     - noreplace-paravirt [X86,IA-64,PV_OPS] Don't patch paravirt_ops
10     -
11     noreplace-smp [X86-32,SMP] Don't replace SMP instructions
12     with UP alternatives
13    
14     diff --git a/Documentation/speculation.txt b/Documentation/speculation.txt
15     new file mode 100644
16     index 000000000000..e9e6cbae2841
17     --- /dev/null
18     +++ b/Documentation/speculation.txt
19     @@ -0,0 +1,90 @@
20     +This document explains potential effects of speculation, and how undesirable
21     +effects can be mitigated portably using common APIs.
22     +
23     +===========
24     +Speculation
25     +===========
26     +
27     +To improve performance and minimize average latencies, many contemporary CPUs
28     +employ speculative execution techniques such as branch prediction, performing
29     +work which may be discarded at a later stage.
30     +
31     +Typically speculative execution cannot be observed from architectural state,
32     +such as the contents of registers. However, in some cases it is possible to
33     +observe its impact on microarchitectural state, such as the presence or
34     +absence of data in caches. Such state may form side-channels which can be
35     +observed to extract secret information.
36     +
37     +For example, in the presence of branch prediction, it is possible for bounds
38     +checks to be ignored by code which is speculatively executed. Consider the
39     +following code:
40     +
41     + int load_array(int *array, unsigned int index)
42     + {
43     + if (index >= MAX_ARRAY_ELEMS)
44     + return 0;
45     + else
46     + return array[index];
47     + }
48     +
49     +Which, on arm64, may be compiled to an assembly sequence such as:
50     +
51     + CMP <index>, #MAX_ARRAY_ELEMS
52     + B.LT less
53     + MOV <returnval>, #0
54     + RET
55     + less:
56     + LDR <returnval>, [<array>, <index>]
57     + RET
58     +
59     +It is possible that a CPU mis-predicts the conditional branch, and
60     +speculatively loads array[index], even if index >= MAX_ARRAY_ELEMS. This
61     +value will subsequently be discarded, but the speculated load may affect
62     +microarchitectural state which can be subsequently measured.
63     +
64     +More complex sequences involving multiple dependent memory accesses may
65     +result in sensitive information being leaked. Consider the following
66     +code, building on the prior example:
67     +
68     + int load_dependent_arrays(int *arr1, int *arr2, int index)
69     + {
70     + int val1, val2;
71     +
72     + val1 = load_array(arr1, index);
73     + val2 = load_array(arr2, val1);
74     +
75     + return val2;
76     + }
77     +
78     +Under speculation, the first call to load_array() may return the value
79     +of an out-of-bounds address, while the second call will influence
80     +microarchitectural state dependent on this value. This may provide an
81     +arbitrary read primitive.
82     +
83     +====================================
84     +Mitigating speculation side-channels
85     +====================================
86     +
87     +The kernel provides a generic API to ensure that bounds checks are
88     +respected even under speculation. Architectures which are affected by
89     +speculation-based side-channels are expected to implement these
90     +primitives.
91     +
92     +The array_index_nospec() helper in <linux/nospec.h> can be used to
93     +prevent information from being leaked via side-channels.
94     +
95     +A call to array_index_nospec(index, size) returns a sanitized index
96     +value that is bounded to [0, size) even under cpu speculation
97     +conditions.
98     +
99     +This can be used to protect the earlier load_array() example:
100     +
101     + int load_array(int *array, unsigned int index)
102     + {
103     + if (index >= MAX_ARRAY_ELEMS)
104     + return 0;
105     + else {
106     + index = array_index_nospec(index, MAX_ARRAY_ELEMS);
107     + return array[index];
108     + }
109     + }
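
The new document above relies on the generic array_index_nospec() helper from <linux/nospec.h>. As a rough sketch of how such a helper can be composed from an architecture's mask primitive (simplified here into a plain function; the in-tree helper is a macro with extra type handling), the sanitization amounts to masking the index:

	static inline unsigned long array_index_nospec_sketch(unsigned long index,
							      unsigned long size)
	{
		/* ~0UL when index < size, 0 otherwise, even under speculation */
		unsigned long mask = array_index_mask_nospec(index, size);

		/* an out-of-bounds index collapses to 0 rather than leaking */
		return index & mask;
	}

Because the mask is computed without a conditional branch, a mispredicted bounds check cannot speculatively bypass it, so the subsequent array access stays within the checked bounds.
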
110     diff --git a/Makefile b/Makefile
111     index 7ed993896dd5..a69e5da9ed86 100644
112     --- a/Makefile
113     +++ b/Makefile
114     @@ -1,7 +1,7 @@
115     # SPDX-License-Identifier: GPL-2.0
116     VERSION = 4
117     PATCHLEVEL = 14
118     -SUBLEVEL = 17
119     +SUBLEVEL = 18
120     EXTRAVERSION =
121     NAME = Petit Gorille
122    
123     diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
124     index cb782ac1c35d..fe418226df7f 100644
125     --- a/arch/powerpc/Kconfig
126     +++ b/arch/powerpc/Kconfig
127     @@ -164,6 +164,7 @@ config PPC
128     select GENERIC_CLOCKEVENTS_BROADCAST if SMP
129     select GENERIC_CMOS_UPDATE
130     select GENERIC_CPU_AUTOPROBE
131     + select GENERIC_CPU_VULNERABILITIES if PPC_BOOK3S_64
132     select GENERIC_IRQ_SHOW
133     select GENERIC_IRQ_SHOW_LEVEL
134     select GENERIC_SMP_IDLE_THREAD
135     diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
136     index 935059cb9e40..9527a4c6cbc2 100644
137     --- a/arch/powerpc/kernel/setup_64.c
138     +++ b/arch/powerpc/kernel/setup_64.c
139     @@ -38,6 +38,7 @@
140     #include <linux/memory.h>
141     #include <linux/nmi.h>
142    
143     +#include <asm/debugfs.h>
144     #include <asm/io.h>
145     #include <asm/kdump.h>
146     #include <asm/prom.h>
147     @@ -884,4 +885,41 @@ void __init setup_rfi_flush(enum l1d_flush_type types, bool enable)
148     if (!no_rfi_flush)
149     rfi_flush_enable(enable);
150     }
151     +
152     +#ifdef CONFIG_DEBUG_FS
153     +static int rfi_flush_set(void *data, u64 val)
154     +{
155     + if (val == 1)
156     + rfi_flush_enable(true);
157     + else if (val == 0)
158     + rfi_flush_enable(false);
159     + else
160     + return -EINVAL;
161     +
162     + return 0;
163     +}
164     +
165     +static int rfi_flush_get(void *data, u64 *val)
166     +{
167     + *val = rfi_flush ? 1 : 0;
168     + return 0;
169     +}
170     +
171     +DEFINE_SIMPLE_ATTRIBUTE(fops_rfi_flush, rfi_flush_get, rfi_flush_set, "%llu\n");
172     +
173     +static __init int rfi_flush_debugfs_init(void)
174     +{
175     + debugfs_create_file("rfi_flush", 0600, powerpc_debugfs_root, NULL, &fops_rfi_flush);
176     + return 0;
177     +}
178     +device_initcall(rfi_flush_debugfs_init);
179     +#endif
180     +
181     +ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf)
182     +{
183     + if (rfi_flush)
184     + return sprintf(buf, "Mitigation: RFI Flush\n");
185     +
186     + return sprintf(buf, "Vulnerable\n");
187     +}
188     #endif /* CONFIG_PPC_BOOK3S_64 */
189     diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
190     index 03505ffbe1b6..60e21ccfb6d6 100644
191     --- a/arch/x86/entry/common.c
192     +++ b/arch/x86/entry/common.c
193     @@ -21,6 +21,7 @@
194     #include <linux/export.h>
195     #include <linux/context_tracking.h>
196     #include <linux/user-return-notifier.h>
197     +#include <linux/nospec.h>
198     #include <linux/uprobes.h>
199     #include <linux/livepatch.h>
200     #include <linux/syscalls.h>
201     @@ -208,7 +209,7 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
202     * special case only applies after poking regs and before the
203     * very next return to user mode.
204     */
205     - current->thread.status &= ~(TS_COMPAT|TS_I386_REGS_POKED);
206     + ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED);
207     #endif
208    
209     user_enter_irqoff();
210     @@ -284,7 +285,8 @@ __visible void do_syscall_64(struct pt_regs *regs)
211     * regs->orig_ax, which changes the behavior of some syscalls.
212     */
213     if (likely((nr & __SYSCALL_MASK) < NR_syscalls)) {
214     - regs->ax = sys_call_table[nr & __SYSCALL_MASK](
215     + nr = array_index_nospec(nr & __SYSCALL_MASK, NR_syscalls);
216     + regs->ax = sys_call_table[nr](
217     regs->di, regs->si, regs->dx,
218     regs->r10, regs->r8, regs->r9);
219     }
220     @@ -306,7 +308,7 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
221     unsigned int nr = (unsigned int)regs->orig_ax;
222    
223     #ifdef CONFIG_IA32_EMULATION
224     - current->thread.status |= TS_COMPAT;
225     + ti->status |= TS_COMPAT;
226     #endif
227    
228     if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) {
229     @@ -320,6 +322,7 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
230     }
231    
232     if (likely(nr < IA32_NR_syscalls)) {
233     + nr = array_index_nospec(nr, IA32_NR_syscalls);
234     /*
235     * It's possible that a 32-bit syscall implementation
236     * takes a 64-bit parameter but nonetheless assumes that
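
Both syscall hunks above follow the same pattern: the syscall number is bounds checked as before, then clamped with array_index_nospec() before it is used to index the syscall table, so a mispredicted check can no longer steer a speculative load to an attacker-chosen table slot. A generic illustration of that pattern (handler_fn, handler_table and NR_HANDLERS are hypothetical names, not kernel code):

	#include <linux/errno.h>
	#include <linux/nospec.h>

	typedef long (*handler_fn)(unsigned long arg);

	#define NR_HANDLERS 64				/* hypothetical table size */
	extern handler_fn handler_table[NR_HANDLERS];	/* hypothetical table */

	static long dispatch(unsigned int nr, unsigned long arg)
	{
		if (nr >= NR_HANDLERS)
			return -EINVAL;

		/* clamp nr under speculation before the dependent load */
		nr = array_index_nospec(nr, NR_HANDLERS);

		return handler_table[nr](arg);
	}
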
237     diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
238     index 60c4c342316c..2a35b1e0fb90 100644
239     --- a/arch/x86/entry/entry_32.S
240     +++ b/arch/x86/entry/entry_32.S
241     @@ -252,7 +252,8 @@ ENTRY(__switch_to_asm)
242     * exist, overwrite the RSB with entries which capture
243     * speculative execution to prevent attack.
244     */
245     - FILL_RETURN_BUFFER %ebx, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
246     + /* Clobbers %ebx */
247     + FILL_RETURN_BUFFER RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
248     #endif
249    
250     /* restore callee-saved registers */
251     diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
252     index be6b66464f6a..16e2d72e79a0 100644
253     --- a/arch/x86/entry/entry_64.S
254     +++ b/arch/x86/entry/entry_64.S
255     @@ -232,91 +232,20 @@ GLOBAL(entry_SYSCALL_64_after_hwframe)
256     pushq %r9 /* pt_regs->r9 */
257     pushq %r10 /* pt_regs->r10 */
258     pushq %r11 /* pt_regs->r11 */
259     - sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */
260     - UNWIND_HINT_REGS extra=0
261     -
262     - TRACE_IRQS_OFF
263     -
264     - /*
265     - * If we need to do entry work or if we guess we'll need to do
266     - * exit work, go straight to the slow path.
267     - */
268     - movq PER_CPU_VAR(current_task), %r11
269     - testl $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
270     - jnz entry_SYSCALL64_slow_path
271     -
272     -entry_SYSCALL_64_fastpath:
273     - /*
274     - * Easy case: enable interrupts and issue the syscall. If the syscall
275     - * needs pt_regs, we'll call a stub that disables interrupts again
276     - * and jumps to the slow path.
277     - */
278     - TRACE_IRQS_ON
279     - ENABLE_INTERRUPTS(CLBR_NONE)
280     -#if __SYSCALL_MASK == ~0
281     - cmpq $__NR_syscall_max, %rax
282     -#else
283     - andl $__SYSCALL_MASK, %eax
284     - cmpl $__NR_syscall_max, %eax
285     -#endif
286     - ja 1f /* return -ENOSYS (already in pt_regs->ax) */
287     - movq %r10, %rcx
288     -
289     - /*
290     - * This call instruction is handled specially in stub_ptregs_64.
291     - * It might end up jumping to the slow path. If it jumps, RAX
292     - * and all argument registers are clobbered.
293     - */
294     -#ifdef CONFIG_RETPOLINE
295     - movq sys_call_table(, %rax, 8), %rax
296     - call __x86_indirect_thunk_rax
297     -#else
298     - call *sys_call_table(, %rax, 8)
299     -#endif
300     -.Lentry_SYSCALL_64_after_fastpath_call:
301     -
302     - movq %rax, RAX(%rsp)
303     -1:
304     + pushq %rbx /* pt_regs->rbx */
305     + pushq %rbp /* pt_regs->rbp */
306     + pushq %r12 /* pt_regs->r12 */
307     + pushq %r13 /* pt_regs->r13 */
308     + pushq %r14 /* pt_regs->r14 */
309     + pushq %r15 /* pt_regs->r15 */
310     + UNWIND_HINT_REGS
311    
312     - /*
313     - * If we get here, then we know that pt_regs is clean for SYSRET64.
314     - * If we see that no exit work is required (which we are required
315     - * to check with IRQs off), then we can go straight to SYSRET64.
316     - */
317     - DISABLE_INTERRUPTS(CLBR_ANY)
318     TRACE_IRQS_OFF
319     - movq PER_CPU_VAR(current_task), %r11
320     - testl $_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
321     - jnz 1f
322     -
323     - LOCKDEP_SYS_EXIT
324     - TRACE_IRQS_ON /* user mode is traced as IRQs on */
325     - movq RIP(%rsp), %rcx
326     - movq EFLAGS(%rsp), %r11
327     - addq $6*8, %rsp /* skip extra regs -- they were preserved */
328     - UNWIND_HINT_EMPTY
329     - jmp .Lpop_c_regs_except_rcx_r11_and_sysret
330    
331     -1:
332     - /*
333     - * The fast path looked good when we started, but something changed
334     - * along the way and we need to switch to the slow path. Calling
335     - * raise(3) will trigger this, for example. IRQs are off.
336     - */
337     - TRACE_IRQS_ON
338     - ENABLE_INTERRUPTS(CLBR_ANY)
339     - SAVE_EXTRA_REGS
340     - movq %rsp, %rdi
341     - call syscall_return_slowpath /* returns with IRQs disabled */
342     - jmp return_from_SYSCALL_64
343     -
344     -entry_SYSCALL64_slow_path:
345     /* IRQs are off. */
346     - SAVE_EXTRA_REGS
347     movq %rsp, %rdi
348     call do_syscall_64 /* returns with IRQs disabled */
349    
350     -return_from_SYSCALL_64:
351     TRACE_IRQS_IRETQ /* we're about to change IF */
352    
353     /*
354     @@ -389,7 +318,6 @@ syscall_return_via_sysret:
355     /* rcx and r11 are already restored (see code above) */
356     UNWIND_HINT_EMPTY
357     POP_EXTRA_REGS
358     -.Lpop_c_regs_except_rcx_r11_and_sysret:
359     popq %rsi /* skip r11 */
360     popq %r10
361     popq %r9
362     @@ -420,47 +348,6 @@ syscall_return_via_sysret:
363     USERGS_SYSRET64
364     END(entry_SYSCALL_64)
365    
366     -ENTRY(stub_ptregs_64)
367     - /*
368     - * Syscalls marked as needing ptregs land here.
369     - * If we are on the fast path, we need to save the extra regs,
370     - * which we achieve by trying again on the slow path. If we are on
371     - * the slow path, the extra regs are already saved.
372     - *
373     - * RAX stores a pointer to the C function implementing the syscall.
374     - * IRQs are on.
375     - */
376     - cmpq $.Lentry_SYSCALL_64_after_fastpath_call, (%rsp)
377     - jne 1f
378     -
379     - /*
380     - * Called from fast path -- disable IRQs again, pop return address
381     - * and jump to slow path
382     - */
383     - DISABLE_INTERRUPTS(CLBR_ANY)
384     - TRACE_IRQS_OFF
385     - popq %rax
386     - UNWIND_HINT_REGS extra=0
387     - jmp entry_SYSCALL64_slow_path
388     -
389     -1:
390     - JMP_NOSPEC %rax /* Called from C */
391     -END(stub_ptregs_64)
392     -
393     -.macro ptregs_stub func
394     -ENTRY(ptregs_\func)
395     - UNWIND_HINT_FUNC
396     - leaq \func(%rip), %rax
397     - jmp stub_ptregs_64
398     -END(ptregs_\func)
399     -.endm
400     -
401     -/* Instantiate ptregs_stub for each ptregs-using syscall */
402     -#define __SYSCALL_64_QUAL_(sym)
403     -#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_stub sym
404     -#define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(sym)
405     -#include <asm/syscalls_64.h>
406     -
407     /*
408     * %rdi: prev task
409     * %rsi: next task
410     @@ -495,7 +382,8 @@ ENTRY(__switch_to_asm)
411     * exist, overwrite the RSB with entries which capture
412     * speculative execution to prevent attack.
413     */
414     - FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
415     + /* Clobbers %rbx */
416     + FILL_RETURN_BUFFER RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
417     #endif
418    
419     /* restore callee-saved registers */
420     diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
421     index 9c09775e589d..c176d2fab1da 100644
422     --- a/arch/x86/entry/syscall_64.c
423     +++ b/arch/x86/entry/syscall_64.c
424     @@ -7,14 +7,11 @@
425     #include <asm/asm-offsets.h>
426     #include <asm/syscall.h>
427    
428     -#define __SYSCALL_64_QUAL_(sym) sym
429     -#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_##sym
430     -
431     -#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long __SYSCALL_64_QUAL_##qual(sym)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
432     +#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
433     #include <asm/syscalls_64.h>
434     #undef __SYSCALL_64
435    
436     -#define __SYSCALL_64(nr, sym, qual) [nr] = __SYSCALL_64_QUAL_##qual(sym),
437     +#define __SYSCALL_64(nr, sym, qual) [nr] = sym,
438    
439     extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
440    
441     diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h
442     index 0927cdc4f946..4d111616524b 100644
443     --- a/arch/x86/include/asm/asm-prototypes.h
444     +++ b/arch/x86/include/asm/asm-prototypes.h
445     @@ -38,5 +38,7 @@ INDIRECT_THUNK(dx)
446     INDIRECT_THUNK(si)
447     INDIRECT_THUNK(di)
448     INDIRECT_THUNK(bp)
449     -INDIRECT_THUNK(sp)
450     +asmlinkage void __fill_rsb(void);
451     +asmlinkage void __clear_rsb(void);
452     +
453     #endif /* CONFIG_RETPOLINE */
454     diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
455     index 01727dbc294a..1e7c955b6303 100644
456     --- a/arch/x86/include/asm/barrier.h
457     +++ b/arch/x86/include/asm/barrier.h
458     @@ -24,6 +24,34 @@
459     #define wmb() asm volatile("sfence" ::: "memory")
460     #endif
461    
462     +/**
463     + * array_index_mask_nospec() - generate a mask that is ~0UL when the
464     + * bounds check succeeds and 0 otherwise
465     + * @index: array element index
466     + * @size: number of elements in array
467     + *
468     + * Returns:
469     + * 0 - (index < size)
470     + */
471     +static inline unsigned long array_index_mask_nospec(unsigned long index,
472     + unsigned long size)
473     +{
474     + unsigned long mask;
475     +
476     + asm ("cmp %1,%2; sbb %0,%0;"
477     + :"=r" (mask)
478     + :"r"(size),"r" (index)
479     + :"cc");
480     + return mask;
481     +}
482     +
483     +/* Override the default implementation from linux/nospec.h. */
484     +#define array_index_mask_nospec array_index_mask_nospec
485     +
486     +/* Prevent speculative execution past this barrier. */
487     +#define barrier_nospec() alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, \
488     + "lfence", X86_FEATURE_LFENCE_RDTSC)
489     +
490     #ifdef CONFIG_X86_PPRO_FENCE
491     #define dma_rmb() rmb()
492     #else
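
The cmp/sbb sequence above yields an all-ones mask when index < size and an all-zeroes mask otherwise, without a conditional branch that the CPU could mispredict. For illustration only (not the kernel's code), the same mask can be computed in portable C, assuming size never exceeds LONG_MAX and that right-shifting a negative long is arithmetic, as it is with the kernel's compilers:

	static inline unsigned long index_mask_sketch(unsigned long index,
						      unsigned long size)
	{
		/*
		 * The sign bit of (index | (size - 1 - index)) is set exactly
		 * when index >= size; inverting it and shifting arithmetically
		 * smears the opposite value across the whole word.
		 */
		return ~(long)(index | (size - 1UL - index)) >> (BITS_PER_LONG - 1);
	}

The inline asm variant is still preferable on x86: the compiler cannot transform it back into a branch, and cmp/sbb is cheaper than inserting an lfence speculation barrier.
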
493     diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
494     index ea9a7dde62e5..70eddb3922ff 100644
495     --- a/arch/x86/include/asm/cpufeature.h
496     +++ b/arch/x86/include/asm/cpufeature.h
497     @@ -29,6 +29,7 @@ enum cpuid_leafs
498     CPUID_8000_000A_EDX,
499     CPUID_7_ECX,
500     CPUID_8000_0007_EBX,
501     + CPUID_7_EDX,
502     };
503    
504     #ifdef CONFIG_X86_FEATURE_NAMES
505     @@ -79,8 +80,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
506     CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 15, feature_bit) || \
507     CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 16, feature_bit) || \
508     CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 17, feature_bit) || \
509     + CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 18, feature_bit) || \
510     REQUIRED_MASK_CHECK || \
511     - BUILD_BUG_ON_ZERO(NCAPINTS != 18))
512     + BUILD_BUG_ON_ZERO(NCAPINTS != 19))
513    
514     #define DISABLED_MASK_BIT_SET(feature_bit) \
515     ( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 0, feature_bit) || \
516     @@ -101,8 +103,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
517     CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 15, feature_bit) || \
518     CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 16, feature_bit) || \
519     CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 17, feature_bit) || \
520     + CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 18, feature_bit) || \
521     DISABLED_MASK_CHECK || \
522     - BUILD_BUG_ON_ZERO(NCAPINTS != 18))
523     + BUILD_BUG_ON_ZERO(NCAPINTS != 19))
524    
525     #define cpu_has(c, bit) \
526     (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
527     diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
528     index 25b9375c1484..73b5fff159a4 100644
529     --- a/arch/x86/include/asm/cpufeatures.h
530     +++ b/arch/x86/include/asm/cpufeatures.h
531     @@ -13,7 +13,7 @@
532     /*
533     * Defines x86 CPU feature bits
534     */
535     -#define NCAPINTS 18 /* N 32-bit words worth of info */
536     +#define NCAPINTS 19 /* N 32-bit words worth of info */
537     #define NBUGINTS 1 /* N 32-bit bug flags */
538    
539     /*
540     @@ -203,14 +203,14 @@
541     #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
542     #define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */
543     #define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */
544     -#define X86_FEATURE_RETPOLINE ( 7*32+12) /* Generic Retpoline mitigation for Spectre variant 2 */
545     -#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* AMD Retpoline mitigation for Spectre variant 2 */
546     +#define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */
547     +#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* "" AMD Retpoline mitigation for Spectre variant 2 */
548     #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */
549     -#define X86_FEATURE_AVX512_4VNNIW ( 7*32+16) /* AVX-512 Neural Network Instructions */
550     -#define X86_FEATURE_AVX512_4FMAPS ( 7*32+17) /* AVX-512 Multiply Accumulation Single precision */
551    
552     #define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */
553     -#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* Fill RSB on context switches */
554     +#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */
555     +
556     +#define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */
557    
558     /* Virtualization flags: Linux defined, word 8 */
559     #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
560     @@ -271,6 +271,9 @@
561     #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */
562     #define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */
563     #define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* Always save/restore FP error pointers */
564     +#define X86_FEATURE_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */
565     +#define X86_FEATURE_IBRS (13*32+14) /* Indirect Branch Restricted Speculation */
566     +#define X86_FEATURE_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */
567    
568     /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */
569     #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */
570     @@ -319,6 +322,13 @@
571     #define X86_FEATURE_SUCCOR (17*32+ 1) /* Uncorrectable error containment and recovery */
572     #define X86_FEATURE_SMCA (17*32+ 3) /* Scalable MCA */
573    
574     +/* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */
575     +#define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */
576     +#define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */
577     +#define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */
578     +#define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */
579     +#define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */
580     +
581     /*
582     * BUG word(s)
583     */
584     diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
585     index e428e16dd822..c6a3af198294 100644
586     --- a/arch/x86/include/asm/disabled-features.h
587     +++ b/arch/x86/include/asm/disabled-features.h
588     @@ -71,6 +71,7 @@
589     #define DISABLED_MASK15 0
590     #define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57)
591     #define DISABLED_MASK17 0
592     -#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18)
593     +#define DISABLED_MASK18 0
594     +#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19)
595    
596     #endif /* _ASM_X86_DISABLED_FEATURES_H */
597     diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
598     index 64c4a30e0d39..e203169931c7 100644
599     --- a/arch/x86/include/asm/fixmap.h
600     +++ b/arch/x86/include/asm/fixmap.h
601     @@ -137,8 +137,10 @@ enum fixed_addresses {
602    
603     extern void reserve_top_address(unsigned long reserve);
604    
605     -#define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
606     -#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
607     +#define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
608     +#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
609     +#define FIXADDR_TOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
610     +#define FIXADDR_TOT_START (FIXADDR_TOP - FIXADDR_TOT_SIZE)
611    
612     extern int fixmaps_set;
613    
614     diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
615     index fa11fb1fa570..eb83ff1bae8f 100644
616     --- a/arch/x86/include/asm/msr-index.h
617     +++ b/arch/x86/include/asm/msr-index.h
618     @@ -39,6 +39,13 @@
619    
620     /* Intel MSRs. Some also available on other CPUs */
621    
622     +#define MSR_IA32_SPEC_CTRL 0x00000048 /* Speculation Control */
623     +#define SPEC_CTRL_IBRS (1 << 0) /* Indirect Branch Restricted Speculation */
624     +#define SPEC_CTRL_STIBP (1 << 1) /* Single Thread Indirect Branch Predictors */
625     +
626     +#define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */
627     +#define PRED_CMD_IBPB (1 << 0) /* Indirect Branch Prediction Barrier */
628     +
629     #define MSR_PPIN_CTL 0x0000004e
630     #define MSR_PPIN 0x0000004f
631    
632     @@ -57,6 +64,11 @@
633     #define SNB_C3_AUTO_UNDEMOTE (1UL << 28)
634    
635     #define MSR_MTRRcap 0x000000fe
636     +
637     +#define MSR_IA32_ARCH_CAPABILITIES 0x0000010a
638     +#define ARCH_CAP_RDCL_NO (1 << 0) /* Not susceptible to Meltdown */
639     +#define ARCH_CAP_IBRS_ALL (1 << 1) /* Enhanced IBRS support */
640     +
641     #define MSR_IA32_BBL_CR_CTL 0x00000119
642     #define MSR_IA32_BBL_CR_CTL3 0x0000011e
643    
644     diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
645     index 07962f5f6fba..30df295f6d94 100644
646     --- a/arch/x86/include/asm/msr.h
647     +++ b/arch/x86/include/asm/msr.h
648     @@ -214,8 +214,7 @@ static __always_inline unsigned long long rdtsc_ordered(void)
649     * that some other imaginary CPU is updating continuously with a
650     * time stamp.
651     */
652     - alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
653     - "lfence", X86_FEATURE_LFENCE_RDTSC);
654     + barrier_nospec();
655     return rdtsc();
656     }
657    
658     diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
659     index 4ad41087ce0e..4d57894635f2 100644
660     --- a/arch/x86/include/asm/nospec-branch.h
661     +++ b/arch/x86/include/asm/nospec-branch.h
662     @@ -1,56 +1,12 @@
663     /* SPDX-License-Identifier: GPL-2.0 */
664    
665     -#ifndef __NOSPEC_BRANCH_H__
666     -#define __NOSPEC_BRANCH_H__
667     +#ifndef _ASM_X86_NOSPEC_BRANCH_H_
668     +#define _ASM_X86_NOSPEC_BRANCH_H_
669    
670     #include <asm/alternative.h>
671     #include <asm/alternative-asm.h>
672     #include <asm/cpufeatures.h>
673    
674     -/*
675     - * Fill the CPU return stack buffer.
676     - *
677     - * Each entry in the RSB, if used for a speculative 'ret', contains an
678     - * infinite 'pause; lfence; jmp' loop to capture speculative execution.
679     - *
680     - * This is required in various cases for retpoline and IBRS-based
681     - * mitigations for the Spectre variant 2 vulnerability. Sometimes to
682     - * eliminate potentially bogus entries from the RSB, and sometimes
683     - * purely to ensure that it doesn't get empty, which on some CPUs would
684     - * allow predictions from other (unwanted!) sources to be used.
685     - *
686     - * We define a CPP macro such that it can be used from both .S files and
687     - * inline assembly. It's possible to do a .macro and then include that
688     - * from C via asm(".include <asm/nospec-branch.h>") but let's not go there.
689     - */
690     -
691     -#define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */
692     -#define RSB_FILL_LOOPS 16 /* To avoid underflow */
693     -
694     -/*
695     - * Google experimented with loop-unrolling and this turned out to be
696     - * the optimal version — two calls, each with their own speculation
697     - * trap should their return address end up getting used, in a loop.
698     - */
699     -#define __FILL_RETURN_BUFFER(reg, nr, sp) \
700     - mov $(nr/2), reg; \
701     -771: \
702     - call 772f; \
703     -773: /* speculation trap */ \
704     - pause; \
705     - lfence; \
706     - jmp 773b; \
707     -772: \
708     - call 774f; \
709     -775: /* speculation trap */ \
710     - pause; \
711     - lfence; \
712     - jmp 775b; \
713     -774: \
714     - dec reg; \
715     - jnz 771b; \
716     - add $(BITS_PER_LONG/8) * nr, sp;
717     -
718     #ifdef __ASSEMBLY__
719    
720     /*
721     @@ -121,17 +77,10 @@
722     #endif
723     .endm
724    
725     - /*
726     - * A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP
727     - * monstrosity above, manually.
728     - */
729     -.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req
730     +/* This clobbers the BX register */
731     +.macro FILL_RETURN_BUFFER nr:req ftr:req
732     #ifdef CONFIG_RETPOLINE
733     - ANNOTATE_NOSPEC_ALTERNATIVE
734     - ALTERNATIVE "jmp .Lskip_rsb_\@", \
735     - __stringify(__FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP)) \
736     - \ftr
737     -.Lskip_rsb_\@:
738     + ALTERNATIVE "", "call __clear_rsb", \ftr
739     #endif
740     .endm
741    
742     @@ -201,22 +150,25 @@ extern char __indirect_thunk_end[];
743     * On VMEXIT we must ensure that no RSB predictions learned in the guest
744     * can be followed in the host, by overwriting the RSB completely. Both
745     * retpoline and IBRS mitigations for Spectre v2 need this; only on future
746     - * CPUs with IBRS_ATT *might* it be avoided.
747     + * CPUs with IBRS_ALL *might* it be avoided.
748     */
749     static inline void vmexit_fill_RSB(void)
750     {
751     #ifdef CONFIG_RETPOLINE
752     - unsigned long loops;
753     -
754     - asm volatile (ANNOTATE_NOSPEC_ALTERNATIVE
755     - ALTERNATIVE("jmp 910f",
756     - __stringify(__FILL_RETURN_BUFFER(%0, RSB_CLEAR_LOOPS, %1)),
757     - X86_FEATURE_RETPOLINE)
758     - "910:"
759     - : "=r" (loops), ASM_CALL_CONSTRAINT
760     - : : "memory" );
761     + alternative_input("",
762     + "call __fill_rsb",
763     + X86_FEATURE_RETPOLINE,
764     + ASM_NO_INPUT_CLOBBER(_ASM_BX, "memory"));
765     #endif
766     }
767    
768     +static inline void indirect_branch_prediction_barrier(void)
769     +{
770     + alternative_input("",
771     + "call __ibp_barrier",
772     + X86_FEATURE_USE_IBPB,
773     + ASM_NO_INPUT_CLOBBER("eax", "ecx", "edx", "memory"));
774     +}
775     +
776     #endif /* __ASSEMBLY__ */
777     -#endif /* __NOSPEC_BRANCH_H__ */
778     +#endif /* _ASM_X86_NOSPEC_BRANCH_H_ */
779     diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h
780     index ce245b0cdfca..0777e18a1d23 100644
781     --- a/arch/x86/include/asm/pgtable_32_types.h
782     +++ b/arch/x86/include/asm/pgtable_32_types.h
783     @@ -44,8 +44,9 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */
784     */
785     #define CPU_ENTRY_AREA_PAGES (NR_CPUS * 40)
786    
787     -#define CPU_ENTRY_AREA_BASE \
788     - ((FIXADDR_START - PAGE_SIZE * (CPU_ENTRY_AREA_PAGES + 1)) & PMD_MASK)
789     +#define CPU_ENTRY_AREA_BASE \
790     + ((FIXADDR_TOT_START - PAGE_SIZE * (CPU_ENTRY_AREA_PAGES + 1)) \
791     + & PMD_MASK)
792    
793     #define PKMAP_BASE \
794     ((CPU_ENTRY_AREA_BASE - PAGE_SIZE) & PMD_MASK)
795     diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
796     index 9c18da64daa9..c57c6e77c29f 100644
797     --- a/arch/x86/include/asm/processor.h
798     +++ b/arch/x86/include/asm/processor.h
799     @@ -459,8 +459,6 @@ struct thread_struct {
800     unsigned short gsindex;
801     #endif
802    
803     - u32 status; /* thread synchronous flags */
804     -
805     #ifdef CONFIG_X86_64
806     unsigned long fsbase;
807     unsigned long gsbase;
808     @@ -970,4 +968,7 @@ bool xen_set_default_idle(void);
809    
810     void stop_this_cpu(void *dummy);
811     void df_debug(struct pt_regs *regs, long error_code);
812     +
813     +void __ibp_barrier(void);
814     +
815     #endif /* _ASM_X86_PROCESSOR_H */
816     diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
817     index d91ba04dd007..fb3a6de7440b 100644
818     --- a/arch/x86/include/asm/required-features.h
819     +++ b/arch/x86/include/asm/required-features.h
820     @@ -106,6 +106,7 @@
821     #define REQUIRED_MASK15 0
822     #define REQUIRED_MASK16 (NEED_LA57)
823     #define REQUIRED_MASK17 0
824     -#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18)
825     +#define REQUIRED_MASK18 0
826     +#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19)
827    
828     #endif /* _ASM_X86_REQUIRED_FEATURES_H */
829     diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
830     index e3c95e8e61c5..03eedc21246d 100644
831     --- a/arch/x86/include/asm/syscall.h
832     +++ b/arch/x86/include/asm/syscall.h
833     @@ -60,7 +60,7 @@ static inline long syscall_get_error(struct task_struct *task,
834     * TS_COMPAT is set for 32-bit syscall entries and then
835     * remains set until we return to user mode.
836     */
837     - if (task->thread.status & (TS_COMPAT|TS_I386_REGS_POKED))
838     + if (task->thread_info.status & (TS_COMPAT|TS_I386_REGS_POKED))
839     /*
840     * Sign-extend the value so (int)-EFOO becomes (long)-EFOO
841     * and will match correctly in comparisons.
842     @@ -116,7 +116,7 @@ static inline void syscall_get_arguments(struct task_struct *task,
843     unsigned long *args)
844     {
845     # ifdef CONFIG_IA32_EMULATION
846     - if (task->thread.status & TS_COMPAT)
847     + if (task->thread_info.status & TS_COMPAT)
848     switch (i) {
849     case 0:
850     if (!n--) break;
851     @@ -177,7 +177,7 @@ static inline void syscall_set_arguments(struct task_struct *task,
852     const unsigned long *args)
853     {
854     # ifdef CONFIG_IA32_EMULATION
855     - if (task->thread.status & TS_COMPAT)
856     + if (task->thread_info.status & TS_COMPAT)
857     switch (i) {
858     case 0:
859     if (!n--) break;
860     diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
861     index 00223333821a..eda3b6823ca4 100644
862     --- a/arch/x86/include/asm/thread_info.h
863     +++ b/arch/x86/include/asm/thread_info.h
864     @@ -55,6 +55,7 @@ struct task_struct;
865    
866     struct thread_info {
867     unsigned long flags; /* low level flags */
868     + u32 status; /* thread synchronous flags */
869     };
870    
871     #define INIT_THREAD_INFO(tsk) \
872     @@ -221,7 +222,7 @@ static inline int arch_within_stack_frames(const void * const stack,
873     #define in_ia32_syscall() true
874     #else
875     #define in_ia32_syscall() (IS_ENABLED(CONFIG_IA32_EMULATION) && \
876     - current->thread.status & TS_COMPAT)
877     + current_thread_info()->status & TS_COMPAT)
878     #endif
879    
880     /*
881     diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
882     index 3effd3c994af..4405c4b308e8 100644
883     --- a/arch/x86/include/asm/tlbflush.h
884     +++ b/arch/x86/include/asm/tlbflush.h
885     @@ -174,6 +174,8 @@ struct tlb_state {
886     struct mm_struct *loaded_mm;
887     u16 loaded_mm_asid;
888     u16 next_asid;
889     + /* last user mm's ctx id */
890     + u64 last_ctx_id;
891    
892     /*
893     * We can be in one of several states:
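
The new last_ctx_id field remembers which user mm last ran on this CPU, so the context-switch path can issue an indirect branch prediction barrier only when a different user address space is scheduled in. A rough sketch of that use (cond_ibpb() and next_mm are illustrative names; this is not the kernel's exact code):

	#include <linux/mm_types.h>
	#include <asm/tlbflush.h>
	#include <asm/nospec-branch.h>

	static void cond_ibpb(struct mm_struct *next_mm)
	{
		if (!next_mm)
			return;	/* kernel thread: no user predictions to flush */

		/* flush the branch predictor only when switching user mms */
		if (next_mm->context.ctx_id !=
		    this_cpu_read(cpu_tlbstate.last_ctx_id))
			indirect_branch_prediction_barrier();

		this_cpu_write(cpu_tlbstate.last_ctx_id,
			       next_mm->context.ctx_id);
	}
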
894     diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
895     index 574dff4d2913..aae77eb8491c 100644
896     --- a/arch/x86/include/asm/uaccess.h
897     +++ b/arch/x86/include/asm/uaccess.h
898     @@ -124,6 +124,11 @@ extern int __get_user_bad(void);
899    
900     #define __uaccess_begin() stac()
901     #define __uaccess_end() clac()
902     +#define __uaccess_begin_nospec() \
903     +({ \
904     + stac(); \
905     + barrier_nospec(); \
906     +})
907    
908     /*
909     * This is a type: either unsigned long, if the argument fits into
910     @@ -445,7 +450,7 @@ do { \
911     ({ \
912     int __gu_err; \
913     __inttype(*(ptr)) __gu_val; \
914     - __uaccess_begin(); \
915     + __uaccess_begin_nospec(); \
916     __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \
917     __uaccess_end(); \
918     (x) = (__force __typeof__(*(ptr)))__gu_val; \
919     @@ -487,6 +492,10 @@ struct __large_struct { unsigned long buf[100]; };
920     __uaccess_begin(); \
921     barrier();
922    
923     +#define uaccess_try_nospec do { \
924     + current->thread.uaccess_err = 0; \
925     + __uaccess_begin_nospec(); \
926     +
927     #define uaccess_catch(err) \
928     __uaccess_end(); \
929     (err) |= (current->thread.uaccess_err ? -EFAULT : 0); \
930     @@ -548,7 +557,7 @@ struct __large_struct { unsigned long buf[100]; };
931     * get_user_ex(...);
932     * } get_user_catch(err)
933     */
934     -#define get_user_try uaccess_try
935     +#define get_user_try uaccess_try_nospec
936     #define get_user_catch(err) uaccess_catch(err)
937    
938     #define get_user_ex(x, ptr) do { \
939     @@ -582,7 +591,7 @@ extern void __cmpxchg_wrong_size(void)
940     __typeof__(ptr) __uval = (uval); \
941     __typeof__(*(ptr)) __old = (old); \
942     __typeof__(*(ptr)) __new = (new); \
943     - __uaccess_begin(); \
944     + __uaccess_begin_nospec(); \
945     switch (size) { \
946     case 1: \
947     { \
948     diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
949     index 72950401b223..ba2dc1930630 100644
950     --- a/arch/x86/include/asm/uaccess_32.h
951     +++ b/arch/x86/include/asm/uaccess_32.h
952     @@ -29,21 +29,21 @@ raw_copy_from_user(void *to, const void __user *from, unsigned long n)
953     switch (n) {
954     case 1:
955     ret = 0;
956     - __uaccess_begin();
957     + __uaccess_begin_nospec();
958     __get_user_asm_nozero(*(u8 *)to, from, ret,
959     "b", "b", "=q", 1);
960     __uaccess_end();
961     return ret;
962     case 2:
963     ret = 0;
964     - __uaccess_begin();
965     + __uaccess_begin_nospec();
966     __get_user_asm_nozero(*(u16 *)to, from, ret,
967     "w", "w", "=r", 2);
968     __uaccess_end();
969     return ret;
970     case 4:
971     ret = 0;
972     - __uaccess_begin();
973     + __uaccess_begin_nospec();
974     __get_user_asm_nozero(*(u32 *)to, from, ret,
975     "l", "k", "=r", 4);
976     __uaccess_end();
977     diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
978     index f07ef3c575db..62546b3a398e 100644
979     --- a/arch/x86/include/asm/uaccess_64.h
980     +++ b/arch/x86/include/asm/uaccess_64.h
981     @@ -55,31 +55,31 @@ raw_copy_from_user(void *dst, const void __user *src, unsigned long size)
982     return copy_user_generic(dst, (__force void *)src, size);
983     switch (size) {
984     case 1:
985     - __uaccess_begin();
986     + __uaccess_begin_nospec();
987     __get_user_asm_nozero(*(u8 *)dst, (u8 __user *)src,
988     ret, "b", "b", "=q", 1);
989     __uaccess_end();
990     return ret;
991     case 2:
992     - __uaccess_begin();
993     + __uaccess_begin_nospec();
994     __get_user_asm_nozero(*(u16 *)dst, (u16 __user *)src,
995     ret, "w", "w", "=r", 2);
996     __uaccess_end();
997     return ret;
998     case 4:
999     - __uaccess_begin();
1000     + __uaccess_begin_nospec();
1001     __get_user_asm_nozero(*(u32 *)dst, (u32 __user *)src,
1002     ret, "l", "k", "=r", 4);
1003     __uaccess_end();
1004     return ret;
1005     case 8:
1006     - __uaccess_begin();
1007     + __uaccess_begin_nospec();
1008     __get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src,
1009     ret, "q", "", "=r", 8);
1010     __uaccess_end();
1011     return ret;
1012     case 10:
1013     - __uaccess_begin();
1014     + __uaccess_begin_nospec();
1015     __get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src,
1016     ret, "q", "", "=r", 10);
1017     if (likely(!ret))
1018     @@ -89,7 +89,7 @@ raw_copy_from_user(void *dst, const void __user *src, unsigned long size)
1019     __uaccess_end();
1020     return ret;
1021     case 16:
1022     - __uaccess_begin();
1023     + __uaccess_begin_nospec();
1024     __get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src,
1025     ret, "q", "", "=r", 16);
1026     if (likely(!ret))
1027     diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
1028     index e0b97e4d1db5..21be0193d9dc 100644
1029     --- a/arch/x86/kernel/alternative.c
1030     +++ b/arch/x86/kernel/alternative.c
1031     @@ -46,17 +46,6 @@ static int __init setup_noreplace_smp(char *str)
1032     }
1033     __setup("noreplace-smp", setup_noreplace_smp);
1034    
1035     -#ifdef CONFIG_PARAVIRT
1036     -static int __initdata_or_module noreplace_paravirt = 0;
1037     -
1038     -static int __init setup_noreplace_paravirt(char *str)
1039     -{
1040     - noreplace_paravirt = 1;
1041     - return 1;
1042     -}
1043     -__setup("noreplace-paravirt", setup_noreplace_paravirt);
1044     -#endif
1045     -
1046     #define DPRINTK(fmt, args...) \
1047     do { \
1048     if (debug_alternative) \
1049     @@ -298,7 +287,7 @@ recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf)
1050     tgt_rip = next_rip + o_dspl;
1051     n_dspl = tgt_rip - orig_insn;
1052    
1053     - DPRINTK("target RIP: %p, new_displ: 0x%x", tgt_rip, n_dspl);
1054     + DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl);
1055    
1056     if (tgt_rip - orig_insn >= 0) {
1057     if (n_dspl - 2 <= 127)
1058     @@ -355,7 +344,7 @@ static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *ins
1059     add_nops(instr + (a->instrlen - a->padlen), a->padlen);
1060     local_irq_restore(flags);
1061    
1062     - DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ",
1063     + DUMP_BYTES(instr, a->instrlen, "%px: [%d:%d) optimized NOPs: ",
1064     instr, a->instrlen - a->padlen, a->padlen);
1065     }
1066    
1067     @@ -376,7 +365,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
1068     u8 *instr, *replacement;
1069     u8 insnbuf[MAX_PATCH_LEN];
1070    
1071     - DPRINTK("alt table %p -> %p", start, end);
1072     + DPRINTK("alt table %px, -> %px", start, end);
1073     /*
1074     * The scan order should be from start to end. A later scanned
1075     * alternative code can overwrite previously scanned alternative code.
1076     @@ -400,14 +389,14 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
1077     continue;
1078     }
1079    
1080     - DPRINTK("feat: %d*32+%d, old: (%p, len: %d), repl: (%p, len: %d), pad: %d",
1081     + DPRINTK("feat: %d*32+%d, old: (%px len: %d), repl: (%px, len: %d), pad: %d",
1082     a->cpuid >> 5,
1083     a->cpuid & 0x1f,
1084     instr, a->instrlen,
1085     replacement, a->replacementlen, a->padlen);
1086    
1087     - DUMP_BYTES(instr, a->instrlen, "%p: old_insn: ", instr);
1088     - DUMP_BYTES(replacement, a->replacementlen, "%p: rpl_insn: ", replacement);
1089     + DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr);
1090     + DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
1091    
1092     memcpy(insnbuf, replacement, a->replacementlen);
1093     insnbuf_sz = a->replacementlen;
1094     @@ -433,7 +422,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
1095     a->instrlen - a->replacementlen);
1096     insnbuf_sz += a->instrlen - a->replacementlen;
1097     }
1098     - DUMP_BYTES(insnbuf, insnbuf_sz, "%p: final_insn: ", instr);
1099     + DUMP_BYTES(insnbuf, insnbuf_sz, "%px: final_insn: ", instr);
1100    
1101     text_poke_early(instr, insnbuf, insnbuf_sz);
1102     }
1103     @@ -599,9 +588,6 @@ void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
1104     struct paravirt_patch_site *p;
1105     char insnbuf[MAX_PATCH_LEN];
1106    
1107     - if (noreplace_paravirt)
1108     - return;
1109     -
1110     for (p = start; p < end; p++) {
1111     unsigned int used;
1112    
1113     diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
1114     index 390b3dc3d438..71949bf2de5a 100644
1115     --- a/arch/x86/kernel/cpu/bugs.c
1116     +++ b/arch/x86/kernel/cpu/bugs.c
1117     @@ -11,6 +11,7 @@
1118     #include <linux/init.h>
1119     #include <linux/utsname.h>
1120     #include <linux/cpu.h>
1121     +#include <linux/module.h>
1122    
1123     #include <asm/nospec-branch.h>
1124     #include <asm/cmdline.h>
1125     @@ -90,20 +91,41 @@ static const char *spectre_v2_strings[] = {
1126     };
1127    
1128     #undef pr_fmt
1129     -#define pr_fmt(fmt) "Spectre V2 mitigation: " fmt
1130     +#define pr_fmt(fmt) "Spectre V2 : " fmt
1131    
1132     static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE;
1133    
1134     +#ifdef RETPOLINE
1135     +static bool spectre_v2_bad_module;
1136     +
1137     +bool retpoline_module_ok(bool has_retpoline)
1138     +{
1139     + if (spectre_v2_enabled == SPECTRE_V2_NONE || has_retpoline)
1140     + return true;
1141     +
1142     + pr_err("System may be vulnerable to spectre v2\n");
1143     + spectre_v2_bad_module = true;
1144     + return false;
1145     +}
1146     +
1147     +static inline const char *spectre_v2_module_string(void)
1148     +{
1149     + return spectre_v2_bad_module ? " - vulnerable module loaded" : "";
1150     +}
1151     +#else
1152     +static inline const char *spectre_v2_module_string(void) { return ""; }
1153     +#endif
1154     +
1155     static void __init spec2_print_if_insecure(const char *reason)
1156     {
1157     if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
1158     - pr_info("%s\n", reason);
1159     + pr_info("%s selected on command line.\n", reason);
1160     }
1161    
1162     static void __init spec2_print_if_secure(const char *reason)
1163     {
1164     if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
1165     - pr_info("%s\n", reason);
1166     + pr_info("%s selected on command line.\n", reason);
1167     }
1168    
1169     static inline bool retp_compiler(void)
1170     @@ -118,42 +140,68 @@ static inline bool match_option(const char *arg, int arglen, const char *opt)
1171     return len == arglen && !strncmp(arg, opt, len);
1172     }
1173    
1174     +static const struct {
1175     + const char *option;
1176     + enum spectre_v2_mitigation_cmd cmd;
1177     + bool secure;
1178     +} mitigation_options[] = {
1179     + { "off", SPECTRE_V2_CMD_NONE, false },
1180     + { "on", SPECTRE_V2_CMD_FORCE, true },
1181     + { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false },
1182     + { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_AMD, false },
1183     + { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false },
1184     + { "auto", SPECTRE_V2_CMD_AUTO, false },
1185     +};
1186     +
1187     static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
1188     {
1189     char arg[20];
1190     - int ret;
1191     -
1192     - ret = cmdline_find_option(boot_command_line, "spectre_v2", arg,
1193     - sizeof(arg));
1194     - if (ret > 0) {
1195     - if (match_option(arg, ret, "off")) {
1196     - goto disable;
1197     - } else if (match_option(arg, ret, "on")) {
1198     - spec2_print_if_secure("force enabled on command line.");
1199     - return SPECTRE_V2_CMD_FORCE;
1200     - } else if (match_option(arg, ret, "retpoline")) {
1201     - spec2_print_if_insecure("retpoline selected on command line.");
1202     - return SPECTRE_V2_CMD_RETPOLINE;
1203     - } else if (match_option(arg, ret, "retpoline,amd")) {
1204     - if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
1205     - pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n");
1206     - return SPECTRE_V2_CMD_AUTO;
1207     - }
1208     - spec2_print_if_insecure("AMD retpoline selected on command line.");
1209     - return SPECTRE_V2_CMD_RETPOLINE_AMD;
1210     - } else if (match_option(arg, ret, "retpoline,generic")) {
1211     - spec2_print_if_insecure("generic retpoline selected on command line.");
1212     - return SPECTRE_V2_CMD_RETPOLINE_GENERIC;
1213     - } else if (match_option(arg, ret, "auto")) {
1214     + int ret, i;
1215     + enum spectre_v2_mitigation_cmd cmd = SPECTRE_V2_CMD_AUTO;
1216     +
1217     + if (cmdline_find_option_bool(boot_command_line, "nospectre_v2"))
1218     + return SPECTRE_V2_CMD_NONE;
1219     + else {
1220     + ret = cmdline_find_option(boot_command_line, "spectre_v2", arg,
1221     + sizeof(arg));
1222     + if (ret < 0)
1223     + return SPECTRE_V2_CMD_AUTO;
1224     +
1225     + for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) {
1226     + if (!match_option(arg, ret, mitigation_options[i].option))
1227     + continue;
1228     + cmd = mitigation_options[i].cmd;
1229     + break;
1230     + }
1231     +
1232     + if (i >= ARRAY_SIZE(mitigation_options)) {
1233     + pr_err("unknown option (%s). Switching to AUTO select\n",
1234     + mitigation_options[i].option);
1235     return SPECTRE_V2_CMD_AUTO;
1236     }
1237     }
1238    
1239     - if (!cmdline_find_option_bool(boot_command_line, "nospectre_v2"))
1240     + if ((cmd == SPECTRE_V2_CMD_RETPOLINE ||
1241     + cmd == SPECTRE_V2_CMD_RETPOLINE_AMD ||
1242     + cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC) &&
1243     + !IS_ENABLED(CONFIG_RETPOLINE)) {
1244     + pr_err("%s selected but not compiled in. Switching to AUTO select\n",
1245     + mitigation_options[i].option);
1246     return SPECTRE_V2_CMD_AUTO;
1247     -disable:
1248     - spec2_print_if_insecure("disabled on command line.");
1249     - return SPECTRE_V2_CMD_NONE;
1250     + }
1251     +
1252     + if (cmd == SPECTRE_V2_CMD_RETPOLINE_AMD &&
1253     + boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
1254     + pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n");
1255     + return SPECTRE_V2_CMD_AUTO;
1256     + }
1257     +
1258     + if (mitigation_options[i].secure)
1259     + spec2_print_if_secure(mitigation_options[i].option);
1260     + else
1261     + spec2_print_if_insecure(mitigation_options[i].option);
1262     +
1263     + return cmd;
1264     }
1265    
1266     /* Check for Skylake-like CPUs (for RSB handling) */
1267     @@ -191,10 +239,10 @@ static void __init spectre_v2_select_mitigation(void)
1268     return;
1269    
1270     case SPECTRE_V2_CMD_FORCE:
1271     - /* FALLTRHU */
1272     case SPECTRE_V2_CMD_AUTO:
1273     - goto retpoline_auto;
1274     -
1275     + if (IS_ENABLED(CONFIG_RETPOLINE))
1276     + goto retpoline_auto;
1277     + break;
1278     case SPECTRE_V2_CMD_RETPOLINE_AMD:
1279     if (IS_ENABLED(CONFIG_RETPOLINE))
1280     goto retpoline_amd;
1281     @@ -249,6 +297,12 @@ static void __init spectre_v2_select_mitigation(void)
1282     setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
1283     pr_info("Filling RSB on context switch\n");
1284     }
1285     +
1286     + /* Initialize Indirect Branch Prediction Barrier if supported */
1287     + if (boot_cpu_has(X86_FEATURE_IBPB)) {
1288     + setup_force_cpu_cap(X86_FEATURE_USE_IBPB);
1289     + pr_info("Enabling Indirect Branch Prediction Barrier\n");
1290     + }
1291     }
1292    
1293     #undef pr_fmt
1294     @@ -269,7 +323,7 @@ ssize_t cpu_show_spectre_v1(struct device *dev,
1295     {
1296     if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1))
1297     return sprintf(buf, "Not affected\n");
1298     - return sprintf(buf, "Vulnerable\n");
1299     + return sprintf(buf, "Mitigation: __user pointer sanitization\n");
1300     }
1301    
1302     ssize_t cpu_show_spectre_v2(struct device *dev,
1303     @@ -278,6 +332,14 @@ ssize_t cpu_show_spectre_v2(struct device *dev,
1304     if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
1305     return sprintf(buf, "Not affected\n");
1306    
1307     - return sprintf(buf, "%s\n", spectre_v2_strings[spectre_v2_enabled]);
1308     + return sprintf(buf, "%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
1309     + boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "",
1310     + spectre_v2_module_string());
1311     }
1312     #endif
1313     +
1314     +void __ibp_barrier(void)
1315     +{
1316     + __wrmsr(MSR_IA32_PRED_CMD, PRED_CMD_IBPB, 0);
1317     +}
1318     +EXPORT_SYMBOL_GPL(__ibp_barrier);
1319     diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
1320     index 372ba3fb400f..92b66e21bae5 100644
1321     --- a/arch/x86/kernel/cpu/common.c
1322     +++ b/arch/x86/kernel/cpu/common.c
1323     @@ -47,6 +47,8 @@
1324     #include <asm/pat.h>
1325     #include <asm/microcode.h>
1326     #include <asm/microcode_intel.h>
1327     +#include <asm/intel-family.h>
1328     +#include <asm/cpu_device_id.h>
1329    
1330     #ifdef CONFIG_X86_LOCAL_APIC
1331     #include <asm/uv/uv.h>
1332     @@ -724,6 +726,26 @@ static void apply_forced_caps(struct cpuinfo_x86 *c)
1333     }
1334     }
1335    
1336     +static void init_speculation_control(struct cpuinfo_x86 *c)
1337     +{
1338     + /*
1339     + * The Intel SPEC_CTRL CPUID bit implies IBRS and IBPB support,
1340     + * and they also have a different bit for STIBP support. Also,
1341     + * a hypervisor might have set the individual AMD bits even on
1342     + * Intel CPUs, for finer-grained selection of what's available.
1343     + *
1344     + * We use the AMD bits in 0x8000_0008 EBX as the generic hardware
1345     + * features, which are visible in /proc/cpuinfo and used by the
1346     + * kernel. So set those accordingly from the Intel bits.
1347     + */
1348     + if (cpu_has(c, X86_FEATURE_SPEC_CTRL)) {
1349     + set_cpu_cap(c, X86_FEATURE_IBRS);
1350     + set_cpu_cap(c, X86_FEATURE_IBPB);
1351     + }
1352     + if (cpu_has(c, X86_FEATURE_INTEL_STIBP))
1353     + set_cpu_cap(c, X86_FEATURE_STIBP);
1354     +}
1355     +
1356     void get_cpu_cap(struct cpuinfo_x86 *c)
1357     {
1358     u32 eax, ebx, ecx, edx;
1359     @@ -745,6 +767,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
1360     cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
1361     c->x86_capability[CPUID_7_0_EBX] = ebx;
1362     c->x86_capability[CPUID_7_ECX] = ecx;
1363     + c->x86_capability[CPUID_7_EDX] = edx;
1364     }
1365    
1366     /* Extended state features: level 0x0000000d */
1367     @@ -817,6 +840,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
1368     c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a);
1369    
1370     init_scattered_cpuid_features(c);
1371     + init_speculation_control(c);
1372    
1373     /*
1374     * Clear/Set all flags overridden by options, after probe.
1375     @@ -852,6 +876,41 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
1376     #endif
1377     }
1378    
1379     +static const __initconst struct x86_cpu_id cpu_no_speculation[] = {
1380     + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CEDARVIEW, X86_FEATURE_ANY },
1381     + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CLOVERVIEW, X86_FEATURE_ANY },
1382     + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_LINCROFT, X86_FEATURE_ANY },
1383     + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PENWELL, X86_FEATURE_ANY },
1384     + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PINEVIEW, X86_FEATURE_ANY },
1385     + { X86_VENDOR_CENTAUR, 5 },
1386     + { X86_VENDOR_INTEL, 5 },
1387     + { X86_VENDOR_NSC, 5 },
1388     + { X86_VENDOR_ANY, 4 },
1389     + {}
1390     +};
1391     +
1392     +static const __initconst struct x86_cpu_id cpu_no_meltdown[] = {
1393     + { X86_VENDOR_AMD },
1394     + {}
1395     +};
1396     +
1397     +static bool __init cpu_vulnerable_to_meltdown(struct cpuinfo_x86 *c)
1398     +{
1399     + u64 ia32_cap = 0;
1400     +
1401     + if (x86_match_cpu(cpu_no_meltdown))
1402     + return false;
1403     +
1404     + if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES))
1405     + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
1406     +
1407     + /* Rogue Data Cache Load? No! */
1408     + if (ia32_cap & ARCH_CAP_RDCL_NO)
1409     + return false;
1410     +
1411     + return true;
1412     +}
1413     +
1414     /*
1415     * Do minimum CPU detection early.
1416     * Fields really needed: vendor, cpuid_level, family, model, mask,
1417     @@ -899,11 +958,12 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
1418    
1419     setup_force_cpu_cap(X86_FEATURE_ALWAYS);
1420    
1421     - if (c->x86_vendor != X86_VENDOR_AMD)
1422     - setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
1423     -
1424     - setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
1425     - setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
1426     + if (!x86_match_cpu(cpu_no_speculation)) {
1427     + if (cpu_vulnerable_to_meltdown(c))
1428     + setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
1429     + setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
1430     + setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
1431     + }
1432    
1433     fpu__init_system(c);
1434    
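
The cpu_no_speculation and cpu_no_meltdown tables above follow the usual x86_cpu_id pattern: an array of vendor/family/model descriptors terminated by an empty sentinel entry and walked by x86_match_cpu(). A minimal standalone sketch of that pattern follows; struct cpu_id, cpu_matches(), ID_ANY and the vendor codes are illustrative stand-ins, not the kernel's API or table contents.

    /*
     * Standalone sketch (not the kernel's x86_match_cpu()): walk a
     * sentinel-terminated table of vendor/family/model triples and
     * report whether the given CPU matches any entry.
     */
    #include <stdio.h>

    #define ID_ANY 0xffff

    struct cpu_id {
        unsigned short vendor;      /* all-zero entry terminates the table */
        unsigned short family;
        unsigned short model;
    };

    /* Made-up vendor codes: 1 = "intel-like", 2 = "other". */
    static const struct cpu_id no_speculation[] = {
        { 1,      6, 0x36   },      /* one specific family-6 model */
        { 1,      5, ID_ANY },      /* every family-5 part from vendor 1 */
        { ID_ANY, 4, ID_ANY },      /* anything of family 4 */
        { 0 }                       /* sentinel */
    };

    static int cpu_matches(const struct cpu_id *t, unsigned short vendor,
                           unsigned short family, unsigned short model)
    {
        /* Walk entries until the all-zero sentinel. */
        for (; t->vendor || t->family || t->model; t++) {
            if ((t->vendor == ID_ANY || t->vendor == vendor) &&
                (t->family == ID_ANY || t->family == family) &&
                (t->model  == ID_ANY || t->model  == model))
                return 1;
        }
        return 0;
    }

    int main(void)
    {
        printf("family 4, any vendor: %d\n", cpu_matches(no_speculation, 2, 4, 0x03));
        printf("family 6, model 0x3c: %d\n", cpu_matches(no_speculation, 1, 6, 0x3c));
        return 0;
    }
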
1435     diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
1436     index b720dacac051..4cf4f8cbc69d 100644
1437     --- a/arch/x86/kernel/cpu/intel.c
1438     +++ b/arch/x86/kernel/cpu/intel.c
1439     @@ -102,6 +102,59 @@ static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c)
1440     ELF_HWCAP2 |= HWCAP2_RING3MWAIT;
1441     }
1442    
1443     +/*
1444     + * Early microcode releases for the Spectre v2 mitigation were broken.
1445     + * Information taken from;
1446     + * - https://newsroom.intel.com/wp-content/uploads/sites/11/2018/01/microcode-update-guidance.pdf
1447     + * - https://kb.vmware.com/s/article/52345
1448     + * - Microcode revisions observed in the wild
1449     + * - Release note from 20180108 microcode release
1450     + */
1451     +struct sku_microcode {
1452     + u8 model;
1453     + u8 stepping;
1454     + u32 microcode;
1455     +};
1456     +static const struct sku_microcode spectre_bad_microcodes[] = {
1457     + { INTEL_FAM6_KABYLAKE_DESKTOP, 0x0B, 0x84 },
1458     + { INTEL_FAM6_KABYLAKE_DESKTOP, 0x0A, 0x84 },
1459     + { INTEL_FAM6_KABYLAKE_DESKTOP, 0x09, 0x84 },
1460     + { INTEL_FAM6_KABYLAKE_MOBILE, 0x0A, 0x84 },
1461     + { INTEL_FAM6_KABYLAKE_MOBILE, 0x09, 0x84 },
1462     + { INTEL_FAM6_SKYLAKE_X, 0x03, 0x0100013e },
1463     + { INTEL_FAM6_SKYLAKE_X, 0x04, 0x0200003c },
1464     + { INTEL_FAM6_SKYLAKE_MOBILE, 0x03, 0xc2 },
1465     + { INTEL_FAM6_SKYLAKE_DESKTOP, 0x03, 0xc2 },
1466     + { INTEL_FAM6_BROADWELL_CORE, 0x04, 0x28 },
1467     + { INTEL_FAM6_BROADWELL_GT3E, 0x01, 0x1b },
1468     + { INTEL_FAM6_BROADWELL_XEON_D, 0x02, 0x14 },
1469     + { INTEL_FAM6_BROADWELL_XEON_D, 0x03, 0x07000011 },
1470     + { INTEL_FAM6_BROADWELL_X, 0x01, 0x0b000025 },
1471     + { INTEL_FAM6_HASWELL_ULT, 0x01, 0x21 },
1472     + { INTEL_FAM6_HASWELL_GT3E, 0x01, 0x18 },
1473     + { INTEL_FAM6_HASWELL_CORE, 0x03, 0x23 },
1474     + { INTEL_FAM6_HASWELL_X, 0x02, 0x3b },
1475     + { INTEL_FAM6_HASWELL_X, 0x04, 0x10 },
1476     + { INTEL_FAM6_IVYBRIDGE_X, 0x04, 0x42a },
1477     + /* Updated in the 20180108 release; blacklist until we know otherwise */
1478     + { INTEL_FAM6_ATOM_GEMINI_LAKE, 0x01, 0x22 },
1479     + /* Observed in the wild */
1480     + { INTEL_FAM6_SANDYBRIDGE_X, 0x06, 0x61b },
1481     + { INTEL_FAM6_SANDYBRIDGE_X, 0x07, 0x712 },
1482     +};
1483     +
1484     +static bool bad_spectre_microcode(struct cpuinfo_x86 *c)
1485     +{
1486     + int i;
1487     +
1488     + for (i = 0; i < ARRAY_SIZE(spectre_bad_microcodes); i++) {
1489     + if (c->x86_model == spectre_bad_microcodes[i].model &&
1490     + c->x86_mask == spectre_bad_microcodes[i].stepping)
1491     + return (c->microcode <= spectre_bad_microcodes[i].microcode);
1492     + }
1493     + return false;
1494     +}
1495     +
1496     static void early_init_intel(struct cpuinfo_x86 *c)
1497     {
1498     u64 misc_enable;
1499     @@ -122,6 +175,19 @@ static void early_init_intel(struct cpuinfo_x86 *c)
1500     if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64))
1501     c->microcode = intel_get_microcode_revision();
1502    
1503     + /* Now if any of them are set, check the blacklist and clear the lot */
1504     + if ((cpu_has(c, X86_FEATURE_SPEC_CTRL) ||
1505     + cpu_has(c, X86_FEATURE_INTEL_STIBP) ||
1506     + cpu_has(c, X86_FEATURE_IBRS) || cpu_has(c, X86_FEATURE_IBPB) ||
1507     + cpu_has(c, X86_FEATURE_STIBP)) && bad_spectre_microcode(c)) {
1508     + pr_warn("Intel Spectre v2 broken microcode detected; disabling Speculation Control\n");
1509     + setup_clear_cpu_cap(X86_FEATURE_IBRS);
1510     + setup_clear_cpu_cap(X86_FEATURE_IBPB);
1511     + setup_clear_cpu_cap(X86_FEATURE_STIBP);
1512     + setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL);
1513     + setup_clear_cpu_cap(X86_FEATURE_INTEL_STIBP);
1514     + }
1515     +
1516     /*
1517     * Atom erratum AAE44/AAF40/AAG38/AAH41:
1518     *
1519     diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
1520     index d0e69769abfd..df11f5d604be 100644
1521     --- a/arch/x86/kernel/cpu/scattered.c
1522     +++ b/arch/x86/kernel/cpu/scattered.c
1523     @@ -21,8 +21,6 @@ struct cpuid_bit {
1524     static const struct cpuid_bit cpuid_bits[] = {
1525     { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 },
1526     { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 },
1527     - { X86_FEATURE_AVX512_4VNNIW, CPUID_EDX, 2, 0x00000007, 0 },
1528     - { X86_FEATURE_AVX512_4FMAPS, CPUID_EDX, 3, 0x00000007, 0 },
1529     { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 },
1530     { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 },
1531     { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 },
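
The two AVX512 entries leave the scattered list here because CPUID(7).EDX gets a dedicated capability word elsewhere in this patch (the CPUID_7_EDX assignment added to get_cpu_cap() above), so those bits no longer have to be collected one at a time. For reference, the scattered-bit mechanism itself is just a table walk that copies individual CPUID bits into a synthetic feature word. Below is a simplified standalone sketch with made-up table contents and a pre-read register block standing in for the per-entry cpuid_count() calls the kernel performs.

    /*
     * Standalone sketch of collecting "scattered" CPUID bits into one
     * synthetic feature word.  struct scattered_bit, the table and the
     * bit positions are illustrative, not the kernel's cpuid_bits[].
     */
    #include <stdio.h>

    enum cpuid_reg { REG_EAX, REG_EBX, REG_ECX, REG_EDX };

    struct cpuid_regs { unsigned int r[4]; };

    struct scattered_bit {
        unsigned int   feature;     /* bit to set in the synthetic word */
        enum cpuid_reg reg;         /* which output register to look at */
        unsigned int   bit;         /* which bit of that register */
    };

    static const struct scattered_bit table[] = {
        { /* SYNTH_FEAT_A */ 0, REG_ECX, 3 },
        { /* SYNTH_FEAT_B */ 1, REG_EDX, 2 },
    };

    static unsigned int collect_scattered(const struct cpuid_regs *regs)
    {
        unsigned int features = 0;
        size_t i;

        for (i = 0; i < sizeof(table) / sizeof(table[0]); i++)
            if (regs->r[table[i].reg] & (1u << table[i].bit))
                features |= 1u << table[i].feature;

        return features;
    }

    int main(void)
    {
        /* Pretend CPUID returned ECX with bit 3 set and EDX with bit 2 clear. */
        struct cpuid_regs regs = { { 0, 0, 1u << 3, 0 } };

        printf("synthetic feature word: %#x\n", collect_scattered(&regs));
        return 0;
    }
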
1532     diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
1533     index c75466232016..9eb448c7859d 100644
1534     --- a/arch/x86/kernel/process_64.c
1535     +++ b/arch/x86/kernel/process_64.c
1536     @@ -557,7 +557,7 @@ static void __set_personality_x32(void)
1537     * Pretend to come from a x32 execve.
1538     */
1539     task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT;
1540     - current->thread.status &= ~TS_COMPAT;
1541     + current_thread_info()->status &= ~TS_COMPAT;
1542     #endif
1543     }
1544    
1545     @@ -571,7 +571,7 @@ static void __set_personality_ia32(void)
1546     current->personality |= force_personality32;
1547     /* Prepare the first "return" to user space */
1548     task_pt_regs(current)->orig_ax = __NR_ia32_execve;
1549     - current->thread.status |= TS_COMPAT;
1550     + current_thread_info()->status |= TS_COMPAT;
1551     #endif
1552     }
1553    
1554     diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
1555     index f37d18124648..ed5c4cdf0a34 100644
1556     --- a/arch/x86/kernel/ptrace.c
1557     +++ b/arch/x86/kernel/ptrace.c
1558     @@ -935,7 +935,7 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value)
1559     */
1560     regs->orig_ax = value;
1561     if (syscall_get_nr(child, regs) >= 0)
1562     - child->thread.status |= TS_I386_REGS_POKED;
1563     + child->thread_info.status |= TS_I386_REGS_POKED;
1564     break;
1565    
1566     case offsetof(struct user32, regs.eflags):
1567     diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
1568     index b9e00e8f1c9b..4cdc0b27ec82 100644
1569     --- a/arch/x86/kernel/signal.c
1570     +++ b/arch/x86/kernel/signal.c
1571     @@ -787,7 +787,7 @@ static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs)
1572     * than the tracee.
1573     */
1574     #ifdef CONFIG_IA32_EMULATION
1575     - if (current->thread.status & (TS_COMPAT|TS_I386_REGS_POKED))
1576     + if (current_thread_info()->status & (TS_COMPAT|TS_I386_REGS_POKED))
1577     return __NR_ia32_restart_syscall;
1578     #endif
1579     #ifdef CONFIG_X86_X32_ABI
1580     diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
1581     index 0099e10eb045..13f5d4217e4f 100644
1582     --- a/arch/x86/kvm/cpuid.c
1583     +++ b/arch/x86/kvm/cpuid.c
1584     @@ -67,9 +67,7 @@ u64 kvm_supported_xcr0(void)
1585    
1586     #define F(x) bit(X86_FEATURE_##x)
1587    
1588     -/* These are scattered features in cpufeatures.h. */
1589     -#define KVM_CPUID_BIT_AVX512_4VNNIW 2
1590     -#define KVM_CPUID_BIT_AVX512_4FMAPS 3
1591     +/* For scattered features from cpufeatures.h; we currently expose none */
1592     #define KF(x) bit(KVM_CPUID_BIT_##x)
1593    
1594     int kvm_update_cpuid(struct kvm_vcpu *vcpu)
1595     @@ -367,6 +365,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1596     F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) |
1597     0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
1598    
1599     + /* cpuid 0x80000008.ebx */
1600     + const u32 kvm_cpuid_8000_0008_ebx_x86_features =
1601     + F(IBPB) | F(IBRS);
1602     +
1603     /* cpuid 0xC0000001.edx */
1604     const u32 kvm_cpuid_C000_0001_edx_x86_features =
1605     F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
1606     @@ -392,7 +394,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1607    
1608     /* cpuid 7.0.edx*/
1609     const u32 kvm_cpuid_7_0_edx_x86_features =
1610     - KF(AVX512_4VNNIW) | KF(AVX512_4FMAPS);
1611     + F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
1612     + F(ARCH_CAPABILITIES);
1613    
1614     /* all calls to cpuid_count() should be made on the same cpu */
1615     get_cpu();
1616     @@ -477,7 +480,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1617     if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
1618     entry->ecx &= ~F(PKU);
1619     entry->edx &= kvm_cpuid_7_0_edx_x86_features;
1620     - entry->edx &= get_scattered_cpuid_leaf(7, 0, CPUID_EDX);
1621     + cpuid_mask(&entry->edx, CPUID_7_EDX);
1622     } else {
1623     entry->ebx = 0;
1624     entry->ecx = 0;
1625     @@ -627,7 +630,14 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1626     if (!g_phys_as)
1627     g_phys_as = phys_as;
1628     entry->eax = g_phys_as | (virt_as << 8);
1629     - entry->ebx = entry->edx = 0;
1630     + entry->edx = 0;
1631     + /* IBRS and IBPB aren't necessarily present in hardware cpuid */
1632     + if (boot_cpu_has(X86_FEATURE_IBPB))
1633     + entry->ebx |= F(IBPB);
1634     + if (boot_cpu_has(X86_FEATURE_IBRS))
1635     + entry->ebx |= F(IBRS);
1636     + entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features;
1637     + cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX);
1638     break;
1639     }
1640     case 0x80000019:
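
The 0x80000008.EBX handling above filters what the guest may see in two steps: IBPB/IBRS are ORed in from boot_cpu_has() because, as the comment notes, they are not necessarily present in raw hardware CPUID, and the result is then ANDed with a whitelist of exposable bits and, via cpuid_mask(), with the host's own CPUID word. A simplified standalone sketch of that filter follows; the feature bit positions and names are made up, and the cpuid_mask() step is folded into the whitelist.

    /*
     * Standalone sketch of the OR-in-then-mask CPUID filtering done above.
     * Bit values are invented for illustration.
     */
    #include <stdio.h>

    #define FEAT_IBPB   (1u << 12)
    #define FEAT_IBRS   (1u << 14)
    #define FEAT_OTHER  (1u << 3)

    static unsigned int guest_visible_ebx(unsigned int host_ebx,
                                          int host_has_ibpb, int host_has_ibrs)
    {
        /* Whitelist: only these bits may ever reach the guest. */
        const unsigned int allowed = FEAT_IBPB | FEAT_IBRS;
        unsigned int ebx = host_ebx;

        /* These bits may be synthesized even if raw CPUID lacks them. */
        if (host_has_ibpb)
            ebx |= FEAT_IBPB;
        if (host_has_ibrs)
            ebx |= FEAT_IBRS;

        return ebx & allowed;
    }

    int main(void)
    {
        /* Host CPUID reports only FEAT_OTHER, but the kernel knows IBPB works. */
        printf("%#x\n", guest_visible_ebx(FEAT_OTHER, 1, 0));   /* -> 0x1000 */
        return 0;
    }
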
1641     diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
1642     index c2cea6651279..9a327d5b6d1f 100644
1643     --- a/arch/x86/kvm/cpuid.h
1644     +++ b/arch/x86/kvm/cpuid.h
1645     @@ -54,6 +54,7 @@ static const struct cpuid_reg reverse_cpuid[] = {
1646     [CPUID_8000_000A_EDX] = {0x8000000a, 0, CPUID_EDX},
1647     [CPUID_7_ECX] = { 7, 0, CPUID_ECX},
1648     [CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX},
1649     + [CPUID_7_EDX] = { 7, 0, CPUID_EDX},
1650     };
1651    
1652     static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature)
1653     diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
1654     index eca6a89f2326..fab073b19528 100644
1655     --- a/arch/x86/kvm/emulate.c
1656     +++ b/arch/x86/kvm/emulate.c
1657     @@ -25,6 +25,7 @@
1658     #include <asm/kvm_emulate.h>
1659     #include <linux/stringify.h>
1660     #include <asm/debugreg.h>
1661     +#include <asm/nospec-branch.h>
1662    
1663     #include "x86.h"
1664     #include "tss.h"
1665     @@ -1021,8 +1022,8 @@ static __always_inline u8 test_cc(unsigned int condition, unsigned long flags)
1666     void (*fop)(void) = (void *)em_setcc + 4 * (condition & 0xf);
1667    
1668     flags = (flags & EFLAGS_MASK) | X86_EFLAGS_IF;
1669     - asm("push %[flags]; popf; call *%[fastop]"
1670     - : "=a"(rc) : [fastop]"r"(fop), [flags]"r"(flags));
1671     + asm("push %[flags]; popf; " CALL_NOSPEC
1672     + : "=a"(rc) : [thunk_target]"r"(fop), [flags]"r"(flags));
1673     return rc;
1674     }
1675    
1676     @@ -5350,9 +5351,9 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *))
1677     if (!(ctxt->d & ByteOp))
1678     fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE;
1679    
1680     - asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n"
1681     + asm("push %[flags]; popf; " CALL_NOSPEC " ; pushf; pop %[flags]\n"
1682     : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags),
1683     - [fastop]"+S"(fop), ASM_CALL_CONSTRAINT
1684     + [thunk_target]"+S"(fop), ASM_CALL_CONSTRAINT
1685     : "c"(ctxt->src2.val));
1686    
1687     ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK);
1688     diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
1689     index 6a8284f72328..e0bc3ad0f6cd 100644
1690     --- a/arch/x86/kvm/svm.c
1691     +++ b/arch/x86/kvm/svm.c
1692     @@ -184,6 +184,8 @@ struct vcpu_svm {
1693     u64 gs_base;
1694     } host;
1695    
1696     + u64 spec_ctrl;
1697     +
1698     u32 *msrpm;
1699    
1700     ulong nmi_iret_rip;
1701     @@ -249,6 +251,8 @@ static const struct svm_direct_access_msrs {
1702     { .index = MSR_CSTAR, .always = true },
1703     { .index = MSR_SYSCALL_MASK, .always = true },
1704     #endif
1705     + { .index = MSR_IA32_SPEC_CTRL, .always = false },
1706     + { .index = MSR_IA32_PRED_CMD, .always = false },
1707     { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
1708     { .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
1709     { .index = MSR_IA32_LASTINTFROMIP, .always = false },
1710     @@ -529,6 +533,7 @@ struct svm_cpu_data {
1711     struct kvm_ldttss_desc *tss_desc;
1712    
1713     struct page *save_area;
1714     + struct vmcb *current_vmcb;
1715     };
1716    
1717     static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
1718     @@ -880,6 +885,25 @@ static bool valid_msr_intercept(u32 index)
1719     return false;
1720     }
1721    
1722     +static bool msr_write_intercepted(struct kvm_vcpu *vcpu, unsigned msr)
1723     +{
1724     + u8 bit_write;
1725     + unsigned long tmp;
1726     + u32 offset;
1727     + u32 *msrpm;
1728     +
1729     + msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
1730     + to_svm(vcpu)->msrpm;
1731     +
1732     + offset = svm_msrpm_offset(msr);
1733     + bit_write = 2 * (msr & 0x0f) + 1;
1734     + tmp = msrpm[offset];
1735     +
1736     + BUG_ON(offset == MSR_INVALID);
1737     +
1738     + return !!test_bit(bit_write, &tmp);
1739     +}
1740     +
1741     static void set_msr_interception(u32 *msrpm, unsigned msr,
1742     int read, int write)
1743     {
1744     @@ -1585,6 +1609,8 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
1745     u32 dummy;
1746     u32 eax = 1;
1747    
1748     + svm->spec_ctrl = 0;
1749     +
1750     if (!init_event) {
1751     svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
1752     MSR_IA32_APICBASE_ENABLE;
1753     @@ -1706,11 +1732,17 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
1754     __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
1755     kvm_vcpu_uninit(vcpu);
1756     kmem_cache_free(kvm_vcpu_cache, svm);
1757     + /*
1758     + * The vmcb page can be recycled, causing a false negative in
1759     + * svm_vcpu_load(). So do a full IBPB now.
1760     + */
1761     + indirect_branch_prediction_barrier();
1762     }
1763    
1764     static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1765     {
1766     struct vcpu_svm *svm = to_svm(vcpu);
1767     + struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
1768     int i;
1769    
1770     if (unlikely(cpu != vcpu->cpu)) {
1771     @@ -1739,6 +1771,10 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1772     if (static_cpu_has(X86_FEATURE_RDTSCP))
1773     wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
1774    
1775     + if (sd->current_vmcb != svm->vmcb) {
1776     + sd->current_vmcb = svm->vmcb;
1777     + indirect_branch_prediction_barrier();
1778     + }
1779     avic_vcpu_load(vcpu, cpu);
1780     }
1781    
1782     @@ -3579,6 +3615,13 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
1783     case MSR_VM_CR:
1784     msr_info->data = svm->nested.vm_cr_msr;
1785     break;
1786     + case MSR_IA32_SPEC_CTRL:
1787     + if (!msr_info->host_initiated &&
1788     + !guest_cpuid_has(vcpu, X86_FEATURE_IBRS))
1789     + return 1;
1790     +
1791     + msr_info->data = svm->spec_ctrl;
1792     + break;
1793     case MSR_IA32_UCODE_REV:
1794     msr_info->data = 0x01000065;
1795     break;
1796     @@ -3670,6 +3713,49 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
1797     case MSR_IA32_TSC:
1798     kvm_write_tsc(vcpu, msr);
1799     break;
1800     + case MSR_IA32_SPEC_CTRL:
1801     + if (!msr->host_initiated &&
1802     + !guest_cpuid_has(vcpu, X86_FEATURE_IBRS))
1803     + return 1;
1804     +
1805     + /* The STIBP bit doesn't fault even if it's not advertised */
1806     + if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP))
1807     + return 1;
1808     +
1809     + svm->spec_ctrl = data;
1810     +
1811     + if (!data)
1812     + break;
1813     +
1814     + /*
1815     + * For non-nested:
1816     + * When it's written (to non-zero) for the first time, pass
1817     + * it through.
1818     + *
1819     + * For nested:
1820     + * The handling of the MSR bitmap for L2 guests is done in
1821     + * nested_svm_vmrun_msrpm.
1822     + * We update the L1 MSR bit as well since it will end up
1823     + * touching the MSR anyway now.
1824     + */
1825     + set_msr_interception(svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
1826     + break;
1827     + case MSR_IA32_PRED_CMD:
1828     + if (!msr->host_initiated &&
1829     + !guest_cpuid_has(vcpu, X86_FEATURE_IBPB))
1830     + return 1;
1831     +
1832     + if (data & ~PRED_CMD_IBPB)
1833     + return 1;
1834     +
1835     + if (!data)
1836     + break;
1837     +
1838     + wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
1839     + if (is_guest_mode(vcpu))
1840     + break;
1841     + set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
1842     + break;
1843     case MSR_STAR:
1844     svm->vmcb->save.star = data;
1845     break;
1846     @@ -4922,6 +5008,15 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
1847    
1848     local_irq_enable();
1849    
1850     + /*
1851     + * If this vCPU has touched SPEC_CTRL, restore the guest's value if
1852     + * it's non-zero. Since vmentry is serialising on affected CPUs, there
1853     + * is no need to worry about the conditional branch over the wrmsr
1854     + * being speculatively taken.
1855     + */
1856     + if (svm->spec_ctrl)
1857     + wrmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);
1858     +
1859     asm volatile (
1860     "push %%" _ASM_BP "; \n\t"
1861     "mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t"
1862     @@ -5014,6 +5109,27 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
1863     #endif
1864     );
1865    
1866     + /*
1867     + * We do not use IBRS in the kernel. If this vCPU has used the
1868     + * SPEC_CTRL MSR it may have left it on; save the value and
1869     + * turn it off. This is much more efficient than blindly adding
1870     + * it to the atomic save/restore list. Especially as the former
1871     + * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
1872     + *
1873     + * For non-nested case:
1874     + * If the L01 MSR bitmap does not intercept the MSR, then we need to
1875     + * save it.
1876     + *
1877     + * For nested case:
1878     + * If the L02 MSR bitmap does not intercept the MSR, then we need to
1879     + * save it.
1880     + */
1881     + if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))
1882     + rdmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);
1883     +
1884     + if (svm->spec_ctrl)
1885     + wrmsrl(MSR_IA32_SPEC_CTRL, 0);
1886     +
1887     /* Eliminate branch target predictions from guest mode */
1888     vmexit_fill_RSB();
1889    
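
The vmexit path above only pays for the SPEC_CTRL rdmsrl when the guest can write the MSR directly, and msr_write_intercepted() decides that by looking at two bits per MSR in the permission bitmap: an even bit for the read intercept and the following odd bit for the write intercept, sixteen MSRs per 32-bit word. Below is a standalone sketch of that indexing; msr_word() is a flat stand-in for svm_msrpm_offset(), and the helper names and NWORDS are made up.

    /*
     * Standalone sketch of the 2-bits-per-MSR permission bitmap indexing
     * used by msr_write_intercepted()/set_msr_interception() above.
     * A set bit in this sketch means the access is intercepted.
     */
    #include <stdbool.h>
    #include <stdio.h>

    #define NWORDS 16                        /* covers MSRs 0..255 here */
    static unsigned int msrpm[NWORDS];

    /* Simplified stand-in for svm_msrpm_offset(): 16 MSRs per word. */
    static unsigned int msr_word(unsigned int msr)
    {
        return msr / 16;
    }

    static void set_write_intercept(unsigned int msr, bool intercept)
    {
        unsigned int bit_write = 2 * (msr & 0x0f) + 1;   /* odd bit = write */

        if (intercept)
            msrpm[msr_word(msr)] |= 1u << bit_write;
        else
            msrpm[msr_word(msr)] &= ~(1u << bit_write);
    }

    static bool write_intercepted(unsigned int msr)
    {
        unsigned int bit_write = 2 * (msr & 0x0f) + 1;

        return msrpm[msr_word(msr)] & (1u << bit_write);
    }

    int main(void)
    {
        set_write_intercept(0x48, true);     /* 0x48 == IA32_SPEC_CTRL */
        printf("intercepted: %d\n", write_intercepted(0x48));

        set_write_intercept(0x48, false);    /* pass writes straight through */
        printf("intercepted: %d\n", write_intercepted(0x48));
        return 0;
    }
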
1890     diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
1891     index a45063a9219c..0ae4b1a86168 100644
1892     --- a/arch/x86/kvm/vmx.c
1893     +++ b/arch/x86/kvm/vmx.c
1894     @@ -34,6 +34,7 @@
1895     #include <linux/tboot.h>
1896     #include <linux/hrtimer.h>
1897     #include <linux/frame.h>
1898     +#include <linux/nospec.h>
1899     #include "kvm_cache_regs.h"
1900     #include "x86.h"
1901    
1902     @@ -108,6 +109,14 @@ static u64 __read_mostly host_xss;
1903     static bool __read_mostly enable_pml = 1;
1904     module_param_named(pml, enable_pml, bool, S_IRUGO);
1905    
1906     +#define MSR_TYPE_R 1
1907     +#define MSR_TYPE_W 2
1908     +#define MSR_TYPE_RW 3
1909     +
1910     +#define MSR_BITMAP_MODE_X2APIC 1
1911     +#define MSR_BITMAP_MODE_X2APIC_APICV 2
1912     +#define MSR_BITMAP_MODE_LM 4
1913     +
1914     #define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL
1915    
1916     /* Guest_tsc -> host_tsc conversion requires 64-bit division. */
1917     @@ -182,7 +191,6 @@ module_param(ple_window_max, int, S_IRUGO);
1918     extern const ulong vmx_return;
1919    
1920     #define NR_AUTOLOAD_MSRS 8
1921     -#define VMCS02_POOL_SIZE 1
1922    
1923     struct vmcs {
1924     u32 revision_id;
1925     @@ -207,6 +215,7 @@ struct loaded_vmcs {
1926     int soft_vnmi_blocked;
1927     ktime_t entry_time;
1928     s64 vnmi_blocked_time;
1929     + unsigned long *msr_bitmap;
1930     struct list_head loaded_vmcss_on_cpu_link;
1931     };
1932    
1933     @@ -223,7 +232,7 @@ struct shared_msr_entry {
1934     * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
1935     * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
1936     * More than one of these structures may exist, if L1 runs multiple L2 guests.
1937     - * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the
1938     + * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the
1939     * underlying hardware which will be used to run L2.
1940     * This structure is packed to ensure that its layout is identical across
1941     * machines (necessary for live migration).
1942     @@ -406,13 +415,6 @@ struct __packed vmcs12 {
1943     */
1944     #define VMCS12_SIZE 0x1000
1945    
1946     -/* Used to remember the last vmcs02 used for some recently used vmcs12s */
1947     -struct vmcs02_list {
1948     - struct list_head list;
1949     - gpa_t vmptr;
1950     - struct loaded_vmcs vmcs02;
1951     -};
1952     -
1953     /*
1954     * The nested_vmx structure is part of vcpu_vmx, and holds information we need
1955     * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
1956     @@ -437,15 +439,15 @@ struct nested_vmx {
1957     */
1958     bool sync_shadow_vmcs;
1959    
1960     - /* vmcs02_list cache of VMCSs recently used to run L2 guests */
1961     - struct list_head vmcs02_pool;
1962     - int vmcs02_num;
1963     bool change_vmcs01_virtual_x2apic_mode;
1964     /* L2 must run next, and mustn't decide to exit to L1. */
1965     bool nested_run_pending;
1966     +
1967     + struct loaded_vmcs vmcs02;
1968     +
1969     /*
1970     - * Guest pages referred to in vmcs02 with host-physical pointers, so
1971     - * we must keep them pinned while L2 runs.
1972     + * Guest pages referred to in the vmcs02 with host-physical
1973     + * pointers, so we must keep them pinned while L2 runs.
1974     */
1975     struct page *apic_access_page;
1976     struct page *virtual_apic_page;
1977     @@ -454,8 +456,6 @@ struct nested_vmx {
1978     bool pi_pending;
1979     u16 posted_intr_nv;
1980    
1981     - unsigned long *msr_bitmap;
1982     -
1983     struct hrtimer preemption_timer;
1984     bool preemption_timer_expired;
1985    
1986     @@ -570,6 +570,7 @@ struct vcpu_vmx {
1987     struct kvm_vcpu vcpu;
1988     unsigned long host_rsp;
1989     u8 fail;
1990     + u8 msr_bitmap_mode;
1991     u32 exit_intr_info;
1992     u32 idt_vectoring_info;
1993     ulong rflags;
1994     @@ -581,6 +582,10 @@ struct vcpu_vmx {
1995     u64 msr_host_kernel_gs_base;
1996     u64 msr_guest_kernel_gs_base;
1997     #endif
1998     +
1999     + u64 arch_capabilities;
2000     + u64 spec_ctrl;
2001     +
2002     u32 vm_entry_controls_shadow;
2003     u32 vm_exit_controls_shadow;
2004     u32 secondary_exec_control;
2005     @@ -887,21 +892,18 @@ static const unsigned short vmcs_field_to_offset_table[] = {
2006    
2007     static inline short vmcs_field_to_offset(unsigned long field)
2008     {
2009     - BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX);
2010     + const size_t size = ARRAY_SIZE(vmcs_field_to_offset_table);
2011     + unsigned short offset;
2012    
2013     - if (field >= ARRAY_SIZE(vmcs_field_to_offset_table))
2014     + BUILD_BUG_ON(size > SHRT_MAX);
2015     + if (field >= size)
2016     return -ENOENT;
2017    
2018     - /*
2019     - * FIXME: Mitigation for CVE-2017-5753. To be replaced with a
2020     - * generic mechanism.
2021     - */
2022     - asm("lfence");
2023     -
2024     - if (vmcs_field_to_offset_table[field] == 0)
2025     + field = array_index_nospec(field, size);
2026     + offset = vmcs_field_to_offset_table[field];
2027     + if (offset == 0)
2028     return -ENOENT;
2029     -
2030     - return vmcs_field_to_offset_table[field];
2031     + return offset;
2032     }
2033    
2034     static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
2035     @@ -927,6 +929,9 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
2036     static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
2037     static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
2038     u16 error_code);
2039     +static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
2040     +static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
2041     + u32 msr, int type);
2042    
2043     static DEFINE_PER_CPU(struct vmcs *, vmxarea);
2044     static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
2045     @@ -946,12 +951,6 @@ static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
2046     enum {
2047     VMX_IO_BITMAP_A,
2048     VMX_IO_BITMAP_B,
2049     - VMX_MSR_BITMAP_LEGACY,
2050     - VMX_MSR_BITMAP_LONGMODE,
2051     - VMX_MSR_BITMAP_LEGACY_X2APIC_APICV,
2052     - VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV,
2053     - VMX_MSR_BITMAP_LEGACY_X2APIC,
2054     - VMX_MSR_BITMAP_LONGMODE_X2APIC,
2055     VMX_VMREAD_BITMAP,
2056     VMX_VMWRITE_BITMAP,
2057     VMX_BITMAP_NR
2058     @@ -961,12 +960,6 @@ static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
2059    
2060     #define vmx_io_bitmap_a (vmx_bitmap[VMX_IO_BITMAP_A])
2061     #define vmx_io_bitmap_b (vmx_bitmap[VMX_IO_BITMAP_B])
2062     -#define vmx_msr_bitmap_legacy (vmx_bitmap[VMX_MSR_BITMAP_LEGACY])
2063     -#define vmx_msr_bitmap_longmode (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE])
2064     -#define vmx_msr_bitmap_legacy_x2apic_apicv (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC_APICV])
2065     -#define vmx_msr_bitmap_longmode_x2apic_apicv (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV])
2066     -#define vmx_msr_bitmap_legacy_x2apic (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC])
2067     -#define vmx_msr_bitmap_longmode_x2apic (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC])
2068     #define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP])
2069     #define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP])
2070    
2071     @@ -1913,6 +1906,52 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
2072     vmcs_write32(EXCEPTION_BITMAP, eb);
2073     }
2074    
2075     +/*
2076     + * Check if MSR is intercepted for currently loaded MSR bitmap.
2077     + */
2078     +static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
2079     +{
2080     + unsigned long *msr_bitmap;
2081     + int f = sizeof(unsigned long);
2082     +
2083     + if (!cpu_has_vmx_msr_bitmap())
2084     + return true;
2085     +
2086     + msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;
2087     +
2088     + if (msr <= 0x1fff) {
2089     + return !!test_bit(msr, msr_bitmap + 0x800 / f);
2090     + } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
2091     + msr &= 0x1fff;
2092     + return !!test_bit(msr, msr_bitmap + 0xc00 / f);
2093     + }
2094     +
2095     + return true;
2096     +}
2097     +
2098     +/*
2099     + * Check if MSR is intercepted for L01 MSR bitmap.
2100     + */
2101     +static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
2102     +{
2103     + unsigned long *msr_bitmap;
2104     + int f = sizeof(unsigned long);
2105     +
2106     + if (!cpu_has_vmx_msr_bitmap())
2107     + return true;
2108     +
2109     + msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
2110     +
2111     + if (msr <= 0x1fff) {
2112     + return !!test_bit(msr, msr_bitmap + 0x800 / f);
2113     + } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
2114     + msr &= 0x1fff;
2115     + return !!test_bit(msr, msr_bitmap + 0xc00 / f);
2116     + }
2117     +
2118     + return true;
2119     +}
2120     +
2121     static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
2122     unsigned long entry, unsigned long exit)
2123     {
2124     @@ -2291,6 +2330,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2125     if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
2126     per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
2127     vmcs_load(vmx->loaded_vmcs->vmcs);
2128     + indirect_branch_prediction_barrier();
2129     }
2130    
2131     if (!already_loaded) {
2132     @@ -2567,36 +2607,6 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
2133     vmx->guest_msrs[from] = tmp;
2134     }
2135    
2136     -static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
2137     -{
2138     - unsigned long *msr_bitmap;
2139     -
2140     - if (is_guest_mode(vcpu))
2141     - msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap;
2142     - else if (cpu_has_secondary_exec_ctrls() &&
2143     - (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
2144     - SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
2145     - if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) {
2146     - if (is_long_mode(vcpu))
2147     - msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv;
2148     - else
2149     - msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv;
2150     - } else {
2151     - if (is_long_mode(vcpu))
2152     - msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
2153     - else
2154     - msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
2155     - }
2156     - } else {
2157     - if (is_long_mode(vcpu))
2158     - msr_bitmap = vmx_msr_bitmap_longmode;
2159     - else
2160     - msr_bitmap = vmx_msr_bitmap_legacy;
2161     - }
2162     -
2163     - vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
2164     -}
2165     -
2166     /*
2167     * Set up the vmcs to automatically save and restore system
2168     * msrs. Don't touch the 64-bit msrs if the guest is in legacy
2169     @@ -2637,7 +2647,7 @@ static void setup_msrs(struct vcpu_vmx *vmx)
2170     vmx->save_nmsrs = save_nmsrs;
2171    
2172     if (cpu_has_vmx_msr_bitmap())
2173     - vmx_set_msr_bitmap(&vmx->vcpu);
2174     + vmx_update_msr_bitmap(&vmx->vcpu);
2175     }
2176    
2177     /*
2178     @@ -3273,6 +3283,20 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2179     case MSR_IA32_TSC:
2180     msr_info->data = guest_read_tsc(vcpu);
2181     break;
2182     + case MSR_IA32_SPEC_CTRL:
2183     + if (!msr_info->host_initiated &&
2184     + !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) &&
2185     + !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
2186     + return 1;
2187     +
2188     + msr_info->data = to_vmx(vcpu)->spec_ctrl;
2189     + break;
2190     + case MSR_IA32_ARCH_CAPABILITIES:
2191     + if (!msr_info->host_initiated &&
2192     + !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
2193     + return 1;
2194     + msr_info->data = to_vmx(vcpu)->arch_capabilities;
2195     + break;
2196     case MSR_IA32_SYSENTER_CS:
2197     msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
2198     break;
2199     @@ -3380,6 +3404,70 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2200     case MSR_IA32_TSC:
2201     kvm_write_tsc(vcpu, msr_info);
2202     break;
2203     + case MSR_IA32_SPEC_CTRL:
2204     + if (!msr_info->host_initiated &&
2205     + !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) &&
2206     + !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
2207     + return 1;
2208     +
2209     + /* The STIBP bit doesn't fault even if it's not advertised */
2210     + if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP))
2211     + return 1;
2212     +
2213     + vmx->spec_ctrl = data;
2214     +
2215     + if (!data)
2216     + break;
2217     +
2218     + /*
2219     + * For non-nested:
2220     + * When it's written (to non-zero) for the first time, pass
2221     + * it through.
2222     + *
2223     + * For nested:
2224     + * The handling of the MSR bitmap for L2 guests is done in
2225     + * nested_vmx_merge_msr_bitmap. We should not touch the
2226     + * vmcs02.msr_bitmap here since it gets completely overwritten
2227     + * in the merging. We update the vmcs01 here for L1 as well
2228     + * since it will end up touching the MSR anyway now.
2229     + */
2230     + vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap,
2231     + MSR_IA32_SPEC_CTRL,
2232     + MSR_TYPE_RW);
2233     + break;
2234     + case MSR_IA32_PRED_CMD:
2235     + if (!msr_info->host_initiated &&
2236     + !guest_cpuid_has(vcpu, X86_FEATURE_IBPB) &&
2237     + !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
2238     + return 1;
2239     +
2240     + if (data & ~PRED_CMD_IBPB)
2241     + return 1;
2242     +
2243     + if (!data)
2244     + break;
2245     +
2246     + wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
2247     +
2248     + /*
2249     + * For non-nested:
2250     + * When it's written (to non-zero) for the first time, pass
2251     + * it through.
2252     + *
2253     + * For nested:
2254     + * The handling of the MSR bitmap for L2 guests is done in
2255     + * nested_vmx_merge_msr_bitmap. We should not touch the
2256     + * vmcs02.msr_bitmap here since it gets completely overwritten
2257     + * in the merging.
2258     + */
2259     + vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
2260     + MSR_TYPE_W);
2261     + break;
2262     + case MSR_IA32_ARCH_CAPABILITIES:
2263     + if (!msr_info->host_initiated)
2264     + return 1;
2265     + vmx->arch_capabilities = data;
2266     + break;
2267     case MSR_IA32_CR_PAT:
2268     if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2269     if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
2270     @@ -3822,11 +3910,6 @@ static struct vmcs *alloc_vmcs_cpu(int cpu)
2271     return vmcs;
2272     }
2273    
2274     -static struct vmcs *alloc_vmcs(void)
2275     -{
2276     - return alloc_vmcs_cpu(raw_smp_processor_id());
2277     -}
2278     -
2279     static void free_vmcs(struct vmcs *vmcs)
2280     {
2281     free_pages((unsigned long)vmcs, vmcs_config.order);
2282     @@ -3842,9 +3925,38 @@ static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
2283     loaded_vmcs_clear(loaded_vmcs);
2284     free_vmcs(loaded_vmcs->vmcs);
2285     loaded_vmcs->vmcs = NULL;
2286     + if (loaded_vmcs->msr_bitmap)
2287     + free_page((unsigned long)loaded_vmcs->msr_bitmap);
2288     WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
2289     }
2290    
2291     +static struct vmcs *alloc_vmcs(void)
2292     +{
2293     + return alloc_vmcs_cpu(raw_smp_processor_id());
2294     +}
2295     +
2296     +static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
2297     +{
2298     + loaded_vmcs->vmcs = alloc_vmcs();
2299     + if (!loaded_vmcs->vmcs)
2300     + return -ENOMEM;
2301     +
2302     + loaded_vmcs->shadow_vmcs = NULL;
2303     + loaded_vmcs_init(loaded_vmcs);
2304     +
2305     + if (cpu_has_vmx_msr_bitmap()) {
2306     + loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
2307     + if (!loaded_vmcs->msr_bitmap)
2308     + goto out_vmcs;
2309     + memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
2310     + }
2311     + return 0;
2312     +
2313     +out_vmcs:
2314     + free_loaded_vmcs(loaded_vmcs);
2315     + return -ENOMEM;
2316     +}
2317     +
2318     static void free_kvm_area(void)
2319     {
2320     int cpu;
2321     @@ -4917,10 +5029,8 @@ static void free_vpid(int vpid)
2322     spin_unlock(&vmx_vpid_lock);
2323     }
2324    
2325     -#define MSR_TYPE_R 1
2326     -#define MSR_TYPE_W 2
2327     -static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
2328     - u32 msr, int type)
2329     +static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
2330     + u32 msr, int type)
2331     {
2332     int f = sizeof(unsigned long);
2333    
2334     @@ -4954,6 +5064,50 @@ static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
2335     }
2336     }
2337    
2338     +static void __always_inline vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
2339     + u32 msr, int type)
2340     +{
2341     + int f = sizeof(unsigned long);
2342     +
2343     + if (!cpu_has_vmx_msr_bitmap())
2344     + return;
2345     +
2346     + /*
2347     + * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
2348     + * have the write-low and read-high bitmap offsets the wrong way round.
2349     + * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
2350     + */
2351     + if (msr <= 0x1fff) {
2352     + if (type & MSR_TYPE_R)
2353     + /* read-low */
2354     + __set_bit(msr, msr_bitmap + 0x000 / f);
2355     +
2356     + if (type & MSR_TYPE_W)
2357     + /* write-low */
2358     + __set_bit(msr, msr_bitmap + 0x800 / f);
2359     +
2360     + } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
2361     + msr &= 0x1fff;
2362     + if (type & MSR_TYPE_R)
2363     + /* read-high */
2364     + __set_bit(msr, msr_bitmap + 0x400 / f);
2365     +
2366     + if (type & MSR_TYPE_W)
2367     + /* write-high */
2368     + __set_bit(msr, msr_bitmap + 0xc00 / f);
2369     +
2370     + }
2371     +}
2372     +
2373     +static void __always_inline vmx_set_intercept_for_msr(unsigned long *msr_bitmap,
2374     + u32 msr, int type, bool value)
2375     +{
2376     + if (value)
2377     + vmx_enable_intercept_for_msr(msr_bitmap, msr, type);
2378     + else
2379     + vmx_disable_intercept_for_msr(msr_bitmap, msr, type);
2380     +}
2381     +
2382     /*
2383     * If a msr is allowed by L0, we should check whether it is allowed by L1.
2384     * The corresponding bit will be cleared unless both of L0 and L1 allow it.
2385     @@ -5000,30 +5154,70 @@ static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
2386     }
2387     }
2388    
2389     -static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
2390     +static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
2391     {
2392     - if (!longmode_only)
2393     - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
2394     - msr, MSR_TYPE_R | MSR_TYPE_W);
2395     - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
2396     - msr, MSR_TYPE_R | MSR_TYPE_W);
2397     + u8 mode = 0;
2398     +
2399     + if (cpu_has_secondary_exec_ctrls() &&
2400     + (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
2401     + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
2402     + mode |= MSR_BITMAP_MODE_X2APIC;
2403     + if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
2404     + mode |= MSR_BITMAP_MODE_X2APIC_APICV;
2405     + }
2406     +
2407     + if (is_long_mode(vcpu))
2408     + mode |= MSR_BITMAP_MODE_LM;
2409     +
2410     + return mode;
2411     }
2412    
2413     -static void vmx_disable_intercept_msr_x2apic(u32 msr, int type, bool apicv_active)
2414     +#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
2415     +
2416     +static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap,
2417     + u8 mode)
2418     {
2419     - if (apicv_active) {
2420     - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv,
2421     - msr, type);
2422     - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv,
2423     - msr, type);
2424     - } else {
2425     - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
2426     - msr, type);
2427     - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
2428     - msr, type);
2429     + int msr;
2430     +
2431     + for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
2432     + unsigned word = msr / BITS_PER_LONG;
2433     + msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
2434     + msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
2435     + }
2436     +
2437     + if (mode & MSR_BITMAP_MODE_X2APIC) {
2438     + /*
2439     + * TPR reads and writes can be virtualized even if virtual interrupt
2440     + * delivery is not in use.
2441     + */
2442     + vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW);
2443     + if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
2444     + vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R);
2445     + vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
2446     + vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
2447     + }
2448     }
2449     }
2450    
2451     +static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
2452     +{
2453     + struct vcpu_vmx *vmx = to_vmx(vcpu);
2454     + unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
2455     + u8 mode = vmx_msr_bitmap_mode(vcpu);
2456     + u8 changed = mode ^ vmx->msr_bitmap_mode;
2457     +
2458     + if (!changed)
2459     + return;
2460     +
2461     + vmx_set_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW,
2462     + !(mode & MSR_BITMAP_MODE_LM));
2463     +
2464     + if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
2465     + vmx_update_msr_bitmap_x2apic(msr_bitmap, mode);
2466     +
2467     + vmx->msr_bitmap_mode = mode;
2468     +}
2469     +
2470     static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu)
2471     {
2472     return enable_apicv;
2473     @@ -5269,7 +5463,7 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
2474     }
2475    
2476     if (cpu_has_vmx_msr_bitmap())
2477     - vmx_set_msr_bitmap(vcpu);
2478     + vmx_update_msr_bitmap(vcpu);
2479     }
2480    
2481     static u32 vmx_exec_control(struct vcpu_vmx *vmx)
2482     @@ -5456,7 +5650,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2483     vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
2484     }
2485     if (cpu_has_vmx_msr_bitmap())
2486     - vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy));
2487     + vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
2488    
2489     vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
2490    
2491     @@ -5534,6 +5728,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2492     ++vmx->nmsrs;
2493     }
2494    
2495     + if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
2496     + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, vmx->arch_capabilities);
2497    
2498     vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);
2499    
2500     @@ -5564,6 +5760,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
2501     u64 cr0;
2502    
2503     vmx->rmode.vm86_active = 0;
2504     + vmx->spec_ctrl = 0;
2505    
2506     vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
2507     kvm_set_cr8(vcpu, 0);
2508     @@ -6739,7 +6936,7 @@ void vmx_enable_tdp(void)
2509    
2510     static __init int hardware_setup(void)
2511     {
2512     - int r = -ENOMEM, i, msr;
2513     + int r = -ENOMEM, i;
2514    
2515     rdmsrl_safe(MSR_EFER, &host_efer);
2516    
2517     @@ -6760,9 +6957,6 @@ static __init int hardware_setup(void)
2518    
2519     memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
2520    
2521     - memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
2522     - memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
2523     -
2524     if (setup_vmcs_config(&vmcs_config) < 0) {
2525     r = -EIO;
2526     goto out;
2527     @@ -6825,42 +7019,8 @@ static __init int hardware_setup(void)
2528     kvm_tsc_scaling_ratio_frac_bits = 48;
2529     }
2530    
2531     - vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
2532     - vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
2533     - vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
2534     - vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
2535     - vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
2536     - vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
2537     -
2538     - memcpy(vmx_msr_bitmap_legacy_x2apic_apicv,
2539     - vmx_msr_bitmap_legacy, PAGE_SIZE);
2540     - memcpy(vmx_msr_bitmap_longmode_x2apic_apicv,
2541     - vmx_msr_bitmap_longmode, PAGE_SIZE);
2542     - memcpy(vmx_msr_bitmap_legacy_x2apic,
2543     - vmx_msr_bitmap_legacy, PAGE_SIZE);
2544     - memcpy(vmx_msr_bitmap_longmode_x2apic,
2545     - vmx_msr_bitmap_longmode, PAGE_SIZE);
2546     -
2547     set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
2548    
2549     - for (msr = 0x800; msr <= 0x8ff; msr++) {
2550     - if (msr == 0x839 /* TMCCT */)
2551     - continue;
2552     - vmx_disable_intercept_msr_x2apic(msr, MSR_TYPE_R, true);
2553     - }
2554     -
2555     - /*
2556     - * TPR reads and writes can be virtualized even if virtual interrupt
2557     - * delivery is not in use.
2558     - */
2559     - vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_W, true);
2560     - vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_R | MSR_TYPE_W, false);
2561     -
2562     - /* EOI */
2563     - vmx_disable_intercept_msr_x2apic(0x80b, MSR_TYPE_W, true);
2564     - /* SELF-IPI */
2565     - vmx_disable_intercept_msr_x2apic(0x83f, MSR_TYPE_W, true);
2566     -
2567     if (enable_ept)
2568     vmx_enable_tdp();
2569     else
2570     @@ -6963,94 +7123,6 @@ static int handle_monitor(struct kvm_vcpu *vcpu)
2571     return handle_nop(vcpu);
2572     }
2573    
2574     -/*
2575     - * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.
2576     - * We could reuse a single VMCS for all the L2 guests, but we also want the
2577     - * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this
2578     - * allows keeping them loaded on the processor, and in the future will allow
2579     - * optimizations where prepare_vmcs02 doesn't need to set all the fields on
2580     - * every entry if they never change.
2581     - * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE
2582     - * (>=0) with a vmcs02 for each recently loaded vmcs12s, most recent first.
2583     - *
2584     - * The following functions allocate and free a vmcs02 in this pool.
2585     - */
2586     -
2587     -/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */
2588     -static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
2589     -{
2590     - struct vmcs02_list *item;
2591     - list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
2592     - if (item->vmptr == vmx->nested.current_vmptr) {
2593     - list_move(&item->list, &vmx->nested.vmcs02_pool);
2594     - return &item->vmcs02;
2595     - }
2596     -
2597     - if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
2598     - /* Recycle the least recently used VMCS. */
2599     - item = list_last_entry(&vmx->nested.vmcs02_pool,
2600     - struct vmcs02_list, list);
2601     - item->vmptr = vmx->nested.current_vmptr;
2602     - list_move(&item->list, &vmx->nested.vmcs02_pool);
2603     - return &item->vmcs02;
2604     - }
2605     -
2606     - /* Create a new VMCS */
2607     - item = kzalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
2608     - if (!item)
2609     - return NULL;
2610     - item->vmcs02.vmcs = alloc_vmcs();
2611     - item->vmcs02.shadow_vmcs = NULL;
2612     - if (!item->vmcs02.vmcs) {
2613     - kfree(item);
2614     - return NULL;
2615     - }
2616     - loaded_vmcs_init(&item->vmcs02);
2617     - item->vmptr = vmx->nested.current_vmptr;
2618     - list_add(&(item->list), &(vmx->nested.vmcs02_pool));
2619     - vmx->nested.vmcs02_num++;
2620     - return &item->vmcs02;
2621     -}
2622     -
2623     -/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */
2624     -static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
2625     -{
2626     - struct vmcs02_list *item;
2627     - list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
2628     - if (item->vmptr == vmptr) {
2629     - free_loaded_vmcs(&item->vmcs02);
2630     - list_del(&item->list);
2631     - kfree(item);
2632     - vmx->nested.vmcs02_num--;
2633     - return;
2634     - }
2635     -}
2636     -
2637     -/*
2638     - * Free all VMCSs saved for this vcpu, except the one pointed by
2639     - * vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs
2640     - * must be &vmx->vmcs01.
2641     - */
2642     -static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
2643     -{
2644     - struct vmcs02_list *item, *n;
2645     -
2646     - WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01);
2647     - list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) {
2648     - /*
2649     - * Something will leak if the above WARN triggers. Better than
2650     - * a use-after-free.
2651     - */
2652     - if (vmx->loaded_vmcs == &item->vmcs02)
2653     - continue;
2654     -
2655     - free_loaded_vmcs(&item->vmcs02);
2656     - list_del(&item->list);
2657     - kfree(item);
2658     - vmx->nested.vmcs02_num--;
2659     - }
2660     -}
2661     -
2662     /*
2663     * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
2664     * set the success or error code of an emulated VMX instruction, as specified
2665     @@ -7231,13 +7303,11 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
2666     {
2667     struct vcpu_vmx *vmx = to_vmx(vcpu);
2668     struct vmcs *shadow_vmcs;
2669     + int r;
2670    
2671     - if (cpu_has_vmx_msr_bitmap()) {
2672     - vmx->nested.msr_bitmap =
2673     - (unsigned long *)__get_free_page(GFP_KERNEL);
2674     - if (!vmx->nested.msr_bitmap)
2675     - goto out_msr_bitmap;
2676     - }
2677     + r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
2678     + if (r < 0)
2679     + goto out_vmcs02;
2680    
2681     vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
2682     if (!vmx->nested.cached_vmcs12)
2683     @@ -7254,9 +7324,6 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
2684     vmx->vmcs01.shadow_vmcs = shadow_vmcs;
2685     }
2686    
2687     - INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
2688     - vmx->nested.vmcs02_num = 0;
2689     -
2690     hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
2691     HRTIMER_MODE_REL_PINNED);
2692     vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
2693     @@ -7268,9 +7335,9 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
2694     kfree(vmx->nested.cached_vmcs12);
2695    
2696     out_cached_vmcs12:
2697     - free_page((unsigned long)vmx->nested.msr_bitmap);
2698     + free_loaded_vmcs(&vmx->nested.vmcs02);
2699    
2700     -out_msr_bitmap:
2701     +out_vmcs02:
2702     return -ENOMEM;
2703     }
2704    
2705     @@ -7412,10 +7479,6 @@ static void free_nested(struct vcpu_vmx *vmx)
2706     free_vpid(vmx->nested.vpid02);
2707     vmx->nested.posted_intr_nv = -1;
2708     vmx->nested.current_vmptr = -1ull;
2709     - if (vmx->nested.msr_bitmap) {
2710     - free_page((unsigned long)vmx->nested.msr_bitmap);
2711     - vmx->nested.msr_bitmap = NULL;
2712     - }
2713     if (enable_shadow_vmcs) {
2714     vmx_disable_shadow_vmcs(vmx);
2715     vmcs_clear(vmx->vmcs01.shadow_vmcs);
2716     @@ -7423,7 +7486,7 @@ static void free_nested(struct vcpu_vmx *vmx)
2717     vmx->vmcs01.shadow_vmcs = NULL;
2718     }
2719     kfree(vmx->nested.cached_vmcs12);
2720     - /* Unpin physical memory we referred to in current vmcs02 */
2721     + /* Unpin physical memory we referred to in the vmcs02 */
2722     if (vmx->nested.apic_access_page) {
2723     kvm_release_page_dirty(vmx->nested.apic_access_page);
2724     vmx->nested.apic_access_page = NULL;
2725     @@ -7439,7 +7502,7 @@ static void free_nested(struct vcpu_vmx *vmx)
2726     vmx->nested.pi_desc = NULL;
2727     }
2728    
2729     - nested_free_all_saved_vmcss(vmx);
2730     + free_loaded_vmcs(&vmx->nested.vmcs02);
2731     }
2732    
2733     /* Emulate the VMXOFF instruction */
2734     @@ -7482,8 +7545,6 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
2735     vmptr + offsetof(struct vmcs12, launch_state),
2736     &zero, sizeof(zero));
2737    
2738     - nested_free_vmcs02(vmx, vmptr);
2739     -
2740     nested_vmx_succeed(vcpu);
2741     return kvm_skip_emulated_instruction(vcpu);
2742     }
2743     @@ -8395,10 +8456,11 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
2744    
2745     /*
2746     * The host physical addresses of some pages of guest memory
2747     - * are loaded into VMCS02 (e.g. L1's Virtual APIC Page). The CPU
2748     - * may write to these pages via their host physical address while
2749     - * L2 is running, bypassing any address-translation-based dirty
2750     - * tracking (e.g. EPT write protection).
2751     + * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
2752     + * Page). The CPU may write to these pages via their host
2753     + * physical address while L2 is running, bypassing any
2754     + * address-translation-based dirty tracking (e.g. EPT write
2755     + * protection).
2756     *
2757     * Mark them dirty on every exit from L2 to prevent them from
2758     * getting out of sync with dirty tracking.
2759     @@ -8932,7 +8994,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
2760     }
2761     vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
2762    
2763     - vmx_set_msr_bitmap(vcpu);
2764     + vmx_update_msr_bitmap(vcpu);
2765     }
2766    
2767     static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
2768     @@ -9118,14 +9180,14 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
2769     #endif
2770     "pushf\n\t"
2771     __ASM_SIZE(push) " $%c[cs]\n\t"
2772     - "call *%[entry]\n\t"
2773     + CALL_NOSPEC
2774     :
2775     #ifdef CONFIG_X86_64
2776     [sp]"=&r"(tmp),
2777     #endif
2778     ASM_CALL_CONSTRAINT
2779     :
2780     - [entry]"r"(entry),
2781     + THUNK_TARGET(entry),
2782     [ss]"i"(__KERNEL_DS),
2783     [cs]"i"(__KERNEL_CS)
2784     );
2785     @@ -9362,6 +9424,15 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
2786    
2787     vmx_arm_hv_timer(vcpu);
2788    
2789     + /*
2790     + * If this vCPU has touched SPEC_CTRL, restore the guest's value if
2791     + * it's non-zero. Since vmentry is serialising on affected CPUs, there
2792     + * is no need to worry about the conditional branch over the wrmsr
2793     + * being speculatively taken.
2794     + */
2795     + if (vmx->spec_ctrl)
2796     + wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
2797     +
2798     vmx->__launched = vmx->loaded_vmcs->launched;
2799     asm(
2800     /* Store host registers */
2801     @@ -9480,6 +9551,27 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
2802     #endif
2803     );
2804    
2805     + /*
2806     + * We do not use IBRS in the kernel. If this vCPU has used the
2807     + * SPEC_CTRL MSR it may have left it on; save the value and
2808     + * turn it off. This is much more efficient than blindly adding
2809     + * it to the atomic save/restore list. Especially as the former
2810     + * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
2811     + *
2812     + * For non-nested case:
2813     + * If the L01 MSR bitmap does not intercept the MSR, then we need to
2814     + * save it.
2815     + *
2816     + * For nested case:
2817     + * If the L02 MSR bitmap does not intercept the MSR, then we need to
2818     + * save it.
2819     + */
2820     + if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))
2821     + rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
2822     +
2823     + if (vmx->spec_ctrl)
2824     + wrmsrl(MSR_IA32_SPEC_CTRL, 0);
2825     +
2826     /* Eliminate branch target predictions from guest mode */
2827     vmexit_fill_RSB();
2828    
2829     @@ -9594,6 +9686,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
2830     {
2831     int err;
2832     struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
2833     + unsigned long *msr_bitmap;
2834     int cpu;
2835    
2836     if (!vmx)
2837     @@ -9626,13 +9719,20 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
2838     if (!vmx->guest_msrs)
2839     goto free_pml;
2840    
2841     - vmx->loaded_vmcs = &vmx->vmcs01;
2842     - vmx->loaded_vmcs->vmcs = alloc_vmcs();
2843     - vmx->loaded_vmcs->shadow_vmcs = NULL;
2844     - if (!vmx->loaded_vmcs->vmcs)
2845     + err = alloc_loaded_vmcs(&vmx->vmcs01);
2846     + if (err < 0)
2847     goto free_msrs;
2848     - loaded_vmcs_init(vmx->loaded_vmcs);
2849    
2850     + msr_bitmap = vmx->vmcs01.msr_bitmap;
2851     + vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW);
2852     + vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW);
2853     + vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
2854     + vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
2855     + vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
2856     + vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
2857     + vmx->msr_bitmap_mode = 0;
2858     +
2859     + vmx->loaded_vmcs = &vmx->vmcs01;
2860     cpu = get_cpu();
2861     vmx_vcpu_load(&vmx->vcpu, cpu);
2862     vmx->vcpu.cpu = cpu;
2863     @@ -10101,10 +10201,25 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
2864     int msr;
2865     struct page *page;
2866     unsigned long *msr_bitmap_l1;
2867     - unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap;
2868     + unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
2869     + /*
2870     + * pred_cmd & spec_ctrl are trying to verify two things:
2871     + *
2872     + * 1. L0 gave a permission to L1 to actually passthrough the MSR. This
2873     + * ensures that we do not accidentally generate an L02 MSR bitmap
2874     + * from the L12 MSR bitmap that is too permissive.
2875     + * 2. That L1 or L2s have actually used the MSR. This avoids
2876     + * unnecessarily merging of the bitmap if the MSR is unused. This
2877     + * works properly because we only update the L01 MSR bitmap lazily.
2878     + * So even if L0 should pass L1 these MSRs, the L01 bitmap is only
2879     + * updated to reflect this when L1 (or its L2s) actually write to
2880     + * the MSR.
2881     + */
2882     + bool pred_cmd = msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD);
2883     + bool spec_ctrl = msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL);
2884    
2885     - /* This shortcut is ok because we support only x2APIC MSRs so far. */
2886     - if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
2887     + if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
2888     + !pred_cmd && !spec_ctrl)
2889     return false;
2890    
2891     page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap);
2892     @@ -10137,6 +10252,19 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
2893     MSR_TYPE_W);
2894     }
2895     }
2896     +
2897     + if (spec_ctrl)
2898     + nested_vmx_disable_intercept_for_msr(
2899     + msr_bitmap_l1, msr_bitmap_l0,
2900     + MSR_IA32_SPEC_CTRL,
2901     + MSR_TYPE_R | MSR_TYPE_W);
2902     +
2903     + if (pred_cmd)
2904     + nested_vmx_disable_intercept_for_msr(
2905     + msr_bitmap_l1, msr_bitmap_l0,
2906     + MSR_IA32_PRED_CMD,
2907     + MSR_TYPE_W);
2908     +
2909     kunmap(page);
2910     kvm_release_page_clean(page);
2911    
2912     @@ -10678,6 +10806,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
2913     if (kvm_has_tsc_control)
2914     decache_tsc_multiplier(vmx);
2915    
2916     + if (cpu_has_vmx_msr_bitmap())
2917     + vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
2918     +
2919     if (enable_vpid) {
2920     /*
2921     * There is no direct mapping between vpid02 and vpid12, the
2922     @@ -10894,20 +11025,15 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
2923     {
2924     struct vcpu_vmx *vmx = to_vmx(vcpu);
2925     struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2926     - struct loaded_vmcs *vmcs02;
2927     u32 msr_entry_idx;
2928     u32 exit_qual;
2929    
2930     - vmcs02 = nested_get_current_vmcs02(vmx);
2931     - if (!vmcs02)
2932     - return -ENOMEM;
2933     -
2934     enter_guest_mode(vcpu);
2935    
2936     if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
2937     vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
2938    
2939     - vmx_switch_vmcs(vcpu, vmcs02);
2940     + vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
2941     vmx_segment_cache_clear(vmx);
2942    
2943     if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) {
2944     @@ -11476,7 +11602,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
2945     vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
2946    
2947     if (cpu_has_vmx_msr_bitmap())
2948     - vmx_set_msr_bitmap(vcpu);
2949     + vmx_update_msr_bitmap(vcpu);
2950    
2951     if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
2952     vmcs12->vm_exit_msr_load_count))
2953     @@ -11522,10 +11648,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
2954     vm_exit_controls_reset_shadow(vmx);
2955     vmx_segment_cache_clear(vmx);
2956    
2957     - /* if no vmcs02 cache requested, remove the one we used */
2958     - if (VMCS02_POOL_SIZE == 0)
2959     - nested_free_vmcs02(vmx, vmx->nested.current_vmptr);
2960     -
2961     /* Update any VMCS fields that might have changed while L2 ran */
2962     vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
2963     vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
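
The vmx.c hunks above only conditionally touch MSR_IA32_SPEC_CTRL: the guest value is
reinstated just before vmentry when it is non-zero, and it is read back after vmexit only
when the active MSR bitmap lets the guest write it directly, so the rdmsr is skipped for
guests that never used the MSR. A condensed sketch of that flow, for orientation only
(vmx->spec_ctrl, msr_write_intercepted(), wrmsrl()/rdmsrl() and vmexit_fill_RSB() are
taken from the hunks above; everything else in vmx_vcpu_run() is elided and the function
name is invented for the sketch):

    /* Sketch, not the full function. */
    static void sketch_vcpu_run(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
    {
            /* Re-enter with the guest's SPEC_CTRL only if the guest set it. */
            if (vmx->spec_ctrl)
                    wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);

            /* ... VMLAUNCH / VMRESUME ... */

            /*
             * Save the guest value only when the bitmap gives the guest
             * direct write access, then force the host value (0) back.
             */
            if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))
                    rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
            if (vmx->spec_ctrl)
                    wrmsrl(MSR_IA32_SPEC_CTRL, 0);

            /* Drop guest-controlled return-stack predictions before returning to C. */
            vmexit_fill_RSB();
    }
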
2964     diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
2965     index 8c28023a43b1..f97358423f9c 100644
2966     --- a/arch/x86/kvm/x86.c
2967     +++ b/arch/x86/kvm/x86.c
2968     @@ -1006,6 +1006,7 @@ static u32 msrs_to_save[] = {
2969     #endif
2970     MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
2971     MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
2972     + MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES
2973     };
2974    
2975     static unsigned num_msrs_to_save;
2976     diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
2977     index d435c89875c1..d0a3170e6804 100644
2978     --- a/arch/x86/lib/Makefile
2979     +++ b/arch/x86/lib/Makefile
2980     @@ -27,6 +27,7 @@ lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
2981     lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
2982     lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
2983     lib-$(CONFIG_RETPOLINE) += retpoline.o
2984     +OBJECT_FILES_NON_STANDARD_retpoline.o :=y
2985    
2986     obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o
2987    
2988     diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S
2989     index c97d935a29e8..49b167f73215 100644
2990     --- a/arch/x86/lib/getuser.S
2991     +++ b/arch/x86/lib/getuser.S
2992     @@ -40,6 +40,8 @@ ENTRY(__get_user_1)
2993     mov PER_CPU_VAR(current_task), %_ASM_DX
2994     cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
2995     jae bad_get_user
2996     + sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
2997     + and %_ASM_DX, %_ASM_AX
2998     ASM_STAC
2999     1: movzbl (%_ASM_AX),%edx
3000     xor %eax,%eax
3001     @@ -54,6 +56,8 @@ ENTRY(__get_user_2)
3002     mov PER_CPU_VAR(current_task), %_ASM_DX
3003     cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
3004     jae bad_get_user
3005     + sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
3006     + and %_ASM_DX, %_ASM_AX
3007     ASM_STAC
3008     2: movzwl -1(%_ASM_AX),%edx
3009     xor %eax,%eax
3010     @@ -68,6 +72,8 @@ ENTRY(__get_user_4)
3011     mov PER_CPU_VAR(current_task), %_ASM_DX
3012     cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
3013     jae bad_get_user
3014     + sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
3015     + and %_ASM_DX, %_ASM_AX
3016     ASM_STAC
3017     3: movl -3(%_ASM_AX),%edx
3018     xor %eax,%eax
3019     @@ -83,6 +89,8 @@ ENTRY(__get_user_8)
3020     mov PER_CPU_VAR(current_task), %_ASM_DX
3021     cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
3022     jae bad_get_user
3023     + sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
3024     + and %_ASM_DX, %_ASM_AX
3025     ASM_STAC
3026     4: movq -7(%_ASM_AX),%rdx
3027     xor %eax,%eax
3028     @@ -94,6 +102,8 @@ ENTRY(__get_user_8)
3029     mov PER_CPU_VAR(current_task), %_ASM_DX
3030     cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
3031     jae bad_get_user_8
3032     + sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
3033     + and %_ASM_DX, %_ASM_AX
3034     ASM_STAC
3035     4: movl -7(%_ASM_AX),%edx
3036     5: movl -3(%_ASM_AX),%ecx
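
The sbb/and pairs added to each __get_user_* path above are the assembly form of the
array_index_mask_nospec() idea: the cmp against the address limit leaves a borrow exactly
when the user pointer is below the limit, sbb turns that borrow into an all-ones or
all-zero mask, and the and clamps the pointer so that even a mispredicted jae cannot make
the speculative load dereference a kernel address. A standalone C illustration of the same
masking trick (plain userspace code, all names and the example limit invented for the
demo; the real limit comes from TASK_addr_limit):

    #include <stdio.h>
    #include <stdint.h>

    /*
     * What cmp+sbb materialise: an all-ones mask when addr is below the
     * limit, an all-zero mask otherwise, with no conditional branch for
     * the CPU to mispredict.
     */
    static uintptr_t below_limit_mask(uintptr_t addr, uintptr_t limit)
    {
            return 0UL - (uintptr_t)(addr < limit);
    }

    int main(void)
    {
            uintptr_t limit = 0x00007ffffffff000UL;        /* example user limit */
            uintptr_t good  = 0x1000;
            uintptr_t bad   = 0xffff800000000000UL;        /* a kernel address */

            /* A valid pointer passes through unchanged; a bad one is forced to 0. */
            printf("good -> %#lx\n", (unsigned long)(good & below_limit_mask(good, limit)));
            printf("bad  -> %#lx\n", (unsigned long)(bad  & below_limit_mask(bad,  limit)));
            return 0;
    }

Architecturally the mask never changes the result of a valid access; it only matters on
the speculative path, before the branch outcome has been resolved.
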
3037     diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
3038     index dfb2ba91b670..480edc3a5e03 100644
3039     --- a/arch/x86/lib/retpoline.S
3040     +++ b/arch/x86/lib/retpoline.S
3041     @@ -7,6 +7,7 @@
3042     #include <asm/alternative-asm.h>
3043     #include <asm/export.h>
3044     #include <asm/nospec-branch.h>
3045     +#include <asm/bitsperlong.h>
3046    
3047     .macro THUNK reg
3048     .section .text.__x86.indirect_thunk
3049     @@ -36,7 +37,6 @@ GENERATE_THUNK(_ASM_DX)
3050     GENERATE_THUNK(_ASM_SI)
3051     GENERATE_THUNK(_ASM_DI)
3052     GENERATE_THUNK(_ASM_BP)
3053     -GENERATE_THUNK(_ASM_SP)
3054     #ifdef CONFIG_64BIT
3055     GENERATE_THUNK(r8)
3056     GENERATE_THUNK(r9)
3057     @@ -47,3 +47,58 @@ GENERATE_THUNK(r13)
3058     GENERATE_THUNK(r14)
3059     GENERATE_THUNK(r15)
3060     #endif
3061     +
3062     +/*
3063     + * Fill the CPU return stack buffer.
3064     + *
3065     + * Each entry in the RSB, if used for a speculative 'ret', contains an
3066     + * infinite 'pause; lfence; jmp' loop to capture speculative execution.
3067     + *
3068     + * This is required in various cases for retpoline and IBRS-based
3069     + * mitigations for the Spectre variant 2 vulnerability. Sometimes to
3070     + * eliminate potentially bogus entries from the RSB, and sometimes
3071     + * purely to ensure that it doesn't get empty, which on some CPUs would
3072     + * allow predictions from other (unwanted!) sources to be used.
3073     + *
3074     + * Google experimented with loop-unrolling and this turned out to be
3075     + * the optimal version - two calls, each with their own speculation
3076     + * trap should their return address end up getting used, in a loop.
3077     + */
3078     +.macro STUFF_RSB nr:req sp:req
3079     + mov $(\nr / 2), %_ASM_BX
3080     + .align 16
3081     +771:
3082     + call 772f
3083     +773: /* speculation trap */
3084     + pause
3085     + lfence
3086     + jmp 773b
3087     + .align 16
3088     +772:
3089     + call 774f
3090     +775: /* speculation trap */
3091     + pause
3092     + lfence
3093     + jmp 775b
3094     + .align 16
3095     +774:
3096     + dec %_ASM_BX
3097     + jnz 771b
3098     + add $((BITS_PER_LONG/8) * \nr), \sp
3099     +.endm
3100     +
3101     +#define RSB_FILL_LOOPS 16 /* To avoid underflow */
3102     +
3103     +ENTRY(__fill_rsb)
3104     + STUFF_RSB RSB_FILL_LOOPS, %_ASM_SP
3105     + ret
3106     +END(__fill_rsb)
3107     +EXPORT_SYMBOL_GPL(__fill_rsb)
3108     +
3109     +#define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */
3110     +
3111     +ENTRY(__clear_rsb)
3112     + STUFF_RSB RSB_CLEAR_LOOPS, %_ASM_SP
3113     + ret
3114     +END(__clear_rsb)
3115     +EXPORT_SYMBOL_GPL(__clear_rsb)
3116     diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
3117     index 1b377f734e64..7add8ba06887 100644
3118     --- a/arch/x86/lib/usercopy_32.c
3119     +++ b/arch/x86/lib/usercopy_32.c
3120     @@ -331,12 +331,12 @@ do { \
3121    
3122     unsigned long __copy_user_ll(void *to, const void *from, unsigned long n)
3123     {
3124     - stac();
3125     + __uaccess_begin_nospec();
3126     if (movsl_is_ok(to, from, n))
3127     __copy_user(to, from, n);
3128     else
3129     n = __copy_user_intel(to, from, n);
3130     - clac();
3131     + __uaccess_end();
3132     return n;
3133     }
3134     EXPORT_SYMBOL(__copy_user_ll);
3135     @@ -344,7 +344,7 @@ EXPORT_SYMBOL(__copy_user_ll);
3136     unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from,
3137     unsigned long n)
3138     {
3139     - stac();
3140     + __uaccess_begin_nospec();
3141     #ifdef CONFIG_X86_INTEL_USERCOPY
3142     if (n > 64 && static_cpu_has(X86_FEATURE_XMM2))
3143     n = __copy_user_intel_nocache(to, from, n);
3144     @@ -353,7 +353,7 @@ unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *fr
3145     #else
3146     __copy_user(to, from, n);
3147     #endif
3148     - clac();
3149     + __uaccess_end();
3150     return n;
3151     }
3152     EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero);
3153     diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
3154     index 5bfe61a5e8e3..012d02624848 100644
3155     --- a/arch/x86/mm/tlb.c
3156     +++ b/arch/x86/mm/tlb.c
3157     @@ -6,13 +6,14 @@
3158     #include <linux/interrupt.h>
3159     #include <linux/export.h>
3160     #include <linux/cpu.h>
3161     +#include <linux/debugfs.h>
3162    
3163     #include <asm/tlbflush.h>
3164     #include <asm/mmu_context.h>
3165     +#include <asm/nospec-branch.h>
3166     #include <asm/cache.h>
3167     #include <asm/apic.h>
3168     #include <asm/uv/uv.h>
3169     -#include <linux/debugfs.h>
3170    
3171     /*
3172     * TLB flushing, formerly SMP-only
3173     @@ -247,6 +248,27 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
3174     } else {
3175     u16 new_asid;
3176     bool need_flush;
3177     + u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
3178     +
3179     + /*
3180     + * Avoid user/user BTB poisoning by flushing the branch
3181     + * predictor when switching between processes. This stops
3182     + * one process from doing Spectre-v2 attacks on another.
3183     + *
3184     + * As an optimization, flush indirect branches only when
3185     + * switching into processes that disable dumping. This
3186     + * protects high value processes like gpg, without having
3187     + * too high performance overhead. IBPB is *expensive*!
3188     + *
3189     + * This will not flush branches when switching into kernel
3190     + * threads. It will also not flush if we switch to idle
3191     + * thread and back to the same process. It will flush if we
3192     + * switch to a different non-dumpable process.
3193     + */
3194     + if (tsk && tsk->mm &&
3195     + tsk->mm->context.ctx_id != last_ctx_id &&
3196     + get_dumpable(tsk->mm) != SUID_DUMP_USER)
3197     + indirect_branch_prediction_barrier();
3198    
3199     if (IS_ENABLED(CONFIG_VMAP_STACK)) {
3200     /*
3201     @@ -292,6 +314,14 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
3202     trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
3203     }
3204    
3205     + /*
3206     + * Record last user mm's context id, so we can avoid
3207     + * flushing branch buffer with IBPB if we switch back
3208     + * to the same user.
3209     + */
3210     + if (next != &init_mm)
3211     + this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
3212     +
3213     this_cpu_write(cpu_tlbstate.loaded_mm, next);
3214     this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
3215     }
3216     @@ -369,6 +399,7 @@ void initialize_tlbstate_and_flush(void)
3217     write_cr3(build_cr3(mm->pgd, 0));
3218    
3219     /* Reinitialize tlbstate. */
3220     + this_cpu_write(cpu_tlbstate.last_ctx_id, mm->context.ctx_id);
3221     this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
3222     this_cpu_write(cpu_tlbstate.next_asid, 1);
3223     this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
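
The switch_mm_irqs_off() hunks above implement the heuristic spelled out in the new
comment: issue an IBPB only when switching to a different user mm, and only when that mm
belongs to a process that has disabled dumping, with the last user mm's ctx_id cached so
that bouncing through the idle thread does not retrigger the barrier. A hedged restatement
of just that decision as a standalone predicate (ibpb_needed() is a name invented for this
sketch; get_dumpable(), SUID_DUMP_USER and context.ctx_id are the identifiers used in the
hunk):

    /* Sketch only: would the switch above issue an IBPB? */
    static bool ibpb_needed(struct task_struct *tsk, u64 last_ctx_id)
    {
            /* No user mm (kernel thread): nothing to protect, skip the barrier. */
            if (!tsk || !tsk->mm)
                    return false;

            /* Same user mm as last time (e.g. via the idle thread): skip. */
            if (tsk->mm->context.ctx_id == last_ctx_id)
                    return false;

            /* Only pay the IBPB cost for processes that opted out of dumping. */
            return get_dumpable(tsk->mm) != SUID_DUMP_USER;
    }
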
3224     diff --git a/drivers/auxdisplay/img-ascii-lcd.c b/drivers/auxdisplay/img-ascii-lcd.c
3225     index a9020f82eea7..58403052514f 100644
3226     --- a/drivers/auxdisplay/img-ascii-lcd.c
3227     +++ b/drivers/auxdisplay/img-ascii-lcd.c
3228     @@ -443,3 +443,7 @@ static struct platform_driver img_ascii_lcd_driver = {
3229     .remove = img_ascii_lcd_remove,
3230     };
3231     module_platform_driver(img_ascii_lcd_driver);
3232     +
3233     +MODULE_DESCRIPTION("Imagination Technologies ASCII LCD Display");
3234     +MODULE_AUTHOR("Paul Burton <paul.burton@mips.com>");
3235     +MODULE_LICENSE("GPL");
3236     diff --git a/drivers/fpga/fpga-region.c b/drivers/fpga/fpga-region.c
3237     index d9ab7c75b14f..e0c73ceba2ed 100644
3238     --- a/drivers/fpga/fpga-region.c
3239     +++ b/drivers/fpga/fpga-region.c
3240     @@ -147,6 +147,7 @@ static struct fpga_manager *fpga_region_get_manager(struct fpga_region *region)
3241     mgr_node = of_parse_phandle(np, "fpga-mgr", 0);
3242     if (mgr_node) {
3243     mgr = of_fpga_mgr_get(mgr_node);
3244     + of_node_put(mgr_node);
3245     of_node_put(np);
3246     return mgr;
3247     }
3248     @@ -192,10 +193,13 @@ static int fpga_region_get_bridges(struct fpga_region *region,
3249     parent_br = region_np->parent;
3250    
3251     /* If overlay has a list of bridges, use it. */
3252     - if (of_parse_phandle(overlay, "fpga-bridges", 0))
3253     + br = of_parse_phandle(overlay, "fpga-bridges", 0);
3254     + if (br) {
3255     + of_node_put(br);
3256     np = overlay;
3257     - else
3258     + } else {
3259     np = region_np;
3260     + }
3261    
3262     for (i = 0; ; i++) {
3263     br = of_parse_phandle(np, "fpga-bridges", i);
3264     @@ -203,12 +207,15 @@ static int fpga_region_get_bridges(struct fpga_region *region,
3265     break;
3266    
3267     /* If parent bridge is in list, skip it. */
3268     - if (br == parent_br)
3269     + if (br == parent_br) {
3270     + of_node_put(br);
3271     continue;
3272     + }
3273    
3274     /* If node is a bridge, get it and add to list */
3275     ret = fpga_bridge_get_to_list(br, region->info,
3276     &region->bridge_list);
3277     + of_node_put(br);
3278    
3279     /* If any of the bridges are in use, give up */
3280     if (ret == -EBUSY) {
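
The fpga-region changes are all one pattern: of_parse_phandle() returns a device_node with
its reference count raised, so every successful lookup has to be balanced with
of_node_put() on every path, including the early probe that only checks whether an
"fpga-bridges" list exists and the loop paths that skip or consume a bridge. A minimal
sketch of the corrected loop shape, assuming the same OF calls as the hunk (use_bridge()
is a placeholder standing in for the real fpga_bridge_get_to_list() work):

    struct device_node *br;
    int i;

    for (i = 0; ; i++) {
            br = of_parse_phandle(np, "fpga-bridges", i);
            if (!br)
                    break;

            if (br == parent_br) {
                    of_node_put(br);        /* reference still held, drop it before skipping */
                    continue;
            }

            use_bridge(br);                 /* placeholder for fpga_bridge_get_to_list() */
            of_node_put(br);                /* balance the get from of_parse_phandle() */
    }
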
3281     diff --git a/drivers/iio/accel/kxsd9-i2c.c b/drivers/iio/accel/kxsd9-i2c.c
3282     index 98fbb628d5bd..38411e1c155b 100644
3283     --- a/drivers/iio/accel/kxsd9-i2c.c
3284     +++ b/drivers/iio/accel/kxsd9-i2c.c
3285     @@ -63,3 +63,6 @@ static struct i2c_driver kxsd9_i2c_driver = {
3286     .id_table = kxsd9_i2c_id,
3287     };
3288     module_i2c_driver(kxsd9_i2c_driver);
3289     +
3290     +MODULE_LICENSE("GPL v2");
3291     +MODULE_DESCRIPTION("KXSD9 accelerometer I2C interface");
3292     diff --git a/drivers/iio/adc/qcom-vadc-common.c b/drivers/iio/adc/qcom-vadc-common.c
3293     index 47d24ae5462f..fe3d7826783c 100644
3294     --- a/drivers/iio/adc/qcom-vadc-common.c
3295     +++ b/drivers/iio/adc/qcom-vadc-common.c
3296     @@ -5,6 +5,7 @@
3297     #include <linux/math64.h>
3298     #include <linux/log2.h>
3299     #include <linux/err.h>
3300     +#include <linux/module.h>
3301    
3302     #include "qcom-vadc-common.h"
3303    
3304     @@ -229,3 +230,6 @@ int qcom_vadc_decimation_from_dt(u32 value)
3305     return __ffs64(value / VADC_DECIMATION_MIN);
3306     }
3307     EXPORT_SYMBOL(qcom_vadc_decimation_from_dt);
3308     +
3309     +MODULE_LICENSE("GPL v2");
3310     +MODULE_DESCRIPTION("Qualcomm ADC common functionality");
3311     diff --git a/drivers/pinctrl/pxa/pinctrl-pxa2xx.c b/drivers/pinctrl/pxa/pinctrl-pxa2xx.c
3312     index 866aa3ce1ac9..6cf0006d4c8d 100644
3313     --- a/drivers/pinctrl/pxa/pinctrl-pxa2xx.c
3314     +++ b/drivers/pinctrl/pxa/pinctrl-pxa2xx.c
3315     @@ -436,3 +436,7 @@ int pxa2xx_pinctrl_exit(struct platform_device *pdev)
3316     return 0;
3317     }
3318     EXPORT_SYMBOL_GPL(pxa2xx_pinctrl_exit);
3319     +
3320     +MODULE_AUTHOR("Robert Jarzmik <robert.jarzmik@free.fr>");
3321     +MODULE_DESCRIPTION("Marvell PXA2xx pinctrl driver");
3322     +MODULE_LICENSE("GPL v2");
3323     diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
3324     index 3a14cccbd7ff..7948acf14601 100644
3325     --- a/drivers/tty/serial/serial_core.c
3326     +++ b/drivers/tty/serial/serial_core.c
3327     @@ -987,6 +987,8 @@ static int uart_set_info(struct tty_struct *tty, struct tty_port *port,
3328     }
3329     } else {
3330     retval = uart_startup(tty, state, 1);
3331     + if (retval == 0)
3332     + tty_port_set_initialized(port, true);
3333     if (retval > 0)
3334     retval = 0;
3335     }
3336     diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
3337     index 1c65817673db..41615f38bcff 100644
3338     --- a/include/linux/fdtable.h
3339     +++ b/include/linux/fdtable.h
3340     @@ -10,6 +10,7 @@
3341     #include <linux/compiler.h>
3342     #include <linux/spinlock.h>
3343     #include <linux/rcupdate.h>
3344     +#include <linux/nospec.h>
3345     #include <linux/types.h>
3346     #include <linux/init.h>
3347     #include <linux/fs.h>
3348     @@ -82,8 +83,10 @@ static inline struct file *__fcheck_files(struct files_struct *files, unsigned i
3349     {
3350     struct fdtable *fdt = rcu_dereference_raw(files->fdt);
3351    
3352     - if (fd < fdt->max_fds)
3353     + if (fd < fdt->max_fds) {
3354     + fd = array_index_nospec(fd, fdt->max_fds);
3355     return rcu_dereference_raw(fdt->fd[fd]);
3356     + }
3357     return NULL;
3358     }
3359    
3360     diff --git a/include/linux/init.h b/include/linux/init.h
3361     index f38b993edacb..943139a563e3 100644
3362     --- a/include/linux/init.h
3363     +++ b/include/linux/init.h
3364     @@ -5,6 +5,13 @@
3365     #include <linux/compiler.h>
3366     #include <linux/types.h>
3367    
3368     +/* Built-in __init functions needn't be compiled with retpoline */
3369     +#if defined(RETPOLINE) && !defined(MODULE)
3370     +#define __noretpoline __attribute__((indirect_branch("keep")))
3371     +#else
3372     +#define __noretpoline
3373     +#endif
3374     +
3375     /* These macros are used to mark some functions or
3376     * initialized data (doesn't apply to uninitialized data)
3377     * as `initialization' functions. The kernel can take this
3378     @@ -40,7 +47,7 @@
3379    
3380     /* These are for everybody (although not all archs will actually
3381     discard it in modules) */
3382     -#define __init __section(.init.text) __cold __inittrace __latent_entropy
3383     +#define __init __section(.init.text) __cold __inittrace __latent_entropy __noretpoline
3384     #define __initdata __section(.init.data)
3385     #define __initconst __section(.init.rodata)
3386     #define __exitdata __section(.exit.data)
3387     diff --git a/include/linux/module.h b/include/linux/module.h
3388     index fe5aa3736707..b1cc541f2ddf 100644
3389     --- a/include/linux/module.h
3390     +++ b/include/linux/module.h
3391     @@ -794,6 +794,15 @@ static inline void module_bug_finalize(const Elf_Ehdr *hdr,
3392     static inline void module_bug_cleanup(struct module *mod) {}
3393     #endif /* CONFIG_GENERIC_BUG */
3394    
3395     +#ifdef RETPOLINE
3396     +extern bool retpoline_module_ok(bool has_retpoline);
3397     +#else
3398     +static inline bool retpoline_module_ok(bool has_retpoline)
3399     +{
3400     + return true;
3401     +}
3402     +#endif
3403     +
3404     #ifdef CONFIG_MODULE_SIG
3405     static inline bool module_sig_ok(struct module *module)
3406     {
3407     diff --git a/include/linux/nospec.h b/include/linux/nospec.h
3408     new file mode 100644
3409     index 000000000000..b99bced39ac2
3410     --- /dev/null
3411     +++ b/include/linux/nospec.h
3412     @@ -0,0 +1,72 @@
3413     +// SPDX-License-Identifier: GPL-2.0
3414     +// Copyright(c) 2018 Linus Torvalds. All rights reserved.
3415     +// Copyright(c) 2018 Alexei Starovoitov. All rights reserved.
3416     +// Copyright(c) 2018 Intel Corporation. All rights reserved.
3417     +
3418     +#ifndef _LINUX_NOSPEC_H
3419     +#define _LINUX_NOSPEC_H
3420     +
3421     +/**
3422     + * array_index_mask_nospec() - generate a ~0 mask when index < size, 0 otherwise
3423     + * @index: array element index
3424     + * @size: number of elements in array
3425     + *
3426     + * When @index is out of bounds (@index >= @size), the sign bit will be
3427     + * set. Extend the sign bit to all bits and invert, giving a result of
3428     + * zero for an out of bounds index, or ~0 if within bounds [0, @size).
3429     + */
3430     +#ifndef array_index_mask_nospec
3431     +static inline unsigned long array_index_mask_nospec(unsigned long index,
3432     + unsigned long size)
3433     +{
3434     + /*
3435     + * Warn developers about inappropriate array_index_nospec() usage.
3436     + *
3437     + * Even if the CPU speculates past the WARN_ONCE branch, the
3438     + * sign bit of @index is taken into account when generating the
3439     + * mask.
3440     + *
3441     + * This warning is compiled out when the compiler can infer that
3442     + * @index and @size are less than LONG_MAX.
3443     + */
3444     + if (WARN_ONCE(index > LONG_MAX || size > LONG_MAX,
3445     + "array_index_nospec() limited to range of [0, LONG_MAX]\n"))
3446     + return 0;
3447     +
3448     + /*
3449     + * Always calculate and emit the mask even if the compiler
3450     + * thinks the mask is not needed. The compiler does not take
3451     + * into account the value of @index under speculation.
3452     + */
3453     + OPTIMIZER_HIDE_VAR(index);
3454     + return ~(long)(index | (size - 1UL - index)) >> (BITS_PER_LONG - 1);
3455     +}
3456     +#endif
3457     +
3458     +/*
3459     + * array_index_nospec - sanitize an array index after a bounds check
3460     + *
3461     + * For a code sequence like:
3462     + *
3463     + * if (index < size) {
3464     + * index = array_index_nospec(index, size);
3465     + * val = array[index];
3466     + * }
3467     + *
3468     + * ...if the CPU speculates past the bounds check then
3469     + * array_index_nospec() will clamp the index within the range of [0,
3470     + * size).
3471     + */
3472     +#define array_index_nospec(index, size) \
3473     +({ \
3474     + typeof(index) _i = (index); \
3475     + typeof(size) _s = (size); \
3476     + unsigned long _mask = array_index_mask_nospec(_i, _s); \
3477     + \
3478     + BUILD_BUG_ON(sizeof(_i) > sizeof(long)); \
3479     + BUILD_BUG_ON(sizeof(_s) > sizeof(long)); \
3480     + \
3481     + _i &= _mask; \
3482     + _i; \
3483     +})
3484     +#endif /* _LINUX_NOSPEC_H */
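
array_index_nospec() is intended to be dropped in right after an existing bounds check,
exactly as the __fcheck_files() and parse_txq_params() hunks elsewhere in this patch do. A
small self-contained illustration of the pattern and of the mask formula above (ordinary
userspace C; the helper is re-typed here for the demo rather than pulled from the kernel
header, and the WARN_ONCE/BUILD_BUG_ON plumbing is omitted):

    #include <stdio.h>

    #define BITS_PER_LONG (sizeof(long) * 8)

    /* Same expression as the generic helper above: ~0UL if index < size, else 0. */
    static unsigned long index_mask(unsigned long index, unsigned long size)
    {
            return ~(long)(index | (size - 1UL - index)) >> (BITS_PER_LONG - 1);
    }

    int main(void)
    {
            int array[8] = { 10, 11, 12, 13, 14, 15, 16, 17 };
            unsigned long size = 8, idx;

            for (idx = 0; idx < 12; idx++) {
                    if (idx < size) {
                            /*
                             * Architecturally this leaves idx unchanged; it only
                             * matters if the CPU speculates past the bounds check.
                             */
                            unsigned long safe = idx & index_mask(idx, size);

                            printf("idx %2lu -> array[%lu] = %d\n", idx, safe, array[safe]);
                    } else {
                            printf("idx %2lu rejected by the bounds check\n", idx);
                    }
            }
            return 0;
    }
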
3485     diff --git a/kernel/module.c b/kernel/module.c
3486     index de66ec825992..690c0651c40f 100644
3487     --- a/kernel/module.c
3488     +++ b/kernel/module.c
3489     @@ -2855,6 +2855,15 @@ static int check_modinfo_livepatch(struct module *mod, struct load_info *info)
3490     }
3491     #endif /* CONFIG_LIVEPATCH */
3492    
3493     +static void check_modinfo_retpoline(struct module *mod, struct load_info *info)
3494     +{
3495     + if (retpoline_module_ok(get_modinfo(info, "retpoline")))
3496     + return;
3497     +
3498     + pr_warn("%s: loading module not compiled with retpoline compiler.\n",
3499     + mod->name);
3500     +}
3501     +
3502     /* Sets info->hdr and info->len. */
3503     static int copy_module_from_user(const void __user *umod, unsigned long len,
3504     struct load_info *info)
3505     @@ -3021,6 +3030,8 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
3506     add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
3507     }
3508    
3509     + check_modinfo_retpoline(mod, info);
3510     +
3511     if (get_modinfo(info, "staging")) {
3512     add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
3513     pr_warn("%s: module is from the staging directory, the quality "
3514     diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
3515     index d396cb61a280..81bef0676e1d 100644
3516     --- a/net/wireless/nl80211.c
3517     +++ b/net/wireless/nl80211.c
3518     @@ -16,6 +16,7 @@
3519     #include <linux/nl80211.h>
3520     #include <linux/rtnetlink.h>
3521     #include <linux/netlink.h>
3522     +#include <linux/nospec.h>
3523     #include <linux/etherdevice.h>
3524     #include <net/net_namespace.h>
3525     #include <net/genetlink.h>
3526     @@ -2056,20 +2057,22 @@ static const struct nla_policy txq_params_policy[NL80211_TXQ_ATTR_MAX + 1] = {
3527     static int parse_txq_params(struct nlattr *tb[],
3528     struct ieee80211_txq_params *txq_params)
3529     {
3530     + u8 ac;
3531     +
3532     if (!tb[NL80211_TXQ_ATTR_AC] || !tb[NL80211_TXQ_ATTR_TXOP] ||
3533     !tb[NL80211_TXQ_ATTR_CWMIN] || !tb[NL80211_TXQ_ATTR_CWMAX] ||
3534     !tb[NL80211_TXQ_ATTR_AIFS])
3535     return -EINVAL;
3536    
3537     - txq_params->ac = nla_get_u8(tb[NL80211_TXQ_ATTR_AC]);
3538     + ac = nla_get_u8(tb[NL80211_TXQ_ATTR_AC]);
3539     txq_params->txop = nla_get_u16(tb[NL80211_TXQ_ATTR_TXOP]);
3540     txq_params->cwmin = nla_get_u16(tb[NL80211_TXQ_ATTR_CWMIN]);
3541     txq_params->cwmax = nla_get_u16(tb[NL80211_TXQ_ATTR_CWMAX]);
3542     txq_params->aifs = nla_get_u8(tb[NL80211_TXQ_ATTR_AIFS]);
3543    
3544     - if (txq_params->ac >= NL80211_NUM_ACS)
3545     + if (ac >= NL80211_NUM_ACS)
3546     return -EINVAL;
3547     -
3548     + txq_params->ac = array_index_nospec(ac, NL80211_NUM_ACS);
3549     return 0;
3550     }
3551    
3552     diff --git a/scripts/faddr2line b/scripts/faddr2line
3553     index 39e07d8574dd..7721d5b2b0c0 100755
3554     --- a/scripts/faddr2line
3555     +++ b/scripts/faddr2line
3556     @@ -44,10 +44,10 @@
3557     set -o errexit
3558     set -o nounset
3559    
3560     -READELF="${CROSS_COMPILE}readelf"
3561     -ADDR2LINE="${CROSS_COMPILE}addr2line"
3562     -SIZE="${CROSS_COMPILE}size"
3563     -NM="${CROSS_COMPILE}nm"
3564     +READELF="${CROSS_COMPILE:-}readelf"
3565     +ADDR2LINE="${CROSS_COMPILE:-}addr2line"
3566     +SIZE="${CROSS_COMPILE:-}size"
3567     +NM="${CROSS_COMPILE:-}nm"
3568    
3569     command -v awk >/dev/null 2>&1 || die "awk isn't installed"
3570     command -v ${READELF} >/dev/null 2>&1 || die "readelf isn't installed"
3571     diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
3572     index 98314b400a95..54deaa1066cf 100644
3573     --- a/scripts/mod/modpost.c
3574     +++ b/scripts/mod/modpost.c
3575     @@ -2165,6 +2165,14 @@ static void add_intree_flag(struct buffer *b, int is_intree)
3576     buf_printf(b, "\nMODULE_INFO(intree, \"Y\");\n");
3577     }
3578    
3579     +/* Cannot check for assembler */
3580     +static void add_retpoline(struct buffer *b)
3581     +{
3582     + buf_printf(b, "\n#ifdef RETPOLINE\n");
3583     + buf_printf(b, "MODULE_INFO(retpoline, \"Y\");\n");
3584     + buf_printf(b, "#endif\n");
3585     +}
3586     +
3587     static void add_staging_flag(struct buffer *b, const char *name)
3588     {
3589     static const char *staging_dir = "drivers/staging";
3590     @@ -2506,6 +2514,7 @@ int main(int argc, char **argv)
3591     err |= check_modname_len(mod);
3592     add_header(&buf, mod);
3593     add_intree_flag(&buf, !external_module);
3594     + add_retpoline(&buf);
3595     add_staging_flag(&buf, mod->name);
3596     err |= add_versions(&buf, mod);
3597     add_depends(&buf, mod, modules);
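
add_retpoline() only stamps metadata: modpost appends the block below to every generated
*.mod.c, so a module built with a retpoline-capable compiler (which defines RETPOLINE)
ends up carrying retpoline=Y in its modinfo, and check_modinfo_retpoline() in
kernel/module.c above warns at load time when that tag is missing. The emitted fragment is
exactly what the three buf_printf() calls produce:

    #ifdef RETPOLINE
    MODULE_INFO(retpoline, "Y");
    #endif

The "Cannot check for assembler" note reflects that this only attests to what the C
compiler did; hand-written assembly in the module is not covered by the flag.
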
3598     diff --git a/sound/soc/codecs/pcm512x-spi.c b/sound/soc/codecs/pcm512x-spi.c
3599     index 712ed6598c48..ebdf9bd5a64c 100644
3600     --- a/sound/soc/codecs/pcm512x-spi.c
3601     +++ b/sound/soc/codecs/pcm512x-spi.c
3602     @@ -70,3 +70,7 @@ static struct spi_driver pcm512x_spi_driver = {
3603     };
3604    
3605     module_spi_driver(pcm512x_spi_driver);
3606     +
3607     +MODULE_DESCRIPTION("ASoC PCM512x codec driver - SPI");
3608     +MODULE_AUTHOR("Mark Brown <broonie@kernel.org>");
3609     +MODULE_LICENSE("GPL v2");
3610     diff --git a/tools/objtool/check.c b/tools/objtool/check.c
3611     index f40d46e24bcc..9cd028aa1509 100644
3612     --- a/tools/objtool/check.c
3613     +++ b/tools/objtool/check.c
3614     @@ -543,18 +543,14 @@ static int add_call_destinations(struct objtool_file *file)
3615     dest_off = insn->offset + insn->len + insn->immediate;
3616     insn->call_dest = find_symbol_by_offset(insn->sec,
3617     dest_off);
3618     - /*
3619     - * FIXME: Thanks to retpolines, it's now considered
3620     - * normal for a function to call within itself. So
3621     - * disable this warning for now.
3622     - */
3623     -#if 0
3624     - if (!insn->call_dest) {
3625     - WARN_FUNC("can't find call dest symbol at offset 0x%lx",
3626     - insn->sec, insn->offset, dest_off);
3627     +
3628     + if (!insn->call_dest && !insn->ignore) {
3629     + WARN_FUNC("unsupported intra-function call",
3630     + insn->sec, insn->offset);
3631     + WARN("If this is a retpoline, please patch it in with alternatives and annotate it with ANNOTATE_NOSPEC_ALTERNATIVE.");
3632     return -1;
3633     }
3634     -#endif
3635     +
3636     } else if (rela->sym->type == STT_SECTION) {
3637     insn->call_dest = find_symbol_by_offset(rela->sym->sec,
3638     rela->addend+4);
3639     @@ -598,7 +594,7 @@ static int handle_group_alt(struct objtool_file *file,
3640     struct instruction *orig_insn,
3641     struct instruction **new_insn)
3642     {
3643     - struct instruction *last_orig_insn, *last_new_insn, *insn, *fake_jump;
3644     + struct instruction *last_orig_insn, *last_new_insn, *insn, *fake_jump = NULL;
3645     unsigned long dest_off;
3646    
3647     last_orig_insn = NULL;
3648     @@ -614,28 +610,30 @@ static int handle_group_alt(struct objtool_file *file,
3649     last_orig_insn = insn;
3650     }
3651    
3652     - if (!next_insn_same_sec(file, last_orig_insn)) {
3653     - WARN("%s: don't know how to handle alternatives at end of section",
3654     - special_alt->orig_sec->name);
3655     - return -1;
3656     - }
3657     -
3658     - fake_jump = malloc(sizeof(*fake_jump));
3659     - if (!fake_jump) {
3660     - WARN("malloc failed");
3661     - return -1;
3662     + if (next_insn_same_sec(file, last_orig_insn)) {
3663     + fake_jump = malloc(sizeof(*fake_jump));
3664     + if (!fake_jump) {
3665     + WARN("malloc failed");
3666     + return -1;
3667     + }
3668     + memset(fake_jump, 0, sizeof(*fake_jump));
3669     + INIT_LIST_HEAD(&fake_jump->alts);
3670     + clear_insn_state(&fake_jump->state);
3671     +
3672     + fake_jump->sec = special_alt->new_sec;
3673     + fake_jump->offset = -1;
3674     + fake_jump->type = INSN_JUMP_UNCONDITIONAL;
3675     + fake_jump->jump_dest = list_next_entry(last_orig_insn, list);
3676     + fake_jump->ignore = true;
3677     }
3678     - memset(fake_jump, 0, sizeof(*fake_jump));
3679     - INIT_LIST_HEAD(&fake_jump->alts);
3680     - clear_insn_state(&fake_jump->state);
3681     -
3682     - fake_jump->sec = special_alt->new_sec;
3683     - fake_jump->offset = -1;
3684     - fake_jump->type = INSN_JUMP_UNCONDITIONAL;
3685     - fake_jump->jump_dest = list_next_entry(last_orig_insn, list);
3686     - fake_jump->ignore = true;
3687    
3688     if (!special_alt->new_len) {
3689     + if (!fake_jump) {
3690     + WARN("%s: empty alternative at end of section",
3691     + special_alt->orig_sec->name);
3692     + return -1;
3693     + }
3694     +
3695     *new_insn = fake_jump;
3696     return 0;
3697     }
3698     @@ -648,6 +646,8 @@ static int handle_group_alt(struct objtool_file *file,
3699    
3700     last_new_insn = insn;
3701    
3702     + insn->ignore = orig_insn->ignore_alts;
3703     +
3704     if (insn->type != INSN_JUMP_CONDITIONAL &&
3705     insn->type != INSN_JUMP_UNCONDITIONAL)
3706     continue;
3707     @@ -656,8 +656,14 @@ static int handle_group_alt(struct objtool_file *file,
3708     continue;
3709    
3710     dest_off = insn->offset + insn->len + insn->immediate;
3711     - if (dest_off == special_alt->new_off + special_alt->new_len)
3712     + if (dest_off == special_alt->new_off + special_alt->new_len) {
3713     + if (!fake_jump) {
3714     + WARN("%s: alternative jump to end of section",
3715     + special_alt->orig_sec->name);
3716     + return -1;
3717     + }
3718     insn->jump_dest = fake_jump;
3719     + }
3720    
3721     if (!insn->jump_dest) {
3722     WARN_FUNC("can't find alternative jump destination",
3723     @@ -672,7 +678,8 @@ static int handle_group_alt(struct objtool_file *file,
3724     return -1;
3725     }
3726    
3727     - list_add(&fake_jump->list, &last_new_insn->list);
3728     + if (fake_jump)
3729     + list_add(&fake_jump->list, &last_new_insn->list);
3730    
3731     return 0;
3732     }
3733     @@ -729,10 +736,6 @@ static int add_special_section_alts(struct objtool_file *file)
3734     goto out;
3735     }
3736    
3737     - /* Ignore retpoline alternatives. */
3738     - if (orig_insn->ignore_alts)
3739     - continue;
3740     -
3741     new_insn = NULL;
3742     if (!special_alt->group || special_alt->new_len) {
3743     new_insn = find_insn(file, special_alt->new_sec,
3744     @@ -1089,11 +1092,11 @@ static int decode_sections(struct objtool_file *file)
3745     if (ret)
3746     return ret;
3747    
3748     - ret = add_call_destinations(file);
3749     + ret = add_special_section_alts(file);
3750     if (ret)
3751     return ret;
3752    
3753     - ret = add_special_section_alts(file);
3754     + ret = add_call_destinations(file);
3755     if (ret)
3756     return ret;
3757    
3758     @@ -1720,10 +1723,12 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
3759    
3760     insn->visited = true;
3761    
3762     - list_for_each_entry(alt, &insn->alts, list) {
3763     - ret = validate_branch(file, alt->insn, state);
3764     - if (ret)
3765     - return 1;
3766     + if (!insn->ignore_alts) {
3767     + list_for_each_entry(alt, &insn->alts, list) {
3768     + ret = validate_branch(file, alt->insn, state);
3769     + if (ret)
3770     + return 1;
3771     + }
3772     }
3773    
3774     switch (insn->type) {
3775     diff --git a/tools/objtool/orc_gen.c b/tools/objtool/orc_gen.c
3776     index e61fe703197b..18384d9be4e1 100644
3777     --- a/tools/objtool/orc_gen.c
3778     +++ b/tools/objtool/orc_gen.c
3779     @@ -98,6 +98,11 @@ static int create_orc_entry(struct section *u_sec, struct section *ip_relasec,
3780     struct orc_entry *orc;
3781     struct rela *rela;
3782    
3783     + if (!insn_sec->sym) {
3784     + WARN("missing symbol for section %s", insn_sec->name);
3785     + return -1;
3786     + }
3787     +
3788     /* populate ORC data */
3789     orc = (struct orc_entry *)u_sec->data->d_buf + idx;
3790     memcpy(orc, o, sizeof(*orc));