Magellan Linux

Contents of /trunk/kernel-magellan/patches-4.15/0101-4.15.2-all-fixes.patch



Revision 3085
Wed Mar 21 14:52:15 2018 UTC by niro
File size: 122489 bytes
-linux-4.15.2
1 diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
2 index 46b26bfee27b..1e762c210f1b 100644
3 --- a/Documentation/admin-guide/kernel-parameters.txt
4 +++ b/Documentation/admin-guide/kernel-parameters.txt
5 @@ -2742,8 +2742,6 @@
6 norandmaps Don't use address space randomization. Equivalent to
7 echo 0 > /proc/sys/kernel/randomize_va_space
8
9 - noreplace-paravirt [X86,IA-64,PV_OPS] Don't patch paravirt_ops
10 -
11 noreplace-smp [X86-32,SMP] Don't replace SMP instructions
12 with UP alternatives
13
14 diff --git a/Documentation/speculation.txt b/Documentation/speculation.txt
15 new file mode 100644
16 index 000000000000..e9e6cbae2841
17 --- /dev/null
18 +++ b/Documentation/speculation.txt
19 @@ -0,0 +1,90 @@
20 +This document explains potential effects of speculation, and how undesirable
21 +effects can be mitigated portably using common APIs.
22 +
23 +===========
24 +Speculation
25 +===========
26 +
27 +To improve performance and minimize average latencies, many contemporary CPUs
28 +employ speculative execution techniques such as branch prediction, performing
29 +work which may be discarded at a later stage.
30 +
31 +Typically speculative execution cannot be observed from architectural state,
32 +such as the contents of registers. However, in some cases it is possible to
33 +observe its impact on microarchitectural state, such as the presence or
34 +absence of data in caches. Such state may form side-channels which can be
35 +observed to extract secret information.
36 +
37 +For example, in the presence of branch prediction, it is possible for bounds
38 +checks to be ignored by code which is speculatively executed. Consider the
39 +following code:
40 +
41 + int load_array(int *array, unsigned int index)
42 + {
43 + if (index >= MAX_ARRAY_ELEMS)
44 + return 0;
45 + else
46 + return array[index];
47 + }
48 +
49 +Which, on arm64, may be compiled to an assembly sequence such as:
50 +
51 + CMP <index>, #MAX_ARRAY_ELEMS
52 + B.LT less
53 + MOV <returnval>, #0
54 + RET
55 + less:
56 + LDR <returnval>, [<array>, <index>]
57 + RET
58 +
59 +It is possible that a CPU mis-predicts the conditional branch, and
60 +speculatively loads array[index], even if index >= MAX_ARRAY_ELEMS. This
61 +value will subsequently be discarded, but the speculated load may affect
62 +microarchitectural state which can be subsequently measured.
63 +
64 +More complex sequences involving multiple dependent memory accesses may
65 +result in sensitive information being leaked. Consider the following
66 +code, building on the prior example:
67 +
68 + int load_dependent_arrays(int *arr1, int *arr2, int index)
69 + {
70 + int val1, val2,
71 +
72 + val1 = load_array(arr1, index);
73 + val2 = load_array(arr2, val1);
74 +
75 + return val2;
76 + }
77 +
78 +Under speculation, the first call to load_array() may return the value
79 +of an out-of-bounds address, while the second call will influence
80 +microarchitectural state dependent on this value. This may provide an
81 +arbitrary read primitive.
82 +
83 +====================================
84 +Mitigating speculation side-channels
85 +====================================
86 +
87 +The kernel provides a generic API to ensure that bounds checks are
88 +respected even under speculation. Architectures which are affected by
89 +speculation-based side-channels are expected to implement these
90 +primitives.
91 +
92 +The array_index_nospec() helper in <linux/nospec.h> can be used to
93 +prevent information from being leaked via side-channels.
94 +
95 +A call to array_index_nospec(index, size) returns a sanitized index
96 +value that is bounded to [0, size) even under cpu speculation
97 +conditions.
98 +
99 +This can be used to protect the earlier load_array() example:
100 +
101 + int load_array(int *array, unsigned int index)
102 + {
103 + if (index >= MAX_ARRAY_ELEMS)
104 + return 0;
105 + else {
106 + index = array_index_nospec(index, MAX_ARRAY_ELEMS);
107 + return array[index];
108 + }
109 + }
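
Editor's aside: the speculation.txt examples above are illustrative fragments inside the patch; the stand-alone C sketch below is not part of the patch — index_mask(), load_array() as written here and the test data are invented for the demo — and simply shows the documented array_index_nospec() behaviour in user space: the sanitized index always stays in [0, MAX_ARRAY_ELEMS), so even a mispredicted bounds check cannot steer the load outside the array. The real kernel helper derives the mask without a conditional branch; the x86 version appears later in this patch in the barrier.h hunk.

	/* Editor's illustration, not part of the patch: a user-space stand-in
	 * for array_index_nospec() showing the clamp-to-[0, size) semantics
	 * documented above. index_mask() here is a plain C placeholder; the
	 * kernel computes the equivalent mask without a conditional branch. */
	#include <stdio.h>

	#define MAX_ARRAY_ELEMS 8

	static unsigned long index_mask(unsigned long index, unsigned long size)
	{
		return index < size ? ~0UL : 0UL;	/* ~0UL when in bounds, else 0 */
	}

	static int load_array(const int *array, unsigned int index)
	{
		if (index >= MAX_ARRAY_ELEMS)
			return 0;
		index &= index_mask(index, MAX_ARRAY_ELEMS);	/* sanitize the index */
		return array[index];
	}

	int main(void)
	{
		int data[MAX_ARRAY_ELEMS] = { 10, 11, 12, 13, 14, 15, 16, 17 };

		printf("%d\n", load_array(data, 3));	/* in bounds     -> 13 */
		printf("%d\n", load_array(data, 42));	/* out of bounds -> 0  */
		return 0;
	}
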
110 diff --git a/Makefile b/Makefile
111 index af101b556ba0..54f1bc10b531 100644
112 --- a/Makefile
113 +++ b/Makefile
114 @@ -1,7 +1,7 @@
115 # SPDX-License-Identifier: GPL-2.0
116 VERSION = 4
117 PATCHLEVEL = 15
118 -SUBLEVEL = 1
119 +SUBLEVEL = 2
120 EXTRAVERSION =
121 NAME = Fearless Coyote
122
123 diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
124 index d7d3cc24baf4..21dbdf0e476b 100644
125 --- a/arch/x86/entry/common.c
126 +++ b/arch/x86/entry/common.c
127 @@ -21,6 +21,7 @@
128 #include <linux/export.h>
129 #include <linux/context_tracking.h>
130 #include <linux/user-return-notifier.h>
131 +#include <linux/nospec.h>
132 #include <linux/uprobes.h>
133 #include <linux/livepatch.h>
134 #include <linux/syscalls.h>
135 @@ -206,7 +207,7 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
136 * special case only applies after poking regs and before the
137 * very next return to user mode.
138 */
139 - current->thread.status &= ~(TS_COMPAT|TS_I386_REGS_POKED);
140 + ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED);
141 #endif
142
143 user_enter_irqoff();
144 @@ -282,7 +283,8 @@ __visible void do_syscall_64(struct pt_regs *regs)
145 * regs->orig_ax, which changes the behavior of some syscalls.
146 */
147 if (likely((nr & __SYSCALL_MASK) < NR_syscalls)) {
148 - regs->ax = sys_call_table[nr & __SYSCALL_MASK](
149 + nr = array_index_nospec(nr & __SYSCALL_MASK, NR_syscalls);
150 + regs->ax = sys_call_table[nr](
151 regs->di, regs->si, regs->dx,
152 regs->r10, regs->r8, regs->r9);
153 }
154 @@ -304,7 +306,7 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
155 unsigned int nr = (unsigned int)regs->orig_ax;
156
157 #ifdef CONFIG_IA32_EMULATION
158 - current->thread.status |= TS_COMPAT;
159 + ti->status |= TS_COMPAT;
160 #endif
161
162 if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) {
163 @@ -318,6 +320,7 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
164 }
165
166 if (likely(nr < IA32_NR_syscalls)) {
167 + nr = array_index_nospec(nr, IA32_NR_syscalls);
168 /*
169 * It's possible that a 32-bit syscall implementation
170 * takes a 64-bit parameter but nonetheless assumes that
171 diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
172 index 60c4c342316c..2a35b1e0fb90 100644
173 --- a/arch/x86/entry/entry_32.S
174 +++ b/arch/x86/entry/entry_32.S
175 @@ -252,7 +252,8 @@ ENTRY(__switch_to_asm)
176 * exist, overwrite the RSB with entries which capture
177 * speculative execution to prevent attack.
178 */
179 - FILL_RETURN_BUFFER %ebx, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
180 + /* Clobbers %ebx */
181 + FILL_RETURN_BUFFER RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
182 #endif
183
184 /* restore callee-saved registers */
185 diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
186 index ff6f8022612c..c752abe89d80 100644
187 --- a/arch/x86/entry/entry_64.S
188 +++ b/arch/x86/entry/entry_64.S
189 @@ -236,91 +236,20 @@ GLOBAL(entry_SYSCALL_64_after_hwframe)
190 pushq %r9 /* pt_regs->r9 */
191 pushq %r10 /* pt_regs->r10 */
192 pushq %r11 /* pt_regs->r11 */
193 - sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */
194 - UNWIND_HINT_REGS extra=0
195 -
196 - TRACE_IRQS_OFF
197 -
198 - /*
199 - * If we need to do entry work or if we guess we'll need to do
200 - * exit work, go straight to the slow path.
201 - */
202 - movq PER_CPU_VAR(current_task), %r11
203 - testl $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
204 - jnz entry_SYSCALL64_slow_path
205 -
206 -entry_SYSCALL_64_fastpath:
207 - /*
208 - * Easy case: enable interrupts and issue the syscall. If the syscall
209 - * needs pt_regs, we'll call a stub that disables interrupts again
210 - * and jumps to the slow path.
211 - */
212 - TRACE_IRQS_ON
213 - ENABLE_INTERRUPTS(CLBR_NONE)
214 -#if __SYSCALL_MASK == ~0
215 - cmpq $__NR_syscall_max, %rax
216 -#else
217 - andl $__SYSCALL_MASK, %eax
218 - cmpl $__NR_syscall_max, %eax
219 -#endif
220 - ja 1f /* return -ENOSYS (already in pt_regs->ax) */
221 - movq %r10, %rcx
222 -
223 - /*
224 - * This call instruction is handled specially in stub_ptregs_64.
225 - * It might end up jumping to the slow path. If it jumps, RAX
226 - * and all argument registers are clobbered.
227 - */
228 -#ifdef CONFIG_RETPOLINE
229 - movq sys_call_table(, %rax, 8), %rax
230 - call __x86_indirect_thunk_rax
231 -#else
232 - call *sys_call_table(, %rax, 8)
233 -#endif
234 -.Lentry_SYSCALL_64_after_fastpath_call:
235 -
236 - movq %rax, RAX(%rsp)
237 -1:
238 + pushq %rbx /* pt_regs->rbx */
239 + pushq %rbp /* pt_regs->rbp */
240 + pushq %r12 /* pt_regs->r12 */
241 + pushq %r13 /* pt_regs->r13 */
242 + pushq %r14 /* pt_regs->r14 */
243 + pushq %r15 /* pt_regs->r15 */
244 + UNWIND_HINT_REGS
245
246 - /*
247 - * If we get here, then we know that pt_regs is clean for SYSRET64.
248 - * If we see that no exit work is required (which we are required
249 - * to check with IRQs off), then we can go straight to SYSRET64.
250 - */
251 - DISABLE_INTERRUPTS(CLBR_ANY)
252 TRACE_IRQS_OFF
253 - movq PER_CPU_VAR(current_task), %r11
254 - testl $_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
255 - jnz 1f
256 -
257 - LOCKDEP_SYS_EXIT
258 - TRACE_IRQS_ON /* user mode is traced as IRQs on */
259 - movq RIP(%rsp), %rcx
260 - movq EFLAGS(%rsp), %r11
261 - addq $6*8, %rsp /* skip extra regs -- they were preserved */
262 - UNWIND_HINT_EMPTY
263 - jmp .Lpop_c_regs_except_rcx_r11_and_sysret
264
265 -1:
266 - /*
267 - * The fast path looked good when we started, but something changed
268 - * along the way and we need to switch to the slow path. Calling
269 - * raise(3) will trigger this, for example. IRQs are off.
270 - */
271 - TRACE_IRQS_ON
272 - ENABLE_INTERRUPTS(CLBR_ANY)
273 - SAVE_EXTRA_REGS
274 - movq %rsp, %rdi
275 - call syscall_return_slowpath /* returns with IRQs disabled */
276 - jmp return_from_SYSCALL_64
277 -
278 -entry_SYSCALL64_slow_path:
279 /* IRQs are off. */
280 - SAVE_EXTRA_REGS
281 movq %rsp, %rdi
282 call do_syscall_64 /* returns with IRQs disabled */
283
284 -return_from_SYSCALL_64:
285 TRACE_IRQS_IRETQ /* we're about to change IF */
286
287 /*
288 @@ -393,7 +322,6 @@ syscall_return_via_sysret:
289 /* rcx and r11 are already restored (see code above) */
290 UNWIND_HINT_EMPTY
291 POP_EXTRA_REGS
292 -.Lpop_c_regs_except_rcx_r11_and_sysret:
293 popq %rsi /* skip r11 */
294 popq %r10
295 popq %r9
296 @@ -424,47 +352,6 @@ syscall_return_via_sysret:
297 USERGS_SYSRET64
298 END(entry_SYSCALL_64)
299
300 -ENTRY(stub_ptregs_64)
301 - /*
302 - * Syscalls marked as needing ptregs land here.
303 - * If we are on the fast path, we need to save the extra regs,
304 - * which we achieve by trying again on the slow path. If we are on
305 - * the slow path, the extra regs are already saved.
306 - *
307 - * RAX stores a pointer to the C function implementing the syscall.
308 - * IRQs are on.
309 - */
310 - cmpq $.Lentry_SYSCALL_64_after_fastpath_call, (%rsp)
311 - jne 1f
312 -
313 - /*
314 - * Called from fast path -- disable IRQs again, pop return address
315 - * and jump to slow path
316 - */
317 - DISABLE_INTERRUPTS(CLBR_ANY)
318 - TRACE_IRQS_OFF
319 - popq %rax
320 - UNWIND_HINT_REGS extra=0
321 - jmp entry_SYSCALL64_slow_path
322 -
323 -1:
324 - JMP_NOSPEC %rax /* Called from C */
325 -END(stub_ptregs_64)
326 -
327 -.macro ptregs_stub func
328 -ENTRY(ptregs_\func)
329 - UNWIND_HINT_FUNC
330 - leaq \func(%rip), %rax
331 - jmp stub_ptregs_64
332 -END(ptregs_\func)
333 -.endm
334 -
335 -/* Instantiate ptregs_stub for each ptregs-using syscall */
336 -#define __SYSCALL_64_QUAL_(sym)
337 -#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_stub sym
338 -#define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(sym)
339 -#include <asm/syscalls_64.h>
340 -
341 /*
342 * %rdi: prev task
343 * %rsi: next task
344 @@ -499,7 +386,8 @@ ENTRY(__switch_to_asm)
345 * exist, overwrite the RSB with entries which capture
346 * speculative execution to prevent attack.
347 */
348 - FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
349 + /* Clobbers %rbx */
350 + FILL_RETURN_BUFFER RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
351 #endif
352
353 /* restore callee-saved registers */
354 diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
355 index 9c09775e589d..c176d2fab1da 100644
356 --- a/arch/x86/entry/syscall_64.c
357 +++ b/arch/x86/entry/syscall_64.c
358 @@ -7,14 +7,11 @@
359 #include <asm/asm-offsets.h>
360 #include <asm/syscall.h>
361
362 -#define __SYSCALL_64_QUAL_(sym) sym
363 -#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_##sym
364 -
365 -#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long __SYSCALL_64_QUAL_##qual(sym)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
366 +#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
367 #include <asm/syscalls_64.h>
368 #undef __SYSCALL_64
369
370 -#define __SYSCALL_64(nr, sym, qual) [nr] = __SYSCALL_64_QUAL_##qual(sym),
371 +#define __SYSCALL_64(nr, sym, qual) [nr] = sym,
372
373 extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
374
375 diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h
376 index 1908214b9125..4d111616524b 100644
377 --- a/arch/x86/include/asm/asm-prototypes.h
378 +++ b/arch/x86/include/asm/asm-prototypes.h
379 @@ -38,4 +38,7 @@ INDIRECT_THUNK(dx)
380 INDIRECT_THUNK(si)
381 INDIRECT_THUNK(di)
382 INDIRECT_THUNK(bp)
383 +asmlinkage void __fill_rsb(void);
384 +asmlinkage void __clear_rsb(void);
385 +
386 #endif /* CONFIG_RETPOLINE */
387 diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
388 index 7fb336210e1b..30d406146016 100644
389 --- a/arch/x86/include/asm/barrier.h
390 +++ b/arch/x86/include/asm/barrier.h
391 @@ -24,6 +24,34 @@
392 #define wmb() asm volatile("sfence" ::: "memory")
393 #endif
394
395 +/**
396 + * array_index_mask_nospec() - generate a mask that is ~0UL when the
397 + * bounds check succeeds and 0 otherwise
398 + * @index: array element index
399 + * @size: number of elements in array
400 + *
401 + * Returns:
402 + * 0 - (index < size)
403 + */
404 +static inline unsigned long array_index_mask_nospec(unsigned long index,
405 + unsigned long size)
406 +{
407 + unsigned long mask;
408 +
409 + asm ("cmp %1,%2; sbb %0,%0;"
410 + :"=r" (mask)
411 + :"r"(size),"r" (index)
412 + :"cc");
413 + return mask;
414 +}
415 +
416 +/* Override the default implementation from linux/nospec.h. */
417 +#define array_index_mask_nospec array_index_mask_nospec
418 +
419 +/* Prevent speculative execution past this barrier. */
420 +#define barrier_nospec() alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, \
421 + "lfence", X86_FEATURE_LFENCE_RDTSC)
422 +
423 #ifdef CONFIG_X86_PPRO_FENCE
424 #define dma_rmb() rmb()
425 #else
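
Editor's aside: the array_index_mask_nospec() hunk above relies on a two-instruction CMP/SBB sequence. The stand-alone x86-64 user-space demo below is not part of the patch (mask_demo() is an invented name); it reuses the same inline asm to show why the sequence yields ~0UL exactly when index < size: CMP size,index computes index - size and sets the carry flag on an unsigned borrow, and SBB of a register with itself then evaluates to -CF, i.e. all ones or zero, with no conditional branch for the CPU to mispredict.

	/* Editor's sketch, not part of the patch: user-space demo of the
	 * CMP/SBB mask used by array_index_mask_nospec() above.
	 * Build with gcc or clang on x86-64. */
	#include <stdio.h>

	static unsigned long mask_demo(unsigned long index, unsigned long size)
	{
		unsigned long mask;

		/* CMP size,index sets CF iff index < size (unsigned borrow);
		 * SBB mask,mask then computes mask - mask - CF = -CF. */
		asm ("cmp %1,%2; sbb %0,%0;"
		     : "=r" (mask)
		     : "r" (size), "r" (index)
		     : "cc");
		return mask;
	}

	int main(void)
	{
		printf("index 3,  size 8 -> %#lx\n", mask_demo(3, 8));	/* all ones */
		printf("index 12, size 8 -> %#lx\n", mask_demo(12, 8));	/* 0 */
		return 0;
	}
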
426 diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
427 index ea9a7dde62e5..70eddb3922ff 100644
428 --- a/arch/x86/include/asm/cpufeature.h
429 +++ b/arch/x86/include/asm/cpufeature.h
430 @@ -29,6 +29,7 @@ enum cpuid_leafs
431 CPUID_8000_000A_EDX,
432 CPUID_7_ECX,
433 CPUID_8000_0007_EBX,
434 + CPUID_7_EDX,
435 };
436
437 #ifdef CONFIG_X86_FEATURE_NAMES
438 @@ -79,8 +80,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
439 CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 15, feature_bit) || \
440 CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 16, feature_bit) || \
441 CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 17, feature_bit) || \
442 + CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 18, feature_bit) || \
443 REQUIRED_MASK_CHECK || \
444 - BUILD_BUG_ON_ZERO(NCAPINTS != 18))
445 + BUILD_BUG_ON_ZERO(NCAPINTS != 19))
446
447 #define DISABLED_MASK_BIT_SET(feature_bit) \
448 ( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 0, feature_bit) || \
449 @@ -101,8 +103,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
450 CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 15, feature_bit) || \
451 CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 16, feature_bit) || \
452 CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 17, feature_bit) || \
453 + CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 18, feature_bit) || \
454 DISABLED_MASK_CHECK || \
455 - BUILD_BUG_ON_ZERO(NCAPINTS != 18))
456 + BUILD_BUG_ON_ZERO(NCAPINTS != 19))
457
458 #define cpu_has(c, bit) \
459 (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
460 diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
461 index 25b9375c1484..73b5fff159a4 100644
462 --- a/arch/x86/include/asm/cpufeatures.h
463 +++ b/arch/x86/include/asm/cpufeatures.h
464 @@ -13,7 +13,7 @@
465 /*
466 * Defines x86 CPU feature bits
467 */
468 -#define NCAPINTS 18 /* N 32-bit words worth of info */
469 +#define NCAPINTS 19 /* N 32-bit words worth of info */
470 #define NBUGINTS 1 /* N 32-bit bug flags */
471
472 /*
473 @@ -203,14 +203,14 @@
474 #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
475 #define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */
476 #define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */
477 -#define X86_FEATURE_RETPOLINE ( 7*32+12) /* Generic Retpoline mitigation for Spectre variant 2 */
478 -#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* AMD Retpoline mitigation for Spectre variant 2 */
479 +#define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */
480 +#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* "" AMD Retpoline mitigation for Spectre variant 2 */
481 #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */
482 -#define X86_FEATURE_AVX512_4VNNIW ( 7*32+16) /* AVX-512 Neural Network Instructions */
483 -#define X86_FEATURE_AVX512_4FMAPS ( 7*32+17) /* AVX-512 Multiply Accumulation Single precision */
484
485 #define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */
486 -#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* Fill RSB on context switches */
487 +#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */
488 +
489 +#define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */
490
491 /* Virtualization flags: Linux defined, word 8 */
492 #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
493 @@ -271,6 +271,9 @@
494 #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */
495 #define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */
496 #define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* Always save/restore FP error pointers */
497 +#define X86_FEATURE_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */
498 +#define X86_FEATURE_IBRS (13*32+14) /* Indirect Branch Restricted Speculation */
499 +#define X86_FEATURE_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */
500
501 /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */
502 #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */
503 @@ -319,6 +322,13 @@
504 #define X86_FEATURE_SUCCOR (17*32+ 1) /* Uncorrectable error containment and recovery */
505 #define X86_FEATURE_SMCA (17*32+ 3) /* Scalable MCA */
506
507 +/* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */
508 +#define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */
509 +#define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */
510 +#define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */
511 +#define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */
512 +#define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */
513 +
514 /*
515 * BUG word(s)
516 */
517 diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
518 index b027633e7300..33833d1909af 100644
519 --- a/arch/x86/include/asm/disabled-features.h
520 +++ b/arch/x86/include/asm/disabled-features.h
521 @@ -77,6 +77,7 @@
522 #define DISABLED_MASK15 0
523 #define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP)
524 #define DISABLED_MASK17 0
525 -#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18)
526 +#define DISABLED_MASK18 0
527 +#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19)
528
529 #endif /* _ASM_X86_DISABLED_FEATURES_H */
530 diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
531 index 64c4a30e0d39..e203169931c7 100644
532 --- a/arch/x86/include/asm/fixmap.h
533 +++ b/arch/x86/include/asm/fixmap.h
534 @@ -137,8 +137,10 @@ enum fixed_addresses {
535
536 extern void reserve_top_address(unsigned long reserve);
537
538 -#define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
539 -#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
540 +#define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
541 +#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
542 +#define FIXADDR_TOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
543 +#define FIXADDR_TOT_START (FIXADDR_TOP - FIXADDR_TOT_SIZE)
544
545 extern int fixmaps_set;
546
547 diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
548 index e7b983a35506..e520a1e6fc11 100644
549 --- a/arch/x86/include/asm/msr-index.h
550 +++ b/arch/x86/include/asm/msr-index.h
551 @@ -39,6 +39,13 @@
552
553 /* Intel MSRs. Some also available on other CPUs */
554
555 +#define MSR_IA32_SPEC_CTRL 0x00000048 /* Speculation Control */
556 +#define SPEC_CTRL_IBRS (1 << 0) /* Indirect Branch Restricted Speculation */
557 +#define SPEC_CTRL_STIBP (1 << 1) /* Single Thread Indirect Branch Predictors */
558 +
559 +#define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */
560 +#define PRED_CMD_IBPB (1 << 0) /* Indirect Branch Prediction Barrier */
561 +
562 #define MSR_PPIN_CTL 0x0000004e
563 #define MSR_PPIN 0x0000004f
564
565 @@ -57,6 +64,11 @@
566 #define SNB_C3_AUTO_UNDEMOTE (1UL << 28)
567
568 #define MSR_MTRRcap 0x000000fe
569 +
570 +#define MSR_IA32_ARCH_CAPABILITIES 0x0000010a
571 +#define ARCH_CAP_RDCL_NO (1 << 0) /* Not susceptible to Meltdown */
572 +#define ARCH_CAP_IBRS_ALL (1 << 1) /* Enhanced IBRS support */
573 +
574 #define MSR_IA32_BBL_CR_CTL 0x00000119
575 #define MSR_IA32_BBL_CR_CTL3 0x0000011e
576
577 diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
578 index 07962f5f6fba..30df295f6d94 100644
579 --- a/arch/x86/include/asm/msr.h
580 +++ b/arch/x86/include/asm/msr.h
581 @@ -214,8 +214,7 @@ static __always_inline unsigned long long rdtsc_ordered(void)
582 * that some other imaginary CPU is updating continuously with a
583 * time stamp.
584 */
585 - alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
586 - "lfence", X86_FEATURE_LFENCE_RDTSC);
587 + barrier_nospec();
588 return rdtsc();
589 }
590
591 diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
592 index 4ad41087ce0e..4d57894635f2 100644
593 --- a/arch/x86/include/asm/nospec-branch.h
594 +++ b/arch/x86/include/asm/nospec-branch.h
595 @@ -1,56 +1,12 @@
596 /* SPDX-License-Identifier: GPL-2.0 */
597
598 -#ifndef __NOSPEC_BRANCH_H__
599 -#define __NOSPEC_BRANCH_H__
600 +#ifndef _ASM_X86_NOSPEC_BRANCH_H_
601 +#define _ASM_X86_NOSPEC_BRANCH_H_
602
603 #include <asm/alternative.h>
604 #include <asm/alternative-asm.h>
605 #include <asm/cpufeatures.h>
606
607 -/*
608 - * Fill the CPU return stack buffer.
609 - *
610 - * Each entry in the RSB, if used for a speculative 'ret', contains an
611 - * infinite 'pause; lfence; jmp' loop to capture speculative execution.
612 - *
613 - * This is required in various cases for retpoline and IBRS-based
614 - * mitigations for the Spectre variant 2 vulnerability. Sometimes to
615 - * eliminate potentially bogus entries from the RSB, and sometimes
616 - * purely to ensure that it doesn't get empty, which on some CPUs would
617 - * allow predictions from other (unwanted!) sources to be used.
618 - *
619 - * We define a CPP macro such that it can be used from both .S files and
620 - * inline assembly. It's possible to do a .macro and then include that
621 - * from C via asm(".include <asm/nospec-branch.h>") but let's not go there.
622 - */
623 -
624 -#define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */
625 -#define RSB_FILL_LOOPS 16 /* To avoid underflow */
626 -
627 -/*
628 - * Google experimented with loop-unrolling and this turned out to be
629 - * the optimal version — two calls, each with their own speculation
630 - * trap should their return address end up getting used, in a loop.
631 - */
632 -#define __FILL_RETURN_BUFFER(reg, nr, sp) \
633 - mov $(nr/2), reg; \
634 -771: \
635 - call 772f; \
636 -773: /* speculation trap */ \
637 - pause; \
638 - lfence; \
639 - jmp 773b; \
640 -772: \
641 - call 774f; \
642 -775: /* speculation trap */ \
643 - pause; \
644 - lfence; \
645 - jmp 775b; \
646 -774: \
647 - dec reg; \
648 - jnz 771b; \
649 - add $(BITS_PER_LONG/8) * nr, sp;
650 -
651 #ifdef __ASSEMBLY__
652
653 /*
654 @@ -121,17 +77,10 @@
655 #endif
656 .endm
657
658 - /*
659 - * A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP
660 - * monstrosity above, manually.
661 - */
662 -.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req
663 +/* This clobbers the BX register */
664 +.macro FILL_RETURN_BUFFER nr:req ftr:req
665 #ifdef CONFIG_RETPOLINE
666 - ANNOTATE_NOSPEC_ALTERNATIVE
667 - ALTERNATIVE "jmp .Lskip_rsb_\@", \
668 - __stringify(__FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP)) \
669 - \ftr
670 -.Lskip_rsb_\@:
671 + ALTERNATIVE "", "call __clear_rsb", \ftr
672 #endif
673 .endm
674
675 @@ -201,22 +150,25 @@ extern char __indirect_thunk_end[];
676 * On VMEXIT we must ensure that no RSB predictions learned in the guest
677 * can be followed in the host, by overwriting the RSB completely. Both
678 * retpoline and IBRS mitigations for Spectre v2 need this; only on future
679 - * CPUs with IBRS_ATT *might* it be avoided.
680 + * CPUs with IBRS_ALL *might* it be avoided.
681 */
682 static inline void vmexit_fill_RSB(void)
683 {
684 #ifdef CONFIG_RETPOLINE
685 - unsigned long loops;
686 -
687 - asm volatile (ANNOTATE_NOSPEC_ALTERNATIVE
688 - ALTERNATIVE("jmp 910f",
689 - __stringify(__FILL_RETURN_BUFFER(%0, RSB_CLEAR_LOOPS, %1)),
690 - X86_FEATURE_RETPOLINE)
691 - "910:"
692 - : "=r" (loops), ASM_CALL_CONSTRAINT
693 - : : "memory" );
694 + alternative_input("",
695 + "call __fill_rsb",
696 + X86_FEATURE_RETPOLINE,
697 + ASM_NO_INPUT_CLOBBER(_ASM_BX, "memory"));
698 #endif
699 }
700
701 +static inline void indirect_branch_prediction_barrier(void)
702 +{
703 + alternative_input("",
704 + "call __ibp_barrier",
705 + X86_FEATURE_USE_IBPB,
706 + ASM_NO_INPUT_CLOBBER("eax", "ecx", "edx", "memory"));
707 +}
708 +
709 #endif /* __ASSEMBLY__ */
710 -#endif /* __NOSPEC_BRANCH_H__ */
711 +#endif /* _ASM_X86_NOSPEC_BRANCH_H_ */
712 diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h
713 index ce245b0cdfca..0777e18a1d23 100644
714 --- a/arch/x86/include/asm/pgtable_32_types.h
715 +++ b/arch/x86/include/asm/pgtable_32_types.h
716 @@ -44,8 +44,9 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */
717 */
718 #define CPU_ENTRY_AREA_PAGES (NR_CPUS * 40)
719
720 -#define CPU_ENTRY_AREA_BASE \
721 - ((FIXADDR_START - PAGE_SIZE * (CPU_ENTRY_AREA_PAGES + 1)) & PMD_MASK)
722 +#define CPU_ENTRY_AREA_BASE \
723 + ((FIXADDR_TOT_START - PAGE_SIZE * (CPU_ENTRY_AREA_PAGES + 1)) \
724 + & PMD_MASK)
725
726 #define PKMAP_BASE \
727 ((CPU_ENTRY_AREA_BASE - PAGE_SIZE) & PMD_MASK)
728 diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
729 index d3a67fba200a..513f9604c192 100644
730 --- a/arch/x86/include/asm/processor.h
731 +++ b/arch/x86/include/asm/processor.h
732 @@ -460,8 +460,6 @@ struct thread_struct {
733 unsigned short gsindex;
734 #endif
735
736 - u32 status; /* thread synchronous flags */
737 -
738 #ifdef CONFIG_X86_64
739 unsigned long fsbase;
740 unsigned long gsbase;
741 @@ -971,4 +969,7 @@ bool xen_set_default_idle(void);
742
743 void stop_this_cpu(void *dummy);
744 void df_debug(struct pt_regs *regs, long error_code);
745 +
746 +void __ibp_barrier(void);
747 +
748 #endif /* _ASM_X86_PROCESSOR_H */
749 diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
750 index d91ba04dd007..fb3a6de7440b 100644
751 --- a/arch/x86/include/asm/required-features.h
752 +++ b/arch/x86/include/asm/required-features.h
753 @@ -106,6 +106,7 @@
754 #define REQUIRED_MASK15 0
755 #define REQUIRED_MASK16 (NEED_LA57)
756 #define REQUIRED_MASK17 0
757 -#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18)
758 +#define REQUIRED_MASK18 0
759 +#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19)
760
761 #endif /* _ASM_X86_REQUIRED_FEATURES_H */
762 diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
763 index e3c95e8e61c5..03eedc21246d 100644
764 --- a/arch/x86/include/asm/syscall.h
765 +++ b/arch/x86/include/asm/syscall.h
766 @@ -60,7 +60,7 @@ static inline long syscall_get_error(struct task_struct *task,
767 * TS_COMPAT is set for 32-bit syscall entries and then
768 * remains set until we return to user mode.
769 */
770 - if (task->thread.status & (TS_COMPAT|TS_I386_REGS_POKED))
771 + if (task->thread_info.status & (TS_COMPAT|TS_I386_REGS_POKED))
772 /*
773 * Sign-extend the value so (int)-EFOO becomes (long)-EFOO
774 * and will match correctly in comparisons.
775 @@ -116,7 +116,7 @@ static inline void syscall_get_arguments(struct task_struct *task,
776 unsigned long *args)
777 {
778 # ifdef CONFIG_IA32_EMULATION
779 - if (task->thread.status & TS_COMPAT)
780 + if (task->thread_info.status & TS_COMPAT)
781 switch (i) {
782 case 0:
783 if (!n--) break;
784 @@ -177,7 +177,7 @@ static inline void syscall_set_arguments(struct task_struct *task,
785 const unsigned long *args)
786 {
787 # ifdef CONFIG_IA32_EMULATION
788 - if (task->thread.status & TS_COMPAT)
789 + if (task->thread_info.status & TS_COMPAT)
790 switch (i) {
791 case 0:
792 if (!n--) break;
793 diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
794 index 00223333821a..eda3b6823ca4 100644
795 --- a/arch/x86/include/asm/thread_info.h
796 +++ b/arch/x86/include/asm/thread_info.h
797 @@ -55,6 +55,7 @@ struct task_struct;
798
799 struct thread_info {
800 unsigned long flags; /* low level flags */
801 + u32 status; /* thread synchronous flags */
802 };
803
804 #define INIT_THREAD_INFO(tsk) \
805 @@ -221,7 +222,7 @@ static inline int arch_within_stack_frames(const void * const stack,
806 #define in_ia32_syscall() true
807 #else
808 #define in_ia32_syscall() (IS_ENABLED(CONFIG_IA32_EMULATION) && \
809 - current->thread.status & TS_COMPAT)
810 + current_thread_info()->status & TS_COMPAT)
811 #endif
812
813 /*
814 diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
815 index d33e4a26dc7e..2b8f18ca5874 100644
816 --- a/arch/x86/include/asm/tlbflush.h
817 +++ b/arch/x86/include/asm/tlbflush.h
818 @@ -174,6 +174,8 @@ struct tlb_state {
819 struct mm_struct *loaded_mm;
820 u16 loaded_mm_asid;
821 u16 next_asid;
822 + /* last user mm's ctx id */
823 + u64 last_ctx_id;
824
825 /*
826 * We can be in one of several states:
827 diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
828 index 574dff4d2913..aae77eb8491c 100644
829 --- a/arch/x86/include/asm/uaccess.h
830 +++ b/arch/x86/include/asm/uaccess.h
831 @@ -124,6 +124,11 @@ extern int __get_user_bad(void);
832
833 #define __uaccess_begin() stac()
834 #define __uaccess_end() clac()
835 +#define __uaccess_begin_nospec() \
836 +({ \
837 + stac(); \
838 + barrier_nospec(); \
839 +})
840
841 /*
842 * This is a type: either unsigned long, if the argument fits into
843 @@ -445,7 +450,7 @@ do { \
844 ({ \
845 int __gu_err; \
846 __inttype(*(ptr)) __gu_val; \
847 - __uaccess_begin(); \
848 + __uaccess_begin_nospec(); \
849 __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \
850 __uaccess_end(); \
851 (x) = (__force __typeof__(*(ptr)))__gu_val; \
852 @@ -487,6 +492,10 @@ struct __large_struct { unsigned long buf[100]; };
853 __uaccess_begin(); \
854 barrier();
855
856 +#define uaccess_try_nospec do { \
857 + current->thread.uaccess_err = 0; \
858 + __uaccess_begin_nospec(); \
859 +
860 #define uaccess_catch(err) \
861 __uaccess_end(); \
862 (err) |= (current->thread.uaccess_err ? -EFAULT : 0); \
863 @@ -548,7 +557,7 @@ struct __large_struct { unsigned long buf[100]; };
864 * get_user_ex(...);
865 * } get_user_catch(err)
866 */
867 -#define get_user_try uaccess_try
868 +#define get_user_try uaccess_try_nospec
869 #define get_user_catch(err) uaccess_catch(err)
870
871 #define get_user_ex(x, ptr) do { \
872 @@ -582,7 +591,7 @@ extern void __cmpxchg_wrong_size(void)
873 __typeof__(ptr) __uval = (uval); \
874 __typeof__(*(ptr)) __old = (old); \
875 __typeof__(*(ptr)) __new = (new); \
876 - __uaccess_begin(); \
877 + __uaccess_begin_nospec(); \
878 switch (size) { \
879 case 1: \
880 { \
881 diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
882 index 72950401b223..ba2dc1930630 100644
883 --- a/arch/x86/include/asm/uaccess_32.h
884 +++ b/arch/x86/include/asm/uaccess_32.h
885 @@ -29,21 +29,21 @@ raw_copy_from_user(void *to, const void __user *from, unsigned long n)
886 switch (n) {
887 case 1:
888 ret = 0;
889 - __uaccess_begin();
890 + __uaccess_begin_nospec();
891 __get_user_asm_nozero(*(u8 *)to, from, ret,
892 "b", "b", "=q", 1);
893 __uaccess_end();
894 return ret;
895 case 2:
896 ret = 0;
897 - __uaccess_begin();
898 + __uaccess_begin_nospec();
899 __get_user_asm_nozero(*(u16 *)to, from, ret,
900 "w", "w", "=r", 2);
901 __uaccess_end();
902 return ret;
903 case 4:
904 ret = 0;
905 - __uaccess_begin();
906 + __uaccess_begin_nospec();
907 __get_user_asm_nozero(*(u32 *)to, from, ret,
908 "l", "k", "=r", 4);
909 __uaccess_end();
910 diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
911 index f07ef3c575db..62546b3a398e 100644
912 --- a/arch/x86/include/asm/uaccess_64.h
913 +++ b/arch/x86/include/asm/uaccess_64.h
914 @@ -55,31 +55,31 @@ raw_copy_from_user(void *dst, const void __user *src, unsigned long size)
915 return copy_user_generic(dst, (__force void *)src, size);
916 switch (size) {
917 case 1:
918 - __uaccess_begin();
919 + __uaccess_begin_nospec();
920 __get_user_asm_nozero(*(u8 *)dst, (u8 __user *)src,
921 ret, "b", "b", "=q", 1);
922 __uaccess_end();
923 return ret;
924 case 2:
925 - __uaccess_begin();
926 + __uaccess_begin_nospec();
927 __get_user_asm_nozero(*(u16 *)dst, (u16 __user *)src,
928 ret, "w", "w", "=r", 2);
929 __uaccess_end();
930 return ret;
931 case 4:
932 - __uaccess_begin();
933 + __uaccess_begin_nospec();
934 __get_user_asm_nozero(*(u32 *)dst, (u32 __user *)src,
935 ret, "l", "k", "=r", 4);
936 __uaccess_end();
937 return ret;
938 case 8:
939 - __uaccess_begin();
940 + __uaccess_begin_nospec();
941 __get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src,
942 ret, "q", "", "=r", 8);
943 __uaccess_end();
944 return ret;
945 case 10:
946 - __uaccess_begin();
947 + __uaccess_begin_nospec();
948 __get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src,
949 ret, "q", "", "=r", 10);
950 if (likely(!ret))
951 @@ -89,7 +89,7 @@ raw_copy_from_user(void *dst, const void __user *src, unsigned long size)
952 __uaccess_end();
953 return ret;
954 case 16:
955 - __uaccess_begin();
956 + __uaccess_begin_nospec();
957 __get_user_asm_nozero(*(u64 *)dst, (u64 __user *)src,
958 ret, "q", "", "=r", 16);
959 if (likely(!ret))
960 diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
961 index 4817d743c263..a481763a3776 100644
962 --- a/arch/x86/kernel/alternative.c
963 +++ b/arch/x86/kernel/alternative.c
964 @@ -46,17 +46,6 @@ static int __init setup_noreplace_smp(char *str)
965 }
966 __setup("noreplace-smp", setup_noreplace_smp);
967
968 -#ifdef CONFIG_PARAVIRT
969 -static int __initdata_or_module noreplace_paravirt = 0;
970 -
971 -static int __init setup_noreplace_paravirt(char *str)
972 -{
973 - noreplace_paravirt = 1;
974 - return 1;
975 -}
976 -__setup("noreplace-paravirt", setup_noreplace_paravirt);
977 -#endif
978 -
979 #define DPRINTK(fmt, args...) \
980 do { \
981 if (debug_alternative) \
982 @@ -298,7 +287,7 @@ recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf)
983 tgt_rip = next_rip + o_dspl;
984 n_dspl = tgt_rip - orig_insn;
985
986 - DPRINTK("target RIP: %p, new_displ: 0x%x", tgt_rip, n_dspl);
987 + DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl);
988
989 if (tgt_rip - orig_insn >= 0) {
990 if (n_dspl - 2 <= 127)
991 @@ -355,7 +344,7 @@ static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *ins
992 add_nops(instr + (a->instrlen - a->padlen), a->padlen);
993 local_irq_restore(flags);
994
995 - DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ",
996 + DUMP_BYTES(instr, a->instrlen, "%px: [%d:%d) optimized NOPs: ",
997 instr, a->instrlen - a->padlen, a->padlen);
998 }
999
1000 @@ -376,7 +365,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
1001 u8 *instr, *replacement;
1002 u8 insnbuf[MAX_PATCH_LEN];
1003
1004 - DPRINTK("alt table %p -> %p", start, end);
1005 + DPRINTK("alt table %px, -> %px", start, end);
1006 /*
1007 * The scan order should be from start to end. A later scanned
1008 * alternative code can overwrite previously scanned alternative code.
1009 @@ -400,14 +389,14 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
1010 continue;
1011 }
1012
1013 - DPRINTK("feat: %d*32+%d, old: (%p, len: %d), repl: (%p, len: %d), pad: %d",
1014 + DPRINTK("feat: %d*32+%d, old: (%px len: %d), repl: (%px, len: %d), pad: %d",
1015 a->cpuid >> 5,
1016 a->cpuid & 0x1f,
1017 instr, a->instrlen,
1018 replacement, a->replacementlen, a->padlen);
1019
1020 - DUMP_BYTES(instr, a->instrlen, "%p: old_insn: ", instr);
1021 - DUMP_BYTES(replacement, a->replacementlen, "%p: rpl_insn: ", replacement);
1022 + DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr);
1023 + DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
1024
1025 memcpy(insnbuf, replacement, a->replacementlen);
1026 insnbuf_sz = a->replacementlen;
1027 @@ -433,7 +422,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
1028 a->instrlen - a->replacementlen);
1029 insnbuf_sz += a->instrlen - a->replacementlen;
1030 }
1031 - DUMP_BYTES(insnbuf, insnbuf_sz, "%p: final_insn: ", instr);
1032 + DUMP_BYTES(insnbuf, insnbuf_sz, "%px: final_insn: ", instr);
1033
1034 text_poke_early(instr, insnbuf, insnbuf_sz);
1035 }
1036 @@ -599,9 +588,6 @@ void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
1037 struct paravirt_patch_site *p;
1038 char insnbuf[MAX_PATCH_LEN];
1039
1040 - if (noreplace_paravirt)
1041 - return;
1042 -
1043 for (p = start; p < end; p++) {
1044 unsigned int used;
1045
1046 diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
1047 index 390b3dc3d438..71949bf2de5a 100644
1048 --- a/arch/x86/kernel/cpu/bugs.c
1049 +++ b/arch/x86/kernel/cpu/bugs.c
1050 @@ -11,6 +11,7 @@
1051 #include <linux/init.h>
1052 #include <linux/utsname.h>
1053 #include <linux/cpu.h>
1054 +#include <linux/module.h>
1055
1056 #include <asm/nospec-branch.h>
1057 #include <asm/cmdline.h>
1058 @@ -90,20 +91,41 @@ static const char *spectre_v2_strings[] = {
1059 };
1060
1061 #undef pr_fmt
1062 -#define pr_fmt(fmt) "Spectre V2 mitigation: " fmt
1063 +#define pr_fmt(fmt) "Spectre V2 : " fmt
1064
1065 static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE;
1066
1067 +#ifdef RETPOLINE
1068 +static bool spectre_v2_bad_module;
1069 +
1070 +bool retpoline_module_ok(bool has_retpoline)
1071 +{
1072 + if (spectre_v2_enabled == SPECTRE_V2_NONE || has_retpoline)
1073 + return true;
1074 +
1075 + pr_err("System may be vulnerable to spectre v2\n");
1076 + spectre_v2_bad_module = true;
1077 + return false;
1078 +}
1079 +
1080 +static inline const char *spectre_v2_module_string(void)
1081 +{
1082 + return spectre_v2_bad_module ? " - vulnerable module loaded" : "";
1083 +}
1084 +#else
1085 +static inline const char *spectre_v2_module_string(void) { return ""; }
1086 +#endif
1087 +
1088 static void __init spec2_print_if_insecure(const char *reason)
1089 {
1090 if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
1091 - pr_info("%s\n", reason);
1092 + pr_info("%s selected on command line.\n", reason);
1093 }
1094
1095 static void __init spec2_print_if_secure(const char *reason)
1096 {
1097 if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
1098 - pr_info("%s\n", reason);
1099 + pr_info("%s selected on command line.\n", reason);
1100 }
1101
1102 static inline bool retp_compiler(void)
1103 @@ -118,42 +140,68 @@ static inline bool match_option(const char *arg, int arglen, const char *opt)
1104 return len == arglen && !strncmp(arg, opt, len);
1105 }
1106
1107 +static const struct {
1108 + const char *option;
1109 + enum spectre_v2_mitigation_cmd cmd;
1110 + bool secure;
1111 +} mitigation_options[] = {
1112 + { "off", SPECTRE_V2_CMD_NONE, false },
1113 + { "on", SPECTRE_V2_CMD_FORCE, true },
1114 + { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false },
1115 + { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_AMD, false },
1116 + { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false },
1117 + { "auto", SPECTRE_V2_CMD_AUTO, false },
1118 +};
1119 +
1120 static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
1121 {
1122 char arg[20];
1123 - int ret;
1124 -
1125 - ret = cmdline_find_option(boot_command_line, "spectre_v2", arg,
1126 - sizeof(arg));
1127 - if (ret > 0) {
1128 - if (match_option(arg, ret, "off")) {
1129 - goto disable;
1130 - } else if (match_option(arg, ret, "on")) {
1131 - spec2_print_if_secure("force enabled on command line.");
1132 - return SPECTRE_V2_CMD_FORCE;
1133 - } else if (match_option(arg, ret, "retpoline")) {
1134 - spec2_print_if_insecure("retpoline selected on command line.");
1135 - return SPECTRE_V2_CMD_RETPOLINE;
1136 - } else if (match_option(arg, ret, "retpoline,amd")) {
1137 - if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
1138 - pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n");
1139 - return SPECTRE_V2_CMD_AUTO;
1140 - }
1141 - spec2_print_if_insecure("AMD retpoline selected on command line.");
1142 - return SPECTRE_V2_CMD_RETPOLINE_AMD;
1143 - } else if (match_option(arg, ret, "retpoline,generic")) {
1144 - spec2_print_if_insecure("generic retpoline selected on command line.");
1145 - return SPECTRE_V2_CMD_RETPOLINE_GENERIC;
1146 - } else if (match_option(arg, ret, "auto")) {
1147 + int ret, i;
1148 + enum spectre_v2_mitigation_cmd cmd = SPECTRE_V2_CMD_AUTO;
1149 +
1150 + if (cmdline_find_option_bool(boot_command_line, "nospectre_v2"))
1151 + return SPECTRE_V2_CMD_NONE;
1152 + else {
1153 + ret = cmdline_find_option(boot_command_line, "spectre_v2", arg,
1154 + sizeof(arg));
1155 + if (ret < 0)
1156 + return SPECTRE_V2_CMD_AUTO;
1157 +
1158 + for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) {
1159 + if (!match_option(arg, ret, mitigation_options[i].option))
1160 + continue;
1161 + cmd = mitigation_options[i].cmd;
1162 + break;
1163 + }
1164 +
1165 + if (i >= ARRAY_SIZE(mitigation_options)) {
1166 + pr_err("unknown option (%s). Switching to AUTO select\n",
1167 + mitigation_options[i].option);
1168 return SPECTRE_V2_CMD_AUTO;
1169 }
1170 }
1171
1172 - if (!cmdline_find_option_bool(boot_command_line, "nospectre_v2"))
1173 + if ((cmd == SPECTRE_V2_CMD_RETPOLINE ||
1174 + cmd == SPECTRE_V2_CMD_RETPOLINE_AMD ||
1175 + cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC) &&
1176 + !IS_ENABLED(CONFIG_RETPOLINE)) {
1177 + pr_err("%s selected but not compiled in. Switching to AUTO select\n",
1178 + mitigation_options[i].option);
1179 return SPECTRE_V2_CMD_AUTO;
1180 -disable:
1181 - spec2_print_if_insecure("disabled on command line.");
1182 - return SPECTRE_V2_CMD_NONE;
1183 + }
1184 +
1185 + if (cmd == SPECTRE_V2_CMD_RETPOLINE_AMD &&
1186 + boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
1187 + pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n");
1188 + return SPECTRE_V2_CMD_AUTO;
1189 + }
1190 +
1191 + if (mitigation_options[i].secure)
1192 + spec2_print_if_secure(mitigation_options[i].option);
1193 + else
1194 + spec2_print_if_insecure(mitigation_options[i].option);
1195 +
1196 + return cmd;
1197 }
1198
1199 /* Check for Skylake-like CPUs (for RSB handling) */
1200 @@ -191,10 +239,10 @@ static void __init spectre_v2_select_mitigation(void)
1201 return;
1202
1203 case SPECTRE_V2_CMD_FORCE:
1204 - /* FALLTRHU */
1205 case SPECTRE_V2_CMD_AUTO:
1206 - goto retpoline_auto;
1207 -
1208 + if (IS_ENABLED(CONFIG_RETPOLINE))
1209 + goto retpoline_auto;
1210 + break;
1211 case SPECTRE_V2_CMD_RETPOLINE_AMD:
1212 if (IS_ENABLED(CONFIG_RETPOLINE))
1213 goto retpoline_amd;
1214 @@ -249,6 +297,12 @@ static void __init spectre_v2_select_mitigation(void)
1215 setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
1216 pr_info("Filling RSB on context switch\n");
1217 }
1218 +
1219 + /* Initialize Indirect Branch Prediction Barrier if supported */
1220 + if (boot_cpu_has(X86_FEATURE_IBPB)) {
1221 + setup_force_cpu_cap(X86_FEATURE_USE_IBPB);
1222 + pr_info("Enabling Indirect Branch Prediction Barrier\n");
1223 + }
1224 }
1225
1226 #undef pr_fmt
1227 @@ -269,7 +323,7 @@ ssize_t cpu_show_spectre_v1(struct device *dev,
1228 {
1229 if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1))
1230 return sprintf(buf, "Not affected\n");
1231 - return sprintf(buf, "Vulnerable\n");
1232 + return sprintf(buf, "Mitigation: __user pointer sanitization\n");
1233 }
1234
1235 ssize_t cpu_show_spectre_v2(struct device *dev,
1236 @@ -278,6 +332,14 @@ ssize_t cpu_show_spectre_v2(struct device *dev,
1237 if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
1238 return sprintf(buf, "Not affected\n");
1239
1240 - return sprintf(buf, "%s\n", spectre_v2_strings[spectre_v2_enabled]);
1241 + return sprintf(buf, "%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
1242 + boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "",
1243 + spectre_v2_module_string());
1244 }
1245 #endif
1246 +
1247 +void __ibp_barrier(void)
1248 +{
1249 + __wrmsr(MSR_IA32_PRED_CMD, PRED_CMD_IBPB, 0);
1250 +}
1251 +EXPORT_SYMBOL_GPL(__ibp_barrier);
1252 diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
1253 index ef29ad001991..d63f4b5706e4 100644
1254 --- a/arch/x86/kernel/cpu/common.c
1255 +++ b/arch/x86/kernel/cpu/common.c
1256 @@ -47,6 +47,8 @@
1257 #include <asm/pat.h>
1258 #include <asm/microcode.h>
1259 #include <asm/microcode_intel.h>
1260 +#include <asm/intel-family.h>
1261 +#include <asm/cpu_device_id.h>
1262
1263 #ifdef CONFIG_X86_LOCAL_APIC
1264 #include <asm/uv/uv.h>
1265 @@ -748,6 +750,26 @@ static void apply_forced_caps(struct cpuinfo_x86 *c)
1266 }
1267 }
1268
1269 +static void init_speculation_control(struct cpuinfo_x86 *c)
1270 +{
1271 + /*
1272 + * The Intel SPEC_CTRL CPUID bit implies IBRS and IBPB support,
1273 + * and they also have a different bit for STIBP support. Also,
1274 + * a hypervisor might have set the individual AMD bits even on
1275 + * Intel CPUs, for finer-grained selection of what's available.
1276 + *
1277 + * We use the AMD bits in 0x8000_0008 EBX as the generic hardware
1278 + * features, which are visible in /proc/cpuinfo and used by the
1279 + * kernel. So set those accordingly from the Intel bits.
1280 + */
1281 + if (cpu_has(c, X86_FEATURE_SPEC_CTRL)) {
1282 + set_cpu_cap(c, X86_FEATURE_IBRS);
1283 + set_cpu_cap(c, X86_FEATURE_IBPB);
1284 + }
1285 + if (cpu_has(c, X86_FEATURE_INTEL_STIBP))
1286 + set_cpu_cap(c, X86_FEATURE_STIBP);
1287 +}
1288 +
1289 void get_cpu_cap(struct cpuinfo_x86 *c)
1290 {
1291 u32 eax, ebx, ecx, edx;
1292 @@ -769,6 +791,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
1293 cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
1294 c->x86_capability[CPUID_7_0_EBX] = ebx;
1295 c->x86_capability[CPUID_7_ECX] = ecx;
1296 + c->x86_capability[CPUID_7_EDX] = edx;
1297 }
1298
1299 /* Extended state features: level 0x0000000d */
1300 @@ -841,6 +864,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
1301 c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a);
1302
1303 init_scattered_cpuid_features(c);
1304 + init_speculation_control(c);
1305
1306 /*
1307 * Clear/Set all flags overridden by options, after probe.
1308 @@ -876,6 +900,41 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
1309 #endif
1310 }
1311
1312 +static const __initconst struct x86_cpu_id cpu_no_speculation[] = {
1313 + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CEDARVIEW, X86_FEATURE_ANY },
1314 + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CLOVERVIEW, X86_FEATURE_ANY },
1315 + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_LINCROFT, X86_FEATURE_ANY },
1316 + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PENWELL, X86_FEATURE_ANY },
1317 + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PINEVIEW, X86_FEATURE_ANY },
1318 + { X86_VENDOR_CENTAUR, 5 },
1319 + { X86_VENDOR_INTEL, 5 },
1320 + { X86_VENDOR_NSC, 5 },
1321 + { X86_VENDOR_ANY, 4 },
1322 + {}
1323 +};
1324 +
1325 +static const __initconst struct x86_cpu_id cpu_no_meltdown[] = {
1326 + { X86_VENDOR_AMD },
1327 + {}
1328 +};
1329 +
1330 +static bool __init cpu_vulnerable_to_meltdown(struct cpuinfo_x86 *c)
1331 +{
1332 + u64 ia32_cap = 0;
1333 +
1334 + if (x86_match_cpu(cpu_no_meltdown))
1335 + return false;
1336 +
1337 + if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES))
1338 + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
1339 +
1340 + /* Rogue Data Cache Load? No! */
1341 + if (ia32_cap & ARCH_CAP_RDCL_NO)
1342 + return false;
1343 +
1344 + return true;
1345 +}
1346 +
1347 /*
1348 * Do minimum CPU detection early.
1349 * Fields really needed: vendor, cpuid_level, family, model, mask,
1350 @@ -923,11 +982,12 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
1351
1352 setup_force_cpu_cap(X86_FEATURE_ALWAYS);
1353
1354 - if (c->x86_vendor != X86_VENDOR_AMD)
1355 - setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
1356 -
1357 - setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
1358 - setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
1359 + if (!x86_match_cpu(cpu_no_speculation)) {
1360 + if (cpu_vulnerable_to_meltdown(c))
1361 + setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
1362 + setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
1363 + setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
1364 + }
1365
1366 fpu__init_system(c);
1367
1368 diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
1369 index b1af22073e28..319bf989fad1 100644
1370 --- a/arch/x86/kernel/cpu/intel.c
1371 +++ b/arch/x86/kernel/cpu/intel.c
1372 @@ -102,6 +102,59 @@ static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c)
1373 ELF_HWCAP2 |= HWCAP2_RING3MWAIT;
1374 }
1375
1376 +/*
1377 + * Early microcode releases for the Spectre v2 mitigation were broken.
1378 + * Information taken from;
1379 + * - https://newsroom.intel.com/wp-content/uploads/sites/11/2018/01/microcode-update-guidance.pdf
1380 + * - https://kb.vmware.com/s/article/52345
1381 + * - Microcode revisions observed in the wild
1382 + * - Release note from 20180108 microcode release
1383 + */
1384 +struct sku_microcode {
1385 + u8 model;
1386 + u8 stepping;
1387 + u32 microcode;
1388 +};
1389 +static const struct sku_microcode spectre_bad_microcodes[] = {
1390 + { INTEL_FAM6_KABYLAKE_DESKTOP, 0x0B, 0x84 },
1391 + { INTEL_FAM6_KABYLAKE_DESKTOP, 0x0A, 0x84 },
1392 + { INTEL_FAM6_KABYLAKE_DESKTOP, 0x09, 0x84 },
1393 + { INTEL_FAM6_KABYLAKE_MOBILE, 0x0A, 0x84 },
1394 + { INTEL_FAM6_KABYLAKE_MOBILE, 0x09, 0x84 },
1395 + { INTEL_FAM6_SKYLAKE_X, 0x03, 0x0100013e },
1396 + { INTEL_FAM6_SKYLAKE_X, 0x04, 0x0200003c },
1397 + { INTEL_FAM6_SKYLAKE_MOBILE, 0x03, 0xc2 },
1398 + { INTEL_FAM6_SKYLAKE_DESKTOP, 0x03, 0xc2 },
1399 + { INTEL_FAM6_BROADWELL_CORE, 0x04, 0x28 },
1400 + { INTEL_FAM6_BROADWELL_GT3E, 0x01, 0x1b },
1401 + { INTEL_FAM6_BROADWELL_XEON_D, 0x02, 0x14 },
1402 + { INTEL_FAM6_BROADWELL_XEON_D, 0x03, 0x07000011 },
1403 + { INTEL_FAM6_BROADWELL_X, 0x01, 0x0b000025 },
1404 + { INTEL_FAM6_HASWELL_ULT, 0x01, 0x21 },
1405 + { INTEL_FAM6_HASWELL_GT3E, 0x01, 0x18 },
1406 + { INTEL_FAM6_HASWELL_CORE, 0x03, 0x23 },
1407 + { INTEL_FAM6_HASWELL_X, 0x02, 0x3b },
1408 + { INTEL_FAM6_HASWELL_X, 0x04, 0x10 },
1409 + { INTEL_FAM6_IVYBRIDGE_X, 0x04, 0x42a },
1410 + /* Updated in the 20180108 release; blacklist until we know otherwise */
1411 + { INTEL_FAM6_ATOM_GEMINI_LAKE, 0x01, 0x22 },
1412 + /* Observed in the wild */
1413 + { INTEL_FAM6_SANDYBRIDGE_X, 0x06, 0x61b },
1414 + { INTEL_FAM6_SANDYBRIDGE_X, 0x07, 0x712 },
1415 +};
1416 +
1417 +static bool bad_spectre_microcode(struct cpuinfo_x86 *c)
1418 +{
1419 + int i;
1420 +
1421 + for (i = 0; i < ARRAY_SIZE(spectre_bad_microcodes); i++) {
1422 + if (c->x86_model == spectre_bad_microcodes[i].model &&
1423 + c->x86_mask == spectre_bad_microcodes[i].stepping)
1424 + return (c->microcode <= spectre_bad_microcodes[i].microcode);
1425 + }
1426 + return false;
1427 +}
1428 +
1429 static void early_init_intel(struct cpuinfo_x86 *c)
1430 {
1431 u64 misc_enable;
1432 @@ -122,6 +175,19 @@ static void early_init_intel(struct cpuinfo_x86 *c)
1433 if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64))
1434 c->microcode = intel_get_microcode_revision();
1435
1436 + /* Now if any of them are set, check the blacklist and clear the lot */
1437 + if ((cpu_has(c, X86_FEATURE_SPEC_CTRL) ||
1438 + cpu_has(c, X86_FEATURE_INTEL_STIBP) ||
1439 + cpu_has(c, X86_FEATURE_IBRS) || cpu_has(c, X86_FEATURE_IBPB) ||
1440 + cpu_has(c, X86_FEATURE_STIBP)) && bad_spectre_microcode(c)) {
1441 + pr_warn("Intel Spectre v2 broken microcode detected; disabling Speculation Control\n");
1442 + setup_clear_cpu_cap(X86_FEATURE_IBRS);
1443 + setup_clear_cpu_cap(X86_FEATURE_IBPB);
1444 + setup_clear_cpu_cap(X86_FEATURE_STIBP);
1445 + setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL);
1446 + setup_clear_cpu_cap(X86_FEATURE_INTEL_STIBP);
1447 + }
1448 +
1449 /*
1450 * Atom erratum AAE44/AAF40/AAG38/AAH41:
1451 *
1452 diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
1453 index d0e69769abfd..df11f5d604be 100644
1454 --- a/arch/x86/kernel/cpu/scattered.c
1455 +++ b/arch/x86/kernel/cpu/scattered.c
1456 @@ -21,8 +21,6 @@ struct cpuid_bit {
1457 static const struct cpuid_bit cpuid_bits[] = {
1458 { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 },
1459 { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 },
1460 - { X86_FEATURE_AVX512_4VNNIW, CPUID_EDX, 2, 0x00000007, 0 },
1461 - { X86_FEATURE_AVX512_4FMAPS, CPUID_EDX, 3, 0x00000007, 0 },
1462 { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 },
1463 { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 },
1464 { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 },
1465 diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
1466 index c75466232016..9eb448c7859d 100644
1467 --- a/arch/x86/kernel/process_64.c
1468 +++ b/arch/x86/kernel/process_64.c
1469 @@ -557,7 +557,7 @@ static void __set_personality_x32(void)
1470 * Pretend to come from a x32 execve.
1471 */
1472 task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT;
1473 - current->thread.status &= ~TS_COMPAT;
1474 + current_thread_info()->status &= ~TS_COMPAT;
1475 #endif
1476 }
1477
1478 @@ -571,7 +571,7 @@ static void __set_personality_ia32(void)
1479 current->personality |= force_personality32;
1480 /* Prepare the first "return" to user space */
1481 task_pt_regs(current)->orig_ax = __NR_ia32_execve;
1482 - current->thread.status |= TS_COMPAT;
1483 + current_thread_info()->status |= TS_COMPAT;
1484 #endif
1485 }
1486
1487 diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
1488 index f37d18124648..ed5c4cdf0a34 100644
1489 --- a/arch/x86/kernel/ptrace.c
1490 +++ b/arch/x86/kernel/ptrace.c
1491 @@ -935,7 +935,7 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value)
1492 */
1493 regs->orig_ax = value;
1494 if (syscall_get_nr(child, regs) >= 0)
1495 - child->thread.status |= TS_I386_REGS_POKED;
1496 + child->thread_info.status |= TS_I386_REGS_POKED;
1497 break;
1498
1499 case offsetof(struct user32, regs.eflags):
1500 diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
1501 index b9e00e8f1c9b..4cdc0b27ec82 100644
1502 --- a/arch/x86/kernel/signal.c
1503 +++ b/arch/x86/kernel/signal.c
1504 @@ -787,7 +787,7 @@ static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs)
1505 * than the tracee.
1506 */
1507 #ifdef CONFIG_IA32_EMULATION
1508 - if (current->thread.status & (TS_COMPAT|TS_I386_REGS_POKED))
1509 + if (current_thread_info()->status & (TS_COMPAT|TS_I386_REGS_POKED))
1510 return __NR_ia32_restart_syscall;
1511 #endif
1512 #ifdef CONFIG_X86_X32_ABI
1513 diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
1514 index 0099e10eb045..13f5d4217e4f 100644
1515 --- a/arch/x86/kvm/cpuid.c
1516 +++ b/arch/x86/kvm/cpuid.c
1517 @@ -67,9 +67,7 @@ u64 kvm_supported_xcr0(void)
1518
1519 #define F(x) bit(X86_FEATURE_##x)
1520
1521 -/* These are scattered features in cpufeatures.h. */
1522 -#define KVM_CPUID_BIT_AVX512_4VNNIW 2
1523 -#define KVM_CPUID_BIT_AVX512_4FMAPS 3
1524 +/* For scattered features from cpufeatures.h; we currently expose none */
1525 #define KF(x) bit(KVM_CPUID_BIT_##x)
1526
1527 int kvm_update_cpuid(struct kvm_vcpu *vcpu)
1528 @@ -367,6 +365,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1529 F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) |
1530 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
1531
1532 + /* cpuid 0x80000008.ebx */
1533 + const u32 kvm_cpuid_8000_0008_ebx_x86_features =
1534 + F(IBPB) | F(IBRS);
1535 +
1536 /* cpuid 0xC0000001.edx */
1537 const u32 kvm_cpuid_C000_0001_edx_x86_features =
1538 F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
1539 @@ -392,7 +394,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1540
1541 /* cpuid 7.0.edx*/
1542 const u32 kvm_cpuid_7_0_edx_x86_features =
1543 - KF(AVX512_4VNNIW) | KF(AVX512_4FMAPS);
1544 + F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
1545 + F(ARCH_CAPABILITIES);
1546
1547 /* all calls to cpuid_count() should be made on the same cpu */
1548 get_cpu();
1549 @@ -477,7 +480,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1550 if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
1551 entry->ecx &= ~F(PKU);
1552 entry->edx &= kvm_cpuid_7_0_edx_x86_features;
1553 - entry->edx &= get_scattered_cpuid_leaf(7, 0, CPUID_EDX);
1554 + cpuid_mask(&entry->edx, CPUID_7_EDX);
1555 } else {
1556 entry->ebx = 0;
1557 entry->ecx = 0;
1558 @@ -627,7 +630,14 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1559 if (!g_phys_as)
1560 g_phys_as = phys_as;
1561 entry->eax = g_phys_as | (virt_as << 8);
1562 - entry->ebx = entry->edx = 0;
1563 + entry->edx = 0;
1564 + /* IBRS and IBPB aren't necessarily present in hardware cpuid */
1565 + if (boot_cpu_has(X86_FEATURE_IBPB))
1566 + entry->ebx |= F(IBPB);
1567 + if (boot_cpu_has(X86_FEATURE_IBRS))
1568 + entry->ebx |= F(IBRS);
1569 + entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features;
1570 + cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX);
1571 break;
1572 }
1573 case 0x80000019:
1574 diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
1575 index c2cea6651279..9a327d5b6d1f 100644
1576 --- a/arch/x86/kvm/cpuid.h
1577 +++ b/arch/x86/kvm/cpuid.h
1578 @@ -54,6 +54,7 @@ static const struct cpuid_reg reverse_cpuid[] = {
1579 [CPUID_8000_000A_EDX] = {0x8000000a, 0, CPUID_EDX},
1580 [CPUID_7_ECX] = { 7, 0, CPUID_ECX},
1581 [CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX},
1582 + [CPUID_7_EDX] = { 7, 0, CPUID_EDX},
1583 };
1584
1585 static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature)
1586 diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
1587 index b514b2b2845a..290ecf711aec 100644
1588 --- a/arch/x86/kvm/emulate.c
1589 +++ b/arch/x86/kvm/emulate.c
1590 @@ -25,6 +25,7 @@
1591 #include <asm/kvm_emulate.h>
1592 #include <linux/stringify.h>
1593 #include <asm/debugreg.h>
1594 +#include <asm/nospec-branch.h>
1595
1596 #include "x86.h"
1597 #include "tss.h"
1598 @@ -1021,8 +1022,8 @@ static __always_inline u8 test_cc(unsigned int condition, unsigned long flags)
1599 void (*fop)(void) = (void *)em_setcc + 4 * (condition & 0xf);
1600
1601 flags = (flags & EFLAGS_MASK) | X86_EFLAGS_IF;
1602 - asm("push %[flags]; popf; call *%[fastop]"
1603 - : "=a"(rc) : [fastop]"r"(fop), [flags]"r"(flags));
1604 + asm("push %[flags]; popf; " CALL_NOSPEC
1605 + : "=a"(rc) : [thunk_target]"r"(fop), [flags]"r"(flags));
1606 return rc;
1607 }
1608
1609 @@ -5335,9 +5336,9 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *))
1610 if (!(ctxt->d & ByteOp))
1611 fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE;
1612
1613 - asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n"
1614 + asm("push %[flags]; popf; " CALL_NOSPEC " ; pushf; pop %[flags]\n"
1615 : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags),
1616 - [fastop]"+S"(fop), ASM_CALL_CONSTRAINT
1617 + [thunk_target]"+S"(fop), ASM_CALL_CONSTRAINT
1618 : "c"(ctxt->src2.val));
1619
1620 ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK);
1621 diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
1622 index f40d0da1f1d3..4e3c79530526 100644
1623 --- a/arch/x86/kvm/svm.c
1624 +++ b/arch/x86/kvm/svm.c
1625 @@ -184,6 +184,8 @@ struct vcpu_svm {
1626 u64 gs_base;
1627 } host;
1628
1629 + u64 spec_ctrl;
1630 +
1631 u32 *msrpm;
1632
1633 ulong nmi_iret_rip;
1634 @@ -249,6 +251,8 @@ static const struct svm_direct_access_msrs {
1635 { .index = MSR_CSTAR, .always = true },
1636 { .index = MSR_SYSCALL_MASK, .always = true },
1637 #endif
1638 + { .index = MSR_IA32_SPEC_CTRL, .always = false },
1639 + { .index = MSR_IA32_PRED_CMD, .always = false },
1640 { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
1641 { .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
1642 { .index = MSR_IA32_LASTINTFROMIP, .always = false },
1643 @@ -529,6 +533,7 @@ struct svm_cpu_data {
1644 struct kvm_ldttss_desc *tss_desc;
1645
1646 struct page *save_area;
1647 + struct vmcb *current_vmcb;
1648 };
1649
1650 static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
1651 @@ -880,6 +885,25 @@ static bool valid_msr_intercept(u32 index)
1652 return false;
1653 }
1654
1655 +static bool msr_write_intercepted(struct kvm_vcpu *vcpu, unsigned msr)
1656 +{
1657 + u8 bit_write;
1658 + unsigned long tmp;
1659 + u32 offset;
1660 + u32 *msrpm;
1661 +
1662 + msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
1663 + to_svm(vcpu)->msrpm;
1664 +
1665 + offset = svm_msrpm_offset(msr);
1666 + bit_write = 2 * (msr & 0x0f) + 1;
1667 + tmp = msrpm[offset];
1668 +
1669 + BUG_ON(offset == MSR_INVALID);
1670 +
1671 + return !!test_bit(bit_write, &tmp);
1672 +}
1673 +
1674 static void set_msr_interception(u32 *msrpm, unsigned msr,
1675 int read, int write)
1676 {
1677 @@ -1582,6 +1606,8 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
1678 u32 dummy;
1679 u32 eax = 1;
1680
1681 + svm->spec_ctrl = 0;
1682 +
1683 if (!init_event) {
1684 svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
1685 MSR_IA32_APICBASE_ENABLE;
1686 @@ -1703,11 +1729,17 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
1687 __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
1688 kvm_vcpu_uninit(vcpu);
1689 kmem_cache_free(kvm_vcpu_cache, svm);
1690 + /*
1691 + * The vmcb page can be recycled, causing a false negative in
1692 + * svm_vcpu_load(). So do a full IBPB now.
1693 + */
1694 + indirect_branch_prediction_barrier();
1695 }
1696
1697 static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1698 {
1699 struct vcpu_svm *svm = to_svm(vcpu);
1700 + struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
1701 int i;
1702
1703 if (unlikely(cpu != vcpu->cpu)) {
1704 @@ -1736,6 +1768,10 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1705 if (static_cpu_has(X86_FEATURE_RDTSCP))
1706 wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
1707
1708 + if (sd->current_vmcb != svm->vmcb) {
1709 + sd->current_vmcb = svm->vmcb;
1710 + indirect_branch_prediction_barrier();
1711 + }
1712 avic_vcpu_load(vcpu, cpu);
1713 }
1714
1715 @@ -3593,6 +3629,13 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
1716 case MSR_VM_CR:
1717 msr_info->data = svm->nested.vm_cr_msr;
1718 break;
1719 + case MSR_IA32_SPEC_CTRL:
1720 + if (!msr_info->host_initiated &&
1721 + !guest_cpuid_has(vcpu, X86_FEATURE_IBRS))
1722 + return 1;
1723 +
1724 + msr_info->data = svm->spec_ctrl;
1725 + break;
1726 case MSR_IA32_UCODE_REV:
1727 msr_info->data = 0x01000065;
1728 break;
1729 @@ -3684,6 +3727,49 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
1730 case MSR_IA32_TSC:
1731 kvm_write_tsc(vcpu, msr);
1732 break;
1733 + case MSR_IA32_SPEC_CTRL:
1734 + if (!msr->host_initiated &&
1735 + !guest_cpuid_has(vcpu, X86_FEATURE_IBRS))
1736 + return 1;
1737 +
1738 + /* The STIBP bit doesn't fault even if it's not advertised */
1739 + if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP))
1740 + return 1;
1741 +
1742 + svm->spec_ctrl = data;
1743 +
1744 + if (!data)
1745 + break;
1746 +
1747 + /*
1748 + * For non-nested:
1749 + * When it's written (to non-zero) for the first time, pass
1750 + * it through.
1751 + *
1752 + * For nested:
1753 + * The handling of the MSR bitmap for L2 guests is done in
1754 + * nested_svm_vmrun_msrpm.
1755 + * We update the L1 MSR bit as well since it will end up
1756 + * touching the MSR anyway now.
1757 + */
1758 + set_msr_interception(svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
1759 + break;
1760 + case MSR_IA32_PRED_CMD:
1761 + if (!msr->host_initiated &&
1762 + !guest_cpuid_has(vcpu, X86_FEATURE_IBPB))
1763 + return 1;
1764 +
1765 + if (data & ~PRED_CMD_IBPB)
1766 + return 1;
1767 +
1768 + if (!data)
1769 + break;
1770 +
1771 + wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
1772 + if (is_guest_mode(vcpu))
1773 + break;
1774 + set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
1775 + break;
1776 case MSR_STAR:
1777 svm->vmcb->save.star = data;
1778 break;
1779 @@ -4936,6 +5022,15 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
1780
1781 local_irq_enable();
1782
1783 + /*
1784 + * If this vCPU has touched SPEC_CTRL, restore the guest's value if
1785 + * it's non-zero. Since vmentry is serialising on affected CPUs, there
1786 + * is no need to worry about the conditional branch over the wrmsr
1787 + * being speculatively taken.
1788 + */
1789 + if (svm->spec_ctrl)
1790 + wrmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);
1791 +
1792 asm volatile (
1793 "push %%" _ASM_BP "; \n\t"
1794 "mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t"
1795 @@ -5028,6 +5123,27 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
1796 #endif
1797 );
1798
1799 + /*
1800 + * We do not use IBRS in the kernel. If this vCPU has used the
1801 + * SPEC_CTRL MSR it may have left it on; save the value and
1802 + * turn it off. This is much more efficient than blindly adding
1803 + * it to the atomic save/restore list. Especially as the former
1804 + * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
1805 + *
1806 + * For non-nested case:
1807 + * If the L01 MSR bitmap does not intercept the MSR, then we need to
1808 + * save it.
1809 + *
1810 + * For nested case:
1811 + * If the L02 MSR bitmap does not intercept the MSR, then we need to
1812 + * save it.
1813 + */
1814 + if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))
1815 + rdmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);
1816 +
1817 + if (svm->spec_ctrl)
1818 + wrmsrl(MSR_IA32_SPEC_CTRL, 0);
1819 +
1820 /* Eliminate branch target predictions from guest mode */
1821 vmexit_fill_RSB();
1822
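
The two svm_vcpu_run() hunks above sit far apart in the diff but form one policy. A condensed, illustrative sketch of that sequence (ordinary C in the kernel context shown above; run_guest() is a stand-in for the inline VMRUN asm block, not a real function):

    /* Before entry: install the guest's SPEC_CTRL only if it ever wrote one. */
    if (svm->spec_ctrl)
            wrmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);

    run_guest();                            /* VMRUN ... #VMEXIT */

    /*
     * After exit: re-read the MSR only if the guest could write it directly
     * (write intercept disabled), then clear it so the host never runs with
     * IBRS set.
     */
    if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))
            rdmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);
    if (svm->spec_ctrl)
            wrmsrl(MSR_IA32_SPEC_CTRL, 0);
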
1823 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
1824 index c829d89e2e63..bee4c49f6dd0 100644
1825 --- a/arch/x86/kvm/vmx.c
1826 +++ b/arch/x86/kvm/vmx.c
1827 @@ -34,6 +34,7 @@
1828 #include <linux/tboot.h>
1829 #include <linux/hrtimer.h>
1830 #include <linux/frame.h>
1831 +#include <linux/nospec.h>
1832 #include "kvm_cache_regs.h"
1833 #include "x86.h"
1834
1835 @@ -111,6 +112,14 @@ static u64 __read_mostly host_xss;
1836 static bool __read_mostly enable_pml = 1;
1837 module_param_named(pml, enable_pml, bool, S_IRUGO);
1838
1839 +#define MSR_TYPE_R 1
1840 +#define MSR_TYPE_W 2
1841 +#define MSR_TYPE_RW 3
1842 +
1843 +#define MSR_BITMAP_MODE_X2APIC 1
1844 +#define MSR_BITMAP_MODE_X2APIC_APICV 2
1845 +#define MSR_BITMAP_MODE_LM 4
1846 +
1847 #define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL
1848
1849 /* Guest_tsc -> host_tsc conversion requires 64-bit division. */
1850 @@ -185,7 +194,6 @@ module_param(ple_window_max, int, S_IRUGO);
1851 extern const ulong vmx_return;
1852
1853 #define NR_AUTOLOAD_MSRS 8
1854 -#define VMCS02_POOL_SIZE 1
1855
1856 struct vmcs {
1857 u32 revision_id;
1858 @@ -210,6 +218,7 @@ struct loaded_vmcs {
1859 int soft_vnmi_blocked;
1860 ktime_t entry_time;
1861 s64 vnmi_blocked_time;
1862 + unsigned long *msr_bitmap;
1863 struct list_head loaded_vmcss_on_cpu_link;
1864 };
1865
1866 @@ -226,7 +235,7 @@ struct shared_msr_entry {
1867 * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
1868 * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
1869 * More than one of these structures may exist, if L1 runs multiple L2 guests.
1870 - * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the
1871 + * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the
1872 * underlying hardware which will be used to run L2.
1873 * This structure is packed to ensure that its layout is identical across
1874 * machines (necessary for live migration).
1875 @@ -409,13 +418,6 @@ struct __packed vmcs12 {
1876 */
1877 #define VMCS12_SIZE 0x1000
1878
1879 -/* Used to remember the last vmcs02 used for some recently used vmcs12s */
1880 -struct vmcs02_list {
1881 - struct list_head list;
1882 - gpa_t vmptr;
1883 - struct loaded_vmcs vmcs02;
1884 -};
1885 -
1886 /*
1887 * The nested_vmx structure is part of vcpu_vmx, and holds information we need
1888 * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
1889 @@ -440,15 +442,15 @@ struct nested_vmx {
1890 */
1891 bool sync_shadow_vmcs;
1892
1893 - /* vmcs02_list cache of VMCSs recently used to run L2 guests */
1894 - struct list_head vmcs02_pool;
1895 - int vmcs02_num;
1896 bool change_vmcs01_virtual_x2apic_mode;
1897 /* L2 must run next, and mustn't decide to exit to L1. */
1898 bool nested_run_pending;
1899 +
1900 + struct loaded_vmcs vmcs02;
1901 +
1902 /*
1903 - * Guest pages referred to in vmcs02 with host-physical pointers, so
1904 - * we must keep them pinned while L2 runs.
1905 + * Guest pages referred to in the vmcs02 with host-physical
1906 + * pointers, so we must keep them pinned while L2 runs.
1907 */
1908 struct page *apic_access_page;
1909 struct page *virtual_apic_page;
1910 @@ -457,8 +459,6 @@ struct nested_vmx {
1911 bool pi_pending;
1912 u16 posted_intr_nv;
1913
1914 - unsigned long *msr_bitmap;
1915 -
1916 struct hrtimer preemption_timer;
1917 bool preemption_timer_expired;
1918
1919 @@ -581,6 +581,7 @@ struct vcpu_vmx {
1920 struct kvm_vcpu vcpu;
1921 unsigned long host_rsp;
1922 u8 fail;
1923 + u8 msr_bitmap_mode;
1924 u32 exit_intr_info;
1925 u32 idt_vectoring_info;
1926 ulong rflags;
1927 @@ -592,6 +593,10 @@ struct vcpu_vmx {
1928 u64 msr_host_kernel_gs_base;
1929 u64 msr_guest_kernel_gs_base;
1930 #endif
1931 +
1932 + u64 arch_capabilities;
1933 + u64 spec_ctrl;
1934 +
1935 u32 vm_entry_controls_shadow;
1936 u32 vm_exit_controls_shadow;
1937 u32 secondary_exec_control;
1938 @@ -898,21 +903,18 @@ static const unsigned short vmcs_field_to_offset_table[] = {
1939
1940 static inline short vmcs_field_to_offset(unsigned long field)
1941 {
1942 - BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX);
1943 + const size_t size = ARRAY_SIZE(vmcs_field_to_offset_table);
1944 + unsigned short offset;
1945
1946 - if (field >= ARRAY_SIZE(vmcs_field_to_offset_table))
1947 + BUILD_BUG_ON(size > SHRT_MAX);
1948 + if (field >= size)
1949 return -ENOENT;
1950
1951 - /*
1952 - * FIXME: Mitigation for CVE-2017-5753. To be replaced with a
1953 - * generic mechanism.
1954 - */
1955 - asm("lfence");
1956 -
1957 - if (vmcs_field_to_offset_table[field] == 0)
1958 + field = array_index_nospec(field, size);
1959 + offset = vmcs_field_to_offset_table[field];
1960 + if (offset == 0)
1961 return -ENOENT;
1962 -
1963 - return vmcs_field_to_offset_table[field];
1964 + return offset;
1965 }
1966
1967 static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
1968 @@ -935,6 +937,9 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
1969 static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
1970 static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
1971 u16 error_code);
1972 +static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
1973 +static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
1974 + u32 msr, int type);
1975
1976 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
1977 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
1978 @@ -954,12 +959,6 @@ static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
1979 enum {
1980 VMX_IO_BITMAP_A,
1981 VMX_IO_BITMAP_B,
1982 - VMX_MSR_BITMAP_LEGACY,
1983 - VMX_MSR_BITMAP_LONGMODE,
1984 - VMX_MSR_BITMAP_LEGACY_X2APIC_APICV,
1985 - VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV,
1986 - VMX_MSR_BITMAP_LEGACY_X2APIC,
1987 - VMX_MSR_BITMAP_LONGMODE_X2APIC,
1988 VMX_VMREAD_BITMAP,
1989 VMX_VMWRITE_BITMAP,
1990 VMX_BITMAP_NR
1991 @@ -969,12 +968,6 @@ static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
1992
1993 #define vmx_io_bitmap_a (vmx_bitmap[VMX_IO_BITMAP_A])
1994 #define vmx_io_bitmap_b (vmx_bitmap[VMX_IO_BITMAP_B])
1995 -#define vmx_msr_bitmap_legacy (vmx_bitmap[VMX_MSR_BITMAP_LEGACY])
1996 -#define vmx_msr_bitmap_longmode (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE])
1997 -#define vmx_msr_bitmap_legacy_x2apic_apicv (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC_APICV])
1998 -#define vmx_msr_bitmap_longmode_x2apic_apicv (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV])
1999 -#define vmx_msr_bitmap_legacy_x2apic (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC])
2000 -#define vmx_msr_bitmap_longmode_x2apic (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC])
2001 #define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP])
2002 #define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP])
2003
2004 @@ -1918,6 +1911,52 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
2005 vmcs_write32(EXCEPTION_BITMAP, eb);
2006 }
2007
2008 +/*
2009 + * Check if MSR is intercepted for currently loaded MSR bitmap.
2010 + */
2011 +static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
2012 +{
2013 + unsigned long *msr_bitmap;
2014 + int f = sizeof(unsigned long);
2015 +
2016 + if (!cpu_has_vmx_msr_bitmap())
2017 + return true;
2018 +
2019 + msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;
2020 +
2021 + if (msr <= 0x1fff) {
2022 + return !!test_bit(msr, msr_bitmap + 0x800 / f);
2023 + } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
2024 + msr &= 0x1fff;
2025 + return !!test_bit(msr, msr_bitmap + 0xc00 / f);
2026 + }
2027 +
2028 + return true;
2029 +}
2030 +
2031 +/*
2032 + * Check if MSR is intercepted for L01 MSR bitmap.
2033 + */
2034 +static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
2035 +{
2036 + unsigned long *msr_bitmap;
2037 + int f = sizeof(unsigned long);
2038 +
2039 + if (!cpu_has_vmx_msr_bitmap())
2040 + return true;
2041 +
2042 + msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
2043 +
2044 + if (msr <= 0x1fff) {
2045 + return !!test_bit(msr, msr_bitmap + 0x800 / f);
2046 + } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
2047 + msr &= 0x1fff;
2048 + return !!test_bit(msr, msr_bitmap + 0xc00 / f);
2049 + }
2050 +
2051 + return true;
2052 +}
2053 +
2054 static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
2055 unsigned long entry, unsigned long exit)
2056 {
2057 @@ -2296,6 +2335,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2058 if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
2059 per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
2060 vmcs_load(vmx->loaded_vmcs->vmcs);
2061 + indirect_branch_prediction_barrier();
2062 }
2063
2064 if (!already_loaded) {
2065 @@ -2572,36 +2612,6 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
2066 vmx->guest_msrs[from] = tmp;
2067 }
2068
2069 -static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
2070 -{
2071 - unsigned long *msr_bitmap;
2072 -
2073 - if (is_guest_mode(vcpu))
2074 - msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap;
2075 - else if (cpu_has_secondary_exec_ctrls() &&
2076 - (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
2077 - SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
2078 - if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) {
2079 - if (is_long_mode(vcpu))
2080 - msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv;
2081 - else
2082 - msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv;
2083 - } else {
2084 - if (is_long_mode(vcpu))
2085 - msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
2086 - else
2087 - msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
2088 - }
2089 - } else {
2090 - if (is_long_mode(vcpu))
2091 - msr_bitmap = vmx_msr_bitmap_longmode;
2092 - else
2093 - msr_bitmap = vmx_msr_bitmap_legacy;
2094 - }
2095 -
2096 - vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
2097 -}
2098 -
2099 /*
2100 * Set up the vmcs to automatically save and restore system
2101 * msrs. Don't touch the 64-bit msrs if the guest is in legacy
2102 @@ -2642,7 +2652,7 @@ static void setup_msrs(struct vcpu_vmx *vmx)
2103 vmx->save_nmsrs = save_nmsrs;
2104
2105 if (cpu_has_vmx_msr_bitmap())
2106 - vmx_set_msr_bitmap(&vmx->vcpu);
2107 + vmx_update_msr_bitmap(&vmx->vcpu);
2108 }
2109
2110 /*
2111 @@ -3276,6 +3286,20 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2112 case MSR_IA32_TSC:
2113 msr_info->data = guest_read_tsc(vcpu);
2114 break;
2115 + case MSR_IA32_SPEC_CTRL:
2116 + if (!msr_info->host_initiated &&
2117 + !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) &&
2118 + !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
2119 + return 1;
2120 +
2121 + msr_info->data = to_vmx(vcpu)->spec_ctrl;
2122 + break;
2123 + case MSR_IA32_ARCH_CAPABILITIES:
2124 + if (!msr_info->host_initiated &&
2125 + !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
2126 + return 1;
2127 + msr_info->data = to_vmx(vcpu)->arch_capabilities;
2128 + break;
2129 case MSR_IA32_SYSENTER_CS:
2130 msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
2131 break;
2132 @@ -3383,6 +3407,70 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2133 case MSR_IA32_TSC:
2134 kvm_write_tsc(vcpu, msr_info);
2135 break;
2136 + case MSR_IA32_SPEC_CTRL:
2137 + if (!msr_info->host_initiated &&
2138 + !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) &&
2139 + !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
2140 + return 1;
2141 +
2142 + /* The STIBP bit doesn't fault even if it's not advertised */
2143 + if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP))
2144 + return 1;
2145 +
2146 + vmx->spec_ctrl = data;
2147 +
2148 + if (!data)
2149 + break;
2150 +
2151 + /*
2152 + * For non-nested:
2153 + * When it's written (to non-zero) for the first time, pass
2154 + * it through.
2155 + *
2156 + * For nested:
2157 + * The handling of the MSR bitmap for L2 guests is done in
2158 + * nested_vmx_merge_msr_bitmap. We should not touch the
2159 + * vmcs02.msr_bitmap here since it gets completely overwritten
2160 + * in the merging. We update the vmcs01 here for L1 as well
2161 + * since it will end up touching the MSR anyway now.
2162 + */
2163 + vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap,
2164 + MSR_IA32_SPEC_CTRL,
2165 + MSR_TYPE_RW);
2166 + break;
2167 + case MSR_IA32_PRED_CMD:
2168 + if (!msr_info->host_initiated &&
2169 + !guest_cpuid_has(vcpu, X86_FEATURE_IBPB) &&
2170 + !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
2171 + return 1;
2172 +
2173 + if (data & ~PRED_CMD_IBPB)
2174 + return 1;
2175 +
2176 + if (!data)
2177 + break;
2178 +
2179 + wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
2180 +
2181 + /*
2182 + * For non-nested:
2183 + * When it's written (to non-zero) for the first time, pass
2184 + * it through.
2185 + *
2186 + * For nested:
2187 + * The handling of the MSR bitmap for L2 guests is done in
2188 + * nested_vmx_merge_msr_bitmap. We should not touch the
2189 + * vmcs02.msr_bitmap here since it gets completely overwritten
2190 + * in the merging.
2191 + */
2192 + vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
2193 + MSR_TYPE_W);
2194 + break;
2195 + case MSR_IA32_ARCH_CAPABILITIES:
2196 + if (!msr_info->host_initiated)
2197 + return 1;
2198 + vmx->arch_capabilities = data;
2199 + break;
2200 case MSR_IA32_CR_PAT:
2201 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2202 if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
2203 @@ -3837,11 +3925,6 @@ static struct vmcs *alloc_vmcs_cpu(int cpu)
2204 return vmcs;
2205 }
2206
2207 -static struct vmcs *alloc_vmcs(void)
2208 -{
2209 - return alloc_vmcs_cpu(raw_smp_processor_id());
2210 -}
2211 -
2212 static void free_vmcs(struct vmcs *vmcs)
2213 {
2214 free_pages((unsigned long)vmcs, vmcs_config.order);
2215 @@ -3857,9 +3940,38 @@ static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
2216 loaded_vmcs_clear(loaded_vmcs);
2217 free_vmcs(loaded_vmcs->vmcs);
2218 loaded_vmcs->vmcs = NULL;
2219 + if (loaded_vmcs->msr_bitmap)
2220 + free_page((unsigned long)loaded_vmcs->msr_bitmap);
2221 WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
2222 }
2223
2224 +static struct vmcs *alloc_vmcs(void)
2225 +{
2226 + return alloc_vmcs_cpu(raw_smp_processor_id());
2227 +}
2228 +
2229 +static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
2230 +{
2231 + loaded_vmcs->vmcs = alloc_vmcs();
2232 + if (!loaded_vmcs->vmcs)
2233 + return -ENOMEM;
2234 +
2235 + loaded_vmcs->shadow_vmcs = NULL;
2236 + loaded_vmcs_init(loaded_vmcs);
2237 +
2238 + if (cpu_has_vmx_msr_bitmap()) {
2239 + loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
2240 + if (!loaded_vmcs->msr_bitmap)
2241 + goto out_vmcs;
2242 + memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
2243 + }
2244 + return 0;
2245 +
2246 +out_vmcs:
2247 + free_loaded_vmcs(loaded_vmcs);
2248 + return -ENOMEM;
2249 +}
2250 +
2251 static void free_kvm_area(void)
2252 {
2253 int cpu;
2254 @@ -4918,10 +5030,8 @@ static void free_vpid(int vpid)
2255 spin_unlock(&vmx_vpid_lock);
2256 }
2257
2258 -#define MSR_TYPE_R 1
2259 -#define MSR_TYPE_W 2
2260 -static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
2261 - u32 msr, int type)
2262 +static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
2263 + u32 msr, int type)
2264 {
2265 int f = sizeof(unsigned long);
2266
2267 @@ -4955,6 +5065,50 @@ static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
2268 }
2269 }
2270
2271 +static void __always_inline vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
2272 + u32 msr, int type)
2273 +{
2274 + int f = sizeof(unsigned long);
2275 +
2276 + if (!cpu_has_vmx_msr_bitmap())
2277 + return;
2278 +
2279 + /*
2280 + * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
2281 + * have the write-low and read-high bitmap offsets the wrong way round.
2282 + * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
2283 + */
2284 + if (msr <= 0x1fff) {
2285 + if (type & MSR_TYPE_R)
2286 + /* read-low */
2287 + __set_bit(msr, msr_bitmap + 0x000 / f);
2288 +
2289 + if (type & MSR_TYPE_W)
2290 + /* write-low */
2291 + __set_bit(msr, msr_bitmap + 0x800 / f);
2292 +
2293 + } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
2294 + msr &= 0x1fff;
2295 + if (type & MSR_TYPE_R)
2296 + /* read-high */
2297 + __set_bit(msr, msr_bitmap + 0x400 / f);
2298 +
2299 + if (type & MSR_TYPE_W)
2300 + /* write-high */
2301 + __set_bit(msr, msr_bitmap + 0xc00 / f);
2302 +
2303 + }
2304 +}
2305 +
2306 +static void __always_inline vmx_set_intercept_for_msr(unsigned long *msr_bitmap,
2307 + u32 msr, int type, bool value)
2308 +{
2309 + if (value)
2310 + vmx_enable_intercept_for_msr(msr_bitmap, msr, type);
2311 + else
2312 + vmx_disable_intercept_for_msr(msr_bitmap, msr, type);
2313 +}
2314 +
2315 /*
2316 * If a msr is allowed by L0, we should check whether it is allowed by L1.
2317 * The corresponding bit will be cleared unless both of L0 and L1 allow it.
2318 @@ -5001,30 +5155,70 @@ static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
2319 }
2320 }
2321
2322 -static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
2323 +static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
2324 {
2325 - if (!longmode_only)
2326 - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
2327 - msr, MSR_TYPE_R | MSR_TYPE_W);
2328 - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
2329 - msr, MSR_TYPE_R | MSR_TYPE_W);
2330 + u8 mode = 0;
2331 +
2332 + if (cpu_has_secondary_exec_ctrls() &&
2333 + (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
2334 + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
2335 + mode |= MSR_BITMAP_MODE_X2APIC;
2336 + if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
2337 + mode |= MSR_BITMAP_MODE_X2APIC_APICV;
2338 + }
2339 +
2340 + if (is_long_mode(vcpu))
2341 + mode |= MSR_BITMAP_MODE_LM;
2342 +
2343 + return mode;
2344 }
2345
2346 -static void vmx_disable_intercept_msr_x2apic(u32 msr, int type, bool apicv_active)
2347 +#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
2348 +
2349 +static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap,
2350 + u8 mode)
2351 {
2352 - if (apicv_active) {
2353 - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv,
2354 - msr, type);
2355 - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv,
2356 - msr, type);
2357 - } else {
2358 - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
2359 - msr, type);
2360 - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
2361 - msr, type);
2362 + int msr;
2363 +
2364 + for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
2365 + unsigned word = msr / BITS_PER_LONG;
2366 + msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
2367 + msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
2368 + }
2369 +
2370 + if (mode & MSR_BITMAP_MODE_X2APIC) {
2371 + /*
2372 + * TPR reads and writes can be virtualized even if virtual interrupt
2373 + * delivery is not in use.
2374 + */
2375 + vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW);
2376 + if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
2377 + vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R);
2378 + vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
2379 + vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
2380 + }
2381 }
2382 }
2383
2384 +static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
2385 +{
2386 + struct vcpu_vmx *vmx = to_vmx(vcpu);
2387 + unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
2388 + u8 mode = vmx_msr_bitmap_mode(vcpu);
2389 + u8 changed = mode ^ vmx->msr_bitmap_mode;
2390 +
2391 + if (!changed)
2392 + return;
2393 +
2394 + vmx_set_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW,
2395 + !(mode & MSR_BITMAP_MODE_LM));
2396 +
2397 + if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
2398 + vmx_update_msr_bitmap_x2apic(msr_bitmap, mode);
2399 +
2400 + vmx->msr_bitmap_mode = mode;
2401 +}
2402 +
2403 static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu)
2404 {
2405 return enable_apicv;
2406 @@ -5274,7 +5468,7 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
2407 }
2408
2409 if (cpu_has_vmx_msr_bitmap())
2410 - vmx_set_msr_bitmap(vcpu);
2411 + vmx_update_msr_bitmap(vcpu);
2412 }
2413
2414 static u32 vmx_exec_control(struct vcpu_vmx *vmx)
2415 @@ -5461,7 +5655,7 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
2416 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
2417 }
2418 if (cpu_has_vmx_msr_bitmap())
2419 - vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy));
2420 + vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
2421
2422 vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
2423
2424 @@ -5539,6 +5733,8 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
2425 ++vmx->nmsrs;
2426 }
2427
2428 + if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
2429 + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, vmx->arch_capabilities);
2430
2431 vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);
2432
2433 @@ -5567,6 +5763,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
2434 u64 cr0;
2435
2436 vmx->rmode.vm86_active = 0;
2437 + vmx->spec_ctrl = 0;
2438
2439 vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
2440 kvm_set_cr8(vcpu, 0);
2441 @@ -6744,7 +6941,7 @@ void vmx_enable_tdp(void)
2442
2443 static __init int hardware_setup(void)
2444 {
2445 - int r = -ENOMEM, i, msr;
2446 + int r = -ENOMEM, i;
2447
2448 rdmsrl_safe(MSR_EFER, &host_efer);
2449
2450 @@ -6764,9 +6961,6 @@ static __init int hardware_setup(void)
2451
2452 memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
2453
2454 - memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
2455 - memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
2456 -
2457 if (setup_vmcs_config(&vmcs_config) < 0) {
2458 r = -EIO;
2459 goto out;
2460 @@ -6835,42 +7029,8 @@ static __init int hardware_setup(void)
2461 kvm_tsc_scaling_ratio_frac_bits = 48;
2462 }
2463
2464 - vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
2465 - vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
2466 - vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
2467 - vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
2468 - vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
2469 - vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
2470 -
2471 - memcpy(vmx_msr_bitmap_legacy_x2apic_apicv,
2472 - vmx_msr_bitmap_legacy, PAGE_SIZE);
2473 - memcpy(vmx_msr_bitmap_longmode_x2apic_apicv,
2474 - vmx_msr_bitmap_longmode, PAGE_SIZE);
2475 - memcpy(vmx_msr_bitmap_legacy_x2apic,
2476 - vmx_msr_bitmap_legacy, PAGE_SIZE);
2477 - memcpy(vmx_msr_bitmap_longmode_x2apic,
2478 - vmx_msr_bitmap_longmode, PAGE_SIZE);
2479 -
2480 set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
2481
2482 - for (msr = 0x800; msr <= 0x8ff; msr++) {
2483 - if (msr == 0x839 /* TMCCT */)
2484 - continue;
2485 - vmx_disable_intercept_msr_x2apic(msr, MSR_TYPE_R, true);
2486 - }
2487 -
2488 - /*
2489 - * TPR reads and writes can be virtualized even if virtual interrupt
2490 - * delivery is not in use.
2491 - */
2492 - vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_W, true);
2493 - vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_R | MSR_TYPE_W, false);
2494 -
2495 - /* EOI */
2496 - vmx_disable_intercept_msr_x2apic(0x80b, MSR_TYPE_W, true);
2497 - /* SELF-IPI */
2498 - vmx_disable_intercept_msr_x2apic(0x83f, MSR_TYPE_W, true);
2499 -
2500 if (enable_ept)
2501 vmx_enable_tdp();
2502 else
2503 @@ -6973,94 +7133,6 @@ static int handle_monitor(struct kvm_vcpu *vcpu)
2504 return handle_nop(vcpu);
2505 }
2506
2507 -/*
2508 - * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.
2509 - * We could reuse a single VMCS for all the L2 guests, but we also want the
2510 - * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this
2511 - * allows keeping them loaded on the processor, and in the future will allow
2512 - * optimizations where prepare_vmcs02 doesn't need to set all the fields on
2513 - * every entry if they never change.
2514 - * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE
2515 - * (>=0) with a vmcs02 for each recently loaded vmcs12s, most recent first.
2516 - *
2517 - * The following functions allocate and free a vmcs02 in this pool.
2518 - */
2519 -
2520 -/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */
2521 -static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
2522 -{
2523 - struct vmcs02_list *item;
2524 - list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
2525 - if (item->vmptr == vmx->nested.current_vmptr) {
2526 - list_move(&item->list, &vmx->nested.vmcs02_pool);
2527 - return &item->vmcs02;
2528 - }
2529 -
2530 - if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
2531 - /* Recycle the least recently used VMCS. */
2532 - item = list_last_entry(&vmx->nested.vmcs02_pool,
2533 - struct vmcs02_list, list);
2534 - item->vmptr = vmx->nested.current_vmptr;
2535 - list_move(&item->list, &vmx->nested.vmcs02_pool);
2536 - return &item->vmcs02;
2537 - }
2538 -
2539 - /* Create a new VMCS */
2540 - item = kzalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
2541 - if (!item)
2542 - return NULL;
2543 - item->vmcs02.vmcs = alloc_vmcs();
2544 - item->vmcs02.shadow_vmcs = NULL;
2545 - if (!item->vmcs02.vmcs) {
2546 - kfree(item);
2547 - return NULL;
2548 - }
2549 - loaded_vmcs_init(&item->vmcs02);
2550 - item->vmptr = vmx->nested.current_vmptr;
2551 - list_add(&(item->list), &(vmx->nested.vmcs02_pool));
2552 - vmx->nested.vmcs02_num++;
2553 - return &item->vmcs02;
2554 -}
2555 -
2556 -/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */
2557 -static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
2558 -{
2559 - struct vmcs02_list *item;
2560 - list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
2561 - if (item->vmptr == vmptr) {
2562 - free_loaded_vmcs(&item->vmcs02);
2563 - list_del(&item->list);
2564 - kfree(item);
2565 - vmx->nested.vmcs02_num--;
2566 - return;
2567 - }
2568 -}
2569 -
2570 -/*
2571 - * Free all VMCSs saved for this vcpu, except the one pointed by
2572 - * vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs
2573 - * must be &vmx->vmcs01.
2574 - */
2575 -static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
2576 -{
2577 - struct vmcs02_list *item, *n;
2578 -
2579 - WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01);
2580 - list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) {
2581 - /*
2582 - * Something will leak if the above WARN triggers. Better than
2583 - * a use-after-free.
2584 - */
2585 - if (vmx->loaded_vmcs == &item->vmcs02)
2586 - continue;
2587 -
2588 - free_loaded_vmcs(&item->vmcs02);
2589 - list_del(&item->list);
2590 - kfree(item);
2591 - vmx->nested.vmcs02_num--;
2592 - }
2593 -}
2594 -
2595 /*
2596 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
2597 * set the success or error code of an emulated VMX instruction, as specified
2598 @@ -7241,13 +7313,11 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
2599 {
2600 struct vcpu_vmx *vmx = to_vmx(vcpu);
2601 struct vmcs *shadow_vmcs;
2602 + int r;
2603
2604 - if (cpu_has_vmx_msr_bitmap()) {
2605 - vmx->nested.msr_bitmap =
2606 - (unsigned long *)__get_free_page(GFP_KERNEL);
2607 - if (!vmx->nested.msr_bitmap)
2608 - goto out_msr_bitmap;
2609 - }
2610 + r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
2611 + if (r < 0)
2612 + goto out_vmcs02;
2613
2614 vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
2615 if (!vmx->nested.cached_vmcs12)
2616 @@ -7264,9 +7334,6 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
2617 vmx->vmcs01.shadow_vmcs = shadow_vmcs;
2618 }
2619
2620 - INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
2621 - vmx->nested.vmcs02_num = 0;
2622 -
2623 hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
2624 HRTIMER_MODE_REL_PINNED);
2625 vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
2626 @@ -7278,9 +7345,9 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
2627 kfree(vmx->nested.cached_vmcs12);
2628
2629 out_cached_vmcs12:
2630 - free_page((unsigned long)vmx->nested.msr_bitmap);
2631 + free_loaded_vmcs(&vmx->nested.vmcs02);
2632
2633 -out_msr_bitmap:
2634 +out_vmcs02:
2635 return -ENOMEM;
2636 }
2637
2638 @@ -7423,10 +7490,6 @@ static void free_nested(struct vcpu_vmx *vmx)
2639 free_vpid(vmx->nested.vpid02);
2640 vmx->nested.posted_intr_nv = -1;
2641 vmx->nested.current_vmptr = -1ull;
2642 - if (vmx->nested.msr_bitmap) {
2643 - free_page((unsigned long)vmx->nested.msr_bitmap);
2644 - vmx->nested.msr_bitmap = NULL;
2645 - }
2646 if (enable_shadow_vmcs) {
2647 vmx_disable_shadow_vmcs(vmx);
2648 vmcs_clear(vmx->vmcs01.shadow_vmcs);
2649 @@ -7434,7 +7497,7 @@ static void free_nested(struct vcpu_vmx *vmx)
2650 vmx->vmcs01.shadow_vmcs = NULL;
2651 }
2652 kfree(vmx->nested.cached_vmcs12);
2653 - /* Unpin physical memory we referred to in current vmcs02 */
2654 + /* Unpin physical memory we referred to in the vmcs02 */
2655 if (vmx->nested.apic_access_page) {
2656 kvm_release_page_dirty(vmx->nested.apic_access_page);
2657 vmx->nested.apic_access_page = NULL;
2658 @@ -7450,7 +7513,7 @@ static void free_nested(struct vcpu_vmx *vmx)
2659 vmx->nested.pi_desc = NULL;
2660 }
2661
2662 - nested_free_all_saved_vmcss(vmx);
2663 + free_loaded_vmcs(&vmx->nested.vmcs02);
2664 }
2665
2666 /* Emulate the VMXOFF instruction */
2667 @@ -7493,8 +7556,6 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
2668 vmptr + offsetof(struct vmcs12, launch_state),
2669 &zero, sizeof(zero));
2670
2671 - nested_free_vmcs02(vmx, vmptr);
2672 -
2673 nested_vmx_succeed(vcpu);
2674 return kvm_skip_emulated_instruction(vcpu);
2675 }
2676 @@ -8406,10 +8467,11 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
2677
2678 /*
2679 * The host physical addresses of some pages of guest memory
2680 - * are loaded into VMCS02 (e.g. L1's Virtual APIC Page). The CPU
2681 - * may write to these pages via their host physical address while
2682 - * L2 is running, bypassing any address-translation-based dirty
2683 - * tracking (e.g. EPT write protection).
2684 + * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
2685 + * Page). The CPU may write to these pages via their host
2686 + * physical address while L2 is running, bypassing any
2687 + * address-translation-based dirty tracking (e.g. EPT write
2688 + * protection).
2689 *
2690 * Mark them dirty on every exit from L2 to prevent them from
2691 * getting out of sync with dirty tracking.
2692 @@ -8943,7 +9005,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
2693 }
2694 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
2695
2696 - vmx_set_msr_bitmap(vcpu);
2697 + vmx_update_msr_bitmap(vcpu);
2698 }
2699
2700 static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
2701 @@ -9129,14 +9191,14 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
2702 #endif
2703 "pushf\n\t"
2704 __ASM_SIZE(push) " $%c[cs]\n\t"
2705 - "call *%[entry]\n\t"
2706 + CALL_NOSPEC
2707 :
2708 #ifdef CONFIG_X86_64
2709 [sp]"=&r"(tmp),
2710 #endif
2711 ASM_CALL_CONSTRAINT
2712 :
2713 - [entry]"r"(entry),
2714 + THUNK_TARGET(entry),
2715 [ss]"i"(__KERNEL_DS),
2716 [cs]"i"(__KERNEL_CS)
2717 );
2718 @@ -9373,6 +9435,15 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
2719
2720 vmx_arm_hv_timer(vcpu);
2721
2722 + /*
2723 + * If this vCPU has touched SPEC_CTRL, restore the guest's value if
2724 + * it's non-zero. Since vmentry is serialising on affected CPUs, there
2725 + * is no need to worry about the conditional branch over the wrmsr
2726 + * being speculatively taken.
2727 + */
2728 + if (vmx->spec_ctrl)
2729 + wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
2730 +
2731 vmx->__launched = vmx->loaded_vmcs->launched;
2732 asm(
2733 /* Store host registers */
2734 @@ -9491,6 +9562,27 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
2735 #endif
2736 );
2737
2738 + /*
2739 + * We do not use IBRS in the kernel. If this vCPU has used the
2740 + * SPEC_CTRL MSR it may have left it on; save the value and
2741 + * turn it off. This is much more efficient than blindly adding
2742 + * it to the atomic save/restore list. Especially as the former
2743 + * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
2744 + *
2745 + * For non-nested case:
2746 + * If the L01 MSR bitmap does not intercept the MSR, then we need to
2747 + * save it.
2748 + *
2749 + * For nested case:
2750 + * If the L02 MSR bitmap does not intercept the MSR, then we need to
2751 + * save it.
2752 + */
2753 + if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))
2754 + rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
2755 +
2756 + if (vmx->spec_ctrl)
2757 + wrmsrl(MSR_IA32_SPEC_CTRL, 0);
2758 +
2759 /* Eliminate branch target predictions from guest mode */
2760 vmexit_fill_RSB();
2761
2762 @@ -9604,6 +9696,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
2763 {
2764 int err;
2765 struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
2766 + unsigned long *msr_bitmap;
2767 int cpu;
2768
2769 if (!vmx)
2770 @@ -9636,13 +9729,20 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
2771 if (!vmx->guest_msrs)
2772 goto free_pml;
2773
2774 - vmx->loaded_vmcs = &vmx->vmcs01;
2775 - vmx->loaded_vmcs->vmcs = alloc_vmcs();
2776 - vmx->loaded_vmcs->shadow_vmcs = NULL;
2777 - if (!vmx->loaded_vmcs->vmcs)
2778 + err = alloc_loaded_vmcs(&vmx->vmcs01);
2779 + if (err < 0)
2780 goto free_msrs;
2781 - loaded_vmcs_init(vmx->loaded_vmcs);
2782
2783 + msr_bitmap = vmx->vmcs01.msr_bitmap;
2784 + vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW);
2785 + vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW);
2786 + vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
2787 + vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
2788 + vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
2789 + vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
2790 + vmx->msr_bitmap_mode = 0;
2791 +
2792 + vmx->loaded_vmcs = &vmx->vmcs01;
2793 cpu = get_cpu();
2794 vmx_vcpu_load(&vmx->vcpu, cpu);
2795 vmx->vcpu.cpu = cpu;
2796 @@ -10105,10 +10205,25 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
2797 int msr;
2798 struct page *page;
2799 unsigned long *msr_bitmap_l1;
2800 - unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap;
2801 + unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
2802 + /*
2803 + * pred_cmd & spec_ctrl are trying to verify two things:
2804 + *
2805 + * 1. L0 gave a permission to L1 to actually passthrough the MSR. This
2806 + * ensures that we do not accidentally generate an L02 MSR bitmap
2807 + * from the L12 MSR bitmap that is too permissive.
2808 + * 2. That L1 or L2s have actually used the MSR. This avoids
2809 + * unnecessarily merging of the bitmap if the MSR is unused. This
2810 + * works properly because we only update the L01 MSR bitmap lazily.
2811 + * So even if L0 should pass L1 these MSRs, the L01 bitmap is only
2812 + * updated to reflect this when L1 (or its L2s) actually write to
2813 + * the MSR.
2814 + */
2815 + bool pred_cmd = msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD);
2816 + bool spec_ctrl = msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL);
2817
2818 - /* This shortcut is ok because we support only x2APIC MSRs so far. */
2819 - if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
2820 + if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
2821 + !pred_cmd && !spec_ctrl)
2822 return false;
2823
2824 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap);
2825 @@ -10141,6 +10256,19 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
2826 MSR_TYPE_W);
2827 }
2828 }
2829 +
2830 + if (spec_ctrl)
2831 + nested_vmx_disable_intercept_for_msr(
2832 + msr_bitmap_l1, msr_bitmap_l0,
2833 + MSR_IA32_SPEC_CTRL,
2834 + MSR_TYPE_R | MSR_TYPE_W);
2835 +
2836 + if (pred_cmd)
2837 + nested_vmx_disable_intercept_for_msr(
2838 + msr_bitmap_l1, msr_bitmap_l0,
2839 + MSR_IA32_PRED_CMD,
2840 + MSR_TYPE_W);
2841 +
2842 kunmap(page);
2843 kvm_release_page_clean(page);
2844
2845 @@ -10682,6 +10810,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
2846 if (kvm_has_tsc_control)
2847 decache_tsc_multiplier(vmx);
2848
2849 + if (cpu_has_vmx_msr_bitmap())
2850 + vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
2851 +
2852 if (enable_vpid) {
2853 /*
2854 * There is no direct mapping between vpid02 and vpid12, the
2855 @@ -10903,20 +11034,15 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
2856 {
2857 struct vcpu_vmx *vmx = to_vmx(vcpu);
2858 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2859 - struct loaded_vmcs *vmcs02;
2860 u32 msr_entry_idx;
2861 u32 exit_qual;
2862
2863 - vmcs02 = nested_get_current_vmcs02(vmx);
2864 - if (!vmcs02)
2865 - return -ENOMEM;
2866 -
2867 enter_guest_mode(vcpu);
2868
2869 if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
2870 vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
2871
2872 - vmx_switch_vmcs(vcpu, vmcs02);
2873 + vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
2874 vmx_segment_cache_clear(vmx);
2875
2876 if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) {
2877 @@ -11485,7 +11611,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
2878 vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
2879
2880 if (cpu_has_vmx_msr_bitmap())
2881 - vmx_set_msr_bitmap(vcpu);
2882 + vmx_update_msr_bitmap(vcpu);
2883
2884 if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
2885 vmcs12->vm_exit_msr_load_count))
2886 @@ -11534,10 +11660,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
2887 vm_exit_controls_reset_shadow(vmx);
2888 vmx_segment_cache_clear(vmx);
2889
2890 - /* if no vmcs02 cache requested, remove the one we used */
2891 - if (VMCS02_POOL_SIZE == 0)
2892 - nested_free_vmcs02(vmx, vmx->nested.current_vmptr);
2893 -
2894 /* Update any VMCS fields that might have changed while L2 ran */
2895 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
2896 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
2897 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
2898 index c53298dfbf50..ac381437c291 100644
2899 --- a/arch/x86/kvm/x86.c
2900 +++ b/arch/x86/kvm/x86.c
2901 @@ -1009,6 +1009,7 @@ static u32 msrs_to_save[] = {
2902 #endif
2903 MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
2904 MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
2905 + MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES
2906 };
2907
2908 static unsigned num_msrs_to_save;
2909 diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
2910 index f23934bbaf4e..69a473919260 100644
2911 --- a/arch/x86/lib/Makefile
2912 +++ b/arch/x86/lib/Makefile
2913 @@ -27,6 +27,7 @@ lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
2914 lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o insn-eval.o
2915 lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
2916 lib-$(CONFIG_RETPOLINE) += retpoline.o
2917 +OBJECT_FILES_NON_STANDARD_retpoline.o :=y
2918
2919 obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o
2920
2921 diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S
2922 index c97d935a29e8..49b167f73215 100644
2923 --- a/arch/x86/lib/getuser.S
2924 +++ b/arch/x86/lib/getuser.S
2925 @@ -40,6 +40,8 @@ ENTRY(__get_user_1)
2926 mov PER_CPU_VAR(current_task), %_ASM_DX
2927 cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
2928 jae bad_get_user
2929 + sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
2930 + and %_ASM_DX, %_ASM_AX
2931 ASM_STAC
2932 1: movzbl (%_ASM_AX),%edx
2933 xor %eax,%eax
2934 @@ -54,6 +56,8 @@ ENTRY(__get_user_2)
2935 mov PER_CPU_VAR(current_task), %_ASM_DX
2936 cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
2937 jae bad_get_user
2938 + sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
2939 + and %_ASM_DX, %_ASM_AX
2940 ASM_STAC
2941 2: movzwl -1(%_ASM_AX),%edx
2942 xor %eax,%eax
2943 @@ -68,6 +72,8 @@ ENTRY(__get_user_4)
2944 mov PER_CPU_VAR(current_task), %_ASM_DX
2945 cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
2946 jae bad_get_user
2947 + sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
2948 + and %_ASM_DX, %_ASM_AX
2949 ASM_STAC
2950 3: movl -3(%_ASM_AX),%edx
2951 xor %eax,%eax
2952 @@ -83,6 +89,8 @@ ENTRY(__get_user_8)
2953 mov PER_CPU_VAR(current_task), %_ASM_DX
2954 cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
2955 jae bad_get_user
2956 + sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
2957 + and %_ASM_DX, %_ASM_AX
2958 ASM_STAC
2959 4: movq -7(%_ASM_AX),%rdx
2960 xor %eax,%eax
2961 @@ -94,6 +102,8 @@ ENTRY(__get_user_8)
2962 mov PER_CPU_VAR(current_task), %_ASM_DX
2963 cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
2964 jae bad_get_user_8
2965 + sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
2966 + and %_ASM_DX, %_ASM_AX
2967 ASM_STAC
2968 4: movl -7(%_ASM_AX),%edx
2969 5: movl -3(%_ASM_AX),%ecx
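
The sbb/and pair added to each __get_user_N entry is the assembly form of the array_index_mask_nospec() idea: if the limit check is mispredicted, the user pointer is forced to zero before it can be dereferenced. A minimal C analogue of what the two instructions compute (the function name is mine, for illustration only):

    /*
     * Branchless clamp: mask is ~0UL when ptr is below limit and 0UL
     * otherwise, so a pointer that only "passed" the check speculatively
     * ends up dereferencing address 0 rather than attacker-chosen memory.
     */
    static inline unsigned long clamp_user_ptr(unsigned long ptr,
                                               unsigned long limit)
    {
            unsigned long mask = 0UL - (ptr < limit);

            return ptr & mask;
    }
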
2970 diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
2971 index c909961e678a..480edc3a5e03 100644
2972 --- a/arch/x86/lib/retpoline.S
2973 +++ b/arch/x86/lib/retpoline.S
2974 @@ -7,6 +7,7 @@
2975 #include <asm/alternative-asm.h>
2976 #include <asm/export.h>
2977 #include <asm/nospec-branch.h>
2978 +#include <asm/bitsperlong.h>
2979
2980 .macro THUNK reg
2981 .section .text.__x86.indirect_thunk
2982 @@ -46,3 +47,58 @@ GENERATE_THUNK(r13)
2983 GENERATE_THUNK(r14)
2984 GENERATE_THUNK(r15)
2985 #endif
2986 +
2987 +/*
2988 + * Fill the CPU return stack buffer.
2989 + *
2990 + * Each entry in the RSB, if used for a speculative 'ret', contains an
2991 + * infinite 'pause; lfence; jmp' loop to capture speculative execution.
2992 + *
2993 + * This is required in various cases for retpoline and IBRS-based
2994 + * mitigations for the Spectre variant 2 vulnerability. Sometimes to
2995 + * eliminate potentially bogus entries from the RSB, and sometimes
2996 + * purely to ensure that it doesn't get empty, which on some CPUs would
2997 + * allow predictions from other (unwanted!) sources to be used.
2998 + *
2999 + * Google experimented with loop-unrolling and this turned out to be
3000 + * the optimal version - two calls, each with their own speculation
3001 + * trap should their return address end up getting used, in a loop.
3002 + */
3003 +.macro STUFF_RSB nr:req sp:req
3004 + mov $(\nr / 2), %_ASM_BX
3005 + .align 16
3006 +771:
3007 + call 772f
3008 +773: /* speculation trap */
3009 + pause
3010 + lfence
3011 + jmp 773b
3012 + .align 16
3013 +772:
3014 + call 774f
3015 +775: /* speculation trap */
3016 + pause
3017 + lfence
3018 + jmp 775b
3019 + .align 16
3020 +774:
3021 + dec %_ASM_BX
3022 + jnz 771b
3023 + add $((BITS_PER_LONG/8) * \nr), \sp
3024 +.endm
3025 +
3026 +#define RSB_FILL_LOOPS 16 /* To avoid underflow */
3027 +
3028 +ENTRY(__fill_rsb)
3029 + STUFF_RSB RSB_FILL_LOOPS, %_ASM_SP
3030 + ret
3031 +END(__fill_rsb)
3032 +EXPORT_SYMBOL_GPL(__fill_rsb)
3033 +
3034 +#define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */
3035 +
3036 +ENTRY(__clear_rsb)
3037 + STUFF_RSB RSB_CLEAR_LOOPS, %_ASM_SP
3038 + ret
3039 +END(__clear_rsb)
3040 +EXPORT_SYMBOL_GPL(__clear_rsb)
3041 diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
3042 index 1b377f734e64..7add8ba06887 100644
3043 --- a/arch/x86/lib/usercopy_32.c
3044 +++ b/arch/x86/lib/usercopy_32.c
3045 @@ -331,12 +331,12 @@ do { \
3046
3047 unsigned long __copy_user_ll(void *to, const void *from, unsigned long n)
3048 {
3049 - stac();
3050 + __uaccess_begin_nospec();
3051 if (movsl_is_ok(to, from, n))
3052 __copy_user(to, from, n);
3053 else
3054 n = __copy_user_intel(to, from, n);
3055 - clac();
3056 + __uaccess_end();
3057 return n;
3058 }
3059 EXPORT_SYMBOL(__copy_user_ll);
3060 @@ -344,7 +344,7 @@ EXPORT_SYMBOL(__copy_user_ll);
3061 unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from,
3062 unsigned long n)
3063 {
3064 - stac();
3065 + __uaccess_begin_nospec();
3066 #ifdef CONFIG_X86_INTEL_USERCOPY
3067 if (n > 64 && static_cpu_has(X86_FEATURE_XMM2))
3068 n = __copy_user_intel_nocache(to, from, n);
3069 @@ -353,7 +353,7 @@ unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *fr
3070 #else
3071 __copy_user(to, from, n);
3072 #endif
3073 - clac();
3074 + __uaccess_end();
3075 return n;
3076 }
3077 EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero);
3078 diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
3079 index 5bfe61a5e8e3..012d02624848 100644
3080 --- a/arch/x86/mm/tlb.c
3081 +++ b/arch/x86/mm/tlb.c
3082 @@ -6,13 +6,14 @@
3083 #include <linux/interrupt.h>
3084 #include <linux/export.h>
3085 #include <linux/cpu.h>
3086 +#include <linux/debugfs.h>
3087
3088 #include <asm/tlbflush.h>
3089 #include <asm/mmu_context.h>
3090 +#include <asm/nospec-branch.h>
3091 #include <asm/cache.h>
3092 #include <asm/apic.h>
3093 #include <asm/uv/uv.h>
3094 -#include <linux/debugfs.h>
3095
3096 /*
3097 * TLB flushing, formerly SMP-only
3098 @@ -247,6 +248,27 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
3099 } else {
3100 u16 new_asid;
3101 bool need_flush;
3102 + u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
3103 +
3104 + /*
3105 + * Avoid user/user BTB poisoning by flushing the branch
3106 + * predictor when switching between processes. This stops
3107 + * one process from doing Spectre-v2 attacks on another.
3108 + *
3109 + * As an optimization, flush indirect branches only when
3110 + * switching into processes that disable dumping. This
3111 + * protects high value processes like gpg, without having
3112 + * too high performance overhead. IBPB is *expensive*!
3113 + *
3114 + * This will not flush branches when switching into kernel
3115 + * threads. It will also not flush if we switch to idle
3116 + * thread and back to the same process. It will flush if we
3117 + * switch to a different non-dumpable process.
3118 + */
3119 + if (tsk && tsk->mm &&
3120 + tsk->mm->context.ctx_id != last_ctx_id &&
3121 + get_dumpable(tsk->mm) != SUID_DUMP_USER)
3122 + indirect_branch_prediction_barrier();
3123
3124 if (IS_ENABLED(CONFIG_VMAP_STACK)) {
3125 /*
3126 @@ -292,6 +314,14 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
3127 trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
3128 }
3129
3130 + /*
3131 + * Record last user mm's context id, so we can avoid
3132 + * flushing branch buffer with IBPB if we switch back
3133 + * to the same user.
3134 + */
3135 + if (next != &init_mm)
3136 + this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
3137 +
3138 this_cpu_write(cpu_tlbstate.loaded_mm, next);
3139 this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
3140 }
3141 @@ -369,6 +399,7 @@ void initialize_tlbstate_and_flush(void)
3142 write_cr3(build_cr3(mm->pgd, 0));
3143
3144 /* Reinitialize tlbstate. */
3145 + this_cpu_write(cpu_tlbstate.last_ctx_id, mm->context.ctx_id);
3146 this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
3147 this_cpu_write(cpu_tlbstate.next_asid, 1);
3148 this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
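
The IBPB condition added to switch_mm_irqs_off() can be read as a small predicate over the outgoing context id and the incoming task. A self-contained sketch of that predicate, using deliberately simplified stand-in types (the real code works on struct task_struct, struct mm_struct and the per-cpu cpu_tlbstate):

    #include <stdbool.h>
    #include <stdio.h>

    #define SUID_DUMP_USER 1   /* the "ordinary core dumps allowed" value */

    struct toy_mm   { unsigned long long ctx_id; int dumpable; };
    struct toy_task { struct toy_mm *mm; };

    /* Mirrors the condition added above: issue IBPB only when switching to a
     * different user mm that does not allow ordinary core dumps (e.g. a
     * credential-handling process such as gpg). */
    static bool needs_ibpb(const struct toy_task *tsk, unsigned long long last_ctx_id)
    {
        return tsk && tsk->mm &&
               tsk->mm->ctx_id != last_ctx_id &&
               tsk->mm->dumpable != SUID_DUMP_USER;
    }

    int main(void)
    {
        struct toy_mm gpg = { .ctx_id = 42, .dumpable = 0 };   /* non-dumpable */
        struct toy_task tsk = { .mm = &gpg };

        printf("%d\n", needs_ibpb(&tsk, 7));    /* 1: different mm, non-dumpable */
        printf("%d\n", needs_ibpb(&tsk, 42));   /* 0: same mm as last time */
        printf("%d\n", needs_ibpb(NULL, 7));    /* 0: kernel thread, no user mm */
        return 0;
    }
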
3149 diff --git a/drivers/auxdisplay/img-ascii-lcd.c b/drivers/auxdisplay/img-ascii-lcd.c
3150 index db040b378224..9180b9bd5821 100644
3151 --- a/drivers/auxdisplay/img-ascii-lcd.c
3152 +++ b/drivers/auxdisplay/img-ascii-lcd.c
3153 @@ -441,3 +441,7 @@ static struct platform_driver img_ascii_lcd_driver = {
3154 .remove = img_ascii_lcd_remove,
3155 };
3156 module_platform_driver(img_ascii_lcd_driver);
3157 +
3158 +MODULE_DESCRIPTION("Imagination Technologies ASCII LCD Display");
3159 +MODULE_AUTHOR("Paul Burton <paul.burton@mips.com>");
3160 +MODULE_LICENSE("GPL");
3161 diff --git a/drivers/fpga/fpga-region.c b/drivers/fpga/fpga-region.c
3162 index d9ab7c75b14f..e0c73ceba2ed 100644
3163 --- a/drivers/fpga/fpga-region.c
3164 +++ b/drivers/fpga/fpga-region.c
3165 @@ -147,6 +147,7 @@ static struct fpga_manager *fpga_region_get_manager(struct fpga_region *region)
3166 mgr_node = of_parse_phandle(np, "fpga-mgr", 0);
3167 if (mgr_node) {
3168 mgr = of_fpga_mgr_get(mgr_node);
3169 + of_node_put(mgr_node);
3170 of_node_put(np);
3171 return mgr;
3172 }
3173 @@ -192,10 +193,13 @@ static int fpga_region_get_bridges(struct fpga_region *region,
3174 parent_br = region_np->parent;
3175
3176 /* If overlay has a list of bridges, use it. */
3177 - if (of_parse_phandle(overlay, "fpga-bridges", 0))
3178 + br = of_parse_phandle(overlay, "fpga-bridges", 0);
3179 + if (br) {
3180 + of_node_put(br);
3181 np = overlay;
3182 - else
3183 + } else {
3184 np = region_np;
3185 + }
3186
3187 for (i = 0; ; i++) {
3188 br = of_parse_phandle(np, "fpga-bridges", i);
3189 @@ -203,12 +207,15 @@ static int fpga_region_get_bridges(struct fpga_region *region,
3190 break;
3191
3192 /* If parent bridge is in list, skip it. */
3193 - if (br == parent_br)
3194 + if (br == parent_br) {
3195 + of_node_put(br);
3196 continue;
3197 + }
3198
3199 /* If node is a bridge, get it and add to list */
3200 ret = fpga_bridge_get_to_list(br, region->info,
3201 &region->bridge_list);
3202 + of_node_put(br);
3203
3204 /* If any of the bridges are in use, give up */
3205 if (ret == -EBUSY) {
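
The fpga-region changes are all about reference counting: of_parse_phandle() returns a node with its refcount raised, and every path, including the "is there a bridge list at all?" probe and the parent-bridge skip, must drop that reference with of_node_put(). A toy userspace analogue of the get/put discipline, with illustrative names rather than the real OF API:

    #include <stdio.h>

    struct node { int refcount; };

    static struct node pool = { .refcount = 1 };

    static struct node *lookup(void)   /* stands in for of_parse_phandle() */
    {
        pool.refcount++;
        return &pool;
    }

    static void put(struct node *n)    /* stands in for of_node_put() */
    {
        if (n)
            n->refcount--;
    }

    int main(void)
    {
        struct node *n = lookup();

        if (n) {
            /* Use n, then drop the reference even if the lookup was only an
             * existence test, as fpga_region_get_bridges() does above. */
            put(n);
        }
        printf("refcount back to %d\n", pool.refcount);   /* prints 1 */
        return 0;
    }
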
3206 diff --git a/drivers/iio/accel/kxsd9-i2c.c b/drivers/iio/accel/kxsd9-i2c.c
3207 index 98fbb628d5bd..38411e1c155b 100644
3208 --- a/drivers/iio/accel/kxsd9-i2c.c
3209 +++ b/drivers/iio/accel/kxsd9-i2c.c
3210 @@ -63,3 +63,6 @@ static struct i2c_driver kxsd9_i2c_driver = {
3211 .id_table = kxsd9_i2c_id,
3212 };
3213 module_i2c_driver(kxsd9_i2c_driver);
3214 +
3215 +MODULE_LICENSE("GPL v2");
3216 +MODULE_DESCRIPTION("KXSD9 accelerometer I2C interface");
3217 diff --git a/drivers/iio/adc/qcom-vadc-common.c b/drivers/iio/adc/qcom-vadc-common.c
3218 index 47d24ae5462f..fe3d7826783c 100644
3219 --- a/drivers/iio/adc/qcom-vadc-common.c
3220 +++ b/drivers/iio/adc/qcom-vadc-common.c
3221 @@ -5,6 +5,7 @@
3222 #include <linux/math64.h>
3223 #include <linux/log2.h>
3224 #include <linux/err.h>
3225 +#include <linux/module.h>
3226
3227 #include "qcom-vadc-common.h"
3228
3229 @@ -229,3 +230,6 @@ int qcom_vadc_decimation_from_dt(u32 value)
3230 return __ffs64(value / VADC_DECIMATION_MIN);
3231 }
3232 EXPORT_SYMBOL(qcom_vadc_decimation_from_dt);
3233 +
3234 +MODULE_LICENSE("GPL v2");
3235 +MODULE_DESCRIPTION("Qualcomm ADC common functionality");
3236 diff --git a/drivers/pinctrl/pxa/pinctrl-pxa2xx.c b/drivers/pinctrl/pxa/pinctrl-pxa2xx.c
3237 index 866aa3ce1ac9..6cf0006d4c8d 100644
3238 --- a/drivers/pinctrl/pxa/pinctrl-pxa2xx.c
3239 +++ b/drivers/pinctrl/pxa/pinctrl-pxa2xx.c
3240 @@ -436,3 +436,7 @@ int pxa2xx_pinctrl_exit(struct platform_device *pdev)
3241 return 0;
3242 }
3243 EXPORT_SYMBOL_GPL(pxa2xx_pinctrl_exit);
3244 +
3245 +MODULE_AUTHOR("Robert Jarzmik <robert.jarzmik@free.fr>");
3246 +MODULE_DESCRIPTION("Marvell PXA2xx pinctrl driver");
3247 +MODULE_LICENSE("GPL v2");
3248 diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
3249 index 854995e1cae7..7e7e6eb95b0a 100644
3250 --- a/drivers/tty/serial/serial_core.c
3251 +++ b/drivers/tty/serial/serial_core.c
3252 @@ -974,6 +974,8 @@ static int uart_set_info(struct tty_struct *tty, struct tty_port *port,
3253 }
3254 } else {
3255 retval = uart_startup(tty, state, 1);
3256 + if (retval == 0)
3257 + tty_port_set_initialized(port, true);
3258 if (retval > 0)
3259 retval = 0;
3260 }
3261 diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
3262 index 1c65817673db..41615f38bcff 100644
3263 --- a/include/linux/fdtable.h
3264 +++ b/include/linux/fdtable.h
3265 @@ -10,6 +10,7 @@
3266 #include <linux/compiler.h>
3267 #include <linux/spinlock.h>
3268 #include <linux/rcupdate.h>
3269 +#include <linux/nospec.h>
3270 #include <linux/types.h>
3271 #include <linux/init.h>
3272 #include <linux/fs.h>
3273 @@ -82,8 +83,10 @@ static inline struct file *__fcheck_files(struct files_struct *files, unsigned i
3274 {
3275 struct fdtable *fdt = rcu_dereference_raw(files->fdt);
3276
3277 - if (fd < fdt->max_fds)
3278 + if (fd < fdt->max_fds) {
3279 + fd = array_index_nospec(fd, fdt->max_fds);
3280 return rcu_dereference_raw(fdt->fd[fd]);
3281 + }
3282 return NULL;
3283 }
3284
3285 diff --git a/include/linux/init.h b/include/linux/init.h
3286 index ea1b31101d9e..506a98151131 100644
3287 --- a/include/linux/init.h
3288 +++ b/include/linux/init.h
3289 @@ -5,6 +5,13 @@
3290 #include <linux/compiler.h>
3291 #include <linux/types.h>
3292
3293 +/* Built-in __init functions needn't be compiled with retpoline */
3294 +#if defined(RETPOLINE) && !defined(MODULE)
3295 +#define __noretpoline __attribute__((indirect_branch("keep")))
3296 +#else
3297 +#define __noretpoline
3298 +#endif
3299 +
3300 /* These macros are used to mark some functions or
3301 * initialized data (doesn't apply to uninitialized data)
3302 * as `initialization' functions. The kernel can take this
3303 @@ -40,7 +47,7 @@
3304
3305 /* These are for everybody (although not all archs will actually
3306 discard it in modules) */
3307 -#define __init __section(.init.text) __cold __latent_entropy
3308 +#define __init __section(.init.text) __cold __latent_entropy __noretpoline
3309 #define __initdata __section(.init.data)
3310 #define __initconst __section(.init.rodata)
3311 #define __exitdata __section(.exit.data)
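
The effect of __noretpoline is easiest to see outside the kernel. A minimal sketch, assuming a GCC new enough to understand -mindirect-branch=thunk and the matching indirect_branch function attribute (older compilers will merely warn about the unknown attribute); build with something like "gcc -O2 -mindirect-branch=thunk noretpoline.c":

    #define __noretpoline __attribute__((indirect_branch("keep")))

    static int add_one(int x) { return x + 1; }

    /* volatile keeps the compiler from devirtualizing the calls below */
    static int (* volatile fp)(int) = add_one;

    __noretpoline static int call_kept(int x)
    {
        return fp(x);   /* stays an ordinary indirect call through a register */
    }

    static int call_thunked(int x)
    {
        return fp(x);   /* rewritten to go through a retpoline thunk */
    }

    int main(void)
    {
        return call_kept(0) + call_thunked(0) - 2;
    }

Built-in __init code gets the same "keep" treatment automatically, since it is discarded before userspace ever runs and is not worth the retpoline overhead.
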
3312 diff --git a/include/linux/module.h b/include/linux/module.h
3313 index c69b49abe877..1d8f245967be 100644
3314 --- a/include/linux/module.h
3315 +++ b/include/linux/module.h
3316 @@ -801,6 +801,15 @@ static inline void module_bug_finalize(const Elf_Ehdr *hdr,
3317 static inline void module_bug_cleanup(struct module *mod) {}
3318 #endif /* CONFIG_GENERIC_BUG */
3319
3320 +#ifdef RETPOLINE
3321 +extern bool retpoline_module_ok(bool has_retpoline);
3322 +#else
3323 +static inline bool retpoline_module_ok(bool has_retpoline)
3324 +{
3325 + return true;
3326 +}
3327 +#endif
3328 +
3329 #ifdef CONFIG_MODULE_SIG
3330 static inline bool module_sig_ok(struct module *module)
3331 {
3332 diff --git a/include/linux/nospec.h b/include/linux/nospec.h
3333 new file mode 100644
3334 index 000000000000..b99bced39ac2
3335 --- /dev/null
3336 +++ b/include/linux/nospec.h
3337 @@ -0,0 +1,72 @@
3338 +// SPDX-License-Identifier: GPL-2.0
3339 +// Copyright(c) 2018 Linus Torvalds. All rights reserved.
3340 +// Copyright(c) 2018 Alexei Starovoitov. All rights reserved.
3341 +// Copyright(c) 2018 Intel Corporation. All rights reserved.
3342 +
3343 +#ifndef _LINUX_NOSPEC_H
3344 +#define _LINUX_NOSPEC_H
3345 +
3346 +/**
3347 + * array_index_mask_nospec() - generate a ~0 mask when index < size, 0 otherwise
3348 + * @index: array element index
3349 + * @size: number of elements in array
3350 + *
3351 + * When @index is out of bounds (@index >= @size), the sign bit will be
3352 + * set. Extend the sign bit to all bits and invert, giving a result of
3353 + * zero for an out of bounds index, or ~0 if within bounds [0, @size).
3354 + */
3355 +#ifndef array_index_mask_nospec
3356 +static inline unsigned long array_index_mask_nospec(unsigned long index,
3357 + unsigned long size)
3358 +{
3359 + /*
3360 + * Warn developers about inappropriate array_index_nospec() usage.
3361 + *
3362 + * Even if the CPU speculates past the WARN_ONCE branch, the
3363 + * sign bit of @index is taken into account when generating the
3364 + * mask.
3365 + *
3366 + * This warning is compiled out when the compiler can infer that
3367 + * @index and @size are less than LONG_MAX.
3368 + */
3369 + if (WARN_ONCE(index > LONG_MAX || size > LONG_MAX,
3370 + "array_index_nospec() limited to range of [0, LONG_MAX]\n"))
3371 + return 0;
3372 +
3373 + /*
3374 + * Always calculate and emit the mask even if the compiler
3375 + * thinks the mask is not needed. The compiler does not take
3376 + * into account the value of @index under speculation.
3377 + */
3378 + OPTIMIZER_HIDE_VAR(index);
3379 + return ~(long)(index | (size - 1UL - index)) >> (BITS_PER_LONG - 1);
3380 +}
3381 +#endif
3382 +
3383 +/*
3384 + * array_index_nospec - sanitize an array index after a bounds check
3385 + *
3386 + * For a code sequence like:
3387 + *
3388 + * if (index < size) {
3389 + * index = array_index_nospec(index, size);
3390 + * val = array[index];
3391 + * }
3392 + *
3393 + * ...if the CPU speculates past the bounds check then
3394 + * array_index_nospec() will clamp the index within the range of [0,
3395 + * size).
3396 + */
3397 +#define array_index_nospec(index, size) \
3398 +({ \
3399 + typeof(index) _i = (index); \
3400 + typeof(size) _s = (size); \
3401 + unsigned long _mask = array_index_mask_nospec(_i, _s); \
3402 + \
3403 + BUILD_BUG_ON(sizeof(_i) > sizeof(long)); \
3404 + BUILD_BUG_ON(sizeof(_s) > sizeof(long)); \
3405 + \
3406 + _i &= _mask; \
3407 + _i; \
3408 +})
3409 +#endif /* _LINUX_NOSPEC_H */
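
The mask formula is compact enough to check by hand. A standalone userspace sketch, copying the expression from array_index_mask_nospec() above and assuming a 64-bit long (the helper name is illustrative, not kernel API):

    #include <stdio.h>

    /* Same expression as above: ~0UL when index < size, 0UL otherwise,
     * valid for index and size both <= LONG_MAX. */
    static unsigned long mask_nospec(unsigned long index, unsigned long size)
    {
        return ~(long)(index | (size - 1UL - index)) >> (sizeof(long) * 8 - 1);
    }

    int main(void)
    {
        const unsigned long size = 16;
        unsigned long idx[] = { 3, 15, 16, 99 };

        for (unsigned i = 0; i < sizeof(idx) / sizeof(idx[0]); i++)
            printf("index %2lu -> mask %016lx -> clamped %lu\n",
                   idx[i], mask_nospec(idx[i], size),
                   idx[i] & mask_nospec(idx[i], size));
        return 0;
        /* index  3 -> mask ffffffffffffffff -> clamped 3
         * index 15 -> mask ffffffffffffffff -> clamped 15
         * index 16 -> mask 0000000000000000 -> clamped 0
         * index 99 -> mask 0000000000000000 -> clamped 0 */
    }

ANDing the index with the mask is exactly what the array_index_nospec() macro does, so an out-of-bounds index collapses to 0 even under speculation, while in-bounds indices pass through unchanged.
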
3410 diff --git a/kernel/module.c b/kernel/module.c
3411 index dea01ac9cb74..09e48eee4d55 100644
3412 --- a/kernel/module.c
3413 +++ b/kernel/module.c
3414 @@ -2863,6 +2863,15 @@ static int check_modinfo_livepatch(struct module *mod, struct load_info *info)
3415 }
3416 #endif /* CONFIG_LIVEPATCH */
3417
3418 +static void check_modinfo_retpoline(struct module *mod, struct load_info *info)
3419 +{
3420 + if (retpoline_module_ok(get_modinfo(info, "retpoline")))
3421 + return;
3422 +
3423 + pr_warn("%s: loading module not compiled with retpoline compiler.\n",
3424 + mod->name);
3425 +}
3426 +
3427 /* Sets info->hdr and info->len. */
3428 static int copy_module_from_user(const void __user *umod, unsigned long len,
3429 struct load_info *info)
3430 @@ -3029,6 +3038,8 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
3431 add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
3432 }
3433
3434 + check_modinfo_retpoline(mod, info);
3435 +
3436 if (get_modinfo(info, "staging")) {
3437 add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
3438 pr_warn("%s: module is from the staging directory, the quality "
3439 diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
3440 index 542a4fc0a8d7..4bbcfc1e2d43 100644
3441 --- a/net/wireless/nl80211.c
3442 +++ b/net/wireless/nl80211.c
3443 @@ -16,6 +16,7 @@
3444 #include <linux/nl80211.h>
3445 #include <linux/rtnetlink.h>
3446 #include <linux/netlink.h>
3447 +#include <linux/nospec.h>
3448 #include <linux/etherdevice.h>
3449 #include <net/net_namespace.h>
3450 #include <net/genetlink.h>
3451 @@ -2056,20 +2057,22 @@ static const struct nla_policy txq_params_policy[NL80211_TXQ_ATTR_MAX + 1] = {
3452 static int parse_txq_params(struct nlattr *tb[],
3453 struct ieee80211_txq_params *txq_params)
3454 {
3455 + u8 ac;
3456 +
3457 if (!tb[NL80211_TXQ_ATTR_AC] || !tb[NL80211_TXQ_ATTR_TXOP] ||
3458 !tb[NL80211_TXQ_ATTR_CWMIN] || !tb[NL80211_TXQ_ATTR_CWMAX] ||
3459 !tb[NL80211_TXQ_ATTR_AIFS])
3460 return -EINVAL;
3461
3462 - txq_params->ac = nla_get_u8(tb[NL80211_TXQ_ATTR_AC]);
3463 + ac = nla_get_u8(tb[NL80211_TXQ_ATTR_AC]);
3464 txq_params->txop = nla_get_u16(tb[NL80211_TXQ_ATTR_TXOP]);
3465 txq_params->cwmin = nla_get_u16(tb[NL80211_TXQ_ATTR_CWMIN]);
3466 txq_params->cwmax = nla_get_u16(tb[NL80211_TXQ_ATTR_CWMAX]);
3467 txq_params->aifs = nla_get_u8(tb[NL80211_TXQ_ATTR_AIFS]);
3468
3469 - if (txq_params->ac >= NL80211_NUM_ACS)
3470 + if (ac >= NL80211_NUM_ACS)
3471 return -EINVAL;
3472 -
3473 + txq_params->ac = array_index_nospec(ac, NL80211_NUM_ACS);
3474 return 0;
3475 }
3476
3477 diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
3478 index f51cf977c65b..6510536c06df 100644
3479 --- a/scripts/mod/modpost.c
3480 +++ b/scripts/mod/modpost.c
3481 @@ -2165,6 +2165,14 @@ static void add_intree_flag(struct buffer *b, int is_intree)
3482 buf_printf(b, "\nMODULE_INFO(intree, \"Y\");\n");
3483 }
3484
3485 +/* Cannot check for assembler */
3486 +static void add_retpoline(struct buffer *b)
3487 +{
3488 + buf_printf(b, "\n#ifdef RETPOLINE\n");
3489 + buf_printf(b, "MODULE_INFO(retpoline, \"Y\");\n");
3490 + buf_printf(b, "#endif\n");
3491 +}
3492 +
3493 static void add_staging_flag(struct buffer *b, const char *name)
3494 {
3495 static const char *staging_dir = "drivers/staging";
3496 @@ -2506,6 +2514,7 @@ int main(int argc, char **argv)
3497 err |= check_modname_len(mod);
3498 add_header(&buf, mod);
3499 add_intree_flag(&buf, !external_module);
3500 + add_retpoline(&buf);
3501 add_staging_flag(&buf, mod->name);
3502 err |= add_versions(&buf, mod);
3503 add_depends(&buf, mod, modules);
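
For reference, the add_retpoline() hook above makes every generated *.mod.c carry the following fragment, so the "retpoline" modinfo tag is only present when the module really was built with RETPOLINE defined; check_modinfo_retpoline() in kernel/module.c above then inspects that tag at load time:

    #ifdef RETPOLINE
    MODULE_INFO(retpoline, "Y");
    #endif
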
3504 diff --git a/sound/soc/codecs/pcm512x-spi.c b/sound/soc/codecs/pcm512x-spi.c
3505 index 25c63510ae15..7cdd2dc4fd79 100644
3506 --- a/sound/soc/codecs/pcm512x-spi.c
3507 +++ b/sound/soc/codecs/pcm512x-spi.c
3508 @@ -70,3 +70,7 @@ static struct spi_driver pcm512x_spi_driver = {
3509 };
3510
3511 module_spi_driver(pcm512x_spi_driver);
3512 +
3513 +MODULE_DESCRIPTION("ASoC PCM512x codec driver - SPI");
3514 +MODULE_AUTHOR("Mark Brown <broonie@kernel.org>");
3515 +MODULE_LICENSE("GPL v2");
3516 diff --git a/tools/objtool/check.c b/tools/objtool/check.c
3517 index f40d46e24bcc..9cd028aa1509 100644
3518 --- a/tools/objtool/check.c
3519 +++ b/tools/objtool/check.c
3520 @@ -543,18 +543,14 @@ static int add_call_destinations(struct objtool_file *file)
3521 dest_off = insn->offset + insn->len + insn->immediate;
3522 insn->call_dest = find_symbol_by_offset(insn->sec,
3523 dest_off);
3524 - /*
3525 - * FIXME: Thanks to retpolines, it's now considered
3526 - * normal for a function to call within itself. So
3527 - * disable this warning for now.
3528 - */
3529 -#if 0
3530 - if (!insn->call_dest) {
3531 - WARN_FUNC("can't find call dest symbol at offset 0x%lx",
3532 - insn->sec, insn->offset, dest_off);
3533 +
3534 + if (!insn->call_dest && !insn->ignore) {
3535 + WARN_FUNC("unsupported intra-function call",
3536 + insn->sec, insn->offset);
3537 + WARN("If this is a retpoline, please patch it in with alternatives and annotate it with ANNOTATE_NOSPEC_ALTERNATIVE.");
3538 return -1;
3539 }
3540 -#endif
3541 +
3542 } else if (rela->sym->type == STT_SECTION) {
3543 insn->call_dest = find_symbol_by_offset(rela->sym->sec,
3544 rela->addend+4);
3545 @@ -598,7 +594,7 @@ static int handle_group_alt(struct objtool_file *file,
3546 struct instruction *orig_insn,
3547 struct instruction **new_insn)
3548 {
3549 - struct instruction *last_orig_insn, *last_new_insn, *insn, *fake_jump;
3550 + struct instruction *last_orig_insn, *last_new_insn, *insn, *fake_jump = NULL;
3551 unsigned long dest_off;
3552
3553 last_orig_insn = NULL;
3554 @@ -614,28 +610,30 @@ static int handle_group_alt(struct objtool_file *file,
3555 last_orig_insn = insn;
3556 }
3557
3558 - if (!next_insn_same_sec(file, last_orig_insn)) {
3559 - WARN("%s: don't know how to handle alternatives at end of section",
3560 - special_alt->orig_sec->name);
3561 - return -1;
3562 - }
3563 -
3564 - fake_jump = malloc(sizeof(*fake_jump));
3565 - if (!fake_jump) {
3566 - WARN("malloc failed");
3567 - return -1;
3568 + if (next_insn_same_sec(file, last_orig_insn)) {
3569 + fake_jump = malloc(sizeof(*fake_jump));
3570 + if (!fake_jump) {
3571 + WARN("malloc failed");
3572 + return -1;
3573 + }
3574 + memset(fake_jump, 0, sizeof(*fake_jump));
3575 + INIT_LIST_HEAD(&fake_jump->alts);
3576 + clear_insn_state(&fake_jump->state);
3577 +
3578 + fake_jump->sec = special_alt->new_sec;
3579 + fake_jump->offset = -1;
3580 + fake_jump->type = INSN_JUMP_UNCONDITIONAL;
3581 + fake_jump->jump_dest = list_next_entry(last_orig_insn, list);
3582 + fake_jump->ignore = true;
3583 }
3584 - memset(fake_jump, 0, sizeof(*fake_jump));
3585 - INIT_LIST_HEAD(&fake_jump->alts);
3586 - clear_insn_state(&fake_jump->state);
3587 -
3588 - fake_jump->sec = special_alt->new_sec;
3589 - fake_jump->offset = -1;
3590 - fake_jump->type = INSN_JUMP_UNCONDITIONAL;
3591 - fake_jump->jump_dest = list_next_entry(last_orig_insn, list);
3592 - fake_jump->ignore = true;
3593
3594 if (!special_alt->new_len) {
3595 + if (!fake_jump) {
3596 + WARN("%s: empty alternative at end of section",
3597 + special_alt->orig_sec->name);
3598 + return -1;
3599 + }
3600 +
3601 *new_insn = fake_jump;
3602 return 0;
3603 }
3604 @@ -648,6 +646,8 @@ static int handle_group_alt(struct objtool_file *file,
3605
3606 last_new_insn = insn;
3607
3608 + insn->ignore = orig_insn->ignore_alts;
3609 +
3610 if (insn->type != INSN_JUMP_CONDITIONAL &&
3611 insn->type != INSN_JUMP_UNCONDITIONAL)
3612 continue;
3613 @@ -656,8 +656,14 @@ static int handle_group_alt(struct objtool_file *file,
3614 continue;
3615
3616 dest_off = insn->offset + insn->len + insn->immediate;
3617 - if (dest_off == special_alt->new_off + special_alt->new_len)
3618 + if (dest_off == special_alt->new_off + special_alt->new_len) {
3619 + if (!fake_jump) {
3620 + WARN("%s: alternative jump to end of section",
3621 + special_alt->orig_sec->name);
3622 + return -1;
3623 + }
3624 insn->jump_dest = fake_jump;
3625 + }
3626
3627 if (!insn->jump_dest) {
3628 WARN_FUNC("can't find alternative jump destination",
3629 @@ -672,7 +678,8 @@ static int handle_group_alt(struct objtool_file *file,
3630 return -1;
3631 }
3632
3633 - list_add(&fake_jump->list, &last_new_insn->list);
3634 + if (fake_jump)
3635 + list_add(&fake_jump->list, &last_new_insn->list);
3636
3637 return 0;
3638 }
3639 @@ -729,10 +736,6 @@ static int add_special_section_alts(struct objtool_file *file)
3640 goto out;
3641 }
3642
3643 - /* Ignore retpoline alternatives. */
3644 - if (orig_insn->ignore_alts)
3645 - continue;
3646 -
3647 new_insn = NULL;
3648 if (!special_alt->group || special_alt->new_len) {
3649 new_insn = find_insn(file, special_alt->new_sec,
3650 @@ -1089,11 +1092,11 @@ static int decode_sections(struct objtool_file *file)
3651 if (ret)
3652 return ret;
3653
3654 - ret = add_call_destinations(file);
3655 + ret = add_special_section_alts(file);
3656 if (ret)
3657 return ret;
3658
3659 - ret = add_special_section_alts(file);
3660 + ret = add_call_destinations(file);
3661 if (ret)
3662 return ret;
3663
3664 @@ -1720,10 +1723,12 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
3665
3666 insn->visited = true;
3667
3668 - list_for_each_entry(alt, &insn->alts, list) {
3669 - ret = validate_branch(file, alt->insn, state);
3670 - if (ret)
3671 - return 1;
3672 + if (!insn->ignore_alts) {
3673 + list_for_each_entry(alt, &insn->alts, list) {
3674 + ret = validate_branch(file, alt->insn, state);
3675 + if (ret)
3676 + return 1;
3677 + }
3678 }
3679
3680 switch (insn->type) {
3681 diff --git a/tools/objtool/orc_gen.c b/tools/objtool/orc_gen.c
3682 index e61fe703197b..18384d9be4e1 100644
3683 --- a/tools/objtool/orc_gen.c
3684 +++ b/tools/objtool/orc_gen.c
3685 @@ -98,6 +98,11 @@ static int create_orc_entry(struct section *u_sec, struct section *ip_relasec,
3686 struct orc_entry *orc;
3687 struct rela *rela;
3688
3689 + if (!insn_sec->sym) {
3690 + WARN("missing symbol for section %s", insn_sec->name);
3691 + return -1;
3692 + }
3693 +
3694 /* populate ORC data */
3695 orc = (struct orc_entry *)u_sec->data->d_buf + idx;
3696 memcpy(orc, o, sizeof(*orc));