Magellan Linux

Contents of /trunk/kernel-alx/patches-5.4/0316-5.4.217-all-fixes.patch



Revision 3635
Mon Oct 24 12:34:12 2022 UTC by niro
File size: 103893 byte(s)
-sync kernel patches
1 diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
2 index db9d53b879f89..8f71a17ad5442 100644
3 --- a/Documentation/admin-guide/kernel-parameters.txt
4 +++ b/Documentation/admin-guide/kernel-parameters.txt
5 @@ -4298,6 +4298,18 @@
6
7 retain_initrd [RAM] Keep initrd memory after extraction
8
9 + retbleed= [X86] Control mitigation of RETBleed (Arbitrary
10 + Speculative Code Execution with Return Instructions)
11 + vulnerability.
12 +
13 + off - unconditionally disable
14 +			auto          - automatically select a mitigation
15 +
16 + Selecting 'auto' will choose a mitigation method at run
17 + time according to the CPU.
18 +
19 + Not specifying this option is equivalent to retbleed=auto.
20 +
21 rfkill.default_state=
22 0 "airplane mode". All wifi, bluetooth, wimax, gps, fm,
23 etc. communication is blocked by default.
24 @@ -4541,6 +4553,7 @@
25 eibrs - enhanced IBRS
26 eibrs,retpoline - enhanced IBRS + Retpolines
27 eibrs,lfence - enhanced IBRS + LFENCE
28 + ibrs - use IBRS to protect kernel
29
30 Not specifying this option is equivalent to
31 spectre_v2=auto.
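Both options documented above are boot-time selections: they only take effect when passed on the kernel command line. A minimal usage sketch, assuming a GRUB-based setup (the variable name and file location vary by distribution and are shown only as an illustration):

    # /etc/default/grub, then regenerate the bootloader configuration
    GRUB_CMDLINE_LINUX="... retbleed=off"
    # or force the IBRS-based Spectre v2 mode added by this same patch:
    GRUB_CMDLINE_LINUX="... spectre_v2=ibrs"

Leaving the options out entirely behaves like retbleed=auto and spectre_v2=auto, as stated in the documentation above.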
32 diff --git a/Documentation/process/code-of-conduct-interpretation.rst b/Documentation/process/code-of-conduct-interpretation.rst
33 index e899f14a4ba24..4f8a06b00f608 100644
34 --- a/Documentation/process/code-of-conduct-interpretation.rst
35 +++ b/Documentation/process/code-of-conduct-interpretation.rst
36 @@ -51,7 +51,7 @@ the Technical Advisory Board (TAB) or other maintainers if you're
37 uncertain how to handle situations that come up. It will not be
38 considered a violation report unless you want it to be. If you are
39 uncertain about approaching the TAB or any other maintainers, please
40 -reach out to our conflict mediator, Mishi Choudhary <mishi@linux.com>.
41 +reach out to our conflict mediator, Joanna Lee <joanna.lee@gesmer.com>.
42
43 In the end, "be kind to each other" is really what the end goal is for
44 everybody. We know everyone is human and we all fail at times, but the
45 diff --git a/Makefile b/Makefile
46 index 3d9d7ef6f8bf1..201ac8e410a94 100644
47 --- a/Makefile
48 +++ b/Makefile
49 @@ -1,7 +1,7 @@
50 # SPDX-License-Identifier: GPL-2.0
51 VERSION = 5
52 PATCHLEVEL = 4
53 -SUBLEVEL = 216
54 +SUBLEVEL = 217
55 EXTRAVERSION =
56 NAME = Kleptomaniac Octopus
57
58 diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
59 index b3f1214787386..29e5675c6d4f2 100644
60 --- a/arch/x86/entry/calling.h
61 +++ b/arch/x86/entry/calling.h
62 @@ -6,6 +6,8 @@
63 #include <asm/percpu.h>
64 #include <asm/asm-offsets.h>
65 #include <asm/processor-flags.h>
66 +#include <asm/msr.h>
67 +#include <asm/nospec-branch.h>
68
69 /*
70
71 @@ -146,27 +148,19 @@ For 32-bit we have the following conventions - kernel is built with
72
73 .endm
74
75 -.macro POP_REGS pop_rdi=1 skip_r11rcx=0
76 +.macro POP_REGS pop_rdi=1
77 popq %r15
78 popq %r14
79 popq %r13
80 popq %r12
81 popq %rbp
82 popq %rbx
83 - .if \skip_r11rcx
84 - popq %rsi
85 - .else
86 popq %r11
87 - .endif
88 popq %r10
89 popq %r9
90 popq %r8
91 popq %rax
92 - .if \skip_r11rcx
93 - popq %rsi
94 - .else
95 popq %rcx
96 - .endif
97 popq %rdx
98 popq %rsi
99 .if \pop_rdi
100 @@ -316,6 +310,62 @@ For 32-bit we have the following conventions - kernel is built with
101
102 #endif
103
104 +/*
105 + * IBRS kernel mitigation for Spectre_v2.
106 + *
107 + * Assumes full context is established (PUSH_REGS, CR3 and GS) and it clobbers
108 + * the regs it uses (AX, CX, DX). Must be called before the first RET
109 + * instruction (NOTE! UNTRAIN_RET includes a RET instruction)
110 + *
111 + * The optional argument is used to save/restore the current value,
112 + * which is used on the paranoid paths.
113 + *
114 + * Assumes x86_spec_ctrl_{base,current} to have SPEC_CTRL_IBRS set.
115 + */
116 +.macro IBRS_ENTER save_reg
117 + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_KERNEL_IBRS
118 + movl $MSR_IA32_SPEC_CTRL, %ecx
119 +
120 +.ifnb \save_reg
121 + rdmsr
122 + shl $32, %rdx
123 + or %rdx, %rax
124 + mov %rax, \save_reg
125 + test $SPEC_CTRL_IBRS, %eax
126 + jz .Ldo_wrmsr_\@
127 + lfence
128 + jmp .Lend_\@
129 +.Ldo_wrmsr_\@:
130 +.endif
131 +
132 + movq PER_CPU_VAR(x86_spec_ctrl_current), %rdx
133 + movl %edx, %eax
134 + shr $32, %rdx
135 + wrmsr
136 +.Lend_\@:
137 +.endm
138 +
139 +/*
140 + * Similar to IBRS_ENTER, requires KERNEL GS,CR3 and clobbers (AX, CX, DX)
141 + * regs. Must be called after the last RET.
142 + */
143 +.macro IBRS_EXIT save_reg
144 + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_KERNEL_IBRS
145 + movl $MSR_IA32_SPEC_CTRL, %ecx
146 +
147 +.ifnb \save_reg
148 + mov \save_reg, %rdx
149 +.else
150 + movq PER_CPU_VAR(x86_spec_ctrl_current), %rdx
151 + andl $(~SPEC_CTRL_IBRS), %edx
152 +.endif
153 +
154 + movl %edx, %eax
155 + shr $32, %rdx
156 + wrmsr
157 +.Lend_\@:
158 +.endm
159 +
160 /*
161 * Mitigate Spectre v1 for conditional swapgs code paths.
162 *
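The IBRS_ENTER/IBRS_EXIT pair above is written in assembly because it must run at well-defined points on the entry and exit paths, but its effect is simple to state in C. A rough illustrative equivalent, assuming X86_FEATURE_KERNEL_IBRS is set (this sketch is not part of the patch; the names come from definitions elsewhere in this series):

    /* what IBRS_ENTER boils down to: turn IBRS on for the kernel */
    static inline void ibrs_enter(void)
    {
            /* x86_spec_ctrl_current already has SPEC_CTRL_IBRS set for the kernel */
            wrmsrl(MSR_IA32_SPEC_CTRL, this_cpu_read(x86_spec_ctrl_current));
    }

    /* what IBRS_EXIT boils down to: drop IBRS again before returning to user space */
    static inline void ibrs_exit(void)
    {
            wrmsrl(MSR_IA32_SPEC_CTRL,
                   this_cpu_read(x86_spec_ctrl_current) & ~SPEC_CTRL_IBRS);
    }

The save_reg variants used on the paranoid and NMI paths additionally stash the previous MSR value so a nested entry can restore exactly what it interrupted.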
163 diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
164 index bde3e0f85425f..2d837fb54c31b 100644
165 --- a/arch/x86/entry/entry_32.S
166 +++ b/arch/x86/entry/entry_32.S
167 @@ -750,7 +750,6 @@ ENTRY(__switch_to_asm)
168 movl %ebx, PER_CPU_VAR(stack_canary)+stack_canary_offset
169 #endif
170
171 -#ifdef CONFIG_RETPOLINE
172 /*
173 * When switching from a shallower to a deeper call stack
174 * the RSB may either underflow or use entries populated
175 @@ -759,7 +758,6 @@ ENTRY(__switch_to_asm)
176 * speculative execution to prevent attack.
177 */
178 FILL_RETURN_BUFFER %ebx, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
179 -#endif
180
181 /* restore callee-saved registers */
182 popfl
183 diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
184 index 2ba3d53ac5b11..c82136030d58f 100644
185 --- a/arch/x86/entry/entry_64.S
186 +++ b/arch/x86/entry/entry_64.S
187 @@ -172,6 +172,10 @@ GLOBAL(entry_SYSCALL_64_after_hwframe)
188 /* IRQs are off. */
189 movq %rax, %rdi
190 movq %rsp, %rsi
191 +
192 + /* clobbers %rax, make sure it is after saving the syscall nr */
193 + IBRS_ENTER
194 +
195 call do_syscall_64 /* returns with IRQs disabled */
196
197 TRACE_IRQS_IRETQ /* we're about to change IF */
198 @@ -248,8 +252,8 @@ GLOBAL(entry_SYSCALL_64_after_hwframe)
199 * perf profiles. Nothing jumps here.
200 */
201 syscall_return_via_sysret:
202 - /* rcx and r11 are already restored (see code above) */
203 - POP_REGS pop_rdi=0 skip_r11rcx=1
204 + IBRS_EXIT
205 + POP_REGS pop_rdi=0
206
207 /*
208 * Now all regs are restored except RSP and RDI.
209 @@ -301,7 +305,6 @@ ENTRY(__switch_to_asm)
210 movq %rbx, PER_CPU_VAR(fixed_percpu_data) + stack_canary_offset
211 #endif
212
213 -#ifdef CONFIG_RETPOLINE
214 /*
215 * When switching from a shallower to a deeper call stack
216 * the RSB may either underflow or use entries populated
217 @@ -310,7 +313,6 @@ ENTRY(__switch_to_asm)
218 * speculative execution to prevent attack.
219 */
220 FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
221 -#endif
222
223 /* restore callee-saved registers */
224 popq %r15
225 @@ -622,6 +624,7 @@ GLOBAL(retint_user)
226 TRACE_IRQS_IRETQ
227
228 GLOBAL(swapgs_restore_regs_and_return_to_usermode)
229 + IBRS_EXIT
230 #ifdef CONFIG_DEBUG_ENTRY
231 /* Assert that pt_regs indicates user mode. */
232 testb $3, CS(%rsp)
233 @@ -1248,7 +1251,13 @@ ENTRY(paranoid_entry)
234 */
235 FENCE_SWAPGS_KERNEL_ENTRY
236
237 - ret
238 + /*
239 + * Once we have CR3 and %GS setup save and set SPEC_CTRL. Just like
240 + * CR3 above, keep the old value in a callee saved register.
241 + */
242 + IBRS_ENTER save_reg=%r15
243 +
244 + RET
245 END(paranoid_entry)
246
247 /*
248 @@ -1276,12 +1285,20 @@ ENTRY(paranoid_exit)
249 jmp .Lparanoid_exit_restore
250 .Lparanoid_exit_no_swapgs:
251 TRACE_IRQS_IRETQ_DEBUG
252 +
253 + /*
254 + * Must restore IBRS state before both CR3 and %GS since we need access
255 + * to the per-CPU x86_spec_ctrl_shadow variable.
256 + */
257 + IBRS_EXIT save_reg=%r15
258 +
259 /* Always restore stashed CR3 value (see paranoid_entry) */
260 RESTORE_CR3 scratch_reg=%rbx save_reg=%r14
261 .Lparanoid_exit_restore:
262 jmp restore_regs_and_return_to_kernel
263 END(paranoid_exit)
264
265 +
266 /*
267 * Save all registers in pt_regs, and switch GS if needed.
268 */
269 @@ -1301,6 +1318,7 @@ ENTRY(error_entry)
270 FENCE_SWAPGS_USER_ENTRY
271 /* We have user CR3. Change to kernel CR3. */
272 SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
273 + IBRS_ENTER
274
275 .Lerror_entry_from_usermode_after_swapgs:
276 /* Put us onto the real thread stack. */
277 @@ -1356,6 +1374,7 @@ ENTRY(error_entry)
278 SWAPGS
279 FENCE_SWAPGS_USER_ENTRY
280 SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
281 + IBRS_ENTER
282
283 /*
284 * Pretend that the exception came from user mode: set up pt_regs
285 @@ -1461,6 +1480,8 @@ ENTRY(nmi)
286 PUSH_AND_CLEAR_REGS rdx=(%rdx)
287 ENCODE_FRAME_POINTER
288
289 + IBRS_ENTER
290 +
291 /*
292 * At this point we no longer need to worry about stack damage
293 * due to nesting -- we're on the normal thread stack and we're
294 @@ -1684,6 +1705,9 @@ end_repeat_nmi:
295 movq $-1, %rsi
296 call do_nmi
297
298 + /* Always restore stashed SPEC_CTRL value (see paranoid_entry) */
299 + IBRS_EXIT save_reg=%r15
300 +
301 /* Always restore stashed CR3 value (see paranoid_entry) */
302 RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
303
304 diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
305 index 39913770a44d5..c3c4ea4a6711a 100644
306 --- a/arch/x86/entry/entry_64_compat.S
307 +++ b/arch/x86/entry/entry_64_compat.S
308 @@ -4,7 +4,6 @@
309 *
310 * Copyright 2000-2002 Andi Kleen, SuSE Labs.
311 */
312 -#include "calling.h"
313 #include <asm/asm-offsets.h>
314 #include <asm/current.h>
315 #include <asm/errno.h>
316 @@ -17,6 +16,8 @@
317 #include <linux/linkage.h>
318 #include <linux/err.h>
319
320 +#include "calling.h"
321 +
322 .section .entry.text, "ax"
323
324 /*
325 @@ -106,6 +107,8 @@ ENTRY(entry_SYSENTER_compat)
326 xorl %r15d, %r15d /* nospec r15 */
327 cld
328
329 + IBRS_ENTER
330 +
331 /*
332 * SYSENTER doesn't filter flags, so we need to clear NT and AC
333 * ourselves. To save a few cycles, we can check whether
334 @@ -253,6 +256,8 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe)
335 */
336 TRACE_IRQS_OFF
337
338 + IBRS_ENTER
339 +
340 movq %rsp, %rdi
341 call do_fast_syscall_32
342 /* XEN PV guests always use IRET path */
343 @@ -267,6 +272,9 @@ sysret32_from_system_call:
344 */
345 STACKLEAK_ERASE
346 TRACE_IRQS_ON /* User mode traces as IRQs on. */
347 +
348 + IBRS_EXIT
349 +
350 movq RBX(%rsp), %rbx /* pt_regs->rbx */
351 movq RBP(%rsp), %rbp /* pt_regs->rbp */
352 movq EFLAGS(%rsp), %r11 /* pt_regs->flags (in r11) */
353 @@ -408,6 +416,7 @@ ENTRY(entry_INT80_compat)
354 * gate turned them off.
355 */
356 TRACE_IRQS_OFF
357 + IBRS_ENTER
358
359 movq %rsp, %rdi
360 call do_int80_syscall_32
361 diff --git a/arch/x86/include/asm/cpu_device_id.h b/arch/x86/include/asm/cpu_device_id.h
362 index 0c814cd9ea42c..cdf39decf7340 100644
363 --- a/arch/x86/include/asm/cpu_device_id.h
364 +++ b/arch/x86/include/asm/cpu_device_id.h
365 @@ -5,15 +5,22 @@
366 /*
367 * Declare drivers belonging to specific x86 CPUs
368 * Similar in spirit to pci_device_id and related PCI functions
369 + *
370 + * The wildcard initializers are in mod_devicetable.h because
371 + * file2alias needs them. Sigh.
372 */
373 -
374 #include <linux/mod_devicetable.h>
375 +/* Get the INTEL_FAM* model defines */
376 +#include <asm/intel-family.h>
377 +/* And the X86_VENDOR_* ones */
378 +#include <asm/processor.h>
379
380 +/* Centaur FAM6 models */
381 +#define X86_CENTAUR_FAM6_C7_A 0xa
382 #define X86_CENTAUR_FAM6_C7_D 0xd
383 #define X86_CENTAUR_FAM6_NANO 0xf
384
385 #define X86_STEPPINGS(mins, maxs) GENMASK(maxs, mins)
386 -
387 /**
388 * X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE - Base macro for CPU matching
389 * @_vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY
390 @@ -26,8 +33,11 @@
391 * format is unsigned long. The supplied value, pointer
392 * etc. is casted to unsigned long internally.
393 *
394 - * Backport version to keep the SRBDS pile consistant. No shorter variants
395 - * required for this.
396 + * Use only if you need all selectors. Otherwise use one of the shorter
397 + * macros of the X86_MATCH_* family. If there is no matching shorthand
398 + * macro, consider to add one. If you really need to wrap one of the macros
399 + * into another macro at the usage site for good reasons, then please
400 + * start this local macro with X86_MATCH to allow easy grepping.
401 */
402 #define X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(_vendor, _family, _model, \
403 _steppings, _feature, _data) { \
404 @@ -39,6 +49,120 @@
405 .driver_data = (unsigned long) _data \
406 }
407
408 +/**
409 + * X86_MATCH_VENDOR_FAM_MODEL_FEATURE - Macro for CPU matching
410 + * @_vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY
411 + * The name is expanded to X86_VENDOR_@_vendor
412 + * @_family: The family number or X86_FAMILY_ANY
413 + * @_model: The model number, model constant or X86_MODEL_ANY
414 + * @_feature: A X86_FEATURE bit or X86_FEATURE_ANY
415 + * @_data: Driver specific data or NULL. The internal storage
416 + * format is unsigned long. The supplied value, pointer
417 + * etc. is casted to unsigned long internally.
418 + *
419 + * The steppings arguments of X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE() is
420 + * set to wildcards.
421 + */
422 +#define X86_MATCH_VENDOR_FAM_MODEL_FEATURE(vendor, family, model, feature, data) \
423 + X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(vendor, family, model, \
424 + X86_STEPPING_ANY, feature, data)
425 +
426 +/**
427 + * X86_MATCH_VENDOR_FAM_FEATURE - Macro for matching vendor, family and CPU feature
428 + * @vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY
429 + * The name is expanded to X86_VENDOR_@vendor
430 + * @family: The family number or X86_FAMILY_ANY
431 + * @feature: A X86_FEATURE bit
432 + * @data: Driver specific data or NULL. The internal storage
433 + * format is unsigned long. The supplied value, pointer
434 + * etc. is casted to unsigned long internally.
435 + *
436 + * All other missing arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are
437 + * set to wildcards.
438 + */
439 +#define X86_MATCH_VENDOR_FAM_FEATURE(vendor, family, feature, data) \
440 + X86_MATCH_VENDOR_FAM_MODEL_FEATURE(vendor, family, \
441 + X86_MODEL_ANY, feature, data)
442 +
443 +/**
444 + * X86_MATCH_VENDOR_FEATURE - Macro for matching vendor and CPU feature
445 + * @vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY
446 + * The name is expanded to X86_VENDOR_@vendor
447 + * @feature: A X86_FEATURE bit
448 + * @data: Driver specific data or NULL. The internal storage
449 + * format is unsigned long. The supplied value, pointer
450 + * etc. is casted to unsigned long internally.
451 + *
452 + * All other missing arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are
453 + * set to wildcards.
454 + */
455 +#define X86_MATCH_VENDOR_FEATURE(vendor, feature, data) \
456 + X86_MATCH_VENDOR_FAM_FEATURE(vendor, X86_FAMILY_ANY, feature, data)
457 +
458 +/**
459 + * X86_MATCH_FEATURE - Macro for matching a CPU feature
460 + * @feature: A X86_FEATURE bit
461 + * @data: Driver specific data or NULL. The internal storage
462 + * format is unsigned long. The supplied value, pointer
463 + * etc. is casted to unsigned long internally.
464 + *
465 + * All other missing arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are
466 + * set to wildcards.
467 + */
468 +#define X86_MATCH_FEATURE(feature, data) \
469 + X86_MATCH_VENDOR_FEATURE(ANY, feature, data)
470 +
471 +/* Transitional to keep the existing code working */
472 +#define X86_FEATURE_MATCH(feature) X86_MATCH_FEATURE(feature, NULL)
473 +
474 +/**
475 + * X86_MATCH_VENDOR_FAM_MODEL - Match vendor, family and model
476 + * @vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY
477 + * The name is expanded to X86_VENDOR_@vendor
478 + * @family: The family number or X86_FAMILY_ANY
479 + * @model: The model number, model constant or X86_MODEL_ANY
480 + * @data: Driver specific data or NULL. The internal storage
481 + * format is unsigned long. The supplied value, pointer
482 + * etc. is casted to unsigned long internally.
483 + *
484 + * All other missing arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are
485 + * set to wildcards.
486 + */
487 +#define X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, data) \
488 + X86_MATCH_VENDOR_FAM_MODEL_FEATURE(vendor, family, model, \
489 + X86_FEATURE_ANY, data)
490 +
491 +/**
492 + * X86_MATCH_VENDOR_FAM - Match vendor and family
493 + * @vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY
494 + * The name is expanded to X86_VENDOR_@vendor
495 + * @family: The family number or X86_FAMILY_ANY
496 + * @data: Driver specific data or NULL. The internal storage
497 + * format is unsigned long. The supplied value, pointer
498 + * etc. is casted to unsigned long internally.
499 + *
500 + * All other missing arguments to X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are
501 + *		  set to wildcards.
502 + */
503 +#define X86_MATCH_VENDOR_FAM(vendor, family, data) \
504 + X86_MATCH_VENDOR_FAM_MODEL(vendor, family, X86_MODEL_ANY, data)
505 +
506 +/**
507 + * X86_MATCH_INTEL_FAM6_MODEL - Match vendor INTEL, family 6 and model
508 + * @model: The model name without the INTEL_FAM6_ prefix or ANY
509 + * The model name is expanded to INTEL_FAM6_@model internally
510 + * @data: Driver specific data or NULL. The internal storage
511 + * format is unsigned long. The supplied value, pointer
512 + * etc. is casted to unsigned long internally.
513 + *
514 + * The vendor is set to INTEL, the family to 6 and all other missing
515 + * arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are set to wildcards.
516 + *
517 + * See X86_MATCH_VENDOR_FAM_MODEL_FEATURE() for further information.
518 + */
519 +#define X86_MATCH_INTEL_FAM6_MODEL(model, data) \
520 + X86_MATCH_VENDOR_FAM_MODEL(INTEL, 6, INTEL_FAM6_##model, data)
521 +
522 /*
523 * Match specific microcode revisions.
524 *
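To see how the new shorthands are meant to be consumed, here is a hypothetical driver match table (an illustrative sketch; the table and its entries are made up, while the macros, x86_match_cpu() and MODULE_DEVICE_TABLE(x86cpu, ...) are the interfaces referenced by this patch):

    static const struct x86_cpu_id example_cpu_ids[] = {
            X86_MATCH_INTEL_FAM6_MODEL(BROADWELL, NULL),
            X86_MATCH_VENDOR_FAM(AMD, 0x17, NULL),
            X86_MATCH_FEATURE(X86_FEATURE_APERFMPERF, NULL),
            {}
    };
    MODULE_DEVICE_TABLE(x86cpu, example_cpu_ids);

    /* typically in module init or probe: */
    if (!x86_match_cpu(example_cpu_ids))
            return -ENODEV;

Each shorthand expands to X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE() with the unspecified selectors set to wildcards, which is why the base macro's documentation recommends using the shortest variant that fits.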
525 diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
526 index 736b0e412344b..2ec85d7bfdff2 100644
527 --- a/arch/x86/include/asm/cpufeatures.h
528 +++ b/arch/x86/include/asm/cpufeatures.h
529 @@ -203,8 +203,8 @@
530 #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
531 #define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */
532 #define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */
533 -#define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */
534 -#define X86_FEATURE_RETPOLINE_LFENCE ( 7*32+13) /* "" Use LFENCE for Spectre variant 2 */
535 +#define X86_FEATURE_KERNEL_IBRS ( 7*32+12) /* "" Set/clear IBRS on kernel entry/exit */
536 +#define X86_FEATURE_RSB_VMEXIT ( 7*32+13) /* "" Fill RSB on VM-Exit */
537 #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */
538 #define X86_FEATURE_CDP_L2 ( 7*32+15) /* Code and Data Prioritization L2 */
539 #define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* "" MSR SPEC_CTRL is implemented */
540 @@ -286,7 +286,10 @@
541 #define X86_FEATURE_CQM_MBM_LOCAL (11*32+ 3) /* LLC Local MBM monitoring */
542 #define X86_FEATURE_FENCE_SWAPGS_USER (11*32+ 4) /* "" LFENCE in user entry SWAPGS path */
543 #define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */
544 -#define X86_FEATURE_RSB_VMEXIT_LITE (11*32+ 6) /* "" Fill RSB on VM exit when EIBRS is enabled */
545 +#define X86_FEATURE_RRSBA_CTRL (11*32+11) /* "" RET prediction control */
546 +#define X86_FEATURE_RETPOLINE (11*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */
547 +#define X86_FEATURE_RETPOLINE_LFENCE (11*32+13) /* "" Use LFENCE for Spectre variant 2 */
548 +#define X86_FEATURE_RSB_VMEXIT_LITE (11*32+17) /* "" Fill RSB on VM exit when EIBRS is enabled */
549
550 /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
551 #define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */
552 @@ -303,6 +306,7 @@
553 #define X86_FEATURE_AMD_SSBD (13*32+24) /* "" Speculative Store Bypass Disable */
554 #define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */
555 #define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */
556 +#define X86_FEATURE_BTC_NO (13*32+29) /* "" Not vulnerable to Branch Type Confusion */
557
558 /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */
559 #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */
560 @@ -407,7 +411,8 @@
561 #define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */
562 #define X86_BUG_SRBDS X86_BUG(24) /* CPU may leak RNG bits if not mitigated */
563 #define X86_BUG_MMIO_STALE_DATA X86_BUG(25) /* CPU is affected by Processor MMIO Stale Data vulnerabilities */
564 -#define X86_BUG_MMIO_UNKNOWN X86_BUG(26) /* CPU is too old and its MMIO Stale Data status is unknown */
565 +#define X86_BUG_RETBLEED X86_BUG(26) /* CPU is affected by RETBleed */
566 #define X86_BUG_EIBRS_PBRSB X86_BUG(27) /* EIBRS is vulnerable to Post Barrier RSB Predictions */
567 +#define X86_BUG_MMIO_UNKNOWN X86_BUG(28) /* CPU is too old and its MMIO Stale Data status is unknown */
568
569 #endif /* _ASM_X86_CPUFEATURES_H */
570 diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
571 index 5b07573c3bc87..c1d6d8bbb7dad 100644
572 --- a/arch/x86/include/asm/intel-family.h
573 +++ b/arch/x86/include/asm/intel-family.h
574 @@ -35,6 +35,9 @@
575 * The #define line may optionally include a comment including platform names.
576 */
577
578 +/* Wildcard match for FAM6 so X86_MATCH_INTEL_FAM6_MODEL(ANY) works */
579 +#define INTEL_FAM6_ANY X86_MODEL_ANY
580 +
581 #define INTEL_FAM6_CORE_YONAH 0x0E
582
583 #define INTEL_FAM6_CORE2_MEROM 0x0F
584 @@ -126,6 +129,9 @@
585 #define INTEL_FAM6_XEON_PHI_KNL 0x57 /* Knights Landing */
586 #define INTEL_FAM6_XEON_PHI_KNM 0x85 /* Knights Mill */
587
588 +/* Family 5 */
589 +#define INTEL_FAM5_QUARK_X1000 0x09 /* Quark X1000 SoC */
590 +
591 /* Useful macros */
592 #define INTEL_CPU_FAM_ANY(_family, _model, _driver_data) \
593 { \
594 diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
595 index cef4eba03ff36..713886d5493a8 100644
596 --- a/arch/x86/include/asm/msr-index.h
597 +++ b/arch/x86/include/asm/msr-index.h
598 @@ -47,6 +47,8 @@
599 #define SPEC_CTRL_STIBP BIT(SPEC_CTRL_STIBP_SHIFT) /* STIBP mask */
600 #define SPEC_CTRL_SSBD_SHIFT 2 /* Speculative Store Bypass Disable bit */
601 #define SPEC_CTRL_SSBD BIT(SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */
602 +#define SPEC_CTRL_RRSBA_DIS_S_SHIFT 6 /* Disable RRSBA behavior */
603 +#define SPEC_CTRL_RRSBA_DIS_S BIT(SPEC_CTRL_RRSBA_DIS_S_SHIFT)
604
605 #define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */
606 #define PRED_CMD_IBPB BIT(0) /* Indirect Branch Prediction Barrier */
607 @@ -82,6 +84,7 @@
608 #define MSR_IA32_ARCH_CAPABILITIES 0x0000010a
609 #define ARCH_CAP_RDCL_NO BIT(0) /* Not susceptible to Meltdown */
610 #define ARCH_CAP_IBRS_ALL BIT(1) /* Enhanced IBRS support */
611 +#define ARCH_CAP_RSBA BIT(2) /* RET may use alternative branch predictors */
612 #define ARCH_CAP_SKIP_VMENTRY_L1DFLUSH BIT(3) /* Skip L1D flush on vmentry */
613 #define ARCH_CAP_SSB_NO BIT(4) /*
614 * Not susceptible to Speculative Store Bypass
615 @@ -129,6 +132,13 @@
616 * bit available to control VERW
617 * behavior.
618 */
619 +#define ARCH_CAP_RRSBA BIT(19) /*
620 + * Indicates RET may use predictors
621 + * other than the RSB. With eIBRS
622 + * enabled predictions in kernel mode
623 + * are restricted to targets in
624 + * kernel.
625 + */
626 #define ARCH_CAP_PBRSB_NO BIT(24) /*
627 * Not susceptible to Post-Barrier
628 * Return Stack Buffer Predictions.
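Taken together, the new RRSBA definitions are consumed the way spec_ctrl_disable_kernel_rrsba() does later in this patch: when the CPU advertises RRSBA behaviour and also exposes the RRSBA_CTRL knob, the kernel sets the CPL0 disable bit. A condensed, illustrative restatement:

    u64 ia32_cap = x86_read_arch_cap_msr();     /* cached IA32_ARCH_CAPABILITIES read */

    if (boot_cpu_has(X86_FEATURE_RRSBA_CTRL) && (ia32_cap & ARCH_CAP_RRSBA)) {
            /* keep kernel-mode RETs on the RSB rather than the alternate predictors */
            x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S;
            write_spec_ctrl_current(x86_spec_ctrl_base, true);
    }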
629 diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
630 index a1ee1a760c3eb..8c898eed28941 100644
631 --- a/arch/x86/include/asm/nospec-branch.h
632 +++ b/arch/x86/include/asm/nospec-branch.h
633 @@ -4,11 +4,14 @@
634 #define _ASM_X86_NOSPEC_BRANCH_H_
635
636 #include <linux/static_key.h>
637 +#include <linux/frame.h>
638
639 #include <asm/alternative.h>
640 #include <asm/alternative-asm.h>
641 #include <asm/cpufeatures.h>
642 #include <asm/msr-index.h>
643 +#include <asm/unwind_hints.h>
644 +#include <asm/percpu.h>
645
646 /*
647 * This should be used immediately before a retpoline alternative. It tells
648 @@ -60,9 +63,9 @@
649 lfence; \
650 jmp 775b; \
651 774: \
652 + add $(BITS_PER_LONG/8) * 2, sp; \
653 dec reg; \
654 jnz 771b; \
655 - add $(BITS_PER_LONG/8) * nr, sp; \
656 /* barrier for jnz misprediction */ \
657 lfence;
658 #else
659 @@ -79,13 +82,6 @@
660 add $(BITS_PER_LONG/8) * nr, sp;
661 #endif
662
663 -#define __ISSUE_UNBALANCED_RET_GUARD(sp) \
664 - call 881f; \
665 - int3; \
666 -881: \
667 - add $(BITS_PER_LONG/8), sp; \
668 - lfence;
669 -
670 #ifdef __ASSEMBLY__
671
672 /*
673 @@ -155,26 +151,28 @@
674 #endif
675 .endm
676
677 -.macro ISSUE_UNBALANCED_RET_GUARD ftr:req
678 - ANNOTATE_NOSPEC_ALTERNATIVE
679 - ALTERNATIVE "jmp .Lskip_pbrsb_\@", \
680 - __stringify(__ISSUE_UNBALANCED_RET_GUARD(%_ASM_SP)) \
681 - \ftr
682 -.Lskip_pbrsb_\@:
683 +.macro ISSUE_UNBALANCED_RET_GUARD
684 + call .Lunbalanced_ret_guard_\@
685 + int3
686 +.Lunbalanced_ret_guard_\@:
687 + add $(BITS_PER_LONG/8), %_ASM_SP
688 + lfence
689 .endm
690
691 /*
692 * A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP
693 * monstrosity above, manually.
694 */
695 -.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req
696 -#ifdef CONFIG_RETPOLINE
697 - ANNOTATE_NOSPEC_ALTERNATIVE
698 - ALTERNATIVE "jmp .Lskip_rsb_\@", \
699 - __stringify(__FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP)) \
700 - \ftr
701 +.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req ftr2
702 +.ifb \ftr2
703 + ALTERNATIVE "jmp .Lskip_rsb_\@", "", \ftr
704 +.else
705 + ALTERNATIVE_2 "jmp .Lskip_rsb_\@", "", \ftr, "jmp .Lunbalanced_\@", \ftr2
706 +.endif
707 + __FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP)
708 +.Lunbalanced_\@:
709 + ISSUE_UNBALANCED_RET_GUARD
710 .Lskip_rsb_\@:
711 -#endif
712 .endm
713
714 #else /* __ASSEMBLY__ */
715 @@ -249,6 +247,7 @@ enum spectre_v2_mitigation {
716 SPECTRE_V2_EIBRS,
717 SPECTRE_V2_EIBRS_RETPOLINE,
718 SPECTRE_V2_EIBRS_LFENCE,
719 + SPECTRE_V2_IBRS,
720 };
721
722 /* The indirect branch speculation control variants */
723 @@ -312,6 +311,9 @@ static inline void indirect_branch_prediction_barrier(void)
724
725 /* The Intel SPEC CTRL MSR base value cache */
726 extern u64 x86_spec_ctrl_base;
727 +DECLARE_PER_CPU(u64, x86_spec_ctrl_current);
728 +extern void write_spec_ctrl_current(u64 val, bool force);
729 +extern u64 spec_ctrl_current(void);
730
731 /*
732 * With retpoline, we must use IBRS to restrict branch prediction
733 @@ -321,18 +323,16 @@ extern u64 x86_spec_ctrl_base;
734 */
735 #define firmware_restrict_branch_speculation_start() \
736 do { \
737 - u64 val = x86_spec_ctrl_base | SPEC_CTRL_IBRS; \
738 - \
739 preempt_disable(); \
740 - alternative_msr_write(MSR_IA32_SPEC_CTRL, val, \
741 + alternative_msr_write(MSR_IA32_SPEC_CTRL, \
742 + spec_ctrl_current() | SPEC_CTRL_IBRS, \
743 X86_FEATURE_USE_IBRS_FW); \
744 } while (0)
745
746 #define firmware_restrict_branch_speculation_end() \
747 do { \
748 - u64 val = x86_spec_ctrl_base; \
749 - \
750 - alternative_msr_write(MSR_IA32_SPEC_CTRL, val, \
751 + alternative_msr_write(MSR_IA32_SPEC_CTRL, \
752 + spec_ctrl_current(), \
753 X86_FEATURE_USE_IBRS_FW); \
754 preempt_enable(); \
755 } while (0)
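These two macros bracket firmware calls, and with this change they write spec_ctrl_current() rather than the boot-time x86_spec_ctrl_base, so task-specific bits such as STIBP or SSBD carried in the per-CPU value are what gets written around the firmware window. A usage sketch (the firmware call itself is a placeholder):

    firmware_restrict_branch_speculation_start();
    status = some_firmware_call(args);     /* placeholder, e.g. an EFI runtime service */
    firmware_restrict_branch_speculation_end();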
756 diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
757 index 88cef978380bf..5571b28d35b60 100644
758 --- a/arch/x86/kernel/cpu/amd.c
759 +++ b/arch/x86/kernel/cpu/amd.c
760 @@ -894,12 +894,21 @@ static void init_amd_zn(struct cpuinfo_x86 *c)
761 node_reclaim_distance = 32;
762 #endif
763
764 - /*
765 - * Fix erratum 1076: CPB feature bit not being set in CPUID.
766 - * Always set it, except when running under a hypervisor.
767 - */
768 - if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && !cpu_has(c, X86_FEATURE_CPB))
769 - set_cpu_cap(c, X86_FEATURE_CPB);
770 + /* Fix up CPUID bits, but only if not virtualised. */
771 + if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) {
772 +
773 + /* Erratum 1076: CPB feature bit not being set in CPUID. */
774 + if (!cpu_has(c, X86_FEATURE_CPB))
775 + set_cpu_cap(c, X86_FEATURE_CPB);
776 +
777 + /*
778 + * Zen3 (Fam19 model < 0x10) parts are not susceptible to
779 + * Branch Type Confusion, but predate the allocation of the
780 + * BTC_NO bit.
781 + */
782 + if (c->x86 == 0x19 && !cpu_has(c, X86_FEATURE_BTC_NO))
783 + set_cpu_cap(c, X86_FEATURE_BTC_NO);
784 + }
785 }
786
787 static void init_amd(struct cpuinfo_x86 *c)
788 diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
789 index c90d91cb14341..cf5a18e261e36 100644
790 --- a/arch/x86/kernel/cpu/bugs.c
791 +++ b/arch/x86/kernel/cpu/bugs.c
792 @@ -37,6 +37,8 @@
793
794 static void __init spectre_v1_select_mitigation(void);
795 static void __init spectre_v2_select_mitigation(void);
796 +static void __init retbleed_select_mitigation(void);
797 +static void __init spectre_v2_user_select_mitigation(void);
798 static void __init ssb_select_mitigation(void);
799 static void __init l1tf_select_mitigation(void);
800 static void __init mds_select_mitigation(void);
801 @@ -46,16 +48,40 @@ static void __init taa_select_mitigation(void);
802 static void __init mmio_select_mitigation(void);
803 static void __init srbds_select_mitigation(void);
804
805 -/* The base value of the SPEC_CTRL MSR that always has to be preserved. */
806 +/* The base value of the SPEC_CTRL MSR without task-specific bits set */
807 u64 x86_spec_ctrl_base;
808 EXPORT_SYMBOL_GPL(x86_spec_ctrl_base);
809 +
810 +/* The current value of the SPEC_CTRL MSR with task-specific bits set */
811 +DEFINE_PER_CPU(u64, x86_spec_ctrl_current);
812 +EXPORT_SYMBOL_GPL(x86_spec_ctrl_current);
813 +
814 static DEFINE_MUTEX(spec_ctrl_mutex);
815
816 /*
817 - * The vendor and possibly platform specific bits which can be modified in
818 - * x86_spec_ctrl_base.
819 + * Keep track of the SPEC_CTRL MSR value for the current task, which may differ
820 + * from x86_spec_ctrl_base due to STIBP/SSB in __speculation_ctrl_update().
821 */
822 -static u64 __ro_after_init x86_spec_ctrl_mask = SPEC_CTRL_IBRS;
823 +void write_spec_ctrl_current(u64 val, bool force)
824 +{
825 + if (this_cpu_read(x86_spec_ctrl_current) == val)
826 + return;
827 +
828 + this_cpu_write(x86_spec_ctrl_current, val);
829 +
830 + /*
831 + * When KERNEL_IBRS this MSR is written on return-to-user, unless
832 + * forced the update can be delayed until that time.
833 + */
834 + if (force || !cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS))
835 + wrmsrl(MSR_IA32_SPEC_CTRL, val);
836 +}
837 +
838 +u64 spec_ctrl_current(void)
839 +{
840 + return this_cpu_read(x86_spec_ctrl_current);
841 +}
842 +EXPORT_SYMBOL_GPL(spec_ctrl_current);
843
844 /*
845 * AMD specific MSR info for Speculative Store Bypass control.
846 @@ -105,13 +131,21 @@ void __init check_bugs(void)
847 if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL))
848 rdmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
849
850 - /* Allow STIBP in MSR_SPEC_CTRL if supported */
851 - if (boot_cpu_has(X86_FEATURE_STIBP))
852 - x86_spec_ctrl_mask |= SPEC_CTRL_STIBP;
853 -
854 /* Select the proper CPU mitigations before patching alternatives: */
855 spectre_v1_select_mitigation();
856 spectre_v2_select_mitigation();
857 + /*
858 + * retbleed_select_mitigation() relies on the state set by
859 + * spectre_v2_select_mitigation(); specifically it wants to know about
860 + * spectre_v2=ibrs.
861 + */
862 + retbleed_select_mitigation();
863 + /*
864 + * spectre_v2_user_select_mitigation() relies on the state set by
865 + * retbleed_select_mitigation(); specifically the STIBP selection is
866 + * forced for UNRET.
867 + */
868 + spectre_v2_user_select_mitigation();
869 ssb_select_mitigation();
870 l1tf_select_mitigation();
871 md_clear_select_mitigation();
872 @@ -151,31 +185,17 @@ void __init check_bugs(void)
873 #endif
874 }
875
876 +/*
877 + * NOTE: For VMX, this function is not called in the vmexit path.
878 + * It uses vmx_spec_ctrl_restore_host() instead.
879 + */
880 void
881 x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest)
882 {
883 - u64 msrval, guestval, hostval = x86_spec_ctrl_base;
884 + u64 msrval, guestval = guest_spec_ctrl, hostval = spec_ctrl_current();
885 struct thread_info *ti = current_thread_info();
886
887 - /* Is MSR_SPEC_CTRL implemented ? */
888 if (static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) {
889 - /*
890 - * Restrict guest_spec_ctrl to supported values. Clear the
891 - * modifiable bits in the host base value and or the
892 - * modifiable bits from the guest value.
893 - */
894 - guestval = hostval & ~x86_spec_ctrl_mask;
895 - guestval |= guest_spec_ctrl & x86_spec_ctrl_mask;
896 -
897 - /* SSBD controlled in MSR_SPEC_CTRL */
898 - if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
899 - static_cpu_has(X86_FEATURE_AMD_SSBD))
900 - hostval |= ssbd_tif_to_spec_ctrl(ti->flags);
901 -
902 - /* Conditional STIBP enabled? */
903 - if (static_branch_unlikely(&switch_to_cond_stibp))
904 - hostval |= stibp_tif_to_spec_ctrl(ti->flags);
905 -
906 if (hostval != guestval) {
907 msrval = setguest ? guestval : hostval;
908 wrmsrl(MSR_IA32_SPEC_CTRL, msrval);
909 @@ -705,12 +725,103 @@ static int __init nospectre_v1_cmdline(char *str)
910 }
911 early_param("nospectre_v1", nospectre_v1_cmdline);
912
913 -#undef pr_fmt
914 -#define pr_fmt(fmt) "Spectre V2 : " fmt
915 -
916 static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init =
917 SPECTRE_V2_NONE;
918
919 +#undef pr_fmt
920 +#define pr_fmt(fmt) "RETBleed: " fmt
921 +
922 +enum retbleed_mitigation {
923 + RETBLEED_MITIGATION_NONE,
924 + RETBLEED_MITIGATION_IBRS,
925 + RETBLEED_MITIGATION_EIBRS,
926 +};
927 +
928 +enum retbleed_mitigation_cmd {
929 + RETBLEED_CMD_OFF,
930 + RETBLEED_CMD_AUTO,
931 +};
932 +
933 +const char * const retbleed_strings[] = {
934 + [RETBLEED_MITIGATION_NONE] = "Vulnerable",
935 + [RETBLEED_MITIGATION_IBRS] = "Mitigation: IBRS",
936 + [RETBLEED_MITIGATION_EIBRS] = "Mitigation: Enhanced IBRS",
937 +};
938 +
939 +static enum retbleed_mitigation retbleed_mitigation __ro_after_init =
940 + RETBLEED_MITIGATION_NONE;
941 +static enum retbleed_mitigation_cmd retbleed_cmd __ro_after_init =
942 + RETBLEED_CMD_AUTO;
943 +
944 +static int __init retbleed_parse_cmdline(char *str)
945 +{
946 + if (!str)
947 + return -EINVAL;
948 +
949 + if (!strcmp(str, "off"))
950 + retbleed_cmd = RETBLEED_CMD_OFF;
951 + else if (!strcmp(str, "auto"))
952 + retbleed_cmd = RETBLEED_CMD_AUTO;
953 + else
954 + pr_err("Unknown retbleed option (%s). Defaulting to 'auto'\n", str);
955 +
956 + return 0;
957 +}
958 +early_param("retbleed", retbleed_parse_cmdline);
959 +
960 +#define RETBLEED_UNTRAIN_MSG "WARNING: BTB untrained return thunk mitigation is only effective on AMD/Hygon!\n"
961 +#define RETBLEED_COMPILER_MSG "WARNING: kernel not compiled with RETPOLINE or -mfunction-return capable compiler!\n"
962 +#define RETBLEED_INTEL_MSG "WARNING: Spectre v2 mitigation leaves CPU vulnerable to RETBleed attacks, data leaks possible!\n"
963 +
964 +static void __init retbleed_select_mitigation(void)
965 +{
966 + if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off())
967 + return;
968 +
969 + switch (retbleed_cmd) {
970 + case RETBLEED_CMD_OFF:
971 + return;
972 +
973 + case RETBLEED_CMD_AUTO:
974 + default:
975 + /*
976 + * The Intel mitigation (IBRS) was already selected in
977 + * spectre_v2_select_mitigation().
978 + */
979 +
980 + break;
981 + }
982 +
983 + switch (retbleed_mitigation) {
984 + default:
985 + break;
986 + }
987 +
988 + /*
989 + * Let IBRS trump all on Intel without affecting the effects of the
990 + * retbleed= cmdline option.
991 + */
992 + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
993 + switch (spectre_v2_enabled) {
994 + case SPECTRE_V2_IBRS:
995 + retbleed_mitigation = RETBLEED_MITIGATION_IBRS;
996 + break;
997 + case SPECTRE_V2_EIBRS:
998 + case SPECTRE_V2_EIBRS_RETPOLINE:
999 + case SPECTRE_V2_EIBRS_LFENCE:
1000 + retbleed_mitigation = RETBLEED_MITIGATION_EIBRS;
1001 + break;
1002 + default:
1003 + pr_err(RETBLEED_INTEL_MSG);
1004 + }
1005 + }
1006 +
1007 + pr_info("%s\n", retbleed_strings[retbleed_mitigation]);
1008 +}
1009 +
1010 +#undef pr_fmt
1011 +#define pr_fmt(fmt) "Spectre V2 : " fmt
1012 +
1013 static enum spectre_v2_user_mitigation spectre_v2_user_stibp __ro_after_init =
1014 SPECTRE_V2_USER_NONE;
1015 static enum spectre_v2_user_mitigation spectre_v2_user_ibpb __ro_after_init =
1016 @@ -740,6 +851,7 @@ static inline const char *spectre_v2_module_string(void) { return ""; }
1017 #define SPECTRE_V2_LFENCE_MSG "WARNING: LFENCE mitigation is not recommended for this CPU, data leaks possible!\n"
1018 #define SPECTRE_V2_EIBRS_EBPF_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS on, data leaks possible via Spectre v2 BHB attacks!\n"
1019 #define SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS+LFENCE mitigation and SMT, data leaks possible via Spectre v2 BHB attacks!\n"
1020 +#define SPECTRE_V2_IBRS_PERF_MSG "WARNING: IBRS mitigation selected on Enhanced IBRS CPU, this may cause unnecessary performance loss\n"
1021
1022 #ifdef CONFIG_BPF_SYSCALL
1023 void unpriv_ebpf_notify(int new_state)
1024 @@ -781,6 +893,7 @@ enum spectre_v2_mitigation_cmd {
1025 SPECTRE_V2_CMD_EIBRS,
1026 SPECTRE_V2_CMD_EIBRS_RETPOLINE,
1027 SPECTRE_V2_CMD_EIBRS_LFENCE,
1028 + SPECTRE_V2_CMD_IBRS,
1029 };
1030
1031 enum spectre_v2_user_cmd {
1032 @@ -821,13 +934,15 @@ static void __init spec_v2_user_print_cond(const char *reason, bool secure)
1033 pr_info("spectre_v2_user=%s forced on command line.\n", reason);
1034 }
1035
1036 +static __ro_after_init enum spectre_v2_mitigation_cmd spectre_v2_cmd;
1037 +
1038 static enum spectre_v2_user_cmd __init
1039 -spectre_v2_parse_user_cmdline(enum spectre_v2_mitigation_cmd v2_cmd)
1040 +spectre_v2_parse_user_cmdline(void)
1041 {
1042 char arg[20];
1043 int ret, i;
1044
1045 - switch (v2_cmd) {
1046 + switch (spectre_v2_cmd) {
1047 case SPECTRE_V2_CMD_NONE:
1048 return SPECTRE_V2_USER_CMD_NONE;
1049 case SPECTRE_V2_CMD_FORCE:
1050 @@ -853,15 +968,16 @@ spectre_v2_parse_user_cmdline(enum spectre_v2_mitigation_cmd v2_cmd)
1051 return SPECTRE_V2_USER_CMD_AUTO;
1052 }
1053
1054 -static inline bool spectre_v2_in_eibrs_mode(enum spectre_v2_mitigation mode)
1055 +static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode)
1056 {
1057 - return (mode == SPECTRE_V2_EIBRS ||
1058 - mode == SPECTRE_V2_EIBRS_RETPOLINE ||
1059 - mode == SPECTRE_V2_EIBRS_LFENCE);
1060 + return mode == SPECTRE_V2_IBRS ||
1061 + mode == SPECTRE_V2_EIBRS ||
1062 + mode == SPECTRE_V2_EIBRS_RETPOLINE ||
1063 + mode == SPECTRE_V2_EIBRS_LFENCE;
1064 }
1065
1066 static void __init
1067 -spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
1068 +spectre_v2_user_select_mitigation(void)
1069 {
1070 enum spectre_v2_user_mitigation mode = SPECTRE_V2_USER_NONE;
1071 bool smt_possible = IS_ENABLED(CONFIG_SMP);
1072 @@ -874,7 +990,7 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
1073 cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
1074 smt_possible = false;
1075
1076 - cmd = spectre_v2_parse_user_cmdline(v2_cmd);
1077 + cmd = spectre_v2_parse_user_cmdline();
1078 switch (cmd) {
1079 case SPECTRE_V2_USER_CMD_NONE:
1080 goto set_mode;
1081 @@ -922,12 +1038,12 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
1082 }
1083
1084 /*
1085 - * If no STIBP, enhanced IBRS is enabled or SMT impossible, STIBP is not
1086 - * required.
1087 + * If no STIBP, IBRS or enhanced IBRS is enabled, or SMT impossible,
1088 + * STIBP is not required.
1089 */
1090 if (!boot_cpu_has(X86_FEATURE_STIBP) ||
1091 !smt_possible ||
1092 - spectre_v2_in_eibrs_mode(spectre_v2_enabled))
1093 + spectre_v2_in_ibrs_mode(spectre_v2_enabled))
1094 return;
1095
1096 /*
1097 @@ -952,6 +1068,7 @@ static const char * const spectre_v2_strings[] = {
1098 [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced IBRS",
1099 [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced IBRS + LFENCE",
1100 [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced IBRS + Retpolines",
1101 + [SPECTRE_V2_IBRS] = "Mitigation: IBRS",
1102 };
1103
1104 static const struct {
1105 @@ -969,6 +1086,7 @@ static const struct {
1106 { "eibrs,lfence", SPECTRE_V2_CMD_EIBRS_LFENCE, false },
1107 { "eibrs,retpoline", SPECTRE_V2_CMD_EIBRS_RETPOLINE, false },
1108 { "auto", SPECTRE_V2_CMD_AUTO, false },
1109 + { "ibrs", SPECTRE_V2_CMD_IBRS, false },
1110 };
1111
1112 static void __init spec_v2_print_cond(const char *reason, bool secure)
1113 @@ -1031,6 +1149,24 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
1114 return SPECTRE_V2_CMD_AUTO;
1115 }
1116
1117 + if (cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
1118 + pr_err("%s selected but not Intel CPU. Switching to AUTO select\n",
1119 + mitigation_options[i].option);
1120 + return SPECTRE_V2_CMD_AUTO;
1121 + }
1122 +
1123 + if (cmd == SPECTRE_V2_CMD_IBRS && !boot_cpu_has(X86_FEATURE_IBRS)) {
1124 + pr_err("%s selected but CPU doesn't have IBRS. Switching to AUTO select\n",
1125 + mitigation_options[i].option);
1126 + return SPECTRE_V2_CMD_AUTO;
1127 + }
1128 +
1129 + if (cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_has(X86_FEATURE_XENPV)) {
1130 + pr_err("%s selected but running as XenPV guest. Switching to AUTO select\n",
1131 + mitigation_options[i].option);
1132 + return SPECTRE_V2_CMD_AUTO;
1133 + }
1134 +
1135 spec_v2_print_cond(mitigation_options[i].option,
1136 mitigation_options[i].secure);
1137 return cmd;
1138 @@ -1046,6 +1182,22 @@ static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void)
1139 return SPECTRE_V2_RETPOLINE;
1140 }
1141
1142 +/* Disable in-kernel use of non-RSB RET predictors */
1143 +static void __init spec_ctrl_disable_kernel_rrsba(void)
1144 +{
1145 + u64 ia32_cap;
1146 +
1147 + if (!boot_cpu_has(X86_FEATURE_RRSBA_CTRL))
1148 + return;
1149 +
1150 + ia32_cap = x86_read_arch_cap_msr();
1151 +
1152 + if (ia32_cap & ARCH_CAP_RRSBA) {
1153 + x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S;
1154 + write_spec_ctrl_current(x86_spec_ctrl_base, true);
1155 + }
1156 +}
1157 +
1158 static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_mitigation mode)
1159 {
1160 /*
1161 @@ -1070,10 +1222,6 @@ static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_
1162 */
1163 switch (mode) {
1164 case SPECTRE_V2_NONE:
1165 - /* These modes already fill RSB at vmexit */
1166 - case SPECTRE_V2_LFENCE:
1167 - case SPECTRE_V2_RETPOLINE:
1168 - case SPECTRE_V2_EIBRS_RETPOLINE:
1169 return;
1170
1171 case SPECTRE_V2_EIBRS_LFENCE:
1172 @@ -1083,6 +1231,14 @@ static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_
1173 pr_info("Spectre v2 / PBRSB-eIBRS: Retire a single CALL on VMEXIT\n");
1174 }
1175 return;
1176 +
1177 + case SPECTRE_V2_EIBRS_RETPOLINE:
1178 + case SPECTRE_V2_RETPOLINE:
1179 + case SPECTRE_V2_LFENCE:
1180 + case SPECTRE_V2_IBRS:
1181 + setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT);
1182 + pr_info("Spectre v2 / SpectreRSB : Filling RSB on VMEXIT\n");
1183 + return;
1184 }
1185
1186 pr_warn_once("Unknown Spectre v2 mode, disabling RSB mitigation at VM exit");
1187 @@ -1113,6 +1269,14 @@ static void __init spectre_v2_select_mitigation(void)
1188 break;
1189 }
1190
1191 + if (boot_cpu_has_bug(X86_BUG_RETBLEED) &&
1192 + retbleed_cmd != RETBLEED_CMD_OFF &&
1193 + boot_cpu_has(X86_FEATURE_IBRS) &&
1194 + boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
1195 + mode = SPECTRE_V2_IBRS;
1196 + break;
1197 + }
1198 +
1199 mode = spectre_v2_select_retpoline();
1200 break;
1201
1202 @@ -1129,6 +1293,10 @@ static void __init spectre_v2_select_mitigation(void)
1203 mode = spectre_v2_select_retpoline();
1204 break;
1205
1206 + case SPECTRE_V2_CMD_IBRS:
1207 + mode = SPECTRE_V2_IBRS;
1208 + break;
1209 +
1210 case SPECTRE_V2_CMD_EIBRS:
1211 mode = SPECTRE_V2_EIBRS;
1212 break;
1213 @@ -1145,10 +1313,9 @@ static void __init spectre_v2_select_mitigation(void)
1214 if (mode == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled())
1215 pr_err(SPECTRE_V2_EIBRS_EBPF_MSG);
1216
1217 - if (spectre_v2_in_eibrs_mode(mode)) {
1218 - /* Force it so VMEXIT will restore correctly */
1219 + if (spectre_v2_in_ibrs_mode(mode)) {
1220 x86_spec_ctrl_base |= SPEC_CTRL_IBRS;
1221 - wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
1222 + write_spec_ctrl_current(x86_spec_ctrl_base, true);
1223 }
1224
1225 switch (mode) {
1226 @@ -1156,6 +1323,12 @@ static void __init spectre_v2_select_mitigation(void)
1227 case SPECTRE_V2_EIBRS:
1228 break;
1229
1230 + case SPECTRE_V2_IBRS:
1231 + setup_force_cpu_cap(X86_FEATURE_KERNEL_IBRS);
1232 + if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED))
1233 + pr_warn(SPECTRE_V2_IBRS_PERF_MSG);
1234 + break;
1235 +
1236 case SPECTRE_V2_LFENCE:
1237 case SPECTRE_V2_EIBRS_LFENCE:
1238 setup_force_cpu_cap(X86_FEATURE_RETPOLINE_LFENCE);
1239 @@ -1167,16 +1340,56 @@ static void __init spectre_v2_select_mitigation(void)
1240 break;
1241 }
1242
1243 + /*
1244 + * Disable alternate RSB predictions in kernel when indirect CALLs and
1245 + * JMPs gets protection against BHI and Intramode-BTI, but RET
1246 + * prediction from a non-RSB predictor is still a risk.
1247 + */
1248 + if (mode == SPECTRE_V2_EIBRS_LFENCE ||
1249 + mode == SPECTRE_V2_EIBRS_RETPOLINE ||
1250 + mode == SPECTRE_V2_RETPOLINE)
1251 + spec_ctrl_disable_kernel_rrsba();
1252 +
1253 spectre_v2_enabled = mode;
1254 pr_info("%s\n", spectre_v2_strings[mode]);
1255
1256 /*
1257 - * If spectre v2 protection has been enabled, unconditionally fill
1258 - * RSB during a context switch; this protects against two independent
1259 - * issues:
1260 + * If Spectre v2 protection has been enabled, fill the RSB during a
1261 + * context switch. In general there are two types of RSB attacks
1262 + * across context switches, for which the CALLs/RETs may be unbalanced.
1263 + *
1264 + * 1) RSB underflow
1265 + *
1266 + * Some Intel parts have "bottomless RSB". When the RSB is empty,
1267 + * speculated return targets may come from the branch predictor,
1268 + * which could have a user-poisoned BTB or BHB entry.
1269 + *
1270 + * AMD has it even worse: *all* returns are speculated from the BTB,
1271 + * regardless of the state of the RSB.
1272 + *
1273 + * When IBRS or eIBRS is enabled, the "user -> kernel" attack
1274 + * scenario is mitigated by the IBRS branch prediction isolation
1275 + * properties, so the RSB buffer filling wouldn't be necessary to
1276 + * protect against this type of attack.
1277 + *
1278 + * The "user -> user" attack scenario is mitigated by RSB filling.
1279 *
1280 - * - RSB underflow (and switch to BTB) on Skylake+
1281 - * - SpectreRSB variant of spectre v2 on X86_BUG_SPECTRE_V2 CPUs
1282 + * 2) Poisoned RSB entry
1283 + *
1284 + * If the 'next' in-kernel return stack is shorter than 'prev',
1285 + * 'next' could be tricked into speculating with a user-poisoned RSB
1286 + * entry.
1287 + *
1288 + * The "user -> kernel" attack scenario is mitigated by SMEP and
1289 + * eIBRS.
1290 + *
1291 + * The "user -> user" scenario, also known as SpectreBHB, requires
1292 + * RSB clearing.
1293 + *
1294 + * So to mitigate all cases, unconditionally fill RSB on context
1295 + * switches.
1296 + *
1297 + * FIXME: Is this pointless for retbleed-affected AMD?
1298 */
1299 setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
1300 pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n");
1301 @@ -1184,28 +1397,29 @@ static void __init spectre_v2_select_mitigation(void)
1302 spectre_v2_determine_rsb_fill_type_at_vmexit(mode);
1303
1304 /*
1305 - * Retpoline means the kernel is safe because it has no indirect
1306 - * branches. Enhanced IBRS protects firmware too, so, enable restricted
1307 - * speculation around firmware calls only when Enhanced IBRS isn't
1308 - * supported.
1309 + * Retpoline protects the kernel, but doesn't protect firmware. IBRS
1310 + * and Enhanced IBRS protect firmware too, so enable IBRS around
1311 + * firmware calls only when IBRS / Enhanced IBRS aren't otherwise
1312 + * enabled.
1313 *
1314 * Use "mode" to check Enhanced IBRS instead of boot_cpu_has(), because
1315 * the user might select retpoline on the kernel command line and if
1316 * the CPU supports Enhanced IBRS, kernel might un-intentionally not
1317 * enable IBRS around firmware calls.
1318 */
1319 - if (boot_cpu_has(X86_FEATURE_IBRS) && !spectre_v2_in_eibrs_mode(mode)) {
1320 + if (boot_cpu_has(X86_FEATURE_IBRS) && !spectre_v2_in_ibrs_mode(mode)) {
1321 setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW);
1322 pr_info("Enabling Restricted Speculation for firmware calls\n");
1323 }
1324
1325 /* Set up IBPB and STIBP depending on the general spectre V2 command */
1326 - spectre_v2_user_select_mitigation(cmd);
1327 + spectre_v2_cmd = cmd;
1328 }
1329
1330 static void update_stibp_msr(void * __unused)
1331 {
1332 - wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
1333 + u64 val = spec_ctrl_current() | (x86_spec_ctrl_base & SPEC_CTRL_STIBP);
1334 + write_spec_ctrl_current(val, true);
1335 }
1336
1337 /* Update x86_spec_ctrl_base in case SMT state changed. */
1338 @@ -1421,16 +1635,6 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void)
1339 break;
1340 }
1341
1342 - /*
1343 - * If SSBD is controlled by the SPEC_CTRL MSR, then set the proper
1344 - * bit in the mask to allow guests to use the mitigation even in the
1345 - * case where the host does not enable it.
1346 - */
1347 - if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
1348 - static_cpu_has(X86_FEATURE_AMD_SSBD)) {
1349 - x86_spec_ctrl_mask |= SPEC_CTRL_SSBD;
1350 - }
1351 -
1352 /*
1353 * We have three CPU feature flags that are in play here:
1354 * - X86_BUG_SPEC_STORE_BYPASS - CPU is susceptible.
1355 @@ -1448,7 +1652,7 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void)
1356 x86_amd_ssb_disable();
1357 } else {
1358 x86_spec_ctrl_base |= SPEC_CTRL_SSBD;
1359 - wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
1360 + write_spec_ctrl_current(x86_spec_ctrl_base, true);
1361 }
1362 }
1363
1364 @@ -1665,7 +1869,7 @@ int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which)
1365 void x86_spec_ctrl_setup_ap(void)
1366 {
1367 if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL))
1368 - wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
1369 + write_spec_ctrl_current(x86_spec_ctrl_base, true);
1370
1371 if (ssb_mode == SPEC_STORE_BYPASS_DISABLE)
1372 x86_amd_ssb_disable();
1373 @@ -1900,7 +2104,7 @@ static ssize_t mmio_stale_data_show_state(char *buf)
1374
1375 static char *stibp_state(void)
1376 {
1377 - if (spectre_v2_in_eibrs_mode(spectre_v2_enabled))
1378 + if (spectre_v2_in_ibrs_mode(spectre_v2_enabled))
1379 return "";
1380
1381 switch (spectre_v2_user_stibp) {
1382 @@ -1934,7 +2138,7 @@ static char *pbrsb_eibrs_state(void)
1383 {
1384 if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) {
1385 if (boot_cpu_has(X86_FEATURE_RSB_VMEXIT_LITE) ||
1386 - boot_cpu_has(X86_FEATURE_RETPOLINE))
1387 + boot_cpu_has(X86_FEATURE_RSB_VMEXIT))
1388 return ", PBRSB-eIBRS: SW sequence";
1389 else
1390 return ", PBRSB-eIBRS: Vulnerable";
1391 @@ -1970,6 +2174,11 @@ static ssize_t srbds_show_state(char *buf)
1392 return sprintf(buf, "%s\n", srbds_strings[srbds_mitigation]);
1393 }
1394
1395 +static ssize_t retbleed_show_state(char *buf)
1396 +{
1397 + return sprintf(buf, "%s\n", retbleed_strings[retbleed_mitigation]);
1398 +}
1399 +
1400 static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
1401 char *buf, unsigned int bug)
1402 {
1403 @@ -2016,6 +2225,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
1404 case X86_BUG_MMIO_UNKNOWN:
1405 return mmio_stale_data_show_state(buf);
1406
1407 + case X86_BUG_RETBLEED:
1408 + return retbleed_show_state(buf);
1409 +
1410 default:
1411 break;
1412 }
1413 @@ -2075,4 +2287,9 @@ ssize_t cpu_show_mmio_stale_data(struct device *dev, struct device_attribute *at
1414 else
1415 return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_STALE_DATA);
1416 }
1417 +
1418 +ssize_t cpu_show_retbleed(struct device *dev, struct device_attribute *attr, char *buf)
1419 +{
1420 + return cpu_show_common(dev, attr, buf, X86_BUG_RETBLEED);
1421 +}
1422 #endif
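The new cpu_show_retbleed() handler feeds the standard sysfs vulnerabilities interface (the matching drivers/base/cpu.c attribute is not shown in this excerpt). Once booted, the state would be read roughly like this, with the string coming from retbleed_strings[] above:

    $ cat /sys/devices/system/cpu/vulnerabilities/retbleed
    Mitigation: IBRS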
1423 diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
1424 index 59413e741ecf1..5e1e32f1086ba 100644
1425 --- a/arch/x86/kernel/cpu/common.c
1426 +++ b/arch/x86/kernel/cpu/common.c
1427 @@ -1102,48 +1102,60 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
1428 {}
1429 };
1430
1431 +#define VULNBL(vendor, family, model, blacklist) \
1432 + X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, blacklist)
1433 +
1434 #define VULNBL_INTEL_STEPPINGS(model, steppings, issues) \
1435 X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, \
1436 INTEL_FAM6_##model, steppings, \
1437 X86_FEATURE_ANY, issues)
1438
1439 +#define VULNBL_AMD(family, blacklist) \
1440 + VULNBL(AMD, family, X86_MODEL_ANY, blacklist)
1441 +
1442 +#define VULNBL_HYGON(family, blacklist) \
1443 + VULNBL(HYGON, family, X86_MODEL_ANY, blacklist)
1444 +
1445 #define SRBDS BIT(0)
1446 /* CPU is affected by X86_BUG_MMIO_STALE_DATA */
1447 #define MMIO BIT(1)
1448 /* CPU is affected by Shared Buffers Data Sampling (SBDS), a variant of X86_BUG_MMIO_STALE_DATA */
1449 #define MMIO_SBDS BIT(2)
1450 +/* CPU is affected by RETbleed, speculating where you would not expect it */
1451 +#define RETBLEED BIT(3)
1452
1453 static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
1454 VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS),
1455 VULNBL_INTEL_STEPPINGS(HASWELL, X86_STEPPING_ANY, SRBDS),
1456 VULNBL_INTEL_STEPPINGS(HASWELL_L, X86_STEPPING_ANY, SRBDS),
1457 VULNBL_INTEL_STEPPINGS(HASWELL_G, X86_STEPPING_ANY, SRBDS),
1458 - VULNBL_INTEL_STEPPINGS(HASWELL_X, BIT(2) | BIT(4), MMIO),
1459 - VULNBL_INTEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x3, 0x5), MMIO),
1460 + VULNBL_INTEL_STEPPINGS(HASWELL_X, X86_STEPPING_ANY, MMIO),
1461 + VULNBL_INTEL_STEPPINGS(BROADWELL_D, X86_STEPPING_ANY, MMIO),
1462 VULNBL_INTEL_STEPPINGS(BROADWELL_G, X86_STEPPING_ANY, SRBDS),
1463 VULNBL_INTEL_STEPPINGS(BROADWELL_X, X86_STEPPING_ANY, MMIO),
1464 VULNBL_INTEL_STEPPINGS(BROADWELL, X86_STEPPING_ANY, SRBDS),
1465 - VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPINGS(0x3, 0x3), SRBDS | MMIO),
1466 - VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, SRBDS),
1467 - VULNBL_INTEL_STEPPINGS(SKYLAKE_X, BIT(3) | BIT(4) | BIT(6) |
1468 - BIT(7) | BIT(0xB), MMIO),
1469 - VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPINGS(0x3, 0x3), SRBDS | MMIO),
1470 - VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, SRBDS),
1471 - VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPINGS(0x9, 0xC), SRBDS | MMIO),
1472 - VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPINGS(0x0, 0x8), SRBDS),
1473 - VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPINGS(0x9, 0xD), SRBDS | MMIO),
1474 - VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPINGS(0x0, 0x8), SRBDS),
1475 - VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPINGS(0x5, 0x5), MMIO | MMIO_SBDS),
1476 - VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPINGS(0x1, 0x1), MMIO),
1477 - VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPINGS(0x4, 0x6), MMIO),
1478 - VULNBL_INTEL_STEPPINGS(COMETLAKE, BIT(2) | BIT(3) | BIT(5), MMIO | MMIO_SBDS),
1479 - VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x1, 0x1), MMIO | MMIO_SBDS),
1480 - VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO),
1481 - VULNBL_INTEL_STEPPINGS(LAKEFIELD, X86_STEPPINGS(0x1, 0x1), MMIO | MMIO_SBDS),
1482 - VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPINGS(0x1, 0x1), MMIO),
1483 - VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPINGS(0x1, 0x1), MMIO | MMIO_SBDS),
1484 + VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED),
1485 + VULNBL_INTEL_STEPPINGS(SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED),
1486 + VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED),
1487 + VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED),
1488 + VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED),
1489 + VULNBL_INTEL_STEPPINGS(CANNONLAKE_L, X86_STEPPING_ANY, RETBLEED),
1490 + VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED),
1491 + VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPING_ANY, MMIO),
1492 + VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPING_ANY, MMIO),
1493 + VULNBL_INTEL_STEPPINGS(COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED),
1494 + VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO | RETBLEED),
1495 + VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED),
1496 + VULNBL_INTEL_STEPPINGS(LAKEFIELD, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED),
1497 + VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED),
1498 + VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS),
1499 VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO),
1500 - VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPINGS(0x0, 0x0), MMIO | MMIO_SBDS),
1501 + VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS),
1502 +
1503 + VULNBL_AMD(0x15, RETBLEED),
1504 + VULNBL_AMD(0x16, RETBLEED),
1505 + VULNBL_AMD(0x17, RETBLEED),
1506 + VULNBL_HYGON(0x18, RETBLEED),
1507 {}
1508 };
1509
1510 @@ -1251,6 +1263,11 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
1511 setup_force_cpu_bug(X86_BUG_MMIO_UNKNOWN);
1512 }
1513
1514 + if (!cpu_has(c, X86_FEATURE_BTC_NO)) {
1515 + if (cpu_matches(cpu_vuln_blacklist, RETBLEED) || (ia32_cap & ARCH_CAP_RSBA))
1516 + setup_force_cpu_bug(X86_BUG_RETBLEED);
1517 + }
1518 +
1519 if (cpu_has(c, X86_FEATURE_IBRS_ENHANCED) &&
1520 !cpu_matches(cpu_vuln_whitelist, NO_EIBRS_PBRSB) &&
1521 !(ia32_cap & ARCH_CAP_PBRSB_NO))
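
Illustrative aside (not part of the patch; all names and values below are hypothetical): the cpu_vuln_blacklist hunk above pairs vendor/family/model match entries with a bitmask of issue bits (SRBDS, MMIO, MMIO_SBDS, RETBLEED), and cpu_set_bug_bits() then forces X86_BUG_RETBLEED only when the CPU lacks BTC_NO and either matches a RETBLEED entry or advertises ARCH_CAP_RSBA. A minimal stand-alone C sketch of that table-plus-bitmask pattern:

/*
 * Sketch of the "match table + issue bitmask" idea; this is not the kernel's
 * x86_cpu_id machinery, just the same lookup pattern in plain C.
 */
#include <stdbool.h>
#include <stdio.h>

#define SRBDS    (1u << 0)
#define MMIO     (1u << 1)
#define RETBLEED (1u << 3)

struct vuln_entry {
	unsigned int vendor;
	unsigned int family;
	unsigned int model;   /* 0 means "any model" in this sketch */
	unsigned int issues;
};

static const struct vuln_entry blacklist[] = {
	{ 0 /* Intel */, 6,    0x8e, SRBDS | MMIO | RETBLEED },
	{ 2 /* AMD   */, 0x17, 0,    RETBLEED },
	{ 0, 0, 0, 0 }  /* terminator */
};

static bool cpu_matches(unsigned int vendor, unsigned int family,
			unsigned int model, unsigned int which)
{
	const struct vuln_entry *e;

	for (e = blacklist; e->issues; e++) {
		if (e->vendor == vendor && e->family == family &&
		    (!e->model || e->model == model) && (e->issues & which))
			return true;
	}
	return false;
}

int main(void)
{
	/* A Kaby Lake mobile part (family 6, model 0x8e) hits the RETBLEED bit. */
	printf("retbleed affected: %d\n", cpu_matches(0, 6, 0x8e, RETBLEED));
	return 0;
}
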
1522 diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c
1523 index 2f163e6646b6f..ad6776081e60d 100644
1524 --- a/arch/x86/kernel/cpu/match.c
1525 +++ b/arch/x86/kernel/cpu/match.c
1526 @@ -16,12 +16,17 @@
1527 * respective wildcard entries.
1528 *
1529 * A typical table entry would be to match a specific CPU
1530 - * { X86_VENDOR_INTEL, 6, 0x12 }
1531 - * or to match a specific CPU feature
1532 - * { X86_FEATURE_MATCH(X86_FEATURE_FOOBAR) }
1533 + *
1534 + * X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, INTEL_FAM6_BROADWELL,
1535 + * X86_FEATURE_ANY, NULL);
1536 *
1537 * Fields can be wildcarded with %X86_VENDOR_ANY, %X86_FAMILY_ANY,
1538 - * %X86_MODEL_ANY, %X86_FEATURE_ANY or 0 (except for vendor)
1539 + * %X86_MODEL_ANY, %X86_FEATURE_ANY (except for vendor)
1540 + *
1541 + * asm/cpu_device_id.h contains a set of useful macros which are shortcuts
1542 + * for various common selections. The above can be shortened to:
1543 + *
1544 + * X86_MATCH_INTEL_FAM6_MODEL(BROADWELL, NULL);
1545 *
1546 * Arrays used to match for this should also be declared using
1547 * MODULE_DEVICE_TABLE(x86cpu, ...)
1548 diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
1549 index 53004dbd55c47..a03e309a0ac5f 100644
1550 --- a/arch/x86/kernel/cpu/scattered.c
1551 +++ b/arch/x86/kernel/cpu/scattered.c
1552 @@ -26,6 +26,7 @@ struct cpuid_bit {
1553 static const struct cpuid_bit cpuid_bits[] = {
1554 { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 },
1555 { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 },
1556 + { X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 },
1557 { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 },
1558 { X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 },
1559 { X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 },
1560 diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
1561 index 068715a52ac10..87cfd2ee9ca0d 100644
1562 --- a/arch/x86/kernel/process.c
1563 +++ b/arch/x86/kernel/process.c
1564 @@ -449,7 +449,7 @@ static __always_inline void __speculation_ctrl_update(unsigned long tifp,
1565 }
1566
1567 if (updmsr)
1568 - wrmsrl(MSR_IA32_SPEC_CTRL, msr);
1569 + write_spec_ctrl_current(msr, false);
1570 }
1571
1572 static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk)
1573 diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
1574 index 1efcc7d4bc88e..3db407e3c4166 100644
1575 --- a/arch/x86/kvm/svm.c
1576 +++ b/arch/x86/kvm/svm.c
1577 @@ -47,6 +47,7 @@
1578 #include <asm/kvm_para.h>
1579 #include <asm/irq_remapping.h>
1580 #include <asm/spec-ctrl.h>
1581 +#include <asm/cpu_device_id.h>
1582
1583 #include <asm/virtext.h>
1584 #include "trace.h"
1585 diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
1586 index 34ee4835b0177..a7b62a00913e5 100644
1587 --- a/arch/x86/kvm/vmx/nested.c
1588 +++ b/arch/x86/kvm/vmx/nested.c
1589 @@ -11,6 +11,7 @@
1590 #include "mmu.h"
1591 #include "nested.h"
1592 #include "trace.h"
1593 +#include "vmx.h"
1594 #include "x86.h"
1595
1596 static bool __read_mostly enable_shadow_vmcs = 1;
1597 @@ -2863,35 +2864,8 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
1598 vmx->loaded_vmcs->host_state.cr4 = cr4;
1599 }
1600
1601 - asm(
1602 - "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */
1603 - "cmp %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
1604 - "je 1f \n\t"
1605 - __ex("vmwrite %%" _ASM_SP ", %[HOST_RSP]") "\n\t"
1606 - "mov %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
1607 - "1: \n\t"
1608 - "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */
1609 -
1610 - /* Check if vmlaunch or vmresume is needed */
1611 - "cmpb $0, %c[launched](%[loaded_vmcs])\n\t"
1612 -
1613 - /*
1614 - * VMLAUNCH and VMRESUME clear RFLAGS.{CF,ZF} on VM-Exit, set
1615 - * RFLAGS.CF on VM-Fail Invalid and set RFLAGS.ZF on VM-Fail
1616 - * Valid. vmx_vmenter() directly "returns" RFLAGS, and so the
1617 - * results of VM-Enter is captured via CC_{SET,OUT} to vm_fail.
1618 - */
1619 - "call vmx_vmenter\n\t"
1620 -
1621 - CC_SET(be)
1622 - : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail)
1623 - : [HOST_RSP]"r"((unsigned long)HOST_RSP),
1624 - [loaded_vmcs]"r"(vmx->loaded_vmcs),
1625 - [launched]"i"(offsetof(struct loaded_vmcs, launched)),
1626 - [host_state_rsp]"i"(offsetof(struct loaded_vmcs, host_state.rsp)),
1627 - [wordsize]"i"(sizeof(ulong))
1628 - : "memory"
1629 - );
1630 + vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
1631 + __vmx_vcpu_run_flags(vmx));
1632
1633 if (vmx->msr_autoload.host.nr)
1634 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
1635 diff --git a/arch/x86/kvm/vmx/run_flags.h b/arch/x86/kvm/vmx/run_flags.h
1636 new file mode 100644
1637 index 0000000000000..edc3f16cc1896
1638 --- /dev/null
1639 +++ b/arch/x86/kvm/vmx/run_flags.h
1640 @@ -0,0 +1,8 @@
1641 +/* SPDX-License-Identifier: GPL-2.0 */
1642 +#ifndef __KVM_X86_VMX_RUN_FLAGS_H
1643 +#define __KVM_X86_VMX_RUN_FLAGS_H
1644 +
1645 +#define VMX_RUN_VMRESUME (1 << 0)
1646 +#define VMX_RUN_SAVE_SPEC_CTRL (1 << 1)
1647 +
1648 +#endif /* __KVM_X86_VMX_RUN_FLAGS_H */
1649 diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
1650 index 946d9205c3b6d..2850670c38bb0 100644
1651 --- a/arch/x86/kvm/vmx/vmenter.S
1652 +++ b/arch/x86/kvm/vmx/vmenter.S
1653 @@ -4,6 +4,7 @@
1654 #include <asm/bitsperlong.h>
1655 #include <asm/kvm_vcpu_regs.h>
1656 #include <asm/nospec-branch.h>
1657 +#include "run_flags.h"
1658
1659 #define WORD_SIZE (BITS_PER_LONG / 8)
1660
1661 @@ -29,78 +30,12 @@
1662
1663 .text
1664
1665 -/**
1666 - * vmx_vmenter - VM-Enter the current loaded VMCS
1667 - *
1668 - * %RFLAGS.ZF: !VMCS.LAUNCHED, i.e. controls VMLAUNCH vs. VMRESUME
1669 - *
1670 - * Returns:
1671 - * %RFLAGS.CF is set on VM-Fail Invalid
1672 - * %RFLAGS.ZF is set on VM-Fail Valid
1673 - * %RFLAGS.{CF,ZF} are cleared on VM-Success, i.e. VM-Exit
1674 - *
1675 - * Note that VMRESUME/VMLAUNCH fall-through and return directly if
1676 - * they VM-Fail, whereas a successful VM-Enter + VM-Exit will jump
1677 - * to vmx_vmexit.
1678 - */
1679 -ENTRY(vmx_vmenter)
1680 - /* EFLAGS.ZF is set if VMCS.LAUNCHED == 0 */
1681 - je 2f
1682 -
1683 -1: vmresume
1684 - ret
1685 -
1686 -2: vmlaunch
1687 - ret
1688 -
1689 -3: cmpb $0, kvm_rebooting
1690 - je 4f
1691 - ret
1692 -4: ud2
1693 -
1694 - .pushsection .fixup, "ax"
1695 -5: jmp 3b
1696 - .popsection
1697 -
1698 - _ASM_EXTABLE(1b, 5b)
1699 - _ASM_EXTABLE(2b, 5b)
1700 -
1701 -ENDPROC(vmx_vmenter)
1702 -
1703 -/**
1704 - * vmx_vmexit - Handle a VMX VM-Exit
1705 - *
1706 - * Returns:
1707 - * %RFLAGS.{CF,ZF} are cleared on VM-Success, i.e. VM-Exit
1708 - *
1709 - * This is vmx_vmenter's partner in crime. On a VM-Exit, control will jump
1710 - * here after hardware loads the host's state, i.e. this is the destination
1711 - * referred to by VMCS.HOST_RIP.
1712 - */
1713 -ENTRY(vmx_vmexit)
1714 -#ifdef CONFIG_RETPOLINE
1715 - ALTERNATIVE "jmp .Lvmexit_skip_rsb", "", X86_FEATURE_RETPOLINE
1716 - /* Preserve guest's RAX, it's used to stuff the RSB. */
1717 - push %_ASM_AX
1718 -
1719 - /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
1720 - FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
1721 -
1722 - /* Clear RFLAGS.CF and RFLAGS.ZF to preserve VM-Exit, i.e. !VM-Fail. */
1723 - or $1, %_ASM_AX
1724 -
1725 - pop %_ASM_AX
1726 -.Lvmexit_skip_rsb:
1727 -#endif
1728 - ISSUE_UNBALANCED_RET_GUARD X86_FEATURE_RSB_VMEXIT_LITE
1729 - ret
1730 -ENDPROC(vmx_vmexit)
1731 -
1732 /**
1733 * __vmx_vcpu_run - Run a vCPU via a transition to VMX guest mode
1734 - * @vmx: struct vcpu_vmx * (forwarded to vmx_update_host_rsp)
1735 + * @vmx: struct vcpu_vmx *
1736 * @regs: unsigned long * (to guest registers)
1737 - * @launched: %true if the VMCS has been launched
1738 + * @flags: VMX_RUN_VMRESUME: use VMRESUME instead of VMLAUNCH
1739 + * VMX_RUN_SAVE_SPEC_CTRL: save guest SPEC_CTRL into vmx->spec_ctrl
1740 *
1741 * Returns:
1742 * 0 on VM-Exit, 1 on VM-Fail
1743 @@ -119,24 +54,29 @@ ENTRY(__vmx_vcpu_run)
1744 #endif
1745 push %_ASM_BX
1746
1747 + /* Save @vmx for SPEC_CTRL handling */
1748 + push %_ASM_ARG1
1749 +
1750 + /* Save @flags for SPEC_CTRL handling */
1751 + push %_ASM_ARG3
1752 +
1753 /*
1754 * Save @regs, _ASM_ARG2 may be modified by vmx_update_host_rsp() and
1755 * @regs is needed after VM-Exit to save the guest's register values.
1756 */
1757 push %_ASM_ARG2
1758
1759 - /* Copy @launched to BL, _ASM_ARG3 is volatile. */
1760 + /* Copy @flags to BL, _ASM_ARG3 is volatile. */
1761 mov %_ASM_ARG3B, %bl
1762
1763 - /* Adjust RSP to account for the CALL to vmx_vmenter(). */
1764 - lea -WORD_SIZE(%_ASM_SP), %_ASM_ARG2
1765 + lea (%_ASM_SP), %_ASM_ARG2
1766 call vmx_update_host_rsp
1767
1768 /* Load @regs to RAX. */
1769 mov (%_ASM_SP), %_ASM_AX
1770
1771 /* Check if vmlaunch or vmresume is needed */
1772 - cmpb $0, %bl
1773 + testb $VMX_RUN_VMRESUME, %bl
1774
1775 /* Load guest registers. Don't clobber flags. */
1776 mov VCPU_RBX(%_ASM_AX), %_ASM_BX
1777 @@ -158,11 +98,25 @@ ENTRY(__vmx_vcpu_run)
1778 /* Load guest RAX. This kills the @regs pointer! */
1779 mov VCPU_RAX(%_ASM_AX), %_ASM_AX
1780
1781 - /* Enter guest mode */
1782 - call vmx_vmenter
1783 + /* Check EFLAGS.ZF from 'testb' above */
1784 + jz .Lvmlaunch
1785
1786 - /* Jump on VM-Fail. */
1787 - jbe 2f
1788 +/*
1789 + * If VMRESUME/VMLAUNCH and corresponding vmexit succeed, execution resumes at
1790 + * the 'vmx_vmexit' label below.
1791 + */
1792 +.Lvmresume:
1793 + vmresume
1794 + jmp .Lvmfail
1795 +
1796 +.Lvmlaunch:
1797 + vmlaunch
1798 + jmp .Lvmfail
1799 +
1800 + _ASM_EXTABLE(.Lvmresume, .Lfixup)
1801 + _ASM_EXTABLE(.Lvmlaunch, .Lfixup)
1802 +
1803 +SYM_INNER_LABEL(vmx_vmexit, SYM_L_GLOBAL)
1804
1805 /* Temporarily save guest's RAX. */
1806 push %_ASM_AX
1807 @@ -189,19 +143,21 @@ ENTRY(__vmx_vcpu_run)
1808 mov %r15, VCPU_R15(%_ASM_AX)
1809 #endif
1810
1811 - /* Clear RAX to indicate VM-Exit (as opposed to VM-Fail). */
1812 - xor %eax, %eax
1813 + /* Clear return value to indicate VM-Exit (as opposed to VM-Fail). */
1814 + xor %ebx, %ebx
1815
1816 +.Lclear_regs:
1817 /*
1818 - * Clear all general purpose registers except RSP and RAX to prevent
1819 + * Clear all general purpose registers except RSP and RBX to prevent
1820 * speculative use of the guest's values, even those that are reloaded
1821 * via the stack. In theory, an L1 cache miss when restoring registers
1822 * could lead to speculative execution with the guest's values.
1823 * Zeroing XORs are dirt cheap, i.e. the extra paranoia is essentially
1824 * free. RSP and RAX are exempt as RSP is restored by hardware during
1825 - * VM-Exit and RAX is explicitly loaded with 0 or 1 to return VM-Fail.
1826 + * VM-Exit and RBX is explicitly loaded with 0 or 1 to hold the return
1827 + * value.
1828 */
1829 -1: xor %ebx, %ebx
1830 + xor %eax, %eax
1831 xor %ecx, %ecx
1832 xor %edx, %edx
1833 xor %esi, %esi
1834 @@ -220,8 +176,32 @@ ENTRY(__vmx_vcpu_run)
1835
1836 /* "POP" @regs. */
1837 add $WORD_SIZE, %_ASM_SP
1838 - pop %_ASM_BX
1839
1840 + /*
1841 + * IMPORTANT: RSB filling and SPEC_CTRL handling must be done before
1842 + * the first unbalanced RET after vmexit!
1843 + *
1844 + * For retpoline or IBRS, RSB filling is needed to prevent poisoned RSB
1845 + * entries and (in some cases) RSB underflow.
1846 + *
1847 + * eIBRS has its own protection against poisoned RSB, so it doesn't
1848 + * need the RSB filling sequence. But it does need to be enabled, and a
1849 + * single call to retire, before the first unbalanced RET.
1850 + */
1851 +
1852 + FILL_RETURN_BUFFER %_ASM_CX, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT,\
1853 + X86_FEATURE_RSB_VMEXIT_LITE
1854 +
1855 +
1856 + pop %_ASM_ARG2 /* @flags */
1857 + pop %_ASM_ARG1 /* @vmx */
1858 +
1859 + call vmx_spec_ctrl_restore_host
1860 +
1861 + /* Put return value in AX */
1862 + mov %_ASM_BX, %_ASM_AX
1863 +
1864 + pop %_ASM_BX
1865 #ifdef CONFIG_X86_64
1866 pop %r12
1867 pop %r13
1868 @@ -234,11 +214,20 @@ ENTRY(__vmx_vcpu_run)
1869 pop %_ASM_BP
1870 ret
1871
1872 - /* VM-Fail. Out-of-line to avoid a taken Jcc after VM-Exit. */
1873 -2: mov $1, %eax
1874 - jmp 1b
1875 +.Lfixup:
1876 + cmpb $0, kvm_rebooting
1877 + jne .Lvmfail
1878 + ud2
1879 +.Lvmfail:
1880 + /* VM-Fail: set return value to 1 */
1881 + mov $1, %_ASM_BX
1882 + jmp .Lclear_regs
1883 +
1884 ENDPROC(__vmx_vcpu_run)
1885
1886 +
1887 +.section .text, "ax"
1888 +
1889 /**
1890 * vmread_error_trampoline - Trampoline from inline asm to vmread_error()
1891 * @field: VMCS field encoding that failed
1892 diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
1893 index 4bd1bf6214eea..d522c9de41df9 100644
1894 --- a/arch/x86/kvm/vmx/vmx.c
1895 +++ b/arch/x86/kvm/vmx/vmx.c
1896 @@ -31,6 +31,7 @@
1897 #include <asm/apic.h>
1898 #include <asm/asm.h>
1899 #include <asm/cpu.h>
1900 +#include <asm/cpu_device_id.h>
1901 #include <asm/debugreg.h>
1902 #include <asm/desc.h>
1903 #include <asm/fpu/internal.h>
1904 @@ -358,9 +359,9 @@ static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx)
1905 if (!vmx->disable_fb_clear)
1906 return;
1907
1908 - rdmsrl(MSR_IA32_MCU_OPT_CTRL, msr);
1909 + msr = __rdmsr(MSR_IA32_MCU_OPT_CTRL);
1910 msr |= FB_CLEAR_DIS;
1911 - wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr);
1912 + native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr);
1913 /* Cache the MSR value to avoid reading it later */
1914 vmx->msr_ia32_mcu_opt_ctrl = msr;
1915 }
1916 @@ -371,7 +372,7 @@ static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx)
1917 return;
1918
1919 vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS;
1920 - wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl);
1921 + native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl);
1922 }
1923
1924 static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
1925 @@ -862,6 +863,24 @@ static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr)
1926 return true;
1927 }
1928
1929 +unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx)
1930 +{
1931 + unsigned int flags = 0;
1932 +
1933 + if (vmx->loaded_vmcs->launched)
1934 + flags |= VMX_RUN_VMRESUME;
1935 +
1936 + /*
1937 + * If writes to the SPEC_CTRL MSR aren't intercepted, the guest is free
1938 + * to change it directly without causing a vmexit. In that case read
1939 + * it after vmexit and store it in vmx->spec_ctrl.
1940 + */
1941 + if (unlikely(!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL)))
1942 + flags |= VMX_RUN_SAVE_SPEC_CTRL;
1943 +
1944 + return flags;
1945 +}
1946 +
1947 static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
1948 unsigned long entry, unsigned long exit)
1949 {
1950 @@ -6539,7 +6558,30 @@ void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
1951 }
1952 }
1953
1954 -bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched);
1955 +void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
1956 + unsigned int flags)
1957 +{
1958 + u64 hostval = this_cpu_read(x86_spec_ctrl_current);
1959 +
1960 + if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL))
1961 + return;
1962 +
1963 + if (flags & VMX_RUN_SAVE_SPEC_CTRL)
1964 + vmx->spec_ctrl = __rdmsr(MSR_IA32_SPEC_CTRL);
1965 +
1966 + /*
1967 + * If the guest/host SPEC_CTRL values differ, restore the host value.
1968 + *
1969 + * For legacy IBRS, the IBRS bit always needs to be written after
1970 + * transitioning from a less privileged predictor mode, regardless of
1971 + * whether the guest/host values differ.
1972 + */
1973 + if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) ||
1974 + vmx->spec_ctrl != hostval)
1975 + native_wrmsrl(MSR_IA32_SPEC_CTRL, hostval);
1976 +
1977 + barrier_nospec();
1978 +}
1979
1980 static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
1981 {
1982 @@ -6628,32 +6670,12 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
1983 write_cr2(vcpu->arch.cr2);
1984
1985 vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
1986 - vmx->loaded_vmcs->launched);
1987 + __vmx_vcpu_run_flags(vmx));
1988
1989 vcpu->arch.cr2 = read_cr2();
1990
1991 vmx_enable_fb_clear(vmx);
1992
1993 - /*
1994 - * We do not use IBRS in the kernel. If this vCPU has used the
1995 - * SPEC_CTRL MSR it may have left it on; save the value and
1996 - * turn it off. This is much more efficient than blindly adding
1997 - * it to the atomic save/restore list. Especially as the former
1998 - * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
1999 - *
2000 - * For non-nested case:
2001 - * If the L01 MSR bitmap does not intercept the MSR, then we need to
2002 - * save it.
2003 - *
2004 - * For nested case:
2005 - * If the L02 MSR bitmap does not intercept the MSR, then we need to
2006 - * save it.
2007 - */
2008 - if (unlikely(!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL)))
2009 - vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
2010 -
2011 - x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);
2012 -
2013 /* All fields are clean at this point */
2014 if (static_branch_unlikely(&enable_evmcs))
2015 current_evmcs->hv_clean_fields |=
2016 diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
2017 index 7a3362ab59867..4d5be4610af84 100644
2018 --- a/arch/x86/kvm/vmx/vmx.h
2019 +++ b/arch/x86/kvm/vmx/vmx.h
2020 @@ -10,6 +10,7 @@
2021 #include "capabilities.h"
2022 #include "ops.h"
2023 #include "vmcs.h"
2024 +#include "run_flags.h"
2025
2026 extern const u32 vmx_msr_index[];
2027 extern u64 host_efer;
2028 @@ -336,6 +337,10 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
2029 struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr);
2030 void pt_update_intercept_for_msr(struct vcpu_vmx *vmx);
2031 void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp);
2032 +void vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, unsigned int flags);
2033 +unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx);
2034 +bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs,
2035 + unsigned int flags);
2036
2037 #define POSTED_INTR_ON 0
2038 #define POSTED_INTR_SN 1
2039 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
2040 index d0b297583df88..c431a34522d6c 100644
2041 --- a/arch/x86/kvm/x86.c
2042 +++ b/arch/x86/kvm/x86.c
2043 @@ -10329,9 +10329,9 @@ void kvm_arch_end_assignment(struct kvm *kvm)
2044 }
2045 EXPORT_SYMBOL_GPL(kvm_arch_end_assignment);
2046
2047 -bool kvm_arch_has_assigned_device(struct kvm *kvm)
2048 +bool noinstr kvm_arch_has_assigned_device(struct kvm *kvm)
2049 {
2050 - return atomic_read(&kvm->arch.assigned_device_count);
2051 + return arch_atomic_read(&kvm->arch.assigned_device_count);
2052 }
2053 EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
2054
2055 diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
2056 index 9b5edf1dfe9e9..7000c836951c5 100644
2057 --- a/drivers/base/cpu.c
2058 +++ b/drivers/base/cpu.c
2059 @@ -574,6 +574,12 @@ ssize_t __weak cpu_show_mmio_stale_data(struct device *dev,
2060 return sysfs_emit(buf, "Not affected\n");
2061 }
2062
2063 +ssize_t __weak cpu_show_retbleed(struct device *dev,
2064 + struct device_attribute *attr, char *buf)
2065 +{
2066 + return sysfs_emit(buf, "Not affected\n");
2067 +}
2068 +
2069 static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL);
2070 static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL);
2071 static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL);
2072 @@ -584,6 +590,7 @@ static DEVICE_ATTR(tsx_async_abort, 0444, cpu_show_tsx_async_abort, NULL);
2073 static DEVICE_ATTR(itlb_multihit, 0444, cpu_show_itlb_multihit, NULL);
2074 static DEVICE_ATTR(srbds, 0444, cpu_show_srbds, NULL);
2075 static DEVICE_ATTR(mmio_stale_data, 0444, cpu_show_mmio_stale_data, NULL);
2076 +static DEVICE_ATTR(retbleed, 0444, cpu_show_retbleed, NULL);
2077
2078 static struct attribute *cpu_root_vulnerabilities_attrs[] = {
2079 &dev_attr_meltdown.attr,
2080 @@ -596,6 +603,7 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = {
2081 &dev_attr_itlb_multihit.attr,
2082 &dev_attr_srbds.attr,
2083 &dev_attr_mmio_stale_data.attr,
2084 + &dev_attr_retbleed.attr,
2085 NULL
2086 };
2087
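
Aside (illustration only, not part of the patch): the new retbleed attribute joins the existing entries in the cpu "vulnerabilities" sysfs group, so once the arch code provides cpu_show_retbleed() the status can be read from /sys/devices/system/cpu/vulnerabilities/retbleed. A small user-space sketch:

/* Sketch: read the retbleed vulnerability status exposed by the hunk above. */
#include <stdio.h>

int main(void)
{
	char buf[128];
	FILE *f = fopen("/sys/devices/system/cpu/vulnerabilities/retbleed", "r");

	if (!f) {
		perror("retbleed sysfs attribute");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		fputs(buf, stdout);   /* e.g. "Not affected" or a mitigation string */
	fclose(f);
	return 0;
}
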
2088 diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c
2089 index 4195834a45912..cf7ebe3bd1ad2 100644
2090 --- a/drivers/cpufreq/acpi-cpufreq.c
2091 +++ b/drivers/cpufreq/acpi-cpufreq.c
2092 @@ -30,6 +30,7 @@
2093 #include <asm/msr.h>
2094 #include <asm/processor.h>
2095 #include <asm/cpufeature.h>
2096 +#include <asm/cpu_device_id.h>
2097
2098 MODULE_AUTHOR("Paul Diefenbaugh, Dominik Brodowski");
2099 MODULE_DESCRIPTION("ACPI Processor P-States Driver");
2100 diff --git a/drivers/cpufreq/amd_freq_sensitivity.c b/drivers/cpufreq/amd_freq_sensitivity.c
2101 index e2df9d1121063..5107cbe2d64dd 100644
2102 --- a/drivers/cpufreq/amd_freq_sensitivity.c
2103 +++ b/drivers/cpufreq/amd_freq_sensitivity.c
2104 @@ -18,6 +18,7 @@
2105
2106 #include <asm/msr.h>
2107 #include <asm/cpufeature.h>
2108 +#include <asm/cpu_device_id.h>
2109
2110 #include "cpufreq_ondemand.h"
2111
2112 diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
2113 index d8687868407de..b588e0e409e72 100644
2114 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
2115 +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
2116 @@ -35,7 +35,6 @@
2117 #include <linux/pci.h>
2118 #include <linux/pm_runtime.h>
2119 #include <drm/drm_crtc_helper.h>
2120 -#include <drm/drm_damage_helper.h>
2121 #include <drm/drm_edid.h>
2122 #include <drm/drm_gem_framebuffer_helper.h>
2123 #include <drm/drm_fb_helper.h>
2124 @@ -496,7 +495,6 @@ bool amdgpu_display_ddc_probe(struct amdgpu_connector *amdgpu_connector,
2125 static const struct drm_framebuffer_funcs amdgpu_fb_funcs = {
2126 .destroy = drm_gem_fb_destroy,
2127 .create_handle = drm_gem_fb_create_handle,
2128 - .dirty = drm_atomic_helper_dirtyfb,
2129 };
2130
2131 uint32_t amdgpu_display_supported_domains(struct amdgpu_device *adev,
2132 diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
2133 index 347b08b56042f..63b2212262618 100644
2134 --- a/drivers/idle/intel_idle.c
2135 +++ b/drivers/idle/intel_idle.c
2136 @@ -46,11 +46,13 @@
2137 #include <linux/tick.h>
2138 #include <trace/events/power.h>
2139 #include <linux/sched.h>
2140 +#include <linux/sched/smt.h>
2141 #include <linux/notifier.h>
2142 #include <linux/cpu.h>
2143 #include <linux/moduleparam.h>
2144 #include <asm/cpu_device_id.h>
2145 #include <asm/intel-family.h>
2146 +#include <asm/nospec-branch.h>
2147 #include <asm/mwait.h>
2148 #include <asm/msr.h>
2149
2150 @@ -97,6 +99,12 @@ static struct cpuidle_state *cpuidle_state_table;
2151 */
2152 #define CPUIDLE_FLAG_TLB_FLUSHED 0x10000
2153
2154 +/*
2155 + * Disable IBRS across idle (when KERNEL_IBRS), is exclusive vs IRQ_ENABLE
2156 + * above.
2157 + */
2158 +#define CPUIDLE_FLAG_IBRS BIT(16)
2159 +
2160 /*
2161 * MWAIT takes an 8-bit "hint" in EAX "suggesting"
2162 * the C-state (top nibble) and sub-state (bottom nibble)
2163 @@ -107,6 +115,24 @@ static struct cpuidle_state *cpuidle_state_table;
2164 #define flg2MWAIT(flags) (((flags) >> 24) & 0xFF)
2165 #define MWAIT2flg(eax) ((eax & 0xFF) << 24)
2166
2167 +static __cpuidle int intel_idle_ibrs(struct cpuidle_device *dev,
2168 + struct cpuidle_driver *drv, int index)
2169 +{
2170 + bool smt_active = sched_smt_active();
2171 + u64 spec_ctrl = spec_ctrl_current();
2172 + int ret;
2173 +
2174 + if (smt_active)
2175 + wrmsrl(MSR_IA32_SPEC_CTRL, 0);
2176 +
2177 + ret = intel_idle(dev, drv, index);
2178 +
2179 + if (smt_active)
2180 + wrmsrl(MSR_IA32_SPEC_CTRL, spec_ctrl);
2181 +
2182 + return ret;
2183 +}
2184 +
2185 /*
2186 * States are indexed by the cstate number,
2187 * which is also the index into the MWAIT hint array.
2188 @@ -605,7 +631,7 @@ static struct cpuidle_state skl_cstates[] = {
2189 {
2190 .name = "C6",
2191 .desc = "MWAIT 0x20",
2192 - .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
2193 + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
2194 .exit_latency = 85,
2195 .target_residency = 200,
2196 .enter = &intel_idle,
2197 @@ -613,7 +639,7 @@ static struct cpuidle_state skl_cstates[] = {
2198 {
2199 .name = "C7s",
2200 .desc = "MWAIT 0x33",
2201 - .flags = MWAIT2flg(0x33) | CPUIDLE_FLAG_TLB_FLUSHED,
2202 + .flags = MWAIT2flg(0x33) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
2203 .exit_latency = 124,
2204 .target_residency = 800,
2205 .enter = &intel_idle,
2206 @@ -621,7 +647,7 @@ static struct cpuidle_state skl_cstates[] = {
2207 {
2208 .name = "C8",
2209 .desc = "MWAIT 0x40",
2210 - .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED,
2211 + .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
2212 .exit_latency = 200,
2213 .target_residency = 800,
2214 .enter = &intel_idle,
2215 @@ -629,7 +655,7 @@ static struct cpuidle_state skl_cstates[] = {
2216 {
2217 .name = "C9",
2218 .desc = "MWAIT 0x50",
2219 - .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED,
2220 + .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
2221 .exit_latency = 480,
2222 .target_residency = 5000,
2223 .enter = &intel_idle,
2224 @@ -637,7 +663,7 @@ static struct cpuidle_state skl_cstates[] = {
2225 {
2226 .name = "C10",
2227 .desc = "MWAIT 0x60",
2228 - .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED,
2229 + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
2230 .exit_latency = 890,
2231 .target_residency = 5000,
2232 .enter = &intel_idle,
2233 @@ -666,7 +692,7 @@ static struct cpuidle_state skx_cstates[] = {
2234 {
2235 .name = "C6",
2236 .desc = "MWAIT 0x20",
2237 - .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
2238 + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
2239 .exit_latency = 133,
2240 .target_residency = 600,
2241 .enter = &intel_idle,
2242 @@ -1370,6 +1396,11 @@ static void __init intel_idle_cpuidle_driver_init(void)
2243 drv->states[drv->state_count] = /* structure copy */
2244 cpuidle_state_table[cstate];
2245
2246 + if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) &&
2247 + cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_IBRS) {
2248 + drv->states[drv->state_count].enter = intel_idle_ibrs;
2249 + }
2250 +
2251 drv->state_count += 1;
2252 }
2253
2254 diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
2255 index 510ca69746042..c83ff610ecb6c 100644
2256 --- a/fs/xfs/libxfs/xfs_attr.c
2257 +++ b/fs/xfs/libxfs/xfs_attr.c
2258 @@ -1007,7 +1007,7 @@ restart:
2259 * The INCOMPLETE flag means that we will find the "old"
2260 * attr, not the "new" one.
2261 */
2262 - args->flags |= XFS_ATTR_INCOMPLETE;
2263 + args->op_flags |= XFS_DA_OP_INCOMPLETE;
2264 state = xfs_da_state_alloc();
2265 state->args = args;
2266 state->mp = mp;
2267 diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
2268 index 0c23127347aca..c86ddbf6d105b 100644
2269 --- a/fs/xfs/libxfs/xfs_attr_leaf.c
2270 +++ b/fs/xfs/libxfs/xfs_attr_leaf.c
2271 @@ -2345,8 +2345,8 @@ xfs_attr3_leaf_lookup_int(
2272 * If we are looking for INCOMPLETE entries, show only those.
2273 * If we are looking for complete entries, show only those.
2274 */
2275 - if ((args->flags & XFS_ATTR_INCOMPLETE) !=
2276 - (entry->flags & XFS_ATTR_INCOMPLETE)) {
2277 + if (!!(args->op_flags & XFS_DA_OP_INCOMPLETE) !=
2278 + !!(entry->flags & XFS_ATTR_INCOMPLETE)) {
2279 continue;
2280 }
2281 if (entry->flags & XFS_ATTR_LOCAL) {
2282 diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
2283 index 7b74e18becff7..38c05d6ae2aa4 100644
2284 --- a/fs/xfs/libxfs/xfs_attr_leaf.h
2285 +++ b/fs/xfs/libxfs/xfs_attr_leaf.h
2286 @@ -17,13 +17,27 @@ struct xfs_inode;
2287 struct xfs_trans;
2288
2289 /*
2290 - * Used to keep a list of "remote value" extents when unlinking an inode.
2291 + * Incore version of the attribute leaf header.
2292 */
2293 -typedef struct xfs_attr_inactive_list {
2294 - xfs_dablk_t valueblk; /* block number of value bytes */
2295 - int valuelen; /* number of bytes in value */
2296 -} xfs_attr_inactive_list_t;
2297 -
2298 +struct xfs_attr3_icleaf_hdr {
2299 + uint32_t forw;
2300 + uint32_t back;
2301 + uint16_t magic;
2302 + uint16_t count;
2303 + uint16_t usedbytes;
2304 + /*
2305 + * Firstused is 32-bit here instead of 16-bit like the on-disk variant
2306 + * to support maximum fsb size of 64k without overflow issues throughout
2307 + * the attr code. Instead, the overflow condition is handled on
2308 + * conversion to/from disk.
2309 + */
2310 + uint32_t firstused;
2311 + __u8 holes;
2312 + struct {
2313 + uint16_t base;
2314 + uint16_t size;
2315 + } freemap[XFS_ATTR_LEAF_MAPSIZE];
2316 +};
2317
2318 /*========================================================================
2319 * Function prototypes for the kernel.
2320 diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
2321 index 3e39b7d40f256..de9096b8a47c6 100644
2322 --- a/fs/xfs/libxfs/xfs_attr_remote.c
2323 +++ b/fs/xfs/libxfs/xfs_attr_remote.c
2324 @@ -24,6 +24,23 @@
2325
2326 #define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */
2327
2328 +/*
2329 + * Remote Attribute Values
2330 + * =======================
2331 + *
2332 + * Remote extended attribute values are conceptually simple -- they're written
2333 + * to data blocks mapped by an inode's attribute fork, and they have an upper
2334 + * size limit of 64k. Setting a value does not involve the XFS log.
2335 + *
2336 + * However, on a v5 filesystem, maximally sized remote attr values require one
2337 + * block more than 64k worth of space to hold both the remote attribute value
2338 + * header (64 bytes). On a 4k block filesystem this results in a 68k buffer;
2339 + * on a 64k block filesystem, this would be a 128k buffer. Note that the log
2340 + * format can only handle a dirty buffer of XFS_MAX_BLOCKSIZE length (64k).
2341 + * Therefore, we /must/ ensure that remote attribute value buffers never touch
2342 + * the logging system and therefore never have a log item.
2343 + */
2344 +
2345 /*
2346 * Each contiguous block has a header, so it is not just a simple attribute
2347 * length to FSB conversion.
2348 @@ -400,17 +417,25 @@ xfs_attr_rmtval_get(
2349 (map[i].br_startblock != HOLESTARTBLOCK));
2350 dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
2351 dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
2352 - error = xfs_trans_read_buf(mp, args->trans,
2353 - mp->m_ddev_targp,
2354 - dblkno, dblkcnt, 0, &bp,
2355 - &xfs_attr3_rmt_buf_ops);
2356 - if (error)
2357 + bp = xfs_buf_read(mp->m_ddev_targp, dblkno, dblkcnt, 0,
2358 + &xfs_attr3_rmt_buf_ops);
2359 + if (!bp)
2360 + return -ENOMEM;
2361 + error = bp->b_error;
2362 + if (error) {
2363 + xfs_buf_ioerror_alert(bp, __func__);
2364 + xfs_buf_relse(bp);
2365 +
2366 + /* bad CRC means corrupted metadata */
2367 + if (error == -EFSBADCRC)
2368 + error = -EFSCORRUPTED;
2369 return error;
2370 + }
2371
2372 error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino,
2373 &offset, &valuelen,
2374 &dst);
2375 - xfs_trans_brelse(args->trans, bp);
2376 + xfs_buf_relse(bp);
2377 if (error)
2378 return error;
2379
2380 @@ -551,6 +576,32 @@ xfs_attr_rmtval_set(
2381 return 0;
2382 }
2383
2384 +/* Mark stale any incore buffers for the remote value. */
2385 +int
2386 +xfs_attr_rmtval_stale(
2387 + struct xfs_inode *ip,
2388 + struct xfs_bmbt_irec *map,
2389 + xfs_buf_flags_t incore_flags)
2390 +{
2391 + struct xfs_mount *mp = ip->i_mount;
2392 + struct xfs_buf *bp;
2393 +
2394 + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
2395 +
2396 + ASSERT((map->br_startblock != DELAYSTARTBLOCK) &&
2397 + (map->br_startblock != HOLESTARTBLOCK));
2398 +
2399 + bp = xfs_buf_incore(mp->m_ddev_targp,
2400 + XFS_FSB_TO_DADDR(mp, map->br_startblock),
2401 + XFS_FSB_TO_BB(mp, map->br_blockcount), incore_flags);
2402 + if (bp) {
2403 + xfs_buf_stale(bp);
2404 + xfs_buf_relse(bp);
2405 + }
2406 +
2407 + return 0;
2408 +}
2409 +
2410 /*
2411 * Remove the value associated with an attribute by deleting the
2412 * out-of-line buffer that it is stored on.
2413 @@ -559,7 +610,6 @@ int
2414 xfs_attr_rmtval_remove(
2415 struct xfs_da_args *args)
2416 {
2417 - struct xfs_mount *mp = args->dp->i_mount;
2418 xfs_dablk_t lblkno;
2419 int blkcnt;
2420 int error;
2421 @@ -574,9 +624,6 @@ xfs_attr_rmtval_remove(
2422 blkcnt = args->rmtblkcnt;
2423 while (blkcnt > 0) {
2424 struct xfs_bmbt_irec map;
2425 - struct xfs_buf *bp;
2426 - xfs_daddr_t dblkno;
2427 - int dblkcnt;
2428 int nmap;
2429
2430 /*
2431 @@ -588,21 +635,9 @@ xfs_attr_rmtval_remove(
2432 if (error)
2433 return error;
2434 ASSERT(nmap == 1);
2435 - ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
2436 - (map.br_startblock != HOLESTARTBLOCK));
2437 -
2438 - dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
2439 - dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
2440 -
2441 - /*
2442 - * If the "remote" value is in the cache, remove it.
2443 - */
2444 - bp = xfs_buf_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK);
2445 - if (bp) {
2446 - xfs_buf_stale(bp);
2447 - xfs_buf_relse(bp);
2448 - bp = NULL;
2449 - }
2450 + error = xfs_attr_rmtval_stale(args->dp, &map, XBF_TRYLOCK);
2451 + if (error)
2452 + return error;
2453
2454 lblkno += map.br_blockcount;
2455 blkcnt -= map.br_blockcount;
2456 diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h
2457 index 9d20b66ad379e..6fb4572845ce8 100644
2458 --- a/fs/xfs/libxfs/xfs_attr_remote.h
2459 +++ b/fs/xfs/libxfs/xfs_attr_remote.h
2460 @@ -11,5 +11,7 @@ int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen);
2461 int xfs_attr_rmtval_get(struct xfs_da_args *args);
2462 int xfs_attr_rmtval_set(struct xfs_da_args *args);
2463 int xfs_attr_rmtval_remove(struct xfs_da_args *args);
2464 +int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map,
2465 + xfs_buf_flags_t incore_flags);
2466
2467 #endif /* __XFS_ATTR_REMOTE_H__ */
2468 diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
2469 index ae0bbd20d9caf..588e4674e931f 100644
2470 --- a/fs/xfs/libxfs/xfs_da_btree.h
2471 +++ b/fs/xfs/libxfs/xfs_da_btree.h
2472 @@ -82,6 +82,7 @@ typedef struct xfs_da_args {
2473 #define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */
2474 #define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */
2475 #define XFS_DA_OP_ALLOCVAL 0x0020 /* lookup to alloc buffer if found */
2476 +#define XFS_DA_OP_INCOMPLETE 0x0040 /* lookup INCOMPLETE attr keys */
2477
2478 #define XFS_DA_OP_FLAGS \
2479 { XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \
2480 @@ -89,7 +90,8 @@ typedef struct xfs_da_args {
2481 { XFS_DA_OP_ADDNAME, "ADDNAME" }, \
2482 { XFS_DA_OP_OKNOENT, "OKNOENT" }, \
2483 { XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \
2484 - { XFS_DA_OP_ALLOCVAL, "ALLOCVAL" }
2485 + { XFS_DA_OP_ALLOCVAL, "ALLOCVAL" }, \
2486 + { XFS_DA_OP_INCOMPLETE, "INCOMPLETE" }
2487
2488 /*
2489 * Storage for holding state during Btree searches and split/join ops.
2490 @@ -124,6 +126,19 @@ typedef struct xfs_da_state {
2491 /* for dirv2 extrablk is data */
2492 } xfs_da_state_t;
2493
2494 +/*
2495 + * In-core version of the node header to abstract the differences in the v2 and
2496 + * v3 disk format of the headers. Callers need to convert to/from disk format as
2497 + * appropriate.
2498 + */
2499 +struct xfs_da3_icnode_hdr {
2500 + uint32_t forw;
2501 + uint32_t back;
2502 + uint16_t magic;
2503 + uint16_t count;
2504 + uint16_t level;
2505 +};
2506 +
2507 /*
2508 * Utility macros to aid in logging changed structure fields.
2509 */
2510 diff --git a/fs/xfs/libxfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c
2511 index b1ae572496b69..31bb250c18992 100644
2512 --- a/fs/xfs/libxfs/xfs_da_format.c
2513 +++ b/fs/xfs/libxfs/xfs_da_format.c
2514 @@ -13,6 +13,7 @@
2515 #include "xfs_mount.h"
2516 #include "xfs_inode.h"
2517 #include "xfs_dir2.h"
2518 +#include "xfs_dir2_priv.h"
2519
2520 /*
2521 * Shortform directory ops
2522 diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
2523 index ae654e06b2fb6..222ee48da5e80 100644
2524 --- a/fs/xfs/libxfs/xfs_da_format.h
2525 +++ b/fs/xfs/libxfs/xfs_da_format.h
2526 @@ -93,19 +93,6 @@ struct xfs_da3_intnode {
2527 struct xfs_da_node_entry __btree[];
2528 };
2529
2530 -/*
2531 - * In-core version of the node header to abstract the differences in the v2 and
2532 - * v3 disk format of the headers. Callers need to convert to/from disk format as
2533 - * appropriate.
2534 - */
2535 -struct xfs_da3_icnode_hdr {
2536 - uint32_t forw;
2537 - uint32_t back;
2538 - uint16_t magic;
2539 - uint16_t count;
2540 - uint16_t level;
2541 -};
2542 -
2543 /*
2544 * Directory version 2.
2545 *
2546 @@ -434,14 +421,6 @@ struct xfs_dir3_leaf_hdr {
2547 __be32 pad; /* 64 bit alignment */
2548 };
2549
2550 -struct xfs_dir3_icleaf_hdr {
2551 - uint32_t forw;
2552 - uint32_t back;
2553 - uint16_t magic;
2554 - uint16_t count;
2555 - uint16_t stale;
2556 -};
2557 -
2558 /*
2559 * Leaf block entry.
2560 */
2561 @@ -520,19 +499,6 @@ struct xfs_dir3_free {
2562
2563 #define XFS_DIR3_FREE_CRC_OFF offsetof(struct xfs_dir3_free, hdr.hdr.crc)
2564
2565 -/*
2566 - * In core version of the free block header, abstracted away from on-disk format
2567 - * differences. Use this in the code, and convert to/from the disk version using
2568 - * xfs_dir3_free_hdr_from_disk/xfs_dir3_free_hdr_to_disk.
2569 - */
2570 -struct xfs_dir3_icfree_hdr {
2571 - uint32_t magic;
2572 - uint32_t firstdb;
2573 - uint32_t nvalid;
2574 - uint32_t nused;
2575 -
2576 -};
2577 -
2578 /*
2579 * Single block format.
2580 *
2581 @@ -709,29 +675,6 @@ struct xfs_attr3_leafblock {
2582 */
2583 };
2584
2585 -/*
2586 - * incore, neutral version of the attribute leaf header
2587 - */
2588 -struct xfs_attr3_icleaf_hdr {
2589 - uint32_t forw;
2590 - uint32_t back;
2591 - uint16_t magic;
2592 - uint16_t count;
2593 - uint16_t usedbytes;
2594 - /*
2595 - * firstused is 32-bit here instead of 16-bit like the on-disk variant
2596 - * to support maximum fsb size of 64k without overflow issues throughout
2597 - * the attr code. Instead, the overflow condition is handled on
2598 - * conversion to/from disk.
2599 - */
2600 - uint32_t firstused;
2601 - __u8 holes;
2602 - struct {
2603 - uint16_t base;
2604 - uint16_t size;
2605 - } freemap[XFS_ATTR_LEAF_MAPSIZE];
2606 -};
2607 -
2608 /*
2609 * Special value to represent fs block size in the leaf header firstused field.
2610 * Only used when block size overflows the 2-bytes available on disk.
2611 @@ -740,8 +683,6 @@ struct xfs_attr3_icleaf_hdr {
2612
2613 /*
2614 * Flags used in the leaf_entry[i].flags field.
2615 - * NOTE: the INCOMPLETE bit must not collide with the flags bits specified
2616 - * on the system call, they are "or"ed together for various operations.
2617 */
2618 #define XFS_ATTR_LOCAL_BIT 0 /* attr is stored locally */
2619 #define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */
2620 diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
2621 index f542447794928..e170792c0acce 100644
2622 --- a/fs/xfs/libxfs/xfs_dir2.h
2623 +++ b/fs/xfs/libxfs/xfs_dir2.h
2624 @@ -18,6 +18,8 @@ struct xfs_dir2_sf_entry;
2625 struct xfs_dir2_data_hdr;
2626 struct xfs_dir2_data_entry;
2627 struct xfs_dir2_data_unused;
2628 +struct xfs_dir3_icfree_hdr;
2629 +struct xfs_dir3_icleaf_hdr;
2630
2631 extern struct xfs_name xfs_name_dotdot;
2632
2633 diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h
2634 index 59f9fb2241a5f..d2eaea663e7f2 100644
2635 --- a/fs/xfs/libxfs/xfs_dir2_priv.h
2636 +++ b/fs/xfs/libxfs/xfs_dir2_priv.h
2637 @@ -8,6 +8,25 @@
2638
2639 struct dir_context;
2640
2641 +/*
2642 + * In-core version of the leaf and free block headers to abstract the
2643 + * differences in the v2 and v3 disk format of the headers.
2644 + */
2645 +struct xfs_dir3_icleaf_hdr {
2646 + uint32_t forw;
2647 + uint32_t back;
2648 + uint16_t magic;
2649 + uint16_t count;
2650 + uint16_t stale;
2651 +};
2652 +
2653 +struct xfs_dir3_icfree_hdr {
2654 + uint32_t magic;
2655 + uint32_t firstdb;
2656 + uint32_t nvalid;
2657 + uint32_t nused;
2658 +};
2659 +
2660 /* xfs_dir2.c */
2661 extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space,
2662 xfs_dir2_db_t *dbp);
2663 diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
2664 index c968b60cee15b..28203b626f6a2 100644
2665 --- a/fs/xfs/libxfs/xfs_format.h
2666 +++ b/fs/xfs/libxfs/xfs_format.h
2667 @@ -1540,6 +1540,13 @@ typedef struct xfs_bmdr_block {
2668 #define BMBT_BLOCKCOUNT_BITLEN 21
2669
2670 #define BMBT_STARTOFF_MASK ((1ULL << BMBT_STARTOFF_BITLEN) - 1)
2671 +#define BMBT_BLOCKCOUNT_MASK ((1ULL << BMBT_BLOCKCOUNT_BITLEN) - 1)
2672 +
2673 +/*
2674 + * bmbt records have a file offset (block) field that is 54 bits wide, so this
2675 + * is the largest xfs_fileoff_t that we ever expect to see.
2676 + */
2677 +#define XFS_MAX_FILEOFF (BMBT_STARTOFF_MASK + BMBT_BLOCKCOUNT_MASK)
2678
2679 typedef struct xfs_bmbt_rec {
2680 __be64 l0, l1;
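
Aside (illustration only, not part of the patch): with a 54-bit startoff field and a 21-bit blockcount field, XFS_MAX_FILEOFF works out to roughly 2^54 filesystem blocks, i.e. about 2^66 bytes (64 EiB) at a 4k block size, comfortably above the 2^63-byte maximum file size the VFS supports on 64-bit. The stand-alone snippet below just reproduces that arithmetic:

/* Reproduces the XFS_MAX_FILEOFF arithmetic from the hunk above. */
#include <stdio.h>

#define BMBT_STARTOFF_BITLEN	54
#define BMBT_BLOCKCOUNT_BITLEN	21
#define BMBT_STARTOFF_MASK	((1ULL << BMBT_STARTOFF_BITLEN) - 1)
#define BMBT_BLOCKCOUNT_MASK	((1ULL << BMBT_BLOCKCOUNT_BITLEN) - 1)
#define XFS_MAX_FILEOFF		(BMBT_STARTOFF_MASK + BMBT_BLOCKCOUNT_MASK)

int main(void)
{
	printf("XFS_MAX_FILEOFF = %llu fs blocks\n",
	       (unsigned long long)XFS_MAX_FILEOFF);
	/* At 4096-byte blocks this is about 2^66 bytes of addressable offset. */
	return 0;
}
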
2681 diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
2682 index 766b1386402a0..9c88203b537b1 100644
2683 --- a/fs/xfs/xfs_attr_inactive.c
2684 +++ b/fs/xfs/xfs_attr_inactive.c
2685 @@ -25,22 +25,18 @@
2686 #include "xfs_error.h"
2687
2688 /*
2689 - * Look at all the extents for this logical region,
2690 - * invalidate any buffers that are incore/in transactions.
2691 + * Invalidate any incore buffers associated with this remote attribute value
2692 + * extent. We never log remote attribute value buffers, which means that they
2693 + * won't be attached to a transaction and are therefore safe to mark stale.
2694 + * The actual bunmapi will be taken care of later.
2695 */
2696 STATIC int
2697 -xfs_attr3_leaf_freextent(
2698 - struct xfs_trans **trans,
2699 +xfs_attr3_rmt_stale(
2700 struct xfs_inode *dp,
2701 xfs_dablk_t blkno,
2702 int blkcnt)
2703 {
2704 struct xfs_bmbt_irec map;
2705 - struct xfs_buf *bp;
2706 - xfs_dablk_t tblkno;
2707 - xfs_daddr_t dblkno;
2708 - int tblkcnt;
2709 - int dblkcnt;
2710 int nmap;
2711 int error;
2712
2713 @@ -48,47 +44,28 @@ xfs_attr3_leaf_freextent(
2714 * Roll through the "value", invalidating the attribute value's
2715 * blocks.
2716 */
2717 - tblkno = blkno;
2718 - tblkcnt = blkcnt;
2719 - while (tblkcnt > 0) {
2720 + while (blkcnt > 0) {
2721 /*
2722 * Try to remember where we decided to put the value.
2723 */
2724 nmap = 1;
2725 - error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt,
2726 + error = xfs_bmapi_read(dp, (xfs_fileoff_t)blkno, blkcnt,
2727 &map, &nmap, XFS_BMAPI_ATTRFORK);
2728 - if (error) {
2729 + if (error)
2730 return error;
2731 - }
2732 ASSERT(nmap == 1);
2733 - ASSERT(map.br_startblock != DELAYSTARTBLOCK);
2734
2735 /*
2736 - * If it's a hole, these are already unmapped
2737 - * so there's nothing to invalidate.
2738 + * Mark any incore buffers for the remote value as stale. We
2739 + * never log remote attr value buffers, so the buffer should be
2740 + * easy to kill.
2741 */
2742 - if (map.br_startblock != HOLESTARTBLOCK) {
2743 -
2744 - dblkno = XFS_FSB_TO_DADDR(dp->i_mount,
2745 - map.br_startblock);
2746 - dblkcnt = XFS_FSB_TO_BB(dp->i_mount,
2747 - map.br_blockcount);
2748 - bp = xfs_trans_get_buf(*trans,
2749 - dp->i_mount->m_ddev_targp,
2750 - dblkno, dblkcnt, 0);
2751 - if (!bp)
2752 - return -ENOMEM;
2753 - xfs_trans_binval(*trans, bp);
2754 - /*
2755 - * Roll to next transaction.
2756 - */
2757 - error = xfs_trans_roll_inode(trans, dp);
2758 - if (error)
2759 - return error;
2760 - }
2761 + error = xfs_attr_rmtval_stale(dp, &map, 0);
2762 + if (error)
2763 + return error;
2764
2765 - tblkno += map.br_blockcount;
2766 - tblkcnt -= map.br_blockcount;
2767 + blkno += map.br_blockcount;
2768 + blkcnt -= map.br_blockcount;
2769 }
2770
2771 return 0;
2772 @@ -102,86 +79,45 @@ xfs_attr3_leaf_freextent(
2773 */
2774 STATIC int
2775 xfs_attr3_leaf_inactive(
2776 - struct xfs_trans **trans,
2777 - struct xfs_inode *dp,
2778 - struct xfs_buf *bp)
2779 + struct xfs_trans **trans,
2780 + struct xfs_inode *dp,
2781 + struct xfs_buf *bp)
2782 {
2783 - struct xfs_attr_leafblock *leaf;
2784 - struct xfs_attr3_icleaf_hdr ichdr;
2785 - struct xfs_attr_leaf_entry *entry;
2786 + struct xfs_attr3_icleaf_hdr ichdr;
2787 + struct xfs_mount *mp = bp->b_mount;
2788 + struct xfs_attr_leafblock *leaf = bp->b_addr;
2789 + struct xfs_attr_leaf_entry *entry;
2790 struct xfs_attr_leaf_name_remote *name_rmt;
2791 - struct xfs_attr_inactive_list *list;
2792 - struct xfs_attr_inactive_list *lp;
2793 - int error;
2794 - int count;
2795 - int size;
2796 - int tmp;
2797 - int i;
2798 - struct xfs_mount *mp = bp->b_mount;
2799 + int error = 0;
2800 + int i;
2801
2802 - leaf = bp->b_addr;
2803 xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
2804
2805 /*
2806 - * Count the number of "remote" value extents.
2807 + * Find the remote value extents for this leaf and invalidate their
2808 + * incore buffers.
2809 */
2810 - count = 0;
2811 entry = xfs_attr3_leaf_entryp(leaf);
2812 for (i = 0; i < ichdr.count; entry++, i++) {
2813 - if (be16_to_cpu(entry->nameidx) &&
2814 - ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
2815 - name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
2816 - if (name_rmt->valueblk)
2817 - count++;
2818 - }
2819 - }
2820 -
2821 - /*
2822 - * If there are no "remote" values, we're done.
2823 - */
2824 - if (count == 0) {
2825 - xfs_trans_brelse(*trans, bp);
2826 - return 0;
2827 - }
2828 + int blkcnt;
2829
2830 - /*
2831 - * Allocate storage for a list of all the "remote" value extents.
2832 - */
2833 - size = count * sizeof(xfs_attr_inactive_list_t);
2834 - list = kmem_alloc(size, 0);
2835 -
2836 - /*
2837 - * Identify each of the "remote" value extents.
2838 - */
2839 - lp = list;
2840 - entry = xfs_attr3_leaf_entryp(leaf);
2841 - for (i = 0; i < ichdr.count; entry++, i++) {
2842 - if (be16_to_cpu(entry->nameidx) &&
2843 - ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
2844 - name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
2845 - if (name_rmt->valueblk) {
2846 - lp->valueblk = be32_to_cpu(name_rmt->valueblk);
2847 - lp->valuelen = xfs_attr3_rmt_blocks(dp->i_mount,
2848 - be32_to_cpu(name_rmt->valuelen));
2849 - lp++;
2850 - }
2851 - }
2852 - }
2853 - xfs_trans_brelse(*trans, bp); /* unlock for trans. in freextent() */
2854 + if (!entry->nameidx || (entry->flags & XFS_ATTR_LOCAL))
2855 + continue;
2856
2857 - /*
2858 - * Invalidate each of the "remote" value extents.
2859 - */
2860 - error = 0;
2861 - for (lp = list, i = 0; i < count; i++, lp++) {
2862 - tmp = xfs_attr3_leaf_freextent(trans, dp,
2863 - lp->valueblk, lp->valuelen);
2864 + name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
2865 + if (!name_rmt->valueblk)
2866 + continue;
2867
2868 - if (error == 0)
2869 - error = tmp; /* save only the 1st errno */
2870 + blkcnt = xfs_attr3_rmt_blocks(dp->i_mount,
2871 + be32_to_cpu(name_rmt->valuelen));
2872 + error = xfs_attr3_rmt_stale(dp,
2873 + be32_to_cpu(name_rmt->valueblk), blkcnt);
2874 + if (error)
2875 + goto err;
2876 }
2877
2878 - kmem_free(list);
2879 + xfs_trans_brelse(*trans, bp);
2880 +err:
2881 return error;
2882 }
2883
2884 diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
2885 index 203065a647652..e41c13ffa5a43 100644
2886 --- a/fs/xfs/xfs_file.c
2887 +++ b/fs/xfs/xfs_file.c
2888 @@ -187,7 +187,12 @@ xfs_file_dio_aio_read(
2889
2890 file_accessed(iocb->ki_filp);
2891
2892 - xfs_ilock(ip, XFS_IOLOCK_SHARED);
2893 + if (iocb->ki_flags & IOCB_NOWAIT) {
2894 + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
2895 + return -EAGAIN;
2896 + } else {
2897 + xfs_ilock(ip, XFS_IOLOCK_SHARED);
2898 + }
2899 ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL);
2900 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
2901
2902 diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
2903 index 7b72c189cff0b..30202d8c25e4f 100644
2904 --- a/fs/xfs/xfs_inode.c
2905 +++ b/fs/xfs/xfs_inode.c
2906 @@ -1513,10 +1513,8 @@ xfs_itruncate_extents_flags(
2907 struct xfs_mount *mp = ip->i_mount;
2908 struct xfs_trans *tp = *tpp;
2909 xfs_fileoff_t first_unmap_block;
2910 - xfs_fileoff_t last_block;
2911 xfs_filblks_t unmap_len;
2912 int error = 0;
2913 - int done = 0;
2914
2915 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
2916 ASSERT(!atomic_read(&VFS_I(ip)->i_count) ||
2917 @@ -1536,21 +1534,22 @@ xfs_itruncate_extents_flags(
2918 * the end of the file (in a crash where the space is allocated
2919 * but the inode size is not yet updated), simply remove any
2920 * blocks which show up between the new EOF and the maximum
2921 - * possible file size. If the first block to be removed is
2922 - * beyond the maximum file size (ie it is the same as last_block),
2923 - * then there is nothing to do.
2924 + * possible file size.
2925 + *
2926 + * We have to free all the blocks to the bmbt maximum offset, even if
2927 + * the page cache can't scale that far.
2928 */
2929 first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
2930 - last_block = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
2931 - if (first_unmap_block == last_block)
2932 + if (first_unmap_block >= XFS_MAX_FILEOFF) {
2933 + WARN_ON_ONCE(first_unmap_block > XFS_MAX_FILEOFF);
2934 return 0;
2935 + }
2936
2937 - ASSERT(first_unmap_block < last_block);
2938 - unmap_len = last_block - first_unmap_block + 1;
2939 - while (!done) {
2940 + unmap_len = XFS_MAX_FILEOFF - first_unmap_block + 1;
2941 + while (unmap_len > 0) {
2942 ASSERT(tp->t_firstblock == NULLFSBLOCK);
2943 - error = xfs_bunmapi(tp, ip, first_unmap_block, unmap_len, flags,
2944 - XFS_ITRUNC_MAX_EXTENTS, &done);
2945 + error = __xfs_bunmapi(tp, ip, first_unmap_block, &unmap_len,
2946 + flags, XFS_ITRUNC_MAX_EXTENTS);
2947 if (error)
2948 goto out;
2949
2950 @@ -1570,7 +1569,7 @@ xfs_itruncate_extents_flags(
2951 if (whichfork == XFS_DATA_FORK) {
2952 /* Remove all pending CoW reservations. */
2953 error = xfs_reflink_cancel_cow_blocks(ip, &tp,
2954 - first_unmap_block, last_block, true);
2955 + first_unmap_block, XFS_MAX_FILEOFF, true);
2956 if (error)
2957 goto out;
2958
2959 diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
2960 index 904d8285c2269..dfbf3f8f1ec86 100644
2961 --- a/fs/xfs/xfs_reflink.c
2962 +++ b/fs/xfs/xfs_reflink.c
2963 @@ -1544,7 +1544,8 @@ xfs_reflink_clear_inode_flag(
2964 * We didn't find any shared blocks so turn off the reflink flag.
2965 * First, get rid of any leftover CoW mappings.
2966 */
2967 - error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF, true);
2968 + error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, XFS_MAX_FILEOFF,
2969 + true);
2970 if (error)
2971 return error;
2972
2973 diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
2974 index 8d1df9f8be071..a3a54a0fbffea 100644
2975 --- a/fs/xfs/xfs_super.c
2976 +++ b/fs/xfs/xfs_super.c
2977 @@ -512,32 +512,6 @@ xfs_showargs(
2978 seq_puts(m, ",noquota");
2979 }
2980
2981 -static uint64_t
2982 -xfs_max_file_offset(
2983 - unsigned int blockshift)
2984 -{
2985 - unsigned int pagefactor = 1;
2986 - unsigned int bitshift = BITS_PER_LONG - 1;
2987 -
2988 - /* Figure out maximum filesize, on Linux this can depend on
2989 - * the filesystem blocksize (on 32 bit platforms).
2990 - * __block_write_begin does this in an [unsigned] long long...
2991 - * page->index << (PAGE_SHIFT - bbits)
2992 - * So, for page sized blocks (4K on 32 bit platforms),
2993 - * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is
2994 - * (((u64)PAGE_SIZE << (BITS_PER_LONG-1))-1)
2995 - * but for smaller blocksizes it is less (bbits = log2 bsize).
2996 - */
2997 -
2998 -#if BITS_PER_LONG == 32
2999 - ASSERT(sizeof(sector_t) == 8);
3000 - pagefactor = PAGE_SIZE;
3001 - bitshift = BITS_PER_LONG;
3002 -#endif
3003 -
3004 - return (((uint64_t)pagefactor) << bitshift) - 1;
3005 -}
3006 -
3007 /*
3008 * Set parameters for inode allocation heuristics, taking into account
3009 * filesystem size and inode32/inode64 mount options; i.e. specifically
3010 @@ -1650,6 +1624,26 @@ xfs_fs_fill_super(
3011 if (error)
3012 goto out_free_sb;
3013
3014 + /*
3015 + * XFS block mappings use 54 bits to store the logical block offset.
3016 + * This should suffice to handle the maximum file size that the VFS
3017 + * supports (currently 2^63 bytes on 64-bit and ULONG_MAX << PAGE_SHIFT
3018 + * bytes on 32-bit), but as XFS and VFS have gotten the s_maxbytes
3019 + * calculation wrong on 32-bit kernels in the past, we'll add a WARN_ON
3020 + * to check this assertion.
3021 + *
3022 + * Avoid integer overflow by comparing the maximum bmbt offset to the
3023 + * maximum pagecache offset in units of fs blocks.
3024 + */
3025 + if (XFS_B_TO_FSBT(mp, MAX_LFS_FILESIZE) > XFS_MAX_FILEOFF) {
3026 + xfs_warn(mp,
3027 +"MAX_LFS_FILESIZE block offset (%llu) exceeds extent map maximum (%llu)!",
3028 + XFS_B_TO_FSBT(mp, MAX_LFS_FILESIZE),
3029 + XFS_MAX_FILEOFF);
3030 + error = -EINVAL;
3031 + goto out_free_sb;
3032 + }
3033 +
3034 error = xfs_filestream_mount(mp);
3035 if (error)
3036 goto out_free_sb;
3037 @@ -1661,7 +1655,7 @@ xfs_fs_fill_super(
3038 sb->s_magic = XFS_SUPER_MAGIC;
3039 sb->s_blocksize = mp->m_sb.sb_blocksize;
3040 sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1;
3041 - sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits);
3042 + sb->s_maxbytes = MAX_LFS_FILESIZE;
3043 sb->s_max_links = XFS_MAXLINK;
3044 sb->s_time_gran = 1;
3045 sb->s_time_min = S32_MIN;
3046 diff --git a/include/linux/cpu.h b/include/linux/cpu.h
3047 index 29a6fa2f518db..b42e9c4134475 100644
3048 --- a/include/linux/cpu.h
3049 +++ b/include/linux/cpu.h
3050 @@ -68,6 +68,8 @@ extern ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr,
3051 extern ssize_t cpu_show_mmio_stale_data(struct device *dev,
3052 struct device_attribute *attr,
3053 char *buf);
3054 +extern ssize_t cpu_show_retbleed(struct device *dev,
3055 + struct device_attribute *attr, char *buf);
3056
3057 extern __printf(4, 5)
3058 struct device *cpu_device_create(struct device *parent, void *drvdata,
3059 diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
3060 index dd4cdad76b18e..ee7d57478a454 100644
3061 --- a/include/linux/kvm_host.h
3062 +++ b/include/linux/kvm_host.h
3063 @@ -955,7 +955,7 @@ static inline void kvm_arch_end_assignment(struct kvm *kvm)
3064 {
3065 }
3066
3067 -static inline bool kvm_arch_has_assigned_device(struct kvm *kvm)
3068 +static __always_inline bool kvm_arch_has_assigned_device(struct kvm *kvm)
3069 {
3070 return false;
3071 }
3072 diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
3073 index 4c56404e53a76..8265b99d6d55b 100644
3074 --- a/include/linux/mod_devicetable.h
3075 +++ b/include/linux/mod_devicetable.h
3076 @@ -672,9 +672,7 @@ struct x86_cpu_id {
3077 __u16 steppings;
3078 };
3079
3080 -#define X86_FEATURE_MATCH(x) \
3081 - { X86_VENDOR_ANY, X86_FAMILY_ANY, X86_MODEL_ANY, x }
3082 -
3083 +/* Wild cards for x86_cpu_id::vendor, family, model and feature */
3084 #define X86_VENDOR_ANY 0xffff
3085 #define X86_FAMILY_ANY 0
3086 #define X86_MODEL_ANY 0
3087 diff --git a/scripts/Makefile.extrawarn b/scripts/Makefile.extrawarn
3088 index 854e2ba9daa29..6a78afc6f13b4 100644
3089 --- a/scripts/Makefile.extrawarn
3090 +++ b/scripts/Makefile.extrawarn
3091 @@ -50,6 +50,7 @@ KBUILD_CFLAGS += -Wno-sign-compare
3092 KBUILD_CFLAGS += -Wno-format-zero-length
3093 KBUILD_CFLAGS += $(call cc-disable-warning, pointer-to-enum-cast)
3094 KBUILD_CFLAGS += $(call cc-disable-warning, unaligned-access)
3095 +KBUILD_CFLAGS += $(call cc-disable-warning, cast-function-type-strict)
3096 endif
3097
3098 endif
3099 diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h
3100 index 59f924e92c284..3efaf338d3257 100644
3101 --- a/tools/arch/x86/include/asm/cpufeatures.h
3102 +++ b/tools/arch/x86/include/asm/cpufeatures.h
3103 @@ -284,7 +284,7 @@
3104 #define X86_FEATURE_CQM_MBM_LOCAL (11*32+ 3) /* LLC Local MBM monitoring */
3105 #define X86_FEATURE_FENCE_SWAPGS_USER (11*32+ 4) /* "" LFENCE in user entry SWAPGS path */
3106 #define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */
3107 -#define X86_FEATURE_RSB_VMEXIT_LITE (11*32+ 6) /* "" Fill RSB on VM-Exit when EIBRS is enabled */
3108 +#define X86_FEATURE_RSB_VMEXIT_LITE (11*32+17) /* "" Fill RSB on VM-Exit when EIBRS is enabled */
3109
3110 /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
3111 #define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */