Annotation of /trunk/kernel-alx/patches-5.4/0316-5.4.217-all-fixes.patch
Parent Directory
|
Revision Log
Revision 3635 -
(hide annotations)
(download)
Mon Oct 24 12:34:12 2022 UTC (19 months, 3 weeks ago) by niro
File size: 103893 byte(s)
Mon Oct 24 12:34:12 2022 UTC (19 months, 3 weeks ago) by niro
File size: 103893 byte(s)
-sync kernel patches
1 | niro | 3635 | diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt |
2 | index db9d53b879f89..8f71a17ad5442 100644 | ||
3 | --- a/Documentation/admin-guide/kernel-parameters.txt | ||
4 | +++ b/Documentation/admin-guide/kernel-parameters.txt | ||
5 | @@ -4298,6 +4298,18 @@ | ||
6 | |||
7 | retain_initrd [RAM] Keep initrd memory after extraction | ||
8 | |||
9 | + retbleed= [X86] Control mitigation of RETBleed (Arbitrary | ||
10 | + Speculative Code Execution with Return Instructions) | ||
11 | + vulnerability. | ||
12 | + | ||
13 | + off - unconditionally disable | ||
14 | + auto - automatically select a mitigation | ||
15 | + | ||
16 | + Selecting 'auto' will choose a mitigation method at run | ||
17 | + time according to the CPU. | ||
18 | + | ||
19 | + Not specifying this option is equivalent to retbleed=auto. | ||
20 | + | ||
21 | rfkill.default_state= | ||
22 | 0 "airplane mode". All wifi, bluetooth, wimax, gps, fm, | ||
23 | etc. communication is blocked by default. | ||
24 | @@ -4541,6 +4553,7 @@ | ||
25 | eibrs - enhanced IBRS | ||
26 | eibrs,retpoline - enhanced IBRS + Retpolines | ||
27 | eibrs,lfence - enhanced IBRS + LFENCE | ||
28 | + ibrs - use IBRS to protect kernel | ||
29 | |||
30 | Not specifying this option is equivalent to | ||
31 | spectre_v2=auto. | ||
32 | diff --git a/Documentation/process/code-of-conduct-interpretation.rst b/Documentation/process/code-of-conduct-interpretation.rst | ||
33 | index e899f14a4ba24..4f8a06b00f608 100644 | ||
34 | --- a/Documentation/process/code-of-conduct-interpretation.rst | ||
35 | +++ b/Documentation/process/code-of-conduct-interpretation.rst | ||
36 | @@ -51,7 +51,7 @@ the Technical Advisory Board (TAB) or other maintainers if you're | ||
37 | uncertain how to handle situations that come up. It will not be | ||
38 | considered a violation report unless you want it to be. If you are | ||
39 | uncertain about approaching the TAB or any other maintainers, please | ||
40 | -reach out to our conflict mediator, Mishi Choudhary <mishi@linux.com>. | ||
41 | +reach out to our conflict mediator, Joanna Lee <joanna.lee@gesmer.com>. | ||
42 | |||
43 | In the end, "be kind to each other" is really what the end goal is for | ||
44 | everybody. We know everyone is human and we all fail at times, but the | ||
45 | diff --git a/Makefile b/Makefile | ||
46 | index 3d9d7ef6f8bf1..201ac8e410a94 100644 | ||
47 | --- a/Makefile | ||
48 | +++ b/Makefile | ||
49 | @@ -1,7 +1,7 @@ | ||
50 | # SPDX-License-Identifier: GPL-2.0 | ||
51 | VERSION = 5 | ||
52 | PATCHLEVEL = 4 | ||
53 | -SUBLEVEL = 216 | ||
54 | +SUBLEVEL = 217 | ||
55 | EXTRAVERSION = | ||
56 | NAME = Kleptomaniac Octopus | ||
57 | |||
58 | diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h | ||
59 | index b3f1214787386..29e5675c6d4f2 100644 | ||
60 | --- a/arch/x86/entry/calling.h | ||
61 | +++ b/arch/x86/entry/calling.h | ||
62 | @@ -6,6 +6,8 @@ | ||
63 | #include <asm/percpu.h> | ||
64 | #include <asm/asm-offsets.h> | ||
65 | #include <asm/processor-flags.h> | ||
66 | +#include <asm/msr.h> | ||
67 | +#include <asm/nospec-branch.h> | ||
68 | |||
69 | /* | ||
70 | |||
71 | @@ -146,27 +148,19 @@ For 32-bit we have the following conventions - kernel is built with | ||
72 | |||
73 | .endm | ||
74 | |||
75 | -.macro POP_REGS pop_rdi=1 skip_r11rcx=0 | ||
76 | +.macro POP_REGS pop_rdi=1 | ||
77 | popq %r15 | ||
78 | popq %r14 | ||
79 | popq %r13 | ||
80 | popq %r12 | ||
81 | popq %rbp | ||
82 | popq %rbx | ||
83 | - .if \skip_r11rcx | ||
84 | - popq %rsi | ||
85 | - .else | ||
86 | popq %r11 | ||
87 | - .endif | ||
88 | popq %r10 | ||
89 | popq %r9 | ||
90 | popq %r8 | ||
91 | popq %rax | ||
92 | - .if \skip_r11rcx | ||
93 | - popq %rsi | ||
94 | - .else | ||
95 | popq %rcx | ||
96 | - .endif | ||
97 | popq %rdx | ||
98 | popq %rsi | ||
99 | .if \pop_rdi | ||
100 | @@ -316,6 +310,62 @@ For 32-bit we have the following conventions - kernel is built with | ||
101 | |||
102 | #endif | ||
103 | |||
104 | +/* | ||
105 | + * IBRS kernel mitigation for Spectre_v2. | ||
106 | + * | ||
107 | + * Assumes full context is established (PUSH_REGS, CR3 and GS) and it clobbers | ||
108 | + * the regs it uses (AX, CX, DX). Must be called before the first RET | ||
109 | + * instruction (NOTE! UNTRAIN_RET includes a RET instruction) | ||
110 | + * | ||
111 | + * The optional argument is used to save/restore the current value, | ||
112 | + * which is used on the paranoid paths. | ||
113 | + * | ||
114 | + * Assumes x86_spec_ctrl_{base,current} to have SPEC_CTRL_IBRS set. | ||
115 | + */ | ||
116 | +.macro IBRS_ENTER save_reg | ||
117 | + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_KERNEL_IBRS | ||
118 | + movl $MSR_IA32_SPEC_CTRL, %ecx | ||
119 | + | ||
120 | +.ifnb \save_reg | ||
121 | + rdmsr | ||
122 | + shl $32, %rdx | ||
123 | + or %rdx, %rax | ||
124 | + mov %rax, \save_reg | ||
125 | + test $SPEC_CTRL_IBRS, %eax | ||
126 | + jz .Ldo_wrmsr_\@ | ||
127 | + lfence | ||
128 | + jmp .Lend_\@ | ||
129 | +.Ldo_wrmsr_\@: | ||
130 | +.endif | ||
131 | + | ||
132 | + movq PER_CPU_VAR(x86_spec_ctrl_current), %rdx | ||
133 | + movl %edx, %eax | ||
134 | + shr $32, %rdx | ||
135 | + wrmsr | ||
136 | +.Lend_\@: | ||
137 | +.endm | ||
138 | + | ||
139 | +/* | ||
140 | + * Similar to IBRS_ENTER, requires KERNEL GS,CR3 and clobbers (AX, CX, DX) | ||
141 | + * regs. Must be called after the last RET. | ||
142 | + */ | ||
143 | +.macro IBRS_EXIT save_reg | ||
144 | + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_KERNEL_IBRS | ||
145 | + movl $MSR_IA32_SPEC_CTRL, %ecx | ||
146 | + | ||
147 | +.ifnb \save_reg | ||
148 | + mov \save_reg, %rdx | ||
149 | +.else | ||
150 | + movq PER_CPU_VAR(x86_spec_ctrl_current), %rdx | ||
151 | + andl $(~SPEC_CTRL_IBRS), %edx | ||
152 | +.endif | ||
153 | + | ||
154 | + movl %edx, %eax | ||
155 | + shr $32, %rdx | ||
156 | + wrmsr | ||
157 | +.Lend_\@: | ||
158 | +.endm | ||
159 | + | ||
160 | /* | ||
161 | * Mitigate Spectre v1 for conditional swapgs code paths. | ||
162 | * | ||
163 | diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S | ||
164 | index bde3e0f85425f..2d837fb54c31b 100644 | ||
165 | --- a/arch/x86/entry/entry_32.S | ||
166 | +++ b/arch/x86/entry/entry_32.S | ||
167 | @@ -750,7 +750,6 @@ ENTRY(__switch_to_asm) | ||
168 | movl %ebx, PER_CPU_VAR(stack_canary)+stack_canary_offset | ||
169 | #endif | ||
170 | |||
171 | -#ifdef CONFIG_RETPOLINE | ||
172 | /* | ||
173 | * When switching from a shallower to a deeper call stack | ||
174 | * the RSB may either underflow or use entries populated | ||
175 | @@ -759,7 +758,6 @@ ENTRY(__switch_to_asm) | ||
176 | * speculative execution to prevent attack. | ||
177 | */ | ||
178 | FILL_RETURN_BUFFER %ebx, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW | ||
179 | -#endif | ||
180 | |||
181 | /* restore callee-saved registers */ | ||
182 | popfl | ||
183 | diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S | ||
184 | index 2ba3d53ac5b11..c82136030d58f 100644 | ||
185 | --- a/arch/x86/entry/entry_64.S | ||
186 | +++ b/arch/x86/entry/entry_64.S | ||
187 | @@ -172,6 +172,10 @@ GLOBAL(entry_SYSCALL_64_after_hwframe) | ||
188 | /* IRQs are off. */ | ||
189 | movq %rax, %rdi | ||
190 | movq %rsp, %rsi | ||
191 | + | ||
192 | + /* clobbers %rax, make sure it is after saving the syscall nr */ | ||
193 | + IBRS_ENTER | ||
194 | + | ||
195 | call do_syscall_64 /* returns with IRQs disabled */ | ||
196 | |||
197 | TRACE_IRQS_IRETQ /* we're about to change IF */ | ||
198 | @@ -248,8 +252,8 @@ GLOBAL(entry_SYSCALL_64_after_hwframe) | ||
199 | * perf profiles. Nothing jumps here. | ||
200 | */ | ||
201 | syscall_return_via_sysret: | ||
202 | - /* rcx and r11 are already restored (see code above) */ | ||
203 | - POP_REGS pop_rdi=0 skip_r11rcx=1 | ||
204 | + IBRS_EXIT | ||
205 | + POP_REGS pop_rdi=0 | ||
206 | |||
207 | /* | ||
208 | * Now all regs are restored except RSP and RDI. | ||
209 | @@ -301,7 +305,6 @@ ENTRY(__switch_to_asm) | ||
210 | movq %rbx, PER_CPU_VAR(fixed_percpu_data) + stack_canary_offset | ||
211 | #endif | ||
212 | |||
213 | -#ifdef CONFIG_RETPOLINE | ||
214 | /* | ||
215 | * When switching from a shallower to a deeper call stack | ||
216 | * the RSB may either underflow or use entries populated | ||
217 | @@ -310,7 +313,6 @@ ENTRY(__switch_to_asm) | ||
218 | * speculative execution to prevent attack. | ||
219 | */ | ||
220 | FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW | ||
221 | -#endif | ||
222 | |||
223 | /* restore callee-saved registers */ | ||
224 | popq %r15 | ||
225 | @@ -622,6 +624,7 @@ GLOBAL(retint_user) | ||
226 | TRACE_IRQS_IRETQ | ||
227 | |||
228 | GLOBAL(swapgs_restore_regs_and_return_to_usermode) | ||
229 | + IBRS_EXIT | ||
230 | #ifdef CONFIG_DEBUG_ENTRY | ||
231 | /* Assert that pt_regs indicates user mode. */ | ||
232 | testb $3, CS(%rsp) | ||
233 | @@ -1248,7 +1251,13 @@ ENTRY(paranoid_entry) | ||
234 | */ | ||
235 | FENCE_SWAPGS_KERNEL_ENTRY | ||
236 | |||
237 | - ret | ||
238 | + /* | ||
239 | + * Once we have CR3 and %GS setup save and set SPEC_CTRL. Just like | ||
240 | + * CR3 above, keep the old value in a callee saved register. | ||
241 | + */ | ||
242 | + IBRS_ENTER save_reg=%r15 | ||
243 | + | ||
244 | + RET | ||
245 | END(paranoid_entry) | ||
246 | |||
247 | /* | ||
248 | @@ -1276,12 +1285,20 @@ ENTRY(paranoid_exit) | ||
249 | jmp .Lparanoid_exit_restore | ||
250 | .Lparanoid_exit_no_swapgs: | ||
251 | TRACE_IRQS_IRETQ_DEBUG | ||
252 | + | ||
253 | + /* | ||
254 | + * Must restore IBRS state before both CR3 and %GS since we need access | ||
255 | + * to the per-CPU x86_spec_ctrl_current variable. | ||
256 | + */ | ||
257 | + IBRS_EXIT save_reg=%r15 | ||
258 | + | ||
259 | /* Always restore stashed CR3 value (see paranoid_entry) */ | ||
260 | RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 | ||
261 | .Lparanoid_exit_restore: | ||
262 | jmp restore_regs_and_return_to_kernel | ||
263 | END(paranoid_exit) | ||
264 | |||
265 | + | ||
266 | /* | ||
267 | * Save all registers in pt_regs, and switch GS if needed. | ||
268 | */ | ||
269 | @@ -1301,6 +1318,7 @@ ENTRY(error_entry) | ||
270 | FENCE_SWAPGS_USER_ENTRY | ||
271 | /* We have user CR3. Change to kernel CR3. */ | ||
272 | SWITCH_TO_KERNEL_CR3 scratch_reg=%rax | ||
273 | + IBRS_ENTER | ||
274 | |||
275 | .Lerror_entry_from_usermode_after_swapgs: | ||
276 | /* Put us onto the real thread stack. */ | ||
277 | @@ -1356,6 +1374,7 @@ ENTRY(error_entry) | ||
278 | SWAPGS | ||
279 | FENCE_SWAPGS_USER_ENTRY | ||
280 | SWITCH_TO_KERNEL_CR3 scratch_reg=%rax | ||
281 | + IBRS_ENTER | ||
282 | |||
283 | /* | ||
284 | * Pretend that the exception came from user mode: set up pt_regs | ||
285 | @@ -1461,6 +1480,8 @@ ENTRY(nmi) | ||
286 | PUSH_AND_CLEAR_REGS rdx=(%rdx) | ||
287 | ENCODE_FRAME_POINTER | ||
288 | |||
289 | + IBRS_ENTER | ||
290 | + | ||
291 | /* | ||
292 | * At this point we no longer need to worry about stack damage | ||
293 | * due to nesting -- we're on the normal thread stack and we're | ||
294 | @@ -1684,6 +1705,9 @@ end_repeat_nmi: | ||
295 | movq $-1, %rsi | ||
296 | call do_nmi | ||
297 | |||
298 | + /* Always restore stashed SPEC_CTRL value (see paranoid_entry) */ | ||
299 | + IBRS_EXIT save_reg=%r15 | ||
300 | + | ||
301 | /* Always restore stashed CR3 value (see paranoid_entry) */ | ||
302 | RESTORE_CR3 scratch_reg=%r15 save_reg=%r14 | ||
303 | |||
304 | diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S | ||
305 | index 39913770a44d5..c3c4ea4a6711a 100644 | ||
306 | --- a/arch/x86/entry/entry_64_compat.S | ||
307 | +++ b/arch/x86/entry/entry_64_compat.S | ||
308 | @@ -4,7 +4,6 @@ | ||
309 | * | ||
310 | * Copyright 2000-2002 Andi Kleen, SuSE Labs. | ||
311 | */ | ||
312 | -#include "calling.h" | ||
313 | #include <asm/asm-offsets.h> | ||
314 | #include <asm/current.h> | ||
315 | #include <asm/errno.h> | ||
316 | @@ -17,6 +16,8 @@ | ||
317 | #include <linux/linkage.h> | ||
318 | #include <linux/err.h> | ||
319 | |||
320 | +#include "calling.h" | ||
321 | + | ||
322 | .section .entry.text, "ax" | ||
323 | |||
324 | /* | ||
325 | @@ -106,6 +107,8 @@ ENTRY(entry_SYSENTER_compat) | ||
326 | xorl %r15d, %r15d /* nospec r15 */ | ||
327 | cld | ||
328 | |||
329 | + IBRS_ENTER | ||
330 | + | ||
331 | /* | ||
332 | * SYSENTER doesn't filter flags, so we need to clear NT and AC | ||
333 | * ourselves. To save a few cycles, we can check whether | ||
334 | @@ -253,6 +256,8 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe) | ||
335 | */ | ||
336 | TRACE_IRQS_OFF | ||
337 | |||
338 | + IBRS_ENTER | ||
339 | + | ||
340 | movq %rsp, %rdi | ||
341 | call do_fast_syscall_32 | ||
342 | /* XEN PV guests always use IRET path */ | ||
343 | @@ -267,6 +272,9 @@ sysret32_from_system_call: | ||
344 | */ | ||
345 | STACKLEAK_ERASE | ||
346 | TRACE_IRQS_ON /* User mode traces as IRQs on. */ | ||
347 | + | ||
348 | + IBRS_EXIT | ||
349 | + | ||
350 | movq RBX(%rsp), %rbx /* pt_regs->rbx */ | ||
351 | movq RBP(%rsp), %rbp /* pt_regs->rbp */ | ||
352 | movq EFLAGS(%rsp), %r11 /* pt_regs->flags (in r11) */ | ||
353 | @@ -408,6 +416,7 @@ ENTRY(entry_INT80_compat) | ||
354 | * gate turned them off. | ||
355 | */ | ||
356 | TRACE_IRQS_OFF | ||
357 | + IBRS_ENTER | ||
358 | |||
359 | movq %rsp, %rdi | ||
360 | call do_int80_syscall_32 | ||
361 | diff --git a/arch/x86/include/asm/cpu_device_id.h b/arch/x86/include/asm/cpu_device_id.h | ||
362 | index 0c814cd9ea42c..cdf39decf7340 100644 | ||
363 | --- a/arch/x86/include/asm/cpu_device_id.h | ||
364 | +++ b/arch/x86/include/asm/cpu_device_id.h | ||
365 | @@ -5,15 +5,22 @@ | ||
366 | /* | ||
367 | * Declare drivers belonging to specific x86 CPUs | ||
368 | * Similar in spirit to pci_device_id and related PCI functions | ||
369 | + * | ||
370 | + * The wildcard initializers are in mod_devicetable.h because | ||
371 | + * file2alias needs them. Sigh. | ||
372 | */ | ||
373 | - | ||
374 | #include <linux/mod_devicetable.h> | ||
375 | +/* Get the INTEL_FAM* model defines */ | ||
376 | +#include <asm/intel-family.h> | ||
377 | +/* And the X86_VENDOR_* ones */ | ||
378 | +#include <asm/processor.h> | ||
379 | |||
380 | +/* Centaur FAM6 models */ | ||
381 | +#define X86_CENTAUR_FAM6_C7_A 0xa | ||
382 | #define X86_CENTAUR_FAM6_C7_D 0xd | ||
383 | #define X86_CENTAUR_FAM6_NANO 0xf | ||
384 | |||
385 | #define X86_STEPPINGS(mins, maxs) GENMASK(maxs, mins) | ||
386 | - | ||
387 | /** | ||
388 | * X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE - Base macro for CPU matching | ||
389 | * @_vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY | ||
390 | @@ -26,8 +33,11 @@ | ||
391 | * format is unsigned long. The supplied value, pointer | ||
392 | * etc. is casted to unsigned long internally. | ||
393 | * | ||
394 | - * Backport version to keep the SRBDS pile consistant. No shorter variants | ||
395 | - * required for this. | ||
396 | + * Use only if you need all selectors. Otherwise use one of the shorter | ||
397 | + * macros of the X86_MATCH_* family. If there is no matching shorthand | ||
398 | + * macro, consider to add one. If you really need to wrap one of the macros | ||
399 | + * into another macro at the usage site for good reasons, then please | ||
400 | + * start this local macro with X86_MATCH to allow easy grepping. | ||
401 | */ | ||
402 | #define X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(_vendor, _family, _model, \ | ||
403 | _steppings, _feature, _data) { \ | ||
404 | @@ -39,6 +49,120 @@ | ||
405 | .driver_data = (unsigned long) _data \ | ||
406 | } | ||
407 | |||
408 | +/** | ||
409 | + * X86_MATCH_VENDOR_FAM_MODEL_FEATURE - Macro for CPU matching | ||
410 | + * @vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY | ||
411 | + * The name is expanded to X86_VENDOR_@vendor | ||
412 | + * @family: The family number or X86_FAMILY_ANY | ||
413 | + * @model: The model number, model constant or X86_MODEL_ANY | ||
414 | + * @feature: A X86_FEATURE bit or X86_FEATURE_ANY | ||
415 | + * @data: Driver specific data or NULL. The internal storage | ||
416 | + * format is unsigned long. The supplied value, pointer | ||
417 | + * etc. is casted to unsigned long internally. | ||
418 | + * | ||
419 | + * The steppings argument of X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE() is | ||
420 | + * set to a wildcard. | ||
421 | + */ | ||
422 | +#define X86_MATCH_VENDOR_FAM_MODEL_FEATURE(vendor, family, model, feature, data) \ | ||
423 | + X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(vendor, family, model, \ | ||
424 | + X86_STEPPING_ANY, feature, data) | ||
425 | + | ||
426 | +/** | ||
427 | + * X86_MATCH_VENDOR_FAM_FEATURE - Macro for matching vendor, family and CPU feature | ||
428 | + * @vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY | ||
429 | + * The name is expanded to X86_VENDOR_@vendor | ||
430 | + * @family: The family number or X86_FAMILY_ANY | ||
431 | + * @feature: A X86_FEATURE bit | ||
432 | + * @data: Driver specific data or NULL. The internal storage | ||
433 | + * format is unsigned long. The supplied value, pointer | ||
434 | + * etc. is casted to unsigned long internally. | ||
435 | + * | ||
436 | + * All other missing arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are | ||
437 | + * set to wildcards. | ||
438 | + */ | ||
439 | +#define X86_MATCH_VENDOR_FAM_FEATURE(vendor, family, feature, data) \ | ||
440 | + X86_MATCH_VENDOR_FAM_MODEL_FEATURE(vendor, family, \ | ||
441 | + X86_MODEL_ANY, feature, data) | ||
442 | + | ||
443 | +/** | ||
444 | + * X86_MATCH_VENDOR_FEATURE - Macro for matching vendor and CPU feature | ||
445 | + * @vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY | ||
446 | + * The name is expanded to X86_VENDOR_@vendor | ||
447 | + * @feature: A X86_FEATURE bit | ||
448 | + * @data: Driver specific data or NULL. The internal storage | ||
449 | + * format is unsigned long. The supplied value, pointer | ||
450 | + * etc. is casted to unsigned long internally. | ||
451 | + * | ||
452 | + * All other missing arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are | ||
453 | + * set to wildcards. | ||
454 | + */ | ||
455 | +#define X86_MATCH_VENDOR_FEATURE(vendor, feature, data) \ | ||
456 | + X86_MATCH_VENDOR_FAM_FEATURE(vendor, X86_FAMILY_ANY, feature, data) | ||
457 | + | ||
458 | +/** | ||
459 | + * X86_MATCH_FEATURE - Macro for matching a CPU feature | ||
460 | + * @feature: A X86_FEATURE bit | ||
461 | + * @data: Driver specific data or NULL. The internal storage | ||
462 | + * format is unsigned long. The supplied value, pointer | ||
463 | + * etc. is casted to unsigned long internally. | ||
464 | + * | ||
465 | + * All other missing arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are | ||
466 | + * set to wildcards. | ||
467 | + */ | ||
468 | +#define X86_MATCH_FEATURE(feature, data) \ | ||
469 | + X86_MATCH_VENDOR_FEATURE(ANY, feature, data) | ||
470 | + | ||
471 | +/* Transitional to keep the existing code working */ | ||
472 | +#define X86_FEATURE_MATCH(feature) X86_MATCH_FEATURE(feature, NULL) | ||
473 | + | ||
474 | +/** | ||
475 | + * X86_MATCH_VENDOR_FAM_MODEL - Match vendor, family and model | ||
476 | + * @vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY | ||
477 | + * The name is expanded to X86_VENDOR_@vendor | ||
478 | + * @family: The family number or X86_FAMILY_ANY | ||
479 | + * @model: The model number, model constant or X86_MODEL_ANY | ||
480 | + * @data: Driver specific data or NULL. The internal storage | ||
481 | + * format is unsigned long. The supplied value, pointer | ||
482 | + * etc. is casted to unsigned long internally. | ||
483 | + * | ||
484 | + * All other missing arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are | ||
485 | + * set to wildcards. | ||
486 | + */ | ||
487 | +#define X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, data) \ | ||
488 | + X86_MATCH_VENDOR_FAM_MODEL_FEATURE(vendor, family, model, \ | ||
489 | + X86_FEATURE_ANY, data) | ||
490 | + | ||
491 | +/** | ||
492 | + * X86_MATCH_VENDOR_FAM - Match vendor and family | ||
493 | + * @vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY | ||
494 | + * The name is expanded to X86_VENDOR_@vendor | ||
495 | + * @family: The family number or X86_FAMILY_ANY | ||
496 | + * @data: Driver specific data or NULL. The internal storage | ||
497 | + * format is unsigned long. The supplied value, pointer | ||
498 | + * etc. is casted to unsigned long internally. | ||
499 | + * | ||
500 | + * All other missing arguments to X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are | ||
501 | + * set to wildcards. | ||
502 | + */ | ||
503 | +#define X86_MATCH_VENDOR_FAM(vendor, family, data) \ | ||
504 | + X86_MATCH_VENDOR_FAM_MODEL(vendor, family, X86_MODEL_ANY, data) | ||
505 | + | ||
506 | +/** | ||
507 | + * X86_MATCH_INTEL_FAM6_MODEL - Match vendor INTEL, family 6 and model | ||
508 | + * @model: The model name without the INTEL_FAM6_ prefix or ANY | ||
509 | + * The model name is expanded to INTEL_FAM6_@model internally | ||
510 | + * @data: Driver specific data or NULL. The internal storage | ||
511 | + * format is unsigned long. The supplied value, pointer | ||
512 | + * etc. is casted to unsigned long internally. | ||
513 | + * | ||
514 | + * The vendor is set to INTEL, the family to 6 and all other missing | ||
515 | + * arguments of X86_MATCH_VENDOR_FAM_MODEL_FEATURE() are set to wildcards. | ||
516 | + * | ||
517 | + * See X86_MATCH_VENDOR_FAM_MODEL_FEATURE() for further information. | ||
518 | + */ | ||
519 | +#define X86_MATCH_INTEL_FAM6_MODEL(model, data) \ | ||
520 | + X86_MATCH_VENDOR_FAM_MODEL(INTEL, 6, INTEL_FAM6_##model, data) | ||
521 | + | ||
522 | /* | ||
523 | * Match specific microcode revisions. | ||
524 | * | ||
525 | diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h | ||
526 | index 736b0e412344b..2ec85d7bfdff2 100644 | ||
527 | --- a/arch/x86/include/asm/cpufeatures.h | ||
528 | +++ b/arch/x86/include/asm/cpufeatures.h | ||
529 | @@ -203,8 +203,8 @@ | ||
530 | #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ | ||
531 | #define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ | ||
532 | #define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */ | ||
533 | -#define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ | ||
534 | -#define X86_FEATURE_RETPOLINE_LFENCE ( 7*32+13) /* "" Use LFENCE for Spectre variant 2 */ | ||
535 | +#define X86_FEATURE_KERNEL_IBRS ( 7*32+12) /* "" Set/clear IBRS on kernel entry/exit */ | ||
536 | +#define X86_FEATURE_RSB_VMEXIT ( 7*32+13) /* "" Fill RSB on VM-Exit */ | ||
537 | #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ | ||
538 | #define X86_FEATURE_CDP_L2 ( 7*32+15) /* Code and Data Prioritization L2 */ | ||
539 | #define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* "" MSR SPEC_CTRL is implemented */ | ||
540 | @@ -286,7 +286,10 @@ | ||
541 | #define X86_FEATURE_CQM_MBM_LOCAL (11*32+ 3) /* LLC Local MBM monitoring */ | ||
542 | #define X86_FEATURE_FENCE_SWAPGS_USER (11*32+ 4) /* "" LFENCE in user entry SWAPGS path */ | ||
543 | #define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */ | ||
544 | -#define X86_FEATURE_RSB_VMEXIT_LITE (11*32+ 6) /* "" Fill RSB on VM exit when EIBRS is enabled */ | ||
545 | +#define X86_FEATURE_RRSBA_CTRL (11*32+11) /* "" RET prediction control */ | ||
546 | +#define X86_FEATURE_RETPOLINE (11*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ | ||
547 | +#define X86_FEATURE_RETPOLINE_LFENCE (11*32+13) /* "" Use LFENCE for Spectre variant 2 */ | ||
548 | +#define X86_FEATURE_RSB_VMEXIT_LITE (11*32+17) /* "" Fill RSB on VM exit when EIBRS is enabled */ | ||
549 | |||
550 | /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ | ||
551 | #define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */ | ||
552 | @@ -303,6 +306,7 @@ | ||
553 | #define X86_FEATURE_AMD_SSBD (13*32+24) /* "" Speculative Store Bypass Disable */ | ||
554 | #define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */ | ||
555 | #define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */ | ||
556 | +#define X86_FEATURE_BTC_NO (13*32+29) /* "" Not vulnerable to Branch Type Confusion */ | ||
557 | |||
558 | /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ | ||
559 | #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ | ||
560 | @@ -407,7 +411,8 @@ | ||
561 | #define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */ | ||
562 | #define X86_BUG_SRBDS X86_BUG(24) /* CPU may leak RNG bits if not mitigated */ | ||
563 | #define X86_BUG_MMIO_STALE_DATA X86_BUG(25) /* CPU is affected by Processor MMIO Stale Data vulnerabilities */ | ||
564 | -#define X86_BUG_MMIO_UNKNOWN X86_BUG(26) /* CPU is too old and its MMIO Stale Data status is unknown */ | ||
565 | +#define X86_BUG_RETBLEED X86_BUG(26) /* CPU is affected by RETBleed */ | ||
566 | #define X86_BUG_EIBRS_PBRSB X86_BUG(27) /* EIBRS is vulnerable to Post Barrier RSB Predictions */ | ||
567 | +#define X86_BUG_MMIO_UNKNOWN X86_BUG(28) /* CPU is too old and its MMIO Stale Data status is unknown */ | ||
568 | |||
569 | #endif /* _ASM_X86_CPUFEATURES_H */ | ||
570 | diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h | ||
571 | index 5b07573c3bc87..c1d6d8bbb7dad 100644 | ||
572 | --- a/arch/x86/include/asm/intel-family.h | ||
573 | +++ b/arch/x86/include/asm/intel-family.h | ||
574 | @@ -35,6 +35,9 @@ | ||
575 | * The #define line may optionally include a comment including platform names. | ||
576 | */ | ||
577 | |||
578 | +/* Wildcard match for FAM6 so X86_MATCH_INTEL_FAM6_MODEL(ANY) works */ | ||
579 | +#define INTEL_FAM6_ANY X86_MODEL_ANY | ||
580 | + | ||
581 | #define INTEL_FAM6_CORE_YONAH 0x0E | ||
582 | |||
583 | #define INTEL_FAM6_CORE2_MEROM 0x0F | ||
584 | @@ -126,6 +129,9 @@ | ||
585 | #define INTEL_FAM6_XEON_PHI_KNL 0x57 /* Knights Landing */ | ||
586 | #define INTEL_FAM6_XEON_PHI_KNM 0x85 /* Knights Mill */ | ||
587 | |||
588 | +/* Family 5 */ | ||
589 | +#define INTEL_FAM5_QUARK_X1000 0x09 /* Quark X1000 SoC */ | ||
590 | + | ||
591 | /* Useful macros */ | ||
592 | #define INTEL_CPU_FAM_ANY(_family, _model, _driver_data) \ | ||
593 | { \ | ||
594 | diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h | ||
595 | index cef4eba03ff36..713886d5493a8 100644 | ||
596 | --- a/arch/x86/include/asm/msr-index.h | ||
597 | +++ b/arch/x86/include/asm/msr-index.h | ||
598 | @@ -47,6 +47,8 @@ | ||
599 | #define SPEC_CTRL_STIBP BIT(SPEC_CTRL_STIBP_SHIFT) /* STIBP mask */ | ||
600 | #define SPEC_CTRL_SSBD_SHIFT 2 /* Speculative Store Bypass Disable bit */ | ||
601 | #define SPEC_CTRL_SSBD BIT(SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */ | ||
602 | +#define SPEC_CTRL_RRSBA_DIS_S_SHIFT 6 /* Disable RRSBA behavior */ | ||
603 | +#define SPEC_CTRL_RRSBA_DIS_S BIT(SPEC_CTRL_RRSBA_DIS_S_SHIFT) | ||
604 | |||
605 | #define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */ | ||
606 | #define PRED_CMD_IBPB BIT(0) /* Indirect Branch Prediction Barrier */ | ||
607 | @@ -82,6 +84,7 @@ | ||
608 | #define MSR_IA32_ARCH_CAPABILITIES 0x0000010a | ||
609 | #define ARCH_CAP_RDCL_NO BIT(0) /* Not susceptible to Meltdown */ | ||
610 | #define ARCH_CAP_IBRS_ALL BIT(1) /* Enhanced IBRS support */ | ||
611 | +#define ARCH_CAP_RSBA BIT(2) /* RET may use alternative branch predictors */ | ||
612 | #define ARCH_CAP_SKIP_VMENTRY_L1DFLUSH BIT(3) /* Skip L1D flush on vmentry */ | ||
613 | #define ARCH_CAP_SSB_NO BIT(4) /* | ||
614 | * Not susceptible to Speculative Store Bypass | ||
615 | @@ -129,6 +132,13 @@ | ||
616 | * bit available to control VERW | ||
617 | * behavior. | ||
618 | */ | ||
619 | +#define ARCH_CAP_RRSBA BIT(19) /* | ||
620 | + * Indicates RET may use predictors | ||
621 | + * other than the RSB. With eIBRS | ||
622 | + * enabled predictions in kernel mode | ||
623 | + * are restricted to targets in | ||
624 | + * kernel. | ||
625 | + */ | ||
626 | #define ARCH_CAP_PBRSB_NO BIT(24) /* | ||
627 | * Not susceptible to Post-Barrier | ||
628 | * Return Stack Buffer Predictions. | ||
629 | diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h | ||
630 | index a1ee1a760c3eb..8c898eed28941 100644 | ||
631 | --- a/arch/x86/include/asm/nospec-branch.h | ||
632 | +++ b/arch/x86/include/asm/nospec-branch.h | ||
633 | @@ -4,11 +4,14 @@ | ||
634 | #define _ASM_X86_NOSPEC_BRANCH_H_ | ||
635 | |||
636 | #include <linux/static_key.h> | ||
637 | +#include <linux/frame.h> | ||
638 | |||
639 | #include <asm/alternative.h> | ||
640 | #include <asm/alternative-asm.h> | ||
641 | #include <asm/cpufeatures.h> | ||
642 | #include <asm/msr-index.h> | ||
643 | +#include <asm/unwind_hints.h> | ||
644 | +#include <asm/percpu.h> | ||
645 | |||
646 | /* | ||
647 | * This should be used immediately before a retpoline alternative. It tells | ||
648 | @@ -60,9 +63,9 @@ | ||
649 | lfence; \ | ||
650 | jmp 775b; \ | ||
651 | 774: \ | ||
652 | + add $(BITS_PER_LONG/8) * 2, sp; \ | ||
653 | dec reg; \ | ||
654 | jnz 771b; \ | ||
655 | - add $(BITS_PER_LONG/8) * nr, sp; \ | ||
656 | /* barrier for jnz misprediction */ \ | ||
657 | lfence; | ||
658 | #else | ||
659 | @@ -79,13 +82,6 @@ | ||
660 | add $(BITS_PER_LONG/8) * nr, sp; | ||
661 | #endif | ||
662 | |||
663 | -#define __ISSUE_UNBALANCED_RET_GUARD(sp) \ | ||
664 | - call 881f; \ | ||
665 | - int3; \ | ||
666 | -881: \ | ||
667 | - add $(BITS_PER_LONG/8), sp; \ | ||
668 | - lfence; | ||
669 | - | ||
670 | #ifdef __ASSEMBLY__ | ||
671 | |||
672 | /* | ||
673 | @@ -155,26 +151,28 @@ | ||
674 | #endif | ||
675 | .endm | ||
676 | |||
677 | -.macro ISSUE_UNBALANCED_RET_GUARD ftr:req | ||
678 | - ANNOTATE_NOSPEC_ALTERNATIVE | ||
679 | - ALTERNATIVE "jmp .Lskip_pbrsb_\@", \ | ||
680 | - __stringify(__ISSUE_UNBALANCED_RET_GUARD(%_ASM_SP)) \ | ||
681 | - \ftr | ||
682 | -.Lskip_pbrsb_\@: | ||
683 | +.macro ISSUE_UNBALANCED_RET_GUARD | ||
684 | + call .Lunbalanced_ret_guard_\@ | ||
685 | + int3 | ||
686 | +.Lunbalanced_ret_guard_\@: | ||
687 | + add $(BITS_PER_LONG/8), %_ASM_SP | ||
688 | + lfence | ||
689 | .endm | ||
690 | |||
691 | /* | ||
692 | * A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP | ||
693 | * monstrosity above, manually. | ||
694 | */ | ||
695 | -.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req | ||
696 | -#ifdef CONFIG_RETPOLINE | ||
697 | - ANNOTATE_NOSPEC_ALTERNATIVE | ||
698 | - ALTERNATIVE "jmp .Lskip_rsb_\@", \ | ||
699 | - __stringify(__FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP)) \ | ||
700 | - \ftr | ||
701 | +.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req ftr2 | ||
702 | +.ifb \ftr2 | ||
703 | + ALTERNATIVE "jmp .Lskip_rsb_\@", "", \ftr | ||
704 | +.else | ||
705 | + ALTERNATIVE_2 "jmp .Lskip_rsb_\@", "", \ftr, "jmp .Lunbalanced_\@", \ftr2 | ||
706 | +.endif | ||
707 | + __FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP) | ||
708 | +.Lunbalanced_\@: | ||
709 | + ISSUE_UNBALANCED_RET_GUARD | ||
710 | .Lskip_rsb_\@: | ||
711 | -#endif | ||
712 | .endm | ||
713 | |||
714 | #else /* __ASSEMBLY__ */ | ||
715 | @@ -249,6 +247,7 @@ enum spectre_v2_mitigation { | ||
716 | SPECTRE_V2_EIBRS, | ||
717 | SPECTRE_V2_EIBRS_RETPOLINE, | ||
718 | SPECTRE_V2_EIBRS_LFENCE, | ||
719 | + SPECTRE_V2_IBRS, | ||
720 | }; | ||
721 | |||
722 | /* The indirect branch speculation control variants */ | ||
723 | @@ -312,6 +311,9 @@ static inline void indirect_branch_prediction_barrier(void) | ||
724 | |||
725 | /* The Intel SPEC CTRL MSR base value cache */ | ||
726 | extern u64 x86_spec_ctrl_base; | ||
727 | +DECLARE_PER_CPU(u64, x86_spec_ctrl_current); | ||
728 | +extern void write_spec_ctrl_current(u64 val, bool force); | ||
729 | +extern u64 spec_ctrl_current(void); | ||
730 | |||
731 | /* | ||
732 | * With retpoline, we must use IBRS to restrict branch prediction | ||
733 | @@ -321,18 +323,16 @@ extern u64 x86_spec_ctrl_base; | ||
734 | */ | ||
735 | #define firmware_restrict_branch_speculation_start() \ | ||
736 | do { \ | ||
737 | - u64 val = x86_spec_ctrl_base | SPEC_CTRL_IBRS; \ | ||
738 | - \ | ||
739 | preempt_disable(); \ | ||
740 | - alternative_msr_write(MSR_IA32_SPEC_CTRL, val, \ | ||
741 | + alternative_msr_write(MSR_IA32_SPEC_CTRL, \ | ||
742 | + spec_ctrl_current() | SPEC_CTRL_IBRS, \ | ||
743 | X86_FEATURE_USE_IBRS_FW); \ | ||
744 | } while (0) | ||
745 | |||
746 | #define firmware_restrict_branch_speculation_end() \ | ||
747 | do { \ | ||
748 | - u64 val = x86_spec_ctrl_base; \ | ||
749 | - \ | ||
750 | - alternative_msr_write(MSR_IA32_SPEC_CTRL, val, \ | ||
751 | + alternative_msr_write(MSR_IA32_SPEC_CTRL, \ | ||
752 | + spec_ctrl_current(), \ | ||
753 | X86_FEATURE_USE_IBRS_FW); \ | ||
754 | preempt_enable(); \ | ||
755 | } while (0) | ||
756 | diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c | ||
757 | index 88cef978380bf..5571b28d35b60 100644 | ||
758 | --- a/arch/x86/kernel/cpu/amd.c | ||
759 | +++ b/arch/x86/kernel/cpu/amd.c | ||
760 | @@ -894,12 +894,21 @@ static void init_amd_zn(struct cpuinfo_x86 *c) | ||
761 | node_reclaim_distance = 32; | ||
762 | #endif | ||
763 | |||
764 | - /* | ||
765 | - * Fix erratum 1076: CPB feature bit not being set in CPUID. | ||
766 | - * Always set it, except when running under a hypervisor. | ||
767 | - */ | ||
768 | - if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && !cpu_has(c, X86_FEATURE_CPB)) | ||
769 | - set_cpu_cap(c, X86_FEATURE_CPB); | ||
770 | + /* Fix up CPUID bits, but only if not virtualised. */ | ||
771 | + if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) { | ||
772 | + | ||
773 | + /* Erratum 1076: CPB feature bit not being set in CPUID. */ | ||
774 | + if (!cpu_has(c, X86_FEATURE_CPB)) | ||
775 | + set_cpu_cap(c, X86_FEATURE_CPB); | ||
776 | + | ||
777 | + /* | ||
778 | + * Zen3 (Fam19 model < 0x10) parts are not susceptible to | ||
779 | + * Branch Type Confusion, but predate the allocation of the | ||
780 | + * BTC_NO bit. | ||
781 | + */ | ||
782 | + if (c->x86 == 0x19 && !cpu_has(c, X86_FEATURE_BTC_NO)) | ||
783 | + set_cpu_cap(c, X86_FEATURE_BTC_NO); | ||
784 | + } | ||
785 | } | ||
786 | |||
787 | static void init_amd(struct cpuinfo_x86 *c) | ||
788 | diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c | ||
789 | index c90d91cb14341..cf5a18e261e36 100644 | ||
790 | --- a/arch/x86/kernel/cpu/bugs.c | ||
791 | +++ b/arch/x86/kernel/cpu/bugs.c | ||
792 | @@ -37,6 +37,8 @@ | ||
793 | |||
794 | static void __init spectre_v1_select_mitigation(void); | ||
795 | static void __init spectre_v2_select_mitigation(void); | ||
796 | +static void __init retbleed_select_mitigation(void); | ||
797 | +static void __init spectre_v2_user_select_mitigation(void); | ||
798 | static void __init ssb_select_mitigation(void); | ||
799 | static void __init l1tf_select_mitigation(void); | ||
800 | static void __init mds_select_mitigation(void); | ||
801 | @@ -46,16 +48,40 @@ static void __init taa_select_mitigation(void); | ||
802 | static void __init mmio_select_mitigation(void); | ||
803 | static void __init srbds_select_mitigation(void); | ||
804 | |||
805 | -/* The base value of the SPEC_CTRL MSR that always has to be preserved. */ | ||
806 | +/* The base value of the SPEC_CTRL MSR without task-specific bits set */ | ||
807 | u64 x86_spec_ctrl_base; | ||
808 | EXPORT_SYMBOL_GPL(x86_spec_ctrl_base); | ||
809 | + | ||
810 | +/* The current value of the SPEC_CTRL MSR with task-specific bits set */ | ||
811 | +DEFINE_PER_CPU(u64, x86_spec_ctrl_current); | ||
812 | +EXPORT_SYMBOL_GPL(x86_spec_ctrl_current); | ||
813 | + | ||
814 | static DEFINE_MUTEX(spec_ctrl_mutex); | ||
815 | |||
816 | /* | ||
817 | - * The vendor and possibly platform specific bits which can be modified in | ||
818 | - * x86_spec_ctrl_base. | ||
819 | + * Keep track of the SPEC_CTRL MSR value for the current task, which may differ | ||
820 | + * from x86_spec_ctrl_base due to STIBP/SSB in __speculation_ctrl_update(). | ||
821 | */ | ||
822 | -static u64 __ro_after_init x86_spec_ctrl_mask = SPEC_CTRL_IBRS; | ||
823 | +void write_spec_ctrl_current(u64 val, bool force) | ||
824 | +{ | ||
825 | + if (this_cpu_read(x86_spec_ctrl_current) == val) | ||
826 | + return; | ||
827 | + | ||
828 | + this_cpu_write(x86_spec_ctrl_current, val); | ||
829 | + | ||
830 | + /* | ||
831 | + * When KERNEL_IBRS this MSR is written on return-to-user, unless | ||
832 | + * forced the update can be delayed until that time. | ||
833 | + */ | ||
834 | + if (force || !cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS)) | ||
835 | + wrmsrl(MSR_IA32_SPEC_CTRL, val); | ||
836 | +} | ||
837 | + | ||
838 | +u64 spec_ctrl_current(void) | ||
839 | +{ | ||
840 | + return this_cpu_read(x86_spec_ctrl_current); | ||
841 | +} | ||
842 | +EXPORT_SYMBOL_GPL(spec_ctrl_current); | ||
843 | |||
844 | /* | ||
845 | * AMD specific MSR info for Speculative Store Bypass control. | ||
846 | @@ -105,13 +131,21 @@ void __init check_bugs(void) | ||
847 | if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) | ||
848 | rdmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); | ||
849 | |||
850 | - /* Allow STIBP in MSR_SPEC_CTRL if supported */ | ||
851 | - if (boot_cpu_has(X86_FEATURE_STIBP)) | ||
852 | - x86_spec_ctrl_mask |= SPEC_CTRL_STIBP; | ||
853 | - | ||
854 | /* Select the proper CPU mitigations before patching alternatives: */ | ||
855 | spectre_v1_select_mitigation(); | ||
856 | spectre_v2_select_mitigation(); | ||
857 | + /* | ||
858 | + * retbleed_select_mitigation() relies on the state set by | ||
859 | + * spectre_v2_select_mitigation(); specifically it wants to know about | ||
860 | + * spectre_v2=ibrs. | ||
861 | + */ | ||
862 | + retbleed_select_mitigation(); | ||
863 | + /* | ||
864 | + * spectre_v2_user_select_mitigation() relies on the state set by | ||
865 | + * retbleed_select_mitigation(); specifically the STIBP selection is | ||
866 | + * forced for UNRET. | ||
867 | + */ | ||
868 | + spectre_v2_user_select_mitigation(); | ||
869 | ssb_select_mitigation(); | ||
870 | l1tf_select_mitigation(); | ||
871 | md_clear_select_mitigation(); | ||
872 | @@ -151,31 +185,17 @@ void __init check_bugs(void) | ||
873 | #endif | ||
874 | } | ||
875 | |||
876 | +/* | ||
877 | + * NOTE: For VMX, this function is not called in the vmexit path. | ||
878 | + * It uses vmx_spec_ctrl_restore_host() instead. | ||
879 | + */ | ||
880 | void | ||
881 | x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest) | ||
882 | { | ||
883 | - u64 msrval, guestval, hostval = x86_spec_ctrl_base; | ||
884 | + u64 msrval, guestval = guest_spec_ctrl, hostval = spec_ctrl_current(); | ||
885 | struct thread_info *ti = current_thread_info(); | ||
886 | |||
887 | - /* Is MSR_SPEC_CTRL implemented ? */ | ||
888 | if (static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) { | ||
889 | - /* | ||
890 | - * Restrict guest_spec_ctrl to supported values. Clear the | ||
891 | - * modifiable bits in the host base value and or the | ||
892 | - * modifiable bits from the guest value. | ||
893 | - */ | ||
894 | - guestval = hostval & ~x86_spec_ctrl_mask; | ||
895 | - guestval |= guest_spec_ctrl & x86_spec_ctrl_mask; | ||
896 | - | ||
897 | - /* SSBD controlled in MSR_SPEC_CTRL */ | ||
898 | - if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || | ||
899 | - static_cpu_has(X86_FEATURE_AMD_SSBD)) | ||
900 | - hostval |= ssbd_tif_to_spec_ctrl(ti->flags); | ||
901 | - | ||
902 | - /* Conditional STIBP enabled? */ | ||
903 | - if (static_branch_unlikely(&switch_to_cond_stibp)) | ||
904 | - hostval |= stibp_tif_to_spec_ctrl(ti->flags); | ||
905 | - | ||
906 | if (hostval != guestval) { | ||
907 | msrval = setguest ? guestval : hostval; | ||
908 | wrmsrl(MSR_IA32_SPEC_CTRL, msrval); | ||
909 | @@ -705,12 +725,103 @@ static int __init nospectre_v1_cmdline(char *str) | ||
910 | } | ||
911 | early_param("nospectre_v1", nospectre_v1_cmdline); | ||
912 | |||
913 | -#undef pr_fmt | ||
914 | -#define pr_fmt(fmt) "Spectre V2 : " fmt | ||
915 | - | ||
916 | static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = | ||
917 | SPECTRE_V2_NONE; | ||
918 | |||
919 | +#undef pr_fmt | ||
920 | +#define pr_fmt(fmt) "RETBleed: " fmt | ||
921 | + | ||
922 | +enum retbleed_mitigation { | ||
923 | + RETBLEED_MITIGATION_NONE, | ||
924 | + RETBLEED_MITIGATION_IBRS, | ||
925 | + RETBLEED_MITIGATION_EIBRS, | ||
926 | +}; | ||
927 | + | ||
928 | +enum retbleed_mitigation_cmd { | ||
929 | + RETBLEED_CMD_OFF, | ||
930 | + RETBLEED_CMD_AUTO, | ||
931 | +}; | ||
932 | + | ||
933 | +const char * const retbleed_strings[] = { | ||
934 | + [RETBLEED_MITIGATION_NONE] = "Vulnerable", | ||
935 | + [RETBLEED_MITIGATION_IBRS] = "Mitigation: IBRS", | ||
936 | + [RETBLEED_MITIGATION_EIBRS] = "Mitigation: Enhanced IBRS", | ||
937 | +}; | ||
938 | + | ||
939 | +static enum retbleed_mitigation retbleed_mitigation __ro_after_init = | ||
940 | + RETBLEED_MITIGATION_NONE; | ||
941 | +static enum retbleed_mitigation_cmd retbleed_cmd __ro_after_init = | ||
942 | + RETBLEED_CMD_AUTO; | ||
943 | + | ||
944 | +static int __init retbleed_parse_cmdline(char *str) | ||
945 | +{ | ||
946 | + if (!str) | ||
947 | + return -EINVAL; | ||
948 | + | ||
949 | + if (!strcmp(str, "off")) | ||
950 | + retbleed_cmd = RETBLEED_CMD_OFF; | ||
951 | + else if (!strcmp(str, "auto")) | ||
952 | + retbleed_cmd = RETBLEED_CMD_AUTO; | ||
953 | + else | ||
954 | + pr_err("Unknown retbleed option (%s). Defaulting to 'auto'\n", str); | ||
955 | + | ||
956 | + return 0; | ||
957 | +} | ||
958 | +early_param("retbleed", retbleed_parse_cmdline); | ||
959 | + | ||
960 | +#define RETBLEED_UNTRAIN_MSG "WARNING: BTB untrained return thunk mitigation is only effective on AMD/Hygon!\n" | ||
961 | +#define RETBLEED_COMPILER_MSG "WARNING: kernel not compiled with RETPOLINE or -mfunction-return capable compiler!\n" | ||
962 | +#define RETBLEED_INTEL_MSG "WARNING: Spectre v2 mitigation leaves CPU vulnerable to RETBleed attacks, data leaks possible!\n" | ||
963 | + | ||
964 | +static void __init retbleed_select_mitigation(void) | ||
965 | +{ | ||
966 | + if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off()) | ||
967 | + return; | ||
968 | + | ||
969 | + switch (retbleed_cmd) { | ||
970 | + case RETBLEED_CMD_OFF: | ||
971 | + return; | ||
972 | + | ||
973 | + case RETBLEED_CMD_AUTO: | ||
974 | + default: | ||
975 | + /* | ||
976 | + * The Intel mitigation (IBRS) was already selected in | ||
977 | + * spectre_v2_select_mitigation(). | ||
978 | + */ | ||
979 | + | ||
980 | + break; | ||
981 | + } | ||
982 | + | ||
983 | + switch (retbleed_mitigation) { | ||
984 | + default: | ||
985 | + break; | ||
986 | + } | ||
987 | + | ||
988 | + /* | ||
989 | + * Let IBRS trump all on Intel without affecting the effects of the | ||
990 | + * retbleed= cmdline option. | ||
991 | + */ | ||
992 | + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { | ||
993 | + switch (spectre_v2_enabled) { | ||
994 | + case SPECTRE_V2_IBRS: | ||
995 | + retbleed_mitigation = RETBLEED_MITIGATION_IBRS; | ||
996 | + break; | ||
997 | + case SPECTRE_V2_EIBRS: | ||
998 | + case SPECTRE_V2_EIBRS_RETPOLINE: | ||
999 | + case SPECTRE_V2_EIBRS_LFENCE: | ||
1000 | + retbleed_mitigation = RETBLEED_MITIGATION_EIBRS; | ||
1001 | + break; | ||
1002 | + default: | ||
1003 | + pr_err(RETBLEED_INTEL_MSG); | ||
1004 | + } | ||
1005 | + } | ||
1006 | + | ||
1007 | + pr_info("%s\n", retbleed_strings[retbleed_mitigation]); | ||
1008 | +} | ||
1009 | + | ||
1010 | +#undef pr_fmt | ||
1011 | +#define pr_fmt(fmt) "Spectre V2 : " fmt | ||
1012 | + | ||
1013 | static enum spectre_v2_user_mitigation spectre_v2_user_stibp __ro_after_init = | ||
1014 | SPECTRE_V2_USER_NONE; | ||
1015 | static enum spectre_v2_user_mitigation spectre_v2_user_ibpb __ro_after_init = | ||
1016 | @@ -740,6 +851,7 @@ static inline const char *spectre_v2_module_string(void) { return ""; } | ||
1017 | #define SPECTRE_V2_LFENCE_MSG "WARNING: LFENCE mitigation is not recommended for this CPU, data leaks possible!\n" | ||
1018 | #define SPECTRE_V2_EIBRS_EBPF_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS on, data leaks possible via Spectre v2 BHB attacks!\n" | ||
1019 | #define SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS+LFENCE mitigation and SMT, data leaks possible via Spectre v2 BHB attacks!\n" | ||
1020 | +#define SPECTRE_V2_IBRS_PERF_MSG "WARNING: IBRS mitigation selected on Enhanced IBRS CPU, this may cause unnecessary performance loss\n" | ||
1021 | |||
1022 | #ifdef CONFIG_BPF_SYSCALL | ||
1023 | void unpriv_ebpf_notify(int new_state) | ||
1024 | @@ -781,6 +893,7 @@ enum spectre_v2_mitigation_cmd { | ||
1025 | SPECTRE_V2_CMD_EIBRS, | ||
1026 | SPECTRE_V2_CMD_EIBRS_RETPOLINE, | ||
1027 | SPECTRE_V2_CMD_EIBRS_LFENCE, | ||
1028 | + SPECTRE_V2_CMD_IBRS, | ||
1029 | }; | ||
1030 | |||
1031 | enum spectre_v2_user_cmd { | ||
1032 | @@ -821,13 +934,15 @@ static void __init spec_v2_user_print_cond(const char *reason, bool secure) | ||
1033 | pr_info("spectre_v2_user=%s forced on command line.\n", reason); | ||
1034 | } | ||
1035 | |||
1036 | +static __ro_after_init enum spectre_v2_mitigation_cmd spectre_v2_cmd; | ||
1037 | + | ||
1038 | static enum spectre_v2_user_cmd __init | ||
1039 | -spectre_v2_parse_user_cmdline(enum spectre_v2_mitigation_cmd v2_cmd) | ||
1040 | +spectre_v2_parse_user_cmdline(void) | ||
1041 | { | ||
1042 | char arg[20]; | ||
1043 | int ret, i; | ||
1044 | |||
1045 | - switch (v2_cmd) { | ||
1046 | + switch (spectre_v2_cmd) { | ||
1047 | case SPECTRE_V2_CMD_NONE: | ||
1048 | return SPECTRE_V2_USER_CMD_NONE; | ||
1049 | case SPECTRE_V2_CMD_FORCE: | ||
1050 | @@ -853,15 +968,16 @@ spectre_v2_parse_user_cmdline(enum spectre_v2_mitigation_cmd v2_cmd) | ||
1051 | return SPECTRE_V2_USER_CMD_AUTO; | ||
1052 | } | ||
1053 | |||
1054 | -static inline bool spectre_v2_in_eibrs_mode(enum spectre_v2_mitigation mode) | ||
1055 | +static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode) | ||
1056 | { | ||
1057 | - return (mode == SPECTRE_V2_EIBRS || | ||
1058 | - mode == SPECTRE_V2_EIBRS_RETPOLINE || | ||
1059 | - mode == SPECTRE_V2_EIBRS_LFENCE); | ||
1060 | + return mode == SPECTRE_V2_IBRS || | ||
1061 | + mode == SPECTRE_V2_EIBRS || | ||
1062 | + mode == SPECTRE_V2_EIBRS_RETPOLINE || | ||
1063 | + mode == SPECTRE_V2_EIBRS_LFENCE; | ||
1064 | } | ||
1065 | |||
1066 | static void __init | ||
1067 | -spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) | ||
1068 | +spectre_v2_user_select_mitigation(void) | ||
1069 | { | ||
1070 | enum spectre_v2_user_mitigation mode = SPECTRE_V2_USER_NONE; | ||
1071 | bool smt_possible = IS_ENABLED(CONFIG_SMP); | ||
1072 | @@ -874,7 +990,7 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) | ||
1073 | cpu_smt_control == CPU_SMT_NOT_SUPPORTED) | ||
1074 | smt_possible = false; | ||
1075 | |||
1076 | - cmd = spectre_v2_parse_user_cmdline(v2_cmd); | ||
1077 | + cmd = spectre_v2_parse_user_cmdline(); | ||
1078 | switch (cmd) { | ||
1079 | case SPECTRE_V2_USER_CMD_NONE: | ||
1080 | goto set_mode; | ||
1081 | @@ -922,12 +1038,12 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) | ||
1082 | } | ||
1083 | |||
1084 | /* | ||
1085 | - * If no STIBP, enhanced IBRS is enabled or SMT impossible, STIBP is not | ||
1086 | - * required. | ||
1087 | + * If no STIBP, IBRS or enhanced IBRS is enabled, or SMT impossible, | ||
1088 | + * STIBP is not required. | ||
1089 | */ | ||
1090 | if (!boot_cpu_has(X86_FEATURE_STIBP) || | ||
1091 | !smt_possible || | ||
1092 | - spectre_v2_in_eibrs_mode(spectre_v2_enabled)) | ||
1093 | + spectre_v2_in_ibrs_mode(spectre_v2_enabled)) | ||
1094 | return; | ||
1095 | |||
1096 | /* | ||
1097 | @@ -952,6 +1068,7 @@ static const char * const spectre_v2_strings[] = { | ||
1098 | [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced IBRS", | ||
1099 | [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced IBRS + LFENCE", | ||
1100 | [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced IBRS + Retpolines", | ||
1101 | + [SPECTRE_V2_IBRS] = "Mitigation: IBRS", | ||
1102 | }; | ||
1103 | |||
1104 | static const struct { | ||
1105 | @@ -969,6 +1086,7 @@ static const struct { | ||
1106 | { "eibrs,lfence", SPECTRE_V2_CMD_EIBRS_LFENCE, false }, | ||
1107 | { "eibrs,retpoline", SPECTRE_V2_CMD_EIBRS_RETPOLINE, false }, | ||
1108 | { "auto", SPECTRE_V2_CMD_AUTO, false }, | ||
1109 | + { "ibrs", SPECTRE_V2_CMD_IBRS, false }, | ||
1110 | }; | ||
1111 | |||
1112 | static void __init spec_v2_print_cond(const char *reason, bool secure) | ||
1113 | @@ -1031,6 +1149,24 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) | ||
1114 | return SPECTRE_V2_CMD_AUTO; | ||
1115 | } | ||
1116 | |||
1117 | + if (cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { | ||
1118 | + pr_err("%s selected but not Intel CPU. Switching to AUTO select\n", | ||
1119 | + mitigation_options[i].option); | ||
1120 | + return SPECTRE_V2_CMD_AUTO; | ||
1121 | + } | ||
1122 | + | ||
1123 | + if (cmd == SPECTRE_V2_CMD_IBRS && !boot_cpu_has(X86_FEATURE_IBRS)) { | ||
1124 | + pr_err("%s selected but CPU doesn't have IBRS. Switching to AUTO select\n", | ||
1125 | + mitigation_options[i].option); | ||
1126 | + return SPECTRE_V2_CMD_AUTO; | ||
1127 | + } | ||
1128 | + | ||
1129 | + if (cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_has(X86_FEATURE_XENPV)) { | ||
1130 | + pr_err("%s selected but running as XenPV guest. Switching to AUTO select\n", | ||
1131 | + mitigation_options[i].option); | ||
1132 | + return SPECTRE_V2_CMD_AUTO; | ||
1133 | + } | ||
1134 | + | ||
1135 | spec_v2_print_cond(mitigation_options[i].option, | ||
1136 | mitigation_options[i].secure); | ||
1137 | return cmd; | ||
1138 | @@ -1046,6 +1182,22 @@ static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void) | ||
1139 | return SPECTRE_V2_RETPOLINE; | ||
1140 | } | ||
1141 | |||
1142 | +/* Disable in-kernel use of non-RSB RET predictors */ | ||
1143 | +static void __init spec_ctrl_disable_kernel_rrsba(void) | ||
1144 | +{ | ||
1145 | + u64 ia32_cap; | ||
1146 | + | ||
1147 | + if (!boot_cpu_has(X86_FEATURE_RRSBA_CTRL)) | ||
1148 | + return; | ||
1149 | + | ||
1150 | + ia32_cap = x86_read_arch_cap_msr(); | ||
1151 | + | ||
1152 | + if (ia32_cap & ARCH_CAP_RRSBA) { | ||
1153 | + x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S; | ||
1154 | + write_spec_ctrl_current(x86_spec_ctrl_base, true); | ||
1155 | + } | ||
1156 | +} | ||
1157 | + | ||
1158 | static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_mitigation mode) | ||
1159 | { | ||
1160 | /* | ||
1161 | @@ -1070,10 +1222,6 @@ static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_ | ||
1162 | */ | ||
1163 | switch (mode) { | ||
1164 | case SPECTRE_V2_NONE: | ||
1165 | - /* These modes already fill RSB at vmexit */ | ||
1166 | - case SPECTRE_V2_LFENCE: | ||
1167 | - case SPECTRE_V2_RETPOLINE: | ||
1168 | - case SPECTRE_V2_EIBRS_RETPOLINE: | ||
1169 | return; | ||
1170 | |||
1171 | case SPECTRE_V2_EIBRS_LFENCE: | ||
1172 | @@ -1083,6 +1231,14 @@ static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_ | ||
1173 | pr_info("Spectre v2 / PBRSB-eIBRS: Retire a single CALL on VMEXIT\n"); | ||
1174 | } | ||
1175 | return; | ||
1176 | + | ||
1177 | + case SPECTRE_V2_EIBRS_RETPOLINE: | ||
1178 | + case SPECTRE_V2_RETPOLINE: | ||
1179 | + case SPECTRE_V2_LFENCE: | ||
1180 | + case SPECTRE_V2_IBRS: | ||
1181 | + setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT); | ||
1182 | + pr_info("Spectre v2 / SpectreRSB : Filling RSB on VMEXIT\n"); | ||
1183 | + return; | ||
1184 | } | ||
1185 | |||
1186 | pr_warn_once("Unknown Spectre v2 mode, disabling RSB mitigation at VM exit"); | ||
1187 | @@ -1113,6 +1269,14 @@ static void __init spectre_v2_select_mitigation(void) | ||
1188 | break; | ||
1189 | } | ||
1190 | |||
1191 | + if (boot_cpu_has_bug(X86_BUG_RETBLEED) && | ||
1192 | + retbleed_cmd != RETBLEED_CMD_OFF && | ||
1193 | + boot_cpu_has(X86_FEATURE_IBRS) && | ||
1194 | + boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { | ||
1195 | + mode = SPECTRE_V2_IBRS; | ||
1196 | + break; | ||
1197 | + } | ||
1198 | + | ||
1199 | mode = spectre_v2_select_retpoline(); | ||
1200 | break; | ||
1201 | |||
1202 | @@ -1129,6 +1293,10 @@ static void __init spectre_v2_select_mitigation(void) | ||
1203 | mode = spectre_v2_select_retpoline(); | ||
1204 | break; | ||
1205 | |||
1206 | + case SPECTRE_V2_CMD_IBRS: | ||
1207 | + mode = SPECTRE_V2_IBRS; | ||
1208 | + break; | ||
1209 | + | ||
1210 | case SPECTRE_V2_CMD_EIBRS: | ||
1211 | mode = SPECTRE_V2_EIBRS; | ||
1212 | break; | ||
1213 | @@ -1145,10 +1313,9 @@ static void __init spectre_v2_select_mitigation(void) | ||
1214 | if (mode == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) | ||
1215 | pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); | ||
1216 | |||
1217 | - if (spectre_v2_in_eibrs_mode(mode)) { | ||
1218 | - /* Force it so VMEXIT will restore correctly */ | ||
1219 | + if (spectre_v2_in_ibrs_mode(mode)) { | ||
1220 | x86_spec_ctrl_base |= SPEC_CTRL_IBRS; | ||
1221 | - wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); | ||
1222 | + write_spec_ctrl_current(x86_spec_ctrl_base, true); | ||
1223 | } | ||
1224 | |||
1225 | switch (mode) { | ||
1226 | @@ -1156,6 +1323,12 @@ static void __init spectre_v2_select_mitigation(void) | ||
1227 | case SPECTRE_V2_EIBRS: | ||
1228 | break; | ||
1229 | |||
1230 | + case SPECTRE_V2_IBRS: | ||
1231 | + setup_force_cpu_cap(X86_FEATURE_KERNEL_IBRS); | ||
1232 | + if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) | ||
1233 | + pr_warn(SPECTRE_V2_IBRS_PERF_MSG); | ||
1234 | + break; | ||
1235 | + | ||
1236 | case SPECTRE_V2_LFENCE: | ||
1237 | case SPECTRE_V2_EIBRS_LFENCE: | ||
1238 | setup_force_cpu_cap(X86_FEATURE_RETPOLINE_LFENCE); | ||
1239 | @@ -1167,16 +1340,56 @@ static void __init spectre_v2_select_mitigation(void) | ||
1240 | break; | ||
1241 | } | ||
1242 | |||
1243 | + /* | ||
1244 | + * Disable alternate RSB predictions in kernel when indirect CALLs and | ||
1245 | + * JMPs gets protection against BHI and Intramode-BTI, but RET | ||
1246 | + * prediction from a non-RSB predictor is still a risk. | ||
1247 | + */ | ||
1248 | + if (mode == SPECTRE_V2_EIBRS_LFENCE || | ||
1249 | + mode == SPECTRE_V2_EIBRS_RETPOLINE || | ||
1250 | + mode == SPECTRE_V2_RETPOLINE) | ||
1251 | + spec_ctrl_disable_kernel_rrsba(); | ||
1252 | + | ||
1253 | spectre_v2_enabled = mode; | ||
1254 | pr_info("%s\n", spectre_v2_strings[mode]); | ||
1255 | |||
1256 | /* | ||
1257 | - * If spectre v2 protection has been enabled, unconditionally fill | ||
1258 | - * RSB during a context switch; this protects against two independent | ||
1259 | - * issues: | ||
1260 | + * If Spectre v2 protection has been enabled, fill the RSB during a | ||
1261 | + * context switch. In general there are two types of RSB attacks | ||
1262 | + * across context switches, for which the CALLs/RETs may be unbalanced. | ||
1263 | + * | ||
1264 | + * 1) RSB underflow | ||
1265 | + * | ||
1266 | + * Some Intel parts have "bottomless RSB". When the RSB is empty, | ||
1267 | + * speculated return targets may come from the branch predictor, | ||
1268 | + * which could have a user-poisoned BTB or BHB entry. | ||
1269 | + * | ||
1270 | + * AMD has it even worse: *all* returns are speculated from the BTB, | ||
1271 | + * regardless of the state of the RSB. | ||
1272 | + * | ||
1273 | + * When IBRS or eIBRS is enabled, the "user -> kernel" attack | ||
1274 | + * scenario is mitigated by the IBRS branch prediction isolation | ||
1275 | + * properties, so the RSB buffer filling wouldn't be necessary to | ||
1276 | + * protect against this type of attack. | ||
1277 | + * | ||
1278 | + * The "user -> user" attack scenario is mitigated by RSB filling. | ||
1279 | * | ||
1280 | - * - RSB underflow (and switch to BTB) on Skylake+ | ||
1281 | - * - SpectreRSB variant of spectre v2 on X86_BUG_SPECTRE_V2 CPUs | ||
1282 | + * 2) Poisoned RSB entry | ||
1283 | + * | ||
1284 | + * If the 'next' in-kernel return stack is shorter than 'prev', | ||
1285 | + * 'next' could be tricked into speculating with a user-poisoned RSB | ||
1286 | + * entry. | ||
1287 | + * | ||
1288 | + * The "user -> kernel" attack scenario is mitigated by SMEP and | ||
1289 | + * eIBRS. | ||
1290 | + * | ||
1291 | + * The "user -> user" scenario, also known as SpectreBHB, requires | ||
1292 | + * RSB clearing. | ||
1293 | + * | ||
1294 | + * So to mitigate all cases, unconditionally fill RSB on context | ||
1295 | + * switches. | ||
1296 | + * | ||
1297 | + * FIXME: Is this pointless for retbleed-affected AMD? | ||
1298 | */ | ||
1299 | setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); | ||
1300 | pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n"); | ||
1301 | @@ -1184,28 +1397,29 @@ static void __init spectre_v2_select_mitigation(void) | ||
1302 | spectre_v2_determine_rsb_fill_type_at_vmexit(mode); | ||
1303 | |||
1304 | /* | ||
1305 | - * Retpoline means the kernel is safe because it has no indirect | ||
1306 | - * branches. Enhanced IBRS protects firmware too, so, enable restricted | ||
1307 | - * speculation around firmware calls only when Enhanced IBRS isn't | ||
1308 | - * supported. | ||
1309 | + * Retpoline protects the kernel, but doesn't protect firmware. IBRS | ||
1310 | + * and Enhanced IBRS protect firmware too, so enable IBRS around | ||
1311 | + * firmware calls only when IBRS / Enhanced IBRS aren't otherwise | ||
1312 | + * enabled. | ||
1313 | * | ||
1314 | * Use "mode" to check Enhanced IBRS instead of boot_cpu_has(), because | ||
1315 | * the user might select retpoline on the kernel command line and if | ||
1316 | * the CPU supports Enhanced IBRS, kernel might un-intentionally not | ||
1317 | * enable IBRS around firmware calls. | ||
1318 | */ | ||
1319 | - if (boot_cpu_has(X86_FEATURE_IBRS) && !spectre_v2_in_eibrs_mode(mode)) { | ||
1320 | + if (boot_cpu_has(X86_FEATURE_IBRS) && !spectre_v2_in_ibrs_mode(mode)) { | ||
1321 | setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW); | ||
1322 | pr_info("Enabling Restricted Speculation for firmware calls\n"); | ||
1323 | } | ||
1324 | |||
1325 | /* Set up IBPB and STIBP depending on the general spectre V2 command */ | ||
1326 | - spectre_v2_user_select_mitigation(cmd); | ||
1327 | + spectre_v2_cmd = cmd; | ||
1328 | } | ||
1329 | |||
1330 | static void update_stibp_msr(void * __unused) | ||
1331 | { | ||
1332 | - wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); | ||
1333 | + u64 val = spec_ctrl_current() | (x86_spec_ctrl_base & SPEC_CTRL_STIBP); | ||
1334 | + write_spec_ctrl_current(val, true); | ||
1335 | } | ||
1336 | |||
1337 | /* Update x86_spec_ctrl_base in case SMT state changed. */ | ||
1338 | @@ -1421,16 +1635,6 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) | ||
1339 | break; | ||
1340 | } | ||
1341 | |||
1342 | - /* | ||
1343 | - * If SSBD is controlled by the SPEC_CTRL MSR, then set the proper | ||
1344 | - * bit in the mask to allow guests to use the mitigation even in the | ||
1345 | - * case where the host does not enable it. | ||
1346 | - */ | ||
1347 | - if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || | ||
1348 | - static_cpu_has(X86_FEATURE_AMD_SSBD)) { | ||
1349 | - x86_spec_ctrl_mask |= SPEC_CTRL_SSBD; | ||
1350 | - } | ||
1351 | - | ||
1352 | /* | ||
1353 | * We have three CPU feature flags that are in play here: | ||
1354 | * - X86_BUG_SPEC_STORE_BYPASS - CPU is susceptible. | ||
1355 | @@ -1448,7 +1652,7 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) | ||
1356 | x86_amd_ssb_disable(); | ||
1357 | } else { | ||
1358 | x86_spec_ctrl_base |= SPEC_CTRL_SSBD; | ||
1359 | - wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); | ||
1360 | + write_spec_ctrl_current(x86_spec_ctrl_base, true); | ||
1361 | } | ||
1362 | } | ||
1363 | |||
1364 | @@ -1665,7 +1869,7 @@ int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) | ||
1365 | void x86_spec_ctrl_setup_ap(void) | ||
1366 | { | ||
1367 | if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) | ||
1368 | - wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); | ||
1369 | + write_spec_ctrl_current(x86_spec_ctrl_base, true); | ||
1370 | |||
1371 | if (ssb_mode == SPEC_STORE_BYPASS_DISABLE) | ||
1372 | x86_amd_ssb_disable(); | ||
1373 | @@ -1900,7 +2104,7 @@ static ssize_t mmio_stale_data_show_state(char *buf) | ||
1374 | |||
1375 | static char *stibp_state(void) | ||
1376 | { | ||
1377 | - if (spectre_v2_in_eibrs_mode(spectre_v2_enabled)) | ||
1378 | + if (spectre_v2_in_ibrs_mode(spectre_v2_enabled)) | ||
1379 | return ""; | ||
1380 | |||
1381 | switch (spectre_v2_user_stibp) { | ||
1382 | @@ -1934,7 +2138,7 @@ static char *pbrsb_eibrs_state(void) | ||
1383 | { | ||
1384 | if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) { | ||
1385 | if (boot_cpu_has(X86_FEATURE_RSB_VMEXIT_LITE) || | ||
1386 | - boot_cpu_has(X86_FEATURE_RETPOLINE)) | ||
1387 | + boot_cpu_has(X86_FEATURE_RSB_VMEXIT)) | ||
1388 | return ", PBRSB-eIBRS: SW sequence"; | ||
1389 | else | ||
1390 | return ", PBRSB-eIBRS: Vulnerable"; | ||
1391 | @@ -1970,6 +2174,11 @@ static ssize_t srbds_show_state(char *buf) | ||
1392 | return sprintf(buf, "%s\n", srbds_strings[srbds_mitigation]); | ||
1393 | } | ||
1394 | |||
1395 | +static ssize_t retbleed_show_state(char *buf) | ||
1396 | +{ | ||
1397 | + return sprintf(buf, "%s\n", retbleed_strings[retbleed_mitigation]); | ||
1398 | +} | ||
1399 | + | ||
1400 | static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, | ||
1401 | char *buf, unsigned int bug) | ||
1402 | { | ||
1403 | @@ -2016,6 +2225,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr | ||
1404 | case X86_BUG_MMIO_UNKNOWN: | ||
1405 | return mmio_stale_data_show_state(buf); | ||
1406 | |||
1407 | + case X86_BUG_RETBLEED: | ||
1408 | + return retbleed_show_state(buf); | ||
1409 | + | ||
1410 | default: | ||
1411 | break; | ||
1412 | } | ||
1413 | @@ -2075,4 +2287,9 @@ ssize_t cpu_show_mmio_stale_data(struct device *dev, struct device_attribute *at | ||
1414 | else | ||
1415 | return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_STALE_DATA); | ||
1416 | } | ||
1417 | + | ||
1418 | +ssize_t cpu_show_retbleed(struct device *dev, struct device_attribute *attr, char *buf) | ||
1419 | +{ | ||
1420 | + return cpu_show_common(dev, attr, buf, X86_BUG_RETBLEED); | ||
1421 | +} | ||
1422 | #endif | ||
1423 | diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c | ||
1424 | index 59413e741ecf1..5e1e32f1086ba 100644 | ||
1425 | --- a/arch/x86/kernel/cpu/common.c | ||
1426 | +++ b/arch/x86/kernel/cpu/common.c | ||
1427 | @@ -1102,48 +1102,60 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { | ||
1428 | {} | ||
1429 | }; | ||
1430 | |||
1431 | +#define VULNBL(vendor, family, model, blacklist) \ | ||
1432 | + X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, blacklist) | ||
1433 | + | ||
1434 | #define VULNBL_INTEL_STEPPINGS(model, steppings, issues) \ | ||
1435 | X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, \ | ||
1436 | INTEL_FAM6_##model, steppings, \ | ||
1437 | X86_FEATURE_ANY, issues) | ||
1438 | |||
1439 | +#define VULNBL_AMD(family, blacklist) \ | ||
1440 | + VULNBL(AMD, family, X86_MODEL_ANY, blacklist) | ||
1441 | + | ||
1442 | +#define VULNBL_HYGON(family, blacklist) \ | ||
1443 | + VULNBL(HYGON, family, X86_MODEL_ANY, blacklist) | ||
1444 | + | ||
1445 | #define SRBDS BIT(0) | ||
1446 | /* CPU is affected by X86_BUG_MMIO_STALE_DATA */ | ||
1447 | #define MMIO BIT(1) | ||
1448 | /* CPU is affected by Shared Buffers Data Sampling (SBDS), a variant of X86_BUG_MMIO_STALE_DATA */ | ||
1449 | #define MMIO_SBDS BIT(2) | ||
1450 | +/* CPU is affected by RETbleed, speculating where you would not expect it */ | ||
1451 | +#define RETBLEED BIT(3) | ||
1452 | |||
1453 | static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { | ||
1454 | VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS), | ||
1455 | VULNBL_INTEL_STEPPINGS(HASWELL, X86_STEPPING_ANY, SRBDS), | ||
1456 | VULNBL_INTEL_STEPPINGS(HASWELL_L, X86_STEPPING_ANY, SRBDS), | ||
1457 | VULNBL_INTEL_STEPPINGS(HASWELL_G, X86_STEPPING_ANY, SRBDS), | ||
1458 | - VULNBL_INTEL_STEPPINGS(HASWELL_X, BIT(2) | BIT(4), MMIO), | ||
1459 | - VULNBL_INTEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x3, 0x5), MMIO), | ||
1460 | + VULNBL_INTEL_STEPPINGS(HASWELL_X, X86_STEPPING_ANY, MMIO), | ||
1461 | + VULNBL_INTEL_STEPPINGS(BROADWELL_D, X86_STEPPING_ANY, MMIO), | ||
1462 | VULNBL_INTEL_STEPPINGS(BROADWELL_G, X86_STEPPING_ANY, SRBDS), | ||
1463 | VULNBL_INTEL_STEPPINGS(BROADWELL_X, X86_STEPPING_ANY, MMIO), | ||
1464 | VULNBL_INTEL_STEPPINGS(BROADWELL, X86_STEPPING_ANY, SRBDS), | ||
1465 | - VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPINGS(0x3, 0x3), SRBDS | MMIO), | ||
1466 | - VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, SRBDS), | ||
1467 | - VULNBL_INTEL_STEPPINGS(SKYLAKE_X, BIT(3) | BIT(4) | BIT(6) | | ||
1468 | - BIT(7) | BIT(0xB), MMIO), | ||
1469 | - VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPINGS(0x3, 0x3), SRBDS | MMIO), | ||
1470 | - VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, SRBDS), | ||
1471 | - VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPINGS(0x9, 0xC), SRBDS | MMIO), | ||
1472 | - VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPINGS(0x0, 0x8), SRBDS), | ||
1473 | - VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPINGS(0x9, 0xD), SRBDS | MMIO), | ||
1474 | - VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPINGS(0x0, 0x8), SRBDS), | ||
1475 | - VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPINGS(0x5, 0x5), MMIO | MMIO_SBDS), | ||
1476 | - VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPINGS(0x1, 0x1), MMIO), | ||
1477 | - VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPINGS(0x4, 0x6), MMIO), | ||
1478 | - VULNBL_INTEL_STEPPINGS(COMETLAKE, BIT(2) | BIT(3) | BIT(5), MMIO | MMIO_SBDS), | ||
1479 | - VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x1, 0x1), MMIO | MMIO_SBDS), | ||
1480 | - VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO), | ||
1481 | - VULNBL_INTEL_STEPPINGS(LAKEFIELD, X86_STEPPINGS(0x1, 0x1), MMIO | MMIO_SBDS), | ||
1482 | - VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPINGS(0x1, 0x1), MMIO), | ||
1483 | - VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPINGS(0x1, 0x1), MMIO | MMIO_SBDS), | ||
1484 | + VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED), | ||
1485 | + VULNBL_INTEL_STEPPINGS(SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED), | ||
1486 | + VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED), | ||
1487 | + VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED), | ||
1488 | + VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPING_ANY, SRBDS | MMIO | RETBLEED), | ||
1489 | + VULNBL_INTEL_STEPPINGS(CANNONLAKE_L, X86_STEPPING_ANY, RETBLEED), | ||
1490 | + VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED), | ||
1491 | + VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPING_ANY, MMIO), | ||
1492 | + VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPING_ANY, MMIO), | ||
1493 | + VULNBL_INTEL_STEPPINGS(COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED), | ||
1494 | + VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO | RETBLEED), | ||
1495 | + VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED), | ||
1496 | + VULNBL_INTEL_STEPPINGS(LAKEFIELD, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED), | ||
1497 | + VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED), | ||
1498 | + VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS), | ||
1499 | VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO), | ||
1500 | - VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPINGS(0x0, 0x0), MMIO | MMIO_SBDS), | ||
1501 | + VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS), | ||
1502 | + | ||
1503 | + VULNBL_AMD(0x15, RETBLEED), | ||
1504 | + VULNBL_AMD(0x16, RETBLEED), | ||
1505 | + VULNBL_AMD(0x17, RETBLEED), | ||
1506 | + VULNBL_HYGON(0x18, RETBLEED), | ||
1507 | {} | ||
1508 | }; | ||
1509 | |||
1510 | @@ -1251,6 +1263,11 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) | ||
1511 | setup_force_cpu_bug(X86_BUG_MMIO_UNKNOWN); | ||
1512 | } | ||
1513 | |||
1514 | + if (!cpu_has(c, X86_FEATURE_BTC_NO)) { | ||
1515 | + if (cpu_matches(cpu_vuln_blacklist, RETBLEED) || (ia32_cap & ARCH_CAP_RSBA)) | ||
1516 | + setup_force_cpu_bug(X86_BUG_RETBLEED); | ||
1517 | + } | ||
1518 | + | ||
1519 | if (cpu_has(c, X86_FEATURE_IBRS_ENHANCED) && | ||
1520 | !cpu_matches(cpu_vuln_whitelist, NO_EIBRS_PBRSB) && | ||
1521 | !(ia32_cap & ARCH_CAP_PBRSB_NO)) | ||
1522 | diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c | ||
1523 | index 2f163e6646b6f..ad6776081e60d 100644 | ||
1524 | --- a/arch/x86/kernel/cpu/match.c | ||
1525 | +++ b/arch/x86/kernel/cpu/match.c | ||
1526 | @@ -16,12 +16,17 @@ | ||
1527 | * respective wildcard entries. | ||
1528 | * | ||
1529 | * A typical table entry would be to match a specific CPU | ||
1530 | - * { X86_VENDOR_INTEL, 6, 0x12 } | ||
1531 | - * or to match a specific CPU feature | ||
1532 | - * { X86_FEATURE_MATCH(X86_FEATURE_FOOBAR) } | ||
1533 | + * | ||
1534 | + * X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, INTEL_FAM6_BROADWELL, | ||
1535 | + * X86_FEATURE_ANY, NULL); | ||
1536 | * | ||
1537 | * Fields can be wildcarded with %X86_VENDOR_ANY, %X86_FAMILY_ANY, | ||
1538 | - * %X86_MODEL_ANY, %X86_FEATURE_ANY or 0 (except for vendor) | ||
1539 | + * %X86_MODEL_ANY, %X86_FEATURE_ANY (except for vendor) | ||
1540 | + * | ||
1541 | + * asm/cpu_device_id.h contains a set of useful macros which are shortcuts | ||
1542 | + * for various common selections. The above can be shortened to: | ||
1543 | + * | ||
1544 | + * X86_MATCH_INTEL_FAM6_MODEL(BROADWELL, NULL); | ||
1545 | * | ||
1546 | * Arrays used to match for this should also be declared using | ||
1547 | * MODULE_DEVICE_TABLE(x86cpu, ...) | ||
1548 | diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c | ||
1549 | index 53004dbd55c47..a03e309a0ac5f 100644 | ||
1550 | --- a/arch/x86/kernel/cpu/scattered.c | ||
1551 | +++ b/arch/x86/kernel/cpu/scattered.c | ||
1552 | @@ -26,6 +26,7 @@ struct cpuid_bit { | ||
1553 | static const struct cpuid_bit cpuid_bits[] = { | ||
1554 | { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, | ||
1555 | { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, | ||
1556 | + { X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 }, | ||
1557 | { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 }, | ||
1558 | { X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 }, | ||
1559 | { X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 }, | ||
1560 | diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c | ||
1561 | index 068715a52ac10..87cfd2ee9ca0d 100644 | ||
1562 | --- a/arch/x86/kernel/process.c | ||
1563 | +++ b/arch/x86/kernel/process.c | ||
1564 | @@ -449,7 +449,7 @@ static __always_inline void __speculation_ctrl_update(unsigned long tifp, | ||
1565 | } | ||
1566 | |||
1567 | if (updmsr) | ||
1568 | - wrmsrl(MSR_IA32_SPEC_CTRL, msr); | ||
1569 | + write_spec_ctrl_current(msr, false); | ||
1570 | } | ||
1571 | |||
1572 | static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk) | ||
1573 | diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c | ||
1574 | index 1efcc7d4bc88e..3db407e3c4166 100644 | ||
1575 | --- a/arch/x86/kvm/svm.c | ||
1576 | +++ b/arch/x86/kvm/svm.c | ||
1577 | @@ -47,6 +47,7 @@ | ||
1578 | #include <asm/kvm_para.h> | ||
1579 | #include <asm/irq_remapping.h> | ||
1580 | #include <asm/spec-ctrl.h> | ||
1581 | +#include <asm/cpu_device_id.h> | ||
1582 | |||
1583 | #include <asm/virtext.h> | ||
1584 | #include "trace.h" | ||
1585 | diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c | ||
1586 | index 34ee4835b0177..a7b62a00913e5 100644 | ||
1587 | --- a/arch/x86/kvm/vmx/nested.c | ||
1588 | +++ b/arch/x86/kvm/vmx/nested.c | ||
1589 | @@ -11,6 +11,7 @@ | ||
1590 | #include "mmu.h" | ||
1591 | #include "nested.h" | ||
1592 | #include "trace.h" | ||
1593 | +#include "vmx.h" | ||
1594 | #include "x86.h" | ||
1595 | |||
1596 | static bool __read_mostly enable_shadow_vmcs = 1; | ||
1597 | @@ -2863,35 +2864,8 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) | ||
1598 | vmx->loaded_vmcs->host_state.cr4 = cr4; | ||
1599 | } | ||
1600 | |||
1601 | - asm( | ||
1602 | - "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */ | ||
1603 | - "cmp %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t" | ||
1604 | - "je 1f \n\t" | ||
1605 | - __ex("vmwrite %%" _ASM_SP ", %[HOST_RSP]") "\n\t" | ||
1606 | - "mov %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t" | ||
1607 | - "1: \n\t" | ||
1608 | - "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */ | ||
1609 | - | ||
1610 | - /* Check if vmlaunch or vmresume is needed */ | ||
1611 | - "cmpb $0, %c[launched](%[loaded_vmcs])\n\t" | ||
1612 | - | ||
1613 | - /* | ||
1614 | - * VMLAUNCH and VMRESUME clear RFLAGS.{CF,ZF} on VM-Exit, set | ||
1615 | - * RFLAGS.CF on VM-Fail Invalid and set RFLAGS.ZF on VM-Fail | ||
1616 | - * Valid. vmx_vmenter() directly "returns" RFLAGS, and so the | ||
1617 | - * results of VM-Enter is captured via CC_{SET,OUT} to vm_fail. | ||
1618 | - */ | ||
1619 | - "call vmx_vmenter\n\t" | ||
1620 | - | ||
1621 | - CC_SET(be) | ||
1622 | - : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail) | ||
1623 | - : [HOST_RSP]"r"((unsigned long)HOST_RSP), | ||
1624 | - [loaded_vmcs]"r"(vmx->loaded_vmcs), | ||
1625 | - [launched]"i"(offsetof(struct loaded_vmcs, launched)), | ||
1626 | - [host_state_rsp]"i"(offsetof(struct loaded_vmcs, host_state.rsp)), | ||
1627 | - [wordsize]"i"(sizeof(ulong)) | ||
1628 | - : "memory" | ||
1629 | - ); | ||
1630 | + vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, | ||
1631 | + __vmx_vcpu_run_flags(vmx)); | ||
1632 | |||
1633 | if (vmx->msr_autoload.host.nr) | ||
1634 | vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); | ||
1635 | diff --git a/arch/x86/kvm/vmx/run_flags.h b/arch/x86/kvm/vmx/run_flags.h | ||
1636 | new file mode 100644 | ||
1637 | index 0000000000000..edc3f16cc1896 | ||
1638 | --- /dev/null | ||
1639 | +++ b/arch/x86/kvm/vmx/run_flags.h | ||
1640 | @@ -0,0 +1,8 @@ | ||
1641 | +/* SPDX-License-Identifier: GPL-2.0 */ | ||
1642 | +#ifndef __KVM_X86_VMX_RUN_FLAGS_H | ||
1643 | +#define __KVM_X86_VMX_RUN_FLAGS_H | ||
1644 | + | ||
1645 | +#define VMX_RUN_VMRESUME (1 << 0) | ||
1646 | +#define VMX_RUN_SAVE_SPEC_CTRL (1 << 1) | ||
1647 | + | ||
1648 | +#endif /* __KVM_X86_VMX_RUN_FLAGS_H */ | ||
1649 | diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S | ||
1650 | index 946d9205c3b6d..2850670c38bb0 100644 | ||
1651 | --- a/arch/x86/kvm/vmx/vmenter.S | ||
1652 | +++ b/arch/x86/kvm/vmx/vmenter.S | ||
1653 | @@ -4,6 +4,7 @@ | ||
1654 | #include <asm/bitsperlong.h> | ||
1655 | #include <asm/kvm_vcpu_regs.h> | ||
1656 | #include <asm/nospec-branch.h> | ||
1657 | +#include "run_flags.h" | ||
1658 | |||
1659 | #define WORD_SIZE (BITS_PER_LONG / 8) | ||
1660 | |||
1661 | @@ -29,78 +30,12 @@ | ||
1662 | |||
1663 | .text | ||
1664 | |||
1665 | -/** | ||
1666 | - * vmx_vmenter - VM-Enter the current loaded VMCS | ||
1667 | - * | ||
1668 | - * %RFLAGS.ZF: !VMCS.LAUNCHED, i.e. controls VMLAUNCH vs. VMRESUME | ||
1669 | - * | ||
1670 | - * Returns: | ||
1671 | - * %RFLAGS.CF is set on VM-Fail Invalid | ||
1672 | - * %RFLAGS.ZF is set on VM-Fail Valid | ||
1673 | - * %RFLAGS.{CF,ZF} are cleared on VM-Success, i.e. VM-Exit | ||
1674 | - * | ||
1675 | - * Note that VMRESUME/VMLAUNCH fall-through and return directly if | ||
1676 | - * they VM-Fail, whereas a successful VM-Enter + VM-Exit will jump | ||
1677 | - * to vmx_vmexit. | ||
1678 | - */ | ||
1679 | -ENTRY(vmx_vmenter) | ||
1680 | - /* EFLAGS.ZF is set if VMCS.LAUNCHED == 0 */ | ||
1681 | - je 2f | ||
1682 | - | ||
1683 | -1: vmresume | ||
1684 | - ret | ||
1685 | - | ||
1686 | -2: vmlaunch | ||
1687 | - ret | ||
1688 | - | ||
1689 | -3: cmpb $0, kvm_rebooting | ||
1690 | - je 4f | ||
1691 | - ret | ||
1692 | -4: ud2 | ||
1693 | - | ||
1694 | - .pushsection .fixup, "ax" | ||
1695 | -5: jmp 3b | ||
1696 | - .popsection | ||
1697 | - | ||
1698 | - _ASM_EXTABLE(1b, 5b) | ||
1699 | - _ASM_EXTABLE(2b, 5b) | ||
1700 | - | ||
1701 | -ENDPROC(vmx_vmenter) | ||
1702 | - | ||
1703 | -/** | ||
1704 | - * vmx_vmexit - Handle a VMX VM-Exit | ||
1705 | - * | ||
1706 | - * Returns: | ||
1707 | - * %RFLAGS.{CF,ZF} are cleared on VM-Success, i.e. VM-Exit | ||
1708 | - * | ||
1709 | - * This is vmx_vmenter's partner in crime. On a VM-Exit, control will jump | ||
1710 | - * here after hardware loads the host's state, i.e. this is the destination | ||
1711 | - * referred to by VMCS.HOST_RIP. | ||
1712 | - */ | ||
1713 | -ENTRY(vmx_vmexit) | ||
1714 | -#ifdef CONFIG_RETPOLINE | ||
1715 | - ALTERNATIVE "jmp .Lvmexit_skip_rsb", "", X86_FEATURE_RETPOLINE | ||
1716 | - /* Preserve guest's RAX, it's used to stuff the RSB. */ | ||
1717 | - push %_ASM_AX | ||
1718 | - | ||
1719 | - /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ | ||
1720 | - FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE | ||
1721 | - | ||
1722 | - /* Clear RFLAGS.CF and RFLAGS.ZF to preserve VM-Exit, i.e. !VM-Fail. */ | ||
1723 | - or $1, %_ASM_AX | ||
1724 | - | ||
1725 | - pop %_ASM_AX | ||
1726 | -.Lvmexit_skip_rsb: | ||
1727 | -#endif | ||
1728 | - ISSUE_UNBALANCED_RET_GUARD X86_FEATURE_RSB_VMEXIT_LITE | ||
1729 | - ret | ||
1730 | -ENDPROC(vmx_vmexit) | ||
1731 | - | ||
1732 | /** | ||
1733 | * __vmx_vcpu_run - Run a vCPU via a transition to VMX guest mode | ||
1734 | - * @vmx: struct vcpu_vmx * (forwarded to vmx_update_host_rsp) | ||
1735 | + * @vmx: struct vcpu_vmx * | ||
1736 | * @regs: unsigned long * (to guest registers) | ||
1737 | - * @launched: %true if the VMCS has been launched | ||
1738 | + * @flags: VMX_RUN_VMRESUME: use VMRESUME instead of VMLAUNCH | ||
1739 | + * VMX_RUN_SAVE_SPEC_CTRL: save guest SPEC_CTRL into vmx->spec_ctrl | ||
1740 | * | ||
1741 | * Returns: | ||
1742 | * 0 on VM-Exit, 1 on VM-Fail | ||
1743 | @@ -119,24 +54,29 @@ ENTRY(__vmx_vcpu_run) | ||
1744 | #endif | ||
1745 | push %_ASM_BX | ||
1746 | |||
1747 | + /* Save @vmx for SPEC_CTRL handling */ | ||
1748 | + push %_ASM_ARG1 | ||
1749 | + | ||
1750 | + /* Save @flags for SPEC_CTRL handling */ | ||
1751 | + push %_ASM_ARG3 | ||
1752 | + | ||
1753 | /* | ||
1754 | * Save @regs, _ASM_ARG2 may be modified by vmx_update_host_rsp() and | ||
1755 | * @regs is needed after VM-Exit to save the guest's register values. | ||
1756 | */ | ||
1757 | push %_ASM_ARG2 | ||
1758 | |||
1759 | - /* Copy @launched to BL, _ASM_ARG3 is volatile. */ | ||
1760 | + /* Copy @flags to BL, _ASM_ARG3 is volatile. */ | ||
1761 | mov %_ASM_ARG3B, %bl | ||
1762 | |||
1763 | - /* Adjust RSP to account for the CALL to vmx_vmenter(). */ | ||
1764 | - lea -WORD_SIZE(%_ASM_SP), %_ASM_ARG2 | ||
1765 | + lea (%_ASM_SP), %_ASM_ARG2 | ||
1766 | call vmx_update_host_rsp | ||
1767 | |||
1768 | /* Load @regs to RAX. */ | ||
1769 | mov (%_ASM_SP), %_ASM_AX | ||
1770 | |||
1771 | /* Check if vmlaunch or vmresume is needed */ | ||
1772 | - cmpb $0, %bl | ||
1773 | + testb $VMX_RUN_VMRESUME, %bl | ||
1774 | |||
1775 | /* Load guest registers. Don't clobber flags. */ | ||
1776 | mov VCPU_RBX(%_ASM_AX), %_ASM_BX | ||
1777 | @@ -158,11 +98,25 @@ ENTRY(__vmx_vcpu_run) | ||
1778 | /* Load guest RAX. This kills the @regs pointer! */ | ||
1779 | mov VCPU_RAX(%_ASM_AX), %_ASM_AX | ||
1780 | |||
1781 | - /* Enter guest mode */ | ||
1782 | - call vmx_vmenter | ||
1783 | + /* Check EFLAGS.ZF from 'testb' above */ | ||
1784 | + jz .Lvmlaunch | ||
1785 | |||
1786 | - /* Jump on VM-Fail. */ | ||
1787 | - jbe 2f | ||
1788 | +/* | ||
1789 | + * If VMRESUME/VMLAUNCH and corresponding vmexit succeed, execution resumes at | ||
1790 | + * the 'vmx_vmexit' label below. | ||
1791 | + */ | ||
1792 | +.Lvmresume: | ||
1793 | + vmresume | ||
1794 | + jmp .Lvmfail | ||
1795 | + | ||
1796 | +.Lvmlaunch: | ||
1797 | + vmlaunch | ||
1798 | + jmp .Lvmfail | ||
1799 | + | ||
1800 | + _ASM_EXTABLE(.Lvmresume, .Lfixup) | ||
1801 | + _ASM_EXTABLE(.Lvmlaunch, .Lfixup) | ||
1802 | + | ||
1803 | +SYM_INNER_LABEL(vmx_vmexit, SYM_L_GLOBAL) | ||
1804 | |||
1805 | /* Temporarily save guest's RAX. */ | ||
1806 | push %_ASM_AX | ||
1807 | @@ -189,19 +143,21 @@ ENTRY(__vmx_vcpu_run) | ||
1808 | mov %r15, VCPU_R15(%_ASM_AX) | ||
1809 | #endif | ||
1810 | |||
1811 | - /* Clear RAX to indicate VM-Exit (as opposed to VM-Fail). */ | ||
1812 | - xor %eax, %eax | ||
1813 | + /* Clear return value to indicate VM-Exit (as opposed to VM-Fail). */ | ||
1814 | + xor %ebx, %ebx | ||
1815 | |||
1816 | +.Lclear_regs: | ||
1817 | /* | ||
1818 | - * Clear all general purpose registers except RSP and RAX to prevent | ||
1819 | + * Clear all general purpose registers except RSP and RBX to prevent | ||
1820 | * speculative use of the guest's values, even those that are reloaded | ||
1821 | * via the stack. In theory, an L1 cache miss when restoring registers | ||
1822 | * could lead to speculative execution with the guest's values. | ||
1823 | * Zeroing XORs are dirt cheap, i.e. the extra paranoia is essentially | ||
1824 | * free. RSP and RAX are exempt as RSP is restored by hardware during | ||
1825 | - * VM-Exit and RAX is explicitly loaded with 0 or 1 to return VM-Fail. | ||
1826 | + * VM-Exit and RBX is explicitly loaded with 0 or 1 to hold the return | ||
1827 | + * value. | ||
1828 | */ | ||
1829 | -1: xor %ebx, %ebx | ||
1830 | + xor %eax, %eax | ||
1831 | xor %ecx, %ecx | ||
1832 | xor %edx, %edx | ||
1833 | xor %esi, %esi | ||
1834 | @@ -220,8 +176,32 @@ ENTRY(__vmx_vcpu_run) | ||
1835 | |||
1836 | /* "POP" @regs. */ | ||
1837 | add $WORD_SIZE, %_ASM_SP | ||
1838 | - pop %_ASM_BX | ||
1839 | |||
1840 | + /* | ||
1841 | + * IMPORTANT: RSB filling and SPEC_CTRL handling must be done before | ||
1842 | + * the first unbalanced RET after vmexit! | ||
1843 | + * | ||
1844 | + * For retpoline or IBRS, RSB filling is needed to prevent poisoned RSB | ||
1845 | + * entries and (in some cases) RSB underflow. | ||
1846 | + * | ||
1847 | + * eIBRS has its own protection against poisoned RSB, so it doesn't | ||
1848 | + * need the RSB filling sequence. But it does need to be enabled, and a | ||
1849 | + * single call to retire, before the first unbalanced RET. | ||
1850 | + */ | ||
1851 | + | ||
1852 | + FILL_RETURN_BUFFER %_ASM_CX, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT,\ | ||
1853 | + X86_FEATURE_RSB_VMEXIT_LITE | ||
1854 | + | ||
1855 | + | ||
1856 | + pop %_ASM_ARG2 /* @flags */ | ||
1857 | + pop %_ASM_ARG1 /* @vmx */ | ||
1858 | + | ||
1859 | + call vmx_spec_ctrl_restore_host | ||
1860 | + | ||
1861 | + /* Put return value in AX */ | ||
1862 | + mov %_ASM_BX, %_ASM_AX | ||
1863 | + | ||
1864 | + pop %_ASM_BX | ||
1865 | #ifdef CONFIG_X86_64 | ||
1866 | pop %r12 | ||
1867 | pop %r13 | ||
1868 | @@ -234,11 +214,20 @@ ENTRY(__vmx_vcpu_run) | ||
1869 | pop %_ASM_BP | ||
1870 | ret | ||
1871 | |||
1872 | - /* VM-Fail. Out-of-line to avoid a taken Jcc after VM-Exit. */ | ||
1873 | -2: mov $1, %eax | ||
1874 | - jmp 1b | ||
1875 | +.Lfixup: | ||
1876 | + cmpb $0, kvm_rebooting | ||
1877 | + jne .Lvmfail | ||
1878 | + ud2 | ||
1879 | +.Lvmfail: | ||
1880 | + /* VM-Fail: set return value to 1 */ | ||
1881 | + mov $1, %_ASM_BX | ||
1882 | + jmp .Lclear_regs | ||
1883 | + | ||
1884 | ENDPROC(__vmx_vcpu_run) | ||
1885 | |||
1886 | + | ||
1887 | +.section .text, "ax" | ||
1888 | + | ||
1889 | /** | ||
1890 | * vmread_error_trampoline - Trampoline from inline asm to vmread_error() | ||
1891 | * @field: VMCS field encoding that failed | ||
1892 | diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c | ||
1893 | index 4bd1bf6214eea..d522c9de41df9 100644 | ||
1894 | --- a/arch/x86/kvm/vmx/vmx.c | ||
1895 | +++ b/arch/x86/kvm/vmx/vmx.c | ||
1896 | @@ -31,6 +31,7 @@ | ||
1897 | #include <asm/apic.h> | ||
1898 | #include <asm/asm.h> | ||
1899 | #include <asm/cpu.h> | ||
1900 | +#include <asm/cpu_device_id.h> | ||
1901 | #include <asm/debugreg.h> | ||
1902 | #include <asm/desc.h> | ||
1903 | #include <asm/fpu/internal.h> | ||
1904 | @@ -358,9 +359,9 @@ static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx) | ||
1905 | if (!vmx->disable_fb_clear) | ||
1906 | return; | ||
1907 | |||
1908 | - rdmsrl(MSR_IA32_MCU_OPT_CTRL, msr); | ||
1909 | + msr = __rdmsr(MSR_IA32_MCU_OPT_CTRL); | ||
1910 | msr |= FB_CLEAR_DIS; | ||
1911 | - wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr); | ||
1912 | + native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr); | ||
1913 | /* Cache the MSR value to avoid reading it later */ | ||
1914 | vmx->msr_ia32_mcu_opt_ctrl = msr; | ||
1915 | } | ||
1916 | @@ -371,7 +372,7 @@ static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx) | ||
1917 | return; | ||
1918 | |||
1919 | vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS; | ||
1920 | - wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl); | ||
1921 | + native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl); | ||
1922 | } | ||
1923 | |||
1924 | static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) | ||
1925 | @@ -862,6 +863,24 @@ static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr) | ||
1926 | return true; | ||
1927 | } | ||
1928 | |||
1929 | +unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx) | ||
1930 | +{ | ||
1931 | + unsigned int flags = 0; | ||
1932 | + | ||
1933 | + if (vmx->loaded_vmcs->launched) | ||
1934 | + flags |= VMX_RUN_VMRESUME; | ||
1935 | + | ||
1936 | + /* | ||
1937 | + * If writes to the SPEC_CTRL MSR aren't intercepted, the guest is free | ||
1938 | + * to change it directly without causing a vmexit. In that case read | ||
1939 | + * it after vmexit and store it in vmx->spec_ctrl. | ||
1940 | + */ | ||
1941 | + if (unlikely(!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))) | ||
1942 | + flags |= VMX_RUN_SAVE_SPEC_CTRL; | ||
1943 | + | ||
1944 | + return flags; | ||
1945 | +} | ||
1946 | + | ||
1947 | static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, | ||
1948 | unsigned long entry, unsigned long exit) | ||
1949 | { | ||
1950 | @@ -6539,7 +6558,30 @@ void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) | ||
1951 | } | ||
1952 | } | ||
1953 | |||
1954 | -bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched); | ||
1955 | +void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, | ||
1956 | + unsigned int flags) | ||
1957 | +{ | ||
1958 | + u64 hostval = this_cpu_read(x86_spec_ctrl_current); | ||
1959 | + | ||
1960 | + if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) | ||
1961 | + return; | ||
1962 | + | ||
1963 | + if (flags & VMX_RUN_SAVE_SPEC_CTRL) | ||
1964 | + vmx->spec_ctrl = __rdmsr(MSR_IA32_SPEC_CTRL); | ||
1965 | + | ||
1966 | + /* | ||
1967 | + * If the guest/host SPEC_CTRL values differ, restore the host value. | ||
1968 | + * | ||
1969 | + * For legacy IBRS, the IBRS bit always needs to be written after | ||
1970 | + * transitioning from a less privileged predictor mode, regardless of | ||
1971 | + * whether the guest/host values differ. | ||
1972 | + */ | ||
1973 | + if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) || | ||
1974 | + vmx->spec_ctrl != hostval) | ||
1975 | + native_wrmsrl(MSR_IA32_SPEC_CTRL, hostval); | ||
1976 | + | ||
1977 | + barrier_nospec(); | ||
1978 | +} | ||
1979 | |||
1980 | static void vmx_vcpu_run(struct kvm_vcpu *vcpu) | ||
1981 | { | ||
1982 | @@ -6628,32 +6670,12 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) | ||
1983 | write_cr2(vcpu->arch.cr2); | ||
1984 | |||
1985 | vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, | ||
1986 | - vmx->loaded_vmcs->launched); | ||
1987 | + __vmx_vcpu_run_flags(vmx)); | ||
1988 | |||
1989 | vcpu->arch.cr2 = read_cr2(); | ||
1990 | |||
1991 | vmx_enable_fb_clear(vmx); | ||
1992 | |||
1993 | - /* | ||
1994 | - * We do not use IBRS in the kernel. If this vCPU has used the | ||
1995 | - * SPEC_CTRL MSR it may have left it on; save the value and | ||
1996 | - * turn it off. This is much more efficient than blindly adding | ||
1997 | - * it to the atomic save/restore list. Especially as the former | ||
1998 | - * (Saving guest MSRs on vmexit) doesn't even exist in KVM. | ||
1999 | - * | ||
2000 | - * For non-nested case: | ||
2001 | - * If the L01 MSR bitmap does not intercept the MSR, then we need to | ||
2002 | - * save it. | ||
2003 | - * | ||
2004 | - * For nested case: | ||
2005 | - * If the L02 MSR bitmap does not intercept the MSR, then we need to | ||
2006 | - * save it. | ||
2007 | - */ | ||
2008 | - if (unlikely(!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))) | ||
2009 | - vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); | ||
2010 | - | ||
2011 | - x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0); | ||
2012 | - | ||
2013 | /* All fields are clean at this point */ | ||
2014 | if (static_branch_unlikely(&enable_evmcs)) | ||
2015 | current_evmcs->hv_clean_fields |= | ||
2016 | diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h | ||
2017 | index 7a3362ab59867..4d5be4610af84 100644 | ||
2018 | --- a/arch/x86/kvm/vmx/vmx.h | ||
2019 | +++ b/arch/x86/kvm/vmx/vmx.h | ||
2020 | @@ -10,6 +10,7 @@ | ||
2021 | #include "capabilities.h" | ||
2022 | #include "ops.h" | ||
2023 | #include "vmcs.h" | ||
2024 | +#include "run_flags.h" | ||
2025 | |||
2026 | extern const u32 vmx_msr_index[]; | ||
2027 | extern u64 host_efer; | ||
2028 | @@ -336,6 +337,10 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu); | ||
2029 | struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr); | ||
2030 | void pt_update_intercept_for_msr(struct vcpu_vmx *vmx); | ||
2031 | void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp); | ||
2032 | +void vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, unsigned int flags); | ||
2033 | +unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx); | ||
2034 | +bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, | ||
2035 | + unsigned int flags); | ||
2036 | |||
2037 | #define POSTED_INTR_ON 0 | ||
2038 | #define POSTED_INTR_SN 1 | ||
2039 | diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c | ||
2040 | index d0b297583df88..c431a34522d6c 100644 | ||
2041 | --- a/arch/x86/kvm/x86.c | ||
2042 | +++ b/arch/x86/kvm/x86.c | ||
2043 | @@ -10329,9 +10329,9 @@ void kvm_arch_end_assignment(struct kvm *kvm) | ||
2044 | } | ||
2045 | EXPORT_SYMBOL_GPL(kvm_arch_end_assignment); | ||
2046 | |||
2047 | -bool kvm_arch_has_assigned_device(struct kvm *kvm) | ||
2048 | +bool noinstr kvm_arch_has_assigned_device(struct kvm *kvm) | ||
2049 | { | ||
2050 | - return atomic_read(&kvm->arch.assigned_device_count); | ||
2051 | + return arch_atomic_read(&kvm->arch.assigned_device_count); | ||
2052 | } | ||
2053 | EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device); | ||
2054 | |||
2055 | diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c | ||
2056 | index 9b5edf1dfe9e9..7000c836951c5 100644 | ||
2057 | --- a/drivers/base/cpu.c | ||
2058 | +++ b/drivers/base/cpu.c | ||
2059 | @@ -574,6 +574,12 @@ ssize_t __weak cpu_show_mmio_stale_data(struct device *dev, | ||
2060 | return sysfs_emit(buf, "Not affected\n"); | ||
2061 | } | ||
2062 | |||
2063 | +ssize_t __weak cpu_show_retbleed(struct device *dev, | ||
2064 | + struct device_attribute *attr, char *buf) | ||
2065 | +{ | ||
2066 | + return sysfs_emit(buf, "Not affected\n"); | ||
2067 | +} | ||
2068 | + | ||
2069 | static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL); | ||
2070 | static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL); | ||
2071 | static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL); | ||
2072 | @@ -584,6 +590,7 @@ static DEVICE_ATTR(tsx_async_abort, 0444, cpu_show_tsx_async_abort, NULL); | ||
2073 | static DEVICE_ATTR(itlb_multihit, 0444, cpu_show_itlb_multihit, NULL); | ||
2074 | static DEVICE_ATTR(srbds, 0444, cpu_show_srbds, NULL); | ||
2075 | static DEVICE_ATTR(mmio_stale_data, 0444, cpu_show_mmio_stale_data, NULL); | ||
2076 | +static DEVICE_ATTR(retbleed, 0444, cpu_show_retbleed, NULL); | ||
2077 | |||
2078 | static struct attribute *cpu_root_vulnerabilities_attrs[] = { | ||
2079 | &dev_attr_meltdown.attr, | ||
2080 | @@ -596,6 +603,7 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = { | ||
2081 | &dev_attr_itlb_multihit.attr, | ||
2082 | &dev_attr_srbds.attr, | ||
2083 | &dev_attr_mmio_stale_data.attr, | ||
2084 | + &dev_attr_retbleed.attr, | ||
2085 | NULL | ||
2086 | }; | ||
2087 | |||
2088 | diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c | ||
2089 | index 4195834a45912..cf7ebe3bd1ad2 100644 | ||
2090 | --- a/drivers/cpufreq/acpi-cpufreq.c | ||
2091 | +++ b/drivers/cpufreq/acpi-cpufreq.c | ||
2092 | @@ -30,6 +30,7 @@ | ||
2093 | #include <asm/msr.h> | ||
2094 | #include <asm/processor.h> | ||
2095 | #include <asm/cpufeature.h> | ||
2096 | +#include <asm/cpu_device_id.h> | ||
2097 | |||
2098 | MODULE_AUTHOR("Paul Diefenbaugh, Dominik Brodowski"); | ||
2099 | MODULE_DESCRIPTION("ACPI Processor P-States Driver"); | ||
2100 | diff --git a/drivers/cpufreq/amd_freq_sensitivity.c b/drivers/cpufreq/amd_freq_sensitivity.c | ||
2101 | index e2df9d1121063..5107cbe2d64dd 100644 | ||
2102 | --- a/drivers/cpufreq/amd_freq_sensitivity.c | ||
2103 | +++ b/drivers/cpufreq/amd_freq_sensitivity.c | ||
2104 | @@ -18,6 +18,7 @@ | ||
2105 | |||
2106 | #include <asm/msr.h> | ||
2107 | #include <asm/cpufeature.h> | ||
2108 | +#include <asm/cpu_device_id.h> | ||
2109 | |||
2110 | #include "cpufreq_ondemand.h" | ||
2111 | |||
2112 | diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c | ||
2113 | index d8687868407de..b588e0e409e72 100644 | ||
2114 | --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c | ||
2115 | +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c | ||
2116 | @@ -35,7 +35,6 @@ | ||
2117 | #include <linux/pci.h> | ||
2118 | #include <linux/pm_runtime.h> | ||
2119 | #include <drm/drm_crtc_helper.h> | ||
2120 | -#include <drm/drm_damage_helper.h> | ||
2121 | #include <drm/drm_edid.h> | ||
2122 | #include <drm/drm_gem_framebuffer_helper.h> | ||
2123 | #include <drm/drm_fb_helper.h> | ||
2124 | @@ -496,7 +495,6 @@ bool amdgpu_display_ddc_probe(struct amdgpu_connector *amdgpu_connector, | ||
2125 | static const struct drm_framebuffer_funcs amdgpu_fb_funcs = { | ||
2126 | .destroy = drm_gem_fb_destroy, | ||
2127 | .create_handle = drm_gem_fb_create_handle, | ||
2128 | - .dirty = drm_atomic_helper_dirtyfb, | ||
2129 | }; | ||
2130 | |||
2131 | uint32_t amdgpu_display_supported_domains(struct amdgpu_device *adev, | ||
2132 | diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c | ||
2133 | index 347b08b56042f..63b2212262618 100644 | ||
2134 | --- a/drivers/idle/intel_idle.c | ||
2135 | +++ b/drivers/idle/intel_idle.c | ||
2136 | @@ -46,11 +46,13 @@ | ||
2137 | #include <linux/tick.h> | ||
2138 | #include <trace/events/power.h> | ||
2139 | #include <linux/sched.h> | ||
2140 | +#include <linux/sched/smt.h> | ||
2141 | #include <linux/notifier.h> | ||
2142 | #include <linux/cpu.h> | ||
2143 | #include <linux/moduleparam.h> | ||
2144 | #include <asm/cpu_device_id.h> | ||
2145 | #include <asm/intel-family.h> | ||
2146 | +#include <asm/nospec-branch.h> | ||
2147 | #include <asm/mwait.h> | ||
2148 | #include <asm/msr.h> | ||
2149 | |||
2150 | @@ -97,6 +99,12 @@ static struct cpuidle_state *cpuidle_state_table; | ||
2151 | */ | ||
2152 | #define CPUIDLE_FLAG_TLB_FLUSHED 0x10000 | ||
2153 | |||
2154 | +/* | ||
2155 | + * Disable IBRS across idle (when KERNEL_IBRS). Must not share a bit with | ||
2156 | + * CPUIDLE_FLAG_TLB_FLUSHED (0x10000, i.e. bit 16) above or the MWAIT hint. | ||
2157 | + */ | ||
2158 | +#define CPUIDLE_FLAG_IBRS BIT(17) | ||
2159 | + | ||
2160 | /* | ||
2161 | * MWAIT takes an 8-bit "hint" in EAX "suggesting" | ||
2162 | * the C-state (top nibble) and sub-state (bottom nibble) | ||
2163 | @@ -107,6 +115,24 @@ static struct cpuidle_state *cpuidle_state_table; | ||
2164 | #define flg2MWAIT(flags) (((flags) >> 24) & 0xFF) | ||
2165 | #define MWAIT2flg(eax) ((eax & 0xFF) << 24) | ||
2166 | |||
2167 | +static __cpuidle int intel_idle_ibrs(struct cpuidle_device *dev, | ||
2168 | + struct cpuidle_driver *drv, int index) | ||
2169 | +{ | ||
2170 | + bool smt_active = sched_smt_active(); | ||
2171 | + u64 spec_ctrl = spec_ctrl_current(); | ||
2172 | + int ret; | ||
2173 | + | ||
2174 | + if (smt_active) | ||
2175 | + wrmsrl(MSR_IA32_SPEC_CTRL, 0); | ||
2176 | + | ||
2177 | + ret = intel_idle(dev, drv, index); | ||
2178 | + | ||
2179 | + if (smt_active) | ||
2180 | + wrmsrl(MSR_IA32_SPEC_CTRL, spec_ctrl); | ||
2181 | + | ||
2182 | + return ret; | ||
2183 | +} | ||
2184 | + | ||
2185 | /* | ||
2186 | * States are indexed by the cstate number, | ||
2187 | * which is also the index into the MWAIT hint array. | ||
2188 | @@ -605,7 +631,7 @@ static struct cpuidle_state skl_cstates[] = { | ||
2189 | { | ||
2190 | .name = "C6", | ||
2191 | .desc = "MWAIT 0x20", | ||
2192 | - .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, | ||
2193 | + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, | ||
2194 | .exit_latency = 85, | ||
2195 | .target_residency = 200, | ||
2196 | .enter = &intel_idle, | ||
2197 | @@ -613,7 +639,7 @@ static struct cpuidle_state skl_cstates[] = { | ||
2198 | { | ||
2199 | .name = "C7s", | ||
2200 | .desc = "MWAIT 0x33", | ||
2201 | - .flags = MWAIT2flg(0x33) | CPUIDLE_FLAG_TLB_FLUSHED, | ||
2202 | + .flags = MWAIT2flg(0x33) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, | ||
2203 | .exit_latency = 124, | ||
2204 | .target_residency = 800, | ||
2205 | .enter = &intel_idle, | ||
2206 | @@ -621,7 +647,7 @@ static struct cpuidle_state skl_cstates[] = { | ||
2207 | { | ||
2208 | .name = "C8", | ||
2209 | .desc = "MWAIT 0x40", | ||
2210 | - .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, | ||
2211 | + .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, | ||
2212 | .exit_latency = 200, | ||
2213 | .target_residency = 800, | ||
2214 | .enter = &intel_idle, | ||
2215 | @@ -629,7 +655,7 @@ static struct cpuidle_state skl_cstates[] = { | ||
2216 | { | ||
2217 | .name = "C9", | ||
2218 | .desc = "MWAIT 0x50", | ||
2219 | - .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, | ||
2220 | + .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, | ||
2221 | .exit_latency = 480, | ||
2222 | .target_residency = 5000, | ||
2223 | .enter = &intel_idle, | ||
2224 | @@ -637,7 +663,7 @@ static struct cpuidle_state skl_cstates[] = { | ||
2225 | { | ||
2226 | .name = "C10", | ||
2227 | .desc = "MWAIT 0x60", | ||
2228 | - .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, | ||
2229 | + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, | ||
2230 | .exit_latency = 890, | ||
2231 | .target_residency = 5000, | ||
2232 | .enter = &intel_idle, | ||
2233 | @@ -666,7 +692,7 @@ static struct cpuidle_state skx_cstates[] = { | ||
2234 | { | ||
2235 | .name = "C6", | ||
2236 | .desc = "MWAIT 0x20", | ||
2237 | - .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, | ||
2238 | + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, | ||
2239 | .exit_latency = 133, | ||
2240 | .target_residency = 600, | ||
2241 | .enter = &intel_idle, | ||
2242 | @@ -1370,6 +1396,11 @@ static void __init intel_idle_cpuidle_driver_init(void) | ||
2243 | drv->states[drv->state_count] = /* structure copy */ | ||
2244 | cpuidle_state_table[cstate]; | ||
2245 | |||
2246 | + if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) && | ||
2247 | + cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_IBRS) { | ||
2248 | + drv->states[drv->state_count].enter = intel_idle_ibrs; | ||
2249 | + } | ||
2250 | + | ||
2251 | drv->state_count += 1; | ||
2252 | } | ||
2253 | |||
2254 | diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c | ||
2255 | index 510ca69746042..c83ff610ecb6c 100644 | ||
2256 | --- a/fs/xfs/libxfs/xfs_attr.c | ||
2257 | +++ b/fs/xfs/libxfs/xfs_attr.c | ||
2258 | @@ -1007,7 +1007,7 @@ restart: | ||
2259 | * The INCOMPLETE flag means that we will find the "old" | ||
2260 | * attr, not the "new" one. | ||
2261 | */ | ||
2262 | - args->flags |= XFS_ATTR_INCOMPLETE; | ||
2263 | + args->op_flags |= XFS_DA_OP_INCOMPLETE; | ||
2264 | state = xfs_da_state_alloc(); | ||
2265 | state->args = args; | ||
2266 | state->mp = mp; | ||
2267 | diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c | ||
2268 | index 0c23127347aca..c86ddbf6d105b 100644 | ||
2269 | --- a/fs/xfs/libxfs/xfs_attr_leaf.c | ||
2270 | +++ b/fs/xfs/libxfs/xfs_attr_leaf.c | ||
2271 | @@ -2345,8 +2345,8 @@ xfs_attr3_leaf_lookup_int( | ||
2272 | * If we are looking for INCOMPLETE entries, show only those. | ||
2273 | * If we are looking for complete entries, show only those. | ||
2274 | */ | ||
2275 | - if ((args->flags & XFS_ATTR_INCOMPLETE) != | ||
2276 | - (entry->flags & XFS_ATTR_INCOMPLETE)) { | ||
2277 | + if (!!(args->op_flags & XFS_DA_OP_INCOMPLETE) != | ||
2278 | + !!(entry->flags & XFS_ATTR_INCOMPLETE)) { | ||
2279 | continue; | ||
2280 | } | ||
2281 | if (entry->flags & XFS_ATTR_LOCAL) { | ||
2282 | diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h | ||
2283 | index 7b74e18becff7..38c05d6ae2aa4 100644 | ||
2284 | --- a/fs/xfs/libxfs/xfs_attr_leaf.h | ||
2285 | +++ b/fs/xfs/libxfs/xfs_attr_leaf.h | ||
2286 | @@ -17,13 +17,27 @@ struct xfs_inode; | ||
2287 | struct xfs_trans; | ||
2288 | |||
2289 | /* | ||
2290 | - * Used to keep a list of "remote value" extents when unlinking an inode. | ||
2291 | + * Incore version of the attribute leaf header. | ||
2292 | */ | ||
2293 | -typedef struct xfs_attr_inactive_list { | ||
2294 | - xfs_dablk_t valueblk; /* block number of value bytes */ | ||
2295 | - int valuelen; /* number of bytes in value */ | ||
2296 | -} xfs_attr_inactive_list_t; | ||
2297 | - | ||
2298 | +struct xfs_attr3_icleaf_hdr { | ||
2299 | + uint32_t forw; | ||
2300 | + uint32_t back; | ||
2301 | + uint16_t magic; | ||
2302 | + uint16_t count; | ||
2303 | + uint16_t usedbytes; | ||
2304 | + /* | ||
2305 | + * Firstused is 32-bit here instead of 16-bit like the on-disk variant | ||
2306 | + * to support maximum fsb size of 64k without overflow issues throughout | ||
2307 | + * the attr code. Instead, the overflow condition is handled on | ||
2308 | + * conversion to/from disk. | ||
2309 | + */ | ||
2310 | + uint32_t firstused; | ||
2311 | + __u8 holes; | ||
2312 | + struct { | ||
2313 | + uint16_t base; | ||
2314 | + uint16_t size; | ||
2315 | + } freemap[XFS_ATTR_LEAF_MAPSIZE]; | ||
2316 | +}; | ||
2317 | |||
2318 | /*======================================================================== | ||
2319 | * Function prototypes for the kernel. | ||
2320 | diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c | ||
2321 | index 3e39b7d40f256..de9096b8a47c6 100644 | ||
2322 | --- a/fs/xfs/libxfs/xfs_attr_remote.c | ||
2323 | +++ b/fs/xfs/libxfs/xfs_attr_remote.c | ||
2324 | @@ -24,6 +24,23 @@ | ||
2325 | |||
2326 | #define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */ | ||
2327 | |||
2328 | +/* | ||
2329 | + * Remote Attribute Values | ||
2330 | + * ======================= | ||
2331 | + * | ||
2332 | + * Remote extended attribute values are conceptually simple -- they're written | ||
2333 | + * to data blocks mapped by an inode's attribute fork, and they have an upper | ||
2334 | + * size limit of 64k. Setting a value does not involve the XFS log. | ||
2335 | + * | ||
2336 | + * However, on a v5 filesystem, maximally sized remote attr values require one | ||
2337 | + * block more than 64k worth of space to hold the remote attribute value | ||
2338 | + * header (64 bytes). On a 4k block filesystem this results in a 68k buffer; | ||
2339 | + * on a 64k block filesystem, this would be a 128k buffer. Note that the log | ||
2340 | + * format can only handle a dirty buffer of XFS_MAX_BLOCKSIZE length (64k). | ||
2341 | + * Therefore, we /must/ ensure that remote attribute value buffers never touch | ||
2342 | + * the logging system and therefore never have a log item. | ||
2343 | + */ | ||
2344 | + | ||
2345 | /* | ||
2346 | * Each contiguous block has a header, so it is not just a simple attribute | ||
2347 | * length to FSB conversion. | ||
2348 | @@ -400,17 +417,25 @@ xfs_attr_rmtval_get( | ||
2349 | (map[i].br_startblock != HOLESTARTBLOCK)); | ||
2350 | dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); | ||
2351 | dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); | ||
2352 | - error = xfs_trans_read_buf(mp, args->trans, | ||
2353 | - mp->m_ddev_targp, | ||
2354 | - dblkno, dblkcnt, 0, &bp, | ||
2355 | - &xfs_attr3_rmt_buf_ops); | ||
2356 | - if (error) | ||
2357 | + bp = xfs_buf_read(mp->m_ddev_targp, dblkno, dblkcnt, 0, | ||
2358 | + &xfs_attr3_rmt_buf_ops); | ||
2359 | + if (!bp) | ||
2360 | + return -ENOMEM; | ||
2361 | + error = bp->b_error; | ||
2362 | + if (error) { | ||
2363 | + xfs_buf_ioerror_alert(bp, __func__); | ||
2364 | + xfs_buf_relse(bp); | ||
2365 | + | ||
2366 | + /* bad CRC means corrupted metadata */ | ||
2367 | + if (error == -EFSBADCRC) | ||
2368 | + error = -EFSCORRUPTED; | ||
2369 | return error; | ||
2370 | + } | ||
2371 | |||
2372 | error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino, | ||
2373 | &offset, &valuelen, | ||
2374 | &dst); | ||
2375 | - xfs_trans_brelse(args->trans, bp); | ||
2376 | + xfs_buf_relse(bp); | ||
2377 | if (error) | ||
2378 | return error; | ||
2379 | |||
2380 | @@ -551,6 +576,32 @@ xfs_attr_rmtval_set( | ||
2381 | return 0; | ||
2382 | } | ||
2383 | |||
2384 | +/* Mark stale any incore buffers for the remote value. */ | ||
2385 | +int | ||
2386 | +xfs_attr_rmtval_stale( | ||
2387 | + struct xfs_inode *ip, | ||
2388 | + struct xfs_bmbt_irec *map, | ||
2389 | + xfs_buf_flags_t incore_flags) | ||
2390 | +{ | ||
2391 | + struct xfs_mount *mp = ip->i_mount; | ||
2392 | + struct xfs_buf *bp; | ||
2393 | + | ||
2394 | + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); | ||
2395 | + | ||
2396 | + ASSERT((map->br_startblock != DELAYSTARTBLOCK) && | ||
2397 | + (map->br_startblock != HOLESTARTBLOCK)); | ||
2398 | + | ||
2399 | + bp = xfs_buf_incore(mp->m_ddev_targp, | ||
2400 | + XFS_FSB_TO_DADDR(mp, map->br_startblock), | ||
2401 | + XFS_FSB_TO_BB(mp, map->br_blockcount), incore_flags); | ||
2402 | + if (bp) { | ||
2403 | + xfs_buf_stale(bp); | ||
2404 | + xfs_buf_relse(bp); | ||
2405 | + } | ||
2406 | + | ||
2407 | + return 0; | ||
2408 | +} | ||
2409 | + | ||
2410 | /* | ||
2411 | * Remove the value associated with an attribute by deleting the | ||
2412 | * out-of-line buffer that it is stored on. | ||
2413 | @@ -559,7 +610,6 @@ int | ||
2414 | xfs_attr_rmtval_remove( | ||
2415 | struct xfs_da_args *args) | ||
2416 | { | ||
2417 | - struct xfs_mount *mp = args->dp->i_mount; | ||
2418 | xfs_dablk_t lblkno; | ||
2419 | int blkcnt; | ||
2420 | int error; | ||
2421 | @@ -574,9 +624,6 @@ xfs_attr_rmtval_remove( | ||
2422 | blkcnt = args->rmtblkcnt; | ||
2423 | while (blkcnt > 0) { | ||
2424 | struct xfs_bmbt_irec map; | ||
2425 | - struct xfs_buf *bp; | ||
2426 | - xfs_daddr_t dblkno; | ||
2427 | - int dblkcnt; | ||
2428 | int nmap; | ||
2429 | |||
2430 | /* | ||
2431 | @@ -588,21 +635,9 @@ xfs_attr_rmtval_remove( | ||
2432 | if (error) | ||
2433 | return error; | ||
2434 | ASSERT(nmap == 1); | ||
2435 | - ASSERT((map.br_startblock != DELAYSTARTBLOCK) && | ||
2436 | - (map.br_startblock != HOLESTARTBLOCK)); | ||
2437 | - | ||
2438 | - dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), | ||
2439 | - dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); | ||
2440 | - | ||
2441 | - /* | ||
2442 | - * If the "remote" value is in the cache, remove it. | ||
2443 | - */ | ||
2444 | - bp = xfs_buf_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK); | ||
2445 | - if (bp) { | ||
2446 | - xfs_buf_stale(bp); | ||
2447 | - xfs_buf_relse(bp); | ||
2448 | - bp = NULL; | ||
2449 | - } | ||
2450 | + error = xfs_attr_rmtval_stale(args->dp, &map, XBF_TRYLOCK); | ||
2451 | + if (error) | ||
2452 | + return error; | ||
2453 | |||
2454 | lblkno += map.br_blockcount; | ||
2455 | blkcnt -= map.br_blockcount; | ||
2456 | diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h | ||
2457 | index 9d20b66ad379e..6fb4572845ce8 100644 | ||
2458 | --- a/fs/xfs/libxfs/xfs_attr_remote.h | ||
2459 | +++ b/fs/xfs/libxfs/xfs_attr_remote.h | ||
2460 | @@ -11,5 +11,7 @@ int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen); | ||
2461 | int xfs_attr_rmtval_get(struct xfs_da_args *args); | ||
2462 | int xfs_attr_rmtval_set(struct xfs_da_args *args); | ||
2463 | int xfs_attr_rmtval_remove(struct xfs_da_args *args); | ||
2464 | +int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map, | ||
2465 | + xfs_buf_flags_t incore_flags); | ||
2466 | |||
2467 | #endif /* __XFS_ATTR_REMOTE_H__ */ | ||
2468 | diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h | ||
2469 | index ae0bbd20d9caf..588e4674e931f 100644 | ||
2470 | --- a/fs/xfs/libxfs/xfs_da_btree.h | ||
2471 | +++ b/fs/xfs/libxfs/xfs_da_btree.h | ||
2472 | @@ -82,6 +82,7 @@ typedef struct xfs_da_args { | ||
2473 | #define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */ | ||
2474 | #define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */ | ||
2475 | #define XFS_DA_OP_ALLOCVAL 0x0020 /* lookup to alloc buffer if found */ | ||
2476 | +#define XFS_DA_OP_INCOMPLETE 0x0040 /* lookup INCOMPLETE attr keys */ | ||
2477 | |||
2478 | #define XFS_DA_OP_FLAGS \ | ||
2479 | { XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \ | ||
2480 | @@ -89,7 +90,8 @@ typedef struct xfs_da_args { | ||
2481 | { XFS_DA_OP_ADDNAME, "ADDNAME" }, \ | ||
2482 | { XFS_DA_OP_OKNOENT, "OKNOENT" }, \ | ||
2483 | { XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \ | ||
2484 | - { XFS_DA_OP_ALLOCVAL, "ALLOCVAL" } | ||
2485 | + { XFS_DA_OP_ALLOCVAL, "ALLOCVAL" }, \ | ||
2486 | + { XFS_DA_OP_INCOMPLETE, "INCOMPLETE" } | ||
2487 | |||
2488 | /* | ||
2489 | * Storage for holding state during Btree searches and split/join ops. | ||
2490 | @@ -124,6 +126,19 @@ typedef struct xfs_da_state { | ||
2491 | /* for dirv2 extrablk is data */ | ||
2492 | } xfs_da_state_t; | ||
2493 | |||
2494 | +/* | ||
2495 | + * In-core version of the node header to abstract the differences in the v2 and | ||
2496 | + * v3 disk format of the headers. Callers need to convert to/from disk format as | ||
2497 | + * appropriate. | ||
2498 | + */ | ||
2499 | +struct xfs_da3_icnode_hdr { | ||
2500 | + uint32_t forw; | ||
2501 | + uint32_t back; | ||
2502 | + uint16_t magic; | ||
2503 | + uint16_t count; | ||
2504 | + uint16_t level; | ||
2505 | +}; | ||
2506 | + | ||
2507 | /* | ||
2508 | * Utility macros to aid in logging changed structure fields. | ||
2509 | */ | ||
2510 | diff --git a/fs/xfs/libxfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c | ||
2511 | index b1ae572496b69..31bb250c18992 100644 | ||
2512 | --- a/fs/xfs/libxfs/xfs_da_format.c | ||
2513 | +++ b/fs/xfs/libxfs/xfs_da_format.c | ||
2514 | @@ -13,6 +13,7 @@ | ||
2515 | #include "xfs_mount.h" | ||
2516 | #include "xfs_inode.h" | ||
2517 | #include "xfs_dir2.h" | ||
2518 | +#include "xfs_dir2_priv.h" | ||
2519 | |||
2520 | /* | ||
2521 | * Shortform directory ops | ||
2522 | diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h | ||
2523 | index ae654e06b2fb6..222ee48da5e80 100644 | ||
2524 | --- a/fs/xfs/libxfs/xfs_da_format.h | ||
2525 | +++ b/fs/xfs/libxfs/xfs_da_format.h | ||
2526 | @@ -93,19 +93,6 @@ struct xfs_da3_intnode { | ||
2527 | struct xfs_da_node_entry __btree[]; | ||
2528 | }; | ||
2529 | |||
2530 | -/* | ||
2531 | - * In-core version of the node header to abstract the differences in the v2 and | ||
2532 | - * v3 disk format of the headers. Callers need to convert to/from disk format as | ||
2533 | - * appropriate. | ||
2534 | - */ | ||
2535 | -struct xfs_da3_icnode_hdr { | ||
2536 | - uint32_t forw; | ||
2537 | - uint32_t back; | ||
2538 | - uint16_t magic; | ||
2539 | - uint16_t count; | ||
2540 | - uint16_t level; | ||
2541 | -}; | ||
2542 | - | ||
2543 | /* | ||
2544 | * Directory version 2. | ||
2545 | * | ||
2546 | @@ -434,14 +421,6 @@ struct xfs_dir3_leaf_hdr { | ||
2547 | __be32 pad; /* 64 bit alignment */ | ||
2548 | }; | ||
2549 | |||
2550 | -struct xfs_dir3_icleaf_hdr { | ||
2551 | - uint32_t forw; | ||
2552 | - uint32_t back; | ||
2553 | - uint16_t magic; | ||
2554 | - uint16_t count; | ||
2555 | - uint16_t stale; | ||
2556 | -}; | ||
2557 | - | ||
2558 | /* | ||
2559 | * Leaf block entry. | ||
2560 | */ | ||
2561 | @@ -520,19 +499,6 @@ struct xfs_dir3_free { | ||
2562 | |||
2563 | #define XFS_DIR3_FREE_CRC_OFF offsetof(struct xfs_dir3_free, hdr.hdr.crc) | ||
2564 | |||
2565 | -/* | ||
2566 | - * In core version of the free block header, abstracted away from on-disk format | ||
2567 | - * differences. Use this in the code, and convert to/from the disk version using | ||
2568 | - * xfs_dir3_free_hdr_from_disk/xfs_dir3_free_hdr_to_disk. | ||
2569 | - */ | ||
2570 | -struct xfs_dir3_icfree_hdr { | ||
2571 | - uint32_t magic; | ||
2572 | - uint32_t firstdb; | ||
2573 | - uint32_t nvalid; | ||
2574 | - uint32_t nused; | ||
2575 | - | ||
2576 | -}; | ||
2577 | - | ||
2578 | /* | ||
2579 | * Single block format. | ||
2580 | * | ||
2581 | @@ -709,29 +675,6 @@ struct xfs_attr3_leafblock { | ||
2582 | */ | ||
2583 | }; | ||
2584 | |||
2585 | -/* | ||
2586 | - * incore, neutral version of the attribute leaf header | ||
2587 | - */ | ||
2588 | -struct xfs_attr3_icleaf_hdr { | ||
2589 | - uint32_t forw; | ||
2590 | - uint32_t back; | ||
2591 | - uint16_t magic; | ||
2592 | - uint16_t count; | ||
2593 | - uint16_t usedbytes; | ||
2594 | - /* | ||
2595 | - * firstused is 32-bit here instead of 16-bit like the on-disk variant | ||
2596 | - * to support maximum fsb size of 64k without overflow issues throughout | ||
2597 | - * the attr code. Instead, the overflow condition is handled on | ||
2598 | - * conversion to/from disk. | ||
2599 | - */ | ||
2600 | - uint32_t firstused; | ||
2601 | - __u8 holes; | ||
2602 | - struct { | ||
2603 | - uint16_t base; | ||
2604 | - uint16_t size; | ||
2605 | - } freemap[XFS_ATTR_LEAF_MAPSIZE]; | ||
2606 | -}; | ||
2607 | - | ||
2608 | /* | ||
2609 | * Special value to represent fs block size in the leaf header firstused field. | ||
2610 | * Only used when block size overflows the 2-bytes available on disk. | ||
2611 | @@ -740,8 +683,6 @@ struct xfs_attr3_icleaf_hdr { | ||
2612 | |||
2613 | /* | ||
2614 | * Flags used in the leaf_entry[i].flags field. | ||
2615 | - * NOTE: the INCOMPLETE bit must not collide with the flags bits specified | ||
2616 | - * on the system call, they are "or"ed together for various operations. | ||
2617 | */ | ||
2618 | #define XFS_ATTR_LOCAL_BIT 0 /* attr is stored locally */ | ||
2619 | #define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */ | ||
2620 | diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h | ||
2621 | index f542447794928..e170792c0acce 100644 | ||
2622 | --- a/fs/xfs/libxfs/xfs_dir2.h | ||
2623 | +++ b/fs/xfs/libxfs/xfs_dir2.h | ||
2624 | @@ -18,6 +18,8 @@ struct xfs_dir2_sf_entry; | ||
2625 | struct xfs_dir2_data_hdr; | ||
2626 | struct xfs_dir2_data_entry; | ||
2627 | struct xfs_dir2_data_unused; | ||
2628 | +struct xfs_dir3_icfree_hdr; | ||
2629 | +struct xfs_dir3_icleaf_hdr; | ||
2630 | |||
2631 | extern struct xfs_name xfs_name_dotdot; | ||
2632 | |||
2633 | diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h | ||
2634 | index 59f9fb2241a5f..d2eaea663e7f2 100644 | ||
2635 | --- a/fs/xfs/libxfs/xfs_dir2_priv.h | ||
2636 | +++ b/fs/xfs/libxfs/xfs_dir2_priv.h | ||
2637 | @@ -8,6 +8,25 @@ | ||
2638 | |||
2639 | struct dir_context; | ||
2640 | |||
2641 | +/* | ||
2642 | + * In-core version of the leaf and free block headers to abstract the | ||
2643 | + * differences in the v2 and v3 disk format of the headers. | ||
2644 | + */ | ||
2645 | +struct xfs_dir3_icleaf_hdr { | ||
2646 | + uint32_t forw; | ||
2647 | + uint32_t back; | ||
2648 | + uint16_t magic; | ||
2649 | + uint16_t count; | ||
2650 | + uint16_t stale; | ||
2651 | +}; | ||
2652 | + | ||
2653 | +struct xfs_dir3_icfree_hdr { | ||
2654 | + uint32_t magic; | ||
2655 | + uint32_t firstdb; | ||
2656 | + uint32_t nvalid; | ||
2657 | + uint32_t nused; | ||
2658 | +}; | ||
2659 | + | ||
2660 | /* xfs_dir2.c */ | ||
2661 | extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space, | ||
2662 | xfs_dir2_db_t *dbp); | ||
2663 | diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h | ||
2664 | index c968b60cee15b..28203b626f6a2 100644 | ||
2665 | --- a/fs/xfs/libxfs/xfs_format.h | ||
2666 | +++ b/fs/xfs/libxfs/xfs_format.h | ||
2667 | @@ -1540,6 +1540,13 @@ typedef struct xfs_bmdr_block { | ||
2668 | #define BMBT_BLOCKCOUNT_BITLEN 21 | ||
2669 | |||
2670 | #define BMBT_STARTOFF_MASK ((1ULL << BMBT_STARTOFF_BITLEN) - 1) | ||
2671 | +#define BMBT_BLOCKCOUNT_MASK ((1ULL << BMBT_BLOCKCOUNT_BITLEN) - 1) | ||
2672 | + | ||
2673 | +/* | ||
2674 | + * bmbt records have a file offset (block) field that is 54 bits wide, so this | ||
2675 | + * is the largest xfs_fileoff_t that we ever expect to see. | ||
2676 | + */ | ||
2677 | +#define XFS_MAX_FILEOFF (BMBT_STARTOFF_MASK + BMBT_BLOCKCOUNT_MASK) | ||
2678 | |||
2679 | typedef struct xfs_bmbt_rec { | ||
2680 | __be64 l0, l1; | ||
2681 | diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c | ||
2682 | index 766b1386402a0..9c88203b537b1 100644 | ||
2683 | --- a/fs/xfs/xfs_attr_inactive.c | ||
2684 | +++ b/fs/xfs/xfs_attr_inactive.c | ||
2685 | @@ -25,22 +25,18 @@ | ||
2686 | #include "xfs_error.h" | ||
2687 | |||
2688 | /* | ||
2689 | - * Look at all the extents for this logical region, | ||
2690 | - * invalidate any buffers that are incore/in transactions. | ||
2691 | + * Invalidate any incore buffers associated with this remote attribute value | ||
2692 | + * extent. We never log remote attribute value buffers, which means that they | ||
2693 | + * won't be attached to a transaction and are therefore safe to mark stale. | ||
2694 | + * The actual bunmapi will be taken care of later. | ||
2695 | */ | ||
2696 | STATIC int | ||
2697 | -xfs_attr3_leaf_freextent( | ||
2698 | - struct xfs_trans **trans, | ||
2699 | +xfs_attr3_rmt_stale( | ||
2700 | struct xfs_inode *dp, | ||
2701 | xfs_dablk_t blkno, | ||
2702 | int blkcnt) | ||
2703 | { | ||
2704 | struct xfs_bmbt_irec map; | ||
2705 | - struct xfs_buf *bp; | ||
2706 | - xfs_dablk_t tblkno; | ||
2707 | - xfs_daddr_t dblkno; | ||
2708 | - int tblkcnt; | ||
2709 | - int dblkcnt; | ||
2710 | int nmap; | ||
2711 | int error; | ||
2712 | |||
2713 | @@ -48,47 +44,28 @@ xfs_attr3_leaf_freextent( | ||
2714 | * Roll through the "value", invalidating the attribute value's | ||
2715 | * blocks. | ||
2716 | */ | ||
2717 | - tblkno = blkno; | ||
2718 | - tblkcnt = blkcnt; | ||
2719 | - while (tblkcnt > 0) { | ||
2720 | + while (blkcnt > 0) { | ||
2721 | /* | ||
2722 | * Try to remember where we decided to put the value. | ||
2723 | */ | ||
2724 | nmap = 1; | ||
2725 | - error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt, | ||
2726 | + error = xfs_bmapi_read(dp, (xfs_fileoff_t)blkno, blkcnt, | ||
2727 | &map, &nmap, XFS_BMAPI_ATTRFORK); | ||
2728 | - if (error) { | ||
2729 | + if (error) | ||
2730 | return error; | ||
2731 | - } | ||
2732 | ASSERT(nmap == 1); | ||
2733 | - ASSERT(map.br_startblock != DELAYSTARTBLOCK); | ||
2734 | |||
2735 | /* | ||
2736 | - * If it's a hole, these are already unmapped | ||
2737 | - * so there's nothing to invalidate. | ||
2738 | + * Mark any incore buffers for the remote value as stale. We | ||
2739 | + * never log remote attr value buffers, so the buffer should be | ||
2740 | + * easy to kill. | ||
2741 | */ | ||
2742 | - if (map.br_startblock != HOLESTARTBLOCK) { | ||
2743 | - | ||
2744 | - dblkno = XFS_FSB_TO_DADDR(dp->i_mount, | ||
2745 | - map.br_startblock); | ||
2746 | - dblkcnt = XFS_FSB_TO_BB(dp->i_mount, | ||
2747 | - map.br_blockcount); | ||
2748 | - bp = xfs_trans_get_buf(*trans, | ||
2749 | - dp->i_mount->m_ddev_targp, | ||
2750 | - dblkno, dblkcnt, 0); | ||
2751 | - if (!bp) | ||
2752 | - return -ENOMEM; | ||
2753 | - xfs_trans_binval(*trans, bp); | ||
2754 | - /* | ||
2755 | - * Roll to next transaction. | ||
2756 | - */ | ||
2757 | - error = xfs_trans_roll_inode(trans, dp); | ||
2758 | - if (error) | ||
2759 | - return error; | ||
2760 | - } | ||
2761 | + error = xfs_attr_rmtval_stale(dp, &map, 0); | ||
2762 | + if (error) | ||
2763 | + return error; | ||
2764 | |||
2765 | - tblkno += map.br_blockcount; | ||
2766 | - tblkcnt -= map.br_blockcount; | ||
2767 | + blkno += map.br_blockcount; | ||
2768 | + blkcnt -= map.br_blockcount; | ||
2769 | } | ||
2770 | |||
2771 | return 0; | ||
2772 | @@ -102,86 +79,45 @@ xfs_attr3_leaf_freextent( | ||
2773 | */ | ||
2774 | STATIC int | ||
2775 | xfs_attr3_leaf_inactive( | ||
2776 | - struct xfs_trans **trans, | ||
2777 | - struct xfs_inode *dp, | ||
2778 | - struct xfs_buf *bp) | ||
2779 | + struct xfs_trans **trans, | ||
2780 | + struct xfs_inode *dp, | ||
2781 | + struct xfs_buf *bp) | ||
2782 | { | ||
2783 | - struct xfs_attr_leafblock *leaf; | ||
2784 | - struct xfs_attr3_icleaf_hdr ichdr; | ||
2785 | - struct xfs_attr_leaf_entry *entry; | ||
2786 | + struct xfs_attr3_icleaf_hdr ichdr; | ||
2787 | + struct xfs_mount *mp = bp->b_mount; | ||
2788 | + struct xfs_attr_leafblock *leaf = bp->b_addr; | ||
2789 | + struct xfs_attr_leaf_entry *entry; | ||
2790 | struct xfs_attr_leaf_name_remote *name_rmt; | ||
2791 | - struct xfs_attr_inactive_list *list; | ||
2792 | - struct xfs_attr_inactive_list *lp; | ||
2793 | - int error; | ||
2794 | - int count; | ||
2795 | - int size; | ||
2796 | - int tmp; | ||
2797 | - int i; | ||
2798 | - struct xfs_mount *mp = bp->b_mount; | ||
2799 | + int error = 0; | ||
2800 | + int i; | ||
2801 | |||
2802 | - leaf = bp->b_addr; | ||
2803 | xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); | ||
2804 | |||
2805 | /* | ||
2806 | - * Count the number of "remote" value extents. | ||
2807 | + * Find the remote value extents for this leaf and invalidate their | ||
2808 | + * incore buffers. | ||
2809 | */ | ||
2810 | - count = 0; | ||
2811 | entry = xfs_attr3_leaf_entryp(leaf); | ||
2812 | for (i = 0; i < ichdr.count; entry++, i++) { | ||
2813 | - if (be16_to_cpu(entry->nameidx) && | ||
2814 | - ((entry->flags & XFS_ATTR_LOCAL) == 0)) { | ||
2815 | - name_rmt = xfs_attr3_leaf_name_remote(leaf, i); | ||
2816 | - if (name_rmt->valueblk) | ||
2817 | - count++; | ||
2818 | - } | ||
2819 | - } | ||
2820 | - | ||
2821 | - /* | ||
2822 | - * If there are no "remote" values, we're done. | ||
2823 | - */ | ||
2824 | - if (count == 0) { | ||
2825 | - xfs_trans_brelse(*trans, bp); | ||
2826 | - return 0; | ||
2827 | - } | ||
2828 | + int blkcnt; | ||
2829 | |||
2830 | - /* | ||
2831 | - * Allocate storage for a list of all the "remote" value extents. | ||
2832 | - */ | ||
2833 | - size = count * sizeof(xfs_attr_inactive_list_t); | ||
2834 | - list = kmem_alloc(size, 0); | ||
2835 | - | ||
2836 | - /* | ||
2837 | - * Identify each of the "remote" value extents. | ||
2838 | - */ | ||
2839 | - lp = list; | ||
2840 | - entry = xfs_attr3_leaf_entryp(leaf); | ||
2841 | - for (i = 0; i < ichdr.count; entry++, i++) { | ||
2842 | - if (be16_to_cpu(entry->nameidx) && | ||
2843 | - ((entry->flags & XFS_ATTR_LOCAL) == 0)) { | ||
2844 | - name_rmt = xfs_attr3_leaf_name_remote(leaf, i); | ||
2845 | - if (name_rmt->valueblk) { | ||
2846 | - lp->valueblk = be32_to_cpu(name_rmt->valueblk); | ||
2847 | - lp->valuelen = xfs_attr3_rmt_blocks(dp->i_mount, | ||
2848 | - be32_to_cpu(name_rmt->valuelen)); | ||
2849 | - lp++; | ||
2850 | - } | ||
2851 | - } | ||
2852 | - } | ||
2853 | - xfs_trans_brelse(*trans, bp); /* unlock for trans. in freextent() */ | ||
2854 | + if (!entry->nameidx || (entry->flags & XFS_ATTR_LOCAL)) | ||
2855 | + continue; | ||
2856 | |||
2857 | - /* | ||
2858 | - * Invalidate each of the "remote" value extents. | ||
2859 | - */ | ||
2860 | - error = 0; | ||
2861 | - for (lp = list, i = 0; i < count; i++, lp++) { | ||
2862 | - tmp = xfs_attr3_leaf_freextent(trans, dp, | ||
2863 | - lp->valueblk, lp->valuelen); | ||
2864 | + name_rmt = xfs_attr3_leaf_name_remote(leaf, i); | ||
2865 | + if (!name_rmt->valueblk) | ||
2866 | + continue; | ||
2867 | |||
2868 | - if (error == 0) | ||
2869 | - error = tmp; /* save only the 1st errno */ | ||
2870 | + blkcnt = xfs_attr3_rmt_blocks(dp->i_mount, | ||
2871 | + be32_to_cpu(name_rmt->valuelen)); | ||
2872 | + error = xfs_attr3_rmt_stale(dp, | ||
2873 | + be32_to_cpu(name_rmt->valueblk), blkcnt); | ||
2874 | + if (error) | ||
2875 | + goto err; | ||
2876 | } | ||
2877 | |||
2878 | - kmem_free(list); | ||
2879 | + xfs_trans_brelse(*trans, bp); | ||
2880 | +err: | ||
2881 | return error; | ||
2882 | } | ||
2883 | |||
2884 | diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c | ||
2885 | index 203065a647652..e41c13ffa5a43 100644 | ||
2886 | --- a/fs/xfs/xfs_file.c | ||
2887 | +++ b/fs/xfs/xfs_file.c | ||
2888 | @@ -187,7 +187,12 @@ xfs_file_dio_aio_read( | ||
2889 | |||
2890 | file_accessed(iocb->ki_filp); | ||
2891 | |||
2892 | - xfs_ilock(ip, XFS_IOLOCK_SHARED); | ||
2893 | + if (iocb->ki_flags & IOCB_NOWAIT) { | ||
2894 | + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) | ||
2895 | + return -EAGAIN; | ||
2896 | + } else { | ||
2897 | + xfs_ilock(ip, XFS_IOLOCK_SHARED); | ||
2898 | + } | ||
2899 | ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL); | ||
2900 | xfs_iunlock(ip, XFS_IOLOCK_SHARED); | ||
2901 | |||
2902 | diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c | ||
2903 | index 7b72c189cff0b..30202d8c25e4f 100644 | ||
2904 | --- a/fs/xfs/xfs_inode.c | ||
2905 | +++ b/fs/xfs/xfs_inode.c | ||
2906 | @@ -1513,10 +1513,8 @@ xfs_itruncate_extents_flags( | ||
2907 | struct xfs_mount *mp = ip->i_mount; | ||
2908 | struct xfs_trans *tp = *tpp; | ||
2909 | xfs_fileoff_t first_unmap_block; | ||
2910 | - xfs_fileoff_t last_block; | ||
2911 | xfs_filblks_t unmap_len; | ||
2912 | int error = 0; | ||
2913 | - int done = 0; | ||
2914 | |||
2915 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); | ||
2916 | ASSERT(!atomic_read(&VFS_I(ip)->i_count) || | ||
2917 | @@ -1536,21 +1534,22 @@ xfs_itruncate_extents_flags( | ||
2918 | * the end of the file (in a crash where the space is allocated | ||
2919 | * but the inode size is not yet updated), simply remove any | ||
2920 | * blocks which show up between the new EOF and the maximum | ||
2921 | - * possible file size. If the first block to be removed is | ||
2922 | - * beyond the maximum file size (ie it is the same as last_block), | ||
2923 | - * then there is nothing to do. | ||
2924 | + * possible file size. | ||
2925 | + * | ||
2926 | + * We have to free all the blocks to the bmbt maximum offset, even if | ||
2927 | + * the page cache can't scale that far. | ||
2928 | */ | ||
2929 | first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size); | ||
2930 | - last_block = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); | ||
2931 | - if (first_unmap_block == last_block) | ||
2932 | + if (first_unmap_block >= XFS_MAX_FILEOFF) { | ||
2933 | + WARN_ON_ONCE(first_unmap_block > XFS_MAX_FILEOFF); | ||
2934 | return 0; | ||
2935 | + } | ||
2936 | |||
2937 | - ASSERT(first_unmap_block < last_block); | ||
2938 | - unmap_len = last_block - first_unmap_block + 1; | ||
2939 | - while (!done) { | ||
2940 | + unmap_len = XFS_MAX_FILEOFF - first_unmap_block + 1; | ||
2941 | + while (unmap_len > 0) { | ||
2942 | ASSERT(tp->t_firstblock == NULLFSBLOCK); | ||
2943 | - error = xfs_bunmapi(tp, ip, first_unmap_block, unmap_len, flags, | ||
2944 | - XFS_ITRUNC_MAX_EXTENTS, &done); | ||
2945 | + error = __xfs_bunmapi(tp, ip, first_unmap_block, &unmap_len, | ||
2946 | + flags, XFS_ITRUNC_MAX_EXTENTS); | ||
2947 | if (error) | ||
2948 | goto out; | ||
2949 | |||
2950 | @@ -1570,7 +1569,7 @@ xfs_itruncate_extents_flags( | ||
2951 | if (whichfork == XFS_DATA_FORK) { | ||
2952 | /* Remove all pending CoW reservations. */ | ||
2953 | error = xfs_reflink_cancel_cow_blocks(ip, &tp, | ||
2954 | - first_unmap_block, last_block, true); | ||
2955 | + first_unmap_block, XFS_MAX_FILEOFF, true); | ||
2956 | if (error) | ||
2957 | goto out; | ||
2958 | |||
2959 | diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c | ||
2960 | index 904d8285c2269..dfbf3f8f1ec86 100644 | ||
2961 | --- a/fs/xfs/xfs_reflink.c | ||
2962 | +++ b/fs/xfs/xfs_reflink.c | ||
2963 | @@ -1544,7 +1544,8 @@ xfs_reflink_clear_inode_flag( | ||
2964 | * We didn't find any shared blocks so turn off the reflink flag. | ||
2965 | * First, get rid of any leftover CoW mappings. | ||
2966 | */ | ||
2967 | - error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF, true); | ||
2968 | + error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, XFS_MAX_FILEOFF, | ||
2969 | + true); | ||
2970 | if (error) | ||
2971 | return error; | ||
2972 | |||
2973 | diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c | ||
2974 | index 8d1df9f8be071..a3a54a0fbffea 100644 | ||
2975 | --- a/fs/xfs/xfs_super.c | ||
2976 | +++ b/fs/xfs/xfs_super.c | ||
2977 | @@ -512,32 +512,6 @@ xfs_showargs( | ||
2978 | seq_puts(m, ",noquota"); | ||
2979 | } | ||
2980 | |||
2981 | -static uint64_t | ||
2982 | -xfs_max_file_offset( | ||
2983 | - unsigned int blockshift) | ||
2984 | -{ | ||
2985 | - unsigned int pagefactor = 1; | ||
2986 | - unsigned int bitshift = BITS_PER_LONG - 1; | ||
2987 | - | ||
2988 | - /* Figure out maximum filesize, on Linux this can depend on | ||
2989 | - * the filesystem blocksize (on 32 bit platforms). | ||
2990 | - * __block_write_begin does this in an [unsigned] long long... | ||
2991 | - * page->index << (PAGE_SHIFT - bbits) | ||
2992 | - * So, for page sized blocks (4K on 32 bit platforms), | ||
2993 | - * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is | ||
2994 | - * (((u64)PAGE_SIZE << (BITS_PER_LONG-1))-1) | ||
2995 | - * but for smaller blocksizes it is less (bbits = log2 bsize). | ||
2996 | - */ | ||
2997 | - | ||
2998 | -#if BITS_PER_LONG == 32 | ||
2999 | - ASSERT(sizeof(sector_t) == 8); | ||
3000 | - pagefactor = PAGE_SIZE; | ||
3001 | - bitshift = BITS_PER_LONG; | ||
3002 | -#endif | ||
3003 | - | ||
3004 | - return (((uint64_t)pagefactor) << bitshift) - 1; | ||
3005 | -} | ||
3006 | - | ||
3007 | /* | ||
3008 | * Set parameters for inode allocation heuristics, taking into account | ||
3009 | * filesystem size and inode32/inode64 mount options; i.e. specifically | ||
3010 | @@ -1650,6 +1624,26 @@ xfs_fs_fill_super( | ||
3011 | if (error) | ||
3012 | goto out_free_sb; | ||
3013 | |||
3014 | + /* | ||
3015 | + * XFS block mappings use 54 bits to store the logical block offset. | ||
3016 | + * This should suffice to handle the maximum file size that the VFS | ||
3017 | + * supports (currently 2^63 bytes on 64-bit and ULONG_MAX << PAGE_SHIFT | ||
3018 | + * bytes on 32-bit), but as XFS and VFS have gotten the s_maxbytes | ||
3019 | + * calculation wrong on 32-bit kernels in the past, we'll add a WARN_ON | ||
3020 | + * to check this assertion. | ||
3021 | + * | ||
3022 | + * Avoid integer overflow by comparing the maximum bmbt offset to the | ||
3023 | + * maximum pagecache offset in units of fs blocks. | ||
3024 | + */ | ||
3025 | + if (XFS_B_TO_FSBT(mp, MAX_LFS_FILESIZE) > XFS_MAX_FILEOFF) { | ||
3026 | + xfs_warn(mp, | ||
3027 | +"MAX_LFS_FILESIZE block offset (%llu) exceeds extent map maximum (%llu)!", | ||
3028 | + XFS_B_TO_FSBT(mp, MAX_LFS_FILESIZE), | ||
3029 | + XFS_MAX_FILEOFF); | ||
3030 | + error = -EINVAL; | ||
3031 | + goto out_free_sb; | ||
3032 | + } | ||
3033 | + | ||
3034 | error = xfs_filestream_mount(mp); | ||
3035 | if (error) | ||
3036 | goto out_free_sb; | ||
3037 | @@ -1661,7 +1655,7 @@ xfs_fs_fill_super( | ||
3038 | sb->s_magic = XFS_SUPER_MAGIC; | ||
3039 | sb->s_blocksize = mp->m_sb.sb_blocksize; | ||
3040 | sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1; | ||
3041 | - sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits); | ||
3042 | + sb->s_maxbytes = MAX_LFS_FILESIZE; | ||
3043 | sb->s_max_links = XFS_MAXLINK; | ||
3044 | sb->s_time_gran = 1; | ||
3045 | sb->s_time_min = S32_MIN; | ||
3046 | diff --git a/include/linux/cpu.h b/include/linux/cpu.h | ||
3047 | index 29a6fa2f518db..b42e9c4134475 100644 | ||
3048 | --- a/include/linux/cpu.h | ||
3049 | +++ b/include/linux/cpu.h | ||
3050 | @@ -68,6 +68,8 @@ extern ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr, | ||
3051 | extern ssize_t cpu_show_mmio_stale_data(struct device *dev, | ||
3052 | struct device_attribute *attr, | ||
3053 | char *buf); | ||
3054 | +extern ssize_t cpu_show_retbleed(struct device *dev, | ||
3055 | + struct device_attribute *attr, char *buf); | ||
3056 | |||
3057 | extern __printf(4, 5) | ||
3058 | struct device *cpu_device_create(struct device *parent, void *drvdata, | ||
3059 | diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h | ||
3060 | index dd4cdad76b18e..ee7d57478a454 100644 | ||
3061 | --- a/include/linux/kvm_host.h | ||
3062 | +++ b/include/linux/kvm_host.h | ||
3063 | @@ -955,7 +955,7 @@ static inline void kvm_arch_end_assignment(struct kvm *kvm) | ||
3064 | { | ||
3065 | } | ||
3066 | |||
3067 | -static inline bool kvm_arch_has_assigned_device(struct kvm *kvm) | ||
3068 | +static __always_inline bool kvm_arch_has_assigned_device(struct kvm *kvm) | ||
3069 | { | ||
3070 | return false; | ||
3071 | } | ||
3072 | diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h | ||
3073 | index 4c56404e53a76..8265b99d6d55b 100644 | ||
3074 | --- a/include/linux/mod_devicetable.h | ||
3075 | +++ b/include/linux/mod_devicetable.h | ||
3076 | @@ -672,9 +672,7 @@ struct x86_cpu_id { | ||
3077 | __u16 steppings; | ||
3078 | }; | ||
3079 | |||
3080 | -#define X86_FEATURE_MATCH(x) \ | ||
3081 | - { X86_VENDOR_ANY, X86_FAMILY_ANY, X86_MODEL_ANY, x } | ||
3082 | - | ||
3083 | +/* Wild cards for x86_cpu_id::vendor, family, model and feature */ | ||
3084 | #define X86_VENDOR_ANY 0xffff | ||
3085 | #define X86_FAMILY_ANY 0 | ||
3086 | #define X86_MODEL_ANY 0 | ||
3087 | diff --git a/scripts/Makefile.extrawarn b/scripts/Makefile.extrawarn | ||
3088 | index 854e2ba9daa29..6a78afc6f13b4 100644 | ||
3089 | --- a/scripts/Makefile.extrawarn | ||
3090 | +++ b/scripts/Makefile.extrawarn | ||
3091 | @@ -50,6 +50,7 @@ KBUILD_CFLAGS += -Wno-sign-compare | ||
3092 | KBUILD_CFLAGS += -Wno-format-zero-length | ||
3093 | KBUILD_CFLAGS += $(call cc-disable-warning, pointer-to-enum-cast) | ||
3094 | KBUILD_CFLAGS += $(call cc-disable-warning, unaligned-access) | ||
3095 | +KBUILD_CFLAGS += $(call cc-disable-warning, cast-function-type-strict) | ||
3096 | endif | ||
3097 | |||
3098 | endif | ||
3099 | diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h | ||
3100 | index 59f924e92c284..3efaf338d3257 100644 | ||
3101 | --- a/tools/arch/x86/include/asm/cpufeatures.h | ||
3102 | +++ b/tools/arch/x86/include/asm/cpufeatures.h | ||
3103 | @@ -284,7 +284,7 @@ | ||
3104 | #define X86_FEATURE_CQM_MBM_LOCAL (11*32+ 3) /* LLC Local MBM monitoring */ | ||
3105 | #define X86_FEATURE_FENCE_SWAPGS_USER (11*32+ 4) /* "" LFENCE in user entry SWAPGS path */ | ||
3106 | #define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */ | ||
3107 | -#define X86_FEATURE_RSB_VMEXIT_LITE (11*32+ 6) /* "" Fill RSB on VM-Exit when EIBRS is enabled */ | ||
3108 | +#define X86_FEATURE_RSB_VMEXIT_LITE (11*32+17) /* "" Fill RSB on VM-Exit when EIBRS is enabled */ | ||
3109 | |||
3110 | /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ | ||
3111 | #define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */ |