Contents of /trunk/kernel-alx/patches-4.9/0275-4.9.176-all-fixes.patch
Revision 3352 - Tue Jun 18 09:42:05 2019 UTC (5 years, 3 months ago) by niro
File size: 192806 byte(s)
-linux-4.9.176
1 | diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu |
2 | index 069e8d52c991..cadb7a9a5218 100644 |
3 | --- a/Documentation/ABI/testing/sysfs-devices-system-cpu |
4 | +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu |
5 | @@ -357,6 +357,7 @@ What: /sys/devices/system/cpu/vulnerabilities |
6 | /sys/devices/system/cpu/vulnerabilities/spectre_v2 |
7 | /sys/devices/system/cpu/vulnerabilities/spec_store_bypass |
8 | /sys/devices/system/cpu/vulnerabilities/l1tf |
9 | + /sys/devices/system/cpu/vulnerabilities/mds |
10 | Date: January 2018 |
11 | Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org> |
12 | Description: Information about CPU vulnerabilities |
13 | @@ -369,8 +370,7 @@ Description: Information about CPU vulnerabilities |
14 | "Vulnerable" CPU is affected and no mitigation in effect |
15 | "Mitigation: $M" CPU is affected and mitigation $M is in effect |
16 | |
17 | - Details about the l1tf file can be found in |
18 | - Documentation/admin-guide/l1tf.rst |
19 | + See also: Documentation/hw-vuln/index.rst |
20 | |
21 | What: /sys/devices/system/cpu/smt |
22 | /sys/devices/system/cpu/smt/active |
23 | diff --git a/Documentation/hw-vuln/index.rst b/Documentation/hw-vuln/index.rst |
24 | new file mode 100644 |
25 | index 000000000000..ffc064c1ec68 |
26 | --- /dev/null |
27 | +++ b/Documentation/hw-vuln/index.rst |
28 | @@ -0,0 +1,13 @@ |
29 | +======================== |
30 | +Hardware vulnerabilities |
31 | +======================== |
32 | + |
33 | +This section describes CPU vulnerabilities and provides an overview of the |
34 | +possible mitigations along with guidance for selecting mitigations if they |
35 | +are configurable at compile, boot or run time. |
36 | + |
37 | +.. toctree:: |
38 | + :maxdepth: 1 |
39 | + |
40 | + l1tf |
41 | + mds |
42 | diff --git a/Documentation/hw-vuln/l1tf.rst b/Documentation/hw-vuln/l1tf.rst |
43 | new file mode 100644 |
44 | index 000000000000..31653a9f0e1b |
45 | --- /dev/null |
46 | +++ b/Documentation/hw-vuln/l1tf.rst |
47 | @@ -0,0 +1,615 @@ |
48 | +L1TF - L1 Terminal Fault |
49 | +======================== |
50 | + |
51 | +L1 Terminal Fault is a hardware vulnerability which allows unprivileged |
52 | +speculative access to data which is available in the Level 1 Data Cache |
53 | +when the page table entry controlling the virtual address, which is used |
54 | +for the access, has the Present bit cleared or other reserved bits set. |
55 | + |
56 | +Affected processors |
57 | +------------------- |
58 | + |
59 | +This vulnerability affects a wide range of Intel processors. The |
60 | +vulnerability is not present on: |
61 | + |
62 | + - Processors from AMD, Centaur and other non Intel vendors |
63 | + |
64 | + - Older processor models, where the CPU family is < 6 |
65 | + |
66 | + - A range of Intel ATOM processors (Cedarview, Cloverview, Lincroft, |
67 | + Penwell, Pineview, Silvermont, Airmont, Merrifield) |
68 | + |
69 | + - The Intel XEON PHI family |
70 | + |
71 | + - Intel processors which have the ARCH_CAP_RDCL_NO bit set in the |
72 | + IA32_ARCH_CAPABILITIES MSR. If the bit is set the CPU is not affected |
73 | + by the Meltdown vulnerability either. These CPUs should become |
74 | + available by the end of 2018. |
75 | + |
76 | +Whether a processor is affected or not can be read out from the L1TF |
77 | +vulnerability file in sysfs. See :ref:`l1tf_sys_info`. |
78 | + |
79 | +Related CVEs |
80 | +------------ |
81 | + |
82 | +The following CVE entries are related to the L1TF vulnerability: |
83 | + |
84 | + ============= ================= ============================== |
85 | + CVE-2018-3615 L1 Terminal Fault SGX related aspects |
86 | + CVE-2018-3620 L1 Terminal Fault OS, SMM related aspects |
87 | + CVE-2018-3646 L1 Terminal Fault Virtualization related aspects |
88 | + ============= ================= ============================== |
89 | + |
90 | +Problem |
91 | +------- |
92 | + |
93 | +If an instruction accesses a virtual address for which the relevant page |
94 | +table entry (PTE) has the Present bit cleared or other reserved bits set, |
95 | +then speculative execution ignores the invalid PTE and loads the referenced |
96 | +data if it is present in the Level 1 Data Cache, as if the page referenced |
97 | +by the address bits in the PTE was still present and accessible. |
98 | + |
99 | +While this is a purely speculative mechanism and the instruction will raise |
100 | +a page fault when it is retired eventually, the pure act of loading the |
101 | +data and making it available to other speculative instructions opens up the |
102 | +opportunity for side channel attacks by unprivileged malicious code, |
103 | +similar to the Meltdown attack. |
104 | + |
105 | +While Meltdown breaks the user space to kernel space protection, L1TF |
106 | +allows attacking any physical memory address in the system, and the attack |
107 | +works across all protection domains. It allows attacks on SGX and also |
108 | +works from inside virtual machines because the speculation bypasses the |
109 | +extended page table (EPT) protection mechanism. |
110 | + |
111 | + |
112 | +Attack scenarios |
113 | +---------------- |
114 | + |
115 | +1. Malicious user space |
116 | +^^^^^^^^^^^^^^^^^^^^^^^ |
117 | + |
118 | + Operating Systems store arbitrary information in the address bits of a |
119 | + PTE which is marked non-present. This allows a malicious user space |
120 | + application to attack the physical memory to which these PTEs resolve. |
121 | + In some cases user-space can maliciously influence the information |
122 | + encoded in the address bits of the PTE, thus making attacks more |
123 | + deterministic and more practical. |
124 | + |
125 | + The Linux kernel contains a mitigation for this attack vector, PTE |
126 | + inversion, which is permanently enabled and has no performance |
127 | + impact. The kernel ensures that the address bits of PTEs, which are not |
128 | + marked present, never point to cacheable physical memory space. |
129 | + |
130 | + A system with an up to date kernel is protected against attacks from |
131 | + malicious user space applications. |
132 | + |
133 | +2. Malicious guest in a virtual machine |
134 | +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
135 | + |
136 | + The fact that L1TF breaks all domain protections allows malicious guest |
137 | + OSes, which can control the PTEs directly, and malicious guest user |
138 | + space applications, which run on an unprotected guest kernel lacking the |
139 | + PTE inversion mitigation for L1TF, to attack physical host memory. |
140 | + |
141 | + A special aspect of L1TF in the context of virtualization is symmetric |
142 | + multi threading (SMT). The Intel implementation of SMT is called |
143 | + HyperThreading. The fact that HyperThreads on the affected processors |
144 | + share the L1 Data Cache (L1D) is important for this. As the flaw only |
145 | + allows attacking data which is present in the L1D, a malicious guest |
146 | + running on one HyperThread can attack the data which is brought into the |
147 | + L1D by the context which runs on the sibling HyperThread of the same physical |
148 | + core. This context can be host OS, host user space or a different guest. |
149 | + |
150 | + If the processor does not support Extended Page Tables, the attack is |
151 | + only possible when the hypervisor does not sanitize the content of the |
152 | + effective (shadow) page tables. |
153 | + |
154 | + While solutions exist to mitigate these attack vectors fully, these |
155 | + mitigations are not enabled by default in the Linux kernel because they |
156 | + can affect performance significantly. The kernel provides several |
157 | + mechanisms which can be utilized to address the problem depending on the |
158 | + deployment scenario. The mitigations, their protection scope and impact |
159 | + are described in the next sections. |
160 | + |
161 | + The default mitigations and the rationale for choosing them are explained |
162 | + at the end of this document. See :ref:`default_mitigations`. |
163 | + |
164 | +.. _l1tf_sys_info: |
165 | + |
166 | +L1TF system information |
167 | +----------------------- |
168 | + |
169 | +The Linux kernel provides a sysfs interface to enumerate the current L1TF |
170 | +status of the system: whether the system is vulnerable, and which |
171 | +mitigations are active. The relevant sysfs file is: |
172 | + |
173 | +/sys/devices/system/cpu/vulnerabilities/l1tf |
174 | + |
175 | +The possible values in this file are: |
176 | + |
177 | + =========================== =============================== |
178 | + 'Not affected' The processor is not vulnerable |
179 | + 'Mitigation: PTE Inversion' The host protection is active |
180 | + =========================== =============================== |
181 | + |
182 | +If KVM/VMX is enabled and the processor is vulnerable then the following |
183 | +information is appended to the 'Mitigation: PTE Inversion' part: |
184 | + |
185 | + - SMT status: |
186 | + |
187 | + ===================== ================ |
188 | + 'VMX: SMT vulnerable' SMT is enabled |
189 | + 'VMX: SMT disabled' SMT is disabled |
190 | + ===================== ================ |
191 | + |
192 | + - L1D Flush mode: |
193 | + |
194 | + ================================ ==================================== |
195 | + 'L1D vulnerable' L1D flushing is disabled |
196 | + |
197 | + 'L1D conditional cache flushes' L1D flush is conditionally enabled |
198 | + |
199 | + 'L1D cache flushes' L1D flush is unconditionally enabled |
200 | + ================================ ==================================== |
201 | + |
202 | +The resulting grade of protection is discussed in the following sections. |
203 | + |
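For scripting, the mitigation state can be read straight from this file. A minimal Python sketch, relying only on the sysfs interface described above::

    from pathlib import Path

    def l1tf_status():
        # Returns the raw status string, or None on kernels which do not
        # expose the vulnerabilities interface.
        p = Path("/sys/devices/system/cpu/vulnerabilities/l1tf")
        return p.read_text().strip() if p.exists() else None

    status = l1tf_status()
    if status is None:
        print("no vulnerabilities interface available")
    elif status == "Not affected":
        print("processor is not vulnerable to L1TF")
    else:
        # e.g. "Mitigation: PTE Inversion; VMX: conditional cache flushes, ..."
        print("L1TF state:", status)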
204 | + |
205 | +Host mitigation mechanism |
206 | +------------------------- |
207 | + |
208 | +The kernel is unconditionally protected against L1TF attacks from malicious |
209 | +user space running on the host. |
210 | + |
211 | + |
212 | +Guest mitigation mechanisms |
213 | +--------------------------- |
214 | + |
215 | +.. _l1d_flush: |
216 | + |
217 | +1. L1D flush on VMENTER |
218 | +^^^^^^^^^^^^^^^^^^^^^^^ |
219 | + |
220 | + To make sure that a guest cannot attack data which is present in the L1D, |
221 | + the hypervisor flushes the L1D before entering the guest. |
222 | + |
223 | + Flushing the L1D evicts not only the data which should not be accessed |
224 | + by a potentially malicious guest, but also the guest's own |
225 | + data. Flushing the L1D has a performance impact as the processor has to |
226 | + bring the flushed guest data back into the L1D. Depending on the |
227 | + frequency of VMEXIT/VMENTER and the type of computations in the guest, |
228 | + performance degradation in the range of 1% to 50% has been observed. For |
229 | + scenarios where guest VMEXIT/VMENTER are rare the performance impact is |
230 | + minimal. Virtio and mechanisms like posted interrupts are designed to |
231 | + confine the VMEXITs to a bare minimum, but specific configurations and |
232 | + application scenarios might still suffer from a high VMEXIT rate. |
233 | + |
234 | + The kernel provides two L1D flush modes: |
235 | + - conditional ('cond') |
236 | + - unconditional ('always') |
237 | + |
238 | + The conditional mode avoids L1D flushing after VMEXITs which execute |
239 | + only audited code paths before the corresponding VMENTER. These code |
240 | + paths have been verified not to expose secrets or other |
241 | + interesting data to an attacker, but they can leak information about the |
242 | + address space layout of the hypervisor. |
243 | + |
244 | + Unconditional mode flushes L1D on all VMENTER invocations and provides |
245 | + maximum protection. It has a higher overhead than the conditional |
246 | + mode. The overhead cannot be quantified correctly as it depends on the |
247 | + workload scenario and the resulting number of VMEXITs. |
248 | + |
249 | + The general recommendation is to enable L1D flush on VMENTER. The kernel |
250 | + defaults to conditional mode on affected processors. |
251 | + |
252 | + **Note** that L1D flush does not prevent the SMT problem because the |
253 | + sibling thread will also bring back its data into the L1D which makes it |
254 | + attackable again. |
255 | + |
256 | + L1D flush can be controlled by the administrator via the kernel command |
257 | + line and sysfs control files. See :ref:`mitigation_control_command_line` |
258 | + and :ref:`mitigation_control_kvm`. |
259 | + |
260 | +.. _guest_confinement: |
261 | + |
262 | +2. Guest VCPU confinement to dedicated physical cores |
263 | +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
264 | + |
265 | + To address the SMT problem, it is possible to make a guest or a group of |
266 | + guests affine to one or more physical cores. The proper mechanism for |
267 | + that is to utilize exclusive cpusets to ensure that no other guest or |
268 | + host tasks can run on these cores. |
269 | + |
270 | + If only a single guest or related guests run on sibling SMT threads on |
271 | + the same physical core then they can only attack their own memory and |
272 | + restricted parts of the host memory. |
273 | + |
274 | + Host memory is attackable when one of the sibling SMT threads runs in |
275 | + host OS (hypervisor) context and the other in guest context. The amount |
276 | + of valuable information from the host OS context depends on the context |
277 | + in which the host OS executes, i.e. interrupts, soft interrupts and kernel |
278 | + threads. The amount of valuable data from these contexts cannot be |
279 | + declared as non-interesting for an attacker without deep inspection of |
280 | + the code. |
281 | + |
282 | + **Note** that assigning guests to a fixed set of physical cores affects |
283 | + the ability of the scheduler to do load balancing and might have |
284 | + negative effects on CPU utilization depending on the hosting |
285 | + scenario. Disabling SMT might be a viable alternative for particular |
286 | + scenarios. |
287 | + |
288 | + For further information about confining guests to a single or to a group |
289 | + of cores consult the cpusets documentation: |
290 | + |
291 | + https://www.kernel.org/doc/Documentation/cgroup-v1/cpusets.txt |
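As an illustration of that approach, a minimal Python sketch using the cgroup-v1 cpuset filesystem; the mount point, group name, CPU list and PID are assumptions, not values from this document::

    import os

    CPUSET = "/sys/fs/cgroup/cpuset"           # assumed cgroup-v1 mount point
    GUEST = os.path.join(CPUSET, "guest0")     # assumed group name

    def write(path, value):
        with open(path, "w") as f:
            f.write(value)

    os.makedirs(GUEST, exist_ok=True)
    write(os.path.join(GUEST, "cpuset.cpus"), "2-3")  # both siblings of one core
    write(os.path.join(GUEST, "cpuset.mems"), "0")
    write(os.path.join(GUEST, "cpuset.cpu_exclusive"), "1")
    # Move the guest's VCPU threads into the exclusive set (PID assumed).
    write(os.path.join(GUEST, "tasks"), "4242")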
292 | + |
293 | +.. _interrupt_isolation: |
294 | + |
295 | +3. Interrupt affinity |
296 | +^^^^^^^^^^^^^^^^^^^^^ |
297 | + |
298 | + Interrupts can be made affine to logical CPUs. This is not universally |
299 | + true because there are types of interrupts which are truly per CPU |
300 | + interrupts, e.g. the local timer interrupt. Aside from that, multi-queue |
301 | + devices affine their interrupts to single CPUs or groups of CPUs per |
302 | + queue without allowing the administrator to control the affinities. |
303 | + |
304 | + Moving the interrupts, which can be affinity controlled, away from CPUs |
305 | + which run untrusted guests, reduces the attack vector space. |
306 | + |
307 | + Whether the interrupts which are affine to CPUs running untrusted |
308 | + guests provide interesting data for an attacker depends on the system |
309 | + configuration and the scenarios which run on the system. While for some |
310 | + of the interrupts it can be assumed that they won't expose interesting |
311 | + information beyond exposing hints about the host OS memory layout, there |
312 | + is no way to make general assumptions. |
313 | + |
314 | + Interrupt affinity can be controlled by the administrator via the |
315 | + /proc/irq/$NR/smp_affinity[_list] files. Limited documentation is |
316 | + available at: |
317 | + |
318 | + https://www.kernel.org/doc/Documentation/IRQ-affinity.txt |
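A minimal Python sketch of that rerouting, using the /proc/irq files named above; the housekeeping CPU list is an assumption::

    import os

    HOUSEKEEPING = "0-1"   # assumed set of CPUs which do not run guests

    for irq in os.listdir("/proc/irq"):
        if not irq.isdigit():
            continue
        try:
            with open("/proc/irq/%s/smp_affinity_list" % irq, "w") as f:
                f.write(HOUSEKEEPING)
        except OSError:
            pass   # truly per-CPU and otherwise unmovable interrupts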
319 | + |
320 | +.. _smt_control: |
321 | + |
322 | +4. SMT control |
323 | +^^^^^^^^^^^^^^ |
324 | + |
325 | + To prevent the SMT issues of L1TF it might be necessary to disable SMT |
326 | + completely. Disabling SMT can have a significant performance impact, but |
327 | + the impact depends on the hosting scenario and the type of workloads. |
328 | + The impact of disabling SMT also needs to be weighed against the impact |
329 | + of other mitigation solutions like confining guests to dedicated cores. |
330 | + |
331 | + The kernel provides a sysfs interface to retrieve the status of SMT and |
332 | + to control it. It also provides a kernel command line interface to |
333 | + control SMT. |
334 | + |
335 | + The kernel command line interface consists of the following options: |
336 | + |
337 | + =========== ========================================================== |
338 | + nosmt Affects the bring up of the secondary CPUs during boot. The |
339 | + kernel tries to bring all present CPUs online during the |
340 | + boot process. "nosmt" makes sure that from each physical |
341 | + core only one - the so-called primary (hyper) thread - is |
342 | + activated. Due to a design flaw of Intel processors related |
343 | + to Machine Check Exceptions the non primary siblings have |
344 | + to be brought up at least partially and are then shut down |
345 | + again. "nosmt" can be undone via the sysfs interface. |
346 | + |
347 | + nosmt=force Has the same effect as "nosmt" but it does not allow |
348 | + undoing the SMT disable via the sysfs interface. |
349 | + =========== ========================================================== |
350 | + |
351 | + The sysfs interface provides two files: |
352 | + |
353 | + - /sys/devices/system/cpu/smt/control |
354 | + - /sys/devices/system/cpu/smt/active |
355 | + |
356 | + /sys/devices/system/cpu/smt/control: |
357 | + |
358 | + This file allows reading the SMT control state and provides the |
359 | + ability to disable or (re)enable SMT. The possible states are: |
360 | + |
361 | + ============== =================================================== |
362 | + on SMT is supported by the CPU and enabled. All |
363 | + logical CPUs can be onlined and offlined without |
364 | + restrictions. |
365 | + |
366 | + off SMT is supported by the CPU and disabled. Only |
367 | + the so called primary SMT threads can be onlined |
368 | + and offlined without restrictions. An attempt to |
369 | + online a non-primary sibling is rejected. |
370 | + |
371 | + forceoff Same as 'off' but the state cannot be controlled. |
372 | + Attempts to write to the control file are rejected. |
373 | + |
374 | + notsupported The processor does not support SMT. It's therefore |
375 | + not affected by the SMT implications of L1TF. |
376 | + Attempts to write to the control file are rejected. |
377 | + ============== =================================================== |
378 | + |
379 | + The possible states which can be written into this file to control SMT |
380 | + state are: |
381 | + |
382 | + - on |
383 | + - off |
384 | + - forceoff |
385 | + |
386 | + /sys/devices/system/cpu/smt/active: |
387 | + |
388 | + This file reports whether SMT is enabled and active, i.e. if on any |
389 | + physical core two or more sibling threads are online. |
390 | + |
391 | + SMT control is also possible at boot time via the l1tf kernel command |
392 | + line parameter in combination with L1D flush control. See |
393 | + :ref:`mitigation_control_command_line`. |
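A minimal Python sketch exercising the two sysfs files described above::

    CONTROL = "/sys/devices/system/cpu/smt/control"
    ACTIVE = "/sys/devices/system/cpu/smt/active"

    with open(CONTROL) as f:
        print("control:", f.read().strip())   # on/off/forceoff/notsupported
    with open(ACTIVE) as f:
        print("active:", f.read().strip())    # 1 if sibling threads are online

    try:
        with open(CONTROL, "w") as f:
            f.write("off")                    # offline all non-primary siblings
    except OSError:
        # rejected for 'forceoff'/'notsupported', or without sufficient rights
        print("SMT state cannot be changed")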
394 | + |
395 | +5. Disabling EPT |
396 | +^^^^^^^^^^^^^^^^ |
397 | + |
398 | + Disabling EPT for virtual machines provides full mitigation for L1TF even |
399 | + with SMT enabled, because the effective page tables for guests are |
400 | + managed and sanitized by the hypervisor. However, disabling EPT has a |
401 | + significant performance impact, especially when the Meltdown mitigation |
402 | + KPTI is enabled. |
403 | + |
404 | + EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter. |
405 | + |
406 | +There is ongoing research and development for new mitigation mechanisms to |
407 | +address the performance impact of disabling SMT or EPT. |
408 | + |
409 | +.. _mitigation_control_command_line: |
410 | + |
411 | +Mitigation control on the kernel command line |
412 | +--------------------------------------------- |
413 | + |
414 | +The kernel command line allows controlling the L1TF mitigations at boot |
415 | +time with the option "l1tf=". The valid arguments for this option are: |
416 | + |
417 | + ============ ============================================================= |
418 | + full Provides all available mitigations for the L1TF |
419 | + vulnerability. Disables SMT and enables all mitigations in |
420 | + the hypervisors, i.e. unconditional L1D flushing |
421 | + |
422 | + SMT control and L1D flush control via the sysfs interface |
423 | + is still possible after boot. Hypervisors will issue a |
424 | + warning when the first VM is started in a potentially |
425 | + insecure configuration, i.e. SMT enabled or L1D flush |
426 | + disabled. |
427 | + |
428 | + full,force Same as 'full', but disables SMT and L1D flush runtime |
429 | + control. Implies the 'nosmt=force' command line option. |
430 | + (i.e. sysfs control of SMT is disabled.) |
431 | + |
432 | + flush Leaves SMT enabled and enables the default hypervisor |
433 | + mitigation, i.e. conditional L1D flushing |
434 | + |
435 | + SMT control and L1D flush control via the sysfs interface |
436 | + is still possible after boot. Hypervisors will issue a |
437 | + warning when the first VM is started in a potentially |
438 | + insecure configuration, i.e. SMT enabled or L1D flush |
439 | + disabled. |
440 | + |
441 | + flush,nosmt Disables SMT and enables the default hypervisor mitigation, |
442 | + i.e. conditional L1D flushing. |
443 | + |
444 | + SMT control and L1D flush control via the sysfs interface |
445 | + is still possible after boot. Hypervisors will issue a |
446 | + warning when the first VM is started in a potentially |
447 | + insecure configuration, i.e. SMT enabled or L1D flush |
448 | + disabled. |
449 | + |
450 | + flush,nowarn Same as 'flush', but hypervisors will not warn when a VM is |
451 | + started in a potentially insecure configuration. |
452 | + |
453 | + off Disables hypervisor mitigations and doesn't emit any |
454 | + warnings. |
455 | + It also drops the swap size and available RAM limit restrictions |
456 | + on both hypervisor and bare metal. |
457 | + |
458 | + ============ ============================================================= |
459 | + |
460 | +The default is 'flush'. For details about L1D flushing see :ref:`l1d_flush`. |
461 | + |
462 | + |
463 | +.. _mitigation_control_kvm: |
464 | + |
465 | +Mitigation control for KVM - module parameter |
466 | +------------------------------------------------------------- |
467 | + |
468 | +The KVM hypervisor mitigation mechanism, flushing the L1D cache when |
469 | +entering a guest, can be controlled with a module parameter. |
470 | + |
471 | +The option/parameter is "kvm-intel.vmentry_l1d_flush=". It takes the |
472 | +following arguments: |
473 | + |
474 | + ============ ============================================================== |
475 | + always L1D cache flush on every VMENTER. |
476 | + |
477 | + cond Flush L1D on VMENTER only when the code between VMEXIT and |
478 | + VMENTER can leak host memory which is considered |
479 | + interesting for an attacker. This still can leak host memory |
480 | + which allows e.g. determining the host's address space layout. |
481 | + |
482 | + never Disables the mitigation |
483 | + ============ ============================================================== |
484 | + |
485 | +The parameter can be provided on the kernel command line, as a module |
486 | +parameter when loading the module, and modified at runtime via the sysfs |
487 | +file: |
488 | + |
489 | +/sys/module/kvm_intel/parameters/vmentry_l1d_flush |
490 | + |
491 | +The default is 'cond'. If 'l1tf=full,force' is given on the kernel command |
492 | +line, then 'always' is enforced and the kvm-intel.vmentry_l1d_flush |
493 | +module parameter is ignored and writes to the sysfs file are rejected. |
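A minimal Python sketch of runtime control through the sysfs file above::

    PARAM = "/sys/module/kvm_intel/parameters/vmentry_l1d_flush"

    with open(PARAM) as f:
        print("current:", f.read().strip())   # cond, always or never

    try:
        with open(PARAM, "w") as f:
            f.write("always")
    except OSError:
        # writes are rejected when 'l1tf=full,force' was given at boot
        print("runtime change not permitted")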
494 | + |
495 | +.. _mitigation_selection: |
496 | + |
497 | +Mitigation selection guide |
498 | +-------------------------- |
499 | + |
500 | +1. No virtualization in use |
501 | +^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
502 | + |
503 | + The system is protected by the kernel unconditionally and no further |
504 | + action is required. |
505 | + |
506 | +2. Virtualization with trusted guests |
507 | +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
508 | + |
509 | + If the guest comes from a trusted source and the guest OS kernel is |
510 | + guaranteed to have the L1TF mitigations in place the system is fully |
511 | + protected against L1TF and no further action is required. |
512 | + |
513 | + To avoid the overhead of the default L1D flushing on VMENTER the |
514 | + administrator can disable the flushing via the kernel command line and |
515 | + sysfs control files. See :ref:`mitigation_control_command_line` and |
516 | + :ref:`mitigation_control_kvm`. |
517 | + |
518 | + |
519 | +3. Virtualization with untrusted guests |
520 | +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
521 | + |
522 | +3.1. SMT not supported or disabled |
523 | +"""""""""""""""""""""""""""""""""" |
524 | + |
525 | + If SMT is not supported by the processor or disabled in the BIOS or by |
526 | + the kernel, it's only required to enforce L1D flushing on VMENTER. |
527 | + |
528 | + Conditional L1D flushing is the default behaviour and can be tuned. See |
529 | + :ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`. |
530 | + |
531 | +3.2. EPT not supported or disabled |
532 | +"""""""""""""""""""""""""""""""""" |
533 | + |
534 | + If EPT is not supported by the processor or disabled in the hypervisor, |
535 | + the system is fully protected. SMT can stay enabled and L1D flushing on |
536 | + VMENTER is not required. |
537 | + |
538 | + EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter. |
539 | + |
540 | +3.3. SMT and EPT supported and active |
541 | +""""""""""""""""""""""""""""""""""""" |
542 | + |
543 | + If SMT and EPT are supported and active then various degrees of |
544 | + mitigations can be employed: |
545 | + |
546 | + - L1D flushing on VMENTER: |
547 | + |
548 | + L1D flushing on VMENTER is the minimal protection requirement, but it |
549 | + is only potent in combination with other mitigation methods. |
550 | + |
551 | + Conditional L1D flushing is the default behaviour and can be tuned. See |
552 | + :ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`. |
553 | + |
554 | + - Guest confinement: |
555 | + |
556 | + Confinement of guests to a single or a group of physical cores which |
557 | + are not running any other processes, can reduce the attack surface |
558 | + significantly, but interrupts, soft interrupts and kernel threads can |
559 | + still expose valuable data to a potential attacker. See |
560 | + :ref:`guest_confinement`. |
561 | + |
562 | + - Interrupt isolation: |
563 | + |
564 | + Isolating the guest CPUs from interrupts can reduce the attack surface |
565 | + further, but still allows a malicious guest to explore a limited amount |
566 | + of host physical memory. This can at least be used to gain knowledge |
567 | + about the host address space layout. The interrupts which have a fixed |
568 | + affinity to the CPUs which run the untrusted guests can, depending on |
569 | + the scenario, still trigger soft interrupts and schedule kernel threads |
570 | + which might expose valuable information. See |
571 | + :ref:`interrupt_isolation`. |
572 | + |
573 | +The above three mitigation methods combined can provide protection to a |
574 | +certain degree, but the risk of the remaining attack surface has to be |
575 | +carefully analyzed. For full protection the following methods are |
576 | +available: |
577 | + |
578 | + - Disabling SMT: |
579 | + |
580 | + Disabling SMT and enforcing the L1D flushing provides the maximum |
581 | + amount of protection. This mitigation does not depend on any of the |
582 | + above mitigation methods. |
583 | + |
584 | + SMT control and L1D flushing can be tuned by the command line |
585 | + parameters 'nosmt', 'l1tf', 'kvm-intel.vmentry_l1d_flush' and at run |
586 | + time with the matching sysfs control files. See :ref:`smt_control`, |
587 | + :ref:`mitigation_control_command_line` and |
588 | + :ref:`mitigation_control_kvm`. |
589 | + |
590 | + - Disabling EPT: |
591 | + |
592 | + Disabling EPT provides the maximum amount of protection as well. It |
593 | + does not depend on any of the above mitigation methods. SMT can stay |
594 | + enabled and L1D flushing is not required, but the performance impact is |
595 | + significant. |
596 | + |
597 | + EPT can be disabled in the hypervisor via the 'kvm-intel.ept' |
598 | + parameter. |
599 | + |
600 | +3.4. Nested virtual machines |
601 | +"""""""""""""""""""""""""""" |
602 | + |
603 | +When nested virtualization is in use, three operating systems are involved: |
604 | +the bare metal hypervisor, the nested hypervisor and the nested virtual |
605 | +machine. VMENTER operations from the nested hypervisor into the nested |
606 | +guest will always be processed by the bare metal hypervisor. If KVM is the |
607 | +bare metal hypervisor it will: |
608 | + |
609 | + - Flush the L1D cache on every switch from the nested hypervisor to the |
610 | + nested virtual machine, so that the nested hypervisor's secrets are not |
611 | + exposed to the nested virtual machine; |
612 | + |
613 | + - Flush the L1D cache on every switch from the nested virtual machine to |
614 | + the nested hypervisor; this is a complex operation, and flushing the L1D |
615 | + cache avoids that the bare metal hypervisor's secrets are exposed to the |
616 | + nested virtual machine; |
617 | + |
618 | + - Instruct the nested hypervisor to not perform any L1D cache flush. This |
619 | + is an optimization to avoid double L1D flushing. |
620 | + |
621 | + |
622 | +.. _default_mitigations: |
623 | + |
624 | +Default mitigations |
625 | +------------------- |
626 | + |
627 | + The kernel default mitigations for vulnerable processors are: |
628 | + |
629 | + - PTE inversion to protect against malicious user space. This is done |
630 | + unconditionally and cannot be controlled. The swap storage is limited |
631 | + to ~16TB. |
632 | + |
633 | + - L1D conditional flushing on VMENTER when EPT is enabled for |
634 | + a guest. |
635 | + |
636 | + The kernel does not by default enforce the disabling of SMT, which leaves |
637 | + SMT systems vulnerable when running untrusted guests with EPT enabled. |
638 | + |
639 | + The rationale for this choice is: |
640 | + |
641 | + - Force disabling SMT can break existing setups, especially with |
642 | + unattended updates. |
643 | + |
644 | + - If regular users run untrusted guests on their machine, then L1TF is |
645 | + just an add-on to other malware which might be embedded in an untrusted |
646 | + guest, e.g. spam-bots or attacks on the local network. |
647 | + |
648 | + There is no technical way to prevent a user from running untrusted code |
649 | + on their machines blindly. |
650 | + |
651 | + - It's technically extremely unlikely and from today's knowledge even |
652 | + impossible that L1TF can be exploited via the most popular attack |
653 | + mechanisms like JavaScript because these mechanisms have no way to |
654 | + control PTEs. If this were possible and no other mitigation were |
655 | + available, then the default might be different. |
656 | + |
657 | + - The administrators of cloud and hosting setups have to carefully |
658 | + analyze the risk for their scenarios and make the appropriate |
659 | + mitigation choices, which might even vary across their deployed |
660 | + machines and also result in other changes of their overall setup. |
661 | + There is no way for the kernel to provide a sensible default for this |
662 | + kind of scenario. |
663 | diff --git a/Documentation/hw-vuln/mds.rst b/Documentation/hw-vuln/mds.rst |
664 | new file mode 100644 |
665 | index 000000000000..daf6fdac49a3 |
666 | --- /dev/null |
667 | +++ b/Documentation/hw-vuln/mds.rst |
668 | @@ -0,0 +1,308 @@ |
669 | +MDS - Microarchitectural Data Sampling |
670 | +====================================== |
671 | + |
672 | +Microarchitectural Data Sampling is a hardware vulnerability which allows |
673 | +unprivileged speculative access to data which is available in various CPU |
674 | +internal buffers. |
675 | + |
676 | +Affected processors |
677 | +------------------- |
678 | + |
679 | +This vulnerability affects a wide range of Intel processors. The |
680 | +vulnerability is not present on: |
681 | + |
682 | + - Processors from AMD, Centaur and other non Intel vendors |
683 | + |
684 | + - Older processor models, where the CPU family is < 6 |
685 | + |
686 | + - Some Atoms (Bonnell, Saltwell, Goldmont, GoldmontPlus) |
687 | + |
688 | + - Intel processors which have the ARCH_CAP_MDS_NO bit set in the |
689 | + IA32_ARCH_CAPABILITIES MSR. |
690 | + |
691 | +Whether a processor is affected or not can be read out from the MDS |
692 | +vulnerability file in sysfs. See :ref:`mds_sys_info`. |
693 | + |
694 | +Not all processors are affected by all variants of MDS, but the mitigation |
695 | +is identical for all of them so the kernel treats them as a single |
696 | +vulnerability. |
697 | + |
698 | +Related CVEs |
699 | +------------ |
700 | + |
701 | +The following CVE entries are related to the MDS vulnerability: |
702 | + |
703 | + ============== ===== =================================================== |
704 | + CVE-2018-12126 MSBDS Microarchitectural Store Buffer Data Sampling |
705 | + CVE-2018-12130 MFBDS Microarchitectural Fill Buffer Data Sampling |
706 | + CVE-2018-12127 MLPDS Microarchitectural Load Port Data Sampling |
707 | + CVE-2019-11091 MDSUM Microarchitectural Data Sampling Uncacheable Memory |
708 | + ============== ===== =================================================== |
709 | + |
710 | +Problem |
711 | +------- |
712 | + |
713 | +When performing store, load or L1 refill operations, processors write data |
714 | +into temporary microarchitectural structures (buffers). The data in the |
715 | +buffer can be forwarded to load operations as an optimization. |
716 | + |
717 | +Under certain conditions, usually a fault/assist caused by a load |
718 | +operation, data unrelated to the load memory address can be speculatively |
719 | +forwarded from the buffers. Because the load operation causes a fault or |
720 | +assist and its result will be discarded, the forwarded data will not cause |
721 | +incorrect program execution or state changes. But a malicious operation |
722 | +may be able to forward this speculative data to a disclosure gadget which |
723 | +in turn allows inferring the value via a cache side channel attack. |
724 | + |
725 | +Because the buffers are potentially shared between Hyper-Threads, cross |
726 | +Hyper-Thread attacks are possible. |
727 | + |
728 | +Deeper technical information is available in the MDS specific x86 |
729 | +architecture section: :ref:`Documentation/x86/mds.rst <mds>`. |
730 | + |
731 | + |
732 | +Attack scenarios |
733 | +---------------- |
734 | + |
735 | +Attacks against the MDS vulnerabilities can be mounted from malicious |
736 | +unprivileged user space applications running on hosts or guests. Malicious |
737 | +guest OSes can obviously mount attacks as well. |
738 | + |
739 | +Contrary to other speculation based vulnerabilities, the MDS vulnerability |
740 | +does not allow the attacker to control the memory target address. As a |
741 | +consequence the attacks are purely sampling based, but as demonstrated with |
742 | +the TLBleed attack samples can be postprocessed successfully. |
743 | + |
744 | +Web browsers |
745 | +^^^^^^^^^^^^ |
746 | + |
747 | + It's unclear whether attacks through web browsers are possible at |
748 | + all. Exploitation through JavaScript is considered very unlikely, |
749 | + but other widely used web technologies like WebAssembly could possibly be |
750 | + abused. |
751 | + |
752 | + |
753 | +.. _mds_sys_info: |
754 | + |
755 | +MDS system information |
756 | +----------------------- |
757 | + |
758 | +The Linux kernel provides a sysfs interface to enumerate the current MDS |
759 | +status of the system: whether the system is vulnerable, and which |
760 | +mitigations are active. The relevant sysfs file is: |
761 | + |
762 | +/sys/devices/system/cpu/vulnerabilities/mds |
763 | + |
764 | +The possible values in this file are: |
765 | + |
766 | + .. list-table:: |
767 | + |
768 | + * - 'Not affected' |
769 | + - The processor is not vulnerable |
770 | + * - 'Vulnerable' |
771 | + - The processor is vulnerable, but no mitigation is enabled |
772 | + * - 'Vulnerable: Clear CPU buffers attempted, no microcode' |
773 | + - The processor is vulnerable, but the microcode is not updated. |
774 | + |
775 | + The mitigation is enabled on a best effort basis. See :ref:`vmwerv` |
776 | + * - 'Mitigation: Clear CPU buffers' |
777 | + - The processor is vulnerable and the CPU buffer clearing mitigation is |
778 | + enabled. |
779 | + |
780 | +If the processor is vulnerable then the following information is appended |
781 | +to the above information: |
782 | + |
783 | + ======================== ============================================ |
784 | + 'SMT vulnerable' SMT is enabled |
785 | + 'SMT mitigated' SMT is enabled and mitigated |
786 | + 'SMT disabled' SMT is disabled |
787 | + 'SMT Host state unknown' Kernel runs in a VM, Host SMT state unknown |
788 | + ======================== ============================================ |
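Since all vulnerability files live in the same sysfs directory, an overview of every reported vulnerability can be printed with a short Python sketch::

    from pathlib import Path

    for entry in sorted(Path("/sys/devices/system/cpu/vulnerabilities").iterdir()):
        print("%-20s %s" % (entry.name, entry.read_text().strip()))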
789 | + |
790 | +.. _vmwerv: |
791 | + |
792 | +Best effort mitigation mode |
793 | +^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
794 | + |
795 | + If the processor is vulnerable, but the availability of the microcode based |
796 | + mitigation mechanism is not advertised via CPUID, the kernel selects a best |
797 | + effort mitigation mode. This mode invokes the mitigation instructions |
798 | + without a guarantee that they clear the CPU buffers. |
799 | + |
800 | + This is done to address virtualization scenarios where the host has the |
801 | + microcode update applied, but the hypervisor is not yet updated to expose |
802 | + the CPUID to the guest. If the host has updated microcode, the protection |
803 | + takes effect; otherwise a few CPU cycles are wasted pointlessly. |
804 | + |
805 | + The state in the mds sysfs file reflects this situation accordingly. |
806 | + |
807 | + |
808 | +Mitigation mechanism |
809 | +------------------------- |
810 | + |
811 | +The kernel detects the affected CPUs and the presence of the microcode |
812 | +which is required. |
813 | + |
814 | +If a CPU is affected and the microcode is available, then the kernel |
815 | +enables the mitigation by default. The mitigation can be controlled at boot |
816 | +time via a kernel command line option. See |
817 | +:ref:`mds_mitigation_control_command_line`. |
818 | + |
819 | +.. _cpu_buffer_clear: |
820 | + |
821 | +CPU buffer clearing |
822 | +^^^^^^^^^^^^^^^^^^^ |
823 | + |
824 | + The mitigation for MDS clears the affected CPU buffers on return to user |
825 | + space and when entering a guest. |
826 | + |
827 | + If SMT is enabled it also clears the buffers on idle entry when the CPU |
828 | + is only affected by MSBDS and not any other MDS variant, because the |
829 | + other variants cannot be protected against cross Hyper-Thread attacks. |
830 | + |
831 | + For CPUs which are only affected by MSBDS, the user space, guest and idle |
832 | + transition mitigations are sufficient and SMT is not affected. |
833 | + |
834 | +.. _virt_mechanism: |
835 | + |
836 | +Virtualization mitigation |
837 | +^^^^^^^^^^^^^^^^^^^^^^^^^ |
838 | + |
839 | + The protection for host to guest transition depends on the L1TF |
840 | + vulnerability of the CPU: |
841 | + |
842 | + - CPU is affected by L1TF: |
843 | + |
844 | + If the L1D flush mitigation is enabled and up to date microcode is |
845 | + available, the L1D flush mitigation automatically protects the |
846 | + guest transition. |
847 | + |
848 | + If the L1D flush mitigation is disabled then the MDS mitigation is |
849 | + invoked explicitly when the host MDS mitigation is enabled. |
850 | + |
851 | + For details on L1TF and virtualization see: |
852 | + :ref:`Documentation/hw-vuln/l1tf.rst <mitigation_control_kvm>`. |
853 | + |
854 | + - CPU is not affected by L1TF: |
855 | + |
856 | + CPU buffers are flushed before entering the guest when the host MDS |
857 | + mitigation is enabled. |
858 | + |
859 | + The resulting MDS protection matrix for the host to guest transition: |
860 | + |
861 | + ============ ===== ============= ============ ================= |
862 | + L1TF MDS VMX-L1FLUSH Host MDS MDS-State |
863 | + |
864 | + Don't care No Don't care N/A Not affected |
865 | + |
866 | + Yes Yes Disabled Off Vulnerable |
867 | + |
868 | + Yes Yes Disabled Full Mitigated |
869 | + |
870 | + Yes Yes Enabled Don't care Mitigated |
871 | + |
872 | + No Yes N/A Off Vulnerable |
873 | + |
874 | + No Yes N/A Full Mitigated |
875 | + ============ ===== ============= ============ ================= |
876 | + |
877 | + This only covers the host to guest transition, i.e. prevents leakage from |
878 | + host to guest, but does not protect the guest internally. Guests need to |
879 | + have their own protections. |
880 | + |
881 | +.. _xeon_phi: |
882 | + |
883 | +XEON PHI specific considerations |
884 | +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
885 | + |
886 | + The XEON PHI processor family is affected by MSBDS which can be exploited |
887 | + cross Hyper-Threads when entering idle states. Some XEON PHI variants allow |
888 | + the use of MWAIT in user space (Ring 3), which opens a potential attack vector |
889 | + for malicious user space. The exposure can be disabled on the kernel |
890 | + command line with the 'ring3mwait=disable' command line option. |
891 | + |
892 | + XEON PHI is not affected by the other MDS variants and MSBDS is mitigated |
893 | + before the CPU enters an idle state. As XEON PHI is not affected by L1TF |
894 | + either, disabling SMT is not required for full protection. |
895 | + |
896 | +.. _mds_smt_control: |
897 | + |
898 | +SMT control |
899 | +^^^^^^^^^^^ |
900 | + |
901 | + All MDS variants except MSBDS can be attacked cross Hyper-Threads. That |
902 | + means on CPUs which are affected by MFBDS or MLPDS it is necessary to |
903 | + disable SMT for full protection. These are most of the affected CPUs; the |
904 | + exception is XEON PHI, see :ref:`xeon_phi`. |
905 | + |
906 | + Disabling SMT can have a significant performance impact, but the impact |
907 | + depends on the type of workloads. |
908 | + |
909 | + See the relevant chapter in the L1TF mitigation documentation for details: |
910 | + :ref:`Documentation/hw-vuln/l1tf.rst <smt_control>`. |
911 | + |
912 | + |
913 | +.. _mds_mitigation_control_command_line: |
914 | + |
915 | +Mitigation control on the kernel command line |
916 | +--------------------------------------------- |
917 | + |
918 | +The kernel command line allows controlling the MDS mitigations at boot |
919 | +time with the option "mds=". The valid arguments for this option are: |
920 | + |
921 | + ============ ============================================================= |
922 | + full If the CPU is vulnerable, enable all available mitigations |
923 | + for the MDS vulnerability, CPU buffer clearing on exit to |
924 | + userspace and when entering a VM. Idle transitions are |
925 | + protected as well if SMT is enabled. |
926 | + |
927 | + It does not automatically disable SMT. |
928 | + |
929 | + full,nosmt The same as mds=full, with SMT disabled on vulnerable |
930 | + CPUs. This is the complete mitigation. |
931 | + |
932 | + off Disables MDS mitigations completely. |
933 | + |
934 | + ============ ============================================================= |
935 | + |
936 | +Not specifying this option is equivalent to "mds=full". |
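Because "mds=full" alone leaves SMT enabled, a configuration check along these lines can flag the incomplete setup; a Python sketch using the sysfs files documented above::

    from pathlib import Path

    mds = Path("/sys/devices/system/cpu/vulnerabilities/mds").read_text().strip()
    smt = Path("/sys/devices/system/cpu/smt/active").read_text().strip()

    if mds.startswith("Mitigation") and smt == "1":
        print("CPU buffers are cleared, but SMT is still enabled;")
        print("the complete mitigation needs mds=full,nosmt")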
937 | + |
938 | + |
939 | +Mitigation selection guide |
940 | +-------------------------- |
941 | + |
942 | +1. Trusted userspace |
943 | +^^^^^^^^^^^^^^^^^^^^ |
944 | + |
945 | + If all userspace applications are from a trusted source and do not |
946 | + execute untrusted code which is supplied externally, then the mitigation |
947 | + can be disabled. |
948 | + |
949 | + |
950 | +2. Virtualization with trusted guests |
951 | +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
952 | + |
953 | + The same considerations as for trusted user space above apply. |
954 | + |
955 | +3. Virtualization with untrusted guests |
956 | +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
957 | + |
958 | + The protection depends on the state of the L1TF mitigations. |
959 | + See :ref:`virt_mechanism`. |
960 | + |
961 | + If the MDS mitigation is enabled and SMT is disabled, guest to host and |
962 | + guest to guest attacks are prevented. |
963 | + |
964 | +.. _mds_default_mitigations: |
965 | + |
966 | +Default mitigations |
967 | +------------------- |
968 | + |
969 | + The kernel default mitigations for vulnerable processors are: |
970 | + |
971 | + - Enable CPU buffer clearing |
972 | + |
973 | + The kernel does not by default enforce the disabling of SMT, which leaves |
974 | + SMT systems vulnerable when running untrusted code. The same rationale as |
975 | + for L1TF applies. |
976 | + See :ref:`Documentation/hw-vuln/l1tf.rst <default_mitigations>`. |
977 | diff --git a/Documentation/index.rst b/Documentation/index.rst |
978 | index 213399aac757..f95c58dbbbc3 100644 |
979 | --- a/Documentation/index.rst |
980 | +++ b/Documentation/index.rst |
981 | @@ -12,7 +12,6 @@ Contents: |
982 | :maxdepth: 2 |
983 | |
984 | kernel-documentation |
985 | - l1tf |
986 | development-process/index |
987 | dev-tools/tools |
988 | driver-api/index |
989 | @@ -20,6 +19,24 @@ Contents: |
990 | gpu/index |
991 | 80211/index |
992 | |
993 | +This section describes CPU vulnerabilities and their mitigations. |
994 | + |
995 | +.. toctree:: |
996 | + :maxdepth: 1 |
997 | + |
998 | + hw-vuln/index |
999 | + |
1000 | +Architecture-specific documentation |
1001 | +----------------------------------- |
1002 | + |
1003 | +These books provide programming details about architecture-specific |
1004 | +implementation. |
1005 | + |
1006 | +.. toctree:: |
1007 | + :maxdepth: 2 |
1008 | + |
1009 | + x86/index |
1010 | + |
1011 | Indices and tables |
1012 | ================== |
1013 | |
1014 | diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt |
1015 | index a1472b48ee22..55a9bbbcf5e1 100644 |
1016 | --- a/Documentation/kernel-parameters.txt |
1017 | +++ b/Documentation/kernel-parameters.txt |
1018 | @@ -2076,10 +2076,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted. |
1019 | off |
1020 | Disables hypervisor mitigations and doesn't |
1021 | emit any warnings. |
1022 | + It also drops the swap size and available |
1023 | + RAM limit restriction on both hypervisor and |
1024 | + bare metal. |
1025 | |
1026 | Default is 'flush'. |
1027 | |
1028 | - For details see: Documentation/admin-guide/l1tf.rst |
1029 | + For details see: Documentation/hw-vuln/l1tf.rst |
1030 | |
1031 | l2cr= [PPC] |
1032 | |
1033 | @@ -2322,6 +2325,32 @@ bytes respectively. Such letter suffixes can also be entirely omitted. |
1034 | Format: <first>,<last> |
1035 | Specifies range of consoles to be captured by the MDA. |
1036 | |
1037 | + mds= [X86,INTEL] |
1038 | + Control mitigation for the Micro-architectural Data |
1039 | + Sampling (MDS) vulnerability. |
1040 | + |
1041 | + Certain CPUs are vulnerable to an exploit against CPU |
1042 | + internal buffers which can forward information to a |
1043 | + disclosure gadget under certain conditions. |
1044 | + |
1045 | + In vulnerable processors, the speculatively |
1046 | + forwarded data can be used in a cache side channel |
1047 | + attack, to access data to which the attacker does |
1048 | + not have direct access. |
1049 | + |
1050 | + This parameter controls the MDS mitigation. The |
1051 | + options are: |
1052 | + |
1053 | + full - Enable MDS mitigation on vulnerable CPUs |
1054 | + full,nosmt - Enable MDS mitigation and disable |
1055 | + SMT on vulnerable CPUs |
1056 | + off - Unconditionally disable MDS mitigation |
1057 | + |
1058 | + Not specifying this option is equivalent to |
1059 | + mds=full. |
1060 | + |
1061 | + For details see: Documentation/hw-vuln/mds.rst |
1062 | + |
1063 | mem=nn[KMG] [KNL,BOOT] Force usage of a specific amount of memory |
1064 | Amount of memory to be used when the kernel is not able |
1065 | to see the whole system memory or for test. |
1066 | @@ -2444,6 +2473,38 @@ bytes respectively. Such letter suffixes can also be entirely omitted. |
1067 | in the "bleeding edge" mini2440 support kernel at |
1068 | http://repo.or.cz/w/linux-2.6/mini2440.git |
1069 | |
1070 | + mitigations= |
1071 | + [X86] Control optional mitigations for CPU |
1072 | + vulnerabilities. This is a set of curated, |
1073 | + arch-independent options, each of which is an |
1074 | + aggregation of existing arch-specific options. |
1075 | + |
1076 | + off |
1077 | + Disable all optional CPU mitigations. This |
1078 | + improves system performance, but it may also |
1079 | + expose users to several CPU vulnerabilities. |
1080 | + Equivalent to: nopti [X86] |
1081 | + nospectre_v2 [X86] |
1082 | + spectre_v2_user=off [X86] |
1083 | + spec_store_bypass_disable=off [X86] |
1084 | + l1tf=off [X86] |
1085 | + mds=off [X86] |
1086 | + |
1087 | + auto (default) |
1088 | + Mitigate all CPU vulnerabilities, but leave SMT |
1089 | + enabled, even if it's vulnerable. This is for |
1090 | + users who don't want to be surprised by SMT |
1091 | + getting disabled across kernel upgrades, or who |
1092 | + have other ways of avoiding SMT-based attacks. |
1093 | + Equivalent to: (default behavior) |
1094 | + |
1095 | + auto,nosmt |
1096 | + Mitigate all CPU vulnerabilities, disabling SMT |
1097 | + if needed. This is for users who always want to |
1098 | + be fully mitigated, even if it means losing SMT. |
1099 | + Equivalent to: l1tf=flush,nosmt [X86] |
1100 | + mds=full,nosmt [X86] |
1101 | + |
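Which of these variants a running system booted with can be checked from /proc/cmdline; a small Python sketch::

    with open("/proc/cmdline") as f:
        opts = f.read().split()
    chosen = [o for o in opts if o.startswith("mitigations=")]
    print(chosen[-1] if chosen else "mitigations=auto (default)")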
1102 | mminit_loglevel= |
1103 | [KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this |
1104 | parameter allows control of the logging verbosity for |
1105 | @@ -4030,9 +4091,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted. |
1106 | |
1107 | spectre_v2= [X86] Control mitigation of Spectre variant 2 |
1108 | (indirect branch speculation) vulnerability. |
1109 | + The default operation protects the kernel from |
1110 | + user space attacks. |
1111 | |
1112 | - on - unconditionally enable |
1113 | - off - unconditionally disable |
1114 | + on - unconditionally enable, implies |
1115 | + spectre_v2_user=on |
1116 | + off - unconditionally disable, implies |
1117 | + spectre_v2_user=off |
1118 | auto - kernel detects whether your CPU model is |
1119 | vulnerable |
1120 | |
1121 | @@ -4042,6 +4107,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. |
1122 | CONFIG_RETPOLINE configuration option, and the |
1123 | compiler with which the kernel was built. |
1124 | |
1125 | + Selecting 'on' will also enable the mitigation |
1126 | + against user space to user space task attacks. |
1127 | + |
1128 | + Selecting 'off' will disable both the kernel and |
1129 | + the user space protections. |
1130 | + |
1131 | Specific mitigations can also be selected manually: |
1132 | |
1133 | retpoline - replace indirect branches |
1134 | @@ -4051,6 +4122,48 @@ bytes respectively. Such letter suffixes can also be entirely omitted. |
1135 | Not specifying this option is equivalent to |
1136 | spectre_v2=auto. |
1137 | |
1138 | + spectre_v2_user= |
1139 | + [X86] Control mitigation of Spectre variant 2 |
1140 | + (indirect branch speculation) vulnerability between |
1141 | + user space tasks. |
1142 | + |
1143 | + on - Unconditionally enable mitigations. Is |
1144 | + enforced by spectre_v2=on |
1145 | + |
1146 | + off - Unconditionally disable mitigations. Is |
1147 | + enforced by spectre_v2=off |
1148 | + |
1149 | + prctl - Indirect branch speculation is enabled, |
1150 | + but mitigation can be enabled via prctl |
1151 | + per thread. The mitigation control state |
1152 | + is inherited on fork. |
1153 | + |
1154 | + prctl,ibpb |
1155 | + - Like "prctl" above, but only STIBP is |
1156 | + controlled per thread. IBPB is issued |
1157 | + controlled per thread. IBPB is always |
1158 | + issued when switching between different user |
1159 | + |
1160 | + seccomp |
1161 | + - Same as "prctl" above, but all seccomp |
1162 | + threads will enable the mitigation unless |
1163 | + they explicitly opt out. |
1164 | + |
1165 | + seccomp,ibpb |
1166 | + - Like "seccomp" above, but only STIBP is |
1167 | + controlled per thread. IBPB is always |
1168 | + issued when switching between different |
1169 | + user space processes. |
1170 | + |
1171 | + auto - Kernel selects the mitigation depending on |
1172 | + the available CPU features and vulnerability. |
1173 | + |
1174 | + Default mitigation: |
1175 | + If CONFIG_SECCOMP=y then "seccomp", otherwise "prctl" |
1176 | + |
1177 | + Not specifying this option is equivalent to |
1178 | + spectre_v2_user=auto. |
1179 | + |
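In the "prctl" modes a thread requests the mitigation itself via prctl(). A minimal Python sketch via ctypes; the constant values are taken from linux/prctl.h as extended by this patch series::

    import ctypes

    PR_SET_SPECULATION_CTRL = 53
    PR_SPEC_INDIRECT_BRANCH = 1
    PR_SPEC_DISABLE = 4        # disable speculation == enable the mitigation

    libc = ctypes.CDLL("libc.so.6", use_errno=True)
    rc = libc.prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH,
                    PR_SPEC_DISABLE, 0, 0)
    if rc != 0:
        # fails e.g. with ENXIO when spectre_v2_user is not in a prctl mode
        print("prctl failed, errno:", ctypes.get_errno())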
1180 | spec_store_bypass_disable= |
1181 | [HW] Control Speculative Store Bypass (SSB) Disable mitigation |
1182 | (Speculative Store Bypass vulnerability) |
1183 | diff --git a/Documentation/l1tf.rst b/Documentation/l1tf.rst |
1184 | deleted file mode 100644 |
1185 | index bae52b845de0..000000000000 |
1186 | --- a/Documentation/l1tf.rst |
1187 | +++ /dev/null |
1188 | @@ -1,610 +0,0 @@ |
1189 | -L1TF - L1 Terminal Fault |
1190 | -======================== |
1191 | - |
1192 | -L1 Terminal Fault is a hardware vulnerability which allows unprivileged |
1193 | -speculative access to data which is available in the Level 1 Data Cache |
1194 | -when the page table entry controlling the virtual address, which is used |
1195 | -for the access, has the Present bit cleared or other reserved bits set. |
1196 | - |
1197 | -Affected processors |
1198 | -------------------- |
1199 | - |
1200 | -This vulnerability affects a wide range of Intel processors. The |
1201 | -vulnerability is not present on: |
1202 | - |
1203 | - - Processors from AMD, Centaur and other non Intel vendors |
1204 | - |
1205 | - - Older processor models, where the CPU family is < 6 |
1206 | - |
1207 | - - A range of Intel ATOM processors (Cedarview, Cloverview, Lincroft, |
1208 | - Penwell, Pineview, Silvermont, Airmont, Merrifield) |
1209 | - |
1210 | - - The Intel XEON PHI family |
1211 | - |
1212 | - - Intel processors which have the ARCH_CAP_RDCL_NO bit set in the |
1213 | - IA32_ARCH_CAPABILITIES MSR. If the bit is set the CPU is not affected |
1214 | - by the Meltdown vulnerability either. These CPUs should become |
1215 | - available by end of 2018. |
1216 | - |
1217 | -Whether a processor is affected or not can be read out from the L1TF |
1218 | -vulnerability file in sysfs. See :ref:`l1tf_sys_info`. |
1219 | - |
1220 | -Related CVEs |
1221 | ------------- |
1222 | - |
1223 | -The following CVE entries are related to the L1TF vulnerability: |
1224 | - |
1225 | - ============= ================= ============================== |
1226 | - CVE-2018-3615 L1 Terminal Fault SGX related aspects |
1227 | - CVE-2018-3620 L1 Terminal Fault OS, SMM related aspects |
1228 | - CVE-2018-3646 L1 Terminal Fault Virtualization related aspects |
1229 | - ============= ================= ============================== |
1230 | - |
1231 | -Problem |
1232 | -------- |
1233 | - |
1234 | -If an instruction accesses a virtual address for which the relevant page |
1235 | -table entry (PTE) has the Present bit cleared or other reserved bits set, |
1236 | -then speculative execution ignores the invalid PTE and loads the referenced |
1237 | -data if it is present in the Level 1 Data Cache, as if the page referenced |
1238 | -by the address bits in the PTE was still present and accessible. |
1239 | - |
1240 | -While this is a purely speculative mechanism and the instruction will raise |
1241 | -a page fault when it is retired eventually, the pure act of loading the |
1242 | -data and making it available to other speculative instructions opens up the |
1243 | -opportunity for side channel attacks to unprivileged malicious code, |
1244 | -similar to the Meltdown attack. |
1245 | - |
1246 | -While Meltdown breaks the user space to kernel space protection, L1TF |
1247 | -allows to attack any physical memory address in the system and the attack |
1248 | -works across all protection domains. It allows an attack of SGX and also |
1249 | -works from inside virtual machines because the speculation bypasses the |
1250 | -extended page table (EPT) protection mechanism. |
1251 | - |
1252 | - |
1253 | -Attack scenarios |
1254 | ----------------- |
1255 | - |
1256 | -1. Malicious user space |
1257 | -^^^^^^^^^^^^^^^^^^^^^^^ |
1258 | - |
1259 | - Operating Systems store arbitrary information in the address bits of a |
1260 | - PTE which is marked non present. This allows a malicious user space |
1261 | - application to attack the physical memory to which these PTEs resolve. |
1262 | - In some cases user-space can maliciously influence the information |
1263 | - encoded in the address bits of the PTE, thus making attacks more |
1264 | - deterministic and more practical. |
1265 | - |
1266 | - The Linux kernel contains a mitigation for this attack vector, PTE |
1267 | - inversion, which is permanently enabled and has no performance |
1268 | - impact. The kernel ensures that the address bits of PTEs, which are not |
1269 | - marked present, never point to cacheable physical memory space. |
1270 | - |
1271 | - A system with an up to date kernel is protected against attacks from |
1272 | - malicious user space applications. |
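 - 
 -   A conceptual sketch of PTE inversion, in C: this is a simplified
 -   illustration rather than the kernel's exact code, and the mask and
 -   bit layout below are assumptions made for the example::
 - 
 -      #include <stdint.h>
 - 
 -      #define PTE_PRESENT   0x1ULL
 -      /* Assumed 52-bit physical address layout; illustration only. */
 -      #define PTE_PFN_MASK  0x000ffffffffff000ULL
 - 
 -      /*
 -       * When a PTE is marked not present, invert its address bits so a
 -       * speculative L1TF load resolves to a physical address outside
 -       * cacheable memory instead of to real data.
 -       */
 -      static uint64_t pte_mark_not_present(uint64_t pte)
 -      {
 -              pte &= ~PTE_PRESENT;
 -              pte ^= PTE_PFN_MASK;
 -              return pte;
 -      }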
1273 | - |
1274 | -2. Malicious guest in a virtual machine |
1275 | -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
1276 | - |
1277 | - The fact that L1TF breaks all domain protections allows malicious guest |
1278 | - OSes, which can control the PTEs directly, and malicious guest user |
1279 | - space applications, which run on an unprotected guest kernel lacking the |
1280 | - PTE inversion mitigation for L1TF, to attack physical host memory. |
1281 | - |
1282 | - A special aspect of L1TF in the context of virtualization is symmetric |
1283 | - multi threading (SMT). The Intel implementation of SMT is called |
1284 | - HyperThreading. The fact that Hyperthreads on the affected processors |
1285 | - share the L1 Data Cache (L1D) is important for this. As the flaw allows |
1286 | - only to attack data which is present in L1D, a malicious guest running |
1287 | - on one Hyperthread can attack the data which is brought into the L1D by |
1288 | - the context which runs on the sibling Hyperthread of the same physical |
1289 | - core. This context can be host OS, host user space or a different guest. |
1290 | - |
1291 | - If the processor does not support Extended Page Tables, the attack is |
1292 | - only possible when the hypervisor does not sanitize the content of the |
1293 | - effective (shadow) page tables. |
1294 | - |
1295 | - While solutions exist to mitigate these attack vectors fully, these |
1296 | - mitigations are not enabled by default in the Linux kernel because they |
1297 | - can affect performance significantly. The kernel provides several |
1298 | - mechanisms which can be utilized to address the problem depending on the |
1299 | - deployment scenario. The mitigations, their protection scope and impact |
1300 | - are described in the next sections. |
1301 | - |
1302 | - The default mitigations and the rationale for choosing them are explained |
1303 | - at the end of this document. See :ref:`default_mitigations`. |
1304 | - |
1305 | -.. _l1tf_sys_info: |
1306 | - |
1307 | -L1TF system information |
1308 | ------------------------ |
1309 | - |
1310 | -The Linux kernel provides a sysfs interface to enumerate the current L1TF |
1311 | -status of the system: whether the system is vulnerable, and which |
1312 | -mitigations are active. The relevant sysfs file is: |
1313 | - |
1314 | -/sys/devices/system/cpu/vulnerabilities/l1tf |
1315 | - |
1316 | -The possible values in this file are: |
1317 | - |
1318 | - =========================== =============================== |
1319 | - 'Not affected' The processor is not vulnerable |
1320 | - 'Mitigation: PTE Inversion' The host protection is active |
1321 | - =========================== =============================== |
1322 | - |
1323 | -If KVM/VMX is enabled and the processor is vulnerable then the following |
1324 | -information is appended to the 'Mitigation: PTE Inversion' part: |
1325 | - |
1326 | - - SMT status: |
1327 | - |
1328 | - ===================== ================ |
1329 | - 'VMX: SMT vulnerable' SMT is enabled |
1330 | - 'VMX: SMT disabled' SMT is disabled |
1331 | - ===================== ================ |
1332 | - |
1333 | - - L1D Flush mode: |
1334 | - |
1335 | - ================================ ==================================== |
1336 | - 'L1D vulnerable' L1D flushing is disabled |
1337 | - |
1338 | - 'L1D conditional cache flushes' L1D flush is conditionally enabled |
1339 | - |
1340 | - 'L1D cache flushes' L1D flush is unconditionally enabled |
1341 | - ================================ ==================================== |
1342 | - |
1343 | -The resulting grade of protection is discussed in the following sections. |
1344 | - |
1345 | - |
1346 | -Host mitigation mechanism |
1347 | -------------------------- |
1348 | - |
1349 | -The kernel is unconditionally protected against L1TF attacks from malicious |
1350 | -user space running on the host. |
1351 | - |
1352 | - |
1353 | -Guest mitigation mechanisms |
1354 | ---------------------------- |
1355 | - |
1356 | -.. _l1d_flush: |
1357 | - |
1358 | -1. L1D flush on VMENTER |
1359 | -^^^^^^^^^^^^^^^^^^^^^^^ |
1360 | - |
1361 | - To make sure that a guest cannot attack data which is present in the L1D |
1362 | - the hypervisor flushes the L1D before entering the guest. |
1363 | - |
1364 | - Flushing the L1D evicts not only the data which should not be accessed |
1365 | - by a potentially malicious guest, it also flushes the guest |
1366 | - data. Flushing the L1D has a performance impact as the processor has to |
1367 | - bring the flushed guest data back into the L1D. Depending on the |
1368 | - frequency of VMEXIT/VMENTER and the type of computations in the guest |
1369 | - performance degradation in the range of 1% to 50% has been observed. For |
1370 | - scenarios where guest VMEXIT/VMENTER are rare the performance impact is |
1371 | - minimal. Virtio and mechanisms like posted interrupts are designed to |
1372 | - confine the VMEXITs to a bare minimum, but specific configurations and |
1373 | - application scenarios might still suffer from a high VMEXIT rate. |
1374 | - |
1375 | - The kernel provides two L1D flush modes: |
1376 | - - conditional ('cond') |
1377 | - - unconditional ('always') |
1378 | - |
1379 | - The conditional mode avoids L1D flushing after VMEXITs which execute |
1380 | - only audited code paths before the corresponding VMENTER. These code |
1381 | - paths have been verified that they cannot expose secrets or other |
1382 | - interesting data to an attacker, but they can leak information about the |
1383 | - address space layout of the hypervisor. |
1384 | - |
1385 | - Unconditional mode flushes L1D on all VMENTER invocations and provides |
1386 | - maximum protection. It has a higher overhead than the conditional |
1387 | - mode. The overhead cannot be quantified correctly as it depends on the |
1388 | - workload scenario and the resulting number of VMEXITs. |
1389 | - |
1390 | - The general recommendation is to enable L1D flush on VMENTER. The kernel |
1391 | - defaults to conditional mode on affected processors. |
1392 | - |
1393 | - **Note** that L1D flush does not prevent the SMT problem because the |
1394 | - sibling thread will also bring back its data into the L1D which makes it |
1395 | - attackable again. |
1396 | - |
1397 | - L1D flush can be controlled by the administrator via the kernel command |
1398 | - line and sysfs control files. See :ref:`mitigation_control_command_line` |
1399 | - and :ref:`mitigation_control_kvm`. |
1400 | - |
1401 | -.. _guest_confinement: |
1402 | - |
1403 | -2. Guest VCPU confinement to dedicated physical cores |
1404 | -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
1405 | - |
1406 | - To address the SMT problem, it is possible to make a guest or a group of |
1407 | - guests affine to one or more physical cores. The proper mechanism for |
1408 | - that is to utilize exclusive cpusets to ensure that no other guest or |
1409 | - host tasks can run on these cores. |
1410 | - |
1411 | - If only a single guest or related guests run on sibling SMT threads on |
1412 | - the same physical core then they can only attack their own memory and |
1413 | - restricted parts of the host memory. |
1414 | - |
1415 | - Host memory is attackable, when one of the sibling SMT threads runs in |
1416 | - host OS (hypervisor) context and the other in guest context. The amount |
1417 | - of valuable information from the host OS context depends on the context |
1418 | - which the host OS executes, i.e. interrupts, soft interrupts and kernel |
1419 | - threads. The amount of valuable data from these contexts cannot be |
1420 | - declared as non-interesting for an attacker without deep inspection of |
1421 | - the code. |
1422 | - |
1423 | - **Note** that assigning guests to a fixed set of physical cores affects |
1424 | - the ability of the scheduler to do load balancing and might have |
1425 | - negative effects on CPU utilization depending on the hosting |
1426 | - scenario. Disabling SMT might be a viable alternative for particular |
1427 | - scenarios. |
1428 | - |
1429 | - For further information about confining guests to a single or to a group |
1430 | - of cores consult the cpusets documentation: |
1431 | - |
1432 | - https://www.kernel.org/doc/Documentation/cgroup-v1/cpusets.txt |
1433 | - |
1434 | -.. _interrupt_isolation: |
1435 | - |
1436 | -3. Interrupt affinity |
1437 | -^^^^^^^^^^^^^^^^^^^^^ |
1438 | - |
1439 | - Interrupts can be made affine to logical CPUs. This is not universally |
1440 | - true because there are types of interrupts which are truly per CPU |
1441 | - interrupts, e.g. the local timer interrupt. Aside from that, multi queue |
1442 | - devices affine their interrupts to single CPUs or groups of CPUs per |
1443 | - queue without allowing the administrator to control the affinities. |
1444 | - |
1445 | - Moving the interrupts, which can be affinity controlled, away from CPUs |
1446 | - which run untrusted guests, reduces the attack vector space. |
1447 | - |
1448 | - Whether the interrupts which are affine to CPUs that run untrusted |
1449 | - guests provide interesting data for an attacker depends on the system |
1450 | - configuration and the scenarios which run on the system. While for some |
1451 | - of the interrupts it can be assumed that they won't expose interesting |
1452 | - information beyond exposing hints about the host OS memory layout, there |
1453 | - is no way to make general assumptions. |
1454 | - |
1455 | - Interrupt affinity can be controlled by the administrator via the |
1456 | - /proc/irq/$NR/smp_affinity[_list] files. Limited documentation is |
1457 | - available at: |
1458 | - |
1459 | - https://www.kernel.org/doc/Documentation/IRQ-affinity.txt |
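 - 
 -   As a hedged illustration, interrupt affinity can be changed from user
 -   space by writing a CPU mask to the corresponding /proc file. The IRQ
 -   number (24) and mask (CPU 0) below are examples only::
 - 
 -      #include <stdio.h>
 - 
 -      int main(void)
 -      {
 -              /* Pin example IRQ 24 to CPU 0 by writing a hex CPU mask. */
 -              FILE *f = fopen("/proc/irq/24/smp_affinity", "w");
 - 
 -              if (!f) {
 -                      perror("smp_affinity");
 -                      return 1;
 -              }
 -              fprintf(f, "%x\n", 0x1);
 -              return fclose(f) ? 1 : 0;
 -      }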
1460 | - |
1461 | -.. _smt_control: |
1462 | - |
1463 | -4. SMT control |
1464 | -^^^^^^^^^^^^^^ |
1465 | - |
1466 | - To prevent the SMT issues of L1TF it might be necessary to disable SMT |
1467 | - completely. Disabling SMT can have a significant performance impact, but |
1468 | - the impact depends on the hosting scenario and the type of workloads. |
1469 | - The impact of disabling SMT needs also to be weighted against the impact |
1470 | - of other mitigation solutions like confining guests to dedicated cores. |
1471 | - |
1472 | - The kernel provides a sysfs interface to retrieve the status of SMT and |
1473 | - to control it. It also provides a kernel command line interface to |
1474 | - control SMT. |
1475 | - |
1476 | - The kernel command line interface consists of the following options: |
1477 | - |
1478 | - =========== ========================================================== |
1479 | - nosmt Affects the bring up of the secondary CPUs during boot. The |
1480 | - kernel tries to bring all present CPUs online during the |
1481 | - boot process. "nosmt" makes sure that from each physical |
1482 | - core only one - the so called primary (hyper) thread is |
1483 | - activated. Due to a design flaw of Intel processors related |
1484 | - to Machine Check Exceptions the non primary siblings have |
1485 | - to be brought up at least partially and are then shut down |
1486 | - again. "nosmt" can be undone via the sysfs interface. |
1487 | - |
1488 | - nosmt=force Has the same effect as "nosmt" but it does not allow to |
1489 | - undo the SMT disable via the sysfs interface. |
1490 | - =========== ========================================================== |
1491 | - |
1492 | - The sysfs interface provides two files: |
1493 | - |
1494 | - - /sys/devices/system/cpu/smt/control |
1495 | - - /sys/devices/system/cpu/smt/active |
1496 | - |
1497 | - /sys/devices/system/cpu/smt/control: |
1498 | - |
1499 | - This file allows to read out the SMT control state and provides the |
1500 | - ability to disable or (re)enable SMT. The possible states are: |
1501 | - |
1502 | - ============== =================================================== |
1503 | - on SMT is supported by the CPU and enabled. All |
1504 | - logical CPUs can be onlined and offlined without |
1505 | - restrictions. |
1506 | - |
1507 | - off SMT is supported by the CPU and disabled. Only |
1508 | - the so called primary SMT threads can be onlined |
1509 | - and offlined without restrictions. An attempt to |
1510 | - online a non-primary sibling is rejected |
1511 | - |
1512 | - forceoff Same as 'off' but the state cannot be controlled. |
1513 | - Attempts to write to the control file are rejected. |
1514 | - |
1515 | - notsupported The processor does not support SMT. It's therefore |
1516 | - not affected by the SMT implications of L1TF. |
1517 | - Attempts to write to the control file are rejected. |
1518 | - ============== =================================================== |
1519 | - |
1520 | - The possible states which can be written into this file to control SMT |
1521 | - state are: |
1522 | - |
1523 | - - on |
1524 | - - off |
1525 | - - forceoff |
1526 | - |
1527 | - /sys/devices/system/cpu/smt/active: |
1528 | - |
1529 | - This file reports whether SMT is enabled and active, i.e. if on any |
1530 | - physical core two or more sibling threads are online. |
1531 | - |
1532 | - SMT control is also possible at boot time via the l1tf kernel command |
1533 | - line parameter in combination with L1D flush control. See |
1534 | - :ref:`mitigation_control_command_line`. |
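 - 
 -   A small sketch of driving these sysfs files programmatically; the
 -   paths are the control files above, and the error handling is kept
 -   minimal by design::
 - 
 -      #include <stdio.h>
 - 
 -      int main(void)
 -      {
 -              FILE *f = fopen("/sys/devices/system/cpu/smt/control", "w");
 -              char buf[16] = "";
 - 
 -              if (!f) {
 -                      perror("smt/control");
 -                      return 1;
 -              }
 -              fputs("off\n", f);      /* disable SMT at run time */
 -              fclose(f);
 - 
 -              f = fopen("/sys/devices/system/cpu/smt/active", "r");
 -              if (f && fgets(buf, sizeof(buf), f))
 -                      printf("SMT active: %s", buf);
 -              if (f)
 -                      fclose(f);
 -              return 0;
 -      }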
1535 | - |
1536 | -5. Disabling EPT |
1537 | -^^^^^^^^^^^^^^^^ |
1538 | - |
1539 | - Disabling EPT for virtual machines provides full mitigation for L1TF even |
1540 | - with SMT enabled, because the effective page tables for guests are |
1541 | - managed and sanitized by the hypervisor. Though disabling EPT has a |
1542 | - significant performance impact especially when the Meltdown mitigation |
1543 | - KPTI is enabled. |
1544 | - |
1545 | - EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter. |
1546 | - |
1547 | -There is ongoing research and development for new mitigation mechanisms to |
1548 | -address the performance impact of disabling SMT or EPT. |
1549 | - |
1550 | -.. _mitigation_control_command_line: |
1551 | - |
1552 | -Mitigation control on the kernel command line |
1553 | ---------------------------------------------- |
1554 | - |
1555 | -The kernel command line allows to control the L1TF mitigations at boot |
1556 | -time with the option "l1tf=". The valid arguments for this option are: |
1557 | - |
1558 | - ============ ============================================================= |
1559 | - full Provides all available mitigations for the L1TF |
1560 | - vulnerability. Disables SMT and enables all mitigations in |
1561 | - the hypervisors, i.e. unconditional L1D flushing |
1562 | - |
1563 | - SMT control and L1D flush control via the sysfs interface |
1564 | - is still possible after boot. Hypervisors will issue a |
1565 | - warning when the first VM is started in a potentially |
1566 | - insecure configuration, i.e. SMT enabled or L1D flush |
1567 | - disabled. |
1568 | - |
1569 | - full,force Same as 'full', but disables SMT and L1D flush runtime |
1570 | - control. Implies the 'nosmt=force' command line option. |
1571 | - (i.e. sysfs control of SMT is disabled.) |
1572 | - |
1573 | - flush Leaves SMT enabled and enables the default hypervisor |
1574 | - mitigation, i.e. conditional L1D flushing |
1575 | - |
1576 | - SMT control and L1D flush control via the sysfs interface |
1577 | - is still possible after boot. Hypervisors will issue a |
1578 | - warning when the first VM is started in a potentially |
1579 | - insecure configuration, i.e. SMT enabled or L1D flush |
1580 | - disabled. |
1581 | - |
1582 | - flush,nosmt Disables SMT and enables the default hypervisor mitigation, |
1583 | - i.e. conditional L1D flushing. |
1584 | - |
1585 | - SMT control and L1D flush control via the sysfs interface |
1586 | - is still possible after boot. Hypervisors will issue a |
1587 | - warning when the first VM is started in a potentially |
1588 | - insecure configuration, i.e. SMT enabled or L1D flush |
1589 | - disabled. |
1590 | - |
1591 | - flush,nowarn Same as 'flush', but hypervisors will not warn when a VM is |
1592 | - started in a potentially insecure configuration. |
1593 | - |
1594 | - off Disables hypervisor mitigations and doesn't emit any |
1595 | - warnings. |
1596 | - ============ ============================================================= |
1597 | - |
1598 | -The default is 'flush'. For details about L1D flushing see :ref:`l1d_flush`. |
1599 | - |
1600 | - |
1601 | -.. _mitigation_control_kvm: |
1602 | - |
1603 | -Mitigation control for KVM - module parameter |
1604 | -------------------------------------------------------------- |
1605 | - |
1606 | -The KVM hypervisor mitigation mechanism, flushing the L1D cache when |
1607 | -entering a guest, can be controlled with a module parameter. |
1608 | - |
1609 | -The option/parameter is "kvm-intel.vmentry_l1d_flush=". It takes the |
1610 | -following arguments: |
1611 | - |
1612 | - ============ ============================================================== |
1613 | - always L1D cache flush on every VMENTER. |
1614 | - |
1615 | - cond Flush L1D on VMENTER only when the code between VMEXIT and |
1616 | - VMENTER can leak host memory which is considered |
1617 | - interesting for an attacker. This still can leak host memory |
1618 | - which allows e.g. determining the host's address space layout. |
1619 | - |
1620 | - never Disables the mitigation |
1621 | - ============ ============================================================== |
1622 | - |
1623 | -The parameter can be provided on the kernel command line, as a module |
1624 | -parameter when loading the modules and at runtime modified via the sysfs |
1625 | -file: |
1626 | - |
1627 | -/sys/module/kvm_intel/parameters/vmentry_l1d_flush |
1628 | - |
1629 | -The default is 'cond'. If 'l1tf=full,force' is given on the kernel command |
1630 | -line, then 'always' is enforced and the kvm-intel.vmentry_l1d_flush |
1631 | -module parameter is ignored and writes to the sysfs file are rejected. |
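 - 
 -For illustration, the flush mode can be switched at run time by writing
 -one of the arguments above to the sysfs file; this sketch assumes
 -sufficient privileges and that the mode is not pinned by
 -'l1tf=full,force'::
 - 
 -   #include <stdio.h>
 - 
 -   int main(void)
 -   {
 -           FILE *f = fopen("/sys/module/kvm_intel/parameters/vmentry_l1d_flush", "w");
 - 
 -           if (!f) {
 -                   perror("vmentry_l1d_flush");
 -                   return 1;
 -           }
 -           fputs("always\n", f);   /* switch to unconditional flushing */
 -           return fclose(f) ? 1 : 0;
 -   }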
1632 | - |
1633 | - |
1634 | -Mitigation selection guide |
1635 | --------------------------- |
1636 | - |
1637 | -1. No virtualization in use |
1638 | -^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
1639 | - |
1640 | - The system is protected by the kernel unconditionally and no further |
1641 | - action is required. |
1642 | - |
1643 | -2. Virtualization with trusted guests |
1644 | -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
1645 | - |
1646 | - If the guest comes from a trusted source and the guest OS kernel is |
1647 | - guaranteed to have the L1TF mitigations in place the system is fully |
1648 | - protected against L1TF and no further action is required. |
1649 | - |
1650 | - To avoid the overhead of the default L1D flushing on VMENTER the |
1651 | - administrator can disable the flushing via the kernel command line and |
1652 | - sysfs control files. See :ref:`mitigation_control_command_line` and |
1653 | - :ref:`mitigation_control_kvm`. |
1654 | - |
1655 | - |
1656 | -3. Virtualization with untrusted guests |
1657 | -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ |
1658 | - |
1659 | -3.1. SMT not supported or disabled |
1660 | -"""""""""""""""""""""""""""""""""" |
1661 | - |
1662 | - If SMT is not supported by the processor or disabled in the BIOS or by |
1663 | - the kernel, it's only required to enforce L1D flushing on VMENTER. |
1664 | - |
1665 | - Conditional L1D flushing is the default behaviour and can be tuned. See |
1666 | - :ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`. |
1667 | - |
1668 | -3.2. EPT not supported or disabled |
1669 | -"""""""""""""""""""""""""""""""""" |
1670 | - |
1671 | - If EPT is not supported by the processor or disabled in the hypervisor, |
1672 | - the system is fully protected. SMT can stay enabled and L1D flushing on |
1673 | - VMENTER is not required. |
1674 | - |
1675 | - EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter. |
1676 | - |
1677 | -3.3. SMT and EPT supported and active |
1678 | -""""""""""""""""""""""""""""""""""""" |
1679 | - |
1680 | - If SMT and EPT are supported and active then various degrees of |
1681 | - mitigations can be employed: |
1682 | - |
1683 | - - L1D flushing on VMENTER: |
1684 | - |
1685 | - L1D flushing on VMENTER is the minimal protection requirement, but it |
1686 | - is only potent in combination with other mitigation methods. |
1687 | - |
1688 | - Conditional L1D flushing is the default behaviour and can be tuned. See |
1689 | - :ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`. |
1690 | - |
1691 | - - Guest confinement: |
1692 | - |
1693 | - Confinement of guests to a single or a group of physical cores which |
1694 | - are not running any other processes, can reduce the attack surface |
1695 | - significantly, but interrupts, soft interrupts and kernel threads can |
1696 | - still expose valuable data to a potential attacker. See |
1697 | - :ref:`guest_confinement`. |
1698 | - |
1699 | - - Interrupt isolation: |
1700 | - |
1701 | - Isolating the guest CPUs from interrupts can reduce the attack surface |
1702 | - further, but still allows a malicious guest to explore a limited amount |
1703 | - of host physical memory. This can at least be used to gain knowledge |
1704 | - about the host address space layout. The interrupts which have a fixed |
1705 | - affinity to the CPUs which run the untrusted guests can depending on |
1706 | - the scenario still trigger soft interrupts and schedule kernel threads |
1707 | - which might expose valuable information. See |
1708 | - :ref:`interrupt_isolation`. |
1709 | - |
1710 | -The above three mitigation methods combined can provide protection to a |
1711 | -certain degree, but the risk of the remaining attack surface has to be |
1712 | -carefully analyzed. For full protection the following methods are |
1713 | -available: |
1714 | - |
1715 | - - Disabling SMT: |
1716 | - |
1717 | - Disabling SMT and enforcing the L1D flushing provides the maximum |
1718 | - amount of protection. This mitigation is not depending on any of the |
1719 | - above mitigation methods. |
1720 | - |
1721 | - SMT control and L1D flushing can be tuned by the command line |
1722 | - parameters 'nosmt', 'l1tf', 'kvm-intel.vmentry_l1d_flush' and at run |
1723 | - time with the matching sysfs control files. See :ref:`smt_control`, |
1724 | - :ref:`mitigation_control_command_line` and |
1725 | - :ref:`mitigation_control_kvm`. |
1726 | - |
1727 | - - Disabling EPT: |
1728 | - |
1729 | - Disabling EPT provides the maximum amount of protection as well. It is |
1730 | - not depending on any of the above mitigation methods. SMT can stay |
1731 | - enabled and L1D flushing is not required, but the performance impact is |
1732 | - significant. |
1733 | - |
1734 | - EPT can be disabled in the hypervisor via the 'kvm-intel.ept' |
1735 | - parameter. |
1736 | - |
1737 | -3.4. Nested virtual machines |
1738 | -"""""""""""""""""""""""""""" |
1739 | - |
1740 | -When nested virtualization is in use, three operating systems are involved: |
1741 | -the bare metal hypervisor, the nested hypervisor and the nested virtual |
1742 | -machine. VMENTER operations from the nested hypervisor into the nested |
1743 | -guest will always be processed by the bare metal hypervisor. If KVM is the |
1744 | -bare metal hypervisor it will: |
1745 | - |
1746 | - - Flush the L1D cache on every switch from the nested hypervisor to the |
1747 | - nested virtual machine, so that the nested hypervisor's secrets are not |
1748 | - exposed to the nested virtual machine; |
1749 | - |
1750 | - - Flush the L1D cache on every switch from the nested virtual machine to |
1751 | - the nested hypervisor; this is a complex operation, and flushing the L1D |
1752 | - cache prevents the bare metal hypervisor's secrets from being exposed to the |
1753 | - nested virtual machine; |
1754 | - |
1755 | - - Instruct the nested hypervisor to not perform any L1D cache flush. This |
1756 | - is an optimization to avoid double L1D flushing. |
1757 | - |
1758 | - |
1759 | -.. _default_mitigations: |
1760 | - |
1761 | -Default mitigations |
1762 | -------------------- |
1763 | - |
1764 | - The kernel default mitigations for vulnerable processors are: |
1765 | - |
1766 | - - PTE inversion to protect against malicious user space. This is done |
1767 | - unconditionally and cannot be controlled. |
1768 | - |
1769 | - - L1D conditional flushing on VMENTER when EPT is enabled for |
1770 | - a guest. |
1771 | - |
1772 | - The kernel does not by default enforce the disabling of SMT, which leaves |
1773 | - SMT systems vulnerable when running untrusted guests with EPT enabled. |
1774 | - |
1775 | - The rationale for this choice is: |
1776 | - |
1777 | - - Force disabling SMT can break existing setups, especially with |
1778 | - unattended updates. |
1779 | - |
1780 | - - If regular users run untrusted guests on their machine, then L1TF is |
1781 | - just an add-on to other malware which might be embedded in an untrusted |
1782 | - guest, e.g. spam-bots or attacks on the local network. |
1783 | - |
1784 | - There is no technical way to prevent a user from running untrusted code |
1785 | - on their machines blindly. |
1786 | - |
1787 | - - It's technically extremely unlikely and from today's knowledge even |
1788 | - impossible that L1TF can be exploited via the most popular attack |
1789 | - mechanisms like JavaScript because these mechanisms have no way to |
1790 | - control PTEs. If this were possible and no other mitigation were |
1791 | - available, then the default might be different. |
1792 | - |
1793 | - - The administrators of cloud and hosting setups have to carefully |
1794 | - analyze the risk for their scenarios and make the appropriate |
1795 | - mitigation choices, which might even vary across their deployed |
1796 | - machines and also result in other changes of their overall setup. |
1797 | - There is no way for the kernel to provide a sensible default for this |
1798 | - kind of scenario. |
1799 | diff --git a/Documentation/spec_ctrl.txt b/Documentation/spec_ctrl.txt |
1800 | index 32f3d55c54b7..c4dbe6f7cdae 100644 |
1801 | --- a/Documentation/spec_ctrl.txt |
1802 | +++ b/Documentation/spec_ctrl.txt |
1803 | @@ -92,3 +92,12 @@ Speculation misfeature controls |
1804 | * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_ENABLE, 0, 0); |
1805 | * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_DISABLE, 0, 0); |
1806 | * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_FORCE_DISABLE, 0, 0); |
1807 | + |
1808 | +- PR_SPEC_INDIRECT_BRANCH: Indirect Branch Speculation in User Processes |
1809 | + (Mitigate Spectre V2 style attacks against user processes) |
1810 | + |
1811 | + Invocations: |
1812 | + * prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, 0, 0, 0); |
1813 | + * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_ENABLE, 0, 0); |
1814 | + * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_DISABLE, 0, 0); |
1815 | + * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_FORCE_DISABLE, 0, 0); |
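 + 
 +A minimal user-space sketch of the invocations above, assuming a kernel
 +and <sys/prctl.h> recent enough to define the PR_SPEC_* constants::
 + 
 +   #include <stdio.h>
 +   #include <sys/prctl.h>
 + 
 +   int main(void)
 +   {
 +           /* Query the current indirect branch speculation state. */
 +           long state = prctl(PR_GET_SPECULATION_CTRL,
 +                              PR_SPEC_INDIRECT_BRANCH, 0, 0, 0);
 + 
 +           if (state < 0) {
 +                   perror("PR_GET_SPECULATION_CTRL");
 +                   return 1;
 +           }
 +           printf("indirect branch speculation state: 0x%lx\n", state);
 + 
 +           /* Opt this task out of indirect branch speculation. */
 +           if (prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH,
 +                     PR_SPEC_DISABLE, 0, 0) < 0) {
 +                   perror("PR_SET_SPECULATION_CTRL");
 +                   return 1;
 +           }
 +           return 0;
 +   }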
1816 | diff --git a/Documentation/x86/conf.py b/Documentation/x86/conf.py |
1817 | new file mode 100644 |
1818 | index 000000000000..33c5c3142e20 |
1819 | --- /dev/null |
1820 | +++ b/Documentation/x86/conf.py |
1821 | @@ -0,0 +1,10 @@ |
1822 | +# -*- coding: utf-8; mode: python -*- |
1823 | + |
1824 | +project = "X86 architecture specific documentation" |
1825 | + |
1826 | +tags.add("subproject") |
1827 | + |
1828 | +latex_documents = [ |
1829 | + ('index', 'x86.tex', project, |
1830 | + 'The kernel development community', 'manual'), |
1831 | +] |
1832 | diff --git a/Documentation/x86/index.rst b/Documentation/x86/index.rst |
1833 | new file mode 100644 |
1834 | index 000000000000..ef389dcf1b1d |
1835 | --- /dev/null |
1836 | +++ b/Documentation/x86/index.rst |
1837 | @@ -0,0 +1,8 @@ |
1838 | +========================== |
1839 | +x86 architecture specifics |
1840 | +========================== |
1841 | + |
1842 | +.. toctree:: |
1843 | + :maxdepth: 1 |
1844 | + |
1845 | + mds |
1846 | diff --git a/Documentation/x86/mds.rst b/Documentation/x86/mds.rst |
1847 | new file mode 100644 |
1848 | index 000000000000..534e9baa4e1d |
1849 | --- /dev/null |
1850 | +++ b/Documentation/x86/mds.rst |
1851 | @@ -0,0 +1,225 @@ |
1852 | +Microarchitectural Data Sampling (MDS) mitigation |
1853 | +================================================= |
1854 | + |
1855 | +.. _mds: |
1856 | + |
1857 | +Overview |
1858 | +-------- |
1859 | + |
1860 | +Microarchitectural Data Sampling (MDS) is a family of side channel attacks |
1861 | +on internal buffers in Intel CPUs. The variants are: |
1862 | + |
1863 | + - Microarchitectural Store Buffer Data Sampling (MSBDS) (CVE-2018-12126) |
1864 | + - Microarchitectural Fill Buffer Data Sampling (MFBDS) (CVE-2018-12130) |
1865 | + - Microarchitectural Load Port Data Sampling (MLPDS) (CVE-2018-12127) |
1866 | + - Microarchitectural Data Sampling Uncacheable Memory (MDSUM) (CVE-2019-11091) |
1867 | + |
1868 | +MSBDS leaks Store Buffer Entries which can be speculatively forwarded to a |
1869 | +dependent load (store-to-load forwarding) as an optimization. The forward |
1870 | +can also happen to a faulting or assisting load operation for a different |
1871 | +memory address, which can be exploited under certain conditions. Store |
1872 | +buffers are partitioned between Hyper-Threads so cross thread forwarding is |
1873 | +not possible. But if a thread enters or exits a sleep state the store |
1874 | +buffer is repartitioned which can expose data from one thread to the other. |
1875 | + |
1876 | +MFBDS leaks Fill Buffer Entries. Fill buffers are used internally to manage |
1877 | +L1 miss situations and to hold data which is returned or sent in response |
1878 | +to a memory or I/O operation. Fill buffers can forward data to a load |
1879 | +operation and also write data to the cache. When the fill buffer is |
1880 | +deallocated it can retain the stale data of the preceding operations which |
1881 | +can then be forwarded to a faulting or assisting load operation, which can |
1882 | +be exploited under certain conditions. Fill buffers are shared between |
1883 | +Hyper-Threads so cross thread leakage is possible. |
1884 | + |
1885 | +MLPDS leaks Load Port Data. Load ports are used to perform load operations |
1886 | +from memory or I/O. The received data is then forwarded to the register |
1887 | +file or a subsequent operation. In some implementations the Load Port can |
1888 | +contain stale data from a previous operation which can be forwarded to |
1889 | +faulting or assisting loads under certain conditions, which again can be |
1890 | +exploited eventually. Load ports are shared between Hyper-Threads so cross |
1891 | +thread leakage is possible. |
1892 | + |
1893 | +MDSUM is a special case of MSBDS, MFBDS and MLPDS. An uncacheable load from |
1894 | +memory that takes a fault or assist can leave data in a microarchitectural |
1895 | +structure that may later be observed using one of the same methods used by |
1896 | +MSBDS, MFBDS or MLPDS. |
1897 | + |
1898 | +Exposure assumptions |
1899 | +-------------------- |
1900 | + |
1901 | +It is assumed that attack code resides in user space or in a guest with one |
1902 | +exception. The rationale behind this assumption is that the code construct |
1903 | +needed for exploiting MDS requires: |
1904 | + |
1905 | + - to control the load to trigger a fault or assist |
1906 | + |
1907 | + - to have a disclosure gadget which exposes the speculatively accessed |
1908 | + data for consumption through a side channel. |
1909 | + |
1910 | + - to control the pointer through which the disclosure gadget exposes the |
1911 | + data |
1912 | + |
1913 | +The existence of such a construct in the kernel cannot be excluded with |
1914 | +100% certainty, but the complexity involved makes it extremely unlikely. |
1915 | + |
1916 | +There is one exception, which is untrusted BPF. The functionality of |
1917 | +untrusted BPF is limited, but it needs to be thoroughly investigated |
1918 | +whether it can be used to create such a construct. |
1919 | + |
1920 | + |
1921 | +Mitigation strategy |
1922 | +------------------- |
1923 | + |
1924 | +All variants have the same mitigation strategy at least for the single CPU |
1925 | +thread case (SMT off): Force the CPU to clear the affected buffers. |
1926 | + |
1927 | +This is achieved by using the otherwise unused and obsolete VERW |
1928 | +instruction in combination with a microcode update. The microcode clears |
1929 | +the affected CPU buffers when the VERW instruction is executed. |
1930 | + |
1931 | +For virtualization there are two ways to achieve CPU buffer |
1932 | +clearing: either via the modified VERW instruction or via the L1D Flush |
1933 | +command. The latter is issued when L1TF mitigation is enabled so the extra |
1934 | +VERW can be avoided. If the CPU is not affected by L1TF then VERW needs to |
1935 | +be issued. |
1936 | + |
1937 | +If the VERW instruction with the supplied segment selector argument is |
1938 | +executed on a CPU without the microcode update there is no side effect |
1939 | +other than a small number of pointlessly wasted CPU cycles. |
1940 | + |
1941 | +This does not protect against cross Hyper-Thread attacks except for MSBDS |
1942 | +which is only exploitable cross Hyper-thread when one of the Hyper-Threads |
1943 | +enters a C-state. |
1944 | + |
1945 | +The kernel provides a function to invoke the buffer clearing: |
1946 | + |
1947 | + mds_clear_cpu_buffers() |
1948 | + |
1949 | +The mitigation is invoked on kernel/userspace, hypervisor/guest and C-state |
1950 | +(idle) transitions. |
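 + 
 +The helper is essentially a single VERW with a memory operand; a sketch
 +modeled on the helper this patch adds (u16 and __KERNEL_DS are
 +kernel-internal names, shown here for illustration)::
 + 
 +   static inline void mds_clear_cpu_buffers(void)
 +   {
 +           static const u16 ds = __KERNEL_DS;
 + 
 +           /*
 +            * With the microcode update, VERW with a valid writable
 +            * data segment selector clears the affected CPU buffers
 +            * as a side effect.
 +            */
 +           asm volatile("verw %[ds]" : : [ds] "m" (ds) : "cc");
 +   }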
1951 | + |
1952 | +As a special quirk to address virtualization scenarios where the host has |
1953 | +the microcode updated, but the hypervisor does not (yet) expose the |
1954 | +MD_CLEAR CPUID bit to guests, the kernel issues the VERW instruction in the |
1955 | +hope that it might actually clear the buffers. The state is reflected |
1956 | +accordingly. |
1957 | + |
1958 | +According to current knowledge additional mitigations inside the kernel |
1959 | +itself are not required because the necessary gadgets to expose the leaked |
1960 | +data cannot be controlled in a way which allows exploitation from malicious |
1961 | +user space or VM guests. |
1962 | + |
1963 | +Kernel internal mitigation modes |
1964 | +-------------------------------- |
1965 | + |
1966 | + ======= ============================================================ |
1967 | + off Mitigation is disabled. Either the CPU is not affected or |
1968 | + mds=off is supplied on the kernel command line |
1969 | + |
1970 | + full Mitigation is enabled. CPU is affected and MD_CLEAR is |
1971 | + advertised in CPUID. |
1972 | + |
1973 | + vmwerv Mitigation is enabled. CPU is affected and MD_CLEAR is not |
1974 | + advertised in CPUID. That is mainly for virtualization |
1975 | + scenarios where the host has the updated microcode but the |
1976 | + hypervisor does not expose MD_CLEAR in CPUID. It's a best |
1977 | + effort approach without guarantee. |
1978 | + ======= ============================================================ |
1979 | + |
1980 | +If the CPU is affected and mds=off is not supplied on the kernel command |
1981 | +line then the kernel selects the appropriate mitigation mode depending on |
1982 | +the availability of the MD_CLEAR CPUID bit. |
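 + 
 +A hedged sketch of this selection logic (simplified, with hypothetical
 +parameter names standing in for X86_BUG_MDS, the mds=off command line
 +switch and the MD_CLEAR CPUID bit)::
 + 
 +   enum mds_mitigations {
 +           MDS_MITIGATION_OFF,
 +           MDS_MITIGATION_FULL,
 +           MDS_MITIGATION_VMWERV,
 +   };
 + 
 +   /* Pick "full" when MD_CLEAR is advertised, otherwise fall back
 +    * to the best-effort "vmwerv" mode. */
 +   static enum mds_mitigations mds_select_mitigation(int cpu_affected,
 +                                                     int mds_off_cmdline,
 +                                                     int has_md_clear)
 +   {
 +           if (!cpu_affected || mds_off_cmdline)
 +                   return MDS_MITIGATION_OFF;
 + 
 +           return has_md_clear ? MDS_MITIGATION_FULL
 +                               : MDS_MITIGATION_VMWERV;
 +   }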
1983 | + |
1984 | +Mitigation points |
1985 | +----------------- |
1986 | + |
1987 | +1. Return to user space |
1988 | +^^^^^^^^^^^^^^^^^^^^^^^ |
1989 | + |
1990 | + When transitioning from kernel to user space the CPU buffers are flushed |
1991 | + on affected CPUs when the mitigation is not disabled on the kernel |
1992 | + command line. The mitigation is enabled through the static key |
1993 | + mds_user_clear. |
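 + 
 +  A sketch of the static-key-guarded call, modeled on the helper this
 +  patch introduces (the VERW-based mds_clear_cpu_buffers() is shown
 +  earlier in this document)::
 + 
 +     static inline void mds_user_clear_cpu_buffers(void)
 +     {
 +             /* Only pay for the clear when the mitigation is on. */
 +             if (static_branch_likely(&mds_user_clear))
 +                     mds_clear_cpu_buffers();
 +     }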
1994 | + |
1995 | + The mitigation is invoked in prepare_exit_to_usermode() which covers |
1996 | + most of the kernel to user space transitions. There are a few exceptions |
1997 | + which are not invoking prepare_exit_to_usermode() on return to user |
1998 | + space. These exceptions use the paranoid exit code. |
1999 | + |
2000 | + - Non Maskable Interrupt (NMI): |
2001 | + |
2002 | + Access to sensitive data like keys or credentials in the NMI context is |
2003 | + mostly theoretical: the CPU can do prefetching or execute a |
2004 | + misspeculated code path and thereby fetch data which might end up |
2005 | + leaking through a buffer. |
2006 | + |
2007 | + But for mounting other attacks the kernel stack address of the task is |
2008 | + already valuable information. So in full mitigation mode, the NMI is |
2009 | + mitigated on the return from do_nmi() to provide almost complete |
2010 | + coverage. |
2011 | + |
2012 | + - Double fault (#DF): |
2013 | + |
2014 | + A double fault is usually fatal, but the ESPFIX workaround, which can |
2015 | + be triggered from user space through modify_ldt(2) is a recoverable |
2016 | + double fault. #DF uses the paranoid exit path, so explicit mitigation |
2017 | + in the double fault handler is required. |
2018 | + |
2019 | + - Machine Check Exception (#MC): |
2020 | + |
2021 | + Another corner case is a #MC which hits between the CPU buffer clear |
2022 | + invocation and the actual return to user. As this still is in kernel |
2023 | + space it takes the paranoid exit path which does not clear the CPU |
2024 | + buffers. So the #MC handler repopulates the buffers to some |
2025 | + extent. Machine checks are not reliably controllable and the window is |
2026 | + extremely small so mitigation would just tick a checkbox that this |
2027 | + theoretical corner case is covered. To keep the amount of special |
2028 | + cases small, ignore #MC. |
2029 | + |
2030 | + - Debug Exception (#DB): |
2031 | + |
2032 | + This takes the paranoid exit path only when the INT1 breakpoint is in |
2033 | + kernel space. #DB on a user space address takes the regular exit path, |
2034 | + so no extra mitigation required. |
2035 | + |
2036 | + |
2037 | +2. C-State transition |
2038 | +^^^^^^^^^^^^^^^^^^^^^ |
2039 | + |
2040 | + When a CPU goes idle and enters a C-State the CPU buffers need to be |
2041 | + cleared on affected CPUs when SMT is active. This addresses the |
2042 | + repartitioning of the store buffer when one of the Hyper-Threads enters |
2043 | + a C-State. |
2044 | + |
2045 | + When SMT is inactive, i.e. either the CPU does not support it or all |
2046 | + sibling threads are offline, CPU buffer clearing is not required. |
2047 | + |
2048 | + The idle clearing is enabled on CPUs which are only affected by MSBDS |
2049 | + and not by any other MDS variant. The other MDS variants cannot be |
2050 | + protected against cross Hyper-Thread attacks because the Fill Buffer and |
2051 | + the Load Ports are shared. So on CPUs affected by other variants, the |
2052 | + idle clearing would be a window dressing exercise and is therefore not |
2053 | + activated. |
2054 | + |
2055 | + The invocation is controlled by the static key mds_idle_clear which is |
2056 | + switched depending on the chosen mitigation mode and the SMT state of |
2057 | + the system. |
2058 | + |
2059 | + The buffer clear is only invoked before entering the C-State to prevent |
2060 | + stale data from the idling CPU from spilling to the Hyper-Thread |
2061 | + sibling after the store buffer is repartitioned and all entries become |
2062 | + available to the non-idle sibling. |
2063 | + |
2064 | + When coming out of idle the store buffer is partitioned again so each |
2065 | + sibling has half of it available. The CPU coming back from idle could |
2066 | + then be speculatively exposed to the contents of the sibling. The buffers are |
2067 | + flushed either on exit to user space or on VMENTER so malicious code |
2068 | + in user space or the guest cannot speculatively access them. |
2069 | + |
2070 | + The mitigation is hooked into all variants of halt()/mwait(), but does |
2071 | + not cover the legacy ACPI IO-Port mechanism because the ACPI idle driver |
2072 | + has been superseded by the intel_idle driver around 2010 and is |
2073 | + preferred on all affected CPUs which are expected to gain the MD_CLEAR |
2074 | + functionality in microcode. Aside from that, the IO-Port mechanism is a |
2075 | + legacy interface which is only used on older systems which are either |
2076 | + not affected or do not receive microcode updates anymore. |
2077 | diff --git a/Makefile b/Makefile |
2078 | index e52b0579e176..92fe701e5582 100644 |
2079 | --- a/Makefile |
2080 | +++ b/Makefile |
2081 | @@ -1,6 +1,6 @@ |
2082 | VERSION = 4 |
2083 | PATCHLEVEL = 9 |
2084 | -SUBLEVEL = 175 |
2085 | +SUBLEVEL = 176 |
2086 | EXTRAVERSION = |
2087 | NAME = Roaring Lionus |
2088 | |
2089 | diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig |
2090 | index 5a4591ff8407..e0055b4302d6 100644 |
2091 | --- a/arch/x86/Kconfig |
2092 | +++ b/arch/x86/Kconfig |
2093 | @@ -937,13 +937,7 @@ config NR_CPUS |
2094 | approximately eight kilobytes to the kernel image. |
2095 | |
2096 | config SCHED_SMT |
2097 | - bool "SMT (Hyperthreading) scheduler support" |
2098 | - depends on SMP |
2099 | - ---help--- |
2100 | - SMT scheduler support improves the CPU scheduler's decision making |
2101 | - when dealing with Intel Pentium 4 chips with HyperThreading at a |
2102 | - cost of slightly increased overhead in some places. If unsure say |
2103 | - N here. |
2104 | + def_bool y if SMP |
2105 | |
2106 | config SCHED_MC |
2107 | def_bool y |
2108 | diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c |
2109 | index b0cd306dc527..8841d016b4a4 100644 |
2110 | --- a/arch/x86/entry/common.c |
2111 | +++ b/arch/x86/entry/common.c |
2112 | @@ -28,6 +28,7 @@ |
2113 | #include <asm/vdso.h> |
2114 | #include <asm/uaccess.h> |
2115 | #include <asm/cpufeature.h> |
2116 | +#include <asm/nospec-branch.h> |
2117 | |
2118 | #define CREATE_TRACE_POINTS |
2119 | #include <trace/events/syscalls.h> |
2120 | @@ -206,6 +207,8 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) |
2121 | #endif |
2122 | |
2123 | user_enter_irqoff(); |
2124 | + |
2125 | + mds_user_clear_cpu_buffers(); |
2126 | } |
2127 | |
2128 | #define SYSCALL_EXIT_WORK_FLAGS \ |
2129 | diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c |
2130 | index a30829052a00..cb8178a2783a 100644 |
2131 | --- a/arch/x86/events/intel/core.c |
2132 | +++ b/arch/x86/events/intel/core.c |
2133 | @@ -3750,11 +3750,11 @@ __init int intel_pmu_init(void) |
2134 | pr_cont("Nehalem events, "); |
2135 | break; |
2136 | |
2137 | - case INTEL_FAM6_ATOM_PINEVIEW: |
2138 | - case INTEL_FAM6_ATOM_LINCROFT: |
2139 | - case INTEL_FAM6_ATOM_PENWELL: |
2140 | - case INTEL_FAM6_ATOM_CLOVERVIEW: |
2141 | - case INTEL_FAM6_ATOM_CEDARVIEW: |
2142 | + case INTEL_FAM6_ATOM_BONNELL: |
2143 | + case INTEL_FAM6_ATOM_BONNELL_MID: |
2144 | + case INTEL_FAM6_ATOM_SALTWELL: |
2145 | + case INTEL_FAM6_ATOM_SALTWELL_MID: |
2146 | + case INTEL_FAM6_ATOM_SALTWELL_TABLET: |
2147 | memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, |
2148 | sizeof(hw_cache_event_ids)); |
2149 | |
2150 | @@ -3766,9 +3766,11 @@ __init int intel_pmu_init(void) |
2151 | pr_cont("Atom events, "); |
2152 | break; |
2153 | |
2154 | - case INTEL_FAM6_ATOM_SILVERMONT1: |
2155 | - case INTEL_FAM6_ATOM_SILVERMONT2: |
2156 | + case INTEL_FAM6_ATOM_SILVERMONT: |
2157 | + case INTEL_FAM6_ATOM_SILVERMONT_X: |
2158 | + case INTEL_FAM6_ATOM_SILVERMONT_MID: |
2159 | case INTEL_FAM6_ATOM_AIRMONT: |
2160 | + case INTEL_FAM6_ATOM_AIRMONT_MID: |
2161 | memcpy(hw_cache_event_ids, slm_hw_cache_event_ids, |
2162 | sizeof(hw_cache_event_ids)); |
2163 | memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs, |
2164 | @@ -3785,7 +3787,7 @@ __init int intel_pmu_init(void) |
2165 | break; |
2166 | |
2167 | case INTEL_FAM6_ATOM_GOLDMONT: |
2168 | - case INTEL_FAM6_ATOM_DENVERTON: |
2169 | + case INTEL_FAM6_ATOM_GOLDMONT_X: |
2170 | memcpy(hw_cache_event_ids, glm_hw_cache_event_ids, |
2171 | sizeof(hw_cache_event_ids)); |
2172 | memcpy(hw_cache_extra_regs, glm_hw_cache_extra_regs, |
2173 | diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c |
2174 | index 47d526c700a1..72d09340c24d 100644 |
2175 | --- a/arch/x86/events/intel/cstate.c |
2176 | +++ b/arch/x86/events/intel/cstate.c |
2177 | @@ -531,8 +531,8 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { |
2178 | |
2179 | X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_ULT, hswult_cstates), |
2180 | |
2181 | - X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT1, slm_cstates), |
2182 | - X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT2, slm_cstates), |
2183 | + X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT, slm_cstates), |
2184 | + X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT_X, slm_cstates), |
2185 | X86_CSTATES_MODEL(INTEL_FAM6_ATOM_AIRMONT, slm_cstates), |
2186 | |
2187 | X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_CORE, snb_cstates), |
2188 | diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c |
2189 | index be0b1968d60a..68144a341903 100644 |
2190 | --- a/arch/x86/events/msr.c |
2191 | +++ b/arch/x86/events/msr.c |
2192 | @@ -61,8 +61,8 @@ static bool test_intel(int idx) |
2193 | case INTEL_FAM6_BROADWELL_GT3E: |
2194 | case INTEL_FAM6_BROADWELL_X: |
2195 | |
2196 | - case INTEL_FAM6_ATOM_SILVERMONT1: |
2197 | - case INTEL_FAM6_ATOM_SILVERMONT2: |
2198 | + case INTEL_FAM6_ATOM_SILVERMONT: |
2199 | + case INTEL_FAM6_ATOM_SILVERMONT_X: |
2200 | case INTEL_FAM6_ATOM_AIRMONT: |
2201 | if (idx == PERF_MSR_SMI) |
2202 | return true; |
2203 | diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h |
2204 | index 98444b77fbe3..06de338be0d8 100644 |
2205 | --- a/arch/x86/include/asm/cpufeatures.h |
2206 | +++ b/arch/x86/include/asm/cpufeatures.h |
2207 | @@ -271,10 +271,12 @@ |
2208 | /* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */ |
2209 | #define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */ |
2210 | #define X86_FEATURE_IRPERF (13*32+1) /* Instructions Retired Count */ |
2211 | -#define X86_FEATURE_AMD_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */ |
2212 | -#define X86_FEATURE_AMD_IBRS (13*32+14) /* Indirect Branch Restricted Speculation */ |
2213 | -#define X86_FEATURE_AMD_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */ |
2214 | +#define X86_FEATURE_AMD_IBPB (13*32+12) /* "" Indirect Branch Prediction Barrier */ |
2215 | +#define X86_FEATURE_AMD_IBRS (13*32+14) /* "" Indirect Branch Restricted Speculation */ |
2216 | +#define X86_FEATURE_AMD_STIBP (13*32+15) /* "" Single Thread Indirect Branch Predictors */ |
2217 | +#define X86_FEATURE_AMD_SSBD (13*32+24) /* "" Speculative Store Bypass Disable */ |
2218 | #define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */ |
2219 | +#define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */ |
2220 | |
2221 | /* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */ |
2222 | #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ |
2223 | @@ -315,6 +317,7 @@ |
2224 | #define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */ |
2225 | #define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */ |
2226 | #define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */ |
2227 | +#define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */ |
2228 | #define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ |
2229 | #define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ |
2230 | #define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */ |
2231 | @@ -352,5 +355,7 @@ |
2232 | #define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */ |
2233 | #define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* CPU is affected by speculative store bypass attack */ |
2234 | #define X86_BUG_L1TF X86_BUG(18) /* CPU is affected by L1 Terminal Fault */ |
2235 | +#define X86_BUG_MDS X86_BUG(19) /* CPU is affected by Microarchitectural data sampling */ |
2236 | +#define X86_BUG_MSBDS_ONLY X86_BUG(20) /* CPU is only affected by the MSDBS variant of BUG_MDS */ |
2237 | |
2238 | #endif /* _ASM_X86_CPUFEATURES_H */ |
2239 | diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h |
2240 | index 75b748a1deb8..ba7b6f736414 100644 |
2241 | --- a/arch/x86/include/asm/intel-family.h |
2242 | +++ b/arch/x86/include/asm/intel-family.h |
2243 | @@ -50,19 +50,23 @@ |
2244 | |
2245 | /* "Small Core" Processors (Atom) */ |
2246 | |
2247 | -#define INTEL_FAM6_ATOM_PINEVIEW 0x1C |
2248 | -#define INTEL_FAM6_ATOM_LINCROFT 0x26 |
2249 | -#define INTEL_FAM6_ATOM_PENWELL 0x27 |
2250 | -#define INTEL_FAM6_ATOM_CLOVERVIEW 0x35 |
2251 | -#define INTEL_FAM6_ATOM_CEDARVIEW 0x36 |
2252 | -#define INTEL_FAM6_ATOM_SILVERMONT1 0x37 /* BayTrail/BYT / Valleyview */ |
2253 | -#define INTEL_FAM6_ATOM_SILVERMONT2 0x4D /* Avaton/Rangely */ |
2254 | -#define INTEL_FAM6_ATOM_AIRMONT 0x4C /* CherryTrail / Braswell */ |
2255 | -#define INTEL_FAM6_ATOM_MERRIFIELD 0x4A /* Tangier */ |
2256 | -#define INTEL_FAM6_ATOM_MOOREFIELD 0x5A /* Anniedale */ |
2257 | -#define INTEL_FAM6_ATOM_GOLDMONT 0x5C |
2258 | -#define INTEL_FAM6_ATOM_DENVERTON 0x5F /* Goldmont Microserver */ |
2259 | -#define INTEL_FAM6_ATOM_GEMINI_LAKE 0x7A |
2260 | +#define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */ |
2261 | +#define INTEL_FAM6_ATOM_BONNELL_MID 0x26 /* Silverthorne, Lincroft */ |
2262 | + |
2263 | +#define INTEL_FAM6_ATOM_SALTWELL 0x36 /* Cedarview */ |
2264 | +#define INTEL_FAM6_ATOM_SALTWELL_MID 0x27 /* Penwell */ |
2265 | +#define INTEL_FAM6_ATOM_SALTWELL_TABLET 0x35 /* Cloverview */ |
2266 | + |
2267 | +#define INTEL_FAM6_ATOM_SILVERMONT 0x37 /* Bay Trail, Valleyview */ |
2268 | +#define INTEL_FAM6_ATOM_SILVERMONT_X 0x4D /* Avaton, Rangely */ |
2269 | +#define INTEL_FAM6_ATOM_SILVERMONT_MID 0x4A /* Merrifield */ |
2270 | + |
2271 | +#define INTEL_FAM6_ATOM_AIRMONT 0x4C /* Cherry Trail, Braswell */ |
2272 | +#define INTEL_FAM6_ATOM_AIRMONT_MID 0x5A /* Moorefield */ |
2273 | + |
2274 | +#define INTEL_FAM6_ATOM_GOLDMONT 0x5C /* Apollo Lake */ |
2275 | +#define INTEL_FAM6_ATOM_GOLDMONT_X 0x5F /* Denverton */ |
2276 | +#define INTEL_FAM6_ATOM_GOLDMONT_PLUS 0x7A /* Gemini Lake */ |
2277 | |
2278 | /* Xeon Phi */ |
2279 | |
2280 | diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h |
2281 | index 508a062e6cf1..0c8f4281b151 100644 |
2282 | --- a/arch/x86/include/asm/irqflags.h |
2283 | +++ b/arch/x86/include/asm/irqflags.h |
2284 | @@ -5,6 +5,8 @@ |
2285 | |
2286 | #ifndef __ASSEMBLY__ |
2287 | |
2288 | +#include <asm/nospec-branch.h> |
2289 | + |
2290 | /* Provide __cpuidle; we can't safely include <linux/cpu.h> */ |
2291 | #define __cpuidle __attribute__((__section__(".cpuidle.text"))) |
2292 | |
2293 | @@ -53,11 +55,13 @@ static inline void native_irq_enable(void) |
2294 | |
2295 | static inline __cpuidle void native_safe_halt(void) |
2296 | { |
2297 | + mds_idle_clear_cpu_buffers(); |
2298 | asm volatile("sti; hlt": : :"memory"); |
2299 | } |
2300 | |
2301 | static inline __cpuidle void native_halt(void) |
2302 | { |
2303 | + mds_idle_clear_cpu_buffers(); |
2304 | asm volatile("hlt": : :"memory"); |
2305 | } |
2306 | |
2307 | diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h |
2308 | index 5e69154c9f07..a61ec81b27db 100644 |
2309 | --- a/arch/x86/include/asm/microcode_intel.h |
2310 | +++ b/arch/x86/include/asm/microcode_intel.h |
2311 | @@ -52,6 +52,21 @@ struct extended_sigtable { |
2312 | |
2313 | #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) |
2314 | |
2315 | +static inline u32 intel_get_microcode_revision(void) |
2316 | +{ |
2317 | + u32 rev, dummy; |
2318 | + |
2319 | + native_wrmsrl(MSR_IA32_UCODE_REV, 0); |
2320 | + |
2321 | + /* As documented in the SDM: Do a CPUID 1 here */ |
2322 | + sync_core(); |
2323 | + |
2324 | + /* get the current revision from MSR 0x8B */ |
2325 | + native_rdmsr(MSR_IA32_UCODE_REV, dummy, rev); |
2326 | + |
2327 | + return rev; |
2328 | +} |
2329 | + |
2330 | extern int has_newer_microcode(void *mc, unsigned int csig, int cpf, int rev); |
2331 | extern int microcode_sanity_check(void *mc, int print_err); |
2332 | extern int find_matching_signature(void *mc, unsigned int csig, int cpf); |
2333 | diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h |
2334 | index 9963e21ac443..38f94d07920d 100644 |
2335 | --- a/arch/x86/include/asm/msr-index.h |
2336 | +++ b/arch/x86/include/asm/msr-index.h |
2337 | @@ -1,6 +1,8 @@ |
2338 | #ifndef _ASM_X86_MSR_INDEX_H |
2339 | #define _ASM_X86_MSR_INDEX_H |
2340 | |
2341 | +#include <linux/bits.h> |
2342 | + |
2343 | /* |
2344 | * CPU model specific register (MSR) numbers. |
2345 | * |
2346 | @@ -38,13 +40,14 @@ |
2347 | |
2348 | /* Intel MSRs. Some also available on other CPUs */ |
2349 | #define MSR_IA32_SPEC_CTRL 0x00000048 /* Speculation Control */ |
2350 | -#define SPEC_CTRL_IBRS (1 << 0) /* Indirect Branch Restricted Speculation */ |
2351 | -#define SPEC_CTRL_STIBP (1 << 1) /* Single Thread Indirect Branch Predictors */ |
2352 | +#define SPEC_CTRL_IBRS BIT(0) /* Indirect Branch Restricted Speculation */ |
2353 | +#define SPEC_CTRL_STIBP_SHIFT 1 /* Single Thread Indirect Branch Predictor (STIBP) bit */ |
2354 | +#define SPEC_CTRL_STIBP BIT(SPEC_CTRL_STIBP_SHIFT) /* STIBP mask */ |
2355 | #define SPEC_CTRL_SSBD_SHIFT 2 /* Speculative Store Bypass Disable bit */ |
2356 | -#define SPEC_CTRL_SSBD (1 << SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */ |
2357 | +#define SPEC_CTRL_SSBD BIT(SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */ |
2358 | |
2359 | #define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */ |
2360 | -#define PRED_CMD_IBPB (1 << 0) /* Indirect Branch Prediction Barrier */ |
2361 | +#define PRED_CMD_IBPB BIT(0) /* Indirect Branch Prediction Barrier */ |
2362 | |
2363 | #define MSR_IA32_PERFCTR0 0x000000c1 |
2364 | #define MSR_IA32_PERFCTR1 0x000000c2 |
2365 | @@ -61,20 +64,25 @@ |
2366 | #define MSR_MTRRcap 0x000000fe |
2367 | |
2368 | #define MSR_IA32_ARCH_CAPABILITIES 0x0000010a |
2369 | -#define ARCH_CAP_RDCL_NO (1 << 0) /* Not susceptible to Meltdown */ |
2370 | -#define ARCH_CAP_IBRS_ALL (1 << 1) /* Enhanced IBRS support */ |
2371 | -#define ARCH_CAP_SKIP_VMENTRY_L1DFLUSH (1 << 3) /* Skip L1D flush on vmentry */ |
2372 | -#define ARCH_CAP_SSB_NO (1 << 4) /* |
2373 | - * Not susceptible to Speculative Store Bypass |
2374 | - * attack, so no Speculative Store Bypass |
2375 | - * control required. |
2376 | - */ |
2377 | +#define ARCH_CAP_RDCL_NO BIT(0) /* Not susceptible to Meltdown */ |
2378 | +#define ARCH_CAP_IBRS_ALL BIT(1) /* Enhanced IBRS support */ |
2379 | +#define ARCH_CAP_SKIP_VMENTRY_L1DFLUSH BIT(3) /* Skip L1D flush on vmentry */ |
2380 | +#define ARCH_CAP_SSB_NO BIT(4) /* |
2381 | + * Not susceptible to Speculative Store Bypass |
2382 | + * attack, so no Speculative Store Bypass |
2383 | + * control required. |
2384 | + */ |
2385 | +#define ARCH_CAP_MDS_NO BIT(5) /* |
2386 | + * Not susceptible to |
2387 | + * Microarchitectural Data |
2388 | + * Sampling (MDS) vulnerabilities. |
2389 | + */ |
2390 | |
2391 | #define MSR_IA32_FLUSH_CMD 0x0000010b |
2392 | -#define L1D_FLUSH (1 << 0) /* |
2393 | - * Writeback and invalidate the |
2394 | - * L1 data cache. |
2395 | - */ |
2396 | +#define L1D_FLUSH BIT(0) /* |
2397 | + * Writeback and invalidate the |
2398 | + * L1 data cache. |
2399 | + */ |
2400 | |
2401 | #define MSR_IA32_BBL_CR_CTL 0x00000119 |
2402 | #define MSR_IA32_BBL_CR_CTL3 0x0000011e |
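The (1 << n) to BIT(n) conversions above are cosmetic; BIT() comes from the newly included <linux/bits.h>. In simplified form (the real header also carries BIT_ULL() and the GENMASK() helpers):

/* Simplified form of the macro from <linux/bits.h>. */
#define BIT(nr)	(1UL << (nr))

/* e.g. SPEC_CTRL_SSBD == BIT(SPEC_CTRL_SSBD_SHIFT) == BIT(2) == 0x4 */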
2403 | diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h |
2404 | index f37f2d8a2989..0b40cc442bda 100644 |
2405 | --- a/arch/x86/include/asm/mwait.h |
2406 | +++ b/arch/x86/include/asm/mwait.h |
2407 | @@ -4,6 +4,7 @@ |
2408 | #include <linux/sched.h> |
2409 | |
2410 | #include <asm/cpufeature.h> |
2411 | +#include <asm/nospec-branch.h> |
2412 | |
2413 | #define MWAIT_SUBSTATE_MASK 0xf |
2414 | #define MWAIT_CSTATE_MASK 0xf |
2415 | @@ -38,6 +39,8 @@ static inline void __monitorx(const void *eax, unsigned long ecx, |
2416 | |
2417 | static inline void __mwait(unsigned long eax, unsigned long ecx) |
2418 | { |
2419 | + mds_idle_clear_cpu_buffers(); |
2420 | + |
2421 | /* "mwait %eax, %ecx;" */ |
2422 | asm volatile(".byte 0x0f, 0x01, 0xc9;" |
2423 | :: "a" (eax), "c" (ecx)); |
2424 | @@ -72,6 +75,8 @@ static inline void __mwait(unsigned long eax, unsigned long ecx) |
2425 | static inline void __mwaitx(unsigned long eax, unsigned long ebx, |
2426 | unsigned long ecx) |
2427 | { |
2428 | + /* No MDS buffer clear as this is AMD/HYGON only */ |
2429 | + |
2430 | /* "mwaitx %eax, %ebx, %ecx;" */ |
2431 | asm volatile(".byte 0x0f, 0x01, 0xfb;" |
2432 | :: "a" (eax), "b" (ebx), "c" (ecx)); |
2433 | @@ -79,6 +84,8 @@ static inline void __mwaitx(unsigned long eax, unsigned long ebx, |
2434 | |
2435 | static inline void __sti_mwait(unsigned long eax, unsigned long ecx) |
2436 | { |
2437 | + mds_idle_clear_cpu_buffers(); |
2438 | + |
2439 | trace_hardirqs_on(); |
2440 | /* "mwait %eax, %ecx;" */ |
2441 | asm volatile("sti; .byte 0x0f, 0x01, 0xc9;" |
2442 | diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h |
2443 | index 1b4132161c1f..031a58e84e5b 100644 |
2444 | --- a/arch/x86/include/asm/nospec-branch.h |
2445 | +++ b/arch/x86/include/asm/nospec-branch.h |
2446 | @@ -3,6 +3,8 @@ |
2447 | #ifndef _ASM_X86_NOSPEC_BRANCH_H_ |
2448 | #define _ASM_X86_NOSPEC_BRANCH_H_ |
2449 | |
2450 | +#include <linux/static_key.h> |
2451 | + |
2452 | #include <asm/alternative.h> |
2453 | #include <asm/alternative-asm.h> |
2454 | #include <asm/cpufeatures.h> |
2455 | @@ -214,10 +216,17 @@ enum spectre_v2_mitigation { |
2456 | SPECTRE_V2_RETPOLINE_MINIMAL_AMD, |
2457 | SPECTRE_V2_RETPOLINE_GENERIC, |
2458 | SPECTRE_V2_RETPOLINE_AMD, |
2459 | - SPECTRE_V2_IBRS, |
2460 | SPECTRE_V2_IBRS_ENHANCED, |
2461 | }; |
2462 | |
2463 | +/* The indirect branch speculation control variants */ |
2464 | +enum spectre_v2_user_mitigation { |
2465 | + SPECTRE_V2_USER_NONE, |
2466 | + SPECTRE_V2_USER_STRICT, |
2467 | + SPECTRE_V2_USER_PRCTL, |
2468 | + SPECTRE_V2_USER_SECCOMP, |
2469 | +}; |
2470 | + |
2471 | /* The Speculative Store Bypass disable variants */ |
2472 | enum ssb_mitigation { |
2473 | SPEC_STORE_BYPASS_NONE, |
2474 | @@ -295,6 +304,60 @@ do { \ |
2475 | preempt_enable(); \ |
2476 | } while (0) |
2477 | |
2478 | +DECLARE_STATIC_KEY_FALSE(switch_to_cond_stibp); |
2479 | +DECLARE_STATIC_KEY_FALSE(switch_mm_cond_ibpb); |
2480 | +DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb); |
2481 | + |
2482 | +DECLARE_STATIC_KEY_FALSE(mds_user_clear); |
2483 | +DECLARE_STATIC_KEY_FALSE(mds_idle_clear); |
2484 | + |
2485 | +#include <asm/segment.h> |
2486 | + |
2487 | +/** |
2488 | + * mds_clear_cpu_buffers - Mitigation for MDS vulnerability |
2489 | + * |
2490 | + * This uses the otherwise unused and obsolete VERW instruction in |
2491 | + * combination with microcode which triggers a CPU buffer flush when the |
2492 | + * instruction is executed. |
2493 | + */ |
2494 | +static inline void mds_clear_cpu_buffers(void) |
2495 | +{ |
2496 | + static const u16 ds = __KERNEL_DS; |
2497 | + |
2498 | + /* |
2499 | + * Has to be the memory-operand variant because only that |
2500 | + * guarantees the CPU buffer flush functionality according to |
2501 | + * documentation. The register-operand variant does not. |
2502 | + * Works with any segment selector, but a valid writable |
2503 | + * data segment is the fastest variant. |
2504 | + * |
2505 | + * "cc" clobber is required because VERW modifies ZF. |
2506 | + */ |
2507 | + asm volatile("verw %[ds]" : : [ds] "m" (ds) : "cc"); |
2508 | +} |
2509 | + |
2510 | +/** |
2511 | + * mds_user_clear_cpu_buffers - Mitigation for MDS vulnerability |
2512 | + * |
2513 | + * Clear CPU buffers if the corresponding static key is enabled |
2514 | + */ |
2515 | +static inline void mds_user_clear_cpu_buffers(void) |
2516 | +{ |
2517 | + if (static_branch_likely(&mds_user_clear)) |
2518 | + mds_clear_cpu_buffers(); |
2519 | +} |
2520 | + |
2521 | +/** |
2522 | + * mds_idle_clear_cpu_buffers - Mitigation for MDS vulnerability |
2523 | + * |
2524 | + * Clear CPU buffers if the corresponding static key is enabled |
2525 | + */ |
2526 | +static inline void mds_idle_clear_cpu_buffers(void) |
2527 | +{ |
2528 | + if (static_branch_likely(&mds_idle_clear)) |
2529 | + mds_clear_cpu_buffers(); |
2530 | +} |
2531 | + |
2532 | #endif /* __ASSEMBLY__ */ |
2533 | |
2534 | /* |
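These helpers get wired up elsewhere in the series: the exit-to-user path calls the user variant and the idle paths above call the idle variant. A condensed sketch of the exit side, using the 4.9 hook prepare_exit_to_usermode(); the body shown here is abbreviated:

/* Sketch: clear CPU buffers as the last step before returning to user. */
__visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
{
	/* ... handle pending signals, notify-resume work, etc. ... */

	mds_user_clear_cpu_buffers();	/* no-op unless mds_user_clear is set */
}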
2535 | diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h |
2536 | index 221a32ed1372..f12e61e2a86b 100644 |
2537 | --- a/arch/x86/include/asm/pgtable_64.h |
2538 | +++ b/arch/x86/include/asm/pgtable_64.h |
2539 | @@ -44,15 +44,15 @@ struct mm_struct; |
2540 | void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte); |
2541 | |
2542 | |
2543 | -static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr, |
2544 | - pte_t *ptep) |
2545 | +static inline void native_set_pte(pte_t *ptep, pte_t pte) |
2546 | { |
2547 | - *ptep = native_make_pte(0); |
2548 | + WRITE_ONCE(*ptep, pte); |
2549 | } |
2550 | |
2551 | -static inline void native_set_pte(pte_t *ptep, pte_t pte) |
2552 | +static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr, |
2553 | + pte_t *ptep) |
2554 | { |
2555 | - *ptep = pte; |
2556 | + native_set_pte(ptep, native_make_pte(0)); |
2557 | } |
2558 | |
2559 | static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) |
2560 | @@ -62,7 +62,7 @@ static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) |
2561 | |
2562 | static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) |
2563 | { |
2564 | - *pmdp = pmd; |
2565 | + WRITE_ONCE(*pmdp, pmd); |
2566 | } |
2567 | |
2568 | static inline void native_pmd_clear(pmd_t *pmd) |
2569 | @@ -98,7 +98,7 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) |
2570 | |
2571 | static inline void native_set_pud(pud_t *pudp, pud_t pud) |
2572 | { |
2573 | - *pudp = pud; |
2574 | + WRITE_ONCE(*pudp, pud); |
2575 | } |
2576 | |
2577 | static inline void native_pud_clear(pud_t *pud) |
2578 | @@ -131,7 +131,7 @@ static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) |
2579 | |
2580 | static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) |
2581 | { |
2582 | - *pgdp = kaiser_set_shadow_pgd(pgdp, pgd); |
2583 | + WRITE_ONCE(*pgdp, kaiser_set_shadow_pgd(pgdp, pgd)); |
2584 | } |
2585 | |
2586 | static inline void native_pgd_clear(pgd_t *pgd) |
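Replacing the plain assignments with WRITE_ONCE() prevents the compiler from tearing or re-issuing the page-table stores, which matters because the hardware page walker reads the entries concurrently with these writes. Swapping the order of the two functions lets the clear be expressed through the same single-store primitive:

/*
 * A plain "*ptep = pte" may legally be split or re-emitted by the
 * compiler, exposing a half-written entry to the page walker.
 * WRITE_ONCE() forces one volatile, full-width store:
 */
WRITE_ONCE(*ptep, pte);				/* single, untorn store */
native_set_pte(ptep, native_make_pte(0));	/* clear == store of zero */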
2587 | diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h |
2588 | index ee8c6290c421..155e49fc7010 100644 |
2589 | --- a/arch/x86/include/asm/processor.h |
2590 | +++ b/arch/x86/include/asm/processor.h |
2591 | @@ -874,4 +874,10 @@ enum l1tf_mitigations { |
2592 | |
2593 | extern enum l1tf_mitigations l1tf_mitigation; |
2594 | |
2595 | +enum mds_mitigations { |
2596 | + MDS_MITIGATION_OFF, |
2597 | + MDS_MITIGATION_FULL, |
2598 | + MDS_MITIGATION_VMWERV, |
2599 | +}; |
2600 | + |
2601 | #endif /* _ASM_X86_PROCESSOR_H */ |
2602 | diff --git a/arch/x86/include/asm/spec-ctrl.h b/arch/x86/include/asm/spec-ctrl.h |
2603 | index ae7c2c5cd7f0..5393babc0598 100644 |
2604 | --- a/arch/x86/include/asm/spec-ctrl.h |
2605 | +++ b/arch/x86/include/asm/spec-ctrl.h |
2606 | @@ -53,12 +53,24 @@ static inline u64 ssbd_tif_to_spec_ctrl(u64 tifn) |
2607 | return (tifn & _TIF_SSBD) >> (TIF_SSBD - SPEC_CTRL_SSBD_SHIFT); |
2608 | } |
2609 | |
2610 | +static inline u64 stibp_tif_to_spec_ctrl(u64 tifn) |
2611 | +{ |
2612 | + BUILD_BUG_ON(TIF_SPEC_IB < SPEC_CTRL_STIBP_SHIFT); |
2613 | + return (tifn & _TIF_SPEC_IB) >> (TIF_SPEC_IB - SPEC_CTRL_STIBP_SHIFT); |
2614 | +} |
2615 | + |
2616 | static inline unsigned long ssbd_spec_ctrl_to_tif(u64 spec_ctrl) |
2617 | { |
2618 | BUILD_BUG_ON(TIF_SSBD < SPEC_CTRL_SSBD_SHIFT); |
2619 | return (spec_ctrl & SPEC_CTRL_SSBD) << (TIF_SSBD - SPEC_CTRL_SSBD_SHIFT); |
2620 | } |
2621 | |
2622 | +static inline unsigned long stibp_spec_ctrl_to_tif(u64 spec_ctrl) |
2623 | +{ |
2624 | + BUILD_BUG_ON(TIF_SPEC_IB < SPEC_CTRL_STIBP_SHIFT); |
2625 | + return (spec_ctrl & SPEC_CTRL_STIBP) << (TIF_SPEC_IB - SPEC_CTRL_STIBP_SHIFT); |
2626 | +} |
2627 | + |
2628 | static inline u64 ssbd_tif_to_amd_ls_cfg(u64 tifn) |
2629 | { |
2630 | return (tifn & _TIF_SSBD) ? x86_amd_ls_cfg_ssbd_mask : 0ULL; |
2631 | @@ -70,11 +82,7 @@ extern void speculative_store_bypass_ht_init(void); |
2632 | static inline void speculative_store_bypass_ht_init(void) { } |
2633 | #endif |
2634 | |
2635 | -extern void speculative_store_bypass_update(unsigned long tif); |
2636 | - |
2637 | -static inline void speculative_store_bypass_update_current(void) |
2638 | -{ |
2639 | - speculative_store_bypass_update(current_thread_info()->flags); |
2640 | -} |
2641 | +extern void speculation_ctrl_update(unsigned long tif); |
2642 | +extern void speculation_ctrl_update_current(void); |
2643 | |
2644 | #endif |
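The conversion helpers rely on each TIF bit sitting above its SPEC_CTRL counterpart, which the BUILD_BUG_ON()s enforce. With the values in this series (TIF_SPEC_IB is bit 9, SPEC_CTRL_STIBP is bit 1), the transform is a plain shift:

/*
 * Worked example:
 *   stibp_tif_to_spec_ctrl(tifn)
 *     = (tifn & (1 << 9)) >> (9 - 1)
 *     = bit 9 of tifn moved down to bit 1, the STIBP position
 */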
2645 | diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h |
2646 | index 5cb436acd463..676e84f521ba 100644 |
2647 | --- a/arch/x86/include/asm/switch_to.h |
2648 | +++ b/arch/x86/include/asm/switch_to.h |
2649 | @@ -8,9 +8,6 @@ struct task_struct *__switch_to_asm(struct task_struct *prev, |
2650 | |
2651 | __visible struct task_struct *__switch_to(struct task_struct *prev, |
2652 | struct task_struct *next); |
2653 | -struct tss_struct; |
2654 | -void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, |
2655 | - struct tss_struct *tss); |
2656 | |
2657 | /* This runs on the previous thread's stack. */ |
2658 | static inline void prepare_switch_to(struct task_struct *prev, |
2659 | diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h |
2660 | index 2d8788a59b4d..0438f7fbb383 100644 |
2661 | --- a/arch/x86/include/asm/thread_info.h |
2662 | +++ b/arch/x86/include/asm/thread_info.h |
2663 | @@ -83,10 +83,12 @@ struct thread_info { |
2664 | #define TIF_SIGPENDING 2 /* signal pending */ |
2665 | #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ |
2666 | #define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/ |
2667 | -#define TIF_SSBD 5 /* Reduced data speculation */ |
2668 | +#define TIF_SSBD 5 /* Speculative store bypass disable */ |
2669 | #define TIF_SYSCALL_EMU 6 /* syscall emulation active */ |
2670 | #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ |
2671 | #define TIF_SECCOMP 8 /* secure computing */ |
2672 | +#define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */ |
2673 | +#define TIF_SPEC_FORCE_UPDATE 10 /* Force speculation MSR update in context switch */ |
2674 | #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ |
2675 | #define TIF_UPROBE 12 /* breakpointed or singlestepping */ |
2676 | #define TIF_NOTSC 16 /* TSC is not accessible in userland */ |
2677 | @@ -111,6 +113,8 @@ struct thread_info { |
2678 | #define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU) |
2679 | #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) |
2680 | #define _TIF_SECCOMP (1 << TIF_SECCOMP) |
2681 | +#define _TIF_SPEC_IB (1 << TIF_SPEC_IB) |
2682 | +#define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE) |
2683 | #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) |
2684 | #define _TIF_UPROBE (1 << TIF_UPROBE) |
2685 | #define _TIF_NOTSC (1 << TIF_NOTSC) |
2686 | @@ -140,8 +144,18 @@ struct thread_info { |
2687 | _TIF_NOHZ) |
2688 | |
2689 | /* flags to check in __switch_to() */ |
2690 | -#define _TIF_WORK_CTXSW \ |
2691 | - (_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP|_TIF_SSBD) |
2692 | +#define _TIF_WORK_CTXSW_BASE \ |
2693 | + (_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP| \ |
2694 | + _TIF_SSBD | _TIF_SPEC_FORCE_UPDATE) |
2695 | + |
2696 | +/* |
2697 | + * Avoid calls to __switch_to_xtra() on UP as STIBP is not evaluated. |
2698 | + */ |
2699 | +#ifdef CONFIG_SMP |
2700 | +# define _TIF_WORK_CTXSW (_TIF_WORK_CTXSW_BASE | _TIF_SPEC_IB) |
2701 | +#else |
2702 | +# define _TIF_WORK_CTXSW (_TIF_WORK_CTXSW_BASE) |
2703 | +#endif |
2704 | |
2705 | #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY) |
2706 | #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW) |
2707 | diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h |
2708 | index 686a58d793e5..f5ca15622dc9 100644 |
2709 | --- a/arch/x86/include/asm/tlbflush.h |
2710 | +++ b/arch/x86/include/asm/tlbflush.h |
2711 | @@ -68,8 +68,12 @@ static inline void invpcid_flush_all_nonglobals(void) |
2712 | struct tlb_state { |
2713 | struct mm_struct *active_mm; |
2714 | int state; |
2715 | - /* last user mm's ctx id */ |
2716 | - u64 last_ctx_id; |
2717 | + |
2718 | + /* Last user mm for optimizing IBPB */ |
2719 | + union { |
2720 | + struct mm_struct *last_user_mm; |
2721 | + unsigned long last_user_mm_ibpb; |
2722 | + }; |
2723 | |
2724 | /* |
2725 | * Access to this CR4 shadow and to H/W CR4 is protected by |
2726 | diff --git a/arch/x86/include/uapi/asm/Kbuild b/arch/x86/include/uapi/asm/Kbuild |
2727 | index 3dec769cadf7..1c532b3f18ea 100644 |
2728 | --- a/arch/x86/include/uapi/asm/Kbuild |
2729 | +++ b/arch/x86/include/uapi/asm/Kbuild |
2730 | @@ -27,7 +27,6 @@ header-y += ldt.h |
2731 | header-y += mce.h |
2732 | header-y += mman.h |
2733 | header-y += msgbuf.h |
2734 | -header-y += msr-index.h |
2735 | header-y += msr.h |
2736 | header-y += mtrr.h |
2737 | header-y += param.h |
2738 | diff --git a/arch/x86/include/uapi/asm/mce.h b/arch/x86/include/uapi/asm/mce.h |
2739 | index 69a6e07e3149..db7dae58745f 100644 |
2740 | --- a/arch/x86/include/uapi/asm/mce.h |
2741 | +++ b/arch/x86/include/uapi/asm/mce.h |
2742 | @@ -28,6 +28,8 @@ struct mce { |
2743 | __u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */ |
2744 | __u64 synd; /* MCA_SYND MSR: only valid on SMCA systems */ |
2745 | __u64 ipid; /* MCA_IPID MSR: only valid on SMCA systems */ |
2746 | + __u64 ppin; /* Protected Processor Inventory Number */ |
2747 | + __u32 microcode;/* Microcode revision */ |
2748 | }; |
2749 | |
2750 | #define MCE_GET_RECORD_LEN _IOR('M', 1, int) |
2751 | diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c |
2752 | index 6221166e3fca..16970c39baea 100644 |
2753 | --- a/arch/x86/kernel/cpu/bugs.c |
2754 | +++ b/arch/x86/kernel/cpu/bugs.c |
2755 | @@ -13,6 +13,7 @@ |
2756 | #include <linux/module.h> |
2757 | #include <linux/nospec.h> |
2758 | #include <linux/prctl.h> |
2759 | +#include <linux/sched/smt.h> |
2760 | |
2761 | #include <asm/spec-ctrl.h> |
2762 | #include <asm/cmdline.h> |
2763 | @@ -24,6 +25,7 @@ |
2764 | #include <asm/vmx.h> |
2765 | #include <asm/paravirt.h> |
2766 | #include <asm/alternative.h> |
2767 | +#include <asm/hypervisor.h> |
2768 | #include <asm/pgtable.h> |
2769 | #include <asm/cacheflush.h> |
2770 | #include <asm/intel-family.h> |
2771 | @@ -32,13 +34,12 @@ |
2772 | static void __init spectre_v2_select_mitigation(void); |
2773 | static void __init ssb_select_mitigation(void); |
2774 | static void __init l1tf_select_mitigation(void); |
2775 | +static void __init mds_select_mitigation(void); |
2776 | |
2777 | -/* |
2778 | - * Our boot-time value of the SPEC_CTRL MSR. We read it once so that any |
2779 | - * writes to SPEC_CTRL contain whatever reserved bits have been set. |
2780 | - */ |
2781 | -u64 __ro_after_init x86_spec_ctrl_base; |
2782 | +/* The base value of the SPEC_CTRL MSR that always has to be preserved. */ |
2783 | +u64 x86_spec_ctrl_base; |
2784 | EXPORT_SYMBOL_GPL(x86_spec_ctrl_base); |
2785 | +static DEFINE_MUTEX(spec_ctrl_mutex); |
2786 | |
2787 | /* |
2788 | * The vendor and possibly platform specific bits which can be modified in |
2789 | @@ -53,6 +54,20 @@ static u64 __ro_after_init x86_spec_ctrl_mask = SPEC_CTRL_IBRS; |
2790 | u64 __ro_after_init x86_amd_ls_cfg_base; |
2791 | u64 __ro_after_init x86_amd_ls_cfg_ssbd_mask; |
2792 | |
2793 | +/* Control conditional STIBP in switch_to() */ |
2794 | +DEFINE_STATIC_KEY_FALSE(switch_to_cond_stibp); |
2795 | +/* Control conditional IBPB in switch_mm() */ |
2796 | +DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb); |
2797 | +/* Control unconditional IBPB in switch_mm() */ |
2798 | +DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb); |
2799 | + |
2800 | +/* Control MDS CPU buffer clear before returning to user space */ |
2801 | +DEFINE_STATIC_KEY_FALSE(mds_user_clear); |
2802 | +EXPORT_SYMBOL_GPL(mds_user_clear); |
2803 | +/* Control MDS CPU buffer clear before idling (halt, mwait) */ |
2804 | +DEFINE_STATIC_KEY_FALSE(mds_idle_clear); |
2805 | +EXPORT_SYMBOL_GPL(mds_idle_clear); |
2806 | + |
2807 | void __init check_bugs(void) |
2808 | { |
2809 | identify_boot_cpu(); |
2810 | @@ -91,6 +106,10 @@ void __init check_bugs(void) |
2811 | |
2812 | l1tf_select_mitigation(); |
2813 | |
2814 | + mds_select_mitigation(); |
2815 | + |
2816 | + arch_smt_update(); |
2817 | + |
2818 | #ifdef CONFIG_X86_32 |
2819 | /* |
2820 | * Check whether we are able to run this kernel safely on SMP. |
2821 | @@ -123,31 +142,6 @@ void __init check_bugs(void) |
2822 | #endif |
2823 | } |
2824 | |
2825 | -/* The kernel command line selection */ |
2826 | -enum spectre_v2_mitigation_cmd { |
2827 | - SPECTRE_V2_CMD_NONE, |
2828 | - SPECTRE_V2_CMD_AUTO, |
2829 | - SPECTRE_V2_CMD_FORCE, |
2830 | - SPECTRE_V2_CMD_RETPOLINE, |
2831 | - SPECTRE_V2_CMD_RETPOLINE_GENERIC, |
2832 | - SPECTRE_V2_CMD_RETPOLINE_AMD, |
2833 | -}; |
2834 | - |
2835 | -static const char *spectre_v2_strings[] = { |
2836 | - [SPECTRE_V2_NONE] = "Vulnerable", |
2837 | - [SPECTRE_V2_RETPOLINE_MINIMAL] = "Vulnerable: Minimal generic ASM retpoline", |
2838 | - [SPECTRE_V2_RETPOLINE_MINIMAL_AMD] = "Vulnerable: Minimal AMD ASM retpoline", |
2839 | - [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline", |
2840 | - [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline", |
2841 | - [SPECTRE_V2_IBRS_ENHANCED] = "Mitigation: Enhanced IBRS", |
2842 | -}; |
2843 | - |
2844 | -#undef pr_fmt |
2845 | -#define pr_fmt(fmt) "Spectre V2 : " fmt |
2846 | - |
2847 | -static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = |
2848 | - SPECTRE_V2_NONE; |
2849 | - |
2850 | void |
2851 | x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest) |
2852 | { |
2853 | @@ -165,9 +159,14 @@ x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest) |
2854 | guestval |= guest_spec_ctrl & x86_spec_ctrl_mask; |
2855 | |
2856 | /* SSBD controlled in MSR_SPEC_CTRL */ |
2857 | - if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD)) |
2858 | + if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || |
2859 | + static_cpu_has(X86_FEATURE_AMD_SSBD)) |
2860 | hostval |= ssbd_tif_to_spec_ctrl(ti->flags); |
2861 | |
2862 | + /* Conditional STIBP enabled? */ |
2863 | + if (static_branch_unlikely(&switch_to_cond_stibp)) |
2864 | + hostval |= stibp_tif_to_spec_ctrl(ti->flags); |
2865 | + |
2866 | if (hostval != guestval) { |
2867 | msrval = setguest ? guestval : hostval; |
2868 | wrmsrl(MSR_IA32_SPEC_CTRL, msrval); |
2869 | @@ -201,7 +200,7 @@ x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest) |
2870 | tif = setguest ? ssbd_spec_ctrl_to_tif(guestval) : |
2871 | ssbd_spec_ctrl_to_tif(hostval); |
2872 | |
2873 | - speculative_store_bypass_update(tif); |
2874 | + speculation_ctrl_update(tif); |
2875 | } |
2876 | } |
2877 | EXPORT_SYMBOL_GPL(x86_virt_spec_ctrl); |
2878 | @@ -216,6 +215,70 @@ static void x86_amd_ssb_disable(void) |
2879 | wrmsrl(MSR_AMD64_LS_CFG, msrval); |
2880 | } |
2881 | |
2882 | +#undef pr_fmt |
2883 | +#define pr_fmt(fmt) "MDS: " fmt |
2884 | + |
2885 | +/* Default mitigation for MDS-affected CPUs */ |
2886 | +static enum mds_mitigations mds_mitigation __ro_after_init = MDS_MITIGATION_FULL; |
2887 | +static bool mds_nosmt __ro_after_init = false; |
2888 | + |
2889 | +static const char * const mds_strings[] = { |
2890 | + [MDS_MITIGATION_OFF] = "Vulnerable", |
2891 | + [MDS_MITIGATION_FULL] = "Mitigation: Clear CPU buffers", |
2892 | + [MDS_MITIGATION_VMWERV] = "Vulnerable: Clear CPU buffers attempted, no microcode", |
2893 | +}; |
2894 | + |
2895 | +static void __init mds_select_mitigation(void) |
2896 | +{ |
2897 | + if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) { |
2898 | + mds_mitigation = MDS_MITIGATION_OFF; |
2899 | + return; |
2900 | + } |
2901 | + |
2902 | + if (mds_mitigation == MDS_MITIGATION_FULL) { |
2903 | + if (!boot_cpu_has(X86_FEATURE_MD_CLEAR)) |
2904 | + mds_mitigation = MDS_MITIGATION_VMWERV; |
2905 | + |
2906 | + static_branch_enable(&mds_user_clear); |
2907 | + |
2908 | + if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) && |
2909 | + (mds_nosmt || cpu_mitigations_auto_nosmt())) |
2910 | + cpu_smt_disable(false); |
2911 | + } |
2912 | + |
2913 | + pr_info("%s\n", mds_strings[mds_mitigation]); |
2914 | +} |
2915 | + |
2916 | +static int __init mds_cmdline(char *str) |
2917 | +{ |
2918 | + if (!boot_cpu_has_bug(X86_BUG_MDS)) |
2919 | + return 0; |
2920 | + |
2921 | + if (!str) |
2922 | + return -EINVAL; |
2923 | + |
2924 | + if (!strcmp(str, "off")) |
2925 | + mds_mitigation = MDS_MITIGATION_OFF; |
2926 | + else if (!strcmp(str, "full")) |
2927 | + mds_mitigation = MDS_MITIGATION_FULL; |
2928 | + else if (!strcmp(str, "full,nosmt")) { |
2929 | + mds_mitigation = MDS_MITIGATION_FULL; |
2930 | + mds_nosmt = true; |
2931 | + } |
2932 | + |
2933 | + return 0; |
2934 | +} |
2935 | +early_param("mds", mds_cmdline); |
2936 | + |
2937 | +#undef pr_fmt |
2938 | +#define pr_fmt(fmt) "Spectre V2 : " fmt |
2939 | + |
2940 | +static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = |
2941 | + SPECTRE_V2_NONE; |
2942 | + |
2943 | +static enum spectre_v2_user_mitigation spectre_v2_user __ro_after_init = |
2944 | + SPECTRE_V2_USER_NONE; |
2945 | + |
2946 | #ifdef RETPOLINE |
2947 | static bool spectre_v2_bad_module; |
2948 | |
2949 | @@ -237,67 +300,225 @@ static inline const char *spectre_v2_module_string(void) |
2950 | static inline const char *spectre_v2_module_string(void) { return ""; } |
2951 | #endif |
2952 | |
2953 | -static void __init spec2_print_if_insecure(const char *reason) |
2954 | +static inline bool match_option(const char *arg, int arglen, const char *opt) |
2955 | { |
2956 | - if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) |
2957 | - pr_info("%s selected on command line.\n", reason); |
2958 | + int len = strlen(opt); |
2959 | + |
2960 | + return len == arglen && !strncmp(arg, opt, len); |
2961 | } |
2962 | |
2963 | -static void __init spec2_print_if_secure(const char *reason) |
2964 | +/* The kernel command line selection for spectre v2 */ |
2965 | +enum spectre_v2_mitigation_cmd { |
2966 | + SPECTRE_V2_CMD_NONE, |
2967 | + SPECTRE_V2_CMD_AUTO, |
2968 | + SPECTRE_V2_CMD_FORCE, |
2969 | + SPECTRE_V2_CMD_RETPOLINE, |
2970 | + SPECTRE_V2_CMD_RETPOLINE_GENERIC, |
2971 | + SPECTRE_V2_CMD_RETPOLINE_AMD, |
2972 | +}; |
2973 | + |
2974 | +enum spectre_v2_user_cmd { |
2975 | + SPECTRE_V2_USER_CMD_NONE, |
2976 | + SPECTRE_V2_USER_CMD_AUTO, |
2977 | + SPECTRE_V2_USER_CMD_FORCE, |
2978 | + SPECTRE_V2_USER_CMD_PRCTL, |
2979 | + SPECTRE_V2_USER_CMD_PRCTL_IBPB, |
2980 | + SPECTRE_V2_USER_CMD_SECCOMP, |
2981 | + SPECTRE_V2_USER_CMD_SECCOMP_IBPB, |
2982 | +}; |
2983 | + |
2984 | +static const char * const spectre_v2_user_strings[] = { |
2985 | + [SPECTRE_V2_USER_NONE] = "User space: Vulnerable", |
2986 | + [SPECTRE_V2_USER_STRICT] = "User space: Mitigation: STIBP protection", |
2987 | + [SPECTRE_V2_USER_PRCTL] = "User space: Mitigation: STIBP via prctl", |
2988 | + [SPECTRE_V2_USER_SECCOMP] = "User space: Mitigation: STIBP via seccomp and prctl", |
2989 | +}; |
2990 | + |
2991 | +static const struct { |
2992 | + const char *option; |
2993 | + enum spectre_v2_user_cmd cmd; |
2994 | + bool secure; |
2995 | +} v2_user_options[] __initconst = { |
2996 | + { "auto", SPECTRE_V2_USER_CMD_AUTO, false }, |
2997 | + { "off", SPECTRE_V2_USER_CMD_NONE, false }, |
2998 | + { "on", SPECTRE_V2_USER_CMD_FORCE, true }, |
2999 | + { "prctl", SPECTRE_V2_USER_CMD_PRCTL, false }, |
3000 | + { "prctl,ibpb", SPECTRE_V2_USER_CMD_PRCTL_IBPB, false }, |
3001 | + { "seccomp", SPECTRE_V2_USER_CMD_SECCOMP, false }, |
3002 | + { "seccomp,ibpb", SPECTRE_V2_USER_CMD_SECCOMP_IBPB, false }, |
3003 | +}; |
3004 | + |
3005 | +static void __init spec_v2_user_print_cond(const char *reason, bool secure) |
3006 | { |
3007 | - if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) |
3008 | - pr_info("%s selected on command line.\n", reason); |
3009 | + if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) != secure) |
3010 | + pr_info("spectre_v2_user=%s forced on command line.\n", reason); |
3011 | } |
3012 | |
3013 | -static inline bool retp_compiler(void) |
3014 | +static enum spectre_v2_user_cmd __init |
3015 | +spectre_v2_parse_user_cmdline(enum spectre_v2_mitigation_cmd v2_cmd) |
3016 | { |
3017 | - return __is_defined(RETPOLINE); |
3018 | + char arg[20]; |
3019 | + int ret, i; |
3020 | + |
3021 | + switch (v2_cmd) { |
3022 | + case SPECTRE_V2_CMD_NONE: |
3023 | + return SPECTRE_V2_USER_CMD_NONE; |
3024 | + case SPECTRE_V2_CMD_FORCE: |
3025 | + return SPECTRE_V2_USER_CMD_FORCE; |
3026 | + default: |
3027 | + break; |
3028 | + } |
3029 | + |
3030 | + ret = cmdline_find_option(boot_command_line, "spectre_v2_user", |
3031 | + arg, sizeof(arg)); |
3032 | + if (ret < 0) |
3033 | + return SPECTRE_V2_USER_CMD_AUTO; |
3034 | + |
3035 | + for (i = 0; i < ARRAY_SIZE(v2_user_options); i++) { |
3036 | + if (match_option(arg, ret, v2_user_options[i].option)) { |
3037 | + spec_v2_user_print_cond(v2_user_options[i].option, |
3038 | + v2_user_options[i].secure); |
3039 | + return v2_user_options[i].cmd; |
3040 | + } |
3041 | + } |
3042 | + |
3043 | + pr_err("Unknown user space protection option (%s). Switching to AUTO select\n", arg); |
3044 | + return SPECTRE_V2_USER_CMD_AUTO; |
3045 | } |
3046 | |
3047 | -static inline bool match_option(const char *arg, int arglen, const char *opt) |
3048 | +static void __init |
3049 | +spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) |
3050 | { |
3051 | - int len = strlen(opt); |
3052 | + enum spectre_v2_user_mitigation mode = SPECTRE_V2_USER_NONE; |
3053 | + bool smt_possible = IS_ENABLED(CONFIG_SMP); |
3054 | + enum spectre_v2_user_cmd cmd; |
3055 | |
3056 | - return len == arglen && !strncmp(arg, opt, len); |
3057 | + if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP)) |
3058 | + return; |
3059 | + |
3060 | + if (cpu_smt_control == CPU_SMT_FORCE_DISABLED || |
3061 | + cpu_smt_control == CPU_SMT_NOT_SUPPORTED) |
3062 | + smt_possible = false; |
3063 | + |
3064 | + cmd = spectre_v2_parse_user_cmdline(v2_cmd); |
3065 | + switch (cmd) { |
3066 | + case SPECTRE_V2_USER_CMD_NONE: |
3067 | + goto set_mode; |
3068 | + case SPECTRE_V2_USER_CMD_FORCE: |
3069 | + mode = SPECTRE_V2_USER_STRICT; |
3070 | + break; |
3071 | + case SPECTRE_V2_USER_CMD_PRCTL: |
3072 | + case SPECTRE_V2_USER_CMD_PRCTL_IBPB: |
3073 | + mode = SPECTRE_V2_USER_PRCTL; |
3074 | + break; |
3075 | + case SPECTRE_V2_USER_CMD_AUTO: |
3076 | + case SPECTRE_V2_USER_CMD_SECCOMP: |
3077 | + case SPECTRE_V2_USER_CMD_SECCOMP_IBPB: |
3078 | + if (IS_ENABLED(CONFIG_SECCOMP)) |
3079 | + mode = SPECTRE_V2_USER_SECCOMP; |
3080 | + else |
3081 | + mode = SPECTRE_V2_USER_PRCTL; |
3082 | + break; |
3083 | + } |
3084 | + |
3085 | + /* Initialize Indirect Branch Prediction Barrier */ |
3086 | + if (boot_cpu_has(X86_FEATURE_IBPB)) { |
3087 | + setup_force_cpu_cap(X86_FEATURE_USE_IBPB); |
3088 | + |
3089 | + switch (cmd) { |
3090 | + case SPECTRE_V2_USER_CMD_FORCE: |
3091 | + case SPECTRE_V2_USER_CMD_PRCTL_IBPB: |
3092 | + case SPECTRE_V2_USER_CMD_SECCOMP_IBPB: |
3093 | + static_branch_enable(&switch_mm_always_ibpb); |
3094 | + break; |
3095 | + case SPECTRE_V2_USER_CMD_PRCTL: |
3096 | + case SPECTRE_V2_USER_CMD_AUTO: |
3097 | + case SPECTRE_V2_USER_CMD_SECCOMP: |
3098 | + static_branch_enable(&switch_mm_cond_ibpb); |
3099 | + break; |
3100 | + default: |
3101 | + break; |
3102 | + } |
3103 | + |
3104 | + pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n", |
3105 | + static_key_enabled(&switch_mm_always_ibpb) ? |
3106 | + "always-on" : "conditional"); |
3107 | + } |
3108 | + |
3109 | +	/* If enhanced IBRS is enabled, no STIBP is required */ |
3110 | + if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) |
3111 | + return; |
3112 | + |
3113 | + /* |
3114 | +	 * If SMT is not possible or STIBP is not available, clear the STIBP |
3115 | +	 * mode. |
3116 | + */ |
3117 | + if (!smt_possible || !boot_cpu_has(X86_FEATURE_STIBP)) |
3118 | + mode = SPECTRE_V2_USER_NONE; |
3119 | +set_mode: |
3120 | + spectre_v2_user = mode; |
3121 | +	/* Only print the STIBP mode when SMT is possible */ |
3122 | + if (smt_possible) |
3123 | + pr_info("%s\n", spectre_v2_user_strings[mode]); |
3124 | } |
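For reference, the spectre_v2_user= values map as follows: off selects NONE, on forces STRICT (and always-on IBPB), prctl and seccomp select the corresponding opt-in modes, and the prctl,ibpb / seccomp,ibpb variants upgrade the barrier from conditional to always-on. auto picks seccomp when CONFIG_SECCOMP is enabled and prctl otherwise.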
3125 | |
3126 | +static const char * const spectre_v2_strings[] = { |
3127 | + [SPECTRE_V2_NONE] = "Vulnerable", |
3128 | + [SPECTRE_V2_RETPOLINE_MINIMAL] = "Vulnerable: Minimal generic ASM retpoline", |
3129 | + [SPECTRE_V2_RETPOLINE_MINIMAL_AMD] = "Vulnerable: Minimal AMD ASM retpoline", |
3130 | + [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline", |
3131 | + [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline", |
3132 | + [SPECTRE_V2_IBRS_ENHANCED] = "Mitigation: Enhanced IBRS", |
3133 | +}; |
3134 | + |
3135 | static const struct { |
3136 | const char *option; |
3137 | enum spectre_v2_mitigation_cmd cmd; |
3138 | bool secure; |
3139 | -} mitigation_options[] = { |
3140 | - { "off", SPECTRE_V2_CMD_NONE, false }, |
3141 | - { "on", SPECTRE_V2_CMD_FORCE, true }, |
3142 | - { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false }, |
3143 | - { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_AMD, false }, |
3144 | - { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false }, |
3145 | - { "auto", SPECTRE_V2_CMD_AUTO, false }, |
3146 | +} mitigation_options[] __initconst = { |
3147 | + { "off", SPECTRE_V2_CMD_NONE, false }, |
3148 | + { "on", SPECTRE_V2_CMD_FORCE, true }, |
3149 | + { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false }, |
3150 | + { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_AMD, false }, |
3151 | + { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false }, |
3152 | + { "auto", SPECTRE_V2_CMD_AUTO, false }, |
3153 | }; |
3154 | |
3155 | +static void __init spec_v2_print_cond(const char *reason, bool secure) |
3156 | +{ |
3157 | + if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) != secure) |
3158 | + pr_info("%s selected on command line.\n", reason); |
3159 | +} |
3160 | + |
3161 | +static inline bool retp_compiler(void) |
3162 | +{ |
3163 | + return __is_defined(RETPOLINE); |
3164 | +} |
3165 | + |
3166 | static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) |
3167 | { |
3168 | + enum spectre_v2_mitigation_cmd cmd = SPECTRE_V2_CMD_AUTO; |
3169 | char arg[20]; |
3170 | int ret, i; |
3171 | - enum spectre_v2_mitigation_cmd cmd = SPECTRE_V2_CMD_AUTO; |
3172 | |
3173 | - if (cmdline_find_option_bool(boot_command_line, "nospectre_v2")) |
3174 | + if (cmdline_find_option_bool(boot_command_line, "nospectre_v2") || |
3175 | + cpu_mitigations_off()) |
3176 | return SPECTRE_V2_CMD_NONE; |
3177 | - else { |
3178 | - ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg)); |
3179 | - if (ret < 0) |
3180 | - return SPECTRE_V2_CMD_AUTO; |
3181 | |
3182 | - for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) { |
3183 | - if (!match_option(arg, ret, mitigation_options[i].option)) |
3184 | - continue; |
3185 | - cmd = mitigation_options[i].cmd; |
3186 | - break; |
3187 | - } |
3188 | + ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg)); |
3189 | + if (ret < 0) |
3190 | + return SPECTRE_V2_CMD_AUTO; |
3191 | |
3192 | - if (i >= ARRAY_SIZE(mitigation_options)) { |
3193 | - pr_err("unknown option (%s). Switching to AUTO select\n", arg); |
3194 | - return SPECTRE_V2_CMD_AUTO; |
3195 | - } |
3196 | + for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) { |
3197 | + if (!match_option(arg, ret, mitigation_options[i].option)) |
3198 | + continue; |
3199 | + cmd = mitigation_options[i].cmd; |
3200 | + break; |
3201 | + } |
3202 | + |
3203 | + if (i >= ARRAY_SIZE(mitigation_options)) { |
3204 | + pr_err("unknown option (%s). Switching to AUTO select\n", arg); |
3205 | + return SPECTRE_V2_CMD_AUTO; |
3206 | } |
3207 | |
3208 | if ((cmd == SPECTRE_V2_CMD_RETPOLINE || |
3209 | @@ -314,11 +535,8 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) |
3210 | return SPECTRE_V2_CMD_AUTO; |
3211 | } |
3212 | |
3213 | - if (mitigation_options[i].secure) |
3214 | - spec2_print_if_secure(mitigation_options[i].option); |
3215 | - else |
3216 | - spec2_print_if_insecure(mitigation_options[i].option); |
3217 | - |
3218 | + spec_v2_print_cond(mitigation_options[i].option, |
3219 | + mitigation_options[i].secure); |
3220 | return cmd; |
3221 | } |
3222 | |
3223 | @@ -400,12 +618,6 @@ specv2_set_mode: |
3224 | setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); |
3225 | pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n"); |
3226 | |
3227 | - /* Initialize Indirect Branch Prediction Barrier if supported */ |
3228 | - if (boot_cpu_has(X86_FEATURE_IBPB)) { |
3229 | - setup_force_cpu_cap(X86_FEATURE_USE_IBPB); |
3230 | - pr_info("Spectre v2 mitigation: Enabling Indirect Branch Prediction Barrier\n"); |
3231 | - } |
3232 | - |
3233 | /* |
3234 | * Retpoline means the kernel is safe because it has no indirect |
3235 | * branches. Enhanced IBRS protects firmware too, so, enable restricted |
3236 | @@ -421,6 +633,99 @@ specv2_set_mode: |
3237 | setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW); |
3238 | pr_info("Enabling Restricted Speculation for firmware calls\n"); |
3239 | } |
3240 | + |
3241 | + /* Set up IBPB and STIBP depending on the general spectre V2 command */ |
3242 | + spectre_v2_user_select_mitigation(cmd); |
3243 | +} |
3244 | + |
3245 | +static void update_stibp_msr(void * __unused) |
3246 | +{ |
3247 | + wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); |
3248 | +} |
3249 | + |
3250 | +/* Update x86_spec_ctrl_base in case SMT state changed. */ |
3251 | +static void update_stibp_strict(void) |
3252 | +{ |
3253 | + u64 mask = x86_spec_ctrl_base & ~SPEC_CTRL_STIBP; |
3254 | + |
3255 | + if (sched_smt_active()) |
3256 | + mask |= SPEC_CTRL_STIBP; |
3257 | + |
3258 | + if (mask == x86_spec_ctrl_base) |
3259 | + return; |
3260 | + |
3261 | + pr_info("Update user space SMT mitigation: STIBP %s\n", |
3262 | + mask & SPEC_CTRL_STIBP ? "always-on" : "off"); |
3263 | + x86_spec_ctrl_base = mask; |
3264 | + on_each_cpu(update_stibp_msr, NULL, 1); |
3265 | +} |
3266 | + |
3267 | +/* Update the static key controlling the evaluation of TIF_SPEC_IB */ |
3268 | +static void update_indir_branch_cond(void) |
3269 | +{ |
3270 | + if (sched_smt_active()) |
3271 | + static_branch_enable(&switch_to_cond_stibp); |
3272 | + else |
3273 | + static_branch_disable(&switch_to_cond_stibp); |
3274 | +} |
3275 | + |
3276 | +#undef pr_fmt |
3277 | +#define pr_fmt(fmt) fmt |
3278 | + |
3279 | +/* Update the static key controlling the MDS CPU buffer clear in idle */ |
3280 | +static void update_mds_branch_idle(void) |
3281 | +{ |
3282 | + /* |
3283 | + * Enable the idle clearing if SMT is active on CPUs which are |
3284 | + * affected only by MSBDS and not any other MDS variant. |
3285 | + * |
3286 | + * The other variants cannot be mitigated when SMT is enabled, so |
3287 | + * clearing the buffers on idle just to prevent the Store Buffer |
3288 | + * repartitioning leak would be a window dressing exercise. |
3289 | + */ |
3290 | + if (!boot_cpu_has_bug(X86_BUG_MSBDS_ONLY)) |
3291 | + return; |
3292 | + |
3293 | + if (sched_smt_active()) |
3294 | + static_branch_enable(&mds_idle_clear); |
3295 | + else |
3296 | + static_branch_disable(&mds_idle_clear); |
3297 | +} |
3298 | + |
3299 | +#define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n" |
3300 | + |
3301 | +void arch_smt_update(void) |
3302 | +{ |
3303 | + /* Enhanced IBRS implies STIBP. No update required. */ |
3304 | + if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) |
3305 | + return; |
3306 | + |
3307 | + mutex_lock(&spec_ctrl_mutex); |
3308 | + |
3309 | + switch (spectre_v2_user) { |
3310 | + case SPECTRE_V2_USER_NONE: |
3311 | + break; |
3312 | + case SPECTRE_V2_USER_STRICT: |
3313 | + update_stibp_strict(); |
3314 | + break; |
3315 | + case SPECTRE_V2_USER_PRCTL: |
3316 | + case SPECTRE_V2_USER_SECCOMP: |
3317 | + update_indir_branch_cond(); |
3318 | + break; |
3319 | + } |
3320 | + |
3321 | + switch (mds_mitigation) { |
3322 | + case MDS_MITIGATION_FULL: |
3323 | + case MDS_MITIGATION_VMWERV: |
3324 | + if (sched_smt_active() && !boot_cpu_has(X86_BUG_MSBDS_ONLY)) |
3325 | + pr_warn_once(MDS_MSG_SMT); |
3326 | + update_mds_branch_idle(); |
3327 | + break; |
3328 | + case MDS_MITIGATION_OFF: |
3329 | + break; |
3330 | + } |
3331 | + |
3332 | + mutex_unlock(&spec_ctrl_mutex); |
3333 | } |
3334 | |
3335 | #undef pr_fmt |
3336 | @@ -437,7 +742,7 @@ enum ssb_mitigation_cmd { |
3337 | SPEC_STORE_BYPASS_CMD_SECCOMP, |
3338 | }; |
3339 | |
3340 | -static const char *ssb_strings[] = { |
3341 | +static const char * const ssb_strings[] = { |
3342 | [SPEC_STORE_BYPASS_NONE] = "Vulnerable", |
3343 | [SPEC_STORE_BYPASS_DISABLE] = "Mitigation: Speculative Store Bypass disabled", |
3344 | [SPEC_STORE_BYPASS_PRCTL] = "Mitigation: Speculative Store Bypass disabled via prctl", |
3345 | @@ -447,7 +752,7 @@ static const char *ssb_strings[] = { |
3346 | static const struct { |
3347 | const char *option; |
3348 | enum ssb_mitigation_cmd cmd; |
3349 | -} ssb_mitigation_options[] = { |
3350 | +} ssb_mitigation_options[] __initconst = { |
3351 | { "auto", SPEC_STORE_BYPASS_CMD_AUTO }, /* Platform decides */ |
3352 | { "on", SPEC_STORE_BYPASS_CMD_ON }, /* Disable Speculative Store Bypass */ |
3353 | { "off", SPEC_STORE_BYPASS_CMD_NONE }, /* Don't touch Speculative Store Bypass */ |
3354 | @@ -461,7 +766,8 @@ static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void) |
3355 | char arg[20]; |
3356 | int ret, i; |
3357 | |
3358 | - if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable")) { |
3359 | + if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable") || |
3360 | + cpu_mitigations_off()) { |
3361 | return SPEC_STORE_BYPASS_CMD_NONE; |
3362 | } else { |
3363 | ret = cmdline_find_option(boot_command_line, "spec_store_bypass_disable", |
3364 | @@ -531,18 +837,16 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) |
3365 | if (mode == SPEC_STORE_BYPASS_DISABLE) { |
3366 | setup_force_cpu_cap(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE); |
3367 | /* |
3368 | - * Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD uses |
3369 | - * a completely different MSR and bit dependent on family. |
3370 | + * Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD may |
3371 | + * use a completely different MSR and bit dependent on family. |
3372 | */ |
3373 | - switch (boot_cpu_data.x86_vendor) { |
3374 | - case X86_VENDOR_INTEL: |
3375 | + if (!static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) && |
3376 | + !static_cpu_has(X86_FEATURE_AMD_SSBD)) { |
3377 | + x86_amd_ssb_disable(); |
3378 | + } else { |
3379 | x86_spec_ctrl_base |= SPEC_CTRL_SSBD; |
3380 | x86_spec_ctrl_mask |= SPEC_CTRL_SSBD; |
3381 | wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); |
3382 | - break; |
3383 | - case X86_VENDOR_AMD: |
3384 | - x86_amd_ssb_disable(); |
3385 | - break; |
3386 | } |
3387 | } |
3388 | |
3389 | @@ -560,10 +864,25 @@ static void ssb_select_mitigation(void) |
3390 | #undef pr_fmt |
3391 | #define pr_fmt(fmt) "Speculation prctl: " fmt |
3392 | |
3393 | -static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl) |
3394 | +static void task_update_spec_tif(struct task_struct *tsk) |
3395 | { |
3396 | - bool update; |
3397 | + /* Force the update of the real TIF bits */ |
3398 | + set_tsk_thread_flag(tsk, TIF_SPEC_FORCE_UPDATE); |
3399 | |
3400 | + /* |
3401 | + * Immediately update the speculation control MSRs for the current |
3402 | + * task, but for a non-current task delay setting the CPU |
3403 | + * mitigation until it is scheduled next. |
3404 | + * |
3405 | + * This can only happen for SECCOMP mitigation. For PRCTL it's |
3406 | + * always the current task. |
3407 | + */ |
3408 | + if (tsk == current) |
3409 | + speculation_ctrl_update_current(); |
3410 | +} |
3411 | + |
3412 | +static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl) |
3413 | +{ |
3414 | if (ssb_mode != SPEC_STORE_BYPASS_PRCTL && |
3415 | ssb_mode != SPEC_STORE_BYPASS_SECCOMP) |
3416 | return -ENXIO; |
3417 | @@ -574,28 +893,56 @@ static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl) |
3418 | if (task_spec_ssb_force_disable(task)) |
3419 | return -EPERM; |
3420 | task_clear_spec_ssb_disable(task); |
3421 | - update = test_and_clear_tsk_thread_flag(task, TIF_SSBD); |
3422 | + task_update_spec_tif(task); |
3423 | break; |
3424 | case PR_SPEC_DISABLE: |
3425 | task_set_spec_ssb_disable(task); |
3426 | - update = !test_and_set_tsk_thread_flag(task, TIF_SSBD); |
3427 | + task_update_spec_tif(task); |
3428 | break; |
3429 | case PR_SPEC_FORCE_DISABLE: |
3430 | task_set_spec_ssb_disable(task); |
3431 | task_set_spec_ssb_force_disable(task); |
3432 | - update = !test_and_set_tsk_thread_flag(task, TIF_SSBD); |
3433 | + task_update_spec_tif(task); |
3434 | break; |
3435 | default: |
3436 | return -ERANGE; |
3437 | } |
3438 | + return 0; |
3439 | +} |
3440 | |
3441 | - /* |
3442 | - * If being set on non-current task, delay setting the CPU |
3443 | - * mitigation until it is next scheduled. |
3444 | - */ |
3445 | - if (task == current && update) |
3446 | - speculative_store_bypass_update_current(); |
3447 | - |
3448 | +static int ib_prctl_set(struct task_struct *task, unsigned long ctrl) |
3449 | +{ |
3450 | + switch (ctrl) { |
3451 | + case PR_SPEC_ENABLE: |
3452 | + if (spectre_v2_user == SPECTRE_V2_USER_NONE) |
3453 | + return 0; |
3454 | + /* |
3455 | + * Indirect branch speculation is always disabled in strict |
3456 | + * mode. |
3457 | + */ |
3458 | + if (spectre_v2_user == SPECTRE_V2_USER_STRICT) |
3459 | + return -EPERM; |
3460 | + task_clear_spec_ib_disable(task); |
3461 | + task_update_spec_tif(task); |
3462 | + break; |
3463 | + case PR_SPEC_DISABLE: |
3464 | + case PR_SPEC_FORCE_DISABLE: |
3465 | + /* |
3466 | + * Indirect branch speculation is always allowed when |
3467 | + * mitigation is force disabled. |
3468 | + */ |
3469 | + if (spectre_v2_user == SPECTRE_V2_USER_NONE) |
3470 | + return -EPERM; |
3471 | + if (spectre_v2_user == SPECTRE_V2_USER_STRICT) |
3472 | + return 0; |
3473 | + task_set_spec_ib_disable(task); |
3474 | + if (ctrl == PR_SPEC_FORCE_DISABLE) |
3475 | + task_set_spec_ib_force_disable(task); |
3476 | + task_update_spec_tif(task); |
3477 | + break; |
3478 | + default: |
3479 | + return -ERANGE; |
3480 | + } |
3481 | return 0; |
3482 | } |
3483 | |
3484 | @@ -605,6 +952,8 @@ int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which, |
3485 | switch (which) { |
3486 | case PR_SPEC_STORE_BYPASS: |
3487 | return ssb_prctl_set(task, ctrl); |
3488 | + case PR_SPEC_INDIRECT_BRANCH: |
3489 | + return ib_prctl_set(task, ctrl); |
3490 | default: |
3491 | return -ENODEV; |
3492 | } |
3493 | @@ -615,6 +964,8 @@ void arch_seccomp_spec_mitigate(struct task_struct *task) |
3494 | { |
3495 | if (ssb_mode == SPEC_STORE_BYPASS_SECCOMP) |
3496 | ssb_prctl_set(task, PR_SPEC_FORCE_DISABLE); |
3497 | + if (spectre_v2_user == SPECTRE_V2_USER_SECCOMP) |
3498 | + ib_prctl_set(task, PR_SPEC_FORCE_DISABLE); |
3499 | } |
3500 | #endif |
3501 | |
3502 | @@ -637,11 +988,35 @@ static int ssb_prctl_get(struct task_struct *task) |
3503 | } |
3504 | } |
3505 | |
3506 | +static int ib_prctl_get(struct task_struct *task) |
3507 | +{ |
3508 | + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) |
3509 | + return PR_SPEC_NOT_AFFECTED; |
3510 | + |
3511 | + switch (spectre_v2_user) { |
3512 | + case SPECTRE_V2_USER_NONE: |
3513 | + return PR_SPEC_ENABLE; |
3514 | + case SPECTRE_V2_USER_PRCTL: |
3515 | + case SPECTRE_V2_USER_SECCOMP: |
3516 | + if (task_spec_ib_force_disable(task)) |
3517 | + return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE; |
3518 | + if (task_spec_ib_disable(task)) |
3519 | + return PR_SPEC_PRCTL | PR_SPEC_DISABLE; |
3520 | + return PR_SPEC_PRCTL | PR_SPEC_ENABLE; |
3521 | + case SPECTRE_V2_USER_STRICT: |
3522 | + return PR_SPEC_DISABLE; |
3523 | + default: |
3524 | + return PR_SPEC_NOT_AFFECTED; |
3525 | + } |
3526 | +} |
3527 | + |
3528 | int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) |
3529 | { |
3530 | switch (which) { |
3531 | case PR_SPEC_STORE_BYPASS: |
3532 | return ssb_prctl_get(task); |
3533 | + case PR_SPEC_INDIRECT_BRANCH: |
3534 | + return ib_prctl_get(task); |
3535 | default: |
3536 | return -ENODEV; |
3537 | } |
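From user space the new control is reached through the existing speculation prctl() interface. A minimal sketch; the PR_SPEC_* constants come from the uapi <linux/prctl.h> extended by this series:

#include <stdio.h>
#include <sys/prctl.h>
#include <linux/prctl.h>

int main(void)
{
	/* Opt this task out of indirect branch speculation (STIBP/IBPB). */
	if (prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH,
		  PR_SPEC_DISABLE, 0, 0))
		perror("PR_SET_SPECULATION_CTRL");

	/* Query the resulting state. */
	printf("state: 0x%x\n",
	       (int)prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH,
			  0, 0, 0));
	return 0;
}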
3538 | @@ -713,6 +1088,11 @@ static void __init l1tf_select_mitigation(void) |
3539 | if (!boot_cpu_has_bug(X86_BUG_L1TF)) |
3540 | return; |
3541 | |
3542 | + if (cpu_mitigations_off()) |
3543 | + l1tf_mitigation = L1TF_MITIGATION_OFF; |
3544 | + else if (cpu_mitigations_auto_nosmt()) |
3545 | + l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT; |
3546 | + |
3547 | override_cache_bits(&boot_cpu_data); |
3548 | |
3549 | switch (l1tf_mitigation) { |
3550 | @@ -735,12 +1115,13 @@ static void __init l1tf_select_mitigation(void) |
3551 | #endif |
3552 | |
3553 | half_pa = (u64)l1tf_pfn_limit() << PAGE_SHIFT; |
3554 | - if (e820_any_mapped(half_pa, ULLONG_MAX - half_pa, E820_RAM)) { |
3555 | + if (l1tf_mitigation != L1TF_MITIGATION_OFF && |
3556 | + e820_any_mapped(half_pa, ULLONG_MAX - half_pa, E820_RAM)) { |
3557 | pr_warn("System has more than MAX_PA/2 memory. L1TF mitigation not effective.\n"); |
3558 | pr_info("You may make it effective by booting the kernel with mem=%llu parameter.\n", |
3559 | half_pa); |
3560 | pr_info("However, doing so will make a part of your RAM unusable.\n"); |
3561 | - pr_info("Reading https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html might help you decide.\n"); |
3562 | + pr_info("Reading https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html might help you decide.\n"); |
3563 | return; |
3564 | } |
3565 | |
3566 | @@ -773,13 +1154,14 @@ static int __init l1tf_cmdline(char *str) |
3567 | early_param("l1tf", l1tf_cmdline); |
3568 | |
3569 | #undef pr_fmt |
3570 | +#define pr_fmt(fmt) fmt |
3571 | |
3572 | #ifdef CONFIG_SYSFS |
3573 | |
3574 | #define L1TF_DEFAULT_MSG "Mitigation: PTE Inversion" |
3575 | |
3576 | #if IS_ENABLED(CONFIG_KVM_INTEL) |
3577 | -static const char *l1tf_vmx_states[] = { |
3578 | +static const char * const l1tf_vmx_states[] = { |
3579 | [VMENTER_L1D_FLUSH_AUTO] = "auto", |
3580 | [VMENTER_L1D_FLUSH_NEVER] = "vulnerable", |
3581 | [VMENTER_L1D_FLUSH_COND] = "conditional cache flushes", |
3582 | @@ -795,13 +1177,14 @@ static ssize_t l1tf_show_state(char *buf) |
3583 | |
3584 | if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_EPT_DISABLED || |
3585 | (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER && |
3586 | - cpu_smt_control == CPU_SMT_ENABLED)) |
3587 | + sched_smt_active())) { |
3588 | return sprintf(buf, "%s; VMX: %s\n", L1TF_DEFAULT_MSG, |
3589 | l1tf_vmx_states[l1tf_vmx_mitigation]); |
3590 | + } |
3591 | |
3592 | return sprintf(buf, "%s; VMX: %s, SMT %s\n", L1TF_DEFAULT_MSG, |
3593 | l1tf_vmx_states[l1tf_vmx_mitigation], |
3594 | - cpu_smt_control == CPU_SMT_ENABLED ? "vulnerable" : "disabled"); |
3595 | + sched_smt_active() ? "vulnerable" : "disabled"); |
3596 | } |
3597 | #else |
3598 | static ssize_t l1tf_show_state(char *buf) |
3599 | @@ -810,6 +1193,55 @@ static ssize_t l1tf_show_state(char *buf) |
3600 | } |
3601 | #endif |
3602 | |
3603 | +static ssize_t mds_show_state(char *buf) |
3604 | +{ |
3605 | +#ifdef CONFIG_HYPERVISOR_GUEST |
3606 | + if (x86_hyper) { |
3607 | + return sprintf(buf, "%s; SMT Host state unknown\n", |
3608 | + mds_strings[mds_mitigation]); |
3609 | + } |
3610 | +#endif |
3611 | + |
3612 | + if (boot_cpu_has(X86_BUG_MSBDS_ONLY)) { |
3613 | + return sprintf(buf, "%s; SMT %s\n", mds_strings[mds_mitigation], |
3614 | + (mds_mitigation == MDS_MITIGATION_OFF ? "vulnerable" : |
3615 | + sched_smt_active() ? "mitigated" : "disabled")); |
3616 | + } |
3617 | + |
3618 | + return sprintf(buf, "%s; SMT %s\n", mds_strings[mds_mitigation], |
3619 | + sched_smt_active() ? "vulnerable" : "disabled"); |
3620 | +} |
3621 | + |
3622 | +static char *stibp_state(void) |
3623 | +{ |
3624 | + if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) |
3625 | + return ""; |
3626 | + |
3627 | + switch (spectre_v2_user) { |
3628 | + case SPECTRE_V2_USER_NONE: |
3629 | + return ", STIBP: disabled"; |
3630 | + case SPECTRE_V2_USER_STRICT: |
3631 | + return ", STIBP: forced"; |
3632 | + case SPECTRE_V2_USER_PRCTL: |
3633 | + case SPECTRE_V2_USER_SECCOMP: |
3634 | + if (static_key_enabled(&switch_to_cond_stibp)) |
3635 | + return ", STIBP: conditional"; |
3636 | + } |
3637 | + return ""; |
3638 | +} |
3639 | + |
3640 | +static char *ibpb_state(void) |
3641 | +{ |
3642 | + if (boot_cpu_has(X86_FEATURE_IBPB)) { |
3643 | + if (static_key_enabled(&switch_mm_always_ibpb)) |
3644 | + return ", IBPB: always-on"; |
3645 | + if (static_key_enabled(&switch_mm_cond_ibpb)) |
3646 | + return ", IBPB: conditional"; |
3647 | + return ", IBPB: disabled"; |
3648 | + } |
3649 | + return ""; |
3650 | +} |
3651 | + |
3652 | static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, |
3653 | char *buf, unsigned int bug) |
3654 | { |
3655 | @@ -827,9 +1259,11 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr |
3656 | return sprintf(buf, "Mitigation: __user pointer sanitization\n"); |
3657 | |
3658 | case X86_BUG_SPECTRE_V2: |
3659 | - return sprintf(buf, "%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], |
3660 | - boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "", |
3661 | + return sprintf(buf, "%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], |
3662 | + ibpb_state(), |
3663 | boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", |
3664 | + stibp_state(), |
3665 | + boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "", |
3666 | spectre_v2_module_string()); |
3667 | |
3668 | case X86_BUG_SPEC_STORE_BYPASS: |
3669 | @@ -839,6 +1273,10 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr |
3670 | if (boot_cpu_has(X86_FEATURE_L1TF_PTEINV)) |
3671 | return l1tf_show_state(buf); |
3672 | break; |
3673 | + |
3674 | + case X86_BUG_MDS: |
3675 | + return mds_show_state(buf); |
3676 | + |
3677 | default: |
3678 | break; |
3679 | } |
3680 | @@ -870,4 +1308,9 @@ ssize_t cpu_show_l1tf(struct device *dev, struct device_attribute *attr, char *b |
3681 | { |
3682 | return cpu_show_common(dev, attr, buf, X86_BUG_L1TF); |
3683 | } |
3684 | + |
3685 | +ssize_t cpu_show_mds(struct device *dev, struct device_attribute *attr, char *buf) |
3686 | +{ |
3687 | + return cpu_show_common(dev, attr, buf, X86_BUG_MDS); |
3688 | +} |
3689 | #endif |
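The new attribute shows up next to the existing entries under /sys/devices/system/cpu/vulnerabilities/, and the reported string is mds_strings[] plus the SMT suffix computed above. A small sketch for reading it from user space:

#include <stdio.h>

int main(void)
{
	char line[128];
	FILE *f = fopen("/sys/devices/system/cpu/vulnerabilities/mds", "r");

	if (f && fgets(line, sizeof(line), f))
		fputs(line, stdout);  /* e.g. "Mitigation: Clear CPU buffers; SMT vulnerable" */
	if (f)
		fclose(f);
	return 0;
}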
3690 | diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c |
3691 | index 3c01610c5ba9..cda130dc56b9 100644 |
3692 | --- a/arch/x86/kernel/cpu/common.c |
3693 | +++ b/arch/x86/kernel/cpu/common.c |
3694 | @@ -752,6 +752,12 @@ static void init_speculation_control(struct cpuinfo_x86 *c) |
3695 | set_cpu_cap(c, X86_FEATURE_STIBP); |
3696 | set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL); |
3697 | } |
3698 | + |
3699 | + if (cpu_has(c, X86_FEATURE_AMD_SSBD)) { |
3700 | + set_cpu_cap(c, X86_FEATURE_SSBD); |
3701 | + set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL); |
3702 | + clear_cpu_cap(c, X86_FEATURE_VIRT_SSBD); |
3703 | + } |
3704 | } |
3705 | |
3706 | void get_cpu_cap(struct cpuinfo_x86 *c) |
3707 | @@ -885,84 +891,95 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) |
3708 | c->x86_cache_bits = c->x86_phys_bits; |
3709 | } |
3710 | |
3711 | -static const __initconst struct x86_cpu_id cpu_no_speculation[] = { |
3712 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CEDARVIEW, X86_FEATURE_ANY }, |
3713 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CLOVERVIEW, X86_FEATURE_ANY }, |
3714 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_LINCROFT, X86_FEATURE_ANY }, |
3715 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PENWELL, X86_FEATURE_ANY }, |
3716 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PINEVIEW, X86_FEATURE_ANY }, |
3717 | - { X86_VENDOR_CENTAUR, 5 }, |
3718 | - { X86_VENDOR_INTEL, 5 }, |
3719 | - { X86_VENDOR_NSC, 5 }, |
3720 | - { X86_VENDOR_ANY, 4 }, |
3721 | - {} |
3722 | -}; |
3723 | +#define NO_SPECULATION BIT(0) |
3724 | +#define NO_MELTDOWN BIT(1) |
3725 | +#define NO_SSB BIT(2) |
3726 | +#define NO_L1TF BIT(3) |
3727 | +#define NO_MDS BIT(4) |
3728 | +#define MSBDS_ONLY BIT(5) |
3729 | |
3730 | -static const __initconst struct x86_cpu_id cpu_no_meltdown[] = { |
3731 | - { X86_VENDOR_AMD }, |
3732 | - {} |
3733 | -}; |
3734 | +#define VULNWL(_vendor, _family, _model, _whitelist) \ |
3735 | + { X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist } |
3736 | |
3737 | -static const __initconst struct x86_cpu_id cpu_no_spec_store_bypass[] = { |
3738 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PINEVIEW }, |
3739 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_LINCROFT }, |
3740 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PENWELL }, |
3741 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CLOVERVIEW }, |
3742 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CEDARVIEW }, |
3743 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 }, |
3744 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT }, |
3745 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT2 }, |
3746 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MERRIFIELD }, |
3747 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_CORE_YONAH }, |
3748 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL }, |
3749 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM }, |
3750 | - { X86_VENDOR_CENTAUR, 5, }, |
3751 | - { X86_VENDOR_INTEL, 5, }, |
3752 | - { X86_VENDOR_NSC, 5, }, |
3753 | - { X86_VENDOR_AMD, 0x12, }, |
3754 | - { X86_VENDOR_AMD, 0x11, }, |
3755 | - { X86_VENDOR_AMD, 0x10, }, |
3756 | - { X86_VENDOR_AMD, 0xf, }, |
3757 | - { X86_VENDOR_ANY, 4, }, |
3758 | - {} |
3759 | -}; |
3760 | +#define VULNWL_INTEL(model, whitelist) \ |
3761 | + VULNWL(INTEL, 6, INTEL_FAM6_##model, whitelist) |
3762 | + |
3763 | +#define VULNWL_AMD(family, whitelist) \ |
3764 | + VULNWL(AMD, family, X86_MODEL_ANY, whitelist) |
3765 | + |
3766 | +static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { |
3767 | + VULNWL(ANY, 4, X86_MODEL_ANY, NO_SPECULATION), |
3768 | + VULNWL(CENTAUR, 5, X86_MODEL_ANY, NO_SPECULATION), |
3769 | + VULNWL(INTEL, 5, X86_MODEL_ANY, NO_SPECULATION), |
3770 | + VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION), |
3771 | + |
3772 | + /* Intel Family 6 */ |
3773 | + VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION), |
3774 | + VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION), |
3775 | + VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION), |
3776 | + VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION), |
3777 | + VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION), |
3778 | + |
3779 | + VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY), |
3780 | + VULNWL_INTEL(ATOM_SILVERMONT_X, NO_SSB | NO_L1TF | MSBDS_ONLY), |
3781 | + VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY), |
3782 | + VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY), |
3783 | + VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY), |
3784 | + VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY), |
3785 | + |
3786 | + VULNWL_INTEL(CORE_YONAH, NO_SSB), |
3787 | + |
3788 | + VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY), |
3789 | + |
3790 | + VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF), |
3791 | + VULNWL_INTEL(ATOM_GOLDMONT_X, NO_MDS | NO_L1TF), |
3792 | + VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF), |
3793 | |
3794 | -static const __initconst struct x86_cpu_id cpu_no_l1tf[] = { |
3795 | - /* in addition to cpu_no_speculation */ |
3796 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 }, |
3797 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT2 }, |
3798 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT }, |
3799 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MERRIFIELD }, |
3800 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MOOREFIELD }, |
3801 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT }, |
3802 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_DENVERTON }, |
3803 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GEMINI_LAKE }, |
3804 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL }, |
3805 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM }, |
3806 | + /* AMD Family 0xf - 0x12 */ |
3807 | + VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS), |
3808 | + VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS), |
3809 | + VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS), |
3810 | + VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS), |
3811 | + |
3812 | + /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */ |
3813 | + VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS), |
3814 | {} |
3815 | }; |
3816 | |
3817 | -static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) |
3818 | +static bool __init cpu_matches(unsigned long which) |
3819 | { |
3820 | - u64 ia32_cap = 0; |
3821 | + const struct x86_cpu_id *m = x86_match_cpu(cpu_vuln_whitelist); |
3822 | |
3823 | - if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES)) |
3824 | - rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); |
3825 | + return m && !!(m->driver_data & which); |
3826 | +} |
3827 | |
3828 | - if (!x86_match_cpu(cpu_no_spec_store_bypass) && |
3829 | - !(ia32_cap & ARCH_CAP_SSB_NO)) |
3830 | - setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); |
3831 | +static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) |
3832 | +{ |
3833 | + u64 ia32_cap = 0; |
3834 | |
3835 | - if (x86_match_cpu(cpu_no_speculation)) |
3836 | + if (cpu_matches(NO_SPECULATION)) |
3837 | return; |
3838 | |
3839 | setup_force_cpu_bug(X86_BUG_SPECTRE_V1); |
3840 | setup_force_cpu_bug(X86_BUG_SPECTRE_V2); |
3841 | |
3842 | + if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES)) |
3843 | + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); |
3844 | + |
3845 | + if (!cpu_matches(NO_SSB) && !(ia32_cap & ARCH_CAP_SSB_NO) && |
3846 | + !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) |
3847 | + setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); |
3848 | + |
3849 | if (ia32_cap & ARCH_CAP_IBRS_ALL) |
3850 | setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED); |
3851 | |
3852 | - if (x86_match_cpu(cpu_no_meltdown)) |
3853 | + if (!cpu_matches(NO_MDS) && !(ia32_cap & ARCH_CAP_MDS_NO)) { |
3854 | + setup_force_cpu_bug(X86_BUG_MDS); |
3855 | + if (cpu_matches(MSBDS_ONLY)) |
3856 | + setup_force_cpu_bug(X86_BUG_MSBDS_ONLY); |
3857 | + } |
3858 | + |
3859 | + if (cpu_matches(NO_MELTDOWN)) |
3860 | return; |
3861 | |
3862 | /* Rogue Data Cache Load? No! */ |
3863 | @@ -971,7 +988,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) |
3864 | |
3865 | setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); |
3866 | |
3867 | - if (x86_match_cpu(cpu_no_l1tf)) |
3868 | + if (cpu_matches(NO_L1TF)) |
3869 | return; |
3870 | |
3871 | setup_force_cpu_bug(X86_BUG_L1TF); |
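The rewrite above folds the old per-vulnerability ID tables into one whitelist whose driver_data field carries the NO_*/MSBDS_ONLY flag bits, so a CPU is matched once and then queried per bug through cpu_matches(). A minimal userspace sketch of the same table-plus-flags pattern (the entries and helper here are illustrative, not the kernel's actual tables):

    #include <stdbool.h>
    #include <stdio.h>

    #define BIT(n)         (1UL << (n))
    #define NO_SPECULATION BIT(0)
    #define NO_MDS         BIT(4)

    struct cpu_entry {
            int family;
            int model;                    /* -1 matches any model */
            unsigned long driver_data;    /* whitelist flag bits */
    };

    static const struct cpu_entry whitelist[] = {
            { 6, 0x5c, NO_MDS },          /* hypothetical family-6 entry */
            { 5, -1,   NO_SPECULATION },  /* whole-family entry */
            { 0, 0, 0 }                   /* terminator */
    };

    static bool cpu_matches(int family, int model, unsigned long which)
    {
            const struct cpu_entry *e;

            for (e = whitelist; e->family; e++)
                    if (e->family == family && (e->model < 0 || e->model == model))
                            return e->driver_data & which;
            return false;
    }

    int main(void)
    {
            printf("NO_MDS for 6/0x5c:         %d\n", cpu_matches(6, 0x5c, NO_MDS));
            printf("NO_SPECULATION for 5/any:  %d\n", cpu_matches(5, 2, NO_SPECULATION));
            return 0;
    }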
3872 | diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c |
3873 | index cee0fec0d232..860f2fd9f540 100644 |
3874 | --- a/arch/x86/kernel/cpu/intel.c |
3875 | +++ b/arch/x86/kernel/cpu/intel.c |
3876 | @@ -14,6 +14,7 @@ |
3877 | #include <asm/bugs.h> |
3878 | #include <asm/cpu.h> |
3879 | #include <asm/intel-family.h> |
3880 | +#include <asm/microcode_intel.h> |
3881 | |
3882 | #ifdef CONFIG_X86_64 |
3883 | #include <linux/topology.h> |
3884 | @@ -137,14 +138,8 @@ static void early_init_intel(struct cpuinfo_x86 *c) |
3885 | (c->x86 == 0x6 && c->x86_model >= 0x0e)) |
3886 | set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); |
3887 | |
3888 | - if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64)) { |
3889 | - unsigned lower_word; |
3890 | - |
3891 | - wrmsr(MSR_IA32_UCODE_REV, 0, 0); |
3892 | - /* Required by the SDM */ |
3893 | - sync_core(); |
3894 | - rdmsr(MSR_IA32_UCODE_REV, lower_word, c->microcode); |
3895 | - } |
3896 | + if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64)) |
3897 | + c->microcode = intel_get_microcode_revision(); |
3898 | |
3899 | /* Now if any of them are set, check the blacklist and clear the lot */ |
3900 | if ((cpu_has(c, X86_FEATURE_SPEC_CTRL) || |
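early_init_intel() now obtains the revision through intel_get_microcode_revision() instead of open-coding the MSR sequence. The helper centralizes exactly the steps removed above; sketched in kernel style (it assumes the MSR accessors and sync_core() from the usual asm headers, so this is not a standalone program):

    #include <asm/msr.h>
    #include <asm/processor.h>

    static inline u32 intel_get_microcode_revision(void)
    {
            u32 rev, dummy;

            native_wrmsrl(MSR_IA32_UCODE_REV, 0);
            /* As documented in the SDM: Do a CPUID 1 here */
            sync_core();
            /* get the current revision from MSR 0x8B */
            native_rdmsr(MSR_IA32_UCODE_REV, dummy, rev);
            return rev;
    }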
3901 | diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c |
3902 | index 25310d2b8609..d9ad49ca3cbe 100644 |
3903 | --- a/arch/x86/kernel/cpu/mcheck/mce.c |
3904 | +++ b/arch/x86/kernel/cpu/mcheck/mce.c |
3905 | @@ -139,6 +139,8 @@ void mce_setup(struct mce *m) |
3906 | m->socketid = cpu_data(m->extcpu).phys_proc_id; |
3907 | m->apicid = cpu_data(m->extcpu).initial_apicid; |
3908 | rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); |
3909 | + |
3910 | + m->microcode = boot_cpu_data.microcode; |
3911 | } |
3912 | |
3913 | DEFINE_PER_CPU(struct mce, injectm); |
3914 | @@ -309,7 +311,7 @@ static void print_mce(struct mce *m) |
3915 | */ |
3916 | pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n", |
3917 | m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid, |
3918 | - cpu_data(m->extcpu).microcode); |
3919 | + m->microcode); |
3920 | |
3921 | pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n"); |
3922 | } |
3923 | diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c |
3924 | index 732bb03fcf91..a19fddfb6bf8 100644 |
3925 | --- a/arch/x86/kernel/cpu/microcode/amd.c |
3926 | +++ b/arch/x86/kernel/cpu/microcode/amd.c |
3927 | @@ -707,22 +707,26 @@ int apply_microcode_amd(int cpu) |
3928 | return -1; |
3929 | |
3930 | /* need to apply patch? */ |
3931 | - if (rev >= mc_amd->hdr.patch_id) { |
3932 | - c->microcode = rev; |
3933 | - uci->cpu_sig.rev = rev; |
3934 | - return 0; |
3935 | - } |
3936 | + if (rev >= mc_amd->hdr.patch_id) |
3937 | + goto out; |
3938 | |
3939 | if (__apply_microcode_amd(mc_amd)) { |
3940 | pr_err("CPU%d: update failed for patch_level=0x%08x\n", |
3941 | cpu, mc_amd->hdr.patch_id); |
3942 | return -1; |
3943 | } |
3944 | - pr_info("CPU%d: new patch_level=0x%08x\n", cpu, |
3945 | - mc_amd->hdr.patch_id); |
3946 | |
3947 | - uci->cpu_sig.rev = mc_amd->hdr.patch_id; |
3948 | - c->microcode = mc_amd->hdr.patch_id; |
3949 | + rev = mc_amd->hdr.patch_id; |
3950 | + |
3951 | + pr_info("CPU%d: new patch_level=0x%08x\n", cpu, rev); |
3952 | + |
3953 | +out: |
3954 | + uci->cpu_sig.rev = rev; |
3955 | + c->microcode = rev; |
3956 | + |
3957 | + /* Update boot_cpu_data's revision too, if we're on the BSP: */ |
3958 | + if (c->cpu_index == boot_cpu_data.cpu_index) |
3959 | + boot_cpu_data.microcode = rev; |
3960 | |
3961 | return 0; |
3962 | } |
3963 | diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c |
3964 | index 79291d6fb301..1308abfc4758 100644 |
3965 | --- a/arch/x86/kernel/cpu/microcode/intel.c |
3966 | +++ b/arch/x86/kernel/cpu/microcode/intel.c |
3967 | @@ -386,15 +386,8 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci) |
3968 | native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); |
3969 | csig.pf = 1 << ((val[1] >> 18) & 7); |
3970 | } |
3971 | - native_wrmsrl(MSR_IA32_UCODE_REV, 0); |
3972 | |
3973 | - /* As documented in the SDM: Do a CPUID 1 here */ |
3974 | - sync_core(); |
3975 | - |
3976 | - /* get the current revision from MSR 0x8B */ |
3977 | - native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); |
3978 | - |
3979 | - csig.rev = val[1]; |
3980 | + csig.rev = intel_get_microcode_revision(); |
3981 | |
3982 | uci->cpu_sig = csig; |
3983 | uci->valid = 1; |
3984 | @@ -618,29 +611,35 @@ static inline void print_ucode(struct ucode_cpu_info *uci) |
3985 | static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) |
3986 | { |
3987 | struct microcode_intel *mc; |
3988 | - unsigned int val[2]; |
3989 | + u32 rev; |
3990 | |
3991 | mc = uci->mc; |
3992 | if (!mc) |
3993 | return 0; |
3994 | |
3995 | + /* |
3996 | + * Save us the MSR write below - which is a particularly expensive |
3997 | + * operation - when the other hyperthread has updated the microcode |
3998 | + * already. |
3999 | + */ |
4000 | + rev = intel_get_microcode_revision(); |
4001 | + if (rev >= mc->hdr.rev) { |
4002 | + uci->cpu_sig.rev = rev; |
4003 | + return 0; |
4004 | + } |
4005 | + |
4006 | /* write microcode via MSR 0x79 */ |
4007 | native_wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); |
4008 | - native_wrmsrl(MSR_IA32_UCODE_REV, 0); |
4009 | - |
4010 | - /* As documented in the SDM: Do a CPUID 1 here */ |
4011 | - sync_core(); |
4012 | |
4013 | - /* get the current revision from MSR 0x8B */ |
4014 | - native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); |
4015 | - if (val[1] != mc->hdr.rev) |
4016 | + rev = intel_get_microcode_revision(); |
4017 | + if (rev != mc->hdr.rev) |
4018 | return -1; |
4019 | |
4020 | #ifdef CONFIG_X86_64 |
4021 | /* Flush global TLB. This is a precaution. */ |
4022 | flush_tlb_early(); |
4023 | #endif |
4024 | - uci->cpu_sig.rev = val[1]; |
4025 | + uci->cpu_sig.rev = rev; |
4026 | |
4027 | if (early) |
4028 | print_ucode(uci); |
4029 | @@ -903,9 +902,9 @@ static int apply_microcode_intel(int cpu) |
4030 | { |
4031 | struct microcode_intel *mc; |
4032 | struct ucode_cpu_info *uci; |
4033 | - struct cpuinfo_x86 *c; |
4034 | - unsigned int val[2]; |
4035 | + struct cpuinfo_x86 *c = &cpu_data(cpu); |
4036 | static int prev_rev; |
4037 | + u32 rev; |
4038 | |
4039 | /* We should bind the task to the CPU */ |
4040 | if (WARN_ON(raw_smp_processor_id() != cpu)) |
4041 | @@ -924,35 +923,42 @@ static int apply_microcode_intel(int cpu) |
4042 | if (!get_matching_mc(mc, cpu)) |
4043 | return 0; |
4044 | |
4045 | + /* |
4046 | + * Save us the MSR write below - which is a particularly expensive |
4047 | + * operation - when the other hyperthread has updated the microcode |
4048 | + * already. |
4049 | + */ |
4050 | + rev = intel_get_microcode_revision(); |
4051 | + if (rev >= mc->hdr.rev) |
4052 | + goto out; |
4053 | + |
4054 | /* write microcode via MSR 0x79 */ |
4055 | wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); |
4056 | - wrmsrl(MSR_IA32_UCODE_REV, 0); |
4057 | |
4058 | - /* As documented in the SDM: Do a CPUID 1 here */ |
4059 | - sync_core(); |
4060 | + rev = intel_get_microcode_revision(); |
4061 | |
4062 | - /* get the current revision from MSR 0x8B */ |
4063 | - rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); |
4064 | - |
4065 | - if (val[1] != mc->hdr.rev) { |
4066 | + if (rev != mc->hdr.rev) { |
4067 | pr_err("CPU%d update to revision 0x%x failed\n", |
4068 | cpu, mc->hdr.rev); |
4069 | return -1; |
4070 | } |
4071 | |
4072 | - if (val[1] != prev_rev) { |
4073 | + if (rev != prev_rev) { |
4074 | pr_info("updated to revision 0x%x, date = %04x-%02x-%02x\n", |
4075 | - val[1], |
4076 | + rev, |
4077 | mc->hdr.date & 0xffff, |
4078 | mc->hdr.date >> 24, |
4079 | (mc->hdr.date >> 16) & 0xff); |
4080 | - prev_rev = val[1]; |
4081 | + prev_rev = rev; |
4082 | } |
4083 | |
4084 | - c = &cpu_data(cpu); |
4085 | +out: |
4086 | + uci->cpu_sig.rev = rev; |
4087 | + c->microcode = rev; |
4088 | |
4089 | - uci->cpu_sig.rev = val[1]; |
4090 | - c->microcode = val[1]; |
4091 | + /* Update boot_cpu_data's revision too, if we're on the BSP: */ |
4092 | + if (c->cpu_index == boot_cpu_data.cpu_index) |
4093 | + boot_cpu_data.microcode = rev; |
4094 | |
4095 | return 0; |
4096 | } |
4097 | diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c |
4098 | index bfe4d6c96fbd..6b7b35d80264 100644 |
4099 | --- a/arch/x86/kernel/nmi.c |
4100 | +++ b/arch/x86/kernel/nmi.c |
4101 | @@ -32,6 +32,7 @@ |
4102 | #include <asm/x86_init.h> |
4103 | #include <asm/reboot.h> |
4104 | #include <asm/cache.h> |
4105 | +#include <asm/nospec-branch.h> |
4106 | |
4107 | #define CREATE_TRACE_POINTS |
4108 | #include <trace/events/nmi.h> |
4109 | @@ -544,6 +545,9 @@ nmi_restart: |
4110 | write_cr2(this_cpu_read(nmi_cr2)); |
4111 | if (this_cpu_dec_return(nmi_state)) |
4112 | goto nmi_restart; |
4113 | + |
4114 | + if (user_mode(regs)) |
4115 | + mds_user_clear_cpu_buffers(); |
4116 | } |
4117 | NOKPROBE_SYMBOL(do_nmi); |
4118 | |
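The NMI exit path needs its own mds_user_clear_cpu_buffers() call because an NMI can return straight to user space without passing through the regular exit work. The primitive behind it is a VERW instruction with a valid selector, which on MD_CLEAR-capable microcode also flushes the affected CPU buffers; roughly (kernel style, the real helper lives in asm/nospec-branch.h):

    #include <asm/segment.h>

    static inline void mds_clear_cpu_buffers(void)
    {
            static const u16 ds = __KERNEL_DS;

            /*
             * With MD_CLEAR-capable microcode, VERW flushes the CPU
             * buffers as a side effect of its otherwise harmless
             * segment-verification check.
             */
            asm volatile("verw %[ds]" : : [ds] "m" (ds) : "cc");
    }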
4119 | diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c |
4120 | index 00a9047539d7..2e4eab22ca37 100644 |
4121 | --- a/arch/x86/kernel/process.c |
4122 | +++ b/arch/x86/kernel/process.c |
4123 | @@ -35,6 +35,8 @@ |
4124 | #include <asm/switch_to.h> |
4125 | #include <asm/spec-ctrl.h> |
4126 | |
4127 | +#include "process.h" |
4128 | + |
4129 | /* |
4130 | * per-CPU TSS segments. Threads are completely 'soft' on Linux, |
4131 | * no more per-task TSS's. The TSS size is kept cacheline-aligned |
4132 | @@ -183,11 +185,12 @@ int set_tsc_mode(unsigned int val) |
4133 | return 0; |
4134 | } |
4135 | |
4136 | -static inline void switch_to_bitmap(struct tss_struct *tss, |
4137 | - struct thread_struct *prev, |
4138 | +static inline void switch_to_bitmap(struct thread_struct *prev, |
4139 | struct thread_struct *next, |
4140 | unsigned long tifp, unsigned long tifn) |
4141 | { |
4142 | + struct tss_struct *tss = this_cpu_ptr(&cpu_tss); |
4143 | + |
4144 | if (tifn & _TIF_IO_BITMAP) { |
4145 | /* |
4146 | * Copy the relevant range of the IO bitmap. |
4147 | @@ -321,32 +324,85 @@ static __always_inline void amd_set_ssb_virt_state(unsigned long tifn) |
4148 | wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, ssbd_tif_to_spec_ctrl(tifn)); |
4149 | } |
4150 | |
4151 | -static __always_inline void intel_set_ssb_state(unsigned long tifn) |
4152 | +/* |
4153 | + * Update the MSRs managing speculation control, during context switch. |
4154 | + * |
4155 | + * tifp: Previous task's thread flags |
4156 | + * tifn: Next task's thread flags |
4157 | + */ |
4158 | +static __always_inline void __speculation_ctrl_update(unsigned long tifp, |
4159 | + unsigned long tifn) |
4160 | { |
4161 | - u64 msr = x86_spec_ctrl_base | ssbd_tif_to_spec_ctrl(tifn); |
4162 | + unsigned long tif_diff = tifp ^ tifn; |
4163 | + u64 msr = x86_spec_ctrl_base; |
4164 | + bool updmsr = false; |
4165 | + |
4166 | + /* |
4167 | + * If TIF_SSBD is different, select the proper mitigation |
4168 | + * method. Note that if SSBD mitigation is disabled or permanently |
4169 | + * enabled, this branch can't be taken because nothing can set |
4170 | + * TIF_SSBD. |
4171 | + */ |
4172 | + if (tif_diff & _TIF_SSBD) { |
4173 | + if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) { |
4174 | + amd_set_ssb_virt_state(tifn); |
4175 | + } else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) { |
4176 | + amd_set_core_ssb_state(tifn); |
4177 | + } else if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || |
4178 | + static_cpu_has(X86_FEATURE_AMD_SSBD)) { |
4179 | + msr |= ssbd_tif_to_spec_ctrl(tifn); |
4180 | + updmsr = true; |
4181 | + } |
4182 | + } |
4183 | + |
4184 | + /* |
4185 | + * Only evaluate TIF_SPEC_IB if conditional STIBP is enabled, |
4186 | + * otherwise avoid the MSR write. |
4187 | + */ |
4188 | + if (IS_ENABLED(CONFIG_SMP) && |
4189 | + static_branch_unlikely(&switch_to_cond_stibp)) { |
4190 | + updmsr |= !!(tif_diff & _TIF_SPEC_IB); |
4191 | + msr |= stibp_tif_to_spec_ctrl(tifn); |
4192 | + } |
4193 | |
4194 | - wrmsrl(MSR_IA32_SPEC_CTRL, msr); |
4195 | + if (updmsr) |
4196 | + wrmsrl(MSR_IA32_SPEC_CTRL, msr); |
4197 | } |
4198 | |
4199 | -static __always_inline void __speculative_store_bypass_update(unsigned long tifn) |
4200 | +static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk) |
4201 | { |
4202 | - if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) |
4203 | - amd_set_ssb_virt_state(tifn); |
4204 | - else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) |
4205 | - amd_set_core_ssb_state(tifn); |
4206 | - else |
4207 | - intel_set_ssb_state(tifn); |
4208 | + if (test_and_clear_tsk_thread_flag(tsk, TIF_SPEC_FORCE_UPDATE)) { |
4209 | + if (task_spec_ssb_disable(tsk)) |
4210 | + set_tsk_thread_flag(tsk, TIF_SSBD); |
4211 | + else |
4212 | + clear_tsk_thread_flag(tsk, TIF_SSBD); |
4213 | + |
4214 | + if (task_spec_ib_disable(tsk)) |
4215 | + set_tsk_thread_flag(tsk, TIF_SPEC_IB); |
4216 | + else |
4217 | + clear_tsk_thread_flag(tsk, TIF_SPEC_IB); |
4218 | + } |
4219 | + /* Return the updated thread_info flags */ |
4220 | + return task_thread_info(tsk)->flags; |
4221 | } |
4222 | |
4223 | -void speculative_store_bypass_update(unsigned long tif) |
4224 | +void speculation_ctrl_update(unsigned long tif) |
4225 | { |
4226 | + /* Forced update. Make sure all relevant TIF flags are different */ |
4227 | preempt_disable(); |
4228 | - __speculative_store_bypass_update(tif); |
4229 | + __speculation_ctrl_update(~tif, tif); |
4230 | preempt_enable(); |
4231 | } |
4232 | |
4233 | -void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, |
4234 | - struct tss_struct *tss) |
4235 | +/* Called from seccomp/prctl update */ |
4236 | +void speculation_ctrl_update_current(void) |
4237 | +{ |
4238 | + preempt_disable(); |
4239 | + speculation_ctrl_update(speculation_ctrl_update_tif(current)); |
4240 | + preempt_enable(); |
4241 | +} |
4242 | + |
4243 | +void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) |
4244 | { |
4245 | struct thread_struct *prev, *next; |
4246 | unsigned long tifp, tifn; |
4247 | @@ -356,7 +412,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, |
4248 | |
4249 | tifn = READ_ONCE(task_thread_info(next_p)->flags); |
4250 | tifp = READ_ONCE(task_thread_info(prev_p)->flags); |
4251 | - switch_to_bitmap(tss, prev, next, tifp, tifn); |
4252 | + switch_to_bitmap(prev, next, tifp, tifn); |
4253 | |
4254 | propagate_user_return_notify(prev_p, next_p); |
4255 | |
4256 | @@ -374,8 +430,15 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, |
4257 | if ((tifp ^ tifn) & _TIF_NOTSC) |
4258 | cr4_toggle_bits(X86_CR4_TSD); |
4259 | |
4260 | - if ((tifp ^ tifn) & _TIF_SSBD) |
4261 | - __speculative_store_bypass_update(tifn); |
4262 | + if (likely(!((tifp | tifn) & _TIF_SPEC_FORCE_UPDATE))) { |
4263 | + __speculation_ctrl_update(tifp, tifn); |
4264 | + } else { |
4265 | + speculation_ctrl_update_tif(prev_p); |
4266 | + tifn = speculation_ctrl_update_tif(next_p); |
4267 | + |
4268 | + /* Enforce MSR update to ensure consistent state */ |
4269 | + __speculation_ctrl_update(~tifn, tifn); |
4270 | + } |
4271 | } |
4272 | |
4273 | /* |
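Note the trick in speculation_ctrl_update(): __speculation_ctrl_update() acts only on tif_diff = tifp ^ tifn, so passing ~tif as the "previous" flags makes every bit appear changed and forces a full re-evaluation of the MSRs. A two-line userspace illustration:

    #include <stdio.h>

    int main(void)
    {
            unsigned long tif = 0x208;              /* arbitrary flag word */

            /* ~tif ^ tif is all ones: every TIF bit appears "different" */
            printf("forced tif_diff = %#lx\n", ~tif ^ tif);
            return 0;
    }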
4274 | diff --git a/arch/x86/kernel/process.h b/arch/x86/kernel/process.h |
4275 | new file mode 100644 |
4276 | index 000000000000..898e97cf6629 |
4277 | --- /dev/null |
4278 | +++ b/arch/x86/kernel/process.h |
4279 | @@ -0,0 +1,39 @@ |
4280 | +// SPDX-License-Identifier: GPL-2.0 |
4281 | +// |
4282 | +// Code shared between 32 and 64 bit |
4283 | + |
4284 | +#include <asm/spec-ctrl.h> |
4285 | + |
4286 | +void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p); |
4287 | + |
4288 | +/* |
4289 | + * This needs to be inline to optimize for the common case where no extra |
4290 | + * work needs to be done. |
4291 | + */ |
4292 | +static inline void switch_to_extra(struct task_struct *prev, |
4293 | + struct task_struct *next) |
4294 | +{ |
4295 | + unsigned long next_tif = task_thread_info(next)->flags; |
4296 | + unsigned long prev_tif = task_thread_info(prev)->flags; |
4297 | + |
4298 | + if (IS_ENABLED(CONFIG_SMP)) { |
4299 | + /* |
4300 | + * Avoid __switch_to_xtra() invocation when conditional |
4301 | + * STIBP is disabled and the only different bit is |
4302 | + * TIF_SPEC_IB. For CONFIG_SMP=n TIF_SPEC_IB is not |
4303 | + * in the TIF_WORK_CTXSW masks. |
4304 | + */ |
4305 | + if (!static_branch_likely(&switch_to_cond_stibp)) { |
4306 | + prev_tif &= ~_TIF_SPEC_IB; |
4307 | + next_tif &= ~_TIF_SPEC_IB; |
4308 | + } |
4309 | + } |
4310 | + |
4311 | + /* |
4312 | + * __switch_to_xtra() handles debug registers, i/o bitmaps, |
4313 | + * speculation mitigations etc. |
4314 | + */ |
4315 | + if (unlikely(next_tif & _TIF_WORK_CTXSW_NEXT || |
4316 | + prev_tif & _TIF_WORK_CTXSW_PREV)) |
4317 | + __switch_to_xtra(prev, next); |
4318 | +} |
4319 | diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c |
4320 | index bd7be8efdc4c..912246fd6cd9 100644 |
4321 | --- a/arch/x86/kernel/process_32.c |
4322 | +++ b/arch/x86/kernel/process_32.c |
4323 | @@ -55,6 +55,8 @@ |
4324 | #include <asm/switch_to.h> |
4325 | #include <asm/vm86.h> |
4326 | |
4327 | +#include "process.h" |
4328 | + |
4329 | void __show_regs(struct pt_regs *regs, int all) |
4330 | { |
4331 | unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; |
4332 | @@ -264,12 +266,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) |
4333 | if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl)) |
4334 | set_iopl_mask(next->iopl); |
4335 | |
4336 | - /* |
4337 | - * Now maybe handle debug registers and/or IO bitmaps |
4338 | - */ |
4339 | - if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV || |
4340 | - task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT)) |
4341 | - __switch_to_xtra(prev_p, next_p, tss); |
4342 | + switch_to_extra(prev_p, next_p); |
4343 | |
4344 | /* |
4345 | * Leave lazy mode, flushing any hypercalls made here. |
4346 | diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c |
4347 | index a2661814bde0..81eec65fe053 100644 |
4348 | --- a/arch/x86/kernel/process_64.c |
4349 | +++ b/arch/x86/kernel/process_64.c |
4350 | @@ -51,6 +51,8 @@ |
4351 | #include <asm/xen/hypervisor.h> |
4352 | #include <asm/vdso.h> |
4353 | |
4354 | +#include "process.h" |
4355 | + |
4356 | __visible DEFINE_PER_CPU(unsigned long, rsp_scratch); |
4357 | |
4358 | /* Prints also some state that isn't saved in the pt_regs */ |
4359 | @@ -454,12 +456,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) |
4360 | /* Reload esp0 and ss1. This changes current_thread_info(). */ |
4361 | load_sp0(tss, next); |
4362 | |
4363 | - /* |
4364 | - * Now maybe reload the debug registers and handle I/O bitmaps |
4365 | - */ |
4366 | - if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT || |
4367 | - task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) |
4368 | - __switch_to_xtra(prev_p, next_p, tss); |
4369 | + switch_to_extra(prev_p, next_p); |
4370 | |
4371 | #ifdef CONFIG_XEN |
4372 | /* |
4373 | diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c |
4374 | index 5bbfa2f63b8c..ef225fa8e928 100644 |
4375 | --- a/arch/x86/kernel/traps.c |
4376 | +++ b/arch/x86/kernel/traps.c |
4377 | @@ -62,6 +62,7 @@ |
4378 | #include <asm/alternative.h> |
4379 | #include <asm/fpu/xstate.h> |
4380 | #include <asm/trace/mpx.h> |
4381 | +#include <asm/nospec-branch.h> |
4382 | #include <asm/mpx.h> |
4383 | #include <asm/vm86.h> |
4384 | |
4385 | @@ -340,6 +341,13 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) |
4386 | regs->ip = (unsigned long)general_protection; |
4387 | regs->sp = (unsigned long)&normal_regs->orig_ax; |
4388 | |
4389 | + /* |
4390 | + * This situation can be triggered by userspace via |
4391 | + * modify_ldt(2) and the return does not take the regular |
4392 | + * user space exit, so a CPU buffer clear is required when |
4393 | + * MDS mitigation is enabled. |
4394 | + */ |
4395 | + mds_user_clear_cpu_buffers(); |
4396 | return; |
4397 | } |
4398 | #endif |
4399 | diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c |
4400 | index 769c370011d6..cb768417429d 100644 |
4401 | --- a/arch/x86/kernel/tsc.c |
4402 | +++ b/arch/x86/kernel/tsc.c |
4403 | @@ -713,7 +713,7 @@ unsigned long native_calibrate_tsc(void) |
4404 | case INTEL_FAM6_KABYLAKE_DESKTOP: |
4405 | crystal_khz = 24000; /* 24.0 MHz */ |
4406 | break; |
4407 | - case INTEL_FAM6_ATOM_DENVERTON: |
4408 | + case INTEL_FAM6_ATOM_GOLDMONT_X: |
4409 | crystal_khz = 25000; /* 25.0 MHz */ |
4410 | break; |
4411 | case INTEL_FAM6_ATOM_GOLDMONT: |
4412 | diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c |
4413 | index c17d3893ae60..fc8236fd2495 100644 |
4414 | --- a/arch/x86/kvm/cpuid.c |
4415 | +++ b/arch/x86/kvm/cpuid.c |
4416 | @@ -355,7 +355,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, |
4417 | |
4418 | /* cpuid 0x80000008.ebx */ |
4419 | const u32 kvm_cpuid_8000_0008_ebx_x86_features = |
4420 | - F(AMD_IBPB) | F(AMD_IBRS) | F(VIRT_SSBD); |
4421 | + F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | |
4422 | + F(AMD_SSB_NO) | F(AMD_STIBP); |
4423 | |
4424 | /* cpuid 0xC0000001.edx */ |
4425 | const u32 kvm_cpuid_C000_0001_edx_x86_features = |
4426 | @@ -380,7 +381,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, |
4427 | |
4428 | /* cpuid 7.0.edx*/ |
4429 | const u32 kvm_cpuid_7_0_edx_x86_features = |
4430 | - F(SPEC_CTRL) | F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES); |
4431 | + F(SPEC_CTRL) | F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | |
4432 | + F(INTEL_STIBP) | F(MD_CLEAR); |
4433 | |
4434 | /* all calls to cpuid_count() should be made on the same cpu */ |
4435 | get_cpu(); |
4436 | @@ -633,7 +635,12 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, |
4437 | entry->ebx |= F(VIRT_SSBD); |
4438 | entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features; |
4439 | cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX); |
4440 | - if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD)) |
4441 | + /* |
4442 | + * The preference is to use the SPEC_CTRL MSR instead of the |
4443 | + * VIRT_SPEC MSR. |
4444 | + */ |
4445 | + if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) && |
4446 | + !boot_cpu_has(X86_FEATURE_AMD_SSBD)) |
4447 | entry->ebx |= F(VIRT_SSBD); |
4448 | break; |
4449 | } |
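With MD_CLEAR, INTEL_STIBP and the AMD SSBD/STIBP bits now exposed through the supported-CPUID path, software inside a guest can probe them the same way as on bare metal. A userspace check of the CPUID leaf 7 (subleaf 0) EDX bits, with bit positions as enumerated by Intel:

    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx;

            if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
                    return 1;

            printf("MD_CLEAR:  %s\n", (edx & (1u << 10)) ? "yes" : "no");
            printf("SPEC_CTRL: %s\n", (edx & (1u << 26)) ? "yes" : "no");
            printf("STIBP:     %s\n", (edx & (1u << 27)) ? "yes" : "no");
            return 0;
    }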
4450 | diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h |
4451 | index 8a841b9d8f84..b2bf8e1d5782 100644 |
4452 | --- a/arch/x86/kvm/cpuid.h |
4453 | +++ b/arch/x86/kvm/cpuid.h |
4454 | @@ -176,7 +176,7 @@ static inline bool guest_cpuid_has_spec_ctrl(struct kvm_vcpu *vcpu) |
4455 | struct kvm_cpuid_entry2 *best; |
4456 | |
4457 | best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); |
4458 | - if (best && (best->ebx & bit(X86_FEATURE_AMD_IBRS))) |
4459 | + if (best && (best->ebx & (bit(X86_FEATURE_AMD_IBRS) | bit(X86_FEATURE_AMD_SSBD)))) |
4460 | return true; |
4461 | best = kvm_find_cpuid_entry(vcpu, 7, 0); |
4462 | return best && (best->edx & (bit(X86_FEATURE_SPEC_CTRL) | bit(X86_FEATURE_SPEC_CTRL_SSBD))); |
4463 | diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c |
4464 | index 9a6d258c3c16..9338136a6a23 100644 |
4465 | --- a/arch/x86/kvm/svm.c |
4466 | +++ b/arch/x86/kvm/svm.c |
4467 | @@ -3704,7 +3704,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) |
4468 | return 1; |
4469 | |
4470 | /* The STIBP bit doesn't fault even if it's not advertised */ |
4471 | - if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP)) |
4472 | + if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD)) |
4473 | return 1; |
4474 | |
4475 | svm->spec_ctrl = data; |
4476 | diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c |
4477 | index 75466d9417b8..8feb4f7e2e59 100644 |
4478 | --- a/arch/x86/kvm/vmx.c |
4479 | +++ b/arch/x86/kvm/vmx.c |
4480 | @@ -9206,8 +9206,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) |
4481 | |
4482 | vmx->__launched = vmx->loaded_vmcs->launched; |
4483 | |
4484 | + /* L1D Flush includes CPU buffer clear to mitigate MDS */ |
4485 | if (static_branch_unlikely(&vmx_l1d_should_flush)) |
4486 | vmx_l1d_flush(vcpu); |
4487 | + else if (static_branch_unlikely(&mds_user_clear)) |
4488 | + mds_clear_cpu_buffers(); |
4489 | |
4490 | asm( |
4491 | /* Store host registers */ |
4492 | @@ -9566,8 +9569,8 @@ free_vcpu: |
4493 | return ERR_PTR(err); |
4494 | } |
4495 | |
4496 | -#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n" |
4497 | -#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n" |
4498 | +#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" |
4499 | +#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" |
4500 | |
4501 | static int vmx_vm_init(struct kvm *kvm) |
4502 | { |
4503 | diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c |
4504 | index 90801a8f19c9..ce092a62fc5d 100644 |
4505 | --- a/arch/x86/mm/init.c |
4506 | +++ b/arch/x86/mm/init.c |
4507 | @@ -790,7 +790,7 @@ unsigned long max_swapfile_size(void) |
4508 | |
4509 | pages = generic_max_swapfile_size(); |
4510 | |
4511 | - if (boot_cpu_has_bug(X86_BUG_L1TF)) { |
4512 | + if (boot_cpu_has_bug(X86_BUG_L1TF) && l1tf_mitigation != L1TF_MITIGATION_OFF) { |
4513 | /* Limit the swap file size to MAX_PA/2 for L1TF workaround */ |
4514 | unsigned long long l1tf_limit = l1tf_pfn_limit(); |
4515 | /* |
4516 | diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c |
4517 | index 3f729e20f0e3..12522dbae615 100644 |
4518 | --- a/arch/x86/mm/kaiser.c |
4519 | +++ b/arch/x86/mm/kaiser.c |
4520 | @@ -9,6 +9,7 @@ |
4521 | #include <linux/spinlock.h> |
4522 | #include <linux/mm.h> |
4523 | #include <linux/uaccess.h> |
4524 | +#include <linux/cpu.h> |
4525 | |
4526 | #undef pr_fmt |
4527 | #define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt |
4528 | @@ -297,7 +298,8 @@ void __init kaiser_check_boottime_disable(void) |
4529 | goto skip; |
4530 | } |
4531 | |
4532 | - if (cmdline_find_option_bool(boot_command_line, "nopti")) |
4533 | + if (cmdline_find_option_bool(boot_command_line, "nopti") || |
4534 | + cpu_mitigations_off()) |
4535 | goto disable; |
4536 | |
4537 | skip: |
4538 | diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c |
4539 | index e30baa8ad94f..dff8ac2d255c 100644 |
4540 | --- a/arch/x86/mm/pgtable.c |
4541 | +++ b/arch/x86/mm/pgtable.c |
4542 | @@ -251,7 +251,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) |
4543 | if (pgd_val(pgd) != 0) { |
4544 | pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); |
4545 | |
4546 | - pgdp[i] = native_make_pgd(0); |
4547 | + pgd_clear(&pgdp[i]); |
4548 | |
4549 | paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); |
4550 | pmd_free(mm, pmd); |
4551 | @@ -419,7 +419,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, |
4552 | int changed = !pte_same(*ptep, entry); |
4553 | |
4554 | if (changed && dirty) { |
4555 | - *ptep = entry; |
4556 | + set_pte(ptep, entry); |
4557 | pte_update(vma->vm_mm, address, ptep); |
4558 | } |
4559 | |
4560 | @@ -436,7 +436,7 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, |
4561 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); |
4562 | |
4563 | if (changed && dirty) { |
4564 | - *pmdp = entry; |
4565 | + set_pmd(pmdp, entry); |
4566 | /* |
4567 | * We had a write-protection fault here and changed the pmd |
4568 | * to be more permissive. No need to flush the TLB for that, |
4569 | diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c |
4570 | index eac92e2d171b..a112bb175dd4 100644 |
4571 | --- a/arch/x86/mm/tlb.c |
4572 | +++ b/arch/x86/mm/tlb.c |
4573 | @@ -30,6 +30,12 @@ |
4574 | * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi |
4575 | */ |
4576 | |
4577 | +/* |
4578 | + * Use bit 0 to mangle the TIF_SPEC_IB state into the mm pointer which is |
4579 | + * stored in cpu_tlb_state.last_user_mm_ibpb. |
4580 | + */ |
4581 | +#define LAST_USER_MM_IBPB 0x1UL |
4582 | + |
4583 | atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1); |
4584 | |
4585 | struct flush_tlb_info { |
4586 | @@ -101,33 +107,101 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next, |
4587 | local_irq_restore(flags); |
4588 | } |
4589 | |
4590 | +static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next) |
4591 | +{ |
4592 | + unsigned long next_tif = task_thread_info(next)->flags; |
4593 | + unsigned long ibpb = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_IBPB; |
4594 | + |
4595 | + return (unsigned long)next->mm | ibpb; |
4596 | +} |
4597 | + |
4598 | +static void cond_ibpb(struct task_struct *next) |
4599 | +{ |
4600 | + if (!next || !next->mm) |
4601 | + return; |
4602 | + |
4603 | + /* |
4604 | + * Both the conditional and the always-IBPB modes use the mm |
4605 | + * pointer to avoid the IBPB when switching between tasks of the |
4606 | + * same process. Using the mm pointer instead of mm->context.ctx_id |
4607 | + * opens a hypothetical hole vs. mm_struct reuse, which is more or |
4608 | + * less impossible for an attacker to control. Aside from that, it |
4609 | + * would only affect the first schedule, so the theoretically |
4610 | + * exposed data is not really interesting. |
4611 | + */ |
4612 | + if (static_branch_likely(&switch_mm_cond_ibpb)) { |
4613 | + unsigned long prev_mm, next_mm; |
4614 | + |
4615 | + /* |
4616 | + * This is a bit more complex than the always mode because |
4617 | + * it has to handle two cases: |
4618 | + * |
4619 | + * 1) Switch from a user space task (potential attacker) |
4620 | + * which has TIF_SPEC_IB set to a user space task |
4621 | + * (potential victim) which has TIF_SPEC_IB not set. |
4622 | + * |
4623 | + * 2) Switch from a user space task (potential attacker) |
4624 | + * which has TIF_SPEC_IB not set to a user space task |
4625 | + * (potential victim) which has TIF_SPEC_IB set. |
4626 | + * |
4627 | + * This could be done by unconditionally issuing IBPB when |
4628 | + * a task which has TIF_SPEC_IB set is either scheduled in |
4629 | + * or out. Though that results in two flushes when: |
4630 | + * |
4631 | + * - the same user space task is scheduled out and later |
4632 | + * scheduled in again and only a kernel thread ran in |
4633 | + * between. |
4634 | + * |
4635 | + * - a user space task belonging to the same process is |
4636 | + * scheduled in after a kernel thread ran in between |
4637 | + * |
4638 | + * - a user space task belonging to the same process is |
4639 | + * scheduled in immediately. |
4640 | + * |
4641 | + * Optimize this with reasonably small overhead for the |
4642 | + * above cases. Mangle the TIF_SPEC_IB bit into the mm |
4643 | + * pointer of the incoming task which is stored in |
4644 | + * cpu_tlbstate.last_user_mm_ibpb for comparison. |
4645 | + */ |
4646 | + next_mm = mm_mangle_tif_spec_ib(next); |
4647 | + prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_ibpb); |
4648 | + |
4649 | + /* |
4650 | + * Issue IBPB only if the mm's are different and one or |
4651 | + * both have the IBPB bit set. |
4652 | + */ |
4653 | + if (next_mm != prev_mm && |
4654 | + (next_mm | prev_mm) & LAST_USER_MM_IBPB) |
4655 | + indirect_branch_prediction_barrier(); |
4656 | + |
4657 | + this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, next_mm); |
4658 | + } |
4659 | + |
4660 | + if (static_branch_unlikely(&switch_mm_always_ibpb)) { |
4661 | + /* |
4662 | + * Only flush when switching to a user space task with a |
4663 | + * different context than the user space task which ran |
4664 | + * last on this CPU. |
4665 | + */ |
4666 | + if (this_cpu_read(cpu_tlbstate.last_user_mm) != next->mm) { |
4667 | + indirect_branch_prediction_barrier(); |
4668 | + this_cpu_write(cpu_tlbstate.last_user_mm, next->mm); |
4669 | + } |
4670 | + } |
4671 | +} |
4672 | + |
4673 | void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, |
4674 | struct task_struct *tsk) |
4675 | { |
4676 | unsigned cpu = smp_processor_id(); |
4677 | |
4678 | if (likely(prev != next)) { |
4679 | - u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id); |
4680 | - |
4681 | /* |
4682 | * Avoid user/user BTB poisoning by flushing the branch |
4683 | * predictor when switching between processes. This stops |
4684 | * one process from doing Spectre-v2 attacks on another. |
4685 | - * |
4686 | - * As an optimization, flush indirect branches only when |
4687 | - * switching into processes that disable dumping. This |
4688 | - * protects high value processes like gpg, without having |
4689 | - * too high performance overhead. IBPB is *expensive*! |
4690 | - * |
4691 | - * This will not flush branches when switching into kernel |
4692 | - * threads. It will also not flush if we switch to idle |
4693 | - * thread and back to the same process. It will flush if we |
4694 | - * switch to a different non-dumpable process. |
4695 | */ |
4696 | - if (tsk && tsk->mm && |
4697 | - tsk->mm->context.ctx_id != last_ctx_id && |
4698 | - get_dumpable(tsk->mm) != SUID_DUMP_USER) |
4699 | - indirect_branch_prediction_barrier(); |
4700 | + cond_ibpb(tsk); |
4701 | |
4702 | if (IS_ENABLED(CONFIG_VMAP_STACK)) { |
4703 | /* |
4704 | @@ -143,14 +217,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, |
4705 | set_pgd(pgd, init_mm.pgd[stack_pgd_index]); |
4706 | } |
4707 | |
4708 | - /* |
4709 | - * Record last user mm's context id, so we can avoid |
4710 | - * flushing branch buffer with IBPB if we switch back |
4711 | - * to the same user. |
4712 | - */ |
4713 | - if (next != &init_mm) |
4714 | - this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id); |
4715 | - |
4716 | this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); |
4717 | this_cpu_write(cpu_tlbstate.active_mm, next); |
4718 | |
4719 | diff --git a/arch/x86/platform/atom/punit_atom_debug.c b/arch/x86/platform/atom/punit_atom_debug.c |
4720 | index d49d3be81953..ecb5866aaf84 100644 |
4721 | --- a/arch/x86/platform/atom/punit_atom_debug.c |
4722 | +++ b/arch/x86/platform/atom/punit_atom_debug.c |
4723 | @@ -154,8 +154,8 @@ static void punit_dbgfs_unregister(void) |
4724 | (kernel_ulong_t)&drv_data } |
4725 | |
4726 | static const struct x86_cpu_id intel_punit_cpu_ids[] = { |
4727 | - ICPU(INTEL_FAM6_ATOM_SILVERMONT1, punit_device_byt), |
4728 | - ICPU(INTEL_FAM6_ATOM_MERRIFIELD, punit_device_tng), |
4729 | + ICPU(INTEL_FAM6_ATOM_SILVERMONT, punit_device_byt), |
4730 | + ICPU(INTEL_FAM6_ATOM_SILVERMONT_MID, punit_device_tng), |
4731 | ICPU(INTEL_FAM6_ATOM_AIRMONT, punit_device_cht), |
4732 | {} |
4733 | }; |
4734 | diff --git a/drivers/acpi/acpi_lpss.c b/drivers/acpi/acpi_lpss.c |
4735 | index 957d3fa3b543..8e38249311bd 100644 |
4736 | --- a/drivers/acpi/acpi_lpss.c |
4737 | +++ b/drivers/acpi/acpi_lpss.c |
4738 | @@ -243,7 +243,7 @@ static const struct lpss_device_desc bsw_spi_dev_desc = { |
4739 | #define ICPU(model) { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, } |
4740 | |
4741 | static const struct x86_cpu_id lpss_cpu_ids[] = { |
4742 | - ICPU(INTEL_FAM6_ATOM_SILVERMONT1), /* Valleyview, Bay Trail */ |
4743 | + ICPU(INTEL_FAM6_ATOM_SILVERMONT), /* Valleyview, Bay Trail */ |
4744 | ICPU(INTEL_FAM6_ATOM_AIRMONT), /* Braswell, Cherry Trail */ |
4745 | {} |
4746 | }; |
4747 | diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c |
4748 | index f1f4ce7ddb47..3b123735a1c4 100644 |
4749 | --- a/drivers/base/cpu.c |
4750 | +++ b/drivers/base/cpu.c |
4751 | @@ -531,11 +531,18 @@ ssize_t __weak cpu_show_l1tf(struct device *dev, |
4752 | return sprintf(buf, "Not affected\n"); |
4753 | } |
4754 | |
4755 | +ssize_t __weak cpu_show_mds(struct device *dev, |
4756 | + struct device_attribute *attr, char *buf) |
4757 | +{ |
4758 | + return sprintf(buf, "Not affected\n"); |
4759 | +} |
4760 | + |
4761 | static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL); |
4762 | static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL); |
4763 | static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL); |
4764 | static DEVICE_ATTR(spec_store_bypass, 0444, cpu_show_spec_store_bypass, NULL); |
4765 | static DEVICE_ATTR(l1tf, 0444, cpu_show_l1tf, NULL); |
4766 | +static DEVICE_ATTR(mds, 0444, cpu_show_mds, NULL); |
4767 | |
4768 | static struct attribute *cpu_root_vulnerabilities_attrs[] = { |
4769 | &dev_attr_meltdown.attr, |
4770 | @@ -543,6 +550,7 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = { |
4771 | &dev_attr_spectre_v2.attr, |
4772 | &dev_attr_spec_store_bypass.attr, |
4773 | &dev_attr_l1tf.attr, |
4774 | + &dev_attr_mds.attr, |
4775 | NULL |
4776 | }; |
4777 | |
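Together with the __weak cpu_show_mds() fallback above, every architecture now exposes an mds entry under the vulnerabilities directory. A minimal reader (the quoted output is one of the formats x86 reports; the file is simply absent on kernels without this patch):

    #include <stdio.h>

    int main(void)
    {
            char buf[128];
            FILE *f = fopen("/sys/devices/system/cpu/vulnerabilities/mds", "r");

            if (!f)
                    return 1;       /* absent on kernels without this patch */
            if (fgets(buf, sizeof(buf), f))
                    fputs(buf, stdout);     /* e.g. "Mitigation: Clear CPU buffers; SMT vulnerable" */
            fclose(f);
            return 0;
    }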
4778 | diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c |
4779 | index f690085b1ad9..4fe999687415 100644 |
4780 | --- a/drivers/cpufreq/intel_pstate.c |
4781 | +++ b/drivers/cpufreq/intel_pstate.c |
4782 | @@ -1413,7 +1413,7 @@ static void intel_pstate_update_util(struct update_util_data *data, u64 time, |
4783 | static const struct x86_cpu_id intel_pstate_cpu_ids[] = { |
4784 | ICPU(INTEL_FAM6_SANDYBRIDGE, core_params), |
4785 | ICPU(INTEL_FAM6_SANDYBRIDGE_X, core_params), |
4786 | - ICPU(INTEL_FAM6_ATOM_SILVERMONT1, silvermont_params), |
4787 | + ICPU(INTEL_FAM6_ATOM_SILVERMONT, silvermont_params), |
4788 | ICPU(INTEL_FAM6_IVYBRIDGE, core_params), |
4789 | ICPU(INTEL_FAM6_HASWELL_CORE, core_params), |
4790 | ICPU(INTEL_FAM6_BROADWELL_CORE, core_params), |
4791 | diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c |
4792 | index 5ded9b22b015..a6fa32c7e068 100644 |
4793 | --- a/drivers/idle/intel_idle.c |
4794 | +++ b/drivers/idle/intel_idle.c |
4795 | @@ -1107,14 +1107,14 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { |
4796 | ICPU(INTEL_FAM6_WESTMERE, idle_cpu_nehalem), |
4797 | ICPU(INTEL_FAM6_WESTMERE_EP, idle_cpu_nehalem), |
4798 | ICPU(INTEL_FAM6_NEHALEM_EX, idle_cpu_nehalem), |
4799 | - ICPU(INTEL_FAM6_ATOM_PINEVIEW, idle_cpu_atom), |
4800 | - ICPU(INTEL_FAM6_ATOM_LINCROFT, idle_cpu_lincroft), |
4801 | + ICPU(INTEL_FAM6_ATOM_BONNELL, idle_cpu_atom), |
4802 | + ICPU(INTEL_FAM6_ATOM_BONNELL_MID, idle_cpu_lincroft), |
4803 | ICPU(INTEL_FAM6_WESTMERE_EX, idle_cpu_nehalem), |
4804 | ICPU(INTEL_FAM6_SANDYBRIDGE, idle_cpu_snb), |
4805 | ICPU(INTEL_FAM6_SANDYBRIDGE_X, idle_cpu_snb), |
4806 | - ICPU(INTEL_FAM6_ATOM_CEDARVIEW, idle_cpu_atom), |
4807 | - ICPU(INTEL_FAM6_ATOM_SILVERMONT1, idle_cpu_byt), |
4808 | - ICPU(INTEL_FAM6_ATOM_MERRIFIELD, idle_cpu_tangier), |
4809 | + ICPU(INTEL_FAM6_ATOM_SALTWELL, idle_cpu_atom), |
4810 | + ICPU(INTEL_FAM6_ATOM_SILVERMONT, idle_cpu_byt), |
4811 | + ICPU(INTEL_FAM6_ATOM_SILVERMONT_MID, idle_cpu_tangier), |
4812 | ICPU(INTEL_FAM6_ATOM_AIRMONT, idle_cpu_cht), |
4813 | ICPU(INTEL_FAM6_IVYBRIDGE, idle_cpu_ivb), |
4814 | ICPU(INTEL_FAM6_IVYBRIDGE_X, idle_cpu_ivt), |
4815 | @@ -1122,7 +1122,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { |
4816 | ICPU(INTEL_FAM6_HASWELL_X, idle_cpu_hsw), |
4817 | ICPU(INTEL_FAM6_HASWELL_ULT, idle_cpu_hsw), |
4818 | ICPU(INTEL_FAM6_HASWELL_GT3E, idle_cpu_hsw), |
4819 | - ICPU(INTEL_FAM6_ATOM_SILVERMONT2, idle_cpu_avn), |
4820 | + ICPU(INTEL_FAM6_ATOM_SILVERMONT_X, idle_cpu_avn), |
4821 | ICPU(INTEL_FAM6_BROADWELL_CORE, idle_cpu_bdw), |
4822 | ICPU(INTEL_FAM6_BROADWELL_GT3E, idle_cpu_bdw), |
4823 | ICPU(INTEL_FAM6_BROADWELL_X, idle_cpu_bdw), |
4824 | @@ -1134,7 +1134,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { |
4825 | ICPU(INTEL_FAM6_SKYLAKE_X, idle_cpu_skx), |
4826 | ICPU(INTEL_FAM6_XEON_PHI_KNL, idle_cpu_knl), |
4827 | ICPU(INTEL_FAM6_ATOM_GOLDMONT, idle_cpu_bxt), |
4828 | - ICPU(INTEL_FAM6_ATOM_DENVERTON, idle_cpu_dnv), |
4829 | + ICPU(INTEL_FAM6_ATOM_GOLDMONT_X, idle_cpu_dnv), |
4830 | {} |
4831 | }; |
4832 | |
4833 | diff --git a/drivers/mmc/host/sdhci-acpi.c b/drivers/mmc/host/sdhci-acpi.c |
4834 | index 80918abfc468..4398398c0935 100644 |
4835 | --- a/drivers/mmc/host/sdhci-acpi.c |
4836 | +++ b/drivers/mmc/host/sdhci-acpi.c |
4837 | @@ -127,7 +127,7 @@ static const struct sdhci_acpi_chip sdhci_acpi_chip_int = { |
4838 | static bool sdhci_acpi_byt(void) |
4839 | { |
4840 | static const struct x86_cpu_id byt[] = { |
4841 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 }, |
4842 | + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT }, |
4843 | {} |
4844 | }; |
4845 | |
4846 | diff --git a/drivers/pci/pci-mid.c b/drivers/pci/pci-mid.c |
4847 | index c7f3408e3148..54b3f9bc5ad8 100644 |
4848 | --- a/drivers/pci/pci-mid.c |
4849 | +++ b/drivers/pci/pci-mid.c |
4850 | @@ -71,8 +71,8 @@ static struct pci_platform_pm_ops mid_pci_platform_pm = { |
4851 | * arch/x86/platform/intel-mid/pwr.c. |
4852 | */ |
4853 | static const struct x86_cpu_id lpss_cpu_ids[] = { |
4854 | - ICPU(INTEL_FAM6_ATOM_PENWELL), |
4855 | - ICPU(INTEL_FAM6_ATOM_MERRIFIELD), |
4856 | + ICPU(INTEL_FAM6_ATOM_SALTWELL_MID), |
4857 | + ICPU(INTEL_FAM6_ATOM_SILVERMONT_MID), |
4858 | {} |
4859 | }; |
4860 | |
4861 | diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c |
4862 | index 3c71f608b444..8809c1a20bed 100644 |
4863 | --- a/drivers/powercap/intel_rapl.c |
4864 | +++ b/drivers/powercap/intel_rapl.c |
4865 | @@ -1175,12 +1175,12 @@ static const struct x86_cpu_id rapl_ids[] __initconst = { |
4866 | RAPL_CPU(INTEL_FAM6_KABYLAKE_MOBILE, rapl_defaults_core), |
4867 | RAPL_CPU(INTEL_FAM6_KABYLAKE_DESKTOP, rapl_defaults_core), |
4868 | |
4869 | - RAPL_CPU(INTEL_FAM6_ATOM_SILVERMONT1, rapl_defaults_byt), |
4870 | + RAPL_CPU(INTEL_FAM6_ATOM_SILVERMONT, rapl_defaults_byt), |
4871 | RAPL_CPU(INTEL_FAM6_ATOM_AIRMONT, rapl_defaults_cht), |
4872 | - RAPL_CPU(INTEL_FAM6_ATOM_MERRIFIELD, rapl_defaults_tng), |
4873 | - RAPL_CPU(INTEL_FAM6_ATOM_MOOREFIELD, rapl_defaults_ann), |
4874 | + RAPL_CPU(INTEL_FAM6_ATOM_SILVERMONT_MID,rapl_defaults_tng), |
4875 | + RAPL_CPU(INTEL_FAM6_ATOM_AIRMONT_MID, rapl_defaults_ann), |
4876 | RAPL_CPU(INTEL_FAM6_ATOM_GOLDMONT, rapl_defaults_core), |
4877 | - RAPL_CPU(INTEL_FAM6_ATOM_DENVERTON, rapl_defaults_core), |
4878 | + RAPL_CPU(INTEL_FAM6_ATOM_GOLDMONT_X, rapl_defaults_core), |
4879 | |
4880 | RAPL_CPU(INTEL_FAM6_XEON_PHI_KNL, rapl_defaults_hsw_server), |
4881 | {} |
4882 | diff --git a/drivers/thermal/intel_soc_dts_thermal.c b/drivers/thermal/intel_soc_dts_thermal.c |
4883 | index b2bbaa1c60b0..18788109cae6 100644 |
4884 | --- a/drivers/thermal/intel_soc_dts_thermal.c |
4885 | +++ b/drivers/thermal/intel_soc_dts_thermal.c |
4886 | @@ -43,7 +43,7 @@ static irqreturn_t soc_irq_thread_fn(int irq, void *dev_data) |
4887 | } |
4888 | |
4889 | static const struct x86_cpu_id soc_thermal_ids[] = { |
4890 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1, 0, |
4891 | + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT, 0, |
4892 | BYT_SOC_DTS_APIC_IRQ}, |
4893 | {} |
4894 | }; |
4895 | diff --git a/include/linux/bitops.h b/include/linux/bitops.h |
4896 | index a83c822c35c2..d4b167fc9ecb 100644 |
4897 | --- a/include/linux/bitops.h |
4898 | +++ b/include/linux/bitops.h |
4899 | @@ -1,28 +1,9 @@ |
4900 | #ifndef _LINUX_BITOPS_H |
4901 | #define _LINUX_BITOPS_H |
4902 | #include <asm/types.h> |
4903 | +#include <linux/bits.h> |
4904 | |
4905 | -#ifdef __KERNEL__ |
4906 | -#define BIT(nr) (1UL << (nr)) |
4907 | -#define BIT_ULL(nr) (1ULL << (nr)) |
4908 | -#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) |
4909 | -#define BIT_WORD(nr) ((nr) / BITS_PER_LONG) |
4910 | -#define BIT_ULL_MASK(nr) (1ULL << ((nr) % BITS_PER_LONG_LONG)) |
4911 | -#define BIT_ULL_WORD(nr) ((nr) / BITS_PER_LONG_LONG) |
4912 | -#define BITS_PER_BYTE 8 |
4913 | #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long)) |
4914 | -#endif |
4915 | - |
4916 | -/* |
4917 | - * Create a contiguous bitmask starting at bit position @l and ending at |
4918 | - * position @h. For example |
4919 | - * GENMASK_ULL(39, 21) gives us the 64bit vector 0x000000ffffe00000. |
4920 | - */ |
4921 | -#define GENMASK(h, l) \ |
4922 | - (((~0UL) << (l)) & (~0UL >> (BITS_PER_LONG - 1 - (h)))) |
4923 | - |
4924 | -#define GENMASK_ULL(h, l) \ |
4925 | - (((~0ULL) << (l)) & (~0ULL >> (BITS_PER_LONG_LONG - 1 - (h)))) |
4926 | |
4927 | extern unsigned int __sw_hweight8(unsigned int w); |
4928 | extern unsigned int __sw_hweight16(unsigned int w); |
4929 | diff --git a/include/linux/bits.h b/include/linux/bits.h |
4930 | new file mode 100644 |
4931 | index 000000000000..2b7b532c1d51 |
4932 | --- /dev/null |
4933 | +++ b/include/linux/bits.h |
4934 | @@ -0,0 +1,26 @@ |
4935 | +/* SPDX-License-Identifier: GPL-2.0 */ |
4936 | +#ifndef __LINUX_BITS_H |
4937 | +#define __LINUX_BITS_H |
4938 | +#include <asm/bitsperlong.h> |
4939 | + |
4940 | +#define BIT(nr) (1UL << (nr)) |
4941 | +#define BIT_ULL(nr) (1ULL << (nr)) |
4942 | +#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) |
4943 | +#define BIT_WORD(nr) ((nr) / BITS_PER_LONG) |
4944 | +#define BIT_ULL_MASK(nr) (1ULL << ((nr) % BITS_PER_LONG_LONG)) |
4945 | +#define BIT_ULL_WORD(nr) ((nr) / BITS_PER_LONG_LONG) |
4946 | +#define BITS_PER_BYTE 8 |
4947 | + |
4948 | +/* |
4949 | + * Create a contiguous bitmask starting at bit position @l and ending at |
4950 | + * position @h. For example |
4951 | + * GENMASK_ULL(39, 21) gives us the 64bit vector 0x000000ffffe00000. |
4952 | + */ |
4953 | +#define GENMASK(h, l) \ |
4954 | + (((~0UL) - (1UL << (l)) + 1) & (~0UL >> (BITS_PER_LONG - 1 - (h)))) |
4955 | + |
4956 | +#define GENMASK_ULL(h, l) \ |
4957 | + (((~0ULL) - (1ULL << (l)) + 1) & \ |
4958 | + (~0ULL >> (BITS_PER_LONG_LONG - 1 - (h)))) |
4959 | + |
4960 | +#endif /* __LINUX_BITS_H */ |
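The new GENMASK forms compute the same masks as before: (~0UL) - (1UL << (l)) + 1 equals ~((1UL << (l)) - 1), i.e. all bits from l upward, identical to the old (~0UL) << (l). A standalone check against the example in the header comment:

    #include <stdio.h>

    #define BITS_PER_LONG_LONG 64
    #define GENMASK_ULL(h, l) \
            (((~0ULL) - (1ULL << (l)) + 1) & \
             (~0ULL >> (BITS_PER_LONG_LONG - 1 - (h))))

    int main(void)
    {
            /* matches the example in the header comment */
            printf("GENMASK_ULL(39, 21) = 0x%016llx\n",
                   (unsigned long long)GENMASK_ULL(39, 21));
            return 0;
    }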
4961 | diff --git a/include/linux/cpu.h b/include/linux/cpu.h |
4962 | index ae5ac89324df..166686209f2c 100644 |
4963 | --- a/include/linux/cpu.h |
4964 | +++ b/include/linux/cpu.h |
4965 | @@ -54,6 +54,8 @@ extern ssize_t cpu_show_spec_store_bypass(struct device *dev, |
4966 | struct device_attribute *attr, char *buf); |
4967 | extern ssize_t cpu_show_l1tf(struct device *dev, |
4968 | struct device_attribute *attr, char *buf); |
4969 | +extern ssize_t cpu_show_mds(struct device *dev, |
4970 | + struct device_attribute *attr, char *buf); |
4971 | |
4972 | extern __printf(4, 5) |
4973 | struct device *cpu_device_create(struct device *parent, void *drvdata, |
4974 | @@ -276,4 +278,28 @@ static inline void cpu_smt_check_topology_early(void) { } |
4975 | static inline void cpu_smt_check_topology(void) { } |
4976 | #endif |
4977 | |
4978 | +/* |
4979 | + * These are used for a global "mitigations=" cmdline option for toggling |
4980 | + * optional CPU mitigations. |
4981 | + */ |
4982 | +enum cpu_mitigations { |
4983 | + CPU_MITIGATIONS_OFF, |
4984 | + CPU_MITIGATIONS_AUTO, |
4985 | + CPU_MITIGATIONS_AUTO_NOSMT, |
4986 | +}; |
4987 | + |
4988 | +extern enum cpu_mitigations cpu_mitigations; |
4989 | + |
4990 | +/* mitigations=off */ |
4991 | +static inline bool cpu_mitigations_off(void) |
4992 | +{ |
4993 | + return cpu_mitigations == CPU_MITIGATIONS_OFF; |
4994 | +} |
4995 | + |
4996 | +/* mitigations=auto,nosmt */ |
4997 | +static inline bool cpu_mitigations_auto_nosmt(void) |
4998 | +{ |
4999 | + return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT; |
5000 | +} |
5001 | + |
5002 | #endif /* _LINUX_CPU_H_ */ |
5003 | diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h |
5004 | index d53a23100401..58ae371556bc 100644 |
5005 | --- a/include/linux/ptrace.h |
5006 | +++ b/include/linux/ptrace.h |
5007 | @@ -60,14 +60,17 @@ extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead); |
5008 | #define PTRACE_MODE_READ 0x01 |
5009 | #define PTRACE_MODE_ATTACH 0x02 |
5010 | #define PTRACE_MODE_NOAUDIT 0x04 |
5011 | -#define PTRACE_MODE_FSCREDS 0x08 |
5012 | -#define PTRACE_MODE_REALCREDS 0x10 |
5013 | +#define PTRACE_MODE_FSCREDS 0x08 |
5014 | +#define PTRACE_MODE_REALCREDS 0x10 |
5015 | +#define PTRACE_MODE_SCHED 0x20 |
5016 | +#define PTRACE_MODE_IBPB 0x40 |
5017 | |
5018 | /* shorthands for READ/ATTACH and FSCREDS/REALCREDS combinations */ |
5019 | #define PTRACE_MODE_READ_FSCREDS (PTRACE_MODE_READ | PTRACE_MODE_FSCREDS) |
5020 | #define PTRACE_MODE_READ_REALCREDS (PTRACE_MODE_READ | PTRACE_MODE_REALCREDS) |
5021 | #define PTRACE_MODE_ATTACH_FSCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_FSCREDS) |
5022 | #define PTRACE_MODE_ATTACH_REALCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_REALCREDS) |
5023 | +#define PTRACE_MODE_SPEC_IBPB (PTRACE_MODE_ATTACH_REALCREDS | PTRACE_MODE_IBPB) |
5024 | |
5025 | /** |
5026 | * ptrace_may_access - check whether the caller is permitted to access |
5027 | @@ -85,6 +88,20 @@ extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead); |
5028 | */ |
5029 | extern bool ptrace_may_access(struct task_struct *task, unsigned int mode); |
5030 | |
5031 | +/** |
5032 | + * ptrace_may_access_sched - check whether the caller is permitted to access |
5033 | + * a target task. |
5034 | + * @task: target task |
5035 | + * @mode: selects type of access and caller credentials |
5036 | + * |
5037 | + * Returns true on success, false on denial. |
5038 | + * |
5039 | + * Similar to ptrace_may_access(). Only to be called from context switch |
5040 | + * code. Does not call into audit or the regular LSM hooks due to locking |
5041 | + * constraints. |
5042 | + */ |
5043 | +extern bool ptrace_may_access_sched(struct task_struct *task, unsigned int mode); |
5044 | + |
5045 | static inline int ptrace_reparented(struct task_struct *child) |
5046 | { |
5047 | return !same_thread_group(child->real_parent, child->parent); |
5048 | diff --git a/include/linux/sched.h b/include/linux/sched.h |
5049 | index ebd0afb35d16..1c487a3abd84 100644 |
5050 | --- a/include/linux/sched.h |
5051 | +++ b/include/linux/sched.h |
5052 | @@ -2357,6 +2357,8 @@ static inline void memalloc_noio_restore(unsigned int flags) |
5053 | #define PFA_LMK_WAITING 3 /* Lowmemorykiller is waiting */ |
5054 | #define PFA_SPEC_SSB_DISABLE 4 /* Speculative Store Bypass disabled */ |
5055 | #define PFA_SPEC_SSB_FORCE_DISABLE 5 /* Speculative Store Bypass force disabled */ |
5056 | +#define PFA_SPEC_IB_DISABLE 6 /* Indirect branch speculation restricted */ |
5057 | +#define PFA_SPEC_IB_FORCE_DISABLE 7 /* Indirect branch speculation permanently restricted */ |
5058 | |
5059 | |
5060 | #define TASK_PFA_TEST(name, func) \ |
5061 | @@ -2390,6 +2392,13 @@ TASK_PFA_CLEAR(SPEC_SSB_DISABLE, spec_ssb_disable) |
5062 | TASK_PFA_TEST(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) |
5063 | TASK_PFA_SET(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) |
5064 | |
5065 | +TASK_PFA_TEST(SPEC_IB_DISABLE, spec_ib_disable) |
5066 | +TASK_PFA_SET(SPEC_IB_DISABLE, spec_ib_disable) |
5067 | +TASK_PFA_CLEAR(SPEC_IB_DISABLE, spec_ib_disable) |
5068 | + |
5069 | +TASK_PFA_TEST(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable) |
5070 | +TASK_PFA_SET(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable) |
5071 | + |
5072 | /* |
5073 | * task->jobctl flags |
5074 | */ |
5075 | diff --git a/include/linux/sched/smt.h b/include/linux/sched/smt.h |
5076 | new file mode 100644 |
5077 | index 000000000000..559ac4590593 |
5078 | --- /dev/null |
5079 | +++ b/include/linux/sched/smt.h |
5080 | @@ -0,0 +1,20 @@ |
5081 | +/* SPDX-License-Identifier: GPL-2.0 */ |
5082 | +#ifndef _LINUX_SCHED_SMT_H |
5083 | +#define _LINUX_SCHED_SMT_H |
5084 | + |
5085 | +#include <linux/atomic.h> |
5086 | + |
5087 | +#ifdef CONFIG_SCHED_SMT |
5088 | +extern atomic_t sched_smt_present; |
5089 | + |
5090 | +static __always_inline bool sched_smt_active(void) |
5091 | +{ |
5092 | + return atomic_read(&sched_smt_present); |
5093 | +} |
5094 | +#else |
5095 | +static inline bool sched_smt_active(void) { return false; } |
5096 | +#endif |
5097 | + |
5098 | +void arch_smt_update(void); |
5099 | + |
5100 | +#endif |
5101 | diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h |
5102 | index 64776b72e1eb..64ec0d62e5f5 100644 |
5103 | --- a/include/uapi/linux/prctl.h |
5104 | +++ b/include/uapi/linux/prctl.h |
5105 | @@ -202,6 +202,7 @@ struct prctl_mm_map { |
5106 | #define PR_SET_SPECULATION_CTRL 53 |
5107 | /* Speculation control variants */ |
5108 | # define PR_SPEC_STORE_BYPASS 0 |
5109 | +# define PR_SPEC_INDIRECT_BRANCH 1 |
5110 | /* Return and control values for PR_SET/GET_SPECULATION_CTRL */ |
5111 | # define PR_SPEC_NOT_AFFECTED 0 |
5112 | # define PR_SPEC_PRCTL (1UL << 0) |
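PR_SPEC_INDIRECT_BRANCH extends the existing PR_SET_SPECULATION_CTRL interface, so a task can opt into restricted indirect branch speculation the same way it already could for store bypass. A usage sketch; the constants mirror the uapi header, and the fallbacks cover libcs that do not yet define them:

    #include <stdio.h>
    #include <sys/prctl.h>

    #ifndef PR_SET_SPECULATION_CTRL
    #define PR_GET_SPECULATION_CTRL 52
    #define PR_SET_SPECULATION_CTRL 53
    #endif
    #ifndef PR_SPEC_INDIRECT_BRANCH
    #define PR_SPEC_INDIRECT_BRANCH 1
    #endif
    #ifndef PR_SPEC_DISABLE
    #define PR_SPEC_PRCTL   (1UL << 0)
    #define PR_SPEC_ENABLE  (1UL << 1)
    #define PR_SPEC_DISABLE (1UL << 2)
    #endif

    int main(void)
    {
            /* restrict indirect branch speculation for this task */
            if (prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH,
                      PR_SPEC_DISABLE, 0, 0))
                    perror("PR_SET_SPECULATION_CTRL");

            long st = prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH,
                            0, 0, 0);
            printf("indirect branch speculation state: %#lx\n", st);
            return 0;
    }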
5113 | diff --git a/kernel/cpu.c b/kernel/cpu.c |
5114 | index bf24e8400903..db1a0bc46c3e 100644 |
5115 | --- a/kernel/cpu.c |
5116 | +++ b/kernel/cpu.c |
5117 | @@ -8,6 +8,7 @@ |
5118 | #include <linux/init.h> |
5119 | #include <linux/notifier.h> |
5120 | #include <linux/sched.h> |
5121 | +#include <linux/sched/smt.h> |
5122 | #include <linux/unistd.h> |
5123 | #include <linux/cpu.h> |
5124 | #include <linux/oom.h> |
5125 | @@ -356,6 +357,12 @@ void cpu_hotplug_enable(void) |
5126 | EXPORT_SYMBOL_GPL(cpu_hotplug_enable); |
5127 | #endif /* CONFIG_HOTPLUG_CPU */ |
5128 | |
5129 | +/* |
5130 | + * Architectures that need SMT-specific errata handling during SMT hotplug |
5131 | + * should override this. |
5132 | + */ |
5133 | +void __weak arch_smt_update(void) { } |
5134 | + |
5135 | #ifdef CONFIG_HOTPLUG_SMT |
5136 | enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED; |
5137 | EXPORT_SYMBOL_GPL(cpu_smt_control); |
5138 | @@ -1058,6 +1065,7 @@ out: |
5139 | /* This post dead nonsense must die */ |
5140 | if (!ret && hasdied) |
5141 | cpu_notify_nofail(CPU_POST_DEAD, cpu); |
5142 | + arch_smt_update(); |
5143 | return ret; |
5144 | } |
5145 | |
5146 | @@ -1177,6 +1185,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) |
5147 | ret = cpuhp_up_callbacks(cpu, st, target); |
5148 | out: |
5149 | cpu_hotplug_done(); |
5150 | + arch_smt_update(); |
5151 | return ret; |
5152 | } |
5153 | |
5154 | @@ -2012,8 +2021,10 @@ static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) |
5155 | */ |
5156 | cpuhp_offline_cpu_device(cpu); |
5157 | } |
5158 | - if (!ret) |
5159 | + if (!ret) { |
5160 | cpu_smt_control = ctrlval; |
5161 | + arch_smt_update(); |
5162 | + } |
5163 | cpu_maps_update_done(); |
5164 | return ret; |
5165 | } |
5166 | @@ -2024,6 +2035,7 @@ static int cpuhp_smt_enable(void) |
5167 | |
5168 | cpu_maps_update_begin(); |
5169 | cpu_smt_control = CPU_SMT_ENABLED; |
5170 | + arch_smt_update(); |
5171 | for_each_present_cpu(cpu) { |
5172 | /* Skip online CPUs and CPUs on offline nodes */ |
5173 | if (cpu_online(cpu) || !node_online(cpu_to_node(cpu))) |
5174 | @@ -2222,3 +2234,18 @@ void __init boot_cpu_hotplug_init(void) |
5175 | #endif |
5176 | this_cpu_write(cpuhp_state.state, CPUHP_ONLINE); |
5177 | } |
5178 | + |
5179 | +enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO; |
5180 | + |
5181 | +static int __init mitigations_parse_cmdline(char *arg) |
5182 | +{ |
5183 | + if (!strcmp(arg, "off")) |
5184 | + cpu_mitigations = CPU_MITIGATIONS_OFF; |
5185 | + else if (!strcmp(arg, "auto")) |
5186 | + cpu_mitigations = CPU_MITIGATIONS_AUTO; |
5187 | + else if (!strcmp(arg, "auto,nosmt")) |
5188 | + cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT; |
5189 | + |
5190 | + return 0; |
5191 | +} |
5192 | +early_param("mitigations", mitigations_parse_cmdline); |
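[Editor's note] Unrecognized values deliberately fall through and leave the default in place. Consumers do not compare against the enum directly; the full patch pairs this parser with small query helpers which, per the upstream version of include/linux/cpu.h, look like:

    /* For reference; defined in include/linux/cpu.h, not in this hunk: */
    static inline bool cpu_mitigations_off(void)
    {
            return cpu_mitigations == CPU_MITIGATIONS_OFF;
    }

    static inline bool cpu_mitigations_auto_nosmt(void)
    {
            return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT;
    }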
5193 | diff --git a/kernel/ptrace.c b/kernel/ptrace.c |
5194 | index f39a7be98fc1..efba851ee018 100644 |
5195 | --- a/kernel/ptrace.c |
5196 | +++ b/kernel/ptrace.c |
5197 | @@ -258,6 +258,9 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state) |
5198 | |
5199 | static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode) |
5200 | { |
5201 | + if (mode & PTRACE_MODE_SCHED) |
5202 | + return false; |
5203 | + |
5204 | if (mode & PTRACE_MODE_NOAUDIT) |
5205 | return has_ns_capability_noaudit(current, ns, CAP_SYS_PTRACE); |
5206 | else |
5207 | @@ -325,9 +328,16 @@ ok: |
5208 | !ptrace_has_cap(mm->user_ns, mode))) |
5209 | return -EPERM; |
5210 | |
5211 | + if (mode & PTRACE_MODE_SCHED) |
5212 | + return 0; |
5213 | return security_ptrace_access_check(task, mode); |
5214 | } |
5215 | |
5216 | +bool ptrace_may_access_sched(struct task_struct *task, unsigned int mode) |
5217 | +{ |
5218 | + return __ptrace_may_access(task, mode | PTRACE_MODE_SCHED); |
5219 | +} |
5220 | + |
5221 | bool ptrace_may_access(struct task_struct *task, unsigned int mode) |
5222 | { |
5223 | int err; |
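[Editor's note] PTRACE_MODE_SCHED exists because __ptrace_may_access() can be reached from the scheduler during a context switch, where sleeping locks and LSM callouts are off limits; the two early returns added above bypass exactly those paths. A hedged sketch of the kind of caller this enables (the helper and its policy are illustrative, not this patch's actual call site):

    /* Sketch only: decide on a context switch whether to flush indirect
     * branch predictions before running 'next'. */
    static bool ibpb_needed(struct task_struct *next)
    {
            /*
             * Skip the (expensive) barrier when the outgoing task would
             * have been allowed to ptrace the incoming one anyway, i.e.
             * both live in the same trust domain.
             */
            return !ptrace_may_access_sched(next, PTRACE_MODE_ATTACH);
    }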
5224 | diff --git a/kernel/sched/core.c b/kernel/sched/core.c |
5225 | index 6b3fff6a6437..50e80b1be2c8 100644 |
5226 | --- a/kernel/sched/core.c |
5227 | +++ b/kernel/sched/core.c |
5228 | @@ -7355,11 +7355,22 @@ static int cpuset_cpu_inactive(unsigned int cpu) |
5229 | return 0; |
5230 | } |
5231 | |
5232 | +#ifdef CONFIG_SCHED_SMT |
5233 | +atomic_t sched_smt_present = ATOMIC_INIT(0); |
5234 | +#endif |
5235 | + |
5236 | int sched_cpu_activate(unsigned int cpu) |
5237 | { |
5238 | struct rq *rq = cpu_rq(cpu); |
5239 | unsigned long flags; |
5240 | |
5241 | +#ifdef CONFIG_SCHED_SMT |
5242 | + /* |
5243 | + * When going up, increment the number of cores with SMT present. |
5244 | + */ |
5245 | + if (cpumask_weight(cpu_smt_mask(cpu)) == 2) |
5246 | + atomic_inc(&sched_smt_present); |
5247 | +#endif |
5248 | set_cpu_active(cpu, true); |
5249 | |
5250 | if (sched_smp_initialized) { |
5251 | @@ -7408,6 +7419,14 @@ int sched_cpu_deactivate(unsigned int cpu) |
5252 | else |
5253 | synchronize_rcu(); |
5254 | |
5255 | +#ifdef CONFIG_SCHED_SMT |
5256 | + /* |
5257 | + * When going down, decrement the number of cores with SMT present. |
5258 | + */ |
5259 | + if (cpumask_weight(cpu_smt_mask(cpu)) == 2) |
5260 | + atomic_dec(&sched_smt_present); |
5261 | +#endif |
5262 | + |
5263 | if (!sched_smp_initialized) |
5264 | return 0; |
5265 | |
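[Editor's note] The weight-of-two check is what makes sched_smt_present count cores rather than threads: cpu_smt_mask() covers a core's online siblings, so on 2-way SMT hardware its weight crosses 1 -> 2 exactly once per core (when the second sibling comes up) and still reads 2 just before the first of the pair goes down. The same condition written as a named predicate (hypothetical helper, not in the patch):

    /* Hypothetical helper naming the condition used in the hunks above. */
    static bool cpu_is_second_smt_sibling(unsigned int cpu)
    {
            /* True exactly at the per-core 1 <-> 2 online-sibling boundary. */
            return cpumask_weight(cpu_smt_mask(cpu)) == 2;
    }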
5266 | diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h |
5267 | index ec6e838e991a..15c08752926b 100644 |
5268 | --- a/kernel/sched/sched.h |
5269 | +++ b/kernel/sched/sched.h |
5270 | @@ -2,6 +2,7 @@ |
5271 | #include <linux/sched.h> |
5272 | #include <linux/sched/sysctl.h> |
5273 | #include <linux/sched/rt.h> |
5274 | +#include <linux/sched/smt.h> |
5275 | #include <linux/u64_stats_sync.h> |
5276 | #include <linux/sched/deadline.h> |
5277 | #include <linux/kernel_stat.h> |
5278 | diff --git a/tools/power/x86/turbostat/Makefile b/tools/power/x86/turbostat/Makefile |
5279 | index 8561e7ddca59..92be948c922d 100644 |
5280 | --- a/tools/power/x86/turbostat/Makefile |
5281 | +++ b/tools/power/x86/turbostat/Makefile |
5282 | @@ -8,7 +8,7 @@ ifeq ("$(origin O)", "command line") |
5283 | endif |
5284 | |
5285 | turbostat : turbostat.c |
5286 | -CFLAGS += -Wall |
5287 | +CFLAGS += -Wall -I../../../include |
5288 | CFLAGS += -DMSRHEADER='"../../../../arch/x86/include/asm/msr-index.h"' |
5289 | |
5290 | %: %.c |