Annotation of /trunk/kernel-alx/patches-4.9/0275-4.9.176-all-fixes.patch
Revision 3352 - Tue Jun 18 09:42:05 2019 UTC (5 years, 3 months ago) by niro
File size: 192806 byte(s)
-linux-4.9.176
1 | niro | 3352 | diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu |
2 | index 069e8d52c991..cadb7a9a5218 100644 | ||
3 | --- a/Documentation/ABI/testing/sysfs-devices-system-cpu | ||
4 | +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu | ||
5 | @@ -357,6 +357,7 @@ What: /sys/devices/system/cpu/vulnerabilities | ||
6 | /sys/devices/system/cpu/vulnerabilities/spectre_v2 | ||
7 | /sys/devices/system/cpu/vulnerabilities/spec_store_bypass | ||
8 | /sys/devices/system/cpu/vulnerabilities/l1tf | ||
9 | + /sys/devices/system/cpu/vulnerabilities/mds | ||
10 | Date: January 2018 | ||
11 | Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org> | ||
12 | Description: Information about CPU vulnerabilities | ||
13 | @@ -369,8 +370,7 @@ Description: Information about CPU vulnerabilities | ||
14 | "Vulnerable" CPU is affected and no mitigation in effect | ||
15 | "Mitigation: $M" CPU is affected and mitigation $M is in effect | ||
16 | |||
17 | - Details about the l1tf file can be found in | ||
18 | - Documentation/admin-guide/l1tf.rst | ||
19 | + See also: Documentation/hw-vuln/index.rst | ||
20 | |||
21 | What: /sys/devices/system/cpu/smt | ||
22 | /sys/devices/system/cpu/smt/active | ||
23 | diff --git a/Documentation/hw-vuln/index.rst b/Documentation/hw-vuln/index.rst | ||
24 | new file mode 100644 | ||
25 | index 000000000000..ffc064c1ec68 | ||
26 | --- /dev/null | ||
27 | +++ b/Documentation/hw-vuln/index.rst | ||
28 | @@ -0,0 +1,13 @@ | ||
29 | +======================== | ||
30 | +Hardware vulnerabilities | ||
31 | +======================== | ||
32 | + | ||
33 | +This section describes CPU vulnerabilities and provides an overview of the | ||
34 | +possible mitigations along with guidance for selecting mitigations if they | ||
35 | +are configurable at compile, boot or run time. | ||
36 | + | ||
37 | +.. toctree:: | ||
38 | + :maxdepth: 1 | ||
39 | + | ||
40 | + l1tf | ||
41 | + mds | ||
42 | diff --git a/Documentation/hw-vuln/l1tf.rst b/Documentation/hw-vuln/l1tf.rst | ||
43 | new file mode 100644 | ||
44 | index 000000000000..31653a9f0e1b | ||
45 | --- /dev/null | ||
46 | +++ b/Documentation/hw-vuln/l1tf.rst | ||
47 | @@ -0,0 +1,615 @@ | ||
48 | +L1TF - L1 Terminal Fault | ||
49 | +======================== | ||
50 | + | ||
51 | +L1 Terminal Fault is a hardware vulnerability which allows unprivileged | ||
52 | +speculative access to data which is available in the Level 1 Data Cache | ||
53 | +when the page table entry controlling the virtual address, which is used | ||
54 | +for the access, has the Present bit cleared or other reserved bits set. | ||
55 | + | ||
56 | +Affected processors | ||
57 | +------------------- | ||
58 | + | ||
59 | +This vulnerability affects a wide range of Intel processors. The | ||
60 | +vulnerability is not present on: | ||
61 | + | ||
62 | + - Processors from AMD, Centaur and other non Intel vendors | ||
63 | + | ||
64 | + - Older processor models, where the CPU family is < 6 | ||
65 | + | ||
66 | + - A range of Intel ATOM processors (Cedarview, Cloverview, Lincroft, | ||
67 | + Penwell, Pineview, Silvermont, Airmont, Merrifield) | ||
68 | + | ||
69 | + - The Intel XEON PHI family | ||
70 | + | ||
71 | + - Intel processors which have the ARCH_CAP_RDCL_NO bit set in the | ||
72 | + IA32_ARCH_CAPABILITIES MSR. If the bit is set the CPU is not affected | ||
73 | + by the Meltdown vulnerability either. These CPUs should become | ||
74 | + available by end of 2018. | ||
75 | + | ||
76 | +Whether a processor is affected or not can be read out from the L1TF | ||
77 | +vulnerability file in sysfs. See :ref:`l1tf_sys_info`. | ||
78 | + | ||
79 | +Related CVEs | ||
80 | +------------ | ||
81 | + | ||
82 | +The following CVE entries are related to the L1TF vulnerability: | ||
83 | + | ||
84 | + ============= ================= ============================== | ||
85 | + CVE-2018-3615 L1 Terminal Fault SGX related aspects | ||
86 | + CVE-2018-3620 L1 Terminal Fault OS, SMM related aspects | ||
87 | + CVE-2018-3646 L1 Terminal Fault Virtualization related aspects | ||
88 | + ============= ================= ============================== | ||
89 | + | ||
90 | +Problem | ||
91 | +------- | ||
92 | + | ||
93 | +If an instruction accesses a virtual address for which the relevant page | ||
94 | +table entry (PTE) has the Present bit cleared or other reserved bits set, | ||
95 | +then speculative execution ignores the invalid PTE and loads the referenced | ||
96 | +data if it is present in the Level 1 Data Cache, as if the page referenced | ||
97 | +by the address bits in the PTE was still present and accessible. | ||
98 | + | ||
99 | +While this is a purely speculative mechanism and the instruction will raise | ||
100 | +a page fault when it is retired eventually, the pure act of loading the | ||
101 | +data and making it available to other speculative instructions opens up the | ||
102 | +opportunity for side channel attacks to unprivileged malicious code, | ||
103 | +similar to the Meltdown attack. | ||
104 | + | ||
105 | +While Meltdown breaks the user space to kernel space protection, L1TF | ||
106 | +allows attacking any physical memory address in the system, and the attack | ||
107 | +works across all protection domains. It allows an attack on SGX and also | ||
108 | +works from inside virtual machines because the speculation bypasses the | ||
109 | +extended page table (EPT) protection mechanism. | ||
110 | + | ||
111 | + | ||
112 | +Attack scenarios | ||
113 | +---------------- | ||
114 | + | ||
115 | +1. Malicious user space | ||
116 | +^^^^^^^^^^^^^^^^^^^^^^^ | ||
117 | + | ||
118 | + Operating Systems store arbitrary information in the address bits of a | ||
119 | + PTE which is marked non present. This allows a malicious user space | ||
120 | + application to attack the physical memory to which these PTEs resolve. | ||
121 | + In some cases user-space can maliciously influence the information | ||
122 | + encoded in the address bits of the PTE, thus making attacks more | ||
123 | + deterministic and more practical. | ||
124 | + | ||
125 | + The Linux kernel contains a mitigation for this attack vector, PTE | ||
126 | + inversion, which is permanently enabled and has no performance | ||
127 | + impact. The kernel ensures that the address bits of PTEs, which are not | ||
128 | + marked present, never point to cacheable physical memory space. | ||
129 | + | ||
130 | + A system with an up to date kernel is protected against attacks from | ||
131 | + malicious user space applications. | ||
132 | + | ||
133 | +2. Malicious guest in a virtual machine | ||
134 | +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||
135 | + | ||
136 | + The fact that L1TF breaks all domain protections allows malicious guest | ||
137 | + OSes, which can control the PTEs directly, and malicious guest user | ||
138 | + space applications, which run on an unprotected guest kernel lacking the | ||
139 | + PTE inversion mitigation for L1TF, to attack physical host memory. | ||
140 | + | ||
141 | + A special aspect of L1TF in the context of virtualization is symmetric | ||
142 | + multi threading (SMT). The Intel implementation of SMT is called | ||
143 | + HyperThreading. The fact that Hyperthreads on the affected processors | ||
144 | +share the L1 Data Cache (L1D) is important for this. As the flaw allows | ||
145 | +attacking only data which is present in the L1D, a malicious guest running | ||
146 | + on one Hyperthread can attack the data which is brought into the L1D by | ||
147 | + the context which runs on the sibling Hyperthread of the same physical | ||
148 | + core. This context can be host OS, host user space or a different guest. | ||
149 | + | ||
150 | + If the processor does not support Extended Page Tables, the attack is | ||
151 | + only possible when the hypervisor does not sanitize the content of the | ||
152 | + effective (shadow) page tables. | ||
153 | + | ||
154 | + While solutions exist to mitigate these attack vectors fully, these | ||
155 | + mitigations are not enabled by default in the Linux kernel because they | ||
156 | + can affect performance significantly. The kernel provides several | ||
157 | + mechanisms which can be utilized to address the problem depending on the | ||
158 | + deployment scenario. The mitigations, their protection scope and impact | ||
159 | + are described in the next sections. | ||
160 | + | ||
161 | + The default mitigations and the rationale for choosing them are explained | ||
162 | + at the end of this document. See :ref:`default_mitigations`. | ||
163 | + | ||
164 | +.. _l1tf_sys_info: | ||
165 | + | ||
166 | +L1TF system information | ||
167 | +----------------------- | ||
168 | + | ||
169 | +The Linux kernel provides a sysfs interface to enumerate the current L1TF | ||
170 | +status of the system: whether the system is vulnerable, and which | ||
171 | +mitigations are active. The relevant sysfs file is: | ||
172 | + | ||
173 | +/sys/devices/system/cpu/vulnerabilities/l1tf | ||
174 | + | ||
175 | +The possible values in this file are: | ||
176 | + | ||
177 | + =========================== =============================== | ||
178 | + 'Not affected' The processor is not vulnerable | ||
179 | + 'Mitigation: PTE Inversion' The host protection is active | ||
180 | + =========================== =============================== | ||
181 | + | ||
182 | +If KVM/VMX is enabled and the processor is vulnerable then the following | ||
183 | +information is appended to the 'Mitigation: PTE Inversion' part: | ||
184 | + | ||
185 | + - SMT status: | ||
186 | + | ||
187 | + ===================== ================ | ||
188 | + 'VMX: SMT vulnerable' SMT is enabled | ||
189 | + 'VMX: SMT disabled' SMT is disabled | ||
190 | + ===================== ================ | ||
191 | + | ||
192 | + - L1D Flush mode: | ||
193 | + | ||
194 | + ================================ ==================================== | ||
195 | + 'L1D vulnerable' L1D flushing is disabled | ||
196 | + | ||
197 | + 'L1D conditional cache flushes' L1D flush is conditionally enabled | ||
198 | + | ||
199 | + 'L1D cache flushes' L1D flush is unconditionally enabled | ||
200 | + ================================ ==================================== | ||
201 | + | ||
202 | +The resulting grade of protection is discussed in the following sections. | ||
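
For example, a minimal C sketch that reads this sysfs file and prints the
reported state (only the path documented above is assumed; error handling
kept short) could look like::

   #include <stdio.h>

   int main(void)
   {
       char line[256];
       FILE *f = fopen("/sys/devices/system/cpu/vulnerabilities/l1tf", "r");

       if (!f) {
           perror("l1tf sysfs file");   /* file absent on old kernels */
           return 1;
       }
       if (fgets(line, sizeof(line), f))
           printf("L1TF state: %s", line);
       fclose(f);
       return 0;
   }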
203 | + | ||
204 | + | ||
205 | +Host mitigation mechanism | ||
206 | +------------------------- | ||
207 | + | ||
208 | +The kernel is unconditionally protected against L1TF attacks from malicious | ||
209 | +user space running on the host. | ||
210 | + | ||
211 | + | ||
212 | +Guest mitigation mechanisms | ||
213 | +--------------------------- | ||
214 | + | ||
215 | +.. _l1d_flush: | ||
216 | + | ||
217 | +1. L1D flush on VMENTER | ||
218 | +^^^^^^^^^^^^^^^^^^^^^^^ | ||
219 | + | ||
220 | + To make sure that a guest cannot attack data which is present in the L1D | ||
221 | + the hypervisor flushes the L1D before entering the guest. | ||
222 | + | ||
223 | + Flushing the L1D evicts not only the data which should not be accessed | ||
224 | + by a potentially malicious guest, but also the guest | ||
225 | + data. Flushing the L1D has a performance impact as the processor has to | ||
226 | + bring the flushed guest data back into the L1D. Depending on the | ||
227 | + frequency of VMEXIT/VMENTER and the type of computations in the guest, | ||
228 | + performance degradation in the range of 1% to 50% has been observed. For | ||
229 | + scenarios where guest VMEXIT/VMENTER are rare the performance impact is | ||
230 | + minimal. Virtio and mechanisms like posted interrupts are designed to | ||
231 | + confine the VMEXITs to a bare minimum, but specific configurations and | ||
232 | + application scenarios might still suffer from a high VMEXIT rate. | ||
233 | + | ||
234 | + The kernel provides two L1D flush modes: | ||
235 | + - conditional ('cond') | ||
236 | + - unconditional ('always') | ||
237 | + | ||
238 | + The conditional mode avoids L1D flushing after VMEXITs which execute | ||
239 | + only audited code paths before the corresponding VMENTER. These code | ||
240 | + paths have been verified not to expose secrets or other | ||
241 | + interesting data to an attacker, but they can leak information about the | ||
242 | + address space layout of the hypervisor. | ||
243 | + | ||
244 | + Unconditional mode flushes L1D on all VMENTER invocations and provides | ||
245 | + maximum protection. It has a higher overhead than the conditional | ||
246 | + mode. The overhead cannot be quantified correctly as it depends on the | ||
247 | + workload scenario and the resulting number of VMEXITs. | ||
248 | + | ||
249 | + The general recommendation is to enable L1D flush on VMENTER. The kernel | ||
250 | + defaults to conditional mode on affected processors. | ||
251 | + | ||
252 | + **Note** that L1D flush does not prevent the SMT problem because the | ||
253 | + sibling thread will also bring back its data into the L1D which makes it | ||
254 | + attackable again. | ||
255 | + | ||
256 | + L1D flush can be controlled by the administrator via the kernel command | ||
257 | + line and sysfs control files. See :ref:`mitigation_control_command_line` | ||
258 | + and :ref:`mitigation_control_kvm`. | ||
259 | + | ||
260 | +.. _guest_confinement: | ||
261 | + | ||
262 | +2. Guest VCPU confinement to dedicated physical cores | ||
263 | +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||
264 | + | ||
265 | + To address the SMT problem, it is possible to make a guest or a group of | ||
266 | + guests affine to one or more physical cores. The proper mechanism for | ||
267 | + that is to utilize exclusive cpusets to ensure that no other guest or | ||
268 | + host tasks can run on these cores. | ||
269 | + | ||
270 | + If only a single guest or related guests run on sibling SMT threads on | ||
271 | + the same physical core then they can only attack their own memory and | ||
272 | + restricted parts of the host memory. | ||
273 | + | ||
274 | + Host memory is attackable, when one of the sibling SMT threads runs in | ||
275 | + host OS (hypervisor) context and the other in guest context. The amount | ||
276 | + of valuable information from the host OS context depends on the context | ||
277 | + which the host OS executes, i.e. interrupts, soft interrupts and kernel | ||
278 | + threads. The amount of valuable data from these contexts cannot be | ||
279 | + declared as non-interesting for an attacker without deep inspection of | ||
280 | + the code. | ||
281 | + | ||
282 | + **Note** that assigning guests to a fixed set of physical cores affects | ||
283 | + the ability of the scheduler to do load balancing and might have | ||
284 | + negative effects on CPU utilization depending on the hosting | ||
285 | + scenario. Disabling SMT might be a viable alternative for particular | ||
286 | + scenarios. | ||
287 | + | ||
288 | + For further information about confining guests to a single or to a group | ||
289 | + of cores consult the cpusets documentation: | ||
290 | + | ||
291 | + https://www.kernel.org/doc/Documentation/cgroup-v1/cpusets.txt | ||
292 | + | ||
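As a hedged sketch of such a setup (the cgroup-v1 mount point
/sys/fs/cgroup/cpuset, the group name "guest0" and the CPU/node numbers are
illustrative assumptions, not prescribed by the patch)::

   #include <errno.h>
   #include <stdio.h>
   #include <sys/stat.h>

   static const char *base = "/sys/fs/cgroup/cpuset/guest0";

   /* Write one value into a cpuset control file below "base". */
   static int put(const char *file, const char *val)
   {
       char path[256];
       FILE *f;

       snprintf(path, sizeof(path), "%s/%s", base, file);
       f = fopen(path, "w");
       if (!f)
           return -1;
       fputs(val, f);
       return fclose(f);
   }

   int main(void)
   {
       if (mkdir(base, 0755) && errno != EEXIST) {
           perror("mkdir cpuset");
           return 1;
       }
       put("cpuset.cpus", "2-3");        /* both siblings of one core */
       put("cpuset.mems", "0");
       put("cpuset.cpu_exclusive", "1"); /* no other tasks on these CPUs */
       /* VCPU thread IDs would then be written into the "tasks" file. */
       return 0;
   }
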
293 | +.. _interrupt_isolation: | ||
294 | + | ||
295 | +3. Interrupt affinity | ||
296 | +^^^^^^^^^^^^^^^^^^^^^ | ||
297 | + | ||
298 | + Interrupts can be made affine to logical CPUs. This is not universally | ||
299 | + true because there are types of interrupts which are truly per CPU | ||
300 | + interrupts, e.g. the local timer interrupt. Apart from that, multi-queue | ||
301 | + devices affine their interrupts to single CPUs or groups of CPUs per | ||
302 | + queue without allowing the administrator to control the affinities. | ||
303 | + | ||
304 | + Moving the interrupts, which can be affinity controlled, away from CPUs | ||
305 | + which run untrusted guests, reduces the attack vector space. | ||
306 | + | ||
307 | + Whether the interrupts which are affine to CPUs running untrusted | ||
308 | + guests provide interesting data for an attacker depends on the system | ||
309 | + configuration and the scenarios which run on the system. While for some | ||
310 | + of the interrupts it can be assumed that they won't expose interesting | ||
311 | + information beyond exposing hints about the host OS memory layout, there | ||
312 | + is no way to make general assumptions. | ||
313 | + | ||
314 | + Interrupt affinity can be controlled by the administrator via the | ||
315 | + /proc/irq/$NR/smp_affinity[_list] files. Limited documentation is | ||
316 | + available at: | ||
317 | + | ||
318 | + https://www.kernel.org/doc/Documentation/IRQ-affinity.txt | ||
319 | + | ||
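As a short illustration of the mechanism (the IRQ number 42 and the CPU
mask 0x3 are placeholders), an affinity can be set by writing a hex mask::

   #include <stdio.h>

   int main(void)
   {
       /* Keep hypothetical IRQ 42 on CPUs 0-1, away from guest CPUs. */
       FILE *f = fopen("/proc/irq/42/smp_affinity", "w");

       if (!f) {
           perror("smp_affinity");
           return 1;
       }
       fputs("3\n", f);
       return fclose(f) ? 1 : 0;
   }
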
320 | +.. _smt_control: | ||
321 | + | ||
322 | +4. SMT control | ||
323 | +^^^^^^^^^^^^^^ | ||
324 | + | ||
325 | + To prevent the SMT issues of L1TF it might be necessary to disable SMT | ||
326 | + completely. Disabling SMT can have a significant performance impact, but | ||
327 | + the impact depends on the hosting scenario and the type of workloads. | ||
328 | + The impact of disabling SMT needs also to be weighed against the impact | ||
329 | + of other mitigation solutions like confining guests to dedicated cores. | ||
330 | + | ||
331 | + The kernel provides a sysfs interface to retrieve the status of SMT and | ||
332 | + to control it. It also provides a kernel command line interface to | ||
333 | + control SMT. | ||
334 | + | ||
335 | + The kernel command line interface consists of the following options: | ||
336 | + | ||
337 | + =========== ========================================================== | ||
338 | + nosmt Affects the bring up of the secondary CPUs during boot. The | ||
339 | + kernel tries to bring all present CPUs online during the | ||
340 | + boot process. "nosmt" makes sure that from each physical | ||
341 | + core only one - the so-called primary (hyper) thread - is | ||
342 | + activated. Due to a design flaw of Intel processors related | ||
343 | + to Machine Check Exceptions, the non-primary siblings have | ||
344 | + to be brought up at least partially and are then shut down | ||
345 | + again. "nosmt" can be undone via the sysfs interface. | ||
346 | + | ||
347 | + nosmt=force Has the same effect as "nosmt" but it does not allow | ||
348 | + undoing the SMT disable via the sysfs interface. | ||
349 | + =========== ========================================================== | ||
350 | + | ||
351 | + The sysfs interface provides two files: | ||
352 | + | ||
353 | + - /sys/devices/system/cpu/smt/control | ||
354 | + - /sys/devices/system/cpu/smt/active | ||
355 | + | ||
356 | + /sys/devices/system/cpu/smt/control: | ||
357 | + | ||
358 | + This file allows reading out the SMT control state and provides the | ||
359 | + ability to disable or (re)enable SMT. The possible states are: | ||
360 | + | ||
361 | + ============== =================================================== | ||
362 | + on SMT is supported by the CPU and enabled. All | ||
363 | + logical CPUs can be onlined and offlined without | ||
364 | + restrictions. | ||
365 | + | ||
366 | + off SMT is supported by the CPU and disabled. Only | ||
367 | + the so called primary SMT threads can be onlined | ||
368 | + and offlined without restrictions. An attempt to | ||
369 | + online a non-primary sibling is rejected. | ||
370 | + | ||
371 | + forceoff Same as 'off' but the state cannot be controlled. | ||
372 | + Attempts to write to the control file are rejected. | ||
373 | + | ||
374 | + notsupported The processor does not support SMT. It's therefore | ||
375 | + not affected by the SMT implications of L1TF. | ||
376 | + Attempts to write to the control file are rejected. | ||
377 | + ============== =================================================== | ||
378 | + | ||
379 | + The possible states which can be written into this file to control SMT | ||
380 | + state are: | ||
381 | + | ||
382 | + - on | ||
383 | + - off | ||
384 | + - forceoff | ||
385 | + | ||
386 | + /sys/devices/system/cpu/smt/active: | ||
387 | + | ||
388 | + This file reports whether SMT is enabled and active, i.e. if on any | ||
389 | + physical core two or more sibling threads are online. | ||
390 | + | ||
391 | + SMT control is also possible at boot time via the l1tf kernel command | ||
392 | + line parameter in combination with L1D flush control. See | ||
393 | + :ref:`mitigation_control_command_line`. | ||
394 | + | ||
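A small C sketch tying the two files together (paths as documented above;
requires root, and the write is rejected in the 'forceoff' and
'notsupported' states)::

   #include <stdio.h>
   #include <string.h>

   int main(void)
   {
       char buf[64] = "";
       FILE *f = fopen("/sys/devices/system/cpu/smt/control", "r+");

       if (!f) {
           perror("smt/control");
           return 1;
       }
       if (fgets(buf, sizeof(buf), f))
           printf("current SMT control state: %s", buf);
       fseek(f, 0, SEEK_SET);          /* required between read and write */
       if (fputs("off\n", f) == EOF || fclose(f))
           perror("disabling SMT");    /* e.g. state is forceoff */
       return 0;
   }
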
395 | +5. Disabling EPT | ||
396 | +^^^^^^^^^^^^^^^^ | ||
397 | + | ||
398 | + Disabling EPT for virtual machines provides full mitigation for L1TF even | ||
399 | + with SMT enabled, because the effective page tables for guests are | ||
400 | + managed and sanitized by the hypervisor. However, disabling EPT has a | ||
401 | + significant performance impact, especially when the Meltdown mitigation | ||
402 | + KPTI is enabled. | ||
403 | + | ||
404 | + EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter. | ||
405 | + | ||
406 | +There is ongoing research and development for new mitigation mechanisms to | ||
407 | +address the performance impact of disabling SMT or EPT. | ||
408 | + | ||
409 | +.. _mitigation_control_command_line: | ||
410 | + | ||
411 | +Mitigation control on the kernel command line | ||
412 | +--------------------------------------------- | ||
413 | + | ||
414 | +The kernel command line allows controlling the L1TF mitigations at boot | ||
415 | +time with the option "l1tf=". The valid arguments for this option are: | ||
416 | + | ||
417 | + ============ ============================================================= | ||
418 | + full Provides all available mitigations for the L1TF | ||
419 | + vulnerability. Disables SMT and enables all mitigations in | ||
420 | + the hypervisors, i.e. unconditional L1D flushing | ||
421 | + | ||
422 | + SMT control and L1D flush control via the sysfs interface | ||
423 | + is still possible after boot. Hypervisors will issue a | ||
424 | + warning when the first VM is started in a potentially | ||
425 | + insecure configuration, i.e. SMT enabled or L1D flush | ||
426 | + disabled. | ||
427 | + | ||
428 | + full,force Same as 'full', but disables SMT and L1D flush runtime | ||
429 | + control. Implies the 'nosmt=force' command line option. | ||
430 | + (i.e. sysfs control of SMT is disabled.) | ||
431 | + | ||
432 | + flush Leaves SMT enabled and enables the default hypervisor | ||
433 | + mitigation, i.e. conditional L1D flushing | ||
434 | + | ||
435 | + SMT control and L1D flush control via the sysfs interface | ||
436 | + is still possible after boot. Hypervisors will issue a | ||
437 | + warning when the first VM is started in a potentially | ||
438 | + insecure configuration, i.e. SMT enabled or L1D flush | ||
439 | + disabled. | ||
440 | + | ||
441 | + flush,nosmt Disables SMT and enables the default hypervisor mitigation, | ||
442 | + i.e. conditional L1D flushing. | ||
443 | + | ||
444 | + SMT control and L1D flush control via the sysfs interface | ||
445 | + is still possible after boot. Hypervisors will issue a | ||
446 | + warning when the first VM is started in a potentially | ||
447 | + insecure configuration, i.e. SMT enabled or L1D flush | ||
448 | + disabled. | ||
449 | + | ||
450 | + flush,nowarn Same as 'flush', but hypervisors will not warn when a VM is | ||
451 | + started in a potentially insecure configuration. | ||
452 | + | ||
453 | + off Disables hypervisor mitigations and doesn't emit any | ||
454 | + warnings. | ||
455 | + It also drops the swap size and available RAM limit restrictions | ||
456 | + on both hypervisor and bare metal. | ||
457 | + | ||
458 | + ============ ============================================================= | ||
459 | + | ||
460 | +The default is 'flush'. For details about L1D flushing see :ref:`l1d_flush`. | ||
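
For example, to keep the default conditional L1D flushing but force SMT off
on a host running untrusted guests, the following could be appended to the
kernel command line (an illustrative choice, see the table above)::

   l1tf=flush,nosmt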
461 | + | ||
462 | + | ||
463 | +.. _mitigation_control_kvm: | ||
464 | + | ||
465 | +Mitigation control for KVM - module parameter | ||
466 | +--------------------------------------------- | ||
467 | + | ||
468 | +The KVM hypervisor mitigation mechanism, flushing the L1D cache when | ||
469 | +entering a guest, can be controlled with a module parameter. | ||
470 | + | ||
471 | +The option/parameter is "kvm-intel.vmentry_l1d_flush=". It takes the | ||
472 | +following arguments: | ||
473 | + | ||
474 | + ============ ============================================================== | ||
475 | + always L1D cache flush on every VMENTER. | ||
476 | + | ||
477 | + cond Flush L1D on VMENTER only when the code between VMEXIT and | ||
478 | + VMENTER can leak host memory which is considered | ||
479 | + interesting for an attacker. This still can leak host memory | ||
480 | + which allows e.g. determining the host's address space layout. | ||
481 | + | ||
482 | + never Disables the mitigation | ||
483 | + ============ ============================================================== | ||
484 | + | ||
485 | +The parameter can be provided on the kernel command line, as a module | ||
486 | +parameter when loading the module, and modified at runtime via the sysfs | ||
487 | +file: | ||
488 | + | ||
489 | +/sys/module/kvm_intel/parameters/vmentry_l1d_flush | ||
490 | + | ||
491 | +The default is 'cond'. If 'l1tf=full,force' is given on the kernel command | ||
492 | +line, then 'always' is enforced and the kvm-intel.vmentry_l1d_flush | ||
493 | +module parameter is ignored and writes to the sysfs file are rejected. | ||
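
A hedged sketch of switching the mode at runtime; under 'l1tf=full,force'
the write is rejected, which the example reports::

   #include <stdio.h>

   int main(void)
   {
       const char *p = "/sys/module/kvm_intel/parameters/vmentry_l1d_flush";
       FILE *f = fopen(p, "w");

       if (!f) {
           perror("open parameter");   /* kvm_intel not loaded? */
           return 1;
       }
       /* Request unconditional flushing on every VMENTER. */
       if (fputs("always\n", f) == EOF || fclose(f))
           perror("write rejected");
       return 0;
   }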
494 | + | ||
495 | +.. _mitigation_selection: | ||
496 | + | ||
497 | +Mitigation selection guide | ||
498 | +-------------------------- | ||
499 | + | ||
500 | +1. No virtualization in use | ||
501 | +^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||
502 | + | ||
503 | + The system is protected by the kernel unconditionally and no further | ||
504 | + action is required. | ||
505 | + | ||
506 | +2. Virtualization with trusted guests | ||
507 | +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||
508 | + | ||
509 | + If the guest comes from a trusted source and the guest OS kernel is | ||
510 | + guaranteed to have the L1TF mitigations in place the system is fully | ||
511 | + protected against L1TF and no further action is required. | ||
512 | + | ||
513 | + To avoid the overhead of the default L1D flushing on VMENTER the | ||
514 | + administrator can disable the flushing via the kernel command line and | ||
515 | + sysfs control files. See :ref:`mitigation_control_command_line` and | ||
516 | + :ref:`mitigation_control_kvm`. | ||
517 | + | ||
518 | + | ||
519 | +3. Virtualization with untrusted guests | ||
520 | +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||
521 | + | ||
522 | +3.1. SMT not supported or disabled | ||
523 | +"""""""""""""""""""""""""""""""""" | ||
524 | + | ||
525 | + If SMT is not supported by the processor or disabled in the BIOS or by | ||
526 | + the kernel, it's only required to enforce L1D flushing on VMENTER. | ||
527 | + | ||
528 | + Conditional L1D flushing is the default behaviour and can be tuned. See | ||
529 | + :ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`. | ||
530 | + | ||
531 | +3.2. EPT not supported or disabled | ||
532 | +"""""""""""""""""""""""""""""""""" | ||
533 | + | ||
534 | + If EPT is not supported by the processor or disabled in the hypervisor, | ||
535 | + the system is fully protected. SMT can stay enabled and L1D flushing on | ||
536 | + VMENTER is not required. | ||
537 | + | ||
538 | + EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter. | ||
539 | + | ||
540 | +3.3. SMT and EPT supported and active | ||
541 | +""""""""""""""""""""""""""""""""""""" | ||
542 | + | ||
543 | + If SMT and EPT are supported and active then various degrees of | ||
544 | + mitigations can be employed: | ||
545 | + | ||
546 | + - L1D flushing on VMENTER: | ||
547 | + | ||
548 | + L1D flushing on VMENTER is the minimal protection requirement, but it | ||
549 | + is only potent in combination with other mitigation methods. | ||
550 | + | ||
551 | + Conditional L1D flushing is the default behaviour and can be tuned. See | ||
552 | + :ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`. | ||
553 | + | ||
554 | + - Guest confinement: | ||
555 | + | ||
556 | + Confinement of guests to a single or a group of physical cores which | ||
557 | + are not running any other processes, can reduce the attack surface | ||
558 | + significantly, but interrupts, soft interrupts and kernel threads can | ||
559 | + still expose valuable data to a potential attacker. See | ||
560 | + :ref:`guest_confinement`. | ||
561 | + | ||
562 | + - Interrupt isolation: | ||
563 | + | ||
564 | + Isolating the guest CPUs from interrupts can reduce the attack surface | ||
565 | + further, but still allows a malicious guest to explore a limited amount | ||
566 | + of host physical memory. This can at least be used to gain knowledge | ||
567 | + about the host address space layout. The interrupts which have a fixed | ||
568 | + affinity to the CPUs which run the untrusted guests can, depending on | ||
569 | + the scenario, still trigger soft interrupts and schedule kernel threads | ||
570 | + which might expose valuable information. See | ||
571 | + :ref:`interrupt_isolation`. | ||
572 | + | ||
573 | +The above three mitigation methods combined can provide protection to a | ||
574 | +certain degree, but the risk of the remaining attack surface has to be | ||
575 | +carefully analyzed. For full protection the following methods are | ||
576 | +available: | ||
577 | + | ||
578 | + - Disabling SMT: | ||
579 | + | ||
580 | + Disabling SMT and enforcing the L1D flushing provides the maximum | ||
581 | + amount of protection. This mitigation does not depend on any of the | ||
582 | + above mitigation methods. | ||
583 | + | ||
584 | + SMT control and L1D flushing can be tuned by the command line | ||
585 | + parameters 'nosmt', 'l1tf', 'kvm-intel.vmentry_l1d_flush' and at run | ||
586 | + time with the matching sysfs control files. See :ref:`smt_control`, | ||
587 | + :ref:`mitigation_control_command_line` and | ||
588 | + :ref:`mitigation_control_kvm`. | ||
589 | + | ||
590 | + - Disabling EPT: | ||
591 | + | ||
592 | + Disabling EPT provides the maximum amount of protection as well. It does | ||
593 | + not depend on any of the above mitigation methods. SMT can stay | ||
594 | + enabled and L1D flushing is not required, but the performance impact is | ||
595 | + significant. | ||
596 | + | ||
597 | + EPT can be disabled in the hypervisor via the 'kvm-intel.ept' | ||
598 | + parameter. | ||
599 | + | ||
600 | +3.4. Nested virtual machines | ||
601 | +"""""""""""""""""""""""""""" | ||
602 | + | ||
603 | +When nested virtualization is in use, three operating systems are involved: | ||
604 | +the bare metal hypervisor, the nested hypervisor and the nested virtual | ||
605 | +machine. VMENTER operations from the nested hypervisor into the nested | ||
606 | +guest will always be processed by the bare metal hypervisor. If KVM is the | ||
607 | +bare metal hypervisor it will: | ||
608 | + | ||
609 | + - Flush the L1D cache on every switch from the nested hypervisor to the | ||
610 | + nested virtual machine, so that the nested hypervisor's secrets are not | ||
611 | + exposed to the nested virtual machine; | ||
612 | + | ||
613 | + - Flush the L1D cache on every switch from the nested virtual machine to | ||
614 | + the nested hypervisor; this is a complex operation, and flushing the L1D | ||
615 | + cache prevents the bare metal hypervisor's secrets from being exposed | ||
616 | + to the nested virtual machine; | ||
617 | + | ||
618 | + - Instruct the nested hypervisor to not perform any L1D cache flush. This | ||
619 | + is an optimization to avoid double L1D flushing. | ||
620 | + | ||
621 | + | ||
622 | +.. _default_mitigations: | ||
623 | + | ||
624 | +Default mitigations | ||
625 | +------------------- | ||
626 | + | ||
627 | + The kernel default mitigations for vulnerable processors are: | ||
628 | + | ||
629 | + - PTE inversion to protect against malicious user space. This is done | ||
630 | + unconditionally and cannot be controlled. The swap storage is limited | ||
631 | + to ~16TB. | ||
632 | + | ||
633 | + - L1D conditional flushing on VMENTER when EPT is enabled for | ||
634 | + a guest. | ||
635 | + | ||
636 | + The kernel does not by default enforce the disabling of SMT, which leaves | ||
637 | + SMT systems vulnerable when running untrusted guests with EPT enabled. | ||
638 | + | ||
639 | + The rationale for this choice is: | ||
640 | + | ||
641 | + - Force disabling SMT can break existing setups, especially with | ||
642 | + unattended updates. | ||
643 | + | ||
644 | + - If regular users run untrusted guests on their machine, then L1TF is | ||
645 | + just an add on to other malware which might be embedded in an untrusted | ||
646 | + guest, e.g. spam-bots or attacks on the local network. | ||
647 | + | ||
648 | + There is no technical way to prevent a user from running untrusted code | ||
649 | + on their machines blindly. | ||
650 | + | ||
651 | + - It's technically extremely unlikely and from today's knowledge even | ||
652 | + impossible that L1TF can be exploited via the most popular attack | ||
653 | + mechanisms like JavaScript because these mechanisms have no way to | ||
654 | + control PTEs. If this were possible and no other mitigation were | ||
655 | + available, then the default might be different. | ||
656 | + | ||
657 | + - The administrators of cloud and hosting setups have to carefully | ||
658 | + analyze the risk for their scenarios and make the appropriate | ||
659 | + mitigation choices, which might even vary across their deployed | ||
660 | + machines and also result in other changes of their overall setup. | ||
661 | + There is no way for the kernel to provide a sensible default for this | ||
662 | + kind of scenario. | ||
663 | diff --git a/Documentation/hw-vuln/mds.rst b/Documentation/hw-vuln/mds.rst | ||
664 | new file mode 100644 | ||
665 | index 000000000000..daf6fdac49a3 | ||
666 | --- /dev/null | ||
667 | +++ b/Documentation/hw-vuln/mds.rst | ||
668 | @@ -0,0 +1,308 @@ | ||
669 | +MDS - Microarchitectural Data Sampling | ||
670 | +====================================== | ||
671 | + | ||
672 | +Microarchitectural Data Sampling is a hardware vulnerability which allows | ||
673 | +unprivileged speculative access to data which is available in various CPU | ||
674 | +internal buffers. | ||
675 | + | ||
676 | +Affected processors | ||
677 | +------------------- | ||
678 | + | ||
679 | +This vulnerability affects a wide range of Intel processors. The | ||
680 | +vulnerability is not present on: | ||
681 | + | ||
682 | + - Processors from AMD, Centaur and other non Intel vendors | ||
683 | + | ||
684 | + - Older processor models, where the CPU family is < 6 | ||
685 | + | ||
686 | + - Some Atoms (Bonnell, Saltwell, Goldmont, GoldmontPlus) | ||
687 | + | ||
688 | + - Intel processors which have the ARCH_CAP_MDS_NO bit set in the | ||
689 | + IA32_ARCH_CAPABILITIES MSR. | ||
690 | + | ||
691 | +Whether a processor is affected or not can be read out from the MDS | ||
692 | +vulnerability file in sysfs. See :ref:`mds_sys_info`. | ||
693 | + | ||
694 | +Not all processors are affected by all variants of MDS, but the mitigation | ||
695 | +is identical for all of them so the kernel treats them as a single | ||
696 | +vulnerability. | ||
697 | + | ||
698 | +Related CVEs | ||
699 | +------------ | ||
700 | + | ||
701 | +The following CVE entries are related to the MDS vulnerability: | ||
702 | + | ||
703 | + ============== ===== =================================================== | ||
704 | + CVE-2018-12126 MSBDS Microarchitectural Store Buffer Data Sampling | ||
705 | + CVE-2018-12130 MFBDS Microarchitectural Fill Buffer Data Sampling | ||
706 | + CVE-2018-12127 MLPDS Microarchitectural Load Port Data Sampling | ||
707 | + CVE-2019-11091 MDSUM Microarchitectural Data Sampling Uncacheable Memory | ||
708 | + ============== ===== =================================================== | ||
709 | + | ||
710 | +Problem | ||
711 | +------- | ||
712 | + | ||
713 | +When performing store, load, and L1 refill operations, processors write data | ||
714 | +into temporary microarchitectural structures (buffers). The data in the | ||
715 | +buffer can be forwarded to load operations as an optimization. | ||
716 | + | ||
717 | +Under certain conditions, usually a fault/assist caused by a load | ||
718 | +operation, data unrelated to the load memory address can be speculatively | ||
719 | +forwarded from the buffers. Because the load operation causes a fault or | ||
720 | +assist and its result will be discarded, the forwarded data will not cause | ||
721 | +incorrect program execution or state changes. But a malicious operation | ||
722 | +may be able to forward this speculative data to a disclosure gadget which | ||
723 | +in turn allows inferring the value via a cache side channel attack. | ||
724 | + | ||
725 | +Because the buffers are potentially shared between Hyper-Threads, cross | ||
726 | +Hyper-Thread attacks are possible. | ||
727 | + | ||
728 | +Deeper technical information is available in the MDS specific x86 | ||
729 | +architecture section: :ref:`Documentation/x86/mds.rst <mds>`. | ||
730 | + | ||
731 | + | ||
732 | +Attack scenarios | ||
733 | +---------------- | ||
734 | + | ||
735 | +Attacks against the MDS vulnerabilities can be mounted from malicious | ||
736 | +non-privileged user space applications running on hosts or guests. Malicious | ||
737 | +guest OSes can obviously mount attacks as well. | ||
738 | + | ||
739 | +Contrary to other speculation based vulnerabilities the MDS vulnerability | ||
740 | +does not allow the attacker to control the memory target address. As a | ||
741 | +consequence the attacks are purely sampling based, but as demonstrated with | ||
742 | +the TLBleed attack, samples can be post-processed successfully. | ||
743 | + | ||
744 | +Web-Browsers | ||
745 | +^^^^^^^^^^^^ | ||
746 | + | ||
747 | + It's unclear whether attacks through Web-Browsers are possible at | ||
748 | + all. The exploitation through JavaScript is considered very unlikely, | ||
749 | + but other widely used web technologies like WebAssembly could possibly be | ||
750 | + abused. | ||
751 | + | ||
752 | + | ||
753 | +.. _mds_sys_info: | ||
754 | + | ||
755 | +MDS system information | ||
756 | +----------------------- | ||
757 | + | ||
758 | +The Linux kernel provides a sysfs interface to enumerate the current MDS | ||
759 | +status of the system: whether the system is vulnerable, and which | ||
760 | +mitigations are active. The relevant sysfs file is: | ||
761 | + | ||
762 | +/sys/devices/system/cpu/vulnerabilities/mds | ||
763 | + | ||
764 | +The possible values in this file are: | ||
765 | + | ||
766 | + .. list-table:: | ||
767 | + | ||
768 | + * - 'Not affected' | ||
769 | + - The processor is not vulnerable | ||
770 | + * - 'Vulnerable' | ||
771 | + - The processor is vulnerable, but no mitigation enabled | ||
772 | + * - 'Vulnerable: Clear CPU buffers attempted, no microcode' | ||
773 | + - The processor is vulnerable but microcode is not updated. | ||
774 | + | ||
775 | + The mitigation is enabled on a best effort basis. See :ref:`vmwerv` | ||
776 | + * - 'Mitigation: Clear CPU buffers' | ||
777 | + - The processor is vulnerable and the CPU buffer clearing mitigation is | ||
778 | + enabled. | ||
779 | + | ||
780 | +If the processor is vulnerable then the following information is appended | ||
781 | +to the above information: | ||
782 | + | ||
783 | + ======================== ============================================ | ||
784 | + 'SMT vulnerable' SMT is enabled | ||
785 | + 'SMT mitigated' SMT is enabled and mitigated | ||
786 | + 'SMT disabled' SMT is disabled | ||
787 | + 'SMT Host state unknown' Kernel runs in a VM, Host SMT state unknown | ||
788 | + ======================== ============================================ | ||
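
Analogous to the L1TF example earlier, a short C sketch can read this file
and flag the SMT-vulnerable case (strings taken from the tables above)::

   #include <stdio.h>
   #include <string.h>

   int main(void)
   {
       char line[256];
       FILE *f = fopen("/sys/devices/system/cpu/vulnerabilities/mds", "r");

       if (!f || !fgets(line, sizeof(line), f)) {
           perror("mds sysfs file");
           return 1;
       }
       printf("MDS state: %s", line);
       if (strstr(line, "SMT vulnerable"))
           puts("warning: cross Hyper-Thread attacks remain possible");
       fclose(f);
       return 0;
   }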
789 | + | ||
790 | +.. _vmwerv: | ||
791 | + | ||
792 | +Best effort mitigation mode | ||
793 | +^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||
794 | + | ||
795 | + If the processor is vulnerable, but the availability of the microcode-based | ||
796 | + mitigation mechanism is not advertised via CPUID, the kernel selects a best | ||
797 | + effort mitigation mode. This mode invokes the mitigation instructions | ||
798 | + without a guarantee that they clear the CPU buffers. | ||
799 | + | ||
800 | + This is done to address virtualization scenarios where the host has the | ||
801 | + microcode update applied, but the hypervisor is not yet updated to expose | ||
802 | + the CPUID to the guest. If the host has updated microcode, the protection | ||
803 | + takes effect; otherwise a few CPU cycles are wasted pointlessly. | ||
804 | + | ||
805 | + The state in the mds sysfs file reflects this situation accordingly. | ||
806 | + | ||
807 | + | ||
808 | +Mitigation mechanism | ||
809 | +-------------------- | ||
810 | + | ||
811 | +The kernel detects the affected CPUs and the presence of the microcode | ||
812 | +which is required. | ||
813 | + | ||
814 | +If a CPU is affected and the microcode is available, then the kernel | ||
815 | +enables the mitigation by default. The mitigation can be controlled at boot | ||
816 | +time via a kernel command line option. See | ||
817 | +:ref:`mds_mitigation_control_command_line`. | ||
818 | + | ||
819 | +.. _cpu_buffer_clear: | ||
820 | + | ||
821 | +CPU buffer clearing | ||
822 | +^^^^^^^^^^^^^^^^^^^ | ||
823 | + | ||
824 | + The mitigation for MDS clears the affected CPU buffers on return to user | ||
825 | + space and when entering a guest. | ||
826 | + | ||
827 | + If SMT is enabled it also clears the buffers on idle entry when the CPU | ||
828 | + is only affected by MSBDS and not any other MDS variant, because the | ||
829 | + other variants cannot be protected against cross Hyper-Thread attacks. | ||
830 | + | ||
831 | + For CPUs which are only affected by MSBDS the user space, guest and idle | ||
832 | + transition mitigations are sufficient and SMT is not affected. | ||
833 | + | ||
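The clearing relies on updated microcode giving the VERW instruction the
side effect of overwriting the affected buffers. A hedged user-space
illustration of the idiom (the kernel's internal helper differs in detail)::

   #include <stdint.h>

   /* Sketch only: with the MD_CLEAR microcode update, VERW with any
    * valid writable segment selector also clears the CPU buffers. */
   static inline void clear_cpu_buffers(void)
   {
       uint16_t ds;

       __asm__ volatile("mov %%ds, %0" : "=r" (ds));
       __asm__ volatile("verw %0" : : "m" (ds) : "cc");
   }

   int main(void)
   {
       clear_cpu_buffers();
       return 0;
   }
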
834 | +.. _virt_mechanism: | ||
835 | + | ||
836 | +Virtualization mitigation | ||
837 | +^^^^^^^^^^^^^^^^^^^^^^^^^ | ||
838 | + | ||
839 | + The protection for host to guest transition depends on the L1TF | ||
840 | + vulnerability of the CPU: | ||
841 | + | ||
842 | + - CPU is affected by L1TF: | ||
843 | + | ||
844 | + If the L1D flush mitigation is enabled and up to date microcode is | ||
845 | + available, the L1D flush mitigation is automatically protecting the | ||
846 | + guest transition. | ||
847 | + | ||
848 | + If the L1D flush mitigation is disabled then the MDS mitigation is | ||
849 | + invoked explicitly when the host MDS mitigation is enabled. | ||
850 | + | ||
851 | + For details on L1TF and virtualization see: | ||
852 | + :ref:`Documentation/hw-vuln/l1tf.rst <mitigation_control_kvm>`. | ||
853 | + | ||
854 | + - CPU is not affected by L1TF: | ||
855 | + | ||
856 | + CPU buffers are flushed before entering the guest when the host MDS | ||
857 | + mitigation is enabled. | ||
858 | + | ||
859 | + The resulting MDS protection matrix for the host to guest transition: | ||
860 | + | ||
861 | + ============ ===== ============= ============ ================= | ||
862 | + L1TF MDS VMX-L1FLUSH Host MDS MDS-State | ||
863 | + | ||
864 | + Don't care No Don't care N/A Not affected | ||
865 | + | ||
866 | + Yes Yes Disabled Off Vulnerable | ||
867 | + | ||
868 | + Yes Yes Disabled Full Mitigated | ||
869 | + | ||
870 | + Yes Yes Enabled Don't care Mitigated | ||
871 | + | ||
872 | + No Yes N/A Off Vulnerable | ||
873 | + | ||
874 | + No Yes N/A Full Mitigated | ||
875 | + ============ ===== ============= ============ ================= | ||
876 | + | ||
877 | + This only covers the host to guest transition, i.e. prevents leakage from | ||
878 | + host to guest, but does not protect the guest internally. Guests need to | ||
879 | + have their own protections. | ||
880 | + | ||
881 | +.. _xeon_phi: | ||
882 | + | ||
883 | +XEON PHI specific considerations | ||
884 | +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||
885 | + | ||
886 | + The XEON PHI processor family is affected by MSBDS which can be exploited | ||
887 | + cross Hyper-Threads when entering idle states. Some XEON PHI variants allow | ||
888 | + to use MWAIT in user space (Ring 3), which opens a potential attack vector | ||
889 | + for malicious user space. The exposure can be disabled on the kernel | ||
890 | + command line with the 'ring3mwait=disable' command line option. | ||
891 | + | ||
892 | + XEON PHI is not affected by the other MDS variants and MSBDS is mitigated | ||
893 | + before the CPU enters an idle state. As XEON PHI is not affected by L1TF | ||
894 | + either, disabling SMT is not required for full protection. | ||
895 | + | ||
896 | +.. _mds_smt_control: | ||
897 | + | ||
898 | +SMT control | ||
899 | +^^^^^^^^^^^ | ||
900 | + | ||
901 | + All MDS variants except MSBDS can be attacked cross Hyper-Threads. That | ||
902 | + means on CPUs which are affected by MFBDS or MLPDS it is necessary to | ||
903 | + disable SMT for full protection. These are most of the affected CPUs; the | ||
904 | + exception is XEON PHI, see :ref:`xeon_phi`. | ||
905 | + | ||
906 | + Disabling SMT can have a significant performance impact, but the impact | ||
907 | + depends on the type of workloads. | ||
908 | + | ||
909 | + See the relevant chapter in the L1TF mitigation documentation for details: | ||
910 | + :ref:`Documentation/hw-vuln/l1tf.rst <smt_control>`. | ||
911 | + | ||
912 | + | ||
913 | +.. _mds_mitigation_control_command_line: | ||
914 | + | ||
915 | +Mitigation control on the kernel command line | ||
916 | +--------------------------------------------- | ||
917 | + | ||
918 | +The kernel command line allows controlling the MDS mitigations at boot | ||
919 | +time with the option "mds=". The valid arguments for this option are: | ||
920 | + | ||
921 | + ============ ============================================================= | ||
922 | + full If the CPU is vulnerable, enable all available mitigations | ||
923 | + for the MDS vulnerability, CPU buffer clearing on exit to | ||
924 | + userspace and when entering a VM. Idle transitions are | ||
925 | + protected as well if SMT is enabled. | ||
926 | + | ||
927 | + It does not automatically disable SMT. | ||
928 | + | ||
929 | + full,nosmt The same as mds=full, with SMT disabled on vulnerable | ||
930 | + CPUs. This is the complete mitigation. | ||
931 | + | ||
932 | + off Disables MDS mitigations completely. | ||
933 | + | ||
934 | + ============ ============================================================= | ||
935 | + | ||
936 | +Not specifying this option is equivalent to "mds=full". | ||
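
Analogous to the l1tf= example earlier, a host that runs untrusted code and
can afford losing SMT might append (an illustrative choice)::

   mds=full,nosmt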
937 | + | ||
938 | + | ||
939 | +Mitigation selection guide | ||
940 | +-------------------------- | ||
941 | + | ||
942 | +1. Trusted userspace | ||
943 | +^^^^^^^^^^^^^^^^^^^^ | ||
944 | + | ||
945 | + If all userspace applications are from a trusted source and do not | ||
946 | + execute untrusted code which is supplied externally, then the mitigation | ||
947 | + can be disabled. | ||
948 | + | ||
949 | + | ||
950 | +2. Virtualization with trusted guests | ||
951 | +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||
952 | + | ||
953 | + The same considerations as above for trusted user space apply. | ||
954 | + | ||
955 | +3. Virtualization with untrusted guests | ||
956 | +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||
957 | + | ||
958 | + The protection depends on the state of the L1TF mitigations. | ||
959 | + See :ref:`virt_mechanism`. | ||
960 | + | ||
961 | + If the MDS mitigation is enabled and SMT is disabled, guest to host and | ||
962 | + guest to guest attacks are prevented. | ||
963 | + | ||
964 | +.. _mds_default_mitigations: | ||
965 | + | ||
966 | +Default mitigations | ||
967 | +------------------- | ||
968 | + | ||
969 | + The kernel default mitigations for vulnerable processors are: | ||
970 | + | ||
971 | + - Enable CPU buffer clearing | ||
972 | + | ||
973 | + The kernel does not by default enforce the disabling of SMT, which leaves | ||
974 | + SMT systems vulnerable when running untrusted code. The same rationale as | ||
975 | + for L1TF applies. | ||
976 | + See :ref:`Documentation/hw-vuln/l1tf.rst <default_mitigations>`. | ||
977 | diff --git a/Documentation/index.rst b/Documentation/index.rst | ||
978 | index 213399aac757..f95c58dbbbc3 100644 | ||
979 | --- a/Documentation/index.rst | ||
980 | +++ b/Documentation/index.rst | ||
981 | @@ -12,7 +12,6 @@ Contents: | ||
982 | :maxdepth: 2 | ||
983 | |||
984 | kernel-documentation | ||
985 | - l1tf | ||
986 | development-process/index | ||
987 | dev-tools/tools | ||
988 | driver-api/index | ||
989 | @@ -20,6 +19,24 @@ Contents: | ||
990 | gpu/index | ||
991 | 80211/index | ||
992 | |||
993 | +This section describes CPU vulnerabilities and their mitigations. | ||
994 | + | ||
995 | +.. toctree:: | ||
996 | + :maxdepth: 1 | ||
997 | + | ||
998 | + hw-vuln/index | ||
999 | + | ||
1000 | +Architecture-specific documentation | ||
1001 | +----------------------------------- | ||
1002 | + | ||
1003 | +These books provide programming details about architecture-specific | ||
1004 | +implementation. | ||
1005 | + | ||
1006 | +.. toctree:: | ||
1007 | + :maxdepth: 2 | ||
1008 | + | ||
1009 | + x86/index | ||
1010 | + | ||
1011 | Indices and tables | ||
1012 | ================== | ||
1013 | |||
1014 | diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt | ||
1015 | index a1472b48ee22..55a9bbbcf5e1 100644 | ||
1016 | --- a/Documentation/kernel-parameters.txt | ||
1017 | +++ b/Documentation/kernel-parameters.txt | ||
1018 | @@ -2076,10 +2076,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted. | ||
1019 | off | ||
1020 | Disables hypervisor mitigations and doesn't | ||
1021 | emit any warnings. | ||
1022 | + It also drops the swap size and available | ||
1023 | + RAM limit restriction on both hypervisor and | ||
1024 | + bare metal. | ||
1025 | |||
1026 | Default is 'flush'. | ||
1027 | |||
1028 | - For details see: Documentation/admin-guide/l1tf.rst | ||
1029 | + For details see: Documentation/hw-vuln/l1tf.rst | ||
1030 | |||
1031 | l2cr= [PPC] | ||
1032 | |||
1033 | @@ -2322,6 +2325,32 @@ bytes respectively. Such letter suffixes can also be entirely omitted. | ||
1034 | Format: <first>,<last> | ||
1035 | Specifies range of consoles to be captured by the MDA. | ||
1036 | |||
1037 | + mds= [X86,INTEL] | ||
1038 | + Control mitigation for the Micro-architectural Data | ||
1039 | + Sampling (MDS) vulnerability. | ||
1040 | + | ||
1041 | + Certain CPUs are vulnerable to an exploit against CPU | ||
1042 | + internal buffers which can forward information to a | ||
1043 | + disclosure gadget under certain conditions. | ||
1044 | + | ||
1045 | + In vulnerable processors, the speculatively | ||
1046 | + forwarded data can be used in a cache side channel | ||
1047 | + attack, to access data to which the attacker does | ||
1048 | + not have direct access. | ||
1049 | + | ||
1050 | + This parameter controls the MDS mitigation. The | ||
1051 | + options are: | ||
1052 | + | ||
1053 | + full - Enable MDS mitigation on vulnerable CPUs | ||
1054 | + full,nosmt - Enable MDS mitigation and disable | ||
1055 | + SMT on vulnerable CPUs | ||
1056 | + off - Unconditionally disable MDS mitigation | ||
1057 | + | ||
1058 | + Not specifying this option is equivalent to | ||
1059 | + mds=full. | ||
1060 | + | ||
1061 | + For details see: Documentation/hw-vuln/mds.rst | ||
1062 | + | ||
1063 | mem=nn[KMG] [KNL,BOOT] Force usage of a specific amount of memory | ||
1064 | Amount of memory to be used when the kernel is not able | ||
1065 | to see the whole system memory or for test. | ||
1066 | @@ -2444,6 +2473,38 @@ bytes respectively. Such letter suffixes can also be entirely omitted. | ||
1067 | in the "bleeding edge" mini2440 support kernel at | ||
1068 | http://repo.or.cz/w/linux-2.6/mini2440.git | ||
1069 | |||
1070 | + mitigations= | ||
1071 | + [X86] Control optional mitigations for CPU | ||
1072 | + vulnerabilities. This is a set of curated, | ||
1073 | + arch-independent options, each of which is an | ||
1074 | + aggregation of existing arch-specific options. | ||
1075 | + | ||
1076 | + off | ||
1077 | + Disable all optional CPU mitigations. This | ||
1078 | + improves system performance, but it may also | ||
1079 | + expose users to several CPU vulnerabilities. | ||
1080 | + Equivalent to: nopti [X86] | ||
1081 | + nospectre_v2 [X86] | ||
1082 | + spectre_v2_user=off [X86] | ||
1083 | + spec_store_bypass_disable=off [X86] | ||
1084 | + l1tf=off [X86] | ||
1085 | + mds=off [X86] | ||
1086 | + | ||
1087 | + auto (default) | ||
1088 | + Mitigate all CPU vulnerabilities, but leave SMT | ||
1089 | + enabled, even if it's vulnerable. This is for | ||
1090 | + users who don't want to be surprised by SMT | ||
1091 | + getting disabled across kernel upgrades, or who | ||
1092 | + have other ways of avoiding SMT-based attacks. | ||
1093 | + Equivalent to: (default behavior) | ||
1094 | + | ||
1095 | + auto,nosmt | ||
1096 | + Mitigate all CPU vulnerabilities, disabling SMT | ||
1097 | + if needed. This is for users who always want to | ||
1098 | + be fully mitigated, even if it means losing SMT. | ||
1099 | + Equivalent to: l1tf=flush,nosmt [X86] | ||
1100 | + mds=full,nosmt [X86] | ||
1101 | + | ||
1102 | mminit_loglevel= | ||
1103 | [KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this | ||
1104 | parameter allows control of the logging verbosity for | ||
1105 | @@ -4030,9 +4091,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted. | ||
1106 | |||
1107 | spectre_v2= [X86] Control mitigation of Spectre variant 2 | ||
1108 | (indirect branch speculation) vulnerability. | ||
1109 | + The default operation protects the kernel from | ||
1110 | + user space attacks. | ||
1111 | |||
1112 | - on - unconditionally enable | ||
1113 | - off - unconditionally disable | ||
1114 | + on - unconditionally enable, implies | ||
1115 | + spectre_v2_user=on | ||
1116 | + off - unconditionally disable, implies | ||
1117 | + spectre_v2_user=off | ||
1118 | auto - kernel detects whether your CPU model is | ||
1119 | vulnerable | ||
1120 | |||
1121 | @@ -4042,6 +4107,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. | ||
1122 | CONFIG_RETPOLINE configuration option, and the | ||
1123 | compiler with which the kernel was built. | ||
1124 | |||
1125 | + Selecting 'on' will also enable the mitigation | ||
1126 | + against user space to user space task attacks. | ||
1127 | + | ||
1128 | + Selecting 'off' will disable both the kernel and | ||
1129 | + the user space protections. | ||
1130 | + | ||
1131 | Specific mitigations can also be selected manually: | ||
1132 | |||
1133 | retpoline - replace indirect branches | ||
1134 | @@ -4051,6 +4122,48 @@ bytes respectively. Such letter suffixes can also be entirely omitted. | ||
1135 | Not specifying this option is equivalent to | ||
1136 | spectre_v2=auto. | ||
1137 | |||
1138 | + spectre_v2_user= | ||
1139 | + [X86] Control mitigation of Spectre variant 2 | ||
1140 | + (indirect branch speculation) vulnerability between | ||
1141 | + user space tasks. | ||
1142 | + | ||
1143 | + on - Unconditionally enable mitigations. Is | ||
1144 | + enforced by spectre_v2=on | ||
1145 | + | ||
1146 | + off - Unconditionally disable mitigations. Is | ||
1147 | + enforced by spectre_v2=off | ||
1148 | + | ||
1149 | + prctl - Indirect branch speculation is enabled, | ||
1150 | + but mitigation can be enabled via prctl | ||
1151 | + per thread. The mitigation control state | ||
1152 | + is inherited on fork. | ||
1153 | + | ||
1154 | + prctl,ibpb | ||
1155 | + - Like "prctl" above, but only STIBP is | ||
1156 | + controlled per thread. IBPB is issued | ||
1157 | + always when switching between different user | ||
1158 | + space processes. | ||
1159 | + | ||
1160 | + seccomp | ||
1161 | + - Same as "prctl" above, but all seccomp | ||
1162 | + threads will enable the mitigation unless | ||
1163 | + they explicitly opt out. | ||
1164 | + | ||
1165 | + seccomp,ibpb | ||
1166 | + - Like "seccomp" above, but only STIBP is | ||
1167 | +			 controlled per thread. IBPB is always | ||
1168 | +			 issued when switching between different | ||
1169 | +			 user space processes. | ||
1170 | + | ||
1171 | + auto - Kernel selects the mitigation depending on | ||
1172 | + the available CPU features and vulnerability. | ||
1173 | + | ||
1174 | + Default mitigation: | ||
1175 | + If CONFIG_SECCOMP=y then "seccomp", otherwise "prctl" | ||
1176 | + | ||
1177 | + Not specifying this option is equivalent to | ||
1178 | + spectre_v2_user=auto. | ||
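
Whether the selected mode actually took effect can be checked from user
space by reading the standard vulnerabilities file in sysfs. A minimal
sketch in C (nothing beyond that sysfs file is assumed):

    #include <stdio.h>

    int main(void)
    {
            /* Prints the active mitigation line, e.g.
             * "Mitigation: Full generic retpoline, STIBP: conditional, ..."
             */
            FILE *f = fopen("/sys/devices/system/cpu"
                            "/vulnerabilities/spectre_v2", "r");
            char buf[256];

            if (!f) {
                    perror("fopen");
                    return 1;
            }
            if (fgets(buf, sizeof(buf), f))
                    fputs(buf, stdout);
            fclose(f);
            return 0;
    }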
1179 | + | ||
1180 | spec_store_bypass_disable= | ||
1181 | [HW] Control Speculative Store Bypass (SSB) Disable mitigation | ||
1182 | (Speculative Store Bypass vulnerability) | ||
1183 | diff --git a/Documentation/l1tf.rst b/Documentation/l1tf.rst | ||
1184 | deleted file mode 100644 | ||
1185 | index bae52b845de0..000000000000 | ||
1186 | --- a/Documentation/l1tf.rst | ||
1187 | +++ /dev/null | ||
1188 | @@ -1,610 +0,0 @@ | ||
1189 | -L1TF - L1 Terminal Fault | ||
1190 | -======================== | ||
1191 | - | ||
1192 | -L1 Terminal Fault is a hardware vulnerability which allows unprivileged | ||
1193 | -speculative access to data which is available in the Level 1 Data Cache | ||
1194 | -when the page table entry controlling the virtual address, which is used | ||
1195 | -for the access, has the Present bit cleared or other reserved bits set. | ||
1196 | - | ||
1197 | -Affected processors | ||
1198 | -------------------- | ||
1199 | - | ||
1200 | -This vulnerability affects a wide range of Intel processors. The | ||
1201 | -vulnerability is not present on: | ||
1202 | - | ||
1203 | - - Processors from AMD, Centaur and other non Intel vendors | ||
1204 | - | ||
1205 | - - Older processor models, where the CPU family is < 6 | ||
1206 | - | ||
1207 | - - A range of Intel ATOM processors (Cedarview, Cloverview, Lincroft, | ||
1208 | - Penwell, Pineview, Silvermont, Airmont, Merrifield) | ||
1209 | - | ||
1210 | - - The Intel XEON PHI family | ||
1211 | - | ||
1212 | - - Intel processors which have the ARCH_CAP_RDCL_NO bit set in the | ||
1213 | - IA32_ARCH_CAPABILITIES MSR. If the bit is set the CPU is not affected | ||
1214 | - by the Meltdown vulnerability either. These CPUs should become | ||
1215 | - available by end of 2018. | ||
1216 | - | ||
1217 | -Whether a processor is affected or not can be read out from the L1TF | ||
1218 | -vulnerability file in sysfs. See :ref:`l1tf_sys_info`. | ||
1219 | - | ||
1220 | -Related CVEs | ||
1221 | ------------- | ||
1222 | - | ||
1223 | -The following CVE entries are related to the L1TF vulnerability: | ||
1224 | - | ||
1225 | - ============= ================= ============================== | ||
1226 | - CVE-2018-3615 L1 Terminal Fault SGX related aspects | ||
1227 | - CVE-2018-3620 L1 Terminal Fault OS, SMM related aspects | ||
1228 | - CVE-2018-3646 L1 Terminal Fault Virtualization related aspects | ||
1229 | - ============= ================= ============================== | ||
1230 | - | ||
1231 | -Problem | ||
1232 | -------- | ||
1233 | - | ||
1234 | -If an instruction accesses a virtual address for which the relevant page | ||
1235 | -table entry (PTE) has the Present bit cleared or other reserved bits set, | ||
1236 | -then speculative execution ignores the invalid PTE and loads the referenced | ||
1237 | -data if it is present in the Level 1 Data Cache, as if the page referenced | ||
1238 | -by the address bits in the PTE was still present and accessible. | ||
1239 | - | ||
1240 | -While this is a purely speculative mechanism and the instruction will raise | ||
1241 | -a page fault when it is retired eventually, the pure act of loading the | ||
1242 | -data and making it available to other speculative instructions opens up the | ||
1243 | -opportunity for side channel attacks to unprivileged malicious code, | ||
1244 | -similar to the Meltdown attack. | ||
1245 | - | ||
1246 | -While Meltdown breaks the user space to kernel space protection, L1TF | ||
1247 | -allows attacking any physical memory address in the system and the attack | ||
1248 | -works across all protection domains. It allows attacks on SGX and also | ||
1249 | -works from inside virtual machines because the speculation bypasses the | ||
1250 | -extended page table (EPT) protection mechanism. | ||
1251 | - | ||
1252 | - | ||
1253 | -Attack scenarios | ||
1254 | ----------------- | ||
1255 | - | ||
1256 | -1. Malicious user space | ||
1257 | -^^^^^^^^^^^^^^^^^^^^^^^ | ||
1258 | - | ||
1259 | - Operating Systems store arbitrary information in the address bits of a | ||
1260 | - PTE which is marked non present. This allows a malicious user space | ||
1261 | - application to attack the physical memory to which these PTEs resolve. | ||
1262 | - In some cases user-space can maliciously influence the information | ||
1263 | - encoded in the address bits of the PTE, thus making attacks more | ||
1264 | - deterministic and more practical. | ||
1265 | - | ||
1266 | - The Linux kernel contains a mitigation for this attack vector, PTE | ||
1267 | - inversion, which is permanently enabled and has no performance | ||
1268 | - impact. The kernel ensures that the address bits of PTEs, which are not | ||
1269 | - marked present, never point to cacheable physical memory space. | ||
1270 | - | ||
1271 | - A system with an up to date kernel is protected against attacks from | ||
1272 | - malicious user space applications. | ||
1273 | - | ||
1274 | -2. Malicious guest in a virtual machine | ||
1275 | -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||
1276 | - | ||
1277 | - The fact that L1TF breaks all domain protections allows malicious guest | ||
1278 | - OSes, which can control the PTEs directly, and malicious guest user | ||
1279 | - space applications, which run on an unprotected guest kernel lacking the | ||
1280 | - PTE inversion mitigation for L1TF, to attack physical host memory. | ||
1281 | - | ||
1282 | - A special aspect of L1TF in the context of virtualization is simultaneous | ||
1283 | - multi-threading (SMT). The Intel implementation of SMT is called | ||
1284 | - HyperThreading. The fact that Hyperthreads on the affected processors | ||
1285 | - share the L1 Data Cache (L1D) is important for this. As the flaw allows | ||
1286 | - only to attack data which is present in L1D, a malicious guest running | ||
1287 | - on one Hyperthread can attack the data which is brought into the L1D by | ||
1288 | - the context which runs on the sibling Hyperthread of the same physical | ||
1289 | - core. This context can be host OS, host user space or a different guest. | ||
1290 | - | ||
1291 | - If the processor does not support Extended Page Tables, the attack is | ||
1292 | - only possible when the hypervisor does not sanitize the content of the | ||
1293 | - effective (shadow) page tables. | ||
1294 | - | ||
1295 | - While solutions exist to mitigate these attack vectors fully, these | ||
1296 | - mitigations are not enabled by default in the Linux kernel because they | ||
1297 | - can affect performance significantly. The kernel provides several | ||
1298 | - mechanisms which can be utilized to address the problem depending on the | ||
1299 | - deployment scenario. The mitigations, their protection scope and impact | ||
1300 | - are described in the next sections. | ||
1301 | - | ||
1302 | - The default mitigations and the rationale for choosing them are explained | ||
1303 | - at the end of this document. See :ref:`default_mitigations`. | ||
1304 | - | ||
1305 | -.. _l1tf_sys_info: | ||
1306 | - | ||
1307 | -L1TF system information | ||
1308 | ------------------------ | ||
1309 | - | ||
1310 | -The Linux kernel provides a sysfs interface to enumerate the current L1TF | ||
1311 | -status of the system: whether the system is vulnerable, and which | ||
1312 | -mitigations are active. The relevant sysfs file is: | ||
1313 | - | ||
1314 | -/sys/devices/system/cpu/vulnerabilities/l1tf | ||
1315 | - | ||
1316 | -The possible values in this file are: | ||
1317 | - | ||
1318 | - =========================== =============================== | ||
1319 | - 'Not affected' The processor is not vulnerable | ||
1320 | - 'Mitigation: PTE Inversion' The host protection is active | ||
1321 | - =========================== =============================== | ||
1322 | - | ||
1323 | -If KVM/VMX is enabled and the processor is vulnerable then the following | ||
1324 | -information is appended to the 'Mitigation: PTE Inversion' part: | ||
1325 | - | ||
1326 | - - SMT status: | ||
1327 | - | ||
1328 | - ===================== ================ | ||
1329 | - 'VMX: SMT vulnerable' SMT is enabled | ||
1330 | - 'VMX: SMT disabled' SMT is disabled | ||
1331 | - ===================== ================ | ||
1332 | - | ||
1333 | - - L1D Flush mode: | ||
1334 | - | ||
1335 | - ================================ ==================================== | ||
1336 | - 'L1D vulnerable' L1D flushing is disabled | ||
1337 | - | ||
1338 | - 'L1D conditional cache flushes' L1D flush is conditionally enabled | ||
1339 | - | ||
1340 | - 'L1D cache flushes' L1D flush is unconditionally enabled | ||
1341 | - ================================ ==================================== | ||
1342 | - | ||
1343 | -The resulting grade of protection is discussed in the following sections. | ||
1344 | - | ||
1345 | - | ||
1346 | -Host mitigation mechanism | ||
1347 | -------------------------- | ||
1348 | - | ||
1349 | -The kernel is unconditionally protected against L1TF attacks from malicious | ||
1350 | -user space running on the host. | ||
1351 | - | ||
1352 | - | ||
1353 | -Guest mitigation mechanisms | ||
1354 | ---------------------------- | ||
1355 | - | ||
1356 | -.. _l1d_flush: | ||
1357 | - | ||
1358 | -1. L1D flush on VMENTER | ||
1359 | -^^^^^^^^^^^^^^^^^^^^^^^ | ||
1360 | - | ||
1361 | - To make sure that a guest cannot attack data which is present in the L1D | ||
1362 | - the hypervisor flushes the L1D before entering the guest. | ||
1363 | - | ||
1364 | - Flushing the L1D evicts not only the data which should not be accessed | ||
1365 | - by a potentially malicious guest, it also flushes the guest | ||
1366 | - data. Flushing the L1D has a performance impact as the processor has to | ||
1367 | - bring the flushed guest data back into the L1D. Depending on the | ||
1368 | - frequency of VMEXIT/VMENTER and the type of computations in the guest, | ||
1369 | - performance degradation in the range of 1% to 50% has been observed. For | ||
1370 | - scenarios where guest VMEXIT/VMENTER are rare the performance impact is | ||
1371 | - minimal. Virtio and mechanisms like posted interrupts are designed to | ||
1372 | - confine the VMEXITs to a bare minimum, but specific configurations and | ||
1373 | - application scenarios might still suffer from a high VMEXIT rate. | ||
1374 | - | ||
1375 | - The kernel provides two L1D flush modes: | ||
1376 | - - conditional ('cond') | ||
1377 | - - unconditional ('always') | ||
1378 | - | ||
1379 | - The conditional mode avoids L1D flushing after VMEXITs which execute | ||
1380 | - only audited code paths before the corresponding VMENTER. These code | ||
1381 | - paths have been verified to not expose secrets or other | ||
1382 | - interesting data to an attacker, but they can leak information about the | ||
1383 | - address space layout of the hypervisor. | ||
1384 | - | ||
1385 | - Unconditional mode flushes L1D on all VMENTER invocations and provides | ||
1386 | - maximum protection. It has a higher overhead than the conditional | ||
1387 | - mode. The overhead cannot be quantified correctly as it depends on the | ||
1388 | - workload scenario and the resulting number of VMEXITs. | ||
1389 | - | ||
1390 | - The general recommendation is to enable L1D flush on VMENTER. The kernel | ||
1391 | - defaults to conditional mode on affected processors. | ||
1392 | - | ||
1393 | - **Note** that L1D flush does not prevent the SMT problem because the | ||
1394 | - sibling thread will also bring back its data into the L1D which makes it | ||
1395 | - attackable again. | ||
1396 | - | ||
1397 | - L1D flush can be controlled by the administrator via the kernel command | ||
1398 | - line and sysfs control files. See :ref:`mitigation_control_command_line` | ||
1399 | - and :ref:`mitigation_control_kvm`. | ||
1400 | - | ||
1401 | -.. _guest_confinement: | ||
1402 | - | ||
1403 | -2. Guest VCPU confinement to dedicated physical cores | ||
1404 | -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||
1405 | - | ||
1406 | - To address the SMT problem, it is possible to make a guest or a group of | ||
1407 | - guests affine to one or more physical cores. The proper mechanism for | ||
1408 | - that is to utilize exclusive cpusets to ensure that no other guest or | ||
1409 | - host tasks can run on these cores. | ||
1410 | - | ||
1411 | - If only a single guest or related guests run on sibling SMT threads on | ||
1412 | - the same physical core then they can only attack their own memory and | ||
1413 | - restricted parts of the host memory. | ||
1414 | - | ||
1415 | - Host memory is attackable when one of the sibling SMT threads runs in | ||
1416 | - host OS (hypervisor) context and the other in guest context. The amount | ||
1417 | - of valuable information from the host OS context depends on the context | ||
1418 | - which the host OS executes, i.e. interrupts, soft interrupts and kernel | ||
1419 | - threads. The amount of valuable data from these contexts cannot be | ||
1420 | - declared as non-interesting for an attacker without deep inspection of | ||
1421 | - the code. | ||
1422 | - | ||
1423 | - **Note** that assigning guests to a fixed set of physical cores affects | ||
1424 | - the ability of the scheduler to do load balancing and might have | ||
1425 | - negative effects on CPU utilization depending on the hosting | ||
1426 | - scenario. Disabling SMT might be a viable alternative for particular | ||
1427 | - scenarios. | ||
1428 | - | ||
1429 | - For further information about confining guests to a single or to a group | ||
1430 | - of cores consult the cpusets documentation: | ||
1431 | - | ||
1432 | - https://www.kernel.org/doc/Documentation/cgroup-v1/cpusets.txt | ||
1433 | - | ||
1434 | -.. _interrupt_isolation: | ||
1435 | - | ||
1436 | -3. Interrupt affinity | ||
1437 | -^^^^^^^^^^^^^^^^^^^^^ | ||
1438 | - | ||
1439 | - Interrupts can be made affine to logical CPUs. This is not universally | ||
1440 | - true because there are types of interrupts which are truly per CPU | ||
1441 | - interrupts, e.g. the local timer interrupt. Aside from that, multi-queue | ||
1442 | - devices affine their interrupts to single CPUs or groups of CPUs per | ||
1443 | - queue without allowing the administrator to control the affinities. | ||
1444 | - | ||
1445 | - Moving the interrupts, which can be affinity controlled, away from CPUs | ||
1446 | - which run untrusted guests, reduces the attack vector space. | ||
1447 | - | ||
1448 | - Whether the interrupts which are affine to CPUs, which run untrusted | ||
1449 | - guests, provide interesting data for an attacker depends on the system | ||
1450 | - configuration and the scenarios which run on the system. While for some | ||
1451 | - of the interrupts it can be assumed that they won't expose interesting | ||
1452 | - information beyond hints about the host OS memory layout, there | ||
1453 | - is no way to make general assumptions. | ||
1454 | - | ||
1455 | - Interrupt affinity can be controlled by the administrator via the | ||
1456 | - /proc/irq/$NR/smp_affinity[_list] files. Limited documentation is | ||
1457 | - available at: | ||
1458 | - | ||
1459 | - https://www.kernel.org/doc/Documentation/IRQ-affinity.txt | ||
1460 | - | ||
1461 | -.. _smt_control: | ||
1462 | - | ||
1463 | -4. SMT control | ||
1464 | -^^^^^^^^^^^^^^ | ||
1465 | - | ||
1466 | - To prevent the SMT issues of L1TF it might be necessary to disable SMT | ||
1467 | - completely. Disabling SMT can have a significant performance impact, but | ||
1468 | - the impact depends on the hosting scenario and the type of workloads. | ||
1469 | - The impact of disabling SMT also needs to be weighed against the impact | ||
1470 | - of other mitigation solutions like confining guests to dedicated cores. | ||
1471 | - | ||
1472 | - The kernel provides a sysfs interface to retrieve the status of SMT and | ||
1473 | - to control it. It also provides a kernel command line interface to | ||
1474 | - control SMT. | ||
1475 | - | ||
1476 | - The kernel command line interface consists of the following options: | ||
1477 | - | ||
1478 | - =========== ========================================================== | ||
1479 | - nosmt Affects the bring up of the secondary CPUs during boot. The | ||
1480 | - kernel tries to bring all present CPUs online during the | ||
1481 | - boot process. "nosmt" makes sure that from each physical | ||
1482 | - core only one - the so called primary (hyper) thread is | ||
1483 | - activated. Due to a design flaw of Intel processors related | ||
1484 | - to Machine Check Exceptions, the non-primary siblings have | ||
1485 | - to be brought up at least partially and are then shut down | ||
1486 | - again. "nosmt" can be undone via the sysfs interface. | ||
1487 | - | ||
1488 | - nosmt=force Has the same effect as "nosmt" but it does not allow | ||
1489 | - undoing the SMT disable via the sysfs interface. | ||
1490 | - =========== ========================================================== | ||
1491 | - | ||
1492 | - The sysfs interface provides two files: | ||
1493 | - | ||
1494 | - - /sys/devices/system/cpu/smt/control | ||
1495 | - - /sys/devices/system/cpu/smt/active | ||
1496 | - | ||
1497 | - /sys/devices/system/cpu/smt/control: | ||
1498 | - | ||
1499 | - This file allows reading out the SMT control state and provides the | ||
1500 | - ability to disable or (re)enable SMT. The possible states are: | ||
1501 | - | ||
1502 | - ============== =================================================== | ||
1503 | - on SMT is supported by the CPU and enabled. All | ||
1504 | - logical CPUs can be onlined and offlined without | ||
1505 | - restrictions. | ||
1506 | - | ||
1507 | - off SMT is supported by the CPU and disabled. Only | ||
1508 | - the so called primary SMT threads can be onlined | ||
1509 | - and offlined without restrictions. An attempt to | ||
1510 | - online a non-primary sibling is rejected. | ||
1511 | - | ||
1512 | - forceoff Same as 'off' but the state cannot be controlled. | ||
1513 | - Attempts to write to the control file are rejected. | ||
1514 | - | ||
1515 | - notsupported The processor does not support SMT. It's therefore | ||
1516 | - not affected by the SMT implications of L1TF. | ||
1517 | - Attempts to write to the control file are rejected. | ||
1518 | - ============== =================================================== | ||
1519 | - | ||
1520 | - The possible states which can be written into this file to control SMT | ||
1521 | - state are: | ||
1522 | - | ||
1523 | - - on | ||
1524 | - - off | ||
1525 | - - forceoff | ||
1526 | - | ||
1527 | - /sys/devices/system/cpu/smt/active: | ||
1528 | - | ||
1529 | - This file reports whether SMT is enabled and active, i.e. if on any | ||
1530 | - physical core two or more sibling threads are online. | ||
1531 | - | ||
1532 | - SMT control is also possible at boot time via the l1tf kernel command | ||
1533 | - line parameter in combination with L1D flush control. See | ||
1534 | - :ref:`mitigation_control_command_line`. | ||
1535 | - | ||
1536 | -5. Disabling EPT | ||
1537 | -^^^^^^^^^^^^^^^^ | ||
1538 | - | ||
1539 | - Disabling EPT for virtual machines provides full mitigation for L1TF even | ||
1540 | - with SMT enabled, because the effective page tables for guests are | ||
1541 | - managed and sanitized by the hypervisor. However, disabling EPT has a | ||
1542 | - significant performance impact, especially when the Meltdown mitigation | ||
1543 | - KPTI is enabled. | ||
1544 | - | ||
1545 | - EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter. | ||
1546 | - | ||
1547 | -There is ongoing research and development for new mitigation mechanisms to | ||
1548 | -address the performance impact of disabling SMT or EPT. | ||
1549 | - | ||
1550 | -.. _mitigation_control_command_line: | ||
1551 | - | ||
1552 | -Mitigation control on the kernel command line | ||
1553 | ---------------------------------------------- | ||
1554 | - | ||
1555 | -The kernel command line allows controlling the L1TF mitigations at boot | ||
1556 | -time with the option "l1tf=". The valid arguments for this option are: | ||
1557 | - | ||
1558 | - ============ ============================================================= | ||
1559 | - full Provides all available mitigations for the L1TF | ||
1560 | - vulnerability. Disables SMT and enables all mitigations in | ||
1561 | - the hypervisors, i.e. unconditional L1D flushing | ||
1562 | - | ||
1563 | - SMT control and L1D flush control via the sysfs interface | ||
1564 | - is still possible after boot. Hypervisors will issue a | ||
1565 | - warning when the first VM is started in a potentially | ||
1566 | - insecure configuration, i.e. SMT enabled or L1D flush | ||
1567 | - disabled. | ||
1568 | - | ||
1569 | - full,force Same as 'full', but disables SMT and L1D flush runtime | ||
1570 | - control. Implies the 'nosmt=force' command line option. | ||
1571 | - (i.e. sysfs control of SMT is disabled.) | ||
1572 | - | ||
1573 | - flush Leaves SMT enabled and enables the default hypervisor | ||
1574 | - mitigation, i.e. conditional L1D flushing | ||
1575 | - | ||
1576 | - SMT control and L1D flush control via the sysfs interface | ||
1577 | - is still possible after boot. Hypervisors will issue a | ||
1578 | - warning when the first VM is started in a potentially | ||
1579 | - insecure configuration, i.e. SMT enabled or L1D flush | ||
1580 | - disabled. | ||
1581 | - | ||
1582 | - flush,nosmt Disables SMT and enables the default hypervisor mitigation, | ||
1583 | - i.e. conditional L1D flushing. | ||
1584 | - | ||
1585 | - SMT control and L1D flush control via the sysfs interface | ||
1586 | - is still possible after boot. Hypervisors will issue a | ||
1587 | - warning when the first VM is started in a potentially | ||
1588 | - insecure configuration, i.e. SMT enabled or L1D flush | ||
1589 | - disabled. | ||
1590 | - | ||
1591 | - flush,nowarn Same as 'flush', but hypervisors will not warn when a VM is | ||
1592 | - started in a potentially insecure configuration. | ||
1593 | - | ||
1594 | - off Disables hypervisor mitigations and doesn't emit any | ||
1595 | - warnings. | ||
1596 | - ============ ============================================================= | ||
1597 | - | ||
1598 | -The default is 'flush'. For details about L1D flushing see :ref:`l1d_flush`. | ||
1599 | - | ||
1600 | - | ||
1601 | -.. _mitigation_control_kvm: | ||
1602 | - | ||
1603 | -Mitigation control for KVM - module parameter | ||
1604 | -------------------------------------------------------------- | ||
1605 | - | ||
1606 | -The KVM hypervisor mitigation mechanism, flushing the L1D cache when | ||
1607 | -entering a guest, can be controlled with a module parameter. | ||
1608 | - | ||
1609 | -The option/parameter is "kvm-intel.vmentry_l1d_flush=". It takes the | ||
1610 | -following arguments: | ||
1611 | - | ||
1612 | - ============ ============================================================== | ||
1613 | - always L1D cache flush on every VMENTER. | ||
1614 | - | ||
1615 | - cond Flush L1D on VMENTER only when the code between VMEXIT and | ||
1616 | - VMENTER can leak host memory which is considered | ||
1617 | - interesting to an attacker. This can still leak host memory | ||
1618 | - which allows e.g. determining the host's address space layout. | ||
1619 | - | ||
1620 | - never Disables the mitigation. | ||
1621 | - ============ ============================================================== | ||
1622 | - | ||
1623 | -The parameter can be provided on the kernel command line, as a module | ||
1624 | -parameter when loading the modules, and modified at runtime via the sysfs | ||
1625 | -file: | ||
1626 | - | ||
1627 | -/sys/module/kvm_intel/parameters/vmentry_l1d_flush | ||
1628 | - | ||
1629 | -The default is 'cond'. If 'l1tf=full,force' is given on the kernel command | ||
1630 | -line, then 'always' is enforced and the kvm-intel.vmentry_l1d_flush | ||
1631 | -module parameter is ignored and writes to the sysfs file are rejected. | ||
1632 | - | ||
1633 | - | ||
1634 | -Mitigation selection guide | ||
1635 | --------------------------- | ||
1636 | - | ||
1637 | -1. No virtualization in use | ||
1638 | -^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||
1639 | - | ||
1640 | - The system is protected by the kernel unconditionally and no further | ||
1641 | - action is required. | ||
1642 | - | ||
1643 | -2. Virtualization with trusted guests | ||
1644 | -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||
1645 | - | ||
1646 | - If the guest comes from a trusted source and the guest OS kernel is | ||
1647 | - guaranteed to have the L1TF mitigations in place the system is fully | ||
1648 | - protected against L1TF and no further action is required. | ||
1649 | - | ||
1650 | - To avoid the overhead of the default L1D flushing on VMENTER the | ||
1651 | - administrator can disable the flushing via the kernel command line and | ||
1652 | - sysfs control files. See :ref:`mitigation_control_command_line` and | ||
1653 | - :ref:`mitigation_control_kvm`. | ||
1654 | - | ||
1655 | - | ||
1656 | -3. Virtualization with untrusted guests | ||
1657 | -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||
1658 | - | ||
1659 | -3.1. SMT not supported or disabled | ||
1660 | -"""""""""""""""""""""""""""""""""" | ||
1661 | - | ||
1662 | - If SMT is not supported by the processor or disabled in the BIOS or by | ||
1663 | - the kernel, it's only required to enforce L1D flushing on VMENTER. | ||
1664 | - | ||
1665 | - Conditional L1D flushing is the default behaviour and can be tuned. See | ||
1666 | - :ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`. | ||
1667 | - | ||
1668 | -3.2. EPT not supported or disabled | ||
1669 | -"""""""""""""""""""""""""""""""""" | ||
1670 | - | ||
1671 | - If EPT is not supported by the processor or disabled in the hypervisor, | ||
1672 | - the system is fully protected. SMT can stay enabled and L1D flushing on | ||
1673 | - VMENTER is not required. | ||
1674 | - | ||
1675 | - EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter. | ||
1676 | - | ||
1677 | -3.3. SMT and EPT supported and active | ||
1678 | -""""""""""""""""""""""""""""""""""""" | ||
1679 | - | ||
1680 | - If SMT and EPT are supported and active then various degrees of | ||
1681 | - mitigations can be employed: | ||
1682 | - | ||
1683 | - - L1D flushing on VMENTER: | ||
1684 | - | ||
1685 | - L1D flushing on VMENTER is the minimal protection requirement, but it | ||
1686 | - is only potent in combination with other mitigation methods. | ||
1687 | - | ||
1688 | - Conditional L1D flushing is the default behaviour and can be tuned. See | ||
1689 | - :ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`. | ||
1690 | - | ||
1691 | - - Guest confinement: | ||
1692 | - | ||
1693 | - Confinement of guests to a single or a group of physical cores which | ||
1694 | - are not running any other processes, can reduce the attack surface | ||
1695 | - significantly, but interrupts, soft interrupts and kernel threads can | ||
1696 | - still expose valuable data to a potential attacker. See | ||
1697 | - :ref:`guest_confinement`. | ||
1698 | - | ||
1699 | - - Interrupt isolation: | ||
1700 | - | ||
1701 | - Isolating the guest CPUs from interrupts can reduce the attack surface | ||
1702 | - further, but still allows a malicious guest to explore a limited amount | ||
1703 | - of host physical memory. This can at least be used to gain knowledge | ||
1704 | - about the host address space layout. The interrupts which have a fixed | ||
1705 | - affinity to the CPUs which run the untrusted guests can, depending on | ||
1706 | - the scenario, still trigger soft interrupts and schedule kernel threads | ||
1707 | - which might expose valuable information. See | ||
1708 | - :ref:`interrupt_isolation`. | ||
1709 | - | ||
1710 | -The above three mitigation methods combined can provide protection to a | ||
1711 | -certain degree, but the risk of the remaining attack surface has to be | ||
1712 | -carefully analyzed. For full protection the following methods are | ||
1713 | -available: | ||
1714 | - | ||
1715 | - - Disabling SMT: | ||
1716 | - | ||
1717 | - Disabling SMT and enforcing the L1D flushing provides the maximum | ||
1718 | - amount of protection. This mitigation does not depend on any of the | ||
1719 | - above mitigation methods. | ||
1720 | - | ||
1721 | - SMT control and L1D flushing can be tuned by the command line | ||
1722 | - parameters 'nosmt', 'l1tf', 'kvm-intel.vmentry_l1d_flush' and at run | ||
1723 | - time with the matching sysfs control files. See :ref:`smt_control`, | ||
1724 | - :ref:`mitigation_control_command_line` and | ||
1725 | - :ref:`mitigation_control_kvm`. | ||
1726 | - | ||
1727 | - - Disabling EPT: | ||
1728 | - | ||
1729 | - Disabling EPT provides the maximum amount of protection as well. It does | ||
1730 | - not depend on any of the above mitigation methods. SMT can stay | ||
1731 | - enabled and L1D flushing is not required, but the performance impact is | ||
1732 | - significant. | ||
1733 | - | ||
1734 | - EPT can be disabled in the hypervisor via the 'kvm-intel.ept' | ||
1735 | - parameter. | ||
1736 | - | ||
1737 | -3.4. Nested virtual machines | ||
1738 | -"""""""""""""""""""""""""""" | ||
1739 | - | ||
1740 | -When nested virtualization is in use, three operating systems are involved: | ||
1741 | -the bare metal hypervisor, the nested hypervisor and the nested virtual | ||
1742 | -machine. VMENTER operations from the nested hypervisor into the nested | ||
1743 | -guest will always be processed by the bare metal hypervisor. If KVM is the | ||
1744 | -bare metal hypervisor it will: | ||
1745 | - | ||
1746 | - - Flush the L1D cache on every switch from the nested hypervisor to the | ||
1747 | - nested virtual machine, so that the nested hypervisor's secrets are not | ||
1748 | - exposed to the nested virtual machine; | ||
1749 | - | ||
1750 | - - Flush the L1D cache on every switch from the nested virtual machine to | ||
1751 | - the nested hypervisor; this is a complex operation, and flushing the L1D | ||
1752 | - cache prevents the bare metal hypervisor's secrets from being exposed to | ||
1753 | - nested virtual machine; | ||
1754 | - | ||
1755 | - - Instruct the nested hypervisor to not perform any L1D cache flush. This | ||
1756 | - is an optimization to avoid double L1D flushing. | ||
1757 | - | ||
1758 | - | ||
1759 | -.. _default_mitigations: | ||
1760 | - | ||
1761 | -Default mitigations | ||
1762 | -------------------- | ||
1763 | - | ||
1764 | - The kernel default mitigations for vulnerable processors are: | ||
1765 | - | ||
1766 | - - PTE inversion to protect against malicious user space. This is done | ||
1767 | - unconditionally and cannot be controlled. | ||
1768 | - | ||
1769 | - - L1D conditional flushing on VMENTER when EPT is enabled for | ||
1770 | - a guest. | ||
1771 | - | ||
1772 | - The kernel does not by default enforce the disabling of SMT, which leaves | ||
1773 | - SMT systems vulnerable when running untrusted guests with EPT enabled. | ||
1774 | - | ||
1775 | - The rationale for this choice is: | ||
1776 | - | ||
1777 | - - Force disabling SMT can break existing setups, especially with | ||
1778 | - unattended updates. | ||
1779 | - | ||
1780 | - - If regular users run untrusted guests on their machine, then L1TF is | ||
1781 | - just an add-on to other malware which might be embedded in an untrusted | ||
1782 | - guest, e.g. spam-bots or attacks on the local network. | ||
1783 | - | ||
1784 | - There is no technical way to prevent a user from running untrusted code | ||
1785 | - on their machines blindly. | ||
1786 | - | ||
1787 | - - It's technically extremely unlikely and from today's knowledge even | ||
1788 | - impossible that L1TF can be exploited via the most popular attack | ||
1789 | - mechanisms like JavaScript because these mechanisms have no way to | ||
1790 | - control PTEs. If that were possible and no other mitigation were | ||
1791 | - available, then the default might be different. | ||
1792 | - | ||
1793 | - - The administrators of cloud and hosting setups have to carefully | ||
1794 | - analyze the risk for their scenarios and make the appropriate | ||
1795 | - mitigation choices, which might even vary across their deployed | ||
1796 | - machines and also result in other changes of their overall setup. | ||
1797 | - There is no way for the kernel to provide a sensible default for these | ||
1798 | - kinds of scenarios. | ||
1799 | diff --git a/Documentation/spec_ctrl.txt b/Documentation/spec_ctrl.txt | ||
1800 | index 32f3d55c54b7..c4dbe6f7cdae 100644 | ||
1801 | --- a/Documentation/spec_ctrl.txt | ||
1802 | +++ b/Documentation/spec_ctrl.txt | ||
1803 | @@ -92,3 +92,12 @@ Speculation misfeature controls | ||
1804 | * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_ENABLE, 0, 0); | ||
1805 | * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_DISABLE, 0, 0); | ||
1806 | * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_FORCE_DISABLE, 0, 0); | ||
1807 | + | ||
1808 | +- PR_SPEC_INDIR_BRANCH: Indirect Branch Speculation in User Processes | ||
1809 | + (Mitigate Spectre V2 style attacks against user processes) | ||
1810 | + | ||
1811 | + Invocations: | ||
1812 | + * prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, 0, 0, 0); | ||
1813 | + * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_ENABLE, 0, 0); | ||
1814 | + * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_DISABLE, 0, 0); | ||
1815 | + * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_FORCE_DISABLE, 0, 0); | ||
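
A minimal sketch of these invocations in C (assumes a kernel with this
patch applied and UAPI headers that define PR_SPEC_INDIRECT_BRANCH):

    #include <stdio.h>
    #include <sys/prctl.h>
    #include <linux/prctl.h>

    int main(void)
    {
            /* Query the current indirect branch speculation state of
             * this task.
             */
            int state = prctl(PR_GET_SPECULATION_CTRL,
                              PR_SPEC_INDIRECT_BRANCH, 0, 0, 0);

            if (state < 0)
                    perror("PR_GET_SPECULATION_CTRL");
            else
                    printf("indirect branch speculation state: 0x%x\n",
                           state);

            /* Opt this task into the mitigation, i.e. disable indirect
             * branch speculation. Only effective in the prctl/seccomp
             * modes of spectre_v2_user=.
             */
            if (prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH,
                      PR_SPEC_DISABLE, 0, 0) < 0)
                    perror("PR_SET_SPECULATION_CTRL");

            return 0;
    }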
1816 | diff --git a/Documentation/x86/conf.py b/Documentation/x86/conf.py | ||
1817 | new file mode 100644 | ||
1818 | index 000000000000..33c5c3142e20 | ||
1819 | --- /dev/null | ||
1820 | +++ b/Documentation/x86/conf.py | ||
1821 | @@ -0,0 +1,10 @@ | ||
1822 | +# -*- coding: utf-8; mode: python -*- | ||
1823 | + | ||
1824 | +project = "X86 architecture specific documentation" | ||
1825 | + | ||
1826 | +tags.add("subproject") | ||
1827 | + | ||
1828 | +latex_documents = [ | ||
1829 | + ('index', 'x86.tex', project, | ||
1830 | + 'The kernel development community', 'manual'), | ||
1831 | +] | ||
1832 | diff --git a/Documentation/x86/index.rst b/Documentation/x86/index.rst | ||
1833 | new file mode 100644 | ||
1834 | index 000000000000..ef389dcf1b1d | ||
1835 | --- /dev/null | ||
1836 | +++ b/Documentation/x86/index.rst | ||
1837 | @@ -0,0 +1,8 @@ | ||
1838 | +========================== | ||
1839 | +x86 architecture specifics | ||
1840 | +========================== | ||
1841 | + | ||
1842 | +.. toctree:: | ||
1843 | + :maxdepth: 1 | ||
1844 | + | ||
1845 | + mds | ||
1846 | diff --git a/Documentation/x86/mds.rst b/Documentation/x86/mds.rst | ||
1847 | new file mode 100644 | ||
1848 | index 000000000000..534e9baa4e1d | ||
1849 | --- /dev/null | ||
1850 | +++ b/Documentation/x86/mds.rst | ||
1851 | @@ -0,0 +1,225 @@ | ||
1852 | +Microarchitectural Data Sampling (MDS) mitigation | ||
1853 | +================================================= | ||
1854 | + | ||
1855 | +.. _mds: | ||
1856 | + | ||
1857 | +Overview | ||
1858 | +-------- | ||
1859 | + | ||
1860 | +Microarchitectural Data Sampling (MDS) is a family of side channel attacks | ||
1861 | +on internal buffers in Intel CPUs. The variants are: | ||
1862 | + | ||
1863 | + - Microarchitectural Store Buffer Data Sampling (MSBDS) (CVE-2018-12126) | ||
1864 | + - Microarchitectural Fill Buffer Data Sampling (MFBDS) (CVE-2018-12130) | ||
1865 | + - Microarchitectural Load Port Data Sampling (MLPDS) (CVE-2018-12127) | ||
1866 | + - Microarchitectural Data Sampling Uncacheable Memory (MDSUM) (CVE-2019-11091) | ||
1867 | + | ||
1868 | +MSBDS leaks Store Buffer Entries which can be speculatively forwarded to a | ||
1869 | +dependent load (store-to-load forwarding) as an optimization. The forward | ||
1870 | +can also happen to a faulting or assisting load operation for a different | ||
1871 | +memory address, which can be exploited under certain conditions. Store | ||
1872 | +buffers are partitioned between Hyper-Threads so cross thread forwarding is | ||
1873 | +not possible. But if a thread enters or exits a sleep state, the store | ||
1874 | +buffer is repartitioned, which can expose data from one thread to the | ||
1875 | + | ||
1876 | +MFBDS leaks Fill Buffer Entries. Fill buffers are used internally to manage | ||
1877 | +L1 miss situations and to hold data which is returned or sent in response | ||
1878 | +to a memory or I/O operation. Fill buffers can forward data to a load | ||
1879 | +operation and also write data to the cache. When the fill buffer is | ||
1880 | +deallocated it can retain the stale data of the preceding operations which | ||
1881 | +can then be forwarded to a faulting or assisting load operation, which can | ||
1882 | +be exploited under certain conditions. Fill buffers are shared between | ||
1883 | +Hyper-Threads so cross thread leakage is possible. | ||
1884 | + | ||
1885 | +MLPDS leaks Load Port Data. Load ports are used to perform load operations | ||
1886 | +from memory or I/O. The received data is then forwarded to the register | ||
1887 | +file or a subsequent operation. In some implementations the Load Port can | ||
1888 | +contain stale data from a previous operation which can be forwarded to | ||
1889 | +faulting or assisting loads under certain conditions, which again can be | ||
1890 | +exploited eventually. Load ports are shared between Hyper-Threads so cross | ||
1891 | +thread leakage is possible. | ||
1892 | + | ||
1893 | +MDSUM is a special case of MSBDS, MFBDS and MLPDS. An uncacheable load from | ||
1894 | +memory that takes a fault or assist can leave data in a microarchitectural | ||
1895 | +structure that may later be observed using one of the same methods used by | ||
1896 | +MSBDS, MFBDS or MLPDS. | ||
1897 | + | ||
1898 | +Exposure assumptions | ||
1899 | +-------------------- | ||
1900 | + | ||
1901 | +It is assumed that attack code resides in user space or in a guest, with one | ||
1902 | +exception. The rationale behind this assumption is that the code construct | ||
1903 | +needed for exploiting MDS requires: | ||
1904 | + | ||
1905 | + - to control the load to trigger a fault or assist | ||
1906 | + | ||
1907 | + - to have a disclosure gadget which exposes the speculatively accessed | ||
1908 | + data for consumption through a side channel | ||
1909 | + | ||
1910 | + - to control the pointer through which the disclosure gadget exposes the | ||
1911 | + data | ||
1912 | + | ||
1913 | +The existence of such a construct in the kernel cannot be excluded with | ||
1914 | +100% certainty, but the complexity involved makes it extremely unlikely. | ||
1915 | + | ||
1916 | +There is one exception, which is untrusted BPF. The functionality of | ||
1917 | +untrusted BPF is limited, but it needs to be thoroughly investigated | ||
1918 | +whether it can be used to create such a construct. | ||
1919 | + | ||
1920 | + | ||
1921 | +Mitigation strategy | ||
1922 | +------------------- | ||
1923 | + | ||
1924 | +All variants have the same mitigation strategy at least for the single CPU | ||
1925 | +thread case (SMT off): Force the CPU to clear the affected buffers. | ||
1926 | + | ||
1927 | +This is achieved by using the otherwise unused and obsolete VERW | ||
1928 | +instruction in combination with a microcode update. The microcode clears | ||
1929 | +the affected CPU buffers when the VERW instruction is executed. | ||
1930 | + | ||
1931 | +For virtualization there are two ways to achieve CPU buffer | ||
1932 | +clearing: either via the modified VERW instruction or via the L1D Flush | ||
1933 | +command. The latter is issued when L1TF mitigation is enabled so the extra | ||
1934 | +VERW can be avoided. If the CPU is not affected by L1TF then VERW needs to | ||
1935 | +be issued. | ||
1936 | + | ||
1937 | +If the VERW instruction with the supplied segment selector argument is | ||
1938 | +executed on a CPU without the microcode update, there is no side effect | ||
1939 | +other than a small number of pointlessly wasted CPU cycles. | ||
1940 | + | ||
1941 | +This does not protect against cross Hyper-Thread attacks except for MSBDS | ||
1942 | +which is only exploitable cross Hyper-thread when one of the Hyper-Threads | ||
1943 | +enters a C-state. | ||
1944 | + | ||
1945 | +The kernel provides a function to invoke the buffer clearing: | ||
1946 | + | ||
1947 | + mds_clear_cpu_buffers() | ||
1948 | + | ||
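Elsewhere in this patch the helper is added to
arch/x86/include/asm/nospec-branch.h; in essence it is a single VERW
with a memory operand (a sketch of that helper):

    static inline void mds_clear_cpu_buffers(void)
    {
            static const u16 ds = __KERNEL_DS;

            /*
             * The memory operand variant of VERW is required; per the
             * microcode update documentation only that form triggers
             * the buffer clearing. Any valid writable data segment
             * selector works. "cc" is clobbered because VERW
             * modifies ZF.
             */
            asm volatile("verw %[ds]" : : [ds] "m" (ds) : "cc");
    }
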
1949 | +The mitigation is invoked on kernel/userspace, hypervisor/guest and C-state | ||
1950 | +(idle) transitions. | ||
1951 | + | ||
1952 | +As a special quirk to address virtualization scenarios where the host has | ||
1953 | +the microcode updated, but the hypervisor does not (yet) expose the | ||
1954 | +MD_CLEAR CPUID bit to guests, the kernel issues the VERW instruction in the | ||
1955 | +hope that it might actually clear the buffers. The state is reflected | ||
1956 | +accordingly. | ||
1957 | + | ||
1958 | +According to current knowledge additional mitigations inside the kernel | ||
1959 | +itself are not required because the necessary gadgets to expose the leaked | ||
1960 | +data cannot be controlled in a way which allows exploitation from malicious | ||
1961 | +user space or VM guests. | ||
1962 | + | ||
1963 | +Kernel internal mitigation modes | ||
1964 | +-------------------------------- | ||
1965 | + | ||
1966 | + ======= ============================================================ | ||
1967 | + off Mitigation is disabled. Either the CPU is not affected or | ||
1968 | + mds=off is supplied on the kernel command line | ||
1969 | + | ||
1970 | + full Mitigation is enabled. CPU is affected and MD_CLEAR is | ||
1971 | + advertised in CPUID. | ||
1972 | + | ||
1973 | + vmwerv Mitigation is enabled. CPU is affected and MD_CLEAR is not | ||
1974 | + advertised in CPUID. That is mainly for virtualization | ||
1975 | + scenarios where the host has the updated microcode but the | ||
1976 | + hypervisor does not expose MD_CLEAR in CPUID. It's a best | ||
1977 | + effort approach without guarantee. | ||
1978 | + ======= ============================================================ | ||
1979 | + | ||
1980 | +If the CPU is affected and mds=off is not supplied on the kernel command | ||
1981 | +line then the kernel selects the appropriate mitigation mode depending on | ||
1982 | +the availability of the MD_CLEAR CPUID bit. | ||
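
The selection boils down to a few lines; a sketch mirroring the
mds_select_mitigation() logic this patch series adds to
arch/x86/kernel/cpu/bugs.c:

    static void __init mds_select_mitigation(void)
    {
            if (!boot_cpu_has_bug(X86_BUG_MDS)) {
                    mds_mitigation = MDS_MITIGATION_OFF;
                    return;
            }

            if (mds_mitigation == MDS_MITIGATION_FULL) {
                    /* MD_CLEAR not advertised: best effort (vmwerv) mode. */
                    if (!boot_cpu_has(X86_FEATURE_MD_CLEAR))
                            mds_mitigation = MDS_MITIGATION_VMWERV;
                    static_branch_enable(&mds_user_clear);
            }
    }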
1983 | + | ||
1984 | +Mitigation points | ||
1985 | +----------------- | ||
1986 | + | ||
1987 | +1. Return to user space | ||
1988 | +^^^^^^^^^^^^^^^^^^^^^^^ | ||
1989 | + | ||
1990 | + When transitioning from kernel to user space the CPU buffers are flushed | ||
1991 | + on affected CPUs when the mitigation is not disabled on the kernel | ||
1992 | + command line. The mitigation is enabled through the static key | ||
1993 | + mds_user_clear. | ||
1994 | + | ||
1995 | + The mitigation is invoked in prepare_exit_to_usermode() which covers | ||
1996 | + most of the kernel to user space transitions. There are a few exceptions | ||
1997 | + which are not invoking prepare_exit_to_usermode() on return to user | ||
1998 | + space. These exceptions use the paranoid exit code. | ||
1999 | + | ||
2000 | + - Non Maskable Interrupt (NMI): | ||
2001 | + | ||
2002 | + Access to sensitive data like keys or credentials in the NMI context | ||
2003 | + is mostly theoretical: the CPU can do prefetching or execute a | ||
2004 | + misspeculated code path and thereby fetch data which might end up | ||
2005 | + leaking through a buffer. | ||
2006 | + | ||
2007 | + But for mounting other attacks the kernel stack address of the task is | ||
2008 | + already valuable information. So in full mitigation mode, the NMI is | ||
2009 | + mitigated on the return from do_nmi() to provide almost complete | ||
2010 | + coverage. | ||
2011 | + | ||
2012 | + - Double fault (#DF): | ||
2013 | + | ||
2014 | + A double fault is usually fatal, but the ESPFIX workaround, which can | ||
2015 | + be triggered from user space through modify_ldt(2), is a recoverable | ||
2016 | + double fault. #DF uses the paranoid exit path, so explicit mitigation | ||
2017 | + in the double fault handler is required. | ||
2018 | + | ||
2019 | + - Machine Check Exception (#MC): | ||
2020 | + | ||
2021 | + Another corner case is a #MC which hits between the CPU buffer clear | ||
2022 | + invocation and the actual return to user. As this still is in kernel | ||
2023 | + space it takes the paranoid exit path which does not clear the CPU | ||
2024 | + buffers. So the #MC handler repopulates the buffers to some | ||
2025 | + extent. Machine checks are not reliably controllable and the window is | ||
2026 | + extremely small so mitigation would just tick a checkbox that this | ||
2027 | + theoretical corner case is covered. To keep the amount of special | ||
2028 | + cases small, ignore #MC. | ||
2029 | + | ||
2030 | + - Debug Exception (#DB): | ||
2031 | + | ||
2032 | + This takes the paranoid exit path only when the INT1 breakpoint is in | ||
2033 | + kernel space. #DB on a user space address takes the regular exit path, | ||
2034 | + so no extra mitigation is required. | ||
2035 | + | ||
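For reference, the exit-to-usermode hook added by this patch is a
static-key-guarded wrapper, essentially:

    static inline void mds_user_clear_cpu_buffers(void)
    {
            /* Pay for the VERW only when the mitigation is enabled. */
            if (static_branch_likely(&mds_user_clear))
                    mds_clear_cpu_buffers();
    }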
2036 | + | ||
2037 | +2. C-State transition | ||
2038 | +^^^^^^^^^^^^^^^^^^^^^ | ||
2039 | + | ||
2040 | + When a CPU goes idle and enters a C-State the CPU buffers need to be | ||
2041 | + cleared on affected CPUs when SMT is active. This addresses the | ||
2042 | + repartitioning of the store buffer when one of the Hyper-Threads enters | ||
2043 | + a C-State. | ||
2044 | + | ||
2045 | + When SMT is inactive, i.e. either the CPU does not support it or all | ||
2046 | + sibling threads are offline, CPU buffer clearing is not required. | ||
2047 | + | ||
2048 | + The idle clearing is enabled on CPUs which are only affected by MSBDS | ||
2049 | + and not by any other MDS variant. The other MDS variants cannot be | ||
2050 | + protected against cross Hyper-Thread attacks because the Fill Buffer and | ||
2051 | + the Load Ports are shared. So on CPUs affected by other variants, the | ||
2052 | + idle clearing would be a window dressing exercise and is therefore not | ||
2053 | + activated. | ||
2054 | + | ||
2055 | + The invocation is controlled by the static key mds_idle_clear which is | ||
2056 | + switched depending on the chosen mitigation mode and the SMT state of | ||
2057 | + the system. | ||
2058 | + | ||
2059 | + The buffer clear is only invoked before entering the C-State to prevent | ||
2060 | + stale data from the idling CPU from spilling to the Hyper-Thread | ||
2061 | + sibling after the store buffer is repartitioned and all entries become | ||
2062 | + available to the non-idle sibling. | ||
2063 | + | ||
2064 | + When coming out of idle the store buffer is partitioned again so each | ||
2065 | + sibling has half of it available. The CPU coming back from idle could | ||
2066 | + then be speculatively exposed to the contents of the sibling. The buffers are | ||
2067 | + flushed either on exit to user space or on VMENTER so malicious code | ||
2068 | + in user space or the guest cannot speculatively access them. | ||
2069 | + | ||
2070 | + The mitigation is hooked into all variants of halt()/mwait(), but does | ||
2071 | + not cover the legacy ACPI IO-Port mechanism because the ACPI idle driver | ||
2072 | + was superseded by the intel_idle driver around 2010, and intel_idle is | ||
2073 | + preferred on all affected CPUs which are expected to gain the MD_CLEAR | ||
2074 | + functionality in microcode. Aside from that, the IO-Port mechanism is a | ||
2075 | + legacy interface which is only used on older systems which are either | ||
2076 | + not affected or do not receive microcode updates anymore. | ||
2077 | diff --git a/Makefile b/Makefile | ||
2078 | index e52b0579e176..92fe701e5582 100644 | ||
2079 | --- a/Makefile | ||
2080 | +++ b/Makefile | ||
2081 | @@ -1,6 +1,6 @@ | ||
2082 | VERSION = 4 | ||
2083 | PATCHLEVEL = 9 | ||
2084 | -SUBLEVEL = 175 | ||
2085 | +SUBLEVEL = 176 | ||
2086 | EXTRAVERSION = | ||
2087 | NAME = Roaring Lionus | ||
2088 | |||
2089 | diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig | ||
2090 | index 5a4591ff8407..e0055b4302d6 100644 | ||
2091 | --- a/arch/x86/Kconfig | ||
2092 | +++ b/arch/x86/Kconfig | ||
2093 | @@ -937,13 +937,7 @@ config NR_CPUS | ||
2094 | approximately eight kilobytes to the kernel image. | ||
2095 | |||
2096 | config SCHED_SMT | ||
2097 | - bool "SMT (Hyperthreading) scheduler support" | ||
2098 | - depends on SMP | ||
2099 | - ---help--- | ||
2100 | - SMT scheduler support improves the CPU scheduler's decision making | ||
2101 | - when dealing with Intel Pentium 4 chips with HyperThreading at a | ||
2102 | - cost of slightly increased overhead in some places. If unsure say | ||
2103 | - N here. | ||
2104 | + def_bool y if SMP | ||
2105 | |||
2106 | config SCHED_MC | ||
2107 | def_bool y | ||
2108 | diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c | ||
2109 | index b0cd306dc527..8841d016b4a4 100644 | ||
2110 | --- a/arch/x86/entry/common.c | ||
2111 | +++ b/arch/x86/entry/common.c | ||
2112 | @@ -28,6 +28,7 @@ | ||
2113 | #include <asm/vdso.h> | ||
2114 | #include <asm/uaccess.h> | ||
2115 | #include <asm/cpufeature.h> | ||
2116 | +#include <asm/nospec-branch.h> | ||
2117 | |||
2118 | #define CREATE_TRACE_POINTS | ||
2119 | #include <trace/events/syscalls.h> | ||
2120 | @@ -206,6 +207,8 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) | ||
2121 | #endif | ||
2122 | |||
2123 | user_enter_irqoff(); | ||
2124 | + | ||
2125 | + mds_user_clear_cpu_buffers(); | ||
2126 | } | ||
2127 | |||
2128 | #define SYSCALL_EXIT_WORK_FLAGS \ | ||
2129 | diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c | ||
2130 | index a30829052a00..cb8178a2783a 100644 | ||
2131 | --- a/arch/x86/events/intel/core.c | ||
2132 | +++ b/arch/x86/events/intel/core.c | ||
2133 | @@ -3750,11 +3750,11 @@ __init int intel_pmu_init(void) | ||
2134 | pr_cont("Nehalem events, "); | ||
2135 | break; | ||
2136 | |||
2137 | - case INTEL_FAM6_ATOM_PINEVIEW: | ||
2138 | - case INTEL_FAM6_ATOM_LINCROFT: | ||
2139 | - case INTEL_FAM6_ATOM_PENWELL: | ||
2140 | - case INTEL_FAM6_ATOM_CLOVERVIEW: | ||
2141 | - case INTEL_FAM6_ATOM_CEDARVIEW: | ||
2142 | + case INTEL_FAM6_ATOM_BONNELL: | ||
2143 | + case INTEL_FAM6_ATOM_BONNELL_MID: | ||
2144 | + case INTEL_FAM6_ATOM_SALTWELL: | ||
2145 | + case INTEL_FAM6_ATOM_SALTWELL_MID: | ||
2146 | + case INTEL_FAM6_ATOM_SALTWELL_TABLET: | ||
2147 | memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, | ||
2148 | sizeof(hw_cache_event_ids)); | ||
2149 | |||
2150 | @@ -3766,9 +3766,11 @@ __init int intel_pmu_init(void) | ||
2151 | pr_cont("Atom events, "); | ||
2152 | break; | ||
2153 | |||
2154 | - case INTEL_FAM6_ATOM_SILVERMONT1: | ||
2155 | - case INTEL_FAM6_ATOM_SILVERMONT2: | ||
2156 | + case INTEL_FAM6_ATOM_SILVERMONT: | ||
2157 | + case INTEL_FAM6_ATOM_SILVERMONT_X: | ||
2158 | + case INTEL_FAM6_ATOM_SILVERMONT_MID: | ||
2159 | case INTEL_FAM6_ATOM_AIRMONT: | ||
2160 | + case INTEL_FAM6_ATOM_AIRMONT_MID: | ||
2161 | memcpy(hw_cache_event_ids, slm_hw_cache_event_ids, | ||
2162 | sizeof(hw_cache_event_ids)); | ||
2163 | memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs, | ||
2164 | @@ -3785,7 +3787,7 @@ __init int intel_pmu_init(void) | ||
2165 | break; | ||
2166 | |||
2167 | case INTEL_FAM6_ATOM_GOLDMONT: | ||
2168 | - case INTEL_FAM6_ATOM_DENVERTON: | ||
2169 | + case INTEL_FAM6_ATOM_GOLDMONT_X: | ||
2170 | memcpy(hw_cache_event_ids, glm_hw_cache_event_ids, | ||
2171 | sizeof(hw_cache_event_ids)); | ||
2172 | memcpy(hw_cache_extra_regs, glm_hw_cache_extra_regs, | ||
2173 | diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c | ||
2174 | index 47d526c700a1..72d09340c24d 100644 | ||
2175 | --- a/arch/x86/events/intel/cstate.c | ||
2176 | +++ b/arch/x86/events/intel/cstate.c | ||
2177 | @@ -531,8 +531,8 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { | ||
2178 | |||
2179 | X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_ULT, hswult_cstates), | ||
2180 | |||
2181 | - X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT1, slm_cstates), | ||
2182 | - X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT2, slm_cstates), | ||
2183 | + X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT, slm_cstates), | ||
2184 | + X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT_X, slm_cstates), | ||
2185 | X86_CSTATES_MODEL(INTEL_FAM6_ATOM_AIRMONT, slm_cstates), | ||
2186 | |||
2187 | X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_CORE, snb_cstates), | ||
2188 | diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c | ||
2189 | index be0b1968d60a..68144a341903 100644 | ||
2190 | --- a/arch/x86/events/msr.c | ||
2191 | +++ b/arch/x86/events/msr.c | ||
2192 | @@ -61,8 +61,8 @@ static bool test_intel(int idx) | ||
2193 | case INTEL_FAM6_BROADWELL_GT3E: | ||
2194 | case INTEL_FAM6_BROADWELL_X: | ||
2195 | |||
2196 | - case INTEL_FAM6_ATOM_SILVERMONT1: | ||
2197 | - case INTEL_FAM6_ATOM_SILVERMONT2: | ||
2198 | + case INTEL_FAM6_ATOM_SILVERMONT: | ||
2199 | + case INTEL_FAM6_ATOM_SILVERMONT_X: | ||
2200 | case INTEL_FAM6_ATOM_AIRMONT: | ||
2201 | if (idx == PERF_MSR_SMI) | ||
2202 | return true; | ||
2203 | diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h | ||
2204 | index 98444b77fbe3..06de338be0d8 100644 | ||
2205 | --- a/arch/x86/include/asm/cpufeatures.h | ||
2206 | +++ b/arch/x86/include/asm/cpufeatures.h | ||
2207 | @@ -271,10 +271,12 @@ | ||
2208 | /* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */ | ||
2209 | #define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */ | ||
2210 | #define X86_FEATURE_IRPERF (13*32+1) /* Instructions Retired Count */ | ||
2211 | -#define X86_FEATURE_AMD_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */ | ||
2212 | -#define X86_FEATURE_AMD_IBRS (13*32+14) /* Indirect Branch Restricted Speculation */ | ||
2213 | -#define X86_FEATURE_AMD_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */ | ||
2214 | +#define X86_FEATURE_AMD_IBPB (13*32+12) /* "" Indirect Branch Prediction Barrier */ | ||
2215 | +#define X86_FEATURE_AMD_IBRS (13*32+14) /* "" Indirect Branch Restricted Speculation */ | ||
2216 | +#define X86_FEATURE_AMD_STIBP (13*32+15) /* "" Single Thread Indirect Branch Predictors */ | ||
2217 | +#define X86_FEATURE_AMD_SSBD (13*32+24) /* "" Speculative Store Bypass Disable */ | ||
2218 | #define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */ | ||
2219 | +#define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */ | ||
2220 | |||
2221 | /* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */ | ||
2222 | #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ | ||
2223 | @@ -315,6 +317,7 @@ | ||
2224 | #define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */ | ||
2225 | #define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */ | ||
2226 | #define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */ | ||
2227 | +#define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */ | ||
2228 | #define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ | ||
2229 | #define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ | ||
2230 | #define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */ | ||
2231 | @@ -352,5 +355,7 @@ | ||
2232 | #define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */ | ||
2233 | #define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* CPU is affected by speculative store bypass attack */ | ||
2234 | #define X86_BUG_L1TF X86_BUG(18) /* CPU is affected by L1 Terminal Fault */ | ||
2235 | +#define X86_BUG_MDS X86_BUG(19) /* CPU is affected by Microarchitectural data sampling */ | ||
2236 | +#define X86_BUG_MSBDS_ONLY X86_BUG(20) /* CPU is only affected by the MSBDS variant of BUG_MDS */ | ||
2237 | |||
2238 | #endif /* _ASM_X86_CPUFEATURES_H */ | ||
2239 | diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h | ||
2240 | index 75b748a1deb8..ba7b6f736414 100644 | ||
2241 | --- a/arch/x86/include/asm/intel-family.h | ||
2242 | +++ b/arch/x86/include/asm/intel-family.h | ||
2243 | @@ -50,19 +50,23 @@ | ||
2244 | |||
2245 | /* "Small Core" Processors (Atom) */ | ||
2246 | |||
2247 | -#define INTEL_FAM6_ATOM_PINEVIEW 0x1C | ||
2248 | -#define INTEL_FAM6_ATOM_LINCROFT 0x26 | ||
2249 | -#define INTEL_FAM6_ATOM_PENWELL 0x27 | ||
2250 | -#define INTEL_FAM6_ATOM_CLOVERVIEW 0x35 | ||
2251 | -#define INTEL_FAM6_ATOM_CEDARVIEW 0x36 | ||
2252 | -#define INTEL_FAM6_ATOM_SILVERMONT1 0x37 /* BayTrail/BYT / Valleyview */ | ||
2253 | -#define INTEL_FAM6_ATOM_SILVERMONT2 0x4D /* Avaton/Rangely */ | ||
2254 | -#define INTEL_FAM6_ATOM_AIRMONT 0x4C /* CherryTrail / Braswell */ | ||
2255 | -#define INTEL_FAM6_ATOM_MERRIFIELD 0x4A /* Tangier */ | ||
2256 | -#define INTEL_FAM6_ATOM_MOOREFIELD 0x5A /* Anniedale */ | ||
2257 | -#define INTEL_FAM6_ATOM_GOLDMONT 0x5C | ||
2258 | -#define INTEL_FAM6_ATOM_DENVERTON 0x5F /* Goldmont Microserver */ | ||
2259 | -#define INTEL_FAM6_ATOM_GEMINI_LAKE 0x7A | ||
2260 | +#define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */ | ||
2261 | +#define INTEL_FAM6_ATOM_BONNELL_MID 0x26 /* Silverthorne, Lincroft */ | ||
2262 | + | ||
2263 | +#define INTEL_FAM6_ATOM_SALTWELL 0x36 /* Cedarview */ | ||
2264 | +#define INTEL_FAM6_ATOM_SALTWELL_MID 0x27 /* Penwell */ | ||
2265 | +#define INTEL_FAM6_ATOM_SALTWELL_TABLET 0x35 /* Cloverview */ | ||
2266 | + | ||
2267 | +#define INTEL_FAM6_ATOM_SILVERMONT 0x37 /* Bay Trail, Valleyview */ | ||
2268 | +#define INTEL_FAM6_ATOM_SILVERMONT_X 0x4D /* Avaton, Rangely */ | ||
2269 | +#define INTEL_FAM6_ATOM_SILVERMONT_MID 0x4A /* Merrifield */ | ||
2270 | + | ||
2271 | +#define INTEL_FAM6_ATOM_AIRMONT 0x4C /* Cherry Trail, Braswell */ | ||
2272 | +#define INTEL_FAM6_ATOM_AIRMONT_MID 0x5A /* Moorefield */ | ||
2273 | + | ||
2274 | +#define INTEL_FAM6_ATOM_GOLDMONT 0x5C /* Apollo Lake */ | ||
2275 | +#define INTEL_FAM6_ATOM_GOLDMONT_X 0x5F /* Denverton */ | ||
2276 | +#define INTEL_FAM6_ATOM_GOLDMONT_PLUS 0x7A /* Gemini Lake */ | ||
2277 | |||
2278 | /* Xeon Phi */ | ||
2279 | |||
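
The rename above keeps every raw family-6 model number and only switches to a uniform microarchitecture-plus-variant naming scheme. As a minimal standalone illustration (a hypothetical user-space program, not code from this patch), the same model numbers can be classified at run time from CPUID leaf 1; the table reuses the codename comments from the hunk:

    /* Hypothetical sketch: map family-6 Atom model numbers to the
     * microarchitecture names introduced above. <cpuid.h> is the
     * GCC/Clang builtin header providing __get_cpuid(). */
    #include <stdio.h>
    #include <cpuid.h>

    static const struct { unsigned int model; const char *name; } atoms[] = {
            { 0x1C, "BONNELL (Diamondville, Pineview)" },
            { 0x37, "SILVERMONT (Bay Trail, Valleyview)" },
            { 0x4D, "SILVERMONT_X (Avaton, Rangely)" },
            { 0x4C, "AIRMONT (Cherry Trail, Braswell)" },
            { 0x5C, "GOLDMONT (Apollo Lake)" },
            { 0x5F, "GOLDMONT_X (Denverton)" },
            { 0x7A, "GOLDMONT_PLUS (Gemini Lake)" },
    };

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx, family, model, i;

            if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
                    return 1;
            family = (eax >> 8) & 0xf;
            model  = (eax >> 4) & 0xf;
            if (family == 6 || family == 15)
                    model |= ((eax >> 16) & 0xf) << 4;  /* extended model */
            printf("family 0x%x, model 0x%02x\n", family, model);
            for (i = 0; i < sizeof(atoms) / sizeof(atoms[0]); i++)
                    if (family == 6 && model == atoms[i].model)
                            printf("-> INTEL_FAM6_ATOM_%s\n", atoms[i].name);
            return 0;
    }
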
2280 | diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h | ||
2281 | index 508a062e6cf1..0c8f4281b151 100644 | ||
2282 | --- a/arch/x86/include/asm/irqflags.h | ||
2283 | +++ b/arch/x86/include/asm/irqflags.h | ||
2284 | @@ -5,6 +5,8 @@ | ||
2285 | |||
2286 | #ifndef __ASSEMBLY__ | ||
2287 | |||
2288 | +#include <asm/nospec-branch.h> | ||
2289 | + | ||
2290 | /* Provide __cpuidle; we can't safely include <linux/cpu.h> */ | ||
2291 | #define __cpuidle __attribute__((__section__(".cpuidle.text"))) | ||
2292 | |||
2293 | @@ -53,11 +55,13 @@ static inline void native_irq_enable(void) | ||
2294 | |||
2295 | static inline __cpuidle void native_safe_halt(void) | ||
2296 | { | ||
2297 | + mds_idle_clear_cpu_buffers(); | ||
2298 | asm volatile("sti; hlt": : :"memory"); | ||
2299 | } | ||
2300 | |||
2301 | static inline __cpuidle void native_halt(void) | ||
2302 | { | ||
2303 | + mds_idle_clear_cpu_buffers(); | ||
2304 | asm volatile("hlt": : :"memory"); | ||
2305 | } | ||
2306 | |||
2307 | diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h | ||
2308 | index 5e69154c9f07..a61ec81b27db 100644 | ||
2309 | --- a/arch/x86/include/asm/microcode_intel.h | ||
2310 | +++ b/arch/x86/include/asm/microcode_intel.h | ||
2311 | @@ -52,6 +52,21 @@ struct extended_sigtable { | ||
2312 | |||
2313 | #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) | ||
2314 | |||
2315 | +static inline u32 intel_get_microcode_revision(void) | ||
2316 | +{ | ||
2317 | + u32 rev, dummy; | ||
2318 | + | ||
2319 | + native_wrmsrl(MSR_IA32_UCODE_REV, 0); | ||
2320 | + | ||
2321 | + /* As documented in the SDM: Do a CPUID 1 here */ | ||
2322 | + sync_core(); | ||
2323 | + | ||
2324 | + /* get the current revision from MSR 0x8B */ | ||
2325 | + native_rdmsr(MSR_IA32_UCODE_REV, dummy, rev); | ||
2326 | + | ||
2327 | + return rev; | ||
2328 | +} | ||
2329 | + | ||
2330 | extern int has_newer_microcode(void *mc, unsigned int csig, int cpf, int rev); | ||
2331 | extern int microcode_sanity_check(void *mc, int print_err); | ||
2332 | extern int find_matching_signature(void *mc, unsigned int csig, int cpf); | ||
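
intel_get_microcode_revision() follows the SDM procedure: write 0 to MSR_IA32_UCODE_REV, serialize with CPUID so the microcode latches its revision, then read the high 32 bits of the MSR. For merely querying the already-latched value, a hypothetical user-space sketch (not part of the patch) can read the same MSR through the msr driver; it needs root and a loaded msr module, and it skips the write-0/CPUID refresh step the kernel helper performs:

    /* Hypothetical sketch: read the latched microcode revision from
     * MSR 0x8B (MSR_IA32_UCODE_REV) via /dev/cpu/0/msr. The msr
     * driver interprets the pread offset as the MSR number. */
    #include <stdio.h>
    #include <stdint.h>
    #include <fcntl.h>
    #include <unistd.h>

    int main(void)
    {
            uint64_t val;
            int fd = open("/dev/cpu/0/msr", O_RDONLY);

            if (fd < 0 || pread(fd, &val, sizeof(val), 0x8b) != sizeof(val)) {
                    perror("MSR_IA32_UCODE_REV");
                    return 1;
            }
            /* The revision is the high half (EDX of rdmsr). */
            printf("microcode revision: 0x%x\n", (unsigned int)(val >> 32));
            close(fd);
            return 0;
    }
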
2333 | diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h | ||
2334 | index 9963e21ac443..38f94d07920d 100644 | ||
2335 | --- a/arch/x86/include/asm/msr-index.h | ||
2336 | +++ b/arch/x86/include/asm/msr-index.h | ||
2337 | @@ -1,6 +1,8 @@ | ||
2338 | #ifndef _ASM_X86_MSR_INDEX_H | ||
2339 | #define _ASM_X86_MSR_INDEX_H | ||
2340 | |||
2341 | +#include <linux/bits.h> | ||
2342 | + | ||
2343 | /* | ||
2344 | * CPU model specific register (MSR) numbers. | ||
2345 | * | ||
2346 | @@ -38,13 +40,14 @@ | ||
2347 | |||
2348 | /* Intel MSRs. Some also available on other CPUs */ | ||
2349 | #define MSR_IA32_SPEC_CTRL 0x00000048 /* Speculation Control */ | ||
2350 | -#define SPEC_CTRL_IBRS (1 << 0) /* Indirect Branch Restricted Speculation */ | ||
2351 | -#define SPEC_CTRL_STIBP (1 << 1) /* Single Thread Indirect Branch Predictors */ | ||
2352 | +#define SPEC_CTRL_IBRS BIT(0) /* Indirect Branch Restricted Speculation */ | ||
2353 | +#define SPEC_CTRL_STIBP_SHIFT 1 /* Single Thread Indirect Branch Predictor (STIBP) bit */ | ||
2354 | +#define SPEC_CTRL_STIBP BIT(SPEC_CTRL_STIBP_SHIFT) /* STIBP mask */ | ||
2355 | #define SPEC_CTRL_SSBD_SHIFT 2 /* Speculative Store Bypass Disable bit */ | ||
2356 | -#define SPEC_CTRL_SSBD (1 << SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */ | ||
2357 | +#define SPEC_CTRL_SSBD BIT(SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */ | ||
2358 | |||
2359 | #define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */ | ||
2360 | -#define PRED_CMD_IBPB (1 << 0) /* Indirect Branch Prediction Barrier */ | ||
2361 | +#define PRED_CMD_IBPB BIT(0) /* Indirect Branch Prediction Barrier */ | ||
2362 | |||
2363 | #define MSR_IA32_PERFCTR0 0x000000c1 | ||
2364 | #define MSR_IA32_PERFCTR1 0x000000c2 | ||
2365 | @@ -61,20 +64,25 @@ | ||
2366 | #define MSR_MTRRcap 0x000000fe | ||
2367 | |||
2368 | #define MSR_IA32_ARCH_CAPABILITIES 0x0000010a | ||
2369 | -#define ARCH_CAP_RDCL_NO (1 << 0) /* Not susceptible to Meltdown */ | ||
2370 | -#define ARCH_CAP_IBRS_ALL (1 << 1) /* Enhanced IBRS support */ | ||
2371 | -#define ARCH_CAP_SKIP_VMENTRY_L1DFLUSH (1 << 3) /* Skip L1D flush on vmentry */ | ||
2372 | -#define ARCH_CAP_SSB_NO (1 << 4) /* | ||
2373 | - * Not susceptible to Speculative Store Bypass | ||
2374 | - * attack, so no Speculative Store Bypass | ||
2375 | - * control required. | ||
2376 | - */ | ||
2377 | +#define ARCH_CAP_RDCL_NO BIT(0) /* Not susceptible to Meltdown */ | ||
2378 | +#define ARCH_CAP_IBRS_ALL BIT(1) /* Enhanced IBRS support */ | ||
2379 | +#define ARCH_CAP_SKIP_VMENTRY_L1DFLUSH BIT(3) /* Skip L1D flush on vmentry */ | ||
2380 | +#define ARCH_CAP_SSB_NO BIT(4) /* | ||
2381 | + * Not susceptible to Speculative Store Bypass | ||
2382 | + * attack, so no Speculative Store Bypass | ||
2383 | + * control required. | ||
2384 | + */ | ||
2385 | +#define ARCH_CAP_MDS_NO BIT(5) /* | ||
2386 | + * Not susceptible to | ||
2387 | + * Microarchitectural Data | ||
2388 | + * Sampling (MDS) vulnerabilities. | ||
2389 | + */ | ||
2390 | |||
2391 | #define MSR_IA32_FLUSH_CMD 0x0000010b | ||
2392 | -#define L1D_FLUSH (1 << 0) /* | ||
2393 | - * Writeback and invalidate the | ||
2394 | - * L1 data cache. | ||
2395 | - */ | ||
2396 | +#define L1D_FLUSH BIT(0) /* | ||
2397 | + * Writeback and invalidate the | ||
2398 | + * L1 data cache. | ||
2399 | + */ | ||
2400 | |||
2401 | #define MSR_IA32_BBL_CR_CTL 0x00000119 | ||
2402 | #define MSR_IA32_BBL_CR_CTL3 0x0000011e | ||
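
The (1 << n) to BIT(n) conversion above is purely cosmetic; every mask keeps its value, only msr-index.h now depends on linux/bits.h (which is why its uapi export is dropped in the Kbuild hunk further down). A minimal compile-time confirmation, assuming C11 _Static_assert and BIT() reduced to its linux/bits.h core:

    /* Sketch: the BIT() rewrite is value-preserving. */
    #define BIT(n) (1UL << (n))

    _Static_assert(BIT(0) == 0x01, "SPEC_CTRL_IBRS / PRED_CMD_IBPB / L1D_FLUSH");
    _Static_assert(BIT(1) == 0x02, "SPEC_CTRL_STIBP");
    _Static_assert(BIT(2) == 0x04, "SPEC_CTRL_SSBD");
    _Static_assert(BIT(5) == 0x20, "ARCH_CAP_MDS_NO");
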
2403 | diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h | ||
2404 | index f37f2d8a2989..0b40cc442bda 100644 | ||
2405 | --- a/arch/x86/include/asm/mwait.h | ||
2406 | +++ b/arch/x86/include/asm/mwait.h | ||
2407 | @@ -4,6 +4,7 @@ | ||
2408 | #include <linux/sched.h> | ||
2409 | |||
2410 | #include <asm/cpufeature.h> | ||
2411 | +#include <asm/nospec-branch.h> | ||
2412 | |||
2413 | #define MWAIT_SUBSTATE_MASK 0xf | ||
2414 | #define MWAIT_CSTATE_MASK 0xf | ||
2415 | @@ -38,6 +39,8 @@ static inline void __monitorx(const void *eax, unsigned long ecx, | ||
2416 | |||
2417 | static inline void __mwait(unsigned long eax, unsigned long ecx) | ||
2418 | { | ||
2419 | + mds_idle_clear_cpu_buffers(); | ||
2420 | + | ||
2421 | /* "mwait %eax, %ecx;" */ | ||
2422 | asm volatile(".byte 0x0f, 0x01, 0xc9;" | ||
2423 | :: "a" (eax), "c" (ecx)); | ||
2424 | @@ -72,6 +75,8 @@ static inline void __mwait(unsigned long eax, unsigned long ecx) | ||
2425 | static inline void __mwaitx(unsigned long eax, unsigned long ebx, | ||
2426 | unsigned long ecx) | ||
2427 | { | ||
2428 | + /* No MDS buffer clear as this is AMD/HYGON only */ | ||
2429 | + | ||
2430 | /* "mwaitx %eax, %ebx, %ecx;" */ | ||
2431 | asm volatile(".byte 0x0f, 0x01, 0xfb;" | ||
2432 | :: "a" (eax), "b" (ebx), "c" (ecx)); | ||
2433 | @@ -79,6 +84,8 @@ static inline void __mwaitx(unsigned long eax, unsigned long ebx, | ||
2434 | |||
2435 | static inline void __sti_mwait(unsigned long eax, unsigned long ecx) | ||
2436 | { | ||
2437 | + mds_idle_clear_cpu_buffers(); | ||
2438 | + | ||
2439 | trace_hardirqs_on(); | ||
2440 | /* "mwait %eax, %ecx;" */ | ||
2441 | asm volatile("sti; .byte 0x0f, 0x01, 0xc9;" | ||
2442 | diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h | ||
2443 | index 1b4132161c1f..031a58e84e5b 100644 | ||
2444 | --- a/arch/x86/include/asm/nospec-branch.h | ||
2445 | +++ b/arch/x86/include/asm/nospec-branch.h | ||
2446 | @@ -3,6 +3,8 @@ | ||
2447 | #ifndef _ASM_X86_NOSPEC_BRANCH_H_ | ||
2448 | #define _ASM_X86_NOSPEC_BRANCH_H_ | ||
2449 | |||
2450 | +#include <linux/static_key.h> | ||
2451 | + | ||
2452 | #include <asm/alternative.h> | ||
2453 | #include <asm/alternative-asm.h> | ||
2454 | #include <asm/cpufeatures.h> | ||
2455 | @@ -214,10 +216,17 @@ enum spectre_v2_mitigation { | ||
2456 | SPECTRE_V2_RETPOLINE_MINIMAL_AMD, | ||
2457 | SPECTRE_V2_RETPOLINE_GENERIC, | ||
2458 | SPECTRE_V2_RETPOLINE_AMD, | ||
2459 | - SPECTRE_V2_IBRS, | ||
2460 | SPECTRE_V2_IBRS_ENHANCED, | ||
2461 | }; | ||
2462 | |||
2463 | +/* The indirect branch speculation control variants */ | ||
2464 | +enum spectre_v2_user_mitigation { | ||
2465 | + SPECTRE_V2_USER_NONE, | ||
2466 | + SPECTRE_V2_USER_STRICT, | ||
2467 | + SPECTRE_V2_USER_PRCTL, | ||
2468 | + SPECTRE_V2_USER_SECCOMP, | ||
2469 | +}; | ||
2470 | + | ||
2471 | /* The Speculative Store Bypass disable variants */ | ||
2472 | enum ssb_mitigation { | ||
2473 | SPEC_STORE_BYPASS_NONE, | ||
2474 | @@ -295,6 +304,60 @@ do { \ | ||
2475 | preempt_enable(); \ | ||
2476 | } while (0) | ||
2477 | |||
2478 | +DECLARE_STATIC_KEY_FALSE(switch_to_cond_stibp); | ||
2479 | +DECLARE_STATIC_KEY_FALSE(switch_mm_cond_ibpb); | ||
2480 | +DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb); | ||
2481 | + | ||
2482 | +DECLARE_STATIC_KEY_FALSE(mds_user_clear); | ||
2483 | +DECLARE_STATIC_KEY_FALSE(mds_idle_clear); | ||
2484 | + | ||
2485 | +#include <asm/segment.h> | ||
2486 | + | ||
2487 | +/** | ||
2488 | + * mds_clear_cpu_buffers - Mitigation for MDS vulnerability | ||
2489 | + * | ||
2490 | + * This uses the otherwise unused and obsolete VERW instruction in | ||
2491 | + * combination with microcode which triggers a CPU buffer flush when the | ||
2492 | + * instruction is executed. | ||
2493 | + */ | ||
2494 | +static inline void mds_clear_cpu_buffers(void) | ||
2495 | +{ | ||
2496 | + static const u16 ds = __KERNEL_DS; | ||
2497 | + | ||
2498 | + /* | ||
2499 | + * Has to be the memory-operand variant because only that | ||
2500 | + * guarantees the CPU buffer flush functionality according to | ||
2501 | + * documentation. The register-operand variant does not. | ||
2502 | + * Works with any segment selector, but a valid writable | ||
2503 | + * data segment is the fastest variant. | ||
2504 | + * | ||
2505 | + * "cc" clobber is required because VERW modifies ZF. | ||
2506 | + */ | ||
2507 | + asm volatile("verw %[ds]" : : [ds] "m" (ds) : "cc"); | ||
2508 | +} | ||
2509 | + | ||
2510 | +/** | ||
2511 | + * mds_user_clear_cpu_buffers - Mitigation for MDS vulnerability | ||
2512 | + * | ||
2513 | + * Clear CPU buffers if the corresponding static key is enabled | ||
2514 | + */ | ||
2515 | +static inline void mds_user_clear_cpu_buffers(void) | ||
2516 | +{ | ||
2517 | + if (static_branch_likely(&mds_user_clear)) | ||
2518 | + mds_clear_cpu_buffers(); | ||
2519 | +} | ||
2520 | + | ||
2521 | +/** | ||
2522 | + * mds_idle_clear_cpu_buffers - Mitigation for MDS vulnerability | ||
2523 | + * | ||
2524 | + * Clear CPU buffers if the corresponding static key is enabled | ||
2525 | + */ | ||
2526 | +static inline void mds_idle_clear_cpu_buffers(void) | ||
2527 | +{ | ||
2528 | + if (static_branch_likely(&mds_idle_clear)) | ||
2529 | + mds_clear_cpu_buffers(); | ||
2530 | +} | ||
2531 | + | ||
2532 | #endif /* __ASSEMBLY__ */ | ||
2533 | |||
2534 | /* | ||
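
Since VERW is executable at CPL 3, the memory-operand form that mds_clear_cpu_buffers() insists on can be tried outside the kernel. A hypothetical user-space demo follows; it only exercises the instruction form, an actual buffer flush additionally requires MD_CLEAR-capable microcode, and the static-key gating above is not modelled:

    /* Hypothetical demo of the memory-operand VERW used above. */
    #include <stdio.h>

    int main(void)
    {
            unsigned short ds;

            __asm__ __volatile__("mov %%ds, %0" : "=r" (ds));
            /* Memory operand as required; "cc" because VERW sets ZF. */
            __asm__ __volatile__("verw %0" : : "m" (ds) : "cc");
            printf("verw executed on selector 0x%x\n", ds);
            return 0;
    }
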
2535 | diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h | ||
2536 | index 221a32ed1372..f12e61e2a86b 100644 | ||
2537 | --- a/arch/x86/include/asm/pgtable_64.h | ||
2538 | +++ b/arch/x86/include/asm/pgtable_64.h | ||
2539 | @@ -44,15 +44,15 @@ struct mm_struct; | ||
2540 | void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte); | ||
2541 | |||
2542 | |||
2543 | -static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr, | ||
2544 | - pte_t *ptep) | ||
2545 | +static inline void native_set_pte(pte_t *ptep, pte_t pte) | ||
2546 | { | ||
2547 | - *ptep = native_make_pte(0); | ||
2548 | + WRITE_ONCE(*ptep, pte); | ||
2549 | } | ||
2550 | |||
2551 | -static inline void native_set_pte(pte_t *ptep, pte_t pte) | ||
2552 | +static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr, | ||
2553 | + pte_t *ptep) | ||
2554 | { | ||
2555 | - *ptep = pte; | ||
2556 | + native_set_pte(ptep, native_make_pte(0)); | ||
2557 | } | ||
2558 | |||
2559 | static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) | ||
2560 | @@ -62,7 +62,7 @@ static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) | ||
2561 | |||
2562 | static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) | ||
2563 | { | ||
2564 | - *pmdp = pmd; | ||
2565 | + WRITE_ONCE(*pmdp, pmd); | ||
2566 | } | ||
2567 | |||
2568 | static inline void native_pmd_clear(pmd_t *pmd) | ||
2569 | @@ -98,7 +98,7 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) | ||
2570 | |||
2571 | static inline void native_set_pud(pud_t *pudp, pud_t pud) | ||
2572 | { | ||
2573 | - *pudp = pud; | ||
2574 | + WRITE_ONCE(*pudp, pud); | ||
2575 | } | ||
2576 | |||
2577 | static inline void native_pud_clear(pud_t *pud) | ||
2578 | @@ -131,7 +131,7 @@ static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) | ||
2579 | |||
2580 | static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) | ||
2581 | { | ||
2582 | - *pgdp = kaiser_set_shadow_pgd(pgdp, pgd); | ||
2583 | + WRITE_ONCE(*pgdp, kaiser_set_shadow_pgd(pgdp, pgd)); | ||
2584 | } | ||
2585 | |||
2586 | static inline void native_pgd_clear(pgd_t *pgd) | ||
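
The pgtable_64.h changes swap plain stores for WRITE_ONCE() because page-table entries are read concurrently by the hardware page walker (and, with KAISER, mirrored into the shadow tables), so the compiler must not tear or duplicate the store. A simplified sketch of the mechanism; the kernel's real WRITE_ONCE() in compiler.h is more elaborate:

    /* Sketch: a volatile store forces one single full-width access,
     * so a concurrent reader never sees a half-written entry. */
    #define WRITE_ONCE_SKETCH(x, val) \
            (*(volatile __typeof__(x) *)&(x) = (val))

    /* Usage, mirroring the hunk: WRITE_ONCE_SKETCH(*ptep, pte);
     * instead of the tear-prone *ptep = pte; */
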
2587 | diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h | ||
2588 | index ee8c6290c421..155e49fc7010 100644 | ||
2589 | --- a/arch/x86/include/asm/processor.h | ||
2590 | +++ b/arch/x86/include/asm/processor.h | ||
2591 | @@ -874,4 +874,10 @@ enum l1tf_mitigations { | ||
2592 | |||
2593 | extern enum l1tf_mitigations l1tf_mitigation; | ||
2594 | |||
2595 | +enum mds_mitigations { | ||
2596 | + MDS_MITIGATION_OFF, | ||
2597 | + MDS_MITIGATION_FULL, | ||
2598 | + MDS_MITIGATION_VMWERV, | ||
2599 | +}; | ||
2600 | + | ||
2601 | #endif /* _ASM_X86_PROCESSOR_H */ | ||
2602 | diff --git a/arch/x86/include/asm/spec-ctrl.h b/arch/x86/include/asm/spec-ctrl.h | ||
2603 | index ae7c2c5cd7f0..5393babc0598 100644 | ||
2604 | --- a/arch/x86/include/asm/spec-ctrl.h | ||
2605 | +++ b/arch/x86/include/asm/spec-ctrl.h | ||
2606 | @@ -53,12 +53,24 @@ static inline u64 ssbd_tif_to_spec_ctrl(u64 tifn) | ||
2607 | return (tifn & _TIF_SSBD) >> (TIF_SSBD - SPEC_CTRL_SSBD_SHIFT); | ||
2608 | } | ||
2609 | |||
2610 | +static inline u64 stibp_tif_to_spec_ctrl(u64 tifn) | ||
2611 | +{ | ||
2612 | + BUILD_BUG_ON(TIF_SPEC_IB < SPEC_CTRL_STIBP_SHIFT); | ||
2613 | + return (tifn & _TIF_SPEC_IB) >> (TIF_SPEC_IB - SPEC_CTRL_STIBP_SHIFT); | ||
2614 | +} | ||
2615 | + | ||
2616 | static inline unsigned long ssbd_spec_ctrl_to_tif(u64 spec_ctrl) | ||
2617 | { | ||
2618 | BUILD_BUG_ON(TIF_SSBD < SPEC_CTRL_SSBD_SHIFT); | ||
2619 | return (spec_ctrl & SPEC_CTRL_SSBD) << (TIF_SSBD - SPEC_CTRL_SSBD_SHIFT); | ||
2620 | } | ||
2621 | |||
2622 | +static inline unsigned long stibp_spec_ctrl_to_tif(u64 spec_ctrl) | ||
2623 | +{ | ||
2624 | + BUILD_BUG_ON(TIF_SPEC_IB < SPEC_CTRL_STIBP_SHIFT); | ||
2625 | + return (spec_ctrl & SPEC_CTRL_STIBP) << (TIF_SPEC_IB - SPEC_CTRL_STIBP_SHIFT); | ||
2626 | +} | ||
2627 | + | ||
2628 | static inline u64 ssbd_tif_to_amd_ls_cfg(u64 tifn) | ||
2629 | { | ||
2630 | return (tifn & _TIF_SSBD) ? x86_amd_ls_cfg_ssbd_mask : 0ULL; | ||
2631 | @@ -70,11 +82,7 @@ extern void speculative_store_bypass_ht_init(void); | ||
2632 | static inline void speculative_store_bypass_ht_init(void) { } | ||
2633 | #endif | ||
2634 | |||
2635 | -extern void speculative_store_bypass_update(unsigned long tif); | ||
2636 | - | ||
2637 | -static inline void speculative_store_bypass_update_current(void) | ||
2638 | -{ | ||
2639 | - speculative_store_bypass_update(current_thread_info()->flags); | ||
2640 | -} | ||
2641 | +extern void speculation_ctrl_update(unsigned long tif); | ||
2642 | +extern void speculation_ctrl_update_current(void); | ||
2643 | |||
2644 | #endif | ||
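
stibp_tif_to_spec_ctrl() and stibp_spec_ctrl_to_tif() move one bit between the thread-flag word and MSR_IA32_SPEC_CTRL with a single shift; the BUILD_BUG_ON() only ensures the shift count cannot go negative. With TIF_SPEC_IB = 9 and SPEC_CTRL_STIBP_SHIFT = 1 (both defined elsewhere in this patch) the bit travels 8 places. A standalone check of the arithmetic:

    /* Standalone check of the TIF <-> SPEC_CTRL shift, using the
     * constants this patch defines (TIF_SPEC_IB = 9, STIBP shift = 1). */
    #include <assert.h>

    #define TIF_SPEC_IB           9
    #define _TIF_SPEC_IB          (1UL << TIF_SPEC_IB)            /* 0x200 */
    #define SPEC_CTRL_STIBP_SHIFT 1
    #define SPEC_CTRL_STIBP       (1UL << SPEC_CTRL_STIBP_SHIFT)  /* 0x002 */

    static unsigned long stibp_tif_to_spec_ctrl(unsigned long tifn)
    {
            return (tifn & _TIF_SPEC_IB) >> (TIF_SPEC_IB - SPEC_CTRL_STIBP_SHIFT);
    }

    int main(void)
    {
            assert(stibp_tif_to_spec_ctrl(_TIF_SPEC_IB) == SPEC_CTRL_STIBP);
            assert(stibp_tif_to_spec_ctrl(~_TIF_SPEC_IB) == 0);
            return 0;
    }
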
2645 | diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h | ||
2646 | index 5cb436acd463..676e84f521ba 100644 | ||
2647 | --- a/arch/x86/include/asm/switch_to.h | ||
2648 | +++ b/arch/x86/include/asm/switch_to.h | ||
2649 | @@ -8,9 +8,6 @@ struct task_struct *__switch_to_asm(struct task_struct *prev, | ||
2650 | |||
2651 | __visible struct task_struct *__switch_to(struct task_struct *prev, | ||
2652 | struct task_struct *next); | ||
2653 | -struct tss_struct; | ||
2654 | -void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, | ||
2655 | - struct tss_struct *tss); | ||
2656 | |||
2657 | /* This runs runs on the previous thread's stack. */ | ||
2658 | static inline void prepare_switch_to(struct task_struct *prev, | ||
2659 | diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h | ||
2660 | index 2d8788a59b4d..0438f7fbb383 100644 | ||
2661 | --- a/arch/x86/include/asm/thread_info.h | ||
2662 | +++ b/arch/x86/include/asm/thread_info.h | ||
2663 | @@ -83,10 +83,12 @@ struct thread_info { | ||
2664 | #define TIF_SIGPENDING 2 /* signal pending */ | ||
2665 | #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ | ||
2666 | #define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/ | ||
2667 | -#define TIF_SSBD 5 /* Reduced data speculation */ | ||
2668 | +#define TIF_SSBD 5 /* Speculative store bypass disable */ | ||
2669 | #define TIF_SYSCALL_EMU 6 /* syscall emulation active */ | ||
2670 | #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ | ||
2671 | #define TIF_SECCOMP 8 /* secure computing */ | ||
2672 | +#define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */ | ||
2673 | +#define TIF_SPEC_FORCE_UPDATE 10 /* Force speculation MSR update in context switch */ | ||
2674 | #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ | ||
2675 | #define TIF_UPROBE 12 /* breakpointed or singlestepping */ | ||
2676 | #define TIF_NOTSC 16 /* TSC is not accessible in userland */ | ||
2677 | @@ -111,6 +113,8 @@ struct thread_info { | ||
2678 | #define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU) | ||
2679 | #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) | ||
2680 | #define _TIF_SECCOMP (1 << TIF_SECCOMP) | ||
2681 | +#define _TIF_SPEC_IB (1 << TIF_SPEC_IB) | ||
2682 | +#define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE) | ||
2683 | #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) | ||
2684 | #define _TIF_UPROBE (1 << TIF_UPROBE) | ||
2685 | #define _TIF_NOTSC (1 << TIF_NOTSC) | ||
2686 | @@ -140,8 +144,18 @@ struct thread_info { | ||
2687 | _TIF_NOHZ) | ||
2688 | |||
2689 | /* flags to check in __switch_to() */ | ||
2690 | -#define _TIF_WORK_CTXSW \ | ||
2691 | - (_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP|_TIF_SSBD) | ||
2692 | +#define _TIF_WORK_CTXSW_BASE \ | ||
2693 | + (_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP| \ | ||
2694 | + _TIF_SSBD | _TIF_SPEC_FORCE_UPDATE) | ||
2695 | + | ||
2696 | +/* | ||
2697 | + * Avoid calls to __switch_to_xtra() on UP as STIBP is not evaluated. | ||
2698 | + */ | ||
2699 | +#ifdef CONFIG_SMP | ||
2700 | +# define _TIF_WORK_CTXSW (_TIF_WORK_CTXSW_BASE | _TIF_SPEC_IB) | ||
2701 | +#else | ||
2702 | +# define _TIF_WORK_CTXSW (_TIF_WORK_CTXSW_BASE) | ||
2703 | +#endif | ||
2704 | |||
2705 | #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY) | ||
2706 | #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW) | ||
2707 | diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h | ||
2708 | index 686a58d793e5..f5ca15622dc9 100644 | ||
2709 | --- a/arch/x86/include/asm/tlbflush.h | ||
2710 | +++ b/arch/x86/include/asm/tlbflush.h | ||
2711 | @@ -68,8 +68,12 @@ static inline void invpcid_flush_all_nonglobals(void) | ||
2712 | struct tlb_state { | ||
2713 | struct mm_struct *active_mm; | ||
2714 | int state; | ||
2715 | - /* last user mm's ctx id */ | ||
2716 | - u64 last_ctx_id; | ||
2717 | + | ||
2718 | + /* Last user mm for optimizing IBPB */ | ||
2719 | + union { | ||
2720 | + struct mm_struct *last_user_mm; | ||
2721 | + unsigned long last_user_mm_ibpb; | ||
2722 | + }; | ||
2723 | |||
2724 | /* | ||
2725 | * Access to this CR4 shadow and to H/W CR4 is protected by | ||
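
The union overlays an unsigned long on the last-user-mm pointer. The flag handling itself lives in the arch/x86/mm/tlb.c part of this series rather than in this hunk, but the apparent idea is the usual low-bit tagging trick: a struct mm_struct pointer is always at least word-aligned, so bit 0 is free to record whether an IBPB was already issued for that mm. A generic sketch of the trick, with hypothetical names:

    /* Generic low-bit pointer tagging, as the union above enables.
     * LAST_USER_MM_IBPB is a hypothetical flag name for this sketch. */
    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    #define LAST_USER_MM_IBPB 0x1UL

    static uintptr_t pack(void *mm, int ibpb_done)
    {
            assert(((uintptr_t)mm & LAST_USER_MM_IBPB) == 0); /* aligned */
            return (uintptr_t)mm | (ibpb_done ? LAST_USER_MM_IBPB : 0);
    }

    int main(void)
    {
            static long mm;                 /* stand-in for an mm_struct */
            uintptr_t v = pack(&mm, 1);

            printf("mm %p, ibpb done %lu\n",
                   (void *)(v & ~LAST_USER_MM_IBPB),
                   (unsigned long)(v & LAST_USER_MM_IBPB));
            return 0;
    }
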
2726 | diff --git a/arch/x86/include/uapi/asm/Kbuild b/arch/x86/include/uapi/asm/Kbuild | ||
2727 | index 3dec769cadf7..1c532b3f18ea 100644 | ||
2728 | --- a/arch/x86/include/uapi/asm/Kbuild | ||
2729 | +++ b/arch/x86/include/uapi/asm/Kbuild | ||
2730 | @@ -27,7 +27,6 @@ header-y += ldt.h | ||
2731 | header-y += mce.h | ||
2732 | header-y += mman.h | ||
2733 | header-y += msgbuf.h | ||
2734 | -header-y += msr-index.h | ||
2735 | header-y += msr.h | ||
2736 | header-y += mtrr.h | ||
2737 | header-y += param.h | ||
2738 | diff --git a/arch/x86/include/uapi/asm/mce.h b/arch/x86/include/uapi/asm/mce.h | ||
2739 | index 69a6e07e3149..db7dae58745f 100644 | ||
2740 | --- a/arch/x86/include/uapi/asm/mce.h | ||
2741 | +++ b/arch/x86/include/uapi/asm/mce.h | ||
2742 | @@ -28,6 +28,8 @@ struct mce { | ||
2743 | __u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */ | ||
2744 | __u64 synd; /* MCA_SYND MSR: only valid on SMCA systems */ | ||
2745 | __u64 ipid; /* MCA_IPID MSR: only valid on SMCA systems */ | ||
2746 | + __u64 ppin; /* Protected Processor Inventory Number */ | ||
2747 | + __u32 microcode;/* Microcode revision */ | ||
2748 | }; | ||
2749 | |||
2750 | #define MCE_GET_RECORD_LEN _IOR('M', 1, int) | ||
2751 | diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c | ||
2752 | index 6221166e3fca..16970c39baea 100644 | ||
2753 | --- a/arch/x86/kernel/cpu/bugs.c | ||
2754 | +++ b/arch/x86/kernel/cpu/bugs.c | ||
2755 | @@ -13,6 +13,7 @@ | ||
2756 | #include <linux/module.h> | ||
2757 | #include <linux/nospec.h> | ||
2758 | #include <linux/prctl.h> | ||
2759 | +#include <linux/sched/smt.h> | ||
2760 | |||
2761 | #include <asm/spec-ctrl.h> | ||
2762 | #include <asm/cmdline.h> | ||
2763 | @@ -24,6 +25,7 @@ | ||
2764 | #include <asm/vmx.h> | ||
2765 | #include <asm/paravirt.h> | ||
2766 | #include <asm/alternative.h> | ||
2767 | +#include <asm/hypervisor.h> | ||
2768 | #include <asm/pgtable.h> | ||
2769 | #include <asm/cacheflush.h> | ||
2770 | #include <asm/intel-family.h> | ||
2771 | @@ -32,13 +34,12 @@ | ||
2772 | static void __init spectre_v2_select_mitigation(void); | ||
2773 | static void __init ssb_select_mitigation(void); | ||
2774 | static void __init l1tf_select_mitigation(void); | ||
2775 | +static void __init mds_select_mitigation(void); | ||
2776 | |||
2777 | -/* | ||
2778 | - * Our boot-time value of the SPEC_CTRL MSR. We read it once so that any | ||
2779 | - * writes to SPEC_CTRL contain whatever reserved bits have been set. | ||
2780 | - */ | ||
2781 | -u64 __ro_after_init x86_spec_ctrl_base; | ||
2782 | +/* The base value of the SPEC_CTRL MSR that always has to be preserved. */ | ||
2783 | +u64 x86_spec_ctrl_base; | ||
2784 | EXPORT_SYMBOL_GPL(x86_spec_ctrl_base); | ||
2785 | +static DEFINE_MUTEX(spec_ctrl_mutex); | ||
2786 | |||
2787 | /* | ||
2788 | * The vendor and possibly platform specific bits which can be modified in | ||
2789 | @@ -53,6 +54,20 @@ static u64 __ro_after_init x86_spec_ctrl_mask = SPEC_CTRL_IBRS; | ||
2790 | u64 __ro_after_init x86_amd_ls_cfg_base; | ||
2791 | u64 __ro_after_init x86_amd_ls_cfg_ssbd_mask; | ||
2792 | |||
2793 | +/* Control conditional STIBP in switch_to() */ | ||
2794 | +DEFINE_STATIC_KEY_FALSE(switch_to_cond_stibp); | ||
2795 | +/* Control conditional IBPB in switch_mm() */ | ||
2796 | +DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb); | ||
2797 | +/* Control unconditional IBPB in switch_mm() */ | ||
2798 | +DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb); | ||
2799 | + | ||
2800 | +/* Control MDS CPU buffer clear before returning to user space */ | ||
2801 | +DEFINE_STATIC_KEY_FALSE(mds_user_clear); | ||
2802 | +EXPORT_SYMBOL_GPL(mds_user_clear); | ||
2803 | +/* Control MDS CPU buffer clear before idling (halt, mwait) */ | ||
2804 | +DEFINE_STATIC_KEY_FALSE(mds_idle_clear); | ||
2805 | +EXPORT_SYMBOL_GPL(mds_idle_clear); | ||
2806 | + | ||
2807 | void __init check_bugs(void) | ||
2808 | { | ||
2809 | identify_boot_cpu(); | ||
2810 | @@ -91,6 +106,10 @@ void __init check_bugs(void) | ||
2811 | |||
2812 | l1tf_select_mitigation(); | ||
2813 | |||
2814 | + mds_select_mitigation(); | ||
2815 | + | ||
2816 | + arch_smt_update(); | ||
2817 | + | ||
2818 | #ifdef CONFIG_X86_32 | ||
2819 | /* | ||
2820 | * Check whether we are able to run this kernel safely on SMP. | ||
2821 | @@ -123,31 +142,6 @@ void __init check_bugs(void) | ||
2822 | #endif | ||
2823 | } | ||
2824 | |||
2825 | -/* The kernel command line selection */ | ||
2826 | -enum spectre_v2_mitigation_cmd { | ||
2827 | - SPECTRE_V2_CMD_NONE, | ||
2828 | - SPECTRE_V2_CMD_AUTO, | ||
2829 | - SPECTRE_V2_CMD_FORCE, | ||
2830 | - SPECTRE_V2_CMD_RETPOLINE, | ||
2831 | - SPECTRE_V2_CMD_RETPOLINE_GENERIC, | ||
2832 | - SPECTRE_V2_CMD_RETPOLINE_AMD, | ||
2833 | -}; | ||
2834 | - | ||
2835 | -static const char *spectre_v2_strings[] = { | ||
2836 | - [SPECTRE_V2_NONE] = "Vulnerable", | ||
2837 | - [SPECTRE_V2_RETPOLINE_MINIMAL] = "Vulnerable: Minimal generic ASM retpoline", | ||
2838 | - [SPECTRE_V2_RETPOLINE_MINIMAL_AMD] = "Vulnerable: Minimal AMD ASM retpoline", | ||
2839 | - [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline", | ||
2840 | - [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline", | ||
2841 | - [SPECTRE_V2_IBRS_ENHANCED] = "Mitigation: Enhanced IBRS", | ||
2842 | -}; | ||
2843 | - | ||
2844 | -#undef pr_fmt | ||
2845 | -#define pr_fmt(fmt) "Spectre V2 : " fmt | ||
2846 | - | ||
2847 | -static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = | ||
2848 | - SPECTRE_V2_NONE; | ||
2849 | - | ||
2850 | void | ||
2851 | x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest) | ||
2852 | { | ||
2853 | @@ -165,9 +159,14 @@ x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest) | ||
2854 | guestval |= guest_spec_ctrl & x86_spec_ctrl_mask; | ||
2855 | |||
2856 | /* SSBD controlled in MSR_SPEC_CTRL */ | ||
2857 | - if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD)) | ||
2858 | + if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || | ||
2859 | + static_cpu_has(X86_FEATURE_AMD_SSBD)) | ||
2860 | hostval |= ssbd_tif_to_spec_ctrl(ti->flags); | ||
2861 | |||
2862 | + /* Conditional STIBP enabled? */ | ||
2863 | + if (static_branch_unlikely(&switch_to_cond_stibp)) | ||
2864 | + hostval |= stibp_tif_to_spec_ctrl(ti->flags); | ||
2865 | + | ||
2866 | if (hostval != guestval) { | ||
2867 | msrval = setguest ? guestval : hostval; | ||
2868 | wrmsrl(MSR_IA32_SPEC_CTRL, msrval); | ||
2869 | @@ -201,7 +200,7 @@ x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest) | ||
2870 | tif = setguest ? ssbd_spec_ctrl_to_tif(guestval) : | ||
2871 | ssbd_spec_ctrl_to_tif(hostval); | ||
2872 | |||
2873 | - speculative_store_bypass_update(tif); | ||
2874 | + speculation_ctrl_update(tif); | ||
2875 | } | ||
2876 | } | ||
2877 | EXPORT_SYMBOL_GPL(x86_virt_spec_ctrl); | ||
2878 | @@ -216,6 +215,70 @@ static void x86_amd_ssb_disable(void) | ||
2879 | wrmsrl(MSR_AMD64_LS_CFG, msrval); | ||
2880 | } | ||
2881 | |||
2882 | +#undef pr_fmt | ||
2883 | +#define pr_fmt(fmt) "MDS: " fmt | ||
2884 | + | ||
2885 | +/* Default mitigation for MDS-affected CPUs */ | ||
2886 | +static enum mds_mitigations mds_mitigation __ro_after_init = MDS_MITIGATION_FULL; | ||
2887 | +static bool mds_nosmt __ro_after_init = false; | ||
2888 | + | ||
2889 | +static const char * const mds_strings[] = { | ||
2890 | + [MDS_MITIGATION_OFF] = "Vulnerable", | ||
2891 | + [MDS_MITIGATION_FULL] = "Mitigation: Clear CPU buffers", | ||
2892 | + [MDS_MITIGATION_VMWERV] = "Vulnerable: Clear CPU buffers attempted, no microcode", | ||
2893 | +}; | ||
2894 | + | ||
2895 | +static void __init mds_select_mitigation(void) | ||
2896 | +{ | ||
2897 | + if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) { | ||
2898 | + mds_mitigation = MDS_MITIGATION_OFF; | ||
2899 | + return; | ||
2900 | + } | ||
2901 | + | ||
2902 | + if (mds_mitigation == MDS_MITIGATION_FULL) { | ||
2903 | + if (!boot_cpu_has(X86_FEATURE_MD_CLEAR)) | ||
2904 | + mds_mitigation = MDS_MITIGATION_VMWERV; | ||
2905 | + | ||
2906 | + static_branch_enable(&mds_user_clear); | ||
2907 | + | ||
2908 | + if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) && | ||
2909 | + (mds_nosmt || cpu_mitigations_auto_nosmt())) | ||
2910 | + cpu_smt_disable(false); | ||
2911 | + } | ||
2912 | + | ||
2913 | + pr_info("%s\n", mds_strings[mds_mitigation]); | ||
2914 | +} | ||
2915 | + | ||
2916 | +static int __init mds_cmdline(char *str) | ||
2917 | +{ | ||
2918 | + if (!boot_cpu_has_bug(X86_BUG_MDS)) | ||
2919 | + return 0; | ||
2920 | + | ||
2921 | + if (!str) | ||
2922 | + return -EINVAL; | ||
2923 | + | ||
2924 | + if (!strcmp(str, "off")) | ||
2925 | + mds_mitigation = MDS_MITIGATION_OFF; | ||
2926 | + else if (!strcmp(str, "full")) | ||
2927 | + mds_mitigation = MDS_MITIGATION_FULL; | ||
2928 | + else if (!strcmp(str, "full,nosmt")) { | ||
2929 | + mds_mitigation = MDS_MITIGATION_FULL; | ||
2930 | + mds_nosmt = true; | ||
2931 | + } | ||
2932 | + | ||
2933 | + return 0; | ||
2934 | +} | ||
2935 | +early_param("mds", mds_cmdline); | ||
2936 | + | ||
2937 | +#undef pr_fmt | ||
2938 | +#define pr_fmt(fmt) "Spectre V2 : " fmt | ||
2939 | + | ||
2940 | +static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = | ||
2941 | + SPECTRE_V2_NONE; | ||
2942 | + | ||
2943 | +static enum spectre_v2_user_mitigation spectre_v2_user __ro_after_init = | ||
2944 | + SPECTRE_V2_USER_NONE; | ||
2945 | + | ||
2946 | #ifdef RETPOLINE | ||
2947 | static bool spectre_v2_bad_module; | ||
2948 | |||
2949 | @@ -237,67 +300,225 @@ static inline const char *spectre_v2_module_string(void) | ||
2950 | static inline const char *spectre_v2_module_string(void) { return ""; } | ||
2951 | #endif | ||
2952 | |||
2953 | -static void __init spec2_print_if_insecure(const char *reason) | ||
2954 | +static inline bool match_option(const char *arg, int arglen, const char *opt) | ||
2955 | { | ||
2956 | - if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) | ||
2957 | - pr_info("%s selected on command line.\n", reason); | ||
2958 | + int len = strlen(opt); | ||
2959 | + | ||
2960 | + return len == arglen && !strncmp(arg, opt, len); | ||
2961 | } | ||
2962 | |||
2963 | -static void __init spec2_print_if_secure(const char *reason) | ||
2964 | +/* The kernel command line selection for spectre v2 */ | ||
2965 | +enum spectre_v2_mitigation_cmd { | ||
2966 | + SPECTRE_V2_CMD_NONE, | ||
2967 | + SPECTRE_V2_CMD_AUTO, | ||
2968 | + SPECTRE_V2_CMD_FORCE, | ||
2969 | + SPECTRE_V2_CMD_RETPOLINE, | ||
2970 | + SPECTRE_V2_CMD_RETPOLINE_GENERIC, | ||
2971 | + SPECTRE_V2_CMD_RETPOLINE_AMD, | ||
2972 | +}; | ||
2973 | + | ||
2974 | +enum spectre_v2_user_cmd { | ||
2975 | + SPECTRE_V2_USER_CMD_NONE, | ||
2976 | + SPECTRE_V2_USER_CMD_AUTO, | ||
2977 | + SPECTRE_V2_USER_CMD_FORCE, | ||
2978 | + SPECTRE_V2_USER_CMD_PRCTL, | ||
2979 | + SPECTRE_V2_USER_CMD_PRCTL_IBPB, | ||
2980 | + SPECTRE_V2_USER_CMD_SECCOMP, | ||
2981 | + SPECTRE_V2_USER_CMD_SECCOMP_IBPB, | ||
2982 | +}; | ||
2983 | + | ||
2984 | +static const char * const spectre_v2_user_strings[] = { | ||
2985 | + [SPECTRE_V2_USER_NONE] = "User space: Vulnerable", | ||
2986 | + [SPECTRE_V2_USER_STRICT] = "User space: Mitigation: STIBP protection", | ||
2987 | + [SPECTRE_V2_USER_PRCTL] = "User space: Mitigation: STIBP via prctl", | ||
2988 | + [SPECTRE_V2_USER_SECCOMP] = "User space: Mitigation: STIBP via seccomp and prctl", | ||
2989 | +}; | ||
2990 | + | ||
2991 | +static const struct { | ||
2992 | + const char *option; | ||
2993 | + enum spectre_v2_user_cmd cmd; | ||
2994 | + bool secure; | ||
2995 | +} v2_user_options[] __initconst = { | ||
2996 | + { "auto", SPECTRE_V2_USER_CMD_AUTO, false }, | ||
2997 | + { "off", SPECTRE_V2_USER_CMD_NONE, false }, | ||
2998 | + { "on", SPECTRE_V2_USER_CMD_FORCE, true }, | ||
2999 | + { "prctl", SPECTRE_V2_USER_CMD_PRCTL, false }, | ||
3000 | + { "prctl,ibpb", SPECTRE_V2_USER_CMD_PRCTL_IBPB, false }, | ||
3001 | + { "seccomp", SPECTRE_V2_USER_CMD_SECCOMP, false }, | ||
3002 | + { "seccomp,ibpb", SPECTRE_V2_USER_CMD_SECCOMP_IBPB, false }, | ||
3003 | +}; | ||
3004 | + | ||
3005 | +static void __init spec_v2_user_print_cond(const char *reason, bool secure) | ||
3006 | { | ||
3007 | - if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) | ||
3008 | - pr_info("%s selected on command line.\n", reason); | ||
3009 | + if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) != secure) | ||
3010 | + pr_info("spectre_v2_user=%s forced on command line.\n", reason); | ||
3011 | } | ||
3012 | |||
3013 | -static inline bool retp_compiler(void) | ||
3014 | +static enum spectre_v2_user_cmd __init | ||
3015 | +spectre_v2_parse_user_cmdline(enum spectre_v2_mitigation_cmd v2_cmd) | ||
3016 | { | ||
3017 | - return __is_defined(RETPOLINE); | ||
3018 | + char arg[20]; | ||
3019 | + int ret, i; | ||
3020 | + | ||
3021 | + switch (v2_cmd) { | ||
3022 | + case SPECTRE_V2_CMD_NONE: | ||
3023 | + return SPECTRE_V2_USER_CMD_NONE; | ||
3024 | + case SPECTRE_V2_CMD_FORCE: | ||
3025 | + return SPECTRE_V2_USER_CMD_FORCE; | ||
3026 | + default: | ||
3027 | + break; | ||
3028 | + } | ||
3029 | + | ||
3030 | + ret = cmdline_find_option(boot_command_line, "spectre_v2_user", | ||
3031 | + arg, sizeof(arg)); | ||
3032 | + if (ret < 0) | ||
3033 | + return SPECTRE_V2_USER_CMD_AUTO; | ||
3034 | + | ||
3035 | + for (i = 0; i < ARRAY_SIZE(v2_user_options); i++) { | ||
3036 | + if (match_option(arg, ret, v2_user_options[i].option)) { | ||
3037 | + spec_v2_user_print_cond(v2_user_options[i].option, | ||
3038 | + v2_user_options[i].secure); | ||
3039 | + return v2_user_options[i].cmd; | ||
3040 | + } | ||
3041 | + } | ||
3042 | + | ||
3043 | + pr_err("Unknown user space protection option (%s). Switching to AUTO select\n", arg); | ||
3044 | + return SPECTRE_V2_USER_CMD_AUTO; | ||
3045 | } | ||
3046 | |||
3047 | -static inline bool match_option(const char *arg, int arglen, const char *opt) | ||
3048 | +static void __init | ||
3049 | +spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) | ||
3050 | { | ||
3051 | - int len = strlen(opt); | ||
3052 | + enum spectre_v2_user_mitigation mode = SPECTRE_V2_USER_NONE; | ||
3053 | + bool smt_possible = IS_ENABLED(CONFIG_SMP); | ||
3054 | + enum spectre_v2_user_cmd cmd; | ||
3055 | |||
3056 | - return len == arglen && !strncmp(arg, opt, len); | ||
3057 | + if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP)) | ||
3058 | + return; | ||
3059 | + | ||
3060 | + if (cpu_smt_control == CPU_SMT_FORCE_DISABLED || | ||
3061 | + cpu_smt_control == CPU_SMT_NOT_SUPPORTED) | ||
3062 | + smt_possible = false; | ||
3063 | + | ||
3064 | + cmd = spectre_v2_parse_user_cmdline(v2_cmd); | ||
3065 | + switch (cmd) { | ||
3066 | + case SPECTRE_V2_USER_CMD_NONE: | ||
3067 | + goto set_mode; | ||
3068 | + case SPECTRE_V2_USER_CMD_FORCE: | ||
3069 | + mode = SPECTRE_V2_USER_STRICT; | ||
3070 | + break; | ||
3071 | + case SPECTRE_V2_USER_CMD_PRCTL: | ||
3072 | + case SPECTRE_V2_USER_CMD_PRCTL_IBPB: | ||
3073 | + mode = SPECTRE_V2_USER_PRCTL; | ||
3074 | + break; | ||
3075 | + case SPECTRE_V2_USER_CMD_AUTO: | ||
3076 | + case SPECTRE_V2_USER_CMD_SECCOMP: | ||
3077 | + case SPECTRE_V2_USER_CMD_SECCOMP_IBPB: | ||
3078 | + if (IS_ENABLED(CONFIG_SECCOMP)) | ||
3079 | + mode = SPECTRE_V2_USER_SECCOMP; | ||
3080 | + else | ||
3081 | + mode = SPECTRE_V2_USER_PRCTL; | ||
3082 | + break; | ||
3083 | + } | ||
3084 | + | ||
3085 | + /* Initialize Indirect Branch Prediction Barrier */ | ||
3086 | + if (boot_cpu_has(X86_FEATURE_IBPB)) { | ||
3087 | + setup_force_cpu_cap(X86_FEATURE_USE_IBPB); | ||
3088 | + | ||
3089 | + switch (cmd) { | ||
3090 | + case SPECTRE_V2_USER_CMD_FORCE: | ||
3091 | + case SPECTRE_V2_USER_CMD_PRCTL_IBPB: | ||
3092 | + case SPECTRE_V2_USER_CMD_SECCOMP_IBPB: | ||
3093 | + static_branch_enable(&switch_mm_always_ibpb); | ||
3094 | + break; | ||
3095 | + case SPECTRE_V2_USER_CMD_PRCTL: | ||
3096 | + case SPECTRE_V2_USER_CMD_AUTO: | ||
3097 | + case SPECTRE_V2_USER_CMD_SECCOMP: | ||
3098 | + static_branch_enable(&switch_mm_cond_ibpb); | ||
3099 | + break; | ||
3100 | + default: | ||
3101 | + break; | ||
3102 | + } | ||
3103 | + | ||
3104 | + pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n", | ||
3105 | + static_key_enabled(&switch_mm_always_ibpb) ? | ||
3106 | + "always-on" : "conditional"); | ||
3107 | + } | ||
3108 | + | ||
3109 | + /* If enhanced IBRS is enabled no STIBP required */ | ||
3110 | + if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) | ||
3111 | + return; | ||
3112 | + | ||
3113 | + /* | ||
3114 | + * If SMT is not possible or STIBP is not available clear the STIBP | ||
3115 | + * mode. | ||
3116 | + */ | ||
3117 | + if (!smt_possible || !boot_cpu_has(X86_FEATURE_STIBP)) | ||
3118 | + mode = SPECTRE_V2_USER_NONE; | ||
3119 | +set_mode: | ||
3120 | + spectre_v2_user = mode; | ||
3121 | + /* Only print the STIBP mode when SMT possible */ | ||
3122 | + if (smt_possible) | ||
3123 | + pr_info("%s\n", spectre_v2_user_strings[mode]); | ||
3124 | } | ||
3125 | |||
3126 | +static const char * const spectre_v2_strings[] = { | ||
3127 | + [SPECTRE_V2_NONE] = "Vulnerable", | ||
3128 | + [SPECTRE_V2_RETPOLINE_MINIMAL] = "Vulnerable: Minimal generic ASM retpoline", | ||
3129 | + [SPECTRE_V2_RETPOLINE_MINIMAL_AMD] = "Vulnerable: Minimal AMD ASM retpoline", | ||
3130 | + [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline", | ||
3131 | + [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline", | ||
3132 | + [SPECTRE_V2_IBRS_ENHANCED] = "Mitigation: Enhanced IBRS", | ||
3133 | +}; | ||
3134 | + | ||
3135 | static const struct { | ||
3136 | const char *option; | ||
3137 | enum spectre_v2_mitigation_cmd cmd; | ||
3138 | bool secure; | ||
3139 | -} mitigation_options[] = { | ||
3140 | - { "off", SPECTRE_V2_CMD_NONE, false }, | ||
3141 | - { "on", SPECTRE_V2_CMD_FORCE, true }, | ||
3142 | - { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false }, | ||
3143 | - { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_AMD, false }, | ||
3144 | - { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false }, | ||
3145 | - { "auto", SPECTRE_V2_CMD_AUTO, false }, | ||
3146 | +} mitigation_options[] __initconst = { | ||
3147 | + { "off", SPECTRE_V2_CMD_NONE, false }, | ||
3148 | + { "on", SPECTRE_V2_CMD_FORCE, true }, | ||
3149 | + { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false }, | ||
3150 | + { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_AMD, false }, | ||
3151 | + { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false }, | ||
3152 | + { "auto", SPECTRE_V2_CMD_AUTO, false }, | ||
3153 | }; | ||
3154 | |||
3155 | +static void __init spec_v2_print_cond(const char *reason, bool secure) | ||
3156 | +{ | ||
3157 | + if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) != secure) | ||
3158 | + pr_info("%s selected on command line.\n", reason); | ||
3159 | +} | ||
3160 | + | ||
3161 | +static inline bool retp_compiler(void) | ||
3162 | +{ | ||
3163 | + return __is_defined(RETPOLINE); | ||
3164 | +} | ||
3165 | + | ||
3166 | static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) | ||
3167 | { | ||
3168 | + enum spectre_v2_mitigation_cmd cmd = SPECTRE_V2_CMD_AUTO; | ||
3169 | char arg[20]; | ||
3170 | int ret, i; | ||
3171 | - enum spectre_v2_mitigation_cmd cmd = SPECTRE_V2_CMD_AUTO; | ||
3172 | |||
3173 | - if (cmdline_find_option_bool(boot_command_line, "nospectre_v2")) | ||
3174 | + if (cmdline_find_option_bool(boot_command_line, "nospectre_v2") || | ||
3175 | + cpu_mitigations_off()) | ||
3176 | return SPECTRE_V2_CMD_NONE; | ||
3177 | - else { | ||
3178 | - ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg)); | ||
3179 | - if (ret < 0) | ||
3180 | - return SPECTRE_V2_CMD_AUTO; | ||
3181 | |||
3182 | - for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) { | ||
3183 | - if (!match_option(arg, ret, mitigation_options[i].option)) | ||
3184 | - continue; | ||
3185 | - cmd = mitigation_options[i].cmd; | ||
3186 | - break; | ||
3187 | - } | ||
3188 | + ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg)); | ||
3189 | + if (ret < 0) | ||
3190 | + return SPECTRE_V2_CMD_AUTO; | ||
3191 | |||
3192 | - if (i >= ARRAY_SIZE(mitigation_options)) { | ||
3193 | - pr_err("unknown option (%s). Switching to AUTO select\n", arg); | ||
3194 | - return SPECTRE_V2_CMD_AUTO; | ||
3195 | - } | ||
3196 | + for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) { | ||
3197 | + if (!match_option(arg, ret, mitigation_options[i].option)) | ||
3198 | + continue; | ||
3199 | + cmd = mitigation_options[i].cmd; | ||
3200 | + break; | ||
3201 | + } | ||
3202 | + | ||
3203 | + if (i >= ARRAY_SIZE(mitigation_options)) { | ||
3204 | + pr_err("unknown option (%s). Switching to AUTO select\n", arg); | ||
3205 | + return SPECTRE_V2_CMD_AUTO; | ||
3206 | } | ||
3207 | |||
3208 | if ((cmd == SPECTRE_V2_CMD_RETPOLINE || | ||
3209 | @@ -314,11 +535,8 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) | ||
3210 | return SPECTRE_V2_CMD_AUTO; | ||
3211 | } | ||
3212 | |||
3213 | - if (mitigation_options[i].secure) | ||
3214 | - spec2_print_if_secure(mitigation_options[i].option); | ||
3215 | - else | ||
3216 | - spec2_print_if_insecure(mitigation_options[i].option); | ||
3217 | - | ||
3218 | + spec_v2_print_cond(mitigation_options[i].option, | ||
3219 | + mitigation_options[i].secure); | ||
3220 | return cmd; | ||
3221 | } | ||
3222 | |||
3223 | @@ -400,12 +618,6 @@ specv2_set_mode: | ||
3224 | setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); | ||
3225 | pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n"); | ||
3226 | |||
3227 | - /* Initialize Indirect Branch Prediction Barrier if supported */ | ||
3228 | - if (boot_cpu_has(X86_FEATURE_IBPB)) { | ||
3229 | - setup_force_cpu_cap(X86_FEATURE_USE_IBPB); | ||
3230 | - pr_info("Spectre v2 mitigation: Enabling Indirect Branch Prediction Barrier\n"); | ||
3231 | - } | ||
3232 | - | ||
3233 | /* | ||
3234 | * Retpoline means the kernel is safe because it has no indirect | ||
3235 | * branches. Enhanced IBRS protects firmware too, so, enable restricted | ||
3236 | @@ -421,6 +633,99 @@ specv2_set_mode: | ||
3237 | setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW); | ||
3238 | pr_info("Enabling Restricted Speculation for firmware calls\n"); | ||
3239 | } | ||
3240 | + | ||
3241 | + /* Set up IBPB and STIBP depending on the general spectre V2 command */ | ||
3242 | + spectre_v2_user_select_mitigation(cmd); | ||
3243 | +} | ||
3244 | + | ||
3245 | +static void update_stibp_msr(void * __unused) | ||
3246 | +{ | ||
3247 | + wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); | ||
3248 | +} | ||
3249 | + | ||
3250 | +/* Update x86_spec_ctrl_base in case SMT state changed. */ | ||
3251 | +static void update_stibp_strict(void) | ||
3252 | +{ | ||
3253 | + u64 mask = x86_spec_ctrl_base & ~SPEC_CTRL_STIBP; | ||
3254 | + | ||
3255 | + if (sched_smt_active()) | ||
3256 | + mask |= SPEC_CTRL_STIBP; | ||
3257 | + | ||
3258 | + if (mask == x86_spec_ctrl_base) | ||
3259 | + return; | ||
3260 | + | ||
3261 | + pr_info("Update user space SMT mitigation: STIBP %s\n", | ||
3262 | + mask & SPEC_CTRL_STIBP ? "always-on" : "off"); | ||
3263 | + x86_spec_ctrl_base = mask; | ||
3264 | + on_each_cpu(update_stibp_msr, NULL, 1); | ||
3265 | +} | ||
3266 | + | ||
3267 | +/* Update the static key controlling the evaluation of TIF_SPEC_IB */ | ||
3268 | +static void update_indir_branch_cond(void) | ||
3269 | +{ | ||
3270 | + if (sched_smt_active()) | ||
3271 | + static_branch_enable(&switch_to_cond_stibp); | ||
3272 | + else | ||
3273 | + static_branch_disable(&switch_to_cond_stibp); | ||
3274 | +} | ||
3275 | + | ||
3276 | +#undef pr_fmt | ||
3277 | +#define pr_fmt(fmt) fmt | ||
3278 | + | ||
3279 | +/* Update the static key controlling the MDS CPU buffer clear in idle */ | ||
3280 | +static void update_mds_branch_idle(void) | ||
3281 | +{ | ||
3282 | + /* | ||
3283 | + * Enable the idle clearing if SMT is active on CPUs which are | ||
3284 | + * affected only by MSBDS and not any other MDS variant. | ||
3285 | + * | ||
3286 | + * The other variants cannot be mitigated when SMT is enabled, so | ||
3287 | + * clearing the buffers on idle just to prevent the Store Buffer | ||
3288 | + * repartitioning leak would be a window dressing exercise. | ||
3289 | + */ | ||
3290 | + if (!boot_cpu_has_bug(X86_BUG_MSBDS_ONLY)) | ||
3291 | + return; | ||
3292 | + | ||
3293 | + if (sched_smt_active()) | ||
3294 | + static_branch_enable(&mds_idle_clear); | ||
3295 | + else | ||
3296 | + static_branch_disable(&mds_idle_clear); | ||
3297 | +} | ||
3298 | + | ||
3299 | +#define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n" | ||
3300 | + | ||
3301 | +void arch_smt_update(void) | ||
3302 | +{ | ||
3303 | + /* Enhanced IBRS implies STIBP. No update required. */ | ||
3304 | + if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) | ||
3305 | + return; | ||
3306 | + | ||
3307 | + mutex_lock(&spec_ctrl_mutex); | ||
3308 | + | ||
3309 | + switch (spectre_v2_user) { | ||
3310 | + case SPECTRE_V2_USER_NONE: | ||
3311 | + break; | ||
3312 | + case SPECTRE_V2_USER_STRICT: | ||
3313 | + update_stibp_strict(); | ||
3314 | + break; | ||
3315 | + case SPECTRE_V2_USER_PRCTL: | ||
3316 | + case SPECTRE_V2_USER_SECCOMP: | ||
3317 | + update_indir_branch_cond(); | ||
3318 | + break; | ||
3319 | + } | ||
3320 | + | ||
3321 | + switch (mds_mitigation) { | ||
3322 | + case MDS_MITIGATION_FULL: | ||
3323 | + case MDS_MITIGATION_VMWERV: | ||
3324 | + if (sched_smt_active() && !boot_cpu_has(X86_BUG_MSBDS_ONLY)) | ||
3325 | + pr_warn_once(MDS_MSG_SMT); | ||
3326 | + update_mds_branch_idle(); | ||
3327 | + break; | ||
3328 | + case MDS_MITIGATION_OFF: | ||
3329 | + break; | ||
3330 | + } | ||
3331 | + | ||
3332 | + mutex_unlock(&spec_ctrl_mutex); | ||
3333 | } | ||
3334 | |||
3335 | #undef pr_fmt | ||
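
update_stibp_strict() is deliberately idempotent: it recomputes the STIBP bit from the current SMT state and only broadcasts the MSR write (one IPI per CPU via on_each_cpu()) when the base value actually changed. A reduced user-space model of that logic, with sched_smt_active() and the MSR write replaced by stand-ins:

    /* Reduced model of update_stibp_strict(); smt_active and
     * wrmsr_all() stand in for sched_smt_active()/on_each_cpu(). */
    #include <stdbool.h>
    #include <stdio.h>

    #define SPEC_CTRL_STIBP 0x2UL

    static unsigned long spec_ctrl_base;

    static void wrmsr_all(unsigned long v)
    {
            printf("wrmsr SPEC_CTRL = 0x%lx on every CPU\n", v);
    }

    static void update_stibp_strict(bool smt_active)
    {
            unsigned long mask = spec_ctrl_base & ~SPEC_CTRL_STIBP;

            if (smt_active)
                    mask |= SPEC_CTRL_STIBP;
            if (mask == spec_ctrl_base)
                    return;                 /* unchanged: skip the IPIs */
            spec_ctrl_base = mask;
            wrmsr_all(mask);
    }

    int main(void)
    {
            update_stibp_strict(true);      /* writes 0x2 */
            update_stibp_strict(true);      /* no-op */
            update_stibp_strict(false);     /* writes 0x0 */
            return 0;
    }
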
3336 | @@ -437,7 +742,7 @@ enum ssb_mitigation_cmd { | ||
3337 | SPEC_STORE_BYPASS_CMD_SECCOMP, | ||
3338 | }; | ||
3339 | |||
3340 | -static const char *ssb_strings[] = { | ||
3341 | +static const char * const ssb_strings[] = { | ||
3342 | [SPEC_STORE_BYPASS_NONE] = "Vulnerable", | ||
3343 | [SPEC_STORE_BYPASS_DISABLE] = "Mitigation: Speculative Store Bypass disabled", | ||
3344 | [SPEC_STORE_BYPASS_PRCTL] = "Mitigation: Speculative Store Bypass disabled via prctl", | ||
3345 | @@ -447,7 +752,7 @@ static const char *ssb_strings[] = { | ||
3346 | static const struct { | ||
3347 | const char *option; | ||
3348 | enum ssb_mitigation_cmd cmd; | ||
3349 | -} ssb_mitigation_options[] = { | ||
3350 | +} ssb_mitigation_options[] __initconst = { | ||
3351 | { "auto", SPEC_STORE_BYPASS_CMD_AUTO }, /* Platform decides */ | ||
3352 | { "on", SPEC_STORE_BYPASS_CMD_ON }, /* Disable Speculative Store Bypass */ | ||
3353 | { "off", SPEC_STORE_BYPASS_CMD_NONE }, /* Don't touch Speculative Store Bypass */ | ||
3354 | @@ -461,7 +766,8 @@ static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void) | ||
3355 | char arg[20]; | ||
3356 | int ret, i; | ||
3357 | |||
3358 | - if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable")) { | ||
3359 | + if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable") || | ||
3360 | + cpu_mitigations_off()) { | ||
3361 | return SPEC_STORE_BYPASS_CMD_NONE; | ||
3362 | } else { | ||
3363 | ret = cmdline_find_option(boot_command_line, "spec_store_bypass_disable", | ||
3364 | @@ -531,18 +837,16 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) | ||
3365 | if (mode == SPEC_STORE_BYPASS_DISABLE) { | ||
3366 | setup_force_cpu_cap(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE); | ||
3367 | /* | ||
3368 | - * Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD uses | ||
3369 | - * a completely different MSR and bit dependent on family. | ||
3370 | + * Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD may | ||
3371 | + * use a completely different MSR and bit dependent on family. | ||
3372 | */ | ||
3373 | - switch (boot_cpu_data.x86_vendor) { | ||
3374 | - case X86_VENDOR_INTEL: | ||
3375 | + if (!static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) && | ||
3376 | + !static_cpu_has(X86_FEATURE_AMD_SSBD)) { | ||
3377 | + x86_amd_ssb_disable(); | ||
3378 | + } else { | ||
3379 | x86_spec_ctrl_base |= SPEC_CTRL_SSBD; | ||
3380 | x86_spec_ctrl_mask |= SPEC_CTRL_SSBD; | ||
3381 | wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); | ||
3382 | - break; | ||
3383 | - case X86_VENDOR_AMD: | ||
3384 | - x86_amd_ssb_disable(); | ||
3385 | - break; | ||
3386 | } | ||
3387 | } | ||
3388 | |||
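
The net effect of these selection routines is what the files under /sys/devices/system/cpu/vulnerabilities report; the strings come from tables such as ssb_strings above. A small reader as a usage example:

    /* Print the mitigation strings the kernel exports via sysfs. */
    #include <stdio.h>

    static void show(const char *name)
    {
            char path[128], line[256];
            FILE *f;

            snprintf(path, sizeof(path),
                     "/sys/devices/system/cpu/vulnerabilities/%s", name);
            f = fopen(path, "r");
            if (f && fgets(line, sizeof(line), f))
                    printf("%-18s %s", name, line);
            if (f)
                    fclose(f);
    }

    int main(void)
    {
            show("spec_store_bypass");
            show("mds");
            show("l1tf");
            return 0;
    }
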
3389 | @@ -560,10 +864,25 @@ static void ssb_select_mitigation(void) | ||
3390 | #undef pr_fmt | ||
3391 | #define pr_fmt(fmt) "Speculation prctl: " fmt | ||
3392 | |||
3393 | -static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl) | ||
3394 | +static void task_update_spec_tif(struct task_struct *tsk) | ||
3395 | { | ||
3396 | - bool update; | ||
3397 | + /* Force the update of the real TIF bits */ | ||
3398 | + set_tsk_thread_flag(tsk, TIF_SPEC_FORCE_UPDATE); | ||
3399 | |||
3400 | + /* | ||
3401 | + * Immediately update the speculation control MSRs for the current | ||
3402 | + * task, but for a non-current task delay setting the CPU | ||
3403 | + * mitigation until it is scheduled next. | ||
3404 | + * | ||
3405 | + * This can only happen for SECCOMP mitigation. For PRCTL it's | ||
3406 | + * always the current task. | ||
3407 | + */ | ||
3408 | + if (tsk == current) | ||
3409 | + speculation_ctrl_update_current(); | ||
3410 | +} | ||
3411 | + | ||
3412 | +static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl) | ||
3413 | +{ | ||
3414 | if (ssb_mode != SPEC_STORE_BYPASS_PRCTL && | ||
3415 | ssb_mode != SPEC_STORE_BYPASS_SECCOMP) | ||
3416 | return -ENXIO; | ||
3417 | @@ -574,28 +893,56 @@ static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl) | ||
3418 | if (task_spec_ssb_force_disable(task)) | ||
3419 | return -EPERM; | ||
3420 | task_clear_spec_ssb_disable(task); | ||
3421 | - update = test_and_clear_tsk_thread_flag(task, TIF_SSBD); | ||
3422 | + task_update_spec_tif(task); | ||
3423 | break; | ||
3424 | case PR_SPEC_DISABLE: | ||
3425 | task_set_spec_ssb_disable(task); | ||
3426 | - update = !test_and_set_tsk_thread_flag(task, TIF_SSBD); | ||
3427 | + task_update_spec_tif(task); | ||
3428 | break; | ||
3429 | case PR_SPEC_FORCE_DISABLE: | ||
3430 | task_set_spec_ssb_disable(task); | ||
3431 | task_set_spec_ssb_force_disable(task); | ||
3432 | - update = !test_and_set_tsk_thread_flag(task, TIF_SSBD); | ||
3433 | + task_update_spec_tif(task); | ||
3434 | break; | ||
3435 | default: | ||
3436 | return -ERANGE; | ||
3437 | } | ||
3438 | + return 0; | ||
3439 | +} | ||
3440 | |||
3441 | - /* | ||
3442 | - * If being set on non-current task, delay setting the CPU | ||
3443 | - * mitigation until it is next scheduled. | ||
3444 | - */ | ||
3445 | - if (task == current && update) | ||
3446 | - speculative_store_bypass_update_current(); | ||
3447 | - | ||
3448 | +static int ib_prctl_set(struct task_struct *task, unsigned long ctrl) | ||
3449 | +{ | ||
3450 | + switch (ctrl) { | ||
3451 | + case PR_SPEC_ENABLE: | ||
3452 | + if (spectre_v2_user == SPECTRE_V2_USER_NONE) | ||
3453 | + return 0; | ||
3454 | + /* | ||
3455 | + * Indirect branch speculation is always disabled in strict | ||
3456 | + * mode. | ||
3457 | + */ | ||
3458 | + if (spectre_v2_user == SPECTRE_V2_USER_STRICT) | ||
3459 | + return -EPERM; | ||
3460 | + task_clear_spec_ib_disable(task); | ||
3461 | + task_update_spec_tif(task); | ||
3462 | + break; | ||
3463 | + case PR_SPEC_DISABLE: | ||
3464 | + case PR_SPEC_FORCE_DISABLE: | ||
3465 | + /* | ||
3466 | + * Indirect branch speculation is always allowed when | ||
3467 | + * mitigation is force disabled. | ||
3468 | + */ | ||
3469 | + if (spectre_v2_user == SPECTRE_V2_USER_NONE) | ||
3470 | + return -EPERM; | ||
3471 | + if (spectre_v2_user == SPECTRE_V2_USER_STRICT) | ||
3472 | + return 0; | ||
3473 | + task_set_spec_ib_disable(task); | ||
3474 | + if (ctrl == PR_SPEC_FORCE_DISABLE) | ||
3475 | + task_set_spec_ib_force_disable(task); | ||
3476 | + task_update_spec_tif(task); | ||
3477 | + break; | ||
3478 | + default: | ||
3479 | + return -ERANGE; | ||
3480 | + } | ||
3481 | return 0; | ||
3482 | } | ||
3483 | |||
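
ib_prctl_set() is reached from user space via prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, ...). Note from the code above that PR_SPEC_DISABLE fails with EPERM when spectre_v2_user is NONE and is a no-op when it is STRICT. A usage sketch, assuming kernel headers recent enough to define the PR_SPEC_* constants:

    /* Hypothetical usage of the per-task indirect branch control.
     * Needs <linux/prctl.h> constants from 4.20-era headers or a
     * distro backport; <sys/prctl.h> pulls them in on Linux. */
    #include <stdio.h>
    #include <sys/prctl.h>

    int main(void)
    {
            /* Opt this task out of indirect branch speculation. */
            if (prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH,
                      PR_SPEC_DISABLE, 0, 0))
                    perror("PR_SET_SPECULATION_CTRL");

            /* ib_prctl_get() reports the resulting state as a bitmask. */
            printf("state: 0x%x\n",
                   prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH,
                         0, 0, 0));
            return 0;
    }
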
3484 | @@ -605,6 +952,8 @@ int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which, | ||
3485 | switch (which) { | ||
3486 | case PR_SPEC_STORE_BYPASS: | ||
3487 | return ssb_prctl_set(task, ctrl); | ||
3488 | + case PR_SPEC_INDIRECT_BRANCH: | ||
3489 | + return ib_prctl_set(task, ctrl); | ||
3490 | default: | ||
3491 | return -ENODEV; | ||
3492 | } | ||
3493 | @@ -615,6 +964,8 @@ void arch_seccomp_spec_mitigate(struct task_struct *task) | ||
3494 | { | ||
3495 | if (ssb_mode == SPEC_STORE_BYPASS_SECCOMP) | ||
3496 | ssb_prctl_set(task, PR_SPEC_FORCE_DISABLE); | ||
3497 | + if (spectre_v2_user == SPECTRE_V2_USER_SECCOMP) | ||
3498 | + ib_prctl_set(task, PR_SPEC_FORCE_DISABLE); | ||
3499 | } | ||
3500 | #endif | ||
3501 | |||
3502 | @@ -637,11 +988,35 @@ static int ssb_prctl_get(struct task_struct *task) | ||
3503 | } | ||
3504 | } | ||
3505 | |||
3506 | +static int ib_prctl_get(struct task_struct *task) | ||
3507 | +{ | ||
3508 | + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) | ||
3509 | + return PR_SPEC_NOT_AFFECTED; | ||
3510 | + | ||
3511 | + switch (spectre_v2_user) { | ||
3512 | + case SPECTRE_V2_USER_NONE: | ||
3513 | + return PR_SPEC_ENABLE; | ||
3514 | + case SPECTRE_V2_USER_PRCTL: | ||
3515 | + case SPECTRE_V2_USER_SECCOMP: | ||
3516 | + if (task_spec_ib_force_disable(task)) | ||
3517 | + return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE; | ||
3518 | + if (task_spec_ib_disable(task)) | ||
3519 | + return PR_SPEC_PRCTL | PR_SPEC_DISABLE; | ||
3520 | + return PR_SPEC_PRCTL | PR_SPEC_ENABLE; | ||
3521 | + case SPECTRE_V2_USER_STRICT: | ||
3522 | + return PR_SPEC_DISABLE; | ||
3523 | + default: | ||
3524 | + return PR_SPEC_NOT_AFFECTED; | ||
3525 | + } | ||
3526 | +} | ||
3527 | + | ||
3528 | int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) | ||
3529 | { | ||
3530 | switch (which) { | ||
3531 | case PR_SPEC_STORE_BYPASS: | ||
3532 | return ssb_prctl_get(task); | ||
3533 | + case PR_SPEC_INDIRECT_BRANCH: | ||
3534 | + return ib_prctl_get(task); | ||
3535 | default: | ||
3536 | return -ENODEV; | ||
3537 | } | ||
3538 | @@ -713,6 +1088,11 @@ static void __init l1tf_select_mitigation(void) | ||
3539 | if (!boot_cpu_has_bug(X86_BUG_L1TF)) | ||
3540 | return; | ||
3541 | |||
3542 | + if (cpu_mitigations_off()) | ||
3543 | + l1tf_mitigation = L1TF_MITIGATION_OFF; | ||
3544 | + else if (cpu_mitigations_auto_nosmt()) | ||
3545 | + l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT; | ||
3546 | + | ||
3547 | override_cache_bits(&boot_cpu_data); | ||
3548 | |||
3549 | switch (l1tf_mitigation) { | ||
3550 | @@ -735,12 +1115,13 @@ static void __init l1tf_select_mitigation(void) | ||
3551 | #endif | ||
3552 | |||
3553 | half_pa = (u64)l1tf_pfn_limit() << PAGE_SHIFT; | ||
3554 | - if (e820_any_mapped(half_pa, ULLONG_MAX - half_pa, E820_RAM)) { | ||
3555 | + if (l1tf_mitigation != L1TF_MITIGATION_OFF && | ||
3556 | + e820_any_mapped(half_pa, ULLONG_MAX - half_pa, E820_RAM)) { | ||
3557 | pr_warn("System has more than MAX_PA/2 memory. L1TF mitigation not effective.\n"); | ||
3558 | pr_info("You may make it effective by booting the kernel with mem=%llu parameter.\n", | ||
3559 | half_pa); | ||
3560 | pr_info("However, doing so will make a part of your RAM unusable.\n"); | ||
3561 | - pr_info("Reading https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html might help you decide.\n"); | ||
3562 | + pr_info("Reading https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html might help you decide.\n"); | ||
3563 | return; | ||
3564 | } | ||
3565 | |||
3566 | @@ -773,13 +1154,14 @@ static int __init l1tf_cmdline(char *str) | ||
3567 | early_param("l1tf", l1tf_cmdline); | ||
3568 | |||
3569 | #undef pr_fmt | ||
3570 | +#define pr_fmt(fmt) fmt | ||
3571 | |||
3572 | #ifdef CONFIG_SYSFS | ||
3573 | |||
3574 | #define L1TF_DEFAULT_MSG "Mitigation: PTE Inversion" | ||
3575 | |||
3576 | #if IS_ENABLED(CONFIG_KVM_INTEL) | ||
3577 | -static const char *l1tf_vmx_states[] = { | ||
3578 | +static const char * const l1tf_vmx_states[] = { | ||
3579 | [VMENTER_L1D_FLUSH_AUTO] = "auto", | ||
3580 | [VMENTER_L1D_FLUSH_NEVER] = "vulnerable", | ||
3581 | [VMENTER_L1D_FLUSH_COND] = "conditional cache flushes", | ||
3582 | @@ -795,13 +1177,14 @@ static ssize_t l1tf_show_state(char *buf) | ||
3583 | |||
3584 | if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_EPT_DISABLED || | ||
3585 | (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER && | ||
3586 | - cpu_smt_control == CPU_SMT_ENABLED)) | ||
3587 | + sched_smt_active())) { | ||
3588 | return sprintf(buf, "%s; VMX: %s\n", L1TF_DEFAULT_MSG, | ||
3589 | l1tf_vmx_states[l1tf_vmx_mitigation]); | ||
3590 | + } | ||
3591 | |||
3592 | return sprintf(buf, "%s; VMX: %s, SMT %s\n", L1TF_DEFAULT_MSG, | ||
3593 | l1tf_vmx_states[l1tf_vmx_mitigation], | ||
3594 | - cpu_smt_control == CPU_SMT_ENABLED ? "vulnerable" : "disabled"); | ||
3595 | + sched_smt_active() ? "vulnerable" : "disabled"); | ||
3596 | } | ||
3597 | #else | ||
3598 | static ssize_t l1tf_show_state(char *buf) | ||
3599 | @@ -810,6 +1193,55 @@ static ssize_t l1tf_show_state(char *buf) | ||
3600 | } | ||
3601 | #endif | ||
3602 | |||
3603 | +static ssize_t mds_show_state(char *buf) | ||
3604 | +{ | ||
3605 | +#ifdef CONFIG_HYPERVISOR_GUEST | ||
3606 | + if (x86_hyper) { | ||
3607 | + return sprintf(buf, "%s; SMT Host state unknown\n", | ||
3608 | + mds_strings[mds_mitigation]); | ||
3609 | + } | ||
3610 | +#endif | ||
3611 | + | ||
3612 | + if (boot_cpu_has(X86_BUG_MSBDS_ONLY)) { | ||
3613 | + return sprintf(buf, "%s; SMT %s\n", mds_strings[mds_mitigation], | ||
3614 | + (mds_mitigation == MDS_MITIGATION_OFF ? "vulnerable" : | ||
3615 | + sched_smt_active() ? "mitigated" : "disabled")); | ||
3616 | + } | ||
3617 | + | ||
3618 | + return sprintf(buf, "%s; SMT %s\n", mds_strings[mds_mitigation], | ||
3619 | + sched_smt_active() ? "vulnerable" : "disabled"); | ||
3620 | +} | ||
3621 | + | ||
3622 | +static char *stibp_state(void) | ||
3623 | +{ | ||
3624 | + if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) | ||
3625 | + return ""; | ||
3626 | + | ||
3627 | + switch (spectre_v2_user) { | ||
3628 | + case SPECTRE_V2_USER_NONE: | ||
3629 | + return ", STIBP: disabled"; | ||
3630 | + case SPECTRE_V2_USER_STRICT: | ||
3631 | + return ", STIBP: forced"; | ||
3632 | + case SPECTRE_V2_USER_PRCTL: | ||
3633 | + case SPECTRE_V2_USER_SECCOMP: | ||
3634 | + if (static_key_enabled(&switch_to_cond_stibp)) | ||
3635 | + return ", STIBP: conditional"; | ||
3636 | + } | ||
3637 | + return ""; | ||
3638 | +} | ||
3639 | + | ||
3640 | +static char *ibpb_state(void) | ||
3641 | +{ | ||
3642 | + if (boot_cpu_has(X86_FEATURE_IBPB)) { | ||
3643 | + if (static_key_enabled(&switch_mm_always_ibpb)) | ||
3644 | + return ", IBPB: always-on"; | ||
3645 | + if (static_key_enabled(&switch_mm_cond_ibpb)) | ||
3646 | + return ", IBPB: conditional"; | ||
3647 | + return ", IBPB: disabled"; | ||
3648 | + } | ||
3649 | + return ""; | ||
3650 | +} | ||
3651 | + | ||
3652 | static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, | ||
3653 | char *buf, unsigned int bug) | ||
3654 | { | ||
3655 | @@ -827,9 +1259,11 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr | ||
3656 | return sprintf(buf, "Mitigation: __user pointer sanitization\n"); | ||
3657 | |||
3658 | case X86_BUG_SPECTRE_V2: | ||
3659 | - return sprintf(buf, "%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], | ||
3660 | - boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "", | ||
3661 | + return sprintf(buf, "%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], | ||
3662 | + ibpb_state(), | ||
3663 | boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", | ||
3664 | + stibp_state(), | ||
3665 | + boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "", | ||
3666 | spectre_v2_module_string()); | ||
3667 | |||
3668 | case X86_BUG_SPEC_STORE_BYPASS: | ||
3669 | @@ -839,6 +1273,10 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr | ||
3670 | if (boot_cpu_has(X86_FEATURE_L1TF_PTEINV)) | ||
3671 | return l1tf_show_state(buf); | ||
3672 | break; | ||
3673 | + | ||
3674 | + case X86_BUG_MDS: | ||
3675 | + return mds_show_state(buf); | ||
3676 | + | ||
3677 | default: | ||
3678 | break; | ||
3679 | } | ||
3680 | @@ -870,4 +1308,9 @@ ssize_t cpu_show_l1tf(struct device *dev, struct device_attribute *attr, char *b | ||
3681 | { | ||
3682 | return cpu_show_common(dev, attr, buf, X86_BUG_L1TF); | ||
3683 | } | ||
3684 | + | ||
3685 | +ssize_t cpu_show_mds(struct device *dev, struct device_attribute *attr, char *buf) | ||
3686 | +{ | ||
3687 | + return cpu_show_common(dev, attr, buf, X86_BUG_MDS); | ||
3688 | +} | ||
3689 | #endif | ||
3690 | diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c | ||
3691 | index 3c01610c5ba9..cda130dc56b9 100644 | ||
3692 | --- a/arch/x86/kernel/cpu/common.c | ||
3693 | +++ b/arch/x86/kernel/cpu/common.c | ||
3694 | @@ -752,6 +752,12 @@ static void init_speculation_control(struct cpuinfo_x86 *c) | ||
3695 | set_cpu_cap(c, X86_FEATURE_STIBP); | ||
3696 | set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL); | ||
3697 | } | ||
3698 | + | ||
3699 | + if (cpu_has(c, X86_FEATURE_AMD_SSBD)) { | ||
3700 | + set_cpu_cap(c, X86_FEATURE_SSBD); | ||
3701 | + set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL); | ||
3702 | + clear_cpu_cap(c, X86_FEATURE_VIRT_SSBD); | ||
3703 | + } | ||
3704 | } | ||
3705 | |||
3706 | void get_cpu_cap(struct cpuinfo_x86 *c) | ||
3707 | @@ -885,84 +891,95 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) | ||
3708 | c->x86_cache_bits = c->x86_phys_bits; | ||
3709 | } | ||
3710 | |||
3711 | -static const __initconst struct x86_cpu_id cpu_no_speculation[] = { | ||
3712 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CEDARVIEW, X86_FEATURE_ANY }, | ||
3713 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CLOVERVIEW, X86_FEATURE_ANY }, | ||
3714 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_LINCROFT, X86_FEATURE_ANY }, | ||
3715 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PENWELL, X86_FEATURE_ANY }, | ||
3716 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PINEVIEW, X86_FEATURE_ANY }, | ||
3717 | - { X86_VENDOR_CENTAUR, 5 }, | ||
3718 | - { X86_VENDOR_INTEL, 5 }, | ||
3719 | - { X86_VENDOR_NSC, 5 }, | ||
3720 | - { X86_VENDOR_ANY, 4 }, | ||
3721 | - {} | ||
3722 | -}; | ||
3723 | +#define NO_SPECULATION BIT(0) | ||
3724 | +#define NO_MELTDOWN BIT(1) | ||
3725 | +#define NO_SSB BIT(2) | ||
3726 | +#define NO_L1TF BIT(3) | ||
3727 | +#define NO_MDS BIT(4) | ||
3728 | +#define MSBDS_ONLY BIT(5) | ||
3729 | |||
3730 | -static const __initconst struct x86_cpu_id cpu_no_meltdown[] = { | ||
3731 | - { X86_VENDOR_AMD }, | ||
3732 | - {} | ||
3733 | -}; | ||
3734 | +#define VULNWL(_vendor, _family, _model, _whitelist) \ | ||
3735 | + { X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist } | ||
3736 | |||
3737 | -static const __initconst struct x86_cpu_id cpu_no_spec_store_bypass[] = { | ||
3738 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PINEVIEW }, | ||
3739 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_LINCROFT }, | ||
3740 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PENWELL }, | ||
3741 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CLOVERVIEW }, | ||
3742 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CEDARVIEW }, | ||
3743 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 }, | ||
3744 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT }, | ||
3745 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT2 }, | ||
3746 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MERRIFIELD }, | ||
3747 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_CORE_YONAH }, | ||
3748 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL }, | ||
3749 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM }, | ||
3750 | - { X86_VENDOR_CENTAUR, 5, }, | ||
3751 | - { X86_VENDOR_INTEL, 5, }, | ||
3752 | - { X86_VENDOR_NSC, 5, }, | ||
3753 | - { X86_VENDOR_AMD, 0x12, }, | ||
3754 | - { X86_VENDOR_AMD, 0x11, }, | ||
3755 | - { X86_VENDOR_AMD, 0x10, }, | ||
3756 | - { X86_VENDOR_AMD, 0xf, }, | ||
3757 | - { X86_VENDOR_ANY, 4, }, | ||
3758 | - {} | ||
3759 | -}; | ||
3760 | +#define VULNWL_INTEL(model, whitelist) \ | ||
3761 | + VULNWL(INTEL, 6, INTEL_FAM6_##model, whitelist) | ||
3762 | + | ||
3763 | +#define VULNWL_AMD(family, whitelist) \ | ||
3764 | + VULNWL(AMD, family, X86_MODEL_ANY, whitelist) | ||
3765 | + | ||
3766 | +static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { | ||
3767 | + VULNWL(ANY, 4, X86_MODEL_ANY, NO_SPECULATION), | ||
3768 | + VULNWL(CENTAUR, 5, X86_MODEL_ANY, NO_SPECULATION), | ||
3769 | + VULNWL(INTEL, 5, X86_MODEL_ANY, NO_SPECULATION), | ||
3770 | + VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION), | ||
3771 | + | ||
3772 | + /* Intel Family 6 */ | ||
3773 | + VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION), | ||
3774 | + VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION), | ||
3775 | + VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION), | ||
3776 | + VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION), | ||
3777 | + VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION), | ||
3778 | + | ||
3779 | + VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY), | ||
3780 | + VULNWL_INTEL(ATOM_SILVERMONT_X, NO_SSB | NO_L1TF | MSBDS_ONLY), | ||
3781 | + VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY), | ||
3782 | + VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY), | ||
3783 | + VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY), | ||
3784 | + VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY), | ||
3785 | + | ||
3786 | + VULNWL_INTEL(CORE_YONAH, NO_SSB), | ||
3787 | + | ||
3788 | + VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY), | ||
3789 | + | ||
3790 | + VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF), | ||
3791 | + VULNWL_INTEL(ATOM_GOLDMONT_X, NO_MDS | NO_L1TF), | ||
3792 | + VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF), | ||
3793 | |||
3794 | -static const __initconst struct x86_cpu_id cpu_no_l1tf[] = { | ||
3795 | - /* in addition to cpu_no_speculation */ | ||
3796 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 }, | ||
3797 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT2 }, | ||
3798 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT }, | ||
3799 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MERRIFIELD }, | ||
3800 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MOOREFIELD }, | ||
3801 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT }, | ||
3802 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_DENVERTON }, | ||
3803 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GEMINI_LAKE }, | ||
3804 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL }, | ||
3805 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM }, | ||
3806 | + /* AMD Family 0xf - 0x12 */ | ||
3807 | + VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS), | ||
3808 | + VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS), | ||
3809 | + VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS), | ||
3810 | + VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS), | ||
3811 | + | ||
3812 | + /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */ | ||
3813 | + VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS), | ||
3814 | {} | ||
3815 | }; | ||
3816 | |||
3817 | -static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) | ||
3818 | +static bool __init cpu_matches(unsigned long which) | ||
3819 | { | ||
3820 | - u64 ia32_cap = 0; | ||
3821 | + const struct x86_cpu_id *m = x86_match_cpu(cpu_vuln_whitelist); | ||
3822 | |||
3823 | - if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES)) | ||
3824 | - rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); | ||
3825 | + return m && !!(m->driver_data & which); | ||
3826 | +} | ||
3827 | |||
3828 | - if (!x86_match_cpu(cpu_no_spec_store_bypass) && | ||
3829 | - !(ia32_cap & ARCH_CAP_SSB_NO)) | ||
3830 | - setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); | ||
3831 | +static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) | ||
3832 | +{ | ||
3833 | + u64 ia32_cap = 0; | ||
3834 | |||
3835 | - if (x86_match_cpu(cpu_no_speculation)) | ||
3836 | + if (cpu_matches(NO_SPECULATION)) | ||
3837 | return; | ||
3838 | |||
3839 | setup_force_cpu_bug(X86_BUG_SPECTRE_V1); | ||
3840 | setup_force_cpu_bug(X86_BUG_SPECTRE_V2); | ||
3841 | |||
3842 | + if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES)) | ||
3843 | + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); | ||
3844 | + | ||
3845 | + if (!cpu_matches(NO_SSB) && !(ia32_cap & ARCH_CAP_SSB_NO) && | ||
3846 | + !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) | ||
3847 | + setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); | ||
3848 | + | ||
3849 | if (ia32_cap & ARCH_CAP_IBRS_ALL) | ||
3850 | setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED); | ||
3851 | |||
3852 | - if (x86_match_cpu(cpu_no_meltdown)) | ||
3853 | + if (!cpu_matches(NO_MDS) && !(ia32_cap & ARCH_CAP_MDS_NO)) { | ||
3854 | + setup_force_cpu_bug(X86_BUG_MDS); | ||
3855 | + if (cpu_matches(MSBDS_ONLY)) | ||
3856 | + setup_force_cpu_bug(X86_BUG_MSBDS_ONLY); | ||
3857 | + } | ||
3858 | + | ||
3859 | + if (cpu_matches(NO_MELTDOWN)) | ||
3860 | return; | ||
3861 | |||
3862 | /* Rogue Data Cache Load? No! */ | ||
3863 | @@ -971,7 +988,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) | ||
3864 | |||
3865 | setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); | ||
3866 | |||
3867 | - if (x86_match_cpu(cpu_no_l1tf)) | ||
3868 | + if (cpu_matches(NO_L1TF)) | ||
3869 | return; | ||
3870 | |||
3871 | setup_force_cpu_bug(X86_BUG_L1TF); | ||
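
The common.c rewrite above folds four per-bug match tables into a single whitelist whose driver_data carries NO_* flag bits. The lookup pattern is easy to see in isolation; the sketch below re-implements it outside the kernel with a simplified struct and illustrative family values (the real code matches vendor/family/model through x86_match_cpu()):

    #include <stdbool.h>
    #include <stdio.h>

    #define NO_MELTDOWN (1UL << 1)
    #define NO_SSB      (1UL << 2)
    #define NO_MDS      (1UL << 4)

    #define FAMILY_ANY 0xff            /* stand-in for X86_FAMILY_ANY */

    struct cpu_id {                    /* simplified stand-in for x86_cpu_id */
        unsigned vendor, family;
        unsigned long driver_data;
    };

    enum { INTEL, AMD };

    static const struct cpu_id whitelist[] = {
        { AMD, 0x0f,       NO_MELTDOWN | NO_SSB | NO_MDS },
        /* FAMILY_ANY must stay last, or it would shadow the 0x0f row. */
        { AMD, FAMILY_ANY, NO_MELTDOWN | NO_MDS },
        { 0, 0, 0 }                    /* terminator */
    };

    /* First match wins, mirroring x86_match_cpu() semantics. */
    static bool cpu_matches(unsigned vendor, unsigned family, unsigned long which)
    {
        const struct cpu_id *m;

        for (m = whitelist; m->driver_data; m++)
            if (m->vendor == vendor &&
                (m->family == FAMILY_ANY || m->family == family))
                return !!(m->driver_data & which);
        return false;                  /* not whitelisted: assume affected */
    }

    int main(void)
    {
        printf("%d\n", cpu_matches(AMD, 0x0f, NO_SSB)); /* 1: specific row  */
        printf("%d\n", cpu_matches(AMD, 0x17, NO_SSB)); /* 0: FAMILY_ANY row
                                                           lacks NO_SSB     */
        return 0;
    }

This is why the comment in the patch insists the FAMILY_ANY entry comes last: table order is the only thing keeping the more specific 0x0f-0x12 rows live.
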
3872 | diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c | ||
3873 | index cee0fec0d232..860f2fd9f540 100644 | ||
3874 | --- a/arch/x86/kernel/cpu/intel.c | ||
3875 | +++ b/arch/x86/kernel/cpu/intel.c | ||
3876 | @@ -14,6 +14,7 @@ | ||
3877 | #include <asm/bugs.h> | ||
3878 | #include <asm/cpu.h> | ||
3879 | #include <asm/intel-family.h> | ||
3880 | +#include <asm/microcode_intel.h> | ||
3881 | |||
3882 | #ifdef CONFIG_X86_64 | ||
3883 | #include <linux/topology.h> | ||
3884 | @@ -137,14 +138,8 @@ static void early_init_intel(struct cpuinfo_x86 *c) | ||
3885 | (c->x86 == 0x6 && c->x86_model >= 0x0e)) | ||
3886 | set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); | ||
3887 | |||
3888 | - if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64)) { | ||
3889 | - unsigned lower_word; | ||
3890 | - | ||
3891 | - wrmsr(MSR_IA32_UCODE_REV, 0, 0); | ||
3892 | - /* Required by the SDM */ | ||
3893 | - sync_core(); | ||
3894 | - rdmsr(MSR_IA32_UCODE_REV, lower_word, c->microcode); | ||
3895 | - } | ||
3896 | + if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64)) | ||
3897 | + c->microcode = intel_get_microcode_revision(); | ||
3898 | |||
3899 | /* Now if any of them are set, check the blacklist and clear the lot */ | ||
3900 | if ((cpu_has(c, X86_FEATURE_SPEC_CTRL) || | ||
3901 | diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c | ||
3902 | index 25310d2b8609..d9ad49ca3cbe 100644 | ||
3903 | --- a/arch/x86/kernel/cpu/mcheck/mce.c | ||
3904 | +++ b/arch/x86/kernel/cpu/mcheck/mce.c | ||
3905 | @@ -139,6 +139,8 @@ void mce_setup(struct mce *m) | ||
3906 | m->socketid = cpu_data(m->extcpu).phys_proc_id; | ||
3907 | m->apicid = cpu_data(m->extcpu).initial_apicid; | ||
3908 | rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); | ||
3909 | + | ||
3910 | + m->microcode = boot_cpu_data.microcode; | ||
3911 | } | ||
3912 | |||
3913 | DEFINE_PER_CPU(struct mce, injectm); | ||
3914 | @@ -309,7 +311,7 @@ static void print_mce(struct mce *m) | ||
3915 | */ | ||
3916 | pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n", | ||
3917 | m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid, | ||
3918 | - cpu_data(m->extcpu).microcode); | ||
3919 | + m->microcode); | ||
3920 | |||
3921 | pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n"); | ||
3922 | } | ||
3923 | diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c | ||
3924 | index 732bb03fcf91..a19fddfb6bf8 100644 | ||
3925 | --- a/arch/x86/kernel/cpu/microcode/amd.c | ||
3926 | +++ b/arch/x86/kernel/cpu/microcode/amd.c | ||
3927 | @@ -707,22 +707,26 @@ int apply_microcode_amd(int cpu) | ||
3928 | return -1; | ||
3929 | |||
3930 | /* need to apply patch? */ | ||
3931 | - if (rev >= mc_amd->hdr.patch_id) { | ||
3932 | - c->microcode = rev; | ||
3933 | - uci->cpu_sig.rev = rev; | ||
3934 | - return 0; | ||
3935 | - } | ||
3936 | + if (rev >= mc_amd->hdr.patch_id) | ||
3937 | + goto out; | ||
3938 | |||
3939 | if (__apply_microcode_amd(mc_amd)) { | ||
3940 | pr_err("CPU%d: update failed for patch_level=0x%08x\n", | ||
3941 | cpu, mc_amd->hdr.patch_id); | ||
3942 | return -1; | ||
3943 | } | ||
3944 | - pr_info("CPU%d: new patch_level=0x%08x\n", cpu, | ||
3945 | - mc_amd->hdr.patch_id); | ||
3946 | |||
3947 | - uci->cpu_sig.rev = mc_amd->hdr.patch_id; | ||
3948 | - c->microcode = mc_amd->hdr.patch_id; | ||
3949 | + rev = mc_amd->hdr.patch_id; | ||
3950 | + | ||
3951 | + pr_info("CPU%d: new patch_level=0x%08x\n", cpu, rev); | ||
3952 | + | ||
3953 | +out: | ||
3954 | + uci->cpu_sig.rev = rev; | ||
3955 | + c->microcode = rev; | ||
3956 | + | ||
3957 | + /* Update boot_cpu_data's revision too, if we're on the BSP: */ | ||
3958 | + if (c->cpu_index == boot_cpu_data.cpu_index) | ||
3959 | + boot_cpu_data.microcode = rev; | ||
3960 | |||
3961 | return 0; | ||
3962 | } | ||
3963 | diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c | ||
3964 | index 79291d6fb301..1308abfc4758 100644 | ||
3965 | --- a/arch/x86/kernel/cpu/microcode/intel.c | ||
3966 | +++ b/arch/x86/kernel/cpu/microcode/intel.c | ||
3967 | @@ -386,15 +386,8 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci) | ||
3968 | native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); | ||
3969 | csig.pf = 1 << ((val[1] >> 18) & 7); | ||
3970 | } | ||
3971 | - native_wrmsrl(MSR_IA32_UCODE_REV, 0); | ||
3972 | |||
3973 | - /* As documented in the SDM: Do a CPUID 1 here */ | ||
3974 | - sync_core(); | ||
3975 | - | ||
3976 | - /* get the current revision from MSR 0x8B */ | ||
3977 | - native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); | ||
3978 | - | ||
3979 | - csig.rev = val[1]; | ||
3980 | + csig.rev = intel_get_microcode_revision(); | ||
3981 | |||
3982 | uci->cpu_sig = csig; | ||
3983 | uci->valid = 1; | ||
3984 | @@ -618,29 +611,35 @@ static inline void print_ucode(struct ucode_cpu_info *uci) | ||
3985 | static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) | ||
3986 | { | ||
3987 | struct microcode_intel *mc; | ||
3988 | - unsigned int val[2]; | ||
3989 | + u32 rev; | ||
3990 | |||
3991 | mc = uci->mc; | ||
3992 | if (!mc) | ||
3993 | return 0; | ||
3994 | |||
3995 | + /* | ||
3996 | + * Save us the MSR write below - which is a particularly expensive | ||
3997 | + * operation - when the other hyperthread has updated the microcode | ||
3998 | + * already. | ||
3999 | + */ | ||
4000 | + rev = intel_get_microcode_revision(); | ||
4001 | + if (rev >= mc->hdr.rev) { | ||
4002 | + uci->cpu_sig.rev = rev; | ||
4003 | + return 0; | ||
4004 | + } | ||
4005 | + | ||
4006 | /* write microcode via MSR 0x79 */ | ||
4007 | native_wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); | ||
4008 | - native_wrmsrl(MSR_IA32_UCODE_REV, 0); | ||
4009 | - | ||
4010 | - /* As documented in the SDM: Do a CPUID 1 here */ | ||
4011 | - sync_core(); | ||
4012 | |||
4013 | - /* get the current revision from MSR 0x8B */ | ||
4014 | - native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); | ||
4015 | - if (val[1] != mc->hdr.rev) | ||
4016 | + rev = intel_get_microcode_revision(); | ||
4017 | + if (rev != mc->hdr.rev) | ||
4018 | return -1; | ||
4019 | |||
4020 | #ifdef CONFIG_X86_64 | ||
4021 | /* Flush global tlb. This is precaution. */ | ||
4022 | flush_tlb_early(); | ||
4023 | #endif | ||
4024 | - uci->cpu_sig.rev = val[1]; | ||
4025 | + uci->cpu_sig.rev = rev; | ||
4026 | |||
4027 | if (early) | ||
4028 | print_ucode(uci); | ||
4029 | @@ -903,9 +902,9 @@ static int apply_microcode_intel(int cpu) | ||
4030 | { | ||
4031 | struct microcode_intel *mc; | ||
4032 | struct ucode_cpu_info *uci; | ||
4033 | - struct cpuinfo_x86 *c; | ||
4034 | - unsigned int val[2]; | ||
4035 | + struct cpuinfo_x86 *c = &cpu_data(cpu); | ||
4036 | static int prev_rev; | ||
4037 | + u32 rev; | ||
4038 | |||
4039 | /* We should bind the task to the CPU */ | ||
4040 | if (WARN_ON(raw_smp_processor_id() != cpu)) | ||
4041 | @@ -924,35 +923,42 @@ static int apply_microcode_intel(int cpu) | ||
4042 | if (!get_matching_mc(mc, cpu)) | ||
4043 | return 0; | ||
4044 | |||
4045 | + /* | ||
4046 | + * Save us the MSR write below - which is a particularly expensive | ||
4047 | + * operation - when the other hyperthread has updated the microcode | ||
4048 | + * already. | ||
4049 | + */ | ||
4050 | + rev = intel_get_microcode_revision(); | ||
4051 | + if (rev >= mc->hdr.rev) | ||
4052 | + goto out; | ||
4053 | + | ||
4054 | /* write microcode via MSR 0x79 */ | ||
4055 | wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); | ||
4056 | - wrmsrl(MSR_IA32_UCODE_REV, 0); | ||
4057 | |||
4058 | - /* As documented in the SDM: Do a CPUID 1 here */ | ||
4059 | - sync_core(); | ||
4060 | + rev = intel_get_microcode_revision(); | ||
4061 | |||
4062 | - /* get the current revision from MSR 0x8B */ | ||
4063 | - rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); | ||
4064 | - | ||
4065 | - if (val[1] != mc->hdr.rev) { | ||
4066 | + if (rev != mc->hdr.rev) { | ||
4067 | pr_err("CPU%d update to revision 0x%x failed\n", | ||
4068 | cpu, mc->hdr.rev); | ||
4069 | return -1; | ||
4070 | } | ||
4071 | |||
4072 | - if (val[1] != prev_rev) { | ||
4073 | + if (rev != prev_rev) { | ||
4074 | pr_info("updated to revision 0x%x, date = %04x-%02x-%02x\n", | ||
4075 | - val[1], | ||
4076 | + rev, | ||
4077 | mc->hdr.date & 0xffff, | ||
4078 | mc->hdr.date >> 24, | ||
4079 | (mc->hdr.date >> 16) & 0xff); | ||
4080 | - prev_rev = val[1]; | ||
4081 | + prev_rev = rev; | ||
4082 | } | ||
4083 | |||
4084 | - c = &cpu_data(cpu); | ||
4085 | +out: | ||
4086 | + uci->cpu_sig.rev = rev; | ||
4087 | + c->microcode = rev; | ||
4088 | |||
4089 | - uci->cpu_sig.rev = val[1]; | ||
4090 | - c->microcode = val[1]; | ||
4091 | + /* Update boot_cpu_data's revision too, if we're on the BSP: */ | ||
4092 | + if (c->cpu_index == boot_cpu_data.cpu_index) | ||
4093 | + boot_cpu_data.microcode = rev; | ||
4094 | |||
4095 | return 0; | ||
4096 | } | ||
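
Several hunks above replace the same open-coded wrmsr/sync_core/rdmsr dance with intel_get_microcode_revision(). The helper itself lives in asm/microcode_intel.h and is not shown in this excerpt; judging from the sequence it replaces, it is presumably shaped like:

    /* Sketch only; mirrors the removed open-coded sequence in this patch. */
    static inline u32 intel_get_microcode_revision(void)
    {
        u32 rev, dummy;

        native_wrmsrl(MSR_IA32_UCODE_REV, 0);

        /* As documented in the SDM: Do a CPUID 1 here */
        sync_core();

        /* get the current revision from MSR 0x8B (high word) */
        native_rdmsr(MSR_IA32_UCODE_REV, dummy, rev);

        return rev;
    }

Centralizing the sequence is what makes the new "skip the expensive MSR write if the sibling thread already updated" early-outs in apply_microcode_early() and apply_microcode_intel() cheap to add.
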
4097 | diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c | ||
4098 | index bfe4d6c96fbd..6b7b35d80264 100644 | ||
4099 | --- a/arch/x86/kernel/nmi.c | ||
4100 | +++ b/arch/x86/kernel/nmi.c | ||
4101 | @@ -32,6 +32,7 @@ | ||
4102 | #include <asm/x86_init.h> | ||
4103 | #include <asm/reboot.h> | ||
4104 | #include <asm/cache.h> | ||
4105 | +#include <asm/nospec-branch.h> | ||
4106 | |||
4107 | #define CREATE_TRACE_POINTS | ||
4108 | #include <trace/events/nmi.h> | ||
4109 | @@ -544,6 +545,9 @@ nmi_restart: | ||
4110 | write_cr2(this_cpu_read(nmi_cr2)); | ||
4111 | if (this_cpu_dec_return(nmi_state)) | ||
4112 | goto nmi_restart; | ||
4113 | + | ||
4114 | + if (user_mode(regs)) | ||
4115 | + mds_user_clear_cpu_buffers(); | ||
4116 | } | ||
4117 | NOKPROBE_SYMBOL(do_nmi); | ||
4118 | |||
4119 | diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c | ||
4120 | index 00a9047539d7..2e4eab22ca37 100644 | ||
4121 | --- a/arch/x86/kernel/process.c | ||
4122 | +++ b/arch/x86/kernel/process.c | ||
4123 | @@ -35,6 +35,8 @@ | ||
4124 | #include <asm/switch_to.h> | ||
4125 | #include <asm/spec-ctrl.h> | ||
4126 | |||
4127 | +#include "process.h" | ||
4128 | + | ||
4129 | /* | ||
4130 | * per-CPU TSS segments. Threads are completely 'soft' on Linux, | ||
4131 | * no more per-task TSS's. The TSS size is kept cacheline-aligned | ||
4132 | @@ -183,11 +185,12 @@ int set_tsc_mode(unsigned int val) | ||
4133 | return 0; | ||
4134 | } | ||
4135 | |||
4136 | -static inline void switch_to_bitmap(struct tss_struct *tss, | ||
4137 | - struct thread_struct *prev, | ||
4138 | +static inline void switch_to_bitmap(struct thread_struct *prev, | ||
4139 | struct thread_struct *next, | ||
4140 | unsigned long tifp, unsigned long tifn) | ||
4141 | { | ||
4142 | + struct tss_struct *tss = this_cpu_ptr(&cpu_tss); | ||
4143 | + | ||
4144 | if (tifn & _TIF_IO_BITMAP) { | ||
4145 | /* | ||
4146 | * Copy the relevant range of the IO bitmap. | ||
4147 | @@ -321,32 +324,85 @@ static __always_inline void amd_set_ssb_virt_state(unsigned long tifn) | ||
4148 | wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, ssbd_tif_to_spec_ctrl(tifn)); | ||
4149 | } | ||
4150 | |||
4151 | -static __always_inline void intel_set_ssb_state(unsigned long tifn) | ||
4152 | +/* | ||
4153 | + * Update the MSRs managing speculation control during a context switch. | ||
4154 | + * | ||
4155 | + * tifp: Previous task's thread flags | ||
4156 | + * tifn: Next task's thread flags | ||
4157 | + */ | ||
4158 | +static __always_inline void __speculation_ctrl_update(unsigned long tifp, | ||
4159 | + unsigned long tifn) | ||
4160 | { | ||
4161 | - u64 msr = x86_spec_ctrl_base | ssbd_tif_to_spec_ctrl(tifn); | ||
4162 | + unsigned long tif_diff = tifp ^ tifn; | ||
4163 | + u64 msr = x86_spec_ctrl_base; | ||
4164 | + bool updmsr = false; | ||
4165 | + | ||
4166 | + /* | ||
4167 | + * If TIF_SSBD is different, select the proper mitigation | ||
4168 | + * method. Note that if SSBD mitigation is disabled or permanently | ||
4169 | + * enabled this branch can't be taken because nothing can set | ||
4170 | + * TIF_SSBD. | ||
4171 | + */ | ||
4172 | + if (tif_diff & _TIF_SSBD) { | ||
4173 | + if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) { | ||
4174 | + amd_set_ssb_virt_state(tifn); | ||
4175 | + } else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) { | ||
4176 | + amd_set_core_ssb_state(tifn); | ||
4177 | + } else if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || | ||
4178 | + static_cpu_has(X86_FEATURE_AMD_SSBD)) { | ||
4179 | + msr |= ssbd_tif_to_spec_ctrl(tifn); | ||
4180 | + updmsr = true; | ||
4181 | + } | ||
4182 | + } | ||
4183 | + | ||
4184 | + /* | ||
4185 | + * Only evaluate TIF_SPEC_IB if conditional STIBP is enabled, | ||
4186 | + * otherwise avoid the MSR write. | ||
4187 | + */ | ||
4188 | + if (IS_ENABLED(CONFIG_SMP) && | ||
4189 | + static_branch_unlikely(&switch_to_cond_stibp)) { | ||
4190 | + updmsr |= !!(tif_diff & _TIF_SPEC_IB); | ||
4191 | + msr |= stibp_tif_to_spec_ctrl(tifn); | ||
4192 | + } | ||
4193 | |||
4194 | - wrmsrl(MSR_IA32_SPEC_CTRL, msr); | ||
4195 | + if (updmsr) | ||
4196 | + wrmsrl(MSR_IA32_SPEC_CTRL, msr); | ||
4197 | } | ||
4198 | |||
4199 | -static __always_inline void __speculative_store_bypass_update(unsigned long tifn) | ||
4200 | +static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk) | ||
4201 | { | ||
4202 | - if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) | ||
4203 | - amd_set_ssb_virt_state(tifn); | ||
4204 | - else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) | ||
4205 | - amd_set_core_ssb_state(tifn); | ||
4206 | - else | ||
4207 | - intel_set_ssb_state(tifn); | ||
4208 | + if (test_and_clear_tsk_thread_flag(tsk, TIF_SPEC_FORCE_UPDATE)) { | ||
4209 | + if (task_spec_ssb_disable(tsk)) | ||
4210 | + set_tsk_thread_flag(tsk, TIF_SSBD); | ||
4211 | + else | ||
4212 | + clear_tsk_thread_flag(tsk, TIF_SSBD); | ||
4213 | + | ||
4214 | + if (task_spec_ib_disable(tsk)) | ||
4215 | + set_tsk_thread_flag(tsk, TIF_SPEC_IB); | ||
4216 | + else | ||
4217 | + clear_tsk_thread_flag(tsk, TIF_SPEC_IB); | ||
4218 | + } | ||
4219 | + /* Return the updated threadinfo flags */ | ||
4220 | + return task_thread_info(tsk)->flags; | ||
4221 | } | ||
4222 | |||
4223 | -void speculative_store_bypass_update(unsigned long tif) | ||
4224 | +void speculation_ctrl_update(unsigned long tif) | ||
4225 | { | ||
4226 | + /* Forced update. Make sure all relevant TIF flags are different */ | ||
4227 | preempt_disable(); | ||
4228 | - __speculative_store_bypass_update(tif); | ||
4229 | + __speculation_ctrl_update(~tif, tif); | ||
4230 | preempt_enable(); | ||
4231 | } | ||
4232 | |||
4233 | -void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, | ||
4234 | - struct tss_struct *tss) | ||
4235 | +/* Called from seccomp/prctl update */ | ||
4236 | +void speculation_ctrl_update_current(void) | ||
4237 | +{ | ||
4238 | + preempt_disable(); | ||
4239 | + speculation_ctrl_update(speculation_ctrl_update_tif(current)); | ||
4240 | + preempt_enable(); | ||
4241 | +} | ||
4242 | + | ||
4243 | +void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) | ||
4244 | { | ||
4245 | struct thread_struct *prev, *next; | ||
4246 | unsigned long tifp, tifn; | ||
4247 | @@ -356,7 +412,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, | ||
4248 | |||
4249 | tifn = READ_ONCE(task_thread_info(next_p)->flags); | ||
4250 | tifp = READ_ONCE(task_thread_info(prev_p)->flags); | ||
4251 | - switch_to_bitmap(tss, prev, next, tifp, tifn); | ||
4252 | + switch_to_bitmap(prev, next, tifp, tifn); | ||
4253 | |||
4254 | propagate_user_return_notify(prev_p, next_p); | ||
4255 | |||
4256 | @@ -374,8 +430,15 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, | ||
4257 | if ((tifp ^ tifn) & _TIF_NOTSC) | ||
4258 | cr4_toggle_bits(X86_CR4_TSD); | ||
4259 | |||
4260 | - if ((tifp ^ tifn) & _TIF_SSBD) | ||
4261 | - __speculative_store_bypass_update(tifn); | ||
4262 | + if (likely(!((tifp | tifn) & _TIF_SPEC_FORCE_UPDATE))) { | ||
4263 | + __speculation_ctrl_update(tifp, tifn); | ||
4264 | + } else { | ||
4265 | + speculation_ctrl_update_tif(prev_p); | ||
4266 | + tifn = speculation_ctrl_update_tif(next_p); | ||
4267 | + | ||
4268 | + /* Enforce MSR update to ensure consistent state */ | ||
4269 | + __speculation_ctrl_update(~tifn, tifn); | ||
4270 | + } | ||
4271 | } | ||
4272 | |||
4273 | /* | ||
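
Everything in __speculation_ctrl_update() keys off tif_diff = tifp ^ tifn, and the forced-update path deliberately passes ~tif as the previous flags so that every bit reads as changed. A tiny standalone illustration (the TIF_* bit positions here are illustrative, not the real kernel values):

    #include <stdio.h>

    #define _TIF_SSBD    (1UL << 5)   /* illustrative bit positions */
    #define _TIF_SPEC_IB (1UL << 9)

    int main(void)
    {
        unsigned long tifp = _TIF_SSBD;     /* previous task: SSBD set    */
        unsigned long tifn = _TIF_SPEC_IB;  /* next task: SPEC_IB set     */
        unsigned long tif_diff = tifp ^ tifn;

        printf("SSBD changed:    %d\n", !!(tif_diff & _TIF_SSBD));    /* 1 */
        printf("SPEC_IB changed: %d\n", !!(tif_diff & _TIF_SPEC_IB)); /* 1 */

        /* Forced path: ~tifn ^ tifn is all ones, so every flag reads as
         * "changed" and the MSR is recomputed purely from tifn. */
        tif_diff = ~tifn ^ tifn;
        printf("forced diff all ones: %d\n", tif_diff == ~0UL);       /* 1 */
        return 0;
    }

That is why speculation_ctrl_update(~tif, tif) in the patch is a correct "recompute everything" call without needing a separate force flag in the update function itself.
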
4274 | diff --git a/arch/x86/kernel/process.h b/arch/x86/kernel/process.h | ||
4275 | new file mode 100644 | ||
4276 | index 000000000000..898e97cf6629 | ||
4277 | --- /dev/null | ||
4278 | +++ b/arch/x86/kernel/process.h | ||
4279 | @@ -0,0 +1,39 @@ | ||
4280 | +// SPDX-License-Identifier: GPL-2.0 | ||
4281 | +// | ||
4282 | +// Code shared between 32 and 64 bit | ||
4283 | + | ||
4284 | +#include <asm/spec-ctrl.h> | ||
4285 | + | ||
4286 | +void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p); | ||
4287 | + | ||
4288 | +/* | ||
4289 | + * This needs to be inline to optimize for the common case where no extra | ||
4290 | + * work needs to be done. | ||
4291 | + */ | ||
4292 | +static inline void switch_to_extra(struct task_struct *prev, | ||
4293 | + struct task_struct *next) | ||
4294 | +{ | ||
4295 | + unsigned long next_tif = task_thread_info(next)->flags; | ||
4296 | + unsigned long prev_tif = task_thread_info(prev)->flags; | ||
4297 | + | ||
4298 | + if (IS_ENABLED(CONFIG_SMP)) { | ||
4299 | + /* | ||
4300 | + * Avoid __switch_to_xtra() invocation when conditional | ||
4301 | + * STIBP is disabled and the only different bit is | ||
4302 | + * TIF_SPEC_IB. For CONFIG_SMP=n TIF_SPEC_IB is not | ||
4303 | + * in the TIF_WORK_CTXSW masks. | ||
4304 | + */ | ||
4305 | + if (!static_branch_likely(&switch_to_cond_stibp)) { | ||
4306 | + prev_tif &= ~_TIF_SPEC_IB; | ||
4307 | + next_tif &= ~_TIF_SPEC_IB; | ||
4308 | + } | ||
4309 | + } | ||
4310 | + | ||
4311 | + /* | ||
4312 | + * __switch_to_xtra() handles debug registers, i/o bitmaps, | ||
4313 | + * speculation mitigations etc. | ||
4314 | + */ | ||
4315 | + if (unlikely(next_tif & _TIF_WORK_CTXSW_NEXT || | ||
4316 | + prev_tif & _TIF_WORK_CTXSW_PREV)) | ||
4317 | + __switch_to_xtra(prev, next); | ||
4318 | +} | ||
4319 | diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c | ||
4320 | index bd7be8efdc4c..912246fd6cd9 100644 | ||
4321 | --- a/arch/x86/kernel/process_32.c | ||
4322 | +++ b/arch/x86/kernel/process_32.c | ||
4323 | @@ -55,6 +55,8 @@ | ||
4324 | #include <asm/switch_to.h> | ||
4325 | #include <asm/vm86.h> | ||
4326 | |||
4327 | +#include "process.h" | ||
4328 | + | ||
4329 | void __show_regs(struct pt_regs *regs, int all) | ||
4330 | { | ||
4331 | unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; | ||
4332 | @@ -264,12 +266,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | ||
4333 | if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl)) | ||
4334 | set_iopl_mask(next->iopl); | ||
4335 | |||
4336 | - /* | ||
4337 | - * Now maybe handle debug registers and/or IO bitmaps | ||
4338 | - */ | ||
4339 | - if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV || | ||
4340 | - task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT)) | ||
4341 | - __switch_to_xtra(prev_p, next_p, tss); | ||
4342 | + switch_to_extra(prev_p, next_p); | ||
4343 | |||
4344 | /* | ||
4345 | * Leave lazy mode, flushing any hypercalls made here. | ||
4346 | diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c | ||
4347 | index a2661814bde0..81eec65fe053 100644 | ||
4348 | --- a/arch/x86/kernel/process_64.c | ||
4349 | +++ b/arch/x86/kernel/process_64.c | ||
4350 | @@ -51,6 +51,8 @@ | ||
4351 | #include <asm/xen/hypervisor.h> | ||
4352 | #include <asm/vdso.h> | ||
4353 | |||
4354 | +#include "process.h" | ||
4355 | + | ||
4356 | __visible DEFINE_PER_CPU(unsigned long, rsp_scratch); | ||
4357 | |||
4358 | /* Prints also some state that isn't saved in the pt_regs */ | ||
4359 | @@ -454,12 +456,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | ||
4360 | /* Reload esp0 and ss1. This changes current_thread_info(). */ | ||
4361 | load_sp0(tss, next); | ||
4362 | |||
4363 | - /* | ||
4364 | - * Now maybe reload the debug registers and handle I/O bitmaps | ||
4365 | - */ | ||
4366 | - if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT || | ||
4367 | - task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) | ||
4368 | - __switch_to_xtra(prev_p, next_p, tss); | ||
4369 | + switch_to_extra(prev_p, next_p); | ||
4370 | |||
4371 | #ifdef CONFIG_XEN | ||
4372 | /* | ||
4373 | diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c | ||
4374 | index 5bbfa2f63b8c..ef225fa8e928 100644 | ||
4375 | --- a/arch/x86/kernel/traps.c | ||
4376 | +++ b/arch/x86/kernel/traps.c | ||
4377 | @@ -62,6 +62,7 @@ | ||
4378 | #include <asm/alternative.h> | ||
4379 | #include <asm/fpu/xstate.h> | ||
4380 | #include <asm/trace/mpx.h> | ||
4381 | +#include <asm/nospec-branch.h> | ||
4382 | #include <asm/mpx.h> | ||
4383 | #include <asm/vm86.h> | ||
4384 | |||
4385 | @@ -340,6 +341,13 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) | ||
4386 | regs->ip = (unsigned long)general_protection; | ||
4387 | regs->sp = (unsigned long)&normal_regs->orig_ax; | ||
4388 | |||
4389 | + /* | ||
4390 | + * This situation can be triggered by userspace via | ||
4391 | + * modify_ldt(2) and the return does not take the regular | ||
4392 | + * user space exit, so a CPU buffer clear is required when | ||
4393 | + * MDS mitigation is enabled. | ||
4394 | + */ | ||
4395 | + mds_user_clear_cpu_buffers(); | ||
4396 | return; | ||
4397 | } | ||
4398 | #endif | ||
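
mds_user_clear_cpu_buffers(), called here and in the NMI return path above, is defined in asm/nospec-branch.h, which is not part of this excerpt. Upstream it is roughly the following (presented as a reference sketch, not as a hunk of this patch):

    /* Rough upstream shape; see asm/nospec-branch.h for the real thing. */
    static inline void mds_clear_cpu_buffers(void)
    {
        static const u16 ds = __KERNEL_DS;

        /*
         * Only the memory-operand form of VERW is documented to flush the
         * CPU buffers; "cc" is clobbered because VERW modifies ZF.
         */
        asm volatile("verw %[ds]" : : [ds] "m" (ds) : "cc");
    }

    static inline void mds_user_clear_cpu_buffers(void)
    {
        if (static_branch_likely(&mds_user_clear))
            mds_clear_cpu_buffers();
    }

The static key keeps the call a NOP unless the MDS mitigation selected it, which is why sprinkling it on odd exit paths like the double-fault ESPFIX case above is cheap.
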
4399 | diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c | ||
4400 | index 769c370011d6..cb768417429d 100644 | ||
4401 | --- a/arch/x86/kernel/tsc.c | ||
4402 | +++ b/arch/x86/kernel/tsc.c | ||
4403 | @@ -713,7 +713,7 @@ unsigned long native_calibrate_tsc(void) | ||
4404 | case INTEL_FAM6_KABYLAKE_DESKTOP: | ||
4405 | crystal_khz = 24000; /* 24.0 MHz */ | ||
4406 | break; | ||
4407 | - case INTEL_FAM6_ATOM_DENVERTON: | ||
4408 | + case INTEL_FAM6_ATOM_GOLDMONT_X: | ||
4409 | crystal_khz = 25000; /* 25.0 MHz */ | ||
4410 | break; | ||
4411 | case INTEL_FAM6_ATOM_GOLDMONT: | ||
4412 | diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c | ||
4413 | index c17d3893ae60..fc8236fd2495 100644 | ||
4414 | --- a/arch/x86/kvm/cpuid.c | ||
4415 | +++ b/arch/x86/kvm/cpuid.c | ||
4416 | @@ -355,7 +355,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | ||
4417 | |||
4418 | /* cpuid 0x80000008.ebx */ | ||
4419 | const u32 kvm_cpuid_8000_0008_ebx_x86_features = | ||
4420 | - F(AMD_IBPB) | F(AMD_IBRS) | F(VIRT_SSBD); | ||
4421 | + F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | | ||
4422 | + F(AMD_SSB_NO) | F(AMD_STIBP); | ||
4423 | |||
4424 | /* cpuid 0xC0000001.edx */ | ||
4425 | const u32 kvm_cpuid_C000_0001_edx_x86_features = | ||
4426 | @@ -380,7 +381,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | ||
4427 | |||
4428 | /* cpuid 7.0.edx*/ | ||
4429 | const u32 kvm_cpuid_7_0_edx_x86_features = | ||
4430 | - F(SPEC_CTRL) | F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES); | ||
4431 | + F(SPEC_CTRL) | F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | | ||
4432 | + F(INTEL_STIBP) | F(MD_CLEAR); | ||
4433 | |||
4434 | /* all calls to cpuid_count() should be made on the same cpu */ | ||
4435 | get_cpu(); | ||
4436 | @@ -633,7 +635,12 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | ||
4437 | entry->ebx |= F(VIRT_SSBD); | ||
4438 | entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features; | ||
4439 | cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX); | ||
4440 | - if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD)) | ||
4441 | + /* | ||
4442 | + * The preference is to use SPEC CTRL MSR instead of the | ||
4443 | + * VIRT_SPEC MSR. | ||
4444 | + */ | ||
4445 | + if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) && | ||
4446 | + !boot_cpu_has(X86_FEATURE_AMD_SSBD)) | ||
4447 | entry->ebx |= F(VIRT_SSBD); | ||
4448 | break; | ||
4449 | } | ||
4450 | diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h | ||
4451 | index 8a841b9d8f84..b2bf8e1d5782 100644 | ||
4452 | --- a/arch/x86/kvm/cpuid.h | ||
4453 | +++ b/arch/x86/kvm/cpuid.h | ||
4454 | @@ -176,7 +176,7 @@ static inline bool guest_cpuid_has_spec_ctrl(struct kvm_vcpu *vcpu) | ||
4455 | struct kvm_cpuid_entry2 *best; | ||
4456 | |||
4457 | best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); | ||
4458 | - if (best && (best->ebx & bit(X86_FEATURE_AMD_IBRS))) | ||
4459 | + if (best && (best->ebx & (bit(X86_FEATURE_AMD_IBRS) | bit(X86_FEATURE_AMD_SSBD)))) | ||
4460 | return true; | ||
4461 | best = kvm_find_cpuid_entry(vcpu, 7, 0); | ||
4462 | return best && (best->edx & (bit(X86_FEATURE_SPEC_CTRL) | bit(X86_FEATURE_SPEC_CTRL_SSBD))); | ||
4463 | diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c | ||
4464 | index 9a6d258c3c16..9338136a6a23 100644 | ||
4465 | --- a/arch/x86/kvm/svm.c | ||
4466 | +++ b/arch/x86/kvm/svm.c | ||
4467 | @@ -3704,7 +3704,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) | ||
4468 | return 1; | ||
4469 | |||
4470 | /* The STIBP bit doesn't fault even if it's not advertised */ | ||
4471 | - if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP)) | ||
4472 | + if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD)) | ||
4473 | return 1; | ||
4474 | |||
4475 | svm->spec_ctrl = data; | ||
4476 | diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c | ||
4477 | index 75466d9417b8..8feb4f7e2e59 100644 | ||
4478 | --- a/arch/x86/kvm/vmx.c | ||
4479 | +++ b/arch/x86/kvm/vmx.c | ||
4480 | @@ -9206,8 +9206,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) | ||
4481 | |||
4482 | vmx->__launched = vmx->loaded_vmcs->launched; | ||
4483 | |||
4484 | + /* L1D Flush includes CPU buffer clear to mitigate MDS */ | ||
4485 | if (static_branch_unlikely(&vmx_l1d_should_flush)) | ||
4486 | vmx_l1d_flush(vcpu); | ||
4487 | + else if (static_branch_unlikely(&mds_user_clear)) | ||
4488 | + mds_clear_cpu_buffers(); | ||
4489 | |||
4490 | asm( | ||
4491 | /* Store host registers */ | ||
4492 | @@ -9566,8 +9569,8 @@ free_vcpu: | ||
4493 | return ERR_PTR(err); | ||
4494 | } | ||
4495 | |||
4496 | -#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n" | ||
4497 | -#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n" | ||
4498 | +#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" | ||
4499 | +#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" | ||
4500 | |||
4501 | static int vmx_vm_init(struct kvm *kvm) | ||
4502 | { | ||
4503 | diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c | ||
4504 | index 90801a8f19c9..ce092a62fc5d 100644 | ||
4505 | --- a/arch/x86/mm/init.c | ||
4506 | +++ b/arch/x86/mm/init.c | ||
4507 | @@ -790,7 +790,7 @@ unsigned long max_swapfile_size(void) | ||
4508 | |||
4509 | pages = generic_max_swapfile_size(); | ||
4510 | |||
4511 | - if (boot_cpu_has_bug(X86_BUG_L1TF)) { | ||
4512 | + if (boot_cpu_has_bug(X86_BUG_L1TF) && l1tf_mitigation != L1TF_MITIGATION_OFF) { | ||
4513 | /* Limit the swap file size to MAX_PA/2 for L1TF workaround */ | ||
4514 | unsigned long long l1tf_limit = l1tf_pfn_limit(); | ||
4515 | /* | ||
4516 | diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c | ||
4517 | index 3f729e20f0e3..12522dbae615 100644 | ||
4518 | --- a/arch/x86/mm/kaiser.c | ||
4519 | +++ b/arch/x86/mm/kaiser.c | ||
4520 | @@ -9,6 +9,7 @@ | ||
4521 | #include <linux/spinlock.h> | ||
4522 | #include <linux/mm.h> | ||
4523 | #include <linux/uaccess.h> | ||
4524 | +#include <linux/cpu.h> | ||
4525 | |||
4526 | #undef pr_fmt | ||
4527 | #define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt | ||
4528 | @@ -297,7 +298,8 @@ void __init kaiser_check_boottime_disable(void) | ||
4529 | goto skip; | ||
4530 | } | ||
4531 | |||
4532 | - if (cmdline_find_option_bool(boot_command_line, "nopti")) | ||
4533 | + if (cmdline_find_option_bool(boot_command_line, "nopti") || | ||
4534 | + cpu_mitigations_off()) | ||
4535 | goto disable; | ||
4536 | |||
4537 | skip: | ||
4538 | diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c | ||
4539 | index e30baa8ad94f..dff8ac2d255c 100644 | ||
4540 | --- a/arch/x86/mm/pgtable.c | ||
4541 | +++ b/arch/x86/mm/pgtable.c | ||
4542 | @@ -251,7 +251,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) | ||
4543 | if (pgd_val(pgd) != 0) { | ||
4544 | pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); | ||
4545 | |||
4546 | - pgdp[i] = native_make_pgd(0); | ||
4547 | + pgd_clear(&pgdp[i]); | ||
4548 | |||
4549 | paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); | ||
4550 | pmd_free(mm, pmd); | ||
4551 | @@ -419,7 +419,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, | ||
4552 | int changed = !pte_same(*ptep, entry); | ||
4553 | |||
4554 | if (changed && dirty) { | ||
4555 | - *ptep = entry; | ||
4556 | + set_pte(ptep, entry); | ||
4557 | pte_update(vma->vm_mm, address, ptep); | ||
4558 | } | ||
4559 | |||
4560 | @@ -436,7 +436,7 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, | ||
4561 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
4562 | |||
4563 | if (changed && dirty) { | ||
4564 | - *pmdp = entry; | ||
4565 | + set_pmd(pmdp, entry); | ||
4566 | /* | ||
4567 | * We had a write-protection fault here and changed the pmd | ||
4568 | * to be more permissive. No need to flush the TLB for that, | ||
4569 | diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c | ||
4570 | index eac92e2d171b..a112bb175dd4 100644 | ||
4571 | --- a/arch/x86/mm/tlb.c | ||
4572 | +++ b/arch/x86/mm/tlb.c | ||
4573 | @@ -30,6 +30,12 @@ | ||
4574 | * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi | ||
4575 | */ | ||
4576 | |||
4577 | +/* | ||
4578 | + * Use bit 0 to mangle the TIF_SPEC_IB state into the mm pointer which is | ||
4579 | + * stored in cpu_tlb_state.last_user_mm_ibpb. | ||
4580 | + */ | ||
4581 | +#define LAST_USER_MM_IBPB 0x1UL | ||
4582 | + | ||
4583 | atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1); | ||
4584 | |||
4585 | struct flush_tlb_info { | ||
4586 | @@ -101,33 +107,101 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next, | ||
4587 | local_irq_restore(flags); | ||
4588 | } | ||
4589 | |||
4590 | +static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next) | ||
4591 | +{ | ||
4592 | + unsigned long next_tif = task_thread_info(next)->flags; | ||
4593 | + unsigned long ibpb = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_IBPB; | ||
4594 | + | ||
4595 | + return (unsigned long)next->mm | ibpb; | ||
4596 | +} | ||
4597 | + | ||
4598 | +static void cond_ibpb(struct task_struct *next) | ||
4599 | +{ | ||
4600 | + if (!next || !next->mm) | ||
4601 | + return; | ||
4602 | + | ||
4603 | + /* | ||
4604 | + * Both the conditional and the always-on IBPB mode use the mm | ||
4605 | + * pointer to avoid the IBPB when switching between tasks of the | ||
4606 | + * same process. Using the mm pointer instead of mm->context.ctx_id | ||
4607 | + * opens a hypothetical hole vs. mm_struct reuse, which is more or | ||
4608 | + * less impossible to control by an attacker. Aside from that, it | ||
4609 | + * would only affect the first schedule so the theoretically | ||
4610 | + * exposed data is not really interesting. | ||
4611 | + */ | ||
4612 | + if (static_branch_likely(&switch_mm_cond_ibpb)) { | ||
4613 | + unsigned long prev_mm, next_mm; | ||
4614 | + | ||
4615 | + /* | ||
4616 | + * This is a bit more complex than the always mode because | ||
4617 | + * it has to handle two cases: | ||
4618 | + * | ||
4619 | + * 1) Switch from a user space task (potential attacker) | ||
4620 | + * which has TIF_SPEC_IB set to a user space task | ||
4621 | + * (potential victim) which has TIF_SPEC_IB not set. | ||
4622 | + * | ||
4623 | + * 2) Switch from a user space task (potential attacker) | ||
4624 | + * which has TIF_SPEC_IB not set to a user space task | ||
4625 | + * (potential victim) which has TIF_SPEC_IB set. | ||
4626 | + * | ||
4627 | + * This could be done by unconditionally issuing IBPB when | ||
4628 | + * a task which has TIF_SPEC_IB set is either scheduled in | ||
4629 | + * or out. Though that results in two flushes when: | ||
4630 | + * | ||
4631 | + * - the same user space task is scheduled out and later | ||
4632 | + * scheduled in again and only a kernel thread ran in | ||
4633 | + * between. | ||
4634 | + * | ||
4635 | + * - a user space task belonging to the same process is | ||
4636 | + * scheduled in after a kernel thread ran in between | ||
4637 | + * | ||
4638 | + * - a user space task belonging to the same process is | ||
4639 | + * scheduled in immediately. | ||
4640 | + * | ||
4641 | + * Optimize this with reasonably small overhead for the | ||
4642 | + * above cases. Mangle the TIF_SPEC_IB bit into the mm | ||
4643 | + * pointer of the incoming task which is stored in | ||
4644 | + * cpu_tlbstate.last_user_mm_ibpb for comparison. | ||
4645 | + */ | ||
4646 | + next_mm = mm_mangle_tif_spec_ib(next); | ||
4647 | + prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_ibpb); | ||
4648 | + | ||
4649 | + /* | ||
4650 | + * Issue IBPB only if the mm's are different and one or | ||
4651 | + * both have the IBPB bit set. | ||
4652 | + */ | ||
4653 | + if (next_mm != prev_mm && | ||
4654 | + (next_mm | prev_mm) & LAST_USER_MM_IBPB) | ||
4655 | + indirect_branch_prediction_barrier(); | ||
4656 | + | ||
4657 | + this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, next_mm); | ||
4658 | + } | ||
4659 | + | ||
4660 | + if (static_branch_unlikely(&switch_mm_always_ibpb)) { | ||
4661 | + /* | ||
4662 | + * Only flush when switching to a user space task with a | ||
4663 | + * different context than the user space task which ran | ||
4664 | + * last on this CPU. | ||
4665 | + */ | ||
4666 | + if (this_cpu_read(cpu_tlbstate.last_user_mm) != next->mm) { | ||
4667 | + indirect_branch_prediction_barrier(); | ||
4668 | + this_cpu_write(cpu_tlbstate.last_user_mm, next->mm); | ||
4669 | + } | ||
4670 | + } | ||
4671 | +} | ||
4672 | + | ||
4673 | void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, | ||
4674 | struct task_struct *tsk) | ||
4675 | { | ||
4676 | unsigned cpu = smp_processor_id(); | ||
4677 | |||
4678 | if (likely(prev != next)) { | ||
4679 | - u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id); | ||
4680 | - | ||
4681 | /* | ||
4682 | * Avoid user/user BTB poisoning by flushing the branch | ||
4683 | * predictor when switching between processes. This stops | ||
4684 | * one process from doing Spectre-v2 attacks on another. | ||
4685 | - * | ||
4686 | - * As an optimization, flush indirect branches only when | ||
4687 | - * switching into processes that disable dumping. This | ||
4688 | - * protects high value processes like gpg, without having | ||
4689 | - * too high performance overhead. IBPB is *expensive*! | ||
4690 | - * | ||
4691 | - * This will not flush branches when switching into kernel | ||
4692 | - * threads. It will also not flush if we switch to idle | ||
4693 | - * thread and back to the same process. It will flush if we | ||
4694 | - * switch to a different non-dumpable process. | ||
4695 | */ | ||
4696 | - if (tsk && tsk->mm && | ||
4697 | - tsk->mm->context.ctx_id != last_ctx_id && | ||
4698 | - get_dumpable(tsk->mm) != SUID_DUMP_USER) | ||
4699 | - indirect_branch_prediction_barrier(); | ||
4700 | + cond_ibpb(tsk); | ||
4701 | |||
4702 | if (IS_ENABLED(CONFIG_VMAP_STACK)) { | ||
4703 | /* | ||
4704 | @@ -143,14 +217,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, | ||
4705 | set_pgd(pgd, init_mm.pgd[stack_pgd_index]); | ||
4706 | } | ||
4707 | |||
4708 | - /* | ||
4709 | - * Record last user mm's context id, so we can avoid | ||
4710 | - * flushing branch buffer with IBPB if we switch back | ||
4711 | - * to the same user. | ||
4712 | - */ | ||
4713 | - if (next != &init_mm) | ||
4714 | - this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id); | ||
4715 | - | ||
4716 | this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); | ||
4717 | this_cpu_write(cpu_tlbstate.active_mm, next); | ||
4718 | |||
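
The LAST_USER_MM_IBPB trick above relies on mm_struct pointers being at least word-aligned, which leaves bit 0 free to carry the incoming task's TIF_SPEC_IB state alongside the pointer. A standalone sketch of the mangle-and-compare logic (struct and allocator are stand-ins; the kernel uses the real mm pointer and a per-CPU slot):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define LAST_USER_MM_IBPB 0x1UL

    struct mm { int dummy; };          /* stand-in for struct mm_struct */

    /* Fold the task's TIF_SPEC_IB state into bit 0 of its mm pointer;
     * malloc(), like the kernel allocator, returns at least word-aligned
     * memory, so the bit is guaranteed to be free. */
    static unsigned long mangle(const struct mm *mm, int spec_ib)
    {
        assert(((uintptr_t)mm & LAST_USER_MM_IBPB) == 0);
        return (unsigned long)(uintptr_t)mm | (spec_ib ? LAST_USER_MM_IBPB : 0);
    }

    int main(void)
    {
        struct mm *a = malloc(sizeof(*a)), *b = malloc(sizeof(*b));
        unsigned long prev, next;

        /* Attacker (flag set) -> victim (flag clear): barrier needed. */
        prev = mangle(a, 1);
        next = mangle(b, 0);
        printf("%d\n", prev != next &&
                       !!((prev | next) & LAST_USER_MM_IBPB));  /* 1 */

        /* Same process rescheduled with unchanged flag: no IBPB. */
        next = mangle(a, 1);
        printf("%d\n", prev != next &&
                       !!((prev | next) & LAST_USER_MM_IBPB));  /* 0 */

        free(a); free(b);
        return 0;
    }

A single comparison of the mangled values thus covers both "different process" and "same process but flag flipped", which is exactly the set of cases the comment block in cond_ibpb() enumerates.
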
4719 | diff --git a/arch/x86/platform/atom/punit_atom_debug.c b/arch/x86/platform/atom/punit_atom_debug.c | ||
4720 | index d49d3be81953..ecb5866aaf84 100644 | ||
4721 | --- a/arch/x86/platform/atom/punit_atom_debug.c | ||
4722 | +++ b/arch/x86/platform/atom/punit_atom_debug.c | ||
4723 | @@ -154,8 +154,8 @@ static void punit_dbgfs_unregister(void) | ||
4724 | (kernel_ulong_t)&drv_data } | ||
4725 | |||
4726 | static const struct x86_cpu_id intel_punit_cpu_ids[] = { | ||
4727 | - ICPU(INTEL_FAM6_ATOM_SILVERMONT1, punit_device_byt), | ||
4728 | - ICPU(INTEL_FAM6_ATOM_MERRIFIELD, punit_device_tng), | ||
4729 | + ICPU(INTEL_FAM6_ATOM_SILVERMONT, punit_device_byt), | ||
4730 | + ICPU(INTEL_FAM6_ATOM_SILVERMONT_MID, punit_device_tng), | ||
4731 | ICPU(INTEL_FAM6_ATOM_AIRMONT, punit_device_cht), | ||
4732 | {} | ||
4733 | }; | ||
4734 | diff --git a/drivers/acpi/acpi_lpss.c b/drivers/acpi/acpi_lpss.c | ||
4735 | index 957d3fa3b543..8e38249311bd 100644 | ||
4736 | --- a/drivers/acpi/acpi_lpss.c | ||
4737 | +++ b/drivers/acpi/acpi_lpss.c | ||
4738 | @@ -243,7 +243,7 @@ static const struct lpss_device_desc bsw_spi_dev_desc = { | ||
4739 | #define ICPU(model) { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, } | ||
4740 | |||
4741 | static const struct x86_cpu_id lpss_cpu_ids[] = { | ||
4742 | - ICPU(INTEL_FAM6_ATOM_SILVERMONT1), /* Valleyview, Bay Trail */ | ||
4743 | + ICPU(INTEL_FAM6_ATOM_SILVERMONT), /* Valleyview, Bay Trail */ | ||
4744 | ICPU(INTEL_FAM6_ATOM_AIRMONT), /* Braswell, Cherry Trail */ | ||
4745 | {} | ||
4746 | }; | ||
4747 | diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c | ||
4748 | index f1f4ce7ddb47..3b123735a1c4 100644 | ||
4749 | --- a/drivers/base/cpu.c | ||
4750 | +++ b/drivers/base/cpu.c | ||
4751 | @@ -531,11 +531,18 @@ ssize_t __weak cpu_show_l1tf(struct device *dev, | ||
4752 | return sprintf(buf, "Not affected\n"); | ||
4753 | } | ||
4754 | |||
4755 | +ssize_t __weak cpu_show_mds(struct device *dev, | ||
4756 | + struct device_attribute *attr, char *buf) | ||
4757 | +{ | ||
4758 | + return sprintf(buf, "Not affected\n"); | ||
4759 | +} | ||
4760 | + | ||
4761 | static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL); | ||
4762 | static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL); | ||
4763 | static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL); | ||
4764 | static DEVICE_ATTR(spec_store_bypass, 0444, cpu_show_spec_store_bypass, NULL); | ||
4765 | static DEVICE_ATTR(l1tf, 0444, cpu_show_l1tf, NULL); | ||
4766 | +static DEVICE_ATTR(mds, 0444, cpu_show_mds, NULL); | ||
4767 | |||
4768 | static struct attribute *cpu_root_vulnerabilities_attrs[] = { | ||
4769 | &dev_attr_meltdown.attr, | ||
4770 | @@ -543,6 +550,7 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = { | ||
4771 | &dev_attr_spectre_v2.attr, | ||
4772 | &dev_attr_spec_store_bypass.attr, | ||
4773 | &dev_attr_l1tf.attr, | ||
4774 | + &dev_attr_mds.attr, | ||
4775 | NULL | ||
4776 | }; | ||
4777 | |||
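
With the weak cpu_show_mds() fallback and the new device attribute in place, every kernel exposes /sys/devices/system/cpu/vulnerabilities/mds: architectures without a real implementation report "Not affected", and x86 routes it through cpu_show_common() above. A trivial consumer sketch:

    #include <stdio.h>

    /* Read the sysfs vulnerability file added by this patch. On kernels
     * without the fix the open() simply fails. */
    int main(void)
    {
        char line[256];
        FILE *f = fopen("/sys/devices/system/cpu/vulnerabilities/mds", "r");

        if (!f) {
            perror("mds sysfs file (kernel too old?)");
            return 1;
        }
        if (fgets(line, sizeof(line), f))
            /* e.g. "Mitigation: Clear CPU buffers; SMT vulnerable" */
            printf("mds: %s", line);
        fclose(f);
        return 0;
    }
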
4778 | diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c | ||
4779 | index f690085b1ad9..4fe999687415 100644 | ||
4780 | --- a/drivers/cpufreq/intel_pstate.c | ||
4781 | +++ b/drivers/cpufreq/intel_pstate.c | ||
4782 | @@ -1413,7 +1413,7 @@ static void intel_pstate_update_util(struct update_util_data *data, u64 time, | ||
4783 | static const struct x86_cpu_id intel_pstate_cpu_ids[] = { | ||
4784 | ICPU(INTEL_FAM6_SANDYBRIDGE, core_params), | ||
4785 | ICPU(INTEL_FAM6_SANDYBRIDGE_X, core_params), | ||
4786 | - ICPU(INTEL_FAM6_ATOM_SILVERMONT1, silvermont_params), | ||
4787 | + ICPU(INTEL_FAM6_ATOM_SILVERMONT, silvermont_params), | ||
4788 | ICPU(INTEL_FAM6_IVYBRIDGE, core_params), | ||
4789 | ICPU(INTEL_FAM6_HASWELL_CORE, core_params), | ||
4790 | ICPU(INTEL_FAM6_BROADWELL_CORE, core_params), | ||
4791 | diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c | ||
4792 | index 5ded9b22b015..a6fa32c7e068 100644 | ||
4793 | --- a/drivers/idle/intel_idle.c | ||
4794 | +++ b/drivers/idle/intel_idle.c | ||
4795 | @@ -1107,14 +1107,14 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { | ||
4796 | ICPU(INTEL_FAM6_WESTMERE, idle_cpu_nehalem), | ||
4797 | ICPU(INTEL_FAM6_WESTMERE_EP, idle_cpu_nehalem), | ||
4798 | ICPU(INTEL_FAM6_NEHALEM_EX, idle_cpu_nehalem), | ||
4799 | - ICPU(INTEL_FAM6_ATOM_PINEVIEW, idle_cpu_atom), | ||
4800 | - ICPU(INTEL_FAM6_ATOM_LINCROFT, idle_cpu_lincroft), | ||
4801 | + ICPU(INTEL_FAM6_ATOM_BONNELL, idle_cpu_atom), | ||
4802 | + ICPU(INTEL_FAM6_ATOM_BONNELL_MID, idle_cpu_lincroft), | ||
4803 | ICPU(INTEL_FAM6_WESTMERE_EX, idle_cpu_nehalem), | ||
4804 | ICPU(INTEL_FAM6_SANDYBRIDGE, idle_cpu_snb), | ||
4805 | ICPU(INTEL_FAM6_SANDYBRIDGE_X, idle_cpu_snb), | ||
4806 | - ICPU(INTEL_FAM6_ATOM_CEDARVIEW, idle_cpu_atom), | ||
4807 | - ICPU(INTEL_FAM6_ATOM_SILVERMONT1, idle_cpu_byt), | ||
4808 | - ICPU(INTEL_FAM6_ATOM_MERRIFIELD, idle_cpu_tangier), | ||
4809 | + ICPU(INTEL_FAM6_ATOM_SALTWELL, idle_cpu_atom), | ||
4810 | + ICPU(INTEL_FAM6_ATOM_SILVERMONT, idle_cpu_byt), | ||
4811 | + ICPU(INTEL_FAM6_ATOM_SILVERMONT_MID, idle_cpu_tangier), | ||
4812 | ICPU(INTEL_FAM6_ATOM_AIRMONT, idle_cpu_cht), | ||
4813 | ICPU(INTEL_FAM6_IVYBRIDGE, idle_cpu_ivb), | ||
4814 | ICPU(INTEL_FAM6_IVYBRIDGE_X, idle_cpu_ivt), | ||
4815 | @@ -1122,7 +1122,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { | ||
4816 | ICPU(INTEL_FAM6_HASWELL_X, idle_cpu_hsw), | ||
4817 | ICPU(INTEL_FAM6_HASWELL_ULT, idle_cpu_hsw), | ||
4818 | ICPU(INTEL_FAM6_HASWELL_GT3E, idle_cpu_hsw), | ||
4819 | - ICPU(INTEL_FAM6_ATOM_SILVERMONT2, idle_cpu_avn), | ||
4820 | + ICPU(INTEL_FAM6_ATOM_SILVERMONT_X, idle_cpu_avn), | ||
4821 | ICPU(INTEL_FAM6_BROADWELL_CORE, idle_cpu_bdw), | ||
4822 | ICPU(INTEL_FAM6_BROADWELL_GT3E, idle_cpu_bdw), | ||
4823 | ICPU(INTEL_FAM6_BROADWELL_X, idle_cpu_bdw), | ||
4824 | @@ -1134,7 +1134,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { | ||
4825 | ICPU(INTEL_FAM6_SKYLAKE_X, idle_cpu_skx), | ||
4826 | ICPU(INTEL_FAM6_XEON_PHI_KNL, idle_cpu_knl), | ||
4827 | ICPU(INTEL_FAM6_ATOM_GOLDMONT, idle_cpu_bxt), | ||
4828 | - ICPU(INTEL_FAM6_ATOM_DENVERTON, idle_cpu_dnv), | ||
4829 | + ICPU(INTEL_FAM6_ATOM_GOLDMONT_X, idle_cpu_dnv), | ||
4830 | {} | ||
4831 | }; | ||
4832 | |||
4833 | diff --git a/drivers/mmc/host/sdhci-acpi.c b/drivers/mmc/host/sdhci-acpi.c | ||
4834 | index 80918abfc468..4398398c0935 100644 | ||
4835 | --- a/drivers/mmc/host/sdhci-acpi.c | ||
4836 | +++ b/drivers/mmc/host/sdhci-acpi.c | ||
4837 | @@ -127,7 +127,7 @@ static const struct sdhci_acpi_chip sdhci_acpi_chip_int = { | ||
4838 | static bool sdhci_acpi_byt(void) | ||
4839 | { | ||
4840 | static const struct x86_cpu_id byt[] = { | ||
4841 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 }, | ||
4842 | + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT }, | ||
4843 | {} | ||
4844 | }; | ||
4845 | |||
4846 | diff --git a/drivers/pci/pci-mid.c b/drivers/pci/pci-mid.c | ||
4847 | index c7f3408e3148..54b3f9bc5ad8 100644 | ||
4848 | --- a/drivers/pci/pci-mid.c | ||
4849 | +++ b/drivers/pci/pci-mid.c | ||
4850 | @@ -71,8 +71,8 @@ static struct pci_platform_pm_ops mid_pci_platform_pm = { | ||
4851 | * arch/x86/platform/intel-mid/pwr.c. | ||
4852 | */ | ||
4853 | static const struct x86_cpu_id lpss_cpu_ids[] = { | ||
4854 | - ICPU(INTEL_FAM6_ATOM_PENWELL), | ||
4855 | - ICPU(INTEL_FAM6_ATOM_MERRIFIELD), | ||
4856 | + ICPU(INTEL_FAM6_ATOM_SALTWELL_MID), | ||
4857 | + ICPU(INTEL_FAM6_ATOM_SILVERMONT_MID), | ||
4858 | {} | ||
4859 | }; | ||
4860 | |||
4861 | diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c | ||
4862 | index 3c71f608b444..8809c1a20bed 100644 | ||
4863 | --- a/drivers/powercap/intel_rapl.c | ||
4864 | +++ b/drivers/powercap/intel_rapl.c | ||
4865 | @@ -1175,12 +1175,12 @@ static const struct x86_cpu_id rapl_ids[] __initconst = { | ||
4866 | RAPL_CPU(INTEL_FAM6_KABYLAKE_MOBILE, rapl_defaults_core), | ||
4867 | RAPL_CPU(INTEL_FAM6_KABYLAKE_DESKTOP, rapl_defaults_core), | ||
4868 | |||
4869 | - RAPL_CPU(INTEL_FAM6_ATOM_SILVERMONT1, rapl_defaults_byt), | ||
4870 | + RAPL_CPU(INTEL_FAM6_ATOM_SILVERMONT, rapl_defaults_byt), | ||
4871 | RAPL_CPU(INTEL_FAM6_ATOM_AIRMONT, rapl_defaults_cht), | ||
4872 | - RAPL_CPU(INTEL_FAM6_ATOM_MERRIFIELD, rapl_defaults_tng), | ||
4873 | - RAPL_CPU(INTEL_FAM6_ATOM_MOOREFIELD, rapl_defaults_ann), | ||
4874 | + RAPL_CPU(INTEL_FAM6_ATOM_SILVERMONT_MID,rapl_defaults_tng), | ||
4875 | + RAPL_CPU(INTEL_FAM6_ATOM_AIRMONT_MID, rapl_defaults_ann), | ||
4876 | RAPL_CPU(INTEL_FAM6_ATOM_GOLDMONT, rapl_defaults_core), | ||
4877 | - RAPL_CPU(INTEL_FAM6_ATOM_DENVERTON, rapl_defaults_core), | ||
4878 | + RAPL_CPU(INTEL_FAM6_ATOM_GOLDMONT_X, rapl_defaults_core), | ||
4879 | |||
4880 | RAPL_CPU(INTEL_FAM6_XEON_PHI_KNL, rapl_defaults_hsw_server), | ||
4881 | {} | ||
4882 | diff --git a/drivers/thermal/intel_soc_dts_thermal.c b/drivers/thermal/intel_soc_dts_thermal.c | ||
4883 | index b2bbaa1c60b0..18788109cae6 100644 | ||
4884 | --- a/drivers/thermal/intel_soc_dts_thermal.c | ||
4885 | +++ b/drivers/thermal/intel_soc_dts_thermal.c | ||
4886 | @@ -43,7 +43,7 @@ static irqreturn_t soc_irq_thread_fn(int irq, void *dev_data) | ||
4887 | } | ||
4888 | |||
4889 | static const struct x86_cpu_id soc_thermal_ids[] = { | ||
4890 | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1, 0, | ||
4891 | + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT, 0, | ||
4892 | BYT_SOC_DTS_APIC_IRQ}, | ||
4893 | {} | ||
4894 | }; | ||
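Editor's note: the intel_pstate, intel_idle, sdhci-acpi, pci-mid, intel_rapl and SoC DTS thermal hunks above are mechanical renames to the new INTEL_FAM6_ATOM_* scheme (microarchitecture name plus _MID/_X suffixes instead of platform codenames such as Merrifield or Denverton). A hedged sketch of how such a match table is consumed; my_atom_ids and my_driver_init are illustrative, x86_match_cpu() is the standard helper these drivers use:

    #include <linux/errno.h>
    #include <linux/init.h>
    #include <asm/cpu_device_id.h>
    #include <asm/intel-family.h>

    #define ICPU(model) { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, 0 }

    static const struct x86_cpu_id my_atom_ids[] = {
    	ICPU(INTEL_FAM6_ATOM_SILVERMONT),	/* was ATOM_SILVERMONT1 */
    	ICPU(INTEL_FAM6_ATOM_SILVERMONT_MID),	/* was ATOM_MERRIFIELD */
    	ICPU(INTEL_FAM6_ATOM_GOLDMONT_X),	/* was ATOM_DENVERTON */
    	{}
    };

    static int __init my_driver_init(void)
    {
    	/* x86_match_cpu() returns the first table entry matching the
    	 * boot CPU's vendor/family/model, or NULL if none matches. */
    	if (!x86_match_cpu(my_atom_ids))
    		return -ENODEV;
    	return 0;
    }
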
4895 | diff --git a/include/linux/bitops.h b/include/linux/bitops.h | ||
4896 | index a83c822c35c2..d4b167fc9ecb 100644 | ||
4897 | --- a/include/linux/bitops.h | ||
4898 | +++ b/include/linux/bitops.h | ||
4899 | @@ -1,28 +1,9 @@ | ||
4900 | #ifndef _LINUX_BITOPS_H | ||
4901 | #define _LINUX_BITOPS_H | ||
4902 | #include <asm/types.h> | ||
4903 | +#include <linux/bits.h> | ||
4904 | |||
4905 | -#ifdef __KERNEL__ | ||
4906 | -#define BIT(nr) (1UL << (nr)) | ||
4907 | -#define BIT_ULL(nr) (1ULL << (nr)) | ||
4908 | -#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) | ||
4909 | -#define BIT_WORD(nr) ((nr) / BITS_PER_LONG) | ||
4910 | -#define BIT_ULL_MASK(nr) (1ULL << ((nr) % BITS_PER_LONG_LONG)) | ||
4911 | -#define BIT_ULL_WORD(nr) ((nr) / BITS_PER_LONG_LONG) | ||
4912 | -#define BITS_PER_BYTE 8 | ||
4913 | #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long)) | ||
4914 | -#endif | ||
4915 | - | ||
4916 | -/* | ||
4917 | - * Create a contiguous bitmask starting at bit position @l and ending at | ||
4918 | - * position @h. For example | ||
4919 | - * GENMASK_ULL(39, 21) gives us the 64bit vector 0x000000ffffe00000. | ||
4920 | - */ | ||
4921 | -#define GENMASK(h, l) \ | ||
4922 | - (((~0UL) << (l)) & (~0UL >> (BITS_PER_LONG - 1 - (h)))) | ||
4923 | - | ||
4924 | -#define GENMASK_ULL(h, l) \ | ||
4925 | - (((~0ULL) << (l)) & (~0ULL >> (BITS_PER_LONG_LONG - 1 - (h)))) | ||
4926 | |||
4927 | extern unsigned int __sw_hweight8(unsigned int w); | ||
4928 | extern unsigned int __sw_hweight16(unsigned int w); | ||
4929 | diff --git a/include/linux/bits.h b/include/linux/bits.h | ||
4930 | new file mode 100644 | ||
4931 | index 000000000000..2b7b532c1d51 | ||
4932 | --- /dev/null | ||
4933 | +++ b/include/linux/bits.h | ||
4934 | @@ -0,0 +1,26 @@ | ||
4935 | +/* SPDX-License-Identifier: GPL-2.0 */ | ||
4936 | +#ifndef __LINUX_BITS_H | ||
4937 | +#define __LINUX_BITS_H | ||
4938 | +#include <asm/bitsperlong.h> | ||
4939 | + | ||
4940 | +#define BIT(nr) (1UL << (nr)) | ||
4941 | +#define BIT_ULL(nr) (1ULL << (nr)) | ||
4942 | +#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) | ||
4943 | +#define BIT_WORD(nr) ((nr) / BITS_PER_LONG) | ||
4944 | +#define BIT_ULL_MASK(nr) (1ULL << ((nr) % BITS_PER_LONG_LONG)) | ||
4945 | +#define BIT_ULL_WORD(nr) ((nr) / BITS_PER_LONG_LONG) | ||
4946 | +#define BITS_PER_BYTE 8 | ||
4947 | + | ||
4948 | +/* | ||
4949 | + * Create a contiguous bitmask starting at bit position @l and ending at | ||
4950 | + * position @h. For example | ||
4951 | + * GENMASK_ULL(39, 21) gives us the 64bit vector 0x000000ffffe00000. | ||
4952 | + */ | ||
4953 | +#define GENMASK(h, l) \ | ||
4954 | + (((~0UL) - (1UL << (l)) + 1) & (~0UL >> (BITS_PER_LONG - 1 - (h)))) | ||
4955 | + | ||
4956 | +#define GENMASK_ULL(h, l) \ | ||
4957 | + (((~0ULL) - (1ULL << (l)) + 1) & \ | ||
4958 | + (~0ULL >> (BITS_PER_LONG_LONG - 1 - (h)))) | ||
4959 | + | ||
4960 | +#endif /* __LINUX_BITS_H */ | ||
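Editor's note: the macros move to the new header essentially verbatim, except that GENMASK()/GENMASK_ULL() now build the low mask as (~0UL) - (1UL << l) + 1 instead of (~0UL) << l; for valid h >= l both forms produce the same mask. A small standalone check of the example from the comment above; the macro body is copied from the hunk, the test harness is the editor's:

    #include <stdio.h>

    #define BITS_PER_LONG_LONG 64

    #define GENMASK_ULL(h, l) \
    	(((~0ULL) - (1ULL << (l)) + 1) & \
    	 (~0ULL >> (BITS_PER_LONG_LONG - 1 - (h))))

    int main(void)
    {
    	/* Matches the example in the comment: bits 21..39 set. */
    	printf("GENMASK_ULL(39, 21) = 0x%016llx\n", GENMASK_ULL(39, 21));
    	return 0;	/* expect 0x000000ffffe00000 */
    }
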
4961 | diff --git a/include/linux/cpu.h b/include/linux/cpu.h | ||
4962 | index ae5ac89324df..166686209f2c 100644 | ||
4963 | --- a/include/linux/cpu.h | ||
4964 | +++ b/include/linux/cpu.h | ||
4965 | @@ -54,6 +54,8 @@ extern ssize_t cpu_show_spec_store_bypass(struct device *dev, | ||
4966 | struct device_attribute *attr, char *buf); | ||
4967 | extern ssize_t cpu_show_l1tf(struct device *dev, | ||
4968 | struct device_attribute *attr, char *buf); | ||
4969 | +extern ssize_t cpu_show_mds(struct device *dev, | ||
4970 | + struct device_attribute *attr, char *buf); | ||
4971 | |||
4972 | extern __printf(4, 5) | ||
4973 | struct device *cpu_device_create(struct device *parent, void *drvdata, | ||
4974 | @@ -276,4 +278,28 @@ static inline void cpu_smt_check_topology_early(void) { } | ||
4975 | static inline void cpu_smt_check_topology(void) { } | ||
4976 | #endif | ||
4977 | |||
4978 | +/* | ||
4979 | + * These are used for a global "mitigations=" cmdline option for toggling | ||
4980 | + * optional CPU mitigations. | ||
4981 | + */ | ||
4982 | +enum cpu_mitigations { | ||
4983 | + CPU_MITIGATIONS_OFF, | ||
4984 | + CPU_MITIGATIONS_AUTO, | ||
4985 | + CPU_MITIGATIONS_AUTO_NOSMT, | ||
4986 | +}; | ||
4987 | + | ||
4988 | +extern enum cpu_mitigations cpu_mitigations; | ||
4989 | + | ||
4990 | +/* mitigations=off */ | ||
4991 | +static inline bool cpu_mitigations_off(void) | ||
4992 | +{ | ||
4993 | + return cpu_mitigations == CPU_MITIGATIONS_OFF; | ||
4994 | +} | ||
4995 | + | ||
4996 | +/* mitigations=auto,nosmt */ | ||
4997 | +static inline bool cpu_mitigations_auto_nosmt(void) | ||
4998 | +{ | ||
4999 | + return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT; | ||
5000 | +} | ||
5001 | + | ||
5002 | #endif /* _LINUX_CPU_H_ */ | ||
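Editor's note: architecture mitigation code is expected to consult these helpers when picking its default. A hedged sketch of the intended pattern; my_select_mitigation, the MY_* constants and X86_BUG_EXAMPLE are illustrative, only the two cpu_mitigations_*() helpers come from the hunk above:

    /* Hypothetical arch-side policy selection, for illustration only. */
    static enum my_mitigation my_select_mitigation(void)
    {
    	if (!boot_cpu_has_bug(X86_BUG_EXAMPLE) || cpu_mitigations_off())
    		return MY_MITIGATION_OFF;	/* unaffected, or mitigations=off */

    	if (cpu_mitigations_auto_nosmt())
    		return MY_MITIGATION_FULL_NOSMT;	/* mitigations=auto,nosmt */

    	return MY_MITIGATION_FULL;	/* default: mitigations=auto */
    }
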
5003 | diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h | ||
5004 | index d53a23100401..58ae371556bc 100644 | ||
5005 | --- a/include/linux/ptrace.h | ||
5006 | +++ b/include/linux/ptrace.h | ||
5007 | @@ -60,14 +60,17 @@ extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead); | ||
5008 | #define PTRACE_MODE_READ 0x01 | ||
5009 | #define PTRACE_MODE_ATTACH 0x02 | ||
5010 | #define PTRACE_MODE_NOAUDIT 0x04 | ||
5011 | -#define PTRACE_MODE_FSCREDS 0x08 | ||
5012 | -#define PTRACE_MODE_REALCREDS 0x10 | ||
5013 | +#define PTRACE_MODE_FSCREDS 0x08 | ||
5014 | +#define PTRACE_MODE_REALCREDS 0x10 | ||
5015 | +#define PTRACE_MODE_SCHED 0x20 | ||
5016 | +#define PTRACE_MODE_IBPB 0x40 | ||
5017 | |||
5018 | /* shorthands for READ/ATTACH and FSCREDS/REALCREDS combinations */ | ||
5019 | #define PTRACE_MODE_READ_FSCREDS (PTRACE_MODE_READ | PTRACE_MODE_FSCREDS) | ||
5020 | #define PTRACE_MODE_READ_REALCREDS (PTRACE_MODE_READ | PTRACE_MODE_REALCREDS) | ||
5021 | #define PTRACE_MODE_ATTACH_FSCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_FSCREDS) | ||
5022 | #define PTRACE_MODE_ATTACH_REALCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_REALCREDS) | ||
5023 | +#define PTRACE_MODE_SPEC_IBPB (PTRACE_MODE_ATTACH_REALCREDS | PTRACE_MODE_IBPB) | ||
5024 | |||
5025 | /** | ||
5026 | * ptrace_may_access - check whether the caller is permitted to access | ||
5027 | @@ -85,6 +88,20 @@ extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead); | ||
5028 | */ | ||
5029 | extern bool ptrace_may_access(struct task_struct *task, unsigned int mode); | ||
5030 | |||
5031 | +/** | ||
5032 | + * ptrace_may_access_sched - check whether the caller is permitted to access | ||
5033 | + * a target task. | ||
5034 | + * @task: target task | ||
5035 | + * @mode: selects type of access and caller credentials | ||
5036 | + * | ||
5037 | + * Returns the raw denial status: false if access is permitted, true on denial. | ||
5038 | + * | ||
5039 | + * Similar to ptrace_may_access(). Only to be called from context switch | ||
5040 | + * code. Does not call into audit and the regular LSM hooks due to locking | ||
5041 | + * constraints. | ||
5042 | + */ | ||
5043 | +extern bool ptrace_may_access_sched(struct task_struct *task, unsigned int mode); | ||
5044 | + | ||
5045 | static inline int ptrace_reparented(struct task_struct *child) | ||
5046 | { | ||
5047 | return !same_thread_group(child->real_parent, child->parent); | ||
5048 | diff --git a/include/linux/sched.h b/include/linux/sched.h | ||
5049 | index ebd0afb35d16..1c487a3abd84 100644 | ||
5050 | --- a/include/linux/sched.h | ||
5051 | +++ b/include/linux/sched.h | ||
5052 | @@ -2357,6 +2357,8 @@ static inline void memalloc_noio_restore(unsigned int flags) | ||
5053 | #define PFA_LMK_WAITING 3 /* Lowmemorykiller is waiting */ | ||
5054 | #define PFA_SPEC_SSB_DISABLE 4 /* Speculative Store Bypass disabled */ | ||
5055 | #define PFA_SPEC_SSB_FORCE_DISABLE 5 /* Speculative Store Bypass force disabled*/ | ||
5056 | +#define PFA_SPEC_IB_DISABLE 6 /* Indirect branch speculation restricted */ | ||
5057 | +#define PFA_SPEC_IB_FORCE_DISABLE 7 /* Indirect branch speculation permanently restricted */ | ||
5058 | |||
5059 | |||
5060 | #define TASK_PFA_TEST(name, func) \ | ||
5061 | @@ -2390,6 +2392,13 @@ TASK_PFA_CLEAR(SPEC_SSB_DISABLE, spec_ssb_disable) | ||
5062 | TASK_PFA_TEST(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) | ||
5063 | TASK_PFA_SET(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) | ||
5064 | |||
5065 | +TASK_PFA_TEST(SPEC_IB_DISABLE, spec_ib_disable) | ||
5066 | +TASK_PFA_SET(SPEC_IB_DISABLE, spec_ib_disable) | ||
5067 | +TASK_PFA_CLEAR(SPEC_IB_DISABLE, spec_ib_disable) | ||
5068 | + | ||
5069 | +TASK_PFA_TEST(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable) | ||
5070 | +TASK_PFA_SET(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable) | ||
5071 | + | ||
5072 | /* | ||
5073 | * task->jobctl flags | ||
5074 | */ | ||
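Editor's note: the TASK_PFA_TEST/SET/CLEAR lines above expand to static inline helpers named after their second argument, so the new flags are manipulated as task_spec_ib_disable(p), task_set_spec_ib_disable(p) and task_clear_spec_ib_disable(p). As with SPEC_SSB_FORCE_DISABLE before it, SPEC_IB_FORCE_DISABLE deliberately gets no CLEAR helper: once force-disabled, the restriction cannot be lifted again.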
5075 | diff --git a/include/linux/sched/smt.h b/include/linux/sched/smt.h | ||
5076 | new file mode 100644 | ||
5077 | index 000000000000..559ac4590593 | ||
5078 | --- /dev/null | ||
5079 | +++ b/include/linux/sched/smt.h | ||
5080 | @@ -0,0 +1,20 @@ | ||
5081 | +/* SPDX-License-Identifier: GPL-2.0 */ | ||
5082 | +#ifndef _LINUX_SCHED_SMT_H | ||
5083 | +#define _LINUX_SCHED_SMT_H | ||
5084 | + | ||
5085 | +#include <linux/atomic.h> | ||
5086 | + | ||
5087 | +#ifdef CONFIG_SCHED_SMT | ||
5088 | +extern atomic_t sched_smt_present; | ||
5089 | + | ||
5090 | +static __always_inline bool sched_smt_active(void) | ||
5091 | +{ | ||
5092 | + return atomic_read(&sched_smt_present); | ||
5093 | +} | ||
5094 | +#else | ||
5095 | +static inline bool sched_smt_active(void) { return false; } | ||
5096 | +#endif | ||
5097 | + | ||
5098 | +void arch_smt_update(void); | ||
5099 | + | ||
5100 | +#endif | ||
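Editor's note: sched_smt_active() gives mitigation code a cheap, hotpath-safe answer (a single atomic_read()) to "is any core currently running two siblings?". A hedged sketch of an arch override of the hook declared here; the __weak default is added in kernel/cpu.c below, and the my_*() helpers are hypothetical:

    void arch_smt_update(void)
    {
    	/* Re-evaluate SMT-sensitive mitigation state on every SMT
    	 * topology change reported by the hotplug code. */
    	if (sched_smt_active())
    		my_enable_cross_thread_protection();	/* hypothetical */
    	else
    		my_disable_cross_thread_protection();	/* hypothetical */
    }
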
5101 | diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h | ||
5102 | index 64776b72e1eb..64ec0d62e5f5 100644 | ||
5103 | --- a/include/uapi/linux/prctl.h | ||
5104 | +++ b/include/uapi/linux/prctl.h | ||
5105 | @@ -202,6 +202,7 @@ struct prctl_mm_map { | ||
5106 | #define PR_SET_SPECULATION_CTRL 53 | ||
5107 | /* Speculation control variants */ | ||
5108 | # define PR_SPEC_STORE_BYPASS 0 | ||
5109 | +# define PR_SPEC_INDIRECT_BRANCH 1 | ||
5110 | /* Return and control values for PR_SET/GET_SPECULATION_CTRL */ | ||
5111 | # define PR_SPEC_NOT_AFFECTED 0 | ||
5112 | # define PR_SPEC_PRCTL (1UL << 0) | ||
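Editor's note: with the new selector, a task can restrict its own indirect branch speculation through the existing speculation-control prctl. A minimal userspace sketch, assuming kernel headers new enough to define PR_SPEC_INDIRECT_BRANCH:

    #include <stdio.h>
    #include <sys/prctl.h>
    #include <linux/prctl.h>

    int main(void)
    {
    	long state;

    	/* Restrict indirect branch speculation for this task. */
    	if (prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH,
    		  PR_SPEC_DISABLE, 0, 0))
    		perror("PR_SET_SPECULATION_CTRL");

    	/* Query the state back: a bitmask of PR_SPEC_* values. */
    	state = prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH,
    		      0, 0, 0);
    	printf("indirect branch speculation state: 0x%lx\n", state);
    	return 0;
    }
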
5113 | diff --git a/kernel/cpu.c b/kernel/cpu.c | ||
5114 | index bf24e8400903..db1a0bc46c3e 100644 | ||
5115 | --- a/kernel/cpu.c | ||
5116 | +++ b/kernel/cpu.c | ||
5117 | @@ -8,6 +8,7 @@ | ||
5118 | #include <linux/init.h> | ||
5119 | #include <linux/notifier.h> | ||
5120 | #include <linux/sched.h> | ||
5121 | +#include <linux/sched/smt.h> | ||
5122 | #include <linux/unistd.h> | ||
5123 | #include <linux/cpu.h> | ||
5124 | #include <linux/oom.h> | ||
5125 | @@ -356,6 +357,12 @@ void cpu_hotplug_enable(void) | ||
5126 | EXPORT_SYMBOL_GPL(cpu_hotplug_enable); | ||
5127 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
5128 | |||
5129 | +/* | ||
5130 | + * Architectures that need SMT-specific errata handling during SMT hotplug | ||
5131 | + * should override this. | ||
5132 | + */ | ||
5133 | +void __weak arch_smt_update(void) { } | ||
5134 | + | ||
5135 | #ifdef CONFIG_HOTPLUG_SMT | ||
5136 | enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED; | ||
5137 | EXPORT_SYMBOL_GPL(cpu_smt_control); | ||
5138 | @@ -1058,6 +1065,7 @@ out: | ||
5139 | /* This post dead nonsense must die */ | ||
5140 | if (!ret && hasdied) | ||
5141 | cpu_notify_nofail(CPU_POST_DEAD, cpu); | ||
5142 | + arch_smt_update(); | ||
5143 | return ret; | ||
5144 | } | ||
5145 | |||
5146 | @@ -1177,6 +1185,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) | ||
5147 | ret = cpuhp_up_callbacks(cpu, st, target); | ||
5148 | out: | ||
5149 | cpu_hotplug_done(); | ||
5150 | + arch_smt_update(); | ||
5151 | return ret; | ||
5152 | } | ||
5153 | |||
5154 | @@ -2012,8 +2021,10 @@ static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) | ||
5155 | */ | ||
5156 | cpuhp_offline_cpu_device(cpu); | ||
5157 | } | ||
5158 | - if (!ret) | ||
5159 | + if (!ret) { | ||
5160 | cpu_smt_control = ctrlval; | ||
5161 | + arch_smt_update(); | ||
5162 | + } | ||
5163 | cpu_maps_update_done(); | ||
5164 | return ret; | ||
5165 | } | ||
5166 | @@ -2024,6 +2035,7 @@ static int cpuhp_smt_enable(void) | ||
5167 | |||
5168 | cpu_maps_update_begin(); | ||
5169 | cpu_smt_control = CPU_SMT_ENABLED; | ||
5170 | + arch_smt_update(); | ||
5171 | for_each_present_cpu(cpu) { | ||
5172 | /* Skip online CPUs and CPUs on offline nodes */ | ||
5173 | if (cpu_online(cpu) || !node_online(cpu_to_node(cpu))) | ||
5174 | @@ -2222,3 +2234,18 @@ void __init boot_cpu_hotplug_init(void) | ||
5175 | #endif | ||
5176 | this_cpu_write(cpuhp_state.state, CPUHP_ONLINE); | ||
5177 | } | ||
5178 | + | ||
5179 | +enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO; | ||
5180 | + | ||
5181 | +static int __init mitigations_parse_cmdline(char *arg) | ||
5182 | +{ | ||
5183 | + if (!strcmp(arg, "off")) | ||
5184 | + cpu_mitigations = CPU_MITIGATIONS_OFF; | ||
5185 | + else if (!strcmp(arg, "auto")) | ||
5186 | + cpu_mitigations = CPU_MITIGATIONS_AUTO; | ||
5187 | + else if (!strcmp(arg, "auto,nosmt")) | ||
5188 | + cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT; | ||
5189 | + | ||
5190 | + return 0; | ||
5191 | +} | ||
5192 | +early_param("mitigations", mitigations_parse_cmdline); | ||
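Editor's note: the parser accepts exactly three values and silently ignores anything else, leaving the CPU_MITIGATIONS_AUTO default in place. Typical boot-line usage, appended to the kernel command line in the bootloader configuration:

    mitigations=off          disable all optional CPU mitigations
    mitigations=auto         mitigate, but leave SMT as configured (default)
    mitigations=auto,nosmt   mitigate and disable SMT where a mitigation needs it
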
5193 | diff --git a/kernel/ptrace.c b/kernel/ptrace.c | ||
5194 | index f39a7be98fc1..efba851ee018 100644 | ||
5195 | --- a/kernel/ptrace.c | ||
5196 | +++ b/kernel/ptrace.c | ||
5197 | @@ -258,6 +258,9 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state) | ||
5198 | |||
5199 | static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode) | ||
5200 | { | ||
5201 | + if (mode & PTRACE_MODE_SCHED) | ||
5202 | + return false; | ||
5203 | + | ||
5204 | if (mode & PTRACE_MODE_NOAUDIT) | ||
5205 | return has_ns_capability_noaudit(current, ns, CAP_SYS_PTRACE); | ||
5206 | else | ||
5207 | @@ -325,9 +328,16 @@ ok: | ||
5208 | !ptrace_has_cap(mm->user_ns, mode))) | ||
5209 | return -EPERM; | ||
5210 | |||
5211 | + if (mode & PTRACE_MODE_SCHED) | ||
5212 | + return 0; | ||
5213 | return security_ptrace_access_check(task, mode); | ||
5214 | } | ||
5215 | |||
5216 | +bool ptrace_may_access_sched(struct task_struct *task, unsigned int mode) | ||
5217 | +{ | ||
5218 | + return __ptrace_may_access(task, mode | PTRACE_MODE_SCHED); | ||
5219 | +} | ||
5220 | + | ||
5221 | bool ptrace_may_access(struct task_struct *task, unsigned int mode) | ||
5222 | { | ||
5223 | int err; | ||
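Editor's note: because ptrace_has_cap() short-circuits to false and the LSM hook is skipped for PTRACE_MODE_SCHED, ptrace_may_access_sched() is safe under the scheduler's locks but also stricter than a full ptrace check. A hedged sketch of the kind of caller this enables, modelled on the x86 context-switch IBPB decision; my_ibpb_needed is illustrative. Note the helper passes through the raw __ptrace_may_access() status, so a true result means access was denied:

    /* Issue the expensive barrier only when the incoming task is NOT
     * ptrace-accessible, i.e. when user-to-user isolation must hold. */
    static inline bool my_ibpb_needed(struct task_struct *next)
    {
    	return ptrace_may_access_sched(next, PTRACE_MODE_SPEC_IBPB);
    }
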
5224 | diff --git a/kernel/sched/core.c b/kernel/sched/core.c | ||
5225 | index 6b3fff6a6437..50e80b1be2c8 100644 | ||
5226 | --- a/kernel/sched/core.c | ||
5227 | +++ b/kernel/sched/core.c | ||
5228 | @@ -7355,11 +7355,22 @@ static int cpuset_cpu_inactive(unsigned int cpu) | ||
5229 | return 0; | ||
5230 | } | ||
5231 | |||
5232 | +#ifdef CONFIG_SCHED_SMT | ||
5233 | +atomic_t sched_smt_present = ATOMIC_INIT(0); | ||
5234 | +#endif | ||
5235 | + | ||
5236 | int sched_cpu_activate(unsigned int cpu) | ||
5237 | { | ||
5238 | struct rq *rq = cpu_rq(cpu); | ||
5239 | unsigned long flags; | ||
5240 | |||
5241 | +#ifdef CONFIG_SCHED_SMT | ||
5242 | + /* | ||
5243 | + * When going up, increment the number of cores with SMT present. | ||
5244 | + */ | ||
5245 | + if (cpumask_weight(cpu_smt_mask(cpu)) == 2) | ||
5246 | + atomic_inc(&sched_smt_present); | ||
5247 | +#endif | ||
5248 | set_cpu_active(cpu, true); | ||
5249 | |||
5250 | if (sched_smp_initialized) { | ||
5251 | @@ -7408,6 +7419,14 @@ int sched_cpu_deactivate(unsigned int cpu) | ||
5252 | else | ||
5253 | synchronize_rcu(); | ||
5254 | |||
5255 | +#ifdef CONFIG_SCHED_SMT | ||
5256 | + /* | ||
5257 | + * When going down, decrement the number of cores with SMT present. | ||
5258 | + */ | ||
5259 | + if (cpumask_weight(cpu_smt_mask(cpu)) == 2) | ||
5260 | + atomic_dec(&sched_smt_present); | ||
5261 | +#endif | ||
5262 | + | ||
5263 | if (!sched_smp_initialized) | ||
5264 | return 0; | ||
5265 | |||
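Editor's note: the cpumask_weight() == 2 test makes the counter tick exactly once per core. It fires when a core's second sibling comes online, and again on the way down when the online sibling count is about to drop back to one (the mask covers online siblings only). sched_smt_present therefore counts cores currently running more than one thread, and sched_smt_active() is true as long as that count is non-zero.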
5266 | diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h | ||
5267 | index ec6e838e991a..15c08752926b 100644 | ||
5268 | --- a/kernel/sched/sched.h | ||
5269 | +++ b/kernel/sched/sched.h | ||
5270 | @@ -2,6 +2,7 @@ | ||
5271 | #include <linux/sched.h> | ||
5272 | #include <linux/sched/sysctl.h> | ||
5273 | #include <linux/sched/rt.h> | ||
5274 | +#include <linux/sched/smt.h> | ||
5275 | #include <linux/u64_stats_sync.h> | ||
5276 | #include <linux/sched/deadline.h> | ||
5277 | #include <linux/kernel_stat.h> | ||
5278 | diff --git a/tools/power/x86/turbostat/Makefile b/tools/power/x86/turbostat/Makefile | ||
5279 | index 8561e7ddca59..92be948c922d 100644 | ||
5280 | --- a/tools/power/x86/turbostat/Makefile | ||
5281 | +++ b/tools/power/x86/turbostat/Makefile | ||
5282 | @@ -8,7 +8,7 @@ ifeq ("$(origin O)", "command line") | ||
5283 | endif | ||
5284 | |||
5285 | turbostat : turbostat.c | ||
5286 | -CFLAGS += -Wall | ||
5287 | +CFLAGS += -Wall -I../../../include | ||
5288 | CFLAGS += -DMSRHEADER='"../../../../arch/x86/include/asm/msr-index.h"' | ||
5289 | |||
5290 | %: %.c |
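Editor's note: the extra -I../../../include lets the in-tree turbostat build resolve <linux/bits.h>, which arch/x86/include/asm/msr-index.h (pulled in via the MSRHEADER define above) now depends on after the bitops.h split earlier in this patch.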