Magellan Linux

Annotation of /trunk/kernel-alx-legacy/patches-4.9/0275-4.9.176-all-fixes.patch



Revision 3608
Fri Aug 14 07:34:29 2020 UTC by niro
File size: 192806 byte(s)
-added kernel-alx-legacy pkg
1 niro 3608 diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
2     index 069e8d52c991..cadb7a9a5218 100644
3     --- a/Documentation/ABI/testing/sysfs-devices-system-cpu
4     +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
5     @@ -357,6 +357,7 @@ What: /sys/devices/system/cpu/vulnerabilities
6     /sys/devices/system/cpu/vulnerabilities/spectre_v2
7     /sys/devices/system/cpu/vulnerabilities/spec_store_bypass
8     /sys/devices/system/cpu/vulnerabilities/l1tf
9     + /sys/devices/system/cpu/vulnerabilities/mds
10     Date: January 2018
11     Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
12     Description: Information about CPU vulnerabilities
13     @@ -369,8 +370,7 @@ Description: Information about CPU vulnerabilities
14     "Vulnerable" CPU is affected and no mitigation in effect
15     "Mitigation: $M" CPU is affected and mitigation $M is in effect
16    
17     - Details about the l1tf file can be found in
18     - Documentation/admin-guide/l1tf.rst
19     + See also: Documentation/hw-vuln/index.rst
20    
21     What: /sys/devices/system/cpu/smt
22     /sys/devices/system/cpu/smt/active
23     diff --git a/Documentation/hw-vuln/index.rst b/Documentation/hw-vuln/index.rst
24     new file mode 100644
25     index 000000000000..ffc064c1ec68
26     --- /dev/null
27     +++ b/Documentation/hw-vuln/index.rst
28     @@ -0,0 +1,13 @@
29     +========================
30     +Hardware vulnerabilities
31     +========================
32     +
33     +This section describes CPU vulnerabilities and provides an overview of the
34     +possible mitigations along with guidance for selecting mitigations if they
35     +are configurable at compile, boot or run time.
36     +
37     +.. toctree::
38     + :maxdepth: 1
39     +
40     + l1tf
41     + mds
42     diff --git a/Documentation/hw-vuln/l1tf.rst b/Documentation/hw-vuln/l1tf.rst
43     new file mode 100644
44     index 000000000000..31653a9f0e1b
45     --- /dev/null
46     +++ b/Documentation/hw-vuln/l1tf.rst
47     @@ -0,0 +1,615 @@
48     +L1TF - L1 Terminal Fault
49     +========================
50     +
51     +L1 Terminal Fault is a hardware vulnerability which allows unprivileged
52     +speculative access to data which is available in the Level 1 Data Cache
53     +when the page table entry controlling the virtual address, which is used
54     +for the access, has the Present bit cleared or other reserved bits set.
55     +
56     +Affected processors
57     +-------------------
58     +
59     +This vulnerability affects a wide range of Intel processors. The
60     +vulnerability is not present on:
61     +
62     + - Processors from AMD, Centaur and other non Intel vendors
63     +
64     + - Older processor models, where the CPU family is < 6
65     +
66     + - A range of Intel ATOM processors (Cedarview, Cloverview, Lincroft,
67     + Penwell, Pineview, Silvermont, Airmont, Merrifield)
68     +
69     + - The Intel XEON PHI family
70     +
71     + - Intel processors which have the ARCH_CAP_RDCL_NO bit set in the
72     + IA32_ARCH_CAPABILITIES MSR. If the bit is set the CPU is not affected
73     + by the Meltdown vulnerability either. These CPUs should become
74     + available by end of 2018.
75     +
76     +Whether a processor is affected or not can be read out from the L1TF
77     +vulnerability file in sysfs. See :ref:`l1tf_sys_info`.
78     +
79     +Related CVEs
80     +------------
81     +
82     +The following CVE entries are related to the L1TF vulnerability:
83     +
84     + ============= ================= ==============================
85     + CVE-2018-3615 L1 Terminal Fault SGX related aspects
86     + CVE-2018-3620 L1 Terminal Fault OS, SMM related aspects
87     + CVE-2018-3646 L1 Terminal Fault Virtualization related aspects
88     + ============= ================= ==============================
89     +
90     +Problem
91     +-------
92     +
93     +If an instruction accesses a virtual address for which the relevant page
94     +table entry (PTE) has the Present bit cleared or other reserved bits set,
95     +then speculative execution ignores the invalid PTE and loads the referenced
96     +data if it is present in the Level 1 Data Cache, as if the page referenced
97     +by the address bits in the PTE was still present and accessible.
98     +
99     +While this is a purely speculative mechanism and the instruction will raise
100     +a page fault when it is retired eventually, the pure act of loading the
101     +data and making it available to other speculative instructions opens up the
102     +opportunity for side channel attacks to unprivileged malicious code,
103     +similar to the Meltdown attack.
104     +
105     +While Meltdown breaks the user space to kernel space protection, L1TF
106     +allows to attack any physical memory address in the system and the attack
107     +works across all protection domains. It allows an attack of SGX and also
108     +works from inside virtual machines because the speculation bypasses the
109     +extended page table (EPT) protection mechanism.
110     +
111     +
112     +Attack scenarios
113     +----------------
114     +
115     +1. Malicious user space
116     +^^^^^^^^^^^^^^^^^^^^^^^
117     +
118     + Operating Systems store arbitrary information in the address bits of a
119     + PTE which is marked non present. This allows a malicious user space
120     + application to attack the physical memory to which these PTEs resolve.
121     + In some cases user-space can maliciously influence the information
122     + encoded in the address bits of the PTE, thus making attacks more
123     + deterministic and more practical.
124     +
125     + The Linux kernel contains a mitigation for this attack vector, PTE
126     + inversion, which is permanently enabled and has no performance
127     + impact. The kernel ensures that the address bits of PTEs, which are not
128     + marked present, never point to cacheable physical memory space.
129     +
130     + A system with an up to date kernel is protected against attacks from
131     + malicious user space applications.
132     +
133     +2. Malicious guest in a virtual machine
134     +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
135     +
136     + The fact that L1TF breaks all domain protections allows malicious guest
137     + OSes, which can control the PTEs directly, and malicious guest user
138     + space applications, which run on an unprotected guest kernel lacking the
139     + PTE inversion mitigation for L1TF, to attack physical host memory.
140     +
141     + A special aspect of L1TF in the context of virtualization is symmetric
142     + multi threading (SMT). The Intel implementation of SMT is called
143     + HyperThreading. The fact that Hyperthreads on the affected processors
144     + share the L1 Data Cache (L1D) is important for this. As the flaw allows
145     + only to attack data which is present in L1D, a malicious guest running
146     + on one Hyperthread can attack the data which is brought into the L1D by
147     + the context which runs on the sibling Hyperthread of the same physical
148     + core. This context can be host OS, host user space or a different guest.
149     +
150     + If the processor does not support Extended Page Tables, the attack is
151     + only possible, when the hypervisor does not sanitize the content of the
152     + effective (shadow) page tables.
153     +
154     + While solutions exist to mitigate these attack vectors fully, these
155     + mitigations are not enabled by default in the Linux kernel because they
156     + can affect performance significantly. The kernel provides several
157     + mechanisms which can be utilized to address the problem depending on the
158     + deployment scenario. The mitigations, their protection scope and impact
159     + are described in the next sections.
160     +
161     + The default mitigations and the rationale for choosing them are explained
162     + at the end of this document. See :ref:`default_mitigations`.
163     +
164     +.. _l1tf_sys_info:
165     +
166     +L1TF system information
167     +-----------------------
168     +
169     +The Linux kernel provides a sysfs interface to enumerate the current L1TF
170     +status of the system: whether the system is vulnerable, and which
171     +mitigations are active. The relevant sysfs file is:
172     +
173     +/sys/devices/system/cpu/vulnerabilities/l1tf
174     +
175     +The possible values in this file are:
176     +
177     + =========================== ===============================
178     + 'Not affected' The processor is not vulnerable
179     + 'Mitigation: PTE Inversion' The host protection is active
180     + =========================== ===============================
181     +
182     +If KVM/VMX is enabled and the processor is vulnerable then the following
183     +information is appended to the 'Mitigation: PTE Inversion' part:
184     +
185     + - SMT status:
186     +
187     + ===================== ================
188     + 'VMX: SMT vulnerable' SMT is enabled
189     + 'VMX: SMT disabled' SMT is disabled
190     + ===================== ================
191     +
192     + - L1D Flush mode:
193     +
194     + ================================ ====================================
195     + 'L1D vulnerable' L1D flushing is disabled
196     +
197     + 'L1D conditional cache flushes' L1D flush is conditionally enabled
198     +
199     + 'L1D cache flushes' L1D flush is unconditionally enabled
200     + ================================ ====================================
201     +
202     +The resulting grade of protection is discussed in the following sections.
203     +
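As an illustrative sketch (not part of the patch itself), the documented status file can be read with a few lines of Python; the fallback string is a hypothetical choice for kernels which do not expose the file::

    from pathlib import Path

    L1TF = Path("/sys/devices/system/cpu/vulnerabilities/l1tf")

    def l1tf_status() -> str:
        try:
            return L1TF.read_text().strip()
        except FileNotFoundError:
            # Older kernels without this file do not report L1TF state.
            return "unknown (l1tf sysfs file not present)"

    print("L1TF:", l1tf_status())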
204     +
205     +Host mitigation mechanism
206     +-------------------------
207     +
208     +The kernel is unconditionally protected against L1TF attacks from malicious
209     +user space running on the host.
210     +
211     +
212     +Guest mitigation mechanisms
213     +---------------------------
214     +
215     +.. _l1d_flush:
216     +
217     +1. L1D flush on VMENTER
218     +^^^^^^^^^^^^^^^^^^^^^^^
219     +
220     + To make sure that a guest cannot attack data which is present in the L1D
221     + the hypervisor flushes the L1D before entering the guest.
222     +
223     + Flushing the L1D evicts not only the data which should not be accessed
224     + by a potentially malicious guest, it also flushes the guest
225     + data. Flushing the L1D has a performance impact as the processor has to
226     + bring the flushed guest data back into the L1D. Depending on the
227     + frequency of VMEXIT/VMENTER and the type of computations in the guest
228     + performance degradation in the range of 1% to 50% has been observed. For
229     + scenarios where guest VMEXIT/VMENTER are rare the performance impact is
230     + minimal. Virtio and mechanisms like posted interrupts are designed to
231     + confine the VMEXITs to a bare minimum, but specific configurations and
232     + application scenarios might still suffer from a high VMEXIT rate.
233     +
234     + The kernel provides two L1D flush modes:
235     + - conditional ('cond')
236     + - unconditional ('always')
237     +
238     + The conditional mode avoids L1D flushing after VMEXITs which execute
239     + only audited code paths before the corresponding VMENTER. These code
240     + paths have been verified that they cannot expose secrets or other
241     + interesting data to an attacker, but they can leak information about the
242     + address space layout of the hypervisor.
243     +
244     + Unconditional mode flushes L1D on all VMENTER invocations and provides
245     + maximum protection. It has a higher overhead than the conditional
246     + mode. The overhead cannot be quantified correctly as it depends on the
247     + workload scenario and the resulting number of VMEXITs.
248     +
249     + The general recommendation is to enable L1D flush on VMENTER. The kernel
250     + defaults to conditional mode on affected processors.
251     +
252     + **Note**, that L1D flush does not prevent the SMT problem because the
253     + sibling thread will also bring back its data into the L1D which makes it
254     + attackable again.
255     +
256     + L1D flush can be controlled by the administrator via the kernel command
257     + line and sysfs control files. See :ref:`mitigation_control_command_line`
258     + and :ref:`mitigation_control_kvm`.
259     +
260     +.. _guest_confinement:
261     +
262     +2. Guest VCPU confinement to dedicated physical cores
263     +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
264     +
265     + To address the SMT problem, it is possible to make a guest or a group of
266     + guests affine to one or more physical cores. The proper mechanism for
267     + that is to utilize exclusive cpusets to ensure that no other guest or
268     + host tasks can run on these cores.
269     +
270     + If only a single guest or related guests run on sibling SMT threads on
271     + the same physical core then they can only attack their own memory and
272     + restricted parts of the host memory.
273     +
274     + Host memory is attackable, when one of the sibling SMT threads runs in
275     + host OS (hypervisor) context and the other in guest context. The amount
276     + of valuable information from the host OS context depends on the context
277     + which the host OS executes, i.e. interrupts, soft interrupts and kernel
278     + threads. The amount of valuable data from these contexts cannot be
279     + declared as non-interesting for an attacker without deep inspection of
280     + the code.
281     +
282     + **Note**, that assigning guests to a fixed set of physical cores affects
283     + the ability of the scheduler to do load balancing and might have
284     + negative effects on CPU utilization depending on the hosting
285     + scenario. Disabling SMT might be a viable alternative for particular
286     + scenarios.
287     +
288     + For further information about confining guests to a single or to a group
289     + of cores consult the cpusets documentation:
290     +
291     + https://www.kernel.org/doc/Documentation/cgroup-v1/cpusets.txt
292     +
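For illustration only, a rough Python sketch of such a confinement, assuming the legacy cgroup v1 cpuset hierarchy is mounted at /sys/fs/cgroup/cpuset; the mount point, the group name 'guest0' and the vCPU thread IDs are assumptions, and cgroup v2 uses a different file layout::

    import os

    CPUSET_ROOT = "/sys/fs/cgroup/cpuset"            # assumed v1 mount point
    GUEST_SET = os.path.join(CPUSET_ROOT, "guest0")  # hypothetical group name

    def confine_guest(vcpu_tids, cpus="2-3", mems="0"):
        os.makedirs(GUEST_SET, exist_ok=True)
        # Restrict the cpuset to the chosen cores and memory node.
        with open(os.path.join(GUEST_SET, "cpuset.cpus"), "w") as f:
            f.write(cpus)
        with open(os.path.join(GUEST_SET, "cpuset.mems"), "w") as f:
            f.write(mems)
        # Claim the CPUs exclusively; rejected if sibling cpusets overlap them.
        with open(os.path.join(GUEST_SET, "cpuset.cpu_exclusive"), "w") as f:
            f.write("1")
        # Move the guest's vCPU threads (e.g. QEMU thread IDs) into the set.
        for tid in vcpu_tids:
            with open(os.path.join(GUEST_SET, "tasks"), "w") as f:
                f.write(str(tid))

    # confine_guest([12345, 12346])    # hypothetical vCPU thread IDs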
293     +.. _interrupt_isolation:
294     +
295     +3. Interrupt affinity
296     +^^^^^^^^^^^^^^^^^^^^^
297     +
298     + Interrupts can be made affine to logical CPUs. This is not universally
299     + true because there are types of interrupts which are truly per CPU
300     + interrupts, e.g. the local timer interrupt. Aside from that, multi-queue
301     + devices affine their interrupts to single CPUs or groups of CPUs per
302     + queue without allowing the administrator to control the affinities.
303     +
304     + Moving the interrupts, which can be affinity controlled, away from CPUs
305     + which run untrusted guests, reduces the attack vector space.
306     +
307     + Whether the interrupts which are affine to CPUs running untrusted
308     + guests provide interesting data for an attacker depends on the system
309     + configuration and the scenarios which run on the system. While for some
310     + of the interrupts it can be assumed that they won't expose interesting
311     + information beyond exposing hints about the host OS memory layout, there
312     + is no way to make general assumptions.
313     +
314     + Interrupt affinity can be controlled by the administrator via the
315     + /proc/irq/$NR/smp_affinity[_list] files. Limited documentation is
316     + available at:
317     +
318     + https://www.kernel.org/doc/Documentation/IRQ-affinity.txt
319     +
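As a hedged sketch (not from the patch), movable interrupts can be steered to a set of housekeeping CPUs by writing to the files named above; the CPU list is an assumption, and per-CPU or otherwise fixed interrupts simply reject the write::

    import glob

    HOUSEKEEPING = "0-1"   # assumed CPUs which never run untrusted guests

    for path in glob.glob("/proc/irq/*/smp_affinity_list"):
        try:
            with open(path, "w") as f:
                f.write(HOUSEKEEPING)
        except OSError:
            # This interrupt cannot be moved (per-CPU or managed); skip it.
            pass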
320     +.. _smt_control:
321     +
322     +4. SMT control
323     +^^^^^^^^^^^^^^
324     +
325     + To prevent the SMT issues of L1TF it might be necessary to disable SMT
326     + completely. Disabling SMT can have a significant performance impact, but
327     + the impact depends on the hosting scenario and the type of workloads.
328     + The impact of disabling SMT needs also to be weighted against the impact
329     + of other mitigation solutions like confining guests to dedicated cores.
330     +
331     + The kernel provides a sysfs interface to retrieve the status of SMT and
332     + to control it. It also provides a kernel command line interface to
333     + control SMT.
334     +
335     + The kernel command line interface consists of the following options:
336     +
337     + =========== ==========================================================
338     + nosmt Affects the bring up of the secondary CPUs during boot. The
339     + kernel tries to bring all present CPUs online during the
340     + boot process. "nosmt" makes sure that from each physical
341     + core only one - the so called primary (hyper) thread is
342     + activated. Due to a design flaw of Intel processors related
343     + to Machine Check Exceptions the non primary siblings have
344     + to be brought up at least partially and are then shut down
345     + again. "nosmt" can be undone via the sysfs interface.
346     +
347     + nosmt=force Has the same effect as "nosmt" but it does not allow to
348     + undo the SMT disable via the sysfs interface.
349     + =========== ==========================================================
350     +
351     + The sysfs interface provides two files:
352     +
353     + - /sys/devices/system/cpu/smt/control
354     + - /sys/devices/system/cpu/smt/active
355     +
356     + /sys/devices/system/cpu/smt/control:
357     +
358     + This file allows to read out the SMT control state and provides the
359     + ability to disable or (re)enable SMT. The possible states are:
360     +
361     + ============== ===================================================
362     + on SMT is supported by the CPU and enabled. All
363     + logical CPUs can be onlined and offlined without
364     + restrictions.
365     +
366     + off SMT is supported by the CPU and disabled. Only
367     + the so called primary SMT threads can be onlined
368     + and offlined without restrictions. An attempt to
369     + online a non-primary sibling is rejected.
370     +
371     + forceoff Same as 'off' but the state cannot be controlled.
372     + Attempts to write to the control file are rejected.
373     +
374     + notsupported The processor does not support SMT. It's therefore
375     + not affected by the SMT implications of L1TF.
376     + Attempts to write to the control file are rejected.
377     + ============== ===================================================
378     +
379     + The possible states which can be written into this file to control SMT
380     + state are:
381     +
382     + - on
383     + - off
384     + - forceoff
385     +
386     + /sys/devices/system/cpu/smt/active:
387     +
388     + This file reports whether SMT is enabled and active, i.e. if on any
389     + physical core two or more sibling threads are online.
390     +
391     + SMT control is also possible at boot time via the l1tf kernel command
392     + line parameter in combination with L1D flush control. See
393     + :ref:`mitigation_control_command_line`.
394     +
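A small illustrative Python helper (editor's sketch) around the two sysfs files described above; writing 'off' requires root and fails when the state is 'forceoff' or 'notsupported'::

    SMT = "/sys/devices/system/cpu/smt"

    def smt_state():
        with open(SMT + "/control") as f:
            control = f.read().strip()      # on/off/forceoff/notsupported
        with open(SMT + "/active") as f:
            active = f.read().strip() == "1"
        return control, active

    def disable_smt():
        with open(SMT + "/control", "w") as f:
            f.write("off")

    print("SMT control=%s active=%s" % smt_state())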
395     +5. Disabling EPT
396     +^^^^^^^^^^^^^^^^
397     +
398     + Disabling EPT for virtual machines provides full mitigation for L1TF even
399     + with SMT enabled, because the effective page tables for guests are
400     + managed and sanitized by the hypervisor. Though disabling EPT has a
401     + significant performance impact especially when the Meltdown mitigation
402     + KPTI is enabled.
403     +
404     + EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter.
405     +
406     +There is ongoing research and development for new mitigation mechanisms to
407     +address the performance impact of disabling SMT or EPT.
408     +
409     +.. _mitigation_control_command_line:
410     +
411     +Mitigation control on the kernel command line
412     +---------------------------------------------
413     +
414     +The kernel command line allows to control the L1TF mitigations at boot
415     +time with the option "l1tf=". The valid arguments for this option are:
416     +
417     + ============ =============================================================
418     + full Provides all available mitigations for the L1TF
419     + vulnerability. Disables SMT and enables all mitigations in
420     + the hypervisors, i.e. unconditional L1D flushing
421     +
422     + SMT control and L1D flush control via the sysfs interface
423     + is still possible after boot. Hypervisors will issue a
424     + warning when the first VM is started in a potentially
425     + insecure configuration, i.e. SMT enabled or L1D flush
426     + disabled.
427     +
428     + full,force Same as 'full', but disables SMT and L1D flush runtime
429     + control. Implies the 'nosmt=force' command line option.
430     + (i.e. sysfs control of SMT is disabled.)
431     +
432     + flush Leaves SMT enabled and enables the default hypervisor
433     + mitigation, i.e. conditional L1D flushing
434     +
435     + SMT control and L1D flush control via the sysfs interface
436     + is still possible after boot. Hypervisors will issue a
437     + warning when the first VM is started in a potentially
438     + insecure configuration, i.e. SMT enabled or L1D flush
439     + disabled.
440     +
441     + flush,nosmt Disables SMT and enables the default hypervisor mitigation,
442     + i.e. conditional L1D flushing.
443     +
444     + SMT control and L1D flush control via the sysfs interface
445     + is still possible after boot. Hypervisors will issue a
446     + warning when the first VM is started in a potentially
447     + insecure configuration, i.e. SMT enabled or L1D flush
448     + disabled.
449     +
450     + flush,nowarn Same as 'flush', but hypervisors will not warn when a VM is
451     + started in a potentially insecure configuration.
452     +
453     + off Disables hypervisor mitigations and doesn't emit any
454     + warnings.
455     + It also drops the swap size and available RAM limit restrictions
456     + on both hypervisor and bare metal.
457     +
458     + ============ =============================================================
459     +
460     +The default is 'flush'. For details about L1D flushing see :ref:`l1d_flush`.
461     +
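For illustration, the mode requested at boot can be recovered from /proc/cmdline; the sketch below (not part of the patch) falls back to the documented default 'flush' when no l1tf= option is present::

    def l1tf_cmdline_mode(default="flush"):
        with open("/proc/cmdline") as f:
            for opt in f.read().split():
                if opt.startswith("l1tf="):
                    return opt.split("=", 1)[1]
        return default

    print("requested l1tf mode:", l1tf_cmdline_mode())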
462     +
463     +.. _mitigation_control_kvm:
464     +
465     +Mitigation control for KVM - module parameter
466     +-------------------------------------------------------------
467     +
468     +The KVM hypervisor mitigation mechanism, flushing the L1D cache when
469     +entering a guest, can be controlled with a module parameter.
470     +
471     +The option/parameter is "kvm-intel.vmentry_l1d_flush=". It takes the
472     +following arguments:
473     +
474     + ============ ==============================================================
475     + always L1D cache flush on every VMENTER.
476     +
477     + cond Flush L1D on VMENTER only when the code between VMEXIT and
478     + VMENTER can leak host memory which is considered
479     + interesting for an attacker. This still can leak host memory
480     + which allows e.g. to determine the hosts address space layout.
481     +
482     + never Disables the mitigation
483     + ============ ==============================================================
484     +
485     +The parameter can be provided on the kernel command line, as a module
486     +parameter when loading the modules and at runtime modified via the sysfs
487     +file:
488     +
489     +/sys/module/kvm_intel/parameters/vmentry_l1d_flush
490     +
491     +The default is 'cond'. If 'l1tf=full,force' is given on the kernel command
492     +line, then 'always' is enforced and the kvm-intel.vmentry_l1d_flush
493     +module parameter is ignored and writes to the sysfs file are rejected.
494     +
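A brief illustrative sketch (not part of the patch) that reads and, as root, changes the module parameter through the sysfs file named above::

    PARAM = "/sys/module/kvm_intel/parameters/vmentry_l1d_flush"

    def get_flush_mode():
        with open(PARAM) as f:
            return f.read().strip()          # 'always', 'cond' or 'never'

    def set_flush_mode(mode):
        # Needs root; rejected when l1tf=full,force was given at boot.
        assert mode in ("always", "cond", "never")
        with open(PARAM, "w") as f:
            f.write(mode)

    print("current vmentry_l1d_flush mode:", get_flush_mode())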
495     +.. _mitigation_selection:
496     +
497     +Mitigation selection guide
498     +--------------------------
499     +
500     +1. No virtualization in use
501     +^^^^^^^^^^^^^^^^^^^^^^^^^^^
502     +
503     + The system is protected by the kernel unconditionally and no further
504     + action is required.
505     +
506     +2. Virtualization with trusted guests
507     +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
508     +
509     + If the guest comes from a trusted source and the guest OS kernel is
510     + guaranteed to have the L1TF mitigations in place the system is fully
511     + protected against L1TF and no further action is required.
512     +
513     + To avoid the overhead of the default L1D flushing on VMENTER the
514     + administrator can disable the flushing via the kernel command line and
515     + sysfs control files. See :ref:`mitigation_control_command_line` and
516     + :ref:`mitigation_control_kvm`.
517     +
518     +
519     +3. Virtualization with untrusted guests
520     +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
521     +
522     +3.1. SMT not supported or disabled
523     +""""""""""""""""""""""""""""""""""
524     +
525     + If SMT is not supported by the processor or disabled in the BIOS or by
526     + the kernel, it's only required to enforce L1D flushing on VMENTER.
527     +
528     + Conditional L1D flushing is the default behaviour and can be tuned. See
529     + :ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`.
530     +
531     +3.2. EPT not supported or disabled
532     +""""""""""""""""""""""""""""""""""
533     +
534     + If EPT is not supported by the processor or disabled in the hypervisor,
535     + the system is fully protected. SMT can stay enabled and L1D flushing on
536     + VMENTER is not required.
537     +
538     + EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter.
539     +
540     +3.3. SMT and EPT supported and active
541     +"""""""""""""""""""""""""""""""""""""
542     +
543     + If SMT and EPT are supported and active then various degrees of
544     + mitigations can be employed:
545     +
546     + - L1D flushing on VMENTER:
547     +
548     + L1D flushing on VMENTER is the minimal protection requirement, but it
549     + is only potent in combination with other mitigation methods.
550     +
551     + Conditional L1D flushing is the default behaviour and can be tuned. See
552     + :ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`.
553     +
554     + - Guest confinement:
555     +
556     + Confinement of guests to a single or a group of physical cores which
557     + are not running any other processes, can reduce the attack surface
558     + significantly, but interrupts, soft interrupts and kernel threads can
559     + still expose valuable data to a potential attacker. See
560     + :ref:`guest_confinement`.
561     +
562     + - Interrupt isolation:
563     +
564     + Isolating the guest CPUs from interrupts can reduce the attack surface
565     + further, but still allows a malicious guest to explore a limited amount
566     + of host physical memory. This can at least be used to gain knowledge
567     + about the host address space layout. The interrupts which have a fixed
568     + affinity to the CPUs which run the untrusted guests can depending on
569     + the scenario still trigger soft interrupts and schedule kernel threads
570     + which might expose valuable information. See
571     + :ref:`interrupt_isolation`.
572     +
573     +The above three mitigation methods combined can provide protection to a
574     +certain degree, but the risk of the remaining attack surface has to be
575     +carefully analyzed. For full protection the following methods are
576     +available:
577     +
578     + - Disabling SMT:
579     +
580     + Disabling SMT and enforcing the L1D flushing provides the maximum
581     + amount of protection. This mitigation is not depending on any of the
582     + above mitigation methods.
583     +
584     + SMT control and L1D flushing can be tuned by the command line
585     + parameters 'nosmt', 'l1tf', 'kvm-intel.vmentry_l1d_flush' and at run
586     + time with the matching sysfs control files. See :ref:`smt_control`,
587     + :ref:`mitigation_control_command_line` and
588     + :ref:`mitigation_control_kvm`.
589     +
590     + - Disabling EPT:
591     +
592     + Disabling EPT provides the maximum amount of protection as well. It is
593     + not depending on any of the above mitigation methods. SMT can stay
594     + enabled and L1D flushing is not required, but the performance impact is
595     + significant.
596     +
597     + EPT can be disabled in the hypervisor via the 'kvm-intel.ept'
598     + parameter.
599     +
600     +3.4. Nested virtual machines
601     +""""""""""""""""""""""""""""
602     +
603     +When nested virtualization is in use, three operating systems are involved:
604     +the bare metal hypervisor, the nested hypervisor and the nested virtual
605     +machine. VMENTER operations from the nested hypervisor into the nested
606     +guest will always be processed by the bare metal hypervisor. If KVM is the
607     +bare metal hypervisor it will:
608     +
609     + - Flush the L1D cache on every switch from the nested hypervisor to the
610     + nested virtual machine, so that the nested hypervisor's secrets are not
611     + exposed to the nested virtual machine;
612     +
613     + - Flush the L1D cache on every switch from the nested virtual machine to
614     + the nested hypervisor; this is a complex operation, and flushing the L1D
615     + cache avoids that the bare metal hypervisor's secrets are exposed to the
616     + nested virtual machine;
617     +
618     + - Instruct the nested hypervisor to not perform any L1D cache flush. This
619     + is an optimization to avoid double L1D flushing.
620     +
621     +
622     +.. _default_mitigations:
623     +
624     +Default mitigations
625     +-------------------
626     +
627     + The kernel default mitigations for vulnerable processors are:
628     +
629     + - PTE inversion to protect against malicious user space. This is done
630     + unconditionally and cannot be controlled. The swap storage is limited
631     + to ~16TB.
632     +
633     + - L1D conditional flushing on VMENTER when EPT is enabled for
634     + a guest.
635     +
636     + The kernel does not by default enforce the disabling of SMT, which leaves
637     + SMT systems vulnerable when running untrusted guests with EPT enabled.
638     +
639     + The rationale for this choice is:
640     +
641     + - Force disabling SMT can break existing setups, especially with
642     + unattended updates.
643     +
644     + - If regular users run untrusted guests on their machine, then L1TF is
645     + just an add-on to other malware which might be embedded in an untrusted
646     + guest, e.g. spam-bots or attacks on the local network.
647     +
648     + There is no technical way to prevent a user from running untrusted code
649     + on their machines blindly.
650     +
651     + - It's technically extremely unlikely and from today's knowledge even
652     + impossible that L1TF can be exploited via the most popular attack
653     + mechanisms like JavaScript because these mechanisms have no way to
654     + control PTEs. If this were possible and no other mitigation were
655     + available, then the default might be different.
656     +
657     + - The administrators of cloud and hosting setups have to carefully
658     + analyze the risk for their scenarios and make the appropriate
659     + mitigation choices, which might even vary across their deployed
660     + machines and also result in other changes of their overall setup.
661     + There is no way for the kernel to provide a sensible default for this
662     + kind of scenario.
663     diff --git a/Documentation/hw-vuln/mds.rst b/Documentation/hw-vuln/mds.rst
664     new file mode 100644
665     index 000000000000..daf6fdac49a3
666     --- /dev/null
667     +++ b/Documentation/hw-vuln/mds.rst
668     @@ -0,0 +1,308 @@
669     +MDS - Microarchitectural Data Sampling
670     +======================================
671     +
672     +Microarchitectural Data Sampling is a hardware vulnerability which allows
673     +unprivileged speculative access to data which is available in various CPU
674     +internal buffers.
675     +
676     +Affected processors
677     +-------------------
678     +
679     +This vulnerability affects a wide range of Intel processors. The
680     +vulnerability is not present on:
681     +
682     + - Processors from AMD, Centaur and other non Intel vendors
683     +
684     + - Older processor models, where the CPU family is < 6
685     +
686     + - Some Atoms (Bonnell, Saltwell, Goldmont, GoldmontPlus)
687     +
688     + - Intel processors which have the ARCH_CAP_MDS_NO bit set in the
689     + IA32_ARCH_CAPABILITIES MSR.
690     +
691     +Whether a processor is affected or not can be read out from the MDS
692     +vulnerability file in sysfs. See :ref:`mds_sys_info`.
693     +
694     +Not all processors are affected by all variants of MDS, but the mitigation
695     +is identical for all of them so the kernel treats them as a single
696     +vulnerability.
697     +
698     +Related CVEs
699     +------------
700     +
701     +The following CVE entries are related to the MDS vulnerability:
702     +
703     + ============== ===== ===================================================
704     + CVE-2018-12126 MSBDS Microarchitectural Store Buffer Data Sampling
705     + CVE-2018-12130 MFBDS Microarchitectural Fill Buffer Data Sampling
706     + CVE-2018-12127 MLPDS Microarchitectural Load Port Data Sampling
707     + CVE-2019-11091 MDSUM Microarchitectural Data Sampling Uncacheable Memory
708     + ============== ===== ===================================================
709     +
710     +Problem
711     +-------
712     +
713     +When performing store, load or L1 refill operations, processors write data
714     +into temporary microarchitectural structures (buffers). The data in the
715     +buffer can be forwarded to load operations as an optimization.
716     +
717     +Under certain conditions, usually a fault/assist caused by a load
718     +operation, data unrelated to the load memory address can be speculatively
719     +forwarded from the buffers. Because the load operation causes a fault or
720     +assist and its result will be discarded, the forwarded data will not cause
721     +incorrect program execution or state changes. But a malicious operation
722     +may be able to forward this speculative data to a disclosure gadget which
723     +allows in turn to infer the value via a cache side channel attack.
724     +
725     +Because the buffers are potentially shared between Hyper-Threads,
726     +cross-Hyper-Thread attacks are possible.
727     +
728     +Deeper technical information is available in the MDS specific x86
729     +architecture section: :ref:`Documentation/x86/mds.rst <mds>`.
730     +
731     +
732     +Attack scenarios
733     +----------------
734     +
735     +Attacks against the MDS vulnerabilities can be mounted from malicious,
736     +unprivileged user space applications running on hosts or guests. Malicious
737     +guest OSes can obviously mount attacks as well.
738     +
739     +Contrary to other speculation based vulnerabilities the MDS vulnerability
740     +does not allow the attacker to control the memory target address. As a
741     +consequence the attacks are purely sampling based, but as demonstrated with
742     +the TLBleed attack, samples can be postprocessed successfully.
743     +
744     +Web-Browsers
745     +^^^^^^^^^^^^
746     +
747     + It's unclear whether attacks through Web-Browsers are possible at
748     + all. Exploitation through JavaScript is considered very unlikely,
749     + but other widely used web technologies like WebAssembly could possibly be
750     + abused.
751     +
752     +
753     +.. _mds_sys_info:
754     +
755     +MDS system information
756     +-----------------------
757     +
758     +The Linux kernel provides a sysfs interface to enumerate the current MDS
759     +status of the system: whether the system is vulnerable, and which
760     +mitigations are active. The relevant sysfs file is:
761     +
762     +/sys/devices/system/cpu/vulnerabilities/mds
763     +
764     +The possible values in this file are:
765     +
766     + .. list-table::
767     +
768     + * - 'Not affected'
769     + - The processor is not vulnerable
770     + * - 'Vulnerable'
771     + - The processor is vulnerable, but no mitigation enabled
772     + * - 'Vulnerable: Clear CPU buffers attempted, no microcode'
773     + - The processor is vulnerable but microcode is not updated.
774     +
775     + The mitigation is enabled on a best effort basis. See :ref:`vmwerv`
776     + * - 'Mitigation: Clear CPU buffers'
777     + - The processor is vulnerable and the CPU buffer clearing mitigation is
778     + enabled.
779     +
780     +If the processor is vulnerable then the following information is appended
781     +to the above information:
782     +
783     + ======================== ============================================
784     + 'SMT vulnerable' SMT is enabled
785     + 'SMT mitigated' SMT is enabled and mitigated
786     + 'SMT disabled' SMT is disabled
787     + 'SMT Host state unknown' Kernel runs in a VM, Host SMT state unknown
788     + ======================== ============================================
789     +
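As an illustration (editor's sketch), the mitigation part and the appended SMT part of this file can be split on the ';' separator::

    from pathlib import Path

    def mds_status():
        text = Path("/sys/devices/system/cpu/vulnerabilities/mds").read_text()
        parts = [p.strip() for p in text.strip().split(";")]
        mitigation = parts[0]
        smt = parts[1] if len(parts) > 1 else None  # absent on unaffected CPUs
        return mitigation, smt

    print(mds_status())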
790     +.. _vmwerv:
791     +
792     +Best effort mitigation mode
793     +^^^^^^^^^^^^^^^^^^^^^^^^^^^
794     +
795     + If the processor is vulnerable, but the availability of the microcode based
796     + mitigation mechanism is not advertised via CPUID the kernel selects a best
797     + effort mitigation mode. This mode invokes the mitigation instructions
798     + without a guarantee that they clear the CPU buffers.
799     +
800     + This is done to address virtualization scenarios where the host has the
801     + microcode update applied, but the hypervisor is not yet updated to expose
802     + the CPUID to the guest. If the host has updated microcode, the protection
803     + takes effect; otherwise a few CPU cycles are wasted pointlessly.
804     +
805     + The state in the mds sysfs file reflects this situation accordingly.
806     +
807     +
808     +Mitigation mechanism
809     +-------------------------
810     +
811     +The kernel detects the affected CPUs and the presence of the microcode
812     +which is required.
813     +
814     +If a CPU is affected and the microcode is available, then the kernel
815     +enables the mitigation by default. The mitigation can be controlled at boot
816     +time via a kernel command line option. See
817     +:ref:`mds_mitigation_control_command_line`.
818     +
819     +.. _cpu_buffer_clear:
820     +
821     +CPU buffer clearing
822     +^^^^^^^^^^^^^^^^^^^
823     +
824     + The mitigation for MDS clears the affected CPU buffers on return to user
825     + space and when entering a guest.
826     +
827     + If SMT is enabled it also clears the buffers on idle entry when the CPU
828     + is only affected by MSBDS and not any other MDS variant, because the
829     + other variants cannot be protected against cross Hyper-Thread attacks.
830     +
831     + For CPUs which are only affected by MSBDS the user space, guest and idle
832     + transition mitigations are sufficient and SMT is not affected.
833     +
834     +.. _virt_mechanism:
835     +
836     +Virtualization mitigation
837     +^^^^^^^^^^^^^^^^^^^^^^^^^
838     +
839     + The protection for host to guest transition depends on the L1TF
840     + vulnerability of the CPU:
841     +
842     + - CPU is affected by L1TF:
843     +
844     + If the L1D flush mitigation is enabled and up to date microcode is
845     + available, the L1D flush mitigation automatically protects the
846     + guest transition.
847     +
848     + If the L1D flush mitigation is disabled, the MDS mitigation is
849     + invoked explicitly when the host MDS mitigation is enabled.
850     +
851     + For details on L1TF and virtualization see:
852     + :ref:`Documentation/hw-vuln/l1tf.rst <mitigation_control_kvm>`.
853     +
854     + - CPU is not affected by L1TF:
855     +
856     + CPU buffers are flushed before entering the guest when the host MDS
857     + mitigation is enabled.
858     +
859     + The resulting MDS protection matrix for the host to guest transition:
860     +
861     + ============ ===== ============= ============ =================
862     + L1TF MDS VMX-L1FLUSH Host MDS MDS-State
863     +
864     + Don't care No Don't care N/A Not affected
865     +
866     + Yes Yes Disabled Off Vulnerable
867     +
868     + Yes Yes Disabled Full Mitigated
869     +
870     + Yes Yes Enabled Don't care Mitigated
871     +
872     + No Yes N/A Off Vulnerable
873     +
874     + No Yes N/A Full Mitigated
875     + ============ ===== ============= ============ =================
876     +
877     + This only covers the host to guest transition, i.e. prevents leakage from
878     + host to guest, but does not protect the guest internally. Guests need to
879     + have their own protections.
880     +
881     +.. _xeon_phi:
882     +
883     +XEON PHI specific considerations
884     +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
885     +
886     + The XEON PHI processor family is affected by MSBDS which can be exploited
887     + cross Hyper-Threads when entering idle states. Some XEON PHI variants allow
888     + to use MWAIT in user space (Ring 3), which opens a potential attack vector
889     + for malicious user space. The exposure can be disabled on the kernel
890     + command line with the 'ring3mwait=disable' command line option.
891     +
892     + XEON PHI is not affected by the other MDS variants and MSBDS is mitigated
893     + before the CPU enters an idle state. As XEON PHI is not affected by L1TF
894     + either, disabling SMT is not required for full protection.
895     +
896     +.. _mds_smt_control:
897     +
898     +SMT control
899     +^^^^^^^^^^^
900     +
901     + All MDS variants except MSBDS can be attacked cross Hyper-Threads. That
902     + means on CPUs which are affected by MFBDS or MLPDS it is necessary to
903     + disable SMT for full protection. These are most of the affected CPUs; the
904     + exception is XEON PHI, see :ref:`xeon_phi`.
905     +
906     + Disabling SMT can have a significant performance impact, but the impact
907     + depends on the type of workloads.
908     +
909     + See the relevant chapter in the L1TF mitigation documentation for details:
910     + :ref:`Documentation/hw-vuln/l1tf.rst <smt_control>`.
911     +
912     +
913     +.. _mds_mitigation_control_command_line:
914     +
915     +Mitigation control on the kernel command line
916     +---------------------------------------------
917     +
918     +The kernel command line allows to control the MDS mitigations at boot
919     +time with the option "mds=". The valid arguments for this option are:
920     +
921     + ============ =============================================================
922     + full If the CPU is vulnerable, enable all available mitigations
923     + for the MDS vulnerability, CPU buffer clearing on exit to
924     + userspace and when entering a VM. Idle transitions are
925     + protected as well if SMT is enabled.
926     +
927     + It does not automatically disable SMT.
928     +
929     + full,nosmt The same as mds=full, with SMT disabled on vulnerable
930     + CPUs. This is the complete mitigation.
931     +
932     + off Disables MDS mitigations completely.
933     +
934     + ============ =============================================================
935     +
936     +Not specifying this option is equivalent to "mds=full".
937     +
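A small illustrative check (editor's sketch, using the sysfs and /proc paths documented above) that warns when the MDS mitigation is in use but SMT is still active, i.e. the mitigation is not the complete one::

    def cmdline_option(name, default):
        with open("/proc/cmdline") as f:
            for opt in f.read().split():
                if opt.startswith(name + "="):
                    return opt.split("=", 1)[1]
        return default

    mds_mode = cmdline_option("mds", "full")
    with open("/sys/devices/system/cpu/smt/active") as f:
        smt_active = f.read().strip() == "1"

    if mds_mode != "off" and smt_active:
        print("MDS mitigation on, but SMT active: cross-thread attacks remain")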
938     +
939     +Mitigation selection guide
940     +--------------------------
941     +
942     +1. Trusted userspace
943     +^^^^^^^^^^^^^^^^^^^^
944     +
945     + If all userspace applications are from a trusted source and do not
946     + execute untrusted code which is supplied externally, then the mitigation
947     + can be disabled.
948     +
949     +
950     +2. Virtualization with trusted guests
951     +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
952     +
953     + The same considerations as above versus trusted user space apply.
954     +
955     +3. Virtualization with untrusted guests
956     +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
957     +
958     + The protection depends on the state of the L1TF mitigations.
959     + See :ref:`virt_mechanism`.
960     +
961     + If the MDS mitigation is enabled and SMT is disabled, guest to host and
962     + guest to guest attacks are prevented.
963     +
964     +.. _mds_default_mitigations:
965     +
966     +Default mitigations
967     +-------------------
968     +
969     + The kernel default mitigations for vulnerable processors are:
970     +
971     + - Enable CPU buffer clearing
972     +
973     + The kernel does not by default enforce the disabling of SMT, which leaves
974     + SMT systems vulnerable when running untrusted code. The same rationale as
975     + for L1TF applies.
976     + See :ref:`Documentation/hw-vuln/l1tf.rst <default_mitigations>`.
977     diff --git a/Documentation/index.rst b/Documentation/index.rst
978     index 213399aac757..f95c58dbbbc3 100644
979     --- a/Documentation/index.rst
980     +++ b/Documentation/index.rst
981     @@ -12,7 +12,6 @@ Contents:
982     :maxdepth: 2
983    
984     kernel-documentation
985     - l1tf
986     development-process/index
987     dev-tools/tools
988     driver-api/index
989     @@ -20,6 +19,24 @@ Contents:
990     gpu/index
991     80211/index
992    
993     +This section describes CPU vulnerabilities and their mitigations.
994     +
995     +.. toctree::
996     + :maxdepth: 1
997     +
998     + hw-vuln/index
999     +
1000     +Architecture-specific documentation
1001     +-----------------------------------
1002     +
1003     +These books provide programming details about architecture-specific
1004     +implementation.
1005     +
1006     +.. toctree::
1007     + :maxdepth: 2
1008     +
1009     + x86/index
1010     +
1011     Indices and tables
1012     ==================
1013    
1014     diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
1015     index a1472b48ee22..55a9bbbcf5e1 100644
1016     --- a/Documentation/kernel-parameters.txt
1017     +++ b/Documentation/kernel-parameters.txt
1018     @@ -2076,10 +2076,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
1019     off
1020     Disables hypervisor mitigations and doesn't
1021     emit any warnings.
1022     + It also drops the swap size and available
1023     + RAM limit restriction on both hypervisor and
1024     + bare metal.
1025    
1026     Default is 'flush'.
1027    
1028     - For details see: Documentation/admin-guide/l1tf.rst
1029     + For details see: Documentation/hw-vuln/l1tf.rst
1030    
1031     l2cr= [PPC]
1032    
1033     @@ -2322,6 +2325,32 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
1034     Format: <first>,<last>
1035     Specifies range of consoles to be captured by the MDA.
1036    
1037     + mds= [X86,INTEL]
1038     + Control mitigation for the Micro-architectural Data
1039     + Sampling (MDS) vulnerability.
1040     +
1041     + Certain CPUs are vulnerable to an exploit against CPU
1042     + internal buffers which can forward information to a
1043     + disclosure gadget under certain conditions.
1044     +
1045     + In vulnerable processors, the speculatively
1046     + forwarded data can be used in a cache side channel
1047     + attack, to access data to which the attacker does
1048     + not have direct access.
1049     +
1050     + This parameter controls the MDS mitigation. The
1051     + options are:
1052     +
1053     + full - Enable MDS mitigation on vulnerable CPUs
1054     + full,nosmt - Enable MDS mitigation and disable
1055     + SMT on vulnerable CPUs
1056     + off - Unconditionally disable MDS mitigation
1057     +
1058     + Not specifying this option is equivalent to
1059     + mds=full.
1060     +
1061     + For details see: Documentation/hw-vuln/mds.rst
1062     +
1063     mem=nn[KMG] [KNL,BOOT] Force usage of a specific amount of memory
1064     Amount of memory to be used when the kernel is not able
1065     to see the whole system memory or for test.
1066     @@ -2444,6 +2473,38 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
1067     in the "bleeding edge" mini2440 support kernel at
1068     http://repo.or.cz/w/linux-2.6/mini2440.git
1069    
1070     + mitigations=
1071     + [X86] Control optional mitigations for CPU
1072     + vulnerabilities. This is a set of curated,
1073     + arch-independent options, each of which is an
1074     + aggregation of existing arch-specific options.
1075     +
1076     + off
1077     + Disable all optional CPU mitigations. This
1078     + improves system performance, but it may also
1079     + expose users to several CPU vulnerabilities.
1080     + Equivalent to: nopti [X86]
1081     + nospectre_v2 [X86]
1082     + spectre_v2_user=off [X86]
1083     + spec_store_bypass_disable=off [X86]
1084     + l1tf=off [X86]
1085     + mds=off [X86]
1086     +
1087     + auto (default)
1088     + Mitigate all CPU vulnerabilities, but leave SMT
1089     + enabled, even if it's vulnerable. This is for
1090     + users who don't want to be surprised by SMT
1091     + getting disabled across kernel upgrades, or who
1092     + have other ways of avoiding SMT-based attacks.
1093     + Equivalent to: (default behavior)
1094     +
1095     + auto,nosmt
1096     + Mitigate all CPU vulnerabilities, disabling SMT
1097     + if needed. This is for users who always want to
1098     + be fully mitigated, even if it means losing SMT.
1099     + Equivalent to: l1tf=flush,nosmt [X86]
1100     + mds=full,nosmt [X86]
1101     +
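
For illustration (editor's sketch), the net effect of whatever mitigations= resolved to can be inspected by dumping every file in the vulnerabilities directory::

    import os

    VULN_DIR = "/sys/devices/system/cpu/vulnerabilities"

    for name in sorted(os.listdir(VULN_DIR)):
        with open(os.path.join(VULN_DIR, name)) as f:
            print("%-20s %s" % (name, f.read().strip()))
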
1102     mminit_loglevel=
1103     [KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this
1104     parameter allows control of the logging verbosity for
1105     @@ -4030,9 +4091,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
1106    
1107     spectre_v2= [X86] Control mitigation of Spectre variant 2
1108     (indirect branch speculation) vulnerability.
1109     + The default operation protects the kernel from
1110     + user space attacks.
1111    
1112     - on - unconditionally enable
1113     - off - unconditionally disable
1114     + on - unconditionally enable, implies
1115     + spectre_v2_user=on
1116     + off - unconditionally disable, implies
1117     + spectre_v2_user=off
1118     auto - kernel detects whether your CPU model is
1119     vulnerable
1120    
1121     @@ -4042,6 +4107,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
1122     CONFIG_RETPOLINE configuration option, and the
1123     compiler with which the kernel was built.
1124    
1125     + Selecting 'on' will also enable the mitigation
1126     + against user space to user space task attacks.
1127     +
1128     + Selecting 'off' will disable both the kernel and
1129     + the user space protections.
1130     +
1131     Specific mitigations can also be selected manually:
1132    
1133     retpoline - replace indirect branches
1134     @@ -4051,6 +4122,48 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
1135     Not specifying this option is equivalent to
1136     spectre_v2=auto.
1137    
1138     + spectre_v2_user=
1139     + [X86] Control mitigation of Spectre variant 2
1140     + (indirect branch speculation) vulnerability between
1141     + user space tasks
1142     +
1143     + on - Unconditionally enable mitigations. Is
1144     + enforced by spectre_v2=on
1145     +
1146     + off - Unconditionally disable mitigations. Is
1147     + enforced by spectre_v2=off
1148     +
1149     + prctl - Indirect branch speculation is enabled,
1150     + but mitigation can be enabled via prctl
1151     + per thread. The mitigation control state
1152     + is inherited on fork.
1153     +
1154     + prctl,ibpb
1155     + - Like "prctl" above, but only STIBP is
1156     + controlled per thread. IBPB is issued
1157     + always when switching between different user
1158     + space processes.
1159     +
1160     + seccomp
1161     + - Same as "prctl" above, but all seccomp
1162     + threads will enable the mitigation unless
1163     + they explicitly opt out.
1164     +
1165     + seccomp,ibpb
1166     + - Like "seccomp" above, but only STIBP is
1167     + controlled per thread. IBPB is issued
1168     + always when switching between different
1169     + user space processes.
1170     +
1171     + auto - Kernel selects the mitigation depending on
1172     + the available CPU features and vulnerability.
1173     +
1174     + Default mitigation:
1175     + If CONFIG_SECCOMP=y then "seccomp", otherwise "prctl"
1176     +
1177     + Not specifying this option is equivalent to
1178     + spectre_v2_user=auto.
1179     +
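As a hedged illustration of the per-thread prctl control mentioned above (the constant values are assumed copies from <linux/prctl.h>; verify them against your headers), a thread can opt in to the indirect branch speculation mitigation like this::

    import ctypes

    libc = ctypes.CDLL(None, use_errno=True)

    # Assumed values from <linux/prctl.h>:
    PR_GET_SPECULATION_CTRL = 52
    PR_SET_SPECULATION_CTRL = 53
    PR_SPEC_INDIRECT_BRANCH = 1
    PR_SPEC_DISABLE = 1 << 2     # disable speculation == enable mitigation

    def restrict_indirect_branch_speculation():
        ret = libc.prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH,
                         PR_SPEC_DISABLE, 0, 0)
        if ret != 0:
            # Fails e.g. when spectre_v2_user= is 'on', 'off' or unsupported.
            raise OSError(ctypes.get_errno(), "PR_SET_SPECULATION_CTRL failed")

    def speculation_state():
        return libc.prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH,
                          0, 0, 0)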
1180     spec_store_bypass_disable=
1181     [HW] Control Speculative Store Bypass (SSB) Disable mitigation
1182     (Speculative Store Bypass vulnerability)
1183     diff --git a/Documentation/l1tf.rst b/Documentation/l1tf.rst
1184     deleted file mode 100644
1185     index bae52b845de0..000000000000
1186     --- a/Documentation/l1tf.rst
1187     +++ /dev/null
1188     @@ -1,610 +0,0 @@
1189     -L1TF - L1 Terminal Fault
1190     -========================
1191     -
1192     -L1 Terminal Fault is a hardware vulnerability which allows unprivileged
1193     -speculative access to data which is available in the Level 1 Data Cache
1194     -when the page table entry controlling the virtual address, which is used
1195     -for the access, has the Present bit cleared or other reserved bits set.
1196     -
1197     -Affected processors
1198     --------------------
1199     -
1200     -This vulnerability affects a wide range of Intel processors. The
1201     -vulnerability is not present on:
1202     -
1203     - - Processors from AMD, Centaur and other non Intel vendors
1204     -
1205     - - Older processor models, where the CPU family is < 6
1206     -
1207     - - A range of Intel ATOM processors (Cedarview, Cloverview, Lincroft,
1208     - Penwell, Pineview, Silvermont, Airmont, Merrifield)
1209     -
1210     - - The Intel XEON PHI family
1211     -
1212     - - Intel processors which have the ARCH_CAP_RDCL_NO bit set in the
1213     - IA32_ARCH_CAPABILITIES MSR. If the bit is set the CPU is not affected
1214     - by the Meltdown vulnerability either. These CPUs should become
1215     - available by end of 2018.
1216     -
1217     -Whether a processor is affected or not can be read out from the L1TF
1218     -vulnerability file in sysfs. See :ref:`l1tf_sys_info`.
1219     -
1220     -Related CVEs
1221     -------------
1222     -
1223     -The following CVE entries are related to the L1TF vulnerability:
1224     -
1225     - ============= ================= ==============================
1226     - CVE-2018-3615 L1 Terminal Fault SGX related aspects
1227     - CVE-2018-3620 L1 Terminal Fault OS, SMM related aspects
1228     - CVE-2018-3646 L1 Terminal Fault Virtualization related aspects
1229     - ============= ================= ==============================
1230     -
1231     -Problem
1232     --------
1233     -
1234     -If an instruction accesses a virtual address for which the relevant page
1235     -table entry (PTE) has the Present bit cleared or other reserved bits set,
1236     -then speculative execution ignores the invalid PTE and loads the referenced
1237     -data if it is present in the Level 1 Data Cache, as if the page referenced
1238     -by the address bits in the PTE was still present and accessible.
1239     -
1240     -While this is a purely speculative mechanism and the instruction will raise
1241     -a page fault when it is retired eventually, the pure act of loading the
1242     -data and making it available to other speculative instructions opens up the
1243     -opportunity for side channel attacks to unprivileged malicious code,
1244     -similar to the Meltdown attack.
1245     -
1246     -While Meltdown breaks the user space to kernel space protection, L1TF
1247     -allows to attack any physical memory address in the system and the attack
1248     -works across all protection domains. It allows an attack of SGX and also
1249     -works from inside virtual machines because the speculation bypasses the
1250     -extended page table (EPT) protection mechanism.
1251     -
1252     -
1253     -Attack scenarios
1254     -----------------
1255     -
1256     -1. Malicious user space
1257     -^^^^^^^^^^^^^^^^^^^^^^^
1258     -
1259     - Operating Systems store arbitrary information in the address bits of a
1260     - PTE which is marked non present. This allows a malicious user space
1261     - application to attack the physical memory to which these PTEs resolve.
1262     - In some cases user-space can maliciously influence the information
1263     - encoded in the address bits of the PTE, thus making attacks more
1264     - deterministic and more practical.
1265     -
1266     - The Linux kernel contains a mitigation for this attack vector, PTE
1267     - inversion, which is permanently enabled and has no performance
1268     - impact. The kernel ensures that the address bits of PTEs, which are not
1269     - marked present, never point to cacheable physical memory space.
1270     -
1271     - A system with an up to date kernel is protected against attacks from
1272     - malicious user space applications.
1273     -
1274     -2. Malicious guest in a virtual machine
1275     -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
1276     -
1277     - The fact that L1TF breaks all domain protections allows malicious guest
1278     - OSes, which can control the PTEs directly, and malicious guest user
1279     - space applications, which run on an unprotected guest kernel lacking the
1280     - PTE inversion mitigation for L1TF, to attack physical host memory.
1281     -
1282     - A special aspect of L1TF in the context of virtualization is symmetric
1283     - multi threading (SMT). The Intel implementation of SMT is called
1284     - HyperThreading. The fact that Hyperthreads on the affected processors
1285     - share the L1 Data Cache (L1D) is important for this. As the flaw allows
1286     - only to attack data which is present in L1D, a malicious guest running
1287     - on one Hyperthread can attack the data which is brought into the L1D by
1288     - the context which runs on the sibling Hyperthread of the same physical
1289     - core. This context can be host OS, host user space or a different guest.
1290     -
1291     - If the processor does not support Extended Page Tables, the attack is
1292     - only possible, when the hypervisor does not sanitize the content of the
1293     - effective (shadow) page tables.
1294     -
1295     - While solutions exist to mitigate these attack vectors fully, these
1296     - mitigations are not enabled by default in the Linux kernel because they
1297     - can affect performance significantly. The kernel provides several
1298     - mechanisms which can be utilized to address the problem depending on the
1299     - deployment scenario. The mitigations, their protection scope and impact
1300     - are described in the next sections.
1301     -
1302     - The default mitigations and the rationale for choosing them are explained
1303     - at the end of this document. See :ref:`default_mitigations`.
1304     -
1305     -.. _l1tf_sys_info:
1306     -
1307     -L1TF system information
1308     ------------------------
1309     -
1310     -The Linux kernel provides a sysfs interface to enumerate the current L1TF
1311     -status of the system: whether the system is vulnerable, and which
1312     -mitigations are active. The relevant sysfs file is:
1313     -
1314     -/sys/devices/system/cpu/vulnerabilities/l1tf
1315     -
1316     -The possible values in this file are:
1317     -
1318     - =========================== ===============================
1319     - 'Not affected' The processor is not vulnerable
1320     - 'Mitigation: PTE Inversion' The host protection is active
1321     - =========================== ===============================
1322     -
1323     -If KVM/VMX is enabled and the processor is vulnerable then the following
1324     -information is appended to the 'Mitigation: PTE Inversion' part:
1325     -
1326     - - SMT status:
1327     -
1328     - ===================== ================
1329     - 'VMX: SMT vulnerable' SMT is enabled
1330     - 'VMX: SMT disabled' SMT is disabled
1331     - ===================== ================
1332     -
1333     - - L1D Flush mode:
1334     -
1335     - ================================ ====================================
1336     - 'L1D vulnerable' L1D flushing is disabled
1337     -
1338     - 'L1D conditional cache flushes' L1D flush is conditionally enabled
1339     -
1340     - 'L1D cache flushes' L1D flush is unconditionally enabled
1341     - ================================ ====================================
1342     -
1343     -The resulting grade of protection is discussed in the following sections.
1344     -
1345     -
1346     -Host mitigation mechanism
1347     --------------------------
1348     -
1349     -The kernel is unconditionally protected against L1TF attacks from malicious
1350     -user space running on the host.
1351     -
1352     -
1353     -Guest mitigation mechanisms
1354     ----------------------------
1355     -
1356     -.. _l1d_flush:
1357     -
1358     -1. L1D flush on VMENTER
1359     -^^^^^^^^^^^^^^^^^^^^^^^
1360     -
1361     - To make sure that a guest cannot attack data which is present in the L1D
1362     - the hypervisor flushes the L1D before entering the guest.
1363     -
1364     - Flushing the L1D evicts not only the data which should not be accessed
1365     - by a potentially malicious guest, it also flushes the guest
1366     - data. Flushing the L1D has a performance impact as the processor has to
1367     - bring the flushed guest data back into the L1D. Depending on the
1368     - frequency of VMEXIT/VMENTER and the type of computations in the guest
1369     - performance degradation in the range of 1% to 50% has been observed. For
1370     - scenarios where guest VMEXIT/VMENTER are rare the performance impact is
1371     - minimal. Virtio and mechanisms like posted interrupts are designed to
1372     - confine the VMEXITs to a bare minimum, but specific configurations and
1373     - application scenarios might still suffer from a high VMEXIT rate.
1374     -
1375     - The kernel provides two L1D flush modes:
1376     - - conditional ('cond')
1377     - - unconditional ('always')
1378     -
1379     - The conditional mode avoids L1D flushing after VMEXITs which execute
1380     - only audited code paths before the corresponding VMENTER. These code
1381     - paths have been verified that they cannot expose secrets or other
1382     - interesting data to an attacker, but they can leak information about the
1383     - address space layout of the hypervisor.
1384     -
1385     - Unconditional mode flushes L1D on all VMENTER invocations and provides
1386     - maximum protection. It has a higher overhead than the conditional
1387     - mode. The overhead cannot be quantified correctly as it depends on the
1388     - workload scenario and the resulting number of VMEXITs.
1389     -
1390     - The general recommendation is to enable L1D flush on VMENTER. The kernel
1391     - defaults to conditional mode on affected processors.
1392     -
1393     - **Note**, that L1D flush does not prevent the SMT problem because the
1394     - sibling thread will also bring back its data into the L1D which makes it
1395     - attackable again.
1396     -
1397     - L1D flush can be controlled by the administrator via the kernel command
1398     - line and sysfs control files. See :ref:`mitigation_control_command_line`
1399     - and :ref:`mitigation_control_kvm`.
1400     -
1401     -.. _guest_confinement:
1402     -
1403     -2. Guest VCPU confinement to dedicated physical cores
1404     -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
1405     -
1406     - To address the SMT problem, it is possible to make a guest or a group of
1407     - guests affine to one or more physical cores. The proper mechanism for
1408     - that is to utilize exclusive cpusets to ensure that no other guest or
1409     - host tasks can run on these cores.
1410     -
1411     - If only a single guest or related guests run on sibling SMT threads on
1412     - the same physical core then they can only attack their own memory and
1413     - restricted parts of the host memory.
1414     -
1415     - Host memory is attackable, when one of the sibling SMT threads runs in
1416     - host OS (hypervisor) context and the other in guest context. The amount
1417     - of valuable information from the host OS context depends on the context
1418     - which the host OS executes, i.e. interrupts, soft interrupts and kernel
1419     - threads. The amount of valuable data from these contexts cannot be
1420     - declared as non-interesting for an attacker without deep inspection of
1421     - the code.
1422     -
1423     - **Note**, that assigning guests to a fixed set of physical cores affects
1424     - the ability of the scheduler to do load balancing and might have
1425     - negative effects on CPU utilization depending on the hosting
1426     - scenario. Disabling SMT might be a viable alternative for particular
1427     - scenarios.
1428     -
1429     - For further information about confining guests to a single or to a group
1430     - of cores consult the cpusets documentation:
1431     -
1432     - https://www.kernel.org/doc/Documentation/cgroup-v1/cpusets.txt
1433     -
1434     -.. _interrupt_isolation:
1435     -
1436     -3. Interrupt affinity
1437     -^^^^^^^^^^^^^^^^^^^^^
1438     -
1439     - Interrupts can be made affine to logical CPUs. This is not universally
1440     - true because there are types of interrupts which are truly per CPU
1441     - interrupts, e.g. the local timer interrupt. Aside of that multi queue
1442     - devices affine their interrupts to single CPUs or groups of CPUs per
1443     - queue without allowing the administrator to control the affinities.
1444     -
1445     - Moving the interrupts, which can be affinity controlled, away from CPUs
1446     - which run untrusted guests, reduces the attack vector space.
1447     -
1448     - Whether the interrupts which are affine to CPUs, which run untrusted
1449     - guests, provide interesting data for an attacker depends on the system
1450     - configuration and the scenarios which run on the system. While for some
1451     - of the interrupts it can be assumed that they won't expose interesting
1452     - information beyond exposing hints about the host OS memory layout, there
1453     - is no way to make general assumptions.
1454     -
1455     - Interrupt affinity can be controlled by the administrator via the
1456     - /proc/irq/$NR/smp_affinity[_list] files. Limited documentation is
1457     - available at:
1458     -
1459     - https://www.kernel.org/doc/Documentation/IRQ-affinity.txt
1460     -
1461     -.. _smt_control:
1462     -
1463     -4. SMT control
1464     -^^^^^^^^^^^^^^
1465     -
1466     - To prevent the SMT issues of L1TF it might be necessary to disable SMT
1467     - completely. Disabling SMT can have a significant performance impact, but
1468     - the impact depends on the hosting scenario and the type of workloads.
1469     - The impact of disabling SMT needs also to be weighted against the impact
1470     - of other mitigation solutions like confining guests to dedicated cores.
1471     -
1472     - The kernel provides a sysfs interface to retrieve the status of SMT and
1473     - to control it. It also provides a kernel command line interface to
1474     - control SMT.
1475     -
1476     - The kernel command line interface consists of the following options:
1477     -
1478     - =========== ==========================================================
1479     - nosmt Affects the bring up of the secondary CPUs during boot. The
1480     - kernel tries to bring all present CPUs online during the
1481     - boot process. "nosmt" makes sure that from each physical
1482     - core only one - the so called primary (hyper) thread is
1483     - activated. Due to a design flaw of Intel processors related
1484     - to Machine Check Exceptions the non primary siblings have
1485     - to be brought up at least partially and are then shut down
1486     - again. "nosmt" can be undone via the sysfs interface.
1487     -
1488     - nosmt=force Has the same effect as "nosmt" but it does not allow to
1489     - undo the SMT disable via the sysfs interface.
1490     - =========== ==========================================================
1491     -
1492     - The sysfs interface provides two files:
1493     -
1494     - - /sys/devices/system/cpu/smt/control
1495     - - /sys/devices/system/cpu/smt/active
1496     -
1497     - /sys/devices/system/cpu/smt/control:
1498     -
1499     - This file allows to read out the SMT control state and provides the
1500     - ability to disable or (re)enable SMT. The possible states are:
1501     -
1502     - ============== ===================================================
1503     - on SMT is supported by the CPU and enabled. All
1504     - logical CPUs can be onlined and offlined without
1505     - restrictions.
1506     -
1507     - off SMT is supported by the CPU and disabled. Only
1508     - the so called primary SMT threads can be onlined
1509     - and offlined without restrictions. An attempt to
1510     - online a non-primary sibling is rejected
1511     -
1512     - forceoff Same as 'off' but the state cannot be controlled.
1513     - Attempts to write to the control file are rejected.
1514     -
1515     - notsupported The processor does not support SMT. It's therefore
1516     - not affected by the SMT implications of L1TF.
1517     - Attempts to write to the control file are rejected.
1518     - ============== ===================================================
1519     -
1520     - The possible states which can be written into this file to control SMT
1521     - state are:
1522     -
1523     - - on
1524     - - off
1525     - - forceoff
1526     -
1527     - /sys/devices/system/cpu/smt/active:
1528     -
1529     - This file reports whether SMT is enabled and active, i.e. if on any
1530     - physical core two or more sibling threads are online.
1531     -
1532     - SMT control is also possible at boot time via the l1tf kernel command
1533     - line parameter in combination with L1D flush control. See
1534     - :ref:`mitigation_control_command_line`.
1535     -
1536     -5. Disabling EPT
1537     -^^^^^^^^^^^^^^^^
1538     -
1539     - Disabling EPT for virtual machines provides full mitigation for L1TF even
1540     - with SMT enabled, because the effective page tables for guests are
1541     - managed and sanitized by the hypervisor. Though disabling EPT has a
1542     - significant performance impact especially when the Meltdown mitigation
1543     - KPTI is enabled.
1544     -
1545     - EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter.
1546     -
1547     -There is ongoing research and development for new mitigation mechanisms to
1548     -address the performance impact of disabling SMT or EPT.
1549     -
1550     -.. _mitigation_control_command_line:
1551     -
1552     -Mitigation control on the kernel command line
1553     ----------------------------------------------
1554     -
1555     -The kernel command line allows to control the L1TF mitigations at boot
1556     -time with the option "l1tf=". The valid arguments for this option are:
1557     -
1558     - ============ =============================================================
1559     - full Provides all available mitigations for the L1TF
1560     - vulnerability. Disables SMT and enables all mitigations in
1561     - the hypervisors, i.e. unconditional L1D flushing
1562     -
1563     - SMT control and L1D flush control via the sysfs interface
1564     - is still possible after boot. Hypervisors will issue a
1565     - warning when the first VM is started in a potentially
1566     - insecure configuration, i.e. SMT enabled or L1D flush
1567     - disabled.
1568     -
1569     - full,force Same as 'full', but disables SMT and L1D flush runtime
1570     - control. Implies the 'nosmt=force' command line option.
1571     - (i.e. sysfs control of SMT is disabled.)
1572     -
1573     - flush Leaves SMT enabled and enables the default hypervisor
1574     - mitigation, i.e. conditional L1D flushing
1575     -
1576     - SMT control and L1D flush control via the sysfs interface
1577     - is still possible after boot. Hypervisors will issue a
1578     - warning when the first VM is started in a potentially
1579     - insecure configuration, i.e. SMT enabled or L1D flush
1580     - disabled.
1581     -
1582     - flush,nosmt Disables SMT and enables the default hypervisor mitigation,
1583     - i.e. conditional L1D flushing.
1584     -
1585     - SMT control and L1D flush control via the sysfs interface
1586     - is still possible after boot. Hypervisors will issue a
1587     - warning when the first VM is started in a potentially
1588     - insecure configuration, i.e. SMT enabled or L1D flush
1589     - disabled.
1590     -
1591     - flush,nowarn Same as 'flush', but hypervisors will not warn when a VM is
1592     - started in a potentially insecure configuration.
1593     -
1594     - off Disables hypervisor mitigations and doesn't emit any
1595     - warnings.
1596     - ============ =============================================================
1597     -
1598     -The default is 'flush'. For details about L1D flushing see :ref:`l1d_flush`.
1599     -
1600     -
1601     -.. _mitigation_control_kvm:
1602     -
1603     -Mitigation control for KVM - module parameter
1604     --------------------------------------------------------------
1605     -
1606     -The KVM hypervisor mitigation mechanism, flushing the L1D cache when
1607     -entering a guest, can be controlled with a module parameter.
1608     -
1609     -The option/parameter is "kvm-intel.vmentry_l1d_flush=". It takes the
1610     -following arguments:
1611     -
1612     - ============ ==============================================================
1613     - always L1D cache flush on every VMENTER.
1614     -
1615     - cond Flush L1D on VMENTER only when the code between VMEXIT and
1616     - VMENTER can leak host memory which is considered
1617     - interesting for an attacker. This still can leak host memory
1618     - which allows e.g. to determine the hosts address space layout.
1619     -
1620     - never Disables the mitigation
1621     - ============ ==============================================================
1622     -
1623     -The parameter can be provided on the kernel command line, as a module
1624     -parameter when loading the modules and at runtime modified via the sysfs
1625     -file:
1626     -
1627     -/sys/module/kvm_intel/parameters/vmentry_l1d_flush
1628     -
1629     -The default is 'cond'. If 'l1tf=full,force' is given on the kernel command
1630     -line, then 'always' is enforced and the kvm-intel.vmentry_l1d_flush
1631     -module parameter is ignored and writes to the sysfs file are rejected.
1632     -
1633     -
1634     -Mitigation selection guide
1635     ---------------------------
1636     -
1637     -1. No virtualization in use
1638     -^^^^^^^^^^^^^^^^^^^^^^^^^^^
1639     -
1640     - The system is protected by the kernel unconditionally and no further
1641     - action is required.
1642     -
1643     -2. Virtualization with trusted guests
1644     -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
1645     -
1646     - If the guest comes from a trusted source and the guest OS kernel is
1647     - guaranteed to have the L1TF mitigations in place the system is fully
1648     - protected against L1TF and no further action is required.
1649     -
1650     - To avoid the overhead of the default L1D flushing on VMENTER the
1651     - administrator can disable the flushing via the kernel command line and
1652     - sysfs control files. See :ref:`mitigation_control_command_line` and
1653     - :ref:`mitigation_control_kvm`.
1654     -
1655     -
1656     -3. Virtualization with untrusted guests
1657     -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
1658     -
1659     -3.1. SMT not supported or disabled
1660     -""""""""""""""""""""""""""""""""""
1661     -
1662     - If SMT is not supported by the processor or disabled in the BIOS or by
1663     - the kernel, it's only required to enforce L1D flushing on VMENTER.
1664     -
1665     - Conditional L1D flushing is the default behaviour and can be tuned. See
1666     - :ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`.
1667     -
1668     -3.2. EPT not supported or disabled
1669     -""""""""""""""""""""""""""""""""""
1670     -
1671     - If EPT is not supported by the processor or disabled in the hypervisor,
1672     - the system is fully protected. SMT can stay enabled and L1D flushing on
1673     - VMENTER is not required.
1674     -
1675     - EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter.
1676     -
1677     -3.3. SMT and EPT supported and active
1678     -"""""""""""""""""""""""""""""""""""""
1679     -
1680     - If SMT and EPT are supported and active then various degrees of
1681     - mitigations can be employed:
1682     -
1683     - - L1D flushing on VMENTER:
1684     -
1685     - L1D flushing on VMENTER is the minimal protection requirement, but it
1686     - is only potent in combination with other mitigation methods.
1687     -
1688     - Conditional L1D flushing is the default behaviour and can be tuned. See
1689     - :ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`.
1690     -
1691     - - Guest confinement:
1692     -
1693     - Confinement of guests to a single or a group of physical cores which
1694     - are not running any other processes, can reduce the attack surface
1695     - significantly, but interrupts, soft interrupts and kernel threads can
1696     - still expose valuable data to a potential attacker. See
1697     - :ref:`guest_confinement`.
1698     -
1699     - - Interrupt isolation:
1700     -
1701     - Isolating the guest CPUs from interrupts can reduce the attack surface
1702     - further, but still allows a malicious guest to explore a limited amount
1703     - of host physical memory. This can at least be used to gain knowledge
1704     - about the host address space layout. The interrupts which have a fixed
1705     - affinity to the CPUs which run the untrusted guests can depending on
1706     - the scenario still trigger soft interrupts and schedule kernel threads
1707     - which might expose valuable information. See
1708     - :ref:`interrupt_isolation`.
1709     -
1710     -The above three mitigation methods combined can provide protection to a
1711     -certain degree, but the risk of the remaining attack surface has to be
1712     -carefully analyzed. For full protection the following methods are
1713     -available:
1714     -
1715     - - Disabling SMT:
1716     -
1717     - Disabling SMT and enforcing the L1D flushing provides the maximum
1718     - amount of protection. This mitigation is not depending on any of the
1719     - above mitigation methods.
1720     -
1721     - SMT control and L1D flushing can be tuned by the command line
1722     - parameters 'nosmt', 'l1tf', 'kvm-intel.vmentry_l1d_flush' and at run
1723     - time with the matching sysfs control files. See :ref:`smt_control`,
1724     - :ref:`mitigation_control_command_line` and
1725     - :ref:`mitigation_control_kvm`.
1726     -
1727     - - Disabling EPT:
1728     -
1729     - Disabling EPT provides the maximum amount of protection as well. It is
1730     - not depending on any of the above mitigation methods. SMT can stay
1731     - enabled and L1D flushing is not required, but the performance impact is
1732     - significant.
1733     -
1734     - EPT can be disabled in the hypervisor via the 'kvm-intel.ept'
1735     - parameter.
1736     -
1737     -3.4. Nested virtual machines
1738     -""""""""""""""""""""""""""""
1739     -
1740     -When nested virtualization is in use, three operating systems are involved:
1741     -the bare metal hypervisor, the nested hypervisor and the nested virtual
1742     -machine. VMENTER operations from the nested hypervisor into the nested
1743     -guest will always be processed by the bare metal hypervisor. If KVM is the
1744     -bare metal hypervisor it will:
1745     -
1746     - - Flush the L1D cache on every switch from the nested hypervisor to the
1747     - nested virtual machine, so that the nested hypervisor's secrets are not
1748     - exposed to the nested virtual machine;
1749     -
1750     - - Flush the L1D cache on every switch from the nested virtual machine to
1751     - the nested hypervisor; this is a complex operation, and flushing the L1D
1752     - cache avoids that the bare metal hypervisor's secrets are exposed to the
1753     - nested virtual machine;
1754     -
1755     - - Instruct the nested hypervisor to not perform any L1D cache flush. This
1756     - is an optimization to avoid double L1D flushing.
1757     -
1758     -
1759     -.. _default_mitigations:
1760     -
1761     -Default mitigations
1762     --------------------
1763     -
1764     - The kernel default mitigations for vulnerable processors are:
1765     -
1766     - - PTE inversion to protect against malicious user space. This is done
1767     - unconditionally and cannot be controlled.
1768     -
1769     - - L1D conditional flushing on VMENTER when EPT is enabled for
1770     - a guest.
1771     -
1772     - The kernel does not by default enforce the disabling of SMT, which leaves
1773     - SMT systems vulnerable when running untrusted guests with EPT enabled.
1774     -
1775     - The rationale for this choice is:
1776     -
1777     - - Force disabling SMT can break existing setups, especially with
1778     - unattended updates.
1779     -
1780     - - If regular users run untrusted guests on their machine, then L1TF is
1781     - just an add on to other malware which might be embedded in an untrusted
1782     - guest, e.g. spam-bots or attacks on the local network.
1783     -
1784     - There is no technical way to prevent a user from running untrusted code
1785     - on their machines blindly.
1786     -
1787     - - It's technically extremely unlikely and from today's knowledge even
1788     - impossible that L1TF can be exploited via the most popular attack
1789     - mechanisms like JavaScript because these mechanisms have no way to
1790     - control PTEs. If this would be possible and not other mitigation would
1791     - be possible, then the default might be different.
1792     -
1793     - - The administrators of cloud and hosting setups have to carefully
1794     - analyze the risk for their scenarios and make the appropriate
1795     - mitigation choices, which might even vary across their deployed
1796     - machines and also result in other changes of their overall setup.
1797     - There is no way for the kernel to provide a sensible default for this
1798     - kind of scenarios.
1799     diff --git a/Documentation/spec_ctrl.txt b/Documentation/spec_ctrl.txt
1800     index 32f3d55c54b7..c4dbe6f7cdae 100644
1801     --- a/Documentation/spec_ctrl.txt
1802     +++ b/Documentation/spec_ctrl.txt
1803     @@ -92,3 +92,12 @@ Speculation misfeature controls
1804     * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_ENABLE, 0, 0);
1805     * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_DISABLE, 0, 0);
1806     * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_FORCE_DISABLE, 0, 0);
1807     +
1808     +- PR_SPEC_INDIRECT_BRANCH: Indirect Branch Speculation in User Processes
1809     + (Mitigate Spectre V2 style attacks against user processes)
1810     +
1811     + Invocations:
1812     + * prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, 0, 0, 0);
1813     + * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_ENABLE, 0, 0);
1814     + * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_DISABLE, 0, 0);
1815     + * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_FORCE_DISABLE, 0, 0);
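
The new control is exercised exactly like the store bypass control above. As a quick illustration (not part of the patch), the following stand-alone C program queries and then disables indirect branch speculation for the calling task; the numeric PR_* fallbacks mirror include/uapi/linux/prctl.h and are only needed when building against older userspace headers.

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/prctl.h>

#ifndef PR_GET_SPECULATION_CTRL
# define PR_GET_SPECULATION_CTRL 52
# define PR_SET_SPECULATION_CTRL 53
#endif
#ifndef PR_SPEC_INDIRECT_BRANCH
# define PR_SPEC_INDIRECT_BRANCH 1
#endif
#ifndef PR_SPEC_DISABLE
# define PR_SPEC_PRCTL   (1UL << 0)
# define PR_SPEC_ENABLE  (1UL << 1)
# define PR_SPEC_DISABLE (1UL << 2)
#endif

int main(void)
{
	/* Read the current per-task indirect branch speculation state */
	long state = prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, 0, 0, 0);

	if (state < 0) {
		/* EINVAL: prctl not implemented, ENODEV: control not supported */
		fprintf(stderr, "PR_GET_SPECULATION_CTRL: %s\n", strerror(errno));
		return 1;
	}
	printf("indirect branch speculation state: 0x%lx\n", (unsigned long)state);

	/* Disable indirect branch speculation for this task */
	if (prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH,
		  PR_SPEC_DISABLE, 0, 0) < 0) {
		fprintf(stderr, "PR_SET_SPECULATION_CTRL: %s\n", strerror(errno));
		return 1;
	}
	return 0;
}

PR_SPEC_FORCE_DISABLE works the same way as PR_SPEC_DISABLE but cannot be undone by a later PR_SPEC_ENABLE.
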
1816     diff --git a/Documentation/x86/conf.py b/Documentation/x86/conf.py
1817     new file mode 100644
1818     index 000000000000..33c5c3142e20
1819     --- /dev/null
1820     +++ b/Documentation/x86/conf.py
1821     @@ -0,0 +1,10 @@
1822     +# -*- coding: utf-8; mode: python -*-
1823     +
1824     +project = "X86 architecture specific documentation"
1825     +
1826     +tags.add("subproject")
1827     +
1828     +latex_documents = [
1829     + ('index', 'x86.tex', project,
1830     + 'The kernel development community', 'manual'),
1831     +]
1832     diff --git a/Documentation/x86/index.rst b/Documentation/x86/index.rst
1833     new file mode 100644
1834     index 000000000000..ef389dcf1b1d
1835     --- /dev/null
1836     +++ b/Documentation/x86/index.rst
1837     @@ -0,0 +1,8 @@
1838     +==========================
1839     +x86 architecture specifics
1840     +==========================
1841     +
1842     +.. toctree::
1843     + :maxdepth: 1
1844     +
1845     + mds
1846     diff --git a/Documentation/x86/mds.rst b/Documentation/x86/mds.rst
1847     new file mode 100644
1848     index 000000000000..534e9baa4e1d
1849     --- /dev/null
1850     +++ b/Documentation/x86/mds.rst
1851     @@ -0,0 +1,225 @@
1852     +Microarchitectural Data Sampling (MDS) mitigation
1853     +=================================================
1854     +
1855     +.. _mds:
1856     +
1857     +Overview
1858     +--------
1859     +
1860     +Microarchitectural Data Sampling (MDS) is a family of side channel attacks
1861     +on internal buffers in Intel CPUs. The variants are:
1862     +
1863     + - Microarchitectural Store Buffer Data Sampling (MSBDS) (CVE-2018-12126)
1864     + - Microarchitectural Fill Buffer Data Sampling (MFBDS) (CVE-2018-12130)
1865     + - Microarchitectural Load Port Data Sampling (MLPDS) (CVE-2018-12127)
1866     + - Microarchitectural Data Sampling Uncacheable Memory (MDSUM) (CVE-2019-11091)
1867     +
1868     +MSBDS leaks Store Buffer Entries which can be speculatively forwarded to a
1869     +dependent load (store-to-load forwarding) as an optimization. The forward
1870     +can also happen to a faulting or assisting load operation for a different
1871     +memory address, which can be exploited under certain conditions. Store
1872     +buffers are partitioned between Hyper-Threads so cross thread forwarding is
1873     +not possible. But if a thread enters or exits a sleep state, the store
1874     +buffer is repartitioned, which can expose data from one thread to the other.
1875     +
1876     +MFBDS leaks Fill Buffer Entries. Fill buffers are used internally to manage
1877     +L1 miss situations and to hold data which is returned or sent in response
1878     +to a memory or I/O operation. Fill buffers can forward data to a load
1879     +operation and also write data to the cache. When the fill buffer is
1880     +deallocated it can retain the stale data of the preceding operations which
1881     +can then be forwarded to a faulting or assisting load operation, which can
1882     +be exploited under certain conditions. Fill buffers are shared between
1883     +Hyper-Threads so cross thread leakage is possible.
1884     +
1885     +MLPDS leaks Load Port Data. Load ports are used to perform load operations
1886     +from memory or I/O. The received data is then forwarded to the register
1887     +file or a subsequent operation. In some implementations the Load Port can
1888     +contain stale data from a previous operation which can be forwarded to
1889     +faulting or assisting loads under certain conditions, which again can be
1890     +exploited eventually. Load ports are shared between Hyper-Threads so cross
1891     +thread leakage is possible.
1892     +
1893     +MDSUM is a special case of MSBDS, MFBDS and MLPDS. An uncacheable load from
1894     +memory that takes a fault or assist can leave data in a microarchitectural
1895     +structure that may later be observed using one of the same methods used by
1896     +MSBDS, MFBDS or MLPDS.
1897     +
1898     +Exposure assumptions
1899     +--------------------
1900     +
1901     +It is assumed that attack code resides in user space or in a guest with one
1902     +exception. The rationale behind this assumption is that the code construct
1903     +needed for exploiting MDS requires:
1904     +
1905     + - to control the load to trigger a fault or assist
1906     +
1907     + - to have a disclosure gadget which exposes the speculatively accessed
1908     + data for consumption through a side channel.
1909     +
1910     + - to control the pointer through which the disclosure gadget exposes the
1911     + data
1912     +
1913     +The existence of such a construct in the kernel cannot be excluded with
1914     +100% certainty, but the complexity involved makes it extremely unlikely.
1915     +
1916     +There is one exception, which is untrusted BPF. The functionality of
1917     +untrusted BPF is limited, but it needs to be thoroughly investigated
1918     +whether it can be used to create such a construct.
1919     +
1920     +
1921     +Mitigation strategy
1922     +-------------------
1923     +
1924     +All variants have the same mitigation strategy at least for the single CPU
1925     +thread case (SMT off): Force the CPU to clear the affected buffers.
1926     +
1927     +This is achieved by using the otherwise unused and obsolete VERW
1928     +instruction in combination with a microcode update. The microcode clears
1929     +the affected CPU buffers when the VERW instruction is executed.
1930     +
1931     +For virtualization there are two ways to achieve CPU buffer
1932     +clearing: either via the modified VERW instruction or via the L1D Flush
1933     +command. The latter is issued when L1TF mitigation is enabled so the extra
1934     +VERW can be avoided. If the CPU is not affected by L1TF then VERW needs to
1935     +be issued.
1936     +
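On the KVM side this either/or shows up as a simple branch on the VM entry path. A minimal sketch of that shape follows; it is illustrative only, the wrapper name is made up, and the static key and function names follow the upstream vmx code, which may be spelled slightly differently in this backport.

/*
 * Sketch of the VM entry decision described above. When the L1TF
 * mitigation already flushes L1D, that flush also clears the MDS
 * affected buffers, so the extra VERW is skipped.
 */
static void pre_vmenter_clear_cpu_buffers(struct kvm_vcpu *vcpu)
{
	if (static_branch_unlikely(&vmx_l1d_should_flush))
		vmx_l1d_flush(vcpu);
	else if (static_branch_unlikely(&mds_user_clear))
		mds_clear_cpu_buffers();
}
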
1937     +If the VERW instruction with the supplied segment selector argument is
1938     +executed on a CPU without the microcode update, there is no side effect
1939     +other than a small number of pointlessly wasted CPU cycles.
1940     +
1941     +This does not protect against cross Hyper-Thread attacks except for MSBDS
1942     +which is only exploitable cross Hyper-thread when one of the Hyper-Threads
1943     +enters a C-state.
1944     +
1945     +The kernel provides a function to invoke the buffer clearing:
1946     +
1947     + mds_clear_cpu_buffers()
1948     +
1949     +The mitigation is invoked on kernel/userspace, hypervisor/guest and C-state
1950     +(idle) transitions.
1951     +
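For reference, the helper essentially boils down to a VERW with a memory operand; a simplified sketch is shown below. The in-kernel version is added to asm/nospec-branch.h elsewhere in this patch and may differ in detail.

/*
 * Simplified sketch of the buffer clearing helper. The memory operand
 * form of VERW is required; with the MD_CLEAR microcode update it
 * clears the affected CPU buffers as a side effect.
 */
static inline void mds_clear_cpu_buffers(void)
{
	static const u16 ds = __KERNEL_DS;

	asm volatile("verw %[ds]" : : [ds] "m" (ds) : "cc");
}
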
1952     +As a special quirk to address virtualization scenarios where the host has
1953     +the microcode updated, but the hypervisor does not (yet) expose the
1954     +MD_CLEAR CPUID bit to guests, the kernel issues the VERW instruction in the
1955     +hope that it might actually clear the buffers. The state is reflected
1956     +accordingly.
1957     +
1958     +According to current knowledge additional mitigations inside the kernel
1959     +itself are not required because the necessary gadgets to expose the leaked
1960     +data cannot be controlled in a way which allows exploitation from malicious
1961     +user space or VM guests.
1962     +
1963     +Kernel internal mitigation modes
1964     +--------------------------------
1965     +
1966     + ======= ============================================================
1967     + off Mitigation is disabled. Either the CPU is not affected or
1968     + mds=off is supplied on the kernel command line
1969     +
1970     + full Mitigation is enabled. CPU is affected and MD_CLEAR is
1971     + advertised in CPUID.
1972     +
1973     + vmwerv Mitigation is enabled. CPU is affected and MD_CLEAR is not
1974     + advertised in CPUID. That is mainly for virtualization
1975     + scenarios where the host has the updated microcode but the
1976     + hypervisor does not expose MD_CLEAR in CPUID. It's a best
1977     + effort approach without guarantee.
1978     + ======= ============================================================
1979     +
1980     +If the CPU is affected and mds=off is not supplied on the kernel command
1981     +line then the kernel selects the appropriate mitigation mode depending on
1982     +the availability of the MD_CLEAR CPUID bit.
1983     +
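Put differently, the selection amounts to something like the sketch below. This is a simplified rendition of what the x86 bugs code does in this series; the MDS_MITIGATION_* and mds_strings names are taken to match the modes in the table above and may differ in detail.

static void __init mds_select_mitigation(void)
{
	if (!boot_cpu_has_bug(X86_BUG_MDS)) {
		mds_mitigation = MDS_MITIGATION_OFF;
		return;
	}

	if (mds_mitigation == MDS_MITIGATION_FULL) {
		/* Without MD_CLEAR fall back to the best effort mode */
		if (!boot_cpu_has(X86_FEATURE_MD_CLEAR))
			mds_mitigation = MDS_MITIGATION_VMWERV;
		/* Enable the VERW on return to user space */
		static_branch_enable(&mds_user_clear);
	}
	pr_info("%s\n", mds_strings[mds_mitigation]);
}
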
1984     +Mitigation points
1985     +-----------------
1986     +
1987     +1. Return to user space
1988     +^^^^^^^^^^^^^^^^^^^^^^^
1989     +
1990     + When transitioning from kernel to user space the CPU buffers are flushed
1991     + on affected CPUs when the mitigation is not disabled on the kernel
1992     + command line. The mitigation is enabled through the static key
1993     + mds_user_clear.
1994     +
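A minimal sketch of that static-key gating is shown below, assuming the helper naming used throughout this patch; when the key is disabled the branch is patched out and the check costs nothing.

DECLARE_STATIC_KEY_FALSE(mds_user_clear);

/* Issued on the kernel -> user space transition when the key is enabled */
static inline void mds_user_clear_cpu_buffers(void)
{
	if (static_branch_likely(&mds_user_clear))
		mds_clear_cpu_buffers();
}
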
1995     + The mitigation is invoked in prepare_exit_to_usermode() which covers
1996     + most of the kernel to user space transitions. There are a few exceptions
1997     + which do not invoke prepare_exit_to_usermode() on return to user
1998     + space. These exceptions use the paranoid exit code.
1999     +
2000     + - Non Maskable Interrupt (NMI):
2001     +
2002     + Access to sensitive data like keys, credentials in the NMI context is
2003     + mostly theoretical: The CPU can do prefetching or execute a
2004     + misspeculated code path and thereby fetch data which might end up
2005     + leaking through a buffer.
2006     +
2007     + But for mounting other attacks the kernel stack address of the task is
2008     + already valuable information. So in full mitigation mode, the NMI is
2009     + mitigated on the return from do_nmi() to provide almost complete
2010     + coverage; a sketch of this call site follows the list below.
2011     +
2012     + - Double fault (#DF):
2013     +
2014     + A double fault is usually fatal, but the ESPFIX workaround, which can
2015     + be triggered from user space through modify_ldt(2) is a recoverable
2016     + be triggered from user space through modify_ldt(2), is a recoverable
2017     + in the double fault handler is required.
2018     +
2019     + - Machine Check Exception (#MC):
2020     +
2021     + Another corner case is a #MC which hits between the CPU buffer clear
2022     + invocation and the actual return to user. As this still is in kernel
2023     + space, it takes the paranoid exit path which does not clear the CPU
2024     + buffers. So the #MC handler repopulates the buffers to some
2025     + extent. Machine checks are not reliably controllable and the window is
2026     + extremely small, so mitigation would just tick a checkbox that this
2027     + theoretical corner case is covered. To keep the amount of special
2028     + cases small, ignore #MC.
2029     +
2030     + - Debug Exception (#DB):
2031     +
2032     + This takes the paranoid exit path only when the INT1 breakpoint is in
2033     + kernel space. #DB on a user space address takes the regular exit path,
2034     + so no extra mitigation required.
2035     +
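As referenced in the NMI item above, a paranoid-exit handler issues the clear by hand. A heavily simplified sketch of the shape of that call site follows; the real do_nmi() obviously does much more, and the other paranoid paths (#DF) follow the same pattern.

void do_nmi(struct pt_regs *regs, long error_code)
{
	/* ... normal NMI handling ... */

	/*
	 * Not routed through prepare_exit_to_usermode(), so clear the
	 * CPU buffers explicitly when the NMI interrupted user mode.
	 */
	if (user_mode(regs))
		mds_user_clear_cpu_buffers();
}
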
2036     +
2037     +2. C-State transition
2038     +^^^^^^^^^^^^^^^^^^^^^
2039     +
2040     + When a CPU goes idle and enters a C-State the CPU buffers need to be
2041     + cleared on affected CPUs when SMT is active. This addresses the
2042     + repartitioning of the store buffer when one of the Hyper-Threads enters
2043     + a C-State.
2044     +
2045     + When SMT is inactive, i.e. either the CPU does not support it or all
2046     + sibling threads are offline, CPU buffer clearing is not required.
2047     +
2048     + The idle clearing is enabled on CPUs which are only affected by MSBDS
2049     + and not by any other MDS variant. The other MDS variants cannot be
2050     + protected against cross Hyper-Thread attacks because the Fill Buffer and
2051     + the Load Ports are shared. So on CPUs affected by other variants, the
2052     + idle clearing would be a window dressing exercise and is therefore not
2053     + activated.
2054     +
2055     + The invocation is controlled by the static key mds_idle_clear which is
2056     + switched depending on the chosen mitigation mode and the SMT state of
2057     + the system.
2058     +
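The switching of that key could look roughly like the sketch below; the function name and exact conditions are illustrative. Only CPUs flagged as affected solely by MSBDS benefit from the idle clear, and the key follows the SMT state.

static void update_mds_branch_idle(void)
{
	/* Idle clearing only helps on parts affected solely by MSBDS */
	if (!boot_cpu_has_bug(X86_BUG_MSBDS_ONLY))
		return;

	if (sched_smt_active())
		static_branch_enable(&mds_idle_clear);
	else
		static_branch_disable(&mds_idle_clear);
}
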
2059     + The buffer clear is only invoked before entering the C-State to prevent
2060     + stale data from the idling CPU from spilling to the Hyper-Thread
2061     + sibling after the store buffer is repartitioned and all entries become
2062     + available to the non-idle sibling.
2063     +
2064     + When coming out of idle the store buffer is partitioned again so each
2065     + sibling has half of it available. The CPU coming back from idle could
2066     + then be speculatively exposed to the contents of the sibling. The buffers are
2067     + flushed either on exit to user space or on VMENTER so malicious code
2068     + in user space or the guest cannot speculatively access them.
2069     +
2070     + The mitigation is hooked into all variants of halt()/mwait(), but does
2071     + not cover the legacy ACPI IO-Port mechanism because the ACPI idle driver
2072     + has been superseded by the intel_idle driver around 2010, which is
2073     + preferred on all affected CPUs which are expected to gain the MD_CLEAR
2074     + functionality in microcode. Aside from that, the IO-Port mechanism is a
2075     + legacy interface which is only used on older systems which are either
2076     + not affected or do not receive microcode updates anymore.
2077     diff --git a/Makefile b/Makefile
2078     index e52b0579e176..92fe701e5582 100644
2079     --- a/Makefile
2080     +++ b/Makefile
2081     @@ -1,6 +1,6 @@
2082     VERSION = 4
2083     PATCHLEVEL = 9
2084     -SUBLEVEL = 175
2085     +SUBLEVEL = 176
2086     EXTRAVERSION =
2087     NAME = Roaring Lionus
2088    
2089     diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
2090     index 5a4591ff8407..e0055b4302d6 100644
2091     --- a/arch/x86/Kconfig
2092     +++ b/arch/x86/Kconfig
2093     @@ -937,13 +937,7 @@ config NR_CPUS
2094     approximately eight kilobytes to the kernel image.
2095    
2096     config SCHED_SMT
2097     - bool "SMT (Hyperthreading) scheduler support"
2098     - depends on SMP
2099     - ---help---
2100     - SMT scheduler support improves the CPU scheduler's decision making
2101     - when dealing with Intel Pentium 4 chips with HyperThreading at a
2102     - cost of slightly increased overhead in some places. If unsure say
2103     - N here.
2104     + def_bool y if SMP
2105    
2106     config SCHED_MC
2107     def_bool y
2108     diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
2109     index b0cd306dc527..8841d016b4a4 100644
2110     --- a/arch/x86/entry/common.c
2111     +++ b/arch/x86/entry/common.c
2112     @@ -28,6 +28,7 @@
2113     #include <asm/vdso.h>
2114     #include <asm/uaccess.h>
2115     #include <asm/cpufeature.h>
2116     +#include <asm/nospec-branch.h>
2117    
2118     #define CREATE_TRACE_POINTS
2119     #include <trace/events/syscalls.h>
2120     @@ -206,6 +207,8 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
2121     #endif
2122    
2123     user_enter_irqoff();
2124     +
2125     + mds_user_clear_cpu_buffers();
2126     }
2127    
2128     #define SYSCALL_EXIT_WORK_FLAGS \
2129     diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
2130     index a30829052a00..cb8178a2783a 100644
2131     --- a/arch/x86/events/intel/core.c
2132     +++ b/arch/x86/events/intel/core.c
2133     @@ -3750,11 +3750,11 @@ __init int intel_pmu_init(void)
2134     pr_cont("Nehalem events, ");
2135     break;
2136    
2137     - case INTEL_FAM6_ATOM_PINEVIEW:
2138     - case INTEL_FAM6_ATOM_LINCROFT:
2139     - case INTEL_FAM6_ATOM_PENWELL:
2140     - case INTEL_FAM6_ATOM_CLOVERVIEW:
2141     - case INTEL_FAM6_ATOM_CEDARVIEW:
2142     + case INTEL_FAM6_ATOM_BONNELL:
2143     + case INTEL_FAM6_ATOM_BONNELL_MID:
2144     + case INTEL_FAM6_ATOM_SALTWELL:
2145     + case INTEL_FAM6_ATOM_SALTWELL_MID:
2146     + case INTEL_FAM6_ATOM_SALTWELL_TABLET:
2147     memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
2148     sizeof(hw_cache_event_ids));
2149    
2150     @@ -3766,9 +3766,11 @@ __init int intel_pmu_init(void)
2151     pr_cont("Atom events, ");
2152     break;
2153    
2154     - case INTEL_FAM6_ATOM_SILVERMONT1:
2155     - case INTEL_FAM6_ATOM_SILVERMONT2:
2156     + case INTEL_FAM6_ATOM_SILVERMONT:
2157     + case INTEL_FAM6_ATOM_SILVERMONT_X:
2158     + case INTEL_FAM6_ATOM_SILVERMONT_MID:
2159     case INTEL_FAM6_ATOM_AIRMONT:
2160     + case INTEL_FAM6_ATOM_AIRMONT_MID:
2161     memcpy(hw_cache_event_ids, slm_hw_cache_event_ids,
2162     sizeof(hw_cache_event_ids));
2163     memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs,
2164     @@ -3785,7 +3787,7 @@ __init int intel_pmu_init(void)
2165     break;
2166    
2167     case INTEL_FAM6_ATOM_GOLDMONT:
2168     - case INTEL_FAM6_ATOM_DENVERTON:
2169     + case INTEL_FAM6_ATOM_GOLDMONT_X:
2170     memcpy(hw_cache_event_ids, glm_hw_cache_event_ids,
2171     sizeof(hw_cache_event_ids));
2172     memcpy(hw_cache_extra_regs, glm_hw_cache_extra_regs,
2173     diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
2174     index 47d526c700a1..72d09340c24d 100644
2175     --- a/arch/x86/events/intel/cstate.c
2176     +++ b/arch/x86/events/intel/cstate.c
2177     @@ -531,8 +531,8 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = {
2178    
2179     X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_ULT, hswult_cstates),
2180    
2181     - X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT1, slm_cstates),
2182     - X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT2, slm_cstates),
2183     + X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT, slm_cstates),
2184     + X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT_X, slm_cstates),
2185     X86_CSTATES_MODEL(INTEL_FAM6_ATOM_AIRMONT, slm_cstates),
2186    
2187     X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_CORE, snb_cstates),
2188     diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c
2189     index be0b1968d60a..68144a341903 100644
2190     --- a/arch/x86/events/msr.c
2191     +++ b/arch/x86/events/msr.c
2192     @@ -61,8 +61,8 @@ static bool test_intel(int idx)
2193     case INTEL_FAM6_BROADWELL_GT3E:
2194     case INTEL_FAM6_BROADWELL_X:
2195    
2196     - case INTEL_FAM6_ATOM_SILVERMONT1:
2197     - case INTEL_FAM6_ATOM_SILVERMONT2:
2198     + case INTEL_FAM6_ATOM_SILVERMONT:
2199     + case INTEL_FAM6_ATOM_SILVERMONT_X:
2200     case INTEL_FAM6_ATOM_AIRMONT:
2201     if (idx == PERF_MSR_SMI)
2202     return true;
2203     diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
2204     index 98444b77fbe3..06de338be0d8 100644
2205     --- a/arch/x86/include/asm/cpufeatures.h
2206     +++ b/arch/x86/include/asm/cpufeatures.h
2207     @@ -271,10 +271,12 @@
2208     /* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */
2209     #define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */
2210     #define X86_FEATURE_IRPERF (13*32+1) /* Instructions Retired Count */
2211     -#define X86_FEATURE_AMD_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */
2212     -#define X86_FEATURE_AMD_IBRS (13*32+14) /* Indirect Branch Restricted Speculation */
2213     -#define X86_FEATURE_AMD_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */
2214     +#define X86_FEATURE_AMD_IBPB (13*32+12) /* "" Indirect Branch Prediction Barrier */
2215     +#define X86_FEATURE_AMD_IBRS (13*32+14) /* "" Indirect Branch Restricted Speculation */
2216     +#define X86_FEATURE_AMD_STIBP (13*32+15) /* "" Single Thread Indirect Branch Predictors */
2217     +#define X86_FEATURE_AMD_SSBD (13*32+24) /* "" Speculative Store Bypass Disable */
2218     #define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */
2219     +#define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */
2220    
2221     /* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */
2222     #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */
2223     @@ -315,6 +317,7 @@
2224     #define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */
2225     #define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */
2226     #define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */
2227     +#define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */
2228     #define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */
2229     #define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */
2230     #define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */
2231     @@ -352,5 +355,7 @@
2232     #define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */
2233     #define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* CPU is affected by speculative store bypass attack */
2234     #define X86_BUG_L1TF X86_BUG(18) /* CPU is affected by L1 Terminal Fault */
2235     +#define X86_BUG_MDS X86_BUG(19) /* CPU is affected by Microarchitectural data sampling */
2236     +#define X86_BUG_MSBDS_ONLY X86_BUG(20) /* CPU is only affected by the MSBDS variant of BUG_MDS */
2237    
2238     #endif /* _ASM_X86_CPUFEATURES_H */
2239     diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
2240     index 75b748a1deb8..ba7b6f736414 100644
2241     --- a/arch/x86/include/asm/intel-family.h
2242     +++ b/arch/x86/include/asm/intel-family.h
2243     @@ -50,19 +50,23 @@
2244    
2245     /* "Small Core" Processors (Atom) */
2246    
2247     -#define INTEL_FAM6_ATOM_PINEVIEW 0x1C
2248     -#define INTEL_FAM6_ATOM_LINCROFT 0x26
2249     -#define INTEL_FAM6_ATOM_PENWELL 0x27
2250     -#define INTEL_FAM6_ATOM_CLOVERVIEW 0x35
2251     -#define INTEL_FAM6_ATOM_CEDARVIEW 0x36
2252     -#define INTEL_FAM6_ATOM_SILVERMONT1 0x37 /* BayTrail/BYT / Valleyview */
2253     -#define INTEL_FAM6_ATOM_SILVERMONT2 0x4D /* Avaton/Rangely */
2254     -#define INTEL_FAM6_ATOM_AIRMONT 0x4C /* CherryTrail / Braswell */
2255     -#define INTEL_FAM6_ATOM_MERRIFIELD 0x4A /* Tangier */
2256     -#define INTEL_FAM6_ATOM_MOOREFIELD 0x5A /* Anniedale */
2257     -#define INTEL_FAM6_ATOM_GOLDMONT 0x5C
2258     -#define INTEL_FAM6_ATOM_DENVERTON 0x5F /* Goldmont Microserver */
2259     -#define INTEL_FAM6_ATOM_GEMINI_LAKE 0x7A
2260     +#define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */
2261     +#define INTEL_FAM6_ATOM_BONNELL_MID 0x26 /* Silverthorne, Lincroft */
2262     +
2263     +#define INTEL_FAM6_ATOM_SALTWELL 0x36 /* Cedarview */
2264     +#define INTEL_FAM6_ATOM_SALTWELL_MID 0x27 /* Penwell */
2265     +#define INTEL_FAM6_ATOM_SALTWELL_TABLET 0x35 /* Cloverview */
2266     +
2267     +#define INTEL_FAM6_ATOM_SILVERMONT 0x37 /* Bay Trail, Valleyview */
2268     +#define INTEL_FAM6_ATOM_SILVERMONT_X 0x4D /* Avaton, Rangely */
2269     +#define INTEL_FAM6_ATOM_SILVERMONT_MID 0x4A /* Merrifield */
2270     +
2271     +#define INTEL_FAM6_ATOM_AIRMONT 0x4C /* Cherry Trail, Braswell */
2272     +#define INTEL_FAM6_ATOM_AIRMONT_MID 0x5A /* Moorefield */
2273     +
2274     +#define INTEL_FAM6_ATOM_GOLDMONT 0x5C /* Apollo Lake */
2275     +#define INTEL_FAM6_ATOM_GOLDMONT_X 0x5F /* Denverton */
2276     +#define INTEL_FAM6_ATOM_GOLDMONT_PLUS 0x7A /* Gemini Lake */
2277    
2278     /* Xeon Phi */
2279    
2280     diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
2281     index 508a062e6cf1..0c8f4281b151 100644
2282     --- a/arch/x86/include/asm/irqflags.h
2283     +++ b/arch/x86/include/asm/irqflags.h
2284     @@ -5,6 +5,8 @@
2285    
2286     #ifndef __ASSEMBLY__
2287    
2288     +#include <asm/nospec-branch.h>
2289     +
2290     /* Provide __cpuidle; we can't safely include <linux/cpu.h> */
2291     #define __cpuidle __attribute__((__section__(".cpuidle.text")))
2292    
2293     @@ -53,11 +55,13 @@ static inline void native_irq_enable(void)
2294    
2295     static inline __cpuidle void native_safe_halt(void)
2296     {
2297     + mds_idle_clear_cpu_buffers();
2298     asm volatile("sti; hlt": : :"memory");
2299     }
2300    
2301     static inline __cpuidle void native_halt(void)
2302     {
2303     + mds_idle_clear_cpu_buffers();
2304     asm volatile("hlt": : :"memory");
2305     }
2306    
2307     diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h
2308     index 5e69154c9f07..a61ec81b27db 100644
2309     --- a/arch/x86/include/asm/microcode_intel.h
2310     +++ b/arch/x86/include/asm/microcode_intel.h
2311     @@ -52,6 +52,21 @@ struct extended_sigtable {
2312    
2313     #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
2314    
2315     +static inline u32 intel_get_microcode_revision(void)
2316     +{
2317     + u32 rev, dummy;
2318     +
2319     + native_wrmsrl(MSR_IA32_UCODE_REV, 0);
2320     +
2321     + /* As documented in the SDM: Do a CPUID 1 here */
2322     + sync_core();
2323     +
2324     + /* get the current revision from MSR 0x8B */
2325     + native_rdmsr(MSR_IA32_UCODE_REV, dummy, rev);
2326     +
2327     + return rev;
2328     +}
2329     +
2330     extern int has_newer_microcode(void *mc, unsigned int csig, int cpf, int rev);
2331     extern int microcode_sanity_check(void *mc, int print_err);
2332     extern int find_matching_signature(void *mc, unsigned int csig, int cpf);
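
Since MD_CLEAR arrives via a microcode update, knowing the loaded revision matters for the rest of this series. A hypothetical caller of the new helper (illustrative only, not part of the patch) could look like this:

/* Illustrative use of the helper: refresh the per-CPU microcode revision */
static void example_update_ucode_rev(struct cpuinfo_x86 *c)
{
	c->microcode = intel_get_microcode_revision();
	pr_info("CPU%d microcode revision: 0x%x\n",
		smp_processor_id(), c->microcode);
}
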
2333     diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
2334     index 9963e21ac443..38f94d07920d 100644
2335     --- a/arch/x86/include/asm/msr-index.h
2336     +++ b/arch/x86/include/asm/msr-index.h
2337     @@ -1,6 +1,8 @@
2338     #ifndef _ASM_X86_MSR_INDEX_H
2339     #define _ASM_X86_MSR_INDEX_H
2340    
2341     +#include <linux/bits.h>
2342     +
2343     /*
2344     * CPU model specific register (MSR) numbers.
2345     *
2346     @@ -38,13 +40,14 @@
2347    
2348     /* Intel MSRs. Some also available on other CPUs */
2349     #define MSR_IA32_SPEC_CTRL 0x00000048 /* Speculation Control */
2350     -#define SPEC_CTRL_IBRS (1 << 0) /* Indirect Branch Restricted Speculation */
2351     -#define SPEC_CTRL_STIBP (1 << 1) /* Single Thread Indirect Branch Predictors */
2352     +#define SPEC_CTRL_IBRS BIT(0) /* Indirect Branch Restricted Speculation */
2353     +#define SPEC_CTRL_STIBP_SHIFT 1 /* Single Thread Indirect Branch Predictor (STIBP) bit */
2354     +#define SPEC_CTRL_STIBP BIT(SPEC_CTRL_STIBP_SHIFT) /* STIBP mask */
2355     #define SPEC_CTRL_SSBD_SHIFT 2 /* Speculative Store Bypass Disable bit */
2356     -#define SPEC_CTRL_SSBD (1 << SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */
2357     +#define SPEC_CTRL_SSBD BIT(SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */
2358    
2359     #define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */
2360     -#define PRED_CMD_IBPB (1 << 0) /* Indirect Branch Prediction Barrier */
2361     +#define PRED_CMD_IBPB BIT(0) /* Indirect Branch Prediction Barrier */
2362    
2363     #define MSR_IA32_PERFCTR0 0x000000c1
2364     #define MSR_IA32_PERFCTR1 0x000000c2
2365     @@ -61,20 +64,25 @@
2366     #define MSR_MTRRcap 0x000000fe
2367    
2368     #define MSR_IA32_ARCH_CAPABILITIES 0x0000010a
2369     -#define ARCH_CAP_RDCL_NO (1 << 0) /* Not susceptible to Meltdown */
2370     -#define ARCH_CAP_IBRS_ALL (1 << 1) /* Enhanced IBRS support */
2371     -#define ARCH_CAP_SKIP_VMENTRY_L1DFLUSH (1 << 3) /* Skip L1D flush on vmentry */
2372     -#define ARCH_CAP_SSB_NO (1 << 4) /*
2373     - * Not susceptible to Speculative Store Bypass
2374     - * attack, so no Speculative Store Bypass
2375     - * control required.
2376     - */
2377     +#define ARCH_CAP_RDCL_NO BIT(0) /* Not susceptible to Meltdown */
2378     +#define ARCH_CAP_IBRS_ALL BIT(1) /* Enhanced IBRS support */
2379     +#define ARCH_CAP_SKIP_VMENTRY_L1DFLUSH BIT(3) /* Skip L1D flush on vmentry */
2380     +#define ARCH_CAP_SSB_NO BIT(4) /*
2381     + * Not susceptible to Speculative Store Bypass
2382     + * attack, so no Speculative Store Bypass
2383     + * control required.
2384     + */
2385     +#define ARCH_CAP_MDS_NO BIT(5) /*
2386     + * Not susceptible to
2387     + * Microarchitectural Data
2388     + * Sampling (MDS) vulnerabilities.
2389     + */
2390    
2391     #define MSR_IA32_FLUSH_CMD 0x0000010b
2392     -#define L1D_FLUSH (1 << 0) /*
2393     - * Writeback and invalidate the
2394     - * L1 data cache.
2395     - */
2396     +#define L1D_FLUSH BIT(0) /*
2397     + * Writeback and invalidate the
2398     + * L1 data cache.
2399     + */
2400    
2401     #define MSR_IA32_BBL_CR_CTL 0x00000119
2402     #define MSR_IA32_BBL_CR_CTL3 0x0000011e
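The BIT() conversions above change only the notation; the bit positions stay the same, with ARCH_CAP_MDS_NO added as bit 5. A stand-alone sketch of how a raw IA32_ARCH_CAPABILITIES value is decoded with these masks (the sample value is made up for illustration; user space cannot read the MSR itself):

#include <stdio.h>
#include <stdint.h>

#define BIT(n)				(1UL << (n))
#define ARCH_CAP_RDCL_NO		BIT(0)	/* not susceptible to Meltdown */
#define ARCH_CAP_IBRS_ALL		BIT(1)	/* enhanced IBRS */
#define ARCH_CAP_SKIP_VMENTRY_L1DFLUSH	BIT(3)
#define ARCH_CAP_SSB_NO			BIT(4)
#define ARCH_CAP_MDS_NO			BIT(5)	/* not susceptible to MDS */

int main(void)
{
	uint64_t cap = ARCH_CAP_RDCL_NO | ARCH_CAP_MDS_NO;	/* illustrative only */

	printf("RDCL_NO: %s\n", (cap & ARCH_CAP_RDCL_NO) ? "yes" : "no");
	printf("SSB_NO : %s\n", (cap & ARCH_CAP_SSB_NO)  ? "yes" : "no");
	printf("MDS_NO : %s\n", (cap & ARCH_CAP_MDS_NO)  ? "yes" : "no");
	return 0;
}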
2403     diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
2404     index f37f2d8a2989..0b40cc442bda 100644
2405     --- a/arch/x86/include/asm/mwait.h
2406     +++ b/arch/x86/include/asm/mwait.h
2407     @@ -4,6 +4,7 @@
2408     #include <linux/sched.h>
2409    
2410     #include <asm/cpufeature.h>
2411     +#include <asm/nospec-branch.h>
2412    
2413     #define MWAIT_SUBSTATE_MASK 0xf
2414     #define MWAIT_CSTATE_MASK 0xf
2415     @@ -38,6 +39,8 @@ static inline void __monitorx(const void *eax, unsigned long ecx,
2416    
2417     static inline void __mwait(unsigned long eax, unsigned long ecx)
2418     {
2419     + mds_idle_clear_cpu_buffers();
2420     +
2421     /* "mwait %eax, %ecx;" */
2422     asm volatile(".byte 0x0f, 0x01, 0xc9;"
2423     :: "a" (eax), "c" (ecx));
2424     @@ -72,6 +75,8 @@ static inline void __mwait(unsigned long eax, unsigned long ecx)
2425     static inline void __mwaitx(unsigned long eax, unsigned long ebx,
2426     unsigned long ecx)
2427     {
2428     + /* No MDS buffer clear as this is AMD/HYGON only */
2429     +
2430     /* "mwaitx %eax, %ebx, %ecx;" */
2431     asm volatile(".byte 0x0f, 0x01, 0xfb;"
2432     :: "a" (eax), "b" (ebx), "c" (ecx));
2433     @@ -79,6 +84,8 @@ static inline void __mwaitx(unsigned long eax, unsigned long ebx,
2434    
2435     static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
2436     {
2437     + mds_idle_clear_cpu_buffers();
2438     +
2439     trace_hardirqs_on();
2440     /* "mwait %eax, %ecx;" */
2441     asm volatile("sti; .byte 0x0f, 0x01, 0xc9;"
2442     diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
2443     index 1b4132161c1f..031a58e84e5b 100644
2444     --- a/arch/x86/include/asm/nospec-branch.h
2445     +++ b/arch/x86/include/asm/nospec-branch.h
2446     @@ -3,6 +3,8 @@
2447     #ifndef _ASM_X86_NOSPEC_BRANCH_H_
2448     #define _ASM_X86_NOSPEC_BRANCH_H_
2449    
2450     +#include <linux/static_key.h>
2451     +
2452     #include <asm/alternative.h>
2453     #include <asm/alternative-asm.h>
2454     #include <asm/cpufeatures.h>
2455     @@ -214,10 +216,17 @@ enum spectre_v2_mitigation {
2456     SPECTRE_V2_RETPOLINE_MINIMAL_AMD,
2457     SPECTRE_V2_RETPOLINE_GENERIC,
2458     SPECTRE_V2_RETPOLINE_AMD,
2459     - SPECTRE_V2_IBRS,
2460     SPECTRE_V2_IBRS_ENHANCED,
2461     };
2462    
2463     +/* The indirect branch speculation control variants */
2464     +enum spectre_v2_user_mitigation {
2465     + SPECTRE_V2_USER_NONE,
2466     + SPECTRE_V2_USER_STRICT,
2467     + SPECTRE_V2_USER_PRCTL,
2468     + SPECTRE_V2_USER_SECCOMP,
2469     +};
2470     +
2471     /* The Speculative Store Bypass disable variants */
2472     enum ssb_mitigation {
2473     SPEC_STORE_BYPASS_NONE,
2474     @@ -295,6 +304,60 @@ do { \
2475     preempt_enable(); \
2476     } while (0)
2477    
2478     +DECLARE_STATIC_KEY_FALSE(switch_to_cond_stibp);
2479     +DECLARE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
2480     +DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
2481     +
2482     +DECLARE_STATIC_KEY_FALSE(mds_user_clear);
2483     +DECLARE_STATIC_KEY_FALSE(mds_idle_clear);
2484     +
2485     +#include <asm/segment.h>
2486     +
2487     +/**
2488     + * mds_clear_cpu_buffers - Mitigation for MDS vulnerability
2489     + *
2490     + * This uses the otherwise unused and obsolete VERW instruction in
2491     + * combination with microcode which triggers a CPU buffer flush when the
2492     + * instruction is executed.
2493     + */
2494     +static inline void mds_clear_cpu_buffers(void)
2495     +{
2496     + static const u16 ds = __KERNEL_DS;
2497     +
2498     + /*
2499     + * Has to be the memory-operand variant because only that
2500     + * guarantees the CPU buffer flush functionality according to
2501     + * documentation. The register-operand variant does not.
2502     + * Works with any segment selector, but a valid writable
2503     + * data segment is the fastest variant.
2504     + *
2505     + * "cc" clobber is required because VERW modifies ZF.
2506     + */
2507     + asm volatile("verw %[ds]" : : [ds] "m" (ds) : "cc");
2508     +}
2509     +
2510     +/**
2511     + * mds_user_clear_cpu_buffers - Mitigation for MDS vulnerability
2512     + *
2513     + * Clear CPU buffers if the corresponding static key is enabled
2514     + */
2515     +static inline void mds_user_clear_cpu_buffers(void)
2516     +{
2517     + if (static_branch_likely(&mds_user_clear))
2518     + mds_clear_cpu_buffers();
2519     +}
2520     +
2521     +/**
2522     + * mds_idle_clear_cpu_buffers - Mitigation for MDS vulnerability
2523     + *
2524     + * Clear CPU buffers if the corresponding static key is enabled
2525     + */
2526     +static inline void mds_idle_clear_cpu_buffers(void)
2527     +{
2528     + if (static_branch_likely(&mds_idle_clear))
2529     + mds_clear_cpu_buffers();
2530     +}
2531     +
2532     #endif /* __ASSEMBLY__ */
2533    
2534     /*
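mds_clear_cpu_buffers() above relies on the memory-operand form of VERW, which updated microcode overloads to flush CPU buffers. A user-space sketch of just the instruction usage (it shows only the encoding and the ZF clobber; the flush side effect depends on microcode and is not observable here, and the process's own %ds selector stands in for __KERNEL_DS):

#include <stdint.h>
#include <stdio.h>

static void verw_sketch(void)
{
	uint16_t sel;

	/* Use whatever data segment selector the process currently has. */
	asm volatile("mov %%ds, %0" : "=r" (sel));
	/* Memory-operand form; "cc" because VERW modifies ZF. */
	asm volatile("verw %[sel]" : : [sel] "m" (sel) : "cc");
}

int main(void)
{
	verw_sketch();
	puts("verw (memory-operand form) executed");
	return 0;
}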
2535     diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
2536     index 221a32ed1372..f12e61e2a86b 100644
2537     --- a/arch/x86/include/asm/pgtable_64.h
2538     +++ b/arch/x86/include/asm/pgtable_64.h
2539     @@ -44,15 +44,15 @@ struct mm_struct;
2540     void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte);
2541    
2542    
2543     -static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
2544     - pte_t *ptep)
2545     +static inline void native_set_pte(pte_t *ptep, pte_t pte)
2546     {
2547     - *ptep = native_make_pte(0);
2548     + WRITE_ONCE(*ptep, pte);
2549     }
2550    
2551     -static inline void native_set_pte(pte_t *ptep, pte_t pte)
2552     +static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
2553     + pte_t *ptep)
2554     {
2555     - *ptep = pte;
2556     + native_set_pte(ptep, native_make_pte(0));
2557     }
2558    
2559     static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
2560     @@ -62,7 +62,7 @@ static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
2561    
2562     static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
2563     {
2564     - *pmdp = pmd;
2565     + WRITE_ONCE(*pmdp, pmd);
2566     }
2567    
2568     static inline void native_pmd_clear(pmd_t *pmd)
2569     @@ -98,7 +98,7 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
2570    
2571     static inline void native_set_pud(pud_t *pudp, pud_t pud)
2572     {
2573     - *pudp = pud;
2574     + WRITE_ONCE(*pudp, pud);
2575     }
2576    
2577     static inline void native_pud_clear(pud_t *pud)
2578     @@ -131,7 +131,7 @@ static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
2579    
2580     static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
2581     {
2582     - *pgdp = kaiser_set_shadow_pgd(pgdp, pgd);
2583     + WRITE_ONCE(*pgdp, kaiser_set_shadow_pgd(pgdp, pgd));
2584     }
2585    
2586     static inline void native_pgd_clear(pgd_t *pgd)
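The pgtable_64.h hunks above replace plain assignments with WRITE_ONCE() so each page-table word is stored in a single access the compiler cannot tear, reorder or elide. A minimal stand-alone analogue of that macro for the scalar case (the kernel's version is more general):

#include <stdint.h>
#include <stdio.h>

/* Simplified sketch of WRITE_ONCE() for scalar types: a volatile store. */
#define WRITE_ONCE_SKETCH(x, val) (*(volatile __typeof__(x) *)&(x) = (val))

static uint64_t pte;	/* stand-in for a pte_t word */

int main(void)
{
	WRITE_ONCE_SKETCH(pte, 0x8000000000000063ULL);	/* illustrative PTE bits */
	printf("pte = %#llx\n", (unsigned long long)pte);
	return 0;
}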
2587     diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
2588     index ee8c6290c421..155e49fc7010 100644
2589     --- a/arch/x86/include/asm/processor.h
2590     +++ b/arch/x86/include/asm/processor.h
2591     @@ -874,4 +874,10 @@ enum l1tf_mitigations {
2592    
2593     extern enum l1tf_mitigations l1tf_mitigation;
2594    
2595     +enum mds_mitigations {
2596     + MDS_MITIGATION_OFF,
2597     + MDS_MITIGATION_FULL,
2598     + MDS_MITIGATION_VMWERV,
2599     +};
2600     +
2601     #endif /* _ASM_X86_PROCESSOR_H */
2602     diff --git a/arch/x86/include/asm/spec-ctrl.h b/arch/x86/include/asm/spec-ctrl.h
2603     index ae7c2c5cd7f0..5393babc0598 100644
2604     --- a/arch/x86/include/asm/spec-ctrl.h
2605     +++ b/arch/x86/include/asm/spec-ctrl.h
2606     @@ -53,12 +53,24 @@ static inline u64 ssbd_tif_to_spec_ctrl(u64 tifn)
2607     return (tifn & _TIF_SSBD) >> (TIF_SSBD - SPEC_CTRL_SSBD_SHIFT);
2608     }
2609    
2610     +static inline u64 stibp_tif_to_spec_ctrl(u64 tifn)
2611     +{
2612     + BUILD_BUG_ON(TIF_SPEC_IB < SPEC_CTRL_STIBP_SHIFT);
2613     + return (tifn & _TIF_SPEC_IB) >> (TIF_SPEC_IB - SPEC_CTRL_STIBP_SHIFT);
2614     +}
2615     +
2616     static inline unsigned long ssbd_spec_ctrl_to_tif(u64 spec_ctrl)
2617     {
2618     BUILD_BUG_ON(TIF_SSBD < SPEC_CTRL_SSBD_SHIFT);
2619     return (spec_ctrl & SPEC_CTRL_SSBD) << (TIF_SSBD - SPEC_CTRL_SSBD_SHIFT);
2620     }
2621    
2622     +static inline unsigned long stibp_spec_ctrl_to_tif(u64 spec_ctrl)
2623     +{
2624     + BUILD_BUG_ON(TIF_SPEC_IB < SPEC_CTRL_STIBP_SHIFT);
2625     + return (spec_ctrl & SPEC_CTRL_STIBP) << (TIF_SPEC_IB - SPEC_CTRL_STIBP_SHIFT);
2626     +}
2627     +
2628     static inline u64 ssbd_tif_to_amd_ls_cfg(u64 tifn)
2629     {
2630     return (tifn & _TIF_SSBD) ? x86_amd_ls_cfg_ssbd_mask : 0ULL;
2631     @@ -70,11 +82,7 @@ extern void speculative_store_bypass_ht_init(void);
2632     static inline void speculative_store_bypass_ht_init(void) { }
2633     #endif
2634    
2635     -extern void speculative_store_bypass_update(unsigned long tif);
2636     -
2637     -static inline void speculative_store_bypass_update_current(void)
2638     -{
2639     - speculative_store_bypass_update(current_thread_info()->flags);
2640     -}
2641     +extern void speculation_ctrl_update(unsigned long tif);
2642     +extern void speculation_ctrl_update_current(void);
2643    
2644     #endif
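stibp_tif_to_spec_ctrl() and stibp_spec_ctrl_to_tif() above convert between bit positions with a single shift instead of a conditional. A stand-alone sketch using the same constants (TIF_SPEC_IB is bit 9 per the thread_info.h hunk below):

#include <stdio.h>

#define TIF_SPEC_IB		9
#define _TIF_SPEC_IB		(1UL << TIF_SPEC_IB)
#define SPEC_CTRL_STIBP_SHIFT	1
#define SPEC_CTRL_STIBP		(1UL << SPEC_CTRL_STIBP_SHIFT)

/* Move the TIF_SPEC_IB flag bit down into the STIBP position, branch-free. */
static unsigned long stibp_tif_to_spec_ctrl(unsigned long tifn)
{
	return (tifn & _TIF_SPEC_IB) >> (TIF_SPEC_IB - SPEC_CTRL_STIBP_SHIFT);
}

int main(void)
{
	printf("TIF_SPEC_IB set   -> SPEC_CTRL bits %#lx\n",
	       stibp_tif_to_spec_ctrl(_TIF_SPEC_IB));	/* 0x2 == STIBP */
	printf("TIF_SPEC_IB clear -> SPEC_CTRL bits %#lx\n",
	       stibp_tif_to_spec_ctrl(0));		/* 0x0 */
	return 0;
}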
2645     diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
2646     index 5cb436acd463..676e84f521ba 100644
2647     --- a/arch/x86/include/asm/switch_to.h
2648     +++ b/arch/x86/include/asm/switch_to.h
2649     @@ -8,9 +8,6 @@ struct task_struct *__switch_to_asm(struct task_struct *prev,
2650    
2651     __visible struct task_struct *__switch_to(struct task_struct *prev,
2652     struct task_struct *next);
2653     -struct tss_struct;
2654     -void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
2655     - struct tss_struct *tss);
2656    
2657     /* This runs on the previous thread's stack. */
2658     static inline void prepare_switch_to(struct task_struct *prev,
2659     diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
2660     index 2d8788a59b4d..0438f7fbb383 100644
2661     --- a/arch/x86/include/asm/thread_info.h
2662     +++ b/arch/x86/include/asm/thread_info.h
2663     @@ -83,10 +83,12 @@ struct thread_info {
2664     #define TIF_SIGPENDING 2 /* signal pending */
2665     #define TIF_NEED_RESCHED 3 /* rescheduling necessary */
2666     #define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/
2667     -#define TIF_SSBD 5 /* Reduced data speculation */
2668     +#define TIF_SSBD 5 /* Speculative store bypass disable */
2669     #define TIF_SYSCALL_EMU 6 /* syscall emulation active */
2670     #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
2671     #define TIF_SECCOMP 8 /* secure computing */
2672     +#define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */
2673     +#define TIF_SPEC_FORCE_UPDATE 10 /* Force speculation MSR update in context switch */
2674     #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
2675     #define TIF_UPROBE 12 /* breakpointed or singlestepping */
2676     #define TIF_NOTSC 16 /* TSC is not accessible in userland */
2677     @@ -111,6 +113,8 @@ struct thread_info {
2678     #define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU)
2679     #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
2680     #define _TIF_SECCOMP (1 << TIF_SECCOMP)
2681     +#define _TIF_SPEC_IB (1 << TIF_SPEC_IB)
2682     +#define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE)
2683     #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY)
2684     #define _TIF_UPROBE (1 << TIF_UPROBE)
2685     #define _TIF_NOTSC (1 << TIF_NOTSC)
2686     @@ -140,8 +144,18 @@ struct thread_info {
2687     _TIF_NOHZ)
2688    
2689     /* flags to check in __switch_to() */
2690     -#define _TIF_WORK_CTXSW \
2691     - (_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP|_TIF_SSBD)
2692     +#define _TIF_WORK_CTXSW_BASE \
2693     + (_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP| \
2694     + _TIF_SSBD | _TIF_SPEC_FORCE_UPDATE)
2695     +
2696     +/*
2697     + * Avoid calls to __switch_to_xtra() on UP as STIBP is not evaluated.
2698     + */
2699     +#ifdef CONFIG_SMP
2700     +# define _TIF_WORK_CTXSW (_TIF_WORK_CTXSW_BASE | _TIF_SPEC_IB)
2701     +#else
2702     +# define _TIF_WORK_CTXSW (_TIF_WORK_CTXSW_BASE)
2703     +#endif
2704    
2705     #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
2706     #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
2707     diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
2708     index 686a58d793e5..f5ca15622dc9 100644
2709     --- a/arch/x86/include/asm/tlbflush.h
2710     +++ b/arch/x86/include/asm/tlbflush.h
2711     @@ -68,8 +68,12 @@ static inline void invpcid_flush_all_nonglobals(void)
2712     struct tlb_state {
2713     struct mm_struct *active_mm;
2714     int state;
2715     - /* last user mm's ctx id */
2716     - u64 last_ctx_id;
2717     +
2718     + /* Last user mm for optimizing IBPB */
2719     + union {
2720     + struct mm_struct *last_user_mm;
2721     + unsigned long last_user_mm_ibpb;
2722     + };
2723    
2724     /*
2725     * Access to this CR4 shadow and to H/W CR4 is protected by
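The tlb_state change above replaces the last context id with a union, so the same slot can hold either the bare mm pointer or an unsigned long with spare low bits for per-mm state; the switch_mm() side of this series tags the low bit for IBPB tracking. A stand-alone sketch of that pointer-tagging idea (LAST_USER_MM_IBPB here is illustrative, not taken from this hunk):

#include <stdio.h>

struct mm_struct;	/* opaque stand-in */

union last_mm {
	struct mm_struct *last_user_mm;
	unsigned long     last_user_mm_ibpb;
};

#define LAST_USER_MM_IBPB	0x1UL	/* illustrative flag in the low bit */

int main(void)
{
	static int dummy_mm;	/* aligned object standing in for an mm_struct */
	union last_mm u;

	u.last_user_mm_ibpb = (unsigned long)&dummy_mm | LAST_USER_MM_IBPB;

	printf("mm pointer : %p\n",
	       (void *)(u.last_user_mm_ibpb & ~LAST_USER_MM_IBPB));
	printf("IBPB flag  : %lu\n", u.last_user_mm_ibpb & LAST_USER_MM_IBPB);
	return 0;
}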
2726     diff --git a/arch/x86/include/uapi/asm/Kbuild b/arch/x86/include/uapi/asm/Kbuild
2727     index 3dec769cadf7..1c532b3f18ea 100644
2728     --- a/arch/x86/include/uapi/asm/Kbuild
2729     +++ b/arch/x86/include/uapi/asm/Kbuild
2730     @@ -27,7 +27,6 @@ header-y += ldt.h
2731     header-y += mce.h
2732     header-y += mman.h
2733     header-y += msgbuf.h
2734     -header-y += msr-index.h
2735     header-y += msr.h
2736     header-y += mtrr.h
2737     header-y += param.h
2738     diff --git a/arch/x86/include/uapi/asm/mce.h b/arch/x86/include/uapi/asm/mce.h
2739     index 69a6e07e3149..db7dae58745f 100644
2740     --- a/arch/x86/include/uapi/asm/mce.h
2741     +++ b/arch/x86/include/uapi/asm/mce.h
2742     @@ -28,6 +28,8 @@ struct mce {
2743     __u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */
2744     __u64 synd; /* MCA_SYND MSR: only valid on SMCA systems */
2745     __u64 ipid; /* MCA_IPID MSR: only valid on SMCA systems */
2746     + __u64 ppin; /* Protected Processor Inventory Number */
2747     + __u32 microcode;/* Microcode revision */
2748     };
2749    
2750     #define MCE_GET_RECORD_LEN _IOR('M', 1, int)
2751     diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
2752     index 6221166e3fca..16970c39baea 100644
2753     --- a/arch/x86/kernel/cpu/bugs.c
2754     +++ b/arch/x86/kernel/cpu/bugs.c
2755     @@ -13,6 +13,7 @@
2756     #include <linux/module.h>
2757     #include <linux/nospec.h>
2758     #include <linux/prctl.h>
2759     +#include <linux/sched/smt.h>
2760    
2761     #include <asm/spec-ctrl.h>
2762     #include <asm/cmdline.h>
2763     @@ -24,6 +25,7 @@
2764     #include <asm/vmx.h>
2765     #include <asm/paravirt.h>
2766     #include <asm/alternative.h>
2767     +#include <asm/hypervisor.h>
2768     #include <asm/pgtable.h>
2769     #include <asm/cacheflush.h>
2770     #include <asm/intel-family.h>
2771     @@ -32,13 +34,12 @@
2772     static void __init spectre_v2_select_mitigation(void);
2773     static void __init ssb_select_mitigation(void);
2774     static void __init l1tf_select_mitigation(void);
2775     +static void __init mds_select_mitigation(void);
2776    
2777     -/*
2778     - * Our boot-time value of the SPEC_CTRL MSR. We read it once so that any
2779     - * writes to SPEC_CTRL contain whatever reserved bits have been set.
2780     - */
2781     -u64 __ro_after_init x86_spec_ctrl_base;
2782     +/* The base value of the SPEC_CTRL MSR that always has to be preserved. */
2783     +u64 x86_spec_ctrl_base;
2784     EXPORT_SYMBOL_GPL(x86_spec_ctrl_base);
2785     +static DEFINE_MUTEX(spec_ctrl_mutex);
2786    
2787     /*
2788     * The vendor and possibly platform specific bits which can be modified in
2789     @@ -53,6 +54,20 @@ static u64 __ro_after_init x86_spec_ctrl_mask = SPEC_CTRL_IBRS;
2790     u64 __ro_after_init x86_amd_ls_cfg_base;
2791     u64 __ro_after_init x86_amd_ls_cfg_ssbd_mask;
2792    
2793     +/* Control conditional STIBP in switch_to() */
2794     +DEFINE_STATIC_KEY_FALSE(switch_to_cond_stibp);
2795     +/* Control conditional IBPB in switch_mm() */
2796     +DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
2797     +/* Control unconditional IBPB in switch_mm() */
2798     +DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
2799     +
2800     +/* Control MDS CPU buffer clear before returning to user space */
2801     +DEFINE_STATIC_KEY_FALSE(mds_user_clear);
2802     +EXPORT_SYMBOL_GPL(mds_user_clear);
2803     +/* Control MDS CPU buffer clear before idling (halt, mwait) */
2804     +DEFINE_STATIC_KEY_FALSE(mds_idle_clear);
2805     +EXPORT_SYMBOL_GPL(mds_idle_clear);
2806     +
2807     void __init check_bugs(void)
2808     {
2809     identify_boot_cpu();
2810     @@ -91,6 +106,10 @@ void __init check_bugs(void)
2811    
2812     l1tf_select_mitigation();
2813    
2814     + mds_select_mitigation();
2815     +
2816     + arch_smt_update();
2817     +
2818     #ifdef CONFIG_X86_32
2819     /*
2820     * Check whether we are able to run this kernel safely on SMP.
2821     @@ -123,31 +142,6 @@ void __init check_bugs(void)
2822     #endif
2823     }
2824    
2825     -/* The kernel command line selection */
2826     -enum spectre_v2_mitigation_cmd {
2827     - SPECTRE_V2_CMD_NONE,
2828     - SPECTRE_V2_CMD_AUTO,
2829     - SPECTRE_V2_CMD_FORCE,
2830     - SPECTRE_V2_CMD_RETPOLINE,
2831     - SPECTRE_V2_CMD_RETPOLINE_GENERIC,
2832     - SPECTRE_V2_CMD_RETPOLINE_AMD,
2833     -};
2834     -
2835     -static const char *spectre_v2_strings[] = {
2836     - [SPECTRE_V2_NONE] = "Vulnerable",
2837     - [SPECTRE_V2_RETPOLINE_MINIMAL] = "Vulnerable: Minimal generic ASM retpoline",
2838     - [SPECTRE_V2_RETPOLINE_MINIMAL_AMD] = "Vulnerable: Minimal AMD ASM retpoline",
2839     - [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline",
2840     - [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline",
2841     - [SPECTRE_V2_IBRS_ENHANCED] = "Mitigation: Enhanced IBRS",
2842     -};
2843     -
2844     -#undef pr_fmt
2845     -#define pr_fmt(fmt) "Spectre V2 : " fmt
2846     -
2847     -static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init =
2848     - SPECTRE_V2_NONE;
2849     -
2850     void
2851     x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest)
2852     {
2853     @@ -165,9 +159,14 @@ x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest)
2854     guestval |= guest_spec_ctrl & x86_spec_ctrl_mask;
2855    
2856     /* SSBD controlled in MSR_SPEC_CTRL */
2857     - if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD))
2858     + if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
2859     + static_cpu_has(X86_FEATURE_AMD_SSBD))
2860     hostval |= ssbd_tif_to_spec_ctrl(ti->flags);
2861    
2862     + /* Conditional STIBP enabled? */
2863     + if (static_branch_unlikely(&switch_to_cond_stibp))
2864     + hostval |= stibp_tif_to_spec_ctrl(ti->flags);
2865     +
2866     if (hostval != guestval) {
2867     msrval = setguest ? guestval : hostval;
2868     wrmsrl(MSR_IA32_SPEC_CTRL, msrval);
2869     @@ -201,7 +200,7 @@ x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest)
2870     tif = setguest ? ssbd_spec_ctrl_to_tif(guestval) :
2871     ssbd_spec_ctrl_to_tif(hostval);
2872    
2873     - speculative_store_bypass_update(tif);
2874     + speculation_ctrl_update(tif);
2875     }
2876     }
2877     EXPORT_SYMBOL_GPL(x86_virt_spec_ctrl);
2878     @@ -216,6 +215,70 @@ static void x86_amd_ssb_disable(void)
2879     wrmsrl(MSR_AMD64_LS_CFG, msrval);
2880     }
2881    
2882     +#undef pr_fmt
2883     +#define pr_fmt(fmt) "MDS: " fmt
2884     +
2885     +/* Default mitigation for MDS-affected CPUs */
2886     +static enum mds_mitigations mds_mitigation __ro_after_init = MDS_MITIGATION_FULL;
2887     +static bool mds_nosmt __ro_after_init = false;
2888     +
2889     +static const char * const mds_strings[] = {
2890     + [MDS_MITIGATION_OFF] = "Vulnerable",
2891     + [MDS_MITIGATION_FULL] = "Mitigation: Clear CPU buffers",
2892     + [MDS_MITIGATION_VMWERV] = "Vulnerable: Clear CPU buffers attempted, no microcode",
2893     +};
2894     +
2895     +static void __init mds_select_mitigation(void)
2896     +{
2897     + if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) {
2898     + mds_mitigation = MDS_MITIGATION_OFF;
2899     + return;
2900     + }
2901     +
2902     + if (mds_mitigation == MDS_MITIGATION_FULL) {
2903     + if (!boot_cpu_has(X86_FEATURE_MD_CLEAR))
2904     + mds_mitigation = MDS_MITIGATION_VMWERV;
2905     +
2906     + static_branch_enable(&mds_user_clear);
2907     +
2908     + if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) &&
2909     + (mds_nosmt || cpu_mitigations_auto_nosmt()))
2910     + cpu_smt_disable(false);
2911     + }
2912     +
2913     + pr_info("%s\n", mds_strings[mds_mitigation]);
2914     +}
2915     +
2916     +static int __init mds_cmdline(char *str)
2917     +{
2918     + if (!boot_cpu_has_bug(X86_BUG_MDS))
2919     + return 0;
2920     +
2921     + if (!str)
2922     + return -EINVAL;
2923     +
2924     + if (!strcmp(str, "off"))
2925     + mds_mitigation = MDS_MITIGATION_OFF;
2926     + else if (!strcmp(str, "full"))
2927     + mds_mitigation = MDS_MITIGATION_FULL;
2928     + else if (!strcmp(str, "full,nosmt")) {
2929     + mds_mitigation = MDS_MITIGATION_FULL;
2930     + mds_nosmt = true;
2931     + }
2932     +
2933     + return 0;
2934     +}
2935     +early_param("mds", mds_cmdline);
2936     +
2937     +#undef pr_fmt
2938     +#define pr_fmt(fmt) "Spectre V2 : " fmt
2939     +
2940     +static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init =
2941     + SPECTRE_V2_NONE;
2942     +
2943     +static enum spectre_v2_user_mitigation spectre_v2_user __ro_after_init =
2944     + SPECTRE_V2_USER_NONE;
2945     +
2946     #ifdef RETPOLINE
2947     static bool spectre_v2_bad_module;
2948    
2949     @@ -237,67 +300,225 @@ static inline const char *spectre_v2_module_string(void)
2950     static inline const char *spectre_v2_module_string(void) { return ""; }
2951     #endif
2952    
2953     -static void __init spec2_print_if_insecure(const char *reason)
2954     +static inline bool match_option(const char *arg, int arglen, const char *opt)
2955     {
2956     - if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
2957     - pr_info("%s selected on command line.\n", reason);
2958     + int len = strlen(opt);
2959     +
2960     + return len == arglen && !strncmp(arg, opt, len);
2961     }
2962    
2963     -static void __init spec2_print_if_secure(const char *reason)
2964     +/* The kernel command line selection for spectre v2 */
2965     +enum spectre_v2_mitigation_cmd {
2966     + SPECTRE_V2_CMD_NONE,
2967     + SPECTRE_V2_CMD_AUTO,
2968     + SPECTRE_V2_CMD_FORCE,
2969     + SPECTRE_V2_CMD_RETPOLINE,
2970     + SPECTRE_V2_CMD_RETPOLINE_GENERIC,
2971     + SPECTRE_V2_CMD_RETPOLINE_AMD,
2972     +};
2973     +
2974     +enum spectre_v2_user_cmd {
2975     + SPECTRE_V2_USER_CMD_NONE,
2976     + SPECTRE_V2_USER_CMD_AUTO,
2977     + SPECTRE_V2_USER_CMD_FORCE,
2978     + SPECTRE_V2_USER_CMD_PRCTL,
2979     + SPECTRE_V2_USER_CMD_PRCTL_IBPB,
2980     + SPECTRE_V2_USER_CMD_SECCOMP,
2981     + SPECTRE_V2_USER_CMD_SECCOMP_IBPB,
2982     +};
2983     +
2984     +static const char * const spectre_v2_user_strings[] = {
2985     + [SPECTRE_V2_USER_NONE] = "User space: Vulnerable",
2986     + [SPECTRE_V2_USER_STRICT] = "User space: Mitigation: STIBP protection",
2987     + [SPECTRE_V2_USER_PRCTL] = "User space: Mitigation: STIBP via prctl",
2988     + [SPECTRE_V2_USER_SECCOMP] = "User space: Mitigation: STIBP via seccomp and prctl",
2989     +};
2990     +
2991     +static const struct {
2992     + const char *option;
2993     + enum spectre_v2_user_cmd cmd;
2994     + bool secure;
2995     +} v2_user_options[] __initconst = {
2996     + { "auto", SPECTRE_V2_USER_CMD_AUTO, false },
2997     + { "off", SPECTRE_V2_USER_CMD_NONE, false },
2998     + { "on", SPECTRE_V2_USER_CMD_FORCE, true },
2999     + { "prctl", SPECTRE_V2_USER_CMD_PRCTL, false },
3000     + { "prctl,ibpb", SPECTRE_V2_USER_CMD_PRCTL_IBPB, false },
3001     + { "seccomp", SPECTRE_V2_USER_CMD_SECCOMP, false },
3002     + { "seccomp,ibpb", SPECTRE_V2_USER_CMD_SECCOMP_IBPB, false },
3003     +};
3004     +
3005     +static void __init spec_v2_user_print_cond(const char *reason, bool secure)
3006     {
3007     - if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
3008     - pr_info("%s selected on command line.\n", reason);
3009     + if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) != secure)
3010     + pr_info("spectre_v2_user=%s forced on command line.\n", reason);
3011     }
3012    
3013     -static inline bool retp_compiler(void)
3014     +static enum spectre_v2_user_cmd __init
3015     +spectre_v2_parse_user_cmdline(enum spectre_v2_mitigation_cmd v2_cmd)
3016     {
3017     - return __is_defined(RETPOLINE);
3018     + char arg[20];
3019     + int ret, i;
3020     +
3021     + switch (v2_cmd) {
3022     + case SPECTRE_V2_CMD_NONE:
3023     + return SPECTRE_V2_USER_CMD_NONE;
3024     + case SPECTRE_V2_CMD_FORCE:
3025     + return SPECTRE_V2_USER_CMD_FORCE;
3026     + default:
3027     + break;
3028     + }
3029     +
3030     + ret = cmdline_find_option(boot_command_line, "spectre_v2_user",
3031     + arg, sizeof(arg));
3032     + if (ret < 0)
3033     + return SPECTRE_V2_USER_CMD_AUTO;
3034     +
3035     + for (i = 0; i < ARRAY_SIZE(v2_user_options); i++) {
3036     + if (match_option(arg, ret, v2_user_options[i].option)) {
3037     + spec_v2_user_print_cond(v2_user_options[i].option,
3038     + v2_user_options[i].secure);
3039     + return v2_user_options[i].cmd;
3040     + }
3041     + }
3042     +
3043     + pr_err("Unknown user space protection option (%s). Switching to AUTO select\n", arg);
3044     + return SPECTRE_V2_USER_CMD_AUTO;
3045     }
3046    
3047     -static inline bool match_option(const char *arg, int arglen, const char *opt)
3048     +static void __init
3049     +spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
3050     {
3051     - int len = strlen(opt);
3052     + enum spectre_v2_user_mitigation mode = SPECTRE_V2_USER_NONE;
3053     + bool smt_possible = IS_ENABLED(CONFIG_SMP);
3054     + enum spectre_v2_user_cmd cmd;
3055    
3056     - return len == arglen && !strncmp(arg, opt, len);
3057     + if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP))
3058     + return;
3059     +
3060     + if (cpu_smt_control == CPU_SMT_FORCE_DISABLED ||
3061     + cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
3062     + smt_possible = false;
3063     +
3064     + cmd = spectre_v2_parse_user_cmdline(v2_cmd);
3065     + switch (cmd) {
3066     + case SPECTRE_V2_USER_CMD_NONE:
3067     + goto set_mode;
3068     + case SPECTRE_V2_USER_CMD_FORCE:
3069     + mode = SPECTRE_V2_USER_STRICT;
3070     + break;
3071     + case SPECTRE_V2_USER_CMD_PRCTL:
3072     + case SPECTRE_V2_USER_CMD_PRCTL_IBPB:
3073     + mode = SPECTRE_V2_USER_PRCTL;
3074     + break;
3075     + case SPECTRE_V2_USER_CMD_AUTO:
3076     + case SPECTRE_V2_USER_CMD_SECCOMP:
3077     + case SPECTRE_V2_USER_CMD_SECCOMP_IBPB:
3078     + if (IS_ENABLED(CONFIG_SECCOMP))
3079     + mode = SPECTRE_V2_USER_SECCOMP;
3080     + else
3081     + mode = SPECTRE_V2_USER_PRCTL;
3082     + break;
3083     + }
3084     +
3085     + /* Initialize Indirect Branch Prediction Barrier */
3086     + if (boot_cpu_has(X86_FEATURE_IBPB)) {
3087     + setup_force_cpu_cap(X86_FEATURE_USE_IBPB);
3088     +
3089     + switch (cmd) {
3090     + case SPECTRE_V2_USER_CMD_FORCE:
3091     + case SPECTRE_V2_USER_CMD_PRCTL_IBPB:
3092     + case SPECTRE_V2_USER_CMD_SECCOMP_IBPB:
3093     + static_branch_enable(&switch_mm_always_ibpb);
3094     + break;
3095     + case SPECTRE_V2_USER_CMD_PRCTL:
3096     + case SPECTRE_V2_USER_CMD_AUTO:
3097     + case SPECTRE_V2_USER_CMD_SECCOMP:
3098     + static_branch_enable(&switch_mm_cond_ibpb);
3099     + break;
3100     + default:
3101     + break;
3102     + }
3103     +
3104     + pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n",
3105     + static_key_enabled(&switch_mm_always_ibpb) ?
3106     + "always-on" : "conditional");
3107     + }
3108     +
3109     + /* If enhanced IBRS is enabled no STIBP required */
3110     + if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
3111     + return;
3112     +
3113     + /*
3114     + * If SMT is not possible or STIBP is not available clear the STIBP
3115     + * mode.
3116     + */
3117     + if (!smt_possible || !boot_cpu_has(X86_FEATURE_STIBP))
3118     + mode = SPECTRE_V2_USER_NONE;
3119     +set_mode:
3120     + spectre_v2_user = mode;
3121     + /* Only print the STIBP mode when SMT possible */
3122     + if (smt_possible)
3123     + pr_info("%s\n", spectre_v2_user_strings[mode]);
3124     }
3125    
3126     +static const char * const spectre_v2_strings[] = {
3127     + [SPECTRE_V2_NONE] = "Vulnerable",
3128     + [SPECTRE_V2_RETPOLINE_MINIMAL] = "Vulnerable: Minimal generic ASM retpoline",
3129     + [SPECTRE_V2_RETPOLINE_MINIMAL_AMD] = "Vulnerable: Minimal AMD ASM retpoline",
3130     + [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline",
3131     + [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline",
3132     + [SPECTRE_V2_IBRS_ENHANCED] = "Mitigation: Enhanced IBRS",
3133     +};
3134     +
3135     static const struct {
3136     const char *option;
3137     enum spectre_v2_mitigation_cmd cmd;
3138     bool secure;
3139     -} mitigation_options[] = {
3140     - { "off", SPECTRE_V2_CMD_NONE, false },
3141     - { "on", SPECTRE_V2_CMD_FORCE, true },
3142     - { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false },
3143     - { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_AMD, false },
3144     - { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false },
3145     - { "auto", SPECTRE_V2_CMD_AUTO, false },
3146     +} mitigation_options[] __initconst = {
3147     + { "off", SPECTRE_V2_CMD_NONE, false },
3148     + { "on", SPECTRE_V2_CMD_FORCE, true },
3149     + { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false },
3150     + { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_AMD, false },
3151     + { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false },
3152     + { "auto", SPECTRE_V2_CMD_AUTO, false },
3153     };
3154    
3155     +static void __init spec_v2_print_cond(const char *reason, bool secure)
3156     +{
3157     + if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) != secure)
3158     + pr_info("%s selected on command line.\n", reason);
3159     +}
3160     +
3161     +static inline bool retp_compiler(void)
3162     +{
3163     + return __is_defined(RETPOLINE);
3164     +}
3165     +
3166     static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
3167     {
3168     + enum spectre_v2_mitigation_cmd cmd = SPECTRE_V2_CMD_AUTO;
3169     char arg[20];
3170     int ret, i;
3171     - enum spectre_v2_mitigation_cmd cmd = SPECTRE_V2_CMD_AUTO;
3172    
3173     - if (cmdline_find_option_bool(boot_command_line, "nospectre_v2"))
3174     + if (cmdline_find_option_bool(boot_command_line, "nospectre_v2") ||
3175     + cpu_mitigations_off())
3176     return SPECTRE_V2_CMD_NONE;
3177     - else {
3178     - ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg));
3179     - if (ret < 0)
3180     - return SPECTRE_V2_CMD_AUTO;
3181    
3182     - for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) {
3183     - if (!match_option(arg, ret, mitigation_options[i].option))
3184     - continue;
3185     - cmd = mitigation_options[i].cmd;
3186     - break;
3187     - }
3188     + ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg));
3189     + if (ret < 0)
3190     + return SPECTRE_V2_CMD_AUTO;
3191    
3192     - if (i >= ARRAY_SIZE(mitigation_options)) {
3193     - pr_err("unknown option (%s). Switching to AUTO select\n", arg);
3194     - return SPECTRE_V2_CMD_AUTO;
3195     - }
3196     + for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) {
3197     + if (!match_option(arg, ret, mitigation_options[i].option))
3198     + continue;
3199     + cmd = mitigation_options[i].cmd;
3200     + break;
3201     + }
3202     +
3203     + if (i >= ARRAY_SIZE(mitigation_options)) {
3204     + pr_err("unknown option (%s). Switching to AUTO select\n", arg);
3205     + return SPECTRE_V2_CMD_AUTO;
3206     }
3207    
3208     if ((cmd == SPECTRE_V2_CMD_RETPOLINE ||
3209     @@ -314,11 +535,8 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
3210     return SPECTRE_V2_CMD_AUTO;
3211     }
3212    
3213     - if (mitigation_options[i].secure)
3214     - spec2_print_if_secure(mitigation_options[i].option);
3215     - else
3216     - spec2_print_if_insecure(mitigation_options[i].option);
3217     -
3218     + spec_v2_print_cond(mitigation_options[i].option,
3219     + mitigation_options[i].secure);
3220     return cmd;
3221     }
3222    
3223     @@ -400,12 +618,6 @@ specv2_set_mode:
3224     setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
3225     pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n");
3226    
3227     - /* Initialize Indirect Branch Prediction Barrier if supported */
3228     - if (boot_cpu_has(X86_FEATURE_IBPB)) {
3229     - setup_force_cpu_cap(X86_FEATURE_USE_IBPB);
3230     - pr_info("Spectre v2 mitigation: Enabling Indirect Branch Prediction Barrier\n");
3231     - }
3232     -
3233     /*
3234     * Retpoline means the kernel is safe because it has no indirect
3235     * branches. Enhanced IBRS protects firmware too, so, enable restricted
3236     @@ -421,6 +633,99 @@ specv2_set_mode:
3237     setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW);
3238     pr_info("Enabling Restricted Speculation for firmware calls\n");
3239     }
3240     +
3241     + /* Set up IBPB and STIBP depending on the general spectre V2 command */
3242     + spectre_v2_user_select_mitigation(cmd);
3243     +}
3244     +
3245     +static void update_stibp_msr(void * __unused)
3246     +{
3247     + wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
3248     +}
3249     +
3250     +/* Update x86_spec_ctrl_base in case SMT state changed. */
3251     +static void update_stibp_strict(void)
3252     +{
3253     + u64 mask = x86_spec_ctrl_base & ~SPEC_CTRL_STIBP;
3254     +
3255     + if (sched_smt_active())
3256     + mask |= SPEC_CTRL_STIBP;
3257     +
3258     + if (mask == x86_spec_ctrl_base)
3259     + return;
3260     +
3261     + pr_info("Update user space SMT mitigation: STIBP %s\n",
3262     + mask & SPEC_CTRL_STIBP ? "always-on" : "off");
3263     + x86_spec_ctrl_base = mask;
3264     + on_each_cpu(update_stibp_msr, NULL, 1);
3265     +}
3266     +
3267     +/* Update the static key controlling the evaluation of TIF_SPEC_IB */
3268     +static void update_indir_branch_cond(void)
3269     +{
3270     + if (sched_smt_active())
3271     + static_branch_enable(&switch_to_cond_stibp);
3272     + else
3273     + static_branch_disable(&switch_to_cond_stibp);
3274     +}
3275     +
3276     +#undef pr_fmt
3277     +#define pr_fmt(fmt) fmt
3278     +
3279     +/* Update the static key controlling the MDS CPU buffer clear in idle */
3280     +static void update_mds_branch_idle(void)
3281     +{
3282     + /*
3283     + * Enable the idle clearing if SMT is active on CPUs which are
3284     + * affected only by MSBDS and not any other MDS variant.
3285     + *
3286     + * The other variants cannot be mitigated when SMT is enabled, so
3287     + * clearing the buffers on idle just to prevent the Store Buffer
3288     + * repartitioning leak would be a window dressing exercise.
3289     + */
3290     + if (!boot_cpu_has_bug(X86_BUG_MSBDS_ONLY))
3291     + return;
3292     +
3293     + if (sched_smt_active())
3294     + static_branch_enable(&mds_idle_clear);
3295     + else
3296     + static_branch_disable(&mds_idle_clear);
3297     +}
3298     +
3299     +#define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n"
3300     +
3301     +void arch_smt_update(void)
3302     +{
3303     + /* Enhanced IBRS implies STIBP. No update required. */
3304     + if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
3305     + return;
3306     +
3307     + mutex_lock(&spec_ctrl_mutex);
3308     +
3309     + switch (spectre_v2_user) {
3310     + case SPECTRE_V2_USER_NONE:
3311     + break;
3312     + case SPECTRE_V2_USER_STRICT:
3313     + update_stibp_strict();
3314     + break;
3315     + case SPECTRE_V2_USER_PRCTL:
3316     + case SPECTRE_V2_USER_SECCOMP:
3317     + update_indir_branch_cond();
3318     + break;
3319     + }
3320     +
3321     + switch (mds_mitigation) {
3322     + case MDS_MITIGATION_FULL:
3323     + case MDS_MITIGATION_VMWERV:
3324     + if (sched_smt_active() && !boot_cpu_has(X86_BUG_MSBDS_ONLY))
3325     + pr_warn_once(MDS_MSG_SMT);
3326     + update_mds_branch_idle();
3327     + break;
3328     + case MDS_MITIGATION_OFF:
3329     + break;
3330     + }
3331     +
3332     + mutex_unlock(&spec_ctrl_mutex);
3333     }
3334    
3335     #undef pr_fmt
3336     @@ -437,7 +742,7 @@ enum ssb_mitigation_cmd {
3337     SPEC_STORE_BYPASS_CMD_SECCOMP,
3338     };
3339    
3340     -static const char *ssb_strings[] = {
3341     +static const char * const ssb_strings[] = {
3342     [SPEC_STORE_BYPASS_NONE] = "Vulnerable",
3343     [SPEC_STORE_BYPASS_DISABLE] = "Mitigation: Speculative Store Bypass disabled",
3344     [SPEC_STORE_BYPASS_PRCTL] = "Mitigation: Speculative Store Bypass disabled via prctl",
3345     @@ -447,7 +752,7 @@ static const char *ssb_strings[] = {
3346     static const struct {
3347     const char *option;
3348     enum ssb_mitigation_cmd cmd;
3349     -} ssb_mitigation_options[] = {
3350     +} ssb_mitigation_options[] __initconst = {
3351     { "auto", SPEC_STORE_BYPASS_CMD_AUTO }, /* Platform decides */
3352     { "on", SPEC_STORE_BYPASS_CMD_ON }, /* Disable Speculative Store Bypass */
3353     { "off", SPEC_STORE_BYPASS_CMD_NONE }, /* Don't touch Speculative Store Bypass */
3354     @@ -461,7 +766,8 @@ static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void)
3355     char arg[20];
3356     int ret, i;
3357    
3358     - if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable")) {
3359     + if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable") ||
3360     + cpu_mitigations_off()) {
3361     return SPEC_STORE_BYPASS_CMD_NONE;
3362     } else {
3363     ret = cmdline_find_option(boot_command_line, "spec_store_bypass_disable",
3364     @@ -531,18 +837,16 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void)
3365     if (mode == SPEC_STORE_BYPASS_DISABLE) {
3366     setup_force_cpu_cap(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE);
3367     /*
3368     - * Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD uses
3369     - * a completely different MSR and bit dependent on family.
3370     + * Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD may
3371     + * use a completely different MSR and bit dependent on family.
3372     */
3373     - switch (boot_cpu_data.x86_vendor) {
3374     - case X86_VENDOR_INTEL:
3375     + if (!static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) &&
3376     + !static_cpu_has(X86_FEATURE_AMD_SSBD)) {
3377     + x86_amd_ssb_disable();
3378     + } else {
3379     x86_spec_ctrl_base |= SPEC_CTRL_SSBD;
3380     x86_spec_ctrl_mask |= SPEC_CTRL_SSBD;
3381     wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
3382     - break;
3383     - case X86_VENDOR_AMD:
3384     - x86_amd_ssb_disable();
3385     - break;
3386     }
3387     }
3388    
3389     @@ -560,10 +864,25 @@ static void ssb_select_mitigation(void)
3390     #undef pr_fmt
3391     #define pr_fmt(fmt) "Speculation prctl: " fmt
3392    
3393     -static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl)
3394     +static void task_update_spec_tif(struct task_struct *tsk)
3395     {
3396     - bool update;
3397     + /* Force the update of the real TIF bits */
3398     + set_tsk_thread_flag(tsk, TIF_SPEC_FORCE_UPDATE);
3399    
3400     + /*
3401     + * Immediately update the speculation control MSRs for the current
3402     + * task, but for a non-current task delay setting the CPU
3403     + * mitigation until it is scheduled next.
3404     + *
3405     + * This can only happen for SECCOMP mitigation. For PRCTL it's
3406     + * always the current task.
3407     + */
3408     + if (tsk == current)
3409     + speculation_ctrl_update_current();
3410     +}
3411     +
3412     +static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl)
3413     +{
3414     if (ssb_mode != SPEC_STORE_BYPASS_PRCTL &&
3415     ssb_mode != SPEC_STORE_BYPASS_SECCOMP)
3416     return -ENXIO;
3417     @@ -574,28 +893,56 @@ static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl)
3418     if (task_spec_ssb_force_disable(task))
3419     return -EPERM;
3420     task_clear_spec_ssb_disable(task);
3421     - update = test_and_clear_tsk_thread_flag(task, TIF_SSBD);
3422     + task_update_spec_tif(task);
3423     break;
3424     case PR_SPEC_DISABLE:
3425     task_set_spec_ssb_disable(task);
3426     - update = !test_and_set_tsk_thread_flag(task, TIF_SSBD);
3427     + task_update_spec_tif(task);
3428     break;
3429     case PR_SPEC_FORCE_DISABLE:
3430     task_set_spec_ssb_disable(task);
3431     task_set_spec_ssb_force_disable(task);
3432     - update = !test_and_set_tsk_thread_flag(task, TIF_SSBD);
3433     + task_update_spec_tif(task);
3434     break;
3435     default:
3436     return -ERANGE;
3437     }
3438     + return 0;
3439     +}
3440    
3441     - /*
3442     - * If being set on non-current task, delay setting the CPU
3443     - * mitigation until it is next scheduled.
3444     - */
3445     - if (task == current && update)
3446     - speculative_store_bypass_update_current();
3447     -
3448     +static int ib_prctl_set(struct task_struct *task, unsigned long ctrl)
3449     +{
3450     + switch (ctrl) {
3451     + case PR_SPEC_ENABLE:
3452     + if (spectre_v2_user == SPECTRE_V2_USER_NONE)
3453     + return 0;
3454     + /*
3455     + * Indirect branch speculation is always disabled in strict
3456     + * mode.
3457     + */
3458     + if (spectre_v2_user == SPECTRE_V2_USER_STRICT)
3459     + return -EPERM;
3460     + task_clear_spec_ib_disable(task);
3461     + task_update_spec_tif(task);
3462     + break;
3463     + case PR_SPEC_DISABLE:
3464     + case PR_SPEC_FORCE_DISABLE:
3465     + /*
3466     + * Indirect branch speculation is always allowed when
3467     + * mitigation is force disabled.
3468     + */
3469     + if (spectre_v2_user == SPECTRE_V2_USER_NONE)
3470     + return -EPERM;
3471     + if (spectre_v2_user == SPECTRE_V2_USER_STRICT)
3472     + return 0;
3473     + task_set_spec_ib_disable(task);
3474     + if (ctrl == PR_SPEC_FORCE_DISABLE)
3475     + task_set_spec_ib_force_disable(task);
3476     + task_update_spec_tif(task);
3477     + break;
3478     + default:
3479     + return -ERANGE;
3480     + }
3481     return 0;
3482     }
3483    
3484     @@ -605,6 +952,8 @@ int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which,
3485     switch (which) {
3486     case PR_SPEC_STORE_BYPASS:
3487     return ssb_prctl_set(task, ctrl);
3488     + case PR_SPEC_INDIRECT_BRANCH:
3489     + return ib_prctl_set(task, ctrl);
3490     default:
3491     return -ENODEV;
3492     }
3493     @@ -615,6 +964,8 @@ void arch_seccomp_spec_mitigate(struct task_struct *task)
3494     {
3495     if (ssb_mode == SPEC_STORE_BYPASS_SECCOMP)
3496     ssb_prctl_set(task, PR_SPEC_FORCE_DISABLE);
3497     + if (spectre_v2_user == SPECTRE_V2_USER_SECCOMP)
3498     + ib_prctl_set(task, PR_SPEC_FORCE_DISABLE);
3499     }
3500     #endif
3501    
3502     @@ -637,11 +988,35 @@ static int ssb_prctl_get(struct task_struct *task)
3503     }
3504     }
3505    
3506     +static int ib_prctl_get(struct task_struct *task)
3507     +{
3508     + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
3509     + return PR_SPEC_NOT_AFFECTED;
3510     +
3511     + switch (spectre_v2_user) {
3512     + case SPECTRE_V2_USER_NONE:
3513     + return PR_SPEC_ENABLE;
3514     + case SPECTRE_V2_USER_PRCTL:
3515     + case SPECTRE_V2_USER_SECCOMP:
3516     + if (task_spec_ib_force_disable(task))
3517     + return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE;
3518     + if (task_spec_ib_disable(task))
3519     + return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
3520     + return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
3521     + case SPECTRE_V2_USER_STRICT:
3522     + return PR_SPEC_DISABLE;
3523     + default:
3524     + return PR_SPEC_NOT_AFFECTED;
3525     + }
3526     +}
3527     +
3528     int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which)
3529     {
3530     switch (which) {
3531     case PR_SPEC_STORE_BYPASS:
3532     return ssb_prctl_get(task);
3533     + case PR_SPEC_INDIRECT_BRANCH:
3534     + return ib_prctl_get(task);
3535     default:
3536     return -ENODEV;
3537     }
3538     @@ -713,6 +1088,11 @@ static void __init l1tf_select_mitigation(void)
3539     if (!boot_cpu_has_bug(X86_BUG_L1TF))
3540     return;
3541    
3542     + if (cpu_mitigations_off())
3543     + l1tf_mitigation = L1TF_MITIGATION_OFF;
3544     + else if (cpu_mitigations_auto_nosmt())
3545     + l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT;
3546     +
3547     override_cache_bits(&boot_cpu_data);
3548    
3549     switch (l1tf_mitigation) {
3550     @@ -735,12 +1115,13 @@ static void __init l1tf_select_mitigation(void)
3551     #endif
3552    
3553     half_pa = (u64)l1tf_pfn_limit() << PAGE_SHIFT;
3554     - if (e820_any_mapped(half_pa, ULLONG_MAX - half_pa, E820_RAM)) {
3555     + if (l1tf_mitigation != L1TF_MITIGATION_OFF &&
3556     + e820_any_mapped(half_pa, ULLONG_MAX - half_pa, E820_RAM)) {
3557     pr_warn("System has more than MAX_PA/2 memory. L1TF mitigation not effective.\n");
3558     pr_info("You may make it effective by booting the kernel with mem=%llu parameter.\n",
3559     half_pa);
3560     pr_info("However, doing so will make a part of your RAM unusable.\n");
3561     - pr_info("Reading https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html might help you decide.\n");
3562     + pr_info("Reading https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html might help you decide.\n");
3563     return;
3564     }
3565    
3566     @@ -773,13 +1154,14 @@ static int __init l1tf_cmdline(char *str)
3567     early_param("l1tf", l1tf_cmdline);
3568    
3569     #undef pr_fmt
3570     +#define pr_fmt(fmt) fmt
3571    
3572     #ifdef CONFIG_SYSFS
3573    
3574     #define L1TF_DEFAULT_MSG "Mitigation: PTE Inversion"
3575    
3576     #if IS_ENABLED(CONFIG_KVM_INTEL)
3577     -static const char *l1tf_vmx_states[] = {
3578     +static const char * const l1tf_vmx_states[] = {
3579     [VMENTER_L1D_FLUSH_AUTO] = "auto",
3580     [VMENTER_L1D_FLUSH_NEVER] = "vulnerable",
3581     [VMENTER_L1D_FLUSH_COND] = "conditional cache flushes",
3582     @@ -795,13 +1177,14 @@ static ssize_t l1tf_show_state(char *buf)
3583    
3584     if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_EPT_DISABLED ||
3585     (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER &&
3586     - cpu_smt_control == CPU_SMT_ENABLED))
3587     + sched_smt_active())) {
3588     return sprintf(buf, "%s; VMX: %s\n", L1TF_DEFAULT_MSG,
3589     l1tf_vmx_states[l1tf_vmx_mitigation]);
3590     + }
3591    
3592     return sprintf(buf, "%s; VMX: %s, SMT %s\n", L1TF_DEFAULT_MSG,
3593     l1tf_vmx_states[l1tf_vmx_mitigation],
3594     - cpu_smt_control == CPU_SMT_ENABLED ? "vulnerable" : "disabled");
3595     + sched_smt_active() ? "vulnerable" : "disabled");
3596     }
3597     #else
3598     static ssize_t l1tf_show_state(char *buf)
3599     @@ -810,6 +1193,55 @@ static ssize_t l1tf_show_state(char *buf)
3600     }
3601     #endif
3602    
3603     +static ssize_t mds_show_state(char *buf)
3604     +{
3605     +#ifdef CONFIG_HYPERVISOR_GUEST
3606     + if (x86_hyper) {
3607     + return sprintf(buf, "%s; SMT Host state unknown\n",
3608     + mds_strings[mds_mitigation]);
3609     + }
3610     +#endif
3611     +
3612     + if (boot_cpu_has(X86_BUG_MSBDS_ONLY)) {
3613     + return sprintf(buf, "%s; SMT %s\n", mds_strings[mds_mitigation],
3614     + (mds_mitigation == MDS_MITIGATION_OFF ? "vulnerable" :
3615     + sched_smt_active() ? "mitigated" : "disabled"));
3616     + }
3617     +
3618     + return sprintf(buf, "%s; SMT %s\n", mds_strings[mds_mitigation],
3619     + sched_smt_active() ? "vulnerable" : "disabled");
3620     +}
3621     +
3622     +static char *stibp_state(void)
3623     +{
3624     + if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
3625     + return "";
3626     +
3627     + switch (spectre_v2_user) {
3628     + case SPECTRE_V2_USER_NONE:
3629     + return ", STIBP: disabled";
3630     + case SPECTRE_V2_USER_STRICT:
3631     + return ", STIBP: forced";
3632     + case SPECTRE_V2_USER_PRCTL:
3633     + case SPECTRE_V2_USER_SECCOMP:
3634     + if (static_key_enabled(&switch_to_cond_stibp))
3635     + return ", STIBP: conditional";
3636     + }
3637     + return "";
3638     +}
3639     +
3640     +static char *ibpb_state(void)
3641     +{
3642     + if (boot_cpu_has(X86_FEATURE_IBPB)) {
3643     + if (static_key_enabled(&switch_mm_always_ibpb))
3644     + return ", IBPB: always-on";
3645     + if (static_key_enabled(&switch_mm_cond_ibpb))
3646     + return ", IBPB: conditional";
3647     + return ", IBPB: disabled";
3648     + }
3649     + return "";
3650     +}
3651     +
3652     static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
3653     char *buf, unsigned int bug)
3654     {
3655     @@ -827,9 +1259,11 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
3656     return sprintf(buf, "Mitigation: __user pointer sanitization\n");
3657    
3658     case X86_BUG_SPECTRE_V2:
3659     - return sprintf(buf, "%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
3660     - boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "",
3661     + return sprintf(buf, "%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
3662     + ibpb_state(),
3663     boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "",
3664     + stibp_state(),
3665     + boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "",
3666     spectre_v2_module_string());
3667    
3668     case X86_BUG_SPEC_STORE_BYPASS:
3669     @@ -839,6 +1273,10 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
3670     if (boot_cpu_has(X86_FEATURE_L1TF_PTEINV))
3671     return l1tf_show_state(buf);
3672     break;
3673     +
3674     + case X86_BUG_MDS:
3675     + return mds_show_state(buf);
3676     +
3677     default:
3678     break;
3679     }
3680     @@ -870,4 +1308,9 @@ ssize_t cpu_show_l1tf(struct device *dev, struct device_attribute *attr, char *b
3681     {
3682     return cpu_show_common(dev, attr, buf, X86_BUG_L1TF);
3683     }
3684     +
3685     +ssize_t cpu_show_mds(struct device *dev, struct device_attribute *attr, char *buf)
3686     +{
3687     + return cpu_show_common(dev, attr, buf, X86_BUG_MDS);
3688     +}
3689     #endif
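The ib_prctl_set()/ib_prctl_get() hooks and cpu_show_mds() added above are what user space sees through prctl() and sysfs. A hedged user-space sketch (the fallback constants mirror the uapi prctl.h values; the sysfs node only exists on kernels carrying this series, and PR_SPEC_DISABLE only succeeds in the prctl/seccomp modes):

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_GET_SPECULATION_CTRL
#define PR_GET_SPECULATION_CTRL	52
#define PR_SET_SPECULATION_CTRL	53
#endif
#ifndef PR_SPEC_INDIRECT_BRANCH
#define PR_SPEC_INDIRECT_BRANCH	1
#endif
#ifndef PR_SPEC_PRCTL
#define PR_SPEC_PRCTL		(1UL << 0)
#define PR_SPEC_ENABLE		(1UL << 1)
#define PR_SPEC_DISABLE		(1UL << 2)
#define PR_SPEC_FORCE_DISABLE	(1UL << 3)
#endif

int main(void)
{
	char line[128];
	FILE *f;
	int state;

	/* MDS status as rendered by cpu_show_mds()/mds_show_state(). */
	f = fopen("/sys/devices/system/cpu/vulnerabilities/mds", "r");
	if (f) {
		if (fgets(line, sizeof(line), f))
			printf("mds: %s", line);
		fclose(f);
	}

	/* Query, then opt this task out of cross-task indirect branch
	 * speculation when the kernel allows per-task control. */
	state = prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, 0, 0, 0);
	printf("indirect branch speculation state: %#x\n", state);

	if (state > 0 && (state & PR_SPEC_PRCTL)) {
		if (prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH,
			  PR_SPEC_DISABLE, 0, 0))
			perror("PR_SET_SPECULATION_CTRL");
		else
			puts("STIBP protection requested for this task");
	}
	return 0;
}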
3690     diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
3691     index 3c01610c5ba9..cda130dc56b9 100644
3692     --- a/arch/x86/kernel/cpu/common.c
3693     +++ b/arch/x86/kernel/cpu/common.c
3694     @@ -752,6 +752,12 @@ static void init_speculation_control(struct cpuinfo_x86 *c)
3695     set_cpu_cap(c, X86_FEATURE_STIBP);
3696     set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL);
3697     }
3698     +
3699     + if (cpu_has(c, X86_FEATURE_AMD_SSBD)) {
3700     + set_cpu_cap(c, X86_FEATURE_SSBD);
3701     + set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL);
3702     + clear_cpu_cap(c, X86_FEATURE_VIRT_SSBD);
3703     + }
3704     }
3705    
3706     void get_cpu_cap(struct cpuinfo_x86 *c)
3707     @@ -885,84 +891,95 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
3708     c->x86_cache_bits = c->x86_phys_bits;
3709     }
3710    
3711     -static const __initconst struct x86_cpu_id cpu_no_speculation[] = {
3712     - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CEDARVIEW, X86_FEATURE_ANY },
3713     - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CLOVERVIEW, X86_FEATURE_ANY },
3714     - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_LINCROFT, X86_FEATURE_ANY },
3715     - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PENWELL, X86_FEATURE_ANY },
3716     - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PINEVIEW, X86_FEATURE_ANY },
3717     - { X86_VENDOR_CENTAUR, 5 },
3718     - { X86_VENDOR_INTEL, 5 },
3719     - { X86_VENDOR_NSC, 5 },
3720     - { X86_VENDOR_ANY, 4 },
3721     - {}
3722     -};
3723     +#define NO_SPECULATION BIT(0)
3724     +#define NO_MELTDOWN BIT(1)
3725     +#define NO_SSB BIT(2)
3726     +#define NO_L1TF BIT(3)
3727     +#define NO_MDS BIT(4)
3728     +#define MSBDS_ONLY BIT(5)
3729    
3730     -static const __initconst struct x86_cpu_id cpu_no_meltdown[] = {
3731     - { X86_VENDOR_AMD },
3732     - {}
3733     -};
3734     +#define VULNWL(_vendor, _family, _model, _whitelist) \
3735     + { X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist }
3736    
3737     -static const __initconst struct x86_cpu_id cpu_no_spec_store_bypass[] = {
3738     - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PINEVIEW },
3739     - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_LINCROFT },
3740     - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PENWELL },
3741     - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CLOVERVIEW },
3742     - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CEDARVIEW },
3743     - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 },
3744     - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT },
3745     - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT2 },
3746     - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MERRIFIELD },
3747     - { X86_VENDOR_INTEL, 6, INTEL_FAM6_CORE_YONAH },
3748     - { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL },
3749     - { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM },
3750     - { X86_VENDOR_CENTAUR, 5, },
3751     - { X86_VENDOR_INTEL, 5, },
3752     - { X86_VENDOR_NSC, 5, },
3753     - { X86_VENDOR_AMD, 0x12, },
3754     - { X86_VENDOR_AMD, 0x11, },
3755     - { X86_VENDOR_AMD, 0x10, },
3756     - { X86_VENDOR_AMD, 0xf, },
3757     - { X86_VENDOR_ANY, 4, },
3758     - {}
3759     -};
3760     +#define VULNWL_INTEL(model, whitelist) \
3761     + VULNWL(INTEL, 6, INTEL_FAM6_##model, whitelist)
3762     +
3763     +#define VULNWL_AMD(family, whitelist) \
3764     + VULNWL(AMD, family, X86_MODEL_ANY, whitelist)
3765     +
3766     +static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
3767     + VULNWL(ANY, 4, X86_MODEL_ANY, NO_SPECULATION),
3768     + VULNWL(CENTAUR, 5, X86_MODEL_ANY, NO_SPECULATION),
3769     + VULNWL(INTEL, 5, X86_MODEL_ANY, NO_SPECULATION),
3770     + VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION),
3771     +
3772     + /* Intel Family 6 */
3773     + VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION),
3774     + VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION),
3775     + VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION),
3776     + VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION),
3777     + VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION),
3778     +
3779     + VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY),
3780     + VULNWL_INTEL(ATOM_SILVERMONT_X, NO_SSB | NO_L1TF | MSBDS_ONLY),
3781     + VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY),
3782     + VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY),
3783     + VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY),
3784     + VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY),
3785     +
3786     + VULNWL_INTEL(CORE_YONAH, NO_SSB),
3787     +
3788     + VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY),
3789     +
3790     + VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF),
3791     + VULNWL_INTEL(ATOM_GOLDMONT_X, NO_MDS | NO_L1TF),
3792     + VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF),
3793    
3794     -static const __initconst struct x86_cpu_id cpu_no_l1tf[] = {
3795     - /* in addition to cpu_no_speculation */
3796     - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 },
3797     - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT2 },
3798     - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT },
3799     - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MERRIFIELD },
3800     - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MOOREFIELD },
3801     - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT },
3802     - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_DENVERTON },
3803     - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GEMINI_LAKE },
3804     - { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL },
3805     - { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM },
3806     + /* AMD Family 0xf - 0x12 */
3807     + VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS),
3808     + VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS),
3809     + VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS),
3810     + VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS),
3811     +
3812     + /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */
3813     + VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS),
3814     {}
3815     };
3816    
3817     -static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
3818     +static bool __init cpu_matches(unsigned long which)
3819     {
3820     - u64 ia32_cap = 0;
3821     + const struct x86_cpu_id *m = x86_match_cpu(cpu_vuln_whitelist);
3822    
3823     - if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES))
3824     - rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
3825     + return m && !!(m->driver_data & which);
3826     +}
3827    
3828     - if (!x86_match_cpu(cpu_no_spec_store_bypass) &&
3829     - !(ia32_cap & ARCH_CAP_SSB_NO))
3830     - setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS);
3831     +static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
3832     +{
3833     + u64 ia32_cap = 0;
3834    
3835     - if (x86_match_cpu(cpu_no_speculation))
3836     + if (cpu_matches(NO_SPECULATION))
3837     return;
3838    
3839     setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
3840     setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
3841    
3842     + if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES))
3843     + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
3844     +
3845     + if (!cpu_matches(NO_SSB) && !(ia32_cap & ARCH_CAP_SSB_NO) &&
3846     + !cpu_has(c, X86_FEATURE_AMD_SSB_NO))
3847     + setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS);
3848     +
3849     if (ia32_cap & ARCH_CAP_IBRS_ALL)
3850     setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED);
3851    
3852     - if (x86_match_cpu(cpu_no_meltdown))
3853     + if (!cpu_matches(NO_MDS) && !(ia32_cap & ARCH_CAP_MDS_NO)) {
3854     + setup_force_cpu_bug(X86_BUG_MDS);
3855     + if (cpu_matches(MSBDS_ONLY))
3856     + setup_force_cpu_bug(X86_BUG_MSBDS_ONLY);
3857     + }
3858     +
3859     + if (cpu_matches(NO_MELTDOWN))
3860     return;
3861    
3862     /* Rogue Data Cache Load? No! */
3863     @@ -971,7 +988,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
3864    
3865     setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
3866    
3867     - if (x86_match_cpu(cpu_no_l1tf))
3868     + if (cpu_matches(NO_L1TF))
3869     return;
3870    
3871     setup_force_cpu_bug(X86_BUG_L1TF);
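
The rework above folds the old per-issue CPU tables into one whitelist whose driver_data field carries NO_*/MSBDS_ONLY flag bits, tested through cpu_matches(). The standalone sketch below mirrors that lookup pattern in plain C; the fam6_entry type, the table contents and the model numbers are illustrative stand-ins, not the kernel's x86_cpu_id machinery.

#include <stdio.h>

#define BIT(nr)         (1UL << (nr))
#define NO_SPECULATION  BIT(0)
#define NO_MELTDOWN     BIT(1)
#define NO_SSB          BIT(2)
#define NO_L1TF         BIT(3)
#define NO_MDS          BIT(4)
#define MSBDS_ONLY      BIT(5)

/* Hypothetical, simplified stand-in for struct x86_cpu_id. */
struct fam6_entry {
        unsigned int  model;
        unsigned long flags;
};

static const struct fam6_entry whitelist[] = {
        { 0x37 /* e.g. a Silvermont part */, NO_SSB | NO_L1TF | MSBDS_ONLY },
        { 0x5c /* e.g. a Goldmont part   */, NO_MDS | NO_L1TF },
        { 0 }
};

/* Same idea as cpu_matches(): find the entry, then test a flag bit. */
static int matches(unsigned int model, unsigned long which)
{
        for (const struct fam6_entry *e = whitelist; e->model; e++)
                if (e->model == model)
                        return !!(e->flags & which);
        return 0;
}

int main(void)
{
        unsigned int model = 0x5c;

        if (!matches(model, NO_MDS))
                printf("model 0x%x: would set X86_BUG_MDS\n", model);
        else
                printf("model 0x%x: MDS whitelisted\n", model);
        return 0;
}
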
3872     diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
3873     index cee0fec0d232..860f2fd9f540 100644
3874     --- a/arch/x86/kernel/cpu/intel.c
3875     +++ b/arch/x86/kernel/cpu/intel.c
3876     @@ -14,6 +14,7 @@
3877     #include <asm/bugs.h>
3878     #include <asm/cpu.h>
3879     #include <asm/intel-family.h>
3880     +#include <asm/microcode_intel.h>
3881    
3882     #ifdef CONFIG_X86_64
3883     #include <linux/topology.h>
3884     @@ -137,14 +138,8 @@ static void early_init_intel(struct cpuinfo_x86 *c)
3885     (c->x86 == 0x6 && c->x86_model >= 0x0e))
3886     set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
3887    
3888     - if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64)) {
3889     - unsigned lower_word;
3890     -
3891     - wrmsr(MSR_IA32_UCODE_REV, 0, 0);
3892     - /* Required by the SDM */
3893     - sync_core();
3894     - rdmsr(MSR_IA32_UCODE_REV, lower_word, c->microcode);
3895     - }
3896     + if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64))
3897     + c->microcode = intel_get_microcode_revision();
3898    
3899     /* Now if any of them are set, check the blacklist and clear the lot */
3900     if ((cpu_has(c, X86_FEATURE_SPEC_CTRL) ||
3901     diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
3902     index 25310d2b8609..d9ad49ca3cbe 100644
3903     --- a/arch/x86/kernel/cpu/mcheck/mce.c
3904     +++ b/arch/x86/kernel/cpu/mcheck/mce.c
3905     @@ -139,6 +139,8 @@ void mce_setup(struct mce *m)
3906     m->socketid = cpu_data(m->extcpu).phys_proc_id;
3907     m->apicid = cpu_data(m->extcpu).initial_apicid;
3908     rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
3909     +
3910     + m->microcode = boot_cpu_data.microcode;
3911     }
3912    
3913     DEFINE_PER_CPU(struct mce, injectm);
3914     @@ -309,7 +311,7 @@ static void print_mce(struct mce *m)
3915     */
3916     pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
3917     m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
3918     - cpu_data(m->extcpu).microcode);
3919     + m->microcode);
3920    
3921     pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
3922     }
3923     diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
3924     index 732bb03fcf91..a19fddfb6bf8 100644
3925     --- a/arch/x86/kernel/cpu/microcode/amd.c
3926     +++ b/arch/x86/kernel/cpu/microcode/amd.c
3927     @@ -707,22 +707,26 @@ int apply_microcode_amd(int cpu)
3928     return -1;
3929    
3930     /* need to apply patch? */
3931     - if (rev >= mc_amd->hdr.patch_id) {
3932     - c->microcode = rev;
3933     - uci->cpu_sig.rev = rev;
3934     - return 0;
3935     - }
3936     + if (rev >= mc_amd->hdr.patch_id)
3937     + goto out;
3938    
3939     if (__apply_microcode_amd(mc_amd)) {
3940     pr_err("CPU%d: update failed for patch_level=0x%08x\n",
3941     cpu, mc_amd->hdr.patch_id);
3942     return -1;
3943     }
3944     - pr_info("CPU%d: new patch_level=0x%08x\n", cpu,
3945     - mc_amd->hdr.patch_id);
3946    
3947     - uci->cpu_sig.rev = mc_amd->hdr.patch_id;
3948     - c->microcode = mc_amd->hdr.patch_id;
3949     + rev = mc_amd->hdr.patch_id;
3950     +
3951     + pr_info("CPU%d: new patch_level=0x%08x\n", cpu, rev);
3952     +
3953     +out:
3954     + uci->cpu_sig.rev = rev;
3955     + c->microcode = rev;
3956     +
3957     + /* Update boot_cpu_data's revision too, if we're on the BSP: */
3958     + if (c->cpu_index == boot_cpu_data.cpu_index)
3959     + boot_cpu_data.microcode = rev;
3960    
3961     return 0;
3962     }
3963     diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
3964     index 79291d6fb301..1308abfc4758 100644
3965     --- a/arch/x86/kernel/cpu/microcode/intel.c
3966     +++ b/arch/x86/kernel/cpu/microcode/intel.c
3967     @@ -386,15 +386,8 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci)
3968     native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
3969     csig.pf = 1 << ((val[1] >> 18) & 7);
3970     }
3971     - native_wrmsrl(MSR_IA32_UCODE_REV, 0);
3972    
3973     - /* As documented in the SDM: Do a CPUID 1 here */
3974     - sync_core();
3975     -
3976     - /* get the current revision from MSR 0x8B */
3977     - native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
3978     -
3979     - csig.rev = val[1];
3980     + csig.rev = intel_get_microcode_revision();
3981    
3982     uci->cpu_sig = csig;
3983     uci->valid = 1;
3984     @@ -618,29 +611,35 @@ static inline void print_ucode(struct ucode_cpu_info *uci)
3985     static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
3986     {
3987     struct microcode_intel *mc;
3988     - unsigned int val[2];
3989     + u32 rev;
3990    
3991     mc = uci->mc;
3992     if (!mc)
3993     return 0;
3994    
3995     + /*
3996     + * Save us the MSR write below - which is a particularly expensive
3997     + * operation - when the other hyperthread has updated the microcode
3998     + * already.
3999     + */
4000     + rev = intel_get_microcode_revision();
4001     + if (rev >= mc->hdr.rev) {
4002     + uci->cpu_sig.rev = rev;
4003     + return 0;
4004     + }
4005     +
4006     /* write microcode via MSR 0x79 */
4007     native_wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits);
4008     - native_wrmsrl(MSR_IA32_UCODE_REV, 0);
4009     -
4010     - /* As documented in the SDM: Do a CPUID 1 here */
4011     - sync_core();
4012    
4013     - /* get the current revision from MSR 0x8B */
4014     - native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
4015     - if (val[1] != mc->hdr.rev)
4016     + rev = intel_get_microcode_revision();
4017     + if (rev != mc->hdr.rev)
4018     return -1;
4019    
4020     #ifdef CONFIG_X86_64
4021     /* Flush global tlb. This is precaution. */
4022     flush_tlb_early();
4023     #endif
4024     - uci->cpu_sig.rev = val[1];
4025     + uci->cpu_sig.rev = rev;
4026    
4027     if (early)
4028     print_ucode(uci);
4029     @@ -903,9 +902,9 @@ static int apply_microcode_intel(int cpu)
4030     {
4031     struct microcode_intel *mc;
4032     struct ucode_cpu_info *uci;
4033     - struct cpuinfo_x86 *c;
4034     - unsigned int val[2];
4035     + struct cpuinfo_x86 *c = &cpu_data(cpu);
4036     static int prev_rev;
4037     + u32 rev;
4038    
4039     /* We should bind the task to the CPU */
4040     if (WARN_ON(raw_smp_processor_id() != cpu))
4041     @@ -924,35 +923,42 @@ static int apply_microcode_intel(int cpu)
4042     if (!get_matching_mc(mc, cpu))
4043     return 0;
4044    
4045     + /*
4046     + * Save us the MSR write below - which is a particularly expensive
4047     + * operation - when the other hyperthread has updated the microcode
4048     + * already.
4049     + */
4050     + rev = intel_get_microcode_revision();
4051     + if (rev >= mc->hdr.rev)
4052     + goto out;
4053     +
4054     /* write microcode via MSR 0x79 */
4055     wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits);
4056     - wrmsrl(MSR_IA32_UCODE_REV, 0);
4057    
4058     - /* As documented in the SDM: Do a CPUID 1 here */
4059     - sync_core();
4060     + rev = intel_get_microcode_revision();
4061    
4062     - /* get the current revision from MSR 0x8B */
4063     - rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
4064     -
4065     - if (val[1] != mc->hdr.rev) {
4066     + if (rev != mc->hdr.rev) {
4067     pr_err("CPU%d update to revision 0x%x failed\n",
4068     cpu, mc->hdr.rev);
4069     return -1;
4070     }
4071    
4072     - if (val[1] != prev_rev) {
4073     + if (rev != prev_rev) {
4074     pr_info("updated to revision 0x%x, date = %04x-%02x-%02x\n",
4075     - val[1],
4076     + rev,
4077     mc->hdr.date & 0xffff,
4078     mc->hdr.date >> 24,
4079     (mc->hdr.date >> 16) & 0xff);
4080     - prev_rev = val[1];
4081     + prev_rev = rev;
4082     }
4083    
4084     - c = &cpu_data(cpu);
4085     +out:
4086     + uci->cpu_sig.rev = rev;
4087     + c->microcode = rev;
4088    
4089     - uci->cpu_sig.rev = val[1];
4090     - c->microcode = val[1];
4091     + /* Update boot_cpu_data's revision too, if we're on the BSP: */
4092     + if (c->cpu_index == boot_cpu_data.cpu_index)
4093     + boot_cpu_data.microcode = rev;
4094    
4095     return 0;
4096     }
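
Both Intel loader paths now read the running revision first and skip the costly update when it is already at least the revision in the patch header, and the BSP copies the result into boot_cpu_data. As a rough userspace counterpart, the sketch below prints the revision the kernel exports through sysfs; the path is an assumption that holds when the microcode driver (CONFIG_MICROCODE) is enabled and is not itself part of this patch.

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
        const char *path = "/sys/devices/system/cpu/cpu0/microcode/version";
        char buf[32];
        FILE *f = fopen(path, "r");

        if (!f || !fgets(buf, sizeof(buf), f)) {
                perror(path);
                return EXIT_FAILURE;
        }
        fclose(f);

        /* The file contains a hex revision, e.g. "0xb4". */
        printf("CPU0 microcode revision: %s", buf);
        return EXIT_SUCCESS;
}
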
4097     diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
4098     index bfe4d6c96fbd..6b7b35d80264 100644
4099     --- a/arch/x86/kernel/nmi.c
4100     +++ b/arch/x86/kernel/nmi.c
4101     @@ -32,6 +32,7 @@
4102     #include <asm/x86_init.h>
4103     #include <asm/reboot.h>
4104     #include <asm/cache.h>
4105     +#include <asm/nospec-branch.h>
4106    
4107     #define CREATE_TRACE_POINTS
4108     #include <trace/events/nmi.h>
4109     @@ -544,6 +545,9 @@ nmi_restart:
4110     write_cr2(this_cpu_read(nmi_cr2));
4111     if (this_cpu_dec_return(nmi_state))
4112     goto nmi_restart;
4113     +
4114     + if (user_mode(regs))
4115     + mds_user_clear_cpu_buffers();
4116     }
4117     NOKPROBE_SYMBOL(do_nmi);
4118    
4119     diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
4120     index 00a9047539d7..2e4eab22ca37 100644
4121     --- a/arch/x86/kernel/process.c
4122     +++ b/arch/x86/kernel/process.c
4123     @@ -35,6 +35,8 @@
4124     #include <asm/switch_to.h>
4125     #include <asm/spec-ctrl.h>
4126    
4127     +#include "process.h"
4128     +
4129     /*
4130     * per-CPU TSS segments. Threads are completely 'soft' on Linux,
4131     * no more per-task TSS's. The TSS size is kept cacheline-aligned
4132     @@ -183,11 +185,12 @@ int set_tsc_mode(unsigned int val)
4133     return 0;
4134     }
4135    
4136     -static inline void switch_to_bitmap(struct tss_struct *tss,
4137     - struct thread_struct *prev,
4138     +static inline void switch_to_bitmap(struct thread_struct *prev,
4139     struct thread_struct *next,
4140     unsigned long tifp, unsigned long tifn)
4141     {
4142     + struct tss_struct *tss = this_cpu_ptr(&cpu_tss);
4143     +
4144     if (tifn & _TIF_IO_BITMAP) {
4145     /*
4146     * Copy the relevant range of the IO bitmap.
4147     @@ -321,32 +324,85 @@ static __always_inline void amd_set_ssb_virt_state(unsigned long tifn)
4148     wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, ssbd_tif_to_spec_ctrl(tifn));
4149     }
4150    
4151     -static __always_inline void intel_set_ssb_state(unsigned long tifn)
4152     +/*
4153     + * Update the MSRs managing speculation control, during context switch.
4154     + *
4155     + * tifp: Previous task's thread flags
4156     + * tifn: Next task's thread flags
4157     + */
4158     +static __always_inline void __speculation_ctrl_update(unsigned long tifp,
4159     + unsigned long tifn)
4160     {
4161     - u64 msr = x86_spec_ctrl_base | ssbd_tif_to_spec_ctrl(tifn);
4162     + unsigned long tif_diff = tifp ^ tifn;
4163     + u64 msr = x86_spec_ctrl_base;
4164     + bool updmsr = false;
4165     +
4166     + /*
4167     + * If TIF_SSBD is different, select the proper mitigation
4168     + * method. Note that if SSBD mitigation is disabled or permanently
4169     + * enabled this branch can't be taken because nothing can set
4170     + * TIF_SSBD.
4171     + */
4172     + if (tif_diff & _TIF_SSBD) {
4173     + if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) {
4174     + amd_set_ssb_virt_state(tifn);
4175     + } else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) {
4176     + amd_set_core_ssb_state(tifn);
4177     + } else if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
4178     + static_cpu_has(X86_FEATURE_AMD_SSBD)) {
4179     + msr |= ssbd_tif_to_spec_ctrl(tifn);
4180     + updmsr = true;
4181     + }
4182     + }
4183     +
4184     + /*
4185     + * Only evaluate TIF_SPEC_IB if conditional STIBP is enabled,
4186     + * otherwise avoid the MSR write.
4187     + */
4188     + if (IS_ENABLED(CONFIG_SMP) &&
4189     + static_branch_unlikely(&switch_to_cond_stibp)) {
4190     + updmsr |= !!(tif_diff & _TIF_SPEC_IB);
4191     + msr |= stibp_tif_to_spec_ctrl(tifn);
4192     + }
4193    
4194     - wrmsrl(MSR_IA32_SPEC_CTRL, msr);
4195     + if (updmsr)
4196     + wrmsrl(MSR_IA32_SPEC_CTRL, msr);
4197     }
4198    
4199     -static __always_inline void __speculative_store_bypass_update(unsigned long tifn)
4200     +static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk)
4201     {
4202     - if (static_cpu_has(X86_FEATURE_VIRT_SSBD))
4203     - amd_set_ssb_virt_state(tifn);
4204     - else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD))
4205     - amd_set_core_ssb_state(tifn);
4206     - else
4207     - intel_set_ssb_state(tifn);
4208     + if (test_and_clear_tsk_thread_flag(tsk, TIF_SPEC_FORCE_UPDATE)) {
4209     + if (task_spec_ssb_disable(tsk))
4210     + set_tsk_thread_flag(tsk, TIF_SSBD);
4211     + else
4212     + clear_tsk_thread_flag(tsk, TIF_SSBD);
4213     +
4214     + if (task_spec_ib_disable(tsk))
4215     + set_tsk_thread_flag(tsk, TIF_SPEC_IB);
4216     + else
4217     + clear_tsk_thread_flag(tsk, TIF_SPEC_IB);
4218     + }
4219     + /* Return the updated threadinfo flags */
4220     + return task_thread_info(tsk)->flags;
4221     }
4222    
4223     -void speculative_store_bypass_update(unsigned long tif)
4224     +void speculation_ctrl_update(unsigned long tif)
4225     {
4226     + /* Forced update. Make sure all relevant TIF flags are different */
4227     preempt_disable();
4228     - __speculative_store_bypass_update(tif);
4229     + __speculation_ctrl_update(~tif, tif);
4230     preempt_enable();
4231     }
4232    
4233     -void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
4234     - struct tss_struct *tss)
4235     +/* Called from seccomp/prctl update */
4236     +void speculation_ctrl_update_current(void)
4237     +{
4238     + preempt_disable();
4239     + speculation_ctrl_update(speculation_ctrl_update_tif(current));
4240     + preempt_enable();
4241     +}
4242     +
4243     +void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
4244     {
4245     struct thread_struct *prev, *next;
4246     unsigned long tifp, tifn;
4247     @@ -356,7 +412,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
4248    
4249     tifn = READ_ONCE(task_thread_info(next_p)->flags);
4250     tifp = READ_ONCE(task_thread_info(prev_p)->flags);
4251     - switch_to_bitmap(tss, prev, next, tifp, tifn);
4252     + switch_to_bitmap(prev, next, tifp, tifn);
4253    
4254     propagate_user_return_notify(prev_p, next_p);
4255    
4256     @@ -374,8 +430,15 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
4257     if ((tifp ^ tifn) & _TIF_NOTSC)
4258     cr4_toggle_bits(X86_CR4_TSD);
4259    
4260     - if ((tifp ^ tifn) & _TIF_SSBD)
4261     - __speculative_store_bypass_update(tifn);
4262     + if (likely(!((tifp | tifn) & _TIF_SPEC_FORCE_UPDATE))) {
4263     + __speculation_ctrl_update(tifp, tifn);
4264     + } else {
4265     + speculation_ctrl_update_tif(prev_p);
4266     + tifn = speculation_ctrl_update_tif(next_p);
4267     +
4268     + /* Enforce MSR update to ensure consistent state */
4269     + __speculation_ctrl_update(~tifn, tifn);
4270     + }
4271     }
4272    
4273     /*
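
The hunk above replaces the single SSBD toggle with __speculation_ctrl_update(), which XORs the previous and next thread flags and touches MSR_IA32_SPEC_CTRL only when a speculation-relevant bit actually changed; the forced path passes ~tif so every bit appears changed. A minimal sketch of that decision, with made-up TIF_* bit values:

#include <stdio.h>

/* Hypothetical stand-ins for the TIF_* bits; the values are illustrative. */
#define TIF_SSBD        (1UL << 5)
#define TIF_SPEC_IB     (1UL << 9)

/* Only "write the MSR" when a relevant flag differs between prev and next. */
static int msr_write_needed(unsigned long tifp, unsigned long tifn)
{
        unsigned long tif_diff = tifp ^ tifn;

        return !!(tif_diff & (TIF_SSBD | TIF_SPEC_IB));
}

int main(void)
{
        unsigned long prev = TIF_SSBD;
        unsigned long next = TIF_SSBD;           /* unchanged -> no write */

        printf("same flags : %d\n", msr_write_needed(prev, next));
        printf("forced     : %d\n", msr_write_needed(~next, next));
        return 0;
}
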
4274     diff --git a/arch/x86/kernel/process.h b/arch/x86/kernel/process.h
4275     new file mode 100644
4276     index 000000000000..898e97cf6629
4277     --- /dev/null
4278     +++ b/arch/x86/kernel/process.h
4279     @@ -0,0 +1,39 @@
4280     +// SPDX-License-Identifier: GPL-2.0
4281     +//
4282     +// Code shared between 32 and 64 bit
4283     +
4284     +#include <asm/spec-ctrl.h>
4285     +
4286     +void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p);
4287     +
4288     +/*
4289     + * This needs to be inline to optimize for the common case where no extra
4290     + * work needs to be done.
4291     + */
4292     +static inline void switch_to_extra(struct task_struct *prev,
4293     + struct task_struct *next)
4294     +{
4295     + unsigned long next_tif = task_thread_info(next)->flags;
4296     + unsigned long prev_tif = task_thread_info(prev)->flags;
4297     +
4298     + if (IS_ENABLED(CONFIG_SMP)) {
4299     + /*
4300     + * Avoid __switch_to_xtra() invocation when conditional
4301     + * STIBP is disabled and the only different bit is
4302     + * TIF_SPEC_IB. For CONFIG_SMP=n TIF_SPEC_IB is not
4303     + * in the TIF_WORK_CTXSW masks.
4304     + */
4305     + if (!static_branch_likely(&switch_to_cond_stibp)) {
4306     + prev_tif &= ~_TIF_SPEC_IB;
4307     + next_tif &= ~_TIF_SPEC_IB;
4308     + }
4309     + }
4310     +
4311     + /*
4312     + * __switch_to_xtra() handles debug registers, i/o bitmaps,
4313     + * speculation mitigations etc.
4314     + */
4315     + if (unlikely(next_tif & _TIF_WORK_CTXSW_NEXT ||
4316     + prev_tif & _TIF_WORK_CTXSW_PREV))
4317     + __switch_to_xtra(prev, next);
4318     +}
4319     diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
4320     index bd7be8efdc4c..912246fd6cd9 100644
4321     --- a/arch/x86/kernel/process_32.c
4322     +++ b/arch/x86/kernel/process_32.c
4323     @@ -55,6 +55,8 @@
4324     #include <asm/switch_to.h>
4325     #include <asm/vm86.h>
4326    
4327     +#include "process.h"
4328     +
4329     void __show_regs(struct pt_regs *regs, int all)
4330     {
4331     unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
4332     @@ -264,12 +266,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
4333     if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
4334     set_iopl_mask(next->iopl);
4335    
4336     - /*
4337     - * Now maybe handle debug registers and/or IO bitmaps
4338     - */
4339     - if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
4340     - task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
4341     - __switch_to_xtra(prev_p, next_p, tss);
4342     + switch_to_extra(prev_p, next_p);
4343    
4344     /*
4345     * Leave lazy mode, flushing any hypercalls made here.
4346     diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
4347     index a2661814bde0..81eec65fe053 100644
4348     --- a/arch/x86/kernel/process_64.c
4349     +++ b/arch/x86/kernel/process_64.c
4350     @@ -51,6 +51,8 @@
4351     #include <asm/xen/hypervisor.h>
4352     #include <asm/vdso.h>
4353    
4354     +#include "process.h"
4355     +
4356     __visible DEFINE_PER_CPU(unsigned long, rsp_scratch);
4357    
4358     /* Prints also some state that isn't saved in the pt_regs */
4359     @@ -454,12 +456,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
4360     /* Reload esp0 and ss1. This changes current_thread_info(). */
4361     load_sp0(tss, next);
4362    
4363     - /*
4364     - * Now maybe reload the debug registers and handle I/O bitmaps
4365     - */
4366     - if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
4367     - task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
4368     - __switch_to_xtra(prev_p, next_p, tss);
4369     + switch_to_extra(prev_p, next_p);
4370    
4371     #ifdef CONFIG_XEN
4372     /*
4373     diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
4374     index 5bbfa2f63b8c..ef225fa8e928 100644
4375     --- a/arch/x86/kernel/traps.c
4376     +++ b/arch/x86/kernel/traps.c
4377     @@ -62,6 +62,7 @@
4378     #include <asm/alternative.h>
4379     #include <asm/fpu/xstate.h>
4380     #include <asm/trace/mpx.h>
4381     +#include <asm/nospec-branch.h>
4382     #include <asm/mpx.h>
4383     #include <asm/vm86.h>
4384    
4385     @@ -340,6 +341,13 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
4386     regs->ip = (unsigned long)general_protection;
4387     regs->sp = (unsigned long)&normal_regs->orig_ax;
4388    
4389     + /*
4390     + * This situation can be triggered by userspace via
4391     + * modify_ldt(2) and the return does not take the regular
4392     + * user space exit, so a CPU buffer clear is required when
4393     + * MDS mitigation is enabled.
4394     + */
4395     + mds_user_clear_cpu_buffers();
4396     return;
4397     }
4398     #endif
4399     diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
4400     index 769c370011d6..cb768417429d 100644
4401     --- a/arch/x86/kernel/tsc.c
4402     +++ b/arch/x86/kernel/tsc.c
4403     @@ -713,7 +713,7 @@ unsigned long native_calibrate_tsc(void)
4404     case INTEL_FAM6_KABYLAKE_DESKTOP:
4405     crystal_khz = 24000; /* 24.0 MHz */
4406     break;
4407     - case INTEL_FAM6_ATOM_DENVERTON:
4408     + case INTEL_FAM6_ATOM_GOLDMONT_X:
4409     crystal_khz = 25000; /* 25.0 MHz */
4410     break;
4411     case INTEL_FAM6_ATOM_GOLDMONT:
4412     diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
4413     index c17d3893ae60..fc8236fd2495 100644
4414     --- a/arch/x86/kvm/cpuid.c
4415     +++ b/arch/x86/kvm/cpuid.c
4416     @@ -355,7 +355,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
4417    
4418     /* cpuid 0x80000008.ebx */
4419     const u32 kvm_cpuid_8000_0008_ebx_x86_features =
4420     - F(AMD_IBPB) | F(AMD_IBRS) | F(VIRT_SSBD);
4421     + F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) |
4422     + F(AMD_SSB_NO) | F(AMD_STIBP);
4423    
4424     /* cpuid 0xC0000001.edx */
4425     const u32 kvm_cpuid_C000_0001_edx_x86_features =
4426     @@ -380,7 +381,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
4427    
4428     /* cpuid 7.0.edx*/
4429     const u32 kvm_cpuid_7_0_edx_x86_features =
4430     - F(SPEC_CTRL) | F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES);
4431     + F(SPEC_CTRL) | F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) |
4432     + F(INTEL_STIBP) | F(MD_CLEAR);
4433    
4434     /* all calls to cpuid_count() should be made on the same cpu */
4435     get_cpu();
4436     @@ -633,7 +635,12 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
4437     entry->ebx |= F(VIRT_SSBD);
4438     entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features;
4439     cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX);
4440     - if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD))
4441     + /*
4442     + * The preference is to use SPEC CTRL MSR instead of the
4443     + * VIRT_SPEC MSR.
4444     + */
4445     + if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) &&
4446     + !boot_cpu_has(X86_FEATURE_AMD_SSBD))
4447     entry->ebx |= F(VIRT_SSBD);
4448     break;
4449     }
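
The widened kvm_cpuid_*_features masks are simply OR'd feature bits that cap what a guest may see; the guest-visible leaf is the host's CPUID value ANDed with that policy. A small illustration with stand-in bit values (the FEAT_* names and positions are assumptions for this sketch):

#include <stdio.h>
#include <stdint.h>

#define FEAT_AMD_IBPB   (1u << 12)
#define FEAT_AMD_IBRS   (1u << 14)
#define FEAT_AMD_SSBD   (1u << 24)
#define FEAT_VIRT_SSBD  (1u << 25)

int main(void)
{
        /* What the hypervisor is willing to expose (the OR'd F() list). */
        uint32_t allowed = FEAT_AMD_IBPB | FEAT_AMD_IBRS |
                           FEAT_AMD_SSBD | FEAT_VIRT_SSBD;

        /* Pretend this came from the host's real CPUID 0x80000008 EBX. */
        uint32_t host_ebx = FEAT_AMD_IBPB | FEAT_AMD_SSBD;

        /* Same shape as "entry->ebx &= kvm_cpuid_8000_0008_ebx_..." */
        uint32_t guest_ebx = host_ebx & allowed;

        printf("guest sees 0x%08x\n", guest_ebx);
        return 0;
}
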
4450     diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
4451     index 8a841b9d8f84..b2bf8e1d5782 100644
4452     --- a/arch/x86/kvm/cpuid.h
4453     +++ b/arch/x86/kvm/cpuid.h
4454     @@ -176,7 +176,7 @@ static inline bool guest_cpuid_has_spec_ctrl(struct kvm_vcpu *vcpu)
4455     struct kvm_cpuid_entry2 *best;
4456    
4457     best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
4458     - if (best && (best->ebx & bit(X86_FEATURE_AMD_IBRS)))
4459     + if (best && (best->ebx & (bit(X86_FEATURE_AMD_IBRS) | bit(X86_FEATURE_AMD_SSBD))))
4460     return true;
4461     best = kvm_find_cpuid_entry(vcpu, 7, 0);
4462     return best && (best->edx & (bit(X86_FEATURE_SPEC_CTRL) | bit(X86_FEATURE_SPEC_CTRL_SSBD)));
4463     diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
4464     index 9a6d258c3c16..9338136a6a23 100644
4465     --- a/arch/x86/kvm/svm.c
4466     +++ b/arch/x86/kvm/svm.c
4467     @@ -3704,7 +3704,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
4468     return 1;
4469    
4470     /* The STIBP bit doesn't fault even if it's not advertised */
4471     - if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP))
4472     + if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD))
4473     return 1;
4474    
4475     svm->spec_ctrl = data;
4476     diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
4477     index 75466d9417b8..8feb4f7e2e59 100644
4478     --- a/arch/x86/kvm/vmx.c
4479     +++ b/arch/x86/kvm/vmx.c
4480     @@ -9206,8 +9206,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
4481    
4482     vmx->__launched = vmx->loaded_vmcs->launched;
4483    
4484     + /* L1D Flush includes CPU buffer clear to mitigate MDS */
4485     if (static_branch_unlikely(&vmx_l1d_should_flush))
4486     vmx_l1d_flush(vcpu);
4487     + else if (static_branch_unlikely(&mds_user_clear))
4488     + mds_clear_cpu_buffers();
4489    
4490     asm(
4491     /* Store host registers */
4492     @@ -9566,8 +9569,8 @@ free_vcpu:
4493     return ERR_PTR(err);
4494     }
4495    
4496     -#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
4497     -#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
4498     +#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
4499     +#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
4500    
4501     static int vmx_vm_init(struct kvm *kvm)
4502     {
4503     diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
4504     index 90801a8f19c9..ce092a62fc5d 100644
4505     --- a/arch/x86/mm/init.c
4506     +++ b/arch/x86/mm/init.c
4507     @@ -790,7 +790,7 @@ unsigned long max_swapfile_size(void)
4508    
4509     pages = generic_max_swapfile_size();
4510    
4511     - if (boot_cpu_has_bug(X86_BUG_L1TF)) {
4512     + if (boot_cpu_has_bug(X86_BUG_L1TF) && l1tf_mitigation != L1TF_MITIGATION_OFF) {
4513     /* Limit the swap file size to MAX_PA/2 for L1TF workaround */
4514     unsigned long long l1tf_limit = l1tf_pfn_limit();
4515     /*
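
The change above keeps the MAX_PA/2 swap-file clamp but skips it when the L1TF mitigation is globally off. The clamp itself is simple arithmetic, shown here under assumed values (46 physical address bits, 4 KiB pages); the real kernel derives the limit from l1tf_pfn_limit().

#include <stdio.h>

int main(void)
{
        unsigned int phys_bits = 46;     /* assumed physical address width */
        unsigned int page_shift = 12;    /* 4 KiB pages */

        /* MAX_PA/2 expressed in pages, the bound the L1TF workaround uses. */
        unsigned long long limit_pages = 1ULL << (phys_bits - 1 - page_shift);

        printf("swap limited to %llu pages (%llu GiB)\n",
               limit_pages, (limit_pages << page_shift) >> 30);
        return 0;
}
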
4516     diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
4517     index 3f729e20f0e3..12522dbae615 100644
4518     --- a/arch/x86/mm/kaiser.c
4519     +++ b/arch/x86/mm/kaiser.c
4520     @@ -9,6 +9,7 @@
4521     #include <linux/spinlock.h>
4522     #include <linux/mm.h>
4523     #include <linux/uaccess.h>
4524     +#include <linux/cpu.h>
4525    
4526     #undef pr_fmt
4527     #define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt
4528     @@ -297,7 +298,8 @@ void __init kaiser_check_boottime_disable(void)
4529     goto skip;
4530     }
4531    
4532     - if (cmdline_find_option_bool(boot_command_line, "nopti"))
4533     + if (cmdline_find_option_bool(boot_command_line, "nopti") ||
4534     + cpu_mitigations_off())
4535     goto disable;
4536    
4537     skip:
4538     diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
4539     index e30baa8ad94f..dff8ac2d255c 100644
4540     --- a/arch/x86/mm/pgtable.c
4541     +++ b/arch/x86/mm/pgtable.c
4542     @@ -251,7 +251,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
4543     if (pgd_val(pgd) != 0) {
4544     pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
4545    
4546     - pgdp[i] = native_make_pgd(0);
4547     + pgd_clear(&pgdp[i]);
4548    
4549     paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
4550     pmd_free(mm, pmd);
4551     @@ -419,7 +419,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
4552     int changed = !pte_same(*ptep, entry);
4553    
4554     if (changed && dirty) {
4555     - *ptep = entry;
4556     + set_pte(ptep, entry);
4557     pte_update(vma->vm_mm, address, ptep);
4558     }
4559    
4560     @@ -436,7 +436,7 @@ int pmdp_set_access_flags(struct vm_area_struct *vma,
4561     VM_BUG_ON(address & ~HPAGE_PMD_MASK);
4562    
4563     if (changed && dirty) {
4564     - *pmdp = entry;
4565     + set_pmd(pmdp, entry);
4566     /*
4567     * We had a write-protection fault here and changed the pmd
4568     * to be more permissive. No need to flush the TLB for that,
4569     diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
4570     index eac92e2d171b..a112bb175dd4 100644
4571     --- a/arch/x86/mm/tlb.c
4572     +++ b/arch/x86/mm/tlb.c
4573     @@ -30,6 +30,12 @@
4574     * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
4575     */
4576    
4577     +/*
4578     + * Use bit 0 to mangle the TIF_SPEC_IB state into the mm pointer which is
4579     + * stored in cpu_tlb_state.last_user_mm_ibpb.
4580     + * stored in cpu_tlbstate.last_user_mm_ibpb.
4581     +#define LAST_USER_MM_IBPB 0x1UL
4582     +
4583     atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
4584    
4585     struct flush_tlb_info {
4586     @@ -101,33 +107,101 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
4587     local_irq_restore(flags);
4588     }
4589    
4590     +static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next)
4591     +{
4592     + unsigned long next_tif = task_thread_info(next)->flags;
4593     + unsigned long ibpb = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_IBPB;
4594     +
4595     + return (unsigned long)next->mm | ibpb;
4596     +}
4597     +
4598     +static void cond_ibpb(struct task_struct *next)
4599     +{
4600     + if (!next || !next->mm)
4601     + return;
4602     +
4603     + /*
4604     + * Both the conditional and the always IBPB mode use the mm
4605     + * pointer to avoid the IBPB when switching between tasks of the
4606     + * same process. Using the mm pointer instead of mm->context.ctx_id
4607     + * opens a hypothetical hole vs. mm_struct reuse, which is more or
4608     + * less impossible to control by an attacker. Aside from that, it
4609     + * would only affect the first schedule so the theoretically
4610     + * exposed data is not really interesting.
4611     + */
4612     + if (static_branch_likely(&switch_mm_cond_ibpb)) {
4613     + unsigned long prev_mm, next_mm;
4614     +
4615     + /*
4616     + * This is a bit more complex than the always mode because
4617     + * it has to handle two cases:
4618     + *
4619     + * 1) Switch from a user space task (potential attacker)
4620     + * which has TIF_SPEC_IB set to a user space task
4621     + * (potential victim) which has TIF_SPEC_IB not set.
4622     + *
4623     + * 2) Switch from a user space task (potential attacker)
4624     + * which has TIF_SPEC_IB not set to a user space task
4625     + * (potential victim) which has TIF_SPEC_IB set.
4626     + *
4627     + * This could be done by unconditionally issuing IBPB when
4628     + * a task which has TIF_SPEC_IB set is either scheduled in
4629     + * or out. Though that results in two flushes when:
4630     + *
4631     + * - the same user space task is scheduled out and later
4632     + * scheduled in again and only a kernel thread ran in
4633     + * between.
4634     + *
4635     + * - a user space task belonging to the same process is
4636     + * scheduled in after a kernel thread ran in between
4637     + *
4638     + * - a user space task belonging to the same process is
4639     + * scheduled in immediately.
4640     + *
4641     + * Optimize this with reasonably small overhead for the
4642     + * above cases. Mangle the TIF_SPEC_IB bit into the mm
4643     + * pointer of the incoming task which is stored in
4644     + * cpu_tlbstate.last_user_mm_ibpb for comparison.
4645     + */
4646     + next_mm = mm_mangle_tif_spec_ib(next);
4647     + prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_ibpb);
4648     +
4649     + /*
4650     + * Issue IBPB only if the mm's are different and one or
4651     + * both have the IBPB bit set.
4652     + */
4653     + if (next_mm != prev_mm &&
4654     + (next_mm | prev_mm) & LAST_USER_MM_IBPB)
4655     + indirect_branch_prediction_barrier();
4656     +
4657     + this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, next_mm);
4658     + }
4659     +
4660     + if (static_branch_unlikely(&switch_mm_always_ibpb)) {
4661     + /*
4662     + * Only flush when switching to a user space task with a
4663     + * different context than the user space task which ran
4664     + * last on this CPU.
4665     + */
4666     + if (this_cpu_read(cpu_tlbstate.last_user_mm) != next->mm) {
4667     + indirect_branch_prediction_barrier();
4668     + this_cpu_write(cpu_tlbstate.last_user_mm, next->mm);
4669     + }
4670     + }
4671     +}
4672     +
4673     void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
4674     struct task_struct *tsk)
4675     {
4676     unsigned cpu = smp_processor_id();
4677    
4678     if (likely(prev != next)) {
4679     - u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
4680     -
4681     /*
4682     * Avoid user/user BTB poisoning by flushing the branch
4683     * predictor when switching between processes. This stops
4684     * one process from doing Spectre-v2 attacks on another.
4685     - *
4686     - * As an optimization, flush indirect branches only when
4687     - * switching into processes that disable dumping. This
4688     - * protects high value processes like gpg, without having
4689     - * too high performance overhead. IBPB is *expensive*!
4690     - *
4691     - * This will not flush branches when switching into kernel
4692     - * threads. It will also not flush if we switch to idle
4693     - * thread and back to the same process. It will flush if we
4694     - * switch to a different non-dumpable process.
4695     */
4696     - if (tsk && tsk->mm &&
4697     - tsk->mm->context.ctx_id != last_ctx_id &&
4698     - get_dumpable(tsk->mm) != SUID_DUMP_USER)
4699     - indirect_branch_prediction_barrier();
4700     + cond_ibpb(tsk);
4701    
4702     if (IS_ENABLED(CONFIG_VMAP_STACK)) {
4703     /*
4704     @@ -143,14 +217,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
4705     set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
4706     }
4707    
4708     - /*
4709     - * Record last user mm's context id, so we can avoid
4710     - * flushing branch buffer with IBPB if we switch back
4711     - * to the same user.
4712     - */
4713     - if (next != &init_mm)
4714     - this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
4715     -
4716     this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
4717     this_cpu_write(cpu_tlbstate.active_mm, next);
4718    
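
cond_ibpb() folds the incoming task's TIF_SPEC_IB bit into bit 0 of its mm pointer (free because the pointer is at least word aligned), so one compare detects both a different mm and a changed IBPB request. A userspace sketch of the same trick, with malloc() standing in for the mm allocation:

#include <stdio.h>
#include <stdlib.h>

#define LAST_USER_MM_IBPB 0x1UL

/* Illustrative only: fold a one-bit flag into an aligned pointer. */
static unsigned long mangle(void *mm, int ibpb_requested)
{
        return (unsigned long)mm | (ibpb_requested ? LAST_USER_MM_IBPB : 0);
}

int main(void)
{
        void *mm_a = malloc(64), *mm_b = malloc(64);
        unsigned long prev, next;

        prev = mangle(mm_a, 0);
        next = mangle(mm_a, 1);        /* same mm, flag newly set */
        if (next != prev && ((next | prev) & LAST_USER_MM_IBPB))
                printf("case 1: would issue IBPB\n");
        prev = next;                   /* remember the last user mm, as the kernel does */

        next = mangle(mm_b, 0);        /* different mm, flag only on prev */
        if (next != prev && ((next | prev) & LAST_USER_MM_IBPB))
                printf("case 2: would issue IBPB\n");

        free(mm_a);
        free(mm_b);
        return 0;
}
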
4719     diff --git a/arch/x86/platform/atom/punit_atom_debug.c b/arch/x86/platform/atom/punit_atom_debug.c
4720     index d49d3be81953..ecb5866aaf84 100644
4721     --- a/arch/x86/platform/atom/punit_atom_debug.c
4722     +++ b/arch/x86/platform/atom/punit_atom_debug.c
4723     @@ -154,8 +154,8 @@ static void punit_dbgfs_unregister(void)
4724     (kernel_ulong_t)&drv_data }
4725    
4726     static const struct x86_cpu_id intel_punit_cpu_ids[] = {
4727     - ICPU(INTEL_FAM6_ATOM_SILVERMONT1, punit_device_byt),
4728     - ICPU(INTEL_FAM6_ATOM_MERRIFIELD, punit_device_tng),
4729     + ICPU(INTEL_FAM6_ATOM_SILVERMONT, punit_device_byt),
4730     + ICPU(INTEL_FAM6_ATOM_SILVERMONT_MID, punit_device_tng),
4731     ICPU(INTEL_FAM6_ATOM_AIRMONT, punit_device_cht),
4732     {}
4733     };
4734     diff --git a/drivers/acpi/acpi_lpss.c b/drivers/acpi/acpi_lpss.c
4735     index 957d3fa3b543..8e38249311bd 100644
4736     --- a/drivers/acpi/acpi_lpss.c
4737     +++ b/drivers/acpi/acpi_lpss.c
4738     @@ -243,7 +243,7 @@ static const struct lpss_device_desc bsw_spi_dev_desc = {
4739     #define ICPU(model) { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, }
4740    
4741     static const struct x86_cpu_id lpss_cpu_ids[] = {
4742     - ICPU(INTEL_FAM6_ATOM_SILVERMONT1), /* Valleyview, Bay Trail */
4743     + ICPU(INTEL_FAM6_ATOM_SILVERMONT), /* Valleyview, Bay Trail */
4744     ICPU(INTEL_FAM6_ATOM_AIRMONT), /* Braswell, Cherry Trail */
4745     {}
4746     };
4747     diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
4748     index f1f4ce7ddb47..3b123735a1c4 100644
4749     --- a/drivers/base/cpu.c
4750     +++ b/drivers/base/cpu.c
4751     @@ -531,11 +531,18 @@ ssize_t __weak cpu_show_l1tf(struct device *dev,
4752     return sprintf(buf, "Not affected\n");
4753     }
4754    
4755     +ssize_t __weak cpu_show_mds(struct device *dev,
4756     + struct device_attribute *attr, char *buf)
4757     +{
4758     + return sprintf(buf, "Not affected\n");
4759     +}
4760     +
4761     static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL);
4762     static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL);
4763     static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL);
4764     static DEVICE_ATTR(spec_store_bypass, 0444, cpu_show_spec_store_bypass, NULL);
4765     static DEVICE_ATTR(l1tf, 0444, cpu_show_l1tf, NULL);
4766     +static DEVICE_ATTR(mds, 0444, cpu_show_mds, NULL);
4767    
4768     static struct attribute *cpu_root_vulnerabilities_attrs[] = {
4769     &dev_attr_meltdown.attr,
4770     @@ -543,6 +550,7 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = {
4771     &dev_attr_spectre_v2.attr,
4772     &dev_attr_spec_store_bypass.attr,
4773     &dev_attr_l1tf.attr,
4774     + &dev_attr_mds.attr,
4775     NULL
4776     };
4777    
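
With the new mds attribute wired into cpu_root_vulnerabilities_attrs, the mitigation state becomes readable from userspace like the other vulnerability files. A minimal reader, assuming a kernel that carries this series:

#include <stdio.h>

int main(void)
{
        /* Reads e.g. "Mitigation: Clear CPU buffers; SMT vulnerable". */
        const char *path = "/sys/devices/system/cpu/vulnerabilities/mds";
        char line[256];
        FILE *f = fopen(path, "r");

        if (!f) {
                perror(path);
                return 1;
        }
        if (fgets(line, sizeof(line), f))
                printf("mds: %s", line);
        fclose(f);
        return 0;
}
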
4778     diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
4779     index f690085b1ad9..4fe999687415 100644
4780     --- a/drivers/cpufreq/intel_pstate.c
4781     +++ b/drivers/cpufreq/intel_pstate.c
4782     @@ -1413,7 +1413,7 @@ static void intel_pstate_update_util(struct update_util_data *data, u64 time,
4783     static const struct x86_cpu_id intel_pstate_cpu_ids[] = {
4784     ICPU(INTEL_FAM6_SANDYBRIDGE, core_params),
4785     ICPU(INTEL_FAM6_SANDYBRIDGE_X, core_params),
4786     - ICPU(INTEL_FAM6_ATOM_SILVERMONT1, silvermont_params),
4787     + ICPU(INTEL_FAM6_ATOM_SILVERMONT, silvermont_params),
4788     ICPU(INTEL_FAM6_IVYBRIDGE, core_params),
4789     ICPU(INTEL_FAM6_HASWELL_CORE, core_params),
4790     ICPU(INTEL_FAM6_BROADWELL_CORE, core_params),
4791     diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
4792     index 5ded9b22b015..a6fa32c7e068 100644
4793     --- a/drivers/idle/intel_idle.c
4794     +++ b/drivers/idle/intel_idle.c
4795     @@ -1107,14 +1107,14 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
4796     ICPU(INTEL_FAM6_WESTMERE, idle_cpu_nehalem),
4797     ICPU(INTEL_FAM6_WESTMERE_EP, idle_cpu_nehalem),
4798     ICPU(INTEL_FAM6_NEHALEM_EX, idle_cpu_nehalem),
4799     - ICPU(INTEL_FAM6_ATOM_PINEVIEW, idle_cpu_atom),
4800     - ICPU(INTEL_FAM6_ATOM_LINCROFT, idle_cpu_lincroft),
4801     + ICPU(INTEL_FAM6_ATOM_BONNELL, idle_cpu_atom),
4802     + ICPU(INTEL_FAM6_ATOM_BONNELL_MID, idle_cpu_lincroft),
4803     ICPU(INTEL_FAM6_WESTMERE_EX, idle_cpu_nehalem),
4804     ICPU(INTEL_FAM6_SANDYBRIDGE, idle_cpu_snb),
4805     ICPU(INTEL_FAM6_SANDYBRIDGE_X, idle_cpu_snb),
4806     - ICPU(INTEL_FAM6_ATOM_CEDARVIEW, idle_cpu_atom),
4807     - ICPU(INTEL_FAM6_ATOM_SILVERMONT1, idle_cpu_byt),
4808     - ICPU(INTEL_FAM6_ATOM_MERRIFIELD, idle_cpu_tangier),
4809     + ICPU(INTEL_FAM6_ATOM_SALTWELL, idle_cpu_atom),
4810     + ICPU(INTEL_FAM6_ATOM_SILVERMONT, idle_cpu_byt),
4811     + ICPU(INTEL_FAM6_ATOM_SILVERMONT_MID, idle_cpu_tangier),
4812     ICPU(INTEL_FAM6_ATOM_AIRMONT, idle_cpu_cht),
4813     ICPU(INTEL_FAM6_IVYBRIDGE, idle_cpu_ivb),
4814     ICPU(INTEL_FAM6_IVYBRIDGE_X, idle_cpu_ivt),
4815     @@ -1122,7 +1122,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
4816     ICPU(INTEL_FAM6_HASWELL_X, idle_cpu_hsw),
4817     ICPU(INTEL_FAM6_HASWELL_ULT, idle_cpu_hsw),
4818     ICPU(INTEL_FAM6_HASWELL_GT3E, idle_cpu_hsw),
4819     - ICPU(INTEL_FAM6_ATOM_SILVERMONT2, idle_cpu_avn),
4820     + ICPU(INTEL_FAM6_ATOM_SILVERMONT_X, idle_cpu_avn),
4821     ICPU(INTEL_FAM6_BROADWELL_CORE, idle_cpu_bdw),
4822     ICPU(INTEL_FAM6_BROADWELL_GT3E, idle_cpu_bdw),
4823     ICPU(INTEL_FAM6_BROADWELL_X, idle_cpu_bdw),
4824     @@ -1134,7 +1134,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
4825     ICPU(INTEL_FAM6_SKYLAKE_X, idle_cpu_skx),
4826     ICPU(INTEL_FAM6_XEON_PHI_KNL, idle_cpu_knl),
4827     ICPU(INTEL_FAM6_ATOM_GOLDMONT, idle_cpu_bxt),
4828     - ICPU(INTEL_FAM6_ATOM_DENVERTON, idle_cpu_dnv),
4829     + ICPU(INTEL_FAM6_ATOM_GOLDMONT_X, idle_cpu_dnv),
4830     {}
4831     };
4832    
4833     diff --git a/drivers/mmc/host/sdhci-acpi.c b/drivers/mmc/host/sdhci-acpi.c
4834     index 80918abfc468..4398398c0935 100644
4835     --- a/drivers/mmc/host/sdhci-acpi.c
4836     +++ b/drivers/mmc/host/sdhci-acpi.c
4837     @@ -127,7 +127,7 @@ static const struct sdhci_acpi_chip sdhci_acpi_chip_int = {
4838     static bool sdhci_acpi_byt(void)
4839     {
4840     static const struct x86_cpu_id byt[] = {
4841     - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 },
4842     + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT },
4843     {}
4844     };
4845    
4846     diff --git a/drivers/pci/pci-mid.c b/drivers/pci/pci-mid.c
4847     index c7f3408e3148..54b3f9bc5ad8 100644
4848     --- a/drivers/pci/pci-mid.c
4849     +++ b/drivers/pci/pci-mid.c
4850     @@ -71,8 +71,8 @@ static struct pci_platform_pm_ops mid_pci_platform_pm = {
4851     * arch/x86/platform/intel-mid/pwr.c.
4852     */
4853     static const struct x86_cpu_id lpss_cpu_ids[] = {
4854     - ICPU(INTEL_FAM6_ATOM_PENWELL),
4855     - ICPU(INTEL_FAM6_ATOM_MERRIFIELD),
4856     + ICPU(INTEL_FAM6_ATOM_SALTWELL_MID),
4857     + ICPU(INTEL_FAM6_ATOM_SILVERMONT_MID),
4858     {}
4859     };
4860    
4861     diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c
4862     index 3c71f608b444..8809c1a20bed 100644
4863     --- a/drivers/powercap/intel_rapl.c
4864     +++ b/drivers/powercap/intel_rapl.c
4865     @@ -1175,12 +1175,12 @@ static const struct x86_cpu_id rapl_ids[] __initconst = {
4866     RAPL_CPU(INTEL_FAM6_KABYLAKE_MOBILE, rapl_defaults_core),
4867     RAPL_CPU(INTEL_FAM6_KABYLAKE_DESKTOP, rapl_defaults_core),
4868    
4869     - RAPL_CPU(INTEL_FAM6_ATOM_SILVERMONT1, rapl_defaults_byt),
4870     + RAPL_CPU(INTEL_FAM6_ATOM_SILVERMONT, rapl_defaults_byt),
4871     RAPL_CPU(INTEL_FAM6_ATOM_AIRMONT, rapl_defaults_cht),
4872     - RAPL_CPU(INTEL_FAM6_ATOM_MERRIFIELD, rapl_defaults_tng),
4873     - RAPL_CPU(INTEL_FAM6_ATOM_MOOREFIELD, rapl_defaults_ann),
4874     + RAPL_CPU(INTEL_FAM6_ATOM_SILVERMONT_MID,rapl_defaults_tng),
4875     + RAPL_CPU(INTEL_FAM6_ATOM_AIRMONT_MID, rapl_defaults_ann),
4876     RAPL_CPU(INTEL_FAM6_ATOM_GOLDMONT, rapl_defaults_core),
4877     - RAPL_CPU(INTEL_FAM6_ATOM_DENVERTON, rapl_defaults_core),
4878     + RAPL_CPU(INTEL_FAM6_ATOM_GOLDMONT_X, rapl_defaults_core),
4879    
4880     RAPL_CPU(INTEL_FAM6_XEON_PHI_KNL, rapl_defaults_hsw_server),
4881     {}
4882     diff --git a/drivers/thermal/intel_soc_dts_thermal.c b/drivers/thermal/intel_soc_dts_thermal.c
4883     index b2bbaa1c60b0..18788109cae6 100644
4884     --- a/drivers/thermal/intel_soc_dts_thermal.c
4885     +++ b/drivers/thermal/intel_soc_dts_thermal.c
4886     @@ -43,7 +43,7 @@ static irqreturn_t soc_irq_thread_fn(int irq, void *dev_data)
4887     }
4888    
4889     static const struct x86_cpu_id soc_thermal_ids[] = {
4890     - { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1, 0,
4891     + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT, 0,
4892     BYT_SOC_DTS_APIC_IRQ},
4893     {}
4894     };
4895     diff --git a/include/linux/bitops.h b/include/linux/bitops.h
4896     index a83c822c35c2..d4b167fc9ecb 100644
4897     --- a/include/linux/bitops.h
4898     +++ b/include/linux/bitops.h
4899     @@ -1,28 +1,9 @@
4900     #ifndef _LINUX_BITOPS_H
4901     #define _LINUX_BITOPS_H
4902     #include <asm/types.h>
4903     +#include <linux/bits.h>
4904    
4905     -#ifdef __KERNEL__
4906     -#define BIT(nr) (1UL << (nr))
4907     -#define BIT_ULL(nr) (1ULL << (nr))
4908     -#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG))
4909     -#define BIT_WORD(nr) ((nr) / BITS_PER_LONG)
4910     -#define BIT_ULL_MASK(nr) (1ULL << ((nr) % BITS_PER_LONG_LONG))
4911     -#define BIT_ULL_WORD(nr) ((nr) / BITS_PER_LONG_LONG)
4912     -#define BITS_PER_BYTE 8
4913     #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long))
4914     -#endif
4915     -
4916     -/*
4917     - * Create a contiguous bitmask starting at bit position @l and ending at
4918     - * position @h. For example
4919     - * GENMASK_ULL(39, 21) gives us the 64bit vector 0x000000ffffe00000.
4920     - */
4921     -#define GENMASK(h, l) \
4922     - (((~0UL) << (l)) & (~0UL >> (BITS_PER_LONG - 1 - (h))))
4923     -
4924     -#define GENMASK_ULL(h, l) \
4925     - (((~0ULL) << (l)) & (~0ULL >> (BITS_PER_LONG_LONG - 1 - (h))))
4926    
4927     extern unsigned int __sw_hweight8(unsigned int w);
4928     extern unsigned int __sw_hweight16(unsigned int w);
4929     diff --git a/include/linux/bits.h b/include/linux/bits.h
4930     new file mode 100644
4931     index 000000000000..2b7b532c1d51
4932     --- /dev/null
4933     +++ b/include/linux/bits.h
4934     @@ -0,0 +1,26 @@
4935     +/* SPDX-License-Identifier: GPL-2.0 */
4936     +#ifndef __LINUX_BITS_H
4937     +#define __LINUX_BITS_H
4938     +#include <asm/bitsperlong.h>
4939     +
4940     +#define BIT(nr) (1UL << (nr))
4941     +#define BIT_ULL(nr) (1ULL << (nr))
4942     +#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG))
4943     +#define BIT_WORD(nr) ((nr) / BITS_PER_LONG)
4944     +#define BIT_ULL_MASK(nr) (1ULL << ((nr) % BITS_PER_LONG_LONG))
4945     +#define BIT_ULL_WORD(nr) ((nr) / BITS_PER_LONG_LONG)
4946     +#define BITS_PER_BYTE 8
4947     +
4948     +/*
4949     + * Create a contiguous bitmask starting at bit position @l and ending at
4950     + * position @h. For example
4951     + * GENMASK_ULL(39, 21) gives us the 64bit vector 0x000000ffffe00000.
4952     + */
4953     +#define GENMASK(h, l) \
4954     + (((~0UL) - (1UL << (l)) + 1) & (~0UL >> (BITS_PER_LONG - 1 - (h))))
4955     +
4956     +#define GENMASK_ULL(h, l) \
4957     + (((~0ULL) - (1ULL << (l)) + 1) & \
4958     + (~0ULL >> (BITS_PER_LONG_LONG - 1 - (h))))
4959     +
4960     +#endif /* __LINUX_BITS_H */
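
The GENMASK()/GENMASK_ULL() helpers moved into this new header build a contiguous mask from a high and a low bit position. A quick userspace check of the documented example; BITS_PER_LONG_LONG is supplied locally for illustration instead of coming from asm/bitsperlong.h.

#include <stdio.h>

#define BITS_PER_LONG_LONG 64

#define GENMASK_ULL(h, l) \
        (((~0ULL) - (1ULL << (l)) + 1) & \
         (~0ULL >> (BITS_PER_LONG_LONG - 1 - (h))))

int main(void)
{
        /* Matches the example in the header comment. */
        printf("GENMASK_ULL(39, 21) = 0x%016llx\n", GENMASK_ULL(39, 21));
        printf("GENMASK_ULL(7, 0)   = 0x%016llx\n", GENMASK_ULL(7, 0));
        return 0;
}
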
4961     diff --git a/include/linux/cpu.h b/include/linux/cpu.h
4962     index ae5ac89324df..166686209f2c 100644
4963     --- a/include/linux/cpu.h
4964     +++ b/include/linux/cpu.h
4965     @@ -54,6 +54,8 @@ extern ssize_t cpu_show_spec_store_bypass(struct device *dev,
4966     struct device_attribute *attr, char *buf);
4967     extern ssize_t cpu_show_l1tf(struct device *dev,
4968     struct device_attribute *attr, char *buf);
4969     +extern ssize_t cpu_show_mds(struct device *dev,
4970     + struct device_attribute *attr, char *buf);
4971    
4972     extern __printf(4, 5)
4973     struct device *cpu_device_create(struct device *parent, void *drvdata,
4974     @@ -276,4 +278,28 @@ static inline void cpu_smt_check_topology_early(void) { }
4975     static inline void cpu_smt_check_topology(void) { }
4976     #endif
4977    
4978     +/*
4979     + * These are used for a global "mitigations=" cmdline option for toggling
4980     + * optional CPU mitigations.
4981     + */
4982     +enum cpu_mitigations {
4983     + CPU_MITIGATIONS_OFF,
4984     + CPU_MITIGATIONS_AUTO,
4985     + CPU_MITIGATIONS_AUTO_NOSMT,
4986     +};
4987     +
4988     +extern enum cpu_mitigations cpu_mitigations;
4989     +
4990     +/* mitigations=off */
4991     +static inline bool cpu_mitigations_off(void)
4992     +{
4993     + return cpu_mitigations == CPU_MITIGATIONS_OFF;
4994     +}
4995     +
4996     +/* mitigations=auto,nosmt */
4997     +static inline bool cpu_mitigations_auto_nosmt(void)
4998     +{
4999     + return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT;
5000     +}
5001     +
5002     #endif /* _LINUX_CPU_H_ */
5003     diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
5004     index d53a23100401..58ae371556bc 100644
5005     --- a/include/linux/ptrace.h
5006     +++ b/include/linux/ptrace.h
5007     @@ -60,14 +60,17 @@ extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead);
5008     #define PTRACE_MODE_READ 0x01
5009     #define PTRACE_MODE_ATTACH 0x02
5010     #define PTRACE_MODE_NOAUDIT 0x04
5011     -#define PTRACE_MODE_FSCREDS 0x08
5012     -#define PTRACE_MODE_REALCREDS 0x10
5013     +#define PTRACE_MODE_FSCREDS 0x08
5014     +#define PTRACE_MODE_REALCREDS 0x10
5015     +#define PTRACE_MODE_SCHED 0x20
5016     +#define PTRACE_MODE_IBPB 0x40
5017    
5018     /* shorthands for READ/ATTACH and FSCREDS/REALCREDS combinations */
5019     #define PTRACE_MODE_READ_FSCREDS (PTRACE_MODE_READ | PTRACE_MODE_FSCREDS)
5020     #define PTRACE_MODE_READ_REALCREDS (PTRACE_MODE_READ | PTRACE_MODE_REALCREDS)
5021     #define PTRACE_MODE_ATTACH_FSCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_FSCREDS)
5022     #define PTRACE_MODE_ATTACH_REALCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_REALCREDS)
5023     +#define PTRACE_MODE_SPEC_IBPB (PTRACE_MODE_ATTACH_REALCREDS | PTRACE_MODE_IBPB)
5024    
5025     /**
5026     * ptrace_may_access - check whether the caller is permitted to access
5027     @@ -85,6 +88,20 @@ extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead);
5028     */
5029     extern bool ptrace_may_access(struct task_struct *task, unsigned int mode);
5030    
5031     +/**
5032     + * ptrace_may_access_sched - check whether the caller is permitted to access
5033     + * a target task.
5034     + * @task: target task
5035     + * @mode: selects type of access and caller credentials
5036     + *
5037     + * Returns true on success, false on denial.
5038     + *
5039     + * Similar to ptrace_may_access(). Only to be called from context switch
5040     + * code. Does not call into audit and the regular LSM hooks due to locking
5041     + * constraints.
5042     + */
5043     +extern bool ptrace_may_access_sched(struct task_struct *task, unsigned int mode);
5044     +
5045     static inline int ptrace_reparented(struct task_struct *child)
5046     {
5047     return !same_thread_group(child->real_parent, child->parent);
5048     diff --git a/include/linux/sched.h b/include/linux/sched.h
5049     index ebd0afb35d16..1c487a3abd84 100644
5050     --- a/include/linux/sched.h
5051     +++ b/include/linux/sched.h
5052     @@ -2357,6 +2357,8 @@ static inline void memalloc_noio_restore(unsigned int flags)
5053     #define PFA_LMK_WAITING 3 /* Lowmemorykiller is waiting */
5054     #define PFA_SPEC_SSB_DISABLE 4 /* Speculative Store Bypass disabled */
5055     #define PFA_SPEC_SSB_FORCE_DISABLE 5 /* Speculative Store Bypass force disabled*/
5056     +#define PFA_SPEC_IB_DISABLE 6 /* Indirect branch speculation restricted */
5057     +#define PFA_SPEC_IB_FORCE_DISABLE 7 /* Indirect branch speculation permanently restricted */
5058    
5059    
5060     #define TASK_PFA_TEST(name, func) \
5061     @@ -2390,6 +2392,13 @@ TASK_PFA_CLEAR(SPEC_SSB_DISABLE, spec_ssb_disable)
5062     TASK_PFA_TEST(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable)
5063     TASK_PFA_SET(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable)
5064    
5065     +TASK_PFA_TEST(SPEC_IB_DISABLE, spec_ib_disable)
5066     +TASK_PFA_SET(SPEC_IB_DISABLE, spec_ib_disable)
5067     +TASK_PFA_CLEAR(SPEC_IB_DISABLE, spec_ib_disable)
5068     +
5069     +TASK_PFA_TEST(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable)
5070     +TASK_PFA_SET(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable)
5071     +
5072     /*
5073     * task->jobctl flags
5074     */
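
The new PFA_SPEC_IB_* flags get their accessors from the existing TASK_PFA_TEST/SET/CLEAR token-pasting generators rather than hand-written helpers. A simplified, non-atomic sketch of that pattern (the real macros operate on the task's atomic flags with test_bit() and friends):

#include <stdio.h>

/* Simplified stand-in for the task struct and its atomic flags word. */
struct task { unsigned long atomic_flags; };

#define PFA_SPEC_IB_DISABLE 6

#define TASK_PFA_TEST(name, func)                                  \
        static int task_##func(struct task *p)                     \
        { return !!(p->atomic_flags & (1UL << PFA_##name)); }
#define TASK_PFA_SET(name, func)                                   \
        static void task_set_##func(struct task *p)                \
        { p->atomic_flags |= 1UL << PFA_##name; }
#define TASK_PFA_CLEAR(name, func)                                 \
        static void task_clear_##func(struct task *p)              \
        { p->atomic_flags &= ~(1UL << PFA_##name); }

TASK_PFA_TEST(SPEC_IB_DISABLE, spec_ib_disable)
TASK_PFA_SET(SPEC_IB_DISABLE, spec_ib_disable)
TASK_PFA_CLEAR(SPEC_IB_DISABLE, spec_ib_disable)

int main(void)
{
        struct task t = { 0 };

        task_set_spec_ib_disable(&t);
        printf("spec_ib_disable: %d\n", task_spec_ib_disable(&t));
        task_clear_spec_ib_disable(&t);
        printf("spec_ib_disable: %d\n", task_spec_ib_disable(&t));
        return 0;
}
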
5075     diff --git a/include/linux/sched/smt.h b/include/linux/sched/smt.h
5076     new file mode 100644
5077     index 000000000000..559ac4590593
5078     --- /dev/null
5079     +++ b/include/linux/sched/smt.h
5080     @@ -0,0 +1,20 @@
5081     +/* SPDX-License-Identifier: GPL-2.0 */
5082     +#ifndef _LINUX_SCHED_SMT_H
5083     +#define _LINUX_SCHED_SMT_H
5084     +
5085     +#include <linux/atomic.h>
5086     +
5087     +#ifdef CONFIG_SCHED_SMT
5088     +extern atomic_t sched_smt_present;
5089     +
5090     +static __always_inline bool sched_smt_active(void)
5091     +{
5092     + return atomic_read(&sched_smt_present);
5093     +}
5094     +#else
5095     +static inline bool sched_smt_active(void) { return false; }
5096     +#endif
5097     +
5098     +void arch_smt_update(void);
5099     +
5100     +#endif
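
sched_smt_active() gives mitigation code a cheap in-kernel check for whether SMT siblings are online; the equivalent state is visible to userspace through the SMT control interface. A small reader, assuming that sysfs interface is present on the running kernel:

#include <stdio.h>

int main(void)
{
        /* Exposed by the SMT control interface; contains "0" or "1". */
        const char *path = "/sys/devices/system/cpu/smt/active";
        FILE *f = fopen(path, "r");
        int active;

        if (!f || fscanf(f, "%d", &active) != 1) {
                perror(path);
                return 1;
        }
        fclose(f);

        printf("SMT %s\n", active ? "active" : "inactive");
        return 0;
}
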
5101     diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
5102     index 64776b72e1eb..64ec0d62e5f5 100644
5103     --- a/include/uapi/linux/prctl.h
5104     +++ b/include/uapi/linux/prctl.h
5105     @@ -202,6 +202,7 @@ struct prctl_mm_map {
5106     #define PR_SET_SPECULATION_CTRL 53
5107     /* Speculation control variants */
5108     # define PR_SPEC_STORE_BYPASS 0
5109     +# define PR_SPEC_INDIRECT_BRANCH 1
5110     /* Return and control values for PR_SET/GET_SPECULATION_CTRL */
5111     # define PR_SPEC_NOT_AFFECTED 0
5112     # define PR_SPEC_PRCTL (1UL << 0)
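
PR_SPEC_INDIRECT_BRANCH extends the existing speculation-control prctl() so a task can request restricted indirect branch speculation for itself. A hedged usage sketch: the numeric fallbacks are assumptions that mirror the uapi header for the benefit of older toolchains, and the call only succeeds on kernels carrying this series.

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_SET_SPECULATION_CTRL
# define PR_GET_SPECULATION_CTRL 52
# define PR_SET_SPECULATION_CTRL 53
#endif
#ifndef PR_SPEC_INDIRECT_BRANCH
# define PR_SPEC_INDIRECT_BRANCH 1
#endif
#ifndef PR_SPEC_DISABLE
# define PR_SPEC_ENABLE  (1UL << 1)
# define PR_SPEC_DISABLE (1UL << 2)
#endif

int main(void)
{
        /* Ask the kernel to restrict indirect branch speculation for this task. */
        if (prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH,
                  PR_SPEC_DISABLE, 0, 0))
                perror("PR_SET_SPECULATION_CTRL");

        /* Read back the current state; the result is a PR_SPEC_* bitmask. */
        long state = prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, 0, 0, 0);
        printf("indirect branch speculation state: 0x%lx\n", state);
        return 0;
}
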
5113     diff --git a/kernel/cpu.c b/kernel/cpu.c
5114     index bf24e8400903..db1a0bc46c3e 100644
5115     --- a/kernel/cpu.c
5116     +++ b/kernel/cpu.c
5117     @@ -8,6 +8,7 @@
5118     #include <linux/init.h>
5119     #include <linux/notifier.h>
5120     #include <linux/sched.h>
5121     +#include <linux/sched/smt.h>
5122     #include <linux/unistd.h>
5123     #include <linux/cpu.h>
5124     #include <linux/oom.h>
5125     @@ -356,6 +357,12 @@ void cpu_hotplug_enable(void)
5126     EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
5127     #endif /* CONFIG_HOTPLUG_CPU */
5128    
5129     +/*
5130     + * Architectures that need SMT-specific errata handling during SMT hotplug
5131     + * should override this.
5132     + */
5133     +void __weak arch_smt_update(void) { }
5134     +
5135     #ifdef CONFIG_HOTPLUG_SMT
5136     enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED;
5137     EXPORT_SYMBOL_GPL(cpu_smt_control);
5138     @@ -1058,6 +1065,7 @@ out:
5139     /* This post dead nonsense must die */
5140     if (!ret && hasdied)
5141     cpu_notify_nofail(CPU_POST_DEAD, cpu);
5142     + arch_smt_update();
5143     return ret;
5144     }
5145    
5146     @@ -1177,6 +1185,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
5147     ret = cpuhp_up_callbacks(cpu, st, target);
5148     out:
5149     cpu_hotplug_done();
5150     + arch_smt_update();
5151     return ret;
5152     }
5153    
5154     @@ -2012,8 +2021,10 @@ static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
5155     */
5156     cpuhp_offline_cpu_device(cpu);
5157     }
5158     - if (!ret)
5159     + if (!ret) {
5160     cpu_smt_control = ctrlval;
5161     + arch_smt_update();
5162     + }
5163     cpu_maps_update_done();
5164     return ret;
5165     }
5166     @@ -2024,6 +2035,7 @@ static int cpuhp_smt_enable(void)
5167    
5168     cpu_maps_update_begin();
5169     cpu_smt_control = CPU_SMT_ENABLED;
5170     + arch_smt_update();
5171     for_each_present_cpu(cpu) {
5172     /* Skip online CPUs and CPUs on offline nodes */
5173     if (cpu_online(cpu) || !node_online(cpu_to_node(cpu)))
5174     @@ -2222,3 +2234,18 @@ void __init boot_cpu_hotplug_init(void)
5175     #endif
5176     this_cpu_write(cpuhp_state.state, CPUHP_ONLINE);
5177     }
5178     +
5179     +enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO;
5180     +
5181     +static int __init mitigations_parse_cmdline(char *arg)
5182     +{
5183     + if (!strcmp(arg, "off"))
5184     + cpu_mitigations = CPU_MITIGATIONS_OFF;
5185     + else if (!strcmp(arg, "auto"))
5186     + cpu_mitigations = CPU_MITIGATIONS_AUTO;
5187     + else if (!strcmp(arg, "auto,nosmt"))
5188     + cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT;
5189     +
5190     + return 0;
5191     +}
5192     +early_param("mitigations", mitigations_parse_cmdline);
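
The parser above accepts exactly three values, "off", "auto" (the default) and "auto,nosmt"; anything else is silently ignored and leaves the default in place. The enum and its accessors are added to include/linux/cpu.h by the same upstream change, earlier in this patch, and per-architecture mitigation selection consults them roughly like this (the mitigation names below are placeholders for illustration only):

    #include <linux/cpu.h>
    #include <linux/init.h>

    /* Placeholder policy function showing how the global knob is consumed. */
    enum example_mitigation { EXAMPLE_OFF, EXAMPLE_FULL, EXAMPLE_FULL_NOSMT };

    static enum example_mitigation __init example_select_mitigation(void)
    {
            if (cpu_mitigations_off())
                    return EXAMPLE_OFF;             /* mitigations=off        */
            if (cpu_mitigations_auto_nosmt())
                    return EXAMPLE_FULL_NOSMT;      /* mitigations=auto,nosmt */
            return EXAMPLE_FULL;                    /* mitigations=auto       */
    }
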
5193     diff --git a/kernel/ptrace.c b/kernel/ptrace.c
5194     index f39a7be98fc1..efba851ee018 100644
5195     --- a/kernel/ptrace.c
5196     +++ b/kernel/ptrace.c
5197     @@ -258,6 +258,9 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
5198    
5199     static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode)
5200     {
5201     + if (mode & PTRACE_MODE_SCHED)
5202     + return false;
5203     +
5204     if (mode & PTRACE_MODE_NOAUDIT)
5205     return has_ns_capability_noaudit(current, ns, CAP_SYS_PTRACE);
5206     else
5207     @@ -325,9 +328,16 @@ ok:
5208     !ptrace_has_cap(mm->user_ns, mode)))
5209     return -EPERM;
5210    
5211     + if (mode & PTRACE_MODE_SCHED)
5212     + return 0;
5213     return security_ptrace_access_check(task, mode);
5214     }
5215    
5216     +bool ptrace_may_access_sched(struct task_struct *task, unsigned int mode)
5217     +{
5218     + return __ptrace_may_access(task, mode | PTRACE_MODE_SCHED);
5219     +}
5220     +
5221     bool ptrace_may_access(struct task_struct *task, unsigned int mode)
5222     {
5223     int err;
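
PTRACE_MODE_SCHED makes __ptrace_may_access() safe to call from scheduler context: as the hunks above show, the capability check reports false and the LSM hook is skipped. Note the return convention of the new wrapper: it passes the raw __ptrace_may_access() result through, so 0/false means access is allowed and nonzero/true means it is denied, the opposite sense of ptrace_may_access(). A hedged sketch of the kind of caller this is meant for; elsewhere in this patch the x86 context-switch code uses it to decide whether an IBPB is warranted, and the mode flag below is a pre-existing PTRACE_MODE_* value chosen only for illustration:

    #include <linux/ptrace.h>
    #include <linux/sched.h>

    /* Returns true when current's credentials would NOT allow ptracing @next
     * (nonzero __ptrace_may_access() result), i.e. when the two tasks must
     * not be allowed to leak state into each other and a barrier is wanted. */
    static bool example_switch_needs_barrier(struct task_struct *next)
    {
            return ptrace_may_access_sched(next, PTRACE_MODE_READ_REALCREDS);
    }
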
5224     diff --git a/kernel/sched/core.c b/kernel/sched/core.c
5225     index 6b3fff6a6437..50e80b1be2c8 100644
5226     --- a/kernel/sched/core.c
5227     +++ b/kernel/sched/core.c
5228     @@ -7355,11 +7355,22 @@ static int cpuset_cpu_inactive(unsigned int cpu)
5229     return 0;
5230     }
5231    
5232     +#ifdef CONFIG_SCHED_SMT
5233     +atomic_t sched_smt_present = ATOMIC_INIT(0);
5234     +#endif
5235     +
5236     int sched_cpu_activate(unsigned int cpu)
5237     {
5238     struct rq *rq = cpu_rq(cpu);
5239     unsigned long flags;
5240    
5241     +#ifdef CONFIG_SCHED_SMT
5242     + /*
5243     + * When going up, increment the number of cores with SMT present.
5244     + */
5245     + if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
5246     + atomic_inc(&sched_smt_present);
5247     +#endif
5248     set_cpu_active(cpu, true);
5249    
5250     if (sched_smp_initialized) {
5251     @@ -7408,6 +7419,14 @@ int sched_cpu_deactivate(unsigned int cpu)
5252     else
5253     synchronize_rcu();
5254    
5255     +#ifdef CONFIG_SCHED_SMT
5256     + /*
5257     + * When going down, decrement the number of cores with SMT present.
5258     + */
5259     + if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
5260     + atomic_dec(&sched_smt_present);
5261     +#endif
5262     +
5263     if (!sched_smp_initialized)
5264     return 0;
5265    
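
The "== 2" test above makes each core count exactly once: on the way up it fires when the second sibling of a core comes online, on the way down when a core is about to drop back to a single online sibling, so sched_smt_present counts cores currently running with SMT rather than individual threads. A hedged illustration of the resulting invariant, assuming two-way SMT as on x86 and the includes already present in kernel/sched/core.c; the helper is a placeholder, not part of the patch:

    /* For 2-way SMT: every core with both siblings online is visited twice
     * in this loop (once per online sibling), and should have been counted
     * exactly once in sched_smt_present. */
    static void example_check_smt_accounting(void)
    {
            int cpu, smt_cpus = 0;

            for_each_online_cpu(cpu)
                    if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
                            smt_cpus++;

            WARN_ON(atomic_read(&sched_smt_present) != smt_cpus / 2);
    }
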
5266     diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
5267     index ec6e838e991a..15c08752926b 100644
5268     --- a/kernel/sched/sched.h
5269     +++ b/kernel/sched/sched.h
5270     @@ -2,6 +2,7 @@
5271     #include <linux/sched.h>
5272     #include <linux/sched/sysctl.h>
5273     #include <linux/sched/rt.h>
5274     +#include <linux/sched/smt.h>
5275     #include <linux/u64_stats_sync.h>
5276     #include <linux/sched/deadline.h>
5277     #include <linux/kernel_stat.h>
5278     diff --git a/tools/power/x86/turbostat/Makefile b/tools/power/x86/turbostat/Makefile
5279     index 8561e7ddca59..92be948c922d 100644
5280     --- a/tools/power/x86/turbostat/Makefile
5281     +++ b/tools/power/x86/turbostat/Makefile
5282     @@ -8,7 +8,7 @@ ifeq ("$(origin O)", "command line")
5283     endif
5284    
5285     turbostat : turbostat.c
5286     -CFLAGS += -Wall
5287     +CFLAGS += -Wall -I../../../include
5288     CFLAGS += -DMSRHEADER='"../../../../arch/x86/include/asm/msr-index.h"'
5289    
5290     %: %.c