Magellan Linux

Annotation of /trunk/kernel-alx-legacy/patches-4.9/0219-4.9.120-all-fixes.patch

Revision 3608
Fri Aug 14 07:34:29 2020 UTC by niro
File size: 175079 byte(s)
-added kernel-alx-legacy pkg
1 niro 3608 diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
2     index 6d75a9c00e8a..069e8d52c991 100644
3     --- a/Documentation/ABI/testing/sysfs-devices-system-cpu
4     +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
5     @@ -356,6 +356,7 @@ What: /sys/devices/system/cpu/vulnerabilities
6     /sys/devices/system/cpu/vulnerabilities/spectre_v1
7     /sys/devices/system/cpu/vulnerabilities/spectre_v2
8     /sys/devices/system/cpu/vulnerabilities/spec_store_bypass
9     + /sys/devices/system/cpu/vulnerabilities/l1tf
10     Date: January 2018
11     Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
12     Description: Information about CPU vulnerabilities
13     @@ -367,3 +368,26 @@ Description: Information about CPU vulnerabilities
14     "Not affected" CPU is not affected by the vulnerability
15     "Vulnerable" CPU is affected and no mitigation in effect
16     "Mitigation: $M" CPU is affected and mitigation $M is in effect
17     +
18     + Details about the l1tf file can be found in
19     + Documentation/admin-guide/l1tf.rst
20     +
21     +What: /sys/devices/system/cpu/smt
22     + /sys/devices/system/cpu/smt/active
23     + /sys/devices/system/cpu/smt/control
24     +Date: June 2018
25     +Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
26     +Description: Control Symmetric Multi Threading (SMT)
27     +
28     + active: Tells whether SMT is active (enabled and siblings online)
29     +
30     + control: Read/write interface to control SMT. Possible
31     + values:
32     +
33     + "on" SMT is enabled
34     + "off" SMT is disabled
35     + "forceoff" SMT is force disabled. Cannot be changed.
36     + "notsupported" SMT is not supported by the CPU
37     +
38     + If control status is "forceoff" or "notsupported" writes
39     + are rejected.
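
As a rough illustration of how the two files documented above might be consumed (this program is not part of the patch; only the sysfs paths are taken from the ABI entry it follows), a minimal C reader could look like:

    #include <stdio.h>

    /* Print the SMT state files described in the ABI entry above. */
    static void show(const char *path)
    {
            char buf[64];
            FILE *f = fopen(path, "r");

            if (!f) {
                    perror(path);
                    return;
            }
            if (fgets(buf, sizeof(buf), f))
                    printf("%s: %s", path, buf);    /* value keeps its newline */
            fclose(f);
    }

    int main(void)
    {
            show("/sys/devices/system/cpu/smt/active");     /* "0" or "1" */
            show("/sys/devices/system/cpu/smt/control");    /* on/off/forceoff/notsupported */
            return 0;
    }
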
40     diff --git a/Documentation/index.rst b/Documentation/index.rst
41     index c53d089455a4..213399aac757 100644
42     --- a/Documentation/index.rst
43     +++ b/Documentation/index.rst
44     @@ -12,6 +12,7 @@ Contents:
45     :maxdepth: 2
46    
47     kernel-documentation
48     + l1tf
49     development-process/index
50     dev-tools/tools
51     driver-api/index
52     diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
53     index a16f87e4dd10..a36a695318c6 100644
54     --- a/Documentation/kernel-parameters.txt
55     +++ b/Documentation/kernel-parameters.txt
56     @@ -2010,10 +2010,84 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
57     (virtualized real and unpaged mode) on capable
58     Intel chips. Default is 1 (enabled)
59    
60     + kvm-intel.vmentry_l1d_flush=[KVM,Intel] Mitigation for L1 Terminal Fault
61     + CVE-2018-3620.
62     +
63     + Valid arguments: never, cond, always
64     +
65     + always: L1D cache flush on every VMENTER.
66     + cond: Flush L1D on VMENTER only when the code between
67     + VMEXIT and VMENTER can leak host memory.
68     + never: Disables the mitigation
69     +
70     + Default is cond (do L1 cache flush in specific instances)
71     +
72     kvm-intel.vpid= [KVM,Intel] Disable Virtual Processor Identification
73     feature (tagged TLBs) on capable Intel chips.
74     Default is 1 (enabled)
75    
76     + l1tf= [X86] Control mitigation of the L1TF vulnerability on
77     + affected CPUs
78     +
79     + The kernel PTE inversion protection is unconditionally
80     + enabled and cannot be disabled.
81     +
82     + full
83     + Provides all available mitigations for the
84     + L1TF vulnerability. Disables SMT and
85     + enables all mitigations in the
86     + hypervisors, i.e. unconditional L1D flush.
87     +
88     + SMT control and L1D flush control via the
89     + sysfs interface is still possible after
90     + boot. Hypervisors will issue a warning
91     + when the first VM is started in a
92     + potentially insecure configuration,
93     + i.e. SMT enabled or L1D flush disabled.
94     +
95     + full,force
96     + Same as 'full', but disables SMT and L1D
97     + flush runtime control. Implies the
98     + 'nosmt=force' command line option.
99     + (i.e. sysfs control of SMT is disabled.)
100     +
101     + flush
102     + Leaves SMT enabled and enables the default
103     + hypervisor mitigation, i.e. conditional
104     + L1D flush.
105     +
106     + SMT control and L1D flush control via the
107     + sysfs interface is still possible after
108     + boot. Hypervisors will issue a warning
109     + when the first VM is started in a
110     + potentially insecure configuration,
111     + i.e. SMT enabled or L1D flush disabled.
112     +
113     + flush,nosmt
114     +
115     + Disables SMT and enables the default
116     + hypervisor mitigation.
117     +
118     + SMT control and L1D flush control via the
119     + sysfs interface is still possible after
120     + boot. Hypervisors will issue a warning
121     + when the first VM is started in a
122     + potentially insecure configuration,
123     + i.e. SMT enabled or L1D flush disabled.
124     +
125     + flush,nowarn
126     + Same as 'flush', but hypervisors will not
127     + warn when a VM is started in a potentially
128     + insecure configuration.
129     +
130     + off
131     + Disables hypervisor mitigations and doesn't
132     + emit any warnings.
133     +
134     + Default is 'flush'.
135     +
136     + For details see: Documentation/admin-guide/l1tf.rst
137     +
138     l2cr= [PPC]
139    
140     l3cr= [PPC]
141     @@ -2694,6 +2768,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
142     nosmt [KNL,S390] Disable symmetric multithreading (SMT).
143     Equivalent to smt=1.
144    
145     + [KNL,x86] Disable symmetric multithreading (SMT).
146     + nosmt=force: Force disable SMT, cannot be undone
147     + via the sysfs control file.
148     +
149     nospectre_v2 [X86] Disable all mitigations for the Spectre variant 2
150     (indirect branch prediction) vulnerability. System may
151     allow data leaks with this option, which is equivalent
152     diff --git a/Documentation/l1tf.rst b/Documentation/l1tf.rst
153     new file mode 100644
154     index 000000000000..bae52b845de0
155     --- /dev/null
156     +++ b/Documentation/l1tf.rst
157     @@ -0,0 +1,610 @@
158     +L1TF - L1 Terminal Fault
159     +========================
160     +
161     +L1 Terminal Fault is a hardware vulnerability which allows unprivileged
162     +speculative access to data which is available in the Level 1 Data Cache
163     +when the page table entry controlling the virtual address, which is used
164     +for the access, has the Present bit cleared or other reserved bits set.
165     +
166     +Affected processors
167     +-------------------
168     +
169     +This vulnerability affects a wide range of Intel processors. The
170     +vulnerability is not present on:
171     +
172     + - Processors from AMD, Centaur and other non Intel vendors
173     +
174     + - Older processor models, where the CPU family is < 6
175     +
176     + - A range of Intel ATOM processors (Cedarview, Cloverview, Lincroft,
177     + Penwell, Pineview, Silvermont, Airmont, Merrifield)
178     +
179     + - The Intel XEON PHI family
180     +
181     + - Intel processors which have the ARCH_CAP_RDCL_NO bit set in the
182     + IA32_ARCH_CAPABILITIES MSR. If the bit is set the CPU is not affected
183     + by the Meltdown vulnerability either. These CPUs should become
184     + available by end of 2018.
185     +
186     +Whether a processor is affected or not can be read out from the L1TF
187     +vulnerability file in sysfs. See :ref:`l1tf_sys_info`.
188     +
189     +Related CVEs
190     +------------
191     +
192     +The following CVE entries are related to the L1TF vulnerability:
193     +
194     + ============= ================= ==============================
195     + CVE-2018-3615 L1 Terminal Fault SGX related aspects
196     + CVE-2018-3620 L1 Terminal Fault OS, SMM related aspects
197     + CVE-2018-3646 L1 Terminal Fault Virtualization related aspects
198     + ============= ================= ==============================
199     +
200     +Problem
201     +-------
202     +
203     +If an instruction accesses a virtual address for which the relevant page
204     +table entry (PTE) has the Present bit cleared or other reserved bits set,
205     +then speculative execution ignores the invalid PTE and loads the referenced
206     +data if it is present in the Level 1 Data Cache, as if the page referenced
207     +by the address bits in the PTE was still present and accessible.
208     +
209     +While this is a purely speculative mechanism and the instruction will raise
210     +a page fault when it is retired eventually, the pure act of loading the
211     +data and making it available to other speculative instructions opens up the
212     +opportunity for side channel attacks to unprivileged malicious code,
213     +similar to the Meltdown attack.
214     +
215     +While Meltdown breaks the user space to kernel space protection, L1TF
216     +allows to attack any physical memory address in the system and the attack
217     +works across all protection domains. It allows an attack of SGX and also
218     +works from inside virtual machines because the speculation bypasses the
219     +extended page table (EPT) protection mechanism.
220     +
221     +
222     +Attack scenarios
223     +----------------
224     +
225     +1. Malicious user space
226     +^^^^^^^^^^^^^^^^^^^^^^^
227     +
228     + Operating Systems store arbitrary information in the address bits of a
229     + PTE which is marked non present. This allows a malicious user space
230     + application to attack the physical memory to which these PTEs resolve.
231     + In some cases user-space can maliciously influence the information
232     + encoded in the address bits of the PTE, thus making attacks more
233     + deterministic and more practical.
234     +
235     + The Linux kernel contains a mitigation for this attack vector, PTE
236     + inversion, which is permanently enabled and has no performance
237     + impact. The kernel ensures that the address bits of PTEs, which are not
238     + marked present, never point to cacheable physical memory space.
239     +
240     + A system with an up to date kernel is protected against attacks from
241     + malicious user space applications.
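
A minimal standalone sketch of the PTE inversion idea follows; it mirrors the protnone_mask()/flip_protnone_guard() helpers this patch adds in arch/x86/include/asm/pgtable-invert.h, but the bit masks below are simplified for illustration and are not the kernel's real constants:

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_PRESENT (1ULL << 0)            /* simplified _PAGE_PRESENT */
    #define PFN_MASK     0x000ffffffffff000ULL  /* illustrative address bits */

    /* A non-present PTE gets a mask of all ones, so its address bits are
     * stored inverted and never point at cacheable physical memory. */
    static uint64_t protnone_mask(uint64_t val)
    {
            return (val & PAGE_PRESENT) ? 0 : ~0ULL;
    }

    /* Invert the PFN part whenever the Present bit changes state. */
    static uint64_t flip_protnone_guard(uint64_t oldval, uint64_t val)
    {
            if (!(oldval & PAGE_PRESENT) != !(val & PAGE_PRESENT))
                    val = (val & ~PFN_MASK) | (~val & PFN_MASK);
            return val;
    }

    int main(void)
    {
            uint64_t pte = 0x12345000ULL;  /* hypothetical entry being made non-present */
            uint64_t stored = flip_protnone_guard(pte | PAGE_PRESENT, pte);
            uint64_t decoded = (stored ^ protnone_mask(stored)) & PFN_MASK;

            printf("raw     %#llx\n", (unsigned long long)pte);
            printf("stored  %#llx\n", (unsigned long long)stored);
            printf("decoded %#llx\n", (unsigned long long)decoded);  /* matches raw */
            return 0;
    }

The decode step corresponds to what pte_pfn() does in the real code: it undoes the inversion for entries that are not marked present.
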
242     +
243     +2. Malicious guest in a virtual machine
244     +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
245     +
246     + The fact that L1TF breaks all domain protections allows malicious guest
247     + OSes, which can control the PTEs directly, and malicious guest user
248     + space applications, which run on an unprotected guest kernel lacking the
249     + PTE inversion mitigation for L1TF, to attack physical host memory.
250     +
251     + A special aspect of L1TF in the context of virtualization is symmetric
252     + multi threading (SMT). The Intel implementation of SMT is called
253     + HyperThreading. The fact that Hyperthreads on the affected processors
254     + share the L1 Data Cache (L1D) is important for this. As the flaw allows
255     + only to attack data which is present in L1D, a malicious guest running
256     + on one Hyperthread can attack the data which is brought into the L1D by
257     + the context which runs on the sibling Hyperthread of the same physical
258     + core. This context can be host OS, host user space or a different guest.
259     +
260     + If the processor does not support Extended Page Tables, the attack is
261     + only possible, when the hypervisor does not sanitize the content of the
262     + effective (shadow) page tables.
263     +
264     + While solutions exist to mitigate these attack vectors fully, these
265     + mitigations are not enabled by default in the Linux kernel because they
266     + can affect performance significantly. The kernel provides several
267     + mechanisms which can be utilized to address the problem depending on the
268     + deployment scenario. The mitigations, their protection scope and impact
269     + are described in the next sections.
270     +
271     + The default mitigations and the rationale for choosing them are explained
272     + at the end of this document. See :ref:`default_mitigations`.
273     +
274     +.. _l1tf_sys_info:
275     +
276     +L1TF system information
277     +-----------------------
278     +
279     +The Linux kernel provides a sysfs interface to enumerate the current L1TF
280     +status of the system: whether the system is vulnerable, and which
281     +mitigations are active. The relevant sysfs file is:
282     +
283     +/sys/devices/system/cpu/vulnerabilities/l1tf
284     +
285     +The possible values in this file are:
286     +
287     + =========================== ===============================
288     + 'Not affected' The processor is not vulnerable
289     + 'Mitigation: PTE Inversion' The host protection is active
290     + =========================== ===============================
291     +
292     +If KVM/VMX is enabled and the processor is vulnerable then the following
293     +information is appended to the 'Mitigation: PTE Inversion' part:
294     +
295     + - SMT status:
296     +
297     + ===================== ================
298     + 'VMX: SMT vulnerable' SMT is enabled
299     + 'VMX: SMT disabled' SMT is disabled
300     + ===================== ================
301     +
302     + - L1D Flush mode:
303     +
304     + ================================ ====================================
305     + 'L1D vulnerable' L1D flushing is disabled
306     +
307     + 'L1D conditional cache flushes' L1D flush is conditionally enabled
308     +
309     + 'L1D cache flushes' L1D flush is unconditionally enabled
310     + ================================ ====================================
311     +
312     +The resulting grade of protection is discussed in the following sections.
313     +
314     +
315     +Host mitigation mechanism
316     +-------------------------
317     +
318     +The kernel is unconditionally protected against L1TF attacks from malicious
319     +user space running on the host.
320     +
321     +
322     +Guest mitigation mechanisms
323     +---------------------------
324     +
325     +.. _l1d_flush:
326     +
327     +1. L1D flush on VMENTER
328     +^^^^^^^^^^^^^^^^^^^^^^^
329     +
330     + To make sure that a guest cannot attack data which is present in the L1D
331     + the hypervisor flushes the L1D before entering the guest.
332     +
333     + Flushing the L1D evicts not only the data which should not be accessed
334     + by a potentially malicious guest, it also flushes the guest
335     + data. Flushing the L1D has a performance impact as the processor has to
336     + bring the flushed guest data back into the L1D. Depending on the
337     + frequency of VMEXIT/VMENTER and the type of computations in the guest
338     + performance degradation in the range of 1% to 50% has been observed. For
339     + scenarios where guest VMEXIT/VMENTER are rare the performance impact is
340     + minimal. Virtio and mechanisms like posted interrupts are designed to
341     + confine the VMEXITs to a bare minimum, but specific configurations and
342     + application scenarios might still suffer from a high VMEXIT rate.
343     +
344     + The kernel provides two L1D flush modes:
345     + - conditional ('cond')
346     + - unconditional ('always')
347     +
348     + The conditional mode avoids L1D flushing after VMEXITs which execute
349     + only audited code paths before the corresponding VMENTER. These code
350     + paths have been verified that they cannot expose secrets or other
351     + interesting data to an attacker, but they can leak information about the
352     + address space layout of the hypervisor.
353     +
354     + Unconditional mode flushes L1D on all VMENTER invocations and provides
355     + maximum protection. It has a higher overhead than the conditional
356     + mode. The overhead cannot be quantified correctly as it depends on the
357     + workload scenario and the resulting number of VMEXITs.
358     +
359     + The general recommendation is to enable L1D flush on VMENTER. The kernel
360     + defaults to conditional mode on affected processors.
361     +
362     + **Note**, that L1D flush does not prevent the SMT problem because the
363     + sibling thread will also bring back its data into the L1D which makes it
364     + attackable again.
365     +
366     + L1D flush can be controlled by the administrator via the kernel command
367     + line and sysfs control files. See :ref:`mitigation_control_command_line`
368     + and :ref:`mitigation_control_kvm`.
369     +
370     +.. _guest_confinement:
371     +
372     +2. Guest VCPU confinement to dedicated physical cores
373     +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
374     +
375     + To address the SMT problem, it is possible to make a guest or a group of
376     + guests affine to one or more physical cores. The proper mechanism for
377     + that is to utilize exclusive cpusets to ensure that no other guest or
378     + host tasks can run on these cores.
379     +
380     + If only a single guest or related guests run on sibling SMT threads on
381     + the same physical core then they can only attack their own memory and
382     + restricted parts of the host memory.
383     +
384     + Host memory is attackable, when one of the sibling SMT threads runs in
385     + host OS (hypervisor) context and the other in guest context. The amount
386     + of valuable information from the host OS context depends on the context
387     + which the host OS executes, i.e. interrupts, soft interrupts and kernel
388     + threads. The amount of valuable data from these contexts cannot be
389     + declared as non-interesting for an attacker without deep inspection of
390     + the code.
391     +
392     + **Note**, that assigning guests to a fixed set of physical cores affects
393     + the ability of the scheduler to do load balancing and might have
394     + negative effects on CPU utilization depending on the hosting
395     + scenario. Disabling SMT might be a viable alternative for particular
396     + scenarios.
397     +
398     + For further information about confining guests to a single or to a group
399     + of cores consult the cpusets documentation:
400     +
401     + https://www.kernel.org/doc/Documentation/cgroup-v1/cpusets.txt
402     +
403     +.. _interrupt_isolation:
404     +
405     +3. Interrupt affinity
406     +^^^^^^^^^^^^^^^^^^^^^
407     +
408     + Interrupts can be made affine to logical CPUs. This is not universally
409     + true because there are types of interrupts which are truly per CPU
410     + interrupts, e.g. the local timer interrupt. Aside of that multi queue
411     + devices affine their interrupts to single CPUs or groups of CPUs per
412     + queue without allowing the administrator to control the affinities.
413     +
414     + Moving the interrupts, which can be affinity controlled, away from CPUs
415     + which run untrusted guests, reduces the attack vector space.
416     +
417     + Whether the interrupts which are affine to CPUs, which run untrusted
418     + guests, provide interesting data for an attacker depends on the system
419     + configuration and the scenarios which run on the system. While for some
420     + of the interrupts it can be assumed that they won't expose interesting
421     + information beyond exposing hints about the host OS memory layout, there
422     + is no way to make general assumptions.
423     +
424     + Interrupt affinity can be controlled by the administrator via the
425     + /proc/irq/$NR/smp_affinity[_list] files. Limited documentation is
426     + available at:
427     +
428     + https://www.kernel.org/doc/Documentation/IRQ-affinity.txt
429     +
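
For instance (a sketch, not part of the patch; IRQ number 42 and the CPU list are purely hypothetical), pinning one controllable interrupt to housekeeping CPUs could be done like this:

    #include <stdio.h>

    int main(void)
    {
            /* Keep hypothetical IRQ 42 on CPUs 0-3, away from CPUs running guests. */
            FILE *f = fopen("/proc/irq/42/smp_affinity_list", "w");

            if (!f || fputs("0-3", f) == EOF)
                    perror("smp_affinity_list");
            if (f)
                    fclose(f);
            return 0;
    }
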
430     +.. _smt_control:
431     +
432     +4. SMT control
433     +^^^^^^^^^^^^^^
434     +
435     + To prevent the SMT issues of L1TF it might be necessary to disable SMT
436     + completely. Disabling SMT can have a significant performance impact, but
437     + the impact depends on the hosting scenario and the type of workloads.
438     + The impact of disabling SMT also needs to be weighed against the impact
439     + of other mitigation solutions like confining guests to dedicated cores.
440     +
441     + The kernel provides a sysfs interface to retrieve the status of SMT and
442     + to control it. It also provides a kernel command line interface to
443     + control SMT.
444     +
445     + The kernel command line interface consists of the following options:
446     +
447     + =========== ==========================================================
448     + nosmt Affects the bring up of the secondary CPUs during boot. The
449     + kernel tries to bring all present CPUs online during the
450     + boot process. "nosmt" makes sure that from each physical
451     + core only one - the so called primary (hyper) thread is
452     + activated. Due to a design flaw of Intel processors related
453     + to Machine Check Exceptions the non primary siblings have
454     + to be brought up at least partially and are then shut down
455     + again. "nosmt" can be undone via the sysfs interface.
456     +
457     + nosmt=force Has the same effect as "nosmt" but it does not allow to
458     + undo the SMT disable via the sysfs interface.
459     + =========== ==========================================================
460     +
461     + The sysfs interface provides two files:
462     +
463     + - /sys/devices/system/cpu/smt/control
464     + - /sys/devices/system/cpu/smt/active
465     +
466     + /sys/devices/system/cpu/smt/control:
467     +
468     + This file allows to read out the SMT control state and provides the
469     + ability to disable or (re)enable SMT. The possible states are:
470     +
471     + ============== ===================================================
472     + on SMT is supported by the CPU and enabled. All
473     + logical CPUs can be onlined and offlined without
474     + restrictions.
475     +
476     + off SMT is supported by the CPU and disabled. Only
477     + the so called primary SMT threads can be onlined
478     + and offlined without restrictions. An attempt to
479     + online a non-primary sibling is rejected
480     +
481     + forceoff Same as 'off' but the state cannot be controlled.
482     + Attempts to write to the control file are rejected.
483     +
484     + notsupported The processor does not support SMT. It's therefore
485     + not affected by the SMT implications of L1TF.
486     + Attempts to write to the control file are rejected.
487     + ============== ===================================================
488     +
489     + The possible states which can be written into this file to control SMT
490     + state are:
491     +
492     + - on
493     + - off
494     + - forceoff
495     +
496     + /sys/devices/system/cpu/smt/active:
497     +
498     + This file reports whether SMT is enabled and active, i.e. if on any
499     + physical core two or more sibling threads are online.
500     +
501     + SMT control is also possible at boot time via the l1tf kernel command
502     + line parameter in combination with L1D flush control. See
503     + :ref:`mitigation_control_command_line`.
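
A small sketch of toggling SMT at runtime through this file (not part of the patch); the error path shows how the rejection in the 'forceoff' and 'notsupported' states described above surfaces to userspace:

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            const char *state = "off";      /* or "on" / "forceoff" */
            int fd = open("/sys/devices/system/cpu/smt/control", O_WRONLY);

            if (fd < 0) {
                    perror("open smt/control");
                    return 1;
            }
            /* Rejected when the current state is "forceoff" or "notsupported". */
            if (write(fd, state, strlen(state)) < 0)
                    perror("write smt/control");
            close(fd);
            return 0;
    }
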
504     +
505     +5. Disabling EPT
506     +^^^^^^^^^^^^^^^^
507     +
508     + Disabling EPT for virtual machines provides full mitigation for L1TF even
509     + with SMT enabled, because the effective page tables for guests are
510     + managed and sanitized by the hypervisor. Though disabling EPT has a
511     + significant performance impact especially when the Meltdown mitigation
512     + KPTI is enabled.
513     +
514     + EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter.
515     +
516     +There is ongoing research and development for new mitigation mechanisms to
517     +address the performance impact of disabling SMT or EPT.
518     +
519     +.. _mitigation_control_command_line:
520     +
521     +Mitigation control on the kernel command line
522     +---------------------------------------------
523     +
524     +The kernel command line allows to control the L1TF mitigations at boot
525     +time with the option "l1tf=". The valid arguments for this option are:
526     +
527     + ============ =============================================================
528     + full Provides all available mitigations for the L1TF
529     + vulnerability. Disables SMT and enables all mitigations in
530     + the hypervisors, i.e. unconditional L1D flushing
531     +
532     + SMT control and L1D flush control via the sysfs interface
533     + is still possible after boot. Hypervisors will issue a
534     + warning when the first VM is started in a potentially
535     + insecure configuration, i.e. SMT enabled or L1D flush
536     + disabled.
537     +
538     + full,force Same as 'full', but disables SMT and L1D flush runtime
539     + control. Implies the 'nosmt=force' command line option.
540     + (i.e. sysfs control of SMT is disabled.)
541     +
542     + flush Leaves SMT enabled and enables the default hypervisor
543     + mitigation, i.e. conditional L1D flushing
544     +
545     + SMT control and L1D flush control via the sysfs interface
546     + is still possible after boot. Hypervisors will issue a
547     + warning when the first VM is started in a potentially
548     + insecure configuration, i.e. SMT enabled or L1D flush
549     + disabled.
550     +
551     + flush,nosmt Disables SMT and enables the default hypervisor mitigation,
552     + i.e. conditional L1D flushing.
553     +
554     + SMT control and L1D flush control via the sysfs interface
555     + is still possible after boot. Hypervisors will issue a
556     + warning when the first VM is started in a potentially
557     + insecure configuration, i.e. SMT enabled or L1D flush
558     + disabled.
559     +
560     + flush,nowarn Same as 'flush', but hypervisors will not warn when a VM is
561     + started in a potentially insecure configuration.
562     +
563     + off Disables hypervisor mitigations and doesn't emit any
564     + warnings.
565     + ============ =============================================================
566     +
567     +The default is 'flush'. For details about L1D flushing see :ref:`l1d_flush`.
568     +
569     +
570     +.. _mitigation_control_kvm:
571     +
572     +Mitigation control for KVM - module parameter
573     +-------------------------------------------------------------
574     +
575     +The KVM hypervisor mitigation mechanism, flushing the L1D cache when
576     +entering a guest, can be controlled with a module parameter.
577     +
578     +The option/parameter is "kvm-intel.vmentry_l1d_flush=". It takes the
579     +following arguments:
580     +
581     + ============ ==============================================================
582     + always L1D cache flush on every VMENTER.
583     +
584     + cond Flush L1D on VMENTER only when the code between VMEXIT and
585     + VMENTER can leak host memory which is considered
586     + interesting for an attacker. This still can leak host memory
587     + which allows e.g. to determine the hosts address space layout.
588     +
589     + never Disables the mitigation
590     + ============ ==============================================================
591     +
592     +The parameter can be provided on the kernel command line, as a module
593     +parameter when loading the modules and at runtime modified via the sysfs
594     +file:
595     +
596     +/sys/module/kvm_intel/parameters/vmentry_l1d_flush
597     +
598     +The default is 'cond'. If 'l1tf=full,force' is given on the kernel command
599     +line, then 'always' is enforced and the kvm-intel.vmentry_l1d_flush
600     +module parameter is ignored and writes to the sysfs file are rejected.
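
A short sketch of changing the flush mode at runtime through that file (not part of the patch); as stated above, the write fails when 'l1tf=full,force' pins the mode:

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            const char *mode = "always";    /* "never", "cond" or "always" */
            int fd = open("/sys/module/kvm_intel/parameters/vmentry_l1d_flush",
                          O_WRONLY);

            if (fd < 0 || write(fd, mode, strlen(mode)) < 0)
                    perror("vmentry_l1d_flush");
            if (fd >= 0)
                    close(fd);
            return 0;
    }
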
601     +
602     +
603     +Mitigation selection guide
604     +--------------------------
605     +
606     +1. No virtualization in use
607     +^^^^^^^^^^^^^^^^^^^^^^^^^^^
608     +
609     + The system is protected by the kernel unconditionally and no further
610     + action is required.
611     +
612     +2. Virtualization with trusted guests
613     +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
614     +
615     + If the guest comes from a trusted source and the guest OS kernel is
616     + guaranteed to have the L1TF mitigations in place the system is fully
617     + protected against L1TF and no further action is required.
618     +
619     + To avoid the overhead of the default L1D flushing on VMENTER the
620     + administrator can disable the flushing via the kernel command line and
621     + sysfs control files. See :ref:`mitigation_control_command_line` and
622     + :ref:`mitigation_control_kvm`.
623     +
624     +
625     +3. Virtualization with untrusted guests
626     +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
627     +
628     +3.1. SMT not supported or disabled
629     +""""""""""""""""""""""""""""""""""
630     +
631     + If SMT is not supported by the processor or disabled in the BIOS or by
632     + the kernel, it's only required to enforce L1D flushing on VMENTER.
633     +
634     + Conditional L1D flushing is the default behaviour and can be tuned. See
635     + :ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`.
636     +
637     +3.2. EPT not supported or disabled
638     +""""""""""""""""""""""""""""""""""
639     +
640     + If EPT is not supported by the processor or disabled in the hypervisor,
641     + the system is fully protected. SMT can stay enabled and L1D flushing on
642     + VMENTER is not required.
643     +
644     + EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter.
645     +
646     +3.3. SMT and EPT supported and active
647     +"""""""""""""""""""""""""""""""""""""
648     +
649     + If SMT and EPT are supported and active then various degrees of
650     + mitigations can be employed:
651     +
652     + - L1D flushing on VMENTER:
653     +
654     + L1D flushing on VMENTER is the minimal protection requirement, but it
655     + is only potent in combination with other mitigation methods.
656     +
657     + Conditional L1D flushing is the default behaviour and can be tuned. See
658     + :ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`.
659     +
660     + - Guest confinement:
661     +
662     + Confinement of guests to a single or a group of physical cores which
663     + are not running any other processes, can reduce the attack surface
664     + significantly, but interrupts, soft interrupts and kernel threads can
665     + still expose valuable data to a potential attacker. See
666     + :ref:`guest_confinement`.
667     +
668     + - Interrupt isolation:
669     +
670     + Isolating the guest CPUs from interrupts can reduce the attack surface
671     + further, but still allows a malicious guest to explore a limited amount
672     + of host physical memory. This can at least be used to gain knowledge
673     + about the host address space layout. The interrupts which have a fixed
674     + affinity to the CPUs which run the untrusted guests can depending on
675     + the scenario still trigger soft interrupts and schedule kernel threads
676     + which might expose valuable information. See
677     + :ref:`interrupt_isolation`.
678     +
679     +The above three mitigation methods combined can provide protection to a
680     +certain degree, but the risk of the remaining attack surface has to be
681     +carefully analyzed. For full protection the following methods are
682     +available:
683     +
684     + - Disabling SMT:
685     +
686     + Disabling SMT and enforcing the L1D flushing provides the maximum
687     + amount of protection. This mitigation is not depending on any of the
688     + above mitigation methods.
689     +
690     + SMT control and L1D flushing can be tuned by the command line
691     + parameters 'nosmt', 'l1tf', 'kvm-intel.vmentry_l1d_flush' and at run
692     + time with the matching sysfs control files. See :ref:`smt_control`,
693     + :ref:`mitigation_control_command_line` and
694     + :ref:`mitigation_control_kvm`.
695     +
696     + - Disabling EPT:
697     +
698     + Disabling EPT provides the maximum amount of protection as well. It is
699     + not depending on any of the above mitigation methods. SMT can stay
700     + enabled and L1D flushing is not required, but the performance impact is
701     + significant.
702     +
703     + EPT can be disabled in the hypervisor via the 'kvm-intel.ept'
704     + parameter.
705     +
706     +3.4. Nested virtual machines
707     +""""""""""""""""""""""""""""
708     +
709     +When nested virtualization is in use, three operating systems are involved:
710     +the bare metal hypervisor, the nested hypervisor and the nested virtual
711     +machine. VMENTER operations from the nested hypervisor into the nested
712     +guest will always be processed by the bare metal hypervisor. If KVM is the
713     +bare metal hypervisor it will:
714     +
715     + - Flush the L1D cache on every switch from the nested hypervisor to the
716     + nested virtual machine, so that the nested hypervisor's secrets are not
717     + exposed to the nested virtual machine;
718     +
719     + - Flush the L1D cache on every switch from the nested virtual machine to
720     + the nested hypervisor; this is a complex operation, and flushing the L1D
721     + cache avoids that the bare metal hypervisor's secrets are exposed to the
722     + nested virtual machine;
723     +
724     + - Instruct the nested hypervisor to not perform any L1D cache flush. This
725     + is an optimization to avoid double L1D flushing.
726     +
727     +
728     +.. _default_mitigations:
729     +
730     +Default mitigations
731     +-------------------
732     +
733     + The kernel default mitigations for vulnerable processors are:
734     +
735     + - PTE inversion to protect against malicious user space. This is done
736     + unconditionally and cannot be controlled.
737     +
738     + - L1D conditional flushing on VMENTER when EPT is enabled for
739     + a guest.
740     +
741     + The kernel does not by default enforce the disabling of SMT, which leaves
742     + SMT systems vulnerable when running untrusted guests with EPT enabled.
743     +
744     + The rationale for this choice is:
745     +
746     + - Force disabling SMT can break existing setups, especially with
747     + unattended updates.
748     +
749     + - If regular users run untrusted guests on their machine, then L1TF is
750     + just an add on to other malware which might be embedded in an untrusted
751     + guest, e.g. spam-bots or attacks on the local network.
752     +
753     + There is no technical way to prevent a user from running untrusted code
754     + on their machines blindly.
755     +
756     + - It's technically extremely unlikely and from today's knowledge even
757     + impossible that L1TF can be exploited via the most popular attack
758     + mechanisms like JavaScript because these mechanisms have no way to
759     + control PTEs. If this would be possible and not other mitigation would
760     + be possible, then the default might be different.
761     +
762     + - The administrators of cloud and hosting setups have to carefully
763     + analyze the risk for their scenarios and make the appropriate
764     + mitigation choices, which might even vary across their deployed
765     + machines and also result in other changes of their overall setup.
766     + There is no way for the kernel to provide a sensible default for this
767     + kind of scenarios.
768     diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
769     index e46c14fac9da..3ff58a8ffabb 100644
770     --- a/Documentation/virtual/kvm/api.txt
771     +++ b/Documentation/virtual/kvm/api.txt
772     @@ -122,14 +122,15 @@ KVM_CAP_S390_UCONTROL and use the flag KVM_VM_S390_UCONTROL as
773     privileged user (CAP_SYS_ADMIN).
774    
775    
776     -4.3 KVM_GET_MSR_INDEX_LIST
777     +4.3 KVM_GET_MSR_INDEX_LIST, KVM_GET_MSR_FEATURE_INDEX_LIST
778    
779     -Capability: basic
780     +Capability: basic, KVM_CAP_GET_MSR_FEATURES for KVM_GET_MSR_FEATURE_INDEX_LIST
781     Architectures: x86
782     -Type: system
783     +Type: system ioctl
784     Parameters: struct kvm_msr_list (in/out)
785     Returns: 0 on success; -1 on error
786     Errors:
787     + EFAULT: the msr index list cannot be read from or written to
788     E2BIG: the msr index list is to be to fit in the array specified by
789     the user.
790    
791     @@ -138,16 +139,23 @@ struct kvm_msr_list {
792     __u32 indices[0];
793     };
794    
795     -This ioctl returns the guest msrs that are supported. The list varies
796     -by kvm version and host processor, but does not change otherwise. The
797     -user fills in the size of the indices array in nmsrs, and in return
798     -kvm adjusts nmsrs to reflect the actual number of msrs and fills in
799     -the indices array with their numbers.
800     +The user fills in the size of the indices array in nmsrs, and in return
801     +kvm adjusts nmsrs to reflect the actual number of msrs and fills in the
802     +indices array with their numbers.
803     +
804     +KVM_GET_MSR_INDEX_LIST returns the guest msrs that are supported. The list
805     +varies by kvm version and host processor, but does not change otherwise.
806    
807     Note: if kvm indicates supports MCE (KVM_CAP_MCE), then the MCE bank MSRs are
808     not returned in the MSR list, as different vcpus can have a different number
809     of banks, as set via the KVM_X86_SETUP_MCE ioctl.
810    
811     +KVM_GET_MSR_FEATURE_INDEX_LIST returns the list of MSRs that can be passed
812     +to the KVM_GET_MSRS system ioctl. This lets userspace probe host capabilities
813     +and processor features that are exposed via MSRs (e.g., VMX capabilities).
814     +This list also varies by kvm version and host processor, but does not change
815     +otherwise.
816     +
817    
818     4.4 KVM_CHECK_EXTENSION
819    
820     @@ -474,14 +482,22 @@ Support for this has been removed. Use KVM_SET_GUEST_DEBUG instead.
821    
822     4.18 KVM_GET_MSRS
823    
824     -Capability: basic
825     +Capability: basic (vcpu), KVM_CAP_GET_MSR_FEATURES (system)
826     Architectures: x86
827     -Type: vcpu ioctl
828     +Type: system ioctl, vcpu ioctl
829     Parameters: struct kvm_msrs (in/out)
830     -Returns: 0 on success, -1 on error
831     +Returns: number of msrs successfully returned;
832     + -1 on error
833     +
834     +When used as a system ioctl:
835     +Reads the values of MSR-based features that are available for the VM. This
836     +is similar to KVM_GET_SUPPORTED_CPUID, but it returns MSR indices and values.
837     +The list of msr-based features can be obtained using KVM_GET_MSR_FEATURE_INDEX_LIST
838     +in a system ioctl.
839    
840     +When used as a vcpu ioctl:
841     Reads model-specific registers from the vcpu. Supported msr indices can
842     -be obtained using KVM_GET_MSR_INDEX_LIST.
843     +be obtained using KVM_GET_MSR_INDEX_LIST in a system ioctl.
844    
845     struct kvm_msrs {
846     __u32 nmsrs; /* number of msrs in entries */
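
To make the system-ioctl usage concrete, here is a hedged userspace sketch (not part of the patch; it assumes a <linux/kvm.h> new enough to define KVM_CAP_GET_MSR_FEATURES, and MSR index 0x10a is IA32_ARCH_CAPABILITIES as defined elsewhere in this series) that reads one MSR-based feature through /dev/kvm:

    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/kvm.h>

    int main(void)
    {
            struct kvm_msrs *req;
            int kvm = open("/dev/kvm", O_RDWR);

            if (kvm < 0) {
                    perror("/dev/kvm");
                    return 1;
            }
            if (ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_GET_MSR_FEATURES) <= 0) {
                    fprintf(stderr, "KVM_CAP_GET_MSR_FEATURES not available\n");
                    return 1;
            }

            req = calloc(1, sizeof(*req) + sizeof(struct kvm_msr_entry));
            if (!req)
                    return 1;
            req->nmsrs = 1;
            req->entries[0].index = 0x10a;  /* IA32_ARCH_CAPABILITIES */

            /* As a system ioctl, KVM_GET_MSRS returns the number of MSRs read. */
            if (ioctl(kvm, KVM_GET_MSRS, req) == 1)
                    printf("IA32_ARCH_CAPABILITIES = %#llx\n",
                           (unsigned long long)req->entries[0].data);

            free(req);
            close(kvm);
            return 0;
    }
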
847     diff --git a/Makefile b/Makefile
848     index 0723bbe1d4a7..fea2fe577185 100644
849     --- a/Makefile
850     +++ b/Makefile
851     @@ -1,6 +1,6 @@
852     VERSION = 4
853     PATCHLEVEL = 9
854     -SUBLEVEL = 119
855     +SUBLEVEL = 120
856     EXTRAVERSION =
857     NAME = Roaring Lionus
858    
859     diff --git a/arch/Kconfig b/arch/Kconfig
860     index 659bdd079277..b39d0f93c67b 100644
861     --- a/arch/Kconfig
862     +++ b/arch/Kconfig
863     @@ -5,6 +5,9 @@
864     config KEXEC_CORE
865     bool
866    
867     +config HOTPLUG_SMT
868     + bool
869     +
870     config OPROFILE
871     tristate "OProfile system profiling"
872     depends on PROFILING
873     diff --git a/arch/arm/boot/dts/imx6sx.dtsi b/arch/arm/boot/dts/imx6sx.dtsi
874     index 1a473e83efbf..a885052157f0 100644
875     --- a/arch/arm/boot/dts/imx6sx.dtsi
876     +++ b/arch/arm/boot/dts/imx6sx.dtsi
877     @@ -1280,7 +1280,7 @@
878     /* non-prefetchable memory */
879     0x82000000 0 0x08000000 0x08000000 0 0x00f00000>;
880     num-lanes = <1>;
881     - interrupts = <GIC_SPI 123 IRQ_TYPE_LEVEL_HIGH>;
882     + interrupts = <GIC_SPI 120 IRQ_TYPE_LEVEL_HIGH>;
883     clocks = <&clks IMX6SX_CLK_PCIE_REF_125M>,
884     <&clks IMX6SX_CLK_PCIE_AXI>,
885     <&clks IMX6SX_CLK_LVDS1_OUT>,
886     diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
887     index a14b86587013..3c37af11dab6 100644
888     --- a/arch/parisc/Kconfig
889     +++ b/arch/parisc/Kconfig
890     @@ -184,7 +184,7 @@ config PREFETCH
891    
892     config MLONGCALLS
893     bool "Enable the -mlong-calls compiler option for big kernels"
894     - def_bool y if (!MODULES)
895     + default y
896     depends on PA8X00
897     help
898     If you configure the kernel to include many drivers built-in instead
899     diff --git a/arch/parisc/include/asm/barrier.h b/arch/parisc/include/asm/barrier.h
900     new file mode 100644
901     index 000000000000..dbaaca84f27f
902     --- /dev/null
903     +++ b/arch/parisc/include/asm/barrier.h
904     @@ -0,0 +1,32 @@
905     +/* SPDX-License-Identifier: GPL-2.0 */
906     +#ifndef __ASM_BARRIER_H
907     +#define __ASM_BARRIER_H
908     +
909     +#ifndef __ASSEMBLY__
910     +
911     +/* The synchronize caches instruction executes as a nop on systems in
912     + which all memory references are performed in order. */
913     +#define synchronize_caches() __asm__ __volatile__ ("sync" : : : "memory")
914     +
915     +#if defined(CONFIG_SMP)
916     +#define mb() do { synchronize_caches(); } while (0)
917     +#define rmb() mb()
918     +#define wmb() mb()
919     +#define dma_rmb() mb()
920     +#define dma_wmb() mb()
921     +#else
922     +#define mb() barrier()
923     +#define rmb() barrier()
924     +#define wmb() barrier()
925     +#define dma_rmb() barrier()
926     +#define dma_wmb() barrier()
927     +#endif
928     +
929     +#define __smp_mb() mb()
930     +#define __smp_rmb() mb()
931     +#define __smp_wmb() mb()
932     +
933     +#include <asm-generic/barrier.h>
934     +
935     +#endif /* !__ASSEMBLY__ */
936     +#endif /* __ASM_BARRIER_H */
937     diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S
938     index e3d3e8e1d708..015614405755 100644
939     --- a/arch/parisc/kernel/entry.S
940     +++ b/arch/parisc/kernel/entry.S
941     @@ -481,6 +481,8 @@
942     /* Release pa_tlb_lock lock without reloading lock address. */
943     .macro tlb_unlock0 spc,tmp
944     #ifdef CONFIG_SMP
945     + or,COND(=) %r0,\spc,%r0
946     + sync
947     or,COND(=) %r0,\spc,%r0
948     stw \spc,0(\tmp)
949     #endif
950     diff --git a/arch/parisc/kernel/pacache.S b/arch/parisc/kernel/pacache.S
951     index 67b0f7532e83..3e163df49cf3 100644
952     --- a/arch/parisc/kernel/pacache.S
953     +++ b/arch/parisc/kernel/pacache.S
954     @@ -354,6 +354,7 @@ ENDPROC_CFI(flush_data_cache_local)
955     .macro tlb_unlock la,flags,tmp
956     #ifdef CONFIG_SMP
957     ldi 1,\tmp
958     + sync
959     stw \tmp,0(\la)
960     mtsm \flags
961     #endif
962     diff --git a/arch/parisc/kernel/syscall.S b/arch/parisc/kernel/syscall.S
963     index e775f80ae28c..4886a6db42e9 100644
964     --- a/arch/parisc/kernel/syscall.S
965     +++ b/arch/parisc/kernel/syscall.S
966     @@ -633,6 +633,7 @@ cas_action:
967     sub,<> %r28, %r25, %r0
968     2: stw,ma %r24, 0(%r26)
969     /* Free lock */
970     + sync
971     stw,ma %r20, 0(%sr2,%r20)
972     #if ENABLE_LWS_DEBUG
973     /* Clear thread register indicator */
974     @@ -647,6 +648,7 @@ cas_action:
975     3:
976     /* Error occurred on load or store */
977     /* Free lock */
978     + sync
979     stw %r20, 0(%sr2,%r20)
980     #if ENABLE_LWS_DEBUG
981     stw %r0, 4(%sr2,%r20)
982     @@ -848,6 +850,7 @@ cas2_action:
983    
984     cas2_end:
985     /* Free lock */
986     + sync
987     stw,ma %r20, 0(%sr2,%r20)
988     /* Enable interrupts */
989     ssm PSW_SM_I, %r0
990     @@ -858,6 +861,7 @@ cas2_end:
991     22:
992     /* Error occurred on load or store */
993     /* Free lock */
994     + sync
995     stw %r20, 0(%sr2,%r20)
996     ssm PSW_SM_I, %r0
997     ldo 1(%r0),%r28
998     diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
999     index a4ac7bab15f7..e31001ec4c07 100644
1000     --- a/arch/x86/Kconfig
1001     +++ b/arch/x86/Kconfig
1002     @@ -147,6 +147,7 @@ config X86
1003     select HAVE_UID16 if X86_32 || IA32_EMULATION
1004     select HAVE_UNSTABLE_SCHED_CLOCK
1005     select HAVE_USER_RETURN_NOTIFIER
1006     + select HOTPLUG_SMT if SMP
1007     select IRQ_FORCED_THREADING
1008     select MODULES_USE_ELF_RELA if X86_64
1009     select MODULES_USE_ELF_REL if X86_32
1010     diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
1011     index f5aaf6c83222..2188b5af8167 100644
1012     --- a/arch/x86/include/asm/apic.h
1013     +++ b/arch/x86/include/asm/apic.h
1014     @@ -12,6 +12,7 @@
1015     #include <asm/mpspec.h>
1016     #include <asm/msr.h>
1017     #include <asm/idle.h>
1018     +#include <asm/hardirq.h>
1019    
1020     #define ARCH_APICTIMER_STOPS_ON_C3 1
1021    
1022     @@ -633,6 +634,13 @@ extern int default_check_phys_apicid_present(int phys_apicid);
1023     #endif
1024    
1025     #endif /* CONFIG_X86_LOCAL_APIC */
1026     +
1027     +#ifdef CONFIG_SMP
1028     +bool apic_id_is_primary_thread(unsigned int id);
1029     +#else
1030     +static inline bool apic_id_is_primary_thread(unsigned int id) { return false; }
1031     +#endif
1032     +
1033     extern void irq_enter(void);
1034     extern void irq_exit(void);
1035    
1036     @@ -640,6 +648,7 @@ static inline void entering_irq(void)
1037     {
1038     irq_enter();
1039     exit_idle();
1040     + kvm_set_cpu_l1tf_flush_l1d();
1041     }
1042    
1043     static inline void entering_ack_irq(void)
1044     @@ -652,6 +661,7 @@ static inline void ipi_entering_ack_irq(void)
1045     {
1046     irq_enter();
1047     ack_APIC_irq();
1048     + kvm_set_cpu_l1tf_flush_l1d();
1049     }
1050    
1051     static inline void exiting_irq(void)
1052     diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
1053     index aea30afeddb8..fbc1474960e3 100644
1054     --- a/arch/x86/include/asm/cpufeatures.h
1055     +++ b/arch/x86/include/asm/cpufeatures.h
1056     @@ -213,7 +213,7 @@
1057     #define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */
1058     #define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */
1059     #define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD family 0x17 (Zen) */
1060     -
1061     +#define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */
1062    
1063     /* Virtualization flags: Linux defined, word 8 */
1064     #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
1065     @@ -317,6 +317,7 @@
1066     #define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */
1067     #define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */
1068     #define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */
1069     +#define X86_FEATURE_FLUSH_L1D (18*32+28) /* Flush L1D cache */
1070     #define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */
1071     #define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */
1072    
1073     @@ -349,5 +350,6 @@
1074     #define X86_BUG_SPECTRE_V1 X86_BUG(15) /* CPU is affected by Spectre variant 1 attack with conditional branches */
1075     #define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */
1076     #define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* CPU is affected by speculative store bypass attack */
1077     +#define X86_BUG_L1TF X86_BUG(18) /* CPU is affected by L1 Terminal Fault */
1078    
1079     #endif /* _ASM_X86_CPUFEATURES_H */
1080     diff --git a/arch/x86/include/asm/dmi.h b/arch/x86/include/asm/dmi.h
1081     index 3c69fed215c5..d8b95604a2e7 100644
1082     --- a/arch/x86/include/asm/dmi.h
1083     +++ b/arch/x86/include/asm/dmi.h
1084     @@ -3,8 +3,8 @@
1085    
1086     #include <linux/compiler.h>
1087     #include <linux/init.h>
1088     +#include <linux/io.h>
1089    
1090     -#include <asm/io.h>
1091     #include <asm/setup.h>
1092    
1093     static __always_inline __init void *dmi_alloc(unsigned len)
1094     diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
1095     index 9b76cd331990..987165924a32 100644
1096     --- a/arch/x86/include/asm/hardirq.h
1097     +++ b/arch/x86/include/asm/hardirq.h
1098     @@ -2,10 +2,12 @@
1099     #define _ASM_X86_HARDIRQ_H
1100    
1101     #include <linux/threads.h>
1102     -#include <linux/irq.h>
1103    
1104     typedef struct {
1105     - unsigned int __softirq_pending;
1106     + u16 __softirq_pending;
1107     +#if IS_ENABLED(CONFIG_KVM_INTEL)
1108     + u8 kvm_cpu_l1tf_flush_l1d;
1109     +#endif
1110     unsigned int __nmi_count; /* arch dependent */
1111     #ifdef CONFIG_X86_LOCAL_APIC
1112     unsigned int apic_timer_irqs; /* arch dependent */
1113     @@ -60,4 +62,24 @@ extern u64 arch_irq_stat_cpu(unsigned int cpu);
1114     extern u64 arch_irq_stat(void);
1115     #define arch_irq_stat arch_irq_stat
1116    
1117     +
1118     +#if IS_ENABLED(CONFIG_KVM_INTEL)
1119     +static inline void kvm_set_cpu_l1tf_flush_l1d(void)
1120     +{
1121     + __this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 1);
1122     +}
1123     +
1124     +static inline void kvm_clear_cpu_l1tf_flush_l1d(void)
1125     +{
1126     + __this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 0);
1127     +}
1128     +
1129     +static inline bool kvm_get_cpu_l1tf_flush_l1d(void)
1130     +{
1131     + return __this_cpu_read(irq_stat.kvm_cpu_l1tf_flush_l1d);
1132     +}
1133     +#else /* !IS_ENABLED(CONFIG_KVM_INTEL) */
1134     +static inline void kvm_set_cpu_l1tf_flush_l1d(void) { }
1135     +#endif /* IS_ENABLED(CONFIG_KVM_INTEL) */
1136     +
1137     #endif /* _ASM_X86_HARDIRQ_H */
1138     diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
1139     index 8a8a6c66be9a..5b1177f5a963 100644
1140     --- a/arch/x86/include/asm/irqflags.h
1141     +++ b/arch/x86/include/asm/irqflags.h
1142     @@ -12,6 +12,8 @@
1143     * Interrupt control:
1144     */
1145    
1146     +/* Declaration required for gcc < 4.9 to prevent -Werror=missing-prototypes */
1147     +extern inline unsigned long native_save_fl(void);
1148     extern inline unsigned long native_save_fl(void)
1149     {
1150     unsigned long flags;
1151     diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
1152     index 7598a6c26f76..22a0ccb17ad0 100644
1153     --- a/arch/x86/include/asm/kvm_host.h
1154     +++ b/arch/x86/include/asm/kvm_host.h
1155     @@ -17,6 +17,7 @@
1156     #include <linux/tracepoint.h>
1157     #include <linux/cpumask.h>
1158     #include <linux/irq_work.h>
1159     +#include <linux/irq.h>
1160    
1161     #include <linux/kvm.h>
1162     #include <linux/kvm_para.h>
1163     @@ -485,6 +486,7 @@ struct kvm_vcpu_arch {
1164     u64 smbase;
1165     bool tpr_access_reporting;
1166     u64 ia32_xss;
1167     + u64 microcode_version;
1168    
1169     /*
1170     * Paging state of the vcpu
1171     @@ -659,6 +661,9 @@ struct kvm_vcpu_arch {
1172    
1173     int pending_ioapic_eoi;
1174     int pending_external_vector;
1175     +
1176     + /* Flush the L1 Data cache for L1TF mitigation on VMENTER */
1177     + bool l1tf_flush_l1d;
1178     };
1179    
1180     struct kvm_lpage_info {
1181     @@ -819,6 +824,7 @@ struct kvm_vcpu_stat {
1182     u64 signal_exits;
1183     u64 irq_window_exits;
1184     u64 nmi_window_exits;
1185     + u64 l1d_flush;
1186     u64 halt_exits;
1187     u64 halt_successful_poll;
1188     u64 halt_attempted_poll;
1189     @@ -1020,6 +1026,8 @@ struct kvm_x86_ops {
1190     void (*cancel_hv_timer)(struct kvm_vcpu *vcpu);
1191    
1192     void (*setup_mce)(struct kvm_vcpu *vcpu);
1193     +
1194     + int (*get_msr_feature)(struct kvm_msr_entry *entry);
1195     };
1196    
1197     struct kvm_arch_async_pf {
1198     @@ -1338,6 +1346,7 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu);
1199     void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
1200     unsigned long address);
1201    
1202     +u64 kvm_get_arch_capabilities(void);
1203     void kvm_define_shared_msr(unsigned index, u32 msr);
1204     int kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
1205    
1206     diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
1207     index 1ec13e253174..bbbb9b14ade1 100644
1208     --- a/arch/x86/include/asm/msr-index.h
1209     +++ b/arch/x86/include/asm/msr-index.h
1210     @@ -63,12 +63,19 @@
1211     #define MSR_IA32_ARCH_CAPABILITIES 0x0000010a
1212     #define ARCH_CAP_RDCL_NO (1 << 0) /* Not susceptible to Meltdown */
1213     #define ARCH_CAP_IBRS_ALL (1 << 1) /* Enhanced IBRS support */
1214     +#define ARCH_CAP_SKIP_VMENTRY_L1DFLUSH (1 << 3) /* Skip L1D flush on vmentry */
1215     #define ARCH_CAP_SSB_NO (1 << 4) /*
1216     * Not susceptible to Speculative Store Bypass
1217     * attack, so no Speculative Store Bypass
1218     * control required.
1219     */
1220    
1221     +#define MSR_IA32_FLUSH_CMD 0x0000010b
1222     +#define L1D_FLUSH (1 << 0) /*
1223     + * Writeback and invalidate the
1224     + * L1 data cache.
1225     + */
1226     +
1227     #define MSR_IA32_BBL_CR_CTL 0x00000119
1228     #define MSR_IA32_BBL_CR_CTL3 0x0000011e
1229    
1230     diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h
1231     index 3bae4969ac65..2622984b8f1c 100644
1232     --- a/arch/x86/include/asm/page_32_types.h
1233     +++ b/arch/x86/include/asm/page_32_types.h
1234     @@ -28,8 +28,13 @@
1235     #define N_EXCEPTION_STACKS 1
1236    
1237     #ifdef CONFIG_X86_PAE
1238     -/* 44=32+12, the limit we can fit into an unsigned long pfn */
1239     -#define __PHYSICAL_MASK_SHIFT 44
1240     +/*
1241     + * This is beyond the 44 bit limit imposed by the 32bit long pfns,
1242     + * but we need the full mask to make sure inverted PROT_NONE
1243     + * entries have all the host bits set in a guest.
1244     + * The real limit is still 44 bits.
1245     + */
1246     +#define __PHYSICAL_MASK_SHIFT 52
1247     #define __VIRTUAL_MASK_SHIFT 32
1248    
1249     #else /* !CONFIG_X86_PAE */
1250     diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
1251     index fd74a11959de..89c50332a71e 100644
1252     --- a/arch/x86/include/asm/pgtable-2level.h
1253     +++ b/arch/x86/include/asm/pgtable-2level.h
1254     @@ -77,4 +77,21 @@ static inline unsigned long pte_bitop(unsigned long value, unsigned int rightshi
1255     #define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low })
1256     #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
1257    
1258     +/* No inverted PFNs on 2 level page tables */
1259     +
1260     +static inline u64 protnone_mask(u64 val)
1261     +{
1262     + return 0;
1263     +}
1264     +
1265     +static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask)
1266     +{
1267     + return val;
1268     +}
1269     +
1270     +static inline bool __pte_needs_invert(u64 val)
1271     +{
1272     + return false;
1273     +}
1274     +
1275     #endif /* _ASM_X86_PGTABLE_2LEVEL_H */
1276     diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
1277     index cdaa58c9b39e..5c686382d84b 100644
1278     --- a/arch/x86/include/asm/pgtable-3level.h
1279     +++ b/arch/x86/include/asm/pgtable-3level.h
1280     @@ -177,11 +177,44 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp)
1281     #endif
1282    
1283     /* Encode and de-code a swap entry */
1284     +#define SWP_TYPE_BITS 5
1285     +
1286     +#define SWP_OFFSET_FIRST_BIT (_PAGE_BIT_PROTNONE + 1)
1287     +
1288     +/* We always extract/encode the offset by shifting it all the way up, and then down again */
1289     +#define SWP_OFFSET_SHIFT (SWP_OFFSET_FIRST_BIT + SWP_TYPE_BITS)
1290     +
1291     #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5)
1292     #define __swp_type(x) (((x).val) & 0x1f)
1293     #define __swp_offset(x) ((x).val >> 5)
1294     #define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5})
1295     -#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high })
1296     -#define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } })
1297     +
1298     +/*
1299     + * Normally, __swp_entry() converts from arch-independent swp_entry_t to
1300     + * arch-dependent swp_entry_t, and __swp_entry_to_pte() just stores the result
1301     + * to pte. But here we have 32bit swp_entry_t and 64bit pte, and need to use the
1302     + * whole 64 bits. Thus, we shift the "real" arch-dependent conversion to
1303     + * __swp_entry_to_pte() through the following helper macro based on 64bit
1304     + * __swp_entry().
1305     + */
1306     +#define __swp_pteval_entry(type, offset) ((pteval_t) { \
1307     + (~(pteval_t)(offset) << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) \
1308     + | ((pteval_t)(type) << (64 - SWP_TYPE_BITS)) })
1309     +
1310     +#define __swp_entry_to_pte(x) ((pte_t){ .pte = \
1311     + __swp_pteval_entry(__swp_type(x), __swp_offset(x)) })
1312     +/*
1313     + * Analogically, __pte_to_swp_entry() doesn't just extract the arch-dependent
1314     + * swp_entry_t, but also has to convert it from 64bit to the 32bit
1315     + * intermediate representation, using the following macros based on 64bit
1316     + * __swp_type() and __swp_offset().
1317     + */
1318     +#define __pteval_swp_type(x) ((unsigned long)((x).pte >> (64 - SWP_TYPE_BITS)))
1319     +#define __pteval_swp_offset(x) ((unsigned long)(~((x).pte) << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT))
1320     +
1321     +#define __pte_to_swp_entry(pte) (__swp_entry(__pteval_swp_type(pte), \
1322     + __pteval_swp_offset(pte)))
1323     +
1324     +#include <asm/pgtable-invert.h>
1325    
1326     #endif /* _ASM_X86_PGTABLE_3LEVEL_H */
1327     diff --git a/arch/x86/include/asm/pgtable-invert.h b/arch/x86/include/asm/pgtable-invert.h
1328     new file mode 100644
1329     index 000000000000..44b1203ece12
1330     --- /dev/null
1331     +++ b/arch/x86/include/asm/pgtable-invert.h
1332     @@ -0,0 +1,32 @@
1333     +/* SPDX-License-Identifier: GPL-2.0 */
1334     +#ifndef _ASM_PGTABLE_INVERT_H
1335     +#define _ASM_PGTABLE_INVERT_H 1
1336     +
1337     +#ifndef __ASSEMBLY__
1338     +
1339     +static inline bool __pte_needs_invert(u64 val)
1340     +{
1341     + return !(val & _PAGE_PRESENT);
1342     +}
1343     +
1344     +/* Get a mask to xor with the page table entry to get the correct pfn. */
1345     +static inline u64 protnone_mask(u64 val)
1346     +{
1347     + return __pte_needs_invert(val) ? ~0ull : 0;
1348     +}
1349     +
1350     +static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask)
1351     +{
1352     + /*
1353     + * When a PTE transitions from NONE to !NONE or vice-versa
1354     + * invert the PFN part to stop speculation.
1355     + * pte_pfn undoes this when needed.
1356     + */
1357     + if (__pte_needs_invert(oldval) != __pte_needs_invert(val))
1358     + val = (val & ~mask) | (~val & mask);
1359     + return val;
1360     +}
1361     +
1362     +#endif /* __ASSEMBLY__ */
1363     +
1364     +#endif
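Read together, the three helpers above implement a reversible XOR of the PFN bits of non-present PTEs. The following standalone sketch is illustrative only (not from the patch); _PAGE_PRESENT is assumed to be bit 0 and the PFN field bits 12-51, as on x86 with 4 KiB pages:

        #include <assert.h>
        #include <stdint.h>

        #define PAGE_PRESENT    (1ull << 0)
        #define PFN_MASK        0x000ffffffffff000ull

        static int pte_needs_invert(uint64_t val)
        {
                return !(val & PAGE_PRESENT);
        }

        static uint64_t protnone_mask(uint64_t val)
        {
                /* XOR mask applied to the PFN bits of non-present entries. */
                return pte_needs_invert(val) ? ~0ull : 0;
        }

        static uint64_t flip_protnone_guard(uint64_t oldval, uint64_t val, uint64_t mask)
        {
                /* Invert the PFN part only when present <-> non-present changes. */
                if (pte_needs_invert(oldval) != pte_needs_invert(val))
                        val = (val & ~mask) | (~val & mask);
                return val;
        }

        int main(void)
        {
                uint64_t pte = (0x1234ull << 12) | PAGE_PRESENT;
                uint64_t prot_none = flip_protnone_guard(pte, pte & ~PAGE_PRESENT, PFN_MASK);

                /* The raw PFN bits are now the complement, pointing above real RAM... */
                assert((prot_none & PFN_MASK) != (0x1234ull << 12));
                /* ...but XOR-ing with protnone_mask() still recovers PFN 0x1234. */
                assert((((prot_none ^ protnone_mask(prot_none)) & PFN_MASK) >> 12) == 0x1234);
                return 0;
        }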
1365     diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
1366     index 5af0401ccff2..5008be1ab183 100644
1367     --- a/arch/x86/include/asm/pgtable.h
1368     +++ b/arch/x86/include/asm/pgtable.h
1369     @@ -165,19 +165,29 @@ static inline int pte_special(pte_t pte)
1370     return pte_flags(pte) & _PAGE_SPECIAL;
1371     }
1372    
1373     +/* Entries that were set to PROT_NONE are inverted */
1374     +
1375     +static inline u64 protnone_mask(u64 val);
1376     +
1377     static inline unsigned long pte_pfn(pte_t pte)
1378     {
1379     - return (pte_val(pte) & PTE_PFN_MASK) >> PAGE_SHIFT;
1380     + phys_addr_t pfn = pte_val(pte);
1381     + pfn ^= protnone_mask(pfn);
1382     + return (pfn & PTE_PFN_MASK) >> PAGE_SHIFT;
1383     }
1384    
1385     static inline unsigned long pmd_pfn(pmd_t pmd)
1386     {
1387     - return (pmd_val(pmd) & pmd_pfn_mask(pmd)) >> PAGE_SHIFT;
1388     + phys_addr_t pfn = pmd_val(pmd);
1389     + pfn ^= protnone_mask(pfn);
1390     + return (pfn & pmd_pfn_mask(pmd)) >> PAGE_SHIFT;
1391     }
1392    
1393     static inline unsigned long pud_pfn(pud_t pud)
1394     {
1395     - return (pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT;
1396     + phys_addr_t pfn = pud_val(pud);
1397     + pfn ^= protnone_mask(pfn);
1398     + return (pfn & pud_pfn_mask(pud)) >> PAGE_SHIFT;
1399     }
1400    
1401     #define pte_page(pte) pfn_to_page(pte_pfn(pte))
1402     @@ -340,11 +350,6 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd)
1403     return pmd_set_flags(pmd, _PAGE_RW);
1404     }
1405    
1406     -static inline pmd_t pmd_mknotpresent(pmd_t pmd)
1407     -{
1408     - return pmd_clear_flags(pmd, _PAGE_PRESENT | _PAGE_PROTNONE);
1409     -}
1410     -
1411     #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
1412     static inline int pte_soft_dirty(pte_t pte)
1413     {
1414     @@ -394,19 +399,58 @@ static inline pgprotval_t massage_pgprot(pgprot_t pgprot)
1415    
1416     static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
1417     {
1418     - return __pte(((phys_addr_t)page_nr << PAGE_SHIFT) |
1419     - massage_pgprot(pgprot));
1420     + phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT;
1421     + pfn ^= protnone_mask(pgprot_val(pgprot));
1422     + pfn &= PTE_PFN_MASK;
1423     + return __pte(pfn | massage_pgprot(pgprot));
1424     }
1425    
1426     static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
1427     {
1428     - return __pmd(((phys_addr_t)page_nr << PAGE_SHIFT) |
1429     - massage_pgprot(pgprot));
1430     + phys_addr_t pfn = (phys_addr_t)page_nr << PAGE_SHIFT;
1431     + pfn ^= protnone_mask(pgprot_val(pgprot));
1432     + pfn &= PHYSICAL_PMD_PAGE_MASK;
1433     + return __pmd(pfn | massage_pgprot(pgprot));
1434     +}
1435     +
1436     +static inline pud_t pfn_pud(unsigned long page_nr, pgprot_t pgprot)
1437     +{
1438     + phys_addr_t pfn = page_nr << PAGE_SHIFT;
1439     + pfn ^= protnone_mask(pgprot_val(pgprot));
1440     + pfn &= PHYSICAL_PUD_PAGE_MASK;
1441     + return __pud(pfn | massage_pgprot(pgprot));
1442     +}
1443     +
1444     +static inline pmd_t pmd_mknotpresent(pmd_t pmd)
1445     +{
1446     + return pfn_pmd(pmd_pfn(pmd),
1447     + __pgprot(pmd_flags(pmd) & ~(_PAGE_PRESENT|_PAGE_PROTNONE)));
1448     +}
1449     +
1450     +static inline pud_t pud_set_flags(pud_t pud, pudval_t set)
1451     +{
1452     + pudval_t v = native_pud_val(pud);
1453     +
1454     + return __pud(v | set);
1455     +}
1456     +
1457     +static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear)
1458     +{
1459     + pudval_t v = native_pud_val(pud);
1460     +
1461     + return __pud(v & ~clear);
1462     +}
1463     +
1464     +static inline pud_t pud_mkhuge(pud_t pud)
1465     +{
1466     + return pud_set_flags(pud, _PAGE_PSE);
1467     }
1468    
1469     +static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask);
1470     +
1471     static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
1472     {
1473     - pteval_t val = pte_val(pte);
1474     + pteval_t val = pte_val(pte), oldval = val;
1475    
1476     /*
1477     * Chop off the NX bit (if present), and add the NX portion of
1478     @@ -414,17 +458,17 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
1479     */
1480     val &= _PAGE_CHG_MASK;
1481     val |= massage_pgprot(newprot) & ~_PAGE_CHG_MASK;
1482     -
1483     + val = flip_protnone_guard(oldval, val, PTE_PFN_MASK);
1484     return __pte(val);
1485     }
1486    
1487     static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
1488     {
1489     - pmdval_t val = pmd_val(pmd);
1490     + pmdval_t val = pmd_val(pmd), oldval = val;
1491    
1492     val &= _HPAGE_CHG_MASK;
1493     val |= massage_pgprot(newprot) & ~_HPAGE_CHG_MASK;
1494     -
1495     + val = flip_protnone_guard(oldval, val, PHYSICAL_PMD_PAGE_MASK);
1496     return __pmd(val);
1497     }
1498    
1499     @@ -1010,6 +1054,15 @@ static inline u16 pte_flags_pkey(unsigned long pte_flags)
1500     #endif
1501     }
1502    
1503     +
1504     +#define __HAVE_ARCH_PFN_MODIFY_ALLOWED 1
1505     +extern bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot);
1506     +
1507     +static inline bool arch_has_pfn_modify_check(void)
1508     +{
1509     + return boot_cpu_has_bug(X86_BUG_L1TF);
1510     +}
1511     +
1512     #include <asm-generic/pgtable.h>
1513     #endif /* __ASSEMBLY__ */
1514    
1515     diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
1516     index ce97c8c6a310..221a32ed1372 100644
1517     --- a/arch/x86/include/asm/pgtable_64.h
1518     +++ b/arch/x86/include/asm/pgtable_64.h
1519     @@ -166,29 +166,49 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
1520     /*
1521     * Encode and de-code a swap entry
1522     *
1523     - * | ... | 11| 10| 9|8|7|6|5| 4| 3|2|1|0| <- bit number
1524     - * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U|W|P| <- bit names
1525     - * | OFFSET (14->63) | TYPE (9-13) |0|X|X|X| X| X|X|X|0| <- swp entry
1526     + * | ... | 11| 10| 9|8|7|6|5| 4| 3|2| 1|0| <- bit number
1527     + * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names
1528     + * | TYPE (59-63) | ~OFFSET (9-58) |0|0|X|X| X| X|X|SD|0| <- swp entry
1529     *
1530     * G (8) is aliased and used as a PROT_NONE indicator for
1531     * !present ptes. We need to start storing swap entries above
1532     * there. We also need to avoid using A and D because of an
1533     * erratum where they can be incorrectly set by hardware on
1534     * non-present PTEs.
1535     + *
1536     + * SD (1) in swp entry is used to store soft dirty bit, which helps us
1537     + * remember soft dirty over page migration
1538     + *
1539     + * Bit 7 in swp entry should be 0 because pmd_present checks not only P,
1540     + * but also L and G.
1541     + *
1542     + * The offset is inverted by a binary not operation to make the high
1543     + * physical bits set.
1544     */
1545     -#define SWP_TYPE_FIRST_BIT (_PAGE_BIT_PROTNONE + 1)
1546     -#define SWP_TYPE_BITS 5
1547     -/* Place the offset above the type: */
1548     -#define SWP_OFFSET_FIRST_BIT (SWP_TYPE_FIRST_BIT + SWP_TYPE_BITS)
1549     +#define SWP_TYPE_BITS 5
1550     +
1551     +#define SWP_OFFSET_FIRST_BIT (_PAGE_BIT_PROTNONE + 1)
1552     +
1553     +/* We always extract/encode the offset by shifting it all the way up, and then down again */
1554     +#define SWP_OFFSET_SHIFT (SWP_OFFSET_FIRST_BIT+SWP_TYPE_BITS)
1555    
1556     #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
1557    
1558     -#define __swp_type(x) (((x).val >> (SWP_TYPE_FIRST_BIT)) \
1559     - & ((1U << SWP_TYPE_BITS) - 1))
1560     -#define __swp_offset(x) ((x).val >> SWP_OFFSET_FIRST_BIT)
1561     -#define __swp_entry(type, offset) ((swp_entry_t) { \
1562     - ((type) << (SWP_TYPE_FIRST_BIT)) \
1563     - | ((offset) << SWP_OFFSET_FIRST_BIT) })
1564     +/* Extract the high bits for type */
1565     +#define __swp_type(x) ((x).val >> (64 - SWP_TYPE_BITS))
1566     +
1567     +/* Shift up (to get rid of type), then down to get value */
1568     +#define __swp_offset(x) (~(x).val << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT)
1569     +
1570     +/*
1571     + * Shift the offset up "too far" by TYPE bits, then down again
1572     + * The offset is inverted by a binary not operation to make the high
1573     + * physical bits set.
1574     + */
1575     +#define __swp_entry(type, offset) ((swp_entry_t) { \
1576     + (~(unsigned long)(offset) << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) \
1577     + | ((unsigned long)(type) << (64-SWP_TYPE_BITS)) })
1578     +
1579     #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) })
1580     #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
1581    
1582     @@ -215,6 +235,8 @@ extern void cleanup_highmap(void);
1583     extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
1584     extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
1585    
1586     +#include <asm/pgtable-invert.h>
1587     +
1588     #endif /* !__ASSEMBLY__ */
1589    
1590     #endif /* _ASM_X86_PGTABLE_64_H */
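The new 64-bit swap-entry layout stores the type in the top five bits and the offset as its bitwise complement, so the "PFN" bits of a swapped-out (non-present) PTE come out with their high bits set. A standalone round-trip sketch of the macros, illustrative only, with SWP_OFFSET_FIRST_BIT assumed to be 9 (_PAGE_BIT_PROTNONE + 1):

        #include <assert.h>
        #include <stdint.h>
        #include <stdio.h>

        #define SWP_TYPE_BITS           5
        #define SWP_OFFSET_FIRST_BIT    9       /* assumed: _PAGE_BIT_PROTNONE + 1 */
        #define SWP_OFFSET_SHIFT        (SWP_OFFSET_FIRST_BIT + SWP_TYPE_BITS)

        /* Mirror of __swp_entry(): type on top, complemented offset below it. */
        static uint64_t swp_entry(uint64_t type, uint64_t offset)
        {
                return (~offset << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) |
                       (type << (64 - SWP_TYPE_BITS));
        }

        /* Mirrors of __swp_type() and __swp_offset(). */
        static uint64_t swp_type(uint64_t val)   { return val >> (64 - SWP_TYPE_BITS); }
        static uint64_t swp_offset(uint64_t val) { return ~val << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT; }

        int main(void)
        {
                uint64_t e = swp_entry(3, 0x1000);

                assert(swp_type(e) == 3);
                assert(swp_offset(e) == 0x1000);
                /* The offset field (bits 9-58) holds ~0x1000, so the non-present
                 * PTE no longer points at cacheable low physical memory. */
                printf("raw entry: %#llx\n", (unsigned long long)e);
                return 0;
        }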
1591     diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
1592     index f1c8ac468292..dfdb7e21ba56 100644
1593     --- a/arch/x86/include/asm/pgtable_types.h
1594     +++ b/arch/x86/include/asm/pgtable_types.h
1595     @@ -97,15 +97,15 @@
1596     /*
1597     * Tracking soft dirty bit when a page goes to a swap is tricky.
1598     * We need a bit which can be stored in pte _and_ not conflict
1599     - * with swap entry format. On x86 bits 6 and 7 are *not* involved
1600     - * into swap entry computation, but bit 6 is used for nonlinear
1601     - * file mapping, so we borrow bit 7 for soft dirty tracking.
1602     + * with swap entry format. On x86 bits 1-4 are *not* involved
1603     + * into swap entry computation, but bit 7 is used for thp migration,
1604     + * so we borrow bit 1 for soft dirty tracking.
1605     *
1606     * Please note that this bit must be treated as swap dirty page
1607     - * mark if and only if the PTE has present bit clear!
1608     + * mark if and only if the PTE/PMD has present bit clear!
1609     */
1610     #ifdef CONFIG_MEM_SOFT_DIRTY
1611     -#define _PAGE_SWP_SOFT_DIRTY _PAGE_PSE
1612     +#define _PAGE_SWP_SOFT_DIRTY _PAGE_RW
1613     #else
1614     #define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0))
1615     #endif
1616     diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
1617     index ec15ca2b32d0..d5525a7e119e 100644
1618     --- a/arch/x86/include/asm/processor.h
1619     +++ b/arch/x86/include/asm/processor.h
1620     @@ -173,6 +173,11 @@ extern const struct seq_operations cpuinfo_op;
1621    
1622     extern void cpu_detect(struct cpuinfo_x86 *c);
1623    
1624     +static inline unsigned long l1tf_pfn_limit(void)
1625     +{
1626     + return BIT(boot_cpu_data.x86_phys_bits - 1 - PAGE_SHIFT) - 1;
1627     +}
1628     +
1629     extern void early_cpu_init(void);
1630     extern void identify_boot_cpu(void);
1631     extern void identify_secondary_cpu(struct cpuinfo_x86 *);
1632     @@ -855,4 +860,16 @@ bool xen_set_default_idle(void);
1633    
1634     void stop_this_cpu(void *dummy);
1635     void df_debug(struct pt_regs *regs, long error_code);
1636     +
1637     +enum l1tf_mitigations {
1638     + L1TF_MITIGATION_OFF,
1639     + L1TF_MITIGATION_FLUSH_NOWARN,
1640     + L1TF_MITIGATION_FLUSH,
1641     + L1TF_MITIGATION_FLUSH_NOSMT,
1642     + L1TF_MITIGATION_FULL,
1643     + L1TF_MITIGATION_FULL_FORCE
1644     +};
1645     +
1646     +extern enum l1tf_mitigations l1tf_mitigation;
1647     +
1648     #endif /* _ASM_X86_PROCESSOR_H */
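To put numbers on l1tf_pfn_limit(), here is a minimal sketch with an assumed 36-bit physical address width (the real code reads boot_cpu_data.x86_phys_bits); the result is half the physical address space expressed in page frames, and the mitigation assumes no RAM sits above that boundary, which l1tf_select_mitigation() later checks against the e820 map:

        #include <stdint.h>
        #include <stdio.h>

        #define PAGE_SHIFT 12

        /* Mirror of l1tf_pfn_limit() for an assumed physical address width. */
        static uint64_t l1tf_pfn_limit(unsigned int phys_bits)
        {
                return (1ull << (phys_bits - 1 - PAGE_SHIFT)) - 1;
        }

        int main(void)
        {
                unsigned int phys_bits = 36;    /* assumed example value */
                uint64_t limit = l1tf_pfn_limit(phys_bits);

                /* 2^(36 - 1 - 12) - 1 = 0x7fffff pages, i.e. a 32 GiB boundary. */
                printf("pfn limit: %#llx (%llu GiB boundary)\n",
                       (unsigned long long)limit,
                       (unsigned long long)(((limit + 1) << PAGE_SHIFT) >> 30));
                return 0;
        }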
1649     diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
1650     index 026ea82ecc60..d25fb6beb2f0 100644
1651     --- a/arch/x86/include/asm/smp.h
1652     +++ b/arch/x86/include/asm/smp.h
1653     @@ -156,7 +156,6 @@ static inline int wbinvd_on_all_cpus(void)
1654     wbinvd();
1655     return 0;
1656     }
1657     -#define smp_num_siblings 1
1658     #endif /* CONFIG_SMP */
1659    
1660     extern unsigned disabled_cpus;
1661     diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
1662     index cf75871d2f81..1fbb174c846b 100644
1663     --- a/arch/x86/include/asm/topology.h
1664     +++ b/arch/x86/include/asm/topology.h
1665     @@ -129,13 +129,17 @@ static inline int topology_max_smt_threads(void)
1666     }
1667    
1668     int topology_update_package_map(unsigned int apicid, unsigned int cpu);
1669     -extern int topology_phys_to_logical_pkg(unsigned int pkg);
1670     +int topology_phys_to_logical_pkg(unsigned int pkg);
1671     +bool topology_is_primary_thread(unsigned int cpu);
1672     +bool topology_smt_supported(void);
1673     #else
1674     #define topology_max_packages() (1)
1675     static inline int
1676     topology_update_package_map(unsigned int apicid, unsigned int cpu) { return 0; }
1677     static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; }
1678     static inline int topology_max_smt_threads(void) { return 1; }
1679     +static inline bool topology_is_primary_thread(unsigned int cpu) { return true; }
1680     +static inline bool topology_smt_supported(void) { return false; }
1681     #endif
1682    
1683     static inline void arch_fix_phys_package_id(int num, u32 slot)
1684     diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
1685     index 9cbfbef6a115..72cacb027b98 100644
1686     --- a/arch/x86/include/asm/vmx.h
1687     +++ b/arch/x86/include/asm/vmx.h
1688     @@ -499,4 +499,15 @@ enum vm_instruction_error_number {
1689     VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID = 28,
1690     };
1691    
1692     +enum vmx_l1d_flush_state {
1693     + VMENTER_L1D_FLUSH_AUTO,
1694     + VMENTER_L1D_FLUSH_NEVER,
1695     + VMENTER_L1D_FLUSH_COND,
1696     + VMENTER_L1D_FLUSH_ALWAYS,
1697     + VMENTER_L1D_FLUSH_EPT_DISABLED,
1698     + VMENTER_L1D_FLUSH_NOT_REQUIRED,
1699     +};
1700     +
1701     +extern enum vmx_l1d_flush_state l1tf_vmx_mitigation;
1702     +
1703     #endif
1704     diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
1705     index 76cf21f887bd..4f2af1ee09cb 100644
1706     --- a/arch/x86/kernel/apic/apic.c
1707     +++ b/arch/x86/kernel/apic/apic.c
1708     @@ -34,6 +34,7 @@
1709     #include <linux/dmi.h>
1710     #include <linux/smp.h>
1711     #include <linux/mm.h>
1712     +#include <linux/irq.h>
1713    
1714     #include <asm/trace/irq_vectors.h>
1715     #include <asm/irq_remapping.h>
1716     @@ -55,6 +56,7 @@
1717     #include <asm/mce.h>
1718     #include <asm/tsc.h>
1719     #include <asm/hypervisor.h>
1720     +#include <asm/irq_regs.h>
1721    
1722     unsigned int num_processors;
1723    
1724     @@ -2041,6 +2043,23 @@ static int cpuid_to_apicid[] = {
1725     [0 ... NR_CPUS - 1] = -1,
1726     };
1727    
1728     +#ifdef CONFIG_SMP
1729     +/**
1730     + * apic_id_is_primary_thread - Check whether APIC ID belongs to a primary thread
1731     + * @id: APIC ID to check
1732     + */
1733     +bool apic_id_is_primary_thread(unsigned int apicid)
1734     +{
1735     + u32 mask;
1736     +
1737     + if (smp_num_siblings == 1)
1738     + return true;
1739     + /* Isolate the SMT bit(s) in the APICID and check for 0 */
1740     + mask = (1U << (fls(smp_num_siblings) - 1)) - 1;
1741     + return !(apicid & mask);
1742     +}
1743     +#endif
1744     +
1745     /*
1746     * Should use this API to allocate logical CPU IDs to keep nr_logical_cpuids
1747     * and cpuid_to_apicid[] synchronized.
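A self-contained sketch of the primary-thread test above (illustrative only; fls() is reimplemented here and the sibling counts are assumed example topologies):

        #include <assert.h>
        #include <stdbool.h>

        /* fls(): 1-based index of the highest set bit (fls(1) == 1, fls(2) == 2). */
        static unsigned int fls_(unsigned int x)
        {
                unsigned int r = 0;

                while (x) {
                        x >>= 1;
                        r++;
                }
                return r;
        }

        static bool apic_id_is_primary_thread(unsigned int apicid, unsigned int siblings)
        {
                unsigned int mask;

                if (siblings == 1)
                        return true;
                /* Isolate the SMT bit(s) of the APIC ID and check that they are zero. */
                mask = (1u << (fls_(siblings) - 1)) - 1;
                return !(apicid & mask);
        }

        int main(void)
        {
                /* 2-way SMT: even APIC IDs are the primary threads. */
                assert(apic_id_is_primary_thread(0, 2));
                assert(!apic_id_is_primary_thread(1, 2));
                assert(apic_id_is_primary_thread(2, 2));
                /* 4-way SMT: the low two bits select the thread. */
                assert(apic_id_is_primary_thread(4, 4));
                assert(!apic_id_is_primary_thread(6, 4));
                return 0;
        }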
1748     diff --git a/arch/x86/kernel/apic/htirq.c b/arch/x86/kernel/apic/htirq.c
1749     index ae50d3454d78..89d6e96d0038 100644
1750     --- a/arch/x86/kernel/apic/htirq.c
1751     +++ b/arch/x86/kernel/apic/htirq.c
1752     @@ -16,6 +16,8 @@
1753     #include <linux/device.h>
1754     #include <linux/pci.h>
1755     #include <linux/htirq.h>
1756     +#include <linux/irq.h>
1757     +
1758     #include <asm/irqdomain.h>
1759     #include <asm/hw_irq.h>
1760     #include <asm/apic.h>
1761     diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
1762     index cf89928dbd46..d34629d70421 100644
1763     --- a/arch/x86/kernel/apic/io_apic.c
1764     +++ b/arch/x86/kernel/apic/io_apic.c
1765     @@ -32,6 +32,7 @@
1766    
1767     #include <linux/mm.h>
1768     #include <linux/interrupt.h>
1769     +#include <linux/irq.h>
1770     #include <linux/init.h>
1771     #include <linux/delay.h>
1772     #include <linux/sched.h>
1773     diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
1774     index 015bbf30e3e3..cfd17a3518bb 100644
1775     --- a/arch/x86/kernel/apic/msi.c
1776     +++ b/arch/x86/kernel/apic/msi.c
1777     @@ -12,6 +12,7 @@
1778     */
1779     #include <linux/mm.h>
1780     #include <linux/interrupt.h>
1781     +#include <linux/irq.h>
1782     #include <linux/pci.h>
1783     #include <linux/dmar.h>
1784     #include <linux/hpet.h>
1785     diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
1786     index 4922ab66fd29..c6bd3f9b4383 100644
1787     --- a/arch/x86/kernel/apic/vector.c
1788     +++ b/arch/x86/kernel/apic/vector.c
1789     @@ -11,6 +11,7 @@
1790     * published by the Free Software Foundation.
1791     */
1792     #include <linux/interrupt.h>
1793     +#include <linux/irq.h>
1794     #include <linux/init.h>
1795     #include <linux/compiler.h>
1796     #include <linux/slab.h>
1797     diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
1798     index 4c2be99fa0fb..4c2648b96c9a 100644
1799     --- a/arch/x86/kernel/cpu/amd.c
1800     +++ b/arch/x86/kernel/cpu/amd.c
1801     @@ -296,13 +296,34 @@ static int nearby_node(int apicid)
1802     }
1803     #endif
1804    
1805     +static void amd_get_topology_early(struct cpuinfo_x86 *c)
1806     +{
1807     + if (cpu_has(c, X86_FEATURE_TOPOEXT))
1808     + smp_num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1;
1809     +}
1810     +
1811     +/*
1812     + * Fix up cpu_core_id for pre-F17h systems to be in the
1813     + * [0 .. cores_per_node - 1] range. Not really needed but
1814     + * kept so as not to break existing setups.
1815     + */
1816     +static void legacy_fixup_core_id(struct cpuinfo_x86 *c)
1817     +{
1818     + u32 cus_per_node;
1819     +
1820     + if (c->x86 >= 0x17)
1821     + return;
1822     +
1823     + cus_per_node = c->x86_max_cores / nodes_per_socket;
1824     + c->cpu_core_id %= cus_per_node;
1825     +}
1826     +
1827     /*
1828     * Fixup core topology information for
1829     * (1) AMD multi-node processors
1830     * Assumption: Number of cores in each internal node is the same.
1831     * (2) AMD processors supporting compute units
1832     */
1833     -#ifdef CONFIG_SMP
1834     static void amd_get_topology(struct cpuinfo_x86 *c)
1835     {
1836     u8 node_id;
1837     @@ -315,7 +336,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
1838     cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
1839    
1840     node_id = ecx & 0xff;
1841     - smp_num_siblings = ((ebx >> 8) & 0xff) + 1;
1842    
1843     if (c->x86 == 0x15)
1844     c->cu_id = ebx & 0xff;
1845     @@ -353,18 +373,11 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
1846     } else
1847     return;
1848    
1849     - /* fixup multi-node processor information */
1850     if (nodes_per_socket > 1) {
1851     - u32 cus_per_node;
1852     -
1853     set_cpu_cap(c, X86_FEATURE_AMD_DCM);
1854     - cus_per_node = c->x86_max_cores / nodes_per_socket;
1855     -
1856     - /* core id has to be in the [0 .. cores_per_node - 1] range */
1857     - c->cpu_core_id %= cus_per_node;
1858     + legacy_fixup_core_id(c);
1859     }
1860     }
1861     -#endif
1862    
1863     /*
1864     * On a AMD dual core setup the lower bits of the APIC id distinguish the cores.
1865     @@ -372,7 +385,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
1866     */
1867     static void amd_detect_cmp(struct cpuinfo_x86 *c)
1868     {
1869     -#ifdef CONFIG_SMP
1870     unsigned bits;
1871     int cpu = smp_processor_id();
1872    
1873     @@ -384,16 +396,11 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c)
1874     /* use socket ID also for last level cache */
1875     per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
1876     amd_get_topology(c);
1877     -#endif
1878     }
1879    
1880     u16 amd_get_nb_id(int cpu)
1881     {
1882     - u16 id = 0;
1883     -#ifdef CONFIG_SMP
1884     - id = per_cpu(cpu_llc_id, cpu);
1885     -#endif
1886     - return id;
1887     + return per_cpu(cpu_llc_id, cpu);
1888     }
1889     EXPORT_SYMBOL_GPL(amd_get_nb_id);
1890    
1891     @@ -567,6 +574,8 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
1892    
1893     static void early_init_amd(struct cpuinfo_x86 *c)
1894     {
1895     + u64 value;
1896     +
1897     early_init_amd_mc(c);
1898    
1899     /*
1900     @@ -633,6 +642,23 @@ static void early_init_amd(struct cpuinfo_x86 *c)
1901     */
1902     if (cpu_has_amd_erratum(c, amd_erratum_400))
1903     set_cpu_bug(c, X86_BUG_AMD_E400);
1904     +
1905     +
1906     + /* Re-enable TopologyExtensions if switched off by BIOS */
1907     + if (c->x86 == 0x15 &&
1908     + (c->x86_model >= 0x10 && c->x86_model <= 0x6f) &&
1909     + !cpu_has(c, X86_FEATURE_TOPOEXT)) {
1910     +
1911     + if (msr_set_bit(0xc0011005, 54) > 0) {
1912     + rdmsrl(0xc0011005, value);
1913     + if (value & BIT_64(54)) {
1914     + set_cpu_cap(c, X86_FEATURE_TOPOEXT);
1915     + pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n");
1916     + }
1917     + }
1918     + }
1919     +
1920     + amd_get_topology_early(c);
1921     }
1922    
1923     static void init_amd_k8(struct cpuinfo_x86 *c)
1924     @@ -724,19 +750,6 @@ static void init_amd_bd(struct cpuinfo_x86 *c)
1925     {
1926     u64 value;
1927    
1928     - /* re-enable TopologyExtensions if switched off by BIOS */
1929     - if ((c->x86_model >= 0x10) && (c->x86_model <= 0x6f) &&
1930     - !cpu_has(c, X86_FEATURE_TOPOEXT)) {
1931     -
1932     - if (msr_set_bit(0xc0011005, 54) > 0) {
1933     - rdmsrl(0xc0011005, value);
1934     - if (value & BIT_64(54)) {
1935     - set_cpu_cap(c, X86_FEATURE_TOPOEXT);
1936     - pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n");
1937     - }
1938     - }
1939     - }
1940     -
1941     /*
1942     * The way access filter has a performance penalty on some workloads.
1943     * Disable it on the affected CPUs.
1944     @@ -799,15 +812,8 @@ static void init_amd(struct cpuinfo_x86 *c)
1945    
1946     cpu_detect_cache_sizes(c);
1947    
1948     - /* Multi core CPU? */
1949     - if (c->extended_cpuid_level >= 0x80000008) {
1950     - amd_detect_cmp(c);
1951     - srat_detect_node(c);
1952     - }
1953     -
1954     -#ifdef CONFIG_X86_32
1955     - detect_ht(c);
1956     -#endif
1957     + amd_detect_cmp(c);
1958     + srat_detect_node(c);
1959    
1960     init_amd_cacheinfo(c);
1961    
1962     diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
1963     index 86af9b1b049d..5229eaf73828 100644
1964     --- a/arch/x86/kernel/cpu/bugs.c
1965     +++ b/arch/x86/kernel/cpu/bugs.c
1966     @@ -21,14 +21,17 @@
1967     #include <asm/processor-flags.h>
1968     #include <asm/fpu/internal.h>
1969     #include <asm/msr.h>
1970     +#include <asm/vmx.h>
1971     #include <asm/paravirt.h>
1972     #include <asm/alternative.h>
1973     #include <asm/pgtable.h>
1974     #include <asm/cacheflush.h>
1975     #include <asm/intel-family.h>
1976     +#include <asm/e820.h>
1977    
1978     static void __init spectre_v2_select_mitigation(void);
1979     static void __init ssb_select_mitigation(void);
1980     +static void __init l1tf_select_mitigation(void);
1981    
1982     /*
1983     * Our boot-time value of the SPEC_CTRL MSR. We read it once so that any
1984     @@ -54,6 +57,12 @@ void __init check_bugs(void)
1985     {
1986     identify_boot_cpu();
1987    
1988     + /*
1989     + * identify_boot_cpu() initialized SMT support information, let the
1990     + * core code know.
1991     + */
1992     + cpu_smt_check_topology_early();
1993     +
1994     if (!IS_ENABLED(CONFIG_SMP)) {
1995     pr_info("CPU: ");
1996     print_cpu_info(&boot_cpu_data);
1997     @@ -80,6 +89,8 @@ void __init check_bugs(void)
1998     */
1999     ssb_select_mitigation();
2000    
2001     + l1tf_select_mitigation();
2002     +
2003     #ifdef CONFIG_X86_32
2004     /*
2005     * Check whether we are able to run this kernel safely on SMP.
2006     @@ -310,23 +321,6 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
2007     return cmd;
2008     }
2009    
2010     -/* Check for Skylake-like CPUs (for RSB handling) */
2011     -static bool __init is_skylake_era(void)
2012     -{
2013     - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
2014     - boot_cpu_data.x86 == 6) {
2015     - switch (boot_cpu_data.x86_model) {
2016     - case INTEL_FAM6_SKYLAKE_MOBILE:
2017     - case INTEL_FAM6_SKYLAKE_DESKTOP:
2018     - case INTEL_FAM6_SKYLAKE_X:
2019     - case INTEL_FAM6_KABYLAKE_MOBILE:
2020     - case INTEL_FAM6_KABYLAKE_DESKTOP:
2021     - return true;
2022     - }
2023     - }
2024     - return false;
2025     -}
2026     -
2027     static void __init spectre_v2_select_mitigation(void)
2028     {
2029     enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline();
2030     @@ -387,22 +381,15 @@ retpoline_auto:
2031     pr_info("%s\n", spectre_v2_strings[mode]);
2032    
2033     /*
2034     - * If neither SMEP nor PTI are available, there is a risk of
2035     - * hitting userspace addresses in the RSB after a context switch
2036     - * from a shallow call stack to a deeper one. To prevent this fill
2037     - * the entire RSB, even when using IBRS.
2038     + * If spectre v2 protection has been enabled, unconditionally fill
2039     + * RSB during a context switch; this protects against two independent
2040     + * issues:
2041     *
2042     - * Skylake era CPUs have a separate issue with *underflow* of the
2043     - * RSB, when they will predict 'ret' targets from the generic BTB.
2044     - * The proper mitigation for this is IBRS. If IBRS is not supported
2045     - * or deactivated in favour of retpolines the RSB fill on context
2046     - * switch is required.
2047     + * - RSB underflow (and switch to BTB) on Skylake+
2048     + * - SpectreRSB variant of spectre v2 on X86_BUG_SPECTRE_V2 CPUs
2049     */
2050     - if ((!boot_cpu_has(X86_FEATURE_KAISER) &&
2051     - !boot_cpu_has(X86_FEATURE_SMEP)) || is_skylake_era()) {
2052     - setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
2053     - pr_info("Spectre v2 mitigation: Filling RSB on context switch\n");
2054     - }
2055     + setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
2056     + pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n");
2057    
2058     /* Initialize Indirect Branch Prediction Barrier if supported */
2059     if (boot_cpu_has(X86_FEATURE_IBPB)) {
2060     @@ -653,8 +640,121 @@ void x86_spec_ctrl_setup_ap(void)
2061     x86_amd_ssb_disable();
2062     }
2063    
2064     +#undef pr_fmt
2065     +#define pr_fmt(fmt) "L1TF: " fmt
2066     +
2067     +/* Default mitigation for L1TF-affected CPUs */
2068     +enum l1tf_mitigations l1tf_mitigation __ro_after_init = L1TF_MITIGATION_FLUSH;
2069     +#if IS_ENABLED(CONFIG_KVM_INTEL)
2070     +EXPORT_SYMBOL_GPL(l1tf_mitigation);
2071     +
2072     +enum vmx_l1d_flush_state l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
2073     +EXPORT_SYMBOL_GPL(l1tf_vmx_mitigation);
2074     +#endif
2075     +
2076     +static void __init l1tf_select_mitigation(void)
2077     +{
2078     + u64 half_pa;
2079     +
2080     + if (!boot_cpu_has_bug(X86_BUG_L1TF))
2081     + return;
2082     +
2083     + switch (l1tf_mitigation) {
2084     + case L1TF_MITIGATION_OFF:
2085     + case L1TF_MITIGATION_FLUSH_NOWARN:
2086     + case L1TF_MITIGATION_FLUSH:
2087     + break;
2088     + case L1TF_MITIGATION_FLUSH_NOSMT:
2089     + case L1TF_MITIGATION_FULL:
2090     + cpu_smt_disable(false);
2091     + break;
2092     + case L1TF_MITIGATION_FULL_FORCE:
2093     + cpu_smt_disable(true);
2094     + break;
2095     + }
2096     +
2097     +#if CONFIG_PGTABLE_LEVELS == 2
2098     + pr_warn("Kernel not compiled for PAE. No mitigation for L1TF\n");
2099     + return;
2100     +#endif
2101     +
2102     + /*
2103     + * This is extremely unlikely to happen because almost all
2104     + * systems have far more MAX_PA/2 than RAM can be fit into
2105     + * DIMM slots.
2106     + */
2107     + half_pa = (u64)l1tf_pfn_limit() << PAGE_SHIFT;
2108     + if (e820_any_mapped(half_pa, ULLONG_MAX - half_pa, E820_RAM)) {
2109     + pr_warn("System has more than MAX_PA/2 memory. L1TF mitigation not effective.\n");
2110     + return;
2111     + }
2112     +
2113     + setup_force_cpu_cap(X86_FEATURE_L1TF_PTEINV);
2114     +}
2115     +
2116     +static int __init l1tf_cmdline(char *str)
2117     +{
2118     + if (!boot_cpu_has_bug(X86_BUG_L1TF))
2119     + return 0;
2120     +
2121     + if (!str)
2122     + return -EINVAL;
2123     +
2124     + if (!strcmp(str, "off"))
2125     + l1tf_mitigation = L1TF_MITIGATION_OFF;
2126     + else if (!strcmp(str, "flush,nowarn"))
2127     + l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOWARN;
2128     + else if (!strcmp(str, "flush"))
2129     + l1tf_mitigation = L1TF_MITIGATION_FLUSH;
2130     + else if (!strcmp(str, "flush,nosmt"))
2131     + l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT;
2132     + else if (!strcmp(str, "full"))
2133     + l1tf_mitigation = L1TF_MITIGATION_FULL;
2134     + else if (!strcmp(str, "full,force"))
2135     + l1tf_mitigation = L1TF_MITIGATION_FULL_FORCE;
2136     +
2137     + return 0;
2138     +}
2139     +early_param("l1tf", l1tf_cmdline);
2140     +
2141     +#undef pr_fmt
2142     +
2143     #ifdef CONFIG_SYSFS
2144    
2145     +#define L1TF_DEFAULT_MSG "Mitigation: PTE Inversion"
2146     +
2147     +#if IS_ENABLED(CONFIG_KVM_INTEL)
2148     +static const char *l1tf_vmx_states[] = {
2149     + [VMENTER_L1D_FLUSH_AUTO] = "auto",
2150     + [VMENTER_L1D_FLUSH_NEVER] = "vulnerable",
2151     + [VMENTER_L1D_FLUSH_COND] = "conditional cache flushes",
2152     + [VMENTER_L1D_FLUSH_ALWAYS] = "cache flushes",
2153     + [VMENTER_L1D_FLUSH_EPT_DISABLED] = "EPT disabled",
2154     + [VMENTER_L1D_FLUSH_NOT_REQUIRED] = "flush not necessary"
2155     +};
2156     +
2157     +static ssize_t l1tf_show_state(char *buf)
2158     +{
2159     + if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO)
2160     + return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG);
2161     +
2162     + if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_EPT_DISABLED ||
2163     + (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER &&
2164     + cpu_smt_control == CPU_SMT_ENABLED))
2165     + return sprintf(buf, "%s; VMX: %s\n", L1TF_DEFAULT_MSG,
2166     + l1tf_vmx_states[l1tf_vmx_mitigation]);
2167     +
2168     + return sprintf(buf, "%s; VMX: %s, SMT %s\n", L1TF_DEFAULT_MSG,
2169     + l1tf_vmx_states[l1tf_vmx_mitigation],
2170     + cpu_smt_control == CPU_SMT_ENABLED ? "vulnerable" : "disabled");
2171     +}
2172     +#else
2173     +static ssize_t l1tf_show_state(char *buf)
2174     +{
2175     + return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG);
2176     +}
2177     +#endif
2178     +
2179     static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
2180     char *buf, unsigned int bug)
2181     {
2182     @@ -680,6 +780,10 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
2183     case X86_BUG_SPEC_STORE_BYPASS:
2184     return sprintf(buf, "%s\n", ssb_strings[ssb_mode]);
2185    
2186     + case X86_BUG_L1TF:
2187     + if (boot_cpu_has(X86_FEATURE_L1TF_PTEINV))
2188     + return l1tf_show_state(buf);
2189     + break;
2190     default:
2191     break;
2192     }
2193     @@ -706,4 +810,9 @@ ssize_t cpu_show_spec_store_bypass(struct device *dev, struct device_attribute *
2194     {
2195     return cpu_show_common(dev, attr, buf, X86_BUG_SPEC_STORE_BYPASS);
2196     }
2197     +
2198     +ssize_t cpu_show_l1tf(struct device *dev, struct device_attribute *attr, char *buf)
2199     +{
2200     + return cpu_show_common(dev, attr, buf, X86_BUG_L1TF);
2201     +}
2202     #endif
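To make the MAX_PA/2 check concrete, a rough sketch with an assumed 46-bit physical address width and 64 GiB of RAM; the kernel performs the equivalent test against the e820 map via e820_any_mapped():

        #include <stdint.h>
        #include <stdio.h>

        #define PAGE_SHIFT 12

        int main(void)
        {
                unsigned int phys_bits = 46;            /* assumed CPU physical address width */
                uint64_t max_ram_end = 64ull << 30;     /* assumed: 64 GiB of RAM */
                uint64_t pfn_limit = (1ull << (phys_bits - 1 - PAGE_SHIFT)) - 1;
                uint64_t half_pa = pfn_limit << PAGE_SHIFT;     /* ~32 TiB here */

                /* RAM at or above MAX_PA/2 would still be reachable through an
                 * inverted non-present PTE, so PTE inversion alone would not help. */
                if (max_ram_end > half_pa)
                        printf("more than MAX_PA/2 memory, mitigation not effective\n");
                else
                        printf("all RAM below %#llx, PTE inversion effective\n",
                               (unsigned long long)half_pa);
                return 0;
        }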
2203     diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
2204     index 7a4279d8a902..13471b71bec7 100644
2205     --- a/arch/x86/kernel/cpu/common.c
2206     +++ b/arch/x86/kernel/cpu/common.c
2207     @@ -61,6 +61,13 @@ cpumask_var_t cpu_callin_mask;
2208     /* representing cpus for which sibling maps can be computed */
2209     cpumask_var_t cpu_sibling_setup_mask;
2210    
2211     +/* Number of siblings per CPU package */
2212     +int smp_num_siblings = 1;
2213     +EXPORT_SYMBOL(smp_num_siblings);
2214     +
2215     +/* Last level cache ID of each logical CPU */
2216     +DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID;
2217     +
2218     /* correctly size the local cpu masks */
2219     void __init setup_cpu_local_masks(void)
2220     {
2221     @@ -606,33 +613,36 @@ static void cpu_detect_tlb(struct cpuinfo_x86 *c)
2222     tlb_lld_4m[ENTRIES], tlb_lld_1g[ENTRIES]);
2223     }
2224    
2225     -void detect_ht(struct cpuinfo_x86 *c)
2226     +int detect_ht_early(struct cpuinfo_x86 *c)
2227     {
2228     #ifdef CONFIG_SMP
2229     u32 eax, ebx, ecx, edx;
2230     - int index_msb, core_bits;
2231     - static bool printed;
2232    
2233     if (!cpu_has(c, X86_FEATURE_HT))
2234     - return;
2235     + return -1;
2236    
2237     if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
2238     - goto out;
2239     + return -1;
2240    
2241     if (cpu_has(c, X86_FEATURE_XTOPOLOGY))
2242     - return;
2243     + return -1;
2244    
2245     cpuid(1, &eax, &ebx, &ecx, &edx);
2246    
2247     smp_num_siblings = (ebx & 0xff0000) >> 16;
2248     -
2249     - if (smp_num_siblings == 1) {
2250     + if (smp_num_siblings == 1)
2251     pr_info_once("CPU0: Hyper-Threading is disabled\n");
2252     - goto out;
2253     - }
2254     +#endif
2255     + return 0;
2256     +}
2257    
2258     - if (smp_num_siblings <= 1)
2259     - goto out;
2260     +void detect_ht(struct cpuinfo_x86 *c)
2261     +{
2262     +#ifdef CONFIG_SMP
2263     + int index_msb, core_bits;
2264     +
2265     + if (detect_ht_early(c) < 0)
2266     + return;
2267    
2268     index_msb = get_count_order(smp_num_siblings);
2269     c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb);
2270     @@ -645,15 +655,6 @@ void detect_ht(struct cpuinfo_x86 *c)
2271    
2272     c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) &
2273     ((1 << core_bits) - 1);
2274     -
2275     -out:
2276     - if (!printed && (c->x86_max_cores * smp_num_siblings) > 1) {
2277     - pr_info("CPU: Physical Processor ID: %d\n",
2278     - c->phys_proc_id);
2279     - pr_info("CPU: Processor Core ID: %d\n",
2280     - c->cpu_core_id);
2281     - printed = 1;
2282     - }
2283     #endif
2284     }
2285    
2286     @@ -925,6 +926,21 @@ static const __initconst struct x86_cpu_id cpu_no_spec_store_bypass[] = {
2287     {}
2288     };
2289    
2290     +static const __initconst struct x86_cpu_id cpu_no_l1tf[] = {
2291     + /* in addition to cpu_no_speculation */
2292     + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 },
2293     + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT2 },
2294     + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT },
2295     + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MERRIFIELD },
2296     + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MOOREFIELD },
2297     + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT },
2298     + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_DENVERTON },
2299     + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GEMINI_LAKE },
2300     + { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL },
2301     + { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM },
2302     + {}
2303     +};
2304     +
2305     static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
2306     {
2307     u64 ia32_cap = 0;
2308     @@ -950,6 +966,11 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
2309     return;
2310    
2311     setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
2312     +
2313     + if (x86_match_cpu(cpu_no_l1tf))
2314     + return;
2315     +
2316     + setup_force_cpu_bug(X86_BUG_L1TF);
2317     }
2318    
2319     /*
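A worked example of the sibling/core arithmetic shared by detect_ht_early() and detect_ht(), with assumed CPUID and APIC-ID values (illustrative only; the default APIC's phys_pkg_id() is modeled as a plain shift):

        #include <stdio.h>

        /* get_count_order(): smallest n such that 2^n >= x. */
        static unsigned int count_order(unsigned int x)
        {
                unsigned int n = 0;

                while ((1u << n) < x)
                        n++;
                return n;
        }

        int main(void)
        {
                /* Assumed: CPUID(1).EBX reports 16 logical CPUs per package,
                 * the package has 8 cores, and we look at APIC ID 0x0b. */
                unsigned int ebx = 0x00100800;
                unsigned int apicid = 0x0b;
                unsigned int x86_max_cores = 8;

                unsigned int smp_num_siblings = (ebx & 0xff0000) >> 16;         /* 16 */
                unsigned int index_msb = count_order(smp_num_siblings);         /* 4  */
                unsigned int phys_proc_id = apicid >> index_msb;                /* package 0 */
                unsigned int threads_per_core = smp_num_siblings / x86_max_cores;
                unsigned int core_bits = count_order(x86_max_cores);            /* 3  */
                unsigned int cpu_core_id = (apicid >> count_order(threads_per_core)) &
                                           ((1u << core_bits) - 1);             /* core 5 */

                printf("package %u, core %u, SMT thread %u\n",
                       phys_proc_id, cpu_core_id, apicid & 1);
                return 0;
        }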
2320     diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
2321     index 3b19d82f7932..2275900d4d1b 100644
2322     --- a/arch/x86/kernel/cpu/cpu.h
2323     +++ b/arch/x86/kernel/cpu/cpu.h
2324     @@ -46,6 +46,8 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[],
2325    
2326     extern void get_cpu_cap(struct cpuinfo_x86 *c);
2327     extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
2328     +extern int detect_extended_topology_early(struct cpuinfo_x86 *c);
2329     +extern int detect_ht_early(struct cpuinfo_x86 *c);
2330    
2331     extern void x86_spec_ctrl_setup_ap(void);
2332    
2333     diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
2334     index 93781e3f05b2..9ad86c4bf360 100644
2335     --- a/arch/x86/kernel/cpu/intel.c
2336     +++ b/arch/x86/kernel/cpu/intel.c
2337     @@ -283,6 +283,13 @@ static void early_init_intel(struct cpuinfo_x86 *c)
2338     }
2339    
2340     check_mpx_erratum(c);
2341     +
2342     + /*
2343     + * Get the number of SMT siblings early from the extended topology
2344     + * leaf, if available. Otherwise try the legacy SMT detection.
2345     + */
2346     + if (detect_extended_topology_early(c) < 0)
2347     + detect_ht_early(c);
2348     }
2349    
2350     #ifdef CONFIG_X86_32
2351     diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
2352     index 0afaf00b029b..b53a6579767d 100644
2353     --- a/arch/x86/kernel/cpu/microcode/core.c
2354     +++ b/arch/x86/kernel/cpu/microcode/core.c
2355     @@ -384,6 +384,24 @@ static void __exit microcode_dev_exit(void)
2356     /* fake device for request_firmware */
2357     static struct platform_device *microcode_pdev;
2358    
2359     +static int check_online_cpus(void)
2360     +{
2361     + unsigned int cpu;
2362     +
2363     + /*
2364     + * Make sure all CPUs are online. It's fine for SMT to be disabled if
2365     + * all the primary threads are still online.
2366     + */
2367     + for_each_present_cpu(cpu) {
2368     + if (topology_is_primary_thread(cpu) && !cpu_online(cpu)) {
2369     + pr_err("Not all CPUs online, aborting microcode update.\n");
2370     + return -EINVAL;
2371     + }
2372     + }
2373     +
2374     + return 0;
2375     +}
2376     +
2377     static int reload_for_cpu(int cpu)
2378     {
2379     struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
2380     @@ -418,7 +436,13 @@ static ssize_t reload_store(struct device *dev,
2381     return size;
2382    
2383     get_online_cpus();
2384     +
2385     + ret = check_online_cpus();
2386     + if (ret)
2387     + goto put;
2388     +
2389     mutex_lock(&microcode_mutex);
2390     +
2391     for_each_online_cpu(cpu) {
2392     tmp_ret = reload_for_cpu(cpu);
2393     if (tmp_ret != 0)
2394     @@ -431,6 +455,8 @@ static ssize_t reload_store(struct device *dev,
2395     if (!ret)
2396     perf_check_microcode();
2397     mutex_unlock(&microcode_mutex);
2398     +
2399     +put:
2400     put_online_cpus();
2401    
2402     if (!ret)
2403     diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
2404     index cd531355e838..6b5a850885ac 100644
2405     --- a/arch/x86/kernel/cpu/topology.c
2406     +++ b/arch/x86/kernel/cpu/topology.c
2407     @@ -26,16 +26,13 @@
2408     * exists, use it for populating initial_apicid and cpu topology
2409     * detection.
2410     */
2411     -void detect_extended_topology(struct cpuinfo_x86 *c)
2412     +int detect_extended_topology_early(struct cpuinfo_x86 *c)
2413     {
2414     #ifdef CONFIG_SMP
2415     - unsigned int eax, ebx, ecx, edx, sub_index;
2416     - unsigned int ht_mask_width, core_plus_mask_width;
2417     - unsigned int core_select_mask, core_level_siblings;
2418     - static bool printed;
2419     + unsigned int eax, ebx, ecx, edx;
2420    
2421     if (c->cpuid_level < 0xb)
2422     - return;
2423     + return -1;
2424    
2425     cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
2426    
2427     @@ -43,7 +40,7 @@ void detect_extended_topology(struct cpuinfo_x86 *c)
2428     * check if the cpuid leaf 0xb is actually implemented.
2429     */
2430     if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE))
2431     - return;
2432     + return -1;
2433    
2434     set_cpu_cap(c, X86_FEATURE_XTOPOLOGY);
2435    
2436     @@ -51,10 +48,30 @@ void detect_extended_topology(struct cpuinfo_x86 *c)
2437     * initial apic id, which also represents 32-bit extended x2apic id.
2438     */
2439     c->initial_apicid = edx;
2440     + smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx);
2441     +#endif
2442     + return 0;
2443     +}
2444     +
2445     +/*
2446     + * Check for extended topology enumeration cpuid leaf 0xb and if it
2447     + * exists, use it for populating initial_apicid and cpu topology
2448     + * detection.
2449     + */
2450     +void detect_extended_topology(struct cpuinfo_x86 *c)
2451     +{
2452     +#ifdef CONFIG_SMP
2453     + unsigned int eax, ebx, ecx, edx, sub_index;
2454     + unsigned int ht_mask_width, core_plus_mask_width;
2455     + unsigned int core_select_mask, core_level_siblings;
2456     +
2457     + if (detect_extended_topology_early(c) < 0)
2458     + return;
2459    
2460     /*
2461     * Populate HT related information from sub-leaf level 0.
2462     */
2463     + cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
2464     core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx);
2465     core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
2466    
2467     @@ -85,15 +102,5 @@ void detect_extended_topology(struct cpuinfo_x86 *c)
2468     c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
2469    
2470     c->x86_max_cores = (core_level_siblings / smp_num_siblings);
2471     -
2472     - if (!printed) {
2473     - pr_info("CPU: Physical Processor ID: %d\n",
2474     - c->phys_proc_id);
2475     - if (c->x86_max_cores > 1)
2476     - pr_info("CPU: Processor Core ID: %d\n",
2477     - c->cpu_core_id);
2478     - printed = 1;
2479     - }
2480     - return;
2481     #endif
2482     }
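The early path above only needs smp_num_siblings from leaf 0xb sub-leaf 0. A hedged sketch of the decode with assumed register values; the field positions are taken to match the kernel's LEVEL_MAX_SIBLINGS()/BITS_SHIFT_NEXT_LEVEL() helpers (EBX[15:0] = logical CPUs at the SMT level, EAX[4:0] = x2APIC shift to the next level):

        #include <stdio.h>

        #define LEVEL_MAX_SIBLINGS(ebx)         ((ebx) & 0xffff)
        #define BITS_SHIFT_NEXT_LEVEL(eax)      ((eax) & 0x1f)

        int main(void)
        {
                /* Assumed CPUID(0xb, 0) output for a 2-way SMT part. */
                unsigned int eax = 0x1;         /* shift from SMT to core level */
                unsigned int ebx = 0x2;         /* logical CPUs at the SMT level */
                unsigned int edx = 0x1d;        /* this CPU's x2APIC ID */

                unsigned int smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx);
                unsigned int ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);

                printf("siblings=%u, core+pkg id=%u, smt id=%u\n",
                       smp_num_siblings,
                       edx >> ht_mask_width,                    /* bits above the SMT field */
                       edx & ((1u << ht_mask_width) - 1));      /* SMT thread within the core */
                return 0;
        }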
2483     diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
2484     index 96d80dfac383..430c095cfa0e 100644
2485     --- a/arch/x86/kernel/fpu/core.c
2486     +++ b/arch/x86/kernel/fpu/core.c
2487     @@ -10,6 +10,7 @@
2488     #include <asm/fpu/signal.h>
2489     #include <asm/fpu/types.h>
2490     #include <asm/traps.h>
2491     +#include <asm/irq_regs.h>
2492    
2493     #include <linux/hardirq.h>
2494     #include <linux/pkeys.h>
2495     diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
2496     index 6bf09f5594b2..5e06ffefc5db 100644
2497     --- a/arch/x86/kernel/ftrace.c
2498     +++ b/arch/x86/kernel/ftrace.c
2499     @@ -26,6 +26,7 @@
2500    
2501     #include <asm/cacheflush.h>
2502     #include <asm/kprobes.h>
2503     +#include <asm/sections.h>
2504     #include <asm/ftrace.h>
2505     #include <asm/nops.h>
2506    
2507     diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
2508     index 9512529e8eab..756634f14df6 100644
2509     --- a/arch/x86/kernel/hpet.c
2510     +++ b/arch/x86/kernel/hpet.c
2511     @@ -1,6 +1,7 @@
2512     #include <linux/clocksource.h>
2513     #include <linux/clockchips.h>
2514     #include <linux/interrupt.h>
2515     +#include <linux/irq.h>
2516     #include <linux/export.h>
2517     #include <linux/delay.h>
2518     #include <linux/errno.h>
2519     diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
2520     index 4e3b8a587c88..26d5451b6b42 100644
2521     --- a/arch/x86/kernel/i8259.c
2522     +++ b/arch/x86/kernel/i8259.c
2523     @@ -4,6 +4,7 @@
2524     #include <linux/sched.h>
2525     #include <linux/ioport.h>
2526     #include <linux/interrupt.h>
2527     +#include <linux/irq.h>
2528     #include <linux/timex.h>
2529     #include <linux/random.h>
2530     #include <linux/init.h>
2531     diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
2532     index 8a7ad9fb22c1..c6f0ef1d9ab7 100644
2533     --- a/arch/x86/kernel/irq.c
2534     +++ b/arch/x86/kernel/irq.c
2535     @@ -10,6 +10,7 @@
2536     #include <linux/ftrace.h>
2537     #include <linux/delay.h>
2538     #include <linux/export.h>
2539     +#include <linux/irq.h>
2540    
2541     #include <asm/apic.h>
2542     #include <asm/io_apic.h>
2543     diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
2544     index 2763573ee1d2..5aaa39a10823 100644
2545     --- a/arch/x86/kernel/irq_32.c
2546     +++ b/arch/x86/kernel/irq_32.c
2547     @@ -10,6 +10,7 @@
2548    
2549     #include <linux/seq_file.h>
2550     #include <linux/interrupt.h>
2551     +#include <linux/irq.h>
2552     #include <linux/kernel_stat.h>
2553     #include <linux/notifier.h>
2554     #include <linux/cpu.h>
2555     diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
2556     index 9ebd0b0e73d9..bcd1b82c86e8 100644
2557     --- a/arch/x86/kernel/irq_64.c
2558     +++ b/arch/x86/kernel/irq_64.c
2559     @@ -10,6 +10,7 @@
2560    
2561     #include <linux/kernel_stat.h>
2562     #include <linux/interrupt.h>
2563     +#include <linux/irq.h>
2564     #include <linux/seq_file.h>
2565     #include <linux/delay.h>
2566     #include <linux/ftrace.h>
2567     diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
2568     index f480b38a03c3..eeb77e5e5179 100644
2569     --- a/arch/x86/kernel/irqinit.c
2570     +++ b/arch/x86/kernel/irqinit.c
2571     @@ -4,6 +4,7 @@
2572     #include <linux/sched.h>
2573     #include <linux/ioport.h>
2574     #include <linux/interrupt.h>
2575     +#include <linux/irq.h>
2576     #include <linux/timex.h>
2577     #include <linux/random.h>
2578     #include <linux/kprobes.h>
2579     diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
2580     index 516be613bd41..64a70b2e2285 100644
2581     --- a/arch/x86/kernel/kprobes/core.c
2582     +++ b/arch/x86/kernel/kprobes/core.c
2583     @@ -61,6 +61,7 @@
2584     #include <asm/alternative.h>
2585     #include <asm/insn.h>
2586     #include <asm/debugreg.h>
2587     +#include <asm/sections.h>
2588    
2589     #include "common.h"
2590    
2591     @@ -396,7 +397,6 @@ int __copy_instruction(u8 *dest, u8 *src)
2592     newdisp = (u8 *) src + (s64) insn.displacement.value - (u8 *) dest;
2593     if ((s64) (s32) newdisp != newdisp) {
2594     pr_err("Kprobes error: new displacement does not fit into s32 (%llx)\n", newdisp);
2595     - pr_err("\tSrc: %p, Dest: %p, old disp: %x\n", src, dest, insn.displacement.value);
2596     return 0;
2597     }
2598     disp = (u8 *) dest + insn_offset_displacement(&insn);
2599     @@ -612,8 +612,7 @@ static int reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
2600     * Raise a BUG or we'll continue in an endless reentering loop
2601     * and eventually a stack overflow.
2602     */
2603     - printk(KERN_WARNING "Unrecoverable kprobe detected at %p.\n",
2604     - p->addr);
2605     + pr_err("Unrecoverable kprobe detected.\n");
2606     dump_kprobe(p);
2607     BUG();
2608     default:
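The surrounding context keeps the (s64)(s32)newdisp != newdisp test, which checks that the relocated displacement still fits in a signed 32-bit field. A tiny standalone illustration of that idiom (two's-complement truncation assumed, as on x86):

        #include <assert.h>
        #include <stdint.h>

        /* x fits in s32 exactly when truncating to 32 bits and sign-extending
         * back is lossless (wraparound on truncation assumed). */
        static int fits_in_s32(int64_t x)
        {
                return (int64_t)(int32_t)x == x;
        }

        int main(void)
        {
                assert(fits_in_s32(0x7fffffff));        /* INT32_MAX           */
                assert(fits_in_s32(-0x80000000ll));     /* INT32_MIN           */
                assert(!fits_in_s32(0x80000000ll));     /* one past the range  */
                assert(!fits_in_s32(-0x80000001ll));
                return 0;
        }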
2609     diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
2610     index 1808a9cc7701..1009d63a2b79 100644
2611     --- a/arch/x86/kernel/kprobes/opt.c
2612     +++ b/arch/x86/kernel/kprobes/opt.c
2613     @@ -39,6 +39,7 @@
2614     #include <asm/insn.h>
2615     #include <asm/debugreg.h>
2616     #include <asm/nospec-branch.h>
2617     +#include <asm/sections.h>
2618    
2619     #include "common.h"
2620    
2621     diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
2622     index bbf3d5933eaa..29d465627919 100644
2623     --- a/arch/x86/kernel/paravirt.c
2624     +++ b/arch/x86/kernel/paravirt.c
2625     @@ -88,10 +88,12 @@ unsigned paravirt_patch_call(void *insnbuf,
2626     struct branch *b = insnbuf;
2627     unsigned long delta = (unsigned long)target - (addr+5);
2628    
2629     - if (tgt_clobbers & ~site_clobbers)
2630     - return len; /* target would clobber too much for this site */
2631     - if (len < 5)
2632     + if (len < 5) {
2633     +#ifdef CONFIG_RETPOLINE
2634     + WARN_ONCE("Failing to patch indirect CALL in %ps\n", (void *)addr);
2635     +#endif
2636     return len; /* call too long for patch site */
2637     + }
2638    
2639     b->opcode = 0xe8; /* call */
2640     b->delta = delta;
2641     @@ -106,8 +108,12 @@ unsigned paravirt_patch_jmp(void *insnbuf, const void *target,
2642     struct branch *b = insnbuf;
2643     unsigned long delta = (unsigned long)target - (addr+5);
2644    
2645     - if (len < 5)
2646     + if (len < 5) {
2647     +#ifdef CONFIG_RETPOLINE
2648     + WARN_ONCE("Failing to patch indirect JMP in %ps\n", (void *)addr);
2649     +#endif
2650     return len; /* call too long for patch site */
2651     + }
2652    
2653     b->opcode = 0xe9; /* jmp */
2654     b->delta = delta;
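Both patch sites emit a 5-byte rel32 branch, which is why anything shorter than five bytes is rejected. A standalone sketch of the encoding with assumed addresses (little-endian, as on x86); the displacement is relative to the end of the instruction, hence the addr + 5:

        #include <stdint.h>
        #include <stdio.h>
        #include <string.h>

        /* Emit "call rel32" (0xe8) or "jmp rel32" (0xe9) at addr, targeting target. */
        static void emit_branch(uint8_t *buf, uint64_t addr, uint64_t target, uint8_t opcode)
        {
                int32_t delta = (int32_t)(target - (addr + 5));

                buf[0] = opcode;
                memcpy(&buf[1], &delta, sizeof(delta));
        }

        int main(void)
        {
                uint8_t insn[5];

                /* Assumed example addresses within +/- 2 GiB of each other. */
                emit_branch(insn, 0xffffffff81000000ull, 0xffffffff81000040ull, 0xe8);

                /* 0x40 - 5 = 0x3b, so this prints: e8 3b 00 00 00 */
                printf("%02x %02x %02x %02x %02x\n",
                       insn[0], insn[1], insn[2], insn[3], insn[4]);
                return 0;
        }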
2655     diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
2656     index 6b55012d02a3..49960ecfc322 100644
2657     --- a/arch/x86/kernel/setup.c
2658     +++ b/arch/x86/kernel/setup.c
2659     @@ -854,6 +854,12 @@ void __init setup_arch(char **cmdline_p)
2660     memblock_reserve(__pa_symbol(_text),
2661     (unsigned long)__bss_stop - (unsigned long)_text);
2662    
2663     + /*
2664     + * Make sure page 0 is always reserved because on systems with
2665     + * L1TF its contents can be leaked to user processes.
2666     + */
2667     + memblock_reserve(0, PAGE_SIZE);
2668     +
2669     early_reserve_initrd();
2670    
2671     /*
2672     diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
2673     index ea217caa731c..2863ad306692 100644
2674     --- a/arch/x86/kernel/smp.c
2675     +++ b/arch/x86/kernel/smp.c
2676     @@ -271,6 +271,7 @@ __visible void __irq_entry smp_reschedule_interrupt(struct pt_regs *regs)
2677     /*
2678     * KVM uses this interrupt to force a cpu out of guest mode
2679     */
2680     + kvm_set_cpu_l1tf_flush_l1d();
2681     }
2682    
2683     __visible void __irq_entry smp_trace_reschedule_interrupt(struct pt_regs *regs)
2684     diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
2685     index 10b22fc6ef5a..ef38bc1d1c00 100644
2686     --- a/arch/x86/kernel/smpboot.c
2687     +++ b/arch/x86/kernel/smpboot.c
2688     @@ -76,13 +76,7 @@
2689     #include <asm/realmode.h>
2690     #include <asm/misc.h>
2691     #include <asm/spec-ctrl.h>
2692     -
2693     -/* Number of siblings per CPU package */
2694     -int smp_num_siblings = 1;
2695     -EXPORT_SYMBOL(smp_num_siblings);
2696     -
2697     -/* Last level cache ID of each logical CPU */
2698     -DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID;
2699     +#include <asm/hw_irq.h>
2700    
2701     /* representing HT siblings of each logical CPU */
2702     DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
2703     @@ -295,6 +289,23 @@ found:
2704     return 0;
2705     }
2706    
2707     +/**
2708     + * topology_is_primary_thread - Check whether CPU is the primary SMT thread
2709     + * @cpu: CPU to check
2710     + */
2711     +bool topology_is_primary_thread(unsigned int cpu)
2712     +{
2713     + return apic_id_is_primary_thread(per_cpu(x86_cpu_to_apicid, cpu));
2714     +}
2715     +
2716     +/**
2717     + * topology_smt_supported - Check whether SMT is supported by the CPUs
2718     + */
2719     +bool topology_smt_supported(void)
2720     +{
2721     + return smp_num_siblings > 1;
2722     +}
2723     +
2724     /**
2725     * topology_phys_to_logical_pkg - Map a physical package id to a logical
2726     *
2727     diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
2728     index d39c09119db6..f8a0518d2810 100644
2729     --- a/arch/x86/kernel/time.c
2730     +++ b/arch/x86/kernel/time.c
2731     @@ -11,6 +11,7 @@
2732    
2733     #include <linux/clockchips.h>
2734     #include <linux/interrupt.h>
2735     +#include <linux/irq.h>
2736     #include <linux/i8253.h>
2737     #include <linux/time.h>
2738     #include <linux/export.h>
2739     diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
2740     index c4cd1280ac3e..c855080c7a71 100644
2741     --- a/arch/x86/kvm/svm.c
2742     +++ b/arch/x86/kvm/svm.c
2743     @@ -175,6 +175,8 @@ struct vcpu_svm {
2744     uint64_t sysenter_eip;
2745     uint64_t tsc_aux;
2746    
2747     + u64 msr_decfg;
2748     +
2749     u64 next_rip;
2750    
2751     u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
2752     @@ -1567,6 +1569,7 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
2753     u32 dummy;
2754     u32 eax = 1;
2755    
2756     + vcpu->arch.microcode_version = 0x01000065;
2757     svm->spec_ctrl = 0;
2758     svm->virt_spec_ctrl = 0;
2759    
2760     @@ -2124,6 +2127,8 @@ static int pf_interception(struct vcpu_svm *svm)
2761     u32 error_code;
2762     int r = 1;
2763    
2764     + svm->vcpu.arch.l1tf_flush_l1d = true;
2765     +
2766     switch (svm->apf_reason) {
2767     default:
2768     error_code = svm->vmcb->control.exit_info_1;
2769     @@ -3483,6 +3488,22 @@ static int cr8_write_interception(struct vcpu_svm *svm)
2770     return 0;
2771     }
2772    
2773     +static int svm_get_msr_feature(struct kvm_msr_entry *msr)
2774     +{
2775     + msr->data = 0;
2776     +
2777     + switch (msr->index) {
2778     + case MSR_F10H_DECFG:
2779     + if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
2780     + msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE;
2781     + break;
2782     + default:
2783     + return 1;
2784     + }
2785     +
2786     + return 0;
2787     +}
2788     +
2789     static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2790     {
2791     struct vcpu_svm *svm = to_svm(vcpu);
2792     @@ -3565,9 +3586,6 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2793    
2794     msr_info->data = svm->virt_spec_ctrl;
2795     break;
2796     - case MSR_IA32_UCODE_REV:
2797     - msr_info->data = 0x01000065;
2798     - break;
2799     case MSR_F15H_IC_CFG: {
2800    
2801     int family, model;
2802     @@ -3585,6 +3603,9 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2803     msr_info->data = 0x1E;
2804     }
2805     break;
2806     + case MSR_F10H_DECFG:
2807     + msr_info->data = svm->msr_decfg;
2808     + break;
2809     default:
2810     return kvm_get_msr_common(vcpu, msr_info);
2811     }
2812     @@ -3773,6 +3794,24 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2813     case MSR_VM_IGNNE:
2814     vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
2815     break;
2816     + case MSR_F10H_DECFG: {
2817     + struct kvm_msr_entry msr_entry;
2818     +
2819     + msr_entry.index = msr->index;
2820     + if (svm_get_msr_feature(&msr_entry))
2821     + return 1;
2822     +
2823     + /* Check the supported bits */
2824     + if (data & ~msr_entry.data)
2825     + return 1;
2826     +
2827     + /* Don't allow the guest to change a bit, #GP */
2828     + if (!msr->host_initiated && (data ^ msr_entry.data))
2829     + return 1;
2830     +
2831     + svm->msr_decfg = data;
2832     + break;
2833     + }
2834     case MSR_IA32_APICBASE:
2835     if (kvm_vcpu_apicv_active(vcpu))
2836     avic_update_vapic_bar(to_svm(vcpu), data);
2837     @@ -5502,6 +5541,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
2838     .vcpu_unblocking = svm_vcpu_unblocking,
2839    
2840     .update_bp_intercept = update_bp_intercept,
2841     + .get_msr_feature = svm_get_msr_feature,
2842     .get_msr = svm_get_msr,
2843     .set_msr = svm_set_msr,
2844     .get_segment_base = svm_get_segment_base,
2845     diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
2846     index 30b74b491909..12826607a995 100644
2847     --- a/arch/x86/kvm/vmx.c
2848     +++ b/arch/x86/kvm/vmx.c
2849     @@ -189,6 +189,150 @@ module_param(ple_window_max, int, S_IRUGO);
2850    
2851     extern const ulong vmx_return;
2852    
2853     +static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
2854     +static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
2855     +static DEFINE_MUTEX(vmx_l1d_flush_mutex);
2856     +
2857     +/* Storage for pre module init parameter parsing */
2858     +static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;
2859     +
2860     +static const struct {
2861     + const char *option;
2862     + enum vmx_l1d_flush_state cmd;
2863     +} vmentry_l1d_param[] = {
2864     + {"auto", VMENTER_L1D_FLUSH_AUTO},
2865     + {"never", VMENTER_L1D_FLUSH_NEVER},
2866     + {"cond", VMENTER_L1D_FLUSH_COND},
2867     + {"always", VMENTER_L1D_FLUSH_ALWAYS},
2868     +};
2869     +
2870     +#define L1D_CACHE_ORDER 4
2871     +static void *vmx_l1d_flush_pages;
2872     +
2873     +static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
2874     +{
2875     + struct page *page;
2876     + unsigned int i;
2877     +
2878     + if (!enable_ept) {
2879     + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
2880     + return 0;
2881     + }
2882     +
2883     + if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
2884     + u64 msr;
2885     +
2886     + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
2887     + if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
2888     + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
2889     + return 0;
2890     + }
2891     + }
2892     +
2893     + /* If set to auto use the default l1tf mitigation method */
2894     + if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
2895     + switch (l1tf_mitigation) {
2896     + case L1TF_MITIGATION_OFF:
2897     + l1tf = VMENTER_L1D_FLUSH_NEVER;
2898     + break;
2899     + case L1TF_MITIGATION_FLUSH_NOWARN:
2900     + case L1TF_MITIGATION_FLUSH:
2901     + case L1TF_MITIGATION_FLUSH_NOSMT:
2902     + l1tf = VMENTER_L1D_FLUSH_COND;
2903     + break;
2904     + case L1TF_MITIGATION_FULL:
2905     + case L1TF_MITIGATION_FULL_FORCE:
2906     + l1tf = VMENTER_L1D_FLUSH_ALWAYS;
2907     + break;
2908     + }
2909     + } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
2910     + l1tf = VMENTER_L1D_FLUSH_ALWAYS;
2911     + }
2912     +
2913     + if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
2914     + !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
2915     + page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
2916     + if (!page)
2917     + return -ENOMEM;
2918     + vmx_l1d_flush_pages = page_address(page);
2919     +
2920     + /*
2921     + * Initialize each page with a different pattern in
2922     + * order to protect against KSM in the nested
2923     + * virtualization case.
2924     + */
2925     + for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
2926     + memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
2927     + PAGE_SIZE);
2928     + }
2929     + }
2930     +
2931     + l1tf_vmx_mitigation = l1tf;
2932     +
2933     + if (l1tf != VMENTER_L1D_FLUSH_NEVER)
2934     + static_branch_enable(&vmx_l1d_should_flush);
2935     + else
2936     + static_branch_disable(&vmx_l1d_should_flush);
2937     +
2938     + if (l1tf == VMENTER_L1D_FLUSH_COND)
2939     + static_branch_enable(&vmx_l1d_flush_cond);
2940     + else
2941     + static_branch_disable(&vmx_l1d_flush_cond);
2942     + return 0;
2943     +}
2944     +
2945     +static int vmentry_l1d_flush_parse(const char *s)
2946     +{
2947     + unsigned int i;
2948     +
2949     + if (s) {
2950     + for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
2951     + if (sysfs_streq(s, vmentry_l1d_param[i].option))
2952     + return vmentry_l1d_param[i].cmd;
2953     + }
2954     + }
2955     + return -EINVAL;
2956     +}
2957     +
2958     +static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
2959     +{
2960     + int l1tf, ret;
2961     +
2962     + if (!boot_cpu_has(X86_BUG_L1TF))
2963     + return 0;
2964     +
2965     + l1tf = vmentry_l1d_flush_parse(s);
2966     + if (l1tf < 0)
2967     + return l1tf;
2968     +
2969     + /*
2970     + * Has vmx_init() run already? If not then this is the pre init
2971     + * parameter parsing. In that case just store the value and let
2972     + * vmx_init() do the proper setup after enable_ept has been
2973     + * established.
2974     + */
2975     + if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
2976     + vmentry_l1d_flush_param = l1tf;
2977     + return 0;
2978     + }
2979     +
2980     + mutex_lock(&vmx_l1d_flush_mutex);
2981     + ret = vmx_setup_l1d_flush(l1tf);
2982     + mutex_unlock(&vmx_l1d_flush_mutex);
2983     + return ret;
2984     +}
2985     +
2986     +static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
2987     +{
2988     + return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
2989     +}
2990     +
2991     +static const struct kernel_param_ops vmentry_l1d_flush_ops = {
2992     + .set = vmentry_l1d_flush_set,
2993     + .get = vmentry_l1d_flush_get,
2994     +};
2995     +module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
2996     +
2997     #define NR_AUTOLOAD_MSRS 8
2998    
2999     struct vmcs {
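For reference, the module_param_cb() hook above exposes the chosen mode as a writable (0644) kvm_intel parameter, so it can be inspected after boot. A minimal userspace sketch, assuming kvm_intel is loaded and sysfs is mounted in the usual place (the file name follows directly from the parameter name registered above); this is illustrative only, not part of the patch:

	/* Illustrative only: read the current vmentry_l1d_flush mode via sysfs. */
	#include <stdio.h>

	int main(void)
	{
		char mode[32] = "";
		FILE *f = fopen("/sys/module/kvm_intel/parameters/vmentry_l1d_flush", "r");

		if (!f) {
			perror("vmentry_l1d_flush");
			return 1;
		}
		if (fgets(mode, sizeof(mode), f))
			printf("L1D flush on VMENTER: %s", mode);	/* e.g. "cond" */
		fclose(f);
		return 0;
	}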
3000     @@ -541,6 +685,11 @@ static inline int pi_test_sn(struct pi_desc *pi_desc)
3001     (unsigned long *)&pi_desc->control);
3002     }
3003    
3004     +struct vmx_msrs {
3005     + unsigned int nr;
3006     + struct vmx_msr_entry val[NR_AUTOLOAD_MSRS];
3007     +};
3008     +
3009     struct vcpu_vmx {
3010     struct kvm_vcpu vcpu;
3011     unsigned long host_rsp;
3012     @@ -573,9 +722,8 @@ struct vcpu_vmx {
3013     struct loaded_vmcs *loaded_vmcs;
3014     bool __launched; /* temporary, used in vmx_vcpu_run */
3015     struct msr_autoload {
3016     - unsigned nr;
3017     - struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
3018     - struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
3019     + struct vmx_msrs guest;
3020     + struct vmx_msrs host;
3021     } msr_autoload;
3022     struct {
3023     int loaded;
3024     @@ -1920,9 +2068,20 @@ static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
3025     vm_exit_controls_clearbit(vmx, exit);
3026     }
3027    
3028     +static int find_msr(struct vmx_msrs *m, unsigned int msr)
3029     +{
3030     + unsigned int i;
3031     +
3032     + for (i = 0; i < m->nr; ++i) {
3033     + if (m->val[i].index == msr)
3034     + return i;
3035     + }
3036     + return -ENOENT;
3037     +}
3038     +
3039     static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
3040     {
3041     - unsigned i;
3042     + int i;
3043     struct msr_autoload *m = &vmx->msr_autoload;
3044    
3045     switch (msr) {
3046     @@ -1943,18 +2102,21 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
3047     }
3048     break;
3049     }
3050     + i = find_msr(&m->guest, msr);
3051     + if (i < 0)
3052     + goto skip_guest;
3053     + --m->guest.nr;
3054     + m->guest.val[i] = m->guest.val[m->guest.nr];
3055     + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
3056    
3057     - for (i = 0; i < m->nr; ++i)
3058     - if (m->guest[i].index == msr)
3059     - break;
3060     -
3061     - if (i == m->nr)
3062     +skip_guest:
3063     + i = find_msr(&m->host, msr);
3064     + if (i < 0)
3065     return;
3066     - --m->nr;
3067     - m->guest[i] = m->guest[m->nr];
3068     - m->host[i] = m->host[m->nr];
3069     - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
3070     - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
3071     +
3072     + --m->host.nr;
3073     + m->host.val[i] = m->host.val[m->host.nr];
3074     + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
3075     }
3076    
3077     static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
3078     @@ -1969,9 +2131,9 @@ static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
3079     }
3080    
3081     static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
3082     - u64 guest_val, u64 host_val)
3083     + u64 guest_val, u64 host_val, bool entry_only)
3084     {
3085     - unsigned i;
3086     + int i, j = 0;
3087     struct msr_autoload *m = &vmx->msr_autoload;
3088    
3089     switch (msr) {
3090     @@ -2006,24 +2168,31 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
3091     wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
3092     }
3093    
3094     - for (i = 0; i < m->nr; ++i)
3095     - if (m->guest[i].index == msr)
3096     - break;
3097     + i = find_msr(&m->guest, msr);
3098     + if (!entry_only)
3099     + j = find_msr(&m->host, msr);
3100    
3101     - if (i == NR_AUTOLOAD_MSRS) {
3102     + if (i == NR_AUTOLOAD_MSRS || j == NR_AUTOLOAD_MSRS) {
3103     printk_once(KERN_WARNING "Not enough msr switch entries. "
3104     "Can't add msr %x\n", msr);
3105     return;
3106     - } else if (i == m->nr) {
3107     - ++m->nr;
3108     - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
3109     - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
3110     }
3111     + if (i < 0) {
3112     + i = m->guest.nr++;
3113     + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
3114     + }
3115     + m->guest.val[i].index = msr;
3116     + m->guest.val[i].value = guest_val;
3117    
3118     - m->guest[i].index = msr;
3119     - m->guest[i].value = guest_val;
3120     - m->host[i].index = msr;
3121     - m->host[i].value = host_val;
3122     + if (entry_only)
3123     + return;
3124     +
3125     + if (j < 0) {
3126     + j = m->host.nr++;
3127     + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
3128     + }
3129     + m->host.val[j].index = msr;
3130     + m->host.val[j].value = host_val;
3131     }
3132    
3133     static void reload_tss(void)
3134     @@ -2080,7 +2249,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
3135     guest_efer &= ~EFER_LME;
3136     if (guest_efer != host_efer)
3137     add_atomic_switch_msr(vmx, MSR_EFER,
3138     - guest_efer, host_efer);
3139     + guest_efer, host_efer, false);
3140     return false;
3141     } else {
3142     guest_efer &= ~ignore_bits;
3143     @@ -2994,6 +3163,11 @@ static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
3144     return !(val & ~valid_bits);
3145     }
3146    
3147     +static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
3148     +{
3149     + return 1;
3150     +}
3151     +
3152     /*
3153     * Reads an msr value (of 'msr_index') into 'pdata'.
3154     * Returns 0 on success, non-0 otherwise.
3155     @@ -3244,7 +3418,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
3156     vcpu->arch.ia32_xss = data;
3157     if (vcpu->arch.ia32_xss != host_xss)
3158     add_atomic_switch_msr(vmx, MSR_IA32_XSS,
3159     - vcpu->arch.ia32_xss, host_xss);
3160     + vcpu->arch.ia32_xss, host_xss, false);
3161     else
3162     clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
3163     break;
3164     @@ -5265,9 +5439,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
3165    
3166     vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
3167     vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
3168     - vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
3169     + vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
3170     vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
3171     - vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
3172     + vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
3173    
3174     if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
3175     vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
3176     @@ -5287,8 +5461,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
3177     ++vmx->nmsrs;
3178     }
3179    
3180     - if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
3181     - rdmsrl(MSR_IA32_ARCH_CAPABILITIES, vmx->arch_capabilities);
3182     + vmx->arch_capabilities = kvm_get_arch_capabilities();
3183    
3184     vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);
3185    
3186     @@ -5317,6 +5490,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
3187     u64 cr0;
3188    
3189     vmx->rmode.vm86_active = 0;
3190     + vcpu->arch.microcode_version = 0x100000000ULL;
3191     vmx->spec_ctrl = 0;
3192    
3193     vmx->soft_vnmi_blocked = 0;
3194     @@ -5722,6 +5896,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
3195     BUG_ON(enable_ept);
3196     cr2 = vmcs_readl(EXIT_QUALIFICATION);
3197     trace_kvm_page_fault(cr2, error_code);
3198     + vcpu->arch.l1tf_flush_l1d = true;
3199    
3200     if (kvm_event_needs_reinjection(vcpu))
3201     kvm_mmu_unprotect_page_virt(vcpu, cr2);
3202     @@ -8485,6 +8660,79 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
3203     }
3204     }
3205    
3206     +/*
3207     + * Software based L1D cache flush which is used when microcode providing
3208     + * the cache control MSR is not loaded.
3209     + *
3210     + * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but
3211     + * flushing it requires reading 64 KiB because the replacement algorithm
3212     + * is not exactly LRU. This could be sized at runtime via topology
3213     + * information, but as all relevant affected CPUs have a 32 KiB L1D cache
3214     + * there is no point in doing so.
3215     + */
3216     +#define L1D_CACHE_ORDER 4
3217     +static void *vmx_l1d_flush_pages;
3218     +
3219     +static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
3220     +{
3221     + int size = PAGE_SIZE << L1D_CACHE_ORDER;
3222     +
3223     + /*
3224     + * This code is only executed when the flush mode is 'cond' or
3225     + * 'always'
3226     + */
3227     + if (static_branch_likely(&vmx_l1d_flush_cond)) {
3228     + bool flush_l1d;
3229     +
3230     + /*
3231     + * Clear the per-vcpu flush bit, it gets set again
3232     + * either from vcpu_run() or from one of the unsafe
3233     + * VMEXIT handlers.
3234     + */
3235     + flush_l1d = vcpu->arch.l1tf_flush_l1d;
3236     + vcpu->arch.l1tf_flush_l1d = false;
3237     +
3238     + /*
3239     + * Clear the per-cpu flush bit, it gets set again from
3240     + * the interrupt handlers.
3241     + */
3242     + flush_l1d |= kvm_get_cpu_l1tf_flush_l1d();
3243     + kvm_clear_cpu_l1tf_flush_l1d();
3244     +
3245     + if (!flush_l1d)
3246     + return;
3247     + }
3248     +
3249     + vcpu->stat.l1d_flush++;
3250     +
3251     + if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
3252     + wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
3253     + return;
3254     + }
3255     +
3256     + asm volatile(
3257     + /* First ensure the pages are in the TLB */
3258     + "xorl %%eax, %%eax\n"
3259     + ".Lpopulate_tlb:\n\t"
3260     + "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
3261     + "addl $4096, %%eax\n\t"
3262     + "cmpl %%eax, %[size]\n\t"
3263     + "jne .Lpopulate_tlb\n\t"
3264     + "xorl %%eax, %%eax\n\t"
3265     + "cpuid\n\t"
3266     + /* Now fill the cache */
3267     + "xorl %%eax, %%eax\n"
3268     + ".Lfill_cache:\n"
3269     + "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
3270     + "addl $64, %%eax\n\t"
3271     + "cmpl %%eax, %[size]\n\t"
3272     + "jne .Lfill_cache\n\t"
3273     + "lfence\n"
3274     + :: [flush_pages] "r" (vmx_l1d_flush_pages),
3275     + [size] "r" (size)
3276     + : "eax", "ebx", "ecx", "edx");
3277     +}
3278     +
3279     static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
3280     {
3281     struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
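The inline assembly in vmx_l1d_flush() above first touches one byte per 4 KiB page of the 64 KiB buffer so the whole buffer is present in the TLB (with a serializing CPUID in between), then reads one byte per 64-byte line to displace the current L1D contents, finishing with LFENCE. A rough plain-C sketch of those two loops, purely as an illustration (the real code must stay in assembly to control ordering and register use):

	/* Plain-C illustration of the two read loops in vmx_l1d_flush(); not kernel code. */
	#include <stdlib.h>

	#define FLUSH_SIZE (64 * 1024)	/* PAGE_SIZE << L1D_CACHE_ORDER */

	int main(void)
	{
		unsigned char *flush_pages = calloc(1, FLUSH_SIZE);
		volatile unsigned char sink;
		int offset;

		if (!flush_pages)
			return 1;

		/* Pass 1: one read per 4 KiB page pulls the whole buffer into the TLB. */
		for (offset = 0; offset < FLUSH_SIZE; offset += 4096)
			sink = flush_pages[offset];

		/* Pass 2: one read per 64-byte line displaces the previous L1D contents. */
		for (offset = 0; offset < FLUSH_SIZE; offset += 64)
			sink = flush_pages[offset];

		(void)sink;
		free(flush_pages);
		return 0;
	}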
3282     @@ -8857,7 +9105,7 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
3283     clear_atomic_switch_msr(vmx, msrs[i].msr);
3284     else
3285     add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
3286     - msrs[i].host);
3287     + msrs[i].host, false);
3288     }
3289    
3290     void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
3291     @@ -8941,6 +9189,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
3292    
3293     vmx->__launched = vmx->loaded_vmcs->launched;
3294    
3295     + if (static_branch_unlikely(&vmx_l1d_should_flush))
3296     + vmx_l1d_flush(vcpu);
3297     +
3298     asm(
3299     /* Store host registers */
3300     "push %%" _ASM_DX "; push %%" _ASM_BP ";"
3301     @@ -9298,6 +9549,37 @@ free_vcpu:
3302     return ERR_PTR(err);
3303     }
3304    
3305     +#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
3306     +#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
3307     +
3308     +static int vmx_vm_init(struct kvm *kvm)
3309     +{
3310     + if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
3311     + switch (l1tf_mitigation) {
3312     + case L1TF_MITIGATION_OFF:
3313     + case L1TF_MITIGATION_FLUSH_NOWARN:
3314     + /* 'I explicitly don't care' is set */
3315     + break;
3316     + case L1TF_MITIGATION_FLUSH:
3317     + case L1TF_MITIGATION_FLUSH_NOSMT:
3318     + case L1TF_MITIGATION_FULL:
3319     + /*
3320     + * Warn upon starting the first VM in a potentially
3321     + * insecure environment.
3322     + */
3323     + if (cpu_smt_control == CPU_SMT_ENABLED)
3324     + pr_warn_once(L1TF_MSG_SMT);
3325     + if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
3326     + pr_warn_once(L1TF_MSG_L1D);
3327     + break;
3328     + case L1TF_MITIGATION_FULL_FORCE:
3329     + /* Flush is enforced */
3330     + break;
3331     + }
3332     + }
3333     + return 0;
3334     +}
3335     +
3336     static void __init vmx_check_processor_compat(void *rtn)
3337     {
3338     struct vmcs_config vmcs_conf;
3339     @@ -10092,6 +10374,15 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3340     */
3341     vmx_set_constant_host_state(vmx);
3342    
3343     + /*
3344     + * Set the MSR load/store lists to match L0's settings.
3345     + */
3346     + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
3347     + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
3348     + vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
3349     + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
3350     + vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
3351     +
3352     /*
3353     * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
3354     * entry, but only if the current (host) sp changed from the value
3355     @@ -10442,6 +10733,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
3356    
3357     vmcs12->launch_state = 1;
3358    
3359     + /* Hide L1D cache contents from the nested guest. */
3360     + vmx->vcpu.arch.l1tf_flush_l1d = true;
3361     +
3362     if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)
3363     return kvm_vcpu_halt(vcpu);
3364    
3365     @@ -10936,6 +11230,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
3366     load_vmcs12_host_state(vcpu, vmcs12);
3367    
3368     /* Update any VMCS fields that might have changed while L2 ran */
3369     + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
3370     + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
3371     vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
3372     if (vmx->hv_deadline_tsc == -1)
3373     vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
3374     @@ -11367,6 +11663,8 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
3375     .cpu_has_accelerated_tpr = report_flexpriority,
3376     .has_emulated_msr = vmx_has_emulated_msr,
3377    
3378     + .vm_init = vmx_vm_init,
3379     +
3380     .vcpu_create = vmx_create_vcpu,
3381     .vcpu_free = vmx_free_vcpu,
3382     .vcpu_reset = vmx_vcpu_reset,
3383     @@ -11376,6 +11674,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
3384     .vcpu_put = vmx_vcpu_put,
3385    
3386     .update_bp_intercept = update_exception_bitmap,
3387     + .get_msr_feature = vmx_get_msr_feature,
3388     .get_msr = vmx_get_msr,
3389     .set_msr = vmx_set_msr,
3390     .get_segment_base = vmx_get_segment_base,
3391     @@ -11486,22 +11785,18 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
3392     .setup_mce = vmx_setup_mce,
3393     };
3394    
3395     -static int __init vmx_init(void)
3396     +static void vmx_cleanup_l1d_flush(void)
3397     {
3398     - int r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
3399     - __alignof__(struct vcpu_vmx), THIS_MODULE);
3400     - if (r)
3401     - return r;
3402     -
3403     -#ifdef CONFIG_KEXEC_CORE
3404     - rcu_assign_pointer(crash_vmclear_loaded_vmcss,
3405     - crash_vmclear_local_loaded_vmcss);
3406     -#endif
3407     -
3408     - return 0;
3409     + if (vmx_l1d_flush_pages) {
3410     + free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
3411     + vmx_l1d_flush_pages = NULL;
3412     + }
3413     + /* Restore state so sysfs ignores VMX */
3414     + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
3415     }
3416    
3417     -static void __exit vmx_exit(void)
3418     +
3419     +static void vmx_exit(void)
3420     {
3421     #ifdef CONFIG_KEXEC_CORE
3422     RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
3423     @@ -11509,7 +11804,40 @@ static void __exit vmx_exit(void)
3424     #endif
3425    
3426     kvm_exit();
3427     +
3428     + vmx_cleanup_l1d_flush();
3429     }
3430     +module_exit(vmx_exit)
3431     +
3432     +static int __init vmx_init(void)
3433     +{
3434     + int r;
3435     +
3436     + r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
3437     + __alignof__(struct vcpu_vmx), THIS_MODULE);
3438     + if (r)
3439     + return r;
3440    
3441     + /*
3442     + * Must be called after kvm_init() so enable_ept is properly set
3443     + * up. Hand in the mitigation parameter value which was stored in
3444     + * the pre module init parser. If no parameter was given, it will
3445     + * contain 'auto' which will be turned into the default 'cond'
3446     + * mitigation mode.
3447     + */
3448     + if (boot_cpu_has(X86_BUG_L1TF)) {
3449     + r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
3450     + if (r) {
3451     + vmx_exit();
3452     + return r;
3453     + }
3454     + }
3455     +
3456     +#ifdef CONFIG_KEXEC_CORE
3457     + rcu_assign_pointer(crash_vmclear_loaded_vmcss,
3458     + crash_vmclear_local_loaded_vmcss);
3459     +#endif
3460     +
3461     + return 0;
3462     +}
3463     module_init(vmx_init)
3464     -module_exit(vmx_exit)
3465     diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
3466     index 5ca23af44c81..203d42340fc1 100644
3467     --- a/arch/x86/kvm/x86.c
3468     +++ b/arch/x86/kvm/x86.c
3469     @@ -180,6 +180,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
3470     { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
3471     { "irq_injections", VCPU_STAT(irq_injections) },
3472     { "nmi_injections", VCPU_STAT(nmi_injections) },
3473     + { "l1d_flush", VCPU_STAT(l1d_flush) },
3474     { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
3475     { "mmu_pte_write", VM_STAT(mmu_pte_write) },
3476     { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
3477     @@ -1007,6 +1008,71 @@ static u32 emulated_msrs[] = {
3478    
3479     static unsigned num_emulated_msrs;
3480    
3481     +/*
3482     + * List of msr numbers which are used to expose MSR-based features that
3483     + * can be used by a hypervisor to validate requested CPU features.
3484     + */
3485     +static u32 msr_based_features[] = {
3486     + MSR_F10H_DECFG,
3487     + MSR_IA32_UCODE_REV,
3488     + MSR_IA32_ARCH_CAPABILITIES,
3489     +};
3490     +
3491     +static unsigned int num_msr_based_features;
3492     +
3493     +u64 kvm_get_arch_capabilities(void)
3494     +{
3495     + u64 data;
3496     +
3497     + rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data);
3498     +
3499     + /*
3500     + * If we're doing cache flushes (either "always" or "cond")
3501     + * we will do one whenever the guest does a vmlaunch/vmresume.
3502     + * If an outer hypervisor is doing the cache flush for us
3503     + * (VMENTER_L1D_FLUSH_NESTED_VM), we can safely pass that
3504     + * capability to the guest too, and if EPT is disabled we're not
3505     + * vulnerable. Overall, only VMENTER_L1D_FLUSH_NEVER will
3506     + * require a nested hypervisor to do a flush of its own.
3507     + */
3508     + if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER)
3509     + data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH;
3510     +
3511     + return data;
3512     +}
3513     +EXPORT_SYMBOL_GPL(kvm_get_arch_capabilities);
3514     +
3515     +static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
3516     +{
3517     + switch (msr->index) {
3518     + case MSR_IA32_ARCH_CAPABILITIES:
3519     + msr->data = kvm_get_arch_capabilities();
3520     + break;
3521     + case MSR_IA32_UCODE_REV:
3522     + rdmsrl_safe(msr->index, &msr->data);
3523     + break;
3524     + default:
3525     + if (kvm_x86_ops->get_msr_feature(msr))
3526     + return 1;
3527     + }
3528     + return 0;
3529     +}
3530     +
3531     +static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
3532     +{
3533     + struct kvm_msr_entry msr;
3534     + int r;
3535     +
3536     + msr.index = index;
3537     + r = kvm_get_msr_feature(&msr);
3538     + if (r)
3539     + return r;
3540     +
3541     + *data = msr.data;
3542     +
3543     + return 0;
3544     +}
3545     +
3546     bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
3547     {
3548     if (efer & efer_reserved_bits)
3549     @@ -2121,13 +2187,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
3550    
3551     switch (msr) {
3552     case MSR_AMD64_NB_CFG:
3553     - case MSR_IA32_UCODE_REV:
3554     case MSR_IA32_UCODE_WRITE:
3555     case MSR_VM_HSAVE_PA:
3556     case MSR_AMD64_PATCH_LOADER:
3557     case MSR_AMD64_BU_CFG2:
3558     break;
3559    
3560     + case MSR_IA32_UCODE_REV:
3561     + if (msr_info->host_initiated)
3562     + vcpu->arch.microcode_version = data;
3563     + break;
3564     case MSR_EFER:
3565     return set_efer(vcpu, data);
3566     case MSR_K7_HWCR:
3567     @@ -2402,7 +2471,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
3568     msr_info->data = 0;
3569     break;
3570     case MSR_IA32_UCODE_REV:
3571     - msr_info->data = 0x100000000ULL;
3572     + msr_info->data = vcpu->arch.microcode_version;
3573     break;
3574     case MSR_MTRRcap:
3575     case 0x200 ... 0x2ff:
3576     @@ -2545,13 +2614,11 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
3577     int (*do_msr)(struct kvm_vcpu *vcpu,
3578     unsigned index, u64 *data))
3579     {
3580     - int i, idx;
3581     + int i;
3582    
3583     - idx = srcu_read_lock(&vcpu->kvm->srcu);
3584     for (i = 0; i < msrs->nmsrs; ++i)
3585     if (do_msr(vcpu, entries[i].index, &entries[i].data))
3586     break;
3587     - srcu_read_unlock(&vcpu->kvm->srcu, idx);
3588    
3589     return i;
3590     }
3591     @@ -2651,6 +2718,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
3592     case KVM_CAP_ASSIGN_DEV_IRQ:
3593     case KVM_CAP_PCI_2_3:
3594     #endif
3595     + case KVM_CAP_GET_MSR_FEATURES:
3596     r = 1;
3597     break;
3598     case KVM_CAP_ADJUST_CLOCK:
3599     @@ -2770,6 +2838,31 @@ long kvm_arch_dev_ioctl(struct file *filp,
3600     goto out;
3601     r = 0;
3602     break;
3603     + case KVM_GET_MSR_FEATURE_INDEX_LIST: {
3604     + struct kvm_msr_list __user *user_msr_list = argp;
3605     + struct kvm_msr_list msr_list;
3606     + unsigned int n;
3607     +
3608     + r = -EFAULT;
3609     + if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
3610     + goto out;
3611     + n = msr_list.nmsrs;
3612     + msr_list.nmsrs = num_msr_based_features;
3613     + if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
3614     + goto out;
3615     + r = -E2BIG;
3616     + if (n < msr_list.nmsrs)
3617     + goto out;
3618     + r = -EFAULT;
3619     + if (copy_to_user(user_msr_list->indices, &msr_based_features,
3620     + num_msr_based_features * sizeof(u32)))
3621     + goto out;
3622     + r = 0;
3623     + break;
3624     + }
3625     + case KVM_GET_MSRS:
3626     + r = msr_io(NULL, argp, do_get_msr_feature, 1);
3627     + break;
3628     }
3629     default:
3630     r = -EINVAL;
3631     @@ -3451,12 +3544,18 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
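The new KVM_GET_MSR_FEATURE_INDEX_LIST handling above follows the usual two-call KVM convention: a first call with too small an nmsrs fails with E2BIG but writes back the real count, and a second call fills in the indices. A minimal userspace sketch, assuming kernel headers new enough to provide the ioctl and struct kvm_msr_list; error handling is trimmed and the program is illustrative only:

	/* Illustrative only: enumerate the feature MSRs exposed by the new ioctl. */
	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	int main(void)
	{
		struct kvm_msr_list probe = { .nmsrs = 0 };
		struct kvm_msr_list *list;
		unsigned int i;
		int kvm = open("/dev/kvm", O_RDWR);

		if (kvm < 0)
			return 1;

		/* First call: fails with E2BIG but writes back the real count. */
		ioctl(kvm, KVM_GET_MSR_FEATURE_INDEX_LIST, &probe);

		list = calloc(1, sizeof(*list) + probe.nmsrs * sizeof(__u32));
		if (!list)
			return 1;
		list->nmsrs = probe.nmsrs;

		if (ioctl(kvm, KVM_GET_MSR_FEATURE_INDEX_LIST, list) == 0)
			for (i = 0; i < list->nmsrs; i++)
				printf("feature MSR index 0x%x\n", list->indices[i]);

		free(list);
		return 0;
	}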
3632     r = 0;
3633     break;
3634     }
3635     - case KVM_GET_MSRS:
3636     + case KVM_GET_MSRS: {
3637     + int idx = srcu_read_lock(&vcpu->kvm->srcu);
3638     r = msr_io(vcpu, argp, do_get_msr, 1);
3639     + srcu_read_unlock(&vcpu->kvm->srcu, idx);
3640     break;
3641     - case KVM_SET_MSRS:
3642     + }
3643     + case KVM_SET_MSRS: {
3644     + int idx = srcu_read_lock(&vcpu->kvm->srcu);
3645     r = msr_io(vcpu, argp, do_set_msr, 0);
3646     + srcu_read_unlock(&vcpu->kvm->srcu, idx);
3647     break;
3648     + }
3649     case KVM_TPR_ACCESS_REPORTING: {
3650     struct kvm_tpr_access_ctl tac;
3651    
3652     @@ -4236,6 +4335,19 @@ static void kvm_init_msr_list(void)
3653     j++;
3654     }
3655     num_emulated_msrs = j;
3656     +
3657     + for (i = j = 0; i < ARRAY_SIZE(msr_based_features); i++) {
3658     + struct kvm_msr_entry msr;
3659     +
3660     + msr.index = msr_based_features[i];
3661     + if (kvm_get_msr_feature(&msr))
3662     + continue;
3663     +
3664     + if (j < i)
3665     + msr_based_features[j] = msr_based_features[i];
3666     + j++;
3667     + }
3668     + num_msr_based_features = j;
3669     }
3670    
3671     static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
3672     @@ -4476,6 +4588,9 @@ static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *v
3673     int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
3674     unsigned int bytes, struct x86_exception *exception)
3675     {
3676     + /* kvm_write_guest_virt_system can pull in tons of pages. */
3677     + vcpu->arch.l1tf_flush_l1d = true;
3678     +
3679     return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
3680     PFERR_WRITE_MASK, exception);
3681     }
3682     @@ -5574,6 +5689,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
3683     bool writeback = true;
3684     bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
3685    
3686     + vcpu->arch.l1tf_flush_l1d = true;
3687     +
3688     /*
3689     * Clear write_fault_to_shadow_pgtable here to ensure it is
3690     * never reused.
3691     @@ -6929,6 +7046,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
3692     struct kvm *kvm = vcpu->kvm;
3693    
3694     vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
3695     + vcpu->arch.l1tf_flush_l1d = true;
3696    
3697     for (;;) {
3698     if (kvm_vcpu_running(vcpu)) {
3699     @@ -7899,6 +8017,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
3700    
3701     void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
3702     {
3703     + vcpu->arch.l1tf_flush_l1d = true;
3704     kvm_x86_ops->sched_in(vcpu, cpu);
3705     }
3706    
3707     diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
3708     index ae23c996e3a8..acef3c6a32a2 100644
3709     --- a/arch/x86/mm/fault.c
3710     +++ b/arch/x86/mm/fault.c
3711     @@ -23,6 +23,7 @@
3712     #include <asm/vsyscall.h> /* emulate_vsyscall */
3713     #include <asm/vm86.h> /* struct vm86 */
3714     #include <asm/mmu_context.h> /* vma_pkey() */
3715     +#include <asm/sections.h>
3716    
3717     #define CREATE_TRACE_POINTS
3718     #include <asm/trace/exceptions.h>
3719     diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
3720     index ae9b84cae57c..5d35b555115a 100644
3721     --- a/arch/x86/mm/init.c
3722     +++ b/arch/x86/mm/init.c
3723     @@ -4,6 +4,8 @@
3724     #include <linux/swap.h>
3725     #include <linux/memblock.h>
3726     #include <linux/bootmem.h> /* for max_low_pfn */
3727     +#include <linux/swapfile.h>
3728     +#include <linux/swapops.h>
3729    
3730     #include <asm/cacheflush.h>
3731     #include <asm/e820.h>
3732     @@ -780,3 +782,26 @@ void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
3733     __cachemode2pte_tbl[cache] = __cm_idx2pte(entry);
3734     __pte2cachemode_tbl[entry] = cache;
3735     }
3736     +
3737     +#ifdef CONFIG_SWAP
3738     +unsigned long max_swapfile_size(void)
3739     +{
3740     + unsigned long pages;
3741     +
3742     + pages = generic_max_swapfile_size();
3743     +
3744     + if (boot_cpu_has_bug(X86_BUG_L1TF)) {
3745     + /* Limit the swap file size to MAX_PA/2 for L1TF workaround */
3746     + unsigned long l1tf_limit = l1tf_pfn_limit() + 1;
3747     + /*
3748     + * We encode swap offsets also with 3 bits below those for pfn
3749     + * Swap offsets are also encoded with 3 bits below those used for
3750     + * the pfn, which makes the usable limit higher.
3751     +#if CONFIG_PGTABLE_LEVELS > 2
3752     + l1tf_limit <<= PAGE_SHIFT - SWP_OFFSET_FIRST_BIT;
3753     +#endif
3754     + pages = min_t(unsigned long, l1tf_limit, pages);
3755     + }
3756     + return pages;
3757     +}
3758     +#endif
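To get a feel for the clamp above, here is a hedged back-of-the-envelope sketch. It assumes l1tf_pfn_limit() works out to roughly MAX_PA/2 expressed in 4 KiB pages (its definition is not part of this hunk) and uses the 3 spare offset bits mentioned in the comment; the numbers are illustrative, not authoritative:

	/* Hypothetical arithmetic only; l1tf_pfn_limit() semantics are assumed. */
	#include <stdio.h>

	int main(void)
	{
		unsigned int phys_bits = 46;				/* example CPU */
		unsigned long long half_pa = 1ULL << (phys_bits - 1);
		unsigned long long pfn_limit = half_pa >> 12;		/* in 4 KiB pages */
		unsigned long long swap_pages = pfn_limit << 3;	/* 3 spare offset bits */

		printf("swap cap: %llu pages (~%llu GiB)\n",
		       swap_pages, (swap_pages * 4096ULL) >> 30);
		return 0;
	}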
3759     diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
3760     index ec678aafa3f8..3f729e20f0e3 100644
3761     --- a/arch/x86/mm/kaiser.c
3762     +++ b/arch/x86/mm/kaiser.c
3763     @@ -20,6 +20,7 @@
3764     #include <asm/desc.h>
3765     #include <asm/cmdline.h>
3766     #include <asm/vsyscall.h>
3767     +#include <asm/sections.h>
3768    
3769     int kaiser_enabled __read_mostly = 1;
3770     EXPORT_SYMBOL(kaiser_enabled); /* for inlined TLB flush functions */
3771     diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
3772     index cadb82be5f36..c695272d89be 100644
3773     --- a/arch/x86/mm/kmmio.c
3774     +++ b/arch/x86/mm/kmmio.c
3775     @@ -125,24 +125,29 @@ static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long addr)
3776    
3777     static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old)
3778     {
3779     + pmd_t new_pmd;
3780     pmdval_t v = pmd_val(*pmd);
3781     if (clear) {
3782     - *old = v & _PAGE_PRESENT;
3783     - v &= ~_PAGE_PRESENT;
3784     - } else /* presume this has been called with clear==true previously */
3785     - v |= *old;
3786     - set_pmd(pmd, __pmd(v));
3787     + *old = v;
3788     + new_pmd = pmd_mknotpresent(*pmd);
3789     + } else {
3790     + /* Presume this has been called with clear==true previously */
3791     + new_pmd = __pmd(*old);
3792     + }
3793     + set_pmd(pmd, new_pmd);
3794     }
3795    
3796     static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old)
3797     {
3798     pteval_t v = pte_val(*pte);
3799     if (clear) {
3800     - *old = v & _PAGE_PRESENT;
3801     - v &= ~_PAGE_PRESENT;
3802     - } else /* presume this has been called with clear==true previously */
3803     - v |= *old;
3804     - set_pte_atomic(pte, __pte(v));
3805     + *old = v;
3806     + /* Nothing should care about address */
3807     + pte_clear(&init_mm, 0, pte);
3808     + } else {
3809     + /* Presume this has been called with clear==true previously */
3810     + set_pte_atomic(pte, __pte(*old));
3811     + }
3812     }
3813    
3814     static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
3815     diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
3816     index d2dc0438d654..5aad869fa205 100644
3817     --- a/arch/x86/mm/mmap.c
3818     +++ b/arch/x86/mm/mmap.c
3819     @@ -121,3 +121,24 @@ const char *arch_vma_name(struct vm_area_struct *vma)
3820     return "[mpx]";
3821     return NULL;
3822     }
3823     +
3824     +/*
3825     + * Only allow root to set high MMIO mappings to PROT_NONE.
3826     + * This prevents an unprivileged user from setting them to PROT_NONE and
3827     + * inverting them, then pointing them at valid memory for L1TF speculation.
3828     + *
3829     + * Note: locked-down kernels may want to disable the root override.
3830     + */
3831     +bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot)
3832     +{
3833     + if (!boot_cpu_has_bug(X86_BUG_L1TF))
3834     + return true;
3835     + if (!__pte_needs_invert(pgprot_val(prot)))
3836     + return true;
3837     + /* If it's real memory always allow */
3838     + if (pfn_valid(pfn))
3839     + return true;
3840     + if (pfn > l1tf_pfn_limit() && !capable(CAP_SYS_ADMIN))
3841     + return false;
3842     + return true;
3843     +}
3844     diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
3845     index dcd671467154..1271bc9fa3c6 100644
3846     --- a/arch/x86/mm/pageattr.c
3847     +++ b/arch/x86/mm/pageattr.c
3848     @@ -1001,8 +1001,8 @@ static long populate_pmd(struct cpa_data *cpa,
3849    
3850     pmd = pmd_offset(pud, start);
3851    
3852     - set_pmd(pmd, __pmd(cpa->pfn << PAGE_SHIFT | _PAGE_PSE |
3853     - massage_pgprot(pmd_pgprot)));
3854     + set_pmd(pmd, pmd_mkhuge(pfn_pmd(cpa->pfn,
3855     + canon_pgprot(pmd_pgprot))));
3856    
3857     start += PMD_SIZE;
3858     cpa->pfn += PMD_SIZE >> PAGE_SHIFT;
3859     @@ -1074,8 +1074,8 @@ static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
3860     * Map everything starting from the Gb boundary, possibly with 1G pages
3861     */
3862     while (boot_cpu_has(X86_FEATURE_GBPAGES) && end - start >= PUD_SIZE) {
3863     - set_pud(pud, __pud(cpa->pfn << PAGE_SHIFT | _PAGE_PSE |
3864     - massage_pgprot(pud_pgprot)));
3865     + set_pud(pud, pud_mkhuge(pfn_pud(cpa->pfn,
3866     + canon_pgprot(pud_pgprot))));
3867    
3868     start += PUD_SIZE;
3869     cpa->pfn += PUD_SIZE >> PAGE_SHIFT;
3870     diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
3871     index dcb2d9d185a2..351a55dc4a1d 100644
3872     --- a/arch/x86/platform/efi/efi_64.c
3873     +++ b/arch/x86/platform/efi/efi_64.c
3874     @@ -45,6 +45,7 @@
3875     #include <asm/realmode.h>
3876     #include <asm/time.h>
3877     #include <asm/pgalloc.h>
3878     +#include <asm/sections.h>
3879    
3880     /*
3881     * We allocate runtime services regions bottom-up, starting from -4G, i.e.
3882     diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
3883     index 393a0c0288d1..dee99391d7b2 100644
3884     --- a/arch/x86/platform/efi/quirks.c
3885     +++ b/arch/x86/platform/efi/quirks.c
3886     @@ -13,6 +13,7 @@
3887     #include <linux/dmi.h>
3888     #include <asm/efi.h>
3889     #include <asm/uv/uv.h>
3890     +#include <asm/sections.h>
3891    
3892     #define EFI_MIN_RESERVE 5120
3893    
3894     diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c
3895     index 10bad1e55fcc..85e112ea7aff 100644
3896     --- a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c
3897     +++ b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_wdt.c
3898     @@ -18,6 +18,7 @@
3899     #include <asm/intel-mid.h>
3900     #include <asm/intel_scu_ipc.h>
3901     #include <asm/io_apic.h>
3902     +#include <asm/hw_irq.h>
3903    
3904     #define TANGIER_EXT_TIMER0_MSI 12
3905    
3906     diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
3907     index 0f0175186f1b..16d4967d59ea 100644
3908     --- a/arch/x86/platform/uv/tlb_uv.c
3909     +++ b/arch/x86/platform/uv/tlb_uv.c
3910     @@ -1283,6 +1283,7 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
3911     struct msg_desc msgdesc;
3912    
3913     ack_APIC_irq();
3914     + kvm_set_cpu_l1tf_flush_l1d();
3915     time_start = get_cycles();
3916    
3917     bcp = &per_cpu(bau_control, smp_processor_id());
3918     diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
3919     index 2986a13b9786..db7cf8727e1c 100644
3920     --- a/arch/x86/xen/enlighten.c
3921     +++ b/arch/x86/xen/enlighten.c
3922     @@ -35,6 +35,7 @@
3923     #include <linux/frame.h>
3924    
3925     #include <linux/kexec.h>
3926     +#include <linux/slab.h>
3927    
3928     #include <xen/xen.h>
3929     #include <xen/events.h>
3930     diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
3931     index 9f21b0c5945d..36bfafb2a853 100644
3932     --- a/arch/x86/xen/setup.c
3933     +++ b/arch/x86/xen/setup.c
3934     @@ -18,6 +18,7 @@
3935     #include <asm/setup.h>
3936     #include <asm/acpi.h>
3937     #include <asm/numa.h>
3938     +#include <asm/sections.h>
3939     #include <asm/xen/hypervisor.h>
3940     #include <asm/xen/hypercall.h>
3941    
3942     diff --git a/drivers/acpi/acpi_lpss.c b/drivers/acpi/acpi_lpss.c
3943     index 373657f7e35a..3cdd2c3a5bfc 100644
3944     --- a/drivers/acpi/acpi_lpss.c
3945     +++ b/drivers/acpi/acpi_lpss.c
3946     @@ -187,10 +187,12 @@ static const struct lpss_device_desc lpt_sdio_dev_desc = {
3947    
3948     static const struct lpss_device_desc byt_pwm_dev_desc = {
3949     .flags = LPSS_SAVE_CTX,
3950     + .prv_offset = 0x800,
3951     };
3952    
3953     static const struct lpss_device_desc bsw_pwm_dev_desc = {
3954     .flags = LPSS_SAVE_CTX | LPSS_NO_D3_DELAY,
3955     + .prv_offset = 0x800,
3956     };
3957    
3958     static const struct lpss_device_desc byt_uart_dev_desc = {
3959     diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
3960     index cbb1cc6bbdb4..f1f4ce7ddb47 100644
3961     --- a/drivers/base/cpu.c
3962     +++ b/drivers/base/cpu.c
3963     @@ -525,16 +525,24 @@ ssize_t __weak cpu_show_spec_store_bypass(struct device *dev,
3964     return sprintf(buf, "Not affected\n");
3965     }
3966    
3967     +ssize_t __weak cpu_show_l1tf(struct device *dev,
3968     + struct device_attribute *attr, char *buf)
3969     +{
3970     + return sprintf(buf, "Not affected\n");
3971     +}
3972     +
3973     static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL);
3974     static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL);
3975     static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL);
3976     static DEVICE_ATTR(spec_store_bypass, 0444, cpu_show_spec_store_bypass, NULL);
3977     +static DEVICE_ATTR(l1tf, 0444, cpu_show_l1tf, NULL);
3978    
3979     static struct attribute *cpu_root_vulnerabilities_attrs[] = {
3980     &dev_attr_meltdown.attr,
3981     &dev_attr_spectre_v1.attr,
3982     &dev_attr_spectre_v2.attr,
3983     &dev_attr_spec_store_bypass.attr,
3984     + &dev_attr_l1tf.attr,
3985     NULL
3986     };
3987    
3988     diff --git a/drivers/char/tpm/tpm-dev.c b/drivers/char/tpm/tpm-dev.c
3989     index 65b824954bdc..1662e4688ee2 100644
3990     --- a/drivers/char/tpm/tpm-dev.c
3991     +++ b/drivers/char/tpm/tpm-dev.c
3992     @@ -25,7 +25,7 @@ struct file_priv {
3993     struct tpm_chip *chip;
3994    
3995     /* Data passed to and from the tpm via the read/write calls */
3996     - atomic_t data_pending;
3997     + size_t data_pending;
3998     struct mutex buffer_mutex;
3999    
4000     struct timer_list user_read_timer; /* user needs to claim result */
4001     @@ -46,7 +46,7 @@ static void timeout_work(struct work_struct *work)
4002     struct file_priv *priv = container_of(work, struct file_priv, work);
4003    
4004     mutex_lock(&priv->buffer_mutex);
4005     - atomic_set(&priv->data_pending, 0);
4006     + priv->data_pending = 0;
4007     memset(priv->data_buffer, 0, sizeof(priv->data_buffer));
4008     mutex_unlock(&priv->buffer_mutex);
4009     }
4010     @@ -72,7 +72,6 @@ static int tpm_open(struct inode *inode, struct file *file)
4011     }
4012    
4013     priv->chip = chip;
4014     - atomic_set(&priv->data_pending, 0);
4015     mutex_init(&priv->buffer_mutex);
4016     setup_timer(&priv->user_read_timer, user_reader_timeout,
4017     (unsigned long)priv);
4018     @@ -86,28 +85,24 @@ static ssize_t tpm_read(struct file *file, char __user *buf,
4019     size_t size, loff_t *off)
4020     {
4021     struct file_priv *priv = file->private_data;
4022     - ssize_t ret_size;
4023     + ssize_t ret_size = 0;
4024     int rc;
4025    
4026     del_singleshot_timer_sync(&priv->user_read_timer);
4027     flush_work(&priv->work);
4028     - ret_size = atomic_read(&priv->data_pending);
4029     - if (ret_size > 0) { /* relay data */
4030     - ssize_t orig_ret_size = ret_size;
4031     - if (size < ret_size)
4032     - ret_size = size;
4033     + mutex_lock(&priv->buffer_mutex);
4034    
4035     - mutex_lock(&priv->buffer_mutex);
4036     + if (priv->data_pending) {
4037     + ret_size = min_t(ssize_t, size, priv->data_pending);
4038     rc = copy_to_user(buf, priv->data_buffer, ret_size);
4039     - memset(priv->data_buffer, 0, orig_ret_size);
4040     + memset(priv->data_buffer, 0, priv->data_pending);
4041     if (rc)
4042     ret_size = -EFAULT;
4043    
4044     - mutex_unlock(&priv->buffer_mutex);
4045     + priv->data_pending = 0;
4046     }
4047    
4048     - atomic_set(&priv->data_pending, 0);
4049     -
4050     + mutex_unlock(&priv->buffer_mutex);
4051     return ret_size;
4052     }
4053    
4054     @@ -118,18 +113,20 @@ static ssize_t tpm_write(struct file *file, const char __user *buf,
4055     size_t in_size = size;
4056     ssize_t out_size;
4057    
4058     - /* cannot perform a write until the read has cleared
4059     - either via tpm_read or a user_read_timer timeout.
4060     - This also prevents splitted buffered writes from blocking here.
4061     - */
4062     - if (atomic_read(&priv->data_pending) != 0)
4063     - return -EBUSY;
4064     -
4065     if (in_size > TPM_BUFSIZE)
4066     return -E2BIG;
4067    
4068     mutex_lock(&priv->buffer_mutex);
4069    
4070     + /* Cannot perform a write until the read has cleared either via
4071     + * tpm_read or a user_read_timer timeout. This also prevents split
4072     + * buffered writes from blocking here.
4073     + */
4074     + if (priv->data_pending != 0) {
4075     + mutex_unlock(&priv->buffer_mutex);
4076     + return -EBUSY;
4077     + }
4078     +
4079     if (copy_from_user
4080     (priv->data_buffer, (void __user *) buf, in_size)) {
4081     mutex_unlock(&priv->buffer_mutex);
4082     @@ -159,7 +156,7 @@ static ssize_t tpm_write(struct file *file, const char __user *buf,
4083     return out_size;
4084     }
4085    
4086     - atomic_set(&priv->data_pending, out_size);
4087     + priv->data_pending = out_size;
4088     mutex_unlock(&priv->buffer_mutex);
4089    
4090     /* Set a timeout by which the reader must come claim the result */
4091     @@ -178,7 +175,7 @@ static int tpm_release(struct inode *inode, struct file *file)
4092     del_singleshot_timer_sync(&priv->user_read_timer);
4093     flush_work(&priv->work);
4094     file->private_data = NULL;
4095     - atomic_set(&priv->data_pending, 0);
4096     + priv->data_pending = 0;
4097     clear_bit(0, &priv->chip->is_open);
4098     kfree(priv);
4099     return 0;
4100     diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
4101     index e74aa1d60fdb..99cebf3a9163 100644
4102     --- a/drivers/infiniband/core/umem.c
4103     +++ b/drivers/infiniband/core/umem.c
4104     @@ -122,16 +122,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
4105     umem->address = addr;
4106     umem->page_size = PAGE_SIZE;
4107     umem->pid = get_task_pid(current, PIDTYPE_PID);
4108     - /*
4109     - * We ask for writable memory if any of the following
4110     - * access flags are set. "Local write" and "remote write"
4111     - * obviously require write access. "Remote atomic" can do
4112     - * things like fetch and add, which will modify memory, and
4113     - * "MW bind" can change permissions by binding a window.
4114     - */
4115     - umem->writable = !!(access &
4116     - (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE |
4117     - IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_MW_BIND));
4118     + umem->writable = ib_access_writable(access);
4119    
4120     if (access & IB_ACCESS_ON_DEMAND) {
4121     put_pid(umem->pid);
4122     diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c
4123     index ae41623e0f13..0d4878efd643 100644
4124     --- a/drivers/infiniband/hw/mlx4/mr.c
4125     +++ b/drivers/infiniband/hw/mlx4/mr.c
4126     @@ -131,6 +131,40 @@ out:
4127     return err;
4128     }
4129    
4130     +static struct ib_umem *mlx4_get_umem_mr(struct ib_ucontext *context, u64 start,
4131     + u64 length, u64 virt_addr,
4132     + int access_flags)
4133     +{
4134     + /*
4135     + * Force registering the memory as writable if the underlying pages
4136     + * are writable. This is so rereg can change the access permissions
4137     + * from readable to writable without having to run through ib_umem_get
4138     + * again
4139     + */
4140     + if (!ib_access_writable(access_flags)) {
4141     + struct vm_area_struct *vma;
4142     +
4143     + down_read(&current->mm->mmap_sem);
4144     + /*
4145     + * FIXME: Ideally this would iterate over all the vmas that
4146     + * cover the memory, but for now it requires a single vma to
4147     + * entirely cover the MR to support RO mappings.
4148     + */
4149     + vma = find_vma(current->mm, start);
4150     + if (vma && vma->vm_end >= start + length &&
4151     + vma->vm_start <= start) {
4152     + if (vma->vm_flags & VM_WRITE)
4153     + access_flags |= IB_ACCESS_LOCAL_WRITE;
4154     + } else {
4155     + access_flags |= IB_ACCESS_LOCAL_WRITE;
4156     + }
4157     +
4158     + up_read(&current->mm->mmap_sem);
4159     + }
4160     +
4161     + return ib_umem_get(context, start, length, access_flags, 0);
4162     +}
4163     +
4164     struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
4165     u64 virt_addr, int access_flags,
4166     struct ib_udata *udata)
4167     @@ -145,10 +179,8 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
4168     if (!mr)
4169     return ERR_PTR(-ENOMEM);
4170    
4171     - /* Force registering the memory as writable. */
4172     - /* Used for memory re-registeration. HCA protects the access */
4173     - mr->umem = ib_umem_get(pd->uobject->context, start, length,
4174     - access_flags | IB_ACCESS_LOCAL_WRITE, 0);
4175     + mr->umem = mlx4_get_umem_mr(pd->uobject->context, start, length,
4176     + virt_addr, access_flags);
4177     if (IS_ERR(mr->umem)) {
4178     err = PTR_ERR(mr->umem);
4179     goto err_free;
4180     @@ -215,6 +247,9 @@ int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags,
4181     }
4182    
4183     if (flags & IB_MR_REREG_ACCESS) {
4184     + if (ib_access_writable(mr_access_flags) && !mmr->umem->writable)
4185     + return -EPERM;
4186     +
4187     err = mlx4_mr_hw_change_access(dev->dev, *pmpt_entry,
4188     convert_access(mr_access_flags));
4189    
4190     @@ -228,10 +263,9 @@ int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags,
4191    
4192     mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr);
4193     ib_umem_release(mmr->umem);
4194     - mmr->umem = ib_umem_get(mr->uobject->context, start, length,
4195     - mr_access_flags |
4196     - IB_ACCESS_LOCAL_WRITE,
4197     - 0);
4198     + mmr->umem =
4199     + mlx4_get_umem_mr(mr->uobject->context, start, length,
4200     + virt_addr, mr_access_flags);
4201     if (IS_ERR(mmr->umem)) {
4202     err = PTR_ERR(mmr->umem);
4203     /* Prevent mlx4_ib_dereg_mr from free'ing invalid pointer */
4204     diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_stats.c b/drivers/infiniband/hw/ocrdma/ocrdma_stats.c
4205     index 265943069b35..84349d976162 100644
4206     --- a/drivers/infiniband/hw/ocrdma/ocrdma_stats.c
4207     +++ b/drivers/infiniband/hw/ocrdma/ocrdma_stats.c
4208     @@ -645,7 +645,7 @@ static ssize_t ocrdma_dbgfs_ops_write(struct file *filp,
4209     struct ocrdma_stats *pstats = filp->private_data;
4210     struct ocrdma_dev *dev = pstats->dev;
4211    
4212     - if (count > 32)
4213     + if (*ppos != 0 || count == 0 || count > sizeof(tmp_str))
4214     goto err;
4215    
4216     if (copy_from_user(tmp_str, buffer, count))
4217     diff --git a/drivers/mtd/nand/qcom_nandc.c b/drivers/mtd/nand/qcom_nandc.c
4218     index 6f0fd1512ad2..dc4943134649 100644
4219     --- a/drivers/mtd/nand/qcom_nandc.c
4220     +++ b/drivers/mtd/nand/qcom_nandc.c
4221     @@ -2008,6 +2008,9 @@ static int qcom_nand_host_init(struct qcom_nand_controller *nandc,
4222    
4223     nand_set_flash_node(chip, dn);
4224     mtd->name = devm_kasprintf(dev, GFP_KERNEL, "qcom_nand.%d", host->cs);
4225     + if (!mtd->name)
4226     + return -ENOMEM;
4227     +
4228     mtd->owner = THIS_MODULE;
4229     mtd->dev.parent = dev;
4230    
4231     diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
4232     index 681256f97cb3..cd2c6ffdbdde 100644
4233     --- a/drivers/net/xen-netfront.c
4234     +++ b/drivers/net/xen-netfront.c
4235     @@ -893,7 +893,6 @@ static RING_IDX xennet_fill_frags(struct netfront_queue *queue,
4236     struct sk_buff *skb,
4237     struct sk_buff_head *list)
4238     {
4239     - struct skb_shared_info *shinfo = skb_shinfo(skb);
4240     RING_IDX cons = queue->rx.rsp_cons;
4241     struct sk_buff *nskb;
4242    
4243     @@ -902,15 +901,16 @@ static RING_IDX xennet_fill_frags(struct netfront_queue *queue,
4244     RING_GET_RESPONSE(&queue->rx, ++cons);
4245     skb_frag_t *nfrag = &skb_shinfo(nskb)->frags[0];
4246    
4247     - if (shinfo->nr_frags == MAX_SKB_FRAGS) {
4248     + if (skb_shinfo(skb)->nr_frags == MAX_SKB_FRAGS) {
4249     unsigned int pull_to = NETFRONT_SKB_CB(skb)->pull_to;
4250    
4251     BUG_ON(pull_to <= skb_headlen(skb));
4252     __pskb_pull_tail(skb, pull_to - skb_headlen(skb));
4253     }
4254     - BUG_ON(shinfo->nr_frags >= MAX_SKB_FRAGS);
4255     + BUG_ON(skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS);
4256    
4257     - skb_add_rx_frag(skb, shinfo->nr_frags, skb_frag_page(nfrag),
4258     + skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
4259     + skb_frag_page(nfrag),
4260     rx->offset, rx->status, PAGE_SIZE);
4261    
4262     skb_shinfo(nskb)->nr_frags = 0;
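
The xennet_fill_frags() change stops caching skb_shinfo(skb) across __pskb_pull_tail(), which can rearrange the skb and leave the cached shared-info pointer stale; re-evaluating skb_shinfo(skb) after the pull avoids that. The hazard is the familiar stale-pointer-after-reallocation pattern, sketched below in user space with realloc() standing in for the reallocating helper (purely an analogy, not the netfront code):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    int main(void)
    {
        char *buf = malloc(16);
        if (!buf)
            return 1;
        strcpy(buf, "frags");

        char *cached = buf;            /* like caching skb_shinfo(skb) */
        buf = realloc(buf, 1 << 20);   /* the helper may move the data */
        if (!buf)
            return 1;

        (void)cached;  /* do NOT dereference: it may dangle after realloc() */

        /* Always re-derive the pointer after the call that can reallocate,
         * just as the patch re-evaluates skb_shinfo(skb). */
        printf("%s\n", buf);
        free(buf);
        return 0;
    }
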
4263     diff --git a/drivers/pci/host/pci-hyperv.c b/drivers/pci/host/pci-hyperv.c
4264     index d392a55ec0a9..b4d8ccfd9f7c 100644
4265     --- a/drivers/pci/host/pci-hyperv.c
4266     +++ b/drivers/pci/host/pci-hyperv.c
4267     @@ -52,6 +52,8 @@
4268     #include <linux/pci.h>
4269     #include <linux/semaphore.h>
4270     #include <linux/irqdomain.h>
4271     +#include <linux/irq.h>
4272     +
4273     #include <asm/irqdomain.h>
4274     #include <asm/apic.h>
4275     #include <linux/msi.h>
4276     diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
4277     index 01699845c42c..cc484cb287d2 100644
4278     --- a/drivers/scsi/sr.c
4279     +++ b/drivers/scsi/sr.c
4280     @@ -520,18 +520,26 @@ static int sr_init_command(struct scsi_cmnd *SCpnt)
4281     static int sr_block_open(struct block_device *bdev, fmode_t mode)
4282     {
4283     struct scsi_cd *cd;
4284     + struct scsi_device *sdev;
4285     int ret = -ENXIO;
4286    
4287     + cd = scsi_cd_get(bdev->bd_disk);
4288     + if (!cd)
4289     + goto out;
4290     +
4291     + sdev = cd->device;
4292     + scsi_autopm_get_device(sdev);
4293     check_disk_change(bdev);
4294    
4295     mutex_lock(&sr_mutex);
4296     - cd = scsi_cd_get(bdev->bd_disk);
4297     - if (cd) {
4298     - ret = cdrom_open(&cd->cdi, bdev, mode);
4299     - if (ret)
4300     - scsi_cd_put(cd);
4301     - }
4302     + ret = cdrom_open(&cd->cdi, bdev, mode);
4303     mutex_unlock(&sr_mutex);
4304     +
4305     + scsi_autopm_put_device(sdev);
4306     + if (ret)
4307     + scsi_cd_put(cd);
4308     +
4309     +out:
4310     return ret;
4311     }
4312    
4313     @@ -559,6 +567,8 @@ static int sr_block_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
4314     if (ret)
4315     goto out;
4316    
4317     + scsi_autopm_get_device(sdev);
4318     +
4319     /*
4320     * Send SCSI addressing ioctls directly to mid level, send other
4321     * ioctls to cdrom/block level.
4322     @@ -567,15 +577,18 @@ static int sr_block_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
4323     case SCSI_IOCTL_GET_IDLUN:
4324     case SCSI_IOCTL_GET_BUS_NUMBER:
4325     ret = scsi_ioctl(sdev, cmd, argp);
4326     - goto out;
4327     + goto put;
4328     }
4329    
4330     ret = cdrom_ioctl(&cd->cdi, bdev, mode, cmd, arg);
4331     if (ret != -ENOSYS)
4332     - goto out;
4333     + goto put;
4334    
4335     ret = scsi_ioctl(sdev, cmd, argp);
4336    
4337     +put:
4338     + scsi_autopm_put_device(sdev);
4339     +
4340     out:
4341     mutex_unlock(&sr_mutex);
4342     return ret;
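
The sr_block_open()/sr_block_ioctl() rework takes a runtime-PM reference with scsi_autopm_get_device() before touching the drive and funnels every exit path through code that drops it again, using goto labels so each acquired resource is released exactly once. A generic user-space sketch of that acquire/label/release shape (the helpers are hypothetical stand-ins, not the SCSI API):

    #include <stdio.h>
    #include <stdlib.h>

    /* Hypothetical stand-ins for scsi_cd_get() and scsi_autopm_get/put_device(). */
    static int *get_ref(void)      { return malloc(sizeof(int)); }
    static void put_ref(int *r)    { free(r); }
    static void wake_device(void)  { puts("pm get"); }
    static void sleep_device(void) { puts("pm put"); }

    /* Same shape as the reworked sr_block_ioctl(): take the PM reference up
     * front, then make every exit path run through the label that drops it. */
    static int do_ioctl(int cmd)
    {
        int ret;
        int *cd = get_ref();

        if (!cd)
            return -1;

        wake_device();

        if (cmd == 0) {          /* "handled directly", like SCSI_IOCTL_GET_IDLUN */
            ret = 0;
            goto put;
        }
        ret = 42;                /* fall-through handling */
    put:
        sleep_device();          /* every path drops the PM reference exactly once */
        put_ref(cd);
        return ret;
    }

    int main(void)
    {
        printf("%d\n", do_ioctl(0));
        printf("%d\n", do_ioctl(1));
        return 0;
    }
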
4343     diff --git a/fs/dcache.c b/fs/dcache.c
4344     index 7a5e6f9717f5..461ff8f234e3 100644
4345     --- a/fs/dcache.c
4346     +++ b/fs/dcache.c
4347     @@ -352,14 +352,11 @@ static void dentry_unlink_inode(struct dentry * dentry)
4348     __releases(dentry->d_inode->i_lock)
4349     {
4350     struct inode *inode = dentry->d_inode;
4351     - bool hashed = !d_unhashed(dentry);
4352    
4353     - if (hashed)
4354     - raw_write_seqcount_begin(&dentry->d_seq);
4355     + raw_write_seqcount_begin(&dentry->d_seq);
4356     __d_clear_type_and_inode(dentry);
4357     hlist_del_init(&dentry->d_u.d_alias);
4358     - if (hashed)
4359     - raw_write_seqcount_end(&dentry->d_seq);
4360     + raw_write_seqcount_end(&dentry->d_seq);
4361     spin_unlock(&dentry->d_lock);
4362     spin_unlock(&inode->i_lock);
4363     if (!inode->i_nlink)
4364     @@ -1914,10 +1911,12 @@ struct dentry *d_make_root(struct inode *root_inode)
4365    
4366     if (root_inode) {
4367     res = __d_alloc(root_inode->i_sb, NULL);
4368     - if (res)
4369     + if (res) {
4370     + res->d_flags |= DCACHE_RCUACCESS;
4371     d_instantiate(res, root_inode);
4372     - else
4373     + } else {
4374     iput(root_inode);
4375     + }
4376     }
4377     return res;
4378     }
4379     diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
4380     index ffaf66a51de3..4f78e099de1d 100644
4381     --- a/fs/ext4/ialloc.c
4382     +++ b/fs/ext4/ialloc.c
4383     @@ -1316,7 +1316,10 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
4384     ext4_itable_unused_count(sb, gdp)),
4385     sbi->s_inodes_per_block);
4386    
4387     - if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) {
4388     + if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group) ||
4389     + ((group == 0) && ((EXT4_INODES_PER_GROUP(sb) -
4390     + ext4_itable_unused_count(sb, gdp)) <
4391     + EXT4_FIRST_INO(sb)))) {
4392     ext4_error(sb, "Something is wrong with group %u: "
4393     "used itable blocks: %d; "
4394     "itable unused count: %u",
4395     diff --git a/fs/ext4/super.c b/fs/ext4/super.c
4396     index 6cbb0f7ead2f..9d44b3683b46 100644
4397     --- a/fs/ext4/super.c
4398     +++ b/fs/ext4/super.c
4399     @@ -3031,14 +3031,8 @@ static ext4_group_t ext4_has_uninit_itable(struct super_block *sb)
4400     if (!gdp)
4401     continue;
4402    
4403     - if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))
4404     - continue;
4405     - if (group != 0)
4406     + if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
4407     break;
4408     - ext4_error(sb, "Inode table for bg 0 marked as "
4409     - "needing zeroing");
4410     - if (sb->s_flags & MS_RDONLY)
4411     - return ngroups;
4412     }
4413    
4414     return group;
4415     diff --git a/fs/namespace.c b/fs/namespace.c
4416     index 6c873b330a93..0a9e766b4087 100644
4417     --- a/fs/namespace.c
4418     +++ b/fs/namespace.c
4419     @@ -603,12 +603,21 @@ int __legitimize_mnt(struct vfsmount *bastard, unsigned seq)
4420     return 0;
4421     mnt = real_mount(bastard);
4422     mnt_add_count(mnt, 1);
4423     + smp_mb(); // see mntput_no_expire()
4424     if (likely(!read_seqretry(&mount_lock, seq)))
4425     return 0;
4426     if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
4427     mnt_add_count(mnt, -1);
4428     return 1;
4429     }
4430     + lock_mount_hash();
4431     + if (unlikely(bastard->mnt_flags & MNT_DOOMED)) {
4432     + mnt_add_count(mnt, -1);
4433     + unlock_mount_hash();
4434     + return 1;
4435     + }
4436     + unlock_mount_hash();
4437     + /* caller will mntput() */
4438     return -1;
4439     }
4440    
4441     @@ -1139,12 +1148,27 @@ static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput);
4442     static void mntput_no_expire(struct mount *mnt)
4443     {
4444     rcu_read_lock();
4445     - mnt_add_count(mnt, -1);
4446     - if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */
4447     + if (likely(READ_ONCE(mnt->mnt_ns))) {
4448     + /*
4449     + * Since we don't do lock_mount_hash() here,
4450     + * ->mnt_ns can change under us. However, if it's
4451     + * non-NULL, then there's a reference that won't
4452     + * be dropped until after an RCU delay done after
4453     + * turning ->mnt_ns NULL. So if we observe it
4454     + * non-NULL under rcu_read_lock(), the reference
4455     + * we are dropping is not the final one.
4456     + */
4457     + mnt_add_count(mnt, -1);
4458     rcu_read_unlock();
4459     return;
4460     }
4461     lock_mount_hash();
4462     + /*
4463     + * make sure that if __legitimize_mnt() has not seen us grab
4464     + * mount_lock, we'll see their refcount increment here.
4465     + */
4466     + smp_mb();
4467     + mnt_add_count(mnt, -1);
4468     if (mnt_get_count(mnt)) {
4469     rcu_read_unlock();
4470     unlock_mount_hash();
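
The comments added to __legitimize_mnt() and mntput_no_expire() describe a classic barrier pairing: one side increments the count and then checks mount_lock, the other takes mount_lock and then re-reads the count, with smp_mb() on both sides so at least one of them is guaranteed to observe the other. A small C11 user-space analogue of that pairing, with atomic_thread_fence() standing in for smp_mb() (a sketch of the idea only, not the mount code):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int count = 0;   /* mnt refcount analogue */
    static atomic_int locked = 0;  /* "mount_lock was taken" analogue */
    static int saw_lock, saw_ref;

    static void *legitimize(void *arg)
    {
        (void)arg;
        atomic_fetch_add_explicit(&count, 1, memory_order_relaxed);
        atomic_thread_fence(memory_order_seq_cst);      /* smp_mb() in __legitimize_mnt() */
        saw_lock = atomic_load_explicit(&locked, memory_order_relaxed);
        return NULL;
    }

    static void *put_side(void *arg)
    {
        (void)arg;
        atomic_store_explicit(&locked, 1, memory_order_relaxed);
        atomic_thread_fence(memory_order_seq_cst);      /* smp_mb() in mntput_no_expire() */
        saw_ref = atomic_load_explicit(&count, memory_order_relaxed);
        return NULL;
    }

    int main(void)
    {
        pthread_t a, b;

        pthread_create(&a, NULL, legitimize, NULL);
        pthread_create(&b, NULL, put_side, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);

        /* With both fences in place, the legitimizer missing the lock AND
         * the put side missing the new reference cannot both happen. */
        printf("saw_lock=%d saw_ref=%d\n", saw_lock, saw_ref);
        return !(saw_lock || saw_ref);
    }
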
4471     diff --git a/fs/proc/inode.c b/fs/proc/inode.c
4472     index e69ebe648a34..c2afe39f0b9e 100644
4473     --- a/fs/proc/inode.c
4474     +++ b/fs/proc/inode.c
4475     @@ -43,10 +43,11 @@ static void proc_evict_inode(struct inode *inode)
4476     de = PDE(inode);
4477     if (de)
4478     pde_put(de);
4479     +
4480     head = PROC_I(inode)->sysctl;
4481     if (head) {
4482     RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL);
4483     - sysctl_head_put(head);
4484     + proc_sys_evict_inode(inode, head);
4485     }
4486     }
4487    
4488     diff --git a/fs/proc/internal.h b/fs/proc/internal.h
4489     index 5378441ec1b7..c0bdeceaaeb6 100644
4490     --- a/fs/proc/internal.h
4491     +++ b/fs/proc/internal.h
4492     @@ -65,6 +65,7 @@ struct proc_inode {
4493     struct proc_dir_entry *pde;
4494     struct ctl_table_header *sysctl;
4495     struct ctl_table *sysctl_entry;
4496     + struct hlist_node sysctl_inodes;
4497     const struct proc_ns_operations *ns_ops;
4498     struct inode vfs_inode;
4499     };
4500     @@ -249,10 +250,12 @@ extern void proc_thread_self_init(void);
4501     */
4502     #ifdef CONFIG_PROC_SYSCTL
4503     extern int proc_sys_init(void);
4504     -extern void sysctl_head_put(struct ctl_table_header *);
4505     +extern void proc_sys_evict_inode(struct inode *inode,
4506     + struct ctl_table_header *head);
4507     #else
4508     static inline void proc_sys_init(void) { }
4509     -static inline void sysctl_head_put(struct ctl_table_header *head) { }
4510     +static inline void proc_sys_evict_inode(struct inode *inode,
4511     + struct ctl_table_header *head) { }
4512     #endif
4513    
4514     /*
4515     diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
4516     index 847f23420b40..46cd2e1b055b 100644
4517     --- a/fs/proc/proc_sysctl.c
4518     +++ b/fs/proc/proc_sysctl.c
4519     @@ -190,6 +190,7 @@ static void init_header(struct ctl_table_header *head,
4520     head->set = set;
4521     head->parent = NULL;
4522     head->node = node;
4523     + INIT_HLIST_HEAD(&head->inodes);
4524     if (node) {
4525     struct ctl_table *entry;
4526     for (entry = table; entry->procname; entry++, node++)
4527     @@ -259,6 +260,44 @@ static void unuse_table(struct ctl_table_header *p)
4528     complete(p->unregistering);
4529     }
4530    
4531     +static void proc_sys_prune_dcache(struct ctl_table_header *head)
4532     +{
4533     + struct inode *inode;
4534     + struct proc_inode *ei;
4535     + struct hlist_node *node;
4536     + struct super_block *sb;
4537     +
4538     + rcu_read_lock();
4539     + for (;;) {
4540     + node = hlist_first_rcu(&head->inodes);
4541     + if (!node)
4542     + break;
4543     + ei = hlist_entry(node, struct proc_inode, sysctl_inodes);
4544     + spin_lock(&sysctl_lock);
4545     + hlist_del_init_rcu(&ei->sysctl_inodes);
4546     + spin_unlock(&sysctl_lock);
4547     +
4548     + inode = &ei->vfs_inode;
4549     + sb = inode->i_sb;
4550     + if (!atomic_inc_not_zero(&sb->s_active))
4551     + continue;
4552     + inode = igrab(inode);
4553     + rcu_read_unlock();
4554     + if (unlikely(!inode)) {
4555     + deactivate_super(sb);
4556     + rcu_read_lock();
4557     + continue;
4558     + }
4559     +
4560     + d_prune_aliases(inode);
4561     + iput(inode);
4562     + deactivate_super(sb);
4563     +
4564     + rcu_read_lock();
4565     + }
4566     + rcu_read_unlock();
4567     +}
4568     +
4569     /* called under sysctl_lock, will reacquire if has to wait */
4570     static void start_unregistering(struct ctl_table_header *p)
4571     {
4572     @@ -272,31 +311,22 @@ static void start_unregistering(struct ctl_table_header *p)
4573     p->unregistering = &wait;
4574     spin_unlock(&sysctl_lock);
4575     wait_for_completion(&wait);
4576     - spin_lock(&sysctl_lock);
4577     } else {
4578     /* anything non-NULL; we'll never dereference it */
4579     p->unregistering = ERR_PTR(-EINVAL);
4580     + spin_unlock(&sysctl_lock);
4581     }
4582     + /*
4583     + * Prune dentries for unregistered sysctls: namespaced sysctls
4584     + * can have duplicate names and contaminate dcache very badly.
4585     + */
4586     + proc_sys_prune_dcache(p);
4587     /*
4588     * do not remove from the list until nobody holds it; walking the
4589     * list in do_sysctl() relies on that.
4590     */
4591     - erase_header(p);
4592     -}
4593     -
4594     -static void sysctl_head_get(struct ctl_table_header *head)
4595     -{
4596     spin_lock(&sysctl_lock);
4597     - head->count++;
4598     - spin_unlock(&sysctl_lock);
4599     -}
4600     -
4601     -void sysctl_head_put(struct ctl_table_header *head)
4602     -{
4603     - spin_lock(&sysctl_lock);
4604     - if (!--head->count)
4605     - kfree_rcu(head, rcu);
4606     - spin_unlock(&sysctl_lock);
4607     + erase_header(p);
4608     }
4609    
4610     static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
4611     @@ -440,10 +470,20 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
4612    
4613     inode->i_ino = get_next_ino();
4614    
4615     - sysctl_head_get(head);
4616     ei = PROC_I(inode);
4617     +
4618     + spin_lock(&sysctl_lock);
4619     + if (unlikely(head->unregistering)) {
4620     + spin_unlock(&sysctl_lock);
4621     + iput(inode);
4622     + inode = NULL;
4623     + goto out;
4624     + }
4625     ei->sysctl = head;
4626     ei->sysctl_entry = table;
4627     + hlist_add_head_rcu(&ei->sysctl_inodes, &head->inodes);
4628     + head->count++;
4629     + spin_unlock(&sysctl_lock);
4630    
4631     inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
4632     inode->i_mode = table->mode;
4633     @@ -466,6 +506,15 @@ out:
4634     return inode;
4635     }
4636    
4637     +void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head)
4638     +{
4639     + spin_lock(&sysctl_lock);
4640     + hlist_del_init_rcu(&PROC_I(inode)->sysctl_inodes);
4641     + if (!--head->count)
4642     + kfree_rcu(head, rcu);
4643     + spin_unlock(&sysctl_lock);
4644     +}
4645     +
4646     static struct ctl_table_header *grab_header(struct inode *inode)
4647     {
4648     struct ctl_table_header *head = PROC_I(inode)->sysctl;
4649     diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
4650     index 4e8551c8ef18..a88ea9e37a25 100644
4651     --- a/include/asm-generic/pgtable.h
4652     +++ b/include/asm-generic/pgtable.h
4653     @@ -828,6 +828,19 @@ static inline int pmd_free_pte_page(pmd_t *pmd)
4654     struct file;
4655     int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
4656     unsigned long size, pgprot_t *vma_prot);
4657     +
4658     +#ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED
4659     +static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot)
4660     +{
4661     + return true;
4662     +}
4663     +
4664     +static inline bool arch_has_pfn_modify_check(void)
4665     +{
4666     + return false;
4667     +}
4668     +#endif /* !_HAVE_ARCH_PFN_MODIFY_ALLOWED */
4669     +
4670     #endif /* !__ASSEMBLY__ */
4671    
4672     #ifndef io_remap_pfn_range
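
The asm-generic/pgtable.h hunk supplies permissive defaults, pfn_modify_allowed() returning true and arch_has_pfn_modify_check() returning false, which an architecture replaces by defining __HAVE_ARCH_PFN_MODIFY_ALLOWED; the callers added later in this patch only pay for the check when an architecture opts in. A compact user-space sketch of the same generic-default/arch-override arrangement (the macro and the policy below are made up for illustration):

    #include <stdbool.h>
    #include <stdio.h>

    /* Pretend "arch" opt-in: set to 0 to fall back to the generic stubs. */
    #define HAVE_ARCH_PFN_MODIFY_ALLOWED 1

    #if HAVE_ARCH_PFN_MODIFY_ALLOWED
    static bool pfn_modify_allowed(unsigned long pfn)
    {
        return pfn < 0x100000;          /* arch-specific policy (made up) */
    }
    static bool arch_has_pfn_modify_check(void) { return true; }
    #else
    /* Generic fallbacks, mirroring asm-generic/pgtable.h. */
    static bool pfn_modify_allowed(unsigned long pfn) { (void)pfn; return true; }
    static bool arch_has_pfn_modify_check(void)       { return false; }
    #endif

    int main(void)
    {
        if (arch_has_pfn_modify_check())
            printf("pfn 0x200000 allowed: %d\n", pfn_modify_allowed(0x200000));
        else
            printf("no arch check, everything allowed\n");
        return 0;
    }
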
4673     diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h
4674     index 01225b0059b1..21c88a7ac23b 100644
4675     --- a/include/linux/compiler-clang.h
4676     +++ b/include/linux/compiler-clang.h
4677     @@ -16,6 +16,9 @@
4678     */
4679     #define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__)
4680    
4681     +#undef __no_sanitize_address
4682     +#define __no_sanitize_address __attribute__((no_sanitize("address")))
4683     +
4684     /* Clang doesn't have a way to turn it off per-function, yet. */
4685     #ifdef __noretpoline
4686     #undef __noretpoline
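
The compiler-clang.h hunk maps __no_sanitize_address onto clang's __attribute__((no_sanitize("address"))) so individual functions can be left out of KASAN instrumentation. The same attribute can be tried directly from user space with clang -fsanitize=address; a minimal sketch:

    /* Build with: clang -fsanitize=address -O1 demo.c */
    #include <stdio.h>

    #define __no_sanitize_address __attribute__((no_sanitize("address")))

    /* This function is compiled without ASan instrumentation... */
    __no_sanitize_address
    static int sum(const int *p, int n)
    {
        int s = 0;
        for (int i = 0; i < n; i++)
            s += p[i];
        return s;
    }

    int main(void)
    {
        int v[4] = { 1, 2, 3, 4 };

        /* ...so memory accesses inside it are not checked by the sanitizer
         * runtime, while the rest of the program still is. */
        printf("%d\n", sum(v, 4));
        return 0;
    }
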
4687     diff --git a/include/linux/cpu.h b/include/linux/cpu.h
4688     index 917829b27350..ae5ac89324df 100644
4689     --- a/include/linux/cpu.h
4690     +++ b/include/linux/cpu.h
4691     @@ -29,7 +29,7 @@ struct cpu {
4692     };
4693    
4694     extern void boot_cpu_init(void);
4695     -extern void boot_cpu_state_init(void);
4696     +extern void boot_cpu_hotplug_init(void);
4697    
4698     extern int register_cpu(struct cpu *cpu, int num);
4699     extern struct device *get_cpu_device(unsigned cpu);
4700     @@ -52,6 +52,8 @@ extern ssize_t cpu_show_spectre_v2(struct device *dev,
4701     struct device_attribute *attr, char *buf);
4702     extern ssize_t cpu_show_spec_store_bypass(struct device *dev,
4703     struct device_attribute *attr, char *buf);
4704     +extern ssize_t cpu_show_l1tf(struct device *dev,
4705     + struct device_attribute *attr, char *buf);
4706    
4707     extern __printf(4, 5)
4708     struct device *cpu_device_create(struct device *parent, void *drvdata,
4709     @@ -255,4 +257,23 @@ void cpuhp_report_idle_dead(void);
4710     static inline void cpuhp_report_idle_dead(void) { }
4711     #endif /* #ifdef CONFIG_HOTPLUG_CPU */
4712    
4713     +enum cpuhp_smt_control {
4714     + CPU_SMT_ENABLED,
4715     + CPU_SMT_DISABLED,
4716     + CPU_SMT_FORCE_DISABLED,
4717     + CPU_SMT_NOT_SUPPORTED,
4718     +};
4719     +
4720     +#if defined(CONFIG_SMP) && defined(CONFIG_HOTPLUG_SMT)
4721     +extern enum cpuhp_smt_control cpu_smt_control;
4722     +extern void cpu_smt_disable(bool force);
4723     +extern void cpu_smt_check_topology_early(void);
4724     +extern void cpu_smt_check_topology(void);
4725     +#else
4726     +# define cpu_smt_control (CPU_SMT_ENABLED)
4727     +static inline void cpu_smt_disable(bool force) { }
4728     +static inline void cpu_smt_check_topology_early(void) { }
4729     +static inline void cpu_smt_check_topology(void) { }
4730     +#endif
4731     +
4732     #endif /* _LINUX_CPU_H_ */
4733     diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h
4734     index 388293a91e8c..e4594de79bc4 100644
4735     --- a/include/linux/swapfile.h
4736     +++ b/include/linux/swapfile.h
4737     @@ -9,5 +9,7 @@ extern spinlock_t swap_lock;
4738     extern struct plist_head swap_active_head;
4739     extern struct swap_info_struct *swap_info[];
4740     extern int try_to_unuse(unsigned int, bool, unsigned long);
4741     +extern unsigned long generic_max_swapfile_size(void);
4742     +extern unsigned long max_swapfile_size(void);
4743    
4744     #endif /* _LINUX_SWAPFILE_H */
4745     diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
4746     index adf4e51cf597..0e5cc33b9b25 100644
4747     --- a/include/linux/sysctl.h
4748     +++ b/include/linux/sysctl.h
4749     @@ -143,6 +143,7 @@ struct ctl_table_header
4750     struct ctl_table_set *set;
4751     struct ctl_dir *parent;
4752     struct ctl_node *node;
4753     + struct hlist_head inodes; /* head for proc_inode->sysctl_inodes */
4754     };
4755    
4756     struct ctl_dir {
4757     diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
4758     index 5ad43a487745..a42535f252b5 100644
4759     --- a/include/rdma/ib_verbs.h
4760     +++ b/include/rdma/ib_verbs.h
4761     @@ -3308,6 +3308,20 @@ static inline int ib_check_mr_access(int flags)
4762     return 0;
4763     }
4764    
4765     +static inline bool ib_access_writable(int access_flags)
4766     +{
4767     + /*
4768     + * We have writable memory backing the MR if any of the following
4769     + * access flags are set. "Local write" and "remote write" obviously
4770     + * require write access. "Remote atomic" can do things like fetch and
4771     + * add, which will modify memory, and "MW bind" can change permissions
4772     + * by binding a window.
4773     + */
4774     + return access_flags &
4775     + (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE |
4776     + IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_MW_BIND);
4777     +}
4778     +
4779     /**
4780     * ib_check_mr_status: lightweight check of MR status.
4781     * This routine may provide status checks on a selected
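
ib_access_writable() simply asks whether any flag in the set implies the registered memory may be modified. A trivial user-space restatement of that mask test, with illustrative flag values (the real IB_ACCESS_* constants come from the verbs headers):

    #include <stdbool.h>
    #include <stdio.h>

    /* Illustrative values; the real constants live in the IB verbs headers. */
    #define ACCESS_LOCAL_WRITE   (1 << 0)
    #define ACCESS_REMOTE_WRITE  (1 << 1)
    #define ACCESS_REMOTE_READ   (1 << 2)
    #define ACCESS_REMOTE_ATOMIC (1 << 3)
    #define ACCESS_MW_BIND       (1 << 4)

    /* Mirrors ib_access_writable(): any flag that can modify memory counts. */
    static bool access_writable(int flags)
    {
        return flags & (ACCESS_LOCAL_WRITE | ACCESS_REMOTE_WRITE |
                        ACCESS_REMOTE_ATOMIC | ACCESS_MW_BIND);
    }

    int main(void)
    {
        printf("%d\n", access_writable(ACCESS_REMOTE_READ));                   /* 0 */
        printf("%d\n", access_writable(ACCESS_REMOTE_READ | ACCESS_MW_BIND));  /* 1 */
        return 0;
    }
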
4782     diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
4783     index 05b9bb63dbec..a0a365cbf3c9 100644
4784     --- a/include/uapi/linux/kvm.h
4785     +++ b/include/uapi/linux/kvm.h
4786     @@ -717,6 +717,7 @@ struct kvm_ppc_smmu_info {
4787     #define KVM_TRACE_PAUSE __KVM_DEPRECATED_MAIN_0x07
4788     #define KVM_TRACE_DISABLE __KVM_DEPRECATED_MAIN_0x08
4789     #define KVM_GET_EMULATED_CPUID _IOWR(KVMIO, 0x09, struct kvm_cpuid2)
4790     +#define KVM_GET_MSR_FEATURE_INDEX_LIST _IOWR(KVMIO, 0x0a, struct kvm_msr_list)
4791    
4792     /*
4793     * Extension capability list.
4794     @@ -871,6 +872,7 @@ struct kvm_ppc_smmu_info {
4795     #define KVM_CAP_MSI_DEVID 131
4796     #define KVM_CAP_PPC_HTM 132
4797     #define KVM_CAP_S390_BPB 152
4798     +#define KVM_CAP_GET_MSR_FEATURES 153
4799    
4800     #ifdef KVM_CAP_IRQ_ROUTING
4801    
4802     diff --git a/init/main.c b/init/main.c
4803     index f22957afb37e..4313772d634a 100644
4804     --- a/init/main.c
4805     +++ b/init/main.c
4806     @@ -509,8 +509,8 @@ asmlinkage __visible void __init start_kernel(void)
4807     setup_command_line(command_line);
4808     setup_nr_cpu_ids();
4809     setup_per_cpu_areas();
4810     - boot_cpu_state_init();
4811     smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
4812     + boot_cpu_hotplug_init();
4813    
4814     build_all_zonelists(NULL, NULL);
4815     page_alloc_init();
4816     diff --git a/kernel/cpu.c b/kernel/cpu.c
4817     index 967163fb90a8..b5a0165b7300 100644
4818     --- a/kernel/cpu.c
4819     +++ b/kernel/cpu.c
4820     @@ -54,6 +54,7 @@ struct cpuhp_cpu_state {
4821     bool rollback;
4822     bool single;
4823     bool bringup;
4824     + bool booted_once;
4825     struct hlist_node *node;
4826     enum cpuhp_state cb_state;
4827     int result;
4828     @@ -355,6 +356,85 @@ void cpu_hotplug_enable(void)
4829     EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
4830     #endif /* CONFIG_HOTPLUG_CPU */
4831    
4832     +#ifdef CONFIG_HOTPLUG_SMT
4833     +enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED;
4834     +EXPORT_SYMBOL_GPL(cpu_smt_control);
4835     +
4836     +static bool cpu_smt_available __read_mostly;
4837     +
4838     +void __init cpu_smt_disable(bool force)
4839     +{
4840     + if (cpu_smt_control == CPU_SMT_FORCE_DISABLED ||
4841     + cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
4842     + return;
4843     +
4844     + if (force) {
4845     + pr_info("SMT: Force disabled\n");
4846     + cpu_smt_control = CPU_SMT_FORCE_DISABLED;
4847     + } else {
4848     + cpu_smt_control = CPU_SMT_DISABLED;
4849     + }
4850     +}
4851     +
4852     +/*
4853     + * The decision whether SMT is supported can only be done after the full
4854     + * CPU identification. Called from architecture code before non boot CPUs
4855     + * are brought up.
4856     + */
4857     +void __init cpu_smt_check_topology_early(void)
4858     +{
4859     + if (!topology_smt_supported())
4860     + cpu_smt_control = CPU_SMT_NOT_SUPPORTED;
4861     +}
4862     +
4863     +/*
4864     + * If SMT was disabled by BIOS, detect it here, after the CPUs have been
4865     + * brought online. This ensures the smt/l1tf sysfs entries are consistent
4866     + * with reality. cpu_smt_available is set to true during the bringup of non
4867     + * boot CPUs when a SMT sibling is detected. Note, this may overwrite
4868     + * cpu_smt_control's previous setting.
4869     + */
4870     +void __init cpu_smt_check_topology(void)
4871     +{
4872     + if (!cpu_smt_available)
4873     + cpu_smt_control = CPU_SMT_NOT_SUPPORTED;
4874     +}
4875     +
4876     +static int __init smt_cmdline_disable(char *str)
4877     +{
4878     + cpu_smt_disable(str && !strcmp(str, "force"));
4879     + return 0;
4880     +}
4881     +early_param("nosmt", smt_cmdline_disable);
4882     +
4883     +static inline bool cpu_smt_allowed(unsigned int cpu)
4884     +{
4885     + if (topology_is_primary_thread(cpu))
4886     + return true;
4887     +
4888     + /*
4889     + * If the CPU is not a 'primary' thread and the booted_once bit is
4890     + * set then the processor has SMT support. Store this information
4891     + * for the late check of SMT support in cpu_smt_check_topology().
4892     + */
4893     + if (per_cpu(cpuhp_state, cpu).booted_once)
4894     + cpu_smt_available = true;
4895     +
4896     + if (cpu_smt_control == CPU_SMT_ENABLED)
4897     + return true;
4898     +
4899     + /*
4900     + * On x86 it's required to boot all logical CPUs at least once so
4901     + * that the init code can get a chance to set CR4.MCE on each
4902     + * CPU. Otherwise, a broadcast MCE observing CR4.MCE=0b on any
4903     + * core will shut down the machine.
4904     + */
4905     + return !per_cpu(cpuhp_state, cpu).booted_once;
4906     +}
4907     +#else
4908     +static inline bool cpu_smt_allowed(unsigned int cpu) { return true; }
4909     +#endif
4910     +
4911     /* Need to know about CPUs going up/down? */
4912     int register_cpu_notifier(struct notifier_block *nb)
4913     {
4914     @@ -431,6 +511,16 @@ static int bringup_wait_for_ap(unsigned int cpu)
4915     stop_machine_unpark(cpu);
4916     kthread_unpark(st->thread);
4917    
4918     + /*
4919     + * SMT soft disabling on X86 requires to bring the CPU out of the
4920     + * BIOS 'wait for SIPI' state in order to set the CR4.MCE bit. The
4921     + * CPU marked itself as booted_once in cpu_notify_starting() so the
4922     + * cpu_smt_allowed() check will now return false if this is not the
4923     + * primary sibling.
4924     + */
4925     + if (!cpu_smt_allowed(cpu))
4926     + return -ECANCELED;
4927     +
4928     /* Should we go further up ? */
4929     if (st->target > CPUHP_AP_ONLINE_IDLE) {
4930     __cpuhp_kick_ap_work(st);
4931     @@ -817,7 +907,6 @@ static int takedown_cpu(unsigned int cpu)
4932    
4933     /* Park the smpboot threads */
4934     kthread_park(per_cpu_ptr(&cpuhp_state, cpu)->thread);
4935     - smpboot_park_threads(cpu);
4936    
4937     /*
4938     * Prevent irq alloc/free while the dying cpu reorganizes the
4939     @@ -956,20 +1045,19 @@ out:
4940     return ret;
4941     }
4942    
4943     +static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target)
4944     +{
4945     + if (cpu_hotplug_disabled)
4946     + return -EBUSY;
4947     + return _cpu_down(cpu, 0, target);
4948     +}
4949     +
4950     static int do_cpu_down(unsigned int cpu, enum cpuhp_state target)
4951     {
4952     int err;
4953    
4954     cpu_maps_update_begin();
4955     -
4956     - if (cpu_hotplug_disabled) {
4957     - err = -EBUSY;
4958     - goto out;
4959     - }
4960     -
4961     - err = _cpu_down(cpu, 0, target);
4962     -
4963     -out:
4964     + err = cpu_down_maps_locked(cpu, target);
4965     cpu_maps_update_done();
4966     return err;
4967     }
4968     @@ -993,6 +1081,7 @@ void notify_cpu_starting(unsigned int cpu)
4969     enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE);
4970    
4971     rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */
4972     + st->booted_once = true;
4973     while (st->state < target) {
4974     st->state++;
4975     cpuhp_invoke_callback(cpu, st->state, true, NULL);
4976     @@ -1098,6 +1187,10 @@ static int do_cpu_up(unsigned int cpu, enum cpuhp_state target)
4977     err = -EBUSY;
4978     goto out;
4979     }
4980     + if (!cpu_smt_allowed(cpu)) {
4981     + err = -EPERM;
4982     + goto out;
4983     + }
4984    
4985     err = _cpu_up(cpu, 0, target);
4986     out:
4987     @@ -1389,7 +1482,7 @@ static struct cpuhp_step cpuhp_ap_states[] = {
4988     [CPUHP_AP_SMPBOOT_THREADS] = {
4989     .name = "smpboot/threads:online",
4990     .startup.single = smpboot_unpark_threads,
4991     - .teardown.single = NULL,
4992     + .teardown.single = smpboot_park_threads,
4993     },
4994     [CPUHP_AP_PERF_ONLINE] = {
4995     .name = "perf:online",
4996     @@ -1844,10 +1937,172 @@ static struct attribute_group cpuhp_cpu_root_attr_group = {
4997     NULL
4998     };
4999    
5000     +#ifdef CONFIG_HOTPLUG_SMT
5001     +
5002     +static const char *smt_states[] = {
5003     + [CPU_SMT_ENABLED] = "on",
5004     + [CPU_SMT_DISABLED] = "off",
5005     + [CPU_SMT_FORCE_DISABLED] = "forceoff",
5006     + [CPU_SMT_NOT_SUPPORTED] = "notsupported",
5007     +};
5008     +
5009     +static ssize_t
5010     +show_smt_control(struct device *dev, struct device_attribute *attr, char *buf)
5011     +{
5012     + return snprintf(buf, PAGE_SIZE - 2, "%s\n", smt_states[cpu_smt_control]);
5013     +}
5014     +
5015     +static void cpuhp_offline_cpu_device(unsigned int cpu)
5016     +{
5017     + struct device *dev = get_cpu_device(cpu);
5018     +
5019     + dev->offline = true;
5020     + /* Tell user space about the state change */
5021     + kobject_uevent(&dev->kobj, KOBJ_OFFLINE);
5022     +}
5023     +
5024     +static void cpuhp_online_cpu_device(unsigned int cpu)
5025     +{
5026     + struct device *dev = get_cpu_device(cpu);
5027     +
5028     + dev->offline = false;
5029     + /* Tell user space about the state change */
5030     + kobject_uevent(&dev->kobj, KOBJ_ONLINE);
5031     +}
5032     +
5033     +static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
5034     +{
5035     + int cpu, ret = 0;
5036     +
5037     + cpu_maps_update_begin();
5038     + for_each_online_cpu(cpu) {
5039     + if (topology_is_primary_thread(cpu))
5040     + continue;
5041     + ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE);
5042     + if (ret)
5043     + break;
5044     + /*
5045     + * As this needs to hold the cpu maps lock it's impossible
5046     + * to call device_offline() because that ends up calling
5047     + * cpu_down() which takes cpu maps lock. cpu maps lock
5048     + * needs to be held as this might race against in kernel
5049     + * abusers of the hotplug machinery (thermal management).
5050     + *
5051     + * So nothing would update device:offline state. That would
5052     + * leave the sysfs entry stale and prevent onlining after
5053     + * smt control has been changed to 'off' again. This is
5054     + * called under the sysfs hotplug lock, so it is properly
5055     + * serialized against the regular offline usage.
5056     + */
5057     + cpuhp_offline_cpu_device(cpu);
5058     + }
5059     + if (!ret)
5060     + cpu_smt_control = ctrlval;
5061     + cpu_maps_update_done();
5062     + return ret;
5063     +}
5064     +
5065     +static int cpuhp_smt_enable(void)
5066     +{
5067     + int cpu, ret = 0;
5068     +
5069     + cpu_maps_update_begin();
5070     + cpu_smt_control = CPU_SMT_ENABLED;
5071     + for_each_present_cpu(cpu) {
5072     + /* Skip online CPUs and CPUs on offline nodes */
5073     + if (cpu_online(cpu) || !node_online(cpu_to_node(cpu)))
5074     + continue;
5075     + ret = _cpu_up(cpu, 0, CPUHP_ONLINE);
5076     + if (ret)
5077     + break;
5078     + /* See comment in cpuhp_smt_disable() */
5079     + cpuhp_online_cpu_device(cpu);
5080     + }
5081     + cpu_maps_update_done();
5082     + return ret;
5083     +}
5084     +
5085     +static ssize_t
5086     +store_smt_control(struct device *dev, struct device_attribute *attr,
5087     + const char *buf, size_t count)
5088     +{
5089     + int ctrlval, ret;
5090     +
5091     + if (sysfs_streq(buf, "on"))
5092     + ctrlval = CPU_SMT_ENABLED;
5093     + else if (sysfs_streq(buf, "off"))
5094     + ctrlval = CPU_SMT_DISABLED;
5095     + else if (sysfs_streq(buf, "forceoff"))
5096     + ctrlval = CPU_SMT_FORCE_DISABLED;
5097     + else
5098     + return -EINVAL;
5099     +
5100     + if (cpu_smt_control == CPU_SMT_FORCE_DISABLED)
5101     + return -EPERM;
5102     +
5103     + if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
5104     + return -ENODEV;
5105     +
5106     + ret = lock_device_hotplug_sysfs();
5107     + if (ret)
5108     + return ret;
5109     +
5110     + if (ctrlval != cpu_smt_control) {
5111     + switch (ctrlval) {
5112     + case CPU_SMT_ENABLED:
5113     + ret = cpuhp_smt_enable();
5114     + break;
5115     + case CPU_SMT_DISABLED:
5116     + case CPU_SMT_FORCE_DISABLED:
5117     + ret = cpuhp_smt_disable(ctrlval);
5118     + break;
5119     + }
5120     + }
5121     +
5122     + unlock_device_hotplug();
5123     + return ret ? ret : count;
5124     +}
5125     +static DEVICE_ATTR(control, 0644, show_smt_control, store_smt_control);
5126     +
5127     +static ssize_t
5128     +show_smt_active(struct device *dev, struct device_attribute *attr, char *buf)
5129     +{
5130     + bool active = topology_max_smt_threads() > 1;
5131     +
5132     + return snprintf(buf, PAGE_SIZE - 2, "%d\n", active);
5133     +}
5134     +static DEVICE_ATTR(active, 0444, show_smt_active, NULL);
5135     +
5136     +static struct attribute *cpuhp_smt_attrs[] = {
5137     + &dev_attr_control.attr,
5138     + &dev_attr_active.attr,
5139     + NULL
5140     +};
5141     +
5142     +static const struct attribute_group cpuhp_smt_attr_group = {
5143     + .attrs = cpuhp_smt_attrs,
5144     + .name = "smt",
5145     + NULL
5146     +};
5147     +
5148     +static int __init cpu_smt_state_init(void)
5149     +{
5150     + return sysfs_create_group(&cpu_subsys.dev_root->kobj,
5151     + &cpuhp_smt_attr_group);
5152     +}
5153     +
5154     +#else
5155     +static inline int cpu_smt_state_init(void) { return 0; }
5156     +#endif
5157     +
5158     static int __init cpuhp_sysfs_init(void)
5159     {
5160     int cpu, ret;
5161    
5162     + ret = cpu_smt_state_init();
5163     + if (ret)
5164     + return ret;
5165     +
5166     ret = sysfs_create_group(&cpu_subsys.dev_root->kobj,
5167     &cpuhp_cpu_root_attr_group);
5168     if (ret)
5169     @@ -1944,7 +2199,10 @@ void __init boot_cpu_init(void)
5170     /*
5171     * Must be called _AFTER_ setting up the per_cpu areas
5172     */
5173     -void __init boot_cpu_state_init(void)
5174     +void __init boot_cpu_hotplug_init(void)
5175     {
5176     - per_cpu_ptr(&cpuhp_state, smp_processor_id())->state = CPUHP_ONLINE;
5177     +#ifdef CONFIG_SMP
5178     + this_cpu_write(cpuhp_state.booted_once, true);
5179     +#endif
5180     + this_cpu_write(cpuhp_state.state, CPUHP_ONLINE);
5181     }
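
The cpuhp_smt_attr_group registered above appears under the CPU subsystem root as /sys/devices/system/cpu/smt, with "control" accepting on/off/forceoff and "active" reporting whether sibling threads are online. A short user-space example that reads both files (writing "off" works the same way but requires root; error handling kept minimal):

    #include <stdio.h>

    static void show(const char *path)
    {
        char buf[64];
        FILE *f = fopen(path, "r");

        if (!f) {
            /* Kernels without this patch do not have the files. */
            printf("%s: not available\n", path);
            return;
        }
        if (fgets(buf, sizeof(buf), f))
            printf("%s: %s", path, buf);
        fclose(f);
    }

    int main(void)
    {
        show("/sys/devices/system/cpu/smt/control");
        show("/sys/devices/system/cpu/smt/active");
        /* To disable SMT (as root):
         *   echo off > /sys/devices/system/cpu/smt/control
         * "forceoff" makes the state permanent until reboot, as enforced by
         * store_smt_control() above. */
        return 0;
    }
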
5182     diff --git a/kernel/smp.c b/kernel/smp.c
5183     index bba3b201668d..399905fdfa3f 100644
5184     --- a/kernel/smp.c
5185     +++ b/kernel/smp.c
5186     @@ -564,6 +564,8 @@ void __init smp_init(void)
5187     cpu_up(cpu);
5188     }
5189    
5190     + /* Final decision about SMT support */
5191     + cpu_smt_check_topology();
5192     /* Any cleanup work */
5193     smp_announce();
5194     smp_cpus_done(setup_max_cpus);
5195     diff --git a/kernel/softirq.c b/kernel/softirq.c
5196     index 744fa611cae0..d257e624be25 100644
5197     --- a/kernel/softirq.c
5198     +++ b/kernel/softirq.c
5199     @@ -79,12 +79,16 @@ static void wakeup_softirqd(void)
5200    
5201     /*
5202     * If ksoftirqd is scheduled, we do not want to process pending softirqs
5203     - * right now. Let ksoftirqd handle this at its own rate, to get fairness.
5204     + * right now. Let ksoftirqd handle this at its own rate, to get fairness,
5205     + * unless we're doing some of the synchronous softirqs.
5206     */
5207     -static bool ksoftirqd_running(void)
5208     +#define SOFTIRQ_NOW_MASK ((1 << HI_SOFTIRQ) | (1 << TASKLET_SOFTIRQ))
5209     +static bool ksoftirqd_running(unsigned long pending)
5210     {
5211     struct task_struct *tsk = __this_cpu_read(ksoftirqd);
5212    
5213     + if (pending & SOFTIRQ_NOW_MASK)
5214     + return false;
5215     return tsk && (tsk->state == TASK_RUNNING);
5216     }
5217    
5218     @@ -324,7 +328,7 @@ asmlinkage __visible void do_softirq(void)
5219    
5220     pending = local_softirq_pending();
5221    
5222     - if (pending && !ksoftirqd_running())
5223     + if (pending && !ksoftirqd_running(pending))
5224     do_softirq_own_stack();
5225    
5226     local_irq_restore(flags);
5227     @@ -351,7 +355,7 @@ void irq_enter(void)
5228    
5229     static inline void invoke_softirq(void)
5230     {
5231     - if (ksoftirqd_running())
5232     + if (ksoftirqd_running(local_softirq_pending()))
5233     return;
5234    
5235     if (!force_irqthreads) {
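
ksoftirqd_running() now takes the pending mask and reports false when only HI or TASKLET softirqs are pending, so those keep being handled synchronously even while ksoftirqd is scheduled. The decision is a plain bitmask test, sketched below with illustrative softirq bit numbers (the real ones are defined in include/linux/interrupt.h):

    #include <stdbool.h>
    #include <stdio.h>

    /* Illustrative softirq bit positions. */
    #define HI_SOFTIRQ       0
    #define NET_RX_SOFTIRQ   3
    #define TASKLET_SOFTIRQ  6

    #define SOFTIRQ_NOW_MASK ((1 << HI_SOFTIRQ) | (1 << TASKLET_SOFTIRQ))

    /* Mirrors the new check: synchronous softirqs are never deferred. */
    static bool defer_to_ksoftirqd(unsigned long pending, bool ksoftirqd_running)
    {
        if (pending & SOFTIRQ_NOW_MASK)
            return false;
        return ksoftirqd_running;
    }

    int main(void)
    {
        printf("%d\n", defer_to_ksoftirqd(1 << NET_RX_SOFTIRQ, true));   /* 1: defer */
        printf("%d\n", defer_to_ksoftirqd(1 << TASKLET_SOFTIRQ, true));  /* 0: run now */
        return 0;
    }
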
5236     diff --git a/mm/memory.c b/mm/memory.c
5237     index d2db2c4eb0a4..88f8d6a2af05 100644
5238     --- a/mm/memory.c
5239     +++ b/mm/memory.c
5240     @@ -1641,6 +1641,9 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
5241     if (track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)))
5242     return -EINVAL;
5243    
5244     + if (!pfn_modify_allowed(pfn, pgprot))
5245     + return -EACCES;
5246     +
5247     ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot);
5248    
5249     return ret;
5250     @@ -1659,6 +1662,9 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
5251     if (track_pfn_insert(vma, &pgprot, pfn))
5252     return -EINVAL;
5253    
5254     + if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
5255     + return -EACCES;
5256     +
5257     /*
5258     * If we don't have pte special, then we have to use the pfn_valid()
5259     * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
5260     @@ -1692,6 +1698,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
5261     {
5262     pte_t *pte;
5263     spinlock_t *ptl;
5264     + int err = 0;
5265    
5266     pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
5267     if (!pte)
5268     @@ -1699,12 +1706,16 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
5269     arch_enter_lazy_mmu_mode();
5270     do {
5271     BUG_ON(!pte_none(*pte));
5272     + if (!pfn_modify_allowed(pfn, prot)) {
5273     + err = -EACCES;
5274     + break;
5275     + }
5276     set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
5277     pfn++;
5278     } while (pte++, addr += PAGE_SIZE, addr != end);
5279     arch_leave_lazy_mmu_mode();
5280     pte_unmap_unlock(pte - 1, ptl);
5281     - return 0;
5282     + return err;
5283     }
5284    
5285     static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
5286     @@ -1713,6 +1724,7 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
5287     {
5288     pmd_t *pmd;
5289     unsigned long next;
5290     + int err;
5291    
5292     pfn -= addr >> PAGE_SHIFT;
5293     pmd = pmd_alloc(mm, pud, addr);
5294     @@ -1721,9 +1733,10 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
5295     VM_BUG_ON(pmd_trans_huge(*pmd));
5296     do {
5297     next = pmd_addr_end(addr, end);
5298     - if (remap_pte_range(mm, pmd, addr, next,
5299     - pfn + (addr >> PAGE_SHIFT), prot))
5300     - return -ENOMEM;
5301     + err = remap_pte_range(mm, pmd, addr, next,
5302     + pfn + (addr >> PAGE_SHIFT), prot);
5303     + if (err)
5304     + return err;
5305     } while (pmd++, addr = next, addr != end);
5306     return 0;
5307     }
5308     @@ -1734,6 +1747,7 @@ static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
5309     {
5310     pud_t *pud;
5311     unsigned long next;
5312     + int err;
5313    
5314     pfn -= addr >> PAGE_SHIFT;
5315     pud = pud_alloc(mm, pgd, addr);
5316     @@ -1741,9 +1755,10 @@ static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
5317     return -ENOMEM;
5318     do {
5319     next = pud_addr_end(addr, end);
5320     - if (remap_pmd_range(mm, pud, addr, next,
5321     - pfn + (addr >> PAGE_SHIFT), prot))
5322     - return -ENOMEM;
5323     + err = remap_pmd_range(mm, pud, addr, next,
5324     + pfn + (addr >> PAGE_SHIFT), prot);
5325     + if (err)
5326     + return err;
5327     } while (pud++, addr = next, addr != end);
5328     return 0;
5329     }
5330     diff --git a/mm/mprotect.c b/mm/mprotect.c
5331     index ae740c9b1f9b..6896f77be166 100644
5332     --- a/mm/mprotect.c
5333     +++ b/mm/mprotect.c
5334     @@ -260,6 +260,42 @@ unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
5335     return pages;
5336     }
5337    
5338     +static int prot_none_pte_entry(pte_t *pte, unsigned long addr,
5339     + unsigned long next, struct mm_walk *walk)
5340     +{
5341     + return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ?
5342     + 0 : -EACCES;
5343     +}
5344     +
5345     +static int prot_none_hugetlb_entry(pte_t *pte, unsigned long hmask,
5346     + unsigned long addr, unsigned long next,
5347     + struct mm_walk *walk)
5348     +{
5349     + return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ?
5350     + 0 : -EACCES;
5351     +}
5352     +
5353     +static int prot_none_test(unsigned long addr, unsigned long next,
5354     + struct mm_walk *walk)
5355     +{
5356     + return 0;
5357     +}
5358     +
5359     +static int prot_none_walk(struct vm_area_struct *vma, unsigned long start,
5360     + unsigned long end, unsigned long newflags)
5361     +{
5362     + pgprot_t new_pgprot = vm_get_page_prot(newflags);
5363     + struct mm_walk prot_none_walk = {
5364     + .pte_entry = prot_none_pte_entry,
5365     + .hugetlb_entry = prot_none_hugetlb_entry,
5366     + .test_walk = prot_none_test,
5367     + .mm = current->mm,
5368     + .private = &new_pgprot,
5369     + };
5370     +
5371     + return walk_page_range(start, end, &prot_none_walk);
5372     +}
5373     +
5374     int
5375     mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
5376     unsigned long start, unsigned long end, unsigned long newflags)
5377     @@ -277,6 +313,19 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
5378     return 0;
5379     }
5380    
5381     + /*
5382     + * Do PROT_NONE PFN permission checks here when we can still
5383     + * bail out without undoing a lot of state. This is a rather
5384     + * uncommon case, so doesn't need to be very optimized.
5385     + */
5386     + if (arch_has_pfn_modify_check() &&
5387     + (vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
5388     + (newflags & (VM_READ|VM_WRITE|VM_EXEC)) == 0) {
5389     + error = prot_none_walk(vma, start, end, newflags);
5390     + if (error)
5391     + return error;
5392     + }
5393     +
5394     /*
5395     * If we make a private mapping writable we increase our commit;
5396     * but (without finer accounting) cannot reduce our commit if we
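
prot_none_walk() runs only when mprotect() takes a VM_PFNMAP/VM_MIXEDMAP mapping to PROT_NONE, giving the architecture a chance to refuse PFNs whose not-present PTEs would still be exposed to L1TF. From user space the operation it intercepts is an ordinary mprotect(addr, len, PROT_NONE); the example below issues one on an anonymous mapping, which does not hit the new check but shows the call that is vetted for pfnmap VMAs:

    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
        long page = sysconf(_SC_PAGESIZE);
        char *p = mmap(NULL, page, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED)
            return 1;
        strcpy(p, "hello");

        /* Drop all access; on a pfnmap VMA this is where the kernel would
         * now consult pfn_modify_allowed() and may return EACCES. */
        if (mprotect(p, page, PROT_NONE) != 0) {
            perror("mprotect");
            return 1;
        }
        puts("mapping is now PROT_NONE; any access would fault");
        munmap(p, page);
        return 0;
    }
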
5397     diff --git a/mm/swapfile.c b/mm/swapfile.c
5398     index 79c03ecd31c8..855f62ab8c1b 100644
5399     --- a/mm/swapfile.c
5400     +++ b/mm/swapfile.c
5401     @@ -2219,6 +2219,35 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
5402     return 0;
5403     }
5404    
5405     +
5406     +/*
5407     + * Find out how many pages are allowed for a single swap device. There
5408     + * are two limiting factors:
5409     + * 1) the number of bits for the swap offset in the swp_entry_t type, and
5410     + * 2) the number of bits in the swap pte, as defined by the different
5411     + * architectures.
5412     + *
5413     + * In order to find the largest possible bit mask, a swap entry with
5414     + * swap type 0 and swap offset ~0UL is created, encoded to a swap pte,
5415     + * decoded to a swp_entry_t again, and finally the swap offset is
5416     + * extracted.
5417     + *
5418     + * This will mask all the bits from the initial ~0UL mask that can't
5419     + * be encoded in either the swp_entry_t or the architecture definition
5420     + * of a swap pte.
5421     + */
5422     +unsigned long generic_max_swapfile_size(void)
5423     +{
5424     + return swp_offset(pte_to_swp_entry(
5425     + swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
5426     +}
5427     +
5428     +/* Can be overridden by an architecture for additional checks. */
5429     +__weak unsigned long max_swapfile_size(void)
5430     +{
5431     + return generic_max_swapfile_size();
5432     +}
5433     +
5434     static unsigned long read_swap_header(struct swap_info_struct *p,
5435     union swap_header *swap_header,
5436     struct inode *inode)
5437     @@ -2254,22 +2283,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
5438     p->cluster_next = 1;
5439     p->cluster_nr = 0;
5440    
5441     - /*
5442     - * Find out how many pages are allowed for a single swap
5443     - * device. There are two limiting factors: 1) the number
5444     - * of bits for the swap offset in the swp_entry_t type, and
5445     - * 2) the number of bits in the swap pte as defined by the
5446     - * different architectures. In order to find the
5447     - * largest possible bit mask, a swap entry with swap type 0
5448     - * and swap offset ~0UL is created, encoded to a swap pte,
5449     - * decoded to a swp_entry_t again, and finally the swap
5450     - * offset is extracted. This will mask all the bits from
5451     - * the initial ~0UL mask that can't be encoded in either
5452     - * the swp_entry_t or the architecture definition of a
5453     - * swap pte.
5454     - */
5455     - maxpages = swp_offset(pte_to_swp_entry(
5456     - swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
5457     + maxpages = max_swapfile_size();
5458     last_page = swap_header->info.last_page;
5459     if (!last_page) {
5460     pr_warn("Empty swap-file\n");
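
generic_max_swapfile_size() discovers how many offset bits survive the architecture's swap-PTE encoding by pushing an all-ones offset through an encode/decode round trip and adding one to whatever comes back. The same trick in user-space C, with a made-up encoding that keeps only the low 27 bits of the offset:

    #include <stdio.h>

    /* Hypothetical "architecture" encoding: the swap PTE only has room for
     * a 27-bit offset (type bits are ignored here). */
    #define ARCH_OFFSET_BITS 27

    static unsigned long entry_to_pte(unsigned long offset)
    {
        return offset & ((1UL << ARCH_OFFSET_BITS) - 1);
    }

    static unsigned long pte_to_entry(unsigned long pte)
    {
        return pte;    /* decoding just hands the surviving bits back */
    }

    int main(void)
    {
        /* Round-trip an all-ones offset; whatever survives is the largest
         * representable offset, so max pages = survivor + 1. */
        unsigned long maxpages = pte_to_entry(entry_to_pte(~0UL)) + 1;

        printf("max swapfile pages: %lu (2^%d)\n", maxpages, ARCH_OFFSET_BITS);
        return 0;
    }
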
5461     diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h
5462     index aea30afeddb8..fbc1474960e3 100644
5463     --- a/tools/arch/x86/include/asm/cpufeatures.h
5464     +++ b/tools/arch/x86/include/asm/cpufeatures.h
5465     @@ -213,7 +213,7 @@
5466     #define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */
5467     #define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */
5468     #define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD family 0x17 (Zen) */
5469     -
5470     +#define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */
5471    
5472     /* Virtualization flags: Linux defined, word 8 */
5473     #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
5474     @@ -317,6 +317,7 @@
5475     #define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */
5476     #define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */
5477     #define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */
5478     +#define X86_FEATURE_FLUSH_L1D (18*32+28) /* Flush L1D cache */
5479     #define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */
5480     #define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */
5481    
5482     @@ -349,5 +350,6 @@
5483     #define X86_BUG_SPECTRE_V1 X86_BUG(15) /* CPU is affected by Spectre variant 1 attack with conditional branches */
5484     #define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */
5485     #define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* CPU is affected by speculative store bypass attack */
5486     +#define X86_BUG_L1TF X86_BUG(18) /* CPU is affected by L1 Terminal Fault */
5487    
5488     #endif /* _ASM_X86_CPUFEATURES_H */