/trunk/kernel-alx/patches-4.9/0301-4.9.202-all-fixes.patch
Revision 3576 - Thu Aug 13 10:21:18 2020 UTC, committed by niro
File size: 111545 bytes
Commit message: linux-202
diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
index cadb7a9a5218..b41046b5713b 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -358,6 +358,8 @@ What:	/sys/devices/system/cpu/vulnerabilities
 	/sys/devices/system/cpu/vulnerabilities/spec_store_bypass
 	/sys/devices/system/cpu/vulnerabilities/l1tf
 	/sys/devices/system/cpu/vulnerabilities/mds
+	/sys/devices/system/cpu/vulnerabilities/tsx_async_abort
+	/sys/devices/system/cpu/vulnerabilities/itlb_multihit
 Date:	January 2018
 Contact:	Linux kernel mailing list <linux-kernel@vger.kernel.org>
 Description:	Information about CPU vulnerabilities
diff --git a/Documentation/hw-vuln/index.rst b/Documentation/hw-vuln/index.rst
index ffc064c1ec68..24f53c501366 100644
--- a/Documentation/hw-vuln/index.rst
+++ b/Documentation/hw-vuln/index.rst
@@ -11,3 +11,5 @@ are configurable at compile, boot or run time.
 
    l1tf
    mds
+   tsx_async_abort
+   multihit.rst
diff --git a/Documentation/hw-vuln/multihit.rst b/Documentation/hw-vuln/multihit.rst
new file mode 100644
index 000000000000..ba9988d8bce5
--- /dev/null
+++ b/Documentation/hw-vuln/multihit.rst
@@ -0,0 +1,163 @@
+iTLB multihit
+=============
+
+iTLB multihit is an erratum where some processors may incur a machine check
+error, possibly resulting in an unrecoverable CPU lockup, when an
+instruction fetch hits multiple entries in the instruction TLB. This can
+occur when the page size is changed along with either the physical address
+or cache type. A malicious guest running on a virtualized system can
+exploit this erratum to perform a denial of service attack.
+
+
+Affected processors
+-------------------
+
+Variations of this erratum are present on most Intel Core and Xeon processor
+models. The erratum is not present on:
+
+   - non-Intel processors
+
+   - some Atoms (Airmont, Bonnell, Goldmont, GoldmontPlus, Saltwell,
+     Silvermont)
+
+   - Intel processors that have the PSCHANGE_MC_NO bit set in the
+     IA32_ARCH_CAPABILITIES MSR.
+
+
+Related CVEs
+------------
+
+The following CVE entry is related to this issue:
+
+   ==============  =================================================
+   CVE-2018-12207  Machine Check Error Avoidance on Page Size Change
+   ==============  =================================================
+
+
+Problem
+-------
+
+Privileged software, including the OS and virtual machine managers (VMMs), is
+in charge of memory management. A key component of memory management is
+control of the page tables. Modern processors use virtual memory, a technique
+that creates the illusion of a very large memory for processors. This virtual
+space is split into pages of a given size. Page tables translate virtual
+addresses to physical addresses.
+
+To reduce latency when performing a virtual to physical address translation,
+processors include a structure, called a TLB, that caches recent translations.
+There are separate TLBs for instructions (iTLB) and data (dTLB).
+
+Under this erratum, instructions are fetched from a linear address translated
+using a 4 KB translation cached in the iTLB. Privileged software modifies the
+paging structure so that the same linear address is mapped using a large page
+size (2 MB, 4 MB, 1 GB) with a different physical address or memory type.
+After the page structure modification but before the software invalidates any
+iTLB entries for the linear address, a code fetch that happens on the same
+linear address may cause a machine-check error, which can result in a system
+hang or shutdown.
+
+
+Attack scenarios
+----------------
+
+Attacks against the iTLB multihit erratum can be mounted from malicious
+guests in a virtualized system.
+
+
+iTLB multihit system information
+--------------------------------
+
+The Linux kernel provides a sysfs interface to enumerate the current iTLB
+multihit status of the system: whether the system is vulnerable and which
+mitigations are active. The relevant sysfs file is:
+
+/sys/devices/system/cpu/vulnerabilities/itlb_multihit
+
+The possible values in this file are:
+
+.. list-table::
+
+   * - Not affected
+     - The processor is not vulnerable.
+   * - KVM: Mitigation: Split huge pages
+     - Software changes mitigate this issue.
+   * - KVM: Vulnerable
+     - The processor is vulnerable, but no mitigation is enabled.
+
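The files above are plain text; any unprivileged program can read them. A
minimal C sketch of such a check (illustrative only, not part of the patch;
the file names are the ones listed in the sysfs ABI entry at the top of this
patch):

	#include <stdio.h>

	/* Print the state reported for one vulnerabilities file. */
	static void show_vuln(const char *name)
	{
		char path[256], line[256];
		FILE *f;

		snprintf(path, sizeof(path),
			 "/sys/devices/system/cpu/vulnerabilities/%s", name);
		f = fopen(path, "r");
		if (!f) {
			printf("%s: not reported by this kernel\n", name);
			return;
		}
		if (fgets(line, sizeof(line), f))
			printf("%s: %s", name, line);
		fclose(f);
	}

	int main(void)
	{
		show_vuln("itlb_multihit");
		show_vuln("tsx_async_abort");
		return 0;
	}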
+Enumeration of the erratum
+--------------------------
+
+A new bit has been allocated in the IA32_ARCH_CAPABILITIES MSR
+(PSCHANGE_MC_NO) and will be set on CPUs which are mitigated against this
+issue.
+
+   =======================================  ===========  ================================
+   IA32_ARCH_CAPABILITIES MSR               Not present  Possibly vulnerable, check model
+   IA32_ARCH_CAPABILITIES[PSCHANGE_MC_NO]   '0'          Likely vulnerable, check model
+   IA32_ARCH_CAPABILITIES[PSCHANGE_MC_NO]   '1'          Not vulnerable
+   =======================================  ===========  ================================
+
+
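The enumeration above can be checked by hand through the msr driver. An
illustrative user-space sketch (not part of the patch; it assumes the standard
/dev/cpu/*/msr interface loaded via "modprobe msr", root privileges, and the
conventional 0x10a address of IA32_ARCH_CAPABILITIES; the bit position 6 for
PSCHANGE_MC_NO matches the msr-index.h hunk later in this patch):

	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <unistd.h>

	#define MSR_IA32_ARCH_CAPABILITIES	0x0000010a
	#define ARCH_CAP_PSCHANGE_MC_NO		(1ULL << 6)

	int main(void)
	{
		uint64_t cap = 0;
		int fd = open("/dev/cpu/0/msr", O_RDONLY);

		/* A failed read also covers CPUs without the MSR. */
		if (fd < 0 || pread(fd, &cap, sizeof(cap),
				    MSR_IA32_ARCH_CAPABILITIES) != sizeof(cap)) {
			printf("IA32_ARCH_CAPABILITIES not readable: possibly vulnerable, check model\n");
			return 1;
		}
		printf(cap & ARCH_CAP_PSCHANGE_MC_NO ?
		       "PSCHANGE_MC_NO=1: not vulnerable\n" :
		       "PSCHANGE_MC_NO=0: likely vulnerable, check model\n");
		close(fd);
		return 0;
	}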
+Mitigation mechanism
+--------------------
+
+This erratum can be mitigated by restricting the use of large page sizes to
+non-executable pages. This forces all iTLB entries to be 4K, and removes
+the possibility of multiple hits.
+
+In order to mitigate the vulnerability, KVM initially marks all huge pages
+as non-executable. If the guest attempts to execute in one of those pages,
+the page is broken down into 4K pages, which are then marked executable.
+
+If EPT is disabled or not available on the host, KVM is in control of TLB
+flushes and the problematic situation cannot happen. However, the shadow
+EPT paging mechanism used by nested virtualization is vulnerable, because
+the nested guest can trigger multiple iTLB hits by modifying its own
+(non-nested) page tables. For simplicity, KVM will make large pages
+non-executable in all shadow paging modes.
+
+Mitigation control on the kernel command line and KVM module parameter
+-----------------------------------------------------------------------
+
+The KVM hypervisor mitigation mechanism for marking huge pages as
+non-executable can be controlled with the module parameter "nx_huge_pages=".
+The kernel command line allows controlling the iTLB multihit mitigations at
+boot time with the option "kvm.nx_huge_pages=".
+
+The valid arguments for these options are:
+
+   ==========  ================================================================
+   force       Mitigation is enabled. In this case, the mitigation implements
+               non-executable huge pages in the Linux kernel KVM module. All
+               huge pages in the EPT are marked as non-executable.
+               If a guest attempts to execute in one of those pages, the page
+               is broken down into 4K pages, which are then marked executable.
+
+   off         Mitigation is disabled.
+
+   auto        Enable mitigation only if the platform is affected and the
+               kernel was not booted with the "mitigations=off" command line
+               parameter. This is the default option.
+   ==========  ================================================================
+
+
+Mitigation selection guide
+--------------------------
+
+1. No virtualization in use
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+   The system is protected by the kernel unconditionally and no further
+   action is required.
+
+2. Virtualization with trusted guests
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+   If the guest comes from a trusted source, you may assume that the guest
+   will not attempt to maliciously exploit this erratum and no further
+   action is required.
+
+3. Virtualization with untrusted guests
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+   If the guest comes from an untrusted source, the host kernel will need
+   to apply the iTLB multihit mitigation via the kernel command line or the
+   kvm module parameter.
diff --git a/Documentation/hw-vuln/tsx_async_abort.rst b/Documentation/hw-vuln/tsx_async_abort.rst
new file mode 100644
index 000000000000..fddbd7579c53
--- /dev/null
+++ b/Documentation/hw-vuln/tsx_async_abort.rst
@@ -0,0 +1,276 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+TAA - TSX Asynchronous Abort
+============================
+
+TAA is a hardware vulnerability that allows unprivileged speculative access to
+data which is available in various CPU internal buffers by using asynchronous
+aborts within an Intel TSX transactional region.
+
+Affected processors
+-------------------
+
+This vulnerability only affects Intel processors that support Intel
+Transactional Synchronization Extensions (TSX) when the TAA_NO bit (bit 8)
+is 0 in the IA32_ARCH_CAPABILITIES MSR. On processors where the MDS_NO bit
+(bit 5) is 0 in the IA32_ARCH_CAPABILITIES MSR, the existing MDS mitigations
+also mitigate against TAA.
+
+Whether a processor is affected or not can be read out from the TAA
+vulnerability file in sysfs. See :ref:`tsx_async_abort_sys_info`.
+
+Related CVEs
+------------
+
+The following CVE entry is related to this TAA issue:
+
+   ==============  =====  ===================================================
+   CVE-2019-11135  TAA    TSX Asynchronous Abort (TAA) condition on some
+                          microprocessors utilizing speculative execution may
+                          allow an authenticated user to potentially enable
+                          information disclosure via a side channel with
+                          local access.
+   ==============  =====  ===================================================
+
+Problem
+-------
+
+When performing store, load or L1 refill operations, processors write
+data into temporary microarchitectural structures (buffers). The data in
+those buffers can be forwarded to load operations as an optimization.
+
+Intel TSX is an extension to the x86 instruction set architecture that adds
+hardware transactional memory support to improve the performance of
+multi-threaded software. TSX lets the processor expose and exploit concurrency
+hidden in an application by dynamically avoiding unnecessary synchronization.
+
+TSX supports atomic memory transactions that are either committed (success) or
+aborted. During an abort, operations that happened within the transactional
+region are rolled back. An asynchronous abort takes place, among other options,
+when a different thread accesses a cache line that is also used within the
+transactional region when that access might lead to a data race.
+
+Immediately after an uncompleted asynchronous abort, certain speculatively
+executed loads may read data from those internal buffers and pass it to
+dependent operations. This can then be used to infer the value via a cache
+side channel attack.
+
+Because the buffers are potentially shared between Hyper-Threads, cross
+Hyper-Thread attacks are possible.
+
+The victim of a malicious actor does not need to make use of TSX. Only the
+attacker needs to begin a TSX transaction and raise an asynchronous abort
+which in turn potentially leaks data stored in the buffers.
+
+More detailed technical information is available in the TAA specific x86
+architecture section: :ref:`Documentation/x86/tsx_async_abort.rst <tsx_async_abort>`.
+
+
+Attack scenarios
+----------------
+
+Attacks against the TAA vulnerability can be implemented from unprivileged
+applications running on hosts or guests.
+
+As with MDS, the attacker has no control over the memory addresses that can
+be leaked. Only the victim is responsible for bringing data to the CPU. As
+a result, the malicious actor has to sample as much data as possible and
+then postprocess it to try to infer any useful information from it.
+
+A potential attacker only has read access to the data. Also, there is no direct
+privilege escalation by using this technique.
+
+
+.. _tsx_async_abort_sys_info:
+
+TAA system information
+----------------------
+
+The Linux kernel provides a sysfs interface to enumerate the current TAA status
+of mitigated systems. The relevant sysfs file is:
+
+/sys/devices/system/cpu/vulnerabilities/tsx_async_abort
+
+The possible values in this file are:
+
+.. list-table::
+
+   * - 'Vulnerable'
+     - The CPU is affected by this vulnerability and the microcode and kernel mitigation are not applied.
+   * - 'Vulnerable: Clear CPU buffers attempted, no microcode'
+     - The system tries to clear the buffers but the microcode might not support the operation.
+   * - 'Mitigation: Clear CPU buffers'
+     - The microcode has been updated to clear the buffers. TSX is still enabled.
+   * - 'Mitigation: TSX disabled'
+     - TSX is disabled.
+   * - 'Not affected'
+     - The CPU is not affected by this issue.
+
+.. _ucode_needed:
+
+Best effort mitigation mode
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+If the processor is vulnerable, but the availability of the microcode-based
+mitigation mechanism is not advertised via CPUID, the kernel selects a best
+effort mitigation mode. This mode invokes the mitigation instructions
+without a guarantee that they clear the CPU buffers.
+
+This is done to address virtualization scenarios where the host has the
+microcode update applied, but the hypervisor is not yet updated to expose the
+CPUID to the guest. If the host has updated microcode the protection takes
+effect; otherwise a few CPU cycles are wasted pointlessly.
+
+The state in the tsx_async_abort sysfs file reflects this situation
+accordingly.
+
+
+Mitigation mechanism
+--------------------
+
+The kernel detects the affected CPUs and the presence of the microcode which is
+required. If a CPU is affected and the microcode is available, then the kernel
+enables the mitigation by default.
+
+
+The mitigation can be controlled at boot time via a kernel command line option.
+See :ref:`taa_mitigation_control_command_line`.
+
+.. _virt_mechanism:
+
+Virtualization mitigation
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Affected systems where the host has the TAA microcode and TAA is mitigated by
+having previously disabled TSX are not vulnerable regardless of the status
+of the VMs.
+
+In all other cases, if the host either does not have the TAA microcode or
+the kernel is not mitigated, the system might be vulnerable.
+
+
+.. _taa_mitigation_control_command_line:
+
+Mitigation control on the kernel command line
+---------------------------------------------
+
+The kernel command line allows controlling the TAA mitigations at boot time
+with the option "tsx_async_abort=". The valid arguments for this option are:
+
+   ============  =============================================================
+   off           This option disables the TAA mitigation on affected
+                 platforms. If the system has TSX enabled (see next parameter)
+                 and the CPU is affected, the system is vulnerable.
+
+   full          TAA mitigation is enabled. If TSX is enabled, on an affected
+                 system it will clear CPU buffers on ring transitions. On
+                 systems which are MDS-affected and deploy MDS mitigation,
+                 TAA is also mitigated. Specifying this option on those
+                 systems will have no effect.
+
+   full,nosmt    The same as tsx_async_abort=full, with SMT disabled on
+                 vulnerable CPUs that have TSX enabled. This is the complete
+                 mitigation. When TSX is disabled, SMT is not disabled because
+                 the CPU is not vulnerable to cross-thread TAA attacks.
+   ============  =============================================================
+
+Not specifying this option is equivalent to "tsx_async_abort=full".
+
+The kernel command line also allows controlling the TSX feature using the
+parameter "tsx=" on CPUs which support TSX control. MSR_IA32_TSX_CTRL is used
+to control the TSX feature and the enumeration of the TSX feature bits (RTM
+and HLE) in CPUID.
+
+The valid options are:
+
+   ============  =============================================================
+   off           Disables TSX on the system.
+
+                 Note that this option takes effect only on newer CPUs which
+                 are not vulnerable to MDS, i.e., have
+                 MSR_IA32_ARCH_CAPABILITIES.MDS_NO=1 and which get the new
+                 IA32_TSX_CTRL MSR through a microcode update. This new MSR
+                 allows for the reliable deactivation of the TSX
+                 functionality.
+
+   on            Enables TSX.
+
+                 Although there are mitigations for all known security
+                 vulnerabilities, TSX has been known to be an accelerator for
+                 several previous speculation-related CVEs, and so there may
+                 be unknown security risks associated with leaving it enabled.
+
+   auto          Disables TSX if X86_BUG_TAA is present, otherwise enables
+                 TSX on the system.
+   ============  =============================================================
+
+Not specifying this option is equivalent to "tsx=off".
+
+The following combinations of the "tsx_async_abort" and "tsx" options are
+possible. For affected platforms tsx=auto is equivalent to tsx=off and the
+result will be:
+
+   =========  ==========================  =========================================
+   tsx=on     tsx_async_abort=full        The system will use VERW to clear CPU
+                                          buffers. Cross-thread attacks are still
+                                          possible on SMT machines.
+   tsx=on     tsx_async_abort=full,nosmt  As above, cross-thread attacks on SMT
+                                          mitigated.
+   tsx=on     tsx_async_abort=off         The system is vulnerable.
+   tsx=off    tsx_async_abort=full        TSX might be disabled if microcode
+                                          provides a TSX control MSR. If so,
+                                          the system is not vulnerable.
+   tsx=off    tsx_async_abort=full,nosmt  Ditto
+   tsx=off    tsx_async_abort=off         Ditto
+   =========  ==========================  =========================================
+
+
+For unaffected platforms "tsx=on" and "tsx_async_abort=full" do not clear CPU
+buffers. For platforms without TSX control (MSR_IA32_ARCH_CAPABILITIES.MDS_NO=0)
+the "tsx" command line argument has no effect.
+
+For affected platforms, the table below indicates the mitigation status for
+the combinations of the CPUID bit MD_CLEAR and the IA32_ARCH_CAPABILITIES MSR
+bits MDS_NO and TSX_CTRL_MSR.
+
+   =======  =========  =============  ========================================
+   MDS_NO   MD_CLEAR   TSX_CTRL_MSR   Status
+   =======  =========  =============  ========================================
+   0        0          0              Vulnerable (needs microcode)
+   0        1          0              MDS and TAA mitigated via VERW
+   1        1          0              MDS fixed, TAA vulnerable if TSX enabled
+                                      because MD_CLEAR has no meaning and
+                                      VERW is not guaranteed to clear buffers
+   1        X          1              MDS fixed, TAA can be mitigated by
+                                      VERW or TSX_CTRL_MSR
+   =======  =========  =============  ========================================
+
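One way to read the table above as code (an illustrative helper, not part of
the patch; it covers only the rows listed for affected platforms):

	/* Maps the MDS_NO/MD_CLEAR/TSX_CTRL_MSR rows above to their status. */
	static const char *taa_status(int mds_no, int md_clear, int tsx_ctrl_msr)
	{
		if (mds_no && tsx_ctrl_msr)
			return "MDS fixed, TAA can be mitigated by VERW or TSX_CTRL_MSR";
		if (mds_no)
			return "MDS fixed, TAA vulnerable if TSX enabled";
		if (md_clear)
			return "MDS and TAA mitigated via VERW";
		return "Vulnerable (needs microcode)";
	}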
+Mitigation selection guide
+--------------------------
+
+1. Trusted userspace and guests
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+If all user space applications are from a trusted source and do not execute
+untrusted code which is supplied externally, then the mitigation can be
+disabled. The same applies to virtualized environments with trusted guests.
+
+
+2. Untrusted userspace and guests
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+If there are untrusted applications or guests on the system, enabling TSX
+might allow a malicious actor to leak data from the host or from other
+processes running on the same physical core.
+
+If the microcode is available and TSX is disabled on the host, attacks
+are prevented in a virtualized environment as well, even if the VMs do not
+explicitly enable the mitigation.
+
+
+.. _taa_default_mitigations:
+
+Default mitigations
+-------------------
+
+The kernel's default action for vulnerable processors is:
+
+  - Deploy TSX disable mitigation (tsx_async_abort=full tsx=off).
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 61b73e42f488..c81a008d6512 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1975,6 +1975,25 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			KVM MMU at runtime.
 			Default is 0 (off)
 
+	kvm.nx_huge_pages=
+			[KVM] Controls the software workaround for the
+			X86_BUG_ITLB_MULTIHIT bug.
+			force	: Always deploy workaround.
+			off	: Never deploy workaround.
+			auto	: Deploy workaround based on the presence of
+				  X86_BUG_ITLB_MULTIHIT.
+
+			Default is 'auto'.
+
+			If the software workaround is enabled for the host,
+			guests need not enable it for nested guests.
+
+	kvm.nx_huge_pages_recovery_ratio=
+			[KVM] Controls how many 4KiB pages are periodically
+			zapped back to huge pages. 0 disables the recovery,
+			otherwise if the value is N KVM will zap 1/Nth of the
+			4KiB pages every minute. The default is 60.
+
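As a worked example of the ratio: with the default of 60, a KVM instance that
currently tracks 6,000 split 4KiB pages would have roughly 1/60th of them
(about 100 pages) zapped back to huge pages each minute.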
 	kvm-amd.nested=	[KVM,AMD] Allow nested virtualization in KVM/SVM.
 			Default is 1 (enabled)
 
@@ -2490,6 +2509,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 				spec_store_bypass_disable=off [X86]
 				l1tf=off [X86]
 				mds=off [X86]
+				tsx_async_abort=off [X86]
+				kvm.nx_huge_pages=off [X86]
+
+				Exceptions:
+					This does not have any effect on
+					kvm.nx_huge_pages when
+					kvm.nx_huge_pages=force.
 
 			auto (default)
 				Mitigate all CPU vulnerabilities, but leave SMT
@@ -2505,6 +2531,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 				be fully mitigated, even if it means losing SMT.
 				Equivalent to: l1tf=flush,nosmt [X86]
 					       mds=full,nosmt [X86]
+					       tsx_async_abort=full,nosmt [X86]
 
 	mminit_loglevel=
 			[KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this
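Tying the parameters documented in this file together, a host that runs
untrusted guests could, for example, boot with the explicit combination below
(an illustrative command line built from the options above, not a
recommendation made by the patch):

	tsx=off tsx_async_abort=full,nosmt kvm.nx_huge_pages=force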
@@ -4516,6 +4543,71 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			platforms where RDTSC is slow and this accounting
 			can add overhead.
 
+	tsx=		[X86] Control Transactional Synchronization
+			Extensions (TSX) feature in Intel processors that
+			support TSX control.
+
+			This parameter controls the TSX feature. The options are:
+
+			on	- Enable TSX on the system. Although there are
+				mitigations for all known security vulnerabilities,
+				TSX has been known to be an accelerator for
+				several previous speculation-related CVEs, and
+				so there may be unknown security risks associated
+				with leaving it enabled.
+
+			off	- Disable TSX on the system. (Note that this
+				option takes effect only on newer CPUs which are
+				not vulnerable to MDS, i.e., have
+				MSR_IA32_ARCH_CAPABILITIES.MDS_NO=1 and which get
+				the new IA32_TSX_CTRL MSR through a microcode
+				update. This new MSR allows for the reliable
+				deactivation of the TSX functionality.)
+
+			auto	- Disable TSX if X86_BUG_TAA is present,
+				otherwise enable TSX on the system.
+
+			Not specifying this option is equivalent to tsx=off.
+
+			See Documentation/hw-vuln/tsx_async_abort.rst
+			for more details.
+
+	tsx_async_abort= [X86,INTEL] Control mitigation for the TSX Async
+			Abort (TAA) vulnerability.
+
+			Similar to Micro-architectural Data Sampling (MDS),
+			certain CPUs that support Transactional
+			Synchronization Extensions (TSX) are vulnerable to an
+			exploit against CPU internal buffers which can forward
+			information to a disclosure gadget under certain
+			conditions.
+
+			In vulnerable processors, the speculatively forwarded
+			data can be used in a cache side channel attack, to
+			access data to which the attacker does not have direct
+			access.
+
+			This parameter controls the TAA mitigation. The
+			options are:
+
+			full       - Enable TAA mitigation on vulnerable CPUs
+				     if TSX is enabled.
+
+			full,nosmt - Enable TAA mitigation and disable SMT on
+				     vulnerable CPUs. If TSX is disabled, SMT
+				     is not disabled because the CPU is not
+				     vulnerable to cross-thread TAA attacks.
+			off        - Unconditionally disable TAA mitigation
+
+			Not specifying this option is equivalent to
+			tsx_async_abort=full. On CPUs which are MDS-affected
+			and deploy MDS mitigation, TAA mitigation is not
+			required and doesn't provide any additional
+			mitigation.
+
+			For details see:
+			Documentation/hw-vuln/tsx_async_abort.rst
+
 	turbografx.map[2|3]=	[HW,JOY]
 			TurboGraFX parallel port interface
 			Format:
diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt
index e5dd9f4d6100..46ef3680c8ab 100644
--- a/Documentation/virtual/kvm/locking.txt
+++ b/Documentation/virtual/kvm/locking.txt
@@ -13,8 +13,8 @@ The acquisition orders for mutexes are as follows:
 - kvm->slots_lock is taken outside kvm->irq_lock, though acquiring
   them together is quite rare.
 
-For spinlocks, kvm_lock is taken outside kvm->mmu_lock. Everything
-else is a leaf: no other lock is taken inside the critical sections.
+Everything else is a leaf: no other lock is taken inside the critical
+sections.
 
 2: Exception
 ------------
@@ -142,7 +142,7 @@ See the comments in spte_has_volatile_bits() and mmu_spte_update().
 ------------
 
 Name:		kvm_lock
-Type:		spinlock_t
+Type:		mutex
 Arch:		any
 Protects:	- vm_list
 
diff --git a/Documentation/x86/index.rst b/Documentation/x86/index.rst
index ef389dcf1b1d..0780d55c5aa8 100644
--- a/Documentation/x86/index.rst
+++ b/Documentation/x86/index.rst
@@ -6,3 +6,4 @@ x86 architecture specifics
    :maxdepth: 1
 
    mds
+   tsx_async_abort
diff --git a/Documentation/x86/tsx_async_abort.rst b/Documentation/x86/tsx_async_abort.rst
new file mode 100644
index 000000000000..4a4336a89372
--- /dev/null
+++ b/Documentation/x86/tsx_async_abort.rst
@@ -0,0 +1,117 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+TSX Async Abort (TAA) mitigation
+================================
+
+.. _tsx_async_abort:
+
+Overview
+--------
+
+TSX Async Abort (TAA) is a side channel attack on internal buffers in some
+Intel processors similar to Microarchitectural Data Sampling (MDS). In this
+case certain loads may speculatively pass invalid data to dependent operations
+when an asynchronous abort condition is pending in a Transactional
+Synchronization Extensions (TSX) transaction. This includes loads with no
+fault or assist condition. Such loads may speculatively expose stale data from
+the same uarch data structures as in MDS, with the same scope of exposure,
+i.e. same-thread and cross-thread. This issue affects all current processors
+that support TSX.
+
+Mitigation strategy
+-------------------
+
+a) TSX disable - one of the mitigations is to disable TSX. A new MSR,
+IA32_TSX_CTRL, is available on future processors and on current processors
+after a microcode update, and can be used to disable TSX. In addition, it
+controls the enumeration of the TSX feature bits (RTM and HLE) in CPUID.
+
+b) Clear CPU buffers - similar to MDS, clearing the CPU buffers mitigates this
+vulnerability. More details on this approach can be found in
+:ref:`Documentation/hw-vuln/mds.rst <mds>`.
+
+Kernel internal mitigation modes
+--------------------------------
+
+   =============  ============================================================
+   off            Mitigation is disabled. Either the CPU is not affected or
+                  tsx_async_abort=off is supplied on the kernel command line.
+
+   tsx disabled   Mitigation is enabled. The TSX feature is disabled by
+                  default at bootup on processors that support TSX control.
+
+   verw           Mitigation is enabled. CPU is affected and MD_CLEAR is
+                  advertised in CPUID.
+
+   ucode needed   Mitigation is enabled. CPU is affected and MD_CLEAR is not
+                  advertised in CPUID. That is mainly for virtualization
+                  scenarios where the host has the updated microcode but the
+                  hypervisor does not expose MD_CLEAR in CPUID. It's a best
+                  effort approach without guarantee.
+   =============  ============================================================
+
+If the CPU is affected and the "tsx_async_abort" kernel command line parameter
+is not provided, the kernel selects an appropriate mitigation depending on the
+status of the RTM and MD_CLEAR CPUID bits.
+
+The tables below indicate the impact of the tsx=on|off|auto cmdline options on
+the state of the TAA mitigation, VERW behavior and the TSX feature for various
+combinations of MSR_IA32_ARCH_CAPABILITIES bits.
+
+1. "tsx=off"
+
+=========  =========  ============  ============  ==============  ===================  ======================
+MSR_IA32_ARCH_CAPABILITIES bits      Result with cmdline tsx=off
+-----------------------------------  -------------------------------------------------------------------------
+TAA_NO     MDS_NO     TSX_CTRL_MSR   TSX state     VERW can clear  TAA mitigation       TAA mitigation
+                                     after bootup  CPU buffers     tsx_async_abort=off  tsx_async_abort=full
+=========  =========  ============  ============  ==============  ===================  ======================
+  0          0           0           HW default    Yes             Same as MDS          Same as MDS
+  0          0           1           Invalid case  Invalid case    Invalid case         Invalid case
+  0          1           0           HW default    No              Need ucode update    Need ucode update
+  0          1           1           Disabled      Yes             TSX disabled         TSX disabled
+  1          X           1           Disabled      X               None needed          None needed
+=========  =========  ============  ============  ==============  ===================  ======================
+
+2. "tsx=on"
+
+=========  =========  ============  ============  ==============  ===================  ======================
+MSR_IA32_ARCH_CAPABILITIES bits      Result with cmdline tsx=on
+-----------------------------------  -------------------------------------------------------------------------
+TAA_NO     MDS_NO     TSX_CTRL_MSR   TSX state     VERW can clear  TAA mitigation       TAA mitigation
+                                     after bootup  CPU buffers     tsx_async_abort=off  tsx_async_abort=full
+=========  =========  ============  ============  ==============  ===================  ======================
+  0          0           0           HW default    Yes             Same as MDS          Same as MDS
+  0          0           1           Invalid case  Invalid case    Invalid case         Invalid case
+  0          1           0           HW default    No              Need ucode update    Need ucode update
+  0          1           1           Enabled       Yes             None                 Same as MDS
+  1          X           1           Enabled       X               None needed          None needed
+=========  =========  ============  ============  ==============  ===================  ======================
+
+3. "tsx=auto"
+
+=========  =========  ============  ============  ==============  ===================  ======================
+MSR_IA32_ARCH_CAPABILITIES bits      Result with cmdline tsx=auto
+-----------------------------------  -------------------------------------------------------------------------
+TAA_NO     MDS_NO     TSX_CTRL_MSR   TSX state     VERW can clear  TAA mitigation       TAA mitigation
+                                     after bootup  CPU buffers     tsx_async_abort=off  tsx_async_abort=full
+=========  =========  ============  ============  ==============  ===================  ======================
+  0          0           0           HW default    Yes             Same as MDS          Same as MDS
+  0          0           1           Invalid case  Invalid case    Invalid case         Invalid case
+  0          1           0           HW default    No              Need ucode update    Need ucode update
+  0          1           1           Disabled      Yes             TSX disabled         TSX disabled
+  1          X           1           Enabled       X               None needed          None needed
+=========  =========  ============  ============  ==============  ===================  ======================
+
+In the tables, TSX_CTRL_MSR is a new bit in MSR_IA32_ARCH_CAPABILITIES that
+indicates whether MSR_IA32_TSX_CTRL is supported.
+
+There are two control bits in IA32_TSX_CTRL MSR:
+
+      Bit 0: When set, it disables the Restricted Transactional Memory (RTM)
+             sub-feature of TSX (will force all transactions to abort on the
+             XBEGIN instruction).
+
+      Bit 1: When set, it disables the enumeration of the RTM and HLE features
+             (i.e. it will make CPUID(EAX=7).EBX{bit4} and
+             CPUID(EAX=7).EBX{bit11} read as 0).
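As a sketch of how these two bits are meant to be used (kernel-style code that
only restates the MSR layout above; the actual implementation is in the tsx.c
file this patch adds to arch/x86/kernel/cpu/, which is outside this excerpt):

	/* Force-disable TSX; only valid when ARCH_CAP_TSX_CTRL_MSR is set. */
	u64 tsx;

	rdmsrl(MSR_IA32_TSX_CTRL, tsx);
	tsx |= TSX_CTRL_RTM_DISABLE;	/* Bit 0: abort all RTM transactions */
	tsx |= TSX_CTRL_CPUID_CLEAR;	/* Bit 1: hide RTM/HLE in CPUID */
	wrmsrl(MSR_IA32_TSX_CTRL, tsx);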
diff --git a/Makefile b/Makefile
index 4741bbdfaa10..1e322e669301 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 VERSION = 4
 PATCHLEVEL = 9
-SUBLEVEL = 201
+SUBLEVEL = 202
 EXTRAVERSION =
 NAME = Roaring Lionus
 
diff --git a/arch/mips/bcm63xx/reset.c b/arch/mips/bcm63xx/reset.c
index d1fe51edf5e6..4d411da2497b 100644
--- a/arch/mips/bcm63xx/reset.c
+++ b/arch/mips/bcm63xx/reset.c
@@ -119,7 +119,7 @@
 #define BCM6368_RESET_DSL	0
 #define BCM6368_RESET_SAR	SOFTRESET_6368_SAR_MASK
 #define BCM6368_RESET_EPHY	SOFTRESET_6368_EPHY_MASK
-#define BCM6368_RESET_ENETSW	0
+#define BCM6368_RESET_ENETSW	SOFTRESET_6368_ENETSW_MASK
 #define BCM6368_RESET_PCM	SOFTRESET_6368_PCM_MASK
 #define BCM6368_RESET_MPI	SOFTRESET_6368_MPI_MASK
 #define BCM6368_RESET_PCIE	0
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 3dc96b455e0c..37c254677ccd 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -1422,13 +1422,13 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	kvm->arch.sca = (struct bsca_block *) get_zeroed_page(alloc_flags);
 	if (!kvm->arch.sca)
 		goto out_err;
-	spin_lock(&kvm_lock);
+	mutex_lock(&kvm_lock);
 	sca_offset += 16;
 	if (sca_offset + sizeof(struct bsca_block) > PAGE_SIZE)
 		sca_offset = 0;
 	kvm->arch.sca = (struct bsca_block *)
 			((char *) kvm->arch.sca + sca_offset);
-	spin_unlock(&kvm_lock);
+	mutex_unlock(&kvm_lock);
 
 	sprintf(debug_name, "kvm-%u", current->pid);
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e0055b4302d6..1067f7668c4e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1755,6 +1755,51 @@ config X86_INTEL_MEMORY_PROTECTION_KEYS
 
 	  If unsure, say y.
 
+choice
+	prompt "TSX enable mode"
+	depends on CPU_SUP_INTEL
+	default X86_INTEL_TSX_MODE_OFF
+	help
+	  Intel's TSX (Transactional Synchronization Extensions) feature
+	  allows optimizing locking protocols through lock elision, which
+	  can lead to a noticeable performance boost.
+
+	  On the other hand it has been shown that TSX can be exploited
+	  to form side channel attacks (e.g. TAA) and chances are there
+	  will be more of those attacks discovered in the future.
+
+	  Therefore TSX is not enabled by default (aka tsx=off). An admin
+	  might override this decision with the tsx=on command line parameter.
+	  Even with TSX enabled, the kernel will attempt to enable the best
+	  possible TAA mitigation setting depending on the microcode available
+	  for the particular machine.
+
+	  This option allows setting the default tsx mode between tsx=on, =off
+	  and =auto. See Documentation/kernel-parameters.txt for more
+	  details.
+
+	  Say off if unsure, auto if TSX is in use but should only be enabled
+	  on safe platforms, or on if TSX is in use and the security aspects
+	  of TSX are not relevant.
+
+config X86_INTEL_TSX_MODE_OFF
+	bool "off"
+	help
+	  TSX is disabled if possible - equivalent to the tsx=off command
+	  line parameter.
+
+config X86_INTEL_TSX_MODE_ON
+	bool "on"
+	help
+	  TSX is always enabled on TSX capable HW - equivalent to the tsx=on
+	  command line parameter.
+
+config X86_INTEL_TSX_MODE_AUTO
+	bool "auto"
+	help
+	  TSX is enabled on TSX capable HW that is believed to be safe against
+	  side channel attacks - equivalent to the tsx=auto command line
+	  parameter.
+endchoice
+
 config EFI
 	bool "EFI runtime service support"
 	depends on ACPI
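For example, a kernel configuration that keeps the documented default would
carry the following fragment (illustrative .config lines built from the
symbols defined above):

	CONFIG_X86_INTEL_TSX_MODE_OFF=y
	# CONFIG_X86_INTEL_TSX_MODE_ON is not set
	# CONFIG_X86_INTEL_TSX_MODE_AUTO is not set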
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 3a972da155d6..ccc4420f051b 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -357,5 +357,7 @@
 #define X86_BUG_MDS		X86_BUG(19) /* CPU is affected by Microarchitectural data sampling */
 #define X86_BUG_MSBDS_ONLY	X86_BUG(20) /* CPU is only affected by the MSBDS variant of BUG_MDS */
 #define X86_BUG_SWAPGS		X86_BUG(21) /* CPU is affected by speculation through SWAPGS */
+#define X86_BUG_TAA		X86_BUG(22) /* CPU is affected by TSX Async Abort (TAA) */
+#define X86_BUG_ITLB_MULTIHIT	X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */
 
 #endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 222cb69e1219..d2c14a96ec28 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -261,6 +261,7 @@ struct kvm_rmap_head {
 struct kvm_mmu_page {
 	struct list_head link;
 	struct hlist_node hash_link;
+	struct list_head lpage_disallowed_link;
 
 	/*
 	 * The following two entries are used to key the shadow page in the
@@ -273,6 +274,7 @@ struct kvm_mmu_page {
 	/* hold the gfn of each spte inside spt */
 	gfn_t *gfns;
 	bool unsync;
+	bool lpage_disallowed; /* Can't be replaced by an equiv large page */
 	int root_count;          /* Currently serving as active root */
 	unsigned int unsync_children;
 	struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
@@ -724,6 +726,7 @@ struct kvm_arch {
 	 */
 	struct list_head active_mmu_pages;
 	struct list_head zapped_obsolete_pages;
+	struct list_head lpage_disallowed_mmu_pages;
 	struct kvm_page_track_notifier_node mmu_sp_tracker;
 	struct kvm_page_track_notifier_head track_notifier_head;
 
@@ -798,6 +801,8 @@ struct kvm_arch {
 
 	bool x2apic_format;
 	bool x2apic_broadcast_quirk_disabled;
+
+	struct task_struct *nx_lpage_recovery_thread;
 };
 
 struct kvm_vm_stat {
@@ -811,6 +816,7 @@ struct kvm_vm_stat {
 	ulong mmu_unsync;
 	ulong remote_tlb_flush;
 	ulong lpages;
+	ulong nx_lpage_splits;
 };
 
 struct kvm_vcpu_stat {
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 86166868db8c..8d162e0f2881 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -77,6 +77,18 @@
 						 * Microarchitectural Data
 						 * Sampling (MDS) vulnerabilities.
 						 */
+#define ARCH_CAP_PSCHANGE_MC_NO		BIT(6)	 /*
+						  * The processor is not susceptible to a
+						  * machine check error due to modifying the
+						  * code page size along with either the
+						  * physical address or cache type
+						  * without TLB invalidation.
+						  */
+#define ARCH_CAP_TSX_CTRL_MSR		BIT(7)	 /* MSR for TSX control is available. */
+#define ARCH_CAP_TAA_NO			BIT(8)	 /*
+						  * Not susceptible to
+						  * TSX Async Abort (TAA) vulnerabilities.
+						  */
 
 #define MSR_IA32_FLUSH_CMD		0x0000010b
 #define L1D_FLUSH			BIT(0)	/*
@@ -87,6 +99,10 @@
 #define MSR_IA32_BBL_CR_CTL		0x00000119
 #define MSR_IA32_BBL_CR_CTL3		0x0000011e
 
+#define MSR_IA32_TSX_CTRL		0x00000122
+#define TSX_CTRL_RTM_DISABLE		BIT(0)	/* Disable RTM feature */
+#define TSX_CTRL_CPUID_CLEAR		BIT(1)	/* Disable TSX enumeration */
+
 #define MSR_IA32_SYSENTER_CS		0x00000174
 #define MSR_IA32_SYSENTER_ESP		0x00000175
 #define MSR_IA32_SYSENTER_EIP		0x00000176
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 10a48505abb5..8d56d701b5f7 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -314,7 +314,7 @@ DECLARE_STATIC_KEY_FALSE(mds_idle_clear);
 #include <asm/segment.h>
 
 /**
- * mds_clear_cpu_buffers - Mitigation for MDS vulnerability
+ * mds_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability
  *
  * This uses the otherwise unused and obsolete VERW instruction in
  * combination with microcode which triggers a CPU buffer flush when the
@@ -337,7 +337,7 @@ static inline void mds_clear_cpu_buffers(void)
 }
 
 /**
- * mds_user_clear_cpu_buffers - Mitigation for MDS vulnerability
+ * mds_user_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability
  *
  * Clear CPU buffers if the corresponding static key is enabled
  */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 155e49fc7010..92703fa09c19 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -880,4 +880,11 @@ enum mds_mitigations {
 	MDS_MITIGATION_VMWERV,
 };
 
+enum taa_mitigations {
+	TAA_MITIGATION_OFF,
+	TAA_MITIGATION_UCODE_NEEDED,
+	TAA_MITIGATION_VERW,
+	TAA_MITIGATION_TSX_DISABLED,
+};
+
 #endif /* _ASM_X86_PROCESSOR_H */
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 33b63670bf09..f6e386fe510c 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -25,7 +25,7 @@ obj-y			+= bugs.o
 obj-$(CONFIG_PROC_FS)	+= proc.o
 obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
 
-obj-$(CONFIG_CPU_SUP_INTEL)		+= intel.o
+obj-$(CONFIG_CPU_SUP_INTEL)		+= intel.o tsx.o
 obj-$(CONFIG_CPU_SUP_AMD)		+= amd.o
 obj-$(CONFIG_CPU_SUP_CYRIX_32)		+= cyrix.o
 obj-$(CONFIG_CPU_SUP_CENTAUR)		+= centaur.o
998 | diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c | ||
999 | index 2a42fef275ad..827fc38df97a 100644 | ||
1000 | --- a/arch/x86/kernel/cpu/bugs.c | ||
1001 | +++ b/arch/x86/kernel/cpu/bugs.c | ||
1002 | @@ -31,11 +31,14 @@ | ||
1003 | #include <asm/intel-family.h> | ||
1004 | #include <asm/e820.h> | ||
1005 | |||
1006 | +#include "cpu.h" | ||
1007 | + | ||
1008 | static void __init spectre_v1_select_mitigation(void); | ||
1009 | static void __init spectre_v2_select_mitigation(void); | ||
1010 | static void __init ssb_select_mitigation(void); | ||
1011 | static void __init l1tf_select_mitigation(void); | ||
1012 | static void __init mds_select_mitigation(void); | ||
1013 | +static void __init taa_select_mitigation(void); | ||
1014 | |||
1015 | /* The base value of the SPEC_CTRL MSR that always has to be preserved. */ | ||
1016 | u64 x86_spec_ctrl_base; | ||
1017 | @@ -102,6 +105,7 @@ void __init check_bugs(void) | ||
1018 | ssb_select_mitigation(); | ||
1019 | l1tf_select_mitigation(); | ||
1020 | mds_select_mitigation(); | ||
1021 | + taa_select_mitigation(); | ||
1022 | |||
1023 | arch_smt_update(); | ||
1024 | |||
1025 | @@ -265,6 +269,100 @@ static int __init mds_cmdline(char *str) | ||
1026 | } | ||
1027 | early_param("mds", mds_cmdline); | ||
1028 | |||
1029 | +#undef pr_fmt | ||
1030 | +#define pr_fmt(fmt) "TAA: " fmt | ||
1031 | + | ||
1032 | +/* Default mitigation for TAA-affected CPUs */ | ||
1033 | +static enum taa_mitigations taa_mitigation __ro_after_init = TAA_MITIGATION_VERW; | ||
1034 | +static bool taa_nosmt __ro_after_init; | ||
1035 | + | ||
1036 | +static const char * const taa_strings[] = { | ||
1037 | + [TAA_MITIGATION_OFF] = "Vulnerable", | ||
1038 | + [TAA_MITIGATION_UCODE_NEEDED] = "Vulnerable: Clear CPU buffers attempted, no microcode", | ||
1039 | + [TAA_MITIGATION_VERW] = "Mitigation: Clear CPU buffers", | ||
1040 | + [TAA_MITIGATION_TSX_DISABLED] = "Mitigation: TSX disabled", | ||
1041 | +}; | ||
1042 | + | ||
1043 | +static void __init taa_select_mitigation(void) | ||
1044 | +{ | ||
1045 | + u64 ia32_cap; | ||
1046 | + | ||
1047 | + if (!boot_cpu_has_bug(X86_BUG_TAA)) { | ||
1048 | + taa_mitigation = TAA_MITIGATION_OFF; | ||
1049 | + return; | ||
1050 | + } | ||
1051 | + | ||
1052 | + /* TSX previously disabled by tsx=off */ | ||
1053 | + if (!boot_cpu_has(X86_FEATURE_RTM)) { | ||
1054 | + taa_mitigation = TAA_MITIGATION_TSX_DISABLED; | ||
1055 | + goto out; | ||
1056 | + } | ||
1057 | + | ||
1058 | + if (cpu_mitigations_off()) { | ||
1059 | + taa_mitigation = TAA_MITIGATION_OFF; | ||
1060 | + return; | ||
1061 | + } | ||
1062 | + | ||
1063 | + /* TAA mitigation is turned off on the cmdline (tsx_async_abort=off) */ | ||
1064 | + if (taa_mitigation == TAA_MITIGATION_OFF) | ||
1065 | + goto out; | ||
1066 | + | ||
1067 | + if (boot_cpu_has(X86_FEATURE_MD_CLEAR)) | ||
1068 | + taa_mitigation = TAA_MITIGATION_VERW; | ||
1069 | + else | ||
1070 | + taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; | ||
1071 | + | ||
1072 | + /* | ||
1073 | + * VERW doesn't clear the CPU buffers when MD_CLEAR=1 and MDS_NO=1. | ||
1074 | + * A microcode update fixes this behavior to clear CPU buffers. It also | ||
1075 | + * adds support for MSR_IA32_TSX_CTRL which is enumerated by the | ||
1076 | + * ARCH_CAP_TSX_CTRL_MSR bit. | ||
1077 | + * | ||
1078 | + * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode | ||
1079 | + * update is required. | ||
1080 | + */ | ||
1081 | + ia32_cap = x86_read_arch_cap_msr(); | ||
1082 | + if ( (ia32_cap & ARCH_CAP_MDS_NO) && | ||
1083 | + !(ia32_cap & ARCH_CAP_TSX_CTRL_MSR)) | ||
1084 | + taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; | ||
1085 | + | ||
1086 | + /* | ||
1087 | + * TSX is enabled, select alternate mitigation for TAA which is | ||
1088 | + * the same as MDS. Enable MDS static branch to clear CPU buffers. | ||
1089 | + * | ||
1090 | + * For guests that can't determine whether the correct microcode is | ||
1091 | + * present on host, enable the mitigation for UCODE_NEEDED as well. | ||
1092 | + */ | ||
1093 | + static_branch_enable(&mds_user_clear); | ||
1094 | + | ||
1095 | + if (taa_nosmt || cpu_mitigations_auto_nosmt()) | ||
1096 | + cpu_smt_disable(false); | ||
1097 | + | ||
1098 | +out: | ||
1099 | + pr_info("%s\n", taa_strings[taa_mitigation]); | ||
1100 | +} | ||
1101 | + | ||
1102 | +static int __init tsx_async_abort_parse_cmdline(char *str) | ||
1103 | +{ | ||
1104 | + if (!boot_cpu_has_bug(X86_BUG_TAA)) | ||
1105 | + return 0; | ||
1106 | + | ||
1107 | + if (!str) | ||
1108 | + return -EINVAL; | ||
1109 | + | ||
1110 | + if (!strcmp(str, "off")) { | ||
1111 | + taa_mitigation = TAA_MITIGATION_OFF; | ||
1112 | + } else if (!strcmp(str, "full")) { | ||
1113 | + taa_mitigation = TAA_MITIGATION_VERW; | ||
1114 | + } else if (!strcmp(str, "full,nosmt")) { | ||
1115 | + taa_mitigation = TAA_MITIGATION_VERW; | ||
1116 | + taa_nosmt = true; | ||
1117 | + } | ||
1118 | + | ||
1119 | + return 0; | ||
1120 | +} | ||
1121 | +early_param("tsx_async_abort", tsx_async_abort_parse_cmdline); | ||
1122 | + | ||
1123 | #undef pr_fmt | ||
1124 | #define pr_fmt(fmt) "Spectre V1 : " fmt | ||
1125 | |||
1126 | @@ -780,13 +878,10 @@ static void update_mds_branch_idle(void) | ||
1127 | } | ||
1128 | |||
1129 | #define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n" | ||
1130 | +#define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n" | ||
1131 | |||
1132 | void arch_smt_update(void) | ||
1133 | { | ||
1134 | - /* Enhanced IBRS implies STIBP. No update required. */ | ||
1135 | - if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) | ||
1136 | - return; | ||
1137 | - | ||
1138 | mutex_lock(&spec_ctrl_mutex); | ||
1139 | |||
1140 | switch (spectre_v2_user) { | ||
1141 | @@ -812,6 +907,17 @@ void arch_smt_update(void) | ||
1142 | break; | ||
1143 | } | ||
1144 | |||
1145 | + switch (taa_mitigation) { | ||
1146 | + case TAA_MITIGATION_VERW: | ||
1147 | + case TAA_MITIGATION_UCODE_NEEDED: | ||
1148 | + if (sched_smt_active()) | ||
1149 | + pr_warn_once(TAA_MSG_SMT); | ||
1150 | + break; | ||
1151 | + case TAA_MITIGATION_TSX_DISABLED: | ||
1152 | + case TAA_MITIGATION_OFF: | ||
1153 | + break; | ||
1154 | + } | ||
1155 | + | ||
1156 | mutex_unlock(&spec_ctrl_mutex); | ||
1157 | } | ||
1158 | |||
1159 | @@ -1127,6 +1233,9 @@ void x86_spec_ctrl_setup_ap(void) | ||
1160 | x86_amd_ssb_disable(); | ||
1161 | } | ||
1162 | |||
1163 | +bool itlb_multihit_kvm_mitigation; | ||
1164 | +EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation); | ||
1165 | + | ||
1166 | #undef pr_fmt | ||
1167 | #define pr_fmt(fmt) "L1TF: " fmt | ||
1168 | |||
1169 | @@ -1282,11 +1391,24 @@ static ssize_t l1tf_show_state(char *buf) | ||
1170 | l1tf_vmx_states[l1tf_vmx_mitigation], | ||
1171 | sched_smt_active() ? "vulnerable" : "disabled"); | ||
1172 | } | ||
1173 | + | ||
1174 | +static ssize_t itlb_multihit_show_state(char *buf) | ||
1175 | +{ | ||
1176 | + if (itlb_multihit_kvm_mitigation) | ||
1177 | + return sprintf(buf, "KVM: Mitigation: Split huge pages\n"); | ||
1178 | + else | ||
1179 | + return sprintf(buf, "KVM: Vulnerable\n"); | ||
1180 | +} | ||
1181 | #else | ||
1182 | static ssize_t l1tf_show_state(char *buf) | ||
1183 | { | ||
1184 | return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG); | ||
1185 | } | ||
1186 | + | ||
1187 | +static ssize_t itlb_multihit_show_state(char *buf) | ||
1188 | +{ | ||
1189 | + return sprintf(buf, "Processor vulnerable\n"); | ||
1190 | +} | ||
1191 | #endif | ||
1192 | |||
1193 | static ssize_t mds_show_state(char *buf) | ||
1194 | @@ -1308,6 +1430,21 @@ static ssize_t mds_show_state(char *buf) | ||
1195 | sched_smt_active() ? "vulnerable" : "disabled"); | ||
1196 | } | ||
1197 | |||
1198 | +static ssize_t tsx_async_abort_show_state(char *buf) | ||
1199 | +{ | ||
1200 | + if ((taa_mitigation == TAA_MITIGATION_TSX_DISABLED) || | ||
1201 | + (taa_mitigation == TAA_MITIGATION_OFF)) | ||
1202 | + return sprintf(buf, "%s\n", taa_strings[taa_mitigation]); | ||
1203 | + | ||
1204 | + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { | ||
1205 | + return sprintf(buf, "%s; SMT Host state unknown\n", | ||
1206 | + taa_strings[taa_mitigation]); | ||
1207 | + } | ||
1208 | + | ||
1209 | + return sprintf(buf, "%s; SMT %s\n", taa_strings[taa_mitigation], | ||
1210 | + sched_smt_active() ? "vulnerable" : "disabled"); | ||
1211 | +} | ||
1212 | + | ||
1213 | static char *stibp_state(void) | ||
1214 | { | ||
1215 | if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) | ||
1216 | @@ -1373,6 +1510,12 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr | ||
1217 | case X86_BUG_MDS: | ||
1218 | return mds_show_state(buf); | ||
1219 | |||
1220 | + case X86_BUG_TAA: | ||
1221 | + return tsx_async_abort_show_state(buf); | ||
1222 | + | ||
1223 | + case X86_BUG_ITLB_MULTIHIT: | ||
1224 | + return itlb_multihit_show_state(buf); | ||
1225 | + | ||
1226 | default: | ||
1227 | break; | ||
1228 | } | ||
1229 | @@ -1409,4 +1552,14 @@ ssize_t cpu_show_mds(struct device *dev, struct device_attribute *attr, char *bu | ||
1230 | { | ||
1231 | return cpu_show_common(dev, attr, buf, X86_BUG_MDS); | ||
1232 | } | ||
1233 | + | ||
1234 | +ssize_t cpu_show_tsx_async_abort(struct device *dev, struct device_attribute *attr, char *buf) | ||
1235 | +{ | ||
1236 | + return cpu_show_common(dev, attr, buf, X86_BUG_TAA); | ||
1237 | +} | ||
1238 | + | ||
1239 | +ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr, char *buf) | ||
1240 | +{ | ||
1241 | + return cpu_show_common(dev, attr, buf, X86_BUG_ITLB_MULTIHIT); | ||
1242 | +} | ||
1243 | #endif | ||
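
The cpu_show_tsx_async_abort() and cpu_show_itlb_multihit() handlers added above are what back the new sysfs vulnerability files. As a quick way to see the resulting mitigation strings on a running kernel, here is a minimal userspace sketch; it is not part of the patch, and the /sys/devices/system/cpu/vulnerabilities/ paths are an assumption based on the standard naming of these attributes.

/* Hedged sketch, not from the patch: print the two new vulnerability files. */
#include <stdio.h>

static void show(const char *name)
{
	char path[256], line[256];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/devices/system/cpu/vulnerabilities/%s", name);
	f = fopen(path, "r");
	if (!f) {
		printf("%s: <not present on this kernel>\n", name);
		return;
	}
	if (fgets(line, sizeof(line), f))
		printf("%s: %s", name, line);	/* file contents end in \n */
	fclose(f);
}

int main(void)
{
	show("tsx_async_abort");
	show("itlb_multihit");
	return 0;
}
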
1244 | diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c | ||
1245 | index 12fa16051871..477df9782fdf 100644 | ||
1246 | --- a/arch/x86/kernel/cpu/common.c | ||
1247 | +++ b/arch/x86/kernel/cpu/common.c | ||
1248 | @@ -891,13 +891,14 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) | ||
1249 | c->x86_cache_bits = c->x86_phys_bits; | ||
1250 | } | ||
1251 | |||
1252 | -#define NO_SPECULATION BIT(0) | ||
1253 | -#define NO_MELTDOWN BIT(1) | ||
1254 | -#define NO_SSB BIT(2) | ||
1255 | -#define NO_L1TF BIT(3) | ||
1256 | -#define NO_MDS BIT(4) | ||
1257 | -#define MSBDS_ONLY BIT(5) | ||
1258 | -#define NO_SWAPGS BIT(6) | ||
1259 | +#define NO_SPECULATION BIT(0) | ||
1260 | +#define NO_MELTDOWN BIT(1) | ||
1261 | +#define NO_SSB BIT(2) | ||
1262 | +#define NO_L1TF BIT(3) | ||
1263 | +#define NO_MDS BIT(4) | ||
1264 | +#define MSBDS_ONLY BIT(5) | ||
1265 | +#define NO_SWAPGS BIT(6) | ||
1266 | +#define NO_ITLB_MULTIHIT BIT(7) | ||
1267 | |||
1268 | #define VULNWL(_vendor, _family, _model, _whitelist) \ | ||
1269 | { X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist } | ||
1270 | @@ -915,26 +916,26 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { | ||
1271 | VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION), | ||
1272 | |||
1273 | /* Intel Family 6 */ | ||
1274 | - VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION), | ||
1275 | - VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION), | ||
1276 | - VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION), | ||
1277 | - VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION), | ||
1278 | - VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION), | ||
1279 | - | ||
1280 | - VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), | ||
1281 | - VULNWL_INTEL(ATOM_SILVERMONT_X, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), | ||
1282 | - VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), | ||
1283 | - VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), | ||
1284 | - VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), | ||
1285 | - VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), | ||
1286 | + VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT), | ||
1287 | + VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT), | ||
1288 | + VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), | ||
1289 | + VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION | NO_ITLB_MULTIHIT), | ||
1290 | + VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), | ||
1291 | + | ||
1292 | + VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), | ||
1293 | + VULNWL_INTEL(ATOM_SILVERMONT_X, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), | ||
1294 | + VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), | ||
1295 | + VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), | ||
1296 | + VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), | ||
1297 | + VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), | ||
1298 | |||
1299 | VULNWL_INTEL(CORE_YONAH, NO_SSB), | ||
1300 | |||
1301 | - VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS), | ||
1302 | + VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), | ||
1303 | |||
1304 | - VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS), | ||
1305 | - VULNWL_INTEL(ATOM_GOLDMONT_X, NO_MDS | NO_L1TF | NO_SWAPGS), | ||
1306 | - VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS), | ||
1307 | + VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), | ||
1308 | + VULNWL_INTEL(ATOM_GOLDMONT_X, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), | ||
1309 | + VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), | ||
1310 | |||
1311 | /* | ||
1312 | * Technically, swapgs isn't serializing on AMD (despite it previously | ||
1313 | @@ -945,13 +946,13 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { | ||
1314 | */ | ||
1315 | |||
1316 | /* AMD Family 0xf - 0x12 */ | ||
1317 | - VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), | ||
1318 | - VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), | ||
1319 | - VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), | ||
1320 | - VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), | ||
1321 | + VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), | ||
1322 | + VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), | ||
1323 | + VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), | ||
1324 | + VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), | ||
1325 | |||
1326 | /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */ | ||
1327 | - VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS), | ||
1328 | + VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), | ||
1329 | {} | ||
1330 | }; | ||
1331 | |||
1332 | @@ -962,19 +963,30 @@ static bool __init cpu_matches(unsigned long which) | ||
1333 | return m && !!(m->driver_data & which); | ||
1334 | } | ||
1335 | |||
1336 | -static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) | ||
1337 | +u64 x86_read_arch_cap_msr(void) | ||
1338 | { | ||
1339 | u64 ia32_cap = 0; | ||
1340 | |||
1341 | + if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) | ||
1342 | + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); | ||
1343 | + | ||
1344 | + return ia32_cap; | ||
1345 | +} | ||
1346 | + | ||
1347 | +static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) | ||
1348 | +{ | ||
1349 | + u64 ia32_cap = x86_read_arch_cap_msr(); | ||
1350 | + | ||
1351 | + /* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */ | ||
1352 | + if (!cpu_matches(NO_ITLB_MULTIHIT) && !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO)) | ||
1353 | + setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT); | ||
1354 | + | ||
1355 | if (cpu_matches(NO_SPECULATION)) | ||
1356 | return; | ||
1357 | |||
1358 | setup_force_cpu_bug(X86_BUG_SPECTRE_V1); | ||
1359 | setup_force_cpu_bug(X86_BUG_SPECTRE_V2); | ||
1360 | |||
1361 | - if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES)) | ||
1362 | - rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); | ||
1363 | - | ||
1364 | if (!cpu_matches(NO_SSB) && !(ia32_cap & ARCH_CAP_SSB_NO) && | ||
1365 | !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) | ||
1366 | setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); | ||
1367 | @@ -991,6 +1003,21 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) | ||
1368 | if (!cpu_matches(NO_SWAPGS)) | ||
1369 | setup_force_cpu_bug(X86_BUG_SWAPGS); | ||
1370 | |||
1371 | + /* | ||
1372 | + * When the CPU is not mitigated for TAA (TAA_NO=0), set the TAA bug when: | ||
1373 | + * - TSX is supported or | ||
1374 | + * - TSX_CTRL is present | ||
1375 | + * | ||
1376 | + * TSX_CTRL check is needed for cases when TSX could be disabled before | ||
1377 | + * the kernel boot e.g. kexec. | ||
1378 | + * TSX_CTRL check alone is not sufficient for cases when the microcode | ||
1379 | + * update is not present or running as guest that don't get TSX_CTRL. | ||
1380 | + */ | ||
1381 | + if (!(ia32_cap & ARCH_CAP_TAA_NO) && | ||
1382 | + (cpu_has(c, X86_FEATURE_RTM) || | ||
1383 | + (ia32_cap & ARCH_CAP_TSX_CTRL_MSR))) | ||
1384 | + setup_force_cpu_bug(X86_BUG_TAA); | ||
1385 | + | ||
1386 | if (cpu_matches(NO_MELTDOWN)) | ||
1387 | return; | ||
1388 | |||
1389 | @@ -1409,6 +1436,8 @@ void __init identify_boot_cpu(void) | ||
1390 | enable_sep_cpu(); | ||
1391 | #endif | ||
1392 | cpu_detect_tlb(&boot_cpu_data); | ||
1393 | + | ||
1394 | + tsx_init(); | ||
1395 | } | ||
1396 | |||
1397 | void identify_secondary_cpu(struct cpuinfo_x86 *c) | ||
1398 | diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h | ||
1399 | index 2275900d4d1b..4350f50b5deb 100644 | ||
1400 | --- a/arch/x86/kernel/cpu/cpu.h | ||
1401 | +++ b/arch/x86/kernel/cpu/cpu.h | ||
1402 | @@ -44,6 +44,22 @@ struct _tlb_table { | ||
1403 | extern const struct cpu_dev *const __x86_cpu_dev_start[], | ||
1404 | *const __x86_cpu_dev_end[]; | ||
1405 | |||
1406 | +#ifdef CONFIG_CPU_SUP_INTEL | ||
1407 | +enum tsx_ctrl_states { | ||
1408 | + TSX_CTRL_ENABLE, | ||
1409 | + TSX_CTRL_DISABLE, | ||
1410 | + TSX_CTRL_NOT_SUPPORTED, | ||
1411 | +}; | ||
1412 | + | ||
1413 | +extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state; | ||
1414 | + | ||
1415 | +extern void __init tsx_init(void); | ||
1416 | +extern void tsx_enable(void); | ||
1417 | +extern void tsx_disable(void); | ||
1418 | +#else | ||
1419 | +static inline void tsx_init(void) { } | ||
1420 | +#endif /* CONFIG_CPU_SUP_INTEL */ | ||
1421 | + | ||
1422 | extern void get_cpu_cap(struct cpuinfo_x86 *c); | ||
1423 | extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); | ||
1424 | extern int detect_extended_topology_early(struct cpuinfo_x86 *c); | ||
1425 | @@ -51,4 +67,6 @@ extern int detect_ht_early(struct cpuinfo_x86 *c); | ||
1426 | |||
1427 | extern void x86_spec_ctrl_setup_ap(void); | ||
1428 | |||
1429 | +extern u64 x86_read_arch_cap_msr(void); | ||
1430 | + | ||
1431 | #endif /* ARCH_X86_CPU_H */ | ||
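
The cpu_set_bug_bits() changes above key everything off three IA32_ARCH_CAPABILITIES bits: PSCHANGE_MC_NO suppresses X86_BUG_ITLB_MULTIHIT, TAA_NO suppresses X86_BUG_TAA, and TSX_CTRL_MSR both feeds the TAA check and gates the new tsx.c code. The sketch below reads the same MSR from userspace through the msr driver; it is illustrative only, assumes root and a loaded msr module, and the index 0x10a plus the bit positions are taken from msr-index.h as used by this series.

/* Hedged sketch: read IA32_ARCH_CAPABILITIES (0x10a) via /dev/cpu/0/msr. */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#define MSR_IA32_ARCH_CAPABILITIES	0x10a
#define ARCH_CAP_PSCHANGE_MC_NO		(1ULL << 6)	/* bit layout per msr-index.h */
#define ARCH_CAP_TSX_CTRL_MSR		(1ULL << 7)
#define ARCH_CAP_TAA_NO			(1ULL << 8)

int main(void)
{
	uint64_t cap = 0;
	int fd = open("/dev/cpu/0/msr", O_RDONLY);

	if (fd < 0 || pread(fd, &cap, sizeof(cap),
			    MSR_IA32_ARCH_CAPABILITIES) != sizeof(cap)) {
		perror("rdmsr");	/* older CPUs do not enumerate the MSR */
		return 1;
	}
	printf("PSCHANGE_MC_NO=%d TSX_CTRL=%d TAA_NO=%d\n",
	       !!(cap & ARCH_CAP_PSCHANGE_MC_NO),
	       !!(cap & ARCH_CAP_TSX_CTRL_MSR),
	       !!(cap & ARCH_CAP_TAA_NO));
	close(fd);
	return 0;
}
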
1432 | diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c | ||
1433 | index 860f2fd9f540..476a9d5c2f35 100644 | ||
1434 | --- a/arch/x86/kernel/cpu/intel.c | ||
1435 | +++ b/arch/x86/kernel/cpu/intel.c | ||
1436 | @@ -642,6 +642,11 @@ static void init_intel(struct cpuinfo_x86 *c) | ||
1437 | detect_vmx_virtcap(c); | ||
1438 | |||
1439 | init_intel_energy_perf(c); | ||
1440 | + | ||
1441 | + if (tsx_ctrl_state == TSX_CTRL_ENABLE) | ||
1442 | + tsx_enable(); | ||
1443 | + if (tsx_ctrl_state == TSX_CTRL_DISABLE) | ||
1444 | + tsx_disable(); | ||
1445 | } | ||
1446 | |||
1447 | #ifdef CONFIG_X86_32 | ||
1448 | diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c | ||
1449 | new file mode 100644 | ||
1450 | index 000000000000..3e20d322bc98 | ||
1451 | --- /dev/null | ||
1452 | +++ b/arch/x86/kernel/cpu/tsx.c | ||
1453 | @@ -0,0 +1,140 @@ | ||
1454 | +// SPDX-License-Identifier: GPL-2.0 | ||
1455 | +/* | ||
1456 | + * Intel Transactional Synchronization Extensions (TSX) control. | ||
1457 | + * | ||
1458 | + * Copyright (C) 2019 Intel Corporation | ||
1459 | + * | ||
1460 | + * Author: | ||
1461 | + * Pawan Gupta <pawan.kumar.gupta@linux.intel.com> | ||
1462 | + */ | ||
1463 | + | ||
1464 | +#include <linux/cpufeature.h> | ||
1465 | + | ||
1466 | +#include <asm/cmdline.h> | ||
1467 | + | ||
1468 | +#include "cpu.h" | ||
1469 | + | ||
1470 | +enum tsx_ctrl_states tsx_ctrl_state __ro_after_init = TSX_CTRL_NOT_SUPPORTED; | ||
1471 | + | ||
1472 | +void tsx_disable(void) | ||
1473 | +{ | ||
1474 | + u64 tsx; | ||
1475 | + | ||
1476 | + rdmsrl(MSR_IA32_TSX_CTRL, tsx); | ||
1477 | + | ||
1478 | + /* Force all transactions to immediately abort */ | ||
1479 | + tsx |= TSX_CTRL_RTM_DISABLE; | ||
1480 | + | ||
1481 | + /* | ||
1482 | + * Ensure TSX support is not enumerated in CPUID. | ||
1483 | + * This is visible to userspace and ensures that it | ||
1484 | + * does not waste resources trying TSX transactions that | ||
1485 | + * will always abort. | ||
1486 | + */ | ||
1487 | + tsx |= TSX_CTRL_CPUID_CLEAR; | ||
1488 | + | ||
1489 | + wrmsrl(MSR_IA32_TSX_CTRL, tsx); | ||
1490 | +} | ||
1491 | + | ||
1492 | +void tsx_enable(void) | ||
1493 | +{ | ||
1494 | + u64 tsx; | ||
1495 | + | ||
1496 | + rdmsrl(MSR_IA32_TSX_CTRL, tsx); | ||
1497 | + | ||
1498 | + /* Enable the RTM feature in the cpu */ | ||
1499 | + tsx &= ~TSX_CTRL_RTM_DISABLE; | ||
1500 | + | ||
1501 | + /* | ||
1502 | + * Ensure TSX support is enumerated in CPUID. | ||
1503 | + * This is visible to userspace and ensures that it | ||
1504 | + * can enumerate and use the TSX feature. | ||
1505 | + */ | ||
1506 | + tsx &= ~TSX_CTRL_CPUID_CLEAR; | ||
1507 | + | ||
1508 | + wrmsrl(MSR_IA32_TSX_CTRL, tsx); | ||
1509 | +} | ||
1510 | + | ||
1511 | +static bool __init tsx_ctrl_is_supported(void) | ||
1512 | +{ | ||
1513 | + u64 ia32_cap = x86_read_arch_cap_msr(); | ||
1514 | + | ||
1515 | + /* | ||
1516 | + * TSX is controlled via MSR_IA32_TSX_CTRL. However, support for this | ||
1517 | + * MSR is enumerated by ARCH_CAP_TSX_MSR bit in MSR_IA32_ARCH_CAPABILITIES. | ||
1518 | + * | ||
1519 | + * TSX control (aka MSR_IA32_TSX_CTRL) is only available after a | ||
1520 | + * microcode update on CPUs that have their MSR_IA32_ARCH_CAPABILITIES | ||
1521 | + * bit MDS_NO=1. CPUs with MDS_NO=0 are not planned to get | ||
1522 | + * MSR_IA32_TSX_CTRL support even after a microcode update. Thus, | ||
1523 | + * tsx= cmdline requests will do nothing on CPUs without | ||
1524 | + * MSR_IA32_TSX_CTRL support. | ||
1525 | + */ | ||
1526 | + return !!(ia32_cap & ARCH_CAP_TSX_CTRL_MSR); | ||
1527 | +} | ||
1528 | + | ||
1529 | +static enum tsx_ctrl_states x86_get_tsx_auto_mode(void) | ||
1530 | +{ | ||
1531 | + if (boot_cpu_has_bug(X86_BUG_TAA)) | ||
1532 | + return TSX_CTRL_DISABLE; | ||
1533 | + | ||
1534 | + return TSX_CTRL_ENABLE; | ||
1535 | +} | ||
1536 | + | ||
1537 | +void __init tsx_init(void) | ||
1538 | +{ | ||
1539 | + char arg[5] = {}; | ||
1540 | + int ret; | ||
1541 | + | ||
1542 | + if (!tsx_ctrl_is_supported()) | ||
1543 | + return; | ||
1544 | + | ||
1545 | + ret = cmdline_find_option(boot_command_line, "tsx", arg, sizeof(arg)); | ||
1546 | + if (ret >= 0) { | ||
1547 | + if (!strcmp(arg, "on")) { | ||
1548 | + tsx_ctrl_state = TSX_CTRL_ENABLE; | ||
1549 | + } else if (!strcmp(arg, "off")) { | ||
1550 | + tsx_ctrl_state = TSX_CTRL_DISABLE; | ||
1551 | + } else if (!strcmp(arg, "auto")) { | ||
1552 | + tsx_ctrl_state = x86_get_tsx_auto_mode(); | ||
1553 | + } else { | ||
1554 | + tsx_ctrl_state = TSX_CTRL_DISABLE; | ||
1555 | + pr_err("tsx: invalid option, defaulting to off\n"); | ||
1556 | + } | ||
1557 | + } else { | ||
1558 | + /* tsx= not provided */ | ||
1559 | + if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_AUTO)) | ||
1560 | + tsx_ctrl_state = x86_get_tsx_auto_mode(); | ||
1561 | + else if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_OFF)) | ||
1562 | + tsx_ctrl_state = TSX_CTRL_DISABLE; | ||
1563 | + else | ||
1564 | + tsx_ctrl_state = TSX_CTRL_ENABLE; | ||
1565 | + } | ||
1566 | + | ||
1567 | + if (tsx_ctrl_state == TSX_CTRL_DISABLE) { | ||
1568 | + tsx_disable(); | ||
1569 | + | ||
1570 | + /* | ||
1571 | + * tsx_disable() will change the state of the | ||
1572 | + * RTM CPUID bit. Clear it here since it is now | ||
1573 | + * expected not to be set. | ||
1574 | + */ | ||
1575 | + setup_clear_cpu_cap(X86_FEATURE_RTM); | ||
1576 | + } else if (tsx_ctrl_state == TSX_CTRL_ENABLE) { | ||
1577 | + | ||
1578 | + /* | ||
1579 | + * HW defaults TSX to be enabled at bootup. | ||
1580 | + * We may still need the TSX enable support | ||
1581 | + * during init for special cases like | ||
1582 | + * kexec after TSX is disabled. | ||
1583 | + */ | ||
1584 | + tsx_enable(); | ||
1585 | + | ||
1586 | + /* | ||
1587 | + * tsx_enable() will change the state of the | ||
1588 | + * RTM CPUID bit. Force it here since it is now | ||
1589 | + * expected to be set. | ||
1590 | + */ | ||
1591 | + setup_force_cpu_cap(X86_FEATURE_RTM); | ||
1592 | + } | ||
1593 | +} | ||
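
To summarize tsx_init() above: an explicit tsx= command line option wins whenever MSR_IA32_TSX_CTRL exists, an invalid value degrades to "off", and without a tsx= option the compiled-in CONFIG_X86_INTEL_TSX_MODE_* default applies, with "auto" disabling TSX only on TAA-affected parts. The pure function below restates that decision table for reading along; it is a sketch, not kernel code, and the kconfig_* flags stand in for the IS_ENABLED() checks.

/* Hedged restatement of the tsx_init() state selection above. */
#include <stdbool.h>
#include <string.h>

enum tsx_ctrl_states { TSX_CTRL_ENABLE, TSX_CTRL_DISABLE, TSX_CTRL_NOT_SUPPORTED };

static enum tsx_ctrl_states pick_tsx_state(bool tsx_ctrl_msr, bool has_taa_bug,
					   const char *arg, bool kconfig_auto,
					   bool kconfig_off)
{
	enum tsx_ctrl_states auto_mode =
		has_taa_bug ? TSX_CTRL_DISABLE : TSX_CTRL_ENABLE;

	if (!tsx_ctrl_msr)
		return TSX_CTRL_NOT_SUPPORTED;	/* tsx= silently has no effect */

	if (arg) {				/* tsx= was given on the cmdline */
		if (!strcmp(arg, "on"))
			return TSX_CTRL_ENABLE;
		if (!strcmp(arg, "auto"))
			return auto_mode;
		return TSX_CTRL_DISABLE;	/* "off" and invalid options */
	}

	if (kconfig_auto)			/* CONFIG_X86_INTEL_TSX_MODE_AUTO */
		return auto_mode;
	return kconfig_off ? TSX_CTRL_DISABLE : TSX_CTRL_ENABLE;
}

int main(void)
{
	/* e.g. "tsx=auto" on a TAA-affected CPU with TSX_CTRL -> disable */
	return pick_tsx_state(true, true, "auto", false, false) != TSX_CTRL_DISABLE;
}
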
1594 | diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c | ||
1595 | index fc8236fd2495..18c5b4920e92 100644 | ||
1596 | --- a/arch/x86/kvm/cpuid.c | ||
1597 | +++ b/arch/x86/kvm/cpuid.c | ||
1598 | @@ -466,8 +466,16 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, | ||
1599 | /* PKU is not yet implemented for shadow paging. */ | ||
1600 | if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE)) | ||
1601 | entry->ecx &= ~F(PKU); | ||
1602 | + | ||
1603 | entry->edx &= kvm_cpuid_7_0_edx_x86_features; | ||
1604 | cpuid_mask(&entry->edx, CPUID_7_EDX); | ||
1605 | + if (boot_cpu_has(X86_FEATURE_IBPB) && | ||
1606 | + boot_cpu_has(X86_FEATURE_IBRS)) | ||
1607 | + entry->edx |= F(SPEC_CTRL); | ||
1608 | + if (boot_cpu_has(X86_FEATURE_STIBP)) | ||
1609 | + entry->edx |= F(INTEL_STIBP); | ||
1610 | + if (boot_cpu_has(X86_FEATURE_SSBD)) | ||
1611 | + entry->edx |= F(SPEC_CTRL_SSBD); | ||
1612 | /* | ||
1613 | * We emulate ARCH_CAPABILITIES in software even | ||
1614 | * if the host doesn't support it. | ||
1615 | diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c | ||
1616 | index 676edfc19a95..f0f180158c26 100644 | ||
1617 | --- a/arch/x86/kvm/mmu.c | ||
1618 | +++ b/arch/x86/kvm/mmu.c | ||
1619 | @@ -37,6 +37,7 @@ | ||
1620 | #include <linux/srcu.h> | ||
1621 | #include <linux/slab.h> | ||
1622 | #include <linux/uaccess.h> | ||
1623 | +#include <linux/kthread.h> | ||
1624 | |||
1625 | #include <asm/page.h> | ||
1626 | #include <asm/cmpxchg.h> | ||
1627 | @@ -44,6 +45,30 @@ | ||
1628 | #include <asm/vmx.h> | ||
1629 | #include <asm/kvm_page_track.h> | ||
1630 | |||
1631 | +extern bool itlb_multihit_kvm_mitigation; | ||
1632 | + | ||
1633 | +static int __read_mostly nx_huge_pages = -1; | ||
1634 | +static uint __read_mostly nx_huge_pages_recovery_ratio = 60; | ||
1635 | + | ||
1636 | +static int set_nx_huge_pages(const char *val, const struct kernel_param *kp); | ||
1637 | +static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp); | ||
1638 | + | ||
1639 | +static struct kernel_param_ops nx_huge_pages_ops = { | ||
1640 | + .set = set_nx_huge_pages, | ||
1641 | + .get = param_get_bool, | ||
1642 | +}; | ||
1643 | + | ||
1644 | +static struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = { | ||
1645 | + .set = set_nx_huge_pages_recovery_ratio, | ||
1646 | + .get = param_get_uint, | ||
1647 | +}; | ||
1648 | + | ||
1649 | +module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644); | ||
1650 | +__MODULE_PARM_TYPE(nx_huge_pages, "bool"); | ||
1651 | +module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops, | ||
1652 | + &nx_huge_pages_recovery_ratio, 0644); | ||
1653 | +__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint"); | ||
1654 | + | ||
1655 | /* | ||
1656 | * When setting this variable to true it enables Two-Dimensional-Paging | ||
1657 | * where the hardware walks 2 page tables: | ||
1658 | @@ -131,9 +156,6 @@ module_param(dbg, bool, 0644); | ||
1659 | |||
1660 | #include <trace/events/kvm.h> | ||
1661 | |||
1662 | -#define CREATE_TRACE_POINTS | ||
1663 | -#include "mmutrace.h" | ||
1664 | - | ||
1665 | #define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) | ||
1666 | #define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1)) | ||
1667 | |||
1668 | @@ -142,6 +164,20 @@ module_param(dbg, bool, 0644); | ||
1669 | /* make pte_list_desc fit well in cache line */ | ||
1670 | #define PTE_LIST_EXT 3 | ||
1671 | |||
1672 | +/* | ||
1673 | + * Return values of handle_mmio_page_fault and mmu.page_fault: | ||
1674 | + * RET_PF_RETRY: let CPU fault again on the address. | ||
1675 | + * RET_PF_EMULATE: mmio page fault, emulate the instruction directly. | ||
1676 | + * | ||
1677 | + * For handle_mmio_page_fault only: | ||
1678 | + * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. | ||
1679 | + */ | ||
1680 | +enum { | ||
1681 | + RET_PF_RETRY = 0, | ||
1682 | + RET_PF_EMULATE = 1, | ||
1683 | + RET_PF_INVALID = 2, | ||
1684 | +}; | ||
1685 | + | ||
1686 | struct pte_list_desc { | ||
1687 | u64 *sptes[PTE_LIST_EXT]; | ||
1688 | struct pte_list_desc *more; | ||
1689 | @@ -179,14 +215,23 @@ static u64 __read_mostly shadow_mmio_mask; | ||
1690 | static u64 __read_mostly shadow_present_mask; | ||
1691 | |||
1692 | static void mmu_spte_set(u64 *sptep, u64 spte); | ||
1693 | +static bool is_executable_pte(u64 spte); | ||
1694 | static void mmu_free_roots(struct kvm_vcpu *vcpu); | ||
1695 | |||
1696 | +#define CREATE_TRACE_POINTS | ||
1697 | +#include "mmutrace.h" | ||
1698 | + | ||
1699 | void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) | ||
1700 | { | ||
1701 | shadow_mmio_mask = mmio_mask; | ||
1702 | } | ||
1703 | EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); | ||
1704 | |||
1705 | +static bool is_nx_huge_page_enabled(void) | ||
1706 | +{ | ||
1707 | + return READ_ONCE(nx_huge_pages); | ||
1708 | +} | ||
1709 | + | ||
1710 | /* | ||
1711 | * the low bit of the generation number is always presumed to be zero. | ||
1712 | * This disables mmio caching during memslot updates. The concept is | ||
1713 | @@ -324,6 +369,11 @@ static int is_last_spte(u64 pte, int level) | ||
1714 | return 0; | ||
1715 | } | ||
1716 | |||
1717 | +static bool is_executable_pte(u64 spte) | ||
1718 | +{ | ||
1719 | + return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask; | ||
1720 | +} | ||
1721 | + | ||
1722 | static kvm_pfn_t spte_to_pfn(u64 pte) | ||
1723 | { | ||
1724 | return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; | ||
1725 | @@ -767,10 +817,16 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) | ||
1726 | |||
1727 | static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn) | ||
1728 | { | ||
1729 | - if (sp->role.direct) | ||
1730 | - BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index)); | ||
1731 | - else | ||
1732 | + if (!sp->role.direct) { | ||
1733 | sp->gfns[index] = gfn; | ||
1734 | + return; | ||
1735 | + } | ||
1736 | + | ||
1737 | + if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index))) | ||
1738 | + pr_err_ratelimited("gfn mismatch under direct page %llx " | ||
1739 | + "(expected %llx, got %llx)\n", | ||
1740 | + sp->gfn, | ||
1741 | + kvm_mmu_page_get_gfn(sp, index), gfn); | ||
1742 | } | ||
1743 | |||
1744 | /* | ||
1745 | @@ -829,6 +885,17 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) | ||
1746 | kvm_mmu_gfn_disallow_lpage(slot, gfn); | ||
1747 | } | ||
1748 | |||
1749 | +static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) | ||
1750 | +{ | ||
1751 | + if (sp->lpage_disallowed) | ||
1752 | + return; | ||
1753 | + | ||
1754 | + ++kvm->stat.nx_lpage_splits; | ||
1755 | + list_add_tail(&sp->lpage_disallowed_link, | ||
1756 | + &kvm->arch.lpage_disallowed_mmu_pages); | ||
1757 | + sp->lpage_disallowed = true; | ||
1758 | +} | ||
1759 | + | ||
1760 | static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) | ||
1761 | { | ||
1762 | struct kvm_memslots *slots; | ||
1763 | @@ -846,6 +913,13 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) | ||
1764 | kvm_mmu_gfn_allow_lpage(slot, gfn); | ||
1765 | } | ||
1766 | |||
1767 | +static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) | ||
1768 | +{ | ||
1769 | + --kvm->stat.nx_lpage_splits; | ||
1770 | + sp->lpage_disallowed = false; | ||
1771 | + list_del(&sp->lpage_disallowed_link); | ||
1772 | +} | ||
1773 | + | ||
1774 | static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level, | ||
1775 | struct kvm_memory_slot *slot) | ||
1776 | { | ||
1777 | @@ -2382,6 +2456,9 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, | ||
1778 | kvm_reload_remote_mmus(kvm); | ||
1779 | } | ||
1780 | |||
1781 | + if (sp->lpage_disallowed) | ||
1782 | + unaccount_huge_nx_page(kvm, sp); | ||
1783 | + | ||
1784 | sp->role.invalid = 1; | ||
1785 | return ret; | ||
1786 | } | ||
1787 | @@ -2533,6 +2610,11 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | ||
1788 | if (!speculative) | ||
1789 | spte |= shadow_accessed_mask; | ||
1790 | |||
1791 | + if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) && | ||
1792 | + is_nx_huge_page_enabled()) { | ||
1793 | + pte_access &= ~ACC_EXEC_MASK; | ||
1794 | + } | ||
1795 | + | ||
1796 | if (pte_access & ACC_EXEC_MASK) | ||
1797 | spte |= shadow_x_mask; | ||
1798 | else | ||
1799 | @@ -2598,13 +2680,13 @@ done: | ||
1800 | return ret; | ||
1801 | } | ||
1802 | |||
1803 | -static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, | ||
1804 | - int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn, | ||
1805 | - bool speculative, bool host_writable) | ||
1806 | +static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, | ||
1807 | + int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn, | ||
1808 | + bool speculative, bool host_writable) | ||
1809 | { | ||
1810 | int was_rmapped = 0; | ||
1811 | int rmap_count; | ||
1812 | - bool emulate = false; | ||
1813 | + int ret = RET_PF_RETRY; | ||
1814 | |||
1815 | pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, | ||
1816 | *sptep, write_fault, gfn); | ||
1817 | @@ -2634,18 +2716,15 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, | ||
1818 | if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative, | ||
1819 | true, host_writable)) { | ||
1820 | if (write_fault) | ||
1821 | - emulate = true; | ||
1822 | + ret = RET_PF_EMULATE; | ||
1823 | kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); | ||
1824 | } | ||
1825 | |||
1826 | if (unlikely(is_mmio_spte(*sptep))) | ||
1827 | - emulate = true; | ||
1828 | + ret = RET_PF_EMULATE; | ||
1829 | |||
1830 | pgprintk("%s: setting spte %llx\n", __func__, *sptep); | ||
1831 | - pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", | ||
1832 | - is_large_pte(*sptep)? "2MB" : "4kB", | ||
1833 | - *sptep & PT_PRESENT_MASK ?"RW":"R", gfn, | ||
1834 | - *sptep, sptep); | ||
1835 | + trace_kvm_mmu_set_spte(level, gfn, sptep); | ||
1836 | if (!was_rmapped && is_large_pte(*sptep)) | ||
1837 | ++vcpu->kvm->stat.lpages; | ||
1838 | |||
1839 | @@ -2657,9 +2736,7 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, | ||
1840 | } | ||
1841 | } | ||
1842 | |||
1843 | - kvm_release_pfn_clean(pfn); | ||
1844 | - | ||
1845 | - return emulate; | ||
1846 | + return ret; | ||
1847 | } | ||
1848 | |||
1849 | static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, | ||
1850 | @@ -2693,9 +2770,11 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, | ||
1851 | if (ret <= 0) | ||
1852 | return -1; | ||
1853 | |||
1854 | - for (i = 0; i < ret; i++, gfn++, start++) | ||
1855 | + for (i = 0; i < ret; i++, gfn++, start++) { | ||
1856 | mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn, | ||
1857 | page_to_pfn(pages[i]), true, true); | ||
1858 | + put_page(pages[i]); | ||
1859 | + } | ||
1860 | |||
1861 | return 0; | ||
1862 | } | ||
1863 | @@ -2743,40 +2822,71 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) | ||
1864 | __direct_pte_prefetch(vcpu, sp, sptep); | ||
1865 | } | ||
1866 | |||
1867 | -static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable, | ||
1868 | - int level, gfn_t gfn, kvm_pfn_t pfn, bool prefault) | ||
1869 | +static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it, | ||
1870 | + gfn_t gfn, kvm_pfn_t *pfnp, int *levelp) | ||
1871 | { | ||
1872 | - struct kvm_shadow_walk_iterator iterator; | ||
1873 | + int level = *levelp; | ||
1874 | + u64 spte = *it.sptep; | ||
1875 | + | ||
1876 | + if (it.level == level && level > PT_PAGE_TABLE_LEVEL && | ||
1877 | + is_nx_huge_page_enabled() && | ||
1878 | + is_shadow_present_pte(spte) && | ||
1879 | + !is_large_pte(spte)) { | ||
1880 | + /* | ||
1881 | + * A small SPTE exists for this pfn, but FNAME(fetch) | ||
1882 | + * and __direct_map would like to create a large PTE | ||
1883 | + * instead: just force them to go down another level, | ||
1884 | + * patching the next 9 bits of the address back | ||
1885 | + * into pfn. | ||
1886 | + */ | ||
1887 | + u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1); | ||
1888 | + *pfnp |= gfn & page_mask; | ||
1889 | + (*levelp)--; | ||
1890 | + } | ||
1891 | +} | ||
1892 | + | ||
1893 | +static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, | ||
1894 | + int map_writable, int level, kvm_pfn_t pfn, | ||
1895 | + bool prefault, bool lpage_disallowed) | ||
1896 | +{ | ||
1897 | + struct kvm_shadow_walk_iterator it; | ||
1898 | struct kvm_mmu_page *sp; | ||
1899 | - int emulate = 0; | ||
1900 | - gfn_t pseudo_gfn; | ||
1901 | + int ret; | ||
1902 | + gfn_t gfn = gpa >> PAGE_SHIFT; | ||
1903 | + gfn_t base_gfn = gfn; | ||
1904 | |||
1905 | if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) | ||
1906 | - return 0; | ||
1907 | + return RET_PF_RETRY; | ||
1908 | |||
1909 | - for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { | ||
1910 | - if (iterator.level == level) { | ||
1911 | - emulate = mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, | ||
1912 | - write, level, gfn, pfn, prefault, | ||
1913 | - map_writable); | ||
1914 | - direct_pte_prefetch(vcpu, iterator.sptep); | ||
1915 | - ++vcpu->stat.pf_fixed; | ||
1916 | - break; | ||
1917 | - } | ||
1918 | + trace_kvm_mmu_spte_requested(gpa, level, pfn); | ||
1919 | + for_each_shadow_entry(vcpu, gpa, it) { | ||
1920 | + /* | ||
1921 | + * We cannot overwrite existing page tables with an NX | ||
1922 | + * large page, as the leaf could be executable. | ||
1923 | + */ | ||
1924 | + disallowed_hugepage_adjust(it, gfn, &pfn, &level); | ||
1925 | |||
1926 | - drop_large_spte(vcpu, iterator.sptep); | ||
1927 | - if (!is_shadow_present_pte(*iterator.sptep)) { | ||
1928 | - u64 base_addr = iterator.addr; | ||
1929 | + base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); | ||
1930 | + if (it.level == level) | ||
1931 | + break; | ||
1932 | |||
1933 | - base_addr &= PT64_LVL_ADDR_MASK(iterator.level); | ||
1934 | - pseudo_gfn = base_addr >> PAGE_SHIFT; | ||
1935 | - sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, | ||
1936 | - iterator.level - 1, 1, ACC_ALL); | ||
1937 | + drop_large_spte(vcpu, it.sptep); | ||
1938 | + if (!is_shadow_present_pte(*it.sptep)) { | ||
1939 | + sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr, | ||
1940 | + it.level - 1, true, ACC_ALL); | ||
1941 | |||
1942 | - link_shadow_page(vcpu, iterator.sptep, sp); | ||
1943 | + link_shadow_page(vcpu, it.sptep, sp); | ||
1944 | + if (lpage_disallowed) | ||
1945 | + account_huge_nx_page(vcpu->kvm, sp); | ||
1946 | } | ||
1947 | } | ||
1948 | - return emulate; | ||
1949 | + | ||
1950 | + ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL, | ||
1951 | + write, level, base_gfn, pfn, prefault, | ||
1952 | + map_writable); | ||
1953 | + direct_pte_prefetch(vcpu, it.sptep); | ||
1954 | + ++vcpu->stat.pf_fixed; | ||
1955 | + return ret; | ||
1956 | } | ||
1957 | |||
1958 | static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk) | ||
1959 | @@ -2798,25 +2908,23 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) | ||
1960 | * Do not cache the mmio info caused by writing the readonly gfn | ||
1961 | * into the spte otherwise read access on readonly gfn also can | ||
1962 | * caused mmio page fault and treat it as mmio access. | ||
1963 | - * Return 1 to tell kvm to emulate it. | ||
1964 | */ | ||
1965 | if (pfn == KVM_PFN_ERR_RO_FAULT) | ||
1966 | - return 1; | ||
1967 | + return RET_PF_EMULATE; | ||
1968 | |||
1969 | if (pfn == KVM_PFN_ERR_HWPOISON) { | ||
1970 | kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current); | ||
1971 | - return 0; | ||
1972 | + return RET_PF_RETRY; | ||
1973 | } | ||
1974 | |||
1975 | return -EFAULT; | ||
1976 | } | ||
1977 | |||
1978 | static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, | ||
1979 | - gfn_t *gfnp, kvm_pfn_t *pfnp, | ||
1980 | + gfn_t gfn, kvm_pfn_t *pfnp, | ||
1981 | int *levelp) | ||
1982 | { | ||
1983 | kvm_pfn_t pfn = *pfnp; | ||
1984 | - gfn_t gfn = *gfnp; | ||
1985 | int level = *levelp; | ||
1986 | |||
1987 | /* | ||
1988 | @@ -2843,8 +2951,6 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, | ||
1989 | mask = KVM_PAGES_PER_HPAGE(level) - 1; | ||
1990 | VM_BUG_ON((gfn & mask) != (pfn & mask)); | ||
1991 | if (pfn & mask) { | ||
1992 | - gfn &= ~mask; | ||
1993 | - *gfnp = gfn; | ||
1994 | kvm_release_pfn_clean(pfn); | ||
1995 | pfn &= ~mask; | ||
1996 | kvm_get_pfn(pfn); | ||
1997 | @@ -3012,11 +3118,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, | ||
1998 | { | ||
1999 | int r; | ||
2000 | int level; | ||
2001 | - bool force_pt_level = false; | ||
2002 | + bool force_pt_level; | ||
2003 | kvm_pfn_t pfn; | ||
2004 | unsigned long mmu_seq; | ||
2005 | bool map_writable, write = error_code & PFERR_WRITE_MASK; | ||
2006 | + bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && | ||
2007 | + is_nx_huge_page_enabled(); | ||
2008 | |||
2009 | + force_pt_level = lpage_disallowed; | ||
2010 | level = mapping_level(vcpu, gfn, &force_pt_level); | ||
2011 | if (likely(!force_pt_level)) { | ||
2012 | /* | ||
2013 | @@ -3031,32 +3140,30 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, | ||
2014 | } | ||
2015 | |||
2016 | if (fast_page_fault(vcpu, v, level, error_code)) | ||
2017 | - return 0; | ||
2018 | + return RET_PF_RETRY; | ||
2019 | |||
2020 | mmu_seq = vcpu->kvm->mmu_notifier_seq; | ||
2021 | smp_rmb(); | ||
2022 | |||
2023 | if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable)) | ||
2024 | - return 0; | ||
2025 | + return RET_PF_RETRY; | ||
2026 | |||
2027 | if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r)) | ||
2028 | return r; | ||
2029 | |||
2030 | + r = RET_PF_RETRY; | ||
2031 | spin_lock(&vcpu->kvm->mmu_lock); | ||
2032 | if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) | ||
2033 | goto out_unlock; | ||
2034 | make_mmu_pages_available(vcpu); | ||
2035 | if (likely(!force_pt_level)) | ||
2036 | - transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); | ||
2037 | - r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); | ||
2038 | - spin_unlock(&vcpu->kvm->mmu_lock); | ||
2039 | - | ||
2040 | - return r; | ||
2041 | - | ||
2042 | + transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); | ||
2043 | + r = __direct_map(vcpu, v, write, map_writable, level, pfn, | ||
2044 | + prefault, false); | ||
2045 | out_unlock: | ||
2046 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
2047 | kvm_release_pfn_clean(pfn); | ||
2048 | - return 0; | ||
2049 | + return r; | ||
2050 | } | ||
2051 | |||
2052 | |||
2053 | @@ -3383,38 +3490,38 @@ exit: | ||
2054 | return reserved; | ||
2055 | } | ||
2056 | |||
2057 | -int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) | ||
2058 | +static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) | ||
2059 | { | ||
2060 | u64 spte; | ||
2061 | bool reserved; | ||
2062 | |||
2063 | if (mmio_info_in_cache(vcpu, addr, direct)) | ||
2064 | - return RET_MMIO_PF_EMULATE; | ||
2065 | + return RET_PF_EMULATE; | ||
2066 | |||
2067 | reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte); | ||
2068 | if (WARN_ON(reserved)) | ||
2069 | - return RET_MMIO_PF_BUG; | ||
2070 | + return -EINVAL; | ||
2071 | |||
2072 | if (is_mmio_spte(spte)) { | ||
2073 | gfn_t gfn = get_mmio_spte_gfn(spte); | ||
2074 | unsigned access = get_mmio_spte_access(spte); | ||
2075 | |||
2076 | if (!check_mmio_spte(vcpu, spte)) | ||
2077 | - return RET_MMIO_PF_INVALID; | ||
2078 | + return RET_PF_INVALID; | ||
2079 | |||
2080 | if (direct) | ||
2081 | addr = 0; | ||
2082 | |||
2083 | trace_handle_mmio_page_fault(addr, gfn, access); | ||
2084 | vcpu_cache_mmio_info(vcpu, addr, gfn, access); | ||
2085 | - return RET_MMIO_PF_EMULATE; | ||
2086 | + return RET_PF_EMULATE; | ||
2087 | } | ||
2088 | |||
2089 | /* | ||
2090 | * If the page table is zapped by other cpus, let CPU fault again on | ||
2091 | * the address. | ||
2092 | */ | ||
2093 | - return RET_MMIO_PF_RETRY; | ||
2094 | + return RET_PF_RETRY; | ||
2095 | } | ||
2096 | EXPORT_SYMBOL_GPL(handle_mmio_page_fault); | ||
2097 | |||
2098 | @@ -3464,7 +3571,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, | ||
2099 | pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); | ||
2100 | |||
2101 | if (page_fault_handle_page_track(vcpu, error_code, gfn)) | ||
2102 | - return 1; | ||
2103 | + return RET_PF_EMULATE; | ||
2104 | |||
2105 | r = mmu_topup_memory_caches(vcpu); | ||
2106 | if (r) | ||
2107 | @@ -3548,18 +3655,21 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, | ||
2108 | unsigned long mmu_seq; | ||
2109 | int write = error_code & PFERR_WRITE_MASK; | ||
2110 | bool map_writable; | ||
2111 | + bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && | ||
2112 | + is_nx_huge_page_enabled(); | ||
2113 | |||
2114 | MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); | ||
2115 | |||
2116 | if (page_fault_handle_page_track(vcpu, error_code, gfn)) | ||
2117 | - return 1; | ||
2118 | + return RET_PF_EMULATE; | ||
2119 | |||
2120 | r = mmu_topup_memory_caches(vcpu); | ||
2121 | if (r) | ||
2122 | return r; | ||
2123 | |||
2124 | - force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn, | ||
2125 | - PT_DIRECTORY_LEVEL); | ||
2126 | + force_pt_level = | ||
2127 | + lpage_disallowed || | ||
2128 | + !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL); | ||
2129 | level = mapping_level(vcpu, gfn, &force_pt_level); | ||
2130 | if (likely(!force_pt_level)) { | ||
2131 | if (level > PT_DIRECTORY_LEVEL && | ||
2132 | @@ -3569,32 +3679,30 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, | ||
2133 | } | ||
2134 | |||
2135 | if (fast_page_fault(vcpu, gpa, level, error_code)) | ||
2136 | - return 0; | ||
2137 | + return RET_PF_RETRY; | ||
2138 | |||
2139 | mmu_seq = vcpu->kvm->mmu_notifier_seq; | ||
2140 | smp_rmb(); | ||
2141 | |||
2142 | if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) | ||
2143 | - return 0; | ||
2144 | + return RET_PF_RETRY; | ||
2145 | |||
2146 | if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r)) | ||
2147 | return r; | ||
2148 | |||
2149 | + r = RET_PF_RETRY; | ||
2150 | spin_lock(&vcpu->kvm->mmu_lock); | ||
2151 | if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) | ||
2152 | goto out_unlock; | ||
2153 | make_mmu_pages_available(vcpu); | ||
2154 | if (likely(!force_pt_level)) | ||
2155 | - transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); | ||
2156 | - r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault); | ||
2157 | - spin_unlock(&vcpu->kvm->mmu_lock); | ||
2158 | - | ||
2159 | - return r; | ||
2160 | - | ||
2161 | + transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); | ||
2162 | + r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, | ||
2163 | + prefault, lpage_disallowed); | ||
2164 | out_unlock: | ||
2165 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
2166 | kvm_release_pfn_clean(pfn); | ||
2167 | - return 0; | ||
2168 | + return r; | ||
2169 | } | ||
2170 | |||
2171 | static void nonpaging_init_context(struct kvm_vcpu *vcpu, | ||
2172 | @@ -4510,23 +4618,24 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, | ||
2173 | enum emulation_result er; | ||
2174 | bool direct = vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu); | ||
2175 | |||
2176 | + r = RET_PF_INVALID; | ||
2177 | if (unlikely(error_code & PFERR_RSVD_MASK)) { | ||
2178 | r = handle_mmio_page_fault(vcpu, cr2, direct); | ||
2179 | - if (r == RET_MMIO_PF_EMULATE) { | ||
2180 | + if (r == RET_PF_EMULATE) { | ||
2181 | emulation_type = 0; | ||
2182 | goto emulate; | ||
2183 | } | ||
2184 | - if (r == RET_MMIO_PF_RETRY) | ||
2185 | - return 1; | ||
2186 | - if (r < 0) | ||
2187 | - return r; | ||
2188 | } | ||
2189 | |||
2190 | - r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false); | ||
2191 | + if (r == RET_PF_INVALID) { | ||
2192 | + r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false); | ||
2193 | + WARN_ON(r == RET_PF_INVALID); | ||
2194 | + } | ||
2195 | + | ||
2196 | + if (r == RET_PF_RETRY) | ||
2197 | + return 1; | ||
2198 | if (r < 0) | ||
2199 | return r; | ||
2200 | - if (!r) | ||
2201 | - return 1; | ||
2202 | |||
2203 | if (mmio_info_in_cache(vcpu, cr2, direct)) | ||
2204 | emulation_type = 0; | ||
2205 | @@ -4965,7 +5074,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) | ||
2206 | int nr_to_scan = sc->nr_to_scan; | ||
2207 | unsigned long freed = 0; | ||
2208 | |||
2209 | - spin_lock(&kvm_lock); | ||
2210 | + mutex_lock(&kvm_lock); | ||
2211 | |||
2212 | list_for_each_entry(kvm, &vm_list, vm_list) { | ||
2213 | int idx; | ||
2214 | @@ -5015,7 +5124,7 @@ unlock: | ||
2215 | break; | ||
2216 | } | ||
2217 | |||
2218 | - spin_unlock(&kvm_lock); | ||
2219 | + mutex_unlock(&kvm_lock); | ||
2220 | return freed; | ||
2221 | } | ||
2222 | |||
2223 | @@ -5039,8 +5148,58 @@ static void mmu_destroy_caches(void) | ||
2224 | kmem_cache_destroy(mmu_page_header_cache); | ||
2225 | } | ||
2226 | |||
2227 | +static bool get_nx_auto_mode(void) | ||
2228 | +{ | ||
2229 | + /* Return true when CPU has the bug, and mitigations are ON */ | ||
2230 | + return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off(); | ||
2231 | +} | ||
2232 | + | ||
2233 | +static void __set_nx_huge_pages(bool val) | ||
2234 | +{ | ||
2235 | + nx_huge_pages = itlb_multihit_kvm_mitigation = val; | ||
2236 | +} | ||
2237 | + | ||
2238 | +static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) | ||
2239 | +{ | ||
2240 | + bool old_val = nx_huge_pages; | ||
2241 | + bool new_val; | ||
2242 | + | ||
2243 | + /* In "auto" mode deploy workaround only if CPU has the bug. */ | ||
2244 | + if (sysfs_streq(val, "off")) | ||
2245 | + new_val = 0; | ||
2246 | + else if (sysfs_streq(val, "force")) | ||
2247 | + new_val = 1; | ||
2248 | + else if (sysfs_streq(val, "auto")) | ||
2249 | + new_val = get_nx_auto_mode(); | ||
2250 | + else if (strtobool(val, &new_val) < 0) | ||
2251 | + return -EINVAL; | ||
2252 | + | ||
2253 | + __set_nx_huge_pages(new_val); | ||
2254 | + | ||
2255 | + if (new_val != old_val) { | ||
2256 | + struct kvm *kvm; | ||
2257 | + int idx; | ||
2258 | + | ||
2259 | + mutex_lock(&kvm_lock); | ||
2260 | + | ||
2261 | + list_for_each_entry(kvm, &vm_list, vm_list) { | ||
2262 | + idx = srcu_read_lock(&kvm->srcu); | ||
2263 | + kvm_mmu_invalidate_zap_all_pages(kvm); | ||
2264 | + srcu_read_unlock(&kvm->srcu, idx); | ||
2265 | + | ||
2266 | + wake_up_process(kvm->arch.nx_lpage_recovery_thread); | ||
2267 | + } | ||
2268 | + mutex_unlock(&kvm_lock); | ||
2269 | + } | ||
2270 | + | ||
2271 | + return 0; | ||
2272 | +} | ||
2273 | + | ||
2274 | int kvm_mmu_module_init(void) | ||
2275 | { | ||
2276 | + if (nx_huge_pages == -1) | ||
2277 | + __set_nx_huge_pages(get_nx_auto_mode()); | ||
2278 | + | ||
2279 | pte_list_desc_cache = kmem_cache_create("pte_list_desc", | ||
2280 | sizeof(struct pte_list_desc), | ||
2281 | 0, SLAB_ACCOUNT, NULL); | ||
2282 | @@ -5104,3 +5263,116 @@ void kvm_mmu_module_exit(void) | ||
2283 | unregister_shrinker(&mmu_shrinker); | ||
2284 | mmu_audit_disable(); | ||
2285 | } | ||
2286 | + | ||
2287 | +static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp) | ||
2288 | +{ | ||
2289 | + unsigned int old_val; | ||
2290 | + int err; | ||
2291 | + | ||
2292 | + old_val = nx_huge_pages_recovery_ratio; | ||
2293 | + err = param_set_uint(val, kp); | ||
2294 | + if (err) | ||
2295 | + return err; | ||
2296 | + | ||
2297 | + if (READ_ONCE(nx_huge_pages) && | ||
2298 | + !old_val && nx_huge_pages_recovery_ratio) { | ||
2299 | + struct kvm *kvm; | ||
2300 | + | ||
2301 | + mutex_lock(&kvm_lock); | ||
2302 | + | ||
2303 | + list_for_each_entry(kvm, &vm_list, vm_list) | ||
2304 | + wake_up_process(kvm->arch.nx_lpage_recovery_thread); | ||
2305 | + | ||
2306 | + mutex_unlock(&kvm_lock); | ||
2307 | + } | ||
2308 | + | ||
2309 | + return err; | ||
2310 | +} | ||
2311 | + | ||
2312 | +static void kvm_recover_nx_lpages(struct kvm *kvm) | ||
2313 | +{ | ||
2314 | + int rcu_idx; | ||
2315 | + struct kvm_mmu_page *sp; | ||
2316 | + unsigned int ratio; | ||
2317 | + LIST_HEAD(invalid_list); | ||
2318 | + ulong to_zap; | ||
2319 | + | ||
2320 | + rcu_idx = srcu_read_lock(&kvm->srcu); | ||
2321 | + spin_lock(&kvm->mmu_lock); | ||
2322 | + | ||
2323 | + ratio = READ_ONCE(nx_huge_pages_recovery_ratio); | ||
2324 | + to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0; | ||
2325 | + while (to_zap && !list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) { | ||
2326 | + /* | ||
2327 | + * We use a separate list instead of just using active_mmu_pages | ||
2328 | + * because the number of lpage_disallowed pages is expected to | ||
2329 | + * be relatively small compared to the total. | ||
2330 | + */ | ||
2331 | + sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages, | ||
2332 | + struct kvm_mmu_page, | ||
2333 | + lpage_disallowed_link); | ||
2334 | + WARN_ON_ONCE(!sp->lpage_disallowed); | ||
2335 | + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); | ||
2336 | + WARN_ON_ONCE(sp->lpage_disallowed); | ||
2337 | + | ||
2338 | + if (!--to_zap || need_resched() || spin_needbreak(&kvm->mmu_lock)) { | ||
2339 | + kvm_mmu_commit_zap_page(kvm, &invalid_list); | ||
2340 | + if (to_zap) | ||
2341 | + cond_resched_lock(&kvm->mmu_lock); | ||
2342 | + } | ||
2343 | + } | ||
2344 | + | ||
2345 | + spin_unlock(&kvm->mmu_lock); | ||
2346 | + srcu_read_unlock(&kvm->srcu, rcu_idx); | ||
2347 | +} | ||
2348 | + | ||
2349 | +static long get_nx_lpage_recovery_timeout(u64 start_time) | ||
2350 | +{ | ||
2351 | + return READ_ONCE(nx_huge_pages) && READ_ONCE(nx_huge_pages_recovery_ratio) | ||
2352 | + ? start_time + 60 * HZ - get_jiffies_64() | ||
2353 | + : MAX_SCHEDULE_TIMEOUT; | ||
2354 | +} | ||
2355 | + | ||
2356 | +static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data) | ||
2357 | +{ | ||
2358 | + u64 start_time; | ||
2359 | + long remaining_time; | ||
2360 | + | ||
2361 | + while (true) { | ||
2362 | + start_time = get_jiffies_64(); | ||
2363 | + remaining_time = get_nx_lpage_recovery_timeout(start_time); | ||
2364 | + | ||
2365 | + set_current_state(TASK_INTERRUPTIBLE); | ||
2366 | + while (!kthread_should_stop() && remaining_time > 0) { | ||
2367 | + schedule_timeout(remaining_time); | ||
2368 | + remaining_time = get_nx_lpage_recovery_timeout(start_time); | ||
2369 | + set_current_state(TASK_INTERRUPTIBLE); | ||
2370 | + } | ||
2371 | + | ||
2372 | + set_current_state(TASK_RUNNING); | ||
2373 | + | ||
2374 | + if (kthread_should_stop()) | ||
2375 | + return 0; | ||
2376 | + | ||
2377 | + kvm_recover_nx_lpages(kvm); | ||
2378 | + } | ||
2379 | +} | ||
2380 | + | ||
2381 | +int kvm_mmu_post_init_vm(struct kvm *kvm) | ||
2382 | +{ | ||
2383 | + int err; | ||
2384 | + | ||
2385 | + err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0, | ||
2386 | + "kvm-nx-lpage-recovery", | ||
2387 | + &kvm->arch.nx_lpage_recovery_thread); | ||
2388 | + if (!err) | ||
2389 | + kthread_unpark(kvm->arch.nx_lpage_recovery_thread); | ||
2390 | + | ||
2391 | + return err; | ||
2392 | +} | ||
2393 | + | ||
2394 | +void kvm_mmu_pre_destroy_vm(struct kvm *kvm) | ||
2395 | +{ | ||
2396 | + if (kvm->arch.nx_lpage_recovery_thread) | ||
2397 | + kthread_stop(kvm->arch.nx_lpage_recovery_thread); | ||
2398 | +} | ||
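
The recovery thread above wakes roughly once a minute (60 * HZ in get_nx_lpage_recovery_timeout()) and each pass zaps nx_lpage_splits / nx_huge_pages_recovery_ratio shadow pages, rounded up, so with the default ratio of 60 the whole backlog is recycled in roughly an hour, ignoring new splits. A standalone sketch of that arithmetic, with an assumed example count:

/* Hedged sketch of the per-pass zap budget computed in kvm_recover_nx_lpages(). */
#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned long splits = 1000;	/* example kvm->stat.nx_lpage_splits */
	unsigned int ratio = 60;	/* nx_huge_pages_recovery_ratio default */
	unsigned long to_zap = ratio ? DIV_ROUND_UP(splits, ratio) : 0;

	printf("zap up to %lu of %lu disallowed huge-page shadows this pass\n",
	       to_zap, splits);
	return 0;
}
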
2399 | diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h | ||
2400 | index c92834c55c59..e584689e7d46 100644 | ||
2401 | --- a/arch/x86/kvm/mmu.h | ||
2402 | +++ b/arch/x86/kvm/mmu.h | ||
2403 | @@ -56,23 +56,6 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask); | ||
2404 | void | ||
2405 | reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context); | ||
2406 | |||
2407 | -/* | ||
2408 | - * Return values of handle_mmio_page_fault: | ||
2409 | - * RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction | ||
2410 | - * directly. | ||
2411 | - * RET_MMIO_PF_INVALID: invalid spte is detected then let the real page | ||
2412 | - * fault path update the mmio spte. | ||
2413 | - * RET_MMIO_PF_RETRY: let CPU fault again on the address. | ||
2414 | - * RET_MMIO_PF_BUG: a bug was detected (and a WARN was printed). | ||
2415 | - */ | ||
2416 | -enum { | ||
2417 | - RET_MMIO_PF_EMULATE = 1, | ||
2418 | - RET_MMIO_PF_INVALID = 2, | ||
2419 | - RET_MMIO_PF_RETRY = 0, | ||
2420 | - RET_MMIO_PF_BUG = -1 | ||
2421 | -}; | ||
2422 | - | ||
2423 | -int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct); | ||
2424 | void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu); | ||
2425 | void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly); | ||
2426 | bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu); | ||
2427 | @@ -202,4 +185,8 @@ void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); | ||
2428 | void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); | ||
2429 | bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, | ||
2430 | struct kvm_memory_slot *slot, u64 gfn); | ||
2431 | + | ||
2432 | +int kvm_mmu_post_init_vm(struct kvm *kvm); | ||
2433 | +void kvm_mmu_pre_destroy_vm(struct kvm *kvm); | ||
2434 | + | ||
2435 | #endif | ||
2436 | diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h | ||
2437 | index 5a24b846a1cb..756b14ecc957 100644 | ||
2438 | --- a/arch/x86/kvm/mmutrace.h | ||
2439 | +++ b/arch/x86/kvm/mmutrace.h | ||
2440 | @@ -322,6 +322,65 @@ TRACE_EVENT( | ||
2441 | __entry->kvm_gen == __entry->spte_gen | ||
2442 | ) | ||
2443 | ); | ||
2444 | + | ||
2445 | +TRACE_EVENT( | ||
2446 | + kvm_mmu_set_spte, | ||
2447 | + TP_PROTO(int level, gfn_t gfn, u64 *sptep), | ||
2448 | + TP_ARGS(level, gfn, sptep), | ||
2449 | + | ||
2450 | + TP_STRUCT__entry( | ||
2451 | + __field(u64, gfn) | ||
2452 | + __field(u64, spte) | ||
2453 | + __field(u64, sptep) | ||
2454 | + __field(u8, level) | ||
2455 | + /* These depend on page entry type, so compute them now. */ | ||
2456 | + __field(bool, r) | ||
2457 | + __field(bool, x) | ||
2458 | + __field(u8, u) | ||
2459 | + ), | ||
2460 | + | ||
2461 | + TP_fast_assign( | ||
2462 | + __entry->gfn = gfn; | ||
2463 | + __entry->spte = *sptep; | ||
2464 | + __entry->sptep = virt_to_phys(sptep); | ||
2465 | + __entry->level = level; | ||
2466 | + __entry->r = shadow_present_mask || (__entry->spte & PT_PRESENT_MASK); | ||
2467 | + __entry->x = is_executable_pte(__entry->spte); | ||
2468 | + __entry->u = shadow_user_mask ? !!(__entry->spte & shadow_user_mask) : -1; | ||
2469 | + ), | ||
2470 | + | ||
2471 | + TP_printk("gfn %llx spte %llx (%s%s%s%s) level %d at %llx", | ||
2472 | + __entry->gfn, __entry->spte, | ||
2473 | + __entry->r ? "r" : "-", | ||
2474 | + __entry->spte & PT_PRESENT_MASK ? "w" : "-", | ||
2475 | + __entry->x ? "x" : "-", | ||
2476 | + __entry->u == -1 ? "" : (__entry->u ? "u" : "-"), | ||
2477 | + __entry->level, __entry->sptep | ||
2478 | + ) | ||
2479 | +); | ||
2480 | + | ||
2481 | +TRACE_EVENT( | ||
2482 | + kvm_mmu_spte_requested, | ||
2483 | + TP_PROTO(gpa_t addr, int level, kvm_pfn_t pfn), | ||
2484 | + TP_ARGS(addr, level, pfn), | ||
2485 | + | ||
2486 | + TP_STRUCT__entry( | ||
2487 | + __field(u64, gfn) | ||
2488 | + __field(u64, pfn) | ||
2489 | + __field(u8, level) | ||
2490 | + ), | ||
2491 | + | ||
2492 | + TP_fast_assign( | ||
2493 | + __entry->gfn = addr >> PAGE_SHIFT; | ||
2494 | + __entry->pfn = pfn | (__entry->gfn & (KVM_PAGES_PER_HPAGE(level) - 1)); | ||
2495 | + __entry->level = level; | ||
2496 | + ), | ||
2497 | + | ||
2498 | + TP_printk("gfn %llx pfn %llx level %d", | ||
2499 | + __entry->gfn, __entry->pfn, __entry->level | ||
2500 | + ) | ||
2501 | +); | ||
2502 | + | ||
2503 | #endif /* _TRACE_KVMMMU_H */ | ||
2504 | |||
2505 | #undef TRACE_INCLUDE_PATH | ||
2506 | diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h | ||
2507 | index 37363900297d..e03225e707b2 100644 | ||
2508 | --- a/arch/x86/kvm/paging_tmpl.h | ||
2509 | +++ b/arch/x86/kvm/paging_tmpl.h | ||
2510 | @@ -499,6 +499,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, | ||
2511 | mmu_set_spte(vcpu, spte, pte_access, 0, PT_PAGE_TABLE_LEVEL, gfn, pfn, | ||
2512 | true, true); | ||
2513 | |||
2514 | + kvm_release_pfn_clean(pfn); | ||
2515 | return true; | ||
2516 | } | ||
2517 | |||
2518 | @@ -572,12 +573,14 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, | ||
2519 | static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | ||
2520 | struct guest_walker *gw, | ||
2521 | int write_fault, int hlevel, | ||
2522 | - kvm_pfn_t pfn, bool map_writable, bool prefault) | ||
2523 | + kvm_pfn_t pfn, bool map_writable, bool prefault, | ||
2524 | + bool lpage_disallowed) | ||
2525 | { | ||
2526 | struct kvm_mmu_page *sp = NULL; | ||
2527 | struct kvm_shadow_walk_iterator it; | ||
2528 | unsigned direct_access, access = gw->pt_access; | ||
2529 | - int top_level, emulate; | ||
2530 | + int top_level, ret; | ||
2531 | + gfn_t gfn, base_gfn; | ||
2532 | |||
2533 | direct_access = gw->pte_access; | ||
2534 | |||
2535 | @@ -622,36 +625,49 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | ||
2536 | link_shadow_page(vcpu, it.sptep, sp); | ||
2537 | } | ||
2538 | |||
2539 | - for (; | ||
2540 | - shadow_walk_okay(&it) && it.level > hlevel; | ||
2541 | - shadow_walk_next(&it)) { | ||
2542 | - gfn_t direct_gfn; | ||
2543 | + /* | ||
2544 | + * FNAME(page_fault) might have clobbered the bottom bits of | ||
2545 | + * gw->gfn, restore them from the virtual address. | ||
2546 | + */ | ||
2547 | + gfn = gw->gfn | ((addr & PT_LVL_OFFSET_MASK(gw->level)) >> PAGE_SHIFT); | ||
2548 | + base_gfn = gfn; | ||
2549 | |||
2550 | + trace_kvm_mmu_spte_requested(addr, gw->level, pfn); | ||
2551 | + | ||
2552 | + for (; shadow_walk_okay(&it); shadow_walk_next(&it)) { | ||
2553 | clear_sp_write_flooding_count(it.sptep); | ||
2554 | - validate_direct_spte(vcpu, it.sptep, direct_access); | ||
2555 | |||
2556 | - drop_large_spte(vcpu, it.sptep); | ||
2557 | + /* | ||
2558 | + * We cannot overwrite existing page tables with an NX | ||
2559 | + * large page, as the leaf could be executable. | ||
2560 | + */ | ||
2561 | + disallowed_hugepage_adjust(it, gfn, &pfn, &hlevel); | ||
2562 | |||
2563 | - if (is_shadow_present_pte(*it.sptep)) | ||
2564 | - continue; | ||
2565 | + base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); | ||
2566 | + if (it.level == hlevel) | ||
2567 | + break; | ||
2568 | |||
2569 | - direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); | ||
2570 | + validate_direct_spte(vcpu, it.sptep, direct_access); | ||
2571 | |||
2572 | - sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1, | ||
2573 | - true, direct_access); | ||
2574 | - link_shadow_page(vcpu, it.sptep, sp); | ||
2575 | + drop_large_spte(vcpu, it.sptep); | ||
2576 | + | ||
2577 | + if (!is_shadow_present_pte(*it.sptep)) { | ||
2578 | + sp = kvm_mmu_get_page(vcpu, base_gfn, addr, | ||
2579 | + it.level - 1, true, direct_access); | ||
2580 | + link_shadow_page(vcpu, it.sptep, sp); | ||
2581 | + if (lpage_disallowed) | ||
2582 | + account_huge_nx_page(vcpu->kvm, sp); | ||
2583 | + } | ||
2584 | } | ||
2585 | |||
2586 | - clear_sp_write_flooding_count(it.sptep); | ||
2587 | - emulate = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, | ||
2588 | - it.level, gw->gfn, pfn, prefault, map_writable); | ||
2589 | + ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, | ||
2590 | + it.level, base_gfn, pfn, prefault, map_writable); | ||
2591 | FNAME(pte_prefetch)(vcpu, gw, it.sptep); | ||
2592 | - | ||
2593 | - return emulate; | ||
2594 | + ++vcpu->stat.pf_fixed; | ||
2595 | + return ret; | ||
2596 | |||
2597 | out_gpte_changed: | ||
2598 | - kvm_release_pfn_clean(pfn); | ||
2599 | - return 0; | ||
2600 | + return RET_PF_RETRY; | ||
2601 | } | ||
2602 | |||
2603 | /* | ||
2604 | @@ -717,9 +733,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, | ||
2605 | int r; | ||
2606 | kvm_pfn_t pfn; | ||
2607 | int level = PT_PAGE_TABLE_LEVEL; | ||
2608 | - bool force_pt_level = false; | ||
2609 | unsigned long mmu_seq; | ||
2610 | bool map_writable, is_self_change_mapping; | ||
2611 | + bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && | ||
2612 | + is_nx_huge_page_enabled(); | ||
2613 | + bool force_pt_level = lpage_disallowed; | ||
2614 | |||
2615 | pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); | ||
2616 | |||
2617 | @@ -746,12 +764,12 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, | ||
2618 | if (!prefault) | ||
2619 | inject_page_fault(vcpu, &walker.fault); | ||
2620 | |||
2621 | - return 0; | ||
2622 | + return RET_PF_RETRY; | ||
2623 | } | ||
2624 | |||
2625 | if (page_fault_handle_page_track(vcpu, error_code, walker.gfn)) { | ||
2626 | shadow_page_table_clear_flood(vcpu, addr); | ||
2627 | - return 1; | ||
2628 | + return RET_PF_EMULATE; | ||
2629 | } | ||
2630 | |||
2631 | vcpu->arch.write_fault_to_shadow_pgtable = false; | ||
2632 | @@ -773,7 +791,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, | ||
2633 | |||
2634 | if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault, | ||
2635 | &map_writable)) | ||
2636 | - return 0; | ||
2637 | + return RET_PF_RETRY; | ||
2638 | |||
2639 | if (handle_abnormal_pfn(vcpu, mmu_is_nested(vcpu) ? 0 : addr, | ||
2640 | walker.gfn, pfn, walker.pte_access, &r)) | ||
2641 | @@ -799,6 +817,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, | ||
2642 | walker.pte_access &= ~ACC_EXEC_MASK; | ||
2643 | } | ||
2644 | |||
2645 | + r = RET_PF_RETRY; | ||
2646 | spin_lock(&vcpu->kvm->mmu_lock); | ||
2647 | if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) | ||
2648 | goto out_unlock; | ||
2649 | @@ -806,19 +825,15 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, | ||
2650 | kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); | ||
2651 | make_mmu_pages_available(vcpu); | ||
2652 | if (!force_pt_level) | ||
2653 | - transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); | ||
2654 | + transparent_hugepage_adjust(vcpu, walker.gfn, &pfn, &level); | ||
2655 | r = FNAME(fetch)(vcpu, addr, &walker, write_fault, | ||
2656 | - level, pfn, map_writable, prefault); | ||
2657 | - ++vcpu->stat.pf_fixed; | ||
2658 | + level, pfn, map_writable, prefault, lpage_disallowed); | ||
2659 | kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); | ||
2660 | - spin_unlock(&vcpu->kvm->mmu_lock); | ||
2661 | - | ||
2662 | - return r; | ||
2663 | |||
2664 | out_unlock: | ||
2665 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
2666 | kvm_release_pfn_clean(pfn); | ||
2667 | - return 0; | ||
2668 | + return r; | ||
2669 | } | ||
2670 | |||
2671 | static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp) | ||
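The reworked FNAME(fetch) loop above leans on two pieces of mask arithmetic: restoring the low bits of gw->gfn from the faulting virtual address, and truncating the gfn to the huge-page base at each level so an NX-disallowed huge mapping can be split into 4KiB pages. Here is a minimal standalone C sketch of that arithmetic (userspace only; the constants are redefined locally to mirror x86 4-level paging and nothing in it is kernel API):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT		12
#define PT64_LEVEL_BITS		9
#define KVM_PAGES_PER_HPAGE(l)	(1ULL << (((l) - 1) * PT64_LEVEL_BITS))
#define PT_LVL_OFFSET_MASK(l)	((KVM_PAGES_PER_HPAGE(l) - 1) << PAGE_SHIFT)

int main(void)
{
	uint64_t addr = 0x7f00deadb000ULL;	/* example guest virtual address */
	int walk_level = 2;			/* guest maps this VA with a 2MiB page */
	uint64_t gw_gfn = 0x123400;		/* gw->gfn with the low bits clobbered */

	/* Restore the bottom bits of the gfn from the virtual address. */
	uint64_t gfn = gw_gfn |
		((addr & PT_LVL_OFFSET_MASK(walk_level)) >> PAGE_SHIFT);

	/* Align down to the huge-page base at each level of the walk. */
	for (int level = walk_level; level >= 1; level--) {
		uint64_t base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
		printf("level %d: gfn %#llx base_gfn %#llx\n", level,
		       (unsigned long long)gfn, (unsigned long long)base_gfn);
	}
	return 0;
}

For a 2MiB guest mapping the level-2 base_gfn is 512-page aligned while the level-1 iteration lands on the exact 4KiB frame, which is what lets the loop above install small pages underneath a huge mapping that disallowed_hugepage_adjust() has refused.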
2672 | diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c | ||
2673 | index f7a7b98b3271..1079228e4fef 100644 | ||
2674 | --- a/arch/x86/kvm/svm.c | ||
2675 | +++ b/arch/x86/kvm/svm.c | ||
2676 | @@ -590,8 +590,14 @@ static int get_npt_level(void) | ||
2677 | static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) | ||
2678 | { | ||
2679 | vcpu->arch.efer = efer; | ||
2680 | - if (!npt_enabled && !(efer & EFER_LMA)) | ||
2681 | - efer &= ~EFER_LME; | ||
2682 | + | ||
2683 | + if (!npt_enabled) { | ||
2684 | + /* Shadow paging assumes NX to be available. */ | ||
2685 | + efer |= EFER_NX; | ||
2686 | + | ||
2687 | + if (!(efer & EFER_LMA)) | ||
2688 | + efer &= ~EFER_LME; | ||
2689 | + } | ||
2690 | |||
2691 | to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; | ||
2692 | mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); | ||
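The svm.c hunk encodes a simple invariant: with NPT disabled, KVM builds the shadow page tables with NX set, so the guest-visible EFER written to the VMCB must have NX forced on even if the guest never enabled it. A small self-contained sketch of that fixup (illustrative only: the bit constants are redefined locally and the helper is not a kernel function):

#include <assert.h>
#include <stdint.h>

#define EFER_LME	(1ULL << 8)	/* long mode enable */
#define EFER_LMA	(1ULL << 10)	/* long mode active */
#define EFER_NX		(1ULL << 11)	/* no-execute enable */

static uint64_t shadow_efer_fixup(uint64_t efer, int npt_enabled)
{
	if (!npt_enabled) {
		/* Shadow paging assumes NX to be available. */
		efer |= EFER_NX;

		/* Without LMA, LME must not leak into the saved EFER. */
		if (!(efer & EFER_LMA))
			efer &= ~EFER_LME;
	}
	return efer;
}

int main(void)
{
	/* 32-bit guest, NX off, shadow paging: NX is forced on. */
	assert(shadow_efer_fixup(0, 0) == EFER_NX);
	/* With NPT the guest value passes through unmodified. */
	assert(shadow_efer_fixup(0, 1) == 0);
	return 0;
}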
2693 | diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c | ||
2694 | index 6b66d1f0d185..4c0d6d0d6337 100644 | ||
2695 | --- a/arch/x86/kvm/vmx.c | ||
2696 | +++ b/arch/x86/kvm/vmx.c | ||
2697 | @@ -2219,17 +2219,9 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) | ||
2698 | u64 guest_efer = vmx->vcpu.arch.efer; | ||
2699 | u64 ignore_bits = 0; | ||
2700 | |||
2701 | - if (!enable_ept) { | ||
2702 | - /* | ||
2703 | - * NX is needed to handle CR0.WP=1, CR4.SMEP=1. Testing | ||
2704 | - * host CPUID is more efficient than testing guest CPUID | ||
2705 | - * or CR4. Host SMEP is anyway a requirement for guest SMEP. | ||
2706 | - */ | ||
2707 | - if (boot_cpu_has(X86_FEATURE_SMEP)) | ||
2708 | - guest_efer |= EFER_NX; | ||
2709 | - else if (!(guest_efer & EFER_NX)) | ||
2710 | - ignore_bits |= EFER_NX; | ||
2711 | - } | ||
2712 | + /* Shadow paging assumes NX to be available. */ | ||
2713 | + if (!enable_ept) | ||
2714 | + guest_efer |= EFER_NX; | ||
2715 | |||
2716 | /* | ||
2717 | * LMA and LME handled by hardware; SCE meaningless outside long mode. | ||
2718 | @@ -6556,16 +6548,9 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) | ||
2719 | NULL, 0) == EMULATE_DONE; | ||
2720 | } | ||
2721 | |||
2722 | - ret = handle_mmio_page_fault(vcpu, gpa, true); | ||
2723 | - if (likely(ret == RET_MMIO_PF_EMULATE)) | ||
2724 | - return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) == | ||
2725 | - EMULATE_DONE; | ||
2726 | - | ||
2727 | - if (unlikely(ret == RET_MMIO_PF_INVALID)) | ||
2728 | - return kvm_mmu_page_fault(vcpu, gpa, 0, NULL, 0); | ||
2729 | - | ||
2730 | - if (unlikely(ret == RET_MMIO_PF_RETRY)) | ||
2731 | - return 1; | ||
2732 | + ret = kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0); | ||
2733 | + if (ret >= 0) | ||
2734 | + return ret; | ||
2735 | |||
2736 | /* It is the real ept misconfig */ | ||
2737 | WARN_ON(1); | ||
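The simplified handle_ept_misconfig() above relies on the RET_PF_* convention that the mmu.c portion of this patch introduces earlier in the series (outside this excerpt). For reference, hedged to that backport, the constants look like the sketch below; kvm_mmu_page_fault() folds them into the usual exit-handler values, so a non-negative return here can be handed straight back (positive re-enters the guest, zero exits to userspace) and a negative value means the misconfiguration is real.

/* Page-fault handler outcomes, as added by the mmu.c part of this patch. */
enum {
	RET_PF_RETRY = 0,	/* let the vCPU fault again on the address */
	RET_PF_EMULATE = 1,	/* MMIO page fault, emulate the instruction */
	RET_PF_INVALID = 2,	/* the SPTE is stale, take the slow path */
};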
2738 | diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c | ||
2739 | index 0b6517f5821b..06cd710e1d45 100644 | ||
2740 | --- a/arch/x86/kvm/x86.c | ||
2741 | +++ b/arch/x86/kvm/x86.c | ||
2742 | @@ -191,6 +191,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { | ||
2743 | { "mmu_unsync", VM_STAT(mmu_unsync) }, | ||
2744 | { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, | ||
2745 | { "largepages", VM_STAT(lpages) }, | ||
2746 | + { "nx_largepages_splitted", VM_STAT(nx_lpage_splits) }, | ||
2747 | { NULL } | ||
2748 | }; | ||
2749 | |||
2750 | @@ -587,7 +588,7 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu) | ||
2751 | gfn_t gfn; | ||
2752 | int r; | ||
2753 | |||
2754 | - if (is_long_mode(vcpu) || !is_pae(vcpu)) | ||
2755 | + if (is_long_mode(vcpu) || !is_pae(vcpu) || !is_paging(vcpu)) | ||
2756 | return false; | ||
2757 | |||
2758 | if (!test_bit(VCPU_EXREG_PDPTR, | ||
2759 | @@ -1031,6 +1032,14 @@ u64 kvm_get_arch_capabilities(void) | ||
2760 | |||
2761 | rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data); | ||
2762 | |||
2763 | + /* | ||
2764 | + * If nx_huge_pages is enabled, KVM's shadow paging will ensure that | ||
2765 | + * the nested hypervisor runs with NX huge pages. If it is not, | ||
2766 | + * L1 is anyway vulnerable to ITLB_MULTIHIT exploits from other | ||
2767 | + * L1 guests, so it need not worry about its own (L2) guests. | ||
2768 | + */ | ||
2769 | + data |= ARCH_CAP_PSCHANGE_MC_NO; | ||
2770 | + | ||
2771 | /* | ||
2772 | * If we're doing cache flushes (either "always" or "cond") | ||
2773 | * we will do one whenever the guest does a vmlaunch/vmresume. | ||
2774 | @@ -1043,8 +1052,35 @@ u64 kvm_get_arch_capabilities(void) | ||
2775 | if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER) | ||
2776 | data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH; | ||
2777 | |||
2778 | + if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) | ||
2779 | + data |= ARCH_CAP_RDCL_NO; | ||
2780 | + if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) | ||
2781 | + data |= ARCH_CAP_SSB_NO; | ||
2782 | + if (!boot_cpu_has_bug(X86_BUG_MDS)) | ||
2783 | + data |= ARCH_CAP_MDS_NO; | ||
2784 | + | ||
2785 | + /* | ||
2786 | + * On TAA affected systems, export MDS_NO=0 when: | ||
2787 | + * - TSX is enabled on the host, i.e. X86_FEATURE_RTM=1. | ||
2788 | + * - Updated microcode is present. This is detected by | ||
2789 | + * the presence of ARCH_CAP_TSX_CTRL_MSR and ensures | ||
2790 | + * that VERW clears CPU buffers. | ||
2791 | + * | ||
2792 | + * When MDS_NO=0 is exported, guests deploy clear CPU buffer | ||
2793 | + * mitigation and don't complain: | ||
2794 | + * | ||
2795 | + * "Vulnerable: Clear CPU buffers attempted, no microcode" | ||
2796 | + * | ||
2797 | + * If TSX is disabled on the system, guests are also mitigated against | ||
2798 | + * TAA and clear CPU buffer mitigation is not required for guests. | ||
2799 | + */ | ||
2800 | + if (boot_cpu_has_bug(X86_BUG_TAA) && boot_cpu_has(X86_FEATURE_RTM) && | ||
2801 | + (data & ARCH_CAP_TSX_CTRL_MSR)) | ||
2802 | + data &= ~ARCH_CAP_MDS_NO; | ||
2803 | + | ||
2804 | return data; | ||
2805 | } | ||
2806 | + | ||
2807 | EXPORT_SYMBOL_GPL(kvm_get_arch_capabilities); | ||
2808 | |||
2809 | static int kvm_get_msr_feature(struct kvm_msr_entry *msr) | ||
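The new logic in kvm_get_arch_capabilities() is pure bit manipulation on the synthesized IA32_ARCH_CAPABILITIES value, so it can be modeled exactly in a few lines. A standalone sketch follows (userspace; the bit positions follow Intel's published MSR layout, and the boolean parameters stand in for boot_cpu_has_bug()/boot_cpu_has()):

#include <stdint.h>
#include <stdio.h>

#define ARCH_CAP_RDCL_NO	(1ULL << 0)	/* not affected by Meltdown */
#define ARCH_CAP_SSB_NO		(1ULL << 4)	/* not affected by SSB */
#define ARCH_CAP_MDS_NO		(1ULL << 5)	/* not affected by MDS */
#define ARCH_CAP_PSCHANGE_MC_NO	(1ULL << 6)	/* no MCE on page-size change */
#define ARCH_CAP_TSX_CTRL_MSR	(1ULL << 7)	/* TSX_CTRL microcode present */

static uint64_t guest_arch_caps(uint64_t host_caps, int bug_meltdown,
				int bug_ssb, int bug_mds, int bug_taa,
				int has_rtm)
{
	uint64_t data = host_caps;

	/* KVM's shadow paging always mitigates iTLB multihit for L2. */
	data |= ARCH_CAP_PSCHANGE_MC_NO;

	if (!bug_meltdown)
		data |= ARCH_CAP_RDCL_NO;
	if (!bug_ssb)
		data |= ARCH_CAP_SSB_NO;
	if (!bug_mds)
		data |= ARCH_CAP_MDS_NO;

	/*
	 * TAA-affected host with TSX enabled and TSX_CTRL microcode:
	 * clear MDS_NO so the guest deploys the VERW buffer clearing,
	 * which that microcode makes effective.
	 */
	if (bug_taa && has_rtm && (data & ARCH_CAP_TSX_CTRL_MSR))
		data &= ~ARCH_CAP_MDS_NO;

	return data;
}

int main(void)
{
	/* Cascade Lake-like host: MDS_NO in hardware, but TAA-affected. */
	uint64_t caps = guest_arch_caps(ARCH_CAP_TSX_CTRL_MSR, 0, 0, 0, 1, 1);

	printf("guest MDS_NO is %s\n",
	       (caps & ARCH_CAP_MDS_NO) ? "set" : "clear");
	return 0;
}

On that configuration the program prints "clear": the guest sees MDS_NO=0 and enables VERW buffer clearing without complaining, exactly the behaviour the comment in the hunk describes.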
2810 | @@ -5951,17 +5987,17 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va | ||
2811 | |||
2812 | smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); | ||
2813 | |||
2814 | - spin_lock(&kvm_lock); | ||
2815 | + mutex_lock(&kvm_lock); | ||
2816 | list_for_each_entry(kvm, &vm_list, vm_list) { | ||
2817 | kvm_for_each_vcpu(i, vcpu, kvm) { | ||
2818 | if (vcpu->cpu != freq->cpu) | ||
2819 | continue; | ||
2820 | kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); | ||
2821 | - if (vcpu->cpu != smp_processor_id()) | ||
2822 | + if (vcpu->cpu != raw_smp_processor_id()) | ||
2823 | send_ipi = 1; | ||
2824 | } | ||
2825 | } | ||
2826 | - spin_unlock(&kvm_lock); | ||
2827 | + mutex_unlock(&kvm_lock); | ||
2828 | |||
2829 | if (freq->old < freq->new && send_ipi) { | ||
2830 | /* | ||
2831 | @@ -6099,12 +6135,12 @@ static void pvclock_gtod_update_fn(struct work_struct *work) | ||
2832 | struct kvm_vcpu *vcpu; | ||
2833 | int i; | ||
2834 | |||
2835 | - spin_lock(&kvm_lock); | ||
2836 | + mutex_lock(&kvm_lock); | ||
2837 | list_for_each_entry(kvm, &vm_list, vm_list) | ||
2838 | kvm_for_each_vcpu(i, vcpu, kvm) | ||
2839 | kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); | ||
2840 | atomic_set(&kvm_guest_has_master_clock, 0); | ||
2841 | - spin_unlock(&kvm_lock); | ||
2842 | + mutex_unlock(&kvm_lock); | ||
2843 | } | ||
2844 | |||
2845 | static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn); | ||
2846 | @@ -7491,7 +7527,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, | ||
2847 | kvm_update_cpuid(vcpu); | ||
2848 | |||
2849 | idx = srcu_read_lock(&vcpu->kvm->srcu); | ||
2850 | - if (!is_long_mode(vcpu) && is_pae(vcpu)) { | ||
2851 | + if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu)) { | ||
2852 | load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); | ||
2853 | mmu_reset_needed = 1; | ||
2854 | } | ||
2855 | @@ -8072,6 +8108,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) | ||
2856 | INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list); | ||
2857 | INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); | ||
2858 | INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); | ||
2859 | + INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages); | ||
2860 | INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); | ||
2861 | atomic_set(&kvm->arch.noncoherent_dma_count, 0); | ||
2862 | |||
2863 | @@ -8100,6 +8137,11 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) | ||
2864 | return 0; | ||
2865 | } | ||
2866 | |||
2867 | +int kvm_arch_post_init_vm(struct kvm *kvm) | ||
2868 | +{ | ||
2869 | + return kvm_mmu_post_init_vm(kvm); | ||
2870 | +} | ||
2871 | + | ||
2872 | static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) | ||
2873 | { | ||
2874 | int r; | ||
2875 | @@ -8206,6 +8248,11 @@ int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) | ||
2876 | } | ||
2877 | EXPORT_SYMBOL_GPL(x86_set_memory_region); | ||
2878 | |||
2879 | +void kvm_arch_pre_destroy_vm(struct kvm *kvm) | ||
2880 | +{ | ||
2881 | + kvm_mmu_pre_destroy_vm(kvm); | ||
2882 | +} | ||
2883 | + | ||
2884 | void kvm_arch_destroy_vm(struct kvm *kvm) | ||
2885 | { | ||
2886 | if (current->mm == kvm->mm) { | ||
2887 | diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c | ||
2888 | index 3b123735a1c4..677c5f36674b 100644 | ||
2889 | --- a/drivers/base/cpu.c | ||
2890 | +++ b/drivers/base/cpu.c | ||
2891 | @@ -537,12 +537,27 @@ ssize_t __weak cpu_show_mds(struct device *dev, | ||
2892 | return sprintf(buf, "Not affected\n"); | ||
2893 | } | ||
2894 | |||
2895 | +ssize_t __weak cpu_show_tsx_async_abort(struct device *dev, | ||
2896 | + struct device_attribute *attr, | ||
2897 | + char *buf) | ||
2898 | +{ | ||
2899 | + return sprintf(buf, "Not affected\n"); | ||
2900 | +} | ||
2901 | + | ||
2902 | +ssize_t __weak cpu_show_itlb_multihit(struct device *dev, | ||
2903 | + struct device_attribute *attr, char *buf) | ||
2904 | +{ | ||
2905 | + return sprintf(buf, "Not affected\n"); | ||
2906 | +} | ||
2907 | + | ||
2908 | static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL); | ||
2909 | static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL); | ||
2910 | static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL); | ||
2911 | static DEVICE_ATTR(spec_store_bypass, 0444, cpu_show_spec_store_bypass, NULL); | ||
2912 | static DEVICE_ATTR(l1tf, 0444, cpu_show_l1tf, NULL); | ||
2913 | static DEVICE_ATTR(mds, 0444, cpu_show_mds, NULL); | ||
2914 | +static DEVICE_ATTR(tsx_async_abort, 0444, cpu_show_tsx_async_abort, NULL); | ||
2915 | +static DEVICE_ATTR(itlb_multihit, 0444, cpu_show_itlb_multihit, NULL); | ||
2916 | |||
2917 | static struct attribute *cpu_root_vulnerabilities_attrs[] = { | ||
2918 | &dev_attr_meltdown.attr, | ||
2919 | @@ -551,6 +566,8 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = { | ||
2920 | &dev_attr_spec_store_bypass.attr, | ||
2921 | &dev_attr_l1tf.attr, | ||
2922 | &dev_attr_mds.attr, | ||
2923 | + &dev_attr_tsx_async_abort.attr, | ||
2924 | + &dev_attr_itlb_multihit.attr, | ||
2925 | NULL | ||
2926 | }; | ||
2927 | |||
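The weak cpu_show_tsx_async_abort()/cpu_show_itlb_multihit() bodies above are only the "Not affected" fallback for architectures that never opt in; a strong definition wins at link time. As a hedged sketch, the x86 overrides added earlier in this patch (in arch/x86/kernel/cpu/bugs.c, not part of this excerpt) have this shape, with cpu_show_common() rendering the X86_BUG_* mitigation state into the sysfs string:

ssize_t cpu_show_itlb_multihit(struct device *dev,
			       struct device_attribute *attr, char *buf)
{
	return cpu_show_common(dev, attr, buf, X86_BUG_ITLB_MULTIHIT);
}

ssize_t cpu_show_tsx_async_abort(struct device *dev,
				 struct device_attribute *attr, char *buf)
{
	return cpu_show_common(dev, attr, buf, X86_BUG_TAA);
}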
2928 | diff --git a/drivers/bluetooth/hci_ldisc.c b/drivers/bluetooth/hci_ldisc.c | ||
2929 | index a2f6953a86f5..0a21fb86fd67 100644 | ||
2930 | --- a/drivers/bluetooth/hci_ldisc.c | ||
2931 | +++ b/drivers/bluetooth/hci_ldisc.c | ||
2932 | @@ -653,15 +653,14 @@ static int hci_uart_set_proto(struct hci_uart *hu, int id) | ||
2933 | return err; | ||
2934 | |||
2935 | hu->proto = p; | ||
2936 | - set_bit(HCI_UART_PROTO_READY, &hu->flags); | ||
2937 | |||
2938 | err = hci_uart_register_dev(hu); | ||
2939 | if (err) { | ||
2940 | - clear_bit(HCI_UART_PROTO_READY, &hu->flags); | ||
2941 | p->close(hu); | ||
2942 | return err; | ||
2943 | } | ||
2944 | |||
2945 | + set_bit(HCI_UART_PROTO_READY, &hu->flags); | ||
2946 | return 0; | ||
2947 | } | ||
2948 | |||
2949 | diff --git a/drivers/usb/gadget/udc/core.c b/drivers/usb/gadget/udc/core.c | ||
2950 | index 95e28ecfde0a..99c7cf4822c3 100644 | ||
2951 | --- a/drivers/usb/gadget/udc/core.c | ||
2952 | +++ b/drivers/usb/gadget/udc/core.c | ||
2953 | @@ -817,6 +817,8 @@ int usb_gadget_map_request_by_dev(struct device *dev, | ||
2954 | dev_err(dev, "failed to map buffer\n"); | ||
2955 | return -EFAULT; | ||
2956 | } | ||
2957 | + | ||
2958 | + req->dma_mapped = 1; | ||
2959 | } | ||
2960 | |||
2961 | return 0; | ||
2962 | @@ -841,9 +843,10 @@ void usb_gadget_unmap_request_by_dev(struct device *dev, | ||
2963 | is_in ? DMA_TO_DEVICE : DMA_FROM_DEVICE); | ||
2964 | |||
2965 | req->num_mapped_sgs = 0; | ||
2966 | - } else { | ||
2967 | + } else if (req->dma_mapped) { | ||
2968 | dma_unmap_single(dev, req->dma, req->length, | ||
2969 | is_in ? DMA_TO_DEVICE : DMA_FROM_DEVICE); | ||
2970 | + req->dma_mapped = 0; | ||
2971 | } | ||
2972 | } | ||
2973 | EXPORT_SYMBOL_GPL(usb_gadget_unmap_request_by_dev); | ||
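The udc/core.c fix pairs dma_map_single() with a new per-request dma_mapped flag (the bitfield itself is added to struct usb_request further down), so the unmap path no longer infers state from num_mapped_sgs alone and cannot unmap a DMA address it never created, such as one pre-set by the gadget driver. A minimal userspace model of the guard (the map/unmap helpers are stand-ins for the DMA API, not kernel calls):

#include <assert.h>
#include <stdbool.h>

struct fake_req {
	bool dma_mapped;	/* set only when we created the mapping */
	int map_count;		/* balance checker for this example */
};

static void map_request(struct fake_req *req, bool need_mapping)
{
	if (need_mapping) {
		req->map_count++;	/* stands in for dma_map_single() */
		req->dma_mapped = true;
	}
}

static void unmap_request(struct fake_req *req)
{
	if (req->dma_mapped) {		/* the new guard */
		req->map_count--;	/* stands in for dma_unmap_single() */
		req->dma_mapped = false;
	}
}

int main(void)
{
	struct fake_req req = { 0 };

	/* Request with a driver-provided DMA address: never mapped here. */
	map_request(&req, false);
	unmap_request(&req);
	assert(req.map_count == 0);	/* no spurious unmap */

	/* Normally mapped request: map and unmap stay balanced. */
	map_request(&req, true);
	unmap_request(&req);
	assert(req.map_count == 0);
	return 0;
}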
2974 | diff --git a/include/linux/cpu.h b/include/linux/cpu.h | ||
2975 | index b27c9b2e683f..e19bbc38a722 100644 | ||
2976 | --- a/include/linux/cpu.h | ||
2977 | +++ b/include/linux/cpu.h | ||
2978 | @@ -56,6 +56,11 @@ extern ssize_t cpu_show_l1tf(struct device *dev, | ||
2979 | struct device_attribute *attr, char *buf); | ||
2980 | extern ssize_t cpu_show_mds(struct device *dev, | ||
2981 | struct device_attribute *attr, char *buf); | ||
2982 | +extern ssize_t cpu_show_tsx_async_abort(struct device *dev, | ||
2983 | + struct device_attribute *attr, | ||
2984 | + char *buf); | ||
2985 | +extern ssize_t cpu_show_itlb_multihit(struct device *dev, | ||
2986 | + struct device_attribute *attr, char *buf); | ||
2987 | |||
2988 | extern __printf(4, 5) | ||
2989 | struct device *cpu_device_create(struct device *parent, void *drvdata, | ||
2990 | @@ -282,28 +287,7 @@ static inline int cpuhp_smt_enable(void) { return 0; } | ||
2991 | static inline int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { return 0; } | ||
2992 | #endif | ||
2993 | |||
2994 | -/* | ||
2995 | - * These are used for a global "mitigations=" cmdline option for toggling | ||
2996 | - * optional CPU mitigations. | ||
2997 | - */ | ||
2998 | -enum cpu_mitigations { | ||
2999 | - CPU_MITIGATIONS_OFF, | ||
3000 | - CPU_MITIGATIONS_AUTO, | ||
3001 | - CPU_MITIGATIONS_AUTO_NOSMT, | ||
3002 | -}; | ||
3003 | - | ||
3004 | -extern enum cpu_mitigations cpu_mitigations; | ||
3005 | - | ||
3006 | -/* mitigations=off */ | ||
3007 | -static inline bool cpu_mitigations_off(void) | ||
3008 | -{ | ||
3009 | - return cpu_mitigations == CPU_MITIGATIONS_OFF; | ||
3010 | -} | ||
3011 | - | ||
3012 | -/* mitigations=auto,nosmt */ | ||
3013 | -static inline bool cpu_mitigations_auto_nosmt(void) | ||
3014 | -{ | ||
3015 | - return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT; | ||
3016 | -} | ||
3017 | +extern bool cpu_mitigations_off(void); | ||
3018 | +extern bool cpu_mitigations_auto_nosmt(void); | ||
3019 | |||
3020 | #endif /* _LINUX_CPU_H_ */ | ||
3021 | diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h | ||
3022 | index eb55374b73f3..0590e7d47b02 100644 | ||
3023 | --- a/include/linux/kvm_host.h | ||
3024 | +++ b/include/linux/kvm_host.h | ||
3025 | @@ -129,7 +129,7 @@ static inline bool is_error_page(struct page *page) | ||
3026 | |||
3027 | extern struct kmem_cache *kvm_vcpu_cache; | ||
3028 | |||
3029 | -extern spinlock_t kvm_lock; | ||
3030 | +extern struct mutex kvm_lock; | ||
3031 | extern struct list_head vm_list; | ||
3032 | |||
3033 | struct kvm_io_range { | ||
3034 | @@ -1208,4 +1208,10 @@ static inline bool vcpu_valid_wakeup(struct kvm_vcpu *vcpu) | ||
3035 | } | ||
3036 | #endif /* CONFIG_HAVE_KVM_INVALID_WAKEUPS */ | ||
3037 | |||
3038 | +typedef int (*kvm_vm_thread_fn_t)(struct kvm *kvm, uintptr_t data); | ||
3039 | + | ||
3040 | +int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn, | ||
3041 | + uintptr_t data, const char *name, | ||
3042 | + struct task_struct **thread_ptr); | ||
3043 | + | ||
3044 | #endif | ||
3045 | diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h | ||
3046 | index e4516e9ded0f..4b810bc7ae63 100644 | ||
3047 | --- a/include/linux/usb/gadget.h | ||
3048 | +++ b/include/linux/usb/gadget.h | ||
3049 | @@ -48,6 +48,7 @@ struct usb_ep; | ||
3050 | * by adding a zero length packet as needed; | ||
3051 | * @short_not_ok: When reading data, makes short packets be | ||
3052 | * treated as errors (queue stops advancing till cleanup). | ||
3053 | + * @dma_mapped: Indicates if request has been mapped to DMA (internal) | ||
3054 | * @complete: Function called when request completes, so this request and | ||
3055 | * its buffer may be re-used. The function will always be called with | ||
3056 | * interrupts disabled, and it must not sleep. | ||
3057 | @@ -103,6 +104,7 @@ struct usb_request { | ||
3058 | unsigned no_interrupt:1; | ||
3059 | unsigned zero:1; | ||
3060 | unsigned short_not_ok:1; | ||
3061 | + unsigned dma_mapped:1; | ||
3062 | |||
3063 | void (*complete)(struct usb_ep *ep, | ||
3064 | struct usb_request *req); | ||
3065 | diff --git a/kernel/cpu.c b/kernel/cpu.c | ||
3066 | index c947bb35b89f..0ed3e9deda30 100644 | ||
3067 | --- a/kernel/cpu.c | ||
3068 | +++ b/kernel/cpu.c | ||
3069 | @@ -2235,7 +2235,18 @@ void __init boot_cpu_hotplug_init(void) | ||
3070 | this_cpu_write(cpuhp_state.state, CPUHP_ONLINE); | ||
3071 | } | ||
3072 | |||
3073 | -enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO; | ||
3074 | +/* | ||
3075 | + * These are used for a global "mitigations=" cmdline option for toggling | ||
3076 | + * optional CPU mitigations. | ||
3077 | + */ | ||
3078 | +enum cpu_mitigations { | ||
3079 | + CPU_MITIGATIONS_OFF, | ||
3080 | + CPU_MITIGATIONS_AUTO, | ||
3081 | + CPU_MITIGATIONS_AUTO_NOSMT, | ||
3082 | +}; | ||
3083 | + | ||
3084 | +static enum cpu_mitigations cpu_mitigations __ro_after_init = | ||
3085 | + CPU_MITIGATIONS_AUTO; | ||
3086 | |||
3087 | static int __init mitigations_parse_cmdline(char *arg) | ||
3088 | { | ||
3089 | @@ -2252,3 +2263,17 @@ static int __init mitigations_parse_cmdline(char *arg) | ||
3090 | return 0; | ||
3091 | } | ||
3092 | early_param("mitigations", mitigations_parse_cmdline); | ||
3093 | + | ||
3094 | +/* mitigations=off */ | ||
3095 | +bool cpu_mitigations_off(void) | ||
3096 | +{ | ||
3097 | + return cpu_mitigations == CPU_MITIGATIONS_OFF; | ||
3098 | +} | ||
3099 | +EXPORT_SYMBOL_GPL(cpu_mitigations_off); | ||
3100 | + | ||
3101 | +/* mitigations=auto,nosmt */ | ||
3102 | +bool cpu_mitigations_auto_nosmt(void) | ||
3103 | +{ | ||
3104 | + return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT; | ||
3105 | +} | ||
3106 | +EXPORT_SYMBOL_GPL(cpu_mitigations_auto_nosmt); | ||
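Moving the enum and the accessors out of the header means kvm (a module) can now key off "mitigations=" through the two exported predicates, while the raw cpu_mitigations state stays private to kernel/cpu.c. For context, the parser that feeds it is unchanged and therefore mostly elided by the diff; upstream its body reads as follows (quoted from the original "mitigations=" series, so treat it as reference material rather than part of this patch):

static int __init mitigations_parse_cmdline(char *arg)
{
	if (!strcmp(arg, "off"))
		cpu_mitigations = CPU_MITIGATIONS_OFF;
	else if (!strcmp(arg, "auto"))
		cpu_mitigations = CPU_MITIGATIONS_AUTO;
	else if (!strcmp(arg, "auto,nosmt"))
		cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT;
	else
		pr_crit("Unsupported mitigations=%s, system may still be vulnerable\n",
			arg);

	return 0;
}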
3107 | diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c | ||
3108 | index c72586a094ed..0fc93519e63e 100644 | ||
3109 | --- a/virt/kvm/kvm_main.c | ||
3110 | +++ b/virt/kvm/kvm_main.c | ||
3111 | @@ -49,6 +49,7 @@ | ||
3112 | #include <linux/slab.h> | ||
3113 | #include <linux/sort.h> | ||
3114 | #include <linux/bsearch.h> | ||
3115 | +#include <linux/kthread.h> | ||
3116 | |||
3117 | #include <asm/processor.h> | ||
3118 | #include <asm/io.h> | ||
3119 | @@ -87,7 +88,7 @@ module_param(halt_poll_ns_shrink, uint, S_IRUGO | S_IWUSR); | ||
3120 | * kvm->lock --> kvm->slots_lock --> kvm->irq_lock | ||
3121 | */ | ||
3122 | |||
3123 | -DEFINE_SPINLOCK(kvm_lock); | ||
3124 | +DEFINE_MUTEX(kvm_lock); | ||
3125 | static DEFINE_RAW_SPINLOCK(kvm_count_lock); | ||
3126 | LIST_HEAD(vm_list); | ||
3127 | |||
3128 | @@ -612,6 +613,23 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) | ||
3129 | return 0; | ||
3130 | } | ||
3131 | |||
3132 | +/* | ||
3133 | + * Called after the VM is otherwise initialized, but just before adding it to | ||
3134 | + * the vm_list. | ||
3135 | + */ | ||
3136 | +int __weak kvm_arch_post_init_vm(struct kvm *kvm) | ||
3137 | +{ | ||
3138 | + return 0; | ||
3139 | +} | ||
3140 | + | ||
3141 | +/* | ||
3142 | + * Called just after removing the VM from the vm_list, but before doing any | ||
3143 | + * other destruction. | ||
3144 | + */ | ||
3145 | +void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm) | ||
3146 | +{ | ||
3147 | +} | ||
3148 | + | ||
3149 | static struct kvm *kvm_create_vm(unsigned long type) | ||
3150 | { | ||
3151 | int r, i; | ||
3152 | @@ -659,22 +677,31 @@ static struct kvm *kvm_create_vm(unsigned long type) | ||
3153 | kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus), | ||
3154 | GFP_KERNEL); | ||
3155 | if (!kvm->buses[i]) | ||
3156 | - goto out_err; | ||
3157 | + goto out_err_no_mmu_notifier; | ||
3158 | } | ||
3159 | |||
3160 | r = kvm_init_mmu_notifier(kvm); | ||
3161 | + if (r) | ||
3162 | + goto out_err_no_mmu_notifier; | ||
3163 | + | ||
3164 | + r = kvm_arch_post_init_vm(kvm); | ||
3165 | if (r) | ||
3166 | goto out_err; | ||
3167 | |||
3168 | - spin_lock(&kvm_lock); | ||
3169 | + mutex_lock(&kvm_lock); | ||
3170 | list_add(&kvm->vm_list, &vm_list); | ||
3171 | - spin_unlock(&kvm_lock); | ||
3172 | + mutex_unlock(&kvm_lock); | ||
3173 | |||
3174 | preempt_notifier_inc(); | ||
3175 | |||
3176 | return kvm; | ||
3177 | |||
3178 | out_err: | ||
3179 | +#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) | ||
3180 | + if (kvm->mmu_notifier.ops) | ||
3181 | + mmu_notifier_unregister(&kvm->mmu_notifier, current->mm); | ||
3182 | +#endif | ||
3183 | +out_err_no_mmu_notifier: | ||
3184 | cleanup_srcu_struct(&kvm->irq_srcu); | ||
3185 | out_err_no_irq_srcu: | ||
3186 | cleanup_srcu_struct(&kvm->srcu); | ||
3187 | @@ -724,9 +751,11 @@ static void kvm_destroy_vm(struct kvm *kvm) | ||
3188 | |||
3189 | kvm_destroy_vm_debugfs(kvm); | ||
3190 | kvm_arch_sync_events(kvm); | ||
3191 | - spin_lock(&kvm_lock); | ||
3192 | + mutex_lock(&kvm_lock); | ||
3193 | list_del(&kvm->vm_list); | ||
3194 | - spin_unlock(&kvm_lock); | ||
3195 | + mutex_unlock(&kvm_lock); | ||
3196 | + kvm_arch_pre_destroy_vm(kvm); | ||
3197 | + | ||
3198 | kvm_free_irq_routing(kvm); | ||
3199 | for (i = 0; i < KVM_NR_BUSES; i++) { | ||
3200 | if (kvm->buses[i]) | ||
3201 | @@ -3752,13 +3781,13 @@ static int vm_stat_get(void *_offset, u64 *val) | ||
3202 | u64 tmp_val; | ||
3203 | |||
3204 | *val = 0; | ||
3205 | - spin_lock(&kvm_lock); | ||
3206 | + mutex_lock(&kvm_lock); | ||
3207 | list_for_each_entry(kvm, &vm_list, vm_list) { | ||
3208 | stat_tmp.kvm = kvm; | ||
3209 | vm_stat_get_per_vm((void *)&stat_tmp, &tmp_val); | ||
3210 | *val += tmp_val; | ||
3211 | } | ||
3212 | - spin_unlock(&kvm_lock); | ||
3213 | + mutex_unlock(&kvm_lock); | ||
3214 | return 0; | ||
3215 | } | ||
3216 | |||
3217 | @@ -3772,13 +3801,13 @@ static int vcpu_stat_get(void *_offset, u64 *val) | ||
3218 | u64 tmp_val; | ||
3219 | |||
3220 | *val = 0; | ||
3221 | - spin_lock(&kvm_lock); | ||
3222 | + mutex_lock(&kvm_lock); | ||
3223 | list_for_each_entry(kvm, &vm_list, vm_list) { | ||
3224 | stat_tmp.kvm = kvm; | ||
3225 | vcpu_stat_get_per_vm((void *)&stat_tmp, &tmp_val); | ||
3226 | *val += tmp_val; | ||
3227 | } | ||
3228 | - spin_unlock(&kvm_lock); | ||
3229 | + mutex_unlock(&kvm_lock); | ||
3230 | return 0; | ||
3231 | } | ||
3232 | |||
3233 | @@ -3987,3 +4016,86 @@ void kvm_exit(void) | ||
3234 | kvm_vfio_ops_exit(); | ||
3235 | } | ||
3236 | EXPORT_SYMBOL_GPL(kvm_exit); | ||
3237 | + | ||
3238 | +struct kvm_vm_worker_thread_context { | ||
3239 | + struct kvm *kvm; | ||
3240 | + struct task_struct *parent; | ||
3241 | + struct completion init_done; | ||
3242 | + kvm_vm_thread_fn_t thread_fn; | ||
3243 | + uintptr_t data; | ||
3244 | + int err; | ||
3245 | +}; | ||
3246 | + | ||
3247 | +static int kvm_vm_worker_thread(void *context) | ||
3248 | +{ | ||
3249 | + /* | ||
3250 | + * The init_context is allocated on the stack of the parent thread, so | ||
3251 | + * we have to locally copy anything that is needed beyond initialization | ||
3252 | + */ | ||
3253 | + struct kvm_vm_worker_thread_context *init_context = context; | ||
3254 | + struct kvm *kvm = init_context->kvm; | ||
3255 | + kvm_vm_thread_fn_t thread_fn = init_context->thread_fn; | ||
3256 | + uintptr_t data = init_context->data; | ||
3257 | + int err; | ||
3258 | + | ||
3259 | + err = kthread_park(current); | ||
3260 | + /* kthread_park(current) is never supposed to return an error */ | ||
3261 | + WARN_ON(err != 0); | ||
3262 | + if (err) | ||
3263 | + goto init_complete; | ||
3264 | + | ||
3265 | + err = cgroup_attach_task_all(init_context->parent, current); | ||
3266 | + if (err) { | ||
3267 | + kvm_err("%s: cgroup_attach_task_all failed with err %d\n", | ||
3268 | + __func__, err); | ||
3269 | + goto init_complete; | ||
3270 | + } | ||
3271 | + | ||
3272 | + set_user_nice(current, task_nice(init_context->parent)); | ||
3273 | + | ||
3274 | +init_complete: | ||
3275 | + init_context->err = err; | ||
3276 | + complete(&init_context->init_done); | ||
3277 | + init_context = NULL; | ||
3278 | + | ||
3279 | + if (err) | ||
3280 | + return err; | ||
3281 | + | ||
3282 | + /* Wait to be woken up by the spawner before proceeding. */ | ||
3283 | + kthread_parkme(); | ||
3284 | + | ||
3285 | + if (!kthread_should_stop()) | ||
3286 | + err = thread_fn(kvm, data); | ||
3287 | + | ||
3288 | + return err; | ||
3289 | +} | ||
3290 | + | ||
3291 | +int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn, | ||
3292 | + uintptr_t data, const char *name, | ||
3293 | + struct task_struct **thread_ptr) | ||
3294 | +{ | ||
3295 | + struct kvm_vm_worker_thread_context init_context = {}; | ||
3296 | + struct task_struct *thread; | ||
3297 | + | ||
3298 | + *thread_ptr = NULL; | ||
3299 | + init_context.kvm = kvm; | ||
3300 | + init_context.parent = current; | ||
3301 | + init_context.thread_fn = thread_fn; | ||
3302 | + init_context.data = data; | ||
3303 | + init_completion(&init_context.init_done); | ||
3304 | + | ||
3305 | + thread = kthread_run(kvm_vm_worker_thread, &init_context, | ||
3306 | + "%s-%d", name, task_pid_nr(current)); | ||
3307 | + if (IS_ERR(thread)) | ||
3308 | + return PTR_ERR(thread); | ||
3309 | + | ||
3310 | + /* kthread_run is never supposed to return NULL */ | ||
3311 | + WARN_ON(thread == NULL); | ||
3312 | + | ||
3313 | + wait_for_completion(&init_context.init_done); | ||
3314 | + | ||
3315 | + if (!init_context.err) | ||
3316 | + *thread_ptr = thread; | ||
3317 | + | ||
3318 | + return init_context.err; | ||
3319 | +} |
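kvm_vm_create_worker_thread() returns with the kthread parked, so the caller decides when the thread actually starts running. As a hedged illustration, the x86 MMU side of this series (earlier in the patch, outside this excerpt) consumes the helper roughly like this to run the per-VM NX huge page recovery thread:

int kvm_mmu_post_init_vm(struct kvm *kvm)
{
	int err;

	/* Spawn parked, then unpark once setup has succeeded. */
	err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0,
					  "kvm-nx-lpage-recovery",
					  &kvm->arch.nx_lpage_recovery_thread);
	if (!err)
		kthread_unpark(kvm->arch.nx_lpage_recovery_thread);

	return err;
}

void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
{
	if (kvm->arch.nx_lpage_recovery_thread)
		kthread_stop(kvm->arch.nx_lpage_recovery_thread);
}

The kvm_arch_post_init_vm()/kvm_arch_pre_destroy_vm() hooks added above call these at the right points in the VM lifecycle: after the MMU notifier is registered but before the VM appears on vm_list, and symmetrically just after it is removed from vm_list on teardown.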