Magellan Linux

Contents of /trunk/kernel-alx/patches-4.9/0313-4.9.214-all-fixes.patch

Revision 3588
Thu Aug 13 10:21:30 2020 UTC by niro
File size: 355649 bytes
linux-214
1 diff --git a/Makefile b/Makefile
2 index de79c801abcd..9a6aa41a9ec1 100644
3 --- a/Makefile
4 +++ b/Makefile
5 @@ -1,6 +1,6 @@
6 VERSION = 4
7 PATCHLEVEL = 9
8 -SUBLEVEL = 213
9 +SUBLEVEL = 214
10 EXTRAVERSION =
11 NAME = Roaring Lionus
12
13 diff --git a/arch/arc/boot/dts/axs10x_mb.dtsi b/arch/arc/boot/dts/axs10x_mb.dtsi
14 index d6c1bbc98ac3..15698b3e490f 100644
15 --- a/arch/arc/boot/dts/axs10x_mb.dtsi
16 +++ b/arch/arc/boot/dts/axs10x_mb.dtsi
17 @@ -63,6 +63,7 @@
18 interrupt-names = "macirq";
19 phy-mode = "rgmii";
20 snps,pbl = < 32 >;
21 + snps,multicast-filter-bins = <256>;
22 clocks = <&apbclk>;
23 clock-names = "stmmaceth";
24 max-speed = <100>;
25 diff --git a/arch/arm/boot/dts/sama5d3.dtsi b/arch/arm/boot/dts/sama5d3.dtsi
26 index 4c84d333fc7e..33c0d2668934 100644
27 --- a/arch/arm/boot/dts/sama5d3.dtsi
28 +++ b/arch/arm/boot/dts/sama5d3.dtsi
29 @@ -1109,49 +1109,49 @@
30 usart0_clk: usart0_clk {
31 #clock-cells = <0>;
32 reg = <12>;
33 - atmel,clk-output-range = <0 66000000>;
34 + atmel,clk-output-range = <0 83000000>;
35 };
36
37 usart1_clk: usart1_clk {
38 #clock-cells = <0>;
39 reg = <13>;
40 - atmel,clk-output-range = <0 66000000>;
41 + atmel,clk-output-range = <0 83000000>;
42 };
43
44 usart2_clk: usart2_clk {
45 #clock-cells = <0>;
46 reg = <14>;
47 - atmel,clk-output-range = <0 66000000>;
48 + atmel,clk-output-range = <0 83000000>;
49 };
50
51 usart3_clk: usart3_clk {
52 #clock-cells = <0>;
53 reg = <15>;
54 - atmel,clk-output-range = <0 66000000>;
55 + atmel,clk-output-range = <0 83000000>;
56 };
57
58 uart0_clk: uart0_clk {
59 #clock-cells = <0>;
60 reg = <16>;
61 - atmel,clk-output-range = <0 66000000>;
62 + atmel,clk-output-range = <0 83000000>;
63 };
64
65 twi0_clk: twi0_clk {
66 reg = <18>;
67 #clock-cells = <0>;
68 - atmel,clk-output-range = <0 16625000>;
69 + atmel,clk-output-range = <0 41500000>;
70 };
71
72 twi1_clk: twi1_clk {
73 #clock-cells = <0>;
74 reg = <19>;
75 - atmel,clk-output-range = <0 16625000>;
76 + atmel,clk-output-range = <0 41500000>;
77 };
78
79 twi2_clk: twi2_clk {
80 #clock-cells = <0>;
81 reg = <20>;
82 - atmel,clk-output-range = <0 16625000>;
83 + atmel,clk-output-range = <0 41500000>;
84 };
85
86 mci0_clk: mci0_clk {
87 @@ -1167,19 +1167,19 @@
88 spi0_clk: spi0_clk {
89 #clock-cells = <0>;
90 reg = <24>;
91 - atmel,clk-output-range = <0 133000000>;
92 + atmel,clk-output-range = <0 166000000>;
93 };
94
95 spi1_clk: spi1_clk {
96 #clock-cells = <0>;
97 reg = <25>;
98 - atmel,clk-output-range = <0 133000000>;
99 + atmel,clk-output-range = <0 166000000>;
100 };
101
102 tcb0_clk: tcb0_clk {
103 #clock-cells = <0>;
104 reg = <26>;
105 - atmel,clk-output-range = <0 133000000>;
106 + atmel,clk-output-range = <0 166000000>;
107 };
108
109 pwm_clk: pwm_clk {
110 @@ -1190,7 +1190,7 @@
111 adc_clk: adc_clk {
112 #clock-cells = <0>;
113 reg = <29>;
114 - atmel,clk-output-range = <0 66000000>;
115 + atmel,clk-output-range = <0 83000000>;
116 };
117
118 dma0_clk: dma0_clk {
119 @@ -1221,13 +1221,13 @@
120 ssc0_clk: ssc0_clk {
121 #clock-cells = <0>;
122 reg = <38>;
123 - atmel,clk-output-range = <0 66000000>;
124 + atmel,clk-output-range = <0 83000000>;
125 };
126
127 ssc1_clk: ssc1_clk {
128 #clock-cells = <0>;
129 reg = <39>;
130 - atmel,clk-output-range = <0 66000000>;
131 + atmel,clk-output-range = <0 83000000>;
132 };
133
134 sha_clk: sha_clk {
135 diff --git a/arch/arm/boot/dts/sama5d3_can.dtsi b/arch/arm/boot/dts/sama5d3_can.dtsi
136 index c5a3772741bf..0fac79f75c06 100644
137 --- a/arch/arm/boot/dts/sama5d3_can.dtsi
138 +++ b/arch/arm/boot/dts/sama5d3_can.dtsi
139 @@ -37,13 +37,13 @@
140 can0_clk: can0_clk {
141 #clock-cells = <0>;
142 reg = <40>;
143 - atmel,clk-output-range = <0 66000000>;
144 + atmel,clk-output-range = <0 83000000>;
145 };
146
147 can1_clk: can1_clk {
148 #clock-cells = <0>;
149 reg = <41>;
150 - atmel,clk-output-range = <0 66000000>;
151 + atmel,clk-output-range = <0 83000000>;
152 };
153 };
154 };
155 diff --git a/arch/arm/boot/dts/sama5d3_tcb1.dtsi b/arch/arm/boot/dts/sama5d3_tcb1.dtsi
156 index 801f9745e82f..b80dbc45a3c2 100644
157 --- a/arch/arm/boot/dts/sama5d3_tcb1.dtsi
158 +++ b/arch/arm/boot/dts/sama5d3_tcb1.dtsi
159 @@ -23,6 +23,7 @@
160 tcb1_clk: tcb1_clk {
161 #clock-cells = <0>;
162 reg = <27>;
163 + atmel,clk-output-range = <0 166000000>;
164 };
165 };
166 };
167 diff --git a/arch/arm/boot/dts/sama5d3_uart.dtsi b/arch/arm/boot/dts/sama5d3_uart.dtsi
168 index 2511d748867b..71818c7bfb67 100644
169 --- a/arch/arm/boot/dts/sama5d3_uart.dtsi
170 +++ b/arch/arm/boot/dts/sama5d3_uart.dtsi
171 @@ -42,13 +42,13 @@
172 uart0_clk: uart0_clk {
173 #clock-cells = <0>;
174 reg = <16>;
175 - atmel,clk-output-range = <0 66000000>;
176 + atmel,clk-output-range = <0 83000000>;
177 };
178
179 uart1_clk: uart1_clk {
180 #clock-cells = <0>;
181 reg = <17>;
182 - atmel,clk-output-range = <0 66000000>;
183 + atmel,clk-output-range = <0 83000000>;
184 };
185 };
186 };
187 diff --git a/arch/arm/mach-tegra/sleep-tegra30.S b/arch/arm/mach-tegra/sleep-tegra30.S
188 index 16e5ff03383c..91b3f06e5425 100644
189 --- a/arch/arm/mach-tegra/sleep-tegra30.S
190 +++ b/arch/arm/mach-tegra/sleep-tegra30.S
191 @@ -382,6 +382,14 @@ _pll_m_c_x_done:
192 pll_locked r1, r0, CLK_RESET_PLLC_BASE
193 pll_locked r1, r0, CLK_RESET_PLLX_BASE
194
195 + tegra_get_soc_id TEGRA_APB_MISC_BASE, r1
196 + cmp r1, #TEGRA30
197 + beq 1f
198 + ldr r1, [r0, #CLK_RESET_PLLP_BASE]
199 + bic r1, r1, #(1<<31) @ disable PllP bypass
200 + str r1, [r0, #CLK_RESET_PLLP_BASE]
201 +1:
202 +
203 mov32 r7, TEGRA_TMRUS_BASE
204 ldr r1, [r7]
205 add r1, r1, #LOCK_DELAY
206 @@ -641,7 +649,10 @@ tegra30_switch_cpu_to_clk32k:
207 str r0, [r4, #PMC_PLLP_WB0_OVERRIDE]
208
209 /* disable PLLP, PLLA, PLLC and PLLX */
210 + tegra_get_soc_id TEGRA_APB_MISC_BASE, r1
211 + cmp r1, #TEGRA30
212 ldr r0, [r5, #CLK_RESET_PLLP_BASE]
213 + orrne r0, r0, #(1 << 31) @ enable PllP bypass on fast cluster
214 bic r0, r0, #(1 << 30)
215 str r0, [r5, #CLK_RESET_PLLP_BASE]
216 ldr r0, [r5, #CLK_RESET_PLLA_BASE]
217 diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
218 index fa8f2aa88189..f529d3d9d88d 100644
219 --- a/arch/powerpc/Kconfig
220 +++ b/arch/powerpc/Kconfig
221 @@ -85,6 +85,7 @@ config PPC
222 select BINFMT_ELF
223 select ARCH_HAS_ELF_RANDOMIZE
224 select OF
225 + select OF_DMA_DEFAULT_COHERENT if !NOT_COHERENT_CACHE
226 select OF_EARLY_FLATTREE
227 select OF_RESERVED_MEM
228 select HAVE_FTRACE_MCOUNT_RECORD
229 diff --git a/arch/powerpc/boot/4xx.c b/arch/powerpc/boot/4xx.c
230 index 9d3bd4c45a24..1c4354f922fd 100644
231 --- a/arch/powerpc/boot/4xx.c
232 +++ b/arch/powerpc/boot/4xx.c
233 @@ -232,7 +232,7 @@ void ibm4xx_denali_fixup_memsize(void)
234 dpath = 8; /* 64 bits */
235
236 /* get address pins (rows) */
237 - val = SDRAM0_READ(DDR0_42);
238 + val = SDRAM0_READ(DDR0_42);
239
240 row = DDR_GET_VAL(val, DDR_APIN, DDR_APIN_SHIFT);
241 if (row > max_row)
242 diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
243 index e840f943cd2c..5cf1392dff96 100644
244 --- a/arch/powerpc/kvm/book3s_hv.c
245 +++ b/arch/powerpc/kvm/book3s_hv.c
246 @@ -1766,7 +1766,7 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
247 mutex_unlock(&kvm->lock);
248
249 if (!vcore)
250 - goto free_vcpu;
251 + goto uninit_vcpu;
252
253 spin_lock(&vcore->lock);
254 ++vcore->num_threads;
255 @@ -1782,6 +1782,8 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
256
257 return vcpu;
258
259 +uninit_vcpu:
260 + kvm_vcpu_uninit(vcpu);
261 free_vcpu:
262 kmem_cache_free(kvm_vcpu_cache, vcpu);
263 out:
264 diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
265 index e0d88d0890aa..8172021bcee6 100644
266 --- a/arch/powerpc/kvm/book3s_pr.c
267 +++ b/arch/powerpc/kvm/book3s_pr.c
268 @@ -1482,10 +1482,12 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_pr(struct kvm *kvm,
269
270 err = kvmppc_mmu_init(vcpu);
271 if (err < 0)
272 - goto uninit_vcpu;
273 + goto free_shared_page;
274
275 return vcpu;
276
277 +free_shared_page:
278 + free_page((unsigned long)vcpu->arch.shared);
279 uninit_vcpu:
280 kvm_vcpu_uninit(vcpu);
281 free_shadow_vcpu:
282 diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
283 index 6c12b02f4a61..eee45b9220e0 100644
284 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
285 +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
286 @@ -398,8 +398,10 @@ static bool lmb_is_removable(struct of_drconf_cell *lmb)
287
288 for (i = 0; i < scns_per_block; i++) {
289 pfn = PFN_DOWN(phys_addr);
290 - if (!pfn_present(pfn))
291 + if (!pfn_present(pfn)) {
292 + phys_addr += MIN_MEMORY_BLOCK_SIZE;
293 continue;
294 + }
295
296 rc &= is_mem_section_removable(pfn, PAGES_PER_SECTION);
297 phys_addr += MIN_MEMORY_BLOCK_SIZE;
298 diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
299 index 0024e451bb36..c0f094c96cd6 100644
300 --- a/arch/powerpc/platforms/pseries/iommu.c
301 +++ b/arch/powerpc/platforms/pseries/iommu.c
302 @@ -167,10 +167,10 @@ static unsigned long tce_get_pseries(struct iommu_table *tbl, long index)
303 return be64_to_cpu(*tcep);
304 }
305
306 -static void tce_free_pSeriesLP(struct iommu_table*, long, long);
307 +static void tce_free_pSeriesLP(unsigned long liobn, long, long);
308 static void tce_freemulti_pSeriesLP(struct iommu_table*, long, long);
309
310 -static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum,
311 +static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift,
312 long npages, unsigned long uaddr,
313 enum dma_data_direction direction,
314 unsigned long attrs)
315 @@ -181,25 +181,25 @@ static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum,
316 int ret = 0;
317 long tcenum_start = tcenum, npages_start = npages;
318
319 - rpn = __pa(uaddr) >> TCE_SHIFT;
320 + rpn = __pa(uaddr) >> tceshift;
321 proto_tce = TCE_PCI_READ;
322 if (direction != DMA_TO_DEVICE)
323 proto_tce |= TCE_PCI_WRITE;
324
325 while (npages--) {
326 - tce = proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT;
327 - rc = plpar_tce_put((u64)tbl->it_index, (u64)tcenum << 12, tce);
328 + tce = proto_tce | (rpn & TCE_RPN_MASK) << tceshift;
329 + rc = plpar_tce_put((u64)liobn, (u64)tcenum << tceshift, tce);
330
331 if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
332 ret = (int)rc;
333 - tce_free_pSeriesLP(tbl, tcenum_start,
334 + tce_free_pSeriesLP(liobn, tcenum_start,
335 (npages_start - (npages + 1)));
336 break;
337 }
338
339 if (rc && printk_ratelimit()) {
340 printk("tce_build_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc);
341 - printk("\tindex = 0x%llx\n", (u64)tbl->it_index);
342 + printk("\tindex = 0x%llx\n", (u64)liobn);
343 printk("\ttcenum = 0x%llx\n", (u64)tcenum);
344 printk("\ttce val = 0x%llx\n", tce );
345 dump_stack();
346 @@ -228,7 +228,8 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
347 unsigned long flags;
348
349 if ((npages == 1) || !firmware_has_feature(FW_FEATURE_MULTITCE)) {
350 - return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
351 + return tce_build_pSeriesLP(tbl->it_index, tcenum,
352 + tbl->it_page_shift, npages, uaddr,
353 direction, attrs);
354 }
355
356 @@ -244,8 +245,9 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
357 /* If allocation fails, fall back to the loop implementation */
358 if (!tcep) {
359 local_irq_restore(flags);
360 - return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
361 - direction, attrs);
362 + return tce_build_pSeriesLP(tbl->it_index, tcenum,
363 + tbl->it_page_shift,
364 + npages, uaddr, direction, attrs);
365 }
366 __this_cpu_write(tce_page, tcep);
367 }
368 @@ -296,16 +298,16 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
369 return ret;
370 }
371
372 -static void tce_free_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages)
373 +static void tce_free_pSeriesLP(unsigned long liobn, long tcenum, long npages)
374 {
375 u64 rc;
376
377 while (npages--) {
378 - rc = plpar_tce_put((u64)tbl->it_index, (u64)tcenum << 12, 0);
379 + rc = plpar_tce_put((u64)liobn, (u64)tcenum << 12, 0);
380
381 if (rc && printk_ratelimit()) {
382 printk("tce_free_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc);
383 - printk("\tindex = 0x%llx\n", (u64)tbl->it_index);
384 + printk("\tindex = 0x%llx\n", (u64)liobn);
385 printk("\ttcenum = 0x%llx\n", (u64)tcenum);
386 dump_stack();
387 }
388 @@ -320,7 +322,7 @@ static void tce_freemulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long n
389 u64 rc;
390
391 if (!firmware_has_feature(FW_FEATURE_MULTITCE))
392 - return tce_free_pSeriesLP(tbl, tcenum, npages);
393 + return tce_free_pSeriesLP(tbl->it_index, tcenum, npages);
394
395 rc = plpar_tce_stuff((u64)tbl->it_index, (u64)tcenum << 12, 0, npages);
396
397 @@ -435,6 +437,19 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn,
398 u64 rc = 0;
399 long l, limit;
400
401 + if (!firmware_has_feature(FW_FEATURE_MULTITCE)) {
402 + unsigned long tceshift = be32_to_cpu(maprange->tce_shift);
403 + unsigned long dmastart = (start_pfn << PAGE_SHIFT) +
404 + be64_to_cpu(maprange->dma_base);
405 + unsigned long tcenum = dmastart >> tceshift;
406 + unsigned long npages = num_pfn << PAGE_SHIFT >> tceshift;
407 + void *uaddr = __va(start_pfn << PAGE_SHIFT);
408 +
409 + return tce_build_pSeriesLP(be32_to_cpu(maprange->liobn),
410 + tcenum, tceshift, npages, (unsigned long) uaddr,
411 + DMA_BIDIRECTIONAL, 0);
412 + }
413 +
414 local_irq_disable(); /* to protect tcep and the page behind it */
415 tcep = __this_cpu_read(tce_page);
416
417 diff --git a/arch/sparc/include/uapi/asm/ipcbuf.h b/arch/sparc/include/uapi/asm/ipcbuf.h
418 index 66013b4fe10d..58da9c4addb2 100644
419 --- a/arch/sparc/include/uapi/asm/ipcbuf.h
420 +++ b/arch/sparc/include/uapi/asm/ipcbuf.h
421 @@ -14,19 +14,19 @@
422
423 struct ipc64_perm
424 {
425 - __kernel_key_t key;
426 - __kernel_uid_t uid;
427 - __kernel_gid_t gid;
428 - __kernel_uid_t cuid;
429 - __kernel_gid_t cgid;
430 + __kernel_key_t key;
431 + __kernel_uid32_t uid;
432 + __kernel_gid32_t gid;
433 + __kernel_uid32_t cuid;
434 + __kernel_gid32_t cgid;
435 #ifndef __arch64__
436 - unsigned short __pad0;
437 + unsigned short __pad0;
438 #endif
439 - __kernel_mode_t mode;
440 - unsigned short __pad1;
441 - unsigned short seq;
442 - unsigned long long __unused1;
443 - unsigned long long __unused2;
444 + __kernel_mode_t mode;
445 + unsigned short __pad1;
446 + unsigned short seq;
447 + unsigned long long __unused1;
448 + unsigned long long __unused2;
449 };
450
451 #endif /* __SPARC_IPCBUF_H */
452 diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c
453 index 3e20d322bc98..032509adf9de 100644
454 --- a/arch/x86/kernel/cpu/tsx.c
455 +++ b/arch/x86/kernel/cpu/tsx.c
456 @@ -115,11 +115,12 @@ void __init tsx_init(void)
457 tsx_disable();
458
459 /*
460 - * tsx_disable() will change the state of the
461 - * RTM CPUID bit. Clear it here since it is now
462 - * expected to be not set.
463 + * tsx_disable() will change the state of the RTM and HLE CPUID
464 + * bits. Clear them here since they are now expected to be not
465 + * set.
466 */
467 setup_clear_cpu_cap(X86_FEATURE_RTM);
468 + setup_clear_cpu_cap(X86_FEATURE_HLE);
469 } else if (tsx_ctrl_state == TSX_CTRL_ENABLE) {
470
471 /*
472 @@ -131,10 +132,10 @@ void __init tsx_init(void)
473 tsx_enable();
474
475 /*
476 - * tsx_enable() will change the state of the
477 - * RTM CPUID bit. Force it here since it is now
478 - * expected to be set.
479 + * tsx_enable() will change the state of the RTM and HLE CPUID
480 + * bits. Force them here since they are now expected to be set.
481 */
482 setup_force_cpu_cap(X86_FEATURE_RTM);
483 + setup_force_cpu_cap(X86_FEATURE_HLE);
484 }
485 }
486 diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
487 index 660c35f854f8..c456a9dbade8 100644
488 --- a/arch/x86/kvm/emulate.c
489 +++ b/arch/x86/kvm/emulate.c
490 @@ -21,6 +21,7 @@
491 */
492
493 #include <linux/kvm_host.h>
494 +#include <linux/nospec.h>
495 #include "kvm_cache_regs.h"
496 #include <asm/kvm_emulate.h>
497 #include <linux/stringify.h>
498 @@ -5053,16 +5054,28 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
499 ctxt->ad_bytes = def_ad_bytes ^ 6;
500 break;
501 case 0x26: /* ES override */
502 + has_seg_override = true;
503 + ctxt->seg_override = VCPU_SREG_ES;
504 + break;
505 case 0x2e: /* CS override */
506 + has_seg_override = true;
507 + ctxt->seg_override = VCPU_SREG_CS;
508 + break;
509 case 0x36: /* SS override */
510 + has_seg_override = true;
511 + ctxt->seg_override = VCPU_SREG_SS;
512 + break;
513 case 0x3e: /* DS override */
514 has_seg_override = true;
515 - ctxt->seg_override = (ctxt->b >> 3) & 3;
516 + ctxt->seg_override = VCPU_SREG_DS;
517 break;
518 case 0x64: /* FS override */
519 + has_seg_override = true;
520 + ctxt->seg_override = VCPU_SREG_FS;
521 + break;
522 case 0x65: /* GS override */
523 has_seg_override = true;
524 - ctxt->seg_override = ctxt->b & 7;
525 + ctxt->seg_override = VCPU_SREG_GS;
526 break;
527 case 0x40 ... 0x4f: /* REX */
528 if (mode != X86EMUL_MODE_PROT64)
529 @@ -5146,10 +5159,15 @@ done_prefixes:
530 }
531 break;
532 case Escape:
533 - if (ctxt->modrm > 0xbf)
534 - opcode = opcode.u.esc->high[ctxt->modrm - 0xc0];
535 - else
536 + if (ctxt->modrm > 0xbf) {
537 + size_t size = ARRAY_SIZE(opcode.u.esc->high);
538 + u32 index = array_index_nospec(
539 + ctxt->modrm - 0xc0, size);
540 +
541 + opcode = opcode.u.esc->high[index];
542 + } else {
543 opcode = opcode.u.esc->op[(ctxt->modrm >> 3) & 7];
544 + }
545 break;
546 case InstrDual:
547 if ((ctxt->modrm >> 6) == 3)
548 diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
549 index 42b1c83741c8..5e837c96e93f 100644
550 --- a/arch/x86/kvm/hyperv.c
551 +++ b/arch/x86/kvm/hyperv.c
552 @@ -28,6 +28,7 @@
553
554 #include <linux/kvm_host.h>
555 #include <linux/highmem.h>
556 +#include <linux/nospec.h>
557 #include <asm/apicdef.h>
558 #include <trace/events/kvm.h>
559
560 @@ -719,11 +720,12 @@ static int kvm_hv_msr_get_crash_data(struct kvm_vcpu *vcpu,
561 u32 index, u64 *pdata)
562 {
563 struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
564 + size_t size = ARRAY_SIZE(hv->hv_crash_param);
565
566 - if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param)))
567 + if (WARN_ON_ONCE(index >= size))
568 return -EINVAL;
569
570 - *pdata = hv->hv_crash_param[index];
571 + *pdata = hv->hv_crash_param[array_index_nospec(index, size)];
572 return 0;
573 }
574
575 @@ -762,11 +764,12 @@ static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu,
576 u32 index, u64 data)
577 {
578 struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
579 + size_t size = ARRAY_SIZE(hv->hv_crash_param);
580
581 - if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param)))
582 + if (WARN_ON_ONCE(index >= size))
583 return -EINVAL;
584
585 - hv->hv_crash_param[index] = data;
586 + hv->hv_crash_param[array_index_nospec(index, size)] = data;
587 return 0;
588 }
589
590 diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
591 index 5f810bb80802..aa34b16e62c2 100644
592 --- a/arch/x86/kvm/ioapic.c
593 +++ b/arch/x86/kvm/ioapic.c
594 @@ -36,6 +36,7 @@
595 #include <linux/io.h>
596 #include <linux/slab.h>
597 #include <linux/export.h>
598 +#include <linux/nospec.h>
599 #include <asm/processor.h>
600 #include <asm/page.h>
601 #include <asm/current.h>
602 @@ -73,13 +74,14 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
603 default:
604 {
605 u32 redir_index = (ioapic->ioregsel - 0x10) >> 1;
606 - u64 redir_content;
607 + u64 redir_content = ~0ULL;
608
609 - if (redir_index < IOAPIC_NUM_PINS)
610 - redir_content =
611 - ioapic->redirtbl[redir_index].bits;
612 - else
613 - redir_content = ~0ULL;
614 + if (redir_index < IOAPIC_NUM_PINS) {
615 + u32 index = array_index_nospec(
616 + redir_index, IOAPIC_NUM_PINS);
617 +
618 + redir_content = ioapic->redirtbl[index].bits;
619 + }
620
621 result = (ioapic->ioregsel & 0x1) ?
622 (redir_content >> 32) & 0xffffffff :
623 @@ -299,6 +301,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
624 ioapic_debug("change redir index %x val %x\n", index, val);
625 if (index >= IOAPIC_NUM_PINS)
626 return;
627 + index = array_index_nospec(index, IOAPIC_NUM_PINS);
628 e = &ioapic->redirtbl[index];
629 mask_before = e->fields.mask;
630 /* Preserve read-only fields */
631 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
632 index cf32533225bb..caa17f8d4221 100644
633 --- a/arch/x86/kvm/lapic.c
634 +++ b/arch/x86/kvm/lapic.c
635 @@ -28,6 +28,7 @@
636 #include <linux/export.h>
637 #include <linux/math64.h>
638 #include <linux/slab.h>
639 +#include <linux/nospec.h>
640 #include <asm/processor.h>
641 #include <asm/msr.h>
642 #include <asm/page.h>
643 @@ -1587,15 +1588,20 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
644 case APIC_LVTTHMR:
645 case APIC_LVTPC:
646 case APIC_LVT1:
647 - case APIC_LVTERR:
648 + case APIC_LVTERR: {
649 /* TODO: Check vector */
650 + size_t size;
651 + u32 index;
652 +
653 if (!kvm_apic_sw_enabled(apic))
654 val |= APIC_LVT_MASKED;
655 -
656 - val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4];
657 + size = ARRAY_SIZE(apic_lvt_mask);
658 + index = array_index_nospec(
659 + (reg - APIC_LVTT) >> 4, size);
660 + val &= apic_lvt_mask[index];
661 kvm_lapic_set_reg(apic, reg, val);
662 -
663 break;
664 + }
665
666 case APIC_LVTT:
667 if (!kvm_apic_sw_enabled(apic))
668 diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
669 index 0149ac59c273..3e3016411020 100644
670 --- a/arch/x86/kvm/mtrr.c
671 +++ b/arch/x86/kvm/mtrr.c
672 @@ -17,6 +17,7 @@
673 */
674
675 #include <linux/kvm_host.h>
676 +#include <linux/nospec.h>
677 #include <asm/mtrr.h>
678
679 #include "cpuid.h"
680 @@ -202,11 +203,15 @@ static bool fixed_msr_to_seg_unit(u32 msr, int *seg, int *unit)
681 break;
682 case MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000:
683 *seg = 1;
684 - *unit = msr - MSR_MTRRfix16K_80000;
685 + *unit = array_index_nospec(
686 + msr - MSR_MTRRfix16K_80000,
687 + MSR_MTRRfix16K_A0000 - MSR_MTRRfix16K_80000 + 1);
688 break;
689 case MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000:
690 *seg = 2;
691 - *unit = msr - MSR_MTRRfix4K_C0000;
692 + *unit = array_index_nospec(
693 + msr - MSR_MTRRfix4K_C0000,
694 + MSR_MTRRfix4K_F8000 - MSR_MTRRfix4K_C0000 + 1);
695 break;
696 default:
697 return false;
698 diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
699 index f96e1f962587..fbf3d25af765 100644
700 --- a/arch/x86/kvm/pmu.h
701 +++ b/arch/x86/kvm/pmu.h
702 @@ -1,6 +1,8 @@
703 #ifndef __KVM_X86_PMU_H
704 #define __KVM_X86_PMU_H
705
706 +#include <linux/nospec.h>
707 +
708 #define vcpu_to_pmu(vcpu) (&(vcpu)->arch.pmu)
709 #define pmu_to_vcpu(pmu) (container_of((pmu), struct kvm_vcpu, arch.pmu))
710 #define pmc_to_pmu(pmc) (&(pmc)->vcpu->arch.pmu)
711 @@ -80,8 +82,12 @@ static inline bool pmc_is_enabled(struct kvm_pmc *pmc)
712 static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr,
713 u32 base)
714 {
715 - if (msr >= base && msr < base + pmu->nr_arch_gp_counters)
716 - return &pmu->gp_counters[msr - base];
717 + if (msr >= base && msr < base + pmu->nr_arch_gp_counters) {
718 + u32 index = array_index_nospec(msr - base,
719 + pmu->nr_arch_gp_counters);
720 +
721 + return &pmu->gp_counters[index];
722 + }
723
724 return NULL;
725 }
726 @@ -91,8 +97,12 @@ static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr)
727 {
728 int base = MSR_CORE_PERF_FIXED_CTR0;
729
730 - if (msr >= base && msr < base + pmu->nr_arch_fixed_counters)
731 - return &pmu->fixed_counters[msr - base];
732 + if (msr >= base && msr < base + pmu->nr_arch_fixed_counters) {
733 + u32 index = array_index_nospec(msr - base,
734 + pmu->nr_arch_fixed_counters);
735 +
736 + return &pmu->fixed_counters[index];
737 + }
738
739 return NULL;
740 }
741 diff --git a/arch/x86/kvm/pmu_intel.c b/arch/x86/kvm/pmu_intel.c
742 index 2729131fe9bf..84ae4dd261ca 100644
743 --- a/arch/x86/kvm/pmu_intel.c
744 +++ b/arch/x86/kvm/pmu_intel.c
745 @@ -87,10 +87,14 @@ static unsigned intel_find_arch_event(struct kvm_pmu *pmu,
746
747 static unsigned intel_find_fixed_event(int idx)
748 {
749 - if (idx >= ARRAY_SIZE(fixed_pmc_events))
750 + u32 event;
751 + size_t size = ARRAY_SIZE(fixed_pmc_events);
752 +
753 + if (idx >= size)
754 return PERF_COUNT_HW_MAX;
755
756 - return intel_arch_events[fixed_pmc_events[idx]].event_type;
757 + event = fixed_pmc_events[array_index_nospec(idx, size)];
758 + return intel_arch_events[event].event_type;
759 }
760
761 /* check if a PMC is enabled by comparing it with globl_ctrl bits. */
762 @@ -131,15 +135,19 @@ static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu,
763 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
764 bool fixed = idx & (1u << 30);
765 struct kvm_pmc *counters;
766 + unsigned int num_counters;
767
768 idx &= ~(3u << 30);
769 - if (!fixed && idx >= pmu->nr_arch_gp_counters)
770 - return NULL;
771 - if (fixed && idx >= pmu->nr_arch_fixed_counters)
772 + if (fixed) {
773 + counters = pmu->fixed_counters;
774 + num_counters = pmu->nr_arch_fixed_counters;
775 + } else {
776 + counters = pmu->gp_counters;
777 + num_counters = pmu->nr_arch_gp_counters;
778 + }
779 + if (idx >= num_counters)
780 return NULL;
781 - counters = fixed ? pmu->fixed_counters : pmu->gp_counters;
782 -
783 - return &counters[idx];
784 + return &counters[array_index_nospec(idx, num_counters)];
785 }
786
787 static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
788 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
789 index f76caa03f4f8..67cdb08a736f 100644
790 --- a/arch/x86/kvm/vmx.c
791 +++ b/arch/x86/kvm/vmx.c
792 @@ -7653,8 +7653,10 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
793 /* _system ok, as nested_vmx_check_permission verified cpl=0 */
794 if (kvm_write_guest_virt_system(vcpu, gva, &field_value,
795 (is_long_mode(vcpu) ? 8 : 4),
796 - &e))
797 + &e)) {
798 kvm_inject_page_fault(vcpu, &e);
799 + return 1;
800 + }
801 }
802
803 nested_vmx_succeed(vcpu);
804 diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
805 new file mode 100644
806 index 000000000000..3791ce8d269e
807 --- /dev/null
808 +++ b/arch/x86/kvm/vmx/vmx.c
809 @@ -0,0 +1,8033 @@
810 +// SPDX-License-Identifier: GPL-2.0-only
811 +/*
812 + * Kernel-based Virtual Machine driver for Linux
813 + *
814 + * This module enables machines with Intel VT-x extensions to run virtual
815 + * machines without emulation or binary translation.
816 + *
817 + * Copyright (C) 2006 Qumranet, Inc.
818 + * Copyright 2010 Red Hat, Inc. and/or its affiliates.
819 + *
820 + * Authors:
821 + * Avi Kivity <avi@qumranet.com>
822 + * Yaniv Kamay <yaniv@qumranet.com>
823 + */
824 +
825 +#include <linux/frame.h>
826 +#include <linux/highmem.h>
827 +#include <linux/hrtimer.h>
828 +#include <linux/kernel.h>
829 +#include <linux/kvm_host.h>
830 +#include <linux/module.h>
831 +#include <linux/moduleparam.h>
832 +#include <linux/mod_devicetable.h>
833 +#include <linux/mm.h>
834 +#include <linux/sched.h>
835 +#include <linux/sched/smt.h>
836 +#include <linux/slab.h>
837 +#include <linux/tboot.h>
838 +#include <linux/trace_events.h>
839 +
840 +#include <asm/apic.h>
841 +#include <asm/asm.h>
842 +#include <asm/cpu.h>
843 +#include <asm/debugreg.h>
844 +#include <asm/desc.h>
845 +#include <asm/fpu/internal.h>
846 +#include <asm/io.h>
847 +#include <asm/irq_remapping.h>
848 +#include <asm/kexec.h>
849 +#include <asm/perf_event.h>
850 +#include <asm/mce.h>
851 +#include <asm/mmu_context.h>
852 +#include <asm/mshyperv.h>
853 +#include <asm/spec-ctrl.h>
854 +#include <asm/virtext.h>
855 +#include <asm/vmx.h>
856 +
857 +#include "capabilities.h"
858 +#include "cpuid.h"
859 +#include "evmcs.h"
860 +#include "irq.h"
861 +#include "kvm_cache_regs.h"
862 +#include "lapic.h"
863 +#include "mmu.h"
864 +#include "nested.h"
865 +#include "ops.h"
866 +#include "pmu.h"
867 +#include "trace.h"
868 +#include "vmcs.h"
869 +#include "vmcs12.h"
870 +#include "vmx.h"
871 +#include "x86.h"
872 +
873 +MODULE_AUTHOR("Qumranet");
874 +MODULE_LICENSE("GPL");
875 +
876 +static const struct x86_cpu_id vmx_cpu_id[] = {
877 + X86_FEATURE_MATCH(X86_FEATURE_VMX),
878 + {}
879 +};
880 +MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
881 +
882 +bool __read_mostly enable_vpid = 1;
883 +module_param_named(vpid, enable_vpid, bool, 0444);
884 +
885 +static bool __read_mostly enable_vnmi = 1;
886 +module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);
887 +
888 +bool __read_mostly flexpriority_enabled = 1;
889 +module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
890 +
891 +bool __read_mostly enable_ept = 1;
892 +module_param_named(ept, enable_ept, bool, S_IRUGO);
893 +
894 +bool __read_mostly enable_unrestricted_guest = 1;
895 +module_param_named(unrestricted_guest,
896 + enable_unrestricted_guest, bool, S_IRUGO);
897 +
898 +bool __read_mostly enable_ept_ad_bits = 1;
899 +module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
900 +
901 +static bool __read_mostly emulate_invalid_guest_state = true;
902 +module_param(emulate_invalid_guest_state, bool, S_IRUGO);
903 +
904 +static bool __read_mostly fasteoi = 1;
905 +module_param(fasteoi, bool, S_IRUGO);
906 +
907 +static bool __read_mostly enable_apicv = 1;
908 +module_param(enable_apicv, bool, S_IRUGO);
909 +
910 +/*
911 + * If nested=1, nested virtualization is supported, i.e., guests may use
912 + * VMX and be a hypervisor for its own guests. If nested=0, guests may not
913 + * use VMX instructions.
914 + */
915 +static bool __read_mostly nested = 1;
916 +module_param(nested, bool, S_IRUGO);
917 +
918 +bool __read_mostly enable_pml = 1;
919 +module_param_named(pml, enable_pml, bool, S_IRUGO);
920 +
921 +static bool __read_mostly dump_invalid_vmcs = 0;
922 +module_param(dump_invalid_vmcs, bool, 0644);
923 +
924 +#define MSR_BITMAP_MODE_X2APIC 1
925 +#define MSR_BITMAP_MODE_X2APIC_APICV 2
926 +
927 +#define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL
928 +
929 +/* Guest_tsc -> host_tsc conversion requires 64-bit division. */
930 +static int __read_mostly cpu_preemption_timer_multi;
931 +static bool __read_mostly enable_preemption_timer = 1;
932 +#ifdef CONFIG_X86_64
933 +module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
934 +#endif
935 +
936 +#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
937 +#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
938 +#define KVM_VM_CR0_ALWAYS_ON \
939 + (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | \
940 + X86_CR0_WP | X86_CR0_PG | X86_CR0_PE)
941 +#define KVM_CR4_GUEST_OWNED_BITS \
942 + (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
943 + | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD)
944 +
945 +#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
946 +#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
947 +#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
948 +
949 +#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
950 +
951 +#define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \
952 + RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \
953 + RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
954 + RTIT_STATUS_BYTECNT))
955 +
956 +#define MSR_IA32_RTIT_OUTPUT_BASE_MASK \
957 + (~((1UL << cpuid_query_maxphyaddr(vcpu)) - 1) | 0x7f)
958 +
959 +/*
960 + * These 2 parameters are used to config the controls for Pause-Loop Exiting:
961 + * ple_gap: upper bound on the amount of time between two successive
962 + * executions of PAUSE in a loop. Also indicate if ple enabled.
963 + * According to test, this time is usually smaller than 128 cycles.
964 + * ple_window: upper bound on the amount of time a guest is allowed to execute
965 + * in a PAUSE loop. Tests indicate that most spinlocks are held for
966 + * less than 2^12 cycles
967 + * Time is measured based on a counter that runs at the same rate as the TSC,
968 + * refer SDM volume 3b section 21.6.13 & 22.1.3.
969 + */
970 +static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
971 +module_param(ple_gap, uint, 0444);
972 +
973 +static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
974 +module_param(ple_window, uint, 0444);
975 +
976 +/* Default doubles per-vcpu window every exit. */
977 +static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
978 +module_param(ple_window_grow, uint, 0444);
979 +
980 +/* Default resets per-vcpu window every exit to ple_window. */
981 +static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
982 +module_param(ple_window_shrink, uint, 0444);
983 +
984 +/* Default is to compute the maximum so we can never overflow. */
985 +static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
986 +module_param(ple_window_max, uint, 0444);
987 +
988 +/* Default is SYSTEM mode, 1 for host-guest mode */
989 +int __read_mostly pt_mode = PT_MODE_SYSTEM;
990 +module_param(pt_mode, int, S_IRUGO);
991 +
992 +static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
993 +static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
994 +static DEFINE_MUTEX(vmx_l1d_flush_mutex);
995 +
996 +/* Storage for pre module init parameter parsing */
997 +static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;
998 +
999 +static const struct {
1000 + const char *option;
1001 + bool for_parse;
1002 +} vmentry_l1d_param[] = {
1003 + [VMENTER_L1D_FLUSH_AUTO] = {"auto", true},
1004 + [VMENTER_L1D_FLUSH_NEVER] = {"never", true},
1005 + [VMENTER_L1D_FLUSH_COND] = {"cond", true},
1006 + [VMENTER_L1D_FLUSH_ALWAYS] = {"always", true},
1007 + [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
1008 + [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
1009 +};
1010 +
1011 +#define L1D_CACHE_ORDER 4
1012 +static void *vmx_l1d_flush_pages;
1013 +
1014 +static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
1015 +{
1016 + struct page *page;
1017 + unsigned int i;
1018 +
1019 + if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
1020 + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
1021 + return 0;
1022 + }
1023 +
1024 + if (!enable_ept) {
1025 + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
1026 + return 0;
1027 + }
1028 +
1029 + if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
1030 + u64 msr;
1031 +
1032 + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
1033 + if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
1034 + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
1035 + return 0;
1036 + }
1037 + }
1038 +
1039 + /* If set to auto use the default l1tf mitigation method */
1040 + if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
1041 + switch (l1tf_mitigation) {
1042 + case L1TF_MITIGATION_OFF:
1043 + l1tf = VMENTER_L1D_FLUSH_NEVER;
1044 + break;
1045 + case L1TF_MITIGATION_FLUSH_NOWARN:
1046 + case L1TF_MITIGATION_FLUSH:
1047 + case L1TF_MITIGATION_FLUSH_NOSMT:
1048 + l1tf = VMENTER_L1D_FLUSH_COND;
1049 + break;
1050 + case L1TF_MITIGATION_FULL:
1051 + case L1TF_MITIGATION_FULL_FORCE:
1052 + l1tf = VMENTER_L1D_FLUSH_ALWAYS;
1053 + break;
1054 + }
1055 + } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
1056 + l1tf = VMENTER_L1D_FLUSH_ALWAYS;
1057 + }
1058 +
1059 + if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
1060 + !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
1061 + /*
1062 + * This allocation for vmx_l1d_flush_pages is not tied to a VM
1063 + * lifetime and so should not be charged to a memcg.
1064 + */
1065 + page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
1066 + if (!page)
1067 + return -ENOMEM;
1068 + vmx_l1d_flush_pages = page_address(page);
1069 +
1070 + /*
1071 + * Initialize each page with a different pattern in
1072 + * order to protect against KSM in the nested
1073 + * virtualization case.
1074 + */
1075 + for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
1076 + memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
1077 + PAGE_SIZE);
1078 + }
1079 + }
1080 +
1081 + l1tf_vmx_mitigation = l1tf;
1082 +
1083 + if (l1tf != VMENTER_L1D_FLUSH_NEVER)
1084 + static_branch_enable(&vmx_l1d_should_flush);
1085 + else
1086 + static_branch_disable(&vmx_l1d_should_flush);
1087 +
1088 + if (l1tf == VMENTER_L1D_FLUSH_COND)
1089 + static_branch_enable(&vmx_l1d_flush_cond);
1090 + else
1091 + static_branch_disable(&vmx_l1d_flush_cond);
1092 + return 0;
1093 +}
1094 +
1095 +static int vmentry_l1d_flush_parse(const char *s)
1096 +{
1097 + unsigned int i;
1098 +
1099 + if (s) {
1100 + for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
1101 + if (vmentry_l1d_param[i].for_parse &&
1102 + sysfs_streq(s, vmentry_l1d_param[i].option))
1103 + return i;
1104 + }
1105 + }
1106 + return -EINVAL;
1107 +}
1108 +
1109 +static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
1110 +{
1111 + int l1tf, ret;
1112 +
1113 + l1tf = vmentry_l1d_flush_parse(s);
1114 + if (l1tf < 0)
1115 + return l1tf;
1116 +
1117 + if (!boot_cpu_has(X86_BUG_L1TF))
1118 + return 0;
1119 +
1120 + /*
1121 + * Has vmx_init() run already? If not then this is the pre init
1122 + * parameter parsing. In that case just store the value and let
1123 + * vmx_init() do the proper setup after enable_ept has been
1124 + * established.
1125 + */
1126 + if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
1127 + vmentry_l1d_flush_param = l1tf;
1128 + return 0;
1129 + }
1130 +
1131 + mutex_lock(&vmx_l1d_flush_mutex);
1132 + ret = vmx_setup_l1d_flush(l1tf);
1133 + mutex_unlock(&vmx_l1d_flush_mutex);
1134 + return ret;
1135 +}
1136 +
1137 +static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
1138 +{
1139 + if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
1140 + return sprintf(s, "???\n");
1141 +
1142 + return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
1143 +}
1144 +
1145 +static const struct kernel_param_ops vmentry_l1d_flush_ops = {
1146 + .set = vmentry_l1d_flush_set,
1147 + .get = vmentry_l1d_flush_get,
1148 +};
1149 +module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
1150 +
1151 +static bool guest_state_valid(struct kvm_vcpu *vcpu);
1152 +static u32 vmx_segment_access_rights(struct kvm_segment *var);
1153 +static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
1154 + u32 msr, int type);
1155 +
1156 +void vmx_vmexit(void);
1157 +
1158 +#define vmx_insn_failed(fmt...) \
1159 +do { \
1160 + WARN_ONCE(1, fmt); \
1161 + pr_warn_ratelimited(fmt); \
1162 +} while (0)
1163 +
1164 +asmlinkage void vmread_error(unsigned long field, bool fault)
1165 +{
1166 + if (fault)
1167 + kvm_spurious_fault();
1168 + else
1169 + vmx_insn_failed("kvm: vmread failed: field=%lx\n", field);
1170 +}
1171 +
1172 +noinline void vmwrite_error(unsigned long field, unsigned long value)
1173 +{
1174 + vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%d\n",
1175 + field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
1176 +}
1177 +
1178 +noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr)
1179 +{
1180 + vmx_insn_failed("kvm: vmclear failed: %p/%llx\n", vmcs, phys_addr);
1181 +}
1182 +
1183 +noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr)
1184 +{
1185 + vmx_insn_failed("kvm: vmptrld failed: %p/%llx\n", vmcs, phys_addr);
1186 +}
1187 +
1188 +noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
1189 +{
1190 + vmx_insn_failed("kvm: invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
1191 + ext, vpid, gva);
1192 +}
1193 +
1194 +noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa)
1195 +{
1196 + vmx_insn_failed("kvm: invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n",
1197 + ext, eptp, gpa);
1198 +}
1199 +
1200 +static DEFINE_PER_CPU(struct vmcs *, vmxarea);
1201 +DEFINE_PER_CPU(struct vmcs *, current_vmcs);
1202 +/*
1203 + * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed
1204 + * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
1205 + */
1206 +static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
1207 +
1208 +/*
1209 + * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we
1210 + * can find which vCPU should be waken up.
1211 + */
1212 +static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
1213 +static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
1214 +
1215 +static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
1216 +static DEFINE_SPINLOCK(vmx_vpid_lock);
1217 +
1218 +struct vmcs_config vmcs_config;
1219 +struct vmx_capability vmx_capability;
1220 +
1221 +#define VMX_SEGMENT_FIELD(seg) \
1222 + [VCPU_SREG_##seg] = { \
1223 + .selector = GUEST_##seg##_SELECTOR, \
1224 + .base = GUEST_##seg##_BASE, \
1225 + .limit = GUEST_##seg##_LIMIT, \
1226 + .ar_bytes = GUEST_##seg##_AR_BYTES, \
1227 + }
1228 +
1229 +static const struct kvm_vmx_segment_field {
1230 + unsigned selector;
1231 + unsigned base;
1232 + unsigned limit;
1233 + unsigned ar_bytes;
1234 +} kvm_vmx_segment_fields[] = {
1235 + VMX_SEGMENT_FIELD(CS),
1236 + VMX_SEGMENT_FIELD(DS),
1237 + VMX_SEGMENT_FIELD(ES),
1238 + VMX_SEGMENT_FIELD(FS),
1239 + VMX_SEGMENT_FIELD(GS),
1240 + VMX_SEGMENT_FIELD(SS),
1241 + VMX_SEGMENT_FIELD(TR),
1242 + VMX_SEGMENT_FIELD(LDTR),
1243 +};
1244 +
1245 +u64 host_efer;
1246 +static unsigned long host_idt_base;
1247 +
1248 +/*
1249 + * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
1250 + * will emulate SYSCALL in legacy mode if the vendor string in guest
1251 + * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
1252 + * support this emulation, IA32_STAR must always be included in
1253 + * vmx_msr_index[], even in i386 builds.
1254 + */
1255 +const u32 vmx_msr_index[] = {
1256 +#ifdef CONFIG_X86_64
1257 + MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
1258 +#endif
1259 + MSR_EFER, MSR_TSC_AUX, MSR_STAR,
1260 + MSR_IA32_TSX_CTRL,
1261 +};
1262 +
1263 +#if IS_ENABLED(CONFIG_HYPERV)
1264 +static bool __read_mostly enlightened_vmcs = true;
1265 +module_param(enlightened_vmcs, bool, 0444);
1266 +
1267 +/* check_ept_pointer() should be under protection of ept_pointer_lock. */
1268 +static void check_ept_pointer_match(struct kvm *kvm)
1269 +{
1270 + struct kvm_vcpu *vcpu;
1271 + u64 tmp_eptp = INVALID_PAGE;
1272 + int i;
1273 +
1274 + kvm_for_each_vcpu(i, vcpu, kvm) {
1275 + if (!VALID_PAGE(tmp_eptp)) {
1276 + tmp_eptp = to_vmx(vcpu)->ept_pointer;
1277 + } else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) {
1278 + to_kvm_vmx(kvm)->ept_pointers_match
1279 + = EPT_POINTERS_MISMATCH;
1280 + return;
1281 + }
1282 + }
1283 +
1284 + to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH;
1285 +}
1286 +
1287 +static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush,
1288 + void *data)
1289 +{
1290 + struct kvm_tlb_range *range = data;
1291 +
1292 + return hyperv_fill_flush_guest_mapping_list(flush, range->start_gfn,
1293 + range->pages);
1294 +}
1295 +
1296 +static inline int __hv_remote_flush_tlb_with_range(struct kvm *kvm,
1297 + struct kvm_vcpu *vcpu, struct kvm_tlb_range *range)
1298 +{
1299 + u64 ept_pointer = to_vmx(vcpu)->ept_pointer;
1300 +
1301 + /*
1302 + * FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs address
1303 + * of the base of EPT PML4 table, strip off EPT configuration
1304 + * information.
1305 + */
1306 + if (range)
1307 + return hyperv_flush_guest_mapping_range(ept_pointer & PAGE_MASK,
1308 + kvm_fill_hv_flush_list_func, (void *)range);
1309 + else
1310 + return hyperv_flush_guest_mapping(ept_pointer & PAGE_MASK);
1311 +}
1312 +
1313 +static int hv_remote_flush_tlb_with_range(struct kvm *kvm,
1314 + struct kvm_tlb_range *range)
1315 +{
1316 + struct kvm_vcpu *vcpu;
1317 + int ret = 0, i;
1318 +
1319 + spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
1320 +
1321 + if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK)
1322 + check_ept_pointer_match(kvm);
1323 +
1324 + if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) {
1325 + kvm_for_each_vcpu(i, vcpu, kvm) {
1326 + /* If ept_pointer is invalid pointer, bypass flush request. */
1327 + if (VALID_PAGE(to_vmx(vcpu)->ept_pointer))
1328 + ret |= __hv_remote_flush_tlb_with_range(
1329 + kvm, vcpu, range);
1330 + }
1331 + } else {
1332 + ret = __hv_remote_flush_tlb_with_range(kvm,
1333 + kvm_get_vcpu(kvm, 0), range);
1334 + }
1335 +
1336 + spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
1337 + return ret;
1338 +}
1339 +static int hv_remote_flush_tlb(struct kvm *kvm)
1340 +{
1341 + return hv_remote_flush_tlb_with_range(kvm, NULL);
1342 +}
1343 +
1344 +static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
1345 +{
1346 + struct hv_enlightened_vmcs *evmcs;
1347 + struct hv_partition_assist_pg **p_hv_pa_pg =
1348 + &vcpu->kvm->arch.hyperv.hv_pa_pg;
1349 + /*
1350 + * Synthetic VM-Exit is not enabled in current code and so All
1351 + * evmcs in singe VM shares same assist page.
1352 + */
1353 + if (!*p_hv_pa_pg)
1354 + *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL);
1355 +
1356 + if (!*p_hv_pa_pg)
1357 + return -ENOMEM;
1358 +
1359 + evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs;
1360 +
1361 + evmcs->partition_assist_page =
1362 + __pa(*p_hv_pa_pg);
1363 + evmcs->hv_vm_id = (unsigned long)vcpu->kvm;
1364 + evmcs->hv_enlightenments_control.nested_flush_hypercall = 1;
1365 +
1366 + return 0;
1367 +}
1368 +
1369 +#endif /* IS_ENABLED(CONFIG_HYPERV) */
1370 +
1371 +/*
1372 + * Comment's format: document - errata name - stepping - processor name.
1373 + * Refer from
1374 + * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
1375 + */
1376 +static u32 vmx_preemption_cpu_tfms[] = {
1377 +/* 323344.pdf - BA86 - D0 - Xeon 7500 Series */
1378 +0x000206E6,
1379 +/* 323056.pdf - AAX65 - C2 - Xeon L3406 */
1380 +/* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
1381 +/* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
1382 +0x00020652,
1383 +/* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
1384 +0x00020655,
1385 +/* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */
1386 +/* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */
1387 +/*
1388 + * 320767.pdf - AAP86 - B1 -
1389 + * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
1390 + */
1391 +0x000106E5,
1392 +/* 321333.pdf - AAM126 - C0 - Xeon 3500 */
1393 +0x000106A0,
1394 +/* 321333.pdf - AAM126 - C1 - Xeon 3500 */
1395 +0x000106A1,
1396 +/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
1397 +0x000106A4,
1398 + /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
1399 + /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
1400 + /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
1401 +0x000106A5,
1402 + /* Xeon E3-1220 V2 */
1403 +0x000306A8,
1404 +};
1405 +
1406 +static inline bool cpu_has_broken_vmx_preemption_timer(void)
1407 +{
1408 + u32 eax = cpuid_eax(0x00000001), i;
1409 +
1410 + /* Clear the reserved bits */
1411 + eax &= ~(0x3U << 14 | 0xfU << 28);
1412 + for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
1413 + if (eax == vmx_preemption_cpu_tfms[i])
1414 + return true;
1415 +
1416 + return false;
1417 +}
1418 +
1419 +static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
1420 +{
1421 + return flexpriority_enabled && lapic_in_kernel(vcpu);
1422 +}
1423 +
1424 +static inline bool report_flexpriority(void)
1425 +{
1426 + return flexpriority_enabled;
1427 +}
1428 +
1429 +static inline int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
1430 +{
1431 + int i;
1432 +
1433 + for (i = 0; i < vmx->nmsrs; ++i)
1434 + if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
1435 + return i;
1436 + return -1;
1437 +}
1438 +
1439 +struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
1440 +{
1441 + int i;
1442 +
1443 + i = __find_msr_index(vmx, msr);
1444 + if (i >= 0)
1445 + return &vmx->guest_msrs[i];
1446 + return NULL;
1447 +}
1448 +
1449 +static int vmx_set_guest_msr(struct vcpu_vmx *vmx, struct shared_msr_entry *msr, u64 data)
1450 +{
1451 + int ret = 0;
1452 +
1453 + u64 old_msr_data = msr->data;
1454 + msr->data = data;
1455 + if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
1456 + preempt_disable();
1457 + ret = kvm_set_shared_msr(msr->index, msr->data,
1458 + msr->mask);
1459 + preempt_enable();
1460 + if (ret)
1461 + msr->data = old_msr_data;
1462 + }
1463 + return ret;
1464 +}
1465 +
1466 +void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
1467 +{
1468 + vmcs_clear(loaded_vmcs->vmcs);
1469 + if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
1470 + vmcs_clear(loaded_vmcs->shadow_vmcs);
1471 + loaded_vmcs->cpu = -1;
1472 + loaded_vmcs->launched = 0;
1473 +}
1474 +
1475 +#ifdef CONFIG_KEXEC_CORE
1476 +/*
1477 + * This bitmap is used to indicate whether the vmclear
1478 + * operation is enabled on all cpus. All disabled by
1479 + * default.
1480 + */
1481 +static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
1482 +
1483 +static inline void crash_enable_local_vmclear(int cpu)
1484 +{
1485 + cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
1486 +}
1487 +
1488 +static inline void crash_disable_local_vmclear(int cpu)
1489 +{
1490 + cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
1491 +}
1492 +
1493 +static inline int crash_local_vmclear_enabled(int cpu)
1494 +{
1495 + return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
1496 +}
1497 +
1498 +static void crash_vmclear_local_loaded_vmcss(void)
1499 +{
1500 + int cpu = raw_smp_processor_id();
1501 + struct loaded_vmcs *v;
1502 +
1503 + if (!crash_local_vmclear_enabled(cpu))
1504 + return;
1505 +
1506 + list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
1507 + loaded_vmcss_on_cpu_link)
1508 + vmcs_clear(v->vmcs);
1509 +}
1510 +#else
1511 +static inline void crash_enable_local_vmclear(int cpu) { }
1512 +static inline void crash_disable_local_vmclear(int cpu) { }
1513 +#endif /* CONFIG_KEXEC_CORE */
1514 +
1515 +static void __loaded_vmcs_clear(void *arg)
1516 +{
1517 + struct loaded_vmcs *loaded_vmcs = arg;
1518 + int cpu = raw_smp_processor_id();
1519 +
1520 + if (loaded_vmcs->cpu != cpu)
1521 + return; /* vcpu migration can race with cpu offline */
1522 + if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
1523 + per_cpu(current_vmcs, cpu) = NULL;
1524 + crash_disable_local_vmclear(cpu);
1525 + list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
1526 +
1527 + /*
1528 + * we should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link
1529 + * is before setting loaded_vmcs->vcpu to -1 which is done in
1530 + * loaded_vmcs_init. Otherwise, other cpu can see vcpu = -1 fist
1531 + * then adds the vmcs into percpu list before it is deleted.
1532 + */
1533 + smp_wmb();
1534 +
1535 + loaded_vmcs_init(loaded_vmcs);
1536 + crash_enable_local_vmclear(cpu);
1537 +}
1538 +
1539 +void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
1540 +{
1541 + int cpu = loaded_vmcs->cpu;
1542 +
1543 + if (cpu != -1)
1544 + smp_call_function_single(cpu,
1545 + __loaded_vmcs_clear, loaded_vmcs, 1);
1546 +}
1547 +
1548 +static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
1549 + unsigned field)
1550 +{
1551 + bool ret;
1552 + u32 mask = 1 << (seg * SEG_FIELD_NR + field);
1553 +
1554 + if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) {
1555 + kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS);
1556 + vmx->segment_cache.bitmask = 0;
1557 + }
1558 + ret = vmx->segment_cache.bitmask & mask;
1559 + vmx->segment_cache.bitmask |= mask;
1560 + return ret;
1561 +}
1562 +
1563 +static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
1564 +{
1565 + u16 *p = &vmx->segment_cache.seg[seg].selector;
1566 +
1567 + if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
1568 + *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
1569 + return *p;
1570 +}
1571 +
1572 +static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
1573 +{
1574 + ulong *p = &vmx->segment_cache.seg[seg].base;
1575 +
1576 + if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
1577 + *p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
1578 + return *p;
1579 +}
1580 +
1581 +static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
1582 +{
1583 + u32 *p = &vmx->segment_cache.seg[seg].limit;
1584 +
1585 + if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
1586 + *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
1587 + return *p;
1588 +}
1589 +
1590 +static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
1591 +{
1592 + u32 *p = &vmx->segment_cache.seg[seg].ar;
1593 +
1594 + if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
1595 + *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
1596 + return *p;
1597 +}
1598 +
1599 +void update_exception_bitmap(struct kvm_vcpu *vcpu)
1600 +{
1601 + u32 eb;
1602 +
1603 + eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
1604 + (1u << DB_VECTOR) | (1u << AC_VECTOR);
1605 + /*
1606 + * Guest access to VMware backdoor ports could legitimately
1607 + * trigger #GP because of TSS I/O permission bitmap.
1608 + * We intercept those #GP and allow access to them anyway
1609 + * as VMware does.
1610 + */
1611 + if (enable_vmware_backdoor)
1612 + eb |= (1u << GP_VECTOR);
1613 + if ((vcpu->guest_debug &
1614 + (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
1615 + (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
1616 + eb |= 1u << BP_VECTOR;
1617 + if (to_vmx(vcpu)->rmode.vm86_active)
1618 + eb = ~0;
1619 + if (enable_ept)
1620 + eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
1621 +
1622 + /* When we are running a nested L2 guest and L1 specified for it a
1623 + * certain exception bitmap, we must trap the same exceptions and pass
1624 + * them to L1. When running L2, we will only handle the exceptions
1625 + * specified above if L1 did not want them.
1626 + */
1627 + if (is_guest_mode(vcpu))
1628 + eb |= get_vmcs12(vcpu)->exception_bitmap;
1629 +
1630 + vmcs_write32(EXCEPTION_BITMAP, eb);
1631 +}
1632 +
1633 +/*
1634 + * Check if MSR is intercepted for currently loaded MSR bitmap.
1635 + */
1636 +static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
1637 +{
1638 + unsigned long *msr_bitmap;
1639 + int f = sizeof(unsigned long);
1640 +
1641 + if (!cpu_has_vmx_msr_bitmap())
1642 + return true;
1643 +
1644 + msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;
1645 +
1646 + if (msr <= 0x1fff) {
1647 + return !!test_bit(msr, msr_bitmap + 0x800 / f);
1648 + } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
1649 + msr &= 0x1fff;
1650 + return !!test_bit(msr, msr_bitmap + 0xc00 / f);
1651 + }
1652 +
1653 + return true;
1654 +}
1655 +
1656 +static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
1657 + unsigned long entry, unsigned long exit)
1658 +{
1659 + vm_entry_controls_clearbit(vmx, entry);
1660 + vm_exit_controls_clearbit(vmx, exit);
1661 +}
1662 +
1663 +int vmx_find_msr_index(struct vmx_msrs *m, u32 msr)
1664 +{
1665 + unsigned int i;
1666 +
1667 + for (i = 0; i < m->nr; ++i) {
1668 + if (m->val[i].index == msr)
1669 + return i;
1670 + }
1671 + return -ENOENT;
1672 +}
1673 +
1674 +static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
1675 +{
1676 + int i;
1677 + struct msr_autoload *m = &vmx->msr_autoload;
1678 +
1679 + switch (msr) {
1680 + case MSR_EFER:
1681 + if (cpu_has_load_ia32_efer()) {
1682 + clear_atomic_switch_msr_special(vmx,
1683 + VM_ENTRY_LOAD_IA32_EFER,
1684 + VM_EXIT_LOAD_IA32_EFER);
1685 + return;
1686 + }
1687 + break;
1688 + case MSR_CORE_PERF_GLOBAL_CTRL:
1689 + if (cpu_has_load_perf_global_ctrl()) {
1690 + clear_atomic_switch_msr_special(vmx,
1691 + VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
1692 + VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
1693 + return;
1694 + }
1695 + break;
1696 + }
1697 + i = vmx_find_msr_index(&m->guest, msr);
1698 + if (i < 0)
1699 + goto skip_guest;
1700 + --m->guest.nr;
1701 + m->guest.val[i] = m->guest.val[m->guest.nr];
1702 + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
1703 +
1704 +skip_guest:
1705 + i = vmx_find_msr_index(&m->host, msr);
1706 + if (i < 0)
1707 + return;
1708 +
1709 + --m->host.nr;
1710 + m->host.val[i] = m->host.val[m->host.nr];
1711 + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
1712 +}
1713 +
1714 +static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
1715 + unsigned long entry, unsigned long exit,
1716 + unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
1717 + u64 guest_val, u64 host_val)
1718 +{
1719 + vmcs_write64(guest_val_vmcs, guest_val);
1720 + if (host_val_vmcs != HOST_IA32_EFER)
1721 + vmcs_write64(host_val_vmcs, host_val);
1722 + vm_entry_controls_setbit(vmx, entry);
1723 + vm_exit_controls_setbit(vmx, exit);
1724 +}
1725 +
1726 +static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
1727 + u64 guest_val, u64 host_val, bool entry_only)
1728 +{
1729 + int i, j = 0;
1730 + struct msr_autoload *m = &vmx->msr_autoload;
1731 +
1732 + switch (msr) {
1733 + case MSR_EFER:
1734 + if (cpu_has_load_ia32_efer()) {
1735 + add_atomic_switch_msr_special(vmx,
1736 + VM_ENTRY_LOAD_IA32_EFER,
1737 + VM_EXIT_LOAD_IA32_EFER,
1738 + GUEST_IA32_EFER,
1739 + HOST_IA32_EFER,
1740 + guest_val, host_val);
1741 + return;
1742 + }
1743 + break;
1744 + case MSR_CORE_PERF_GLOBAL_CTRL:
1745 + if (cpu_has_load_perf_global_ctrl()) {
1746 + add_atomic_switch_msr_special(vmx,
1747 + VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
1748 + VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
1749 + GUEST_IA32_PERF_GLOBAL_CTRL,
1750 + HOST_IA32_PERF_GLOBAL_CTRL,
1751 + guest_val, host_val);
1752 + return;
1753 + }
1754 + break;
1755 + case MSR_IA32_PEBS_ENABLE:
1756 + /* PEBS needs a quiescent period after being disabled (to write
1757 + * a record). Disabling PEBS through VMX MSR swapping doesn't
1758 + * provide that period, so a CPU could write host's record into
1759 + * guest's memory.
1760 + */
1761 + wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
1762 + }
1763 +
1764 + i = vmx_find_msr_index(&m->guest, msr);
1765 + if (!entry_only)
1766 + j = vmx_find_msr_index(&m->host, msr);
1767 +
1768 + if ((i < 0 && m->guest.nr == NR_LOADSTORE_MSRS) ||
1769 + (j < 0 && m->host.nr == NR_LOADSTORE_MSRS)) {
1770 + printk_once(KERN_WARNING "Not enough msr switch entries. "
1771 + "Can't add msr %x\n", msr);
1772 + return;
1773 + }
1774 + if (i < 0) {
1775 + i = m->guest.nr++;
1776 + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
1777 + }
1778 + m->guest.val[i].index = msr;
1779 + m->guest.val[i].value = guest_val;
1780 +
1781 + if (entry_only)
1782 + return;
1783 +
1784 + if (j < 0) {
1785 + j = m->host.nr++;
1786 + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
1787 + }
1788 + m->host.val[j].index = msr;
1789 + m->host.val[j].value = host_val;
1790 +}
1791 +
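As a side note, clear_atomic_switch_msr() and add_atomic_switch_msr() keep the guest/host autoload arrays dense: lookup is a linear scan, addition appends and bumps the count mirrored into VM_ENTRY_MSR_LOAD_COUNT / VM_EXIT_MSR_LOAD_COUNT, and removal swaps the last entry into the freed slot. The minimal user-space model below sketches that bookkeeping only; the type and function names are invented, and NR_AUTOLOAD simply stands in for NR_LOADSTORE_MSRS.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define NR_AUTOLOAD 8  /* stand-in for NR_LOADSTORE_MSRS */

struct autoload_entry { uint32_t index; uint64_t value; };
struct autoload_list  { unsigned int nr; struct autoload_entry val[NR_AUTOLOAD]; };

/* Linear scan, like vmx_find_msr_index(). */
static int find_msr(const struct autoload_list *m, uint32_t msr)
{
	for (unsigned int i = 0; i < m->nr; i++)
		if (m->val[i].index == msr)
			return (int)i;
	return -1;
}

/* Dense-array removal: move the last entry into the freed slot, shrink nr. */
static void remove_msr(struct autoload_list *m, uint32_t msr)
{
	int i = find_msr(m, msr);

	if (i < 0)
		return;
	m->nr--;
	m->val[i] = m->val[m->nr];
}

int main(void)
{
	struct autoload_list m = {
		.nr = 3,
		.val = { { 0x10, 1 }, { 0x20, 2 }, { 0x30, 3 } },
	};

	remove_msr(&m, 0x10);
	printf("nr=%u first=%#" PRIx32 "\n", m.nr, m.val[0].index);  /* nr=2 first=0x30 */
	return 0;
}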
1792 +static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
1793 +{
1794 + u64 guest_efer = vmx->vcpu.arch.efer;
1795 + u64 ignore_bits = 0;
1796 +
1797 + /* Shadow paging assumes NX to be available. */
1798 + if (!enable_ept)
1799 + guest_efer |= EFER_NX;
1800 +
1801 + /*
1802 + * LMA and LME handled by hardware; SCE meaningless outside long mode.
1803 + */
1804 + ignore_bits |= EFER_SCE;
1805 +#ifdef CONFIG_X86_64
1806 + ignore_bits |= EFER_LMA | EFER_LME;
1807 + /* SCE is meaningful only in long mode on Intel */
1808 + if (guest_efer & EFER_LMA)
1809 + ignore_bits &= ~(u64)EFER_SCE;
1810 +#endif
1811 +
1812 + /*
1813 + * On EPT, we can't emulate NX, so we must switch EFER atomically.
1814 + * On CPUs that support "load IA32_EFER", always switch EFER
1815 + * atomically, since it's faster than switching it manually.
1816 + */
1817 + if (cpu_has_load_ia32_efer() ||
1818 + (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
1819 + if (!(guest_efer & EFER_LMA))
1820 + guest_efer &= ~EFER_LME;
1821 + if (guest_efer != host_efer)
1822 + add_atomic_switch_msr(vmx, MSR_EFER,
1823 + guest_efer, host_efer, false);
1824 + else
1825 + clear_atomic_switch_msr(vmx, MSR_EFER);
1826 + return false;
1827 + } else {
1828 + clear_atomic_switch_msr(vmx, MSR_EFER);
1829 +
1830 + guest_efer &= ~ignore_bits;
1831 + guest_efer |= host_efer & ignore_bits;
1832 +
1833 + vmx->guest_msrs[efer_offset].data = guest_efer;
1834 + vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
1835 +
1836 + return true;
1837 + }
1838 +}
1839 +
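When update_transition_efer() falls back to the shared-MSR path, the interesting step is the merge: guest bits outside ignore_bits are kept, host bits inside ignore_bits are substituted, and the mask stored alongside the value becomes ~ignore_bits, so the shared-MSR machinery only considers bits that can genuinely differ. The stand-alone sketch below shows just that arithmetic; the EFER_* constants are the architectural bit positions, and the example values are made up.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Architectural IA32_EFER bit positions. */
#define EFER_SCE (1ULL << 0)
#define EFER_LME (1ULL << 8)
#define EFER_LMA (1ULL << 10)
#define EFER_NX  (1ULL << 11)

int main(void)
{
	uint64_t host_efer   = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX;
	uint64_t guest_efer  = EFER_NX;                          /* e.g. a 32-bit guest */
	uint64_t ignore_bits = EFER_SCE | EFER_LMA | EFER_LME;   /* handled by hardware */

	/* Same merge as update_transition_efer(): keep guest bits outside
	 * ignore_bits, take host bits inside ignore_bits. */
	uint64_t merged = (guest_efer & ~ignore_bits) | (host_efer & ignore_bits);

	printf("host         = %#" PRIx64 "\n", host_efer);
	printf("guest        = %#" PRIx64 "\n", guest_efer);
	printf("merged       = %#" PRIx64 "\n", merged);
	printf("restore mask = %#" PRIx64 "  (only these bits are ever rewritten)\n",
	       ~ignore_bits);
	return 0;
}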
1840 +#ifdef CONFIG_X86_32
1841 +/*
1842 + * On 32-bit kernels, VM exits still load the FS and GS bases from the
1843 + * VMCS rather than the segment table. KVM uses this helper to figure
1844 + * out the current bases to poke them into the VMCS before entry.
1845 + */
1846 +static unsigned long segment_base(u16 selector)
1847 +{
1848 + struct desc_struct *table;
1849 + unsigned long v;
1850 +
1851 + if (!(selector & ~SEGMENT_RPL_MASK))
1852 + return 0;
1853 +
1854 + table = get_current_gdt_ro();
1855 +
1856 + if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
1857 + u16 ldt_selector = kvm_read_ldt();
1858 +
1859 + if (!(ldt_selector & ~SEGMENT_RPL_MASK))
1860 + return 0;
1861 +
1862 + table = (struct desc_struct *)segment_base(ldt_selector);
1863 + }
1864 + v = get_desc_base(&table[selector >> 3]);
1865 + return v;
1866 +}
1867 +#endif
1868 +
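segment_base() turns a selector into a table index (selector >> 3) and then reassembles the descriptor base, which the architecture scatters across three fields of the 8-byte descriptor (bits 15:0, 23:16 and 31:24). The sketch below is illustrative only: the struct mirrors the legacy descriptor layout but is not the kernel's desc_struct, and desc_base() merely stands in for get_desc_base().

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Layout of a legacy 8-byte x86 code/data segment descriptor. */
struct seg_desc {
	uint16_t limit0;
	uint16_t base0;        /* base bits 15:0  */
	uint8_t  base1;        /* base bits 23:16 */
	uint8_t  type_attrs;
	uint8_t  limit1_flags;
	uint8_t  base2;        /* base bits 31:24 */
} __attribute__((packed));

/* Same reassembly get_desc_base() performs on a real GDT/LDT entry. */
static uint32_t desc_base(const struct seg_desc *d)
{
	return (uint32_t)d->base0 | ((uint32_t)d->base1 << 16) |
	       ((uint32_t)d->base2 << 24);
}

int main(void)
{
	/* Descriptor with base 0x12345678 split across the three fields. */
	struct seg_desc d = { .base0 = 0x5678, .base1 = 0x34, .base2 = 0x12 };
	uint16_t selector = 0x23;  /* index 4, TI=0 (GDT), RPL=3 */

	printf("table index     = %d\n", selector >> 3);
	printf("descriptor base = %#" PRIx32 "\n", desc_base(&d));
	return 0;
}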
1869 +static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
1870 +{
1871 + u32 i;
1872 +
1873 + wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
1874 + wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
1875 + wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
1876 + wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
1877 + for (i = 0; i < addr_range; i++) {
1878 + wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
1879 + wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
1880 + }
1881 +}
1882 +
1883 +static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
1884 +{
1885 + u32 i;
1886 +
1887 + rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
1888 + rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
1889 + rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
1890 + rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
1891 + for (i = 0; i < addr_range; i++) {
1892 + rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
1893 + rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
1894 + }
1895 +}
1896 +
1897 +static void pt_guest_enter(struct vcpu_vmx *vmx)
1898 +{
1899 + if (pt_mode == PT_MODE_SYSTEM)
1900 + return;
1901 +
1902 + /*
1903 + * GUEST_IA32_RTIT_CTL is already set in the VMCS.
1904 + * Save host state before VM entry.
1905 + */
1906 + rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
1907 + if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
1908 + wrmsrl(MSR_IA32_RTIT_CTL, 0);
1909 + pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
1910 + pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
1911 + }
1912 +}
1913 +
1914 +static void pt_guest_exit(struct vcpu_vmx *vmx)
1915 +{
1916 + if (pt_mode == PT_MODE_SYSTEM)
1917 + return;
1918 +
1919 + if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
1920 + pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
1921 + pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
1922 + }
1923 +
1924 + /* Reload host state (IA32_RTIT_CTL will be cleared on VM exit). */
1925 + wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
1926 +}
1927 +
1928 +void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
1929 + unsigned long fs_base, unsigned long gs_base)
1930 +{
1931 + if (unlikely(fs_sel != host->fs_sel)) {
1932 + if (!(fs_sel & 7))
1933 + vmcs_write16(HOST_FS_SELECTOR, fs_sel);
1934 + else
1935 + vmcs_write16(HOST_FS_SELECTOR, 0);
1936 + host->fs_sel = fs_sel;
1937 + }
1938 + if (unlikely(gs_sel != host->gs_sel)) {
1939 + if (!(gs_sel & 7))
1940 + vmcs_write16(HOST_GS_SELECTOR, gs_sel);
1941 + else
1942 + vmcs_write16(HOST_GS_SELECTOR, 0);
1943 + host->gs_sel = gs_sel;
1944 + }
1945 + if (unlikely(fs_base != host->fs_base)) {
1946 + vmcs_writel(HOST_FS_BASE, fs_base);
1947 + host->fs_base = fs_base;
1948 + }
1949 + if (unlikely(gs_base != host->gs_base)) {
1950 + vmcs_writel(HOST_GS_BASE, gs_base);
1951 + host->gs_base = gs_base;
1952 + }
1953 +}
1954 +
1955 +void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
1956 +{
1957 + struct vcpu_vmx *vmx = to_vmx(vcpu);
1958 + struct vmcs_host_state *host_state;
1959 +#ifdef CONFIG_X86_64
1960 + int cpu = raw_smp_processor_id();
1961 +#endif
1962 + unsigned long fs_base, gs_base;
1963 + u16 fs_sel, gs_sel;
1964 + int i;
1965 +
1966 + vmx->req_immediate_exit = false;
1967 +
1968 + /*
1969 + * Note that guest MSRs to be saved/restored can also be changed
1970 + * when guest state is loaded. This happens when guest transitions
1971 + * to/from long-mode by setting MSR_EFER.LMA.
1972 + */
1973 + if (!vmx->guest_msrs_ready) {
1974 + vmx->guest_msrs_ready = true;
1975 + for (i = 0; i < vmx->save_nmsrs; ++i)
1976 + kvm_set_shared_msr(vmx->guest_msrs[i].index,
1977 + vmx->guest_msrs[i].data,
1978 + vmx->guest_msrs[i].mask);
1979 +
1980 + }
1981 + if (vmx->guest_state_loaded)
1982 + return;
1983 +
1984 + host_state = &vmx->loaded_vmcs->host_state;
1985 +
1986 + /*
1987 + * Set host fs and gs selectors. Unfortunately, 22.2.3 does not
1988 + * allow segment selectors with cpl > 0 or ti == 1.
1989 + */
1990 + host_state->ldt_sel = kvm_read_ldt();
1991 +
1992 +#ifdef CONFIG_X86_64
1993 + savesegment(ds, host_state->ds_sel);
1994 + savesegment(es, host_state->es_sel);
1995 +
1996 + gs_base = cpu_kernelmode_gs_base(cpu);
1997 + if (likely(is_64bit_mm(current->mm))) {
1998 + save_fsgs_for_kvm();
1999 + fs_sel = current->thread.fsindex;
2000 + gs_sel = current->thread.gsindex;
2001 + fs_base = current->thread.fsbase;
2002 + vmx->msr_host_kernel_gs_base = current->thread.gsbase;
2003 + } else {
2004 + savesegment(fs, fs_sel);
2005 + savesegment(gs, gs_sel);
2006 + fs_base = read_msr(MSR_FS_BASE);
2007 + vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
2008 + }
2009 +
2010 + wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
2011 +#else
2012 + savesegment(fs, fs_sel);
2013 + savesegment(gs, gs_sel);
2014 + fs_base = segment_base(fs_sel);
2015 + gs_base = segment_base(gs_sel);
2016 +#endif
2017 +
2018 + vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
2019 + vmx->guest_state_loaded = true;
2020 +}
2021 +
2022 +static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
2023 +{
2024 + struct vmcs_host_state *host_state;
2025 +
2026 + if (!vmx->guest_state_loaded)
2027 + return;
2028 +
2029 + host_state = &vmx->loaded_vmcs->host_state;
2030 +
2031 + ++vmx->vcpu.stat.host_state_reload;
2032 +
2033 +#ifdef CONFIG_X86_64
2034 + rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
2035 +#endif
2036 + if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
2037 + kvm_load_ldt(host_state->ldt_sel);
2038 +#ifdef CONFIG_X86_64
2039 + load_gs_index(host_state->gs_sel);
2040 +#else
2041 + loadsegment(gs, host_state->gs_sel);
2042 +#endif
2043 + }
2044 + if (host_state->fs_sel & 7)
2045 + loadsegment(fs, host_state->fs_sel);
2046 +#ifdef CONFIG_X86_64
2047 + if (unlikely(host_state->ds_sel | host_state->es_sel)) {
2048 + loadsegment(ds, host_state->ds_sel);
2049 + loadsegment(es, host_state->es_sel);
2050 + }
2051 +#endif
2052 + invalidate_tss_limit();
2053 +#ifdef CONFIG_X86_64
2054 + wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
2055 +#endif
2056 + load_fixmap_gdt(raw_smp_processor_id());
2057 + vmx->guest_state_loaded = false;
2058 + vmx->guest_msrs_ready = false;
2059 +}
2060 +
2061 +#ifdef CONFIG_X86_64
2062 +static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
2063 +{
2064 + preempt_disable();
2065 + if (vmx->guest_state_loaded)
2066 + rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
2067 + preempt_enable();
2068 + return vmx->msr_guest_kernel_gs_base;
2069 +}
2070 +
2071 +static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
2072 +{
2073 + preempt_disable();
2074 + if (vmx->guest_state_loaded)
2075 + wrmsrl(MSR_KERNEL_GS_BASE, data);
2076 + preempt_enable();
2077 + vmx->msr_guest_kernel_gs_base = data;
2078 +}
2079 +#endif
2080 +
2081 +static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
2082 +{
2083 + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
2084 + struct pi_desc old, new;
2085 + unsigned int dest;
2086 +
2087 + /*
2088 + * In case of hot-plug or hot-unplug, we may have to undo
2089 + * vmx_vcpu_pi_put even if there is no assigned device. And we
2090 + * always keep PI.NDST up to date for simplicity: it makes the
2091 + * code easier, and CPU migration is not a fast path.
2092 + */
2093 + if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
2094 + return;
2095 +
2096 + /*
2097 + * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
2098 + * PI.NDST: pi_post_block is the one expected to change PI.NDST and the

2099 + * wakeup handler expects the vCPU to be on the blocked_vcpu_list that
2100 + * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up
2101 + * correctly.
2102 + */
2103 + if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) {
2104 + pi_clear_sn(pi_desc);
2105 + goto after_clear_sn;
2106 + }
2107 +
2108 + /* The full case. */
2109 + do {
2110 + old.control = new.control = pi_desc->control;
2111 +
2112 + dest = cpu_physical_id(cpu);
2113 +
2114 + if (x2apic_enabled())
2115 + new.ndst = dest;
2116 + else
2117 + new.ndst = (dest << 8) & 0xFF00;
2118 +
2119 + new.sn = 0;
2120 + } while (cmpxchg64(&pi_desc->control, old.control,
2121 + new.control) != old.control);
2122 +
2123 +after_clear_sn:
2124 +
2125 + /*
2126 + * Clear SN before reading the bitmap. The VT-d firmware
2127 + * writes the bitmap and reads SN atomically (5.2.3 in the
2128 + * spec), so it has no explicit barrier that pairs with this
2129 + * one, but we still need a barrier on our side.
2130 + */
2131 + smp_mb__after_atomic();
2132 +
2133 + if (!pi_is_pir_empty(pi_desc))
2134 + pi_set_on(pi_desc);
2135 +}
2136 +
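The do/while loop in vmx_vcpu_pi_load() is a classic lock-free read-modify-write: snapshot the 64-bit control word, compute the new NDST and SN fields, and retry the compare-and-swap until no concurrent updater changed the word in between. The user-space sketch below shows the same pattern with GCC's __atomic builtins; the field layout is deliberately simplified and is not the real posted-interrupt descriptor.

#include <stdint.h>
#include <stdio.h>

/* Simplified model of the posted-interrupt control word: the real descriptor
 * packs NDST, SN and other fields differently, but the update pattern is the
 * same. */
union pi_control {
	struct {
		uint64_t ndst : 32;  /* notification destination */
		uint64_t rsvd : 31;
		uint64_t sn   : 1;   /* suppress notification */
	};
	uint64_t full;
};

/* Retry loop mirroring the cmpxchg64() update in vmx_vcpu_pi_load(). */
static void pi_set_ndst_clear_sn(union pi_control *ctl, uint32_t dest)
{
	union pi_control old, new;

	do {
		old.full = __atomic_load_n(&ctl->full, __ATOMIC_RELAXED);
		new = old;
		new.ndst = dest;  /* route notifications to the new CPU */
		new.sn = 0;       /* stop suppressing notifications */
	} while (!__atomic_compare_exchange_n(&ctl->full, &old.full, new.full,
					      false, __ATOMIC_ACQ_REL,
					      __ATOMIC_RELAXED));
}

int main(void)
{
	union pi_control ctl = { .full = 0 };

	ctl.sn = 1;
	pi_set_ndst_clear_sn(&ctl, 5);
	printf("ndst=%u sn=%u\n", (unsigned)ctl.ndst, (unsigned)ctl.sn);
	return 0;
}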
2137 +void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu)
2138 +{
2139 + struct vcpu_vmx *vmx = to_vmx(vcpu);
2140 + bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
2141 +
2142 + if (!already_loaded) {
2143 + loaded_vmcs_clear(vmx->loaded_vmcs);
2144 + local_irq_disable();
2145 + crash_disable_local_vmclear(cpu);
2146 +
2147 + /*
2148 + * Read loaded_vmcs->cpu should be before fetching
2149 + * loaded_vmcs->loaded_vmcss_on_cpu_link.
2150 + * See the comments in __loaded_vmcs_clear().
2151 + */
2152 + smp_rmb();
2153 +
2154 + list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
2155 + &per_cpu(loaded_vmcss_on_cpu, cpu));
2156 + crash_enable_local_vmclear(cpu);
2157 + local_irq_enable();
2158 + }
2159 +
2160 + if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
2161 + per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
2162 + vmcs_load(vmx->loaded_vmcs->vmcs);
2163 + indirect_branch_prediction_barrier();
2164 + }
2165 +
2166 + if (!already_loaded) {
2167 + void *gdt = get_current_gdt_ro();
2168 + unsigned long sysenter_esp;
2169 +
2170 + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2171 +
2172 + /*
2173 + * Linux uses per-cpu TSS and GDT, so set these when switching
2174 + * processors. See 22.2.4.
2175 + */
2176 + vmcs_writel(HOST_TR_BASE,
2177 + (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
2178 + vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */
2179 +
2180 + rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
2181 + vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
2182 +
2183 + vmx->loaded_vmcs->cpu = cpu;
2184 + }
2185 +
2186 + /* Setup TSC multiplier */
2187 + if (kvm_has_tsc_control &&
2188 + vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio)
2189 + decache_tsc_multiplier(vmx);
2190 +}
2191 +
2192 +/*
2193 + * Switches to specified vcpu, until a matching vcpu_put(), but assumes
2194 + * vcpu mutex is already taken.
2195 + */
2196 +void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2197 +{
2198 + struct vcpu_vmx *vmx = to_vmx(vcpu);
2199 +
2200 + vmx_vcpu_load_vmcs(vcpu, cpu);
2201 +
2202 + vmx_vcpu_pi_load(vcpu, cpu);
2203 +
2204 + vmx->host_pkru = read_pkru();
2205 + vmx->host_debugctlmsr = get_debugctlmsr();
2206 +}
2207 +
2208 +static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
2209 +{
2210 + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
2211 +
2212 + if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
2213 + !irq_remapping_cap(IRQ_POSTING_CAP) ||
2214 + !kvm_vcpu_apicv_active(vcpu))
2215 + return;
2216 +
2217 + /* Set SN when the vCPU is preempted */
2218 + if (vcpu->preempted)
2219 + pi_set_sn(pi_desc);
2220 +}
2221 +
2222 +static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
2223 +{
2224 + vmx_vcpu_pi_put(vcpu);
2225 +
2226 + vmx_prepare_switch_to_host(to_vmx(vcpu));
2227 +}
2228 +
2229 +static bool emulation_required(struct kvm_vcpu *vcpu)
2230 +{
2231 + return emulate_invalid_guest_state && !guest_state_valid(vcpu);
2232 +}
2233 +
2234 +static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
2235 +
2236 +unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
2237 +{
2238 + struct vcpu_vmx *vmx = to_vmx(vcpu);
2239 + unsigned long rflags, save_rflags;
2240 +
2241 + if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) {
2242 + kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
2243 + rflags = vmcs_readl(GUEST_RFLAGS);
2244 + if (vmx->rmode.vm86_active) {
2245 + rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
2246 + save_rflags = vmx->rmode.save_rflags;
2247 + rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
2248 + }
2249 + vmx->rflags = rflags;
2250 + }
2251 + return vmx->rflags;
2252 +}
2253 +
2254 +void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
2255 +{
2256 + struct vcpu_vmx *vmx = to_vmx(vcpu);
2257 + unsigned long old_rflags;
2258 +
2259 + if (enable_unrestricted_guest) {
2260 + kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
2261 + vmx->rflags = rflags;
2262 + vmcs_writel(GUEST_RFLAGS, rflags);
2263 + return;
2264 + }
2265 +
2266 + old_rflags = vmx_get_rflags(vcpu);
2267 + vmx->rflags = rflags;
2268 + if (vmx->rmode.vm86_active) {
2269 + vmx->rmode.save_rflags = rflags;
2270 + rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
2271 + }
2272 + vmcs_writel(GUEST_RFLAGS, rflags);
2273 +
2274 + if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM)
2275 + vmx->emulation_required = emulation_required(vcpu);
2276 +}
2277 +
2278 +u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
2279 +{
2280 + u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
2281 + int ret = 0;
2282 +
2283 + if (interruptibility & GUEST_INTR_STATE_STI)
2284 + ret |= KVM_X86_SHADOW_INT_STI;
2285 + if (interruptibility & GUEST_INTR_STATE_MOV_SS)
2286 + ret |= KVM_X86_SHADOW_INT_MOV_SS;
2287 +
2288 + return ret;
2289 +}
2290 +
2291 +void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
2292 +{
2293 + u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
2294 + u32 interruptibility = interruptibility_old;
2295 +
2296 + interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
2297 +
2298 + if (mask & KVM_X86_SHADOW_INT_MOV_SS)
2299 + interruptibility |= GUEST_INTR_STATE_MOV_SS;
2300 + else if (mask & KVM_X86_SHADOW_INT_STI)
2301 + interruptibility |= GUEST_INTR_STATE_STI;
2302 +
2303 + if ((interruptibility != interruptibility_old))
2304 + vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
2305 +}
2306 +
2307 +static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
2308 +{
2309 + struct vcpu_vmx *vmx = to_vmx(vcpu);
2310 + unsigned long value;
2311 +
2312 + /*
2313 + * Any MSR write that attempts to change bits marked reserved will
2314 +	 * cause a #GP fault.
2315 + */
2316 + if (data & vmx->pt_desc.ctl_bitmask)
2317 + return 1;
2318 +
2319 + /*
2320 + * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will
2321 + * result in a #GP unless the same write also clears TraceEn.
2322 + */
2323 + if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&
2324 + ((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN))
2325 + return 1;
2326 +
2327 + /*
2328 +	 * A WRMSR to IA32_RTIT_CTL that sets TraceEn but clears both ToPA
2329 +	 * and FabricEn causes a #GP if
2330 +	 * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0.
2331 + */
2332 + if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) &&
2333 + !(data & RTIT_CTL_FABRIC_EN) &&
2334 + !intel_pt_validate_cap(vmx->pt_desc.caps,
2335 + PT_CAP_single_range_output))
2336 + return 1;
2337 +
2338 + /*
2339 +	 * Check the MTCFreq, CycThresh and PSBFreq encodings: any MSR write
2340 +	 * that uses an encoding marked reserved will cause a #GP fault.
2341 + */
2342 + value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods);
2343 + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) &&
2344 + !test_bit((data & RTIT_CTL_MTC_RANGE) >>
2345 + RTIT_CTL_MTC_RANGE_OFFSET, &value))
2346 + return 1;
2347 + value = intel_pt_validate_cap(vmx->pt_desc.caps,
2348 + PT_CAP_cycle_thresholds);
2349 + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
2350 + !test_bit((data & RTIT_CTL_CYC_THRESH) >>
2351 + RTIT_CTL_CYC_THRESH_OFFSET, &value))
2352 + return 1;
2353 + value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods);
2354 + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
2355 + !test_bit((data & RTIT_CTL_PSB_FREQ) >>
2356 + RTIT_CTL_PSB_FREQ_OFFSET, &value))
2357 + return 1;
2358 +
2359 + /*
2360 +	 * An ADDRx_CFG encoding that is reserved (the address range is not
2361 +	 * supported) or greater than 2 will cause a #GP fault.
2362 + */
2363 + value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET;
2364 + if ((value && (vmx->pt_desc.addr_range < 1)) || (value > 2))
2365 + return 1;
2366 + value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET;
2367 + if ((value && (vmx->pt_desc.addr_range < 2)) || (value > 2))
2368 + return 1;
2369 + value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET;
2370 + if ((value && (vmx->pt_desc.addr_range < 3)) || (value > 2))
2371 + return 1;
2372 + value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET;
2373 + if ((value && (vmx->pt_desc.addr_range < 4)) || (value > 2))
2374 + return 1;
2375 +
2376 + return 0;
2377 +}
2378 +
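Each multi-bit field check in vmx_rtit_ctl_check() has the same shape: mask and shift the field out of the would-be IA32_RTIT_CTL value, then test whether that encoding's bit is set in the capability word reported by CPUID. The compact sketch below shows the pattern with a 4-bit field at bits 17:14 (similar in shape to the MTCFreq check above); the capability value is invented.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical 4-bit "frequency" encoding field at bits 17:14. */
#define FREQ_MASK   (0xfULL << 14)
#define FREQ_OFFSET 14

static int encoding_allowed(uint64_t ctl, uint32_t supported_encodings)
{
	unsigned int enc = (ctl & FREQ_MASK) >> FREQ_OFFSET;

	/* supported_encodings is a bitmap: bit n set means encoding n is
	 * valid, just like the capability words the checks above consult. */
	return (supported_encodings >> enc) & 1;
}

int main(void)
{
	uint32_t caps = 0x0249;  /* encodings 0, 3, 6 and 9 supported */
	uint64_t good = 3ULL << FREQ_OFFSET;
	uint64_t bad  = 5ULL << FREQ_OFFSET;

	printf("encoding 3 allowed: %d\n", encoding_allowed(good, caps));  /* 1 */
	printf("encoding 5 allowed: %d\n", encoding_allowed(bad, caps));   /* 0 */
	return 0;
}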
2379 +static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
2380 +{
2381 + unsigned long rip;
2382 +
2383 + /*
2384 + * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
2385 + * undefined behavior: Intel's SDM doesn't mandate the VMCS field be
2386 + * set when EPT misconfig occurs. In practice, real hardware updates
2387 + * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors
2388 + * (namely Hyper-V) don't set it due to it being undefined behavior,
2389 + * i.e. we end up advancing IP with some random value.
2390 + */
2391 + if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
2392 + to_vmx(vcpu)->exit_reason != EXIT_REASON_EPT_MISCONFIG) {
2393 + rip = kvm_rip_read(vcpu);
2394 + rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
2395 + kvm_rip_write(vcpu, rip);
2396 + } else {
2397 + if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
2398 + return 0;
2399 + }
2400 +
2401 + /* skipping an emulated instruction also counts */
2402 + vmx_set_interrupt_shadow(vcpu, 0);
2403 +
2404 + return 1;
2405 +}
2406 +
2407 +static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
2408 +{
2409 + /*
2410 + * Ensure that we clear the HLT state in the VMCS. We don't need to
2411 + * explicitly skip the instruction because if the HLT state is set,
2412 + * then the instruction is already executing and RIP has already been
2413 + * advanced.
2414 + */
2415 + if (kvm_hlt_in_guest(vcpu->kvm) &&
2416 + vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
2417 + vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
2418 +}
2419 +
2420 +static void vmx_queue_exception(struct kvm_vcpu *vcpu)
2421 +{
2422 + struct vcpu_vmx *vmx = to_vmx(vcpu);
2423 + unsigned nr = vcpu->arch.exception.nr;
2424 + bool has_error_code = vcpu->arch.exception.has_error_code;
2425 + u32 error_code = vcpu->arch.exception.error_code;
2426 + u32 intr_info = nr | INTR_INFO_VALID_MASK;
2427 +
2428 + kvm_deliver_exception_payload(vcpu);
2429 +
2430 + if (has_error_code) {
2431 + vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
2432 + intr_info |= INTR_INFO_DELIVER_CODE_MASK;
2433 + }
2434 +
2435 + if (vmx->rmode.vm86_active) {
2436 + int inc_eip = 0;
2437 + if (kvm_exception_is_soft(nr))
2438 + inc_eip = vcpu->arch.event_exit_inst_len;
2439 + kvm_inject_realmode_interrupt(vcpu, nr, inc_eip);
2440 + return;
2441 + }
2442 +
2443 + WARN_ON_ONCE(vmx->emulation_required);
2444 +
2445 + if (kvm_exception_is_soft(nr)) {
2446 + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2447 + vmx->vcpu.arch.event_exit_inst_len);
2448 + intr_info |= INTR_TYPE_SOFT_EXCEPTION;
2449 + } else
2450 + intr_info |= INTR_TYPE_HARD_EXCEPTION;
2451 +
2452 + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
2453 +
2454 + vmx_clear_hlt(vcpu);
2455 +}
2456 +
2457 +static bool vmx_rdtscp_supported(void)
2458 +{
2459 + return cpu_has_vmx_rdtscp();
2460 +}
2461 +
2462 +static bool vmx_invpcid_supported(void)
2463 +{
2464 + return cpu_has_vmx_invpcid();
2465 +}
2466 +
2467 +/*
2468 + * Swap MSR entry in host/guest MSR entry array.
2469 + */
2470 +static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
2471 +{
2472 + struct shared_msr_entry tmp;
2473 +
2474 + tmp = vmx->guest_msrs[to];
2475 + vmx->guest_msrs[to] = vmx->guest_msrs[from];
2476 + vmx->guest_msrs[from] = tmp;
2477 +}
2478 +
2479 +/*
2480 + * Set up the vmcs to automatically save and restore system
2481 + * msrs. Don't touch the 64-bit msrs if the guest is in legacy
2482 + * mode, as fiddling with msrs is very expensive.
2483 + */
2484 +static void setup_msrs(struct vcpu_vmx *vmx)
2485 +{
2486 + int save_nmsrs, index;
2487 +
2488 + save_nmsrs = 0;
2489 +#ifdef CONFIG_X86_64
2490 + /*
2491 + * The SYSCALL MSRs are only needed on long mode guests, and only
2492 + * when EFER.SCE is set.
2493 + */
2494 + if (is_long_mode(&vmx->vcpu) && (vmx->vcpu.arch.efer & EFER_SCE)) {
2495 + index = __find_msr_index(vmx, MSR_STAR);
2496 + if (index >= 0)
2497 + move_msr_up(vmx, index, save_nmsrs++);
2498 + index = __find_msr_index(vmx, MSR_LSTAR);
2499 + if (index >= 0)
2500 + move_msr_up(vmx, index, save_nmsrs++);
2501 + index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
2502 + if (index >= 0)
2503 + move_msr_up(vmx, index, save_nmsrs++);
2504 + }
2505 +#endif
2506 + index = __find_msr_index(vmx, MSR_EFER);
2507 + if (index >= 0 && update_transition_efer(vmx, index))
2508 + move_msr_up(vmx, index, save_nmsrs++);
2509 + index = __find_msr_index(vmx, MSR_TSC_AUX);
2510 + if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
2511 + move_msr_up(vmx, index, save_nmsrs++);
2512 + index = __find_msr_index(vmx, MSR_IA32_TSX_CTRL);
2513 + if (index >= 0)
2514 + move_msr_up(vmx, index, save_nmsrs++);
2515 +
2516 + vmx->save_nmsrs = save_nmsrs;
2517 + vmx->guest_msrs_ready = false;
2518 +
2519 + if (cpu_has_vmx_msr_bitmap())
2520 + vmx_update_msr_bitmap(&vmx->vcpu);
2521 +}
2522 +
2523 +static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
2524 +{
2525 + struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2526 +
2527 + if (is_guest_mode(vcpu) &&
2528 + (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING))
2529 + return vcpu->arch.tsc_offset - vmcs12->tsc_offset;
2530 +
2531 + return vcpu->arch.tsc_offset;
2532 +}
2533 +
2534 +static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
2535 +{
2536 + struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2537 + u64 g_tsc_offset = 0;
2538 +
2539 + /*
2540 + * We're here if L1 chose not to trap WRMSR to TSC. According
2541 +	 * to the spec, this should set L1's TSC; the offset that L1
2542 + * set for L2 remains unchanged, and still needs to be added
2543 + * to the newly set TSC to get L2's TSC.
2544 + */
2545 + if (is_guest_mode(vcpu) &&
2546 + (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING))
2547 + g_tsc_offset = vmcs12->tsc_offset;
2548 +
2549 + trace_kvm_write_tsc_offset(vcpu->vcpu_id,
2550 + vcpu->arch.tsc_offset - g_tsc_offset,
2551 + offset);
2552 + vmcs_write64(TSC_OFFSET, offset + g_tsc_offset);
2553 + return offset + g_tsc_offset;
2554 +}
2555 +
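vmx_write_l1_tsc_offset() keeps the invariant that, while L2 runs, the hardware TSC_OFFSET field holds the sum of the offset KVM programs for L1 and the offset L1 programmed for L2 in vmcs12, so each level sees the host TSC plus its cumulative offsets. A tiny arithmetic sketch with made-up numbers:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t host_tsc  = 1000000;  /* what RDTSC returns on the host    */
	int64_t  l1_offset = -400000;  /* offset KVM programs for L1        */
	int64_t  l2_offset =  250000;  /* offset L1 put in vmcs12 for L2    */

	/* While L2 runs, the active VMCS carries the sum of both offsets,
	 * which is exactly what vmx_write_l1_tsc_offset() writes. */
	int64_t hw_offset = l1_offset + l2_offset;

	printf("L1 sees TSC = %" PRIu64 "\n", host_tsc + l1_offset);  /* 600000 */
	printf("L2 sees TSC = %" PRIu64 "\n", host_tsc + hw_offset);  /* 850000 */
	return 0;
}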
2556 +/*
2557 + * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
2558 + * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
2559 + * all guests if the "nested" module option is off, and can also be disabled
2560 + * for a single guest by disabling its VMX cpuid bit.
2561 + */
2562 +bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
2563 +{
2564 + return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
2565 +}
2566 +
2567 +static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
2568 + uint64_t val)
2569 +{
2570 + uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits;
2571 +
2572 + return !(val & ~valid_bits);
2573 +}
2574 +
2575 +static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
2576 +{
2577 + switch (msr->index) {
2578 + case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
2579 + if (!nested)
2580 + return 1;
2581 + return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
2582 + default:
2583 + return 1;
2584 + }
2585 +}
2586 +
2587 +/*
2588 + * Reads the MSR specified by msr_info->index into msr_info->data.
2589 + * Returns 0 on success, non-0 otherwise.
2590 + * Assumes vcpu_load() was already called.
2591 + */
2592 +static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2593 +{
2594 + struct vcpu_vmx *vmx = to_vmx(vcpu);
2595 + struct shared_msr_entry *msr;
2596 + u32 index;
2597 +
2598 + switch (msr_info->index) {
2599 +#ifdef CONFIG_X86_64
2600 + case MSR_FS_BASE:
2601 + msr_info->data = vmcs_readl(GUEST_FS_BASE);
2602 + break;
2603 + case MSR_GS_BASE:
2604 + msr_info->data = vmcs_readl(GUEST_GS_BASE);
2605 + break;
2606 + case MSR_KERNEL_GS_BASE:
2607 + msr_info->data = vmx_read_guest_kernel_gs_base(vmx);
2608 + break;
2609 +#endif
2610 + case MSR_EFER:
2611 + return kvm_get_msr_common(vcpu, msr_info);
2612 + case MSR_IA32_TSX_CTRL:
2613 + if (!msr_info->host_initiated &&
2614 + !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
2615 + return 1;
2616 + goto find_shared_msr;
2617 + case MSR_IA32_UMWAIT_CONTROL:
2618 + if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
2619 + return 1;
2620 +
2621 + msr_info->data = vmx->msr_ia32_umwait_control;
2622 + break;
2623 + case MSR_IA32_SPEC_CTRL:
2624 + if (!msr_info->host_initiated &&
2625 + !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
2626 + return 1;
2627 +
2628 + msr_info->data = to_vmx(vcpu)->spec_ctrl;
2629 + break;
2630 + case MSR_IA32_SYSENTER_CS:
2631 + msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
2632 + break;
2633 + case MSR_IA32_SYSENTER_EIP:
2634 + msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
2635 + break;
2636 + case MSR_IA32_SYSENTER_ESP:
2637 + msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
2638 + break;
2639 + case MSR_IA32_BNDCFGS:
2640 + if (!kvm_mpx_supported() ||
2641 + (!msr_info->host_initiated &&
2642 + !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
2643 + return 1;
2644 + msr_info->data = vmcs_read64(GUEST_BNDCFGS);
2645 + break;
2646 + case MSR_IA32_MCG_EXT_CTL:
2647 + if (!msr_info->host_initiated &&
2648 + !(vmx->msr_ia32_feature_control &
2649 + FEATURE_CONTROL_LMCE))
2650 + return 1;
2651 + msr_info->data = vcpu->arch.mcg_ext_ctl;
2652 + break;
2653 + case MSR_IA32_FEATURE_CONTROL:
2654 + msr_info->data = vmx->msr_ia32_feature_control;
2655 + break;
2656 + case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
2657 + if (!nested_vmx_allowed(vcpu))
2658 + return 1;
2659 + return vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
2660 + &msr_info->data);
2661 + case MSR_IA32_RTIT_CTL:
2662 + if (pt_mode != PT_MODE_HOST_GUEST)
2663 + return 1;
2664 + msr_info->data = vmx->pt_desc.guest.ctl;
2665 + break;
2666 + case MSR_IA32_RTIT_STATUS:
2667 + if (pt_mode != PT_MODE_HOST_GUEST)
2668 + return 1;
2669 + msr_info->data = vmx->pt_desc.guest.status;
2670 + break;
2671 + case MSR_IA32_RTIT_CR3_MATCH:
2672 + if ((pt_mode != PT_MODE_HOST_GUEST) ||
2673 + !intel_pt_validate_cap(vmx->pt_desc.caps,
2674 + PT_CAP_cr3_filtering))
2675 + return 1;
2676 + msr_info->data = vmx->pt_desc.guest.cr3_match;
2677 + break;
2678 + case MSR_IA32_RTIT_OUTPUT_BASE:
2679 + if ((pt_mode != PT_MODE_HOST_GUEST) ||
2680 + (!intel_pt_validate_cap(vmx->pt_desc.caps,
2681 + PT_CAP_topa_output) &&
2682 + !intel_pt_validate_cap(vmx->pt_desc.caps,
2683 + PT_CAP_single_range_output)))
2684 + return 1;
2685 + msr_info->data = vmx->pt_desc.guest.output_base;
2686 + break;
2687 + case MSR_IA32_RTIT_OUTPUT_MASK:
2688 + if ((pt_mode != PT_MODE_HOST_GUEST) ||
2689 + (!intel_pt_validate_cap(vmx->pt_desc.caps,
2690 + PT_CAP_topa_output) &&
2691 + !intel_pt_validate_cap(vmx->pt_desc.caps,
2692 + PT_CAP_single_range_output)))
2693 + return 1;
2694 + msr_info->data = vmx->pt_desc.guest.output_mask;
2695 + break;
2696 + case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
2697 + index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
2698 + if ((pt_mode != PT_MODE_HOST_GUEST) ||
2699 + (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
2700 + PT_CAP_num_address_ranges)))
2701 + return 1;
2702 + if (is_noncanonical_address(data, vcpu))
2703 + return 1;
2704 + if (index % 2)
2705 + msr_info->data = vmx->pt_desc.guest.addr_b[index / 2];
2706 + else
2707 + msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
2708 + break;
2709 + case MSR_TSC_AUX:
2710 + if (!msr_info->host_initiated &&
2711 + !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
2712 + return 1;
2713 + goto find_shared_msr;
2714 + default:
2715 + find_shared_msr:
2716 + msr = find_msr_entry(vmx, msr_info->index);
2717 + if (msr) {
2718 + msr_info->data = msr->data;
2719 + break;
2720 + }
2721 + return kvm_get_msr_common(vcpu, msr_info);
2722 + }
2723 +
2724 + return 0;
2725 +}
2726 +
2727 +/*
2728 + * Writes msr value into the appropriate "register".
2729 + * Returns 0 on success, non-0 otherwise.
2730 + * Assumes vcpu_load() was already called.
2731 + */
2732 +static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2733 +{
2734 + struct vcpu_vmx *vmx = to_vmx(vcpu);
2735 + struct shared_msr_entry *msr;
2736 + int ret = 0;
2737 + u32 msr_index = msr_info->index;
2738 + u64 data = msr_info->data;
2739 + u32 index;
2740 +
2741 + switch (msr_index) {
2742 + case MSR_EFER:
2743 + ret = kvm_set_msr_common(vcpu, msr_info);
2744 + break;
2745 +#ifdef CONFIG_X86_64
2746 + case MSR_FS_BASE:
2747 + vmx_segment_cache_clear(vmx);
2748 + vmcs_writel(GUEST_FS_BASE, data);
2749 + break;
2750 + case MSR_GS_BASE:
2751 + vmx_segment_cache_clear(vmx);
2752 + vmcs_writel(GUEST_GS_BASE, data);
2753 + break;
2754 + case MSR_KERNEL_GS_BASE:
2755 + vmx_write_guest_kernel_gs_base(vmx, data);
2756 + break;
2757 +#endif
2758 + case MSR_IA32_SYSENTER_CS:
2759 + if (is_guest_mode(vcpu))
2760 + get_vmcs12(vcpu)->guest_sysenter_cs = data;
2761 + vmcs_write32(GUEST_SYSENTER_CS, data);
2762 + break;
2763 + case MSR_IA32_SYSENTER_EIP:
2764 + if (is_guest_mode(vcpu))
2765 + get_vmcs12(vcpu)->guest_sysenter_eip = data;
2766 + vmcs_writel(GUEST_SYSENTER_EIP, data);
2767 + break;
2768 + case MSR_IA32_SYSENTER_ESP:
2769 + if (is_guest_mode(vcpu))
2770 + get_vmcs12(vcpu)->guest_sysenter_esp = data;
2771 + vmcs_writel(GUEST_SYSENTER_ESP, data);
2772 + break;
2773 + case MSR_IA32_DEBUGCTLMSR:
2774 + if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
2775 + VM_EXIT_SAVE_DEBUG_CONTROLS)
2776 + get_vmcs12(vcpu)->guest_ia32_debugctl = data;
2777 +
2778 + ret = kvm_set_msr_common(vcpu, msr_info);
2779 + break;
2780 +
2781 + case MSR_IA32_BNDCFGS:
2782 + if (!kvm_mpx_supported() ||
2783 + (!msr_info->host_initiated &&
2784 + !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
2785 + return 1;
2786 + if (is_noncanonical_address(data & PAGE_MASK, vcpu) ||
2787 + (data & MSR_IA32_BNDCFGS_RSVD))
2788 + return 1;
2789 + vmcs_write64(GUEST_BNDCFGS, data);
2790 + break;
2791 + case MSR_IA32_UMWAIT_CONTROL:
2792 + if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
2793 + return 1;
2794 +
2795 +		/* Reserved bit 1 and the upper 32 bits [63:32] must be zero */
2796 + if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32)))
2797 + return 1;
2798 +
2799 + vmx->msr_ia32_umwait_control = data;
2800 + break;
2801 + case MSR_IA32_SPEC_CTRL:
2802 + if (!msr_info->host_initiated &&
2803 + !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
2804 + return 1;
2805 +
2806 + /* The STIBP bit doesn't fault even if it's not advertised */
2807 + if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD))
2808 + return 1;
2809 +
2810 + vmx->spec_ctrl = data;
2811 +
2812 + if (!data)
2813 + break;
2814 +
2815 + /*
2816 + * For non-nested:
2817 + * When it's written (to non-zero) for the first time, pass
2818 + * it through.
2819 + *
2820 + * For nested:
2821 + * The handling of the MSR bitmap for L2 guests is done in
2822 + * nested_vmx_prepare_msr_bitmap. We should not touch the
2823 + * vmcs02.msr_bitmap here since it gets completely overwritten
2824 + * in the merging. We update the vmcs01 here for L1 as well
2825 + * since it will end up touching the MSR anyway now.
2826 + */
2827 + vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap,
2828 + MSR_IA32_SPEC_CTRL,
2829 + MSR_TYPE_RW);
2830 + break;
2831 + case MSR_IA32_TSX_CTRL:
2832 + if (!msr_info->host_initiated &&
2833 + !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
2834 + return 1;
2835 + if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR))
2836 + return 1;
2837 + goto find_shared_msr;
2838 + case MSR_IA32_PRED_CMD:
2839 + if (!msr_info->host_initiated &&
2840 + !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
2841 + return 1;
2842 +
2843 + if (data & ~PRED_CMD_IBPB)
2844 + return 1;
2845 +
2846 + if (!data)
2847 + break;
2848 +
2849 + wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
2850 +
2851 + /*
2852 + * For non-nested:
2853 + * When it's written (to non-zero) for the first time, pass
2854 + * it through.
2855 + *
2856 + * For nested:
2857 + * The handling of the MSR bitmap for L2 guests is done in
2858 + * nested_vmx_prepare_msr_bitmap. We should not touch the
2859 + * vmcs02.msr_bitmap here since it gets completely overwritten
2860 + * in the merging.
2861 + */
2862 + vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
2863 + MSR_TYPE_W);
2864 + break;
2865 + case MSR_IA32_CR_PAT:
2866 + if (!kvm_pat_valid(data))
2867 + return 1;
2868 +
2869 + if (is_guest_mode(vcpu) &&
2870 + get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
2871 + get_vmcs12(vcpu)->guest_ia32_pat = data;
2872 +
2873 + if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2874 + vmcs_write64(GUEST_IA32_PAT, data);
2875 + vcpu->arch.pat = data;
2876 + break;
2877 + }
2878 + ret = kvm_set_msr_common(vcpu, msr_info);
2879 + break;
2880 + case MSR_IA32_TSC_ADJUST:
2881 + ret = kvm_set_msr_common(vcpu, msr_info);
2882 + break;
2883 + case MSR_IA32_MCG_EXT_CTL:
2884 + if ((!msr_info->host_initiated &&
2885 + !(to_vmx(vcpu)->msr_ia32_feature_control &
2886 + FEATURE_CONTROL_LMCE)) ||
2887 + (data & ~MCG_EXT_CTL_LMCE_EN))
2888 + return 1;
2889 + vcpu->arch.mcg_ext_ctl = data;
2890 + break;
2891 + case MSR_IA32_FEATURE_CONTROL:
2892 + if (!vmx_feature_control_msr_valid(vcpu, data) ||
2893 + (to_vmx(vcpu)->msr_ia32_feature_control &
2894 + FEATURE_CONTROL_LOCKED && !msr_info->host_initiated))
2895 + return 1;
2896 + vmx->msr_ia32_feature_control = data;
2897 + if (msr_info->host_initiated && data == 0)
2898 + vmx_leave_nested(vcpu);
2899 + break;
2900 + case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
2901 + if (!msr_info->host_initiated)
2902 + return 1; /* they are read-only */
2903 + if (!nested_vmx_allowed(vcpu))
2904 + return 1;
2905 + return vmx_set_vmx_msr(vcpu, msr_index, data);
2906 + case MSR_IA32_RTIT_CTL:
2907 + if ((pt_mode != PT_MODE_HOST_GUEST) ||
2908 + vmx_rtit_ctl_check(vcpu, data) ||
2909 + vmx->nested.vmxon)
2910 + return 1;
2911 + vmcs_write64(GUEST_IA32_RTIT_CTL, data);
2912 + vmx->pt_desc.guest.ctl = data;
2913 + pt_update_intercept_for_msr(vmx);
2914 + break;
2915 + case MSR_IA32_RTIT_STATUS:
2916 + if ((pt_mode != PT_MODE_HOST_GUEST) ||
2917 + (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
2918 + (data & MSR_IA32_RTIT_STATUS_MASK))
2919 + return 1;
2920 + vmx->pt_desc.guest.status = data;
2921 + break;
2922 + case MSR_IA32_RTIT_CR3_MATCH:
2923 + if ((pt_mode != PT_MODE_HOST_GUEST) ||
2924 + (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
2925 + !intel_pt_validate_cap(vmx->pt_desc.caps,
2926 + PT_CAP_cr3_filtering))
2927 + return 1;
2928 + vmx->pt_desc.guest.cr3_match = data;
2929 + break;
2930 + case MSR_IA32_RTIT_OUTPUT_BASE:
2931 + if ((pt_mode != PT_MODE_HOST_GUEST) ||
2932 + (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
2933 + (!intel_pt_validate_cap(vmx->pt_desc.caps,
2934 + PT_CAP_topa_output) &&
2935 + !intel_pt_validate_cap(vmx->pt_desc.caps,
2936 + PT_CAP_single_range_output)) ||
2937 + (data & MSR_IA32_RTIT_OUTPUT_BASE_MASK))
2938 + return 1;
2939 + vmx->pt_desc.guest.output_base = data;
2940 + break;
2941 + case MSR_IA32_RTIT_OUTPUT_MASK:
2942 + if ((pt_mode != PT_MODE_HOST_GUEST) ||
2943 + (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
2944 + (!intel_pt_validate_cap(vmx->pt_desc.caps,
2945 + PT_CAP_topa_output) &&
2946 + !intel_pt_validate_cap(vmx->pt_desc.caps,
2947 + PT_CAP_single_range_output)))
2948 + return 1;
2949 + vmx->pt_desc.guest.output_mask = data;
2950 + break;
2951 + case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
2952 + index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
2953 + if ((pt_mode != PT_MODE_HOST_GUEST) ||
2954 + (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
2955 + (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
2956 + PT_CAP_num_address_ranges)))
2957 + return 1;
2958 + if (is_noncanonical_address(data, vcpu))
2959 + return 1;
2960 + if (index % 2)
2961 + vmx->pt_desc.guest.addr_b[index / 2] = data;
2962 + else
2963 + vmx->pt_desc.guest.addr_a[index / 2] = data;
2964 + break;
2965 + case MSR_TSC_AUX:
2966 + if (!msr_info->host_initiated &&
2967 + !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
2968 + return 1;
2969 +		/* Check reserved bits: the upper 32 bits [63:32] must be zero */
2970 + if ((data >> 32) != 0)
2971 + return 1;
2972 + goto find_shared_msr;
2973 +
2974 + default:
2975 + find_shared_msr:
2976 + msr = find_msr_entry(vmx, msr_index);
2977 + if (msr)
2978 + ret = vmx_set_guest_msr(vmx, msr, data);
2979 + else
2980 + ret = kvm_set_msr_common(vcpu, msr_info);
2981 + }
2982 +
2983 + return ret;
2984 +}
2985 +
2986 +static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
2987 +{
2988 + kvm_register_mark_available(vcpu, reg);
2989 +
2990 + switch (reg) {
2991 + case VCPU_REGS_RSP:
2992 + vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
2993 + break;
2994 + case VCPU_REGS_RIP:
2995 + vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
2996 + break;
2997 + case VCPU_EXREG_PDPTR:
2998 + if (enable_ept)
2999 + ept_save_pdptrs(vcpu);
3000 + break;
3001 + case VCPU_EXREG_CR3:
3002 + if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu)))
3003 + vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
3004 + break;
3005 + default:
3006 + WARN_ON_ONCE(1);
3007 + break;
3008 + }
3009 +}
3010 +
3011 +static __init int cpu_has_kvm_support(void)
3012 +{
3013 + return cpu_has_vmx();
3014 +}
3015 +
3016 +static __init int vmx_disabled_by_bios(void)
3017 +{
3018 + u64 msr;
3019 +
3020 + rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
3021 + if (msr & FEATURE_CONTROL_LOCKED) {
3022 + /* launched w/ TXT and VMX disabled */
3023 + if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
3024 + && tboot_enabled())
3025 + return 1;
3026 + /* launched w/o TXT and VMX only enabled w/ TXT */
3027 + if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
3028 + && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
3029 + && !tboot_enabled()) {
3030 + printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
3031 + "activate TXT before enabling KVM\n");
3032 + return 1;
3033 + }
3034 + /* launched w/o TXT and VMX disabled */
3035 + if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
3036 + && !tboot_enabled())
3037 + return 1;
3038 + }
3039 +
3040 + return 0;
3041 +}
3042 +
3043 +static void kvm_cpu_vmxon(u64 addr)
3044 +{
3045 + cr4_set_bits(X86_CR4_VMXE);
3046 + intel_pt_handle_vmx(1);
3047 +
3048 + asm volatile ("vmxon %0" : : "m"(addr));
3049 +}
3050 +
3051 +static int hardware_enable(void)
3052 +{
3053 + int cpu = raw_smp_processor_id();
3054 + u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
3055 + u64 old, test_bits;
3056 +
3057 + if (cr4_read_shadow() & X86_CR4_VMXE)
3058 + return -EBUSY;
3059 +
3060 + /*
3061 + * This can happen if we hot-added a CPU but failed to allocate
3062 + * VP assist page for it.
3063 + */
3064 + if (static_branch_unlikely(&enable_evmcs) &&
3065 + !hv_get_vp_assist_page(cpu))
3066 + return -EFAULT;
3067 +
3068 + INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
3069 + INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
3070 + spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
3071 +
3072 + /*
3073 + * Now we can enable the vmclear operation in kdump
3074 + * since the loaded_vmcss_on_cpu list on this cpu
3075 + * has been initialized.
3076 + *
3077 +	 * Even though the cpu is not in VMX operation yet, it is
3078 +	 * safe to enable the vmclear operation because the
3079 +	 * loaded_vmcss_on_cpu list is still empty.
3080 + */
3081 + crash_enable_local_vmclear(cpu);
3082 +
3083 + rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
3084 +
3085 + test_bits = FEATURE_CONTROL_LOCKED;
3086 + test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
3087 + if (tboot_enabled())
3088 + test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
3089 +
3090 + if ((old & test_bits) != test_bits) {
3091 + /* enable and lock */
3092 + wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
3093 + }
3094 + kvm_cpu_vmxon(phys_addr);
3095 + if (enable_ept)
3096 + ept_sync_global();
3097 +
3098 + return 0;
3099 +}
3100 +
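hardware_enable() only writes IA32_FEATURE_CONTROL when the firmware left it unlocked with the needed VMXON-enable bits clear; once the lock bit is set the MSR cannot be changed until reset, which is why vmx_disabled_by_bios() rejects the locked-but-disabled combinations beforehand. A small sketch of that decision; the bit positions are the architectural ones, the function name is made up.

#include <stdint.h>
#include <stdio.h>

/* Architectural IA32_FEATURE_CONTROL bits. */
#define FC_LOCKED            (1ULL << 0)
#define FC_VMXON_INSIDE_SMX  (1ULL << 1)
#define FC_VMXON_OUTSIDE_SMX (1ULL << 2)

/* Mirrors the "enable and lock" decision in hardware_enable(): returns the
 * value that would be written back, or the old value if nothing is missing
 * (the locked-but-disabled case was already rejected earlier). */
static uint64_t maybe_enable_and_lock(uint64_t old, int tboot)
{
	uint64_t test_bits = FC_LOCKED | FC_VMXON_OUTSIDE_SMX;

	if (tboot)
		test_bits |= FC_VMXON_INSIDE_SMX;

	if ((old & test_bits) != test_bits)
		return old | test_bits;  /* set the missing bits and lock */
	return old;                      /* already enabled and locked */
}

int main(void)
{
	printf("%#llx\n", (unsigned long long)maybe_enable_and_lock(0, 0));    /* 0x5 */
	printf("%#llx\n", (unsigned long long)maybe_enable_and_lock(0x5, 0));  /* 0x5 */
	return 0;
}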
3101 +static void vmclear_local_loaded_vmcss(void)
3102 +{
3103 + int cpu = raw_smp_processor_id();
3104 + struct loaded_vmcs *v, *n;
3105 +
3106 + list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
3107 + loaded_vmcss_on_cpu_link)
3108 + __loaded_vmcs_clear(v);
3109 +}
3110 +
3111 +
3112 +/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
3113 + * tricks.
3114 + */
3115 +static void kvm_cpu_vmxoff(void)
3116 +{
3117 + asm volatile (__ex("vmxoff"));
3118 +
3119 + intel_pt_handle_vmx(0);
3120 + cr4_clear_bits(X86_CR4_VMXE);
3121 +}
3122 +
3123 +static void hardware_disable(void)
3124 +{
3125 + vmclear_local_loaded_vmcss();
3126 + kvm_cpu_vmxoff();
3127 +}
3128 +
3129 +static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
3130 + u32 msr, u32 *result)
3131 +{
3132 + u32 vmx_msr_low, vmx_msr_high;
3133 + u32 ctl = ctl_min | ctl_opt;
3134 +
3135 + rdmsr(msr, vmx_msr_low, vmx_msr_high);
3136 +
3137 + ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
3138 + ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */
3139 +
3140 + /* Ensure minimum (required) set of control bits are supported. */
3141 + if (ctl_min & ~ctl)
3142 + return -EIO;
3143 +
3144 + *result = ctl;
3145 + return 0;
3146 +}
3147 +
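adjust_vmx_controls() applies the allowed-settings rule for VMX capability MSRs: the low 32 bits are the allowed 0-settings (a 1 there forces the control to 1) and the high 32 bits are the allowed 1-settings (a 0 there forces it to 0), so the desired value is ANDed with the high word, ORed with the low word, and rejected if a required bit was lost. A self-contained sketch with invented capability words:

#include <stdint.h>
#include <stdio.h>

/* Same fix-up adjust_vmx_controls() applies, minus the RDMSR. */
static int adjust_controls(uint32_t min, uint32_t opt,
			   uint32_t allowed0, uint32_t allowed1,
			   uint32_t *result)
{
	uint32_t ctl = min | opt;

	ctl &= allowed1;  /* bit == 0 in the high word: must be zero */
	ctl |= allowed0;  /* bit == 1 in the low word:  must be one  */

	if (min & ~ctl)   /* a required control is not supported */
		return -1;

	*result = ctl;
	return 0;
}

int main(void)
{
	/* Invented capability words: bit 0 is fixed to 1, bit 3 unsupported. */
	uint32_t allowed0 = 0x1, allowed1 = 0x7;
	uint32_t ctl;

	if (adjust_controls(0x2 /* min */, 0x8 /* opt */, allowed0, allowed1, &ctl) == 0)
		printf("controls = %#x\n", ctl);  /* 0x3: optional bit 3 dropped */
	return 0;
}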
3148 +static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
3149 + struct vmx_capability *vmx_cap)
3150 +{
3151 + u32 vmx_msr_low, vmx_msr_high;
3152 + u32 min, opt, min2, opt2;
3153 + u32 _pin_based_exec_control = 0;
3154 + u32 _cpu_based_exec_control = 0;
3155 + u32 _cpu_based_2nd_exec_control = 0;
3156 + u32 _vmexit_control = 0;
3157 + u32 _vmentry_control = 0;
3158 +
3159 + memset(vmcs_conf, 0, sizeof(*vmcs_conf));
3160 + min = CPU_BASED_HLT_EXITING |
3161 +#ifdef CONFIG_X86_64
3162 + CPU_BASED_CR8_LOAD_EXITING |
3163 + CPU_BASED_CR8_STORE_EXITING |
3164 +#endif
3165 + CPU_BASED_CR3_LOAD_EXITING |
3166 + CPU_BASED_CR3_STORE_EXITING |
3167 + CPU_BASED_UNCOND_IO_EXITING |
3168 + CPU_BASED_MOV_DR_EXITING |
3169 + CPU_BASED_USE_TSC_OFFSETTING |
3170 + CPU_BASED_MWAIT_EXITING |
3171 + CPU_BASED_MONITOR_EXITING |
3172 + CPU_BASED_INVLPG_EXITING |
3173 + CPU_BASED_RDPMC_EXITING;
3174 +
3175 + opt = CPU_BASED_TPR_SHADOW |
3176 + CPU_BASED_USE_MSR_BITMAPS |
3177 + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
3178 + if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
3179 + &_cpu_based_exec_control) < 0)
3180 + return -EIO;
3181 +#ifdef CONFIG_X86_64
3182 + if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
3183 + _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
3184 + ~CPU_BASED_CR8_STORE_EXITING;
3185 +#endif
3186 + if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
3187 + min2 = 0;
3188 + opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
3189 + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
3190 + SECONDARY_EXEC_WBINVD_EXITING |
3191 + SECONDARY_EXEC_ENABLE_VPID |
3192 + SECONDARY_EXEC_ENABLE_EPT |
3193 + SECONDARY_EXEC_UNRESTRICTED_GUEST |
3194 + SECONDARY_EXEC_PAUSE_LOOP_EXITING |
3195 + SECONDARY_EXEC_DESC |
3196 + SECONDARY_EXEC_RDTSCP |
3197 + SECONDARY_EXEC_ENABLE_INVPCID |
3198 + SECONDARY_EXEC_APIC_REGISTER_VIRT |
3199 + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
3200 + SECONDARY_EXEC_SHADOW_VMCS |
3201 + SECONDARY_EXEC_XSAVES |
3202 + SECONDARY_EXEC_RDSEED_EXITING |
3203 + SECONDARY_EXEC_RDRAND_EXITING |
3204 + SECONDARY_EXEC_ENABLE_PML |
3205 + SECONDARY_EXEC_TSC_SCALING |
3206 + SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
3207 + SECONDARY_EXEC_PT_USE_GPA |
3208 + SECONDARY_EXEC_PT_CONCEAL_VMX |
3209 + SECONDARY_EXEC_ENABLE_VMFUNC |
3210 + SECONDARY_EXEC_ENCLS_EXITING;
3211 + if (adjust_vmx_controls(min2, opt2,
3212 + MSR_IA32_VMX_PROCBASED_CTLS2,
3213 + &_cpu_based_2nd_exec_control) < 0)
3214 + return -EIO;
3215 + }
3216 +#ifndef CONFIG_X86_64
3217 + if (!(_cpu_based_2nd_exec_control &
3218 + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
3219 + _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
3220 +#endif
3221 +
3222 + if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
3223 + _cpu_based_2nd_exec_control &= ~(
3224 + SECONDARY_EXEC_APIC_REGISTER_VIRT |
3225 + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
3226 + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
3227 +
3228 + rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
3229 + &vmx_cap->ept, &vmx_cap->vpid);
3230 +
3231 + if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
3232 +		/* CR3 accesses and invlpg don't need to cause VM Exits when EPT
3233 +		 * is enabled */
3234 + _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
3235 + CPU_BASED_CR3_STORE_EXITING |
3236 + CPU_BASED_INVLPG_EXITING);
3237 + } else if (vmx_cap->ept) {
3238 + vmx_cap->ept = 0;
3239 + pr_warn_once("EPT CAP should not exist if not support "
3240 + "1-setting enable EPT VM-execution control\n");
3241 + }
3242 + if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
3243 + vmx_cap->vpid) {
3244 + vmx_cap->vpid = 0;
3245 + pr_warn_once("VPID CAP should not exist if not support "
3246 + "1-setting enable VPID VM-execution control\n");
3247 + }
3248 +
3249 + min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
3250 +#ifdef CONFIG_X86_64
3251 + min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
3252 +#endif
3253 + opt = VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL |
3254 + VM_EXIT_LOAD_IA32_PAT |
3255 + VM_EXIT_LOAD_IA32_EFER |
3256 + VM_EXIT_CLEAR_BNDCFGS |
3257 + VM_EXIT_PT_CONCEAL_PIP |
3258 + VM_EXIT_CLEAR_IA32_RTIT_CTL;
3259 + if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
3260 + &_vmexit_control) < 0)
3261 + return -EIO;
3262 +
3263 + min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
3264 + opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
3265 + PIN_BASED_VMX_PREEMPTION_TIMER;
3266 + if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
3267 + &_pin_based_exec_control) < 0)
3268 + return -EIO;
3269 +
3270 + if (cpu_has_broken_vmx_preemption_timer())
3271 + _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
3272 + if (!(_cpu_based_2nd_exec_control &
3273 + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
3274 + _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
3275 +
3276 + min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
3277 + opt = VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL |
3278 + VM_ENTRY_LOAD_IA32_PAT |
3279 + VM_ENTRY_LOAD_IA32_EFER |
3280 + VM_ENTRY_LOAD_BNDCFGS |
3281 + VM_ENTRY_PT_CONCEAL_PIP |
3282 + VM_ENTRY_LOAD_IA32_RTIT_CTL;
3283 + if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
3284 + &_vmentry_control) < 0)
3285 + return -EIO;
3286 +
3287 + /*
3288 + * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they
3289 + * can't be used due to an errata where VM Exit may incorrectly clear
3290 + * IA32_PERF_GLOBAL_CTRL[34:32]. Workaround the errata by using the
3291 + * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL.
3292 + */
3293 + if (boot_cpu_data.x86 == 0x6) {
3294 + switch (boot_cpu_data.x86_model) {
3295 + case 26: /* AAK155 */
3296 + case 30: /* AAP115 */
3297 + case 37: /* AAT100 */
3298 + case 44: /* BC86,AAY89,BD102 */
3299 + case 46: /* BA97 */
3300 + _vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
3301 + _vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
3302 + pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
3303 + "does not work properly. Using workaround\n");
3304 + break;
3305 + default:
3306 + break;
3307 + }
3308 + }
3309 +
3310 +
3311 + rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
3312 +
3313 + /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
3314 + if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
3315 + return -EIO;
3316 +
3317 +#ifdef CONFIG_X86_64
3318 + /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
3319 + if (vmx_msr_high & (1u<<16))
3320 + return -EIO;
3321 +#endif
3322 +
3323 + /* Require Write-Back (WB) memory type for VMCS accesses. */
3324 + if (((vmx_msr_high >> 18) & 15) != 6)
3325 + return -EIO;
3326 +
3327 + vmcs_conf->size = vmx_msr_high & 0x1fff;
3328 + vmcs_conf->order = get_order(vmcs_conf->size);
3329 + vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
3330 +
3331 + vmcs_conf->revision_id = vmx_msr_low;
3332 +
3333 + vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
3334 + vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
3335 + vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
3336 + vmcs_conf->vmexit_ctrl = _vmexit_control;
3337 + vmcs_conf->vmentry_ctrl = _vmentry_control;
3338 +
3339 + if (static_branch_unlikely(&enable_evmcs))
3340 + evmcs_sanitize_exec_ctrls(vmcs_conf);
3341 +
3342 + return 0;
3343 +}
3344 +
3345 +struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
3346 +{
3347 + int node = cpu_to_node(cpu);
3348 + struct page *pages;
3349 + struct vmcs *vmcs;
3350 +
3351 + pages = __alloc_pages_node(node, flags, vmcs_config.order);
3352 + if (!pages)
3353 + return NULL;
3354 + vmcs = page_address(pages);
3355 + memset(vmcs, 0, vmcs_config.size);
3356 +
3357 + /* KVM supports Enlightened VMCS v1 only */
3358 + if (static_branch_unlikely(&enable_evmcs))
3359 + vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
3360 + else
3361 + vmcs->hdr.revision_id = vmcs_config.revision_id;
3362 +
3363 + if (shadow)
3364 + vmcs->hdr.shadow_vmcs = 1;
3365 + return vmcs;
3366 +}
3367 +
3368 +void free_vmcs(struct vmcs *vmcs)
3369 +{
3370 + free_pages((unsigned long)vmcs, vmcs_config.order);
3371 +}
3372 +
3373 +/*
3374 + * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
3375 + */
3376 +void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
3377 +{
3378 + if (!loaded_vmcs->vmcs)
3379 + return;
3380 + loaded_vmcs_clear(loaded_vmcs);
3381 + free_vmcs(loaded_vmcs->vmcs);
3382 + loaded_vmcs->vmcs = NULL;
3383 + if (loaded_vmcs->msr_bitmap)
3384 + free_page((unsigned long)loaded_vmcs->msr_bitmap);
3385 + WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
3386 +}
3387 +
3388 +int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
3389 +{
3390 + loaded_vmcs->vmcs = alloc_vmcs(false);
3391 + if (!loaded_vmcs->vmcs)
3392 + return -ENOMEM;
3393 +
3394 + loaded_vmcs->shadow_vmcs = NULL;
3395 + loaded_vmcs->hv_timer_soft_disabled = false;
3396 + loaded_vmcs_init(loaded_vmcs);
3397 +
3398 + if (cpu_has_vmx_msr_bitmap()) {
3399 + loaded_vmcs->msr_bitmap = (unsigned long *)
3400 + __get_free_page(GFP_KERNEL_ACCOUNT);
3401 + if (!loaded_vmcs->msr_bitmap)
3402 + goto out_vmcs;
3403 + memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
3404 +
3405 + if (IS_ENABLED(CONFIG_HYPERV) &&
3406 + static_branch_unlikely(&enable_evmcs) &&
3407 + (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
3408 + struct hv_enlightened_vmcs *evmcs =
3409 + (struct hv_enlightened_vmcs *)loaded_vmcs->vmcs;
3410 +
3411 + evmcs->hv_enlightenments_control.msr_bitmap = 1;
3412 + }
3413 + }
3414 +
3415 + memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
3416 + memset(&loaded_vmcs->controls_shadow, 0,
3417 + sizeof(struct vmcs_controls_shadow));
3418 +
3419 + return 0;
3420 +
3421 +out_vmcs:
3422 + free_loaded_vmcs(loaded_vmcs);
3423 + return -ENOMEM;
3424 +}
3425 +
3426 +static void free_kvm_area(void)
3427 +{
3428 + int cpu;
3429 +
3430 + for_each_possible_cpu(cpu) {
3431 + free_vmcs(per_cpu(vmxarea, cpu));
3432 + per_cpu(vmxarea, cpu) = NULL;
3433 + }
3434 +}
3435 +
3436 +static __init int alloc_kvm_area(void)
3437 +{
3438 + int cpu;
3439 +
3440 + for_each_possible_cpu(cpu) {
3441 + struct vmcs *vmcs;
3442 +
3443 + vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL);
3444 + if (!vmcs) {
3445 + free_kvm_area();
3446 + return -ENOMEM;
3447 + }
3448 +
3449 + /*
3450 + * When eVMCS is enabled, alloc_vmcs_cpu() sets
3451 + * vmcs->revision_id to KVM_EVMCS_VERSION instead of
3452 + * revision_id reported by MSR_IA32_VMX_BASIC.
3453 + *
3454 +		 * However, even though not explicitly documented by
3455 +		 * the TLFS, the VMXArea passed as the VMXON argument
3456 +		 * should still be marked with the revision_id reported
3457 +		 * by the physical CPU.
3458 + */
3459 + if (static_branch_unlikely(&enable_evmcs))
3460 + vmcs->hdr.revision_id = vmcs_config.revision_id;
3461 +
3462 + per_cpu(vmxarea, cpu) = vmcs;
3463 + }
3464 + return 0;
3465 +}
3466 +
3467 +static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
3468 + struct kvm_segment *save)
3469 +{
3470 + if (!emulate_invalid_guest_state) {
3471 + /*
3472 + * CS and SS RPL should be equal during guest entry according
3473 + * to VMX spec, but in reality it is not always so. Since vcpu
3474 + * is in the middle of the transition from real mode to
3475 + * protected mode it is safe to assume that RPL 0 is a good
3476 + * default value.
3477 + */
3478 + if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
3479 + save->selector &= ~SEGMENT_RPL_MASK;
3480 + save->dpl = save->selector & SEGMENT_RPL_MASK;
3481 + save->s = 1;
3482 + }
3483 + vmx_set_segment(vcpu, save, seg);
3484 +}
3485 +
3486 +static void enter_pmode(struct kvm_vcpu *vcpu)
3487 +{
3488 + unsigned long flags;
3489 + struct vcpu_vmx *vmx = to_vmx(vcpu);
3490 +
3491 + /*
3492 +	 * Update the real-mode segment cache. It may not be up-to-date if a
3493 +	 * segment register was written while the vcpu was in guest mode.
3494 + */
3495 + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
3496 + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
3497 + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
3498 + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
3499 + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
3500 + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
3501 +
3502 + vmx->rmode.vm86_active = 0;
3503 +
3504 + vmx_segment_cache_clear(vmx);
3505 +
3506 + vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
3507 +
3508 + flags = vmcs_readl(GUEST_RFLAGS);
3509 + flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
3510 + flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
3511 + vmcs_writel(GUEST_RFLAGS, flags);
3512 +
3513 + vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
3514 + (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
3515 +
3516 + update_exception_bitmap(vcpu);
3517 +
3518 + fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
3519 + fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
3520 + fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
3521 + fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
3522 + fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
3523 + fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
3524 +}
3525 +
3526 +static void fix_rmode_seg(int seg, struct kvm_segment *save)
3527 +{
3528 + const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3529 + struct kvm_segment var = *save;
3530 +
3531 + var.dpl = 0x3;
3532 + if (seg == VCPU_SREG_CS)
3533 + var.type = 0x3;
3534 +
3535 + if (!emulate_invalid_guest_state) {
3536 + var.selector = var.base >> 4;
3537 + var.base = var.base & 0xffff0;
3538 + var.limit = 0xffff;
3539 + var.g = 0;
3540 + var.db = 0;
3541 + var.present = 1;
3542 + var.s = 1;
3543 + var.l = 0;
3544 + var.unusable = 0;
3545 + var.type = 0x3;
3546 + var.avl = 0;
3547 + if (save->base & 0xf)
3548 + printk_once(KERN_WARNING "kvm: segment base is not "
3549 + "paragraph aligned when entering "
3550 + "protected mode (seg=%d)", seg);
3551 + }
3552 +
3553 + vmcs_write16(sf->selector, var.selector);
3554 + vmcs_writel(sf->base, var.base);
3555 + vmcs_write32(sf->limit, var.limit);
3556 + vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
3557 +}
3558 +
3559 +static void enter_rmode(struct kvm_vcpu *vcpu)
3560 +{
3561 + unsigned long flags;
3562 + struct vcpu_vmx *vmx = to_vmx(vcpu);
3563 + struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
3564 +
3565 + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
3566 + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
3567 + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
3568 + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
3569 + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
3570 + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
3571 + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
3572 +
3573 + vmx->rmode.vm86_active = 1;
3574 +
3575 + /*
3576 + * Very old userspace does not call KVM_SET_TSS_ADDR before entering
3577 + * vcpu. Warn the user that an update is overdue.
3578 + */
3579 + if (!kvm_vmx->tss_addr)
3580 + printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be "
3581 + "called before entering vcpu\n");
3582 +
3583 + vmx_segment_cache_clear(vmx);
3584 +
3585 + vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr);
3586 + vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
3587 + vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
3588 +
3589 + flags = vmcs_readl(GUEST_RFLAGS);
3590 + vmx->rmode.save_rflags = flags;
3591 +
3592 + flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
3593 +
3594 + vmcs_writel(GUEST_RFLAGS, flags);
3595 + vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
3596 + update_exception_bitmap(vcpu);
3597 +
3598 + fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
3599 + fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
3600 + fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
3601 + fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
3602 + fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
3603 + fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
3604 +
3605 + kvm_mmu_reset_context(vcpu);
3606 +}
3607 +
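+/*
+ * Propagate a new EFER value: keep VM_ENTRY_IA32E_MODE in sync with
+ * EFER.LMA and strip EFER.LME from the shared MSR image while the guest
+ * is not in long mode.
+ */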
3608 +void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
3609 +{
3610 + struct vcpu_vmx *vmx = to_vmx(vcpu);
3611 + struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
3612 +
3613 + if (!msr)
3614 + return;
3615 +
3616 + vcpu->arch.efer = efer;
3617 + if (efer & EFER_LMA) {
3618 + vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
3619 + msr->data = efer;
3620 + } else {
3621 + vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
3622 +
3623 + msr->data = efer & ~EFER_LME;
3624 + }
3625 + setup_msrs(vmx);
3626 +}
3627 +
3628 +#ifdef CONFIG_X86_64
3629 +
3630 +static void enter_lmode(struct kvm_vcpu *vcpu)
3631 +{
3632 + u32 guest_tr_ar;
3633 +
3634 + vmx_segment_cache_clear(to_vmx(vcpu));
3635 +
3636 + guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
3637 + if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) {
3638 + pr_debug_ratelimited("%s: tss fixup for long mode. \n",
3639 + __func__);
3640 + vmcs_write32(GUEST_TR_AR_BYTES,
3641 + (guest_tr_ar & ~VMX_AR_TYPE_MASK)
3642 + | VMX_AR_TYPE_BUSY_64_TSS);
3643 + }
3644 + vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
3645 +}
3646 +
3647 +static void exit_lmode(struct kvm_vcpu *vcpu)
3648 +{
3649 + vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
3650 + vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
3651 +}
3652 +
3653 +#endif
3654 +
3655 +static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
3656 +{
3657 + int vpid = to_vmx(vcpu)->vpid;
3658 +
3659 + if (!vpid_sync_vcpu_addr(vpid, addr))
3660 + vpid_sync_context(vpid);
3661 +
3662 + /*
3663 + * If VPIDs are not supported or enabled, then the above is a no-op.
3664 + * But we don't really need a TLB flush in that case anyway, because
3665 + * each VM entry/exit includes an implicit flush when VPID is 0.
3666 + */
3667 +}
3668 +
3669 +static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
3670 +{
3671 + ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
3672 +
3673 + vcpu->arch.cr0 &= ~cr0_guest_owned_bits;
3674 + vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
3675 +}
3676 +
3677 +static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
3678 +{
3679 + ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
3680 +
3681 + vcpu->arch.cr4 &= ~cr4_guest_owned_bits;
3682 + vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits;
3683 +}
3684 +
3685 +static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
3686 +{
3687 + struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
3688 +
3689 + if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR))
3690 + return;
3691 +
3692 + if (is_pae_paging(vcpu)) {
3693 + vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
3694 + vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
3695 + vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
3696 + vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);
3697 + }
3698 +}
3699 +
3700 +void ept_save_pdptrs(struct kvm_vcpu *vcpu)
3701 +{
3702 + struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
3703 +
3704 + if (is_pae_paging(vcpu)) {
3705 + mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
3706 + mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
3707 + mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
3708 + mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
3709 + }
3710 +
3711 + kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
3712 +}
3713 +
3714 +static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
3715 + unsigned long cr0,
3716 + struct kvm_vcpu *vcpu)
3717 +{
3718 + struct vcpu_vmx *vmx = to_vmx(vcpu);
3719 +
3720 + if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
3721 + vmx_cache_reg(vcpu, VCPU_EXREG_CR3);
3722 + if (!(cr0 & X86_CR0_PG)) {
3723 + /* From paging/starting to nonpaging */
3724 + exec_controls_setbit(vmx, CPU_BASED_CR3_LOAD_EXITING |
3725 + CPU_BASED_CR3_STORE_EXITING);
3726 + vcpu->arch.cr0 = cr0;
3727 + vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
3728 + } else if (!is_paging(vcpu)) {
3729 + /* From nonpaging to paging */
3730 + exec_controls_clearbit(vmx, CPU_BASED_CR3_LOAD_EXITING |
3731 + CPU_BASED_CR3_STORE_EXITING);
3732 + vcpu->arch.cr0 = cr0;
3733 + vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
3734 + }
3735 +
3736 + if (!(cr0 & X86_CR0_WP))
3737 + *hw_cr0 &= ~X86_CR0_WP;
3738 +}
3739 +
3740 +void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
3741 +{
3742 + struct vcpu_vmx *vmx = to_vmx(vcpu);
3743 + unsigned long hw_cr0;
3744 +
3745 + hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
3746 + if (enable_unrestricted_guest)
3747 + hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
3748 + else {
3749 + hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
3750 +
3751 + if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
3752 + enter_pmode(vcpu);
3753 +
3754 + if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
3755 + enter_rmode(vcpu);
3756 + }
3757 +
3758 +#ifdef CONFIG_X86_64
3759 + if (vcpu->arch.efer & EFER_LME) {
3760 + if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
3761 + enter_lmode(vcpu);
3762 + if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
3763 + exit_lmode(vcpu);
3764 + }
3765 +#endif
3766 +
3767 + if (enable_ept && !enable_unrestricted_guest)
3768 + ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
3769 +
3770 + vmcs_writel(CR0_READ_SHADOW, cr0);
3771 + vmcs_writel(GUEST_CR0, hw_cr0);
3772 + vcpu->arch.cr0 = cr0;
3773 +
3774 + /* depends on vcpu->arch.cr0 to be set to a new value */
3775 + vmx->emulation_required = emulation_required(vcpu);
3776 +}
3777 +
3778 +static int get_ept_level(struct kvm_vcpu *vcpu)
3779 +{
3780 + if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
3781 + return 5;
3782 + return 4;
3783 +}
3784 +
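+/*
+ * Build the EPT pointer: write-back memory type, a 4- or 5-level page-walk
+ * length, the accessed/dirty-bit enable when available, and the root HPA.
+ */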
3785 +u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
3786 +{
3787 + u64 eptp = VMX_EPTP_MT_WB;
3788 +
3789 + eptp |= (get_ept_level(vcpu) == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
3790 +
3791 + if (enable_ept_ad_bits &&
3792 + (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
3793 + eptp |= VMX_EPTP_AD_ENABLE_BIT;
3794 + eptp |= (root_hpa & PAGE_MASK);
3795 +
3796 + return eptp;
3797 +}
3798 +
3799 +void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
3800 +{
3801 + struct kvm *kvm = vcpu->kvm;
3802 + bool update_guest_cr3 = true;
3803 + unsigned long guest_cr3;
3804 + u64 eptp;
3805 +
3806 + guest_cr3 = cr3;
3807 + if (enable_ept) {
3808 + eptp = construct_eptp(vcpu, cr3);
3809 + vmcs_write64(EPT_POINTER, eptp);
3810 +
3811 + if (kvm_x86_ops->tlb_remote_flush) {
3812 + spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
3813 + to_vmx(vcpu)->ept_pointer = eptp;
3814 + to_kvm_vmx(kvm)->ept_pointers_match
3815 + = EPT_POINTERS_CHECK;
3816 + spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
3817 + }
3818 +
3819 + /* Loading vmcs02.GUEST_CR3 is handled by nested VM-Enter. */
3820 + if (is_guest_mode(vcpu))
3821 + update_guest_cr3 = false;
3822 + else if (!enable_unrestricted_guest && !is_paging(vcpu))
3823 + guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
3824 + else if (test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
3825 + guest_cr3 = vcpu->arch.cr3;
3826 + else /* vmcs01.GUEST_CR3 is already up-to-date. */
3827 + update_guest_cr3 = false;
3828 + ept_load_pdptrs(vcpu);
3829 + }
3830 +
3831 + if (update_guest_cr3)
3832 + vmcs_writel(GUEST_CR3, guest_cr3);
3833 +}
3834 +
3835 +int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
3836 +{
3837 + struct vcpu_vmx *vmx = to_vmx(vcpu);
3838 + /*
3839 + * Pass through host's Machine Check Enable value to hw_cr4, which
3840 + * is in force while we are in guest mode. Do not let guests control
3841 + * this bit, even if host CR4.MCE == 0.
3842 + */
3843 + unsigned long hw_cr4;
3844 +
3845 + hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
3846 + if (enable_unrestricted_guest)
3847 + hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
3848 + else if (vmx->rmode.vm86_active)
3849 + hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
3850 + else
3851 + hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;
3852 +
3853 + if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) {
3854 + if (cr4 & X86_CR4_UMIP) {
3855 + secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC);
3856 + hw_cr4 &= ~X86_CR4_UMIP;
3857 + } else if (!is_guest_mode(vcpu) ||
3858 + !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) {
3859 + secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC);
3860 + }
3861 + }
3862 +
3863 + if (cr4 & X86_CR4_VMXE) {
3864 + /*
3865 + * To use VMXON (and later other VMX instructions), a guest
3866 + * must first be able to turn on cr4.VMXE (see handle_vmon()).
3867 + * So basically the check on whether to allow nested VMX
3868 + * is here. We operate under the default treatment of SMM,
3869 + * so VMX cannot be enabled under SMM.
3870 + */
3871 + if (!nested_vmx_allowed(vcpu) || is_smm(vcpu))
3872 + return 1;
3873 + }
3874 +
3875 + if (vmx->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
3876 + return 1;
3877 +
3878 + vcpu->arch.cr4 = cr4;
3879 +
3880 + if (!enable_unrestricted_guest) {
3881 + if (enable_ept) {
3882 + if (!is_paging(vcpu)) {
3883 + hw_cr4 &= ~X86_CR4_PAE;
3884 + hw_cr4 |= X86_CR4_PSE;
3885 + } else if (!(cr4 & X86_CR4_PAE)) {
3886 + hw_cr4 &= ~X86_CR4_PAE;
3887 + }
3888 + }
3889 +
3890 + /*
3891 + * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
3892 + * hardware. To emulate this behavior, SMEP/SMAP/PKU needs
3893 + * to be manually disabled when guest switches to non-paging
3894 + * mode.
3895 + *
3896 + * If !enable_unrestricted_guest, the CPU is always running
3897 + * with CR0.PG=1 and CR4 needs to be modified.
3898 + * If enable_unrestricted_guest, the CPU automatically
3899 + * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
3900 + */
3901 + if (!is_paging(vcpu))
3902 + hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
3903 + }
3904 +
3905 + vmcs_writel(CR4_READ_SHADOW, cr4);
3906 + vmcs_writel(GUEST_CR4, hw_cr4);
3907 + return 0;
3908 +}
3909 +
3910 +void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
3911 +{
3912 + struct vcpu_vmx *vmx = to_vmx(vcpu);
3913 + u32 ar;
3914 +
3915 + if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3916 + *var = vmx->rmode.segs[seg];
3917 + if (seg == VCPU_SREG_TR
3918 + || var->selector == vmx_read_guest_seg_selector(vmx, seg))
3919 + return;
3920 + var->base = vmx_read_guest_seg_base(vmx, seg);
3921 + var->selector = vmx_read_guest_seg_selector(vmx, seg);
3922 + return;
3923 + }
3924 + var->base = vmx_read_guest_seg_base(vmx, seg);
3925 + var->limit = vmx_read_guest_seg_limit(vmx, seg);
3926 + var->selector = vmx_read_guest_seg_selector(vmx, seg);
3927 + ar = vmx_read_guest_seg_ar(vmx, seg);
3928 + var->unusable = (ar >> 16) & 1;
3929 + var->type = ar & 15;
3930 + var->s = (ar >> 4) & 1;
3931 + var->dpl = (ar >> 5) & 3;
3932 + /*
3933 +	 * Some userspaces do not preserve the unusable property. Since a
3934 +	 * usable segment has to be present according to the VMX spec, we can
3935 +	 * use the present property to work around the userspace bug by making
3936 +	 * an unusable segment always non-present. vmx_segment_access_rights()
3937 +	 * already marks a non-present segment as unusable.
3938 + */
3939 + var->present = !var->unusable;
3940 + var->avl = (ar >> 12) & 1;
3941 + var->l = (ar >> 13) & 1;
3942 + var->db = (ar >> 14) & 1;
3943 + var->g = (ar >> 15) & 1;
3944 +}
3945 +
3946 +static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
3947 +{
3948 + struct kvm_segment s;
3949 +
3950 + if (to_vmx(vcpu)->rmode.vm86_active) {
3951 + vmx_get_segment(vcpu, &s, seg);
3952 + return s.base;
3953 + }
3954 + return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
3955 +}
3956 +
3957 +int vmx_get_cpl(struct kvm_vcpu *vcpu)
3958 +{
3959 + struct vcpu_vmx *vmx = to_vmx(vcpu);
3960 +
3961 + if (unlikely(vmx->rmode.vm86_active))
3962 + return 0;
3963 + else {
3964 + int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
3965 + return VMX_AR_DPL(ar);
3966 + }
3967 +}
3968 +
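+/*
+ * Pack a kvm_segment into the VMX access-rights (AR bytes) encoding; an
+ * unusable or non-present segment is encoded with only the "unusable"
+ * bit (bit 16) set.
+ */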
3969 +static u32 vmx_segment_access_rights(struct kvm_segment *var)
3970 +{
3971 + u32 ar;
3972 +
3973 + if (var->unusable || !var->present)
3974 + ar = 1 << 16;
3975 + else {
3976 + ar = var->type & 15;
3977 + ar |= (var->s & 1) << 4;
3978 + ar |= (var->dpl & 3) << 5;
3979 + ar |= (var->present & 1) << 7;
3980 + ar |= (var->avl & 1) << 12;
3981 + ar |= (var->l & 1) << 13;
3982 + ar |= (var->db & 1) << 14;
3983 + ar |= (var->g & 1) << 15;
3984 + }
3985 +
3986 + return ar;
3987 +}
3988 +
3989 +void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
3990 +{
3991 + struct vcpu_vmx *vmx = to_vmx(vcpu);
3992 + const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3993 +
3994 + vmx_segment_cache_clear(vmx);
3995 +
3996 + if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3997 + vmx->rmode.segs[seg] = *var;
3998 + if (seg == VCPU_SREG_TR)
3999 + vmcs_write16(sf->selector, var->selector);
4000 + else if (var->s)
4001 + fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
4002 + goto out;
4003 + }
4004 +
4005 + vmcs_writel(sf->base, var->base);
4006 + vmcs_write32(sf->limit, var->limit);
4007 + vmcs_write16(sf->selector, var->selector);
4008 +
4009 + /*
4010 + * Fix the "Accessed" bit in AR field of segment registers for older
4011 + * qemu binaries.
4012 +	 * The IA32 architecture specifies that at the time of processor reset
4013 +	 * the "Accessed" bit in the AR field of segment registers is 1, but
4014 +	 * qemu sets it to 0 in its userland code. This causes an invalid
4015 +	 * guest state vmexit when "unrestricted guest" mode is turned on.
4016 +	 * A fix for this setup issue in cpu_reset is being pushed into the
4017 +	 * qemu tree; newer qemu binaries with that fix will not need this
4018 +	 * kvm hack.
4019 + */
4020 + if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR))
4021 + var->type |= 0x1; /* Accessed */
4022 +
4023 + vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
4024 +
4025 +out:
4026 + vmx->emulation_required = emulation_required(vcpu);
4027 +}
4028 +
4029 +static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
4030 +{
4031 + u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
4032 +
4033 + *db = (ar >> 14) & 1;
4034 + *l = (ar >> 13) & 1;
4035 +}
4036 +
4037 +static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
4038 +{
4039 + dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
4040 + dt->address = vmcs_readl(GUEST_IDTR_BASE);
4041 +}
4042 +
4043 +static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
4044 +{
4045 + vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
4046 + vmcs_writel(GUEST_IDTR_BASE, dt->address);
4047 +}
4048 +
4049 +static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
4050 +{
4051 + dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
4052 + dt->address = vmcs_readl(GUEST_GDTR_BASE);
4053 +}
4054 +
4055 +static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
4056 +{
4057 + vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
4058 + vmcs_writel(GUEST_GDTR_BASE, dt->address);
4059 +}
4060 +
4061 +static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
4062 +{
4063 + struct kvm_segment var;
4064 + u32 ar;
4065 +
4066 + vmx_get_segment(vcpu, &var, seg);
4067 + var.dpl = 0x3;
4068 + if (seg == VCPU_SREG_CS)
4069 + var.type = 0x3;
4070 + ar = vmx_segment_access_rights(&var);
4071 +
4072 + if (var.base != (var.selector << 4))
4073 + return false;
4074 + if (var.limit != 0xffff)
4075 + return false;
4076 + if (ar != 0xf3)
4077 + return false;
4078 +
4079 + return true;
4080 +}
4081 +
4082 +static bool code_segment_valid(struct kvm_vcpu *vcpu)
4083 +{
4084 + struct kvm_segment cs;
4085 + unsigned int cs_rpl;
4086 +
4087 + vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
4088 + cs_rpl = cs.selector & SEGMENT_RPL_MASK;
4089 +
4090 + if (cs.unusable)
4091 + return false;
4092 + if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK))
4093 + return false;
4094 + if (!cs.s)
4095 + return false;
4096 + if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) {
4097 + if (cs.dpl > cs_rpl)
4098 + return false;
4099 + } else {
4100 + if (cs.dpl != cs_rpl)
4101 + return false;
4102 + }
4103 + if (!cs.present)
4104 + return false;
4105 +
4106 + /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
4107 + return true;
4108 +}
4109 +
4110 +static bool stack_segment_valid(struct kvm_vcpu *vcpu)
4111 +{
4112 + struct kvm_segment ss;
4113 + unsigned int ss_rpl;
4114 +
4115 + vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
4116 + ss_rpl = ss.selector & SEGMENT_RPL_MASK;
4117 +
4118 + if (ss.unusable)
4119 + return true;
4120 + if (ss.type != 3 && ss.type != 7)
4121 + return false;
4122 + if (!ss.s)
4123 + return false;
4124 + if (ss.dpl != ss_rpl) /* DPL != RPL */
4125 + return false;
4126 + if (!ss.present)
4127 + return false;
4128 +
4129 + return true;
4130 +}
4131 +
4132 +static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
4133 +{
4134 + struct kvm_segment var;
4135 + unsigned int rpl;
4136 +
4137 + vmx_get_segment(vcpu, &var, seg);
4138 + rpl = var.selector & SEGMENT_RPL_MASK;
4139 +
4140 + if (var.unusable)
4141 + return true;
4142 + if (!var.s)
4143 + return false;
4144 + if (!var.present)
4145 + return false;
4146 + if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) {
4147 + if (var.dpl < rpl) /* DPL < RPL */
4148 + return false;
4149 + }
4150 +
4151 + /* TODO: Add other members to kvm_segment_field to allow checking for other access
4152 + * rights flags
4153 + */
4154 + return true;
4155 +}
4156 +
4157 +static bool tr_valid(struct kvm_vcpu *vcpu)
4158 +{
4159 + struct kvm_segment tr;
4160 +
4161 + vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
4162 +
4163 + if (tr.unusable)
4164 + return false;
4165 + if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */
4166 + return false;
4167 + if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
4168 + return false;
4169 + if (!tr.present)
4170 + return false;
4171 +
4172 + return true;
4173 +}
4174 +
4175 +static bool ldtr_valid(struct kvm_vcpu *vcpu)
4176 +{
4177 + struct kvm_segment ldtr;
4178 +
4179 + vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
4180 +
4181 + if (ldtr.unusable)
4182 + return true;
4183 + if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */
4184 + return false;
4185 + if (ldtr.type != 2)
4186 + return false;
4187 + if (!ldtr.present)
4188 + return false;
4189 +
4190 + return true;
4191 +}
4192 +
4193 +static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
4194 +{
4195 + struct kvm_segment cs, ss;
4196 +
4197 + vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
4198 + vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
4199 +
4200 + return ((cs.selector & SEGMENT_RPL_MASK) ==
4201 + (ss.selector & SEGMENT_RPL_MASK));
4202 +}
4203 +
4204 +/*
4205 + * Check whether the guest state is valid: returns true if valid,
4206 + * false if not.
4207 + * We assume that registers are always usable.
4208 + */
4209 +static bool guest_state_valid(struct kvm_vcpu *vcpu)
4210 +{
4211 + if (enable_unrestricted_guest)
4212 + return true;
4213 +
4214 + /* real mode guest state checks */
4215 + if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
4216 + if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
4217 + return false;
4218 + if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
4219 + return false;
4220 + if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
4221 + return false;
4222 + if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
4223 + return false;
4224 + if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
4225 + return false;
4226 + if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
4227 + return false;
4228 + } else {
4229 + /* protected mode guest state checks */
4230 + if (!cs_ss_rpl_check(vcpu))
4231 + return false;
4232 + if (!code_segment_valid(vcpu))
4233 + return false;
4234 + if (!stack_segment_valid(vcpu))
4235 + return false;
4236 + if (!data_segment_valid(vcpu, VCPU_SREG_DS))
4237 + return false;
4238 + if (!data_segment_valid(vcpu, VCPU_SREG_ES))
4239 + return false;
4240 + if (!data_segment_valid(vcpu, VCPU_SREG_FS))
4241 + return false;
4242 + if (!data_segment_valid(vcpu, VCPU_SREG_GS))
4243 + return false;
4244 + if (!tr_valid(vcpu))
4245 + return false;
4246 + if (!ldtr_valid(vcpu))
4247 + return false;
4248 + }
4249 + /* TODO:
4250 + * - Add checks on RIP
4251 + * - Add checks on RFLAGS
4252 + */
4253 +
4254 + return true;
4255 +}
4256 +
4257 +static int init_rmode_tss(struct kvm *kvm)
4258 +{
4259 + gfn_t fn;
4260 + u16 data = 0;
4261 + int idx, r;
4262 +
4263 + idx = srcu_read_lock(&kvm->srcu);
4264 + fn = to_kvm_vmx(kvm)->tss_addr >> PAGE_SHIFT;
4265 + r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
4266 + if (r < 0)
4267 + goto out;
4268 + data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
4269 + r = kvm_write_guest_page(kvm, fn++, &data,
4270 + TSS_IOPB_BASE_OFFSET, sizeof(u16));
4271 + if (r < 0)
4272 + goto out;
4273 + r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
4274 + if (r < 0)
4275 + goto out;
4276 + r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
4277 + if (r < 0)
4278 + goto out;
4279 + data = ~0;
4280 + r = kvm_write_guest_page(kvm, fn, &data,
4281 + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
4282 + sizeof(u8));
4283 +out:
4284 + srcu_read_unlock(&kvm->srcu, idx);
4285 + return r;
4286 +}
4287 +
4288 +static int init_rmode_identity_map(struct kvm *kvm)
4289 +{
4290 + struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
4291 + int i, idx, r = 0;
4292 + kvm_pfn_t identity_map_pfn;
4293 + u32 tmp;
4294 +
4295 + /* Protect kvm_vmx->ept_identity_pagetable_done. */
4296 + mutex_lock(&kvm->slots_lock);
4297 +
4298 + if (likely(kvm_vmx->ept_identity_pagetable_done))
4299 + goto out2;
4300 +
4301 + if (!kvm_vmx->ept_identity_map_addr)
4302 + kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
4303 + identity_map_pfn = kvm_vmx->ept_identity_map_addr >> PAGE_SHIFT;
4304 +
4305 + r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
4306 + kvm_vmx->ept_identity_map_addr, PAGE_SIZE);
4307 + if (r < 0)
4308 + goto out2;
4309 +
4310 + idx = srcu_read_lock(&kvm->srcu);
4311 + r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
4312 + if (r < 0)
4313 + goto out;
4314 + /* Set up identity-mapping pagetable for EPT in real mode */
4315 + for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
4316 + tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
4317 + _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
4318 + r = kvm_write_guest_page(kvm, identity_map_pfn,
4319 + &tmp, i * sizeof(tmp), sizeof(tmp));
4320 + if (r < 0)
4321 + goto out;
4322 + }
4323 + kvm_vmx->ept_identity_pagetable_done = true;
4324 +
4325 +out:
4326 + srcu_read_unlock(&kvm->srcu, idx);
4327 +
4328 +out2:
4329 + mutex_unlock(&kvm->slots_lock);
4330 + return r;
4331 +}
4332 +
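+/*
+ * Initialize a guest segment register to a flat real-mode default:
+ * selector and base 0, limit 0xffff, access rights 0x93 (0x9b for CS,
+ * adding the code-segment bit).
+ */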
4333 +static void seg_setup(int seg)
4334 +{
4335 + const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
4336 + unsigned int ar;
4337 +
4338 + vmcs_write16(sf->selector, 0);
4339 + vmcs_writel(sf->base, 0);
4340 + vmcs_write32(sf->limit, 0xffff);
4341 + ar = 0x93;
4342 + if (seg == VCPU_SREG_CS)
4343 + ar |= 0x08; /* code segment */
4344 +
4345 + vmcs_write32(sf->ar_bytes, ar);
4346 +}
4347 +
4348 +static int alloc_apic_access_page(struct kvm *kvm)
4349 +{
4350 + struct page *page;
4351 + int r = 0;
4352 +
4353 + mutex_lock(&kvm->slots_lock);
4354 + if (kvm->arch.apic_access_page_done)
4355 + goto out;
4356 + r = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
4357 + APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
4358 + if (r)
4359 + goto out;
4360 +
4361 + page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
4362 + if (is_error_page(page)) {
4363 + r = -EFAULT;
4364 + goto out;
4365 + }
4366 +
4367 + /*
4368 + * Do not pin the page in memory, so that memory hot-unplug
4369 + * is able to migrate it.
4370 + */
4371 + put_page(page);
4372 + kvm->arch.apic_access_page_done = true;
4373 +out:
4374 + mutex_unlock(&kvm->slots_lock);
4375 + return r;
4376 +}
4377 +
4378 +int allocate_vpid(void)
4379 +{
4380 + int vpid;
4381 +
4382 + if (!enable_vpid)
4383 + return 0;
4384 + spin_lock(&vmx_vpid_lock);
4385 + vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
4386 + if (vpid < VMX_NR_VPIDS)
4387 + __set_bit(vpid, vmx_vpid_bitmap);
4388 + else
4389 + vpid = 0;
4390 + spin_unlock(&vmx_vpid_lock);
4391 + return vpid;
4392 +}
4393 +
4394 +void free_vpid(int vpid)
4395 +{
4396 + if (!enable_vpid || vpid == 0)
4397 + return;
4398 + spin_lock(&vmx_vpid_lock);
4399 + __clear_bit(vpid, vmx_vpid_bitmap);
4400 + spin_unlock(&vmx_vpid_lock);
4401 +}
4402 +
4403 +static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
4404 + u32 msr, int type)
4405 +{
4406 + int f = sizeof(unsigned long);
4407 +
4408 + if (!cpu_has_vmx_msr_bitmap())
4409 + return;
4410 +
4411 + if (static_branch_unlikely(&enable_evmcs))
4412 + evmcs_touch_msr_bitmap();
4413 +
4414 + /*
4415 + * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
4416 + * have the write-low and read-high bitmap offsets the wrong way round.
4417 + * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
4418 + */
4419 + if (msr <= 0x1fff) {
4420 + if (type & MSR_TYPE_R)
4421 + /* read-low */
4422 + __clear_bit(msr, msr_bitmap + 0x000 / f);
4423 +
4424 + if (type & MSR_TYPE_W)
4425 + /* write-low */
4426 + __clear_bit(msr, msr_bitmap + 0x800 / f);
4427 +
4428 + } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
4429 + msr &= 0x1fff;
4430 + if (type & MSR_TYPE_R)
4431 + /* read-high */
4432 + __clear_bit(msr, msr_bitmap + 0x400 / f);
4433 +
4434 + if (type & MSR_TYPE_W)
4435 + /* write-high */
4436 + __clear_bit(msr, msr_bitmap + 0xc00 / f);
4437 +
4438 + }
4439 +}
4440 +
4441 +static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
4442 + u32 msr, int type)
4443 +{
4444 + int f = sizeof(unsigned long);
4445 +
4446 + if (!cpu_has_vmx_msr_bitmap())
4447 + return;
4448 +
4449 + if (static_branch_unlikely(&enable_evmcs))
4450 + evmcs_touch_msr_bitmap();
4451 +
4452 + /*
4453 + * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
4454 + * have the write-low and read-high bitmap offsets the wrong way round.
4455 + * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
4456 + */
4457 + if (msr <= 0x1fff) {
4458 + if (type & MSR_TYPE_R)
4459 + /* read-low */
4460 + __set_bit(msr, msr_bitmap + 0x000 / f);
4461 +
4462 + if (type & MSR_TYPE_W)
4463 + /* write-low */
4464 + __set_bit(msr, msr_bitmap + 0x800 / f);
4465 +
4466 + } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
4467 + msr &= 0x1fff;
4468 + if (type & MSR_TYPE_R)
4469 + /* read-high */
4470 + __set_bit(msr, msr_bitmap + 0x400 / f);
4471 +
4472 + if (type & MSR_TYPE_W)
4473 + /* write-high */
4474 + __set_bit(msr, msr_bitmap + 0xc00 / f);
4475 +
4476 + }
4477 +}
4478 +
4479 +static __always_inline void vmx_set_intercept_for_msr(unsigned long *msr_bitmap,
4480 + u32 msr, int type, bool value)
4481 +{
4482 + if (value)
4483 + vmx_enable_intercept_for_msr(msr_bitmap, msr, type);
4484 + else
4485 + vmx_disable_intercept_for_msr(msr_bitmap, msr, type);
4486 +}
4487 +
4488 +static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
4489 +{
4490 + u8 mode = 0;
4491 +
4492 + if (cpu_has_secondary_exec_ctrls() &&
4493 + (secondary_exec_controls_get(to_vmx(vcpu)) &
4494 + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
4495 + mode |= MSR_BITMAP_MODE_X2APIC;
4496 + if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
4497 + mode |= MSR_BITMAP_MODE_X2APIC_APICV;
4498 + }
4499 +
4500 + return mode;
4501 +}
4502 +
4503 +static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap,
4504 + u8 mode)
4505 +{
4506 + int msr;
4507 +
4508 + for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
4509 + unsigned word = msr / BITS_PER_LONG;
4510 + msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
4511 + msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
4512 + }
4513 +
4514 + if (mode & MSR_BITMAP_MODE_X2APIC) {
4515 + /*
4516 + * TPR reads and writes can be virtualized even if virtual interrupt
4517 + * delivery is not in use.
4518 + */
4519 + vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW);
4520 + if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
4521 + vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R);
4522 + vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
4523 + vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
4524 + }
4525 + }
4526 +}
4527 +
4528 +void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
4529 +{
4530 + struct vcpu_vmx *vmx = to_vmx(vcpu);
4531 + unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
4532 + u8 mode = vmx_msr_bitmap_mode(vcpu);
4533 + u8 changed = mode ^ vmx->msr_bitmap_mode;
4534 +
4535 + if (!changed)
4536 + return;
4537 +
4538 + if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
4539 + vmx_update_msr_bitmap_x2apic(msr_bitmap, mode);
4540 +
4541 + vmx->msr_bitmap_mode = mode;
4542 +}
4543 +
4544 +void pt_update_intercept_for_msr(struct vcpu_vmx *vmx)
4545 +{
4546 + unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
4547 + bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
4548 + u32 i;
4549 +
4550 + vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_STATUS,
4551 + MSR_TYPE_RW, flag);
4552 + vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_BASE,
4553 + MSR_TYPE_RW, flag);
4554 + vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_MASK,
4555 + MSR_TYPE_RW, flag);
4556 + vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_CR3_MATCH,
4557 + MSR_TYPE_RW, flag);
4558 + for (i = 0; i < vmx->pt_desc.addr_range; i++) {
4559 + vmx_set_intercept_for_msr(msr_bitmap,
4560 + MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);
4561 + vmx_set_intercept_for_msr(msr_bitmap,
4562 + MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);
4563 + }
4564 +}
4565 +
4566 +static bool vmx_get_enable_apicv(struct kvm *kvm)
4567 +{
4568 + return enable_apicv;
4569 +}
4570 +
4571 +static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
4572 +{
4573 + struct vcpu_vmx *vmx = to_vmx(vcpu);
4574 + void *vapic_page;
4575 + u32 vppr;
4576 + int rvi;
4577 +
4578 + if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
4579 + !nested_cpu_has_vid(get_vmcs12(vcpu)) ||
4580 + WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn))
4581 + return false;
4582 +
4583 + rvi = vmx_get_rvi();
4584 +
4585 + vapic_page = vmx->nested.virtual_apic_map.hva;
4586 + vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
4587 +
4588 + return ((rvi & 0xf0) > (vppr & 0xf0));
4589 +}
4590 +
4591 +static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
4592 + bool nested)
4593 +{
4594 +#ifdef CONFIG_SMP
4595 + int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR;
4596 +
4597 + if (vcpu->mode == IN_GUEST_MODE) {
4598 + /*
4599 +		 * The vector of the interrupt to be delivered to the vcpu was
4600 +		 * set in the PIR before this function was called.
4601 + *
4602 + * Following cases will be reached in this block, and
4603 + * we always send a notification event in all cases as
4604 + * explained below.
4605 + *
4606 + * Case 1: vcpu keeps in non-root mode. Sending a
4607 + * notification event posts the interrupt to vcpu.
4608 + *
4609 + * Case 2: vcpu exits to root mode and is still
4610 + * runnable. PIR will be synced to vIRR before the
4611 + * next vcpu entry. Sending a notification event in
4612 +		 * this case has no effect, because posted interrupts
4613 +		 * are delivered only while the vcpu is in non-root mode.
4614 + *
4615 + * Case 3: vcpu exits to root mode and is blocked.
4616 + * vcpu_block() has already synced PIR to vIRR and
4617 + * never blocks vcpu if vIRR is not cleared. Therefore,
4618 + * a blocked vcpu here does not wait for any requested
4619 + * interrupts in PIR, and sending a notification event
4620 + * which has no effect is safe here.
4621 + */
4622 +
4623 + apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
4624 + return true;
4625 + }
4626 +#endif
4627 + return false;
4628 +}
4629 +
4630 +static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
4631 + int vector)
4632 +{
4633 + struct vcpu_vmx *vmx = to_vmx(vcpu);
4634 +
4635 + if (is_guest_mode(vcpu) &&
4636 + vector == vmx->nested.posted_intr_nv) {
4637 + /*
4638 +		 * If a posted interrupt is not recognized by hardware,
4639 +		 * it will be delivered on the next vmentry.
4640 + */
4641 + vmx->nested.pi_pending = true;
4642 + kvm_make_request(KVM_REQ_EVENT, vcpu);
4643 + /* the PIR and ON have been set by L1. */
4644 + if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true))
4645 + kvm_vcpu_kick(vcpu);
4646 + return 0;
4647 + }
4648 + return -1;
4649 +}
4650 +/*
4651 + * Send an interrupt to the vcpu via the posted-interrupt mechanism.
4652 + * 1. If the target vcpu is running (non-root mode), send a posted-interrupt
4653 + * notification and hardware will sync PIR to vIRR atomically.
4654 + * 2. If the target vcpu isn't running (root mode), kick it so that it picks
4655 + * up the interrupt from PIR on the next vmentry.
4656 + */
4657 +static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
4658 +{
4659 + struct vcpu_vmx *vmx = to_vmx(vcpu);
4660 + int r;
4661 +
4662 + r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
4663 + if (!r)
4664 + return;
4665 +
4666 + if (pi_test_and_set_pir(vector, &vmx->pi_desc))
4667 + return;
4668 +
4669 + /* If a previous notification has sent the IPI, nothing to do. */
4670 + if (pi_test_and_set_on(&vmx->pi_desc))
4671 + return;
4672 +
4673 + if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false))
4674 + kvm_vcpu_kick(vcpu);
4675 +}
4676 +
4677 +/*
4678 + * Set up the vmcs's constant host-state fields, i.e., host-state fields that
4679 + * will not change in the lifetime of the guest.
4680 + * Note that host-state that does change is set elsewhere. E.g., host-state
4681 + * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
4682 + */
4683 +void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
4684 +{
4685 + u32 low32, high32;
4686 + unsigned long tmpl;
4687 + unsigned long cr0, cr3, cr4;
4688 +
4689 + cr0 = read_cr0();
4690 + WARN_ON(cr0 & X86_CR0_TS);
4691 + vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */
4692 +
4693 + /*
4694 + * Save the most likely value for this task's CR3 in the VMCS.
4695 + * We can't use __get_current_cr3_fast() because we're not atomic.
4696 + */
4697 + cr3 = __read_cr3();
4698 + vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */
4699 + vmx->loaded_vmcs->host_state.cr3 = cr3;
4700 +
4701 + /* Save the most likely value for this task's CR4 in the VMCS. */
4702 + cr4 = cr4_read_shadow();
4703 + vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */
4704 + vmx->loaded_vmcs->host_state.cr4 = cr4;
4705 +
4706 + vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
4707 +#ifdef CONFIG_X86_64
4708 + /*
4709 + * Load null selectors, so we can avoid reloading them in
4710 + * vmx_prepare_switch_to_host(), in case userspace uses
4711 + * the null selectors too (the expected case).
4712 + */
4713 + vmcs_write16(HOST_DS_SELECTOR, 0);
4714 + vmcs_write16(HOST_ES_SELECTOR, 0);
4715 +#else
4716 + vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
4717 + vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
4718 +#endif
4719 + vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
4720 + vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
4721 +
4722 + vmcs_writel(HOST_IDTR_BASE, host_idt_base); /* 22.2.4 */
4723 +
4724 + vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */
4725 +
4726 + rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
4727 + vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
4728 + rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
4729 + vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */
4730 +
4731 + if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
4732 + rdmsr(MSR_IA32_CR_PAT, low32, high32);
4733 + vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
4734 + }
4735 +
4736 + if (cpu_has_load_ia32_efer())
4737 + vmcs_write64(HOST_IA32_EFER, host_efer);
4738 +}
4739 +
4740 +void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
4741 +{
4742 + vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
4743 + if (enable_ept)
4744 + vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
4745 + if (is_guest_mode(&vmx->vcpu))
4746 + vmx->vcpu.arch.cr4_guest_owned_bits &=
4747 + ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask;
4748 + vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
4749 +}
4750 +
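+/*
+ * Derive the pin-based VM-execution controls from vmcs_config: posted
+ * interrupts are dropped when APICv is inactive for this vcpu, and virtual
+ * NMIs and the VMX preemption timer are dropped when disabled.
+ */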
4751 +u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
4752 +{
4753 + u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
4754 +
4755 + if (!kvm_vcpu_apicv_active(&vmx->vcpu))
4756 + pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
4757 +
4758 + if (!enable_vnmi)
4759 + pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS;
4760 +
4761 + if (!enable_preemption_timer)
4762 + pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
4763 +
4764 + return pin_based_exec_ctrl;
4765 +}
4766 +
4767 +static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
4768 +{
4769 + struct vcpu_vmx *vmx = to_vmx(vcpu);
4770 +
4771 + pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
4772 + if (cpu_has_secondary_exec_ctrls()) {
4773 + if (kvm_vcpu_apicv_active(vcpu))
4774 + secondary_exec_controls_setbit(vmx,
4775 + SECONDARY_EXEC_APIC_REGISTER_VIRT |
4776 + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
4777 + else
4778 + secondary_exec_controls_clearbit(vmx,
4779 + SECONDARY_EXEC_APIC_REGISTER_VIRT |
4780 + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
4781 + }
4782 +
4783 + if (cpu_has_vmx_msr_bitmap())
4784 + vmx_update_msr_bitmap(vcpu);
4785 +}
4786 +
4787 +u32 vmx_exec_control(struct vcpu_vmx *vmx)
4788 +{
4789 + u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
4790 +
4791 + if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
4792 + exec_control &= ~CPU_BASED_MOV_DR_EXITING;
4793 +
4794 + if (!cpu_need_tpr_shadow(&vmx->vcpu)) {
4795 + exec_control &= ~CPU_BASED_TPR_SHADOW;
4796 +#ifdef CONFIG_X86_64
4797 + exec_control |= CPU_BASED_CR8_STORE_EXITING |
4798 + CPU_BASED_CR8_LOAD_EXITING;
4799 +#endif
4800 + }
4801 + if (!enable_ept)
4802 + exec_control |= CPU_BASED_CR3_STORE_EXITING |
4803 + CPU_BASED_CR3_LOAD_EXITING |
4804 + CPU_BASED_INVLPG_EXITING;
4805 + if (kvm_mwait_in_guest(vmx->vcpu.kvm))
4806 + exec_control &= ~(CPU_BASED_MWAIT_EXITING |
4807 + CPU_BASED_MONITOR_EXITING);
4808 + if (kvm_hlt_in_guest(vmx->vcpu.kvm))
4809 + exec_control &= ~CPU_BASED_HLT_EXITING;
4810 + return exec_control;
4811 +}
4812 +
4813 +
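+/*
+ * Compute the secondary VM-execution controls from vmcs_config, module
+ * parameters and guest CPUID, mirroring the optional features (XSAVES,
+ * RDTSCP, INVPCID, RDRAND/RDSEED exiting, WAITPKG) into the nested VMX
+ * capability MSRs when nesting is enabled.
+ */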
4814 +static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
4815 +{
4816 + struct kvm_vcpu *vcpu = &vmx->vcpu;
4817 +
4818 + u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
4819 +
4820 + if (pt_mode == PT_MODE_SYSTEM)
4821 + exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX);
4822 + if (!cpu_need_virtualize_apic_accesses(vcpu))
4823 + exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
4824 + if (vmx->vpid == 0)
4825 + exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
4826 + if (!enable_ept) {
4827 + exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
4828 + enable_unrestricted_guest = 0;
4829 + }
4830 + if (!enable_unrestricted_guest)
4831 + exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
4832 + if (kvm_pause_in_guest(vmx->vcpu.kvm))
4833 + exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
4834 + if (!kvm_vcpu_apicv_active(vcpu))
4835 + exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
4836 + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
4837 + exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
4838 +
4839 + /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,
4840 + * in vmx_set_cr4. */
4841 + exec_control &= ~SECONDARY_EXEC_DESC;
4842 +
4843 +	/*
4844 +	 * SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
4845 +	 * (handle_vmptrld). We can NOT enable shadow_vmcs here because we
4846 +	 * do not yet have a current VMCS12.
4847 +	 */
4848 + exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
4849 +
4850 + if (!enable_pml)
4851 + exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
4852 +
4853 + if (vmx_xsaves_supported()) {
4854 + /* Exposing XSAVES only when XSAVE is exposed */
4855 + bool xsaves_enabled =
4856 + guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
4857 + guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);
4858 +
4859 + vcpu->arch.xsaves_enabled = xsaves_enabled;
4860 +
4861 + if (!xsaves_enabled)
4862 + exec_control &= ~SECONDARY_EXEC_XSAVES;
4863 +
4864 + if (nested) {
4865 + if (xsaves_enabled)
4866 + vmx->nested.msrs.secondary_ctls_high |=
4867 + SECONDARY_EXEC_XSAVES;
4868 + else
4869 + vmx->nested.msrs.secondary_ctls_high &=
4870 + ~SECONDARY_EXEC_XSAVES;
4871 + }
4872 + }
4873 +
4874 + if (vmx_rdtscp_supported()) {
4875 + bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP);
4876 + if (!rdtscp_enabled)
4877 + exec_control &= ~SECONDARY_EXEC_RDTSCP;
4878 +
4879 + if (nested) {
4880 + if (rdtscp_enabled)
4881 + vmx->nested.msrs.secondary_ctls_high |=
4882 + SECONDARY_EXEC_RDTSCP;
4883 + else
4884 + vmx->nested.msrs.secondary_ctls_high &=
4885 + ~SECONDARY_EXEC_RDTSCP;
4886 + }
4887 + }
4888 +
4889 + if (vmx_invpcid_supported()) {
4890 + /* Exposing INVPCID only when PCID is exposed */
4891 + bool invpcid_enabled =
4892 + guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) &&
4893 + guest_cpuid_has(vcpu, X86_FEATURE_PCID);
4894 +
4895 + if (!invpcid_enabled) {
4896 + exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
4897 + guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID);
4898 + }
4899 +
4900 + if (nested) {
4901 + if (invpcid_enabled)
4902 + vmx->nested.msrs.secondary_ctls_high |=
4903 + SECONDARY_EXEC_ENABLE_INVPCID;
4904 + else
4905 + vmx->nested.msrs.secondary_ctls_high &=
4906 + ~SECONDARY_EXEC_ENABLE_INVPCID;
4907 + }
4908 + }
4909 +
4910 + if (vmx_rdrand_supported()) {
4911 + bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND);
4912 + if (rdrand_enabled)
4913 + exec_control &= ~SECONDARY_EXEC_RDRAND_EXITING;
4914 +
4915 + if (nested) {
4916 + if (rdrand_enabled)
4917 + vmx->nested.msrs.secondary_ctls_high |=
4918 + SECONDARY_EXEC_RDRAND_EXITING;
4919 + else
4920 + vmx->nested.msrs.secondary_ctls_high &=
4921 + ~SECONDARY_EXEC_RDRAND_EXITING;
4922 + }
4923 + }
4924 +
4925 + if (vmx_rdseed_supported()) {
4926 + bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED);
4927 + if (rdseed_enabled)
4928 + exec_control &= ~SECONDARY_EXEC_RDSEED_EXITING;
4929 +
4930 + if (nested) {
4931 + if (rdseed_enabled)
4932 + vmx->nested.msrs.secondary_ctls_high |=
4933 + SECONDARY_EXEC_RDSEED_EXITING;
4934 + else
4935 + vmx->nested.msrs.secondary_ctls_high &=
4936 + ~SECONDARY_EXEC_RDSEED_EXITING;
4937 + }
4938 + }
4939 +
4940 + if (vmx_waitpkg_supported()) {
4941 + bool waitpkg_enabled =
4942 + guest_cpuid_has(vcpu, X86_FEATURE_WAITPKG);
4943 +
4944 + if (!waitpkg_enabled)
4945 + exec_control &= ~SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
4946 +
4947 + if (nested) {
4948 + if (waitpkg_enabled)
4949 + vmx->nested.msrs.secondary_ctls_high |=
4950 + SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
4951 + else
4952 + vmx->nested.msrs.secondary_ctls_high &=
4953 + ~SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
4954 + }
4955 + }
4956 +
4957 + vmx->secondary_exec_control = exec_control;
4958 +}
4959 +
4960 +static void ept_set_mmio_spte_mask(void)
4961 +{
4962 + /*
4963 + * EPT Misconfigurations can be generated if the value of bits 2:0
4964 + * of an EPT paging-structure entry is 110b (write/execute).
4965 + */
4966 + kvm_mmu_set_mmio_spte_mask(VMX_EPT_RWX_MASK,
4967 + VMX_EPT_MISCONFIG_WX_VALUE, 0);
4968 +}
4969 +
4970 +#define VMX_XSS_EXIT_BITMAP 0
4971 +
4972 +/*
4973 + * Note that the Guest-state Area of the VMCS is initialized in
4974 + * vmx_vcpu_reset().
4975 + */
4976 +static void init_vmcs(struct vcpu_vmx *vmx)
4977 +{
4978 + if (nested)
4979 + nested_vmx_set_vmcs_shadowing_bitmap();
4980 +
4981 + if (cpu_has_vmx_msr_bitmap())
4982 + vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
4983 +
4984 + vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
4985 +
4986 + /* Control */
4987 + pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
4988 +
4989 + exec_controls_set(vmx, vmx_exec_control(vmx));
4990 +
4991 + if (cpu_has_secondary_exec_ctrls()) {
4992 + vmx_compute_secondary_exec_control(vmx);
4993 + secondary_exec_controls_set(vmx, vmx->secondary_exec_control);
4994 + }
4995 +
4996 + if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
4997 + vmcs_write64(EOI_EXIT_BITMAP0, 0);
4998 + vmcs_write64(EOI_EXIT_BITMAP1, 0);
4999 + vmcs_write64(EOI_EXIT_BITMAP2, 0);
5000 + vmcs_write64(EOI_EXIT_BITMAP3, 0);
5001 +
5002 + vmcs_write16(GUEST_INTR_STATUS, 0);
5003 +
5004 + vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
5005 + vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
5006 + }
5007 +
5008 + if (!kvm_pause_in_guest(vmx->vcpu.kvm)) {
5009 + vmcs_write32(PLE_GAP, ple_gap);
5010 + vmx->ple_window = ple_window;
5011 + vmx->ple_window_dirty = true;
5012 + }
5013 +
5014 + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
5015 + vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
5016 + vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
5017 +
5018 + vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */
5019 + vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */
5020 + vmx_set_constant_host_state(vmx);
5021 + vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
5022 + vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
5023 +
5024 + if (cpu_has_vmx_vmfunc())
5025 + vmcs_write64(VM_FUNCTION_CONTROL, 0);
5026 +
5027 + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
5028 + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
5029 + vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
5030 + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
5031 + vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
5032 +
5033 + if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
5034 + vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
5035 +
5036 + vm_exit_controls_set(vmx, vmx_vmexit_ctrl());
5037 +
5038 + /* 22.2.1, 20.8.1 */
5039 + vm_entry_controls_set(vmx, vmx_vmentry_ctrl());
5040 +
5041 + vmx->vcpu.arch.cr0_guest_owned_bits = X86_CR0_TS;
5042 + vmcs_writel(CR0_GUEST_HOST_MASK, ~X86_CR0_TS);
5043 +
5044 + set_cr4_guest_host_mask(vmx);
5045 +
5046 + if (vmx->vpid != 0)
5047 + vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
5048 +
5049 + if (vmx_xsaves_supported())
5050 + vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
5051 +
5052 + if (enable_pml) {
5053 + vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
5054 + vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
5055 + }
5056 +
5057 + if (cpu_has_vmx_encls_vmexit())
5058 + vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
5059 +
5060 + if (pt_mode == PT_MODE_HOST_GUEST) {
5061 + memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
5062 + /* Bit[6~0] are forced to 1, writes are ignored. */
5063 + vmx->pt_desc.guest.output_mask = 0x7F;
5064 + vmcs_write64(GUEST_IA32_RTIT_CTL, 0);
5065 + }
5066 +}
5067 +
5068 +static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
5069 +{
5070 + struct vcpu_vmx *vmx = to_vmx(vcpu);
5071 + struct msr_data apic_base_msr;
5072 + u64 cr0;
5073 +
5074 + vmx->rmode.vm86_active = 0;
5075 + vmx->spec_ctrl = 0;
5076 +
5077 + vmx->msr_ia32_umwait_control = 0;
5078 +
5079 + vcpu->arch.microcode_version = 0x100000000ULL;
5080 + vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
5081 + vmx->hv_deadline_tsc = -1;
5082 + kvm_set_cr8(vcpu, 0);
5083 +
5084 + if (!init_event) {
5085 + apic_base_msr.data = APIC_DEFAULT_PHYS_BASE |
5086 + MSR_IA32_APICBASE_ENABLE;
5087 + if (kvm_vcpu_is_reset_bsp(vcpu))
5088 + apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
5089 + apic_base_msr.host_initiated = true;
5090 + kvm_set_apic_base(vcpu, &apic_base_msr);
5091 + }
5092 +
5093 + vmx_segment_cache_clear(vmx);
5094 +
5095 + seg_setup(VCPU_SREG_CS);
5096 + vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
5097 + vmcs_writel(GUEST_CS_BASE, 0xffff0000ul);
5098 +
5099 + seg_setup(VCPU_SREG_DS);
5100 + seg_setup(VCPU_SREG_ES);
5101 + seg_setup(VCPU_SREG_FS);
5102 + seg_setup(VCPU_SREG_GS);
5103 + seg_setup(VCPU_SREG_SS);
5104 +
5105 + vmcs_write16(GUEST_TR_SELECTOR, 0);
5106 + vmcs_writel(GUEST_TR_BASE, 0);
5107 + vmcs_write32(GUEST_TR_LIMIT, 0xffff);
5108 + vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
5109 +
5110 + vmcs_write16(GUEST_LDTR_SELECTOR, 0);
5111 + vmcs_writel(GUEST_LDTR_BASE, 0);
5112 + vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
5113 + vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
5114 +
5115 + if (!init_event) {
5116 + vmcs_write32(GUEST_SYSENTER_CS, 0);
5117 + vmcs_writel(GUEST_SYSENTER_ESP, 0);
5118 + vmcs_writel(GUEST_SYSENTER_EIP, 0);
5119 + vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
5120 + }
5121 +
5122 + kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
5123 + kvm_rip_write(vcpu, 0xfff0);
5124 +
5125 + vmcs_writel(GUEST_GDTR_BASE, 0);
5126 + vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
5127 +
5128 + vmcs_writel(GUEST_IDTR_BASE, 0);
5129 + vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
5130 +
5131 + vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
5132 + vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
5133 + vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
5134 + if (kvm_mpx_supported())
5135 + vmcs_write64(GUEST_BNDCFGS, 0);
5136 +
5137 + setup_msrs(vmx);
5138 +
5139 + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
5140 +
5141 + if (cpu_has_vmx_tpr_shadow() && !init_event) {
5142 + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
5143 + if (cpu_need_tpr_shadow(vcpu))
5144 + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
5145 + __pa(vcpu->arch.apic->regs));
5146 + vmcs_write32(TPR_THRESHOLD, 0);
5147 + }
5148 +
5149 + kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
5150 +
5151 + cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
5152 + vmx->vcpu.arch.cr0 = cr0;
5153 + vmx_set_cr0(vcpu, cr0); /* enter rmode */
5154 + vmx_set_cr4(vcpu, 0);
5155 + vmx_set_efer(vcpu, 0);
5156 +
5157 + update_exception_bitmap(vcpu);
5158 +
5159 + vpid_sync_context(vmx->vpid);
5160 + if (init_event)
5161 + vmx_clear_hlt(vcpu);
5162 +}
5163 +
5164 +static void enable_irq_window(struct kvm_vcpu *vcpu)
5165 +{
5166 + exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
5167 +}
5168 +
5169 +static void enable_nmi_window(struct kvm_vcpu *vcpu)
5170 +{
5171 + if (!enable_vnmi ||
5172 + vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
5173 + enable_irq_window(vcpu);
5174 + return;
5175 + }
5176 +
5177 + exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
5178 +}
5179 +
5180 +static void vmx_inject_irq(struct kvm_vcpu *vcpu)
5181 +{
5182 + struct vcpu_vmx *vmx = to_vmx(vcpu);
5183 + uint32_t intr;
5184 + int irq = vcpu->arch.interrupt.nr;
5185 +
5186 + trace_kvm_inj_virq(irq);
5187 +
5188 + ++vcpu->stat.irq_injections;
5189 + if (vmx->rmode.vm86_active) {
5190 + int inc_eip = 0;
5191 + if (vcpu->arch.interrupt.soft)
5192 + inc_eip = vcpu->arch.event_exit_inst_len;
5193 + kvm_inject_realmode_interrupt(vcpu, irq, inc_eip);
5194 + return;
5195 + }
5196 + intr = irq | INTR_INFO_VALID_MASK;
5197 + if (vcpu->arch.interrupt.soft) {
5198 + intr |= INTR_TYPE_SOFT_INTR;
5199 + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
5200 + vmx->vcpu.arch.event_exit_inst_len);
5201 + } else
5202 + intr |= INTR_TYPE_EXT_INTR;
5203 + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
5204 +
5205 + vmx_clear_hlt(vcpu);
5206 +}
5207 +
5208 +static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
5209 +{
5210 + struct vcpu_vmx *vmx = to_vmx(vcpu);
5211 +
5212 + if (!enable_vnmi) {
5213 + /*
5214 + * Tracking the NMI-blocked state in software is built upon
5215 + * finding the next open IRQ window. This, in turn, depends on
5216 + * well-behaving guests: They have to keep IRQs disabled at
5217 + * least as long as the NMI handler runs. Otherwise we may
5218 + * cause NMI nesting, maybe breaking the guest. But as this is
5219 + * highly unlikely, we can live with the residual risk.
5220 + */
5221 + vmx->loaded_vmcs->soft_vnmi_blocked = 1;
5222 + vmx->loaded_vmcs->vnmi_blocked_time = 0;
5223 + }
5224 +
5225 + ++vcpu->stat.nmi_injections;
5226 + vmx->loaded_vmcs->nmi_known_unmasked = false;
5227 +
5228 + if (vmx->rmode.vm86_active) {
5229 + kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0);
5230 + return;
5231 + }
5232 +
5233 + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
5234 + INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
5235 +
5236 + vmx_clear_hlt(vcpu);
5237 +}
5238 +
5239 +bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
5240 +{
5241 + struct vcpu_vmx *vmx = to_vmx(vcpu);
5242 + bool masked;
5243 +
5244 + if (!enable_vnmi)
5245 + return vmx->loaded_vmcs->soft_vnmi_blocked;
5246 + if (vmx->loaded_vmcs->nmi_known_unmasked)
5247 + return false;
5248 + masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
5249 + vmx->loaded_vmcs->nmi_known_unmasked = !masked;
5250 + return masked;
5251 +}
5252 +
5253 +void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
5254 +{
5255 + struct vcpu_vmx *vmx = to_vmx(vcpu);
5256 +
5257 + if (!enable_vnmi) {
5258 + if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) {
5259 + vmx->loaded_vmcs->soft_vnmi_blocked = masked;
5260 + vmx->loaded_vmcs->vnmi_blocked_time = 0;
5261 + }
5262 + } else {
5263 + vmx->loaded_vmcs->nmi_known_unmasked = !masked;
5264 + if (masked)
5265 + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
5266 + GUEST_INTR_STATE_NMI);
5267 + else
5268 + vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
5269 + GUEST_INTR_STATE_NMI);
5270 + }
5271 +}
5272 +
5273 +static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
5274 +{
5275 + if (to_vmx(vcpu)->nested.nested_run_pending)
5276 + return 0;
5277 +
5278 + if (!enable_vnmi &&
5279 + to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
5280 + return 0;
5281 +
5282 + return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
5283 + (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
5284 + | GUEST_INTR_STATE_NMI));
5285 +}
5286 +
5287 +static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
5288 +{
5289 + return (!to_vmx(vcpu)->nested.nested_run_pending &&
5290 + vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
5291 + !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
5292 + (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
5293 +}
5294 +
5295 +static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
5296 +{
5297 + int ret;
5298 +
5299 + if (enable_unrestricted_guest)
5300 + return 0;
5301 +
5302 + ret = x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
5303 + PAGE_SIZE * 3);
5304 + if (ret)
5305 + return ret;
5306 + to_kvm_vmx(kvm)->tss_addr = addr;
5307 + return init_rmode_tss(kvm);
5308 +}
5309 +
5310 +static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
5311 +{
5312 + to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr;
5313 + return 0;
5314 +}
5315 +
5316 +static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
5317 +{
5318 + switch (vec) {
5319 + case BP_VECTOR:
5320 + /*
5321 + * Update instruction length as we may reinject the exception
5322 + * from user space while in guest debugging mode.
5323 + */
5324 + to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
5325 + vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
5326 + if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
5327 + return false;
5328 + /* fall through */
5329 + case DB_VECTOR:
5330 + if (vcpu->guest_debug &
5331 + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
5332 + return false;
5333 + /* fall through */
5334 + case DE_VECTOR:
5335 + case OF_VECTOR:
5336 + case BR_VECTOR:
5337 + case UD_VECTOR:
5338 + case DF_VECTOR:
5339 + case SS_VECTOR:
5340 + case GP_VECTOR:
5341 + case MF_VECTOR:
5342 + return true;
5343 + break;
5344 + }
5345 + return false;
5346 +}
5347 +
5348 +static int handle_rmode_exception(struct kvm_vcpu *vcpu,
5349 + int vec, u32 err_code)
5350 +{
5351 + /*
5352 + * An instruction with the address size override prefix (opcode 0x67)
5353 + * causes a #SS fault with error code 0 in VM86 mode.
5354 + */
5355 + if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
5356 + if (kvm_emulate_instruction(vcpu, 0)) {
5357 + if (vcpu->arch.halt_request) {
5358 + vcpu->arch.halt_request = 0;
5359 + return kvm_vcpu_halt(vcpu);
5360 + }
5361 + return 1;
5362 + }
5363 + return 0;
5364 + }
5365 +
5366 + /*
5367 + * Forward all other exceptions that are valid in real mode.
5368 + * FIXME: Breaks guest debugging in real mode, needs to be fixed with
5369 + * the required debugging infrastructure rework.
5370 + */
5371 + kvm_queue_exception(vcpu, vec);
5372 + return 1;
5373 +}
5374 +
5375 +/*
5376 + * Trigger machine check on the host. We assume all the MSRs are already set up
5377 + * by the CPU and that we still run on the same CPU as the MCE occurred on.
5378 + * We pass a fake environment to the machine check handler because we want
5379 + * the guest to always be treated like user space, no matter what context
5380 + * it used internally.
5381 + */
5382 +static void kvm_machine_check(void)
5383 +{
5384 +#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64)
5385 + struct pt_regs regs = {
5386 + .cs = 3, /* Fake ring 3 no matter what the guest ran on */
5387 + .flags = X86_EFLAGS_IF,
5388 + };
5389 +
5390 + do_machine_check(&regs, 0);
5391 +#endif
5392 +}
5393 +
5394 +static int handle_machine_check(struct kvm_vcpu *vcpu)
5395 +{
5396 + /* handled by vmx_vcpu_run() */
5397 + return 1;
5398 +}
5399 +
5400 +static int handle_exception_nmi(struct kvm_vcpu *vcpu)
5401 +{
5402 + struct vcpu_vmx *vmx = to_vmx(vcpu);
5403 + struct kvm_run *kvm_run = vcpu->run;
5404 + u32 intr_info, ex_no, error_code;
5405 + unsigned long cr2, rip, dr6;
5406 + u32 vect_info;
5407 +
5408 + vect_info = vmx->idt_vectoring_info;
5409 + intr_info = vmx->exit_intr_info;
5410 +
5411 + if (is_machine_check(intr_info) || is_nmi(intr_info))
5412 + return 1; /* handled by handle_exception_nmi_irqoff() */
5413 +
5414 + if (is_invalid_opcode(intr_info))
5415 + return handle_ud(vcpu);
5416 +
5417 + error_code = 0;
5418 + if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
5419 + error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
5420 +
5421 + if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) {
5422 + WARN_ON_ONCE(!enable_vmware_backdoor);
5423 +
5424 + /*
5425 + * VMware backdoor emulation on #GP interception only handles
5426 + * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero
5427 + * error code on #GP.
5428 + */
5429 + if (error_code) {
5430 + kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
5431 + return 1;
5432 + }
5433 + return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
5434 + }
5435 +
5436 + /*
5437 + * A #PF with PFEC.RSVD = 1 indicates that the guest is accessing
5438 + * MMIO; it is better to report an internal error.
5439 + * See the comments in vmx_handle_exit.
5440 + */
5441 + if ((vect_info & VECTORING_INFO_VALID_MASK) &&
5442 + !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
5443 + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
5444 + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
5445 + vcpu->run->internal.ndata = 3;
5446 + vcpu->run->internal.data[0] = vect_info;
5447 + vcpu->run->internal.data[1] = intr_info;
5448 + vcpu->run->internal.data[2] = error_code;
5449 + return 0;
5450 + }
5451 +
5452 + if (is_page_fault(intr_info)) {
5453 + cr2 = vmcs_readl(EXIT_QUALIFICATION);
5454 + /* EPT won't cause page fault directly */
5455 + WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept);
5456 + return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
5457 + }
5458 +
5459 + ex_no = intr_info & INTR_INFO_VECTOR_MASK;
5460 +
5461 + if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no))
5462 + return handle_rmode_exception(vcpu, ex_no, error_code);
5463 +
5464 + switch (ex_no) {
5465 + case AC_VECTOR:
5466 + kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
5467 + return 1;
5468 + case DB_VECTOR:
5469 + dr6 = vmcs_readl(EXIT_QUALIFICATION);
5470 + if (!(vcpu->guest_debug &
5471 + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
5472 + vcpu->arch.dr6 &= ~DR_TRAP_BITS;
5473 + vcpu->arch.dr6 |= dr6 | DR6_RTM;
5474 + if (is_icebp(intr_info))
5475 + WARN_ON(!skip_emulated_instruction(vcpu));
5476 +
5477 + kvm_queue_exception(vcpu, DB_VECTOR);
5478 + return 1;
5479 + }
5480 + kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
5481 + kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
5482 + /* fall through */
5483 + case BP_VECTOR:
5484 + /*
5485 + * Update instruction length as we may reinject #BP from
5486 + * user space while in guest debugging mode. Reading it for
5487 + * #DB as well causes no harm; it is not used in that case.
5488 + */
5489 + vmx->vcpu.arch.event_exit_inst_len =
5490 + vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
5491 + kvm_run->exit_reason = KVM_EXIT_DEBUG;
5492 + rip = kvm_rip_read(vcpu);
5493 + kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
5494 + kvm_run->debug.arch.exception = ex_no;
5495 + break;
5496 + default:
5497 + kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
5498 + kvm_run->ex.exception = ex_no;
5499 + kvm_run->ex.error_code = error_code;
5500 + break;
5501 + }
5502 + return 0;
5503 +}
5504 +
5505 +static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu)
5506 +{
5507 + ++vcpu->stat.irq_exits;
5508 + return 1;
5509 +}
5510 +
5511 +static int handle_triple_fault(struct kvm_vcpu *vcpu)
5512 +{
5513 + vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
5514 + vcpu->mmio_needed = 0;
5515 + return 0;
5516 +}
5517 +
5518 +static int handle_io(struct kvm_vcpu *vcpu)
5519 +{
5520 + unsigned long exit_qualification;
5521 + int size, in, string;
5522 + unsigned port;
5523 +
5524 + exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5525 + string = (exit_qualification & 16) != 0;
5526 +
5527 + ++vcpu->stat.io_exits;
5528 +
5529 + if (string)
5530 + return kvm_emulate_instruction(vcpu, 0);
5531 +
5532 + port = exit_qualification >> 16;
5533 + size = (exit_qualification & 7) + 1;
5534 + in = (exit_qualification & 8) != 0;
5535 +
5536 + return kvm_fast_pio(vcpu, size, port, in);
5537 +}
5538 +
5539 +static void
5540 +vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
5541 +{
5542 + /*
5543 + * Patch in the VMCALL instruction:
5544 + */
5545 + hypercall[0] = 0x0f;
5546 + hypercall[1] = 0x01;
5547 + hypercall[2] = 0xc1;
5548 +}
5549 +
5550 +/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
5551 +static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
5552 +{
5553 + if (is_guest_mode(vcpu)) {
5554 + struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5555 + unsigned long orig_val = val;
5556 +
5557 + /*
5558 + * We get here when L2 changed cr0 in a way that did not change
5559 + * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
5560 + * but did change L0 shadowed bits. So we first calculate the
5561 + * effective cr0 value that L1 would like to write into the
5562 + * hardware. It consists of the L2-owned bits from the new
5563 + * value combined with the L1-owned bits from L1's guest_cr0.
5564 + */
5565 + val = (val & ~vmcs12->cr0_guest_host_mask) |
5566 + (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
5567 +
5568 + if (!nested_guest_cr0_valid(vcpu, val))
5569 + return 1;
5570 +
5571 + if (kvm_set_cr0(vcpu, val))
5572 + return 1;
5573 + vmcs_writel(CR0_READ_SHADOW, orig_val);
5574 + return 0;
5575 + } else {
5576 + if (to_vmx(vcpu)->nested.vmxon &&
5577 + !nested_host_cr0_valid(vcpu, val))
5578 + return 1;
5579 +
5580 + return kvm_set_cr0(vcpu, val);
5581 + }
5582 +}
5583 +
5584 +static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
5585 +{
5586 + if (is_guest_mode(vcpu)) {
5587 + struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5588 + unsigned long orig_val = val;
5589 +
5590 + /* analogously to handle_set_cr0 */
5591 + val = (val & ~vmcs12->cr4_guest_host_mask) |
5592 + (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask);
5593 + if (kvm_set_cr4(vcpu, val))
5594 + return 1;
5595 + vmcs_writel(CR4_READ_SHADOW, orig_val);
5596 + return 0;
5597 + } else
5598 + return kvm_set_cr4(vcpu, val);
5599 +}
5600 +
5601 +static int handle_desc(struct kvm_vcpu *vcpu)
5602 +{
5603 + WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP));
5604 + return kvm_emulate_instruction(vcpu, 0);
5605 +}
5606 +
5607 +static int handle_cr(struct kvm_vcpu *vcpu)
5608 +{
5609 + unsigned long exit_qualification, val;
5610 + int cr;
5611 + int reg;
5612 + int err;
5613 + int ret;
5614 +
5615 + exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5616 + cr = exit_qualification & 15;
5617 + reg = (exit_qualification >> 8) & 15;
5618 + switch ((exit_qualification >> 4) & 3) {
5619 + case 0: /* mov to cr */
5620 + val = kvm_register_readl(vcpu, reg);
5621 + trace_kvm_cr_write(cr, val);
5622 + switch (cr) {
5623 + case 0:
5624 + err = handle_set_cr0(vcpu, val);
5625 + return kvm_complete_insn_gp(vcpu, err);
5626 + case 3:
5627 + WARN_ON_ONCE(enable_unrestricted_guest);
5628 + err = kvm_set_cr3(vcpu, val);
5629 + return kvm_complete_insn_gp(vcpu, err);
5630 + case 4:
5631 + err = handle_set_cr4(vcpu, val);
5632 + return kvm_complete_insn_gp(vcpu, err);
5633 + case 8: {
5634 + u8 cr8_prev = kvm_get_cr8(vcpu);
5635 + u8 cr8 = (u8)val;
5636 + err = kvm_set_cr8(vcpu, cr8);
5637 + ret = kvm_complete_insn_gp(vcpu, err);
5638 + if (lapic_in_kernel(vcpu))
5639 + return ret;
5640 + if (cr8_prev <= cr8)
5641 + return ret;
5642 + /*
5643 + * TODO: we might be squashing a
5644 + * KVM_GUESTDBG_SINGLESTEP-triggered
5645 + * KVM_EXIT_DEBUG here.
5646 + */
5647 + vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
5648 + return 0;
5649 + }
5650 + }
5651 + break;
5652 + case 2: /* clts */
5653 + WARN_ONCE(1, "Guest should always own CR0.TS");
5654 + vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
5655 + trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
5656 + return kvm_skip_emulated_instruction(vcpu);
5657 + case 1: /*mov from cr*/
5658 + switch (cr) {
5659 + case 3:
5660 + WARN_ON_ONCE(enable_unrestricted_guest);
5661 + val = kvm_read_cr3(vcpu);
5662 + kvm_register_write(vcpu, reg, val);
5663 + trace_kvm_cr_read(cr, val);
5664 + return kvm_skip_emulated_instruction(vcpu);
5665 + case 8:
5666 + val = kvm_get_cr8(vcpu);
5667 + kvm_register_write(vcpu, reg, val);
5668 + trace_kvm_cr_read(cr, val);
5669 + return kvm_skip_emulated_instruction(vcpu);
5670 + }
5671 + break;
5672 + case 3: /* lmsw */
5673 + val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
5674 + trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
5675 + kvm_lmsw(vcpu, val);
5676 +
5677 + return kvm_skip_emulated_instruction(vcpu);
5678 + default:
5679 + break;
5680 + }
5681 + vcpu->run->exit_reason = 0;
5682 + vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
5683 + (int)(exit_qualification >> 4) & 3, cr);
5684 + return 0;
5685 +}
5686 +
5687 +static int handle_dr(struct kvm_vcpu *vcpu)
5688 +{
5689 + unsigned long exit_qualification;
5690 + int dr, dr7, reg;
5691 +
5692 + exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5693 + dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
5694 +
5695 + /* First, if DR does not exist, trigger UD */
5696 + if (!kvm_require_dr(vcpu, dr))
5697 + return 1;
5698 +
5699 + /* Do not handle if the CPL > 0, will trigger GP on re-entry */
5700 + if (!kvm_require_cpl(vcpu, 0))
5701 + return 1;
5702 + dr7 = vmcs_readl(GUEST_DR7);
5703 + if (dr7 & DR7_GD) {
5704 + /*
5705 + * As the vm-exit takes precedence over the debug trap, we
5706 + * need to emulate the latter, either for the host or the
5707 + * guest debugging itself.
5708 + */
5709 + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
5710 + vcpu->run->debug.arch.dr6 = vcpu->arch.dr6;
5711 + vcpu->run->debug.arch.dr7 = dr7;
5712 + vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
5713 + vcpu->run->debug.arch.exception = DB_VECTOR;
5714 + vcpu->run->exit_reason = KVM_EXIT_DEBUG;
5715 + return 0;
5716 + } else {
5717 + vcpu->arch.dr6 &= ~DR_TRAP_BITS;
5718 + vcpu->arch.dr6 |= DR6_BD | DR6_RTM;
5719 + kvm_queue_exception(vcpu, DB_VECTOR);
5720 + return 1;
5721 + }
5722 + }
5723 +
5724 + if (vcpu->guest_debug == 0) {
5725 + exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
5726 +
5727 + /*
5728 + * No more DR vmexits; force a reload of the debug registers
5729 + * and reenter on this instruction. The next vmexit will
5730 + * retrieve the full state of the debug registers.
5731 + */
5732 + vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
5733 + return 1;
5734 + }
5735 +
5736 + reg = DEBUG_REG_ACCESS_REG(exit_qualification);
5737 + if (exit_qualification & TYPE_MOV_FROM_DR) {
5738 + unsigned long val;
5739 +
5740 + if (kvm_get_dr(vcpu, dr, &val))
5741 + return 1;
5742 + kvm_register_write(vcpu, reg, val);
5743 + } else
5744 + if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg)))
5745 + return 1;
5746 +
5747 + return kvm_skip_emulated_instruction(vcpu);
5748 +}
5749 +
5750 +static u64 vmx_get_dr6(struct kvm_vcpu *vcpu)
5751 +{
5752 + return vcpu->arch.dr6;
5753 +}
5754 +
5755 +static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
5756 +{
5757 +}
5758 +
5759 +static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
5760 +{
5761 + get_debugreg(vcpu->arch.db[0], 0);
5762 + get_debugreg(vcpu->arch.db[1], 1);
5763 + get_debugreg(vcpu->arch.db[2], 2);
5764 + get_debugreg(vcpu->arch.db[3], 3);
5765 + get_debugreg(vcpu->arch.dr6, 6);
5766 + vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
5767 +
5768 + vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
5769 + exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
5770 +}
5771 +
5772 +static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
5773 +{
5774 + vmcs_writel(GUEST_DR7, val);
5775 +}
5776 +
5777 +static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
5778 +{
5779 + kvm_apic_update_ppr(vcpu);
5780 + return 1;
5781 +}
5782 +
5783 +static int handle_interrupt_window(struct kvm_vcpu *vcpu)
5784 +{
5785 + exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
5786 +
5787 + kvm_make_request(KVM_REQ_EVENT, vcpu);
5788 +
5789 + ++vcpu->stat.irq_window_exits;
5790 + return 1;
5791 +}
5792 +
5793 +static int handle_vmcall(struct kvm_vcpu *vcpu)
5794 +{
5795 + return kvm_emulate_hypercall(vcpu);
5796 +}
5797 +
5798 +static int handle_invd(struct kvm_vcpu *vcpu)
5799 +{
5800 + return kvm_emulate_instruction(vcpu, 0);
5801 +}
5802 +
5803 +static int handle_invlpg(struct kvm_vcpu *vcpu)
5804 +{
5805 + unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5806 +
5807 + kvm_mmu_invlpg(vcpu, exit_qualification);
5808 + return kvm_skip_emulated_instruction(vcpu);
5809 +}
5810 +
5811 +static int handle_rdpmc(struct kvm_vcpu *vcpu)
5812 +{
5813 + int err;
5814 +
5815 + err = kvm_rdpmc(vcpu);
5816 + return kvm_complete_insn_gp(vcpu, err);
5817 +}
5818 +
5819 +static int handle_wbinvd(struct kvm_vcpu *vcpu)
5820 +{
5821 + return kvm_emulate_wbinvd(vcpu);
5822 +}
5823 +
5824 +static int handle_xsetbv(struct kvm_vcpu *vcpu)
5825 +{
5826 + u64 new_bv = kvm_read_edx_eax(vcpu);
5827 + u32 index = kvm_rcx_read(vcpu);
5828 +
5829 + if (kvm_set_xcr(vcpu, index, new_bv) == 0)
5830 + return kvm_skip_emulated_instruction(vcpu);
5831 + return 1;
5832 +}
5833 +
5834 +static int handle_apic_access(struct kvm_vcpu *vcpu)
5835 +{
5836 + if (likely(fasteoi)) {
5837 + unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5838 + int access_type, offset;
5839 +
5840 + access_type = exit_qualification & APIC_ACCESS_TYPE;
5841 + offset = exit_qualification & APIC_ACCESS_OFFSET;
5842 + /*
5843 + * A sane guest uses MOV to write the EOI register and does not
5844 + * care about the written value, so short-circuit here and avoid
5845 + * heavy instruction emulation.
5846 + */
5847 + if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
5848 + (offset == APIC_EOI)) {
5849 + kvm_lapic_set_eoi(vcpu);
5850 + return kvm_skip_emulated_instruction(vcpu);
5851 + }
5852 + }
5853 + return kvm_emulate_instruction(vcpu, 0);
5854 +}
5855 +
5856 +static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
5857 +{
5858 + unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5859 + int vector = exit_qualification & 0xff;
5860 +
5861 + /* EOI-induced VM exit is trap-like and thus no need to adjust IP */
5862 + kvm_apic_set_eoi_accelerated(vcpu, vector);
5863 + return 1;
5864 +}
5865 +
5866 +static int handle_apic_write(struct kvm_vcpu *vcpu)
5867 +{
5868 + unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5869 + u32 offset = exit_qualification & 0xfff;
5870 +
5871 + /* APIC-write VM exit is trap-like and thus no need to adjust IP */
5872 + kvm_apic_write_nodecode(vcpu, offset);
5873 + return 1;
5874 +}
5875 +
5876 +static int handle_task_switch(struct kvm_vcpu *vcpu)
5877 +{
5878 + struct vcpu_vmx *vmx = to_vmx(vcpu);
5879 + unsigned long exit_qualification;
5880 + bool has_error_code = false;
5881 + u32 error_code = 0;
5882 + u16 tss_selector;
5883 + int reason, type, idt_v, idt_index;
5884 +
5885 + idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
5886 + idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
5887 + type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
5888 +
5889 + exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5890 +
5891 + reason = (u32)exit_qualification >> 30;
5892 + if (reason == TASK_SWITCH_GATE && idt_v) {
5893 + switch (type) {
5894 + case INTR_TYPE_NMI_INTR:
5895 + vcpu->arch.nmi_injected = false;
5896 + vmx_set_nmi_mask(vcpu, true);
5897 + break;
5898 + case INTR_TYPE_EXT_INTR:
5899 + case INTR_TYPE_SOFT_INTR:
5900 + kvm_clear_interrupt_queue(vcpu);
5901 + break;
5902 + case INTR_TYPE_HARD_EXCEPTION:
5903 + if (vmx->idt_vectoring_info &
5904 + VECTORING_INFO_DELIVER_CODE_MASK) {
5905 + has_error_code = true;
5906 + error_code =
5907 + vmcs_read32(IDT_VECTORING_ERROR_CODE);
5908 + }
5909 + /* fall through */
5910 + case INTR_TYPE_SOFT_EXCEPTION:
5911 + kvm_clear_exception_queue(vcpu);
5912 + break;
5913 + default:
5914 + break;
5915 + }
5916 + }
5917 + tss_selector = exit_qualification;
5918 +
5919 + if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
5920 + type != INTR_TYPE_EXT_INTR &&
5921 + type != INTR_TYPE_NMI_INTR))
5922 + WARN_ON(!skip_emulated_instruction(vcpu));
5923 +
5924 + /*
5925 + * TODO: What about debug traps on tss switch?
5926 + * Are we supposed to inject them and update dr6?
5927 + */
5928 + return kvm_task_switch(vcpu, tss_selector,
5929 + type == INTR_TYPE_SOFT_INTR ? idt_index : -1,
5930 + reason, has_error_code, error_code);
5931 +}
5932 +
5933 +static int handle_ept_violation(struct kvm_vcpu *vcpu)
5934 +{
5935 + unsigned long exit_qualification;
5936 + gpa_t gpa;
5937 + u64 error_code;
5938 +
5939 + exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5940 +
5941 + /*
5942 + * If the EPT violation happened while executing IRET from an NMI,
5943 + * the "blocked by NMI" bit has to be set before the next VM entry.
5944 + * There are errata that may cause this bit to not be set:
5945 + * AAK134, BY25.
5946 + */
5947 + if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
5948 + enable_vnmi &&
5949 + (exit_qualification & INTR_INFO_UNBLOCK_NMI))
5950 + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
5951 +
5952 + gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
5953 + trace_kvm_page_fault(gpa, exit_qualification);
5954 +
5955 + /* Is it a read fault? */
5956 + error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
5957 + ? PFERR_USER_MASK : 0;
5958 + /* Is it a write fault? */
5959 + error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE)
5960 + ? PFERR_WRITE_MASK : 0;
5961 + /* Is it a fetch fault? */
5962 + error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
5963 + ? PFERR_FETCH_MASK : 0;
5964 + /* Is the EPT page table entry present? */
5965 + error_code |= (exit_qualification &
5966 + (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE |
5967 + EPT_VIOLATION_EXECUTABLE))
5968 + ? PFERR_PRESENT_MASK : 0;
5969 +
5970 + error_code |= (exit_qualification & 0x100) != 0 ?
5971 + PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
5972 +
5973 + vcpu->arch.exit_qualification = exit_qualification;
5974 + return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
5975 +}
5976 +
5977 +static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
5978 +{
5979 + gpa_t gpa;
5980 +
5981 + /*
5982 + * A nested guest cannot optimize MMIO vmexits, because we have an
5983 + * nGPA here instead of the required GPA.
5984 + */
5985 + gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
5986 + if (!is_guest_mode(vcpu) &&
5987 + !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
5988 + trace_kvm_fast_mmio(gpa);
5989 + return kvm_skip_emulated_instruction(vcpu);
5990 + }
5991 +
5992 + return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
5993 +}
5994 +
5995 +static int handle_nmi_window(struct kvm_vcpu *vcpu)
5996 +{
5997 + WARN_ON_ONCE(!enable_vnmi);
5998 + exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
5999 + ++vcpu->stat.nmi_window_exits;
6000 + kvm_make_request(KVM_REQ_EVENT, vcpu);
6001 +
6002 + return 1;
6003 +}
6004 +
6005 +static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
6006 +{
6007 + struct vcpu_vmx *vmx = to_vmx(vcpu);
6008 + bool intr_window_requested;
6009 + unsigned count = 130;
6010 +
6011 + /*
6012 + * We should never reach the point where we are emulating L2
6013 + * due to invalid guest state as that means we incorrectly
6014 + * allowed a nested VMEntry with an invalid vmcs12.
6015 + */
6016 + WARN_ON_ONCE(vmx->emulation_required && vmx->nested.nested_run_pending);
6017 +
6018 + intr_window_requested = exec_controls_get(vmx) &
6019 + CPU_BASED_INTR_WINDOW_EXITING;
6020 +
6021 + while (vmx->emulation_required && count-- != 0) {
6022 + if (intr_window_requested && vmx_interrupt_allowed(vcpu))
6023 + return handle_interrupt_window(&vmx->vcpu);
6024 +
6025 + if (kvm_test_request(KVM_REQ_EVENT, vcpu))
6026 + return 1;
6027 +
6028 + if (!kvm_emulate_instruction(vcpu, 0))
6029 + return 0;
6030 +
6031 + if (vmx->emulation_required && !vmx->rmode.vm86_active &&
6032 + vcpu->arch.exception.pending) {
6033 + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
6034 + vcpu->run->internal.suberror =
6035 + KVM_INTERNAL_ERROR_EMULATION;
6036 + vcpu->run->internal.ndata = 0;
6037 + return 0;
6038 + }
6039 +
6040 + if (vcpu->arch.halt_request) {
6041 + vcpu->arch.halt_request = 0;
6042 + return kvm_vcpu_halt(vcpu);
6043 + }
6044 +
6045 + /*
6046 + * Note, return 1 and not 0; vcpu_run() is responsible for
6047 + * morphing the pending signal into the proper return code.
6048 + */
6049 + if (signal_pending(current))
6050 + return 1;
6051 +
6052 + if (need_resched())
6053 + schedule();
6054 + }
6055 +
6056 + return 1;
6057 +}
6058 +
6059 +static void grow_ple_window(struct kvm_vcpu *vcpu)
6060 +{
6061 + struct vcpu_vmx *vmx = to_vmx(vcpu);
6062 + unsigned int old = vmx->ple_window;
6063 +
6064 + vmx->ple_window = __grow_ple_window(old, ple_window,
6065 + ple_window_grow,
6066 + ple_window_max);
6067 +
6068 + if (vmx->ple_window != old) {
6069 + vmx->ple_window_dirty = true;
6070 + trace_kvm_ple_window_update(vcpu->vcpu_id,
6071 + vmx->ple_window, old);
6072 + }
6073 +}
6074 +
6075 +static void shrink_ple_window(struct kvm_vcpu *vcpu)
6076 +{
6077 + struct vcpu_vmx *vmx = to_vmx(vcpu);
6078 + unsigned int old = vmx->ple_window;
6079 +
6080 + vmx->ple_window = __shrink_ple_window(old, ple_window,
6081 + ple_window_shrink,
6082 + ple_window);
6083 +
6084 + if (vmx->ple_window != old) {
6085 + vmx->ple_window_dirty = true;
6086 + trace_kvm_ple_window_update(vcpu->vcpu_id,
6087 + vmx->ple_window, old);
6088 + }
6089 +}
6090 +
6091 +/*
6092 + * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
6093 + */
6094 +static void wakeup_handler(void)
6095 +{
6096 + struct kvm_vcpu *vcpu;
6097 + int cpu = smp_processor_id();
6098 +
6099 + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
6100 + list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
6101 + blocked_vcpu_list) {
6102 + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
6103 +
6104 + if (pi_test_on(pi_desc) == 1)
6105 + kvm_vcpu_kick(vcpu);
6106 + }
6107 + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
6108 +}
6109 +
6110 +static void vmx_enable_tdp(void)
6111 +{
6112 + kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
6113 + enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull,
6114 + enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
6115 + 0ull, VMX_EPT_EXECUTABLE_MASK,
6116 + cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
6117 + VMX_EPT_RWX_MASK, 0ull);
6118 +
6119 + ept_set_mmio_spte_mask();
6120 + kvm_enable_tdp();
6121 +}
6122 +
6123 +/*
6124 + * Indicate a vcpu busy-waiting on a spinlock. We do not enable PAUSE
6125 + * exiting, so we only get here on a CPU with PAUSE-loop exiting.
6126 + */
6127 +static int handle_pause(struct kvm_vcpu *vcpu)
6128 +{
6129 + if (!kvm_pause_in_guest(vcpu->kvm))
6130 + grow_ple_window(vcpu);
6131 +
6132 + /*
6133 + * Intel sdm vol3 ch-25.1.3 says: The "PAUSE-loop exiting"
6134 + * VM-execution control is ignored if CPL > 0. OTOH, KVM
6135 + * never set PAUSE_EXITING and just set PLE if supported,
6136 + * so the vcpu must be CPL=0 if it gets a PAUSE exit.
6137 + */
6138 + kvm_vcpu_on_spin(vcpu, true);
6139 + return kvm_skip_emulated_instruction(vcpu);
6140 +}
6141 +
6142 +static int handle_nop(struct kvm_vcpu *vcpu)
6143 +{
6144 + return kvm_skip_emulated_instruction(vcpu);
6145 +}
6146 +
6147 +static int handle_mwait(struct kvm_vcpu *vcpu)
6148 +{
6149 + printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
6150 + return handle_nop(vcpu);
6151 +}
6152 +
6153 +static int handle_invalid_op(struct kvm_vcpu *vcpu)
6154 +{
6155 + kvm_queue_exception(vcpu, UD_VECTOR);
6156 + return 1;
6157 +}
6158 +
6159 +static int handle_monitor_trap(struct kvm_vcpu *vcpu)
6160 +{
6161 + return 1;
6162 +}
6163 +
6164 +static int handle_monitor(struct kvm_vcpu *vcpu)
6165 +{
6166 + printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
6167 + return handle_nop(vcpu);
6168 +}
6169 +
6170 +static int handle_invpcid(struct kvm_vcpu *vcpu)
6171 +{
6172 + u32 vmx_instruction_info;
6173 + unsigned long type;
6174 + bool pcid_enabled;
6175 + gva_t gva;
6176 + struct x86_exception e;
6177 + unsigned i;
6178 + unsigned long roots_to_free = 0;
6179 + struct {
6180 + u64 pcid;
6181 + u64 gla;
6182 + } operand;
6183 +
6184 + if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
6185 + kvm_queue_exception(vcpu, UD_VECTOR);
6186 + return 1;
6187 + }
6188 +
6189 + vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
6190 + type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
6191 +
6192 + if (type > 3) {
6193 + kvm_inject_gp(vcpu, 0);
6194 + return 1;
6195 + }
6196 +
6197 + /* According to the Intel instruction reference, the memory operand
6198 + * is read even if it isn't needed (e.g., for type==all)
6199 + */
6200 + if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
6201 + vmx_instruction_info, false,
6202 + sizeof(operand), &gva))
6203 + return 1;
6204 +
6205 + if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
6206 + kvm_inject_page_fault(vcpu, &e);
6207 + return 1;
6208 + }
6209 +
6210 + if (operand.pcid >> 12 != 0) {
6211 + kvm_inject_gp(vcpu, 0);
6212 + return 1;
6213 + }
6214 +
6215 + pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
6216 +
6217 + switch (type) {
6218 + case INVPCID_TYPE_INDIV_ADDR:
6219 + if ((!pcid_enabled && (operand.pcid != 0)) ||
6220 + is_noncanonical_address(operand.gla, vcpu)) {
6221 + kvm_inject_gp(vcpu, 0);
6222 + return 1;
6223 + }
6224 + kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid);
6225 + return kvm_skip_emulated_instruction(vcpu);
6226 +
6227 + case INVPCID_TYPE_SINGLE_CTXT:
6228 + if (!pcid_enabled && (operand.pcid != 0)) {
6229 + kvm_inject_gp(vcpu, 0);
6230 + return 1;
6231 + }
6232 +
6233 + if (kvm_get_active_pcid(vcpu) == operand.pcid) {
6234 + kvm_mmu_sync_roots(vcpu);
6235 + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
6236 + }
6237 +
6238 + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
6239 + if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].cr3)
6240 + == operand.pcid)
6241 + roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
6242 +
6243 + kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
6244 + /*
6245 + * If neither the current cr3 nor any of the prev_roots use the
6246 + * given PCID, then nothing needs to be done here because a
6247 + * resync will happen anyway before switching to any other CR3.
6248 + */
6249 +
6250 + return kvm_skip_emulated_instruction(vcpu);
6251 +
6252 + case INVPCID_TYPE_ALL_NON_GLOBAL:
6253 + /*
6254 + * Currently, KVM doesn't mark global entries in the shadow
6255 + * page tables, so a non-global flush just degenerates to a
6256 + * global flush. If needed, we could optimize this later by
6257 + * keeping track of global entries in shadow page tables.
6258 + */
6259 +
6260 + /* fall-through */
6261 + case INVPCID_TYPE_ALL_INCL_GLOBAL:
6262 + kvm_mmu_unload(vcpu);
6263 + return kvm_skip_emulated_instruction(vcpu);
6264 +
6265 + default:
6266 + BUG(); /* We have already checked above that type <= 3 */
6267 + }
6268 +}
6269 +
6270 +static int handle_pml_full(struct kvm_vcpu *vcpu)
6271 +{
6272 + unsigned long exit_qualification;
6273 +
6274 + trace_kvm_pml_full(vcpu->vcpu_id);
6275 +
6276 + exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
6277 +
6278 + /*
6279 + * If the PML buffer became full while executing IRET from an NMI,
6280 + * the "blocked by NMI" bit has to be set before the next VM entry.
6281 + */
6282 + if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
6283 + enable_vnmi &&
6284 + (exit_qualification & INTR_INFO_UNBLOCK_NMI))
6285 + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
6286 + GUEST_INTR_STATE_NMI);
6287 +
6288 + /*
6289 + * The PML buffer was already flushed at the beginning of the VMEXIT.
6290 + * Nothing to do here, and there's no userspace involvement needed for PML.
6291 + */
6292 + return 1;
6293 +}
6294 +
6295 +static int handle_preemption_timer(struct kvm_vcpu *vcpu)
6296 +{
6297 + struct vcpu_vmx *vmx = to_vmx(vcpu);
6298 +
6299 + if (!vmx->req_immediate_exit &&
6300 + !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled))
6301 + kvm_lapic_expired_hv_timer(vcpu);
6302 +
6303 + return 1;
6304 +}
6305 +
6306 +/*
6307 + * When nested=0, all VMX instruction VM Exits filter here. The handlers
6308 + * are overwritten by nested_vmx_setup() when nested=1.
6309 + */
6310 +static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
6311 +{
6312 + kvm_queue_exception(vcpu, UD_VECTOR);
6313 + return 1;
6314 +}
6315 +
6316 +static int handle_encls(struct kvm_vcpu *vcpu)
6317 +{
6318 + /*
6319 + * SGX virtualization is not yet supported. There is no software
6320 + * enable bit for SGX, so we have to trap ENCLS and inject a #UD
6321 + * to prevent the guest from executing ENCLS.
6322 + */
6323 + kvm_queue_exception(vcpu, UD_VECTOR);
6324 + return 1;
6325 +}
6326 +
6327 +/*
6328 + * The exit handlers return 1 if the exit was handled fully and guest execution
6329 + * may resume. Otherwise they set the kvm_run parameter to indicate what needs
6330 + * to be done to userspace and return 0.
6331 + */
6332 +static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
6333 + [EXIT_REASON_EXCEPTION_NMI] = handle_exception_nmi,
6334 + [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
6335 + [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
6336 + [EXIT_REASON_NMI_WINDOW] = handle_nmi_window,
6337 + [EXIT_REASON_IO_INSTRUCTION] = handle_io,
6338 + [EXIT_REASON_CR_ACCESS] = handle_cr,
6339 + [EXIT_REASON_DR_ACCESS] = handle_dr,
6340 + [EXIT_REASON_CPUID] = kvm_emulate_cpuid,
6341 + [EXIT_REASON_MSR_READ] = kvm_emulate_rdmsr,
6342 + [EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr,
6343 + [EXIT_REASON_INTERRUPT_WINDOW] = handle_interrupt_window,
6344 + [EXIT_REASON_HLT] = kvm_emulate_halt,
6345 + [EXIT_REASON_INVD] = handle_invd,
6346 + [EXIT_REASON_INVLPG] = handle_invlpg,
6347 + [EXIT_REASON_RDPMC] = handle_rdpmc,
6348 + [EXIT_REASON_VMCALL] = handle_vmcall,
6349 + [EXIT_REASON_VMCLEAR] = handle_vmx_instruction,
6350 + [EXIT_REASON_VMLAUNCH] = handle_vmx_instruction,
6351 + [EXIT_REASON_VMPTRLD] = handle_vmx_instruction,
6352 + [EXIT_REASON_VMPTRST] = handle_vmx_instruction,
6353 + [EXIT_REASON_VMREAD] = handle_vmx_instruction,
6354 + [EXIT_REASON_VMRESUME] = handle_vmx_instruction,
6355 + [EXIT_REASON_VMWRITE] = handle_vmx_instruction,
6356 + [EXIT_REASON_VMOFF] = handle_vmx_instruction,
6357 + [EXIT_REASON_VMON] = handle_vmx_instruction,
6358 + [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
6359 + [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
6360 + [EXIT_REASON_APIC_WRITE] = handle_apic_write,
6361 + [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced,
6362 + [EXIT_REASON_WBINVD] = handle_wbinvd,
6363 + [EXIT_REASON_XSETBV] = handle_xsetbv,
6364 + [EXIT_REASON_TASK_SWITCH] = handle_task_switch,
6365 + [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
6366 + [EXIT_REASON_GDTR_IDTR] = handle_desc,
6367 + [EXIT_REASON_LDTR_TR] = handle_desc,
6368 + [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
6369 + [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
6370 + [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
6371 + [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait,
6372 + [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap,
6373 + [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor,
6374 + [EXIT_REASON_INVEPT] = handle_vmx_instruction,
6375 + [EXIT_REASON_INVVPID] = handle_vmx_instruction,
6376 + [EXIT_REASON_RDRAND] = handle_invalid_op,
6377 + [EXIT_REASON_RDSEED] = handle_invalid_op,
6378 + [EXIT_REASON_PML_FULL] = handle_pml_full,
6379 + [EXIT_REASON_INVPCID] = handle_invpcid,
6380 + [EXIT_REASON_VMFUNC] = handle_vmx_instruction,
6381 + [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer,
6382 + [EXIT_REASON_ENCLS] = handle_encls,
6383 +};
6384 +
6385 +static const int kvm_vmx_max_exit_handlers =
6386 + ARRAY_SIZE(kvm_vmx_exit_handlers);
6387 +
6388 +static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
6389 +{
6390 + *info1 = vmcs_readl(EXIT_QUALIFICATION);
6391 + *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
6392 +}
6393 +
6394 +static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
6395 +{
6396 + if (vmx->pml_pg) {
6397 + __free_page(vmx->pml_pg);
6398 + vmx->pml_pg = NULL;
6399 + }
6400 +}
6401 +
6402 +static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
6403 +{
6404 + struct vcpu_vmx *vmx = to_vmx(vcpu);
6405 + u64 *pml_buf;
6406 + u16 pml_idx;
6407 +
6408 + pml_idx = vmcs_read16(GUEST_PML_INDEX);
6409 +
6410 + /* Do nothing if PML buffer is empty */
6411 + if (pml_idx == (PML_ENTITY_NUM - 1))
6412 + return;
6413 +
6414 + /* PML index always points to next available PML buffer entity */
6415 + if (pml_idx >= PML_ENTITY_NUM)
6416 + pml_idx = 0;
6417 + else
6418 + pml_idx++;
6419 +
6420 + pml_buf = page_address(vmx->pml_pg);
6421 + for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
6422 + u64 gpa;
6423 +
6424 + gpa = pml_buf[pml_idx];
6425 + WARN_ON(gpa & (PAGE_SIZE - 1));
6426 + kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
6427 + }
6428 +
6429 + /* reset PML index */
6430 + vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
6431 +}
6432 +
6433 +/*
6434 + * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap.
6435 + * Called before reporting dirty_bitmap to userspace.
6436 + */
6437 +static void kvm_flush_pml_buffers(struct kvm *kvm)
6438 +{
6439 + int i;
6440 + struct kvm_vcpu *vcpu;
6441 + /*
6442 + * We only need to kick the vcpus out of guest mode here; the PML
6443 + * buffer is flushed at the beginning of every VMEXIT, so only vcpus
6444 + * that are running in guest mode can have unflushed GPAs in their
6445 + * PML buffers.
6446 + */
6447 + kvm_for_each_vcpu(i, vcpu, kvm)
6448 + kvm_vcpu_kick(vcpu);
6449 +}
6450 +
6451 +static void vmx_dump_sel(char *name, uint32_t sel)
6452 +{
6453 + pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
6454 + name, vmcs_read16(sel),
6455 + vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
6456 + vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
6457 + vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
6458 +}
6459 +
6460 +static void vmx_dump_dtsel(char *name, uint32_t limit)
6461 +{
6462 + pr_err("%s limit=0x%08x, base=0x%016lx\n",
6463 + name, vmcs_read32(limit),
6464 + vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
6465 +}
6466 +
6467 +void dump_vmcs(void)
6468 +{
6469 + u32 vmentry_ctl, vmexit_ctl;
6470 + u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control;
6471 + unsigned long cr4;
6472 + u64 efer;
6473 + int i, n;
6474 +
6475 + if (!dump_invalid_vmcs) {
6476 + pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n");
6477 + return;
6478 + }
6479 +
6480 + vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
6481 + vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
6482 + cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
6483 + pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
6484 + cr4 = vmcs_readl(GUEST_CR4);
6485 + efer = vmcs_read64(GUEST_IA32_EFER);
6486 + secondary_exec_control = 0;
6487 + if (cpu_has_secondary_exec_ctrls())
6488 + secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
6489 +
6490 + pr_err("*** Guest State ***\n");
6491 + pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
6492 + vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
6493 + vmcs_readl(CR0_GUEST_HOST_MASK));
6494 + pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
6495 + cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
6496 + pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
6497 + if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
6498 + (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA))
6499 + {
6500 + pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n",
6501 + vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
6502 + pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n",
6503 + vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3));
6504 + }
6505 + pr_err("RSP = 0x%016lx RIP = 0x%016lx\n",
6506 + vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));
6507 + pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n",
6508 + vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7));
6509 + pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
6510 + vmcs_readl(GUEST_SYSENTER_ESP),
6511 + vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP));
6512 + vmx_dump_sel("CS: ", GUEST_CS_SELECTOR);
6513 + vmx_dump_sel("DS: ", GUEST_DS_SELECTOR);
6514 + vmx_dump_sel("SS: ", GUEST_SS_SELECTOR);
6515 + vmx_dump_sel("ES: ", GUEST_ES_SELECTOR);
6516 + vmx_dump_sel("FS: ", GUEST_FS_SELECTOR);
6517 + vmx_dump_sel("GS: ", GUEST_GS_SELECTOR);
6518 + vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT);
6519 + vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
6520 + vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
6521 + vmx_dump_sel("TR: ", GUEST_TR_SELECTOR);
6522 + if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) ||
6523 + (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER)))
6524 + pr_err("EFER = 0x%016llx PAT = 0x%016llx\n",
6525 + efer, vmcs_read64(GUEST_IA32_PAT));
6526 + pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n",
6527 + vmcs_read64(GUEST_IA32_DEBUGCTL),
6528 + vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
6529 + if (cpu_has_load_perf_global_ctrl() &&
6530 + vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
6531 + pr_err("PerfGlobCtl = 0x%016llx\n",
6532 + vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
6533 + if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
6534 + pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS));
6535 + pr_err("Interruptibility = %08x ActivityState = %08x\n",
6536 + vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
6537 + vmcs_read32(GUEST_ACTIVITY_STATE));
6538 + if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
6539 + pr_err("InterruptStatus = %04x\n",
6540 + vmcs_read16(GUEST_INTR_STATUS));
6541 +
6542 + pr_err("*** Host State ***\n");
6543 + pr_err("RIP = 0x%016lx RSP = 0x%016lx\n",
6544 + vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
6545 + pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
6546 + vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR),
6547 + vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR),
6548 + vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR),
6549 + vmcs_read16(HOST_TR_SELECTOR));
6550 + pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
6551 + vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE),
6552 + vmcs_readl(HOST_TR_BASE));
6553 + pr_err("GDTBase=%016lx IDTBase=%016lx\n",
6554 + vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE));
6555 + pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",
6556 + vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3),
6557 + vmcs_readl(HOST_CR4));
6558 + pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
6559 + vmcs_readl(HOST_IA32_SYSENTER_ESP),
6560 + vmcs_read32(HOST_IA32_SYSENTER_CS),
6561 + vmcs_readl(HOST_IA32_SYSENTER_EIP));
6562 + if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER))
6563 + pr_err("EFER = 0x%016llx PAT = 0x%016llx\n",
6564 + vmcs_read64(HOST_IA32_EFER),
6565 + vmcs_read64(HOST_IA32_PAT));
6566 + if (cpu_has_load_perf_global_ctrl() &&
6567 + vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
6568 + pr_err("PerfGlobCtl = 0x%016llx\n",
6569 + vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
6570 +
6571 + pr_err("*** Control State ***\n");
6572 + pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
6573 + pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control);
6574 + pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl);
6575 + pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
6576 + vmcs_read32(EXCEPTION_BITMAP),
6577 + vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
6578 + vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH));
6579 + pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
6580 + vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
6581 + vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE),
6582 + vmcs_read32(VM_ENTRY_INSTRUCTION_LEN));
6583 + pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
6584 + vmcs_read32(VM_EXIT_INTR_INFO),
6585 + vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
6586 + vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
6587 + pr_err(" reason=%08x qualification=%016lx\n",
6588 + vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION));
6589 + pr_err("IDTVectoring: info=%08x errcode=%08x\n",
6590 + vmcs_read32(IDT_VECTORING_INFO_FIELD),
6591 + vmcs_read32(IDT_VECTORING_ERROR_CODE));
6592 + pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET));
6593 + if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
6594 + pr_err("TSC Multiplier = 0x%016llx\n",
6595 + vmcs_read64(TSC_MULTIPLIER));
6596 + if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) {
6597 + if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
6598 + u16 status = vmcs_read16(GUEST_INTR_STATUS);
6599 + pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff);
6600 + }
6601 + pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
6602 + if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
6603 + pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR));
6604 + pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR));
6605 + }
6606 + if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
6607 + pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
6608 + if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
6609 + pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
6610 + n = vmcs_read32(CR3_TARGET_COUNT);
6611 + for (i = 0; i + 1 < n; i += 4)
6612 + pr_err("CR3 target%u=%016lx target%u=%016lx\n",
6613 + i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2),
6614 + i + 1, vmcs_readl(CR3_TARGET_VALUE0 + i * 2 + 2));
6615 + if (i < n)
6616 + pr_err("CR3 target%u=%016lx\n",
6617 + i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2));
6618 + if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
6619 + pr_err("PLE Gap=%08x Window=%08x\n",
6620 + vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
6621 + if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
6622 + pr_err("Virtual processor ID = 0x%04x\n",
6623 + vmcs_read16(VIRTUAL_PROCESSOR_ID));
6624 +}
6625 +
6626 +/*
6627 + * The guest has exited. See if we can fix it or if we need userspace
6628 + * assistance.
6629 + */
6630 +static int vmx_handle_exit(struct kvm_vcpu *vcpu,
6631 + enum exit_fastpath_completion exit_fastpath)
6632 +{
6633 + struct vcpu_vmx *vmx = to_vmx(vcpu);
6634 + u32 exit_reason = vmx->exit_reason;
6635 + u32 vectoring_info = vmx->idt_vectoring_info;
6636 +
6637 + trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
6638 +
6639 + /*
6640 + * Flush the PML buffer of logged GPAs; this keeps dirty_bitmap more
6641 + * up to date. Another benefit is that in kvm_vm_ioctl_get_dirty_log,
6642 + * before querying dirty_bitmap, we only need to kick all vcpus out of
6643 + * guest mode, since once a vcpu is back in root mode its PML buffer
6644 + * must already have been flushed.
6645 + */
6646 + if (enable_pml)
6647 + vmx_flush_pml_buffer(vcpu);
6648 +
6649 + /* If guest state is invalid, start emulating */
6650 + if (vmx->emulation_required)
6651 + return handle_invalid_guest_state(vcpu);
6652 +
6653 + if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason))
6654 + return nested_vmx_reflect_vmexit(vcpu, exit_reason);
6655 +
6656 + if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
6657 + dump_vmcs();
6658 + vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
6659 + vcpu->run->fail_entry.hardware_entry_failure_reason
6660 + = exit_reason;
6661 + return 0;
6662 + }
6663 +
6664 + if (unlikely(vmx->fail)) {
6665 + dump_vmcs();
6666 + vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
6667 + vcpu->run->fail_entry.hardware_entry_failure_reason
6668 + = vmcs_read32(VM_INSTRUCTION_ERROR);
6669 + return 0;
6670 + }
6671 +
6672 + /*
6673 + * Note:
6674 + * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it was caused by a
6675 + * delivery event, since that indicates the guest is accessing MMIO.
6676 + * The vm-exit can be triggered again after returning to the guest,
6677 + * which would cause an infinite loop.
6678 + */
6679 + if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
6680 + (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
6681 + exit_reason != EXIT_REASON_EPT_VIOLATION &&
6682 + exit_reason != EXIT_REASON_PML_FULL &&
6683 + exit_reason != EXIT_REASON_TASK_SWITCH)) {
6684 + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
6685 + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
6686 + vcpu->run->internal.ndata = 3;
6687 + vcpu->run->internal.data[0] = vectoring_info;
6688 + vcpu->run->internal.data[1] = exit_reason;
6689 + vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;
6690 + if (exit_reason == EXIT_REASON_EPT_MISCONFIG) {
6691 + vcpu->run->internal.ndata++;
6692 + vcpu->run->internal.data[3] =
6693 + vmcs_read64(GUEST_PHYSICAL_ADDRESS);
6694 + }
6695 + return 0;
6696 + }
6697 +
6698 + if (unlikely(!enable_vnmi &&
6699 + vmx->loaded_vmcs->soft_vnmi_blocked)) {
6700 + if (vmx_interrupt_allowed(vcpu)) {
6701 + vmx->loaded_vmcs->soft_vnmi_blocked = 0;
6702 + } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
6703 + vcpu->arch.nmi_pending) {
6704 + /*
6705 + * This CPU doesn't help us find the end of an
6706 + * NMI-blocked window if the guest runs with IRQs
6707 + * disabled. So we pull the trigger after 1 s of
6708 + * futile waiting, but inform the user about this.
6709 + */
6710 + printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
6711 + "state on VCPU %d after 1 s timeout\n",
6712 + __func__, vcpu->vcpu_id);
6713 + vmx->loaded_vmcs->soft_vnmi_blocked = 0;
6714 + }
6715 + }
6716 +
6717 + if (exit_fastpath == EXIT_FASTPATH_SKIP_EMUL_INS) {
6718 + kvm_skip_emulated_instruction(vcpu);
6719 + return 1;
6720 + } else if (exit_reason < kvm_vmx_max_exit_handlers
6721 + && kvm_vmx_exit_handlers[exit_reason]) {
6722 +#ifdef CONFIG_RETPOLINE
6723 + if (exit_reason == EXIT_REASON_MSR_WRITE)
6724 + return kvm_emulate_wrmsr(vcpu);
6725 + else if (exit_reason == EXIT_REASON_PREEMPTION_TIMER)
6726 + return handle_preemption_timer(vcpu);
6727 + else if (exit_reason == EXIT_REASON_INTERRUPT_WINDOW)
6728 + return handle_interrupt_window(vcpu);
6729 + else if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
6730 + return handle_external_interrupt(vcpu);
6731 + else if (exit_reason == EXIT_REASON_HLT)
6732 + return kvm_emulate_halt(vcpu);
6733 + else if (exit_reason == EXIT_REASON_EPT_MISCONFIG)
6734 + return handle_ept_misconfig(vcpu);
6735 +#endif
6736 + return kvm_vmx_exit_handlers[exit_reason](vcpu);
6737 + } else {
6738 + vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
6739 + exit_reason);
6740 + dump_vmcs();
6741 + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
6742 + vcpu->run->internal.suberror =
6743 + KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
6744 + vcpu->run->internal.ndata = 1;
6745 + vcpu->run->internal.data[0] = exit_reason;
6746 + return 0;
6747 + }
6748 +}
6749 +
6750 +/*
6751 + * Software based L1D cache flush which is used when microcode providing
6752 + * the cache control MSR is not loaded.
6753 + *
6754 + * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but
6755 + * flushing it requires reading in 64 KiB because the replacement algorithm
6756 + * is not exactly LRU. This could be sized at runtime via topology
6757 + * information, but as all relevant affected CPUs have a 32 KiB L1D cache
6758 + * there is no point in doing so.
6759 + */
6760 +static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
6761 +{
6762 + int size = PAGE_SIZE << L1D_CACHE_ORDER;
6763 +
6764 + /*
6765 + * This code is only executed when the flush mode is 'cond' or
6766 + * 'always'
6767 + */
6768 + if (static_branch_likely(&vmx_l1d_flush_cond)) {
6769 + bool flush_l1d;
6770 +
6771 + /*
6772 + * Clear the per-vcpu flush bit, it gets set again
6773 + * either from vcpu_run() or from one of the unsafe
6774 + * VMEXIT handlers.
6775 + */
6776 + flush_l1d = vcpu->arch.l1tf_flush_l1d;
6777 + vcpu->arch.l1tf_flush_l1d = false;
6778 +
6779 + /*
6780 + * Clear the per-cpu flush bit, it gets set again from
6781 + * the interrupt handlers.
6782 + */
6783 + flush_l1d |= kvm_get_cpu_l1tf_flush_l1d();
6784 + kvm_clear_cpu_l1tf_flush_l1d();
6785 +
6786 + if (!flush_l1d)
6787 + return;
6788 + }
6789 +
6790 + vcpu->stat.l1d_flush++;
6791 +
6792 + if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
6793 + wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
6794 + return;
6795 + }
6796 +
6797 + asm volatile(
6798 + /* First ensure the pages are in the TLB */
6799 + "xorl %%eax, %%eax\n"
6800 + ".Lpopulate_tlb:\n\t"
6801 + "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
6802 + "addl $4096, %%eax\n\t"
6803 + "cmpl %%eax, %[size]\n\t"
6804 + "jne .Lpopulate_tlb\n\t"
6805 + "xorl %%eax, %%eax\n\t"
6806 + "cpuid\n\t"
6807 + /* Now fill the cache */
6808 + "xorl %%eax, %%eax\n"
6809 + ".Lfill_cache:\n"
6810 + "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
6811 + "addl $64, %%eax\n\t"
6812 + "cmpl %%eax, %[size]\n\t"
6813 + "jne .Lfill_cache\n\t"
6814 + "lfence\n"
6815 + :: [flush_pages] "r" (vmx_l1d_flush_pages),
6816 + [size] "r" (size)
6817 + : "eax", "ebx", "ecx", "edx");
6818 +}
6819 +
6820 +static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
6821 +{
6822 + struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6823 + int tpr_threshold;
6824 +
6825 + if (is_guest_mode(vcpu) &&
6826 + nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
6827 + return;
6828 +
6829 + tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr;
6830 + if (is_guest_mode(vcpu))
6831 + to_vmx(vcpu)->nested.l1_tpr_threshold = tpr_threshold;
6832 + else
6833 + vmcs_write32(TPR_THRESHOLD, tpr_threshold);
6834 +}
6835 +
6836 +void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
6837 +{
6838 + struct vcpu_vmx *vmx = to_vmx(vcpu);
6839 + u32 sec_exec_control;
6840 +
6841 + if (!lapic_in_kernel(vcpu))
6842 + return;
6843 +
6844 + if (!flexpriority_enabled &&
6845 + !cpu_has_vmx_virtualize_x2apic_mode())
6846 + return;
6847 +
6848 + /* Postpone execution until vmcs01 is the current VMCS. */
6849 + if (is_guest_mode(vcpu)) {
6850 + vmx->nested.change_vmcs01_virtual_apic_mode = true;
6851 + return;
6852 + }
6853 +
6854 + sec_exec_control = secondary_exec_controls_get(vmx);
6855 + sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
6856 + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
6857 +
6858 + switch (kvm_get_apic_mode(vcpu)) {
6859 + case LAPIC_MODE_INVALID:
6860 + WARN_ONCE(true, "Invalid local APIC state");
6861 + case LAPIC_MODE_DISABLED:
6862 + break;
6863 + case LAPIC_MODE_XAPIC:
6864 + if (flexpriority_enabled) {
6865 + sec_exec_control |=
6866 + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
6867 + vmx_flush_tlb(vcpu, true);
6868 + }
6869 + break;
6870 + case LAPIC_MODE_X2APIC:
6871 + if (cpu_has_vmx_virtualize_x2apic_mode())
6872 + sec_exec_control |=
6873 + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
6874 + break;
6875 + }
6876 + secondary_exec_controls_set(vmx, sec_exec_control);
6877 +
6878 + vmx_update_msr_bitmap(vcpu);
6879 +}
6880 +
6881 +static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
6882 +{
6883 + if (!is_guest_mode(vcpu)) {
6884 + vmcs_write64(APIC_ACCESS_ADDR, hpa);
6885 + vmx_flush_tlb(vcpu, true);
6886 + }
6887 +}
6888 +
6889 +static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
6890 +{
6891 + u16 status;
6892 + u8 old;
6893 +
6894 + if (max_isr == -1)
6895 + max_isr = 0;
6896 +
6897 + status = vmcs_read16(GUEST_INTR_STATUS);
6898 + old = status >> 8;
6899 + if (max_isr != old) {
6900 + status &= 0xff;
6901 + status |= max_isr << 8;
6902 + vmcs_write16(GUEST_INTR_STATUS, status);
6903 + }
6904 +}
6905 +
6906 +static void vmx_set_rvi(int vector)
6907 +{
6908 + u16 status;
6909 + u8 old;
6910 +
6911 + if (vector == -1)
6912 + vector = 0;
6913 +
6914 + status = vmcs_read16(GUEST_INTR_STATUS);
6915 + old = (u8)status & 0xff;
6916 + if ((u8)vector != old) {
6917 + status &= ~0xff;
6918 + status |= (u8)vector;
6919 + vmcs_write16(GUEST_INTR_STATUS, status);
6920 + }
6921 +}
6922 +
6923 +static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
6924 +{
6925 + /*
6926 + * When running L2, updating RVI is only relevant when
6927 + * vmcs12 virtual-interrupt-delivery is enabled.
6928 + * However, it can be enabled only when L1 also
6929 + * intercepts external interrupts, and in that case
6930 + * we should not update vmcs02 RVI but instead intercept
6931 + * the interrupt. Therefore, do nothing when running L2.
6932 + */
6933 + if (!is_guest_mode(vcpu))
6934 + vmx_set_rvi(max_irr);
6935 +}
6936 +
6937 +static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
6938 +{
6939 + struct vcpu_vmx *vmx = to_vmx(vcpu);
6940 + int max_irr;
6941 + bool max_irr_updated;
6942 +
6943 + WARN_ON(!vcpu->arch.apicv_active);
6944 + if (pi_test_on(&vmx->pi_desc)) {
6945 + pi_clear_on(&vmx->pi_desc);
6946 + /*
6947 + * IOMMU can write to PID.ON, so the barrier matters even on UP.
6948 + * But on x86 this is just a compiler barrier anyway.
6949 + */
6950 + smp_mb__after_atomic();
6951 + max_irr_updated =
6952 + kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);
6953 +
6954 + /*
6955 + * If we are running L2 and L1 has a new pending interrupt
6956 + * which can be injected, we should re-evaluate
6957 + * what should be done with this new L1 interrupt.
6958 + * If L1 intercepts external-interrupts, we should
6959 + * exit from L2 to L1. Otherwise, interrupt should be
6960 + * delivered directly to L2.
6961 + */
6962 + if (is_guest_mode(vcpu) && max_irr_updated) {
6963 + if (nested_exit_on_intr(vcpu))
6964 + kvm_vcpu_exiting_guest_mode(vcpu);
6965 + else
6966 + kvm_make_request(KVM_REQ_EVENT, vcpu);
6967 + }
6968 + } else {
6969 + max_irr = kvm_lapic_find_highest_irr(vcpu);
6970 + }
6971 + vmx_hwapic_irr_update(vcpu, max_irr);
6972 + return max_irr;
6973 +}
6974 +
6975 +static bool vmx_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu)
6976 +{
6977 + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
6978 +
6979 + return pi_test_on(pi_desc) ||
6980 + (pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc));
6981 +}
6982 +
6983 +static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
6984 +{
6985 + if (!kvm_vcpu_apicv_active(vcpu))
6986 + return;
6987 +
6988 + vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
6989 + vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
6990 + vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
6991 + vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
6992 +}
6993 +
6994 +static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
6995 +{
6996 + struct vcpu_vmx *vmx = to_vmx(vcpu);
6997 +
6998 + pi_clear_on(&vmx->pi_desc);
6999 + memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
7000 +}
7001 +
7002 +static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
7003 +{
7004 + vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
7005 +
7006 + /* If the exit is due to a page fault, check for async PF. */
7007 + if (is_page_fault(vmx->exit_intr_info))
7008 + vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason();
7009 +
7010 + /* Handle machine checks before interrupts are enabled */
7011 + if (is_machine_check(vmx->exit_intr_info))
7012 + kvm_machine_check();
7013 +
7014 + /* We need to handle NMIs before interrupts are enabled */
7015 + if (is_nmi(vmx->exit_intr_info)) {
7016 + kvm_before_interrupt(&vmx->vcpu);
7017 + asm("int $2");
7018 + kvm_after_interrupt(&vmx->vcpu);
7019 + }
7020 +}
7021 +
7022 +static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
7023 +{
7024 + unsigned int vector;
7025 + unsigned long entry;
7026 +#ifdef CONFIG_X86_64
7027 + unsigned long tmp;
7028 +#endif
7029 + gate_desc *desc;
7030 + u32 intr_info;
7031 +
7032 + intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
7033 + if (WARN_ONCE(!is_external_intr(intr_info),
7034 + "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
7035 + return;
7036 +
7037 + vector = intr_info & INTR_INFO_VECTOR_MASK;
7038 + desc = (gate_desc *)host_idt_base + vector;
7039 + entry = gate_offset(desc);
7040 +
7041 + kvm_before_interrupt(vcpu);
7042 +
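+	/*
+	 * Build an interrupt-style stack frame (SS:RSP on 64-bit, EFLAGS, CS)
+	 * and call the host IDT handler registered for this vector directly.
+	 */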
7043 + asm volatile(
7044 +#ifdef CONFIG_X86_64
7045 + "mov %%" _ASM_SP ", %[sp]\n\t"
7046 + "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t"
7047 + "push $%c[ss]\n\t"
7048 + "push %[sp]\n\t"
7049 +#endif
7050 + "pushf\n\t"
7051 + __ASM_SIZE(push) " $%c[cs]\n\t"
7052 + CALL_NOSPEC
7053 + :
7054 +#ifdef CONFIG_X86_64
7055 + [sp]"=&r"(tmp),
7056 +#endif
7057 + ASM_CALL_CONSTRAINT
7058 + :
7059 + THUNK_TARGET(entry),
7060 + [ss]"i"(__KERNEL_DS),
7061 + [cs]"i"(__KERNEL_CS)
7062 + );
7063 +
7064 + kvm_after_interrupt(vcpu);
7065 +}
7066 +STACK_FRAME_NON_STANDARD(handle_external_interrupt_irqoff);
7067 +
7068 +static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu,
7069 + enum exit_fastpath_completion *exit_fastpath)
7070 +{
7071 + struct vcpu_vmx *vmx = to_vmx(vcpu);
7072 +
7073 + if (vmx->exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
7074 + handle_external_interrupt_irqoff(vcpu);
7075 + else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)
7076 + handle_exception_nmi_irqoff(vmx);
7077 + else if (!is_guest_mode(vcpu) &&
7078 + vmx->exit_reason == EXIT_REASON_MSR_WRITE)
7079 + *exit_fastpath = handle_fastpath_set_msr_irqoff(vcpu);
7080 +}
7081 +
7082 +static bool vmx_has_emulated_msr(int index)
7083 +{
7084 + switch (index) {
7085 + case MSR_IA32_SMBASE:
7086 + /*
7087 + * We cannot do SMM unless we can run the guest in big
7088 + * real mode.
7089 + */
7090 + return enable_unrestricted_guest || emulate_invalid_guest_state;
7091 + case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
7092 + return nested;
7093 + case MSR_AMD64_VIRT_SPEC_CTRL:
7094 + /* This is AMD only. */
7095 + return false;
7096 + default:
7097 + return true;
7098 + }
7099 +}
7100 +
7101 +static bool vmx_pt_supported(void)
7102 +{
7103 + return pt_mode == PT_MODE_HOST_GUEST;
7104 +}
7105 +
7106 +static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
7107 +{
7108 + u32 exit_intr_info;
7109 + bool unblock_nmi;
7110 + u8 vector;
7111 + bool idtv_info_valid;
7112 +
7113 + idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
7114 +
7115 + if (enable_vnmi) {
7116 + if (vmx->loaded_vmcs->nmi_known_unmasked)
7117 + return;
7118 + /*
7119 + * Can't use vmx->exit_intr_info since we're not sure what
7120 + * the exit reason is.
7121 + */
7122 + exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
7123 + unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
7124 + vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
7125 + /*
7126 + * SDM 3: 27.7.1.2 (September 2008)
7127 + * Re-set bit "block by NMI" before VM entry if vmexit caused by
7128 + * a guest IRET fault.
7129 + * SDM 3: 23.2.2 (September 2008)
7130 + * Bit 12 is undefined in any of the following cases:
7131 + * If the VM exit sets the valid bit in the IDT-vectoring
7132 + * information field.
7133 + * If the VM exit is due to a double fault.
7134 + */
7135 + if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
7136 + vector != DF_VECTOR && !idtv_info_valid)
7137 + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
7138 + GUEST_INTR_STATE_NMI);
7139 + else
7140 + vmx->loaded_vmcs->nmi_known_unmasked =
7141 + !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
7142 + & GUEST_INTR_STATE_NMI);
7143 + } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked))
7144 + vmx->loaded_vmcs->vnmi_blocked_time +=
7145 + ktime_to_ns(ktime_sub(ktime_get(),
7146 + vmx->loaded_vmcs->entry_time));
7147 +}
7148 +
7149 +static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
7150 + u32 idt_vectoring_info,
7151 + int instr_len_field,
7152 + int error_code_field)
7153 +{
7154 + u8 vector;
7155 + int type;
7156 + bool idtv_info_valid;
7157 +
7158 + idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
7159 +
7160 + vcpu->arch.nmi_injected = false;
7161 + kvm_clear_exception_queue(vcpu);
7162 + kvm_clear_interrupt_queue(vcpu);
7163 +
7164 + if (!idtv_info_valid)
7165 + return;
7166 +
7167 + kvm_make_request(KVM_REQ_EVENT, vcpu);
7168 +
7169 + vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
7170 + type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
7171 +
7172 + switch (type) {
7173 + case INTR_TYPE_NMI_INTR:
7174 + vcpu->arch.nmi_injected = true;
7175 + /*
7176 + * SDM 3: 27.7.1.2 (September 2008)
7177 + * Clear bit "block by NMI" before VM entry if an NMI
7178 + * delivery faulted.
7179 + */
7180 + vmx_set_nmi_mask(vcpu, false);
7181 + break;
7182 + case INTR_TYPE_SOFT_EXCEPTION:
7183 + vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
7184 + /* fall through */
7185 + case INTR_TYPE_HARD_EXCEPTION:
7186 + if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
7187 + u32 err = vmcs_read32(error_code_field);
7188 + kvm_requeue_exception_e(vcpu, vector, err);
7189 + } else
7190 + kvm_requeue_exception(vcpu, vector);
7191 + break;
7192 + case INTR_TYPE_SOFT_INTR:
7193 + vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
7194 + /* fall through */
7195 + case INTR_TYPE_EXT_INTR:
7196 + kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
7197 + break;
7198 + default:
7199 + break;
7200 + }
7201 +}
7202 +
7203 +static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
7204 +{
7205 + __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
7206 + VM_EXIT_INSTRUCTION_LEN,
7207 + IDT_VECTORING_ERROR_CODE);
7208 +}
7209 +
7210 +static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
7211 +{
7212 + __vmx_complete_interrupts(vcpu,
7213 + vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
7214 + VM_ENTRY_INSTRUCTION_LEN,
7215 + VM_ENTRY_EXCEPTION_ERROR_CODE);
7216 +
7217 + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
7218 +}
7219 +
7220 +static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
7221 +{
7222 + int i, nr_msrs;
7223 + struct perf_guest_switch_msr *msrs;
7224 +
7225 + msrs = perf_guest_get_msrs(&nr_msrs);
7226 +
7227 + if (!msrs)
7228 + return;
7229 +
7230 + for (i = 0; i < nr_msrs; i++)
7231 + if (msrs[i].host == msrs[i].guest)
7232 + clear_atomic_switch_msr(vmx, msrs[i].msr);
7233 + else
7234 + add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
7235 + msrs[i].host, false);
7236 +}
7237 +
7238 +static void atomic_switch_umwait_control_msr(struct vcpu_vmx *vmx)
7239 +{
7240 + u32 host_umwait_control;
7241 +
7242 + if (!vmx_has_waitpkg(vmx))
7243 + return;
7244 +
7245 + host_umwait_control = get_umwait_control_msr();
7246 +
7247 + if (vmx->msr_ia32_umwait_control != host_umwait_control)
7248 + add_atomic_switch_msr(vmx, MSR_IA32_UMWAIT_CONTROL,
7249 + vmx->msr_ia32_umwait_control,
7250 + host_umwait_control, false);
7251 + else
7252 + clear_atomic_switch_msr(vmx, MSR_IA32_UMWAIT_CONTROL);
7253 +}
7254 +
7255 +static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
7256 +{
7257 + struct vcpu_vmx *vmx = to_vmx(vcpu);
7258 + u64 tscl;
7259 + u32 delta_tsc;
7260 +
7261 + if (vmx->req_immediate_exit) {
7262 + vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);
7263 + vmx->loaded_vmcs->hv_timer_soft_disabled = false;
7264 + } else if (vmx->hv_deadline_tsc != -1) {
7265 + tscl = rdtsc();
7266 + if (vmx->hv_deadline_tsc > tscl)
7267 + /* set_hv_timer ensures the delta fits in 32-bits */
7268 + delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
7269 + cpu_preemption_timer_multi);
7270 + else
7271 + delta_tsc = 0;
7272 +
7273 + vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
7274 + vmx->loaded_vmcs->hv_timer_soft_disabled = false;
7275 + } else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) {
7276 + vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1);
7277 + vmx->loaded_vmcs->hv_timer_soft_disabled = true;
7278 + }
7279 +}
7280 +
7281 +void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
7282 +{
7283 + if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) {
7284 + vmx->loaded_vmcs->host_state.rsp = host_rsp;
7285 + vmcs_writel(HOST_RSP, host_rsp);
7286 + }
7287 +}
7288 +
7289 +bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched);
7290 +
7291 +static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
7292 +{
7293 + struct vcpu_vmx *vmx = to_vmx(vcpu);
7294 + unsigned long cr3, cr4;
7295 +
7296 + /* Record the guest's net vcpu time for enforced NMI injections. */
7297 + if (unlikely(!enable_vnmi &&
7298 + vmx->loaded_vmcs->soft_vnmi_blocked))
7299 + vmx->loaded_vmcs->entry_time = ktime_get();
7300 +
7301 + /* Don't enter VMX if guest state is invalid; let the exit handler
7302 + start emulation until we arrive back at a valid state. */
7303 + if (vmx->emulation_required)
7304 + return;
7305 +
7306 + if (vmx->ple_window_dirty) {
7307 + vmx->ple_window_dirty = false;
7308 + vmcs_write32(PLE_WINDOW, vmx->ple_window);
7309 + }
7310 +
7311 + if (vmx->nested.need_vmcs12_to_shadow_sync)
7312 + nested_sync_vmcs12_to_shadow(vcpu);
7313 +
7314 + if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP))
7315 + vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
7316 + if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP))
7317 + vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
7318 +
7319 + cr3 = __get_current_cr3_fast();
7320 + if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
7321 + vmcs_writel(HOST_CR3, cr3);
7322 + vmx->loaded_vmcs->host_state.cr3 = cr3;
7323 + }
7324 +
7325 + cr4 = cr4_read_shadow();
7326 + if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
7327 + vmcs_writel(HOST_CR4, cr4);
7328 + vmx->loaded_vmcs->host_state.cr4 = cr4;
7329 + }
7330 +
7331 + /* When single-stepping over STI and MOV SS, we must clear the
7332 + * corresponding interruptibility bits in the guest state. Otherwise
7333 + * vmentry fails as it then expects bit 14 (BS) in pending debug
7334 + * exceptions to be set, but that's not correct for the guest debugging
7335 + * case. */
7336 + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
7337 + vmx_set_interrupt_shadow(vcpu, 0);
7338 +
7339 + kvm_load_guest_xsave_state(vcpu);
7340 +
7341 + if (static_cpu_has(X86_FEATURE_PKU) &&
7342 + kvm_read_cr4_bits(vcpu, X86_CR4_PKE) &&
7343 + vcpu->arch.pkru != vmx->host_pkru)
7344 + __write_pkru(vcpu->arch.pkru);
7345 +
7346 + pt_guest_enter(vmx);
7347 +
7348 + atomic_switch_perf_msrs(vmx);
7349 + atomic_switch_umwait_control_msr(vmx);
7350 +
7351 + if (enable_preemption_timer)
7352 + vmx_update_hv_timer(vcpu);
7353 +
7354 + if (lapic_in_kernel(vcpu) &&
7355 + vcpu->arch.apic->lapic_timer.timer_advance_ns)
7356 + kvm_wait_lapic_expire(vcpu);
7357 +
7358 + /*
7359 + * If this vCPU has touched SPEC_CTRL, restore the guest's value if
7360 + * it's non-zero. Since vmentry is serialising on affected CPUs, there
7361 + * is no need to worry about the conditional branch over the wrmsr
7362 + * being speculatively taken.
7363 + */
7364 + x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0);
7365 +
7366 + /* L1D Flush includes CPU buffer clear to mitigate MDS */
7367 + if (static_branch_unlikely(&vmx_l1d_should_flush))
7368 + vmx_l1d_flush(vcpu);
7369 + else if (static_branch_unlikely(&mds_user_clear))
7370 + mds_clear_cpu_buffers();
7371 +
7372 + if (vcpu->arch.cr2 != read_cr2())
7373 + write_cr2(vcpu->arch.cr2);
7374 +
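+	/* Enter the guest; __vmx_vcpu_run() returns true if VMLAUNCH/VMRESUME failed. */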
7375 + vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
7376 + vmx->loaded_vmcs->launched);
7377 +
7378 + vcpu->arch.cr2 = read_cr2();
7379 +
7380 + /*
7381 + * We do not use IBRS in the kernel. If this vCPU has used the
7382 + * SPEC_CTRL MSR it may have left it on; save the value and
7383 + * turn it off. This is much more efficient than blindly adding
7384 + * it to the atomic save/restore list. Especially as the former
7385 + * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
7386 + *
7387 + * For non-nested case:
7388 + * If the L01 MSR bitmap does not intercept the MSR, then we need to
7389 + * save it.
7390 + *
7391 + * For nested case:
7392 + * If the L02 MSR bitmap does not intercept the MSR, then we need to
7393 + * save it.
7394 + */
7395 + if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
7396 + vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
7397 +
7398 + x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);
7399 +
7400 + /* All fields are clean at this point */
7401 + if (static_branch_unlikely(&enable_evmcs))
7402 + current_evmcs->hv_clean_fields |=
7403 + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
7404 +
7405 + if (static_branch_unlikely(&enable_evmcs))
7406 + current_evmcs->hv_vp_id = vcpu->arch.hyperv.vp_index;
7407 +
7408 + /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
7409 + if (vmx->host_debugctlmsr)
7410 + update_debugctlmsr(vmx->host_debugctlmsr);
7411 +
7412 +#ifndef CONFIG_X86_64
7413 + /*
7414 + * The sysexit path does not restore ds/es, so we must set them to
7415 + * a reasonable value ourselves.
7416 + *
7417 + * We can't defer this to vmx_prepare_switch_to_host() since that
7418 + * function may be executed in interrupt context, which saves and
7419 + * restores segments around it, nullifying its effect.
7420 + */
7421 + loadsegment(ds, __USER_DS);
7422 + loadsegment(es, __USER_DS);
7423 +#endif
7424 +
7425 + vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
7426 + | (1 << VCPU_EXREG_RFLAGS)
7427 + | (1 << VCPU_EXREG_PDPTR)
7428 + | (1 << VCPU_EXREG_SEGMENTS)
7429 + | (1 << VCPU_EXREG_CR3));
7430 + vcpu->arch.regs_dirty = 0;
7431 +
7432 + pt_guest_exit(vmx);
7433 +
7434 + /*
7435 + * Eager FPU is enabled if PKEY is supported and CR4 is switched
7436 + * back on the host, so it is safe to read the guest PKRU from the
7437 + * current XSAVE area.
7438 + */
7439 + if (static_cpu_has(X86_FEATURE_PKU) &&
7440 + kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) {
7441 + vcpu->arch.pkru = rdpkru();
7442 + if (vcpu->arch.pkru != vmx->host_pkru)
7443 + __write_pkru(vmx->host_pkru);
7444 + }
7445 +
7446 + kvm_load_host_xsave_state(vcpu);
7447 +
7448 + vmx->nested.nested_run_pending = 0;
7449 + vmx->idt_vectoring_info = 0;
7450 +
7451 + vmx->exit_reason = vmx->fail ? 0xdead : vmcs_read32(VM_EXIT_REASON);
7452 + if ((u16)vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
7453 + kvm_machine_check();
7454 +
7455 + if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
7456 + return;
7457 +
7458 + vmx->loaded_vmcs->launched = 1;
7459 + vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
7460 +
7461 + vmx_recover_nmi_blocking(vmx);
7462 + vmx_complete_interrupts(vmx);
7463 +}
7464 +
7465 +static struct kvm *vmx_vm_alloc(void)
7466 +{
7467 + struct kvm_vmx *kvm_vmx = __vmalloc(sizeof(struct kvm_vmx),
7468 + GFP_KERNEL_ACCOUNT | __GFP_ZERO,
7469 + PAGE_KERNEL);
7470 + return &kvm_vmx->kvm;
7471 +}
7472 +
7473 +static void vmx_vm_free(struct kvm *kvm)
7474 +{
7475 + kfree(kvm->arch.hyperv.hv_pa_pg);
7476 + vfree(to_kvm_vmx(kvm));
7477 +}
7478 +
7479 +static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
7480 +{
7481 + struct vcpu_vmx *vmx = to_vmx(vcpu);
7482 +
7483 + if (enable_pml)
7484 + vmx_destroy_pml_buffer(vmx);
7485 + free_vpid(vmx->vpid);
7486 + nested_vmx_free_vcpu(vcpu);
7487 + free_loaded_vmcs(vmx->loaded_vmcs);
7488 + kvm_vcpu_uninit(vcpu);
7489 + kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.user_fpu);
7490 + kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.guest_fpu);
7491 + kmem_cache_free(kvm_vcpu_cache, vmx);
7492 +}
7493 +
7494 +static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
7495 +{
7496 + int err;
7497 + struct vcpu_vmx *vmx;
7498 + unsigned long *msr_bitmap;
7499 + int i, cpu;
7500 +
7501 + BUILD_BUG_ON_MSG(offsetof(struct vcpu_vmx, vcpu) != 0,
7502 + "struct kvm_vcpu must be at offset 0 for arch usercopy region");
7503 +
7504 + vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
7505 + if (!vmx)
7506 + return ERR_PTR(-ENOMEM);
7507 +
7508 + vmx->vcpu.arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache,
7509 + GFP_KERNEL_ACCOUNT);
7510 + if (!vmx->vcpu.arch.user_fpu) {
7511 + printk(KERN_ERR "kvm: failed to allocate kvm userspace's fpu\n");
7512 + err = -ENOMEM;
7513 + goto free_partial_vcpu;
7514 + }
7515 +
7516 + vmx->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
7517 + GFP_KERNEL_ACCOUNT);
7518 + if (!vmx->vcpu.arch.guest_fpu) {
7519 + printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n");
7520 + err = -ENOMEM;
7521 + goto free_user_fpu;
7522 + }
7523 +
7524 + vmx->vpid = allocate_vpid();
7525 +
7526 + err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
7527 + if (err)
7528 + goto free_vcpu;
7529 +
7530 + err = -ENOMEM;
7531 +
7532 + /*
7533 + * If PML is turned on, a failure to enable PML simply results in a
7534 + * failure to create the vcpu. This lets us simplify the PML logic by
7535 + * avoiding corner cases such as enabling PML on only some of the
7536 + * guest's vcpus.
7537 + */
7538 + if (enable_pml) {
7539 + vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
7540 + if (!vmx->pml_pg)
7541 + goto uninit_vcpu;
7542 + }
7543 +
7544 + BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) != NR_SHARED_MSRS);
7545 +
7546 + for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) {
7547 + u32 index = vmx_msr_index[i];
7548 + u32 data_low, data_high;
7549 + int j = vmx->nmsrs;
7550 +
7551 + if (rdmsr_safe(index, &data_low, &data_high) < 0)
7552 + continue;
7553 + if (wrmsr_safe(index, data_low, data_high) < 0)
7554 + continue;
7555 +
7556 + vmx->guest_msrs[j].index = i;
7557 + vmx->guest_msrs[j].data = 0;
7558 + switch (index) {
7559 + case MSR_IA32_TSX_CTRL:
7560 + /*
7561 + * No need to pass TSX_CTRL_CPUID_CLEAR through, so
7562 + * let's avoid changing CPUID bits under the host
7563 + * kernel's feet.
7564 + */
7565 + vmx->guest_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
7566 + break;
7567 + default:
7568 + vmx->guest_msrs[j].mask = -1ull;
7569 + break;
7570 + }
7571 + ++vmx->nmsrs;
7572 + }
7573 +
7574 + err = alloc_loaded_vmcs(&vmx->vmcs01);
7575 + if (err < 0)
7576 + goto free_pml;
7577 +
7578 + msr_bitmap = vmx->vmcs01.msr_bitmap;
7579 + vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_TSC, MSR_TYPE_R);
7580 + vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW);
7581 + vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW);
7582 + vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
7583 + vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
7584 + vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
7585 + vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
7586 + if (kvm_cstate_in_guest(kvm)) {
7587 + vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C1_RES, MSR_TYPE_R);
7588 + vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
7589 + vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
7590 + vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
7591 + }
7592 + vmx->msr_bitmap_mode = 0;
7593 +
7594 + vmx->loaded_vmcs = &vmx->vmcs01;
7595 + cpu = get_cpu();
7596 + vmx_vcpu_load(&vmx->vcpu, cpu);
7597 + vmx->vcpu.cpu = cpu;
7598 + init_vmcs(vmx);
7599 + vmx_vcpu_put(&vmx->vcpu);
7600 + put_cpu();
7601 + if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
7602 + err = alloc_apic_access_page(kvm);
7603 + if (err)
7604 + goto free_vmcs;
7605 + }
7606 +
7607 + if (enable_ept && !enable_unrestricted_guest) {
7608 + err = init_rmode_identity_map(kvm);
7609 + if (err)
7610 + goto free_vmcs;
7611 + }
7612 +
7613 + if (nested)
7614 + nested_vmx_setup_ctls_msrs(&vmx->nested.msrs,
7615 + vmx_capability.ept,
7616 + kvm_vcpu_apicv_active(&vmx->vcpu));
7617 + else
7618 + memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs));
7619 +
7620 + vmx->nested.posted_intr_nv = -1;
7621 + vmx->nested.current_vmptr = -1ull;
7622 +
7623 + vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED;
7624 +
7625 + /*
7626 + * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
7627 + * or POSTED_INTR_WAKEUP_VECTOR.
7628 + */
7629 + vmx->pi_desc.nv = POSTED_INTR_VECTOR;
7630 + vmx->pi_desc.sn = 1;
7631 +
7632 + vmx->ept_pointer = INVALID_PAGE;
7633 +
7634 + return &vmx->vcpu;
7635 +
7636 +free_vmcs:
7637 + free_loaded_vmcs(vmx->loaded_vmcs);
7638 +free_pml:
7639 + vmx_destroy_pml_buffer(vmx);
7640 +uninit_vcpu:
7641 + kvm_vcpu_uninit(&vmx->vcpu);
7642 +free_vcpu:
7643 + free_vpid(vmx->vpid);
7644 + kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.guest_fpu);
7645 +free_user_fpu:
7646 + kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.user_fpu);
7647 +free_partial_vcpu:
7648 + kmem_cache_free(kvm_vcpu_cache, vmx);
7649 + return ERR_PTR(err);
7650 +}
7651 +
7652 +#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
7653 +#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
7654 +
7655 +static int vmx_vm_init(struct kvm *kvm)
7656 +{
7657 + spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock);
7658 +
7659 + if (!ple_gap)
7660 + kvm->arch.pause_in_guest = true;
7661 +
7662 + if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
7663 + switch (l1tf_mitigation) {
7664 + case L1TF_MITIGATION_OFF:
7665 + case L1TF_MITIGATION_FLUSH_NOWARN:
7666 + /* 'I explicitly don't care' is set */
7667 + break;
7668 + case L1TF_MITIGATION_FLUSH:
7669 + case L1TF_MITIGATION_FLUSH_NOSMT:
7670 + case L1TF_MITIGATION_FULL:
7671 + /*
7672 + * Warn upon starting the first VM in a potentially
7673 + * insecure environment.
7674 + */
7675 + if (sched_smt_active())
7676 + pr_warn_once(L1TF_MSG_SMT);
7677 + if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
7678 + pr_warn_once(L1TF_MSG_L1D);
7679 + break;
7680 + case L1TF_MITIGATION_FULL_FORCE:
7681 + /* Flush is enforced */
7682 + break;
7683 + }
7684 + }
7685 + return 0;
7686 +}
7687 +
7688 +static int __init vmx_check_processor_compat(void)
7689 +{
7690 + struct vmcs_config vmcs_conf;
7691 + struct vmx_capability vmx_cap;
7692 +
7693 + if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0)
7694 + return -EIO;
7695 + if (nested)
7696 + nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept,
7697 + enable_apicv);
7698 + if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
7699 + printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
7700 + smp_processor_id());
7701 + return -EIO;
7702 + }
7703 + return 0;
7704 +}
7705 +
7706 +static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
7707 +{
7708 + u8 cache;
7709 + u64 ipat = 0;
7710 +
7711 + /* For the VT-d and EPT combination:
7712 + * 1. MMIO: always map as UC.
7713 + * 2. EPT with VT-d:
7714 + * a. VT-d without the snooping control feature: can't guarantee the
7715 + * result, so try to trust the guest.
7716 + * b. VT-d with the snooping control feature: the VT-d engine's
7717 + * snooping control guarantees cache correctness. Just set it
7718 + * to WB to stay consistent with the host, i.e. the same as item 3.
7719 + * 3. EPT without VT-d: always map as WB and set IPAT=1 to stay
7720 + * consistent with the host MTRR.
7721 + */
7722 + if (is_mmio) {
7723 + cache = MTRR_TYPE_UNCACHABLE;
7724 + goto exit;
7725 + }
7726 +
7727 + if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
7728 + ipat = VMX_EPT_IPAT_BIT;
7729 + cache = MTRR_TYPE_WRBACK;
7730 + goto exit;
7731 + }
7732 +
7733 + if (kvm_read_cr0(vcpu) & X86_CR0_CD) {
7734 + ipat = VMX_EPT_IPAT_BIT;
7735 + if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
7736 + cache = MTRR_TYPE_WRBACK;
7737 + else
7738 + cache = MTRR_TYPE_UNCACHABLE;
7739 + goto exit;
7740 + }
7741 +
7742 + cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn);
7743 +
7744 +exit:
7745 + return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat;
7746 +}
7747 +
7748 +static int vmx_get_lpage_level(void)
7749 +{
7750 + if (enable_ept && !cpu_has_vmx_ept_1g_page())
7751 + return PT_DIRECTORY_LEVEL;
7752 + else
7753 + /* Both shadow paging and EPT with 1GB-page support allow 1GB pages */
7754 + return PT_PDPE_LEVEL;
7755 +}
7756 +
7757 +static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx)
7758 +{
7759 + /*
7760 + * These bits in the secondary execution controls field
7761 + * are dynamic; the others are mostly based on the hypervisor
7762 + * architecture and the guest's CPUID. Do not touch the
7763 + * dynamic bits.
7764 + */
7765 + u32 mask =
7766 + SECONDARY_EXEC_SHADOW_VMCS |
7767 + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
7768 + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
7769 + SECONDARY_EXEC_DESC;
7770 +
7771 + u32 new_ctl = vmx->secondary_exec_control;
7772 + u32 cur_ctl = secondary_exec_controls_get(vmx);
7773 +
7774 + secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask));
7775 +}
7776 +
7777 +/*
7778 + * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits
7779 + * (indicating "allowed-1") if they are supported in the guest's CPUID.
7780 + */
7781 +static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
7782 +{
7783 + struct vcpu_vmx *vmx = to_vmx(vcpu);
7784 + struct kvm_cpuid_entry2 *entry;
7785 +
7786 + vmx->nested.msrs.cr0_fixed1 = 0xffffffff;
7787 + vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE;
7788 +
7789 +#define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \
7790 + if (entry && (entry->_reg & (_cpuid_mask))) \
7791 + vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \
7792 +} while (0)
7793 +
7794 + entry = kvm_find_cpuid_entry(vcpu, 0x1, 0);
7795 + cr4_fixed1_update(X86_CR4_VME, edx, bit(X86_FEATURE_VME));
7796 + cr4_fixed1_update(X86_CR4_PVI, edx, bit(X86_FEATURE_VME));
7797 + cr4_fixed1_update(X86_CR4_TSD, edx, bit(X86_FEATURE_TSC));
7798 + cr4_fixed1_update(X86_CR4_DE, edx, bit(X86_FEATURE_DE));
7799 + cr4_fixed1_update(X86_CR4_PSE, edx, bit(X86_FEATURE_PSE));
7800 + cr4_fixed1_update(X86_CR4_PAE, edx, bit(X86_FEATURE_PAE));
7801 + cr4_fixed1_update(X86_CR4_MCE, edx, bit(X86_FEATURE_MCE));
7802 + cr4_fixed1_update(X86_CR4_PGE, edx, bit(X86_FEATURE_PGE));
7803 + cr4_fixed1_update(X86_CR4_OSFXSR, edx, bit(X86_FEATURE_FXSR));
7804 + cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, bit(X86_FEATURE_XMM));
7805 + cr4_fixed1_update(X86_CR4_VMXE, ecx, bit(X86_FEATURE_VMX));
7806 + cr4_fixed1_update(X86_CR4_SMXE, ecx, bit(X86_FEATURE_SMX));
7807 + cr4_fixed1_update(X86_CR4_PCIDE, ecx, bit(X86_FEATURE_PCID));
7808 + cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, bit(X86_FEATURE_XSAVE));
7809 +
7810 + entry = kvm_find_cpuid_entry(vcpu, 0x7, 0);
7811 + cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, bit(X86_FEATURE_FSGSBASE));
7812 + cr4_fixed1_update(X86_CR4_SMEP, ebx, bit(X86_FEATURE_SMEP));
7813 + cr4_fixed1_update(X86_CR4_SMAP, ebx, bit(X86_FEATURE_SMAP));
7814 + cr4_fixed1_update(X86_CR4_PKE, ecx, bit(X86_FEATURE_PKU));
7815 + cr4_fixed1_update(X86_CR4_UMIP, ecx, bit(X86_FEATURE_UMIP));
7816 + cr4_fixed1_update(X86_CR4_LA57, ecx, bit(X86_FEATURE_LA57));
7817 +
7818 +#undef cr4_fixed1_update
7819 +}
7820 +
7821 +static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
7822 +{
7823 + struct vcpu_vmx *vmx = to_vmx(vcpu);
7824 +
7825 + if (kvm_mpx_supported()) {
7826 + bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX);
7827 +
7828 + if (mpx_enabled) {
7829 + vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
7830 + vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
7831 + } else {
7832 + vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS;
7833 + vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS;
7834 + }
7835 + }
7836 +}
7837 +
7838 +static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
7839 +{
7840 + struct vcpu_vmx *vmx = to_vmx(vcpu);
7841 + struct kvm_cpuid_entry2 *best = NULL;
7842 + int i;
7843 +
7844 + for (i = 0; i < PT_CPUID_LEAVES; i++) {
7845 + best = kvm_find_cpuid_entry(vcpu, 0x14, i);
7846 + if (!best)
7847 + return;
7848 + vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax;
7849 + vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx;
7850 + vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx;
7851 + vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx;
7852 + }
7853 +
7854 + /* Get the number of configurable Address Ranges for filtering */
7855 + vmx->pt_desc.addr_range = intel_pt_validate_cap(vmx->pt_desc.caps,
7856 + PT_CAP_num_address_ranges);
7857 +
7858 + /* Initialize and clear the no dependency bits */
7859 + vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS |
7860 + RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC);
7861 +
7862 + /*
7863 + * If CPUID.(EAX=14H,ECX=0):EBX[0]=1, CR3Filter can be set; otherwise
7864 + * setting it will inject a #GP.
7865 + */
7866 + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering))
7867 + vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN;
7868 +
7869 + /*
7870 + * If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and
7871 + * PSBFreq can be set
7872 + */
7873 + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc))
7874 + vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC |
7875 + RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ);
7876 +
7877 + /*
7878 + * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn BranchEn and
7879 + * MTCFreq can be set
7880 + */
7881 + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc))
7882 + vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN |
7883 + RTIT_CTL_BRANCH_EN | RTIT_CTL_MTC_RANGE);
7884 +
7885 + /* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */
7886 + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite))
7887 + vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW |
7888 + RTIT_CTL_PTW_EN);
7889 +
7890 + /* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */
7891 + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace))
7892 + vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN;
7893 +
7894 + /* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */
7895 + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output))
7896 + vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA;
7897 +
7898 + /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */
7899 + if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys))
7900 + vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN;
7901 +
7902 + /* unmask address range configure area */
7903 + for (i = 0; i < vmx->pt_desc.addr_range; i++)
7904 + vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
7905 +}
7906 +
7907 +static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
7908 +{
7909 + struct vcpu_vmx *vmx = to_vmx(vcpu);
7910 +
7911 + /* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */
7912 + vcpu->arch.xsaves_enabled = false;
7913 +
7914 + if (cpu_has_secondary_exec_ctrls()) {
7915 + vmx_compute_secondary_exec_control(vmx);
7916 + vmcs_set_secondary_exec_control(vmx);
7917 + }
7918 +
7919 + if (nested_vmx_allowed(vcpu))
7920 + to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
7921 + FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX |
7922 + FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
7923 + else
7924 + to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
7925 + ~(FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX |
7926 + FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX);
7927 +
7928 + if (nested_vmx_allowed(vcpu)) {
7929 + nested_vmx_cr_fixed1_bits_update(vcpu);
7930 + nested_vmx_entry_exit_ctls_update(vcpu);
7931 + }
7932 +
7933 + if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
7934 + guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT))
7935 + update_intel_pt_cfg(vcpu);
7936 +
7937 + if (boot_cpu_has(X86_FEATURE_RTM)) {
7938 + struct shared_msr_entry *msr;
7939 + msr = find_msr_entry(vmx, MSR_IA32_TSX_CTRL);
7940 + if (msr) {
7941 + bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM);
7942 + vmx_set_guest_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE);
7943 + }
7944 + }
7945 +}
7946 +
7947 +static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
7948 +{
7949 + if (func == 1 && nested)
7950 + entry->ecx |= bit(X86_FEATURE_VMX);
7951 +}
7952 +
7953 +static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
7954 +{
7955 + to_vmx(vcpu)->req_immediate_exit = true;
7956 +}
7957 +
7958 +static int vmx_check_intercept(struct kvm_vcpu *vcpu,
7959 + struct x86_instruction_info *info,
7960 + enum x86_intercept_stage stage)
7961 +{
7962 + struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
7963 + struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
7964 +
7965 + /*
7966 + * RDPID causes #UD if disabled through secondary execution controls.
7967 + * Because it is marked as EmulateOnUD, we need to intercept it here.
7968 + */
7969 + if (info->intercept == x86_intercept_rdtscp &&
7970 + !nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
7971 + ctxt->exception.vector = UD_VECTOR;
7972 + ctxt->exception.error_code_valid = false;
7973 + return X86EMUL_PROPAGATE_FAULT;
7974 + }
7975 +
7976 + /* TODO: check more intercepts... */
7977 + return X86EMUL_CONTINUE;
7978 +}
7979 +
7980 +#ifdef CONFIG_X86_64
7981 +/* (a << shift) / divisor; return 1 on overflow, otherwise 0 */
7982 +static inline int u64_shl_div_u64(u64 a, unsigned int shift,
7983 + u64 divisor, u64 *result)
7984 +{
7985 + u64 low = a << shift, high = a >> (64 - shift);
7986 +
7987 + /* To avoid the overflow on divq */
7988 + if (high >= divisor)
7989 + return 1;
7990 +
7991 + /* Low holds the result; high holds the remainder, which is discarded */
7992 + asm("divq %2\n\t" : "=a" (low), "=d" (high) :
7993 + "rm" (divisor), "0" (low), "1" (high));
7994 + *result = low;
7995 +
7996 + return 0;
7997 +}
7998 +
7999 +static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
8000 + bool *expired)
8001 +{
8002 + struct vcpu_vmx *vmx;
8003 + u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
8004 + struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer;
8005 +
8006 + if (kvm_mwait_in_guest(vcpu->kvm) ||
8007 + kvm_can_post_timer_interrupt(vcpu))
8008 + return -EOPNOTSUPP;
8009 +
8010 + vmx = to_vmx(vcpu);
8011 + tscl = rdtsc();
8012 + guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
8013 + delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
8014 + lapic_timer_advance_cycles = nsec_to_cycles(vcpu,
8015 + ktimer->timer_advance_ns);
8016 +
8017 + if (delta_tsc > lapic_timer_advance_cycles)
8018 + delta_tsc -= lapic_timer_advance_cycles;
8019 + else
8020 + delta_tsc = 0;
8021 +
8022 + /* Convert to host delta tsc if tsc scaling is enabled */
8023 + if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
8024 + delta_tsc && u64_shl_div_u64(delta_tsc,
8025 + kvm_tsc_scaling_ratio_frac_bits,
8026 + vcpu->arch.tsc_scaling_ratio, &delta_tsc))
8027 + return -ERANGE;
8028 +
8029 + /*
8030 + * If the delta tsc can't fit in 32 bits after the multiplier shift,
8031 + * we can't use the preemption timer.
8032 + * It's possible that it fits on later vmentries, but checking
8033 + * on every vmentry is costly so we just use an hrtimer.
8034 + */
8035 + if (delta_tsc >> (cpu_preemption_timer_multi + 32))
8036 + return -ERANGE;
8037 +
8038 + vmx->hv_deadline_tsc = tscl + delta_tsc;
8039 + *expired = !delta_tsc;
8040 + return 0;
8041 +}
8042 +
8043 +static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
8044 +{
8045 + to_vmx(vcpu)->hv_deadline_tsc = -1;
8046 +}
8047 +#endif
8048 +
8049 +static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
8050 +{
8051 + if (!kvm_pause_in_guest(vcpu->kvm))
8052 + shrink_ple_window(vcpu);
8053 +}
8054 +
8055 +static void vmx_slot_enable_log_dirty(struct kvm *kvm,
8056 + struct kvm_memory_slot *slot)
8057 +{
8058 + kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
8059 + kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
8060 +}
8061 +
8062 +static void vmx_slot_disable_log_dirty(struct kvm *kvm,
8063 + struct kvm_memory_slot *slot)
8064 +{
8065 + kvm_mmu_slot_set_dirty(kvm, slot);
8066 +}
8067 +
8068 +static void vmx_flush_log_dirty(struct kvm *kvm)
8069 +{
8070 + kvm_flush_pml_buffers(kvm);
8071 +}
8072 +
8073 +static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu)
8074 +{
8075 + struct vmcs12 *vmcs12;
8076 + struct vcpu_vmx *vmx = to_vmx(vcpu);
8077 + gpa_t gpa, dst;
8078 +
8079 + if (is_guest_mode(vcpu)) {
8080 + WARN_ON_ONCE(vmx->nested.pml_full);
8081 +
8082 + /*
8083 + * Check if PML is enabled for the nested guest.
8084 + * Whether eptp bit 6 is set is already checked
8085 + * as part of A/D emulation.
8086 + */
8087 + vmcs12 = get_vmcs12(vcpu);
8088 + if (!nested_cpu_has_pml(vmcs12))
8089 + return 0;
8090 +
8091 + if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) {
8092 + vmx->nested.pml_full = true;
8093 + return 1;
8094 + }
8095 +
8096 + gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull;
8097 + dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index;
8098 +
8099 + if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa,
8100 + offset_in_page(dst), sizeof(gpa)))
8101 + return 0;
8102 +
8103 + vmcs12->guest_pml_index--;
8104 + }
8105 +
8106 + return 0;
8107 +}
8108 +
8109 +static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
8110 + struct kvm_memory_slot *memslot,
8111 + gfn_t offset, unsigned long mask)
8112 +{
8113 + kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
8114 +}
8115 +
8116 +static void __pi_post_block(struct kvm_vcpu *vcpu)
8117 +{
8118 + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
8119 + struct pi_desc old, new;
8120 + unsigned int dest;
8121 +
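+	/*
+	 * Atomically redirect posted-interrupt notifications back to this
+	 * CPU ('NDST') and to the regular notification vector ('NV'),
+	 * retrying if the descriptor changed under us.
+	 */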
8122 + do {
8123 + old.control = new.control = pi_desc->control;
8124 + WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
8125 + "Wakeup handler not enabled while the VCPU is blocked\n");
8126 +
8127 + dest = cpu_physical_id(vcpu->cpu);
8128 +
8129 + if (x2apic_enabled())
8130 + new.ndst = dest;
8131 + else
8132 + new.ndst = (dest << 8) & 0xFF00;
8133 +
8134 + /* set 'NV' to 'notification vector' */
8135 + new.nv = POSTED_INTR_VECTOR;
8136 + } while (cmpxchg64(&pi_desc->control, old.control,
8137 + new.control) != old.control);
8138 +
8139 + if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
8140 + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
8141 + list_del(&vcpu->blocked_vcpu_list);
8142 + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
8143 + vcpu->pre_pcpu = -1;
8144 + }
8145 +}
8146 +
8147 +/*
8148 + * This routine does the following things for a vCPU that is about
8149 + * to be blocked when VT-d PI is enabled.
8150 + * - Store the vCPU in the wakeup list, so that when an interrupt
8151 + * arrives we can find the right vCPU to wake up.
8152 + * - Change the posted-interrupt descriptor as below:
8153 + * 'NDST' <-- vcpu->pre_pcpu
8154 + * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR
8155 + * - If 'ON' is set during this process, which means at least one
8156 + * interrupt is posted for this vCPU, we cannot block it; in
8157 + * that case, return 1, otherwise return 0.
8158 + *
8159 + */
8160 +static int pi_pre_block(struct kvm_vcpu *vcpu)
8161 +{
8162 + unsigned int dest;
8163 + struct pi_desc old, new;
8164 + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
8165 +
8166 + if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
8167 + !irq_remapping_cap(IRQ_POSTING_CAP) ||
8168 + !kvm_vcpu_apicv_active(vcpu))
8169 + return 0;
8170 +
8171 + WARN_ON(irqs_disabled());
8172 + local_irq_disable();
8173 + if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
8174 + vcpu->pre_pcpu = vcpu->cpu;
8175 + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
8176 + list_add_tail(&vcpu->blocked_vcpu_list,
8177 + &per_cpu(blocked_vcpu_on_cpu,
8178 + vcpu->pre_pcpu));
8179 + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
8180 + }
8181 +
8182 + do {
8183 + old.control = new.control = pi_desc->control;
8184 +
8185 + WARN((pi_desc->sn == 1),
8186 + "Warning: SN field of posted-interrupts "
8187 + "is set before blocking\n");
8188 +
8189 + /*
8190 + * Since the vCPU can be preempted during this process,
8191 + * vcpu->cpu may differ from pre_pcpu. We need to set
8192 + * pre_pcpu as the destination of the wakeup notification
8193 + * event so that we can find the right vCPU to wake up in
8194 + * the wakeup handler if an interrupt arrives while the
8195 + * vCPU is in the blocked state.
8196 + */
8197 + dest = cpu_physical_id(vcpu->pre_pcpu);
8198 +
8199 + if (x2apic_enabled())
8200 + new.ndst = dest;
8201 + else
8202 + new.ndst = (dest << 8) & 0xFF00;
8203 +
8204 + /* set 'NV' to 'wakeup vector' */
8205 + new.nv = POSTED_INTR_WAKEUP_VECTOR;
8206 + } while (cmpxchg64(&pi_desc->control, old.control,
8207 + new.control) != old.control);
8208 +
8209 + /* We should not block the vCPU if an interrupt is posted for it. */
8210 + if (pi_test_on(pi_desc) == 1)
8211 + __pi_post_block(vcpu);
8212 +
8213 + local_irq_enable();
8214 + return (vcpu->pre_pcpu == -1);
8215 +}
8216 +
8217 +static int vmx_pre_block(struct kvm_vcpu *vcpu)
8218 +{
8219 + if (pi_pre_block(vcpu))
8220 + return 1;
8221 +
8222 + if (kvm_lapic_hv_timer_in_use(vcpu))
8223 + kvm_lapic_switch_to_sw_timer(vcpu);
8224 +
8225 + return 0;
8226 +}
8227 +
8228 +static void pi_post_block(struct kvm_vcpu *vcpu)
8229 +{
8230 + if (vcpu->pre_pcpu == -1)
8231 + return;
8232 +
8233 + WARN_ON(irqs_disabled());
8234 + local_irq_disable();
8235 + __pi_post_block(vcpu);
8236 + local_irq_enable();
8237 +}
8238 +
8239 +static void vmx_post_block(struct kvm_vcpu *vcpu)
8240 +{
8241 + if (kvm_x86_ops->set_hv_timer)
8242 + kvm_lapic_switch_to_hv_timer(vcpu);
8243 +
8244 + pi_post_block(vcpu);
8245 +}
8246 +
8247 +/*
8248 + * vmx_update_pi_irte - set IRTE for Posted-Interrupts
8249 + *
8250 + * @kvm: kvm
8251 + * @host_irq: host irq of the interrupt
8252 + * @guest_irq: gsi of the interrupt
8253 + * @set: set or unset PI
8254 + * returns 0 on success, < 0 on failure
8255 + */
8256 +static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
8257 + uint32_t guest_irq, bool set)
8258 +{
8259 + struct kvm_kernel_irq_routing_entry *e;
8260 + struct kvm_irq_routing_table *irq_rt;
8261 + struct kvm_lapic_irq irq;
8262 + struct kvm_vcpu *vcpu;
8263 + struct vcpu_data vcpu_info;
8264 + int idx, ret = 0;
8265 +
8266 + if (!kvm_arch_has_assigned_device(kvm) ||
8267 + !irq_remapping_cap(IRQ_POSTING_CAP) ||
8268 + !kvm_vcpu_apicv_active(kvm->vcpus[0]))
8269 + return 0;
8270 +
8271 + idx = srcu_read_lock(&kvm->irq_srcu);
8272 + irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
8273 + if (guest_irq >= irq_rt->nr_rt_entries ||
8274 + hlist_empty(&irq_rt->map[guest_irq])) {
8275 + pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
8276 + guest_irq, irq_rt->nr_rt_entries);
8277 + goto out;
8278 + }
8279 +
8280 + hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
8281 + if (e->type != KVM_IRQ_ROUTING_MSI)
8282 + continue;
8283 + /*
8284 + * VT-d PI cannot post multicast/broadcast interrupts to a
8285 + * vCPU, so we still use interrupt remapping for those
8286 + * kinds of interrupts.
8287 + *
8288 + * For lowest-priority interrupts, we only support those
8289 + * with a single CPU as the destination, e.g. when the user
8290 + * configures the interrupt via /proc/irq or uses irqbalance
8291 + * to make the interrupt single-CPU.
8292 + *
8293 + * Full lowest-priority interrupt support will come later.
8294 + *
8295 + * In addition, we can only inject generic interrupts using
8296 + * the PI mechanism; refuse to route others through it.
8297 + */
8298 +
8299 + kvm_set_msi_irq(kvm, e, &irq);
8300 + if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
8301 + !kvm_irq_is_postable(&irq)) {
8302 + /*
8303 + * Make sure the IRTE is in remapped mode if
8304 + * we don't handle it in posted mode.
8305 + */
8306 + ret = irq_set_vcpu_affinity(host_irq, NULL);
8307 + if (ret < 0) {
8308 + printk(KERN_INFO
8309 + "failed to back to remapped mode, irq: %u\n",
8310 + host_irq);
8311 + goto out;
8312 + }
8313 +
8314 + continue;
8315 + }
8316 +
8317 + vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
8318 + vcpu_info.vector = irq.vector;
8319 +
8320 + trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
8321 + vcpu_info.vector, vcpu_info.pi_desc_addr, set);
8322 +
8323 + if (set)
8324 + ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
8325 + else
8326 + ret = irq_set_vcpu_affinity(host_irq, NULL);
8327 +
8328 + if (ret < 0) {
8329 + printk(KERN_INFO "%s: failed to update PI IRTE\n",
8330 + __func__);
8331 + goto out;
8332 + }
8333 + }
8334 +
8335 + ret = 0;
8336 +out:
8337 + srcu_read_unlock(&kvm->irq_srcu, idx);
8338 + return ret;
8339 +}
8340 +
8341 +static void vmx_setup_mce(struct kvm_vcpu *vcpu)
8342 +{
8343 + if (vcpu->arch.mcg_cap & MCG_LMCE_P)
8344 + to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
8345 + FEATURE_CONTROL_LMCE;
8346 + else
8347 + to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
8348 + ~FEATURE_CONTROL_LMCE;
8349 +}
8350 +
8351 +static int vmx_smi_allowed(struct kvm_vcpu *vcpu)
8352 +{
8353 + /* we need a nested vmexit to enter SMM, postpone if run is pending */
8354 + if (to_vmx(vcpu)->nested.nested_run_pending)
8355 + return 0;
8356 + return 1;
8357 +}
8358 +
8359 +static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
8360 +{
8361 + struct vcpu_vmx *vmx = to_vmx(vcpu);
8362 +
8363 + vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
8364 + if (vmx->nested.smm.guest_mode)
8365 + nested_vmx_vmexit(vcpu, -1, 0, 0);
8366 +
8367 + vmx->nested.smm.vmxon = vmx->nested.vmxon;
8368 + vmx->nested.vmxon = false;
8369 + vmx_clear_hlt(vcpu);
8370 + return 0;
8371 +}
8372 +
8373 +static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
8374 +{
8375 + struct vcpu_vmx *vmx = to_vmx(vcpu);
8376 + int ret;
8377 +
8378 + if (vmx->nested.smm.vmxon) {
8379 + vmx->nested.vmxon = true;
8380 + vmx->nested.smm.vmxon = false;
8381 + }
8382 +
8383 + if (vmx->nested.smm.guest_mode) {
8384 + ret = nested_vmx_enter_non_root_mode(vcpu, false);
8385 + if (ret)
8386 + return ret;
8387 +
8388 + vmx->nested.smm.guest_mode = false;
8389 + }
8390 + return 0;
8391 +}
8392 +
8393 +static int enable_smi_window(struct kvm_vcpu *vcpu)
8394 +{
8395 + return 0;
8396 +}
8397 +
8398 +static bool vmx_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
8399 +{
8400 + return false;
8401 +}
8402 +
8403 +static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
8404 +{
8405 + return to_vmx(vcpu)->nested.vmxon;
8406 +}
8407 +
8408 +static __init int hardware_setup(void)
8409 +{
8410 + unsigned long host_bndcfgs;
8411 + struct desc_ptr dt;
8412 + int r, i;
8413 +
8414 + rdmsrl_safe(MSR_EFER, &host_efer);
8415 +
8416 + store_idt(&dt);
8417 + host_idt_base = dt.address;
8418 +
8419 + for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i)
8420 + kvm_define_shared_msr(i, vmx_msr_index[i]);
8421 +
8422 + if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
8423 + return -EIO;
8424 +
8425 + if (boot_cpu_has(X86_FEATURE_NX))
8426 + kvm_enable_efer_bits(EFER_NX);
8427 +
8428 + if (boot_cpu_has(X86_FEATURE_MPX)) {
8429 + rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs);
8430 + WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost");
8431 + }
8432 +
8433 + if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
8434 + !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
8435 + enable_vpid = 0;
8436 +
8437 + if (!cpu_has_vmx_ept() ||
8438 + !cpu_has_vmx_ept_4levels() ||
8439 + !cpu_has_vmx_ept_mt_wb() ||
8440 + !cpu_has_vmx_invept_global())
8441 + enable_ept = 0;
8442 +
8443 + if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
8444 + enable_ept_ad_bits = 0;
8445 +
8446 + if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
8447 + enable_unrestricted_guest = 0;
8448 +
8449 + if (!cpu_has_vmx_flexpriority())
8450 + flexpriority_enabled = 0;
8451 +
8452 + if (!cpu_has_virtual_nmis())
8453 + enable_vnmi = 0;
8454 +
8455 + /*
8456 + * set_apic_access_page_addr() is used to reload the APIC access
8457 + * page upon invalidation. No need to do anything if not
8458 + * using the APIC_ACCESS_ADDR VMCS field.
8459 + */
8460 + if (!flexpriority_enabled)
8461 + kvm_x86_ops->set_apic_access_page_addr = NULL;
8462 +
8463 + if (!cpu_has_vmx_tpr_shadow())
8464 + kvm_x86_ops->update_cr8_intercept = NULL;
8465 +
8466 + if (enable_ept && !cpu_has_vmx_ept_2m_page())
8467 + kvm_disable_largepages();
8468 +
8469 +#if IS_ENABLED(CONFIG_HYPERV)
8470 + if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
8471 + && enable_ept) {
8472 + kvm_x86_ops->tlb_remote_flush = hv_remote_flush_tlb;
8473 + kvm_x86_ops->tlb_remote_flush_with_range =
8474 + hv_remote_flush_tlb_with_range;
8475 + }
8476 +#endif
8477 +
8478 + if (!cpu_has_vmx_ple()) {
8479 + ple_gap = 0;
8480 + ple_window = 0;
8481 + ple_window_grow = 0;
8482 + ple_window_max = 0;
8483 + ple_window_shrink = 0;
8484 + }
8485 +
8486 + if (!cpu_has_vmx_apicv()) {
8487 + enable_apicv = 0;
8488 + kvm_x86_ops->sync_pir_to_irr = NULL;
8489 + }
8490 +
8491 + if (cpu_has_vmx_tsc_scaling()) {
8492 + kvm_has_tsc_control = true;
8493 + kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
8494 + kvm_tsc_scaling_ratio_frac_bits = 48;
8495 + }
8496 +
8497 + set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
8498 +
8499 + if (enable_ept)
8500 + vmx_enable_tdp();
8501 + else
8502 + kvm_disable_tdp();
8503 +
8504 + /*
8505 + * Only enable PML when hardware supports PML feature, and both EPT
8506 + * and EPT A/D bit features are enabled -- PML depends on them to work.
8507 + */
8508 + if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
8509 + enable_pml = 0;
8510 +
8511 + if (!enable_pml) {
8512 + kvm_x86_ops->slot_enable_log_dirty = NULL;
8513 + kvm_x86_ops->slot_disable_log_dirty = NULL;
8514 + kvm_x86_ops->flush_log_dirty = NULL;
8515 + kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
8516 + }
8517 +
8518 + if (!cpu_has_vmx_preemption_timer())
8519 + enable_preemption_timer = false;
8520 +
8521 + if (enable_preemption_timer) {
8522 + u64 use_timer_freq = 5000ULL * 1000 * 1000;
8523 + u64 vmx_msr;
8524 +
8525 + rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
8526 + cpu_preemption_timer_multi =
8527 + vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
8528 +
8529 + if (tsc_khz)
8530 + use_timer_freq = (u64)tsc_khz * 1000;
8531 + use_timer_freq >>= cpu_preemption_timer_multi;
8532 +
8533 + /*
8534 + * KVM "disables" the preemption timer by setting it to its max
8535 + * value. Don't use the timer if it might cause spurious exits
8536 + * at a rate faster than 0.1 Hz (of uninterrupted guest time).
8537 + */
8538 + if (use_timer_freq > 0xffffffffu / 10)
8539 + enable_preemption_timer = false;
8540 + }
8541 +
8542 + if (!enable_preemption_timer) {
8543 + kvm_x86_ops->set_hv_timer = NULL;
8544 + kvm_x86_ops->cancel_hv_timer = NULL;
8545 + kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit;
8546 + }
8547 +
8548 + kvm_set_posted_intr_wakeup_handler(wakeup_handler);
8549 +
8550 + kvm_mce_cap_supported |= MCG_LMCE_P;
8551 +
8552 + if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
8553 + return -EINVAL;
8554 + if (!enable_ept || !cpu_has_vmx_intel_pt())
8555 + pt_mode = PT_MODE_SYSTEM;
8556 +
8557 + if (nested) {
8558 + nested_vmx_setup_ctls_msrs(&vmcs_config.nested,
8559 + vmx_capability.ept, enable_apicv);
8560 +
8561 + r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers);
8562 + if (r)
8563 + return r;
8564 + }
8565 +
8566 + r = alloc_kvm_area();
8567 + if (r)
8568 + nested_vmx_hardware_unsetup();
8569 + return r;
8570 +}
8571 +
8572 +static __exit void hardware_unsetup(void)
8573 +{
8574 + if (nested)
8575 + nested_vmx_hardware_unsetup();
8576 +
8577 + free_kvm_area();
8578 +}
8579 +
8580 +static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
8581 + .cpu_has_kvm_support = cpu_has_kvm_support,
8582 + .disabled_by_bios = vmx_disabled_by_bios,
8583 + .hardware_setup = hardware_setup,
8584 + .hardware_unsetup = hardware_unsetup,
8585 + .check_processor_compatibility = vmx_check_processor_compat,
8586 + .hardware_enable = hardware_enable,
8587 + .hardware_disable = hardware_disable,
8588 + .cpu_has_accelerated_tpr = report_flexpriority,
8589 + .has_emulated_msr = vmx_has_emulated_msr,
8590 +
8591 + .vm_init = vmx_vm_init,
8592 + .vm_alloc = vmx_vm_alloc,
8593 + .vm_free = vmx_vm_free,
8594 +
8595 + .vcpu_create = vmx_create_vcpu,
8596 + .vcpu_free = vmx_free_vcpu,
8597 + .vcpu_reset = vmx_vcpu_reset,
8598 +
8599 + .prepare_guest_switch = vmx_prepare_switch_to_guest,
8600 + .vcpu_load = vmx_vcpu_load,
8601 + .vcpu_put = vmx_vcpu_put,
8602 +
8603 + .update_bp_intercept = update_exception_bitmap,
8604 + .get_msr_feature = vmx_get_msr_feature,
8605 + .get_msr = vmx_get_msr,
8606 + .set_msr = vmx_set_msr,
8607 + .get_segment_base = vmx_get_segment_base,
8608 + .get_segment = vmx_get_segment,
8609 + .set_segment = vmx_set_segment,
8610 + .get_cpl = vmx_get_cpl,
8611 + .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
8612 + .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
8613 + .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
8614 + .set_cr0 = vmx_set_cr0,
8615 + .set_cr3 = vmx_set_cr3,
8616 + .set_cr4 = vmx_set_cr4,
8617 + .set_efer = vmx_set_efer,
8618 + .get_idt = vmx_get_idt,
8619 + .set_idt = vmx_set_idt,
8620 + .get_gdt = vmx_get_gdt,
8621 + .set_gdt = vmx_set_gdt,
8622 + .get_dr6 = vmx_get_dr6,
8623 + .set_dr6 = vmx_set_dr6,
8624 + .set_dr7 = vmx_set_dr7,
8625 + .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
8626 + .cache_reg = vmx_cache_reg,
8627 + .get_rflags = vmx_get_rflags,
8628 + .set_rflags = vmx_set_rflags,
8629 +
8630 + .tlb_flush = vmx_flush_tlb,
8631 + .tlb_flush_gva = vmx_flush_tlb_gva,
8632 +
8633 + .run = vmx_vcpu_run,
8634 + .handle_exit = vmx_handle_exit,
8635 + .skip_emulated_instruction = skip_emulated_instruction,
8636 + .set_interrupt_shadow = vmx_set_interrupt_shadow,
8637 + .get_interrupt_shadow = vmx_get_interrupt_shadow,
8638 + .patch_hypercall = vmx_patch_hypercall,
8639 + .set_irq = vmx_inject_irq,
8640 + .set_nmi = vmx_inject_nmi,
8641 + .queue_exception = vmx_queue_exception,
8642 + .cancel_injection = vmx_cancel_injection,
8643 + .interrupt_allowed = vmx_interrupt_allowed,
8644 + .nmi_allowed = vmx_nmi_allowed,
8645 + .get_nmi_mask = vmx_get_nmi_mask,
8646 + .set_nmi_mask = vmx_set_nmi_mask,
8647 + .enable_nmi_window = enable_nmi_window,
8648 + .enable_irq_window = enable_irq_window,
8649 + .update_cr8_intercept = update_cr8_intercept,
8650 + .set_virtual_apic_mode = vmx_set_virtual_apic_mode,
8651 + .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
8652 + .get_enable_apicv = vmx_get_enable_apicv,
8653 + .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
8654 + .load_eoi_exitmap = vmx_load_eoi_exitmap,
8655 + .apicv_post_state_restore = vmx_apicv_post_state_restore,
8656 + .hwapic_irr_update = vmx_hwapic_irr_update,
8657 + .hwapic_isr_update = vmx_hwapic_isr_update,
8658 + .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
8659 + .sync_pir_to_irr = vmx_sync_pir_to_irr,
8660 + .deliver_posted_interrupt = vmx_deliver_posted_interrupt,
8661 + .dy_apicv_has_pending_interrupt = vmx_dy_apicv_has_pending_interrupt,
8662 +
8663 + .set_tss_addr = vmx_set_tss_addr,
8664 + .set_identity_map_addr = vmx_set_identity_map_addr,
8665 + .get_tdp_level = get_ept_level,
8666 + .get_mt_mask = vmx_get_mt_mask,
8667 +
8668 + .get_exit_info = vmx_get_exit_info,
8669 +
8670 + .get_lpage_level = vmx_get_lpage_level,
8671 +
8672 + .cpuid_update = vmx_cpuid_update,
8673 +
8674 + .rdtscp_supported = vmx_rdtscp_supported,
8675 + .invpcid_supported = vmx_invpcid_supported,
8676 +
8677 + .set_supported_cpuid = vmx_set_supported_cpuid,
8678 +
8679 + .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
8680 +
8681 + .read_l1_tsc_offset = vmx_read_l1_tsc_offset,
8682 + .write_l1_tsc_offset = vmx_write_l1_tsc_offset,
8683 +
8684 + .set_tdp_cr3 = vmx_set_cr3,
8685 +
8686 + .check_intercept = vmx_check_intercept,
8687 + .handle_exit_irqoff = vmx_handle_exit_irqoff,
8688 + .mpx_supported = vmx_mpx_supported,
8689 + .xsaves_supported = vmx_xsaves_supported,
8690 + .umip_emulated = vmx_umip_emulated,
8691 + .pt_supported = vmx_pt_supported,
8692 +
8693 + .request_immediate_exit = vmx_request_immediate_exit,
8694 +
8695 + .sched_in = vmx_sched_in,
8696 +
8697 + .slot_enable_log_dirty = vmx_slot_enable_log_dirty,
8698 + .slot_disable_log_dirty = vmx_slot_disable_log_dirty,
8699 + .flush_log_dirty = vmx_flush_log_dirty,
8700 + .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
8701 + .write_log_dirty = vmx_write_pml_buffer,
8702 +
8703 + .pre_block = vmx_pre_block,
8704 + .post_block = vmx_post_block,
8705 +
8706 + .pmu_ops = &intel_pmu_ops,
8707 +
8708 + .update_pi_irte = vmx_update_pi_irte,
8709 +
8710 +#ifdef CONFIG_X86_64
8711 + .set_hv_timer = vmx_set_hv_timer,
8712 + .cancel_hv_timer = vmx_cancel_hv_timer,
8713 +#endif
8714 +
8715 + .setup_mce = vmx_setup_mce,
8716 +
8717 + .smi_allowed = vmx_smi_allowed,
8718 + .pre_enter_smm = vmx_pre_enter_smm,
8719 + .pre_leave_smm = vmx_pre_leave_smm,
8720 + .enable_smi_window = enable_smi_window,
8721 +
8722 + .check_nested_events = NULL,
8723 + .get_nested_state = NULL,
8724 + .set_nested_state = NULL,
8725 + .get_vmcs12_pages = NULL,
8726 + .nested_enable_evmcs = NULL,
8727 + .nested_get_evmcs_version = NULL,
8728 + .need_emulation_on_page_fault = vmx_need_emulation_on_page_fault,
8729 + .apic_init_signal_blocked = vmx_apic_init_signal_blocked,
8730 +};
8731 +
8732 +static void vmx_cleanup_l1d_flush(void)
8733 +{
8734 + if (vmx_l1d_flush_pages) {
8735 + free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
8736 + vmx_l1d_flush_pages = NULL;
8737 + }
8738 + /* Restore state so sysfs ignores VMX */
8739 + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
8740 +}
8741 +
8742 +static void vmx_exit(void)
8743 +{
8744 +#ifdef CONFIG_KEXEC_CORE
8745 + RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
8746 + synchronize_rcu();
8747 +#endif
8748 +
8749 + kvm_exit();
8750 +
8751 +#if IS_ENABLED(CONFIG_HYPERV)
8752 + if (static_branch_unlikely(&enable_evmcs)) {
8753 + int cpu;
8754 + struct hv_vp_assist_page *vp_ap;
8755 + /*
8756 + * Reset everything to support using non-enlightened VMCS
8757 + * access later (e.g. when we reload the module with
8758 + * enlightened_vmcs=0)
8759 + */
8760 + for_each_online_cpu(cpu) {
8761 + vp_ap = hv_get_vp_assist_page(cpu);
8762 +
8763 + if (!vp_ap)
8764 + continue;
8765 +
8766 + vp_ap->nested_control.features.directhypercall = 0;
8767 + vp_ap->current_nested_vmcs = 0;
8768 + vp_ap->enlighten_vmentry = 0;
8769 + }
8770 +
8771 + static_branch_disable(&enable_evmcs);
8772 + }
8773 +#endif
8774 + vmx_cleanup_l1d_flush();
8775 +}
8776 +module_exit(vmx_exit);
8777 +
8778 +static int __init vmx_init(void)
8779 +{
8780 + int r;
8781 +
8782 +#if IS_ENABLED(CONFIG_HYPERV)
8783 + /*
8784 + * Enlightened VMCS usage should be recommended and the host needs
8785 + * to support eVMCS v1 or above. We can also disable eVMCS support
8786 + * with module parameter.
8787 + */
8788 + if (enlightened_vmcs &&
8789 + ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
8790 + (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
8791 + KVM_EVMCS_VERSION) {
8792 + int cpu;
8793 +
8794 + /* Check that we have assist pages on all online CPUs */
8795 + for_each_online_cpu(cpu) {
8796 + if (!hv_get_vp_assist_page(cpu)) {
8797 + enlightened_vmcs = false;
8798 + break;
8799 + }
8800 + }
8801 +
8802 + if (enlightened_vmcs) {
8803 + pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
8804 + static_branch_enable(&enable_evmcs);
8805 + }
8806 +
8807 + if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
8808 + vmx_x86_ops.enable_direct_tlbflush
8809 + = hv_enable_direct_tlbflush;
8810 +
8811 + } else {
8812 + enlightened_vmcs = false;
8813 + }
8814 +#endif
8815 +
8816 + r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
8817 + __alignof__(struct vcpu_vmx), THIS_MODULE);
8818 + if (r)
8819 + return r;
8820 +
8821 + /*
8822 + * Must be called after kvm_init() so enable_ept is properly set
8823 + * up. Hand the parameter mitigation value in which was stored in
8824 + * the pre module init parser. If no parameter was given, it will
8825 + * contain 'auto' which will be turned into the default 'cond'
8826 + * mitigation mode.
8827 + */
8828 + r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
8829 + if (r) {
8830 + vmx_exit();
8831 + return r;
8832 + }
8833 +
8834 +#ifdef CONFIG_KEXEC_CORE
8835 + rcu_assign_pointer(crash_vmclear_loaded_vmcss,
8836 + crash_vmclear_local_loaded_vmcss);
8837 +#endif
8838 + vmx_check_vmcs12_offsets();
8839 +
8840 + return 0;
8841 +}
8842 +module_init(vmx_init);
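
The preemption-timer handling in hardware_setup() above derives the timer's effective tick rate from the TSC frequency and the rate field of MSR_IA32_VMX_MISC, and refuses to use the timer if even its maximum programmable value would expire faster than once every ten seconds. A minimal standalone sketch of that arithmetic, using made-up example values rather than anything read from hardware:

/* Not part of the patch: illustrates the preemption-timer rate check. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t tsc_khz = 2800000;            /* assumed 2.8 GHz TSC */
    unsigned int rate = 5;                 /* assumed VMX_MISC rate field */
    uint64_t timer_freq = (tsc_khz * 1000) >> rate;

    /* The max value 0xffffffff expires after 0xffffffff / timer_freq
     * seconds; the timer is usable only if that is at least 10 seconds. */
    int usable = timer_freq <= 0xffffffffu / 10;

    printf("timer runs at %llu Hz, usable=%d\n",
           (unsigned long long)timer_freq, usable);
    return 0;
}
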
8843 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
8844 index c9c533370e88..43aabd72019b 100644
8845 --- a/arch/x86/kvm/x86.c
8846 +++ b/arch/x86/kvm/x86.c
8847 @@ -54,6 +54,7 @@
8848 #include <linux/pvclock_gtod.h>
8849 #include <linux/kvm_irqfd.h>
8850 #include <linux/irqbypass.h>
8851 +#include <linux/nospec.h>
8852 #include <trace/events/kvm.h>
8853
8854 #include <asm/debugreg.h>
8855 @@ -889,9 +890,11 @@ static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
8856
8857 static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
8858 {
8859 + size_t size = ARRAY_SIZE(vcpu->arch.db);
8860 +
8861 switch (dr) {
8862 case 0 ... 3:
8863 - vcpu->arch.db[dr] = val;
8864 + vcpu->arch.db[array_index_nospec(dr, size)] = val;
8865 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
8866 vcpu->arch.eff_db[dr] = val;
8867 break;
8868 @@ -928,9 +931,11 @@ EXPORT_SYMBOL_GPL(kvm_set_dr);
8869
8870 int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
8871 {
8872 + size_t size = ARRAY_SIZE(vcpu->arch.db);
8873 +
8874 switch (dr) {
8875 case 0 ... 3:
8876 - *val = vcpu->arch.db[dr];
8877 + *val = vcpu->arch.db[array_index_nospec(dr, size)];
8878 break;
8879 case 4:
8880 /* fall through */
8881 @@ -2125,7 +2130,10 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
8882 default:
8883 if (msr >= MSR_IA32_MC0_CTL &&
8884 msr < MSR_IA32_MCx_CTL(bank_num)) {
8885 - u32 offset = msr - MSR_IA32_MC0_CTL;
8886 + u32 offset = array_index_nospec(
8887 + msr - MSR_IA32_MC0_CTL,
8888 + MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
8889 +
8890 /* only 0 or all 1s can be written to IA32_MCi_CTL
8891 * some Linux kernels though clear bit 10 in bank 4 to
8892 * workaround a BIOS/GART TBL issue on AMD K8s, ignore
8893 @@ -2493,7 +2501,10 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
8894 default:
8895 if (msr >= MSR_IA32_MC0_CTL &&
8896 msr < MSR_IA32_MCx_CTL(bank_num)) {
8897 - u32 offset = msr - MSR_IA32_MC0_CTL;
8898 + u32 offset = array_index_nospec(
8899 + msr - MSR_IA32_MC0_CTL,
8900 + MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
8901 +
8902 data = vcpu->arch.mce_banks[offset];
8903 break;
8904 }
8905 @@ -6121,14 +6132,12 @@ static void kvm_set_mmio_spte_mask(void)
8906 /* Set the present bit. */
8907 mask |= 1ull;
8908
8909 -#ifdef CONFIG_X86_64
8910 /*
8911 * If reserved bit is not supported, clear the present bit to disable
8912 * mmio page fault.
8913 */
8914 if (maxphyaddr == 52)
8915 mask &= ~1ull;
8916 -#endif
8917
8918 kvm_mmu_set_mmio_spte_mask(mask);
8919 }
8920 @@ -7798,7 +7807,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
8921 kvm_mmu_unload(vcpu);
8922 vcpu_put(vcpu);
8923
8924 - kvm_x86_ops->vcpu_free(vcpu);
8925 + kvm_arch_vcpu_free(vcpu);
8926 }
8927
8928 void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
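
The x86.c hunks above clamp the debug-register index and the MCE bank offset with array_index_nospec() before they are used, so that a mispredicted bounds check cannot be steered into a speculative out-of-bounds load. A minimal sketch of the same idiom; the db_regs[] array and the function name are invented for illustration and are not part of the patch:

#include <linux/nospec.h>

#define NR_DB_REGS 4

static unsigned long db_regs[NR_DB_REGS];

/* Clamp 'dr' so it cannot index past the array even speculatively. */
static unsigned long example_get_db(int dr)
{
        if (dr < 0 || dr >= NR_DB_REGS)
                return 0;
        dr = array_index_nospec(dr, NR_DB_REGS);
        return db_regs[dr];
}
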
8929 diff --git a/crypto/algapi.c b/crypto/algapi.c
8930 index 5c098ffa7d3d..9e5b24329b41 100644
8931 --- a/crypto/algapi.c
8932 +++ b/crypto/algapi.c
8933 @@ -652,11 +652,9 @@ EXPORT_SYMBOL_GPL(crypto_grab_spawn);
8934
8935 void crypto_drop_spawn(struct crypto_spawn *spawn)
8936 {
8937 - if (!spawn->alg)
8938 - return;
8939 -
8940 down_write(&crypto_alg_sem);
8941 - list_del(&spawn->list);
8942 + if (spawn->alg)
8943 + list_del(&spawn->list);
8944 up_write(&crypto_alg_sem);
8945 }
8946 EXPORT_SYMBOL_GPL(crypto_drop_spawn);
8947 @@ -664,22 +662,16 @@ EXPORT_SYMBOL_GPL(crypto_drop_spawn);
8948 static struct crypto_alg *crypto_spawn_alg(struct crypto_spawn *spawn)
8949 {
8950 struct crypto_alg *alg;
8951 - struct crypto_alg *alg2;
8952
8953 down_read(&crypto_alg_sem);
8954 alg = spawn->alg;
8955 - alg2 = alg;
8956 - if (alg2)
8957 - alg2 = crypto_mod_get(alg2);
8958 - up_read(&crypto_alg_sem);
8959 -
8960 - if (!alg2) {
8961 - if (alg)
8962 - crypto_shoot_alg(alg);
8963 - return ERR_PTR(-EAGAIN);
8964 + if (alg && !crypto_mod_get(alg)) {
8965 + alg->cra_flags |= CRYPTO_ALG_DYING;
8966 + alg = NULL;
8967 }
8968 + up_read(&crypto_alg_sem);
8969
8970 - return alg;
8971 + return alg ?: ERR_PTR(-EAGAIN);
8972 }
8973
8974 struct crypto_tfm *crypto_spawn_tfm(struct crypto_spawn *spawn, u32 type,
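
The reworked crypto_spawn_alg() above takes its reference while crypto_alg_sem is still held and marks the algorithm CRYPTO_ALG_DYING in place when that fails, instead of dropping the lock first. A rough sketch of the same shape with invented names (struct thing, registry_sem); it mirrors the pattern only, not the real crypto API:

#include <linux/err.h>
#include <linux/errno.h>
#include <linux/kref.h>
#include <linux/rwsem.h>
#include <linux/types.h>

struct thing {
        struct kref ref;
        bool dying;
};

static DECLARE_RWSEM(registry_sem);

static struct thing *thing_get(struct thing **slot)
{
        struct thing *t;

        down_read(&registry_sem);
        t = *slot;
        /* Take the reference while the lock still guarantees *slot cannot
         * be unregistered and freed underneath us; mark it dying if the
         * reference can no longer be taken. */
        if (t && !kref_get_unless_zero(&t->ref)) {
                t->dying = true;
                t = NULL;
        }
        up_read(&registry_sem);

        return t ?: ERR_PTR(-EAGAIN);
}
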
8975 diff --git a/crypto/api.c b/crypto/api.c
8976 index abf53e67e3d8..b273b3a726a9 100644
8977 --- a/crypto/api.c
8978 +++ b/crypto/api.c
8979 @@ -355,13 +355,12 @@ static unsigned int crypto_ctxsize(struct crypto_alg *alg, u32 type, u32 mask)
8980 return len;
8981 }
8982
8983 -void crypto_shoot_alg(struct crypto_alg *alg)
8984 +static void crypto_shoot_alg(struct crypto_alg *alg)
8985 {
8986 down_write(&crypto_alg_sem);
8987 alg->cra_flags |= CRYPTO_ALG_DYING;
8988 up_write(&crypto_alg_sem);
8989 }
8990 -EXPORT_SYMBOL_GPL(crypto_shoot_alg);
8991
8992 struct crypto_tfm *__crypto_alloc_tfm(struct crypto_alg *alg, u32 type,
8993 u32 mask)
8994 diff --git a/crypto/internal.h b/crypto/internal.h
8995 index 7eefcdb00227..6184c4226a8f 100644
8996 --- a/crypto/internal.h
8997 +++ b/crypto/internal.h
8998 @@ -87,7 +87,6 @@ void crypto_alg_tested(const char *name, int err);
8999 void crypto_remove_spawns(struct crypto_alg *alg, struct list_head *list,
9000 struct crypto_alg *nalg);
9001 void crypto_remove_final(struct list_head *list);
9002 -void crypto_shoot_alg(struct crypto_alg *alg);
9003 struct crypto_tfm *__crypto_alloc_tfm(struct crypto_alg *alg, u32 type,
9004 u32 mask);
9005 void *crypto_create_tfm(struct crypto_alg *alg,
9006 diff --git a/crypto/pcrypt.c b/crypto/pcrypt.c
9007 index 1348541da463..85082574c515 100644
9008 --- a/crypto/pcrypt.c
9009 +++ b/crypto/pcrypt.c
9010 @@ -130,7 +130,6 @@ static void pcrypt_aead_done(struct crypto_async_request *areq, int err)
9011 struct padata_priv *padata = pcrypt_request_padata(preq);
9012
9013 padata->info = err;
9014 - req->base.flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
9015
9016 padata_do_serial(padata);
9017 }
9018 diff --git a/drivers/clk/tegra/clk-tegra-periph.c b/drivers/clk/tegra/clk-tegra-periph.c
9019 index 4ce4e7fb1124..d9c1f229c644 100644
9020 --- a/drivers/clk/tegra/clk-tegra-periph.c
9021 +++ b/drivers/clk/tegra/clk-tegra-periph.c
9022 @@ -797,7 +797,11 @@ static struct tegra_periph_init_data gate_clks[] = {
9023 GATE("vcp", "clk_m", 29, 0, tegra_clk_vcp, 0),
9024 GATE("apbdma", "clk_m", 34, 0, tegra_clk_apbdma, 0),
9025 GATE("kbc", "clk_32k", 36, TEGRA_PERIPH_ON_APB | TEGRA_PERIPH_NO_RESET, tegra_clk_kbc, 0),
9026 - GATE("fuse", "clk_m", 39, TEGRA_PERIPH_ON_APB, tegra_clk_fuse, 0),
9027 + /*
9028 + * Critical for RAM re-repair operation, which must occur on resume
9029 + * from LP1 system suspend and as part of CCPLEX cluster switching.
9030 + */
9031 + GATE("fuse", "clk_m", 39, TEGRA_PERIPH_ON_APB, tegra_clk_fuse, CLK_IS_CRITICAL),
9032 GATE("fuse_burn", "clk_m", 39, TEGRA_PERIPH_ON_APB, tegra_clk_fuse_burn, 0),
9033 GATE("kfuse", "clk_m", 40, TEGRA_PERIPH_ON_APB, tegra_clk_kfuse, 0),
9034 GATE("apbif", "clk_m", 107, TEGRA_PERIPH_ON_APB, tegra_clk_apbif, 0),
9035 diff --git a/drivers/crypto/atmel-aes.c b/drivers/crypto/atmel-aes.c
9036 index e3d40a8dfffb..915253b9c912 100644
9037 --- a/drivers/crypto/atmel-aes.c
9038 +++ b/drivers/crypto/atmel-aes.c
9039 @@ -87,7 +87,6 @@
9040 struct atmel_aes_caps {
9041 bool has_dualbuff;
9042 bool has_cfb64;
9043 - bool has_ctr32;
9044 bool has_gcm;
9045 u32 max_burst_size;
9046 };
9047 @@ -923,8 +922,9 @@ static int atmel_aes_ctr_transfer(struct atmel_aes_dev *dd)
9048 struct atmel_aes_ctr_ctx *ctx = atmel_aes_ctr_ctx_cast(dd->ctx);
9049 struct ablkcipher_request *req = ablkcipher_request_cast(dd->areq);
9050 struct scatterlist *src, *dst;
9051 - u32 ctr, blocks;
9052 size_t datalen;
9053 + u32 ctr;
9054 + u16 blocks, start, end;
9055 bool use_dma, fragmented = false;
9056
9057 /* Check for transfer completion. */
9058 @@ -936,27 +936,17 @@ static int atmel_aes_ctr_transfer(struct atmel_aes_dev *dd)
9059 datalen = req->nbytes - ctx->offset;
9060 blocks = DIV_ROUND_UP(datalen, AES_BLOCK_SIZE);
9061 ctr = be32_to_cpu(ctx->iv[3]);
9062 - if (dd->caps.has_ctr32) {
9063 - /* Check 32bit counter overflow. */
9064 - u32 start = ctr;
9065 - u32 end = start + blocks - 1;
9066 -
9067 - if (end < start) {
9068 - ctr |= 0xffffffff;
9069 - datalen = AES_BLOCK_SIZE * -start;
9070 - fragmented = true;
9071 - }
9072 - } else {
9073 - /* Check 16bit counter overflow. */
9074 - u16 start = ctr & 0xffff;
9075 - u16 end = start + (u16)blocks - 1;
9076 -
9077 - if (blocks >> 16 || end < start) {
9078 - ctr |= 0xffff;
9079 - datalen = AES_BLOCK_SIZE * (0x10000-start);
9080 - fragmented = true;
9081 - }
9082 +
9083 + /* Check 16bit counter overflow. */
9084 + start = ctr & 0xffff;
9085 + end = start + blocks - 1;
9086 +
9087 + if (blocks >> 16 || end < start) {
9088 + ctr |= 0xffff;
9089 + datalen = AES_BLOCK_SIZE * (0x10000 - start);
9090 + fragmented = true;
9091 }
9092 +
9093 use_dma = (datalen >= ATMEL_AES_DMA_THRESHOLD);
9094
9095 /* Jump to offset. */
9096 @@ -1926,7 +1916,6 @@ static void atmel_aes_get_cap(struct atmel_aes_dev *dd)
9097 {
9098 dd->caps.has_dualbuff = 0;
9099 dd->caps.has_cfb64 = 0;
9100 - dd->caps.has_ctr32 = 0;
9101 dd->caps.has_gcm = 0;
9102 dd->caps.max_burst_size = 1;
9103
9104 @@ -1935,14 +1924,12 @@ static void atmel_aes_get_cap(struct atmel_aes_dev *dd)
9105 case 0x500:
9106 dd->caps.has_dualbuff = 1;
9107 dd->caps.has_cfb64 = 1;
9108 - dd->caps.has_ctr32 = 1;
9109 dd->caps.has_gcm = 1;
9110 dd->caps.max_burst_size = 4;
9111 break;
9112 case 0x200:
9113 dd->caps.has_dualbuff = 1;
9114 dd->caps.has_cfb64 = 1;
9115 - dd->caps.has_ctr32 = 1;
9116 dd->caps.has_gcm = 1;
9117 dd->caps.max_burst_size = 4;
9118 break;
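
The atmel-aes change above drops the 32-bit counter path and always splits a CTR request where the low 16 bits of the IV counter would wrap. A standalone sketch of that split computation, with example values that are not taken from the driver:

/* Not part of the driver: shows the 16-bit CTR wrap handling. */
#include <stdint.h>
#include <stdio.h>

#define AES_BLOCK_SIZE 16

int main(void)
{
    uint32_t ctr = 0xabcdfffa;                /* assumed IV counter word */
    size_t datalen = 64 * AES_BLOCK_SIZE;     /* assumed request length */
    uint16_t blocks = (datalen + AES_BLOCK_SIZE - 1) / AES_BLOCK_SIZE;
    uint16_t start = ctr & 0xffff;
    uint16_t end = start + blocks - 1;
    int fragmented = 0;

    if (end < start) {                        /* counter would wrap */
        datalen = AES_BLOCK_SIZE * (0x10000 - start);
        fragmented = 1;
    }

    printf("process %zu bytes in this round, fragmented=%d\n",
           datalen, fragmented);
    return 0;
}
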
9119 diff --git a/drivers/crypto/picoxcell_crypto.c b/drivers/crypto/picoxcell_crypto.c
9120 index 47576098831f..b3ea6d60c458 100644
9121 --- a/drivers/crypto/picoxcell_crypto.c
9122 +++ b/drivers/crypto/picoxcell_crypto.c
9123 @@ -1632,6 +1632,11 @@ static bool spacc_is_compatible(struct platform_device *pdev,
9124 return false;
9125 }
9126
9127 +static void spacc_tasklet_kill(void *data)
9128 +{
9129 + tasklet_kill(data);
9130 +}
9131 +
9132 static int spacc_probe(struct platform_device *pdev)
9133 {
9134 int i, err, ret = -EINVAL;
9135 @@ -1674,6 +1679,14 @@ static int spacc_probe(struct platform_device *pdev)
9136 return -ENXIO;
9137 }
9138
9139 + tasklet_init(&engine->complete, spacc_spacc_complete,
9140 + (unsigned long)engine);
9141 +
9142 + ret = devm_add_action(&pdev->dev, spacc_tasklet_kill,
9143 + &engine->complete);
9144 + if (ret)
9145 + return ret;
9146 +
9147 if (devm_request_irq(&pdev->dev, irq->start, spacc_spacc_irq, 0,
9148 engine->name, engine)) {
9149 dev_err(engine->dev, "failed to request IRQ\n");
9150 @@ -1736,8 +1749,6 @@ static int spacc_probe(struct platform_device *pdev)
9151 INIT_LIST_HEAD(&engine->completed);
9152 INIT_LIST_HEAD(&engine->in_progress);
9153 engine->in_flight = 0;
9154 - tasklet_init(&engine->complete, spacc_spacc_complete,
9155 - (unsigned long)engine);
9156
9157 platform_set_drvdata(pdev, engine);
9158
9159 diff --git a/drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_crtc.c b/drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_crtc.c
9160 index 9b17a66cf0e1..aa54a6a2ad1d 100644
9161 --- a/drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_crtc.c
9162 +++ b/drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_crtc.c
9163 @@ -81,7 +81,11 @@ static void atmel_hlcdc_crtc_mode_set_nofb(struct drm_crtc *c)
9164 struct videomode vm;
9165 unsigned long prate;
9166 unsigned int cfg;
9167 - int div;
9168 + int div, ret;
9169 +
9170 + ret = clk_prepare_enable(crtc->dc->hlcdc->sys_clk);
9171 + if (ret)
9172 + return;
9173
9174 vm.vfront_porch = adj->crtc_vsync_start - adj->crtc_vdisplay;
9175 vm.vback_porch = adj->crtc_vtotal - adj->crtc_vsync_end;
9176 @@ -140,6 +144,8 @@ static void atmel_hlcdc_crtc_mode_set_nofb(struct drm_crtc *c)
9177 ATMEL_HLCDC_VSPSU | ATMEL_HLCDC_VSPHO |
9178 ATMEL_HLCDC_GUARDTIME_MASK | ATMEL_HLCDC_MODE_MASK,
9179 cfg);
9180 +
9181 + clk_disable_unprepare(crtc->dc->hlcdc->sys_clk);
9182 }
9183
9184 static bool atmel_hlcdc_crtc_mode_fixup(struct drm_crtc *c,
9185 diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c
9186 index 1baa25e82bdd..f7d23c1081dc 100644
9187 --- a/drivers/infiniband/core/addr.c
9188 +++ b/drivers/infiniband/core/addr.c
9189 @@ -141,7 +141,7 @@ int ib_nl_handle_ip_res_resp(struct sk_buff *skb,
9190 if (ib_nl_is_good_ip_resp(nlh))
9191 ib_nl_process_good_ip_rsep(nlh);
9192
9193 - return skb->len;
9194 + return 0;
9195 }
9196
9197 static int ib_nl_ip_send_msg(struct rdma_dev_addr *dev_addr,
9198 diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
9199 index 5879a06ada93..1c459725d64e 100644
9200 --- a/drivers/infiniband/core/sa_query.c
9201 +++ b/drivers/infiniband/core/sa_query.c
9202 @@ -848,7 +848,7 @@ int ib_nl_handle_set_timeout(struct sk_buff *skb,
9203 }
9204
9205 settimeout_out:
9206 - return skb->len;
9207 + return 0;
9208 }
9209
9210 static inline int ib_nl_is_good_resolve_resp(const struct nlmsghdr *nlh)
9211 @@ -920,7 +920,7 @@ int ib_nl_handle_resolve_resp(struct sk_buff *skb,
9212 }
9213
9214 resp_out:
9215 - return skb->len;
9216 + return 0;
9217 }
9218
9219 static void free_sm_ah(struct kref *kref)
9220 diff --git a/drivers/infiniband/hw/mlx5/gsi.c b/drivers/infiniband/hw/mlx5/gsi.c
9221 index 79e6309460dc..262c18b2f525 100644
9222 --- a/drivers/infiniband/hw/mlx5/gsi.c
9223 +++ b/drivers/infiniband/hw/mlx5/gsi.c
9224 @@ -507,8 +507,7 @@ int mlx5_ib_gsi_post_send(struct ib_qp *qp, struct ib_send_wr *wr,
9225 ret = ib_post_send(tx_qp, &cur_wr.wr, bad_wr);
9226 if (ret) {
9227 /* Undo the effect of adding the outstanding wr */
9228 - gsi->outstanding_pi = (gsi->outstanding_pi - 1) %
9229 - gsi->cap.max_send_wr;
9230 + gsi->outstanding_pi--;
9231 goto err;
9232 }
9233 spin_unlock_irqrestore(&gsi->lock, flags);
9234 diff --git a/drivers/md/dm.c b/drivers/md/dm.c
9235 index 36e6221fabab..dd154027adc9 100644
9236 --- a/drivers/md/dm.c
9237 +++ b/drivers/md/dm.c
9238 @@ -1457,7 +1457,6 @@ void dm_init_md_queue(struct mapped_device *md)
9239 * - must do so here (in alloc_dev callchain) before queue is used
9240 */
9241 md->queue->queuedata = md;
9242 - md->queue->backing_dev_info.congested_data = md;
9243 }
9244
9245 void dm_init_normal_md_queue(struct mapped_device *md)
9246 @@ -1468,6 +1467,7 @@ void dm_init_normal_md_queue(struct mapped_device *md)
9247 /*
9248 * Initialize aspects of queue that aren't relevant for blk-mq
9249 */
9250 + md->queue->backing_dev_info.congested_data = md;
9251 md->queue->backing_dev_info.congested_fn = dm_any_congested;
9252 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
9253 }
9254 @@ -1555,6 +1555,12 @@ static struct mapped_device *alloc_dev(int minor)
9255 goto bad;
9256
9257 dm_init_md_queue(md);
9258 + /*
9259 + * default to bio-based required ->make_request_fn until DM
9260 + * table is loaded and md->type established. If request-based
9261 + * table is loaded: blk-mq will override accordingly.
9262 + */
9263 + blk_queue_make_request(md->queue, dm_make_request);
9264
9265 md->disk = alloc_disk_node(1, numa_node_id);
9266 if (!md->disk)
9267 @@ -1853,7 +1859,6 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
9268 case DM_TYPE_BIO_BASED:
9269 case DM_TYPE_DAX_BIO_BASED:
9270 dm_init_normal_md_queue(md);
9271 - blk_queue_make_request(md->queue, dm_make_request);
9272 /*
9273 * DM handles splitting bios as needed. Free the bio_split bioset
9274 * since it won't be used (saves 1 process per bio-based DM device).
9275 diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
9276 index 306d2e4502c4..22729fd92a1b 100644
9277 --- a/drivers/md/persistent-data/dm-space-map-common.c
9278 +++ b/drivers/md/persistent-data/dm-space-map-common.c
9279 @@ -382,6 +382,33 @@ int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
9280 return -ENOSPC;
9281 }
9282
9283 +int sm_ll_find_common_free_block(struct ll_disk *old_ll, struct ll_disk *new_ll,
9284 + dm_block_t begin, dm_block_t end, dm_block_t *b)
9285 +{
9286 + int r;
9287 + uint32_t count;
9288 +
9289 + do {
9290 + r = sm_ll_find_free_block(new_ll, begin, new_ll->nr_blocks, b);
9291 + if (r)
9292 + break;
9293 +
9294 + /* double check this block wasn't used in the old transaction */
9295 + if (*b >= old_ll->nr_blocks)
9296 + count = 0;
9297 + else {
9298 + r = sm_ll_lookup(old_ll, *b, &count);
9299 + if (r)
9300 + break;
9301 +
9302 + if (count)
9303 + begin = *b + 1;
9304 + }
9305 + } while (count);
9306 +
9307 + return r;
9308 +}
9309 +
9310 static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
9311 int (*mutator)(void *context, uint32_t old, uint32_t *new),
9312 void *context, enum allocation_event *ev)
9313 diff --git a/drivers/md/persistent-data/dm-space-map-common.h b/drivers/md/persistent-data/dm-space-map-common.h
9314 index b3078d5eda0c..8de63ce39bdd 100644
9315 --- a/drivers/md/persistent-data/dm-space-map-common.h
9316 +++ b/drivers/md/persistent-data/dm-space-map-common.h
9317 @@ -109,6 +109,8 @@ int sm_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result);
9318 int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result);
9319 int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
9320 dm_block_t end, dm_block_t *result);
9321 +int sm_ll_find_common_free_block(struct ll_disk *old_ll, struct ll_disk *new_ll,
9322 + dm_block_t begin, dm_block_t end, dm_block_t *result);
9323 int sm_ll_insert(struct ll_disk *ll, dm_block_t b, uint32_t ref_count, enum allocation_event *ev);
9324 int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev);
9325 int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev);
9326 diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c
9327 index 32adf6b4a9c7..bf4c5e2ccb6f 100644
9328 --- a/drivers/md/persistent-data/dm-space-map-disk.c
9329 +++ b/drivers/md/persistent-data/dm-space-map-disk.c
9330 @@ -167,8 +167,10 @@ static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b)
9331 enum allocation_event ev;
9332 struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
9333
9334 - /* FIXME: we should loop round a couple of times */
9335 - r = sm_ll_find_free_block(&smd->old_ll, smd->begin, smd->old_ll.nr_blocks, b);
9336 + /*
9337 + * Any block we allocate has to be free in both the old and current ll.
9338 + */
9339 + r = sm_ll_find_common_free_block(&smd->old_ll, &smd->ll, smd->begin, smd->ll.nr_blocks, b);
9340 if (r)
9341 return r;
9342
9343 diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
9344 index 1d29771af380..967d8f2a731f 100644
9345 --- a/drivers/md/persistent-data/dm-space-map-metadata.c
9346 +++ b/drivers/md/persistent-data/dm-space-map-metadata.c
9347 @@ -447,7 +447,10 @@ static int sm_metadata_new_block_(struct dm_space_map *sm, dm_block_t *b)
9348 enum allocation_event ev;
9349 struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
9350
9351 - r = sm_ll_find_free_block(&smm->old_ll, smm->begin, smm->old_ll.nr_blocks, b);
9352 + /*
9353 + * Any block we allocate has to be free in both the old and current ll.
9354 + */
9355 + r = sm_ll_find_common_free_block(&smm->old_ll, &smm->ll, smm->begin, smm->ll.nr_blocks, b);
9356 if (r)
9357 return r;
9358
9359 diff --git a/drivers/media/rc/iguanair.c b/drivers/media/rc/iguanair.c
9360 index 25470395c43f..246795c31553 100644
9361 --- a/drivers/media/rc/iguanair.c
9362 +++ b/drivers/media/rc/iguanair.c
9363 @@ -430,7 +430,7 @@ static int iguanair_probe(struct usb_interface *intf,
9364 int ret, pipein, pipeout;
9365 struct usb_host_interface *idesc;
9366
9367 - idesc = intf->altsetting;
9368 + idesc = intf->cur_altsetting;
9369 if (idesc->desc.bNumEndpoints < 2)
9370 return -ENODEV;
9371
9372 diff --git a/drivers/media/usb/uvc/uvc_driver.c b/drivers/media/usb/uvc/uvc_driver.c
9373 index 7c375b6dd318..9803135f2e59 100644
9374 --- a/drivers/media/usb/uvc/uvc_driver.c
9375 +++ b/drivers/media/usb/uvc/uvc_driver.c
9376 @@ -1411,6 +1411,11 @@ static int uvc_scan_chain_forward(struct uvc_video_chain *chain,
9377 break;
9378 if (forward == prev)
9379 continue;
9380 + if (forward->chain.next || forward->chain.prev) {
9381 + uvc_trace(UVC_TRACE_DESCR, "Found reference to "
9382 + "entity %d already in chain.\n", forward->id);
9383 + return -EINVAL;
9384 + }
9385
9386 switch (UVC_ENTITY_TYPE(forward)) {
9387 case UVC_VC_EXTENSION_UNIT:
9388 @@ -1492,6 +1497,13 @@ static int uvc_scan_chain_backward(struct uvc_video_chain *chain,
9389 return -1;
9390 }
9391
9392 + if (term->chain.next || term->chain.prev) {
9393 + uvc_trace(UVC_TRACE_DESCR, "Found reference to "
9394 + "entity %d already in chain.\n",
9395 + term->id);
9396 + return -EINVAL;
9397 + }
9398 +
9399 if (uvc_trace_param & UVC_TRACE_PROBE)
9400 printk(" %d", term->id);
9401
9402 diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c
9403 index b6189a4958c5..025822bc6941 100644
9404 --- a/drivers/media/v4l2-core/videobuf-dma-sg.c
9405 +++ b/drivers/media/v4l2-core/videobuf-dma-sg.c
9406 @@ -352,8 +352,11 @@ int videobuf_dma_free(struct videobuf_dmabuf *dma)
9407 BUG_ON(dma->sglen);
9408
9409 if (dma->pages) {
9410 - for (i = 0; i < dma->nr_pages; i++)
9411 + for (i = 0; i < dma->nr_pages; i++) {
9412 + if (dma->direction == DMA_FROM_DEVICE)
9413 + set_page_dirty_lock(dma->pages[i]);
9414 put_page(dma->pages[i]);
9415 + }
9416 kfree(dma->pages);
9417 dma->pages = NULL;
9418 }
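
The videobuf-dma-sg fix above marks pages dirty before releasing them when the DMA direction was device-to-memory, so data written by the device is not lost if the page is later reclaimed. A small sketch of that release pattern with an invented helper name:

#include <linux/dma-direction.h>
#include <linux/mm.h>

static void example_release_pages(struct page **pages, int nr,
                                  enum dma_data_direction dir)
{
        int i;

        for (i = 0; i < nr; i++) {
                /* The device may have written into these pages. */
                if (dir == DMA_FROM_DEVICE)
                        set_page_dirty_lock(pages[i]);
                put_page(pages[i]);
        }
}
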
9419 diff --git a/drivers/mfd/da9062-core.c b/drivers/mfd/da9062-core.c
9420 index 8f873866ea60..86aab322da33 100644
9421 --- a/drivers/mfd/da9062-core.c
9422 +++ b/drivers/mfd/da9062-core.c
9423 @@ -142,7 +142,7 @@ static const struct mfd_cell da9062_devs[] = {
9424 .name = "da9062-watchdog",
9425 .num_resources = ARRAY_SIZE(da9062_wdt_resources),
9426 .resources = da9062_wdt_resources,
9427 - .of_compatible = "dlg,da9062-wdt",
9428 + .of_compatible = "dlg,da9062-watchdog",
9429 },
9430 {
9431 .name = "da9062-thermal",
9432 diff --git a/drivers/mfd/dln2.c b/drivers/mfd/dln2.c
9433 index 704e189ca162..95d0f2df0ad4 100644
9434 --- a/drivers/mfd/dln2.c
9435 +++ b/drivers/mfd/dln2.c
9436 @@ -729,6 +729,8 @@ static int dln2_probe(struct usb_interface *interface,
9437 const struct usb_device_id *usb_id)
9438 {
9439 struct usb_host_interface *hostif = interface->cur_altsetting;
9440 + struct usb_endpoint_descriptor *epin;
9441 + struct usb_endpoint_descriptor *epout;
9442 struct device *dev = &interface->dev;
9443 struct dln2_dev *dln2;
9444 int ret;
9445 @@ -738,12 +740,19 @@ static int dln2_probe(struct usb_interface *interface,
9446 hostif->desc.bNumEndpoints < 2)
9447 return -ENODEV;
9448
9449 + epin = &hostif->endpoint[0].desc;
9450 + epout = &hostif->endpoint[1].desc;
9451 + if (!usb_endpoint_is_bulk_out(epout))
9452 + return -ENODEV;
9453 + if (!usb_endpoint_is_bulk_in(epin))
9454 + return -ENODEV;
9455 +
9456 dln2 = kzalloc(sizeof(*dln2), GFP_KERNEL);
9457 if (!dln2)
9458 return -ENOMEM;
9459
9460 - dln2->ep_out = hostif->endpoint[0].desc.bEndpointAddress;
9461 - dln2->ep_in = hostif->endpoint[1].desc.bEndpointAddress;
9462 + dln2->ep_out = epout->bEndpointAddress;
9463 + dln2->ep_in = epin->bEndpointAddress;
9464 dln2->usb_dev = usb_get_dev(interface_to_usbdev(interface));
9465 dln2->interface = interface;
9466 usb_set_intfdata(interface, dln2);
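
The dln2 probe fix above checks that the first two endpoints really are bulk IN and bulk OUT before caching their addresses, so a malformed or malicious descriptor cannot be trusted blindly. A short sketch of that validation with an invented function name:

#include <linux/errno.h>
#include <linux/usb.h>

static int example_check_endpoints(struct usb_host_interface *hostif,
                                   u8 *ep_in, u8 *ep_out)
{
        struct usb_endpoint_descriptor *in, *out;

        if (hostif->desc.bNumEndpoints < 2)
                return -ENODEV;

        in = &hostif->endpoint[0].desc;
        out = &hostif->endpoint[1].desc;
        if (!usb_endpoint_is_bulk_in(in) || !usb_endpoint_is_bulk_out(out))
                return -ENODEV;

        *ep_in = in->bEndpointAddress;
        *ep_out = out->bEndpointAddress;
        return 0;
}
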
9467 diff --git a/drivers/mfd/rn5t618.c b/drivers/mfd/rn5t618.c
9468 index ee94080e1cbb..dd20c3e32352 100644
9469 --- a/drivers/mfd/rn5t618.c
9470 +++ b/drivers/mfd/rn5t618.c
9471 @@ -32,6 +32,7 @@ static bool rn5t618_volatile_reg(struct device *dev, unsigned int reg)
9472 case RN5T618_WATCHDOGCNT:
9473 case RN5T618_DCIRQ:
9474 case RN5T618_ILIMDATAH ... RN5T618_AIN0DATAL:
9475 + case RN5T618_ADCCNT3:
9476 case RN5T618_IR_ADC1 ... RN5T618_IR_ADC3:
9477 case RN5T618_IR_GPR:
9478 case RN5T618_IR_GPF:
9479 diff --git a/drivers/mmc/host/mmc_spi.c b/drivers/mmc/host/mmc_spi.c
9480 index c2df68e958b3..279d5da6e54b 100644
9481 --- a/drivers/mmc/host/mmc_spi.c
9482 +++ b/drivers/mmc/host/mmc_spi.c
9483 @@ -1157,17 +1157,22 @@ static void mmc_spi_initsequence(struct mmc_spi_host *host)
9484 * SPI protocol. Another is that when chipselect is released while
9485 * the card returns BUSY status, the clock must issue several cycles
9486 * with chipselect high before the card will stop driving its output.
9487 + *
9488 + * SPI_CS_HIGH means "asserted" here. In some cases like when using
9489 + * GPIOs for chip select, SPI_CS_HIGH is set but this will be logically
9490 + * inverted by gpiolib, so if we want to ascertain to drive it high
9491 + * we should toggle the default with an XOR as we do here.
9492 */
9493 - host->spi->mode |= SPI_CS_HIGH;
9494 + host->spi->mode ^= SPI_CS_HIGH;
9495 if (spi_setup(host->spi) != 0) {
9496 /* Just warn; most cards work without it. */
9497 dev_warn(&host->spi->dev,
9498 "can't change chip-select polarity\n");
9499 - host->spi->mode &= ~SPI_CS_HIGH;
9500 + host->spi->mode ^= SPI_CS_HIGH;
9501 } else {
9502 mmc_spi_readbytes(host, 18);
9503
9504 - host->spi->mode &= ~SPI_CS_HIGH;
9505 + host->spi->mode ^= SPI_CS_HIGH;
9506 if (spi_setup(host->spi) != 0) {
9507 /* Wot, we can't get the same setup we had before? */
9508 dev_err(&host->spi->dev,
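
The mmc_spi comment added above explains why the chip-select polarity is toggled with XOR rather than set and cleared: when a GPIO chip select is in use, SPI_CS_HIGH may already be set and logically inverted by gpiolib, and applying the same XOR twice restores whatever the original state was. A trivial standalone illustration of that property (the flag value appears here only for the example):

#include <stdio.h>

#define SPI_CS_HIGH 0x04

int main(void)
{
    unsigned int mode = 0x00;          /* could equally start as 0x04 */

    mode ^= SPI_CS_HIGH;               /* flip away from the default */
    printf("toggled:  0x%02x\n", mode);

    mode ^= SPI_CS_HIGH;               /* second XOR restores it */
    printf("restored: 0x%02x\n", mode);
    return 0;
}
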
9509 diff --git a/drivers/mtd/ubi/fastmap.c b/drivers/mtd/ubi/fastmap.c
9510 index b44c8d348e78..e7b177c61642 100644
9511 --- a/drivers/mtd/ubi/fastmap.c
9512 +++ b/drivers/mtd/ubi/fastmap.c
9513 @@ -73,7 +73,7 @@ static int self_check_seen(struct ubi_device *ubi, unsigned long *seen)
9514 return 0;
9515
9516 for (pnum = 0; pnum < ubi->peb_count; pnum++) {
9517 - if (test_bit(pnum, seen) && ubi->lookuptbl[pnum]) {
9518 + if (!test_bit(pnum, seen) && ubi->lookuptbl[pnum]) {
9519 ubi_err(ubi, "self-check failed for PEB %d, fastmap didn't see it", pnum);
9520 ret = -EINVAL;
9521 }
9522 @@ -1127,7 +1127,7 @@ static int ubi_write_fastmap(struct ubi_device *ubi,
9523 struct rb_node *tmp_rb;
9524 int ret, i, j, free_peb_count, used_peb_count, vol_count;
9525 int scrub_peb_count, erase_peb_count;
9526 - unsigned long *seen_pebs = NULL;
9527 + unsigned long *seen_pebs;
9528
9529 fm_raw = ubi->fm_buf;
9530 memset(ubi->fm_buf, 0, ubi->fm_size);
9531 @@ -1141,7 +1141,7 @@ static int ubi_write_fastmap(struct ubi_device *ubi,
9532 dvbuf = new_fm_vbuf(ubi, UBI_FM_DATA_VOLUME_ID);
9533 if (!dvbuf) {
9534 ret = -ENOMEM;
9535 - goto out_kfree;
9536 + goto out_free_avbuf;
9537 }
9538
9539 avhdr = ubi_get_vid_hdr(avbuf);
9540 @@ -1150,7 +1150,7 @@ static int ubi_write_fastmap(struct ubi_device *ubi,
9541 seen_pebs = init_seen(ubi);
9542 if (IS_ERR(seen_pebs)) {
9543 ret = PTR_ERR(seen_pebs);
9544 - goto out_kfree;
9545 + goto out_free_dvbuf;
9546 }
9547
9548 spin_lock(&ubi->volumes_lock);
9549 @@ -1318,7 +1318,7 @@ static int ubi_write_fastmap(struct ubi_device *ubi,
9550 ret = ubi_io_write_vid_hdr(ubi, new_fm->e[0]->pnum, avbuf);
9551 if (ret) {
9552 ubi_err(ubi, "unable to write vid_hdr to fastmap SB!");
9553 - goto out_kfree;
9554 + goto out_free_seen;
9555 }
9556
9557 for (i = 0; i < new_fm->used_blocks; i++) {
9558 @@ -1340,7 +1340,7 @@ static int ubi_write_fastmap(struct ubi_device *ubi,
9559 if (ret) {
9560 ubi_err(ubi, "unable to write vid_hdr to PEB %i!",
9561 new_fm->e[i]->pnum);
9562 - goto out_kfree;
9563 + goto out_free_seen;
9564 }
9565 }
9566
9567 @@ -1350,7 +1350,7 @@ static int ubi_write_fastmap(struct ubi_device *ubi,
9568 if (ret) {
9569 ubi_err(ubi, "unable to write fastmap to PEB %i!",
9570 new_fm->e[i]->pnum);
9571 - goto out_kfree;
9572 + goto out_free_seen;
9573 }
9574 }
9575
9576 @@ -1360,10 +1360,13 @@ static int ubi_write_fastmap(struct ubi_device *ubi,
9577 ret = self_check_seen(ubi, seen_pebs);
9578 dbg_bld("fastmap written!");
9579
9580 -out_kfree:
9581 - ubi_free_vid_buf(avbuf);
9582 - ubi_free_vid_buf(dvbuf);
9583 +out_free_seen:
9584 free_seen(seen_pebs);
9585 +out_free_dvbuf:
9586 + ubi_free_vid_buf(dvbuf);
9587 +out_free_avbuf:
9588 + ubi_free_vid_buf(avbuf);
9589 +
9590 out:
9591 return ret;
9592 }
9593 diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
9594 index 91d8a48e53c3..9834d28d52e8 100644
9595 --- a/drivers/net/bonding/bond_alb.c
9596 +++ b/drivers/net/bonding/bond_alb.c
9597 @@ -1371,26 +1371,31 @@ int bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
9598 bool do_tx_balance = true;
9599 u32 hash_index = 0;
9600 const u8 *hash_start = NULL;
9601 - struct ipv6hdr *ip6hdr;
9602
9603 skb_reset_mac_header(skb);
9604 eth_data = eth_hdr(skb);
9605
9606 switch (ntohs(skb->protocol)) {
9607 case ETH_P_IP: {
9608 - const struct iphdr *iph = ip_hdr(skb);
9609 + const struct iphdr *iph;
9610
9611 if (ether_addr_equal_64bits(eth_data->h_dest, mac_bcast) ||
9612 - (iph->daddr == ip_bcast) ||
9613 - (iph->protocol == IPPROTO_IGMP)) {
9614 + (!pskb_network_may_pull(skb, sizeof(*iph)))) {
9615 + do_tx_balance = false;
9616 + break;
9617 + }
9618 + iph = ip_hdr(skb);
9619 + if (iph->daddr == ip_bcast || iph->protocol == IPPROTO_IGMP) {
9620 do_tx_balance = false;
9621 break;
9622 }
9623 hash_start = (char *)&(iph->daddr);
9624 hash_size = sizeof(iph->daddr);
9625 - }
9626 break;
9627 - case ETH_P_IPV6:
9628 + }
9629 + case ETH_P_IPV6: {
9630 + const struct ipv6hdr *ip6hdr;
9631 +
9632 /* IPv6 doesn't really use broadcast mac address, but leave
9633 * that here just in case.
9634 */
9635 @@ -1407,7 +1412,11 @@ int bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
9636 break;
9637 }
9638
9639 - /* Additianally, DAD probes should not be tx-balanced as that
9640 + if (!pskb_network_may_pull(skb, sizeof(*ip6hdr))) {
9641 + do_tx_balance = false;
9642 + break;
9643 + }
9644 + /* Additionally, DAD probes should not be tx-balanced as that
9645 * will lead to false positives for duplicate addresses and
9646 * prevent address configuration from working.
9647 */
9648 @@ -1417,17 +1426,26 @@ int bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
9649 break;
9650 }
9651
9652 - hash_start = (char *)&(ipv6_hdr(skb)->daddr);
9653 - hash_size = sizeof(ipv6_hdr(skb)->daddr);
9654 + hash_start = (char *)&ip6hdr->daddr;
9655 + hash_size = sizeof(ip6hdr->daddr);
9656 break;
9657 - case ETH_P_IPX:
9658 - if (ipx_hdr(skb)->ipx_checksum != IPX_NO_CHECKSUM) {
9659 + }
9660 + case ETH_P_IPX: {
9661 + const struct ipxhdr *ipxhdr;
9662 +
9663 + if (pskb_network_may_pull(skb, sizeof(*ipxhdr))) {
9664 + do_tx_balance = false;
9665 + break;
9666 + }
9667 + ipxhdr = (struct ipxhdr *)skb_network_header(skb);
9668 +
9669 + if (ipxhdr->ipx_checksum != IPX_NO_CHECKSUM) {
9670 /* something is wrong with this packet */
9671 do_tx_balance = false;
9672 break;
9673 }
9674
9675 - if (ipx_hdr(skb)->ipx_type != IPX_TYPE_NCP) {
9676 + if (ipxhdr->ipx_type != IPX_TYPE_NCP) {
9677 /* The only protocol worth balancing in
9678 * this family since it has an "ARP" like
9679 * mechanism
9680 @@ -1436,9 +1454,11 @@ int bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
9681 break;
9682 }
9683
9684 + eth_data = eth_hdr(skb);
9685 hash_start = (char *)eth_data->h_dest;
9686 hash_size = ETH_ALEN;
9687 break;
9688 + }
9689 case ETH_P_ARP:
9690 do_tx_balance = false;
9691 if (bond_info->rlb_enabled)
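
The bond_alb_xmit() hunks above only dereference the IPv4, IPv6 and IPX headers after pskb_network_may_pull() has confirmed they are present in the linear part of the skb. A minimal sketch of that precaution for the IPv4 case; the helper name is invented and this is not the bonding code itself:

#include <linux/ip.h>
#include <linux/skbuff.h>

static bool example_get_daddr(struct sk_buff *skb, __be32 *daddr)
{
        const struct iphdr *iph;

        if (!pskb_network_may_pull(skb, sizeof(*iph)))
                return false;   /* header not available, don't touch it */

        iph = ip_hdr(skb);
        *daddr = iph->daddr;
        return true;
}
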
9692 diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c
9693 index e3b41af65d18..6519dd33c7ca 100644
9694 --- a/drivers/net/ethernet/broadcom/bcmsysport.c
9695 +++ b/drivers/net/ethernet/broadcom/bcmsysport.c
9696 @@ -1983,6 +1983,9 @@ static int bcm_sysport_resume(struct device *d)
9697
9698 umac_reset(priv);
9699
9700 + /* Disable the UniMAC RX/TX */
9701 + umac_enable_set(priv, CMD_RX_EN | CMD_TX_EN, 0);
9702 +
9703 /* We may have been suspended and never received a WOL event that
9704 * would turn off MPD detection, take care of that now
9705 */
9706 diff --git a/drivers/net/ethernet/dec/tulip/dmfe.c b/drivers/net/ethernet/dec/tulip/dmfe.c
9707 index 8ed0fd8b1dda..74cb9b3c2f41 100644
9708 --- a/drivers/net/ethernet/dec/tulip/dmfe.c
9709 +++ b/drivers/net/ethernet/dec/tulip/dmfe.c
9710 @@ -2225,15 +2225,16 @@ static int __init dmfe_init_module(void)
9711 if (cr6set)
9712 dmfe_cr6_user_set = cr6set;
9713
9714 - switch(mode) {
9715 - case DMFE_10MHF:
9716 + switch (mode) {
9717 + case DMFE_10MHF:
9718 case DMFE_100MHF:
9719 case DMFE_10MFD:
9720 case DMFE_100MFD:
9721 case DMFE_1M_HPNA:
9722 dmfe_media_mode = mode;
9723 break;
9724 - default:dmfe_media_mode = DMFE_AUTO;
9725 + default:
9726 + dmfe_media_mode = DMFE_AUTO;
9727 break;
9728 }
9729
9730 diff --git a/drivers/net/ethernet/dec/tulip/uli526x.c b/drivers/net/ethernet/dec/tulip/uli526x.c
9731 index e750b5ddc0fb..5f79e2731b76 100644
9732 --- a/drivers/net/ethernet/dec/tulip/uli526x.c
9733 +++ b/drivers/net/ethernet/dec/tulip/uli526x.c
9734 @@ -1813,8 +1813,8 @@ static int __init uli526x_init_module(void)
9735 if (cr6set)
9736 uli526x_cr6_user_set = cr6set;
9737
9738 - switch (mode) {
9739 - case ULI526X_10MHF:
9740 + switch (mode) {
9741 + case ULI526X_10MHF:
9742 case ULI526X_100MHF:
9743 case ULI526X_10MFD:
9744 case ULI526X_100MFD:
9745 diff --git a/drivers/net/ethernet/smsc/smc911x.c b/drivers/net/ethernet/smsc/smc911x.c
9746 index 323b3ac16bc0..d0cf971aa4eb 100644
9747 --- a/drivers/net/ethernet/smsc/smc911x.c
9748 +++ b/drivers/net/ethernet/smsc/smc911x.c
9749 @@ -948,7 +948,7 @@ static void smc911x_phy_configure(struct work_struct *work)
9750 if (lp->ctl_rspeed != 100)
9751 my_ad_caps &= ~(ADVERTISE_100BASE4|ADVERTISE_100FULL|ADVERTISE_100HALF);
9752
9753 - if (!lp->ctl_rfduplx)
9754 + if (!lp->ctl_rfduplx)
9755 my_ad_caps &= ~(ADVERTISE_100FULL|ADVERTISE_10FULL);
9756
9757 /* Update our Auto-Neg Advertisement Register */
9758 diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c
9759 index 5077c69eb652..a9e8a7356c41 100644
9760 --- a/drivers/net/gtp.c
9761 +++ b/drivers/net/gtp.c
9762 @@ -784,11 +784,13 @@ static int gtp_hashtable_new(struct gtp_dev *gtp, int hsize)
9763 {
9764 int i;
9765
9766 - gtp->addr_hash = kmalloc(sizeof(struct hlist_head) * hsize, GFP_KERNEL);
9767 + gtp->addr_hash = kmalloc(sizeof(struct hlist_head) * hsize,
9768 + GFP_KERNEL | __GFP_NOWARN);
9769 if (gtp->addr_hash == NULL)
9770 return -ENOMEM;
9771
9772 - gtp->tid_hash = kmalloc(sizeof(struct hlist_head) * hsize, GFP_KERNEL);
9773 + gtp->tid_hash = kmalloc(sizeof(struct hlist_head) * hsize,
9774 + GFP_KERNEL | __GFP_NOWARN);
9775 if (gtp->tid_hash == NULL)
9776 goto err1;
9777
9778 diff --git a/drivers/net/ppp/ppp_async.c b/drivers/net/ppp/ppp_async.c
9779 index 9c889e0303dd..cef40de1bd05 100644
9780 --- a/drivers/net/ppp/ppp_async.c
9781 +++ b/drivers/net/ppp/ppp_async.c
9782 @@ -878,15 +878,15 @@ ppp_async_input(struct asyncppp *ap, const unsigned char *buf,
9783 skb = dev_alloc_skb(ap->mru + PPP_HDRLEN + 2);
9784 if (!skb)
9785 goto nomem;
9786 - ap->rpkt = skb;
9787 - }
9788 - if (skb->len == 0) {
9789 - /* Try to get the payload 4-byte aligned.
9790 - * This should match the
9791 - * PPP_ALLSTATIONS/PPP_UI/compressed tests in
9792 - * process_input_packet, but we do not have
9793 - * enough chars here to test buf[1] and buf[2].
9794 - */
9795 + ap->rpkt = skb;
9796 + }
9797 + if (skb->len == 0) {
9798 + /* Try to get the payload 4-byte aligned.
9799 + * This should match the
9800 + * PPP_ALLSTATIONS/PPP_UI/compressed tests in
9801 + * process_input_packet, but we do not have
9802 + * enough chars here to test buf[1] and buf[2].
9803 + */
9804 if (buf[0] != PPP_ALLSTATIONS)
9805 skb_reserve(skb, 2 + (buf[0] & 1));
9806 }
9807 diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c
9808 index 05df9d8f76e9..31727f34381f 100644
9809 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c
9810 +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c
9811 @@ -438,6 +438,7 @@ fail:
9812 usb_free_urb(req->urb);
9813 list_del(q->next);
9814 }
9815 + kfree(reqs);
9816 return NULL;
9817
9818 }
9819 diff --git a/drivers/net/wireless/marvell/libertas/cfg.c b/drivers/net/wireless/marvell/libertas/cfg.c
9820 index 3eab802c7d3f..ece6d72cf90c 100644
9821 --- a/drivers/net/wireless/marvell/libertas/cfg.c
9822 +++ b/drivers/net/wireless/marvell/libertas/cfg.c
9823 @@ -1859,6 +1859,8 @@ static int lbs_ibss_join_existing(struct lbs_private *priv,
9824 rates_max = rates_eid[1];
9825 if (rates_max > MAX_RATES) {
9826 lbs_deb_join("invalid rates");
9827 + rcu_read_unlock();
9828 + ret = -EINVAL;
9829 goto out;
9830 }
9831 rates = cmd.bss.rates;
9832 diff --git a/drivers/net/wireless/marvell/mwifiex/scan.c b/drivers/net/wireless/marvell/mwifiex/scan.c
9833 index 828c6f5eb83c..5fde2e2f1fea 100644
9834 --- a/drivers/net/wireless/marvell/mwifiex/scan.c
9835 +++ b/drivers/net/wireless/marvell/mwifiex/scan.c
9836 @@ -2878,6 +2878,13 @@ mwifiex_cmd_append_vsie_tlv(struct mwifiex_private *priv,
9837 vs_param_set->header.len =
9838 cpu_to_le16((((u16) priv->vs_ie[id].ie[1])
9839 & 0x00FF) + 2);
9840 + if (le16_to_cpu(vs_param_set->header.len) >
9841 + MWIFIEX_MAX_VSIE_LEN) {
9842 + mwifiex_dbg(priv->adapter, ERROR,
9843 + "Invalid param length!\n");
9844 + break;
9845 + }
9846 +
9847 memcpy(vs_param_set->ie, priv->vs_ie[id].ie,
9848 le16_to_cpu(vs_param_set->header.len));
9849 *buffer += le16_to_cpu(vs_param_set->header.len) +
9850 diff --git a/drivers/net/wireless/marvell/mwifiex/sta_ioctl.c b/drivers/net/wireless/marvell/mwifiex/sta_ioctl.c
9851 index be3be7a63cf0..f2d10ba19920 100644
9852 --- a/drivers/net/wireless/marvell/mwifiex/sta_ioctl.c
9853 +++ b/drivers/net/wireless/marvell/mwifiex/sta_ioctl.c
9854 @@ -274,6 +274,7 @@ static int mwifiex_process_country_ie(struct mwifiex_private *priv,
9855
9856 if (country_ie_len >
9857 (IEEE80211_COUNTRY_STRING_LEN + MWIFIEX_MAX_TRIPLET_802_11D)) {
9858 + rcu_read_unlock();
9859 mwifiex_dbg(priv->adapter, ERROR,
9860 "11D: country_ie_len overflow!, deauth AP\n");
9861 return -EINVAL;
9862 diff --git a/drivers/net/wireless/marvell/mwifiex/wmm.c b/drivers/net/wireless/marvell/mwifiex/wmm.c
9863 index 9843560e784f..c93fcafbcc7a 100644
9864 --- a/drivers/net/wireless/marvell/mwifiex/wmm.c
9865 +++ b/drivers/net/wireless/marvell/mwifiex/wmm.c
9866 @@ -980,6 +980,10 @@ int mwifiex_ret_wmm_get_status(struct mwifiex_private *priv,
9867 "WMM Parameter Set Count: %d\n",
9868 wmm_param_ie->qos_info_bitmap & mask);
9869
9870 + if (wmm_param_ie->vend_hdr.len + 2 >
9871 + sizeof(struct ieee_types_wmm_parameter))
9872 + break;
9873 +
9874 memcpy((u8 *) &priv->curr_bss_params.bss_descriptor.
9875 wmm_ie, wmm_param_ie,
9876 wmm_param_ie->vend_hdr.len + 2);
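
The mwifiex hunks above validate attacker-controlled IE lengths against the destination buffer before the memcpy. A small generic sketch of that check; the structure and function are invented and only mirror the pattern:

#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>

struct example_ie {
        u8 id;
        u8 len;
        u8 data[];
};

static int example_copy_ie(void *dst, size_t dst_size,
                           const struct example_ie *ie)
{
        size_t total = (size_t)ie->len + 2;     /* 2-byte header + payload */

        if (total > dst_size)
                return -EINVAL;         /* reject oversized IEs up front */

        memcpy(dst, ie, total);
        return 0;
}
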
9877 diff --git a/drivers/nfc/pn544/pn544.c b/drivers/nfc/pn544/pn544.c
9878 index 12e819ddf17a..3afc53ff7369 100644
9879 --- a/drivers/nfc/pn544/pn544.c
9880 +++ b/drivers/nfc/pn544/pn544.c
9881 @@ -704,7 +704,7 @@ static int pn544_hci_check_presence(struct nfc_hci_dev *hdev,
9882 target->nfcid1_len != 10)
9883 return -EOPNOTSUPP;
9884
9885 - return nfc_hci_send_cmd(hdev, NFC_HCI_RF_READER_A_GATE,
9886 + return nfc_hci_send_cmd(hdev, NFC_HCI_RF_READER_A_GATE,
9887 PN544_RF_READER_CMD_ACTIVATE_NEXT,
9888 target->nfcid1, target->nfcid1_len, NULL);
9889 } else if (target->supported_protocols & (NFC_PROTO_JEWEL_MASK |
9890 diff --git a/drivers/of/Kconfig b/drivers/of/Kconfig
9891 index ba7b034b2b91..6b8646db110c 100644
9892 --- a/drivers/of/Kconfig
9893 +++ b/drivers/of/Kconfig
9894 @@ -112,4 +112,8 @@ config OF_OVERLAY
9895 config OF_NUMA
9896 bool
9897
9898 +config OF_DMA_DEFAULT_COHERENT
9899 + # arches should select this if DMA is coherent by default for OF devices
9900 + bool
9901 +
9902 endif # OF
9903 diff --git a/drivers/of/address.c b/drivers/of/address.c
9904 index 72914cdfce2a..37619bb2c97a 100644
9905 --- a/drivers/of/address.c
9906 +++ b/drivers/of/address.c
9907 @@ -896,12 +896,16 @@ EXPORT_SYMBOL_GPL(of_dma_get_range);
9908 * @np: device node
9909 *
9910 * It returns true if "dma-coherent" property was found
9911 - * for this device in DT.
9912 + * for this device in the DT, or if DMA is coherent by
9913 + * default for OF devices on the current platform.
9914 */
9915 bool of_dma_is_coherent(struct device_node *np)
9916 {
9917 struct device_node *node = of_node_get(np);
9918
9919 + if (IS_ENABLED(CONFIG_OF_DMA_DEFAULT_COHERENT))
9920 + return true;
9921 +
9922 while (node) {
9923 if (of_property_read_bool(node, "dma-coherent")) {
9924 of_node_put(node);
9925 diff --git a/drivers/pci/host/pci-keystone-dw.c b/drivers/pci/host/pci-keystone-dw.c
9926 index 9397c4667106..f011a8780ff5 100644
9927 --- a/drivers/pci/host/pci-keystone-dw.c
9928 +++ b/drivers/pci/host/pci-keystone-dw.c
9929 @@ -502,7 +502,7 @@ void ks_dw_pcie_initiate_link_train(struct keystone_pcie *ks_pcie)
9930 /* Disable Link training */
9931 val = ks_dw_app_readl(ks_pcie, CMD_STATUS);
9932 val &= ~LTSSM_EN_VAL;
9933 - ks_dw_app_writel(ks_pcie, CMD_STATUS, LTSSM_EN_VAL | val);
9934 + ks_dw_app_writel(ks_pcie, CMD_STATUS, val);
9935
9936 /* Initiate Link Training */
9937 val = ks_dw_app_readl(ks_pcie, CMD_STATUS);
9938 diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
9939 index f30ca75b5b6c..e631636a0aa5 100644
9940 --- a/drivers/pci/setup-bus.c
9941 +++ b/drivers/pci/setup-bus.c
9942 @@ -1833,12 +1833,18 @@ again:
9943 /* restore size and flags */
9944 list_for_each_entry(fail_res, &fail_head, list) {
9945 struct resource *res = fail_res->res;
9946 + int idx;
9947
9948 res->start = fail_res->start;
9949 res->end = fail_res->end;
9950 res->flags = fail_res->flags;
9951 - if (fail_res->dev->subordinate)
9952 - res->flags = 0;
9953 +
9954 + if (pci_is_bridge(fail_res->dev)) {
9955 + idx = res - &fail_res->dev->resource[0];
9956 + if (idx >= PCI_BRIDGE_RESOURCES &&
9957 + idx <= PCI_BRIDGE_RESOURCE_END)
9958 + res->flags = 0;
9959 + }
9960 }
9961 free_list(&fail_head);
9962
9963 @@ -1904,12 +1910,18 @@ again:
9964 /* restore size and flags */
9965 list_for_each_entry(fail_res, &fail_head, list) {
9966 struct resource *res = fail_res->res;
9967 + int idx;
9968
9969 res->start = fail_res->start;
9970 res->end = fail_res->end;
9971 res->flags = fail_res->flags;
9972 - if (fail_res->dev->subordinate)
9973 - res->flags = 0;
9974 +
9975 + if (pci_is_bridge(fail_res->dev)) {
9976 + idx = res - &fail_res->dev->resource[0];
9977 + if (idx >= PCI_BRIDGE_RESOURCES &&
9978 + idx <= PCI_BRIDGE_RESOURCE_END)
9979 + res->flags = 0;
9980 + }
9981 }
9982 free_list(&fail_head);
9983
9984 diff --git a/drivers/pinctrl/sh-pfc/pfc-r8a7778.c b/drivers/pinctrl/sh-pfc/pfc-r8a7778.c
9985 index 18ef7042b3d1..771689a41dbf 100644
9986 --- a/drivers/pinctrl/sh-pfc/pfc-r8a7778.c
9987 +++ b/drivers/pinctrl/sh-pfc/pfc-r8a7778.c
9988 @@ -2324,7 +2324,7 @@ static const struct pinmux_cfg_reg pinmux_config_regs[] = {
9989 FN_ATAG0_A, 0, FN_REMOCON_B, 0,
9990 /* IP0_11_8 [4] */
9991 FN_SD1_DAT2_A, FN_MMC_D2, 0, FN_BS,
9992 - FN_ATADIR0_A, 0, FN_SDSELF_B, 0,
9993 + FN_ATADIR0_A, 0, FN_SDSELF_A, 0,
9994 FN_PWM4_B, 0, 0, 0,
9995 0, 0, 0, 0,
9996 /* IP0_7_5 [3] */
9997 @@ -2366,7 +2366,7 @@ static const struct pinmux_cfg_reg pinmux_config_regs[] = {
9998 FN_TS_SDAT0_A, 0, 0, 0,
9999 0, 0, 0, 0,
10000 /* IP1_10_8 [3] */
10001 - FN_SD1_CLK_B, FN_MMC_D6, 0, FN_A24,
10002 + FN_SD1_CD_A, FN_MMC_D6, 0, FN_A24,
10003 FN_DREQ1_A, 0, FN_HRX0_B, FN_TS_SPSYNC0_A,
10004 /* IP1_7_5 [3] */
10005 FN_A23, FN_HTX0_B, FN_TX2_B, FN_DACK2_A,
10006 diff --git a/drivers/power/supply/ltc2941-battery-gauge.c b/drivers/power/supply/ltc2941-battery-gauge.c
10007 index 4adf2ba021ce..043de9d039d5 100644
10008 --- a/drivers/power/supply/ltc2941-battery-gauge.c
10009 +++ b/drivers/power/supply/ltc2941-battery-gauge.c
10010 @@ -364,7 +364,7 @@ static int ltc294x_i2c_remove(struct i2c_client *client)
10011 {
10012 struct ltc294x_info *info = i2c_get_clientdata(client);
10013
10014 - cancel_delayed_work(&info->work);
10015 + cancel_delayed_work_sync(&info->work);
10016 power_supply_unregister(info->supply);
10017 return 0;
10018 }
10019 diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c
10020 index c554e529fc4e..b962dbe51750 100644
10021 --- a/drivers/rtc/rtc-cmos.c
10022 +++ b/drivers/rtc/rtc-cmos.c
10023 @@ -730,7 +730,7 @@ cmos_do_probe(struct device *dev, struct resource *ports, int rtc_irq)
10024 rtc_cmos_int_handler = cmos_interrupt;
10025
10026 retval = request_irq(rtc_irq, rtc_cmos_int_handler,
10027 - IRQF_SHARED, dev_name(&cmos_rtc.rtc->dev),
10028 + 0, dev_name(&cmos_rtc.rtc->dev),
10029 cmos_rtc.rtc);
10030 if (retval < 0) {
10031 dev_dbg(dev, "IRQ %d is already in use\n", rtc_irq);
10032 diff --git a/drivers/rtc/rtc-hym8563.c b/drivers/rtc/rtc-hym8563.c
10033 index e5ad527cb75e..a8c2d38b2411 100644
10034 --- a/drivers/rtc/rtc-hym8563.c
10035 +++ b/drivers/rtc/rtc-hym8563.c
10036 @@ -105,7 +105,7 @@ static int hym8563_rtc_read_time(struct device *dev, struct rtc_time *tm)
10037
10038 if (!hym8563->valid) {
10039 dev_warn(&client->dev, "no valid clock/calendar values available\n");
10040 - return -EPERM;
10041 + return -EINVAL;
10042 }
10043
10044 ret = i2c_smbus_read_i2c_block_data(client, HYM8563_SEC, 7, buf);
10045 diff --git a/drivers/scsi/csiostor/csio_scsi.c b/drivers/scsi/csiostor/csio_scsi.c
10046 index 894d97e4ace5..5db57671fa28 100644
10047 --- a/drivers/scsi/csiostor/csio_scsi.c
10048 +++ b/drivers/scsi/csiostor/csio_scsi.c
10049 @@ -1383,7 +1383,7 @@ csio_device_reset(struct device *dev,
10050 return -EINVAL;
10051
10052 /* Delete NPIV lnodes */
10053 - csio_lnodes_exit(hw, 1);
10054 + csio_lnodes_exit(hw, 1);
10055
10056 /* Block upper IOs */
10057 csio_lnodes_block_request(hw);
10058 diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c
10059 index 10ae624dd266..9fa6a560b162 100644
10060 --- a/drivers/scsi/megaraid/megaraid_sas_base.c
10061 +++ b/drivers/scsi/megaraid/megaraid_sas_base.c
10062 @@ -3978,7 +3978,8 @@ dcmd_timeout_ocr_possible(struct megasas_instance *instance) {
10063 if (!instance->ctrl_context)
10064 return KILL_ADAPTER;
10065 else if (instance->unload ||
10066 - test_bit(MEGASAS_FUSION_IN_RESET, &instance->reset_flags))
10067 + test_bit(MEGASAS_FUSION_OCR_NOT_POSSIBLE,
10068 + &instance->reset_flags))
10069 return IGNORE_TIMEOUT;
10070 else
10071 return INITIATE_OCR;
10072 diff --git a/drivers/scsi/megaraid/megaraid_sas_fusion.c b/drivers/scsi/megaraid/megaraid_sas_fusion.c
10073 index fe1a20973e47..874e5a7f7998 100644
10074 --- a/drivers/scsi/megaraid/megaraid_sas_fusion.c
10075 +++ b/drivers/scsi/megaraid/megaraid_sas_fusion.c
10076 @@ -3438,6 +3438,7 @@ int megasas_reset_fusion(struct Scsi_Host *shost, int reason)
10077 if (instance->requestorId && !instance->skip_heartbeat_timer_del)
10078 del_timer_sync(&instance->sriov_heartbeat_timer);
10079 set_bit(MEGASAS_FUSION_IN_RESET, &instance->reset_flags);
10080 + set_bit(MEGASAS_FUSION_OCR_NOT_POSSIBLE, &instance->reset_flags);
10081 atomic_set(&instance->adprecovery, MEGASAS_ADPRESET_SM_POLLING);
10082 instance->instancet->disable_intr(instance);
10083 msleep(1000);
10084 @@ -3594,7 +3595,7 @@ fail_kill_adapter:
10085 atomic_set(&instance->adprecovery, MEGASAS_HBA_OPERATIONAL);
10086 }
10087 out:
10088 - clear_bit(MEGASAS_FUSION_IN_RESET, &instance->reset_flags);
10089 + clear_bit(MEGASAS_FUSION_OCR_NOT_POSSIBLE, &instance->reset_flags);
10090 mutex_unlock(&instance->reset_mutex);
10091 return retval;
10092 }
10093 diff --git a/drivers/scsi/megaraid/megaraid_sas_fusion.h b/drivers/scsi/megaraid/megaraid_sas_fusion.h
10094 index e3bee04c1eb1..034653d93365 100644
10095 --- a/drivers/scsi/megaraid/megaraid_sas_fusion.h
10096 +++ b/drivers/scsi/megaraid/megaraid_sas_fusion.h
10097 @@ -93,6 +93,7 @@ enum MR_RAID_FLAGS_IO_SUB_TYPE {
10098
10099 #define MEGASAS_FP_CMD_LEN 16
10100 #define MEGASAS_FUSION_IN_RESET 0
10101 +#define MEGASAS_FUSION_OCR_NOT_POSSIBLE 1
10102 #define THRESHOLD_REPLY_COUNT 50
10103 #define JBOD_MAPS_COUNT 2
10104
10105 diff --git a/drivers/scsi/qla2xxx/qla_mbx.c b/drivers/scsi/qla2xxx/qla_mbx.c
10106 index bf29ad454118..9bbea4223917 100644
10107 --- a/drivers/scsi/qla2xxx/qla_mbx.c
10108 +++ b/drivers/scsi/qla2xxx/qla_mbx.c
10109 @@ -5723,9 +5723,8 @@ qla2x00_dump_mctp_data(scsi_qla_host_t *vha, dma_addr_t req_dma, uint32_t addr,
10110 mcp->mb[7] = LSW(MSD(req_dma));
10111 mcp->mb[8] = MSW(addr);
10112 /* Setting RAM ID to valid */
10113 - mcp->mb[10] |= BIT_7;
10114 /* For MCTP RAM ID is 0x40 */
10115 - mcp->mb[10] |= 0x40;
10116 + mcp->mb[10] = BIT_7 | 0x40;
10117
10118 mcp->out_mb |= MBX_10|MBX_8|MBX_7|MBX_6|MBX_5|MBX_4|MBX_3|MBX_2|MBX_1|
10119 MBX_0;
10120 diff --git a/drivers/scsi/qla2xxx/qla_nx.c b/drivers/scsi/qla2xxx/qla_nx.c
10121 index 54380b434b30..104e13ae3428 100644
10122 --- a/drivers/scsi/qla2xxx/qla_nx.c
10123 +++ b/drivers/scsi/qla2xxx/qla_nx.c
10124 @@ -1600,8 +1600,7 @@ qla82xx_get_bootld_offset(struct qla_hw_data *ha)
10125 return (u8 *)&ha->hablob->fw->data[offset];
10126 }
10127
10128 -static __le32
10129 -qla82xx_get_fw_size(struct qla_hw_data *ha)
10130 +static u32 qla82xx_get_fw_size(struct qla_hw_data *ha)
10131 {
10132 struct qla82xx_uri_data_desc *uri_desc = NULL;
10133
10134 @@ -1612,7 +1611,7 @@ qla82xx_get_fw_size(struct qla_hw_data *ha)
10135 return cpu_to_le32(uri_desc->size);
10136 }
10137
10138 - return cpu_to_le32(*(u32 *)&ha->hablob->fw->data[FW_SIZE_OFFSET]);
10139 + return get_unaligned_le32(&ha->hablob->fw->data[FW_SIZE_OFFSET]);
10140 }
10141
10142 static u8 *
10143 @@ -1803,7 +1802,7 @@ qla82xx_fw_load_from_blob(struct qla_hw_data *ha)
10144 }
10145
10146 flashaddr = FLASH_ADDR_START;
10147 - size = (__force u32)qla82xx_get_fw_size(ha) / 8;
10148 + size = qla82xx_get_fw_size(ha) / 8;
10149 ptr64 = (u64 *)qla82xx_get_fw_offs(ha);
10150
10151 for (i = 0; i < size; i++) {
10152 diff --git a/drivers/scsi/qla4xxx/ql4_os.c b/drivers/scsi/qla4xxx/ql4_os.c
10153 index f714d5f917d1..3fda5836aac6 100644
10154 --- a/drivers/scsi/qla4xxx/ql4_os.c
10155 +++ b/drivers/scsi/qla4xxx/ql4_os.c
10156 @@ -4150,7 +4150,7 @@ static void qla4xxx_mem_free(struct scsi_qla_host *ha)
10157 dma_free_coherent(&ha->pdev->dev, ha->queues_len, ha->queues,
10158 ha->queues_dma);
10159
10160 - if (ha->fw_dump)
10161 + if (ha->fw_dump)
10162 vfree(ha->fw_dump);
10163
10164 ha->queues_len = 0;
10165 diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
10166 index 094e879af121..394df57894e6 100644
10167 --- a/drivers/scsi/ufs/ufshcd.c
10168 +++ b/drivers/scsi/ufs/ufshcd.c
10169 @@ -5347,7 +5347,8 @@ static int ufshcd_probe_hba(struct ufs_hba *hba)
10170 ufshcd_init_icc_levels(hba);
10171
10172 /* Add required well known logical units to scsi mid layer */
10173 - if (ufshcd_scsi_add_wlus(hba))
10174 + ret = ufshcd_scsi_add_wlus(hba);
10175 + if (ret)
10176 goto out;
10177
10178 scsi_scan_host(hba->host);
10179 diff --git a/drivers/usb/gadget/function/f_ecm.c b/drivers/usb/gadget/function/f_ecm.c
10180 index dc99ed94f03d..8e3e44382785 100644
10181 --- a/drivers/usb/gadget/function/f_ecm.c
10182 +++ b/drivers/usb/gadget/function/f_ecm.c
10183 @@ -56,6 +56,7 @@ struct f_ecm {
10184 struct usb_ep *notify;
10185 struct usb_request *notify_req;
10186 u8 notify_state;
10187 + atomic_t notify_count;
10188 bool is_open;
10189
10190 /* FIXME is_open needs some irq-ish locking
10191 @@ -384,7 +385,7 @@ static void ecm_do_notify(struct f_ecm *ecm)
10192 int status;
10193
10194 /* notification already in flight? */
10195 - if (!req)
10196 + if (atomic_read(&ecm->notify_count))
10197 return;
10198
10199 event = req->buf;
10200 @@ -424,10 +425,10 @@ static void ecm_do_notify(struct f_ecm *ecm)
10201 event->bmRequestType = 0xA1;
10202 event->wIndex = cpu_to_le16(ecm->ctrl_id);
10203
10204 - ecm->notify_req = NULL;
10205 + atomic_inc(&ecm->notify_count);
10206 status = usb_ep_queue(ecm->notify, req, GFP_ATOMIC);
10207 if (status < 0) {
10208 - ecm->notify_req = req;
10209 + atomic_dec(&ecm->notify_count);
10210 DBG(cdev, "notify --> %d\n", status);
10211 }
10212 }
10213 @@ -452,17 +453,19 @@ static void ecm_notify_complete(struct usb_ep *ep, struct usb_request *req)
10214 switch (req->status) {
10215 case 0:
10216 /* no fault */
10217 + atomic_dec(&ecm->notify_count);
10218 break;
10219 case -ECONNRESET:
10220 case -ESHUTDOWN:
10221 + atomic_set(&ecm->notify_count, 0);
10222 ecm->notify_state = ECM_NOTIFY_NONE;
10223 break;
10224 default:
10225 DBG(cdev, "event %02x --> %d\n",
10226 event->bNotificationType, req->status);
10227 + atomic_dec(&ecm->notify_count);
10228 break;
10229 }
10230 - ecm->notify_req = req;
10231 ecm_do_notify(ecm);
10232 }
10233
10234 @@ -909,6 +912,11 @@ static void ecm_unbind(struct usb_configuration *c, struct usb_function *f)
10235
10236 usb_free_all_descriptors(f);
10237
10238 + if (atomic_read(&ecm->notify_count)) {
10239 + usb_ep_dequeue(ecm->notify, ecm->notify_req);
10240 + atomic_set(&ecm->notify_count, 0);
10241 + }
10242 +
10243 kfree(ecm->notify_req->buf);
10244 usb_ep_free_request(ecm->notify, ecm->notify_req);
10245 }
10246 diff --git a/drivers/usb/gadget/function/f_ncm.c b/drivers/usb/gadget/function/f_ncm.c
10247 index 639603722709..6399923239e7 100644
10248 --- a/drivers/usb/gadget/function/f_ncm.c
10249 +++ b/drivers/usb/gadget/function/f_ncm.c
10250 @@ -57,6 +57,7 @@ struct f_ncm {
10251 struct usb_ep *notify;
10252 struct usb_request *notify_req;
10253 u8 notify_state;
10254 + atomic_t notify_count;
10255 bool is_open;
10256
10257 const struct ndp_parser_opts *parser_opts;
10258 @@ -552,7 +553,7 @@ static void ncm_do_notify(struct f_ncm *ncm)
10259 int status;
10260
10261 /* notification already in flight? */
10262 - if (!req)
10263 + if (atomic_read(&ncm->notify_count))
10264 return;
10265
10266 event = req->buf;
10267 @@ -592,7 +593,8 @@ static void ncm_do_notify(struct f_ncm *ncm)
10268 event->bmRequestType = 0xA1;
10269 event->wIndex = cpu_to_le16(ncm->ctrl_id);
10270
10271 - ncm->notify_req = NULL;
10272 + atomic_inc(&ncm->notify_count);
10273 +
10274 /*
10275 * In double buffering if there is a space in FIFO,
10276 * completion callback can be called right after the call,
10277 @@ -602,7 +604,7 @@ static void ncm_do_notify(struct f_ncm *ncm)
10278 status = usb_ep_queue(ncm->notify, req, GFP_ATOMIC);
10279 spin_lock(&ncm->lock);
10280 if (status < 0) {
10281 - ncm->notify_req = req;
10282 + atomic_dec(&ncm->notify_count);
10283 DBG(cdev, "notify --> %d\n", status);
10284 }
10285 }
10286 @@ -637,17 +639,19 @@ static void ncm_notify_complete(struct usb_ep *ep, struct usb_request *req)
10287 case 0:
10288 VDBG(cdev, "Notification %02x sent\n",
10289 event->bNotificationType);
10290 + atomic_dec(&ncm->notify_count);
10291 break;
10292 case -ECONNRESET:
10293 case -ESHUTDOWN:
10294 + atomic_set(&ncm->notify_count, 0);
10295 ncm->notify_state = NCM_NOTIFY_NONE;
10296 break;
10297 default:
10298 DBG(cdev, "event %02x --> %d\n",
10299 event->bNotificationType, req->status);
10300 + atomic_dec(&ncm->notify_count);
10301 break;
10302 }
10303 - ncm->notify_req = req;
10304 ncm_do_notify(ncm);
10305 spin_unlock(&ncm->lock);
10306 }
10307 @@ -1639,6 +1643,11 @@ static void ncm_unbind(struct usb_configuration *c, struct usb_function *f)
10308 ncm_string_defs[0].id = 0;
10309 usb_free_all_descriptors(f);
10310
10311 + if (atomic_read(&ncm->notify_count)) {
10312 + usb_ep_dequeue(ncm->notify, ncm->notify_req);
10313 + atomic_set(&ncm->notify_count, 0);
10314 + }
10315 +
10316 kfree(ncm->notify_req->buf);
10317 usb_ep_free_request(ncm->notify, ncm->notify_req);
10318 }
10319 diff --git a/drivers/usb/gadget/legacy/cdc2.c b/drivers/usb/gadget/legacy/cdc2.c
10320 index 51c08682de84..5ee25beb52f0 100644
10321 --- a/drivers/usb/gadget/legacy/cdc2.c
10322 +++ b/drivers/usb/gadget/legacy/cdc2.c
10323 @@ -229,7 +229,7 @@ static struct usb_composite_driver cdc_driver = {
10324 .name = "g_cdc",
10325 .dev = &device_desc,
10326 .strings = dev_strings,
10327 - .max_speed = USB_SPEED_HIGH,
10328 + .max_speed = USB_SPEED_SUPER,
10329 .bind = cdc_bind,
10330 .unbind = cdc_unbind,
10331 };
10332 diff --git a/drivers/usb/gadget/legacy/g_ffs.c b/drivers/usb/gadget/legacy/g_ffs.c
10333 index 6da7316f8e87..54ee4e31645b 100644
10334 --- a/drivers/usb/gadget/legacy/g_ffs.c
10335 +++ b/drivers/usb/gadget/legacy/g_ffs.c
10336 @@ -153,7 +153,7 @@ static struct usb_composite_driver gfs_driver = {
10337 .name = DRIVER_NAME,
10338 .dev = &gfs_dev_desc,
10339 .strings = gfs_dev_strings,
10340 - .max_speed = USB_SPEED_HIGH,
10341 + .max_speed = USB_SPEED_SUPER,
10342 .bind = gfs_bind,
10343 .unbind = gfs_unbind,
10344 };
10345 diff --git a/drivers/usb/gadget/legacy/multi.c b/drivers/usb/gadget/legacy/multi.c
10346 index a70a406580ea..3b7fc5c7e9c3 100644
10347 --- a/drivers/usb/gadget/legacy/multi.c
10348 +++ b/drivers/usb/gadget/legacy/multi.c
10349 @@ -486,7 +486,7 @@ static struct usb_composite_driver multi_driver = {
10350 .name = "g_multi",
10351 .dev = &device_desc,
10352 .strings = dev_strings,
10353 - .max_speed = USB_SPEED_HIGH,
10354 + .max_speed = USB_SPEED_SUPER,
10355 .bind = multi_bind,
10356 .unbind = multi_unbind,
10357 .needs_serial = 1,
10358 diff --git a/drivers/usb/gadget/legacy/ncm.c b/drivers/usb/gadget/legacy/ncm.c
10359 index 0aba68253e3d..2fb4a847dd52 100644
10360 --- a/drivers/usb/gadget/legacy/ncm.c
10361 +++ b/drivers/usb/gadget/legacy/ncm.c
10362 @@ -203,7 +203,7 @@ static struct usb_composite_driver ncm_driver = {
10363 .name = "g_ncm",
10364 .dev = &device_desc,
10365 .strings = dev_strings,
10366 - .max_speed = USB_SPEED_HIGH,
10367 + .max_speed = USB_SPEED_SUPER,
10368 .bind = gncm_bind,
10369 .unbind = gncm_unbind,
10370 };
10371 diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
10372 index 305deb6e59c3..b5ebb43b1824 100644
10373 --- a/fs/btrfs/ctree.c
10374 +++ b/fs/btrfs/ctree.c
10375 @@ -331,26 +331,6 @@ struct tree_mod_elem {
10376 struct tree_mod_root old_root;
10377 };
10378
10379 -static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info)
10380 -{
10381 - read_lock(&fs_info->tree_mod_log_lock);
10382 -}
10383 -
10384 -static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info)
10385 -{
10386 - read_unlock(&fs_info->tree_mod_log_lock);
10387 -}
10388 -
10389 -static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info)
10390 -{
10391 - write_lock(&fs_info->tree_mod_log_lock);
10392 -}
10393 -
10394 -static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
10395 -{
10396 - write_unlock(&fs_info->tree_mod_log_lock);
10397 -}
10398 -
10399 /*
10400 * Pull a new tree mod seq number for our operation.
10401 */
10402 @@ -370,14 +350,12 @@ static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
10403 u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
10404 struct seq_list *elem)
10405 {
10406 - tree_mod_log_write_lock(fs_info);
10407 - spin_lock(&fs_info->tree_mod_seq_lock);
10408 + write_lock(&fs_info->tree_mod_log_lock);
10409 if (!elem->seq) {
10410 elem->seq = btrfs_inc_tree_mod_seq(fs_info);
10411 list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
10412 }
10413 - spin_unlock(&fs_info->tree_mod_seq_lock);
10414 - tree_mod_log_write_unlock(fs_info);
10415 + write_unlock(&fs_info->tree_mod_log_lock);
10416
10417 return elem->seq;
10418 }
10419 @@ -396,7 +374,7 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
10420 if (!seq_putting)
10421 return;
10422
10423 - spin_lock(&fs_info->tree_mod_seq_lock);
10424 + write_lock(&fs_info->tree_mod_log_lock);
10425 list_del(&elem->list);
10426 elem->seq = 0;
10427
10428 @@ -407,19 +385,17 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
10429 * blocker with lower sequence number exists, we
10430 * cannot remove anything from the log
10431 */
10432 - spin_unlock(&fs_info->tree_mod_seq_lock);
10433 + write_unlock(&fs_info->tree_mod_log_lock);
10434 return;
10435 }
10436 min_seq = cur_elem->seq;
10437 }
10438 }
10439 - spin_unlock(&fs_info->tree_mod_seq_lock);
10440
10441 /*
10442 * anything that's lower than the lowest existing (read: blocked)
10443 * sequence number can be removed from the tree.
10444 */
10445 - tree_mod_log_write_lock(fs_info);
10446 tm_root = &fs_info->tree_mod_log;
10447 for (node = rb_first(tm_root); node; node = next) {
10448 next = rb_next(node);
10449 @@ -429,7 +405,7 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
10450 rb_erase(node, tm_root);
10451 kfree(tm);
10452 }
10453 - tree_mod_log_write_unlock(fs_info);
10454 + write_unlock(&fs_info->tree_mod_log_lock);
10455 }
10456
10457 /*
10458 @@ -440,7 +416,7 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
10459 * for root replace operations, or the logical address of the affected
10460 * block for all other operations.
10461 *
10462 - * Note: must be called with write lock (tree_mod_log_write_lock).
10463 + * Note: must be called with write lock for fs_info::tree_mod_log_lock.
10464 */
10465 static noinline int
10466 __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
10467 @@ -480,7 +456,7 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
10468 * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
10469 * returns zero with the tree_mod_log_lock acquired. The caller must hold
10470 * this until all tree mod log insertions are recorded in the rb tree and then
10471 - * call tree_mod_log_write_unlock() to release.
10472 + * write unlock fs_info::tree_mod_log_lock.
10473 */
10474 static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
10475 struct extent_buffer *eb) {
10476 @@ -490,9 +466,9 @@ static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
10477 if (eb && btrfs_header_level(eb) == 0)
10478 return 1;
10479
10480 - tree_mod_log_write_lock(fs_info);
10481 + write_lock(&fs_info->tree_mod_log_lock);
10482 if (list_empty(&(fs_info)->tree_mod_seq_list)) {
10483 - tree_mod_log_write_unlock(fs_info);
10484 + write_unlock(&fs_info->tree_mod_log_lock);
10485 return 1;
10486 }
10487
10488 @@ -556,7 +532,7 @@ tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
10489 }
10490
10491 ret = __tree_mod_log_insert(fs_info, tm);
10492 - tree_mod_log_write_unlock(fs_info);
10493 + write_unlock(&eb->fs_info->tree_mod_log_lock);
10494 if (ret)
10495 kfree(tm);
10496
10497 @@ -620,7 +596,7 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
10498 ret = __tree_mod_log_insert(fs_info, tm);
10499 if (ret)
10500 goto free_tms;
10501 - tree_mod_log_write_unlock(fs_info);
10502 + write_unlock(&eb->fs_info->tree_mod_log_lock);
10503 kfree(tm_list);
10504
10505 return 0;
10506 @@ -631,7 +607,7 @@ free_tms:
10507 kfree(tm_list[i]);
10508 }
10509 if (locked)
10510 - tree_mod_log_write_unlock(fs_info);
10511 + write_unlock(&eb->fs_info->tree_mod_log_lock);
10512 kfree(tm_list);
10513 kfree(tm);
10514
10515 @@ -712,7 +688,7 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
10516 if (!ret)
10517 ret = __tree_mod_log_insert(fs_info, tm);
10518
10519 - tree_mod_log_write_unlock(fs_info);
10520 + write_unlock(&fs_info->tree_mod_log_lock);
10521 if (ret)
10522 goto free_tms;
10523 kfree(tm_list);
10524 @@ -739,7 +715,7 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
10525 struct tree_mod_elem *cur = NULL;
10526 struct tree_mod_elem *found = NULL;
10527
10528 - tree_mod_log_read_lock(fs_info);
10529 + read_lock(&fs_info->tree_mod_log_lock);
10530 tm_root = &fs_info->tree_mod_log;
10531 node = tm_root->rb_node;
10532 while (node) {
10533 @@ -767,7 +743,7 @@ __tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
10534 break;
10535 }
10536 }
10537 - tree_mod_log_read_unlock(fs_info);
10538 + read_unlock(&fs_info->tree_mod_log_lock);
10539
10540 return found;
10541 }
10542 @@ -848,7 +824,7 @@ tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
10543 goto free_tms;
10544 }
10545
10546 - tree_mod_log_write_unlock(fs_info);
10547 + write_unlock(&fs_info->tree_mod_log_lock);
10548 kfree(tm_list);
10549
10550 return 0;
10551 @@ -860,7 +836,7 @@ free_tms:
10552 kfree(tm_list[i]);
10553 }
10554 if (locked)
10555 - tree_mod_log_write_unlock(fs_info);
10556 + write_unlock(&fs_info->tree_mod_log_lock);
10557 kfree(tm_list);
10558
10559 return ret;
10560 @@ -920,7 +896,7 @@ tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
10561 goto free_tms;
10562
10563 ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
10564 - tree_mod_log_write_unlock(fs_info);
10565 + write_unlock(&eb->fs_info->tree_mod_log_lock);
10566 if (ret)
10567 goto free_tms;
10568 kfree(tm_list);
10569 @@ -1271,7 +1247,7 @@ __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
10570 unsigned long p_size = sizeof(struct btrfs_key_ptr);
10571
10572 n = btrfs_header_nritems(eb);
10573 - tree_mod_log_read_lock(fs_info);
10574 + read_lock(&fs_info->tree_mod_log_lock);
10575 while (tm && tm->seq >= time_seq) {
10576 /*
10577 * all the operations are recorded with the operator used for
10578 @@ -1326,7 +1302,7 @@ __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
10579 if (tm->logical != first_tm->logical)
10580 break;
10581 }
10582 - tree_mod_log_read_unlock(fs_info);
10583 + read_unlock(&fs_info->tree_mod_log_lock);
10584 btrfs_set_header_nritems(eb, n);
10585 }
10586
10587 diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
10588 index a423c36bcd72..2bc37d03d407 100644
10589 --- a/fs/btrfs/ctree.h
10590 +++ b/fs/btrfs/ctree.h
10591 @@ -851,14 +851,12 @@ struct btrfs_fs_info {
10592 struct list_head delayed_iputs;
10593 struct mutex cleaner_delayed_iput_mutex;
10594
10595 - /* this protects tree_mod_seq_list */
10596 - spinlock_t tree_mod_seq_lock;
10597 atomic64_t tree_mod_seq;
10598 - struct list_head tree_mod_seq_list;
10599
10600 - /* this protects tree_mod_log */
10601 + /* this protects tree_mod_log and tree_mod_seq_list */
10602 rwlock_t tree_mod_log_lock;
10603 struct rb_root tree_mod_log;
10604 + struct list_head tree_mod_seq_list;
10605
10606 atomic_t nr_async_submits;
10607 atomic_t async_submit_draining;
10608 diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
10609 index 74c17db75201..c1ca4ce11e69 100644
10610 --- a/fs/btrfs/delayed-ref.c
10611 +++ b/fs/btrfs/delayed-ref.c
10612 @@ -279,7 +279,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
10613 if (head->is_data)
10614 return;
10615
10616 - spin_lock(&fs_info->tree_mod_seq_lock);
10617 + read_lock(&fs_info->tree_mod_log_lock);
10618 if (!list_empty(&fs_info->tree_mod_seq_list)) {
10619 struct seq_list *elem;
10620
10621 @@ -287,7 +287,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
10622 struct seq_list, list);
10623 seq = elem->seq;
10624 }
10625 - spin_unlock(&fs_info->tree_mod_seq_lock);
10626 + read_unlock(&fs_info->tree_mod_log_lock);
10627
10628 ref = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node,
10629 list);
10630 @@ -315,7 +315,7 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
10631 struct seq_list *elem;
10632 int ret = 0;
10633
10634 - spin_lock(&fs_info->tree_mod_seq_lock);
10635 + read_lock(&fs_info->tree_mod_log_lock);
10636 if (!list_empty(&fs_info->tree_mod_seq_list)) {
10637 elem = list_first_entry(&fs_info->tree_mod_seq_list,
10638 struct seq_list, list);
10639 @@ -329,7 +329,7 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
10640 }
10641 }
10642
10643 - spin_unlock(&fs_info->tree_mod_seq_lock);
10644 + read_unlock(&fs_info->tree_mod_log_lock);
10645 return ret;
10646 }
10647
10648 diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
10649 index b37519241eb1..e3524ecce3d7 100644
10650 --- a/fs/btrfs/disk-io.c
10651 +++ b/fs/btrfs/disk-io.c
10652 @@ -2104,7 +2104,7 @@ static void free_root_extent_buffers(struct btrfs_root *root)
10653 }
10654
10655 /* helper to cleanup tree roots */
10656 -static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
10657 +static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
10658 {
10659 free_root_extent_buffers(info->tree_root);
10660
10661 @@ -2113,7 +2113,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
10662 free_root_extent_buffers(info->csum_root);
10663 free_root_extent_buffers(info->quota_root);
10664 free_root_extent_buffers(info->uuid_root);
10665 - if (chunk_root)
10666 + if (free_chunk_root)
10667 free_root_extent_buffers(info->chunk_root);
10668 free_root_extent_buffers(info->free_space_root);
10669 }
10670 @@ -2519,7 +2519,6 @@ int open_ctree(struct super_block *sb,
10671 spin_lock_init(&fs_info->delayed_iput_lock);
10672 spin_lock_init(&fs_info->defrag_inodes_lock);
10673 spin_lock_init(&fs_info->free_chunk_lock);
10674 - spin_lock_init(&fs_info->tree_mod_seq_lock);
10675 spin_lock_init(&fs_info->super_lock);
10676 spin_lock_init(&fs_info->qgroup_op_lock);
10677 spin_lock_init(&fs_info->buffer_lock);
10678 @@ -3136,7 +3135,7 @@ fail_block_groups:
10679 btrfs_free_block_groups(fs_info);
10680
10681 fail_tree_roots:
10682 - free_root_pointers(fs_info, 1);
10683 + free_root_pointers(fs_info, true);
10684 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
10685
10686 fail_sb_buffer:
10687 @@ -3165,7 +3164,7 @@ recovery_tree_root:
10688 if (!btrfs_test_opt(tree_root->fs_info, USEBACKUPROOT))
10689 goto fail_tree_roots;
10690
10691 - free_root_pointers(fs_info, 0);
10692 + free_root_pointers(fs_info, false);
10693
10694 /* don't use the log in recovery mode, it won't be valid */
10695 btrfs_set_super_log_root(disk_super, 0);
10696 @@ -3862,7 +3861,7 @@ void close_ctree(struct btrfs_root *root)
10697 btrfs_stop_all_workers(fs_info);
10698
10699 clear_bit(BTRFS_FS_OPEN, &fs_info->flags);
10700 - free_root_pointers(fs_info, 1);
10701 + free_root_pointers(fs_info, true);
10702
10703 iput(fs_info->btree_inode);
10704
10705 diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
10706 index 37a28e2369b9..1372d3e5d90b 100644
10707 --- a/fs/btrfs/extent_io.c
10708 +++ b/fs/btrfs/extent_io.c
10709 @@ -4060,6 +4060,14 @@ retry:
10710 */
10711 scanned = 1;
10712 index = 0;
10713 +
10714 + /*
10715 + * If we're looping we could run into a page that is locked by a
10716 + * writer and that writer could be waiting on writeback for a
10717 + * page in our current bio, and thus deadlock, so flush the
10718 + * write bio here.
10719 + */
10720 + flush_write_bio(data);
10721 goto retry;
10722 }
10723
10724 diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
10725 index bf62ad919a95..9edc2674b8a7 100644
10726 --- a/fs/btrfs/tests/btrfs-tests.c
10727 +++ b/fs/btrfs/tests/btrfs-tests.c
10728 @@ -112,7 +112,6 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void)
10729 spin_lock_init(&fs_info->qgroup_op_lock);
10730 spin_lock_init(&fs_info->super_lock);
10731 spin_lock_init(&fs_info->fs_roots_radix_lock);
10732 - spin_lock_init(&fs_info->tree_mod_seq_lock);
10733 mutex_init(&fs_info->qgroup_ioctl_lock);
10734 mutex_init(&fs_info->qgroup_rescan_lock);
10735 rwlock_init(&fs_info->tree_mod_log_lock);
10736 diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
10737 index fd6c74662e9a..31df020634cd 100644
10738 --- a/fs/btrfs/transaction.c
10739 +++ b/fs/btrfs/transaction.c
10740 @@ -1917,6 +1917,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
10741 struct btrfs_transaction *prev_trans = NULL;
10742 int ret;
10743
10744 + /*
10745 + * Some places just start a transaction to commit it. We need to make
10746 + * sure that if this commit fails that the abort code actually marks the
10747 + * transaction as failed, so set trans->dirty to make the abort code do
10748 + * the right thing.
10749 + */
10750 + trans->dirty = true;
10751 +
10752 /* Stop the commit early if ->aborted is set */
10753 if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
10754 ret = cur_trans->aborted;
10755 diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
10756 index 7ee573cddde9..f79682937faf 100644
10757 --- a/fs/btrfs/tree-log.c
10758 +++ b/fs/btrfs/tree-log.c
10759 @@ -4443,13 +4443,8 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
10760 struct btrfs_file_extent_item);
10761
10762 if (btrfs_file_extent_type(leaf, extent) ==
10763 - BTRFS_FILE_EXTENT_INLINE) {
10764 - len = btrfs_file_extent_inline_len(leaf,
10765 - path->slots[0],
10766 - extent);
10767 - ASSERT(len == i_size);
10768 + BTRFS_FILE_EXTENT_INLINE)
10769 return 0;
10770 - }
10771
10772 len = btrfs_file_extent_num_bytes(leaf, extent);
10773 /* Last extent goes beyond i_size, no need to log a hole. */
10774 diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
10775 index 5255deac86b2..e8dc28dbe563 100644
10776 --- a/fs/cifs/smb2pdu.c
10777 +++ b/fs/cifs/smb2pdu.c
10778 @@ -247,9 +247,14 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon)
10779 */
10780 mutex_lock(&tcon->ses->session_mutex);
10781 rc = cifs_negotiate_protocol(0, tcon->ses);
10782 - if (!rc && tcon->ses->need_reconnect)
10783 + if (!rc && tcon->ses->need_reconnect) {
10784 rc = cifs_setup_session(0, tcon->ses, nls_codepage);
10785 -
10786 + if ((rc == -EACCES) && !tcon->retry) {
10787 + rc = -EHOSTDOWN;
10788 + mutex_unlock(&tcon->ses->session_mutex);
10789 + goto failed;
10790 + }
10791 + }
10792 if (rc || !tcon->need_reconnect) {
10793 mutex_unlock(&tcon->ses->session_mutex);
10794 goto out;
10795 @@ -291,6 +296,7 @@ out:
10796 case SMB2_SET_INFO:
10797 rc = -EAGAIN;
10798 }
10799 +failed:
10800 unload_nls(nls_codepage);
10801 return rc;
10802 }
10803 diff --git a/fs/ext2/super.c b/fs/ext2/super.c
10804 index 6fcb29b393d3..186912c9bf56 100644
10805 --- a/fs/ext2/super.c
10806 +++ b/fs/ext2/super.c
10807 @@ -1047,9 +1047,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
10808
10809 if (EXT2_BLOCKS_PER_GROUP(sb) == 0)
10810 goto cantfind_ext2;
10811 - sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) -
10812 - le32_to_cpu(es->s_first_data_block) - 1)
10813 - / EXT2_BLOCKS_PER_GROUP(sb)) + 1;
10814 + sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) -
10815 + le32_to_cpu(es->s_first_data_block) - 1)
10816 + / EXT2_BLOCKS_PER_GROUP(sb)) + 1;
10817 db_count = (sbi->s_groups_count + EXT2_DESC_PER_BLOCK(sb) - 1) /
10818 EXT2_DESC_PER_BLOCK(sb);
10819 sbi->s_group_desc = kmalloc (db_count * sizeof (struct buffer_head *), GFP_KERNEL);
10820 diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
10821 index 0094923e5ebf..94f60f9d57fd 100644
10822 --- a/fs/ext4/page-io.c
10823 +++ b/fs/ext4/page-io.c
10824 @@ -469,16 +469,25 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
10825 nr_to_submit) {
10826 gfp_t gfp_flags = GFP_NOFS;
10827
10828 + /*
10829 + * Since bounce page allocation uses a mempool, we can only use
10830 + * a waiting mask (i.e. request guaranteed allocation) on the
10831 + * first page of the bio. Otherwise it can deadlock.
10832 + */
10833 + if (io->io_bio)
10834 + gfp_flags = GFP_NOWAIT | __GFP_NOWARN;
10835 retry_encrypt:
10836 data_page = fscrypt_encrypt_page(inode, page, gfp_flags);
10837 if (IS_ERR(data_page)) {
10838 ret = PTR_ERR(data_page);
10839 - if (ret == -ENOMEM && wbc->sync_mode == WB_SYNC_ALL) {
10840 - if (io->io_bio) {
10841 + if (ret == -ENOMEM &&
10842 + (io->io_bio || wbc->sync_mode == WB_SYNC_ALL)) {
10843 + gfp_flags = GFP_NOFS;
10844 + if (io->io_bio)
10845 ext4_io_submit(io);
10846 - congestion_wait(BLK_RW_ASYNC, HZ/50);
10847 - }
10848 - gfp_flags |= __GFP_NOFAIL;
10849 + else
10850 + gfp_flags |= __GFP_NOFAIL;
10851 + congestion_wait(BLK_RW_ASYNC, HZ/50);
10852 goto retry_encrypt;
10853 }
10854 data_page = NULL;
10855 diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
10856 index b1daeafbea92..c3428767332c 100644
10857 --- a/fs/nfs/Kconfig
10858 +++ b/fs/nfs/Kconfig
10859 @@ -89,7 +89,7 @@ config NFS_V4
10860 config NFS_SWAP
10861 bool "Provide swap over NFS support"
10862 default n
10863 - depends on NFS_FS
10864 + depends on NFS_FS && SWAP
10865 select SUNRPC_SWAP
10866 help
10867 This option enables swapon to work on files located on NFS mounts.
10868 diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
10869 index 9d7537446260..0d4a56c77a1a 100644
10870 --- a/fs/nfs/callback_proc.c
10871 +++ b/fs/nfs/callback_proc.c
10872 @@ -419,7 +419,7 @@ static bool referring_call_exists(struct nfs_client *clp,
10873 uint32_t nrclists,
10874 struct referring_call_list *rclists)
10875 {
10876 - bool status = 0;
10877 + bool status = false;
10878 int i, j;
10879 struct nfs4_session *session;
10880 struct nfs4_slot_table *tbl;
10881 diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
10882 index 1e5321d1ed22..c2665d920cf8 100644
10883 --- a/fs/nfs/dir.c
10884 +++ b/fs/nfs/dir.c
10885 @@ -57,7 +57,7 @@ static void nfs_readdir_clear_array(struct page*);
10886 const struct file_operations nfs_dir_operations = {
10887 .llseek = nfs_llseek_dir,
10888 .read = generic_read_dir,
10889 - .iterate_shared = nfs_readdir,
10890 + .iterate = nfs_readdir,
10891 .open = nfs_opendir,
10892 .release = nfs_closedir,
10893 .fsync = nfs_fsync_dir,
10894 @@ -145,7 +145,6 @@ struct nfs_cache_array_entry {
10895 };
10896
10897 struct nfs_cache_array {
10898 - atomic_t refcount;
10899 int size;
10900 int eof_index;
10901 u64 last_cookie;
10902 @@ -170,6 +169,17 @@ typedef struct {
10903 unsigned int eof:1;
10904 } nfs_readdir_descriptor_t;
10905
10906 +static
10907 +void nfs_readdir_init_array(struct page *page)
10908 +{
10909 + struct nfs_cache_array *array;
10910 +
10911 + array = kmap_atomic(page);
10912 + memset(array, 0, sizeof(struct nfs_cache_array));
10913 + array->eof_index = -1;
10914 + kunmap_atomic(array);
10915 +}
10916 +
10917 /*
10918 * The caller is responsible for calling nfs_readdir_release_array(page)
10919 */
10920 @@ -201,20 +211,12 @@ void nfs_readdir_clear_array(struct page *page)
10921 int i;
10922
10923 array = kmap_atomic(page);
10924 - if (atomic_dec_and_test(&array->refcount))
10925 - for (i = 0; i < array->size; i++)
10926 - kfree(array->array[i].string.name);
10927 + for (i = 0; i < array->size; i++)
10928 + kfree(array->array[i].string.name);
10929 + array->size = 0;
10930 kunmap_atomic(array);
10931 }
10932
10933 -static bool grab_page(struct page *page)
10934 -{
10935 - struct nfs_cache_array *array = kmap_atomic(page);
10936 - bool res = atomic_inc_not_zero(&array->refcount);
10937 - kunmap_atomic(array);
10938 - return res;
10939 -}
10940 -
10941 /*
10942 * the caller is responsible for freeing qstr.name
10943 * when called by nfs_readdir_add_to_array, the strings will be freed in
10944 @@ -287,7 +289,7 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri
10945 desc->cache_entry_index = index;
10946 return 0;
10947 out_eof:
10948 - desc->eof = 1;
10949 + desc->eof = true;
10950 return -EBADCOOKIE;
10951 }
10952
10953 @@ -341,7 +343,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
10954 if (array->eof_index >= 0) {
10955 status = -EBADCOOKIE;
10956 if (*desc->dir_cookie == array->last_cookie)
10957 - desc->eof = 1;
10958 + desc->eof = true;
10959 }
10960 out:
10961 return status;
10962 @@ -653,6 +655,8 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
10963 int status = -ENOMEM;
10964 unsigned int array_size = ARRAY_SIZE(pages);
10965
10966 + nfs_readdir_init_array(page);
10967 +
10968 entry.prev_cookie = 0;
10969 entry.cookie = desc->last_cookie;
10970 entry.eof = 0;
10971 @@ -673,9 +677,8 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
10972 status = PTR_ERR(array);
10973 goto out_label_free;
10974 }
10975 - memset(array, 0, sizeof(struct nfs_cache_array));
10976 - atomic_set(&array->refcount, 1);
10977 - array->eof_index = -1;
10978 +
10979 + array = kmap(page);
10980
10981 status = nfs_readdir_alloc_pages(pages, array_size);
10982 if (status < 0)
10983 @@ -730,6 +733,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
10984 unlock_page(page);
10985 return 0;
10986 error:
10987 + nfs_readdir_clear_array(page);
10988 unlock_page(page);
10989 return ret;
10990 }
10991 @@ -737,7 +741,6 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page)
10992 static
10993 void cache_page_release(nfs_readdir_descriptor_t *desc)
10994 {
10995 - nfs_readdir_clear_array(desc->page);
10996 put_page(desc->page);
10997 desc->page = NULL;
10998 }
10999 @@ -745,33 +748,34 @@ void cache_page_release(nfs_readdir_descriptor_t *desc)
11000 static
11001 struct page *get_cache_page(nfs_readdir_descriptor_t *desc)
11002 {
11003 - struct page *page;
11004 -
11005 - for (;;) {
11006 - page = read_cache_page(desc->file->f_mapping,
11007 + return read_cache_page(desc->file->f_mapping,
11008 desc->page_index, (filler_t *)nfs_readdir_filler, desc);
11009 - if (IS_ERR(page) || grab_page(page))
11010 - break;
11011 - put_page(page);
11012 - }
11013 - return page;
11014 }
11015
11016 /*
11017 * Returns 0 if desc->dir_cookie was found on page desc->page_index
11018 + * and locks the page to prevent removal from the page cache.
11019 */
11020 static
11021 -int find_cache_page(nfs_readdir_descriptor_t *desc)
11022 +int find_and_lock_cache_page(nfs_readdir_descriptor_t *desc)
11023 {
11024 int res;
11025
11026 desc->page = get_cache_page(desc);
11027 if (IS_ERR(desc->page))
11028 return PTR_ERR(desc->page);
11029 -
11030 - res = nfs_readdir_search_array(desc);
11031 + res = lock_page_killable(desc->page);
11032 if (res != 0)
11033 - cache_page_release(desc);
11034 + goto error;
11035 + res = -EAGAIN;
11036 + if (desc->page->mapping != NULL) {
11037 + res = nfs_readdir_search_array(desc);
11038 + if (res == 0)
11039 + return 0;
11040 + }
11041 + unlock_page(desc->page);
11042 +error:
11043 + cache_page_release(desc);
11044 return res;
11045 }
11046
11047 @@ -786,7 +790,7 @@ int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
11048 desc->last_cookie = 0;
11049 }
11050 do {
11051 - res = find_cache_page(desc);
11052 + res = find_and_lock_cache_page(desc);
11053 } while (res == -EAGAIN);
11054 return res;
11055 }
11056 @@ -815,7 +819,7 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc)
11057 ent = &array->array[i];
11058 if (!dir_emit(desc->ctx, ent->string.name, ent->string.len,
11059 nfs_compat_user_ino64(ent->ino), ent->d_type)) {
11060 - desc->eof = 1;
11061 + desc->eof = true;
11062 break;
11063 }
11064 desc->ctx->pos++;
11065 @@ -827,11 +831,10 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc)
11066 ctx->duped = 1;
11067 }
11068 if (array->eof_index >= 0)
11069 - desc->eof = 1;
11070 + desc->eof = true;
11071
11072 nfs_readdir_release_array(desc->page);
11073 out:
11074 - cache_page_release(desc);
11075 dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n",
11076 (unsigned long long)*desc->dir_cookie, res);
11077 return res;
11078 @@ -877,13 +880,13 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc)
11079
11080 status = nfs_do_filldir(desc);
11081
11082 + out_release:
11083 + nfs_readdir_clear_array(desc->page);
11084 + cache_page_release(desc);
11085 out:
11086 dfprintk(DIRCACHE, "NFS: %s: returns %d\n",
11087 __func__, status);
11088 return status;
11089 - out_release:
11090 - cache_page_release(desc);
11091 - goto out;
11092 }
11093
11094 /* The file offset position represents the dirent entry number. A
11095 @@ -928,7 +931,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
11096 if (res == -EBADCOOKIE) {
11097 res = 0;
11098 /* This means either end of directory */
11099 - if (*desc->dir_cookie && desc->eof == 0) {
11100 + if (*desc->dir_cookie && !desc->eof) {
11101 /* Or that the server has 'lost' a cookie */
11102 res = uncached_readdir(desc);
11103 if (res == 0)
11104 @@ -948,6 +951,8 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
11105 break;
11106
11107 res = nfs_do_filldir(desc);
11108 + unlock_page(desc->page);
11109 + cache_page_release(desc);
11110 if (res < 0)
11111 break;
11112 } while (!desc->eof);
11113 @@ -960,11 +965,13 @@ out:
11114
11115 static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)
11116 {
11117 + struct inode *inode = file_inode(filp);
11118 struct nfs_open_dir_context *dir_ctx = filp->private_data;
11119
11120 dfprintk(FILE, "NFS: llseek dir(%pD2, %lld, %d)\n",
11121 filp, offset, whence);
11122
11123 + inode_lock(inode);
11124 switch (whence) {
11125 case 1:
11126 offset += filp->f_pos;
11127 @@ -972,13 +979,16 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)
11128 if (offset >= 0)
11129 break;
11130 default:
11131 - return -EINVAL;
11132 + offset = -EINVAL;
11133 + goto out;
11134 }
11135 if (offset != filp->f_pos) {
11136 filp->f_pos = offset;
11137 dir_ctx->dir_cookie = 0;
11138 dir_ctx->duped = 0;
11139 }
11140 +out:
11141 + inode_unlock(inode);
11142 return offset;
11143 }
11144
11145 diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
11146 index 1ec6dd4f3e2e..3ee60c533217 100644
11147 --- a/fs/nfs/nfs4client.c
11148 +++ b/fs/nfs/nfs4client.c
11149 @@ -847,7 +847,7 @@ nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,
11150
11151 spin_lock(&nn->nfs_client_lock);
11152 list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) {
11153 - if (nfs4_cb_match_client(addr, clp, minorversion) == false)
11154 + if (!nfs4_cb_match_client(addr, clp, minorversion))
11155 continue;
11156
11157 if (!nfs4_has_session(clp))
11158 diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
11159 index ca4249ae644f..632d3c3f8dfb 100644
11160 --- a/fs/nfs/nfs4proc.c
11161 +++ b/fs/nfs/nfs4proc.c
11162 @@ -2916,6 +2916,11 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir,
11163 exception.retry = 1;
11164 continue;
11165 }
11166 + if (status == -NFS4ERR_EXPIRED) {
11167 + nfs4_schedule_lease_recovery(server->nfs_client);
11168 + exception.retry = 1;
11169 + continue;
11170 + }
11171 if (status == -EAGAIN) {
11172 /* We must have found a delegation */
11173 exception.retry = 1;
11174 diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
11175 index 0e008db16b16..c3abf92adfb7 100644
11176 --- a/fs/nfs/pnfs.c
11177 +++ b/fs/nfs/pnfs.c
11178 @@ -1436,7 +1436,7 @@ pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range,
11179 if ((range->iomode == IOMODE_RW &&
11180 ls_range->iomode != IOMODE_RW) ||
11181 (range->iomode != ls_range->iomode &&
11182 - strict_iomode == true) ||
11183 + strict_iomode) ||
11184 !pnfs_lseg_range_intersecting(ls_range, range))
11185 return 0;
11186
11187 diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
11188 index 64813697f4c4..f6cc2fddb78b 100644
11189 --- a/fs/nfsd/nfs4layouts.c
11190 +++ b/fs/nfsd/nfs4layouts.c
11191 @@ -680,7 +680,7 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
11192
11193 /* Client gets 2 lease periods to return it */
11194 cutoff = ktime_add_ns(task->tk_start,
11195 - nn->nfsd4_lease * NSEC_PER_SEC * 2);
11196 + (u64)nn->nfsd4_lease * NSEC_PER_SEC * 2);
11197
11198 if (ktime_before(now, cutoff)) {
11199 rpc_delay(task, HZ/100); /* 10 mili-seconds */
11200 diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
11201 index db4bd70b62d0..4509c76716e3 100644
11202 --- a/fs/nfsd/nfs4state.c
11203 +++ b/fs/nfsd/nfs4state.c
11204 @@ -6034,7 +6034,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
11205 }
11206
11207 if (fl_flags & FL_SLEEP) {
11208 - nbl->nbl_time = jiffies;
11209 + nbl->nbl_time = get_seconds();
11210 spin_lock(&nn->blocked_locks_lock);
11211 list_add_tail(&nbl->nbl_list, &lock_sop->lo_blocked);
11212 list_add_tail(&nbl->nbl_lru, &nn->blocked_locks_lru);
11213 diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
11214 index 133d8bf62a5c..7872b1ead885 100644
11215 --- a/fs/nfsd/state.h
11216 +++ b/fs/nfsd/state.h
11217 @@ -591,7 +591,7 @@ static inline bool nfsd4_stateid_generation_after(stateid_t *a, stateid_t *b)
11218 struct nfsd4_blocked_lock {
11219 struct list_head nbl_list;
11220 struct list_head nbl_lru;
11221 - unsigned long nbl_time;
11222 + time_t nbl_time;
11223 struct file_lock nbl_lock;
11224 struct knfsd_fh nbl_fh;
11225 struct nfsd4_callback nbl_cb;
11226 diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
11227 index b4fbeefba246..5ef0d1d60743 100644
11228 --- a/fs/ubifs/file.c
11229 +++ b/fs/ubifs/file.c
11230 @@ -721,6 +721,7 @@ static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu,
11231 int err, page_idx, page_cnt, ret = 0, n = 0;
11232 int allocate = bu->buf ? 0 : 1;
11233 loff_t isize;
11234 + gfp_t ra_gfp_mask = readahead_gfp_mask(mapping) & ~__GFP_FS;
11235
11236 err = ubifs_tnc_get_bu_keys(c, bu);
11237 if (err)
11238 @@ -782,8 +783,9 @@ static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu,
11239
11240 if (page_offset > end_index)
11241 break;
11242 - page = find_or_create_page(mapping, page_offset,
11243 - GFP_NOFS | __GFP_COLD);
11244 + page = pagecache_get_page(mapping, page_offset,
11245 + FGP_LOCK|FGP_ACCESSED|FGP_CREAT|FGP_NOWAIT,
11246 + ra_gfp_mask);
11247 if (!page)
11248 break;
11249 if (!PageUptodate(page))
11250 diff --git a/include/media/v4l2-rect.h b/include/media/v4l2-rect.h
11251 index d2125f0cc7cd..1584c760b993 100644
11252 --- a/include/media/v4l2-rect.h
11253 +++ b/include/media/v4l2-rect.h
11254 @@ -75,10 +75,10 @@ static inline void v4l2_rect_map_inside(struct v4l2_rect *r,
11255 r->left = boundary->left;
11256 if (r->top < boundary->top)
11257 r->top = boundary->top;
11258 - if (r->left + r->width > boundary->width)
11259 - r->left = boundary->width - r->width;
11260 - if (r->top + r->height > boundary->height)
11261 - r->top = boundary->height - r->height;
11262 + if (r->left + r->width > boundary->left + boundary->width)
11263 + r->left = boundary->left + boundary->width - r->width;
11264 + if (r->top + r->height > boundary->top + boundary->height)
11265 + r->top = boundary->top + boundary->height - r->height;
11266 }
11267
11268 /**
11269 diff --git a/kernel/events/core.c b/kernel/events/core.c
11270 index 64ace5e9af2a..97b90faceb97 100644
11271 --- a/kernel/events/core.c
11272 +++ b/kernel/events/core.c
11273 @@ -5303,7 +5303,15 @@ accounting:
11274 */
11275 user_lock_limit *= num_online_cpus();
11276
11277 - user_locked = atomic_long_read(&user->locked_vm) + user_extra;
11278 + user_locked = atomic_long_read(&user->locked_vm);
11279 +
11280 + /*
11281 + * sysctl_perf_event_mlock may have changed, so that
11282 + * user->locked_vm > user_lock_limit
11283 + */
11284 + if (user_locked > user_lock_limit)
11285 + user_locked = user_lock_limit;
11286 + user_locked += user_extra;
11287
11288 if (user_locked > user_lock_limit)
11289 extra = user_locked - user_lock_limit;
11290 diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
11291 index 7e4fad75acaa..2924ff544c9e 100644
11292 --- a/kernel/time/clocksource.c
11293 +++ b/kernel/time/clocksource.c
11294 @@ -272,8 +272,15 @@ static void clocksource_watchdog(unsigned long data)
11295 next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
11296 if (next_cpu >= nr_cpu_ids)
11297 next_cpu = cpumask_first(cpu_online_mask);
11298 - watchdog_timer.expires += WATCHDOG_INTERVAL;
11299 - add_timer_on(&watchdog_timer, next_cpu);
11300 +
11301 + /*
11302 + * Arm timer if not already pending: could race with concurrent
11303 + * pair clocksource_stop_watchdog() clocksource_start_watchdog().
11304 + */
11305 + if (!timer_pending(&watchdog_timer)) {
11306 + watchdog_timer.expires += WATCHDOG_INTERVAL;
11307 + add_timer_on(&watchdog_timer, next_cpu);
11308 + }
11309 out:
11310 spin_unlock(&watchdog_lock);
11311 }
11312 diff --git a/lib/test_kasan.c b/lib/test_kasan.c
11313 index 4ba4cbe169a8..6e76a448867d 100644
11314 --- a/lib/test_kasan.c
11315 +++ b/lib/test_kasan.c
11316 @@ -124,6 +124,7 @@ static noinline void __init kmalloc_oob_krealloc_more(void)
11317 if (!ptr1 || !ptr2) {
11318 pr_err("Allocation failed\n");
11319 kfree(ptr1);
11320 + kfree(ptr2);
11321 return;
11322 }
11323
11324 diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c
11325 index f5b60388d02f..4ff6e02d8b73 100644
11326 --- a/net/hsr/hsr_slave.c
11327 +++ b/net/hsr/hsr_slave.c
11328 @@ -31,6 +31,8 @@ static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb)
11329
11330 rcu_read_lock(); /* hsr->node_db, hsr->ports */
11331 port = hsr_port_get_rcu(skb->dev);
11332 + if (!port)
11333 + goto finish_pass;
11334
11335 if (hsr_addr_is_self(port->hsr, eth_hdr(skb)->h_source)) {
11336 /* Directly kill frames sent by ourselves */
11337 diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
11338 index 6e25524c6a74..02918e0d635e 100644
11339 --- a/net/ipv4/tcp.c
11340 +++ b/net/ipv4/tcp.c
11341 @@ -2298,9 +2298,11 @@ int tcp_disconnect(struct sock *sk, int flags)
11342 tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
11343 tp->snd_cwnd_cnt = 0;
11344 tp->window_clamp = 0;
11345 + tp->delivered = 0;
11346 tcp_set_ca_state(sk, TCP_CA_Open);
11347 tp->is_sack_reneg = 0;
11348 tcp_clear_retrans(tp);
11349 + tp->total_retrans = 0;
11350 inet_csk_delack_init(sk);
11351 /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0
11352 * issue in __tcp_select_window()
11353 @@ -2312,8 +2314,12 @@ int tcp_disconnect(struct sock *sk, int flags)
11354 dst_release(sk->sk_rx_dst);
11355 sk->sk_rx_dst = NULL;
11356 tcp_saved_syn_free(tp);
11357 + tp->segs_in = 0;
11358 + tp->segs_out = 0;
11359 tp->bytes_acked = 0;
11360 tp->bytes_received = 0;
11361 + tp->data_segs_in = 0;
11362 + tp->data_segs_out = 0;
11363
11364 WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
11365
11366 diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
11367 index f60e35576526..d6a64771463f 100644
11368 --- a/net/rxrpc/ar-internal.h
11369 +++ b/net/rxrpc/ar-internal.h
11370 @@ -401,6 +401,7 @@ enum rxrpc_call_flag {
11371 RXRPC_CALL_SEND_PING, /* A ping will need to be sent */
11372 RXRPC_CALL_PINGING, /* Ping in process */
11373 RXRPC_CALL_RETRANS_TIMEOUT, /* Retransmission due to timeout occurred */
11374 + RXRPC_CALL_DISCONNECTED, /* The call has been disconnected */
11375 };
11376
11377 /*
11378 diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
11379 index 1ed18d8c9c9f..88bcd146142f 100644
11380 --- a/net/rxrpc/call_object.c
11381 +++ b/net/rxrpc/call_object.c
11382 @@ -464,7 +464,7 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call)
11383
11384 _debug("RELEASE CALL %p (%d CONN %p)", call, call->debug_id, conn);
11385
11386 - if (conn)
11387 + if (conn && !test_bit(RXRPC_CALL_DISCONNECTED, &call->flags))
11388 rxrpc_disconnect_call(call);
11389
11390 for (i = 0; i < RXRPC_RXTX_BUFF_SIZE; i++) {
11391 @@ -539,6 +539,7 @@ static void rxrpc_rcu_destroy_call(struct rcu_head *rcu)
11392 {
11393 struct rxrpc_call *call = container_of(rcu, struct rxrpc_call, rcu);
11394
11395 + rxrpc_put_connection(call->conn);
11396 rxrpc_put_peer(call->peer);
11397 kfree(call->rxtx_buffer);
11398 kfree(call->rxtx_annotations);
11399 @@ -560,7 +561,6 @@ void rxrpc_cleanup_call(struct rxrpc_call *call)
11400
11401 ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE);
11402 ASSERT(test_bit(RXRPC_CALL_RELEASED, &call->flags));
11403 - ASSERTCMP(call->conn, ==, NULL);
11404
11405 /* Clean up the Rx/Tx buffer */
11406 for (i = 0; i < RXRPC_RXTX_BUFF_SIZE; i++)
11407 diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c
11408 index 0fce919bf47d..dd41733f182a 100644
11409 --- a/net/rxrpc/conn_client.c
11410 +++ b/net/rxrpc/conn_client.c
11411 @@ -736,9 +736,9 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call)
11412 struct rxrpc_channel *chan = &conn->channels[channel];
11413
11414 trace_rxrpc_client(conn, channel, rxrpc_client_chan_disconnect);
11415 - call->conn = NULL;
11416
11417 spin_lock(&conn->channel_lock);
11418 + set_bit(RXRPC_CALL_DISCONNECTED, &call->flags);
11419
11420 /* Calls that have never actually been assigned a channel can simply be
11421 * discarded. If the conn didn't get used either, it will follow
11422 @@ -828,7 +828,6 @@ out:
11423 spin_unlock(&rxrpc_client_conn_cache_lock);
11424 out_2:
11425 spin_unlock(&conn->channel_lock);
11426 - rxrpc_put_connection(conn);
11427 _leave("");
11428 return;
11429
11430 diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
11431 index e1e83af47866..e7c89b978587 100644
11432 --- a/net/rxrpc/conn_object.c
11433 +++ b/net/rxrpc/conn_object.c
11434 @@ -211,9 +211,8 @@ void rxrpc_disconnect_call(struct rxrpc_call *call)
11435 __rxrpc_disconnect_call(conn, call);
11436 spin_unlock(&conn->channel_lock);
11437
11438 - call->conn = NULL;
11439 + set_bit(RXRPC_CALL_DISCONNECTED, &call->flags);
11440 conn->idle_timestamp = jiffies;
11441 - rxrpc_put_connection(conn);
11442 }
11443
11444 /*
11445 diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
11446 index a4380e182e6c..f0ccc6a04c7a 100644
11447 --- a/net/rxrpc/input.c
11448 +++ b/net/rxrpc/input.c
11449 @@ -582,8 +582,7 @@ ack:
11450 immediate_ack, true,
11451 rxrpc_propose_ack_input_data);
11452
11453 - if (sp->hdr.seq == READ_ONCE(call->rx_hard_ack) + 1)
11454 - rxrpc_notify_socket(call);
11455 + rxrpc_notify_socket(call);
11456 _leave(" [queued]");
11457 }
11458
11459 diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c
11460 index 64389f493bb2..d568c96f5262 100644
11461 --- a/net/rxrpc/output.c
11462 +++ b/net/rxrpc/output.c
11463 @@ -96,7 +96,7 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_call *call,
11464 */
11465 int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping)
11466 {
11467 - struct rxrpc_connection *conn = NULL;
11468 + struct rxrpc_connection *conn;
11469 struct rxrpc_ack_buffer *pkt;
11470 struct msghdr msg;
11471 struct kvec iov[2];
11472 @@ -106,18 +106,14 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping)
11473 int ret;
11474 u8 reason;
11475
11476 - spin_lock_bh(&call->lock);
11477 - if (call->conn)
11478 - conn = rxrpc_get_connection_maybe(call->conn);
11479 - spin_unlock_bh(&call->lock);
11480 - if (!conn)
11481 + if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags))
11482 return -ECONNRESET;
11483
11484 pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
11485 - if (!pkt) {
11486 - rxrpc_put_connection(conn);
11487 + if (!pkt)
11488 return -ENOMEM;
11489 - }
11490 +
11491 + conn = call->conn;
11492
11493 msg.msg_name = &call->peer->srx.transport;
11494 msg.msg_namelen = call->peer->srx.transport_len;
11495 @@ -204,7 +200,6 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping)
11496 }
11497
11498 out:
11499 - rxrpc_put_connection(conn);
11500 kfree(pkt);
11501 return ret;
11502 }
11503 @@ -214,20 +209,18 @@ out:
11504 */
11505 int rxrpc_send_abort_packet(struct rxrpc_call *call)
11506 {
11507 - struct rxrpc_connection *conn = NULL;
11508 + struct rxrpc_connection *conn;
11509 struct rxrpc_abort_buffer pkt;
11510 struct msghdr msg;
11511 struct kvec iov[1];
11512 rxrpc_serial_t serial;
11513 int ret;
11514
11515 - spin_lock_bh(&call->lock);
11516 - if (call->conn)
11517 - conn = rxrpc_get_connection_maybe(call->conn);
11518 - spin_unlock_bh(&call->lock);
11519 - if (!conn)
11520 + if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags))
11521 return -ECONNRESET;
11522
11523 + conn = call->conn;
11524 +
11525 msg.msg_name = &call->peer->srx.transport;
11526 msg.msg_namelen = call->peer->srx.transport_len;
11527 msg.msg_control = NULL;
11528 @@ -255,7 +248,6 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call)
11529 ret = kernel_sendmsg(conn->params.local->socket,
11530 &msg, iov, 1, sizeof(pkt));
11531
11532 - rxrpc_put_connection(conn);
11533 return ret;
11534 }
11535
11536 diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
11537 index 322438fb3ffc..8673f9817f91 100644
11538 --- a/net/sched/cls_rsvp.h
11539 +++ b/net/sched/cls_rsvp.h
11540 @@ -455,10 +455,8 @@ static u32 gen_tunnel(struct rsvp_head *data)
11541
11542 static const struct nla_policy rsvp_policy[TCA_RSVP_MAX + 1] = {
11543 [TCA_RSVP_CLASSID] = { .type = NLA_U32 },
11544 - [TCA_RSVP_DST] = { .type = NLA_BINARY,
11545 - .len = RSVP_DST_LEN * sizeof(u32) },
11546 - [TCA_RSVP_SRC] = { .type = NLA_BINARY,
11547 - .len = RSVP_DST_LEN * sizeof(u32) },
11548 + [TCA_RSVP_DST] = { .len = RSVP_DST_LEN * sizeof(u32) },
11549 + [TCA_RSVP_SRC] = { .len = RSVP_DST_LEN * sizeof(u32) },
11550 [TCA_RSVP_PINFO] = { .len = sizeof(struct tc_rsvp_pinfo) },
11551 };
11552
11553 diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
11554 index db80a6440f37..3e1695b66e31 100644
11555 --- a/net/sched/cls_tcindex.c
11556 +++ b/net/sched/cls_tcindex.c
11557 @@ -301,12 +301,31 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
11558 cp->fall_through = p->fall_through;
11559 cp->tp = tp;
11560
11561 + if (tb[TCA_TCINDEX_HASH])
11562 + cp->hash = nla_get_u32(tb[TCA_TCINDEX_HASH]);
11563 +
11564 + if (tb[TCA_TCINDEX_MASK])
11565 + cp->mask = nla_get_u16(tb[TCA_TCINDEX_MASK]);
11566 +
11567 + if (tb[TCA_TCINDEX_SHIFT])
11568 + cp->shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]);
11569 +
11570 + if (!cp->hash) {
11571 + /* Hash not specified, use perfect hash if the upper limit
11572 + * of the hashing index is below the threshold.
11573 + */
11574 + if ((cp->mask >> cp->shift) < PERFECT_HASH_THRESHOLD)
11575 + cp->hash = (cp->mask >> cp->shift) + 1;
11576 + else
11577 + cp->hash = DEFAULT_HASH_SIZE;
11578 + }
11579 +
11580 if (p->perfect) {
11581 int i;
11582
11583 if (tcindex_alloc_perfect_hash(cp) < 0)
11584 goto errout;
11585 - for (i = 0; i < cp->hash; i++)
11586 + for (i = 0; i < min(cp->hash, p->hash); i++)
11587 cp->perfect[i].res = p->perfect[i].res;
11588 balloc = 1;
11589 }
11590 @@ -321,15 +340,6 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
11591 if (old_r)
11592 cr.res = r->res;
11593
11594 - if (tb[TCA_TCINDEX_HASH])
11595 - cp->hash = nla_get_u32(tb[TCA_TCINDEX_HASH]);
11596 -
11597 - if (tb[TCA_TCINDEX_MASK])
11598 - cp->mask = nla_get_u16(tb[TCA_TCINDEX_MASK]);
11599 -
11600 - if (tb[TCA_TCINDEX_SHIFT])
11601 - cp->shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]);
11602 -
11603 err = -EBUSY;
11604
11605 /* Hash already allocated, make sure that we still meet the
11606 @@ -347,16 +357,6 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
11607 if (tb[TCA_TCINDEX_FALL_THROUGH])
11608 cp->fall_through = nla_get_u32(tb[TCA_TCINDEX_FALL_THROUGH]);
11609
11610 - if (!cp->hash) {
11611 - /* Hash not specified, use perfect hash if the upper limit
11612 - * of the hashing index is below the threshold.
11613 - */
11614 - if ((cp->mask >> cp->shift) < PERFECT_HASH_THRESHOLD)
11615 - cp->hash = (cp->mask >> cp->shift) + 1;
11616 - else
11617 - cp->hash = DEFAULT_HASH_SIZE;
11618 - }
11619 -
11620 if (!cp->perfect && !cp->h)
11621 cp->alloc_hash = cp->hash;
11622
11623 diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
11624 index b4b68c6e3f8b..d7775ca2fbb9 100644
11625 --- a/net/sunrpc/auth_gss/svcauth_gss.c
11626 +++ b/net/sunrpc/auth_gss/svcauth_gss.c
11627 @@ -1180,6 +1180,7 @@ static int gss_proxy_save_rsc(struct cache_detail *cd,
11628 dprintk("RPC: No creds found!\n");
11629 goto out;
11630 } else {
11631 + struct timespec64 boot;
11632
11633 /* steal creds */
11634 rsci.cred = ud->creds;
11635 @@ -1200,6 +1201,9 @@ static int gss_proxy_save_rsc(struct cache_detail *cd,
11636 &expiry, GFP_KERNEL);
11637 if (status)
11638 goto out;
11639 +
11640 + getboottime64(&boot);
11641 + expiry -= boot.tv_sec;
11642 }
11643
11644 rsci.h.expiry_time = expiry;
11645 diff --git a/sound/drivers/dummy.c b/sound/drivers/dummy.c
11646 index 172dacd925f5..c182341c1714 100644
11647 --- a/sound/drivers/dummy.c
11648 +++ b/sound/drivers/dummy.c
11649 @@ -925,7 +925,7 @@ static void print_formats(struct snd_dummy *dummy,
11650 {
11651 int i;
11652
11653 - for (i = 0; i < SNDRV_PCM_FORMAT_LAST; i++) {
11654 + for (i = 0; i <= SNDRV_PCM_FORMAT_LAST; i++) {
11655 if (dummy->pcm_hw.formats & (1ULL << i))
11656 snd_iprintf(buffer, " %s", snd_pcm_format_name(i));
11657 }
11658 diff --git a/sound/soc/qcom/apq8016_sbc.c b/sound/soc/qcom/apq8016_sbc.c
11659 index 754742018515..3d91cef3704a 100644
11660 --- a/sound/soc/qcom/apq8016_sbc.c
11661 +++ b/sound/soc/qcom/apq8016_sbc.c
11662 @@ -128,7 +128,8 @@ static struct apq8016_sbc_data *apq8016_sbc_parse_of(struct snd_soc_card *card)
11663 link->codec_of_node = of_parse_phandle(codec, "sound-dai", 0);
11664 if (!link->codec_of_node) {
11665 dev_err(card->dev, "error getting codec phandle\n");
11666 - return ERR_PTR(-EINVAL);
11667 + ret = -EINVAL;
11668 + goto error;
11669 }
11670
11671 ret = snd_soc_of_get_dai_name(cpu, &link->cpu_dai_name);
11672 diff --git a/sound/soc/soc-pcm.c b/sound/soc/soc-pcm.c
11673 index 635b22fa1101..280bb5cab87f 100644
11674 --- a/sound/soc/soc-pcm.c
11675 +++ b/sound/soc/soc-pcm.c
11676 @@ -2137,42 +2137,81 @@ int dpcm_be_dai_trigger(struct snd_soc_pcm_runtime *fe, int stream,
11677 }
11678 EXPORT_SYMBOL_GPL(dpcm_be_dai_trigger);
11679
11680 +static int dpcm_dai_trigger_fe_be(struct snd_pcm_substream *substream,
11681 + int cmd, bool fe_first)
11682 +{
11683 + struct snd_soc_pcm_runtime *fe = substream->private_data;
11684 + int ret;
11685 +
11686 + /* call trigger on the frontend before the backend. */
11687 + if (fe_first) {
11688 + dev_dbg(fe->dev, "ASoC: pre trigger FE %s cmd %d\n",
11689 + fe->dai_link->name, cmd);
11690 +
11691 + ret = soc_pcm_trigger(substream, cmd);
11692 + if (ret < 0)
11693 + return ret;
11694 +
11695 + ret = dpcm_be_dai_trigger(fe, substream->stream, cmd);
11696 + return ret;
11697 + }
11698 +
11699 + /* call trigger on the frontend after the backend. */
11700 + ret = dpcm_be_dai_trigger(fe, substream->stream, cmd);
11701 + if (ret < 0)
11702 + return ret;
11703 +
11704 + dev_dbg(fe->dev, "ASoC: post trigger FE %s cmd %d\n",
11705 + fe->dai_link->name, cmd);
11706 +
11707 + ret = soc_pcm_trigger(substream, cmd);
11708 +
11709 + return ret;
11710 +}
11711 +
11712 static int dpcm_fe_dai_do_trigger(struct snd_pcm_substream *substream, int cmd)
11713 {
11714 struct snd_soc_pcm_runtime *fe = substream->private_data;
11715 - int stream = substream->stream, ret;
11716 + int stream = substream->stream;
11717 + int ret = 0;
11718 enum snd_soc_dpcm_trigger trigger = fe->dai_link->trigger[stream];
11719
11720 fe->dpcm[stream].runtime_update = SND_SOC_DPCM_UPDATE_FE;
11721
11722 switch (trigger) {
11723 case SND_SOC_DPCM_TRIGGER_PRE:
11724 - /* call trigger on the frontend before the backend. */
11725 -
11726 - dev_dbg(fe->dev, "ASoC: pre trigger FE %s cmd %d\n",
11727 - fe->dai_link->name, cmd);
11728 -
11729 - ret = soc_pcm_trigger(substream, cmd);
11730 - if (ret < 0) {
11731 - dev_err(fe->dev,"ASoC: trigger FE failed %d\n", ret);
11732 - goto out;
11733 + switch (cmd) {
11734 + case SNDRV_PCM_TRIGGER_START:
11735 + case SNDRV_PCM_TRIGGER_RESUME:
11736 + case SNDRV_PCM_TRIGGER_PAUSE_RELEASE:
11737 + ret = dpcm_dai_trigger_fe_be(substream, cmd, true);
11738 + break;
11739 + case SNDRV_PCM_TRIGGER_STOP:
11740 + case SNDRV_PCM_TRIGGER_SUSPEND:
11741 + case SNDRV_PCM_TRIGGER_PAUSE_PUSH:
11742 + ret = dpcm_dai_trigger_fe_be(substream, cmd, false);
11743 + break;
11744 + default:
11745 + ret = -EINVAL;
11746 + break;
11747 }
11748 -
11749 - ret = dpcm_be_dai_trigger(fe, substream->stream, cmd);
11750 break;
11751 case SND_SOC_DPCM_TRIGGER_POST:
11752 - /* call trigger on the frontend after the backend. */
11753 -
11754 - ret = dpcm_be_dai_trigger(fe, substream->stream, cmd);
11755 - if (ret < 0) {
11756 - dev_err(fe->dev,"ASoC: trigger FE failed %d\n", ret);
11757 - goto out;
11758 + switch (cmd) {
11759 + case SNDRV_PCM_TRIGGER_START:
11760 + case SNDRV_PCM_TRIGGER_RESUME:
11761 + case SNDRV_PCM_TRIGGER_PAUSE_RELEASE:
11762 + ret = dpcm_dai_trigger_fe_be(substream, cmd, false);
11763 + break;
11764 + case SNDRV_PCM_TRIGGER_STOP:
11765 + case SNDRV_PCM_TRIGGER_SUSPEND:
11766 + case SNDRV_PCM_TRIGGER_PAUSE_PUSH:
11767 + ret = dpcm_dai_trigger_fe_be(substream, cmd, true);
11768 + break;
11769 + default:
11770 + ret = -EINVAL;
11771 + break;
11772 }
11773 -
11774 - dev_dbg(fe->dev, "ASoC: post trigger FE %s cmd %d\n",
11775 - fe->dai_link->name, cmd);
11776 -
11777 - ret = soc_pcm_trigger(substream, cmd);
11778 break;
11779 case SND_SOC_DPCM_TRIGGER_BESPOKE:
11780 /* bespoke trigger() - handles both FE and BEs */
11781 @@ -2181,10 +2220,6 @@ static int dpcm_fe_dai_do_trigger(struct snd_pcm_substream *substream, int cmd)
11782 fe->dai_link->name, cmd);
11783
11784 ret = soc_pcm_bespoke_trigger(substream, cmd);
11785 - if (ret < 0) {
11786 - dev_err(fe->dev,"ASoC: trigger FE failed %d\n", ret);
11787 - goto out;
11788 - }
11789 break;
11790 default:
11791 dev_err(fe->dev, "ASoC: invalid trigger cmd %d for %s\n", cmd,
11792 @@ -2193,6 +2228,12 @@ static int dpcm_fe_dai_do_trigger(struct snd_pcm_substream *substream, int cmd)
11793 goto out;
11794 }
11795
11796 + if (ret < 0) {
11797 + dev_err(fe->dev, "ASoC: trigger FE cmd: %d failed: %d\n",
11798 + cmd, ret);
11799 + goto out;
11800 + }
11801 +
11802 switch (cmd) {
11803 case SNDRV_PCM_TRIGGER_START:
11804 case SNDRV_PCM_TRIGGER_RESUME:
11805 diff --git a/tools/power/acpi/Makefile.config b/tools/power/acpi/Makefile.config
11806 index a1883bbb0144..fb5559f9819a 100644
11807 --- a/tools/power/acpi/Makefile.config
11808 +++ b/tools/power/acpi/Makefile.config
11809 @@ -18,7 +18,7 @@ include $(srctree)/../../scripts/Makefile.include
11810
11811 OUTPUT=$(srctree)/
11812 ifeq ("$(origin O)", "command line")
11813 - OUTPUT := $(O)/power/acpi/
11814 + OUTPUT := $(O)/tools/power/acpi/
11815 endif
11816 #$(info Determined 'OUTPUT' to be $(OUTPUT))
11817