Magellan Linux

Annotation of /trunk/kernel-alx-legacy/patches-4.9/0180-4.9.81-all-fixes.patch



Revision 3608
Fri Aug 14 07:34:29 2020 UTC by niro
File size: 166338 bytes
-added kerenl-alx-legacy pkg
1 niro 3608 diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
2     index 4c2667aa4634..466c039c622b 100644
3     --- a/Documentation/kernel-parameters.txt
4     +++ b/Documentation/kernel-parameters.txt
5     @@ -2805,8 +2805,6 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
6     norandmaps Don't use address space randomization. Equivalent to
7     echo 0 > /proc/sys/kernel/randomize_va_space
8    
9     - noreplace-paravirt [X86,IA-64,PV_OPS] Don't patch paravirt_ops
10     -
11     noreplace-smp [X86-32,SMP] Don't replace SMP instructions
12     with UP alternatives
13    
14     diff --git a/Documentation/speculation.txt b/Documentation/speculation.txt
15     new file mode 100644
16     index 000000000000..e9e6cbae2841
17     --- /dev/null
18     +++ b/Documentation/speculation.txt
19     @@ -0,0 +1,90 @@
20     +This document explains potential effects of speculation, and how undesirable
21     +effects can be mitigated portably using common APIs.
22     +
23     +===========
24     +Speculation
25     +===========
26     +
27     +To improve performance and minimize average latencies, many contemporary CPUs
28     +employ speculative execution techniques such as branch prediction, performing
29     +work which may be discarded at a later stage.
30     +
31     +Typically speculative execution cannot be observed from architectural state,
32     +such as the contents of registers. However, in some cases it is possible to
33     +observe its impact on microarchitectural state, such as the presence or
34     +absence of data in caches. Such state may form side-channels which can be
35     +observed to extract secret information.
36     +
37     +For example, in the presence of branch prediction, it is possible for bounds
38     +checks to be ignored by code which is speculatively executed. Consider the
39     +following code:
40     +
41     + int load_array(int *array, unsigned int index)
42     + {
43     + if (index >= MAX_ARRAY_ELEMS)
44     + return 0;
45     + else
46     + return array[index];
47     + }
48     +
49     +Which, on arm64, may be compiled to an assembly sequence such as:
50     +
51     + CMP <index>, #MAX_ARRAY_ELEMS
52     + B.LT less
53     + MOV <returnval>, #0
54     + RET
55     + less:
56     + LDR <returnval>, [<array>, <index>]
57     + RET
58     +
59     +It is possible that a CPU mis-predicts the conditional branch, and
60     +speculatively loads array[index], even if index >= MAX_ARRAY_ELEMS. This
61     +value will subsequently be discarded, but the speculated load may affect
62     +microarchitectural state which can be subsequently measured.
63     +
64     +More complex sequences involving multiple dependent memory accesses may
65     +result in sensitive information being leaked. Consider the following
66     +code, building on the prior example:
67     +
68     + int load_dependent_arrays(int *arr1, int *arr2, int index)
69     + {
70     + int val1, val2,
71     +
72     + val1 = load_array(arr1, index);
73     + val2 = load_array(arr2, val1);
74     +
75     + return val2;
76     + }
77     +
78     +Under speculation, the first call to load_array() may return the value
79     +of an out-of-bounds address, while the second call will influence
80     +microarchitectural state dependent on this value. This may provide an
81     +arbitrary read primitive.
82     +
83     +====================================
84     +Mitigating speculation side-channels
85     +====================================
86     +
87     +The kernel provides a generic API to ensure that bounds checks are
88     +respected even under speculation. Architectures which are affected by
89     +speculation-based side-channels are expected to implement these
90     +primitives.
91     +
92     +The array_index_nospec() helper in <linux/nospec.h> can be used to
93     +prevent information from being leaked via side-channels.
94     +
95     +A call to array_index_nospec(index, size) returns a sanitized index
96     +value that is bounded to [0, size) even under cpu speculation
97     +conditions.
98     +
99     +This can be used to protect the earlier load_array() example:
100     +
101     + int load_array(int *array, unsigned int index)
102     + {
103     + if (index >= MAX_ARRAY_ELEMS)
104     + return 0;
105     + else {
106     + index = array_index_nospec(index, MAX_ARRAY_ELEMS);
107     + return array[index];
108     + }
109     + }
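
The masking approach used by array_index_nospec() can be illustrated outside the kernel. The sketch below is a minimal, standalone C rendering of the idea described in the new Documentation/speculation.txt above: compute a mask that is all-ones when index < size and zero otherwise, then AND it into the index so that even a mispredicted bounds check cannot yield an out-of-bounds load. The helper names are illustrative and this is not the <linux/nospec.h> implementation itself.

    #include <stdio.h>

    #define MAX_ARRAY_ELEMS 16

    /*
     * Illustrative mask: ~0UL when index < size, 0 otherwise, computed without
     * a conditional branch. Assumes size is non-zero, both values fit in a
     * signed long, and arithmetic right shift of negative values (true for
     * compilers targeting Linux).
     */
    static unsigned long demo_index_mask(unsigned long index, unsigned long size)
    {
        return ~(long)(index | (size - 1 - index)) >> (sizeof(long) * 8 - 1);
    }

    static int demo_load_array(const int *array, unsigned long index)
    {
        if (index >= MAX_ARRAY_ELEMS)
            return 0;
        /* Clamp the index so even a mispredicted path stays in bounds. */
        index &= demo_index_mask(index, MAX_ARRAY_ELEMS);
        return array[index];
    }

    int main(void)
    {
        int data[MAX_ARRAY_ELEMS] = { 42 };

        printf("%d\n", demo_load_array(data, 0));    /* 42 */
        printf("%d\n", demo_load_array(data, 100));  /* 0, rejected by the bounds check */
        return 0;
    }

On x86 the mask is generated with a cmp/sbb sequence instead (see the arch/x86/include/asm/barrier.h hunk later in this patch), but the clamp-then-access shape stays the same.
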
110     diff --git a/Makefile b/Makefile
111     index 9550b6939076..4d5753f1c37b 100644
112     --- a/Makefile
113     +++ b/Makefile
114     @@ -1,6 +1,6 @@
115     VERSION = 4
116     PATCHLEVEL = 9
117     -SUBLEVEL = 80
118     +SUBLEVEL = 81
119     EXTRAVERSION =
120     NAME = Roaring Lionus
121    
122     diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
123     index 6eda5abbd719..0a6bb48854e3 100644
124     --- a/arch/powerpc/Kconfig
125     +++ b/arch/powerpc/Kconfig
126     @@ -128,6 +128,7 @@ config PPC
127     select ARCH_HAS_GCOV_PROFILE_ALL
128     select GENERIC_SMP_IDLE_THREAD
129     select GENERIC_CMOS_UPDATE
130     + select GENERIC_CPU_VULNERABILITIES if PPC_BOOK3S_64
131     select GENERIC_TIME_VSYSCALL_OLD
132     select GENERIC_CLOCKEVENTS
133     select GENERIC_CLOCKEVENTS_BROADCAST if SMP
134     diff --git a/arch/powerpc/include/asm/exception-64e.h b/arch/powerpc/include/asm/exception-64e.h
135     index a703452d67b6..555e22d5e07f 100644
136     --- a/arch/powerpc/include/asm/exception-64e.h
137     +++ b/arch/powerpc/include/asm/exception-64e.h
138     @@ -209,5 +209,11 @@ exc_##label##_book3e:
139     ori r3,r3,vector_offset@l; \
140     mtspr SPRN_IVOR##vector_number,r3;
141    
142     +#define RFI_TO_KERNEL \
143     + rfi
144     +
145     +#define RFI_TO_USER \
146     + rfi
147     +
148     #endif /* _ASM_POWERPC_EXCEPTION_64E_H */
149    
150     diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
151     index 9a3eee661297..cab6d2a46c41 100644
152     --- a/arch/powerpc/include/asm/exception-64s.h
153     +++ b/arch/powerpc/include/asm/exception-64s.h
154     @@ -51,6 +51,59 @@
155     #define EX_PPR 88 /* SMT thread status register (priority) */
156     #define EX_CTR 96
157    
158     +/*
159     + * Macros for annotating the expected destination of (h)rfid
160     + *
161     + * The nop instructions allow us to insert one or more instructions to flush the
162     + * L1-D cache when returning to userspace or a guest.
163     + */
164     +#define RFI_FLUSH_SLOT \
165     + RFI_FLUSH_FIXUP_SECTION; \
166     + nop; \
167     + nop; \
168     + nop
169     +
170     +#define RFI_TO_KERNEL \
171     + rfid
172     +
173     +#define RFI_TO_USER \
174     + RFI_FLUSH_SLOT; \
175     + rfid; \
176     + b rfi_flush_fallback
177     +
178     +#define RFI_TO_USER_OR_KERNEL \
179     + RFI_FLUSH_SLOT; \
180     + rfid; \
181     + b rfi_flush_fallback
182     +
183     +#define RFI_TO_GUEST \
184     + RFI_FLUSH_SLOT; \
185     + rfid; \
186     + b rfi_flush_fallback
187     +
188     +#define HRFI_TO_KERNEL \
189     + hrfid
190     +
191     +#define HRFI_TO_USER \
192     + RFI_FLUSH_SLOT; \
193     + hrfid; \
194     + b hrfi_flush_fallback
195     +
196     +#define HRFI_TO_USER_OR_KERNEL \
197     + RFI_FLUSH_SLOT; \
198     + hrfid; \
199     + b hrfi_flush_fallback
200     +
201     +#define HRFI_TO_GUEST \
202     + RFI_FLUSH_SLOT; \
203     + hrfid; \
204     + b hrfi_flush_fallback
205     +
206     +#define HRFI_TO_UNKNOWN \
207     + RFI_FLUSH_SLOT; \
208     + hrfid; \
209     + b hrfi_flush_fallback
210     +
211     #ifdef CONFIG_RELOCATABLE
212     #define __EXCEPTION_RELON_PROLOG_PSERIES_1(label, h) \
213     mfspr r11,SPRN_##h##SRR0; /* save SRR0 */ \
214     diff --git a/arch/powerpc/include/asm/feature-fixups.h b/arch/powerpc/include/asm/feature-fixups.h
215     index ddf54f5bbdd1..7b332342071c 100644
216     --- a/arch/powerpc/include/asm/feature-fixups.h
217     +++ b/arch/powerpc/include/asm/feature-fixups.h
218     @@ -189,4 +189,19 @@ void apply_feature_fixups(void);
219     void setup_feature_keys(void);
220     #endif
221    
222     +#define RFI_FLUSH_FIXUP_SECTION \
223     +951: \
224     + .pushsection __rfi_flush_fixup,"a"; \
225     + .align 2; \
226     +952: \
227     + FTR_ENTRY_OFFSET 951b-952b; \
228     + .popsection;
229     +
230     +
231     +#ifndef __ASSEMBLY__
232     +
233     +extern long __start___rfi_flush_fixup, __stop___rfi_flush_fixup;
234     +
235     +#endif
236     +
237     #endif /* __ASM_POWERPC_FEATURE_FIXUPS_H */
238     diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
239     index 708edebcf147..0e12cb2437d1 100644
240     --- a/arch/powerpc/include/asm/hvcall.h
241     +++ b/arch/powerpc/include/asm/hvcall.h
242     @@ -240,6 +240,7 @@
243     #define H_GET_HCA_INFO 0x1B8
244     #define H_GET_PERF_COUNT 0x1BC
245     #define H_MANAGE_TRACE 0x1C0
246     +#define H_GET_CPU_CHARACTERISTICS 0x1C8
247     #define H_FREE_LOGICAL_LAN_BUFFER 0x1D4
248     #define H_QUERY_INT_STATE 0x1E4
249     #define H_POLL_PENDING 0x1D8
250     @@ -306,6 +307,17 @@
251     #define H_SET_MODE_RESOURCE_ADDR_TRANS_MODE 3
252     #define H_SET_MODE_RESOURCE_LE 4
253    
254     +/* H_GET_CPU_CHARACTERISTICS return values */
255     +#define H_CPU_CHAR_SPEC_BAR_ORI31 (1ull << 63) // IBM bit 0
256     +#define H_CPU_CHAR_BCCTRL_SERIALISED (1ull << 62) // IBM bit 1
257     +#define H_CPU_CHAR_L1D_FLUSH_ORI30 (1ull << 61) // IBM bit 2
258     +#define H_CPU_CHAR_L1D_FLUSH_TRIG2 (1ull << 60) // IBM bit 3
259     +#define H_CPU_CHAR_L1D_THREAD_PRIV (1ull << 59) // IBM bit 4
260     +
261     +#define H_CPU_BEHAV_FAVOUR_SECURITY (1ull << 63) // IBM bit 0
262     +#define H_CPU_BEHAV_L1D_FLUSH_PR (1ull << 62) // IBM bit 1
263     +#define H_CPU_BEHAV_BNDS_CHK_SPEC_BAR (1ull << 61) // IBM bit 2
264     +
265     #ifndef __ASSEMBLY__
266    
267     /**
268     @@ -433,6 +445,11 @@ static inline unsigned long cmo_get_page_size(void)
269     }
270     #endif /* CONFIG_PPC_PSERIES */
271    
272     +struct h_cpu_char_result {
273     + u64 character;
274     + u64 behaviour;
275     +};
276     +
277     #endif /* __ASSEMBLY__ */
278     #endif /* __KERNEL__ */
279     #endif /* _ASM_POWERPC_HVCALL_H */
280     diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
281     index 6a6792bb39fb..ea43897183fd 100644
282     --- a/arch/powerpc/include/asm/paca.h
283     +++ b/arch/powerpc/include/asm/paca.h
284     @@ -205,6 +205,16 @@ struct paca_struct {
285     struct sibling_subcore_state *sibling_subcore_state;
286     #endif
287     #endif
288     +#ifdef CONFIG_PPC_BOOK3S_64
289     + /*
290     + * rfi fallback flush must be in its own cacheline to prevent
291     + * other paca data leaking into the L1d
292     + */
293     + u64 exrfi[13] __aligned(0x80);
294     + void *rfi_flush_fallback_area;
295     + u64 l1d_flush_congruence;
296     + u64 l1d_flush_sets;
297     +#endif
298     };
299    
300     #ifdef CONFIG_PPC_BOOK3S
301     diff --git a/arch/powerpc/include/asm/plpar_wrappers.h b/arch/powerpc/include/asm/plpar_wrappers.h
302     index 1b394247afc2..4e53b8570d1f 100644
303     --- a/arch/powerpc/include/asm/plpar_wrappers.h
304     +++ b/arch/powerpc/include/asm/plpar_wrappers.h
305     @@ -340,4 +340,18 @@ static inline long plapr_set_watchpoint0(unsigned long dawr0, unsigned long dawr
306     return plpar_set_mode(0, H_SET_MODE_RESOURCE_SET_DAWR, dawr0, dawrx0);
307     }
308    
309     +static inline long plpar_get_cpu_characteristics(struct h_cpu_char_result *p)
310     +{
311     + unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
312     + long rc;
313     +
314     + rc = plpar_hcall(H_GET_CPU_CHARACTERISTICS, retbuf);
315     + if (rc == H_SUCCESS) {
316     + p->character = retbuf[0];
317     + p->behaviour = retbuf[1];
318     + }
319     +
320     + return rc;
321     +}
322     +
323     #endif /* _ASM_POWERPC_PLPAR_WRAPPERS_H */
324     diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h
325     index 654d64c9f3ac..6825a67cc3db 100644
326     --- a/arch/powerpc/include/asm/setup.h
327     +++ b/arch/powerpc/include/asm/setup.h
328     @@ -38,6 +38,19 @@ static inline void pseries_big_endian_exceptions(void) {}
329     static inline void pseries_little_endian_exceptions(void) {}
330     #endif /* CONFIG_PPC_PSERIES */
331    
332     +void rfi_flush_enable(bool enable);
333     +
334     +/* These are bit flags */
335     +enum l1d_flush_type {
336     + L1D_FLUSH_NONE = 0x1,
337     + L1D_FLUSH_FALLBACK = 0x2,
338     + L1D_FLUSH_ORI = 0x4,
339     + L1D_FLUSH_MTTRIG = 0x8,
340     +};
341     +
342     +void __init setup_rfi_flush(enum l1d_flush_type, bool enable);
343     +void do_rfi_flush_fixups(enum l1d_flush_type types);
344     +
345     #endif /* !__ASSEMBLY__ */
346    
347     #endif /* _ASM_POWERPC_SETUP_H */
348     diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
349     index c833d88c423d..64bcbd580495 100644
350     --- a/arch/powerpc/kernel/asm-offsets.c
351     +++ b/arch/powerpc/kernel/asm-offsets.c
352     @@ -240,6 +240,10 @@ int main(void)
353     #ifdef CONFIG_PPC_BOOK3S_64
354     DEFINE(PACAMCEMERGSP, offsetof(struct paca_struct, mc_emergency_sp));
355     DEFINE(PACA_IN_MCE, offsetof(struct paca_struct, in_mce));
356     + DEFINE(PACA_RFI_FLUSH_FALLBACK_AREA, offsetof(struct paca_struct, rfi_flush_fallback_area));
357     + DEFINE(PACA_EXRFI, offsetof(struct paca_struct, exrfi));
358     + DEFINE(PACA_L1D_FLUSH_CONGRUENCE, offsetof(struct paca_struct, l1d_flush_congruence));
359     + DEFINE(PACA_L1D_FLUSH_SETS, offsetof(struct paca_struct, l1d_flush_sets));
360     #endif
361     DEFINE(PACAHWCPUID, offsetof(struct paca_struct, hw_cpu_id));
362     DEFINE(PACAKEXECSTATE, offsetof(struct paca_struct, kexec_state));
363     diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
364     index caa659671599..c33b69d10919 100644
365     --- a/arch/powerpc/kernel/entry_64.S
366     +++ b/arch/powerpc/kernel/entry_64.S
367     @@ -251,13 +251,23 @@ BEGIN_FTR_SECTION
368     END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
369    
370     ld r13,GPR13(r1) /* only restore r13 if returning to usermode */
371     + ld r2,GPR2(r1)
372     + ld r1,GPR1(r1)
373     + mtlr r4
374     + mtcr r5
375     + mtspr SPRN_SRR0,r7
376     + mtspr SPRN_SRR1,r8
377     + RFI_TO_USER
378     + b . /* prevent speculative execution */
379     +
380     + /* exit to kernel */
381     1: ld r2,GPR2(r1)
382     ld r1,GPR1(r1)
383     mtlr r4
384     mtcr r5
385     mtspr SPRN_SRR0,r7
386     mtspr SPRN_SRR1,r8
387     - RFI
388     + RFI_TO_KERNEL
389     b . /* prevent speculative execution */
390    
391     syscall_error:
392     @@ -859,7 +869,7 @@ BEGIN_FTR_SECTION
393     END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
394     ACCOUNT_CPU_USER_EXIT(r13, r2, r4)
395     REST_GPR(13, r1)
396     -1:
397     +
398     mtspr SPRN_SRR1,r3
399    
400     ld r2,_CCR(r1)
401     @@ -872,8 +882,22 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
402     ld r3,GPR3(r1)
403     ld r4,GPR4(r1)
404     ld r1,GPR1(r1)
405     + RFI_TO_USER
406     + b . /* prevent speculative execution */
407    
408     - rfid
409     +1: mtspr SPRN_SRR1,r3
410     +
411     + ld r2,_CCR(r1)
412     + mtcrf 0xFF,r2
413     + ld r2,_NIP(r1)
414     + mtspr SPRN_SRR0,r2
415     +
416     + ld r0,GPR0(r1)
417     + ld r2,GPR2(r1)
418     + ld r3,GPR3(r1)
419     + ld r4,GPR4(r1)
420     + ld r1,GPR1(r1)
421     + RFI_TO_KERNEL
422     b . /* prevent speculative execution */
423    
424     #endif /* CONFIG_PPC_BOOK3E */
425     diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
426     index fd68e19b9ef7..96db6c3adebe 100644
427     --- a/arch/powerpc/kernel/exceptions-64s.S
428     +++ b/arch/powerpc/kernel/exceptions-64s.S
429     @@ -655,6 +655,8 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX)
430    
431     andi. r10,r12,MSR_RI /* check for unrecoverable exception */
432     beq- 2f
433     + andi. r10,r12,MSR_PR /* check for user mode (PR != 0) */
434     + bne 1f
435    
436     /* All done -- return from exception. */
437    
438     @@ -671,7 +673,23 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX)
439     ld r11,PACA_EXSLB+EX_R11(r13)
440     ld r12,PACA_EXSLB+EX_R12(r13)
441     ld r13,PACA_EXSLB+EX_R13(r13)
442     - rfid
443     + RFI_TO_KERNEL
444     + b . /* prevent speculative execution */
445     +
446     +1:
447     +.machine push
448     +.machine "power4"
449     + mtcrf 0x80,r9
450     + mtcrf 0x01,r9 /* slb_allocate uses cr0 and cr7 */
451     +.machine pop
452     +
453     + RESTORE_PPR_PACA(PACA_EXSLB, r9)
454     + ld r9,PACA_EXSLB+EX_R9(r13)
455     + ld r10,PACA_EXSLB+EX_R10(r13)
456     + ld r11,PACA_EXSLB+EX_R11(r13)
457     + ld r12,PACA_EXSLB+EX_R12(r13)
458     + ld r13,PACA_EXSLB+EX_R13(r13)
459     + RFI_TO_USER
460     b . /* prevent speculative execution */
461    
462     2: mfspr r11,SPRN_SRR0
463     @@ -679,7 +697,7 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX)
464     mtspr SPRN_SRR0,r10
465     ld r10,PACAKMSR(r13)
466     mtspr SPRN_SRR1,r10
467     - rfid
468     + RFI_TO_KERNEL
469     b .
470    
471     8: mfspr r11,SPRN_SRR0
472     @@ -1576,6 +1594,92 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
473     bl kernel_bad_stack
474     b 1b
475    
476     + .globl rfi_flush_fallback
477     +rfi_flush_fallback:
478     + SET_SCRATCH0(r13);
479     + GET_PACA(r13);
480     + std r9,PACA_EXRFI+EX_R9(r13)
481     + std r10,PACA_EXRFI+EX_R10(r13)
482     + std r11,PACA_EXRFI+EX_R11(r13)
483     + std r12,PACA_EXRFI+EX_R12(r13)
484     + std r8,PACA_EXRFI+EX_R13(r13)
485     + mfctr r9
486     + ld r10,PACA_RFI_FLUSH_FALLBACK_AREA(r13)
487     + ld r11,PACA_L1D_FLUSH_SETS(r13)
488     + ld r12,PACA_L1D_FLUSH_CONGRUENCE(r13)
489     + /*
490     + * The load adresses are at staggered offsets within cachelines,
491     + * which suits some pipelines better (on others it should not
492     + * hurt).
493     + */
494     + addi r12,r12,8
495     + mtctr r11
496     + DCBT_STOP_ALL_STREAM_IDS(r11) /* Stop prefetch streams */
497     +
498     + /* order ld/st prior to dcbt stop all streams with flushing */
499     + sync
500     +1: li r8,0
501     + .rept 8 /* 8-way set associative */
502     + ldx r11,r10,r8
503     + add r8,r8,r12
504     + xor r11,r11,r11 // Ensure r11 is 0 even if fallback area is not
505     + add r8,r8,r11 // Add 0, this creates a dependency on the ldx
506     + .endr
507     + addi r10,r10,128 /* 128 byte cache line */
508     + bdnz 1b
509     +
510     + mtctr r9
511     + ld r9,PACA_EXRFI+EX_R9(r13)
512     + ld r10,PACA_EXRFI+EX_R10(r13)
513     + ld r11,PACA_EXRFI+EX_R11(r13)
514     + ld r12,PACA_EXRFI+EX_R12(r13)
515     + ld r8,PACA_EXRFI+EX_R13(r13)
516     + GET_SCRATCH0(r13);
517     + rfid
518     +
519     + .globl hrfi_flush_fallback
520     +hrfi_flush_fallback:
521     + SET_SCRATCH0(r13);
522     + GET_PACA(r13);
523     + std r9,PACA_EXRFI+EX_R9(r13)
524     + std r10,PACA_EXRFI+EX_R10(r13)
525     + std r11,PACA_EXRFI+EX_R11(r13)
526     + std r12,PACA_EXRFI+EX_R12(r13)
527     + std r8,PACA_EXRFI+EX_R13(r13)
528     + mfctr r9
529     + ld r10,PACA_RFI_FLUSH_FALLBACK_AREA(r13)
530     + ld r11,PACA_L1D_FLUSH_SETS(r13)
531     + ld r12,PACA_L1D_FLUSH_CONGRUENCE(r13)
532     + /*
533     + * The load adresses are at staggered offsets within cachelines,
534     + * which suits some pipelines better (on others it should not
535     + * hurt).
536     + */
537     + addi r12,r12,8
538     + mtctr r11
539     + DCBT_STOP_ALL_STREAM_IDS(r11) /* Stop prefetch streams */
540     +
541     + /* order ld/st prior to dcbt stop all streams with flushing */
542     + sync
543     +1: li r8,0
544     + .rept 8 /* 8-way set associative */
545     + ldx r11,r10,r8
546     + add r8,r8,r12
547     + xor r11,r11,r11 // Ensure r11 is 0 even if fallback area is not
548     + add r8,r8,r11 // Add 0, this creates a dependency on the ldx
549     + .endr
550     + addi r10,r10,128 /* 128 byte cache line */
551     + bdnz 1b
552     +
553     + mtctr r9
554     + ld r9,PACA_EXRFI+EX_R9(r13)
555     + ld r10,PACA_EXRFI+EX_R10(r13)
556     + ld r11,PACA_EXRFI+EX_R11(r13)
557     + ld r12,PACA_EXRFI+EX_R12(r13)
558     + ld r8,PACA_EXRFI+EX_R13(r13)
559     + GET_SCRATCH0(r13);
560     + hrfid
561     +
562     /*
563     * Called from arch_local_irq_enable when an interrupt needs
564     * to be resent. r3 contains 0x500, 0x900, 0xa00 or 0xe80 to indicate
565     diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
566     index a12be60181bf..7c30a91c1f86 100644
567     --- a/arch/powerpc/kernel/setup_64.c
568     +++ b/arch/powerpc/kernel/setup_64.c
569     @@ -37,6 +37,7 @@
570     #include <linux/memblock.h>
571     #include <linux/memory.h>
572     #include <linux/nmi.h>
573     +#include <linux/debugfs.h>
574    
575     #include <asm/io.h>
576     #include <asm/kdump.h>
577     @@ -678,4 +679,142 @@ static int __init disable_hardlockup_detector(void)
578     return 0;
579     }
580     early_initcall(disable_hardlockup_detector);
581     +
582     +#ifdef CONFIG_PPC_BOOK3S_64
583     +static enum l1d_flush_type enabled_flush_types;
584     +static void *l1d_flush_fallback_area;
585     +static bool no_rfi_flush;
586     +bool rfi_flush;
587     +
588     +static int __init handle_no_rfi_flush(char *p)
589     +{
590     + pr_info("rfi-flush: disabled on command line.");
591     + no_rfi_flush = true;
592     + return 0;
593     +}
594     +early_param("no_rfi_flush", handle_no_rfi_flush);
595     +
596     +/*
597     + * The RFI flush is not KPTI, but because users will see doco that says to use
598     + * nopti we hijack that option here to also disable the RFI flush.
599     + */
600     +static int __init handle_no_pti(char *p)
601     +{
602     + pr_info("rfi-flush: disabling due to 'nopti' on command line.\n");
603     + handle_no_rfi_flush(NULL);
604     + return 0;
605     +}
606     +early_param("nopti", handle_no_pti);
607     +
608     +static void do_nothing(void *unused)
609     +{
610     + /*
611     + * We don't need to do the flush explicitly, just enter+exit kernel is
612     + * sufficient, the RFI exit handlers will do the right thing.
613     + */
614     +}
615     +
616     +void rfi_flush_enable(bool enable)
617     +{
618     + if (rfi_flush == enable)
619     + return;
620     +
621     + if (enable) {
622     + do_rfi_flush_fixups(enabled_flush_types);
623     + on_each_cpu(do_nothing, NULL, 1);
624     + } else
625     + do_rfi_flush_fixups(L1D_FLUSH_NONE);
626     +
627     + rfi_flush = enable;
628     +}
629     +
630     +static void init_fallback_flush(void)
631     +{
632     + u64 l1d_size, limit;
633     + int cpu;
634     +
635     + l1d_size = ppc64_caches.dsize;
636     + limit = min(safe_stack_limit(), ppc64_rma_size);
637     +
638     + /*
639     + * Align to L1d size, and size it at 2x L1d size, to catch possible
640     + * hardware prefetch runoff. We don't have a recipe for load patterns to
641     + * reliably avoid the prefetcher.
642     + */
643     + l1d_flush_fallback_area = __va(memblock_alloc_base(l1d_size * 2, l1d_size, limit));
644     + memset(l1d_flush_fallback_area, 0, l1d_size * 2);
645     +
646     + for_each_possible_cpu(cpu) {
647     + /*
648     + * The fallback flush is currently coded for 8-way
649     + * associativity. Different associativity is possible, but it
650     + * will be treated as 8-way and may not evict the lines as
651     + * effectively.
652     + *
653     + * 128 byte lines are mandatory.
654     + */
655     + u64 c = l1d_size / 8;
656     +
657     + paca[cpu].rfi_flush_fallback_area = l1d_flush_fallback_area;
658     + paca[cpu].l1d_flush_congruence = c;
659     + paca[cpu].l1d_flush_sets = c / 128;
660     + }
661     +}
662     +
663     +void __init setup_rfi_flush(enum l1d_flush_type types, bool enable)
664     +{
665     + if (types & L1D_FLUSH_FALLBACK) {
666     + pr_info("rfi-flush: Using fallback displacement flush\n");
667     + init_fallback_flush();
668     + }
669     +
670     + if (types & L1D_FLUSH_ORI)
671     + pr_info("rfi-flush: Using ori type flush\n");
672     +
673     + if (types & L1D_FLUSH_MTTRIG)
674     + pr_info("rfi-flush: Using mttrig type flush\n");
675     +
676     + enabled_flush_types = types;
677     +
678     + if (!no_rfi_flush)
679     + rfi_flush_enable(enable);
680     +}
681     +
682     +#ifdef CONFIG_DEBUG_FS
683     +static int rfi_flush_set(void *data, u64 val)
684     +{
685     + if (val == 1)
686     + rfi_flush_enable(true);
687     + else if (val == 0)
688     + rfi_flush_enable(false);
689     + else
690     + return -EINVAL;
691     +
692     + return 0;
693     +}
694     +
695     +static int rfi_flush_get(void *data, u64 *val)
696     +{
697     + *val = rfi_flush ? 1 : 0;
698     + return 0;
699     +}
700     +
701     +DEFINE_SIMPLE_ATTRIBUTE(fops_rfi_flush, rfi_flush_get, rfi_flush_set, "%llu\n");
702     +
703     +static __init int rfi_flush_debugfs_init(void)
704     +{
705     + debugfs_create_file("rfi_flush", 0600, powerpc_debugfs_root, NULL, &fops_rfi_flush);
706     + return 0;
707     +}
708     +device_initcall(rfi_flush_debugfs_init);
709     +#endif
710     +
711     +ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf)
712     +{
713     + if (rfi_flush)
714     + return sprintf(buf, "Mitigation: RFI Flush\n");
715     +
716     + return sprintf(buf, "Vulnerable\n");
717     +}
718     +#endif /* CONFIG_PPC_BOOK3S_64 */
719     #endif
720     diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
721     index 7394b770ae1f..b61fb7902018 100644
722     --- a/arch/powerpc/kernel/vmlinux.lds.S
723     +++ b/arch/powerpc/kernel/vmlinux.lds.S
724     @@ -132,6 +132,15 @@ SECTIONS
725     /* Read-only data */
726     RODATA
727    
728     +#ifdef CONFIG_PPC64
729     + . = ALIGN(8);
730     + __rfi_flush_fixup : AT(ADDR(__rfi_flush_fixup) - LOAD_OFFSET) {
731     + __start___rfi_flush_fixup = .;
732     + *(__rfi_flush_fixup)
733     + __stop___rfi_flush_fixup = .;
734     + }
735     +#endif
736     +
737     EXCEPTION_TABLE(0)
738    
739     NOTES :kernel :notes
740     diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c
741     index 043415f0bdb1..e86bfa111f3c 100644
742     --- a/arch/powerpc/lib/feature-fixups.c
743     +++ b/arch/powerpc/lib/feature-fixups.c
744     @@ -23,6 +23,7 @@
745     #include <asm/sections.h>
746     #include <asm/setup.h>
747     #include <asm/firmware.h>
748     +#include <asm/setup.h>
749    
750     struct fixup_entry {
751     unsigned long mask;
752     @@ -115,6 +116,47 @@ void do_feature_fixups(unsigned long value, void *fixup_start, void *fixup_end)
753     }
754     }
755    
756     +#ifdef CONFIG_PPC_BOOK3S_64
757     +void do_rfi_flush_fixups(enum l1d_flush_type types)
758     +{
759     + unsigned int instrs[3], *dest;
760     + long *start, *end;
761     + int i;
762     +
763     + start = PTRRELOC(&__start___rfi_flush_fixup),
764     + end = PTRRELOC(&__stop___rfi_flush_fixup);
765     +
766     + instrs[0] = 0x60000000; /* nop */
767     + instrs[1] = 0x60000000; /* nop */
768     + instrs[2] = 0x60000000; /* nop */
769     +
770     + if (types & L1D_FLUSH_FALLBACK)
771     + /* b .+16 to fallback flush */
772     + instrs[0] = 0x48000010;
773     +
774     + i = 0;
775     + if (types & L1D_FLUSH_ORI) {
776     + instrs[i++] = 0x63ff0000; /* ori 31,31,0 speculation barrier */
777     + instrs[i++] = 0x63de0000; /* ori 30,30,0 L1d flush*/
778     + }
779     +
780     + if (types & L1D_FLUSH_MTTRIG)
781     + instrs[i++] = 0x7c12dba6; /* mtspr TRIG2,r0 (SPR #882) */
782     +
783     + for (i = 0; start < end; start++, i++) {
784     + dest = (void *)start + *start;
785     +
786     + pr_devel("patching dest %lx\n", (unsigned long)dest);
787     +
788     + patch_instruction(dest, instrs[0]);
789     + patch_instruction(dest + 1, instrs[1]);
790     + patch_instruction(dest + 2, instrs[2]);
791     + }
792     +
793     + printk(KERN_DEBUG "rfi-flush: patched %d locations\n", i);
794     +}
795     +#endif /* CONFIG_PPC_BOOK3S_64 */
796     +
797     void do_lwsync_fixups(unsigned long value, void *fixup_start, void *fixup_end)
798     {
799     long *start, *end;
800     diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
801     index b33faa0015cc..6f8b4c19373a 100644
802     --- a/arch/powerpc/platforms/powernv/setup.c
803     +++ b/arch/powerpc/platforms/powernv/setup.c
804     @@ -35,13 +35,63 @@
805     #include <asm/opal.h>
806     #include <asm/kexec.h>
807     #include <asm/smp.h>
808     +#include <asm/tm.h>
809     +#include <asm/setup.h>
810    
811     #include "powernv.h"
812    
813     +static void pnv_setup_rfi_flush(void)
814     +{
815     + struct device_node *np, *fw_features;
816     + enum l1d_flush_type type;
817     + int enable;
818     +
819     + /* Default to fallback in case fw-features are not available */
820     + type = L1D_FLUSH_FALLBACK;
821     + enable = 1;
822     +
823     + np = of_find_node_by_name(NULL, "ibm,opal");
824     + fw_features = of_get_child_by_name(np, "fw-features");
825     + of_node_put(np);
826     +
827     + if (fw_features) {
828     + np = of_get_child_by_name(fw_features, "inst-l1d-flush-trig2");
829     + if (np && of_property_read_bool(np, "enabled"))
830     + type = L1D_FLUSH_MTTRIG;
831     +
832     + of_node_put(np);
833     +
834     + np = of_get_child_by_name(fw_features, "inst-l1d-flush-ori30,30,0");
835     + if (np && of_property_read_bool(np, "enabled"))
836     + type = L1D_FLUSH_ORI;
837     +
838     + of_node_put(np);
839     +
840     + /* Enable unless firmware says NOT to */
841     + enable = 2;
842     + np = of_get_child_by_name(fw_features, "needs-l1d-flush-msr-hv-1-to-0");
843     + if (np && of_property_read_bool(np, "disabled"))
844     + enable--;
845     +
846     + of_node_put(np);
847     +
848     + np = of_get_child_by_name(fw_features, "needs-l1d-flush-msr-pr-0-to-1");
849     + if (np && of_property_read_bool(np, "disabled"))
850     + enable--;
851     +
852     + of_node_put(np);
853     + of_node_put(fw_features);
854     + }
855     +
856     + setup_rfi_flush(type, enable > 0);
857     +}
858     +
859     static void __init pnv_setup_arch(void)
860     {
861     set_arch_panic_timeout(10, ARCH_PANIC_TIMEOUT);
862    
863     + pnv_setup_rfi_flush();
864     +
865     /* Initialize SMP */
866     pnv_smp_init();
867    
868     diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
869     index 97aa3f332f24..1845fc611912 100644
870     --- a/arch/powerpc/platforms/pseries/setup.c
871     +++ b/arch/powerpc/platforms/pseries/setup.c
872     @@ -450,6 +450,39 @@ static void __init find_and_init_phbs(void)
873     of_pci_check_probe_only();
874     }
875    
876     +static void pseries_setup_rfi_flush(void)
877     +{
878     + struct h_cpu_char_result result;
879     + enum l1d_flush_type types;
880     + bool enable;
881     + long rc;
882     +
883     + /* Enable by default */
884     + enable = true;
885     +
886     + rc = plpar_get_cpu_characteristics(&result);
887     + if (rc == H_SUCCESS) {
888     + types = L1D_FLUSH_NONE;
889     +
890     + if (result.character & H_CPU_CHAR_L1D_FLUSH_TRIG2)
891     + types |= L1D_FLUSH_MTTRIG;
892     + if (result.character & H_CPU_CHAR_L1D_FLUSH_ORI30)
893     + types |= L1D_FLUSH_ORI;
894     +
895     + /* Use fallback if nothing set in hcall */
896     + if (types == L1D_FLUSH_NONE)
897     + types = L1D_FLUSH_FALLBACK;
898     +
899     + if (!(result.behaviour & H_CPU_BEHAV_L1D_FLUSH_PR))
900     + enable = false;
901     + } else {
902     + /* Default to fallback if case hcall is not available */
903     + types = L1D_FLUSH_FALLBACK;
904     + }
905     +
906     + setup_rfi_flush(types, enable);
907     +}
908     +
909     static void __init pSeries_setup_arch(void)
910     {
911     set_arch_panic_timeout(10, ARCH_PANIC_TIMEOUT);
912     @@ -467,6 +500,8 @@ static void __init pSeries_setup_arch(void)
913    
914     fwnmi_init();
915    
916     + pseries_setup_rfi_flush();
917     +
918     /* By default, only probe PCI (can be overridden by rtas_pci) */
919     pci_add_flags(PCI_PROBE_ONLY);
920    
921     diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
922     index bdd9cc59d20f..b0cd306dc527 100644
923     --- a/arch/x86/entry/common.c
924     +++ b/arch/x86/entry/common.c
925     @@ -20,6 +20,7 @@
926     #include <linux/export.h>
927     #include <linux/context_tracking.h>
928     #include <linux/user-return-notifier.h>
929     +#include <linux/nospec.h>
930     #include <linux/uprobes.h>
931    
932     #include <asm/desc.h>
933     @@ -201,7 +202,7 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
934     * special case only applies after poking regs and before the
935     * very next return to user mode.
936     */
937     - current->thread.status &= ~(TS_COMPAT|TS_I386_REGS_POKED);
938     + ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED);
939     #endif
940    
941     user_enter_irqoff();
942     @@ -277,7 +278,8 @@ __visible void do_syscall_64(struct pt_regs *regs)
943     * regs->orig_ax, which changes the behavior of some syscalls.
944     */
945     if (likely((nr & __SYSCALL_MASK) < NR_syscalls)) {
946     - regs->ax = sys_call_table[nr & __SYSCALL_MASK](
947     + nr = array_index_nospec(nr & __SYSCALL_MASK, NR_syscalls);
948     + regs->ax = sys_call_table[nr](
949     regs->di, regs->si, regs->dx,
950     regs->r10, regs->r8, regs->r9);
951     }
952     @@ -299,7 +301,7 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
953     unsigned int nr = (unsigned int)regs->orig_ax;
954    
955     #ifdef CONFIG_IA32_EMULATION
956     - current->thread.status |= TS_COMPAT;
957     + ti->status |= TS_COMPAT;
958     #endif
959    
960     if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) {
961     @@ -313,6 +315,7 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
962     }
963    
964     if (likely(nr < IA32_NR_syscalls)) {
965     + nr = array_index_nospec(nr, IA32_NR_syscalls);
966     /*
967     * It's possible that a 32-bit syscall implementation
968     * takes a 64-bit parameter but nonetheless assumes that
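
The do_syscall_64() and do_syscall_32_irqs_on() hunks above apply the same pattern to syscall dispatch: bounds-check the syscall number, clamp it with array_index_nospec(), and only then index the call table. A hedged user-space analogue of that shape, with an illustrative handler table rather than real syscalls:

    #include <stdio.h>

    typedef long (*handler_fn)(long);

    static long op_negate(long a) { return -a; }
    static long op_double(long a) { return 2 * a; }

    static handler_fn handlers[] = { op_negate, op_double };
    #define NR_HANDLERS (sizeof(handlers) / sizeof(handlers[0]))

    /* Stand-in for array_index_nospec(): branchless clamp of nr to [0, size). */
    static unsigned long clamp_index(unsigned long nr, unsigned long size)
    {
        return nr & (~(long)(nr | (size - 1 - nr)) >> (sizeof(long) * 8 - 1));
    }

    static long dispatch(unsigned long nr, long arg)
    {
        if (nr >= NR_HANDLERS)
            return -1;  /* the kernel leaves -ENOSYS in regs->ax instead */
        nr = clamp_index(nr, NR_HANDLERS);
        return handlers[nr](arg);
    }

    int main(void)
    {
        printf("%ld\n", dispatch(1, 21));  /* 42 */
        printf("%ld\n", dispatch(7, 21));  /* -1, out of range */
        return 0;
    }
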
969     diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
970     index a76dc738ec61..f5434b4670c1 100644
971     --- a/arch/x86/entry/entry_32.S
972     +++ b/arch/x86/entry/entry_32.S
973     @@ -237,7 +237,8 @@ ENTRY(__switch_to_asm)
974     * exist, overwrite the RSB with entries which capture
975     * speculative execution to prevent attack.
976     */
977     - FILL_RETURN_BUFFER %ebx, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
978     + /* Clobbers %ebx */
979     + FILL_RETURN_BUFFER RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
980     #endif
981    
982     /* restore callee-saved registers */
983     diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
984     index e729e1528584..db5009ce065a 100644
985     --- a/arch/x86/entry/entry_64.S
986     +++ b/arch/x86/entry/entry_64.S
987     @@ -177,96 +177,17 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
988     pushq %r9 /* pt_regs->r9 */
989     pushq %r10 /* pt_regs->r10 */
990     pushq %r11 /* pt_regs->r11 */
991     - sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */
992     + pushq %rbx /* pt_regs->rbx */
993     + pushq %rbp /* pt_regs->rbp */
994     + pushq %r12 /* pt_regs->r12 */
995     + pushq %r13 /* pt_regs->r13 */
996     + pushq %r14 /* pt_regs->r14 */
997     + pushq %r15 /* pt_regs->r15 */
998    
999     - /*
1000     - * If we need to do entry work or if we guess we'll need to do
1001     - * exit work, go straight to the slow path.
1002     - */
1003     - movq PER_CPU_VAR(current_task), %r11
1004     - testl $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
1005     - jnz entry_SYSCALL64_slow_path
1006     -
1007     -entry_SYSCALL_64_fastpath:
1008     - /*
1009     - * Easy case: enable interrupts and issue the syscall. If the syscall
1010     - * needs pt_regs, we'll call a stub that disables interrupts again
1011     - * and jumps to the slow path.
1012     - */
1013     - TRACE_IRQS_ON
1014     - ENABLE_INTERRUPTS(CLBR_NONE)
1015     -#if __SYSCALL_MASK == ~0
1016     - cmpq $__NR_syscall_max, %rax
1017     -#else
1018     - andl $__SYSCALL_MASK, %eax
1019     - cmpl $__NR_syscall_max, %eax
1020     -#endif
1021     - ja 1f /* return -ENOSYS (already in pt_regs->ax) */
1022     - movq %r10, %rcx
1023     -
1024     - /*
1025     - * This call instruction is handled specially in stub_ptregs_64.
1026     - * It might end up jumping to the slow path. If it jumps, RAX
1027     - * and all argument registers are clobbered.
1028     - */
1029     -#ifdef CONFIG_RETPOLINE
1030     - movq sys_call_table(, %rax, 8), %rax
1031     - call __x86_indirect_thunk_rax
1032     -#else
1033     - call *sys_call_table(, %rax, 8)
1034     -#endif
1035     -.Lentry_SYSCALL_64_after_fastpath_call:
1036     -
1037     - movq %rax, RAX(%rsp)
1038     -1:
1039     -
1040     - /*
1041     - * If we get here, then we know that pt_regs is clean for SYSRET64.
1042     - * If we see that no exit work is required (which we are required
1043     - * to check with IRQs off), then we can go straight to SYSRET64.
1044     - */
1045     - DISABLE_INTERRUPTS(CLBR_NONE)
1046     - TRACE_IRQS_OFF
1047     - movq PER_CPU_VAR(current_task), %r11
1048     - testl $_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
1049     - jnz 1f
1050     -
1051     - LOCKDEP_SYS_EXIT
1052     - TRACE_IRQS_ON /* user mode is traced as IRQs on */
1053     - movq RIP(%rsp), %rcx
1054     - movq EFLAGS(%rsp), %r11
1055     - RESTORE_C_REGS_EXCEPT_RCX_R11
1056     - /*
1057     - * This opens a window where we have a user CR3, but are
1058     - * running in the kernel. This makes using the CS
1059     - * register useless for telling whether or not we need to
1060     - * switch CR3 in NMIs. Normal interrupts are OK because
1061     - * they are off here.
1062     - */
1063     - SWITCH_USER_CR3
1064     - movq RSP(%rsp), %rsp
1065     - USERGS_SYSRET64
1066     -
1067     -1:
1068     - /*
1069     - * The fast path looked good when we started, but something changed
1070     - * along the way and we need to switch to the slow path. Calling
1071     - * raise(3) will trigger this, for example. IRQs are off.
1072     - */
1073     - TRACE_IRQS_ON
1074     - ENABLE_INTERRUPTS(CLBR_NONE)
1075     - SAVE_EXTRA_REGS
1076     - movq %rsp, %rdi
1077     - call syscall_return_slowpath /* returns with IRQs disabled */
1078     - jmp return_from_SYSCALL_64
1079     -
1080     -entry_SYSCALL64_slow_path:
1081     /* IRQs are off. */
1082     - SAVE_EXTRA_REGS
1083     movq %rsp, %rdi
1084     call do_syscall_64 /* returns with IRQs disabled */
1085    
1086     -return_from_SYSCALL_64:
1087     RESTORE_EXTRA_REGS
1088     TRACE_IRQS_IRETQ /* we're about to change IF */
1089    
1090     @@ -339,6 +260,7 @@ return_from_SYSCALL_64:
1091     syscall_return_via_sysret:
1092     /* rcx and r11 are already restored (see code above) */
1093     RESTORE_C_REGS_EXCEPT_RCX_R11
1094     +
1095     /*
1096     * This opens a window where we have a user CR3, but are
1097     * running in the kernel. This makes using the CS
1098     @@ -363,45 +285,6 @@ opportunistic_sysret_failed:
1099     jmp restore_c_regs_and_iret
1100     END(entry_SYSCALL_64)
1101    
1102     -ENTRY(stub_ptregs_64)
1103     - /*
1104     - * Syscalls marked as needing ptregs land here.
1105     - * If we are on the fast path, we need to save the extra regs,
1106     - * which we achieve by trying again on the slow path. If we are on
1107     - * the slow path, the extra regs are already saved.
1108     - *
1109     - * RAX stores a pointer to the C function implementing the syscall.
1110     - * IRQs are on.
1111     - */
1112     - cmpq $.Lentry_SYSCALL_64_after_fastpath_call, (%rsp)
1113     - jne 1f
1114     -
1115     - /*
1116     - * Called from fast path -- disable IRQs again, pop return address
1117     - * and jump to slow path
1118     - */
1119     - DISABLE_INTERRUPTS(CLBR_NONE)
1120     - TRACE_IRQS_OFF
1121     - popq %rax
1122     - jmp entry_SYSCALL64_slow_path
1123     -
1124     -1:
1125     - JMP_NOSPEC %rax /* Called from C */
1126     -END(stub_ptregs_64)
1127     -
1128     -.macro ptregs_stub func
1129     -ENTRY(ptregs_\func)
1130     - leaq \func(%rip), %rax
1131     - jmp stub_ptregs_64
1132     -END(ptregs_\func)
1133     -.endm
1134     -
1135     -/* Instantiate ptregs_stub for each ptregs-using syscall */
1136     -#define __SYSCALL_64_QUAL_(sym)
1137     -#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_stub sym
1138     -#define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(sym)
1139     -#include <asm/syscalls_64.h>
1140     -
1141     /*
1142     * %rdi: prev task
1143     * %rsi: next task
1144     @@ -435,7 +318,8 @@ ENTRY(__switch_to_asm)
1145     * exist, overwrite the RSB with entries which capture
1146     * speculative execution to prevent attack.
1147     */
1148     - FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
1149     + /* Clobbers %rbx */
1150     + FILL_RETURN_BUFFER RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
1151     #endif
1152    
1153     /* restore callee-saved registers */
1154     diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
1155     index 9dbc5abb6162..6705edda4ac3 100644
1156     --- a/arch/x86/entry/syscall_64.c
1157     +++ b/arch/x86/entry/syscall_64.c
1158     @@ -6,14 +6,11 @@
1159     #include <asm/asm-offsets.h>
1160     #include <asm/syscall.h>
1161    
1162     -#define __SYSCALL_64_QUAL_(sym) sym
1163     -#define __SYSCALL_64_QUAL_ptregs(sym) ptregs_##sym
1164     -
1165     -#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long __SYSCALL_64_QUAL_##qual(sym)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
1166     +#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
1167     #include <asm/syscalls_64.h>
1168     #undef __SYSCALL_64
1169    
1170     -#define __SYSCALL_64(nr, sym, qual) [nr] = __SYSCALL_64_QUAL_##qual(sym),
1171     +#define __SYSCALL_64(nr, sym, qual) [nr] = sym,
1172    
1173     extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
1174    
1175     diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c
1176     index 982c9e31daca..21298c173b0e 100644
1177     --- a/arch/x86/events/intel/bts.c
1178     +++ b/arch/x86/events/intel/bts.c
1179     @@ -22,6 +22,7 @@
1180     #include <linux/debugfs.h>
1181     #include <linux/device.h>
1182     #include <linux/coredump.h>
1183     +#include <linux/kaiser.h>
1184    
1185     #include <asm-generic/sizes.h>
1186     #include <asm/perf_event.h>
1187     @@ -77,6 +78,23 @@ static size_t buf_size(struct page *page)
1188     return 1 << (PAGE_SHIFT + page_private(page));
1189     }
1190    
1191     +static void bts_buffer_free_aux(void *data)
1192     +{
1193     +#ifdef CONFIG_PAGE_TABLE_ISOLATION
1194     + struct bts_buffer *buf = data;
1195     + int nbuf;
1196     +
1197     + for (nbuf = 0; nbuf < buf->nr_bufs; nbuf++) {
1198     + struct page *page = buf->buf[nbuf].page;
1199     + void *kaddr = page_address(page);
1200     + size_t page_size = buf_size(page);
1201     +
1202     + kaiser_remove_mapping((unsigned long)kaddr, page_size);
1203     + }
1204     +#endif
1205     + kfree(data);
1206     +}
1207     +
1208     static void *
1209     bts_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool overwrite)
1210     {
1211     @@ -113,29 +131,33 @@ bts_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool overwrite)
1212     buf->real_size = size - size % BTS_RECORD_SIZE;
1213    
1214     for (pg = 0, nbuf = 0, offset = 0, pad = 0; nbuf < buf->nr_bufs; nbuf++) {
1215     - unsigned int __nr_pages;
1216     + void *kaddr = pages[pg];
1217     + size_t page_size;
1218     +
1219     + page = virt_to_page(kaddr);
1220     + page_size = buf_size(page);
1221     +
1222     + if (kaiser_add_mapping((unsigned long)kaddr,
1223     + page_size, __PAGE_KERNEL) < 0) {
1224     + buf->nr_bufs = nbuf;
1225     + bts_buffer_free_aux(buf);
1226     + return NULL;
1227     + }
1228    
1229     - page = virt_to_page(pages[pg]);
1230     - __nr_pages = PagePrivate(page) ? 1 << page_private(page) : 1;
1231     buf->buf[nbuf].page = page;
1232     buf->buf[nbuf].offset = offset;
1233     buf->buf[nbuf].displacement = (pad ? BTS_RECORD_SIZE - pad : 0);
1234     - buf->buf[nbuf].size = buf_size(page) - buf->buf[nbuf].displacement;
1235     + buf->buf[nbuf].size = page_size - buf->buf[nbuf].displacement;
1236     pad = buf->buf[nbuf].size % BTS_RECORD_SIZE;
1237     buf->buf[nbuf].size -= pad;
1238    
1239     - pg += __nr_pages;
1240     - offset += __nr_pages << PAGE_SHIFT;
1241     + pg += page_size >> PAGE_SHIFT;
1242     + offset += page_size;
1243     }
1244    
1245     return buf;
1246     }
1247    
1248     -static void bts_buffer_free_aux(void *data)
1249     -{
1250     - kfree(data);
1251     -}
1252     -
1253     static unsigned long bts_buffer_offset(struct bts_buffer *buf, unsigned int idx)
1254     {
1255     return buf->buf[idx].offset + buf->buf[idx].displacement;
1256     diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h
1257     index b15aa4083dfd..166654218329 100644
1258     --- a/arch/x86/include/asm/asm-prototypes.h
1259     +++ b/arch/x86/include/asm/asm-prototypes.h
1260     @@ -37,5 +37,7 @@ INDIRECT_THUNK(dx)
1261     INDIRECT_THUNK(si)
1262     INDIRECT_THUNK(di)
1263     INDIRECT_THUNK(bp)
1264     -INDIRECT_THUNK(sp)
1265     +asmlinkage void __fill_rsb(void);
1266     +asmlinkage void __clear_rsb(void);
1267     +
1268     #endif /* CONFIG_RETPOLINE */
1269     diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
1270     index 00523524edbf..7bb29a416b77 100644
1271     --- a/arch/x86/include/asm/asm.h
1272     +++ b/arch/x86/include/asm/asm.h
1273     @@ -11,10 +11,12 @@
1274     # define __ASM_FORM_COMMA(x) " " #x ","
1275     #endif
1276    
1277     -#ifdef CONFIG_X86_32
1278     +#ifndef __x86_64__
1279     +/* 32 bit */
1280     # define __ASM_SEL(a,b) __ASM_FORM(a)
1281     # define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(a)
1282     #else
1283     +/* 64 bit */
1284     # define __ASM_SEL(a,b) __ASM_FORM(b)
1285     # define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(b)
1286     #endif
1287     diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
1288     index bfb28caf97b1..857590390397 100644
1289     --- a/arch/x86/include/asm/barrier.h
1290     +++ b/arch/x86/include/asm/barrier.h
1291     @@ -23,6 +23,34 @@
1292     #define wmb() asm volatile("sfence" ::: "memory")
1293     #endif
1294    
1295     +/**
1296     + * array_index_mask_nospec() - generate a mask that is ~0UL when the
1297     + * bounds check succeeds and 0 otherwise
1298     + * @index: array element index
1299     + * @size: number of elements in array
1300     + *
1301     + * Returns:
1302     + * 0 - (index < size)
1303     + */
1304     +static inline unsigned long array_index_mask_nospec(unsigned long index,
1305     + unsigned long size)
1306     +{
1307     + unsigned long mask;
1308     +
1309     + asm ("cmp %1,%2; sbb %0,%0;"
1310     + :"=r" (mask)
1311     + :"r"(size),"r" (index)
1312     + :"cc");
1313     + return mask;
1314     +}
1315     +
1316     +/* Override the default implementation from linux/nospec.h. */
1317     +#define array_index_mask_nospec array_index_mask_nospec
1318     +
1319     +/* Prevent speculative execution past this barrier. */
1320     +#define barrier_nospec() alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, \
1321     + "lfence", X86_FEATURE_LFENCE_RDTSC)
1322     +
1323     #ifdef CONFIG_X86_PPRO_FENCE
1324     #define dma_rmb() rmb()
1325     #else
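
The cmp/sbb pair in the new array_index_mask_nospec() works because the comparison sets the carry flag exactly when index < size, and subtract-with-borrow of a register from itself then produces all-ones or zero without a conditional branch. A small user-space harness exercising the same sequence (x86-64 only, GCC or Clang extended asm, purely illustrative):

    #include <stdio.h>

    static unsigned long mask_nospec(unsigned long index, unsigned long size)
    {
        unsigned long mask;

        /* CF = (index < size); sbb then turns CF into ~0UL or 0UL. */
        asm ("cmp %1,%2; sbb %0,%0;"
             : "=r" (mask)
             : "r" (size), "r" (index)
             : "cc");
        return mask;
    }

    int main(void)
    {
        printf("%lx\n", mask_nospec(3, 16));   /* ffffffffffffffff */
        printf("%lx\n", mask_nospec(16, 16));  /* 0 */
        printf("%lx\n", mask_nospec(99, 16));  /* 0 */
        return 0;
    }

barrier_nospec(), added in the same hunk, serves a different purpose: it stops speculation outright via lfence/mfence rather than creating a data dependency, and rdtsc_ordered() is switched over to it in the asm/msr.h change below.
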
1326     diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
1327     index 9ea67a04ff4f..8c101579f535 100644
1328     --- a/arch/x86/include/asm/cpufeature.h
1329     +++ b/arch/x86/include/asm/cpufeature.h
1330     @@ -28,6 +28,7 @@ enum cpuid_leafs
1331     CPUID_8000_000A_EDX,
1332     CPUID_7_ECX,
1333     CPUID_8000_0007_EBX,
1334     + CPUID_7_EDX,
1335     };
1336    
1337     #ifdef CONFIG_X86_FEATURE_NAMES
1338     @@ -78,8 +79,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
1339     CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 15, feature_bit) || \
1340     CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 16, feature_bit) || \
1341     CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 17, feature_bit) || \
1342     + CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 18, feature_bit) || \
1343     REQUIRED_MASK_CHECK || \
1344     - BUILD_BUG_ON_ZERO(NCAPINTS != 18))
1345     + BUILD_BUG_ON_ZERO(NCAPINTS != 19))
1346    
1347     #define DISABLED_MASK_BIT_SET(feature_bit) \
1348     ( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 0, feature_bit) || \
1349     @@ -100,8 +102,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
1350     CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 15, feature_bit) || \
1351     CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 16, feature_bit) || \
1352     CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 17, feature_bit) || \
1353     + CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 18, feature_bit) || \
1354     DISABLED_MASK_CHECK || \
1355     - BUILD_BUG_ON_ZERO(NCAPINTS != 18))
1356     + BUILD_BUG_ON_ZERO(NCAPINTS != 19))
1357    
1358     #define cpu_has(c, bit) \
1359     (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
1360     diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
1361     index 8537a21acd8b..8eb23f5cf7f4 100644
1362     --- a/arch/x86/include/asm/cpufeatures.h
1363     +++ b/arch/x86/include/asm/cpufeatures.h
1364     @@ -12,7 +12,7 @@
1365     /*
1366     * Defines x86 CPU feature bits
1367     */
1368     -#define NCAPINTS 18 /* N 32-bit words worth of info */
1369     +#define NCAPINTS 19 /* N 32-bit words worth of info */
1370     #define NBUGINTS 1 /* N 32-bit bug flags */
1371    
1372     /*
1373     @@ -194,16 +194,16 @@
1374     #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
1375     #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
1376    
1377     -#define X86_FEATURE_RETPOLINE ( 7*32+12) /* Generic Retpoline mitigation for Spectre variant 2 */
1378     -#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* AMD Retpoline mitigation for Spectre variant 2 */
1379     +#define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */
1380     +#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* "" AMD Retpoline mitigation for Spectre variant 2 */
1381    
1382     -#define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */
1383     -#define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */
1384     -#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* Fill RSB on context switches */
1385     +#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */
1386    
1387     /* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */
1388     #define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */
1389    
1390     +#define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */
1391     +
1392     /* Virtualization flags: Linux defined, word 8 */
1393     #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
1394     #define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */
1395     @@ -260,6 +260,9 @@
1396     /* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */
1397     #define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */
1398     #define X86_FEATURE_IRPERF (13*32+1) /* Instructions Retired Count */
1399     +#define X86_FEATURE_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */
1400     +#define X86_FEATURE_IBRS (13*32+14) /* Indirect Branch Restricted Speculation */
1401     +#define X86_FEATURE_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */
1402    
1403     /* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */
1404     #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */
1405     @@ -295,6 +298,13 @@
1406     #define X86_FEATURE_SUCCOR (17*32+1) /* Uncorrectable error containment and recovery */
1407     #define X86_FEATURE_SMCA (17*32+3) /* Scalable MCA */
1408    
1409     +/* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */
1410     +#define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */
1411     +#define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */
1412     +#define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */
1413     +#define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */
1414     +#define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */
1415     +
1416     /*
1417     * BUG word(s)
1418     */
1419     diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
1420     index 21c5ac15657b..1f8cca459c6c 100644
1421     --- a/arch/x86/include/asm/disabled-features.h
1422     +++ b/arch/x86/include/asm/disabled-features.h
1423     @@ -59,6 +59,7 @@
1424     #define DISABLED_MASK15 0
1425     #define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE)
1426     #define DISABLED_MASK17 0
1427     -#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18)
1428     +#define DISABLED_MASK18 0
1429     +#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19)
1430    
1431     #endif /* _ASM_X86_DISABLED_FEATURES_H */
1432     diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
1433     index 34a46dc076d3..75b748a1deb8 100644
1434     --- a/arch/x86/include/asm/intel-family.h
1435     +++ b/arch/x86/include/asm/intel-family.h
1436     @@ -12,6 +12,7 @@
1437     */
1438    
1439     #define INTEL_FAM6_CORE_YONAH 0x0E
1440     +
1441     #define INTEL_FAM6_CORE2_MEROM 0x0F
1442     #define INTEL_FAM6_CORE2_MEROM_L 0x16
1443     #define INTEL_FAM6_CORE2_PENRYN 0x17
1444     @@ -21,6 +22,7 @@
1445     #define INTEL_FAM6_NEHALEM_G 0x1F /* Auburndale / Havendale */
1446     #define INTEL_FAM6_NEHALEM_EP 0x1A
1447     #define INTEL_FAM6_NEHALEM_EX 0x2E
1448     +
1449     #define INTEL_FAM6_WESTMERE 0x25
1450     #define INTEL_FAM6_WESTMERE_EP 0x2C
1451     #define INTEL_FAM6_WESTMERE_EX 0x2F
1452     @@ -36,9 +38,9 @@
1453     #define INTEL_FAM6_HASWELL_GT3E 0x46
1454    
1455     #define INTEL_FAM6_BROADWELL_CORE 0x3D
1456     -#define INTEL_FAM6_BROADWELL_XEON_D 0x56
1457     #define INTEL_FAM6_BROADWELL_GT3E 0x47
1458     #define INTEL_FAM6_BROADWELL_X 0x4F
1459     +#define INTEL_FAM6_BROADWELL_XEON_D 0x56
1460    
1461     #define INTEL_FAM6_SKYLAKE_MOBILE 0x4E
1462     #define INTEL_FAM6_SKYLAKE_DESKTOP 0x5E
1463     @@ -57,9 +59,10 @@
1464     #define INTEL_FAM6_ATOM_SILVERMONT2 0x4D /* Avaton/Rangely */
1465     #define INTEL_FAM6_ATOM_AIRMONT 0x4C /* CherryTrail / Braswell */
1466     #define INTEL_FAM6_ATOM_MERRIFIELD 0x4A /* Tangier */
1467     -#define INTEL_FAM6_ATOM_MOOREFIELD 0x5A /* Annidale */
1468     +#define INTEL_FAM6_ATOM_MOOREFIELD 0x5A /* Anniedale */
1469     #define INTEL_FAM6_ATOM_GOLDMONT 0x5C
1470     #define INTEL_FAM6_ATOM_DENVERTON 0x5F /* Goldmont Microserver */
1471     +#define INTEL_FAM6_ATOM_GEMINI_LAKE 0x7A
1472    
1473     /* Xeon Phi */
1474    
1475     diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
1476     index b11c4c072df8..c768bc1550a1 100644
1477     --- a/arch/x86/include/asm/msr-index.h
1478     +++ b/arch/x86/include/asm/msr-index.h
1479     @@ -37,6 +37,13 @@
1480     #define EFER_FFXSR (1<<_EFER_FFXSR)
1481    
1482     /* Intel MSRs. Some also available on other CPUs */
1483     +#define MSR_IA32_SPEC_CTRL 0x00000048 /* Speculation Control */
1484     +#define SPEC_CTRL_IBRS (1 << 0) /* Indirect Branch Restricted Speculation */
1485     +#define SPEC_CTRL_STIBP (1 << 1) /* Single Thread Indirect Branch Predictors */
1486     +
1487     +#define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */
1488     +#define PRED_CMD_IBPB (1 << 0) /* Indirect Branch Prediction Barrier */
1489     +
1490     #define MSR_IA32_PERFCTR0 0x000000c1
1491     #define MSR_IA32_PERFCTR1 0x000000c2
1492     #define MSR_FSB_FREQ 0x000000cd
1493     @@ -50,6 +57,11 @@
1494     #define SNB_C3_AUTO_UNDEMOTE (1UL << 28)
1495    
1496     #define MSR_MTRRcap 0x000000fe
1497     +
1498     +#define MSR_IA32_ARCH_CAPABILITIES 0x0000010a
1499     +#define ARCH_CAP_RDCL_NO (1 << 0) /* Not susceptible to Meltdown */
1500     +#define ARCH_CAP_IBRS_ALL (1 << 1) /* Enhanced IBRS support */
1501     +
1502     #define MSR_IA32_BBL_CR_CTL 0x00000119
1503     #define MSR_IA32_BBL_CR_CTL3 0x0000011e
1504    
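
The constants added to msr-index.h above are plain bit flags in two control MSRs (IA32_SPEC_CTRL, IA32_PRED_CMD) and one reporting MSR (IA32_ARCH_CAPABILITIES). As a minimal stand-alone sketch of how such a capability word is decoded; the MSR value below is a hard-coded placeholder rather than a real rdmsr, and only the two bit names come from the hunk above:

  #include <stdint.h>
  #include <stdio.h>

  #define ARCH_CAP_RDCL_NO   (1 << 0)   /* Not susceptible to Meltdown */
  #define ARCH_CAP_IBRS_ALL  (1 << 1)   /* Enhanced IBRS support */

  static void decode_arch_capabilities(uint64_t cap)
  {
      /* Each capability is a single bit in the MSR value. */
      printf("RDCL_NO  (no Meltdown)   : %s\n", (cap & ARCH_CAP_RDCL_NO)  ? "yes" : "no");
      printf("IBRS_ALL (enhanced IBRS) : %s\n", (cap & ARCH_CAP_IBRS_ALL) ? "yes" : "no");
  }

  int main(void)
  {
      /* Placeholder value; a real consumer would rdmsr this on CPUs
       * advertising X86_FEATURE_ARCH_CAPABILITIES. */
      decode_arch_capabilities(ARCH_CAP_RDCL_NO);
      return 0;
  }
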
1505     diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
1506     index b5fee97813cd..ed35b915b5c9 100644
1507     --- a/arch/x86/include/asm/msr.h
1508     +++ b/arch/x86/include/asm/msr.h
1509     @@ -188,8 +188,7 @@ static __always_inline unsigned long long rdtsc_ordered(void)
1510     * that some other imaginary CPU is updating continuously with a
1511     * time stamp.
1512     */
1513     - alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
1514     - "lfence", X86_FEATURE_LFENCE_RDTSC);
1515     + barrier_nospec();
1516     return rdtsc();
1517     }
1518    
1519     diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
1520     index 4ad41087ce0e..300cc159b4a0 100644
1521     --- a/arch/x86/include/asm/nospec-branch.h
1522     +++ b/arch/x86/include/asm/nospec-branch.h
1523     @@ -1,56 +1,12 @@
1524     /* SPDX-License-Identifier: GPL-2.0 */
1525    
1526     -#ifndef __NOSPEC_BRANCH_H__
1527     -#define __NOSPEC_BRANCH_H__
1528     +#ifndef _ASM_X86_NOSPEC_BRANCH_H_
1529     +#define _ASM_X86_NOSPEC_BRANCH_H_
1530    
1531     #include <asm/alternative.h>
1532     #include <asm/alternative-asm.h>
1533     #include <asm/cpufeatures.h>
1534    
1535     -/*
1536     - * Fill the CPU return stack buffer.
1537     - *
1538     - * Each entry in the RSB, if used for a speculative 'ret', contains an
1539     - * infinite 'pause; lfence; jmp' loop to capture speculative execution.
1540     - *
1541     - * This is required in various cases for retpoline and IBRS-based
1542     - * mitigations for the Spectre variant 2 vulnerability. Sometimes to
1543     - * eliminate potentially bogus entries from the RSB, and sometimes
1544     - * purely to ensure that it doesn't get empty, which on some CPUs would
1545     - * allow predictions from other (unwanted!) sources to be used.
1546     - *
1547     - * We define a CPP macro such that it can be used from both .S files and
1548     - * inline assembly. It's possible to do a .macro and then include that
1549     - * from C via asm(".include <asm/nospec-branch.h>") but let's not go there.
1550     - */
1551     -
1552     -#define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */
1553     -#define RSB_FILL_LOOPS 16 /* To avoid underflow */
1554     -
1555     -/*
1556     - * Google experimented with loop-unrolling and this turned out to be
1557     - * the optimal version — two calls, each with their own speculation
1558     - * trap should their return address end up getting used, in a loop.
1559     - */
1560     -#define __FILL_RETURN_BUFFER(reg, nr, sp) \
1561     - mov $(nr/2), reg; \
1562     -771: \
1563     - call 772f; \
1564     -773: /* speculation trap */ \
1565     - pause; \
1566     - lfence; \
1567     - jmp 773b; \
1568     -772: \
1569     - call 774f; \
1570     -775: /* speculation trap */ \
1571     - pause; \
1572     - lfence; \
1573     - jmp 775b; \
1574     -774: \
1575     - dec reg; \
1576     - jnz 771b; \
1577     - add $(BITS_PER_LONG/8) * nr, sp;
1578     -
1579     #ifdef __ASSEMBLY__
1580    
1581     /*
1582     @@ -121,17 +77,10 @@
1583     #endif
1584     .endm
1585    
1586     - /*
1587     - * A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP
1588     - * monstrosity above, manually.
1589     - */
1590     -.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req
1591     +/* This clobbers the BX register */
1592     +.macro FILL_RETURN_BUFFER nr:req ftr:req
1593     #ifdef CONFIG_RETPOLINE
1594     - ANNOTATE_NOSPEC_ALTERNATIVE
1595     - ALTERNATIVE "jmp .Lskip_rsb_\@", \
1596     - __stringify(__FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP)) \
1597     - \ftr
1598     -.Lskip_rsb_\@:
1599     + ALTERNATIVE "", "call __clear_rsb", \ftr
1600     #endif
1601     .endm
1602    
1603     @@ -201,22 +150,30 @@ extern char __indirect_thunk_end[];
1604     * On VMEXIT we must ensure that no RSB predictions learned in the guest
1605     * can be followed in the host, by overwriting the RSB completely. Both
1606     * retpoline and IBRS mitigations for Spectre v2 need this; only on future
1607     - * CPUs with IBRS_ATT *might* it be avoided.
1608     + * CPUs with IBRS_ALL *might* it be avoided.
1609     */
1610     static inline void vmexit_fill_RSB(void)
1611     {
1612     #ifdef CONFIG_RETPOLINE
1613     - unsigned long loops;
1614     -
1615     - asm volatile (ANNOTATE_NOSPEC_ALTERNATIVE
1616     - ALTERNATIVE("jmp 910f",
1617     - __stringify(__FILL_RETURN_BUFFER(%0, RSB_CLEAR_LOOPS, %1)),
1618     - X86_FEATURE_RETPOLINE)
1619     - "910:"
1620     - : "=r" (loops), ASM_CALL_CONSTRAINT
1621     - : : "memory" );
1622     + alternative_input("",
1623     + "call __fill_rsb",
1624     + X86_FEATURE_RETPOLINE,
1625     + ASM_NO_INPUT_CLOBBER(_ASM_BX, "memory"));
1626     #endif
1627     }
1628    
1629     +static inline void indirect_branch_prediction_barrier(void)
1630     +{
1631     + asm volatile(ALTERNATIVE("",
1632     + "movl %[msr], %%ecx\n\t"
1633     + "movl %[val], %%eax\n\t"
1634     + "movl $0, %%edx\n\t"
1635     + "wrmsr",
1636     + X86_FEATURE_USE_IBPB)
1637     + : : [msr] "i" (MSR_IA32_PRED_CMD),
1638     + [val] "i" (PRED_CMD_IBPB)
1639     + : "eax", "ecx", "edx", "memory");
1640     +}
1641     +
1642     #endif /* __ASSEMBLY__ */
1643     -#endif /* __NOSPEC_BRANCH_H__ */
1644     +#endif /* _ASM_X86_NOSPEC_BRANCH_H_ */
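
The indirect_branch_prediction_barrier() added above keeps the wrmsr inside an ALTERNATIVE so it can be patched out on CPUs without X86_FEATURE_USE_IBPB. Stripped of the runtime patching, the intent is a single write of PRED_CMD_IBPB to MSR_IA32_PRED_CMD; a hedged user-space sketch of that data flow, where wrmsr_stub is only a logging stand-in for the privileged instruction:

  #include <stdint.h>
  #include <stdio.h>

  #define MSR_IA32_PRED_CMD  0x00000049
  #define PRED_CMD_IBPB      (1 << 0)

  /* Stand-in for the privileged wrmsr instruction; it only logs. */
  static void wrmsr_stub(uint32_t msr, uint64_t val)
  {
      printf("wrmsr(0x%08x) <- 0x%llx\n", (unsigned int)msr,
             (unsigned long long)val);
  }

  /* What the alternative-patched asm amounts to once the feature check
   * has been folded in at patch time. */
  static void ibpb_barrier_sketch(int cpu_has_use_ibpb)
  {
      if (cpu_has_use_ibpb)
          wrmsr_stub(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
  }

  int main(void)
  {
      ibpb_barrier_sketch(1);
      return 0;
  }
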
1645     diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
1646     index 1178a51b77f3..b6d425999f99 100644
1647     --- a/arch/x86/include/asm/pgalloc.h
1648     +++ b/arch/x86/include/asm/pgalloc.h
1649     @@ -27,17 +27,6 @@ static inline void paravirt_release_pud(unsigned long pfn) {}
1650     */
1651     extern gfp_t __userpte_alloc_gfp;
1652    
1653     -#ifdef CONFIG_PAGE_TABLE_ISOLATION
1654     -/*
1655     - * Instead of one PGD, we acquire two PGDs. Being order-1, it is
1656     - * both 8k in size and 8k-aligned. That lets us just flip bit 12
1657     - * in a pointer to swap between the two 4k halves.
1658     - */
1659     -#define PGD_ALLOCATION_ORDER 1
1660     -#else
1661     -#define PGD_ALLOCATION_ORDER 0
1662     -#endif
1663     -
1664     /*
1665     * Allocate and free page tables.
1666     */
1667     diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
1668     index 2536f90cd30c..5af0401ccff2 100644
1669     --- a/arch/x86/include/asm/pgtable.h
1670     +++ b/arch/x86/include/asm/pgtable.h
1671     @@ -20,9 +20,15 @@
1672    
1673     #ifdef CONFIG_PAGE_TABLE_ISOLATION
1674     extern int kaiser_enabled;
1675     +/*
1676     + * Instead of one PGD, we acquire two PGDs. Being order-1, it is
1677     + * both 8k in size and 8k-aligned. That lets us just flip bit 12
1678     + * in a pointer to swap between the two 4k halves.
1679     + */
1680     #else
1681     #define kaiser_enabled 0
1682     #endif
1683     +#define PGD_ALLOCATION_ORDER kaiser_enabled
1684    
1685     void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
1686     void ptdump_walk_pgd_level_checkwx(void);
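
The comment relocated into pgtable.h is the whole idea behind tying PGD_ALLOCATION_ORDER to kaiser_enabled: an order-1 (8k, 8k-aligned) allocation holds the kernel and user page-table halves, and the two 4k halves differ only in bit 12 of the address. A small stand-alone illustration of that address arithmetic; the base address is a made-up example, not a real PGD:

  #include <stdint.h>
  #include <stdio.h>

  #define PAGE_SIZE 4096UL

  /* With an 8k-aligned, 8k-sized allocation, the two halves of the PGD
   * differ only in bit 12 of the address, so one XOR switches halves. */
  static uintptr_t other_pgd_half(uintptr_t pgd)
  {
      return pgd ^ PAGE_SIZE;   /* flip bit 12 */
  }

  int main(void)
  {
      uintptr_t kernel_pgd = 0x100000;   /* pretend 8k-aligned base */
      uintptr_t user_pgd   = other_pgd_half(kernel_pgd);

      printf("kernel half: 0x%lx\n", (unsigned long)kernel_pgd);
      printf("user   half: 0x%lx\n", (unsigned long)user_pgd);
      return 0;
  }
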
1687     diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
1688     index 353f038ec645..cb866ae1bc5d 100644
1689     --- a/arch/x86/include/asm/processor.h
1690     +++ b/arch/x86/include/asm/processor.h
1691     @@ -391,8 +391,6 @@ struct thread_struct {
1692     unsigned short gsindex;
1693     #endif
1694    
1695     - u32 status; /* thread synchronous flags */
1696     -
1697     #ifdef CONFIG_X86_64
1698     unsigned long fsbase;
1699     unsigned long gsbase;
1700     diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
1701     index fac9a5c0abe9..6847d85400a8 100644
1702     --- a/arch/x86/include/asm/required-features.h
1703     +++ b/arch/x86/include/asm/required-features.h
1704     @@ -100,6 +100,7 @@
1705     #define REQUIRED_MASK15 0
1706     #define REQUIRED_MASK16 0
1707     #define REQUIRED_MASK17 0
1708     -#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18)
1709     +#define REQUIRED_MASK18 0
1710     +#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 19)
1711    
1712     #endif /* _ASM_X86_REQUIRED_FEATURES_H */
1713     diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
1714     index e3c95e8e61c5..03eedc21246d 100644
1715     --- a/arch/x86/include/asm/syscall.h
1716     +++ b/arch/x86/include/asm/syscall.h
1717     @@ -60,7 +60,7 @@ static inline long syscall_get_error(struct task_struct *task,
1718     * TS_COMPAT is set for 32-bit syscall entries and then
1719     * remains set until we return to user mode.
1720     */
1721     - if (task->thread.status & (TS_COMPAT|TS_I386_REGS_POKED))
1722     + if (task->thread_info.status & (TS_COMPAT|TS_I386_REGS_POKED))
1723     /*
1724     * Sign-extend the value so (int)-EFOO becomes (long)-EFOO
1725     * and will match correctly in comparisons.
1726     @@ -116,7 +116,7 @@ static inline void syscall_get_arguments(struct task_struct *task,
1727     unsigned long *args)
1728     {
1729     # ifdef CONFIG_IA32_EMULATION
1730     - if (task->thread.status & TS_COMPAT)
1731     + if (task->thread_info.status & TS_COMPAT)
1732     switch (i) {
1733     case 0:
1734     if (!n--) break;
1735     @@ -177,7 +177,7 @@ static inline void syscall_set_arguments(struct task_struct *task,
1736     const unsigned long *args)
1737     {
1738     # ifdef CONFIG_IA32_EMULATION
1739     - if (task->thread.status & TS_COMPAT)
1740     + if (task->thread_info.status & TS_COMPAT)
1741     switch (i) {
1742     case 0:
1743     if (!n--) break;
1744     diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
1745     index bdf9c4c91572..89978b9c667a 100644
1746     --- a/arch/x86/include/asm/thread_info.h
1747     +++ b/arch/x86/include/asm/thread_info.h
1748     @@ -54,6 +54,7 @@ struct task_struct;
1749    
1750     struct thread_info {
1751     unsigned long flags; /* low level flags */
1752     + u32 status; /* thread synchronous flags */
1753     };
1754    
1755     #define INIT_THREAD_INFO(tsk) \
1756     @@ -213,7 +214,7 @@ static inline int arch_within_stack_frames(const void * const stack,
1757     #define in_ia32_syscall() true
1758     #else
1759     #define in_ia32_syscall() (IS_ENABLED(CONFIG_IA32_EMULATION) && \
1760     - current->thread.status & TS_COMPAT)
1761     + current_thread_info()->status & TS_COMPAT)
1762     #endif
1763    
1764     /*
1765     diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
1766     index dead0f3921f3..a8d85a687cf4 100644
1767     --- a/arch/x86/include/asm/uaccess.h
1768     +++ b/arch/x86/include/asm/uaccess.h
1769     @@ -123,6 +123,11 @@ extern int __get_user_bad(void);
1770    
1771     #define __uaccess_begin() stac()
1772     #define __uaccess_end() clac()
1773     +#define __uaccess_begin_nospec() \
1774     +({ \
1775     + stac(); \
1776     + barrier_nospec(); \
1777     +})
1778    
1779     /*
1780     * This is a type: either unsigned long, if the argument fits into
1781     @@ -432,7 +437,7 @@ do { \
1782     ({ \
1783     int __gu_err; \
1784     __inttype(*(ptr)) __gu_val; \
1785     - __uaccess_begin(); \
1786     + __uaccess_begin_nospec(); \
1787     __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \
1788     __uaccess_end(); \
1789     (x) = (__force __typeof__(*(ptr)))__gu_val; \
1790     @@ -474,6 +479,10 @@ struct __large_struct { unsigned long buf[100]; };
1791     __uaccess_begin(); \
1792     barrier();
1793    
1794     +#define uaccess_try_nospec do { \
1795     + current->thread.uaccess_err = 0; \
1796     + __uaccess_begin_nospec(); \
1797     +
1798     #define uaccess_catch(err) \
1799     __uaccess_end(); \
1800     (err) |= (current->thread.uaccess_err ? -EFAULT : 0); \
1801     @@ -538,7 +547,7 @@ struct __large_struct { unsigned long buf[100]; };
1802     * get_user_ex(...);
1803     * } get_user_catch(err)
1804     */
1805     -#define get_user_try uaccess_try
1806     +#define get_user_try uaccess_try_nospec
1807     #define get_user_catch(err) uaccess_catch(err)
1808    
1809     #define get_user_ex(x, ptr) do { \
1810     @@ -573,7 +582,7 @@ extern void __cmpxchg_wrong_size(void)
1811     __typeof__(ptr) __uval = (uval); \
1812     __typeof__(*(ptr)) __old = (old); \
1813     __typeof__(*(ptr)) __new = (new); \
1814     - __uaccess_begin(); \
1815     + __uaccess_begin_nospec(); \
1816     switch (size) { \
1817     case 1: \
1818     { \
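
__uaccess_begin_nospec() pairs stac() with barrier_nospec() so the speculation fence always sits between enabling user access and the user-space load that follows. A user-space mirror of that ordering, with the kernel primitives replaced by logging stubs and memcpy() standing in for __get_user_size():

  #include <stdio.h>
  #include <string.h>

  /* User-space stand-ins for the kernel primitives named in the patch;
   * they only log, the real ones are privileged instructions/macros. */
  static void stac(void)           { puts("stac()           - user access allowed"); }
  static void clac(void)           { puts("clac()           - user access forbidden"); }
  static void barrier_nospec(void) { puts("barrier_nospec() - speculation fenced"); }

  /* The ordering __uaccess_begin_nospec() enforces: the fence sits
   * between "begin user access" and the actual user-space load. */
  static int get_user_sketch(int *dst, const int *user_src)
  {
      stac();
      barrier_nospec();
      memcpy(dst, user_src, sizeof(*dst));   /* stands in for __get_user_size() */
      clac();
      return 0;
  }

  int main(void)
  {
      int src = 42, dst = 0;

      get_user_sketch(&dst, &src);
      printf("copied: %d\n", dst);
      return 0;
  }
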
1819     diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
1820     index 7d3bdd1ed697..d6d245088dd5 100644
1821     --- a/arch/x86/include/asm/uaccess_32.h
1822     +++ b/arch/x86/include/asm/uaccess_32.h
1823     @@ -102,17 +102,17 @@ __copy_from_user(void *to, const void __user *from, unsigned long n)
1824    
1825     switch (n) {
1826     case 1:
1827     - __uaccess_begin();
1828     + __uaccess_begin_nospec();
1829     __get_user_size(*(u8 *)to, from, 1, ret, 1);
1830     __uaccess_end();
1831     return ret;
1832     case 2:
1833     - __uaccess_begin();
1834     + __uaccess_begin_nospec();
1835     __get_user_size(*(u16 *)to, from, 2, ret, 2);
1836     __uaccess_end();
1837     return ret;
1838     case 4:
1839     - __uaccess_begin();
1840     + __uaccess_begin_nospec();
1841     __get_user_size(*(u32 *)to, from, 4, ret, 4);
1842     __uaccess_end();
1843     return ret;
1844     @@ -130,17 +130,17 @@ static __always_inline unsigned long __copy_from_user_nocache(void *to,
1845    
1846     switch (n) {
1847     case 1:
1848     - __uaccess_begin();
1849     + __uaccess_begin_nospec();
1850     __get_user_size(*(u8 *)to, from, 1, ret, 1);
1851     __uaccess_end();
1852     return ret;
1853     case 2:
1854     - __uaccess_begin();
1855     + __uaccess_begin_nospec();
1856     __get_user_size(*(u16 *)to, from, 2, ret, 2);
1857     __uaccess_end();
1858     return ret;
1859     case 4:
1860     - __uaccess_begin();
1861     + __uaccess_begin_nospec();
1862     __get_user_size(*(u32 *)to, from, 4, ret, 4);
1863     __uaccess_end();
1864     return ret;
1865     diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
1866     index 673059a109fe..6e5cc08134ba 100644
1867     --- a/arch/x86/include/asm/uaccess_64.h
1868     +++ b/arch/x86/include/asm/uaccess_64.h
1869     @@ -59,31 +59,31 @@ int __copy_from_user_nocheck(void *dst, const void __user *src, unsigned size)
1870     return copy_user_generic(dst, (__force void *)src, size);
1871     switch (size) {
1872     case 1:
1873     - __uaccess_begin();
1874     + __uaccess_begin_nospec();
1875     __get_user_asm(*(u8 *)dst, (u8 __user *)src,
1876     ret, "b", "b", "=q", 1);
1877     __uaccess_end();
1878     return ret;
1879     case 2:
1880     - __uaccess_begin();
1881     + __uaccess_begin_nospec();
1882     __get_user_asm(*(u16 *)dst, (u16 __user *)src,
1883     ret, "w", "w", "=r", 2);
1884     __uaccess_end();
1885     return ret;
1886     case 4:
1887     - __uaccess_begin();
1888     + __uaccess_begin_nospec();
1889     __get_user_asm(*(u32 *)dst, (u32 __user *)src,
1890     ret, "l", "k", "=r", 4);
1891     __uaccess_end();
1892     return ret;
1893     case 8:
1894     - __uaccess_begin();
1895     + __uaccess_begin_nospec();
1896     __get_user_asm(*(u64 *)dst, (u64 __user *)src,
1897     ret, "q", "", "=r", 8);
1898     __uaccess_end();
1899     return ret;
1900     case 10:
1901     - __uaccess_begin();
1902     + __uaccess_begin_nospec();
1903     __get_user_asm(*(u64 *)dst, (u64 __user *)src,
1904     ret, "q", "", "=r", 10);
1905     if (likely(!ret))
1906     @@ -93,7 +93,7 @@ int __copy_from_user_nocheck(void *dst, const void __user *src, unsigned size)
1907     __uaccess_end();
1908     return ret;
1909     case 16:
1910     - __uaccess_begin();
1911     + __uaccess_begin_nospec();
1912     __get_user_asm(*(u64 *)dst, (u64 __user *)src,
1913     ret, "q", "", "=r", 16);
1914     if (likely(!ret))
1915     diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
1916     index 10d5a3d6affc..03b6e5c6cf23 100644
1917     --- a/arch/x86/kernel/alternative.c
1918     +++ b/arch/x86/kernel/alternative.c
1919     @@ -46,17 +46,6 @@ static int __init setup_noreplace_smp(char *str)
1920     }
1921     __setup("noreplace-smp", setup_noreplace_smp);
1922    
1923     -#ifdef CONFIG_PARAVIRT
1924     -static int __initdata_or_module noreplace_paravirt = 0;
1925     -
1926     -static int __init setup_noreplace_paravirt(char *str)
1927     -{
1928     - noreplace_paravirt = 1;
1929     - return 1;
1930     -}
1931     -__setup("noreplace-paravirt", setup_noreplace_paravirt);
1932     -#endif
1933     -
1934     #define DPRINTK(fmt, args...) \
1935     do { \
1936     if (debug_alternative) \
1937     @@ -588,9 +577,6 @@ void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
1938     struct paravirt_patch_site *p;
1939     char insnbuf[MAX_PATCH_LEN];
1940    
1941     - if (noreplace_paravirt)
1942     - return;
1943     -
1944     for (p = start; p < end; p++) {
1945     unsigned int used;
1946    
1947     diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
1948     index 8cacf62ec458..957ad443b786 100644
1949     --- a/arch/x86/kernel/cpu/bugs.c
1950     +++ b/arch/x86/kernel/cpu/bugs.c
1951     @@ -10,6 +10,7 @@
1952     #include <linux/init.h>
1953     #include <linux/utsname.h>
1954     #include <linux/cpu.h>
1955     +#include <linux/module.h>
1956    
1957     #include <asm/nospec-branch.h>
1958     #include <asm/cmdline.h>
1959     @@ -89,20 +90,41 @@ static const char *spectre_v2_strings[] = {
1960     };
1961    
1962     #undef pr_fmt
1963     -#define pr_fmt(fmt) "Spectre V2 mitigation: " fmt
1964     +#define pr_fmt(fmt) "Spectre V2 : " fmt
1965    
1966     static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE;
1967    
1968     +#ifdef RETPOLINE
1969     +static bool spectre_v2_bad_module;
1970     +
1971     +bool retpoline_module_ok(bool has_retpoline)
1972     +{
1973     + if (spectre_v2_enabled == SPECTRE_V2_NONE || has_retpoline)
1974     + return true;
1975     +
1976     + pr_err("System may be vulnerable to spectre v2\n");
1977     + spectre_v2_bad_module = true;
1978     + return false;
1979     +}
1980     +
1981     +static inline const char *spectre_v2_module_string(void)
1982     +{
1983     + return spectre_v2_bad_module ? " - vulnerable module loaded" : "";
1984     +}
1985     +#else
1986     +static inline const char *spectre_v2_module_string(void) { return ""; }
1987     +#endif
1988     +
1989     static void __init spec2_print_if_insecure(const char *reason)
1990     {
1991     if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
1992     - pr_info("%s\n", reason);
1993     + pr_info("%s selected on command line.\n", reason);
1994     }
1995    
1996     static void __init spec2_print_if_secure(const char *reason)
1997     {
1998     if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
1999     - pr_info("%s\n", reason);
2000     + pr_info("%s selected on command line.\n", reason);
2001     }
2002    
2003     static inline bool retp_compiler(void)
2004     @@ -117,42 +139,68 @@ static inline bool match_option(const char *arg, int arglen, const char *opt)
2005     return len == arglen && !strncmp(arg, opt, len);
2006     }
2007    
2008     +static const struct {
2009     + const char *option;
2010     + enum spectre_v2_mitigation_cmd cmd;
2011     + bool secure;
2012     +} mitigation_options[] = {
2013     + { "off", SPECTRE_V2_CMD_NONE, false },
2014     + { "on", SPECTRE_V2_CMD_FORCE, true },
2015     + { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false },
2016     + { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_AMD, false },
2017     + { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false },
2018     + { "auto", SPECTRE_V2_CMD_AUTO, false },
2019     +};
2020     +
2021     static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
2022     {
2023     char arg[20];
2024     - int ret;
2025     -
2026     - ret = cmdline_find_option(boot_command_line, "spectre_v2", arg,
2027     - sizeof(arg));
2028     - if (ret > 0) {
2029     - if (match_option(arg, ret, "off")) {
2030     - goto disable;
2031     - } else if (match_option(arg, ret, "on")) {
2032     - spec2_print_if_secure("force enabled on command line.");
2033     - return SPECTRE_V2_CMD_FORCE;
2034     - } else if (match_option(arg, ret, "retpoline")) {
2035     - spec2_print_if_insecure("retpoline selected on command line.");
2036     - return SPECTRE_V2_CMD_RETPOLINE;
2037     - } else if (match_option(arg, ret, "retpoline,amd")) {
2038     - if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
2039     - pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n");
2040     - return SPECTRE_V2_CMD_AUTO;
2041     - }
2042     - spec2_print_if_insecure("AMD retpoline selected on command line.");
2043     - return SPECTRE_V2_CMD_RETPOLINE_AMD;
2044     - } else if (match_option(arg, ret, "retpoline,generic")) {
2045     - spec2_print_if_insecure("generic retpoline selected on command line.");
2046     - return SPECTRE_V2_CMD_RETPOLINE_GENERIC;
2047     - } else if (match_option(arg, ret, "auto")) {
2048     + int ret, i;
2049     + enum spectre_v2_mitigation_cmd cmd = SPECTRE_V2_CMD_AUTO;
2050     +
2051     + if (cmdline_find_option_bool(boot_command_line, "nospectre_v2"))
2052     + return SPECTRE_V2_CMD_NONE;
2053     + else {
2054     + ret = cmdline_find_option(boot_command_line, "spectre_v2", arg,
2055     + sizeof(arg));
2056     + if (ret < 0)
2057     return SPECTRE_V2_CMD_AUTO;
2058     +
2059     + for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) {
2060     + if (!match_option(arg, ret, mitigation_options[i].option))
2061     + continue;
2062     + cmd = mitigation_options[i].cmd;
2063     + break;
2064     }
2065     +
2066     + if (i >= ARRAY_SIZE(mitigation_options)) {
2067     + pr_err("unknown option (%s). Switching to AUTO select\n",
2068     + mitigation_options[i].option);
2069     + return SPECTRE_V2_CMD_AUTO;
2070     + }
2071     + }
2072     +
2073     + if ((cmd == SPECTRE_V2_CMD_RETPOLINE ||
2074     + cmd == SPECTRE_V2_CMD_RETPOLINE_AMD ||
2075     + cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC) &&
2076     + !IS_ENABLED(CONFIG_RETPOLINE)) {
2077     + pr_err("%s selected but not compiled in. Switching to AUTO select\n",
2078     + mitigation_options[i].option);
2079     + return SPECTRE_V2_CMD_AUTO;
2080     }
2081    
2082     - if (!cmdline_find_option_bool(boot_command_line, "nospectre_v2"))
2083     + if (cmd == SPECTRE_V2_CMD_RETPOLINE_AMD &&
2084     + boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
2085     + pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n");
2086     return SPECTRE_V2_CMD_AUTO;
2087     -disable:
2088     - spec2_print_if_insecure("disabled on command line.");
2089     - return SPECTRE_V2_CMD_NONE;
2090     + }
2091     +
2092     + if (mitigation_options[i].secure)
2093     + spec2_print_if_secure(mitigation_options[i].option);
2094     + else
2095     + spec2_print_if_insecure(mitigation_options[i].option);
2096     +
2097     + return cmd;
2098     }
2099    
2100     /* Check for Skylake-like CPUs (for RSB handling) */
2101     @@ -190,10 +238,10 @@ static void __init spectre_v2_select_mitigation(void)
2102     return;
2103    
2104     case SPECTRE_V2_CMD_FORCE:
2105     - /* FALLTRHU */
2106     case SPECTRE_V2_CMD_AUTO:
2107     - goto retpoline_auto;
2108     -
2109     + if (IS_ENABLED(CONFIG_RETPOLINE))
2110     + goto retpoline_auto;
2111     + break;
2112     case SPECTRE_V2_CMD_RETPOLINE_AMD:
2113     if (IS_ENABLED(CONFIG_RETPOLINE))
2114     goto retpoline_amd;
2115     @@ -248,6 +296,12 @@ static void __init spectre_v2_select_mitigation(void)
2116     setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
2117     pr_info("Filling RSB on context switch\n");
2118     }
2119     +
2120     + /* Initialize Indirect Branch Prediction Barrier if supported */
2121     + if (boot_cpu_has(X86_FEATURE_IBPB)) {
2122     + setup_force_cpu_cap(X86_FEATURE_USE_IBPB);
2123     + pr_info("Enabling Indirect Branch Prediction Barrier\n");
2124     + }
2125     }
2126    
2127     #undef pr_fmt
2128     @@ -268,7 +322,7 @@ ssize_t cpu_show_spectre_v1(struct device *dev,
2129     {
2130     if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1))
2131     return sprintf(buf, "Not affected\n");
2132     - return sprintf(buf, "Vulnerable\n");
2133     + return sprintf(buf, "Mitigation: __user pointer sanitization\n");
2134     }
2135    
2136     ssize_t cpu_show_spectre_v2(struct device *dev,
2137     @@ -277,6 +331,8 @@ ssize_t cpu_show_spectre_v2(struct device *dev,
2138     if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
2139     return sprintf(buf, "Not affected\n");
2140    
2141     - return sprintf(buf, "%s\n", spectre_v2_strings[spectre_v2_enabled]);
2142     + return sprintf(buf, "%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
2143     + boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "",
2144     + spectre_v2_module_string());
2145     }
2146     #endif
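
The rewritten spectre_v2_parse_cmdline() replaces the if/else chain with a lookup over mitigation_options[]. A reduced, stand-alone mirror of that table-driven matching, with illustrative enum values and a plain strcmp() in place of the kernel's match_option():

  #include <stdio.h>
  #include <string.h>

  /* Illustrative values only; the real enum lives in the kernel. */
  enum cmd { CMD_NONE, CMD_FORCE, CMD_RETPOLINE, CMD_AUTO };

  static const struct {
      const char *option;
      enum cmd cmd;
  } options[] = {
      { "off",       CMD_NONE      },
      { "on",        CMD_FORCE     },
      { "retpoline", CMD_RETPOLINE },
      { "auto",      CMD_AUTO      },
  };

  static enum cmd parse_spectre_v2(const char *arg)
  {
      size_t i;

      for (i = 0; i < sizeof(options) / sizeof(options[0]); i++)
          if (!strcmp(arg, options[i].option))
              return options[i].cmd;

      /* Unknown option: fall back to AUTO, as the kernel does. */
      fprintf(stderr, "unknown option (%s), using auto\n", arg);
      return CMD_AUTO;
  }

  int main(void)
  {
      printf("spectre_v2=retpoline -> %d\n", parse_spectre_v2("retpoline"));
      printf("spectre_v2=bogus     -> %d\n", parse_spectre_v2("bogus"));
      return 0;
  }
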
2147     diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
2148     index d198ae02f2b7..08e89ed6aa87 100644
2149     --- a/arch/x86/kernel/cpu/common.c
2150     +++ b/arch/x86/kernel/cpu/common.c
2151     @@ -44,6 +44,8 @@
2152     #include <asm/pat.h>
2153     #include <asm/microcode.h>
2154     #include <asm/microcode_intel.h>
2155     +#include <asm/intel-family.h>
2156     +#include <asm/cpu_device_id.h>
2157    
2158     #ifdef CONFIG_X86_LOCAL_APIC
2159     #include <asm/uv/uv.h>
2160     @@ -716,6 +718,26 @@ static void apply_forced_caps(struct cpuinfo_x86 *c)
2161     }
2162     }
2163    
2164     +static void init_speculation_control(struct cpuinfo_x86 *c)
2165     +{
2166     + /*
2167     + * The Intel SPEC_CTRL CPUID bit implies IBRS and IBPB support,
2168     + * and they also have a different bit for STIBP support. Also,
2169     + * a hypervisor might have set the individual AMD bits even on
2170     + * Intel CPUs, for finer-grained selection of what's available.
2171     + *
2172     + * We use the AMD bits in 0x8000_0008 EBX as the generic hardware
2173     + * features, which are visible in /proc/cpuinfo and used by the
2174     + * kernel. So set those accordingly from the Intel bits.
2175     + */
2176     + if (cpu_has(c, X86_FEATURE_SPEC_CTRL)) {
2177     + set_cpu_cap(c, X86_FEATURE_IBRS);
2178     + set_cpu_cap(c, X86_FEATURE_IBPB);
2179     + }
2180     + if (cpu_has(c, X86_FEATURE_INTEL_STIBP))
2181     + set_cpu_cap(c, X86_FEATURE_STIBP);
2182     +}
2183     +
2184     void get_cpu_cap(struct cpuinfo_x86 *c)
2185     {
2186     u32 eax, ebx, ecx, edx;
2187     @@ -737,6 +759,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
2188     cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
2189     c->x86_capability[CPUID_7_0_EBX] = ebx;
2190     c->x86_capability[CPUID_7_ECX] = ecx;
2191     + c->x86_capability[CPUID_7_EDX] = edx;
2192     }
2193    
2194     /* Extended state features: level 0x0000000d */
2195     @@ -809,6 +832,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
2196     c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a);
2197    
2198     init_scattered_cpuid_features(c);
2199     + init_speculation_control(c);
2200     }
2201    
2202     static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
2203     @@ -837,6 +861,41 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
2204     #endif
2205     }
2206    
2207     +static const __initconst struct x86_cpu_id cpu_no_speculation[] = {
2208     + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CEDARVIEW, X86_FEATURE_ANY },
2209     + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CLOVERVIEW, X86_FEATURE_ANY },
2210     + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_LINCROFT, X86_FEATURE_ANY },
2211     + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PENWELL, X86_FEATURE_ANY },
2212     + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PINEVIEW, X86_FEATURE_ANY },
2213     + { X86_VENDOR_CENTAUR, 5 },
2214     + { X86_VENDOR_INTEL, 5 },
2215     + { X86_VENDOR_NSC, 5 },
2216     + { X86_VENDOR_ANY, 4 },
2217     + {}
2218     +};
2219     +
2220     +static const __initconst struct x86_cpu_id cpu_no_meltdown[] = {
2221     + { X86_VENDOR_AMD },
2222     + {}
2223     +};
2224     +
2225     +static bool __init cpu_vulnerable_to_meltdown(struct cpuinfo_x86 *c)
2226     +{
2227     + u64 ia32_cap = 0;
2228     +
2229     + if (x86_match_cpu(cpu_no_meltdown))
2230     + return false;
2231     +
2232     + if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES))
2233     + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
2234     +
2235     + /* Rogue Data Cache Load? No! */
2236     + if (ia32_cap & ARCH_CAP_RDCL_NO)
2237     + return false;
2238     +
2239     + return true;
2240     +}
2241     +
2242     /*
2243     * Do minimum CPU detection early.
2244     * Fields really needed: vendor, cpuid_level, family, model, mask,
2245     @@ -883,11 +942,12 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
2246    
2247     setup_force_cpu_cap(X86_FEATURE_ALWAYS);
2248    
2249     - if (c->x86_vendor != X86_VENDOR_AMD)
2250     - setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
2251     -
2252     - setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
2253     - setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
2254     + if (!x86_match_cpu(cpu_no_speculation)) {
2255     + if (cpu_vulnerable_to_meltdown(c))
2256     + setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
2257     + setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
2258     + setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
2259     + }
2260    
2261     fpu__init_system(c);
2262    
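
cpu_vulnerable_to_meltdown() above boils down to two early exits: CPUs on the no-meltdown list (AMD in this kernel) are never marked, and CPUs whose IA32_ARCH_CAPABILITIES MSR sets RDCL_NO are skipped as well; everything else gets X86_BUG_CPU_MELTDOWN. A stand-alone mirror of just that decision, with the x86_match_cpu() and rdmsrl() steps folded into the two parameters:

  #include <stdint.h>
  #include <stdio.h>

  #define ARCH_CAP_RDCL_NO (1 << 0)

  /* Mirror of the decision logic: vulnerable unless the CPU is on the
   * no-meltdown list or its ARCH_CAPABILITIES MSR advertises RDCL_NO. */
  static int vulnerable_to_meltdown(int on_no_meltdown_list,
                                    uint64_t arch_capabilities)
  {
      if (on_no_meltdown_list)
          return 0;
      if (arch_capabilities & ARCH_CAP_RDCL_NO)
          return 0;
      return 1;
  }

  int main(void)
  {
      printf("AMD                : %d\n", vulnerable_to_meltdown(1, 0));
      printf("Intel, RDCL_NO set : %d\n", vulnerable_to_meltdown(0, ARCH_CAP_RDCL_NO));
      printf("Intel, no ARCH_CAPS: %d\n", vulnerable_to_meltdown(0, 0));
      return 0;
  }
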
2263     diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
2264     index fcd484d2bb03..4097b43cba2d 100644
2265     --- a/arch/x86/kernel/cpu/intel.c
2266     +++ b/arch/x86/kernel/cpu/intel.c
2267     @@ -61,6 +61,59 @@ void check_mpx_erratum(struct cpuinfo_x86 *c)
2268     }
2269     }
2270    
2271     +/*
2272     + * Early microcode releases for the Spectre v2 mitigation were broken.
2273     + * Information taken from;
2274     + * - https://newsroom.intel.com/wp-content/uploads/sites/11/2018/01/microcode-update-guidance.pdf
2275     + * - https://kb.vmware.com/s/article/52345
2276     + * - Microcode revisions observed in the wild
2277     + * - Release note from 20180108 microcode release
2278     + */
2279     +struct sku_microcode {
2280     + u8 model;
2281     + u8 stepping;
2282     + u32 microcode;
2283     +};
2284     +static const struct sku_microcode spectre_bad_microcodes[] = {
2285     + { INTEL_FAM6_KABYLAKE_DESKTOP, 0x0B, 0x84 },
2286     + { INTEL_FAM6_KABYLAKE_DESKTOP, 0x0A, 0x84 },
2287     + { INTEL_FAM6_KABYLAKE_DESKTOP, 0x09, 0x84 },
2288     + { INTEL_FAM6_KABYLAKE_MOBILE, 0x0A, 0x84 },
2289     + { INTEL_FAM6_KABYLAKE_MOBILE, 0x09, 0x84 },
2290     + { INTEL_FAM6_SKYLAKE_X, 0x03, 0x0100013e },
2291     + { INTEL_FAM6_SKYLAKE_X, 0x04, 0x0200003c },
2292     + { INTEL_FAM6_SKYLAKE_MOBILE, 0x03, 0xc2 },
2293     + { INTEL_FAM6_SKYLAKE_DESKTOP, 0x03, 0xc2 },
2294     + { INTEL_FAM6_BROADWELL_CORE, 0x04, 0x28 },
2295     + { INTEL_FAM6_BROADWELL_GT3E, 0x01, 0x1b },
2296     + { INTEL_FAM6_BROADWELL_XEON_D, 0x02, 0x14 },
2297     + { INTEL_FAM6_BROADWELL_XEON_D, 0x03, 0x07000011 },
2298     + { INTEL_FAM6_BROADWELL_X, 0x01, 0x0b000025 },
2299     + { INTEL_FAM6_HASWELL_ULT, 0x01, 0x21 },
2300     + { INTEL_FAM6_HASWELL_GT3E, 0x01, 0x18 },
2301     + { INTEL_FAM6_HASWELL_CORE, 0x03, 0x23 },
2302     + { INTEL_FAM6_HASWELL_X, 0x02, 0x3b },
2303     + { INTEL_FAM6_HASWELL_X, 0x04, 0x10 },
2304     + { INTEL_FAM6_IVYBRIDGE_X, 0x04, 0x42a },
2305     + /* Updated in the 20180108 release; blacklist until we know otherwise */
2306     + { INTEL_FAM6_ATOM_GEMINI_LAKE, 0x01, 0x22 },
2307     + /* Observed in the wild */
2308     + { INTEL_FAM6_SANDYBRIDGE_X, 0x06, 0x61b },
2309     + { INTEL_FAM6_SANDYBRIDGE_X, 0x07, 0x712 },
2310     +};
2311     +
2312     +static bool bad_spectre_microcode(struct cpuinfo_x86 *c)
2313     +{
2314     + int i;
2315     +
2316     + for (i = 0; i < ARRAY_SIZE(spectre_bad_microcodes); i++) {
2317     + if (c->x86_model == spectre_bad_microcodes[i].model &&
2318     + c->x86_mask == spectre_bad_microcodes[i].stepping)
2319     + return (c->microcode <= spectre_bad_microcodes[i].microcode);
2320     + }
2321     + return false;
2322     +}
2323     +
2324     static void early_init_intel(struct cpuinfo_x86 *c)
2325     {
2326     u64 misc_enable;
2327     @@ -87,6 +140,19 @@ static void early_init_intel(struct cpuinfo_x86 *c)
2328     rdmsr(MSR_IA32_UCODE_REV, lower_word, c->microcode);
2329     }
2330    
2331     + /* Now if any of them are set, check the blacklist and clear the lot */
2332     + if ((cpu_has(c, X86_FEATURE_SPEC_CTRL) ||
2333     + cpu_has(c, X86_FEATURE_INTEL_STIBP) ||
2334     + cpu_has(c, X86_FEATURE_IBRS) || cpu_has(c, X86_FEATURE_IBPB) ||
2335     + cpu_has(c, X86_FEATURE_STIBP)) && bad_spectre_microcode(c)) {
2336     + pr_warn("Intel Spectre v2 broken microcode detected; disabling Speculation Control\n");
2337     + setup_clear_cpu_cap(X86_FEATURE_IBRS);
2338     + setup_clear_cpu_cap(X86_FEATURE_IBPB);
2339     + setup_clear_cpu_cap(X86_FEATURE_STIBP);
2340     + setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL);
2341     + setup_clear_cpu_cap(X86_FEATURE_INTEL_STIBP);
2342     + }
2343     +
2344     /*
2345     * Atom erratum AAE44/AAF40/AAG38/AAH41:
2346     *
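
bad_spectre_microcode() flags a CPU when its model and stepping match a blacklist entry and its microcode revision is at or below the last known-bad one. A stand-alone mirror of that lookup; the two table entries reuse values from the hunk above (Broadwell-X and Haswell-X) purely as examples:

  #include <stddef.h>
  #include <stdint.h>
  #include <stdio.h>

  struct sku_microcode {
      uint8_t  model;
      uint8_t  stepping;
      uint32_t microcode;
  };

  /* Example entries; the real list is the spectre_bad_microcodes[]
   * table in the patch above. */
  static const struct sku_microcode blacklist[] = {
      { 0x4F, 0x01, 0x0b000025 },   /* INTEL_FAM6_BROADWELL_X */
      { 0x3F, 0x02, 0x3b },         /* INTEL_FAM6_HASWELL_X */
  };

  static int bad_spectre_microcode(uint8_t model, uint8_t stepping, uint32_t rev)
  {
      size_t i;

      for (i = 0; i < sizeof(blacklist) / sizeof(blacklist[0]); i++)
          if (model == blacklist[i].model &&
              stepping == blacklist[i].stepping)
              return rev <= blacklist[i].microcode;
      return 0;
  }

  int main(void)
  {
      printf("Broadwell-X, old ucode: %d\n", bad_spectre_microcode(0x4F, 0x01, 0x0b000020));
      printf("Broadwell-X, new ucode: %d\n", bad_spectre_microcode(0x4F, 0x01, 0x0b000030));
      return 0;
  }
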
2347     diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
2348     index 5ce5155f0695..0afaf00b029b 100644
2349     --- a/arch/x86/kernel/cpu/microcode/core.c
2350     +++ b/arch/x86/kernel/cpu/microcode/core.c
2351     @@ -43,7 +43,7 @@
2352     #define MICROCODE_VERSION "2.01"
2353    
2354     static struct microcode_ops *microcode_ops;
2355     -static bool dis_ucode_ldr;
2356     +static bool dis_ucode_ldr = true;
2357    
2358     /*
2359     * Synchronization.
2360     @@ -73,6 +73,7 @@ struct cpu_info_ctx {
2361     static bool __init check_loader_disabled_bsp(void)
2362     {
2363     static const char *__dis_opt_str = "dis_ucode_ldr";
2364     + u32 a, b, c, d;
2365    
2366     #ifdef CONFIG_X86_32
2367     const char *cmdline = (const char *)__pa_nodebug(boot_command_line);
2368     @@ -85,8 +86,20 @@ static bool __init check_loader_disabled_bsp(void)
2369     bool *res = &dis_ucode_ldr;
2370     #endif
2371    
2372     - if (cmdline_find_option_bool(cmdline, option))
2373     - *res = true;
2374     + a = 1;
2375     + c = 0;
2376     + native_cpuid(&a, &b, &c, &d);
2377     +
2378     + /*
2379     + * CPUID(1).ECX[31]: reserved for hypervisor use. This is still not
2380     + * completely accurate as xen pv guests don't see that CPUID bit set but
2381     + * that's good enough as they don't land on the BSP path anyway.
2382     + */
2383     + if (c & BIT(31))
2384     + return *res;
2385     +
2386     + if (cmdline_find_option_bool(cmdline, option) <= 0)
2387     + *res = false;
2388    
2389     return *res;
2390     }
2391     @@ -114,9 +127,7 @@ void __init load_ucode_bsp(void)
2392     {
2393     int vendor;
2394     unsigned int family;
2395     -
2396     - if (check_loader_disabled_bsp())
2397     - return;
2398     + bool intel = true;
2399    
2400     if (!have_cpuid_p())
2401     return;
2402     @@ -126,16 +137,27 @@ void __init load_ucode_bsp(void)
2403    
2404     switch (vendor) {
2405     case X86_VENDOR_INTEL:
2406     - if (family >= 6)
2407     - load_ucode_intel_bsp();
2408     + if (family < 6)
2409     + return;
2410     break;
2411     +
2412     case X86_VENDOR_AMD:
2413     - if (family >= 0x10)
2414     - load_ucode_amd_bsp(family);
2415     + if (family < 0x10)
2416     + return;
2417     + intel = false;
2418     break;
2419     +
2420     default:
2421     - break;
2422     + return;
2423     }
2424     +
2425     + if (check_loader_disabled_bsp())
2426     + return;
2427     +
2428     + if (intel)
2429     + load_ucode_intel_bsp();
2430     + else
2431     + load_ucode_amd_bsp(family);
2432     }
2433    
2434     static bool check_loader_disabled_ap(void)
2435     @@ -154,9 +176,6 @@ void load_ucode_ap(void)
2436     if (check_loader_disabled_ap())
2437     return;
2438    
2439     - if (!have_cpuid_p())
2440     - return;
2441     -
2442     vendor = x86_cpuid_vendor();
2443     family = x86_cpuid_family();
2444    
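
The reworked check_loader_disabled_bsp() skips the early loader when CPUID(1).ECX bit 31 is set, the bit reserved for hypervisor use. The same probe can be made from user space on x86 with the GCC/Clang <cpuid.h> intrinsic; a small stand-alone version:

  #include <cpuid.h>
  #include <stdio.h>

  int main(void)
  {
      unsigned int eax, ebx, ecx, edx;

      if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
          puts("CPUID leaf 1 not available");
          return 1;
      }

      /* ECX bit 31 is reserved for hypervisor use. */
      if (ecx & (1u << 31))
          puts("running under a hypervisor: early microcode loading skipped");
      else
          puts("bare metal: early microcode loader may run");
      return 0;
  }
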
2445     diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
2446     index b0dd9aec183d..afbb52532791 100644
2447     --- a/arch/x86/kernel/cpu/scattered.c
2448     +++ b/arch/x86/kernel/cpu/scattered.c
2449     @@ -31,8 +31,6 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
2450     const struct cpuid_bit *cb;
2451    
2452     static const struct cpuid_bit cpuid_bits[] = {
2453     - { X86_FEATURE_AVX512_4VNNIW, CR_EDX, 2, 0x00000007, 0 },
2454     - { X86_FEATURE_AVX512_4FMAPS, CR_EDX, 3, 0x00000007, 0 },
2455     { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 },
2456     { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 },
2457     { X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 },
2458     diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
2459     index 0887d2ae3797..dffe81d3c261 100644
2460     --- a/arch/x86/kernel/process_64.c
2461     +++ b/arch/x86/kernel/process_64.c
2462     @@ -538,7 +538,7 @@ void set_personality_ia32(bool x32)
2463     current->personality &= ~READ_IMPLIES_EXEC;
2464     /* in_compat_syscall() uses the presence of the x32
2465     syscall bit flag to determine compat status */
2466     - current->thread.status &= ~TS_COMPAT;
2467     + current_thread_info()->status &= ~TS_COMPAT;
2468     } else {
2469     set_thread_flag(TIF_IA32);
2470     clear_thread_flag(TIF_X32);
2471     @@ -546,7 +546,7 @@ void set_personality_ia32(bool x32)
2472     current->mm->context.ia32_compat = TIF_IA32;
2473     current->personality |= force_personality32;
2474     /* Prepare the first "return" to user space */
2475     - current->thread.status |= TS_COMPAT;
2476     + current_thread_info()->status |= TS_COMPAT;
2477     }
2478     }
2479     EXPORT_SYMBOL_GPL(set_personality_ia32);
2480     diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
2481     index 0e63c0267f99..e497d374412a 100644
2482     --- a/arch/x86/kernel/ptrace.c
2483     +++ b/arch/x86/kernel/ptrace.c
2484     @@ -934,7 +934,7 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value)
2485     */
2486     regs->orig_ax = value;
2487     if (syscall_get_nr(child, regs) >= 0)
2488     - child->thread.status |= TS_I386_REGS_POKED;
2489     + child->thread_info.status |= TS_I386_REGS_POKED;
2490     break;
2491    
2492     case offsetof(struct user32, regs.eflags):
2493     diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
2494     index 763af1d0de64..b1a5d252d482 100644
2495     --- a/arch/x86/kernel/signal.c
2496     +++ b/arch/x86/kernel/signal.c
2497     @@ -785,7 +785,7 @@ static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs)
2498     * than the tracee.
2499     */
2500     #ifdef CONFIG_IA32_EMULATION
2501     - if (current->thread.status & (TS_COMPAT|TS_I386_REGS_POKED))
2502     + if (current_thread_info()->status & (TS_COMPAT|TS_I386_REGS_POKED))
2503     return __NR_ia32_restart_syscall;
2504     #endif
2505     #ifdef CONFIG_X86_X32_ABI
2506     diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
2507     index 8402907825b0..21454e254a4c 100644
2508     --- a/arch/x86/kernel/tboot.c
2509     +++ b/arch/x86/kernel/tboot.c
2510     @@ -134,6 +134,16 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
2511     return -1;
2512     set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot));
2513     pte_unmap(pte);
2514     +
2515     + /*
2516     + * PTI poisons low addresses in the kernel page tables in the
2517     + * name of making them unusable for userspace. To execute
2518     + * code at such a low address, the poison must be cleared.
2519     + *
2520     + * Note: 'pgd' actually gets set in pud_alloc().
2521     + */
2522     + pgd->pgd &= ~_PAGE_NX;
2523     +
2524     return 0;
2525     }
2526    
2527     diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
2528     index 91af75e37306..93f924de06cf 100644
2529     --- a/arch/x86/kvm/cpuid.c
2530     +++ b/arch/x86/kvm/cpuid.c
2531     @@ -355,6 +355,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2532     F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) |
2533     0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
2534    
2535     + /* cpuid 0x80000008.ebx */
2536     + const u32 kvm_cpuid_8000_0008_ebx_x86_features =
2537     + F(IBPB) | F(IBRS);
2538     +
2539     /* cpuid 0xC0000001.edx */
2540     const u32 kvm_cpuid_C000_0001_edx_x86_features =
2541     F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
2542     @@ -376,6 +380,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2543     /* cpuid 7.0.ecx*/
2544     const u32 kvm_cpuid_7_0_ecx_x86_features = F(PKU) | 0 /*OSPKE*/;
2545    
2546     + /* cpuid 7.0.edx*/
2547     + const u32 kvm_cpuid_7_0_edx_x86_features =
2548     + F(SPEC_CTRL) | F(ARCH_CAPABILITIES);
2549     +
2550     /* all calls to cpuid_count() should be made on the same cpu */
2551     get_cpu();
2552    
2553     @@ -458,12 +466,14 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2554     /* PKU is not yet implemented for shadow paging. */
2555     if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
2556     entry->ecx &= ~F(PKU);
2557     + entry->edx &= kvm_cpuid_7_0_edx_x86_features;
2558     + cpuid_mask(&entry->edx, CPUID_7_EDX);
2559     } else {
2560     entry->ebx = 0;
2561     entry->ecx = 0;
2562     + entry->edx = 0;
2563     }
2564     entry->eax = 0;
2565     - entry->edx = 0;
2566     break;
2567     }
2568     case 9:
2569     @@ -607,7 +617,14 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2570     if (!g_phys_as)
2571     g_phys_as = phys_as;
2572     entry->eax = g_phys_as | (virt_as << 8);
2573     - entry->ebx = entry->edx = 0;
2574     + entry->edx = 0;
2575     + /* IBRS and IBPB aren't necessarily present in hardware cpuid */
2576     + if (boot_cpu_has(X86_FEATURE_IBPB))
2577     + entry->ebx |= F(IBPB);
2578     + if (boot_cpu_has(X86_FEATURE_IBRS))
2579     + entry->ebx |= F(IBRS);
2580     + entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features;
2581     + cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX);
2582     break;
2583     }
2584     case 0x80000019:
2585     diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
2586     index 9368fecca3ee..d1beb7156704 100644
2587     --- a/arch/x86/kvm/cpuid.h
2588     +++ b/arch/x86/kvm/cpuid.h
2589     @@ -160,6 +160,37 @@ static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu)
2590     return best && (best->edx & bit(X86_FEATURE_RDTSCP));
2591     }
2592    
2593     +static inline bool guest_cpuid_has_ibpb(struct kvm_vcpu *vcpu)
2594     +{
2595     + struct kvm_cpuid_entry2 *best;
2596     +
2597     + best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
2598     + if (best && (best->ebx & bit(X86_FEATURE_IBPB)))
2599     + return true;
2600     + best = kvm_find_cpuid_entry(vcpu, 7, 0);
2601     + return best && (best->edx & bit(X86_FEATURE_SPEC_CTRL));
2602     +}
2603     +
2604     +static inline bool guest_cpuid_has_ibrs(struct kvm_vcpu *vcpu)
2605     +{
2606     + struct kvm_cpuid_entry2 *best;
2607     +
2608     + best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
2609     + if (best && (best->ebx & bit(X86_FEATURE_IBRS)))
2610     + return true;
2611     + best = kvm_find_cpuid_entry(vcpu, 7, 0);
2612     + return best && (best->edx & bit(X86_FEATURE_SPEC_CTRL));
2613     +}
2614     +
2615     +static inline bool guest_cpuid_has_arch_capabilities(struct kvm_vcpu *vcpu)
2616     +{
2617     + struct kvm_cpuid_entry2 *best;
2618     +
2619     + best = kvm_find_cpuid_entry(vcpu, 7, 0);
2620     + return best && (best->edx & bit(X86_FEATURE_ARCH_CAPABILITIES));
2621     +}
2622     +
2623     +
2624     /*
2625     * NRIPS is provided through cpuidfn 0x8000000a.edx bit 3
2626     */
2627     diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
2628     index 6f5a3b076341..c8d573822e60 100644
2629     --- a/arch/x86/kvm/emulate.c
2630     +++ b/arch/x86/kvm/emulate.c
2631     @@ -25,6 +25,7 @@
2632     #include <asm/kvm_emulate.h>
2633     #include <linux/stringify.h>
2634     #include <asm/debugreg.h>
2635     +#include <asm/nospec-branch.h>
2636    
2637     #include "x86.h"
2638     #include "tss.h"
2639     @@ -1012,8 +1013,8 @@ static __always_inline u8 test_cc(unsigned int condition, unsigned long flags)
2640     void (*fop)(void) = (void *)em_setcc + 4 * (condition & 0xf);
2641    
2642     flags = (flags & EFLAGS_MASK) | X86_EFLAGS_IF;
2643     - asm("push %[flags]; popf; call *%[fastop]"
2644     - : "=a"(rc) : [fastop]"r"(fop), [flags]"r"(flags));
2645     + asm("push %[flags]; popf; " CALL_NOSPEC
2646     + : "=a"(rc) : [thunk_target]"r"(fop), [flags]"r"(flags));
2647     return rc;
2648     }
2649    
2650     @@ -5306,15 +5307,14 @@ static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt,
2651    
2652     static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *))
2653     {
2654     - register void *__sp asm(_ASM_SP);
2655     ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF;
2656    
2657     if (!(ctxt->d & ByteOp))
2658     fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE;
2659    
2660     - asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n"
2661     + asm("push %[flags]; popf; " CALL_NOSPEC " ; pushf; pop %[flags]\n"
2662     : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags),
2663     - [fastop]"+S"(fop), "+r"(__sp)
2664     + [thunk_target]"+S"(fop), ASM_CALL_CONSTRAINT
2665     : "c"(ctxt->src2.val));
2666    
2667     ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK);
2668     diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
2669     index 24af898fb3a6..be644afab1bb 100644
2670     --- a/arch/x86/kvm/svm.c
2671     +++ b/arch/x86/kvm/svm.c
2672     @@ -183,6 +183,8 @@ struct vcpu_svm {
2673     u64 gs_base;
2674     } host;
2675    
2676     + u64 spec_ctrl;
2677     +
2678     u32 *msrpm;
2679    
2680     ulong nmi_iret_rip;
2681     @@ -248,6 +250,8 @@ static const struct svm_direct_access_msrs {
2682     { .index = MSR_CSTAR, .always = true },
2683     { .index = MSR_SYSCALL_MASK, .always = true },
2684     #endif
2685     + { .index = MSR_IA32_SPEC_CTRL, .always = false },
2686     + { .index = MSR_IA32_PRED_CMD, .always = false },
2687     { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
2688     { .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
2689     { .index = MSR_IA32_LASTINTFROMIP, .always = false },
2690     @@ -510,6 +514,7 @@ struct svm_cpu_data {
2691     struct kvm_ldttss_desc *tss_desc;
2692    
2693     struct page *save_area;
2694     + struct vmcb *current_vmcb;
2695     };
2696    
2697     static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
2698     @@ -861,6 +866,25 @@ static bool valid_msr_intercept(u32 index)
2699     return false;
2700     }
2701    
2702     +static bool msr_write_intercepted(struct kvm_vcpu *vcpu, unsigned msr)
2703     +{
2704     + u8 bit_write;
2705     + unsigned long tmp;
2706     + u32 offset;
2707     + u32 *msrpm;
2708     +
2709     + msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
2710     + to_svm(vcpu)->msrpm;
2711     +
2712     + offset = svm_msrpm_offset(msr);
2713     + bit_write = 2 * (msr & 0x0f) + 1;
2714     + tmp = msrpm[offset];
2715     +
2716     + BUG_ON(offset == MSR_INVALID);
2717     +
2718     + return !!test_bit(bit_write, &tmp);
2719     +}
2720     +
2721     static void set_msr_interception(u32 *msrpm, unsigned msr,
2722     int read, int write)
2723     {
2724     @@ -1535,6 +1559,8 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
2725     u32 dummy;
2726     u32 eax = 1;
2727    
2728     + svm->spec_ctrl = 0;
2729     +
2730     if (!init_event) {
2731     svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
2732     MSR_IA32_APICBASE_ENABLE;
2733     @@ -1644,11 +1670,17 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
2734     __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
2735     kvm_vcpu_uninit(vcpu);
2736     kmem_cache_free(kvm_vcpu_cache, svm);
2737     + /*
2738     + * The vmcb page can be recycled, causing a false negative in
2739     + * svm_vcpu_load(). So do a full IBPB now.
2740     + */
2741     + indirect_branch_prediction_barrier();
2742     }
2743    
2744     static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2745     {
2746     struct vcpu_svm *svm = to_svm(vcpu);
2747     + struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
2748     int i;
2749    
2750     if (unlikely(cpu != vcpu->cpu)) {
2751     @@ -1677,6 +1709,10 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2752     if (static_cpu_has(X86_FEATURE_RDTSCP))
2753     wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
2754    
2755     + if (sd->current_vmcb != svm->vmcb) {
2756     + sd->current_vmcb = svm->vmcb;
2757     + indirect_branch_prediction_barrier();
2758     + }
2759     avic_vcpu_load(vcpu, cpu);
2760     }
2761    
2762     @@ -3508,6 +3544,13 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2763     case MSR_VM_CR:
2764     msr_info->data = svm->nested.vm_cr_msr;
2765     break;
2766     + case MSR_IA32_SPEC_CTRL:
2767     + if (!msr_info->host_initiated &&
2768     + !guest_cpuid_has_ibrs(vcpu))
2769     + return 1;
2770     +
2771     + msr_info->data = svm->spec_ctrl;
2772     + break;
2773     case MSR_IA32_UCODE_REV:
2774     msr_info->data = 0x01000065;
2775     break;
2776     @@ -3599,6 +3642,49 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2777     case MSR_IA32_TSC:
2778     kvm_write_tsc(vcpu, msr);
2779     break;
2780     + case MSR_IA32_SPEC_CTRL:
2781     + if (!msr->host_initiated &&
2782     + !guest_cpuid_has_ibrs(vcpu))
2783     + return 1;
2784     +
2785     + /* The STIBP bit doesn't fault even if it's not advertised */
2786     + if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP))
2787     + return 1;
2788     +
2789     + svm->spec_ctrl = data;
2790     +
2791     + if (!data)
2792     + break;
2793     +
2794     + /*
2795     + * For non-nested:
2796     + * When it's written (to non-zero) for the first time, pass
2797     + * it through.
2798     + *
2799     + * For nested:
2800     + * The handling of the MSR bitmap for L2 guests is done in
2801     + * nested_svm_vmrun_msrpm.
2802     + * We update the L1 MSR bit as well since it will end up
2803     + * touching the MSR anyway now.
2804     + */
2805     + set_msr_interception(svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
2806     + break;
2807     + case MSR_IA32_PRED_CMD:
2808     + if (!msr->host_initiated &&
2809     + !guest_cpuid_has_ibpb(vcpu))
2810     + return 1;
2811     +
2812     + if (data & ~PRED_CMD_IBPB)
2813     + return 1;
2814     +
2815     + if (!data)
2816     + break;
2817     +
2818     + wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
2819     + if (is_guest_mode(vcpu))
2820     + break;
2821     + set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
2822     + break;
2823     case MSR_STAR:
2824     svm->vmcb->save.star = data;
2825     break;
2826     @@ -4826,6 +4912,15 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
2827    
2828     local_irq_enable();
2829    
2830     + /*
2831     + * If this vCPU has touched SPEC_CTRL, restore the guest's value if
2832     + * it's non-zero. Since vmentry is serialising on affected CPUs, there
2833     + * is no need to worry about the conditional branch over the wrmsr
2834     + * being speculatively taken.
2835     + */
2836     + if (svm->spec_ctrl)
2837     + wrmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);
2838     +
2839     asm volatile (
2840     "push %%" _ASM_BP "; \n\t"
2841     "mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t"
2842     @@ -4918,6 +5013,27 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
2843     #endif
2844     );
2845    
2846     + /*
2847     + * We do not use IBRS in the kernel. If this vCPU has used the
2848     + * SPEC_CTRL MSR it may have left it on; save the value and
2849     + * turn it off. This is much more efficient than blindly adding
2850     + * it to the atomic save/restore list. Especially as the former
2851     + * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
2852     + *
2853     + * For non-nested case:
2854     + * If the L01 MSR bitmap does not intercept the MSR, then we need to
2855     + * save it.
2856     + *
2857     + * For nested case:
2858     + * If the L02 MSR bitmap does not intercept the MSR, then we need to
2859     + * save it.
2860     + */
2861     + if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))
2862     + rdmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);
2863     +
2864     + if (svm->spec_ctrl)
2865     + wrmsrl(MSR_IA32_SPEC_CTRL, 0);
2866     +
2867     /* Eliminate branch target predictions from guest mode */
2868     vmexit_fill_RSB();
2869    
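
The SPEC_CTRL handling wrapped around VMRUN above follows one pattern: restore the guest value only when it is non-zero, read it back after exit only when the guest can write the MSR without an intercept, and then clear it before running host code. A stand-alone mirror of that control flow, with the MSR replaced by a plain variable and stubbed rdmsr/wrmsr helpers:

  #include <stdint.h>
  #include <stdio.h>

  static uint64_t host_spec_ctrl;   /* the host kernel leaves this at 0 */

  static void     wrmsr_stub(uint64_t *msr, uint64_t val) { *msr = val; }
  static uint64_t rdmsr_stub(uint64_t *msr)               { return *msr; }

  static void vcpu_run_sketch(uint64_t *guest_spec_ctrl, int write_intercepted)
  {
      /* Restore the guest value only if it has ever been set. */
      if (*guest_spec_ctrl)
          wrmsr_stub(&host_spec_ctrl, *guest_spec_ctrl);

      /* ... VMRUN: guest executes here ... */

      /* Only read the MSR back if the guest could have changed it. */
      if (!write_intercepted)
          *guest_spec_ctrl = rdmsr_stub(&host_spec_ctrl);

      /* The host does not use IBRS, so clear it before host code runs. */
      if (*guest_spec_ctrl)
          wrmsr_stub(&host_spec_ctrl, 0);
  }

  int main(void)
  {
      uint64_t guest = 1;   /* guest enabled IBRS */

      vcpu_run_sketch(&guest, 0);
      printf("guest SPEC_CTRL preserved: %llu, host restored to: %llu\n",
             (unsigned long long)guest, (unsigned long long)host_spec_ctrl);
      return 0;
  }
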
2870     diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
2871     index 178a344f55f8..d49da86e3099 100644
2872     --- a/arch/x86/kvm/vmx.c
2873     +++ b/arch/x86/kvm/vmx.c
2874     @@ -33,6 +33,7 @@
2875     #include <linux/slab.h>
2876     #include <linux/tboot.h>
2877     #include <linux/hrtimer.h>
2878     +#include <linux/nospec.h>
2879     #include "kvm_cache_regs.h"
2880     #include "x86.h"
2881    
2882     @@ -109,6 +110,14 @@ static u64 __read_mostly host_xss;
2883     static bool __read_mostly enable_pml = 1;
2884     module_param_named(pml, enable_pml, bool, S_IRUGO);
2885    
2886     +#define MSR_TYPE_R 1
2887     +#define MSR_TYPE_W 2
2888     +#define MSR_TYPE_RW 3
2889     +
2890     +#define MSR_BITMAP_MODE_X2APIC 1
2891     +#define MSR_BITMAP_MODE_X2APIC_APICV 2
2892     +#define MSR_BITMAP_MODE_LM 4
2893     +
2894     #define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL
2895    
2896     /* Guest_tsc -> host_tsc conversion requires 64-bit division. */
2897     @@ -173,7 +182,6 @@ module_param(ple_window_max, int, S_IRUGO);
2898     extern const ulong vmx_return;
2899    
2900     #define NR_AUTOLOAD_MSRS 8
2901     -#define VMCS02_POOL_SIZE 1
2902    
2903     struct vmcs {
2904     u32 revision_id;
2905     @@ -191,6 +199,7 @@ struct loaded_vmcs {
2906     struct vmcs *shadow_vmcs;
2907     int cpu;
2908     int launched;
2909     + unsigned long *msr_bitmap;
2910     struct list_head loaded_vmcss_on_cpu_link;
2911     };
2912    
2913     @@ -207,7 +216,7 @@ struct shared_msr_entry {
2914     * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
2915     * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
2916     * More than one of these structures may exist, if L1 runs multiple L2 guests.
2917     - * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the
2918     + * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the
2919     * underlying hardware which will be used to run L2.
2920     * This structure is packed to ensure that its layout is identical across
2921     * machines (necessary for live migration).
2922     @@ -386,13 +395,6 @@ struct __packed vmcs12 {
2923     */
2924     #define VMCS12_SIZE 0x1000
2925    
2926     -/* Used to remember the last vmcs02 used for some recently used vmcs12s */
2927     -struct vmcs02_list {
2928     - struct list_head list;
2929     - gpa_t vmptr;
2930     - struct loaded_vmcs vmcs02;
2931     -};
2932     -
2933     /*
2934     * The nested_vmx structure is part of vcpu_vmx, and holds information we need
2935     * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
2936     @@ -419,15 +421,15 @@ struct nested_vmx {
2937     */
2938     bool sync_shadow_vmcs;
2939    
2940     - /* vmcs02_list cache of VMCSs recently used to run L2 guests */
2941     - struct list_head vmcs02_pool;
2942     - int vmcs02_num;
2943     bool change_vmcs01_virtual_x2apic_mode;
2944     /* L2 must run next, and mustn't decide to exit to L1. */
2945     bool nested_run_pending;
2946     +
2947     + struct loaded_vmcs vmcs02;
2948     +
2949     /*
2950     - * Guest pages referred to in vmcs02 with host-physical pointers, so
2951     - * we must keep them pinned while L2 runs.
2952     + * Guest pages referred to in the vmcs02 with host-physical
2953     + * pointers, so we must keep them pinned while L2 runs.
2954     */
2955     struct page *apic_access_page;
2956     struct page *virtual_apic_page;
2957     @@ -436,8 +438,6 @@ struct nested_vmx {
2958     bool pi_pending;
2959     u16 posted_intr_nv;
2960    
2961     - unsigned long *msr_bitmap;
2962     -
2963     struct hrtimer preemption_timer;
2964     bool preemption_timer_expired;
2965    
2966     @@ -538,6 +538,7 @@ struct vcpu_vmx {
2967     unsigned long host_rsp;
2968     u8 fail;
2969     bool nmi_known_unmasked;
2970     + u8 msr_bitmap_mode;
2971     u32 exit_intr_info;
2972     u32 idt_vectoring_info;
2973     ulong rflags;
2974     @@ -549,6 +550,10 @@ struct vcpu_vmx {
2975     u64 msr_host_kernel_gs_base;
2976     u64 msr_guest_kernel_gs_base;
2977     #endif
2978     +
2979     + u64 arch_capabilities;
2980     + u64 spec_ctrl;
2981     +
2982     u32 vm_entry_controls_shadow;
2983     u32 vm_exit_controls_shadow;
2984     /*
2985     @@ -856,21 +861,18 @@ static const unsigned short vmcs_field_to_offset_table[] = {
2986    
2987     static inline short vmcs_field_to_offset(unsigned long field)
2988     {
2989     - BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX);
2990     + const size_t size = ARRAY_SIZE(vmcs_field_to_offset_table);
2991     + unsigned short offset;
2992    
2993     - if (field >= ARRAY_SIZE(vmcs_field_to_offset_table))
2994     + BUILD_BUG_ON(size > SHRT_MAX);
2995     + if (field >= size)
2996     return -ENOENT;
2997    
2998     - /*
2999     - * FIXME: Mitigation for CVE-2017-5753. To be replaced with a
3000     - * generic mechanism.
3001     - */
3002     - asm("lfence");
3003     -
3004     - if (vmcs_field_to_offset_table[field] == 0)
3005     + field = array_index_nospec(field, size);
3006     + offset = vmcs_field_to_offset_table[field];
3007     + if (offset == 0)
3008     return -ENOENT;
3009     -
3010     - return vmcs_field_to_offset_table[field];
3011     + return offset;
3012     }
3013    
3014     static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
3015     @@ -912,6 +914,9 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var);
3016     static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
3017     static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
3018     static int alloc_identity_pagetable(struct kvm *kvm);
3019     +static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
3020     +static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
3021     + u32 msr, int type);
3022    
3023     static DEFINE_PER_CPU(struct vmcs *, vmxarea);
3024     static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
3025     @@ -931,12 +936,6 @@ static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
3026    
3027     static unsigned long *vmx_io_bitmap_a;
3028     static unsigned long *vmx_io_bitmap_b;
3029     -static unsigned long *vmx_msr_bitmap_legacy;
3030     -static unsigned long *vmx_msr_bitmap_longmode;
3031     -static unsigned long *vmx_msr_bitmap_legacy_x2apic;
3032     -static unsigned long *vmx_msr_bitmap_longmode_x2apic;
3033     -static unsigned long *vmx_msr_bitmap_legacy_x2apic_apicv_inactive;
3034     -static unsigned long *vmx_msr_bitmap_longmode_x2apic_apicv_inactive;
3035     static unsigned long *vmx_vmread_bitmap;
3036     static unsigned long *vmx_vmwrite_bitmap;
3037    
3038     @@ -1853,6 +1852,52 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
3039     vmcs_write32(EXCEPTION_BITMAP, eb);
3040     }
3041    
3042     +/*
3043     + * Check if MSR is intercepted for currently loaded MSR bitmap.
3044     + */
3045     +static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
3046     +{
3047     + unsigned long *msr_bitmap;
3048     + int f = sizeof(unsigned long);
3049     +
3050     + if (!cpu_has_vmx_msr_bitmap())
3051     + return true;
3052     +
3053     + msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;
3054     +
3055     + if (msr <= 0x1fff) {
3056     + return !!test_bit(msr, msr_bitmap + 0x800 / f);
3057     + } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
3058     + msr &= 0x1fff;
3059     + return !!test_bit(msr, msr_bitmap + 0xc00 / f);
3060     + }
3061     +
3062     + return true;
3063     +}
3064     +
3065     +/*
3066     + * Check if MSR is intercepted for L01 MSR bitmap.
3067     + */
3068     +static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
3069     +{
3070     + unsigned long *msr_bitmap;
3071     + int f = sizeof(unsigned long);
3072     +
3073     + if (!cpu_has_vmx_msr_bitmap())
3074     + return true;
3075     +
3076     + msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
3077     +
3078     + if (msr <= 0x1fff) {
3079     + return !!test_bit(msr, msr_bitmap + 0x800 / f);
3080     + } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
3081     + msr &= 0x1fff;
3082     + return !!test_bit(msr, msr_bitmap + 0xc00 / f);
3083     + }
3084     +
3085     + return true;
3086     +}
3087     +
3088     static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
3089     unsigned long entry, unsigned long exit)
3090     {
3091     @@ -2262,6 +2307,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
3092     if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
3093     per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
3094     vmcs_load(vmx->loaded_vmcs->vmcs);
3095     + indirect_branch_prediction_barrier();
3096     }
3097    
3098     if (!already_loaded) {
3099     @@ -2530,36 +2576,6 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
3100     vmx->guest_msrs[from] = tmp;
3101     }
3102    
3103     -static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
3104     -{
3105     - unsigned long *msr_bitmap;
3106     -
3107     - if (is_guest_mode(vcpu))
3108     - msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap;
3109     - else if (cpu_has_secondary_exec_ctrls() &&
3110     - (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
3111     - SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
3112     - if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) {
3113     - if (is_long_mode(vcpu))
3114     - msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
3115     - else
3116     - msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
3117     - } else {
3118     - if (is_long_mode(vcpu))
3119     - msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv_inactive;
3120     - else
3121     - msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv_inactive;
3122     - }
3123     - } else {
3124     - if (is_long_mode(vcpu))
3125     - msr_bitmap = vmx_msr_bitmap_longmode;
3126     - else
3127     - msr_bitmap = vmx_msr_bitmap_legacy;
3128     - }
3129     -
3130     - vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
3131     -}
3132     -
3133     /*
3134     * Set up the vmcs to automatically save and restore system
3135     * msrs. Don't touch the 64-bit msrs if the guest is in legacy
3136     @@ -2600,7 +2616,7 @@ static void setup_msrs(struct vcpu_vmx *vmx)
3137     vmx->save_nmsrs = save_nmsrs;
3138    
3139     if (cpu_has_vmx_msr_bitmap())
3140     - vmx_set_msr_bitmap(&vmx->vcpu);
3141     + vmx_update_msr_bitmap(&vmx->vcpu);
3142     }
3143    
3144     /*
3145     @@ -2989,6 +3005,19 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
3146     case MSR_IA32_TSC:
3147     msr_info->data = guest_read_tsc(vcpu);
3148     break;
3149     + case MSR_IA32_SPEC_CTRL:
3150     + if (!msr_info->host_initiated &&
3151     + !guest_cpuid_has_ibrs(vcpu))
3152     + return 1;
3153     +
3154     + msr_info->data = to_vmx(vcpu)->spec_ctrl;
3155     + break;
3156     + case MSR_IA32_ARCH_CAPABILITIES:
3157     + if (!msr_info->host_initiated &&
3158     + !guest_cpuid_has_arch_capabilities(vcpu))
3159     + return 1;
3160     + msr_info->data = to_vmx(vcpu)->arch_capabilities;
3161     + break;
3162     case MSR_IA32_SYSENTER_CS:
3163     msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
3164     break;
3165     @@ -3093,6 +3122,68 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
3166     case MSR_IA32_TSC:
3167     kvm_write_tsc(vcpu, msr_info);
3168     break;
3169     + case MSR_IA32_SPEC_CTRL:
3170     + if (!msr_info->host_initiated &&
3171     + !guest_cpuid_has_ibrs(vcpu))
3172     + return 1;
3173     +
3174     + /* The STIBP bit doesn't fault even if it's not advertised */
3175     + if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP))
3176     + return 1;
3177     +
3178     + vmx->spec_ctrl = data;
3179     +
3180     + if (!data)
3181     + break;
3182     +
3183     + /*
3184     + * For non-nested:
3185     + * When it's written (to non-zero) for the first time, pass
3186     + * it through.
3187     + *
3188     + * For nested:
3189     + * The handling of the MSR bitmap for L2 guests is done in
3190     + * nested_vmx_merge_msr_bitmap. We should not touch the
3191     + * vmcs02.msr_bitmap here since it gets completely overwritten
3192     + * in the merging. We update the vmcs01 here for L1 as well
3193     + * since it will end up touching the MSR anyway now.
3194     + */
3195     + vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap,
3196     + MSR_IA32_SPEC_CTRL,
3197     + MSR_TYPE_RW);
3198     + break;
3199     + case MSR_IA32_PRED_CMD:
3200     + if (!msr_info->host_initiated &&
3201     + !guest_cpuid_has_ibpb(vcpu))
3202     + return 1;
3203     +
3204     + if (data & ~PRED_CMD_IBPB)
3205     + return 1;
3206     +
3207     + if (!data)
3208     + break;
3209     +
3210     + wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
3211     +
3212     + /*
3213     + * For non-nested:
3214     + * When it's written (to non-zero) for the first time, pass
3215     + * it through.
3216     + *
3217     + * For nested:
3218     + * The handling of the MSR bitmap for L2 guests is done in
3219     + * nested_vmx_merge_msr_bitmap. We should not touch the
3220     + * vmcs02.msr_bitmap here since it gets completely overwritten
3221     + * in the merging.
3222     + */
3223     + vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
3224     + MSR_TYPE_W);
3225     + break;
3226     + case MSR_IA32_ARCH_CAPABILITIES:
3227     + if (!msr_info->host_initiated)
3228     + return 1;
3229     + vmx->arch_capabilities = data;
3230     + break;
3231     case MSR_IA32_CR_PAT:
3232     if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
3233     if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
3234     @@ -3532,11 +3623,6 @@ static struct vmcs *alloc_vmcs_cpu(int cpu)
3235     return vmcs;
3236     }
3237    
3238     -static struct vmcs *alloc_vmcs(void)
3239     -{
3240     - return alloc_vmcs_cpu(raw_smp_processor_id());
3241     -}
3242     -
3243     static void free_vmcs(struct vmcs *vmcs)
3244     {
3245     free_pages((unsigned long)vmcs, vmcs_config.order);
3246     @@ -3552,9 +3638,38 @@ static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
3247     loaded_vmcs_clear(loaded_vmcs);
3248     free_vmcs(loaded_vmcs->vmcs);
3249     loaded_vmcs->vmcs = NULL;
3250     + if (loaded_vmcs->msr_bitmap)
3251     + free_page((unsigned long)loaded_vmcs->msr_bitmap);
3252     WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
3253     }
3254    
3255     +static struct vmcs *alloc_vmcs(void)
3256     +{
3257     + return alloc_vmcs_cpu(raw_smp_processor_id());
3258     +}
3259     +
3260     +static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
3261     +{
3262     + loaded_vmcs->vmcs = alloc_vmcs();
3263     + if (!loaded_vmcs->vmcs)
3264     + return -ENOMEM;
3265     +
3266     + loaded_vmcs->shadow_vmcs = NULL;
3267     + loaded_vmcs_init(loaded_vmcs);
3268     +
3269     + if (cpu_has_vmx_msr_bitmap()) {
3270     + loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
3271     + if (!loaded_vmcs->msr_bitmap)
3272     + goto out_vmcs;
3273     + memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
3274     + }
3275     + return 0;
3276     +
3277     +out_vmcs:
3278     + free_loaded_vmcs(loaded_vmcs);
3279     + return -ENOMEM;
3280     +}
3281     +
3282     static void free_kvm_area(void)
3283     {
3284     int cpu;
3285     @@ -4561,10 +4676,8 @@ static void free_vpid(int vpid)
3286     spin_unlock(&vmx_vpid_lock);
3287     }
3288    
3289     -#define MSR_TYPE_R 1
3290     -#define MSR_TYPE_W 2
3291     -static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
3292     - u32 msr, int type)
3293     +static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
3294     + u32 msr, int type)
3295     {
3296     int f = sizeof(unsigned long);
3297    
3298     @@ -4598,8 +4711,8 @@ static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
3299     }
3300     }
3301    
3302     -static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
3303     - u32 msr, int type)
3304     +static void __always_inline vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
3305     + u32 msr, int type)
3306     {
3307     int f = sizeof(unsigned long);
3308    
3309     @@ -4633,6 +4746,15 @@ static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
3310     }
3311     }
3312    
3313     +static void __always_inline vmx_set_intercept_for_msr(unsigned long *msr_bitmap,
3314     + u32 msr, int type, bool value)
3315     +{
3316     + if (value)
3317     + vmx_enable_intercept_for_msr(msr_bitmap, msr, type);
3318     + else
3319     + vmx_disable_intercept_for_msr(msr_bitmap, msr, type);
3320     +}
3321     +
3322     /*
3323     * If a msr is allowed by L0, we should check whether it is allowed by L1.
3324     * The corresponding bit will be cleared unless both of L0 and L1 allow it.
3325     @@ -4679,58 +4801,68 @@ static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
3326     }
3327     }
3328    
3329     -static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
3330     +static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
3331     {
3332     - if (!longmode_only)
3333     - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
3334     - msr, MSR_TYPE_R | MSR_TYPE_W);
3335     - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
3336     - msr, MSR_TYPE_R | MSR_TYPE_W);
3337     -}
3338     + u8 mode = 0;
3339    
3340     -static void vmx_enable_intercept_msr_read_x2apic(u32 msr, bool apicv_active)
3341     -{
3342     - if (apicv_active) {
3343     - __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
3344     - msr, MSR_TYPE_R);
3345     - __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
3346     - msr, MSR_TYPE_R);
3347     - } else {
3348     - __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
3349     - msr, MSR_TYPE_R);
3350     - __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
3351     - msr, MSR_TYPE_R);
3352     + if (cpu_has_secondary_exec_ctrls() &&
3353     + (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
3354     + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
3355     + mode |= MSR_BITMAP_MODE_X2APIC;
3356     + if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
3357     + mode |= MSR_BITMAP_MODE_X2APIC_APICV;
3358     }
3359     +
3360     + if (is_long_mode(vcpu))
3361     + mode |= MSR_BITMAP_MODE_LM;
3362     +
3363     + return mode;
3364     }
3365    
3366     -static void vmx_disable_intercept_msr_read_x2apic(u32 msr, bool apicv_active)
3367     +#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
3368     +
3369     +static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap,
3370     + u8 mode)
3371     {
3372     - if (apicv_active) {
3373     - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
3374     - msr, MSR_TYPE_R);
3375     - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
3376     - msr, MSR_TYPE_R);
3377     - } else {
3378     - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
3379     - msr, MSR_TYPE_R);
3380     - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
3381     - msr, MSR_TYPE_R);
3382     + int msr;
3383     +
3384     + for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
3385     + unsigned word = msr / BITS_PER_LONG;
3386     + msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
3387     + msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
3388     + }
3389     +
3390     + if (mode & MSR_BITMAP_MODE_X2APIC) {
3391     + /*
3392     + * TPR reads and writes can be virtualized even if virtual interrupt
3393     + * delivery is not in use.
3394     + */
3395     + vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW);
3396     + if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
3397     + vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R);
3398     + vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
3399     + vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
3400     + }
3401     }
3402     }
3403    
3404     -static void vmx_disable_intercept_msr_write_x2apic(u32 msr, bool apicv_active)
3405     +static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
3406     {
3407     - if (apicv_active) {
3408     - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
3409     - msr, MSR_TYPE_W);
3410     - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
3411     - msr, MSR_TYPE_W);
3412     - } else {
3413     - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
3414     - msr, MSR_TYPE_W);
3415     - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
3416     - msr, MSR_TYPE_W);
3417     - }
3418     + struct vcpu_vmx *vmx = to_vmx(vcpu);
3419     + unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
3420     + u8 mode = vmx_msr_bitmap_mode(vcpu);
3421     + u8 changed = mode ^ vmx->msr_bitmap_mode;
3422     +
3423     + if (!changed)
3424     + return;
3425     +
3426     + vmx_set_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW,
3427     + !(mode & MSR_BITMAP_MODE_LM));
3428     +
3429     + if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
3430     + vmx_update_msr_bitmap_x2apic(msr_bitmap, mode);
3431     +
3432     + vmx->msr_bitmap_mode = mode;
3433     }
3434    
3435     static bool vmx_get_enable_apicv(void)
3436     @@ -4738,30 +4870,45 @@ static bool vmx_get_enable_apicv(void)
3437     return enable_apicv;
3438     }
3439    
3440     -static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
3441     +static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
3442     +{
3443     + struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3444     + gfn_t gfn;
3445     +
3446     + /*
3447     + * Don't need to mark the APIC access page dirty; it is never
3448     + * written to by the CPU during APIC virtualization.
3449     + */
3450     +
3451     + if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
3452     + gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT;
3453     + kvm_vcpu_mark_page_dirty(vcpu, gfn);
3454     + }
3455     +
3456     + if (nested_cpu_has_posted_intr(vmcs12)) {
3457     + gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT;
3458     + kvm_vcpu_mark_page_dirty(vcpu, gfn);
3459     + }
3460     +}
3461     +
3462     +
3463     +static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
3464     {
3465     struct vcpu_vmx *vmx = to_vmx(vcpu);
3466     int max_irr;
3467     void *vapic_page;
3468     u16 status;
3469    
3470     - if (vmx->nested.pi_desc &&
3471     - vmx->nested.pi_pending) {
3472     - vmx->nested.pi_pending = false;
3473     - if (!pi_test_and_clear_on(vmx->nested.pi_desc))
3474     - return 0;
3475     -
3476     - max_irr = find_last_bit(
3477     - (unsigned long *)vmx->nested.pi_desc->pir, 256);
3478     + if (!vmx->nested.pi_desc || !vmx->nested.pi_pending)
3479     + return;
3480    
3481     - if (max_irr == 256)
3482     - return 0;
3483     + vmx->nested.pi_pending = false;
3484     + if (!pi_test_and_clear_on(vmx->nested.pi_desc))
3485     + return;
3486    
3487     + max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
3488     + if (max_irr != 256) {
3489     vapic_page = kmap(vmx->nested.virtual_apic_page);
3490     - if (!vapic_page) {
3491     - WARN_ON(1);
3492     - return -ENOMEM;
3493     - }
3494     __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page);
3495     kunmap(vmx->nested.virtual_apic_page);
3496    
3497     @@ -4772,7 +4919,8 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
3498     vmcs_write16(GUEST_INTR_STATUS, status);
3499     }
3500     }
3501     - return 0;
3502     +
3503     + nested_mark_vmcs12_pages_dirty(vcpu);
3504     }
3505    
3506     static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu)
3507     @@ -4959,7 +5107,7 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
3508     }
3509    
3510     if (cpu_has_vmx_msr_bitmap())
3511     - vmx_set_msr_bitmap(vcpu);
3512     + vmx_update_msr_bitmap(vcpu);
3513     }
3514    
3515     static u32 vmx_exec_control(struct vcpu_vmx *vmx)
3516     @@ -5048,7 +5196,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
3517     vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
3518     }
3519     if (cpu_has_vmx_msr_bitmap())
3520     - vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy));
3521     + vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
3522    
3523     vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
3524    
3525     @@ -5122,6 +5270,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
3526     ++vmx->nmsrs;
3527     }
3528    
3529     + if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
3530     + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, vmx->arch_capabilities);
3531    
3532     vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);
3533    
3534     @@ -5150,6 +5300,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
3535     u64 cr0;
3536    
3537     vmx->rmode.vm86_active = 0;
3538     + vmx->spec_ctrl = 0;
3539    
3540     vmx->soft_vnmi_blocked = 0;
3541    
3542     @@ -6379,7 +6530,7 @@ static void wakeup_handler(void)
3543    
3544     static __init int hardware_setup(void)
3545     {
3546     - int r = -ENOMEM, i, msr;
3547     + int r = -ENOMEM, i;
3548    
3549     rdmsrl_safe(MSR_EFER, &host_efer);
3550    
3551     @@ -6394,41 +6545,13 @@ static __init int hardware_setup(void)
3552     if (!vmx_io_bitmap_b)
3553     goto out;
3554    
3555     - vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL);
3556     - if (!vmx_msr_bitmap_legacy)
3557     - goto out1;
3558     -
3559     - vmx_msr_bitmap_legacy_x2apic =
3560     - (unsigned long *)__get_free_page(GFP_KERNEL);
3561     - if (!vmx_msr_bitmap_legacy_x2apic)
3562     - goto out2;
3563     -
3564     - vmx_msr_bitmap_legacy_x2apic_apicv_inactive =
3565     - (unsigned long *)__get_free_page(GFP_KERNEL);
3566     - if (!vmx_msr_bitmap_legacy_x2apic_apicv_inactive)
3567     - goto out3;
3568     -
3569     - vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
3570     - if (!vmx_msr_bitmap_longmode)
3571     - goto out4;
3572     -
3573     - vmx_msr_bitmap_longmode_x2apic =
3574     - (unsigned long *)__get_free_page(GFP_KERNEL);
3575     - if (!vmx_msr_bitmap_longmode_x2apic)
3576     - goto out5;
3577     -
3578     - vmx_msr_bitmap_longmode_x2apic_apicv_inactive =
3579     - (unsigned long *)__get_free_page(GFP_KERNEL);
3580     - if (!vmx_msr_bitmap_longmode_x2apic_apicv_inactive)
3581     - goto out6;
3582     -
3583     vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
3584     if (!vmx_vmread_bitmap)
3585     - goto out7;
3586     + goto out1;
3587    
3588     vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
3589     if (!vmx_vmwrite_bitmap)
3590     - goto out8;
3591     + goto out2;
3592    
3593     memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
3594     memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
3595     @@ -6437,12 +6560,9 @@ static __init int hardware_setup(void)
3596    
3597     memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
3598    
3599     - memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
3600     - memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
3601     -
3602     if (setup_vmcs_config(&vmcs_config) < 0) {
3603     r = -EIO;
3604     - goto out9;
3605     + goto out3;
3606     }
3607    
3608     if (boot_cpu_has(X86_FEATURE_NX))
3609     @@ -6499,47 +6619,8 @@ static __init int hardware_setup(void)
3610     kvm_tsc_scaling_ratio_frac_bits = 48;
3611     }
3612    
3613     - vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
3614     - vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
3615     - vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
3616     - vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
3617     - vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
3618     - vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
3619     -
3620     - memcpy(vmx_msr_bitmap_legacy_x2apic,
3621     - vmx_msr_bitmap_legacy, PAGE_SIZE);
3622     - memcpy(vmx_msr_bitmap_longmode_x2apic,
3623     - vmx_msr_bitmap_longmode, PAGE_SIZE);
3624     - memcpy(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
3625     - vmx_msr_bitmap_legacy, PAGE_SIZE);
3626     - memcpy(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
3627     - vmx_msr_bitmap_longmode, PAGE_SIZE);
3628     -
3629     set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
3630    
3631     - /*
3632     - * enable_apicv && kvm_vcpu_apicv_active()
3633     - */
3634     - for (msr = 0x800; msr <= 0x8ff; msr++)
3635     - vmx_disable_intercept_msr_read_x2apic(msr, true);
3636     -
3637     - /* TMCCT */
3638     - vmx_enable_intercept_msr_read_x2apic(0x839, true);
3639     - /* TPR */
3640     - vmx_disable_intercept_msr_write_x2apic(0x808, true);
3641     - /* EOI */
3642     - vmx_disable_intercept_msr_write_x2apic(0x80b, true);
3643     - /* SELF-IPI */
3644     - vmx_disable_intercept_msr_write_x2apic(0x83f, true);
3645     -
3646     - /*
3647     - * (enable_apicv && !kvm_vcpu_apicv_active()) ||
3648     - * !enable_apicv
3649     - */
3650     - /* TPR */
3651     - vmx_disable_intercept_msr_read_x2apic(0x808, false);
3652     - vmx_disable_intercept_msr_write_x2apic(0x808, false);
3653     -
3654     if (enable_ept) {
3655     kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
3656     (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
3657     @@ -6585,22 +6666,10 @@ static __init int hardware_setup(void)
3658    
3659     return alloc_kvm_area();
3660    
3661     -out9:
3662     - free_page((unsigned long)vmx_vmwrite_bitmap);
3663     -out8:
3664     - free_page((unsigned long)vmx_vmread_bitmap);
3665     -out7:
3666     - free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive);
3667     -out6:
3668     - free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
3669     -out5:
3670     - free_page((unsigned long)vmx_msr_bitmap_longmode);
3671     -out4:
3672     - free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive);
3673     out3:
3674     - free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
3675     + free_page((unsigned long)vmx_vmwrite_bitmap);
3676     out2:
3677     - free_page((unsigned long)vmx_msr_bitmap_legacy);
3678     + free_page((unsigned long)vmx_vmread_bitmap);
3679     out1:
3680     free_page((unsigned long)vmx_io_bitmap_b);
3681     out:
3682     @@ -6611,12 +6680,6 @@ static __init int hardware_setup(void)
3683    
3684     static __exit void hardware_unsetup(void)
3685     {
3686     - free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
3687     - free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive);
3688     - free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
3689     - free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive);
3690     - free_page((unsigned long)vmx_msr_bitmap_legacy);
3691     - free_page((unsigned long)vmx_msr_bitmap_longmode);
3692     free_page((unsigned long)vmx_io_bitmap_b);
3693     free_page((unsigned long)vmx_io_bitmap_a);
3694     free_page((unsigned long)vmx_vmwrite_bitmap);
3695     @@ -6663,94 +6726,6 @@ static int handle_monitor(struct kvm_vcpu *vcpu)
3696     return handle_nop(vcpu);
3697     }
3698    
3699     -/*
3700     - * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.
3701     - * We could reuse a single VMCS for all the L2 guests, but we also want the
3702     - * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this
3703     - * allows keeping them loaded on the processor, and in the future will allow
3704     - * optimizations where prepare_vmcs02 doesn't need to set all the fields on
3705     - * every entry if they never change.
3706     - * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE
3707     - * (>=0) with a vmcs02 for each recently loaded vmcs12s, most recent first.
3708     - *
3709     - * The following functions allocate and free a vmcs02 in this pool.
3710     - */
3711     -
3712     -/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */
3713     -static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
3714     -{
3715     - struct vmcs02_list *item;
3716     - list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
3717     - if (item->vmptr == vmx->nested.current_vmptr) {
3718     - list_move(&item->list, &vmx->nested.vmcs02_pool);
3719     - return &item->vmcs02;
3720     - }
3721     -
3722     - if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
3723     - /* Recycle the least recently used VMCS. */
3724     - item = list_last_entry(&vmx->nested.vmcs02_pool,
3725     - struct vmcs02_list, list);
3726     - item->vmptr = vmx->nested.current_vmptr;
3727     - list_move(&item->list, &vmx->nested.vmcs02_pool);
3728     - return &item->vmcs02;
3729     - }
3730     -
3731     - /* Create a new VMCS */
3732     - item = kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
3733     - if (!item)
3734     - return NULL;
3735     - item->vmcs02.vmcs = alloc_vmcs();
3736     - item->vmcs02.shadow_vmcs = NULL;
3737     - if (!item->vmcs02.vmcs) {
3738     - kfree(item);
3739     - return NULL;
3740     - }
3741     - loaded_vmcs_init(&item->vmcs02);
3742     - item->vmptr = vmx->nested.current_vmptr;
3743     - list_add(&(item->list), &(vmx->nested.vmcs02_pool));
3744     - vmx->nested.vmcs02_num++;
3745     - return &item->vmcs02;
3746     -}
3747     -
3748     -/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */
3749     -static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
3750     -{
3751     - struct vmcs02_list *item;
3752     - list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
3753     - if (item->vmptr == vmptr) {
3754     - free_loaded_vmcs(&item->vmcs02);
3755     - list_del(&item->list);
3756     - kfree(item);
3757     - vmx->nested.vmcs02_num--;
3758     - return;
3759     - }
3760     -}
3761     -
3762     -/*
3763     - * Free all VMCSs saved for this vcpu, except the one pointed by
3764     - * vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs
3765     - * must be &vmx->vmcs01.
3766     - */
3767     -static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
3768     -{
3769     - struct vmcs02_list *item, *n;
3770     -
3771     - WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01);
3772     - list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) {
3773     - /*
3774     - * Something will leak if the above WARN triggers. Better than
3775     - * a use-after-free.
3776     - */
3777     - if (vmx->loaded_vmcs == &item->vmcs02)
3778     - continue;
3779     -
3780     - free_loaded_vmcs(&item->vmcs02);
3781     - list_del(&item->list);
3782     - kfree(item);
3783     - vmx->nested.vmcs02_num--;
3784     - }
3785     -}
3786     -
3787     /*
3788     * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
3789     * set the success or error code of an emulated VMX instruction, as specified
3790     @@ -7025,6 +7000,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
3791     struct vmcs *shadow_vmcs;
3792     const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
3793     | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
3794     + int r;
3795    
3796     /* The Intel VMX Instruction Reference lists a bunch of bits that
3797     * are prerequisite to running VMXON, most notably cr4.VMXE must be
3798     @@ -7064,12 +7040,9 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
3799     return 1;
3800     }
3801    
3802     - if (cpu_has_vmx_msr_bitmap()) {
3803     - vmx->nested.msr_bitmap =
3804     - (unsigned long *)__get_free_page(GFP_KERNEL);
3805     - if (!vmx->nested.msr_bitmap)
3806     - goto out_msr_bitmap;
3807     - }
3808     + r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
3809     + if (r < 0)
3810     + goto out_vmcs02;
3811    
3812     vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
3813     if (!vmx->nested.cached_vmcs12)
3814     @@ -7086,9 +7059,6 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
3815     vmx->vmcs01.shadow_vmcs = shadow_vmcs;
3816     }
3817    
3818     - INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
3819     - vmx->nested.vmcs02_num = 0;
3820     -
3821     hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
3822     HRTIMER_MODE_REL_PINNED);
3823     vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
3824     @@ -7103,9 +7073,9 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
3825     kfree(vmx->nested.cached_vmcs12);
3826    
3827     out_cached_vmcs12:
3828     - free_page((unsigned long)vmx->nested.msr_bitmap);
3829     + free_loaded_vmcs(&vmx->nested.vmcs02);
3830    
3831     -out_msr_bitmap:
3832     +out_vmcs02:
3833     return -ENOMEM;
3834     }
3835    
3836     @@ -7181,17 +7151,13 @@ static void free_nested(struct vcpu_vmx *vmx)
3837     vmx->nested.vmxon = false;
3838     free_vpid(vmx->nested.vpid02);
3839     nested_release_vmcs12(vmx);
3840     - if (vmx->nested.msr_bitmap) {
3841     - free_page((unsigned long)vmx->nested.msr_bitmap);
3842     - vmx->nested.msr_bitmap = NULL;
3843     - }
3844     if (enable_shadow_vmcs) {
3845     vmcs_clear(vmx->vmcs01.shadow_vmcs);
3846     free_vmcs(vmx->vmcs01.shadow_vmcs);
3847     vmx->vmcs01.shadow_vmcs = NULL;
3848     }
3849     kfree(vmx->nested.cached_vmcs12);
3850     - /* Unpin physical memory we referred to in current vmcs02 */
3851     + /* Unpin physical memory we referred to in the vmcs02 */
3852     if (vmx->nested.apic_access_page) {
3853     nested_release_page(vmx->nested.apic_access_page);
3854     vmx->nested.apic_access_page = NULL;
3855     @@ -7207,7 +7173,7 @@ static void free_nested(struct vcpu_vmx *vmx)
3856     vmx->nested.pi_desc = NULL;
3857     }
3858    
3859     - nested_free_all_saved_vmcss(vmx);
3860     + free_loaded_vmcs(&vmx->nested.vmcs02);
3861     }
3862    
3863     /* Emulate the VMXOFF instruction */
3864     @@ -7241,8 +7207,6 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
3865     vmptr + offsetof(struct vmcs12, launch_state),
3866     &zero, sizeof(zero));
3867    
3868     - nested_free_vmcs02(vmx, vmptr);
3869     -
3870     skip_emulated_instruction(vcpu);
3871     nested_vmx_succeed(vcpu);
3872     return 1;
3873     @@ -8029,6 +7993,19 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
3874     vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
3875     KVM_ISA_VMX);
3876    
3877     + /*
3878     + * The host physical addresses of some pages of guest memory
3879     + * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
3880     + * Page). The CPU may write to these pages via their host
3881     + * physical address while L2 is running, bypassing any
3882     + * address-translation-based dirty tracking (e.g. EPT write
3883     + * protection).
3884     + *
3885     + * Mark them dirty on every exit from L2 to prevent them from
3886     + * getting out of sync with dirty tracking.
3887     + */
3888     + nested_mark_vmcs12_pages_dirty(vcpu);
3889     +
3890     if (vmx->nested.nested_run_pending)
3891     return false;
3892    
3893     @@ -8520,7 +8497,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
3894     }
3895     vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
3896    
3897     - vmx_set_msr_bitmap(vcpu);
3898     + vmx_update_msr_bitmap(vcpu);
3899     }
3900    
3901     static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
3902     @@ -8676,14 +8653,14 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
3903     #endif
3904     "pushf\n\t"
3905     __ASM_SIZE(push) " $%c[cs]\n\t"
3906     - "call *%[entry]\n\t"
3907     + CALL_NOSPEC
3908     :
3909     #ifdef CONFIG_X86_64
3910     [sp]"=&r"(tmp),
3911     #endif
3912     "+r"(__sp)
3913     :
3914     - [entry]"r"(entry),
3915     + THUNK_TARGET(entry),
3916     [ss]"i"(__KERNEL_DS),
3917     [cs]"i"(__KERNEL_CS)
3918     );
3919     @@ -8909,6 +8886,15 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
3920    
3921     vmx_arm_hv_timer(vcpu);
3922    
3923     + /*
3924     + * If this vCPU has touched SPEC_CTRL, restore the guest's value if
3925     + * it's non-zero. Since vmentry is serialising on affected CPUs, there
3926     + * is no need to worry about the conditional branch over the wrmsr
3927     + * being speculatively taken.
3928     + */
3929     + if (vmx->spec_ctrl)
3930     + wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
3931     +
3932     vmx->__launched = vmx->loaded_vmcs->launched;
3933     asm(
3934     /* Store host registers */
3935     @@ -9027,6 +9013,27 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
3936     #endif
3937     );
3938    
3939     + /*
3940     + * We do not use IBRS in the kernel. If this vCPU has used the
3941     + * SPEC_CTRL MSR it may have left it on; save the value and
3942     + * turn it off. This is much more efficient than blindly adding
3943     + * it to the atomic save/restore list. Especially as the former
3944     + * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
3945     + *
3946     + * For non-nested case:
3947     + * If the L01 MSR bitmap does not intercept the MSR, then we need to
3948     + * save it.
3949     + *
3950     + * For nested case:
3951     + * If the L02 MSR bitmap does not intercept the MSR, then we need to
3952     + * save it.
3953     + */
3954     + if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))
3955     + rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
3956     +
3957     + if (vmx->spec_ctrl)
3958     + wrmsrl(MSR_IA32_SPEC_CTRL, 0);
3959     +
3960     /* Eliminate branch target predictions from guest mode */
3961     vmexit_fill_RSB();
3962    
3963     @@ -9140,6 +9147,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
3964     {
3965     int err;
3966     struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
3967     + unsigned long *msr_bitmap;
3968     int cpu;
3969    
3970     if (!vmx)
3971     @@ -9172,17 +9180,24 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
3972     if (!vmx->guest_msrs)
3973     goto free_pml;
3974    
3975     - vmx->loaded_vmcs = &vmx->vmcs01;
3976     - vmx->loaded_vmcs->vmcs = alloc_vmcs();
3977     - vmx->loaded_vmcs->shadow_vmcs = NULL;
3978     - if (!vmx->loaded_vmcs->vmcs)
3979     - goto free_msrs;
3980     if (!vmm_exclusive)
3981     kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id())));
3982     - loaded_vmcs_init(vmx->loaded_vmcs);
3983     + err = alloc_loaded_vmcs(&vmx->vmcs01);
3984     if (!vmm_exclusive)
3985     kvm_cpu_vmxoff();
3986     + if (err < 0)
3987     + goto free_msrs;
3988    
3989     + msr_bitmap = vmx->vmcs01.msr_bitmap;
3990     + vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW);
3991     + vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW);
3992     + vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
3993     + vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
3994     + vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
3995     + vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
3996     + vmx->msr_bitmap_mode = 0;
3997     +
3998     + vmx->loaded_vmcs = &vmx->vmcs01;
3999     cpu = get_cpu();
4000     vmx_vcpu_load(&vmx->vcpu, cpu);
4001     vmx->vcpu.cpu = cpu;
4002     @@ -9576,21 +9591,31 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
4003     int msr;
4004     struct page *page;
4005     unsigned long *msr_bitmap_l1;
4006     - unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap;
4007     + unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
4008     + /*
4009     + * pred_cmd & spec_ctrl are trying to verify two things:
4010     + *
4011     + * 1. L0 gave a permission to L1 to actually passthrough the MSR. This
4012     + * ensures that we do not accidentally generate an L02 MSR bitmap
4013     + * from the L12 MSR bitmap that is too permissive.
4014     + * 2. That L1 or L2s have actually used the MSR. This avoids
4015     + * unnecessarily merging of the bitmap if the MSR is unused. This
4016     + * works properly because we only update the L01 MSR bitmap lazily.
4017     + * So even if L0 should pass L1 these MSRs, the L01 bitmap is only
4018     + * updated to reflect this when L1 (or its L2s) actually write to
4019     + * the MSR.
4020     + */
4021     + bool pred_cmd = msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD);
4022     + bool spec_ctrl = msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL);
4023    
4024     - /* This shortcut is ok because we support only x2APIC MSRs so far. */
4025     - if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
4026     + if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
4027     + !pred_cmd && !spec_ctrl)
4028     return false;
4029    
4030     page = nested_get_page(vcpu, vmcs12->msr_bitmap);
4031     if (!page)
4032     return false;
4033     msr_bitmap_l1 = (unsigned long *)kmap(page);
4034     - if (!msr_bitmap_l1) {
4035     - nested_release_page_clean(page);
4036     - WARN_ON(1);
4037     - return false;
4038     - }
4039    
4040     memset(msr_bitmap_l0, 0xff, PAGE_SIZE);
4041    
4042     @@ -9617,6 +9642,19 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
4043     MSR_TYPE_W);
4044     }
4045     }
4046     +
4047     + if (spec_ctrl)
4048     + nested_vmx_disable_intercept_for_msr(
4049     + msr_bitmap_l1, msr_bitmap_l0,
4050     + MSR_IA32_SPEC_CTRL,
4051     + MSR_TYPE_R | MSR_TYPE_W);
4052     +
4053     + if (pred_cmd)
4054     + nested_vmx_disable_intercept_for_msr(
4055     + msr_bitmap_l1, msr_bitmap_l0,
4056     + MSR_IA32_PRED_CMD,
4057     + MSR_TYPE_W);
4058     +
4059     kunmap(page);
4060     nested_release_page_clean(page);
4061    
4062     @@ -10096,6 +10134,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
4063     if (kvm_has_tsc_control)
4064     decache_tsc_multiplier(vmx);
4065    
4066     + if (cpu_has_vmx_msr_bitmap())
4067     + vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
4068     +
4069     if (enable_vpid) {
4070     /*
4071     * There is no direct mapping between vpid02 and vpid12, the
4072     @@ -10191,7 +10232,6 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
4073     struct vmcs12 *vmcs12;
4074     struct vcpu_vmx *vmx = to_vmx(vcpu);
4075     int cpu;
4076     - struct loaded_vmcs *vmcs02;
4077     bool ia32e;
4078     u32 msr_entry_idx;
4079    
4080     @@ -10331,17 +10371,13 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
4081     * the nested entry.
4082     */
4083    
4084     - vmcs02 = nested_get_current_vmcs02(vmx);
4085     - if (!vmcs02)
4086     - return -ENOMEM;
4087     -
4088     enter_guest_mode(vcpu);
4089    
4090     if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
4091     vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
4092    
4093     cpu = get_cpu();
4094     - vmx->loaded_vmcs = vmcs02;
4095     + vmx->loaded_vmcs = &vmx->nested.vmcs02;
4096     vmx_vcpu_put(vcpu);
4097     vmx_vcpu_load(vcpu, cpu);
4098     vcpu->cpu = cpu;
4099     @@ -10493,7 +10529,8 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
4100     return 0;
4101     }
4102    
4103     - return vmx_complete_nested_posted_interrupt(vcpu);
4104     + vmx_complete_nested_posted_interrupt(vcpu);
4105     + return 0;
4106     }
4107    
4108     static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
4109     @@ -10804,7 +10841,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
4110     vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
4111    
4112     if (cpu_has_vmx_msr_bitmap())
4113     - vmx_set_msr_bitmap(vcpu);
4114     + vmx_update_msr_bitmap(vcpu);
4115    
4116     if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
4117     vmcs12->vm_exit_msr_load_count))
4118     @@ -10855,10 +10892,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
4119     vm_exit_controls_reset_shadow(vmx);
4120     vmx_segment_cache_clear(vmx);
4121    
4122     - /* if no vmcs02 cache requested, remove the one we used */
4123     - if (VMCS02_POOL_SIZE == 0)
4124     - nested_free_vmcs02(vmx, vmx->nested.current_vmptr);
4125     -
4126     load_vmcs12_host_state(vcpu, vmcs12);
4127    
4128     /* Update any VMCS fields that might have changed while L2 ran */
4129     diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
4130     index e023ef981feb..75f756eac979 100644
4131     --- a/arch/x86/kvm/x86.c
4132     +++ b/arch/x86/kvm/x86.c
4133     @@ -975,6 +975,7 @@ static u32 msrs_to_save[] = {
4134     #endif
4135     MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
4136     MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
4137     + MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES
4138     };
4139    
4140     static unsigned num_msrs_to_save;
4141     diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
4142     index 6bf1898ddf49..4ad7c4dd311c 100644
4143     --- a/arch/x86/lib/Makefile
4144     +++ b/arch/x86/lib/Makefile
4145     @@ -26,6 +26,7 @@ lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
4146     lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
4147     lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
4148     lib-$(CONFIG_RETPOLINE) += retpoline.o
4149     +OBJECT_FILES_NON_STANDARD_retpoline.o :=y
4150    
4151     obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o
4152    
4153     diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S
4154     index 37b62d412148..b12b214713a6 100644
4155     --- a/arch/x86/lib/getuser.S
4156     +++ b/arch/x86/lib/getuser.S
4157     @@ -39,6 +39,8 @@ ENTRY(__get_user_1)
4158     mov PER_CPU_VAR(current_task), %_ASM_DX
4159     cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
4160     jae bad_get_user
4161     + sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
4162     + and %_ASM_DX, %_ASM_AX
4163     ASM_STAC
4164     1: movzbl (%_ASM_AX),%edx
4165     xor %eax,%eax
4166     @@ -53,6 +55,8 @@ ENTRY(__get_user_2)
4167     mov PER_CPU_VAR(current_task), %_ASM_DX
4168     cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
4169     jae bad_get_user
4170     + sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
4171     + and %_ASM_DX, %_ASM_AX
4172     ASM_STAC
4173     2: movzwl -1(%_ASM_AX),%edx
4174     xor %eax,%eax
4175     @@ -67,6 +71,8 @@ ENTRY(__get_user_4)
4176     mov PER_CPU_VAR(current_task), %_ASM_DX
4177     cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
4178     jae bad_get_user
4179     + sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
4180     + and %_ASM_DX, %_ASM_AX
4181     ASM_STAC
4182     3: movl -3(%_ASM_AX),%edx
4183     xor %eax,%eax
4184     @@ -82,6 +88,8 @@ ENTRY(__get_user_8)
4185     mov PER_CPU_VAR(current_task), %_ASM_DX
4186     cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
4187     jae bad_get_user
4188     + sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
4189     + and %_ASM_DX, %_ASM_AX
4190     ASM_STAC
4191     4: movq -7(%_ASM_AX),%rdx
4192     xor %eax,%eax
4193     @@ -93,6 +101,8 @@ ENTRY(__get_user_8)
4194     mov PER_CPU_VAR(current_task), %_ASM_DX
4195     cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX
4196     jae bad_get_user_8
4197     + sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */
4198     + and %_ASM_DX, %_ASM_AX
4199     ASM_STAC
4200     4: movl -7(%_ASM_AX),%edx
4201     5: movl -3(%_ASM_AX),%ecx
4202     diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
4203     index dfb2ba91b670..480edc3a5e03 100644
4204     --- a/arch/x86/lib/retpoline.S
4205     +++ b/arch/x86/lib/retpoline.S
4206     @@ -7,6 +7,7 @@
4207     #include <asm/alternative-asm.h>
4208     #include <asm/export.h>
4209     #include <asm/nospec-branch.h>
4210     +#include <asm/bitsperlong.h>
4211    
4212     .macro THUNK reg
4213     .section .text.__x86.indirect_thunk
4214     @@ -36,7 +37,6 @@ GENERATE_THUNK(_ASM_DX)
4215     GENERATE_THUNK(_ASM_SI)
4216     GENERATE_THUNK(_ASM_DI)
4217     GENERATE_THUNK(_ASM_BP)
4218     -GENERATE_THUNK(_ASM_SP)
4219     #ifdef CONFIG_64BIT
4220     GENERATE_THUNK(r8)
4221     GENERATE_THUNK(r9)
4222     @@ -47,3 +47,58 @@ GENERATE_THUNK(r13)
4223     GENERATE_THUNK(r14)
4224     GENERATE_THUNK(r15)
4225     #endif
4226     +
4227     +/*
4228     + * Fill the CPU return stack buffer.
4229     + *
4230     + * Each entry in the RSB, if used for a speculative 'ret', contains an
4231     + * infinite 'pause; lfence; jmp' loop to capture speculative execution.
4232     + *
4233     + * This is required in various cases for retpoline and IBRS-based
4234     + * mitigations for the Spectre variant 2 vulnerability. Sometimes to
4235     + * eliminate potentially bogus entries from the RSB, and sometimes
4236     + * purely to ensure that it doesn't get empty, which on some CPUs would
4237     + * allow predictions from other (unwanted!) sources to be used.
4238     + *
4239     + * Google experimented with loop-unrolling and this turned out to be
4240     + * the optimal version - two calls, each with their own speculation
4241     + * trap should their return address end up getting used, in a loop.
4242     + */
4243     +.macro STUFF_RSB nr:req sp:req
4244     + mov $(\nr / 2), %_ASM_BX
4245     + .align 16
4246     +771:
4247     + call 772f
4248     +773: /* speculation trap */
4249     + pause
4250     + lfence
4251     + jmp 773b
4252     + .align 16
4253     +772:
4254     + call 774f
4255     +775: /* speculation trap */
4256     + pause
4257     + lfence
4258     + jmp 775b
4259     + .align 16
4260     +774:
4261     + dec %_ASM_BX
4262     + jnz 771b
4263     + add $((BITS_PER_LONG/8) * \nr), \sp
4264     +.endm
4265     +
4266     +#define RSB_FILL_LOOPS 16 /* To avoid underflow */
4267     +
4268     +ENTRY(__fill_rsb)
4269     + STUFF_RSB RSB_FILL_LOOPS, %_ASM_SP
4270     + ret
4271     +END(__fill_rsb)
4272     +EXPORT_SYMBOL_GPL(__fill_rsb)
4273     +
4274     +#define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */
4275     +
4276     +ENTRY(__clear_rsb)
4277     + STUFF_RSB RSB_CLEAR_LOOPS, %_ASM_SP
4278     + ret
4279     +END(__clear_rsb)
4280     +EXPORT_SYMBOL_GPL(__clear_rsb)
4281     diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
4282     index 3bc7baf2a711..5c06dbffc52f 100644
4283     --- a/arch/x86/lib/usercopy_32.c
4284     +++ b/arch/x86/lib/usercopy_32.c
4285     @@ -570,12 +570,12 @@ do { \
4286     unsigned long __copy_to_user_ll(void __user *to, const void *from,
4287     unsigned long n)
4288     {
4289     - stac();
4290     + __uaccess_begin_nospec();
4291     if (movsl_is_ok(to, from, n))
4292     __copy_user(to, from, n);
4293     else
4294     n = __copy_user_intel(to, from, n);
4295     - clac();
4296     + __uaccess_end();
4297     return n;
4298     }
4299     EXPORT_SYMBOL(__copy_to_user_ll);
4300     @@ -627,7 +627,7 @@ EXPORT_SYMBOL(__copy_from_user_ll_nocache);
4301     unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from,
4302     unsigned long n)
4303     {
4304     - stac();
4305     + __uaccess_begin_nospec();
4306     #ifdef CONFIG_X86_INTEL_USERCOPY
4307     if (n > 64 && static_cpu_has(X86_FEATURE_XMM2))
4308     n = __copy_user_intel_nocache(to, from, n);
4309     @@ -636,7 +636,7 @@ unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *fr
4310     #else
4311     __copy_user(to, from, n);
4312     #endif
4313     - clac();
4314     + __uaccess_end();
4315     return n;
4316     }
4317     EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero);
4318     diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c
4319     index e3af318af2db..2a07341aca46 100644
4320     --- a/crypto/tcrypt.c
4321     +++ b/crypto/tcrypt.c
4322     @@ -223,11 +223,13 @@ static void sg_init_aead(struct scatterlist *sg, char *xbuf[XBUFSIZE],
4323     }
4324    
4325     sg_init_table(sg, np + 1);
4326     - np--;
4327     + if (rem)
4328     + np--;
4329     for (k = 0; k < np; k++)
4330     sg_set_buf(&sg[k + 1], xbuf[k], PAGE_SIZE);
4331    
4332     - sg_set_buf(&sg[k + 1], xbuf[k], rem);
4333     + if (rem)
4334     + sg_set_buf(&sg[k + 1], xbuf[k], rem);
4335     }
4336    
4337     static void test_aead_speed(const char *algo, int enc, unsigned int secs,
4338     diff --git a/drivers/auxdisplay/img-ascii-lcd.c b/drivers/auxdisplay/img-ascii-lcd.c
4339     index 83f1439e57fd..6e8eaa7fe7a6 100644
4340     --- a/drivers/auxdisplay/img-ascii-lcd.c
4341     +++ b/drivers/auxdisplay/img-ascii-lcd.c
4342     @@ -442,3 +442,7 @@ static struct platform_driver img_ascii_lcd_driver = {
4343     .remove = img_ascii_lcd_remove,
4344     };
4345     module_platform_driver(img_ascii_lcd_driver);
4346     +
4347     +MODULE_DESCRIPTION("Imagination Technologies ASCII LCD Display");
4348     +MODULE_AUTHOR("Paul Burton <paul.burton@mips.com>");
4349     +MODULE_LICENSE("GPL");
4350     diff --git a/drivers/gpu/drm/rcar-du/rcar_du_crtc.c b/drivers/gpu/drm/rcar-du/rcar_du_crtc.c
4351     index a2ec6d8796a0..3322b157106d 100644
4352     --- a/drivers/gpu/drm/rcar-du/rcar_du_crtc.c
4353     +++ b/drivers/gpu/drm/rcar-du/rcar_du_crtc.c
4354     @@ -392,6 +392,31 @@ static void rcar_du_crtc_start(struct rcar_du_crtc *rcrtc)
4355     rcrtc->started = true;
4356     }
4357    
4358     +static void rcar_du_crtc_disable_planes(struct rcar_du_crtc *rcrtc)
4359     +{
4360     + struct rcar_du_device *rcdu = rcrtc->group->dev;
4361     + struct drm_crtc *crtc = &rcrtc->crtc;
4362     + u32 status;
4363     + /* Make sure vblank interrupts are enabled. */
4364     + drm_crtc_vblank_get(crtc);
4365     + /*
4366     + * Disable planes and calculate how many vertical blanking interrupts we
4367     + * have to wait for. If a vertical blanking interrupt has been triggered
4368     + * but not processed yet, we don't know whether it occurred before or
4369     + * after the planes got disabled. We thus have to wait for two vblank
4370     + * interrupts in that case.
4371     + */
4372     + spin_lock_irq(&rcrtc->vblank_lock);
4373     + rcar_du_group_write(rcrtc->group, rcrtc->index % 2 ? DS2PR : DS1PR, 0);
4374     + status = rcar_du_crtc_read(rcrtc, DSSR);
4375     + rcrtc->vblank_count = status & DSSR_VBK ? 2 : 1;
4376     + spin_unlock_irq(&rcrtc->vblank_lock);
4377     + if (!wait_event_timeout(rcrtc->vblank_wait, rcrtc->vblank_count == 0,
4378     + msecs_to_jiffies(100)))
4379     + dev_warn(rcdu->dev, "vertical blanking timeout\n");
4380     + drm_crtc_vblank_put(crtc);
4381     +}
4382     +
4383     static void rcar_du_crtc_stop(struct rcar_du_crtc *rcrtc)
4384     {
4385     struct drm_crtc *crtc = &rcrtc->crtc;
4386     @@ -400,17 +425,16 @@ static void rcar_du_crtc_stop(struct rcar_du_crtc *rcrtc)
4387     return;
4388    
4389     /* Disable all planes and wait for the change to take effect. This is
4390     - * required as the DSnPR registers are updated on vblank, and no vblank
4391     - * will occur once the CRTC is stopped. Disabling planes when starting
4392     - * the CRTC thus wouldn't be enough as it would start scanning out
4393     - * immediately from old frame buffers until the next vblank.
4394     + * required as the plane enable registers are updated on vblank, and no
4395     + * vblank will occur once the CRTC is stopped. Disabling planes when
4396     + * starting the CRTC thus wouldn't be enough as it would start scanning
4397     + * out immediately from old frame buffers until the next vblank.
4398     *
4399     * This increases the CRTC stop delay, especially when multiple CRTCs
4400     * are stopped in one operation as we now wait for one vblank per CRTC.
4401     * Whether this can be improved needs to be researched.
4402     */
4403     - rcar_du_group_write(rcrtc->group, rcrtc->index % 2 ? DS2PR : DS1PR, 0);
4404     - drm_crtc_wait_one_vblank(crtc);
4405     + rcar_du_crtc_disable_planes(rcrtc);
4406    
4407     /* Disable vertical blanking interrupt reporting. We first need to wait
4408     * for page flip completion before stopping the CRTC as userspace
4409     @@ -548,10 +572,25 @@ static irqreturn_t rcar_du_crtc_irq(int irq, void *arg)
4410     irqreturn_t ret = IRQ_NONE;
4411     u32 status;
4412    
4413     + spin_lock(&rcrtc->vblank_lock);
4414     +
4415     status = rcar_du_crtc_read(rcrtc, DSSR);
4416     rcar_du_crtc_write(rcrtc, DSRCR, status & DSRCR_MASK);
4417    
4418     - if (status & DSSR_FRM) {
4419     + if (status & DSSR_VBK) {
4420     + /*
4421     + * Wake up the vblank wait if the counter reaches 0. This must
4422     + * be protected by the vblank_lock to avoid races in
4423     + * rcar_du_crtc_disable_planes().
4424     + */
4425     + if (rcrtc->vblank_count) {
4426     + if (--rcrtc->vblank_count == 0)
4427     + wake_up(&rcrtc->vblank_wait);
4428     + }
4429     + }
4430     + spin_unlock(&rcrtc->vblank_lock);
4431     +
4432     + if (status & DSSR_VBK) {
4433     drm_crtc_handle_vblank(&rcrtc->crtc);
4434     rcar_du_crtc_finish_page_flip(rcrtc);
4435     ret = IRQ_HANDLED;
4436     @@ -606,6 +645,8 @@ int rcar_du_crtc_create(struct rcar_du_group *rgrp, unsigned int index)
4437     }
4438    
4439     init_waitqueue_head(&rcrtc->flip_wait);
4440     + init_waitqueue_head(&rcrtc->vblank_wait);
4441     + spin_lock_init(&rcrtc->vblank_lock);
4442    
4443     rcrtc->group = rgrp;
4444     rcrtc->mmio_offset = mmio_offsets[index];
4445     diff --git a/drivers/gpu/drm/rcar-du/rcar_du_crtc.h b/drivers/gpu/drm/rcar-du/rcar_du_crtc.h
4446     index 6f08b7e7db06..48bef05b4c62 100644
4447     --- a/drivers/gpu/drm/rcar-du/rcar_du_crtc.h
4448     +++ b/drivers/gpu/drm/rcar-du/rcar_du_crtc.h
4449     @@ -15,6 +15,7 @@
4450     #define __RCAR_DU_CRTC_H__
4451    
4452     #include <linux/mutex.h>
4453     +#include <linux/spinlock.h>
4454     #include <linux/wait.h>
4455    
4456     #include <drm/drmP.h>
4457     @@ -33,6 +34,9 @@ struct rcar_du_vsp;
4458     * @started: whether the CRTC has been started and is running
4459     * @event: event to post when the pending page flip completes
4460     * @flip_wait: wait queue used to signal page flip completion
4461     + * @vblank_lock: protects vblank_wait and vblank_count
4462     + * @vblank_wait: wait queue used to signal vertical blanking
4463     + * @vblank_count: number of vertical blanking interrupts to wait for
4464     * @outputs: bitmask of the outputs (enum rcar_du_output) driven by this CRTC
4465     * @group: CRTC group this CRTC belongs to
4466     */
4467     @@ -48,6 +52,10 @@ struct rcar_du_crtc {
4468     struct drm_pending_vblank_event *event;
4469     wait_queue_head_t flip_wait;
4470    
4471     + spinlock_t vblank_lock;
4472     + wait_queue_head_t vblank_wait;
4473     + unsigned int vblank_count;
4474     +
4475     unsigned int outputs;
4476    
4477     struct rcar_du_group *group;
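
rcar_du_crtc_disable_planes() above decides, under vblank_lock, whether one or two vertical blanking interrupts must be counted down before the planes are known to be off, and the interrupt handler wakes the waiter once the counter reaches zero. Below is a rough userspace analogue of that handshake, with a mutex and condition variable standing in for the spinlock and wait queue; the thread layout, frame period and timeout are illustrative assumptions, not the driver's code.

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>
    #include <time.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t wake = PTHREAD_COND_INITIALIZER;
    static unsigned int vblank_count;

    /* Stand-in for the interrupt handler: decrement the counter under the
     * lock and wake the waiter when it reaches zero. */
    static void *vblank_irq(void *arg)
    {
        struct timespec frame = { .tv_sec = 0, .tv_nsec = 16 * 1000 * 1000 };

        (void)arg;
        for (;;) {
            nanosleep(&frame, NULL);        /* ~60 Hz frame period */
            pthread_mutex_lock(&lock);
            if (vblank_count && --vblank_count == 0)
                pthread_cond_signal(&wake);
            pthread_mutex_unlock(&lock);
        }
        return NULL;
    }

    /* Stand-in for rcar_du_crtc_disable_planes(): pick the number of
     * vblanks to wait for while holding the lock, then sleep with a
     * timeout. */
    static void disable_planes(bool vblank_pending)
    {
        struct timespec deadline;

        clock_gettime(CLOCK_REALTIME, &deadline);
        deadline.tv_sec += 1;               /* generous timeout */

        pthread_mutex_lock(&lock);
        /* If an interrupt is already pending we cannot tell whether it
         * predates the plane disable, so wait for two instead of one. */
        vblank_count = vblank_pending ? 2 : 1;
        while (vblank_count &&
               pthread_cond_timedwait(&wake, &lock, &deadline) == 0)
            ;
        if (vblank_count)
            fprintf(stderr, "vertical blanking timeout\n");
        pthread_mutex_unlock(&lock);
    }

    int main(void)
    {
        pthread_t irq;

        pthread_create(&irq, NULL, vblank_irq, NULL);
        disable_planes(false);
        disable_planes(true);
        printf("planes disabled\n");
        return 0;
    }
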
4478     diff --git a/drivers/media/platform/soc_camera/soc_scale_crop.c b/drivers/media/platform/soc_camera/soc_scale_crop.c
4479     index f77252d6ccd3..d29c24854c2c 100644
4480     --- a/drivers/media/platform/soc_camera/soc_scale_crop.c
4481     +++ b/drivers/media/platform/soc_camera/soc_scale_crop.c
4482     @@ -418,3 +418,7 @@ void soc_camera_calc_client_output(struct soc_camera_device *icd,
4483     mf->height = soc_camera_shift_scale(rect->height, shift, scale_v);
4484     }
4485     EXPORT_SYMBOL(soc_camera_calc_client_output);
4486     +
4487     +MODULE_DESCRIPTION("soc-camera scaling-cropping functions");
4488     +MODULE_AUTHOR("Guennadi Liakhovetski <kernel@pengutronix.de>");
4489     +MODULE_LICENSE("GPL");
4490     diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c
4491     index bdbcd2b088a0..c3c28f0960e5 100644
4492     --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c
4493     +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c
4494     @@ -3849,7 +3849,7 @@ static void qlcnic_83xx_flush_mbx_queue(struct qlcnic_adapter *adapter)
4495     struct list_head *head = &mbx->cmd_q;
4496     struct qlcnic_cmd_args *cmd = NULL;
4497    
4498     - spin_lock(&mbx->queue_lock);
4499     + spin_lock_bh(&mbx->queue_lock);
4500    
4501     while (!list_empty(head)) {
4502     cmd = list_entry(head->next, struct qlcnic_cmd_args, list);
4503     @@ -3860,7 +3860,7 @@ static void qlcnic_83xx_flush_mbx_queue(struct qlcnic_adapter *adapter)
4504     qlcnic_83xx_notify_cmd_completion(adapter, cmd);
4505     }
4506    
4507     - spin_unlock(&mbx->queue_lock);
4508     + spin_unlock_bh(&mbx->queue_lock);
4509     }
4510    
4511     static int qlcnic_83xx_check_mbx_status(struct qlcnic_adapter *adapter)
4512     @@ -3896,12 +3896,12 @@ static void qlcnic_83xx_dequeue_mbx_cmd(struct qlcnic_adapter *adapter,
4513     {
4514     struct qlcnic_mailbox *mbx = adapter->ahw->mailbox;
4515    
4516     - spin_lock(&mbx->queue_lock);
4517     + spin_lock_bh(&mbx->queue_lock);
4518    
4519     list_del(&cmd->list);
4520     mbx->num_cmds--;
4521    
4522     - spin_unlock(&mbx->queue_lock);
4523     + spin_unlock_bh(&mbx->queue_lock);
4524    
4525     qlcnic_83xx_notify_cmd_completion(adapter, cmd);
4526     }
4527     @@ -3966,7 +3966,7 @@ static int qlcnic_83xx_enqueue_mbx_cmd(struct qlcnic_adapter *adapter,
4528     init_completion(&cmd->completion);
4529     cmd->rsp_opcode = QLC_83XX_MBX_RESPONSE_UNKNOWN;
4530    
4531     - spin_lock(&mbx->queue_lock);
4532     + spin_lock_bh(&mbx->queue_lock);
4533    
4534     list_add_tail(&cmd->list, &mbx->cmd_q);
4535     mbx->num_cmds++;
4536     @@ -3974,7 +3974,7 @@ static int qlcnic_83xx_enqueue_mbx_cmd(struct qlcnic_adapter *adapter,
4537     *timeout = cmd->total_cmds * QLC_83XX_MBX_TIMEOUT;
4538     queue_work(mbx->work_q, &mbx->work);
4539    
4540     - spin_unlock(&mbx->queue_lock);
4541     + spin_unlock_bh(&mbx->queue_lock);
4542    
4543     return 0;
4544     }
4545     @@ -4070,15 +4070,15 @@ static void qlcnic_83xx_mailbox_worker(struct work_struct *work)
4546     mbx->rsp_status = QLC_83XX_MBX_RESPONSE_WAIT;
4547     spin_unlock_irqrestore(&mbx->aen_lock, flags);
4548    
4549     - spin_lock(&mbx->queue_lock);
4550     + spin_lock_bh(&mbx->queue_lock);
4551    
4552     if (list_empty(head)) {
4553     - spin_unlock(&mbx->queue_lock);
4554     + spin_unlock_bh(&mbx->queue_lock);
4555     return;
4556     }
4557     cmd = list_entry(head->next, struct qlcnic_cmd_args, list);
4558    
4559     - spin_unlock(&mbx->queue_lock);
4560     + spin_unlock_bh(&mbx->queue_lock);
4561    
4562     mbx_ops->encode_cmd(adapter, cmd);
4563     mbx_ops->nofity_fw(adapter, QLC_83XX_MBX_REQUEST);
4564     diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
4565     index 298b74ebc1e9..18e68c91e651 100644
4566     --- a/drivers/net/ethernet/realtek/r8169.c
4567     +++ b/drivers/net/ethernet/realtek/r8169.c
4568     @@ -1387,7 +1387,7 @@ DECLARE_RTL_COND(rtl_ocp_tx_cond)
4569     {
4570     void __iomem *ioaddr = tp->mmio_addr;
4571    
4572     - return RTL_R8(IBISR0) & 0x02;
4573     + return RTL_R8(IBISR0) & 0x20;
4574     }
4575    
4576     static void rtl8168ep_stop_cmac(struct rtl8169_private *tp)
4577     @@ -1395,7 +1395,7 @@ static void rtl8168ep_stop_cmac(struct rtl8169_private *tp)
4578     void __iomem *ioaddr = tp->mmio_addr;
4579    
4580     RTL_W8(IBCR2, RTL_R8(IBCR2) & ~0x01);
4581     - rtl_msleep_loop_wait_low(tp, &rtl_ocp_tx_cond, 50, 2000);
4582     + rtl_msleep_loop_wait_high(tp, &rtl_ocp_tx_cond, 50, 2000);
4583     RTL_W8(IBISR0, RTL_R8(IBISR0) | 0x20);
4584     RTL_W8(IBCR0, RTL_R8(IBCR0) & ~0x01);
4585     }
4586     diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c
4587     index db65d9ad4488..e1e5e8438457 100644
4588     --- a/drivers/net/usb/qmi_wwan.c
4589     +++ b/drivers/net/usb/qmi_wwan.c
4590     @@ -944,6 +944,7 @@ static const struct usb_device_id products[] = {
4591     {QMI_QUIRK_SET_DTR(0x2c7c, 0x0125, 4)}, /* Quectel EC25, EC20 R2.0 Mini PCIe */
4592     {QMI_QUIRK_SET_DTR(0x2c7c, 0x0121, 4)}, /* Quectel EC21 Mini PCIe */
4593     {QMI_FIXED_INTF(0x2c7c, 0x0296, 4)}, /* Quectel BG96 */
4594     + {QMI_QUIRK_SET_DTR(0x2c7c, 0x0306, 4)}, /* Quectel EP06 Mini PCIe */
4595    
4596     /* 4. Gobi 1000 devices */
4597     {QMI_GOBI1K_DEVICE(0x05c6, 0x9212)}, /* Acer Gobi Modem Device */
4598     diff --git a/drivers/net/wireless/broadcom/b43/main.c b/drivers/net/wireless/broadcom/b43/main.c
4599     index 6e5d9095b195..a635fc6b1722 100644
4600     --- a/drivers/net/wireless/broadcom/b43/main.c
4601     +++ b/drivers/net/wireless/broadcom/b43/main.c
4602     @@ -71,8 +71,18 @@ MODULE_FIRMWARE("b43/ucode11.fw");
4603     MODULE_FIRMWARE("b43/ucode13.fw");
4604     MODULE_FIRMWARE("b43/ucode14.fw");
4605     MODULE_FIRMWARE("b43/ucode15.fw");
4606     +MODULE_FIRMWARE("b43/ucode16_lp.fw");
4607     MODULE_FIRMWARE("b43/ucode16_mimo.fw");
4608     +MODULE_FIRMWARE("b43/ucode24_lcn.fw");
4609     +MODULE_FIRMWARE("b43/ucode25_lcn.fw");
4610     +MODULE_FIRMWARE("b43/ucode25_mimo.fw");
4611     +MODULE_FIRMWARE("b43/ucode26_mimo.fw");
4612     +MODULE_FIRMWARE("b43/ucode29_mimo.fw");
4613     +MODULE_FIRMWARE("b43/ucode33_lcn40.fw");
4614     +MODULE_FIRMWARE("b43/ucode30_mimo.fw");
4615     MODULE_FIRMWARE("b43/ucode5.fw");
4616     +MODULE_FIRMWARE("b43/ucode40.fw");
4617     +MODULE_FIRMWARE("b43/ucode42.fw");
4618     MODULE_FIRMWARE("b43/ucode9.fw");
4619    
4620     static int modparam_bad_frames_preempt;
4621     diff --git a/drivers/pinctrl/pxa/pinctrl-pxa2xx.c b/drivers/pinctrl/pxa/pinctrl-pxa2xx.c
4622     index 866aa3ce1ac9..6cf0006d4c8d 100644
4623     --- a/drivers/pinctrl/pxa/pinctrl-pxa2xx.c
4624     +++ b/drivers/pinctrl/pxa/pinctrl-pxa2xx.c
4625     @@ -436,3 +436,7 @@ int pxa2xx_pinctrl_exit(struct platform_device *pdev)
4626     return 0;
4627     }
4628     EXPORT_SYMBOL_GPL(pxa2xx_pinctrl_exit);
4629     +
4630     +MODULE_AUTHOR("Robert Jarzmik <robert.jarzmik@free.fr>");
4631     +MODULE_DESCRIPTION("Marvell PXA2xx pinctrl driver");
4632     +MODULE_LICENSE("GPL v2");
4633     diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
4634     index f2303f390345..23973a8124fc 100644
4635     --- a/drivers/tty/serial/serial_core.c
4636     +++ b/drivers/tty/serial/serial_core.c
4637     @@ -965,6 +965,8 @@ static int uart_set_info(struct tty_struct *tty, struct tty_port *port,
4638     }
4639     } else {
4640     retval = uart_startup(tty, state, 1);
4641     + if (retval == 0)
4642     + tty_port_set_initialized(port, true);
4643     if (retval > 0)
4644     retval = 0;
4645     }
4646     diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
4647     index 96a0661011fd..e5b7652234fc 100644
4648     --- a/drivers/vhost/net.c
4649     +++ b/drivers/vhost/net.c
4650     @@ -1078,6 +1078,7 @@ static long vhost_net_reset_owner(struct vhost_net *n)
4651     }
4652     vhost_net_stop(n, &tx_sock, &rx_sock);
4653     vhost_net_flush(n);
4654     + vhost_dev_stop(&n->dev);
4655     vhost_dev_reset_owner(&n->dev, umem);
4656     vhost_net_vq_reset(n);
4657     done:
4658     diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
4659     index 6e84b2cae6ad..442b54a14cbc 100644
4660     --- a/include/linux/fdtable.h
4661     +++ b/include/linux/fdtable.h
4662     @@ -9,6 +9,7 @@
4663     #include <linux/compiler.h>
4664     #include <linux/spinlock.h>
4665     #include <linux/rcupdate.h>
4666     +#include <linux/nospec.h>
4667     #include <linux/types.h>
4668     #include <linux/init.h>
4669     #include <linux/fs.h>
4670     @@ -81,8 +82,10 @@ static inline struct file *__fcheck_files(struct files_struct *files, unsigned i
4671     {
4672     struct fdtable *fdt = rcu_dereference_raw(files->fdt);
4673    
4674     - if (fd < fdt->max_fds)
4675     + if (fd < fdt->max_fds) {
4676     + fd = array_index_nospec(fd, fdt->max_fds);
4677     return rcu_dereference_raw(fdt->fd[fd]);
4678     + }
4679     return NULL;
4680     }
4681    
4682     diff --git a/include/linux/init.h b/include/linux/init.h
4683     index e30104ceb86d..8e346d1bd837 100644
4684     --- a/include/linux/init.h
4685     +++ b/include/linux/init.h
4686     @@ -4,6 +4,13 @@
4687     #include <linux/compiler.h>
4688     #include <linux/types.h>
4689    
4690     +/* Built-in __init functions needn't be compiled with retpoline */
4691     +#if defined(RETPOLINE) && !defined(MODULE)
4692     +#define __noretpoline __attribute__((indirect_branch("keep")))
4693     +#else
4694     +#define __noretpoline
4695     +#endif
4696     +
4697     /* These macros are used to mark some functions or
4698     * initialized data (doesn't apply to uninitialized data)
4699     * as `initialization' functions. The kernel can take this
4700     @@ -39,7 +46,7 @@
4701    
4702     /* These are for everybody (although not all archs will actually
4703     discard it in modules) */
4704     -#define __init __section(.init.text) __cold notrace __latent_entropy
4705     +#define __init __section(.init.text) __cold notrace __latent_entropy __noretpoline
4706     #define __initdata __section(.init.data)
4707     #define __initconst __section(.init.rodata)
4708     #define __exitdata __section(.exit.data)
4709     diff --git a/include/linux/module.h b/include/linux/module.h
4710     index 0c3207d26ac0..d2224a09b4b5 100644
4711     --- a/include/linux/module.h
4712     +++ b/include/linux/module.h
4713     @@ -791,6 +791,15 @@ static inline void module_bug_finalize(const Elf_Ehdr *hdr,
4714     static inline void module_bug_cleanup(struct module *mod) {}
4715     #endif /* CONFIG_GENERIC_BUG */
4716    
4717     +#ifdef RETPOLINE
4718     +extern bool retpoline_module_ok(bool has_retpoline);
4719     +#else
4720     +static inline bool retpoline_module_ok(bool has_retpoline)
4721     +{
4722     + return true;
4723     +}
4724     +#endif
4725     +
4726     #ifdef CONFIG_MODULE_SIG
4727     static inline bool module_sig_ok(struct module *module)
4728     {
4729     diff --git a/include/linux/nospec.h b/include/linux/nospec.h
4730     new file mode 100644
4731     index 000000000000..b99bced39ac2
4732     --- /dev/null
4733     +++ b/include/linux/nospec.h
4734     @@ -0,0 +1,72 @@
4735     +// SPDX-License-Identifier: GPL-2.0
4736     +// Copyright(c) 2018 Linus Torvalds. All rights reserved.
4737     +// Copyright(c) 2018 Alexei Starovoitov. All rights reserved.
4738     +// Copyright(c) 2018 Intel Corporation. All rights reserved.
4739     +
4740     +#ifndef _LINUX_NOSPEC_H
4741     +#define _LINUX_NOSPEC_H
4742     +
4743     +/**
4744     + * array_index_mask_nospec() - generate a ~0 mask when index < size, 0 otherwise
4745     + * @index: array element index
4746     + * @size: number of elements in array
4747     + *
4748     + * When @index is out of bounds (@index >= @size), the sign bit will be
4749     + * set. Extend the sign bit to all bits and invert, giving a result of
4750     + * zero for an out of bounds index, or ~0 if within bounds [0, @size).
4751     + */
4752     +#ifndef array_index_mask_nospec
4753     +static inline unsigned long array_index_mask_nospec(unsigned long index,
4754     + unsigned long size)
4755     +{
4756     + /*
4757     + * Warn developers about inappropriate array_index_nospec() usage.
4758     + *
4759     + * Even if the CPU speculates past the WARN_ONCE branch, the
4760     + * sign bit of @index is taken into account when generating the
4761     + * mask.
4762     + *
4763     + * This warning is compiled out when the compiler can infer that
4764     + * @index and @size are less than LONG_MAX.
4765     + */
4766     + if (WARN_ONCE(index > LONG_MAX || size > LONG_MAX,
4767     + "array_index_nospec() limited to range of [0, LONG_MAX]\n"))
4768     + return 0;
4769     +
4770     + /*
4771     + * Always calculate and emit the mask even if the compiler
4772     + * thinks the mask is not needed. The compiler does not take
4773     + * into account the value of @index under speculation.
4774     + */
4775     + OPTIMIZER_HIDE_VAR(index);
4776     + return ~(long)(index | (size - 1UL - index)) >> (BITS_PER_LONG - 1);
4777     +}
4778     +#endif
4779     +
4780     +/*
4781     + * array_index_nospec - sanitize an array index after a bounds check
4782     + *
4783     + * For a code sequence like:
4784     + *
4785     + * if (index < size) {
4786     + * index = array_index_nospec(index, size);
4787     + * val = array[index];
4788     + * }
4789     + *
4790     + * ...if the CPU speculates past the bounds check then
4791     + * array_index_nospec() will clamp the index within the range of [0,
4792     + * size).
4793     + */
4794     +#define array_index_nospec(index, size) \
4795     +({ \
4796     + typeof(index) _i = (index); \
4797     + typeof(size) _s = (size); \
4798     + unsigned long _mask = array_index_mask_nospec(_i, _s); \
4799     + \
4800     + BUILD_BUG_ON(sizeof(_i) > sizeof(long)); \
4801     + BUILD_BUG_ON(sizeof(_s) > sizeof(long)); \
4802     + \
4803     + _i &= _mask; \
4804     + _i; \
4805     +})
4806     +#endif /* _LINUX_NOSPEC_H */
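
The mask expression in array_index_mask_nospec() can be lifted into a small userspace program to see what it evaluates to. The helper below copies the formula from the header; the test values are arbitrary, and the signed cast and arithmetic right shift of a negative long match what the kernel assumes of its supported compilers.

    #include <stdio.h>
    #include <limits.h>

    #define BITS_PER_LONG (sizeof(long) * CHAR_BIT)

    /* Same expression as the generic array_index_mask_nospec(): ~0UL when
     * index < size, 0UL when index >= size (both limited to LONG_MAX).
     * The cast and shift rely on two's complement behaviour, as the
     * kernel does. */
    static unsigned long index_mask(unsigned long index, unsigned long size)
    {
        return ~(long)(index | (size - 1UL - index)) >> (BITS_PER_LONG - 1);
    }

    int main(void)
    {
        unsigned long size = 16;
        unsigned long idx;

        for (idx = 0; idx <= 20; idx += 5) {
            unsigned long mask = index_mask(idx, size);
            unsigned long clamped = idx & mask;

            printf("index=%2lu size=%lu mask=%s clamped=%lu\n",
                   idx, size, mask ? "~0" : " 0", clamped);
        }
        return 0;
    }

With the bounds check in front, idx & index_mask(idx, size) is what the array_index_nospec() macro expands to, minus the BUILD_BUG_ON() type checks and the OPTIMIZER_HIDE_VAR() barrier; the __fcheck_files() and parse_txq_params() hunks in this patch use exactly that pattern.
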
4807     diff --git a/kernel/module.c b/kernel/module.c
4808     index 0e54d5bf0097..07bfb9971f2f 100644
4809     --- a/kernel/module.c
4810     +++ b/kernel/module.c
4811     @@ -2817,6 +2817,15 @@ static int check_modinfo_livepatch(struct module *mod, struct load_info *info)
4812     }
4813     #endif /* CONFIG_LIVEPATCH */
4814    
4815     +static void check_modinfo_retpoline(struct module *mod, struct load_info *info)
4816     +{
4817     + if (retpoline_module_ok(get_modinfo(info, "retpoline")))
4818     + return;
4819     +
4820     + pr_warn("%s: loading module not compiled with retpoline compiler.\n",
4821     + mod->name);
4822     +}
4823     +
4824     /* Sets info->hdr and info->len. */
4825     static int copy_module_from_user(const void __user *umod, unsigned long len,
4826     struct load_info *info)
4827     @@ -2969,6 +2978,8 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
4828     add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
4829     }
4830    
4831     + check_modinfo_retpoline(mod, info);
4832     +
4833     if (get_modinfo(info, "staging")) {
4834     add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
4835     pr_warn("%s: module is from the staging directory, the quality "
4836     diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
4837     index 77f396b679ce..5dce4291f0ed 100644
4838     --- a/net/core/sock_reuseport.c
4839     +++ b/net/core/sock_reuseport.c
4840     @@ -93,6 +93,16 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
4841     return more_reuse;
4842     }
4843    
4844     +static void reuseport_free_rcu(struct rcu_head *head)
4845     +{
4846     + struct sock_reuseport *reuse;
4847     +
4848     + reuse = container_of(head, struct sock_reuseport, rcu);
4849     + if (reuse->prog)
4850     + bpf_prog_destroy(reuse->prog);
4851     + kfree(reuse);
4852     +}
4853     +
4854     /**
4855     * reuseport_add_sock - Add a socket to the reuseport group of another.
4856     * @sk: New socket to add to the group.
4857     @@ -101,7 +111,7 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
4858     */
4859     int reuseport_add_sock(struct sock *sk, struct sock *sk2)
4860     {
4861     - struct sock_reuseport *reuse;
4862     + struct sock_reuseport *old_reuse, *reuse;
4863    
4864     if (!rcu_access_pointer(sk2->sk_reuseport_cb)) {
4865     int err = reuseport_alloc(sk2);
4866     @@ -112,10 +122,13 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2)
4867    
4868     spin_lock_bh(&reuseport_lock);
4869     reuse = rcu_dereference_protected(sk2->sk_reuseport_cb,
4870     - lockdep_is_held(&reuseport_lock)),
4871     - WARN_ONCE(rcu_dereference_protected(sk->sk_reuseport_cb,
4872     - lockdep_is_held(&reuseport_lock)),
4873     - "socket already in reuseport group");
4874     + lockdep_is_held(&reuseport_lock));
4875     + old_reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
4876     + lockdep_is_held(&reuseport_lock));
4877     + if (old_reuse && old_reuse->num_socks != 1) {
4878     + spin_unlock_bh(&reuseport_lock);
4879     + return -EBUSY;
4880     + }
4881    
4882     if (reuse->num_socks == reuse->max_socks) {
4883     reuse = reuseport_grow(reuse);
4884     @@ -133,19 +146,11 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2)
4885    
4886     spin_unlock_bh(&reuseport_lock);
4887    
4888     + if (old_reuse)
4889     + call_rcu(&old_reuse->rcu, reuseport_free_rcu);
4890     return 0;
4891     }
4892    
4893     -static void reuseport_free_rcu(struct rcu_head *head)
4894     -{
4895     - struct sock_reuseport *reuse;
4896     -
4897     - reuse = container_of(head, struct sock_reuseport, rcu);
4898     - if (reuse->prog)
4899     - bpf_prog_destroy(reuse->prog);
4900     - kfree(reuse);
4901     -}
4902     -
4903     void reuseport_detach_sock(struct sock *sk)
4904     {
4905     struct sock_reuseport *reuse;
4906     diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
4907     index 9c7a4cea1628..7f5fe07d0b13 100644
4908     --- a/net/ipv4/igmp.c
4909     +++ b/net/ipv4/igmp.c
4910     @@ -386,7 +386,11 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu)
4911     pip->frag_off = htons(IP_DF);
4912     pip->ttl = 1;
4913     pip->daddr = fl4.daddr;
4914     +
4915     + rcu_read_lock();
4916     pip->saddr = igmpv3_get_srcaddr(dev, &fl4);
4917     + rcu_read_unlock();
4918     +
4919     pip->protocol = IPPROTO_IGMP;
4920     pip->tot_len = 0; /* filled in later */
4921     ip_select_ident(net, skb, NULL);
4922     diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
4923     index 7efa6b062049..0d1a767db1bb 100644
4924     --- a/net/ipv4/tcp.c
4925     +++ b/net/ipv4/tcp.c
4926     @@ -2316,6 +2316,12 @@ int tcp_disconnect(struct sock *sk, int flags)
4927    
4928     WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
4929    
4930     + if (sk->sk_frag.page) {
4931     + put_page(sk->sk_frag.page);
4932     + sk->sk_frag.page = NULL;
4933     + sk->sk_frag.offset = 0;
4934     + }
4935     +
4936     sk->sk_error_report(sk);
4937     return err;
4938     }
4939     diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
4940     index e86a34fd5484..8ec60532be2b 100644
4941     --- a/net/ipv4/tcp_bbr.c
4942     +++ b/net/ipv4/tcp_bbr.c
4943     @@ -452,7 +452,8 @@ static void bbr_advance_cycle_phase(struct sock *sk)
4944    
4945     bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1);
4946     bbr->cycle_mstamp = tp->delivered_mstamp;
4947     - bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx];
4948     + bbr->pacing_gain = bbr->lt_use_bw ? BBR_UNIT :
4949     + bbr_pacing_gain[bbr->cycle_idx];
4950     }
4951    
4952     /* Gain cycling: cycle pacing gain to converge to fair share of available bw. */
4953     @@ -461,8 +462,7 @@ static void bbr_update_cycle_phase(struct sock *sk,
4954     {
4955     struct bbr *bbr = inet_csk_ca(sk);
4956    
4957     - if ((bbr->mode == BBR_PROBE_BW) && !bbr->lt_use_bw &&
4958     - bbr_is_next_cycle_phase(sk, rs))
4959     + if (bbr->mode == BBR_PROBE_BW && bbr_is_next_cycle_phase(sk, rs))
4960     bbr_advance_cycle_phase(sk);
4961     }
4962    
4963     diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
4964     index 5cad76f87536..421379014995 100644
4965     --- a/net/ipv6/af_inet6.c
4966     +++ b/net/ipv6/af_inet6.c
4967     @@ -274,6 +274,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
4968     struct net *net = sock_net(sk);
4969     __be32 v4addr = 0;
4970     unsigned short snum;
4971     + bool saved_ipv6only;
4972     int addr_type = 0;
4973     int err = 0;
4974    
4975     @@ -378,19 +379,21 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
4976     if (!(addr_type & IPV6_ADDR_MULTICAST))
4977     np->saddr = addr->sin6_addr;
4978    
4979     + saved_ipv6only = sk->sk_ipv6only;
4980     + if (addr_type != IPV6_ADDR_ANY && addr_type != IPV6_ADDR_MAPPED)
4981     + sk->sk_ipv6only = 1;
4982     +
4983     /* Make sure we are allowed to bind here. */
4984     if ((snum || !inet->bind_address_no_port) &&
4985     sk->sk_prot->get_port(sk, snum)) {
4986     + sk->sk_ipv6only = saved_ipv6only;
4987     inet_reset_saddr(sk);
4988     err = -EADDRINUSE;
4989     goto out;
4990     }
4991    
4992     - if (addr_type != IPV6_ADDR_ANY) {
4993     + if (addr_type != IPV6_ADDR_ANY)
4994     sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
4995     - if (addr_type != IPV6_ADDR_MAPPED)
4996     - sk->sk_ipv6only = 1;
4997     - }
4998     if (snum)
4999     sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
5000     inet->inet_sport = htons(inet->inet_num);
5001     diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
5002     index 117405dd07a3..a30e7e925c9b 100644
5003     --- a/net/ipv6/ip6mr.c
5004     +++ b/net/ipv6/ip6mr.c
5005     @@ -495,6 +495,7 @@ static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
5006     return ERR_PTR(-ENOENT);
5007    
5008     it->mrt = mrt;
5009     + it->cache = NULL;
5010     return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1)
5011     : SEQ_START_TOKEN;
5012     }
5013     diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
5014     index ae83c3aec308..da574a16e7b3 100644
5015     --- a/net/sched/cls_u32.c
5016     +++ b/net/sched/cls_u32.c
5017     @@ -496,6 +496,7 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
5018     static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
5019     u32 flags)
5020     {
5021     + struct tc_u_hnode *ht = rtnl_dereference(n->ht_down);
5022     struct net_device *dev = tp->q->dev_queue->dev;
5023     struct tc_cls_u32_offload u32_offload = {0};
5024     struct tc_to_netdev offload;
5025     @@ -520,7 +521,7 @@ static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n,
5026     offload.cls_u32->knode.sel = &n->sel;
5027     offload.cls_u32->knode.exts = &n->exts;
5028     if (n->ht_down)
5029     - offload.cls_u32->knode.link_handle = n->ht_down->handle;
5030     + offload.cls_u32->knode.link_handle = ht->handle;
5031    
5032     err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
5033     tp->protocol, &offload);
5034     @@ -788,8 +789,9 @@ static void u32_replace_knode(struct tcf_proto *tp, struct tc_u_common *tp_c,
5035     static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp,
5036     struct tc_u_knode *n)
5037     {
5038     - struct tc_u_knode *new;
5039     + struct tc_u_hnode *ht = rtnl_dereference(n->ht_down);
5040     struct tc_u32_sel *s = &n->sel;
5041     + struct tc_u_knode *new;
5042    
5043     new = kzalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key),
5044     GFP_KERNEL);
5045     @@ -807,11 +809,11 @@ static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp,
5046     new->fshift = n->fshift;
5047     new->res = n->res;
5048     new->flags = n->flags;
5049     - RCU_INIT_POINTER(new->ht_down, n->ht_down);
5050     + RCU_INIT_POINTER(new->ht_down, ht);
5051    
5052     /* bump reference count as long as we hold pointer to structure */
5053     - if (new->ht_down)
5054     - new->ht_down->refcnt++;
5055     + if (ht)
5056     + ht->refcnt++;
5057    
5058     #ifdef CONFIG_CLS_U32_PERF
5059     /* Statistics may be incremented by readers during update
5060     diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
5061     index c626f679e1c8..91722e97cdd5 100644
5062     --- a/net/wireless/nl80211.c
5063     +++ b/net/wireless/nl80211.c
5064     @@ -16,6 +16,7 @@
5065     #include <linux/nl80211.h>
5066     #include <linux/rtnetlink.h>
5067     #include <linux/netlink.h>
5068     +#include <linux/nospec.h>
5069     #include <linux/etherdevice.h>
5070     #include <net/net_namespace.h>
5071     #include <net/genetlink.h>
5072     @@ -2014,20 +2015,22 @@ static const struct nla_policy txq_params_policy[NL80211_TXQ_ATTR_MAX + 1] = {
5073     static int parse_txq_params(struct nlattr *tb[],
5074     struct ieee80211_txq_params *txq_params)
5075     {
5076     + u8 ac;
5077     +
5078     if (!tb[NL80211_TXQ_ATTR_AC] || !tb[NL80211_TXQ_ATTR_TXOP] ||
5079     !tb[NL80211_TXQ_ATTR_CWMIN] || !tb[NL80211_TXQ_ATTR_CWMAX] ||
5080     !tb[NL80211_TXQ_ATTR_AIFS])
5081     return -EINVAL;
5082    
5083     - txq_params->ac = nla_get_u8(tb[NL80211_TXQ_ATTR_AC]);
5084     + ac = nla_get_u8(tb[NL80211_TXQ_ATTR_AC]);
5085     txq_params->txop = nla_get_u16(tb[NL80211_TXQ_ATTR_TXOP]);
5086     txq_params->cwmin = nla_get_u16(tb[NL80211_TXQ_ATTR_CWMIN]);
5087     txq_params->cwmax = nla_get_u16(tb[NL80211_TXQ_ATTR_CWMAX]);
5088     txq_params->aifs = nla_get_u8(tb[NL80211_TXQ_ATTR_AIFS]);
5089    
5090     - if (txq_params->ac >= NL80211_NUM_ACS)
5091     + if (ac >= NL80211_NUM_ACS)
5092     return -EINVAL;
5093     -
5094     + txq_params->ac = array_index_nospec(ac, NL80211_NUM_ACS);
5095     return 0;
5096     }
5097    
5098     diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
5099     index 845eb9b800f3..238db4ffd30c 100644
5100     --- a/scripts/mod/modpost.c
5101     +++ b/scripts/mod/modpost.c
5102     @@ -2130,6 +2130,14 @@ static void add_intree_flag(struct buffer *b, int is_intree)
5103     buf_printf(b, "\nMODULE_INFO(intree, \"Y\");\n");
5104     }
5105    
5106     +/* Cannot check for assembler */
5107     +static void add_retpoline(struct buffer *b)
5108     +{
5109     + buf_printf(b, "\n#ifdef RETPOLINE\n");
5110     + buf_printf(b, "MODULE_INFO(retpoline, \"Y\");\n");
5111     + buf_printf(b, "#endif\n");
5112     +}
5113     +
5114     static void add_staging_flag(struct buffer *b, const char *name)
5115     {
5116     static const char *staging_dir = "drivers/staging";
5117     @@ -2474,6 +2482,7 @@ int main(int argc, char **argv)
5118    
5119     add_header(&buf, mod);
5120     add_intree_flag(&buf, !external_module);
5121     + add_retpoline(&buf);
5122     add_staging_flag(&buf, mod->name);
5123     err |= add_versions(&buf, mod);
5124     add_depends(&buf, mod, modules);
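
For reference, the fragment that add_retpoline() appends to each generated *.mod.c is just what the three buf_printf() calls above emit; modules built with a retpoline-capable compiler therefore carry a "retpoline" modinfo tag, which check_modinfo_retpoline() in kernel/module.c looks up at load time.

    #ifdef RETPOLINE
    MODULE_INFO(retpoline, "Y");
    #endif
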
5125     diff --git a/security/keys/encrypted-keys/encrypted.c b/security/keys/encrypted-keys/encrypted.c
5126     index a871159bf03c..ead2fd60244d 100644
5127     --- a/security/keys/encrypted-keys/encrypted.c
5128     +++ b/security/keys/encrypted-keys/encrypted.c
5129     @@ -141,23 +141,22 @@ static int valid_ecryptfs_desc(const char *ecryptfs_desc)
5130     */
5131     static int valid_master_desc(const char *new_desc, const char *orig_desc)
5132     {
5133     - if (!memcmp(new_desc, KEY_TRUSTED_PREFIX, KEY_TRUSTED_PREFIX_LEN)) {
5134     - if (strlen(new_desc) == KEY_TRUSTED_PREFIX_LEN)
5135     - goto out;
5136     - if (orig_desc)
5137     - if (memcmp(new_desc, orig_desc, KEY_TRUSTED_PREFIX_LEN))
5138     - goto out;
5139     - } else if (!memcmp(new_desc, KEY_USER_PREFIX, KEY_USER_PREFIX_LEN)) {
5140     - if (strlen(new_desc) == KEY_USER_PREFIX_LEN)
5141     - goto out;
5142     - if (orig_desc)
5143     - if (memcmp(new_desc, orig_desc, KEY_USER_PREFIX_LEN))
5144     - goto out;
5145     - } else
5146     - goto out;
5147     + int prefix_len;
5148     +
5149     + if (!strncmp(new_desc, KEY_TRUSTED_PREFIX, KEY_TRUSTED_PREFIX_LEN))
5150     + prefix_len = KEY_TRUSTED_PREFIX_LEN;
5151     + else if (!strncmp(new_desc, KEY_USER_PREFIX, KEY_USER_PREFIX_LEN))
5152     + prefix_len = KEY_USER_PREFIX_LEN;
5153     + else
5154     + return -EINVAL;
5155     +
5156     + if (!new_desc[prefix_len])
5157     + return -EINVAL;
5158     +
5159     + if (orig_desc && strncmp(new_desc, orig_desc, prefix_len))
5160     + return -EINVAL;
5161     +
5162     return 0;
5163     -out:
5164     - return -EINVAL;
5165     }
5166    
5167     /*
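
The rewritten valid_master_desc() is self-contained enough to exercise outside the kernel. The harness below copies the new body verbatim and assumes the usual description prefixes ("trusted:" and "user:") for KEY_TRUSTED_PREFIX and KEY_USER_PREFIX; expected return values are noted in the comments.

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>

    #define KEY_TRUSTED_PREFIX      "trusted:"
    #define KEY_TRUSTED_PREFIX_LEN  (sizeof(KEY_TRUSTED_PREFIX) - 1)
    #define KEY_USER_PREFIX         "user:"
    #define KEY_USER_PREFIX_LEN     (sizeof(KEY_USER_PREFIX) - 1)

    /* Body copied from the patched valid_master_desc(). */
    static int valid_master_desc(const char *new_desc, const char *orig_desc)
    {
        int prefix_len;

        if (!strncmp(new_desc, KEY_TRUSTED_PREFIX, KEY_TRUSTED_PREFIX_LEN))
            prefix_len = KEY_TRUSTED_PREFIX_LEN;
        else if (!strncmp(new_desc, KEY_USER_PREFIX, KEY_USER_PREFIX_LEN))
            prefix_len = KEY_USER_PREFIX_LEN;
        else
            return -EINVAL;

        if (!new_desc[prefix_len])
            return -EINVAL;

        if (orig_desc && strncmp(new_desc, orig_desc, prefix_len))
            return -EINVAL;

        return 0;
    }

    int main(void)
    {
        printf("%d\n", valid_master_desc("trusted:kmk", NULL));        /* 0 */
        printf("%d\n", valid_master_desc("trusted:", NULL));           /* -EINVAL */
        printf("%d\n", valid_master_desc("user:foo", "trusted:kmk"));  /* -EINVAL */
        printf("%d\n", valid_master_desc("plain", NULL));              /* -EINVAL */
        return 0;
    }
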
5168     diff --git a/sound/soc/codecs/pcm512x-spi.c b/sound/soc/codecs/pcm512x-spi.c
5169     index 712ed6598c48..ebdf9bd5a64c 100644
5170     --- a/sound/soc/codecs/pcm512x-spi.c
5171     +++ b/sound/soc/codecs/pcm512x-spi.c
5172     @@ -70,3 +70,7 @@ static struct spi_driver pcm512x_spi_driver = {
5173     };
5174    
5175     module_spi_driver(pcm512x_spi_driver);
5176     +
5177     +MODULE_DESCRIPTION("ASoC PCM512x codec driver - SPI");
5178     +MODULE_AUTHOR("Mark Brown <broonie@kernel.org>");
5179     +MODULE_LICENSE("GPL v2");
5180     diff --git a/sound/soc/generic/simple-card.c b/sound/soc/generic/simple-card.c
5181     index f608f8d23f3d..dd88c2cb6470 100644
5182     --- a/sound/soc/generic/simple-card.c
5183     +++ b/sound/soc/generic/simple-card.c
5184     @@ -232,13 +232,19 @@ static int asoc_simple_card_dai_link_of(struct device_node *node,
5185     snprintf(prop, sizeof(prop), "%scpu", prefix);
5186     cpu = of_get_child_by_name(node, prop);
5187    
5188     + if (!cpu) {
5189     + ret = -EINVAL;
5190     + dev_err(dev, "%s: Can't find %s DT node\n", __func__, prop);
5191     + goto dai_link_of_err;
5192     + }
5193     +
5194     snprintf(prop, sizeof(prop), "%splat", prefix);
5195     plat = of_get_child_by_name(node, prop);
5196    
5197     snprintf(prop, sizeof(prop), "%scodec", prefix);
5198     codec = of_get_child_by_name(node, prop);
5199    
5200     - if (!cpu || !codec) {
5201     + if (!codec) {
5202     ret = -EINVAL;
5203     dev_err(dev, "%s: Can't find %s DT node\n", __func__, prop);
5204     goto dai_link_of_err;
5205     diff --git a/sound/soc/sh/rcar/ssi.c b/sound/soc/sh/rcar/ssi.c
5206     index 560cf4b51a99..a9a43acce30e 100644
5207     --- a/sound/soc/sh/rcar/ssi.c
5208     +++ b/sound/soc/sh/rcar/ssi.c
5209     @@ -699,9 +699,14 @@ static int rsnd_ssi_dma_remove(struct rsnd_mod *mod,
5210     struct rsnd_priv *priv)
5211     {
5212     struct rsnd_ssi *ssi = rsnd_mod_to_ssi(mod);
5213     + struct rsnd_mod *pure_ssi_mod = rsnd_io_to_mod_ssi(io);
5214     struct device *dev = rsnd_priv_to_dev(priv);
5215     int irq = ssi->irq;
5216    
5217     + /* Do nothing if non SSI (= SSI parent, multi SSI) mod */
5218     + if (pure_ssi_mod != mod)
5219     + return 0;
5220     +
5221     /* PIO will request IRQ again */
5222     devm_free_irq(dev, irq, mod);
5223