Magellan Linux

Annotation of /trunk/kernel-alx/patches-4.14/0110-4.14.11-all-fixes.patch



Revision 3238
Fri Nov 9 12:14:58 2018 UTC by niro
File size: 234762 bytes
-added up to patches-4.14.79
1 niro 3238 diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
2     index 05496622b4ef..520fdec15bbb 100644
3     --- a/Documentation/admin-guide/kernel-parameters.txt
4     +++ b/Documentation/admin-guide/kernel-parameters.txt
5     @@ -2685,6 +2685,8 @@
6     steal time is computed, but won't influence scheduler
7     behaviour
8    
9     + nopti [X86-64] Disable kernel page table isolation
10     +
11     nolapic [X86-32,APIC] Do not enable or use the local APIC.
12    
13     nolapic_timer [X86-32,APIC] Do not use the local APIC timer.
14     @@ -3253,6 +3255,12 @@
15     pt. [PARIDE]
16     See Documentation/blockdev/paride.txt.
17    
18     + pti= [X86_64]
19     + Control user/kernel address space isolation:
20     + on - enable
21     + off - disable
22     + auto - default setting
23     +
24     pty.legacy_count=
25     [KNL] Number of legacy pty's. Overwrites compiled-in
26     default number.
27     diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
28     index 51101708a03a..ad41b3813f0a 100644
29     --- a/Documentation/x86/x86_64/mm.txt
30     +++ b/Documentation/x86/x86_64/mm.txt
31     @@ -12,6 +12,7 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
32     ... unused hole ...
33     ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
34     ... unused hole ...
35     +fffffe0000000000 - fffffe7fffffffff (=39 bits) LDT remap for PTI
36     fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
37     ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
38     ... unused hole ...
39     @@ -29,8 +30,8 @@ Virtual memory map with 5 level page tables:
40     hole caused by [56:63] sign extension
41     ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor
42     ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory
43     -ff90000000000000 - ff91ffffffffffff (=49 bits) hole
44     -ff92000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space
45     +ff90000000000000 - ff9fffffffffffff (=52 bits) LDT remap for PTI
46     +ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB)
47     ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
48     ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
49     ... unused hole ...
50     diff --git a/Makefile b/Makefile
51     index 9edfb78836a9..655887067dc7 100644
52     --- a/Makefile
53     +++ b/Makefile
54     @@ -1,7 +1,7 @@
55     # SPDX-License-Identifier: GPL-2.0
56     VERSION = 4
57     PATCHLEVEL = 14
58     -SUBLEVEL = 10
59     +SUBLEVEL = 11
60     EXTRAVERSION =
61     NAME = Petit Gorille
62    
63     @@ -802,6 +802,9 @@ KBUILD_CFLAGS += $(call cc-disable-warning, pointer-sign)
64     # disable invalid "can't wrap" optimizations for signed / pointers
65     KBUILD_CFLAGS += $(call cc-option,-fno-strict-overflow)
66    
67     +# Make sure -fstack-check isn't enabled (like gentoo apparently did)
68     +KBUILD_CFLAGS += $(call cc-option,-fno-stack-check,)
69     +
70     # conserve stack if available
71     KBUILD_CFLAGS += $(call cc-option,-fconserve-stack)
72    
73     diff --git a/arch/sparc/lib/hweight.S b/arch/sparc/lib/hweight.S
74     index e5547b22cd18..0ddbbb031822 100644
75     --- a/arch/sparc/lib/hweight.S
76     +++ b/arch/sparc/lib/hweight.S
77     @@ -44,8 +44,8 @@ EXPORT_SYMBOL(__arch_hweight32)
78     .previous
79    
80     ENTRY(__arch_hweight64)
81     - sethi %hi(__sw_hweight16), %g1
82     - jmpl %g1 + %lo(__sw_hweight16), %g0
83     + sethi %hi(__sw_hweight64), %g1
84     + jmpl %g1 + %lo(__sw_hweight64), %g0
85     nop
86     ENDPROC(__arch_hweight64)
87     EXPORT_SYMBOL(__arch_hweight64)
88     diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c
89     index 972319ff5b01..e691ff734cb5 100644
90     --- a/arch/x86/boot/compressed/pagetable.c
91     +++ b/arch/x86/boot/compressed/pagetable.c
92     @@ -23,6 +23,9 @@
93     */
94     #undef CONFIG_AMD_MEM_ENCRYPT
95    
96     +/* No PAGE_TABLE_ISOLATION support needed either: */
97     +#undef CONFIG_PAGE_TABLE_ISOLATION
98     +
99     #include "misc.h"
100    
101     /* These actually do the work of building the kernel identity maps. */
102     diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
103     index 3fd8bc560fae..45a63e00a6af 100644
104     --- a/arch/x86/entry/calling.h
105     +++ b/arch/x86/entry/calling.h
106     @@ -1,6 +1,11 @@
107     /* SPDX-License-Identifier: GPL-2.0 */
108     #include <linux/jump_label.h>
109     #include <asm/unwind_hints.h>
110     +#include <asm/cpufeatures.h>
111     +#include <asm/page_types.h>
112     +#include <asm/percpu.h>
113     +#include <asm/asm-offsets.h>
114     +#include <asm/processor-flags.h>
115    
116     /*
117    
118     @@ -187,6 +192,146 @@ For 32-bit we have the following conventions - kernel is built with
119     #endif
120     .endm
121    
122     +#ifdef CONFIG_PAGE_TABLE_ISOLATION
123     +
124     +/*
125     + * PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two
126     + * halves:
127     + */
128     +#define PTI_SWITCH_PGTABLES_MASK (1<<PAGE_SHIFT)
129     +#define PTI_SWITCH_MASK (PTI_SWITCH_PGTABLES_MASK|(1<<X86_CR3_PTI_SWITCH_BIT))
130     +
131     +.macro SET_NOFLUSH_BIT reg:req
132     + bts $X86_CR3_PCID_NOFLUSH_BIT, \reg
133     +.endm
134     +
135     +.macro ADJUST_KERNEL_CR3 reg:req
136     + ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID
137     + /* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
138     + andq $(~PTI_SWITCH_MASK), \reg
139     +.endm
140     +
141     +.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
142     + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
143     + mov %cr3, \scratch_reg
144     + ADJUST_KERNEL_CR3 \scratch_reg
145     + mov \scratch_reg, %cr3
146     +.Lend_\@:
147     +.endm
148     +
149     +#define THIS_CPU_user_pcid_flush_mask \
150     + PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask
151     +
152     +.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
153     + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
154     + mov %cr3, \scratch_reg
155     +
156     + ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
157     +
158     + /*
159     + * Test if the ASID needs a flush.
160     + */
161     + movq \scratch_reg, \scratch_reg2
162     + andq $(0x7FF), \scratch_reg /* mask ASID */
163     + bt \scratch_reg, THIS_CPU_user_pcid_flush_mask
164     + jnc .Lnoflush_\@
165     +
166     + /* Flush needed, clear the bit */
167     + btr \scratch_reg, THIS_CPU_user_pcid_flush_mask
168     + movq \scratch_reg2, \scratch_reg
169     + jmp .Lwrcr3_\@
170     +
171     +.Lnoflush_\@:
172     + movq \scratch_reg2, \scratch_reg
173     + SET_NOFLUSH_BIT \scratch_reg
174     +
175     +.Lwrcr3_\@:
176     + /* Flip the PGD and ASID to the user version */
177     + orq $(PTI_SWITCH_MASK), \scratch_reg
178     + mov \scratch_reg, %cr3
179     +.Lend_\@:
180     +.endm
181     +
182     +.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
183     + pushq %rax
184     + SWITCH_TO_USER_CR3_NOSTACK scratch_reg=\scratch_reg scratch_reg2=%rax
185     + popq %rax
186     +.endm
187     +
188     +.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
189     + ALTERNATIVE "jmp .Ldone_\@", "", X86_FEATURE_PTI
190     + movq %cr3, \scratch_reg
191     + movq \scratch_reg, \save_reg
192     + /*
193     + * Is the "switch mask" all zero? That means that both of
194     + * these are zero:
195     + *
196     + * 1. The user/kernel PCID bit, and
197     + * 2. The user/kernel "bit" that points CR3 to the
198     + * bottom half of the 8k PGD
199     + *
200     + * That indicates a kernel CR3 value, not a user CR3.
201     + */
202     + testq $(PTI_SWITCH_MASK), \scratch_reg
203     + jz .Ldone_\@
204     +
205     + ADJUST_KERNEL_CR3 \scratch_reg
206     + movq \scratch_reg, %cr3
207     +
208     +.Ldone_\@:
209     +.endm
210     +
211     +.macro RESTORE_CR3 scratch_reg:req save_reg:req
212     + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
213     +
214     + ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
215     +
216     + /*
217     + * KERNEL pages can always resume with NOFLUSH as we do
218     + * explicit flushes.
219     + */
220     + bt $X86_CR3_PTI_SWITCH_BIT, \save_reg
221     + jnc .Lnoflush_\@
222     +
223     + /*
224     + * Check if there's a pending flush for the user ASID we're
225     + * about to set.
226     + */
227     + movq \save_reg, \scratch_reg
228     + andq $(0x7FF), \scratch_reg
229     + bt \scratch_reg, THIS_CPU_user_pcid_flush_mask
230     + jnc .Lnoflush_\@
231     +
232     + btr \scratch_reg, THIS_CPU_user_pcid_flush_mask
233     + jmp .Lwrcr3_\@
234     +
235     +.Lnoflush_\@:
236     + SET_NOFLUSH_BIT \save_reg
237     +
238     +.Lwrcr3_\@:
239     + /*
240     + * The CR3 write could be avoided when not changing its value,
241     + * but would require a CR3 read *and* a scratch register.
242     + */
243     + movq \save_reg, %cr3
244     +.Lend_\@:
245     +.endm
246     +
247     +#else /* CONFIG_PAGE_TABLE_ISOLATION=n: */
248     +
249     +.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
250     +.endm
251     +.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
252     +.endm
253     +.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
254     +.endm
255     +.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
256     +.endm
257     +.macro RESTORE_CR3 scratch_reg:req save_reg:req
258     +.endm
259     +
260     +#endif
261     +
262     #endif /* CONFIG_X86_64 */
263    
264     /*
265     diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
266     index 22c891c3b78d..dd696b966e58 100644
267     --- a/arch/x86/entry/entry_64.S
268     +++ b/arch/x86/entry/entry_64.S
269     @@ -23,7 +23,6 @@
270     #include <asm/segment.h>
271     #include <asm/cache.h>
272     #include <asm/errno.h>
273     -#include "calling.h"
274     #include <asm/asm-offsets.h>
275     #include <asm/msr.h>
276     #include <asm/unistd.h>
277     @@ -40,6 +39,8 @@
278     #include <asm/frame.h>
279     #include <linux/err.h>
280    
281     +#include "calling.h"
282     +
283     .code64
284     .section .entry.text, "ax"
285    
286     @@ -164,6 +165,9 @@ ENTRY(entry_SYSCALL_64_trampoline)
287     /* Stash the user RSP. */
288     movq %rsp, RSP_SCRATCH
289    
290     + /* Note: using %rsp as a scratch reg. */
291     + SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
292     +
293     /* Load the top of the task stack into RSP */
294     movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
295    
296     @@ -203,6 +207,10 @@ ENTRY(entry_SYSCALL_64)
297     */
298    
299     swapgs
300     + /*
301     + * This path is not taken when PAGE_TABLE_ISOLATION is disabled so it
302     + * is not required to switch CR3.
303     + */
304     movq %rsp, PER_CPU_VAR(rsp_scratch)
305     movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
306    
307     @@ -399,6 +407,7 @@ syscall_return_via_sysret:
308     * We are on the trampoline stack. All regs except RDI are live.
309     * We can do future final exit work right here.
310     */
311     + SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
312    
313     popq %rdi
314     popq %rsp
315     @@ -736,6 +745,8 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
316     * We can do future final exit work right here.
317     */
318    
319     + SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
320     +
321     /* Restore RDI. */
322     popq %rdi
323     SWAPGS
324     @@ -818,7 +829,9 @@ native_irq_return_ldt:
325     */
326    
327     pushq %rdi /* Stash user RDI */
328     - SWAPGS
329     + SWAPGS /* to kernel GS */
330     + SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi /* to kernel CR3 */
331     +
332     movq PER_CPU_VAR(espfix_waddr), %rdi
333     movq %rax, (0*8)(%rdi) /* user RAX */
334     movq (1*8)(%rsp), %rax /* user RIP */
335     @@ -834,7 +847,6 @@ native_irq_return_ldt:
336     /* Now RAX == RSP. */
337    
338     andl $0xffff0000, %eax /* RAX = (RSP & 0xffff0000) */
339     - popq %rdi /* Restore user RDI */
340    
341     /*
342     * espfix_stack[31:16] == 0. The page tables are set up such that
343     @@ -845,7 +857,11 @@ native_irq_return_ldt:
344     * still points to an RO alias of the ESPFIX stack.
345     */
346     orq PER_CPU_VAR(espfix_stack), %rax
347     - SWAPGS
348     +
349     + SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
350     + SWAPGS /* to user GS */
351     + popq %rdi /* Restore user RDI */
352     +
353     movq %rax, %rsp
354     UNWIND_HINT_IRET_REGS offset=8
355    
356     @@ -945,6 +961,8 @@ ENTRY(switch_to_thread_stack)
357     UNWIND_HINT_FUNC
358    
359     pushq %rdi
360     + /* Need to switch before accessing the thread stack. */
361     + SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
362     movq %rsp, %rdi
363     movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
364     UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI
365     @@ -1244,7 +1262,11 @@ ENTRY(paranoid_entry)
366     js 1f /* negative -> in kernel */
367     SWAPGS
368     xorl %ebx, %ebx
369     -1: ret
370     +
371     +1:
372     + SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
373     +
374     + ret
375     END(paranoid_entry)
376    
377     /*
378     @@ -1266,6 +1288,7 @@ ENTRY(paranoid_exit)
379     testl %ebx, %ebx /* swapgs needed? */
380     jnz .Lparanoid_exit_no_swapgs
381     TRACE_IRQS_IRETQ
382     + RESTORE_CR3 scratch_reg=%rbx save_reg=%r14
383     SWAPGS_UNSAFE_STACK
384     jmp .Lparanoid_exit_restore
385     .Lparanoid_exit_no_swapgs:
386     @@ -1293,6 +1316,8 @@ ENTRY(error_entry)
387     * from user mode due to an IRET fault.
388     */
389     SWAPGS
390     + /* We have user CR3. Change to kernel CR3. */
391     + SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
392    
393     .Lerror_entry_from_usermode_after_swapgs:
394     /* Put us onto the real thread stack. */
395     @@ -1339,6 +1364,7 @@ ENTRY(error_entry)
396     * .Lgs_change's error handler with kernel gsbase.
397     */
398     SWAPGS
399     + SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
400     jmp .Lerror_entry_done
401    
402     .Lbstep_iret:
403     @@ -1348,10 +1374,11 @@ ENTRY(error_entry)
404    
405     .Lerror_bad_iret:
406     /*
407     - * We came from an IRET to user mode, so we have user gsbase.
408     - * Switch to kernel gsbase:
409     + * We came from an IRET to user mode, so we have user
410     + * gsbase and CR3. Switch to kernel gsbase and CR3:
411     */
412     SWAPGS
413     + SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
414    
415     /*
416     * Pretend that the exception came from user mode: set up pt_regs
417     @@ -1383,6 +1410,10 @@ END(error_exit)
418     /*
419     * Runs on exception stack. Xen PV does not go through this path at all,
420     * so we can use real assembly here.
421     + *
422     + * Registers:
423     + * %r14: Used to save/restore the CR3 of the interrupted context
424     + * when PAGE_TABLE_ISOLATION is in use. Do not clobber.
425     */
426     ENTRY(nmi)
427     UNWIND_HINT_IRET_REGS
428     @@ -1446,6 +1477,7 @@ ENTRY(nmi)
429    
430     swapgs
431     cld
432     + SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
433     movq %rsp, %rdx
434     movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
435     UNWIND_HINT_IRET_REGS base=%rdx offset=8
436     @@ -1698,6 +1730,8 @@ end_repeat_nmi:
437     movq $-1, %rsi
438     call do_nmi
439    
440     + RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
441     +
442     testl %ebx, %ebx /* swapgs needed? */
443     jnz nmi_restore
444     nmi_swapgs:
445     diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
446     index 95ad40eb7eff..40f17009ec20 100644
447     --- a/arch/x86/entry/entry_64_compat.S
448     +++ b/arch/x86/entry/entry_64_compat.S
449     @@ -49,6 +49,10 @@
450     ENTRY(entry_SYSENTER_compat)
451     /* Interrupts are off on entry. */
452     SWAPGS
453     +
454     + /* We are about to clobber %rsp anyway, clobbering here is OK */
455     + SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
456     +
457     movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
458    
459     /*
460     @@ -215,6 +219,12 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe)
461     pushq $0 /* pt_regs->r14 = 0 */
462     pushq $0 /* pt_regs->r15 = 0 */
463    
464     + /*
465     + * We just saved %rdi so it is safe to clobber. It is not
466     + * preserved during the C calls inside TRACE_IRQS_OFF anyway.
467     + */
468     + SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
469     +
470     /*
471     * User mode is traced as though IRQs are on, and SYSENTER
472     * turned them off.
473     @@ -256,10 +266,22 @@ sysret32_from_system_call:
474     * when the system call started, which is already known to user
475     * code. We zero R8-R10 to avoid info leaks.
476     */
477     + movq RSP-ORIG_RAX(%rsp), %rsp
478     +
479     + /*
480     + * The original userspace %rsp (RSP-ORIG_RAX(%rsp)) is stored
481     + * on the process stack which is not mapped to userspace and
482     + * not readable after we SWITCH_TO_USER_CR3. Delay the CR3
483     + * switch until after after the last reference to the process
484     + * stack.
485     + *
486     + * %r8/%r9 are zeroed before the sysret, thus safe to clobber.
487     + */
488     + SWITCH_TO_USER_CR3_NOSTACK scratch_reg=%r8 scratch_reg2=%r9
489     +
490     xorq %r8, %r8
491     xorq %r9, %r9
492     xorq %r10, %r10
493     - movq RSP-ORIG_RAX(%rsp), %rsp
494     swapgs
495     sysretl
496     END(entry_SYSCALL_compat)
497     diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
498     index 1faf40f2dda9..577fa8adb785 100644
499     --- a/arch/x86/entry/vsyscall/vsyscall_64.c
500     +++ b/arch/x86/entry/vsyscall/vsyscall_64.c
501     @@ -344,14 +344,14 @@ int in_gate_area_no_mm(unsigned long addr)
502     * vsyscalls but leave the page not present. If so, we skip calling
503     * this.
504     */
505     -static void __init set_vsyscall_pgtable_user_bits(void)
506     +void __init set_vsyscall_pgtable_user_bits(pgd_t *root)
507     {
508     pgd_t *pgd;
509     p4d_t *p4d;
510     pud_t *pud;
511     pmd_t *pmd;
512    
513     - pgd = pgd_offset_k(VSYSCALL_ADDR);
514     + pgd = pgd_offset_pgd(root, VSYSCALL_ADDR);
515     set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
516     p4d = p4d_offset(pgd, VSYSCALL_ADDR);
517     #if CONFIG_PGTABLE_LEVELS >= 5
518     @@ -373,7 +373,7 @@ void __init map_vsyscall(void)
519     vsyscall_mode == NATIVE
520     ? PAGE_KERNEL_VSYSCALL
521     : PAGE_KERNEL_VVAR);
522     - set_vsyscall_pgtable_user_bits();
523     + set_vsyscall_pgtable_user_bits(swapper_pg_dir);
524     }
525    
526     BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
527     diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
528     index 3674a4b6f8bd..8f0aace08b87 100644
529     --- a/arch/x86/events/intel/ds.c
530     +++ b/arch/x86/events/intel/ds.c
531     @@ -3,16 +3,18 @@
532     #include <linux/types.h>
533     #include <linux/slab.h>
534    
535     +#include <asm/cpu_entry_area.h>
536     #include <asm/perf_event.h>
537     #include <asm/insn.h>
538    
539     #include "../perf_event.h"
540    
541     +/* Waste a full page so it can be mapped into the cpu_entry_area */
542     +DEFINE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store);
543     +
544     /* The size of a BTS record in bytes: */
545     #define BTS_RECORD_SIZE 24
546    
547     -#define BTS_BUFFER_SIZE (PAGE_SIZE << 4)
548     -#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4)
549     #define PEBS_FIXUP_SIZE PAGE_SIZE
550    
551     /*
552     @@ -279,17 +281,52 @@ void fini_debug_store_on_cpu(int cpu)
553    
554     static DEFINE_PER_CPU(void *, insn_buffer);
555    
556     -static int alloc_pebs_buffer(int cpu)
557     +static void ds_update_cea(void *cea, void *addr, size_t size, pgprot_t prot)
558     {
559     - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
560     + phys_addr_t pa;
561     + size_t msz = 0;
562     +
563     + pa = virt_to_phys(addr);
564     + for (; msz < size; msz += PAGE_SIZE, pa += PAGE_SIZE, cea += PAGE_SIZE)
565     + cea_set_pte(cea, pa, prot);
566     +}
567     +
568     +static void ds_clear_cea(void *cea, size_t size)
569     +{
570     + size_t msz = 0;
571     +
572     + for (; msz < size; msz += PAGE_SIZE, cea += PAGE_SIZE)
573     + cea_set_pte(cea, 0, PAGE_NONE);
574     +}
575     +
576     +static void *dsalloc_pages(size_t size, gfp_t flags, int cpu)
577     +{
578     + unsigned int order = get_order(size);
579     int node = cpu_to_node(cpu);
580     - int max;
581     - void *buffer, *ibuffer;
582     + struct page *page;
583     +
584     + page = __alloc_pages_node(node, flags | __GFP_ZERO, order);
585     + return page ? page_address(page) : NULL;
586     +}
587     +
588     +static void dsfree_pages(const void *buffer, size_t size)
589     +{
590     + if (buffer)
591     + free_pages((unsigned long)buffer, get_order(size));
592     +}
593     +
594     +static int alloc_pebs_buffer(int cpu)
595     +{
596     + struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
597     + struct debug_store *ds = hwev->ds;
598     + size_t bsiz = x86_pmu.pebs_buffer_size;
599     + int max, node = cpu_to_node(cpu);
600     + void *buffer, *ibuffer, *cea;
601    
602     if (!x86_pmu.pebs)
603     return 0;
604    
605     - buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
606     + buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu);
607     if (unlikely(!buffer))
608     return -ENOMEM;
609    
610     @@ -300,25 +337,27 @@ static int alloc_pebs_buffer(int cpu)
611     if (x86_pmu.intel_cap.pebs_format < 2) {
612     ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
613     if (!ibuffer) {
614     - kfree(buffer);
615     + dsfree_pages(buffer, bsiz);
616     return -ENOMEM;
617     }
618     per_cpu(insn_buffer, cpu) = ibuffer;
619     }
620     -
621     - max = x86_pmu.pebs_buffer_size / x86_pmu.pebs_record_size;
622     -
623     - ds->pebs_buffer_base = (u64)(unsigned long)buffer;
624     + hwev->ds_pebs_vaddr = buffer;
625     + /* Update the cpu entry area mapping */
626     + cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
627     + ds->pebs_buffer_base = (unsigned long) cea;
628     + ds_update_cea(cea, buffer, bsiz, PAGE_KERNEL);
629     ds->pebs_index = ds->pebs_buffer_base;
630     - ds->pebs_absolute_maximum = ds->pebs_buffer_base +
631     - max * x86_pmu.pebs_record_size;
632     -
633     + max = x86_pmu.pebs_record_size * (bsiz / x86_pmu.pebs_record_size);
634     + ds->pebs_absolute_maximum = ds->pebs_buffer_base + max;
635     return 0;
636     }
637    
638     static void release_pebs_buffer(int cpu)
639     {
640     - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
641     + struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
642     + struct debug_store *ds = hwev->ds;
643     + void *cea;
644    
645     if (!ds || !x86_pmu.pebs)
646     return;
647     @@ -326,73 +365,70 @@ static void release_pebs_buffer(int cpu)
648     kfree(per_cpu(insn_buffer, cpu));
649     per_cpu(insn_buffer, cpu) = NULL;
650    
651     - kfree((void *)(unsigned long)ds->pebs_buffer_base);
652     + /* Clear the fixmap */
653     + cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
654     + ds_clear_cea(cea, x86_pmu.pebs_buffer_size);
655     ds->pebs_buffer_base = 0;
656     + dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size);
657     + hwev->ds_pebs_vaddr = NULL;
658     }
659    
660     static int alloc_bts_buffer(int cpu)
661     {
662     - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
663     - int node = cpu_to_node(cpu);
664     - int max, thresh;
665     - void *buffer;
666     + struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
667     + struct debug_store *ds = hwev->ds;
668     + void *buffer, *cea;
669     + int max;
670    
671     if (!x86_pmu.bts)
672     return 0;
673    
674     - buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
675     + buffer = dsalloc_pages(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, cpu);
676     if (unlikely(!buffer)) {
677     WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
678     return -ENOMEM;
679     }
680     -
681     - max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
682     - thresh = max / 16;
683     -
684     - ds->bts_buffer_base = (u64)(unsigned long)buffer;
685     + hwev->ds_bts_vaddr = buffer;
686     + /* Update the fixmap */
687     + cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
688     + ds->bts_buffer_base = (unsigned long) cea;
689     + ds_update_cea(cea, buffer, BTS_BUFFER_SIZE, PAGE_KERNEL);
690     ds->bts_index = ds->bts_buffer_base;
691     - ds->bts_absolute_maximum = ds->bts_buffer_base +
692     - max * BTS_RECORD_SIZE;
693     - ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
694     - thresh * BTS_RECORD_SIZE;
695     -
696     + max = BTS_RECORD_SIZE * (BTS_BUFFER_SIZE / BTS_RECORD_SIZE);
697     + ds->bts_absolute_maximum = ds->bts_buffer_base + max;
698     + ds->bts_interrupt_threshold = ds->bts_absolute_maximum - (max / 16);
699     return 0;
700     }
701    
702     static void release_bts_buffer(int cpu)
703     {
704     - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
705     + struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
706     + struct debug_store *ds = hwev->ds;
707     + void *cea;
708    
709     if (!ds || !x86_pmu.bts)
710     return;
711    
712     - kfree((void *)(unsigned long)ds->bts_buffer_base);
713     + /* Clear the fixmap */
714     + cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
715     + ds_clear_cea(cea, BTS_BUFFER_SIZE);
716     ds->bts_buffer_base = 0;
717     + dsfree_pages(hwev->ds_bts_vaddr, BTS_BUFFER_SIZE);
718     + hwev->ds_bts_vaddr = NULL;
719     }
720    
721     static int alloc_ds_buffer(int cpu)
722     {
723     - int node = cpu_to_node(cpu);
724     - struct debug_store *ds;
725     -
726     - ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
727     - if (unlikely(!ds))
728     - return -ENOMEM;
729     + struct debug_store *ds = &get_cpu_entry_area(cpu)->cpu_debug_store;
730    
731     + memset(ds, 0, sizeof(*ds));
732     per_cpu(cpu_hw_events, cpu).ds = ds;
733     -
734     return 0;
735     }
736    
737     static void release_ds_buffer(int cpu)
738     {
739     - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
740     -
741     - if (!ds)
742     - return;
743     -
744     per_cpu(cpu_hw_events, cpu).ds = NULL;
745     - kfree(ds);
746     }
747    
748     void release_ds_buffers(void)
749     diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
750     index f7aaadf9331f..8e4ea143ed96 100644
751     --- a/arch/x86/events/perf_event.h
752     +++ b/arch/x86/events/perf_event.h
753     @@ -14,6 +14,8 @@
754    
755     #include <linux/perf_event.h>
756    
757     +#include <asm/intel_ds.h>
758     +
759     /* To enable MSR tracing please use the generic trace points. */
760    
761     /*
762     @@ -77,8 +79,6 @@ struct amd_nb {
763     struct event_constraint event_constraints[X86_PMC_IDX_MAX];
764     };
765    
766     -/* The maximal number of PEBS events: */
767     -#define MAX_PEBS_EVENTS 8
768     #define PEBS_COUNTER_MASK ((1ULL << MAX_PEBS_EVENTS) - 1)
769    
770     /*
771     @@ -95,23 +95,6 @@ struct amd_nb {
772     PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \
773     PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER)
774    
775     -/*
776     - * A debug store configuration.
777     - *
778     - * We only support architectures that use 64bit fields.
779     - */
780     -struct debug_store {
781     - u64 bts_buffer_base;
782     - u64 bts_index;
783     - u64 bts_absolute_maximum;
784     - u64 bts_interrupt_threshold;
785     - u64 pebs_buffer_base;
786     - u64 pebs_index;
787     - u64 pebs_absolute_maximum;
788     - u64 pebs_interrupt_threshold;
789     - u64 pebs_event_reset[MAX_PEBS_EVENTS];
790     -};
791     -
792     #define PEBS_REGS \
793     (PERF_REG_X86_AX | \
794     PERF_REG_X86_BX | \
795     @@ -216,6 +199,8 @@ struct cpu_hw_events {
796     * Intel DebugStore bits
797     */
798     struct debug_store *ds;
799     + void *ds_pebs_vaddr;
800     + void *ds_bts_vaddr;
801     u64 pebs_enabled;
802     int n_pebs;
803     int n_large_pebs;
804     diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h
805     index 2fbc69a0916e..4a7884b8dca5 100644
806     --- a/arch/x86/include/asm/cpu_entry_area.h
807     +++ b/arch/x86/include/asm/cpu_entry_area.h
808     @@ -5,6 +5,7 @@
809    
810     #include <linux/percpu-defs.h>
811     #include <asm/processor.h>
812     +#include <asm/intel_ds.h>
813    
814     /*
815     * cpu_entry_area is a percpu region that contains things needed by the CPU
816     @@ -40,6 +41,18 @@ struct cpu_entry_area {
817     */
818     char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ];
819     #endif
820     +#ifdef CONFIG_CPU_SUP_INTEL
821     + /*
822     + * Per CPU debug store for Intel performance monitoring. Wastes a
823     + * full page at the moment.
824     + */
825     + struct debug_store cpu_debug_store;
826     + /*
827     + * The actual PEBS/BTS buffers must be mapped to user space
828     + * Reserve enough fixmap PTEs.
829     + */
830     + struct debug_store_buffers cpu_debug_buffers;
831     +#endif
832     };
833    
834     #define CPU_ENTRY_AREA_SIZE (sizeof(struct cpu_entry_area))
835     diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
836     index 800104c8a3ed..07cdd1715705 100644
837     --- a/arch/x86/include/asm/cpufeatures.h
838     +++ b/arch/x86/include/asm/cpufeatures.h
839     @@ -197,11 +197,12 @@
840     #define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */
841     #define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */
842     #define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */
843     +#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */
844    
845     #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
846     #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
847     #define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */
848     -
849     +#define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */
850     #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */
851     #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */
852     #define X86_FEATURE_AVX512_4VNNIW ( 7*32+16) /* AVX-512 Neural Network Instructions */
853     @@ -340,5 +341,6 @@
854     #define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */
855     #define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */
856     #define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */
857     +#define X86_BUG_CPU_INSECURE X86_BUG(14) /* CPU is insecure and needs kernel page table isolation */
858    
859     #endif /* _ASM_X86_CPUFEATURES_H */
860     diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
861     index bc359dd2f7f6..85e23bb7b34e 100644
862     --- a/arch/x86/include/asm/desc.h
863     +++ b/arch/x86/include/asm/desc.h
864     @@ -21,6 +21,8 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in
865    
866     desc->type = (info->read_exec_only ^ 1) << 1;
867     desc->type |= info->contents << 2;
868     + /* Set the ACCESS bit so it can be mapped RO */
869     + desc->type |= 1;
870    
871     desc->s = 1;
872     desc->dpl = 0x3;
873     diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
874     index c10c9128f54e..e428e16dd822 100644
875     --- a/arch/x86/include/asm/disabled-features.h
876     +++ b/arch/x86/include/asm/disabled-features.h
877     @@ -44,6 +44,12 @@
878     # define DISABLE_LA57 (1<<(X86_FEATURE_LA57 & 31))
879     #endif
880    
881     +#ifdef CONFIG_PAGE_TABLE_ISOLATION
882     +# define DISABLE_PTI 0
883     +#else
884     +# define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31))
885     +#endif
886     +
887     /*
888     * Make sure to add features to the correct mask
889     */
890     @@ -54,7 +60,7 @@
891     #define DISABLED_MASK4 (DISABLE_PCID)
892     #define DISABLED_MASK5 0
893     #define DISABLED_MASK6 0
894     -#define DISABLED_MASK7 0
895     +#define DISABLED_MASK7 (DISABLE_PTI)
896     #define DISABLED_MASK8 0
897     #define DISABLED_MASK9 (DISABLE_MPX)
898     #define DISABLED_MASK10 0
899     diff --git a/arch/x86/include/asm/intel_ds.h b/arch/x86/include/asm/intel_ds.h
900     new file mode 100644
901     index 000000000000..62a9f4966b42
902     --- /dev/null
903     +++ b/arch/x86/include/asm/intel_ds.h
904     @@ -0,0 +1,36 @@
905     +#ifndef _ASM_INTEL_DS_H
906     +#define _ASM_INTEL_DS_H
907     +
908     +#include <linux/percpu-defs.h>
909     +
910     +#define BTS_BUFFER_SIZE (PAGE_SIZE << 4)
911     +#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4)
912     +
913     +/* The maximal number of PEBS events: */
914     +#define MAX_PEBS_EVENTS 8
915     +
916     +/*
917     + * A debug store configuration.
918     + *
919     + * We only support architectures that use 64bit fields.
920     + */
921     +struct debug_store {
922     + u64 bts_buffer_base;
923     + u64 bts_index;
924     + u64 bts_absolute_maximum;
925     + u64 bts_interrupt_threshold;
926     + u64 pebs_buffer_base;
927     + u64 pebs_index;
928     + u64 pebs_absolute_maximum;
929     + u64 pebs_interrupt_threshold;
930     + u64 pebs_event_reset[MAX_PEBS_EVENTS];
931     +} __aligned(PAGE_SIZE);
932     +
933     +DECLARE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store);
934     +
935     +struct debug_store_buffers {
936     + char bts_buffer[BTS_BUFFER_SIZE];
937     + char pebs_buffer[PEBS_BUFFER_SIZE];
938     +};
939     +
940     +#endif
941     diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
942     index 5ede7cae1d67..c931b88982a0 100644
943     --- a/arch/x86/include/asm/mmu_context.h
944     +++ b/arch/x86/include/asm/mmu_context.h
945     @@ -50,10 +50,33 @@ struct ldt_struct {
946     * call gates. On native, we could merge the ldt_struct and LDT
947     * allocations, but it's not worth trying to optimize.
948     */
949     - struct desc_struct *entries;
950     - unsigned int nr_entries;
951     + struct desc_struct *entries;
952     + unsigned int nr_entries;
953     +
954     + /*
955     + * If PTI is in use, then the entries array is not mapped while we're
956     + * in user mode. The whole array will be aliased at the addressed
957     + * given by ldt_slot_va(slot). We use two slots so that we can allocate
958     + * and map, and enable a new LDT without invalidating the mapping
959     + * of an older, still-in-use LDT.
960     + *
961     + * slot will be -1 if this LDT doesn't have an alias mapping.
962     + */
963     + int slot;
964     };
965    
966     +/* This is a multiple of PAGE_SIZE. */
967     +#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE)
968     +
969     +static inline void *ldt_slot_va(int slot)
970     +{
971     +#ifdef CONFIG_X86_64
972     + return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot);
973     +#else
974     + BUG();
975     +#endif
976     +}
977     +
978     /*
979     * Used for LDT copy/destruction.
980     */
981     @@ -64,6 +87,7 @@ static inline void init_new_context_ldt(struct mm_struct *mm)
982     }
983     int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm);
984     void destroy_context_ldt(struct mm_struct *mm);
985     +void ldt_arch_exit_mmap(struct mm_struct *mm);
986     #else /* CONFIG_MODIFY_LDT_SYSCALL */
987     static inline void init_new_context_ldt(struct mm_struct *mm) { }
988     static inline int ldt_dup_context(struct mm_struct *oldmm,
989     @@ -71,7 +95,8 @@ static inline int ldt_dup_context(struct mm_struct *oldmm,
990     {
991     return 0;
992     }
993     -static inline void destroy_context_ldt(struct mm_struct *mm) {}
994     +static inline void destroy_context_ldt(struct mm_struct *mm) { }
995     +static inline void ldt_arch_exit_mmap(struct mm_struct *mm) { }
996     #endif
997    
998     static inline void load_mm_ldt(struct mm_struct *mm)
999     @@ -96,10 +121,31 @@ static inline void load_mm_ldt(struct mm_struct *mm)
1000     * that we can see.
1001     */
1002    
1003     - if (unlikely(ldt))
1004     - set_ldt(ldt->entries, ldt->nr_entries);
1005     - else
1006     + if (unlikely(ldt)) {
1007     + if (static_cpu_has(X86_FEATURE_PTI)) {
1008     + if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) {
1009     + /*
1010     + * Whoops -- either the new LDT isn't mapped
1011     + * (if slot == -1) or is mapped into a bogus
1012     + * slot (if slot > 1).
1013     + */
1014     + clear_LDT();
1015     + return;
1016     + }
1017     +
1018     + /*
1019     + * If page table isolation is enabled, ldt->entries
1020     + * will not be mapped in the userspace pagetables.
1021     + * Tell the CPU to access the LDT through the alias
1022     + * at ldt_slot_va(ldt->slot).
1023     + */
1024     + set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries);
1025     + } else {
1026     + set_ldt(ldt->entries, ldt->nr_entries);
1027     + }
1028     + } else {
1029     clear_LDT();
1030     + }
1031     #else
1032     clear_LDT();
1033     #endif
1034     @@ -194,6 +240,7 @@ static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
1035     static inline void arch_exit_mmap(struct mm_struct *mm)
1036     {
1037     paravirt_arch_exit_mmap(mm);
1038     + ldt_arch_exit_mmap(mm);
1039     }
1040    
1041     #ifdef CONFIG_X86_64
1042     diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
1043     index 4b5e1eafada7..aff42e1da6ee 100644
1044     --- a/arch/x86/include/asm/pgalloc.h
1045     +++ b/arch/x86/include/asm/pgalloc.h
1046     @@ -30,6 +30,17 @@ static inline void paravirt_release_p4d(unsigned long pfn) {}
1047     */
1048     extern gfp_t __userpte_alloc_gfp;
1049    
1050     +#ifdef CONFIG_PAGE_TABLE_ISOLATION
1051     +/*
1052     + * Instead of one PGD, we acquire two PGDs. Being order-1, it is
1053     + * both 8k in size and 8k-aligned. That lets us just flip bit 12
1054     + * in a pointer to swap between the two 4k halves.
1055     + */
1056     +#define PGD_ALLOCATION_ORDER 1
1057     +#else
1058     +#define PGD_ALLOCATION_ORDER 0
1059     +#endif
1060     +
1061     /*
1062     * Allocate and free page tables.
1063     */
1064     diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
1065     index f02de8bc1f72..211368922cad 100644
1066     --- a/arch/x86/include/asm/pgtable.h
1067     +++ b/arch/x86/include/asm/pgtable.h
1068     @@ -28,6 +28,7 @@ extern pgd_t early_top_pgt[PTRS_PER_PGD];
1069     int __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
1070    
1071     void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
1072     +void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user);
1073     void ptdump_walk_pgd_level_checkwx(void);
1074    
1075     #ifdef CONFIG_DEBUG_WX
1076     @@ -846,7 +847,12 @@ static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
1077    
1078     static inline int p4d_bad(p4d_t p4d)
1079     {
1080     - return (p4d_flags(p4d) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0;
1081     + unsigned long ignore_flags = _KERNPG_TABLE | _PAGE_USER;
1082     +
1083     + if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
1084     + ignore_flags |= _PAGE_NX;
1085     +
1086     + return (p4d_flags(p4d) & ~ignore_flags) != 0;
1087     }
1088     #endif /* CONFIG_PGTABLE_LEVELS > 3 */
1089    
1090     @@ -880,7 +886,12 @@ static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
1091    
1092     static inline int pgd_bad(pgd_t pgd)
1093     {
1094     - return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
1095     + unsigned long ignore_flags = _PAGE_USER;
1096     +
1097     + if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
1098     + ignore_flags |= _PAGE_NX;
1099     +
1100     + return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
1101     }
1102    
1103     static inline int pgd_none(pgd_t pgd)
1104     @@ -909,7 +920,11 @@ static inline int pgd_none(pgd_t pgd)
1105     * pgd_offset() returns a (pgd_t *)
1106     * pgd_index() is used get the offset into the pgd page's array of pgd_t's;
1107     */
1108     -#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
1109     +#define pgd_offset_pgd(pgd, address) (pgd + pgd_index((address)))
1110     +/*
1111     + * a shortcut to get a pgd_t in a given mm
1112     + */
1113     +#define pgd_offset(mm, address) pgd_offset_pgd((mm)->pgd, (address))
1114     /*
1115     * a shortcut which implies the use of the kernel's pgd, instead
1116     * of a process's
1117     @@ -1111,7 +1126,14 @@ static inline int pud_write(pud_t pud)
1118     */
1119     static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
1120     {
1121     - memcpy(dst, src, count * sizeof(pgd_t));
1122     + memcpy(dst, src, count * sizeof(pgd_t));
1123     +#ifdef CONFIG_PAGE_TABLE_ISOLATION
1124     + if (!static_cpu_has(X86_FEATURE_PTI))
1125     + return;
1126     + /* Clone the user space pgd as well */
1127     + memcpy(kernel_to_user_pgdp(dst), kernel_to_user_pgdp(src),
1128     + count * sizeof(pgd_t));
1129     +#endif
1130     }
1131    
1132     #define PTE_SHIFT ilog2(PTRS_PER_PTE)
1133     diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
1134     index e9f05331e732..81462e9a34f6 100644
1135     --- a/arch/x86/include/asm/pgtable_64.h
1136     +++ b/arch/x86/include/asm/pgtable_64.h
1137     @@ -131,9 +131,97 @@ static inline pud_t native_pudp_get_and_clear(pud_t *xp)
1138     #endif
1139     }
1140    
1141     +#ifdef CONFIG_PAGE_TABLE_ISOLATION
1142     +/*
1143     + * All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages
1144     + * (8k-aligned and 8k in size). The kernel one is at the beginning 4k and
1145     + * the user one is in the last 4k. To switch between them, you
1146     + * just need to flip the 12th bit in their addresses.
1147     + */
1148     +#define PTI_PGTABLE_SWITCH_BIT PAGE_SHIFT
1149     +
1150     +/*
1151     + * This generates better code than the inline assembly in
1152     + * __set_bit().
1153     + */
1154     +static inline void *ptr_set_bit(void *ptr, int bit)
1155     +{
1156     + unsigned long __ptr = (unsigned long)ptr;
1157     +
1158     + __ptr |= BIT(bit);
1159     + return (void *)__ptr;
1160     +}
1161     +static inline void *ptr_clear_bit(void *ptr, int bit)
1162     +{
1163     + unsigned long __ptr = (unsigned long)ptr;
1164     +
1165     + __ptr &= ~BIT(bit);
1166     + return (void *)__ptr;
1167     +}
1168     +
1169     +static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp)
1170     +{
1171     + return ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
1172     +}
1173     +
1174     +static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp)
1175     +{
1176     + return ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
1177     +}
1178     +
1179     +static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp)
1180     +{
1181     + return ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
1182     +}
1183     +
1184     +static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp)
1185     +{
1186     + return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
1187     +}
1188     +#endif /* CONFIG_PAGE_TABLE_ISOLATION */
1189     +
1190     +/*
1191     + * Page table pages are page-aligned. The lower half of the top
1192     + * level is used for userspace and the top half for the kernel.
1193     + *
1194     + * Returns true for parts of the PGD that map userspace and
1195     + * false for the parts that map the kernel.
1196     + */
1197     +static inline bool pgdp_maps_userspace(void *__ptr)
1198     +{
1199     + unsigned long ptr = (unsigned long)__ptr;
1200     +
1201     + return (ptr & ~PAGE_MASK) < (PAGE_SIZE / 2);
1202     +}
1203     +
1204     +#ifdef CONFIG_PAGE_TABLE_ISOLATION
1205     +pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd);
1206     +
1207     +/*
1208     + * Take a PGD location (pgdp) and a pgd value that needs to be set there.
1209     + * Populates the user and returns the resulting PGD that must be set in
1210     + * the kernel copy of the page tables.
1211     + */
1212     +static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
1213     +{
1214     + if (!static_cpu_has(X86_FEATURE_PTI))
1215     + return pgd;
1216     + return __pti_set_user_pgd(pgdp, pgd);
1217     +}
1218     +#else
1219     +static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
1220     +{
1221     + return pgd;
1222     +}
1223     +#endif
1224     +
1225     static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
1226     {
1227     +#if defined(CONFIG_PAGE_TABLE_ISOLATION) && !defined(CONFIG_X86_5LEVEL)
1228     + p4dp->pgd = pti_set_user_pgd(&p4dp->pgd, p4d.pgd);
1229     +#else
1230     *p4dp = p4d;
1231     +#endif
1232     }
1233    
1234     static inline void native_p4d_clear(p4d_t *p4d)
1235     @@ -147,7 +235,11 @@ static inline void native_p4d_clear(p4d_t *p4d)
1236    
1237     static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
1238     {
1239     +#ifdef CONFIG_PAGE_TABLE_ISOLATION
1240     + *pgdp = pti_set_user_pgd(pgdp, pgd);
1241     +#else
1242     *pgdp = pgd;
1243     +#endif
1244     }
1245    
1246     static inline void native_pgd_clear(pgd_t *pgd)
1247     diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
1248     index 3d27831bc58d..b97a539bcdee 100644
1249     --- a/arch/x86/include/asm/pgtable_64_types.h
1250     +++ b/arch/x86/include/asm/pgtable_64_types.h
1251     @@ -79,13 +79,17 @@ typedef struct { pteval_t pte; } pte_t;
1252     #define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
1253    
1254     #ifdef CONFIG_X86_5LEVEL
1255     -# define VMALLOC_SIZE_TB _AC(16384, UL)
1256     -# define __VMALLOC_BASE _AC(0xff92000000000000, UL)
1257     +# define VMALLOC_SIZE_TB _AC(12800, UL)
1258     +# define __VMALLOC_BASE _AC(0xffa0000000000000, UL)
1259     # define __VMEMMAP_BASE _AC(0xffd4000000000000, UL)
1260     +# define LDT_PGD_ENTRY _AC(-112, UL)
1261     +# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
1262     #else
1263     # define VMALLOC_SIZE_TB _AC(32, UL)
1264     # define __VMALLOC_BASE _AC(0xffffc90000000000, UL)
1265     # define __VMEMMAP_BASE _AC(0xffffea0000000000, UL)
1266     +# define LDT_PGD_ENTRY _AC(-4, UL)
1267     +# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
1268     #endif
1269    
1270     #ifdef CONFIG_RANDOMIZE_MEMORY
1271     diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
1272     index 43212a43ee69..6a60fea90b9d 100644
1273     --- a/arch/x86/include/asm/processor-flags.h
1274     +++ b/arch/x86/include/asm/processor-flags.h
1275     @@ -38,6 +38,11 @@
1276     #define CR3_ADDR_MASK __sme_clr(0x7FFFFFFFFFFFF000ull)
1277     #define CR3_PCID_MASK 0xFFFull
1278     #define CR3_NOFLUSH BIT_ULL(63)
1279     +
1280     +#ifdef CONFIG_PAGE_TABLE_ISOLATION
1281     +# define X86_CR3_PTI_SWITCH_BIT 11
1282     +#endif
1283     +
1284     #else
1285     /*
1286     * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
1287     diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
1288     index 9e482d8b0b97..9c18da64daa9 100644
1289     --- a/arch/x86/include/asm/processor.h
1290     +++ b/arch/x86/include/asm/processor.h
1291     @@ -851,13 +851,22 @@ static inline void spin_lock_prefetch(const void *x)
1292    
1293     #else
1294     /*
1295     - * User space process size. 47bits minus one guard page. The guard
1296     - * page is necessary on Intel CPUs: if a SYSCALL instruction is at
1297     - * the highest possible canonical userspace address, then that
1298     - * syscall will enter the kernel with a non-canonical return
1299     - * address, and SYSRET will explode dangerously. We avoid this
1300     - * particular problem by preventing anything from being mapped
1301     - * at the maximum canonical address.
1302     + * User space process size. This is the first address outside the user range.
1303     + * There are a few constraints that determine this:
1304     + *
1305     + * On Intel CPUs, if a SYSCALL instruction is at the highest canonical
1306     + * address, then that syscall will enter the kernel with a
1307     + * non-canonical return address, and SYSRET will explode dangerously.
1308     + * We avoid this particular problem by preventing anything executable
1309     + * from being mapped at the maximum canonical address.
1310     + *
1311     + * On AMD CPUs in the Ryzen family, there's a nasty bug in which the
1312     + * CPUs malfunction if they execute code from the highest canonical page.
1313     + * They'll speculate right off the end of the canonical space, and
1314     + * bad things happen. This is worked around in the same way as the
1315     + * Intel problem.
1316     + *
1317     + * With page table isolation enabled, we map the LDT in ... [stay tuned]
1318     */
1319     #define TASK_SIZE_MAX ((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
1320    
1321     diff --git a/arch/x86/include/asm/pti.h b/arch/x86/include/asm/pti.h
1322     new file mode 100644
1323     index 000000000000..0b5ef05b2d2d
1324     --- /dev/null
1325     +++ b/arch/x86/include/asm/pti.h
1326     @@ -0,0 +1,14 @@
1327     +// SPDX-License-Identifier: GPL-2.0
1328     +#ifndef _ASM_X86_PTI_H
1329     +#define _ASM_X86_PTI_H
1330     +#ifndef __ASSEMBLY__
1331     +
1332     +#ifdef CONFIG_PAGE_TABLE_ISOLATION
1333     +extern void pti_init(void);
1334     +extern void pti_check_boottime_disable(void);
1335     +#else
1336     +static inline void pti_check_boottime_disable(void) { }
1337     +#endif
1338     +
1339     +#endif /* __ASSEMBLY__ */
1340     +#endif /* _ASM_X86_PTI_H */
1341     diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
1342     index 171b429f43a2..f9b48ce152eb 100644
1343     --- a/arch/x86/include/asm/tlbflush.h
1344     +++ b/arch/x86/include/asm/tlbflush.h
1345     @@ -10,38 +10,90 @@
1346     #include <asm/special_insns.h>
1347     #include <asm/smp.h>
1348     #include <asm/invpcid.h>
1349     +#include <asm/pti.h>
1350     +#include <asm/processor-flags.h>
1351    
1352     -static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
1353     -{
1354     - /*
1355     - * Bump the generation count. This also serves as a full barrier
1356     - * that synchronizes with switch_mm(): callers are required to order
1357     - * their read of mm_cpumask after their writes to the paging
1358     - * structures.
1359     - */
1360     - return atomic64_inc_return(&mm->context.tlb_gen);
1361     -}
1362     +/*
1363     + * The x86 feature is called PCID (Process Context IDentifier). It is similar
1364     + * to what is traditionally called ASID on the RISC processors.
1365     + *
1366     + * We don't use the traditional ASID implementation, where each process/mm gets
1367     + * its own ASID and flush/restart when we run out of ASID space.
1368     + *
1369     + * Instead we have a small per-cpu array of ASIDs and cache the last few mm's
1370     + * that came by on this CPU, allowing cheaper switch_mm between processes on
1371     + * this CPU.
1372     + *
1373     + * We end up with different spaces for different things. To avoid confusion we
1374     + * use different names for each of them:
1375     + *
1376     + * ASID - [0, TLB_NR_DYN_ASIDS-1]
1377     + * the canonical identifier for an mm
1378     + *
1379     + * kPCID - [1, TLB_NR_DYN_ASIDS]
1380     + * the value we write into the PCID part of CR3; corresponds to the
1381     + * ASID+1, because PCID 0 is special.
1382     + *
1383     + * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
1384     + * for KPTI each mm has two address spaces and thus needs two
1385     + * PCID values, but we can still do with a single ASID denomination
1386     + * for each mm. Corresponds to kPCID + 2048.
1387     + *
1388     + */
1389    
1390     /* There are 12 bits of space for ASIDS in CR3 */
1391     #define CR3_HW_ASID_BITS 12
1392     +
1393     /*
1394     * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for
1395     * user/kernel switches
1396     */
1397     -#define PTI_CONSUMED_ASID_BITS 0
1398     +#ifdef CONFIG_PAGE_TABLE_ISOLATION
1399     +# define PTI_CONSUMED_PCID_BITS 1
1400     +#else
1401     +# define PTI_CONSUMED_PCID_BITS 0
1402     +#endif
1403     +
1404     +#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS)
1405    
1406     -#define CR3_AVAIL_ASID_BITS (CR3_HW_ASID_BITS - PTI_CONSUMED_ASID_BITS)
1407     /*
1408     * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account
1409     - * for them being zero-based. Another -1 is because ASID 0 is reserved for
1410     + * for them being zero-based. Another -1 is because PCID 0 is reserved for
1411     * use by non-PCID-aware users.
1412     */
1413     -#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_ASID_BITS) - 2)
1414     +#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
1415     +
1416     +/*
1417     + * 6 because 6 should be plenty and struct tlb_state will fit in two cache
1418     + * lines.
1419     + */
1420     +#define TLB_NR_DYN_ASIDS 6
1421    
1422     +/*
1423     + * Given @asid, compute kPCID
1424     + */
1425     static inline u16 kern_pcid(u16 asid)
1426     {
1427     VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
1428     +
1429     +#ifdef CONFIG_PAGE_TABLE_ISOLATION
1430     + /*
1431     + * Make sure that the dynamic ASID space does not confict with the
1432     + * bit we are using to switch between user and kernel ASIDs.
1433     + */
1434     + BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_SWITCH_BIT));
1435     +
1436     /*
1437     + * The ASID being passed in here should have respected the
1438     + * MAX_ASID_AVAILABLE and thus never have the switch bit set.
1439     + */
1440     + VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_SWITCH_BIT));
1441     +#endif
1442     + /*
1443     + * The dynamically-assigned ASIDs that get passed in are small
1444     + * (<TLB_NR_DYN_ASIDS). They never have the high switch bit set,
1445     + * so do not bother to clear it.
1446     + *
1447     * If PCID is on, ASID-aware code paths put the ASID+1 into the
1448     * PCID bits. This serves two purposes. It prevents a nasty
1449     * situation in which PCID-unaware code saves CR3, loads some other
1450     @@ -53,6 +105,18 @@ static inline u16 kern_pcid(u16 asid)
1451     return asid + 1;
1452     }
1453    
1454     +/*
1455     + * Given @asid, compute uPCID
1456     + */
1457     +static inline u16 user_pcid(u16 asid)
1458     +{
1459     + u16 ret = kern_pcid(asid);
1460     +#ifdef CONFIG_PAGE_TABLE_ISOLATION
1461     + ret |= 1 << X86_CR3_PTI_SWITCH_BIT;
1462     +#endif
1463     + return ret;
1464     +}
1465     +
1466     struct pgd_t;
1467     static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
1468     {
1469     @@ -95,12 +159,6 @@ static inline bool tlb_defer_switch_to_init_mm(void)
1470     return !static_cpu_has(X86_FEATURE_PCID);
1471     }
1472    
1473     -/*
1474     - * 6 because 6 should be plenty and struct tlb_state will fit in
1475     - * two cache lines.
1476     - */
1477     -#define TLB_NR_DYN_ASIDS 6
1478     -
1479     struct tlb_context {
1480     u64 ctx_id;
1481     u64 tlb_gen;
1482     @@ -134,6 +192,24 @@ struct tlb_state {
1483     */
1484     bool is_lazy;
1485    
1486     + /*
1487     + * If set we changed the page tables in such a way that we
1488     + * needed an invalidation of all contexts (aka. PCIDs / ASIDs).
1489     + * This tells us to go invalidate all the non-loaded ctxs[]
1490     + * on the next context switch.
1491     + *
1492     + * The current ctx was kept up-to-date as it ran and does not
1493     + * need to be invalidated.
1494     + */
1495     + bool invalidate_other;
1496     +
1497     + /*
1498     + * Mask that contains TLB_NR_DYN_ASIDS+1 bits to indicate
1499     + * the corresponding user PCID needs a flush next time we
1500     + * switch to it; see SWITCH_TO_USER_CR3.
1501     + */
1502     + unsigned short user_pcid_flush_mask;
1503     +
1504     /*
1505     * Access to this CR4 shadow and to H/W CR4 is protected by
1506     * disabling interrupts when modifying either one.
1507     @@ -211,6 +287,14 @@ static inline unsigned long cr4_read_shadow(void)
1508     return this_cpu_read(cpu_tlbstate.cr4);
1509     }
1510    
1511     +/*
1512     + * Mark all other ASIDs as invalid, preserves the current.
1513     + */
1514     +static inline void invalidate_other_asid(void)
1515     +{
1516     + this_cpu_write(cpu_tlbstate.invalidate_other, true);
1517     +}
1518     +
1519     /*
1520     * Save some of cr4 feature set we're using (e.g. Pentium 4MB
1521     * enable and PPro Global page enable), so that any CPU's that boot
1522     @@ -230,19 +314,48 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
1523    
1524     extern void initialize_tlbstate_and_flush(void);
1525    
1526     +/*
1527     + * Given an ASID, flush the corresponding user ASID. We can delay this
1528     + * until the next time we switch to it.
1529     + *
1530     + * See SWITCH_TO_USER_CR3.
1531     + */
1532     +static inline void invalidate_user_asid(u16 asid)
1533     +{
1534     + /* There is no user ASID if address space separation is off */
1535     + if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
1536     + return;
1537     +
1538     + /*
1539     + * We only have a single ASID if PCID is off and the CR3
1540     + * write will have flushed it.
1541     + */
1542     + if (!cpu_feature_enabled(X86_FEATURE_PCID))
1543     + return;
1544     +
1545     + if (!static_cpu_has(X86_FEATURE_PTI))
1546     + return;
1547     +
1548     + __set_bit(kern_pcid(asid),
1549     + (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
1550     +}
1551     +
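
Note that invalidate_user_asid() never flushes anything on the spot: it only sets the kernel-PCID bit in user_pcid_flush_mask so the entry code flushes that user PCID the next time it is actually loaded. A rough single-CPU model of this producer/consumer pair, where need_flush_on_switch() stands in for the SWITCH_TO_USER_CR3 consumer in the entry assembly:

#include <stdbool.h>
#include <stdio.h>

static unsigned short user_pcid_flush_mask;

static void invalidate_user_asid(unsigned kpcid)
{
        user_pcid_flush_mask |= 1u << kpcid;            /* defer the flush */
}

static bool need_flush_on_switch(unsigned kpcid)
{
        bool pending = user_pcid_flush_mask & (1u << kpcid);

        user_pcid_flush_mask &= ~(1u << kpcid);         /* consume the request */
        return pending;
}

int main(void)
{
        invalidate_user_asid(1);
        printf("first switch to uPCID of kPCID 1: flush=%d\n", need_flush_on_switch(1));
        printf("next switch:                      flush=%d\n", need_flush_on_switch(1));
        return 0;
}
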
1552     /*
1553     * flush the entire current user mapping
1554     */
1555     static inline void __native_flush_tlb(void)
1556     {
1557     /*
1558     - * If current->mm == NULL then we borrow a mm which may change during a
1559     - * task switch and therefore we must not be preempted while we write CR3
1560     - * back:
1561     + * Preemption or interrupts must be disabled to protect the access
1562     + * to the per CPU variable and to prevent being preempted between
1563     + * read_cr3() and write_cr3().
1564     */
1565     - preempt_disable();
1566     + WARN_ON_ONCE(preemptible());
1567     +
1568     + invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid));
1569     +
1570     + /* If current->mm == NULL then the read_cr3() "borrows" an mm */
1571     native_write_cr3(__native_read_cr3());
1572     - preempt_enable();
1573     }
1574    
1575     /*
1576     @@ -256,6 +369,8 @@ static inline void __native_flush_tlb_global(void)
1577     /*
1578     * Using INVPCID is considerably faster than a pair of writes
1579     * to CR4 sandwiched inside an IRQ flag save/restore.
1580     + *
1581     + * Note, this works with CR4.PCIDE=0 or 1.
1582     */
1583     invpcid_flush_all();
1584     return;
1585     @@ -282,7 +397,21 @@ static inline void __native_flush_tlb_global(void)
1586     */
1587     static inline void __native_flush_tlb_single(unsigned long addr)
1588     {
1589     + u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
1590     +
1591     asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
1592     +
1593     + if (!static_cpu_has(X86_FEATURE_PTI))
1594     + return;
1595     +
1596     + /*
1597     + * Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1.
1598     + * Just use invalidate_user_asid() in case we are called early.
1599     + */
1600     + if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE))
1601     + invalidate_user_asid(loaded_mm_asid);
1602     + else
1603     + invpcid_flush_one(user_pcid(loaded_mm_asid), addr);
1604     }
1605    
1606     /*
1607     @@ -298,14 +427,6 @@ static inline void __flush_tlb_all(void)
1608     */
1609     __flush_tlb();
1610     }
1611     -
1612     - /*
1613     - * Note: if we somehow had PCID but not PGE, then this wouldn't work --
1614     - * we'd end up flushing kernel translations for the current ASID but
1615     - * we might fail to flush kernel translations for other cached ASIDs.
1616     - *
1617     - * To avoid this issue, we force PCID off if PGE is off.
1618     - */
1619     }
1620    
1621     /*
1622     @@ -315,6 +436,16 @@ static inline void __flush_tlb_one(unsigned long addr)
1623     {
1624     count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
1625     __flush_tlb_single(addr);
1626     +
1627     + if (!static_cpu_has(X86_FEATURE_PTI))
1628     + return;
1629     +
1630     + /*
1631     + * __flush_tlb_single() will have cleared the TLB entry for this ASID,
1632     + * but since kernel space is replicated across all ASIDs, we must also
1633     + * invalidate all others.
1634     + */
1635     + invalidate_other_asid();
1636     }
1637    
1638     #define TLB_FLUSH_ALL -1UL
1639     @@ -375,6 +506,17 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
1640     void native_flush_tlb_others(const struct cpumask *cpumask,
1641     const struct flush_tlb_info *info);
1642    
1643     +static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
1644     +{
1645     + /*
1646     + * Bump the generation count. This also serves as a full barrier
1647     + * that synchronizes with switch_mm(): callers are required to order
1648     + * their read of mm_cpumask after their writes to the paging
1649     + * structures.
1650     + */
1651     + return atomic64_inc_return(&mm->context.tlb_gen);
1652     +}
1653     +
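
inc_mm_tlb_gen() gives each mm a monotonically increasing TLB generation: a writer bumps it after updating the page tables, and a CPU that still remembers an older generation knows it must flush before relying on its cached translations. A simplified single-CPU illustration with C11 atomics standing in for atomic64_inc_return():

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint64_t mm_tlb_gen = 1;         /* per-mm generation */
static uint64_t cpu_seen_gen = 1;               /* what this CPU has caught up to */

static uint64_t inc_mm_tlb_gen(void)
{
        return atomic_fetch_add(&mm_tlb_gen, 1) + 1;
}

int main(void)
{
        uint64_t new_gen = inc_mm_tlb_gen();    /* page tables just changed */

        if (cpu_seen_gen < new_gen) {           /* stale: a flush is due */
                printf("flush: gen %llu -> %llu\n",
                       (unsigned long long)cpu_seen_gen,
                       (unsigned long long)new_gen);
                cpu_seen_gen = new_gen;
        }
        return 0;
}
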
1654     static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
1655     struct mm_struct *mm)
1656     {
1657     diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
1658     index d9a7c659009c..b986b2ca688a 100644
1659     --- a/arch/x86/include/asm/vsyscall.h
1660     +++ b/arch/x86/include/asm/vsyscall.h
1661     @@ -7,6 +7,7 @@
1662    
1663     #ifdef CONFIG_X86_VSYSCALL_EMULATION
1664     extern void map_vsyscall(void);
1665     +extern void set_vsyscall_pgtable_user_bits(pgd_t *root);
1666    
1667     /*
1668     * Called on instruction fetch fault in vsyscall page.
1669     diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
1670     index 53b4ca55ebb6..97abdaab9535 100644
1671     --- a/arch/x86/include/uapi/asm/processor-flags.h
1672     +++ b/arch/x86/include/uapi/asm/processor-flags.h
1673     @@ -78,7 +78,12 @@
1674     #define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT)
1675     #define X86_CR3_PCD_BIT 4 /* Page Cache Disable */
1676     #define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT)
1677     -#define X86_CR3_PCID_MASK _AC(0x00000fff,UL) /* PCID Mask */
1678     +
1679     +#define X86_CR3_PCID_BITS 12
1680     +#define X86_CR3_PCID_MASK (_AC((1UL << X86_CR3_PCID_BITS) - 1, UL))
1681     +
1682     +#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
1683     +#define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT)
1684    
1685     /*
1686     * Intel CPU features in CR4
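
With the PCID field spelled out as X86_CR3_PCID_BITS and the new NOFLUSH bit defined, a CR3 value is the page-table base plus a PCID in the low 12 bits, with bit 63 optionally asking the CPU to keep that PCID's TLB entries on load. A small sketch of that composition; the physical address is made up, and the kernel itself builds these values via build_cr3()/build_cr3_noflush():

#include <stdint.h>
#include <stdio.h>

#define X86_CR3_PCID_BITS    12
#define X86_CR3_PCID_MASK    ((1ULL << X86_CR3_PCID_BITS) - 1)
#define X86_CR3_PCID_NOFLUSH (1ULL << 63)

static uint64_t make_cr3(uint64_t pgd_pa, unsigned pcid, int noflush)
{
        uint64_t cr3 = (pgd_pa & ~X86_CR3_PCID_MASK) | (pcid & X86_CR3_PCID_MASK);

        return noflush ? cr3 | X86_CR3_PCID_NOFLUSH : cr3;
}

int main(void)
{
        /* illustrative page-table base, kPCID 1, "don't flush" requested */
        printf("%#llx\n", (unsigned long long)make_cr3(0x1234000, 1, 1));
        return 0;
}
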
1687     diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
1688     index 676b7cf4b62b..76417a9aab73 100644
1689     --- a/arch/x86/kernel/asm-offsets.c
1690     +++ b/arch/x86/kernel/asm-offsets.c
1691     @@ -17,6 +17,7 @@
1692     #include <asm/sigframe.h>
1693     #include <asm/bootparam.h>
1694     #include <asm/suspend.h>
1695     +#include <asm/tlbflush.h>
1696    
1697     #ifdef CONFIG_XEN
1698     #include <xen/interface/xen.h>
1699     @@ -94,6 +95,9 @@ void common(void) {
1700     BLANK();
1701     DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
1702    
1703     + /* TLB state for the entry code */
1704     + OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask);
1705     +
1706     /* Layout info for cpu_entry_area */
1707     OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
1708     OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
1709     diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
1710     index 8ddcfa4d4165..f2a94dfb434e 100644
1711     --- a/arch/x86/kernel/cpu/common.c
1712     +++ b/arch/x86/kernel/cpu/common.c
1713     @@ -898,6 +898,10 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
1714     }
1715    
1716     setup_force_cpu_cap(X86_FEATURE_ALWAYS);
1717     +
1718     + /* Assume for now that ALL x86 CPUs are insecure */
1719     + setup_force_cpu_bug(X86_BUG_CPU_INSECURE);
1720     +
1721     fpu__init_system(c);
1722    
1723     #ifdef CONFIG_X86_32
1724     @@ -1335,7 +1339,10 @@ void syscall_init(void)
1725     (entry_SYSCALL_64_trampoline - _entry_trampoline);
1726    
1727     wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
1728     - wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
1729     + if (static_cpu_has(X86_FEATURE_PTI))
1730     + wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
1731     + else
1732     + wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
1733    
1734     #ifdef CONFIG_IA32_EMULATION
1735     wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
1736     diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
1737     index 36b17e0febe8..5fa110699ed2 100644
1738     --- a/arch/x86/kernel/dumpstack.c
1739     +++ b/arch/x86/kernel/dumpstack.c
1740     @@ -297,11 +297,13 @@ int __die(const char *str, struct pt_regs *regs, long err)
1741     unsigned long sp;
1742     #endif
1743     printk(KERN_DEFAULT
1744     - "%s: %04lx [#%d]%s%s%s%s\n", str, err & 0xffff, ++die_counter,
1745     + "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter,
1746     IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "",
1747     IS_ENABLED(CONFIG_SMP) ? " SMP" : "",
1748     debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "",
1749     - IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "");
1750     + IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "",
1751     + IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ?
1752     + (boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : "");
1753    
1754     if (notify_die(DIE_OOPS, str, regs, err,
1755     current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP)
1756     diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
1757     index 7dca675fe78d..04a625f0fcda 100644
1758     --- a/arch/x86/kernel/head_64.S
1759     +++ b/arch/x86/kernel/head_64.S
1760     @@ -341,6 +341,27 @@ GLOBAL(early_recursion_flag)
1761     .balign PAGE_SIZE; \
1762     GLOBAL(name)
1763    
1764     +#ifdef CONFIG_PAGE_TABLE_ISOLATION
1765     +/*
1766     + * Each PGD needs to be 8k long and 8k aligned. We do not
1767     + * ever go out to userspace with these, so we do not
1768     + * strictly *need* the second page, but this allows us to
1769     + * have a single set_pgd() implementation that does not
1770     + * need to worry about whether it has 4k or 8k to work
1771     + * with.
1772     + *
1773     + * This ensures PGDs are 8k long:
1774     + */
1775     +#define PTI_USER_PGD_FILL 512
1776     +/* This ensures they are 8k-aligned: */
1777     +#define NEXT_PGD_PAGE(name) \
1778     + .balign 2 * PAGE_SIZE; \
1779     +GLOBAL(name)
1780     +#else
1781     +#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
1782     +#define PTI_USER_PGD_FILL 0
1783     +#endif
1784     +
1785     /* Automate the creation of 1 to 1 mapping pmd entries */
1786     #define PMDS(START, PERM, COUNT) \
1787     i = 0 ; \
1788     @@ -350,13 +371,14 @@ GLOBAL(name)
1789     .endr
1790    
1791     __INITDATA
1792     -NEXT_PAGE(early_top_pgt)
1793     +NEXT_PGD_PAGE(early_top_pgt)
1794     .fill 511,8,0
1795     #ifdef CONFIG_X86_5LEVEL
1796     .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
1797     #else
1798     .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
1799     #endif
1800     + .fill PTI_USER_PGD_FILL,8,0
1801    
1802     NEXT_PAGE(early_dynamic_pgts)
1803     .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
1804     @@ -364,13 +386,14 @@ NEXT_PAGE(early_dynamic_pgts)
1805     .data
1806    
1807     #if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH)
1808     -NEXT_PAGE(init_top_pgt)
1809     +NEXT_PGD_PAGE(init_top_pgt)
1810     .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
1811     .org init_top_pgt + PGD_PAGE_OFFSET*8, 0
1812     .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
1813     .org init_top_pgt + PGD_START_KERNEL*8, 0
1814     /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
1815     .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
1816     + .fill PTI_USER_PGD_FILL,8,0
1817    
1818     NEXT_PAGE(level3_ident_pgt)
1819     .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
1820     @@ -381,8 +404,9 @@ NEXT_PAGE(level2_ident_pgt)
1821     */
1822     PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
1823     #else
1824     -NEXT_PAGE(init_top_pgt)
1825     +NEXT_PGD_PAGE(init_top_pgt)
1826     .fill 512,8,0
1827     + .fill PTI_USER_PGD_FILL,8,0
1828     #endif
1829    
1830     #ifdef CONFIG_X86_5LEVEL
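
A rough picture of why the PGD allocation above grows to 8 KiB: with PTI, the kernel's top-level table and the user-visible copy occupy two adjacent 4 KiB pages of one 8 KiB-aligned allocation, so either copy can be reached from the other by stepping one page. The helper name below mirrors kernel_to_user_pgdp(), which is defined elsewhere in this series, so treat the exact layout as an assumption rather than something shown in this hunk:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096ULL

static uint64_t kernel_to_user_pgd(uint64_t kernel_pgd)
{
        return kernel_pgd + PAGE_SIZE;  /* second page of the 8 KiB pair */
}

int main(void)
{
        uint64_t kernel_pgd = 0x100000; /* illustrative, 8 KiB aligned */

        printf("kernel PGD %#llx -> user PGD %#llx\n",
               (unsigned long long)kernel_pgd,
               (unsigned long long)kernel_to_user_pgd(kernel_pgd));
        return 0;
}
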
1831     diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
1832     index a6b5d62f45a7..26d713ecad34 100644
1833     --- a/arch/x86/kernel/ldt.c
1834     +++ b/arch/x86/kernel/ldt.c
1835     @@ -24,6 +24,7 @@
1836     #include <linux/uaccess.h>
1837    
1838     #include <asm/ldt.h>
1839     +#include <asm/tlb.h>
1840     #include <asm/desc.h>
1841     #include <asm/mmu_context.h>
1842     #include <asm/syscalls.h>
1843     @@ -51,13 +52,11 @@ static void refresh_ldt_segments(void)
1844     static void flush_ldt(void *__mm)
1845     {
1846     struct mm_struct *mm = __mm;
1847     - mm_context_t *pc;
1848    
1849     if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
1850     return;
1851    
1852     - pc = &mm->context;
1853     - set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
1854     + load_mm_ldt(mm);
1855    
1856     refresh_ldt_segments();
1857     }
1858     @@ -94,10 +93,126 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
1859     return NULL;
1860     }
1861    
1862     + /* The new LDT isn't aliased for PTI yet. */
1863     + new_ldt->slot = -1;
1864     +
1865     new_ldt->nr_entries = num_entries;
1866     return new_ldt;
1867     }
1868    
1869     +/*
1870     + * If PTI is enabled, this maps the LDT into the kernelmode and
1871     + * usermode tables for the given mm.
1872     + *
1873     + * There is no corresponding unmap function. Even if the LDT is freed, we
1874     + * leave the PTEs around until the slot is reused or the mm is destroyed.
1875     + * This is harmless: the LDT is always in ordinary memory, and no one will
1876     + * access the freed slot.
1877     + *
1878     + * If we wanted to unmap freed LDTs, we'd also need to do a flush to make
1879     + * it useful, and the flush would slow down modify_ldt().
1880     + */
1881     +static int
1882     +map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
1883     +{
1884     +#ifdef CONFIG_PAGE_TABLE_ISOLATION
1885     + bool is_vmalloc, had_top_level_entry;
1886     + unsigned long va;
1887     + spinlock_t *ptl;
1888     + pgd_t *pgd;
1889     + int i;
1890     +
1891     + if (!static_cpu_has(X86_FEATURE_PTI))
1892     + return 0;
1893     +
1894     + /*
1895     + * Any given ldt_struct should have map_ldt_struct() called at most
1896     + * once.
1897     + */
1898     + WARN_ON(ldt->slot != -1);
1899     +
1900     + /*
1901     + * Did we already have the top level entry allocated? We can't
1902     + * use pgd_none() for this because it doesn't do anything on
1903     + * 4-level page table kernels.
1904     + */
1905     + pgd = pgd_offset(mm, LDT_BASE_ADDR);
1906     + had_top_level_entry = (pgd->pgd != 0);
1907     +
1908     + is_vmalloc = is_vmalloc_addr(ldt->entries);
1909     +
1910     + for (i = 0; i * PAGE_SIZE < ldt->nr_entries * LDT_ENTRY_SIZE; i++) {
1911     + unsigned long offset = i << PAGE_SHIFT;
1912     + const void *src = (char *)ldt->entries + offset;
1913     + unsigned long pfn;
1914     + pte_t pte, *ptep;
1915     +
1916     + va = (unsigned long)ldt_slot_va(slot) + offset;
1917     + pfn = is_vmalloc ? vmalloc_to_pfn(src) :
1918     + page_to_pfn(virt_to_page(src));
1919     + /*
1920     + * Treat the PTI LDT range as a *userspace* range.
1921     + * get_locked_pte() will allocate all needed pagetables
1922     + * and account for them in this mm.
1923     + */
1924     + ptep = get_locked_pte(mm, va, &ptl);
1925     + if (!ptep)
1926     + return -ENOMEM;
1927     + /*
1928     + * Map it RO so the easy to find address is not a primary
1929     + * Map it RO so the easy-to-find address is not a primary
1930     + * permission check.
1931     + */
1932     + pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL));
1933     + set_pte_at(mm, va, ptep, pte);
1934     + pte_unmap_unlock(ptep, ptl);
1935     + }
1936     +
1937     + if (mm->context.ldt) {
1938     + /*
1939     + * We already had an LDT. The top-level entry should already
1940     + * have been allocated and synchronized with the usermode
1941     + * tables.
1942     + */
1943     + WARN_ON(!had_top_level_entry);
1944     + if (static_cpu_has(X86_FEATURE_PTI))
1945     + WARN_ON(!kernel_to_user_pgdp(pgd)->pgd);
1946     + } else {
1947     + /*
1948     + * This is the first time we're mapping an LDT for this process.
1949     + * Sync the pgd to the usermode tables.
1950     + */
1951     + WARN_ON(had_top_level_entry);
1952     + if (static_cpu_has(X86_FEATURE_PTI)) {
1953     + WARN_ON(kernel_to_user_pgdp(pgd)->pgd);
1954     + set_pgd(kernel_to_user_pgdp(pgd), *pgd);
1955     + }
1956     + }
1957     +
1958     + va = (unsigned long)ldt_slot_va(slot);
1959     + flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0);
1960     +
1961     + ldt->slot = slot;
1962     +#endif
1963     + return 0;
1964     +}
1965     +
1966     +static void free_ldt_pgtables(struct mm_struct *mm)
1967     +{
1968     +#ifdef CONFIG_PAGE_TABLE_ISOLATION
1969     + struct mmu_gather tlb;
1970     + unsigned long start = LDT_BASE_ADDR;
1971     + unsigned long end = start + (1UL << PGDIR_SHIFT);
1972     +
1973     + if (!static_cpu_has(X86_FEATURE_PTI))
1974     + return;
1975     +
1976     + tlb_gather_mmu(&tlb, mm, start, end);
1977     + free_pgd_range(&tlb, start, end, start, end);
1978     + tlb_finish_mmu(&tlb, start, end);
1979     +#endif
1980     +}
1981     +
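
The mapping loop in map_ldt_struct() installs one PTE per page of the LDT, so the number of pages it touches is just the entry count rounded up to whole pages (each descriptor is 8 bytes). A quick check of that arithmetic:

#include <stdio.h>

#define PAGE_SIZE      4096UL
#define LDT_ENTRY_SIZE 8UL

static unsigned long ldt_pages(unsigned long nr_entries)
{
        unsigned long bytes = nr_entries * LDT_ENTRY_SIZE;

        return (bytes + PAGE_SIZE - 1) / PAGE_SIZE;     /* round up */
}

int main(void)
{
        printf("1 entry      -> %lu page(s)\n", ldt_pages(1));    /* 1  */
        printf("8192 entries -> %lu page(s)\n", ldt_pages(8192)); /* 16 */
        return 0;
}
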
1982     /* After calling this, the LDT is immutable. */
1983     static void finalize_ldt_struct(struct ldt_struct *ldt)
1984     {
1985     @@ -156,6 +271,12 @@ int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm)
1986     new_ldt->nr_entries * LDT_ENTRY_SIZE);
1987     finalize_ldt_struct(new_ldt);
1988    
1989     + retval = map_ldt_struct(mm, new_ldt, 0);
1990     + if (retval) {
1991     + free_ldt_pgtables(mm);
1992     + free_ldt_struct(new_ldt);
1993     + goto out_unlock;
1994     + }
1995     mm->context.ldt = new_ldt;
1996    
1997     out_unlock:
1998     @@ -174,6 +295,11 @@ void destroy_context_ldt(struct mm_struct *mm)
1999     mm->context.ldt = NULL;
2000     }
2001    
2002     +void ldt_arch_exit_mmap(struct mm_struct *mm)
2003     +{
2004     + free_ldt_pgtables(mm);
2005     +}
2006     +
2007     static int read_ldt(void __user *ptr, unsigned long bytecount)
2008     {
2009     struct mm_struct *mm = current->mm;
2010     @@ -287,6 +413,25 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
2011     new_ldt->entries[ldt_info.entry_number] = ldt;
2012     finalize_ldt_struct(new_ldt);
2013    
2014     + /*
2015     + * If we are using PTI, map the new LDT into the userspace pagetables.
2016     + * If there is already an LDT, use the other slot so that other CPUs
2017     + * will continue to use the old LDT until install_ldt() switches
2018     + * them over to the new LDT.
2019     + */
2020     + error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0);
2021     + if (error) {
2022     + /*
2023     + * This can only fail for the first LDT setup. If an LDT is
2024     + * already installed then the PTE page is already
2025     + * populated. Mop up a half-populated page table.
2026     + */
2027     + if (!WARN_ON_ONCE(old_ldt))
2028     + free_ldt_pgtables(mm);
2029     + free_ldt_struct(new_ldt);
2030     + goto out_unlock;
2031     + }
2032     +
2033     install_ldt(mm, new_ldt);
2034     free_ldt_struct(old_ldt);
2035     error = 0;
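
The slot passed to map_ldt_struct() above alternates between 0 and 1: the first LDT of a process goes into slot 0, and each replacement goes into whichever slot the currently installed LDT does not occupy, so CPUs still running on the old mapping stay consistent until install_ldt() moves them over. In miniature:

#include <stdio.h>

/* -1 means "no LDT installed yet", matching new_ldt->slot = -1 above */
static int next_ldt_slot(int old_slot)
{
        return old_slot < 0 ? 0 : !old_slot;
}

int main(void)
{
        printf("%d %d %d\n",
               next_ldt_slot(-1),       /* first LDT -> 0 */
               next_ldt_slot(0),        /* replace   -> 1 */
               next_ldt_slot(1));       /* replace   -> 0 */
        return 0;
}
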
2036     diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
2037     index 00bc751c861c..edfede768688 100644
2038     --- a/arch/x86/kernel/machine_kexec_32.c
2039     +++ b/arch/x86/kernel/machine_kexec_32.c
2040     @@ -48,8 +48,6 @@ static void load_segments(void)
2041     "\tmovl $"STR(__KERNEL_DS)",%%eax\n"
2042     "\tmovl %%eax,%%ds\n"
2043     "\tmovl %%eax,%%es\n"
2044     - "\tmovl %%eax,%%fs\n"
2045     - "\tmovl %%eax,%%gs\n"
2046     "\tmovl %%eax,%%ss\n"
2047     : : : "eax", "memory");
2048     #undef STR
2049     @@ -232,8 +230,8 @@ void machine_kexec(struct kimage *image)
2050     * The gdt & idt are now invalid.
2051     * If you want to load them you must set up your own idt & gdt.
2052     */
2053     - set_gdt(phys_to_virt(0), 0);
2054     idt_invalidate(phys_to_virt(0));
2055     + set_gdt(phys_to_virt(0), 0);
2056    
2057     /* now call it */
2058     image->start = relocate_kernel_ptr((unsigned long)image->head,
2059     diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
2060     index 12bf07d44dfe..2651ca2112c4 100644
2061     --- a/arch/x86/kernel/smpboot.c
2062     +++ b/arch/x86/kernel/smpboot.c
2063     @@ -128,25 +128,16 @@ static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
2064     spin_lock_irqsave(&rtc_lock, flags);
2065     CMOS_WRITE(0xa, 0xf);
2066     spin_unlock_irqrestore(&rtc_lock, flags);
2067     - local_flush_tlb();
2068     - pr_debug("1.\n");
2069     *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) =
2070     start_eip >> 4;
2071     - pr_debug("2.\n");
2072     *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) =
2073     start_eip & 0xf;
2074     - pr_debug("3.\n");
2075     }
2076    
2077     static inline void smpboot_restore_warm_reset_vector(void)
2078     {
2079     unsigned long flags;
2080    
2081     - /*
2082     - * Install writable page 0 entry to set BIOS data area.
2083     - */
2084     - local_flush_tlb();
2085     -
2086     /*
2087     * Paranoid: Set warm reset code and vector here back
2088     * to default values.
2089     diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
2090     index 9a9c9b076955..a5b802a12212 100644
2091     --- a/arch/x86/kernel/tls.c
2092     +++ b/arch/x86/kernel/tls.c
2093     @@ -93,17 +93,10 @@ static void set_tls_desc(struct task_struct *p, int idx,
2094     cpu = get_cpu();
2095    
2096     while (n-- > 0) {
2097     - if (LDT_empty(info) || LDT_zero(info)) {
2098     + if (LDT_empty(info) || LDT_zero(info))
2099     memset(desc, 0, sizeof(*desc));
2100     - } else {
2101     + else
2102     fill_ldt(desc, info);
2103     -
2104     - /*
2105     - * Always set the accessed bit so that the CPU
2106     - * doesn't try to write to the (read-only) GDT.
2107     - */
2108     - desc->type |= 1;
2109     - }
2110     ++info;
2111     ++desc;
2112     }
2113     diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
2114     index 7c16fe0b60c2..b33e860d32fe 100644
2115     --- a/arch/x86/kernel/traps.c
2116     +++ b/arch/x86/kernel/traps.c
2117     @@ -361,7 +361,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
2118     *
2119     * No need for ist_enter here because we don't use RCU.
2120     */
2121     - if (((long)regs->sp >> PGDIR_SHIFT) == ESPFIX_PGD_ENTRY &&
2122     + if (((long)regs->sp >> P4D_SHIFT) == ESPFIX_PGD_ENTRY &&
2123     regs->cs == __KERNEL_CS &&
2124     regs->ip == (unsigned long)native_irq_return_iret)
2125     {
2126     diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
2127     index d2a8b5a24a44..1e413a9326aa 100644
2128     --- a/arch/x86/kernel/vmlinux.lds.S
2129     +++ b/arch/x86/kernel/vmlinux.lds.S
2130     @@ -61,11 +61,17 @@ jiffies_64 = jiffies;
2131     . = ALIGN(HPAGE_SIZE); \
2132     __end_rodata_hpage_align = .;
2133    
2134     +#define ALIGN_ENTRY_TEXT_BEGIN . = ALIGN(PMD_SIZE);
2135     +#define ALIGN_ENTRY_TEXT_END . = ALIGN(PMD_SIZE);
2136     +
2137     #else
2138    
2139     #define X64_ALIGN_RODATA_BEGIN
2140     #define X64_ALIGN_RODATA_END
2141    
2142     +#define ALIGN_ENTRY_TEXT_BEGIN
2143     +#define ALIGN_ENTRY_TEXT_END
2144     +
2145     #endif
2146    
2147     PHDRS {
2148     @@ -102,8 +108,10 @@ SECTIONS
2149     CPUIDLE_TEXT
2150     LOCK_TEXT
2151     KPROBES_TEXT
2152     + ALIGN_ENTRY_TEXT_BEGIN
2153     ENTRY_TEXT
2154     IRQENTRY_TEXT
2155     + ALIGN_ENTRY_TEXT_END
2156     SOFTIRQENTRY_TEXT
2157     *(.fixup)
2158     *(.gnu.warning)
2159     diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
2160     index 2e0017af8f9b..52906808e277 100644
2161     --- a/arch/x86/mm/Makefile
2162     +++ b/arch/x86/mm/Makefile
2163     @@ -43,9 +43,10 @@ obj-$(CONFIG_AMD_NUMA) += amdtopology.o
2164     obj-$(CONFIG_ACPI_NUMA) += srat.o
2165     obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
2166    
2167     -obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
2168     -obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
2169     -obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
2170     +obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
2171     +obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
2172     +obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
2173     +obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o
2174    
2175     obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o
2176     obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o
2177     diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
2178     index fe814fd5e014..b9283cc27622 100644
2179     --- a/arch/x86/mm/cpu_entry_area.c
2180     +++ b/arch/x86/mm/cpu_entry_area.c
2181     @@ -38,6 +38,32 @@ cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot)
2182     cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot);
2183     }
2184    
2185     +static void percpu_setup_debug_store(int cpu)
2186     +{
2187     +#ifdef CONFIG_CPU_SUP_INTEL
2188     + int npages;
2189     + void *cea;
2190     +
2191     + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
2192     + return;
2193     +
2194     + cea = &get_cpu_entry_area(cpu)->cpu_debug_store;
2195     + npages = sizeof(struct debug_store) / PAGE_SIZE;
2196     + BUILD_BUG_ON(sizeof(struct debug_store) % PAGE_SIZE != 0);
2197     + cea_map_percpu_pages(cea, &per_cpu(cpu_debug_store, cpu), npages,
2198     + PAGE_KERNEL);
2199     +
2200     + cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers;
2201     + /*
2202     + * Force the population of PMDs for not-yet-allocated per-CPU
2203     + * memory like debug store buffers.
2204     + */
2205     + npages = sizeof(struct debug_store_buffers) / PAGE_SIZE;
2206     + for (; npages; npages--, cea += PAGE_SIZE)
2207     + cea_set_pte(cea, 0, PAGE_NONE);
2208     +#endif
2209     +}
2210     +
2211     /* Setup the fixmap mappings only once per-processor */
2212     static void __init setup_cpu_entry_area(int cpu)
2213     {
2214     @@ -109,6 +135,7 @@ static void __init setup_cpu_entry_area(int cpu)
2215     cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline,
2216     __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
2217     #endif
2218     + percpu_setup_debug_store(cpu);
2219     }
2220    
2221     static __init void setup_cpu_entry_area_ptes(void)
2222     diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c
2223     index bfcffdf6c577..421f2664ffa0 100644
2224     --- a/arch/x86/mm/debug_pagetables.c
2225     +++ b/arch/x86/mm/debug_pagetables.c
2226     @@ -5,7 +5,7 @@
2227    
2228     static int ptdump_show(struct seq_file *m, void *v)
2229     {
2230     - ptdump_walk_pgd_level(m, NULL);
2231     + ptdump_walk_pgd_level_debugfs(m, NULL, false);
2232     return 0;
2233     }
2234    
2235     @@ -22,21 +22,89 @@ static const struct file_operations ptdump_fops = {
2236     .release = single_release,
2237     };
2238    
2239     -static struct dentry *pe;
2240     +static int ptdump_show_curknl(struct seq_file *m, void *v)
2241     +{
2242     + if (current->mm->pgd) {
2243     + down_read(&current->mm->mmap_sem);
2244     + ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, false);
2245     + up_read(&current->mm->mmap_sem);
2246     + }
2247     + return 0;
2248     +}
2249     +
2250     +static int ptdump_open_curknl(struct inode *inode, struct file *filp)
2251     +{
2252     + return single_open(filp, ptdump_show_curknl, NULL);
2253     +}
2254     +
2255     +static const struct file_operations ptdump_curknl_fops = {
2256     + .owner = THIS_MODULE,
2257     + .open = ptdump_open_curknl,
2258     + .read = seq_read,
2259     + .llseek = seq_lseek,
2260     + .release = single_release,
2261     +};
2262     +
2263     +#ifdef CONFIG_PAGE_TABLE_ISOLATION
2264     +static struct dentry *pe_curusr;
2265     +
2266     +static int ptdump_show_curusr(struct seq_file *m, void *v)
2267     +{
2268     + if (current->mm->pgd) {
2269     + down_read(&current->mm->mmap_sem);
2270     + ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, true);
2271     + up_read(&current->mm->mmap_sem);
2272     + }
2273     + return 0;
2274     +}
2275     +
2276     +static int ptdump_open_curusr(struct inode *inode, struct file *filp)
2277     +{
2278     + return single_open(filp, ptdump_show_curusr, NULL);
2279     +}
2280     +
2281     +static const struct file_operations ptdump_curusr_fops = {
2282     + .owner = THIS_MODULE,
2283     + .open = ptdump_open_curusr,
2284     + .read = seq_read,
2285     + .llseek = seq_lseek,
2286     + .release = single_release,
2287     +};
2288     +#endif
2289     +
2290     +static struct dentry *dir, *pe_knl, *pe_curknl;
2291    
2292     static int __init pt_dump_debug_init(void)
2293     {
2294     - pe = debugfs_create_file("kernel_page_tables", S_IRUSR, NULL, NULL,
2295     - &ptdump_fops);
2296     - if (!pe)
2297     + dir = debugfs_create_dir("page_tables", NULL);
2298     + if (!dir)
2299     return -ENOMEM;
2300    
2301     + pe_knl = debugfs_create_file("kernel", 0400, dir, NULL,
2302     + &ptdump_fops);
2303     + if (!pe_knl)
2304     + goto err;
2305     +
2306     + pe_curknl = debugfs_create_file("current_kernel", 0400,
2307     + dir, NULL, &ptdump_curknl_fops);
2308     + if (!pe_curknl)
2309     + goto err;
2310     +
2311     +#ifdef CONFIG_PAGE_TABLE_ISOLATION
2312     + pe_curusr = debugfs_create_file("current_user", 0400,
2313     + dir, NULL, &ptdump_curusr_fops);
2314     + if (!pe_curusr)
2315     + goto err;
2316     +#endif
2317     return 0;
2318     +err:
2319     + debugfs_remove_recursive(dir);
2320     + return -ENOMEM;
2321     }
2322    
2323     static void __exit pt_dump_debug_exit(void)
2324     {
2325     - debugfs_remove_recursive(pe);
2326     + debugfs_remove_recursive(dir);
2327     }
2328    
2329     module_init(pt_dump_debug_init);
2330     diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
2331     index 43dedbfb7257..f56902c1f04b 100644
2332     --- a/arch/x86/mm/dump_pagetables.c
2333     +++ b/arch/x86/mm/dump_pagetables.c
2334     @@ -52,11 +52,17 @@ enum address_markers_idx {
2335     USER_SPACE_NR = 0,
2336     KERNEL_SPACE_NR,
2337     LOW_KERNEL_NR,
2338     +#if defined(CONFIG_MODIFY_LDT_SYSCALL) && defined(CONFIG_X86_5LEVEL)
2339     + LDT_NR,
2340     +#endif
2341     VMALLOC_START_NR,
2342     VMEMMAP_START_NR,
2343     #ifdef CONFIG_KASAN
2344     KASAN_SHADOW_START_NR,
2345     KASAN_SHADOW_END_NR,
2346     +#endif
2347     +#if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL)
2348     + LDT_NR,
2349     #endif
2350     CPU_ENTRY_AREA_NR,
2351     #ifdef CONFIG_X86_ESPFIX64
2352     @@ -81,6 +87,9 @@ static struct addr_marker address_markers[] = {
2353     #ifdef CONFIG_KASAN
2354     [KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" },
2355     [KASAN_SHADOW_END_NR] = { KASAN_SHADOW_END, "KASAN shadow end" },
2356     +#endif
2357     +#ifdef CONFIG_MODIFY_LDT_SYSCALL
2358     + [LDT_NR] = { LDT_BASE_ADDR, "LDT remap" },
2359     #endif
2360     [CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,"CPU entry Area" },
2361     #ifdef CONFIG_X86_ESPFIX64
2362     @@ -467,7 +476,7 @@ static inline bool is_hypervisor_range(int idx)
2363     }
2364    
2365     static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
2366     - bool checkwx)
2367     + bool checkwx, bool dmesg)
2368     {
2369     #ifdef CONFIG_X86_64
2370     pgd_t *start = (pgd_t *) &init_top_pgt;
2371     @@ -480,7 +489,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
2372    
2373     if (pgd) {
2374     start = pgd;
2375     - st.to_dmesg = true;
2376     + st.to_dmesg = dmesg;
2377     }
2378    
2379     st.check_wx = checkwx;
2380     @@ -518,13 +527,37 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
2381    
2382     void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
2383     {
2384     - ptdump_walk_pgd_level_core(m, pgd, false);
2385     + ptdump_walk_pgd_level_core(m, pgd, false, true);
2386     +}
2387     +
2388     +void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user)
2389     +{
2390     +#ifdef CONFIG_PAGE_TABLE_ISOLATION
2391     + if (user && static_cpu_has(X86_FEATURE_PTI))
2392     + pgd = kernel_to_user_pgdp(pgd);
2393     +#endif
2394     + ptdump_walk_pgd_level_core(m, pgd, false, false);
2395     +}
2396     +EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs);
2397     +
2398     +static void ptdump_walk_user_pgd_level_checkwx(void)
2399     +{
2400     +#ifdef CONFIG_PAGE_TABLE_ISOLATION
2401     + pgd_t *pgd = (pgd_t *) &init_top_pgt;
2402     +
2403     + if (!static_cpu_has(X86_FEATURE_PTI))
2404     + return;
2405     +
2406     + pr_info("x86/mm: Checking user space page tables\n");
2407     + pgd = kernel_to_user_pgdp(pgd);
2408     + ptdump_walk_pgd_level_core(NULL, pgd, true, false);
2409     +#endif
2410     }
2411     -EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level);
2412    
2413     void ptdump_walk_pgd_level_checkwx(void)
2414     {
2415     - ptdump_walk_pgd_level_core(NULL, NULL, true);
2416     + ptdump_walk_pgd_level_core(NULL, NULL, true, false);
2417     + ptdump_walk_user_pgd_level_checkwx();
2418     }
2419    
2420     static int __init pt_dump_init(void)
2421     diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
2422     index a22c2b95e513..80259ad8c386 100644
2423     --- a/arch/x86/mm/init.c
2424     +++ b/arch/x86/mm/init.c
2425     @@ -20,6 +20,7 @@
2426     #include <asm/kaslr.h>
2427     #include <asm/hypervisor.h>
2428     #include <asm/cpufeature.h>
2429     +#include <asm/pti.h>
2430    
2431     /*
2432     * We need to define the tracepoints somewhere, and tlb.c
2433     @@ -161,6 +162,12 @@ struct map_range {
2434    
2435     static int page_size_mask;
2436    
2437     +static void enable_global_pages(void)
2438     +{
2439     + if (!static_cpu_has(X86_FEATURE_PTI))
2440     + __supported_pte_mask |= _PAGE_GLOBAL;
2441     +}
2442     +
2443     static void __init probe_page_size_mask(void)
2444     {
2445     /*
2446     @@ -179,11 +186,11 @@ static void __init probe_page_size_mask(void)
2447     cr4_set_bits_and_update_boot(X86_CR4_PSE);
2448    
2449     /* Enable PGE if available */
2450     + __supported_pte_mask &= ~_PAGE_GLOBAL;
2451     if (boot_cpu_has(X86_FEATURE_PGE)) {
2452     cr4_set_bits_and_update_boot(X86_CR4_PGE);
2453     - __supported_pte_mask |= _PAGE_GLOBAL;
2454     - } else
2455     - __supported_pte_mask &= ~_PAGE_GLOBAL;
2456     + enable_global_pages();
2457     + }
2458    
2459     /* Enable 1 GB linear kernel mappings if available: */
2460     if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) {
2461     @@ -196,34 +203,44 @@ static void __init probe_page_size_mask(void)
2462    
2463     static void setup_pcid(void)
2464     {
2465     -#ifdef CONFIG_X86_64
2466     - if (boot_cpu_has(X86_FEATURE_PCID)) {
2467     - if (boot_cpu_has(X86_FEATURE_PGE)) {
2468     - /*
2469     - * This can't be cr4_set_bits_and_update_boot() --
2470     - * the trampoline code can't handle CR4.PCIDE and
2471     - * it wouldn't do any good anyway. Despite the name,
2472     - * cr4_set_bits_and_update_boot() doesn't actually
2473     - * cause the bits in question to remain set all the
2474     - * way through the secondary boot asm.
2475     - *
2476     - * Instead, we brute-force it and set CR4.PCIDE
2477     - * manually in start_secondary().
2478     - */
2479     - cr4_set_bits(X86_CR4_PCIDE);
2480     - } else {
2481     - /*
2482     - * flush_tlb_all(), as currently implemented, won't
2483     - * work if PCID is on but PGE is not. Since that
2484     - * combination doesn't exist on real hardware, there's
2485     - * no reason to try to fully support it, but it's
2486     - * polite to avoid corrupting data if we're on
2487     - * an improperly configured VM.
2488     - */
2489     - setup_clear_cpu_cap(X86_FEATURE_PCID);
2490     - }
2491     + if (!IS_ENABLED(CONFIG_X86_64))
2492     + return;
2493     +
2494     + if (!boot_cpu_has(X86_FEATURE_PCID))
2495     + return;
2496     +
2497     + if (boot_cpu_has(X86_FEATURE_PGE)) {
2498     + /*
2499     + * This can't be cr4_set_bits_and_update_boot() -- the
2500     + * trampoline code can't handle CR4.PCIDE and it wouldn't
2501     + * do any good anyway. Despite the name,
2502     + * cr4_set_bits_and_update_boot() doesn't actually cause
2503     + * the bits in question to remain set all the way through
2504     + * the secondary boot asm.
2505     + *
2506     + * Instead, we brute-force it and set CR4.PCIDE manually in
2507     + * start_secondary().
2508     + */
2509     + cr4_set_bits(X86_CR4_PCIDE);
2510     +
2511     + /*
2512     + * INVPCID's single-context modes (2/3) only work if we set
2513     + * X86_CR4_PCIDE, *and* we have INVPCID support. It's unusable
2514     + * on systems that have X86_CR4_PCIDE clear, or that have
2515     + * no INVPCID support at all.
2516     + */
2517     + if (boot_cpu_has(X86_FEATURE_INVPCID))
2518     + setup_force_cpu_cap(X86_FEATURE_INVPCID_SINGLE);
2519     + } else {
2520     + /*
2521     + * flush_tlb_all(), as currently implemented, won't work if
2522     + * PCID is on but PGE is not. Since that combination
2523     + * doesn't exist on real hardware, there's no reason to try
2524     + * to fully support it, but it's polite to avoid corrupting
2525     + * data if we're on an improperly configured VM.
2526     + */
2527     + setup_clear_cpu_cap(X86_FEATURE_PCID);
2528     }
2529     -#endif
2530     }
2531    
2532     #ifdef CONFIG_X86_32
2533     @@ -624,6 +641,7 @@ void __init init_mem_mapping(void)
2534     {
2535     unsigned long end;
2536    
2537     + pti_check_boottime_disable();
2538     probe_page_size_mask();
2539     setup_pcid();
2540    
2541     @@ -847,7 +865,7 @@ void __init zone_sizes_init(void)
2542     free_area_init_nodes(max_zone_pfns);
2543     }
2544    
2545     -DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
2546     +__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
2547     .loaded_mm = &init_mm,
2548     .next_asid = 1,
2549     .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
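
The probe_page_size_mask() change above is why PTI kernels run without global pages: _PAGE_GLOBAL is cleared unconditionally and only re-added when PGE exists and X86_FEATURE_PTI is not set, because global kernel TLB entries would otherwise survive into the user address space. A compact restatement of that decision (bit 8 is the PTE Global bit; the 0xfff base mask is purely illustrative):

#include <stdbool.h>
#include <stdio.h>

#define _PAGE_GLOBAL (1u << 8)

static unsigned supported_pte_mask(bool has_pge, bool pti_enabled)
{
        unsigned mask = 0xfff & ~_PAGE_GLOBAL;  /* start with GLOBAL cleared */

        if (has_pge && !pti_enabled)
                mask |= _PAGE_GLOBAL;           /* enable_global_pages() */
        return mask;
}

int main(void)
{
        printf("PGE, no PTI: %#x\n", supported_pte_mask(true, false));
        printf("PGE,    PTI: %#x\n", supported_pte_mask(true, true));
        return 0;
}
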
2550     diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
2551     index 17ebc5a978cc..9b7bcbd33cc2 100644
2552     --- a/arch/x86/mm/pgtable.c
2553     +++ b/arch/x86/mm/pgtable.c
2554     @@ -355,14 +355,15 @@ static inline void _pgd_free(pgd_t *pgd)
2555     kmem_cache_free(pgd_cache, pgd);
2556     }
2557     #else
2558     +
2559     static inline pgd_t *_pgd_alloc(void)
2560     {
2561     - return (pgd_t *)__get_free_page(PGALLOC_GFP);
2562     + return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);
2563     }
2564    
2565     static inline void _pgd_free(pgd_t *pgd)
2566     {
2567     - free_page((unsigned long)pgd);
2568     + free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
2569     }
2570     #endif /* CONFIG_X86_PAE */
2571    
2572     diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
2573     new file mode 100644
2574     index 000000000000..bce8aea65606
2575     --- /dev/null
2576     +++ b/arch/x86/mm/pti.c
2577     @@ -0,0 +1,387 @@
2578     +/*
2579     + * Copyright(c) 2017 Intel Corporation. All rights reserved.
2580     + *
2581     + * This program is free software; you can redistribute it and/or modify
2582     + * it under the terms of version 2 of the GNU General Public License as
2583     + * published by the Free Software Foundation.
2584     + *
2585     + * This program is distributed in the hope that it will be useful, but
2586     + * WITHOUT ANY WARRANTY; without even the implied warranty of
2587     + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
2588     + * General Public License for more details.
2589     + *
2590     + * This code is based in part on work published here:
2591     + *
2592     + * https://github.com/IAIK/KAISER
2593     + *
2594     + * The original work was written and signed off for the Linux
2595     + * kernel by:
2596     + *
2597     + * Signed-off-by: Richard Fellner <richard.fellner@student.tugraz.at>
2598     + * Signed-off-by: Moritz Lipp <moritz.lipp@iaik.tugraz.at>
2599     + * Signed-off-by: Daniel Gruss <daniel.gruss@iaik.tugraz.at>
2600     + * Signed-off-by: Michael Schwarz <michael.schwarz@iaik.tugraz.at>
2601     + *
2602     + * Major changes to the original code by: Dave Hansen <dave.hansen@intel.com>
2603     + * Mostly rewritten by Thomas Gleixner <tglx@linutronix.de> and
2604     + * Andy Lutomirsky <luto@amacapital.net>
2605     + */
2606     +#include <linux/kernel.h>
2607     +#include <linux/errno.h>
2608     +#include <linux/string.h>
2609     +#include <linux/types.h>
2610     +#include <linux/bug.h>
2611     +#include <linux/init.h>
2612     +#include <linux/spinlock.h>
2613     +#include <linux/mm.h>
2614     +#include <linux/uaccess.h>
2615     +
2616     +#include <asm/cpufeature.h>
2617     +#include <asm/hypervisor.h>
2618     +#include <asm/vsyscall.h>
2619     +#include <asm/cmdline.h>
2620     +#include <asm/pti.h>
2621     +#include <asm/pgtable.h>
2622     +#include <asm/pgalloc.h>
2623     +#include <asm/tlbflush.h>
2624     +#include <asm/desc.h>
2625     +
2626     +#undef pr_fmt
2627     +#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt
2628     +
2629     +/* Backporting helper */
2630     +#ifndef __GFP_NOTRACK
2631     +#define __GFP_NOTRACK 0
2632     +#endif
2633     +
2634     +static void __init pti_print_if_insecure(const char *reason)
2635     +{
2636     + if (boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
2637     + pr_info("%s\n", reason);
2638     +}
2639     +
2640     +static void __init pti_print_if_secure(const char *reason)
2641     +{
2642     + if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
2643     + pr_info("%s\n", reason);
2644     +}
2645     +
2646     +void __init pti_check_boottime_disable(void)
2647     +{
2648     + char arg[5];
2649     + int ret;
2650     +
2651     + if (hypervisor_is_type(X86_HYPER_XEN_PV)) {
2652     + pti_print_if_insecure("disabled on XEN PV.");
2653     + return;
2654     + }
2655     +
2656     + ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
2657     + if (ret > 0) {
2658     + if (ret == 3 && !strncmp(arg, "off", 3)) {
2659     + pti_print_if_insecure("disabled on command line.");
2660     + return;
2661     + }
2662     + if (ret == 2 && !strncmp(arg, "on", 2)) {
2663     + pti_print_if_secure("force enabled on command line.");
2664     + goto enable;
2665     + }
2666     + if (ret == 4 && !strncmp(arg, "auto", 4))
2667     + goto autosel;
2668     + }
2669     +
2670     + if (cmdline_find_option_bool(boot_command_line, "nopti")) {
2671     + pti_print_if_insecure("disabled on command line.");
2672     + return;
2673     + }
2674     +
2675     +autosel:
2676     + if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
2677     + return;
2678     +enable:
2679     + setup_force_cpu_cap(X86_FEATURE_PTI);
2680     +}
2681     +
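
pti_check_boottime_disable() gives an explicit pti= option priority over nopti and only falls back to the X86_BUG_CPU_INSECURE check when neither forces a decision (the early Xen PV exit is left out here). A stand-alone sketch of that ordering, with cpu_is_insecure standing in for the CPU bug flag:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool want_pti(const char *pti_arg, bool nopti, bool cpu_is_insecure)
{
        if (pti_arg) {
                if (!strcmp(pti_arg, "off"))
                        return false;           /* disabled on command line */
                if (!strcmp(pti_arg, "on"))
                        return true;            /* force enabled */
                if (!strcmp(pti_arg, "auto"))
                        return cpu_is_insecure; /* straight to auto-select */
                /* unrecognized values fall through, like the real parser */
        }
        if (nopti)
                return false;
        return cpu_is_insecure;                 /* auto-select */
}

int main(void)
{
        printf("%d %d %d\n",
               want_pti("on", false, false),    /* 1 */
               want_pti(NULL, true, true),      /* 0 */
               want_pti("auto", false, true));  /* 1 */
        return 0;
}
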
2682     +pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
2683     +{
2684     + /*
2685     + * Changes to the high (kernel) portion of the kernelmode page
2686     + * tables are not automatically propagated to the usermode tables.
2687     + *
2688     + * Users should keep in mind that, unlike the kernelmode tables,
2689     + * there is no vmalloc_fault equivalent for the usermode tables.
2690     + * Top-level entries added to init_mm's usermode pgd after boot
2691     + * will not be automatically propagated to other mms.
2692     + */
2693     + if (!pgdp_maps_userspace(pgdp))
2694     + return pgd;
2695     +
2696     + /*
2697     + * The user page tables get the full PGD, accessible from
2698     + * userspace:
2699     + */
2700     + kernel_to_user_pgdp(pgdp)->pgd = pgd.pgd;
2701     +
2702     + /*
2703     + * If this is normal user memory, make it NX in the kernel
2704     + * pagetables so that, if we somehow screw up and return to
2705     + * usermode with the kernel CR3 loaded, we'll get a page fault
2706     + * instead of allowing user code to execute with the wrong CR3.
2707     + *
2708     + * As exceptions, we don't set NX if:
2709     + * - _PAGE_USER is not set. This could be an executable
2710     + * EFI runtime mapping or something similar, and the kernel
2711     + * may execute from it
2712     + * - we don't have NX support
2713     + * - we're clearing the PGD (i.e. the new pgd is not present).
2714     + */
2715     + if ((pgd.pgd & (_PAGE_USER|_PAGE_PRESENT)) == (_PAGE_USER|_PAGE_PRESENT) &&
2716     + (__supported_pte_mask & _PAGE_NX))
2717     + pgd.pgd |= _PAGE_NX;
2718     +
2719     + /* return the copy of the PGD we want the kernel to use: */
2720     + return pgd;
2721     +}
2722     +
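
The NX poisoning in __pti_set_user_pgd() only triggers when the entry is both present and user-accessible and NX is supported; the kernel-side copy then faults instead of executing user code if the kernel CR3 ever makes it back to usermode. The predicate in isolation (PTE bit 0 is Present, bit 2 is User, bit 63 is NX):

#include <stdint.h>
#include <stdio.h>

#define _PAGE_PRESENT (1ULL << 0)
#define _PAGE_USER    (1ULL << 2)
#define _PAGE_NX      (1ULL << 63)

static uint64_t poison_kernel_copy(uint64_t pgd, int have_nx)
{
        if ((pgd & (_PAGE_USER | _PAGE_PRESENT)) == (_PAGE_USER | _PAGE_PRESENT) &&
            have_nx)
                pgd |= _PAGE_NX;
        return pgd;
}

int main(void)
{
        printf("%#llx\n", (unsigned long long)
               poison_kernel_copy(_PAGE_PRESENT | _PAGE_USER, 1));
        return 0;
}
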
2723     +/*
2724     + * Walk the user copy of the page tables (optionally) trying to allocate
2725     + * page table pages on the way down.
2726     + *
2727     + * Returns a pointer to a P4D on success, or NULL on failure.
2728     + */
2729     +static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
2730     +{
2731     + pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address));
2732     + gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
2733     +
2734     + if (address < PAGE_OFFSET) {
2735     + WARN_ONCE(1, "attempt to walk user address\n");
2736     + return NULL;
2737     + }
2738     +
2739     + if (pgd_none(*pgd)) {
2740     + unsigned long new_p4d_page = __get_free_page(gfp);
2741     + if (!new_p4d_page)
2742     + return NULL;
2743     +
2744     + if (pgd_none(*pgd)) {
2745     + set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page)));
2746     + new_p4d_page = 0;
2747     + }
2748     + if (new_p4d_page)
2749     + free_page(new_p4d_page);
2750     + }
2751     + BUILD_BUG_ON(pgd_large(*pgd) != 0);
2752     +
2753     + return p4d_offset(pgd, address);
2754     +}
2755     +
2756     +/*
2757     + * Walk the user copy of the page tables (optionally) trying to allocate
2758     + * page table pages on the way down.
2759     + *
2760     + * Returns a pointer to a PMD on success, or NULL on failure.
2761     + */
2762     +static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
2763     +{
2764     + gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
2765     + p4d_t *p4d = pti_user_pagetable_walk_p4d(address);
2766     + pud_t *pud;
2767     +
2768     + BUILD_BUG_ON(p4d_large(*p4d) != 0);
2769     + if (p4d_none(*p4d)) {
2770     + unsigned long new_pud_page = __get_free_page(gfp);
2771     + if (!new_pud_page)
2772     + return NULL;
2773     +
2774     + if (p4d_none(*p4d)) {
2775     + set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
2776     + new_pud_page = 0;
2777     + }
2778     + if (new_pud_page)
2779     + free_page(new_pud_page);
2780     + }
2781     +
2782     + pud = pud_offset(p4d, address);
2783     + /* The user page tables do not use large mappings: */
2784     + if (pud_large(*pud)) {
2785     + WARN_ON(1);
2786     + return NULL;
2787     + }
2788     + if (pud_none(*pud)) {
2789     + unsigned long new_pmd_page = __get_free_page(gfp);
2790     + if (!new_pmd_page)
2791     + return NULL;
2792     +
2793     + if (pud_none(*pud)) {
2794     + set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
2795     + new_pmd_page = 0;
2796     + }
2797     + if (new_pmd_page)
2798     + free_page(new_pmd_page);
2799     + }
2800     +
2801     + return pmd_offset(pud, address);
2802     +}
2803     +
2804     +#ifdef CONFIG_X86_VSYSCALL_EMULATION
2805     +/*
2806     + * Walk the shadow copy of the page tables (optionally) trying to allocate
2807     + * page table pages on the way down. Does not support large pages.
2808     + *
2809     + * Note: this is only used when mapping *new* kernel data into the
2810     + * user/shadow page tables. It is never used for userspace data.
2811     + *
2812     + * Returns a pointer to a PTE on success, or NULL on failure.
2813     + */
2814     +static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address)
2815     +{
2816     + gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
2817     + pmd_t *pmd = pti_user_pagetable_walk_pmd(address);
2818     + pte_t *pte;
2819     +
2820     + /* We can't do anything sensible if we hit a large mapping. */
2821     + if (pmd_large(*pmd)) {
2822     + WARN_ON(1);
2823     + return NULL;
2824     + }
2825     +
2826     + if (pmd_none(*pmd)) {
2827     + unsigned long new_pte_page = __get_free_page(gfp);
2828     + if (!new_pte_page)
2829     + return NULL;
2830     +
2831     + if (pmd_none(*pmd)) {
2832     + set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
2833     + new_pte_page = 0;
2834     + }
2835     + if (new_pte_page)
2836     + free_page(new_pte_page);
2837     + }
2838     +
2839     + pte = pte_offset_kernel(pmd, address);
2840     + if (pte_flags(*pte) & _PAGE_USER) {
2841     + WARN_ONCE(1, "attempt to walk to user pte\n");
2842     + return NULL;
2843     + }
2844     + return pte;
2845     +}
2846     +
2847     +static void __init pti_setup_vsyscall(void)
2848     +{
2849     + pte_t *pte, *target_pte;
2850     + unsigned int level;
2851     +
2852     + pte = lookup_address(VSYSCALL_ADDR, &level);
2853     + if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte))
2854     + return;
2855     +
2856     + target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR);
2857     + if (WARN_ON(!target_pte))
2858     + return;
2859     +
2860     + *target_pte = *pte;
2861     + set_vsyscall_pgtable_user_bits(kernel_to_user_pgdp(swapper_pg_dir));
2862     +}
2863     +#else
2864     +static void __init pti_setup_vsyscall(void) { }
2865     +#endif
2866     +
2867     +static void __init
2868     +pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear)
2869     +{
2870     + unsigned long addr;
2871     +
2872     + /*
2873     + * Clone the populated PMDs which cover start to end. These PMD areas
2874     + * can have holes.
2875     + */
2876     + for (addr = start; addr < end; addr += PMD_SIZE) {
2877     + pmd_t *pmd, *target_pmd;
2878     + pgd_t *pgd;
2879     + p4d_t *p4d;
2880     + pud_t *pud;
2881     +
2882     + pgd = pgd_offset_k(addr);
2883     + if (WARN_ON(pgd_none(*pgd)))
2884     + return;
2885     + p4d = p4d_offset(pgd, addr);
2886     + if (WARN_ON(p4d_none(*p4d)))
2887     + return;
2888     + pud = pud_offset(p4d, addr);
2889     + if (pud_none(*pud))
2890     + continue;
2891     + pmd = pmd_offset(pud, addr);
2892     + if (pmd_none(*pmd))
2893     + continue;
2894     +
2895     + target_pmd = pti_user_pagetable_walk_pmd(addr);
2896     + if (WARN_ON(!target_pmd))
2897     + return;
2898     +
2899     + /*
2900     + * Copy the PMD. That is, the kernelmode and usermode
2901     + * tables will share the last-level page tables of this
2902     + * address range.
2903     + */
2904     + *target_pmd = pmd_clear_flags(*pmd, clear);
2905     + }
2906     +}
2907     +
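
pti_clone_pmds() covers the requested range in PMD (2 MiB) steps, silently skipping holes and copying each populated PMD entry so the user tables share the underlying last-level page tables; this is also why the entry/irqentry text gets PMD-aligned in the vmlinux.lds.S change earlier in this patch. The shape of that walk, with made-up addresses:

#include <stdint.h>
#include <stdio.h>

#define PMD_SIZE (2ULL * 1024 * 1024)

int main(void)
{
        /* illustrative PMD-aligned range standing in for the entry text */
        uint64_t start = 0xffffffff81a00000ULL;
        uint64_t end   = start + 2 * PMD_SIZE;
        uint64_t addr;

        for (addr = start; addr < end; addr += PMD_SIZE)
                printf("clone PMD covering %#llx\n", (unsigned long long)addr);
        return 0;
}
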
2908     +/*
2909     + * Clone a single p4d (i.e. a top-level entry on 4-level systems and a
2910     + * next-level entry on 5-level systems.
2911     + * next-level entry on 5-level systems).
2912     +static void __init pti_clone_p4d(unsigned long addr)
2913     +{
2914     + p4d_t *kernel_p4d, *user_p4d;
2915     + pgd_t *kernel_pgd;
2916     +
2917     + user_p4d = pti_user_pagetable_walk_p4d(addr);
2918     + kernel_pgd = pgd_offset_k(addr);
2919     + kernel_p4d = p4d_offset(kernel_pgd, addr);
2920     + *user_p4d = *kernel_p4d;
2921     +}
2922     +
2923     +/*
2924     + * Clone the CPU_ENTRY_AREA into the user space visible page table.
2925     + */
2926     +static void __init pti_clone_user_shared(void)
2927     +{
2928     + pti_clone_p4d(CPU_ENTRY_AREA_BASE);
2929     +}
2930     +
2931     +/*
2932     + * Clone the ESPFIX P4D into the user space visible page table.
2933     + */
2934     +static void __init pti_setup_espfix64(void)
2935     +{
2936     +#ifdef CONFIG_X86_ESPFIX64
2937     + pti_clone_p4d(ESPFIX_BASE_ADDR);
2938     +#endif
2939     +}
2940     +
2941     +/*
2942     + * Clone the populated PMDs of the entry and irqentry text and force it RO.
2943     + */
2944     +static void __init pti_clone_entry_text(void)
2945     +{
2946     + pti_clone_pmds((unsigned long) __entry_text_start,
2947     + (unsigned long) __irqentry_text_end, _PAGE_RW);
2948     +}
2949     +
2950     +/*
2951     + * Initialize kernel page table isolation
2952     + */
2953     +void __init pti_init(void)
2954     +{
2955     + if (!static_cpu_has(X86_FEATURE_PTI))
2956     + return;
2957     +
2958     + pr_info("enabled\n");
2959     +
2960     + pti_clone_user_shared();
2961     + pti_clone_entry_text();
2962     + pti_setup_espfix64();
2963     + pti_setup_vsyscall();
2964     +}
2965     diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
2966     index 0a1be3adc97e..a1561957dccb 100644
2967     --- a/arch/x86/mm/tlb.c
2968     +++ b/arch/x86/mm/tlb.c
2969     @@ -28,6 +28,38 @@
2970     * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
2971     */
2972    
2973     +/*
2974     + * We get here when we do something requiring a TLB invalidation
2975     + * but could not go invalidate all of the contexts. We do the
2976     + * necessary invalidation by clearing out the 'ctx_id' which
2977     + * forces a TLB flush when the context is loaded.
2978     + */
2979     +void clear_asid_other(void)
2980     +{
2981     + u16 asid;
2982     +
2983     + /*
2984     + * This is only expected to be set if we have disabled
2985     + * kernel _PAGE_GLOBAL pages.
2986     + */
2987     + if (!static_cpu_has(X86_FEATURE_PTI)) {
2988     + WARN_ON_ONCE(1);
2989     + return;
2990     + }
2991     +
2992     + for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
2993     + /* Do not need to flush the current asid */
2994     + if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid))
2995     + continue;
2996     + /*
2997     + * Make sure the next time we go to switch to
2998     + * this asid, we do a flush:
2999     + */
3000     + this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, 0);
3001     + }
3002     + this_cpu_write(cpu_tlbstate.invalidate_other, false);
3003     +}
3004     +
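
clear_asid_other() is the consumer of the invalidate_other flag added to tlb_state earlier in this patch: rather than flushing immediately, it zeroes ctx_id for every dynamic ASID except the one currently loaded, so choose_new_asid() can only reuse those slots via the need_flush path. A user-space model of that bookkeeping:

#include <stdio.h>

#define TLB_NR_DYN_ASIDS 6

static unsigned long long ctx_id[TLB_NR_DYN_ASIDS] = { 11, 22, 33, 44, 55, 66 };

static void clear_asid_other(unsigned loaded_asid)
{
        unsigned asid;

        for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++)
                if (asid != loaded_asid)
                        ctx_id[asid] = 0;       /* forces a flush on next use */
}

int main(void)
{
        unsigned asid;

        clear_asid_other(2);
        for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++)
                printf("asid %u: ctx_id %llu\n", asid, ctx_id[asid]);
        return 0;
}
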
3005     atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
3006    
3007    
3008     @@ -42,6 +74,9 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
3009     return;
3010     }
3011    
3012     + if (this_cpu_read(cpu_tlbstate.invalidate_other))
3013     + clear_asid_other();
3014     +
3015     for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
3016     if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
3017     next->context.ctx_id)
3018     @@ -65,6 +100,25 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
3019     *need_flush = true;
3020     }
3021    
3022     +static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
3023     +{
3024     + unsigned long new_mm_cr3;
3025     +
3026     + if (need_flush) {
3027     + invalidate_user_asid(new_asid);
3028     + new_mm_cr3 = build_cr3(pgdir, new_asid);
3029     + } else {
3030     + new_mm_cr3 = build_cr3_noflush(pgdir, new_asid);
3031     + }
3032     +
3033     + /*
3034     + * Caution: many callers of this function expect
3035     + * that load_cr3() is serializing and orders TLB
3036     + * fills with respect to the mm_cpumask writes.
3037     + */
3038     + write_cr3(new_mm_cr3);
3039     +}
3040     +
3041     void leave_mm(int cpu)
3042     {
3043     struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
3044     @@ -195,7 +249,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
3045     if (need_flush) {
3046     this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
3047     this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
3048     - write_cr3(build_cr3(next->pgd, new_asid));
3049     + load_new_mm_cr3(next->pgd, new_asid, true);
3050    
3051     /*
3052     * NB: This gets called via leave_mm() in the idle path
3053     @@ -208,7 +262,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
3054     trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
3055     } else {
3056     /* The new ASID is already up to date. */
3057     - write_cr3(build_cr3_noflush(next->pgd, new_asid));
3058     + load_new_mm_cr3(next->pgd, new_asid, false);
3059    
3060     /* See above wrt _rcuidle. */
3061     trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
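
The clear_asid_other()/load_new_mm_cr3() hunks above avoid flushing every address-space slot on the spot; instead each inactive slot's context id is zeroed so the flush happens the next time that slot is loaded. A minimal, self-contained sketch of that bookkeeping — the slot count and all names are assumptions for illustration, not the kernel's tlb state:

#include <stdio.h>

#define NR_SLOTS 6      /* stand-in for TLB_NR_DYN_ASIDS */

static unsigned long slot_ctx_id[NR_SLOTS]; /* 0 == "flush before next use" */
static int current_slot;

/* Mark every slot except the one currently in use as needing a flush. */
static void mark_other_slots_stale(void)
{
        int slot;

        for (slot = 0; slot < NR_SLOTS; slot++) {
                if (slot == current_slot)
                        continue;       /* the running context stays valid */
                slot_ctx_id[slot] = 0;  /* forces a flush when it is reloaded */
        }
}

int main(void)
{
        int i;

        for (i = 0; i < NR_SLOTS; i++)
                slot_ctx_id[i] = 100 + i;
        current_slot = 2;

        mark_other_slots_stale();

        for (i = 0; i < NR_SLOTS; i++)
                printf("slot %d: ctx_id=%lu\n", i, slot_ctx_id[i]);
        return 0;
}

Deferring the flush this way keeps the common context-switch path cheap; only slots that are actually reused pay for the invalidation.
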
3062     diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
3063     index 20fb31579b69..39c4b35ac7a4 100644
3064     --- a/arch/x86/platform/efi/efi_64.c
3065     +++ b/arch/x86/platform/efi/efi_64.c
3066     @@ -195,6 +195,9 @@ static pgd_t *efi_pgd;
3067     * because we want to avoid inserting EFI region mappings (EFI_VA_END
3068     * to EFI_VA_START) into the standard kernel page tables. Everything
3069     * else can be shared, see efi_sync_low_kernel_mappings().
3070     + *
3071     + * We don't want the pgd on the pgd_list and cannot use pgd_alloc() for the
3072     + * allocation.
3073     */
3074     int __init efi_alloc_page_tables(void)
3075     {
3076     @@ -207,7 +210,7 @@ int __init efi_alloc_page_tables(void)
3077     return 0;
3078    
3079     gfp_mask = GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO;
3080     - efi_pgd = (pgd_t *)__get_free_page(gfp_mask);
3081     + efi_pgd = (pgd_t *)__get_free_pages(gfp_mask, PGD_ALLOCATION_ORDER);
3082     if (!efi_pgd)
3083     return -ENOMEM;
3084    
3085     diff --git a/block/blk-map.c b/block/blk-map.c
3086     index d5251edcc0dd..368daa02714e 100644
3087     --- a/block/blk-map.c
3088     +++ b/block/blk-map.c
3089     @@ -12,22 +12,29 @@
3090     #include "blk.h"
3091    
3092     /*
3093     - * Append a bio to a passthrough request. Only works can be merged into
3094     - * the request based on the driver constraints.
3095     + * Append a bio to a passthrough request. Only works if the bio can be merged
3096     + * into the request based on the driver constraints.
3097     */
3098     -int blk_rq_append_bio(struct request *rq, struct bio *bio)
3099     +int blk_rq_append_bio(struct request *rq, struct bio **bio)
3100     {
3101     - blk_queue_bounce(rq->q, &bio);
3102     + struct bio *orig_bio = *bio;
3103     +
3104     + blk_queue_bounce(rq->q, bio);
3105    
3106     if (!rq->bio) {
3107     - blk_rq_bio_prep(rq->q, rq, bio);
3108     + blk_rq_bio_prep(rq->q, rq, *bio);
3109     } else {
3110     - if (!ll_back_merge_fn(rq->q, rq, bio))
3111     + if (!ll_back_merge_fn(rq->q, rq, *bio)) {
3112     + if (orig_bio != *bio) {
3113     + bio_put(*bio);
3114     + *bio = orig_bio;
3115     + }
3116     return -EINVAL;
3117     + }
3118    
3119     - rq->biotail->bi_next = bio;
3120     - rq->biotail = bio;
3121     - rq->__data_len += bio->bi_iter.bi_size;
3122     + rq->biotail->bi_next = *bio;
3123     + rq->biotail = *bio;
3124     + rq->__data_len += (*bio)->bi_iter.bi_size;
3125     }
3126    
3127     return 0;
3128     @@ -80,14 +87,12 @@ static int __blk_rq_map_user_iov(struct request *rq,
3129     * We link the bounce buffer in and could have to traverse it
3130     * later so we have to get a ref to prevent it from being freed
3131     */
3132     - ret = blk_rq_append_bio(rq, bio);
3133     - bio_get(bio);
3134     + ret = blk_rq_append_bio(rq, &bio);
3135     if (ret) {
3136     - bio_endio(bio);
3137     __blk_rq_unmap_user(orig_bio);
3138     - bio_put(bio);
3139     return ret;
3140     }
3141     + bio_get(bio);
3142    
3143     return 0;
3144     }
3145     @@ -220,7 +225,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
3146     int reading = rq_data_dir(rq) == READ;
3147     unsigned long addr = (unsigned long) kbuf;
3148     int do_copy = 0;
3149     - struct bio *bio;
3150     + struct bio *bio, *orig_bio;
3151     int ret;
3152    
3153     if (len > (queue_max_hw_sectors(q) << 9))
3154     @@ -243,10 +248,11 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
3155     if (do_copy)
3156     rq->rq_flags |= RQF_COPY_USER;
3157    
3158     - ret = blk_rq_append_bio(rq, bio);
3159     + orig_bio = bio;
3160     + ret = blk_rq_append_bio(rq, &bio);
3161     if (unlikely(ret)) {
3162     /* request is too big */
3163     - bio_put(bio);
3164     + bio_put(orig_bio);
3165     return ret;
3166     }
3167    
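
blk_rq_append_bio() now takes a bio double pointer so that, when the bounce code substitutes a different bio, the caller can still tell which object it owns and must release on failure (the later hunks in this file adjust both callers accordingly). The same double-pointer ownership idiom in standalone C — the buffer names and the "bounce" stand-in below are illustrative only, not the block-layer API:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Toy "bounce and append": the callee may swap in a private copy of the
 * buffer; on failure it must hand the caller's original pointer back so
 * the caller frees the right object.  (In the real code the original bio
 * stays referenced by the bounce machinery; here we simply drop it.)
 */
static int append_buffer(char **buf)
{
        char *orig = *buf;
        char *bounced = strdup(orig);   /* stand-in for blk_queue_bounce() */

        if (!bounced)
                return -1;
        *buf = bounced;

        if (strlen(*buf) > 8) {         /* stand-in for a failed merge */
                free(*buf);             /* drop the substitute ...          */
                *buf = orig;            /* ... and restore the original     */
                return -1;
        }

        free(orig);                     /* success: caller keeps the copy   */
        return 0;
}

int main(void)
{
        char *ok = strdup("short");
        char *bad = strdup("payload-too-long");

        printf("short:    %d\n", append_buffer(&ok));
        printf("too long: %d\n", append_buffer(&bad));

        free(ok);
        free(bad);
        return 0;
}
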
3168     diff --git a/block/bounce.c b/block/bounce.c
3169     index fceb1a96480b..1d05c422c932 100644
3170     --- a/block/bounce.c
3171     +++ b/block/bounce.c
3172     @@ -200,6 +200,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
3173     unsigned i = 0;
3174     bool bounce = false;
3175     int sectors = 0;
3176     + bool passthrough = bio_is_passthrough(*bio_orig);
3177    
3178     bio_for_each_segment(from, *bio_orig, iter) {
3179     if (i++ < BIO_MAX_PAGES)
3180     @@ -210,13 +211,14 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
3181     if (!bounce)
3182     return;
3183    
3184     - if (sectors < bio_sectors(*bio_orig)) {
3185     + if (!passthrough && sectors < bio_sectors(*bio_orig)) {
3186     bio = bio_split(*bio_orig, sectors, GFP_NOIO, bounce_bio_split);
3187     bio_chain(bio, *bio_orig);
3188     generic_make_request(*bio_orig);
3189     *bio_orig = bio;
3190     }
3191     - bio = bio_clone_bioset(*bio_orig, GFP_NOIO, bounce_bio_set);
3192     + bio = bio_clone_bioset(*bio_orig, GFP_NOIO, passthrough ? NULL :
3193     + bounce_bio_set);
3194    
3195     bio_for_each_segment_all(to, bio, i) {
3196     struct page *page = to->bv_page;
3197     diff --git a/drivers/android/binder.c b/drivers/android/binder.c
3198     index 88b4bbe58100..a340766b51fe 100644
3199     --- a/drivers/android/binder.c
3200     +++ b/drivers/android/binder.c
3201     @@ -482,7 +482,8 @@ enum binder_deferred_state {
3202     * @tsk task_struct for group_leader of process
3203     * (invariant after initialized)
3204     * @files files_struct for process
3205     - * (invariant after initialized)
3206     + * (protected by @files_lock)
3207     + * @files_lock mutex to protect @files
3208     * @deferred_work_node: element for binder_deferred_list
3209     * (protected by binder_deferred_lock)
3210     * @deferred_work: bitmap of deferred work to perform
3211     @@ -530,6 +531,7 @@ struct binder_proc {
3212     int pid;
3213     struct task_struct *tsk;
3214     struct files_struct *files;
3215     + struct mutex files_lock;
3216     struct hlist_node deferred_work_node;
3217     int deferred_work;
3218     bool is_dead;
3219     @@ -877,20 +879,26 @@ static void binder_inc_node_tmpref_ilocked(struct binder_node *node);
3220    
3221     static int task_get_unused_fd_flags(struct binder_proc *proc, int flags)
3222     {
3223     - struct files_struct *files = proc->files;
3224     unsigned long rlim_cur;
3225     unsigned long irqs;
3226     + int ret;
3227    
3228     - if (files == NULL)
3229     - return -ESRCH;
3230     -
3231     - if (!lock_task_sighand(proc->tsk, &irqs))
3232     - return -EMFILE;
3233     -
3234     + mutex_lock(&proc->files_lock);
3235     + if (proc->files == NULL) {
3236     + ret = -ESRCH;
3237     + goto err;
3238     + }
3239     + if (!lock_task_sighand(proc->tsk, &irqs)) {
3240     + ret = -EMFILE;
3241     + goto err;
3242     + }
3243     rlim_cur = task_rlimit(proc->tsk, RLIMIT_NOFILE);
3244     unlock_task_sighand(proc->tsk, &irqs);
3245    
3246     - return __alloc_fd(files, 0, rlim_cur, flags);
3247     + ret = __alloc_fd(proc->files, 0, rlim_cur, flags);
3248     +err:
3249     + mutex_unlock(&proc->files_lock);
3250     + return ret;
3251     }
3252    
3253     /*
3254     @@ -899,8 +907,10 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags)
3255     static void task_fd_install(
3256     struct binder_proc *proc, unsigned int fd, struct file *file)
3257     {
3258     + mutex_lock(&proc->files_lock);
3259     if (proc->files)
3260     __fd_install(proc->files, fd, file);
3261     + mutex_unlock(&proc->files_lock);
3262     }
3263    
3264     /*
3265     @@ -910,9 +920,11 @@ static long task_close_fd(struct binder_proc *proc, unsigned int fd)
3266     {
3267     int retval;
3268    
3269     - if (proc->files == NULL)
3270     - return -ESRCH;
3271     -
3272     + mutex_lock(&proc->files_lock);
3273     + if (proc->files == NULL) {
3274     + retval = -ESRCH;
3275     + goto err;
3276     + }
3277     retval = __close_fd(proc->files, fd);
3278     /* can't restart close syscall because file table entry was cleared */
3279     if (unlikely(retval == -ERESTARTSYS ||
3280     @@ -920,7 +932,8 @@ static long task_close_fd(struct binder_proc *proc, unsigned int fd)
3281     retval == -ERESTARTNOHAND ||
3282     retval == -ERESTART_RESTARTBLOCK))
3283     retval = -EINTR;
3284     -
3285     +err:
3286     + mutex_unlock(&proc->files_lock);
3287     return retval;
3288     }
3289    
3290     @@ -4627,7 +4640,9 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma)
3291     ret = binder_alloc_mmap_handler(&proc->alloc, vma);
3292     if (ret)
3293     return ret;
3294     + mutex_lock(&proc->files_lock);
3295     proc->files = get_files_struct(current);
3296     + mutex_unlock(&proc->files_lock);
3297     return 0;
3298    
3299     err_bad_arg:
3300     @@ -4651,6 +4666,7 @@ static int binder_open(struct inode *nodp, struct file *filp)
3301     spin_lock_init(&proc->outer_lock);
3302     get_task_struct(current->group_leader);
3303     proc->tsk = current->group_leader;
3304     + mutex_init(&proc->files_lock);
3305     INIT_LIST_HEAD(&proc->todo);
3306     proc->default_priority = task_nice(current);
3307     binder_dev = container_of(filp->private_data, struct binder_device,
3308     @@ -4903,9 +4919,11 @@ static void binder_deferred_func(struct work_struct *work)
3309    
3310     files = NULL;
3311     if (defer & BINDER_DEFERRED_PUT_FILES) {
3312     + mutex_lock(&proc->files_lock);
3313     files = proc->files;
3314     if (files)
3315     proc->files = NULL;
3316     + mutex_unlock(&proc->files_lock);
3317     }
3318    
3319     if (defer & BINDER_DEFERRED_FLUSH)
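
The binder hunks above introduce files_lock so that the NULL check on proc->files and the subsequent use of the pointer happen under one lock, closing the race with the deferred teardown that clears it. A small pthread sketch of that check-under-lock pattern — the struct, helpers and error value below are invented for illustration, not binder code:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* Build with: cc -pthread sketch.c */

struct proc_ctx {
        pthread_mutex_t files_lock;
        int *files;                     /* may be torn down concurrently */
};

/* The NULL check and the use must happen under the same lock. */
static int use_files(struct proc_ctx *proc)
{
        int ret;

        pthread_mutex_lock(&proc->files_lock);
        if (!proc->files) {
                ret = -1;               /* stand-in for -ESRCH */
                goto out;
        }
        ret = *proc->files;             /* safe: cannot be freed under us */
out:
        pthread_mutex_unlock(&proc->files_lock);
        return ret;
}

static void teardown_files(struct proc_ctx *proc)
{
        pthread_mutex_lock(&proc->files_lock);
        free(proc->files);
        proc->files = NULL;
        pthread_mutex_unlock(&proc->files_lock);
}

int main(void)
{
        struct proc_ctx proc = { PTHREAD_MUTEX_INITIALIZER, malloc(sizeof(int)) };

        *proc.files = 42;
        printf("before teardown: %d\n", use_files(&proc));
        teardown_files(&proc);
        printf("after teardown:  %d\n", use_files(&proc));
        return 0;
}

Checking the pointer outside the lock, as the old code effectively did, would leave a window where the deferred worker frees the files_struct between the check and the use.
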
3320     diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c
3321     index eb3af2739537..07532d83be0b 100644
3322     --- a/drivers/base/cacheinfo.c
3323     +++ b/drivers/base/cacheinfo.c
3324     @@ -186,6 +186,11 @@ static void cache_associativity(struct cacheinfo *this_leaf)
3325     this_leaf->ways_of_associativity = (size / nr_sets) / line_size;
3326     }
3327    
3328     +static bool cache_node_is_unified(struct cacheinfo *this_leaf)
3329     +{
3330     + return of_property_read_bool(this_leaf->of_node, "cache-unified");
3331     +}
3332     +
3333     static void cache_of_override_properties(unsigned int cpu)
3334     {
3335     int index;
3336     @@ -194,6 +199,14 @@ static void cache_of_override_properties(unsigned int cpu)
3337    
3338     for (index = 0; index < cache_leaves(cpu); index++) {
3339     this_leaf = this_cpu_ci->info_list + index;
3340     + /*
3341     + * init_cache_level must set up the cache level correctly
3342     + * overriding the architecturally specified levels, so
3343     + * if type is NONE at this stage, it should be unified
3344     + */
3345     + if (this_leaf->type == CACHE_TYPE_NOCACHE &&
3346     + cache_node_is_unified(this_leaf))
3347     + this_leaf->type = CACHE_TYPE_UNIFIED;
3348     cache_size(this_leaf);
3349     cache_get_line_size(this_leaf);
3350     cache_nr_sets(this_leaf);
3351     diff --git a/drivers/gpio/gpiolib-acpi.c b/drivers/gpio/gpiolib-acpi.c
3352     index eb4528c87c0b..d6f3d9ee1350 100644
3353     --- a/drivers/gpio/gpiolib-acpi.c
3354     +++ b/drivers/gpio/gpiolib-acpi.c
3355     @@ -1074,7 +1074,7 @@ void acpi_gpiochip_add(struct gpio_chip *chip)
3356     }
3357    
3358     if (!chip->names)
3359     - devprop_gpiochip_set_names(chip);
3360     + devprop_gpiochip_set_names(chip, dev_fwnode(chip->parent));
3361    
3362     acpi_gpiochip_request_regions(acpi_gpio);
3363     acpi_gpiochip_scan_gpios(acpi_gpio);
3364     diff --git a/drivers/gpio/gpiolib-devprop.c b/drivers/gpio/gpiolib-devprop.c
3365     index 27f383bda7d9..f748aa3e77f7 100644
3366     --- a/drivers/gpio/gpiolib-devprop.c
3367     +++ b/drivers/gpio/gpiolib-devprop.c
3368     @@ -19,30 +19,27 @@
3369     /**
3370     * devprop_gpiochip_set_names - Set GPIO line names using device properties
3371     * @chip: GPIO chip whose lines should be named, if possible
3372     + * @fwnode: Property Node containing the gpio-line-names property
3373     *
3374     * Looks for device property "gpio-line-names" and if it exists assigns
3375     * GPIO line names for the chip. The memory allocated for the assigned
3376     * names belong to the underlying firmware node and should not be released
3377     * by the caller.
3378     */
3379     -void devprop_gpiochip_set_names(struct gpio_chip *chip)
3380     +void devprop_gpiochip_set_names(struct gpio_chip *chip,
3381     + const struct fwnode_handle *fwnode)
3382     {
3383     struct gpio_device *gdev = chip->gpiodev;
3384     const char **names;
3385     int ret, i;
3386    
3387     - if (!chip->parent) {
3388     - dev_warn(&gdev->dev, "GPIO chip parent is NULL\n");
3389     - return;
3390     - }
3391     -
3392     - ret = device_property_read_string_array(chip->parent, "gpio-line-names",
3393     + ret = fwnode_property_read_string_array(fwnode, "gpio-line-names",
3394     NULL, 0);
3395     if (ret < 0)
3396     return;
3397    
3398     if (ret != gdev->ngpio) {
3399     - dev_warn(chip->parent,
3400     + dev_warn(&gdev->dev,
3401     "names %d do not match number of GPIOs %d\n", ret,
3402     gdev->ngpio);
3403     return;
3404     @@ -52,10 +49,10 @@ void devprop_gpiochip_set_names(struct gpio_chip *chip)
3405     if (!names)
3406     return;
3407    
3408     - ret = device_property_read_string_array(chip->parent, "gpio-line-names",
3409     + ret = fwnode_property_read_string_array(fwnode, "gpio-line-names",
3410     names, gdev->ngpio);
3411     if (ret < 0) {
3412     - dev_warn(chip->parent, "failed to read GPIO line names\n");
3413     + dev_warn(&gdev->dev, "failed to read GPIO line names\n");
3414     kfree(names);
3415     return;
3416     }
3417     diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c
3418     index bfcd20699ec8..ba38f530e403 100644
3419     --- a/drivers/gpio/gpiolib-of.c
3420     +++ b/drivers/gpio/gpiolib-of.c
3421     @@ -493,7 +493,8 @@ int of_gpiochip_add(struct gpio_chip *chip)
3422    
3423     /* If the chip defines names itself, these take precedence */
3424     if (!chip->names)
3425     - devprop_gpiochip_set_names(chip);
3426     + devprop_gpiochip_set_names(chip,
3427     + of_fwnode_handle(chip->of_node));
3428    
3429     of_node_get(chip->of_node);
3430    
3431     diff --git a/drivers/gpio/gpiolib.h b/drivers/gpio/gpiolib.h
3432     index d003ccb12781..3d4d0634c9dd 100644
3433     --- a/drivers/gpio/gpiolib.h
3434     +++ b/drivers/gpio/gpiolib.h
3435     @@ -224,7 +224,8 @@ static inline int gpio_chip_hwgpio(const struct gpio_desc *desc)
3436     return desc - &desc->gdev->descs[0];
3437     }
3438    
3439     -void devprop_gpiochip_set_names(struct gpio_chip *chip);
3440     +void devprop_gpiochip_set_names(struct gpio_chip *chip,
3441     + const struct fwnode_handle *fwnode);
3442    
3443     /* With descriptor prefix */
3444    
3445     diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c
3446     index feafdb961c48..59b2f96d986a 100644
3447     --- a/drivers/infiniband/core/security.c
3448     +++ b/drivers/infiniband/core/security.c
3449     @@ -386,6 +386,9 @@ int ib_open_shared_qp_security(struct ib_qp *qp, struct ib_device *dev)
3450     if (ret)
3451     return ret;
3452    
3453     + if (!qp->qp_sec)
3454     + return 0;
3455     +
3456     mutex_lock(&real_qp->qp_sec->mutex);
3457     ret = check_qp_port_pkey_settings(real_qp->qp_sec->ports_pkeys,
3458     qp->qp_sec);
3459     diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
3460     index d8f540054392..93c1a57dbff1 100644
3461     --- a/drivers/infiniband/core/uverbs_cmd.c
3462     +++ b/drivers/infiniband/core/uverbs_cmd.c
3463     @@ -2085,8 +2085,8 @@ int ib_uverbs_ex_modify_qp(struct ib_uverbs_file *file,
3464     return -EOPNOTSUPP;
3465    
3466     if (ucore->inlen > sizeof(cmd)) {
3467     - if (ib_is_udata_cleared(ucore, sizeof(cmd),
3468     - ucore->inlen - sizeof(cmd)))
3469     + if (!ib_is_udata_cleared(ucore, sizeof(cmd),
3470     + ucore->inlen - sizeof(cmd)))
3471     return -EOPNOTSUPP;
3472     }
3473    
3474     diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
3475     index de57d6c11a25..9032f77cc38d 100644
3476     --- a/drivers/infiniband/core/verbs.c
3477     +++ b/drivers/infiniband/core/verbs.c
3478     @@ -1400,7 +1400,8 @@ int ib_close_qp(struct ib_qp *qp)
3479     spin_unlock_irqrestore(&real_qp->device->event_handler_lock, flags);
3480    
3481     atomic_dec(&real_qp->usecnt);
3482     - ib_close_shared_qp_security(qp->qp_sec);
3483     + if (qp->qp_sec)
3484     + ib_close_shared_qp_security(qp->qp_sec);
3485     kfree(qp);
3486    
3487     return 0;
3488     diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c
3489     index eae8ea81c6e2..514c1000ded1 100644
3490     --- a/drivers/infiniband/hw/cxgb4/cq.c
3491     +++ b/drivers/infiniband/hw/cxgb4/cq.c
3492     @@ -586,10 +586,10 @@ static int poll_cq(struct t4_wq *wq, struct t4_cq *cq, struct t4_cqe *cqe,
3493     ret = -EAGAIN;
3494     goto skip_cqe;
3495     }
3496     - if (unlikely((CQE_WRID_MSN(hw_cqe) != (wq->rq.msn)))) {
3497     + if (unlikely(!CQE_STATUS(hw_cqe) &&
3498     + CQE_WRID_MSN(hw_cqe) != wq->rq.msn)) {
3499     t4_set_wq_in_error(wq);
3500     - hw_cqe->header |= htonl(CQE_STATUS_V(T4_ERR_MSN));
3501     - goto proc_cqe;
3502     + hw_cqe->header |= cpu_to_be32(CQE_STATUS_V(T4_ERR_MSN));
3503     }
3504     goto proc_cqe;
3505     }
3506     diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h
3507     index 6ff44dc606eb..3409eee16092 100644
3508     --- a/drivers/infiniband/hw/hfi1/hfi.h
3509     +++ b/drivers/infiniband/hw/hfi1/hfi.h
3510     @@ -1129,7 +1129,6 @@ struct hfi1_devdata {
3511     u16 pcie_lnkctl;
3512     u16 pcie_devctl2;
3513     u32 pci_msix0;
3514     - u32 pci_lnkctl3;
3515     u32 pci_tph2;
3516    
3517     /*
3518     diff --git a/drivers/infiniband/hw/hfi1/pcie.c b/drivers/infiniband/hw/hfi1/pcie.c
3519     index 09e50fd2a08f..8c7e7a60b715 100644
3520     --- a/drivers/infiniband/hw/hfi1/pcie.c
3521     +++ b/drivers/infiniband/hw/hfi1/pcie.c
3522     @@ -411,15 +411,12 @@ int restore_pci_variables(struct hfi1_devdata *dd)
3523     if (ret)
3524     goto error;
3525    
3526     - ret = pci_write_config_dword(dd->pcidev, PCIE_CFG_SPCIE1,
3527     - dd->pci_lnkctl3);
3528     - if (ret)
3529     - goto error;
3530     -
3531     - ret = pci_write_config_dword(dd->pcidev, PCIE_CFG_TPH2, dd->pci_tph2);
3532     - if (ret)
3533     - goto error;
3534     -
3535     + if (pci_find_ext_capability(dd->pcidev, PCI_EXT_CAP_ID_TPH)) {
3536     + ret = pci_write_config_dword(dd->pcidev, PCIE_CFG_TPH2,
3537     + dd->pci_tph2);
3538     + if (ret)
3539     + goto error;
3540     + }
3541     return 0;
3542    
3543     error:
3544     @@ -469,15 +466,12 @@ int save_pci_variables(struct hfi1_devdata *dd)
3545     if (ret)
3546     goto error;
3547    
3548     - ret = pci_read_config_dword(dd->pcidev, PCIE_CFG_SPCIE1,
3549     - &dd->pci_lnkctl3);
3550     - if (ret)
3551     - goto error;
3552     -
3553     - ret = pci_read_config_dword(dd->pcidev, PCIE_CFG_TPH2, &dd->pci_tph2);
3554     - if (ret)
3555     - goto error;
3556     -
3557     + if (pci_find_ext_capability(dd->pcidev, PCI_EXT_CAP_ID_TPH)) {
3558     + ret = pci_read_config_dword(dd->pcidev, PCIE_CFG_TPH2,
3559     + &dd->pci_tph2);
3560     + if (ret)
3561     + goto error;
3562     + }
3563     return 0;
3564    
3565     error:
3566     diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
3567     index 5aff1e33d984..30d479f87cb8 100644
3568     --- a/drivers/infiniband/hw/mlx5/main.c
3569     +++ b/drivers/infiniband/hw/mlx5/main.c
3570     @@ -1415,6 +1415,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
3571     }
3572    
3573     INIT_LIST_HEAD(&context->vma_private_list);
3574     + mutex_init(&context->vma_private_list_mutex);
3575     INIT_LIST_HEAD(&context->db_page_list);
3576     mutex_init(&context->db_page_mutex);
3577    
3578     @@ -1576,7 +1577,9 @@ static void mlx5_ib_vma_close(struct vm_area_struct *area)
3579     * mlx5_ib_disassociate_ucontext().
3580     */
3581     mlx5_ib_vma_priv_data->vma = NULL;
3582     + mutex_lock(mlx5_ib_vma_priv_data->vma_private_list_mutex);
3583     list_del(&mlx5_ib_vma_priv_data->list);
3584     + mutex_unlock(mlx5_ib_vma_priv_data->vma_private_list_mutex);
3585     kfree(mlx5_ib_vma_priv_data);
3586     }
3587    
3588     @@ -1596,10 +1599,13 @@ static int mlx5_ib_set_vma_data(struct vm_area_struct *vma,
3589     return -ENOMEM;
3590    
3591     vma_prv->vma = vma;
3592     + vma_prv->vma_private_list_mutex = &ctx->vma_private_list_mutex;
3593     vma->vm_private_data = vma_prv;
3594     vma->vm_ops = &mlx5_ib_vm_ops;
3595    
3596     + mutex_lock(&ctx->vma_private_list_mutex);
3597     list_add(&vma_prv->list, vma_head);
3598     + mutex_unlock(&ctx->vma_private_list_mutex);
3599    
3600     return 0;
3601     }
3602     @@ -1642,6 +1648,7 @@ static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext)
3603     * mlx5_ib_vma_close.
3604     */
3605     down_write(&owning_mm->mmap_sem);
3606     + mutex_lock(&context->vma_private_list_mutex);
3607     list_for_each_entry_safe(vma_private, n, &context->vma_private_list,
3608     list) {
3609     vma = vma_private->vma;
3610     @@ -1656,6 +1663,7 @@ static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext)
3611     list_del(&vma_private->list);
3612     kfree(vma_private);
3613     }
3614     + mutex_unlock(&context->vma_private_list_mutex);
3615     up_write(&owning_mm->mmap_sem);
3616     mmput(owning_mm);
3617     put_task_struct(owning_process);
3618     diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
3619     index 189e80cd6b2f..754103372faa 100644
3620     --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
3621     +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
3622     @@ -115,6 +115,8 @@ enum {
3623     struct mlx5_ib_vma_private_data {
3624     struct list_head list;
3625     struct vm_area_struct *vma;
3626     + /* protect vma_private_list add/del */
3627     + struct mutex *vma_private_list_mutex;
3628     };
3629    
3630     struct mlx5_ib_ucontext {
3631     @@ -129,6 +131,8 @@ struct mlx5_ib_ucontext {
3632     /* Transport Domain number */
3633     u32 tdn;
3634     struct list_head vma_private_list;
3635     + /* protect vma_private_list add/del */
3636     + struct mutex vma_private_list_mutex;
3637    
3638     unsigned long upd_xlt_page;
3639     /* protect ODP/KSM */
3640     diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
3641     index d7b53d53c116..72d6ffbfd638 100644
3642     --- a/drivers/net/dsa/bcm_sf2.c
3643     +++ b/drivers/net/dsa/bcm_sf2.c
3644     @@ -167,7 +167,7 @@ static void bcm_sf2_gphy_enable_set(struct dsa_switch *ds, bool enable)
3645     reg = reg_readl(priv, REG_SPHY_CNTRL);
3646     if (enable) {
3647     reg |= PHY_RESET;
3648     - reg &= ~(EXT_PWR_DOWN | IDDQ_BIAS | CK25_DIS);
3649     + reg &= ~(EXT_PWR_DOWN | IDDQ_BIAS | IDDQ_GLOBAL_PWR | CK25_DIS);
3650     reg_writel(priv, reg, REG_SPHY_CNTRL);
3651     udelay(21);
3652     reg = reg_readl(priv, REG_SPHY_CNTRL);
3653     diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
3654     index dc5de275352a..aa764c5e3c6b 100644
3655     --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
3656     +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
3657     @@ -1875,7 +1875,7 @@ static int bnxt_poll_work(struct bnxt *bp, struct bnxt_napi *bnapi, int budget)
3658     * here forever if we consistently cannot allocate
3659     * buffers.
3660     */
3661     - else if (rc == -ENOMEM)
3662     + else if (rc == -ENOMEM && budget)
3663     rx_pkts++;
3664     else if (rc == -EBUSY) /* partial completion */
3665     break;
3666     @@ -1961,7 +1961,7 @@ static int bnxt_poll_nitroa0(struct napi_struct *napi, int budget)
3667     cpu_to_le32(RX_CMPL_ERRORS_CRC_ERROR);
3668    
3669     rc = bnxt_rx_pkt(bp, bnapi, &raw_cons, &event);
3670     - if (likely(rc == -EIO))
3671     + if (likely(rc == -EIO) && budget)
3672     rx_pkts++;
3673     else if (rc == -EBUSY) /* partial completion */
3674     break;
3675     diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
3676     index 656e6af70f0a..aef3fcf2f5b9 100644
3677     --- a/drivers/net/ethernet/broadcom/tg3.c
3678     +++ b/drivers/net/ethernet/broadcom/tg3.c
3679     @@ -14227,7 +14227,9 @@ static int tg3_change_mtu(struct net_device *dev, int new_mtu)
3680     /* Reset PHY, otherwise the read DMA engine will be in a mode that
3681     * breaks all requests to 256 bytes.
3682     */
3683     - if (tg3_asic_rev(tp) == ASIC_REV_57766)
3684     + if (tg3_asic_rev(tp) == ASIC_REV_57766 ||
3685     + tg3_asic_rev(tp) == ASIC_REV_5717 ||
3686     + tg3_asic_rev(tp) == ASIC_REV_5719)
3687     reset_phy = true;
3688    
3689     err = tg3_restart_hw(tp, reset_phy);
3690     diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
3691     index 3dc2d771a222..faf7cdc97ebf 100644
3692     --- a/drivers/net/ethernet/freescale/fec_main.c
3693     +++ b/drivers/net/ethernet/freescale/fec_main.c
3694     @@ -818,6 +818,12 @@ static void fec_enet_bd_init(struct net_device *dev)
3695     for (i = 0; i < txq->bd.ring_size; i++) {
3696     /* Initialize the BD for every fragment in the page. */
3697     bdp->cbd_sc = cpu_to_fec16(0);
3698     + if (bdp->cbd_bufaddr &&
3699     + !IS_TSO_HEADER(txq, fec32_to_cpu(bdp->cbd_bufaddr)))
3700     + dma_unmap_single(&fep->pdev->dev,
3701     + fec32_to_cpu(bdp->cbd_bufaddr),
3702     + fec16_to_cpu(bdp->cbd_datlen),
3703     + DMA_TO_DEVICE);
3704     if (txq->tx_skbuff[i]) {
3705     dev_kfree_skb_any(txq->tx_skbuff[i]);
3706     txq->tx_skbuff[i] = NULL;
3707     diff --git a/drivers/net/ethernet/marvell/mvmdio.c b/drivers/net/ethernet/marvell/mvmdio.c
3708     index c9798210fa0f..0495487f7b42 100644
3709     --- a/drivers/net/ethernet/marvell/mvmdio.c
3710     +++ b/drivers/net/ethernet/marvell/mvmdio.c
3711     @@ -344,7 +344,8 @@ static int orion_mdio_probe(struct platform_device *pdev)
3712     dev->regs + MVMDIO_ERR_INT_MASK);
3713    
3714     } else if (dev->err_interrupt == -EPROBE_DEFER) {
3715     - return -EPROBE_DEFER;
3716     + ret = -EPROBE_DEFER;
3717     + goto out_mdio;
3718     }
3719    
3720     if (pdev->dev.of_node)
3721     diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
3722     index 1fffdebbc9e8..e9a1fbcc4adf 100644
3723     --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
3724     +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
3725     @@ -362,7 +362,7 @@ static int mlx5_internal_err_ret_value(struct mlx5_core_dev *dev, u16 op,
3726     case MLX5_CMD_OP_QUERY_VPORT_COUNTER:
3727     case MLX5_CMD_OP_ALLOC_Q_COUNTER:
3728     case MLX5_CMD_OP_QUERY_Q_COUNTER:
3729     - case MLX5_CMD_OP_SET_RATE_LIMIT:
3730     + case MLX5_CMD_OP_SET_PP_RATE_LIMIT:
3731     case MLX5_CMD_OP_QUERY_RATE_LIMIT:
3732     case MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT:
3733     case MLX5_CMD_OP_QUERY_SCHEDULING_ELEMENT:
3734     @@ -505,7 +505,7 @@ const char *mlx5_command_str(int command)
3735     MLX5_COMMAND_STR_CASE(ALLOC_Q_COUNTER);
3736     MLX5_COMMAND_STR_CASE(DEALLOC_Q_COUNTER);
3737     MLX5_COMMAND_STR_CASE(QUERY_Q_COUNTER);
3738     - MLX5_COMMAND_STR_CASE(SET_RATE_LIMIT);
3739     + MLX5_COMMAND_STR_CASE(SET_PP_RATE_LIMIT);
3740     MLX5_COMMAND_STR_CASE(QUERY_RATE_LIMIT);
3741     MLX5_COMMAND_STR_CASE(CREATE_SCHEDULING_ELEMENT);
3742     MLX5_COMMAND_STR_CASE(DESTROY_SCHEDULING_ELEMENT);
3743     diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
3744     index 13b5ef9d8703..5fa071620104 100644
3745     --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
3746     +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
3747     @@ -590,6 +590,7 @@ struct mlx5e_channel {
3748     struct mlx5_core_dev *mdev;
3749     struct mlx5e_tstamp *tstamp;
3750     int ix;
3751     + int cpu;
3752     };
3753    
3754     struct mlx5e_channels {
3755     diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
3756     index cc11bbbd0309..3cdb932cae76 100644
3757     --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
3758     +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
3759     @@ -71,11 +71,6 @@ struct mlx5e_channel_param {
3760     struct mlx5e_cq_param icosq_cq;
3761     };
3762    
3763     -static int mlx5e_get_node(struct mlx5e_priv *priv, int ix)
3764     -{
3765     - return pci_irq_get_node(priv->mdev->pdev, MLX5_EQ_VEC_COMP_BASE + ix);
3766     -}
3767     -
3768     static bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev)
3769     {
3770     return MLX5_CAP_GEN(mdev, striding_rq) &&
3771     @@ -452,17 +447,16 @@ static int mlx5e_rq_alloc_mpwqe_info(struct mlx5e_rq *rq,
3772     int wq_sz = mlx5_wq_ll_get_size(&rq->wq);
3773     int mtt_sz = mlx5e_get_wqe_mtt_sz();
3774     int mtt_alloc = mtt_sz + MLX5_UMR_ALIGN - 1;
3775     - int node = mlx5e_get_node(c->priv, c->ix);
3776     int i;
3777    
3778     rq->mpwqe.info = kzalloc_node(wq_sz * sizeof(*rq->mpwqe.info),
3779     - GFP_KERNEL, node);
3780     + GFP_KERNEL, cpu_to_node(c->cpu));
3781     if (!rq->mpwqe.info)
3782     goto err_out;
3783    
3784     /* We allocate more than mtt_sz as we will align the pointer */
3785     - rq->mpwqe.mtt_no_align = kzalloc_node(mtt_alloc * wq_sz,
3786     - GFP_KERNEL, node);
3787     + rq->mpwqe.mtt_no_align = kzalloc_node(mtt_alloc * wq_sz, GFP_KERNEL,
3788     + cpu_to_node(c->cpu));
3789     if (unlikely(!rq->mpwqe.mtt_no_align))
3790     goto err_free_wqe_info;
3791    
3792     @@ -570,7 +564,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
3793     int err;
3794     int i;
3795    
3796     - rqp->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix);
3797     + rqp->wq.db_numa_node = cpu_to_node(c->cpu);
3798    
3799     err = mlx5_wq_ll_create(mdev, &rqp->wq, rqc_wq, &rq->wq,
3800     &rq->wq_ctrl);
3801     @@ -636,8 +630,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
3802     default: /* MLX5_WQ_TYPE_LINKED_LIST */
3803     rq->wqe.frag_info =
3804     kzalloc_node(wq_sz * sizeof(*rq->wqe.frag_info),
3805     - GFP_KERNEL,
3806     - mlx5e_get_node(c->priv, c->ix));
3807     + GFP_KERNEL, cpu_to_node(c->cpu));
3808     if (!rq->wqe.frag_info) {
3809     err = -ENOMEM;
3810     goto err_rq_wq_destroy;
3811     @@ -1007,13 +1000,13 @@ static int mlx5e_alloc_xdpsq(struct mlx5e_channel *c,
3812     sq->uar_map = mdev->mlx5e_res.bfreg.map;
3813     sq->min_inline_mode = params->tx_min_inline_mode;
3814    
3815     - param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix);
3816     + param->wq.db_numa_node = cpu_to_node(c->cpu);
3817     err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq, &sq->wq_ctrl);
3818     if (err)
3819     return err;
3820     sq->wq.db = &sq->wq.db[MLX5_SND_DBR];
3821    
3822     - err = mlx5e_alloc_xdpsq_db(sq, mlx5e_get_node(c->priv, c->ix));
3823     + err = mlx5e_alloc_xdpsq_db(sq, cpu_to_node(c->cpu));
3824     if (err)
3825     goto err_sq_wq_destroy;
3826    
3827     @@ -1060,13 +1053,13 @@ static int mlx5e_alloc_icosq(struct mlx5e_channel *c,
3828     sq->channel = c;
3829     sq->uar_map = mdev->mlx5e_res.bfreg.map;
3830    
3831     - param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix);
3832     + param->wq.db_numa_node = cpu_to_node(c->cpu);
3833     err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq, &sq->wq_ctrl);
3834     if (err)
3835     return err;
3836     sq->wq.db = &sq->wq.db[MLX5_SND_DBR];
3837    
3838     - err = mlx5e_alloc_icosq_db(sq, mlx5e_get_node(c->priv, c->ix));
3839     + err = mlx5e_alloc_icosq_db(sq, cpu_to_node(c->cpu));
3840     if (err)
3841     goto err_sq_wq_destroy;
3842    
3843     @@ -1132,13 +1125,13 @@ static int mlx5e_alloc_txqsq(struct mlx5e_channel *c,
3844     if (MLX5_IPSEC_DEV(c->priv->mdev))
3845     set_bit(MLX5E_SQ_STATE_IPSEC, &sq->state);
3846    
3847     - param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix);
3848     + param->wq.db_numa_node = cpu_to_node(c->cpu);
3849     err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq, &sq->wq_ctrl);
3850     if (err)
3851     return err;
3852     sq->wq.db = &sq->wq.db[MLX5_SND_DBR];
3853    
3854     - err = mlx5e_alloc_txqsq_db(sq, mlx5e_get_node(c->priv, c->ix));
3855     + err = mlx5e_alloc_txqsq_db(sq, cpu_to_node(c->cpu));
3856     if (err)
3857     goto err_sq_wq_destroy;
3858    
3859     @@ -1510,8 +1503,8 @@ static int mlx5e_alloc_cq(struct mlx5e_channel *c,
3860     struct mlx5_core_dev *mdev = c->priv->mdev;
3861     int err;
3862    
3863     - param->wq.buf_numa_node = mlx5e_get_node(c->priv, c->ix);
3864     - param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix);
3865     + param->wq.buf_numa_node = cpu_to_node(c->cpu);
3866     + param->wq.db_numa_node = cpu_to_node(c->cpu);
3867     param->eq_ix = c->ix;
3868    
3869     err = mlx5e_alloc_cq_common(mdev, param, cq);
3870     @@ -1610,6 +1603,11 @@ static void mlx5e_close_cq(struct mlx5e_cq *cq)
3871     mlx5e_free_cq(cq);
3872     }
3873    
3874     +static int mlx5e_get_cpu(struct mlx5e_priv *priv, int ix)
3875     +{
3876     + return cpumask_first(priv->mdev->priv.irq_info[ix].mask);
3877     +}
3878     +
3879     static int mlx5e_open_tx_cqs(struct mlx5e_channel *c,
3880     struct mlx5e_params *params,
3881     struct mlx5e_channel_param *cparam)
3882     @@ -1758,12 +1756,13 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
3883     {
3884     struct mlx5e_cq_moder icocq_moder = {0, 0};
3885     struct net_device *netdev = priv->netdev;
3886     + int cpu = mlx5e_get_cpu(priv, ix);
3887     struct mlx5e_channel *c;
3888     unsigned int irq;
3889     int err;
3890     int eqn;
3891    
3892     - c = kzalloc_node(sizeof(*c), GFP_KERNEL, mlx5e_get_node(priv, ix));
3893     + c = kzalloc_node(sizeof(*c), GFP_KERNEL, cpu_to_node(cpu));
3894     if (!c)
3895     return -ENOMEM;
3896    
3897     @@ -1771,6 +1770,7 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
3898     c->mdev = priv->mdev;
3899     c->tstamp = &priv->tstamp;
3900     c->ix = ix;
3901     + c->cpu = cpu;
3902     c->pdev = &priv->mdev->pdev->dev;
3903     c->netdev = priv->netdev;
3904     c->mkey_be = cpu_to_be32(priv->mdev->mlx5e_res.mkey.key);
3905     @@ -1859,8 +1859,7 @@ static void mlx5e_activate_channel(struct mlx5e_channel *c)
3906     for (tc = 0; tc < c->num_tc; tc++)
3907     mlx5e_activate_txqsq(&c->sq[tc]);
3908     mlx5e_activate_rq(&c->rq);
3909     - netif_set_xps_queue(c->netdev,
3910     - mlx5_get_vector_affinity(c->priv->mdev, c->ix), c->ix);
3911     + netif_set_xps_queue(c->netdev, get_cpu_mask(c->cpu), c->ix);
3912     }
3913    
3914     static void mlx5e_deactivate_channel(struct mlx5e_channel *c)
3915     @@ -3554,6 +3553,7 @@ static netdev_features_t mlx5e_tunnel_features_check(struct mlx5e_priv *priv,
3916     struct sk_buff *skb,
3917     netdev_features_t features)
3918     {
3919     + unsigned int offset = 0;
3920     struct udphdr *udph;
3921     u8 proto;
3922     u16 port;
3923     @@ -3563,7 +3563,7 @@ static netdev_features_t mlx5e_tunnel_features_check(struct mlx5e_priv *priv,
3924     proto = ip_hdr(skb)->protocol;
3925     break;
3926     case htons(ETH_P_IPV6):
3927     - proto = ipv6_hdr(skb)->nexthdr;
3928     + proto = ipv6_find_hdr(skb, &offset, -1, NULL, NULL);
3929     break;
3930     default:
3931     goto out;
3932     diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.c
3933     index 3c11d6e2160a..14962969c5ba 100644
3934     --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.c
3935     +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.c
3936     @@ -66,6 +66,9 @@ static int mlx5_fpga_mem_read_i2c(struct mlx5_fpga_device *fdev, size_t size,
3937     u8 actual_size;
3938     int err;
3939    
3940     + if (!size)
3941     + return -EINVAL;
3942     +
3943     if (!fdev->mdev)
3944     return -ENOTCONN;
3945    
3946     @@ -95,6 +98,9 @@ static int mlx5_fpga_mem_write_i2c(struct mlx5_fpga_device *fdev, size_t size,
3947     u8 actual_size;
3948     int err;
3949    
3950     + if (!size)
3951     + return -EINVAL;
3952     +
3953     if (!fdev->mdev)
3954     return -ENOTCONN;
3955    
3956     diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
3957     index 06562c9a6b9c..8bfc37e4ec87 100644
3958     --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
3959     +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
3960     @@ -316,9 +316,6 @@ static int mlx5_alloc_irq_vectors(struct mlx5_core_dev *dev)
3961     {
3962     struct mlx5_priv *priv = &dev->priv;
3963     struct mlx5_eq_table *table = &priv->eq_table;
3964     - struct irq_affinity irqdesc = {
3965     - .pre_vectors = MLX5_EQ_VEC_COMP_BASE,
3966     - };
3967     int num_eqs = 1 << MLX5_CAP_GEN(dev, log_max_eq);
3968     int nvec;
3969    
3970     @@ -332,10 +329,9 @@ static int mlx5_alloc_irq_vectors(struct mlx5_core_dev *dev)
3971     if (!priv->irq_info)
3972     goto err_free_msix;
3973    
3974     - nvec = pci_alloc_irq_vectors_affinity(dev->pdev,
3975     + nvec = pci_alloc_irq_vectors(dev->pdev,
3976     MLX5_EQ_VEC_COMP_BASE + 1, nvec,
3977     - PCI_IRQ_MSIX | PCI_IRQ_AFFINITY,
3978     - &irqdesc);
3979     + PCI_IRQ_MSIX);
3980     if (nvec < 0)
3981     return nvec;
3982    
3983     @@ -621,6 +617,63 @@ u64 mlx5_read_internal_timer(struct mlx5_core_dev *dev)
3984     return (u64)timer_l | (u64)timer_h1 << 32;
3985     }
3986    
3987     +static int mlx5_irq_set_affinity_hint(struct mlx5_core_dev *mdev, int i)
3988     +{
3989     + struct mlx5_priv *priv = &mdev->priv;
3990     + int irq = pci_irq_vector(mdev->pdev, MLX5_EQ_VEC_COMP_BASE + i);
3991     +
3992     + if (!zalloc_cpumask_var(&priv->irq_info[i].mask, GFP_KERNEL)) {
3993     + mlx5_core_warn(mdev, "zalloc_cpumask_var failed");
3994     + return -ENOMEM;
3995     + }
3996     +
3997     + cpumask_set_cpu(cpumask_local_spread(i, priv->numa_node),
3998     + priv->irq_info[i].mask);
3999     +
4000     + if (IS_ENABLED(CONFIG_SMP) &&
4001     + irq_set_affinity_hint(irq, priv->irq_info[i].mask))
4002     + mlx5_core_warn(mdev, "irq_set_affinity_hint failed, irq 0x%.4x", irq);
4003     +
4004     + return 0;
4005     +}
4006     +
4007     +static void mlx5_irq_clear_affinity_hint(struct mlx5_core_dev *mdev, int i)
4008     +{
4009     + struct mlx5_priv *priv = &mdev->priv;
4010     + int irq = pci_irq_vector(mdev->pdev, MLX5_EQ_VEC_COMP_BASE + i);
4011     +
4012     + irq_set_affinity_hint(irq, NULL);
4013     + free_cpumask_var(priv->irq_info[i].mask);
4014     +}
4015     +
4016     +static int mlx5_irq_set_affinity_hints(struct mlx5_core_dev *mdev)
4017     +{
4018     + int err;
4019     + int i;
4020     +
4021     + for (i = 0; i < mdev->priv.eq_table.num_comp_vectors; i++) {
4022     + err = mlx5_irq_set_affinity_hint(mdev, i);
4023     + if (err)
4024     + goto err_out;
4025     + }
4026     +
4027     + return 0;
4028     +
4029     +err_out:
4030     + for (i--; i >= 0; i--)
4031     + mlx5_irq_clear_affinity_hint(mdev, i);
4032     +
4033     + return err;
4034     +}
4035     +
4036     +static void mlx5_irq_clear_affinity_hints(struct mlx5_core_dev *mdev)
4037     +{
4038     + int i;
4039     +
4040     + for (i = 0; i < mdev->priv.eq_table.num_comp_vectors; i++)
4041     + mlx5_irq_clear_affinity_hint(mdev, i);
4042     +}
4043     +
4044     int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn,
4045     unsigned int *irqn)
4046     {
4047     @@ -1093,6 +1146,12 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
4048     goto err_stop_eqs;
4049     }
4050    
4051     + err = mlx5_irq_set_affinity_hints(dev);
4052     + if (err) {
4053     + dev_err(&pdev->dev, "Failed to alloc affinity hint cpumask\n");
4054     + goto err_affinity_hints;
4055     + }
4056     +
4057     err = mlx5_init_fs(dev);
4058     if (err) {
4059     dev_err(&pdev->dev, "Failed to init flow steering\n");
4060     @@ -1150,6 +1209,9 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
4061     mlx5_cleanup_fs(dev);
4062    
4063     err_fs:
4064     + mlx5_irq_clear_affinity_hints(dev);
4065     +
4066     +err_affinity_hints:
4067     free_comp_eqs(dev);
4068    
4069     err_stop_eqs:
4070     @@ -1218,6 +1280,7 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
4071    
4072     mlx5_sriov_detach(dev);
4073     mlx5_cleanup_fs(dev);
4074     + mlx5_irq_clear_affinity_hints(dev);
4075     free_comp_eqs(dev);
4076     mlx5_stop_eqs(dev);
4077     mlx5_put_uars_page(dev, priv->uar);
4078     diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
4079     index db9e665ab104..889130edb715 100644
4080     --- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c
4081     +++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
4082     @@ -213,8 +213,8 @@ int mlx5_core_create_qp(struct mlx5_core_dev *dev,
4083     err_cmd:
4084     memset(din, 0, sizeof(din));
4085     memset(dout, 0, sizeof(dout));
4086     - MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
4087     - MLX5_SET(destroy_qp_in, in, qpn, qp->qpn);
4088     + MLX5_SET(destroy_qp_in, din, opcode, MLX5_CMD_OP_DESTROY_QP);
4089     + MLX5_SET(destroy_qp_in, din, qpn, qp->qpn);
4090     mlx5_cmd_exec(dev, din, sizeof(din), dout, sizeof(dout));
4091     return err;
4092     }
4093     diff --git a/drivers/net/ethernet/mellanox/mlx5/core/rl.c b/drivers/net/ethernet/mellanox/mlx5/core/rl.c
4094     index e651e4c02867..d3c33e9eea72 100644
4095     --- a/drivers/net/ethernet/mellanox/mlx5/core/rl.c
4096     +++ b/drivers/net/ethernet/mellanox/mlx5/core/rl.c
4097     @@ -125,16 +125,16 @@ static struct mlx5_rl_entry *find_rl_entry(struct mlx5_rl_table *table,
4098     return ret_entry;
4099     }
4100    
4101     -static int mlx5_set_rate_limit_cmd(struct mlx5_core_dev *dev,
4102     +static int mlx5_set_pp_rate_limit_cmd(struct mlx5_core_dev *dev,
4103     u32 rate, u16 index)
4104     {
4105     - u32 in[MLX5_ST_SZ_DW(set_rate_limit_in)] = {0};
4106     - u32 out[MLX5_ST_SZ_DW(set_rate_limit_out)] = {0};
4107     + u32 in[MLX5_ST_SZ_DW(set_pp_rate_limit_in)] = {0};
4108     + u32 out[MLX5_ST_SZ_DW(set_pp_rate_limit_out)] = {0};
4109    
4110     - MLX5_SET(set_rate_limit_in, in, opcode,
4111     - MLX5_CMD_OP_SET_RATE_LIMIT);
4112     - MLX5_SET(set_rate_limit_in, in, rate_limit_index, index);
4113     - MLX5_SET(set_rate_limit_in, in, rate_limit, rate);
4114     + MLX5_SET(set_pp_rate_limit_in, in, opcode,
4115     + MLX5_CMD_OP_SET_PP_RATE_LIMIT);
4116     + MLX5_SET(set_pp_rate_limit_in, in, rate_limit_index, index);
4117     + MLX5_SET(set_pp_rate_limit_in, in, rate_limit, rate);
4118     return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
4119     }
4120    
4121     @@ -173,7 +173,7 @@ int mlx5_rl_add_rate(struct mlx5_core_dev *dev, u32 rate, u16 *index)
4122     entry->refcount++;
4123     } else {
4124     /* new rate limit */
4125     - err = mlx5_set_rate_limit_cmd(dev, rate, entry->index);
4126     + err = mlx5_set_pp_rate_limit_cmd(dev, rate, entry->index);
4127     if (err) {
4128     mlx5_core_err(dev, "Failed configuring rate: %u (%d)\n",
4129     rate, err);
4130     @@ -209,7 +209,7 @@ void mlx5_rl_remove_rate(struct mlx5_core_dev *dev, u32 rate)
4131     entry->refcount--;
4132     if (!entry->refcount) {
4133     /* need to remove rate */
4134     - mlx5_set_rate_limit_cmd(dev, 0, entry->index);
4135     + mlx5_set_pp_rate_limit_cmd(dev, 0, entry->index);
4136     entry->rate = 0;
4137     }
4138    
4139     @@ -262,8 +262,8 @@ void mlx5_cleanup_rl_table(struct mlx5_core_dev *dev)
4140     /* Clear all configured rates */
4141     for (i = 0; i < table->max_size; i++)
4142     if (table->rl_entry[i].rate)
4143     - mlx5_set_rate_limit_cmd(dev, 0,
4144     - table->rl_entry[i].index);
4145     + mlx5_set_pp_rate_limit_cmd(dev, 0,
4146     + table->rl_entry[i].index);
4147    
4148     kfree(dev->priv.rl_table.rl_entry);
4149     }
4150     diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
4151     index 07a9ba6cfc70..2f74953e4561 100644
4152     --- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
4153     +++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
4154     @@ -71,9 +71,9 @@ struct mlx5e_vxlan *mlx5e_vxlan_lookup_port(struct mlx5e_priv *priv, u16 port)
4155     struct mlx5e_vxlan_db *vxlan_db = &priv->vxlan;
4156     struct mlx5e_vxlan *vxlan;
4157    
4158     - spin_lock(&vxlan_db->lock);
4159     + spin_lock_bh(&vxlan_db->lock);
4160     vxlan = radix_tree_lookup(&vxlan_db->tree, port);
4161     - spin_unlock(&vxlan_db->lock);
4162     + spin_unlock_bh(&vxlan_db->lock);
4163    
4164     return vxlan;
4165     }
4166     @@ -88,8 +88,12 @@ static void mlx5e_vxlan_add_port(struct work_struct *work)
4167     struct mlx5e_vxlan *vxlan;
4168     int err;
4169    
4170     - if (mlx5e_vxlan_lookup_port(priv, port))
4171     + mutex_lock(&priv->state_lock);
4172     + vxlan = mlx5e_vxlan_lookup_port(priv, port);
4173     + if (vxlan) {
4174     + atomic_inc(&vxlan->refcount);
4175     goto free_work;
4176     + }
4177    
4178     if (mlx5e_vxlan_core_add_port_cmd(priv->mdev, port))
4179     goto free_work;
4180     @@ -99,10 +103,11 @@ static void mlx5e_vxlan_add_port(struct work_struct *work)
4181     goto err_delete_port;
4182    
4183     vxlan->udp_port = port;
4184     + atomic_set(&vxlan->refcount, 1);
4185    
4186     - spin_lock_irq(&vxlan_db->lock);
4187     + spin_lock_bh(&vxlan_db->lock);
4188     err = radix_tree_insert(&vxlan_db->tree, vxlan->udp_port, vxlan);
4189     - spin_unlock_irq(&vxlan_db->lock);
4190     + spin_unlock_bh(&vxlan_db->lock);
4191     if (err)
4192     goto err_free;
4193    
4194     @@ -113,35 +118,39 @@ static void mlx5e_vxlan_add_port(struct work_struct *work)
4195     err_delete_port:
4196     mlx5e_vxlan_core_del_port_cmd(priv->mdev, port);
4197     free_work:
4198     + mutex_unlock(&priv->state_lock);
4199     kfree(vxlan_work);
4200     }
4201    
4202     -static void __mlx5e_vxlan_core_del_port(struct mlx5e_priv *priv, u16 port)
4203     +static void mlx5e_vxlan_del_port(struct work_struct *work)
4204     {
4205     + struct mlx5e_vxlan_work *vxlan_work =
4206     + container_of(work, struct mlx5e_vxlan_work, work);
4207     + struct mlx5e_priv *priv = vxlan_work->priv;
4208     struct mlx5e_vxlan_db *vxlan_db = &priv->vxlan;
4209     + u16 port = vxlan_work->port;
4210     struct mlx5e_vxlan *vxlan;
4211     + bool remove = false;
4212    
4213     - spin_lock_irq(&vxlan_db->lock);
4214     - vxlan = radix_tree_delete(&vxlan_db->tree, port);
4215     - spin_unlock_irq(&vxlan_db->lock);
4216     -
4217     + mutex_lock(&priv->state_lock);
4218     + spin_lock_bh(&vxlan_db->lock);
4219     + vxlan = radix_tree_lookup(&vxlan_db->tree, port);
4220     if (!vxlan)
4221     - return;
4222     -
4223     - mlx5e_vxlan_core_del_port_cmd(priv->mdev, vxlan->udp_port);
4224     -
4225     - kfree(vxlan);
4226     -}
4227     + goto out_unlock;
4228    
4229     -static void mlx5e_vxlan_del_port(struct work_struct *work)
4230     -{
4231     - struct mlx5e_vxlan_work *vxlan_work =
4232     - container_of(work, struct mlx5e_vxlan_work, work);
4233     - struct mlx5e_priv *priv = vxlan_work->priv;
4234     - u16 port = vxlan_work->port;
4235     + if (atomic_dec_and_test(&vxlan->refcount)) {
4236     + radix_tree_delete(&vxlan_db->tree, port);
4237     + remove = true;
4238     + }
4239    
4240     - __mlx5e_vxlan_core_del_port(priv, port);
4241     +out_unlock:
4242     + spin_unlock_bh(&vxlan_db->lock);
4243    
4244     + if (remove) {
4245     + mlx5e_vxlan_core_del_port_cmd(priv->mdev, port);
4246     + kfree(vxlan);
4247     + }
4248     + mutex_unlock(&priv->state_lock);
4249     kfree(vxlan_work);
4250     }
4251    
4252     @@ -171,12 +180,11 @@ void mlx5e_vxlan_cleanup(struct mlx5e_priv *priv)
4253     struct mlx5e_vxlan *vxlan;
4254     unsigned int port = 0;
4255    
4256     - spin_lock_irq(&vxlan_db->lock);
4257     + /* Lockless since we are the only radix-tree consumers, wq is disabled */
4258     while (radix_tree_gang_lookup(&vxlan_db->tree, (void **)&vxlan, port, 1)) {
4259     port = vxlan->udp_port;
4260     - spin_unlock_irq(&vxlan_db->lock);
4261     - __mlx5e_vxlan_core_del_port(priv, (u16)port);
4262     - spin_lock_irq(&vxlan_db->lock);
4263     + radix_tree_delete(&vxlan_db->tree, port);
4264     + mlx5e_vxlan_core_del_port_cmd(priv->mdev, port);
4265     + kfree(vxlan);
4266     }
4267     - spin_unlock_irq(&vxlan_db->lock);
4268     }
4269     diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h
4270     index 5def12c048e3..5ef6ae7d568a 100644
4271     --- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h
4272     +++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h
4273     @@ -36,6 +36,7 @@
4274     #include "en.h"
4275    
4276     struct mlx5e_vxlan {
4277     + atomic_t refcount;
4278     u16 udp_port;
4279     };
4280    
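
Together, the vxlan.c and vxlan.h hunks make repeated offload requests for the same UDP port share a single entry via an atomic refcount, removing the entry (and issuing the firmware delete command) only when the last user drops it. A toy userspace version of that add/del pattern with C11 atomics — the one-slot "table" and all names are illustrative assumptions, not the mlx5 code:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct port_entry {
        atomic_int refcount;
        unsigned short udp_port;
};

static struct port_entry *slot;         /* toy one-entry "port table" */

static void add_port(unsigned short port)
{
        if (slot) {                     /* port already offloaded: share it */
                atomic_fetch_add(&slot->refcount, 1);
                return;
        }
        slot = malloc(sizeof(*slot));
        if (!slot)
                return;
        atomic_init(&slot->refcount, 1);
        slot->udp_port = port;
}

static void del_port(void)
{
        if (!slot)
                return;
        if (atomic_fetch_sub(&slot->refcount, 1) == 1) {
                free(slot);             /* last user: really remove the port */
                slot = NULL;
        }
}

int main(void)
{
        add_port(4789);
        add_port(4789);                 /* second user, no second entry */

        del_port();
        printf("after first del:  entry %s\n", slot ? "kept" : "gone");
        del_port();
        printf("after second del: entry %s\n", slot ? "kept" : "gone");
        return 0;
}
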
4281     diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
4282     index db38880f54b4..3ead7439821c 100644
4283     --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
4284     +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
4285     @@ -4164,6 +4164,7 @@ static int mlxsw_sp_port_stp_set(struct mlxsw_sp_port *mlxsw_sp_port,
4286    
4287     static int mlxsw_sp_port_ovs_join(struct mlxsw_sp_port *mlxsw_sp_port)
4288     {
4289     + u16 vid = 1;
4290     int err;
4291    
4292     err = mlxsw_sp_port_vp_mode_set(mlxsw_sp_port, true);
4293     @@ -4176,8 +4177,19 @@ static int mlxsw_sp_port_ovs_join(struct mlxsw_sp_port *mlxsw_sp_port)
4294     true, false);
4295     if (err)
4296     goto err_port_vlan_set;
4297     +
4298     + for (; vid <= VLAN_N_VID - 1; vid++) {
4299     + err = mlxsw_sp_port_vid_learning_set(mlxsw_sp_port,
4300     + vid, false);
4301     + if (err)
4302     + goto err_vid_learning_set;
4303     + }
4304     +
4305     return 0;
4306    
4307     +err_vid_learning_set:
4308     + for (vid--; vid >= 1; vid--)
4309     + mlxsw_sp_port_vid_learning_set(mlxsw_sp_port, vid, true);
4310     err_port_vlan_set:
4311     mlxsw_sp_port_stp_set(mlxsw_sp_port, false);
4312     err_port_stp_set:
4313     @@ -4187,6 +4199,12 @@ static int mlxsw_sp_port_ovs_join(struct mlxsw_sp_port *mlxsw_sp_port)
4314    
4315     static void mlxsw_sp_port_ovs_leave(struct mlxsw_sp_port *mlxsw_sp_port)
4316     {
4317     + u16 vid;
4318     +
4319     + for (vid = VLAN_N_VID - 1; vid >= 1; vid--)
4320     + mlxsw_sp_port_vid_learning_set(mlxsw_sp_port,
4321     + vid, true);
4322     +
4323     mlxsw_sp_port_vlan_set(mlxsw_sp_port, 2, VLAN_N_VID - 1,
4324     false, false);
4325     mlxsw_sp_port_stp_set(mlxsw_sp_port, false);
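
The mlxsw_sp_port_ovs_join() hunk above disables learning VID by VID and, if any VID fails, walks the already-configured range backwards to restore it before bailing out; ovs_leave later undoes the whole range. That unwind-on-error loop shape, reduced to plain C — every function and the simulated failure below are placeholders, not the mlxsw API:

#include <stdio.h>

#define NR_ITEMS 8

static int configured[NR_ITEMS];

static int configure(int i)
{
        if (i == 5)             /* simulate a mid-loop failure */
                return -1;
        configured[i] = 1;
        return 0;
}

static void unconfigure(int i)
{
        configured[i] = 0;
}

static int configure_all(void)
{
        int i, err;

        for (i = 0; i < NR_ITEMS; i++) {
                err = configure(i);
                if (err)
                        goto err_unwind;
        }
        return 0;

err_unwind:
        for (i--; i >= 0; i--)  /* roll back only what actually succeeded */
                unconfigure(i);
        return err;
}

int main(void)
{
        int i;

        printf("configure_all: %d\n", configure_all());
        for (i = 0; i < NR_ITEMS; i++)
                printf("item %d configured: %d\n", i, configured[i]);
        return 0;
}

Rolling back from the failing index downwards leaves the device exactly as it was before the call, which is why the kernel hunk decrements vid before re-enabling learning.
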
4326     diff --git a/drivers/net/ethernet/sfc/tx.c b/drivers/net/ethernet/sfc/tx.c
4327     index 32bf1fecf864..9b85cbd5a231 100644
4328     --- a/drivers/net/ethernet/sfc/tx.c
4329     +++ b/drivers/net/ethernet/sfc/tx.c
4330     @@ -77,6 +77,7 @@ static void efx_dequeue_buffer(struct efx_tx_queue *tx_queue,
4331     }
4332    
4333     if (buffer->flags & EFX_TX_BUF_SKB) {
4334     + EFX_WARN_ON_PARANOID(!pkts_compl || !bytes_compl);
4335     (*pkts_compl)++;
4336     (*bytes_compl) += buffer->skb->len;
4337     dev_consume_skb_any((struct sk_buff *)buffer->skb);
4338     @@ -426,12 +427,14 @@ static int efx_tx_map_data(struct efx_tx_queue *tx_queue, struct sk_buff *skb,
4339     static void efx_enqueue_unwind(struct efx_tx_queue *tx_queue)
4340     {
4341     struct efx_tx_buffer *buffer;
4342     + unsigned int bytes_compl = 0;
4343     + unsigned int pkts_compl = 0;
4344    
4345     /* Work backwards until we hit the original insert pointer value */
4346     while (tx_queue->insert_count != tx_queue->write_count) {
4347     --tx_queue->insert_count;
4348     buffer = __efx_tx_queue_get_insert_buffer(tx_queue);
4349     - efx_dequeue_buffer(tx_queue, buffer, NULL, NULL);
4350     + efx_dequeue_buffer(tx_queue, buffer, &pkts_compl, &bytes_compl);
4351     }
4352     }
4353    
4354     diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c
4355     index 4d02b27df044..a3f456b91c99 100644
4356     --- a/drivers/net/phy/marvell.c
4357     +++ b/drivers/net/phy/marvell.c
4358     @@ -2069,7 +2069,7 @@ static struct phy_driver marvell_drivers[] = {
4359     .flags = PHY_HAS_INTERRUPT,
4360     .probe = marvell_probe,
4361     .config_init = &m88e1145_config_init,
4362     - .config_aneg = &marvell_config_aneg,
4363     + .config_aneg = &m88e1101_config_aneg,
4364     .read_status = &genphy_read_status,
4365     .ack_interrupt = &marvell_ack_interrupt,
4366     .config_intr = &marvell_config_intr,
4367     diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c
4368     index fdb43dd9b5cd..6c45ff650ec7 100644
4369     --- a/drivers/net/phy/micrel.c
4370     +++ b/drivers/net/phy/micrel.c
4371     @@ -622,6 +622,7 @@ static int ksz9031_read_status(struct phy_device *phydev)
4372     phydev->link = 0;
4373     if (phydev->drv->config_intr && phy_interrupt_is_valid(phydev))
4374     phydev->drv->config_intr(phydev);
4375     + return genphy_config_aneg(phydev);
4376     }
4377    
4378     return 0;
4379     diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
4380     index bcb4755bcd95..4b377b978a0b 100644
4381     --- a/drivers/net/phy/phylink.c
4382     +++ b/drivers/net/phy/phylink.c
4383     @@ -525,6 +525,7 @@ struct phylink *phylink_create(struct net_device *ndev, struct device_node *np,
4384     pl->link_config.pause = MLO_PAUSE_AN;
4385     pl->link_config.speed = SPEED_UNKNOWN;
4386     pl->link_config.duplex = DUPLEX_UNKNOWN;
4387     + pl->link_config.an_enabled = true;
4388     pl->ops = ops;
4389     __set_bit(PHYLINK_DISABLE_STOPPED, &pl->phylink_disable_state);
4390    
4391     @@ -948,6 +949,7 @@ int phylink_ethtool_ksettings_set(struct phylink *pl,
4392     mutex_lock(&pl->state_mutex);
4393     /* Configure the MAC to match the new settings */
4394     linkmode_copy(pl->link_config.advertising, our_kset.link_modes.advertising);
4395     + pl->link_config.interface = config.interface;
4396     pl->link_config.speed = our_kset.base.speed;
4397     pl->link_config.duplex = our_kset.base.duplex;
4398     pl->link_config.an_enabled = our_kset.base.autoneg != AUTONEG_DISABLE;
4399     diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c
4400     index 81394a4b2803..2092febfcb42 100644
4401     --- a/drivers/net/usb/qmi_wwan.c
4402     +++ b/drivers/net/usb/qmi_wwan.c
4403     @@ -1204,6 +1204,7 @@ static const struct usb_device_id products[] = {
4404     {QMI_FIXED_INTF(0x1199, 0x9079, 10)}, /* Sierra Wireless EM74xx */
4405     {QMI_FIXED_INTF(0x1199, 0x907b, 8)}, /* Sierra Wireless EM74xx */
4406     {QMI_FIXED_INTF(0x1199, 0x907b, 10)}, /* Sierra Wireless EM74xx */
4407     + {QMI_FIXED_INTF(0x1199, 0x9091, 8)}, /* Sierra Wireless EM7565 */
4408     {QMI_FIXED_INTF(0x1bbb, 0x011e, 4)}, /* Telekom Speedstick LTE II (Alcatel One Touch L100V LTE) */
4409     {QMI_FIXED_INTF(0x1bbb, 0x0203, 2)}, /* Alcatel L800MA */
4410     {QMI_FIXED_INTF(0x2357, 0x0201, 4)}, /* TP-LINK HSUPA Modem MA180 */
4411     diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
4412     index a2f4e52fadb5..9e9202b50e73 100644
4413     --- a/drivers/net/vxlan.c
4414     +++ b/drivers/net/vxlan.c
4415     @@ -3105,6 +3105,11 @@ static void vxlan_config_apply(struct net_device *dev,
4416    
4417     max_mtu = lowerdev->mtu - (use_ipv6 ? VXLAN6_HEADROOM :
4418     VXLAN_HEADROOM);
4419     + if (max_mtu < ETH_MIN_MTU)
4420     + max_mtu = ETH_MIN_MTU;
4421     +
4422     + if (!changelink && !conf->mtu)
4423     + dev->mtu = max_mtu;
4424     }
4425    
4426     if (dev->mtu > max_mtu)
4427     diff --git a/drivers/phy/tegra/xusb.c b/drivers/phy/tegra/xusb.c
4428     index 4307bf0013e1..63e916d4d069 100644
4429     --- a/drivers/phy/tegra/xusb.c
4430     +++ b/drivers/phy/tegra/xusb.c
4431     @@ -75,14 +75,14 @@ MODULE_DEVICE_TABLE(of, tegra_xusb_padctl_of_match);
4432     static struct device_node *
4433     tegra_xusb_find_pad_node(struct tegra_xusb_padctl *padctl, const char *name)
4434     {
4435     - /*
4436     - * of_find_node_by_name() drops a reference, so make sure to grab one.
4437     - */
4438     - struct device_node *np = of_node_get(padctl->dev->of_node);
4439     + struct device_node *pads, *np;
4440     +
4441     + pads = of_get_child_by_name(padctl->dev->of_node, "pads");
4442     + if (!pads)
4443     + return NULL;
4444    
4445     - np = of_find_node_by_name(np, "pads");
4446     - if (np)
4447     - np = of_find_node_by_name(np, name);
4448     + np = of_get_child_by_name(pads, name);
4449     + of_node_put(pads);
4450    
4451     return np;
4452     }
4453     @@ -90,16 +90,16 @@ tegra_xusb_find_pad_node(struct tegra_xusb_padctl *padctl, const char *name)
4454     static struct device_node *
4455     tegra_xusb_pad_find_phy_node(struct tegra_xusb_pad *pad, unsigned int index)
4456     {
4457     - /*
4458     - * of_find_node_by_name() drops a reference, so make sure to grab one.
4459     - */
4460     - struct device_node *np = of_node_get(pad->dev.of_node);
4461     + struct device_node *np, *lanes;
4462    
4463     - np = of_find_node_by_name(np, "lanes");
4464     - if (!np)
4465     + lanes = of_get_child_by_name(pad->dev.of_node, "lanes");
4466     + if (!lanes)
4467     return NULL;
4468    
4469     - return of_find_node_by_name(np, pad->soc->lanes[index].name);
4470     + np = of_get_child_by_name(lanes, pad->soc->lanes[index].name);
4471     + of_node_put(lanes);
4472     +
4473     + return np;
4474     }
4475    
4476     static int
4477     @@ -195,7 +195,7 @@ int tegra_xusb_pad_register(struct tegra_xusb_pad *pad,
4478     unsigned int i;
4479     int err;
4480    
4481     - children = of_find_node_by_name(pad->dev.of_node, "lanes");
4482     + children = of_get_child_by_name(pad->dev.of_node, "lanes");
4483     if (!children)
4484     return -ENODEV;
4485    
4486     @@ -444,21 +444,21 @@ static struct device_node *
4487     tegra_xusb_find_port_node(struct tegra_xusb_padctl *padctl, const char *type,
4488     unsigned int index)
4489     {
4490     - /*
4491     - * of_find_node_by_name() drops a reference, so make sure to grab one.
4492     - */
4493     - struct device_node *np = of_node_get(padctl->dev->of_node);
4494     + struct device_node *ports, *np;
4495     + char *name;
4496    
4497     - np = of_find_node_by_name(np, "ports");
4498     - if (np) {
4499     - char *name;
4500     + ports = of_get_child_by_name(padctl->dev->of_node, "ports");
4501     + if (!ports)
4502     + return NULL;
4503    
4504     - name = kasprintf(GFP_KERNEL, "%s-%u", type, index);
4505     - if (!name)
4506     - return ERR_PTR(-ENOMEM);
4507     - np = of_find_node_by_name(np, name);
4508     - kfree(name);
4509     + name = kasprintf(GFP_KERNEL, "%s-%u", type, index);
4510     + if (!name) {
4511     + of_node_put(ports);
4512     + return ERR_PTR(-ENOMEM);
4513     }
4514     + np = of_get_child_by_name(ports, name);
4515     + kfree(name);
4516     + of_node_put(ports);
4517    
4518     return np;
4519     }
4520     @@ -847,7 +847,7 @@ static void tegra_xusb_remove_ports(struct tegra_xusb_padctl *padctl)
4521    
4522     static int tegra_xusb_padctl_probe(struct platform_device *pdev)
4523     {
4524     - struct device_node *np = of_node_get(pdev->dev.of_node);
4525     + struct device_node *np = pdev->dev.of_node;
4526     const struct tegra_xusb_padctl_soc *soc;
4527     struct tegra_xusb_padctl *padctl;
4528     const struct of_device_id *match;
4529     @@ -855,7 +855,7 @@ static int tegra_xusb_padctl_probe(struct platform_device *pdev)
4530     int err;
4531    
4532     /* for backwards compatibility with old device trees */
4533     - np = of_find_node_by_name(np, "pads");
4534     + np = of_get_child_by_name(np, "pads");
4535     if (!np) {
4536     dev_warn(&pdev->dev, "deprecated DT, using legacy driver\n");
4537     return tegra_xusb_padctl_legacy_probe(pdev);
4538     diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h
4539     index 5340efc673a9..92dd4aef21a3 100644
4540     --- a/drivers/s390/net/qeth_core.h
4541     +++ b/drivers/s390/net/qeth_core.h
4542     @@ -564,9 +564,9 @@ enum qeth_cq {
4543     };
4544    
4545     struct qeth_ipato {
4546     - int enabled;
4547     - int invert4;
4548     - int invert6;
4549     + bool enabled;
4550     + bool invert4;
4551     + bool invert6;
4552     struct list_head entries;
4553     };
4554    
4555     diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c
4556     index 330e5d3dadf3..7c7a244b6684 100644
4557     --- a/drivers/s390/net/qeth_core_main.c
4558     +++ b/drivers/s390/net/qeth_core_main.c
4559     @@ -1479,9 +1479,9 @@ static int qeth_setup_card(struct qeth_card *card)
4560     qeth_set_intial_options(card);
4561     /* IP address takeover */
4562     INIT_LIST_HEAD(&card->ipato.entries);
4563     - card->ipato.enabled = 0;
4564     - card->ipato.invert4 = 0;
4565     - card->ipato.invert6 = 0;
4566     + card->ipato.enabled = false;
4567     + card->ipato.invert4 = false;
4568     + card->ipato.invert6 = false;
4569     /* init QDIO stuff */
4570     qeth_init_qdio_info(card);
4571     INIT_DELAYED_WORK(&card->buffer_reclaim_work, qeth_buffer_reclaim_work);
4572     @@ -5445,6 +5445,13 @@ int qeth_poll(struct napi_struct *napi, int budget)
4573     }
4574     EXPORT_SYMBOL_GPL(qeth_poll);
4575    
4576     +static int qeth_setassparms_inspect_rc(struct qeth_ipa_cmd *cmd)
4577     +{
4578     + if (!cmd->hdr.return_code)
4579     + cmd->hdr.return_code = cmd->data.setassparms.hdr.return_code;
4580     + return cmd->hdr.return_code;
4581     +}
4582     +
4583     int qeth_setassparms_cb(struct qeth_card *card,
4584     struct qeth_reply *reply, unsigned long data)
4585     {
4586     @@ -6304,7 +6311,7 @@ static int qeth_ipa_checksum_run_cmd_cb(struct qeth_card *card,
4587     (struct qeth_checksum_cmd *)reply->param;
4588    
4589     QETH_CARD_TEXT(card, 4, "chkdoccb");
4590     - if (cmd->hdr.return_code)
4591     + if (qeth_setassparms_inspect_rc(cmd))
4592     return 0;
4593    
4594     memset(chksum_cb, 0, sizeof(*chksum_cb));
4595     diff --git a/drivers/s390/net/qeth_l3.h b/drivers/s390/net/qeth_l3.h
4596     index 194ae9b577cc..e5833837b799 100644
4597     --- a/drivers/s390/net/qeth_l3.h
4598     +++ b/drivers/s390/net/qeth_l3.h
4599     @@ -82,7 +82,7 @@ void qeth_l3_del_vipa(struct qeth_card *, enum qeth_prot_versions, const u8 *);
4600     int qeth_l3_add_rxip(struct qeth_card *, enum qeth_prot_versions, const u8 *);
4601     void qeth_l3_del_rxip(struct qeth_card *card, enum qeth_prot_versions,
4602     const u8 *);
4603     -int qeth_l3_is_addr_covered_by_ipato(struct qeth_card *, struct qeth_ipaddr *);
4604     +void qeth_l3_update_ipato(struct qeth_card *card);
4605     struct qeth_ipaddr *qeth_l3_get_addr_buffer(enum qeth_prot_versions);
4606     int qeth_l3_add_ip(struct qeth_card *, struct qeth_ipaddr *);
4607     int qeth_l3_delete_ip(struct qeth_card *, struct qeth_ipaddr *);
4608     diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c
4609     index 27185ab38136..36dee176f8e2 100644
4610     --- a/drivers/s390/net/qeth_l3_main.c
4611     +++ b/drivers/s390/net/qeth_l3_main.c
4612     @@ -163,8 +163,8 @@ static void qeth_l3_convert_addr_to_bits(u8 *addr, u8 *bits, int len)
4613     }
4614     }
4615    
4616     -int qeth_l3_is_addr_covered_by_ipato(struct qeth_card *card,
4617     - struct qeth_ipaddr *addr)
4618     +static bool qeth_l3_is_addr_covered_by_ipato(struct qeth_card *card,
4619     + struct qeth_ipaddr *addr)
4620     {
4621     struct qeth_ipato_entry *ipatoe;
4622     u8 addr_bits[128] = {0, };
4623     @@ -173,6 +173,8 @@ int qeth_l3_is_addr_covered_by_ipato(struct qeth_card *card,
4624    
4625     if (!card->ipato.enabled)
4626     return 0;
4627     + if (addr->type != QETH_IP_TYPE_NORMAL)
4628     + return 0;
4629    
4630     qeth_l3_convert_addr_to_bits((u8 *) &addr->u, addr_bits,
4631     (addr->proto == QETH_PROT_IPV4)? 4:16);
4632     @@ -289,8 +291,7 @@ int qeth_l3_add_ip(struct qeth_card *card, struct qeth_ipaddr *tmp_addr)
4633     memcpy(addr, tmp_addr, sizeof(struct qeth_ipaddr));
4634     addr->ref_counter = 1;
4635    
4636     - if (addr->type == QETH_IP_TYPE_NORMAL &&
4637     - qeth_l3_is_addr_covered_by_ipato(card, addr)) {
4638     + if (qeth_l3_is_addr_covered_by_ipato(card, addr)) {
4639     QETH_CARD_TEXT(card, 2, "tkovaddr");
4640     addr->set_flags |= QETH_IPA_SETIP_TAKEOVER_FLAG;
4641     }
4642     @@ -604,6 +605,27 @@ int qeth_l3_setrouting_v6(struct qeth_card *card)
4643     /*
4644     * IP address takeover related functions
4645     */
4646     +
4647     +/**
4648     + * qeth_l3_update_ipato() - Update 'takeover' property, for all NORMAL IPs.
4649     + *
4650     + * Caller must hold ip_lock.
4651     + */
4652     +void qeth_l3_update_ipato(struct qeth_card *card)
4653     +{
4654     + struct qeth_ipaddr *addr;
4655     + unsigned int i;
4656     +
4657     + hash_for_each(card->ip_htable, i, addr, hnode) {
4658     + if (addr->type != QETH_IP_TYPE_NORMAL)
4659     + continue;
4660     + if (qeth_l3_is_addr_covered_by_ipato(card, addr))
4661     + addr->set_flags |= QETH_IPA_SETIP_TAKEOVER_FLAG;
4662     + else
4663     + addr->set_flags &= ~QETH_IPA_SETIP_TAKEOVER_FLAG;
4664     + }
4665     +}
4666     +
4667     static void qeth_l3_clear_ipato_list(struct qeth_card *card)
4668     {
4669     struct qeth_ipato_entry *ipatoe, *tmp;
4670     @@ -615,6 +637,7 @@ static void qeth_l3_clear_ipato_list(struct qeth_card *card)
4671     kfree(ipatoe);
4672     }
4673    
4674     + qeth_l3_update_ipato(card);
4675     spin_unlock_bh(&card->ip_lock);
4676     }
4677    
4678     @@ -639,8 +662,10 @@ int qeth_l3_add_ipato_entry(struct qeth_card *card,
4679     }
4680     }
4681    
4682     - if (!rc)
4683     + if (!rc) {
4684     list_add_tail(&new->entry, &card->ipato.entries);
4685     + qeth_l3_update_ipato(card);
4686     + }
4687    
4688     spin_unlock_bh(&card->ip_lock);
4689    
4690     @@ -663,6 +688,7 @@ void qeth_l3_del_ipato_entry(struct qeth_card *card,
4691     (proto == QETH_PROT_IPV4)? 4:16) &&
4692     (ipatoe->mask_bits == mask_bits)) {
4693     list_del(&ipatoe->entry);
4694     + qeth_l3_update_ipato(card);
4695     kfree(ipatoe);
4696     }
4697     }
4698     diff --git a/drivers/s390/net/qeth_l3_sys.c b/drivers/s390/net/qeth_l3_sys.c
4699     index 7a829ad77783..1295dd8ec849 100644
4700     --- a/drivers/s390/net/qeth_l3_sys.c
4701     +++ b/drivers/s390/net/qeth_l3_sys.c
4702     @@ -370,8 +370,8 @@ static ssize_t qeth_l3_dev_ipato_enable_store(struct device *dev,
4703     struct device_attribute *attr, const char *buf, size_t count)
4704     {
4705     struct qeth_card *card = dev_get_drvdata(dev);
4706     - struct qeth_ipaddr *addr;
4707     - int i, rc = 0;
4708     + bool enable;
4709     + int rc = 0;
4710    
4711     if (!card)
4712     return -EINVAL;
4713     @@ -384,25 +384,18 @@ static ssize_t qeth_l3_dev_ipato_enable_store(struct device *dev,
4714     }
4715    
4716     if (sysfs_streq(buf, "toggle")) {
4717     - card->ipato.enabled = (card->ipato.enabled)? 0 : 1;
4718     - } else if (sysfs_streq(buf, "1")) {
4719     - card->ipato.enabled = 1;
4720     - hash_for_each(card->ip_htable, i, addr, hnode) {
4721     - if ((addr->type == QETH_IP_TYPE_NORMAL) &&
4722     - qeth_l3_is_addr_covered_by_ipato(card, addr))
4723     - addr->set_flags |=
4724     - QETH_IPA_SETIP_TAKEOVER_FLAG;
4725     - }
4726     - } else if (sysfs_streq(buf, "0")) {
4727     - card->ipato.enabled = 0;
4728     - hash_for_each(card->ip_htable, i, addr, hnode) {
4729     - if (addr->set_flags &
4730     - QETH_IPA_SETIP_TAKEOVER_FLAG)
4731     - addr->set_flags &=
4732     - ~QETH_IPA_SETIP_TAKEOVER_FLAG;
4733     - }
4734     - } else
4735     + enable = !card->ipato.enabled;
4736     + } else if (kstrtobool(buf, &enable)) {
4737     rc = -EINVAL;
4738     + goto out;
4739     + }
4740     +
4741     + if (card->ipato.enabled != enable) {
4742     + card->ipato.enabled = enable;
4743     + spin_lock_bh(&card->ip_lock);
4744     + qeth_l3_update_ipato(card);
4745     + spin_unlock_bh(&card->ip_lock);
4746     + }
4747     out:
4748     mutex_unlock(&card->conf_mutex);
4749     return rc ? rc : count;
4750     @@ -428,20 +421,27 @@ static ssize_t qeth_l3_dev_ipato_invert4_store(struct device *dev,
4751     const char *buf, size_t count)
4752     {
4753     struct qeth_card *card = dev_get_drvdata(dev);
4754     + bool invert;
4755     int rc = 0;
4756    
4757     if (!card)
4758     return -EINVAL;
4759    
4760     mutex_lock(&card->conf_mutex);
4761     - if (sysfs_streq(buf, "toggle"))
4762     - card->ipato.invert4 = (card->ipato.invert4)? 0 : 1;
4763     - else if (sysfs_streq(buf, "1"))
4764     - card->ipato.invert4 = 1;
4765     - else if (sysfs_streq(buf, "0"))
4766     - card->ipato.invert4 = 0;
4767     - else
4768     + if (sysfs_streq(buf, "toggle")) {
4769     + invert = !card->ipato.invert4;
4770     + } else if (kstrtobool(buf, &invert)) {
4771     rc = -EINVAL;
4772     + goto out;
4773     + }
4774     +
4775     + if (card->ipato.invert4 != invert) {
4776     + card->ipato.invert4 = invert;
4777     + spin_lock_bh(&card->ip_lock);
4778     + qeth_l3_update_ipato(card);
4779     + spin_unlock_bh(&card->ip_lock);
4780     + }
4781     +out:
4782     mutex_unlock(&card->conf_mutex);
4783     return rc ? rc : count;
4784     }
4785     @@ -607,20 +607,27 @@ static ssize_t qeth_l3_dev_ipato_invert6_store(struct device *dev,
4786     struct device_attribute *attr, const char *buf, size_t count)
4787     {
4788     struct qeth_card *card = dev_get_drvdata(dev);
4789     + bool invert;
4790     int rc = 0;
4791    
4792     if (!card)
4793     return -EINVAL;
4794    
4795     mutex_lock(&card->conf_mutex);
4796     - if (sysfs_streq(buf, "toggle"))
4797     - card->ipato.invert6 = (card->ipato.invert6)? 0 : 1;
4798     - else if (sysfs_streq(buf, "1"))
4799     - card->ipato.invert6 = 1;
4800     - else if (sysfs_streq(buf, "0"))
4801     - card->ipato.invert6 = 0;
4802     - else
4803     + if (sysfs_streq(buf, "toggle")) {
4804     + invert = !card->ipato.invert6;
4805     + } else if (kstrtobool(buf, &invert)) {
4806     rc = -EINVAL;
4807     + goto out;
4808     + }
4809     +
4810     + if (card->ipato.invert6 != invert) {
4811     + card->ipato.invert6 = invert;
4812     + spin_lock_bh(&card->ip_lock);
4813     + qeth_l3_update_ipato(card);
4814     + spin_unlock_bh(&card->ip_lock);
4815     + }
4816     +out:
4817     mutex_unlock(&card->conf_mutex);
4818     return rc ? rc : count;
4819     }
4820     diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c
4821     index a4f28b7e4c65..e18877177f1b 100644
4822     --- a/drivers/scsi/osd/osd_initiator.c
4823     +++ b/drivers/scsi/osd/osd_initiator.c
4824     @@ -1576,7 +1576,9 @@ static struct request *_make_request(struct request_queue *q, bool has_write,
4825     return req;
4826    
4827     for_each_bio(bio) {
4828     - ret = blk_rq_append_bio(req, bio);
4829     + struct bio *bounce_bio = bio;
4830     +
4831     + ret = blk_rq_append_bio(req, &bounce_bio);
4832     if (ret)
4833     return ERR_PTR(ret);
4834     }
4835     diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c
4836     index 93e2c90fa77d..83dc3292e9ab 100644
4837     --- a/drivers/staging/android/ion/ion.c
4838     +++ b/drivers/staging/android/ion/ion.c
4839     @@ -348,7 +348,7 @@ static int ion_dma_buf_begin_cpu_access(struct dma_buf *dmabuf,
4840     mutex_lock(&buffer->lock);
4841     list_for_each_entry(a, &buffer->attachments, list) {
4842     dma_sync_sg_for_cpu(a->dev, a->table->sgl, a->table->nents,
4843     - DMA_BIDIRECTIONAL);
4844     + direction);
4845     }
4846     mutex_unlock(&buffer->lock);
4847    
4848     @@ -370,7 +370,7 @@ static int ion_dma_buf_end_cpu_access(struct dma_buf *dmabuf,
4849     mutex_lock(&buffer->lock);
4850     list_for_each_entry(a, &buffer->attachments, list) {
4851     dma_sync_sg_for_device(a->dev, a->table->sgl, a->table->nents,
4852     - DMA_BIDIRECTIONAL);
4853     + direction);
4854     }
4855     mutex_unlock(&buffer->lock);
4856    
4857     diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c
4858     index 7c69b4a9694d..0d99b242e82e 100644
4859     --- a/drivers/target/target_core_pscsi.c
4860     +++ b/drivers/target/target_core_pscsi.c
4861     @@ -920,7 +920,7 @@ pscsi_map_sg(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
4862     " %d i: %d bio: %p, allocating another"
4863     " bio\n", bio->bi_vcnt, i, bio);
4864    
4865     - rc = blk_rq_append_bio(req, bio);
4866     + rc = blk_rq_append_bio(req, &bio);
4867     if (rc) {
4868     pr_err("pSCSI: failed to append bio\n");
4869     goto fail;
4870     @@ -938,7 +938,7 @@ pscsi_map_sg(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
4871     }
4872    
4873     if (bio) {
4874     - rc = blk_rq_append_bio(req, bio);
4875     + rc = blk_rq_append_bio(req, &bio);
4876     if (rc) {
4877     pr_err("pSCSI: failed to append bio\n");
4878     goto fail;
4879     diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c
4880     index bdf0e6e89991..faf50df81622 100644
4881     --- a/drivers/tty/n_tty.c
4882     +++ b/drivers/tty/n_tty.c
4883     @@ -1764,7 +1764,7 @@ static void n_tty_set_termios(struct tty_struct *tty, struct ktermios *old)
4884     {
4885     struct n_tty_data *ldata = tty->disc_data;
4886    
4887     - if (!old || (old->c_lflag ^ tty->termios.c_lflag) & ICANON) {
4888     + if (!old || (old->c_lflag ^ tty->termios.c_lflag) & (ICANON | EXTPROC)) {
4889     bitmap_zero(ldata->read_flags, N_TTY_BUF_SIZE);
4890     ldata->line_start = ldata->read_tail;
4891     if (!L_ICANON(tty) || !read_cnt(ldata)) {
4892     @@ -2427,7 +2427,7 @@ static int n_tty_ioctl(struct tty_struct *tty, struct file *file,
4893     return put_user(tty_chars_in_buffer(tty), (int __user *) arg);
4894     case TIOCINQ:
4895     down_write(&tty->termios_rwsem);
4896     - if (L_ICANON(tty))
4897     + if (L_ICANON(tty) && !L_EXTPROC(tty))
4898     retval = inq_canon(ldata);
4899     else
4900     retval = read_cnt(ldata);
4901     diff --git a/drivers/tty/tty_buffer.c b/drivers/tty/tty_buffer.c
4902     index f8eba1c5412f..677fa99b7747 100644
4903     --- a/drivers/tty/tty_buffer.c
4904     +++ b/drivers/tty/tty_buffer.c
4905     @@ -446,7 +446,7 @@ EXPORT_SYMBOL_GPL(tty_prepare_flip_string);
4906     * Callers other than flush_to_ldisc() need to exclude the kworker
4907     * from concurrent use of the line discipline, see paste_selection().
4908     *
4909     - * Returns the number of bytes not processed
4910     + * Returns the number of bytes processed
4911     */
4912     int tty_ldisc_receive_buf(struct tty_ldisc *ld, const unsigned char *p,
4913     char *f, int count)
4914     diff --git a/drivers/usb/chipidea/ci_hdrc_msm.c b/drivers/usb/chipidea/ci_hdrc_msm.c
4915     index bb626120296f..53f3bf459dd1 100644
4916     --- a/drivers/usb/chipidea/ci_hdrc_msm.c
4917     +++ b/drivers/usb/chipidea/ci_hdrc_msm.c
4918     @@ -251,7 +251,7 @@ static int ci_hdrc_msm_probe(struct platform_device *pdev)
4919     if (ret)
4920     goto err_mux;
4921    
4922     - ulpi_node = of_find_node_by_name(of_node_get(pdev->dev.of_node), "ulpi");
4923     + ulpi_node = of_get_child_by_name(pdev->dev.of_node, "ulpi");
4924     if (ulpi_node) {
4925     phy_node = of_get_next_available_child(ulpi_node, NULL);
4926     ci->hsic = of_device_is_compatible(phy_node, "qcom,usb-hsic-phy");
4927     diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c
4928     index 843ef46d2537..9e3355b97396 100644
4929     --- a/drivers/usb/core/config.c
4930     +++ b/drivers/usb/core/config.c
4931     @@ -1007,7 +1007,7 @@ int usb_get_bos_descriptor(struct usb_device *dev)
4932     case USB_SSP_CAP_TYPE:
4933     ssp_cap = (struct usb_ssp_cap_descriptor *)buffer;
4934     ssac = (le32_to_cpu(ssp_cap->bmAttributes) &
4935     - USB_SSP_SUBLINK_SPEED_ATTRIBS) + 1;
4936     + USB_SSP_SUBLINK_SPEED_ATTRIBS);
4937     if (length >= USB_DT_USB_SSP_CAP_SIZE(ssac))
4938     dev->bos->ssp_cap = ssp_cap;
4939     break;
4940     diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c
4941     index 50010282c010..c05c4f877750 100644
4942     --- a/drivers/usb/core/quirks.c
4943     +++ b/drivers/usb/core/quirks.c
4944     @@ -57,10 +57,11 @@ static const struct usb_device_id usb_quirk_list[] = {
4945     /* Microsoft LifeCam-VX700 v2.0 */
4946     { USB_DEVICE(0x045e, 0x0770), .driver_info = USB_QUIRK_RESET_RESUME },
4947    
4948     - /* Logitech HD Pro Webcams C920, C920-C and C930e */
4949     + /* Logitech HD Pro Webcams C920, C920-C, C925e and C930e */
4950     { USB_DEVICE(0x046d, 0x082d), .driver_info = USB_QUIRK_DELAY_INIT },
4951     { USB_DEVICE(0x046d, 0x0841), .driver_info = USB_QUIRK_DELAY_INIT },
4952     { USB_DEVICE(0x046d, 0x0843), .driver_info = USB_QUIRK_DELAY_INIT },
4953     + { USB_DEVICE(0x046d, 0x085b), .driver_info = USB_QUIRK_DELAY_INIT },
4954    
4955     /* Logitech ConferenceCam CC3000e */
4956     { USB_DEVICE(0x046d, 0x0847), .driver_info = USB_QUIRK_DELAY_INIT },
4957     @@ -154,6 +155,9 @@ static const struct usb_device_id usb_quirk_list[] = {
4958     /* Genesys Logic hub, internally used by KY-688 USB 3.1 Type-C Hub */
4959     { USB_DEVICE(0x05e3, 0x0612), .driver_info = USB_QUIRK_NO_LPM },
4960    
4961     + /* ELSA MicroLink 56K */
4962     + { USB_DEVICE(0x05cc, 0x2267), .driver_info = USB_QUIRK_RESET_RESUME },
4963     +
4964     /* Genesys Logic hub, internally used by Moshi USB to Ethernet Adapter */
4965     { USB_DEVICE(0x05e3, 0x0616), .driver_info = USB_QUIRK_NO_LPM },
4966    
4967     diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c
4968     index 76f392954733..abb8f19ae40f 100644
4969     --- a/drivers/usb/host/xhci-pci.c
4970     +++ b/drivers/usb/host/xhci-pci.c
4971     @@ -189,6 +189,9 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci)
4972     xhci->quirks |= XHCI_TRUST_TX_LENGTH;
4973     xhci->quirks |= XHCI_BROKEN_STREAMS;
4974     }
4975     + if (pdev->vendor == PCI_VENDOR_ID_RENESAS &&
4976     + pdev->device == 0x0014)
4977     + xhci->quirks |= XHCI_TRUST_TX_LENGTH;
4978     if (pdev->vendor == PCI_VENDOR_ID_RENESAS &&
4979     pdev->device == 0x0015)
4980     xhci->quirks |= XHCI_RESET_ON_RESUME;
4981     diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c
4982     index 49d1b2d4606d..d038e543c246 100644
4983     --- a/drivers/usb/serial/ftdi_sio.c
4984     +++ b/drivers/usb/serial/ftdi_sio.c
4985     @@ -1017,6 +1017,7 @@ static const struct usb_device_id id_table_combined[] = {
4986     .driver_info = (kernel_ulong_t)&ftdi_jtag_quirk },
4987     { USB_DEVICE(CYPRESS_VID, CYPRESS_WICED_BT_USB_PID) },
4988     { USB_DEVICE(CYPRESS_VID, CYPRESS_WICED_WL_USB_PID) },
4989     + { USB_DEVICE(AIRBUS_DS_VID, AIRBUS_DS_P8GR) },
4990     { } /* Terminating entry */
4991     };
4992    
4993     diff --git a/drivers/usb/serial/ftdi_sio_ids.h b/drivers/usb/serial/ftdi_sio_ids.h
4994     index 4faa09fe308c..8b4ecd2bd297 100644
4995     --- a/drivers/usb/serial/ftdi_sio_ids.h
4996     +++ b/drivers/usb/serial/ftdi_sio_ids.h
4997     @@ -914,6 +914,12 @@
4998     #define ICPDAS_I7561U_PID 0x0104
4999     #define ICPDAS_I7563U_PID 0x0105
5000    
5001     +/*
5002     + * Airbus Defence and Space
5003     + */
5004     +#define AIRBUS_DS_VID 0x1e8e /* Vendor ID */
5005     +#define AIRBUS_DS_P8GR 0x6001 /* Tetra P8GR */
5006     +
5007     /*
5008     * RT Systems programming cables for various ham radios
5009     */
5010     diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c
5011     index 54e316b1892d..a9400458ccea 100644
5012     --- a/drivers/usb/serial/option.c
5013     +++ b/drivers/usb/serial/option.c
5014     @@ -236,6 +236,8 @@ static void option_instat_callback(struct urb *urb);
5015     /* These Quectel products use Qualcomm's vendor ID */
5016     #define QUECTEL_PRODUCT_UC20 0x9003
5017     #define QUECTEL_PRODUCT_UC15 0x9090
5018     +/* These Yuga products use Qualcomm's vendor ID */
5019     +#define YUGA_PRODUCT_CLM920_NC5 0x9625
5020    
5021     #define QUECTEL_VENDOR_ID 0x2c7c
5022     /* These Quectel products use Quectel's vendor ID */
5023     @@ -283,6 +285,7 @@ static void option_instat_callback(struct urb *urb);
5024     #define TELIT_PRODUCT_LE922_USBCFG3 0x1043
5025     #define TELIT_PRODUCT_LE922_USBCFG5 0x1045
5026     #define TELIT_PRODUCT_ME910 0x1100
5027     +#define TELIT_PRODUCT_ME910_DUAL_MODEM 0x1101
5028     #define TELIT_PRODUCT_LE920 0x1200
5029     #define TELIT_PRODUCT_LE910 0x1201
5030     #define TELIT_PRODUCT_LE910_USBCFG4 0x1206
5031     @@ -648,6 +651,11 @@ static const struct option_blacklist_info telit_me910_blacklist = {
5032     .reserved = BIT(1) | BIT(3),
5033     };
5034    
5035     +static const struct option_blacklist_info telit_me910_dual_modem_blacklist = {
5036     + .sendsetup = BIT(0),
5037     + .reserved = BIT(3),
5038     +};
5039     +
5040     static const struct option_blacklist_info telit_le910_blacklist = {
5041     .sendsetup = BIT(0),
5042     .reserved = BIT(1) | BIT(2),
5043     @@ -677,6 +685,10 @@ static const struct option_blacklist_info cinterion_rmnet2_blacklist = {
5044     .reserved = BIT(4) | BIT(5),
5045     };
5046    
5047     +static const struct option_blacklist_info yuga_clm920_nc5_blacklist = {
5048     + .reserved = BIT(1) | BIT(4),
5049     +};
5050     +
5051     static const struct usb_device_id option_ids[] = {
5052     { USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_COLT) },
5053     { USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_RICOLA) },
5054     @@ -1181,6 +1193,9 @@ static const struct usb_device_id option_ids[] = {
5055     { USB_DEVICE(QUALCOMM_VENDOR_ID, QUECTEL_PRODUCT_UC15)},
5056     { USB_DEVICE(QUALCOMM_VENDOR_ID, QUECTEL_PRODUCT_UC20),
5057     .driver_info = (kernel_ulong_t)&net_intf4_blacklist },
5058     + /* Yuga products use Qualcomm vendor ID */
5059     + { USB_DEVICE(QUALCOMM_VENDOR_ID, YUGA_PRODUCT_CLM920_NC5),
5060     + .driver_info = (kernel_ulong_t)&yuga_clm920_nc5_blacklist },
5061     /* Quectel products using Quectel vendor ID */
5062     { USB_DEVICE(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EC21),
5063     .driver_info = (kernel_ulong_t)&net_intf4_blacklist },
5064     @@ -1247,6 +1262,8 @@ static const struct usb_device_id option_ids[] = {
5065     .driver_info = (kernel_ulong_t)&telit_le922_blacklist_usbcfg0 },
5066     { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_ME910),
5067     .driver_info = (kernel_ulong_t)&telit_me910_blacklist },
5068     + { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_ME910_DUAL_MODEM),
5069     + .driver_info = (kernel_ulong_t)&telit_me910_dual_modem_blacklist },
5070     { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE910),
5071     .driver_info = (kernel_ulong_t)&telit_le910_blacklist },
5072     { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE910_USBCFG4),
5073     diff --git a/drivers/usb/serial/qcserial.c b/drivers/usb/serial/qcserial.c
5074     index 9f9d3a904464..55a8fb25ce2b 100644
5075     --- a/drivers/usb/serial/qcserial.c
5076     +++ b/drivers/usb/serial/qcserial.c
5077     @@ -166,6 +166,8 @@ static const struct usb_device_id id_table[] = {
5078     {DEVICE_SWI(0x1199, 0x9079)}, /* Sierra Wireless EM74xx */
5079     {DEVICE_SWI(0x1199, 0x907a)}, /* Sierra Wireless EM74xx QDL */
5080     {DEVICE_SWI(0x1199, 0x907b)}, /* Sierra Wireless EM74xx */
5081     + {DEVICE_SWI(0x1199, 0x9090)}, /* Sierra Wireless EM7565 QDL */
5082     + {DEVICE_SWI(0x1199, 0x9091)}, /* Sierra Wireless EM7565 */
5083     {DEVICE_SWI(0x413c, 0x81a2)}, /* Dell Wireless 5806 Gobi(TM) 4G LTE Mobile Broadband Card */
5084     {DEVICE_SWI(0x413c, 0x81a3)}, /* Dell Wireless 5570 HSPA+ (42Mbps) Mobile Broadband Card */
5085     {DEVICE_SWI(0x413c, 0x81a4)}, /* Dell Wireless 5570e HSPA+ (42Mbps) Mobile Broadband Card */
5086     @@ -346,6 +348,7 @@ static int qcprobe(struct usb_serial *serial, const struct usb_device_id *id)
5087     break;
5088     case 2:
5089     dev_dbg(dev, "NMEA GPS interface found\n");
5090     + sendsetup = true;
5091     break;
5092     case 3:
5093     dev_dbg(dev, "Modem port found\n");
5094     diff --git a/drivers/usb/usbip/stub_dev.c b/drivers/usb/usbip/stub_dev.c
5095     index c653ce533430..720408d39f11 100644
5096     --- a/drivers/usb/usbip/stub_dev.c
5097     +++ b/drivers/usb/usbip/stub_dev.c
5098     @@ -163,8 +163,7 @@ static void stub_shutdown_connection(struct usbip_device *ud)
5099     * step 1?
5100     */
5101     if (ud->tcp_socket) {
5102     - dev_dbg(&sdev->udev->dev, "shutdown tcp_socket %p\n",
5103     - ud->tcp_socket);
5104     + dev_dbg(&sdev->udev->dev, "shutdown sockfd %d\n", ud->sockfd);
5105     kernel_sock_shutdown(ud->tcp_socket, SHUT_RDWR);
5106     }
5107    
5108     diff --git a/drivers/usb/usbip/stub_main.c b/drivers/usb/usbip/stub_main.c
5109     index 7170404e8979..6968c906fa29 100644
5110     --- a/drivers/usb/usbip/stub_main.c
5111     +++ b/drivers/usb/usbip/stub_main.c
5112     @@ -251,11 +251,12 @@ void stub_device_cleanup_urbs(struct stub_device *sdev)
5113     struct stub_priv *priv;
5114     struct urb *urb;
5115    
5116     - dev_dbg(&sdev->udev->dev, "free sdev %p\n", sdev);
5117     + dev_dbg(&sdev->udev->dev, "Stub device cleaning up urbs\n");
5118    
5119     while ((priv = stub_priv_pop(sdev))) {
5120     urb = priv->urb;
5121     - dev_dbg(&sdev->udev->dev, "free urb %p\n", urb);
5122     + dev_dbg(&sdev->udev->dev, "free urb seqnum %lu\n",
5123     + priv->seqnum);
5124     usb_kill_urb(urb);
5125    
5126     kmem_cache_free(stub_priv_cache, priv);
5127     diff --git a/drivers/usb/usbip/stub_rx.c b/drivers/usb/usbip/stub_rx.c
5128     index 283a9be77a22..5b807185f79e 100644
5129     --- a/drivers/usb/usbip/stub_rx.c
5130     +++ b/drivers/usb/usbip/stub_rx.c
5131     @@ -225,9 +225,6 @@ static int stub_recv_cmd_unlink(struct stub_device *sdev,
5132     if (priv->seqnum != pdu->u.cmd_unlink.seqnum)
5133     continue;
5134    
5135     - dev_info(&priv->urb->dev->dev, "unlink urb %p\n",
5136     - priv->urb);
5137     -
5138     /*
5139     * This matched urb is not completed yet (i.e., be in
5140     * flight in usb hcd hardware/driver). Now we are
5141     @@ -266,8 +263,8 @@ static int stub_recv_cmd_unlink(struct stub_device *sdev,
5142     ret = usb_unlink_urb(priv->urb);
5143     if (ret != -EINPROGRESS)
5144     dev_err(&priv->urb->dev->dev,
5145     - "failed to unlink a urb %p, ret %d\n",
5146     - priv->urb, ret);
5147     + "failed to unlink a urb # %lu, ret %d\n",
5148     + priv->seqnum, ret);
5149    
5150     return 0;
5151     }
5152     diff --git a/drivers/usb/usbip/stub_tx.c b/drivers/usb/usbip/stub_tx.c
5153     index 87ff94be4235..96aa375b80d9 100644
5154     --- a/drivers/usb/usbip/stub_tx.c
5155     +++ b/drivers/usb/usbip/stub_tx.c
5156     @@ -102,7 +102,7 @@ void stub_complete(struct urb *urb)
5157     /* link a urb to the queue of tx. */
5158     spin_lock_irqsave(&sdev->priv_lock, flags);
5159     if (sdev->ud.tcp_socket == NULL) {
5160     - usbip_dbg_stub_tx("ignore urb for closed connection %p", urb);
5161     + usbip_dbg_stub_tx("ignore urb for closed connection\n");
5162     /* It will be freed in stub_device_cleanup_urbs(). */
5163     } else if (priv->unlinking) {
5164     stub_enqueue_ret_unlink(sdev, priv->seqnum, urb->status);
5165     @@ -204,8 +204,8 @@ static int stub_send_ret_submit(struct stub_device *sdev)
5166    
5167     /* 1. setup usbip_header */
5168     setup_ret_submit_pdu(&pdu_header, urb);
5169     - usbip_dbg_stub_tx("setup txdata seqnum: %d urb: %p\n",
5170     - pdu_header.base.seqnum, urb);
5171     + usbip_dbg_stub_tx("setup txdata seqnum: %d\n",
5172     + pdu_header.base.seqnum);
5173     usbip_header_correct_endian(&pdu_header, 1);
5174    
5175     iov[iovnum].iov_base = &pdu_header;
5176     diff --git a/drivers/usb/usbip/usbip_common.c b/drivers/usb/usbip/usbip_common.c
5177     index 2281f3562870..17b599b923f3 100644
5178     --- a/drivers/usb/usbip/usbip_common.c
5179     +++ b/drivers/usb/usbip/usbip_common.c
5180     @@ -331,26 +331,20 @@ int usbip_recv(struct socket *sock, void *buf, int size)
5181     struct msghdr msg = {.msg_flags = MSG_NOSIGNAL};
5182     int total = 0;
5183    
5184     + if (!sock || !buf || !size)
5185     + return -EINVAL;
5186     +
5187     iov_iter_kvec(&msg.msg_iter, READ|ITER_KVEC, &iov, 1, size);
5188    
5189     usbip_dbg_xmit("enter\n");
5190    
5191     - if (!sock || !buf || !size) {
5192     - pr_err("invalid arg, sock %p buff %p size %d\n", sock, buf,
5193     - size);
5194     - return -EINVAL;
5195     - }
5196     -
5197     do {
5198     - int sz = msg_data_left(&msg);
5199     + msg_data_left(&msg);
5200     sock->sk->sk_allocation = GFP_NOIO;
5201    
5202     result = sock_recvmsg(sock, &msg, MSG_WAITALL);
5203     - if (result <= 0) {
5204     - pr_debug("receive sock %p buf %p size %u ret %d total %d\n",
5205     - sock, buf + total, sz, result, total);
5206     + if (result <= 0)
5207     goto err;
5208     - }
5209    
5210     total += result;
5211     } while (msg_data_left(&msg));
5212     diff --git a/drivers/usb/usbip/vhci_hcd.c b/drivers/usb/usbip/vhci_hcd.c
5213     index 1f0cf81cc145..692cfdef667e 100644
5214     --- a/drivers/usb/usbip/vhci_hcd.c
5215     +++ b/drivers/usb/usbip/vhci_hcd.c
5216     @@ -670,9 +670,6 @@ static int vhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flag
5217     struct vhci_device *vdev;
5218     unsigned long flags;
5219    
5220     - usbip_dbg_vhci_hc("enter, usb_hcd %p urb %p mem_flags %d\n",
5221     - hcd, urb, mem_flags);
5222     -
5223     if (portnum > VHCI_HC_PORTS) {
5224     pr_err("invalid port number %d\n", portnum);
5225     return -ENODEV;
5226     @@ -836,8 +833,6 @@ static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status)
5227     struct vhci_device *vdev;
5228     unsigned long flags;
5229    
5230     - pr_info("dequeue a urb %p\n", urb);
5231     -
5232     spin_lock_irqsave(&vhci->lock, flags);
5233    
5234     priv = urb->hcpriv;
5235     @@ -865,7 +860,6 @@ static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status)
5236     /* tcp connection is closed */
5237     spin_lock(&vdev->priv_lock);
5238    
5239     - pr_info("device %p seems to be disconnected\n", vdev);
5240     list_del(&priv->list);
5241     kfree(priv);
5242     urb->hcpriv = NULL;
5243     @@ -877,8 +871,6 @@ static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status)
5244     * vhci_rx will receive RET_UNLINK and give back the URB.
5245     * Otherwise, we give back it here.
5246     */
5247     - pr_info("gives back urb %p\n", urb);
5248     -
5249     usb_hcd_unlink_urb_from_ep(hcd, urb);
5250    
5251     spin_unlock_irqrestore(&vhci->lock, flags);
5252     @@ -906,8 +898,6 @@ static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status)
5253    
5254     unlink->unlink_seqnum = priv->seqnum;
5255    
5256     - pr_info("device %p seems to be still connected\n", vdev);
5257     -
5258     /* send cmd_unlink and try to cancel the pending URB in the
5259     * peer */
5260     list_add_tail(&unlink->list, &vdev->unlink_tx);
5261     @@ -989,7 +979,7 @@ static void vhci_shutdown_connection(struct usbip_device *ud)
5262    
5263     /* need this? see stub_dev.c */
5264     if (ud->tcp_socket) {
5265     - pr_debug("shutdown tcp_socket %p\n", ud->tcp_socket);
5266     + pr_debug("shutdown tcp_socket %d\n", ud->sockfd);
5267     kernel_sock_shutdown(ud->tcp_socket, SHUT_RDWR);
5268     }
5269    
5270     diff --git a/drivers/usb/usbip/vhci_rx.c b/drivers/usb/usbip/vhci_rx.c
5271     index ef2f2d5ca6b2..1343037d00f9 100644
5272     --- a/drivers/usb/usbip/vhci_rx.c
5273     +++ b/drivers/usb/usbip/vhci_rx.c
5274     @@ -37,24 +37,23 @@ struct urb *pickup_urb_and_free_priv(struct vhci_device *vdev, __u32 seqnum)
5275     urb = priv->urb;
5276     status = urb->status;
5277    
5278     - usbip_dbg_vhci_rx("find urb %p vurb %p seqnum %u\n",
5279     - urb, priv, seqnum);
5280     + usbip_dbg_vhci_rx("find urb seqnum %u\n", seqnum);
5281    
5282     switch (status) {
5283     case -ENOENT:
5284     /* fall through */
5285     case -ECONNRESET:
5286     - dev_info(&urb->dev->dev,
5287     - "urb %p was unlinked %ssynchronuously.\n", urb,
5288     - status == -ENOENT ? "" : "a");
5289     + dev_dbg(&urb->dev->dev,
5290     + "urb seq# %u was unlinked %ssynchronuously\n",
5291     + seqnum, status == -ENOENT ? "" : "a");
5292     break;
5293     case -EINPROGRESS:
5294     /* no info output */
5295     break;
5296     default:
5297     - dev_info(&urb->dev->dev,
5298     - "urb %p may be in a error, status %d\n", urb,
5299     - status);
5300     + dev_dbg(&urb->dev->dev,
5301     + "urb seq# %u may be in a error, status %d\n",
5302     + seqnum, status);
5303     }
5304    
5305     list_del(&priv->list);
5306     @@ -81,8 +80,8 @@ static void vhci_recv_ret_submit(struct vhci_device *vdev,
5307     spin_unlock_irqrestore(&vdev->priv_lock, flags);
5308    
5309     if (!urb) {
5310     - pr_err("cannot find a urb of seqnum %u\n", pdu->base.seqnum);
5311     - pr_info("max seqnum %d\n",
5312     + pr_err("cannot find a urb of seqnum %u max seqnum %d\n",
5313     + pdu->base.seqnum,
5314     atomic_read(&vhci_hcd->seqnum));
5315     usbip_event_add(ud, VDEV_EVENT_ERROR_TCP);
5316     return;
5317     @@ -105,7 +104,7 @@ static void vhci_recv_ret_submit(struct vhci_device *vdev,
5318     if (usbip_dbg_flag_vhci_rx)
5319     usbip_dump_urb(urb);
5320    
5321     - usbip_dbg_vhci_rx("now giveback urb %p\n", urb);
5322     + usbip_dbg_vhci_rx("now giveback urb %u\n", pdu->base.seqnum);
5323    
5324     spin_lock_irqsave(&vhci->lock, flags);
5325     usb_hcd_unlink_urb_from_ep(vhci_hcd_to_hcd(vhci_hcd), urb);
5326     @@ -172,7 +171,7 @@ static void vhci_recv_ret_unlink(struct vhci_device *vdev,
5327     pr_info("the urb (seqnum %d) was already given back\n",
5328     pdu->base.seqnum);
5329     } else {
5330     - usbip_dbg_vhci_rx("now giveback urb %p\n", urb);
5331     + usbip_dbg_vhci_rx("now giveback urb %d\n", pdu->base.seqnum);
5332    
5333     /* If unlink is successful, status is -ECONNRESET */
5334     urb->status = pdu->u.ret_unlink.status;
5335     diff --git a/drivers/usb/usbip/vhci_tx.c b/drivers/usb/usbip/vhci_tx.c
5336     index 3e7878fe2fd4..a9a663a578b6 100644
5337     --- a/drivers/usb/usbip/vhci_tx.c
5338     +++ b/drivers/usb/usbip/vhci_tx.c
5339     @@ -83,7 +83,8 @@ static int vhci_send_cmd_submit(struct vhci_device *vdev)
5340     memset(&msg, 0, sizeof(msg));
5341     memset(&iov, 0, sizeof(iov));
5342    
5343     - usbip_dbg_vhci_tx("setup txdata urb %p\n", urb);
5344     + usbip_dbg_vhci_tx("setup txdata urb seqnum %lu\n",
5345     + priv->seqnum);
5346    
5347     /* 1. setup usbip_header */
5348     setup_cmd_submit_pdu(&pdu_header, urb);
5349     diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
5350     index fd47bd96b5d3..6362e3606aa5 100644
5351     --- a/include/linux/blkdev.h
5352     +++ b/include/linux/blkdev.h
5353     @@ -241,14 +241,24 @@ struct request {
5354     struct request *next_rq;
5355     };
5356    
5357     +static inline bool blk_op_is_scsi(unsigned int op)
5358     +{
5359     + return op == REQ_OP_SCSI_IN || op == REQ_OP_SCSI_OUT;
5360     +}
5361     +
5362     +static inline bool blk_op_is_private(unsigned int op)
5363     +{
5364     + return op == REQ_OP_DRV_IN || op == REQ_OP_DRV_OUT;
5365     +}
5366     +
5367     static inline bool blk_rq_is_scsi(struct request *rq)
5368     {
5369     - return req_op(rq) == REQ_OP_SCSI_IN || req_op(rq) == REQ_OP_SCSI_OUT;
5370     + return blk_op_is_scsi(req_op(rq));
5371     }
5372    
5373     static inline bool blk_rq_is_private(struct request *rq)
5374     {
5375     - return req_op(rq) == REQ_OP_DRV_IN || req_op(rq) == REQ_OP_DRV_OUT;
5376     + return blk_op_is_private(req_op(rq));
5377     }
5378    
5379     static inline bool blk_rq_is_passthrough(struct request *rq)
5380     @@ -256,6 +266,13 @@ static inline bool blk_rq_is_passthrough(struct request *rq)
5381     return blk_rq_is_scsi(rq) || blk_rq_is_private(rq);
5382     }
5383    
5384     +static inline bool bio_is_passthrough(struct bio *bio)
5385     +{
5386     + unsigned op = bio_op(bio);
5387     +
5388     + return blk_op_is_scsi(op) || blk_op_is_private(op);
5389     +}
5390     +
5391     static inline unsigned short req_get_ioprio(struct request *req)
5392     {
5393     return req->ioprio;
5394     @@ -952,7 +969,7 @@ extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
5395     extern void blk_rq_unprep_clone(struct request *rq);
5396     extern blk_status_t blk_insert_cloned_request(struct request_queue *q,
5397     struct request *rq);
5398     -extern int blk_rq_append_bio(struct request *rq, struct bio *bio);
5399     +extern int blk_rq_append_bio(struct request *rq, struct bio **bio);
5400     extern void blk_delay_queue(struct request_queue *, unsigned long);
5401     extern void blk_queue_split(struct request_queue *, struct bio **);
5402     extern void blk_recount_segments(struct request_queue *, struct bio *);
5403     diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
5404     index 2477a5cb5bd5..fb83dee528a1 100644
5405     --- a/include/linux/cpuhotplug.h
5406     +++ b/include/linux/cpuhotplug.h
5407     @@ -86,7 +86,7 @@ enum cpuhp_state {
5408     CPUHP_MM_ZSWP_POOL_PREPARE,
5409     CPUHP_KVM_PPC_BOOK3S_PREPARE,
5410     CPUHP_ZCOMP_PREPARE,
5411     - CPUHP_TIMERS_DEAD,
5412     + CPUHP_TIMERS_PREPARE,
5413     CPUHP_MIPS_SOC_PREPARE,
5414     CPUHP_BP_PREPARE_DYN,
5415     CPUHP_BP_PREPARE_DYN_END = CPUHP_BP_PREPARE_DYN + 20,
5416     diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
5417     index ea04ca024f0d..067a6fa675ed 100644
5418     --- a/include/linux/ipv6.h
5419     +++ b/include/linux/ipv6.h
5420     @@ -272,7 +272,8 @@ struct ipv6_pinfo {
5421     * 100: prefer care-of address
5422     */
5423     dontfrag:1,
5424     - autoflowlabel:1;
5425     + autoflowlabel:1,
5426     + autoflowlabel_set:1;
5427     __u8 min_hopcount;
5428     __u8 tclass;
5429     __be32 rcv_flowinfo;
5430     diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
5431     index 401c8972cc3a..8b3d0103c03a 100644
5432     --- a/include/linux/mlx5/driver.h
5433     +++ b/include/linux/mlx5/driver.h
5434     @@ -546,6 +546,7 @@ struct mlx5_core_sriov {
5435     };
5436    
5437     struct mlx5_irq_info {
5438     + cpumask_var_t mask;
5439     char name[MLX5_MAX_IRQ_NAME];
5440     };
5441    
5442     diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
5443     index 69772347f866..c8091f06eaa4 100644
5444     --- a/include/linux/mlx5/mlx5_ifc.h
5445     +++ b/include/linux/mlx5/mlx5_ifc.h
5446     @@ -147,7 +147,7 @@ enum {
5447     MLX5_CMD_OP_ALLOC_Q_COUNTER = 0x771,
5448     MLX5_CMD_OP_DEALLOC_Q_COUNTER = 0x772,
5449     MLX5_CMD_OP_QUERY_Q_COUNTER = 0x773,
5450     - MLX5_CMD_OP_SET_RATE_LIMIT = 0x780,
5451     + MLX5_CMD_OP_SET_PP_RATE_LIMIT = 0x780,
5452     MLX5_CMD_OP_QUERY_RATE_LIMIT = 0x781,
5453     MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT = 0x782,
5454     MLX5_CMD_OP_DESTROY_SCHEDULING_ELEMENT = 0x783,
5455     @@ -7233,7 +7233,7 @@ struct mlx5_ifc_add_vxlan_udp_dport_in_bits {
5456     u8 vxlan_udp_port[0x10];
5457     };
5458    
5459     -struct mlx5_ifc_set_rate_limit_out_bits {
5460     +struct mlx5_ifc_set_pp_rate_limit_out_bits {
5461     u8 status[0x8];
5462     u8 reserved_at_8[0x18];
5463    
5464     @@ -7242,7 +7242,7 @@ struct mlx5_ifc_set_rate_limit_out_bits {
5465     u8 reserved_at_40[0x40];
5466     };
5467    
5468     -struct mlx5_ifc_set_rate_limit_in_bits {
5469     +struct mlx5_ifc_set_pp_rate_limit_in_bits {
5470     u8 opcode[0x10];
5471     u8 reserved_at_10[0x10];
5472    
5473     @@ -7255,6 +7255,8 @@ struct mlx5_ifc_set_rate_limit_in_bits {
5474     u8 reserved_at_60[0x20];
5475    
5476     u8 rate_limit[0x20];
5477     +
5478     + u8 reserved_at_a0[0x160];
5479     };
5480    
5481     struct mlx5_ifc_access_register_out_bits {
5482     diff --git a/include/linux/pti.h b/include/linux/pti.h
5483     new file mode 100644
5484     index 000000000000..0174883a935a
5485     --- /dev/null
5486     +++ b/include/linux/pti.h
5487     @@ -0,0 +1,11 @@
5488     +// SPDX-License-Identifier: GPL-2.0
5489     +#ifndef _INCLUDE_PTI_H
5490     +#define _INCLUDE_PTI_H
5491     +
5492     +#ifdef CONFIG_PAGE_TABLE_ISOLATION
5493     +#include <asm/pti.h>
5494     +#else
5495     +static inline void pti_init(void) { }
5496     +#endif
5497     +
5498     +#endif
5499     diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h
5500     index 37b4bb2545b3..6866df4f31b5 100644
5501     --- a/include/linux/ptr_ring.h
5502     +++ b/include/linux/ptr_ring.h
5503     @@ -101,12 +101,18 @@ static inline bool ptr_ring_full_bh(struct ptr_ring *r)
5504    
5505     /* Note: callers invoking this in a loop must use a compiler barrier,
5506     * for example cpu_relax(). Callers must hold producer_lock.
5507     + * Callers are responsible for making sure pointer that is being queued
5508     + * points to a valid data.
5509     */
5510     static inline int __ptr_ring_produce(struct ptr_ring *r, void *ptr)
5511     {
5512     if (unlikely(!r->size) || r->queue[r->producer])
5513     return -ENOSPC;
5514    
5515     + /* Make sure the pointer we are storing points to a valid data. */
5516     + /* Pairs with smp_read_barrier_depends in __ptr_ring_consume. */
5517     + smp_wmb();
5518     +
5519     r->queue[r->producer++] = ptr;
5520     if (unlikely(r->producer >= r->size))
5521     r->producer = 0;
5522     @@ -275,6 +281,9 @@ static inline void *__ptr_ring_consume(struct ptr_ring *r)
5523     if (ptr)
5524     __ptr_ring_discard_one(r);
5525    
5526     + /* Make sure anyone accessing data through the pointer is up to date. */
5527     + /* Pairs with smp_wmb in __ptr_ring_produce. */
5528     + smp_read_barrier_depends();
5529     return ptr;
5530     }
5531    
5532     diff --git a/include/linux/tcp.h b/include/linux/tcp.h
5533     index 4aa40ef02d32..e8418fc77a43 100644
5534     --- a/include/linux/tcp.h
5535     +++ b/include/linux/tcp.h
5536     @@ -214,7 +214,8 @@ struct tcp_sock {
5537     u8 chrono_type:2, /* current chronograph type */
5538     rate_app_limited:1, /* rate_{delivered,interval_us} limited? */
5539     fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */
5540     - unused:4;
5541     + is_sack_reneg:1, /* in recovery from loss with SACK reneg? */
5542     + unused:3;
5543     u8 nonagle : 4,/* Disable Nagle algorithm? */
5544     thin_lto : 1,/* Use linear timeouts for thin streams */
5545     unused1 : 1,
5546     diff --git a/include/linux/tick.h b/include/linux/tick.h
5547     index cf413b344ddb..5cdac11dd317 100644
5548     --- a/include/linux/tick.h
5549     +++ b/include/linux/tick.h
5550     @@ -119,6 +119,7 @@ extern void tick_nohz_idle_exit(void);
5551     extern void tick_nohz_irq_exit(void);
5552     extern ktime_t tick_nohz_get_sleep_length(void);
5553     extern unsigned long tick_nohz_get_idle_calls(void);
5554     +extern unsigned long tick_nohz_get_idle_calls_cpu(int cpu);
5555     extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time);
5556     extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time);
5557     #else /* !CONFIG_NO_HZ_COMMON */
5558     diff --git a/include/linux/timer.h b/include/linux/timer.h
5559     index ac66f29c6916..e0ea1fe87572 100644
5560     --- a/include/linux/timer.h
5561     +++ b/include/linux/timer.h
5562     @@ -246,9 +246,11 @@ unsigned long round_jiffies_up(unsigned long j);
5563     unsigned long round_jiffies_up_relative(unsigned long j);
5564    
5565     #ifdef CONFIG_HOTPLUG_CPU
5566     +int timers_prepare_cpu(unsigned int cpu);
5567     int timers_dead_cpu(unsigned int cpu);
5568     #else
5569     -#define timers_dead_cpu NULL
5570     +#define timers_prepare_cpu NULL
5571     +#define timers_dead_cpu NULL
5572     #endif
5573    
5574     #endif
5575     diff --git a/include/net/ip.h b/include/net/ip.h
5576     index 9896f46cbbf1..af8addbaa3c1 100644
5577     --- a/include/net/ip.h
5578     +++ b/include/net/ip.h
5579     @@ -34,6 +34,7 @@
5580     #include <net/flow_dissector.h>
5581    
5582     #define IPV4_MAX_PMTU 65535U /* RFC 2675, Section 5.1 */
5583     +#define IPV4_MIN_MTU 68 /* RFC 791 */
5584    
5585     struct sock;
5586    
5587     diff --git a/include/net/tcp.h b/include/net/tcp.h
5588     index 6ced69940f5c..0a13574134b8 100644
5589     --- a/include/net/tcp.h
5590     +++ b/include/net/tcp.h
5591     @@ -1085,7 +1085,7 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb);
5592     void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
5593     struct rate_sample *rs);
5594     void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
5595     - struct rate_sample *rs);
5596     + bool is_sack_reneg, struct rate_sample *rs);
5597     void tcp_rate_check_app_limited(struct sock *sk);
5598    
5599     /* These functions determine how the current flow behaves in respect of SACK
5600     diff --git a/init/main.c b/init/main.c
5601     index 8a390f60ec81..b32ec72cdf3d 100644
5602     --- a/init/main.c
5603     +++ b/init/main.c
5604     @@ -75,6 +75,7 @@
5605     #include <linux/slab.h>
5606     #include <linux/perf_event.h>
5607     #include <linux/ptrace.h>
5608     +#include <linux/pti.h>
5609     #include <linux/blkdev.h>
5610     #include <linux/elevator.h>
5611     #include <linux/sched_clock.h>
5612     @@ -506,6 +507,8 @@ static void __init mm_init(void)
5613     ioremap_huge_init();
5614     /* Should be run before the first non-init thread is created */
5615     init_espfix_bsp();
5616     + /* Should be run after espfix64 is set up. */
5617     + pti_init();
5618     }
5619    
5620     asmlinkage __visible void __init start_kernel(void)
5621     diff --git a/kernel/cpu.c b/kernel/cpu.c
5622     index 7891aecc6aec..f21bfa3172d8 100644
5623     --- a/kernel/cpu.c
5624     +++ b/kernel/cpu.c
5625     @@ -1277,9 +1277,9 @@ static struct cpuhp_step cpuhp_bp_states[] = {
5626     * before blk_mq_queue_reinit_notify() from notify_dead(),
5627     * otherwise a RCU stall occurs.
5628     */
5629     - [CPUHP_TIMERS_DEAD] = {
5630     + [CPUHP_TIMERS_PREPARE] = {
5631     .name = "timers:dead",
5632     - .startup.single = NULL,
5633     + .startup.single = timers_prepare_cpu,
5634     .teardown.single = timers_dead_cpu,
5635     },
5636     /* Kicks the plugged cpu into life */
5637     diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
5638     index 2f52ec0f1539..d6717a3331a1 100644
5639     --- a/kernel/sched/cpufreq_schedutil.c
5640     +++ b/kernel/sched/cpufreq_schedutil.c
5641     @@ -244,7 +244,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
5642     #ifdef CONFIG_NO_HZ_COMMON
5643     static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
5644     {
5645     - unsigned long idle_calls = tick_nohz_get_idle_calls();
5646     + unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu);
5647     bool ret = idle_calls == sg_cpu->saved_idle_calls;
5648    
5649     sg_cpu->saved_idle_calls = idle_calls;
5650     diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
5651     index c7a899c5ce64..dfa4a117fee3 100644
5652     --- a/kernel/time/tick-sched.c
5653     +++ b/kernel/time/tick-sched.c
5654     @@ -674,6 +674,11 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
5655     ts->next_tick = 0;
5656     }
5657    
5658     +static inline bool local_timer_softirq_pending(void)
5659     +{
5660     + return local_softirq_pending() & TIMER_SOFTIRQ;
5661     +}
5662     +
5663     static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
5664     ktime_t now, int cpu)
5665     {
5666     @@ -690,8 +695,18 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
5667     } while (read_seqretry(&jiffies_lock, seq));
5668     ts->last_jiffies = basejiff;
5669    
5670     - if (rcu_needs_cpu(basemono, &next_rcu) ||
5671     - arch_needs_cpu() || irq_work_needs_cpu()) {
5672     + /*
5673     + * Keep the periodic tick, when RCU, architecture or irq_work
5674     + * requests it.
5675     + * Aside of that check whether the local timer softirq is
5676     + * pending. If so its a bad idea to call get_next_timer_interrupt()
5677     + * because there is an already expired timer, so it will request
5678     + * immeditate expiry, which rearms the hardware timer with a
5679     + * minimal delta which brings us back to this place
5680     + * immediately. Lather, rinse and repeat...
5681     + */
5682     + if (rcu_needs_cpu(basemono, &next_rcu) || arch_needs_cpu() ||
5683     + irq_work_needs_cpu() || local_timer_softirq_pending()) {
5684     next_tick = basemono + TICK_NSEC;
5685     } else {
5686     /*
5687     @@ -1009,6 +1024,19 @@ ktime_t tick_nohz_get_sleep_length(void)
5688     return ts->sleep_length;
5689     }
5690    
5691     +/**
5692     + * tick_nohz_get_idle_calls_cpu - return the current idle calls counter value
5693     + * for a particular CPU.
5694     + *
5695     + * Called from the schedutil frequency scaling governor in scheduler context.
5696     + */
5697     +unsigned long tick_nohz_get_idle_calls_cpu(int cpu)
5698     +{
5699     + struct tick_sched *ts = tick_get_tick_sched(cpu);
5700     +
5701     + return ts->idle_calls;
5702     +}
5703     +
5704     /**
5705     * tick_nohz_get_idle_calls - return the current idle calls counter value
5706     *
5707     diff --git a/kernel/time/timer.c b/kernel/time/timer.c
5708     index f2674a056c26..73e3cdbc61f1 100644
5709     --- a/kernel/time/timer.c
5710     +++ b/kernel/time/timer.c
5711     @@ -814,11 +814,10 @@ static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu)
5712     struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu);
5713    
5714     /*
5715     - * If the timer is deferrable and nohz is active then we need to use
5716     - * the deferrable base.
5717     + * If the timer is deferrable and NO_HZ_COMMON is set then we need
5718     + * to use the deferrable base.
5719     */
5720     - if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active &&
5721     - (tflags & TIMER_DEFERRABLE))
5722     + if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
5723     base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu);
5724     return base;
5725     }
5726     @@ -828,11 +827,10 @@ static inline struct timer_base *get_timer_this_cpu_base(u32 tflags)
5727     struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
5728    
5729     /*
5730     - * If the timer is deferrable and nohz is active then we need to use
5731     - * the deferrable base.
5732     + * If the timer is deferrable and NO_HZ_COMMON is set then we need
5733     + * to use the deferrable base.
5734     */
5735     - if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active &&
5736     - (tflags & TIMER_DEFERRABLE))
5737     + if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
5738     base = this_cpu_ptr(&timer_bases[BASE_DEF]);
5739     return base;
5740     }
5741     @@ -984,8 +982,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
5742     if (!ret && pending_only)
5743     goto out_unlock;
5744    
5745     - debug_activate(timer, expires);
5746     -
5747     new_base = get_target_base(base, timer->flags);
5748    
5749     if (base != new_base) {
5750     @@ -1009,6 +1005,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
5751     }
5752     }
5753    
5754     + debug_activate(timer, expires);
5755     +
5756     timer->expires = expires;
5757     /*
5758     * If 'idx' was calculated above and the base time did not advance
5759     @@ -1644,7 +1642,7 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)
5760     base->must_forward_clk = false;
5761    
5762     __run_timers(base);
5763     - if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active)
5764     + if (IS_ENABLED(CONFIG_NO_HZ_COMMON))
5765     __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
5766     }
5767    
5768     @@ -1803,6 +1801,21 @@ static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *h
5769     }
5770     }
5771    
5772     +int timers_prepare_cpu(unsigned int cpu)
5773     +{
5774     + struct timer_base *base;
5775     + int b;
5776     +
5777     + for (b = 0; b < NR_BASES; b++) {
5778     + base = per_cpu_ptr(&timer_bases[b], cpu);
5779     + base->clk = jiffies;
5780     + base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
5781     + base->is_idle = false;
5782     + base->must_forward_clk = true;
5783     + }
5784     + return 0;
5785     +}
5786     +
5787     int timers_dead_cpu(unsigned int cpu)
5788     {
5789     struct timer_base *old_base;
5790     diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
5791     index 81279c6602ff..0476a9372014 100644
5792     --- a/kernel/trace/ring_buffer.c
5793     +++ b/kernel/trace/ring_buffer.c
5794     @@ -281,6 +281,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
5795     /* Missed count stored at end */
5796     #define RB_MISSED_STORED (1 << 30)
5797    
5798     +#define RB_MISSED_FLAGS (RB_MISSED_EVENTS|RB_MISSED_STORED)
5799     +
5800     struct buffer_data_page {
5801     u64 time_stamp; /* page time stamp */
5802     local_t commit; /* write committed index */
5803     @@ -332,7 +334,9 @@ static void rb_init_page(struct buffer_data_page *bpage)
5804     */
5805     size_t ring_buffer_page_len(void *page)
5806     {
5807     - return local_read(&((struct buffer_data_page *)page)->commit)
5808     + struct buffer_data_page *bpage = page;
5809     +
5810     + return (local_read(&bpage->commit) & ~RB_MISSED_FLAGS)
5811     + BUF_PAGE_HDR_SIZE;
5812     }
5813    
5814     @@ -4439,8 +4443,13 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data)
5815     {
5816     struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
5817     struct buffer_data_page *bpage = data;
5818     + struct page *page = virt_to_page(bpage);
5819     unsigned long flags;
5820    
5821     + /* If the page is still in use someplace else, we can't reuse it */
5822     + if (page_ref_count(page) > 1)
5823     + goto out;
5824     +
5825     local_irq_save(flags);
5826     arch_spin_lock(&cpu_buffer->lock);
5827    
5828     @@ -4452,6 +4461,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data)
5829     arch_spin_unlock(&cpu_buffer->lock);
5830     local_irq_restore(flags);
5831    
5832     + out:
5833     free_page((unsigned long)bpage);
5834     }
5835     EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
5836     diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
5837     index 80de14973b42..76bcc80b893e 100644
5838     --- a/kernel/trace/trace.c
5839     +++ b/kernel/trace/trace.c
5840     @@ -6769,7 +6769,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
5841     .spd_release = buffer_spd_release,
5842     };
5843     struct buffer_ref *ref;
5844     - int entries, size, i;
5845     + int entries, i;
5846     ssize_t ret = 0;
5847    
5848     #ifdef CONFIG_TRACER_MAX_TRACE
5849     @@ -6823,14 +6823,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
5850     break;
5851     }
5852    
5853     - /*
5854     - * zero out any left over data, this is going to
5855     - * user land.
5856     - */
5857     - size = ring_buffer_page_len(ref->page);
5858     - if (size < PAGE_SIZE)
5859     - memset(ref->page + size, 0, PAGE_SIZE - size);
5860     -
5861     page = virt_to_page(ref->page);
5862    
5863     spd.pages[i] = page;
5864     @@ -7588,6 +7580,7 @@ allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size
5865     buf->data = alloc_percpu(struct trace_array_cpu);
5866     if (!buf->data) {
5867     ring_buffer_free(buf->buffer);
5868     + buf->buffer = NULL;
5869     return -ENOMEM;
5870     }
5871    
5872     @@ -7611,7 +7604,9 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
5873     allocate_snapshot ? size : 1);
5874     if (WARN_ON(ret)) {
5875     ring_buffer_free(tr->trace_buffer.buffer);
5876     + tr->trace_buffer.buffer = NULL;
5877     free_percpu(tr->trace_buffer.data);
5878     + tr->trace_buffer.data = NULL;
5879     return -ENOMEM;
5880     }
5881     tr->allocated_snapshot = allocate_snapshot;
5882     diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
5883     index de2152730809..08190db0a2dc 100644
5884     --- a/net/bridge/br_netlink.c
5885     +++ b/net/bridge/br_netlink.c
5886     @@ -1223,19 +1223,20 @@ static int br_dev_newlink(struct net *src_net, struct net_device *dev,
5887     struct net_bridge *br = netdev_priv(dev);
5888     int err;
5889    
5890     + err = register_netdevice(dev);
5891     + if (err)
5892     + return err;
5893     +
5894     if (tb[IFLA_ADDRESS]) {
5895     spin_lock_bh(&br->lock);
5896     br_stp_change_bridge_id(br, nla_data(tb[IFLA_ADDRESS]));
5897     spin_unlock_bh(&br->lock);
5898     }
5899    
5900     - err = register_netdevice(dev);
5901     - if (err)
5902     - return err;
5903     -
5904     err = br_changelink(dev, tb, data, extack);
5905     if (err)
5906     - unregister_netdevice(dev);
5907     + br_dev_delete(dev, NULL);
5908     +
5909     return err;
5910     }
5911    
5912     diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
5913     index 6cfdc7c84c48..0dd6359e5924 100644
5914     --- a/net/core/net_namespace.c
5915     +++ b/net/core/net_namespace.c
5916     @@ -266,7 +266,7 @@ struct net *get_net_ns_by_id(struct net *net, int id)
5917     spin_lock_bh(&net->nsid_lock);
5918     peer = idr_find(&net->netns_ids, id);
5919     if (peer)
5920     - get_net(peer);
5921     + peer = maybe_get_net(peer);
5922     spin_unlock_bh(&net->nsid_lock);
5923     rcu_read_unlock();
5924    
5925     diff --git a/net/core/skbuff.c b/net/core/skbuff.c
5926     index e140ba49b30a..15fa5baa8fae 100644
5927     --- a/net/core/skbuff.c
5928     +++ b/net/core/skbuff.c
5929     @@ -1181,12 +1181,12 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
5930     int i, new_frags;
5931     u32 d_off;
5932    
5933     - if (!num_frags)
5934     - return 0;
5935     -
5936     if (skb_shared(skb) || skb_unclone(skb, gfp_mask))
5937     return -EINVAL;
5938    
5939     + if (!num_frags)
5940     + goto release;
5941     +
5942     new_frags = (__skb_pagelen(skb) + PAGE_SIZE - 1) >> PAGE_SHIFT;
5943     for (i = 0; i < new_frags; i++) {
5944     page = alloc_page(gfp_mask);
5945     @@ -1242,6 +1242,7 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
5946     __skb_fill_page_desc(skb, new_frags - 1, head, 0, d_off);
5947     skb_shinfo(skb)->nr_frags = new_frags;
5948    
5949     +release:
5950     skb_zcopy_clear(skb, false);
5951     return 0;
5952     }
5953     @@ -3657,8 +3658,6 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
5954    
5955     skb_shinfo(nskb)->tx_flags |= skb_shinfo(head_skb)->tx_flags &
5956     SKBTX_SHARED_FRAG;
5957     - if (skb_zerocopy_clone(nskb, head_skb, GFP_ATOMIC))
5958     - goto err;
5959    
5960     while (pos < offset + len) {
5961     if (i >= nfrags) {
5962     @@ -3684,6 +3683,8 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
5963    
5964     if (unlikely(skb_orphan_frags(frag_skb, GFP_ATOMIC)))
5965     goto err;
5966     + if (skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC))
5967     + goto err;
5968    
5969     *nskb_frag = *frag;
5970     __skb_frag_ref(nskb_frag);
5971     @@ -4296,7 +4297,7 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
5972     struct sock *sk = skb->sk;
5973    
5974     if (!skb_may_tx_timestamp(sk, false))
5975     - return;
5976     + goto err;
5977    
5978     /* Take a reference to prevent skb_orphan() from freeing the socket,
5979     * but only if the socket refcount is not zero.
5980     @@ -4305,7 +4306,11 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
5981     *skb_hwtstamps(skb) = *hwtstamps;
5982     __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false);
5983     sock_put(sk);
5984     + return;
5985     }
5986     +
5987     +err:
5988     + kfree_skb(skb);
5989     }
5990     EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
5991    
5992     diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
5993     index d7adc0616599..bffa88ecc534 100644
5994     --- a/net/ipv4/devinet.c
5995     +++ b/net/ipv4/devinet.c
5996     @@ -1420,7 +1420,7 @@ static void inetdev_changename(struct net_device *dev, struct in_device *in_dev)
5997    
5998     static bool inetdev_valid_mtu(unsigned int mtu)
5999     {
6000     - return mtu >= 68;
6001     + return mtu >= IPV4_MIN_MTU;
6002     }
6003    
6004     static void inetdev_send_gratuitous_arp(struct net_device *dev,
6005     diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
6006     index 37819ab4cc74..d72874150905 100644
6007     --- a/net/ipv4/fib_frontend.c
6008     +++ b/net/ipv4/fib_frontend.c
6009     @@ -1274,14 +1274,19 @@ static int __net_init ip_fib_net_init(struct net *net)
6010    
6011     static void ip_fib_net_exit(struct net *net)
6012     {
6013     - unsigned int i;
6014     + int i;
6015    
6016     rtnl_lock();
6017     #ifdef CONFIG_IP_MULTIPLE_TABLES
6018     RCU_INIT_POINTER(net->ipv4.fib_main, NULL);
6019     RCU_INIT_POINTER(net->ipv4.fib_default, NULL);
6020     #endif
6021     - for (i = 0; i < FIB_TABLE_HASHSZ; i++) {
6022     + /* Destroy the tables in reverse order to guarantee that the
6023     + * local table, ID 255, is destroyed before the main table, ID
6024     + * 254. This is necessary as the local table may contain
6025     + * references to data contained in the main table.
6026     + */
6027     + for (i = FIB_TABLE_HASHSZ - 1; i >= 0; i--) {
6028     struct hlist_head *head = &net->ipv4.fib_table_hash[i];
6029     struct hlist_node *tmp;
6030     struct fib_table *tb;
6031     diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
6032     index 01ed22139ac2..aff3751df950 100644
6033     --- a/net/ipv4/fib_semantics.c
6034     +++ b/net/ipv4/fib_semantics.c
6035     @@ -706,7 +706,7 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi)
6036    
6037     nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
6038     int type = nla_type(nla);
6039     - u32 val;
6040     + u32 fi_val, val;
6041    
6042     if (!type)
6043     continue;
6044     @@ -723,7 +723,11 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi)
6045     val = nla_get_u32(nla);
6046     }
6047    
6048     - if (fi->fib_metrics->metrics[type - 1] != val)
6049     + fi_val = fi->fib_metrics->metrics[type - 1];
6050     + if (type == RTAX_FEATURES)
6051     + fi_val &= ~DST_FEATURE_ECN_CA;
6052     +
6053     + if (fi_val != val)
6054     return false;
6055     }
6056    
6057     diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
6058     index ab183af0b5b6..c621266e0306 100644
6059     --- a/net/ipv4/igmp.c
6060     +++ b/net/ipv4/igmp.c
6061     @@ -89,6 +89,7 @@
6062     #include <linux/rtnetlink.h>
6063     #include <linux/times.h>
6064     #include <linux/pkt_sched.h>
6065     +#include <linux/byteorder/generic.h>
6066    
6067     #include <net/net_namespace.h>
6068     #include <net/arp.h>
6069     @@ -321,6 +322,23 @@ igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted)
6070     return scount;
6071     }
6072    
6073     +/* source address selection per RFC 3376 section 4.2.13 */
6074     +static __be32 igmpv3_get_srcaddr(struct net_device *dev,
6075     + const struct flowi4 *fl4)
6076     +{
6077     + struct in_device *in_dev = __in_dev_get_rcu(dev);
6078     +
6079     + if (!in_dev)
6080     + return htonl(INADDR_ANY);
6081     +
6082     + for_ifa(in_dev) {
6083     + if (inet_ifa_match(fl4->saddr, ifa))
6084     + return fl4->saddr;
6085     + } endfor_ifa(in_dev);
6086     +
6087     + return htonl(INADDR_ANY);
6088     +}
6089     +
6090     static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu)
6091     {
6092     struct sk_buff *skb;
6093     @@ -368,7 +386,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu)
6094     pip->frag_off = htons(IP_DF);
6095     pip->ttl = 1;
6096     pip->daddr = fl4.daddr;
6097     - pip->saddr = fl4.saddr;
6098     + pip->saddr = igmpv3_get_srcaddr(dev, &fl4);
6099     pip->protocol = IPPROTO_IGMP;
6100     pip->tot_len = 0; /* filled in later */
6101     ip_select_ident(net, skb, NULL);
6102     @@ -404,16 +422,17 @@ static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel)
6103     }
6104    
6105     static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc,
6106     - int type, struct igmpv3_grec **ppgr)
6107     + int type, struct igmpv3_grec **ppgr, unsigned int mtu)
6108     {
6109     struct net_device *dev = pmc->interface->dev;
6110     struct igmpv3_report *pih;
6111     struct igmpv3_grec *pgr;
6112    
6113     - if (!skb)
6114     - skb = igmpv3_newpack(dev, dev->mtu);
6115     - if (!skb)
6116     - return NULL;
6117     + if (!skb) {
6118     + skb = igmpv3_newpack(dev, mtu);
6119     + if (!skb)
6120     + return NULL;
6121     + }
6122     pgr = skb_put(skb, sizeof(struct igmpv3_grec));
6123     pgr->grec_type = type;
6124     pgr->grec_auxwords = 0;
6125     @@ -436,12 +455,17 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
6126     struct igmpv3_grec *pgr = NULL;
6127     struct ip_sf_list *psf, *psf_next, *psf_prev, **psf_list;
6128     int scount, stotal, first, isquery, truncate;
6129     + unsigned int mtu;
6130    
6131     if (pmc->multiaddr == IGMP_ALL_HOSTS)
6132     return skb;
6133     if (ipv4_is_local_multicast(pmc->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports)
6134     return skb;
6135    
6136     + mtu = READ_ONCE(dev->mtu);
6137     + if (mtu < IPV4_MIN_MTU)
6138     + return skb;
6139     +
6140     isquery = type == IGMPV3_MODE_IS_INCLUDE ||
6141     type == IGMPV3_MODE_IS_EXCLUDE;
6142     truncate = type == IGMPV3_MODE_IS_EXCLUDE ||
6143     @@ -462,7 +486,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
6144     AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) {
6145     if (skb)
6146     igmpv3_sendpack(skb);
6147     - skb = igmpv3_newpack(dev, dev->mtu);
6148     + skb = igmpv3_newpack(dev, mtu);
6149     }
6150     }
6151     first = 1;
6152     @@ -498,12 +522,12 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
6153     pgr->grec_nsrcs = htons(scount);
6154     if (skb)
6155     igmpv3_sendpack(skb);
6156     - skb = igmpv3_newpack(dev, dev->mtu);
6157     + skb = igmpv3_newpack(dev, mtu);
6158     first = 1;
6159     scount = 0;
6160     }
6161     if (first) {
6162     - skb = add_grhead(skb, pmc, type, &pgr);
6163     + skb = add_grhead(skb, pmc, type, &pgr, mtu);
6164     first = 0;
6165     }
6166     if (!skb)
6167     @@ -538,7 +562,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
6168     igmpv3_sendpack(skb);
6169     skb = NULL; /* add_grhead will get a new one */
6170     }
6171     - skb = add_grhead(skb, pmc, type, &pgr);
6172     + skb = add_grhead(skb, pmc, type, &pgr, mtu);
6173     }
6174     }
6175     if (pgr)
6176     diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
6177     index e9805ad664ac..4e90082b23a6 100644
6178     --- a/net/ipv4/ip_tunnel.c
6179     +++ b/net/ipv4/ip_tunnel.c
6180     @@ -349,8 +349,8 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
6181     dev->needed_headroom = t_hlen + hlen;
6182     mtu -= (dev->hard_header_len + t_hlen);
6183    
6184     - if (mtu < 68)
6185     - mtu = 68;
6186     + if (mtu < IPV4_MIN_MTU)
6187     + mtu = IPV4_MIN_MTU;
6188    
6189     return mtu;
6190     }
6191     diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
6192     index 33b70bfd1122..125c1eab3eaa 100644
6193     --- a/net/ipv4/raw.c
6194     +++ b/net/ipv4/raw.c
6195     @@ -513,11 +513,16 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
6196     int err;
6197     struct ip_options_data opt_copy;
6198     struct raw_frag_vec rfv;
6199     + int hdrincl;
6200    
6201     err = -EMSGSIZE;
6202     if (len > 0xFFFF)
6203     goto out;
6204    
6205     + /* hdrincl should be READ_ONCE(inet->hdrincl)
6206     + * but READ_ONCE() doesn't work with bit fields
6207     + */
6208     + hdrincl = inet->hdrincl;
6209     /*
6210     * Check the flags.
6211     */
6212     @@ -593,7 +598,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
6213     /* Linux does not mangle headers on raw sockets,
6214     * so that IP options + IP_HDRINCL is non-sense.
6215     */
6216     - if (inet->hdrincl)
6217     + if (hdrincl)
6218     goto done;
6219     if (ipc.opt->opt.srr) {
6220     if (!daddr)
6221     @@ -615,12 +620,12 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
6222    
6223     flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
6224     RT_SCOPE_UNIVERSE,
6225     - inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
6226     + hdrincl ? IPPROTO_RAW : sk->sk_protocol,
6227     inet_sk_flowi_flags(sk) |
6228     - (inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
6229     + (hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
6230     daddr, saddr, 0, 0, sk->sk_uid);
6231    
6232     - if (!inet->hdrincl) {
6233     + if (!hdrincl) {
6234     rfv.msg = msg;
6235     rfv.hlen = 0;
6236    
6237     @@ -645,7 +650,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
6238     goto do_confirm;
6239     back_from_confirm:
6240    
6241     - if (inet->hdrincl)
6242     + if (hdrincl)
6243     err = raw_send_hdrinc(sk, &fl4, msg, len,
6244     &rt, msg->msg_flags, &ipc.sockc);
6245    
6246     diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
6247     index 5091402720ab..a0c72b09cefc 100644
6248     --- a/net/ipv4/tcp.c
6249     +++ b/net/ipv4/tcp.c
6250     @@ -2356,6 +2356,7 @@ int tcp_disconnect(struct sock *sk, int flags)
6251     tp->snd_cwnd_cnt = 0;
6252     tp->window_clamp = 0;
6253     tcp_set_ca_state(sk, TCP_CA_Open);
6254     + tp->is_sack_reneg = 0;
6255     tcp_clear_retrans(tp);
6256     inet_csk_delack_init(sk);
6257     /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0
6258     diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
6259     index 69ee877574d0..8322f26e770e 100644
6260     --- a/net/ipv4/tcp_bbr.c
6261     +++ b/net/ipv4/tcp_bbr.c
6262     @@ -110,7 +110,8 @@ struct bbr {
6263     u32 lt_last_lost; /* LT intvl start: tp->lost */
6264     u32 pacing_gain:10, /* current gain for setting pacing rate */
6265     cwnd_gain:10, /* current gain for setting cwnd */
6266     - full_bw_cnt:3, /* number of rounds without large bw gains */
6267     + full_bw_reached:1, /* reached full bw in Startup? */
6268     + full_bw_cnt:2, /* number of rounds without large bw gains */
6269     cycle_idx:3, /* current index in pacing_gain cycle array */
6270     has_seen_rtt:1, /* have we seen an RTT sample yet? */
6271     unused_b:5;
6272     @@ -180,7 +181,7 @@ static bool bbr_full_bw_reached(const struct sock *sk)
6273     {
6274     const struct bbr *bbr = inet_csk_ca(sk);
6275    
6276     - return bbr->full_bw_cnt >= bbr_full_bw_cnt;
6277     + return bbr->full_bw_reached;
6278     }
6279    
6280     /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */
6281     @@ -717,6 +718,7 @@ static void bbr_check_full_bw_reached(struct sock *sk,
6282     return;
6283     }
6284     ++bbr->full_bw_cnt;
6285     + bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt;
6286     }
6287    
6288     /* If pipe is probably full, drain the queue and then enter steady-state. */
6289     @@ -850,6 +852,7 @@ static void bbr_init(struct sock *sk)
6290     bbr->restore_cwnd = 0;
6291     bbr->round_start = 0;
6292     bbr->idle_restart = 0;
6293     + bbr->full_bw_reached = 0;
6294     bbr->full_bw = 0;
6295     bbr->full_bw_cnt = 0;
6296     bbr->cycle_mstamp = 0;
6297     @@ -871,6 +874,11 @@ static u32 bbr_sndbuf_expand(struct sock *sk)
6298     */
6299     static u32 bbr_undo_cwnd(struct sock *sk)
6300     {
6301     + struct bbr *bbr = inet_csk_ca(sk);
6302     +
6303     + bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */
6304     + bbr->full_bw_cnt = 0;
6305     + bbr_reset_lt_bw_sampling(sk);
6306     return tcp_sk(sk)->snd_cwnd;
6307     }
6308    
6309     diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
6310     index c5447b9f8517..ff48ac654e5a 100644
6311     --- a/net/ipv4/tcp_input.c
6312     +++ b/net/ipv4/tcp_input.c
6313     @@ -521,9 +521,6 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
6314     u32 new_sample = tp->rcv_rtt_est.rtt_us;
6315     long m = sample;
6316    
6317     - if (m == 0)
6318     - m = 1;
6319     -
6320     if (new_sample != 0) {
6321     /* If we sample in larger samples in the non-timestamp
6322     * case, we could grossly overestimate the RTT especially
6323     @@ -560,6 +557,8 @@ static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
6324     if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
6325     return;
6326     delta_us = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcv_rtt_est.time);
6327     + if (!delta_us)
6328     + delta_us = 1;
6329     tcp_rcv_rtt_update(tp, delta_us, 1);
6330    
6331     new_measure:
6332     @@ -576,8 +575,11 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
6333     (TCP_SKB_CB(skb)->end_seq -
6334     TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss)) {
6335     u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
6336     - u32 delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
6337     + u32 delta_us;
6338    
6339     + if (!delta)
6340     + delta = 1;
6341     + delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
6342     tcp_rcv_rtt_update(tp, delta_us, 0);
6343     }
6344     }
6345     @@ -1975,6 +1977,8 @@ void tcp_enter_loss(struct sock *sk)
6346     NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
6347     tp->sacked_out = 0;
6348     tp->fackets_out = 0;
6349     + /* Mark SACK reneging until we recover from this loss event. */
6350     + tp->is_sack_reneg = 1;
6351     }
6352     tcp_clear_all_retrans_hints(tp);
6353    
6354     @@ -2428,6 +2432,7 @@ static bool tcp_try_undo_recovery(struct sock *sk)
6355     return true;
6356     }
6357     tcp_set_ca_state(sk, TCP_CA_Open);
6358     + tp->is_sack_reneg = 0;
6359     return false;
6360     }
6361    
6362     @@ -2459,8 +2464,10 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
6363     NET_INC_STATS(sock_net(sk),
6364     LINUX_MIB_TCPSPURIOUSRTOS);
6365     inet_csk(sk)->icsk_retransmits = 0;
6366     - if (frto_undo || tcp_is_sack(tp))
6367     + if (frto_undo || tcp_is_sack(tp)) {
6368     tcp_set_ca_state(sk, TCP_CA_Open);
6369     + tp->is_sack_reneg = 0;
6370     + }
6371     return true;
6372     }
6373     return false;
6374     @@ -3551,6 +3558,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
6375     struct tcp_sacktag_state sack_state;
6376     struct rate_sample rs = { .prior_delivered = 0 };
6377     u32 prior_snd_una = tp->snd_una;
6378     + bool is_sack_reneg = tp->is_sack_reneg;
6379     u32 ack_seq = TCP_SKB_CB(skb)->seq;
6380     u32 ack = TCP_SKB_CB(skb)->ack_seq;
6381     bool is_dupack = false;
6382     @@ -3666,7 +3674,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
6383    
6384     delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */
6385     lost = tp->lost - lost; /* freshly marked lost */
6386     - tcp_rate_gen(sk, delivered, lost, sack_state.rate);
6387     + tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate);
6388     tcp_cong_control(sk, ack, delivered, flag, sack_state.rate);
6389     tcp_xmit_recovery(sk, rexmit);
6390     return 1;
6391     diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
6392     index 5a5ed4f14678..cab4b935e474 100644
6393     --- a/net/ipv4/tcp_ipv4.c
6394     +++ b/net/ipv4/tcp_ipv4.c
6395     @@ -844,7 +844,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
6396     tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
6397     req->ts_recent,
6398     0,
6399     - tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
6400     + tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
6401     AF_INET),
6402     inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
6403     ip_hdr(skb)->tos);
6404     diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c
6405     index 3330a370d306..c61240e43923 100644
6406     --- a/net/ipv4/tcp_rate.c
6407     +++ b/net/ipv4/tcp_rate.c
6408     @@ -106,7 +106,7 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
6409    
6410     /* Update the connection delivery information and generate a rate sample. */
6411     void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
6412     - struct rate_sample *rs)
6413     + bool is_sack_reneg, struct rate_sample *rs)
6414     {
6415     struct tcp_sock *tp = tcp_sk(sk);
6416     u32 snd_us, ack_us;
6417     @@ -124,8 +124,12 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
6418    
6419     rs->acked_sacked = delivered; /* freshly ACKed or SACKed */
6420     rs->losses = lost; /* freshly marked lost */
6421     - /* Return an invalid sample if no timing information is available. */
6422     - if (!rs->prior_mstamp) {
6423     + /* Return an invalid sample if no timing information is available or
6424     + * in recovery from loss with SACK reneging. Rate samples taken during
6425     + * a SACK reneging event may overestimate bw by including packets that
6426     + * were SACKed before the reneg.
6427     + */
6428     + if (!rs->prior_mstamp || is_sack_reneg) {
6429     rs->delivered = -1;
6430     rs->interval_us = -1;
6431     return;
6432     diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
6433     index 655dd8d7f064..e9af1879cd53 100644
6434     --- a/net/ipv4/tcp_timer.c
6435     +++ b/net/ipv4/tcp_timer.c
6436     @@ -264,6 +264,7 @@ void tcp_delack_timer_handler(struct sock *sk)
6437     icsk->icsk_ack.pingpong = 0;
6438     icsk->icsk_ack.ato = TCP_ATO_MIN;
6439     }
6440     + tcp_mstamp_refresh(tcp_sk(sk));
6441     tcp_send_ack(sk);
6442     __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS);
6443     }
6444     @@ -627,6 +628,7 @@ static void tcp_keepalive_timer (unsigned long data)
6445     goto out;
6446     }
6447    
6448     + tcp_mstamp_refresh(tp);
6449     if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
6450     if (tp->linger2 >= 0) {
6451     const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;
6452     diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
6453     index 2ec39404c449..c5318f5f6a14 100644
6454     --- a/net/ipv6/addrconf.c
6455     +++ b/net/ipv6/addrconf.c
6456     @@ -231,7 +231,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
6457     .proxy_ndp = 0,
6458     .accept_source_route = 0, /* we do not accept RH0 by default. */
6459     .disable_ipv6 = 0,
6460     - .accept_dad = 1,
6461     + .accept_dad = 0,
6462     .suppress_frag_ndisc = 1,
6463     .accept_ra_mtu = 1,
6464     .stable_secret = {
6465     diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
6466     index fe5262fd6aa5..bcbd5f3bf8bd 100644
6467     --- a/net/ipv6/af_inet6.c
6468     +++ b/net/ipv6/af_inet6.c
6469     @@ -210,7 +210,6 @@ static int inet6_create(struct net *net, struct socket *sock, int protocol,
6470     np->mcast_hops = IPV6_DEFAULT_MCASTHOPS;
6471     np->mc_loop = 1;
6472     np->pmtudisc = IPV6_PMTUDISC_WANT;
6473     - np->autoflowlabel = ip6_default_np_autolabel(net);
6474     np->repflow = net->ipv6.sysctl.flowlabel_reflect;
6475     sk->sk_ipv6only = net->ipv6.sysctl.bindv6only;
6476    
6477     diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
6478     index 5d6bee070871..7a2df6646486 100644
6479     --- a/net/ipv6/ip6_gre.c
6480     +++ b/net/ipv6/ip6_gre.c
6481     @@ -1020,6 +1020,36 @@ static void ip6gre_tunnel_setup(struct net_device *dev)
6482     eth_random_addr(dev->perm_addr);
6483     }
6484    
6485     +#define GRE6_FEATURES (NETIF_F_SG | \
6486     + NETIF_F_FRAGLIST | \
6487     + NETIF_F_HIGHDMA | \
6488     + NETIF_F_HW_CSUM)
6489     +
6490     +static void ip6gre_tnl_init_features(struct net_device *dev)
6491     +{
6492     + struct ip6_tnl *nt = netdev_priv(dev);
6493     +
6494     + dev->features |= GRE6_FEATURES;
6495     + dev->hw_features |= GRE6_FEATURES;
6496     +
6497     + if (!(nt->parms.o_flags & TUNNEL_SEQ)) {
6498     + /* TCP offload with GRE SEQ is not supported, nor
6499     + * can we support 2 levels of outer headers requiring
6500     + * an update.
6501     + */
6502     + if (!(nt->parms.o_flags & TUNNEL_CSUM) ||
6503     + nt->encap.type == TUNNEL_ENCAP_NONE) {
6504     + dev->features |= NETIF_F_GSO_SOFTWARE;
6505     + dev->hw_features |= NETIF_F_GSO_SOFTWARE;
6506     + }
6507     +
6508     + /* Can use a lockless transmit, unless we generate
6509     + * output sequences
6510     + */
6511     + dev->features |= NETIF_F_LLTX;
6512     + }
6513     +}
6514     +
6515     static int ip6gre_tunnel_init_common(struct net_device *dev)
6516     {
6517     struct ip6_tnl *tunnel;
6518     @@ -1054,6 +1084,8 @@ static int ip6gre_tunnel_init_common(struct net_device *dev)
6519     if (!(tunnel->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
6520     dev->mtu -= 8;
6521    
6522     + ip6gre_tnl_init_features(dev);
6523     +
6524     return 0;
6525     }
6526    
6527     @@ -1302,11 +1334,6 @@ static const struct net_device_ops ip6gre_tap_netdev_ops = {
6528     .ndo_get_iflink = ip6_tnl_get_iflink,
6529     };
6530    
6531     -#define GRE6_FEATURES (NETIF_F_SG | \
6532     - NETIF_F_FRAGLIST | \
6533     - NETIF_F_HIGHDMA | \
6534     - NETIF_F_HW_CSUM)
6535     -
6536     static void ip6gre_tap_setup(struct net_device *dev)
6537     {
6538    
6539     @@ -1386,26 +1413,6 @@ static int ip6gre_newlink(struct net *src_net, struct net_device *dev,
6540     nt->net = dev_net(dev);
6541     ip6gre_tnl_link_config(nt, !tb[IFLA_MTU]);
6542    
6543     - dev->features |= GRE6_FEATURES;
6544     - dev->hw_features |= GRE6_FEATURES;
6545     -
6546     - if (!(nt->parms.o_flags & TUNNEL_SEQ)) {
6547     - /* TCP offload with GRE SEQ is not supported, nor
6548     - * can we support 2 levels of outer headers requiring
6549     - * an update.
6550     - */
6551     - if (!(nt->parms.o_flags & TUNNEL_CSUM) ||
6552     - (nt->encap.type == TUNNEL_ENCAP_NONE)) {
6553     - dev->features |= NETIF_F_GSO_SOFTWARE;
6554     - dev->hw_features |= NETIF_F_GSO_SOFTWARE;
6555     - }
6556     -
6557     - /* Can use a lockless transmit, unless we generate
6558     - * output sequences
6559     - */
6560     - dev->features |= NETIF_F_LLTX;
6561     - }
6562     -
6563     err = register_netdevice(dev);
6564     if (err)
6565     goto out;
6566     diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
6567     index 5110a418cc4d..f7dd51c42314 100644
6568     --- a/net/ipv6/ip6_output.c
6569     +++ b/net/ipv6/ip6_output.c
6570     @@ -166,6 +166,14 @@ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
6571     !(IP6CB(skb)->flags & IP6SKB_REROUTED));
6572     }
6573    
6574     +static bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
6575     +{
6576     + if (!np->autoflowlabel_set)
6577     + return ip6_default_np_autolabel(net);
6578     + else
6579     + return np->autoflowlabel;
6580     +}
6581     +
6582     /*
6583     * xmit an sk_buff (used by TCP, SCTP and DCCP)
6584     * Note : socket lock is not held for SYNACK packets, but might be modified
6585     @@ -230,7 +238,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
6586     hlimit = ip6_dst_hoplimit(dst);
6587    
6588     ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
6589     - np->autoflowlabel, fl6));
6590     + ip6_autoflowlabel(net, np), fl6));
6591    
6592     hdr->payload_len = htons(seg_len);
6593     hdr->nexthdr = proto;
6594     @@ -1626,7 +1634,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk,
6595    
6596     ip6_flow_hdr(hdr, v6_cork->tclass,
6597     ip6_make_flowlabel(net, skb, fl6->flowlabel,
6598     - np->autoflowlabel, fl6));
6599     + ip6_autoflowlabel(net, np), fl6));
6600     hdr->hop_limit = v6_cork->hop_limit;
6601     hdr->nexthdr = proto;
6602     hdr->saddr = fl6->saddr;
6603     diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
6604     index a1c24443cd9e..ef958d50746b 100644
6605     --- a/net/ipv6/ip6_tunnel.c
6606     +++ b/net/ipv6/ip6_tunnel.c
6607     @@ -912,7 +912,7 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto,
6608     if (t->parms.collect_md) {
6609     tun_dst = ipv6_tun_rx_dst(skb, 0, 0, 0);
6610     if (!tun_dst)
6611     - return 0;
6612     + goto drop;
6613     }
6614     ret = __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate,
6615     log_ecn_error);
6616     diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
6617     index a5e466d4e093..90dbfa78a390 100644
6618     --- a/net/ipv6/ipv6_sockglue.c
6619     +++ b/net/ipv6/ipv6_sockglue.c
6620     @@ -878,6 +878,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
6621     break;
6622     case IPV6_AUTOFLOWLABEL:
6623     np->autoflowlabel = valbool;
6624     + np->autoflowlabel_set = 1;
6625     retv = 0;
6626     break;
6627     case IPV6_RECVFRAGSIZE:
6628     diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
6629     index 12b7c27ce5ce..9a38a2c641fa 100644
6630     --- a/net/ipv6/mcast.c
6631     +++ b/net/ipv6/mcast.c
6632     @@ -1682,16 +1682,16 @@ static int grec_size(struct ifmcaddr6 *pmc, int type, int gdel, int sdel)
6633     }
6634    
6635     static struct sk_buff *add_grhead(struct sk_buff *skb, struct ifmcaddr6 *pmc,
6636     - int type, struct mld2_grec **ppgr)
6637     + int type, struct mld2_grec **ppgr, unsigned int mtu)
6638     {
6639     - struct net_device *dev = pmc->idev->dev;
6640     struct mld2_report *pmr;
6641     struct mld2_grec *pgr;
6642    
6643     - if (!skb)
6644     - skb = mld_newpack(pmc->idev, dev->mtu);
6645     - if (!skb)
6646     - return NULL;
6647     + if (!skb) {
6648     + skb = mld_newpack(pmc->idev, mtu);
6649     + if (!skb)
6650     + return NULL;
6651     + }
6652     pgr = skb_put(skb, sizeof(struct mld2_grec));
6653     pgr->grec_type = type;
6654     pgr->grec_auxwords = 0;
6655     @@ -1714,10 +1714,15 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc,
6656     struct mld2_grec *pgr = NULL;
6657     struct ip6_sf_list *psf, *psf_next, *psf_prev, **psf_list;
6658     int scount, stotal, first, isquery, truncate;
6659     + unsigned int mtu;
6660    
6661     if (pmc->mca_flags & MAF_NOREPORT)
6662     return skb;
6663    
6664     + mtu = READ_ONCE(dev->mtu);
6665     + if (mtu < IPV6_MIN_MTU)
6666     + return skb;
6667     +
6668     isquery = type == MLD2_MODE_IS_INCLUDE ||
6669     type == MLD2_MODE_IS_EXCLUDE;
6670     truncate = type == MLD2_MODE_IS_EXCLUDE ||
6671     @@ -1738,7 +1743,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc,
6672     AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) {
6673     if (skb)
6674     mld_sendpack(skb);
6675     - skb = mld_newpack(idev, dev->mtu);
6676     + skb = mld_newpack(idev, mtu);
6677     }
6678     }
6679     first = 1;
6680     @@ -1774,12 +1779,12 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc,
6681     pgr->grec_nsrcs = htons(scount);
6682     if (skb)
6683     mld_sendpack(skb);
6684     - skb = mld_newpack(idev, dev->mtu);
6685     + skb = mld_newpack(idev, mtu);
6686     first = 1;
6687     scount = 0;
6688     }
6689     if (first) {
6690     - skb = add_grhead(skb, pmc, type, &pgr);
6691     + skb = add_grhead(skb, pmc, type, &pgr, mtu);
6692     first = 0;
6693     }
6694     if (!skb)
6695     @@ -1814,7 +1819,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc,
6696     mld_sendpack(skb);
6697     skb = NULL; /* add_grhead will get a new one */
6698     }
6699     - skb = add_grhead(skb, pmc, type, &pgr);
6700     + skb = add_grhead(skb, pmc, type, &pgr, mtu);
6701     }
6702     }
6703     if (pgr)
6704     diff --git a/net/ipv6/route.c b/net/ipv6/route.c
6705     index 598efa8cfe25..ca8d3266e92e 100644
6706     --- a/net/ipv6/route.c
6707     +++ b/net/ipv6/route.c
6708     @@ -3700,19 +3700,13 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
6709     if (!ipv6_addr_any(&fl6.saddr))
6710     flags |= RT6_LOOKUP_F_HAS_SADDR;
6711    
6712     - if (!fibmatch)
6713     - dst = ip6_route_input_lookup(net, dev, &fl6, flags);
6714     - else
6715     - dst = ip6_route_lookup(net, &fl6, 0);
6716     + dst = ip6_route_input_lookup(net, dev, &fl6, flags);
6717    
6718     rcu_read_unlock();
6719     } else {
6720     fl6.flowi6_oif = oif;
6721    
6722     - if (!fibmatch)
6723     - dst = ip6_route_output(net, NULL, &fl6);
6724     - else
6725     - dst = ip6_route_lookup(net, &fl6, 0);
6726     + dst = ip6_route_output(net, NULL, &fl6);
6727     }
6728    
6729    
6730     @@ -3729,6 +3723,15 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
6731     goto errout;
6732     }
6733    
6734     + if (fibmatch && rt->dst.from) {
6735     + struct rt6_info *ort = container_of(rt->dst.from,
6736     + struct rt6_info, dst);
6737     +
6738     + dst_hold(&ort->dst);
6739     + ip6_rt_put(rt);
6740     + rt = ort;
6741     + }
6742     +
6743     skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
6744     if (!skb) {
6745     ip6_rt_put(rt);
6746     diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
6747     index 32ded300633d..237cc6187c5a 100644
6748     --- a/net/ipv6/tcp_ipv6.c
6749     +++ b/net/ipv6/tcp_ipv6.c
6750     @@ -988,7 +988,7 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
6751     req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
6752     tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
6753     req->ts_recent, sk->sk_bound_dev_if,
6754     - tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr),
6755     + tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr),
6756     0, 0);
6757     }
6758    
6759     diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
6760     index 15c99dfa3d72..aac9d68b4636 100644
6761     --- a/net/netlink/af_netlink.c
6762     +++ b/net/netlink/af_netlink.c
6763     @@ -254,6 +254,9 @@ static int __netlink_deliver_tap_skb(struct sk_buff *skb,
6764     struct sock *sk = skb->sk;
6765     int ret = -ENOMEM;
6766    
6767     + if (!net_eq(dev_net(dev), sock_net(sk)))
6768     + return 0;
6769     +
6770     dev_hold(dev);
6771    
6772     if (is_vmalloc_addr(skb->head))
6773     diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
6774     index cfb652a4e007..dbe1079a1651 100644
6775     --- a/net/openvswitch/flow.c
6776     +++ b/net/openvswitch/flow.c
6777     @@ -532,6 +532,7 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
6778     return -EINVAL;
6779    
6780     skb_reset_network_header(skb);
6781     + key->eth.type = skb->protocol;
6782     } else {
6783     eth = eth_hdr(skb);
6784     ether_addr_copy(key->eth.src, eth->h_source);
6785     @@ -545,15 +546,23 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
6786     if (unlikely(parse_vlan(skb, key)))
6787     return -ENOMEM;
6788    
6789     - skb->protocol = parse_ethertype(skb);
6790     - if (unlikely(skb->protocol == htons(0)))
6791     + key->eth.type = parse_ethertype(skb);
6792     + if (unlikely(key->eth.type == htons(0)))
6793     return -ENOMEM;
6794    
6795     + /* Multiple tagged packets need to retain TPID to satisfy
6796     + * skb_vlan_pop(), which will later shift the ethertype into
6797     + * skb->protocol.
6798     + */
6799     + if (key->eth.cvlan.tci & htons(VLAN_TAG_PRESENT))
6800     + skb->protocol = key->eth.cvlan.tpid;
6801     + else
6802     + skb->protocol = key->eth.type;
6803     +
6804     skb_reset_network_header(skb);
6805     __skb_push(skb, skb->data - skb_mac_header(skb));
6806     }
6807     skb_reset_mac_len(skb);
6808     - key->eth.type = skb->protocol;
6809    
6810     /* Network layer. */
6811     if (key->eth.type == htons(ETH_P_IP)) {
6812     diff --git a/net/rds/send.c b/net/rds/send.c
6813     index b52cdc8ae428..f72466c63f0c 100644
6814     --- a/net/rds/send.c
6815     +++ b/net/rds/send.c
6816     @@ -1009,6 +1009,9 @@ static int rds_rdma_bytes(struct msghdr *msg, size_t *rdma_bytes)
6817     continue;
6818    
6819     if (cmsg->cmsg_type == RDS_CMSG_RDMA_ARGS) {
6820     + if (cmsg->cmsg_len <
6821     + CMSG_LEN(sizeof(struct rds_rdma_args)))
6822     + return -EINVAL;
6823     args = CMSG_DATA(cmsg);
6824     *rdma_bytes += args->remote_vec.bytes;
6825     }
6826     diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
6827     index 44de4ee51ce9..a08a32fa0949 100644
6828     --- a/net/sched/sch_ingress.c
6829     +++ b/net/sched/sch_ingress.c
6830     @@ -59,11 +59,12 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt)
6831     struct net_device *dev = qdisc_dev(sch);
6832     int err;
6833    
6834     + net_inc_ingress_queue();
6835     +
6836     err = tcf_block_get(&q->block, &dev->ingress_cl_list);
6837     if (err)
6838     return err;
6839    
6840     - net_inc_ingress_queue();
6841     sch->flags |= TCQ_F_CPUSTATS;
6842    
6843     return 0;
6844     @@ -153,6 +154,9 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt)
6845     struct net_device *dev = qdisc_dev(sch);
6846     int err;
6847    
6848     + net_inc_ingress_queue();
6849     + net_inc_egress_queue();
6850     +
6851     err = tcf_block_get(&q->ingress_block, &dev->ingress_cl_list);
6852     if (err)
6853     return err;
6854     @@ -161,9 +165,6 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt)
6855     if (err)
6856     return err;
6857    
6858     - net_inc_ingress_queue();
6859     - net_inc_egress_queue();
6860     -
6861     sch->flags |= TCQ_F_CPUSTATS;
6862    
6863     return 0;
6864     diff --git a/net/sctp/socket.c b/net/sctp/socket.c
6865     index d6163f7aefb1..df806b8819aa 100644
6866     --- a/net/sctp/socket.c
6867     +++ b/net/sctp/socket.c
6868     @@ -3874,13 +3874,17 @@ static int sctp_setsockopt_reset_streams(struct sock *sk,
6869     struct sctp_association *asoc;
6870     int retval = -EINVAL;
6871    
6872     - if (optlen < sizeof(struct sctp_reset_streams))
6873     + if (optlen < sizeof(*params))
6874     return -EINVAL;
6875    
6876     params = memdup_user(optval, optlen);
6877     if (IS_ERR(params))
6878     return PTR_ERR(params);
6879    
6880     + if (params->srs_number_streams * sizeof(__u16) >
6881     + optlen - sizeof(*params))
6882     + goto out;
6883     +
6884     asoc = sctp_id2assoc(sk, params->srs_assoc_id);
6885     if (!asoc)
6886     goto out;
6887     @@ -4413,7 +4417,7 @@ static int sctp_init_sock(struct sock *sk)
6888     SCTP_DBG_OBJCNT_INC(sock);
6889    
6890     local_bh_disable();
6891     - percpu_counter_inc(&sctp_sockets_allocated);
6892     + sk_sockets_allocated_inc(sk);
6893     sock_prot_inuse_add(net, sk->sk_prot, 1);
6894    
6895     /* Nothing can fail after this block, otherwise
6896     @@ -4457,7 +4461,7 @@ static void sctp_destroy_sock(struct sock *sk)
6897     }
6898     sctp_endpoint_free(sp->ep);
6899     local_bh_disable();
6900     - percpu_counter_dec(&sctp_sockets_allocated);
6901     + sk_sockets_allocated_dec(sk);
6902     sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
6903     local_bh_enable();
6904     }
6905     diff --git a/net/tipc/socket.c b/net/tipc/socket.c
6906     index d50edd6e0019..98a44ecb11e7 100644
6907     --- a/net/tipc/socket.c
6908     +++ b/net/tipc/socket.c
6909     @@ -709,11 +709,11 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock,
6910    
6911     switch (sk->sk_state) {
6912     case TIPC_ESTABLISHED:
6913     + case TIPC_CONNECTING:
6914     if (!tsk->cong_link_cnt && !tsk_conn_cong(tsk))
6915     mask |= POLLOUT;
6916     /* fall thru' */
6917     case TIPC_LISTEN:
6918     - case TIPC_CONNECTING:
6919     if (!skb_queue_empty(&sk->sk_receive_queue))
6920     mask |= (POLLIN | POLLRDNORM);
6921     break;
6922     diff --git a/security/Kconfig b/security/Kconfig
6923     index e8e449444e65..6614b9312b45 100644
6924     --- a/security/Kconfig
6925     +++ b/security/Kconfig
6926     @@ -54,6 +54,17 @@ config SECURITY_NETWORK
6927     implement socket and networking access controls.
6928     If you are unsure how to answer this question, answer N.
6929    
6930     +config PAGE_TABLE_ISOLATION
6931     + bool "Remove the kernel mapping in user mode"
6932     + depends on X86_64 && !UML
6933     + default y
6934     + help
6935     + This feature reduces the number of hardware side channels by
6936     + ensuring that the majority of kernel addresses are not mapped
6937     + into userspace.
6938     +
6939     + See Documentation/x86/pagetable-isolation.txt for more details.
6940     +
6941     config SECURITY_INFINIBAND
6942     bool "Infiniband Security Hooks"
6943     depends on SECURITY && INFINIBAND
6944     diff --git a/sound/hda/hdac_i915.c b/sound/hda/hdac_i915.c
6945     index 038a180d3f81..cbe818eda336 100644
6946     --- a/sound/hda/hdac_i915.c
6947     +++ b/sound/hda/hdac_i915.c
6948     @@ -325,7 +325,7 @@ static int hdac_component_master_match(struct device *dev, void *data)
6949     */
6950     int snd_hdac_i915_register_notifier(const struct i915_audio_component_audio_ops *aops)
6951     {
6952     - if (WARN_ON(!hdac_acomp))
6953     + if (!hdac_acomp)
6954     return -ENODEV;
6955    
6956     hdac_acomp->audio_ops = aops;
6957     diff --git a/sound/pci/hda/patch_conexant.c b/sound/pci/hda/patch_conexant.c
6958     index a81aacf684b2..37e1cf8218ff 100644
6959     --- a/sound/pci/hda/patch_conexant.c
6960     +++ b/sound/pci/hda/patch_conexant.c
6961     @@ -271,6 +271,8 @@ enum {
6962     CXT_FIXUP_HP_SPECTRE,
6963     CXT_FIXUP_HP_GATE_MIC,
6964     CXT_FIXUP_MUTE_LED_GPIO,
6965     + CXT_FIXUP_HEADSET_MIC,
6966     + CXT_FIXUP_HP_MIC_NO_PRESENCE,
6967     };
6968    
6969     /* for hda_fixup_thinkpad_acpi() */
6970     @@ -350,6 +352,18 @@ static void cxt_fixup_headphone_mic(struct hda_codec *codec,
6971     }
6972     }
6973    
6974     +static void cxt_fixup_headset_mic(struct hda_codec *codec,
6975     + const struct hda_fixup *fix, int action)
6976     +{
6977     + struct conexant_spec *spec = codec->spec;
6978     +
6979     + switch (action) {
6980     + case HDA_FIXUP_ACT_PRE_PROBE:
6981     + spec->parse_flags |= HDA_PINCFG_HEADSET_MIC;
6982     + break;
6983     + }
6984     +}
6985     +
6986     /* OPLC XO 1.5 fixup */
6987    
6988     /* OLPC XO-1.5 supports DC input mode (e.g. for use with analog sensors)
6989     @@ -880,6 +894,19 @@ static const struct hda_fixup cxt_fixups[] = {
6990     .type = HDA_FIXUP_FUNC,
6991     .v.func = cxt_fixup_mute_led_gpio,
6992     },
6993     + [CXT_FIXUP_HEADSET_MIC] = {
6994     + .type = HDA_FIXUP_FUNC,
6995     + .v.func = cxt_fixup_headset_mic,
6996     + },
6997     + [CXT_FIXUP_HP_MIC_NO_PRESENCE] = {
6998     + .type = HDA_FIXUP_PINS,
6999     + .v.pins = (const struct hda_pintbl[]) {
7000     + { 0x1a, 0x02a1113c },
7001     + { }
7002     + },
7003     + .chained = true,
7004     + .chain_id = CXT_FIXUP_HEADSET_MIC,
7005     + },
7006     };
7007    
7008     static const struct snd_pci_quirk cxt5045_fixups[] = {
7009     @@ -934,6 +961,8 @@ static const struct snd_pci_quirk cxt5066_fixups[] = {
7010     SND_PCI_QUIRK(0x103c, 0x8115, "HP Z1 Gen3", CXT_FIXUP_HP_GATE_MIC),
7011     SND_PCI_QUIRK(0x103c, 0x814f, "HP ZBook 15u G3", CXT_FIXUP_MUTE_LED_GPIO),
7012     SND_PCI_QUIRK(0x103c, 0x822e, "HP ProBook 440 G4", CXT_FIXUP_MUTE_LED_GPIO),
7013     + SND_PCI_QUIRK(0x103c, 0x8299, "HP 800 G3 SFF", CXT_FIXUP_HP_MIC_NO_PRESENCE),
7014     + SND_PCI_QUIRK(0x103c, 0x829a, "HP 800 G3 DM", CXT_FIXUP_HP_MIC_NO_PRESENCE),
7015     SND_PCI_QUIRK(0x1043, 0x138d, "Asus", CXT_FIXUP_HEADPHONE_MIC_PIN),
7016     SND_PCI_QUIRK(0x152d, 0x0833, "OLPC XO-1.5", CXT_FIXUP_OLPC_XO),
7017     SND_PCI_QUIRK(0x17aa, 0x20f2, "Lenovo T400", CXT_PINCFG_LENOVO_TP410),
7018     diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
7019     index 9ac4b9076ee2..acdb196ddb44 100644
7020     --- a/sound/pci/hda/patch_realtek.c
7021     +++ b/sound/pci/hda/patch_realtek.c
7022     @@ -324,8 +324,12 @@ static void alc_fill_eapd_coef(struct hda_codec *codec)
7023     case 0x10ec0292:
7024     alc_update_coef_idx(codec, 0x4, 1<<15, 0);
7025     break;
7026     - case 0x10ec0215:
7027     case 0x10ec0225:
7028     + case 0x10ec0295:
7029     + case 0x10ec0299:
7030     + alc_update_coef_idx(codec, 0x67, 0xf000, 0x3000);
7031     + /* fallthrough */
7032     + case 0x10ec0215:
7033     case 0x10ec0233:
7034     case 0x10ec0236:
7035     case 0x10ec0255:
7036     @@ -336,10 +340,8 @@ static void alc_fill_eapd_coef(struct hda_codec *codec)
7037     case 0x10ec0286:
7038     case 0x10ec0288:
7039     case 0x10ec0285:
7040     - case 0x10ec0295:
7041     case 0x10ec0298:
7042     case 0x10ec0289:
7043     - case 0x10ec0299:
7044     alc_update_coef_idx(codec, 0x10, 1<<9, 0);
7045     break;
7046     case 0x10ec0275:
7047     @@ -6305,6 +6307,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = {
7048     SND_PCI_QUIRK(0x17aa, 0x30bb, "ThinkCentre AIO", ALC233_FIXUP_LENOVO_LINE2_MIC_HOTKEY),
7049     SND_PCI_QUIRK(0x17aa, 0x30e2, "ThinkCentre AIO", ALC233_FIXUP_LENOVO_LINE2_MIC_HOTKEY),
7050     SND_PCI_QUIRK(0x17aa, 0x310c, "ThinkCentre Station", ALC294_FIXUP_LENOVO_MIC_LOCATION),
7051     + SND_PCI_QUIRK(0x17aa, 0x313c, "ThinkCentre Station", ALC294_FIXUP_LENOVO_MIC_LOCATION),
7052     SND_PCI_QUIRK(0x17aa, 0x3112, "ThinkCentre AIO", ALC233_FIXUP_LENOVO_LINE2_MIC_HOTKEY),
7053     SND_PCI_QUIRK(0x17aa, 0x3902, "Lenovo E50-80", ALC269_FIXUP_DMIC_THINKPAD_ACPI),
7054     SND_PCI_QUIRK(0x17aa, 0x3977, "IdeaPad S210", ALC283_FIXUP_INT_MIC),
7055     @@ -6557,6 +6560,11 @@ static const struct snd_hda_pin_quirk alc269_pin_fixup_tbl[] = {
7056     SND_HDA_PIN_QUIRK(0x10ec0255, 0x1028, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE,
7057     {0x1b, 0x01011020},
7058     {0x21, 0x02211010}),
7059     + SND_HDA_PIN_QUIRK(0x10ec0256, 0x1028, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE,
7060     + {0x12, 0x90a60130},
7061     + {0x14, 0x90170110},
7062     + {0x1b, 0x01011020},
7063     + {0x21, 0x0221101f}),
7064     SND_HDA_PIN_QUIRK(0x10ec0256, 0x1028, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE,
7065     {0x12, 0x90a60160},
7066     {0x14, 0x90170120},
7067     diff --git a/sound/soc/codecs/da7218.c b/sound/soc/codecs/da7218.c
7068     index b2d42ec1dcd9..56564ce90cb6 100644
7069     --- a/sound/soc/codecs/da7218.c
7070     +++ b/sound/soc/codecs/da7218.c
7071     @@ -2520,7 +2520,7 @@ static struct da7218_pdata *da7218_of_to_pdata(struct snd_soc_codec *codec)
7072     }
7073    
7074     if (da7218->dev_id == DA7218_DEV_ID) {
7075     - hpldet_np = of_find_node_by_name(np, "da7218_hpldet");
7076     + hpldet_np = of_get_child_by_name(np, "da7218_hpldet");
7077     if (!hpldet_np)
7078     return pdata;
7079    
7080     diff --git a/sound/soc/codecs/msm8916-wcd-analog.c b/sound/soc/codecs/msm8916-wcd-analog.c
7081     index 18933bf6473f..8c7063e1aa46 100644
7082     --- a/sound/soc/codecs/msm8916-wcd-analog.c
7083     +++ b/sound/soc/codecs/msm8916-wcd-analog.c
7084     @@ -267,7 +267,7 @@
7085     #define MSM8916_WCD_ANALOG_RATES (SNDRV_PCM_RATE_8000 | SNDRV_PCM_RATE_16000 |\
7086     SNDRV_PCM_RATE_32000 | SNDRV_PCM_RATE_48000)
7087     #define MSM8916_WCD_ANALOG_FORMATS (SNDRV_PCM_FMTBIT_S16_LE |\
7088     - SNDRV_PCM_FMTBIT_S24_LE)
7089     + SNDRV_PCM_FMTBIT_S32_LE)
7090    
7091     static int btn_mask = SND_JACK_BTN_0 | SND_JACK_BTN_1 |
7092     SND_JACK_BTN_2 | SND_JACK_BTN_3 | SND_JACK_BTN_4;
7093     diff --git a/sound/soc/codecs/msm8916-wcd-digital.c b/sound/soc/codecs/msm8916-wcd-digital.c
7094     index 66df8f810f0d..694db27b11fa 100644
7095     --- a/sound/soc/codecs/msm8916-wcd-digital.c
7096     +++ b/sound/soc/codecs/msm8916-wcd-digital.c
7097     @@ -194,7 +194,7 @@
7098     SNDRV_PCM_RATE_32000 | \
7099     SNDRV_PCM_RATE_48000)
7100     #define MSM8916_WCD_DIGITAL_FORMATS (SNDRV_PCM_FMTBIT_S16_LE |\
7101     - SNDRV_PCM_FMTBIT_S24_LE)
7102     + SNDRV_PCM_FMTBIT_S32_LE)
7103    
7104     struct msm8916_wcd_digital_priv {
7105     struct clk *ahbclk, *mclk;
7106     @@ -645,7 +645,7 @@ static int msm8916_wcd_digital_hw_params(struct snd_pcm_substream *substream,
7107     RX_I2S_CTL_RX_I2S_MODE_MASK,
7108     RX_I2S_CTL_RX_I2S_MODE_16);
7109     break;
7110     - case SNDRV_PCM_FORMAT_S24_LE:
7111     + case SNDRV_PCM_FORMAT_S32_LE:
7112     snd_soc_update_bits(dai->codec, LPASS_CDC_CLK_TX_I2S_CTL,
7113     TX_I2S_CTL_TX_I2S_MODE_MASK,
7114     TX_I2S_CTL_TX_I2S_MODE_32);
7115     diff --git a/sound/soc/codecs/tlv320aic31xx.h b/sound/soc/codecs/tlv320aic31xx.h
7116     index 730fb2058869..1ff3edb7bbb6 100644
7117     --- a/sound/soc/codecs/tlv320aic31xx.h
7118     +++ b/sound/soc/codecs/tlv320aic31xx.h
7119     @@ -116,7 +116,7 @@ struct aic31xx_pdata {
7120     /* INT2 interrupt control */
7121     #define AIC31XX_INT2CTRL AIC31XX_REG(0, 49)
7122     /* GPIO1 control */
7123     -#define AIC31XX_GPIO1 AIC31XX_REG(0, 50)
7124     +#define AIC31XX_GPIO1 AIC31XX_REG(0, 51)
7125    
7126     #define AIC31XX_DACPRB AIC31XX_REG(0, 60)
7127     /* ADC Instruction Set Register */
7128     diff --git a/sound/soc/codecs/twl4030.c b/sound/soc/codecs/twl4030.c
7129     index c482b2e7a7d2..cfe72b9d4356 100644
7130     --- a/sound/soc/codecs/twl4030.c
7131     +++ b/sound/soc/codecs/twl4030.c
7132     @@ -232,7 +232,7 @@ static struct twl4030_codec_data *twl4030_get_pdata(struct snd_soc_codec *codec)
7133     struct twl4030_codec_data *pdata = dev_get_platdata(codec->dev);
7134     struct device_node *twl4030_codec_node = NULL;
7135    
7136     - twl4030_codec_node = of_find_node_by_name(codec->dev->parent->of_node,
7137     + twl4030_codec_node = of_get_child_by_name(codec->dev->parent->of_node,
7138     "codec");
7139    
7140     if (!pdata && twl4030_codec_node) {
7141     @@ -241,9 +241,11 @@ static struct twl4030_codec_data *twl4030_get_pdata(struct snd_soc_codec *codec)
7142     GFP_KERNEL);
7143     if (!pdata) {
7144     dev_err(codec->dev, "Can not allocate memory\n");
7145     + of_node_put(twl4030_codec_node);
7146     return NULL;
7147     }
7148     twl4030_setup_pdata_of(pdata, twl4030_codec_node);
7149     + of_node_put(twl4030_codec_node);
7150     }
7151    
7152     return pdata;
7153     diff --git a/sound/soc/codecs/wm_adsp.c b/sound/soc/codecs/wm_adsp.c
7154     index 65c059b5ffd7..66e32f5d2917 100644
7155     --- a/sound/soc/codecs/wm_adsp.c
7156     +++ b/sound/soc/codecs/wm_adsp.c
7157     @@ -1733,7 +1733,7 @@ static int wm_adsp_load(struct wm_adsp *dsp)
7158     le64_to_cpu(footer->timestamp));
7159    
7160     while (pos < firmware->size &&
7161     - pos - firmware->size > sizeof(*region)) {
7162     + sizeof(*region) < firmware->size - pos) {
7163     region = (void *)&(firmware->data[pos]);
7164     region_name = "Unknown";
7165     reg = 0;
7166     @@ -1782,8 +1782,8 @@ static int wm_adsp_load(struct wm_adsp *dsp)
7167     regions, le32_to_cpu(region->len), offset,
7168     region_name);
7169    
7170     - if ((pos + le32_to_cpu(region->len) + sizeof(*region)) >
7171     - firmware->size) {
7172     + if (le32_to_cpu(region->len) >
7173     + firmware->size - pos - sizeof(*region)) {
7174     adsp_err(dsp,
7175     "%s.%d: %s region len %d bytes exceeds file length %zu\n",
7176     file, regions, region_name,
7177     @@ -2253,7 +2253,7 @@ static int wm_adsp_load_coeff(struct wm_adsp *dsp)
7178    
7179     blocks = 0;
7180     while (pos < firmware->size &&
7181     - pos - firmware->size > sizeof(*blk)) {
7182     + sizeof(*blk) < firmware->size - pos) {
7183     blk = (void *)(&firmware->data[pos]);
7184    
7185     type = le16_to_cpu(blk->type);
7186     @@ -2327,8 +2327,8 @@ static int wm_adsp_load_coeff(struct wm_adsp *dsp)
7187     }
7188    
7189     if (reg) {
7190     - if ((pos + le32_to_cpu(blk->len) + sizeof(*blk)) >
7191     - firmware->size) {
7192     + if (le32_to_cpu(blk->len) >
7193     + firmware->size - pos - sizeof(*blk)) {
7194     adsp_err(dsp,
7195     "%s.%d: %s region len %d bytes exceeds file length %zu\n",
7196     file, blocks, region_name,
7197     diff --git a/sound/soc/fsl/fsl_ssi.c b/sound/soc/fsl/fsl_ssi.c
7198     index 64598d1183f8..3ffbb498cc70 100644
7199     --- a/sound/soc/fsl/fsl_ssi.c
7200     +++ b/sound/soc/fsl/fsl_ssi.c
7201     @@ -1452,12 +1452,6 @@ static int fsl_ssi_probe(struct platform_device *pdev)
7202     sizeof(fsl_ssi_ac97_dai));
7203    
7204     fsl_ac97_data = ssi_private;
7205     -
7206     - ret = snd_soc_set_ac97_ops_of_reset(&fsl_ssi_ac97_ops, pdev);
7207     - if (ret) {
7208     - dev_err(&pdev->dev, "could not set AC'97 ops\n");
7209     - return ret;
7210     - }
7211     } else {
7212     /* Initialize this copy of the CPU DAI driver structure */
7213     memcpy(&ssi_private->cpu_dai_drv, &fsl_ssi_dai_template,
7214     @@ -1568,6 +1562,14 @@ static int fsl_ssi_probe(struct platform_device *pdev)
7215     return ret;
7216     }
7217    
7218     + if (fsl_ssi_is_ac97(ssi_private)) {
7219     + ret = snd_soc_set_ac97_ops_of_reset(&fsl_ssi_ac97_ops, pdev);
7220     + if (ret) {
7221     + dev_err(&pdev->dev, "could not set AC'97 ops\n");
7222     + goto error_ac97_ops;
7223     + }
7224     + }
7225     +
7226     ret = devm_snd_soc_register_component(&pdev->dev, &fsl_ssi_component,
7227     &ssi_private->cpu_dai_drv, 1);
7228     if (ret) {
7229     @@ -1651,6 +1653,10 @@ static int fsl_ssi_probe(struct platform_device *pdev)
7230     fsl_ssi_debugfs_remove(&ssi_private->dbg_stats);
7231    
7232     error_asoc_register:
7233     + if (fsl_ssi_is_ac97(ssi_private))
7234     + snd_soc_set_ac97_ops(NULL);
7235     +
7236     +error_ac97_ops:
7237     if (ssi_private->soc->imx)
7238     fsl_ssi_imx_clean(pdev, ssi_private);
7239    
7240     diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c
7241     index 0304ffb714f2..1aef72df20a1 100644
7242     --- a/tools/testing/selftests/x86/ldt_gdt.c
7243     +++ b/tools/testing/selftests/x86/ldt_gdt.c
7244     @@ -122,8 +122,7 @@ static void check_valid_segment(uint16_t index, int ldt,
7245     * NB: Different Linux versions do different things with the
7246     * accessed bit in set_thread_area().
7247     */
7248     - if (ar != expected_ar &&
7249     - (ldt || ar != (expected_ar | AR_ACCESSED))) {
7250     + if (ar != expected_ar && ar != (expected_ar | AR_ACCESSED)) {
7251     printf("[FAIL]\t%s entry %hu has AR 0x%08X but expected 0x%08X\n",
7252     (ldt ? "LDT" : "GDT"), index, ar, expected_ar);
7253     nerrs++;
7254     diff --git a/tools/usb/usbip/src/utils.c b/tools/usb/usbip/src/utils.c
7255     index 2b3d6d235015..3d7b42e77299 100644
7256     --- a/tools/usb/usbip/src/utils.c
7257     +++ b/tools/usb/usbip/src/utils.c
7258     @@ -30,6 +30,7 @@ int modify_match_busid(char *busid, int add)
7259     char command[SYSFS_BUS_ID_SIZE + 4];
7260     char match_busid_attr_path[SYSFS_PATH_MAX];
7261     int rc;
7262     + int cmd_size;
7263    
7264     snprintf(match_busid_attr_path, sizeof(match_busid_attr_path),
7265     "%s/%s/%s/%s/%s/%s", SYSFS_MNT_PATH, SYSFS_BUS_NAME,
7266     @@ -37,12 +38,14 @@ int modify_match_busid(char *busid, int add)
7267     attr_name);
7268    
7269     if (add)
7270     - snprintf(command, SYSFS_BUS_ID_SIZE + 4, "add %s", busid);
7271     + cmd_size = snprintf(command, SYSFS_BUS_ID_SIZE + 4, "add %s",
7272     + busid);
7273     else
7274     - snprintf(command, SYSFS_BUS_ID_SIZE + 4, "del %s", busid);
7275     + cmd_size = snprintf(command, SYSFS_BUS_ID_SIZE + 4, "del %s",
7276     + busid);
7277    
7278     rc = write_sysfs_attribute(match_busid_attr_path, command,
7279     - sizeof(command));
7280     + cmd_size);
7281     if (rc < 0) {
7282     dbg("failed to write match_busid: %s", strerror(errno));
7283     return -1;