Magellan Linux

Contents of /trunk/kernel-magellan/patches-4.14/0110-4.14.11-all-fixes.patch



Revision 3071
Wed Jan 17 13:27:01 2018 UTC by niro
File size: 234762 byte(s)
-linux-4.14.11
1 diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
2 index 05496622b4ef..520fdec15bbb 100644
3 --- a/Documentation/admin-guide/kernel-parameters.txt
4 +++ b/Documentation/admin-guide/kernel-parameters.txt
5 @@ -2685,6 +2685,8 @@
6 steal time is computed, but won't influence scheduler
7 behaviour
8
9 + nopti [X86-64] Disable kernel page table isolation
10 +
11 nolapic [X86-32,APIC] Do not enable or use the local APIC.
12
13 nolapic_timer [X86-32,APIC] Do not use the local APIC timer.
14 @@ -3253,6 +3255,12 @@
15 pt. [PARIDE]
16 See Documentation/blockdev/paride.txt.
17
18 + pti= [X86_64]
19 + Control user/kernel address space isolation:
20 + on - enable
21 + off - disable
22 + auto - default setting
23 +
24 pty.legacy_count=
25 [KNL] Number of legacy pty's. Overwrites compiled-in
26 default number.
27 diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
28 index 51101708a03a..ad41b3813f0a 100644
29 --- a/Documentation/x86/x86_64/mm.txt
30 +++ b/Documentation/x86/x86_64/mm.txt
31 @@ -12,6 +12,7 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
32 ... unused hole ...
33 ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
34 ... unused hole ...
35 +fffffe0000000000 - fffffe7fffffffff (=39 bits) LDT remap for PTI
36 fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
37 ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
38 ... unused hole ...
39 @@ -29,8 +30,8 @@ Virtual memory map with 5 level page tables:
40 hole caused by [56:63] sign extension
41 ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor
42 ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory
43 -ff90000000000000 - ff91ffffffffffff (=49 bits) hole
44 -ff92000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space
45 +ff90000000000000 - ff9fffffffffffff (=52 bits) LDT remap for PTI
46 +ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB)
47 ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
48 ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
49 ... unused hole ...
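Note on the new "(12800 TB)" figure quoted in the hunk above: the 5-level vmalloc/ioremap area now starts at ffa0000000000000 (the old start plus the new LDT remap region) and still ends where the following hole begins at ffd2000000000000. A quick standalone arithmetic check (a sketch, not kernel code):

#include <stdio.h>

int main(void)
{
	unsigned long long vmalloc_base = 0xffa0000000000000ULL; /* new __VMALLOC_BASE */
	unsigned long long hole_start   = 0xffd2000000000000ULL; /* first address past vmalloc/ioremap */

	printf("vmalloc/ioremap span: %llu TB\n", (hole_start - vmalloc_base) >> 40); /* prints 12800 */
	return 0;
}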
50 diff --git a/Makefile b/Makefile
51 index 9edfb78836a9..655887067dc7 100644
52 --- a/Makefile
53 +++ b/Makefile
54 @@ -1,7 +1,7 @@
55 # SPDX-License-Identifier: GPL-2.0
56 VERSION = 4
57 PATCHLEVEL = 14
58 -SUBLEVEL = 10
59 +SUBLEVEL = 11
60 EXTRAVERSION =
61 NAME = Petit Gorille
62
63 @@ -802,6 +802,9 @@ KBUILD_CFLAGS += $(call cc-disable-warning, pointer-sign)
64 # disable invalid "can't wrap" optimizations for signed / pointers
65 KBUILD_CFLAGS += $(call cc-option,-fno-strict-overflow)
66
67 +# Make sure -fstack-check isn't enabled (like gentoo apparently did)
68 +KBUILD_CFLAGS += $(call cc-option,-fno-stack-check,)
69 +
70 # conserve stack if available
71 KBUILD_CFLAGS += $(call cc-option,-fconserve-stack)
72
73 diff --git a/arch/sparc/lib/hweight.S b/arch/sparc/lib/hweight.S
74 index e5547b22cd18..0ddbbb031822 100644
75 --- a/arch/sparc/lib/hweight.S
76 +++ b/arch/sparc/lib/hweight.S
77 @@ -44,8 +44,8 @@ EXPORT_SYMBOL(__arch_hweight32)
78 .previous
79
80 ENTRY(__arch_hweight64)
81 - sethi %hi(__sw_hweight16), %g1
82 - jmpl %g1 + %lo(__sw_hweight16), %g0
83 + sethi %hi(__sw_hweight64), %g1
84 + jmpl %g1 + %lo(__sw_hweight64), %g0
85 nop
86 ENDPROC(__arch_hweight64)
87 EXPORT_SYMBOL(__arch_hweight64)
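The one-line fix above matters because __arch_hweight64 was tail-calling the 16-bit software fallback. A standalone C sketch (using naive loop-based popcounts in place of the kernel's __sw_hweight helpers, so purely illustrative) shows how the result gets truncated for values with bits above bit 15:

#include <stdio.h>
#include <stdint.h>

/* Naive popcounts standing in for __sw_hweight16/__sw_hweight64. */
static unsigned int sw_hweight16(uint16_t w)
{
	unsigned int res = 0;

	for (; w; w >>= 1)
		res += w & 1;
	return res;
}

static unsigned int sw_hweight64(uint64_t w)
{
	unsigned int res = 0;

	for (; w; w >>= 1)
		res += w & 1;
	return res;
}

int main(void)
{
	uint64_t v = 0xffff0000ffff0000ULL;

	/* Correct 64-bit count vs. what the mis-directed jump effectively computed. */
	printf("hweight64: %u, hweight16 of the same value: %u\n",
	       sw_hweight64(v), sw_hweight16((uint16_t)v));	/* 32 vs 0 */
	return 0;
}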
88 diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c
89 index 972319ff5b01..e691ff734cb5 100644
90 --- a/arch/x86/boot/compressed/pagetable.c
91 +++ b/arch/x86/boot/compressed/pagetable.c
92 @@ -23,6 +23,9 @@
93 */
94 #undef CONFIG_AMD_MEM_ENCRYPT
95
96 +/* No PAGE_TABLE_ISOLATION support needed either: */
97 +#undef CONFIG_PAGE_TABLE_ISOLATION
98 +
99 #include "misc.h"
100
101 /* These actually do the work of building the kernel identity maps. */
102 diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
103 index 3fd8bc560fae..45a63e00a6af 100644
104 --- a/arch/x86/entry/calling.h
105 +++ b/arch/x86/entry/calling.h
106 @@ -1,6 +1,11 @@
107 /* SPDX-License-Identifier: GPL-2.0 */
108 #include <linux/jump_label.h>
109 #include <asm/unwind_hints.h>
110 +#include <asm/cpufeatures.h>
111 +#include <asm/page_types.h>
112 +#include <asm/percpu.h>
113 +#include <asm/asm-offsets.h>
114 +#include <asm/processor-flags.h>
115
116 /*
117
118 @@ -187,6 +192,146 @@ For 32-bit we have the following conventions - kernel is built with
119 #endif
120 .endm
121
122 +#ifdef CONFIG_PAGE_TABLE_ISOLATION
123 +
124 +/*
125 + * PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two
126 + * halves:
127 + */
128 +#define PTI_SWITCH_PGTABLES_MASK (1<<PAGE_SHIFT)
129 +#define PTI_SWITCH_MASK (PTI_SWITCH_PGTABLES_MASK|(1<<X86_CR3_PTI_SWITCH_BIT))
130 +
131 +.macro SET_NOFLUSH_BIT reg:req
132 + bts $X86_CR3_PCID_NOFLUSH_BIT, \reg
133 +.endm
134 +
135 +.macro ADJUST_KERNEL_CR3 reg:req
136 + ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID
137 + /* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
138 + andq $(~PTI_SWITCH_MASK), \reg
139 +.endm
140 +
141 +.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
142 + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
143 + mov %cr3, \scratch_reg
144 + ADJUST_KERNEL_CR3 \scratch_reg
145 + mov \scratch_reg, %cr3
146 +.Lend_\@:
147 +.endm
148 +
149 +#define THIS_CPU_user_pcid_flush_mask \
150 + PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask
151 +
152 +.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
153 + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
154 + mov %cr3, \scratch_reg
155 +
156 + ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
157 +
158 + /*
159 + * Test if the ASID needs a flush.
160 + */
161 + movq \scratch_reg, \scratch_reg2
162 + andq $(0x7FF), \scratch_reg /* mask ASID */
163 + bt \scratch_reg, THIS_CPU_user_pcid_flush_mask
164 + jnc .Lnoflush_\@
165 +
166 + /* Flush needed, clear the bit */
167 + btr \scratch_reg, THIS_CPU_user_pcid_flush_mask
168 + movq \scratch_reg2, \scratch_reg
169 + jmp .Lwrcr3_\@
170 +
171 +.Lnoflush_\@:
172 + movq \scratch_reg2, \scratch_reg
173 + SET_NOFLUSH_BIT \scratch_reg
174 +
175 +.Lwrcr3_\@:
176 + /* Flip the PGD and ASID to the user version */
177 + orq $(PTI_SWITCH_MASK), \scratch_reg
178 + mov \scratch_reg, %cr3
179 +.Lend_\@:
180 +.endm
181 +
182 +.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
183 + pushq %rax
184 + SWITCH_TO_USER_CR3_NOSTACK scratch_reg=\scratch_reg scratch_reg2=%rax
185 + popq %rax
186 +.endm
187 +
188 +.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
189 + ALTERNATIVE "jmp .Ldone_\@", "", X86_FEATURE_PTI
190 + movq %cr3, \scratch_reg
191 + movq \scratch_reg, \save_reg
192 + /*
193 + * Is the "switch mask" all zero? That means that both of
194 + * these are zero:
195 + *
196 + * 1. The user/kernel PCID bit, and
197 + * 2. The user/kernel "bit" that points CR3 to the
198 + * bottom half of the 8k PGD
199 + *
200 + * That indicates a kernel CR3 value, not a user CR3.
201 + */
202 + testq $(PTI_SWITCH_MASK), \scratch_reg
203 + jz .Ldone_\@
204 +
205 + ADJUST_KERNEL_CR3 \scratch_reg
206 + movq \scratch_reg, %cr3
207 +
208 +.Ldone_\@:
209 +.endm
210 +
211 +.macro RESTORE_CR3 scratch_reg:req save_reg:req
212 + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
213 +
214 + ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
215 +
216 + /*
217 + * KERNEL pages can always resume with NOFLUSH as we do
218 + * explicit flushes.
219 + */
220 + bt $X86_CR3_PTI_SWITCH_BIT, \save_reg
221 + jnc .Lnoflush_\@
222 +
223 + /*
224 + * Check if there's a pending flush for the user ASID we're
225 + * about to set.
226 + */
227 + movq \save_reg, \scratch_reg
228 + andq $(0x7FF), \scratch_reg
229 + bt \scratch_reg, THIS_CPU_user_pcid_flush_mask
230 + jnc .Lnoflush_\@
231 +
232 + btr \scratch_reg, THIS_CPU_user_pcid_flush_mask
233 + jmp .Lwrcr3_\@
234 +
235 +.Lnoflush_\@:
236 + SET_NOFLUSH_BIT \save_reg
237 +
238 +.Lwrcr3_\@:
239 + /*
240 + * The CR3 write could be avoided when not changing its value,
241 + * but would require a CR3 read *and* a scratch register.
242 + */
243 + movq \save_reg, %cr3
244 +.Lend_\@:
245 +.endm
246 +
247 +#else /* CONFIG_PAGE_TABLE_ISOLATION=n: */
248 +
249 +.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
250 +.endm
251 +.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
252 +.endm
253 +.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
254 +.endm
255 +.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
256 +.endm
257 +.macro RESTORE_CR3 scratch_reg:req save_reg:req
258 +.endm
259 +
260 +#endif
261 +
262 #endif /* CONFIG_X86_64 */
263
264 /*
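As a plain-C illustration of what the SWITCH_TO_KERNEL_CR3/SWITCH_TO_USER_CR3 macros above do to the CR3 value (ignoring the PCID no-flush handling), here is a minimal sketch using the same bit positions; the sample CR3 value is made up:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT		12	/* bit 12 selects the user half of the 8k PGD pair */
#define X86_CR3_PTI_SWITCH_BIT	11	/* bit 11 selects the user PCID */
#define PTI_SWITCH_MASK		((1ULL << PAGE_SHIFT) | (1ULL << X86_CR3_PTI_SWITCH_BIT))

static uint64_t switch_to_kernel_cr3(uint64_t cr3) { return cr3 & ~PTI_SWITCH_MASK; }
static uint64_t switch_to_user_cr3(uint64_t cr3)   { return cr3 |  PTI_SWITCH_MASK; }

int main(void)
{
	uint64_t kernel_cr3 = 0x000000012e85c001ULL;	/* fictitious PGD address + kernel PCID 1 */
	uint64_t user_cr3   = switch_to_user_cr3(kernel_cr3);

	printf("kernel CR3 %#llx -> user CR3 %#llx -> back to %#llx\n",
	       (unsigned long long)kernel_cr3,
	       (unsigned long long)user_cr3,
	       (unsigned long long)switch_to_kernel_cr3(user_cr3));
	return 0;
}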
265 diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
266 index 22c891c3b78d..dd696b966e58 100644
267 --- a/arch/x86/entry/entry_64.S
268 +++ b/arch/x86/entry/entry_64.S
269 @@ -23,7 +23,6 @@
270 #include <asm/segment.h>
271 #include <asm/cache.h>
272 #include <asm/errno.h>
273 -#include "calling.h"
274 #include <asm/asm-offsets.h>
275 #include <asm/msr.h>
276 #include <asm/unistd.h>
277 @@ -40,6 +39,8 @@
278 #include <asm/frame.h>
279 #include <linux/err.h>
280
281 +#include "calling.h"
282 +
283 .code64
284 .section .entry.text, "ax"
285
286 @@ -164,6 +165,9 @@ ENTRY(entry_SYSCALL_64_trampoline)
287 /* Stash the user RSP. */
288 movq %rsp, RSP_SCRATCH
289
290 + /* Note: using %rsp as a scratch reg. */
291 + SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
292 +
293 /* Load the top of the task stack into RSP */
294 movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
295
296 @@ -203,6 +207,10 @@ ENTRY(entry_SYSCALL_64)
297 */
298
299 swapgs
300 + /*
301 + * This path is not taken when PAGE_TABLE_ISOLATION is disabled so it
302 + * is not required to switch CR3.
303 + */
304 movq %rsp, PER_CPU_VAR(rsp_scratch)
305 movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
306
307 @@ -399,6 +407,7 @@ syscall_return_via_sysret:
308 * We are on the trampoline stack. All regs except RDI are live.
309 * We can do future final exit work right here.
310 */
311 + SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
312
313 popq %rdi
314 popq %rsp
315 @@ -736,6 +745,8 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
316 * We can do future final exit work right here.
317 */
318
319 + SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
320 +
321 /* Restore RDI. */
322 popq %rdi
323 SWAPGS
324 @@ -818,7 +829,9 @@ native_irq_return_ldt:
325 */
326
327 pushq %rdi /* Stash user RDI */
328 - SWAPGS
329 + SWAPGS /* to kernel GS */
330 + SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi /* to kernel CR3 */
331 +
332 movq PER_CPU_VAR(espfix_waddr), %rdi
333 movq %rax, (0*8)(%rdi) /* user RAX */
334 movq (1*8)(%rsp), %rax /* user RIP */
335 @@ -834,7 +847,6 @@ native_irq_return_ldt:
336 /* Now RAX == RSP. */
337
338 andl $0xffff0000, %eax /* RAX = (RSP & 0xffff0000) */
339 - popq %rdi /* Restore user RDI */
340
341 /*
342 * espfix_stack[31:16] == 0. The page tables are set up such that
343 @@ -845,7 +857,11 @@ native_irq_return_ldt:
344 * still points to an RO alias of the ESPFIX stack.
345 */
346 orq PER_CPU_VAR(espfix_stack), %rax
347 - SWAPGS
348 +
349 + SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
350 + SWAPGS /* to user GS */
351 + popq %rdi /* Restore user RDI */
352 +
353 movq %rax, %rsp
354 UNWIND_HINT_IRET_REGS offset=8
355
356 @@ -945,6 +961,8 @@ ENTRY(switch_to_thread_stack)
357 UNWIND_HINT_FUNC
358
359 pushq %rdi
360 + /* Need to switch before accessing the thread stack. */
361 + SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
362 movq %rsp, %rdi
363 movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
364 UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI
365 @@ -1244,7 +1262,11 @@ ENTRY(paranoid_entry)
366 js 1f /* negative -> in kernel */
367 SWAPGS
368 xorl %ebx, %ebx
369 -1: ret
370 +
371 +1:
372 + SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
373 +
374 + ret
375 END(paranoid_entry)
376
377 /*
378 @@ -1266,6 +1288,7 @@ ENTRY(paranoid_exit)
379 testl %ebx, %ebx /* swapgs needed? */
380 jnz .Lparanoid_exit_no_swapgs
381 TRACE_IRQS_IRETQ
382 + RESTORE_CR3 scratch_reg=%rbx save_reg=%r14
383 SWAPGS_UNSAFE_STACK
384 jmp .Lparanoid_exit_restore
385 .Lparanoid_exit_no_swapgs:
386 @@ -1293,6 +1316,8 @@ ENTRY(error_entry)
387 * from user mode due to an IRET fault.
388 */
389 SWAPGS
390 + /* We have user CR3. Change to kernel CR3. */
391 + SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
392
393 .Lerror_entry_from_usermode_after_swapgs:
394 /* Put us onto the real thread stack. */
395 @@ -1339,6 +1364,7 @@ ENTRY(error_entry)
396 * .Lgs_change's error handler with kernel gsbase.
397 */
398 SWAPGS
399 + SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
400 jmp .Lerror_entry_done
401
402 .Lbstep_iret:
403 @@ -1348,10 +1374,11 @@ ENTRY(error_entry)
404
405 .Lerror_bad_iret:
406 /*
407 - * We came from an IRET to user mode, so we have user gsbase.
408 - * Switch to kernel gsbase:
409 + * We came from an IRET to user mode, so we have user
410 + * gsbase and CR3. Switch to kernel gsbase and CR3:
411 */
412 SWAPGS
413 + SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
414
415 /*
416 * Pretend that the exception came from user mode: set up pt_regs
417 @@ -1383,6 +1410,10 @@ END(error_exit)
418 /*
419 * Runs on exception stack. Xen PV does not go through this path at all,
420 * so we can use real assembly here.
421 + *
422 + * Registers:
423 + * %r14: Used to save/restore the CR3 of the interrupted context
424 + * when PAGE_TABLE_ISOLATION is in use. Do not clobber.
425 */
426 ENTRY(nmi)
427 UNWIND_HINT_IRET_REGS
428 @@ -1446,6 +1477,7 @@ ENTRY(nmi)
429
430 swapgs
431 cld
432 + SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
433 movq %rsp, %rdx
434 movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
435 UNWIND_HINT_IRET_REGS base=%rdx offset=8
436 @@ -1698,6 +1730,8 @@ end_repeat_nmi:
437 movq $-1, %rsi
438 call do_nmi
439
440 + RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
441 +
442 testl %ebx, %ebx /* swapgs needed? */
443 jnz nmi_restore
444 nmi_swapgs:
445 diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
446 index 95ad40eb7eff..40f17009ec20 100644
447 --- a/arch/x86/entry/entry_64_compat.S
448 +++ b/arch/x86/entry/entry_64_compat.S
449 @@ -49,6 +49,10 @@
450 ENTRY(entry_SYSENTER_compat)
451 /* Interrupts are off on entry. */
452 SWAPGS
453 +
454 + /* We are about to clobber %rsp anyway, clobbering here is OK */
455 + SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
456 +
457 movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
458
459 /*
460 @@ -215,6 +219,12 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe)
461 pushq $0 /* pt_regs->r14 = 0 */
462 pushq $0 /* pt_regs->r15 = 0 */
463
464 + /*
465 + * We just saved %rdi so it is safe to clobber. It is not
466 + * preserved during the C calls inside TRACE_IRQS_OFF anyway.
467 + */
468 + SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
469 +
470 /*
471 * User mode is traced as though IRQs are on, and SYSENTER
472 * turned them off.
473 @@ -256,10 +266,22 @@ sysret32_from_system_call:
474 * when the system call started, which is already known to user
475 * code. We zero R8-R10 to avoid info leaks.
476 */
477 + movq RSP-ORIG_RAX(%rsp), %rsp
478 +
479 + /*
480 + * The original userspace %rsp (RSP-ORIG_RAX(%rsp)) is stored
481 + * on the process stack which is not mapped to userspace and
482 + * not readable after we SWITCH_TO_USER_CR3. Delay the CR3
483 + * switch until after the last reference to the process
484 + * stack.
485 + *
486 + * %r8/%r9 are zeroed before the sysret, thus safe to clobber.
487 + */
488 + SWITCH_TO_USER_CR3_NOSTACK scratch_reg=%r8 scratch_reg2=%r9
489 +
490 xorq %r8, %r8
491 xorq %r9, %r9
492 xorq %r10, %r10
493 - movq RSP-ORIG_RAX(%rsp), %rsp
494 swapgs
495 sysretl
496 END(entry_SYSCALL_compat)
497 diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
498 index 1faf40f2dda9..577fa8adb785 100644
499 --- a/arch/x86/entry/vsyscall/vsyscall_64.c
500 +++ b/arch/x86/entry/vsyscall/vsyscall_64.c
501 @@ -344,14 +344,14 @@ int in_gate_area_no_mm(unsigned long addr)
502 * vsyscalls but leave the page not present. If so, we skip calling
503 * this.
504 */
505 -static void __init set_vsyscall_pgtable_user_bits(void)
506 +void __init set_vsyscall_pgtable_user_bits(pgd_t *root)
507 {
508 pgd_t *pgd;
509 p4d_t *p4d;
510 pud_t *pud;
511 pmd_t *pmd;
512
513 - pgd = pgd_offset_k(VSYSCALL_ADDR);
514 + pgd = pgd_offset_pgd(root, VSYSCALL_ADDR);
515 set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
516 p4d = p4d_offset(pgd, VSYSCALL_ADDR);
517 #if CONFIG_PGTABLE_LEVELS >= 5
518 @@ -373,7 +373,7 @@ void __init map_vsyscall(void)
519 vsyscall_mode == NATIVE
520 ? PAGE_KERNEL_VSYSCALL
521 : PAGE_KERNEL_VVAR);
522 - set_vsyscall_pgtable_user_bits();
523 + set_vsyscall_pgtable_user_bits(swapper_pg_dir);
524 }
525
526 BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
527 diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
528 index 3674a4b6f8bd..8f0aace08b87 100644
529 --- a/arch/x86/events/intel/ds.c
530 +++ b/arch/x86/events/intel/ds.c
531 @@ -3,16 +3,18 @@
532 #include <linux/types.h>
533 #include <linux/slab.h>
534
535 +#include <asm/cpu_entry_area.h>
536 #include <asm/perf_event.h>
537 #include <asm/insn.h>
538
539 #include "../perf_event.h"
540
541 +/* Waste a full page so it can be mapped into the cpu_entry_area */
542 +DEFINE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store);
543 +
544 /* The size of a BTS record in bytes: */
545 #define BTS_RECORD_SIZE 24
546
547 -#define BTS_BUFFER_SIZE (PAGE_SIZE << 4)
548 -#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4)
549 #define PEBS_FIXUP_SIZE PAGE_SIZE
550
551 /*
552 @@ -279,17 +281,52 @@ void fini_debug_store_on_cpu(int cpu)
553
554 static DEFINE_PER_CPU(void *, insn_buffer);
555
556 -static int alloc_pebs_buffer(int cpu)
557 +static void ds_update_cea(void *cea, void *addr, size_t size, pgprot_t prot)
558 {
559 - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
560 + phys_addr_t pa;
561 + size_t msz = 0;
562 +
563 + pa = virt_to_phys(addr);
564 + for (; msz < size; msz += PAGE_SIZE, pa += PAGE_SIZE, cea += PAGE_SIZE)
565 + cea_set_pte(cea, pa, prot);
566 +}
567 +
568 +static void ds_clear_cea(void *cea, size_t size)
569 +{
570 + size_t msz = 0;
571 +
572 + for (; msz < size; msz += PAGE_SIZE, cea += PAGE_SIZE)
573 + cea_set_pte(cea, 0, PAGE_NONE);
574 +}
575 +
576 +static void *dsalloc_pages(size_t size, gfp_t flags, int cpu)
577 +{
578 + unsigned int order = get_order(size);
579 int node = cpu_to_node(cpu);
580 - int max;
581 - void *buffer, *ibuffer;
582 + struct page *page;
583 +
584 + page = __alloc_pages_node(node, flags | __GFP_ZERO, order);
585 + return page ? page_address(page) : NULL;
586 +}
587 +
588 +static void dsfree_pages(const void *buffer, size_t size)
589 +{
590 + if (buffer)
591 + free_pages((unsigned long)buffer, get_order(size));
592 +}
593 +
594 +static int alloc_pebs_buffer(int cpu)
595 +{
596 + struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
597 + struct debug_store *ds = hwev->ds;
598 + size_t bsiz = x86_pmu.pebs_buffer_size;
599 + int max, node = cpu_to_node(cpu);
600 + void *buffer, *ibuffer, *cea;
601
602 if (!x86_pmu.pebs)
603 return 0;
604
605 - buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
606 + buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu);
607 if (unlikely(!buffer))
608 return -ENOMEM;
609
610 @@ -300,25 +337,27 @@ static int alloc_pebs_buffer(int cpu)
611 if (x86_pmu.intel_cap.pebs_format < 2) {
612 ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
613 if (!ibuffer) {
614 - kfree(buffer);
615 + dsfree_pages(buffer, bsiz);
616 return -ENOMEM;
617 }
618 per_cpu(insn_buffer, cpu) = ibuffer;
619 }
620 -
621 - max = x86_pmu.pebs_buffer_size / x86_pmu.pebs_record_size;
622 -
623 - ds->pebs_buffer_base = (u64)(unsigned long)buffer;
624 + hwev->ds_pebs_vaddr = buffer;
625 + /* Update the cpu entry area mapping */
626 + cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
627 + ds->pebs_buffer_base = (unsigned long) cea;
628 + ds_update_cea(cea, buffer, bsiz, PAGE_KERNEL);
629 ds->pebs_index = ds->pebs_buffer_base;
630 - ds->pebs_absolute_maximum = ds->pebs_buffer_base +
631 - max * x86_pmu.pebs_record_size;
632 -
633 + max = x86_pmu.pebs_record_size * (bsiz / x86_pmu.pebs_record_size);
634 + ds->pebs_absolute_maximum = ds->pebs_buffer_base + max;
635 return 0;
636 }
637
638 static void release_pebs_buffer(int cpu)
639 {
640 - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
641 + struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
642 + struct debug_store *ds = hwev->ds;
643 + void *cea;
644
645 if (!ds || !x86_pmu.pebs)
646 return;
647 @@ -326,73 +365,70 @@ static void release_pebs_buffer(int cpu)
648 kfree(per_cpu(insn_buffer, cpu));
649 per_cpu(insn_buffer, cpu) = NULL;
650
651 - kfree((void *)(unsigned long)ds->pebs_buffer_base);
652 + /* Clear the fixmap */
653 + cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
654 + ds_clear_cea(cea, x86_pmu.pebs_buffer_size);
655 ds->pebs_buffer_base = 0;
656 + dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size);
657 + hwev->ds_pebs_vaddr = NULL;
658 }
659
660 static int alloc_bts_buffer(int cpu)
661 {
662 - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
663 - int node = cpu_to_node(cpu);
664 - int max, thresh;
665 - void *buffer;
666 + struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
667 + struct debug_store *ds = hwev->ds;
668 + void *buffer, *cea;
669 + int max;
670
671 if (!x86_pmu.bts)
672 return 0;
673
674 - buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
675 + buffer = dsalloc_pages(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, cpu);
676 if (unlikely(!buffer)) {
677 WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
678 return -ENOMEM;
679 }
680 -
681 - max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
682 - thresh = max / 16;
683 -
684 - ds->bts_buffer_base = (u64)(unsigned long)buffer;
685 + hwev->ds_bts_vaddr = buffer;
686 + /* Update the fixmap */
687 + cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
688 + ds->bts_buffer_base = (unsigned long) cea;
689 + ds_update_cea(cea, buffer, BTS_BUFFER_SIZE, PAGE_KERNEL);
690 ds->bts_index = ds->bts_buffer_base;
691 - ds->bts_absolute_maximum = ds->bts_buffer_base +
692 - max * BTS_RECORD_SIZE;
693 - ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
694 - thresh * BTS_RECORD_SIZE;
695 -
696 + max = BTS_RECORD_SIZE * (BTS_BUFFER_SIZE / BTS_RECORD_SIZE);
697 + ds->bts_absolute_maximum = ds->bts_buffer_base + max;
698 + ds->bts_interrupt_threshold = ds->bts_absolute_maximum - (max / 16);
699 return 0;
700 }
701
702 static void release_bts_buffer(int cpu)
703 {
704 - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
705 + struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
706 + struct debug_store *ds = hwev->ds;
707 + void *cea;
708
709 if (!ds || !x86_pmu.bts)
710 return;
711
712 - kfree((void *)(unsigned long)ds->bts_buffer_base);
713 + /* Clear the fixmap */
714 + cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
715 + ds_clear_cea(cea, BTS_BUFFER_SIZE);
716 ds->bts_buffer_base = 0;
717 + dsfree_pages(hwev->ds_bts_vaddr, BTS_BUFFER_SIZE);
718 + hwev->ds_bts_vaddr = NULL;
719 }
720
721 static int alloc_ds_buffer(int cpu)
722 {
723 - int node = cpu_to_node(cpu);
724 - struct debug_store *ds;
725 -
726 - ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
727 - if (unlikely(!ds))
728 - return -ENOMEM;
729 + struct debug_store *ds = &get_cpu_entry_area(cpu)->cpu_debug_store;
730
731 + memset(ds, 0, sizeof(*ds));
732 per_cpu(cpu_hw_events, cpu).ds = ds;
733 -
734 return 0;
735 }
736
737 static void release_ds_buffer(int cpu)
738 {
739 - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
740 -
741 - if (!ds)
742 - return;
743 -
744 per_cpu(cpu_hw_events, cpu).ds = NULL;
745 - kfree(ds);
746 }
747
748 void release_ds_buffers(void)
749 diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
750 index f7aaadf9331f..8e4ea143ed96 100644
751 --- a/arch/x86/events/perf_event.h
752 +++ b/arch/x86/events/perf_event.h
753 @@ -14,6 +14,8 @@
754
755 #include <linux/perf_event.h>
756
757 +#include <asm/intel_ds.h>
758 +
759 /* To enable MSR tracing please use the generic trace points. */
760
761 /*
762 @@ -77,8 +79,6 @@ struct amd_nb {
763 struct event_constraint event_constraints[X86_PMC_IDX_MAX];
764 };
765
766 -/* The maximal number of PEBS events: */
767 -#define MAX_PEBS_EVENTS 8
768 #define PEBS_COUNTER_MASK ((1ULL << MAX_PEBS_EVENTS) - 1)
769
770 /*
771 @@ -95,23 +95,6 @@ struct amd_nb {
772 PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \
773 PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER)
774
775 -/*
776 - * A debug store configuration.
777 - *
778 - * We only support architectures that use 64bit fields.
779 - */
780 -struct debug_store {
781 - u64 bts_buffer_base;
782 - u64 bts_index;
783 - u64 bts_absolute_maximum;
784 - u64 bts_interrupt_threshold;
785 - u64 pebs_buffer_base;
786 - u64 pebs_index;
787 - u64 pebs_absolute_maximum;
788 - u64 pebs_interrupt_threshold;
789 - u64 pebs_event_reset[MAX_PEBS_EVENTS];
790 -};
791 -
792 #define PEBS_REGS \
793 (PERF_REG_X86_AX | \
794 PERF_REG_X86_BX | \
795 @@ -216,6 +199,8 @@ struct cpu_hw_events {
796 * Intel DebugStore bits
797 */
798 struct debug_store *ds;
799 + void *ds_pebs_vaddr;
800 + void *ds_bts_vaddr;
801 u64 pebs_enabled;
802 int n_pebs;
803 int n_large_pebs;
804 diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h
805 index 2fbc69a0916e..4a7884b8dca5 100644
806 --- a/arch/x86/include/asm/cpu_entry_area.h
807 +++ b/arch/x86/include/asm/cpu_entry_area.h
808 @@ -5,6 +5,7 @@
809
810 #include <linux/percpu-defs.h>
811 #include <asm/processor.h>
812 +#include <asm/intel_ds.h>
813
814 /*
815 * cpu_entry_area is a percpu region that contains things needed by the CPU
816 @@ -40,6 +41,18 @@ struct cpu_entry_area {
817 */
818 char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ];
819 #endif
820 +#ifdef CONFIG_CPU_SUP_INTEL
821 + /*
822 + * Per CPU debug store for Intel performance monitoring. Wastes a
823 + * full page at the moment.
824 + */
825 + struct debug_store cpu_debug_store;
826 + /*
827 + * The actual PEBS/BTS buffers must be mapped to user space
828 + * Reserve enough fixmap PTEs.
829 + */
830 + struct debug_store_buffers cpu_debug_buffers;
831 +#endif
832 };
833
834 #define CPU_ENTRY_AREA_SIZE (sizeof(struct cpu_entry_area))
835 diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
836 index 800104c8a3ed..07cdd1715705 100644
837 --- a/arch/x86/include/asm/cpufeatures.h
838 +++ b/arch/x86/include/asm/cpufeatures.h
839 @@ -197,11 +197,12 @@
840 #define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */
841 #define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */
842 #define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */
843 +#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */
844
845 #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
846 #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
847 #define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */
848 -
849 +#define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */
850 #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */
851 #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */
852 #define X86_FEATURE_AVX512_4VNNIW ( 7*32+16) /* AVX-512 Neural Network Instructions */
853 @@ -340,5 +341,6 @@
854 #define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */
855 #define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */
856 #define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */
857 +#define X86_BUG_CPU_INSECURE X86_BUG(14) /* CPU is insecure and needs kernel page table isolation */
858
859 #endif /* _ASM_X86_CPUFEATURES_H */
860 diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
861 index bc359dd2f7f6..85e23bb7b34e 100644
862 --- a/arch/x86/include/asm/desc.h
863 +++ b/arch/x86/include/asm/desc.h
864 @@ -21,6 +21,8 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in
865
866 desc->type = (info->read_exec_only ^ 1) << 1;
867 desc->type |= info->contents << 2;
868 + /* Set the ACCESS bit so it can be mapped RO */
869 + desc->type |= 1;
870
871 desc->s = 1;
872 desc->dpl = 0x3;
873 diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
874 index c10c9128f54e..e428e16dd822 100644
875 --- a/arch/x86/include/asm/disabled-features.h
876 +++ b/arch/x86/include/asm/disabled-features.h
877 @@ -44,6 +44,12 @@
878 # define DISABLE_LA57 (1<<(X86_FEATURE_LA57 & 31))
879 #endif
880
881 +#ifdef CONFIG_PAGE_TABLE_ISOLATION
882 +# define DISABLE_PTI 0
883 +#else
884 +# define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31))
885 +#endif
886 +
887 /*
888 * Make sure to add features to the correct mask
889 */
890 @@ -54,7 +60,7 @@
891 #define DISABLED_MASK4 (DISABLE_PCID)
892 #define DISABLED_MASK5 0
893 #define DISABLED_MASK6 0
894 -#define DISABLED_MASK7 0
895 +#define DISABLED_MASK7 (DISABLE_PTI)
896 #define DISABLED_MASK8 0
897 #define DISABLED_MASK9 (DISABLE_MPX)
898 #define DISABLED_MASK10 0
899 diff --git a/arch/x86/include/asm/intel_ds.h b/arch/x86/include/asm/intel_ds.h
900 new file mode 100644
901 index 000000000000..62a9f4966b42
902 --- /dev/null
903 +++ b/arch/x86/include/asm/intel_ds.h
904 @@ -0,0 +1,36 @@
905 +#ifndef _ASM_INTEL_DS_H
906 +#define _ASM_INTEL_DS_H
907 +
908 +#include <linux/percpu-defs.h>
909 +
910 +#define BTS_BUFFER_SIZE (PAGE_SIZE << 4)
911 +#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4)
912 +
913 +/* The maximal number of PEBS events: */
914 +#define MAX_PEBS_EVENTS 8
915 +
916 +/*
917 + * A debug store configuration.
918 + *
919 + * We only support architectures that use 64bit fields.
920 + */
921 +struct debug_store {
922 + u64 bts_buffer_base;
923 + u64 bts_index;
924 + u64 bts_absolute_maximum;
925 + u64 bts_interrupt_threshold;
926 + u64 pebs_buffer_base;
927 + u64 pebs_index;
928 + u64 pebs_absolute_maximum;
929 + u64 pebs_interrupt_threshold;
930 + u64 pebs_event_reset[MAX_PEBS_EVENTS];
931 +} __aligned(PAGE_SIZE);
932 +
933 +DECLARE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store);
934 +
935 +struct debug_store_buffers {
936 + char bts_buffer[BTS_BUFFER_SIZE];
937 + char pebs_buffer[PEBS_BUFFER_SIZE];
938 +};
939 +
940 +#endif
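To see how much room the two new cpu_entry_area members reserve, a standalone size check is enough; the structures below mirror the definitions in the new intel_ds.h hunk, assuming 4 KiB pages:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE		4096
#define BTS_BUFFER_SIZE		(PAGE_SIZE << 4)
#define PEBS_BUFFER_SIZE	(PAGE_SIZE << 4)
#define MAX_PEBS_EVENTS		8

struct debug_store {
	uint64_t bts_buffer_base, bts_index, bts_absolute_maximum, bts_interrupt_threshold;
	uint64_t pebs_buffer_base, pebs_index, pebs_absolute_maximum, pebs_interrupt_threshold;
	uint64_t pebs_event_reset[MAX_PEBS_EVENTS];
} __attribute__((aligned(PAGE_SIZE)));

struct debug_store_buffers {
	char bts_buffer[BTS_BUFFER_SIZE];
	char pebs_buffer[PEBS_BUFFER_SIZE];
};

int main(void)
{
	/* One full page for the store itself (the "Wastes a full page" comment),
	 * plus 64 KiB each for the BTS and PEBS buffer aliases. */
	printf("debug_store: %zu bytes\n", sizeof(struct debug_store));		/* 4096 */
	printf("debug_store_buffers: %zu bytes\n", sizeof(struct debug_store_buffers));/* 131072 */
	return 0;
}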
941 diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
942 index 5ede7cae1d67..c931b88982a0 100644
943 --- a/arch/x86/include/asm/mmu_context.h
944 +++ b/arch/x86/include/asm/mmu_context.h
945 @@ -50,10 +50,33 @@ struct ldt_struct {
946 * call gates. On native, we could merge the ldt_struct and LDT
947 * allocations, but it's not worth trying to optimize.
948 */
949 - struct desc_struct *entries;
950 - unsigned int nr_entries;
951 + struct desc_struct *entries;
952 + unsigned int nr_entries;
953 +
954 + /*
955 + * If PTI is in use, then the entries array is not mapped while we're
956 + * in user mode. The whole array will be aliased at the address
957 + * given by ldt_slot_va(slot). We use two slots so that we can allocate
958 + * and map, and enable a new LDT without invalidating the mapping
959 + * of an older, still-in-use LDT.
960 + *
961 + * slot will be -1 if this LDT doesn't have an alias mapping.
962 + */
963 + int slot;
964 };
965
966 +/* This is a multiple of PAGE_SIZE. */
967 +#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE)
968 +
969 +static inline void *ldt_slot_va(int slot)
970 +{
971 +#ifdef CONFIG_X86_64
972 + return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot);
973 +#else
974 + BUG();
975 +#endif
976 +}
977 +
978 /*
979 * Used for LDT copy/destruction.
980 */
981 @@ -64,6 +87,7 @@ static inline void init_new_context_ldt(struct mm_struct *mm)
982 }
983 int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm);
984 void destroy_context_ldt(struct mm_struct *mm);
985 +void ldt_arch_exit_mmap(struct mm_struct *mm);
986 #else /* CONFIG_MODIFY_LDT_SYSCALL */
987 static inline void init_new_context_ldt(struct mm_struct *mm) { }
988 static inline int ldt_dup_context(struct mm_struct *oldmm,
989 @@ -71,7 +95,8 @@ static inline int ldt_dup_context(struct mm_struct *oldmm,
990 {
991 return 0;
992 }
993 -static inline void destroy_context_ldt(struct mm_struct *mm) {}
994 +static inline void destroy_context_ldt(struct mm_struct *mm) { }
995 +static inline void ldt_arch_exit_mmap(struct mm_struct *mm) { }
996 #endif
997
998 static inline void load_mm_ldt(struct mm_struct *mm)
999 @@ -96,10 +121,31 @@ static inline void load_mm_ldt(struct mm_struct *mm)
1000 * that we can see.
1001 */
1002
1003 - if (unlikely(ldt))
1004 - set_ldt(ldt->entries, ldt->nr_entries);
1005 - else
1006 + if (unlikely(ldt)) {
1007 + if (static_cpu_has(X86_FEATURE_PTI)) {
1008 + if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) {
1009 + /*
1010 + * Whoops -- either the new LDT isn't mapped
1011 + * (if slot == -1) or is mapped into a bogus
1012 + * slot (if slot > 1).
1013 + */
1014 + clear_LDT();
1015 + return;
1016 + }
1017 +
1018 + /*
1019 + * If page table isolation is enabled, ldt->entries
1020 + * will not be mapped in the userspace pagetables.
1021 + * Tell the CPU to access the LDT through the alias
1022 + * at ldt_slot_va(ldt->slot).
1023 + */
1024 + set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries);
1025 + } else {
1026 + set_ldt(ldt->entries, ldt->nr_entries);
1027 + }
1028 + } else {
1029 clear_LDT();
1030 + }
1031 #else
1032 clear_LDT();
1033 #endif
1034 @@ -194,6 +240,7 @@ static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
1035 static inline void arch_exit_mmap(struct mm_struct *mm)
1036 {
1037 paravirt_arch_exit_mmap(mm);
1038 + ldt_arch_exit_mmap(mm);
1039 }
1040
1041 #ifdef CONFIG_X86_64
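The two-slot LDT aliasing described above boils down to simple address arithmetic. A standalone sketch for the 4-level paging layout, assuming the usual LDT_ENTRIES=8192 and LDT_ENTRY_SIZE=8 values from asm/segment.h and the LDT remap base introduced elsewhere in this patch:

#include <stdio.h>

#define LDT_ENTRIES	8192				/* assumed, as in asm/segment.h */
#define LDT_ENTRY_SIZE	8
#define LDT_SLOT_STRIDE	(LDT_ENTRIES * LDT_ENTRY_SIZE)	/* 64 KiB per slot */
#define LDT_BASE_ADDR	0xfffffe0000000000UL		/* 4-level "LDT remap for PTI" base */

int main(void)
{
	int slot;

	for (slot = 0; slot < 2; slot++)
		printf("LDT slot %d is aliased at %#lx\n",
		       slot, LDT_BASE_ADDR + (unsigned long)LDT_SLOT_STRIDE * slot);
	return 0;
}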
1042 diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
1043 index 4b5e1eafada7..aff42e1da6ee 100644
1044 --- a/arch/x86/include/asm/pgalloc.h
1045 +++ b/arch/x86/include/asm/pgalloc.h
1046 @@ -30,6 +30,17 @@ static inline void paravirt_release_p4d(unsigned long pfn) {}
1047 */
1048 extern gfp_t __userpte_alloc_gfp;
1049
1050 +#ifdef CONFIG_PAGE_TABLE_ISOLATION
1051 +/*
1052 + * Instead of one PGD, we acquire two PGDs. Being order-1, it is
1053 + * both 8k in size and 8k-aligned. That lets us just flip bit 12
1054 + * in a pointer to swap between the two 4k halves.
1055 + */
1056 +#define PGD_ALLOCATION_ORDER 1
1057 +#else
1058 +#define PGD_ALLOCATION_ORDER 0
1059 +#endif
1060 +
1061 /*
1062 * Allocate and free page tables.
1063 */
1064 diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
1065 index f02de8bc1f72..211368922cad 100644
1066 --- a/arch/x86/include/asm/pgtable.h
1067 +++ b/arch/x86/include/asm/pgtable.h
1068 @@ -28,6 +28,7 @@ extern pgd_t early_top_pgt[PTRS_PER_PGD];
1069 int __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
1070
1071 void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
1072 +void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user);
1073 void ptdump_walk_pgd_level_checkwx(void);
1074
1075 #ifdef CONFIG_DEBUG_WX
1076 @@ -846,7 +847,12 @@ static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
1077
1078 static inline int p4d_bad(p4d_t p4d)
1079 {
1080 - return (p4d_flags(p4d) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0;
1081 + unsigned long ignore_flags = _KERNPG_TABLE | _PAGE_USER;
1082 +
1083 + if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
1084 + ignore_flags |= _PAGE_NX;
1085 +
1086 + return (p4d_flags(p4d) & ~ignore_flags) != 0;
1087 }
1088 #endif /* CONFIG_PGTABLE_LEVELS > 3 */
1089
1090 @@ -880,7 +886,12 @@ static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
1091
1092 static inline int pgd_bad(pgd_t pgd)
1093 {
1094 - return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
1095 + unsigned long ignore_flags = _PAGE_USER;
1096 +
1097 + if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
1098 + ignore_flags |= _PAGE_NX;
1099 +
1100 + return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
1101 }
1102
1103 static inline int pgd_none(pgd_t pgd)
1104 @@ -909,7 +920,11 @@ static inline int pgd_none(pgd_t pgd)
1105 * pgd_offset() returns a (pgd_t *)
1106 * pgd_index() is used get the offset into the pgd page's array of pgd_t's;
1107 */
1108 -#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
1109 +#define pgd_offset_pgd(pgd, address) (pgd + pgd_index((address)))
1110 +/*
1111 + * a shortcut to get a pgd_t in a given mm
1112 + */
1113 +#define pgd_offset(mm, address) pgd_offset_pgd((mm)->pgd, (address))
1114 /*
1115 * a shortcut which implies the use of the kernel's pgd, instead
1116 * of a process's
1117 @@ -1111,7 +1126,14 @@ static inline int pud_write(pud_t pud)
1118 */
1119 static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
1120 {
1121 - memcpy(dst, src, count * sizeof(pgd_t));
1122 + memcpy(dst, src, count * sizeof(pgd_t));
1123 +#ifdef CONFIG_PAGE_TABLE_ISOLATION
1124 + if (!static_cpu_has(X86_FEATURE_PTI))
1125 + return;
1126 + /* Clone the user space pgd as well */
1127 + memcpy(kernel_to_user_pgdp(dst), kernel_to_user_pgdp(src),
1128 + count * sizeof(pgd_t));
1129 +#endif
1130 }
1131
1132 #define PTE_SHIFT ilog2(PTRS_PER_PTE)
1133 diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
1134 index e9f05331e732..81462e9a34f6 100644
1135 --- a/arch/x86/include/asm/pgtable_64.h
1136 +++ b/arch/x86/include/asm/pgtable_64.h
1137 @@ -131,9 +131,97 @@ static inline pud_t native_pudp_get_and_clear(pud_t *xp)
1138 #endif
1139 }
1140
1141 +#ifdef CONFIG_PAGE_TABLE_ISOLATION
1142 +/*
1143 + * All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages
1144 + * (8k-aligned and 8k in size). The kernel one is at the beginning 4k and
1145 + * the user one is in the last 4k. To switch between them, you
1146 + * just need to flip the 12th bit in their addresses.
1147 + */
1148 +#define PTI_PGTABLE_SWITCH_BIT PAGE_SHIFT
1149 +
1150 +/*
1151 + * This generates better code than the inline assembly in
1152 + * __set_bit().
1153 + */
1154 +static inline void *ptr_set_bit(void *ptr, int bit)
1155 +{
1156 + unsigned long __ptr = (unsigned long)ptr;
1157 +
1158 + __ptr |= BIT(bit);
1159 + return (void *)__ptr;
1160 +}
1161 +static inline void *ptr_clear_bit(void *ptr, int bit)
1162 +{
1163 + unsigned long __ptr = (unsigned long)ptr;
1164 +
1165 + __ptr &= ~BIT(bit);
1166 + return (void *)__ptr;
1167 +}
1168 +
1169 +static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp)
1170 +{
1171 + return ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
1172 +}
1173 +
1174 +static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp)
1175 +{
1176 + return ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
1177 +}
1178 +
1179 +static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp)
1180 +{
1181 + return ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
1182 +}
1183 +
1184 +static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp)
1185 +{
1186 + return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
1187 +}
1188 +#endif /* CONFIG_PAGE_TABLE_ISOLATION */
1189 +
1190 +/*
1191 + * Page table pages are page-aligned. The lower half of the top
1192 + * level is used for userspace and the top half for the kernel.
1193 + *
1194 + * Returns true for parts of the PGD that map userspace and
1195 + * false for the parts that map the kernel.
1196 + */
1197 +static inline bool pgdp_maps_userspace(void *__ptr)
1198 +{
1199 + unsigned long ptr = (unsigned long)__ptr;
1200 +
1201 + return (ptr & ~PAGE_MASK) < (PAGE_SIZE / 2);
1202 +}
1203 +
1204 +#ifdef CONFIG_PAGE_TABLE_ISOLATION
1205 +pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd);
1206 +
1207 +/*
1208 + * Take a PGD location (pgdp) and a pgd value that needs to be set there.
1209 + * Populates the user and returns the resulting PGD that must be set in
1210 + * the kernel copy of the page tables.
1211 + */
1212 +static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
1213 +{
1214 + if (!static_cpu_has(X86_FEATURE_PTI))
1215 + return pgd;
1216 + return __pti_set_user_pgd(pgdp, pgd);
1217 +}
1218 +#else
1219 +static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
1220 +{
1221 + return pgd;
1222 +}
1223 +#endif
1224 +
1225 static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
1226 {
1227 +#if defined(CONFIG_PAGE_TABLE_ISOLATION) && !defined(CONFIG_X86_5LEVEL)
1228 + p4dp->pgd = pti_set_user_pgd(&p4dp->pgd, p4d.pgd);
1229 +#else
1230 *p4dp = p4d;
1231 +#endif
1232 }
1233
1234 static inline void native_p4d_clear(p4d_t *p4d)
1235 @@ -147,7 +235,11 @@ static inline void native_p4d_clear(p4d_t *p4d)
1236
1237 static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
1238 {
1239 +#ifdef CONFIG_PAGE_TABLE_ISOLATION
1240 + *pgdp = pti_set_user_pgd(pgdp, pgd);
1241 +#else
1242 *pgdp = pgd;
1243 +#endif
1244 }
1245
1246 static inline void native_pgd_clear(pgd_t *pgd)
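The pgdp_maps_userspace() helper added above only looks at the offset of a PGD entry within its 4 KiB page: the first 256 entries (the lower 2 KiB) cover the user half of the address space. A standalone sketch using a page-aligned buffer as a stand-in for a real PGD page:

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))

static int pgdp_maps_userspace(void *__ptr)
{
	unsigned long ptr = (unsigned long)__ptr;

	return (ptr & ~PAGE_MASK) < (PAGE_SIZE / 2);
}

/* Page-aligned stand-in for a PGD page. */
static char pgd_page[PAGE_SIZE] __attribute__((aligned(4096)));

int main(void)
{
	printf("entry   0 maps userspace? %d\n", pgdp_maps_userspace(&pgd_page[0]));	/* 1 */
	printf("entry 256 maps userspace? %d\n", pgdp_maps_userspace(&pgd_page[256 * 8]));/* 0 */
	return 0;
}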
1247 diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
1248 index 3d27831bc58d..b97a539bcdee 100644
1249 --- a/arch/x86/include/asm/pgtable_64_types.h
1250 +++ b/arch/x86/include/asm/pgtable_64_types.h
1251 @@ -79,13 +79,17 @@ typedef struct { pteval_t pte; } pte_t;
1252 #define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
1253
1254 #ifdef CONFIG_X86_5LEVEL
1255 -# define VMALLOC_SIZE_TB _AC(16384, UL)
1256 -# define __VMALLOC_BASE _AC(0xff92000000000000, UL)
1257 +# define VMALLOC_SIZE_TB _AC(12800, UL)
1258 +# define __VMALLOC_BASE _AC(0xffa0000000000000, UL)
1259 # define __VMEMMAP_BASE _AC(0xffd4000000000000, UL)
1260 +# define LDT_PGD_ENTRY _AC(-112, UL)
1261 +# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
1262 #else
1263 # define VMALLOC_SIZE_TB _AC(32, UL)
1264 # define __VMALLOC_BASE _AC(0xffffc90000000000, UL)
1265 # define __VMEMMAP_BASE _AC(0xffffea0000000000, UL)
1266 +# define LDT_PGD_ENTRY _AC(-4, UL)
1267 +# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
1268 #endif
1269
1270 #ifdef CONFIG_RANDOMIZE_MEMORY
1271 diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
1272 index 43212a43ee69..6a60fea90b9d 100644
1273 --- a/arch/x86/include/asm/processor-flags.h
1274 +++ b/arch/x86/include/asm/processor-flags.h
1275 @@ -38,6 +38,11 @@
1276 #define CR3_ADDR_MASK __sme_clr(0x7FFFFFFFFFFFF000ull)
1277 #define CR3_PCID_MASK 0xFFFull
1278 #define CR3_NOFLUSH BIT_ULL(63)
1279 +
1280 +#ifdef CONFIG_PAGE_TABLE_ISOLATION
1281 +# define X86_CR3_PTI_SWITCH_BIT 11
1282 +#endif
1283 +
1284 #else
1285 /*
1286 * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
1287 diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
1288 index 9e482d8b0b97..9c18da64daa9 100644
1289 --- a/arch/x86/include/asm/processor.h
1290 +++ b/arch/x86/include/asm/processor.h
1291 @@ -851,13 +851,22 @@ static inline void spin_lock_prefetch(const void *x)
1292
1293 #else
1294 /*
1295 - * User space process size. 47bits minus one guard page. The guard
1296 - * page is necessary on Intel CPUs: if a SYSCALL instruction is at
1297 - * the highest possible canonical userspace address, then that
1298 - * syscall will enter the kernel with a non-canonical return
1299 - * address, and SYSRET will explode dangerously. We avoid this
1300 - * particular problem by preventing anything from being mapped
1301 - * at the maximum canonical address.
1302 + * User space process size. This is the first address outside the user range.
1303 + * There are a few constraints that determine this:
1304 + *
1305 + * On Intel CPUs, if a SYSCALL instruction is at the highest canonical
1306 + * address, then that syscall will enter the kernel with a
1307 + * non-canonical return address, and SYSRET will explode dangerously.
1308 + * We avoid this particular problem by preventing anything executable
1309 + * from being mapped at the maximum canonical address.
1310 + *
1311 + * On AMD CPUs in the Ryzen family, there's a nasty bug in which the
1312 + * CPUs malfunction if they execute code from the highest canonical page.
1313 + * They'll speculate right off the end of the canonical space, and
1314 + * bad things happen. This is worked around in the same way as the
1315 + * Intel problem.
1316 + *
1317 + * With page table isolation enabled, we map the LDT in ... [stay tuned]
1318 */
1319 #define TASK_SIZE_MAX ((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
1320
1321 diff --git a/arch/x86/include/asm/pti.h b/arch/x86/include/asm/pti.h
1322 new file mode 100644
1323 index 000000000000..0b5ef05b2d2d
1324 --- /dev/null
1325 +++ b/arch/x86/include/asm/pti.h
1326 @@ -0,0 +1,14 @@
1327 +// SPDX-License-Identifier: GPL-2.0
1328 +#ifndef _ASM_X86_PTI_H
1329 +#define _ASM_X86_PTI_H
1330 +#ifndef __ASSEMBLY__
1331 +
1332 +#ifdef CONFIG_PAGE_TABLE_ISOLATION
1333 +extern void pti_init(void);
1334 +extern void pti_check_boottime_disable(void);
1335 +#else
1336 +static inline void pti_check_boottime_disable(void) { }
1337 +#endif
1338 +
1339 +#endif /* __ASSEMBLY__ */
1340 +#endif /* _ASM_X86_PTI_H */
1341 diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
1342 index 171b429f43a2..f9b48ce152eb 100644
1343 --- a/arch/x86/include/asm/tlbflush.h
1344 +++ b/arch/x86/include/asm/tlbflush.h
1345 @@ -10,38 +10,90 @@
1346 #include <asm/special_insns.h>
1347 #include <asm/smp.h>
1348 #include <asm/invpcid.h>
1349 +#include <asm/pti.h>
1350 +#include <asm/processor-flags.h>
1351
1352 -static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
1353 -{
1354 - /*
1355 - * Bump the generation count. This also serves as a full barrier
1356 - * that synchronizes with switch_mm(): callers are required to order
1357 - * their read of mm_cpumask after their writes to the paging
1358 - * structures.
1359 - */
1360 - return atomic64_inc_return(&mm->context.tlb_gen);
1361 -}
1362 +/*
1363 + * The x86 feature is called PCID (Process Context IDentifier). It is similar
1364 + * to what is traditionally called ASID on the RISC processors.
1365 + *
1366 + * We don't use the traditional ASID implementation, where each process/mm gets
1367 + * its own ASID and flush/restart when we run out of ASID space.
1368 + *
1369 + * Instead we have a small per-cpu array of ASIDs and cache the last few mm's
1370 + * that came by on this CPU, allowing cheaper switch_mm between processes on
1371 + * this CPU.
1372 + *
1373 + * We end up with different spaces for different things. To avoid confusion we
1374 + * use different names for each of them:
1375 + *
1376 + * ASID - [0, TLB_NR_DYN_ASIDS-1]
1377 + * the canonical identifier for an mm
1378 + *
1379 + * kPCID - [1, TLB_NR_DYN_ASIDS]
1380 + * the value we write into the PCID part of CR3; corresponds to the
1381 + * ASID+1, because PCID 0 is special.
1382 + *
1383 + * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
1384 + * for KPTI each mm has two address spaces and thus needs two
1385 + * PCID values, but we can still do with a single ASID denomination
1386 + * for each mm. Corresponds to kPCID + 2048.
1387 + *
1388 + */
1389
1390 /* There are 12 bits of space for ASIDS in CR3 */
1391 #define CR3_HW_ASID_BITS 12
1392 +
1393 /*
1394 * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for
1395 * user/kernel switches
1396 */
1397 -#define PTI_CONSUMED_ASID_BITS 0
1398 +#ifdef CONFIG_PAGE_TABLE_ISOLATION
1399 +# define PTI_CONSUMED_PCID_BITS 1
1400 +#else
1401 +# define PTI_CONSUMED_PCID_BITS 0
1402 +#endif
1403 +
1404 +#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS)
1405
1406 -#define CR3_AVAIL_ASID_BITS (CR3_HW_ASID_BITS - PTI_CONSUMED_ASID_BITS)
1407 /*
1408 * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account
1409 - * for them being zero-based. Another -1 is because ASID 0 is reserved for
1410 + * for them being zero-based. Another -1 is because PCID 0 is reserved for
1411 * use by non-PCID-aware users.
1412 */
1413 -#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_ASID_BITS) - 2)
1414 +#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
1415 +
1416 +/*
1417 + * 6 because 6 should be plenty and struct tlb_state will fit in two cache
1418 + * lines.
1419 + */
1420 +#define TLB_NR_DYN_ASIDS 6
1421
1422 +/*
1423 + * Given @asid, compute kPCID
1424 + */
1425 static inline u16 kern_pcid(u16 asid)
1426 {
1427 VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
1428 +
1429 +#ifdef CONFIG_PAGE_TABLE_ISOLATION
1430 + /*
1431 + * Make sure that the dynamic ASID space does not conflict with the
1432 + * bit we are using to switch between user and kernel ASIDs.
1433 + */
1434 + BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_SWITCH_BIT));
1435 +
1436 /*
1437 + * The ASID being passed in here should have respected the
1438 + * MAX_ASID_AVAILABLE and thus never have the switch bit set.
1439 + */
1440 + VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_SWITCH_BIT));
1441 +#endif
1442 + /*
1443 + * The dynamically-assigned ASIDs that get passed in are small
1444 + * (<TLB_NR_DYN_ASIDS). They never have the high switch bit set,
1445 + * so do not bother to clear it.
1446 + *
1447 * If PCID is on, ASID-aware code paths put the ASID+1 into the
1448 * PCID bits. This serves two purposes. It prevents a nasty
1449 * situation in which PCID-unaware code saves CR3, loads some other
1450 @@ -53,6 +105,18 @@ static inline u16 kern_pcid(u16 asid)
1451 return asid + 1;
1452 }
1453
1454 +/*
1455 + * Given @asid, compute uPCID
1456 + */
1457 +static inline u16 user_pcid(u16 asid)
1458 +{
1459 + u16 ret = kern_pcid(asid);
1460 +#ifdef CONFIG_PAGE_TABLE_ISOLATION
1461 + ret |= 1 << X86_CR3_PTI_SWITCH_BIT;
1462 +#endif
1463 + return ret;
1464 +}
1465 +
1466 struct pgd_t;
1467 static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
1468 {
1469 @@ -95,12 +159,6 @@ static inline bool tlb_defer_switch_to_init_mm(void)
1470 return !static_cpu_has(X86_FEATURE_PCID);
1471 }
1472
1473 -/*
1474 - * 6 because 6 should be plenty and struct tlb_state will fit in
1475 - * two cache lines.
1476 - */
1477 -#define TLB_NR_DYN_ASIDS 6
1478 -
1479 struct tlb_context {
1480 u64 ctx_id;
1481 u64 tlb_gen;
1482 @@ -134,6 +192,24 @@ struct tlb_state {
1483 */
1484 bool is_lazy;
1485
1486 + /*
1487 + * If set we changed the page tables in such a way that we
1488 + * needed an invalidation of all contexts (aka. PCIDs / ASIDs).
1489 + * This tells us to go invalidate all the non-loaded ctxs[]
1490 + * on the next context switch.
1491 + *
1492 + * The current ctx was kept up-to-date as it ran and does not
1493 + * need to be invalidated.
1494 + */
1495 + bool invalidate_other;
1496 +
1497 + /*
1498 + * Mask that contains TLB_NR_DYN_ASIDS+1 bits to indicate
1499 + * the corresponding user PCID needs a flush next time we
1500 + * switch to it; see SWITCH_TO_USER_CR3.
1501 + */
1502 + unsigned short user_pcid_flush_mask;
1503 +
1504 /*
1505 * Access to this CR4 shadow and to H/W CR4 is protected by
1506 * disabling interrupts when modifying either one.
1507 @@ -211,6 +287,14 @@ static inline unsigned long cr4_read_shadow(void)
1508 return this_cpu_read(cpu_tlbstate.cr4);
1509 }
1510
1511 +/*
1512 + * Mark all other ASIDs as invalid, preserves the current.
1513 + */
1514 +static inline void invalidate_other_asid(void)
1515 +{
1516 + this_cpu_write(cpu_tlbstate.invalidate_other, true);
1517 +}
1518 +
1519 /*
1520 * Save some of cr4 feature set we're using (e.g. Pentium 4MB
1521 * enable and PPro Global page enable), so that any CPU's that boot
1522 @@ -230,19 +314,48 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
1523
1524 extern void initialize_tlbstate_and_flush(void);
1525
1526 +/*
1527 + * Given an ASID, flush the corresponding user ASID. We can delay this
1528 + * until the next time we switch to it.
1529 + *
1530 + * See SWITCH_TO_USER_CR3.
1531 + */
1532 +static inline void invalidate_user_asid(u16 asid)
1533 +{
1534 + /* There is no user ASID if address space separation is off */
1535 + if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
1536 + return;
1537 +
1538 + /*
1539 + * We only have a single ASID if PCID is off and the CR3
1540 + * write will have flushed it.
1541 + */
1542 + if (!cpu_feature_enabled(X86_FEATURE_PCID))
1543 + return;
1544 +
1545 + if (!static_cpu_has(X86_FEATURE_PTI))
1546 + return;
1547 +
1548 + __set_bit(kern_pcid(asid),
1549 + (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
1550 +}
1551 +
1552 /*
1553 * flush the entire current user mapping
1554 */
1555 static inline void __native_flush_tlb(void)
1556 {
1557 /*
1558 - * If current->mm == NULL then we borrow a mm which may change during a
1559 - * task switch and therefore we must not be preempted while we write CR3
1560 - * back:
1561 + * Preemption or interrupts must be disabled to protect the access
1562 + * to the per CPU variable and to prevent being preempted between
1563 + * read_cr3() and write_cr3().
1564 */
1565 - preempt_disable();
1566 + WARN_ON_ONCE(preemptible());
1567 +
1568 + invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid));
1569 +
1570 + /* If current->mm == NULL then the read_cr3() "borrows" an mm */
1571 native_write_cr3(__native_read_cr3());
1572 - preempt_enable();
1573 }
1574
1575 /*
1576 @@ -256,6 +369,8 @@ static inline void __native_flush_tlb_global(void)
1577 /*
1578 * Using INVPCID is considerably faster than a pair of writes
1579 * to CR4 sandwiched inside an IRQ flag save/restore.
1580 + *
1581 + * Note, this works with CR4.PCIDE=0 or 1.
1582 */
1583 invpcid_flush_all();
1584 return;
1585 @@ -282,7 +397,21 @@ static inline void __native_flush_tlb_global(void)
1586 */
1587 static inline void __native_flush_tlb_single(unsigned long addr)
1588 {
1589 + u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
1590 +
1591 asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
1592 +
1593 + if (!static_cpu_has(X86_FEATURE_PTI))
1594 + return;
1595 +
1596 + /*
1597 + * Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1.
1598 + * Just use invalidate_user_asid() in case we are called early.
1599 + */
1600 + if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE))
1601 + invalidate_user_asid(loaded_mm_asid);
1602 + else
1603 + invpcid_flush_one(user_pcid(loaded_mm_asid), addr);
1604 }
1605
1606 /*
1607 @@ -298,14 +427,6 @@ static inline void __flush_tlb_all(void)
1608 */
1609 __flush_tlb();
1610 }
1611 -
1612 - /*
1613 - * Note: if we somehow had PCID but not PGE, then this wouldn't work --
1614 - * we'd end up flushing kernel translations for the current ASID but
1615 - * we might fail to flush kernel translations for other cached ASIDs.
1616 - *
1617 - * To avoid this issue, we force PCID off if PGE is off.
1618 - */
1619 }
1620
1621 /*
1622 @@ -315,6 +436,16 @@ static inline void __flush_tlb_one(unsigned long addr)
1623 {
1624 count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
1625 __flush_tlb_single(addr);
1626 +
1627 + if (!static_cpu_has(X86_FEATURE_PTI))
1628 + return;
1629 +
1630 + /*
1631 + * __flush_tlb_single() will have cleared the TLB entry for this ASID,
1632 + * but since kernel space is replicated across all, we must also
1633 + * invalidate all others.
1634 + */
1635 + invalidate_other_asid();
1636 }
1637
1638 #define TLB_FLUSH_ALL -1UL
1639 @@ -375,6 +506,17 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
1640 void native_flush_tlb_others(const struct cpumask *cpumask,
1641 const struct flush_tlb_info *info);
1642
1643 +static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
1644 +{
1645 + /*
1646 + * Bump the generation count. This also serves as a full barrier
1647 + * that synchronizes with switch_mm(): callers are required to order
1648 + * their read of mm_cpumask after their writes to the paging
1649 + * structures.
1650 + */
1651 + return atomic64_inc_return(&mm->context.tlb_gen);
1652 +}
1653 +
1654 static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
1655 struct mm_struct *mm)
1656 {
1657 diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
1658 index d9a7c659009c..b986b2ca688a 100644
1659 --- a/arch/x86/include/asm/vsyscall.h
1660 +++ b/arch/x86/include/asm/vsyscall.h
1661 @@ -7,6 +7,7 @@
1662
1663 #ifdef CONFIG_X86_VSYSCALL_EMULATION
1664 extern void map_vsyscall(void);
1665 +extern void set_vsyscall_pgtable_user_bits(pgd_t *root);
1666
1667 /*
1668 * Called on instruction fetch fault in vsyscall page.
1669 diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
1670 index 53b4ca55ebb6..97abdaab9535 100644
1671 --- a/arch/x86/include/uapi/asm/processor-flags.h
1672 +++ b/arch/x86/include/uapi/asm/processor-flags.h
1673 @@ -78,7 +78,12 @@
1674 #define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT)
1675 #define X86_CR3_PCD_BIT 4 /* Page Cache Disable */
1676 #define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT)
1677 -#define X86_CR3_PCID_MASK _AC(0x00000fff,UL) /* PCID Mask */
1678 +
1679 +#define X86_CR3_PCID_BITS 12
1680 +#define X86_CR3_PCID_MASK (_AC((1UL << X86_CR3_PCID_BITS) - 1, UL))
1681 +
1682 +#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
1683 +#define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT)
1684
1685 /*
1686 * Intel CPU features in CR4
1687 diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
1688 index 676b7cf4b62b..76417a9aab73 100644
1689 --- a/arch/x86/kernel/asm-offsets.c
1690 +++ b/arch/x86/kernel/asm-offsets.c
1691 @@ -17,6 +17,7 @@
1692 #include <asm/sigframe.h>
1693 #include <asm/bootparam.h>
1694 #include <asm/suspend.h>
1695 +#include <asm/tlbflush.h>
1696
1697 #ifdef CONFIG_XEN
1698 #include <xen/interface/xen.h>
1699 @@ -94,6 +95,9 @@ void common(void) {
1700 BLANK();
1701 DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
1702
1703 + /* TLB state for the entry code */
1704 + OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask);
1705 +
1706 /* Layout info for cpu_entry_area */
1707 OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
1708 OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
1709 diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
1710 index 8ddcfa4d4165..f2a94dfb434e 100644
1711 --- a/arch/x86/kernel/cpu/common.c
1712 +++ b/arch/x86/kernel/cpu/common.c
1713 @@ -898,6 +898,10 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
1714 }
1715
1716 setup_force_cpu_cap(X86_FEATURE_ALWAYS);
1717 +
1718 + /* Assume for now that ALL x86 CPUs are insecure */
1719 + setup_force_cpu_bug(X86_BUG_CPU_INSECURE);
1720 +
1721 fpu__init_system(c);
1722
1723 #ifdef CONFIG_X86_32
1724 @@ -1335,7 +1339,10 @@ void syscall_init(void)
1725 (entry_SYSCALL_64_trampoline - _entry_trampoline);
1726
1727 wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
1728 - wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
1729 + if (static_cpu_has(X86_FEATURE_PTI))
1730 + wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
1731 + else
1732 + wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
1733
1734 #ifdef CONFIG_IA32_EMULATION
1735 wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
1736 diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
1737 index 36b17e0febe8..5fa110699ed2 100644
1738 --- a/arch/x86/kernel/dumpstack.c
1739 +++ b/arch/x86/kernel/dumpstack.c
1740 @@ -297,11 +297,13 @@ int __die(const char *str, struct pt_regs *regs, long err)
1741 unsigned long sp;
1742 #endif
1743 printk(KERN_DEFAULT
1744 - "%s: %04lx [#%d]%s%s%s%s\n", str, err & 0xffff, ++die_counter,
1745 + "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter,
1746 IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "",
1747 IS_ENABLED(CONFIG_SMP) ? " SMP" : "",
1748 debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "",
1749 - IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "");
1750 + IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "",
1751 + IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ?
1752 + (boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : "");
1753
1754 if (notify_die(DIE_OOPS, str, regs, err,
1755 current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP)
1756 diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
1757 index 7dca675fe78d..04a625f0fcda 100644
1758 --- a/arch/x86/kernel/head_64.S
1759 +++ b/arch/x86/kernel/head_64.S
1760 @@ -341,6 +341,27 @@ GLOBAL(early_recursion_flag)
1761 .balign PAGE_SIZE; \
1762 GLOBAL(name)
1763
1764 +#ifdef CONFIG_PAGE_TABLE_ISOLATION
1765 +/*
1766 + * Each PGD needs to be 8k long and 8k aligned. We do not
1767 + * ever go out to userspace with these, so we do not
1768 + * strictly *need* the second page, but this allows us to
1769 + * have a single set_pgd() implementation that does not
1770 + * need to worry about whether it has 4k or 8k to work
1771 + * with.
1772 + *
1773 + * This ensures PGDs are 8k long:
1774 + */
1775 +#define PTI_USER_PGD_FILL 512
1776 +/* This ensures they are 8k-aligned: */
1777 +#define NEXT_PGD_PAGE(name) \
1778 + .balign 2 * PAGE_SIZE; \
1779 +GLOBAL(name)
1780 +#else
1781 +#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
1782 +#define PTI_USER_PGD_FILL 0
1783 +#endif
1784 +
1785 /* Automate the creation of 1 to 1 mapping pmd entries */
1786 #define PMDS(START, PERM, COUNT) \
1787 i = 0 ; \
1788 @@ -350,13 +371,14 @@ GLOBAL(name)
1789 .endr
1790
1791 __INITDATA
1792 -NEXT_PAGE(early_top_pgt)
1793 +NEXT_PGD_PAGE(early_top_pgt)
1794 .fill 511,8,0
1795 #ifdef CONFIG_X86_5LEVEL
1796 .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
1797 #else
1798 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
1799 #endif
1800 + .fill PTI_USER_PGD_FILL,8,0
1801
1802 NEXT_PAGE(early_dynamic_pgts)
1803 .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
1804 @@ -364,13 +386,14 @@ NEXT_PAGE(early_dynamic_pgts)
1805 .data
1806
1807 #if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH)
1808 -NEXT_PAGE(init_top_pgt)
1809 +NEXT_PGD_PAGE(init_top_pgt)
1810 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
1811 .org init_top_pgt + PGD_PAGE_OFFSET*8, 0
1812 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
1813 .org init_top_pgt + PGD_START_KERNEL*8, 0
1814 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
1815 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
1816 + .fill PTI_USER_PGD_FILL,8,0
1817
1818 NEXT_PAGE(level3_ident_pgt)
1819 .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
1820 @@ -381,8 +404,9 @@ NEXT_PAGE(level2_ident_pgt)
1821 */
1822 PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
1823 #else
1824 -NEXT_PAGE(init_top_pgt)
1825 +NEXT_PGD_PAGE(init_top_pgt)
1826 .fill 512,8,0
1827 + .fill PTI_USER_PGD_FILL,8,0
1828 #endif
1829
1830 #ifdef CONFIG_X86_5LEVEL
1831 diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
1832 index a6b5d62f45a7..26d713ecad34 100644
1833 --- a/arch/x86/kernel/ldt.c
1834 +++ b/arch/x86/kernel/ldt.c
1835 @@ -24,6 +24,7 @@
1836 #include <linux/uaccess.h>
1837
1838 #include <asm/ldt.h>
1839 +#include <asm/tlb.h>
1840 #include <asm/desc.h>
1841 #include <asm/mmu_context.h>
1842 #include <asm/syscalls.h>
1843 @@ -51,13 +52,11 @@ static void refresh_ldt_segments(void)
1844 static void flush_ldt(void *__mm)
1845 {
1846 struct mm_struct *mm = __mm;
1847 - mm_context_t *pc;
1848
1849 if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
1850 return;
1851
1852 - pc = &mm->context;
1853 - set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
1854 + load_mm_ldt(mm);
1855
1856 refresh_ldt_segments();
1857 }
1858 @@ -94,10 +93,126 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
1859 return NULL;
1860 }
1861
1862 + /* The new LDT isn't aliased for PTI yet. */
1863 + new_ldt->slot = -1;
1864 +
1865 new_ldt->nr_entries = num_entries;
1866 return new_ldt;
1867 }
1868
1869 +/*
1870 + * If PTI is enabled, this maps the LDT into the kernelmode and
1871 + * usermode tables for the given mm.
1872 + *
1873 + * There is no corresponding unmap function. Even if the LDT is freed, we
1874 + * leave the PTEs around until the slot is reused or the mm is destroyed.
1875 + * This is harmless: the LDT is always in ordinary memory, and no one will
1876 + * access the freed slot.
1877 + *
1878 + * If we wanted to unmap freed LDTs, we'd also need to do a flush to make
1879 + * it useful, and the flush would slow down modify_ldt().
1880 + */
1881 +static int
1882 +map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
1883 +{
1884 +#ifdef CONFIG_PAGE_TABLE_ISOLATION
1885 + bool is_vmalloc, had_top_level_entry;
1886 + unsigned long va;
1887 + spinlock_t *ptl;
1888 + pgd_t *pgd;
1889 + int i;
1890 +
1891 + if (!static_cpu_has(X86_FEATURE_PTI))
1892 + return 0;
1893 +
1894 + /*
1895 + * Any given ldt_struct should have map_ldt_struct() called at most
1896 + * once.
1897 + */
1898 + WARN_ON(ldt->slot != -1);
1899 +
1900 + /*
1901 + * Did we already have the top level entry allocated? We can't
1902 + * use pgd_none() for this because it doesn't do anything on
1903 + * 4-level page table kernels.
1904 + */
1905 + pgd = pgd_offset(mm, LDT_BASE_ADDR);
1906 + had_top_level_entry = (pgd->pgd != 0);
1907 +
1908 + is_vmalloc = is_vmalloc_addr(ldt->entries);
1909 +
1910 + for (i = 0; i * PAGE_SIZE < ldt->nr_entries * LDT_ENTRY_SIZE; i++) {
1911 + unsigned long offset = i << PAGE_SHIFT;
1912 + const void *src = (char *)ldt->entries + offset;
1913 + unsigned long pfn;
1914 + pte_t pte, *ptep;
1915 +
1916 + va = (unsigned long)ldt_slot_va(slot) + offset;
1917 + pfn = is_vmalloc ? vmalloc_to_pfn(src) :
1918 + page_to_pfn(virt_to_page(src));
1919 + /*
1920 + * Treat the PTI LDT range as a *userspace* range.
1921 + * get_locked_pte() will allocate all needed pagetables
1922 + * and account for them in this mm.
1923 + */
1924 + ptep = get_locked_pte(mm, va, &ptl);
1925 + if (!ptep)
1926 + return -ENOMEM;
1927 + /*
1928 + * Map it RO so the easy-to-find address is not a primary
1929 + * target via some kernel interface which misses a
1930 + * permission check.
1931 + */
1932 + pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL));
1933 + set_pte_at(mm, va, ptep, pte);
1934 + pte_unmap_unlock(ptep, ptl);
1935 + }
1936 +
1937 + if (mm->context.ldt) {
1938 + /*
1939 + * We already had an LDT. The top-level entry should already
1940 + * have been allocated and synchronized with the usermode
1941 + * tables.
1942 + */
1943 + WARN_ON(!had_top_level_entry);
1944 + if (static_cpu_has(X86_FEATURE_PTI))
1945 + WARN_ON(!kernel_to_user_pgdp(pgd)->pgd);
1946 + } else {
1947 + /*
1948 + * This is the first time we're mapping an LDT for this process.
1949 + * Sync the pgd to the usermode tables.
1950 + */
1951 + WARN_ON(had_top_level_entry);
1952 + if (static_cpu_has(X86_FEATURE_PTI)) {
1953 + WARN_ON(kernel_to_user_pgdp(pgd)->pgd);
1954 + set_pgd(kernel_to_user_pgdp(pgd), *pgd);
1955 + }
1956 + }
1957 +
1958 + va = (unsigned long)ldt_slot_va(slot);
1959 + flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0);
1960 +
1961 + ldt->slot = slot;
1962 +#endif
1963 + return 0;
1964 +}
1965 +
1966 +static void free_ldt_pgtables(struct mm_struct *mm)
1967 +{
1968 +#ifdef CONFIG_PAGE_TABLE_ISOLATION
1969 + struct mmu_gather tlb;
1970 + unsigned long start = LDT_BASE_ADDR;
1971 + unsigned long end = start + (1UL << PGDIR_SHIFT);
1972 +
1973 + if (!static_cpu_has(X86_FEATURE_PTI))
1974 + return;
1975 +
1976 + tlb_gather_mmu(&tlb, mm, start, end);
1977 + free_pgd_range(&tlb, start, end, start, end);
1978 + tlb_finish_mmu(&tlb, start, end);
1979 +#endif
1980 +}
1981 +
1982 /* After calling this, the LDT is immutable. */
1983 static void finalize_ldt_struct(struct ldt_struct *ldt)
1984 {
1985 @@ -156,6 +271,12 @@ int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm)
1986 new_ldt->nr_entries * LDT_ENTRY_SIZE);
1987 finalize_ldt_struct(new_ldt);
1988
1989 + retval = map_ldt_struct(mm, new_ldt, 0);
1990 + if (retval) {
1991 + free_ldt_pgtables(mm);
1992 + free_ldt_struct(new_ldt);
1993 + goto out_unlock;
1994 + }
1995 mm->context.ldt = new_ldt;
1996
1997 out_unlock:
1998 @@ -174,6 +295,11 @@ void destroy_context_ldt(struct mm_struct *mm)
1999 mm->context.ldt = NULL;
2000 }
2001
2002 +void ldt_arch_exit_mmap(struct mm_struct *mm)
2003 +{
2004 + free_ldt_pgtables(mm);
2005 +}
2006 +
2007 static int read_ldt(void __user *ptr, unsigned long bytecount)
2008 {
2009 struct mm_struct *mm = current->mm;
2010 @@ -287,6 +413,25 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
2011 new_ldt->entries[ldt_info.entry_number] = ldt;
2012 finalize_ldt_struct(new_ldt);
2013
2014 + /*
2015 + * If we are using PTI, map the new LDT into the userspace pagetables.
2016 + * If there is already an LDT, use the other slot so that other CPUs
2017 + * will continue to use the old LDT until install_ldt() switches
2018 + * them over to the new LDT.
2019 + */
2020 + error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0);
2021 + if (error) {
2022 + /*
2023 + * This can only fail for the first LDT setup. If an LDT is
2024 + * already installed then the PTE page is already
2025 + * populated. Mop up a half-populated page table.
2026 + */
2027 + if (!WARN_ON_ONCE(old_ldt))
2028 + free_ldt_pgtables(mm);
2029 + free_ldt_struct(new_ldt);
2030 + goto out_unlock;
2031 + }
2032 +
2033 install_ldt(mm, new_ldt);
2034 free_ldt_struct(old_ldt);
2035 error = 0;
2036 diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
2037 index 00bc751c861c..edfede768688 100644
2038 --- a/arch/x86/kernel/machine_kexec_32.c
2039 +++ b/arch/x86/kernel/machine_kexec_32.c
2040 @@ -48,8 +48,6 @@ static void load_segments(void)
2041 "\tmovl $"STR(__KERNEL_DS)",%%eax\n"
2042 "\tmovl %%eax,%%ds\n"
2043 "\tmovl %%eax,%%es\n"
2044 - "\tmovl %%eax,%%fs\n"
2045 - "\tmovl %%eax,%%gs\n"
2046 "\tmovl %%eax,%%ss\n"
2047 : : : "eax", "memory");
2048 #undef STR
2049 @@ -232,8 +230,8 @@ void machine_kexec(struct kimage *image)
2050 * The gdt & idt are now invalid.
2051 * If you want to load them you must set up your own idt & gdt.
2052 */
2053 - set_gdt(phys_to_virt(0), 0);
2054 idt_invalidate(phys_to_virt(0));
2055 + set_gdt(phys_to_virt(0), 0);
2056
2057 /* now call it */
2058 image->start = relocate_kernel_ptr((unsigned long)image->head,
2059 diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
2060 index 12bf07d44dfe..2651ca2112c4 100644
2061 --- a/arch/x86/kernel/smpboot.c
2062 +++ b/arch/x86/kernel/smpboot.c
2063 @@ -128,25 +128,16 @@ static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
2064 spin_lock_irqsave(&rtc_lock, flags);
2065 CMOS_WRITE(0xa, 0xf);
2066 spin_unlock_irqrestore(&rtc_lock, flags);
2067 - local_flush_tlb();
2068 - pr_debug("1.\n");
2069 *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) =
2070 start_eip >> 4;
2071 - pr_debug("2.\n");
2072 *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) =
2073 start_eip & 0xf;
2074 - pr_debug("3.\n");
2075 }
2076
2077 static inline void smpboot_restore_warm_reset_vector(void)
2078 {
2079 unsigned long flags;
2080
2081 - /*
2082 - * Install writable page 0 entry to set BIOS data area.
2083 - */
2084 - local_flush_tlb();
2085 -
2086 /*
2087 * Paranoid: Set warm reset code and vector here back
2088 * to default values.
2089 diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
2090 index 9a9c9b076955..a5b802a12212 100644
2091 --- a/arch/x86/kernel/tls.c
2092 +++ b/arch/x86/kernel/tls.c
2093 @@ -93,17 +93,10 @@ static void set_tls_desc(struct task_struct *p, int idx,
2094 cpu = get_cpu();
2095
2096 while (n-- > 0) {
2097 - if (LDT_empty(info) || LDT_zero(info)) {
2098 + if (LDT_empty(info) || LDT_zero(info))
2099 memset(desc, 0, sizeof(*desc));
2100 - } else {
2101 + else
2102 fill_ldt(desc, info);
2103 -
2104 - /*
2105 - * Always set the accessed bit so that the CPU
2106 - * doesn't try to write to the (read-only) GDT.
2107 - */
2108 - desc->type |= 1;
2109 - }
2110 ++info;
2111 ++desc;
2112 }
2113 diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
2114 index 7c16fe0b60c2..b33e860d32fe 100644
2115 --- a/arch/x86/kernel/traps.c
2116 +++ b/arch/x86/kernel/traps.c
2117 @@ -361,7 +361,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
2118 *
2119 * No need for ist_enter here because we don't use RCU.
2120 */
2121 - if (((long)regs->sp >> PGDIR_SHIFT) == ESPFIX_PGD_ENTRY &&
2122 + if (((long)regs->sp >> P4D_SHIFT) == ESPFIX_PGD_ENTRY &&
2123 regs->cs == __KERNEL_CS &&
2124 regs->ip == (unsigned long)native_irq_return_iret)
2125 {
2126 diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
2127 index d2a8b5a24a44..1e413a9326aa 100644
2128 --- a/arch/x86/kernel/vmlinux.lds.S
2129 +++ b/arch/x86/kernel/vmlinux.lds.S
2130 @@ -61,11 +61,17 @@ jiffies_64 = jiffies;
2131 . = ALIGN(HPAGE_SIZE); \
2132 __end_rodata_hpage_align = .;
2133
2134 +#define ALIGN_ENTRY_TEXT_BEGIN . = ALIGN(PMD_SIZE);
2135 +#define ALIGN_ENTRY_TEXT_END . = ALIGN(PMD_SIZE);
2136 +
2137 #else
2138
2139 #define X64_ALIGN_RODATA_BEGIN
2140 #define X64_ALIGN_RODATA_END
2141
2142 +#define ALIGN_ENTRY_TEXT_BEGIN
2143 +#define ALIGN_ENTRY_TEXT_END
2144 +
2145 #endif
2146
2147 PHDRS {
2148 @@ -102,8 +108,10 @@ SECTIONS
2149 CPUIDLE_TEXT
2150 LOCK_TEXT
2151 KPROBES_TEXT
2152 + ALIGN_ENTRY_TEXT_BEGIN
2153 ENTRY_TEXT
2154 IRQENTRY_TEXT
2155 + ALIGN_ENTRY_TEXT_END
2156 SOFTIRQENTRY_TEXT
2157 *(.fixup)
2158 *(.gnu.warning)
2159 diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
2160 index 2e0017af8f9b..52906808e277 100644
2161 --- a/arch/x86/mm/Makefile
2162 +++ b/arch/x86/mm/Makefile
2163 @@ -43,9 +43,10 @@ obj-$(CONFIG_AMD_NUMA) += amdtopology.o
2164 obj-$(CONFIG_ACPI_NUMA) += srat.o
2165 obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
2166
2167 -obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
2168 -obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
2169 -obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
2170 +obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
2171 +obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
2172 +obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
2173 +obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o
2174
2175 obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o
2176 obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o
2177 diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
2178 index fe814fd5e014..b9283cc27622 100644
2179 --- a/arch/x86/mm/cpu_entry_area.c
2180 +++ b/arch/x86/mm/cpu_entry_area.c
2181 @@ -38,6 +38,32 @@ cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot)
2182 cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot);
2183 }
2184
2185 +static void percpu_setup_debug_store(int cpu)
2186 +{
2187 +#ifdef CONFIG_CPU_SUP_INTEL
2188 + int npages;
2189 + void *cea;
2190 +
2191 + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
2192 + return;
2193 +
2194 + cea = &get_cpu_entry_area(cpu)->cpu_debug_store;
2195 + npages = sizeof(struct debug_store) / PAGE_SIZE;
2196 + BUILD_BUG_ON(sizeof(struct debug_store) % PAGE_SIZE != 0);
2197 + cea_map_percpu_pages(cea, &per_cpu(cpu_debug_store, cpu), npages,
2198 + PAGE_KERNEL);
2199 +
2200 + cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers;
2201 + /*
2202 + * Force the population of PMDs for not-yet-allocated per-CPU
2203 + * memory like debug store buffers.
2204 + */
2205 + npages = sizeof(struct debug_store_buffers) / PAGE_SIZE;
2206 + for (; npages; npages--, cea += PAGE_SIZE)
2207 + cea_set_pte(cea, 0, PAGE_NONE);
2208 +#endif
2209 +}
2210 +
2211 /* Setup the fixmap mappings only once per-processor */
2212 static void __init setup_cpu_entry_area(int cpu)
2213 {
2214 @@ -109,6 +135,7 @@ static void __init setup_cpu_entry_area(int cpu)
2215 cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline,
2216 __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
2217 #endif
2218 + percpu_setup_debug_store(cpu);
2219 }
2220
2221 static __init void setup_cpu_entry_area_ptes(void)
2222 diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c
2223 index bfcffdf6c577..421f2664ffa0 100644
2224 --- a/arch/x86/mm/debug_pagetables.c
2225 +++ b/arch/x86/mm/debug_pagetables.c
2226 @@ -5,7 +5,7 @@
2227
2228 static int ptdump_show(struct seq_file *m, void *v)
2229 {
2230 - ptdump_walk_pgd_level(m, NULL);
2231 + ptdump_walk_pgd_level_debugfs(m, NULL, false);
2232 return 0;
2233 }
2234
2235 @@ -22,21 +22,89 @@ static const struct file_operations ptdump_fops = {
2236 .release = single_release,
2237 };
2238
2239 -static struct dentry *pe;
2240 +static int ptdump_show_curknl(struct seq_file *m, void *v)
2241 +{
2242 + if (current->mm->pgd) {
2243 + down_read(&current->mm->mmap_sem);
2244 + ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, false);
2245 + up_read(&current->mm->mmap_sem);
2246 + }
2247 + return 0;
2248 +}
2249 +
2250 +static int ptdump_open_curknl(struct inode *inode, struct file *filp)
2251 +{
2252 + return single_open(filp, ptdump_show_curknl, NULL);
2253 +}
2254 +
2255 +static const struct file_operations ptdump_curknl_fops = {
2256 + .owner = THIS_MODULE,
2257 + .open = ptdump_open_curknl,
2258 + .read = seq_read,
2259 + .llseek = seq_lseek,
2260 + .release = single_release,
2261 +};
2262 +
2263 +#ifdef CONFIG_PAGE_TABLE_ISOLATION
2264 +static struct dentry *pe_curusr;
2265 +
2266 +static int ptdump_show_curusr(struct seq_file *m, void *v)
2267 +{
2268 + if (current->mm->pgd) {
2269 + down_read(&current->mm->mmap_sem);
2270 + ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, true);
2271 + up_read(&current->mm->mmap_sem);
2272 + }
2273 + return 0;
2274 +}
2275 +
2276 +static int ptdump_open_curusr(struct inode *inode, struct file *filp)
2277 +{
2278 + return single_open(filp, ptdump_show_curusr, NULL);
2279 +}
2280 +
2281 +static const struct file_operations ptdump_curusr_fops = {
2282 + .owner = THIS_MODULE,
2283 + .open = ptdump_open_curusr,
2284 + .read = seq_read,
2285 + .llseek = seq_lseek,
2286 + .release = single_release,
2287 +};
2288 +#endif
2289 +
2290 +static struct dentry *dir, *pe_knl, *pe_curknl;
2291
2292 static int __init pt_dump_debug_init(void)
2293 {
2294 - pe = debugfs_create_file("kernel_page_tables", S_IRUSR, NULL, NULL,
2295 - &ptdump_fops);
2296 - if (!pe)
2297 + dir = debugfs_create_dir("page_tables", NULL);
2298 + if (!dir)
2299 return -ENOMEM;
2300
2301 + pe_knl = debugfs_create_file("kernel", 0400, dir, NULL,
2302 + &ptdump_fops);
2303 + if (!pe_knl)
2304 + goto err;
2305 +
2306 + pe_curknl = debugfs_create_file("current_kernel", 0400,
2307 + dir, NULL, &ptdump_curknl_fops);
2308 + if (!pe_curknl)
2309 + goto err;
2310 +
2311 +#ifdef CONFIG_PAGE_TABLE_ISOLATION
2312 + pe_curusr = debugfs_create_file("current_user", 0400,
2313 + dir, NULL, &ptdump_curusr_fops);
2314 + if (!pe_curusr)
2315 + goto err;
2316 +#endif
2317 return 0;
2318 +err:
2319 + debugfs_remove_recursive(dir);
2320 + return -ENOMEM;
2321 }
2322
2323 static void __exit pt_dump_debug_exit(void)
2324 {
2325 - debugfs_remove_recursive(pe);
2326 + debugfs_remove_recursive(dir);
2327 }
2328
2329 module_init(pt_dump_debug_init);
2330 diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
2331 index 43dedbfb7257..f56902c1f04b 100644
2332 --- a/arch/x86/mm/dump_pagetables.c
2333 +++ b/arch/x86/mm/dump_pagetables.c
2334 @@ -52,11 +52,17 @@ enum address_markers_idx {
2335 USER_SPACE_NR = 0,
2336 KERNEL_SPACE_NR,
2337 LOW_KERNEL_NR,
2338 +#if defined(CONFIG_MODIFY_LDT_SYSCALL) && defined(CONFIG_X86_5LEVEL)
2339 + LDT_NR,
2340 +#endif
2341 VMALLOC_START_NR,
2342 VMEMMAP_START_NR,
2343 #ifdef CONFIG_KASAN
2344 KASAN_SHADOW_START_NR,
2345 KASAN_SHADOW_END_NR,
2346 +#endif
2347 +#if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL)
2348 + LDT_NR,
2349 #endif
2350 CPU_ENTRY_AREA_NR,
2351 #ifdef CONFIG_X86_ESPFIX64
2352 @@ -81,6 +87,9 @@ static struct addr_marker address_markers[] = {
2353 #ifdef CONFIG_KASAN
2354 [KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" },
2355 [KASAN_SHADOW_END_NR] = { KASAN_SHADOW_END, "KASAN shadow end" },
2356 +#endif
2357 +#ifdef CONFIG_MODIFY_LDT_SYSCALL
2358 + [LDT_NR] = { LDT_BASE_ADDR, "LDT remap" },
2359 #endif
2360 [CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,"CPU entry Area" },
2361 #ifdef CONFIG_X86_ESPFIX64
2362 @@ -467,7 +476,7 @@ static inline bool is_hypervisor_range(int idx)
2363 }
2364
2365 static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
2366 - bool checkwx)
2367 + bool checkwx, bool dmesg)
2368 {
2369 #ifdef CONFIG_X86_64
2370 pgd_t *start = (pgd_t *) &init_top_pgt;
2371 @@ -480,7 +489,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
2372
2373 if (pgd) {
2374 start = pgd;
2375 - st.to_dmesg = true;
2376 + st.to_dmesg = dmesg;
2377 }
2378
2379 st.check_wx = checkwx;
2380 @@ -518,13 +527,37 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
2381
2382 void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
2383 {
2384 - ptdump_walk_pgd_level_core(m, pgd, false);
2385 + ptdump_walk_pgd_level_core(m, pgd, false, true);
2386 +}
2387 +
2388 +void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user)
2389 +{
2390 +#ifdef CONFIG_PAGE_TABLE_ISOLATION
2391 + if (user && static_cpu_has(X86_FEATURE_PTI))
2392 + pgd = kernel_to_user_pgdp(pgd);
2393 +#endif
2394 + ptdump_walk_pgd_level_core(m, pgd, false, false);
2395 +}
2396 +EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs);
2397 +
2398 +static void ptdump_walk_user_pgd_level_checkwx(void)
2399 +{
2400 +#ifdef CONFIG_PAGE_TABLE_ISOLATION
2401 + pgd_t *pgd = (pgd_t *) &init_top_pgt;
2402 +
2403 + if (!static_cpu_has(X86_FEATURE_PTI))
2404 + return;
2405 +
2406 + pr_info("x86/mm: Checking user space page tables\n");
2407 + pgd = kernel_to_user_pgdp(pgd);
2408 + ptdump_walk_pgd_level_core(NULL, pgd, true, false);
2409 +#endif
2410 }
2411 -EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level);
2412
2413 void ptdump_walk_pgd_level_checkwx(void)
2414 {
2415 - ptdump_walk_pgd_level_core(NULL, NULL, true);
2416 + ptdump_walk_pgd_level_core(NULL, NULL, true, false);
2417 + ptdump_walk_user_pgd_level_checkwx();
2418 }
2419
2420 static int __init pt_dump_init(void)
2421 diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
2422 index a22c2b95e513..80259ad8c386 100644
2423 --- a/arch/x86/mm/init.c
2424 +++ b/arch/x86/mm/init.c
2425 @@ -20,6 +20,7 @@
2426 #include <asm/kaslr.h>
2427 #include <asm/hypervisor.h>
2428 #include <asm/cpufeature.h>
2429 +#include <asm/pti.h>
2430
2431 /*
2432 * We need to define the tracepoints somewhere, and tlb.c
2433 @@ -161,6 +162,12 @@ struct map_range {
2434
2435 static int page_size_mask;
2436
2437 +static void enable_global_pages(void)
2438 +{
2439 + if (!static_cpu_has(X86_FEATURE_PTI))
2440 + __supported_pte_mask |= _PAGE_GLOBAL;
2441 +}
2442 +
2443 static void __init probe_page_size_mask(void)
2444 {
2445 /*
2446 @@ -179,11 +186,11 @@ static void __init probe_page_size_mask(void)
2447 cr4_set_bits_and_update_boot(X86_CR4_PSE);
2448
2449 /* Enable PGE if available */
2450 + __supported_pte_mask &= ~_PAGE_GLOBAL;
2451 if (boot_cpu_has(X86_FEATURE_PGE)) {
2452 cr4_set_bits_and_update_boot(X86_CR4_PGE);
2453 - __supported_pte_mask |= _PAGE_GLOBAL;
2454 - } else
2455 - __supported_pte_mask &= ~_PAGE_GLOBAL;
2456 + enable_global_pages();
2457 + }
2458
2459 /* Enable 1 GB linear kernel mappings if available: */
2460 if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) {
2461 @@ -196,34 +203,44 @@ static void __init probe_page_size_mask(void)
2462
2463 static void setup_pcid(void)
2464 {
2465 -#ifdef CONFIG_X86_64
2466 - if (boot_cpu_has(X86_FEATURE_PCID)) {
2467 - if (boot_cpu_has(X86_FEATURE_PGE)) {
2468 - /*
2469 - * This can't be cr4_set_bits_and_update_boot() --
2470 - * the trampoline code can't handle CR4.PCIDE and
2471 - * it wouldn't do any good anyway. Despite the name,
2472 - * cr4_set_bits_and_update_boot() doesn't actually
2473 - * cause the bits in question to remain set all the
2474 - * way through the secondary boot asm.
2475 - *
2476 - * Instead, we brute-force it and set CR4.PCIDE
2477 - * manually in start_secondary().
2478 - */
2479 - cr4_set_bits(X86_CR4_PCIDE);
2480 - } else {
2481 - /*
2482 - * flush_tlb_all(), as currently implemented, won't
2483 - * work if PCID is on but PGE is not. Since that
2484 - * combination doesn't exist on real hardware, there's
2485 - * no reason to try to fully support it, but it's
2486 - * polite to avoid corrupting data if we're on
2487 - * an improperly configured VM.
2488 - */
2489 - setup_clear_cpu_cap(X86_FEATURE_PCID);
2490 - }
2491 + if (!IS_ENABLED(CONFIG_X86_64))
2492 + return;
2493 +
2494 + if (!boot_cpu_has(X86_FEATURE_PCID))
2495 + return;
2496 +
2497 + if (boot_cpu_has(X86_FEATURE_PGE)) {
2498 + /*
2499 + * This can't be cr4_set_bits_and_update_boot() -- the
2500 + * trampoline code can't handle CR4.PCIDE and it wouldn't
2501 + * do any good anyway. Despite the name,
2502 + * cr4_set_bits_and_update_boot() doesn't actually cause
2503 + * the bits in question to remain set all the way through
2504 + * the secondary boot asm.
2505 + *
2506 + * Instead, we brute-force it and set CR4.PCIDE manually in
2507 + * start_secondary().
2508 + */
2509 + cr4_set_bits(X86_CR4_PCIDE);
2510 +
2511 + /*
2512 + * INVPCID's single-context modes (2/3) only work if we set
2513 + * X86_CR4_PCIDE, *and* we have INVPCID support. It's unusable
2514 + * on systems that have X86_CR4_PCIDE clear, or that have
2515 + * no INVPCID support at all.
2516 + */
2517 + if (boot_cpu_has(X86_FEATURE_INVPCID))
2518 + setup_force_cpu_cap(X86_FEATURE_INVPCID_SINGLE);
2519 + } else {
2520 + /*
2521 + * flush_tlb_all(), as currently implemented, won't work if
2522 + * PCID is on but PGE is not. Since that combination
2523 + * doesn't exist on real hardware, there's no reason to try
2524 + * to fully support it, but it's polite to avoid corrupting
2525 + * data if we're on an improperly configured VM.
2526 + */
2527 + setup_clear_cpu_cap(X86_FEATURE_PCID);
2528 }
2529 -#endif
2530 }
2531
2532 #ifdef CONFIG_X86_32
2533 @@ -624,6 +641,7 @@ void __init init_mem_mapping(void)
2534 {
2535 unsigned long end;
2536
2537 + pti_check_boottime_disable();
2538 probe_page_size_mask();
2539 setup_pcid();
2540
2541 @@ -847,7 +865,7 @@ void __init zone_sizes_init(void)
2542 free_area_init_nodes(max_zone_pfns);
2543 }
2544
2545 -DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
2546 +__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
2547 .loaded_mm = &init_mm,
2548 .next_asid = 1,
2549 .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
2550 diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
2551 index 17ebc5a978cc..9b7bcbd33cc2 100644
2552 --- a/arch/x86/mm/pgtable.c
2553 +++ b/arch/x86/mm/pgtable.c
2554 @@ -355,14 +355,15 @@ static inline void _pgd_free(pgd_t *pgd)
2555 kmem_cache_free(pgd_cache, pgd);
2556 }
2557 #else
2558 +
2559 static inline pgd_t *_pgd_alloc(void)
2560 {
2561 - return (pgd_t *)__get_free_page(PGALLOC_GFP);
2562 + return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);
2563 }
2564
2565 static inline void _pgd_free(pgd_t *pgd)
2566 {
2567 - free_page((unsigned long)pgd);
2568 + free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
2569 }
2570 #endif /* CONFIG_X86_PAE */
2571
2572 diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
2573 new file mode 100644
2574 index 000000000000..bce8aea65606
2575 --- /dev/null
2576 +++ b/arch/x86/mm/pti.c
2577 @@ -0,0 +1,387 @@
2578 +/*
2579 + * Copyright(c) 2017 Intel Corporation. All rights reserved.
2580 + *
2581 + * This program is free software; you can redistribute it and/or modify
2582 + * it under the terms of version 2 of the GNU General Public License as
2583 + * published by the Free Software Foundation.
2584 + *
2585 + * This program is distributed in the hope that it will be useful, but
2586 + * WITHOUT ANY WARRANTY; without even the implied warranty of
2587 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
2588 + * General Public License for more details.
2589 + *
2590 + * This code is based in part on work published here:
2591 + *
2592 + * https://github.com/IAIK/KAISER
2593 + *
2594 + * The original work was written and signed off for the Linux
2595 + * kernel by:
2596 + *
2597 + * Signed-off-by: Richard Fellner <richard.fellner@student.tugraz.at>
2598 + * Signed-off-by: Moritz Lipp <moritz.lipp@iaik.tugraz.at>
2599 + * Signed-off-by: Daniel Gruss <daniel.gruss@iaik.tugraz.at>
2600 + * Signed-off-by: Michael Schwarz <michael.schwarz@iaik.tugraz.at>
2601 + *
2602 + * Major changes to the original code by: Dave Hansen <dave.hansen@intel.com>
2603 + * Mostly rewritten by Thomas Gleixner <tglx@linutronix.de> and
2604 + * Andy Lutomirsky <luto@amacapital.net>
2605 + */
2606 +#include <linux/kernel.h>
2607 +#include <linux/errno.h>
2608 +#include <linux/string.h>
2609 +#include <linux/types.h>
2610 +#include <linux/bug.h>
2611 +#include <linux/init.h>
2612 +#include <linux/spinlock.h>
2613 +#include <linux/mm.h>
2614 +#include <linux/uaccess.h>
2615 +
2616 +#include <asm/cpufeature.h>
2617 +#include <asm/hypervisor.h>
2618 +#include <asm/vsyscall.h>
2619 +#include <asm/cmdline.h>
2620 +#include <asm/pti.h>
2621 +#include <asm/pgtable.h>
2622 +#include <asm/pgalloc.h>
2623 +#include <asm/tlbflush.h>
2624 +#include <asm/desc.h>
2625 +
2626 +#undef pr_fmt
2627 +#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt
2628 +
2629 +/* Backporting helper */
2630 +#ifndef __GFP_NOTRACK
2631 +#define __GFP_NOTRACK 0
2632 +#endif
2633 +
2634 +static void __init pti_print_if_insecure(const char *reason)
2635 +{
2636 + if (boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
2637 + pr_info("%s\n", reason);
2638 +}
2639 +
2640 +static void __init pti_print_if_secure(const char *reason)
2641 +{
2642 + if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
2643 + pr_info("%s\n", reason);
2644 +}
2645 +
2646 +void __init pti_check_boottime_disable(void)
2647 +{
2648 + char arg[5];
2649 + int ret;
2650 +
2651 + if (hypervisor_is_type(X86_HYPER_XEN_PV)) {
2652 + pti_print_if_insecure("disabled on XEN PV.");
2653 + return;
2654 + }
2655 +
2656 + ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
2657 + if (ret > 0) {
2658 + if (ret == 3 && !strncmp(arg, "off", 3)) {
2659 + pti_print_if_insecure("disabled on command line.");
2660 + return;
2661 + }
2662 + if (ret == 2 && !strncmp(arg, "on", 2)) {
2663 + pti_print_if_secure("force enabled on command line.");
2664 + goto enable;
2665 + }
2666 + if (ret == 4 && !strncmp(arg, "auto", 4))
2667 + goto autosel;
2668 + }
2669 +
2670 + if (cmdline_find_option_bool(boot_command_line, "nopti")) {
2671 + pti_print_if_insecure("disabled on command line.");
2672 + return;
2673 + }
2674 +
2675 +autosel:
2676 + if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
2677 + return;
2678 +enable:
2679 + setup_force_cpu_cap(X86_FEATURE_PTI);
2680 +}
2681 +
2682 +pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
2683 +{
2684 + /*
2685 + * Changes to the high (kernel) portion of the kernelmode page
2686 + * tables are not automatically propagated to the usermode tables.
2687 + *
2688 + * Users should keep in mind that, unlike the kernelmode tables,
2689 + * there is no vmalloc_fault equivalent for the usermode tables.
2690 + * Top-level entries added to init_mm's usermode pgd after boot
2691 + * will not be automatically propagated to other mms.
2692 + */
2693 + if (!pgdp_maps_userspace(pgdp))
2694 + return pgd;
2695 +
2696 + /*
2697 + * The user page tables get the full PGD, accessible from
2698 + * userspace:
2699 + */
2700 + kernel_to_user_pgdp(pgdp)->pgd = pgd.pgd;
2701 +
2702 + /*
2703 + * If this is normal user memory, make it NX in the kernel
2704 + * pagetables so that, if we somehow screw up and return to
2705 + * usermode with the kernel CR3 loaded, we'll get a page fault
2706 + * instead of allowing user code to execute with the wrong CR3.
2707 + *
2708 + * As exceptions, we don't set NX if:
2709 + * - _PAGE_USER is not set. This could be an executable
2710 + * EFI runtime mapping or something similar, and the kernel
2711 + * may execute from it
2712 + * - we don't have NX support
2713 + * - we're clearing the PGD (i.e. the new pgd is not present).
2714 + */
2715 + if ((pgd.pgd & (_PAGE_USER|_PAGE_PRESENT)) == (_PAGE_USER|_PAGE_PRESENT) &&
2716 + (__supported_pte_mask & _PAGE_NX))
2717 + pgd.pgd |= _PAGE_NX;
2718 +
2719 + /* return the copy of the PGD we want the kernel to use: */
2720 + return pgd;
2721 +}
2722 +
2723 +/*
2724 + * Walk the user copy of the page tables (optionally) trying to allocate
2725 + * page table pages on the way down.
2726 + *
2727 + * Returns a pointer to a P4D on success, or NULL on failure.
2728 + */
2729 +static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
2730 +{
2731 + pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address));
2732 + gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
2733 +
2734 + if (address < PAGE_OFFSET) {
2735 + WARN_ONCE(1, "attempt to walk user address\n");
2736 + return NULL;
2737 + }
2738 +
2739 + if (pgd_none(*pgd)) {
2740 + unsigned long new_p4d_page = __get_free_page(gfp);
2741 + if (!new_p4d_page)
2742 + return NULL;
2743 +
2744 + if (pgd_none(*pgd)) {
2745 + set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page)));
2746 + new_p4d_page = 0;
2747 + }
2748 + if (new_p4d_page)
2749 + free_page(new_p4d_page);
2750 + }
2751 + BUILD_BUG_ON(pgd_large(*pgd) != 0);
2752 +
2753 + return p4d_offset(pgd, address);
2754 +}
2755 +
2756 +/*
2757 + * Walk the user copy of the page tables (optionally) trying to allocate
2758 + * page table pages on the way down.
2759 + *
2760 + * Returns a pointer to a PMD on success, or NULL on failure.
2761 + */
2762 +static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
2763 +{
2764 + gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
2765 + p4d_t *p4d = pti_user_pagetable_walk_p4d(address);
2766 + pud_t *pud;
2767 +
2768 + BUILD_BUG_ON(p4d_large(*p4d) != 0);
2769 + if (p4d_none(*p4d)) {
2770 + unsigned long new_pud_page = __get_free_page(gfp);
2771 + if (!new_pud_page)
2772 + return NULL;
2773 +
2774 + if (p4d_none(*p4d)) {
2775 + set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
2776 + new_pud_page = 0;
2777 + }
2778 + if (new_pud_page)
2779 + free_page(new_pud_page);
2780 + }
2781 +
2782 + pud = pud_offset(p4d, address);
2783 + /* The user page tables do not use large mappings: */
2784 + if (pud_large(*pud)) {
2785 + WARN_ON(1);
2786 + return NULL;
2787 + }
2788 + if (pud_none(*pud)) {
2789 + unsigned long new_pmd_page = __get_free_page(gfp);
2790 + if (!new_pmd_page)
2791 + return NULL;
2792 +
2793 + if (pud_none(*pud)) {
2794 + set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
2795 + new_pmd_page = 0;
2796 + }
2797 + if (new_pmd_page)
2798 + free_page(new_pmd_page);
2799 + }
2800 +
2801 + return pmd_offset(pud, address);
2802 +}
2803 +
2804 +#ifdef CONFIG_X86_VSYSCALL_EMULATION
2805 +/*
2806 + * Walk the shadow copy of the page tables (optionally) trying to allocate
2807 + * page table pages on the way down. Does not support large pages.
2808 + *
2809 + * Note: this is only used when mapping *new* kernel data into the
2810 + * user/shadow page tables. It is never used for userspace data.
2811 + *
2812 + * Returns a pointer to a PTE on success, or NULL on failure.
2813 + */
2814 +static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address)
2815 +{
2816 + gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
2817 + pmd_t *pmd = pti_user_pagetable_walk_pmd(address);
2818 + pte_t *pte;
2819 +
2820 + /* We can't do anything sensible if we hit a large mapping. */
2821 + if (pmd_large(*pmd)) {
2822 + WARN_ON(1);
2823 + return NULL;
2824 + }
2825 +
2826 + if (pmd_none(*pmd)) {
2827 + unsigned long new_pte_page = __get_free_page(gfp);
2828 + if (!new_pte_page)
2829 + return NULL;
2830 +
2831 + if (pmd_none(*pmd)) {
2832 + set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
2833 + new_pte_page = 0;
2834 + }
2835 + if (new_pte_page)
2836 + free_page(new_pte_page);
2837 + }
2838 +
2839 + pte = pte_offset_kernel(pmd, address);
2840 + if (pte_flags(*pte) & _PAGE_USER) {
2841 + WARN_ONCE(1, "attempt to walk to user pte\n");
2842 + return NULL;
2843 + }
2844 + return pte;
2845 +}
2846 +
2847 +static void __init pti_setup_vsyscall(void)
2848 +{
2849 + pte_t *pte, *target_pte;
2850 + unsigned int level;
2851 +
2852 + pte = lookup_address(VSYSCALL_ADDR, &level);
2853 + if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte))
2854 + return;
2855 +
2856 + target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR);
2857 + if (WARN_ON(!target_pte))
2858 + return;
2859 +
2860 + *target_pte = *pte;
2861 + set_vsyscall_pgtable_user_bits(kernel_to_user_pgdp(swapper_pg_dir));
2862 +}
2863 +#else
2864 +static void __init pti_setup_vsyscall(void) { }
2865 +#endif
2866 +
2867 +static void __init
2868 +pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear)
2869 +{
2870 + unsigned long addr;
2871 +
2872 + /*
2873 + * Clone the populated PMDs which cover start to end. These PMD areas
2874 + * can have holes.
2875 + */
2876 + for (addr = start; addr < end; addr += PMD_SIZE) {
2877 + pmd_t *pmd, *target_pmd;
2878 + pgd_t *pgd;
2879 + p4d_t *p4d;
2880 + pud_t *pud;
2881 +
2882 + pgd = pgd_offset_k(addr);
2883 + if (WARN_ON(pgd_none(*pgd)))
2884 + return;
2885 + p4d = p4d_offset(pgd, addr);
2886 + if (WARN_ON(p4d_none(*p4d)))
2887 + return;
2888 + pud = pud_offset(p4d, addr);
2889 + if (pud_none(*pud))
2890 + continue;
2891 + pmd = pmd_offset(pud, addr);
2892 + if (pmd_none(*pmd))
2893 + continue;
2894 +
2895 + target_pmd = pti_user_pagetable_walk_pmd(addr);
2896 + if (WARN_ON(!target_pmd))
2897 + return;
2898 +
2899 + /*
2900 + * Copy the PMD. That is, the kernelmode and usermode
2901 + * tables will share the last-level page tables of this
2902 + * address range
2903 + */
2904 + *target_pmd = pmd_clear_flags(*pmd, clear);
2905 + }
2906 +}
2907 +
2908 +/*
2909 + * Clone a single p4d (i.e. a top-level entry on 4-level systems and a
2910 + * next-level entry on 5-level systems).
2911 + */
2912 +static void __init pti_clone_p4d(unsigned long addr)
2913 +{
2914 + p4d_t *kernel_p4d, *user_p4d;
2915 + pgd_t *kernel_pgd;
2916 +
2917 + user_p4d = pti_user_pagetable_walk_p4d(addr);
2918 + kernel_pgd = pgd_offset_k(addr);
2919 + kernel_p4d = p4d_offset(kernel_pgd, addr);
2920 + *user_p4d = *kernel_p4d;
2921 +}
2922 +
2923 +/*
2924 + * Clone the CPU_ENTRY_AREA into the user space visible page table.
2925 + */
2926 +static void __init pti_clone_user_shared(void)
2927 +{
2928 + pti_clone_p4d(CPU_ENTRY_AREA_BASE);
2929 +}
2930 +
2931 +/*
2932 + * Clone the ESPFIX P4D into the user space visible page table.
2933 + */
2934 +static void __init pti_setup_espfix64(void)
2935 +{
2936 +#ifdef CONFIG_X86_ESPFIX64
2937 + pti_clone_p4d(ESPFIX_BASE_ADDR);
2938 +#endif
2939 +}
2940 +
2941 +/*
2942 + * Clone the populated PMDs of the entry and irqentry text and force them RO.
2943 + */
2944 +static void __init pti_clone_entry_text(void)
2945 +{
2946 + pti_clone_pmds((unsigned long) __entry_text_start,
2947 + (unsigned long) __irqentry_text_end, _PAGE_RW);
2948 +}
2949 +
2950 +/*
2951 + * Initialize kernel page table isolation
2952 + */
2953 +void __init pti_init(void)
2954 +{
2955 + if (!static_cpu_has(X86_FEATURE_PTI))
2956 + return;
2957 +
2958 + pr_info("enabled\n");
2959 +
2960 + pti_clone_user_shared();
2961 + pti_clone_entry_text();
2962 + pti_setup_espfix64();
2963 + pti_setup_vsyscall();
2964 +}
2965 diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
2966 index 0a1be3adc97e..a1561957dccb 100644
2967 --- a/arch/x86/mm/tlb.c
2968 +++ b/arch/x86/mm/tlb.c
2969 @@ -28,6 +28,38 @@
2970 * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
2971 */
2972
2973 +/*
2974 + * We get here when we do something requiring a TLB invalidation
2975 + * but could not go invalidate all of the contexts. We do the
2976 + * necessary invalidation by clearing out the 'ctx_id' which
2977 + * forces a TLB flush when the context is loaded.
2978 + */
2979 +void clear_asid_other(void)
2980 +{
2981 + u16 asid;
2982 +
2983 + /*
2984 + * This is only expected to be set if we have disabled
2985 + * kernel _PAGE_GLOBAL pages.
2986 + */
2987 + if (!static_cpu_has(X86_FEATURE_PTI)) {
2988 + WARN_ON_ONCE(1);
2989 + return;
2990 + }
2991 +
2992 + for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
2993 + /* Do not need to flush the current asid */
2994 + if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid))
2995 + continue;
2996 + /*
2997 + * Make sure the next time we go to switch to
2998 + * this asid, we do a flush:
2999 + */
3000 + this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, 0);
3001 + }
3002 + this_cpu_write(cpu_tlbstate.invalidate_other, false);
3003 +}
3004 +
3005 atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
3006
3007
3008 @@ -42,6 +74,9 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
3009 return;
3010 }
3011
3012 + if (this_cpu_read(cpu_tlbstate.invalidate_other))
3013 + clear_asid_other();
3014 +
3015 for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
3016 if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
3017 next->context.ctx_id)
3018 @@ -65,6 +100,25 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
3019 *need_flush = true;
3020 }
3021
3022 +static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
3023 +{
3024 + unsigned long new_mm_cr3;
3025 +
3026 + if (need_flush) {
3027 + invalidate_user_asid(new_asid);
3028 + new_mm_cr3 = build_cr3(pgdir, new_asid);
3029 + } else {
3030 + new_mm_cr3 = build_cr3_noflush(pgdir, new_asid);
3031 + }
3032 +
3033 + /*
3034 + * Caution: many callers of this function expect
3035 + * that load_cr3() is serializing and orders TLB
3036 + * fills with respect to the mm_cpumask writes.
3037 + */
3038 + write_cr3(new_mm_cr3);
3039 +}
3040 +
3041 void leave_mm(int cpu)
3042 {
3043 struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
3044 @@ -195,7 +249,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
3045 if (need_flush) {
3046 this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
3047 this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
3048 - write_cr3(build_cr3(next->pgd, new_asid));
3049 + load_new_mm_cr3(next->pgd, new_asid, true);
3050
3051 /*
3052 * NB: This gets called via leave_mm() in the idle path
3053 @@ -208,7 +262,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
3054 trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
3055 } else {
3056 /* The new ASID is already up to date. */
3057 - write_cr3(build_cr3_noflush(next->pgd, new_asid));
3058 + load_new_mm_cr3(next->pgd, new_asid, false);
3059
3060 /* See above wrt _rcuidle. */
3061 trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
3062 diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
3063 index 20fb31579b69..39c4b35ac7a4 100644
3064 --- a/arch/x86/platform/efi/efi_64.c
3065 +++ b/arch/x86/platform/efi/efi_64.c
3066 @@ -195,6 +195,9 @@ static pgd_t *efi_pgd;
3067 * because we want to avoid inserting EFI region mappings (EFI_VA_END
3068 * to EFI_VA_START) into the standard kernel page tables. Everything
3069 * else can be shared, see efi_sync_low_kernel_mappings().
3070 + *
3071 + * We don't want the pgd on the pgd_list and cannot use pgd_alloc() for the
3072 + * allocation.
3073 */
3074 int __init efi_alloc_page_tables(void)
3075 {
3076 @@ -207,7 +210,7 @@ int __init efi_alloc_page_tables(void)
3077 return 0;
3078
3079 gfp_mask = GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO;
3080 - efi_pgd = (pgd_t *)__get_free_page(gfp_mask);
3081 + efi_pgd = (pgd_t *)__get_free_pages(gfp_mask, PGD_ALLOCATION_ORDER);
3082 if (!efi_pgd)
3083 return -ENOMEM;
3084
3085 diff --git a/block/blk-map.c b/block/blk-map.c
3086 index d5251edcc0dd..368daa02714e 100644
3087 --- a/block/blk-map.c
3088 +++ b/block/blk-map.c
3089 @@ -12,22 +12,29 @@
3090 #include "blk.h"
3091
3092 /*
3093 - * Append a bio to a passthrough request. Only works can be merged into
3094 - * the request based on the driver constraints.
3095 + * Append a bio to a passthrough request. Only works if the bio can be merged
3096 + * into the request based on the driver constraints.
3097 */
3098 -int blk_rq_append_bio(struct request *rq, struct bio *bio)
3099 +int blk_rq_append_bio(struct request *rq, struct bio **bio)
3100 {
3101 - blk_queue_bounce(rq->q, &bio);
3102 + struct bio *orig_bio = *bio;
3103 +
3104 + blk_queue_bounce(rq->q, bio);
3105
3106 if (!rq->bio) {
3107 - blk_rq_bio_prep(rq->q, rq, bio);
3108 + blk_rq_bio_prep(rq->q, rq, *bio);
3109 } else {
3110 - if (!ll_back_merge_fn(rq->q, rq, bio))
3111 + if (!ll_back_merge_fn(rq->q, rq, *bio)) {
3112 + if (orig_bio != *bio) {
3113 + bio_put(*bio);
3114 + *bio = orig_bio;
3115 + }
3116 return -EINVAL;
3117 + }
3118
3119 - rq->biotail->bi_next = bio;
3120 - rq->biotail = bio;
3121 - rq->__data_len += bio->bi_iter.bi_size;
3122 + rq->biotail->bi_next = *bio;
3123 + rq->biotail = *bio;
3124 + rq->__data_len += (*bio)->bi_iter.bi_size;
3125 }
3126
3127 return 0;
3128 @@ -80,14 +87,12 @@ static int __blk_rq_map_user_iov(struct request *rq,
3129 * We link the bounce buffer in and could have to traverse it
3130 * later so we have to get a ref to prevent it from being freed
3131 */
3132 - ret = blk_rq_append_bio(rq, bio);
3133 - bio_get(bio);
3134 + ret = blk_rq_append_bio(rq, &bio);
3135 if (ret) {
3136 - bio_endio(bio);
3137 __blk_rq_unmap_user(orig_bio);
3138 - bio_put(bio);
3139 return ret;
3140 }
3141 + bio_get(bio);
3142
3143 return 0;
3144 }
3145 @@ -220,7 +225,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
3146 int reading = rq_data_dir(rq) == READ;
3147 unsigned long addr = (unsigned long) kbuf;
3148 int do_copy = 0;
3149 - struct bio *bio;
3150 + struct bio *bio, *orig_bio;
3151 int ret;
3152
3153 if (len > (queue_max_hw_sectors(q) << 9))
3154 @@ -243,10 +248,11 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
3155 if (do_copy)
3156 rq->rq_flags |= RQF_COPY_USER;
3157
3158 - ret = blk_rq_append_bio(rq, bio);
3159 + orig_bio = bio;
3160 + ret = blk_rq_append_bio(rq, &bio);
3161 if (unlikely(ret)) {
3162 /* request is too big */
3163 - bio_put(bio);
3164 + bio_put(orig_bio);
3165 return ret;
3166 }
3167
3168 diff --git a/block/bounce.c b/block/bounce.c
3169 index fceb1a96480b..1d05c422c932 100644
3170 --- a/block/bounce.c
3171 +++ b/block/bounce.c
3172 @@ -200,6 +200,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
3173 unsigned i = 0;
3174 bool bounce = false;
3175 int sectors = 0;
3176 + bool passthrough = bio_is_passthrough(*bio_orig);
3177
3178 bio_for_each_segment(from, *bio_orig, iter) {
3179 if (i++ < BIO_MAX_PAGES)
3180 @@ -210,13 +211,14 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
3181 if (!bounce)
3182 return;
3183
3184 - if (sectors < bio_sectors(*bio_orig)) {
3185 + if (!passthrough && sectors < bio_sectors(*bio_orig)) {
3186 bio = bio_split(*bio_orig, sectors, GFP_NOIO, bounce_bio_split);
3187 bio_chain(bio, *bio_orig);
3188 generic_make_request(*bio_orig);
3189 *bio_orig = bio;
3190 }
3191 - bio = bio_clone_bioset(*bio_orig, GFP_NOIO, bounce_bio_set);
3192 + bio = bio_clone_bioset(*bio_orig, GFP_NOIO, passthrough ? NULL :
3193 + bounce_bio_set);
3194
3195 bio_for_each_segment_all(to, bio, i) {
3196 struct page *page = to->bv_page;
3197 diff --git a/drivers/android/binder.c b/drivers/android/binder.c
3198 index 88b4bbe58100..a340766b51fe 100644
3199 --- a/drivers/android/binder.c
3200 +++ b/drivers/android/binder.c
3201 @@ -482,7 +482,8 @@ enum binder_deferred_state {
3202 * @tsk task_struct for group_leader of process
3203 * (invariant after initialized)
3204 * @files files_struct for process
3205 - * (invariant after initialized)
3206 + * (protected by @files_lock)
3207 + * @files_lock mutex to protect @files
3208 * @deferred_work_node: element for binder_deferred_list
3209 * (protected by binder_deferred_lock)
3210 * @deferred_work: bitmap of deferred work to perform
3211 @@ -530,6 +531,7 @@ struct binder_proc {
3212 int pid;
3213 struct task_struct *tsk;
3214 struct files_struct *files;
3215 + struct mutex files_lock;
3216 struct hlist_node deferred_work_node;
3217 int deferred_work;
3218 bool is_dead;
3219 @@ -877,20 +879,26 @@ static void binder_inc_node_tmpref_ilocked(struct binder_node *node);
3220
3221 static int task_get_unused_fd_flags(struct binder_proc *proc, int flags)
3222 {
3223 - struct files_struct *files = proc->files;
3224 unsigned long rlim_cur;
3225 unsigned long irqs;
3226 + int ret;
3227
3228 - if (files == NULL)
3229 - return -ESRCH;
3230 -
3231 - if (!lock_task_sighand(proc->tsk, &irqs))
3232 - return -EMFILE;
3233 -
3234 + mutex_lock(&proc->files_lock);
3235 + if (proc->files == NULL) {
3236 + ret = -ESRCH;
3237 + goto err;
3238 + }
3239 + if (!lock_task_sighand(proc->tsk, &irqs)) {
3240 + ret = -EMFILE;
3241 + goto err;
3242 + }
3243 rlim_cur = task_rlimit(proc->tsk, RLIMIT_NOFILE);
3244 unlock_task_sighand(proc->tsk, &irqs);
3245
3246 - return __alloc_fd(files, 0, rlim_cur, flags);
3247 + ret = __alloc_fd(proc->files, 0, rlim_cur, flags);
3248 +err:
3249 + mutex_unlock(&proc->files_lock);
3250 + return ret;
3251 }
3252
3253 /*
3254 @@ -899,8 +907,10 @@ static int task_get_unused_fd_flags(struct binder_proc *proc, int flags)
3255 static void task_fd_install(
3256 struct binder_proc *proc, unsigned int fd, struct file *file)
3257 {
3258 + mutex_lock(&proc->files_lock);
3259 if (proc->files)
3260 __fd_install(proc->files, fd, file);
3261 + mutex_unlock(&proc->files_lock);
3262 }
3263
3264 /*
3265 @@ -910,9 +920,11 @@ static long task_close_fd(struct binder_proc *proc, unsigned int fd)
3266 {
3267 int retval;
3268
3269 - if (proc->files == NULL)
3270 - return -ESRCH;
3271 -
3272 + mutex_lock(&proc->files_lock);
3273 + if (proc->files == NULL) {
3274 + retval = -ESRCH;
3275 + goto err;
3276 + }
3277 retval = __close_fd(proc->files, fd);
3278 /* can't restart close syscall because file table entry was cleared */
3279 if (unlikely(retval == -ERESTARTSYS ||
3280 @@ -920,7 +932,8 @@ static long task_close_fd(struct binder_proc *proc, unsigned int fd)
3281 retval == -ERESTARTNOHAND ||
3282 retval == -ERESTART_RESTARTBLOCK))
3283 retval = -EINTR;
3284 -
3285 +err:
3286 + mutex_unlock(&proc->files_lock);
3287 return retval;
3288 }
3289
3290 @@ -4627,7 +4640,9 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma)
3291 ret = binder_alloc_mmap_handler(&proc->alloc, vma);
3292 if (ret)
3293 return ret;
3294 + mutex_lock(&proc->files_lock);
3295 proc->files = get_files_struct(current);
3296 + mutex_unlock(&proc->files_lock);
3297 return 0;
3298
3299 err_bad_arg:
3300 @@ -4651,6 +4666,7 @@ static int binder_open(struct inode *nodp, struct file *filp)
3301 spin_lock_init(&proc->outer_lock);
3302 get_task_struct(current->group_leader);
3303 proc->tsk = current->group_leader;
3304 + mutex_init(&proc->files_lock);
3305 INIT_LIST_HEAD(&proc->todo);
3306 proc->default_priority = task_nice(current);
3307 binder_dev = container_of(filp->private_data, struct binder_device,
3308 @@ -4903,9 +4919,11 @@ static void binder_deferred_func(struct work_struct *work)
3309
3310 files = NULL;
3311 if (defer & BINDER_DEFERRED_PUT_FILES) {
3312 + mutex_lock(&proc->files_lock);
3313 files = proc->files;
3314 if (files)
3315 proc->files = NULL;
3316 + mutex_unlock(&proc->files_lock);
3317 }
3318
3319 if (defer & BINDER_DEFERRED_FLUSH)
3320 diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c
3321 index eb3af2739537..07532d83be0b 100644
3322 --- a/drivers/base/cacheinfo.c
3323 +++ b/drivers/base/cacheinfo.c
3324 @@ -186,6 +186,11 @@ static void cache_associativity(struct cacheinfo *this_leaf)
3325 this_leaf->ways_of_associativity = (size / nr_sets) / line_size;
3326 }
3327
3328 +static bool cache_node_is_unified(struct cacheinfo *this_leaf)
3329 +{
3330 + return of_property_read_bool(this_leaf->of_node, "cache-unified");
3331 +}
3332 +
3333 static void cache_of_override_properties(unsigned int cpu)
3334 {
3335 int index;
3336 @@ -194,6 +199,14 @@ static void cache_of_override_properties(unsigned int cpu)
3337
3338 for (index = 0; index < cache_leaves(cpu); index++) {
3339 this_leaf = this_cpu_ci->info_list + index;
3340 + /*
3341 + * init_cache_level must set up the cache level correctly
3342 + * overriding the architecturally specified levels, so
3343 + * if type is NONE at this stage, it should be unified
3344 + */
3345 + if (this_leaf->type == CACHE_TYPE_NOCACHE &&
3346 + cache_node_is_unified(this_leaf))
3347 + this_leaf->type = CACHE_TYPE_UNIFIED;
3348 cache_size(this_leaf);
3349 cache_get_line_size(this_leaf);
3350 cache_nr_sets(this_leaf);
3351 diff --git a/drivers/gpio/gpiolib-acpi.c b/drivers/gpio/gpiolib-acpi.c
3352 index eb4528c87c0b..d6f3d9ee1350 100644
3353 --- a/drivers/gpio/gpiolib-acpi.c
3354 +++ b/drivers/gpio/gpiolib-acpi.c
3355 @@ -1074,7 +1074,7 @@ void acpi_gpiochip_add(struct gpio_chip *chip)
3356 }
3357
3358 if (!chip->names)
3359 - devprop_gpiochip_set_names(chip);
3360 + devprop_gpiochip_set_names(chip, dev_fwnode(chip->parent));
3361
3362 acpi_gpiochip_request_regions(acpi_gpio);
3363 acpi_gpiochip_scan_gpios(acpi_gpio);
3364 diff --git a/drivers/gpio/gpiolib-devprop.c b/drivers/gpio/gpiolib-devprop.c
3365 index 27f383bda7d9..f748aa3e77f7 100644
3366 --- a/drivers/gpio/gpiolib-devprop.c
3367 +++ b/drivers/gpio/gpiolib-devprop.c
3368 @@ -19,30 +19,27 @@
3369 /**
3370 * devprop_gpiochip_set_names - Set GPIO line names using device properties
3371 * @chip: GPIO chip whose lines should be named, if possible
3372 + * @fwnode: Property Node containing the gpio-line-names property
3373 *
3374 * Looks for device property "gpio-line-names" and if it exists assigns
3375 * GPIO line names for the chip. The memory allocated for the assigned
3376 * names belong to the underlying firmware node and should not be released
3377 * by the caller.
3378 */
3379 -void devprop_gpiochip_set_names(struct gpio_chip *chip)
3380 +void devprop_gpiochip_set_names(struct gpio_chip *chip,
3381 + const struct fwnode_handle *fwnode)
3382 {
3383 struct gpio_device *gdev = chip->gpiodev;
3384 const char **names;
3385 int ret, i;
3386
3387 - if (!chip->parent) {
3388 - dev_warn(&gdev->dev, "GPIO chip parent is NULL\n");
3389 - return;
3390 - }
3391 -
3392 - ret = device_property_read_string_array(chip->parent, "gpio-line-names",
3393 + ret = fwnode_property_read_string_array(fwnode, "gpio-line-names",
3394 NULL, 0);
3395 if (ret < 0)
3396 return;
3397
3398 if (ret != gdev->ngpio) {
3399 - dev_warn(chip->parent,
3400 + dev_warn(&gdev->dev,
3401 "names %d do not match number of GPIOs %d\n", ret,
3402 gdev->ngpio);
3403 return;
3404 @@ -52,10 +49,10 @@ void devprop_gpiochip_set_names(struct gpio_chip *chip)
3405 if (!names)
3406 return;
3407
3408 - ret = device_property_read_string_array(chip->parent, "gpio-line-names",
3409 + ret = fwnode_property_read_string_array(fwnode, "gpio-line-names",
3410 names, gdev->ngpio);
3411 if (ret < 0) {
3412 - dev_warn(chip->parent, "failed to read GPIO line names\n");
3413 + dev_warn(&gdev->dev, "failed to read GPIO line names\n");
3414 kfree(names);
3415 return;
3416 }
3417 diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c
3418 index bfcd20699ec8..ba38f530e403 100644
3419 --- a/drivers/gpio/gpiolib-of.c
3420 +++ b/drivers/gpio/gpiolib-of.c
3421 @@ -493,7 +493,8 @@ int of_gpiochip_add(struct gpio_chip *chip)
3422
3423 /* If the chip defines names itself, these take precedence */
3424 if (!chip->names)
3425 - devprop_gpiochip_set_names(chip);
3426 + devprop_gpiochip_set_names(chip,
3427 + of_fwnode_handle(chip->of_node));
3428
3429 of_node_get(chip->of_node);
3430
3431 diff --git a/drivers/gpio/gpiolib.h b/drivers/gpio/gpiolib.h
3432 index d003ccb12781..3d4d0634c9dd 100644
3433 --- a/drivers/gpio/gpiolib.h
3434 +++ b/drivers/gpio/gpiolib.h
3435 @@ -224,7 +224,8 @@ static inline int gpio_chip_hwgpio(const struct gpio_desc *desc)
3436 return desc - &desc->gdev->descs[0];
3437 }
3438
3439 -void devprop_gpiochip_set_names(struct gpio_chip *chip);
3440 +void devprop_gpiochip_set_names(struct gpio_chip *chip,
3441 + const struct fwnode_handle *fwnode);
3442
3443 /* With descriptor prefix */
3444
3445 diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c
3446 index feafdb961c48..59b2f96d986a 100644
3447 --- a/drivers/infiniband/core/security.c
3448 +++ b/drivers/infiniband/core/security.c
3449 @@ -386,6 +386,9 @@ int ib_open_shared_qp_security(struct ib_qp *qp, struct ib_device *dev)
3450 if (ret)
3451 return ret;
3452
3453 + if (!qp->qp_sec)
3454 + return 0;
3455 +
3456 mutex_lock(&real_qp->qp_sec->mutex);
3457 ret = check_qp_port_pkey_settings(real_qp->qp_sec->ports_pkeys,
3458 qp->qp_sec);
3459 diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
3460 index d8f540054392..93c1a57dbff1 100644
3461 --- a/drivers/infiniband/core/uverbs_cmd.c
3462 +++ b/drivers/infiniband/core/uverbs_cmd.c
3463 @@ -2085,8 +2085,8 @@ int ib_uverbs_ex_modify_qp(struct ib_uverbs_file *file,
3464 return -EOPNOTSUPP;
3465
3466 if (ucore->inlen > sizeof(cmd)) {
3467 - if (ib_is_udata_cleared(ucore, sizeof(cmd),
3468 - ucore->inlen - sizeof(cmd)))
3469 + if (!ib_is_udata_cleared(ucore, sizeof(cmd),
3470 + ucore->inlen - sizeof(cmd)))
3471 return -EOPNOTSUPP;
3472 }
3473
3474 diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
3475 index de57d6c11a25..9032f77cc38d 100644
3476 --- a/drivers/infiniband/core/verbs.c
3477 +++ b/drivers/infiniband/core/verbs.c
3478 @@ -1400,7 +1400,8 @@ int ib_close_qp(struct ib_qp *qp)
3479 spin_unlock_irqrestore(&real_qp->device->event_handler_lock, flags);
3480
3481 atomic_dec(&real_qp->usecnt);
3482 - ib_close_shared_qp_security(qp->qp_sec);
3483 + if (qp->qp_sec)
3484 + ib_close_shared_qp_security(qp->qp_sec);
3485 kfree(qp);
3486
3487 return 0;
3488 diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c
3489 index eae8ea81c6e2..514c1000ded1 100644
3490 --- a/drivers/infiniband/hw/cxgb4/cq.c
3491 +++ b/drivers/infiniband/hw/cxgb4/cq.c
3492 @@ -586,10 +586,10 @@ static int poll_cq(struct t4_wq *wq, struct t4_cq *cq, struct t4_cqe *cqe,
3493 ret = -EAGAIN;
3494 goto skip_cqe;
3495 }
3496 - if (unlikely((CQE_WRID_MSN(hw_cqe) != (wq->rq.msn)))) {
3497 + if (unlikely(!CQE_STATUS(hw_cqe) &&
3498 + CQE_WRID_MSN(hw_cqe) != wq->rq.msn)) {
3499 t4_set_wq_in_error(wq);
3500 - hw_cqe->header |= htonl(CQE_STATUS_V(T4_ERR_MSN));
3501 - goto proc_cqe;
3502 + hw_cqe->header |= cpu_to_be32(CQE_STATUS_V(T4_ERR_MSN));
3503 }
3504 goto proc_cqe;
3505 }
3506 diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h
3507 index 6ff44dc606eb..3409eee16092 100644
3508 --- a/drivers/infiniband/hw/hfi1/hfi.h
3509 +++ b/drivers/infiniband/hw/hfi1/hfi.h
3510 @@ -1129,7 +1129,6 @@ struct hfi1_devdata {
3511 u16 pcie_lnkctl;
3512 u16 pcie_devctl2;
3513 u32 pci_msix0;
3514 - u32 pci_lnkctl3;
3515 u32 pci_tph2;
3516
3517 /*
3518 diff --git a/drivers/infiniband/hw/hfi1/pcie.c b/drivers/infiniband/hw/hfi1/pcie.c
3519 index 09e50fd2a08f..8c7e7a60b715 100644
3520 --- a/drivers/infiniband/hw/hfi1/pcie.c
3521 +++ b/drivers/infiniband/hw/hfi1/pcie.c
3522 @@ -411,15 +411,12 @@ int restore_pci_variables(struct hfi1_devdata *dd)
3523 if (ret)
3524 goto error;
3525
3526 - ret = pci_write_config_dword(dd->pcidev, PCIE_CFG_SPCIE1,
3527 - dd->pci_lnkctl3);
3528 - if (ret)
3529 - goto error;
3530 -
3531 - ret = pci_write_config_dword(dd->pcidev, PCIE_CFG_TPH2, dd->pci_tph2);
3532 - if (ret)
3533 - goto error;
3534 -
3535 + if (pci_find_ext_capability(dd->pcidev, PCI_EXT_CAP_ID_TPH)) {
3536 + ret = pci_write_config_dword(dd->pcidev, PCIE_CFG_TPH2,
3537 + dd->pci_tph2);
3538 + if (ret)
3539 + goto error;
3540 + }
3541 return 0;
3542
3543 error:
3544 @@ -469,15 +466,12 @@ int save_pci_variables(struct hfi1_devdata *dd)
3545 if (ret)
3546 goto error;
3547
3548 - ret = pci_read_config_dword(dd->pcidev, PCIE_CFG_SPCIE1,
3549 - &dd->pci_lnkctl3);
3550 - if (ret)
3551 - goto error;
3552 -
3553 - ret = pci_read_config_dword(dd->pcidev, PCIE_CFG_TPH2, &dd->pci_tph2);
3554 - if (ret)
3555 - goto error;
3556 -
3557 + if (pci_find_ext_capability(dd->pcidev, PCI_EXT_CAP_ID_TPH)) {
3558 + ret = pci_read_config_dword(dd->pcidev, PCIE_CFG_TPH2,
3559 + &dd->pci_tph2);
3560 + if (ret)
3561 + goto error;
3562 + }
3563 return 0;
3564
3565 error:
3566 diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
3567 index 5aff1e33d984..30d479f87cb8 100644
3568 --- a/drivers/infiniband/hw/mlx5/main.c
3569 +++ b/drivers/infiniband/hw/mlx5/main.c
3570 @@ -1415,6 +1415,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
3571 }
3572
3573 INIT_LIST_HEAD(&context->vma_private_list);
3574 + mutex_init(&context->vma_private_list_mutex);
3575 INIT_LIST_HEAD(&context->db_page_list);
3576 mutex_init(&context->db_page_mutex);
3577
3578 @@ -1576,7 +1577,9 @@ static void mlx5_ib_vma_close(struct vm_area_struct *area)
3579 * mlx5_ib_disassociate_ucontext().
3580 */
3581 mlx5_ib_vma_priv_data->vma = NULL;
3582 + mutex_lock(mlx5_ib_vma_priv_data->vma_private_list_mutex);
3583 list_del(&mlx5_ib_vma_priv_data->list);
3584 + mutex_unlock(mlx5_ib_vma_priv_data->vma_private_list_mutex);
3585 kfree(mlx5_ib_vma_priv_data);
3586 }
3587
3588 @@ -1596,10 +1599,13 @@ static int mlx5_ib_set_vma_data(struct vm_area_struct *vma,
3589 return -ENOMEM;
3590
3591 vma_prv->vma = vma;
3592 + vma_prv->vma_private_list_mutex = &ctx->vma_private_list_mutex;
3593 vma->vm_private_data = vma_prv;
3594 vma->vm_ops = &mlx5_ib_vm_ops;
3595
3596 + mutex_lock(&ctx->vma_private_list_mutex);
3597 list_add(&vma_prv->list, vma_head);
3598 + mutex_unlock(&ctx->vma_private_list_mutex);
3599
3600 return 0;
3601 }
3602 @@ -1642,6 +1648,7 @@ static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext)
3603 * mlx5_ib_vma_close.
3604 */
3605 down_write(&owning_mm->mmap_sem);
3606 + mutex_lock(&context->vma_private_list_mutex);
3607 list_for_each_entry_safe(vma_private, n, &context->vma_private_list,
3608 list) {
3609 vma = vma_private->vma;
3610 @@ -1656,6 +1663,7 @@ static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext)
3611 list_del(&vma_private->list);
3612 kfree(vma_private);
3613 }
3614 + mutex_unlock(&context->vma_private_list_mutex);
3615 up_write(&owning_mm->mmap_sem);
3616 mmput(owning_mm);
3617 put_task_struct(owning_process);
3618 diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
3619 index 189e80cd6b2f..754103372faa 100644
3620 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
3621 +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
3622 @@ -115,6 +115,8 @@ enum {
3623 struct mlx5_ib_vma_private_data {
3624 struct list_head list;
3625 struct vm_area_struct *vma;
3626 + /* protect vma_private_list add/del */
3627 + struct mutex *vma_private_list_mutex;
3628 };
3629
3630 struct mlx5_ib_ucontext {
3631 @@ -129,6 +131,8 @@ struct mlx5_ib_ucontext {
3632 /* Transport Domain number */
3633 u32 tdn;
3634 struct list_head vma_private_list;
3635 + /* protect vma_private_list add/del */
3636 + struct mutex vma_private_list_mutex;
3637
3638 unsigned long upd_xlt_page;
3639 /* protect ODP/KSM */
3640 diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
3641 index d7b53d53c116..72d6ffbfd638 100644
3642 --- a/drivers/net/dsa/bcm_sf2.c
3643 +++ b/drivers/net/dsa/bcm_sf2.c
3644 @@ -167,7 +167,7 @@ static void bcm_sf2_gphy_enable_set(struct dsa_switch *ds, bool enable)
3645 reg = reg_readl(priv, REG_SPHY_CNTRL);
3646 if (enable) {
3647 reg |= PHY_RESET;
3648 - reg &= ~(EXT_PWR_DOWN | IDDQ_BIAS | CK25_DIS);
3649 + reg &= ~(EXT_PWR_DOWN | IDDQ_BIAS | IDDQ_GLOBAL_PWR | CK25_DIS);
3650 reg_writel(priv, reg, REG_SPHY_CNTRL);
3651 udelay(21);
3652 reg = reg_readl(priv, REG_SPHY_CNTRL);
3653 diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
3654 index dc5de275352a..aa764c5e3c6b 100644
3655 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
3656 +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
3657 @@ -1875,7 +1875,7 @@ static int bnxt_poll_work(struct bnxt *bp, struct bnxt_napi *bnapi, int budget)
3658 * here forever if we consistently cannot allocate
3659 * buffers.
3660 */
3661 - else if (rc == -ENOMEM)
3662 + else if (rc == -ENOMEM && budget)
3663 rx_pkts++;
3664 else if (rc == -EBUSY) /* partial completion */
3665 break;
3666 @@ -1961,7 +1961,7 @@ static int bnxt_poll_nitroa0(struct napi_struct *napi, int budget)
3667 cpu_to_le32(RX_CMPL_ERRORS_CRC_ERROR);
3668
3669 rc = bnxt_rx_pkt(bp, bnapi, &raw_cons, &event);
3670 - if (likely(rc == -EIO))
3671 + if (likely(rc == -EIO) && budget)
3672 rx_pkts++;
3673 else if (rc == -EBUSY) /* partial completion */
3674 break;
3675 diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
3676 index 656e6af70f0a..aef3fcf2f5b9 100644
3677 --- a/drivers/net/ethernet/broadcom/tg3.c
3678 +++ b/drivers/net/ethernet/broadcom/tg3.c
3679 @@ -14227,7 +14227,9 @@ static int tg3_change_mtu(struct net_device *dev, int new_mtu)
3680 /* Reset PHY, otherwise the read DMA engine will be in a mode that
3681 * breaks all requests to 256 bytes.
3682 */
3683 - if (tg3_asic_rev(tp) == ASIC_REV_57766)
3684 + if (tg3_asic_rev(tp) == ASIC_REV_57766 ||
3685 + tg3_asic_rev(tp) == ASIC_REV_5717 ||
3686 + tg3_asic_rev(tp) == ASIC_REV_5719)
3687 reset_phy = true;
3688
3689 err = tg3_restart_hw(tp, reset_phy);
3690 diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
3691 index 3dc2d771a222..faf7cdc97ebf 100644
3692 --- a/drivers/net/ethernet/freescale/fec_main.c
3693 +++ b/drivers/net/ethernet/freescale/fec_main.c
3694 @@ -818,6 +818,12 @@ static void fec_enet_bd_init(struct net_device *dev)
3695 for (i = 0; i < txq->bd.ring_size; i++) {
3696 /* Initialize the BD for every fragment in the page. */
3697 bdp->cbd_sc = cpu_to_fec16(0);
3698 + if (bdp->cbd_bufaddr &&
3699 + !IS_TSO_HEADER(txq, fec32_to_cpu(bdp->cbd_bufaddr)))
3700 + dma_unmap_single(&fep->pdev->dev,
3701 + fec32_to_cpu(bdp->cbd_bufaddr),
3702 + fec16_to_cpu(bdp->cbd_datlen),
3703 + DMA_TO_DEVICE);
3704 if (txq->tx_skbuff[i]) {
3705 dev_kfree_skb_any(txq->tx_skbuff[i]);
3706 txq->tx_skbuff[i] = NULL;
3707 diff --git a/drivers/net/ethernet/marvell/mvmdio.c b/drivers/net/ethernet/marvell/mvmdio.c
3708 index c9798210fa0f..0495487f7b42 100644
3709 --- a/drivers/net/ethernet/marvell/mvmdio.c
3710 +++ b/drivers/net/ethernet/marvell/mvmdio.c
3711 @@ -344,7 +344,8 @@ static int orion_mdio_probe(struct platform_device *pdev)
3712 dev->regs + MVMDIO_ERR_INT_MASK);
3713
3714 } else if (dev->err_interrupt == -EPROBE_DEFER) {
3715 - return -EPROBE_DEFER;
3716 + ret = -EPROBE_DEFER;
3717 + goto out_mdio;
3718 }
3719
3720 if (pdev->dev.of_node)
3721 diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
3722 index 1fffdebbc9e8..e9a1fbcc4adf 100644
3723 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
3724 +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
3725 @@ -362,7 +362,7 @@ static int mlx5_internal_err_ret_value(struct mlx5_core_dev *dev, u16 op,
3726 case MLX5_CMD_OP_QUERY_VPORT_COUNTER:
3727 case MLX5_CMD_OP_ALLOC_Q_COUNTER:
3728 case MLX5_CMD_OP_QUERY_Q_COUNTER:
3729 - case MLX5_CMD_OP_SET_RATE_LIMIT:
3730 + case MLX5_CMD_OP_SET_PP_RATE_LIMIT:
3731 case MLX5_CMD_OP_QUERY_RATE_LIMIT:
3732 case MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT:
3733 case MLX5_CMD_OP_QUERY_SCHEDULING_ELEMENT:
3734 @@ -505,7 +505,7 @@ const char *mlx5_command_str(int command)
3735 MLX5_COMMAND_STR_CASE(ALLOC_Q_COUNTER);
3736 MLX5_COMMAND_STR_CASE(DEALLOC_Q_COUNTER);
3737 MLX5_COMMAND_STR_CASE(QUERY_Q_COUNTER);
3738 - MLX5_COMMAND_STR_CASE(SET_RATE_LIMIT);
3739 + MLX5_COMMAND_STR_CASE(SET_PP_RATE_LIMIT);
3740 MLX5_COMMAND_STR_CASE(QUERY_RATE_LIMIT);
3741 MLX5_COMMAND_STR_CASE(CREATE_SCHEDULING_ELEMENT);
3742 MLX5_COMMAND_STR_CASE(DESTROY_SCHEDULING_ELEMENT);
3743 diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
3744 index 13b5ef9d8703..5fa071620104 100644
3745 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
3746 +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
3747 @@ -590,6 +590,7 @@ struct mlx5e_channel {
3748 struct mlx5_core_dev *mdev;
3749 struct mlx5e_tstamp *tstamp;
3750 int ix;
3751 + int cpu;
3752 };
3753
3754 struct mlx5e_channels {
3755 diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
3756 index cc11bbbd0309..3cdb932cae76 100644
3757 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
3758 +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
3759 @@ -71,11 +71,6 @@ struct mlx5e_channel_param {
3760 struct mlx5e_cq_param icosq_cq;
3761 };
3762
3763 -static int mlx5e_get_node(struct mlx5e_priv *priv, int ix)
3764 -{
3765 - return pci_irq_get_node(priv->mdev->pdev, MLX5_EQ_VEC_COMP_BASE + ix);
3766 -}
3767 -
3768 static bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev)
3769 {
3770 return MLX5_CAP_GEN(mdev, striding_rq) &&
3771 @@ -452,17 +447,16 @@ static int mlx5e_rq_alloc_mpwqe_info(struct mlx5e_rq *rq,
3772 int wq_sz = mlx5_wq_ll_get_size(&rq->wq);
3773 int mtt_sz = mlx5e_get_wqe_mtt_sz();
3774 int mtt_alloc = mtt_sz + MLX5_UMR_ALIGN - 1;
3775 - int node = mlx5e_get_node(c->priv, c->ix);
3776 int i;
3777
3778 rq->mpwqe.info = kzalloc_node(wq_sz * sizeof(*rq->mpwqe.info),
3779 - GFP_KERNEL, node);
3780 + GFP_KERNEL, cpu_to_node(c->cpu));
3781 if (!rq->mpwqe.info)
3782 goto err_out;
3783
3784 /* We allocate more than mtt_sz as we will align the pointer */
3785 - rq->mpwqe.mtt_no_align = kzalloc_node(mtt_alloc * wq_sz,
3786 - GFP_KERNEL, node);
3787 + rq->mpwqe.mtt_no_align = kzalloc_node(mtt_alloc * wq_sz, GFP_KERNEL,
3788 + cpu_to_node(c->cpu));
3789 if (unlikely(!rq->mpwqe.mtt_no_align))
3790 goto err_free_wqe_info;
3791
3792 @@ -570,7 +564,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
3793 int err;
3794 int i;
3795
3796 - rqp->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix);
3797 + rqp->wq.db_numa_node = cpu_to_node(c->cpu);
3798
3799 err = mlx5_wq_ll_create(mdev, &rqp->wq, rqc_wq, &rq->wq,
3800 &rq->wq_ctrl);
3801 @@ -636,8 +630,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
3802 default: /* MLX5_WQ_TYPE_LINKED_LIST */
3803 rq->wqe.frag_info =
3804 kzalloc_node(wq_sz * sizeof(*rq->wqe.frag_info),
3805 - GFP_KERNEL,
3806 - mlx5e_get_node(c->priv, c->ix));
3807 + GFP_KERNEL, cpu_to_node(c->cpu));
3808 if (!rq->wqe.frag_info) {
3809 err = -ENOMEM;
3810 goto err_rq_wq_destroy;
3811 @@ -1007,13 +1000,13 @@ static int mlx5e_alloc_xdpsq(struct mlx5e_channel *c,
3812 sq->uar_map = mdev->mlx5e_res.bfreg.map;
3813 sq->min_inline_mode = params->tx_min_inline_mode;
3814
3815 - param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix);
3816 + param->wq.db_numa_node = cpu_to_node(c->cpu);
3817 err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq, &sq->wq_ctrl);
3818 if (err)
3819 return err;
3820 sq->wq.db = &sq->wq.db[MLX5_SND_DBR];
3821
3822 - err = mlx5e_alloc_xdpsq_db(sq, mlx5e_get_node(c->priv, c->ix));
3823 + err = mlx5e_alloc_xdpsq_db(sq, cpu_to_node(c->cpu));
3824 if (err)
3825 goto err_sq_wq_destroy;
3826
3827 @@ -1060,13 +1053,13 @@ static int mlx5e_alloc_icosq(struct mlx5e_channel *c,
3828 sq->channel = c;
3829 sq->uar_map = mdev->mlx5e_res.bfreg.map;
3830
3831 - param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix);
3832 + param->wq.db_numa_node = cpu_to_node(c->cpu);
3833 err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq, &sq->wq_ctrl);
3834 if (err)
3835 return err;
3836 sq->wq.db = &sq->wq.db[MLX5_SND_DBR];
3837
3838 - err = mlx5e_alloc_icosq_db(sq, mlx5e_get_node(c->priv, c->ix));
3839 + err = mlx5e_alloc_icosq_db(sq, cpu_to_node(c->cpu));
3840 if (err)
3841 goto err_sq_wq_destroy;
3842
3843 @@ -1132,13 +1125,13 @@ static int mlx5e_alloc_txqsq(struct mlx5e_channel *c,
3844 if (MLX5_IPSEC_DEV(c->priv->mdev))
3845 set_bit(MLX5E_SQ_STATE_IPSEC, &sq->state);
3846
3847 - param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix);
3848 + param->wq.db_numa_node = cpu_to_node(c->cpu);
3849 err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq, &sq->wq_ctrl);
3850 if (err)
3851 return err;
3852 sq->wq.db = &sq->wq.db[MLX5_SND_DBR];
3853
3854 - err = mlx5e_alloc_txqsq_db(sq, mlx5e_get_node(c->priv, c->ix));
3855 + err = mlx5e_alloc_txqsq_db(sq, cpu_to_node(c->cpu));
3856 if (err)
3857 goto err_sq_wq_destroy;
3858
3859 @@ -1510,8 +1503,8 @@ static int mlx5e_alloc_cq(struct mlx5e_channel *c,
3860 struct mlx5_core_dev *mdev = c->priv->mdev;
3861 int err;
3862
3863 - param->wq.buf_numa_node = mlx5e_get_node(c->priv, c->ix);
3864 - param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix);
3865 + param->wq.buf_numa_node = cpu_to_node(c->cpu);
3866 + param->wq.db_numa_node = cpu_to_node(c->cpu);
3867 param->eq_ix = c->ix;
3868
3869 err = mlx5e_alloc_cq_common(mdev, param, cq);
3870 @@ -1610,6 +1603,11 @@ static void mlx5e_close_cq(struct mlx5e_cq *cq)
3871 mlx5e_free_cq(cq);
3872 }
3873
3874 +static int mlx5e_get_cpu(struct mlx5e_priv *priv, int ix)
3875 +{
3876 + return cpumask_first(priv->mdev->priv.irq_info[ix].mask);
3877 +}
3878 +
3879 static int mlx5e_open_tx_cqs(struct mlx5e_channel *c,
3880 struct mlx5e_params *params,
3881 struct mlx5e_channel_param *cparam)
3882 @@ -1758,12 +1756,13 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
3883 {
3884 struct mlx5e_cq_moder icocq_moder = {0, 0};
3885 struct net_device *netdev = priv->netdev;
3886 + int cpu = mlx5e_get_cpu(priv, ix);
3887 struct mlx5e_channel *c;
3888 unsigned int irq;
3889 int err;
3890 int eqn;
3891
3892 - c = kzalloc_node(sizeof(*c), GFP_KERNEL, mlx5e_get_node(priv, ix));
3893 + c = kzalloc_node(sizeof(*c), GFP_KERNEL, cpu_to_node(cpu));
3894 if (!c)
3895 return -ENOMEM;
3896
3897 @@ -1771,6 +1770,7 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
3898 c->mdev = priv->mdev;
3899 c->tstamp = &priv->tstamp;
3900 c->ix = ix;
3901 + c->cpu = cpu;
3902 c->pdev = &priv->mdev->pdev->dev;
3903 c->netdev = priv->netdev;
3904 c->mkey_be = cpu_to_be32(priv->mdev->mlx5e_res.mkey.key);
3905 @@ -1859,8 +1859,7 @@ static void mlx5e_activate_channel(struct mlx5e_channel *c)
3906 for (tc = 0; tc < c->num_tc; tc++)
3907 mlx5e_activate_txqsq(&c->sq[tc]);
3908 mlx5e_activate_rq(&c->rq);
3909 - netif_set_xps_queue(c->netdev,
3910 - mlx5_get_vector_affinity(c->priv->mdev, c->ix), c->ix);
3911 + netif_set_xps_queue(c->netdev, get_cpu_mask(c->cpu), c->ix);
3912 }
3913
3914 static void mlx5e_deactivate_channel(struct mlx5e_channel *c)
3915 @@ -3554,6 +3553,7 @@ static netdev_features_t mlx5e_tunnel_features_check(struct mlx5e_priv *priv,
3916 struct sk_buff *skb,
3917 netdev_features_t features)
3918 {
3919 + unsigned int offset = 0;
3920 struct udphdr *udph;
3921 u8 proto;
3922 u16 port;
3923 @@ -3563,7 +3563,7 @@ static netdev_features_t mlx5e_tunnel_features_check(struct mlx5e_priv *priv,
3924 proto = ip_hdr(skb)->protocol;
3925 break;
3926 case htons(ETH_P_IPV6):
3927 - proto = ipv6_hdr(skb)->nexthdr;
3928 + proto = ipv6_find_hdr(skb, &offset, -1, NULL, NULL);
3929 break;
3930 default:
3931 goto out;
3932 diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.c
3933 index 3c11d6e2160a..14962969c5ba 100644
3934 --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.c
3935 +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.c
3936 @@ -66,6 +66,9 @@ static int mlx5_fpga_mem_read_i2c(struct mlx5_fpga_device *fdev, size_t size,
3937 u8 actual_size;
3938 int err;
3939
3940 + if (!size)
3941 + return -EINVAL;
3942 +
3943 if (!fdev->mdev)
3944 return -ENOTCONN;
3945
3946 @@ -95,6 +98,9 @@ static int mlx5_fpga_mem_write_i2c(struct mlx5_fpga_device *fdev, size_t size,
3947 u8 actual_size;
3948 int err;
3949
3950 + if (!size)
3951 + return -EINVAL;
3952 +
3953 if (!fdev->mdev)
3954 return -ENOTCONN;
3955
3956 diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
3957 index 06562c9a6b9c..8bfc37e4ec87 100644
3958 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
3959 +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
3960 @@ -316,9 +316,6 @@ static int mlx5_alloc_irq_vectors(struct mlx5_core_dev *dev)
3961 {
3962 struct mlx5_priv *priv = &dev->priv;
3963 struct mlx5_eq_table *table = &priv->eq_table;
3964 - struct irq_affinity irqdesc = {
3965 - .pre_vectors = MLX5_EQ_VEC_COMP_BASE,
3966 - };
3967 int num_eqs = 1 << MLX5_CAP_GEN(dev, log_max_eq);
3968 int nvec;
3969
3970 @@ -332,10 +329,9 @@ static int mlx5_alloc_irq_vectors(struct mlx5_core_dev *dev)
3971 if (!priv->irq_info)
3972 goto err_free_msix;
3973
3974 - nvec = pci_alloc_irq_vectors_affinity(dev->pdev,
3975 + nvec = pci_alloc_irq_vectors(dev->pdev,
3976 MLX5_EQ_VEC_COMP_BASE + 1, nvec,
3977 - PCI_IRQ_MSIX | PCI_IRQ_AFFINITY,
3978 - &irqdesc);
3979 + PCI_IRQ_MSIX);
3980 if (nvec < 0)
3981 return nvec;
3982
3983 @@ -621,6 +617,63 @@ u64 mlx5_read_internal_timer(struct mlx5_core_dev *dev)
3984 return (u64)timer_l | (u64)timer_h1 << 32;
3985 }
3986
3987 +static int mlx5_irq_set_affinity_hint(struct mlx5_core_dev *mdev, int i)
3988 +{
3989 + struct mlx5_priv *priv = &mdev->priv;
3990 + int irq = pci_irq_vector(mdev->pdev, MLX5_EQ_VEC_COMP_BASE + i);
3991 +
3992 + if (!zalloc_cpumask_var(&priv->irq_info[i].mask, GFP_KERNEL)) {
3993 + mlx5_core_warn(mdev, "zalloc_cpumask_var failed");
3994 + return -ENOMEM;
3995 + }
3996 +
3997 + cpumask_set_cpu(cpumask_local_spread(i, priv->numa_node),
3998 + priv->irq_info[i].mask);
3999 +
4000 + if (IS_ENABLED(CONFIG_SMP) &&
4001 + irq_set_affinity_hint(irq, priv->irq_info[i].mask))
4002 + mlx5_core_warn(mdev, "irq_set_affinity_hint failed, irq 0x%.4x", irq);
4003 +
4004 + return 0;
4005 +}
4006 +
4007 +static void mlx5_irq_clear_affinity_hint(struct mlx5_core_dev *mdev, int i)
4008 +{
4009 + struct mlx5_priv *priv = &mdev->priv;
4010 + int irq = pci_irq_vector(mdev->pdev, MLX5_EQ_VEC_COMP_BASE + i);
4011 +
4012 + irq_set_affinity_hint(irq, NULL);
4013 + free_cpumask_var(priv->irq_info[i].mask);
4014 +}
4015 +
4016 +static int mlx5_irq_set_affinity_hints(struct mlx5_core_dev *mdev)
4017 +{
4018 + int err;
4019 + int i;
4020 +
4021 + for (i = 0; i < mdev->priv.eq_table.num_comp_vectors; i++) {
4022 + err = mlx5_irq_set_affinity_hint(mdev, i);
4023 + if (err)
4024 + goto err_out;
4025 + }
4026 +
4027 + return 0;
4028 +
4029 +err_out:
4030 + for (i--; i >= 0; i--)
4031 + mlx5_irq_clear_affinity_hint(mdev, i);
4032 +
4033 + return err;
4034 +}
4035 +
4036 +static void mlx5_irq_clear_affinity_hints(struct mlx5_core_dev *mdev)
4037 +{
4038 + int i;
4039 +
4040 + for (i = 0; i < mdev->priv.eq_table.num_comp_vectors; i++)
4041 + mlx5_irq_clear_affinity_hint(mdev, i);
4042 +}
4043 +
4044 int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn,
4045 unsigned int *irqn)
4046 {
4047 @@ -1093,6 +1146,12 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
4048 goto err_stop_eqs;
4049 }
4050
4051 + err = mlx5_irq_set_affinity_hints(dev);
4052 + if (err) {
4053 + dev_err(&pdev->dev, "Failed to alloc affinity hint cpumask\n");
4054 + goto err_affinity_hints;
4055 + }
4056 +
4057 err = mlx5_init_fs(dev);
4058 if (err) {
4059 dev_err(&pdev->dev, "Failed to init flow steering\n");
4060 @@ -1150,6 +1209,9 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
4061 mlx5_cleanup_fs(dev);
4062
4063 err_fs:
4064 + mlx5_irq_clear_affinity_hints(dev);
4065 +
4066 +err_affinity_hints:
4067 free_comp_eqs(dev);
4068
4069 err_stop_eqs:
4070 @@ -1218,6 +1280,7 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
4071
4072 mlx5_sriov_detach(dev);
4073 mlx5_cleanup_fs(dev);
4074 + mlx5_irq_clear_affinity_hints(dev);
4075 free_comp_eqs(dev);
4076 mlx5_stop_eqs(dev);
4077 mlx5_put_uars_page(dev, priv->uar);
4078 diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
4079 index db9e665ab104..889130edb715 100644
4080 --- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c
4081 +++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
4082 @@ -213,8 +213,8 @@ int mlx5_core_create_qp(struct mlx5_core_dev *dev,
4083 err_cmd:
4084 memset(din, 0, sizeof(din));
4085 memset(dout, 0, sizeof(dout));
4086 - MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
4087 - MLX5_SET(destroy_qp_in, in, qpn, qp->qpn);
4088 + MLX5_SET(destroy_qp_in, din, opcode, MLX5_CMD_OP_DESTROY_QP);
4089 + MLX5_SET(destroy_qp_in, din, qpn, qp->qpn);
4090 mlx5_cmd_exec(dev, din, sizeof(din), dout, sizeof(dout));
4091 return err;
4092 }
4093 diff --git a/drivers/net/ethernet/mellanox/mlx5/core/rl.c b/drivers/net/ethernet/mellanox/mlx5/core/rl.c
4094 index e651e4c02867..d3c33e9eea72 100644
4095 --- a/drivers/net/ethernet/mellanox/mlx5/core/rl.c
4096 +++ b/drivers/net/ethernet/mellanox/mlx5/core/rl.c
4097 @@ -125,16 +125,16 @@ static struct mlx5_rl_entry *find_rl_entry(struct mlx5_rl_table *table,
4098 return ret_entry;
4099 }
4100
4101 -static int mlx5_set_rate_limit_cmd(struct mlx5_core_dev *dev,
4102 +static int mlx5_set_pp_rate_limit_cmd(struct mlx5_core_dev *dev,
4103 u32 rate, u16 index)
4104 {
4105 - u32 in[MLX5_ST_SZ_DW(set_rate_limit_in)] = {0};
4106 - u32 out[MLX5_ST_SZ_DW(set_rate_limit_out)] = {0};
4107 + u32 in[MLX5_ST_SZ_DW(set_pp_rate_limit_in)] = {0};
4108 + u32 out[MLX5_ST_SZ_DW(set_pp_rate_limit_out)] = {0};
4109
4110 - MLX5_SET(set_rate_limit_in, in, opcode,
4111 - MLX5_CMD_OP_SET_RATE_LIMIT);
4112 - MLX5_SET(set_rate_limit_in, in, rate_limit_index, index);
4113 - MLX5_SET(set_rate_limit_in, in, rate_limit, rate);
4114 + MLX5_SET(set_pp_rate_limit_in, in, opcode,
4115 + MLX5_CMD_OP_SET_PP_RATE_LIMIT);
4116 + MLX5_SET(set_pp_rate_limit_in, in, rate_limit_index, index);
4117 + MLX5_SET(set_pp_rate_limit_in, in, rate_limit, rate);
4118 return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
4119 }
4120
4121 @@ -173,7 +173,7 @@ int mlx5_rl_add_rate(struct mlx5_core_dev *dev, u32 rate, u16 *index)
4122 entry->refcount++;
4123 } else {
4124 /* new rate limit */
4125 - err = mlx5_set_rate_limit_cmd(dev, rate, entry->index);
4126 + err = mlx5_set_pp_rate_limit_cmd(dev, rate, entry->index);
4127 if (err) {
4128 mlx5_core_err(dev, "Failed configuring rate: %u (%d)\n",
4129 rate, err);
4130 @@ -209,7 +209,7 @@ void mlx5_rl_remove_rate(struct mlx5_core_dev *dev, u32 rate)
4131 entry->refcount--;
4132 if (!entry->refcount) {
4133 /* need to remove rate */
4134 - mlx5_set_rate_limit_cmd(dev, 0, entry->index);
4135 + mlx5_set_pp_rate_limit_cmd(dev, 0, entry->index);
4136 entry->rate = 0;
4137 }
4138
4139 @@ -262,8 +262,8 @@ void mlx5_cleanup_rl_table(struct mlx5_core_dev *dev)
4140 /* Clear all configured rates */
4141 for (i = 0; i < table->max_size; i++)
4142 if (table->rl_entry[i].rate)
4143 - mlx5_set_rate_limit_cmd(dev, 0,
4144 - table->rl_entry[i].index);
4145 + mlx5_set_pp_rate_limit_cmd(dev, 0,
4146 + table->rl_entry[i].index);
4147
4148 kfree(dev->priv.rl_table.rl_entry);
4149 }
4150 diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
4151 index 07a9ba6cfc70..2f74953e4561 100644
4152 --- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
4153 +++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
4154 @@ -71,9 +71,9 @@ struct mlx5e_vxlan *mlx5e_vxlan_lookup_port(struct mlx5e_priv *priv, u16 port)
4155 struct mlx5e_vxlan_db *vxlan_db = &priv->vxlan;
4156 struct mlx5e_vxlan *vxlan;
4157
4158 - spin_lock(&vxlan_db->lock);
4159 + spin_lock_bh(&vxlan_db->lock);
4160 vxlan = radix_tree_lookup(&vxlan_db->tree, port);
4161 - spin_unlock(&vxlan_db->lock);
4162 + spin_unlock_bh(&vxlan_db->lock);
4163
4164 return vxlan;
4165 }
4166 @@ -88,8 +88,12 @@ static void mlx5e_vxlan_add_port(struct work_struct *work)
4167 struct mlx5e_vxlan *vxlan;
4168 int err;
4169
4170 - if (mlx5e_vxlan_lookup_port(priv, port))
4171 + mutex_lock(&priv->state_lock);
4172 + vxlan = mlx5e_vxlan_lookup_port(priv, port);
4173 + if (vxlan) {
4174 + atomic_inc(&vxlan->refcount);
4175 goto free_work;
4176 + }
4177
4178 if (mlx5e_vxlan_core_add_port_cmd(priv->mdev, port))
4179 goto free_work;
4180 @@ -99,10 +103,11 @@ static void mlx5e_vxlan_add_port(struct work_struct *work)
4181 goto err_delete_port;
4182
4183 vxlan->udp_port = port;
4184 + atomic_set(&vxlan->refcount, 1);
4185
4186 - spin_lock_irq(&vxlan_db->lock);
4187 + spin_lock_bh(&vxlan_db->lock);
4188 err = radix_tree_insert(&vxlan_db->tree, vxlan->udp_port, vxlan);
4189 - spin_unlock_irq(&vxlan_db->lock);
4190 + spin_unlock_bh(&vxlan_db->lock);
4191 if (err)
4192 goto err_free;
4193
4194 @@ -113,35 +118,39 @@ static void mlx5e_vxlan_add_port(struct work_struct *work)
4195 err_delete_port:
4196 mlx5e_vxlan_core_del_port_cmd(priv->mdev, port);
4197 free_work:
4198 + mutex_unlock(&priv->state_lock);
4199 kfree(vxlan_work);
4200 }
4201
4202 -static void __mlx5e_vxlan_core_del_port(struct mlx5e_priv *priv, u16 port)
4203 +static void mlx5e_vxlan_del_port(struct work_struct *work)
4204 {
4205 + struct mlx5e_vxlan_work *vxlan_work =
4206 + container_of(work, struct mlx5e_vxlan_work, work);
4207 + struct mlx5e_priv *priv = vxlan_work->priv;
4208 struct mlx5e_vxlan_db *vxlan_db = &priv->vxlan;
4209 + u16 port = vxlan_work->port;
4210 struct mlx5e_vxlan *vxlan;
4211 + bool remove = false;
4212
4213 - spin_lock_irq(&vxlan_db->lock);
4214 - vxlan = radix_tree_delete(&vxlan_db->tree, port);
4215 - spin_unlock_irq(&vxlan_db->lock);
4216 -
4217 + mutex_lock(&priv->state_lock);
4218 + spin_lock_bh(&vxlan_db->lock);
4219 + vxlan = radix_tree_lookup(&vxlan_db->tree, port);
4220 if (!vxlan)
4221 - return;
4222 -
4223 - mlx5e_vxlan_core_del_port_cmd(priv->mdev, vxlan->udp_port);
4224 -
4225 - kfree(vxlan);
4226 -}
4227 + goto out_unlock;
4228
4229 -static void mlx5e_vxlan_del_port(struct work_struct *work)
4230 -{
4231 - struct mlx5e_vxlan_work *vxlan_work =
4232 - container_of(work, struct mlx5e_vxlan_work, work);
4233 - struct mlx5e_priv *priv = vxlan_work->priv;
4234 - u16 port = vxlan_work->port;
4235 + if (atomic_dec_and_test(&vxlan->refcount)) {
4236 + radix_tree_delete(&vxlan_db->tree, port);
4237 + remove = true;
4238 + }
4239
4240 - __mlx5e_vxlan_core_del_port(priv, port);
4241 +out_unlock:
4242 + spin_unlock_bh(&vxlan_db->lock);
4243
4244 + if (remove) {
4245 + mlx5e_vxlan_core_del_port_cmd(priv->mdev, port);
4246 + kfree(vxlan);
4247 + }
4248 + mutex_unlock(&priv->state_lock);
4249 kfree(vxlan_work);
4250 }
4251
4252 @@ -171,12 +180,11 @@ void mlx5e_vxlan_cleanup(struct mlx5e_priv *priv)
4253 struct mlx5e_vxlan *vxlan;
4254 unsigned int port = 0;
4255
4256 - spin_lock_irq(&vxlan_db->lock);
4257 + /* Lockless since we are the only radix-tree consumers, wq is disabled */
4258 while (radix_tree_gang_lookup(&vxlan_db->tree, (void **)&vxlan, port, 1)) {
4259 port = vxlan->udp_port;
4260 - spin_unlock_irq(&vxlan_db->lock);
4261 - __mlx5e_vxlan_core_del_port(priv, (u16)port);
4262 - spin_lock_irq(&vxlan_db->lock);
4263 + radix_tree_delete(&vxlan_db->tree, port);
4264 + mlx5e_vxlan_core_del_port_cmd(priv->mdev, port);
4265 + kfree(vxlan);
4266 }
4267 - spin_unlock_irq(&vxlan_db->lock);
4268 }
4269 diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h
4270 index 5def12c048e3..5ef6ae7d568a 100644
4271 --- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h
4272 +++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h
4273 @@ -36,6 +36,7 @@
4274 #include "en.h"
4275
4276 struct mlx5e_vxlan {
4277 + atomic_t refcount;
4278 u16 udp_port;
4279 };
4280
4281 diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
4282 index db38880f54b4..3ead7439821c 100644
4283 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
4284 +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
4285 @@ -4164,6 +4164,7 @@ static int mlxsw_sp_port_stp_set(struct mlxsw_sp_port *mlxsw_sp_port,
4286
4287 static int mlxsw_sp_port_ovs_join(struct mlxsw_sp_port *mlxsw_sp_port)
4288 {
4289 + u16 vid = 1;
4290 int err;
4291
4292 err = mlxsw_sp_port_vp_mode_set(mlxsw_sp_port, true);
4293 @@ -4176,8 +4177,19 @@ static int mlxsw_sp_port_ovs_join(struct mlxsw_sp_port *mlxsw_sp_port)
4294 true, false);
4295 if (err)
4296 goto err_port_vlan_set;
4297 +
4298 + for (; vid <= VLAN_N_VID - 1; vid++) {
4299 + err = mlxsw_sp_port_vid_learning_set(mlxsw_sp_port,
4300 + vid, false);
4301 + if (err)
4302 + goto err_vid_learning_set;
4303 + }
4304 +
4305 return 0;
4306
4307 +err_vid_learning_set:
4308 + for (vid--; vid >= 1; vid--)
4309 + mlxsw_sp_port_vid_learning_set(mlxsw_sp_port, vid, true);
4310 err_port_vlan_set:
4311 mlxsw_sp_port_stp_set(mlxsw_sp_port, false);
4312 err_port_stp_set:
4313 @@ -4187,6 +4199,12 @@ static int mlxsw_sp_port_ovs_join(struct mlxsw_sp_port *mlxsw_sp_port)
4314
4315 static void mlxsw_sp_port_ovs_leave(struct mlxsw_sp_port *mlxsw_sp_port)
4316 {
4317 + u16 vid;
4318 +
4319 + for (vid = VLAN_N_VID - 1; vid >= 1; vid--)
4320 + mlxsw_sp_port_vid_learning_set(mlxsw_sp_port,
4321 + vid, true);
4322 +
4323 mlxsw_sp_port_vlan_set(mlxsw_sp_port, 2, VLAN_N_VID - 1,
4324 false, false);
4325 mlxsw_sp_port_stp_set(mlxsw_sp_port, false);
4326 diff --git a/drivers/net/ethernet/sfc/tx.c b/drivers/net/ethernet/sfc/tx.c
4327 index 32bf1fecf864..9b85cbd5a231 100644
4328 --- a/drivers/net/ethernet/sfc/tx.c
4329 +++ b/drivers/net/ethernet/sfc/tx.c
4330 @@ -77,6 +77,7 @@ static void efx_dequeue_buffer(struct efx_tx_queue *tx_queue,
4331 }
4332
4333 if (buffer->flags & EFX_TX_BUF_SKB) {
4334 + EFX_WARN_ON_PARANOID(!pkts_compl || !bytes_compl);
4335 (*pkts_compl)++;
4336 (*bytes_compl) += buffer->skb->len;
4337 dev_consume_skb_any((struct sk_buff *)buffer->skb);
4338 @@ -426,12 +427,14 @@ static int efx_tx_map_data(struct efx_tx_queue *tx_queue, struct sk_buff *skb,
4339 static void efx_enqueue_unwind(struct efx_tx_queue *tx_queue)
4340 {
4341 struct efx_tx_buffer *buffer;
4342 + unsigned int bytes_compl = 0;
4343 + unsigned int pkts_compl = 0;
4344
4345 /* Work backwards until we hit the original insert pointer value */
4346 while (tx_queue->insert_count != tx_queue->write_count) {
4347 --tx_queue->insert_count;
4348 buffer = __efx_tx_queue_get_insert_buffer(tx_queue);
4349 - efx_dequeue_buffer(tx_queue, buffer, NULL, NULL);
4350 + efx_dequeue_buffer(tx_queue, buffer, &pkts_compl, &bytes_compl);
4351 }
4352 }
4353
4354 diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c
4355 index 4d02b27df044..a3f456b91c99 100644
4356 --- a/drivers/net/phy/marvell.c
4357 +++ b/drivers/net/phy/marvell.c
4358 @@ -2069,7 +2069,7 @@ static struct phy_driver marvell_drivers[] = {
4359 .flags = PHY_HAS_INTERRUPT,
4360 .probe = marvell_probe,
4361 .config_init = &m88e1145_config_init,
4362 - .config_aneg = &marvell_config_aneg,
4363 + .config_aneg = &m88e1101_config_aneg,
4364 .read_status = &genphy_read_status,
4365 .ack_interrupt = &marvell_ack_interrupt,
4366 .config_intr = &marvell_config_intr,
4367 diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c
4368 index fdb43dd9b5cd..6c45ff650ec7 100644
4369 --- a/drivers/net/phy/micrel.c
4370 +++ b/drivers/net/phy/micrel.c
4371 @@ -622,6 +622,7 @@ static int ksz9031_read_status(struct phy_device *phydev)
4372 phydev->link = 0;
4373 if (phydev->drv->config_intr && phy_interrupt_is_valid(phydev))
4374 phydev->drv->config_intr(phydev);
4375 + return genphy_config_aneg(phydev);
4376 }
4377
4378 return 0;
4379 diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
4380 index bcb4755bcd95..4b377b978a0b 100644
4381 --- a/drivers/net/phy/phylink.c
4382 +++ b/drivers/net/phy/phylink.c
4383 @@ -525,6 +525,7 @@ struct phylink *phylink_create(struct net_device *ndev, struct device_node *np,
4384 pl->link_config.pause = MLO_PAUSE_AN;
4385 pl->link_config.speed = SPEED_UNKNOWN;
4386 pl->link_config.duplex = DUPLEX_UNKNOWN;
4387 + pl->link_config.an_enabled = true;
4388 pl->ops = ops;
4389 __set_bit(PHYLINK_DISABLE_STOPPED, &pl->phylink_disable_state);
4390
4391 @@ -948,6 +949,7 @@ int phylink_ethtool_ksettings_set(struct phylink *pl,
4392 mutex_lock(&pl->state_mutex);
4393 /* Configure the MAC to match the new settings */
4394 linkmode_copy(pl->link_config.advertising, our_kset.link_modes.advertising);
4395 + pl->link_config.interface = config.interface;
4396 pl->link_config.speed = our_kset.base.speed;
4397 pl->link_config.duplex = our_kset.base.duplex;
4398 pl->link_config.an_enabled = our_kset.base.autoneg != AUTONEG_DISABLE;
4399 diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c
4400 index 81394a4b2803..2092febfcb42 100644
4401 --- a/drivers/net/usb/qmi_wwan.c
4402 +++ b/drivers/net/usb/qmi_wwan.c
4403 @@ -1204,6 +1204,7 @@ static const struct usb_device_id products[] = {
4404 {QMI_FIXED_INTF(0x1199, 0x9079, 10)}, /* Sierra Wireless EM74xx */
4405 {QMI_FIXED_INTF(0x1199, 0x907b, 8)}, /* Sierra Wireless EM74xx */
4406 {QMI_FIXED_INTF(0x1199, 0x907b, 10)}, /* Sierra Wireless EM74xx */
4407 + {QMI_FIXED_INTF(0x1199, 0x9091, 8)}, /* Sierra Wireless EM7565 */
4408 {QMI_FIXED_INTF(0x1bbb, 0x011e, 4)}, /* Telekom Speedstick LTE II (Alcatel One Touch L100V LTE) */
4409 {QMI_FIXED_INTF(0x1bbb, 0x0203, 2)}, /* Alcatel L800MA */
4410 {QMI_FIXED_INTF(0x2357, 0x0201, 4)}, /* TP-LINK HSUPA Modem MA180 */
4411 diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
4412 index a2f4e52fadb5..9e9202b50e73 100644
4413 --- a/drivers/net/vxlan.c
4414 +++ b/drivers/net/vxlan.c
4415 @@ -3105,6 +3105,11 @@ static void vxlan_config_apply(struct net_device *dev,
4416
4417 max_mtu = lowerdev->mtu - (use_ipv6 ? VXLAN6_HEADROOM :
4418 VXLAN_HEADROOM);
4419 + if (max_mtu < ETH_MIN_MTU)
4420 + max_mtu = ETH_MIN_MTU;
4421 +
4422 + if (!changelink && !conf->mtu)
4423 + dev->mtu = max_mtu;
4424 }
4425
4426 if (dev->mtu > max_mtu)
4427 diff --git a/drivers/phy/tegra/xusb.c b/drivers/phy/tegra/xusb.c
4428 index 4307bf0013e1..63e916d4d069 100644
4429 --- a/drivers/phy/tegra/xusb.c
4430 +++ b/drivers/phy/tegra/xusb.c
4431 @@ -75,14 +75,14 @@ MODULE_DEVICE_TABLE(of, tegra_xusb_padctl_of_match);
4432 static struct device_node *
4433 tegra_xusb_find_pad_node(struct tegra_xusb_padctl *padctl, const char *name)
4434 {
4435 - /*
4436 - * of_find_node_by_name() drops a reference, so make sure to grab one.
4437 - */
4438 - struct device_node *np = of_node_get(padctl->dev->of_node);
4439 + struct device_node *pads, *np;
4440 +
4441 + pads = of_get_child_by_name(padctl->dev->of_node, "pads");
4442 + if (!pads)
4443 + return NULL;
4444
4445 - np = of_find_node_by_name(np, "pads");
4446 - if (np)
4447 - np = of_find_node_by_name(np, name);
4448 + np = of_get_child_by_name(pads, name);
4449 + of_node_put(pads);
4450
4451 return np;
4452 }
4453 @@ -90,16 +90,16 @@ tegra_xusb_find_pad_node(struct tegra_xusb_padctl *padctl, const char *name)
4454 static struct device_node *
4455 tegra_xusb_pad_find_phy_node(struct tegra_xusb_pad *pad, unsigned int index)
4456 {
4457 - /*
4458 - * of_find_node_by_name() drops a reference, so make sure to grab one.
4459 - */
4460 - struct device_node *np = of_node_get(pad->dev.of_node);
4461 + struct device_node *np, *lanes;
4462
4463 - np = of_find_node_by_name(np, "lanes");
4464 - if (!np)
4465 + lanes = of_get_child_by_name(pad->dev.of_node, "lanes");
4466 + if (!lanes)
4467 return NULL;
4468
4469 - return of_find_node_by_name(np, pad->soc->lanes[index].name);
4470 + np = of_get_child_by_name(lanes, pad->soc->lanes[index].name);
4471 + of_node_put(lanes);
4472 +
4473 + return np;
4474 }
4475
4476 static int
4477 @@ -195,7 +195,7 @@ int tegra_xusb_pad_register(struct tegra_xusb_pad *pad,
4478 unsigned int i;
4479 int err;
4480
4481 - children = of_find_node_by_name(pad->dev.of_node, "lanes");
4482 + children = of_get_child_by_name(pad->dev.of_node, "lanes");
4483 if (!children)
4484 return -ENODEV;
4485
4486 @@ -444,21 +444,21 @@ static struct device_node *
4487 tegra_xusb_find_port_node(struct tegra_xusb_padctl *padctl, const char *type,
4488 unsigned int index)
4489 {
4490 - /*
4491 - * of_find_node_by_name() drops a reference, so make sure to grab one.
4492 - */
4493 - struct device_node *np = of_node_get(padctl->dev->of_node);
4494 + struct device_node *ports, *np;
4495 + char *name;
4496
4497 - np = of_find_node_by_name(np, "ports");
4498 - if (np) {
4499 - char *name;
4500 + ports = of_get_child_by_name(padctl->dev->of_node, "ports");
4501 + if (!ports)
4502 + return NULL;
4503
4504 - name = kasprintf(GFP_KERNEL, "%s-%u", type, index);
4505 - if (!name)
4506 - return ERR_PTR(-ENOMEM);
4507 - np = of_find_node_by_name(np, name);
4508 - kfree(name);
4509 + name = kasprintf(GFP_KERNEL, "%s-%u", type, index);
4510 + if (!name) {
4511 + of_node_put(ports);
4512 + return ERR_PTR(-ENOMEM);
4513 }
4514 + np = of_get_child_by_name(ports, name);
4515 + kfree(name);
4516 + of_node_put(ports);
4517
4518 return np;
4519 }
4520 @@ -847,7 +847,7 @@ static void tegra_xusb_remove_ports(struct tegra_xusb_padctl *padctl)
4521
4522 static int tegra_xusb_padctl_probe(struct platform_device *pdev)
4523 {
4524 - struct device_node *np = of_node_get(pdev->dev.of_node);
4525 + struct device_node *np = pdev->dev.of_node;
4526 const struct tegra_xusb_padctl_soc *soc;
4527 struct tegra_xusb_padctl *padctl;
4528 const struct of_device_id *match;
4529 @@ -855,7 +855,7 @@ static int tegra_xusb_padctl_probe(struct platform_device *pdev)
4530 int err;
4531
4532 /* for backwards compatibility with old device trees */
4533 - np = of_find_node_by_name(np, "pads");
4534 + np = of_get_child_by_name(np, "pads");
4535 if (!np) {
4536 dev_warn(&pdev->dev, "deprecated DT, using legacy driver\n");
4537 return tegra_xusb_padctl_legacy_probe(pdev);
4538 diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h
4539 index 5340efc673a9..92dd4aef21a3 100644
4540 --- a/drivers/s390/net/qeth_core.h
4541 +++ b/drivers/s390/net/qeth_core.h
4542 @@ -564,9 +564,9 @@ enum qeth_cq {
4543 };
4544
4545 struct qeth_ipato {
4546 - int enabled;
4547 - int invert4;
4548 - int invert6;
4549 + bool enabled;
4550 + bool invert4;
4551 + bool invert6;
4552 struct list_head entries;
4553 };
4554
4555 diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c
4556 index 330e5d3dadf3..7c7a244b6684 100644
4557 --- a/drivers/s390/net/qeth_core_main.c
4558 +++ b/drivers/s390/net/qeth_core_main.c
4559 @@ -1479,9 +1479,9 @@ static int qeth_setup_card(struct qeth_card *card)
4560 qeth_set_intial_options(card);
4561 /* IP address takeover */
4562 INIT_LIST_HEAD(&card->ipato.entries);
4563 - card->ipato.enabled = 0;
4564 - card->ipato.invert4 = 0;
4565 - card->ipato.invert6 = 0;
4566 + card->ipato.enabled = false;
4567 + card->ipato.invert4 = false;
4568 + card->ipato.invert6 = false;
4569 /* init QDIO stuff */
4570 qeth_init_qdio_info(card);
4571 INIT_DELAYED_WORK(&card->buffer_reclaim_work, qeth_buffer_reclaim_work);
4572 @@ -5445,6 +5445,13 @@ int qeth_poll(struct napi_struct *napi, int budget)
4573 }
4574 EXPORT_SYMBOL_GPL(qeth_poll);
4575
4576 +static int qeth_setassparms_inspect_rc(struct qeth_ipa_cmd *cmd)
4577 +{
4578 + if (!cmd->hdr.return_code)
4579 + cmd->hdr.return_code = cmd->data.setassparms.hdr.return_code;
4580 + return cmd->hdr.return_code;
4581 +}
4582 +
4583 int qeth_setassparms_cb(struct qeth_card *card,
4584 struct qeth_reply *reply, unsigned long data)
4585 {
4586 @@ -6304,7 +6311,7 @@ static int qeth_ipa_checksum_run_cmd_cb(struct qeth_card *card,
4587 (struct qeth_checksum_cmd *)reply->param;
4588
4589 QETH_CARD_TEXT(card, 4, "chkdoccb");
4590 - if (cmd->hdr.return_code)
4591 + if (qeth_setassparms_inspect_rc(cmd))
4592 return 0;
4593
4594 memset(chksum_cb, 0, sizeof(*chksum_cb));
4595 diff --git a/drivers/s390/net/qeth_l3.h b/drivers/s390/net/qeth_l3.h
4596 index 194ae9b577cc..e5833837b799 100644
4597 --- a/drivers/s390/net/qeth_l3.h
4598 +++ b/drivers/s390/net/qeth_l3.h
4599 @@ -82,7 +82,7 @@ void qeth_l3_del_vipa(struct qeth_card *, enum qeth_prot_versions, const u8 *);
4600 int qeth_l3_add_rxip(struct qeth_card *, enum qeth_prot_versions, const u8 *);
4601 void qeth_l3_del_rxip(struct qeth_card *card, enum qeth_prot_versions,
4602 const u8 *);
4603 -int qeth_l3_is_addr_covered_by_ipato(struct qeth_card *, struct qeth_ipaddr *);
4604 +void qeth_l3_update_ipato(struct qeth_card *card);
4605 struct qeth_ipaddr *qeth_l3_get_addr_buffer(enum qeth_prot_versions);
4606 int qeth_l3_add_ip(struct qeth_card *, struct qeth_ipaddr *);
4607 int qeth_l3_delete_ip(struct qeth_card *, struct qeth_ipaddr *);
4608 diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c
4609 index 27185ab38136..36dee176f8e2 100644
4610 --- a/drivers/s390/net/qeth_l3_main.c
4611 +++ b/drivers/s390/net/qeth_l3_main.c
4612 @@ -163,8 +163,8 @@ static void qeth_l3_convert_addr_to_bits(u8 *addr, u8 *bits, int len)
4613 }
4614 }
4615
4616 -int qeth_l3_is_addr_covered_by_ipato(struct qeth_card *card,
4617 - struct qeth_ipaddr *addr)
4618 +static bool qeth_l3_is_addr_covered_by_ipato(struct qeth_card *card,
4619 + struct qeth_ipaddr *addr)
4620 {
4621 struct qeth_ipato_entry *ipatoe;
4622 u8 addr_bits[128] = {0, };
4623 @@ -173,6 +173,8 @@ int qeth_l3_is_addr_covered_by_ipato(struct qeth_card *card,
4624
4625 if (!card->ipato.enabled)
4626 return 0;
4627 + if (addr->type != QETH_IP_TYPE_NORMAL)
4628 + return 0;
4629
4630 qeth_l3_convert_addr_to_bits((u8 *) &addr->u, addr_bits,
4631 (addr->proto == QETH_PROT_IPV4)? 4:16);
4632 @@ -289,8 +291,7 @@ int qeth_l3_add_ip(struct qeth_card *card, struct qeth_ipaddr *tmp_addr)
4633 memcpy(addr, tmp_addr, sizeof(struct qeth_ipaddr));
4634 addr->ref_counter = 1;
4635
4636 - if (addr->type == QETH_IP_TYPE_NORMAL &&
4637 - qeth_l3_is_addr_covered_by_ipato(card, addr)) {
4638 + if (qeth_l3_is_addr_covered_by_ipato(card, addr)) {
4639 QETH_CARD_TEXT(card, 2, "tkovaddr");
4640 addr->set_flags |= QETH_IPA_SETIP_TAKEOVER_FLAG;
4641 }
4642 @@ -604,6 +605,27 @@ int qeth_l3_setrouting_v6(struct qeth_card *card)
4643 /*
4644 * IP address takeover related functions
4645 */
4646 +
4647 +/**
4648 + * qeth_l3_update_ipato() - Update 'takeover' property, for all NORMAL IPs.
4649 + *
4650 + * Caller must hold ip_lock.
4651 + */
4652 +void qeth_l3_update_ipato(struct qeth_card *card)
4653 +{
4654 + struct qeth_ipaddr *addr;
4655 + unsigned int i;
4656 +
4657 + hash_for_each(card->ip_htable, i, addr, hnode) {
4658 + if (addr->type != QETH_IP_TYPE_NORMAL)
4659 + continue;
4660 + if (qeth_l3_is_addr_covered_by_ipato(card, addr))
4661 + addr->set_flags |= QETH_IPA_SETIP_TAKEOVER_FLAG;
4662 + else
4663 + addr->set_flags &= ~QETH_IPA_SETIP_TAKEOVER_FLAG;
4664 + }
4665 +}
4666 +
4667 static void qeth_l3_clear_ipato_list(struct qeth_card *card)
4668 {
4669 struct qeth_ipato_entry *ipatoe, *tmp;
4670 @@ -615,6 +637,7 @@ static void qeth_l3_clear_ipato_list(struct qeth_card *card)
4671 kfree(ipatoe);
4672 }
4673
4674 + qeth_l3_update_ipato(card);
4675 spin_unlock_bh(&card->ip_lock);
4676 }
4677
4678 @@ -639,8 +662,10 @@ int qeth_l3_add_ipato_entry(struct qeth_card *card,
4679 }
4680 }
4681
4682 - if (!rc)
4683 + if (!rc) {
4684 list_add_tail(&new->entry, &card->ipato.entries);
4685 + qeth_l3_update_ipato(card);
4686 + }
4687
4688 spin_unlock_bh(&card->ip_lock);
4689
4690 @@ -663,6 +688,7 @@ void qeth_l3_del_ipato_entry(struct qeth_card *card,
4691 (proto == QETH_PROT_IPV4)? 4:16) &&
4692 (ipatoe->mask_bits == mask_bits)) {
4693 list_del(&ipatoe->entry);
4694 + qeth_l3_update_ipato(card);
4695 kfree(ipatoe);
4696 }
4697 }
4698 diff --git a/drivers/s390/net/qeth_l3_sys.c b/drivers/s390/net/qeth_l3_sys.c
4699 index 7a829ad77783..1295dd8ec849 100644
4700 --- a/drivers/s390/net/qeth_l3_sys.c
4701 +++ b/drivers/s390/net/qeth_l3_sys.c
4702 @@ -370,8 +370,8 @@ static ssize_t qeth_l3_dev_ipato_enable_store(struct device *dev,
4703 struct device_attribute *attr, const char *buf, size_t count)
4704 {
4705 struct qeth_card *card = dev_get_drvdata(dev);
4706 - struct qeth_ipaddr *addr;
4707 - int i, rc = 0;
4708 + bool enable;
4709 + int rc = 0;
4710
4711 if (!card)
4712 return -EINVAL;
4713 @@ -384,25 +384,18 @@ static ssize_t qeth_l3_dev_ipato_enable_store(struct device *dev,
4714 }
4715
4716 if (sysfs_streq(buf, "toggle")) {
4717 - card->ipato.enabled = (card->ipato.enabled)? 0 : 1;
4718 - } else if (sysfs_streq(buf, "1")) {
4719 - card->ipato.enabled = 1;
4720 - hash_for_each(card->ip_htable, i, addr, hnode) {
4721 - if ((addr->type == QETH_IP_TYPE_NORMAL) &&
4722 - qeth_l3_is_addr_covered_by_ipato(card, addr))
4723 - addr->set_flags |=
4724 - QETH_IPA_SETIP_TAKEOVER_FLAG;
4725 - }
4726 - } else if (sysfs_streq(buf, "0")) {
4727 - card->ipato.enabled = 0;
4728 - hash_for_each(card->ip_htable, i, addr, hnode) {
4729 - if (addr->set_flags &
4730 - QETH_IPA_SETIP_TAKEOVER_FLAG)
4731 - addr->set_flags &=
4732 - ~QETH_IPA_SETIP_TAKEOVER_FLAG;
4733 - }
4734 - } else
4735 + enable = !card->ipato.enabled;
4736 + } else if (kstrtobool(buf, &enable)) {
4737 rc = -EINVAL;
4738 + goto out;
4739 + }
4740 +
4741 + if (card->ipato.enabled != enable) {
4742 + card->ipato.enabled = enable;
4743 + spin_lock_bh(&card->ip_lock);
4744 + qeth_l3_update_ipato(card);
4745 + spin_unlock_bh(&card->ip_lock);
4746 + }
4747 out:
4748 mutex_unlock(&card->conf_mutex);
4749 return rc ? rc : count;
4750 @@ -428,20 +421,27 @@ static ssize_t qeth_l3_dev_ipato_invert4_store(struct device *dev,
4751 const char *buf, size_t count)
4752 {
4753 struct qeth_card *card = dev_get_drvdata(dev);
4754 + bool invert;
4755 int rc = 0;
4756
4757 if (!card)
4758 return -EINVAL;
4759
4760 mutex_lock(&card->conf_mutex);
4761 - if (sysfs_streq(buf, "toggle"))
4762 - card->ipato.invert4 = (card->ipato.invert4)? 0 : 1;
4763 - else if (sysfs_streq(buf, "1"))
4764 - card->ipato.invert4 = 1;
4765 - else if (sysfs_streq(buf, "0"))
4766 - card->ipato.invert4 = 0;
4767 - else
4768 + if (sysfs_streq(buf, "toggle")) {
4769 + invert = !card->ipato.invert4;
4770 + } else if (kstrtobool(buf, &invert)) {
4771 rc = -EINVAL;
4772 + goto out;
4773 + }
4774 +
4775 + if (card->ipato.invert4 != invert) {
4776 + card->ipato.invert4 = invert;
4777 + spin_lock_bh(&card->ip_lock);
4778 + qeth_l3_update_ipato(card);
4779 + spin_unlock_bh(&card->ip_lock);
4780 + }
4781 +out:
4782 mutex_unlock(&card->conf_mutex);
4783 return rc ? rc : count;
4784 }
4785 @@ -607,20 +607,27 @@ static ssize_t qeth_l3_dev_ipato_invert6_store(struct device *dev,
4786 struct device_attribute *attr, const char *buf, size_t count)
4787 {
4788 struct qeth_card *card = dev_get_drvdata(dev);
4789 + bool invert;
4790 int rc = 0;
4791
4792 if (!card)
4793 return -EINVAL;
4794
4795 mutex_lock(&card->conf_mutex);
4796 - if (sysfs_streq(buf, "toggle"))
4797 - card->ipato.invert6 = (card->ipato.invert6)? 0 : 1;
4798 - else if (sysfs_streq(buf, "1"))
4799 - card->ipato.invert6 = 1;
4800 - else if (sysfs_streq(buf, "0"))
4801 - card->ipato.invert6 = 0;
4802 - else
4803 + if (sysfs_streq(buf, "toggle")) {
4804 + invert = !card->ipato.invert6;
4805 + } else if (kstrtobool(buf, &invert)) {
4806 rc = -EINVAL;
4807 + goto out;
4808 + }
4809 +
4810 + if (card->ipato.invert6 != invert) {
4811 + card->ipato.invert6 = invert;
4812 + spin_lock_bh(&card->ip_lock);
4813 + qeth_l3_update_ipato(card);
4814 + spin_unlock_bh(&card->ip_lock);
4815 + }
4816 +out:
4817 mutex_unlock(&card->conf_mutex);
4818 return rc ? rc : count;
4819 }
4820 diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c
4821 index a4f28b7e4c65..e18877177f1b 100644
4822 --- a/drivers/scsi/osd/osd_initiator.c
4823 +++ b/drivers/scsi/osd/osd_initiator.c
4824 @@ -1576,7 +1576,9 @@ static struct request *_make_request(struct request_queue *q, bool has_write,
4825 return req;
4826
4827 for_each_bio(bio) {
4828 - ret = blk_rq_append_bio(req, bio);
4829 + struct bio *bounce_bio = bio;
4830 +
4831 + ret = blk_rq_append_bio(req, &bounce_bio);
4832 if (ret)
4833 return ERR_PTR(ret);
4834 }
4835 diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c
4836 index 93e2c90fa77d..83dc3292e9ab 100644
4837 --- a/drivers/staging/android/ion/ion.c
4838 +++ b/drivers/staging/android/ion/ion.c
4839 @@ -348,7 +348,7 @@ static int ion_dma_buf_begin_cpu_access(struct dma_buf *dmabuf,
4840 mutex_lock(&buffer->lock);
4841 list_for_each_entry(a, &buffer->attachments, list) {
4842 dma_sync_sg_for_cpu(a->dev, a->table->sgl, a->table->nents,
4843 - DMA_BIDIRECTIONAL);
4844 + direction);
4845 }
4846 mutex_unlock(&buffer->lock);
4847
4848 @@ -370,7 +370,7 @@ static int ion_dma_buf_end_cpu_access(struct dma_buf *dmabuf,
4849 mutex_lock(&buffer->lock);
4850 list_for_each_entry(a, &buffer->attachments, list) {
4851 dma_sync_sg_for_device(a->dev, a->table->sgl, a->table->nents,
4852 - DMA_BIDIRECTIONAL);
4853 + direction);
4854 }
4855 mutex_unlock(&buffer->lock);
4856
4857 diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c
4858 index 7c69b4a9694d..0d99b242e82e 100644
4859 --- a/drivers/target/target_core_pscsi.c
4860 +++ b/drivers/target/target_core_pscsi.c
4861 @@ -920,7 +920,7 @@ pscsi_map_sg(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
4862 " %d i: %d bio: %p, allocating another"
4863 " bio\n", bio->bi_vcnt, i, bio);
4864
4865 - rc = blk_rq_append_bio(req, bio);
4866 + rc = blk_rq_append_bio(req, &bio);
4867 if (rc) {
4868 pr_err("pSCSI: failed to append bio\n");
4869 goto fail;
4870 @@ -938,7 +938,7 @@ pscsi_map_sg(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
4871 }
4872
4873 if (bio) {
4874 - rc = blk_rq_append_bio(req, bio);
4875 + rc = blk_rq_append_bio(req, &bio);
4876 if (rc) {
4877 pr_err("pSCSI: failed to append bio\n");
4878 goto fail;
4879 diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c
4880 index bdf0e6e89991..faf50df81622 100644
4881 --- a/drivers/tty/n_tty.c
4882 +++ b/drivers/tty/n_tty.c
4883 @@ -1764,7 +1764,7 @@ static void n_tty_set_termios(struct tty_struct *tty, struct ktermios *old)
4884 {
4885 struct n_tty_data *ldata = tty->disc_data;
4886
4887 - if (!old || (old->c_lflag ^ tty->termios.c_lflag) & ICANON) {
4888 + if (!old || (old->c_lflag ^ tty->termios.c_lflag) & (ICANON | EXTPROC)) {
4889 bitmap_zero(ldata->read_flags, N_TTY_BUF_SIZE);
4890 ldata->line_start = ldata->read_tail;
4891 if (!L_ICANON(tty) || !read_cnt(ldata)) {
4892 @@ -2427,7 +2427,7 @@ static int n_tty_ioctl(struct tty_struct *tty, struct file *file,
4893 return put_user(tty_chars_in_buffer(tty), (int __user *) arg);
4894 case TIOCINQ:
4895 down_write(&tty->termios_rwsem);
4896 - if (L_ICANON(tty))
4897 + if (L_ICANON(tty) && !L_EXTPROC(tty))
4898 retval = inq_canon(ldata);
4899 else
4900 retval = read_cnt(ldata);
4901 diff --git a/drivers/tty/tty_buffer.c b/drivers/tty/tty_buffer.c
4902 index f8eba1c5412f..677fa99b7747 100644
4903 --- a/drivers/tty/tty_buffer.c
4904 +++ b/drivers/tty/tty_buffer.c
4905 @@ -446,7 +446,7 @@ EXPORT_SYMBOL_GPL(tty_prepare_flip_string);
4906 * Callers other than flush_to_ldisc() need to exclude the kworker
4907 * from concurrent use of the line discipline, see paste_selection().
4908 *
4909 - * Returns the number of bytes not processed
4910 + * Returns the number of bytes processed
4911 */
4912 int tty_ldisc_receive_buf(struct tty_ldisc *ld, const unsigned char *p,
4913 char *f, int count)
4914 diff --git a/drivers/usb/chipidea/ci_hdrc_msm.c b/drivers/usb/chipidea/ci_hdrc_msm.c
4915 index bb626120296f..53f3bf459dd1 100644
4916 --- a/drivers/usb/chipidea/ci_hdrc_msm.c
4917 +++ b/drivers/usb/chipidea/ci_hdrc_msm.c
4918 @@ -251,7 +251,7 @@ static int ci_hdrc_msm_probe(struct platform_device *pdev)
4919 if (ret)
4920 goto err_mux;
4921
4922 - ulpi_node = of_find_node_by_name(of_node_get(pdev->dev.of_node), "ulpi");
4923 + ulpi_node = of_get_child_by_name(pdev->dev.of_node, "ulpi");
4924 if (ulpi_node) {
4925 phy_node = of_get_next_available_child(ulpi_node, NULL);
4926 ci->hsic = of_device_is_compatible(phy_node, "qcom,usb-hsic-phy");
4927 diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c
4928 index 843ef46d2537..9e3355b97396 100644
4929 --- a/drivers/usb/core/config.c
4930 +++ b/drivers/usb/core/config.c
4931 @@ -1007,7 +1007,7 @@ int usb_get_bos_descriptor(struct usb_device *dev)
4932 case USB_SSP_CAP_TYPE:
4933 ssp_cap = (struct usb_ssp_cap_descriptor *)buffer;
4934 ssac = (le32_to_cpu(ssp_cap->bmAttributes) &
4935 - USB_SSP_SUBLINK_SPEED_ATTRIBS) + 1;
4936 + USB_SSP_SUBLINK_SPEED_ATTRIBS);
4937 if (length >= USB_DT_USB_SSP_CAP_SIZE(ssac))
4938 dev->bos->ssp_cap = ssp_cap;
4939 break;
4940 diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c
4941 index 50010282c010..c05c4f877750 100644
4942 --- a/drivers/usb/core/quirks.c
4943 +++ b/drivers/usb/core/quirks.c
4944 @@ -57,10 +57,11 @@ static const struct usb_device_id usb_quirk_list[] = {
4945 /* Microsoft LifeCam-VX700 v2.0 */
4946 { USB_DEVICE(0x045e, 0x0770), .driver_info = USB_QUIRK_RESET_RESUME },
4947
4948 - /* Logitech HD Pro Webcams C920, C920-C and C930e */
4949 + /* Logitech HD Pro Webcams C920, C920-C, C925e and C930e */
4950 { USB_DEVICE(0x046d, 0x082d), .driver_info = USB_QUIRK_DELAY_INIT },
4951 { USB_DEVICE(0x046d, 0x0841), .driver_info = USB_QUIRK_DELAY_INIT },
4952 { USB_DEVICE(0x046d, 0x0843), .driver_info = USB_QUIRK_DELAY_INIT },
4953 + { USB_DEVICE(0x046d, 0x085b), .driver_info = USB_QUIRK_DELAY_INIT },
4954
4955 /* Logitech ConferenceCam CC3000e */
4956 { USB_DEVICE(0x046d, 0x0847), .driver_info = USB_QUIRK_DELAY_INIT },
4957 @@ -154,6 +155,9 @@ static const struct usb_device_id usb_quirk_list[] = {
4958 /* Genesys Logic hub, internally used by KY-688 USB 3.1 Type-C Hub */
4959 { USB_DEVICE(0x05e3, 0x0612), .driver_info = USB_QUIRK_NO_LPM },
4960
4961 + /* ELSA MicroLink 56K */
4962 + { USB_DEVICE(0x05cc, 0x2267), .driver_info = USB_QUIRK_RESET_RESUME },
4963 +
4964 /* Genesys Logic hub, internally used by Moshi USB to Ethernet Adapter */
4965 { USB_DEVICE(0x05e3, 0x0616), .driver_info = USB_QUIRK_NO_LPM },
4966
4967 diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c
4968 index 76f392954733..abb8f19ae40f 100644
4969 --- a/drivers/usb/host/xhci-pci.c
4970 +++ b/drivers/usb/host/xhci-pci.c
4971 @@ -189,6 +189,9 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci)
4972 xhci->quirks |= XHCI_TRUST_TX_LENGTH;
4973 xhci->quirks |= XHCI_BROKEN_STREAMS;
4974 }
4975 + if (pdev->vendor == PCI_VENDOR_ID_RENESAS &&
4976 + pdev->device == 0x0014)
4977 + xhci->quirks |= XHCI_TRUST_TX_LENGTH;
4978 if (pdev->vendor == PCI_VENDOR_ID_RENESAS &&
4979 pdev->device == 0x0015)
4980 xhci->quirks |= XHCI_RESET_ON_RESUME;
4981 diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c
4982 index 49d1b2d4606d..d038e543c246 100644
4983 --- a/drivers/usb/serial/ftdi_sio.c
4984 +++ b/drivers/usb/serial/ftdi_sio.c
4985 @@ -1017,6 +1017,7 @@ static const struct usb_device_id id_table_combined[] = {
4986 .driver_info = (kernel_ulong_t)&ftdi_jtag_quirk },
4987 { USB_DEVICE(CYPRESS_VID, CYPRESS_WICED_BT_USB_PID) },
4988 { USB_DEVICE(CYPRESS_VID, CYPRESS_WICED_WL_USB_PID) },
4989 + { USB_DEVICE(AIRBUS_DS_VID, AIRBUS_DS_P8GR) },
4990 { } /* Terminating entry */
4991 };
4992
4993 diff --git a/drivers/usb/serial/ftdi_sio_ids.h b/drivers/usb/serial/ftdi_sio_ids.h
4994 index 4faa09fe308c..8b4ecd2bd297 100644
4995 --- a/drivers/usb/serial/ftdi_sio_ids.h
4996 +++ b/drivers/usb/serial/ftdi_sio_ids.h
4997 @@ -914,6 +914,12 @@
4998 #define ICPDAS_I7561U_PID 0x0104
4999 #define ICPDAS_I7563U_PID 0x0105
5000
5001 +/*
5002 + * Airbus Defence and Space
5003 + */
5004 +#define AIRBUS_DS_VID 0x1e8e /* Vendor ID */
5005 +#define AIRBUS_DS_P8GR 0x6001 /* Tetra P8GR */
5006 +
5007 /*
5008 * RT Systems programming cables for various ham radios
5009 */
5010 diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c
5011 index 54e316b1892d..a9400458ccea 100644
5012 --- a/drivers/usb/serial/option.c
5013 +++ b/drivers/usb/serial/option.c
5014 @@ -236,6 +236,8 @@ static void option_instat_callback(struct urb *urb);
5015 /* These Quectel products use Qualcomm's vendor ID */
5016 #define QUECTEL_PRODUCT_UC20 0x9003
5017 #define QUECTEL_PRODUCT_UC15 0x9090
5018 +/* These Yuga products use Qualcomm's vendor ID */
5019 +#define YUGA_PRODUCT_CLM920_NC5 0x9625
5020
5021 #define QUECTEL_VENDOR_ID 0x2c7c
5022 /* These Quectel products use Quectel's vendor ID */
5023 @@ -283,6 +285,7 @@ static void option_instat_callback(struct urb *urb);
5024 #define TELIT_PRODUCT_LE922_USBCFG3 0x1043
5025 #define TELIT_PRODUCT_LE922_USBCFG5 0x1045
5026 #define TELIT_PRODUCT_ME910 0x1100
5027 +#define TELIT_PRODUCT_ME910_DUAL_MODEM 0x1101
5028 #define TELIT_PRODUCT_LE920 0x1200
5029 #define TELIT_PRODUCT_LE910 0x1201
5030 #define TELIT_PRODUCT_LE910_USBCFG4 0x1206
5031 @@ -648,6 +651,11 @@ static const struct option_blacklist_info telit_me910_blacklist = {
5032 .reserved = BIT(1) | BIT(3),
5033 };
5034
5035 +static const struct option_blacklist_info telit_me910_dual_modem_blacklist = {
5036 + .sendsetup = BIT(0),
5037 + .reserved = BIT(3),
5038 +};
5039 +
5040 static const struct option_blacklist_info telit_le910_blacklist = {
5041 .sendsetup = BIT(0),
5042 .reserved = BIT(1) | BIT(2),
5043 @@ -677,6 +685,10 @@ static const struct option_blacklist_info cinterion_rmnet2_blacklist = {
5044 .reserved = BIT(4) | BIT(5),
5045 };
5046
5047 +static const struct option_blacklist_info yuga_clm920_nc5_blacklist = {
5048 + .reserved = BIT(1) | BIT(4),
5049 +};
5050 +
5051 static const struct usb_device_id option_ids[] = {
5052 { USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_COLT) },
5053 { USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_RICOLA) },
5054 @@ -1181,6 +1193,9 @@ static const struct usb_device_id option_ids[] = {
5055 { USB_DEVICE(QUALCOMM_VENDOR_ID, QUECTEL_PRODUCT_UC15)},
5056 { USB_DEVICE(QUALCOMM_VENDOR_ID, QUECTEL_PRODUCT_UC20),
5057 .driver_info = (kernel_ulong_t)&net_intf4_blacklist },
5058 + /* Yuga products use Qualcomm vendor ID */
5059 + { USB_DEVICE(QUALCOMM_VENDOR_ID, YUGA_PRODUCT_CLM920_NC5),
5060 + .driver_info = (kernel_ulong_t)&yuga_clm920_nc5_blacklist },
5061 /* Quectel products using Quectel vendor ID */
5062 { USB_DEVICE(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EC21),
5063 .driver_info = (kernel_ulong_t)&net_intf4_blacklist },
5064 @@ -1247,6 +1262,8 @@ static const struct usb_device_id option_ids[] = {
5065 .driver_info = (kernel_ulong_t)&telit_le922_blacklist_usbcfg0 },
5066 { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_ME910),
5067 .driver_info = (kernel_ulong_t)&telit_me910_blacklist },
5068 + { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_ME910_DUAL_MODEM),
5069 + .driver_info = (kernel_ulong_t)&telit_me910_dual_modem_blacklist },
5070 { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE910),
5071 .driver_info = (kernel_ulong_t)&telit_le910_blacklist },
5072 { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE910_USBCFG4),
5073 diff --git a/drivers/usb/serial/qcserial.c b/drivers/usb/serial/qcserial.c
5074 index 9f9d3a904464..55a8fb25ce2b 100644
5075 --- a/drivers/usb/serial/qcserial.c
5076 +++ b/drivers/usb/serial/qcserial.c
5077 @@ -166,6 +166,8 @@ static const struct usb_device_id id_table[] = {
5078 {DEVICE_SWI(0x1199, 0x9079)}, /* Sierra Wireless EM74xx */
5079 {DEVICE_SWI(0x1199, 0x907a)}, /* Sierra Wireless EM74xx QDL */
5080 {DEVICE_SWI(0x1199, 0x907b)}, /* Sierra Wireless EM74xx */
5081 + {DEVICE_SWI(0x1199, 0x9090)}, /* Sierra Wireless EM7565 QDL */
5082 + {DEVICE_SWI(0x1199, 0x9091)}, /* Sierra Wireless EM7565 */
5083 {DEVICE_SWI(0x413c, 0x81a2)}, /* Dell Wireless 5806 Gobi(TM) 4G LTE Mobile Broadband Card */
5084 {DEVICE_SWI(0x413c, 0x81a3)}, /* Dell Wireless 5570 HSPA+ (42Mbps) Mobile Broadband Card */
5085 {DEVICE_SWI(0x413c, 0x81a4)}, /* Dell Wireless 5570e HSPA+ (42Mbps) Mobile Broadband Card */
5086 @@ -346,6 +348,7 @@ static int qcprobe(struct usb_serial *serial, const struct usb_device_id *id)
5087 break;
5088 case 2:
5089 dev_dbg(dev, "NMEA GPS interface found\n");
5090 + sendsetup = true;
5091 break;
5092 case 3:
5093 dev_dbg(dev, "Modem port found\n");
5094 diff --git a/drivers/usb/usbip/stub_dev.c b/drivers/usb/usbip/stub_dev.c
5095 index c653ce533430..720408d39f11 100644
5096 --- a/drivers/usb/usbip/stub_dev.c
5097 +++ b/drivers/usb/usbip/stub_dev.c
5098 @@ -163,8 +163,7 @@ static void stub_shutdown_connection(struct usbip_device *ud)
5099 * step 1?
5100 */
5101 if (ud->tcp_socket) {
5102 - dev_dbg(&sdev->udev->dev, "shutdown tcp_socket %p\n",
5103 - ud->tcp_socket);
5104 + dev_dbg(&sdev->udev->dev, "shutdown sockfd %d\n", ud->sockfd);
5105 kernel_sock_shutdown(ud->tcp_socket, SHUT_RDWR);
5106 }
5107
5108 diff --git a/drivers/usb/usbip/stub_main.c b/drivers/usb/usbip/stub_main.c
5109 index 7170404e8979..6968c906fa29 100644
5110 --- a/drivers/usb/usbip/stub_main.c
5111 +++ b/drivers/usb/usbip/stub_main.c
5112 @@ -251,11 +251,12 @@ void stub_device_cleanup_urbs(struct stub_device *sdev)
5113 struct stub_priv *priv;
5114 struct urb *urb;
5115
5116 - dev_dbg(&sdev->udev->dev, "free sdev %p\n", sdev);
5117 + dev_dbg(&sdev->udev->dev, "Stub device cleaning up urbs\n");
5118
5119 while ((priv = stub_priv_pop(sdev))) {
5120 urb = priv->urb;
5121 - dev_dbg(&sdev->udev->dev, "free urb %p\n", urb);
5122 + dev_dbg(&sdev->udev->dev, "free urb seqnum %lu\n",
5123 + priv->seqnum);
5124 usb_kill_urb(urb);
5125
5126 kmem_cache_free(stub_priv_cache, priv);
5127 diff --git a/drivers/usb/usbip/stub_rx.c b/drivers/usb/usbip/stub_rx.c
5128 index 283a9be77a22..5b807185f79e 100644
5129 --- a/drivers/usb/usbip/stub_rx.c
5130 +++ b/drivers/usb/usbip/stub_rx.c
5131 @@ -225,9 +225,6 @@ static int stub_recv_cmd_unlink(struct stub_device *sdev,
5132 if (priv->seqnum != pdu->u.cmd_unlink.seqnum)
5133 continue;
5134
5135 - dev_info(&priv->urb->dev->dev, "unlink urb %p\n",
5136 - priv->urb);
5137 -
5138 /*
5139 * This matched urb is not completed yet (i.e., be in
5140 * flight in usb hcd hardware/driver). Now we are
5141 @@ -266,8 +263,8 @@ static int stub_recv_cmd_unlink(struct stub_device *sdev,
5142 ret = usb_unlink_urb(priv->urb);
5143 if (ret != -EINPROGRESS)
5144 dev_err(&priv->urb->dev->dev,
5145 - "failed to unlink a urb %p, ret %d\n",
5146 - priv->urb, ret);
5147 + "failed to unlink a urb # %lu, ret %d\n",
5148 + priv->seqnum, ret);
5149
5150 return 0;
5151 }
5152 diff --git a/drivers/usb/usbip/stub_tx.c b/drivers/usb/usbip/stub_tx.c
5153 index 87ff94be4235..96aa375b80d9 100644
5154 --- a/drivers/usb/usbip/stub_tx.c
5155 +++ b/drivers/usb/usbip/stub_tx.c
5156 @@ -102,7 +102,7 @@ void stub_complete(struct urb *urb)
5157 /* link a urb to the queue of tx. */
5158 spin_lock_irqsave(&sdev->priv_lock, flags);
5159 if (sdev->ud.tcp_socket == NULL) {
5160 - usbip_dbg_stub_tx("ignore urb for closed connection %p", urb);
5161 + usbip_dbg_stub_tx("ignore urb for closed connection\n");
5162 /* It will be freed in stub_device_cleanup_urbs(). */
5163 } else if (priv->unlinking) {
5164 stub_enqueue_ret_unlink(sdev, priv->seqnum, urb->status);
5165 @@ -204,8 +204,8 @@ static int stub_send_ret_submit(struct stub_device *sdev)
5166
5167 /* 1. setup usbip_header */
5168 setup_ret_submit_pdu(&pdu_header, urb);
5169 - usbip_dbg_stub_tx("setup txdata seqnum: %d urb: %p\n",
5170 - pdu_header.base.seqnum, urb);
5171 + usbip_dbg_stub_tx("setup txdata seqnum: %d\n",
5172 + pdu_header.base.seqnum);
5173 usbip_header_correct_endian(&pdu_header, 1);
5174
5175 iov[iovnum].iov_base = &pdu_header;
5176 diff --git a/drivers/usb/usbip/usbip_common.c b/drivers/usb/usbip/usbip_common.c
5177 index 2281f3562870..17b599b923f3 100644
5178 --- a/drivers/usb/usbip/usbip_common.c
5179 +++ b/drivers/usb/usbip/usbip_common.c
5180 @@ -331,26 +331,20 @@ int usbip_recv(struct socket *sock, void *buf, int size)
5181 struct msghdr msg = {.msg_flags = MSG_NOSIGNAL};
5182 int total = 0;
5183
5184 + if (!sock || !buf || !size)
5185 + return -EINVAL;
5186 +
5187 iov_iter_kvec(&msg.msg_iter, READ|ITER_KVEC, &iov, 1, size);
5188
5189 usbip_dbg_xmit("enter\n");
5190
5191 - if (!sock || !buf || !size) {
5192 - pr_err("invalid arg, sock %p buff %p size %d\n", sock, buf,
5193 - size);
5194 - return -EINVAL;
5195 - }
5196 -
5197 do {
5198 - int sz = msg_data_left(&msg);
5199 + msg_data_left(&msg);
5200 sock->sk->sk_allocation = GFP_NOIO;
5201
5202 result = sock_recvmsg(sock, &msg, MSG_WAITALL);
5203 - if (result <= 0) {
5204 - pr_debug("receive sock %p buf %p size %u ret %d total %d\n",
5205 - sock, buf + total, sz, result, total);
5206 + if (result <= 0)
5207 goto err;
5208 - }
5209
5210 total += result;
5211 } while (msg_data_left(&msg));
5212 diff --git a/drivers/usb/usbip/vhci_hcd.c b/drivers/usb/usbip/vhci_hcd.c
5213 index 1f0cf81cc145..692cfdef667e 100644
5214 --- a/drivers/usb/usbip/vhci_hcd.c
5215 +++ b/drivers/usb/usbip/vhci_hcd.c
5216 @@ -670,9 +670,6 @@ static int vhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flag
5217 struct vhci_device *vdev;
5218 unsigned long flags;
5219
5220 - usbip_dbg_vhci_hc("enter, usb_hcd %p urb %p mem_flags %d\n",
5221 - hcd, urb, mem_flags);
5222 -
5223 if (portnum > VHCI_HC_PORTS) {
5224 pr_err("invalid port number %d\n", portnum);
5225 return -ENODEV;
5226 @@ -836,8 +833,6 @@ static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status)
5227 struct vhci_device *vdev;
5228 unsigned long flags;
5229
5230 - pr_info("dequeue a urb %p\n", urb);
5231 -
5232 spin_lock_irqsave(&vhci->lock, flags);
5233
5234 priv = urb->hcpriv;
5235 @@ -865,7 +860,6 @@ static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status)
5236 /* tcp connection is closed */
5237 spin_lock(&vdev->priv_lock);
5238
5239 - pr_info("device %p seems to be disconnected\n", vdev);
5240 list_del(&priv->list);
5241 kfree(priv);
5242 urb->hcpriv = NULL;
5243 @@ -877,8 +871,6 @@ static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status)
5244 * vhci_rx will receive RET_UNLINK and give back the URB.
5245 * Otherwise, we give back it here.
5246 */
5247 - pr_info("gives back urb %p\n", urb);
5248 -
5249 usb_hcd_unlink_urb_from_ep(hcd, urb);
5250
5251 spin_unlock_irqrestore(&vhci->lock, flags);
5252 @@ -906,8 +898,6 @@ static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status)
5253
5254 unlink->unlink_seqnum = priv->seqnum;
5255
5256 - pr_info("device %p seems to be still connected\n", vdev);
5257 -
5258 /* send cmd_unlink and try to cancel the pending URB in the
5259 * peer */
5260 list_add_tail(&unlink->list, &vdev->unlink_tx);
5261 @@ -989,7 +979,7 @@ static void vhci_shutdown_connection(struct usbip_device *ud)
5262
5263 /* need this? see stub_dev.c */
5264 if (ud->tcp_socket) {
5265 - pr_debug("shutdown tcp_socket %p\n", ud->tcp_socket);
5266 + pr_debug("shutdown tcp_socket %d\n", ud->sockfd);
5267 kernel_sock_shutdown(ud->tcp_socket, SHUT_RDWR);
5268 }
5269
5270 diff --git a/drivers/usb/usbip/vhci_rx.c b/drivers/usb/usbip/vhci_rx.c
5271 index ef2f2d5ca6b2..1343037d00f9 100644
5272 --- a/drivers/usb/usbip/vhci_rx.c
5273 +++ b/drivers/usb/usbip/vhci_rx.c
5274 @@ -37,24 +37,23 @@ struct urb *pickup_urb_and_free_priv(struct vhci_device *vdev, __u32 seqnum)
5275 urb = priv->urb;
5276 status = urb->status;
5277
5278 - usbip_dbg_vhci_rx("find urb %p vurb %p seqnum %u\n",
5279 - urb, priv, seqnum);
5280 + usbip_dbg_vhci_rx("find urb seqnum %u\n", seqnum);
5281
5282 switch (status) {
5283 case -ENOENT:
5284 /* fall through */
5285 case -ECONNRESET:
5286 - dev_info(&urb->dev->dev,
5287 - "urb %p was unlinked %ssynchronuously.\n", urb,
5288 - status == -ENOENT ? "" : "a");
5289 + dev_dbg(&urb->dev->dev,
5290 + "urb seq# %u was unlinked %ssynchronously\n",
5291 + seqnum, status == -ENOENT ? "" : "a");
5292 break;
5293 case -EINPROGRESS:
5294 /* no info output */
5295 break;
5296 default:
5297 - dev_info(&urb->dev->dev,
5298 - "urb %p may be in a error, status %d\n", urb,
5299 - status);
5300 + dev_dbg(&urb->dev->dev,
5301 + "urb seq# %u may be in error, status %d\n",
5302 + seqnum, status);
5303 }
5304
5305 list_del(&priv->list);
5306 @@ -81,8 +80,8 @@ static void vhci_recv_ret_submit(struct vhci_device *vdev,
5307 spin_unlock_irqrestore(&vdev->priv_lock, flags);
5308
5309 if (!urb) {
5310 - pr_err("cannot find a urb of seqnum %u\n", pdu->base.seqnum);
5311 - pr_info("max seqnum %d\n",
5312 + pr_err("cannot find a urb of seqnum %u max seqnum %d\n",
5313 + pdu->base.seqnum,
5314 atomic_read(&vhci_hcd->seqnum));
5315 usbip_event_add(ud, VDEV_EVENT_ERROR_TCP);
5316 return;
5317 @@ -105,7 +104,7 @@ static void vhci_recv_ret_submit(struct vhci_device *vdev,
5318 if (usbip_dbg_flag_vhci_rx)
5319 usbip_dump_urb(urb);
5320
5321 - usbip_dbg_vhci_rx("now giveback urb %p\n", urb);
5322 + usbip_dbg_vhci_rx("now giveback urb %u\n", pdu->base.seqnum);
5323
5324 spin_lock_irqsave(&vhci->lock, flags);
5325 usb_hcd_unlink_urb_from_ep(vhci_hcd_to_hcd(vhci_hcd), urb);
5326 @@ -172,7 +171,7 @@ static void vhci_recv_ret_unlink(struct vhci_device *vdev,
5327 pr_info("the urb (seqnum %d) was already given back\n",
5328 pdu->base.seqnum);
5329 } else {
5330 - usbip_dbg_vhci_rx("now giveback urb %p\n", urb);
5331 + usbip_dbg_vhci_rx("now giveback urb %d\n", pdu->base.seqnum);
5332
5333 /* If unlink is successful, status is -ECONNRESET */
5334 urb->status = pdu->u.ret_unlink.status;
5335 diff --git a/drivers/usb/usbip/vhci_tx.c b/drivers/usb/usbip/vhci_tx.c
5336 index 3e7878fe2fd4..a9a663a578b6 100644
5337 --- a/drivers/usb/usbip/vhci_tx.c
5338 +++ b/drivers/usb/usbip/vhci_tx.c
5339 @@ -83,7 +83,8 @@ static int vhci_send_cmd_submit(struct vhci_device *vdev)
5340 memset(&msg, 0, sizeof(msg));
5341 memset(&iov, 0, sizeof(iov));
5342
5343 - usbip_dbg_vhci_tx("setup txdata urb %p\n", urb);
5344 + usbip_dbg_vhci_tx("setup txdata urb seqnum %lu\n",
5345 + priv->seqnum);
5346
5347 /* 1. setup usbip_header */
5348 setup_cmd_submit_pdu(&pdu_header, urb);
5349 diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
5350 index fd47bd96b5d3..6362e3606aa5 100644
5351 --- a/include/linux/blkdev.h
5352 +++ b/include/linux/blkdev.h
5353 @@ -241,14 +241,24 @@ struct request {
5354 struct request *next_rq;
5355 };
5356
5357 +static inline bool blk_op_is_scsi(unsigned int op)
5358 +{
5359 + return op == REQ_OP_SCSI_IN || op == REQ_OP_SCSI_OUT;
5360 +}
5361 +
5362 +static inline bool blk_op_is_private(unsigned int op)
5363 +{
5364 + return op == REQ_OP_DRV_IN || op == REQ_OP_DRV_OUT;
5365 +}
5366 +
5367 static inline bool blk_rq_is_scsi(struct request *rq)
5368 {
5369 - return req_op(rq) == REQ_OP_SCSI_IN || req_op(rq) == REQ_OP_SCSI_OUT;
5370 + return blk_op_is_scsi(req_op(rq));
5371 }
5372
5373 static inline bool blk_rq_is_private(struct request *rq)
5374 {
5375 - return req_op(rq) == REQ_OP_DRV_IN || req_op(rq) == REQ_OP_DRV_OUT;
5376 + return blk_op_is_private(req_op(rq));
5377 }
5378
5379 static inline bool blk_rq_is_passthrough(struct request *rq)
5380 @@ -256,6 +266,13 @@ static inline bool blk_rq_is_passthrough(struct request *rq)
5381 return blk_rq_is_scsi(rq) || blk_rq_is_private(rq);
5382 }
5383
5384 +static inline bool bio_is_passthrough(struct bio *bio)
5385 +{
5386 + unsigned op = bio_op(bio);
5387 +
5388 + return blk_op_is_scsi(op) || blk_op_is_private(op);
5389 +}
5390 +
5391 static inline unsigned short req_get_ioprio(struct request *req)
5392 {
5393 return req->ioprio;
5394 @@ -952,7 +969,7 @@ extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
5395 extern void blk_rq_unprep_clone(struct request *rq);
5396 extern blk_status_t blk_insert_cloned_request(struct request_queue *q,
5397 struct request *rq);
5398 -extern int blk_rq_append_bio(struct request *rq, struct bio *bio);
5399 +extern int blk_rq_append_bio(struct request *rq, struct bio **bio);
5400 extern void blk_delay_queue(struct request_queue *, unsigned long);
5401 extern void blk_queue_split(struct request_queue *, struct bio **);
5402 extern void blk_recount_segments(struct request_queue *, struct bio *);
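The prototype change above, int blk_rq_append_bio(struct request *rq, struct bio **bio), exists because the block layer may substitute a bounce bio for the one the caller passed in, so the caller has to keep using the possibly-updated pointer. A minimal caller sketch of the pattern the osd and pSCSI hunks earlier in this patch follow; example_map_one_bio() is an illustrative name, not something from this patch:

#include <linux/blkdev.h>

static int example_map_one_bio(struct request *req, struct bio *bio)
{
	struct bio *bounce_bio = bio;	/* may be replaced by a bounce bio */
	int ret;

	ret = blk_rq_append_bio(req, &bounce_bio);
	if (ret)
		return ret;

	/* from here on, only bounce_bio is valid for this request */
	return 0;
}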
5403 diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
5404 index 2477a5cb5bd5..fb83dee528a1 100644
5405 --- a/include/linux/cpuhotplug.h
5406 +++ b/include/linux/cpuhotplug.h
5407 @@ -86,7 +86,7 @@ enum cpuhp_state {
5408 CPUHP_MM_ZSWP_POOL_PREPARE,
5409 CPUHP_KVM_PPC_BOOK3S_PREPARE,
5410 CPUHP_ZCOMP_PREPARE,
5411 - CPUHP_TIMERS_DEAD,
5412 + CPUHP_TIMERS_PREPARE,
5413 CPUHP_MIPS_SOC_PREPARE,
5414 CPUHP_BP_PREPARE_DYN,
5415 CPUHP_BP_PREPARE_DYN_END = CPUHP_BP_PREPARE_DYN + 20,
5416 diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
5417 index ea04ca024f0d..067a6fa675ed 100644
5418 --- a/include/linux/ipv6.h
5419 +++ b/include/linux/ipv6.h
5420 @@ -272,7 +272,8 @@ struct ipv6_pinfo {
5421 * 100: prefer care-of address
5422 */
5423 dontfrag:1,
5424 - autoflowlabel:1;
5425 + autoflowlabel:1,
5426 + autoflowlabel_set:1;
5427 __u8 min_hopcount;
5428 __u8 tclass;
5429 __be32 rcv_flowinfo;
5430 diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
5431 index 401c8972cc3a..8b3d0103c03a 100644
5432 --- a/include/linux/mlx5/driver.h
5433 +++ b/include/linux/mlx5/driver.h
5434 @@ -546,6 +546,7 @@ struct mlx5_core_sriov {
5435 };
5436
5437 struct mlx5_irq_info {
5438 + cpumask_var_t mask;
5439 char name[MLX5_MAX_IRQ_NAME];
5440 };
5441
5442 diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
5443 index 69772347f866..c8091f06eaa4 100644
5444 --- a/include/linux/mlx5/mlx5_ifc.h
5445 +++ b/include/linux/mlx5/mlx5_ifc.h
5446 @@ -147,7 +147,7 @@ enum {
5447 MLX5_CMD_OP_ALLOC_Q_COUNTER = 0x771,
5448 MLX5_CMD_OP_DEALLOC_Q_COUNTER = 0x772,
5449 MLX5_CMD_OP_QUERY_Q_COUNTER = 0x773,
5450 - MLX5_CMD_OP_SET_RATE_LIMIT = 0x780,
5451 + MLX5_CMD_OP_SET_PP_RATE_LIMIT = 0x780,
5452 MLX5_CMD_OP_QUERY_RATE_LIMIT = 0x781,
5453 MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT = 0x782,
5454 MLX5_CMD_OP_DESTROY_SCHEDULING_ELEMENT = 0x783,
5455 @@ -7233,7 +7233,7 @@ struct mlx5_ifc_add_vxlan_udp_dport_in_bits {
5456 u8 vxlan_udp_port[0x10];
5457 };
5458
5459 -struct mlx5_ifc_set_rate_limit_out_bits {
5460 +struct mlx5_ifc_set_pp_rate_limit_out_bits {
5461 u8 status[0x8];
5462 u8 reserved_at_8[0x18];
5463
5464 @@ -7242,7 +7242,7 @@ struct mlx5_ifc_set_rate_limit_out_bits {
5465 u8 reserved_at_40[0x40];
5466 };
5467
5468 -struct mlx5_ifc_set_rate_limit_in_bits {
5469 +struct mlx5_ifc_set_pp_rate_limit_in_bits {
5470 u8 opcode[0x10];
5471 u8 reserved_at_10[0x10];
5472
5473 @@ -7255,6 +7255,8 @@ struct mlx5_ifc_set_rate_limit_in_bits {
5474 u8 reserved_at_60[0x20];
5475
5476 u8 rate_limit[0x20];
5477 +
5478 + u8 reserved_at_a0[0x160];
5479 };
5480
5481 struct mlx5_ifc_access_register_out_bits {
5482 diff --git a/include/linux/pti.h b/include/linux/pti.h
5483 new file mode 100644
5484 index 000000000000..0174883a935a
5485 --- /dev/null
5486 +++ b/include/linux/pti.h
5487 @@ -0,0 +1,11 @@
5488 +// SPDX-License-Identifier: GPL-2.0
5489 +#ifndef _INCLUDE_PTI_H
5490 +#define _INCLUDE_PTI_H
5491 +
5492 +#ifdef CONFIG_PAGE_TABLE_ISOLATION
5493 +#include <asm/pti.h>
5494 +#else
5495 +static inline void pti_init(void) { }
5496 +#endif
5497 +
5498 +#endif
5499 diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h
5500 index 37b4bb2545b3..6866df4f31b5 100644
5501 --- a/include/linux/ptr_ring.h
5502 +++ b/include/linux/ptr_ring.h
5503 @@ -101,12 +101,18 @@ static inline bool ptr_ring_full_bh(struct ptr_ring *r)
5504
5505 /* Note: callers invoking this in a loop must use a compiler barrier,
5506 * for example cpu_relax(). Callers must hold producer_lock.
5507 + * Callers are responsible for making sure the pointer that is being
5508 + * queued points to valid data.
5509 */
5510 static inline int __ptr_ring_produce(struct ptr_ring *r, void *ptr)
5511 {
5512 if (unlikely(!r->size) || r->queue[r->producer])
5513 return -ENOSPC;
5514
5515 + /* Make sure the pointer we are storing points to valid data. */
5516 + /* Pairs with smp_read_barrier_depends in __ptr_ring_consume. */
5517 + smp_wmb();
5518 +
5519 r->queue[r->producer++] = ptr;
5520 if (unlikely(r->producer >= r->size))
5521 r->producer = 0;
5522 @@ -275,6 +281,9 @@ static inline void *__ptr_ring_consume(struct ptr_ring *r)
5523 if (ptr)
5524 __ptr_ring_discard_one(r);
5525
5526 + /* Make sure anyone accessing data through the pointer is up to date. */
5527 + /* Pairs with smp_wmb in __ptr_ring_produce. */
5528 + smp_read_barrier_depends();
5529 return ptr;
5530 }
5531
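The two barriers added above pair with each other: smp_wmb() in __ptr_ring_produce() orders the stores that initialize the object before the store that publishes its pointer, and smp_read_barrier_depends() in __ptr_ring_consume() orders the dependent reads after the pointer load. A self-contained userspace analogue of that pairing, sketched with C11 release/acquire atomics rather than the kernel primitives (single producer and single consumer assumed; ring_produce()/ring_consume() are illustrative names, not part of the patch):

#include <stdatomic.h>
#include <stddef.h>

#define RING_SIZE 16

static _Atomic(void *) ring[RING_SIZE];
static int prod_idx, cons_idx;

/* Producer: the release store publishes the pointer only after the writes
 * that initialized *ptr, mirroring the smp_wmb() added above. */
static int ring_produce(void *ptr)
{
	if (atomic_load_explicit(&ring[prod_idx], memory_order_relaxed))
		return -1;		/* slot still occupied */
	atomic_store_explicit(&ring[prod_idx], ptr, memory_order_release);
	if (++prod_idx == RING_SIZE)
		prod_idx = 0;
	return 0;
}

/* Consumer: the acquire load pairs with the release store above, playing the
 * role of the smp_read_barrier_depends() added after the load. */
static void *ring_consume(void)
{
	void *ptr = atomic_load_explicit(&ring[cons_idx], memory_order_acquire);

	if (ptr) {
		atomic_store_explicit(&ring[cons_idx], NULL, memory_order_relaxed);
		if (++cons_idx == RING_SIZE)
			cons_idx = 0;
	}
	return ptr;
}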
5532 diff --git a/include/linux/tcp.h b/include/linux/tcp.h
5533 index 4aa40ef02d32..e8418fc77a43 100644
5534 --- a/include/linux/tcp.h
5535 +++ b/include/linux/tcp.h
5536 @@ -214,7 +214,8 @@ struct tcp_sock {
5537 u8 chrono_type:2, /* current chronograph type */
5538 rate_app_limited:1, /* rate_{delivered,interval_us} limited? */
5539 fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */
5540 - unused:4;
5541 + is_sack_reneg:1, /* in recovery from loss with SACK reneg? */
5542 + unused:3;
5543 u8 nonagle : 4,/* Disable Nagle algorithm? */
5544 thin_lto : 1,/* Use linear timeouts for thin streams */
5545 unused1 : 1,
5546 diff --git a/include/linux/tick.h b/include/linux/tick.h
5547 index cf413b344ddb..5cdac11dd317 100644
5548 --- a/include/linux/tick.h
5549 +++ b/include/linux/tick.h
5550 @@ -119,6 +119,7 @@ extern void tick_nohz_idle_exit(void);
5551 extern void tick_nohz_irq_exit(void);
5552 extern ktime_t tick_nohz_get_sleep_length(void);
5553 extern unsigned long tick_nohz_get_idle_calls(void);
5554 +extern unsigned long tick_nohz_get_idle_calls_cpu(int cpu);
5555 extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time);
5556 extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time);
5557 #else /* !CONFIG_NO_HZ_COMMON */
5558 diff --git a/include/linux/timer.h b/include/linux/timer.h
5559 index ac66f29c6916..e0ea1fe87572 100644
5560 --- a/include/linux/timer.h
5561 +++ b/include/linux/timer.h
5562 @@ -246,9 +246,11 @@ unsigned long round_jiffies_up(unsigned long j);
5563 unsigned long round_jiffies_up_relative(unsigned long j);
5564
5565 #ifdef CONFIG_HOTPLUG_CPU
5566 +int timers_prepare_cpu(unsigned int cpu);
5567 int timers_dead_cpu(unsigned int cpu);
5568 #else
5569 -#define timers_dead_cpu NULL
5570 +#define timers_prepare_cpu NULL
5571 +#define timers_dead_cpu NULL
5572 #endif
5573
5574 #endif
5575 diff --git a/include/net/ip.h b/include/net/ip.h
5576 index 9896f46cbbf1..af8addbaa3c1 100644
5577 --- a/include/net/ip.h
5578 +++ b/include/net/ip.h
5579 @@ -34,6 +34,7 @@
5580 #include <net/flow_dissector.h>
5581
5582 #define IPV4_MAX_PMTU 65535U /* RFC 2675, Section 5.1 */
5583 +#define IPV4_MIN_MTU 68 /* RFC 791 */
5584
5585 struct sock;
5586
5587 diff --git a/include/net/tcp.h b/include/net/tcp.h
5588 index 6ced69940f5c..0a13574134b8 100644
5589 --- a/include/net/tcp.h
5590 +++ b/include/net/tcp.h
5591 @@ -1085,7 +1085,7 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb);
5592 void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
5593 struct rate_sample *rs);
5594 void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
5595 - struct rate_sample *rs);
5596 + bool is_sack_reneg, struct rate_sample *rs);
5597 void tcp_rate_check_app_limited(struct sock *sk);
5598
5599 /* These functions determine how the current flow behaves in respect of SACK
5600 diff --git a/init/main.c b/init/main.c
5601 index 8a390f60ec81..b32ec72cdf3d 100644
5602 --- a/init/main.c
5603 +++ b/init/main.c
5604 @@ -75,6 +75,7 @@
5605 #include <linux/slab.h>
5606 #include <linux/perf_event.h>
5607 #include <linux/ptrace.h>
5608 +#include <linux/pti.h>
5609 #include <linux/blkdev.h>
5610 #include <linux/elevator.h>
5611 #include <linux/sched_clock.h>
5612 @@ -506,6 +507,8 @@ static void __init mm_init(void)
5613 ioremap_huge_init();
5614 /* Should be run before the first non-init thread is created */
5615 init_espfix_bsp();
5616 + /* Should be run after espfix64 is set up. */
5617 + pti_init();
5618 }
5619
5620 asmlinkage __visible void __init start_kernel(void)
5621 diff --git a/kernel/cpu.c b/kernel/cpu.c
5622 index 7891aecc6aec..f21bfa3172d8 100644
5623 --- a/kernel/cpu.c
5624 +++ b/kernel/cpu.c
5625 @@ -1277,9 +1277,9 @@ static struct cpuhp_step cpuhp_bp_states[] = {
5626 * before blk_mq_queue_reinit_notify() from notify_dead(),
5627 * otherwise a RCU stall occurs.
5628 */
5629 - [CPUHP_TIMERS_DEAD] = {
5630 + [CPUHP_TIMERS_PREPARE] = {
5631 .name = "timers:dead",
5632 - .startup.single = NULL,
5633 + .startup.single = timers_prepare_cpu,
5634 .teardown.single = timers_dead_cpu,
5635 },
5636 /* Kicks the plugged cpu into life */
5637 diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
5638 index 2f52ec0f1539..d6717a3331a1 100644
5639 --- a/kernel/sched/cpufreq_schedutil.c
5640 +++ b/kernel/sched/cpufreq_schedutil.c
5641 @@ -244,7 +244,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
5642 #ifdef CONFIG_NO_HZ_COMMON
5643 static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
5644 {
5645 - unsigned long idle_calls = tick_nohz_get_idle_calls();
5646 + unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu);
5647 bool ret = idle_calls == sg_cpu->saved_idle_calls;
5648
5649 sg_cpu->saved_idle_calls = idle_calls;
5650 diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
5651 index c7a899c5ce64..dfa4a117fee3 100644
5652 --- a/kernel/time/tick-sched.c
5653 +++ b/kernel/time/tick-sched.c
5654 @@ -674,6 +674,11 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
5655 ts->next_tick = 0;
5656 }
5657
5658 +static inline bool local_timer_softirq_pending(void)
5659 +{
5660 + return local_softirq_pending() & TIMER_SOFTIRQ;
5661 +}
5662 +
5663 static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
5664 ktime_t now, int cpu)
5665 {
5666 @@ -690,8 +695,18 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
5667 } while (read_seqretry(&jiffies_lock, seq));
5668 ts->last_jiffies = basejiff;
5669
5670 - if (rcu_needs_cpu(basemono, &next_rcu) ||
5671 - arch_needs_cpu() || irq_work_needs_cpu()) {
5672 + /*
5673 + * Keep the periodic tick, when RCU, architecture or irq_work
5674 + * requests it.
5675 + * Aside from that, check whether the local timer softirq is
5676 + * pending. If so, it's a bad idea to call get_next_timer_interrupt()
5677 + * because there is an already expired timer, so it will request
5678 + * immediate expiry, which rearms the hardware timer with a
5679 + * minimal delta which brings us back to this place
5680 + * immediately. Lather, rinse and repeat...
5681 + */
5682 + if (rcu_needs_cpu(basemono, &next_rcu) || arch_needs_cpu() ||
5683 + irq_work_needs_cpu() || local_timer_softirq_pending()) {
5684 next_tick = basemono + TICK_NSEC;
5685 } else {
5686 /*
5687 @@ -1009,6 +1024,19 @@ ktime_t tick_nohz_get_sleep_length(void)
5688 return ts->sleep_length;
5689 }
5690
5691 +/**
5692 + * tick_nohz_get_idle_calls_cpu - return the current idle calls counter value
5693 + * for a particular CPU.
5694 + *
5695 + * Called from the schedutil frequency scaling governor in scheduler context.
5696 + */
5697 +unsigned long tick_nohz_get_idle_calls_cpu(int cpu)
5698 +{
5699 + struct tick_sched *ts = tick_get_tick_sched(cpu);
5700 +
5701 + return ts->idle_calls;
5702 +}
5703 +
5704 /**
5705 * tick_nohz_get_idle_calls - return the current idle calls counter value
5706 *
5707 diff --git a/kernel/time/timer.c b/kernel/time/timer.c
5708 index f2674a056c26..73e3cdbc61f1 100644
5709 --- a/kernel/time/timer.c
5710 +++ b/kernel/time/timer.c
5711 @@ -814,11 +814,10 @@ static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu)
5712 struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu);
5713
5714 /*
5715 - * If the timer is deferrable and nohz is active then we need to use
5716 - * the deferrable base.
5717 + * If the timer is deferrable and NO_HZ_COMMON is set then we need
5718 + * to use the deferrable base.
5719 */
5720 - if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active &&
5721 - (tflags & TIMER_DEFERRABLE))
5722 + if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
5723 base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu);
5724 return base;
5725 }
5726 @@ -828,11 +827,10 @@ static inline struct timer_base *get_timer_this_cpu_base(u32 tflags)
5727 struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
5728
5729 /*
5730 - * If the timer is deferrable and nohz is active then we need to use
5731 - * the deferrable base.
5732 + * If the timer is deferrable and NO_HZ_COMMON is set then we need
5733 + * to use the deferrable base.
5734 */
5735 - if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active &&
5736 - (tflags & TIMER_DEFERRABLE))
5737 + if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
5738 base = this_cpu_ptr(&timer_bases[BASE_DEF]);
5739 return base;
5740 }
5741 @@ -984,8 +982,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
5742 if (!ret && pending_only)
5743 goto out_unlock;
5744
5745 - debug_activate(timer, expires);
5746 -
5747 new_base = get_target_base(base, timer->flags);
5748
5749 if (base != new_base) {
5750 @@ -1009,6 +1005,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
5751 }
5752 }
5753
5754 + debug_activate(timer, expires);
5755 +
5756 timer->expires = expires;
5757 /*
5758 * If 'idx' was calculated above and the base time did not advance
5759 @@ -1644,7 +1642,7 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)
5760 base->must_forward_clk = false;
5761
5762 __run_timers(base);
5763 - if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active)
5764 + if (IS_ENABLED(CONFIG_NO_HZ_COMMON))
5765 __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
5766 }
5767
5768 @@ -1803,6 +1801,21 @@ static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *h
5769 }
5770 }
5771
5772 +int timers_prepare_cpu(unsigned int cpu)
5773 +{
5774 + struct timer_base *base;
5775 + int b;
5776 +
5777 + for (b = 0; b < NR_BASES; b++) {
5778 + base = per_cpu_ptr(&timer_bases[b], cpu);
5779 + base->clk = jiffies;
5780 + base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
5781 + base->is_idle = false;
5782 + base->must_forward_clk = true;
5783 + }
5784 + return 0;
5785 +}
5786 +
5787 int timers_dead_cpu(unsigned int cpu)
5788 {
5789 struct timer_base *old_base;
5790 diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
5791 index 81279c6602ff..0476a9372014 100644
5792 --- a/kernel/trace/ring_buffer.c
5793 +++ b/kernel/trace/ring_buffer.c
5794 @@ -281,6 +281,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
5795 /* Missed count stored at end */
5796 #define RB_MISSED_STORED (1 << 30)
5797
5798 +#define RB_MISSED_FLAGS (RB_MISSED_EVENTS|RB_MISSED_STORED)
5799 +
5800 struct buffer_data_page {
5801 u64 time_stamp; /* page time stamp */
5802 local_t commit; /* write committed index */
5803 @@ -332,7 +334,9 @@ static void rb_init_page(struct buffer_data_page *bpage)
5804 */
5805 size_t ring_buffer_page_len(void *page)
5806 {
5807 - return local_read(&((struct buffer_data_page *)page)->commit)
5808 + struct buffer_data_page *bpage = page;
5809 +
5810 + return (local_read(&bpage->commit) & ~RB_MISSED_FLAGS)
5811 + BUF_PAGE_HDR_SIZE;
5812 }
5813
5814 @@ -4439,8 +4443,13 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data)
5815 {
5816 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
5817 struct buffer_data_page *bpage = data;
5818 + struct page *page = virt_to_page(bpage);
5819 unsigned long flags;
5820
5821 + /* If the page is still in use someplace else, we can't reuse it */
5822 + if (page_ref_count(page) > 1)
5823 + goto out;
5824 +
5825 local_irq_save(flags);
5826 arch_spin_lock(&cpu_buffer->lock);
5827
5828 @@ -4452,6 +4461,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data)
5829 arch_spin_unlock(&cpu_buffer->lock);
5830 local_irq_restore(flags);
5831
5832 + out:
5833 free_page((unsigned long)bpage);
5834 }
5835 EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
5836 diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
5837 index 80de14973b42..76bcc80b893e 100644
5838 --- a/kernel/trace/trace.c
5839 +++ b/kernel/trace/trace.c
5840 @@ -6769,7 +6769,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
5841 .spd_release = buffer_spd_release,
5842 };
5843 struct buffer_ref *ref;
5844 - int entries, size, i;
5845 + int entries, i;
5846 ssize_t ret = 0;
5847
5848 #ifdef CONFIG_TRACER_MAX_TRACE
5849 @@ -6823,14 +6823,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
5850 break;
5851 }
5852
5853 - /*
5854 - * zero out any left over data, this is going to
5855 - * user land.
5856 - */
5857 - size = ring_buffer_page_len(ref->page);
5858 - if (size < PAGE_SIZE)
5859 - memset(ref->page + size, 0, PAGE_SIZE - size);
5860 -
5861 page = virt_to_page(ref->page);
5862
5863 spd.pages[i] = page;
5864 @@ -7588,6 +7580,7 @@ allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size
5865 buf->data = alloc_percpu(struct trace_array_cpu);
5866 if (!buf->data) {
5867 ring_buffer_free(buf->buffer);
5868 + buf->buffer = NULL;
5869 return -ENOMEM;
5870 }
5871
5872 @@ -7611,7 +7604,9 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
5873 allocate_snapshot ? size : 1);
5874 if (WARN_ON(ret)) {
5875 ring_buffer_free(tr->trace_buffer.buffer);
5876 + tr->trace_buffer.buffer = NULL;
5877 free_percpu(tr->trace_buffer.data);
5878 + tr->trace_buffer.data = NULL;
5879 return -ENOMEM;
5880 }
5881 tr->allocated_snapshot = allocate_snapshot;
5882 diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
5883 index de2152730809..08190db0a2dc 100644
5884 --- a/net/bridge/br_netlink.c
5885 +++ b/net/bridge/br_netlink.c
5886 @@ -1223,19 +1223,20 @@ static int br_dev_newlink(struct net *src_net, struct net_device *dev,
5887 struct net_bridge *br = netdev_priv(dev);
5888 int err;
5889
5890 + err = register_netdevice(dev);
5891 + if (err)
5892 + return err;
5893 +
5894 if (tb[IFLA_ADDRESS]) {
5895 spin_lock_bh(&br->lock);
5896 br_stp_change_bridge_id(br, nla_data(tb[IFLA_ADDRESS]));
5897 spin_unlock_bh(&br->lock);
5898 }
5899
5900 - err = register_netdevice(dev);
5901 - if (err)
5902 - return err;
5903 -
5904 err = br_changelink(dev, tb, data, extack);
5905 if (err)
5906 - unregister_netdevice(dev);
5907 + br_dev_delete(dev, NULL);
5908 +
5909 return err;
5910 }
5911
5912 diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
5913 index 6cfdc7c84c48..0dd6359e5924 100644
5914 --- a/net/core/net_namespace.c
5915 +++ b/net/core/net_namespace.c
5916 @@ -266,7 +266,7 @@ struct net *get_net_ns_by_id(struct net *net, int id)
5917 spin_lock_bh(&net->nsid_lock);
5918 peer = idr_find(&net->netns_ids, id);
5919 if (peer)
5920 - get_net(peer);
5921 + peer = maybe_get_net(peer);
5922 spin_unlock_bh(&net->nsid_lock);
5923 rcu_read_unlock();
5924
5925 diff --git a/net/core/skbuff.c b/net/core/skbuff.c
5926 index e140ba49b30a..15fa5baa8fae 100644
5927 --- a/net/core/skbuff.c
5928 +++ b/net/core/skbuff.c
5929 @@ -1181,12 +1181,12 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
5930 int i, new_frags;
5931 u32 d_off;
5932
5933 - if (!num_frags)
5934 - return 0;
5935 -
5936 if (skb_shared(skb) || skb_unclone(skb, gfp_mask))
5937 return -EINVAL;
5938
5939 + if (!num_frags)
5940 + goto release;
5941 +
5942 new_frags = (__skb_pagelen(skb) + PAGE_SIZE - 1) >> PAGE_SHIFT;
5943 for (i = 0; i < new_frags; i++) {
5944 page = alloc_page(gfp_mask);
5945 @@ -1242,6 +1242,7 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
5946 __skb_fill_page_desc(skb, new_frags - 1, head, 0, d_off);
5947 skb_shinfo(skb)->nr_frags = new_frags;
5948
5949 +release:
5950 skb_zcopy_clear(skb, false);
5951 return 0;
5952 }
5953 @@ -3657,8 +3658,6 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
5954
5955 skb_shinfo(nskb)->tx_flags |= skb_shinfo(head_skb)->tx_flags &
5956 SKBTX_SHARED_FRAG;
5957 - if (skb_zerocopy_clone(nskb, head_skb, GFP_ATOMIC))
5958 - goto err;
5959
5960 while (pos < offset + len) {
5961 if (i >= nfrags) {
5962 @@ -3684,6 +3683,8 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
5963
5964 if (unlikely(skb_orphan_frags(frag_skb, GFP_ATOMIC)))
5965 goto err;
5966 + if (skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC))
5967 + goto err;
5968
5969 *nskb_frag = *frag;
5970 __skb_frag_ref(nskb_frag);
5971 @@ -4296,7 +4297,7 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
5972 struct sock *sk = skb->sk;
5973
5974 if (!skb_may_tx_timestamp(sk, false))
5975 - return;
5976 + goto err;
5977
5978 /* Take a reference to prevent skb_orphan() from freeing the socket,
5979 * but only if the socket refcount is not zero.
5980 @@ -4305,7 +4306,11 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
5981 *skb_hwtstamps(skb) = *hwtstamps;
5982 __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false);
5983 sock_put(sk);
5984 + return;
5985 }
5986 +
5987 +err:
5988 + kfree_skb(skb);
5989 }
5990 EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
5991
5992 diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
5993 index d7adc0616599..bffa88ecc534 100644
5994 --- a/net/ipv4/devinet.c
5995 +++ b/net/ipv4/devinet.c
5996 @@ -1420,7 +1420,7 @@ static void inetdev_changename(struct net_device *dev, struct in_device *in_dev)
5997
5998 static bool inetdev_valid_mtu(unsigned int mtu)
5999 {
6000 - return mtu >= 68;
6001 + return mtu >= IPV4_MIN_MTU;
6002 }
6003
6004 static void inetdev_send_gratuitous_arp(struct net_device *dev,
6005 diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
6006 index 37819ab4cc74..d72874150905 100644
6007 --- a/net/ipv4/fib_frontend.c
6008 +++ b/net/ipv4/fib_frontend.c
6009 @@ -1274,14 +1274,19 @@ static int __net_init ip_fib_net_init(struct net *net)
6010
6011 static void ip_fib_net_exit(struct net *net)
6012 {
6013 - unsigned int i;
6014 + int i;
6015
6016 rtnl_lock();
6017 #ifdef CONFIG_IP_MULTIPLE_TABLES
6018 RCU_INIT_POINTER(net->ipv4.fib_main, NULL);
6019 RCU_INIT_POINTER(net->ipv4.fib_default, NULL);
6020 #endif
6021 - for (i = 0; i < FIB_TABLE_HASHSZ; i++) {
6022 + /* Destroy the tables in reverse order to guarantee that the
6023 + * local table, ID 255, is destroyed before the main table, ID
6024 + * 254. This is necessary as the local table may contain
6025 + * references to data contained in the main table.
6026 + */
6027 + for (i = FIB_TABLE_HASHSZ - 1; i >= 0; i--) {
6028 struct hlist_head *head = &net->ipv4.fib_table_hash[i];
6029 struct hlist_node *tmp;
6030 struct fib_table *tb;
6031 diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
6032 index 01ed22139ac2..aff3751df950 100644
6033 --- a/net/ipv4/fib_semantics.c
6034 +++ b/net/ipv4/fib_semantics.c
6035 @@ -706,7 +706,7 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi)
6036
6037 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
6038 int type = nla_type(nla);
6039 - u32 val;
6040 + u32 fi_val, val;
6041
6042 if (!type)
6043 continue;
6044 @@ -723,7 +723,11 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi)
6045 val = nla_get_u32(nla);
6046 }
6047
6048 - if (fi->fib_metrics->metrics[type - 1] != val)
6049 + fi_val = fi->fib_metrics->metrics[type - 1];
6050 + if (type == RTAX_FEATURES)
6051 + fi_val &= ~DST_FEATURE_ECN_CA;
6052 +
6053 + if (fi_val != val)
6054 return false;
6055 }
6056
6057 diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
6058 index ab183af0b5b6..c621266e0306 100644
6059 --- a/net/ipv4/igmp.c
6060 +++ b/net/ipv4/igmp.c
6061 @@ -89,6 +89,7 @@
6062 #include <linux/rtnetlink.h>
6063 #include <linux/times.h>
6064 #include <linux/pkt_sched.h>
6065 +#include <linux/byteorder/generic.h>
6066
6067 #include <net/net_namespace.h>
6068 #include <net/arp.h>
6069 @@ -321,6 +322,23 @@ igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted)
6070 return scount;
6071 }
6072
6073 +/* source address selection per RFC 3376 section 4.2.13 */
6074 +static __be32 igmpv3_get_srcaddr(struct net_device *dev,
6075 + const struct flowi4 *fl4)
6076 +{
6077 + struct in_device *in_dev = __in_dev_get_rcu(dev);
6078 +
6079 + if (!in_dev)
6080 + return htonl(INADDR_ANY);
6081 +
6082 + for_ifa(in_dev) {
6083 + if (inet_ifa_match(fl4->saddr, ifa))
6084 + return fl4->saddr;
6085 + } endfor_ifa(in_dev);
6086 +
6087 + return htonl(INADDR_ANY);
6088 +}
6089 +
6090 static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu)
6091 {
6092 struct sk_buff *skb;
6093 @@ -368,7 +386,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu)
6094 pip->frag_off = htons(IP_DF);
6095 pip->ttl = 1;
6096 pip->daddr = fl4.daddr;
6097 - pip->saddr = fl4.saddr;
6098 + pip->saddr = igmpv3_get_srcaddr(dev, &fl4);
6099 pip->protocol = IPPROTO_IGMP;
6100 pip->tot_len = 0; /* filled in later */
6101 ip_select_ident(net, skb, NULL);
6102 @@ -404,16 +422,17 @@ static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel)
6103 }
6104
6105 static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc,
6106 - int type, struct igmpv3_grec **ppgr)
6107 + int type, struct igmpv3_grec **ppgr, unsigned int mtu)
6108 {
6109 struct net_device *dev = pmc->interface->dev;
6110 struct igmpv3_report *pih;
6111 struct igmpv3_grec *pgr;
6112
6113 - if (!skb)
6114 - skb = igmpv3_newpack(dev, dev->mtu);
6115 - if (!skb)
6116 - return NULL;
6117 + if (!skb) {
6118 + skb = igmpv3_newpack(dev, mtu);
6119 + if (!skb)
6120 + return NULL;
6121 + }
6122 pgr = skb_put(skb, sizeof(struct igmpv3_grec));
6123 pgr->grec_type = type;
6124 pgr->grec_auxwords = 0;
6125 @@ -436,12 +455,17 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
6126 struct igmpv3_grec *pgr = NULL;
6127 struct ip_sf_list *psf, *psf_next, *psf_prev, **psf_list;
6128 int scount, stotal, first, isquery, truncate;
6129 + unsigned int mtu;
6130
6131 if (pmc->multiaddr == IGMP_ALL_HOSTS)
6132 return skb;
6133 if (ipv4_is_local_multicast(pmc->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports)
6134 return skb;
6135
6136 + mtu = READ_ONCE(dev->mtu);
6137 + if (mtu < IPV4_MIN_MTU)
6138 + return skb;
6139 +
6140 isquery = type == IGMPV3_MODE_IS_INCLUDE ||
6141 type == IGMPV3_MODE_IS_EXCLUDE;
6142 truncate = type == IGMPV3_MODE_IS_EXCLUDE ||
6143 @@ -462,7 +486,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
6144 AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) {
6145 if (skb)
6146 igmpv3_sendpack(skb);
6147 - skb = igmpv3_newpack(dev, dev->mtu);
6148 + skb = igmpv3_newpack(dev, mtu);
6149 }
6150 }
6151 first = 1;
6152 @@ -498,12 +522,12 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
6153 pgr->grec_nsrcs = htons(scount);
6154 if (skb)
6155 igmpv3_sendpack(skb);
6156 - skb = igmpv3_newpack(dev, dev->mtu);
6157 + skb = igmpv3_newpack(dev, mtu);
6158 first = 1;
6159 scount = 0;
6160 }
6161 if (first) {
6162 - skb = add_grhead(skb, pmc, type, &pgr);
6163 + skb = add_grhead(skb, pmc, type, &pgr, mtu);
6164 first = 0;
6165 }
6166 if (!skb)
6167 @@ -538,7 +562,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
6168 igmpv3_sendpack(skb);
6169 skb = NULL; /* add_grhead will get a new one */
6170 }
6171 - skb = add_grhead(skb, pmc, type, &pgr);
6172 + skb = add_grhead(skb, pmc, type, &pgr, mtu);
6173 }
6174 }
6175 if (pgr)
6176 diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
6177 index e9805ad664ac..4e90082b23a6 100644
6178 --- a/net/ipv4/ip_tunnel.c
6179 +++ b/net/ipv4/ip_tunnel.c
6180 @@ -349,8 +349,8 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
6181 dev->needed_headroom = t_hlen + hlen;
6182 mtu -= (dev->hard_header_len + t_hlen);
6183
6184 - if (mtu < 68)
6185 - mtu = 68;
6186 + if (mtu < IPV4_MIN_MTU)
6187 + mtu = IPV4_MIN_MTU;
6188
6189 return mtu;
6190 }
6191 diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
6192 index 33b70bfd1122..125c1eab3eaa 100644
6193 --- a/net/ipv4/raw.c
6194 +++ b/net/ipv4/raw.c
6195 @@ -513,11 +513,16 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
6196 int err;
6197 struct ip_options_data opt_copy;
6198 struct raw_frag_vec rfv;
6199 + int hdrincl;
6200
6201 err = -EMSGSIZE;
6202 if (len > 0xFFFF)
6203 goto out;
6204
6205 + /* hdrincl should be READ_ONCE(inet->hdrincl)
6206 + * but READ_ONCE() doesn't work with bit fields
6207 + */
6208 + hdrincl = inet->hdrincl;
6209 /*
6210 * Check the flags.
6211 */
6212 @@ -593,7 +598,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
6213 /* Linux does not mangle headers on raw sockets,
6214 * so that IP options + IP_HDRINCL is non-sense.
6215 */
6216 - if (inet->hdrincl)
6217 + if (hdrincl)
6218 goto done;
6219 if (ipc.opt->opt.srr) {
6220 if (!daddr)
6221 @@ -615,12 +620,12 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
6222
6223 flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
6224 RT_SCOPE_UNIVERSE,
6225 - inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
6226 + hdrincl ? IPPROTO_RAW : sk->sk_protocol,
6227 inet_sk_flowi_flags(sk) |
6228 - (inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
6229 + (hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
6230 daddr, saddr, 0, 0, sk->sk_uid);
6231
6232 - if (!inet->hdrincl) {
6233 + if (!hdrincl) {
6234 rfv.msg = msg;
6235 rfv.hlen = 0;
6236
6237 @@ -645,7 +650,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
6238 goto do_confirm;
6239 back_from_confirm:
6240
6241 - if (inet->hdrincl)
6242 + if (hdrincl)
6243 err = raw_send_hdrinc(sk, &fl4, msg, len,
6244 &rt, msg->msg_flags, &ipc.sockc);
6245
6246 diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
6247 index 5091402720ab..a0c72b09cefc 100644
6248 --- a/net/ipv4/tcp.c
6249 +++ b/net/ipv4/tcp.c
6250 @@ -2356,6 +2356,7 @@ int tcp_disconnect(struct sock *sk, int flags)
6251 tp->snd_cwnd_cnt = 0;
6252 tp->window_clamp = 0;
6253 tcp_set_ca_state(sk, TCP_CA_Open);
6254 + tp->is_sack_reneg = 0;
6255 tcp_clear_retrans(tp);
6256 inet_csk_delack_init(sk);
6257 /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0
6258 diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
6259 index 69ee877574d0..8322f26e770e 100644
6260 --- a/net/ipv4/tcp_bbr.c
6261 +++ b/net/ipv4/tcp_bbr.c
6262 @@ -110,7 +110,8 @@ struct bbr {
6263 u32 lt_last_lost; /* LT intvl start: tp->lost */
6264 u32 pacing_gain:10, /* current gain for setting pacing rate */
6265 cwnd_gain:10, /* current gain for setting cwnd */
6266 - full_bw_cnt:3, /* number of rounds without large bw gains */
6267 + full_bw_reached:1, /* reached full bw in Startup? */
6268 + full_bw_cnt:2, /* number of rounds without large bw gains */
6269 cycle_idx:3, /* current index in pacing_gain cycle array */
6270 has_seen_rtt:1, /* have we seen an RTT sample yet? */
6271 unused_b:5;
6272 @@ -180,7 +181,7 @@ static bool bbr_full_bw_reached(const struct sock *sk)
6273 {
6274 const struct bbr *bbr = inet_csk_ca(sk);
6275
6276 - return bbr->full_bw_cnt >= bbr_full_bw_cnt;
6277 + return bbr->full_bw_reached;
6278 }
6279
6280 /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */
6281 @@ -717,6 +718,7 @@ static void bbr_check_full_bw_reached(struct sock *sk,
6282 return;
6283 }
6284 ++bbr->full_bw_cnt;
6285 + bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt;
6286 }
6287
6288 /* If pipe is probably full, drain the queue and then enter steady-state. */
6289 @@ -850,6 +852,7 @@ static void bbr_init(struct sock *sk)
6290 bbr->restore_cwnd = 0;
6291 bbr->round_start = 0;
6292 bbr->idle_restart = 0;
6293 + bbr->full_bw_reached = 0;
6294 bbr->full_bw = 0;
6295 bbr->full_bw_cnt = 0;
6296 bbr->cycle_mstamp = 0;
6297 @@ -871,6 +874,11 @@ static u32 bbr_sndbuf_expand(struct sock *sk)
6298 */
6299 static u32 bbr_undo_cwnd(struct sock *sk)
6300 {
6301 + struct bbr *bbr = inet_csk_ca(sk);
6302 +
6303 + bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */
6304 + bbr->full_bw_cnt = 0;
6305 + bbr_reset_lt_bw_sampling(sk);
6306 return tcp_sk(sk)->snd_cwnd;
6307 }
6308
6309 diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
6310 index c5447b9f8517..ff48ac654e5a 100644
6311 --- a/net/ipv4/tcp_input.c
6312 +++ b/net/ipv4/tcp_input.c
6313 @@ -521,9 +521,6 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
6314 u32 new_sample = tp->rcv_rtt_est.rtt_us;
6315 long m = sample;
6316
6317 - if (m == 0)
6318 - m = 1;
6319 -
6320 if (new_sample != 0) {
6321 /* If we sample in larger samples in the non-timestamp
6322 * case, we could grossly overestimate the RTT especially
6323 @@ -560,6 +557,8 @@ static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
6324 if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
6325 return;
6326 delta_us = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcv_rtt_est.time);
6327 + if (!delta_us)
6328 + delta_us = 1;
6329 tcp_rcv_rtt_update(tp, delta_us, 1);
6330
6331 new_measure:
6332 @@ -576,8 +575,11 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
6333 (TCP_SKB_CB(skb)->end_seq -
6334 TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss)) {
6335 u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
6336 - u32 delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
6337 + u32 delta_us;
6338
6339 + if (!delta)
6340 + delta = 1;
6341 + delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
6342 tcp_rcv_rtt_update(tp, delta_us, 0);
6343 }
6344 }
6345 @@ -1975,6 +1977,8 @@ void tcp_enter_loss(struct sock *sk)
6346 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
6347 tp->sacked_out = 0;
6348 tp->fackets_out = 0;
6349 + /* Mark SACK reneging until we recover from this loss event. */
6350 + tp->is_sack_reneg = 1;
6351 }
6352 tcp_clear_all_retrans_hints(tp);
6353
6354 @@ -2428,6 +2432,7 @@ static bool tcp_try_undo_recovery(struct sock *sk)
6355 return true;
6356 }
6357 tcp_set_ca_state(sk, TCP_CA_Open);
6358 + tp->is_sack_reneg = 0;
6359 return false;
6360 }
6361
6362 @@ -2459,8 +2464,10 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
6363 NET_INC_STATS(sock_net(sk),
6364 LINUX_MIB_TCPSPURIOUSRTOS);
6365 inet_csk(sk)->icsk_retransmits = 0;
6366 - if (frto_undo || tcp_is_sack(tp))
6367 + if (frto_undo || tcp_is_sack(tp)) {
6368 tcp_set_ca_state(sk, TCP_CA_Open);
6369 + tp->is_sack_reneg = 0;
6370 + }
6371 return true;
6372 }
6373 return false;
6374 @@ -3551,6 +3558,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
6375 struct tcp_sacktag_state sack_state;
6376 struct rate_sample rs = { .prior_delivered = 0 };
6377 u32 prior_snd_una = tp->snd_una;
6378 + bool is_sack_reneg = tp->is_sack_reneg;
6379 u32 ack_seq = TCP_SKB_CB(skb)->seq;
6380 u32 ack = TCP_SKB_CB(skb)->ack_seq;
6381 bool is_dupack = false;
6382 @@ -3666,7 +3674,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
6383
6384 delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */
6385 lost = tp->lost - lost; /* freshly marked lost */
6386 - tcp_rate_gen(sk, delivered, lost, sack_state.rate);
6387 + tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate);
6388 tcp_cong_control(sk, ack, delivered, flag, sack_state.rate);
6389 tcp_xmit_recovery(sk, rexmit);
6390 return 1;
6391 diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
6392 index 5a5ed4f14678..cab4b935e474 100644
6393 --- a/net/ipv4/tcp_ipv4.c
6394 +++ b/net/ipv4/tcp_ipv4.c
6395 @@ -844,7 +844,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
6396 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
6397 req->ts_recent,
6398 0,
6399 - tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
6400 + tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
6401 AF_INET),
6402 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
6403 ip_hdr(skb)->tos);
6404 diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c
6405 index 3330a370d306..c61240e43923 100644
6406 --- a/net/ipv4/tcp_rate.c
6407 +++ b/net/ipv4/tcp_rate.c
6408 @@ -106,7 +106,7 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
6409
6410 /* Update the connection delivery information and generate a rate sample. */
6411 void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
6412 - struct rate_sample *rs)
6413 + bool is_sack_reneg, struct rate_sample *rs)
6414 {
6415 struct tcp_sock *tp = tcp_sk(sk);
6416 u32 snd_us, ack_us;
6417 @@ -124,8 +124,12 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
6418
6419 rs->acked_sacked = delivered; /* freshly ACKed or SACKed */
6420 rs->losses = lost; /* freshly marked lost */
6421 - /* Return an invalid sample if no timing information is available. */
6422 - if (!rs->prior_mstamp) {
6423 + /* Return an invalid sample if no timing information is available or
6424 + * in recovery from loss with SACK reneging. Rate samples taken during
6425 + * a SACK reneging event may overestimate bw by including packets that
6426 + * were SACKed before the reneg.
6427 + */
6428 + if (!rs->prior_mstamp || is_sack_reneg) {
6429 rs->delivered = -1;
6430 rs->interval_us = -1;
6431 return;
6432 diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
6433 index 655dd8d7f064..e9af1879cd53 100644
6434 --- a/net/ipv4/tcp_timer.c
6435 +++ b/net/ipv4/tcp_timer.c
6436 @@ -264,6 +264,7 @@ void tcp_delack_timer_handler(struct sock *sk)
6437 icsk->icsk_ack.pingpong = 0;
6438 icsk->icsk_ack.ato = TCP_ATO_MIN;
6439 }
6440 + tcp_mstamp_refresh(tcp_sk(sk));
6441 tcp_send_ack(sk);
6442 __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS);
6443 }
6444 @@ -627,6 +628,7 @@ static void tcp_keepalive_timer (unsigned long data)
6445 goto out;
6446 }
6447
6448 + tcp_mstamp_refresh(tp);
6449 if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
6450 if (tp->linger2 >= 0) {
6451 const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;
6452 diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
6453 index 2ec39404c449..c5318f5f6a14 100644
6454 --- a/net/ipv6/addrconf.c
6455 +++ b/net/ipv6/addrconf.c
6456 @@ -231,7 +231,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
6457 .proxy_ndp = 0,
6458 .accept_source_route = 0, /* we do not accept RH0 by default. */
6459 .disable_ipv6 = 0,
6460 - .accept_dad = 1,
6461 + .accept_dad = 0,
6462 .suppress_frag_ndisc = 1,
6463 .accept_ra_mtu = 1,
6464 .stable_secret = {
6465 diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
6466 index fe5262fd6aa5..bcbd5f3bf8bd 100644
6467 --- a/net/ipv6/af_inet6.c
6468 +++ b/net/ipv6/af_inet6.c
6469 @@ -210,7 +210,6 @@ static int inet6_create(struct net *net, struct socket *sock, int protocol,
6470 np->mcast_hops = IPV6_DEFAULT_MCASTHOPS;
6471 np->mc_loop = 1;
6472 np->pmtudisc = IPV6_PMTUDISC_WANT;
6473 - np->autoflowlabel = ip6_default_np_autolabel(net);
6474 np->repflow = net->ipv6.sysctl.flowlabel_reflect;
6475 sk->sk_ipv6only = net->ipv6.sysctl.bindv6only;
6476
6477 diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
6478 index 5d6bee070871..7a2df6646486 100644
6479 --- a/net/ipv6/ip6_gre.c
6480 +++ b/net/ipv6/ip6_gre.c
6481 @@ -1020,6 +1020,36 @@ static void ip6gre_tunnel_setup(struct net_device *dev)
6482 eth_random_addr(dev->perm_addr);
6483 }
6484
6485 +#define GRE6_FEATURES (NETIF_F_SG | \
6486 + NETIF_F_FRAGLIST | \
6487 + NETIF_F_HIGHDMA | \
6488 + NETIF_F_HW_CSUM)
6489 +
6490 +static void ip6gre_tnl_init_features(struct net_device *dev)
6491 +{
6492 + struct ip6_tnl *nt = netdev_priv(dev);
6493 +
6494 + dev->features |= GRE6_FEATURES;
6495 + dev->hw_features |= GRE6_FEATURES;
6496 +
6497 + if (!(nt->parms.o_flags & TUNNEL_SEQ)) {
6498 + /* TCP offload with GRE SEQ is not supported, nor
6499 + * can we support 2 levels of outer headers requiring
6500 + * an update.
6501 + */
6502 + if (!(nt->parms.o_flags & TUNNEL_CSUM) ||
6503 + nt->encap.type == TUNNEL_ENCAP_NONE) {
6504 + dev->features |= NETIF_F_GSO_SOFTWARE;
6505 + dev->hw_features |= NETIF_F_GSO_SOFTWARE;
6506 + }
6507 +
6508 + /* Can use a lockless transmit, unless we generate
6509 + * output sequences
6510 + */
6511 + dev->features |= NETIF_F_LLTX;
6512 + }
6513 +}
6514 +
6515 static int ip6gre_tunnel_init_common(struct net_device *dev)
6516 {
6517 struct ip6_tnl *tunnel;
6518 @@ -1054,6 +1084,8 @@ static int ip6gre_tunnel_init_common(struct net_device *dev)
6519 if (!(tunnel->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
6520 dev->mtu -= 8;
6521
6522 + ip6gre_tnl_init_features(dev);
6523 +
6524 return 0;
6525 }
6526
6527 @@ -1302,11 +1334,6 @@ static const struct net_device_ops ip6gre_tap_netdev_ops = {
6528 .ndo_get_iflink = ip6_tnl_get_iflink,
6529 };
6530
6531 -#define GRE6_FEATURES (NETIF_F_SG | \
6532 - NETIF_F_FRAGLIST | \
6533 - NETIF_F_HIGHDMA | \
6534 - NETIF_F_HW_CSUM)
6535 -
6536 static void ip6gre_tap_setup(struct net_device *dev)
6537 {
6538
6539 @@ -1386,26 +1413,6 @@ static int ip6gre_newlink(struct net *src_net, struct net_device *dev,
6540 nt->net = dev_net(dev);
6541 ip6gre_tnl_link_config(nt, !tb[IFLA_MTU]);
6542
6543 - dev->features |= GRE6_FEATURES;
6544 - dev->hw_features |= GRE6_FEATURES;
6545 -
6546 - if (!(nt->parms.o_flags & TUNNEL_SEQ)) {
6547 - /* TCP offload with GRE SEQ is not supported, nor
6548 - * can we support 2 levels of outer headers requiring
6549 - * an update.
6550 - */
6551 - if (!(nt->parms.o_flags & TUNNEL_CSUM) ||
6552 - (nt->encap.type == TUNNEL_ENCAP_NONE)) {
6553 - dev->features |= NETIF_F_GSO_SOFTWARE;
6554 - dev->hw_features |= NETIF_F_GSO_SOFTWARE;
6555 - }
6556 -
6557 - /* Can use a lockless transmit, unless we generate
6558 - * output sequences
6559 - */
6560 - dev->features |= NETIF_F_LLTX;
6561 - }
6562 -
6563 err = register_netdevice(dev);
6564 if (err)
6565 goto out;
6566 diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
6567 index 5110a418cc4d..f7dd51c42314 100644
6568 --- a/net/ipv6/ip6_output.c
6569 +++ b/net/ipv6/ip6_output.c
6570 @@ -166,6 +166,14 @@ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
6571 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
6572 }
6573
6574 +static bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
6575 +{
6576 + if (!np->autoflowlabel_set)
6577 + return ip6_default_np_autolabel(net);
6578 + else
6579 + return np->autoflowlabel;
6580 +}
6581 +
6582 /*
6583 * xmit an sk_buff (used by TCP, SCTP and DCCP)
6584 * Note : socket lock is not held for SYNACK packets, but might be modified
6585 @@ -230,7 +238,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
6586 hlimit = ip6_dst_hoplimit(dst);
6587
6588 ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
6589 - np->autoflowlabel, fl6));
6590 + ip6_autoflowlabel(net, np), fl6));
6591
6592 hdr->payload_len = htons(seg_len);
6593 hdr->nexthdr = proto;
6594 @@ -1626,7 +1634,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk,
6595
6596 ip6_flow_hdr(hdr, v6_cork->tclass,
6597 ip6_make_flowlabel(net, skb, fl6->flowlabel,
6598 - np->autoflowlabel, fl6));
6599 + ip6_autoflowlabel(net, np), fl6));
6600 hdr->hop_limit = v6_cork->hop_limit;
6601 hdr->nexthdr = proto;
6602 hdr->saddr = fl6->saddr;
6603 diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
6604 index a1c24443cd9e..ef958d50746b 100644
6605 --- a/net/ipv6/ip6_tunnel.c
6606 +++ b/net/ipv6/ip6_tunnel.c
6607 @@ -912,7 +912,7 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto,
6608 if (t->parms.collect_md) {
6609 tun_dst = ipv6_tun_rx_dst(skb, 0, 0, 0);
6610 if (!tun_dst)
6611 - return 0;
6612 + goto drop;
6613 }
6614 ret = __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate,
6615 log_ecn_error);
6616 diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
6617 index a5e466d4e093..90dbfa78a390 100644
6618 --- a/net/ipv6/ipv6_sockglue.c
6619 +++ b/net/ipv6/ipv6_sockglue.c
6620 @@ -878,6 +878,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
6621 break;
6622 case IPV6_AUTOFLOWLABEL:
6623 np->autoflowlabel = valbool;
6624 + np->autoflowlabel_set = 1;
6625 retv = 0;
6626 break;
6627 case IPV6_RECVFRAGSIZE:
6628 diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
6629 index 12b7c27ce5ce..9a38a2c641fa 100644
6630 --- a/net/ipv6/mcast.c
6631 +++ b/net/ipv6/mcast.c
6632 @@ -1682,16 +1682,16 @@ static int grec_size(struct ifmcaddr6 *pmc, int type, int gdel, int sdel)
6633 }
6634
6635 static struct sk_buff *add_grhead(struct sk_buff *skb, struct ifmcaddr6 *pmc,
6636 - int type, struct mld2_grec **ppgr)
6637 + int type, struct mld2_grec **ppgr, unsigned int mtu)
6638 {
6639 - struct net_device *dev = pmc->idev->dev;
6640 struct mld2_report *pmr;
6641 struct mld2_grec *pgr;
6642
6643 - if (!skb)
6644 - skb = mld_newpack(pmc->idev, dev->mtu);
6645 - if (!skb)
6646 - return NULL;
6647 + if (!skb) {
6648 + skb = mld_newpack(pmc->idev, mtu);
6649 + if (!skb)
6650 + return NULL;
6651 + }
6652 pgr = skb_put(skb, sizeof(struct mld2_grec));
6653 pgr->grec_type = type;
6654 pgr->grec_auxwords = 0;
6655 @@ -1714,10 +1714,15 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc,
6656 struct mld2_grec *pgr = NULL;
6657 struct ip6_sf_list *psf, *psf_next, *psf_prev, **psf_list;
6658 int scount, stotal, first, isquery, truncate;
6659 + unsigned int mtu;
6660
6661 if (pmc->mca_flags & MAF_NOREPORT)
6662 return skb;
6663
6664 + mtu = READ_ONCE(dev->mtu);
6665 + if (mtu < IPV6_MIN_MTU)
6666 + return skb;
6667 +
6668 isquery = type == MLD2_MODE_IS_INCLUDE ||
6669 type == MLD2_MODE_IS_EXCLUDE;
6670 truncate = type == MLD2_MODE_IS_EXCLUDE ||
6671 @@ -1738,7 +1743,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc,
6672 AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) {
6673 if (skb)
6674 mld_sendpack(skb);
6675 - skb = mld_newpack(idev, dev->mtu);
6676 + skb = mld_newpack(idev, mtu);
6677 }
6678 }
6679 first = 1;
6680 @@ -1774,12 +1779,12 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc,
6681 pgr->grec_nsrcs = htons(scount);
6682 if (skb)
6683 mld_sendpack(skb);
6684 - skb = mld_newpack(idev, dev->mtu);
6685 + skb = mld_newpack(idev, mtu);
6686 first = 1;
6687 scount = 0;
6688 }
6689 if (first) {
6690 - skb = add_grhead(skb, pmc, type, &pgr);
6691 + skb = add_grhead(skb, pmc, type, &pgr, mtu);
6692 first = 0;
6693 }
6694 if (!skb)
6695 @@ -1814,7 +1819,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc,
6696 mld_sendpack(skb);
6697 skb = NULL; /* add_grhead will get a new one */
6698 }
6699 - skb = add_grhead(skb, pmc, type, &pgr);
6700 + skb = add_grhead(skb, pmc, type, &pgr, mtu);
6701 }
6702 }
6703 if (pgr)
6704 diff --git a/net/ipv6/route.c b/net/ipv6/route.c
6705 index 598efa8cfe25..ca8d3266e92e 100644
6706 --- a/net/ipv6/route.c
6707 +++ b/net/ipv6/route.c
6708 @@ -3700,19 +3700,13 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
6709 if (!ipv6_addr_any(&fl6.saddr))
6710 flags |= RT6_LOOKUP_F_HAS_SADDR;
6711
6712 - if (!fibmatch)
6713 - dst = ip6_route_input_lookup(net, dev, &fl6, flags);
6714 - else
6715 - dst = ip6_route_lookup(net, &fl6, 0);
6716 + dst = ip6_route_input_lookup(net, dev, &fl6, flags);
6717
6718 rcu_read_unlock();
6719 } else {
6720 fl6.flowi6_oif = oif;
6721
6722 - if (!fibmatch)
6723 - dst = ip6_route_output(net, NULL, &fl6);
6724 - else
6725 - dst = ip6_route_lookup(net, &fl6, 0);
6726 + dst = ip6_route_output(net, NULL, &fl6);
6727 }
6728
6729
6730 @@ -3729,6 +3723,15 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
6731 goto errout;
6732 }
6733
6734 + if (fibmatch && rt->dst.from) {
6735 + struct rt6_info *ort = container_of(rt->dst.from,
6736 + struct rt6_info, dst);
6737 +
6738 + dst_hold(&ort->dst);
6739 + ip6_rt_put(rt);
6740 + rt = ort;
6741 + }
6742 +
6743 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
6744 if (!skb) {
6745 ip6_rt_put(rt);
6746 diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
6747 index 32ded300633d..237cc6187c5a 100644
6748 --- a/net/ipv6/tcp_ipv6.c
6749 +++ b/net/ipv6/tcp_ipv6.c
6750 @@ -988,7 +988,7 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
6751 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
6752 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
6753 req->ts_recent, sk->sk_bound_dev_if,
6754 - tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr),
6755 + tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr),
6756 0, 0);
6757 }
6758
6759 diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
6760 index 15c99dfa3d72..aac9d68b4636 100644
6761 --- a/net/netlink/af_netlink.c
6762 +++ b/net/netlink/af_netlink.c
6763 @@ -254,6 +254,9 @@ static int __netlink_deliver_tap_skb(struct sk_buff *skb,
6764 struct sock *sk = skb->sk;
6765 int ret = -ENOMEM;
6766
6767 + if (!net_eq(dev_net(dev), sock_net(sk)))
6768 + return 0;
6769 +
6770 dev_hold(dev);
6771
6772 if (is_vmalloc_addr(skb->head))
6773 diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
6774 index cfb652a4e007..dbe1079a1651 100644
6775 --- a/net/openvswitch/flow.c
6776 +++ b/net/openvswitch/flow.c
6777 @@ -532,6 +532,7 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
6778 return -EINVAL;
6779
6780 skb_reset_network_header(skb);
6781 + key->eth.type = skb->protocol;
6782 } else {
6783 eth = eth_hdr(skb);
6784 ether_addr_copy(key->eth.src, eth->h_source);
6785 @@ -545,15 +546,23 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
6786 if (unlikely(parse_vlan(skb, key)))
6787 return -ENOMEM;
6788
6789 - skb->protocol = parse_ethertype(skb);
6790 - if (unlikely(skb->protocol == htons(0)))
6791 + key->eth.type = parse_ethertype(skb);
6792 + if (unlikely(key->eth.type == htons(0)))
6793 return -ENOMEM;
6794
6795 + /* Multiple tagged packets need to retain TPID to satisfy
6796 + * skb_vlan_pop(), which will later shift the ethertype into
6797 + * skb->protocol.
6798 + */
6799 + if (key->eth.cvlan.tci & htons(VLAN_TAG_PRESENT))
6800 + skb->protocol = key->eth.cvlan.tpid;
6801 + else
6802 + skb->protocol = key->eth.type;
6803 +
6804 skb_reset_network_header(skb);
6805 __skb_push(skb, skb->data - skb_mac_header(skb));
6806 }
6807 skb_reset_mac_len(skb);
6808 - key->eth.type = skb->protocol;
6809
6810 /* Network layer. */
6811 if (key->eth.type == htons(ETH_P_IP)) {
6812 diff --git a/net/rds/send.c b/net/rds/send.c
6813 index b52cdc8ae428..f72466c63f0c 100644
6814 --- a/net/rds/send.c
6815 +++ b/net/rds/send.c
6816 @@ -1009,6 +1009,9 @@ static int rds_rdma_bytes(struct msghdr *msg, size_t *rdma_bytes)
6817 continue;
6818
6819 if (cmsg->cmsg_type == RDS_CMSG_RDMA_ARGS) {
6820 + if (cmsg->cmsg_len <
6821 + CMSG_LEN(sizeof(struct rds_rdma_args)))
6822 + return -EINVAL;
6823 args = CMSG_DATA(cmsg);
6824 *rdma_bytes += args->remote_vec.bytes;
6825 }
6826 diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
6827 index 44de4ee51ce9..a08a32fa0949 100644
6828 --- a/net/sched/sch_ingress.c
6829 +++ b/net/sched/sch_ingress.c
6830 @@ -59,11 +59,12 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt)
6831 struct net_device *dev = qdisc_dev(sch);
6832 int err;
6833
6834 + net_inc_ingress_queue();
6835 +
6836 err = tcf_block_get(&q->block, &dev->ingress_cl_list);
6837 if (err)
6838 return err;
6839
6840 - net_inc_ingress_queue();
6841 sch->flags |= TCQ_F_CPUSTATS;
6842
6843 return 0;
6844 @@ -153,6 +154,9 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt)
6845 struct net_device *dev = qdisc_dev(sch);
6846 int err;
6847
6848 + net_inc_ingress_queue();
6849 + net_inc_egress_queue();
6850 +
6851 err = tcf_block_get(&q->ingress_block, &dev->ingress_cl_list);
6852 if (err)
6853 return err;
6854 @@ -161,9 +165,6 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt)
6855 if (err)
6856 return err;
6857
6858 - net_inc_ingress_queue();
6859 - net_inc_egress_queue();
6860 -
6861 sch->flags |= TCQ_F_CPUSTATS;
6862
6863 return 0;
6864 diff --git a/net/sctp/socket.c b/net/sctp/socket.c
6865 index d6163f7aefb1..df806b8819aa 100644
6866 --- a/net/sctp/socket.c
6867 +++ b/net/sctp/socket.c
6868 @@ -3874,13 +3874,17 @@ static int sctp_setsockopt_reset_streams(struct sock *sk,
6869 struct sctp_association *asoc;
6870 int retval = -EINVAL;
6871
6872 - if (optlen < sizeof(struct sctp_reset_streams))
6873 + if (optlen < sizeof(*params))
6874 return -EINVAL;
6875
6876 params = memdup_user(optval, optlen);
6877 if (IS_ERR(params))
6878 return PTR_ERR(params);
6879
6880 + if (params->srs_number_streams * sizeof(__u16) >
6881 + optlen - sizeof(*params))
6882 + goto out;
6883 +
6884 asoc = sctp_id2assoc(sk, params->srs_assoc_id);
6885 if (!asoc)
6886 goto out;
6887 @@ -4413,7 +4417,7 @@ static int sctp_init_sock(struct sock *sk)
6888 SCTP_DBG_OBJCNT_INC(sock);
6889
6890 local_bh_disable();
6891 - percpu_counter_inc(&sctp_sockets_allocated);
6892 + sk_sockets_allocated_inc(sk);
6893 sock_prot_inuse_add(net, sk->sk_prot, 1);
6894
6895 /* Nothing can fail after this block, otherwise
6896 @@ -4457,7 +4461,7 @@ static void sctp_destroy_sock(struct sock *sk)
6897 }
6898 sctp_endpoint_free(sp->ep);
6899 local_bh_disable();
6900 - percpu_counter_dec(&sctp_sockets_allocated);
6901 + sk_sockets_allocated_dec(sk);
6902 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
6903 local_bh_enable();
6904 }
6905 diff --git a/net/tipc/socket.c b/net/tipc/socket.c
6906 index d50edd6e0019..98a44ecb11e7 100644
6907 --- a/net/tipc/socket.c
6908 +++ b/net/tipc/socket.c
6909 @@ -709,11 +709,11 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock,
6910
6911 switch (sk->sk_state) {
6912 case TIPC_ESTABLISHED:
6913 + case TIPC_CONNECTING:
6914 if (!tsk->cong_link_cnt && !tsk_conn_cong(tsk))
6915 mask |= POLLOUT;
6916 /* fall thru' */
6917 case TIPC_LISTEN:
6918 - case TIPC_CONNECTING:
6919 if (!skb_queue_empty(&sk->sk_receive_queue))
6920 mask |= (POLLIN | POLLRDNORM);
6921 break;
6922 diff --git a/security/Kconfig b/security/Kconfig
6923 index e8e449444e65..6614b9312b45 100644
6924 --- a/security/Kconfig
6925 +++ b/security/Kconfig
6926 @@ -54,6 +54,17 @@ config SECURITY_NETWORK
6927 implement socket and networking access controls.
6928 If you are unsure how to answer this question, answer N.
6929
6930 +config PAGE_TABLE_ISOLATION
6931 + bool "Remove the kernel mapping in user mode"
6932 + depends on X86_64 && !UML
6933 + default y
6934 + help
6935 + This feature reduces the number of hardware side channels by
6936 + ensuring that the majority of kernel addresses are not mapped
6937 + into userspace.
6938 +
6939 + See Documentation/x86/pagetable-isolation.txt for more details.
6940 +
6941 config SECURITY_INFINIBAND
6942 bool "Infiniband Security Hooks"
6943 depends on SECURITY && INFINIBAND
6944 diff --git a/sound/hda/hdac_i915.c b/sound/hda/hdac_i915.c
6945 index 038a180d3f81..cbe818eda336 100644
6946 --- a/sound/hda/hdac_i915.c
6947 +++ b/sound/hda/hdac_i915.c
6948 @@ -325,7 +325,7 @@ static int hdac_component_master_match(struct device *dev, void *data)
6949 */
6950 int snd_hdac_i915_register_notifier(const struct i915_audio_component_audio_ops *aops)
6951 {
6952 - if (WARN_ON(!hdac_acomp))
6953 + if (!hdac_acomp)
6954 return -ENODEV;
6955
6956 hdac_acomp->audio_ops = aops;
6957 diff --git a/sound/pci/hda/patch_conexant.c b/sound/pci/hda/patch_conexant.c
6958 index a81aacf684b2..37e1cf8218ff 100644
6959 --- a/sound/pci/hda/patch_conexant.c
6960 +++ b/sound/pci/hda/patch_conexant.c
6961 @@ -271,6 +271,8 @@ enum {
6962 CXT_FIXUP_HP_SPECTRE,
6963 CXT_FIXUP_HP_GATE_MIC,
6964 CXT_FIXUP_MUTE_LED_GPIO,
6965 + CXT_FIXUP_HEADSET_MIC,
6966 + CXT_FIXUP_HP_MIC_NO_PRESENCE,
6967 };
6968
6969 /* for hda_fixup_thinkpad_acpi() */
6970 @@ -350,6 +352,18 @@ static void cxt_fixup_headphone_mic(struct hda_codec *codec,
6971 }
6972 }
6973
6974 +static void cxt_fixup_headset_mic(struct hda_codec *codec,
6975 + const struct hda_fixup *fix, int action)
6976 +{
6977 + struct conexant_spec *spec = codec->spec;
6978 +
6979 + switch (action) {
6980 + case HDA_FIXUP_ACT_PRE_PROBE:
6981 + spec->parse_flags |= HDA_PINCFG_HEADSET_MIC;
6982 + break;
6983 + }
6984 +}
6985 +
6986 /* OPLC XO 1.5 fixup */
6987
6988 /* OLPC XO-1.5 supports DC input mode (e.g. for use with analog sensors)
6989 @@ -880,6 +894,19 @@ static const struct hda_fixup cxt_fixups[] = {
6990 .type = HDA_FIXUP_FUNC,
6991 .v.func = cxt_fixup_mute_led_gpio,
6992 },
6993 + [CXT_FIXUP_HEADSET_MIC] = {
6994 + .type = HDA_FIXUP_FUNC,
6995 + .v.func = cxt_fixup_headset_mic,
6996 + },
6997 + [CXT_FIXUP_HP_MIC_NO_PRESENCE] = {
6998 + .type = HDA_FIXUP_PINS,
6999 + .v.pins = (const struct hda_pintbl[]) {
7000 + { 0x1a, 0x02a1113c },
7001 + { }
7002 + },
7003 + .chained = true,
7004 + .chain_id = CXT_FIXUP_HEADSET_MIC,
7005 + },
7006 };
7007
7008 static const struct snd_pci_quirk cxt5045_fixups[] = {
7009 @@ -934,6 +961,8 @@ static const struct snd_pci_quirk cxt5066_fixups[] = {
7010 SND_PCI_QUIRK(0x103c, 0x8115, "HP Z1 Gen3", CXT_FIXUP_HP_GATE_MIC),
7011 SND_PCI_QUIRK(0x103c, 0x814f, "HP ZBook 15u G3", CXT_FIXUP_MUTE_LED_GPIO),
7012 SND_PCI_QUIRK(0x103c, 0x822e, "HP ProBook 440 G4", CXT_FIXUP_MUTE_LED_GPIO),
7013 + SND_PCI_QUIRK(0x103c, 0x8299, "HP 800 G3 SFF", CXT_FIXUP_HP_MIC_NO_PRESENCE),
7014 + SND_PCI_QUIRK(0x103c, 0x829a, "HP 800 G3 DM", CXT_FIXUP_HP_MIC_NO_PRESENCE),
7015 SND_PCI_QUIRK(0x1043, 0x138d, "Asus", CXT_FIXUP_HEADPHONE_MIC_PIN),
7016 SND_PCI_QUIRK(0x152d, 0x0833, "OLPC XO-1.5", CXT_FIXUP_OLPC_XO),
7017 SND_PCI_QUIRK(0x17aa, 0x20f2, "Lenovo T400", CXT_PINCFG_LENOVO_TP410),
7018 diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
7019 index 9ac4b9076ee2..acdb196ddb44 100644
7020 --- a/sound/pci/hda/patch_realtek.c
7021 +++ b/sound/pci/hda/patch_realtek.c
7022 @@ -324,8 +324,12 @@ static void alc_fill_eapd_coef(struct hda_codec *codec)
7023 case 0x10ec0292:
7024 alc_update_coef_idx(codec, 0x4, 1<<15, 0);
7025 break;
7026 - case 0x10ec0215:
7027 case 0x10ec0225:
7028 + case 0x10ec0295:
7029 + case 0x10ec0299:
7030 + alc_update_coef_idx(codec, 0x67, 0xf000, 0x3000);
7031 + /* fallthrough */
7032 + case 0x10ec0215:
7033 case 0x10ec0233:
7034 case 0x10ec0236:
7035 case 0x10ec0255:
7036 @@ -336,10 +340,8 @@ static void alc_fill_eapd_coef(struct hda_codec *codec)
7037 case 0x10ec0286:
7038 case 0x10ec0288:
7039 case 0x10ec0285:
7040 - case 0x10ec0295:
7041 case 0x10ec0298:
7042 case 0x10ec0289:
7043 - case 0x10ec0299:
7044 alc_update_coef_idx(codec, 0x10, 1<<9, 0);
7045 break;
7046 case 0x10ec0275:
7047 @@ -6305,6 +6307,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = {
7048 SND_PCI_QUIRK(0x17aa, 0x30bb, "ThinkCentre AIO", ALC233_FIXUP_LENOVO_LINE2_MIC_HOTKEY),
7049 SND_PCI_QUIRK(0x17aa, 0x30e2, "ThinkCentre AIO", ALC233_FIXUP_LENOVO_LINE2_MIC_HOTKEY),
7050 SND_PCI_QUIRK(0x17aa, 0x310c, "ThinkCentre Station", ALC294_FIXUP_LENOVO_MIC_LOCATION),
7051 + SND_PCI_QUIRK(0x17aa, 0x313c, "ThinkCentre Station", ALC294_FIXUP_LENOVO_MIC_LOCATION),
7052 SND_PCI_QUIRK(0x17aa, 0x3112, "ThinkCentre AIO", ALC233_FIXUP_LENOVO_LINE2_MIC_HOTKEY),
7053 SND_PCI_QUIRK(0x17aa, 0x3902, "Lenovo E50-80", ALC269_FIXUP_DMIC_THINKPAD_ACPI),
7054 SND_PCI_QUIRK(0x17aa, 0x3977, "IdeaPad S210", ALC283_FIXUP_INT_MIC),
7055 @@ -6557,6 +6560,11 @@ static const struct snd_hda_pin_quirk alc269_pin_fixup_tbl[] = {
7056 SND_HDA_PIN_QUIRK(0x10ec0255, 0x1028, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE,
7057 {0x1b, 0x01011020},
7058 {0x21, 0x02211010}),
7059 + SND_HDA_PIN_QUIRK(0x10ec0256, 0x1028, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE,
7060 + {0x12, 0x90a60130},
7061 + {0x14, 0x90170110},
7062 + {0x1b, 0x01011020},
7063 + {0x21, 0x0221101f}),
7064 SND_HDA_PIN_QUIRK(0x10ec0256, 0x1028, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE,
7065 {0x12, 0x90a60160},
7066 {0x14, 0x90170120},
7067 diff --git a/sound/soc/codecs/da7218.c b/sound/soc/codecs/da7218.c
7068 index b2d42ec1dcd9..56564ce90cb6 100644
7069 --- a/sound/soc/codecs/da7218.c
7070 +++ b/sound/soc/codecs/da7218.c
7071 @@ -2520,7 +2520,7 @@ static struct da7218_pdata *da7218_of_to_pdata(struct snd_soc_codec *codec)
7072 }
7073
7074 if (da7218->dev_id == DA7218_DEV_ID) {
7075 - hpldet_np = of_find_node_by_name(np, "da7218_hpldet");
7076 + hpldet_np = of_get_child_by_name(np, "da7218_hpldet");
7077 if (!hpldet_np)
7078 return pdata;
7079
7080 diff --git a/sound/soc/codecs/msm8916-wcd-analog.c b/sound/soc/codecs/msm8916-wcd-analog.c
7081 index 18933bf6473f..8c7063e1aa46 100644
7082 --- a/sound/soc/codecs/msm8916-wcd-analog.c
7083 +++ b/sound/soc/codecs/msm8916-wcd-analog.c
7084 @@ -267,7 +267,7 @@
7085 #define MSM8916_WCD_ANALOG_RATES (SNDRV_PCM_RATE_8000 | SNDRV_PCM_RATE_16000 |\
7086 SNDRV_PCM_RATE_32000 | SNDRV_PCM_RATE_48000)
7087 #define MSM8916_WCD_ANALOG_FORMATS (SNDRV_PCM_FMTBIT_S16_LE |\
7088 - SNDRV_PCM_FMTBIT_S24_LE)
7089 + SNDRV_PCM_FMTBIT_S32_LE)
7090
7091 static int btn_mask = SND_JACK_BTN_0 | SND_JACK_BTN_1 |
7092 SND_JACK_BTN_2 | SND_JACK_BTN_3 | SND_JACK_BTN_4;
7093 diff --git a/sound/soc/codecs/msm8916-wcd-digital.c b/sound/soc/codecs/msm8916-wcd-digital.c
7094 index 66df8f810f0d..694db27b11fa 100644
7095 --- a/sound/soc/codecs/msm8916-wcd-digital.c
7096 +++ b/sound/soc/codecs/msm8916-wcd-digital.c
7097 @@ -194,7 +194,7 @@
7098 SNDRV_PCM_RATE_32000 | \
7099 SNDRV_PCM_RATE_48000)
7100 #define MSM8916_WCD_DIGITAL_FORMATS (SNDRV_PCM_FMTBIT_S16_LE |\
7101 - SNDRV_PCM_FMTBIT_S24_LE)
7102 + SNDRV_PCM_FMTBIT_S32_LE)
7103
7104 struct msm8916_wcd_digital_priv {
7105 struct clk *ahbclk, *mclk;
7106 @@ -645,7 +645,7 @@ static int msm8916_wcd_digital_hw_params(struct snd_pcm_substream *substream,
7107 RX_I2S_CTL_RX_I2S_MODE_MASK,
7108 RX_I2S_CTL_RX_I2S_MODE_16);
7109 break;
7110 - case SNDRV_PCM_FORMAT_S24_LE:
7111 + case SNDRV_PCM_FORMAT_S32_LE:
7112 snd_soc_update_bits(dai->codec, LPASS_CDC_CLK_TX_I2S_CTL,
7113 TX_I2S_CTL_TX_I2S_MODE_MASK,
7114 TX_I2S_CTL_TX_I2S_MODE_32);
7115 diff --git a/sound/soc/codecs/tlv320aic31xx.h b/sound/soc/codecs/tlv320aic31xx.h
7116 index 730fb2058869..1ff3edb7bbb6 100644
7117 --- a/sound/soc/codecs/tlv320aic31xx.h
7118 +++ b/sound/soc/codecs/tlv320aic31xx.h
7119 @@ -116,7 +116,7 @@ struct aic31xx_pdata {
7120 /* INT2 interrupt control */
7121 #define AIC31XX_INT2CTRL AIC31XX_REG(0, 49)
7122 /* GPIO1 control */
7123 -#define AIC31XX_GPIO1 AIC31XX_REG(0, 50)
7124 +#define AIC31XX_GPIO1 AIC31XX_REG(0, 51)
7125
7126 #define AIC31XX_DACPRB AIC31XX_REG(0, 60)
7127 /* ADC Instruction Set Register */
7128 diff --git a/sound/soc/codecs/twl4030.c b/sound/soc/codecs/twl4030.c
7129 index c482b2e7a7d2..cfe72b9d4356 100644
7130 --- a/sound/soc/codecs/twl4030.c
7131 +++ b/sound/soc/codecs/twl4030.c
7132 @@ -232,7 +232,7 @@ static struct twl4030_codec_data *twl4030_get_pdata(struct snd_soc_codec *codec)
7133 struct twl4030_codec_data *pdata = dev_get_platdata(codec->dev);
7134 struct device_node *twl4030_codec_node = NULL;
7135
7136 - twl4030_codec_node = of_find_node_by_name(codec->dev->parent->of_node,
7137 + twl4030_codec_node = of_get_child_by_name(codec->dev->parent->of_node,
7138 "codec");
7139
7140 if (!pdata && twl4030_codec_node) {
7141 @@ -241,9 +241,11 @@ static struct twl4030_codec_data *twl4030_get_pdata(struct snd_soc_codec *codec)
7142 GFP_KERNEL);
7143 if (!pdata) {
7144 dev_err(codec->dev, "Can not allocate memory\n");
7145 + of_node_put(twl4030_codec_node);
7146 return NULL;
7147 }
7148 twl4030_setup_pdata_of(pdata, twl4030_codec_node);
7149 + of_node_put(twl4030_codec_node);
7150 }
7151
7152 return pdata;
7153 diff --git a/sound/soc/codecs/wm_adsp.c b/sound/soc/codecs/wm_adsp.c
7154 index 65c059b5ffd7..66e32f5d2917 100644
7155 --- a/sound/soc/codecs/wm_adsp.c
7156 +++ b/sound/soc/codecs/wm_adsp.c
7157 @@ -1733,7 +1733,7 @@ static int wm_adsp_load(struct wm_adsp *dsp)
7158 le64_to_cpu(footer->timestamp));
7159
7160 while (pos < firmware->size &&
7161 - pos - firmware->size > sizeof(*region)) {
7162 + sizeof(*region) < firmware->size - pos) {
7163 region = (void *)&(firmware->data[pos]);
7164 region_name = "Unknown";
7165 reg = 0;
7166 @@ -1782,8 +1782,8 @@ static int wm_adsp_load(struct wm_adsp *dsp)
7167 regions, le32_to_cpu(region->len), offset,
7168 region_name);
7169
7170 - if ((pos + le32_to_cpu(region->len) + sizeof(*region)) >
7171 - firmware->size) {
7172 + if (le32_to_cpu(region->len) >
7173 + firmware->size - pos - sizeof(*region)) {
7174 adsp_err(dsp,
7175 "%s.%d: %s region len %d bytes exceeds file length %zu\n",
7176 file, regions, region_name,
7177 @@ -2253,7 +2253,7 @@ static int wm_adsp_load_coeff(struct wm_adsp *dsp)
7178
7179 blocks = 0;
7180 while (pos < firmware->size &&
7181 - pos - firmware->size > sizeof(*blk)) {
7182 + sizeof(*blk) < firmware->size - pos) {
7183 blk = (void *)(&firmware->data[pos]);
7184
7185 type = le16_to_cpu(blk->type);
7186 @@ -2327,8 +2327,8 @@ static int wm_adsp_load_coeff(struct wm_adsp *dsp)
7187 }
7188
7189 if (reg) {
7190 - if ((pos + le32_to_cpu(blk->len) + sizeof(*blk)) >
7191 - firmware->size) {
7192 + if (le32_to_cpu(blk->len) >
7193 + firmware->size - pos - sizeof(*blk)) {
7194 adsp_err(dsp,
7195 "%s.%d: %s region len %d bytes exceeds file length %zu\n",
7196 file, blocks, region_name,
7197 diff --git a/sound/soc/fsl/fsl_ssi.c b/sound/soc/fsl/fsl_ssi.c
7198 index 64598d1183f8..3ffbb498cc70 100644
7199 --- a/sound/soc/fsl/fsl_ssi.c
7200 +++ b/sound/soc/fsl/fsl_ssi.c
7201 @@ -1452,12 +1452,6 @@ static int fsl_ssi_probe(struct platform_device *pdev)
7202 sizeof(fsl_ssi_ac97_dai));
7203
7204 fsl_ac97_data = ssi_private;
7205 -
7206 - ret = snd_soc_set_ac97_ops_of_reset(&fsl_ssi_ac97_ops, pdev);
7207 - if (ret) {
7208 - dev_err(&pdev->dev, "could not set AC'97 ops\n");
7209 - return ret;
7210 - }
7211 } else {
7212 /* Initialize this copy of the CPU DAI driver structure */
7213 memcpy(&ssi_private->cpu_dai_drv, &fsl_ssi_dai_template,
7214 @@ -1568,6 +1562,14 @@ static int fsl_ssi_probe(struct platform_device *pdev)
7215 return ret;
7216 }
7217
7218 + if (fsl_ssi_is_ac97(ssi_private)) {
7219 + ret = snd_soc_set_ac97_ops_of_reset(&fsl_ssi_ac97_ops, pdev);
7220 + if (ret) {
7221 + dev_err(&pdev->dev, "could not set AC'97 ops\n");
7222 + goto error_ac97_ops;
7223 + }
7224 + }
7225 +
7226 ret = devm_snd_soc_register_component(&pdev->dev, &fsl_ssi_component,
7227 &ssi_private->cpu_dai_drv, 1);
7228 if (ret) {
7229 @@ -1651,6 +1653,10 @@ static int fsl_ssi_probe(struct platform_device *pdev)
7230 fsl_ssi_debugfs_remove(&ssi_private->dbg_stats);
7231
7232 error_asoc_register:
7233 + if (fsl_ssi_is_ac97(ssi_private))
7234 + snd_soc_set_ac97_ops(NULL);
7235 +
7236 +error_ac97_ops:
7237 if (ssi_private->soc->imx)
7238 fsl_ssi_imx_clean(pdev, ssi_private);
7239
7240 diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c
7241 index 0304ffb714f2..1aef72df20a1 100644
7242 --- a/tools/testing/selftests/x86/ldt_gdt.c
7243 +++ b/tools/testing/selftests/x86/ldt_gdt.c
7244 @@ -122,8 +122,7 @@ static void check_valid_segment(uint16_t index, int ldt,
7245 * NB: Different Linux versions do different things with the
7246 * accessed bit in set_thread_area().
7247 */
7248 - if (ar != expected_ar &&
7249 - (ldt || ar != (expected_ar | AR_ACCESSED))) {
7250 + if (ar != expected_ar && ar != (expected_ar | AR_ACCESSED)) {
7251 printf("[FAIL]\t%s entry %hu has AR 0x%08X but expected 0x%08X\n",
7252 (ldt ? "LDT" : "GDT"), index, ar, expected_ar);
7253 nerrs++;
7254 diff --git a/tools/usb/usbip/src/utils.c b/tools/usb/usbip/src/utils.c
7255 index 2b3d6d235015..3d7b42e77299 100644
7256 --- a/tools/usb/usbip/src/utils.c
7257 +++ b/tools/usb/usbip/src/utils.c
7258 @@ -30,6 +30,7 @@ int modify_match_busid(char *busid, int add)
7259 char command[SYSFS_BUS_ID_SIZE + 4];
7260 char match_busid_attr_path[SYSFS_PATH_MAX];
7261 int rc;
7262 + int cmd_size;
7263
7264 snprintf(match_busid_attr_path, sizeof(match_busid_attr_path),
7265 "%s/%s/%s/%s/%s/%s", SYSFS_MNT_PATH, SYSFS_BUS_NAME,
7266 @@ -37,12 +38,14 @@ int modify_match_busid(char *busid, int add)
7267 attr_name);
7268
7269 if (add)
7270 - snprintf(command, SYSFS_BUS_ID_SIZE + 4, "add %s", busid);
7271 + cmd_size = snprintf(command, SYSFS_BUS_ID_SIZE + 4, "add %s",
7272 + busid);
7273 else
7274 - snprintf(command, SYSFS_BUS_ID_SIZE + 4, "del %s", busid);
7275 + cmd_size = snprintf(command, SYSFS_BUS_ID_SIZE + 4, "del %s",
7276 + busid);
7277
7278 rc = write_sysfs_attribute(match_busid_attr_path, command,
7279 - sizeof(command));
7280 + cmd_size);
7281 if (rc < 0) {
7282 dbg("failed to write match_busid: %s", strerror(errno));
7283 return -1;