Magellan Linux

Annotation of /trunk/kernel-magellan/patches-4.13/0102-4.13.3-all-fixes.patch

Revision 2995
Mon Oct 9 08:50:06 2017 UTC by niro
File size: 89990 byte(s)
-linux-4.13.3
1 niro 2995 diff --git a/Documentation/ABI/testing/sysfs-bus-thunderbolt b/Documentation/ABI/testing/sysfs-bus-thunderbolt
2     index 2a98149943ea..392bef5bd399 100644
3     --- a/Documentation/ABI/testing/sysfs-bus-thunderbolt
4     +++ b/Documentation/ABI/testing/sysfs-bus-thunderbolt
5     @@ -45,6 +45,8 @@ Contact: thunderbolt-software@lists.01.org
6     Description: When a devices supports Thunderbolt secure connect it will
7     have this attribute. Writing 32 byte hex string changes
8     authorization to use the secure connection method instead.
9     + Writing an empty string clears the key and regular connection
10     + method can be used again.
11    
12     What: /sys/bus/thunderbolt/devices/.../device
13     Date: Sep 2017
14     diff --git a/Makefile b/Makefile
15     index 8aad6bc50d52..0f31ef4aea7b 100644
16     --- a/Makefile
17     +++ b/Makefile
18     @@ -1,6 +1,6 @@
19     VERSION = 4
20     PATCHLEVEL = 13
21     -SUBLEVEL = 2
22     +SUBLEVEL = 3
23     EXTRAVERSION =
24     NAME = Fearless Coyote
25    
26     diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
27     index 9aeb91935ce0..e2c4dd051ef8 100644
28     --- a/arch/x86/include/asm/elf.h
29     +++ b/arch/x86/include/asm/elf.h
30     @@ -204,6 +204,7 @@ void set_personality_ia32(bool);
31    
32     #define ELF_CORE_COPY_REGS(pr_reg, regs) \
33     do { \
34     + unsigned long base; \
35     unsigned v; \
36     (pr_reg)[0] = (regs)->r15; \
37     (pr_reg)[1] = (regs)->r14; \
38     @@ -226,8 +227,8 @@ do { \
39     (pr_reg)[18] = (regs)->flags; \
40     (pr_reg)[19] = (regs)->sp; \
41     (pr_reg)[20] = (regs)->ss; \
42     - (pr_reg)[21] = current->thread.fsbase; \
43     - (pr_reg)[22] = current->thread.gsbase; \
44     + rdmsrl(MSR_FS_BASE, base); (pr_reg)[21] = base; \
45     + rdmsrl(MSR_KERNEL_GS_BASE, base); (pr_reg)[22] = base; \
46     asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v; \
47     asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v; \
48     asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v; \
49     diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
50     index b4a0d43248cf..b50df06ad251 100644
51     --- a/arch/x86/include/asm/page_64.h
52     +++ b/arch/x86/include/asm/page_64.h
53     @@ -51,6 +51,10 @@ static inline void clear_page(void *page)
54    
55     void copy_page(void *to, void *from);
56    
57     +#ifdef CONFIG_X86_MCE
58     +#define arch_unmap_kpfn arch_unmap_kpfn
59     +#endif
60     +
61     #endif /* !__ASSEMBLY__ */
62    
63     #ifdef CONFIG_X86_VSYSCALL_EMULATION
64     diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
65     index 6dde0497efc7..3b413065c613 100644
66     --- a/arch/x86/kernel/cpu/mcheck/mce.c
67     +++ b/arch/x86/kernel/cpu/mcheck/mce.c
68     @@ -51,6 +51,7 @@
69     #include <asm/mce.h>
70     #include <asm/msr.h>
71     #include <asm/reboot.h>
72     +#include <asm/set_memory.h>
73    
74     #include "mce-internal.h"
75    
76     @@ -1051,6 +1052,48 @@ static int do_memory_failure(struct mce *m)
77     return ret;
78     }
79    
80     +#if defined(arch_unmap_kpfn) && defined(CONFIG_MEMORY_FAILURE)
81     +
82     +void arch_unmap_kpfn(unsigned long pfn)
83     +{
84     + unsigned long decoy_addr;
85     +
86     + /*
87     + * Unmap this page from the kernel 1:1 mappings to make sure
88     + * we don't log more errors because of speculative access to
89     + * the page.
90     + * We would like to just call:
91     + * set_memory_np((unsigned long)pfn_to_kaddr(pfn), 1);
92     + * but doing that would radically increase the odds of a
93     + * speculative access to the posion page because we'd have
94     + * the virtual address of the kernel 1:1 mapping sitting
95     + * around in registers.
96     + * Instead we get tricky. We create a non-canonical address
97     + * that looks just like the one we want, but has bit 63 flipped.
98     + * This relies on set_memory_np() not checking whether we passed
99     + * a legal address.
100     + */
101     +
102     +/*
103     + * Build time check to see if we have a spare virtual bit. Don't want
104     + * to leave this until run time because most developers don't have a
105     + * system that can exercise this code path. This will only become a
106     + * problem if/when we move beyond 5-level page tables.
107     + *
108     + * Hard code "9" here because cpp doesn't grok ilog2(PTRS_PER_PGD)
109     + */
110     +#if PGDIR_SHIFT + 9 < 63
111     + decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
112     +#else
113     +#error "no unused virtual bit available"
114     +#endif
115     +
116     + if (set_memory_np(decoy_addr, 1))
117     + pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
118     +
119     +}
120     +#endif
121     +
122     /*
123     * The actual machine check handler. This only handles real
124     * exceptions when something got corrupted coming in through int 18.
125     diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
126     index c3169be4c596..8c44e0cb2912 100644
127     --- a/arch/x86/kernel/process_64.c
128     +++ b/arch/x86/kernel/process_64.c
129     @@ -149,6 +149,123 @@ void release_thread(struct task_struct *dead_task)
130     }
131     }
132    
133     +enum which_selector {
134     + FS,
135     + GS
136     +};
137     +
138     +/*
139     + * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
140     + * not available. The goal is to be reasonably fast on non-FSGSBASE systems.
141     + * It's forcibly inlined because it'll generate better code and this function
142     + * is hot.
143     + */
144     +static __always_inline void save_base_legacy(struct task_struct *prev_p,
145     + unsigned short selector,
146     + enum which_selector which)
147     +{
148     + if (likely(selector == 0)) {
149     + /*
150     + * On Intel (without X86_BUG_NULL_SEG), the segment base could
151     + * be the pre-existing saved base or it could be zero. On AMD
152     + * (with X86_BUG_NULL_SEG), the segment base could be almost
153     + * anything.
154     + *
155     + * This branch is very hot (it's hit twice on almost every
156     + * context switch between 64-bit programs), and avoiding
157     + * the RDMSR helps a lot, so we just assume that whatever
158     + * value is already saved is correct. This matches historical
159     + * Linux behavior, so it won't break existing applications.
160     + *
161     + * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
162     + * report that the base is zero, it needs to actually be zero:
163     + * see the corresponding logic in load_seg_legacy.
164     + */
165     + } else {
166     + /*
167     + * If the selector is 1, 2, or 3, then the base is zero on
168     + * !X86_BUG_NULL_SEG CPUs and could be anything on
169     + * X86_BUG_NULL_SEG CPUs. In the latter case, Linux
170     + * has never attempted to preserve the base across context
171     + * switches.
172     + *
173     + * If selector > 3, then it refers to a real segment, and
174     + * saving the base isn't necessary.
175     + */
176     + if (which == FS)
177     + prev_p->thread.fsbase = 0;
178     + else
179     + prev_p->thread.gsbase = 0;
180     + }
181     +}
182     +
183     +static __always_inline void save_fsgs(struct task_struct *task)
184     +{
185     + savesegment(fs, task->thread.fsindex);
186     + savesegment(gs, task->thread.gsindex);
187     + save_base_legacy(task, task->thread.fsindex, FS);
188     + save_base_legacy(task, task->thread.gsindex, GS);
189     +}
190     +
191     +static __always_inline void loadseg(enum which_selector which,
192     + unsigned short sel)
193     +{
194     + if (which == FS)
195     + loadsegment(fs, sel);
196     + else
197     + load_gs_index(sel);
198     +}
199     +
200     +static __always_inline void load_seg_legacy(unsigned short prev_index,
201     + unsigned long prev_base,
202     + unsigned short next_index,
203     + unsigned long next_base,
204     + enum which_selector which)
205     +{
206     + if (likely(next_index <= 3)) {
207     + /*
208     + * The next task is using 64-bit TLS, is not using this
209     + * segment at all, or is having fun with arcane CPU features.
210     + */
211     + if (next_base == 0) {
212     + /*
213     + * Nasty case: on AMD CPUs, we need to forcibly zero
214     + * the base.
215     + */
216     + if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
217     + loadseg(which, __USER_DS);
218     + loadseg(which, next_index);
219     + } else {
220     + /*
221     + * We could try to exhaustively detect cases
222     + * under which we can skip the segment load,
223     + * but there's really only one case that matters
224     + * for performance: if both the previous and
225     + * next states are fully zeroed, we can skip
226     + * the load.
227     + *
228     + * (This assumes that prev_base == 0 has no
229     + * false positives. This is the case on
230     + * Intel-style CPUs.)
231     + */
232     + if (likely(prev_index | next_index | prev_base))
233     + loadseg(which, next_index);
234     + }
235     + } else {
236     + if (prev_index != next_index)
237     + loadseg(which, next_index);
238     + wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
239     + next_base);
240     + }
241     + } else {
242     + /*
243     + * The next task is using a real segment. Loading the selector
244     + * is sufficient.
245     + */
246     + loadseg(which, next_index);
247     + }
248     +}
249     +
250     int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
251     unsigned long arg, struct task_struct *p, unsigned long tls)
252     {
253     @@ -229,10 +346,19 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
254     unsigned long new_sp,
255     unsigned int _cs, unsigned int _ss, unsigned int _ds)
256     {
257     + WARN_ON_ONCE(regs != current_pt_regs());
258     +
259     + if (static_cpu_has(X86_BUG_NULL_SEG)) {
260     + /* Loading zero below won't clear the base. */
261     + loadsegment(fs, __USER_DS);
262     + load_gs_index(__USER_DS);
263     + }
264     +
265     loadsegment(fs, 0);
266     loadsegment(es, _ds);
267     loadsegment(ds, _ds);
268     load_gs_index(0);
269     +
270     regs->ip = new_ip;
271     regs->sp = new_sp;
272     regs->cs = _cs;
273     @@ -277,7 +403,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
274     struct fpu *next_fpu = &next->fpu;
275     int cpu = smp_processor_id();
276     struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
277     - unsigned prev_fsindex, prev_gsindex;
278    
279     switch_fpu_prepare(prev_fpu, cpu);
280    
281     @@ -286,8 +411,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
282     *
283     * (e.g. xen_load_tls())
284     */
285     - savesegment(fs, prev_fsindex);
286     - savesegment(gs, prev_gsindex);
287     + save_fsgs(prev_p);
288    
289     /*
290     * Load TLS before restoring any segments so that segment loads
291     @@ -326,108 +450,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
292     if (unlikely(next->ds | prev->ds))
293     loadsegment(ds, next->ds);
294    
295     - /*
296     - * Switch FS and GS.
297     - *
298     - * These are even more complicated than DS and ES: they have
299     - * 64-bit bases are that controlled by arch_prctl. The bases
300     - * don't necessarily match the selectors, as user code can do
301     - * any number of things to cause them to be inconsistent.
302     - *
303     - * We don't promise to preserve the bases if the selectors are
304     - * nonzero. We also don't promise to preserve the base if the
305     - * selector is zero and the base doesn't match whatever was
306     - * most recently passed to ARCH_SET_FS/GS. (If/when the
307     - * FSGSBASE instructions are enabled, we'll need to offer
308     - * stronger guarantees.)
309     - *
310     - * As an invariant,
311     - * (fsbase != 0 && fsindex != 0) || (gsbase != 0 && gsindex != 0) is
312     - * impossible.
313     - */
314     - if (next->fsindex) {
315     - /* Loading a nonzero value into FS sets the index and base. */
316     - loadsegment(fs, next->fsindex);
317     - } else {
318     - if (next->fsbase) {
319     - /* Next index is zero but next base is nonzero. */
320     - if (prev_fsindex)
321     - loadsegment(fs, 0);
322     - wrmsrl(MSR_FS_BASE, next->fsbase);
323     - } else {
324     - /* Next base and index are both zero. */
325     - if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
326     - /*
327     - * We don't know the previous base and can't
328     - * find out without RDMSR. Forcibly clear it.
329     - */
330     - loadsegment(fs, __USER_DS);
331     - loadsegment(fs, 0);
332     - } else {
333     - /*
334     - * If the previous index is zero and ARCH_SET_FS
335     - * didn't change the base, then the base is
336     - * also zero and we don't need to do anything.
337     - */
338     - if (prev->fsbase || prev_fsindex)
339     - loadsegment(fs, 0);
340     - }
341     - }
342     - }
343     - /*
344     - * Save the old state and preserve the invariant.
345     - * NB: if prev_fsindex == 0, then we can't reliably learn the base
346     - * without RDMSR because Intel user code can zero it without telling
347     - * us and AMD user code can program any 32-bit value without telling
348     - * us.
349     - */
350     - if (prev_fsindex)
351     - prev->fsbase = 0;
352     - prev->fsindex = prev_fsindex;
353     -
354     - if (next->gsindex) {
355     - /* Loading a nonzero value into GS sets the index and base. */
356     - load_gs_index(next->gsindex);
357     - } else {
358     - if (next->gsbase) {
359     - /* Next index is zero but next base is nonzero. */
360     - if (prev_gsindex)
361     - load_gs_index(0);
362     - wrmsrl(MSR_KERNEL_GS_BASE, next->gsbase);
363     - } else {
364     - /* Next base and index are both zero. */
365     - if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
366     - /*
367     - * We don't know the previous base and can't
368     - * find out without RDMSR. Forcibly clear it.
369     - *
370     - * This contains a pointless SWAPGS pair.
371     - * Fixing it would involve an explicit check
372     - * for Xen or a new pvop.
373     - */
374     - load_gs_index(__USER_DS);
375     - load_gs_index(0);
376     - } else {
377     - /*
378     - * If the previous index is zero and ARCH_SET_GS
379     - * didn't change the base, then the base is
380     - * also zero and we don't need to do anything.
381     - */
382     - if (prev->gsbase || prev_gsindex)
383     - load_gs_index(0);
384     - }
385     - }
386     - }
387     - /*
388     - * Save the old state and preserve the invariant.
389     - * NB: if prev_gsindex == 0, then we can't reliably learn the base
390     - * without RDMSR because Intel user code can zero it without telling
391     - * us and AMD user code can program any 32-bit value without telling
392     - * us.
393     - */
394     - if (prev_gsindex)
395     - prev->gsbase = 0;
396     - prev->gsindex = prev_gsindex;
397     + load_seg_legacy(prev->fsindex, prev->fsbase,
398     + next->fsindex, next->fsbase, FS);
399     + load_seg_legacy(prev->gsindex, prev->gsbase,
400     + next->gsindex, next->gsbase, GS);
401    
402     switch_fpu_finish(next_fpu, cpu);
403    
404     diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
405     index f50958ded9f0..79474f47eeef 100644
406     --- a/drivers/md/raid1.c
407     +++ b/drivers/md/raid1.c
408     @@ -2564,6 +2564,23 @@ static int init_resync(struct r1conf *conf)
409     return 0;
410     }
411    
412     +static struct r1bio *raid1_alloc_init_r1buf(struct r1conf *conf)
413     +{
414     + struct r1bio *r1bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
415     + struct resync_pages *rps;
416     + struct bio *bio;
417     + int i;
418     +
419     + for (i = conf->poolinfo->raid_disks; i--; ) {
420     + bio = r1bio->bios[i];
421     + rps = bio->bi_private;
422     + bio_reset(bio);
423     + bio->bi_private = rps;
424     + }
425     + r1bio->master_bio = NULL;
426     + return r1bio;
427     +}
428     +
429     /*
430     * perform a "sync" on one "block"
431     *
432     @@ -2649,7 +2666,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
433    
434     bitmap_cond_end_sync(mddev->bitmap, sector_nr,
435     mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));
436     - r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
437     + r1_bio = raid1_alloc_init_r1buf(conf);
438    
439     raise_barrier(conf, sector_nr);
440    
441     diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
442     index f55d4cc085f6..d51ac02e98ef 100644
443     --- a/drivers/md/raid10.c
444     +++ b/drivers/md/raid10.c
445     @@ -2798,6 +2798,35 @@ static int init_resync(struct r10conf *conf)
446     return 0;
447     }
448    
449     +static struct r10bio *raid10_alloc_init_r10buf(struct r10conf *conf)
450     +{
451     + struct r10bio *r10bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
452     + struct rsync_pages *rp;
453     + struct bio *bio;
454     + int nalloc;
455     + int i;
456     +
457     + if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
458     + test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
459     + nalloc = conf->copies; /* resync */
460     + else
461     + nalloc = 2; /* recovery */
462     +
463     + for (i = 0; i < nalloc; i++) {
464     + bio = r10bio->devs[i].bio;
465     + rp = bio->bi_private;
466     + bio_reset(bio);
467     + bio->bi_private = rp;
468     + bio = r10bio->devs[i].repl_bio;
469     + if (bio) {
470     + rp = bio->bi_private;
471     + bio_reset(bio);
472     + bio->bi_private = rp;
473     + }
474     + }
475     + return r10bio;
476     +}
477     +
478     /*
479     * perform a "sync" on one "block"
480     *
481     @@ -3027,7 +3056,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
482     atomic_inc(&mreplace->nr_pending);
483     rcu_read_unlock();
484    
485     - r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
486     + r10_bio = raid10_alloc_init_r10buf(conf);
487     r10_bio->state = 0;
488     raise_barrier(conf, rb2 != NULL);
489     atomic_set(&r10_bio->remaining, 0);
490     @@ -3236,7 +3265,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
491     }
492     if (sync_blocks < max_sync)
493     max_sync = sync_blocks;
494     - r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
495     + r10_bio = raid10_alloc_init_r10buf(conf);
496     r10_bio->state = 0;
497    
498     r10_bio->mddev = mddev;
499     @@ -4360,7 +4389,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
500    
501     read_more:
502     /* Now schedule reads for blocks from sector_nr to last */
503     - r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
504     + r10_bio = raid10_alloc_init_r10buf(conf);
505     r10_bio->state = 0;
506     raise_barrier(conf, sectors_done != 0);
507     atomic_set(&r10_bio->remaining, 0);
508     diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
509     index 0fc2748aaf95..e13a8ce7f589 100644
510     --- a/drivers/md/raid5.c
511     +++ b/drivers/md/raid5.c
512     @@ -6235,6 +6235,10 @@ static void raid5_do_work(struct work_struct *work)
513    
514     spin_unlock_irq(&conf->device_lock);
515    
516     + flush_deferred_bios(conf);
517     +
518     + r5l_flush_stripe_to_raid(conf->log);
519     +
520     async_tx_issue_pending_all();
521     blk_finish_plug(&plug);
522    
523     diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c
524     index c4b4b0a1bbf0..5be52d89b182 100644
525     --- a/drivers/net/ethernet/freescale/gianfar.c
526     +++ b/drivers/net/ethernet/freescale/gianfar.c
527     @@ -3687,7 +3687,7 @@ static noinline void gfar_update_link_state(struct gfar_private *priv)
528     u32 tempval1 = gfar_read(&regs->maccfg1);
529     u32 tempval = gfar_read(&regs->maccfg2);
530     u32 ecntrl = gfar_read(&regs->ecntrl);
531     - u32 tx_flow_oldval = (tempval & MACCFG1_TX_FLOW);
532     + u32 tx_flow_oldval = (tempval1 & MACCFG1_TX_FLOW);
533    
534     if (phydev->duplex != priv->oldduplex) {
535     if (!(phydev->duplex))
536     diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
537     index 14323faf8bd9..7ec6393b6ba1 100644
538     --- a/drivers/nvdimm/btt.c
539     +++ b/drivers/nvdimm/btt.c
540     @@ -1429,6 +1429,8 @@ int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns)
541     }
542    
543     btt_sb = devm_kzalloc(&nd_btt->dev, sizeof(*btt_sb), GFP_KERNEL);
544     + if (!btt_sb)
545     + return -ENOMEM;
546    
547     /*
548     * If this returns < 0, that is ok as it just means there wasn't
549     diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
550     index 937fafa1886a..54eb14c7ef90 100644
551     --- a/drivers/nvdimm/bus.c
552     +++ b/drivers/nvdimm/bus.c
553     @@ -905,19 +905,20 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
554     int read_only, unsigned int ioctl_cmd, unsigned long arg)
555     {
556     struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc;
557     - size_t buf_len = 0, in_len = 0, out_len = 0;
558     static char out_env[ND_CMD_MAX_ENVELOPE];
559     static char in_env[ND_CMD_MAX_ENVELOPE];
560     const struct nd_cmd_desc *desc = NULL;
561     unsigned int cmd = _IOC_NR(ioctl_cmd);
562     - unsigned int func = cmd;
563     - void __user *p = (void __user *) arg;
564     struct device *dev = &nvdimm_bus->dev;
565     - struct nd_cmd_pkg pkg;
566     + void __user *p = (void __user *) arg;
567     const char *cmd_name, *dimm_name;
568     + u32 in_len = 0, out_len = 0;
569     + unsigned int func = cmd;
570     unsigned long cmd_mask;
571     - void *buf;
572     + struct nd_cmd_pkg pkg;
573     int rc, i, cmd_rc;
574     + u64 buf_len = 0;
575     + void *buf;
576    
577     if (nvdimm) {
578     desc = nd_cmd_dimm_desc(cmd);
579     @@ -977,7 +978,7 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
580    
581     if (cmd == ND_CMD_CALL) {
582     func = pkg.nd_command;
583     - dev_dbg(dev, "%s:%s, idx: %llu, in: %zu, out: %zu, len %zu\n",
584     + dev_dbg(dev, "%s:%s, idx: %llu, in: %u, out: %u, len %llu\n",
585     __func__, dimm_name, pkg.nd_command,
586     in_len, out_len, buf_len);
587    
588     @@ -1007,9 +1008,9 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
589     out_len += out_size;
590     }
591    
592     - buf_len = out_len + in_len;
593     + buf_len = (u64) out_len + (u64) in_len;
594     if (buf_len > ND_IOCTL_MAX_BUFLEN) {
595     - dev_dbg(dev, "%s:%s cmd: %s buf_len: %zu > %d\n", __func__,
596     + dev_dbg(dev, "%s:%s cmd: %s buf_len: %llu > %d\n", __func__,
597     dimm_name, cmd_name, buf_len,
598     ND_IOCTL_MAX_BUFLEN);
599     return -EINVAL;
600     diff --git a/drivers/thunderbolt/switch.c b/drivers/thunderbolt/switch.c
601     index e9391bbd4036..53f40c57df59 100644
602     --- a/drivers/thunderbolt/switch.c
603     +++ b/drivers/thunderbolt/switch.c
604     @@ -807,11 +807,11 @@ static ssize_t key_store(struct device *dev, struct device_attribute *attr,
605     struct tb_switch *sw = tb_to_switch(dev);
606     u8 key[TB_SWITCH_KEY_SIZE];
607     ssize_t ret = count;
608     + bool clear = false;
609    
610     - if (count < 64)
611     - return -EINVAL;
612     -
613     - if (hex2bin(key, buf, sizeof(key)))
614     + if (!strcmp(buf, "\n"))
615     + clear = true;
616     + else if (hex2bin(key, buf, sizeof(key)))
617     return -EINVAL;
618    
619     if (mutex_lock_interruptible(&switch_lock))
620     @@ -821,15 +821,19 @@ static ssize_t key_store(struct device *dev, struct device_attribute *attr,
621     ret = -EBUSY;
622     } else {
623     kfree(sw->key);
624     - sw->key = kmemdup(key, sizeof(key), GFP_KERNEL);
625     - if (!sw->key)
626     - ret = -ENOMEM;
627     + if (clear) {
628     + sw->key = NULL;
629     + } else {
630     + sw->key = kmemdup(key, sizeof(key), GFP_KERNEL);
631     + if (!sw->key)
632     + ret = -ENOMEM;
633     + }
634     }
635    
636     mutex_unlock(&switch_lock);
637     return ret;
638     }
639     -static DEVICE_ATTR_RW(key);
640     +static DEVICE_ATTR(key, 0600, key_show, key_store);
641    
642     static ssize_t nvm_authenticate_show(struct device *dev,
643     struct device_attribute *attr, char *buf)
644     diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
645     index 06d044862e58..1c75572f5a3f 100644
646     --- a/drivers/vhost/net.c
647     +++ b/drivers/vhost/net.c
648     @@ -634,8 +634,13 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
649    
650     preempt_enable();
651    
652     - if (vhost_enable_notify(&net->dev, vq))
653     + if (!vhost_vq_avail_empty(&net->dev, vq))
654     vhost_poll_queue(&vq->poll);
655     + else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
656     + vhost_disable_notify(&net->dev, vq);
657     + vhost_poll_queue(&vq->poll);
658     + }
659     +
660     mutex_unlock(&vq->mutex);
661    
662     len = peek_head_len(rvq, sk);
663     diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
664     index 907d6b7dde6a..86d813a3f5d1 100644
665     --- a/fs/f2fs/recovery.c
666     +++ b/fs/f2fs/recovery.c
667     @@ -291,7 +291,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
668     return 0;
669    
670     /* Get the previous summary */
671     - for (i = CURSEG_WARM_DATA; i <= CURSEG_COLD_DATA; i++) {
672     + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
673     struct curseg_info *curseg = CURSEG_I(sbi, i);
674     if (curseg->segno == segno) {
675     sum = curseg->sum_blk->entries[blkoff];
676     @@ -599,8 +599,6 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
677     }
678    
679     clear_sbi_flag(sbi, SBI_POR_DOING);
680     - if (err)
681     - set_ckpt_flags(sbi, CP_ERROR_FLAG);
682     mutex_unlock(&sbi->cp_mutex);
683    
684     /* let's drop all the directory inodes for clean checkpoint */
685     diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
686     index c16d00e53264..13c65dd2d37d 100644
687     --- a/fs/fuse/dev.c
688     +++ b/fs/fuse/dev.c
689     @@ -1222,9 +1222,6 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
690     struct fuse_in *in;
691     unsigned reqsize;
692    
693     - if (task_active_pid_ns(current) != fc->pid_ns)
694     - return -EIO;
695     -
696     restart:
697     spin_lock(&fiq->waitq.lock);
698     err = -EAGAIN;
699     @@ -1262,6 +1259,13 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
700    
701     in = &req->in;
702     reqsize = in->h.len;
703     +
704     + if (task_active_pid_ns(current) != fc->pid_ns) {
705     + rcu_read_lock();
706     + in->h.pid = pid_vnr(find_pid_ns(in->h.pid, fc->pid_ns));
707     + rcu_read_unlock();
708     + }
709     +
710     /* If request is too large, reply with an error and restart the read */
711     if (nbytes < reqsize) {
712     req->out.h.error = -EIO;
713     @@ -1823,9 +1827,6 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
714     struct fuse_req *req;
715     struct fuse_out_header oh;
716    
717     - if (task_active_pid_ns(current) != fc->pid_ns)
718     - return -EIO;
719     -
720     if (nbytes < sizeof(struct fuse_out_header))
721     return -EINVAL;
722    
723     diff --git a/fs/fuse/file.c b/fs/fuse/file.c
724     index ab60051be6e5..6d8e65cec01a 100644
725     --- a/fs/fuse/file.c
726     +++ b/fs/fuse/file.c
727     @@ -2181,9 +2181,6 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
728     if ((fl->fl_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX)
729     return 0;
730    
731     - if (pid && pid_nr == 0)
732     - return -EOVERFLOW;
733     -
734     fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg);
735     err = fuse_simple_request(fc, &args);
736    
737     diff --git a/fs/inode.c b/fs/inode.c
738     index 50370599e371..6a1626e0edaf 100644
739     --- a/fs/inode.c
740     +++ b/fs/inode.c
741     @@ -637,6 +637,7 @@ void evict_inodes(struct super_block *sb)
742    
743     dispose_list(&dispose);
744     }
745     +EXPORT_SYMBOL_GPL(evict_inodes);
746    
747     /**
748     * invalidate_inodes - attempt to free all inodes on a superblock
749     diff --git a/fs/internal.h b/fs/internal.h
750     index 9676fe11c093..fedfe94d84ba 100644
751     --- a/fs/internal.h
752     +++ b/fs/internal.h
753     @@ -132,7 +132,6 @@ static inline bool atime_needs_update_rcu(const struct path *path,
754     extern void inode_io_list_del(struct inode *inode);
755    
756     extern long get_nr_dirty_inodes(void);
757     -extern void evict_inodes(struct super_block *);
758     extern int invalidate_inodes(struct super_block *, bool);
759    
760     /*
761     diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
762     index 5bc71642b226..ef55c926463c 100644
763     --- a/fs/overlayfs/inode.c
764     +++ b/fs/overlayfs/inode.c
765     @@ -576,10 +576,13 @@ static int ovl_inode_set(struct inode *inode, void *data)
766     static bool ovl_verify_inode(struct inode *inode, struct dentry *lowerdentry,
767     struct dentry *upperdentry)
768     {
769     - struct inode *lowerinode = lowerdentry ? d_inode(lowerdentry) : NULL;
770     -
771     - /* Lower (origin) inode must match, even if NULL */
772     - if (ovl_inode_lower(inode) != lowerinode)
773     + /*
774     + * Allow non-NULL lower inode in ovl_inode even if lowerdentry is NULL.
775     + * This happens when finding a copied up overlay inode for a renamed
776     + * or hardlinked overlay dentry and lower dentry cannot be followed
777     + * by origin because lower fs does not support file handles.
778     + */
779     + if (lowerdentry && ovl_inode_lower(inode) != d_inode(lowerdentry))
780     return false;
781    
782     /*
783     diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
784     index c09c16b1ad3b..6f2a5baded76 100644
785     --- a/fs/xfs/libxfs/xfs_bmap.c
786     +++ b/fs/xfs/libxfs/xfs_bmap.c
787     @@ -579,7 +579,7 @@ xfs_bmap_validate_ret(
788    
789     #else
790     #define xfs_bmap_check_leaf_extents(cur, ip, whichfork) do { } while (0)
791     -#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap)
792     +#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) do { } while (0)
793     #endif /* DEBUG */
794    
795     /*
796     diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
797     index 85de22513014..a6331ffa51e3 100644
798     --- a/fs/xfs/libxfs/xfs_bmap_btree.c
799     +++ b/fs/xfs/libxfs/xfs_bmap_btree.c
800     @@ -858,6 +858,7 @@ xfs_bmbt_change_owner(
801     cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork);
802     if (!cur)
803     return -ENOMEM;
804     + cur->bc_private.b.flags |= XFS_BTCUR_BPRV_INVALID_OWNER;
805    
806     error = xfs_btree_change_owner(cur, new_owner, buffer_list);
807     xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
808     diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
809     index e0bcc4a59efd..5bfb88261c7e 100644
810     --- a/fs/xfs/libxfs/xfs_btree.c
811     +++ b/fs/xfs/libxfs/xfs_btree.c
812     @@ -1791,6 +1791,7 @@ xfs_btree_lookup_get_block(
813    
814     /* Check the inode owner since the verifiers don't. */
815     if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) &&
816     + !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_INVALID_OWNER) &&
817     (cur->bc_flags & XFS_BTREE_LONG_PTRS) &&
818     be64_to_cpu((*blkp)->bb_u.l.bb_owner) !=
819     cur->bc_private.b.ip->i_ino)
820     @@ -4451,10 +4452,15 @@ xfs_btree_block_change_owner(
821    
822     /* modify the owner */
823     block = xfs_btree_get_block(cur, level, &bp);
824     - if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
825     + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
826     + if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner))
827     + return 0;
828     block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner);
829     - else
830     + } else {
831     + if (block->bb_u.s.bb_owner == cpu_to_be32(bbcoi->new_owner))
832     + return 0;
833     block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner);
834     + }
835    
836     /*
837     * If the block is a root block hosted in an inode, we might not have a
838     @@ -4463,16 +4469,19 @@ xfs_btree_block_change_owner(
839     * block is formatted into the on-disk inode fork. We still change it,
840     * though, so everything is consistent in memory.
841     */
842     - if (bp) {
843     - if (cur->bc_tp) {
844     - xfs_trans_ordered_buf(cur->bc_tp, bp);
845     + if (!bp) {
846     + ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
847     + ASSERT(level == cur->bc_nlevels - 1);
848     + return 0;
849     + }
850     +
851     + if (cur->bc_tp) {
852     + if (!xfs_trans_ordered_buf(cur->bc_tp, bp)) {
853     xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
854     - } else {
855     - xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
856     + return -EAGAIN;
857     }
858     } else {
859     - ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
860     - ASSERT(level == cur->bc_nlevels - 1);
861     + xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
862     }
863    
864     return 0;
865     diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
866     index 9c95e965cfe5..f2a88c3b1159 100644
867     --- a/fs/xfs/libxfs/xfs_btree.h
868     +++ b/fs/xfs/libxfs/xfs_btree.h
869     @@ -233,7 +233,8 @@ typedef struct xfs_btree_cur
870     short forksize; /* fork's inode space */
871     char whichfork; /* data or attr fork */
872     char flags; /* flags */
873     -#define XFS_BTCUR_BPRV_WASDEL 1 /* was delayed */
874     +#define XFS_BTCUR_BPRV_WASDEL (1<<0) /* was delayed */
875     +#define XFS_BTCUR_BPRV_INVALID_OWNER (1<<1) /* for ext swap */
876     } b;
877     } bc_private; /* per-btree type data */
878     } xfs_btree_cur_t;
879     diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
880     index abf5beaae907..988bb3f31446 100644
881     --- a/fs/xfs/libxfs/xfs_ialloc.c
882     +++ b/fs/xfs/libxfs/xfs_ialloc.c
883     @@ -378,8 +378,6 @@ xfs_ialloc_inode_init(
884     * transaction and pin the log appropriately.
885     */
886     xfs_trans_ordered_buf(tp, fbuf);
887     - xfs_trans_log_buf(tp, fbuf, 0,
888     - BBTOB(fbuf->b_length) - 1);
889     }
890     } else {
891     fbuf->b_flags |= XBF_DONE;
892     @@ -1133,6 +1131,7 @@ xfs_dialloc_ag_inobt(
893     int error;
894     int offset;
895     int i, j;
896     + int searchdistance = 10;
897    
898     pag = xfs_perag_get(mp, agno);
899    
900     @@ -1159,7 +1158,6 @@ xfs_dialloc_ag_inobt(
901     if (pagno == agno) {
902     int doneleft; /* done, to the left */
903     int doneright; /* done, to the right */
904     - int searchdistance = 10;
905    
906     error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
907     if (error)
908     @@ -1220,21 +1218,9 @@ xfs_dialloc_ag_inobt(
909     /*
910     * Loop until we find an inode chunk with a free inode.
911     */
912     - while (!doneleft || !doneright) {
913     + while (--searchdistance > 0 && (!doneleft || !doneright)) {
914     int useleft; /* using left inode chunk this time */
915    
916     - if (!--searchdistance) {
917     - /*
918     - * Not in range - save last search
919     - * location and allocate a new inode
920     - */
921     - xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
922     - pag->pagl_leftrec = trec.ir_startino;
923     - pag->pagl_rightrec = rec.ir_startino;
924     - pag->pagl_pagino = pagino;
925     - goto newino;
926     - }
927     -
928     /* figure out the closer block if both are valid. */
929     if (!doneleft && !doneright) {
930     useleft = pagino -
931     @@ -1278,26 +1264,37 @@ xfs_dialloc_ag_inobt(
932     goto error1;
933     }
934    
935     - /*
936     - * We've reached the end of the btree. because
937     - * we are only searching a small chunk of the
938     - * btree each search, there is obviously free
939     - * inodes closer to the parent inode than we
940     - * are now. restart the search again.
941     - */
942     - pag->pagl_pagino = NULLAGINO;
943     - pag->pagl_leftrec = NULLAGINO;
944     - pag->pagl_rightrec = NULLAGINO;
945     - xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
946     - xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
947     - goto restart_pagno;
948     + if (searchdistance <= 0) {
949     + /*
950     + * Not in range - save last search
951     + * location and allocate a new inode
952     + */
953     + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
954     + pag->pagl_leftrec = trec.ir_startino;
955     + pag->pagl_rightrec = rec.ir_startino;
956     + pag->pagl_pagino = pagino;
957     +
958     + } else {
959     + /*
960     + * We've reached the end of the btree. because
961     + * we are only searching a small chunk of the
962     + * btree each search, there is obviously free
963     + * inodes closer to the parent inode than we
964     + * are now. restart the search again.
965     + */
966     + pag->pagl_pagino = NULLAGINO;
967     + pag->pagl_leftrec = NULLAGINO;
968     + pag->pagl_rightrec = NULLAGINO;
969     + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
970     + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
971     + goto restart_pagno;
972     + }
973     }
974    
975     /*
976     * In a different AG from the parent.
977     * See if the most recently allocated block has any free.
978     */
979     -newino:
980     if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
981     error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
982     XFS_LOOKUP_EQ, &i);
983     diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
984     index 0e80f34fe97c..5eb165555934 100644
985     --- a/fs/xfs/libxfs/xfs_inode_fork.c
986     +++ b/fs/xfs/libxfs/xfs_inode_fork.c
987     @@ -1499,14 +1499,11 @@ xfs_iext_realloc_indirect(
988     xfs_ifork_t *ifp, /* inode fork pointer */
989     int new_size) /* new indirection array size */
990     {
991     - int nlists; /* number of irec's (ex lists) */
992     - int size; /* current indirection array size */
993     -
994     ASSERT(ifp->if_flags & XFS_IFEXTIREC);
995     - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
996     - size = nlists * sizeof(xfs_ext_irec_t);
997     ASSERT(ifp->if_real_bytes);
998     - ASSERT((new_size >= 0) && (new_size != size));
999     + ASSERT((new_size >= 0) &&
1000     + (new_size != ((ifp->if_real_bytes / XFS_IEXT_BUFSZ) *
1001     + sizeof(xfs_ext_irec_t))));
1002     if (new_size == 0) {
1003     xfs_iext_destroy(ifp);
1004     } else {
1005     diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
1006     index 6bf120bb1a17..f9efd67f6fa1 100644
1007     --- a/fs/xfs/xfs_aops.c
1008     +++ b/fs/xfs/xfs_aops.c
1009     @@ -85,11 +85,11 @@ xfs_find_bdev_for_inode(
1010     * associated buffer_heads, paying attention to the start and end offsets that
1011     * we need to process on the page.
1012     *
1013     - * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last
1014     - * buffer in the IO. Once it does this, it is unsafe to access the bufferhead or
1015     - * the page at all, as we may be racing with memory reclaim and it can free both
1016     - * the bufferhead chain and the page as it will see the page as clean and
1017     - * unused.
1018     + * Note that we open code the action in end_buffer_async_write here so that we
1019     + * only have to iterate over the buffers attached to the page once. This is not
1020     + * only more efficient, but also ensures that we only calls end_page_writeback
1021     + * at the end of the iteration, and thus avoids the pitfall of having the page
1022     + * and buffers potentially freed after every call to end_buffer_async_write.
1023     */
1024     static void
1025     xfs_finish_page_writeback(
1026     @@ -97,29 +97,44 @@ xfs_finish_page_writeback(
1027     struct bio_vec *bvec,
1028     int error)
1029     {
1030     - unsigned int end = bvec->bv_offset + bvec->bv_len - 1;
1031     - struct buffer_head *head, *bh, *next;
1032     + struct buffer_head *head = page_buffers(bvec->bv_page), *bh = head;
1033     + bool busy = false;
1034     unsigned int off = 0;
1035     - unsigned int bsize;
1036     + unsigned long flags;
1037    
1038     ASSERT(bvec->bv_offset < PAGE_SIZE);
1039     ASSERT((bvec->bv_offset & (i_blocksize(inode) - 1)) == 0);
1040     - ASSERT(end < PAGE_SIZE);
1041     + ASSERT(bvec->bv_offset + bvec->bv_len <= PAGE_SIZE);
1042     ASSERT((bvec->bv_len & (i_blocksize(inode) - 1)) == 0);
1043    
1044     - bh = head = page_buffers(bvec->bv_page);
1045     -
1046     - bsize = bh->b_size;
1047     + local_irq_save(flags);
1048     + bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
1049     do {
1050     - if (off > end)
1051     - break;
1052     - next = bh->b_this_page;
1053     - if (off < bvec->bv_offset)
1054     - goto next_bh;
1055     - bh->b_end_io(bh, !error);
1056     -next_bh:
1057     - off += bsize;
1058     - } while ((bh = next) != head);
1059     + if (off >= bvec->bv_offset &&
1060     + off < bvec->bv_offset + bvec->bv_len) {
1061     + ASSERT(buffer_async_write(bh));
1062     + ASSERT(bh->b_end_io == NULL);
1063     +
1064     + if (error) {
1065     + mark_buffer_write_io_error(bh);
1066     + clear_buffer_uptodate(bh);
1067     + SetPageError(bvec->bv_page);
1068     + } else {
1069     + set_buffer_uptodate(bh);
1070     + }
1071     + clear_buffer_async_write(bh);
1072     + unlock_buffer(bh);
1073     + } else if (buffer_async_write(bh)) {
1074     + ASSERT(buffer_locked(bh));
1075     + busy = true;
1076     + }
1077     + off += bh->b_size;
1078     + } while ((bh = bh->b_this_page) != head);
1079     + bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
1080     + local_irq_restore(flags);
1081     +
1082     + if (!busy)
1083     + end_page_writeback(bvec->bv_page);
1084     }
1085    
1086     /*
1087     @@ -133,8 +148,10 @@ xfs_destroy_ioend(
1088     int error)
1089     {
1090     struct inode *inode = ioend->io_inode;
1091     - struct bio *last = ioend->io_bio;
1092     - struct bio *bio, *next;
1093     + struct bio *bio = &ioend->io_inline_bio;
1094     + struct bio *last = ioend->io_bio, *next;
1095     + u64 start = bio->bi_iter.bi_sector;
1096     + bool quiet = bio_flagged(bio, BIO_QUIET);
1097    
1098     for (bio = &ioend->io_inline_bio; bio; bio = next) {
1099     struct bio_vec *bvec;
1100     @@ -155,6 +172,11 @@ xfs_destroy_ioend(
1101    
1102     bio_put(bio);
1103     }
1104     +
1105     + if (unlikely(error && !quiet)) {
1106     + xfs_err_ratelimited(XFS_I(inode)->i_mount,
1107     + "writeback error on sector %llu", start);
1108     + }
1109     }
1110    
1111     /*
1112     @@ -423,7 +445,8 @@ xfs_start_buffer_writeback(
1113     ASSERT(!buffer_delay(bh));
1114     ASSERT(!buffer_unwritten(bh));
1115    
1116     - mark_buffer_async_write(bh);
1117     + bh->b_end_io = NULL;
1118     + set_buffer_async_write(bh);
1119     set_buffer_uptodate(bh);
1120     clear_buffer_dirty(bh);
1121     }
1122     diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
1123     index 93e955262d07..3e9b7a4fb8fd 100644
1124     --- a/fs/xfs/xfs_bmap_util.c
1125     +++ b/fs/xfs/xfs_bmap_util.c
1126     @@ -1840,29 +1840,18 @@ xfs_swap_extent_forks(
1127     }
1128    
1129     /*
1130     - * Before we've swapped the forks, lets set the owners of the forks
1131     - * appropriately. We have to do this as we are demand paging the btree
1132     - * buffers, and so the validation done on read will expect the owner
1133     - * field to be correctly set. Once we change the owners, we can swap the
1134     - * inode forks.
1135     + * Btree format (v3) inodes have the inode number stamped in the bmbt
1136     + * block headers. We can't start changing the bmbt blocks until the
1137     + * inode owner change is logged so recovery does the right thing in the
1138     + * event of a crash. Set the owner change log flags now and leave the
1139     + * bmbt scan as the last step.
1140     */
1141     if (ip->i_d.di_version == 3 &&
1142     - ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
1143     + ip->i_d.di_format == XFS_DINODE_FMT_BTREE)
1144     (*target_log_flags) |= XFS_ILOG_DOWNER;
1145     - error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK,
1146     - tip->i_ino, NULL);
1147     - if (error)
1148     - return error;
1149     - }
1150     -
1151     if (tip->i_d.di_version == 3 &&
1152     - tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
1153     + tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
1154     (*src_log_flags) |= XFS_ILOG_DOWNER;
1155     - error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK,
1156     - ip->i_ino, NULL);
1157     - if (error)
1158     - return error;
1159     - }
1160    
1161     /*
1162     * Swap the data forks of the inodes
1163     @@ -1940,6 +1929,48 @@ xfs_swap_extent_forks(
1164     return 0;
1165     }
1166    
1167     +/*
1168     + * Fix up the owners of the bmbt blocks to refer to the current inode. The
1169     + * change owner scan attempts to order all modified buffers in the current
1170     + * transaction. In the event of ordered buffer failure, the offending buffer is
1171     + * physically logged as a fallback and the scan returns -EAGAIN. We must roll
1172     + * the transaction in this case to replenish the fallback log reservation and
1173     + * restart the scan. This process repeats until the scan completes.
1174     + */
1175     +static int
1176     +xfs_swap_change_owner(
1177     + struct xfs_trans **tpp,
1178     + struct xfs_inode *ip,
1179     + struct xfs_inode *tmpip)
1180     +{
1181     + int error;
1182     + struct xfs_trans *tp = *tpp;
1183     +
1184     + do {
1185     + error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, ip->i_ino,
1186     + NULL);
1187     + /* success or fatal error */
1188     + if (error != -EAGAIN)
1189     + break;
1190     +
1191     + error = xfs_trans_roll(tpp, NULL);
1192     + if (error)
1193     + break;
1194     + tp = *tpp;
1195     +
1196     + /*
1197     + * Redirty both inodes so they can relog and keep the log tail
1198     + * moving forward.
1199     + */
1200     + xfs_trans_ijoin(tp, ip, 0);
1201     + xfs_trans_ijoin(tp, tmpip, 0);
1202     + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1203     + xfs_trans_log_inode(tp, tmpip, XFS_ILOG_CORE);
1204     + } while (true);
1205     +
1206     + return error;
1207     +}
1208     +
1209     int
1210     xfs_swap_extents(
1211     struct xfs_inode *ip, /* target inode */
1212     @@ -1954,7 +1985,7 @@ xfs_swap_extents(
1213     int lock_flags;
1214     struct xfs_ifork *cowfp;
1215     uint64_t f;
1216     - int resblks;
1217     + int resblks = 0;
1218    
1219     /*
1220     * Lock the inodes against other IO, page faults and truncate to
1221     @@ -2002,11 +2033,8 @@ xfs_swap_extents(
1222     XFS_SWAP_RMAP_SPACE_RES(mp,
1223     XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK),
1224     XFS_DATA_FORK);
1225     - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks,
1226     - 0, 0, &tp);
1227     - } else
1228     - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0,
1229     - 0, 0, &tp);
1230     + }
1231     + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
1232     if (error)
1233     goto out_unlock;
1234    
1235     @@ -2091,6 +2119,23 @@ xfs_swap_extents(
1236     xfs_trans_log_inode(tp, ip, src_log_flags);
1237     xfs_trans_log_inode(tp, tip, target_log_flags);
1238    
1239     + /*
1240     + * The extent forks have been swapped, but crc=1,rmapbt=0 filesystems
1241     + * have inode number owner values in the bmbt blocks that still refer to
1242     + * the old inode. Scan each bmbt to fix up the owner values with the
1243     + * inode number of the current inode.
1244     + */
1245     + if (src_log_flags & XFS_ILOG_DOWNER) {
1246     + error = xfs_swap_change_owner(&tp, ip, tip);
1247     + if (error)
1248     + goto out_trans_cancel;
1249     + }
1250     + if (target_log_flags & XFS_ILOG_DOWNER) {
1251     + error = xfs_swap_change_owner(&tp, tip, ip);
1252     + if (error)
1253     + goto out_trans_cancel;
1254     + }
1255     +
1256     /*
1257     * If this is a synchronous mount, make sure that the
1258     * transaction goes to disk before returning to the user.
1259     diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
1260     index f6a8422e9562..e0a0af0946f2 100644
1261     --- a/fs/xfs/xfs_buf_item.c
1262     +++ b/fs/xfs/xfs_buf_item.c
1263     @@ -29,6 +29,7 @@
1264     #include "xfs_error.h"
1265     #include "xfs_trace.h"
1266     #include "xfs_log.h"
1267     +#include "xfs_inode.h"
1268    
1269    
1270     kmem_zone_t *xfs_buf_item_zone;
1271     @@ -322,6 +323,8 @@ xfs_buf_item_format(
1272     ASSERT((bip->bli_flags & XFS_BLI_STALE) ||
1273     (xfs_blft_from_flags(&bip->__bli_format) > XFS_BLFT_UNKNOWN_BUF
1274     && xfs_blft_from_flags(&bip->__bli_format) < XFS_BLFT_MAX_BUF));
1275     + ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED) ||
1276     + (bip->bli_flags & XFS_BLI_STALE));
1277    
1278    
1279     /*
1280     @@ -346,16 +349,6 @@ xfs_buf_item_format(
1281     bip->bli_flags &= ~XFS_BLI_INODE_BUF;
1282     }
1283    
1284     - if ((bip->bli_flags & (XFS_BLI_ORDERED|XFS_BLI_STALE)) ==
1285     - XFS_BLI_ORDERED) {
1286     - /*
1287     - * The buffer has been logged just to order it. It is not being
1288     - * included in the transaction commit, so don't format it.
1289     - */
1290     - trace_xfs_buf_item_format_ordered(bip);
1291     - return;
1292     - }
1293     -
1294     for (i = 0; i < bip->bli_format_count; i++) {
1295     xfs_buf_item_format_segment(bip, lv, &vecp, offset,
1296     &bip->bli_formats[i]);
1297     @@ -574,26 +567,20 @@ xfs_buf_item_unlock(
1298     {
1299     struct xfs_buf_log_item *bip = BUF_ITEM(lip);
1300     struct xfs_buf *bp = bip->bli_buf;
1301     - bool clean;
1302     - bool aborted;
1303     - int flags;
1304     + bool aborted = !!(lip->li_flags & XFS_LI_ABORTED);
1305     + bool hold = !!(bip->bli_flags & XFS_BLI_HOLD);
1306     + bool dirty = !!(bip->bli_flags & XFS_BLI_DIRTY);
1307     +#if defined(DEBUG) || defined(XFS_WARN)
1308     + bool ordered = !!(bip->bli_flags & XFS_BLI_ORDERED);
1309     +#endif
1310    
1311     /* Clear the buffer's association with this transaction. */
1312     bp->b_transp = NULL;
1313    
1314     /*
1315     - * If this is a transaction abort, don't return early. Instead, allow
1316     - * the brelse to happen. Normally it would be done for stale
1317     - * (cancelled) buffers at unpin time, but we'll never go through the
1318     - * pin/unpin cycle if we abort inside commit.
1319     + * The per-transaction state has been copied above so clear it from the
1320     + * bli.
1321     */
1322     - aborted = (lip->li_flags & XFS_LI_ABORTED) ? true : false;
1323     - /*
1324     - * Before possibly freeing the buf item, copy the per-transaction state
1325     - * so we can reference it safely later after clearing it from the
1326     - * buffer log item.
1327     - */
1328     - flags = bip->bli_flags;
1329     bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED);
1330    
1331     /*
1332     @@ -601,7 +588,7 @@ xfs_buf_item_unlock(
1333     * unlock the buffer and free the buf item when the buffer is unpinned
1334     * for the last time.
1335     */
1336     - if (flags & XFS_BLI_STALE) {
1337     + if (bip->bli_flags & XFS_BLI_STALE) {
1338     trace_xfs_buf_item_unlock_stale(bip);
1339     ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
1340     if (!aborted) {
1341     @@ -619,20 +606,11 @@ xfs_buf_item_unlock(
1342     * regardless of whether it is dirty or not. A dirty abort implies a
1343     * shutdown, anyway.
1344     *
1345     - * Ordered buffers are dirty but may have no recorded changes, so ensure
1346     - * we only release clean items here.
1347     + * The bli dirty state should match whether the blf has logged segments
1348     + * except for ordered buffers, where only the bli should be dirty.
1349     */
1350     - clean = (flags & XFS_BLI_DIRTY) ? false : true;
1351     - if (clean) {
1352     - int i;
1353     - for (i = 0; i < bip->bli_format_count; i++) {
1354     - if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
1355     - bip->bli_formats[i].blf_map_size)) {
1356     - clean = false;
1357     - break;
1358     - }
1359     - }
1360     - }
1361     + ASSERT((!ordered && dirty == xfs_buf_item_dirty_format(bip)) ||
1362     + (ordered && dirty && !xfs_buf_item_dirty_format(bip)));
1363    
1364     /*
1365     * Clean buffers, by definition, cannot be in the AIL. However, aborted
1366     @@ -651,11 +629,11 @@ xfs_buf_item_unlock(
1367     ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp));
1368     xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR);
1369     xfs_buf_item_relse(bp);
1370     - } else if (clean)
1371     + } else if (!dirty)
1372     xfs_buf_item_relse(bp);
1373     }
1374    
1375     - if (!(flags & XFS_BLI_HOLD))
1376     + if (!hold)
1377     xfs_buf_relse(bp);
1378     }
1379    
1380     @@ -945,14 +923,22 @@ xfs_buf_item_log(
1381    
1382    
1383     /*
1384     - * Return 1 if the buffer has been logged or ordered in a transaction (at any
1385     - * point, not just the current transaction) and 0 if not.
1386     + * Return true if the buffer has any ranges logged/dirtied by a transaction,
1387     + * false otherwise.
1388     */
1389     -uint
1390     -xfs_buf_item_dirty(
1391     - xfs_buf_log_item_t *bip)
1392     +bool
1393     +xfs_buf_item_dirty_format(
1394     + struct xfs_buf_log_item *bip)
1395     {
1396     - return (bip->bli_flags & XFS_BLI_DIRTY);
1397     + int i;
1398     +
1399     + for (i = 0; i < bip->bli_format_count; i++) {
1400     + if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
1401     + bip->bli_formats[i].blf_map_size))
1402     + return true;
1403     + }
1404     +
1405     + return false;
1406     }
1407    
1408     STATIC void
1409     @@ -1054,6 +1040,31 @@ xfs_buf_do_callbacks(
1410     }
1411     }
1412    
1413     +/*
1414     + * Invoke the error state callback for each log item affected by the failed I/O.
1415     + *
1416     + * If a metadata buffer write fails with a non-permanent error, the buffer is
1417     + * eventually resubmitted and so the completion callbacks are not run. The error
1418     + * state may need to be propagated to the log items attached to the buffer,
1419     + * however, so the next AIL push of the item knows hot to handle it correctly.
1420     + */
1421     +STATIC void
1422     +xfs_buf_do_callbacks_fail(
1423     + struct xfs_buf *bp)
1424     +{
1425     + struct xfs_log_item *next;
1426     + struct xfs_log_item *lip = bp->b_fspriv;
1427     + struct xfs_ail *ailp = lip->li_ailp;
1428     +
1429     + spin_lock(&ailp->xa_lock);
1430     + for (; lip; lip = next) {
1431     + next = lip->li_bio_list;
1432     + if (lip->li_ops->iop_error)
1433     + lip->li_ops->iop_error(lip, bp);
1434     + }
1435     + spin_unlock(&ailp->xa_lock);
1436     +}
1437     +
1438     static bool
1439     xfs_buf_iodone_callback_error(
1440     struct xfs_buf *bp)
1441     @@ -1123,7 +1134,11 @@ xfs_buf_iodone_callback_error(
1442     if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount)
1443     goto permanent_error;
1444    
1445     - /* still a transient error, higher layers will retry */
1446     + /*
1447     + * Still a transient error, run IO completion failure callbacks and let
1448     + * the higher layers retry the buffer.
1449     + */
1450     + xfs_buf_do_callbacks_fail(bp);
1451     xfs_buf_ioerror(bp, 0);
1452     xfs_buf_relse(bp);
1453     return true;
1454     @@ -1204,3 +1219,31 @@ xfs_buf_iodone(
1455     xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE);
1456     xfs_buf_item_free(BUF_ITEM(lip));
1457     }
1458     +
1459     +/*
1460     + * Requeue a failed buffer for writeback
1461     + *
1462     + * Return true if the buffer has been re-queued properly, false otherwise
1463     + */
1464     +bool
1465     +xfs_buf_resubmit_failed_buffers(
1466     + struct xfs_buf *bp,
1467     + struct xfs_log_item *lip,
1468     + struct list_head *buffer_list)
1469     +{
1470     + struct xfs_log_item *next;
1471     +
1472     + /*
1473     + * Clear XFS_LI_FAILED flag from all items before resubmit
1474     + *
1475     + * XFS_LI_FAILED set/clear is protected by xa_lock; the caller of this
1476     + * function must already have it acquired
1477     + */
1478     + for (; lip; lip = next) {
1479     + next = lip->li_bio_list;
1480     + xfs_clear_li_failed(lip);
1481     + }
1482     +
1483     + /* Add this buffer back to the delayed write list */
1484     + return xfs_buf_delwri_queue(bp, buffer_list);
1485     +}
1486     diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
1487     index f7eba99d19dd..9690ce62c9a7 100644
1488     --- a/fs/xfs/xfs_buf_item.h
1489     +++ b/fs/xfs/xfs_buf_item.h
1490     @@ -64,12 +64,15 @@ typedef struct xfs_buf_log_item {
1491     int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
1492     void xfs_buf_item_relse(struct xfs_buf *);
1493     void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint);
1494     -uint xfs_buf_item_dirty(xfs_buf_log_item_t *);
1495     +bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *);
1496     void xfs_buf_attach_iodone(struct xfs_buf *,
1497     void(*)(struct xfs_buf *, xfs_log_item_t *),
1498     xfs_log_item_t *);
1499     void xfs_buf_iodone_callbacks(struct xfs_buf *);
1500     void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *);
1501     +bool xfs_buf_resubmit_failed_buffers(struct xfs_buf *,
1502     + struct xfs_log_item *,
1503     + struct list_head *);
1504    
1505     extern kmem_zone_t *xfs_buf_item_zone;
1506    
1507     diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
1508     index 0a9e6985a0d0..34227115a5d6 100644
1509     --- a/fs/xfs/xfs_icache.c
1510     +++ b/fs/xfs/xfs_icache.c
1511     @@ -1124,11 +1124,11 @@ xfs_reclaim_inode(
1512     * Because we use RCU freeing we need to ensure the inode always appears
1513     * to be reclaimed with an invalid inode number when in the free state.
1514     * We do this as early as possible under the ILOCK so that
1515     - * xfs_iflush_cluster() can be guaranteed to detect races with us here.
1516     - * By doing this, we guarantee that once xfs_iflush_cluster has locked
1517     - * XFS_ILOCK that it will see either a valid, flushable inode that will
1518     - * serialise correctly, or it will see a clean (and invalid) inode that
1519     - * it can skip.
1520     + * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to
1521     + * detect races with us here. By doing this, we guarantee that once
1522     + * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that
1523     + * it will see either a valid inode that will serialise correctly, or it
1524     + * will see an invalid inode that it can skip.
1525     */
1526     spin_lock(&ip->i_flags_lock);
1527     ip->i_flags = XFS_IRECLAIM;
1528     diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
1529     index ff48f0096810..97045e8dfed5 100644
1530     --- a/fs/xfs/xfs_inode.c
1531     +++ b/fs/xfs/xfs_inode.c
1532     @@ -2359,11 +2359,24 @@ xfs_ifree_cluster(
1533     * already marked stale. If we can't lock it, back off
1534     * and retry.
1535     */
1536     - if (ip != free_ip &&
1537     - !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
1538     - rcu_read_unlock();
1539     - delay(1);
1540     - goto retry;
1541     + if (ip != free_ip) {
1542     + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
1543     + rcu_read_unlock();
1544     + delay(1);
1545     + goto retry;
1546     + }
1547     +
1548     + /*
1549     + * Check the inode number again in case we're
1550     + * racing with freeing in xfs_reclaim_inode().
1551     + * See the comments in that function for more
1552     + * information as to why the initial check is
1553     + * not sufficient.
1554     + */
1555     + if (ip->i_ino != inum + i) {
1556     + xfs_iunlock(ip, XFS_ILOCK_EXCL);
1557     + continue;
1558     + }
1559     }
1560     rcu_read_unlock();
1561    
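
The xfs_ifree_cluster() hunk above re-validates the inode number after XFS_ILOCK_EXCL is obtained, because (per the xfs_icache.c comment just above it) reclaim now invalidates i_ino early under that lock, so an inode found by the lookup may already have been freed by the time the lock is taken. The standalone sketch below only illustrates that check/lock/re-check shape with pthreads; struct record, lock_and_validate() and invalidate() are invented names, not part of the patch, and the "race" is forced deterministically for the demo. Build with: cc -pthread recheck.c

/* recheck.c - userspace illustration of the check/lock/re-check pattern */
#include <pthread.h>
#include <stdio.h>

struct record {
        pthread_mutex_t lock;
        unsigned long   id;     /* set to 0 when the record is "freed" */
};

/*
 * Return 1 with the lock held only if the record still carries the id we
 * looked it up under; drop the lock and return 0 if we lost the race.
 */
static int lock_and_validate(struct record *r, unsigned long expected)
{
        pthread_mutex_lock(&r->lock);
        if (r->id != expected) {                /* re-check under the lock */
                pthread_mutex_unlock(&r->lock);
                return 0;
        }
        return 1;
}

static void *invalidate(void *arg)
{
        struct record *r = arg;

        pthread_mutex_lock(&r->lock);
        r->id = 0;                              /* like reclaim resetting i_ino */
        pthread_mutex_unlock(&r->lock);
        return NULL;
}

int main(void)
{
        struct record r = { PTHREAD_MUTEX_INITIALIZER, 42 };
        pthread_t t;

        /* Run the invalidation to completion so the re-check always fires. */
        pthread_create(&t, NULL, invalidate, &r);
        pthread_join(t, NULL);

        if (lock_and_validate(&r, 42)) {
                printf("record 42 still valid, proceed\n");
                pthread_mutex_unlock(&r.lock);
        } else {
                printf("record 42 was invalidated, skip it\n");
        }
        return 0;
}
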
1562     diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
1563     index 013cc78d7daf..6d0f74ec31e8 100644
1564     --- a/fs/xfs/xfs_inode_item.c
1565     +++ b/fs/xfs/xfs_inode_item.c
1566     @@ -27,6 +27,7 @@
1567     #include "xfs_error.h"
1568     #include "xfs_trace.h"
1569     #include "xfs_trans_priv.h"
1570     +#include "xfs_buf_item.h"
1571     #include "xfs_log.h"
1572    
1573    
1574     @@ -475,6 +476,23 @@ xfs_inode_item_unpin(
1575     wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT);
1576     }
1577    
1578     +/*
1579     + * Callback used to mark a buffer with XFS_LI_FAILED when items in the buffer
1580     + * have been failed during writeback
1581     + *
1582     + * This informs the AIL that the inode is already flush locked on the next push,
1583     + * and acquires a hold on the buffer to ensure that it isn't reclaimed before
1584     + * dirty data makes it to disk.
1585     + */
1586     +STATIC void
1587     +xfs_inode_item_error(
1588     + struct xfs_log_item *lip,
1589     + struct xfs_buf *bp)
1590     +{
1591     + ASSERT(xfs_isiflocked(INODE_ITEM(lip)->ili_inode));
1592     + xfs_set_li_failed(lip, bp);
1593     +}
1594     +
1595     STATIC uint
1596     xfs_inode_item_push(
1597     struct xfs_log_item *lip,
1598     @@ -484,13 +502,28 @@ xfs_inode_item_push(
1599     {
1600     struct xfs_inode_log_item *iip = INODE_ITEM(lip);
1601     struct xfs_inode *ip = iip->ili_inode;
1602     - struct xfs_buf *bp = NULL;
1603     + struct xfs_buf *bp = lip->li_buf;
1604     uint rval = XFS_ITEM_SUCCESS;
1605     int error;
1606    
1607     if (xfs_ipincount(ip) > 0)
1608     return XFS_ITEM_PINNED;
1609    
1610     + /*
1611     + * The buffer containing this item failed to be written back
1612     + * previously. Resubmit the buffer for IO.
1613     + */
1614     + if (lip->li_flags & XFS_LI_FAILED) {
1615     + if (!xfs_buf_trylock(bp))
1616     + return XFS_ITEM_LOCKED;
1617     +
1618     + if (!xfs_buf_resubmit_failed_buffers(bp, lip, buffer_list))
1619     + rval = XFS_ITEM_FLUSHING;
1620     +
1621     + xfs_buf_unlock(bp);
1622     + return rval;
1623     + }
1624     +
1625     if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
1626     return XFS_ITEM_LOCKED;
1627    
1628     @@ -622,7 +655,8 @@ static const struct xfs_item_ops xfs_inode_item_ops = {
1629     .iop_unlock = xfs_inode_item_unlock,
1630     .iop_committed = xfs_inode_item_committed,
1631     .iop_push = xfs_inode_item_push,
1632     - .iop_committing = xfs_inode_item_committing
1633     + .iop_committing = xfs_inode_item_committing,
1634     + .iop_error = xfs_inode_item_error
1635     };
1636    
1637    
1638     @@ -710,7 +744,8 @@ xfs_iflush_done(
1639     * the AIL lock.
1640     */
1641     iip = INODE_ITEM(blip);
1642     - if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn)
1643     + if ((iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) ||
1644     + lip->li_flags & XFS_LI_FAILED)
1645     need_ail++;
1646    
1647     blip = next;
1648     @@ -718,7 +753,8 @@ xfs_iflush_done(
1649    
1650     /* make sure we capture the state of the initial inode. */
1651     iip = INODE_ITEM(lip);
1652     - if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn)
1653     + if ((iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) ||
1654     + lip->li_flags & XFS_LI_FAILED)
1655     need_ail++;
1656    
1657     /*
1658     @@ -739,6 +775,9 @@ xfs_iflush_done(
1659     if (INODE_ITEM(blip)->ili_logged &&
1660     blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn)
1661     mlip_changed |= xfs_ail_delete_one(ailp, blip);
1662     + else {
1663     + xfs_clear_li_failed(blip);
1664     + }
1665     }
1666    
1667     if (mlip_changed) {
1668     diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
1669     index 9c0c7a920304..5049e8ab6e30 100644
1670     --- a/fs/xfs/xfs_ioctl.c
1671     +++ b/fs/xfs/xfs_ioctl.c
1672     @@ -931,16 +931,15 @@ xfs_ioc_fsgetxattr(
1673     return 0;
1674     }
1675    
1676     -STATIC void
1677     -xfs_set_diflags(
1678     +STATIC uint16_t
1679     +xfs_flags2diflags(
1680     struct xfs_inode *ip,
1681     unsigned int xflags)
1682     {
1683     - unsigned int di_flags;
1684     - uint64_t di_flags2;
1685     -
1686     /* can't set PREALLOC this way, just preserve it */
1687     - di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
1688     + uint16_t di_flags =
1689     + (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
1690     +
1691     if (xflags & FS_XFLAG_IMMUTABLE)
1692     di_flags |= XFS_DIFLAG_IMMUTABLE;
1693     if (xflags & FS_XFLAG_APPEND)
1694     @@ -970,19 +969,24 @@ xfs_set_diflags(
1695     if (xflags & FS_XFLAG_EXTSIZE)
1696     di_flags |= XFS_DIFLAG_EXTSIZE;
1697     }
1698     - ip->i_d.di_flags = di_flags;
1699    
1700     - /* diflags2 only valid for v3 inodes. */
1701     - if (ip->i_d.di_version < 3)
1702     - return;
1703     + return di_flags;
1704     +}
1705     +
1706     +STATIC uint64_t
1707     +xfs_flags2diflags2(
1708     + struct xfs_inode *ip,
1709     + unsigned int xflags)
1710     +{
1711     + uint64_t di_flags2 =
1712     + (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK);
1713    
1714     - di_flags2 = (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK);
1715     if (xflags & FS_XFLAG_DAX)
1716     di_flags2 |= XFS_DIFLAG2_DAX;
1717     if (xflags & FS_XFLAG_COWEXTSIZE)
1718     di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
1719    
1720     - ip->i_d.di_flags2 = di_flags2;
1721     + return di_flags2;
1722     }
1723    
1724     STATIC void
1725     @@ -1008,11 +1012,12 @@ xfs_diflags_to_linux(
1726     inode->i_flags |= S_NOATIME;
1727     else
1728     inode->i_flags &= ~S_NOATIME;
1729     +#if 0 /* disabled until the flag switching races are sorted out */
1730     if (xflags & FS_XFLAG_DAX)
1731     inode->i_flags |= S_DAX;
1732     else
1733     inode->i_flags &= ~S_DAX;
1734     -
1735     +#endif
1736     }
1737    
1738     static int
1739     @@ -1022,6 +1027,7 @@ xfs_ioctl_setattr_xflags(
1740     struct fsxattr *fa)
1741     {
1742     struct xfs_mount *mp = ip->i_mount;
1743     + uint64_t di_flags2;
1744    
1745     /* Can't change realtime flag if any extents are allocated. */
1746     if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
1747     @@ -1052,7 +1058,14 @@ xfs_ioctl_setattr_xflags(
1748     !capable(CAP_LINUX_IMMUTABLE))
1749     return -EPERM;
1750    
1751     - xfs_set_diflags(ip, fa->fsx_xflags);
1752     + /* diflags2 only valid for v3 inodes. */
1753     + di_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags);
1754     + if (di_flags2 && ip->i_d.di_version < 3)
1755     + return -EINVAL;
1756     +
1757     + ip->i_d.di_flags = xfs_flags2diflags(ip, fa->fsx_xflags);
1758     + ip->i_d.di_flags2 = di_flags2;
1759     +
1760     xfs_diflags_to_linux(ip);
1761     xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
1762     xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
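
The xfs_ioctl.c changes above split the old xfs_set_diflags() into two pure helpers and, rather than silently dropping di_flags2 bits on pre-v3 inodes, fail the ioctl with -EINVAL when any di_flags2 flag is requested on an inode that cannot store it. Below is a minimal userspace model of that rule only; the MODEL_* names and flag values are invented for illustration and do not match the on-disk XFS definitions.

/* diflags2.c - model of the "di_flags2 only valid for v3 inodes" check */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define MODEL_XFLAG_DAX          (1u << 0)
#define MODEL_XFLAG_COWEXTSIZE   (1u << 1)

#define MODEL_DIFLAG2_DAX        (1ull << 0)
#define MODEL_DIFLAG2_COWEXTSIZE (1ull << 1)

/* Pure mapping from user-visible xflags to di_flags2 bits. */
static uint64_t flags2diflags2(unsigned int xflags)
{
        uint64_t di_flags2 = 0;

        if (xflags & MODEL_XFLAG_DAX)
                di_flags2 |= MODEL_DIFLAG2_DAX;
        if (xflags & MODEL_XFLAG_COWEXTSIZE)
                di_flags2 |= MODEL_DIFLAG2_COWEXTSIZE;
        return di_flags2;
}

/*
 * Mirror of the new rule: asking for any di_flags2 flag on an inode older
 * than v3 is now an explicit error instead of being silently ignored.
 */
static int set_xflags(int di_version, unsigned int xflags, uint64_t *out)
{
        uint64_t di_flags2 = flags2diflags2(xflags);

        if (di_flags2 && di_version < 3)
                return -EINVAL;
        *out = di_flags2;
        return 0;
}

int main(void)
{
        uint64_t v;

        printf("v3 inode, DAX requested: %d\n",
               set_xflags(3, MODEL_XFLAG_DAX, &v));     /* prints 0 */
        printf("v2 inode, DAX requested: %d\n",
               set_xflags(2, MODEL_XFLAG_DAX, &v));     /* prints -22 */
        return 0;
}
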
1763     diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
1764     index 469c9fa4c178..17081c77ef86 100644
1765     --- a/fs/xfs/xfs_iops.c
1766     +++ b/fs/xfs/xfs_iops.c
1767     @@ -817,7 +817,7 @@ xfs_vn_setattr_nonsize(
1768     * Caution: The caller of this function is responsible for calling
1769     * setattr_prepare() or otherwise verifying the change is fine.
1770     */
1771     -int
1772     +STATIC int
1773     xfs_setattr_size(
1774     struct xfs_inode *ip,
1775     struct iattr *iattr)
1776     diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
1777     index 4ebd0bafc914..c5107c7bc4bf 100644
1778     --- a/fs/xfs/xfs_log.c
1779     +++ b/fs/xfs/xfs_log.c
1780     @@ -743,10 +743,14 @@ xfs_log_mount_finish(
1781     struct xfs_mount *mp)
1782     {
1783     int error = 0;
1784     + bool readonly = (mp->m_flags & XFS_MOUNT_RDONLY);
1785    
1786     if (mp->m_flags & XFS_MOUNT_NORECOVERY) {
1787     ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
1788     return 0;
1789     + } else if (readonly) {
1790     + /* Allow unlinked processing to proceed */
1791     + mp->m_flags &= ~XFS_MOUNT_RDONLY;
1792     }
1793    
1794     /*
1795     @@ -757,12 +761,27 @@ xfs_log_mount_finish(
1796     * inodes. Turn it off immediately after recovery finishes
1797     * so that we don't leak the quota inodes if subsequent mount
1798     * activities fail.
1799     + *
1800     + * We let all inodes involved in redo item processing end up on
1801     + * the LRU instead of being evicted immediately so that if we do
1802     + * something to an unlinked inode, the irele won't cause
1803     + * premature truncation and freeing of the inode, which results
1804     + * in log recovery failure. We have to evict the unreferenced
1805     + * lru inodes after clearing MS_ACTIVE because we don't
1806     + * otherwise clean up the lru if there's a subsequent failure in
1807     + * xfs_mountfs, which leads to us leaking the inodes if nothing
1808     + * else (e.g. quotacheck) references the inodes before the
1809     + * mount failure occurs.
1810     */
1811     mp->m_super->s_flags |= MS_ACTIVE;
1812     error = xlog_recover_finish(mp->m_log);
1813     if (!error)
1814     xfs_log_work_queue(mp);
1815     mp->m_super->s_flags &= ~MS_ACTIVE;
1816     + evict_inodes(mp->m_super);
1817     +
1818     + if (readonly)
1819     + mp->m_flags |= XFS_MOUNT_RDONLY;
1820    
1821     return error;
1822     }
1823     @@ -812,11 +831,14 @@ xfs_log_unmount_write(xfs_mount_t *mp)
1824     int error;
1825    
1826     /*
1827     - * Don't write out unmount record on read-only mounts.
1828     + * Don't write out unmount record on norecovery mounts or ro devices.
1829     * Or, if we are doing a forced umount (typically because of IO errors).
1830     */
1831     - if (mp->m_flags & XFS_MOUNT_RDONLY)
1832     + if (mp->m_flags & XFS_MOUNT_NORECOVERY ||
1833     + xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) {
1834     + ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
1835     return 0;
1836     + }
1837    
1838     error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL);
1839     ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log)));
1840     @@ -3353,8 +3375,6 @@ _xfs_log_force(
1841     */
1842     if (iclog->ic_state & XLOG_STATE_IOERROR)
1843     return -EIO;
1844     - if (log_flushed)
1845     - *log_flushed = 1;
1846     } else {
1847    
1848     no_sleep:
1849     @@ -3458,8 +3478,6 @@ _xfs_log_force_lsn(
1850    
1851     xlog_wait(&iclog->ic_prev->ic_write_wait,
1852     &log->l_icloglock);
1853     - if (log_flushed)
1854     - *log_flushed = 1;
1855     already_slept = 1;
1856     goto try_again;
1857     }
1858     @@ -3493,9 +3511,6 @@ _xfs_log_force_lsn(
1859     */
1860     if (iclog->ic_state & XLOG_STATE_IOERROR)
1861     return -EIO;
1862     -
1863     - if (log_flushed)
1864     - *log_flushed = 1;
1865     } else { /* just return */
1866     spin_unlock(&log->l_icloglock);
1867     }
1868     diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
1869     index 9549188f5a36..093ee8289057 100644
1870     --- a/fs/xfs/xfs_log_recover.c
1871     +++ b/fs/xfs/xfs_log_recover.c
1872     @@ -1029,61 +1029,106 @@ xlog_seek_logrec_hdr(
1873     }
1874    
1875     /*
1876     - * Check the log tail for torn writes. This is required when torn writes are
1877     - * detected at the head and the head had to be walked back to a previous record.
1878     - * The tail of the previous record must now be verified to ensure the torn
1879     - * writes didn't corrupt the previous tail.
1880     + * Calculate distance from head to tail (i.e., unused space in the log).
1881     + */
1882     +static inline int
1883     +xlog_tail_distance(
1884     + struct xlog *log,
1885     + xfs_daddr_t head_blk,
1886     + xfs_daddr_t tail_blk)
1887     +{
1888     + if (head_blk < tail_blk)
1889     + return tail_blk - head_blk;
1890     +
1891     + return tail_blk + (log->l_logBBsize - head_blk);
1892     +}
1893     +
1894     +/*
1895     + * Verify the log tail. This is particularly important when torn or incomplete
1896     + * writes have been detected near the front of the log and the head has been
1897     + * walked back accordingly.
1898     + *
1899     + * We also have to handle the case where the tail was pinned and the head
1900     + * blocked behind the tail right before a crash. If the tail had been pushed
1901     + * immediately prior to the crash and the subsequent checkpoint was only
1902     + * partially written, it's possible it overwrote the last referenced tail in the
1903     + * log with garbage. This is not a coherency problem because the tail must have
1904     + * been pushed before it can be overwritten, but appears as log corruption to
1905     + * recovery because we have no way to know the tail was updated if the
1906     + * subsequent checkpoint didn't write successfully.
1907     *
1908     - * Return an error if CRC verification fails as recovery cannot proceed.
1909     + * Therefore, CRC check the log from tail to head. If a failure occurs and the
1910     + * offending record is within max iclog bufs from the head, walk the tail
1911     + * forward and retry until a valid tail is found or corruption is detected out
1912     + * of the range of a possible overwrite.
1913     */
1914     STATIC int
1915     xlog_verify_tail(
1916     struct xlog *log,
1917     xfs_daddr_t head_blk,
1918     - xfs_daddr_t tail_blk)
1919     + xfs_daddr_t *tail_blk,
1920     + int hsize)
1921     {
1922     struct xlog_rec_header *thead;
1923     struct xfs_buf *bp;
1924     xfs_daddr_t first_bad;
1925     - int count;
1926     int error = 0;
1927     bool wrapped;
1928     - xfs_daddr_t tmp_head;
1929     + xfs_daddr_t tmp_tail;
1930     + xfs_daddr_t orig_tail = *tail_blk;
1931    
1932     bp = xlog_get_bp(log, 1);
1933     if (!bp)
1934     return -ENOMEM;
1935    
1936     /*
1937     - * Seek XLOG_MAX_ICLOGS + 1 records past the current tail record to get
1938     - * a temporary head block that points after the last possible
1939     - * concurrently written record of the tail.
1940     + * Make sure the tail points to a record (returns positive count on
1941     + * success).
1942     */
1943     - count = xlog_seek_logrec_hdr(log, head_blk, tail_blk,
1944     - XLOG_MAX_ICLOGS + 1, bp, &tmp_head, &thead,
1945     - &wrapped);
1946     - if (count < 0) {
1947     - error = count;
1948     + error = xlog_seek_logrec_hdr(log, head_blk, *tail_blk, 1, bp,
1949     + &tmp_tail, &thead, &wrapped);
1950     + if (error < 0)
1951     goto out;
1952     - }
1953     -
1954     - /*
1955     - * If the call above didn't find XLOG_MAX_ICLOGS + 1 records, we ran
1956     - * into the actual log head. tmp_head points to the start of the record
1957     - * so update it to the actual head block.
1958     - */
1959     - if (count < XLOG_MAX_ICLOGS + 1)
1960     - tmp_head = head_blk;
1961     + if (*tail_blk != tmp_tail)
1962     + *tail_blk = tmp_tail;
1963    
1964     /*
1965     - * We now have a tail and temporary head block that covers at least
1966     - * XLOG_MAX_ICLOGS records from the tail. We need to verify that these
1967     - * records were completely written. Run a CRC verification pass from
1968     - * tail to head and return the result.
1969     + * Run a CRC check from the tail to the head. We can't just check
1970     + * MAX_ICLOGS records past the tail because the tail may point to stale
1971     + * blocks cleared during the search for the head/tail. These blocks are
1972     + * overwritten with zero-length records and thus record count is not a
1973     + * reliable indicator of the iclog state before a crash.
1974     */
1975     - error = xlog_do_recovery_pass(log, tmp_head, tail_blk,
1976     + first_bad = 0;
1977     + error = xlog_do_recovery_pass(log, head_blk, *tail_blk,
1978     XLOG_RECOVER_CRCPASS, &first_bad);
1979     + while ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) {
1980     + int tail_distance;
1981     +
1982     + /*
1983     + * Is corruption within range of the head? If so, retry from
1984     + * the next record. Otherwise return an error.
1985     + */
1986     + tail_distance = xlog_tail_distance(log, head_blk, first_bad);
1987     + if (tail_distance > BTOBB(XLOG_MAX_ICLOGS * hsize))
1988     + break;
1989    
1990     + /* skip to the next record; returns positive count on success */
1991     + error = xlog_seek_logrec_hdr(log, head_blk, first_bad, 2, bp,
1992     + &tmp_tail, &thead, &wrapped);
1993     + if (error < 0)
1994     + goto out;
1995     +
1996     + *tail_blk = tmp_tail;
1997     + first_bad = 0;
1998     + error = xlog_do_recovery_pass(log, head_blk, *tail_blk,
1999     + XLOG_RECOVER_CRCPASS, &first_bad);
2000     + }
2001     +
2002     + if (!error && *tail_blk != orig_tail)
2003     + xfs_warn(log->l_mp,
2004     + "Tail block (0x%llx) overwrite detected. Updated to 0x%llx",
2005     + orig_tail, *tail_blk);
2006     out:
2007     xlog_put_bp(bp);
2008     return error;
2009     @@ -1143,7 +1188,7 @@ xlog_verify_head(
2010     */
2011     error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk,
2012     XLOG_RECOVER_CRCPASS, &first_bad);
2013     - if (error == -EFSBADCRC) {
2014     + if ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) {
2015     /*
2016     * We've hit a potential torn write. Reset the error and warn
2017     * about it.
2018     @@ -1183,31 +1228,12 @@ xlog_verify_head(
2019     ASSERT(0);
2020     return 0;
2021     }
2022     -
2023     - /*
2024     - * Now verify the tail based on the updated head. This is
2025     - * required because the torn writes trimmed from the head could
2026     - * have been written over the tail of a previous record. Return
2027     - * any errors since recovery cannot proceed if the tail is
2028     - * corrupt.
2029     - *
2030     - * XXX: This leaves a gap in truly robust protection from torn
2031     - * writes in the log. If the head is behind the tail, the tail
2032     - * pushes forward to create some space and then a crash occurs
2033     - * causing the writes into the previous record's tail region to
2034     - * tear, log recovery isn't able to recover.
2035     - *
2036     - * How likely is this to occur? If possible, can we do something
2037     - * more intelligent here? Is it safe to push the tail forward if
2038     - * we can determine that the tail is within the range of the
2039     - * torn write (e.g., the kernel can only overwrite the tail if
2040     - * it has actually been pushed forward)? Alternatively, could we
2041     - * somehow prevent this condition at runtime?
2042     - */
2043     - error = xlog_verify_tail(log, *head_blk, *tail_blk);
2044     }
2045     + if (error)
2046     + return error;
2047    
2048     - return error;
2049     + return xlog_verify_tail(log, *head_blk, tail_blk,
2050     + be32_to_cpu((*rhead)->h_size));
2051     }
2052    
2053     /*
2054     @@ -4801,12 +4827,16 @@ xlog_recover_process_intents(
2055     int error = 0;
2056     struct xfs_ail_cursor cur;
2057     struct xfs_ail *ailp;
2058     +#if defined(DEBUG) || defined(XFS_WARN)
2059     xfs_lsn_t last_lsn;
2060     +#endif
2061    
2062     ailp = log->l_ailp;
2063     spin_lock(&ailp->xa_lock);
2064     lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
2065     +#if defined(DEBUG) || defined(XFS_WARN)
2066     last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block);
2067     +#endif
2068     while (lip != NULL) {
2069     /*
2070     * We're done when we see something other than an intent.
2071     @@ -5218,7 +5248,7 @@ xlog_do_recovery_pass(
2072     xfs_daddr_t *first_bad) /* out: first bad log rec */
2073     {
2074     xlog_rec_header_t *rhead;
2075     - xfs_daddr_t blk_no;
2076     + xfs_daddr_t blk_no, rblk_no;
2077     xfs_daddr_t rhead_blk;
2078     char *offset;
2079     xfs_buf_t *hbp, *dbp;
2080     @@ -5231,7 +5261,7 @@ xlog_do_recovery_pass(
2081     LIST_HEAD (buffer_list);
2082    
2083     ASSERT(head_blk != tail_blk);
2084     - rhead_blk = 0;
2085     + blk_no = rhead_blk = tail_blk;
2086    
2087     for (i = 0; i < XLOG_RHASH_SIZE; i++)
2088     INIT_HLIST_HEAD(&rhash[i]);
2089     @@ -5309,7 +5339,6 @@ xlog_do_recovery_pass(
2090     }
2091    
2092     memset(rhash, 0, sizeof(rhash));
2093     - blk_no = rhead_blk = tail_blk;
2094     if (tail_blk > head_blk) {
2095     /*
2096     * Perform recovery around the end of the physical log.
2097     @@ -5371,9 +5400,19 @@ xlog_do_recovery_pass(
2098     bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
2099     blk_no += hblks;
2100    
2101     - /* Read in data for log record */
2102     - if (blk_no + bblks <= log->l_logBBsize) {
2103     - error = xlog_bread(log, blk_no, bblks, dbp,
2104     + /*
2105     + * Read the log record data in multiple reads if it
2106     + * wraps around the end of the log. Note that if the
2107     + * header already wrapped, blk_no could point past the
2108     + * end of the log. The record data is contiguous in
2109     + * that case.
2110     + */
2111     + if (blk_no + bblks <= log->l_logBBsize ||
2112     + blk_no >= log->l_logBBsize) {
2113     + /* mod blk_no in case the header wrapped and
2114     + * pushed it beyond the end of the log */
2115     + rblk_no = do_mod(blk_no, log->l_logBBsize);
2116     + error = xlog_bread(log, rblk_no, bblks, dbp,
2117     &offset);
2118     if (error)
2119     goto bread_err2;
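
Two parts of the log-recovery rework above lend themselves to a small model: xlog_verify_tail() now CRC-checks the whole tail-to-head range and, when the first bad record lies within XLOG_MAX_ICLOGS iclog buffers of the head, walks the tail forward and retries; xlog_tail_distance() supplies the head-to-tail distance used for that decision. The sketch below reproduces only the circular-log distance calculation as a runnable userspace check; log_size stands in for log->l_logBBsize and the numbers are arbitrary.

/* tail_distance.c - model of the xlog_tail_distance() helper added above */
#include <assert.h>
#include <stdio.h>

typedef long long blkno_t;      /* stands in for xfs_daddr_t */

/* Unused space between the head and the tail of a circular log. */
static blkno_t tail_distance(blkno_t log_size, blkno_t head, blkno_t tail)
{
        if (head < tail)
                return tail - head;             /* no wrap between them */
        return tail + (log_size - head);        /* distance wraps past the end */
}

int main(void)
{
        /* head behind the tail: plain difference */
        assert(tail_distance(100, 10, 60) == 50);
        /* head ahead of the tail: count through the end of the log */
        assert(tail_distance(100, 90, 20) == 30);
        printf("tail distance checks passed\n");
        return 0;
}
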
2120     diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
2121     index 38aaacdbb8b3..c1c4c2ea1014 100644
2122     --- a/fs/xfs/xfs_super.c
2123     +++ b/fs/xfs/xfs_super.c
2124     @@ -1220,7 +1220,7 @@ xfs_test_remount_options(
2125     tmp_mp->m_super = sb;
2126     error = xfs_parseargs(tmp_mp, options);
2127     xfs_free_fsname(tmp_mp);
2128     - kfree(tmp_mp);
2129     + kmem_free(tmp_mp);
2130    
2131     return error;
2132     }
2133     diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
2134     index bcc3cdf8e1c5..bb0099708827 100644
2135     --- a/fs/xfs/xfs_trace.h
2136     +++ b/fs/xfs/xfs_trace.h
2137     @@ -517,7 +517,6 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size);
2138     DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_ordered);
2139     DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale);
2140     DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format);
2141     -DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_ordered);
2142     DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale);
2143     DEFINE_BUF_ITEM_EVENT(xfs_buf_item_ordered);
2144     DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin);
2145     diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
2146     index 6bdad6f58934..4709823e04b9 100644
2147     --- a/fs/xfs/xfs_trans.h
2148     +++ b/fs/xfs/xfs_trans.h
2149     @@ -49,6 +49,7 @@ typedef struct xfs_log_item {
2150     struct xfs_ail *li_ailp; /* ptr to AIL */
2151     uint li_type; /* item type */
2152     uint li_flags; /* misc flags */
2153     + struct xfs_buf *li_buf; /* real buffer pointer */
2154     struct xfs_log_item *li_bio_list; /* buffer item list */
2155     void (*li_cb)(struct xfs_buf *,
2156     struct xfs_log_item *);
2157     @@ -64,11 +65,13 @@ typedef struct xfs_log_item {
2158     } xfs_log_item_t;
2159    
2160     #define XFS_LI_IN_AIL 0x1
2161     -#define XFS_LI_ABORTED 0x2
2162     +#define XFS_LI_ABORTED 0x2
2163     +#define XFS_LI_FAILED 0x4
2164    
2165     #define XFS_LI_FLAGS \
2166     { XFS_LI_IN_AIL, "IN_AIL" }, \
2167     - { XFS_LI_ABORTED, "ABORTED" }
2168     + { XFS_LI_ABORTED, "ABORTED" }, \
2169     + { XFS_LI_FAILED, "FAILED" }
2170    
2171     struct xfs_item_ops {
2172     void (*iop_size)(xfs_log_item_t *, int *, int *);
2173     @@ -79,6 +82,7 @@ struct xfs_item_ops {
2174     void (*iop_unlock)(xfs_log_item_t *);
2175     xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
2176     void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
2177     + void (*iop_error)(xfs_log_item_t *, xfs_buf_t *);
2178     };
2179    
2180     void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item,
2181     @@ -208,12 +212,14 @@ void xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *);
2182     void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *);
2183     void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *);
2184     void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *);
2185     -void xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *);
2186     +bool xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *);
2187     void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
2188     void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
2189     void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
2190     void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint);
2191     -void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint);
2192     +void xfs_trans_log_buf(struct xfs_trans *, struct xfs_buf *, uint,
2193     + uint);
2194     +void xfs_trans_dirty_buf(struct xfs_trans *, struct xfs_buf *);
2195     void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint);
2196    
2197     void xfs_extent_free_init_defer_op(void);
2198     diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
2199     index 9056c0f34a3c..70f5ab017323 100644
2200     --- a/fs/xfs/xfs_trans_ail.c
2201     +++ b/fs/xfs/xfs_trans_ail.c
2202     @@ -687,12 +687,13 @@ xfs_trans_ail_update_bulk(
2203     bool
2204     xfs_ail_delete_one(
2205     struct xfs_ail *ailp,
2206     - struct xfs_log_item *lip)
2207     + struct xfs_log_item *lip)
2208     {
2209     struct xfs_log_item *mlip = xfs_ail_min(ailp);
2210    
2211     trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn);
2212     xfs_ail_delete(ailp, lip);
2213     + xfs_clear_li_failed(lip);
2214     lip->li_flags &= ~XFS_LI_IN_AIL;
2215     lip->li_lsn = 0;
2216    
2217     diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
2218     index 86987d823d76..3ba7a96a8abd 100644
2219     --- a/fs/xfs/xfs_trans_buf.c
2220     +++ b/fs/xfs/xfs_trans_buf.c
2221     @@ -435,7 +435,7 @@ xfs_trans_brelse(xfs_trans_t *tp,
2222     if (XFS_FORCED_SHUTDOWN(tp->t_mountp) && freed) {
2223     xfs_trans_ail_remove(&bip->bli_item, SHUTDOWN_LOG_IO_ERROR);
2224     xfs_buf_item_relse(bp);
2225     - } else if (!xfs_buf_item_dirty(bip)) {
2226     + } else if (!(bip->bli_flags & XFS_BLI_DIRTY)) {
2227     /***
2228     ASSERT(bp->b_pincount == 0);
2229     ***/
2230     @@ -493,25 +493,17 @@ xfs_trans_bhold_release(xfs_trans_t *tp,
2231     }
2232    
2233     /*
2234     - * This is called to mark bytes first through last inclusive of the given
2235     - * buffer as needing to be logged when the transaction is committed.
2236     - * The buffer must already be associated with the given transaction.
2237     - *
2238     - * First and last are numbers relative to the beginning of this buffer,
2239     - * so the first byte in the buffer is numbered 0 regardless of the
2240     - * value of b_blkno.
2241     + * Mark a buffer dirty in the transaction.
2242     */
2243     void
2244     -xfs_trans_log_buf(xfs_trans_t *tp,
2245     - xfs_buf_t *bp,
2246     - uint first,
2247     - uint last)
2248     +xfs_trans_dirty_buf(
2249     + struct xfs_trans *tp,
2250     + struct xfs_buf *bp)
2251     {
2252     - xfs_buf_log_item_t *bip = bp->b_fspriv;
2253     + struct xfs_buf_log_item *bip = bp->b_fspriv;
2254    
2255     ASSERT(bp->b_transp == tp);
2256     ASSERT(bip != NULL);
2257     - ASSERT(first <= last && last < BBTOB(bp->b_length));
2258     ASSERT(bp->b_iodone == NULL ||
2259     bp->b_iodone == xfs_buf_iodone_callbacks);
2260    
2261     @@ -531,8 +523,6 @@ xfs_trans_log_buf(xfs_trans_t *tp,
2262     bp->b_iodone = xfs_buf_iodone_callbacks;
2263     bip->bli_item.li_cb = xfs_buf_iodone;
2264    
2265     - trace_xfs_trans_log_buf(bip);
2266     -
2267     /*
2268     * If we invalidated the buffer within this transaction, then
2269     * cancel the invalidation now that we're dirtying the buffer
2270     @@ -545,17 +535,37 @@ xfs_trans_log_buf(xfs_trans_t *tp,
2271     bp->b_flags &= ~XBF_STALE;
2272     bip->__bli_format.blf_flags &= ~XFS_BLF_CANCEL;
2273     }
2274     + bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;
2275    
2276     tp->t_flags |= XFS_TRANS_DIRTY;
2277     bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
2278     +}
2279    
2280     - /*
2281     - * If we have an ordered buffer we are not logging any dirty range but
2282     - * it still needs to be marked dirty and that it has been logged.
2283     - */
2284     - bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;
2285     - if (!(bip->bli_flags & XFS_BLI_ORDERED))
2286     - xfs_buf_item_log(bip, first, last);
2287     +/*
2288     + * This is called to mark bytes first through last inclusive of the given
2289     + * buffer as needing to be logged when the transaction is committed.
2290     + * The buffer must already be associated with the given transaction.
2291     + *
2292     + * First and last are numbers relative to the beginning of this buffer,
2293     + * so the first byte in the buffer is numbered 0 regardless of the
2294     + * value of b_blkno.
2295     + */
2296     +void
2297     +xfs_trans_log_buf(
2298     + struct xfs_trans *tp,
2299     + struct xfs_buf *bp,
2300     + uint first,
2301     + uint last)
2302     +{
2303     + struct xfs_buf_log_item *bip = bp->b_fspriv;
2304     +
2305     + ASSERT(first <= last && last < BBTOB(bp->b_length));
2306     + ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED));
2307     +
2308     + xfs_trans_dirty_buf(tp, bp);
2309     +
2310     + trace_xfs_trans_log_buf(bip);
2311     + xfs_buf_item_log(bip, first, last);
2312     }
2313    
2314    
2315     @@ -708,14 +718,13 @@ xfs_trans_inode_alloc_buf(
2316     }
2317    
2318     /*
2319     - * Mark the buffer as ordered for this transaction. This means
2320     - * that the contents of the buffer are not recorded in the transaction
2321     - * but it is tracked in the AIL as though it was. This allows us
2322     - * to record logical changes in transactions rather than the physical
2323     - * changes we make to the buffer without changing writeback ordering
2324     - * constraints of metadata buffers.
2325     + * Mark the buffer as ordered for this transaction. This means that the contents
2326     + * of the buffer are not recorded in the transaction but it is tracked in the
2327     + * AIL as though it was. This allows us to record logical changes in
2328     + * transactions rather than the physical changes we make to the buffer without
2329     + * changing writeback ordering constraints of metadata buffers.
2330     */
2331     -void
2332     +bool
2333     xfs_trans_ordered_buf(
2334     struct xfs_trans *tp,
2335     struct xfs_buf *bp)
2336     @@ -726,8 +735,18 @@ xfs_trans_ordered_buf(
2337     ASSERT(bip != NULL);
2338     ASSERT(atomic_read(&bip->bli_refcount) > 0);
2339    
2340     + if (xfs_buf_item_dirty_format(bip))
2341     + return false;
2342     +
2343     bip->bli_flags |= XFS_BLI_ORDERED;
2344     trace_xfs_buf_item_ordered(bip);
2345     +
2346     + /*
2347     + * We don't log a dirty range of an ordered buffer, but it still needs
2348     + * to be marked dirty and flagged as logged.
2349     + */
2350     + xfs_trans_dirty_buf(tp, bp);
2351     + return true;
2352     }
2353    
2354     /*
2355     diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
2356     index d91706c56c63..b317a3644c00 100644
2357     --- a/fs/xfs/xfs_trans_priv.h
2358     +++ b/fs/xfs/xfs_trans_priv.h
2359     @@ -164,4 +164,35 @@ xfs_trans_ail_copy_lsn(
2360     *dst = *src;
2361     }
2362     #endif
2363     +
2364     +static inline void
2365     +xfs_clear_li_failed(
2366     + struct xfs_log_item *lip)
2367     +{
2368     + struct xfs_buf *bp = lip->li_buf;
2369     +
2370     + ASSERT(lip->li_flags & XFS_LI_IN_AIL);
2371     + lockdep_assert_held(&lip->li_ailp->xa_lock);
2372     +
2373     + if (lip->li_flags & XFS_LI_FAILED) {
2374     + lip->li_flags &= ~XFS_LI_FAILED;
2375     + lip->li_buf = NULL;
2376     + xfs_buf_rele(bp);
2377     + }
2378     +}
2379     +
2380     +static inline void
2381     +xfs_set_li_failed(
2382     + struct xfs_log_item *lip,
2383     + struct xfs_buf *bp)
2384     +{
2385     + lockdep_assert_held(&lip->li_ailp->xa_lock);
2386     +
2387     + if (!(lip->li_flags & XFS_LI_FAILED)) {
2388     + xfs_buf_hold(bp);
2389     + lip->li_flags |= XFS_LI_FAILED;
2390     + lip->li_buf = bp;
2391     + }
2392     +}
2393     +
2394     #endif /* __XFS_TRANS_PRIV_H__ */
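
The xfs_set_li_failed()/xfs_clear_li_failed() helpers added above are deliberately idempotent: the first failure takes one extra hold on the buffer and latches XFS_LI_FAILED plus li_buf, and only a matching clear (from xfs_ail_delete_one() or xfs_buf_resubmit_failed_buffers()) drops that hold, so repeated failures or repeated clears cannot unbalance the buffer reference count. The standalone model below shows only that pairing with a plain integer refcount; it omits the xa_lock assertions of the real helpers, and all model_* names are invented for the example.

/* li_failed.c - model of the idempotent failed-flag / buffer-hold pairing */
#include <assert.h>
#include <stdio.h>

#define MODEL_LI_FAILED 0x4

struct model_buf  { int refcount; };
struct model_item { unsigned int flags; struct model_buf *buf; };

static void set_failed(struct model_item *lip, struct model_buf *bp)
{
        if (!(lip->flags & MODEL_LI_FAILED)) {  /* only the first failure */
                bp->refcount++;                 /* stands in for xfs_buf_hold() */
                lip->flags |= MODEL_LI_FAILED;
                lip->buf = bp;
        }
}

static void clear_failed(struct model_item *lip)
{
        struct model_buf *bp = lip->buf;

        if (lip->flags & MODEL_LI_FAILED) {
                lip->flags &= ~MODEL_LI_FAILED;
                lip->buf = NULL;
                bp->refcount--;                 /* stands in for xfs_buf_rele() */
        }
}

int main(void)
{
        struct model_buf bp = { .refcount = 1 };
        struct model_item lip = { 0, NULL };

        set_failed(&lip, &bp);
        set_failed(&lip, &bp);          /* a repeated failure takes no extra hold */
        assert(bp.refcount == 2);

        clear_failed(&lip);
        clear_failed(&lip);             /* a second clear is a no-op */
        assert(bp.refcount == 1);

        printf("hold/release stayed balanced\n");
        return 0;
}
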
2395     diff --git a/include/linux/fs.h b/include/linux/fs.h
2396     index cbfe127bccf8..d0c0ca8ea8c1 100644
2397     --- a/include/linux/fs.h
2398     +++ b/include/linux/fs.h
2399     @@ -2831,6 +2831,7 @@ static inline void lockdep_annotate_inode_mutex_key(struct inode *inode) { };
2400     #endif
2401     extern void unlock_new_inode(struct inode *);
2402     extern unsigned int get_next_ino(void);
2403     +extern void evict_inodes(struct super_block *sb);
2404    
2405     extern void __iget(struct inode * inode);
2406     extern void iget_failed(struct inode *);
2407     diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
2408     index e030a68ead7e..25438b2b6f22 100644
2409     --- a/include/linux/mm_inline.h
2410     +++ b/include/linux/mm_inline.h
2411     @@ -126,4 +126,10 @@ static __always_inline enum lru_list page_lru(struct page *page)
2412    
2413     #define lru_to_page(head) (list_entry((head)->prev, struct page, lru))
2414    
2415     +#ifdef arch_unmap_kpfn
2416     +extern void arch_unmap_kpfn(unsigned long pfn);
2417     +#else
2418     +static __always_inline void arch_unmap_kpfn(unsigned long pfn) { }
2419     +#endif
2420     +
2421     #endif
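
The mm_inline.h hunk above follows the usual pattern for an optional per-architecture hook: an architecture that defines arch_unmap_kpfn gets an extern declaration, everyone else gets an empty static inline, so the generic caller in memory_failure() (further down in this patch) can invoke it unconditionally. The tiny model below shows the same compile-time shape; MODEL_HAVE_ARCH_HOOK and arch_hook() are invented names. By default it builds and runs self-contained; defining MODEL_HAVE_ARCH_HOOK would require linking in a real arch_hook() definition.

/* arch_hook.c - model of the optional compile-time hook pattern */
#include <stdio.h>

#ifdef MODEL_HAVE_ARCH_HOOK
void arch_hook(unsigned long pfn);              /* supplied by "arch" code */
#else
static inline void arch_hook(unsigned long pfn) { (void)pfn; /* no-op */ }
#endif

int main(void)
{
        /*
         * Generic code calls the hook unconditionally; builds that do not
         * opt in get an empty inline that the compiler removes entirely.
         */
        arch_hook(0x1234);
        printf("hook invoked (or compiled away)\n");
        return 0;
}
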
2422     diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
2423     index d67a8182e5eb..63df75ae70ee 100644
2424     --- a/include/linux/skbuff.h
2425     +++ b/include/linux/skbuff.h
2426     @@ -885,7 +885,7 @@ void kfree_skb(struct sk_buff *skb);
2427     void kfree_skb_list(struct sk_buff *segs);
2428     void skb_tx_error(struct sk_buff *skb);
2429     void consume_skb(struct sk_buff *skb);
2430     -void consume_stateless_skb(struct sk_buff *skb);
2431     +void __consume_stateless_skb(struct sk_buff *skb);
2432     void __kfree_skb(struct sk_buff *skb);
2433     extern struct kmem_cache *skbuff_head_cache;
2434    
2435     diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
2436     index 6fdcd2427776..fc59e0775e00 100644
2437     --- a/include/net/inet_frag.h
2438     +++ b/include/net/inet_frag.h
2439     @@ -1,14 +1,9 @@
2440     #ifndef __NET_FRAG_H__
2441     #define __NET_FRAG_H__
2442    
2443     -#include <linux/percpu_counter.h>
2444     -
2445     struct netns_frags {
2446     - /* The percpu_counter "mem" need to be cacheline aligned.
2447     - * mem.count must not share cacheline with other writers
2448     - */
2449     - struct percpu_counter mem ____cacheline_aligned_in_smp;
2450     -
2451     + /* Keep atomic mem on separate cachelines in structs that include it */
2452     + atomic_t mem ____cacheline_aligned_in_smp;
2453     /* sysctls */
2454     int timeout;
2455     int high_thresh;
2456     @@ -108,15 +103,10 @@ struct inet_frags {
2457     int inet_frags_init(struct inet_frags *);
2458     void inet_frags_fini(struct inet_frags *);
2459    
2460     -static inline int inet_frags_init_net(struct netns_frags *nf)
2461     -{
2462     - return percpu_counter_init(&nf->mem, 0, GFP_KERNEL);
2463     -}
2464     -static inline void inet_frags_uninit_net(struct netns_frags *nf)
2465     +static inline void inet_frags_init_net(struct netns_frags *nf)
2466     {
2467     - percpu_counter_destroy(&nf->mem);
2468     + atomic_set(&nf->mem, 0);
2469     }
2470     -
2471     void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f);
2472    
2473     void inet_frag_kill(struct inet_frag_queue *q, struct inet_frags *f);
2474     @@ -140,31 +130,24 @@ static inline bool inet_frag_evicting(struct inet_frag_queue *q)
2475    
2476     /* Memory Tracking Functions. */
2477    
2478     -/* The default percpu_counter batch size is not big enough to scale to
2479     - * fragmentation mem acct sizes.
2480     - * The mem size of a 64K fragment is approx:
2481     - * (44 fragments * 2944 truesize) + frag_queue struct(200) = 129736 bytes
2482     - */
2483     -static unsigned int frag_percpu_counter_batch = 130000;
2484     -
2485     static inline int frag_mem_limit(struct netns_frags *nf)
2486     {
2487     - return percpu_counter_read(&nf->mem);
2488     + return atomic_read(&nf->mem);
2489     }
2490    
2491     static inline void sub_frag_mem_limit(struct netns_frags *nf, int i)
2492     {
2493     - percpu_counter_add_batch(&nf->mem, -i, frag_percpu_counter_batch);
2494     + atomic_sub(i, &nf->mem);
2495     }
2496    
2497     static inline void add_frag_mem_limit(struct netns_frags *nf, int i)
2498     {
2499     - percpu_counter_add_batch(&nf->mem, i, frag_percpu_counter_batch);
2500     + atomic_add(i, &nf->mem);
2501     }
2502    
2503     -static inline unsigned int sum_frag_mem_limit(struct netns_frags *nf)
2504     +static inline int sum_frag_mem_limit(struct netns_frags *nf)
2505     {
2506     - return percpu_counter_sum_positive(&nf->mem);
2507     + return atomic_read(&nf->mem);
2508     }
2509    
2510     /* RFC 3168 support :
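
The inet_frag.h changes above replace the batched percpu_counter with a plain atomic_t: initialization becomes a simple atomic_set() that cannot fail, add/sub become atomic_add()/atomic_sub(), readings become exact atomic_read()s, and the separate uninit/destroy path disappears, which is why the per-netns init functions further down lose their error handling. The C11 sketch below mirrors that accounting interface in userspace; struct netns_frags_model and its fields are illustrative only.

/* frag_mem.c - C11-atomics model of the simplified fragment accounting */
#include <stdatomic.h>
#include <stdio.h>

struct netns_frags_model {
        atomic_int mem;                 /* bytes charged to this namespace */
        int        high_thresh;
};

static void init_net(struct netns_frags_model *nf)
{
        atomic_store(&nf->mem, 0);      /* replaces percpu_counter_init() */
}

static void add_mem(struct netns_frags_model *nf, int i)
{
        atomic_fetch_add(&nf->mem, i);  /* replaces percpu_counter_add_batch() */
}

static void sub_mem(struct netns_frags_model *nf, int i)
{
        atomic_fetch_sub(&nf->mem, i);
}

static int mem_limit(struct netns_frags_model *nf)
{
        return atomic_load(&nf->mem);   /* exact, no per-cpu batching slack */
}

int main(void)
{
        struct netns_frags_model nf = { .high_thresh = 4 * 1024 * 1024 };

        init_net(&nf);
        add_mem(&nf, 2944);             /* charge one fragment's truesize */
        sub_mem(&nf, 2944);             /* release it again */
        printf("charged: %d bytes\n", mem_limit(&nf));
        return 0;
}
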
2511     diff --git a/lib/idr.c b/lib/idr.c
2512     index b13682bb0a1c..20c2779e8d12 100644
2513     --- a/lib/idr.c
2514     +++ b/lib/idr.c
2515     @@ -154,7 +154,7 @@ void *idr_replace(struct idr *idr, void *ptr, int id)
2516     void __rcu **slot = NULL;
2517     void *entry;
2518    
2519     - if (WARN_ON_ONCE(id < 0))
2520     + if (id < 0)
2521     return ERR_PTR(-EINVAL);
2522     if (WARN_ON_ONCE(radix_tree_is_internal_node(ptr)))
2523     return ERR_PTR(-EINVAL);
2524     diff --git a/mm/memory-failure.c b/mm/memory-failure.c
2525     index 1cd3b3569af8..88366626c0b7 100644
2526     --- a/mm/memory-failure.c
2527     +++ b/mm/memory-failure.c
2528     @@ -1146,6 +1146,8 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
2529     return 0;
2530     }
2531    
2532     + arch_unmap_kpfn(pfn);
2533     +
2534     orig_head = hpage = compound_head(p);
2535     num_poisoned_pages_inc();
2536    
2537     diff --git a/net/core/skbuff.c b/net/core/skbuff.c
2538     index e07556606284..72eb23d2426f 100644
2539     --- a/net/core/skbuff.c
2540     +++ b/net/core/skbuff.c
2541     @@ -753,14 +753,11 @@ EXPORT_SYMBOL(consume_skb);
2542     * consume_stateless_skb - free an skbuff, assuming it is stateless
2543     * @skb: buffer to free
2544     *
2545     - * Works like consume_skb(), but this variant assumes that all the head
2546     - * states have been already dropped.
2547     + * Like consume_skb(), but this variant assumes that this is the last
2548     + * skb reference and that all the head states have already been dropped
2549     */
2550     -void consume_stateless_skb(struct sk_buff *skb)
2551     +void __consume_stateless_skb(struct sk_buff *skb)
2552     {
2553     - if (!skb_unref(skb))
2554     - return;
2555     -
2556     trace_consume_skb(skb);
2557     if (likely(skb->head))
2558     skb_release_data(skb);
2559     diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
2560     index 30d875dff6b5..f85b08baff16 100644
2561     --- a/net/ieee802154/6lowpan/reassembly.c
2562     +++ b/net/ieee802154/6lowpan/reassembly.c
2563     @@ -580,19 +580,14 @@ static int __net_init lowpan_frags_init_net(struct net *net)
2564     {
2565     struct netns_ieee802154_lowpan *ieee802154_lowpan =
2566     net_ieee802154_lowpan(net);
2567     - int res;
2568    
2569     ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
2570     ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH;
2571     ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT;
2572    
2573     - res = inet_frags_init_net(&ieee802154_lowpan->frags);
2574     - if (res)
2575     - return res;
2576     - res = lowpan_frags_ns_sysctl_register(net);
2577     - if (res)
2578     - inet_frags_uninit_net(&ieee802154_lowpan->frags);
2579     - return res;
2580     + inet_frags_init_net(&ieee802154_lowpan->frags);
2581     +
2582     + return lowpan_frags_ns_sysctl_register(net);
2583     }
2584    
2585     static void __net_exit lowpan_frags_exit_net(struct net *net)
2586     diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
2587     index 96e95e83cc61..af74d0433453 100644
2588     --- a/net/ipv4/inet_fragment.c
2589     +++ b/net/ipv4/inet_fragment.c
2590     @@ -234,10 +234,8 @@ void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
2591     cond_resched();
2592    
2593     if (read_seqretry(&f->rnd_seqlock, seq) ||
2594     - percpu_counter_sum(&nf->mem))
2595     + sum_frag_mem_limit(nf))
2596     goto evict_again;
2597     -
2598     - percpu_counter_destroy(&nf->mem);
2599     }
2600     EXPORT_SYMBOL(inet_frags_exit_net);
2601    
2602     diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
2603     index 9a8cfac503dc..46408c220d9d 100644
2604     --- a/net/ipv4/ip_fragment.c
2605     +++ b/net/ipv4/ip_fragment.c
2606     @@ -844,8 +844,6 @@ static void __init ip4_frags_ctl_register(void)
2607    
2608     static int __net_init ipv4_frags_init_net(struct net *net)
2609     {
2610     - int res;
2611     -
2612     /* Fragment cache limits.
2613     *
2614     * The fragment memory accounting code, (tries to) account for
2615     @@ -871,13 +869,9 @@ static int __net_init ipv4_frags_init_net(struct net *net)
2616    
2617     net->ipv4.frags.max_dist = 64;
2618    
2619     - res = inet_frags_init_net(&net->ipv4.frags);
2620     - if (res)
2621     - return res;
2622     - res = ip4_frags_ns_ctl_register(net);
2623     - if (res)
2624     - inet_frags_uninit_net(&net->ipv4.frags);
2625     - return res;
2626     + inet_frags_init_net(&net->ipv4.frags);
2627     +
2628     + return ip4_frags_ns_ctl_register(net);
2629     }
2630    
2631     static void __net_exit ipv4_frags_exit_net(struct net *net)
2632     diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
2633     index 129d1a3616f8..e1856bfa753d 100644
2634     --- a/net/ipv4/ip_tunnel.c
2635     +++ b/net/ipv4/ip_tunnel.c
2636     @@ -618,8 +618,8 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
2637     ip_rt_put(rt);
2638     goto tx_dropped;
2639     }
2640     - iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, key->tos,
2641     - key->ttl, df, !net_eq(tunnel->net, dev_net(dev)));
2642     + iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
2643     + df, !net_eq(tunnel->net, dev_net(dev)));
2644     return;
2645     tx_error:
2646     dev->stats.tx_errors++;
2647     diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
2648     index e9252c7df809..21022db7a2a6 100644
2649     --- a/net/ipv4/tcp_ipv4.c
2650     +++ b/net/ipv4/tcp_ipv4.c
2651     @@ -1722,9 +1722,9 @@ int tcp_v4_rcv(struct sk_buff *skb)
2652     */
2653     sock_hold(sk);
2654     refcounted = true;
2655     - if (tcp_filter(sk, skb))
2656     - goto discard_and_relse;
2657     - nsk = tcp_check_req(sk, skb, req, false);
2658     + nsk = NULL;
2659     + if (!tcp_filter(sk, skb))
2660     + nsk = tcp_check_req(sk, skb, req, false);
2661     if (!nsk) {
2662     reqsk_put(req);
2663     goto discard_and_relse;
2664     diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
2665     index 62344804baae..979e4d8526ba 100644
2666     --- a/net/ipv4/udp.c
2667     +++ b/net/ipv4/udp.c
2668     @@ -1386,12 +1386,15 @@ void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len)
2669     unlock_sock_fast(sk, slow);
2670     }
2671    
2672     + if (!skb_unref(skb))
2673     + return;
2674     +
2675     /* In the more common cases we cleared the head states previously,
2676     * see __udp_queue_rcv_skb().
2677     */
2678     if (unlikely(udp_skb_has_head_state(skb)))
2679     skb_release_head_state(skb);
2680     - consume_stateless_skb(skb);
2681     + __consume_stateless_skb(skb);
2682     }
2683     EXPORT_SYMBOL_GPL(skb_consume_udp);
2684    
2685     diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
2686     index e1c85bb4eac0..1792bbfd80e1 100644
2687     --- a/net/ipv6/ip6_fib.c
2688     +++ b/net/ipv6/ip6_fib.c
2689     @@ -198,6 +198,12 @@ static void rt6_release(struct rt6_info *rt)
2690     }
2691     }
2692    
2693     +static void fib6_free_table(struct fib6_table *table)
2694     +{
2695     + inetpeer_invalidate_tree(&table->tb6_peers);
2696     + kfree(table);
2697     +}
2698     +
2699     static void fib6_link_table(struct net *net, struct fib6_table *tb)
2700     {
2701     unsigned int h;
2702     @@ -1915,15 +1921,22 @@ static int __net_init fib6_net_init(struct net *net)
2703    
2704     static void fib6_net_exit(struct net *net)
2705     {
2706     + unsigned int i;
2707     +
2708     rt6_ifdown(net, NULL);
2709     del_timer_sync(&net->ipv6.ip6_fib_timer);
2710    
2711     -#ifdef CONFIG_IPV6_MULTIPLE_TABLES
2712     - inetpeer_invalidate_tree(&net->ipv6.fib6_local_tbl->tb6_peers);
2713     - kfree(net->ipv6.fib6_local_tbl);
2714     -#endif
2715     - inetpeer_invalidate_tree(&net->ipv6.fib6_main_tbl->tb6_peers);
2716     - kfree(net->ipv6.fib6_main_tbl);
2717     + for (i = 0; i < FIB6_TABLE_HASHSZ; i++) {
2718     + struct hlist_head *head = &net->ipv6.fib_table_hash[i];
2719     + struct hlist_node *tmp;
2720     + struct fib6_table *tb;
2721     +
2722     + hlist_for_each_entry_safe(tb, tmp, head, tb6_hlist) {
2723     + hlist_del(&tb->tb6_hlist);
2724     + fib6_free_table(tb);
2725     + }
2726     + }
2727     +
2728     kfree(net->ipv6.fib_table_hash);
2729     kfree(net->ipv6.rt6_stats);
2730     }
2731     diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
2732     index 67ff2aaf5dcb..b7a72d409334 100644
2733     --- a/net/ipv6/ip6_gre.c
2734     +++ b/net/ipv6/ip6_gre.c
2735     @@ -432,7 +432,9 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
2736     }
2737     break;
2738     case ICMPV6_PKT_TOOBIG:
2739     - mtu = be32_to_cpu(info) - offset;
2740     + mtu = be32_to_cpu(info) - offset - t->tun_hlen;
2741     + if (t->dev->type == ARPHRD_ETHER)
2742     + mtu -= ETH_HLEN;
2743     if (mtu < IPV6_MIN_MTU)
2744     mtu = IPV6_MIN_MTU;
2745     t->dev->mtu = mtu;
2746     diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
2747     index 986d4ca38832..b263bf3a19f7 100644
2748     --- a/net/ipv6/netfilter/nf_conntrack_reasm.c
2749     +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
2750     @@ -622,18 +622,12 @@ EXPORT_SYMBOL_GPL(nf_ct_frag6_gather);
2751    
2752     static int nf_ct_net_init(struct net *net)
2753     {
2754     - int res;
2755     -
2756     net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
2757     net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
2758     net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT;
2759     - res = inet_frags_init_net(&net->nf_frag.frags);
2760     - if (res)
2761     - return res;
2762     - res = nf_ct_frag6_sysctl_register(net);
2763     - if (res)
2764     - inet_frags_uninit_net(&net->nf_frag.frags);
2765     - return res;
2766     + inet_frags_init_net(&net->nf_frag.frags);
2767     +
2768     + return nf_ct_frag6_sysctl_register(net);
2769     }
2770    
2771     static void nf_ct_net_exit(struct net *net)
2772     diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
2773     index e1da5b888cc4..846012eae526 100644
2774     --- a/net/ipv6/reassembly.c
2775     +++ b/net/ipv6/reassembly.c
2776     @@ -714,19 +714,13 @@ static void ip6_frags_sysctl_unregister(void)
2777    
2778     static int __net_init ipv6_frags_init_net(struct net *net)
2779     {
2780     - int res;
2781     -
2782     net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
2783     net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
2784     net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT;
2785    
2786     - res = inet_frags_init_net(&net->ipv6.frags);
2787     - if (res)
2788     - return res;
2789     - res = ip6_frags_ns_sysctl_register(net);
2790     - if (res)
2791     - inet_frags_uninit_net(&net->ipv6.frags);
2792     - return res;
2793     + inet_frags_init_net(&net->ipv6.frags);
2794     +
2795     + return ip6_frags_ns_sysctl_register(net);
2796     }
2797    
2798     static void __net_exit ipv6_frags_exit_net(struct net *net)
2799     diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
2800     index 206210125fd7..660b9b2a8a25 100644
2801     --- a/net/ipv6/tcp_ipv6.c
2802     +++ b/net/ipv6/tcp_ipv6.c
2803     @@ -1456,9 +1456,9 @@ static int tcp_v6_rcv(struct sk_buff *skb)
2804     }
2805     sock_hold(sk);
2806     refcounted = true;
2807     - if (tcp_filter(sk, skb))
2808     - goto discard_and_relse;
2809     - nsk = tcp_check_req(sk, skb, req, false);
2810     + nsk = NULL;
2811     + if (!tcp_filter(sk, skb))
2812     + nsk = tcp_check_req(sk, skb, req, false);
2813     if (!nsk) {
2814     reqsk_put(req);
2815     goto discard_and_relse;
2816     diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
2817     index 0225d62a869f..a71be33f3afe 100644
2818     --- a/net/sctp/ulpqueue.c
2819     +++ b/net/sctp/ulpqueue.c
2820     @@ -265,7 +265,8 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event)
2821     sctp_ulpq_clear_pd(ulpq);
2822    
2823     if (queue == &sk->sk_receive_queue && !sp->data_ready_signalled) {
2824     - sp->data_ready_signalled = 1;
2825     + if (!sock_owned_by_user(sk))
2826     + sp->data_ready_signalled = 1;
2827     sk->sk_data_ready(sk);
2828     }
2829     return 1;