Magellan Linux

Annotation of /trunk/kernel-alx/patches-4.9/0150-4.9.51-all-fixes.patch

Revision 3035
Wed Dec 20 11:48:36 2017 UTC by niro
File size: 125407 byte(s)
-linux-4.9.51
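
This is the upstream 4.9.50 -> 4.9.51 stable update (the first hunk below bumps SUBLEVEL in the Makefile from 50 to 51). As a minimal sketch, assuming a clean linux-4.9.50 source tree, the patch would be applied from the top of that tree with -p1, since the diff uses a/ and b/ path prefixes:

    cd linux-4.9.50
    patch -p1 --dry-run < 0150-4.9.51-all-fixes.patch    # check that every hunk applies cleanly first
    patch -p1 < 0150-4.9.51-all-fixes.patch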
1 niro 3035 diff --git a/Makefile b/Makefile
2     index 038d126a15fc..b48aebbe187f 100644
3     --- a/Makefile
4     +++ b/Makefile
5     @@ -1,6 +1,6 @@
6     VERSION = 4
7     PATCHLEVEL = 9
8     -SUBLEVEL = 50
9     +SUBLEVEL = 51
10     EXTRAVERSION =
11     NAME = Roaring Lionus
12    
13     diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
14     index b31761ecce63..7bcd138c3aa9 100644
15     --- a/arch/x86/include/asm/elf.h
16     +++ b/arch/x86/include/asm/elf.h
17     @@ -204,6 +204,7 @@ void set_personality_ia32(bool);
18    
19     #define ELF_CORE_COPY_REGS(pr_reg, regs) \
20     do { \
21     + unsigned long base; \
22     unsigned v; \
23     (pr_reg)[0] = (regs)->r15; \
24     (pr_reg)[1] = (regs)->r14; \
25     @@ -226,8 +227,8 @@ do { \
26     (pr_reg)[18] = (regs)->flags; \
27     (pr_reg)[19] = (regs)->sp; \
28     (pr_reg)[20] = (regs)->ss; \
29     - (pr_reg)[21] = current->thread.fsbase; \
30     - (pr_reg)[22] = current->thread.gsbase; \
31     + rdmsrl(MSR_FS_BASE, base); (pr_reg)[21] = base; \
32     + rdmsrl(MSR_KERNEL_GS_BASE, base); (pr_reg)[22] = base; \
33     asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v; \
34     asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v; \
35     asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v; \
36     diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
37     index b3760b3c1ca0..0887d2ae3797 100644
38     --- a/arch/x86/kernel/process_64.c
39     +++ b/arch/x86/kernel/process_64.c
40     @@ -136,6 +136,123 @@ void release_thread(struct task_struct *dead_task)
41     }
42     }
43    
44     +enum which_selector {
45     + FS,
46     + GS
47     +};
48     +
49     +/*
50     + * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
51     + * not available. The goal is to be reasonably fast on non-FSGSBASE systems.
52     + * It's forcibly inlined because it'll generate better code and this function
53     + * is hot.
54     + */
55     +static __always_inline void save_base_legacy(struct task_struct *prev_p,
56     + unsigned short selector,
57     + enum which_selector which)
58     +{
59     + if (likely(selector == 0)) {
60     + /*
61     + * On Intel (without X86_BUG_NULL_SEG), the segment base could
62     + * be the pre-existing saved base or it could be zero. On AMD
63     + * (with X86_BUG_NULL_SEG), the segment base could be almost
64     + * anything.
65     + *
66     + * This branch is very hot (it's hit twice on almost every
67     + * context switch between 64-bit programs), and avoiding
68     + * the RDMSR helps a lot, so we just assume that whatever
69     + * value is already saved is correct. This matches historical
70     + * Linux behavior, so it won't break existing applications.
71     + *
72     + * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
73     + * report that the base is zero, it needs to actually be zero:
74     + * see the corresponding logic in load_seg_legacy.
75     + */
76     + } else {
77     + /*
78     + * If the selector is 1, 2, or 3, then the base is zero on
79     + * !X86_BUG_NULL_SEG CPUs and could be anything on
80     + * X86_BUG_NULL_SEG CPUs. In the latter case, Linux
81     + * has never attempted to preserve the base across context
82     + * switches.
83     + *
84     + * If selector > 3, then it refers to a real segment, and
85     + * saving the base isn't necessary.
86     + */
87     + if (which == FS)
88     + prev_p->thread.fsbase = 0;
89     + else
90     + prev_p->thread.gsbase = 0;
91     + }
92     +}
93     +
94     +static __always_inline void save_fsgs(struct task_struct *task)
95     +{
96     + savesegment(fs, task->thread.fsindex);
97     + savesegment(gs, task->thread.gsindex);
98     + save_base_legacy(task, task->thread.fsindex, FS);
99     + save_base_legacy(task, task->thread.gsindex, GS);
100     +}
101     +
102     +static __always_inline void loadseg(enum which_selector which,
103     + unsigned short sel)
104     +{
105     + if (which == FS)
106     + loadsegment(fs, sel);
107     + else
108     + load_gs_index(sel);
109     +}
110     +
111     +static __always_inline void load_seg_legacy(unsigned short prev_index,
112     + unsigned long prev_base,
113     + unsigned short next_index,
114     + unsigned long next_base,
115     + enum which_selector which)
116     +{
117     + if (likely(next_index <= 3)) {
118     + /*
119     + * The next task is using 64-bit TLS, is not using this
120     + * segment at all, or is having fun with arcane CPU features.
121     + */
122     + if (next_base == 0) {
123     + /*
124     + * Nasty case: on AMD CPUs, we need to forcibly zero
125     + * the base.
126     + */
127     + if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
128     + loadseg(which, __USER_DS);
129     + loadseg(which, next_index);
130     + } else {
131     + /*
132     + * We could try to exhaustively detect cases
133     + * under which we can skip the segment load,
134     + * but there's really only one case that matters
135     + * for performance: if both the previous and
136     + * next states are fully zeroed, we can skip
137     + * the load.
138     + *
139     + * (This assumes that prev_base == 0 has no
140     + * false positives. This is the case on
141     + * Intel-style CPUs.)
142     + */
143     + if (likely(prev_index | next_index | prev_base))
144     + loadseg(which, next_index);
145     + }
146     + } else {
147     + if (prev_index != next_index)
148     + loadseg(which, next_index);
149     + wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
150     + next_base);
151     + }
152     + } else {
153     + /*
154     + * The next task is using a real segment. Loading the selector
155     + * is sufficient.
156     + */
157     + loadseg(which, next_index);
158     + }
159     +}
160     +
161     int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
162     unsigned long arg, struct task_struct *p, unsigned long tls)
163     {
164     @@ -216,10 +333,19 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
165     unsigned long new_sp,
166     unsigned int _cs, unsigned int _ss, unsigned int _ds)
167     {
168     + WARN_ON_ONCE(regs != current_pt_regs());
169     +
170     + if (static_cpu_has(X86_BUG_NULL_SEG)) {
171     + /* Loading zero below won't clear the base. */
172     + loadsegment(fs, __USER_DS);
173     + load_gs_index(__USER_DS);
174     + }
175     +
176     loadsegment(fs, 0);
177     loadsegment(es, _ds);
178     loadsegment(ds, _ds);
179     load_gs_index(0);
180     +
181     regs->ip = new_ip;
182     regs->sp = new_sp;
183     regs->cs = _cs;
184     @@ -264,7 +390,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
185     struct fpu *next_fpu = &next->fpu;
186     int cpu = smp_processor_id();
187     struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
188     - unsigned prev_fsindex, prev_gsindex;
189     fpu_switch_t fpu_switch;
190    
191     fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu);
192     @@ -274,8 +399,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
193     *
194     * (e.g. xen_load_tls())
195     */
196     - savesegment(fs, prev_fsindex);
197     - savesegment(gs, prev_gsindex);
198     + save_fsgs(prev_p);
199    
200     /*
201     * Load TLS before restoring any segments so that segment loads
202     @@ -314,108 +438,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
203     if (unlikely(next->ds | prev->ds))
204     loadsegment(ds, next->ds);
205    
206     - /*
207     - * Switch FS and GS.
208     - *
209     - * These are even more complicated than DS and ES: they have
210     - * 64-bit bases are that controlled by arch_prctl. The bases
211     - * don't necessarily match the selectors, as user code can do
212     - * any number of things to cause them to be inconsistent.
213     - *
214     - * We don't promise to preserve the bases if the selectors are
215     - * nonzero. We also don't promise to preserve the base if the
216     - * selector is zero and the base doesn't match whatever was
217     - * most recently passed to ARCH_SET_FS/GS. (If/when the
218     - * FSGSBASE instructions are enabled, we'll need to offer
219     - * stronger guarantees.)
220     - *
221     - * As an invariant,
222     - * (fsbase != 0 && fsindex != 0) || (gsbase != 0 && gsindex != 0) is
223     - * impossible.
224     - */
225     - if (next->fsindex) {
226     - /* Loading a nonzero value into FS sets the index and base. */
227     - loadsegment(fs, next->fsindex);
228     - } else {
229     - if (next->fsbase) {
230     - /* Next index is zero but next base is nonzero. */
231     - if (prev_fsindex)
232     - loadsegment(fs, 0);
233     - wrmsrl(MSR_FS_BASE, next->fsbase);
234     - } else {
235     - /* Next base and index are both zero. */
236     - if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
237     - /*
238     - * We don't know the previous base and can't
239     - * find out without RDMSR. Forcibly clear it.
240     - */
241     - loadsegment(fs, __USER_DS);
242     - loadsegment(fs, 0);
243     - } else {
244     - /*
245     - * If the previous index is zero and ARCH_SET_FS
246     - * didn't change the base, then the base is
247     - * also zero and we don't need to do anything.
248     - */
249     - if (prev->fsbase || prev_fsindex)
250     - loadsegment(fs, 0);
251     - }
252     - }
253     - }
254     - /*
255     - * Save the old state and preserve the invariant.
256     - * NB: if prev_fsindex == 0, then we can't reliably learn the base
257     - * without RDMSR because Intel user code can zero it without telling
258     - * us and AMD user code can program any 32-bit value without telling
259     - * us.
260     - */
261     - if (prev_fsindex)
262     - prev->fsbase = 0;
263     - prev->fsindex = prev_fsindex;
264     -
265     - if (next->gsindex) {
266     - /* Loading a nonzero value into GS sets the index and base. */
267     - load_gs_index(next->gsindex);
268     - } else {
269     - if (next->gsbase) {
270     - /* Next index is zero but next base is nonzero. */
271     - if (prev_gsindex)
272     - load_gs_index(0);
273     - wrmsrl(MSR_KERNEL_GS_BASE, next->gsbase);
274     - } else {
275     - /* Next base and index are both zero. */
276     - if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
277     - /*
278     - * We don't know the previous base and can't
279     - * find out without RDMSR. Forcibly clear it.
280     - *
281     - * This contains a pointless SWAPGS pair.
282     - * Fixing it would involve an explicit check
283     - * for Xen or a new pvop.
284     - */
285     - load_gs_index(__USER_DS);
286     - load_gs_index(0);
287     - } else {
288     - /*
289     - * If the previous index is zero and ARCH_SET_GS
290     - * didn't change the base, then the base is
291     - * also zero and we don't need to do anything.
292     - */
293     - if (prev->gsbase || prev_gsindex)
294     - load_gs_index(0);
295     - }
296     - }
297     - }
298     - /*
299     - * Save the old state and preserve the invariant.
300     - * NB: if prev_gsindex == 0, then we can't reliably learn the base
301     - * without RDMSR because Intel user code can zero it without telling
302     - * us and AMD user code can program any 32-bit value without telling
303     - * us.
304     - */
305     - if (prev_gsindex)
306     - prev->gsbase = 0;
307     - prev->gsindex = prev_gsindex;
308     + load_seg_legacy(prev->fsindex, prev->fsbase,
309     + next->fsindex, next->fsbase, FS);
310     + load_seg_legacy(prev->gsindex, prev->gsbase,
311     + next->gsindex, next->gsbase, GS);
312    
313     switch_fpu_finish(next_fpu, fpu_switch);
314    
315     diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
316     index 383f19c6bf24..549b4afd12e1 100644
317     --- a/drivers/md/raid5.c
318     +++ b/drivers/md/raid5.c
319     @@ -5844,6 +5844,8 @@ static void raid5_do_work(struct work_struct *work)
320    
321     spin_unlock_irq(&conf->device_lock);
322    
323     + r5l_flush_stripe_to_raid(conf->log);
324     +
325     async_tx_issue_pending_all();
326     blk_finish_plug(&plug);
327    
328     diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
329     index e8139514d32c..9e073fb6870a 100644
330     --- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
331     +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
332     @@ -317,12 +317,12 @@ int t4_wr_mbox_meat_timeout(struct adapter *adap, int mbox, const void *cmd,
333    
334     if (v != MBOX_OWNER_DRV) {
335     ret = (v == MBOX_OWNER_FW) ? -EBUSY : -ETIMEDOUT;
336     - t4_record_mbox(adap, cmd, MBOX_LEN, access, ret);
337     + t4_record_mbox(adap, cmd, size, access, ret);
338     return ret;
339     }
340    
341     /* Copy in the new mailbox command and send it on its way ... */
342     - t4_record_mbox(adap, cmd, MBOX_LEN, access, 0);
343     + t4_record_mbox(adap, cmd, size, access, 0);
344     for (i = 0; i < size; i += 8)
345     t4_write_reg64(adap, data_reg + i, be64_to_cpu(*p++));
346    
347     @@ -371,7 +371,7 @@ int t4_wr_mbox_meat_timeout(struct adapter *adap, int mbox, const void *cmd,
348     }
349    
350     ret = (pcie_fw & PCIE_FW_ERR_F) ? -ENXIO : -ETIMEDOUT;
351     - t4_record_mbox(adap, cmd, MBOX_LEN, access, ret);
352     + t4_record_mbox(adap, cmd, size, access, ret);
353     dev_err(adap->pdev_dev, "command %#x in mailbox %d timed out\n",
354     *(const u8 *)cmd, mbox);
355     t4_report_fw_error(adap);
356     diff --git a/drivers/net/ethernet/freescale/fman/mac.c b/drivers/net/ethernet/freescale/fman/mac.c
357     index 736db9d9b0ad..81021f87e4f3 100644
358     --- a/drivers/net/ethernet/freescale/fman/mac.c
359     +++ b/drivers/net/ethernet/freescale/fman/mac.c
360     @@ -622,6 +622,9 @@ static struct platform_device *dpaa_eth_add_device(int fman_id,
361     goto no_mem;
362     }
363    
364     + pdev->dev.of_node = node;
365     + pdev->dev.parent = priv->dev;
366     +
367     ret = platform_device_add_data(pdev, &data, sizeof(data));
368     if (ret)
369     goto err;
370     diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c
371     index 3f4e71148808..fd206889a433 100644
372     --- a/drivers/net/ethernet/freescale/gianfar.c
373     +++ b/drivers/net/ethernet/freescale/gianfar.c
374     @@ -3690,7 +3690,7 @@ static noinline void gfar_update_link_state(struct gfar_private *priv)
375     u32 tempval1 = gfar_read(&regs->maccfg1);
376     u32 tempval = gfar_read(&regs->maccfg2);
377     u32 ecntrl = gfar_read(&regs->ecntrl);
378     - u32 tx_flow_oldval = (tempval & MACCFG1_TX_FLOW);
379     + u32 tx_flow_oldval = (tempval1 & MACCFG1_TX_FLOW);
380    
381     if (phydev->duplex != priv->oldduplex) {
382     if (!(phydev->duplex))
383     diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
384     index f902c4d3de99..1806b1fc6e4c 100644
385     --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
386     +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
387     @@ -4172,6 +4172,8 @@ static int mlxsw_sp_netdevice_port_upper_event(struct net_device *dev,
388     return -EINVAL;
389     if (!info->linking)
390     break;
391     + if (netdev_has_any_upper_dev(upper_dev))
392     + return -EINVAL;
393     /* HW limitation forbids to put ports to multiple bridges. */
394     if (netif_is_bridge_master(upper_dev) &&
395     !mlxsw_sp_master_bridge_check(mlxsw_sp, upper_dev))
396     @@ -4185,6 +4187,10 @@ static int mlxsw_sp_netdevice_port_upper_event(struct net_device *dev,
397     if (netif_is_lag_port(dev) && is_vlan_dev(upper_dev) &&
398     !netif_is_lag_master(vlan_dev_real_dev(upper_dev)))
399     return -EINVAL;
400     + if (!info->linking)
401     + break;
402     + if (netdev_has_any_upper_dev(upper_dev))
403     + return -EINVAL;
404     break;
405     case NETDEV_CHANGEUPPER:
406     upper_dev = info->upper_dev;
407     diff --git a/drivers/net/ethernet/qlogic/qlge/qlge_dbg.c b/drivers/net/ethernet/qlogic/qlge/qlge_dbg.c
408     index 829be21f97b2..be258d90de9e 100644
409     --- a/drivers/net/ethernet/qlogic/qlge/qlge_dbg.c
410     +++ b/drivers/net/ethernet/qlogic/qlge/qlge_dbg.c
411     @@ -724,7 +724,7 @@ static void ql_build_coredump_seg_header(
412     seg_hdr->cookie = MPI_COREDUMP_COOKIE;
413     seg_hdr->segNum = seg_number;
414     seg_hdr->segSize = seg_size;
415     - memcpy(seg_hdr->description, desc, (sizeof(seg_hdr->description)) - 1);
416     + strncpy(seg_hdr->description, desc, (sizeof(seg_hdr->description)) - 1);
417     }
418    
419     /*
420     diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
421     index ff038e507fd6..36a04e182af1 100644
422     --- a/drivers/net/hyperv/netvsc_drv.c
423     +++ b/drivers/net/hyperv/netvsc_drv.c
424     @@ -1084,7 +1084,12 @@ static void netvsc_link_change(struct work_struct *w)
425     bool notify = false, reschedule = false;
426     unsigned long flags, next_reconfig, delay;
427    
428     - rtnl_lock();
429     + /* if changes are happening, comeback later */
430     + if (!rtnl_trylock()) {
431     + schedule_delayed_work(&ndev_ctx->dwork, LINKCHANGE_INT);
432     + return;
433     + }
434     +
435     if (ndev_ctx->start_remove)
436     goto out_unlock;
437    
438     diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c
439     index a5d66e205bb2..2caac0c37059 100644
440     --- a/drivers/net/macsec.c
441     +++ b/drivers/net/macsec.c
442     @@ -3510,6 +3510,7 @@ module_init(macsec_init);
443     module_exit(macsec_exit);
444    
445     MODULE_ALIAS_RTNL_LINK("macsec");
446     +MODULE_ALIAS_GENL_FAMILY("macsec");
447    
448     MODULE_DESCRIPTION("MACsec IEEE 802.1AE");
449     MODULE_LICENSE("GPL v2");
450     diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
451     index 775a6e1fdef9..6e12401b5102 100644
452     --- a/drivers/net/phy/phy.c
453     +++ b/drivers/net/phy/phy.c
454     @@ -674,9 +674,6 @@ void phy_stop_machine(struct phy_device *phydev)
455     if (phydev->state > PHY_UP && phydev->state != PHY_HALTED)
456     phydev->state = PHY_UP;
457     mutex_unlock(&phydev->lock);
458     -
459     - /* Now we can run the state machine synchronously */
460     - phy_state_machine(&phydev->state_queue.work);
461     }
462    
463     /**
464     diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
465     index 5dc128a8da83..96a0661011fd 100644
466     --- a/drivers/vhost/net.c
467     +++ b/drivers/vhost/net.c
468     @@ -537,8 +537,13 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
469    
470     preempt_enable();
471    
472     - if (vhost_enable_notify(&net->dev, vq))
473     + if (!vhost_vq_avail_empty(&net->dev, vq))
474     vhost_poll_queue(&vq->poll);
475     + else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
476     + vhost_disable_notify(&net->dev, vq);
477     + vhost_poll_queue(&vq->poll);
478     + }
479     +
480     mutex_unlock(&vq->mutex);
481    
482     len = peek_head_len(sk);
483     diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
484     index 2fc84a991325..98c1a63a4614 100644
485     --- a/fs/f2fs/recovery.c
486     +++ b/fs/f2fs/recovery.c
487     @@ -316,7 +316,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
488     return 0;
489    
490     /* Get the previous summary */
491     - for (i = CURSEG_WARM_DATA; i <= CURSEG_COLD_DATA; i++) {
492     + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
493     struct curseg_info *curseg = CURSEG_I(sbi, i);
494     if (curseg->segno == segno) {
495     sum = curseg->sum_blk->entries[blkoff];
496     @@ -626,8 +626,6 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
497     }
498    
499     clear_sbi_flag(sbi, SBI_POR_DOING);
500     - if (err)
501     - set_ckpt_flags(sbi, CP_ERROR_FLAG);
502     mutex_unlock(&sbi->cp_mutex);
503    
504     /* let's drop all the directory inodes for clean checkpoint */
505     diff --git a/fs/inode.c b/fs/inode.c
506     index 88110fd0b282..920aa0b1c6b0 100644
507     --- a/fs/inode.c
508     +++ b/fs/inode.c
509     @@ -637,6 +637,7 @@ void evict_inodes(struct super_block *sb)
510    
511     dispose_list(&dispose);
512     }
513     +EXPORT_SYMBOL_GPL(evict_inodes);
514    
515     /**
516     * invalidate_inodes - attempt to free all inodes on a superblock
517     diff --git a/fs/internal.h b/fs/internal.h
518     index f4da3341b4a3..8b7143b0211c 100644
519     --- a/fs/internal.h
520     +++ b/fs/internal.h
521     @@ -136,7 +136,6 @@ extern bool atime_needs_update_rcu(const struct path *, struct inode *);
522     extern void inode_io_list_del(struct inode *inode);
523    
524     extern long get_nr_dirty_inodes(void);
525     -extern void evict_inodes(struct super_block *);
526     extern int invalidate_inodes(struct super_block *, bool);
527    
528     /*
529     diff --git a/fs/iomap.c b/fs/iomap.c
530     index 798c291cbc75..a49db8806a3a 100644
531     --- a/fs/iomap.c
532     +++ b/fs/iomap.c
533     @@ -281,7 +281,7 @@ iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
534     unsigned long bytes; /* Bytes to write to page */
535    
536     offset = (pos & (PAGE_SIZE - 1));
537     - bytes = min_t(unsigned long, PAGE_SIZE - offset, length);
538     + bytes = min_t(loff_t, PAGE_SIZE - offset, length);
539    
540     rpage = __iomap_read_page(inode, pos);
541     if (IS_ERR(rpage))
542     @@ -376,7 +376,7 @@ iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
543     unsigned offset, bytes;
544    
545     offset = pos & (PAGE_SIZE - 1); /* Within page */
546     - bytes = min_t(unsigned, PAGE_SIZE - offset, count);
547     + bytes = min_t(loff_t, PAGE_SIZE - offset, count);
548    
549     if (IS_DAX(inode))
550     status = iomap_dax_zero(pos, offset, bytes, iomap);
551     diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
552     index 2852521fc8ec..c6c15e5717e4 100644
553     --- a/fs/xfs/libxfs/xfs_attr_leaf.c
554     +++ b/fs/xfs/libxfs/xfs_attr_leaf.c
555     @@ -351,7 +351,7 @@ xfs_attr3_leaf_read(
556    
557     err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
558     XFS_ATTR_FORK, &xfs_attr3_leaf_buf_ops);
559     - if (!err && tp)
560     + if (!err && tp && *bpp)
561     xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF);
562     return err;
563     }
564     diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
565     index 2a8cbd15d5d1..d2f4ab175096 100644
566     --- a/fs/xfs/libxfs/xfs_bmap.c
567     +++ b/fs/xfs/libxfs/xfs_bmap.c
568     @@ -579,7 +579,7 @@ xfs_bmap_validate_ret(
569    
570     #else
571     #define xfs_bmap_check_leaf_extents(cur, ip, whichfork) do { } while (0)
572     -#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap)
573     +#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) do { } while (0)
574     #endif /* DEBUG */
575    
576     /*
577     @@ -5555,6 +5555,8 @@ __xfs_bunmapi(
578     int whichfork; /* data or attribute fork */
579     xfs_fsblock_t sum;
580     xfs_filblks_t len = *rlen; /* length to unmap in file */
581     + xfs_fileoff_t max_len;
582     + xfs_agnumber_t prev_agno = NULLAGNUMBER, agno;
583    
584     trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_);
585    
586     @@ -5576,6 +5578,16 @@ __xfs_bunmapi(
587     ASSERT(len > 0);
588     ASSERT(nexts >= 0);
589    
590     + /*
591     + * Guesstimate how many blocks we can unmap without running the risk of
592     + * blowing out the transaction with a mix of EFIs and reflink
593     + * adjustments.
594     + */
595     + if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK)
596     + max_len = min(len, xfs_refcount_max_unmap(tp->t_log_res));
597     + else
598     + max_len = len;
599     +
600     if (!(ifp->if_flags & XFS_IFEXTENTS) &&
601     (error = xfs_iread_extents(tp, ip, whichfork)))
602     return error;
603     @@ -5621,7 +5633,7 @@ __xfs_bunmapi(
604    
605     extno = 0;
606     while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 &&
607     - (nexts == 0 || extno < nexts)) {
608     + (nexts == 0 || extno < nexts) && max_len > 0) {
609     /*
610     * Is the found extent after a hole in which bno lives?
611     * Just back up to the previous extent, if so.
612     @@ -5647,6 +5659,17 @@ __xfs_bunmapi(
613     ASSERT(ep != NULL);
614     del = got;
615     wasdel = isnullstartblock(del.br_startblock);
616     +
617     + /*
618     + * Make sure we don't touch multiple AGF headers out of order
619     + * in a single transaction, as that could cause AB-BA deadlocks.
620     + */
621     + if (!wasdel) {
622     + agno = XFS_FSB_TO_AGNO(mp, del.br_startblock);
623     + if (prev_agno != NULLAGNUMBER && prev_agno > agno)
624     + break;
625     + prev_agno = agno;
626     + }
627     if (got.br_startoff < start) {
628     del.br_startoff = start;
629     del.br_blockcount -= start - got.br_startoff;
630     @@ -5655,6 +5678,15 @@ __xfs_bunmapi(
631     }
632     if (del.br_startoff + del.br_blockcount > bno + 1)
633     del.br_blockcount = bno + 1 - del.br_startoff;
634     +
635     + /* How much can we safely unmap? */
636     + if (max_len < del.br_blockcount) {
637     + del.br_startoff += del.br_blockcount - max_len;
638     + if (!wasdel)
639     + del.br_startblock += del.br_blockcount - max_len;
640     + del.br_blockcount = max_len;
641     + }
642     +
643     sum = del.br_startblock + del.br_blockcount;
644     if (isrt &&
645     (mod = do_mod(sum, mp->m_sb.sb_rextsize))) {
646     @@ -5835,6 +5867,7 @@ __xfs_bunmapi(
647     if (!isrt && wasdel)
648     xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount, false);
649    
650     + max_len -= del.br_blockcount;
651     bno = del.br_startoff - 1;
652     nodelete:
653     /*
654     @@ -6604,25 +6637,33 @@ xfs_bmap_finish_one(
655     int whichfork,
656     xfs_fileoff_t startoff,
657     xfs_fsblock_t startblock,
658     - xfs_filblks_t blockcount,
659     + xfs_filblks_t *blockcount,
660     xfs_exntst_t state)
661     {
662     struct xfs_bmbt_irec bmap;
663     int nimaps = 1;
664     xfs_fsblock_t firstfsb;
665     int flags = XFS_BMAPI_REMAP;
666     - int done;
667     int error = 0;
668    
669     bmap.br_startblock = startblock;
670     bmap.br_startoff = startoff;
671     - bmap.br_blockcount = blockcount;
672     + bmap.br_blockcount = *blockcount;
673     bmap.br_state = state;
674    
675     + /*
676     + * firstfsb is tied to the transaction lifetime and is used to
677     + * ensure correct AG locking order and schedule work item
678     + * continuations. XFS_BUI_MAX_FAST_EXTENTS (== 1) restricts us
679     + * to only making one bmap call per transaction, so it should
680     + * be safe to have it as a local variable here.
681     + */
682     + firstfsb = NULLFSBLOCK;
683     +
684     trace_xfs_bmap_deferred(tp->t_mountp,
685     XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type,
686     XFS_FSB_TO_AGBNO(tp->t_mountp, startblock),
687     - ip->i_ino, whichfork, startoff, blockcount, state);
688     + ip->i_ino, whichfork, startoff, *blockcount, state);
689    
690     if (whichfork != XFS_DATA_FORK && whichfork != XFS_ATTR_FORK)
691     return -EFSCORRUPTED;
692     @@ -6641,12 +6682,11 @@ xfs_bmap_finish_one(
693     bmap.br_blockcount, flags, &firstfsb,
694     bmap.br_blockcount, &bmap, &nimaps,
695     dfops);
696     + *blockcount = 0;
697     break;
698     case XFS_BMAP_UNMAP:
699     - error = xfs_bunmapi(tp, ip, bmap.br_startoff,
700     - bmap.br_blockcount, flags, 1, &firstfsb,
701     - dfops, &done);
702     - ASSERT(done);
703     + error = __xfs_bunmapi(tp, ip, startoff, blockcount,
704     + XFS_BMAPI_REMAP, 1, &firstfsb, dfops);
705     break;
706     default:
707     ASSERT(0);
708     diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
709     index e7d40b39f18f..db53ac7ff6df 100644
710     --- a/fs/xfs/libxfs/xfs_bmap.h
711     +++ b/fs/xfs/libxfs/xfs_bmap.h
712     @@ -265,7 +265,7 @@ struct xfs_bmap_intent {
713     int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_defer_ops *dfops,
714     struct xfs_inode *ip, enum xfs_bmap_intent_type type,
715     int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock,
716     - xfs_filblks_t blockcount, xfs_exntst_t state);
717     + xfs_filblks_t *blockcount, xfs_exntst_t state);
718     int xfs_bmap_map_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
719     struct xfs_inode *ip, struct xfs_bmbt_irec *imap);
720     int xfs_bmap_unmap_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
721     diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
722     index 5c3918678bb6..9968a746c649 100644
723     --- a/fs/xfs/libxfs/xfs_bmap_btree.c
724     +++ b/fs/xfs/libxfs/xfs_bmap_btree.c
725     @@ -888,6 +888,7 @@ xfs_bmbt_change_owner(
726     cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork);
727     if (!cur)
728     return -ENOMEM;
729     + cur->bc_private.b.flags |= XFS_BTCUR_BPRV_INVALID_OWNER;
730    
731     error = xfs_btree_change_owner(cur, new_owner, buffer_list);
732     xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
733     diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
734     index 91c68913d495..4ad1e214b1b2 100644
735     --- a/fs/xfs/libxfs/xfs_btree.c
736     +++ b/fs/xfs/libxfs/xfs_btree.c
737     @@ -714,7 +714,8 @@ xfs_btree_firstrec(
738     * Get the block pointer for this level.
739     */
740     block = xfs_btree_get_block(cur, level, &bp);
741     - xfs_btree_check_block(cur, block, level, bp);
742     + if (xfs_btree_check_block(cur, block, level, bp))
743     + return 0;
744     /*
745     * It's empty, there is no such record.
746     */
747     @@ -743,7 +744,8 @@ xfs_btree_lastrec(
748     * Get the block pointer for this level.
749     */
750     block = xfs_btree_get_block(cur, level, &bp);
751     - xfs_btree_check_block(cur, block, level, bp);
752     + if (xfs_btree_check_block(cur, block, level, bp))
753     + return 0;
754     /*
755     * It's empty, there is no such record.
756     */
757     @@ -1772,6 +1774,7 @@ xfs_btree_lookup_get_block(
758    
759     /* Check the inode owner since the verifiers don't. */
760     if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) &&
761     + !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_INVALID_OWNER) &&
762     (cur->bc_flags & XFS_BTREE_LONG_PTRS) &&
763     be64_to_cpu((*blkp)->bb_u.l.bb_owner) !=
764     cur->bc_private.b.ip->i_ino)
765     @@ -4432,10 +4435,15 @@ xfs_btree_block_change_owner(
766    
767     /* modify the owner */
768     block = xfs_btree_get_block(cur, level, &bp);
769     - if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
770     + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
771     + if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner))
772     + return 0;
773     block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner);
774     - else
775     + } else {
776     + if (block->bb_u.s.bb_owner == cpu_to_be32(bbcoi->new_owner))
777     + return 0;
778     block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner);
779     + }
780    
781     /*
782     * If the block is a root block hosted in an inode, we might not have a
783     @@ -4444,16 +4452,19 @@ xfs_btree_block_change_owner(
784     * block is formatted into the on-disk inode fork. We still change it,
785     * though, so everything is consistent in memory.
786     */
787     - if (bp) {
788     - if (cur->bc_tp) {
789     - xfs_trans_ordered_buf(cur->bc_tp, bp);
790     + if (!bp) {
791     + ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
792     + ASSERT(level == cur->bc_nlevels - 1);
793     + return 0;
794     + }
795     +
796     + if (cur->bc_tp) {
797     + if (!xfs_trans_ordered_buf(cur->bc_tp, bp)) {
798     xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
799     - } else {
800     - xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
801     + return -EAGAIN;
802     }
803     } else {
804     - ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
805     - ASSERT(level == cur->bc_nlevels - 1);
806     + xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
807     }
808    
809     return 0;
810     diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
811     index 3b0fc1afada5..33c7be2357b9 100644
812     --- a/fs/xfs/libxfs/xfs_btree.h
813     +++ b/fs/xfs/libxfs/xfs_btree.h
814     @@ -268,7 +268,8 @@ typedef struct xfs_btree_cur
815     short forksize; /* fork's inode space */
816     char whichfork; /* data or attr fork */
817     char flags; /* flags */
818     -#define XFS_BTCUR_BPRV_WASDEL 1 /* was delayed */
819     +#define XFS_BTCUR_BPRV_WASDEL (1<<0) /* was delayed */
820     +#define XFS_BTCUR_BPRV_INVALID_OWNER (1<<1) /* for ext swap */
821     } b;
822     } bc_private; /* per-btree type data */
823     } xfs_btree_cur_t;
824     diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
825     index 1bdf2888295b..b305dbfd81c4 100644
826     --- a/fs/xfs/libxfs/xfs_da_btree.c
827     +++ b/fs/xfs/libxfs/xfs_da_btree.c
828     @@ -263,7 +263,7 @@ xfs_da3_node_read(
829    
830     err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
831     which_fork, &xfs_da3_node_buf_ops);
832     - if (!err && tp) {
833     + if (!err && tp && *bpp) {
834     struct xfs_da_blkinfo *info = (*bpp)->b_addr;
835     int type;
836    
837     diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
838     index aa17cb788946..43c902f7a68d 100644
839     --- a/fs/xfs/libxfs/xfs_dir2_block.c
840     +++ b/fs/xfs/libxfs/xfs_dir2_block.c
841     @@ -139,7 +139,7 @@ xfs_dir3_block_read(
842    
843     err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, -1, bpp,
844     XFS_DATA_FORK, &xfs_dir3_block_buf_ops);
845     - if (!err && tp)
846     + if (!err && tp && *bpp)
847     xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF);
848     return err;
849     }
850     diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
851     index b887fb2a2bcf..f2e342e05365 100644
852     --- a/fs/xfs/libxfs/xfs_dir2_leaf.c
853     +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
854     @@ -268,7 +268,7 @@ xfs_dir3_leaf_read(
855    
856     err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
857     XFS_DATA_FORK, &xfs_dir3_leaf1_buf_ops);
858     - if (!err && tp)
859     + if (!err && tp && *bpp)
860     xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAF1_BUF);
861     return err;
862     }
863     @@ -285,7 +285,7 @@ xfs_dir3_leafn_read(
864    
865     err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
866     XFS_DATA_FORK, &xfs_dir3_leafn_buf_ops);
867     - if (!err && tp)
868     + if (!err && tp && *bpp)
869     xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAFN_BUF);
870     return err;
871     }
872     diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
873     index a2818f6e8598..42fef0731e2a 100644
874     --- a/fs/xfs/libxfs/xfs_ialloc.c
875     +++ b/fs/xfs/libxfs/xfs_ialloc.c
876     @@ -368,8 +368,6 @@ xfs_ialloc_inode_init(
877     * transaction and pin the log appropriately.
878     */
879     xfs_trans_ordered_buf(tp, fbuf);
880     - xfs_trans_log_buf(tp, fbuf, 0,
881     - BBTOB(fbuf->b_length) - 1);
882     }
883     } else {
884     fbuf->b_flags |= XBF_DONE;
885     @@ -1123,6 +1121,7 @@ xfs_dialloc_ag_inobt(
886     int error;
887     int offset;
888     int i, j;
889     + int searchdistance = 10;
890    
891     pag = xfs_perag_get(mp, agno);
892    
893     @@ -1149,7 +1148,6 @@ xfs_dialloc_ag_inobt(
894     if (pagno == agno) {
895     int doneleft; /* done, to the left */
896     int doneright; /* done, to the right */
897     - int searchdistance = 10;
898    
899     error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
900     if (error)
901     @@ -1210,21 +1208,9 @@ xfs_dialloc_ag_inobt(
902     /*
903     * Loop until we find an inode chunk with a free inode.
904     */
905     - while (!doneleft || !doneright) {
906     + while (--searchdistance > 0 && (!doneleft || !doneright)) {
907     int useleft; /* using left inode chunk this time */
908    
909     - if (!--searchdistance) {
910     - /*
911     - * Not in range - save last search
912     - * location and allocate a new inode
913     - */
914     - xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
915     - pag->pagl_leftrec = trec.ir_startino;
916     - pag->pagl_rightrec = rec.ir_startino;
917     - pag->pagl_pagino = pagino;
918     - goto newino;
919     - }
920     -
921     /* figure out the closer block if both are valid. */
922     if (!doneleft && !doneright) {
923     useleft = pagino -
924     @@ -1236,13 +1222,13 @@ xfs_dialloc_ag_inobt(
925    
926     /* free inodes to the left? */
927     if (useleft && trec.ir_freecount) {
928     - rec = trec;
929     xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
930     cur = tcur;
931    
932     pag->pagl_leftrec = trec.ir_startino;
933     pag->pagl_rightrec = rec.ir_startino;
934     pag->pagl_pagino = pagino;
935     + rec = trec;
936     goto alloc_inode;
937     }
938    
939     @@ -1268,26 +1254,37 @@ xfs_dialloc_ag_inobt(
940     goto error1;
941     }
942    
943     - /*
944     - * We've reached the end of the btree. because
945     - * we are only searching a small chunk of the
946     - * btree each search, there is obviously free
947     - * inodes closer to the parent inode than we
948     - * are now. restart the search again.
949     - */
950     - pag->pagl_pagino = NULLAGINO;
951     - pag->pagl_leftrec = NULLAGINO;
952     - pag->pagl_rightrec = NULLAGINO;
953     - xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
954     - xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
955     - goto restart_pagno;
956     + if (searchdistance <= 0) {
957     + /*
958     + * Not in range - save last search
959     + * location and allocate a new inode
960     + */
961     + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
962     + pag->pagl_leftrec = trec.ir_startino;
963     + pag->pagl_rightrec = rec.ir_startino;
964     + pag->pagl_pagino = pagino;
965     +
966     + } else {
967     + /*
968     + * We've reached the end of the btree. because
969     + * we are only searching a small chunk of the
970     + * btree each search, there is obviously free
971     + * inodes closer to the parent inode than we
972     + * are now. restart the search again.
973     + */
974     + pag->pagl_pagino = NULLAGINO;
975     + pag->pagl_leftrec = NULLAGINO;
976     + pag->pagl_rightrec = NULLAGINO;
977     + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
978     + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
979     + goto restart_pagno;
980     + }
981     }
982    
983     /*
984     * In a different AG from the parent.
985     * See if the most recently allocated block has any free.
986     */
987     -newino:
988     if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
989     error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
990     XFS_LOOKUP_EQ, &i);
991     diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
992     index 8a37efe04de3..4e30448c4465 100644
993     --- a/fs/xfs/libxfs/xfs_inode_fork.c
994     +++ b/fs/xfs/libxfs/xfs_inode_fork.c
995     @@ -1539,14 +1539,11 @@ xfs_iext_realloc_indirect(
996     xfs_ifork_t *ifp, /* inode fork pointer */
997     int new_size) /* new indirection array size */
998     {
999     - int nlists; /* number of irec's (ex lists) */
1000     - int size; /* current indirection array size */
1001     -
1002     ASSERT(ifp->if_flags & XFS_IFEXTIREC);
1003     - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
1004     - size = nlists * sizeof(xfs_ext_irec_t);
1005     ASSERT(ifp->if_real_bytes);
1006     - ASSERT((new_size >= 0) && (new_size != size));
1007     + ASSERT((new_size >= 0) &&
1008     + (new_size != ((ifp->if_real_bytes / XFS_IEXT_BUFSZ) *
1009     + sizeof(xfs_ext_irec_t))));
1010     if (new_size == 0) {
1011     xfs_iext_destroy(ifp);
1012     } else {
1013     diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
1014     index 82a38d86ebad..d71cb63cdea3 100644
1015     --- a/fs/xfs/libxfs/xfs_refcount.c
1016     +++ b/fs/xfs/libxfs/xfs_refcount.c
1017     @@ -784,14 +784,6 @@ xfs_refcount_merge_extents(
1018     }
1019    
1020     /*
1021     - * While we're adjusting the refcounts records of an extent, we have
1022     - * to keep an eye on the number of extents we're dirtying -- run too
1023     - * many in a single transaction and we'll exceed the transaction's
1024     - * reservation and crash the fs. Each record adds 12 bytes to the
1025     - * log (plus any key updates) so we'll conservatively assume 24 bytes
1026     - * per record. We must also leave space for btree splits on both ends
1027     - * of the range and space for the CUD and a new CUI.
1028     - *
1029     * XXX: This is a pretty hand-wavy estimate. The penalty for guessing
1030     * true incorrectly is a shutdown FS; the penalty for guessing false
1031     * incorrectly is more transaction rolls than might be necessary.
1032     @@ -822,7 +814,7 @@ xfs_refcount_still_have_space(
1033     else if (overhead > cur->bc_tp->t_log_res)
1034     return false;
1035     return cur->bc_tp->t_log_res - overhead >
1036     - cur->bc_private.a.priv.refc.nr_ops * 32;
1037     + cur->bc_private.a.priv.refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD;
1038     }
1039    
1040     /*
1041     @@ -1648,6 +1640,10 @@ xfs_refcount_recover_cow_leftovers(
1042     error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
1043     if (error)
1044     goto out_trans;
1045     + if (!agbp) {
1046     + error = -ENOMEM;
1047     + goto out_trans;
1048     + }
1049     cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, NULL);
1050    
1051     /* Find all the leftover CoW staging extents. */
1052     diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
1053     index 098dc668ab2c..eafb9d1f3b37 100644
1054     --- a/fs/xfs/libxfs/xfs_refcount.h
1055     +++ b/fs/xfs/libxfs/xfs_refcount.h
1056     @@ -67,4 +67,20 @@ extern int xfs_refcount_free_cow_extent(struct xfs_mount *mp,
1057     extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
1058     xfs_agnumber_t agno);
1059    
1060     +/*
1061     + * While we're adjusting the refcounts records of an extent, we have
1062     + * to keep an eye on the number of extents we're dirtying -- run too
1063     + * many in a single transaction and we'll exceed the transaction's
1064     + * reservation and crash the fs. Each record adds 12 bytes to the
1065     + * log (plus any key updates) so we'll conservatively assume 32 bytes
1066     + * per record. We must also leave space for btree splits on both ends
1067     + * of the range and space for the CUD and a new CUI.
1068     + */
1069     +#define XFS_REFCOUNT_ITEM_OVERHEAD 32
1070     +
1071     +static inline xfs_fileoff_t xfs_refcount_max_unmap(int log_res)
1072     +{
1073     + return (log_res * 3 / 4) / XFS_REFCOUNT_ITEM_OVERHEAD;
1074     +}
1075     +
1076     #endif /* __XFS_REFCOUNT_H__ */
1077     diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
1078     index 578981412615..d23889e0bedc 100644
1079     --- a/fs/xfs/xfs_aops.c
1080     +++ b/fs/xfs/xfs_aops.c
1081     @@ -90,11 +90,11 @@ xfs_find_bdev_for_inode(
1082     * associated buffer_heads, paying attention to the start and end offsets that
1083     * we need to process on the page.
1084     *
1085     - * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last
1086     - * buffer in the IO. Once it does this, it is unsafe to access the bufferhead or
1087     - * the page at all, as we may be racing with memory reclaim and it can free both
1088     - * the bufferhead chain and the page as it will see the page as clean and
1089     - * unused.
1090     + * Note that we open code the action in end_buffer_async_write here so that we
1091     + * only have to iterate over the buffers attached to the page once. This is not
1092     + * only more efficient, but also ensures that we only calls end_page_writeback
1093     + * at the end of the iteration, and thus avoids the pitfall of having the page
1094     + * and buffers potentially freed after every call to end_buffer_async_write.
1095     */
1096     static void
1097     xfs_finish_page_writeback(
1098     @@ -102,29 +102,45 @@ xfs_finish_page_writeback(
1099     struct bio_vec *bvec,
1100     int error)
1101     {
1102     - unsigned int end = bvec->bv_offset + bvec->bv_len - 1;
1103     - struct buffer_head *head, *bh, *next;
1104     + struct buffer_head *head = page_buffers(bvec->bv_page), *bh = head;
1105     + bool busy = false;
1106     unsigned int off = 0;
1107     - unsigned int bsize;
1108     + unsigned long flags;
1109    
1110     ASSERT(bvec->bv_offset < PAGE_SIZE);
1111     ASSERT((bvec->bv_offset & (i_blocksize(inode) - 1)) == 0);
1112     - ASSERT(end < PAGE_SIZE);
1113     + ASSERT(bvec->bv_offset + bvec->bv_len <= PAGE_SIZE);
1114     ASSERT((bvec->bv_len & (i_blocksize(inode) - 1)) == 0);
1115    
1116     - bh = head = page_buffers(bvec->bv_page);
1117     -
1118     - bsize = bh->b_size;
1119     + local_irq_save(flags);
1120     + bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
1121     do {
1122     - if (off > end)
1123     - break;
1124     - next = bh->b_this_page;
1125     - if (off < bvec->bv_offset)
1126     - goto next_bh;
1127     - bh->b_end_io(bh, !error);
1128     -next_bh:
1129     - off += bsize;
1130     - } while ((bh = next) != head);
1131     + if (off >= bvec->bv_offset &&
1132     + off < bvec->bv_offset + bvec->bv_len) {
1133     + ASSERT(buffer_async_write(bh));
1134     + ASSERT(bh->b_end_io == NULL);
1135     +
1136     + if (error) {
1137     + mapping_set_error(bvec->bv_page->mapping, -EIO);
1138     + set_buffer_write_io_error(bh);
1139     + clear_buffer_uptodate(bh);
1140     + SetPageError(bvec->bv_page);
1141     + } else {
1142     + set_buffer_uptodate(bh);
1143     + }
1144     + clear_buffer_async_write(bh);
1145     + unlock_buffer(bh);
1146     + } else if (buffer_async_write(bh)) {
1147     + ASSERT(buffer_locked(bh));
1148     + busy = true;
1149     + }
1150     + off += bh->b_size;
1151     + } while ((bh = bh->b_this_page) != head);
1152     + bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
1153     + local_irq_restore(flags);
1154     +
1155     + if (!busy)
1156     + end_page_writeback(bvec->bv_page);
1157     }
1158    
1159     /*
1160     @@ -138,8 +154,10 @@ xfs_destroy_ioend(
1161     int error)
1162     {
1163     struct inode *inode = ioend->io_inode;
1164     - struct bio *last = ioend->io_bio;
1165     - struct bio *bio, *next;
1166     + struct bio *bio = &ioend->io_inline_bio;
1167     + struct bio *last = ioend->io_bio, *next;
1168     + u64 start = bio->bi_iter.bi_sector;
1169     + bool quiet = bio_flagged(bio, BIO_QUIET);
1170    
1171     for (bio = &ioend->io_inline_bio; bio; bio = next) {
1172     struct bio_vec *bvec;
1173     @@ -160,6 +178,11 @@ xfs_destroy_ioend(
1174    
1175     bio_put(bio);
1176     }
1177     +
1178     + if (unlikely(error && !quiet)) {
1179     + xfs_err_ratelimited(XFS_I(inode)->i_mount,
1180     + "writeback error on sector %llu", start);
1181     + }
1182     }
1183    
1184     /*
1185     @@ -427,7 +450,8 @@ xfs_start_buffer_writeback(
1186     ASSERT(!buffer_delay(bh));
1187     ASSERT(!buffer_unwritten(bh));
1188    
1189     - mark_buffer_async_write(bh);
1190     + bh->b_end_io = NULL;
1191     + set_buffer_async_write(bh);
1192     set_buffer_uptodate(bh);
1193     clear_buffer_dirty(bh);
1194     }
1195     @@ -1566,9 +1590,12 @@ xfs_vm_bmap(
1196     * The swap code (ab-)uses ->bmap to get a block mapping and then
1197     * bypasses the file system for actual I/O. We really can't allow
1198     * that on reflinks inodes, so we have to skip out here. And yes,
1199     - * 0 is the magic code for a bmap error..
1200     + * 0 is the magic code for a bmap error.
1201     + *
1202     + * Since we don't pass back blockdev info, we can't return bmap
1203     + * information for rt files either.
1204     */
1205     - if (xfs_is_reflink_inode(ip)) {
1206     + if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip)) {
1207     xfs_iunlock(ip, XFS_IOLOCK_SHARED);
1208     return 0;
1209     }
1210     diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
1211     index c4b90e794e41..5a54dcd7e7b1 100644
1212     --- a/fs/xfs/xfs_bmap_item.c
1213     +++ b/fs/xfs/xfs_bmap_item.c
1214     @@ -395,6 +395,7 @@ xfs_bui_recover(
1215     struct xfs_map_extent *bmap;
1216     xfs_fsblock_t startblock_fsb;
1217     xfs_fsblock_t inode_fsb;
1218     + xfs_filblks_t count;
1219     bool op_ok;
1220     struct xfs_bud_log_item *budp;
1221     enum xfs_bmap_intent_type type;
1222     @@ -403,6 +404,7 @@ xfs_bui_recover(
1223     struct xfs_trans *tp;
1224     struct xfs_inode *ip = NULL;
1225     struct xfs_defer_ops dfops;
1226     + struct xfs_bmbt_irec irec;
1227     xfs_fsblock_t firstfsb;
1228    
1229     ASSERT(!test_bit(XFS_BUI_RECOVERED, &buip->bui_flags));
1230     @@ -480,13 +482,24 @@ xfs_bui_recover(
1231     }
1232     xfs_trans_ijoin(tp, ip, 0);
1233    
1234     + count = bmap->me_len;
1235     error = xfs_trans_log_finish_bmap_update(tp, budp, &dfops, type,
1236     ip, whichfork, bmap->me_startoff,
1237     - bmap->me_startblock, bmap->me_len,
1238     - state);
1239     + bmap->me_startblock, &count, state);
1240     if (error)
1241     goto err_dfops;
1242    
1243     + if (count > 0) {
1244     + ASSERT(type == XFS_BMAP_UNMAP);
1245     + irec.br_startblock = bmap->me_startblock;
1246     + irec.br_blockcount = count;
1247     + irec.br_startoff = bmap->me_startoff;
1248     + irec.br_state = state;
1249     + error = xfs_bmap_unmap_extent(tp->t_mountp, &dfops, ip, &irec);
1250     + if (error)
1251     + goto err_dfops;
1252     + }
1253     +
1254     /* Finish transaction, free inodes. */
1255     error = xfs_defer_finish(&tp, &dfops, NULL);
1256     if (error)
1257     diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
1258     index 87b495e2f15a..5ffefac081f7 100644
1259     --- a/fs/xfs/xfs_bmap_util.c
1260     +++ b/fs/xfs/xfs_bmap_util.c
1261     @@ -1825,29 +1825,18 @@ xfs_swap_extent_forks(
1262     }
1263    
1264     /*
1265     - * Before we've swapped the forks, lets set the owners of the forks
1266     - * appropriately. We have to do this as we are demand paging the btree
1267     - * buffers, and so the validation done on read will expect the owner
1268     - * field to be correctly set. Once we change the owners, we can swap the
1269     - * inode forks.
1270     + * Btree format (v3) inodes have the inode number stamped in the bmbt
1271     + * block headers. We can't start changing the bmbt blocks until the
1272     + * inode owner change is logged so recovery does the right thing in the
1273     + * event of a crash. Set the owner change log flags now and leave the
1274     + * bmbt scan as the last step.
1275     */
1276     if (ip->i_d.di_version == 3 &&
1277     - ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
1278     + ip->i_d.di_format == XFS_DINODE_FMT_BTREE)
1279     (*target_log_flags) |= XFS_ILOG_DOWNER;
1280     - error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK,
1281     - tip->i_ino, NULL);
1282     - if (error)
1283     - return error;
1284     - }
1285     -
1286     if (tip->i_d.di_version == 3 &&
1287     - tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
1288     + tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
1289     (*src_log_flags) |= XFS_ILOG_DOWNER;
1290     - error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK,
1291     - ip->i_ino, NULL);
1292     - if (error)
1293     - return error;
1294     - }
1295    
1296     /*
1297     * Swap the data forks of the inodes
1298     @@ -1925,6 +1914,48 @@ xfs_swap_extent_forks(
1299     return 0;
1300     }
1301    
1302     +/*
1303     + * Fix up the owners of the bmbt blocks to refer to the current inode. The
1304     + * change owner scan attempts to order all modified buffers in the current
1305     + * transaction. In the event of ordered buffer failure, the offending buffer is
1306     + * physically logged as a fallback and the scan returns -EAGAIN. We must roll
1307     + * the transaction in this case to replenish the fallback log reservation and
1308     + * restart the scan. This process repeats until the scan completes.
1309     + */
1310     +static int
1311     +xfs_swap_change_owner(
1312     + struct xfs_trans **tpp,
1313     + struct xfs_inode *ip,
1314     + struct xfs_inode *tmpip)
1315     +{
1316     + int error;
1317     + struct xfs_trans *tp = *tpp;
1318     +
1319     + do {
1320     + error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, ip->i_ino,
1321     + NULL);
1322     + /* success or fatal error */
1323     + if (error != -EAGAIN)
1324     + break;
1325     +
1326     + error = xfs_trans_roll(tpp, NULL);
1327     + if (error)
1328     + break;
1329     + tp = *tpp;
1330     +
1331     + /*
1332     + * Redirty both inodes so they can relog and keep the log tail
1333     + * moving forward.
1334     + */
1335     + xfs_trans_ijoin(tp, ip, 0);
1336     + xfs_trans_ijoin(tp, tmpip, 0);
1337     + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1338     + xfs_trans_log_inode(tp, tmpip, XFS_ILOG_CORE);
1339     + } while (true);
1340     +
1341     + return error;
1342     +}
1343     +
1344     int
1345     xfs_swap_extents(
1346     struct xfs_inode *ip, /* target inode */
1347     @@ -1938,8 +1969,8 @@ xfs_swap_extents(
1348     int error = 0;
1349     int lock_flags;
1350     struct xfs_ifork *cowfp;
1351     - __uint64_t f;
1352     - int resblks;
1353     + uint64_t f;
1354     + int resblks = 0;
1355    
1356     /*
1357     * Lock the inodes against other IO, page faults and truncate to
1358     @@ -1987,11 +2018,8 @@ xfs_swap_extents(
1359     XFS_SWAP_RMAP_SPACE_RES(mp,
1360     XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK),
1361     XFS_DATA_FORK);
1362     - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks,
1363     - 0, 0, &tp);
1364     - } else
1365     - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0,
1366     - 0, 0, &tp);
1367     + }
1368     + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
1369     if (error)
1370     goto out_unlock;
1371    
1372     @@ -2076,6 +2104,23 @@ xfs_swap_extents(
1373     xfs_trans_log_inode(tp, ip, src_log_flags);
1374     xfs_trans_log_inode(tp, tip, target_log_flags);
1375    
1376     + /*
1377     + * The extent forks have been swapped, but crc=1,rmapbt=0 filesystems
1378     + * have inode number owner values in the bmbt blocks that still refer to
1379     + * the old inode. Scan each bmbt to fix up the owner values with the
1380     + * inode number of the current inode.
1381     + */
1382     + if (src_log_flags & XFS_ILOG_DOWNER) {
1383     + error = xfs_swap_change_owner(&tp, ip, tip);
1384     + if (error)
1385     + goto out_trans_cancel;
1386     + }
1387     + if (target_log_flags & XFS_ILOG_DOWNER) {
1388     + error = xfs_swap_change_owner(&tp, tip, ip);
1389     + if (error)
1390     + goto out_trans_cancel;
1391     + }
1392     +
1393     /*
1394     * If this is a synchronous mount, make sure that the
1395     * transaction goes to disk before returning to the user.
1396     diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
1397     index 16269271ebd6..eca7baecc9f0 100644
1398     --- a/fs/xfs/xfs_buf.c
1399     +++ b/fs/xfs/xfs_buf.c
1400     @@ -116,7 +116,7 @@ static inline void
1401     __xfs_buf_ioacct_dec(
1402     struct xfs_buf *bp)
1403     {
1404     - ASSERT(spin_is_locked(&bp->b_lock));
1405     + lockdep_assert_held(&bp->b_lock);
1406    
1407     if (bp->b_state & XFS_BSTATE_IN_FLIGHT) {
1408     bp->b_state &= ~XFS_BSTATE_IN_FLIGHT;
1409     @@ -2022,6 +2022,66 @@ xfs_buf_delwri_submit(
1410     return error;
1411     }
1412    
1413     +/*
1414     + * Push a single buffer on a delwri queue.
1415     + *
1416     + * The purpose of this function is to submit a single buffer of a delwri queue
1417     + * and return with the buffer still on the original queue. The waiting delwri
1418     + * buffer submission infrastructure guarantees transfer of the delwri queue
1419     + * buffer reference to a temporary wait list. We reuse this infrastructure to
1420     + * transfer the buffer back to the original queue.
1421     + *
1422     + * Note the buffer transitions from the queued state, to the submitted and wait
1423     + * listed state and back to the queued state during this call. The buffer
1424     + * locking and queue management logic between _delwri_pushbuf() and
1425     + * _delwri_queue() guarantee that the buffer cannot be queued to another list
1426     + * before returning.
1427     + */
1428     +int
1429     +xfs_buf_delwri_pushbuf(
1430     + struct xfs_buf *bp,
1431     + struct list_head *buffer_list)
1432     +{
1433     + LIST_HEAD (submit_list);
1434     + int error;
1435     +
1436     + ASSERT(bp->b_flags & _XBF_DELWRI_Q);
1437     +
1438     + trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_);
1439     +
1440     + /*
1441     + * Isolate the buffer to a new local list so we can submit it for I/O
1442     + * independently from the rest of the original list.
1443     + */
1444     + xfs_buf_lock(bp);
1445     + list_move(&bp->b_list, &submit_list);
1446     + xfs_buf_unlock(bp);
1447     +
1448     + /*
1449     + * Delwri submission clears the DELWRI_Q buffer flag and returns with
1450     + * the buffer on the wait list with an associated reference. Rather than
1451     + * bounce the buffer from a local wait list back to the original list
1452     + * after I/O completion, reuse the original list as the wait list.
1453     + */
1454     + xfs_buf_delwri_submit_buffers(&submit_list, buffer_list);
1455     +
1456     + /*
1457     + * The buffer is now under I/O and wait listed as during typical delwri
1458     + * submission. Lock the buffer to wait for I/O completion. Rather than
1459     + * remove the buffer from the wait list and release the reference, we
1460     + * want to return with the buffer queued to the original list. The
1461     + * buffer already sits on the original list with a wait list reference,
1462     + * however. If we let the queue inherit that wait list reference, all we
1463     + * need to do is reset the DELWRI_Q flag.
1464     + */
1465     + xfs_buf_lock(bp);
1466     + error = bp->b_error;
1467     + bp->b_flags |= _XBF_DELWRI_Q;
1468     + xfs_buf_unlock(bp);
1469     +
1470     + return error;
1471     +}
1472     +
1473     int __init
1474     xfs_buf_init(void)
1475     {
1476     diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
1477     index ad514a8025dd..f961b19b9cc2 100644
1478     --- a/fs/xfs/xfs_buf.h
1479     +++ b/fs/xfs/xfs_buf.h
1480     @@ -333,6 +333,7 @@ extern void xfs_buf_delwri_cancel(struct list_head *);
1481     extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *);
1482     extern int xfs_buf_delwri_submit(struct list_head *);
1483     extern int xfs_buf_delwri_submit_nowait(struct list_head *);
1484     +extern int xfs_buf_delwri_pushbuf(struct xfs_buf *, struct list_head *);
1485    
1486     /* Buffer Daemon Setup Routines */
1487     extern int xfs_buf_init(void);
1488     diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
1489     index 0306168af332..e0a0af0946f2 100644
1490     --- a/fs/xfs/xfs_buf_item.c
1491     +++ b/fs/xfs/xfs_buf_item.c
1492     @@ -29,6 +29,7 @@
1493     #include "xfs_error.h"
1494     #include "xfs_trace.h"
1495     #include "xfs_log.h"
1496     +#include "xfs_inode.h"
1497    
1498    
1499     kmem_zone_t *xfs_buf_item_zone;
1500     @@ -322,6 +323,8 @@ xfs_buf_item_format(
1501     ASSERT((bip->bli_flags & XFS_BLI_STALE) ||
1502     (xfs_blft_from_flags(&bip->__bli_format) > XFS_BLFT_UNKNOWN_BUF
1503     && xfs_blft_from_flags(&bip->__bli_format) < XFS_BLFT_MAX_BUF));
1504     + ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED) ||
1505     + (bip->bli_flags & XFS_BLI_STALE));
1506    
1507    
1508     /*
1509     @@ -346,16 +349,6 @@ xfs_buf_item_format(
1510     bip->bli_flags &= ~XFS_BLI_INODE_BUF;
1511     }
1512    
1513     - if ((bip->bli_flags & (XFS_BLI_ORDERED|XFS_BLI_STALE)) ==
1514     - XFS_BLI_ORDERED) {
1515     - /*
1516     - * The buffer has been logged just to order it. It is not being
1517     - * included in the transaction commit, so don't format it.
1518     - */
1519     - trace_xfs_buf_item_format_ordered(bip);
1520     - return;
1521     - }
1522     -
1523     for (i = 0; i < bip->bli_format_count; i++) {
1524     xfs_buf_item_format_segment(bip, lv, &vecp, offset,
1525     &bip->bli_formats[i]);
1526     @@ -574,26 +567,20 @@ xfs_buf_item_unlock(
1527     {
1528     struct xfs_buf_log_item *bip = BUF_ITEM(lip);
1529     struct xfs_buf *bp = bip->bli_buf;
1530     - bool clean;
1531     - bool aborted;
1532     - int flags;
1533     + bool aborted = !!(lip->li_flags & XFS_LI_ABORTED);
1534     + bool hold = !!(bip->bli_flags & XFS_BLI_HOLD);
1535     + bool dirty = !!(bip->bli_flags & XFS_BLI_DIRTY);
1536     +#if defined(DEBUG) || defined(XFS_WARN)
1537     + bool ordered = !!(bip->bli_flags & XFS_BLI_ORDERED);
1538     +#endif
1539    
1540     /* Clear the buffer's association with this transaction. */
1541     bp->b_transp = NULL;
1542    
1543     /*
1544     - * If this is a transaction abort, don't return early. Instead, allow
1545     - * the brelse to happen. Normally it would be done for stale
1546     - * (cancelled) buffers at unpin time, but we'll never go through the
1547     - * pin/unpin cycle if we abort inside commit.
1548     - */
1549     - aborted = (lip->li_flags & XFS_LI_ABORTED) ? true : false;
1550     - /*
1551     - * Before possibly freeing the buf item, copy the per-transaction state
1552     - * so we can reference it safely later after clearing it from the
1553     - * buffer log item.
1554     + * The per-transaction state has been copied above so clear it from the
1555     + * bli.
1556     */
1557     - flags = bip->bli_flags;
1558     bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED);
1559    
1560     /*
1561     @@ -601,7 +588,7 @@ xfs_buf_item_unlock(
1562     * unlock the buffer and free the buf item when the buffer is unpinned
1563     * for the last time.
1564     */
1565     - if (flags & XFS_BLI_STALE) {
1566     + if (bip->bli_flags & XFS_BLI_STALE) {
1567     trace_xfs_buf_item_unlock_stale(bip);
1568     ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
1569     if (!aborted) {
1570     @@ -619,40 +606,34 @@ xfs_buf_item_unlock(
1571     * regardless of whether it is dirty or not. A dirty abort implies a
1572     * shutdown, anyway.
1573     *
1574     - * Ordered buffers are dirty but may have no recorded changes, so ensure
1575     - * we only release clean items here.
1576     + * The bli dirty state should match whether the blf has logged segments
1577     + * except for ordered buffers, where only the bli should be dirty.
1578     */
1579     - clean = (flags & XFS_BLI_DIRTY) ? false : true;
1580     - if (clean) {
1581     - int i;
1582     - for (i = 0; i < bip->bli_format_count; i++) {
1583     - if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
1584     - bip->bli_formats[i].blf_map_size)) {
1585     - clean = false;
1586     - break;
1587     - }
1588     - }
1589     - }
1590     + ASSERT((!ordered && dirty == xfs_buf_item_dirty_format(bip)) ||
1591     + (ordered && dirty && !xfs_buf_item_dirty_format(bip)));
1592    
1593     /*
1594     * Clean buffers, by definition, cannot be in the AIL. However, aborted
1595     - * buffers may be dirty and hence in the AIL. Therefore if we are
1596     - * aborting a buffer and we've just taken the last refernce away, we
1597     - * have to check if it is in the AIL before freeing it. We need to free
1598     - * it in this case, because an aborted transaction has already shut the
1599     - * filesystem down and this is the last chance we will have to do so.
1600     + * buffers may be in the AIL regardless of dirty state. An aborted
1601     + * transaction that invalidates a buffer already in the AIL may have
1602     + * marked it stale and cleared the dirty state, for example.
1603     + *
1604     + * Therefore if we are aborting a buffer and we've just taken the last
1605     + * reference away, we have to check if it is in the AIL before freeing
1606     + * it. We need to free it in this case, because an aborted transaction
1607     + * has already shut the filesystem down and this is the last chance we
1608     + * will have to do so.
1609     */
1610     if (atomic_dec_and_test(&bip->bli_refcount)) {
1611     - if (clean)
1612     - xfs_buf_item_relse(bp);
1613     - else if (aborted) {
1614     + if (aborted) {
1615     ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp));
1616     xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR);
1617     xfs_buf_item_relse(bp);
1618     - }
1619     + } else if (!dirty)
1620     + xfs_buf_item_relse(bp);
1621     }
1622    
1623     - if (!(flags & XFS_BLI_HOLD))
1624     + if (!hold)
1625     xfs_buf_relse(bp);
1626     }
1627    
1628     @@ -942,14 +923,22 @@ xfs_buf_item_log(
1629    
1630    
1631     /*
1632     - * Return 1 if the buffer has been logged or ordered in a transaction (at any
1633     - * point, not just the current transaction) and 0 if not.
1634     + * Return true if the buffer has any ranges logged/dirtied by a transaction,
1635     + * false otherwise.
1636     */
1637     -uint
1638     -xfs_buf_item_dirty(
1639     - xfs_buf_log_item_t *bip)
1640     +bool
1641     +xfs_buf_item_dirty_format(
1642     + struct xfs_buf_log_item *bip)
1643     {
1644     - return (bip->bli_flags & XFS_BLI_DIRTY);
1645     + int i;
1646     +
1647     + for (i = 0; i < bip->bli_format_count; i++) {
1648     + if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
1649     + bip->bli_formats[i].blf_map_size))
1650     + return true;
1651     + }
1652     +
1653     + return false;
1654     }
1655    
1656     STATIC void
1657     @@ -1051,6 +1040,31 @@ xfs_buf_do_callbacks(
1658     }
1659     }
1660    
1661     +/*
1662     + * Invoke the error state callback for each log item affected by the failed I/O.
1663     + *
1664     + * If a metadata buffer write fails with a non-permanent error, the buffer is
1665     + * eventually resubmitted and so the completion callbacks are not run. The error
1666     + * state may need to be propagated to the log items attached to the buffer,
1667     + * however, so the next AIL push of the item knows how to handle it correctly.
1668     + */
1669     +STATIC void
1670     +xfs_buf_do_callbacks_fail(
1671     + struct xfs_buf *bp)
1672     +{
1673     + struct xfs_log_item *next;
1674     + struct xfs_log_item *lip = bp->b_fspriv;
1675     + struct xfs_ail *ailp = lip->li_ailp;
1676     +
1677     + spin_lock(&ailp->xa_lock);
1678     + for (; lip; lip = next) {
1679     + next = lip->li_bio_list;
1680     + if (lip->li_ops->iop_error)
1681     + lip->li_ops->iop_error(lip, bp);
1682     + }
1683     + spin_unlock(&ailp->xa_lock);
1684     +}
1685     +
1686     static bool
1687     xfs_buf_iodone_callback_error(
1688     struct xfs_buf *bp)
1689     @@ -1120,7 +1134,11 @@ xfs_buf_iodone_callback_error(
1690     if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount)
1691     goto permanent_error;
1692    
1693     - /* still a transient error, higher layers will retry */
1694     + /*
1695     + * Still a transient error, run IO completion failure callbacks and let
1696     + * the higher layers retry the buffer.
1697     + */
1698     + xfs_buf_do_callbacks_fail(bp);
1699     xfs_buf_ioerror(bp, 0);
1700     xfs_buf_relse(bp);
1701     return true;
1702     @@ -1201,3 +1219,31 @@ xfs_buf_iodone(
1703     xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE);
1704     xfs_buf_item_free(BUF_ITEM(lip));
1705     }
1706     +
1707     +/*
1708     + * Requeue a failed buffer for writeback
1709     + *
1710     + * Return true if the buffer has been re-queued properly, false otherwise
1711     + */
1712     +bool
1713     +xfs_buf_resubmit_failed_buffers(
1714     + struct xfs_buf *bp,
1715     + struct xfs_log_item *lip,
1716     + struct list_head *buffer_list)
1717     +{
1718     + struct xfs_log_item *next;
1719     +
1720     + /*
1721     + * Clear XFS_LI_FAILED flag from all items before resubmit
1722     + *
1723     + * XFS_LI_FAILED set/clear is protected by xa_lock; the caller of this
1724     + * function must already hold it.
1725     + */
1726     + for (; lip; lip = next) {
1727     + next = lip->li_bio_list;
1728     + xfs_clear_li_failed(lip);
1729     + }
1730     +
1731     + /* Add this buffer back to the delayed write list */
1732     + return xfs_buf_delwri_queue(bp, buffer_list);
1733     +}
1734     diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
1735     index f7eba99d19dd..9690ce62c9a7 100644
1736     --- a/fs/xfs/xfs_buf_item.h
1737     +++ b/fs/xfs/xfs_buf_item.h
1738     @@ -64,12 +64,15 @@ typedef struct xfs_buf_log_item {
1739     int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
1740     void xfs_buf_item_relse(struct xfs_buf *);
1741     void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint);
1742     -uint xfs_buf_item_dirty(xfs_buf_log_item_t *);
1743     +bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *);
1744     void xfs_buf_attach_iodone(struct xfs_buf *,
1745     void(*)(struct xfs_buf *, xfs_log_item_t *),
1746     xfs_log_item_t *);
1747     void xfs_buf_iodone_callbacks(struct xfs_buf *);
1748     void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *);
1749     +bool xfs_buf_resubmit_failed_buffers(struct xfs_buf *,
1750     + struct xfs_log_item *,
1751     + struct list_head *);
1752    
1753     extern kmem_zone_t *xfs_buf_item_zone;
1754    
1755     diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
1756     index df206cfc21f7..586b398f268d 100644
1757     --- a/fs/xfs/xfs_file.c
1758     +++ b/fs/xfs/xfs_file.c
1759     @@ -729,6 +729,7 @@ xfs_file_buffered_aio_write(
1760     xfs_rw_iunlock(ip, iolock);
1761     eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
1762     xfs_icache_free_eofblocks(ip->i_mount, &eofb);
1763     + xfs_icache_free_cowblocks(ip->i_mount, &eofb);
1764     goto write_retry;
1765     }
1766    
1767     @@ -1139,29 +1140,8 @@ xfs_find_get_desired_pgoff(
1768     want = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1;
1769     nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
1770     want);
1771     - /*
1772     - * No page mapped into given range. If we are searching holes
1773     - * and if this is the first time we got into the loop, it means
1774     - * that the given offset is landed in a hole, return it.
1775     - *
1776     - * If we have already stepped through some block buffers to find
1777     - * holes but they all contains data. In this case, the last
1778     - * offset is already updated and pointed to the end of the last
1779     - * mapped page, if it does not reach the endpoint to search,
1780     - * that means there should be a hole between them.
1781     - */
1782     - if (nr_pages == 0) {
1783     - /* Data search found nothing */
1784     - if (type == DATA_OFF)
1785     - break;
1786     -
1787     - ASSERT(type == HOLE_OFF);
1788     - if (lastoff == startoff || lastoff < endoff) {
1789     - found = true;
1790     - *offset = lastoff;
1791     - }
1792     + if (nr_pages == 0)
1793     break;
1794     - }
1795    
1796     for (i = 0; i < nr_pages; i++) {
1797     struct page *page = pvec.pages[i];
1798     @@ -1227,21 +1207,20 @@ xfs_find_get_desired_pgoff(
1799    
1800     /*
1801     * The number of returned pages less than our desired, search
1802     - * done. In this case, nothing was found for searching data,
1803     - * but we found a hole behind the last offset.
1804     + * done.
1805     */
1806     - if (nr_pages < want) {
1807     - if (type == HOLE_OFF) {
1808     - *offset = lastoff;
1809     - found = true;
1810     - }
1811     + if (nr_pages < want)
1812     break;
1813     - }
1814    
1815     index = pvec.pages[i - 1]->index + 1;
1816     pagevec_release(&pvec);
1817     } while (index <= end);
1818    
1819     + /* No page at lastoff and we are not done - we found a hole. */
1820     + if (type == HOLE_OFF && lastoff < endoff) {
1821     + *offset = lastoff;
1822     + found = true;
1823     + }
1824     out:
1825     pagevec_release(&pvec);
1826     return found;
1827     diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
1828     index 74304b6ce84b..86a4911520cc 100644
1829     --- a/fs/xfs/xfs_icache.c
1830     +++ b/fs/xfs/xfs_icache.c
1831     @@ -66,7 +66,6 @@ xfs_inode_alloc(
1832    
1833     XFS_STATS_INC(mp, vn_active);
1834     ASSERT(atomic_read(&ip->i_pincount) == 0);
1835     - ASSERT(!spin_is_locked(&ip->i_flags_lock));
1836     ASSERT(!xfs_isiflocked(ip));
1837     ASSERT(ip->i_ino == 0);
1838    
1839     @@ -192,7 +191,7 @@ xfs_perag_set_reclaim_tag(
1840     {
1841     struct xfs_mount *mp = pag->pag_mount;
1842    
1843     - ASSERT(spin_is_locked(&pag->pag_ici_lock));
1844     + lockdep_assert_held(&pag->pag_ici_lock);
1845     if (pag->pag_ici_reclaimable++)
1846     return;
1847    
1848     @@ -214,7 +213,7 @@ xfs_perag_clear_reclaim_tag(
1849     {
1850     struct xfs_mount *mp = pag->pag_mount;
1851    
1852     - ASSERT(spin_is_locked(&pag->pag_ici_lock));
1853     + lockdep_assert_held(&pag->pag_ici_lock);
1854     if (--pag->pag_ici_reclaimable)
1855     return;
1856    
1857     @@ -1079,11 +1078,11 @@ xfs_reclaim_inode(
1858     * Because we use RCU freeing we need to ensure the inode always appears
1859     * to be reclaimed with an invalid inode number when in the free state.
1860     * We do this as early as possible under the ILOCK so that
1861     - * xfs_iflush_cluster() can be guaranteed to detect races with us here.
1862     - * By doing this, we guarantee that once xfs_iflush_cluster has locked
1863     - * XFS_ILOCK that it will see either a valid, flushable inode that will
1864     - * serialise correctly, or it will see a clean (and invalid) inode that
1865     - * it can skip.
1866     + * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to
1867     + * detect races with us here. By doing this, we guarantee that once
1868     + * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that
1869     + * it will see either a valid inode that will serialise correctly, or it
1870     + * will see an invalid inode that it can skip.
1871     */
1872     spin_lock(&ip->i_flags_lock);
1873     ip->i_flags = XFS_IRECLAIM;
1874     diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
1875     index 7a0b4eeb99e4..9e795ab08a53 100644
1876     --- a/fs/xfs/xfs_inode.c
1877     +++ b/fs/xfs/xfs_inode.c
1878     @@ -881,7 +881,6 @@ xfs_ialloc(
1879     case S_IFREG:
1880     case S_IFDIR:
1881     if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
1882     - uint64_t di_flags2 = 0;
1883     uint di_flags = 0;
1884    
1885     if (S_ISDIR(mode)) {
1886     @@ -918,20 +917,23 @@ xfs_ialloc(
1887     di_flags |= XFS_DIFLAG_NODEFRAG;
1888     if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
1889     di_flags |= XFS_DIFLAG_FILESTREAM;
1890     - if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX)
1891     - di_flags2 |= XFS_DIFLAG2_DAX;
1892    
1893     ip->i_d.di_flags |= di_flags;
1894     - ip->i_d.di_flags2 |= di_flags2;
1895     }
1896     if (pip &&
1897     (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY) &&
1898     pip->i_d.di_version == 3 &&
1899     ip->i_d.di_version == 3) {
1900     + uint64_t di_flags2 = 0;
1901     +
1902     if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) {
1903     - ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
1904     + di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
1905     ip->i_d.di_cowextsize = pip->i_d.di_cowextsize;
1906     }
1907     + if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX)
1908     + di_flags2 |= XFS_DIFLAG2_DAX;
1909     +
1910     + ip->i_d.di_flags2 |= di_flags2;
1911     }
1912     /* FALLTHROUGH */
1913     case S_IFLNK:
1914     @@ -2366,11 +2368,24 @@ xfs_ifree_cluster(
1915     * already marked stale. If we can't lock it, back off
1916     * and retry.
1917     */
1918     - if (ip != free_ip &&
1919     - !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
1920     - rcu_read_unlock();
1921     - delay(1);
1922     - goto retry;
1923     + if (ip != free_ip) {
1924     + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
1925     + rcu_read_unlock();
1926     + delay(1);
1927     + goto retry;
1928     + }
1929     +
1930     + /*
1931     + * Check the inode number again in case we're
1932     + * racing with freeing in xfs_reclaim_inode().
1933     + * See the comments in that function for more
1934     + * information as to why the initial check is
1935     + * not sufficient.
1936     + */
1937     + if (ip->i_ino != inum + i) {
1938     + xfs_iunlock(ip, XFS_ILOCK_EXCL);
1939     + continue;
1940     + }
1941     }
1942     rcu_read_unlock();
1943    
1944     diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
1945     index d90e7811ccdd..94915747042c 100644
1946     --- a/fs/xfs/xfs_inode_item.c
1947     +++ b/fs/xfs/xfs_inode_item.c
1948     @@ -27,6 +27,7 @@
1949     #include "xfs_error.h"
1950     #include "xfs_trace.h"
1951     #include "xfs_trans_priv.h"
1952     +#include "xfs_buf_item.h"
1953     #include "xfs_log.h"
1954    
1955    
1956     @@ -475,6 +476,23 @@ xfs_inode_item_unpin(
1957     wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT);
1958     }
1959    
1960     +/*
1961     + * Callback used to mark a buffer with XFS_LI_FAILED when items in the buffer
1962     + * have been failed during writeback
1963     + *
1964     + * This informs the AIL that the inode is already flush locked on the next push,
1965     + * and acquires a hold on the buffer to ensure that it isn't reclaimed before
1966     + * dirty data makes it to disk.
1967     + */
1968     +STATIC void
1969     +xfs_inode_item_error(
1970     + struct xfs_log_item *lip,
1971     + struct xfs_buf *bp)
1972     +{
1973     + ASSERT(xfs_isiflocked(INODE_ITEM(lip)->ili_inode));
1974     + xfs_set_li_failed(lip, bp);
1975     +}
1976     +
1977     STATIC uint
1978     xfs_inode_item_push(
1979     struct xfs_log_item *lip,
1980     @@ -484,13 +502,28 @@ xfs_inode_item_push(
1981     {
1982     struct xfs_inode_log_item *iip = INODE_ITEM(lip);
1983     struct xfs_inode *ip = iip->ili_inode;
1984     - struct xfs_buf *bp = NULL;
1985     + struct xfs_buf *bp = lip->li_buf;
1986     uint rval = XFS_ITEM_SUCCESS;
1987     int error;
1988    
1989     if (xfs_ipincount(ip) > 0)
1990     return XFS_ITEM_PINNED;
1991    
1992     + /*
1993     + * The buffer containing this item failed to be written back
1994     + * previously. Resubmit the buffer for IO.
1995     + */
1996     + if (lip->li_flags & XFS_LI_FAILED) {
1997     + if (!xfs_buf_trylock(bp))
1998     + return XFS_ITEM_LOCKED;
1999     +
2000     + if (!xfs_buf_resubmit_failed_buffers(bp, lip, buffer_list))
2001     + rval = XFS_ITEM_FLUSHING;
2002     +
2003     + xfs_buf_unlock(bp);
2004     + return rval;
2005     + }
2006     +
2007     if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
2008     return XFS_ITEM_LOCKED;
2009    
2010     @@ -622,7 +655,8 @@ static const struct xfs_item_ops xfs_inode_item_ops = {
2011     .iop_unlock = xfs_inode_item_unlock,
2012     .iop_committed = xfs_inode_item_committed,
2013     .iop_push = xfs_inode_item_push,
2014     - .iop_committing = xfs_inode_item_committing
2015     + .iop_committing = xfs_inode_item_committing,
2016     + .iop_error = xfs_inode_item_error
2017     };
2018    
2019    
2020     @@ -710,7 +744,8 @@ xfs_iflush_done(
2021     * the AIL lock.
2022     */
2023     iip = INODE_ITEM(blip);
2024     - if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn)
2025     + if ((iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) ||
2026     + lip->li_flags & XFS_LI_FAILED)
2027     need_ail++;
2028    
2029     blip = next;
2030     @@ -718,7 +753,8 @@ xfs_iflush_done(
2031    
2032     /* make sure we capture the state of the initial inode. */
2033     iip = INODE_ITEM(lip);
2034     - if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn)
2035     + if ((iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) ||
2036     + lip->li_flags & XFS_LI_FAILED)
2037     need_ail++;
2038    
2039     /*
2040     @@ -731,22 +767,30 @@ xfs_iflush_done(
2041     * holding the lock before removing the inode from the AIL.
2042     */
2043     if (need_ail) {
2044     - struct xfs_log_item *log_items[need_ail];
2045     - int i = 0;
2046     + bool mlip_changed = false;
2047     +
2048     + /* this is an opencoded batch version of xfs_trans_ail_delete */
2049     spin_lock(&ailp->xa_lock);
2050     for (blip = lip; blip; blip = blip->li_bio_list) {
2051     - iip = INODE_ITEM(blip);
2052     - if (iip->ili_logged &&
2053     - blip->li_lsn == iip->ili_flush_lsn) {
2054     - log_items[i++] = blip;
2055     + if (INODE_ITEM(blip)->ili_logged &&
2056     + blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn)
2057     + mlip_changed |= xfs_ail_delete_one(ailp, blip);
2058     + else {
2059     + xfs_clear_li_failed(blip);
2060     }
2061     - ASSERT(i <= need_ail);
2062     }
2063     - /* xfs_trans_ail_delete_bulk() drops the AIL lock. */
2064     - xfs_trans_ail_delete_bulk(ailp, log_items, i,
2065     - SHUTDOWN_CORRUPT_INCORE);
2066     - }
2067    
2068     + if (mlip_changed) {
2069     + if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount))
2070     + xlog_assign_tail_lsn_locked(ailp->xa_mount);
2071     + if (list_empty(&ailp->xa_ail))
2072     + wake_up_all(&ailp->xa_empty);
2073     + }
2074     + spin_unlock(&ailp->xa_lock);
2075     +
2076     + if (mlip_changed)
2077     + xfs_log_space_wake(ailp->xa_mount);
2078     + }
2079    
2080     /*
2081     * clean up and unlock the flush lock now we are done. We can clear the
2082     diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
2083     index 73cfc7179124..bce2e260f55e 100644
2084     --- a/fs/xfs/xfs_ioctl.c
2085     +++ b/fs/xfs/xfs_ioctl.c
2086     @@ -928,16 +928,15 @@ xfs_ioc_fsgetxattr(
2087     return 0;
2088     }
2089    
2090     -STATIC void
2091     -xfs_set_diflags(
2092     +STATIC uint16_t
2093     +xfs_flags2diflags(
2094     struct xfs_inode *ip,
2095     unsigned int xflags)
2096     {
2097     - unsigned int di_flags;
2098     - uint64_t di_flags2;
2099     -
2100     /* can't set PREALLOC this way, just preserve it */
2101     - di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
2102     + uint16_t di_flags =
2103     + (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
2104     +
2105     if (xflags & FS_XFLAG_IMMUTABLE)
2106     di_flags |= XFS_DIFLAG_IMMUTABLE;
2107     if (xflags & FS_XFLAG_APPEND)
2108     @@ -967,19 +966,24 @@ xfs_set_diflags(
2109     if (xflags & FS_XFLAG_EXTSIZE)
2110     di_flags |= XFS_DIFLAG_EXTSIZE;
2111     }
2112     - ip->i_d.di_flags = di_flags;
2113    
2114     - /* diflags2 only valid for v3 inodes. */
2115     - if (ip->i_d.di_version < 3)
2116     - return;
2117     + return di_flags;
2118     +}
2119     +
2120     +STATIC uint64_t
2121     +xfs_flags2diflags2(
2122     + struct xfs_inode *ip,
2123     + unsigned int xflags)
2124     +{
2125     + uint64_t di_flags2 =
2126     + (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK);
2127    
2128     - di_flags2 = (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK);
2129     if (xflags & FS_XFLAG_DAX)
2130     di_flags2 |= XFS_DIFLAG2_DAX;
2131     if (xflags & FS_XFLAG_COWEXTSIZE)
2132     di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
2133    
2134     - ip->i_d.di_flags2 = di_flags2;
2135     + return di_flags2;
2136     }
2137    
2138     STATIC void
2139     @@ -1005,11 +1009,12 @@ xfs_diflags_to_linux(
2140     inode->i_flags |= S_NOATIME;
2141     else
2142     inode->i_flags &= ~S_NOATIME;
2143     +#if 0 /* disabled until the flag switching races are sorted out */
2144     if (xflags & FS_XFLAG_DAX)
2145     inode->i_flags |= S_DAX;
2146     else
2147     inode->i_flags &= ~S_DAX;
2148     -
2149     +#endif
2150     }
2151    
2152     static int
2153     @@ -1019,6 +1024,7 @@ xfs_ioctl_setattr_xflags(
2154     struct fsxattr *fa)
2155     {
2156     struct xfs_mount *mp = ip->i_mount;
2157     + uint64_t di_flags2;
2158    
2159     /* Can't change realtime flag if any extents are allocated. */
2160     if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
2161     @@ -1049,7 +1055,14 @@ xfs_ioctl_setattr_xflags(
2162     !capable(CAP_LINUX_IMMUTABLE))
2163     return -EPERM;
2164    
2165     - xfs_set_diflags(ip, fa->fsx_xflags);
2166     + /* diflags2 only valid for v3 inodes. */
2167     + di_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags);
2168     + if (di_flags2 && ip->i_d.di_version < 3)
2169     + return -EINVAL;
2170     +
2171     + ip->i_d.di_flags = xfs_flags2diflags(ip, fa->fsx_xflags);
2172     + ip->i_d.di_flags2 = di_flags2;
2173     +
2174     xfs_diflags_to_linux(ip);
2175     xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
2176     xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2177     diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
2178     index a1247c3c1efb..5b81f7f41b80 100644
2179     --- a/fs/xfs/xfs_iops.c
2180     +++ b/fs/xfs/xfs_iops.c
2181     @@ -802,7 +802,7 @@ xfs_vn_setattr_nonsize(
2182     * Caution: The caller of this function is responsible for calling
2183     * setattr_prepare() or otherwise verifying the change is fine.
2184     */
2185     -int
2186     +STATIC int
2187     xfs_setattr_size(
2188     struct xfs_inode *ip,
2189     struct iattr *iattr)
2190     diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
2191     index b57ab34fbf3c..33c9a3aae948 100644
2192     --- a/fs/xfs/xfs_log.c
2193     +++ b/fs/xfs/xfs_log.c
2194     @@ -743,15 +743,45 @@ xfs_log_mount_finish(
2195     struct xfs_mount *mp)
2196     {
2197     int error = 0;
2198     + bool readonly = (mp->m_flags & XFS_MOUNT_RDONLY);
2199    
2200     if (mp->m_flags & XFS_MOUNT_NORECOVERY) {
2201     ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
2202     return 0;
2203     + } else if (readonly) {
2204     + /* Allow unlinked processing to proceed */
2205     + mp->m_flags &= ~XFS_MOUNT_RDONLY;
2206     }
2207    
2208     + /*
2209     + * During the second phase of log recovery, we need iget and
2210     + * iput to behave like they do for an active filesystem.
2211     + * xfs_fs_drop_inode needs to be able to prevent the deletion
2212     + * of inodes before we're done replaying log items on those
2213     + * inodes. Turn it off immediately after recovery finishes
2214     + * so that we don't leak the quota inodes if subsequent mount
2215     + * activities fail.
2216     + *
2217     + * We let all inodes involved in redo item processing end up on
2218     + * the LRU instead of being evicted immediately so that if we do
2219     + * something to an unlinked inode, the irele won't cause
2220     + * premature truncation and freeing of the inode, which results
2221     + * in log recovery failure. We have to evict the unreferenced
2222     + * lru inodes after clearing MS_ACTIVE because we don't
2223     + * otherwise clean up the lru if there's a subsequent failure in
2224     + * xfs_mountfs, which leads to us leaking the inodes if nothing
2225     + * else (e.g. quotacheck) references the inodes before the
2226     + * mount failure occurs.
2227     + */
2228     + mp->m_super->s_flags |= MS_ACTIVE;
2229     error = xlog_recover_finish(mp->m_log);
2230     if (!error)
2231     xfs_log_work_queue(mp);
2232     + mp->m_super->s_flags &= ~MS_ACTIVE;
2233     + evict_inodes(mp->m_super);
2234     +
2235     + if (readonly)
2236     + mp->m_flags |= XFS_MOUNT_RDONLY;
2237    
2238     return error;
2239     }
2240     @@ -801,11 +831,14 @@ xfs_log_unmount_write(xfs_mount_t *mp)
2241     int error;
2242    
2243     /*
2244     - * Don't write out unmount record on read-only mounts.
2245     + * Don't write out unmount record on norecovery mounts or ro devices.
2246     * Or, if we are doing a forced umount (typically because of IO errors).
2247     */
2248     - if (mp->m_flags & XFS_MOUNT_RDONLY)
2249     + if (mp->m_flags & XFS_MOUNT_NORECOVERY ||
2250     + xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) {
2251     + ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
2252     return 0;
2253     + }
2254    
2255     error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL);
2256     ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log)));
2257     @@ -3304,8 +3337,6 @@ _xfs_log_force(
2258     */
2259     if (iclog->ic_state & XLOG_STATE_IOERROR)
2260     return -EIO;
2261     - if (log_flushed)
2262     - *log_flushed = 1;
2263     } else {
2264    
2265     no_sleep:
2266     @@ -3409,8 +3440,6 @@ _xfs_log_force_lsn(
2267    
2268     xlog_wait(&iclog->ic_prev->ic_write_wait,
2269     &log->l_icloglock);
2270     - if (log_flushed)
2271     - *log_flushed = 1;
2272     already_slept = 1;
2273     goto try_again;
2274     }
2275     @@ -3444,9 +3473,6 @@ _xfs_log_force_lsn(
2276     */
2277     if (iclog->ic_state & XLOG_STATE_IOERROR)
2278     return -EIO;
2279     -
2280     - if (log_flushed)
2281     - *log_flushed = 1;
2282     } else { /* just return */
2283     spin_unlock(&log->l_icloglock);
2284     }
2285     diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
2286     index 9b3d7c76915d..05909269f973 100644
2287     --- a/fs/xfs/xfs_log_recover.c
2288     +++ b/fs/xfs/xfs_log_recover.c
2289     @@ -1029,61 +1029,106 @@ xlog_seek_logrec_hdr(
2290     }
2291    
2292     /*
2293     - * Check the log tail for torn writes. This is required when torn writes are
2294     - * detected at the head and the head had to be walked back to a previous record.
2295     - * The tail of the previous record must now be verified to ensure the torn
2296     - * writes didn't corrupt the previous tail.
2297     + * Calculate distance from head to tail (i.e., unused space in the log).
2298     + */
2299     +static inline int
2300     +xlog_tail_distance(
2301     + struct xlog *log,
2302     + xfs_daddr_t head_blk,
2303     + xfs_daddr_t tail_blk)
2304     +{
2305     + if (head_blk < tail_blk)
2306     + return tail_blk - head_blk;
2307     +
2308     + return tail_blk + (log->l_logBBsize - head_blk);
2309     +}
2310     +
2311     +/*
2312     + * Verify the log tail. This is particularly important when torn or incomplete
2313     + * writes have been detected near the front of the log and the head has been
2314     + * walked back accordingly.
2315     *
2316     - * Return an error if CRC verification fails as recovery cannot proceed.
2317     + * We also have to handle the case where the tail was pinned and the head
2318     + * blocked behind the tail right before a crash. If the tail had been pushed
2319     + * immediately prior to the crash and the subsequent checkpoint was only
2320     + * partially written, it's possible it overwrote the last referenced tail in the
2321     + * log with garbage. This is not a coherency problem because the tail must have
2322     + * been pushed before it can be overwritten, but appears as log corruption to
2323     + * recovery because we have no way to know the tail was updated if the
2324     + * subsequent checkpoint didn't write successfully.
2325     + *
2326     + * Therefore, CRC check the log from tail to head. If a failure occurs and the
2327     + * offending record is within max iclog bufs from the head, walk the tail
2328     + * forward and retry until a valid tail is found or corruption is detected out
2329     + * of the range of a possible overwrite.
2330     */
2331     STATIC int
2332     xlog_verify_tail(
2333     struct xlog *log,
2334     xfs_daddr_t head_blk,
2335     - xfs_daddr_t tail_blk)
2336     + xfs_daddr_t *tail_blk,
2337     + int hsize)
2338     {
2339     struct xlog_rec_header *thead;
2340     struct xfs_buf *bp;
2341     xfs_daddr_t first_bad;
2342     - int count;
2343     int error = 0;
2344     bool wrapped;
2345     - xfs_daddr_t tmp_head;
2346     + xfs_daddr_t tmp_tail;
2347     + xfs_daddr_t orig_tail = *tail_blk;
2348    
2349     bp = xlog_get_bp(log, 1);
2350     if (!bp)
2351     return -ENOMEM;
2352    
2353     /*
2354     - * Seek XLOG_MAX_ICLOGS + 1 records past the current tail record to get
2355     - * a temporary head block that points after the last possible
2356     - * concurrently written record of the tail.
2357     + * Make sure the tail points to a record (returns positive count on
2358     + * success).
2359     */
2360     - count = xlog_seek_logrec_hdr(log, head_blk, tail_blk,
2361     - XLOG_MAX_ICLOGS + 1, bp, &tmp_head, &thead,
2362     - &wrapped);
2363     - if (count < 0) {
2364     - error = count;
2365     + error = xlog_seek_logrec_hdr(log, head_blk, *tail_blk, 1, bp,
2366     + &tmp_tail, &thead, &wrapped);
2367     + if (error < 0)
2368     goto out;
2369     - }
2370     -
2371     - /*
2372     - * If the call above didn't find XLOG_MAX_ICLOGS + 1 records, we ran
2373     - * into the actual log head. tmp_head points to the start of the record
2374     - * so update it to the actual head block.
2375     - */
2376     - if (count < XLOG_MAX_ICLOGS + 1)
2377     - tmp_head = head_blk;
2378     + if (*tail_blk != tmp_tail)
2379     + *tail_blk = tmp_tail;
2380    
2381     /*
2382     - * We now have a tail and temporary head block that covers at least
2383     - * XLOG_MAX_ICLOGS records from the tail. We need to verify that these
2384     - * records were completely written. Run a CRC verification pass from
2385     - * tail to head and return the result.
2386     + * Run a CRC check from the tail to the head. We can't just check
2387     + * MAX_ICLOGS records past the tail because the tail may point to stale
2388     + * blocks cleared during the search for the head/tail. These blocks are
2389     + * overwritten with zero-length records and thus record count is not a
2390     + * reliable indicator of the iclog state before a crash.
2391     */
2392     - error = xlog_do_recovery_pass(log, tmp_head, tail_blk,
2393     + first_bad = 0;
2394     + error = xlog_do_recovery_pass(log, head_blk, *tail_blk,
2395     XLOG_RECOVER_CRCPASS, &first_bad);
2396     + while ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) {
2397     + int tail_distance;
2398     +
2399     + /*
2400     + * Is corruption within range of the head? If so, retry from
2401     + * the next record. Otherwise return an error.
2402     + */
2403     + tail_distance = xlog_tail_distance(log, head_blk, first_bad);
2404     + if (tail_distance > BTOBB(XLOG_MAX_ICLOGS * hsize))
2405     + break;
2406     +
2407     + /* skip to the next record; returns positive count on success */
2408     + error = xlog_seek_logrec_hdr(log, head_blk, first_bad, 2, bp,
2409     + &tmp_tail, &thead, &wrapped);
2410     + if (error < 0)
2411     + goto out;
2412     +
2413     + *tail_blk = tmp_tail;
2414     + first_bad = 0;
2415     + error = xlog_do_recovery_pass(log, head_blk, *tail_blk,
2416     + XLOG_RECOVER_CRCPASS, &first_bad);
2417     + }
2418    
2419     + if (!error && *tail_blk != orig_tail)
2420     + xfs_warn(log->l_mp,
2421     + "Tail block (0x%llx) overwrite detected. Updated to 0x%llx",
2422     + orig_tail, *tail_blk);
2423     out:
2424     xlog_put_bp(bp);
2425     return error;
2426     @@ -1143,7 +1188,7 @@ xlog_verify_head(
2427     */
2428     error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk,
2429     XLOG_RECOVER_CRCPASS, &first_bad);
2430     - if (error == -EFSBADCRC) {
2431     + if ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) {
2432     /*
2433     * We've hit a potential torn write. Reset the error and warn
2434     * about it.
2435     @@ -1183,31 +1228,12 @@ xlog_verify_head(
2436     ASSERT(0);
2437     return 0;
2438     }
2439     -
2440     - /*
2441     - * Now verify the tail based on the updated head. This is
2442     - * required because the torn writes trimmed from the head could
2443     - * have been written over the tail of a previous record. Return
2444     - * any errors since recovery cannot proceed if the tail is
2445     - * corrupt.
2446     - *
2447     - * XXX: This leaves a gap in truly robust protection from torn
2448     - * writes in the log. If the head is behind the tail, the tail
2449     - * pushes forward to create some space and then a crash occurs
2450     - * causing the writes into the previous record's tail region to
2451     - * tear, log recovery isn't able to recover.
2452     - *
2453     - * How likely is this to occur? If possible, can we do something
2454     - * more intelligent here? Is it safe to push the tail forward if
2455     - * we can determine that the tail is within the range of the
2456     - * torn write (e.g., the kernel can only overwrite the tail if
2457     - * it has actually been pushed forward)? Alternatively, could we
2458     - * somehow prevent this condition at runtime?
2459     - */
2460     - error = xlog_verify_tail(log, *head_blk, *tail_blk);
2461     }
2462     + if (error)
2463     + return error;
2464    
2465     - return error;
2466     + return xlog_verify_tail(log, *head_blk, tail_blk,
2467     + be32_to_cpu((*rhead)->h_size));
2468     }
2469    
2470     /*
2471     @@ -4152,7 +4178,7 @@ xlog_recover_commit_trans(
2472    
2473     #define XLOG_RECOVER_COMMIT_QUEUE_MAX 100
2474    
2475     - hlist_del(&trans->r_list);
2476     + hlist_del_init(&trans->r_list);
2477    
2478     error = xlog_recover_reorder_trans(log, trans, pass);
2479     if (error)
2480     @@ -4354,6 +4380,8 @@ xlog_recover_free_trans(
2481     xlog_recover_item_t *item, *n;
2482     int i;
2483    
2484     + hlist_del_init(&trans->r_list);
2485     +
2486     list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
2487     /* Free the regions in the item. */
2488     list_del(&item->ri_list);
2489     @@ -4799,12 +4827,16 @@ xlog_recover_process_intents(
2490     int error = 0;
2491     struct xfs_ail_cursor cur;
2492     struct xfs_ail *ailp;
2493     +#if defined(DEBUG) || defined(XFS_WARN)
2494     xfs_lsn_t last_lsn;
2495     +#endif
2496    
2497     ailp = log->l_ailp;
2498     spin_lock(&ailp->xa_lock);
2499     lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
2500     +#if defined(DEBUG) || defined(XFS_WARN)
2501     last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block);
2502     +#endif
2503     while (lip != NULL) {
2504     /*
2505     * We're done when we see something other than an intent.
2506     @@ -5214,7 +5246,7 @@ xlog_do_recovery_pass(
2507     xfs_daddr_t *first_bad) /* out: first bad log rec */
2508     {
2509     xlog_rec_header_t *rhead;
2510     - xfs_daddr_t blk_no;
2511     + xfs_daddr_t blk_no, rblk_no;
2512     xfs_daddr_t rhead_blk;
2513     char *offset;
2514     xfs_buf_t *hbp, *dbp;
2515     @@ -5222,11 +5254,15 @@ xlog_do_recovery_pass(
2516     int error2 = 0;
2517     int bblks, split_bblks;
2518     int hblks, split_hblks, wrapped_hblks;
2519     + int i;
2520     struct hlist_head rhash[XLOG_RHASH_SIZE];
2521     LIST_HEAD (buffer_list);
2522    
2523     ASSERT(head_blk != tail_blk);
2524     - rhead_blk = 0;
2525     + blk_no = rhead_blk = tail_blk;
2526     +
2527     + for (i = 0; i < XLOG_RHASH_SIZE; i++)
2528     + INIT_HLIST_HEAD(&rhash[i]);
2529    
2530     /*
2531     * Read the header of the tail block and get the iclog buffer size from
2532     @@ -5301,7 +5337,6 @@ xlog_do_recovery_pass(
2533     }
2534    
2535     memset(rhash, 0, sizeof(rhash));
2536     - blk_no = rhead_blk = tail_blk;
2537     if (tail_blk > head_blk) {
2538     /*
2539     * Perform recovery around the end of the physical log.
2540     @@ -5363,9 +5398,19 @@ xlog_do_recovery_pass(
2541     bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
2542     blk_no += hblks;
2543    
2544     - /* Read in data for log record */
2545     - if (blk_no + bblks <= log->l_logBBsize) {
2546     - error = xlog_bread(log, blk_no, bblks, dbp,
2547     + /*
2548     + * Read the log record data in multiple reads if it
2549     + * wraps around the end of the log. Note that if the
2550     + * header already wrapped, blk_no could point past the
2551     + * end of the log. The record data is contiguous in
2552     + * that case.
2553     + */
2554     + if (blk_no + bblks <= log->l_logBBsize ||
2555     + blk_no >= log->l_logBBsize) {
2556     + /* mod blk_no in case the header wrapped and
2557     + * pushed it beyond the end of the log */
2558     + rblk_no = do_mod(blk_no, log->l_logBBsize);
2559     + error = xlog_bread(log, rblk_no, bblks, dbp,
2560     &offset);
2561     if (error)
2562     goto bread_err2;
2563     @@ -5464,6 +5509,19 @@ xlog_do_recovery_pass(
2564     if (error && first_bad)
2565     *first_bad = rhead_blk;
2566    
2567     + /*
2568     + * Transactions are freed at commit time but transactions without commit
2569     + * records on disk are never committed. Free any that may be left in the
2570     + * hash table.
2571     + */
2572     + for (i = 0; i < XLOG_RHASH_SIZE; i++) {
2573     + struct hlist_node *tmp;
2574     + struct xlog_recover *trans;
2575     +
2576     + hlist_for_each_entry_safe(trans, tmp, &rhash[i], r_list)
2577     + xlog_recover_free_trans(trans);
2578     + }
2579     +
2580     return error ? error : error2;
2581     }
2582    
2583     @@ -5542,6 +5600,8 @@ xlog_do_recover(
2584     xfs_buf_t *bp;
2585     xfs_sb_t *sbp;
2586    
2587     + trace_xfs_log_recover(log, head_blk, tail_blk);
2588     +
2589     /*
2590     * First replay the images in the log.
2591     */
2592     diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
2593     index 13796f212f98..d4ce8d277992 100644
2594     --- a/fs/xfs/xfs_mount.c
2595     +++ b/fs/xfs/xfs_mount.c
2596     @@ -924,15 +924,6 @@ xfs_mountfs(
2597     }
2598     }
2599    
2600     - /*
2601     - * During the second phase of log recovery, we need iget and
2602     - * iput to behave like they do for an active filesystem.
2603     - * xfs_fs_drop_inode needs to be able to prevent the deletion
2604     - * of inodes before we're done replaying log items on those
2605     - * inodes.
2606     - */
2607     - mp->m_super->s_flags |= MS_ACTIVE;
2608     -
2609     /*
2610     * Finish recovering the file system. This part needed to be delayed
2611     * until after the root and real-time bitmap inodes were consistently
2612     @@ -1008,12 +999,13 @@ xfs_mountfs(
2613     out_quota:
2614     xfs_qm_unmount_quotas(mp);
2615     out_rtunmount:
2616     - mp->m_super->s_flags &= ~MS_ACTIVE;
2617     xfs_rtunmount_inodes(mp);
2618     out_rele_rip:
2619     IRELE(rip);
2620     cancel_delayed_work_sync(&mp->m_reclaim_work);
2621     xfs_reclaim_inodes(mp, SYNC_WAIT);
2622     + /* Clean out dquots that might be in memory after quotacheck. */
2623     + xfs_qm_unmount(mp);
2624     out_log_dealloc:
2625     mp->m_flags |= XFS_MOUNT_UNMOUNTING;
2626     xfs_log_mount_cancel(mp);
2627     diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
2628     index 8b9a9f15f022..1fdd3face2d9 100644
2629     --- a/fs/xfs/xfs_qm.c
2630     +++ b/fs/xfs/xfs_qm.c
2631     @@ -111,6 +111,9 @@ xfs_qm_dquot_walk(
2632     skipped = 0;
2633     break;
2634     }
2635     + /* we're done if id overflows back to zero */
2636     + if (!next_index)
2637     + break;
2638     }
2639    
2640     if (skipped) {
2641     @@ -1247,6 +1250,7 @@ xfs_qm_flush_one(
2642     struct xfs_dquot *dqp,
2643     void *data)
2644     {
2645     + struct xfs_mount *mp = dqp->q_mount;
2646     struct list_head *buffer_list = data;
2647     struct xfs_buf *bp = NULL;
2648     int error = 0;
2649     @@ -1257,7 +1261,32 @@ xfs_qm_flush_one(
2650     if (!XFS_DQ_IS_DIRTY(dqp))
2651     goto out_unlock;
2652    
2653     - xfs_dqflock(dqp);
2654     + /*
2655     + * The only way the dquot is already flush locked by the time quotacheck
2656     + * gets here is if reclaim flushed it before the dqadjust walk dirtied
2657     + * it for the final time. Quotacheck collects all dquot bufs in the
2658     + * local delwri queue before dquots are dirtied, so reclaim can't
2659     + * possibly have queued it for I/O. The only way out is to push the buffer to
2660     + * cycle the flush lock.
2661     + */
2662     + if (!xfs_dqflock_nowait(dqp)) {
2663     + /* buf is pinned in-core by delwri list */
2664     + DEFINE_SINGLE_BUF_MAP(map, dqp->q_blkno,
2665     + mp->m_quotainfo->qi_dqchunklen);
2666     + bp = _xfs_buf_find(mp->m_ddev_targp, &map, 1, 0, NULL);
2667     + if (!bp) {
2668     + error = -EINVAL;
2669     + goto out_unlock;
2670     + }
2671     + xfs_buf_unlock(bp);
2672     +
2673     + xfs_buf_delwri_pushbuf(bp, buffer_list);
2674     + xfs_buf_rele(bp);
2675     +
2676     + error = -EAGAIN;
2677     + goto out_unlock;
2678     + }
2679     +
2680     error = xfs_qm_dqflush(dqp, &bp);
2681     if (error)
2682     goto out_unlock;
2683     diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
2684     index 29a75ecb2425..0015c19c7455 100644
2685     --- a/fs/xfs/xfs_reflink.c
2686     +++ b/fs/xfs/xfs_reflink.c
2687     @@ -169,6 +169,8 @@ xfs_reflink_find_shared(
2688     error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
2689     if (error)
2690     return error;
2691     + if (!agbp)
2692     + return -ENOMEM;
2693    
2694     cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL);
2695    
2696     @@ -333,7 +335,7 @@ xfs_reflink_convert_cow_extent(
2697     struct xfs_defer_ops *dfops)
2698     {
2699     struct xfs_bmbt_irec irec = *imap;
2700     - xfs_fsblock_t first_block;
2701     + xfs_fsblock_t first_block = NULLFSBLOCK;
2702     int nimaps = 1;
2703    
2704     if (imap->br_state == XFS_EXT_NORM)
2705     diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
2706     index 882fb8524fcb..67d589e0a49f 100644
2707     --- a/fs/xfs/xfs_super.c
2708     +++ b/fs/xfs/xfs_super.c
2709     @@ -1214,7 +1214,7 @@ xfs_test_remount_options(
2710     tmp_mp->m_super = sb;
2711     error = xfs_parseargs(tmp_mp, options);
2712     xfs_free_fsname(tmp_mp);
2713     - kfree(tmp_mp);
2714     + kmem_free(tmp_mp);
2715    
2716     return error;
2717     }
2718     diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
2719     index 828f383df121..bdf69e1c7410 100644
2720     --- a/fs/xfs/xfs_trace.h
2721     +++ b/fs/xfs/xfs_trace.h
2722     @@ -366,6 +366,7 @@ DEFINE_BUF_EVENT(xfs_buf_iowait_done);
2723     DEFINE_BUF_EVENT(xfs_buf_delwri_queue);
2724     DEFINE_BUF_EVENT(xfs_buf_delwri_queued);
2725     DEFINE_BUF_EVENT(xfs_buf_delwri_split);
2726     +DEFINE_BUF_EVENT(xfs_buf_delwri_pushbuf);
2727     DEFINE_BUF_EVENT(xfs_buf_get_uncached);
2728     DEFINE_BUF_EVENT(xfs_bdstrat_shut);
2729     DEFINE_BUF_EVENT(xfs_buf_item_relse);
2730     @@ -519,7 +520,6 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size);
2731     DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_ordered);
2732     DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale);
2733     DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format);
2734     -DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_ordered);
2735     DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale);
2736     DEFINE_BUF_ITEM_EVENT(xfs_buf_item_ordered);
2737     DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin);
2738     @@ -1990,6 +1990,24 @@ DEFINE_EVENT(xfs_swap_extent_class, name, \
2739     DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before);
2740     DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after);
2741    
2742     +TRACE_EVENT(xfs_log_recover,
2743     + TP_PROTO(struct xlog *log, xfs_daddr_t headblk, xfs_daddr_t tailblk),
2744     + TP_ARGS(log, headblk, tailblk),
2745     + TP_STRUCT__entry(
2746     + __field(dev_t, dev)
2747     + __field(xfs_daddr_t, headblk)
2748     + __field(xfs_daddr_t, tailblk)
2749     + ),
2750     + TP_fast_assign(
2751     + __entry->dev = log->l_mp->m_super->s_dev;
2752     + __entry->headblk = headblk;
2753     + __entry->tailblk = tailblk;
2754     + ),
2755     + TP_printk("dev %d:%d headblk 0x%llx tailblk 0x%llx",
2756     + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->headblk,
2757     + __entry->tailblk)
2758     +)
2759     +
2760     TRACE_EVENT(xfs_log_recover_record,
2761     TP_PROTO(struct xlog *log, struct xlog_rec_header *rhead, int pass),
2762     TP_ARGS(log, rhead, pass),
2763     diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
2764     index 98024cb933ef..5669cf00bae0 100644
2765     --- a/fs/xfs/xfs_trans.h
2766     +++ b/fs/xfs/xfs_trans.h
2767     @@ -50,6 +50,7 @@ typedef struct xfs_log_item {
2768     struct xfs_ail *li_ailp; /* ptr to AIL */
2769     uint li_type; /* item type */
2770     uint li_flags; /* misc flags */
2771     + struct xfs_buf *li_buf; /* real buffer pointer */
2772     struct xfs_log_item *li_bio_list; /* buffer item list */
2773     void (*li_cb)(struct xfs_buf *,
2774     struct xfs_log_item *);
2775     @@ -65,11 +66,13 @@ typedef struct xfs_log_item {
2776     } xfs_log_item_t;
2777    
2778     #define XFS_LI_IN_AIL 0x1
2779     -#define XFS_LI_ABORTED 0x2
2780     +#define XFS_LI_ABORTED 0x2
2781     +#define XFS_LI_FAILED 0x4
2782    
2783     #define XFS_LI_FLAGS \
2784     { XFS_LI_IN_AIL, "IN_AIL" }, \
2785     - { XFS_LI_ABORTED, "ABORTED" }
2786     + { XFS_LI_ABORTED, "ABORTED" }, \
2787     + { XFS_LI_FAILED, "FAILED" }
2788    
2789     struct xfs_item_ops {
2790     void (*iop_size)(xfs_log_item_t *, int *, int *);
2791     @@ -80,6 +83,7 @@ struct xfs_item_ops {
2792     void (*iop_unlock)(xfs_log_item_t *);
2793     xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
2794     void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
2795     + void (*iop_error)(xfs_log_item_t *, xfs_buf_t *);
2796     };
2797    
2798     void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item,
2799     @@ -213,12 +217,14 @@ void xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *);
2800     void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *);
2801     void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *);
2802     void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *);
2803     -void xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *);
2804     +bool xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *);
2805     void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
2806     void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
2807     void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
2808     void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint);
2809     -void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint);
2810     +void xfs_trans_log_buf(struct xfs_trans *, struct xfs_buf *, uint,
2811     + uint);
2812     +void xfs_trans_dirty_buf(struct xfs_trans *, struct xfs_buf *);
2813     void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint);
2814    
2815     void xfs_extent_free_init_defer_op(void);
2816     @@ -277,6 +283,6 @@ int xfs_trans_log_finish_bmap_update(struct xfs_trans *tp,
2817     struct xfs_bud_log_item *rudp, struct xfs_defer_ops *dfops,
2818     enum xfs_bmap_intent_type type, struct xfs_inode *ip,
2819     int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock,
2820     - xfs_filblks_t blockcount, xfs_exntst_t state);
2821     + xfs_filblks_t *blockcount, xfs_exntst_t state);
2822    
2823     #endif /* __XFS_TRANS_H__ */
2824     diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
2825     index d6c9c3e9e02b..70f5ab017323 100644
2826     --- a/fs/xfs/xfs_trans_ail.c
2827     +++ b/fs/xfs/xfs_trans_ail.c
2828     @@ -684,8 +684,24 @@ xfs_trans_ail_update_bulk(
2829     }
2830     }
2831    
2832     -/*
2833     - * xfs_trans_ail_delete_bulk - remove multiple log items from the AIL
2834     +bool
2835     +xfs_ail_delete_one(
2836     + struct xfs_ail *ailp,
2837     + struct xfs_log_item *lip)
2838     +{
2839     + struct xfs_log_item *mlip = xfs_ail_min(ailp);
2840     +
2841     + trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn);
2842     + xfs_ail_delete(ailp, lip);
2843     + xfs_clear_li_failed(lip);
2844     + lip->li_flags &= ~XFS_LI_IN_AIL;
2845     + lip->li_lsn = 0;
2846     +
2847     + return mlip == lip;
2848     +}
2849     +
2850     +/**
2851     + * Remove a log item from the AIL
2852     *
2853     * @xfs_trans_ail_delete_bulk takes an array of log items that all need to
2854     * removed from the AIL. The caller is already holding the AIL lock, and done
2855     @@ -706,52 +722,36 @@ xfs_trans_ail_update_bulk(
2856     * before returning.
2857     */
2858     void
2859     -xfs_trans_ail_delete_bulk(
2860     +xfs_trans_ail_delete(
2861     struct xfs_ail *ailp,
2862     - struct xfs_log_item **log_items,
2863     - int nr_items,
2864     + struct xfs_log_item *lip,
2865     int shutdown_type) __releases(ailp->xa_lock)
2866     {
2867     - xfs_log_item_t *mlip;
2868     - int mlip_changed = 0;
2869     - int i;
2870     -
2871     - mlip = xfs_ail_min(ailp);
2872     + struct xfs_mount *mp = ailp->xa_mount;
2873     + bool mlip_changed;
2874    
2875     - for (i = 0; i < nr_items; i++) {
2876     - struct xfs_log_item *lip = log_items[i];
2877     - if (!(lip->li_flags & XFS_LI_IN_AIL)) {
2878     - struct xfs_mount *mp = ailp->xa_mount;
2879     -
2880     - spin_unlock(&ailp->xa_lock);
2881     - if (!XFS_FORCED_SHUTDOWN(mp)) {
2882     - xfs_alert_tag(mp, XFS_PTAG_AILDELETE,
2883     - "%s: attempting to delete a log item that is not in the AIL",
2884     - __func__);
2885     - xfs_force_shutdown(mp, shutdown_type);
2886     - }
2887     - return;
2888     + if (!(lip->li_flags & XFS_LI_IN_AIL)) {
2889     + spin_unlock(&ailp->xa_lock);
2890     + if (!XFS_FORCED_SHUTDOWN(mp)) {
2891     + xfs_alert_tag(mp, XFS_PTAG_AILDELETE,
2892     + "%s: attempting to delete a log item that is not in the AIL",
2893     + __func__);
2894     + xfs_force_shutdown(mp, shutdown_type);
2895     }
2896     -
2897     - trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn);
2898     - xfs_ail_delete(ailp, lip);
2899     - lip->li_flags &= ~XFS_LI_IN_AIL;
2900     - lip->li_lsn = 0;
2901     - if (mlip == lip)
2902     - mlip_changed = 1;
2903     + return;
2904     }
2905    
2906     + mlip_changed = xfs_ail_delete_one(ailp, lip);
2907     if (mlip_changed) {
2908     - if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount))
2909     - xlog_assign_tail_lsn_locked(ailp->xa_mount);
2910     + if (!XFS_FORCED_SHUTDOWN(mp))
2911     + xlog_assign_tail_lsn_locked(mp);
2912     if (list_empty(&ailp->xa_ail))
2913     wake_up_all(&ailp->xa_empty);
2914     - spin_unlock(&ailp->xa_lock);
2915     + }
2916    
2917     + spin_unlock(&ailp->xa_lock);
2918     + if (mlip_changed)
2919     xfs_log_space_wake(ailp->xa_mount);
2920     - } else {
2921     - spin_unlock(&ailp->xa_lock);
2922     - }
2923     }
2924    
2925     int
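
The xfs_trans_ail.c hunk above replaces the bulk delete with a single-item helper: xfs_ail_delete_one() reports whether the removed item was the AIL minimum, and only in that case does the caller recompute the tail LSN and wake log-space waiters. A userspace sketch of why that return value matters (hypothetical types, not the kernel's list machinery): only removing the head of an LSN-ordered list can move the tail.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-in for an LSN-ordered AIL: a singly linked list
 * kept sorted so the head is always the minimum (the log tail). */
struct item {
    unsigned long lsn;
    struct item *next;
};

/* Remove 'victim' from the list; return true if it was the minimum,
 * i.e. the caller must recompute the tail. Mirrors xfs_ail_delete_one(). */
static bool delete_one(struct item **head, struct item *victim)
{
    struct item **pp = head;
    bool was_min = (*head == victim);

    while (*pp && *pp != victim)
        pp = &(*pp)->next;
    if (*pp)
        *pp = victim->next;
    return was_min;
}

int main(void)
{
    struct item c = { 30, NULL }, b = { 20, &c }, a = { 10, &b };
    struct item *head = &a;

    if (delete_one(&head, &b))
        puts("tail moved");                       /* not printed: b was not the minimum */
    if (delete_one(&head, &a))
        printf("tail moved, new tail lsn=%lu\n", head->lsn);  /* prints 30 */
    return 0;
}
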
2926     diff --git a/fs/xfs/xfs_trans_bmap.c b/fs/xfs/xfs_trans_bmap.c
2927     index 6408e7d7c08c..14543d93cd4b 100644
2928     --- a/fs/xfs/xfs_trans_bmap.c
2929     +++ b/fs/xfs/xfs_trans_bmap.c
2930     @@ -63,7 +63,7 @@ xfs_trans_log_finish_bmap_update(
2931     int whichfork,
2932     xfs_fileoff_t startoff,
2933     xfs_fsblock_t startblock,
2934     - xfs_filblks_t blockcount,
2935     + xfs_filblks_t *blockcount,
2936     xfs_exntst_t state)
2937     {
2938     int error;
2939     @@ -196,16 +196,23 @@ xfs_bmap_update_finish_item(
2940     void **state)
2941     {
2942     struct xfs_bmap_intent *bmap;
2943     + xfs_filblks_t count;
2944     int error;
2945    
2946     bmap = container_of(item, struct xfs_bmap_intent, bi_list);
2947     + count = bmap->bi_bmap.br_blockcount;
2948     error = xfs_trans_log_finish_bmap_update(tp, done_item, dop,
2949     bmap->bi_type,
2950     bmap->bi_owner, bmap->bi_whichfork,
2951     bmap->bi_bmap.br_startoff,
2952     bmap->bi_bmap.br_startblock,
2953     - bmap->bi_bmap.br_blockcount,
2954     + &count,
2955     bmap->bi_bmap.br_state);
2956     + if (!error && count > 0) {
2957     + ASSERT(bmap->bi_type == XFS_BMAP_UNMAP);
2958     + bmap->bi_bmap.br_blockcount = count;
2959     + return -EAGAIN;
2960     + }
2961     kmem_free(bmap);
2962     return error;
2963     }
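
In the xfs_trans_bmap.c hunk, the block count is now passed by pointer so xfs_bmap_update_finish_item() can see how much of the range is still unmapped; if anything remains, it stashes the remainder back in the intent and returns -EAGAIN so the item is run again rather than freed. A minimal sketch of that partial-completion calling convention (the work function and the per-pass limit below are invented for illustration):

#include <errno.h>
#include <stdio.h>

#define MAX_PER_PASS 16    /* pretend one transaction can only unmap 16 blocks */

/* Do up to MAX_PER_PASS units of work; *count comes back with what is left. */
static int do_some_work(unsigned long *count)
{
    unsigned long done = *count < MAX_PER_PASS ? *count : MAX_PER_PASS;

    *count -= done;
    return 0;
}

static int finish_item(unsigned long *remaining)
{
    int error = do_some_work(remaining);

    if (!error && *remaining > 0)
        return -EAGAIN;    /* requeue: more blocks still mapped */
    return error;          /* 0 on full completion */
}

int main(void)
{
    unsigned long blocks = 40;
    int passes = 0, ret;

    do {
        ret = finish_item(&blocks);
        passes++;
    } while (ret == -EAGAIN);

    printf("finished in %d passes, ret=%d\n", passes, ret);    /* 3 passes */
    return 0;
}
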
2964     diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
2965     index 8ee29ca132dc..3ba7a96a8abd 100644
2966     --- a/fs/xfs/xfs_trans_buf.c
2967     +++ b/fs/xfs/xfs_trans_buf.c
2968     @@ -356,6 +356,7 @@ xfs_trans_brelse(xfs_trans_t *tp,
2969     xfs_buf_t *bp)
2970     {
2971     xfs_buf_log_item_t *bip;
2972     + int freed;
2973    
2974     /*
2975     * Default to a normal brelse() call if the tp is NULL.
2976     @@ -419,16 +420,22 @@ xfs_trans_brelse(xfs_trans_t *tp,
2977     /*
2978     * Drop our reference to the buf log item.
2979     */
2980     - atomic_dec(&bip->bli_refcount);
2981     + freed = atomic_dec_and_test(&bip->bli_refcount);
2982    
2983     /*
2984     - * If the buf item is not tracking data in the log, then
2985     - * we must free it before releasing the buffer back to the
2986     - * free pool. Before releasing the buffer to the free pool,
2987     - * clear the transaction pointer in b_fsprivate2 to dissolve
2988     - * its relation to this transaction.
2989     + * If the buf item is not tracking data in the log, then we must free it
2990     + * before releasing the buffer back to the free pool.
2991     + *
2992     + * If the fs has shutdown and we dropped the last reference, it may fall
2993     + * on us to release a (possibly dirty) bli if it never made it to the
2994     + * AIL (e.g., the aborted unpin already happened and didn't release it
2995     + * due to our reference). Since we're already shutdown and need xa_lock,
2996     + * just force remove from the AIL and release the bli here.
2997     */
2998     - if (!xfs_buf_item_dirty(bip)) {
2999     + if (XFS_FORCED_SHUTDOWN(tp->t_mountp) && freed) {
3000     + xfs_trans_ail_remove(&bip->bli_item, SHUTDOWN_LOG_IO_ERROR);
3001     + xfs_buf_item_relse(bp);
3002     + } else if (!(bip->bli_flags & XFS_BLI_DIRTY)) {
3003     /***
3004     ASSERT(bp->b_pincount == 0);
3005     ***/
3006     @@ -486,25 +493,17 @@ xfs_trans_bhold_release(xfs_trans_t *tp,
3007     }
3008    
3009     /*
3010     - * This is called to mark bytes first through last inclusive of the given
3011     - * buffer as needing to be logged when the transaction is committed.
3012     - * The buffer must already be associated with the given transaction.
3013     - *
3014     - * First and last are numbers relative to the beginning of this buffer,
3015     - * so the first byte in the buffer is numbered 0 regardless of the
3016     - * value of b_blkno.
3017     + * Mark a buffer dirty in the transaction.
3018     */
3019     void
3020     -xfs_trans_log_buf(xfs_trans_t *tp,
3021     - xfs_buf_t *bp,
3022     - uint first,
3023     - uint last)
3024     +xfs_trans_dirty_buf(
3025     + struct xfs_trans *tp,
3026     + struct xfs_buf *bp)
3027     {
3028     - xfs_buf_log_item_t *bip = bp->b_fspriv;
3029     + struct xfs_buf_log_item *bip = bp->b_fspriv;
3030    
3031     ASSERT(bp->b_transp == tp);
3032     ASSERT(bip != NULL);
3033     - ASSERT(first <= last && last < BBTOB(bp->b_length));
3034     ASSERT(bp->b_iodone == NULL ||
3035     bp->b_iodone == xfs_buf_iodone_callbacks);
3036    
3037     @@ -524,8 +523,6 @@ xfs_trans_log_buf(xfs_trans_t *tp,
3038     bp->b_iodone = xfs_buf_iodone_callbacks;
3039     bip->bli_item.li_cb = xfs_buf_iodone;
3040    
3041     - trace_xfs_trans_log_buf(bip);
3042     -
3043     /*
3044     * If we invalidated the buffer within this transaction, then
3045     * cancel the invalidation now that we're dirtying the buffer
3046     @@ -538,17 +535,37 @@ xfs_trans_log_buf(xfs_trans_t *tp,
3047     bp->b_flags &= ~XBF_STALE;
3048     bip->__bli_format.blf_flags &= ~XFS_BLF_CANCEL;
3049     }
3050     + bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;
3051    
3052     tp->t_flags |= XFS_TRANS_DIRTY;
3053     bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
3054     +}
3055    
3056     - /*
3057     - * If we have an ordered buffer we are not logging any dirty range but
3058     - * it still needs to be marked dirty and that it has been logged.
3059     - */
3060     - bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;
3061     - if (!(bip->bli_flags & XFS_BLI_ORDERED))
3062     - xfs_buf_item_log(bip, first, last);
3063     +/*
3064     + * This is called to mark bytes first through last inclusive of the given
3065     + * buffer as needing to be logged when the transaction is committed.
3066     + * The buffer must already be associated with the given transaction.
3067     + *
3068     + * First and last are numbers relative to the beginning of this buffer,
3069     + * so the first byte in the buffer is numbered 0 regardless of the
3070     + * value of b_blkno.
3071     + */
3072     +void
3073     +xfs_trans_log_buf(
3074     + struct xfs_trans *tp,
3075     + struct xfs_buf *bp,
3076     + uint first,
3077     + uint last)
3078     +{
3079     + struct xfs_buf_log_item *bip = bp->b_fspriv;
3080     +
3081     + ASSERT(first <= last && last < BBTOB(bp->b_length));
3082     + ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED));
3083     +
3084     + xfs_trans_dirty_buf(tp, bp);
3085     +
3086     + trace_xfs_trans_log_buf(bip);
3087     + xfs_buf_item_log(bip, first, last);
3088     }
3089    
3090    
3091     @@ -701,14 +718,13 @@ xfs_trans_inode_alloc_buf(
3092     }
3093    
3094     /*
3095     - * Mark the buffer as ordered for this transaction. This means
3096     - * that the contents of the buffer are not recorded in the transaction
3097     - * but it is tracked in the AIL as though it was. This allows us
3098     - * to record logical changes in transactions rather than the physical
3099     - * changes we make to the buffer without changing writeback ordering
3100     - * constraints of metadata buffers.
3101     + * Mark the buffer as ordered for this transaction. This means that the contents
3102     + * of the buffer are not recorded in the transaction but it is tracked in the
3103     + * AIL as though it was. This allows us to record logical changes in
3104     + * transactions rather than the physical changes we make to the buffer without
3105     + * changing writeback ordering constraints of metadata buffers.
3106     */
3107     -void
3108     +bool
3109     xfs_trans_ordered_buf(
3110     struct xfs_trans *tp,
3111     struct xfs_buf *bp)
3112     @@ -719,8 +735,18 @@ xfs_trans_ordered_buf(
3113     ASSERT(bip != NULL);
3114     ASSERT(atomic_read(&bip->bli_refcount) > 0);
3115    
3116     + if (xfs_buf_item_dirty_format(bip))
3117     + return false;
3118     +
3119     bip->bli_flags |= XFS_BLI_ORDERED;
3120     trace_xfs_buf_item_ordered(bip);
3121     +
3122     + /*
3123     + * We don't log a dirty range of an ordered buffer, but it still needs
3124     + * to be marked dirty and flagged as having been logged.
3125     + */
3126     + xfs_trans_dirty_buf(tp, bp);
3127     + return true;
3128     }
3129    
3130     /*
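
The xfs_trans_brelse() change swaps atomic_dec() for atomic_dec_and_test() so the caller learns whether it just dropped the final reference to the buf log item and therefore owns the cleanup (on a shut-down fs, force-removing it from the AIL and releasing it). The same idiom in plain C11 atomics, with a made-up object type:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
    atomic_int refcount;
    char name[16];
};

/* Returns true if this call released the final reference and freed obj;
 * mirrors the atomic_dec_and_test() idiom used in xfs_trans_brelse(). */
static bool obj_put(struct obj *o)
{
    /* fetch_sub returns the old value, so old == 1 means we hit zero */
    if (atomic_fetch_sub(&o->refcount, 1) == 1) {
        printf("last reference to %s dropped, freeing\n", o->name);
        free(o);
        return true;
    }
    return false;
}

int main(void)
{
    struct obj *o = malloc(sizeof(*o));

    atomic_init(&o->refcount, 2);
    snprintf(o->name, sizeof(o->name), "bli");

    obj_put(o);    /* still one holder left */
    obj_put(o);    /* prints the message and frees */
    return 0;
}
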
3131     diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
3132     index 49931b72da8a..b317a3644c00 100644
3133     --- a/fs/xfs/xfs_trans_priv.h
3134     +++ b/fs/xfs/xfs_trans_priv.h
3135     @@ -106,18 +106,9 @@ xfs_trans_ail_update(
3136     xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn);
3137     }
3138    
3139     -void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp,
3140     - struct xfs_log_item **log_items, int nr_items,
3141     - int shutdown_type)
3142     - __releases(ailp->xa_lock);
3143     -static inline void
3144     -xfs_trans_ail_delete(
3145     - struct xfs_ail *ailp,
3146     - xfs_log_item_t *lip,
3147     - int shutdown_type) __releases(ailp->xa_lock)
3148     -{
3149     - xfs_trans_ail_delete_bulk(ailp, &lip, 1, shutdown_type);
3150     -}
3151     +bool xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip);
3152     +void xfs_trans_ail_delete(struct xfs_ail *ailp, struct xfs_log_item *lip,
3153     + int shutdown_type) __releases(ailp->xa_lock);
3154    
3155     static inline void
3156     xfs_trans_ail_remove(
3157     @@ -173,4 +164,35 @@ xfs_trans_ail_copy_lsn(
3158     *dst = *src;
3159     }
3160     #endif
3161     +
3162     +static inline void
3163     +xfs_clear_li_failed(
3164     + struct xfs_log_item *lip)
3165     +{
3166     + struct xfs_buf *bp = lip->li_buf;
3167     +
3168     + ASSERT(lip->li_flags & XFS_LI_IN_AIL);
3169     + lockdep_assert_held(&lip->li_ailp->xa_lock);
3170     +
3171     + if (lip->li_flags & XFS_LI_FAILED) {
3172     + lip->li_flags &= ~XFS_LI_FAILED;
3173     + lip->li_buf = NULL;
3174     + xfs_buf_rele(bp);
3175     + }
3176     +}
3177     +
3178     +static inline void
3179     +xfs_set_li_failed(
3180     + struct xfs_log_item *lip,
3181     + struct xfs_buf *bp)
3182     +{
3183     + lockdep_assert_held(&lip->li_ailp->xa_lock);
3184     +
3185     + if (!(lip->li_flags & XFS_LI_FAILED)) {
3186     + xfs_buf_hold(bp);
3187     + lip->li_flags |= XFS_LI_FAILED;
3188     + lip->li_buf = bp;
3189     + }
3190     +}
3191     +
3192     #endif /* __XFS_TRANS_PRIV_H__ */
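
The new xfs_set_li_failed()/xfs_clear_li_failed() helpers tie a buffer reference to the XFS_LI_FAILED flag: setting the flag takes a hold on the buffer, clearing it drops that hold, and both are no-ops if the flag is already in the requested state. A small sketch of the flag-owns-a-reference idiom (plain counters stand in for xfs_buf_hold()/xfs_buf_rele()):

#include <stdbool.h>
#include <stdio.h>

struct buf  { int holds; };
struct item { bool failed; struct buf *buf; };

/* Mark the item failed and pin the buffer it points at (idempotent). */
static void set_failed(struct item *ip, struct buf *bp)
{
    if (!ip->failed) {
        bp->holds++;        /* the flag now owns one reference */
        ip->failed = true;
        ip->buf = bp;
    }
}

/* Clear the failed state and drop the reference the flag was holding. */
static void clear_failed(struct item *ip)
{
    if (ip->failed) {
        ip->buf->holds--;
        ip->failed = false;
        ip->buf = NULL;
    }
}

int main(void)
{
    struct buf  bp = { .holds = 1 };
    struct item ip = { 0 };

    set_failed(&ip, &bp);
    set_failed(&ip, &bp);                              /* second call changes nothing */
    printf("holds while failed: %d\n", bp.holds);      /* 2 */
    clear_failed(&ip);
    printf("holds after clear:  %d\n", bp.holds);      /* 1 */
    return 0;
}
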
3193     diff --git a/include/linux/fs.h b/include/linux/fs.h
3194     index dd88ded27fc8..d705ae084edd 100644
3195     --- a/include/linux/fs.h
3196     +++ b/include/linux/fs.h
3197     @@ -2760,6 +2760,7 @@ static inline void lockdep_annotate_inode_mutex_key(struct inode *inode) { };
3198     #endif
3199     extern void unlock_new_inode(struct inode *);
3200     extern unsigned int get_next_ino(void);
3201     +extern void evict_inodes(struct super_block *sb);
3202    
3203     extern void __iget(struct inode * inode);
3204     extern void iget_failed(struct inode *);
3205     diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
3206     index 780e7171f548..23db1ae37464 100644
3207     --- a/include/linux/netdevice.h
3208     +++ b/include/linux/netdevice.h
3209     @@ -3901,6 +3901,8 @@ struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
3210     updev; \
3211     updev = netdev_all_upper_get_next_dev_rcu(dev, &(iter)))
3212    
3213     +bool netdev_has_any_upper_dev(struct net_device *dev);
3214     +
3215     void *netdev_lower_get_next_private(struct net_device *dev,
3216     struct list_head **iter);
3217     void *netdev_lower_get_next_private_rcu(struct net_device *dev,
3218     diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
3219     index 909972aa3acd..634d19203e7d 100644
3220     --- a/include/net/inet_frag.h
3221     +++ b/include/net/inet_frag.h
3222     @@ -1,14 +1,9 @@
3223     #ifndef __NET_FRAG_H__
3224     #define __NET_FRAG_H__
3225    
3226     -#include <linux/percpu_counter.h>
3227     -
3228     struct netns_frags {
3229     - /* The percpu_counter "mem" need to be cacheline aligned.
3230     - * mem.count must not share cacheline with other writers
3231     - */
3232     - struct percpu_counter mem ____cacheline_aligned_in_smp;
3233     -
3234     + /* Keep atomic mem on separate cachelines in structs that include it */
3235     + atomic_t mem ____cacheline_aligned_in_smp;
3236     /* sysctls */
3237     int timeout;
3238     int high_thresh;
3239     @@ -108,15 +103,10 @@ struct inet_frags {
3240     int inet_frags_init(struct inet_frags *);
3241     void inet_frags_fini(struct inet_frags *);
3242    
3243     -static inline int inet_frags_init_net(struct netns_frags *nf)
3244     -{
3245     - return percpu_counter_init(&nf->mem, 0, GFP_KERNEL);
3246     -}
3247     -static inline void inet_frags_uninit_net(struct netns_frags *nf)
3248     +static inline void inet_frags_init_net(struct netns_frags *nf)
3249     {
3250     - percpu_counter_destroy(&nf->mem);
3251     + atomic_set(&nf->mem, 0);
3252     }
3253     -
3254     void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f);
3255    
3256     void inet_frag_kill(struct inet_frag_queue *q, struct inet_frags *f);
3257     @@ -140,37 +130,24 @@ static inline bool inet_frag_evicting(struct inet_frag_queue *q)
3258    
3259     /* Memory Tracking Functions. */
3260    
3261     -/* The default percpu_counter batch size is not big enough to scale to
3262     - * fragmentation mem acct sizes.
3263     - * The mem size of a 64K fragment is approx:
3264     - * (44 fragments * 2944 truesize) + frag_queue struct(200) = 129736 bytes
3265     - */
3266     -static unsigned int frag_percpu_counter_batch = 130000;
3267     -
3268     static inline int frag_mem_limit(struct netns_frags *nf)
3269     {
3270     - return percpu_counter_read(&nf->mem);
3271     + return atomic_read(&nf->mem);
3272     }
3273    
3274     static inline void sub_frag_mem_limit(struct netns_frags *nf, int i)
3275     {
3276     - __percpu_counter_add(&nf->mem, -i, frag_percpu_counter_batch);
3277     + atomic_sub(i, &nf->mem);
3278     }
3279    
3280     static inline void add_frag_mem_limit(struct netns_frags *nf, int i)
3281     {
3282     - __percpu_counter_add(&nf->mem, i, frag_percpu_counter_batch);
3283     + atomic_add(i, &nf->mem);
3284     }
3285    
3286     -static inline unsigned int sum_frag_mem_limit(struct netns_frags *nf)
3287     +static inline int sum_frag_mem_limit(struct netns_frags *nf)
3288     {
3289     - unsigned int res;
3290     -
3291     - local_bh_disable();
3292     - res = percpu_counter_sum_positive(&nf->mem);
3293     - local_bh_enable();
3294     -
3295     - return res;
3296     + return atomic_read(&nf->mem);
3297     }
3298    
3299     /* RFC 3168 support :
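
The inet_frag.h hunk drops the percpu_counter, whose large batch size made the read-back value only approximate, in favour of a single atomic_t that every CPU updates directly, so frag_mem_limit() always returns the exact accounted total. A userspace sketch of the resulting interface (helper names modelled on, but not identical to, the kernel ones):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_long mem;    /* one shared, exact counter (zero at start) */

static void add_mem(long bytes) { atomic_fetch_add(&mem, bytes); }
static void sub_mem(long bytes) { atomic_fetch_sub(&mem, bytes); }
static long read_mem(void)      { return atomic_load(&mem); }

/* Accept a new fragment only if the accounted memory stays under the limit. */
static bool may_enqueue(long truesize, long high_thresh)
{
    if (read_mem() + truesize > high_thresh)
        return false;
    add_mem(truesize);
    return true;
}

int main(void)
{
    long high_thresh = 4 * 1024 * 1024;    /* 4 MiB, like IPV6_FRAG_HIGH_THRESH */

    if (may_enqueue(2944, high_thresh))    /* a typical fragment truesize */
        printf("accounted %ld bytes\n", read_mem());
    sub_mem(2944);
    printf("after free: %ld bytes\n", read_mem());
    return 0;
}
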
3300     diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
3301     index a74e2aa40ef4..a6bcb18ac4c3 100644
3302     --- a/include/net/ip6_fib.h
3303     +++ b/include/net/ip6_fib.h
3304     @@ -68,6 +68,7 @@ struct fib6_node {
3305     __u16 fn_flags;
3306     int fn_sernum;
3307     struct rt6_info *rr_ptr;
3308     + struct rcu_head rcu;
3309     };
3310    
3311     #ifndef CONFIG_IPV6_SUBTREES
3312     @@ -102,7 +103,7 @@ struct rt6_info {
3313     * the same cache line.
3314     */
3315     struct fib6_table *rt6i_table;
3316     - struct fib6_node *rt6i_node;
3317     + struct fib6_node __rcu *rt6i_node;
3318    
3319     struct in6_addr rt6i_gateway;
3320    
3321     @@ -165,13 +166,40 @@ static inline void rt6_update_expires(struct rt6_info *rt0, int timeout)
3322     rt0->rt6i_flags |= RTF_EXPIRES;
3323     }
3324    
3325     +/* Safely get fn->sernum for the passed-in rt and store the result in
3326     + * the passed-in cookie.
3327     + * Returns true if the cookie could be obtained safely,
3328     + * false otherwise.
3329     + */
3330     +static inline bool rt6_get_cookie_safe(const struct rt6_info *rt,
3331     + u32 *cookie)
3332     +{
3333     + struct fib6_node *fn;
3334     + bool status = false;
3335     +
3336     + rcu_read_lock();
3337     + fn = rcu_dereference(rt->rt6i_node);
3338     +
3339     + if (fn) {
3340     + *cookie = fn->fn_sernum;
3341     + status = true;
3342     + }
3343     +
3344     + rcu_read_unlock();
3345     + return status;
3346     +}
3347     +
3348     static inline u32 rt6_get_cookie(const struct rt6_info *rt)
3349     {
3350     + u32 cookie = 0;
3351     +
3352     if (rt->rt6i_flags & RTF_PCPU ||
3353     (unlikely(rt->dst.flags & DST_NOCACHE) && rt->dst.from))
3354     rt = (struct rt6_info *)(rt->dst.from);
3355    
3356     - return rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
3357     + rt6_get_cookie_safe(rt, &cookie);
3358     +
3359     + return cookie;
3360     }
3361    
3362     static inline void ip6_rt_put(struct rt6_info *rt)
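
rt6_get_cookie_safe() exists because rt6i_node is now an RCU-managed pointer: the reader must fetch it once under rcu_read_lock(), test it for NULL, and fall back to a cookie of 0 when the route has already been unlinked from its fib6_node. The sketch below shows that load-once/test/then-dereference shape using C11 atomics as a stand-in, since real RCU is not available to a plain userspace example:

#include <inttypes.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct node  { uint32_t sernum; };
struct route { _Atomic(struct node *) node; };    /* may be set to NULL concurrently */

/* Copy the node's serial number into *cookie if the route is still linked.
 * Returns false (and leaves *cookie alone) if the pointer was NULL. */
static bool get_cookie_safe(struct route *rt, uint32_t *cookie)
{
    struct node *n = atomic_load_explicit(&rt->node, memory_order_acquire);

    if (!n)
        return false;
    *cookie = n->sernum;
    return true;
}

static uint32_t get_cookie(struct route *rt)
{
    uint32_t cookie = 0;    /* default when the node is gone */

    get_cookie_safe(rt, &cookie);
    return cookie;
}

int main(void)
{
    struct node  fn = { .sernum = 42 };
    struct route rt = { .node = &fn };

    printf("cookie = %" PRIu32 "\n", get_cookie(&rt));    /* 42 */
    atomic_store(&rt.node, (struct node *)NULL);
    printf("cookie = %" PRIu32 "\n", get_cookie(&rt));    /* 0 */
    return 0;
}
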
3363     diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
3364     index 89a687f3c0a3..5f5e28f210e0 100644
3365     --- a/net/bridge/br_device.c
3366     +++ b/net/bridge/br_device.c
3367     @@ -53,6 +53,9 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
3368     brstats->tx_bytes += skb->len;
3369     u64_stats_update_end(&brstats->syncp);
3370    
3371     +#ifdef CONFIG_NET_SWITCHDEV
3372     + skb->offload_fwd_mark = 0;
3373     +#endif
3374     BR_INPUT_SKB_CB(skb)->brdev = dev;
3375    
3376     skb_reset_mac_header(skb);
3377     diff --git a/net/core/datagram.c b/net/core/datagram.c
3378     index 58dfa23d12ca..4fa4011feec1 100644
3379     --- a/net/core/datagram.c
3380     +++ b/net/core/datagram.c
3381     @@ -351,7 +351,7 @@ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
3382     if (flags & MSG_PEEK) {
3383     err = -ENOENT;
3384     spin_lock_bh(&sk->sk_receive_queue.lock);
3385     - if (skb == skb_peek(&sk->sk_receive_queue)) {
3386     + if (skb->next) {
3387     __skb_unlink(skb, &sk->sk_receive_queue);
3388     atomic_dec(&skb->users);
3389     err = 0;
3390     diff --git a/net/core/dev.c b/net/core/dev.c
3391     index 1d0a7369d5a2..ba7b8121a414 100644
3392     --- a/net/core/dev.c
3393     +++ b/net/core/dev.c
3394     @@ -5337,12 +5337,13 @@ EXPORT_SYMBOL(netdev_has_upper_dev);
3395     * Find out if a device is linked to an upper device and return true in case
3396     * it is. The caller must hold the RTNL lock.
3397     */
3398     -static bool netdev_has_any_upper_dev(struct net_device *dev)
3399     +bool netdev_has_any_upper_dev(struct net_device *dev)
3400     {
3401     ASSERT_RTNL();
3402    
3403     return !list_empty(&dev->all_adj_list.upper);
3404     }
3405     +EXPORT_SYMBOL(netdev_has_any_upper_dev);
3406    
3407     /**
3408     * netdev_master_upper_dev_get - Get master upper device
3409     diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
3410     index 30d875dff6b5..f85b08baff16 100644
3411     --- a/net/ieee802154/6lowpan/reassembly.c
3412     +++ b/net/ieee802154/6lowpan/reassembly.c
3413     @@ -580,19 +580,14 @@ static int __net_init lowpan_frags_init_net(struct net *net)
3414     {
3415     struct netns_ieee802154_lowpan *ieee802154_lowpan =
3416     net_ieee802154_lowpan(net);
3417     - int res;
3418    
3419     ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
3420     ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH;
3421     ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT;
3422    
3423     - res = inet_frags_init_net(&ieee802154_lowpan->frags);
3424     - if (res)
3425     - return res;
3426     - res = lowpan_frags_ns_sysctl_register(net);
3427     - if (res)
3428     - inet_frags_uninit_net(&ieee802154_lowpan->frags);
3429     - return res;
3430     + inet_frags_init_net(&ieee802154_lowpan->frags);
3431     +
3432     + return lowpan_frags_ns_sysctl_register(net);
3433     }
3434    
3435     static void __net_exit lowpan_frags_exit_net(struct net *net)
3436     diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
3437     index b5e9317eaf9e..631c0d0d7cf8 100644
3438     --- a/net/ipv4/inet_fragment.c
3439     +++ b/net/ipv4/inet_fragment.c
3440     @@ -234,10 +234,8 @@ void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
3441     cond_resched();
3442    
3443     if (read_seqretry(&f->rnd_seqlock, seq) ||
3444     - percpu_counter_sum(&nf->mem))
3445     + sum_frag_mem_limit(nf))
3446     goto evict_again;
3447     -
3448     - percpu_counter_destroy(&nf->mem);
3449     }
3450     EXPORT_SYMBOL(inet_frags_exit_net);
3451    
3452     diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
3453     index bbe7f72db9c1..453db950dc9f 100644
3454     --- a/net/ipv4/ip_fragment.c
3455     +++ b/net/ipv4/ip_fragment.c
3456     @@ -835,8 +835,6 @@ static void __init ip4_frags_ctl_register(void)
3457    
3458     static int __net_init ipv4_frags_init_net(struct net *net)
3459     {
3460     - int res;
3461     -
3462     /* Fragment cache limits.
3463     *
3464     * The fragment memory accounting code, (tries to) account for
3465     @@ -862,13 +860,9 @@ static int __net_init ipv4_frags_init_net(struct net *net)
3466    
3467     net->ipv4.frags.max_dist = 64;
3468    
3469     - res = inet_frags_init_net(&net->ipv4.frags);
3470     - if (res)
3471     - return res;
3472     - res = ip4_frags_ns_ctl_register(net);
3473     - if (res)
3474     - inet_frags_uninit_net(&net->ipv4.frags);
3475     - return res;
3476     + inet_frags_init_net(&net->ipv4.frags);
3477     +
3478     + return ip4_frags_ns_ctl_register(net);
3479     }
3480    
3481     static void __net_exit ipv4_frags_exit_net(struct net *net)
3482     diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
3483     index 5719d6ba0824..bd7f1836bb70 100644
3484     --- a/net/ipv4/ip_tunnel.c
3485     +++ b/net/ipv4/ip_tunnel.c
3486     @@ -609,8 +609,8 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
3487     ip_rt_put(rt);
3488     goto tx_dropped;
3489     }
3490     - iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, key->tos,
3491     - key->ttl, df, !net_eq(tunnel->net, dev_net(dev)));
3492     + iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
3493     + df, !net_eq(tunnel->net, dev_net(dev)));
3494     return;
3495     tx_error:
3496     dev->stats.tx_errors++;
3497     diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
3498     index 1a4db27f5833..6b3d27e50317 100644
3499     --- a/net/ipv4/tcp.c
3500     +++ b/net/ipv4/tcp.c
3501     @@ -2297,6 +2297,10 @@ int tcp_disconnect(struct sock *sk, int flags)
3502     tcp_set_ca_state(sk, TCP_CA_Open);
3503     tcp_clear_retrans(tp);
3504     inet_csk_delack_init(sk);
3505     + /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0
3506     + * issue in __tcp_select_window()
3507     + */
3508     + icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
3509     tcp_init_send_head(sk);
3510     memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
3511     __sk_dst_reset(sk);
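
The tcp_disconnect() hunk re-seeds icsk_ack.rcv_mss because __tcp_select_window() divides the receive space by rcv_mss, and a socket reset and reused before any data arrives could otherwise reach that division with rcv_mss still zero. A toy reproduction of the arithmetic (TCP_MIN_MSS is 88 in the kernel; the window formula here is simplified):

#include <stdio.h>

#define TCP_MIN_MSS 88U    /* same floor the kernel uses */

/* Simplified stand-in for the rounding __tcp_select_window() performs:
 * the advertised window is cut down to a whole number of rcv_mss units. */
static unsigned int select_window(unsigned int free_space, unsigned int rcv_mss)
{
    return (free_space / rcv_mss) * rcv_mss;    /* faults if rcv_mss == 0 */
}

int main(void)
{
    unsigned int rcv_mss = 0;    /* state of a freshly reset connection */

    if (rcv_mss == 0)
        rcv_mss = TCP_MIN_MSS;   /* what the tcp_disconnect() fix does */

    printf("window = %u\n", select_window(65535, rcv_mss));    /* 65472 */
    return 0;
}
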
3512     diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
3513     index b2cabda72320..cc101b1be903 100644
3514     --- a/net/ipv6/addrconf.c
3515     +++ b/net/ipv6/addrconf.c
3516     @@ -5443,7 +5443,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
3517     * our DAD process, so we don't need
3518     * to do it again
3519     */
3520     - if (!(ifp->rt->rt6i_node))
3521     + if (!rcu_access_pointer(ifp->rt->rt6i_node))
3522     ip6_ins_rt(ifp->rt);
3523     if (ifp->idev->cnf.forwarding)
3524     addrconf_join_anycast(ifp);
3525     diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
3526     index ff389591a340..5da864997495 100644
3527     --- a/net/ipv6/ip6_fib.c
3528     +++ b/net/ipv6/ip6_fib.c
3529     @@ -148,11 +148,23 @@ static struct fib6_node *node_alloc(void)
3530     return fn;
3531     }
3532    
3533     -static void node_free(struct fib6_node *fn)
3534     +static void node_free_immediate(struct fib6_node *fn)
3535     +{
3536     + kmem_cache_free(fib6_node_kmem, fn);
3537     +}
3538     +
3539     +static void node_free_rcu(struct rcu_head *head)
3540     {
3541     + struct fib6_node *fn = container_of(head, struct fib6_node, rcu);
3542     +
3543     kmem_cache_free(fib6_node_kmem, fn);
3544     }
3545    
3546     +static void node_free(struct fib6_node *fn)
3547     +{
3548     + call_rcu(&fn->rcu, node_free_rcu);
3549     +}
3550     +
3551     static void rt6_rcu_free(struct rt6_info *rt)
3552     {
3553     call_rcu(&rt->dst.rcu_head, dst_rcu_free);
3554     @@ -189,6 +201,12 @@ static void rt6_release(struct rt6_info *rt)
3555     }
3556     }
3557    
3558     +static void fib6_free_table(struct fib6_table *table)
3559     +{
3560     + inetpeer_invalidate_tree(&table->tb6_peers);
3561     + kfree(table);
3562     +}
3563     +
3564     static void fib6_link_table(struct net *net, struct fib6_table *tb)
3565     {
3566     unsigned int h;
3567     @@ -589,9 +607,9 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root,
3568    
3569     if (!in || !ln) {
3570     if (in)
3571     - node_free(in);
3572     + node_free_immediate(in);
3573     if (ln)
3574     - node_free(ln);
3575     + node_free_immediate(ln);
3576     return ERR_PTR(-ENOMEM);
3577     }
3578    
3579     @@ -862,7 +880,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
3580    
3581     rt->dst.rt6_next = iter;
3582     *ins = rt;
3583     - rt->rt6i_node = fn;
3584     + rcu_assign_pointer(rt->rt6i_node, fn);
3585     atomic_inc(&rt->rt6i_ref);
3586     inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
3587     info->nl_net->ipv6.rt6_stats->fib_rt_entries++;
3588     @@ -887,7 +905,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
3589     return err;
3590    
3591     *ins = rt;
3592     - rt->rt6i_node = fn;
3593     + rcu_assign_pointer(rt->rt6i_node, fn);
3594     rt->dst.rt6_next = iter->dst.rt6_next;
3595     atomic_inc(&rt->rt6i_ref);
3596     inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
3597     @@ -1020,7 +1038,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,
3598     root, and then (in failure) stale node
3599     in main tree.
3600     */
3601     - node_free(sfn);
3602     + node_free_immediate(sfn);
3603     err = PTR_ERR(sn);
3604     goto failure;
3605     }
3606     @@ -1447,8 +1465,9 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
3607    
3608     int fib6_del(struct rt6_info *rt, struct nl_info *info)
3609     {
3610     + struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node,
3611     + lockdep_is_held(&rt->rt6i_table->tb6_lock));
3612     struct net *net = info->nl_net;
3613     - struct fib6_node *fn = rt->rt6i_node;
3614     struct rt6_info **rtp;
3615    
3616     #if RT6_DEBUG >= 2
3617     @@ -1637,7 +1656,9 @@ static int fib6_clean_node(struct fib6_walker *w)
3618     if (res) {
3619     #if RT6_DEBUG >= 2
3620     pr_debug("%s: del failed: rt=%p@%p err=%d\n",
3621     - __func__, rt, rt->rt6i_node, res);
3622     + __func__, rt,
3623     + rcu_access_pointer(rt->rt6i_node),
3624     + res);
3625     #endif
3626     continue;
3627     }
3628     @@ -1878,15 +1899,22 @@ static int __net_init fib6_net_init(struct net *net)
3629    
3630     static void fib6_net_exit(struct net *net)
3631     {
3632     + unsigned int i;
3633     +
3634     rt6_ifdown(net, NULL);
3635     del_timer_sync(&net->ipv6.ip6_fib_timer);
3636    
3637     -#ifdef CONFIG_IPV6_MULTIPLE_TABLES
3638     - inetpeer_invalidate_tree(&net->ipv6.fib6_local_tbl->tb6_peers);
3639     - kfree(net->ipv6.fib6_local_tbl);
3640     -#endif
3641     - inetpeer_invalidate_tree(&net->ipv6.fib6_main_tbl->tb6_peers);
3642     - kfree(net->ipv6.fib6_main_tbl);
3643     + for (i = 0; i < FIB6_TABLE_HASHSZ; i++) {
3644     + struct hlist_head *head = &net->ipv6.fib_table_hash[i];
3645     + struct hlist_node *tmp;
3646     + struct fib6_table *tb;
3647     +
3648     + hlist_for_each_entry_safe(tb, tmp, head, tb6_hlist) {
3649     + hlist_del(&tb->tb6_hlist);
3650     + fib6_free_table(tb);
3651     + }
3652     + }
3653     +
3654     kfree(net->ipv6.fib_table_hash);
3655     kfree(net->ipv6.rt6_stats);
3656     }
3657     diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
3658     index d2844ee469cb..f78afe43bdff 100644
3659     --- a/net/ipv6/ip6_gre.c
3660     +++ b/net/ipv6/ip6_gre.c
3661     @@ -432,7 +432,9 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
3662     }
3663     break;
3664     case ICMPV6_PKT_TOOBIG:
3665     - mtu = be32_to_cpu(info) - offset;
3666     + mtu = be32_to_cpu(info) - offset - t->tun_hlen;
3667     + if (t->dev->type == ARPHRD_ETHER)
3668     + mtu -= ETH_HLEN;
3669     if (mtu < IPV6_MIN_MTU)
3670     mtu = IPV6_MIN_MTU;
3671     t->dev->mtu = mtu;
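
The ip6gre_err() fix subtracts the tunnel encapsulation from the MTU reported in ICMPV6_PKT_TOOBIG before clamping and storing it on the device, and additionally sheds the Ethernet header on ARPHRD_ETHER tunnels. The arithmetic with the constants spelled out (IPV6_MIN_MTU is 1280 and ETH_HLEN is 14; the 8-byte tun_hlen is only an example value):

#include <stdio.h>

#define IPV6_MIN_MTU 1280U
#define ETH_HLEN     14U

/* The reported MTU covers the outer packet, so the tunnel payload MTU must
 * shed the encapsulation overhead before being clamped and stored. */
static unsigned int tunnel_mtu(unsigned int reported, unsigned int offset,
                               unsigned int tun_hlen, int is_ether)
{
    unsigned int mtu = reported - offset - tun_hlen;

    if (is_ether)
        mtu -= ETH_HLEN;
    if (mtu < IPV6_MIN_MTU)
        mtu = IPV6_MIN_MTU;
    return mtu;
}

int main(void)
{
    /* peer reports 1500, no extra offset, 8-byte GRE header, ether tunnel */
    printf("dev->mtu = %u\n", tunnel_mtu(1500, 0, 8, 1));    /* 1478 */
    return 0;
}
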
3672     diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
3673     index 986d4ca38832..b263bf3a19f7 100644
3674     --- a/net/ipv6/netfilter/nf_conntrack_reasm.c
3675     +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
3676     @@ -622,18 +622,12 @@ EXPORT_SYMBOL_GPL(nf_ct_frag6_gather);
3677    
3678     static int nf_ct_net_init(struct net *net)
3679     {
3680     - int res;
3681     -
3682     net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
3683     net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
3684     net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT;
3685     - res = inet_frags_init_net(&net->nf_frag.frags);
3686     - if (res)
3687     - return res;
3688     - res = nf_ct_frag6_sysctl_register(net);
3689     - if (res)
3690     - inet_frags_uninit_net(&net->nf_frag.frags);
3691     - return res;
3692     + inet_frags_init_net(&net->nf_frag.frags);
3693     +
3694     + return nf_ct_frag6_sysctl_register(net);
3695     }
3696    
3697     static void nf_ct_net_exit(struct net *net)
3698     diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c
3699     index abb2c307fbe8..a338bbc33cf3 100644
3700     --- a/net/ipv6/output_core.c
3701     +++ b/net/ipv6/output_core.c
3702     @@ -86,7 +86,6 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
3703    
3704     while (offset <= packet_len) {
3705     struct ipv6_opt_hdr *exthdr;
3706     - unsigned int len;
3707    
3708     switch (**nexthdr) {
3709    
3710     @@ -112,10 +111,9 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
3711    
3712     exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
3713     offset);
3714     - len = ipv6_optlen(exthdr);
3715     - if (len + offset >= IPV6_MAXPLEN)
3716     + offset += ipv6_optlen(exthdr);
3717     + if (offset > IPV6_MAXPLEN)
3718     return -EINVAL;
3719     - offset += len;
3720     *nexthdr = &exthdr->nexthdr;
3721     }
3722    
3723     diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
3724     index 3815e8505ed2..e585c0a2591c 100644
3725     --- a/net/ipv6/reassembly.c
3726     +++ b/net/ipv6/reassembly.c
3727     @@ -709,19 +709,13 @@ static void ip6_frags_sysctl_unregister(void)
3728    
3729     static int __net_init ipv6_frags_init_net(struct net *net)
3730     {
3731     - int res;
3732     -
3733     net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
3734     net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
3735     net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT;
3736    
3737     - res = inet_frags_init_net(&net->ipv6.frags);
3738     - if (res)
3739     - return res;
3740     - res = ip6_frags_ns_sysctl_register(net);
3741     - if (res)
3742     - inet_frags_uninit_net(&net->ipv6.frags);
3743     - return res;
3744     + inet_frags_init_net(&net->ipv6.frags);
3745     +
3746     + return ip6_frags_ns_sysctl_register(net);
3747     }
3748    
3749     static void __net_exit ipv6_frags_exit_net(struct net *net)
3750     diff --git a/net/ipv6/route.c b/net/ipv6/route.c
3751     index 5764a84465f8..61729641e027 100644
3752     --- a/net/ipv6/route.c
3753     +++ b/net/ipv6/route.c
3754     @@ -1267,7 +1267,9 @@ static void rt6_dst_from_metrics_check(struct rt6_info *rt)
3755    
3756     static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
3757     {
3758     - if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
3759     + u32 rt_cookie = 0;
3760     +
3761     + if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
3762     return NULL;
3763    
3764     if (rt6_check_expired(rt))
3765     @@ -1335,8 +1337,14 @@ static void ip6_link_failure(struct sk_buff *skb)
3766     if (rt->rt6i_flags & RTF_CACHE) {
3767     dst_hold(&rt->dst);
3768     ip6_del_rt(rt);
3769     - } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
3770     - rt->rt6i_node->fn_sernum = -1;
3771     + } else {
3772     + struct fib6_node *fn;
3773     +
3774     + rcu_read_lock();
3775     + fn = rcu_dereference(rt->rt6i_node);
3776     + if (fn && (rt->rt6i_flags & RTF_DEFAULT))
3777     + fn->fn_sernum = -1;
3778     + rcu_read_unlock();
3779     }
3780     }
3781     }
3782     @@ -1353,7 +1361,8 @@ static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
3783     static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
3784     {
3785     return !(rt->rt6i_flags & RTF_CACHE) &&
3786     - (rt->rt6i_flags & RTF_PCPU || rt->rt6i_node);
3787     + (rt->rt6i_flags & RTF_PCPU ||
3788     + rcu_access_pointer(rt->rt6i_node));
3789     }
3790    
3791     static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
3792     diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
3793     index fecad1098cf8..7eb0e8fe3ca8 100644
3794     --- a/net/kcm/kcmsock.c
3795     +++ b/net/kcm/kcmsock.c
3796     @@ -1381,6 +1381,10 @@ static int kcm_attach(struct socket *sock, struct socket *csock,
3797     if (!csk)
3798     return -EINVAL;
3799    
3800     + /* We must prevent loops or risk deadlock! */
3801     + if (csk->sk_family == PF_KCM)
3802     + return -EOPNOTSUPP;
3803     +
3804     psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL);
3805     if (!psock)
3806     return -ENOMEM;
3807     diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
3808     index ae7bfd26cd91..35ba4b60d927 100644
3809     --- a/net/packet/af_packet.c
3810     +++ b/net/packet/af_packet.c
3811     @@ -2151,6 +2151,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
3812     struct timespec ts;
3813     __u32 ts_status;
3814     bool is_drop_n_account = false;
3815     + bool do_vnet = false;
3816    
3817     /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
3818     * We may add members to them until current aligned size without forcing
3819     @@ -2201,8 +2202,10 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
3820     netoff = TPACKET_ALIGN(po->tp_hdrlen +
3821     (maclen < 16 ? 16 : maclen)) +
3822     po->tp_reserve;
3823     - if (po->has_vnet_hdr)
3824     + if (po->has_vnet_hdr) {
3825     netoff += sizeof(struct virtio_net_hdr);
3826     + do_vnet = true;
3827     + }
3828     macoff = netoff - maclen;
3829     }
3830     if (po->tp_version <= TPACKET_V2) {
3831     @@ -2219,8 +2222,10 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
3832     skb_set_owner_r(copy_skb, sk);
3833     }
3834     snaplen = po->rx_ring.frame_size - macoff;
3835     - if ((int)snaplen < 0)
3836     + if ((int)snaplen < 0) {
3837     snaplen = 0;
3838     + do_vnet = false;
3839     + }
3840     }
3841     } else if (unlikely(macoff + snaplen >
3842     GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
3843     @@ -2233,6 +2238,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
3844     if (unlikely((int)snaplen < 0)) {
3845     snaplen = 0;
3846     macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
3847     + do_vnet = false;
3848     }
3849     }
3850     spin_lock(&sk->sk_receive_queue.lock);
3851     @@ -2258,7 +2264,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
3852     }
3853     spin_unlock(&sk->sk_receive_queue.lock);
3854    
3855     - if (po->has_vnet_hdr) {
3856     + if (do_vnet) {
3857     if (__packet_rcv_vnet(skb, h.raw + macoff -
3858     sizeof(struct virtio_net_hdr))) {
3859     spin_lock(&sk->sk_receive_queue.lock);
3860     diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c
3861     index 048954eee984..e8f56b7c5afb 100644
3862     --- a/net/sctp/sctp_diag.c
3863     +++ b/net/sctp/sctp_diag.c
3864     @@ -70,7 +70,8 @@ static int inet_diag_msg_sctpladdrs_fill(struct sk_buff *skb,
3865    
3866     info = nla_data(attr);
3867     list_for_each_entry_rcu(laddr, address_list, list) {
3868     - memcpy(info, &laddr->a, addrlen);
3869     + memcpy(info, &laddr->a, sizeof(laddr->a));
3870     + memset(info + sizeof(laddr->a), 0, addrlen - sizeof(laddr->a));
3871     info += addrlen;
3872     }
3873    
3874     @@ -93,7 +94,9 @@ static int inet_diag_msg_sctpaddrs_fill(struct sk_buff *skb,
3875     info = nla_data(attr);
3876     list_for_each_entry(from, &asoc->peer.transport_addr_list,
3877     transports) {
3878     - memcpy(info, &from->ipaddr, addrlen);
3879     + memcpy(info, &from->ipaddr, sizeof(from->ipaddr));
3880     + memset(info + sizeof(from->ipaddr), 0,
3881     + addrlen - sizeof(from->ipaddr));
3882     info += addrlen;
3883     }
3884    
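
Both sctp_diag fills copy an address that can be smaller than the fixed addrlen slot reserved in the netlink dump, so the patch copies only sizeof() the real structure and memset()s the tail to keep uninitialized kernel bytes from reaching userspace. The same copy-then-zero-the-padding idiom in isolation (sizes here are arbitrary):

#include <stdio.h>
#include <string.h>

struct addr_v4 { unsigned char bytes[8]; };    /* the real, smaller record    */
#define SLOT_LEN 16                            /* fixed-size slot in the dump */

/* Fill one fixed-size slot: copy the real data, then zero the padding so no
 * stale bytes left over in the buffer are exposed. */
static void fill_slot(unsigned char *slot, const struct addr_v4 *a)
{
    memcpy(slot, a, sizeof(*a));
    memset(slot + sizeof(*a), 0, SLOT_LEN - sizeof(*a));
}

int main(void)
{
    unsigned char slot[SLOT_LEN];
    struct addr_v4 a;

    memset(slot, 0xAA, sizeof(slot));    /* pretend the buffer held old data */
    memset(&a, 0x11, sizeof(a));

    fill_slot(slot, &a);
    for (size_t i = 0; i < SLOT_LEN; i++)
        printf("%02x ", slot[i]);        /* eight 11s followed by eight 00s */
    putchar('\n');
    return 0;
}
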
3885     diff --git a/net/sctp/socket.c b/net/sctp/socket.c
3886     index 9647e314d4fc..3ef725229449 100644
3887     --- a/net/sctp/socket.c
3888     +++ b/net/sctp/socket.c
3889     @@ -4373,8 +4373,7 @@ int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc,
3890     info->sctpi_ictrlchunks = asoc->stats.ictrlchunks;
3891    
3892     prim = asoc->peer.primary_path;
3893     - memcpy(&info->sctpi_p_address, &prim->ipaddr,
3894     - sizeof(struct sockaddr_storage));
3895     + memcpy(&info->sctpi_p_address, &prim->ipaddr, sizeof(prim->ipaddr));
3896     info->sctpi_p_state = prim->state;
3897     info->sctpi_p_cwnd = prim->cwnd;
3898     info->sctpi_p_srtt = prim->srtt;
3899     diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
3900     index 84d0fdaf7de9..d3cfbf2f407d 100644
3901     --- a/net/sctp/ulpqueue.c
3902     +++ b/net/sctp/ulpqueue.c
3903     @@ -265,7 +265,8 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event)
3904     sctp_ulpq_clear_pd(ulpq);
3905    
3906     if (queue == &sk->sk_receive_queue && !sp->data_ready_signalled) {
3907     - sp->data_ready_signalled = 1;
3908     + if (!sock_owned_by_user(sk))
3909     + sp->data_ready_signalled = 1;
3910     sk->sk_data_ready(sk);
3911     }
3912     return 1;