Annotation of /trunk/kernel-alx/patches-4.9/0150-4.9.51-all-fixes.patch
Parent Directory | Revision Log
Revision 3035 -
(hide annotations)
(download)
Wed Dec 20 11:48:36 2017 UTC (6 years, 9 months ago) by niro
File size: 125407 byte(s)
Wed Dec 20 11:48:36 2017 UTC (6 years, 9 months ago) by niro
File size: 125407 byte(s)
-linux-4.9.51
1 | niro | 3035 | diff --git a/Makefile b/Makefile |
2 | index 038d126a15fc..b48aebbe187f 100644 | ||
3 | --- a/Makefile | ||
4 | +++ b/Makefile | ||
5 | @@ -1,6 +1,6 @@ | ||
6 | VERSION = 4 | ||
7 | PATCHLEVEL = 9 | ||
8 | -SUBLEVEL = 50 | ||
9 | +SUBLEVEL = 51 | ||
10 | EXTRAVERSION = | ||
11 | NAME = Roaring Lionus | ||
12 | |||
13 | diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h | ||
14 | index b31761ecce63..7bcd138c3aa9 100644 | ||
15 | --- a/arch/x86/include/asm/elf.h | ||
16 | +++ b/arch/x86/include/asm/elf.h | ||
17 | @@ -204,6 +204,7 @@ void set_personality_ia32(bool); | ||
18 | |||
19 | #define ELF_CORE_COPY_REGS(pr_reg, regs) \ | ||
20 | do { \ | ||
21 | + unsigned long base; \ | ||
22 | unsigned v; \ | ||
23 | (pr_reg)[0] = (regs)->r15; \ | ||
24 | (pr_reg)[1] = (regs)->r14; \ | ||
25 | @@ -226,8 +227,8 @@ do { \ | ||
26 | (pr_reg)[18] = (regs)->flags; \ | ||
27 | (pr_reg)[19] = (regs)->sp; \ | ||
28 | (pr_reg)[20] = (regs)->ss; \ | ||
29 | - (pr_reg)[21] = current->thread.fsbase; \ | ||
30 | - (pr_reg)[22] = current->thread.gsbase; \ | ||
31 | + rdmsrl(MSR_FS_BASE, base); (pr_reg)[21] = base; \ | ||
32 | + rdmsrl(MSR_KERNEL_GS_BASE, base); (pr_reg)[22] = base; \ | ||
33 | asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v; \ | ||
34 | asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v; \ | ||
35 | asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v; \ | ||
36 | diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c | ||
37 | index b3760b3c1ca0..0887d2ae3797 100644 | ||
38 | --- a/arch/x86/kernel/process_64.c | ||
39 | +++ b/arch/x86/kernel/process_64.c | ||
40 | @@ -136,6 +136,123 @@ void release_thread(struct task_struct *dead_task) | ||
41 | } | ||
42 | } | ||
43 | |||
44 | +enum which_selector { | ||
45 | + FS, | ||
46 | + GS | ||
47 | +}; | ||
48 | + | ||
49 | +/* | ||
50 | + * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are | ||
51 | + * not available. The goal is to be reasonably fast on non-FSGSBASE systems. | ||
52 | + * It's forcibly inlined because it'll generate better code and this function | ||
53 | + * is hot. | ||
54 | + */ | ||
55 | +static __always_inline void save_base_legacy(struct task_struct *prev_p, | ||
56 | + unsigned short selector, | ||
57 | + enum which_selector which) | ||
58 | +{ | ||
59 | + if (likely(selector == 0)) { | ||
60 | + /* | ||
61 | + * On Intel (without X86_BUG_NULL_SEG), the segment base could | ||
62 | + * be the pre-existing saved base or it could be zero. On AMD | ||
63 | + * (with X86_BUG_NULL_SEG), the segment base could be almost | ||
64 | + * anything. | ||
65 | + * | ||
66 | + * This branch is very hot (it's hit twice on almost every | ||
67 | + * context switch between 64-bit programs), and avoiding | ||
68 | + * the RDMSR helps a lot, so we just assume that whatever | ||
69 | + * value is already saved is correct. This matches historical | ||
70 | + * Linux behavior, so it won't break existing applications. | ||
71 | + * | ||
72 | + * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we | ||
73 | + * report that the base is zero, it needs to actually be zero: | ||
74 | + * see the corresponding logic in load_seg_legacy. | ||
75 | + */ | ||
76 | + } else { | ||
77 | + /* | ||
78 | + * If the selector is 1, 2, or 3, then the base is zero on | ||
79 | + * !X86_BUG_NULL_SEG CPUs and could be anything on | ||
80 | + * X86_BUG_NULL_SEG CPUs. In the latter case, Linux | ||
81 | + * has never attempted to preserve the base across context | ||
82 | + * switches. | ||
83 | + * | ||
84 | + * If selector > 3, then it refers to a real segment, and | ||
85 | + * saving the base isn't necessary. | ||
86 | + */ | ||
87 | + if (which == FS) | ||
88 | + prev_p->thread.fsbase = 0; | ||
89 | + else | ||
90 | + prev_p->thread.gsbase = 0; | ||
91 | + } | ||
92 | +} | ||
93 | + | ||
94 | +static __always_inline void save_fsgs(struct task_struct *task) | ||
95 | +{ | ||
96 | + savesegment(fs, task->thread.fsindex); | ||
97 | + savesegment(gs, task->thread.gsindex); | ||
98 | + save_base_legacy(task, task->thread.fsindex, FS); | ||
99 | + save_base_legacy(task, task->thread.gsindex, GS); | ||
100 | +} | ||
101 | + | ||
102 | +static __always_inline void loadseg(enum which_selector which, | ||
103 | + unsigned short sel) | ||
104 | +{ | ||
105 | + if (which == FS) | ||
106 | + loadsegment(fs, sel); | ||
107 | + else | ||
108 | + load_gs_index(sel); | ||
109 | +} | ||
110 | + | ||
111 | +static __always_inline void load_seg_legacy(unsigned short prev_index, | ||
112 | + unsigned long prev_base, | ||
113 | + unsigned short next_index, | ||
114 | + unsigned long next_base, | ||
115 | + enum which_selector which) | ||
116 | +{ | ||
117 | + if (likely(next_index <= 3)) { | ||
118 | + /* | ||
119 | + * The next task is using 64-bit TLS, is not using this | ||
120 | + * segment at all, or is having fun with arcane CPU features. | ||
121 | + */ | ||
122 | + if (next_base == 0) { | ||
123 | + /* | ||
124 | + * Nasty case: on AMD CPUs, we need to forcibly zero | ||
125 | + * the base. | ||
126 | + */ | ||
127 | + if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { | ||
128 | + loadseg(which, __USER_DS); | ||
129 | + loadseg(which, next_index); | ||
130 | + } else { | ||
131 | + /* | ||
132 | + * We could try to exhaustively detect cases | ||
133 | + * under which we can skip the segment load, | ||
134 | + * but there's really only one case that matters | ||
135 | + * for performance: if both the previous and | ||
136 | + * next states are fully zeroed, we can skip | ||
137 | + * the load. | ||
138 | + * | ||
139 | + * (This assumes that prev_base == 0 has no | ||
140 | + * false positives. This is the case on | ||
141 | + * Intel-style CPUs.) | ||
142 | + */ | ||
143 | + if (likely(prev_index | next_index | prev_base)) | ||
144 | + loadseg(which, next_index); | ||
145 | + } | ||
146 | + } else { | ||
147 | + if (prev_index != next_index) | ||
148 | + loadseg(which, next_index); | ||
149 | + wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE, | ||
150 | + next_base); | ||
151 | + } | ||
152 | + } else { | ||
153 | + /* | ||
154 | + * The next task is using a real segment. Loading the selector | ||
155 | + * is sufficient. | ||
156 | + */ | ||
157 | + loadseg(which, next_index); | ||
158 | + } | ||
159 | +} | ||
160 | + | ||
161 | int copy_thread_tls(unsigned long clone_flags, unsigned long sp, | ||
162 | unsigned long arg, struct task_struct *p, unsigned long tls) | ||
163 | { | ||
164 | @@ -216,10 +333,19 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, | ||
165 | unsigned long new_sp, | ||
166 | unsigned int _cs, unsigned int _ss, unsigned int _ds) | ||
167 | { | ||
168 | + WARN_ON_ONCE(regs != current_pt_regs()); | ||
169 | + | ||
170 | + if (static_cpu_has(X86_BUG_NULL_SEG)) { | ||
171 | + /* Loading zero below won't clear the base. */ | ||
172 | + loadsegment(fs, __USER_DS); | ||
173 | + load_gs_index(__USER_DS); | ||
174 | + } | ||
175 | + | ||
176 | loadsegment(fs, 0); | ||
177 | loadsegment(es, _ds); | ||
178 | loadsegment(ds, _ds); | ||
179 | load_gs_index(0); | ||
180 | + | ||
181 | regs->ip = new_ip; | ||
182 | regs->sp = new_sp; | ||
183 | regs->cs = _cs; | ||
184 | @@ -264,7 +390,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | ||
185 | struct fpu *next_fpu = &next->fpu; | ||
186 | int cpu = smp_processor_id(); | ||
187 | struct tss_struct *tss = &per_cpu(cpu_tss, cpu); | ||
188 | - unsigned prev_fsindex, prev_gsindex; | ||
189 | fpu_switch_t fpu_switch; | ||
190 | |||
191 | fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu); | ||
192 | @@ -274,8 +399,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | ||
193 | * | ||
194 | * (e.g. xen_load_tls()) | ||
195 | */ | ||
196 | - savesegment(fs, prev_fsindex); | ||
197 | - savesegment(gs, prev_gsindex); | ||
198 | + save_fsgs(prev_p); | ||
199 | |||
200 | /* | ||
201 | * Load TLS before restoring any segments so that segment loads | ||
202 | @@ -314,108 +438,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | ||
203 | if (unlikely(next->ds | prev->ds)) | ||
204 | loadsegment(ds, next->ds); | ||
205 | |||
206 | - /* | ||
207 | - * Switch FS and GS. | ||
208 | - * | ||
209 | - * These are even more complicated than DS and ES: they have | ||
210 | - * 64-bit bases are that controlled by arch_prctl. The bases | ||
211 | - * don't necessarily match the selectors, as user code can do | ||
212 | - * any number of things to cause them to be inconsistent. | ||
213 | - * | ||
214 | - * We don't promise to preserve the bases if the selectors are | ||
215 | - * nonzero. We also don't promise to preserve the base if the | ||
216 | - * selector is zero and the base doesn't match whatever was | ||
217 | - * most recently passed to ARCH_SET_FS/GS. (If/when the | ||
218 | - * FSGSBASE instructions are enabled, we'll need to offer | ||
219 | - * stronger guarantees.) | ||
220 | - * | ||
221 | - * As an invariant, | ||
222 | - * (fsbase != 0 && fsindex != 0) || (gsbase != 0 && gsindex != 0) is | ||
223 | - * impossible. | ||
224 | - */ | ||
225 | - if (next->fsindex) { | ||
226 | - /* Loading a nonzero value into FS sets the index and base. */ | ||
227 | - loadsegment(fs, next->fsindex); | ||
228 | - } else { | ||
229 | - if (next->fsbase) { | ||
230 | - /* Next index is zero but next base is nonzero. */ | ||
231 | - if (prev_fsindex) | ||
232 | - loadsegment(fs, 0); | ||
233 | - wrmsrl(MSR_FS_BASE, next->fsbase); | ||
234 | - } else { | ||
235 | - /* Next base and index are both zero. */ | ||
236 | - if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { | ||
237 | - /* | ||
238 | - * We don't know the previous base and can't | ||
239 | - * find out without RDMSR. Forcibly clear it. | ||
240 | - */ | ||
241 | - loadsegment(fs, __USER_DS); | ||
242 | - loadsegment(fs, 0); | ||
243 | - } else { | ||
244 | - /* | ||
245 | - * If the previous index is zero and ARCH_SET_FS | ||
246 | - * didn't change the base, then the base is | ||
247 | - * also zero and we don't need to do anything. | ||
248 | - */ | ||
249 | - if (prev->fsbase || prev_fsindex) | ||
250 | - loadsegment(fs, 0); | ||
251 | - } | ||
252 | - } | ||
253 | - } | ||
254 | - /* | ||
255 | - * Save the old state and preserve the invariant. | ||
256 | - * NB: if prev_fsindex == 0, then we can't reliably learn the base | ||
257 | - * without RDMSR because Intel user code can zero it without telling | ||
258 | - * us and AMD user code can program any 32-bit value without telling | ||
259 | - * us. | ||
260 | - */ | ||
261 | - if (prev_fsindex) | ||
262 | - prev->fsbase = 0; | ||
263 | - prev->fsindex = prev_fsindex; | ||
264 | - | ||
265 | - if (next->gsindex) { | ||
266 | - /* Loading a nonzero value into GS sets the index and base. */ | ||
267 | - load_gs_index(next->gsindex); | ||
268 | - } else { | ||
269 | - if (next->gsbase) { | ||
270 | - /* Next index is zero but next base is nonzero. */ | ||
271 | - if (prev_gsindex) | ||
272 | - load_gs_index(0); | ||
273 | - wrmsrl(MSR_KERNEL_GS_BASE, next->gsbase); | ||
274 | - } else { | ||
275 | - /* Next base and index are both zero. */ | ||
276 | - if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { | ||
277 | - /* | ||
278 | - * We don't know the previous base and can't | ||
279 | - * find out without RDMSR. Forcibly clear it. | ||
280 | - * | ||
281 | - * This contains a pointless SWAPGS pair. | ||
282 | - * Fixing it would involve an explicit check | ||
283 | - * for Xen or a new pvop. | ||
284 | - */ | ||
285 | - load_gs_index(__USER_DS); | ||
286 | - load_gs_index(0); | ||
287 | - } else { | ||
288 | - /* | ||
289 | - * If the previous index is zero and ARCH_SET_GS | ||
290 | - * didn't change the base, then the base is | ||
291 | - * also zero and we don't need to do anything. | ||
292 | - */ | ||
293 | - if (prev->gsbase || prev_gsindex) | ||
294 | - load_gs_index(0); | ||
295 | - } | ||
296 | - } | ||
297 | - } | ||
298 | - /* | ||
299 | - * Save the old state and preserve the invariant. | ||
300 | - * NB: if prev_gsindex == 0, then we can't reliably learn the base | ||
301 | - * without RDMSR because Intel user code can zero it without telling | ||
302 | - * us and AMD user code can program any 32-bit value without telling | ||
303 | - * us. | ||
304 | - */ | ||
305 | - if (prev_gsindex) | ||
306 | - prev->gsbase = 0; | ||
307 | - prev->gsindex = prev_gsindex; | ||
308 | + load_seg_legacy(prev->fsindex, prev->fsbase, | ||
309 | + next->fsindex, next->fsbase, FS); | ||
310 | + load_seg_legacy(prev->gsindex, prev->gsbase, | ||
311 | + next->gsindex, next->gsbase, GS); | ||
312 | |||
313 | switch_fpu_finish(next_fpu, fpu_switch); | ||
314 | |||
315 | diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c | ||
316 | index 383f19c6bf24..549b4afd12e1 100644 | ||
317 | --- a/drivers/md/raid5.c | ||
318 | +++ b/drivers/md/raid5.c | ||
319 | @@ -5844,6 +5844,8 @@ static void raid5_do_work(struct work_struct *work) | ||
320 | |||
321 | spin_unlock_irq(&conf->device_lock); | ||
322 | |||
323 | + r5l_flush_stripe_to_raid(conf->log); | ||
324 | + | ||
325 | async_tx_issue_pending_all(); | ||
326 | blk_finish_plug(&plug); | ||
327 | |||
328 | diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c | ||
329 | index e8139514d32c..9e073fb6870a 100644 | ||
330 | --- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c | ||
331 | +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c | ||
332 | @@ -317,12 +317,12 @@ int t4_wr_mbox_meat_timeout(struct adapter *adap, int mbox, const void *cmd, | ||
333 | |||
334 | if (v != MBOX_OWNER_DRV) { | ||
335 | ret = (v == MBOX_OWNER_FW) ? -EBUSY : -ETIMEDOUT; | ||
336 | - t4_record_mbox(adap, cmd, MBOX_LEN, access, ret); | ||
337 | + t4_record_mbox(adap, cmd, size, access, ret); | ||
338 | return ret; | ||
339 | } | ||
340 | |||
341 | /* Copy in the new mailbox command and send it on its way ... */ | ||
342 | - t4_record_mbox(adap, cmd, MBOX_LEN, access, 0); | ||
343 | + t4_record_mbox(adap, cmd, size, access, 0); | ||
344 | for (i = 0; i < size; i += 8) | ||
345 | t4_write_reg64(adap, data_reg + i, be64_to_cpu(*p++)); | ||
346 | |||
347 | @@ -371,7 +371,7 @@ int t4_wr_mbox_meat_timeout(struct adapter *adap, int mbox, const void *cmd, | ||
348 | } | ||
349 | |||
350 | ret = (pcie_fw & PCIE_FW_ERR_F) ? -ENXIO : -ETIMEDOUT; | ||
351 | - t4_record_mbox(adap, cmd, MBOX_LEN, access, ret); | ||
352 | + t4_record_mbox(adap, cmd, size, access, ret); | ||
353 | dev_err(adap->pdev_dev, "command %#x in mailbox %d timed out\n", | ||
354 | *(const u8 *)cmd, mbox); | ||
355 | t4_report_fw_error(adap); | ||
356 | diff --git a/drivers/net/ethernet/freescale/fman/mac.c b/drivers/net/ethernet/freescale/fman/mac.c | ||
357 | index 736db9d9b0ad..81021f87e4f3 100644 | ||
358 | --- a/drivers/net/ethernet/freescale/fman/mac.c | ||
359 | +++ b/drivers/net/ethernet/freescale/fman/mac.c | ||
360 | @@ -622,6 +622,9 @@ static struct platform_device *dpaa_eth_add_device(int fman_id, | ||
361 | goto no_mem; | ||
362 | } | ||
363 | |||
364 | + pdev->dev.of_node = node; | ||
365 | + pdev->dev.parent = priv->dev; | ||
366 | + | ||
367 | ret = platform_device_add_data(pdev, &data, sizeof(data)); | ||
368 | if (ret) | ||
369 | goto err; | ||
370 | diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c | ||
371 | index 3f4e71148808..fd206889a433 100644 | ||
372 | --- a/drivers/net/ethernet/freescale/gianfar.c | ||
373 | +++ b/drivers/net/ethernet/freescale/gianfar.c | ||
374 | @@ -3690,7 +3690,7 @@ static noinline void gfar_update_link_state(struct gfar_private *priv) | ||
375 | u32 tempval1 = gfar_read(&regs->maccfg1); | ||
376 | u32 tempval = gfar_read(&regs->maccfg2); | ||
377 | u32 ecntrl = gfar_read(&regs->ecntrl); | ||
378 | - u32 tx_flow_oldval = (tempval & MACCFG1_TX_FLOW); | ||
379 | + u32 tx_flow_oldval = (tempval1 & MACCFG1_TX_FLOW); | ||
380 | |||
381 | if (phydev->duplex != priv->oldduplex) { | ||
382 | if (!(phydev->duplex)) | ||
383 | diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c | ||
384 | index f902c4d3de99..1806b1fc6e4c 100644 | ||
385 | --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c | ||
386 | +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c | ||
387 | @@ -4172,6 +4172,8 @@ static int mlxsw_sp_netdevice_port_upper_event(struct net_device *dev, | ||
388 | return -EINVAL; | ||
389 | if (!info->linking) | ||
390 | break; | ||
391 | + if (netdev_has_any_upper_dev(upper_dev)) | ||
392 | + return -EINVAL; | ||
393 | /* HW limitation forbids to put ports to multiple bridges. */ | ||
394 | if (netif_is_bridge_master(upper_dev) && | ||
395 | !mlxsw_sp_master_bridge_check(mlxsw_sp, upper_dev)) | ||
396 | @@ -4185,6 +4187,10 @@ static int mlxsw_sp_netdevice_port_upper_event(struct net_device *dev, | ||
397 | if (netif_is_lag_port(dev) && is_vlan_dev(upper_dev) && | ||
398 | !netif_is_lag_master(vlan_dev_real_dev(upper_dev))) | ||
399 | return -EINVAL; | ||
400 | + if (!info->linking) | ||
401 | + break; | ||
402 | + if (netdev_has_any_upper_dev(upper_dev)) | ||
403 | + return -EINVAL; | ||
404 | break; | ||
405 | case NETDEV_CHANGEUPPER: | ||
406 | upper_dev = info->upper_dev; | ||
407 | diff --git a/drivers/net/ethernet/qlogic/qlge/qlge_dbg.c b/drivers/net/ethernet/qlogic/qlge/qlge_dbg.c | ||
408 | index 829be21f97b2..be258d90de9e 100644 | ||
409 | --- a/drivers/net/ethernet/qlogic/qlge/qlge_dbg.c | ||
410 | +++ b/drivers/net/ethernet/qlogic/qlge/qlge_dbg.c | ||
411 | @@ -724,7 +724,7 @@ static void ql_build_coredump_seg_header( | ||
412 | seg_hdr->cookie = MPI_COREDUMP_COOKIE; | ||
413 | seg_hdr->segNum = seg_number; | ||
414 | seg_hdr->segSize = seg_size; | ||
415 | - memcpy(seg_hdr->description, desc, (sizeof(seg_hdr->description)) - 1); | ||
416 | + strncpy(seg_hdr->description, desc, (sizeof(seg_hdr->description)) - 1); | ||
417 | } | ||
418 | |||
419 | /* | ||
420 | diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c | ||
421 | index ff038e507fd6..36a04e182af1 100644 | ||
422 | --- a/drivers/net/hyperv/netvsc_drv.c | ||
423 | +++ b/drivers/net/hyperv/netvsc_drv.c | ||
424 | @@ -1084,7 +1084,12 @@ static void netvsc_link_change(struct work_struct *w) | ||
425 | bool notify = false, reschedule = false; | ||
426 | unsigned long flags, next_reconfig, delay; | ||
427 | |||
428 | - rtnl_lock(); | ||
429 | + /* if changes are happening, comeback later */ | ||
430 | + if (!rtnl_trylock()) { | ||
431 | + schedule_delayed_work(&ndev_ctx->dwork, LINKCHANGE_INT); | ||
432 | + return; | ||
433 | + } | ||
434 | + | ||
435 | if (ndev_ctx->start_remove) | ||
436 | goto out_unlock; | ||
437 | |||
438 | diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c | ||
439 | index a5d66e205bb2..2caac0c37059 100644 | ||
440 | --- a/drivers/net/macsec.c | ||
441 | +++ b/drivers/net/macsec.c | ||
442 | @@ -3510,6 +3510,7 @@ module_init(macsec_init); | ||
443 | module_exit(macsec_exit); | ||
444 | |||
445 | MODULE_ALIAS_RTNL_LINK("macsec"); | ||
446 | +MODULE_ALIAS_GENL_FAMILY("macsec"); | ||
447 | |||
448 | MODULE_DESCRIPTION("MACsec IEEE 802.1AE"); | ||
449 | MODULE_LICENSE("GPL v2"); | ||
450 | diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c | ||
451 | index 775a6e1fdef9..6e12401b5102 100644 | ||
452 | --- a/drivers/net/phy/phy.c | ||
453 | +++ b/drivers/net/phy/phy.c | ||
454 | @@ -674,9 +674,6 @@ void phy_stop_machine(struct phy_device *phydev) | ||
455 | if (phydev->state > PHY_UP && phydev->state != PHY_HALTED) | ||
456 | phydev->state = PHY_UP; | ||
457 | mutex_unlock(&phydev->lock); | ||
458 | - | ||
459 | - /* Now we can run the state machine synchronously */ | ||
460 | - phy_state_machine(&phydev->state_queue.work); | ||
461 | } | ||
462 | |||
463 | /** | ||
464 | diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c | ||
465 | index 5dc128a8da83..96a0661011fd 100644 | ||
466 | --- a/drivers/vhost/net.c | ||
467 | +++ b/drivers/vhost/net.c | ||
468 | @@ -537,8 +537,13 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk) | ||
469 | |||
470 | preempt_enable(); | ||
471 | |||
472 | - if (vhost_enable_notify(&net->dev, vq)) | ||
473 | + if (!vhost_vq_avail_empty(&net->dev, vq)) | ||
474 | vhost_poll_queue(&vq->poll); | ||
475 | + else if (unlikely(vhost_enable_notify(&net->dev, vq))) { | ||
476 | + vhost_disable_notify(&net->dev, vq); | ||
477 | + vhost_poll_queue(&vq->poll); | ||
478 | + } | ||
479 | + | ||
480 | mutex_unlock(&vq->mutex); | ||
481 | |||
482 | len = peek_head_len(sk); | ||
483 | diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c | ||
484 | index 2fc84a991325..98c1a63a4614 100644 | ||
485 | --- a/fs/f2fs/recovery.c | ||
486 | +++ b/fs/f2fs/recovery.c | ||
487 | @@ -316,7 +316,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, | ||
488 | return 0; | ||
489 | |||
490 | /* Get the previous summary */ | ||
491 | - for (i = CURSEG_WARM_DATA; i <= CURSEG_COLD_DATA; i++) { | ||
492 | + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { | ||
493 | struct curseg_info *curseg = CURSEG_I(sbi, i); | ||
494 | if (curseg->segno == segno) { | ||
495 | sum = curseg->sum_blk->entries[blkoff]; | ||
496 | @@ -626,8 +626,6 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) | ||
497 | } | ||
498 | |||
499 | clear_sbi_flag(sbi, SBI_POR_DOING); | ||
500 | - if (err) | ||
501 | - set_ckpt_flags(sbi, CP_ERROR_FLAG); | ||
502 | mutex_unlock(&sbi->cp_mutex); | ||
503 | |||
504 | /* let's drop all the directory inodes for clean checkpoint */ | ||
505 | diff --git a/fs/inode.c b/fs/inode.c | ||
506 | index 88110fd0b282..920aa0b1c6b0 100644 | ||
507 | --- a/fs/inode.c | ||
508 | +++ b/fs/inode.c | ||
509 | @@ -637,6 +637,7 @@ void evict_inodes(struct super_block *sb) | ||
510 | |||
511 | dispose_list(&dispose); | ||
512 | } | ||
513 | +EXPORT_SYMBOL_GPL(evict_inodes); | ||
514 | |||
515 | /** | ||
516 | * invalidate_inodes - attempt to free all inodes on a superblock | ||
517 | diff --git a/fs/internal.h b/fs/internal.h | ||
518 | index f4da3341b4a3..8b7143b0211c 100644 | ||
519 | --- a/fs/internal.h | ||
520 | +++ b/fs/internal.h | ||
521 | @@ -136,7 +136,6 @@ extern bool atime_needs_update_rcu(const struct path *, struct inode *); | ||
522 | extern void inode_io_list_del(struct inode *inode); | ||
523 | |||
524 | extern long get_nr_dirty_inodes(void); | ||
525 | -extern void evict_inodes(struct super_block *); | ||
526 | extern int invalidate_inodes(struct super_block *, bool); | ||
527 | |||
528 | /* | ||
529 | diff --git a/fs/iomap.c b/fs/iomap.c | ||
530 | index 798c291cbc75..a49db8806a3a 100644 | ||
531 | --- a/fs/iomap.c | ||
532 | +++ b/fs/iomap.c | ||
533 | @@ -281,7 +281,7 @@ iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data, | ||
534 | unsigned long bytes; /* Bytes to write to page */ | ||
535 | |||
536 | offset = (pos & (PAGE_SIZE - 1)); | ||
537 | - bytes = min_t(unsigned long, PAGE_SIZE - offset, length); | ||
538 | + bytes = min_t(loff_t, PAGE_SIZE - offset, length); | ||
539 | |||
540 | rpage = __iomap_read_page(inode, pos); | ||
541 | if (IS_ERR(rpage)) | ||
542 | @@ -376,7 +376,7 @@ iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count, | ||
543 | unsigned offset, bytes; | ||
544 | |||
545 | offset = pos & (PAGE_SIZE - 1); /* Within page */ | ||
546 | - bytes = min_t(unsigned, PAGE_SIZE - offset, count); | ||
547 | + bytes = min_t(loff_t, PAGE_SIZE - offset, count); | ||
548 | |||
549 | if (IS_DAX(inode)) | ||
550 | status = iomap_dax_zero(pos, offset, bytes, iomap); | ||
551 | diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c | ||
552 | index 2852521fc8ec..c6c15e5717e4 100644 | ||
553 | --- a/fs/xfs/libxfs/xfs_attr_leaf.c | ||
554 | +++ b/fs/xfs/libxfs/xfs_attr_leaf.c | ||
555 | @@ -351,7 +351,7 @@ xfs_attr3_leaf_read( | ||
556 | |||
557 | err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, | ||
558 | XFS_ATTR_FORK, &xfs_attr3_leaf_buf_ops); | ||
559 | - if (!err && tp) | ||
560 | + if (!err && tp && *bpp) | ||
561 | xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF); | ||
562 | return err; | ||
563 | } | ||
564 | diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c | ||
565 | index 2a8cbd15d5d1..d2f4ab175096 100644 | ||
566 | --- a/fs/xfs/libxfs/xfs_bmap.c | ||
567 | +++ b/fs/xfs/libxfs/xfs_bmap.c | ||
568 | @@ -579,7 +579,7 @@ xfs_bmap_validate_ret( | ||
569 | |||
570 | #else | ||
571 | #define xfs_bmap_check_leaf_extents(cur, ip, whichfork) do { } while (0) | ||
572 | -#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) | ||
573 | +#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) do { } while (0) | ||
574 | #endif /* DEBUG */ | ||
575 | |||
576 | /* | ||
577 | @@ -5555,6 +5555,8 @@ __xfs_bunmapi( | ||
578 | int whichfork; /* data or attribute fork */ | ||
579 | xfs_fsblock_t sum; | ||
580 | xfs_filblks_t len = *rlen; /* length to unmap in file */ | ||
581 | + xfs_fileoff_t max_len; | ||
582 | + xfs_agnumber_t prev_agno = NULLAGNUMBER, agno; | ||
583 | |||
584 | trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_); | ||
585 | |||
586 | @@ -5576,6 +5578,16 @@ __xfs_bunmapi( | ||
587 | ASSERT(len > 0); | ||
588 | ASSERT(nexts >= 0); | ||
589 | |||
590 | + /* | ||
591 | + * Guesstimate how many blocks we can unmap without running the risk of | ||
592 | + * blowing out the transaction with a mix of EFIs and reflink | ||
593 | + * adjustments. | ||
594 | + */ | ||
595 | + if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) | ||
596 | + max_len = min(len, xfs_refcount_max_unmap(tp->t_log_res)); | ||
597 | + else | ||
598 | + max_len = len; | ||
599 | + | ||
600 | if (!(ifp->if_flags & XFS_IFEXTENTS) && | ||
601 | (error = xfs_iread_extents(tp, ip, whichfork))) | ||
602 | return error; | ||
603 | @@ -5621,7 +5633,7 @@ __xfs_bunmapi( | ||
604 | |||
605 | extno = 0; | ||
606 | while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 && | ||
607 | - (nexts == 0 || extno < nexts)) { | ||
608 | + (nexts == 0 || extno < nexts) && max_len > 0) { | ||
609 | /* | ||
610 | * Is the found extent after a hole in which bno lives? | ||
611 | * Just back up to the previous extent, if so. | ||
612 | @@ -5647,6 +5659,17 @@ __xfs_bunmapi( | ||
613 | ASSERT(ep != NULL); | ||
614 | del = got; | ||
615 | wasdel = isnullstartblock(del.br_startblock); | ||
616 | + | ||
617 | + /* | ||
618 | + * Make sure we don't touch multiple AGF headers out of order | ||
619 | + * in a single transaction, as that could cause AB-BA deadlocks. | ||
620 | + */ | ||
621 | + if (!wasdel) { | ||
622 | + agno = XFS_FSB_TO_AGNO(mp, del.br_startblock); | ||
623 | + if (prev_agno != NULLAGNUMBER && prev_agno > agno) | ||
624 | + break; | ||
625 | + prev_agno = agno; | ||
626 | + } | ||
627 | if (got.br_startoff < start) { | ||
628 | del.br_startoff = start; | ||
629 | del.br_blockcount -= start - got.br_startoff; | ||
630 | @@ -5655,6 +5678,15 @@ __xfs_bunmapi( | ||
631 | } | ||
632 | if (del.br_startoff + del.br_blockcount > bno + 1) | ||
633 | del.br_blockcount = bno + 1 - del.br_startoff; | ||
634 | + | ||
635 | + /* How much can we safely unmap? */ | ||
636 | + if (max_len < del.br_blockcount) { | ||
637 | + del.br_startoff += del.br_blockcount - max_len; | ||
638 | + if (!wasdel) | ||
639 | + del.br_startblock += del.br_blockcount - max_len; | ||
640 | + del.br_blockcount = max_len; | ||
641 | + } | ||
642 | + | ||
643 | sum = del.br_startblock + del.br_blockcount; | ||
644 | if (isrt && | ||
645 | (mod = do_mod(sum, mp->m_sb.sb_rextsize))) { | ||
646 | @@ -5835,6 +5867,7 @@ __xfs_bunmapi( | ||
647 | if (!isrt && wasdel) | ||
648 | xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount, false); | ||
649 | |||
650 | + max_len -= del.br_blockcount; | ||
651 | bno = del.br_startoff - 1; | ||
652 | nodelete: | ||
653 | /* | ||
654 | @@ -6604,25 +6637,33 @@ xfs_bmap_finish_one( | ||
655 | int whichfork, | ||
656 | xfs_fileoff_t startoff, | ||
657 | xfs_fsblock_t startblock, | ||
658 | - xfs_filblks_t blockcount, | ||
659 | + xfs_filblks_t *blockcount, | ||
660 | xfs_exntst_t state) | ||
661 | { | ||
662 | struct xfs_bmbt_irec bmap; | ||
663 | int nimaps = 1; | ||
664 | xfs_fsblock_t firstfsb; | ||
665 | int flags = XFS_BMAPI_REMAP; | ||
666 | - int done; | ||
667 | int error = 0; | ||
668 | |||
669 | bmap.br_startblock = startblock; | ||
670 | bmap.br_startoff = startoff; | ||
671 | - bmap.br_blockcount = blockcount; | ||
672 | + bmap.br_blockcount = *blockcount; | ||
673 | bmap.br_state = state; | ||
674 | |||
675 | + /* | ||
676 | + * firstfsb is tied to the transaction lifetime and is used to | ||
677 | + * ensure correct AG locking order and schedule work item | ||
678 | + * continuations. XFS_BUI_MAX_FAST_EXTENTS (== 1) restricts us | ||
679 | + * to only making one bmap call per transaction, so it should | ||
680 | + * be safe to have it as a local variable here. | ||
681 | + */ | ||
682 | + firstfsb = NULLFSBLOCK; | ||
683 | + | ||
684 | trace_xfs_bmap_deferred(tp->t_mountp, | ||
685 | XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type, | ||
686 | XFS_FSB_TO_AGBNO(tp->t_mountp, startblock), | ||
687 | - ip->i_ino, whichfork, startoff, blockcount, state); | ||
688 | + ip->i_ino, whichfork, startoff, *blockcount, state); | ||
689 | |||
690 | if (whichfork != XFS_DATA_FORK && whichfork != XFS_ATTR_FORK) | ||
691 | return -EFSCORRUPTED; | ||
692 | @@ -6641,12 +6682,11 @@ xfs_bmap_finish_one( | ||
693 | bmap.br_blockcount, flags, &firstfsb, | ||
694 | bmap.br_blockcount, &bmap, &nimaps, | ||
695 | dfops); | ||
696 | + *blockcount = 0; | ||
697 | break; | ||
698 | case XFS_BMAP_UNMAP: | ||
699 | - error = xfs_bunmapi(tp, ip, bmap.br_startoff, | ||
700 | - bmap.br_blockcount, flags, 1, &firstfsb, | ||
701 | - dfops, &done); | ||
702 | - ASSERT(done); | ||
703 | + error = __xfs_bunmapi(tp, ip, startoff, blockcount, | ||
704 | + XFS_BMAPI_REMAP, 1, &firstfsb, dfops); | ||
705 | break; | ||
706 | default: | ||
707 | ASSERT(0); | ||
708 | diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h | ||
709 | index e7d40b39f18f..db53ac7ff6df 100644 | ||
710 | --- a/fs/xfs/libxfs/xfs_bmap.h | ||
711 | +++ b/fs/xfs/libxfs/xfs_bmap.h | ||
712 | @@ -265,7 +265,7 @@ struct xfs_bmap_intent { | ||
713 | int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_defer_ops *dfops, | ||
714 | struct xfs_inode *ip, enum xfs_bmap_intent_type type, | ||
715 | int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, | ||
716 | - xfs_filblks_t blockcount, xfs_exntst_t state); | ||
717 | + xfs_filblks_t *blockcount, xfs_exntst_t state); | ||
718 | int xfs_bmap_map_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops, | ||
719 | struct xfs_inode *ip, struct xfs_bmbt_irec *imap); | ||
720 | int xfs_bmap_unmap_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops, | ||
721 | diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c | ||
722 | index 5c3918678bb6..9968a746c649 100644 | ||
723 | --- a/fs/xfs/libxfs/xfs_bmap_btree.c | ||
724 | +++ b/fs/xfs/libxfs/xfs_bmap_btree.c | ||
725 | @@ -888,6 +888,7 @@ xfs_bmbt_change_owner( | ||
726 | cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork); | ||
727 | if (!cur) | ||
728 | return -ENOMEM; | ||
729 | + cur->bc_private.b.flags |= XFS_BTCUR_BPRV_INVALID_OWNER; | ||
730 | |||
731 | error = xfs_btree_change_owner(cur, new_owner, buffer_list); | ||
732 | xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); | ||
733 | diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c | ||
734 | index 91c68913d495..4ad1e214b1b2 100644 | ||
735 | --- a/fs/xfs/libxfs/xfs_btree.c | ||
736 | +++ b/fs/xfs/libxfs/xfs_btree.c | ||
737 | @@ -714,7 +714,8 @@ xfs_btree_firstrec( | ||
738 | * Get the block pointer for this level. | ||
739 | */ | ||
740 | block = xfs_btree_get_block(cur, level, &bp); | ||
741 | - xfs_btree_check_block(cur, block, level, bp); | ||
742 | + if (xfs_btree_check_block(cur, block, level, bp)) | ||
743 | + return 0; | ||
744 | /* | ||
745 | * It's empty, there is no such record. | ||
746 | */ | ||
747 | @@ -743,7 +744,8 @@ xfs_btree_lastrec( | ||
748 | * Get the block pointer for this level. | ||
749 | */ | ||
750 | block = xfs_btree_get_block(cur, level, &bp); | ||
751 | - xfs_btree_check_block(cur, block, level, bp); | ||
752 | + if (xfs_btree_check_block(cur, block, level, bp)) | ||
753 | + return 0; | ||
754 | /* | ||
755 | * It's empty, there is no such record. | ||
756 | */ | ||
757 | @@ -1772,6 +1774,7 @@ xfs_btree_lookup_get_block( | ||
758 | |||
759 | /* Check the inode owner since the verifiers don't. */ | ||
760 | if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) && | ||
761 | + !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_INVALID_OWNER) && | ||
762 | (cur->bc_flags & XFS_BTREE_LONG_PTRS) && | ||
763 | be64_to_cpu((*blkp)->bb_u.l.bb_owner) != | ||
764 | cur->bc_private.b.ip->i_ino) | ||
765 | @@ -4432,10 +4435,15 @@ xfs_btree_block_change_owner( | ||
766 | |||
767 | /* modify the owner */ | ||
768 | block = xfs_btree_get_block(cur, level, &bp); | ||
769 | - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) | ||
770 | + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { | ||
771 | + if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner)) | ||
772 | + return 0; | ||
773 | block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner); | ||
774 | - else | ||
775 | + } else { | ||
776 | + if (block->bb_u.s.bb_owner == cpu_to_be32(bbcoi->new_owner)) | ||
777 | + return 0; | ||
778 | block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner); | ||
779 | + } | ||
780 | |||
781 | /* | ||
782 | * If the block is a root block hosted in an inode, we might not have a | ||
783 | @@ -4444,16 +4452,19 @@ xfs_btree_block_change_owner( | ||
784 | * block is formatted into the on-disk inode fork. We still change it, | ||
785 | * though, so everything is consistent in memory. | ||
786 | */ | ||
787 | - if (bp) { | ||
788 | - if (cur->bc_tp) { | ||
789 | - xfs_trans_ordered_buf(cur->bc_tp, bp); | ||
790 | + if (!bp) { | ||
791 | + ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); | ||
792 | + ASSERT(level == cur->bc_nlevels - 1); | ||
793 | + return 0; | ||
794 | + } | ||
795 | + | ||
796 | + if (cur->bc_tp) { | ||
797 | + if (!xfs_trans_ordered_buf(cur->bc_tp, bp)) { | ||
798 | xfs_btree_log_block(cur, bp, XFS_BB_OWNER); | ||
799 | - } else { | ||
800 | - xfs_buf_delwri_queue(bp, bbcoi->buffer_list); | ||
801 | + return -EAGAIN; | ||
802 | } | ||
803 | } else { | ||
804 | - ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); | ||
805 | - ASSERT(level == cur->bc_nlevels - 1); | ||
806 | + xfs_buf_delwri_queue(bp, bbcoi->buffer_list); | ||
807 | } | ||
808 | |||
809 | return 0; | ||
810 | diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h | ||
811 | index 3b0fc1afada5..33c7be2357b9 100644 | ||
812 | --- a/fs/xfs/libxfs/xfs_btree.h | ||
813 | +++ b/fs/xfs/libxfs/xfs_btree.h | ||
814 | @@ -268,7 +268,8 @@ typedef struct xfs_btree_cur | ||
815 | short forksize; /* fork's inode space */ | ||
816 | char whichfork; /* data or attr fork */ | ||
817 | char flags; /* flags */ | ||
818 | -#define XFS_BTCUR_BPRV_WASDEL 1 /* was delayed */ | ||
819 | +#define XFS_BTCUR_BPRV_WASDEL (1<<0) /* was delayed */ | ||
820 | +#define XFS_BTCUR_BPRV_INVALID_OWNER (1<<1) /* for ext swap */ | ||
821 | } b; | ||
822 | } bc_private; /* per-btree type data */ | ||
823 | } xfs_btree_cur_t; | ||
824 | diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c | ||
825 | index 1bdf2888295b..b305dbfd81c4 100644 | ||
826 | --- a/fs/xfs/libxfs/xfs_da_btree.c | ||
827 | +++ b/fs/xfs/libxfs/xfs_da_btree.c | ||
828 | @@ -263,7 +263,7 @@ xfs_da3_node_read( | ||
829 | |||
830 | err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, | ||
831 | which_fork, &xfs_da3_node_buf_ops); | ||
832 | - if (!err && tp) { | ||
833 | + if (!err && tp && *bpp) { | ||
834 | struct xfs_da_blkinfo *info = (*bpp)->b_addr; | ||
835 | int type; | ||
836 | |||
837 | diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c | ||
838 | index aa17cb788946..43c902f7a68d 100644 | ||
839 | --- a/fs/xfs/libxfs/xfs_dir2_block.c | ||
840 | +++ b/fs/xfs/libxfs/xfs_dir2_block.c | ||
841 | @@ -139,7 +139,7 @@ xfs_dir3_block_read( | ||
842 | |||
843 | err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, -1, bpp, | ||
844 | XFS_DATA_FORK, &xfs_dir3_block_buf_ops); | ||
845 | - if (!err && tp) | ||
846 | + if (!err && tp && *bpp) | ||
847 | xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF); | ||
848 | return err; | ||
849 | } | ||
850 | diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c | ||
851 | index b887fb2a2bcf..f2e342e05365 100644 | ||
852 | --- a/fs/xfs/libxfs/xfs_dir2_leaf.c | ||
853 | +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c | ||
854 | @@ -268,7 +268,7 @@ xfs_dir3_leaf_read( | ||
855 | |||
856 | err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, | ||
857 | XFS_DATA_FORK, &xfs_dir3_leaf1_buf_ops); | ||
858 | - if (!err && tp) | ||
859 | + if (!err && tp && *bpp) | ||
860 | xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAF1_BUF); | ||
861 | return err; | ||
862 | } | ||
863 | @@ -285,7 +285,7 @@ xfs_dir3_leafn_read( | ||
864 | |||
865 | err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, | ||
866 | XFS_DATA_FORK, &xfs_dir3_leafn_buf_ops); | ||
867 | - if (!err && tp) | ||
868 | + if (!err && tp && *bpp) | ||
869 | xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAFN_BUF); | ||
870 | return err; | ||
871 | } | ||
872 | diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c | ||
873 | index a2818f6e8598..42fef0731e2a 100644 | ||
874 | --- a/fs/xfs/libxfs/xfs_ialloc.c | ||
875 | +++ b/fs/xfs/libxfs/xfs_ialloc.c | ||
876 | @@ -368,8 +368,6 @@ xfs_ialloc_inode_init( | ||
877 | * transaction and pin the log appropriately. | ||
878 | */ | ||
879 | xfs_trans_ordered_buf(tp, fbuf); | ||
880 | - xfs_trans_log_buf(tp, fbuf, 0, | ||
881 | - BBTOB(fbuf->b_length) - 1); | ||
882 | } | ||
883 | } else { | ||
884 | fbuf->b_flags |= XBF_DONE; | ||
885 | @@ -1123,6 +1121,7 @@ xfs_dialloc_ag_inobt( | ||
886 | int error; | ||
887 | int offset; | ||
888 | int i, j; | ||
889 | + int searchdistance = 10; | ||
890 | |||
891 | pag = xfs_perag_get(mp, agno); | ||
892 | |||
893 | @@ -1149,7 +1148,6 @@ xfs_dialloc_ag_inobt( | ||
894 | if (pagno == agno) { | ||
895 | int doneleft; /* done, to the left */ | ||
896 | int doneright; /* done, to the right */ | ||
897 | - int searchdistance = 10; | ||
898 | |||
899 | error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i); | ||
900 | if (error) | ||
901 | @@ -1210,21 +1208,9 @@ xfs_dialloc_ag_inobt( | ||
902 | /* | ||
903 | * Loop until we find an inode chunk with a free inode. | ||
904 | */ | ||
905 | - while (!doneleft || !doneright) { | ||
906 | + while (--searchdistance > 0 && (!doneleft || !doneright)) { | ||
907 | int useleft; /* using left inode chunk this time */ | ||
908 | |||
909 | - if (!--searchdistance) { | ||
910 | - /* | ||
911 | - * Not in range - save last search | ||
912 | - * location and allocate a new inode | ||
913 | - */ | ||
914 | - xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); | ||
915 | - pag->pagl_leftrec = trec.ir_startino; | ||
916 | - pag->pagl_rightrec = rec.ir_startino; | ||
917 | - pag->pagl_pagino = pagino; | ||
918 | - goto newino; | ||
919 | - } | ||
920 | - | ||
921 | /* figure out the closer block if both are valid. */ | ||
922 | if (!doneleft && !doneright) { | ||
923 | useleft = pagino - | ||
924 | @@ -1236,13 +1222,13 @@ xfs_dialloc_ag_inobt( | ||
925 | |||
926 | /* free inodes to the left? */ | ||
927 | if (useleft && trec.ir_freecount) { | ||
928 | - rec = trec; | ||
929 | xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); | ||
930 | cur = tcur; | ||
931 | |||
932 | pag->pagl_leftrec = trec.ir_startino; | ||
933 | pag->pagl_rightrec = rec.ir_startino; | ||
934 | pag->pagl_pagino = pagino; | ||
935 | + rec = trec; | ||
936 | goto alloc_inode; | ||
937 | } | ||
938 | |||
939 | @@ -1268,26 +1254,37 @@ xfs_dialloc_ag_inobt( | ||
940 | goto error1; | ||
941 | } | ||
942 | |||
943 | - /* | ||
944 | - * We've reached the end of the btree. because | ||
945 | - * we are only searching a small chunk of the | ||
946 | - * btree each search, there is obviously free | ||
947 | - * inodes closer to the parent inode than we | ||
948 | - * are now. restart the search again. | ||
949 | - */ | ||
950 | - pag->pagl_pagino = NULLAGINO; | ||
951 | - pag->pagl_leftrec = NULLAGINO; | ||
952 | - pag->pagl_rightrec = NULLAGINO; | ||
953 | - xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); | ||
954 | - xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); | ||
955 | - goto restart_pagno; | ||
956 | + if (searchdistance <= 0) { | ||
957 | + /* | ||
958 | + * Not in range - save last search | ||
959 | + * location and allocate a new inode | ||
960 | + */ | ||
961 | + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); | ||
962 | + pag->pagl_leftrec = trec.ir_startino; | ||
963 | + pag->pagl_rightrec = rec.ir_startino; | ||
964 | + pag->pagl_pagino = pagino; | ||
965 | + | ||
966 | + } else { | ||
967 | + /* | ||
968 | + * We've reached the end of the btree. Because | ||
969 | + * we are only searching a small chunk of the | ||
970 | + * btree each search, there are obviously free | ||
971 | + * inodes closer to the parent inode than we | ||
972 | + * are now. Restart the search again. | ||
973 | + */ | ||
974 | + pag->pagl_pagino = NULLAGINO; | ||
975 | + pag->pagl_leftrec = NULLAGINO; | ||
976 | + pag->pagl_rightrec = NULLAGINO; | ||
977 | + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); | ||
978 | + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); | ||
979 | + goto restart_pagno; | ||
980 | + } | ||
981 | } | ||
982 | |||
983 | /* | ||
984 | * In a different AG from the parent. | ||
985 | * See if the most recently allocated block has any free. | ||
986 | */ | ||
987 | -newino: | ||
988 | if (agi->agi_newino != cpu_to_be32(NULLAGINO)) { | ||
989 | error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino), | ||
990 | XFS_LOOKUP_EQ, &i); | ||
991 | diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c | ||
992 | index 8a37efe04de3..4e30448c4465 100644 | ||
993 | --- a/fs/xfs/libxfs/xfs_inode_fork.c | ||
994 | +++ b/fs/xfs/libxfs/xfs_inode_fork.c | ||
995 | @@ -1539,14 +1539,11 @@ xfs_iext_realloc_indirect( | ||
996 | xfs_ifork_t *ifp, /* inode fork pointer */ | ||
997 | int new_size) /* new indirection array size */ | ||
998 | { | ||
999 | - int nlists; /* number of irec's (ex lists) */ | ||
1000 | - int size; /* current indirection array size */ | ||
1001 | - | ||
1002 | ASSERT(ifp->if_flags & XFS_IFEXTIREC); | ||
1003 | - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; | ||
1004 | - size = nlists * sizeof(xfs_ext_irec_t); | ||
1005 | ASSERT(ifp->if_real_bytes); | ||
1006 | - ASSERT((new_size >= 0) && (new_size != size)); | ||
1007 | + ASSERT((new_size >= 0) && | ||
1008 | + (new_size != ((ifp->if_real_bytes / XFS_IEXT_BUFSZ) * | ||
1009 | + sizeof(xfs_ext_irec_t)))); | ||
1010 | if (new_size == 0) { | ||
1011 | xfs_iext_destroy(ifp); | ||
1012 | } else { | ||
1013 | diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c | ||
1014 | index 82a38d86ebad..d71cb63cdea3 100644 | ||
1015 | --- a/fs/xfs/libxfs/xfs_refcount.c | ||
1016 | +++ b/fs/xfs/libxfs/xfs_refcount.c | ||
1017 | @@ -784,14 +784,6 @@ xfs_refcount_merge_extents( | ||
1018 | } | ||
1019 | |||
1020 | /* | ||
1021 | - * While we're adjusting the refcounts records of an extent, we have | ||
1022 | - * to keep an eye on the number of extents we're dirtying -- run too | ||
1023 | - * many in a single transaction and we'll exceed the transaction's | ||
1024 | - * reservation and crash the fs. Each record adds 12 bytes to the | ||
1025 | - * log (plus any key updates) so we'll conservatively assume 24 bytes | ||
1026 | - * per record. We must also leave space for btree splits on both ends | ||
1027 | - * of the range and space for the CUD and a new CUI. | ||
1028 | - * | ||
1029 | * XXX: This is a pretty hand-wavy estimate. The penalty for guessing | ||
1030 | * true incorrectly is a shutdown FS; the penalty for guessing false | ||
1031 | * incorrectly is more transaction rolls than might be necessary. | ||
1032 | @@ -822,7 +814,7 @@ xfs_refcount_still_have_space( | ||
1033 | else if (overhead > cur->bc_tp->t_log_res) | ||
1034 | return false; | ||
1035 | return cur->bc_tp->t_log_res - overhead > | ||
1036 | - cur->bc_private.a.priv.refc.nr_ops * 32; | ||
1037 | + cur->bc_private.a.priv.refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD; | ||
1038 | } | ||
1039 | |||
1040 | /* | ||
1041 | @@ -1648,6 +1640,10 @@ xfs_refcount_recover_cow_leftovers( | ||
1042 | error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); | ||
1043 | if (error) | ||
1044 | goto out_trans; | ||
1045 | + if (!agbp) { | ||
1046 | + error = -ENOMEM; | ||
1047 | + goto out_trans; | ||
1048 | + } | ||
1049 | cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, NULL); | ||
1050 | |||
1051 | /* Find all the leftover CoW staging extents. */ | ||
1052 | diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h | ||
1053 | index 098dc668ab2c..eafb9d1f3b37 100644 | ||
1054 | --- a/fs/xfs/libxfs/xfs_refcount.h | ||
1055 | +++ b/fs/xfs/libxfs/xfs_refcount.h | ||
1056 | @@ -67,4 +67,20 @@ extern int xfs_refcount_free_cow_extent(struct xfs_mount *mp, | ||
1057 | extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp, | ||
1058 | xfs_agnumber_t agno); | ||
1059 | |||
1060 | +/* | ||
1061 | + * While we're adjusting the refcounts records of an extent, we have | ||
1062 | + * to keep an eye on the number of extents we're dirtying -- run too | ||
1063 | + * many in a single transaction and we'll exceed the transaction's | ||
1064 | + * reservation and crash the fs. Each record adds 12 bytes to the | ||
1065 | + * log (plus any key updates) so we'll conservatively assume 32 bytes | ||
1066 | + * per record. We must also leave space for btree splits on both ends | ||
1067 | + * of the range and space for the CUD and a new CUI. | ||
1068 | + */ | ||
1069 | +#define XFS_REFCOUNT_ITEM_OVERHEAD 32 | ||
1070 | + | ||
1071 | +static inline xfs_fileoff_t xfs_refcount_max_unmap(int log_res) | ||
1072 | +{ | ||
1073 | + return (log_res * 3 / 4) / XFS_REFCOUNT_ITEM_OVERHEAD; | ||
1074 | +} | ||
1075 | + | ||
1076 | #endif /* __XFS_REFCOUNT_H__ */ | ||
1077 | diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c | ||
1078 | index 578981412615..d23889e0bedc 100644 | ||
1079 | --- a/fs/xfs/xfs_aops.c | ||
1080 | +++ b/fs/xfs/xfs_aops.c | ||
1081 | @@ -90,11 +90,11 @@ xfs_find_bdev_for_inode( | ||
1082 | * associated buffer_heads, paying attention to the start and end offsets that | ||
1083 | * we need to process on the page. | ||
1084 | * | ||
1085 | - * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last | ||
1086 | - * buffer in the IO. Once it does this, it is unsafe to access the bufferhead or | ||
1087 | - * the page at all, as we may be racing with memory reclaim and it can free both | ||
1088 | - * the bufferhead chain and the page as it will see the page as clean and | ||
1089 | - * unused. | ||
1090 | + * Note that we open code the action in end_buffer_async_write here so that we | ||
1091 | + * only have to iterate over the buffers attached to the page once. This is not | ||
1092 | + * only more efficient, but also ensures that we only call end_page_writeback | ||
1093 | + * at the end of the iteration, and thus avoids the pitfall of having the page | ||
1094 | + * and buffers potentially freed after every call to end_buffer_async_write. | ||
1095 | */ | ||
1096 | static void | ||
1097 | xfs_finish_page_writeback( | ||
1098 | @@ -102,29 +102,45 @@ xfs_finish_page_writeback( | ||
1099 | struct bio_vec *bvec, | ||
1100 | int error) | ||
1101 | { | ||
1102 | - unsigned int end = bvec->bv_offset + bvec->bv_len - 1; | ||
1103 | - struct buffer_head *head, *bh, *next; | ||
1104 | + struct buffer_head *head = page_buffers(bvec->bv_page), *bh = head; | ||
1105 | + bool busy = false; | ||
1106 | unsigned int off = 0; | ||
1107 | - unsigned int bsize; | ||
1108 | + unsigned long flags; | ||
1109 | |||
1110 | ASSERT(bvec->bv_offset < PAGE_SIZE); | ||
1111 | ASSERT((bvec->bv_offset & (i_blocksize(inode) - 1)) == 0); | ||
1112 | - ASSERT(end < PAGE_SIZE); | ||
1113 | + ASSERT(bvec->bv_offset + bvec->bv_len <= PAGE_SIZE); | ||
1114 | ASSERT((bvec->bv_len & (i_blocksize(inode) - 1)) == 0); | ||
1115 | |||
1116 | - bh = head = page_buffers(bvec->bv_page); | ||
1117 | - | ||
1118 | - bsize = bh->b_size; | ||
1119 | + local_irq_save(flags); | ||
1120 | + bit_spin_lock(BH_Uptodate_Lock, &head->b_state); | ||
1121 | do { | ||
1122 | - if (off > end) | ||
1123 | - break; | ||
1124 | - next = bh->b_this_page; | ||
1125 | - if (off < bvec->bv_offset) | ||
1126 | - goto next_bh; | ||
1127 | - bh->b_end_io(bh, !error); | ||
1128 | -next_bh: | ||
1129 | - off += bsize; | ||
1130 | - } while ((bh = next) != head); | ||
1131 | + if (off >= bvec->bv_offset && | ||
1132 | + off < bvec->bv_offset + bvec->bv_len) { | ||
1133 | + ASSERT(buffer_async_write(bh)); | ||
1134 | + ASSERT(bh->b_end_io == NULL); | ||
1135 | + | ||
1136 | + if (error) { | ||
1137 | + mapping_set_error(bvec->bv_page->mapping, -EIO); | ||
1138 | + set_buffer_write_io_error(bh); | ||
1139 | + clear_buffer_uptodate(bh); | ||
1140 | + SetPageError(bvec->bv_page); | ||
1141 | + } else { | ||
1142 | + set_buffer_uptodate(bh); | ||
1143 | + } | ||
1144 | + clear_buffer_async_write(bh); | ||
1145 | + unlock_buffer(bh); | ||
1146 | + } else if (buffer_async_write(bh)) { | ||
1147 | + ASSERT(buffer_locked(bh)); | ||
1148 | + busy = true; | ||
1149 | + } | ||
1150 | + off += bh->b_size; | ||
1151 | + } while ((bh = bh->b_this_page) != head); | ||
1152 | + bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); | ||
1153 | + local_irq_restore(flags); | ||
1154 | + | ||
1155 | + if (!busy) | ||
1156 | + end_page_writeback(bvec->bv_page); | ||
1157 | } | ||
1158 | |||
1159 | /* | ||
1160 | @@ -138,8 +154,10 @@ xfs_destroy_ioend( | ||
1161 | int error) | ||
1162 | { | ||
1163 | struct inode *inode = ioend->io_inode; | ||
1164 | - struct bio *last = ioend->io_bio; | ||
1165 | - struct bio *bio, *next; | ||
1166 | + struct bio *bio = &ioend->io_inline_bio; | ||
1167 | + struct bio *last = ioend->io_bio, *next; | ||
1168 | + u64 start = bio->bi_iter.bi_sector; | ||
1169 | + bool quiet = bio_flagged(bio, BIO_QUIET); | ||
1170 | |||
1171 | for (bio = &ioend->io_inline_bio; bio; bio = next) { | ||
1172 | struct bio_vec *bvec; | ||
1173 | @@ -160,6 +178,11 @@ xfs_destroy_ioend( | ||
1174 | |||
1175 | bio_put(bio); | ||
1176 | } | ||
1177 | + | ||
1178 | + if (unlikely(error && !quiet)) { | ||
1179 | + xfs_err_ratelimited(XFS_I(inode)->i_mount, | ||
1180 | + "writeback error on sector %llu", start); | ||
1181 | + } | ||
1182 | } | ||
1183 | |||
1184 | /* | ||
1185 | @@ -427,7 +450,8 @@ xfs_start_buffer_writeback( | ||
1186 | ASSERT(!buffer_delay(bh)); | ||
1187 | ASSERT(!buffer_unwritten(bh)); | ||
1188 | |||
1189 | - mark_buffer_async_write(bh); | ||
1190 | + bh->b_end_io = NULL; | ||
1191 | + set_buffer_async_write(bh); | ||
1192 | set_buffer_uptodate(bh); | ||
1193 | clear_buffer_dirty(bh); | ||
1194 | } | ||
1195 | @@ -1566,9 +1590,12 @@ xfs_vm_bmap( | ||
1196 | * The swap code (ab-)uses ->bmap to get a block mapping and then | ||
1197 | * bypasseѕ the file system for actual I/O. We really can't allow | ||
1198 | * that on reflinks inodes, so we have to skip out here. And yes, | ||
1199 | - * 0 is the magic code for a bmap error.. | ||
1200 | + * 0 is the magic code for a bmap error. | ||
1201 | + * | ||
1202 | + * Since we don't pass back blockdev info, we can't return bmap | ||
1203 | + * information for rt files either. | ||
1204 | */ | ||
1205 | - if (xfs_is_reflink_inode(ip)) { | ||
1206 | + if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip)) { | ||
1207 | xfs_iunlock(ip, XFS_IOLOCK_SHARED); | ||
1208 | return 0; | ||
1209 | } | ||
1210 | diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c | ||
1211 | index c4b90e794e41..5a54dcd7e7b1 100644 | ||
1212 | --- a/fs/xfs/xfs_bmap_item.c | ||
1213 | +++ b/fs/xfs/xfs_bmap_item.c | ||
1214 | @@ -395,6 +395,7 @@ xfs_bui_recover( | ||
1215 | struct xfs_map_extent *bmap; | ||
1216 | xfs_fsblock_t startblock_fsb; | ||
1217 | xfs_fsblock_t inode_fsb; | ||
1218 | + xfs_filblks_t count; | ||
1219 | bool op_ok; | ||
1220 | struct xfs_bud_log_item *budp; | ||
1221 | enum xfs_bmap_intent_type type; | ||
1222 | @@ -403,6 +404,7 @@ xfs_bui_recover( | ||
1223 | struct xfs_trans *tp; | ||
1224 | struct xfs_inode *ip = NULL; | ||
1225 | struct xfs_defer_ops dfops; | ||
1226 | + struct xfs_bmbt_irec irec; | ||
1227 | xfs_fsblock_t firstfsb; | ||
1228 | |||
1229 | ASSERT(!test_bit(XFS_BUI_RECOVERED, &buip->bui_flags)); | ||
1230 | @@ -480,13 +482,24 @@ xfs_bui_recover( | ||
1231 | } | ||
1232 | xfs_trans_ijoin(tp, ip, 0); | ||
1233 | |||
1234 | + count = bmap->me_len; | ||
1235 | error = xfs_trans_log_finish_bmap_update(tp, budp, &dfops, type, | ||
1236 | ip, whichfork, bmap->me_startoff, | ||
1237 | - bmap->me_startblock, bmap->me_len, | ||
1238 | - state); | ||
1239 | + bmap->me_startblock, &count, state); | ||
1240 | if (error) | ||
1241 | goto err_dfops; | ||
1242 | |||
1243 | + if (count > 0) { | ||
1244 | + ASSERT(type == XFS_BMAP_UNMAP); | ||
1245 | + irec.br_startblock = bmap->me_startblock; | ||
1246 | + irec.br_blockcount = count; | ||
1247 | + irec.br_startoff = bmap->me_startoff; | ||
1248 | + irec.br_state = state; | ||
1249 | + error = xfs_bmap_unmap_extent(tp->t_mountp, &dfops, ip, &irec); | ||
1250 | + if (error) | ||
1251 | + goto err_dfops; | ||
1252 | + } | ||
1253 | + | ||
1254 | /* Finish transaction, free inodes. */ | ||
1255 | error = xfs_defer_finish(&tp, &dfops, NULL); | ||
1256 | if (error) | ||
1257 | diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c | ||
1258 | index 87b495e2f15a..5ffefac081f7 100644 | ||
1259 | --- a/fs/xfs/xfs_bmap_util.c | ||
1260 | +++ b/fs/xfs/xfs_bmap_util.c | ||
1261 | @@ -1825,29 +1825,18 @@ xfs_swap_extent_forks( | ||
1262 | } | ||
1263 | |||
1264 | /* | ||
1265 | - * Before we've swapped the forks, lets set the owners of the forks | ||
1266 | - * appropriately. We have to do this as we are demand paging the btree | ||
1267 | - * buffers, and so the validation done on read will expect the owner | ||
1268 | - * field to be correctly set. Once we change the owners, we can swap the | ||
1269 | - * inode forks. | ||
1270 | + * Btree format (v3) inodes have the inode number stamped in the bmbt | ||
1271 | + * block headers. We can't start changing the bmbt blocks until the | ||
1272 | + * inode owner change is logged so recovery does the right thing in the | ||
1273 | + * event of a crash. Set the owner change log flags now and leave the | ||
1274 | + * bmbt scan as the last step. | ||
1275 | */ | ||
1276 | if (ip->i_d.di_version == 3 && | ||
1277 | - ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { | ||
1278 | + ip->i_d.di_format == XFS_DINODE_FMT_BTREE) | ||
1279 | (*target_log_flags) |= XFS_ILOG_DOWNER; | ||
1280 | - error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, | ||
1281 | - tip->i_ino, NULL); | ||
1282 | - if (error) | ||
1283 | - return error; | ||
1284 | - } | ||
1285 | - | ||
1286 | if (tip->i_d.di_version == 3 && | ||
1287 | - tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { | ||
1288 | + tip->i_d.di_format == XFS_DINODE_FMT_BTREE) | ||
1289 | (*src_log_flags) |= XFS_ILOG_DOWNER; | ||
1290 | - error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK, | ||
1291 | - ip->i_ino, NULL); | ||
1292 | - if (error) | ||
1293 | - return error; | ||
1294 | - } | ||
1295 | |||
1296 | /* | ||
1297 | * Swap the data forks of the inodes | ||
1298 | @@ -1925,6 +1914,48 @@ xfs_swap_extent_forks( | ||
1299 | return 0; | ||
1300 | } | ||
1301 | |||
1302 | +/* | ||
1303 | + * Fix up the owners of the bmbt blocks to refer to the current inode. The | ||
1304 | + * change owner scan attempts to order all modified buffers in the current | ||
1305 | + * transaction. In the event of ordered buffer failure, the offending buffer is | ||
1306 | + * physically logged as a fallback and the scan returns -EAGAIN. We must roll | ||
1307 | + * the transaction in this case to replenish the fallback log reservation and | ||
1308 | + * restart the scan. This process repeats until the scan completes. | ||
1309 | + */ | ||
1310 | +static int | ||
1311 | +xfs_swap_change_owner( | ||
1312 | + struct xfs_trans **tpp, | ||
1313 | + struct xfs_inode *ip, | ||
1314 | + struct xfs_inode *tmpip) | ||
1315 | +{ | ||
1316 | + int error; | ||
1317 | + struct xfs_trans *tp = *tpp; | ||
1318 | + | ||
1319 | + do { | ||
1320 | + error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, ip->i_ino, | ||
1321 | + NULL); | ||
1322 | + /* success or fatal error */ | ||
1323 | + if (error != -EAGAIN) | ||
1324 | + break; | ||
1325 | + | ||
1326 | + error = xfs_trans_roll(tpp, NULL); | ||
1327 | + if (error) | ||
1328 | + break; | ||
1329 | + tp = *tpp; | ||
1330 | + | ||
1331 | + /* | ||
1332 | + * Redirty both inodes so they can relog and keep the log tail | ||
1333 | + * moving forward. | ||
1334 | + */ | ||
1335 | + xfs_trans_ijoin(tp, ip, 0); | ||
1336 | + xfs_trans_ijoin(tp, tmpip, 0); | ||
1337 | + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); | ||
1338 | + xfs_trans_log_inode(tp, tmpip, XFS_ILOG_CORE); | ||
1339 | + } while (true); | ||
1340 | + | ||
1341 | + return error; | ||
1342 | +} | ||
1343 | + | ||
1344 | int | ||
1345 | xfs_swap_extents( | ||
1346 | struct xfs_inode *ip, /* target inode */ | ||
1347 | @@ -1938,8 +1969,8 @@ xfs_swap_extents( | ||
1348 | int error = 0; | ||
1349 | int lock_flags; | ||
1350 | struct xfs_ifork *cowfp; | ||
1351 | - __uint64_t f; | ||
1352 | - int resblks; | ||
1353 | + uint64_t f; | ||
1354 | + int resblks = 0; | ||
1355 | |||
1356 | /* | ||
1357 | * Lock the inodes against other IO, page faults and truncate to | ||
1358 | @@ -1987,11 +2018,8 @@ xfs_swap_extents( | ||
1359 | XFS_SWAP_RMAP_SPACE_RES(mp, | ||
1360 | XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK), | ||
1361 | XFS_DATA_FORK); | ||
1362 | - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, | ||
1363 | - 0, 0, &tp); | ||
1364 | - } else | ||
1365 | - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, | ||
1366 | - 0, 0, &tp); | ||
1367 | + } | ||
1368 | + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); | ||
1369 | if (error) | ||
1370 | goto out_unlock; | ||
1371 | |||
1372 | @@ -2076,6 +2104,23 @@ xfs_swap_extents( | ||
1373 | xfs_trans_log_inode(tp, ip, src_log_flags); | ||
1374 | xfs_trans_log_inode(tp, tip, target_log_flags); | ||
1375 | |||
1376 | + /* | ||
1377 | + * The extent forks have been swapped, but crc=1,rmapbt=0 filesystems | ||
1378 | + * have inode number owner values in the bmbt blocks that still refer to | ||
1379 | + * the old inode. Scan each bmbt to fix up the owner values with the | ||
1380 | + * inode number of the current inode. | ||
1381 | + */ | ||
1382 | + if (src_log_flags & XFS_ILOG_DOWNER) { | ||
1383 | + error = xfs_swap_change_owner(&tp, ip, tip); | ||
1384 | + if (error) | ||
1385 | + goto out_trans_cancel; | ||
1386 | + } | ||
1387 | + if (target_log_flags & XFS_ILOG_DOWNER) { | ||
1388 | + error = xfs_swap_change_owner(&tp, tip, ip); | ||
1389 | + if (error) | ||
1390 | + goto out_trans_cancel; | ||
1391 | + } | ||
1392 | + | ||
1393 | /* | ||
1394 | * If this is a synchronous mount, make sure that the | ||
1395 | * transaction goes to disk before returning to the user. | ||
1396 | diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c | ||
1397 | index 16269271ebd6..eca7baecc9f0 100644 | ||
1398 | --- a/fs/xfs/xfs_buf.c | ||
1399 | +++ b/fs/xfs/xfs_buf.c | ||
1400 | @@ -116,7 +116,7 @@ static inline void | ||
1401 | __xfs_buf_ioacct_dec( | ||
1402 | struct xfs_buf *bp) | ||
1403 | { | ||
1404 | - ASSERT(spin_is_locked(&bp->b_lock)); | ||
1405 | + lockdep_assert_held(&bp->b_lock); | ||
1406 | |||
1407 | if (bp->b_state & XFS_BSTATE_IN_FLIGHT) { | ||
1408 | bp->b_state &= ~XFS_BSTATE_IN_FLIGHT; | ||
1409 | @@ -2022,6 +2022,66 @@ xfs_buf_delwri_submit( | ||
1410 | return error; | ||
1411 | } | ||
1412 | |||
1413 | +/* | ||
1414 | + * Push a single buffer on a delwri queue. | ||
1415 | + * | ||
1416 | + * The purpose of this function is to submit a single buffer of a delwri queue | ||
1417 | + * and return with the buffer still on the original queue. The waiting delwri | ||
1418 | + * buffer submission infrastructure guarantees transfer of the delwri queue | ||
1419 | + * buffer reference to a temporary wait list. We reuse this infrastructure to | ||
1420 | + * transfer the buffer back to the original queue. | ||
1421 | + * | ||
1422 | + * Note the buffer transitions from the queued state, to the submitted and wait | ||
1423 | + * listed state and back to the queued state during this call. The buffer | ||
1424 | + * locking and queue management logic between _delwri_pushbuf() and | ||
1425 | + * _delwri_queue() guarantee that the buffer cannot be queued to another list | ||
1426 | + * before returning. | ||
1427 | + */ | ||
1428 | +int | ||
1429 | +xfs_buf_delwri_pushbuf( | ||
1430 | + struct xfs_buf *bp, | ||
1431 | + struct list_head *buffer_list) | ||
1432 | +{ | ||
1433 | + LIST_HEAD (submit_list); | ||
1434 | + int error; | ||
1435 | + | ||
1436 | + ASSERT(bp->b_flags & _XBF_DELWRI_Q); | ||
1437 | + | ||
1438 | + trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_); | ||
1439 | + | ||
1440 | + /* | ||
1441 | + * Isolate the buffer to a new local list so we can submit it for I/O | ||
1442 | + * independently from the rest of the original list. | ||
1443 | + */ | ||
1444 | + xfs_buf_lock(bp); | ||
1445 | + list_move(&bp->b_list, &submit_list); | ||
1446 | + xfs_buf_unlock(bp); | ||
1447 | + | ||
1448 | + /* | ||
1449 | + * Delwri submission clears the DELWRI_Q buffer flag and returns with | ||
1450 | + * the buffer on the wait list with an associated reference. Rather than | ||
1451 | + * bounce the buffer from a local wait list back to the original list | ||
1452 | + * after I/O completion, reuse the original list as the wait list. | ||
1453 | + */ | ||
1454 | + xfs_buf_delwri_submit_buffers(&submit_list, buffer_list); | ||
1455 | + | ||
1456 | + /* | ||
1457 | + * The buffer is now under I/O and wait listed as during typical delwri | ||
1458 | + * submission. Lock the buffer to wait for I/O completion. Rather than | ||
1459 | + * remove the buffer from the wait list and release the reference, we | ||
1460 | + * want to return with the buffer queued to the original list. The | ||
1461 | + * buffer already sits on the original list with a wait list reference, | ||
1462 | + * however. If we let the queue inherit that wait list reference, all we | ||
1463 | + * need to do is reset the DELWRI_Q flag. | ||
1464 | + */ | ||
1465 | + xfs_buf_lock(bp); | ||
1466 | + error = bp->b_error; | ||
1467 | + bp->b_flags |= _XBF_DELWRI_Q; | ||
1468 | + xfs_buf_unlock(bp); | ||
1469 | + | ||
1470 | + return error; | ||
1471 | +} | ||
1472 | + | ||
1473 | int __init | ||
1474 | xfs_buf_init(void) | ||
1475 | { | ||
1476 | diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h | ||
1477 | index ad514a8025dd..f961b19b9cc2 100644 | ||
1478 | --- a/fs/xfs/xfs_buf.h | ||
1479 | +++ b/fs/xfs/xfs_buf.h | ||
1480 | @@ -333,6 +333,7 @@ extern void xfs_buf_delwri_cancel(struct list_head *); | ||
1481 | extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *); | ||
1482 | extern int xfs_buf_delwri_submit(struct list_head *); | ||
1483 | extern int xfs_buf_delwri_submit_nowait(struct list_head *); | ||
1484 | +extern int xfs_buf_delwri_pushbuf(struct xfs_buf *, struct list_head *); | ||
1485 | |||
1486 | /* Buffer Daemon Setup Routines */ | ||
1487 | extern int xfs_buf_init(void); | ||
1488 | diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c | ||
1489 | index 0306168af332..e0a0af0946f2 100644 | ||
1490 | --- a/fs/xfs/xfs_buf_item.c | ||
1491 | +++ b/fs/xfs/xfs_buf_item.c | ||
1492 | @@ -29,6 +29,7 @@ | ||
1493 | #include "xfs_error.h" | ||
1494 | #include "xfs_trace.h" | ||
1495 | #include "xfs_log.h" | ||
1496 | +#include "xfs_inode.h" | ||
1497 | |||
1498 | |||
1499 | kmem_zone_t *xfs_buf_item_zone; | ||
1500 | @@ -322,6 +323,8 @@ xfs_buf_item_format( | ||
1501 | ASSERT((bip->bli_flags & XFS_BLI_STALE) || | ||
1502 | (xfs_blft_from_flags(&bip->__bli_format) > XFS_BLFT_UNKNOWN_BUF | ||
1503 | && xfs_blft_from_flags(&bip->__bli_format) < XFS_BLFT_MAX_BUF)); | ||
1504 | + ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED) || | ||
1505 | + (bip->bli_flags & XFS_BLI_STALE)); | ||
1506 | |||
1507 | |||
1508 | /* | ||
1509 | @@ -346,16 +349,6 @@ xfs_buf_item_format( | ||
1510 | bip->bli_flags &= ~XFS_BLI_INODE_BUF; | ||
1511 | } | ||
1512 | |||
1513 | - if ((bip->bli_flags & (XFS_BLI_ORDERED|XFS_BLI_STALE)) == | ||
1514 | - XFS_BLI_ORDERED) { | ||
1515 | - /* | ||
1516 | - * The buffer has been logged just to order it. It is not being | ||
1517 | - * included in the transaction commit, so don't format it. | ||
1518 | - */ | ||
1519 | - trace_xfs_buf_item_format_ordered(bip); | ||
1520 | - return; | ||
1521 | - } | ||
1522 | - | ||
1523 | for (i = 0; i < bip->bli_format_count; i++) { | ||
1524 | xfs_buf_item_format_segment(bip, lv, &vecp, offset, | ||
1525 | &bip->bli_formats[i]); | ||
1526 | @@ -574,26 +567,20 @@ xfs_buf_item_unlock( | ||
1527 | { | ||
1528 | struct xfs_buf_log_item *bip = BUF_ITEM(lip); | ||
1529 | struct xfs_buf *bp = bip->bli_buf; | ||
1530 | - bool clean; | ||
1531 | - bool aborted; | ||
1532 | - int flags; | ||
1533 | + bool aborted = !!(lip->li_flags & XFS_LI_ABORTED); | ||
1534 | + bool hold = !!(bip->bli_flags & XFS_BLI_HOLD); | ||
1535 | + bool dirty = !!(bip->bli_flags & XFS_BLI_DIRTY); | ||
1536 | +#if defined(DEBUG) || defined(XFS_WARN) | ||
1537 | + bool ordered = !!(bip->bli_flags & XFS_BLI_ORDERED); | ||
1538 | +#endif | ||
1539 | |||
1540 | /* Clear the buffer's association with this transaction. */ | ||
1541 | bp->b_transp = NULL; | ||
1542 | |||
1543 | /* | ||
1544 | - * If this is a transaction abort, don't return early. Instead, allow | ||
1545 | - * the brelse to happen. Normally it would be done for stale | ||
1546 | - * (cancelled) buffers at unpin time, but we'll never go through the | ||
1547 | - * pin/unpin cycle if we abort inside commit. | ||
1548 | - */ | ||
1549 | - aborted = (lip->li_flags & XFS_LI_ABORTED) ? true : false; | ||
1550 | - /* | ||
1551 | - * Before possibly freeing the buf item, copy the per-transaction state | ||
1552 | - * so we can reference it safely later after clearing it from the | ||
1553 | - * buffer log item. | ||
1554 | + * The per-transaction state has been copied above so clear it from the | ||
1555 | + * bli. | ||
1556 | */ | ||
1557 | - flags = bip->bli_flags; | ||
1558 | bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED); | ||
1559 | |||
1560 | /* | ||
1561 | @@ -601,7 +588,7 @@ xfs_buf_item_unlock( | ||
1562 | * unlock the buffer and free the buf item when the buffer is unpinned | ||
1563 | * for the last time. | ||
1564 | */ | ||
1565 | - if (flags & XFS_BLI_STALE) { | ||
1566 | + if (bip->bli_flags & XFS_BLI_STALE) { | ||
1567 | trace_xfs_buf_item_unlock_stale(bip); | ||
1568 | ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); | ||
1569 | if (!aborted) { | ||
1570 | @@ -619,40 +606,34 @@ xfs_buf_item_unlock( | ||
1571 | * regardless of whether it is dirty or not. A dirty abort implies a | ||
1572 | * shutdown, anyway. | ||
1573 | * | ||
1574 | - * Ordered buffers are dirty but may have no recorded changes, so ensure | ||
1575 | - * we only release clean items here. | ||
1576 | + * The bli dirty state should match whether the blf has logged segments | ||
1577 | + * except for ordered buffers, where only the bli should be dirty. | ||
1578 | */ | ||
1579 | - clean = (flags & XFS_BLI_DIRTY) ? false : true; | ||
1580 | - if (clean) { | ||
1581 | - int i; | ||
1582 | - for (i = 0; i < bip->bli_format_count; i++) { | ||
1583 | - if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map, | ||
1584 | - bip->bli_formats[i].blf_map_size)) { | ||
1585 | - clean = false; | ||
1586 | - break; | ||
1587 | - } | ||
1588 | - } | ||
1589 | - } | ||
1590 | + ASSERT((!ordered && dirty == xfs_buf_item_dirty_format(bip)) || | ||
1591 | + (ordered && dirty && !xfs_buf_item_dirty_format(bip))); | ||
1592 | |||
1593 | /* | ||
1594 | * Clean buffers, by definition, cannot be in the AIL. However, aborted | ||
1595 | - * buffers may be dirty and hence in the AIL. Therefore if we are | ||
1596 | - * aborting a buffer and we've just taken the last refernce away, we | ||
1597 | - * have to check if it is in the AIL before freeing it. We need to free | ||
1598 | - * it in this case, because an aborted transaction has already shut the | ||
1599 | - * filesystem down and this is the last chance we will have to do so. | ||
1600 | + * buffers may be in the AIL regardless of dirty state. An aborted | ||
1601 | + * transaction that invalidates a buffer already in the AIL may have | ||
1602 | + * marked it stale and cleared the dirty state, for example. | ||
1603 | + * | ||
1604 | + * Therefore if we are aborting a buffer and we've just taken the last | ||
1605 | + * reference away, we have to check if it is in the AIL before freeing | ||
1606 | + * it. We need to free it in this case, because an aborted transaction | ||
1607 | + * has already shut the filesystem down and this is the last chance we | ||
1608 | + * will have to do so. | ||
1609 | */ | ||
1610 | if (atomic_dec_and_test(&bip->bli_refcount)) { | ||
1611 | - if (clean) | ||
1612 | - xfs_buf_item_relse(bp); | ||
1613 | - else if (aborted) { | ||
1614 | + if (aborted) { | ||
1615 | ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp)); | ||
1616 | xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR); | ||
1617 | xfs_buf_item_relse(bp); | ||
1618 | - } | ||
1619 | + } else if (!dirty) | ||
1620 | + xfs_buf_item_relse(bp); | ||
1621 | } | ||
1622 | |||
1623 | - if (!(flags & XFS_BLI_HOLD)) | ||
1624 | + if (!hold) | ||
1625 | xfs_buf_relse(bp); | ||
1626 | } | ||
1627 | |||
1628 | @@ -942,14 +923,22 @@ xfs_buf_item_log( | ||
1629 | |||
1630 | |||
1631 | /* | ||
1632 | - * Return 1 if the buffer has been logged or ordered in a transaction (at any | ||
1633 | - * point, not just the current transaction) and 0 if not. | ||
1634 | + * Return true if the buffer has any ranges logged/dirtied by a transaction, | ||
1635 | + * false otherwise. | ||
1636 | */ | ||
1637 | -uint | ||
1638 | -xfs_buf_item_dirty( | ||
1639 | - xfs_buf_log_item_t *bip) | ||
1640 | +bool | ||
1641 | +xfs_buf_item_dirty_format( | ||
1642 | + struct xfs_buf_log_item *bip) | ||
1643 | { | ||
1644 | - return (bip->bli_flags & XFS_BLI_DIRTY); | ||
1645 | + int i; | ||
1646 | + | ||
1647 | + for (i = 0; i < bip->bli_format_count; i++) { | ||
1648 | + if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map, | ||
1649 | + bip->bli_formats[i].blf_map_size)) | ||
1650 | + return true; | ||
1651 | + } | ||
1652 | + | ||
1653 | + return false; | ||
1654 | } | ||
1655 | |||
1656 | STATIC void | ||
1657 | @@ -1051,6 +1040,31 @@ xfs_buf_do_callbacks( | ||
1658 | } | ||
1659 | } | ||
1660 | |||
1661 | +/* | ||
1662 | + * Invoke the error state callback for each log item affected by the failed I/O. | ||
1663 | + * | ||
1664 | + * If a metadata buffer write fails with a non-permanent error, the buffer is | ||
1665 | + * eventually resubmitted and so the completion callbacks are not run. The error | ||
1666 | + * state may need to be propagated to the log items attached to the buffer, | ||
1667 | + * however, so the next AIL push of the item knows how to handle it correctly. | |
1668 | + */ | ||
1669 | +STATIC void | ||
1670 | +xfs_buf_do_callbacks_fail( | ||
1671 | + struct xfs_buf *bp) | ||
1672 | +{ | ||
1673 | + struct xfs_log_item *next; | ||
1674 | + struct xfs_log_item *lip = bp->b_fspriv; | ||
1675 | + struct xfs_ail *ailp = lip->li_ailp; | ||
1676 | + | ||
1677 | + spin_lock(&ailp->xa_lock); | ||
1678 | + for (; lip; lip = next) { | ||
1679 | + next = lip->li_bio_list; | ||
1680 | + if (lip->li_ops->iop_error) | ||
1681 | + lip->li_ops->iop_error(lip, bp); | ||
1682 | + } | ||
1683 | + spin_unlock(&ailp->xa_lock); | ||
1684 | +} | ||
1685 | + | ||
1686 | static bool | ||
1687 | xfs_buf_iodone_callback_error( | ||
1688 | struct xfs_buf *bp) | ||
1689 | @@ -1120,7 +1134,11 @@ xfs_buf_iodone_callback_error( | ||
1690 | if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount) | ||
1691 | goto permanent_error; | ||
1692 | |||
1693 | - /* still a transient error, higher layers will retry */ | ||
1694 | + /* | ||
1695 | + * Still a transient error, run IO completion failure callbacks and let | ||
1696 | + * the higher layers retry the buffer. | ||
1697 | + */ | ||
1698 | + xfs_buf_do_callbacks_fail(bp); | ||
1699 | xfs_buf_ioerror(bp, 0); | ||
1700 | xfs_buf_relse(bp); | ||
1701 | return true; | ||
1702 | @@ -1201,3 +1219,31 @@ xfs_buf_iodone( | ||
1703 | xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE); | ||
1704 | xfs_buf_item_free(BUF_ITEM(lip)); | ||
1705 | } | ||
1706 | + | ||
1707 | +/* | ||
1708 | + * Requeue a failed buffer for writeback | ||
1709 | + * | ||
1710 | + * Return true if the buffer has been re-queued properly, false otherwise | ||
1711 | + */ | ||
1712 | +bool | ||
1713 | +xfs_buf_resubmit_failed_buffers( | ||
1714 | + struct xfs_buf *bp, | ||
1715 | + struct xfs_log_item *lip, | ||
1716 | + struct list_head *buffer_list) | ||
1717 | +{ | ||
1718 | + struct xfs_log_item *next; | ||
1719 | + | ||
1720 | + /* | ||
1721 | + * Clear XFS_LI_FAILED flag from all items before resubmit | ||
1722 | + * | ||
1723 | + * XFS_LI_FAILED set/clear is protected by xa_lock; the caller of this | |
1724 | + * function already has it acquired. | |
1725 | + */ | ||
1726 | + for (; lip; lip = next) { | ||
1727 | + next = lip->li_bio_list; | ||
1728 | + xfs_clear_li_failed(lip); | ||
1729 | + } | ||
1730 | + | ||
1731 | + /* Add this buffer back to the delayed write list */ | ||
1732 | + return xfs_buf_delwri_queue(bp, buffer_list); | ||
1733 | +} | ||
1734 | diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h | ||
1735 | index f7eba99d19dd..9690ce62c9a7 100644 | ||
1736 | --- a/fs/xfs/xfs_buf_item.h | ||
1737 | +++ b/fs/xfs/xfs_buf_item.h | ||
1738 | @@ -64,12 +64,15 @@ typedef struct xfs_buf_log_item { | ||
1739 | int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); | ||
1740 | void xfs_buf_item_relse(struct xfs_buf *); | ||
1741 | void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint); | ||
1742 | -uint xfs_buf_item_dirty(xfs_buf_log_item_t *); | ||
1743 | +bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *); | ||
1744 | void xfs_buf_attach_iodone(struct xfs_buf *, | ||
1745 | void(*)(struct xfs_buf *, xfs_log_item_t *), | ||
1746 | xfs_log_item_t *); | ||
1747 | void xfs_buf_iodone_callbacks(struct xfs_buf *); | ||
1748 | void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *); | ||
1749 | +bool xfs_buf_resubmit_failed_buffers(struct xfs_buf *, | ||
1750 | + struct xfs_log_item *, | ||
1751 | + struct list_head *); | ||
1752 | |||
1753 | extern kmem_zone_t *xfs_buf_item_zone; | ||
1754 | |||
1755 | diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c | ||
1756 | index df206cfc21f7..586b398f268d 100644 | ||
1757 | --- a/fs/xfs/xfs_file.c | ||
1758 | +++ b/fs/xfs/xfs_file.c | ||
1759 | @@ -729,6 +729,7 @@ xfs_file_buffered_aio_write( | ||
1760 | xfs_rw_iunlock(ip, iolock); | ||
1761 | eofb.eof_flags = XFS_EOF_FLAGS_SYNC; | ||
1762 | xfs_icache_free_eofblocks(ip->i_mount, &eofb); | ||
1763 | + xfs_icache_free_cowblocks(ip->i_mount, &eofb); | ||
1764 | goto write_retry; | ||
1765 | } | ||
1766 | |||
1767 | @@ -1139,29 +1140,8 @@ xfs_find_get_desired_pgoff( | ||
1768 | want = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1; | ||
1769 | nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, | ||
1770 | want); | ||
1771 | - /* | ||
1772 | - * No page mapped into given range. If we are searching holes | ||
1773 | - * and if this is the first time we got into the loop, it means | ||
1774 | - * that the given offset is landed in a hole, return it. | ||
1775 | - * | ||
1776 | - * If we have already stepped through some block buffers to find | ||
1777 | - * holes but they all contains data. In this case, the last | ||
1778 | - * offset is already updated and pointed to the end of the last | ||
1779 | - * mapped page, if it does not reach the endpoint to search, | ||
1780 | - * that means there should be a hole between them. | ||
1781 | - */ | ||
1782 | - if (nr_pages == 0) { | ||
1783 | - /* Data search found nothing */ | ||
1784 | - if (type == DATA_OFF) | ||
1785 | - break; | ||
1786 | - | ||
1787 | - ASSERT(type == HOLE_OFF); | ||
1788 | - if (lastoff == startoff || lastoff < endoff) { | ||
1789 | - found = true; | ||
1790 | - *offset = lastoff; | ||
1791 | - } | ||
1792 | + if (nr_pages == 0) | ||
1793 | break; | ||
1794 | - } | ||
1795 | |||
1796 | for (i = 0; i < nr_pages; i++) { | ||
1797 | struct page *page = pvec.pages[i]; | ||
1798 | @@ -1227,21 +1207,20 @@ xfs_find_get_desired_pgoff( | ||
1799 | |||
1800 | /* | ||
1801 | * The number of returned pages less than our desired, search | ||
1802 | - * done. In this case, nothing was found for searching data, | ||
1803 | - * but we found a hole behind the last offset. | ||
1804 | + * done. | ||
1805 | */ | ||
1806 | - if (nr_pages < want) { | ||
1807 | - if (type == HOLE_OFF) { | ||
1808 | - *offset = lastoff; | ||
1809 | - found = true; | ||
1810 | - } | ||
1811 | + if (nr_pages < want) | ||
1812 | break; | ||
1813 | - } | ||
1814 | |||
1815 | index = pvec.pages[i - 1]->index + 1; | ||
1816 | pagevec_release(&pvec); | ||
1817 | } while (index <= end); | ||
1818 | |||
1819 | + /* No page at lastoff and we are not done - we found a hole. */ | ||
1820 | + if (type == HOLE_OFF && lastoff < endoff) { | ||
1821 | + *offset = lastoff; | ||
1822 | + found = true; | ||
1823 | + } | ||
1824 | out: | ||
1825 | pagevec_release(&pvec); | ||
1826 | return found; | ||
1827 | diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c | ||
1828 | index 74304b6ce84b..86a4911520cc 100644 | ||
1829 | --- a/fs/xfs/xfs_icache.c | ||
1830 | +++ b/fs/xfs/xfs_icache.c | ||
1831 | @@ -66,7 +66,6 @@ xfs_inode_alloc( | ||
1832 | |||
1833 | XFS_STATS_INC(mp, vn_active); | ||
1834 | ASSERT(atomic_read(&ip->i_pincount) == 0); | ||
1835 | - ASSERT(!spin_is_locked(&ip->i_flags_lock)); | ||
1836 | ASSERT(!xfs_isiflocked(ip)); | ||
1837 | ASSERT(ip->i_ino == 0); | ||
1838 | |||
1839 | @@ -192,7 +191,7 @@ xfs_perag_set_reclaim_tag( | ||
1840 | { | ||
1841 | struct xfs_mount *mp = pag->pag_mount; | ||
1842 | |||
1843 | - ASSERT(spin_is_locked(&pag->pag_ici_lock)); | ||
1844 | + lockdep_assert_held(&pag->pag_ici_lock); | ||
1845 | if (pag->pag_ici_reclaimable++) | ||
1846 | return; | ||
1847 | |||
1848 | @@ -214,7 +213,7 @@ xfs_perag_clear_reclaim_tag( | ||
1849 | { | ||
1850 | struct xfs_mount *mp = pag->pag_mount; | ||
1851 | |||
1852 | - ASSERT(spin_is_locked(&pag->pag_ici_lock)); | ||
1853 | + lockdep_assert_held(&pag->pag_ici_lock); | ||
1854 | if (--pag->pag_ici_reclaimable) | ||
1855 | return; | ||
1856 | |||
1857 | @@ -1079,11 +1078,11 @@ xfs_reclaim_inode( | ||
1858 | * Because we use RCU freeing we need to ensure the inode always appears | ||
1859 | * to be reclaimed with an invalid inode number when in the free state. | ||
1860 | * We do this as early as possible under the ILOCK so that | ||
1861 | - * xfs_iflush_cluster() can be guaranteed to detect races with us here. | ||
1862 | - * By doing this, we guarantee that once xfs_iflush_cluster has locked | ||
1863 | - * XFS_ILOCK that it will see either a valid, flushable inode that will | ||
1864 | - * serialise correctly, or it will see a clean (and invalid) inode that | ||
1865 | - * it can skip. | ||
1866 | + * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to | ||
1867 | + * detect races with us here. By doing this, we guarantee that once | ||
1868 | + * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that | ||
1869 | + * it will see either a valid inode that will serialise correctly, or it | ||
1870 | + * will see an invalid inode that it can skip. | ||
1871 | */ | ||
1872 | spin_lock(&ip->i_flags_lock); | ||
1873 | ip->i_flags = XFS_IRECLAIM; | ||
1874 | diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c | ||
1875 | index 7a0b4eeb99e4..9e795ab08a53 100644 | ||
1876 | --- a/fs/xfs/xfs_inode.c | ||
1877 | +++ b/fs/xfs/xfs_inode.c | ||
1878 | @@ -881,7 +881,6 @@ xfs_ialloc( | ||
1879 | case S_IFREG: | ||
1880 | case S_IFDIR: | ||
1881 | if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { | ||
1882 | - uint64_t di_flags2 = 0; | ||
1883 | uint di_flags = 0; | ||
1884 | |||
1885 | if (S_ISDIR(mode)) { | ||
1886 | @@ -918,20 +917,23 @@ xfs_ialloc( | ||
1887 | di_flags |= XFS_DIFLAG_NODEFRAG; | ||
1888 | if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM) | ||
1889 | di_flags |= XFS_DIFLAG_FILESTREAM; | ||
1890 | - if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) | ||
1891 | - di_flags2 |= XFS_DIFLAG2_DAX; | ||
1892 | |||
1893 | ip->i_d.di_flags |= di_flags; | ||
1894 | - ip->i_d.di_flags2 |= di_flags2; | ||
1895 | } | ||
1896 | if (pip && | ||
1897 | (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY) && | ||
1898 | pip->i_d.di_version == 3 && | ||
1899 | ip->i_d.di_version == 3) { | ||
1900 | + uint64_t di_flags2 = 0; | ||
1901 | + | ||
1902 | if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) { | ||
1903 | - ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; | ||
1904 | + di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; | ||
1905 | ip->i_d.di_cowextsize = pip->i_d.di_cowextsize; | ||
1906 | } | ||
1907 | + if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) | ||
1908 | + di_flags2 |= XFS_DIFLAG2_DAX; | ||
1909 | + | ||
1910 | + ip->i_d.di_flags2 |= di_flags2; | ||
1911 | } | ||
1912 | /* FALLTHROUGH */ | ||
1913 | case S_IFLNK: | ||
1914 | @@ -2366,11 +2368,24 @@ xfs_ifree_cluster( | ||
1915 | * already marked stale. If we can't lock it, back off | ||
1916 | * and retry. | ||
1917 | */ | ||
1918 | - if (ip != free_ip && | ||
1919 | - !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { | ||
1920 | - rcu_read_unlock(); | ||
1921 | - delay(1); | ||
1922 | - goto retry; | ||
1923 | + if (ip != free_ip) { | ||
1924 | + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { | ||
1925 | + rcu_read_unlock(); | ||
1926 | + delay(1); | ||
1927 | + goto retry; | ||
1928 | + } | ||
1929 | + | ||
1930 | + /* | ||
1931 | + * Check the inode number again in case we're | ||
1932 | + * racing with freeing in xfs_reclaim_inode(). | ||
1933 | + * See the comments in that function for more | ||
1934 | + * information as to why the initial check is | ||
1935 | + * not sufficient. | ||
1936 | + */ | ||
1937 | + if (ip->i_ino != inum + i) { | ||
1938 | + xfs_iunlock(ip, XFS_ILOCK_EXCL); | ||
1939 | + continue; | ||
1940 | + } | ||
1941 | } | ||
1942 | rcu_read_unlock(); | ||
1943 | |||
1944 | diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c | ||
1945 | index d90e7811ccdd..94915747042c 100644 | ||
1946 | --- a/fs/xfs/xfs_inode_item.c | ||
1947 | +++ b/fs/xfs/xfs_inode_item.c | ||
1948 | @@ -27,6 +27,7 @@ | ||
1949 | #include "xfs_error.h" | ||
1950 | #include "xfs_trace.h" | ||
1951 | #include "xfs_trans_priv.h" | ||
1952 | +#include "xfs_buf_item.h" | ||
1953 | #include "xfs_log.h" | ||
1954 | |||
1955 | |||
1956 | @@ -475,6 +476,23 @@ xfs_inode_item_unpin( | ||
1957 | wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT); | ||
1958 | } | ||
1959 | |||
1960 | +/* | ||
1961 | + * Callback used to mark a buffer with XFS_LI_FAILED when items in the buffer | ||
1962 | + * have been failed during writeback | ||
1963 | + * | ||
1964 | + * This informs the AIL that the inode is already flush locked on the next push, | ||
1965 | + * and acquires a hold on the buffer to ensure that it isn't reclaimed before | ||
1966 | + * dirty data makes it to disk. | ||
1967 | + */ | ||
1968 | +STATIC void | ||
1969 | +xfs_inode_item_error( | ||
1970 | + struct xfs_log_item *lip, | ||
1971 | + struct xfs_buf *bp) | ||
1972 | +{ | ||
1973 | + ASSERT(xfs_isiflocked(INODE_ITEM(lip)->ili_inode)); | ||
1974 | + xfs_set_li_failed(lip, bp); | ||
1975 | +} | ||
1976 | + | ||
1977 | STATIC uint | ||
1978 | xfs_inode_item_push( | ||
1979 | struct xfs_log_item *lip, | ||
1980 | @@ -484,13 +502,28 @@ xfs_inode_item_push( | ||
1981 | { | ||
1982 | struct xfs_inode_log_item *iip = INODE_ITEM(lip); | ||
1983 | struct xfs_inode *ip = iip->ili_inode; | ||
1984 | - struct xfs_buf *bp = NULL; | ||
1985 | + struct xfs_buf *bp = lip->li_buf; | ||
1986 | uint rval = XFS_ITEM_SUCCESS; | ||
1987 | int error; | ||
1988 | |||
1989 | if (xfs_ipincount(ip) > 0) | ||
1990 | return XFS_ITEM_PINNED; | ||
1991 | |||
1992 | + /* | ||
1993 | + * The buffer containing this item failed to be written back | ||
1994 | + * previously. Resubmit the buffer for IO. | ||
1995 | + */ | ||
1996 | + if (lip->li_flags & XFS_LI_FAILED) { | ||
1997 | + if (!xfs_buf_trylock(bp)) | ||
1998 | + return XFS_ITEM_LOCKED; | ||
1999 | + | ||
2000 | + if (!xfs_buf_resubmit_failed_buffers(bp, lip, buffer_list)) | ||
2001 | + rval = XFS_ITEM_FLUSHING; | ||
2002 | + | ||
2003 | + xfs_buf_unlock(bp); | ||
2004 | + return rval; | ||
2005 | + } | ||
2006 | + | ||
2007 | if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) | ||
2008 | return XFS_ITEM_LOCKED; | ||
2009 | |||
2010 | @@ -622,7 +655,8 @@ static const struct xfs_item_ops xfs_inode_item_ops = { | ||
2011 | .iop_unlock = xfs_inode_item_unlock, | ||
2012 | .iop_committed = xfs_inode_item_committed, | ||
2013 | .iop_push = xfs_inode_item_push, | ||
2014 | - .iop_committing = xfs_inode_item_committing | ||
2015 | + .iop_committing = xfs_inode_item_committing, | ||
2016 | + .iop_error = xfs_inode_item_error | ||
2017 | }; | ||
2018 | |||
2019 | |||
2020 | @@ -710,7 +744,8 @@ xfs_iflush_done( | ||
2021 | * the AIL lock. | ||
2022 | */ | ||
2023 | iip = INODE_ITEM(blip); | ||
2024 | - if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) | ||
2025 | + if ((iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) || | ||
2026 | + lip->li_flags & XFS_LI_FAILED) | ||
2027 | need_ail++; | ||
2028 | |||
2029 | blip = next; | ||
2030 | @@ -718,7 +753,8 @@ xfs_iflush_done( | ||
2031 | |||
2032 | /* make sure we capture the state of the initial inode. */ | ||
2033 | iip = INODE_ITEM(lip); | ||
2034 | - if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) | ||
2035 | + if ((iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) || | ||
2036 | + lip->li_flags & XFS_LI_FAILED) | ||
2037 | need_ail++; | ||
2038 | |||
2039 | /* | ||
2040 | @@ -731,22 +767,30 @@ xfs_iflush_done( | ||
2041 | * holding the lock before removing the inode from the AIL. | ||
2042 | */ | ||
2043 | if (need_ail) { | ||
2044 | - struct xfs_log_item *log_items[need_ail]; | ||
2045 | - int i = 0; | ||
2046 | + bool mlip_changed = false; | ||
2047 | + | ||
2048 | + /* this is an opencoded batch version of xfs_trans_ail_delete */ | ||
2049 | spin_lock(&ailp->xa_lock); | ||
2050 | for (blip = lip; blip; blip = blip->li_bio_list) { | ||
2051 | - iip = INODE_ITEM(blip); | ||
2052 | - if (iip->ili_logged && | ||
2053 | - blip->li_lsn == iip->ili_flush_lsn) { | ||
2054 | - log_items[i++] = blip; | ||
2055 | + if (INODE_ITEM(blip)->ili_logged && | ||
2056 | + blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn) | ||
2057 | + mlip_changed |= xfs_ail_delete_one(ailp, blip); | ||
2058 | + else { | ||
2059 | + xfs_clear_li_failed(blip); | ||
2060 | } | ||
2061 | - ASSERT(i <= need_ail); | ||
2062 | } | ||
2063 | - /* xfs_trans_ail_delete_bulk() drops the AIL lock. */ | ||
2064 | - xfs_trans_ail_delete_bulk(ailp, log_items, i, | ||
2065 | - SHUTDOWN_CORRUPT_INCORE); | ||
2066 | - } | ||
2067 | |||
2068 | + if (mlip_changed) { | ||
2069 | + if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount)) | ||
2070 | + xlog_assign_tail_lsn_locked(ailp->xa_mount); | ||
2071 | + if (list_empty(&ailp->xa_ail)) | ||
2072 | + wake_up_all(&ailp->xa_empty); | ||
2073 | + } | ||
2074 | + spin_unlock(&ailp->xa_lock); | ||
2075 | + | ||
2076 | + if (mlip_changed) | ||
2077 | + xfs_log_space_wake(ailp->xa_mount); | ||
2078 | + } | ||
2079 | |||
2080 | /* | ||
2081 | * clean up and unlock the flush lock now we are done. We can clear the | ||
2082 | diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c | ||
2083 | index 73cfc7179124..bce2e260f55e 100644 | ||
2084 | --- a/fs/xfs/xfs_ioctl.c | ||
2085 | +++ b/fs/xfs/xfs_ioctl.c | ||
2086 | @@ -928,16 +928,15 @@ xfs_ioc_fsgetxattr( | ||
2087 | return 0; | ||
2088 | } | ||
2089 | |||
2090 | -STATIC void | ||
2091 | -xfs_set_diflags( | ||
2092 | +STATIC uint16_t | ||
2093 | +xfs_flags2diflags( | ||
2094 | struct xfs_inode *ip, | ||
2095 | unsigned int xflags) | ||
2096 | { | ||
2097 | - unsigned int di_flags; | ||
2098 | - uint64_t di_flags2; | ||
2099 | - | ||
2100 | /* can't set PREALLOC this way, just preserve it */ | ||
2101 | - di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC); | ||
2102 | + uint16_t di_flags = | ||
2103 | + (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC); | ||
2104 | + | ||
2105 | if (xflags & FS_XFLAG_IMMUTABLE) | ||
2106 | di_flags |= XFS_DIFLAG_IMMUTABLE; | ||
2107 | if (xflags & FS_XFLAG_APPEND) | ||
2108 | @@ -967,19 +966,24 @@ xfs_set_diflags( | ||
2109 | if (xflags & FS_XFLAG_EXTSIZE) | ||
2110 | di_flags |= XFS_DIFLAG_EXTSIZE; | ||
2111 | } | ||
2112 | - ip->i_d.di_flags = di_flags; | ||
2113 | |||
2114 | - /* diflags2 only valid for v3 inodes. */ | ||
2115 | - if (ip->i_d.di_version < 3) | ||
2116 | - return; | ||
2117 | + return di_flags; | ||
2118 | +} | ||
2119 | + | ||
2120 | +STATIC uint64_t | ||
2121 | +xfs_flags2diflags2( | ||
2122 | + struct xfs_inode *ip, | ||
2123 | + unsigned int xflags) | ||
2124 | +{ | ||
2125 | + uint64_t di_flags2 = | ||
2126 | + (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK); | ||
2127 | |||
2128 | - di_flags2 = (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK); | ||
2129 | if (xflags & FS_XFLAG_DAX) | ||
2130 | di_flags2 |= XFS_DIFLAG2_DAX; | ||
2131 | if (xflags & FS_XFLAG_COWEXTSIZE) | ||
2132 | di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; | ||
2133 | |||
2134 | - ip->i_d.di_flags2 = di_flags2; | ||
2135 | + return di_flags2; | ||
2136 | } | ||
2137 | |||
2138 | STATIC void | ||
2139 | @@ -1005,11 +1009,12 @@ xfs_diflags_to_linux( | ||
2140 | inode->i_flags |= S_NOATIME; | ||
2141 | else | ||
2142 | inode->i_flags &= ~S_NOATIME; | ||
2143 | +#if 0 /* disabled until the flag switching races are sorted out */ | ||
2144 | if (xflags & FS_XFLAG_DAX) | ||
2145 | inode->i_flags |= S_DAX; | ||
2146 | else | ||
2147 | inode->i_flags &= ~S_DAX; | ||
2148 | - | ||
2149 | +#endif | ||
2150 | } | ||
2151 | |||
2152 | static int | ||
2153 | @@ -1019,6 +1024,7 @@ xfs_ioctl_setattr_xflags( | ||
2154 | struct fsxattr *fa) | ||
2155 | { | ||
2156 | struct xfs_mount *mp = ip->i_mount; | ||
2157 | + uint64_t di_flags2; | ||
2158 | |||
2159 | /* Can't change realtime flag if any extents are allocated. */ | ||
2160 | if ((ip->i_d.di_nextents || ip->i_delayed_blks) && | ||
2161 | @@ -1049,7 +1055,14 @@ xfs_ioctl_setattr_xflags( | ||
2162 | !capable(CAP_LINUX_IMMUTABLE)) | ||
2163 | return -EPERM; | ||
2164 | |||
2165 | - xfs_set_diflags(ip, fa->fsx_xflags); | ||
2166 | + /* diflags2 only valid for v3 inodes. */ | ||
2167 | + di_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags); | ||
2168 | + if (di_flags2 && ip->i_d.di_version < 3) | ||
2169 | + return -EINVAL; | ||
2170 | + | ||
2171 | + ip->i_d.di_flags = xfs_flags2diflags(ip, fa->fsx_xflags); | ||
2172 | + ip->i_d.di_flags2 = di_flags2; | ||
2173 | + | ||
2174 | xfs_diflags_to_linux(ip); | ||
2175 | xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); | ||
2176 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); | ||
2177 | diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c | ||
2178 | index a1247c3c1efb..5b81f7f41b80 100644 | ||
2179 | --- a/fs/xfs/xfs_iops.c | ||
2180 | +++ b/fs/xfs/xfs_iops.c | ||
2181 | @@ -802,7 +802,7 @@ xfs_vn_setattr_nonsize( | ||
2182 | * Caution: The caller of this function is responsible for calling | ||
2183 | * setattr_prepare() or otherwise verifying the change is fine. | ||
2184 | */ | ||
2185 | -int | ||
2186 | +STATIC int | ||
2187 | xfs_setattr_size( | ||
2188 | struct xfs_inode *ip, | ||
2189 | struct iattr *iattr) | ||
2190 | diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c | ||
2191 | index b57ab34fbf3c..33c9a3aae948 100644 | ||
2192 | --- a/fs/xfs/xfs_log.c | ||
2193 | +++ b/fs/xfs/xfs_log.c | ||
2194 | @@ -743,15 +743,45 @@ xfs_log_mount_finish( | ||
2195 | struct xfs_mount *mp) | ||
2196 | { | ||
2197 | int error = 0; | ||
2198 | + bool readonly = (mp->m_flags & XFS_MOUNT_RDONLY); | ||
2199 | |||
2200 | if (mp->m_flags & XFS_MOUNT_NORECOVERY) { | ||
2201 | ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); | ||
2202 | return 0; | ||
2203 | + } else if (readonly) { | ||
2204 | + /* Allow unlinked processing to proceed */ | ||
2205 | + mp->m_flags &= ~XFS_MOUNT_RDONLY; | ||
2206 | } | ||
2207 | |||
2208 | + /* | ||
2209 | + * During the second phase of log recovery, we need iget and | ||
2210 | + * iput to behave like they do for an active filesystem. | ||
2211 | + * xfs_fs_drop_inode needs to be able to prevent the deletion | ||
2212 | + * of inodes before we're done replaying log items on those | ||
2213 | + * inodes. Turn it off immediately after recovery finishes | ||
2214 | + * so that we don't leak the quota inodes if subsequent mount | ||
2215 | + * activities fail. | ||
2216 | + * | ||
2217 | + * We let all inodes involved in redo item processing end up on | ||
2218 | + * the LRU instead of being evicted immediately so that if we do | ||
2219 | + * something to an unlinked inode, the irele won't cause | ||
2220 | + * premature truncation and freeing of the inode, which results | ||
2221 | + * in log recovery failure. We have to evict the unreferenced | ||
2222 | + * lru inodes after clearing MS_ACTIVE because we don't | ||
2223 | + * otherwise clean up the lru if there's a subsequent failure in | ||
2224 | + * xfs_mountfs, which leads to us leaking the inodes if nothing | ||
2225 | + * else (e.g. quotacheck) references the inodes before the | ||
2226 | + * mount failure occurs. | ||
2227 | + */ | ||
2228 | + mp->m_super->s_flags |= MS_ACTIVE; | ||
2229 | error = xlog_recover_finish(mp->m_log); | ||
2230 | if (!error) | ||
2231 | xfs_log_work_queue(mp); | ||
2232 | + mp->m_super->s_flags &= ~MS_ACTIVE; | ||
2233 | + evict_inodes(mp->m_super); | ||
2234 | + | ||
2235 | + if (readonly) | ||
2236 | + mp->m_flags |= XFS_MOUNT_RDONLY; | ||
2237 | |||
2238 | return error; | ||
2239 | } | ||
2240 | @@ -801,11 +831,14 @@ xfs_log_unmount_write(xfs_mount_t *mp) | ||
2241 | int error; | ||
2242 | |||
2243 | /* | ||
2244 | - * Don't write out unmount record on read-only mounts. | ||
2245 | + * Don't write out unmount record on norecovery mounts or ro devices. | ||
2246 | * Or, if we are doing a forced umount (typically because of IO errors). | ||
2247 | */ | ||
2248 | - if (mp->m_flags & XFS_MOUNT_RDONLY) | ||
2249 | + if (mp->m_flags & XFS_MOUNT_NORECOVERY || | ||
2250 | + xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) { | ||
2251 | + ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); | ||
2252 | return 0; | ||
2253 | + } | ||
2254 | |||
2255 | error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL); | ||
2256 | ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log))); | ||
2257 | @@ -3304,8 +3337,6 @@ _xfs_log_force( | ||
2258 | */ | ||
2259 | if (iclog->ic_state & XLOG_STATE_IOERROR) | ||
2260 | return -EIO; | ||
2261 | - if (log_flushed) | ||
2262 | - *log_flushed = 1; | ||
2263 | } else { | ||
2264 | |||
2265 | no_sleep: | ||
2266 | @@ -3409,8 +3440,6 @@ _xfs_log_force_lsn( | ||
2267 | |||
2268 | xlog_wait(&iclog->ic_prev->ic_write_wait, | ||
2269 | &log->l_icloglock); | ||
2270 | - if (log_flushed) | ||
2271 | - *log_flushed = 1; | ||
2272 | already_slept = 1; | ||
2273 | goto try_again; | ||
2274 | } | ||
2275 | @@ -3444,9 +3473,6 @@ _xfs_log_force_lsn( | ||
2276 | */ | ||
2277 | if (iclog->ic_state & XLOG_STATE_IOERROR) | ||
2278 | return -EIO; | ||
2279 | - | ||
2280 | - if (log_flushed) | ||
2281 | - *log_flushed = 1; | ||
2282 | } else { /* just return */ | ||
2283 | spin_unlock(&log->l_icloglock); | ||
2284 | } | ||
2285 | diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c | ||
2286 | index 9b3d7c76915d..05909269f973 100644 | ||
2287 | --- a/fs/xfs/xfs_log_recover.c | ||
2288 | +++ b/fs/xfs/xfs_log_recover.c | ||
2289 | @@ -1029,61 +1029,106 @@ xlog_seek_logrec_hdr( | ||
2290 | } | ||
2291 | |||
2292 | /* | ||
2293 | - * Check the log tail for torn writes. This is required when torn writes are | ||
2294 | - * detected at the head and the head had to be walked back to a previous record. | ||
2295 | - * The tail of the previous record must now be verified to ensure the torn | ||
2296 | - * writes didn't corrupt the previous tail. | ||
2297 | + * Calculate distance from head to tail (i.e., unused space in the log). | ||
2298 | + */ | ||
2299 | +static inline int | ||
2300 | +xlog_tail_distance( | ||
2301 | + struct xlog *log, | ||
2302 | + xfs_daddr_t head_blk, | ||
2303 | + xfs_daddr_t tail_blk) | ||
2304 | +{ | ||
2305 | + if (head_blk < tail_blk) | ||
2306 | + return tail_blk - head_blk; | ||
2307 | + | ||
2308 | + return tail_blk + (log->l_logBBsize - head_blk); | ||
2309 | +} | ||
2310 | + | ||
2311 | +/* | ||
2312 | + * Verify the log tail. This is particularly important when torn or incomplete | ||
2313 | + * writes have been detected near the front of the log and the head has been | ||
2314 | + * walked back accordingly. | ||
2315 | * | ||
2316 | - * Return an error if CRC verification fails as recovery cannot proceed. | ||
2317 | + * We also have to handle the case where the tail was pinned and the head | ||
2318 | + * blocked behind the tail right before a crash. If the tail had been pushed | ||
2319 | + * immediately prior to the crash and the subsequent checkpoint was only | ||
2320 | + * partially written, it's possible it overwrote the last referenced tail in the | ||
2321 | + * log with garbage. This is not a coherency problem because the tail must have | ||
2322 | + * been pushed before it can be overwritten, but appears as log corruption to | ||
2323 | + * recovery because we have no way to know the tail was updated if the | ||
2324 | + * subsequent checkpoint didn't write successfully. | ||
2325 | + * | ||
2326 | + * Therefore, CRC check the log from tail to head. If a failure occurs and the | ||
2327 | + * offending record is within max iclog bufs from the head, walk the tail | ||
2328 | + * forward and retry until a valid tail is found or corruption is detected out | ||
2329 | + * of the range of a possible overwrite. | ||
2330 | */ | ||
2331 | STATIC int | ||
2332 | xlog_verify_tail( | ||
2333 | struct xlog *log, | ||
2334 | xfs_daddr_t head_blk, | ||
2335 | - xfs_daddr_t tail_blk) | ||
2336 | + xfs_daddr_t *tail_blk, | ||
2337 | + int hsize) | ||
2338 | { | ||
2339 | struct xlog_rec_header *thead; | ||
2340 | struct xfs_buf *bp; | ||
2341 | xfs_daddr_t first_bad; | ||
2342 | - int count; | ||
2343 | int error = 0; | ||
2344 | bool wrapped; | ||
2345 | - xfs_daddr_t tmp_head; | ||
2346 | + xfs_daddr_t tmp_tail; | ||
2347 | + xfs_daddr_t orig_tail = *tail_blk; | ||
2348 | |||
2349 | bp = xlog_get_bp(log, 1); | ||
2350 | if (!bp) | ||
2351 | return -ENOMEM; | ||
2352 | |||
2353 | /* | ||
2354 | - * Seek XLOG_MAX_ICLOGS + 1 records past the current tail record to get | ||
2355 | - * a temporary head block that points after the last possible | ||
2356 | - * concurrently written record of the tail. | ||
2357 | + * Make sure the tail points to a record (returns positive count on | ||
2358 | + * success). | ||
2359 | */ | ||
2360 | - count = xlog_seek_logrec_hdr(log, head_blk, tail_blk, | ||
2361 | - XLOG_MAX_ICLOGS + 1, bp, &tmp_head, &thead, | ||
2362 | - &wrapped); | ||
2363 | - if (count < 0) { | ||
2364 | - error = count; | ||
2365 | + error = xlog_seek_logrec_hdr(log, head_blk, *tail_blk, 1, bp, | ||
2366 | + &tmp_tail, &thead, &wrapped); | ||
2367 | + if (error < 0) | ||
2368 | goto out; | ||
2369 | - } | ||
2370 | - | ||
2371 | - /* | ||
2372 | - * If the call above didn't find XLOG_MAX_ICLOGS + 1 records, we ran | ||
2373 | - * into the actual log head. tmp_head points to the start of the record | ||
2374 | - * so update it to the actual head block. | ||
2375 | - */ | ||
2376 | - if (count < XLOG_MAX_ICLOGS + 1) | ||
2377 | - tmp_head = head_blk; | ||
2378 | + if (*tail_blk != tmp_tail) | ||
2379 | + *tail_blk = tmp_tail; | ||
2380 | |||
2381 | /* | ||
2382 | - * We now have a tail and temporary head block that covers at least | ||
2383 | - * XLOG_MAX_ICLOGS records from the tail. We need to verify that these | ||
2384 | - * records were completely written. Run a CRC verification pass from | ||
2385 | - * tail to head and return the result. | ||
2386 | + * Run a CRC check from the tail to the head. We can't just check | ||
2387 | + * MAX_ICLOGS records past the tail because the tail may point to stale | ||
2388 | + * blocks cleared during the search for the head/tail. These blocks are | ||
2389 | + * overwritten with zero-length records and thus record count is not a | ||
2390 | + * reliable indicator of the iclog state before a crash. | ||
2391 | */ | ||
2392 | - error = xlog_do_recovery_pass(log, tmp_head, tail_blk, | ||
2393 | + first_bad = 0; | ||
2394 | + error = xlog_do_recovery_pass(log, head_blk, *tail_blk, | ||
2395 | XLOG_RECOVER_CRCPASS, &first_bad); | ||
2396 | + while ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) { | ||
2397 | + int tail_distance; | ||
2398 | + | ||
2399 | + /* | ||
2400 | + * Is corruption within range of the head? If so, retry from | ||
2401 | + * the next record. Otherwise return an error. | ||
2402 | + */ | ||
2403 | + tail_distance = xlog_tail_distance(log, head_blk, first_bad); | ||
2404 | + if (tail_distance > BTOBB(XLOG_MAX_ICLOGS * hsize)) | ||
2405 | + break; | ||
2406 | + | ||
2407 | + /* skip to the next record; returns positive count on success */ | ||
2408 | + error = xlog_seek_logrec_hdr(log, head_blk, first_bad, 2, bp, | ||
2409 | + &tmp_tail, &thead, &wrapped); | ||
2410 | + if (error < 0) | ||
2411 | + goto out; | ||
2412 | + | ||
2413 | + *tail_blk = tmp_tail; | ||
2414 | + first_bad = 0; | ||
2415 | + error = xlog_do_recovery_pass(log, head_blk, *tail_blk, | ||
2416 | + XLOG_RECOVER_CRCPASS, &first_bad); | ||
2417 | + } | ||
2418 | |||
2419 | + if (!error && *tail_blk != orig_tail) | ||
2420 | + xfs_warn(log->l_mp, | ||
2421 | + "Tail block (0x%llx) overwrite detected. Updated to 0x%llx", | ||
2422 | + orig_tail, *tail_blk); | ||
2423 | out: | ||
2424 | xlog_put_bp(bp); | ||
2425 | return error; | ||
2426 | @@ -1143,7 +1188,7 @@ xlog_verify_head( | ||
2427 | */ | ||
2428 | error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk, | ||
2429 | XLOG_RECOVER_CRCPASS, &first_bad); | ||
2430 | - if (error == -EFSBADCRC) { | ||
2431 | + if ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) { | ||
2432 | /* | ||
2433 | * We've hit a potential torn write. Reset the error and warn | ||
2434 | * about it. | ||
2435 | @@ -1183,31 +1228,12 @@ xlog_verify_head( | ||
2436 | ASSERT(0); | ||
2437 | return 0; | ||
2438 | } | ||
2439 | - | ||
2440 | - /* | ||
2441 | - * Now verify the tail based on the updated head. This is | ||
2442 | - * required because the torn writes trimmed from the head could | ||
2443 | - * have been written over the tail of a previous record. Return | ||
2444 | - * any errors since recovery cannot proceed if the tail is | ||
2445 | - * corrupt. | ||
2446 | - * | ||
2447 | - * XXX: This leaves a gap in truly robust protection from torn | ||
2448 | - * writes in the log. If the head is behind the tail, the tail | ||
2449 | - * pushes forward to create some space and then a crash occurs | ||
2450 | - * causing the writes into the previous record's tail region to | ||
2451 | - * tear, log recovery isn't able to recover. | ||
2452 | - * | ||
2453 | - * How likely is this to occur? If possible, can we do something | ||
2454 | - * more intelligent here? Is it safe to push the tail forward if | ||
2455 | - * we can determine that the tail is within the range of the | ||
2456 | - * torn write (e.g., the kernel can only overwrite the tail if | ||
2457 | - * it has actually been pushed forward)? Alternatively, could we | ||
2458 | - * somehow prevent this condition at runtime? | ||
2459 | - */ | ||
2460 | - error = xlog_verify_tail(log, *head_blk, *tail_blk); | ||
2461 | } | ||
2462 | + if (error) | ||
2463 | + return error; | ||
2464 | |||
2465 | - return error; | ||
2466 | + return xlog_verify_tail(log, *head_blk, tail_blk, | ||
2467 | + be32_to_cpu((*rhead)->h_size)); | ||
2468 | } | ||
2469 | |||
2470 | /* | ||
2471 | @@ -4152,7 +4178,7 @@ xlog_recover_commit_trans( | ||
2472 | |||
2473 | #define XLOG_RECOVER_COMMIT_QUEUE_MAX 100 | ||
2474 | |||
2475 | - hlist_del(&trans->r_list); | ||
2476 | + hlist_del_init(&trans->r_list); | ||
2477 | |||
2478 | error = xlog_recover_reorder_trans(log, trans, pass); | ||
2479 | if (error) | ||
2480 | @@ -4354,6 +4380,8 @@ xlog_recover_free_trans( | ||
2481 | xlog_recover_item_t *item, *n; | ||
2482 | int i; | ||
2483 | |||
2484 | + hlist_del_init(&trans->r_list); | ||
2485 | + | ||
2486 | list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) { | ||
2487 | /* Free the regions in the item. */ | ||
2488 | list_del(&item->ri_list); | ||
2489 | @@ -4799,12 +4827,16 @@ xlog_recover_process_intents( | ||
2490 | int error = 0; | ||
2491 | struct xfs_ail_cursor cur; | ||
2492 | struct xfs_ail *ailp; | ||
2493 | +#if defined(DEBUG) || defined(XFS_WARN) | ||
2494 | xfs_lsn_t last_lsn; | ||
2495 | +#endif | ||
2496 | |||
2497 | ailp = log->l_ailp; | ||
2498 | spin_lock(&ailp->xa_lock); | ||
2499 | lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); | ||
2500 | +#if defined(DEBUG) || defined(XFS_WARN) | ||
2501 | last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block); | ||
2502 | +#endif | ||
2503 | while (lip != NULL) { | ||
2504 | /* | ||
2505 | * We're done when we see something other than an intent. | ||
2506 | @@ -5214,7 +5246,7 @@ xlog_do_recovery_pass( | ||
2507 | xfs_daddr_t *first_bad) /* out: first bad log rec */ | ||
2508 | { | ||
2509 | xlog_rec_header_t *rhead; | ||
2510 | - xfs_daddr_t blk_no; | ||
2511 | + xfs_daddr_t blk_no, rblk_no; | ||
2512 | xfs_daddr_t rhead_blk; | ||
2513 | char *offset; | ||
2514 | xfs_buf_t *hbp, *dbp; | ||
2515 | @@ -5222,11 +5254,15 @@ xlog_do_recovery_pass( | ||
2516 | int error2 = 0; | ||
2517 | int bblks, split_bblks; | ||
2518 | int hblks, split_hblks, wrapped_hblks; | ||
2519 | + int i; | ||
2520 | struct hlist_head rhash[XLOG_RHASH_SIZE]; | ||
2521 | LIST_HEAD (buffer_list); | ||
2522 | |||
2523 | ASSERT(head_blk != tail_blk); | ||
2524 | - rhead_blk = 0; | ||
2525 | + blk_no = rhead_blk = tail_blk; | ||
2526 | + | ||
2527 | + for (i = 0; i < XLOG_RHASH_SIZE; i++) | ||
2528 | + INIT_HLIST_HEAD(&rhash[i]); | ||
2529 | |||
2530 | /* | ||
2531 | * Read the header of the tail block and get the iclog buffer size from | ||
2532 | @@ -5301,7 +5337,6 @@ xlog_do_recovery_pass( | ||
2533 | } | ||
2534 | |||
2535 | memset(rhash, 0, sizeof(rhash)); | ||
2536 | - blk_no = rhead_blk = tail_blk; | ||
2537 | if (tail_blk > head_blk) { | ||
2538 | /* | ||
2539 | * Perform recovery around the end of the physical log. | ||
2540 | @@ -5363,9 +5398,19 @@ xlog_do_recovery_pass( | ||
2541 | bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); | ||
2542 | blk_no += hblks; | ||
2543 | |||
2544 | - /* Read in data for log record */ | ||
2545 | - if (blk_no + bblks <= log->l_logBBsize) { | ||
2546 | - error = xlog_bread(log, blk_no, bblks, dbp, | ||
2547 | + /* | ||
2548 | + * Read the log record data in multiple reads if it | ||
2549 | + * wraps around the end of the log. Note that if the | ||
2550 | + * header already wrapped, blk_no could point past the | ||
2551 | + * end of the log. The record data is contiguous in | ||
2552 | + * that case. | ||
2553 | + */ | ||
2554 | + if (blk_no + bblks <= log->l_logBBsize || | ||
2555 | + blk_no >= log->l_logBBsize) { | ||
2556 | + /* mod blk_no in case the header wrapped and | ||
2557 | + * pushed it beyond the end of the log */ | ||
2558 | + rblk_no = do_mod(blk_no, log->l_logBBsize); | ||
2559 | + error = xlog_bread(log, rblk_no, bblks, dbp, | ||
2560 | &offset); | ||
2561 | if (error) | ||
2562 | goto bread_err2; | ||
2563 | @@ -5464,6 +5509,19 @@ xlog_do_recovery_pass( | ||
2564 | if (error && first_bad) | ||
2565 | *first_bad = rhead_blk; | ||
2566 | |||
2567 | + /* | ||
2568 | + * Transactions are freed at commit time but transactions without commit | ||
2569 | + * records on disk are never committed. Free any that may be left in the | ||
2570 | + * hash table. | ||
2571 | + */ | ||
2572 | + for (i = 0; i < XLOG_RHASH_SIZE; i++) { | ||
2573 | + struct hlist_node *tmp; | ||
2574 | + struct xlog_recover *trans; | ||
2575 | + | ||
2576 | + hlist_for_each_entry_safe(trans, tmp, &rhash[i], r_list) | ||
2577 | + xlog_recover_free_trans(trans); | ||
2578 | + } | ||
2579 | + | ||
2580 | return error ? error : error2; | ||
2581 | } | ||
2582 | |||
2583 | @@ -5542,6 +5600,8 @@ xlog_do_recover( | ||
2584 | xfs_buf_t *bp; | ||
2585 | xfs_sb_t *sbp; | ||
2586 | |||
2587 | + trace_xfs_log_recover(log, head_blk, tail_blk); | ||
2588 | + | ||
2589 | /* | ||
2590 | * First replay the images in the log. | ||
2591 | */ | ||
2592 | diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c | ||
2593 | index 13796f212f98..d4ce8d277992 100644 | ||
2594 | --- a/fs/xfs/xfs_mount.c | ||
2595 | +++ b/fs/xfs/xfs_mount.c | ||
2596 | @@ -924,15 +924,6 @@ xfs_mountfs( | ||
2597 | } | ||
2598 | } | ||
2599 | |||
2600 | - /* | ||
2601 | - * During the second phase of log recovery, we need iget and | ||
2602 | - * iput to behave like they do for an active filesystem. | ||
2603 | - * xfs_fs_drop_inode needs to be able to prevent the deletion | ||
2604 | - * of inodes before we're done replaying log items on those | ||
2605 | - * inodes. | ||
2606 | - */ | ||
2607 | - mp->m_super->s_flags |= MS_ACTIVE; | ||
2608 | - | ||
2609 | /* | ||
2610 | * Finish recovering the file system. This part needed to be delayed | ||
2611 | * until after the root and real-time bitmap inodes were consistently | ||
2612 | @@ -1008,12 +999,13 @@ xfs_mountfs( | ||
2613 | out_quota: | ||
2614 | xfs_qm_unmount_quotas(mp); | ||
2615 | out_rtunmount: | ||
2616 | - mp->m_super->s_flags &= ~MS_ACTIVE; | ||
2617 | xfs_rtunmount_inodes(mp); | ||
2618 | out_rele_rip: | ||
2619 | IRELE(rip); | ||
2620 | cancel_delayed_work_sync(&mp->m_reclaim_work); | ||
2621 | xfs_reclaim_inodes(mp, SYNC_WAIT); | ||
2622 | + /* Clean out dquots that might be in memory after quotacheck. */ | ||
2623 | + xfs_qm_unmount(mp); | ||
2624 | out_log_dealloc: | ||
2625 | mp->m_flags |= XFS_MOUNT_UNMOUNTING; | ||
2626 | xfs_log_mount_cancel(mp); | ||
2627 | diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c | ||
2628 | index 8b9a9f15f022..1fdd3face2d9 100644 | ||
2629 | --- a/fs/xfs/xfs_qm.c | ||
2630 | +++ b/fs/xfs/xfs_qm.c | ||
2631 | @@ -111,6 +111,9 @@ xfs_qm_dquot_walk( | ||
2632 | skipped = 0; | ||
2633 | break; | ||
2634 | } | ||
2635 | + /* we're done if id overflows back to zero */ | ||
2636 | + if (!next_index) | ||
2637 | + break; | ||
2638 | } | ||
2639 | |||
2640 | if (skipped) { | ||
2641 | @@ -1247,6 +1250,7 @@ xfs_qm_flush_one( | ||
2642 | struct xfs_dquot *dqp, | ||
2643 | void *data) | ||
2644 | { | ||
2645 | + struct xfs_mount *mp = dqp->q_mount; | ||
2646 | struct list_head *buffer_list = data; | ||
2647 | struct xfs_buf *bp = NULL; | ||
2648 | int error = 0; | ||
2649 | @@ -1257,7 +1261,32 @@ xfs_qm_flush_one( | ||
2650 | if (!XFS_DQ_IS_DIRTY(dqp)) | ||
2651 | goto out_unlock; | ||
2652 | |||
2653 | - xfs_dqflock(dqp); | ||
2654 | + /* | ||
2655 | + * The only way the dquot is already flush locked by the time quotacheck | ||
2656 | + * gets here is if reclaim flushed it before the dqadjust walk dirtied | ||
2657 | + * it for the final time. Quotacheck collects all dquot bufs in the | ||
2658 | + * local delwri queue before dquots are dirtied, so reclaim can't have | ||
2659 | + * possibly queued it for I/O. The only way out is to push the buffer to | ||
2660 | + * cycle the flush lock. | ||
2661 | + */ | ||
2662 | + if (!xfs_dqflock_nowait(dqp)) { | ||
2663 | + /* buf is pinned in-core by delwri list */ | ||
2664 | + DEFINE_SINGLE_BUF_MAP(map, dqp->q_blkno, | ||
2665 | + mp->m_quotainfo->qi_dqchunklen); | ||
2666 | + bp = _xfs_buf_find(mp->m_ddev_targp, &map, 1, 0, NULL); | ||
2667 | + if (!bp) { | ||
2668 | + error = -EINVAL; | ||
2669 | + goto out_unlock; | ||
2670 | + } | ||
2671 | + xfs_buf_unlock(bp); | ||
2672 | + | ||
2673 | + xfs_buf_delwri_pushbuf(bp, buffer_list); | ||
2674 | + xfs_buf_rele(bp); | ||
2675 | + | ||
2676 | + error = -EAGAIN; | ||
2677 | + goto out_unlock; | ||
2678 | + } | ||
2679 | + | ||
2680 | error = xfs_qm_dqflush(dqp, &bp); | ||
2681 | if (error) | ||
2682 | goto out_unlock; | ||
2683 | diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c | ||
2684 | index 29a75ecb2425..0015c19c7455 100644 | ||
2685 | --- a/fs/xfs/xfs_reflink.c | ||
2686 | +++ b/fs/xfs/xfs_reflink.c | ||
2687 | @@ -169,6 +169,8 @@ xfs_reflink_find_shared( | ||
2688 | error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); | ||
2689 | if (error) | ||
2690 | return error; | ||
2691 | + if (!agbp) | ||
2692 | + return -ENOMEM; | ||
2693 | |||
2694 | cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL); | ||
2695 | |||
2696 | @@ -333,7 +335,7 @@ xfs_reflink_convert_cow_extent( | ||
2697 | struct xfs_defer_ops *dfops) | ||
2698 | { | ||
2699 | struct xfs_bmbt_irec irec = *imap; | ||
2700 | - xfs_fsblock_t first_block; | ||
2701 | + xfs_fsblock_t first_block = NULLFSBLOCK; | ||
2702 | int nimaps = 1; | ||
2703 | |||
2704 | if (imap->br_state == XFS_EXT_NORM) | ||
2705 | diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c | ||
2706 | index 882fb8524fcb..67d589e0a49f 100644 | ||
2707 | --- a/fs/xfs/xfs_super.c | ||
2708 | +++ b/fs/xfs/xfs_super.c | ||
2709 | @@ -1214,7 +1214,7 @@ xfs_test_remount_options( | ||
2710 | tmp_mp->m_super = sb; | ||
2711 | error = xfs_parseargs(tmp_mp, options); | ||
2712 | xfs_free_fsname(tmp_mp); | ||
2713 | - kfree(tmp_mp); | ||
2714 | + kmem_free(tmp_mp); | ||
2715 | |||
2716 | return error; | ||
2717 | } | ||
2718 | diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h | ||
2719 | index 828f383df121..bdf69e1c7410 100644 | ||
2720 | --- a/fs/xfs/xfs_trace.h | ||
2721 | +++ b/fs/xfs/xfs_trace.h | ||
2722 | @@ -366,6 +366,7 @@ DEFINE_BUF_EVENT(xfs_buf_iowait_done); | ||
2723 | DEFINE_BUF_EVENT(xfs_buf_delwri_queue); | ||
2724 | DEFINE_BUF_EVENT(xfs_buf_delwri_queued); | ||
2725 | DEFINE_BUF_EVENT(xfs_buf_delwri_split); | ||
2726 | +DEFINE_BUF_EVENT(xfs_buf_delwri_pushbuf); | ||
2727 | DEFINE_BUF_EVENT(xfs_buf_get_uncached); | ||
2728 | DEFINE_BUF_EVENT(xfs_bdstrat_shut); | ||
2729 | DEFINE_BUF_EVENT(xfs_buf_item_relse); | ||
2730 | @@ -519,7 +520,6 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size); | ||
2731 | DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_ordered); | ||
2732 | DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale); | ||
2733 | DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format); | ||
2734 | -DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_ordered); | ||
2735 | DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale); | ||
2736 | DEFINE_BUF_ITEM_EVENT(xfs_buf_item_ordered); | ||
2737 | DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin); | ||
2738 | @@ -1990,6 +1990,24 @@ DEFINE_EVENT(xfs_swap_extent_class, name, \ | ||
2739 | DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before); | ||
2740 | DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after); | ||
2741 | |||
2742 | +TRACE_EVENT(xfs_log_recover, | ||
2743 | + TP_PROTO(struct xlog *log, xfs_daddr_t headblk, xfs_daddr_t tailblk), | ||
2744 | + TP_ARGS(log, headblk, tailblk), | ||
2745 | + TP_STRUCT__entry( | ||
2746 | + __field(dev_t, dev) | ||
2747 | + __field(xfs_daddr_t, headblk) | ||
2748 | + __field(xfs_daddr_t, tailblk) | ||
2749 | + ), | ||
2750 | + TP_fast_assign( | ||
2751 | + __entry->dev = log->l_mp->m_super->s_dev; | ||
2752 | + __entry->headblk = headblk; | ||
2753 | + __entry->tailblk = tailblk; | ||
2754 | + ), | ||
2755 | + TP_printk("dev %d:%d headblk 0x%llx tailblk 0x%llx", | ||
2756 | + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->headblk, | ||
2757 | + __entry->tailblk) | ||
2758 | +) | ||
2759 | + | ||
2760 | TRACE_EVENT(xfs_log_recover_record, | ||
2761 | TP_PROTO(struct xlog *log, struct xlog_rec_header *rhead, int pass), | ||
2762 | TP_ARGS(log, rhead, pass), | ||
2763 | diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h | ||
2764 | index 98024cb933ef..5669cf00bae0 100644 | ||
2765 | --- a/fs/xfs/xfs_trans.h | ||
2766 | +++ b/fs/xfs/xfs_trans.h | ||
2767 | @@ -50,6 +50,7 @@ typedef struct xfs_log_item { | ||
2768 | struct xfs_ail *li_ailp; /* ptr to AIL */ | ||
2769 | uint li_type; /* item type */ | ||
2770 | uint li_flags; /* misc flags */ | ||
2771 | + struct xfs_buf *li_buf; /* real buffer pointer */ | ||
2772 | struct xfs_log_item *li_bio_list; /* buffer item list */ | ||
2773 | void (*li_cb)(struct xfs_buf *, | ||
2774 | struct xfs_log_item *); | ||
2775 | @@ -65,11 +66,13 @@ typedef struct xfs_log_item { | ||
2776 | } xfs_log_item_t; | ||
2777 | |||
2778 | #define XFS_LI_IN_AIL 0x1 | ||
2779 | -#define XFS_LI_ABORTED 0x2 | ||
2780 | +#define XFS_LI_ABORTED 0x2 | ||
2781 | +#define XFS_LI_FAILED 0x4 | ||
2782 | |||
2783 | #define XFS_LI_FLAGS \ | ||
2784 | { XFS_LI_IN_AIL, "IN_AIL" }, \ | ||
2785 | - { XFS_LI_ABORTED, "ABORTED" } | ||
2786 | + { XFS_LI_ABORTED, "ABORTED" }, \ | ||
2787 | + { XFS_LI_FAILED, "FAILED" } | ||
2788 | |||
2789 | struct xfs_item_ops { | ||
2790 | void (*iop_size)(xfs_log_item_t *, int *, int *); | ||
2791 | @@ -80,6 +83,7 @@ struct xfs_item_ops { | ||
2792 | void (*iop_unlock)(xfs_log_item_t *); | ||
2793 | xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t); | ||
2794 | void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); | ||
2795 | + void (*iop_error)(xfs_log_item_t *, xfs_buf_t *); | ||
2796 | }; | ||
2797 | |||
2798 | void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item, | ||
2799 | @@ -213,12 +217,14 @@ void xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *); | ||
2800 | void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *); | ||
2801 | void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *); | ||
2802 | void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *); | ||
2803 | -void xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *); | ||
2804 | +bool xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *); | ||
2805 | void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); | ||
2806 | void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); | ||
2807 | void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int); | ||
2808 | void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint); | ||
2809 | -void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint); | ||
2810 | +void xfs_trans_log_buf(struct xfs_trans *, struct xfs_buf *, uint, | ||
2811 | + uint); | ||
2812 | +void xfs_trans_dirty_buf(struct xfs_trans *, struct xfs_buf *); | ||
2813 | void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint); | ||
2814 | |||
2815 | void xfs_extent_free_init_defer_op(void); | ||
2816 | @@ -277,6 +283,6 @@ int xfs_trans_log_finish_bmap_update(struct xfs_trans *tp, | ||
2817 | struct xfs_bud_log_item *rudp, struct xfs_defer_ops *dfops, | ||
2818 | enum xfs_bmap_intent_type type, struct xfs_inode *ip, | ||
2819 | int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, | ||
2820 | - xfs_filblks_t blockcount, xfs_exntst_t state); | ||
2821 | + xfs_filblks_t *blockcount, xfs_exntst_t state); | ||
2822 | |||
2823 | #endif /* __XFS_TRANS_H__ */ | ||
2824 | diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c | ||
2825 | index d6c9c3e9e02b..70f5ab017323 100644 | ||
2826 | --- a/fs/xfs/xfs_trans_ail.c | ||
2827 | +++ b/fs/xfs/xfs_trans_ail.c | ||
2828 | @@ -684,8 +684,24 @@ xfs_trans_ail_update_bulk( | ||
2829 | } | ||
2830 | } | ||
2831 | |||
2832 | -/* | ||
2833 | - * xfs_trans_ail_delete_bulk - remove multiple log items from the AIL | ||
2834 | +bool | ||
2835 | +xfs_ail_delete_one( | ||
2836 | + struct xfs_ail *ailp, | ||
2837 | + struct xfs_log_item *lip) | ||
2838 | +{ | ||
2839 | + struct xfs_log_item *mlip = xfs_ail_min(ailp); | ||
2840 | + | ||
2841 | + trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn); | ||
2842 | + xfs_ail_delete(ailp, lip); | ||
2843 | + xfs_clear_li_failed(lip); | ||
2844 | + lip->li_flags &= ~XFS_LI_IN_AIL; | ||
2845 | + lip->li_lsn = 0; | ||
2846 | + | ||
2847 | + return mlip == lip; | ||
2848 | +} | ||
2849 | + | ||
2850 | +/** | ||
2851 | + * Remove a log item from the AIL | |
2852 | * | ||
2853 | * @xfs_trans_ail_delete_bulk takes an array of log items that all need to | ||
2854 | * removed from the AIL. The caller is already holding the AIL lock, and done | ||
2855 | @@ -706,52 +722,36 @@ xfs_trans_ail_update_bulk( | ||
2856 | * before returning. | ||
2857 | */ | ||
2858 | void | ||
2859 | -xfs_trans_ail_delete_bulk( | ||
2860 | +xfs_trans_ail_delete( | ||
2861 | struct xfs_ail *ailp, | ||
2862 | - struct xfs_log_item **log_items, | ||
2863 | - int nr_items, | ||
2864 | + struct xfs_log_item *lip, | ||
2865 | int shutdown_type) __releases(ailp->xa_lock) | ||
2866 | { | ||
2867 | - xfs_log_item_t *mlip; | ||
2868 | - int mlip_changed = 0; | ||
2869 | - int i; | ||
2870 | - | ||
2871 | - mlip = xfs_ail_min(ailp); | ||
2872 | + struct xfs_mount *mp = ailp->xa_mount; | ||
2873 | + bool mlip_changed; | ||
2874 | |||
2875 | - for (i = 0; i < nr_items; i++) { | ||
2876 | - struct xfs_log_item *lip = log_items[i]; | ||
2877 | - if (!(lip->li_flags & XFS_LI_IN_AIL)) { | ||
2878 | - struct xfs_mount *mp = ailp->xa_mount; | ||
2879 | - | ||
2880 | - spin_unlock(&ailp->xa_lock); | ||
2881 | - if (!XFS_FORCED_SHUTDOWN(mp)) { | ||
2882 | - xfs_alert_tag(mp, XFS_PTAG_AILDELETE, | ||
2883 | - "%s: attempting to delete a log item that is not in the AIL", | ||
2884 | - __func__); | ||
2885 | - xfs_force_shutdown(mp, shutdown_type); | ||
2886 | - } | ||
2887 | - return; | ||
2888 | + if (!(lip->li_flags & XFS_LI_IN_AIL)) { | ||
2889 | + spin_unlock(&ailp->xa_lock); | ||
2890 | + if (!XFS_FORCED_SHUTDOWN(mp)) { | ||
2891 | + xfs_alert_tag(mp, XFS_PTAG_AILDELETE, | ||
2892 | + "%s: attempting to delete a log item that is not in the AIL", | ||
2893 | + __func__); | ||
2894 | + xfs_force_shutdown(mp, shutdown_type); | ||
2895 | } | ||
2896 | - | ||
2897 | - trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn); | ||
2898 | - xfs_ail_delete(ailp, lip); | ||
2899 | - lip->li_flags &= ~XFS_LI_IN_AIL; | ||
2900 | - lip->li_lsn = 0; | ||
2901 | - if (mlip == lip) | ||
2902 | - mlip_changed = 1; | ||
2903 | + return; | ||
2904 | } | ||
2905 | |||
2906 | + mlip_changed = xfs_ail_delete_one(ailp, lip); | ||
2907 | if (mlip_changed) { | ||
2908 | - if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount)) | ||
2909 | - xlog_assign_tail_lsn_locked(ailp->xa_mount); | ||
2910 | + if (!XFS_FORCED_SHUTDOWN(mp)) | ||
2911 | + xlog_assign_tail_lsn_locked(mp); | ||
2912 | if (list_empty(&ailp->xa_ail)) | ||
2913 | wake_up_all(&ailp->xa_empty); | ||
2914 | - spin_unlock(&ailp->xa_lock); | ||
2915 | + } | ||
2916 | |||
2917 | + spin_unlock(&ailp->xa_lock); | ||
2918 | + if (mlip_changed) | ||
2919 | xfs_log_space_wake(ailp->xa_mount); | ||
2920 | - } else { | ||
2921 | - spin_unlock(&ailp->xa_lock); | ||
2922 | - } | ||
2923 | } | ||
2924 | |||
2925 | int | ||
2926 | diff --git a/fs/xfs/xfs_trans_bmap.c b/fs/xfs/xfs_trans_bmap.c | ||
2927 | index 6408e7d7c08c..14543d93cd4b 100644 | ||
2928 | --- a/fs/xfs/xfs_trans_bmap.c | ||
2929 | +++ b/fs/xfs/xfs_trans_bmap.c | ||
2930 | @@ -63,7 +63,7 @@ xfs_trans_log_finish_bmap_update( | ||
2931 | int whichfork, | ||
2932 | xfs_fileoff_t startoff, | ||
2933 | xfs_fsblock_t startblock, | ||
2934 | - xfs_filblks_t blockcount, | ||
2935 | + xfs_filblks_t *blockcount, | ||
2936 | xfs_exntst_t state) | ||
2937 | { | ||
2938 | int error; | ||
2939 | @@ -196,16 +196,23 @@ xfs_bmap_update_finish_item( | ||
2940 | void **state) | ||
2941 | { | ||
2942 | struct xfs_bmap_intent *bmap; | ||
2943 | + xfs_filblks_t count; | ||
2944 | int error; | ||
2945 | |||
2946 | bmap = container_of(item, struct xfs_bmap_intent, bi_list); | ||
2947 | + count = bmap->bi_bmap.br_blockcount; | ||
2948 | error = xfs_trans_log_finish_bmap_update(tp, done_item, dop, | ||
2949 | bmap->bi_type, | ||
2950 | bmap->bi_owner, bmap->bi_whichfork, | ||
2951 | bmap->bi_bmap.br_startoff, | ||
2952 | bmap->bi_bmap.br_startblock, | ||
2953 | - bmap->bi_bmap.br_blockcount, | ||
2954 | + &count, | ||
2955 | bmap->bi_bmap.br_state); | ||
2956 | + if (!error && count > 0) { | ||
2957 | + ASSERT(bmap->bi_type == XFS_BMAP_UNMAP); | ||
2958 | + bmap->bi_bmap.br_blockcount = count; | ||
2959 | + return -EAGAIN; | ||
2960 | + } | ||
2961 | kmem_free(bmap); | ||
2962 | return error; | ||
2963 | } | ||
2964 | diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c | ||
2965 | index 8ee29ca132dc..3ba7a96a8abd 100644 | ||
2966 | --- a/fs/xfs/xfs_trans_buf.c | ||
2967 | +++ b/fs/xfs/xfs_trans_buf.c | ||
2968 | @@ -356,6 +356,7 @@ xfs_trans_brelse(xfs_trans_t *tp, | ||
2969 | xfs_buf_t *bp) | ||
2970 | { | ||
2971 | xfs_buf_log_item_t *bip; | ||
2972 | + int freed; | ||
2973 | |||
2974 | /* | ||
2975 | * Default to a normal brelse() call if the tp is NULL. | ||
2976 | @@ -419,16 +420,22 @@ xfs_trans_brelse(xfs_trans_t *tp, | ||
2977 | /* | ||
2978 | * Drop our reference to the buf log item. | ||
2979 | */ | ||
2980 | - atomic_dec(&bip->bli_refcount); | ||
2981 | + freed = atomic_dec_and_test(&bip->bli_refcount); | ||
2982 | |||
2983 | /* | ||
2984 | - * If the buf item is not tracking data in the log, then | ||
2985 | - * we must free it before releasing the buffer back to the | ||
2986 | - * free pool. Before releasing the buffer to the free pool, | ||
2987 | - * clear the transaction pointer in b_fsprivate2 to dissolve | ||
2988 | - * its relation to this transaction. | ||
2989 | + * If the buf item is not tracking data in the log, then we must free it | ||
2990 | + * before releasing the buffer back to the free pool. | ||
2991 | + * | ||
2992 | + * If the fs has shut down and we dropped the last reference, it may fall | |
2993 | + * on us to release a (possibly dirty) bli if it never made it to the | |
2994 | + * AIL (e.g., the aborted unpin already happened and didn't release it | |
2995 | + * due to our reference). Since we're already shut down and need xa_lock, | |
2996 | + * just force remove from the AIL and release the bli here. | ||
2997 | */ | ||
2998 | - if (!xfs_buf_item_dirty(bip)) { | ||
2999 | + if (XFS_FORCED_SHUTDOWN(tp->t_mountp) && freed) { | ||
3000 | + xfs_trans_ail_remove(&bip->bli_item, SHUTDOWN_LOG_IO_ERROR); | ||
3001 | + xfs_buf_item_relse(bp); | ||
3002 | + } else if (!(bip->bli_flags & XFS_BLI_DIRTY)) { | ||
3003 | /*** | ||
3004 | ASSERT(bp->b_pincount == 0); | ||
3005 | ***/ | ||
3006 | @@ -486,25 +493,17 @@ xfs_trans_bhold_release(xfs_trans_t *tp, | ||
3007 | } | ||
3008 | |||
3009 | /* | ||
3010 | - * This is called to mark bytes first through last inclusive of the given | ||
3011 | - * buffer as needing to be logged when the transaction is committed. | ||
3012 | - * The buffer must already be associated with the given transaction. | ||
3013 | - * | ||
3014 | - * First and last are numbers relative to the beginning of this buffer, | ||
3015 | - * so the first byte in the buffer is numbered 0 regardless of the | ||
3016 | - * value of b_blkno. | ||
3017 | + * Mark a buffer dirty in the transaction. | ||
3018 | */ | ||
3019 | void | ||
3020 | -xfs_trans_log_buf(xfs_trans_t *tp, | ||
3021 | - xfs_buf_t *bp, | ||
3022 | - uint first, | ||
3023 | - uint last) | ||
3024 | +xfs_trans_dirty_buf( | ||
3025 | + struct xfs_trans *tp, | ||
3026 | + struct xfs_buf *bp) | ||
3027 | { | ||
3028 | - xfs_buf_log_item_t *bip = bp->b_fspriv; | ||
3029 | + struct xfs_buf_log_item *bip = bp->b_fspriv; | ||
3030 | |||
3031 | ASSERT(bp->b_transp == tp); | ||
3032 | ASSERT(bip != NULL); | ||
3033 | - ASSERT(first <= last && last < BBTOB(bp->b_length)); | ||
3034 | ASSERT(bp->b_iodone == NULL || | ||
3035 | bp->b_iodone == xfs_buf_iodone_callbacks); | ||
3036 | |||
3037 | @@ -524,8 +523,6 @@ xfs_trans_log_buf(xfs_trans_t *tp, | ||
3038 | bp->b_iodone = xfs_buf_iodone_callbacks; | ||
3039 | bip->bli_item.li_cb = xfs_buf_iodone; | ||
3040 | |||
3041 | - trace_xfs_trans_log_buf(bip); | ||
3042 | - | ||
3043 | /* | ||
3044 | * If we invalidated the buffer within this transaction, then | ||
3045 | * cancel the invalidation now that we're dirtying the buffer | ||
3046 | @@ -538,17 +535,37 @@ xfs_trans_log_buf(xfs_trans_t *tp, | ||
3047 | bp->b_flags &= ~XBF_STALE; | ||
3048 | bip->__bli_format.blf_flags &= ~XFS_BLF_CANCEL; | ||
3049 | } | ||
3050 | + bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED; | ||
3051 | |||
3052 | tp->t_flags |= XFS_TRANS_DIRTY; | ||
3053 | bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY; | ||
3054 | +} | ||
3055 | |||
3056 | - /* | ||
3057 | - * If we have an ordered buffer we are not logging any dirty range but | ||
3058 | - * it still needs to be marked dirty and that it has been logged. | ||
3059 | - */ | ||
3060 | - bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED; | ||
3061 | - if (!(bip->bli_flags & XFS_BLI_ORDERED)) | ||
3062 | - xfs_buf_item_log(bip, first, last); | ||
3063 | +/* | ||
3064 | + * This is called to mark bytes first through last inclusive of the given | ||
3065 | + * buffer as needing to be logged when the transaction is committed. | ||
3066 | + * The buffer must already be associated with the given transaction. | ||
3067 | + * | ||
3068 | + * First and last are numbers relative to the beginning of this buffer, | ||
3069 | + * so the first byte in the buffer is numbered 0 regardless of the | ||
3070 | + * value of b_blkno. | ||
3071 | + */ | ||
3072 | +void | ||
3073 | +xfs_trans_log_buf( | ||
3074 | + struct xfs_trans *tp, | ||
3075 | + struct xfs_buf *bp, | ||
3076 | + uint first, | ||
3077 | + uint last) | ||
3078 | +{ | ||
3079 | + struct xfs_buf_log_item *bip = bp->b_fspriv; | ||
3080 | + | ||
3081 | + ASSERT(first <= last && last < BBTOB(bp->b_length)); | ||
3082 | + ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED)); | ||
3083 | + | ||
3084 | + xfs_trans_dirty_buf(tp, bp); | ||
3085 | + | ||
3086 | + trace_xfs_trans_log_buf(bip); | ||
3087 | + xfs_buf_item_log(bip, first, last); | ||
3088 | } | ||
3089 | |||
3090 | |||
3091 | @@ -701,14 +718,13 @@ xfs_trans_inode_alloc_buf( | ||
3092 | } | ||
3093 | |||
3094 | /* | ||
3095 | - * Mark the buffer as ordered for this transaction. This means | ||
3096 | - * that the contents of the buffer are not recorded in the transaction | ||
3097 | - * but it is tracked in the AIL as though it was. This allows us | ||
3098 | - * to record logical changes in transactions rather than the physical | ||
3099 | - * changes we make to the buffer without changing writeback ordering | ||
3100 | - * constraints of metadata buffers. | ||
3101 | + * Mark the buffer as ordered for this transaction. This means that the contents | ||
3102 | + * of the buffer are not recorded in the transaction but it is tracked in the | ||
3103 | + * AIL as though it was. This allows us to record logical changes in | ||
3104 | + * transactions rather than the physical changes we make to the buffer without | ||
3105 | + * changing writeback ordering constraints of metadata buffers. | ||
3106 | */ | ||
3107 | -void | ||
3108 | +bool | ||
3109 | xfs_trans_ordered_buf( | ||
3110 | struct xfs_trans *tp, | ||
3111 | struct xfs_buf *bp) | ||
3112 | @@ -719,8 +735,18 @@ xfs_trans_ordered_buf( | ||
3113 | ASSERT(bip != NULL); | ||
3114 | ASSERT(atomic_read(&bip->bli_refcount) > 0); | ||
3115 | |||
3116 | + if (xfs_buf_item_dirty_format(bip)) | ||
3117 | + return false; | ||
3118 | + | ||
3119 | bip->bli_flags |= XFS_BLI_ORDERED; | ||
3120 | trace_xfs_buf_item_ordered(bip); | ||
3121 | + | ||
3122 | + /* | ||
3123 | + * We don't log a dirty range of an ordered buffer but it still needs | ||
3124 | + * to be marked dirty and that it has been logged. | ||
3125 | + */ | ||
3126 | + xfs_trans_dirty_buf(tp, bp); | ||
3127 | + return true; | ||
3128 | } | ||
3129 | |||
3130 | /* | ||
3131 | diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h | ||
3132 | index 49931b72da8a..b317a3644c00 100644 | ||
3133 | --- a/fs/xfs/xfs_trans_priv.h | ||
3134 | +++ b/fs/xfs/xfs_trans_priv.h | ||
3135 | @@ -106,18 +106,9 @@ xfs_trans_ail_update( | ||
3136 | xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn); | ||
3137 | } | ||
3138 | |||
3139 | -void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp, | ||
3140 | - struct xfs_log_item **log_items, int nr_items, | ||
3141 | - int shutdown_type) | ||
3142 | - __releases(ailp->xa_lock); | ||
3143 | -static inline void | ||
3144 | -xfs_trans_ail_delete( | ||
3145 | - struct xfs_ail *ailp, | ||
3146 | - xfs_log_item_t *lip, | ||
3147 | - int shutdown_type) __releases(ailp->xa_lock) | ||
3148 | -{ | ||
3149 | - xfs_trans_ail_delete_bulk(ailp, &lip, 1, shutdown_type); | ||
3150 | -} | ||
3151 | +bool xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip); | ||
3152 | +void xfs_trans_ail_delete(struct xfs_ail *ailp, struct xfs_log_item *lip, | ||
3153 | + int shutdown_type) __releases(ailp->xa_lock); | ||
3154 | |||
3155 | static inline void | ||
3156 | xfs_trans_ail_remove( | ||
3157 | @@ -173,4 +164,35 @@ xfs_trans_ail_copy_lsn( | ||
3158 | *dst = *src; | ||
3159 | } | ||
3160 | #endif | ||
3161 | + | ||
3162 | +static inline void | ||
3163 | +xfs_clear_li_failed( | ||
3164 | + struct xfs_log_item *lip) | ||
3165 | +{ | ||
3166 | + struct xfs_buf *bp = lip->li_buf; | ||
3167 | + | ||
3168 | + ASSERT(lip->li_flags & XFS_LI_IN_AIL); | ||
3169 | + lockdep_assert_held(&lip->li_ailp->xa_lock); | ||
3170 | + | ||
3171 | + if (lip->li_flags & XFS_LI_FAILED) { | ||
3172 | + lip->li_flags &= ~XFS_LI_FAILED; | ||
3173 | + lip->li_buf = NULL; | ||
3174 | + xfs_buf_rele(bp); | ||
3175 | + } | ||
3176 | +} | ||
3177 | + | ||
3178 | +static inline void | ||
3179 | +xfs_set_li_failed( | ||
3180 | + struct xfs_log_item *lip, | ||
3181 | + struct xfs_buf *bp) | ||
3182 | +{ | ||
3183 | + lockdep_assert_held(&lip->li_ailp->xa_lock); | ||
3184 | + | ||
3185 | + if (!(lip->li_flags & XFS_LI_FAILED)) { | ||
3186 | + xfs_buf_hold(bp); | ||
3187 | + lip->li_flags |= XFS_LI_FAILED; | ||
3188 | + lip->li_buf = bp; | ||
3189 | + } | ||
3190 | +} | ||
3191 | + | ||
3192 | #endif /* __XFS_TRANS_PRIV_H__ */ | ||
3193 | diff --git a/include/linux/fs.h b/include/linux/fs.h | ||
3194 | index dd88ded27fc8..d705ae084edd 100644 | ||
3195 | --- a/include/linux/fs.h | ||
3196 | +++ b/include/linux/fs.h | ||
3197 | @@ -2760,6 +2760,7 @@ static inline void lockdep_annotate_inode_mutex_key(struct inode *inode) { }; | ||
3198 | #endif | ||
3199 | extern void unlock_new_inode(struct inode *); | ||
3200 | extern unsigned int get_next_ino(void); | ||
3201 | +extern void evict_inodes(struct super_block *sb); | ||
3202 | |||
3203 | extern void __iget(struct inode * inode); | ||
3204 | extern void iget_failed(struct inode *); | ||
3205 | diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h | ||
3206 | index 780e7171f548..23db1ae37464 100644 | ||
3207 | --- a/include/linux/netdevice.h | ||
3208 | +++ b/include/linux/netdevice.h | ||
3209 | @@ -3901,6 +3901,8 @@ struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev, | ||
3210 | updev; \ | ||
3211 | updev = netdev_all_upper_get_next_dev_rcu(dev, &(iter))) | ||
3212 | |||
3213 | +bool netdev_has_any_upper_dev(struct net_device *dev); | ||
3214 | + | ||
3215 | void *netdev_lower_get_next_private(struct net_device *dev, | ||
3216 | struct list_head **iter); | ||
3217 | void *netdev_lower_get_next_private_rcu(struct net_device *dev, | ||
3218 | diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h | ||
3219 | index 909972aa3acd..634d19203e7d 100644 | ||
3220 | --- a/include/net/inet_frag.h | ||
3221 | +++ b/include/net/inet_frag.h | ||
3222 | @@ -1,14 +1,9 @@ | ||
3223 | #ifndef __NET_FRAG_H__ | ||
3224 | #define __NET_FRAG_H__ | ||
3225 | |||
3226 | -#include <linux/percpu_counter.h> | ||
3227 | - | ||
3228 | struct netns_frags { | ||
3229 | - /* The percpu_counter "mem" need to be cacheline aligned. | ||
3230 | - * mem.count must not share cacheline with other writers | ||
3231 | - */ | ||
3232 | - struct percpu_counter mem ____cacheline_aligned_in_smp; | ||
3233 | - | ||
3234 | + /* Keep atomic mem on separate cachelines in structs that include it */ | ||
3235 | + atomic_t mem ____cacheline_aligned_in_smp; | ||
3236 | /* sysctls */ | ||
3237 | int timeout; | ||
3238 | int high_thresh; | ||
3239 | @@ -108,15 +103,10 @@ struct inet_frags { | ||
3240 | int inet_frags_init(struct inet_frags *); | ||
3241 | void inet_frags_fini(struct inet_frags *); | ||
3242 | |||
3243 | -static inline int inet_frags_init_net(struct netns_frags *nf) | ||
3244 | -{ | ||
3245 | - return percpu_counter_init(&nf->mem, 0, GFP_KERNEL); | ||
3246 | -} | ||
3247 | -static inline void inet_frags_uninit_net(struct netns_frags *nf) | ||
3248 | +static inline void inet_frags_init_net(struct netns_frags *nf) | ||
3249 | { | ||
3250 | - percpu_counter_destroy(&nf->mem); | ||
3251 | + atomic_set(&nf->mem, 0); | ||
3252 | } | ||
3253 | - | ||
3254 | void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f); | ||
3255 | |||
3256 | void inet_frag_kill(struct inet_frag_queue *q, struct inet_frags *f); | ||
3257 | @@ -140,37 +130,24 @@ static inline bool inet_frag_evicting(struct inet_frag_queue *q) | ||
3258 | |||
3259 | /* Memory Tracking Functions. */ | ||
3260 | |||
3261 | -/* The default percpu_counter batch size is not big enough to scale to | ||
3262 | - * fragmentation mem acct sizes. | ||
3263 | - * The mem size of a 64K fragment is approx: | ||
3264 | - * (44 fragments * 2944 truesize) + frag_queue struct(200) = 129736 bytes | ||
3265 | - */ | ||
3266 | -static unsigned int frag_percpu_counter_batch = 130000; | ||
3267 | - | ||
3268 | static inline int frag_mem_limit(struct netns_frags *nf) | ||
3269 | { | ||
3270 | - return percpu_counter_read(&nf->mem); | ||
3271 | + return atomic_read(&nf->mem); | ||
3272 | } | ||
3273 | |||
3274 | static inline void sub_frag_mem_limit(struct netns_frags *nf, int i) | ||
3275 | { | ||
3276 | - __percpu_counter_add(&nf->mem, -i, frag_percpu_counter_batch); | ||
3277 | + atomic_sub(i, &nf->mem); | ||
3278 | } | ||
3279 | |||
3280 | static inline void add_frag_mem_limit(struct netns_frags *nf, int i) | ||
3281 | { | ||
3282 | - __percpu_counter_add(&nf->mem, i, frag_percpu_counter_batch); | ||
3283 | + atomic_add(i, &nf->mem); | ||
3284 | } | ||
3285 | |||
3286 | -static inline unsigned int sum_frag_mem_limit(struct netns_frags *nf) | ||
3287 | +static inline int sum_frag_mem_limit(struct netns_frags *nf) | ||
3288 | { | ||
3289 | - unsigned int res; | ||
3290 | - | ||
3291 | - local_bh_disable(); | ||
3292 | - res = percpu_counter_sum_positive(&nf->mem); | ||
3293 | - local_bh_enable(); | ||
3294 | - | ||
3295 | - return res; | ||
3296 | + return atomic_read(&nf->mem); | ||
3297 | } | ||
3298 | |||
3299 | /* RFC 3168 support : | ||
3300 | diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h | ||
3301 | index a74e2aa40ef4..a6bcb18ac4c3 100644 | ||
3302 | --- a/include/net/ip6_fib.h | ||
3303 | +++ b/include/net/ip6_fib.h | ||
3304 | @@ -68,6 +68,7 @@ struct fib6_node { | ||
3305 | __u16 fn_flags; | ||
3306 | int fn_sernum; | ||
3307 | struct rt6_info *rr_ptr; | ||
3308 | + struct rcu_head rcu; | ||
3309 | }; | ||
3310 | |||
3311 | #ifndef CONFIG_IPV6_SUBTREES | ||
3312 | @@ -102,7 +103,7 @@ struct rt6_info { | ||
3313 | * the same cache line. | ||
3314 | */ | ||
3315 | struct fib6_table *rt6i_table; | ||
3316 | - struct fib6_node *rt6i_node; | ||
3317 | + struct fib6_node __rcu *rt6i_node; | ||
3318 | |||
3319 | struct in6_addr rt6i_gateway; | ||
3320 | |||
3321 | @@ -165,13 +166,40 @@ static inline void rt6_update_expires(struct rt6_info *rt0, int timeout) | ||
3322 | rt0->rt6i_flags |= RTF_EXPIRES; | ||
3323 | } | ||
3324 | |||
3325 | +/* Function to safely get fn->sernum for passed in rt | ||
3326 | + * and store result in passed in cookie. | ||
3327 | + * Return true if we can get cookie safely | ||
3328 | + * Return false if not | ||
3329 | + */ | ||
3330 | +static inline bool rt6_get_cookie_safe(const struct rt6_info *rt, | ||
3331 | + u32 *cookie) | ||
3332 | +{ | ||
3333 | + struct fib6_node *fn; | ||
3334 | + bool status = false; | ||
3335 | + | ||
3336 | + rcu_read_lock(); | ||
3337 | + fn = rcu_dereference(rt->rt6i_node); | ||
3338 | + | ||
3339 | + if (fn) { | ||
3340 | + *cookie = fn->fn_sernum; | ||
3341 | + status = true; | ||
3342 | + } | ||
3343 | + | ||
3344 | + rcu_read_unlock(); | ||
3345 | + return status; | ||
3346 | +} | ||
3347 | + | ||
3348 | static inline u32 rt6_get_cookie(const struct rt6_info *rt) | ||
3349 | { | ||
3350 | + u32 cookie = 0; | ||
3351 | + | ||
3352 | if (rt->rt6i_flags & RTF_PCPU || | ||
3353 | (unlikely(rt->dst.flags & DST_NOCACHE) && rt->dst.from)) | ||
3354 | rt = (struct rt6_info *)(rt->dst.from); | ||
3355 | |||
3356 | - return rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0; | ||
3357 | + rt6_get_cookie_safe(rt, &cookie); | ||
3358 | + | ||
3359 | + return cookie; | ||
3360 | } | ||
3361 | |||
3362 | static inline void ip6_rt_put(struct rt6_info *rt) | ||
3363 | diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c | ||
3364 | index 89a687f3c0a3..5f5e28f210e0 100644 | ||
3365 | --- a/net/bridge/br_device.c | ||
3366 | +++ b/net/bridge/br_device.c | ||
3367 | @@ -53,6 +53,9 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) | ||
3368 | brstats->tx_bytes += skb->len; | ||
3369 | u64_stats_update_end(&brstats->syncp); | ||
3370 | |||
3371 | +#ifdef CONFIG_NET_SWITCHDEV | ||
3372 | + skb->offload_fwd_mark = 0; | ||
3373 | +#endif | ||
3374 | BR_INPUT_SKB_CB(skb)->brdev = dev; | ||
3375 | |||
3376 | skb_reset_mac_header(skb); | ||
3377 | diff --git a/net/core/datagram.c b/net/core/datagram.c | ||
3378 | index 58dfa23d12ca..4fa4011feec1 100644 | ||
3379 | --- a/net/core/datagram.c | ||
3380 | +++ b/net/core/datagram.c | ||
3381 | @@ -351,7 +351,7 @@ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags) | ||
3382 | if (flags & MSG_PEEK) { | ||
3383 | err = -ENOENT; | ||
3384 | spin_lock_bh(&sk->sk_receive_queue.lock); | ||
3385 | - if (skb == skb_peek(&sk->sk_receive_queue)) { | ||
3386 | + if (skb->next) { | ||
3387 | __skb_unlink(skb, &sk->sk_receive_queue); | ||
3388 | atomic_dec(&skb->users); | ||
3389 | err = 0; | ||
3390 | diff --git a/net/core/dev.c b/net/core/dev.c | ||
3391 | index 1d0a7369d5a2..ba7b8121a414 100644 | ||
3392 | --- a/net/core/dev.c | ||
3393 | +++ b/net/core/dev.c | ||
3394 | @@ -5337,12 +5337,13 @@ EXPORT_SYMBOL(netdev_has_upper_dev); | ||
3395 | * Find out if a device is linked to an upper device and return true in case | ||
3396 | * it is. The caller must hold the RTNL lock. | ||
3397 | */ | ||
3398 | -static bool netdev_has_any_upper_dev(struct net_device *dev) | ||
3399 | +bool netdev_has_any_upper_dev(struct net_device *dev) | ||
3400 | { | ||
3401 | ASSERT_RTNL(); | ||
3402 | |||
3403 | return !list_empty(&dev->all_adj_list.upper); | ||
3404 | } | ||
3405 | +EXPORT_SYMBOL(netdev_has_any_upper_dev); | ||
3406 | |||
3407 | /** | ||
3408 | * netdev_master_upper_dev_get - Get master upper device | ||
3409 | diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c | ||
3410 | index 30d875dff6b5..f85b08baff16 100644 | ||
3411 | --- a/net/ieee802154/6lowpan/reassembly.c | ||
3412 | +++ b/net/ieee802154/6lowpan/reassembly.c | ||
3413 | @@ -580,19 +580,14 @@ static int __net_init lowpan_frags_init_net(struct net *net) | ||
3414 | { | ||
3415 | struct netns_ieee802154_lowpan *ieee802154_lowpan = | ||
3416 | net_ieee802154_lowpan(net); | ||
3417 | - int res; | ||
3418 | |||
3419 | ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH; | ||
3420 | ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH; | ||
3421 | ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT; | ||
3422 | |||
3423 | - res = inet_frags_init_net(&ieee802154_lowpan->frags); | ||
3424 | - if (res) | ||
3425 | - return res; | ||
3426 | - res = lowpan_frags_ns_sysctl_register(net); | ||
3427 | - if (res) | ||
3428 | - inet_frags_uninit_net(&ieee802154_lowpan->frags); | ||
3429 | - return res; | ||
3430 | + inet_frags_init_net(&ieee802154_lowpan->frags); | ||
3431 | + | ||
3432 | + return lowpan_frags_ns_sysctl_register(net); | ||
3433 | } | ||
3434 | |||
3435 | static void __net_exit lowpan_frags_exit_net(struct net *net) | ||
3436 | diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c | ||
3437 | index b5e9317eaf9e..631c0d0d7cf8 100644 | ||
3438 | --- a/net/ipv4/inet_fragment.c | ||
3439 | +++ b/net/ipv4/inet_fragment.c | ||
3440 | @@ -234,10 +234,8 @@ void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f) | ||
3441 | cond_resched(); | ||
3442 | |||
3443 | if (read_seqretry(&f->rnd_seqlock, seq) || | ||
3444 | - percpu_counter_sum(&nf->mem)) | ||
3445 | + sum_frag_mem_limit(nf)) | ||
3446 | goto evict_again; | ||
3447 | - | ||
3448 | - percpu_counter_destroy(&nf->mem); | ||
3449 | } | ||
3450 | EXPORT_SYMBOL(inet_frags_exit_net); | ||
3451 | |||
3452 | diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c | ||
3453 | index bbe7f72db9c1..453db950dc9f 100644 | ||
3454 | --- a/net/ipv4/ip_fragment.c | ||
3455 | +++ b/net/ipv4/ip_fragment.c | ||
3456 | @@ -835,8 +835,6 @@ static void __init ip4_frags_ctl_register(void) | ||
3457 | |||
3458 | static int __net_init ipv4_frags_init_net(struct net *net) | ||
3459 | { | ||
3460 | - int res; | ||
3461 | - | ||
3462 | /* Fragment cache limits. | ||
3463 | * | ||
3464 | * The fragment memory accounting code, (tries to) account for | ||
3465 | @@ -862,13 +860,9 @@ static int __net_init ipv4_frags_init_net(struct net *net) | ||
3466 | |||
3467 | net->ipv4.frags.max_dist = 64; | ||
3468 | |||
3469 | - res = inet_frags_init_net(&net->ipv4.frags); | ||
3470 | - if (res) | ||
3471 | - return res; | ||
3472 | - res = ip4_frags_ns_ctl_register(net); | ||
3473 | - if (res) | ||
3474 | - inet_frags_uninit_net(&net->ipv4.frags); | ||
3475 | - return res; | ||
3476 | + inet_frags_init_net(&net->ipv4.frags); | ||
3477 | + | ||
3478 | + return ip4_frags_ns_ctl_register(net); | ||
3479 | } | ||
3480 | |||
3481 | static void __net_exit ipv4_frags_exit_net(struct net *net) | ||
3482 | diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c | ||
3483 | index 5719d6ba0824..bd7f1836bb70 100644 | ||
3484 | --- a/net/ipv4/ip_tunnel.c | ||
3485 | +++ b/net/ipv4/ip_tunnel.c | ||
3486 | @@ -609,8 +609,8 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto) | ||
3487 | ip_rt_put(rt); | ||
3488 | goto tx_dropped; | ||
3489 | } | ||
3490 | - iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, key->tos, | ||
3491 | - key->ttl, df, !net_eq(tunnel->net, dev_net(dev))); | ||
3492 | + iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl, | ||
3493 | + df, !net_eq(tunnel->net, dev_net(dev))); | ||
3494 | return; | ||
3495 | tx_error: | ||
3496 | dev->stats.tx_errors++; | ||
3497 | diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c | ||
3498 | index 1a4db27f5833..6b3d27e50317 100644 | ||
3499 | --- a/net/ipv4/tcp.c | ||
3500 | +++ b/net/ipv4/tcp.c | ||
3501 | @@ -2297,6 +2297,10 @@ int tcp_disconnect(struct sock *sk, int flags) | ||
3502 | tcp_set_ca_state(sk, TCP_CA_Open); | ||
3503 | tcp_clear_retrans(tp); | ||
3504 | inet_csk_delack_init(sk); | ||
3505 | + /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0 | ||
3506 | + * issue in __tcp_select_window() | ||
3507 | + */ | ||
3508 | + icsk->icsk_ack.rcv_mss = TCP_MIN_MSS; | ||
3509 | tcp_init_send_head(sk); | ||
3510 | memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); | ||
3511 | __sk_dst_reset(sk); | ||
3512 | diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c | ||
3513 | index b2cabda72320..cc101b1be903 100644 | ||
3514 | --- a/net/ipv6/addrconf.c | ||
3515 | +++ b/net/ipv6/addrconf.c | ||
3516 | @@ -5443,7 +5443,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) | ||
3517 | * our DAD process, so we don't need | ||
3518 | * to do it again | ||
3519 | */ | ||
3520 | - if (!(ifp->rt->rt6i_node)) | ||
3521 | + if (!rcu_access_pointer(ifp->rt->rt6i_node)) | ||
3522 | ip6_ins_rt(ifp->rt); | ||
3523 | if (ifp->idev->cnf.forwarding) | ||
3524 | addrconf_join_anycast(ifp); | ||
3525 | diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c | ||
3526 | index ff389591a340..5da864997495 100644 | ||
3527 | --- a/net/ipv6/ip6_fib.c | ||
3528 | +++ b/net/ipv6/ip6_fib.c | ||
3529 | @@ -148,11 +148,23 @@ static struct fib6_node *node_alloc(void) | ||
3530 | return fn; | ||
3531 | } | ||
3532 | |||
3533 | -static void node_free(struct fib6_node *fn) | ||
3534 | +static void node_free_immediate(struct fib6_node *fn) | ||
3535 | +{ | ||
3536 | + kmem_cache_free(fib6_node_kmem, fn); | ||
3537 | +} | ||
3538 | + | ||
3539 | +static void node_free_rcu(struct rcu_head *head) | ||
3540 | { | ||
3541 | + struct fib6_node *fn = container_of(head, struct fib6_node, rcu); | ||
3542 | + | ||
3543 | kmem_cache_free(fib6_node_kmem, fn); | ||
3544 | } | ||
3545 | |||
3546 | +static void node_free(struct fib6_node *fn) | ||
3547 | +{ | ||
3548 | + call_rcu(&fn->rcu, node_free_rcu); | ||
3549 | +} | ||
3550 | + | ||
3551 | static void rt6_rcu_free(struct rt6_info *rt) | ||
3552 | { | ||
3553 | call_rcu(&rt->dst.rcu_head, dst_rcu_free); | ||
3554 | @@ -189,6 +201,12 @@ static void rt6_release(struct rt6_info *rt) | ||
3555 | } | ||
3556 | } | ||
3557 | |||
3558 | +static void fib6_free_table(struct fib6_table *table) | ||
3559 | +{ | ||
3560 | + inetpeer_invalidate_tree(&table->tb6_peers); | ||
3561 | + kfree(table); | ||
3562 | +} | ||
3563 | + | ||
3564 | static void fib6_link_table(struct net *net, struct fib6_table *tb) | ||
3565 | { | ||
3566 | unsigned int h; | ||
3567 | @@ -589,9 +607,9 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, | ||
3568 | |||
3569 | if (!in || !ln) { | ||
3570 | if (in) | ||
3571 | - node_free(in); | ||
3572 | + node_free_immediate(in); | ||
3573 | if (ln) | ||
3574 | - node_free(ln); | ||
3575 | + node_free_immediate(ln); | ||
3576 | return ERR_PTR(-ENOMEM); | ||
3577 | } | ||
3578 | |||
3579 | @@ -862,7 +880,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, | ||
3580 | |||
3581 | rt->dst.rt6_next = iter; | ||
3582 | *ins = rt; | ||
3583 | - rt->rt6i_node = fn; | ||
3584 | + rcu_assign_pointer(rt->rt6i_node, fn); | ||
3585 | atomic_inc(&rt->rt6i_ref); | ||
3586 | inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); | ||
3587 | info->nl_net->ipv6.rt6_stats->fib_rt_entries++; | ||
3588 | @@ -887,7 +905,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, | ||
3589 | return err; | ||
3590 | |||
3591 | *ins = rt; | ||
3592 | - rt->rt6i_node = fn; | ||
3593 | + rcu_assign_pointer(rt->rt6i_node, fn); | ||
3594 | rt->dst.rt6_next = iter->dst.rt6_next; | ||
3595 | atomic_inc(&rt->rt6i_ref); | ||
3596 | inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); | ||
3597 | @@ -1020,7 +1038,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, | ||
3598 | root, and then (in failure) stale node | ||
3599 | in main tree. | ||
3600 | */ | ||
3601 | - node_free(sfn); | ||
3602 | + node_free_immediate(sfn); | ||
3603 | err = PTR_ERR(sn); | ||
3604 | goto failure; | ||
3605 | } | ||
3606 | @@ -1447,8 +1465,9 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, | ||
3607 | |||
3608 | int fib6_del(struct rt6_info *rt, struct nl_info *info) | ||
3609 | { | ||
3610 | + struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node, | ||
3611 | + lockdep_is_held(&rt->rt6i_table->tb6_lock)); | ||
3612 | struct net *net = info->nl_net; | ||
3613 | - struct fib6_node *fn = rt->rt6i_node; | ||
3614 | struct rt6_info **rtp; | ||
3615 | |||
3616 | #if RT6_DEBUG >= 2 | ||
3617 | @@ -1637,7 +1656,9 @@ static int fib6_clean_node(struct fib6_walker *w) | ||
3618 | if (res) { | ||
3619 | #if RT6_DEBUG >= 2 | ||
3620 | pr_debug("%s: del failed: rt=%p@%p err=%d\n", | ||
3621 | - __func__, rt, rt->rt6i_node, res); | ||
3622 | + __func__, rt, | ||
3623 | + rcu_access_pointer(rt->rt6i_node), | ||
3624 | + res); | ||
3625 | #endif | ||
3626 | continue; | ||
3627 | } | ||
3628 | @@ -1878,15 +1899,22 @@ static int __net_init fib6_net_init(struct net *net) | ||
3629 | |||
3630 | static void fib6_net_exit(struct net *net) | ||
3631 | { | ||
3632 | + unsigned int i; | ||
3633 | + | ||
3634 | rt6_ifdown(net, NULL); | ||
3635 | del_timer_sync(&net->ipv6.ip6_fib_timer); | ||
3636 | |||
3637 | -#ifdef CONFIG_IPV6_MULTIPLE_TABLES | ||
3638 | - inetpeer_invalidate_tree(&net->ipv6.fib6_local_tbl->tb6_peers); | ||
3639 | - kfree(net->ipv6.fib6_local_tbl); | ||
3640 | -#endif | ||
3641 | - inetpeer_invalidate_tree(&net->ipv6.fib6_main_tbl->tb6_peers); | ||
3642 | - kfree(net->ipv6.fib6_main_tbl); | ||
3643 | + for (i = 0; i < FIB6_TABLE_HASHSZ; i++) { | ||
3644 | + struct hlist_head *head = &net->ipv6.fib_table_hash[i]; | ||
3645 | + struct hlist_node *tmp; | ||
3646 | + struct fib6_table *tb; | ||
3647 | + | ||
3648 | + hlist_for_each_entry_safe(tb, tmp, head, tb6_hlist) { | ||
3649 | + hlist_del(&tb->tb6_hlist); | ||
3650 | + fib6_free_table(tb); | ||
3651 | + } | ||
3652 | + } | ||
3653 | + | ||
3654 | kfree(net->ipv6.fib_table_hash); | ||
3655 | kfree(net->ipv6.rt6_stats); | ||
3656 | } | ||
3657 | diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c | ||
3658 | index d2844ee469cb..f78afe43bdff 100644 | ||
3659 | --- a/net/ipv6/ip6_gre.c | ||
3660 | +++ b/net/ipv6/ip6_gre.c | ||
3661 | @@ -432,7 +432,9 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, | ||
3662 | } | ||
3663 | break; | ||
3664 | case ICMPV6_PKT_TOOBIG: | ||
3665 | - mtu = be32_to_cpu(info) - offset; | ||
3666 | + mtu = be32_to_cpu(info) - offset - t->tun_hlen; | ||
3667 | + if (t->dev->type == ARPHRD_ETHER) | ||
3668 | + mtu -= ETH_HLEN; | ||
3669 | if (mtu < IPV6_MIN_MTU) | ||
3670 | mtu = IPV6_MIN_MTU; | ||
3671 | t->dev->mtu = mtu; | ||
3672 | diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c | ||
3673 | index 986d4ca38832..b263bf3a19f7 100644 | ||
3674 | --- a/net/ipv6/netfilter/nf_conntrack_reasm.c | ||
3675 | +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c | ||
3676 | @@ -622,18 +622,12 @@ EXPORT_SYMBOL_GPL(nf_ct_frag6_gather); | ||
3677 | |||
3678 | static int nf_ct_net_init(struct net *net) | ||
3679 | { | ||
3680 | - int res; | ||
3681 | - | ||
3682 | net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; | ||
3683 | net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH; | ||
3684 | net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT; | ||
3685 | - res = inet_frags_init_net(&net->nf_frag.frags); | ||
3686 | - if (res) | ||
3687 | - return res; | ||
3688 | - res = nf_ct_frag6_sysctl_register(net); | ||
3689 | - if (res) | ||
3690 | - inet_frags_uninit_net(&net->nf_frag.frags); | ||
3691 | - return res; | ||
3692 | + inet_frags_init_net(&net->nf_frag.frags); | ||
3693 | + | ||
3694 | + return nf_ct_frag6_sysctl_register(net); | ||
3695 | } | ||
3696 | |||
3697 | static void nf_ct_net_exit(struct net *net) | ||
3698 | diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c | ||
3699 | index abb2c307fbe8..a338bbc33cf3 100644 | ||
3700 | --- a/net/ipv6/output_core.c | ||
3701 | +++ b/net/ipv6/output_core.c | ||
3702 | @@ -86,7 +86,6 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) | ||
3703 | |||
3704 | while (offset <= packet_len) { | ||
3705 | struct ipv6_opt_hdr *exthdr; | ||
3706 | - unsigned int len; | ||
3707 | |||
3708 | switch (**nexthdr) { | ||
3709 | |||
3710 | @@ -112,10 +111,9 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) | ||
3711 | |||
3712 | exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) + | ||
3713 | offset); | ||
3714 | - len = ipv6_optlen(exthdr); | ||
3715 | - if (len + offset >= IPV6_MAXPLEN) | ||
3716 | + offset += ipv6_optlen(exthdr); | ||
3717 | + if (offset > IPV6_MAXPLEN) | ||
3718 | return -EINVAL; | ||
3719 | - offset += len; | ||
3720 | *nexthdr = &exthdr->nexthdr; | ||
3721 | } | ||
3722 | |||
3723 | diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c | ||
3724 | index 3815e8505ed2..e585c0a2591c 100644 | ||
3725 | --- a/net/ipv6/reassembly.c | ||
3726 | +++ b/net/ipv6/reassembly.c | ||
3727 | @@ -709,19 +709,13 @@ static void ip6_frags_sysctl_unregister(void) | ||
3728 | |||
3729 | static int __net_init ipv6_frags_init_net(struct net *net) | ||
3730 | { | ||
3731 | - int res; | ||
3732 | - | ||
3733 | net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; | ||
3734 | net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH; | ||
3735 | net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT; | ||
3736 | |||
3737 | - res = inet_frags_init_net(&net->ipv6.frags); | ||
3738 | - if (res) | ||
3739 | - return res; | ||
3740 | - res = ip6_frags_ns_sysctl_register(net); | ||
3741 | - if (res) | ||
3742 | - inet_frags_uninit_net(&net->ipv6.frags); | ||
3743 | - return res; | ||
3744 | + inet_frags_init_net(&net->ipv6.frags); | ||
3745 | + | ||
3746 | + return ip6_frags_ns_sysctl_register(net); | ||
3747 | } | ||
3748 | |||
3749 | static void __net_exit ipv6_frags_exit_net(struct net *net) | ||
3750 | diff --git a/net/ipv6/route.c b/net/ipv6/route.c | ||
3751 | index 5764a84465f8..61729641e027 100644 | ||
3752 | --- a/net/ipv6/route.c | ||
3753 | +++ b/net/ipv6/route.c | ||
3754 | @@ -1267,7 +1267,9 @@ static void rt6_dst_from_metrics_check(struct rt6_info *rt) | ||
3755 | |||
3756 | static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie) | ||
3757 | { | ||
3758 | - if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie)) | ||
3759 | + u32 rt_cookie = 0; | ||
3760 | + | ||
3761 | + if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie) | ||
3762 | return NULL; | ||
3763 | |||
3764 | if (rt6_check_expired(rt)) | ||
3765 | @@ -1335,8 +1337,14 @@ static void ip6_link_failure(struct sk_buff *skb) | ||
3766 | if (rt->rt6i_flags & RTF_CACHE) { | ||
3767 | dst_hold(&rt->dst); | ||
3768 | ip6_del_rt(rt); | ||
3769 | - } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) { | ||
3770 | - rt->rt6i_node->fn_sernum = -1; | ||
3771 | + } else { | ||
3772 | + struct fib6_node *fn; | ||
3773 | + | ||
3774 | + rcu_read_lock(); | ||
3775 | + fn = rcu_dereference(rt->rt6i_node); | ||
3776 | + if (fn && (rt->rt6i_flags & RTF_DEFAULT)) | ||
3777 | + fn->fn_sernum = -1; | ||
3778 | + rcu_read_unlock(); | ||
3779 | } | ||
3780 | } | ||
3781 | } | ||
3782 | @@ -1353,7 +1361,8 @@ static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) | ||
3783 | static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) | ||
3784 | { | ||
3785 | return !(rt->rt6i_flags & RTF_CACHE) && | ||
3786 | - (rt->rt6i_flags & RTF_PCPU || rt->rt6i_node); | ||
3787 | + (rt->rt6i_flags & RTF_PCPU || | ||
3788 | + rcu_access_pointer(rt->rt6i_node)); | ||
3789 | } | ||
3790 | |||
3791 | static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, | ||
3792 | diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c | ||
3793 | index fecad1098cf8..7eb0e8fe3ca8 100644 | ||
3794 | --- a/net/kcm/kcmsock.c | ||
3795 | +++ b/net/kcm/kcmsock.c | ||
3796 | @@ -1381,6 +1381,10 @@ static int kcm_attach(struct socket *sock, struct socket *csock, | ||
3797 | if (!csk) | ||
3798 | return -EINVAL; | ||
3799 | |||
3800 | + /* We must prevent loops or risk deadlock ! */ | ||
3801 | + if (csk->sk_family == PF_KCM) | ||
3802 | + return -EOPNOTSUPP; | ||
3803 | + | ||
3804 | psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL); | ||
3805 | if (!psock) | ||
3806 | return -ENOMEM; | ||
3807 | diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c | ||
3808 | index ae7bfd26cd91..35ba4b60d927 100644 | ||
3809 | --- a/net/packet/af_packet.c | ||
3810 | +++ b/net/packet/af_packet.c | ||
3811 | @@ -2151,6 +2151,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, | ||
3812 | struct timespec ts; | ||
3813 | __u32 ts_status; | ||
3814 | bool is_drop_n_account = false; | ||
3815 | + bool do_vnet = false; | ||
3816 | |||
3817 | /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT. | ||
3818 | * We may add members to them until current aligned size without forcing | ||
3819 | @@ -2201,8 +2202,10 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, | ||
3820 | netoff = TPACKET_ALIGN(po->tp_hdrlen + | ||
3821 | (maclen < 16 ? 16 : maclen)) + | ||
3822 | po->tp_reserve; | ||
3823 | - if (po->has_vnet_hdr) | ||
3824 | + if (po->has_vnet_hdr) { | ||
3825 | netoff += sizeof(struct virtio_net_hdr); | ||
3826 | + do_vnet = true; | ||
3827 | + } | ||
3828 | macoff = netoff - maclen; | ||
3829 | } | ||
3830 | if (po->tp_version <= TPACKET_V2) { | ||
3831 | @@ -2219,8 +2222,10 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, | ||
3832 | skb_set_owner_r(copy_skb, sk); | ||
3833 | } | ||
3834 | snaplen = po->rx_ring.frame_size - macoff; | ||
3835 | - if ((int)snaplen < 0) | ||
3836 | + if ((int)snaplen < 0) { | ||
3837 | snaplen = 0; | ||
3838 | + do_vnet = false; | ||
3839 | + } | ||
3840 | } | ||
3841 | } else if (unlikely(macoff + snaplen > | ||
3842 | GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) { | ||
3843 | @@ -2233,6 +2238,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, | ||
3844 | if (unlikely((int)snaplen < 0)) { | ||
3845 | snaplen = 0; | ||
3846 | macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len; | ||
3847 | + do_vnet = false; | ||
3848 | } | ||
3849 | } | ||
3850 | spin_lock(&sk->sk_receive_queue.lock); | ||
3851 | @@ -2258,7 +2264,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, | ||
3852 | } | ||
3853 | spin_unlock(&sk->sk_receive_queue.lock); | ||
3854 | |||
3855 | - if (po->has_vnet_hdr) { | ||
3856 | + if (do_vnet) { | ||
3857 | if (__packet_rcv_vnet(skb, h.raw + macoff - | ||
3858 | sizeof(struct virtio_net_hdr))) { | ||
3859 | spin_lock(&sk->sk_receive_queue.lock); | ||
3860 | diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c | ||
3861 | index 048954eee984..e8f56b7c5afb 100644 | ||
3862 | --- a/net/sctp/sctp_diag.c | ||
3863 | +++ b/net/sctp/sctp_diag.c | ||
3864 | @@ -70,7 +70,8 @@ static int inet_diag_msg_sctpladdrs_fill(struct sk_buff *skb, | ||
3865 | |||
3866 | info = nla_data(attr); | ||
3867 | list_for_each_entry_rcu(laddr, address_list, list) { | ||
3868 | - memcpy(info, &laddr->a, addrlen); | ||
3869 | + memcpy(info, &laddr->a, sizeof(laddr->a)); | ||
3870 | + memset(info + sizeof(laddr->a), 0, addrlen - sizeof(laddr->a)); | ||
3871 | info += addrlen; | ||
3872 | } | ||
3873 | |||
3874 | @@ -93,7 +94,9 @@ static int inet_diag_msg_sctpaddrs_fill(struct sk_buff *skb, | ||
3875 | info = nla_data(attr); | ||
3876 | list_for_each_entry(from, &asoc->peer.transport_addr_list, | ||
3877 | transports) { | ||
3878 | - memcpy(info, &from->ipaddr, addrlen); | ||
3879 | + memcpy(info, &from->ipaddr, sizeof(from->ipaddr)); | ||
3880 | + memset(info + sizeof(from->ipaddr), 0, | ||
3881 | + addrlen - sizeof(from->ipaddr)); | ||
3882 | info += addrlen; | ||
3883 | } | ||
3884 | |||
3885 | diff --git a/net/sctp/socket.c b/net/sctp/socket.c | ||
3886 | index 9647e314d4fc..3ef725229449 100644 | ||
3887 | --- a/net/sctp/socket.c | ||
3888 | +++ b/net/sctp/socket.c | ||
3889 | @@ -4373,8 +4373,7 @@ int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc, | ||
3890 | info->sctpi_ictrlchunks = asoc->stats.ictrlchunks; | ||
3891 | |||
3892 | prim = asoc->peer.primary_path; | ||
3893 | - memcpy(&info->sctpi_p_address, &prim->ipaddr, | ||
3894 | - sizeof(struct sockaddr_storage)); | ||
3895 | + memcpy(&info->sctpi_p_address, &prim->ipaddr, sizeof(prim->ipaddr)); | ||
3896 | info->sctpi_p_state = prim->state; | ||
3897 | info->sctpi_p_cwnd = prim->cwnd; | ||
3898 | info->sctpi_p_srtt = prim->srtt; | ||
3899 | diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c | ||
3900 | index 84d0fdaf7de9..d3cfbf2f407d 100644 | ||
3901 | --- a/net/sctp/ulpqueue.c | ||
3902 | +++ b/net/sctp/ulpqueue.c | ||
3903 | @@ -265,7 +265,8 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) | ||
3904 | sctp_ulpq_clear_pd(ulpq); | ||
3905 | |||
3906 | if (queue == &sk->sk_receive_queue && !sp->data_ready_signalled) { | ||
3907 | - sp->data_ready_signalled = 1; | ||
3908 | + if (!sock_owned_by_user(sk)) | ||
3909 | + sp->data_ready_signalled = 1; | ||
3910 | sk->sk_data_ready(sk); | ||
3911 | } | ||
3912 | return 1; |