Contents of /trunk/kernel-alx-legacy/patches-4.9/0150-4.9.51-all-fixes.patch
Revision 3608
Fri Aug 14 07:34:29 2020 UTC (4 years, 1 month ago) by niro
File size: 125407 byte(s)
-added kernel-alx-legacy pkg
diff --git a/Makefile b/Makefile
index 038d126a15fc..b48aebbe187f 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
VERSION = 4
PATCHLEVEL = 9
-SUBLEVEL = 50
+SUBLEVEL = 51
EXTRAVERSION =
NAME = Roaring Lionus

diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index b31761ecce63..7bcd138c3aa9 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -204,6 +204,7 @@ void set_personality_ia32(bool);

#define ELF_CORE_COPY_REGS(pr_reg, regs) \
do { \
+ unsigned long base; \
unsigned v; \
(pr_reg)[0] = (regs)->r15; \
(pr_reg)[1] = (regs)->r14; \
@@ -226,8 +227,8 @@ do { \
(pr_reg)[18] = (regs)->flags; \
(pr_reg)[19] = (regs)->sp; \
(pr_reg)[20] = (regs)->ss; \
- (pr_reg)[21] = current->thread.fsbase; \
- (pr_reg)[22] = current->thread.gsbase; \
+ rdmsrl(MSR_FS_BASE, base); (pr_reg)[21] = base; \
+ rdmsrl(MSR_KERNEL_GS_BASE, base); (pr_reg)[22] = base; \
asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v; \
asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v; \
asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v; \
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b3760b3c1ca0..0887d2ae3797 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -136,6 +136,123 @@ void release_thread(struct task_struct *dead_task)
}
}

+enum which_selector {
+ FS,
+ GS
+};
+
+/*
+ * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
+ * not available. The goal is to be reasonably fast on non-FSGSBASE systems.
+ * It's forcibly inlined because it'll generate better code and this function
+ * is hot.
+ */
+static __always_inline void save_base_legacy(struct task_struct *prev_p,
+ unsigned short selector,
+ enum which_selector which)
+{
+ if (likely(selector == 0)) {
+ /*
+ * On Intel (without X86_BUG_NULL_SEG), the segment base could
+ * be the pre-existing saved base or it could be zero. On AMD
+ * (with X86_BUG_NULL_SEG), the segment base could be almost
+ * anything.
+ *
+ * This branch is very hot (it's hit twice on almost every
+ * context switch between 64-bit programs), and avoiding
+ * the RDMSR helps a lot, so we just assume that whatever
+ * value is already saved is correct. This matches historical
+ * Linux behavior, so it won't break existing applications.
+ *
+ * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
+ * report that the base is zero, it needs to actually be zero:
+ * see the corresponding logic in load_seg_legacy.
+ */
+ } else {
+ /*
+ * If the selector is 1, 2, or 3, then the base is zero on
+ * !X86_BUG_NULL_SEG CPUs and could be anything on
+ * X86_BUG_NULL_SEG CPUs. In the latter case, Linux
+ * has never attempted to preserve the base across context
+ * switches.
+ *
+ * If selector > 3, then it refers to a real segment, and
+ * saving the base isn't necessary.
+ */
+ if (which == FS)
+ prev_p->thread.fsbase = 0;
+ else
+ prev_p->thread.gsbase = 0;
+ }
+}
+
+static __always_inline void save_fsgs(struct task_struct *task)
+{
+ savesegment(fs, task->thread.fsindex);
+ savesegment(gs, task->thread.gsindex);
+ save_base_legacy(task, task->thread.fsindex, FS);
+ save_base_legacy(task, task->thread.gsindex, GS);
+}
+
+static __always_inline void loadseg(enum which_selector which,
+ unsigned short sel)
+{
+ if (which == FS)
+ loadsegment(fs, sel);
+ else
+ load_gs_index(sel);
+}
+
+static __always_inline void load_seg_legacy(unsigned short prev_index,
+ unsigned long prev_base,
+ unsigned short next_index,
+ unsigned long next_base,
+ enum which_selector which)
+{
+ if (likely(next_index <= 3)) {
+ /*
+ * The next task is using 64-bit TLS, is not using this
+ * segment at all, or is having fun with arcane CPU features.
+ */
+ if (next_base == 0) {
+ /*
+ * Nasty case: on AMD CPUs, we need to forcibly zero
+ * the base.
+ */
+ if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
+ loadseg(which, __USER_DS);
+ loadseg(which, next_index);
+ } else {
+ /*
+ * We could try to exhaustively detect cases
+ * under which we can skip the segment load,
+ * but there's really only one case that matters
+ * for performance: if both the previous and
+ * next states are fully zeroed, we can skip
+ * the load.
+ *
+ * (This assumes that prev_base == 0 has no
+ * false positives. This is the case on
+ * Intel-style CPUs.)
+ */
+ if (likely(prev_index | next_index | prev_base))
+ loadseg(which, next_index);
+ }
+ } else {
+ if (prev_index != next_index)
+ loadseg(which, next_index);
+ wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
+ next_base);
+ }
+ } else {
+ /*
+ * The next task is using a real segment. Loading the selector
+ * is sufficient.
+ */
+ loadseg(which, next_index);
+ }
+}
+
int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
unsigned long arg, struct task_struct *p, unsigned long tls)
{
@@ -216,10 +333,19 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
unsigned long new_sp,
unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
+ WARN_ON_ONCE(regs != current_pt_regs());
+
+ if (static_cpu_has(X86_BUG_NULL_SEG)) {
+ /* Loading zero below won't clear the base. */
+ loadsegment(fs, __USER_DS);
+ load_gs_index(__USER_DS);
+ }
+
loadsegment(fs, 0);
loadsegment(es, _ds);
loadsegment(ds, _ds);
load_gs_index(0);
+
regs->ip = new_ip;
regs->sp = new_sp;
regs->cs = _cs;
@@ -264,7 +390,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id();
struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
- unsigned prev_fsindex, prev_gsindex;
fpu_switch_t fpu_switch;

fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu);
@@ -274,8 +399,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
*
* (e.g. xen_load_tls())
*/
- savesegment(fs, prev_fsindex);
- savesegment(gs, prev_gsindex);
+ save_fsgs(prev_p);

/*
* Load TLS before restoring any segments so that segment loads
@@ -314,108 +438,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
if (unlikely(next->ds | prev->ds))
loadsegment(ds, next->ds);

- /*
- * Switch FS and GS.
- *
- * These are even more complicated than DS and ES: they have
- * 64-bit bases are that controlled by arch_prctl. The bases
- * don't necessarily match the selectors, as user code can do
- * any number of things to cause them to be inconsistent.
- *
- * We don't promise to preserve the bases if the selectors are
- * nonzero. We also don't promise to preserve the base if the
- * selector is zero and the base doesn't match whatever was
- * most recently passed to ARCH_SET_FS/GS. (If/when the
- * FSGSBASE instructions are enabled, we'll need to offer
- * stronger guarantees.)
- *
- * As an invariant,
- * (fsbase != 0 && fsindex != 0) || (gsbase != 0 && gsindex != 0) is
- * impossible.
- */
- if (next->fsindex) {
- /* Loading a nonzero value into FS sets the index and base. */
- loadsegment(fs, next->fsindex);
- } else {
- if (next->fsbase) {
- /* Next index is zero but next base is nonzero. */
- if (prev_fsindex)
- loadsegment(fs, 0);
- wrmsrl(MSR_FS_BASE, next->fsbase);
- } else {
- /* Next base and index are both zero. */
- if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
- /*
- * We don't know the previous base and can't
- * find out without RDMSR. Forcibly clear it.
- */
- loadsegment(fs, __USER_DS);
- loadsegment(fs, 0);
- } else {
- /*
- * If the previous index is zero and ARCH_SET_FS
- * didn't change the base, then the base is
- * also zero and we don't need to do anything.
- */
- if (prev->fsbase || prev_fsindex)
- loadsegment(fs, 0);
- }
- }
- }
- /*
- * Save the old state and preserve the invariant.
- * NB: if prev_fsindex == 0, then we can't reliably learn the base
- * without RDMSR because Intel user code can zero it without telling
- * us and AMD user code can program any 32-bit value without telling
- * us.
- */
- if (prev_fsindex)
- prev->fsbase = 0;
- prev->fsindex = prev_fsindex;
-
- if (next->gsindex) {
- /* Loading a nonzero value into GS sets the index and base. */
- load_gs_index(next->gsindex);
- } else {
- if (next->gsbase) {
- /* Next index is zero but next base is nonzero. */
- if (prev_gsindex)
- load_gs_index(0);
- wrmsrl(MSR_KERNEL_GS_BASE, next->gsbase);
- } else {
- /* Next base and index are both zero. */
- if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
- /*
- * We don't know the previous base and can't
- * find out without RDMSR. Forcibly clear it.
- *
- * This contains a pointless SWAPGS pair.
- * Fixing it would involve an explicit check
- * for Xen or a new pvop.
- */
- load_gs_index(__USER_DS);
- load_gs_index(0);
- } else {
- /*
- * If the previous index is zero and ARCH_SET_GS
- * didn't change the base, then the base is
- * also zero and we don't need to do anything.
- */
- if (prev->gsbase || prev_gsindex)
- load_gs_index(0);
- }
- }
- }
- /*
- * Save the old state and preserve the invariant.
- * NB: if prev_gsindex == 0, then we can't reliably learn the base
- * without RDMSR because Intel user code can zero it without telling
- * us and AMD user code can program any 32-bit value without telling
- * us.
- */
- if (prev_gsindex)
- prev->gsbase = 0;
- prev->gsindex = prev_gsindex;
+ load_seg_legacy(prev->fsindex, prev->fsbase,
+ next->fsindex, next->fsbase, FS);
+ load_seg_legacy(prev->gsindex, prev->gsbase,
+ next->gsindex, next->gsbase, GS);

switch_fpu_finish(next_fpu, fpu_switch);

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 383f19c6bf24..549b4afd12e1 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5844,6 +5844,8 @@ static void raid5_do_work(struct work_struct *work)

spin_unlock_irq(&conf->device_lock);

+ r5l_flush_stripe_to_raid(conf->log);
+
async_tx_issue_pending_all();
blk_finish_plug(&plug);

diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
index e8139514d32c..9e073fb6870a 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
@@ -317,12 +317,12 @@ int t4_wr_mbox_meat_timeout(struct adapter *adap, int mbox, const void *cmd,

if (v != MBOX_OWNER_DRV) {
ret = (v == MBOX_OWNER_FW) ? -EBUSY : -ETIMEDOUT;
- t4_record_mbox(adap, cmd, MBOX_LEN, access, ret);
+ t4_record_mbox(adap, cmd, size, access, ret);
return ret;
}

/* Copy in the new mailbox command and send it on its way ... */
- t4_record_mbox(adap, cmd, MBOX_LEN, access, 0);
+ t4_record_mbox(adap, cmd, size, access, 0);
for (i = 0; i < size; i += 8)
t4_write_reg64(adap, data_reg + i, be64_to_cpu(*p++));

@@ -371,7 +371,7 @@ int t4_wr_mbox_meat_timeout(struct adapter *adap, int mbox, const void *cmd,
}

ret = (pcie_fw & PCIE_FW_ERR_F) ? -ENXIO : -ETIMEDOUT;
- t4_record_mbox(adap, cmd, MBOX_LEN, access, ret);
+ t4_record_mbox(adap, cmd, size, access, ret);
dev_err(adap->pdev_dev, "command %#x in mailbox %d timed out\n",
*(const u8 *)cmd, mbox);
t4_report_fw_error(adap);
diff --git a/drivers/net/ethernet/freescale/fman/mac.c b/drivers/net/ethernet/freescale/fman/mac.c
index 736db9d9b0ad..81021f87e4f3 100644
--- a/drivers/net/ethernet/freescale/fman/mac.c
+++ b/drivers/net/ethernet/freescale/fman/mac.c
@@ -622,6 +622,9 @@ static struct platform_device *dpaa_eth_add_device(int fman_id,
goto no_mem;
}

+ pdev->dev.of_node = node;
+ pdev->dev.parent = priv->dev;
+
ret = platform_device_add_data(pdev, &data, sizeof(data));
if (ret)
goto err;
diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c
index 3f4e71148808..fd206889a433 100644
--- a/drivers/net/ethernet/freescale/gianfar.c
+++ b/drivers/net/ethernet/freescale/gianfar.c
@@ -3690,7 +3690,7 @@ static noinline void gfar_update_link_state(struct gfar_private *priv)
u32 tempval1 = gfar_read(&regs->maccfg1);
u32 tempval = gfar_read(&regs->maccfg2);
u32 ecntrl = gfar_read(&regs->ecntrl);
- u32 tx_flow_oldval = (tempval & MACCFG1_TX_FLOW);
+ u32 tx_flow_oldval = (tempval1 & MACCFG1_TX_FLOW);

if (phydev->duplex != priv->oldduplex) {
if (!(phydev->duplex))
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index f902c4d3de99..1806b1fc6e4c 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -4172,6 +4172,8 @@ static int mlxsw_sp_netdevice_port_upper_event(struct net_device *dev,
return -EINVAL;
if (!info->linking)
break;
+ if (netdev_has_any_upper_dev(upper_dev))
+ return -EINVAL;
/* HW limitation forbids to put ports to multiple bridges. */
if (netif_is_bridge_master(upper_dev) &&
!mlxsw_sp_master_bridge_check(mlxsw_sp, upper_dev))
@@ -4185,6 +4187,10 @@ static int mlxsw_sp_netdevice_port_upper_event(struct net_device *dev,
if (netif_is_lag_port(dev) && is_vlan_dev(upper_dev) &&
!netif_is_lag_master(vlan_dev_real_dev(upper_dev)))
return -EINVAL;
+ if (!info->linking)
+ break;
+ if (netdev_has_any_upper_dev(upper_dev))
+ return -EINVAL;
break;
case NETDEV_CHANGEUPPER:
upper_dev = info->upper_dev;
diff --git a/drivers/net/ethernet/qlogic/qlge/qlge_dbg.c b/drivers/net/ethernet/qlogic/qlge/qlge_dbg.c
index 829be21f97b2..be258d90de9e 100644
--- a/drivers/net/ethernet/qlogic/qlge/qlge_dbg.c
+++ b/drivers/net/ethernet/qlogic/qlge/qlge_dbg.c
@@ -724,7 +724,7 @@ static void ql_build_coredump_seg_header(
seg_hdr->cookie = MPI_COREDUMP_COOKIE;
seg_hdr->segNum = seg_number;
seg_hdr->segSize = seg_size;
- memcpy(seg_hdr->description, desc, (sizeof(seg_hdr->description)) - 1);
+ strncpy(seg_hdr->description, desc, (sizeof(seg_hdr->description)) - 1);
}

/*
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index ff038e507fd6..36a04e182af1 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -1084,7 +1084,12 @@ static void netvsc_link_change(struct work_struct *w)
bool notify = false, reschedule = false;
unsigned long flags, next_reconfig, delay;

- rtnl_lock();
+ /* if changes are happening, comeback later */
+ if (!rtnl_trylock()) {
+ schedule_delayed_work(&ndev_ctx->dwork, LINKCHANGE_INT);
+ return;
+ }
+
if (ndev_ctx->start_remove)
goto out_unlock;

diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c
index a5d66e205bb2..2caac0c37059 100644
--- a/drivers/net/macsec.c
+++ b/drivers/net/macsec.c
@@ -3510,6 +3510,7 @@ module_init(macsec_init);
module_exit(macsec_exit);

MODULE_ALIAS_RTNL_LINK("macsec");
+MODULE_ALIAS_GENL_FAMILY("macsec");

MODULE_DESCRIPTION("MACsec IEEE 802.1AE");
MODULE_LICENSE("GPL v2");
diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 775a6e1fdef9..6e12401b5102 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -674,9 +674,6 @@ void phy_stop_machine(struct phy_device *phydev)
if (phydev->state > PHY_UP && phydev->state != PHY_HALTED)
phydev->state = PHY_UP;
mutex_unlock(&phydev->lock);
-
- /* Now we can run the state machine synchronously */
- phy_state_machine(&phydev->state_queue.work);
}

/**
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 5dc128a8da83..96a0661011fd 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -537,8 +537,13 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)

preempt_enable();

- if (vhost_enable_notify(&net->dev, vq))
+ if (!vhost_vq_avail_empty(&net->dev, vq))
vhost_poll_queue(&vq->poll);
+ else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
+ vhost_disable_notify(&net->dev, vq);
+ vhost_poll_queue(&vq->poll);
+ }
+
mutex_unlock(&vq->mutex);

len = peek_head_len(sk);
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 2fc84a991325..98c1a63a4614 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -316,7 +316,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
return 0;

/* Get the previous summary */
- for (i = CURSEG_WARM_DATA; i <= CURSEG_COLD_DATA; i++) {
+ for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
struct curseg_info *curseg = CURSEG_I(sbi, i);
if (curseg->segno == segno) {
sum = curseg->sum_blk->entries[blkoff];
@@ -626,8 +626,6 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
}

clear_sbi_flag(sbi, SBI_POR_DOING);
- if (err)
- set_ckpt_flags(sbi, CP_ERROR_FLAG);
mutex_unlock(&sbi->cp_mutex);

/* let's drop all the directory inodes for clean checkpoint */
diff --git a/fs/inode.c b/fs/inode.c
index 88110fd0b282..920aa0b1c6b0 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -637,6 +637,7 @@ void evict_inodes(struct super_block *sb)

dispose_list(&dispose);
}
+EXPORT_SYMBOL_GPL(evict_inodes);

/**
* invalidate_inodes - attempt to free all inodes on a superblock
diff --git a/fs/internal.h b/fs/internal.h
index f4da3341b4a3..8b7143b0211c 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -136,7 +136,6 @@ extern bool atime_needs_update_rcu(const struct path *, struct inode *);
extern void inode_io_list_del(struct inode *inode);

extern long get_nr_dirty_inodes(void);
-extern void evict_inodes(struct super_block *);
extern int invalidate_inodes(struct super_block *, bool);

/*
diff --git a/fs/iomap.c b/fs/iomap.c
index 798c291cbc75..a49db8806a3a 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -281,7 +281,7 @@ iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
unsigned long bytes; /* Bytes to write to page */

offset = (pos & (PAGE_SIZE - 1));
- bytes = min_t(unsigned long, PAGE_SIZE - offset, length);
+ bytes = min_t(loff_t, PAGE_SIZE - offset, length);

rpage = __iomap_read_page(inode, pos);
if (IS_ERR(rpage))
@@ -376,7 +376,7 @@ iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
unsigned offset, bytes;

offset = pos & (PAGE_SIZE - 1); /* Within page */
- bytes = min_t(unsigned, PAGE_SIZE - offset, count);
+ bytes = min_t(loff_t, PAGE_SIZE - offset, count);

if (IS_DAX(inode))
status = iomap_dax_zero(pos, offset, bytes, iomap);
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 2852521fc8ec..c6c15e5717e4 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -351,7 +351,7 @@ xfs_attr3_leaf_read(

err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
XFS_ATTR_FORK, &xfs_attr3_leaf_buf_ops);
- if (!err && tp)
+ if (!err && tp && *bpp)
xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF);
return err;
}
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 2a8cbd15d5d1..d2f4ab175096 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -579,7 +579,7 @@ xfs_bmap_validate_ret(

#else
#define xfs_bmap_check_leaf_extents(cur, ip, whichfork) do { } while (0)
-#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap)
+#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) do { } while (0)
#endif /* DEBUG */

/*
@@ -5555,6 +5555,8 @@ __xfs_bunmapi(
int whichfork; /* data or attribute fork */
xfs_fsblock_t sum;
xfs_filblks_t len = *rlen; /* length to unmap in file */
+ xfs_fileoff_t max_len;
+ xfs_agnumber_t prev_agno = NULLAGNUMBER, agno;

trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_);

@@ -5576,6 +5578,16 @@ __xfs_bunmapi(
ASSERT(len > 0);
ASSERT(nexts >= 0);

+ /*
+ * Guesstimate how many blocks we can unmap without running the risk of
+ * blowing out the transaction with a mix of EFIs and reflink
+ * adjustments.
+ */
+ if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK)
+ max_len = min(len, xfs_refcount_max_unmap(tp->t_log_res));
+ else
+ max_len = len;
+
if (!(ifp->if_flags & XFS_IFEXTENTS) &&
(error = xfs_iread_extents(tp, ip, whichfork)))
return error;
@@ -5621,7 +5633,7 @@ __xfs_bunmapi(

extno = 0;
while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 &&
- (nexts == 0 || extno < nexts)) {
+ (nexts == 0 || extno < nexts) && max_len > 0) {
/*
* Is the found extent after a hole in which bno lives?
* Just back up to the previous extent, if so.
@@ -5647,6 +5659,17 @@ __xfs_bunmapi(
ASSERT(ep != NULL);
del = got;
wasdel = isnullstartblock(del.br_startblock);
+
+ /*
+ * Make sure we don't touch multiple AGF headers out of order
+ * in a single transaction, as that could cause AB-BA deadlocks.
+ */
+ if (!wasdel) {
+ agno = XFS_FSB_TO_AGNO(mp, del.br_startblock);
+ if (prev_agno != NULLAGNUMBER && prev_agno > agno)
+ break;
+ prev_agno = agno;
+ }
if (got.br_startoff < start) {
del.br_startoff = start;
del.br_blockcount -= start - got.br_startoff;
@@ -5655,6 +5678,15 @@ __xfs_bunmapi(
}
if (del.br_startoff + del.br_blockcount > bno + 1)
del.br_blockcount = bno + 1 - del.br_startoff;
+
+ /* How much can we safely unmap? */
+ if (max_len < del.br_blockcount) {
+ del.br_startoff += del.br_blockcount - max_len;
+ if (!wasdel)
+ del.br_startblock += del.br_blockcount - max_len;
+ del.br_blockcount = max_len;
+ }
+
sum = del.br_startblock + del.br_blockcount;
if (isrt &&
(mod = do_mod(sum, mp->m_sb.sb_rextsize))) {
@@ -5835,6 +5867,7 @@ __xfs_bunmapi(
if (!isrt && wasdel)
xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount, false);

+ max_len -= del.br_blockcount;
bno = del.br_startoff - 1;
nodelete:
/*
@@ -6604,25 +6637,33 @@ xfs_bmap_finish_one(
int whichfork,
xfs_fileoff_t startoff,
xfs_fsblock_t startblock,
- xfs_filblks_t blockcount,
+ xfs_filblks_t *blockcount,
xfs_exntst_t state)
{
struct xfs_bmbt_irec bmap;
int nimaps = 1;
xfs_fsblock_t firstfsb;
int flags = XFS_BMAPI_REMAP;
- int done;
int error = 0;

bmap.br_startblock = startblock;
bmap.br_startoff = startoff;
- bmap.br_blockcount = blockcount;
+ bmap.br_blockcount = *blockcount;
bmap.br_state = state;

+ /*
+ * firstfsb is tied to the transaction lifetime and is used to
+ * ensure correct AG locking order and schedule work item
+ * continuations. XFS_BUI_MAX_FAST_EXTENTS (== 1) restricts us
+ * to only making one bmap call per transaction, so it should
+ * be safe to have it as a local variable here.
+ */
+ firstfsb = NULLFSBLOCK;
+
trace_xfs_bmap_deferred(tp->t_mountp,
XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type,
XFS_FSB_TO_AGBNO(tp->t_mountp, startblock),
- ip->i_ino, whichfork, startoff, blockcount, state);
+ ip->i_ino, whichfork, startoff, *blockcount, state);

if (whichfork != XFS_DATA_FORK && whichfork != XFS_ATTR_FORK)
return -EFSCORRUPTED;
@@ -6641,12 +6682,11 @@ xfs_bmap_finish_one(
bmap.br_blockcount, flags, &firstfsb,
bmap.br_blockcount, &bmap, &nimaps,
dfops);
+ *blockcount = 0;
break;
case XFS_BMAP_UNMAP:
- error = xfs_bunmapi(tp, ip, bmap.br_startoff,
- bmap.br_blockcount, flags, 1, &firstfsb,
- dfops, &done);
- ASSERT(done);
+ error = __xfs_bunmapi(tp, ip, startoff, blockcount,
+ XFS_BMAPI_REMAP, 1, &firstfsb, dfops);
break;
default:
ASSERT(0);
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index e7d40b39f18f..db53ac7ff6df 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -265,7 +265,7 @@ struct xfs_bmap_intent {
int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_defer_ops *dfops,
struct xfs_inode *ip, enum xfs_bmap_intent_type type,
int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock,
- xfs_filblks_t blockcount, xfs_exntst_t state);
+ xfs_filblks_t *blockcount, xfs_exntst_t state);
int xfs_bmap_map_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
struct xfs_inode *ip, struct xfs_bmbt_irec *imap);
int xfs_bmap_unmap_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 5c3918678bb6..9968a746c649 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -888,6 +888,7 @@ xfs_bmbt_change_owner(
cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork);
if (!cur)
return -ENOMEM;
+ cur->bc_private.b.flags |= XFS_BTCUR_BPRV_INVALID_OWNER;

error = xfs_btree_change_owner(cur, new_owner, buffer_list);
xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 91c68913d495..4ad1e214b1b2 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -714,7 +714,8 @@ xfs_btree_firstrec(
* Get the block pointer for this level.
*/
block = xfs_btree_get_block(cur, level, &bp);
- xfs_btree_check_block(cur, block, level, bp);
+ if (xfs_btree_check_block(cur, block, level, bp))
+ return 0;
/*
* It's empty, there is no such record.
*/
@@ -743,7 +744,8 @@ xfs_btree_lastrec(
* Get the block pointer for this level.
*/
block = xfs_btree_get_block(cur, level, &bp);
- xfs_btree_check_block(cur, block, level, bp);
+ if (xfs_btree_check_block(cur, block, level, bp))
+ return 0;
/*
* It's empty, there is no such record.
*/
@@ -1772,6 +1774,7 @@ xfs_btree_lookup_get_block(

/* Check the inode owner since the verifiers don't. */
if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) &&
+ !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_INVALID_OWNER) &&
(cur->bc_flags & XFS_BTREE_LONG_PTRS) &&
be64_to_cpu((*blkp)->bb_u.l.bb_owner) !=
cur->bc_private.b.ip->i_ino)
@@ -4432,10 +4435,15 @@ xfs_btree_block_change_owner(

/* modify the owner */
block = xfs_btree_get_block(cur, level, &bp);
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner))
+ return 0;
block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner);
- else
+ } else {
+ if (block->bb_u.s.bb_owner == cpu_to_be32(bbcoi->new_owner))
+ return 0;
block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner);
+ }

/*
* If the block is a root block hosted in an inode, we might not have a
@@ -4444,16 +4452,19 @@ xfs_btree_block_change_owner(
* block is formatted into the on-disk inode fork. We still change it,
* though, so everything is consistent in memory.
*/
- if (bp) {
- if (cur->bc_tp) {
- xfs_trans_ordered_buf(cur->bc_tp, bp);
+ if (!bp) {
+ ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+ ASSERT(level == cur->bc_nlevels - 1);
+ return 0;
+ }
+
+ if (cur->bc_tp) {
+ if (!xfs_trans_ordered_buf(cur->bc_tp, bp)) {
xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
- } else {
- xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
+ return -EAGAIN;
}
} else {
- ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
- ASSERT(level == cur->bc_nlevels - 1);
+ xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
}

return 0;
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 3b0fc1afada5..33c7be2357b9 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -268,7 +268,8 @@ typedef struct xfs_btree_cur
short forksize; /* fork's inode space */
char whichfork; /* data or attr fork */
char flags; /* flags */
-#define XFS_BTCUR_BPRV_WASDEL 1 /* was delayed */
+#define XFS_BTCUR_BPRV_WASDEL (1<<0) /* was delayed */
+#define XFS_BTCUR_BPRV_INVALID_OWNER (1<<1) /* for ext swap */
} b;
} bc_private; /* per-btree type data */
} xfs_btree_cur_t;
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 1bdf2888295b..b305dbfd81c4 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -263,7 +263,7 @@ xfs_da3_node_read(

err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
which_fork, &xfs_da3_node_buf_ops);
- if (!err && tp) {
+ if (!err && tp && *bpp) {
struct xfs_da_blkinfo *info = (*bpp)->b_addr;
int type;

diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index aa17cb788946..43c902f7a68d 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -139,7 +139,7 @@ xfs_dir3_block_read(

err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, -1, bpp,
XFS_DATA_FORK, &xfs_dir3_block_buf_ops);
- if (!err && tp)
+ if (!err && tp && *bpp)
xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF);
return err;
}
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
index b887fb2a2bcf..f2e342e05365 100644
--- a/fs/xfs/libxfs/xfs_dir2_leaf.c
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -268,7 +268,7 @@ xfs_dir3_leaf_read(

err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
XFS_DATA_FORK, &xfs_dir3_leaf1_buf_ops);
- if (!err && tp)
+ if (!err && tp && *bpp)
xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAF1_BUF);
return err;
}
@@ -285,7 +285,7 @@ xfs_dir3_leafn_read(

err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
XFS_DATA_FORK, &xfs_dir3_leafn_buf_ops);
- if (!err && tp)
+ if (!err && tp && *bpp)
xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAFN_BUF);
return err;
}
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index a2818f6e8598..42fef0731e2a 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -368,8 +368,6 @@ xfs_ialloc_inode_init(
* transaction and pin the log appropriately.
*/
xfs_trans_ordered_buf(tp, fbuf);
- xfs_trans_log_buf(tp, fbuf, 0,
- BBTOB(fbuf->b_length) - 1);
}
} else {
fbuf->b_flags |= XBF_DONE;
@@ -1123,6 +1121,7 @@ xfs_dialloc_ag_inobt(
int error;
int offset;
int i, j;
+ int searchdistance = 10;

pag = xfs_perag_get(mp, agno);

@@ -1149,7 +1148,6 @@ xfs_dialloc_ag_inobt(
if (pagno == agno) {
int doneleft; /* done, to the left */
int doneright; /* done, to the right */
- int searchdistance = 10;

error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
if (error)
@@ -1210,21 +1208,9 @@ xfs_dialloc_ag_inobt(
/*
* Loop until we find an inode chunk with a free inode.
*/
- while (!doneleft || !doneright) {
+ while (--searchdistance > 0 && (!doneleft || !doneright)) {
int useleft; /* using left inode chunk this time */

- if (!--searchdistance) {
- /*
- * Not in range - save last search
- * location and allocate a new inode
- */
- xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
- pag->pagl_leftrec = trec.ir_startino;
- pag->pagl_rightrec = rec.ir_startino;
- pag->pagl_pagino = pagino;
- goto newino;
- }
-
/* figure out the closer block if both are valid. */
if (!doneleft && !doneright) {
useleft = pagino -
@@ -1236,13 +1222,13 @@ xfs_dialloc_ag_inobt(

/* free inodes to the left? */
if (useleft && trec.ir_freecount) {
- rec = trec;
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
cur = tcur;

pag->pagl_leftrec = trec.ir_startino;
pag->pagl_rightrec = rec.ir_startino;
pag->pagl_pagino = pagino;
+ rec = trec;
goto alloc_inode;
}

@@ -1268,26 +1254,37 @@ xfs_dialloc_ag_inobt(
goto error1;
}

- /*
- * We've reached the end of the btree. because
- * we are only searching a small chunk of the
- * btree each search, there is obviously free
- * inodes closer to the parent inode than we
- * are now. restart the search again.
- */
- pag->pagl_pagino = NULLAGINO;
- pag->pagl_leftrec = NULLAGINO;
- pag->pagl_rightrec = NULLAGINO;
- xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
- xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
- goto restart_pagno;
+ if (searchdistance <= 0) {
+ /*
+ * Not in range - save last search
+ * location and allocate a new inode
+ */
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+ pag->pagl_leftrec = trec.ir_startino;
+ pag->pagl_rightrec = rec.ir_startino;
+ pag->pagl_pagino = pagino;
+
+ } else {
+ /*
+ * We've reached the end of the btree. because
+ * we are only searching a small chunk of the
+ * btree each search, there is obviously free
+ * inodes closer to the parent inode than we
+ * are now. restart the search again.
+ */
+ pag->pagl_pagino = NULLAGINO;
+ pag->pagl_leftrec = NULLAGINO;
+ pag->pagl_rightrec = NULLAGINO;
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ goto restart_pagno;
+ }
}

/*
* In a different AG from the parent.
* See if the most recently allocated block has any free.
*/
-newino:
if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
XFS_LOOKUP_EQ, &i);
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 8a37efe04de3..4e30448c4465 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -1539,14 +1539,11 @@ xfs_iext_realloc_indirect(
xfs_ifork_t *ifp, /* inode fork pointer */
int new_size) /* new indirection array size */
{
- int nlists; /* number of irec's (ex lists) */
- int size; /* current indirection array size */
-
ASSERT(ifp->if_flags & XFS_IFEXTIREC);
- nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
- size = nlists * sizeof(xfs_ext_irec_t);
ASSERT(ifp->if_real_bytes);
- ASSERT((new_size >= 0) && (new_size != size));
+ ASSERT((new_size >= 0) &&
+ (new_size != ((ifp->if_real_bytes / XFS_IEXT_BUFSZ) *
+ sizeof(xfs_ext_irec_t))));
if (new_size == 0) {
xfs_iext_destroy(ifp);
} else {
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 82a38d86ebad..d71cb63cdea3 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -784,14 +784,6 @@ xfs_refcount_merge_extents(
}

/*
- * While we're adjusting the refcounts records of an extent, we have
- * to keep an eye on the number of extents we're dirtying -- run too
- * many in a single transaction and we'll exceed the transaction's
- * reservation and crash the fs. Each record adds 12 bytes to the
- * log (plus any key updates) so we'll conservatively assume 24 bytes
- * per record. We must also leave space for btree splits on both ends
- * of the range and space for the CUD and a new CUI.
- *
* XXX: This is a pretty hand-wavy estimate. The penalty for guessing
* true incorrectly is a shutdown FS; the penalty for guessing false
* incorrectly is more transaction rolls than might be necessary.
@@ -822,7 +814,7 @@ xfs_refcount_still_have_space(
else if (overhead > cur->bc_tp->t_log_res)
return false;
return cur->bc_tp->t_log_res - overhead >
- cur->bc_private.a.priv.refc.nr_ops * 32;
+ cur->bc_private.a.priv.refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD;
}

/*
@@ -1648,6 +1640,10 @@ xfs_refcount_recover_cow_leftovers(
error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
if (error)
goto out_trans;
+ if (!agbp) {
+ error = -ENOMEM;
+ goto out_trans;
+ }
cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, NULL);

/* Find all the leftover CoW staging extents. */
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
index 098dc668ab2c..eafb9d1f3b37 100644
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -67,4 +67,20 @@ extern int xfs_refcount_free_cow_extent(struct xfs_mount *mp,
extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
xfs_agnumber_t agno);

+/*
+ * While we're adjusting the refcounts records of an extent, we have
+ * to keep an eye on the number of extents we're dirtying -- run too
+ * many in a single transaction and we'll exceed the transaction's
+ * reservation and crash the fs. Each record adds 12 bytes to the
+ * log (plus any key updates) so we'll conservatively assume 32 bytes
+ * per record. We must also leave space for btree splits on both ends
+ * of the range and space for the CUD and a new CUI.
+ */
+#define XFS_REFCOUNT_ITEM_OVERHEAD 32
+
+static inline xfs_fileoff_t xfs_refcount_max_unmap(int log_res)
+{
+ return (log_res * 3 / 4) / XFS_REFCOUNT_ITEM_OVERHEAD;
+}
+
#endif /* __XFS_REFCOUNT_H__ */
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 578981412615..d23889e0bedc 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -90,11 +90,11 @@ xfs_find_bdev_for_inode(
* associated buffer_heads, paying attention to the start and end offsets that
* we need to process on the page.
*
- * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last
- * buffer in the IO. Once it does this, it is unsafe to access the bufferhead or
- * the page at all, as we may be racing with memory reclaim and it can free both
- * the bufferhead chain and the page as it will see the page as clean and
- * unused.
+ * Note that we open code the action in end_buffer_async_write here so that we
+ * only have to iterate over the buffers attached to the page once. This is not
+ * only more efficient, but also ensures that we only calls end_page_writeback
+ * at the end of the iteration, and thus avoids the pitfall of having the page
+ * and buffers potentially freed after every call to end_buffer_async_write.
*/
static void
xfs_finish_page_writeback(
@@ -102,29 +102,45 @@ xfs_finish_page_writeback(
struct bio_vec *bvec,
int error)
{
- unsigned int end = bvec->bv_offset + bvec->bv_len - 1;
- struct buffer_head *head, *bh, *next;
+ struct buffer_head *head = page_buffers(bvec->bv_page), *bh = head;
+ bool busy = false;
unsigned int off = 0;
- unsigned int bsize;
+ unsigned long flags;

ASSERT(bvec->bv_offset < PAGE_SIZE);
ASSERT((bvec->bv_offset & (i_blocksize(inode) - 1)) == 0);
- ASSERT(end < PAGE_SIZE);
+ ASSERT(bvec->bv_offset + bvec->bv_len <= PAGE_SIZE);
ASSERT((bvec->bv_len & (i_blocksize(inode) - 1)) == 0);

- bh = head = page_buffers(bvec->bv_page);
-
- bsize = bh->b_size;
+ local_irq_save(flags);
+ bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
do {
- if (off > end)
- break;
- next = bh->b_this_page;
- if (off < bvec->bv_offset)
- goto next_bh;
- bh->b_end_io(bh, !error);
-next_bh:
- off += bsize;
- } while ((bh = next) != head);
+ if (off >= bvec->bv_offset &&
+ off < bvec->bv_offset + bvec->bv_len) {
+ ASSERT(buffer_async_write(bh));
+ ASSERT(bh->b_end_io == NULL);
+
+ if (error) {
+ mapping_set_error(bvec->bv_page->mapping, -EIO);
+ set_buffer_write_io_error(bh);
+ clear_buffer_uptodate(bh);
+ SetPageError(bvec->bv_page);
+ } else {
+ set_buffer_uptodate(bh);
+ }
+ clear_buffer_async_write(bh);
+ unlock_buffer(bh);
+ } else if (buffer_async_write(bh)) {
+ ASSERT(buffer_locked(bh));
+ busy = true;
+ }
+ off += bh->b_size;
+ } while ((bh = bh->b_this_page) != head);
+ bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
+ local_irq_restore(flags);
+
+ if (!busy)
+ end_page_writeback(bvec->bv_page);
}

/*
@@ -138,8 +154,10 @@ xfs_destroy_ioend(
int error)
{
struct inode *inode = ioend->io_inode;
- struct bio *last = ioend->io_bio;
- struct bio *bio, *next;
+ struct bio *bio = &ioend->io_inline_bio;
+ struct bio *last = ioend->io_bio, *next;
+ u64 start = bio->bi_iter.bi_sector;
+ bool quiet = bio_flagged(bio, BIO_QUIET);

for (bio = &ioend->io_inline_bio; bio; bio = next) {
struct bio_vec *bvec;
@@ -160,6 +178,11 @@ xfs_destroy_ioend(

bio_put(bio);
}
+
+ if (unlikely(error && !quiet)) {
+ xfs_err_ratelimited(XFS_I(inode)->i_mount,
+ "writeback error on sector %llu", start);
+ }
}

/*
@@ -427,7 +450,8 @@ xfs_start_buffer_writeback(
ASSERT(!buffer_delay(bh));
ASSERT(!buffer_unwritten(bh));

- mark_buffer_async_write(bh);
+ bh->b_end_io = NULL;
+ set_buffer_async_write(bh);
set_buffer_uptodate(bh);
clear_buffer_dirty(bh);
}
@@ -1566,9 +1590,12 @@ xfs_vm_bmap(
* The swap code (ab-)uses ->bmap to get a block mapping and then
* bypasses the file system for actual I/O. We really can't allow
* that on reflinks inodes, so we have to skip out here. And yes,
- * 0 is the magic code for a bmap error.
+ * 0 is the magic code for a bmap error.
+ *
+ * Since we don't pass back blockdev info, we can't return bmap
+ * information for rt files either.
*/
- if (xfs_is_reflink_inode(ip)) {
+ if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip)) {
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
return 0;
}
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index c4b90e794e41..5a54dcd7e7b1 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -395,6 +395,7 @@ xfs_bui_recover(
struct xfs_map_extent *bmap;
xfs_fsblock_t startblock_fsb;
xfs_fsblock_t inode_fsb;
+ xfs_filblks_t count;
bool op_ok;
struct xfs_bud_log_item *budp;
enum xfs_bmap_intent_type type;
@@ -403,6 +404,7 @@ xfs_bui_recover(
struct xfs_trans *tp;
struct xfs_inode *ip = NULL;
struct xfs_defer_ops dfops;
+ struct xfs_bmbt_irec irec;
xfs_fsblock_t firstfsb;

ASSERT(!test_bit(XFS_BUI_RECOVERED, &buip->bui_flags));
@@ -480,13 +482,24 @@ xfs_bui_recover(
}
xfs_trans_ijoin(tp, ip, 0);

+ count = bmap->me_len;
error = xfs_trans_log_finish_bmap_update(tp, budp, &dfops, type,
ip, whichfork, bmap->me_startoff,
- bmap->me_startblock, bmap->me_len,
- state);
+ bmap->me_startblock, &count, state);
if (error)
goto err_dfops;

+ if (count > 0) {
+ ASSERT(type == XFS_BMAP_UNMAP);
+ irec.br_startblock = bmap->me_startblock;
+ irec.br_blockcount = count;
+ irec.br_startoff = bmap->me_startoff;
+ irec.br_state = state;
+ error = xfs_bmap_unmap_extent(tp->t_mountp, &dfops, ip, &irec);
+ if (error)
+ goto err_dfops;
+ }
+
/* Finish transaction, free inodes. */
error = xfs_defer_finish(&tp, &dfops, NULL);
if (error)
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 87b495e2f15a..5ffefac081f7 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1825,29 +1825,18 @@ xfs_swap_extent_forks(
}

/*
- * Before we've swapped the forks, lets set the owners of the forks
- * appropriately. We have to do this as we are demand paging the btree
- * buffers, and so the validation done on read will expect the owner
- * field to be correctly set. Once we change the owners, we can swap the
- * inode forks.
+ * Btree format (v3) inodes have the inode number stamped in the bmbt
+ * block headers. We can't start changing the bmbt blocks until the
+ * inode owner change is logged so recovery does the right thing in the
+ * event of a crash. Set the owner change log flags now and leave the
+ * bmbt scan as the last step.
*/
if (ip->i_d.di_version == 3 &&
- ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+ ip->i_d.di_format == XFS_DINODE_FMT_BTREE)
(*target_log_flags) |= XFS_ILOG_DOWNER;
- error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK,
- tip->i_ino, NULL);
- if (error)
- return error;
- }
-
if (tip->i_d.di_version == 3 &&
- tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+ tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
(*src_log_flags) |= XFS_ILOG_DOWNER;
- error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK,
- ip->i_ino, NULL);
- if (error)
- return error;
- }

/*
* Swap the data forks of the inodes
@@ -1925,6 +1914,48 @@ xfs_swap_extent_forks(
return 0;
}

+/*
+ * Fix up the owners of the bmbt blocks to refer to the current inode. The
+ * change owner scan attempts to order all modified buffers in the current
+ * transaction. In the event of ordered buffer failure, the offending buffer is
+ * physically logged as a fallback and the scan returns -EAGAIN. We must roll
+ * the transaction in this case to replenish the fallback log reservation and
+ * restart the scan. This process repeats until the scan completes.
+ */
+static int
+xfs_swap_change_owner(
+ struct xfs_trans **tpp,
+ struct xfs_inode *ip,
+ struct xfs_inode *tmpip)
+{
+ int error;
+ struct xfs_trans *tp = *tpp;
+
+ do {
+ error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, ip->i_ino,
+ NULL);
+ /* success or fatal error */
+ if (error != -EAGAIN)
+ break;
+
+ error = xfs_trans_roll(tpp, NULL);
+ if (error)
+ break;
+ tp = *tpp;
+
+ /*
+ * Redirty both inodes so they can relog and keep the log tail
+ * moving forward.
+ */
+ xfs_trans_ijoin(tp, ip, 0);
+ xfs_trans_ijoin(tp, tmpip, 0);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ xfs_trans_log_inode(tp, tmpip, XFS_ILOG_CORE);
+ } while (true);
+
+ return error;
+}
+
int
xfs_swap_extents(
struct xfs_inode *ip, /* target inode */
@@ -1938,8 +1969,8 @@ xfs_swap_extents(
int error = 0;
int lock_flags;
struct xfs_ifork *cowfp;
- __uint64_t f;
- int resblks;
+ uint64_t f;
+ int resblks = 0;

/*
* Lock the inodes against other IO, page faults and truncate to
@@ -1987,11 +2018,8 @@ xfs_swap_extents(
XFS_SWAP_RMAP_SPACE_RES(mp,
XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK),
XFS_DATA_FORK);
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks,
- 0, 0, &tp);
- } else
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0,
- 0, 0, &tp);
+ }
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
if (error)
goto out_unlock;

@@ -2076,6 +2104,23 @@ xfs_swap_extents(
xfs_trans_log_inode(tp, ip, src_log_flags);
xfs_trans_log_inode(tp, tip, target_log_flags);

+ /*
+ * The extent forks have been swapped, but crc=1,rmapbt=0 filesystems
+ * have inode number owner values in the bmbt blocks that still refer to
+ * the old inode. Scan each bmbt to fix up the owner values with the
+ * inode number of the current inode.
+ */
+ if (src_log_flags & XFS_ILOG_DOWNER) {
+ error = xfs_swap_change_owner(&tp, ip, tip);
+ if (error)
+ goto out_trans_cancel;
+ }
+ if (target_log_flags & XFS_ILOG_DOWNER) {
+ error = xfs_swap_change_owner(&tp, tip, ip);
+ if (error)
+ goto out_trans_cancel;
+ }
+
/*
* If this is a synchronous mount, make sure that the
* transaction goes to disk before returning to the user.
1396 | diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c |
1397 | index 16269271ebd6..eca7baecc9f0 100644 |
1398 | --- a/fs/xfs/xfs_buf.c |
1399 | +++ b/fs/xfs/xfs_buf.c |
1400 | @@ -116,7 +116,7 @@ static inline void |
1401 | __xfs_buf_ioacct_dec( |
1402 | struct xfs_buf *bp) |
1403 | { |
1404 | - ASSERT(spin_is_locked(&bp->b_lock)); |
1405 | + lockdep_assert_held(&bp->b_lock); |
1406 | |
1407 | if (bp->b_state & XFS_BSTATE_IN_FLIGHT) { |
1408 | bp->b_state &= ~XFS_BSTATE_IN_FLIGHT; |
1409 | @@ -2022,6 +2022,66 @@ xfs_buf_delwri_submit( |
1410 | return error; |
1411 | } |
1412 | |
1413 | +/* |
1414 | + * Push a single buffer on a delwri queue. |
1415 | + * |
1416 | + * The purpose of this function is to submit a single buffer of a delwri queue |
1417 | + * and return with the buffer still on the original queue. The waiting delwri |
1418 | + * buffer submission infrastructure guarantees transfer of the delwri queue |
1419 | + * buffer reference to a temporary wait list. We reuse this infrastructure to |
1420 | + * transfer the buffer back to the original queue. |
1421 | + * |
1422 | + * Note the buffer transitions from the queued state to the submitted and wait |
1423 | + * listed state and back to the queued state during this call. The buffer |
1424 | + * locking and queue management logic between _delwri_pushbuf() and |
1425 | + * _delwri_queue() guarantee that the buffer cannot be queued to another list |
1426 | + * before returning. |
1427 | + */ |
1428 | +int |
1429 | +xfs_buf_delwri_pushbuf( |
1430 | + struct xfs_buf *bp, |
1431 | + struct list_head *buffer_list) |
1432 | +{ |
1433 | + LIST_HEAD (submit_list); |
1434 | + int error; |
1435 | + |
1436 | + ASSERT(bp->b_flags & _XBF_DELWRI_Q); |
1437 | + |
1438 | + trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_); |
1439 | + |
1440 | + /* |
1441 | + * Isolate the buffer to a new local list so we can submit it for I/O |
1442 | + * independently from the rest of the original list. |
1443 | + */ |
1444 | + xfs_buf_lock(bp); |
1445 | + list_move(&bp->b_list, &submit_list); |
1446 | + xfs_buf_unlock(bp); |
1447 | + |
1448 | + /* |
1449 | + * Delwri submission clears the DELWRI_Q buffer flag and returns with |
1450 | + * the buffer on the wait list with an associated reference. Rather than |
1451 | + * bounce the buffer from a local wait list back to the original list |
1452 | + * after I/O completion, reuse the original list as the wait list. |
1453 | + */ |
1454 | + xfs_buf_delwri_submit_buffers(&submit_list, buffer_list); |
1455 | + |
1456 | + /* |
1457 | + * The buffer is now under I/O and wait listed, as during typical delwri |
1458 | + * submission. Lock the buffer to wait for I/O completion. Rather than |
1459 | + * remove the buffer from the wait list and release the reference, we |
1460 | + * want to return with the buffer queued to the original list. The |
1461 | + * buffer already sits on the original list with a wait list reference, |
1462 | + * however. If we let the queue inherit that wait list reference, all we |
1463 | + * need to do is reset the DELWRI_Q flag. |
1464 | + */ |
1465 | + xfs_buf_lock(bp); |
1466 | + error = bp->b_error; |
1467 | + bp->b_flags |= _XBF_DELWRI_Q; |
1468 | + xfs_buf_unlock(bp); |
1469 | + |
1470 | + return error; |
1471 | +} |
1472 | + |
1473 | int __init |
1474 | xfs_buf_init(void) |
1475 | { |
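The calling pattern xfs_buf_delwri_pushbuf() is built for appears in the quotacheck hunk further down (xfs_qm_flush_one): look up the buffer that the local delwri queue holds pinned in-core, push it in place to cycle its lock, and leave it queued for the final submission pass. A minimal sketch of that pattern, assuming the buffer was queued earlier with xfs_buf_delwri_queue(); mp, blkno, numblks and buffer_list are placeholders:

	struct xfs_buf	*bp;
	DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);

	/* the buffer is pinned in-core by the delwri queue, so _find succeeds */
	bp = _xfs_buf_find(mp->m_ddev_targp, &map, 1, 0, NULL);
	if (!bp)
		return -EINVAL;
	xfs_buf_unlock(bp);			/* _xfs_buf_find() returns it locked */

	xfs_buf_delwri_pushbuf(bp, buffer_list);	/* write it; bp stays queued */
	xfs_buf_rele(bp);			/* drop the lookup reference */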
1476 | diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h |
1477 | index ad514a8025dd..f961b19b9cc2 100644 |
1478 | --- a/fs/xfs/xfs_buf.h |
1479 | +++ b/fs/xfs/xfs_buf.h |
1480 | @@ -333,6 +333,7 @@ extern void xfs_buf_delwri_cancel(struct list_head *); |
1481 | extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *); |
1482 | extern int xfs_buf_delwri_submit(struct list_head *); |
1483 | extern int xfs_buf_delwri_submit_nowait(struct list_head *); |
1484 | +extern int xfs_buf_delwri_pushbuf(struct xfs_buf *, struct list_head *); |
1485 | |
1486 | /* Buffer Daemon Setup Routines */ |
1487 | extern int xfs_buf_init(void); |
1488 | diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c |
1489 | index 0306168af332..e0a0af0946f2 100644 |
1490 | --- a/fs/xfs/xfs_buf_item.c |
1491 | +++ b/fs/xfs/xfs_buf_item.c |
1492 | @@ -29,6 +29,7 @@ |
1493 | #include "xfs_error.h" |
1494 | #include "xfs_trace.h" |
1495 | #include "xfs_log.h" |
1496 | +#include "xfs_inode.h" |
1497 | |
1498 | |
1499 | kmem_zone_t *xfs_buf_item_zone; |
1500 | @@ -322,6 +323,8 @@ xfs_buf_item_format( |
1501 | ASSERT((bip->bli_flags & XFS_BLI_STALE) || |
1502 | (xfs_blft_from_flags(&bip->__bli_format) > XFS_BLFT_UNKNOWN_BUF |
1503 | && xfs_blft_from_flags(&bip->__bli_format) < XFS_BLFT_MAX_BUF)); |
1504 | + ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED) || |
1505 | + (bip->bli_flags & XFS_BLI_STALE)); |
1506 | |
1507 | |
1508 | /* |
1509 | @@ -346,16 +349,6 @@ xfs_buf_item_format( |
1510 | bip->bli_flags &= ~XFS_BLI_INODE_BUF; |
1511 | } |
1512 | |
1513 | - if ((bip->bli_flags & (XFS_BLI_ORDERED|XFS_BLI_STALE)) == |
1514 | - XFS_BLI_ORDERED) { |
1515 | - /* |
1516 | - * The buffer has been logged just to order it. It is not being |
1517 | - * included in the transaction commit, so don't format it. |
1518 | - */ |
1519 | - trace_xfs_buf_item_format_ordered(bip); |
1520 | - return; |
1521 | - } |
1522 | - |
1523 | for (i = 0; i < bip->bli_format_count; i++) { |
1524 | xfs_buf_item_format_segment(bip, lv, &vecp, offset, |
1525 | &bip->bli_formats[i]); |
1526 | @@ -574,26 +567,20 @@ xfs_buf_item_unlock( |
1527 | { |
1528 | struct xfs_buf_log_item *bip = BUF_ITEM(lip); |
1529 | struct xfs_buf *bp = bip->bli_buf; |
1530 | - bool clean; |
1531 | - bool aborted; |
1532 | - int flags; |
1533 | + bool aborted = !!(lip->li_flags & XFS_LI_ABORTED); |
1534 | + bool hold = !!(bip->bli_flags & XFS_BLI_HOLD); |
1535 | + bool dirty = !!(bip->bli_flags & XFS_BLI_DIRTY); |
1536 | +#if defined(DEBUG) || defined(XFS_WARN) |
1537 | + bool ordered = !!(bip->bli_flags & XFS_BLI_ORDERED); |
1538 | +#endif |
1539 | |
1540 | /* Clear the buffer's association with this transaction. */ |
1541 | bp->b_transp = NULL; |
1542 | |
1543 | /* |
1544 | - * If this is a transaction abort, don't return early. Instead, allow |
1545 | - * the brelse to happen. Normally it would be done for stale |
1546 | - * (cancelled) buffers at unpin time, but we'll never go through the |
1547 | - * pin/unpin cycle if we abort inside commit. |
1548 | - */ |
1549 | - aborted = (lip->li_flags & XFS_LI_ABORTED) ? true : false; |
1550 | - /* |
1551 | - * Before possibly freeing the buf item, copy the per-transaction state |
1552 | - * so we can reference it safely later after clearing it from the |
1553 | - * buffer log item. |
1554 | + * The per-transaction state has been copied above so clear it from the |
1555 | + * bli. |
1556 | */ |
1557 | - flags = bip->bli_flags; |
1558 | bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED); |
1559 | |
1560 | /* |
1561 | @@ -601,7 +588,7 @@ xfs_buf_item_unlock( |
1562 | * unlock the buffer and free the buf item when the buffer is unpinned |
1563 | * for the last time. |
1564 | */ |
1565 | - if (flags & XFS_BLI_STALE) { |
1566 | + if (bip->bli_flags & XFS_BLI_STALE) { |
1567 | trace_xfs_buf_item_unlock_stale(bip); |
1568 | ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); |
1569 | if (!aborted) { |
1570 | @@ -619,40 +606,34 @@ xfs_buf_item_unlock( |
1571 | * regardless of whether it is dirty or not. A dirty abort implies a |
1572 | * shutdown, anyway. |
1573 | * |
1574 | - * Ordered buffers are dirty but may have no recorded changes, so ensure |
1575 | - * we only release clean items here. |
1576 | + * The bli dirty state should match whether the blf has logged segments |
1577 | + * except for ordered buffers, where only the bli should be dirty. |
1578 | */ |
1579 | - clean = (flags & XFS_BLI_DIRTY) ? false : true; |
1580 | - if (clean) { |
1581 | - int i; |
1582 | - for (i = 0; i < bip->bli_format_count; i++) { |
1583 | - if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map, |
1584 | - bip->bli_formats[i].blf_map_size)) { |
1585 | - clean = false; |
1586 | - break; |
1587 | - } |
1588 | - } |
1589 | - } |
1590 | + ASSERT((!ordered && dirty == xfs_buf_item_dirty_format(bip)) || |
1591 | + (ordered && dirty && !xfs_buf_item_dirty_format(bip))); |
1592 | |
1593 | /* |
1594 | * Clean buffers, by definition, cannot be in the AIL. However, aborted |
1595 | - * buffers may be dirty and hence in the AIL. Therefore if we are |
1596 | - * aborting a buffer and we've just taken the last refernce away, we |
1597 | - * have to check if it is in the AIL before freeing it. We need to free |
1598 | - * it in this case, because an aborted transaction has already shut the |
1599 | - * filesystem down and this is the last chance we will have to do so. |
1600 | + * buffers may be in the AIL regardless of dirty state. An aborted |
1601 | + * transaction that invalidates a buffer already in the AIL may have |
1602 | + * marked it stale and cleared the dirty state, for example. |
1603 | + * |
1604 | + * Therefore if we are aborting a buffer and we've just taken the last |
1605 | + * reference away, we have to check if it is in the AIL before freeing |
1606 | + * it. We need to free it in this case, because an aborted transaction |
1607 | + * has already shut the filesystem down and this is the last chance we |
1608 | + * will have to do so. |
1609 | */ |
1610 | if (atomic_dec_and_test(&bip->bli_refcount)) { |
1611 | - if (clean) |
1612 | - xfs_buf_item_relse(bp); |
1613 | - else if (aborted) { |
1614 | + if (aborted) { |
1615 | ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp)); |
1616 | xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR); |
1617 | xfs_buf_item_relse(bp); |
1618 | - } |
1619 | + } else if (!dirty) |
1620 | + xfs_buf_item_relse(bp); |
1621 | } |
1622 | |
1623 | - if (!(flags & XFS_BLI_HOLD)) |
1624 | + if (!hold) |
1625 | xfs_buf_relse(bp); |
1626 | } |
1627 | |
1628 | @@ -942,14 +923,22 @@ xfs_buf_item_log( |
1629 | |
1630 | |
1631 | /* |
1632 | - * Return 1 if the buffer has been logged or ordered in a transaction (at any |
1633 | - * point, not just the current transaction) and 0 if not. |
1634 | + * Return true if the buffer has any ranges logged/dirtied by a transaction, |
1635 | + * false otherwise. |
1636 | */ |
1637 | -uint |
1638 | -xfs_buf_item_dirty( |
1639 | - xfs_buf_log_item_t *bip) |
1640 | +bool |
1641 | +xfs_buf_item_dirty_format( |
1642 | + struct xfs_buf_log_item *bip) |
1643 | { |
1644 | - return (bip->bli_flags & XFS_BLI_DIRTY); |
1645 | + int i; |
1646 | + |
1647 | + for (i = 0; i < bip->bli_format_count; i++) { |
1648 | + if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map, |
1649 | + bip->bli_formats[i].blf_map_size)) |
1650 | + return true; |
1651 | + } |
1652 | + |
1653 | + return false; |
1654 | } |
1655 | |
1656 | STATIC void |
1657 | @@ -1051,6 +1040,31 @@ xfs_buf_do_callbacks( |
1658 | } |
1659 | } |
1660 | |
1661 | +/* |
1662 | + * Invoke the error state callback for each log item affected by the failed I/O. |
1663 | + * |
1664 | + * If a metadata buffer write fails with a non-permanent error, the buffer is |
1665 | + * eventually resubmitted and so the completion callbacks are not run. The error |
1666 | + * state may need to be propagated to the log items attached to the buffer, |
1667 | + * however, so the next AIL push of the item knows how to handle it correctly. |
1668 | + */ |
1669 | +STATIC void |
1670 | +xfs_buf_do_callbacks_fail( |
1671 | + struct xfs_buf *bp) |
1672 | +{ |
1673 | + struct xfs_log_item *next; |
1674 | + struct xfs_log_item *lip = bp->b_fspriv; |
1675 | + struct xfs_ail *ailp = lip->li_ailp; |
1676 | + |
1677 | + spin_lock(&ailp->xa_lock); |
1678 | + for (; lip; lip = next) { |
1679 | + next = lip->li_bio_list; |
1680 | + if (lip->li_ops->iop_error) |
1681 | + lip->li_ops->iop_error(lip, bp); |
1682 | + } |
1683 | + spin_unlock(&ailp->xa_lock); |
1684 | +} |
1685 | + |
1686 | static bool |
1687 | xfs_buf_iodone_callback_error( |
1688 | struct xfs_buf *bp) |
1689 | @@ -1120,7 +1134,11 @@ xfs_buf_iodone_callback_error( |
1690 | if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount) |
1691 | goto permanent_error; |
1692 | |
1693 | - /* still a transient error, higher layers will retry */ |
1694 | + /* |
1695 | + * Still a transient error, run IO completion failure callbacks and let |
1696 | + * the higher layers retry the buffer. |
1697 | + */ |
1698 | + xfs_buf_do_callbacks_fail(bp); |
1699 | xfs_buf_ioerror(bp, 0); |
1700 | xfs_buf_relse(bp); |
1701 | return true; |
1702 | @@ -1201,3 +1219,31 @@ xfs_buf_iodone( |
1703 | xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE); |
1704 | xfs_buf_item_free(BUF_ITEM(lip)); |
1705 | } |
1706 | + |
1707 | +/* |
1708 | + * Requeue a failed buffer for writeback |
1709 | + * |
1710 | + * Return true if the buffer has been re-queued properly, false otherwise |
1711 | + */ |
1712 | +bool |
1713 | +xfs_buf_resubmit_failed_buffers( |
1714 | + struct xfs_buf *bp, |
1715 | + struct xfs_log_item *lip, |
1716 | + struct list_head *buffer_list) |
1717 | +{ |
1718 | + struct xfs_log_item *next; |
1719 | + |
1720 | + /* |
1721 | + * Clear XFS_LI_FAILED flag from all items before resubmit |
1722 | + * |
1723 | + * XFS_LI_FAILED set/clear is protected by xa_lock; the caller of |
1724 | + * this function already has it acquired |
1725 | + */ |
1726 | + for (; lip; lip = next) { |
1727 | + next = lip->li_bio_list; |
1728 | + xfs_clear_li_failed(lip); |
1729 | + } |
1730 | + |
1731 | + /* Add this buffer back to the delayed write list */ |
1732 | + return xfs_buf_delwri_queue(bp, buffer_list); |
1733 | +} |
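xfs_set_li_failed() and xfs_clear_li_failed() are used above but defined elsewhere in this series (in xfs_trans_priv.h, outside this excerpt). Given the li_buf pointer added to xfs_log_item further down and the xa_lock rule stated in the comment above, a sketch of the pairing they are expected to implement — setting the flag takes a hold on the buffer and caches it, clearing drops both:

	static inline void
	xfs_set_li_failed(
		struct xfs_log_item	*lip,
		struct xfs_buf		*bp)
	{
		lockdep_assert_held(&lip->li_ailp->xa_lock);

		if (!(lip->li_flags & XFS_LI_FAILED)) {
			xfs_buf_hold(bp);	/* keep bp around for the resubmit */
			lip->li_flags |= XFS_LI_FAILED;
			lip->li_buf = bp;
		}
	}

	static inline void
	xfs_clear_li_failed(
		struct xfs_log_item	*lip)
	{
		struct xfs_buf	*bp = lip->li_buf;

		lockdep_assert_held(&lip->li_ailp->xa_lock);

		if (lip->li_flags & XFS_LI_FAILED) {
			lip->li_flags &= ~XFS_LI_FAILED;
			lip->li_buf = NULL;
			xfs_buf_rele(bp);	/* drop the hold taken at set time */
		}
	}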
1734 | diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h |
1735 | index f7eba99d19dd..9690ce62c9a7 100644 |
1736 | --- a/fs/xfs/xfs_buf_item.h |
1737 | +++ b/fs/xfs/xfs_buf_item.h |
1738 | @@ -64,12 +64,15 @@ typedef struct xfs_buf_log_item { |
1739 | int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); |
1740 | void xfs_buf_item_relse(struct xfs_buf *); |
1741 | void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint); |
1742 | -uint xfs_buf_item_dirty(xfs_buf_log_item_t *); |
1743 | +bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *); |
1744 | void xfs_buf_attach_iodone(struct xfs_buf *, |
1745 | void(*)(struct xfs_buf *, xfs_log_item_t *), |
1746 | xfs_log_item_t *); |
1747 | void xfs_buf_iodone_callbacks(struct xfs_buf *); |
1748 | void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *); |
1749 | +bool xfs_buf_resubmit_failed_buffers(struct xfs_buf *, |
1750 | + struct xfs_log_item *, |
1751 | + struct list_head *); |
1752 | |
1753 | extern kmem_zone_t *xfs_buf_item_zone; |
1754 | |
1755 | diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c |
1756 | index df206cfc21f7..586b398f268d 100644 |
1757 | --- a/fs/xfs/xfs_file.c |
1758 | +++ b/fs/xfs/xfs_file.c |
1759 | @@ -729,6 +729,7 @@ xfs_file_buffered_aio_write( |
1760 | xfs_rw_iunlock(ip, iolock); |
1761 | eofb.eof_flags = XFS_EOF_FLAGS_SYNC; |
1762 | xfs_icache_free_eofblocks(ip->i_mount, &eofb); |
1763 | + xfs_icache_free_cowblocks(ip->i_mount, &eofb); |
1764 | goto write_retry; |
1765 | } |
1766 | |
1767 | @@ -1139,29 +1140,8 @@ xfs_find_get_desired_pgoff( |
1768 | want = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1; |
1769 | nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, |
1770 | want); |
1771 | - /* |
1772 | - * No page mapped into given range. If we are searching holes |
1773 | - * and if this is the first time we got into the loop, it means |
1774 | - * that the given offset is landed in a hole, return it. |
1775 | - * |
1776 | - * If we have already stepped through some block buffers to find |
1777 | - * holes but they all contains data. In this case, the last |
1778 | - * offset is already updated and pointed to the end of the last |
1779 | - * mapped page, if it does not reach the endpoint to search, |
1780 | - * that means there should be a hole between them. |
1781 | - */ |
1782 | - if (nr_pages == 0) { |
1783 | - /* Data search found nothing */ |
1784 | - if (type == DATA_OFF) |
1785 | - break; |
1786 | - |
1787 | - ASSERT(type == HOLE_OFF); |
1788 | - if (lastoff == startoff || lastoff < endoff) { |
1789 | - found = true; |
1790 | - *offset = lastoff; |
1791 | - } |
1792 | + if (nr_pages == 0) |
1793 | break; |
1794 | - } |
1795 | |
1796 | for (i = 0; i < nr_pages; i++) { |
1797 | struct page *page = pvec.pages[i]; |
1798 | @@ -1227,21 +1207,20 @@ xfs_find_get_desired_pgoff( |
1799 | |
1800 | /* |
1801 | * The number of returned pages less than our desired, search |
1802 | - * done. In this case, nothing was found for searching data, |
1803 | - * but we found a hole behind the last offset. |
1804 | + * done. |
1805 | */ |
1806 | - if (nr_pages < want) { |
1807 | - if (type == HOLE_OFF) { |
1808 | - *offset = lastoff; |
1809 | - found = true; |
1810 | - } |
1811 | + if (nr_pages < want) |
1812 | break; |
1813 | - } |
1814 | |
1815 | index = pvec.pages[i - 1]->index + 1; |
1816 | pagevec_release(&pvec); |
1817 | } while (index <= end); |
1818 | |
1819 | + /* No page at lastoff and we are not done - we found a hole. */ |
1820 | + if (type == HOLE_OFF && lastoff < endoff) { |
1821 | + *offset = lastoff; |
1822 | + found = true; |
1823 | + } |
1824 | out: |
1825 | pagevec_release(&pvec); |
1826 | return found; |
1827 | diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c |
1828 | index 74304b6ce84b..86a4911520cc 100644 |
1829 | --- a/fs/xfs/xfs_icache.c |
1830 | +++ b/fs/xfs/xfs_icache.c |
1831 | @@ -66,7 +66,6 @@ xfs_inode_alloc( |
1832 | |
1833 | XFS_STATS_INC(mp, vn_active); |
1834 | ASSERT(atomic_read(&ip->i_pincount) == 0); |
1835 | - ASSERT(!spin_is_locked(&ip->i_flags_lock)); |
1836 | ASSERT(!xfs_isiflocked(ip)); |
1837 | ASSERT(ip->i_ino == 0); |
1838 | |
1839 | @@ -192,7 +191,7 @@ xfs_perag_set_reclaim_tag( |
1840 | { |
1841 | struct xfs_mount *mp = pag->pag_mount; |
1842 | |
1843 | - ASSERT(spin_is_locked(&pag->pag_ici_lock)); |
1844 | + lockdep_assert_held(&pag->pag_ici_lock); |
1845 | if (pag->pag_ici_reclaimable++) |
1846 | return; |
1847 | |
1848 | @@ -214,7 +213,7 @@ xfs_perag_clear_reclaim_tag( |
1849 | { |
1850 | struct xfs_mount *mp = pag->pag_mount; |
1851 | |
1852 | - ASSERT(spin_is_locked(&pag->pag_ici_lock)); |
1853 | + lockdep_assert_held(&pag->pag_ici_lock); |
1854 | if (--pag->pag_ici_reclaimable) |
1855 | return; |
1856 | |
1857 | @@ -1079,11 +1078,11 @@ xfs_reclaim_inode( |
1858 | * Because we use RCU freeing we need to ensure the inode always appears |
1859 | * to be reclaimed with an invalid inode number when in the free state. |
1860 | * We do this as early as possible under the ILOCK so that |
1861 | - * xfs_iflush_cluster() can be guaranteed to detect races with us here. |
1862 | - * By doing this, we guarantee that once xfs_iflush_cluster has locked |
1863 | - * XFS_ILOCK that it will see either a valid, flushable inode that will |
1864 | - * serialise correctly, or it will see a clean (and invalid) inode that |
1865 | - * it can skip. |
1866 | + * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to |
1867 | + * detect races with us here. By doing this, we guarantee that once |
1868 | + * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that |
1869 | + * it will see either a valid inode that will serialise correctly, or it |
1870 | + * will see an invalid inode that it can skip. |
1871 | */ |
1872 | spin_lock(&ip->i_flags_lock); |
1873 | ip->i_flags = XFS_IRECLAIM; |
1874 | diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c |
1875 | index 7a0b4eeb99e4..9e795ab08a53 100644 |
1876 | --- a/fs/xfs/xfs_inode.c |
1877 | +++ b/fs/xfs/xfs_inode.c |
1878 | @@ -881,7 +881,6 @@ xfs_ialloc( |
1879 | case S_IFREG: |
1880 | case S_IFDIR: |
1881 | if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { |
1882 | - uint64_t di_flags2 = 0; |
1883 | uint di_flags = 0; |
1884 | |
1885 | if (S_ISDIR(mode)) { |
1886 | @@ -918,20 +917,23 @@ xfs_ialloc( |
1887 | di_flags |= XFS_DIFLAG_NODEFRAG; |
1888 | if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM) |
1889 | di_flags |= XFS_DIFLAG_FILESTREAM; |
1890 | - if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) |
1891 | - di_flags2 |= XFS_DIFLAG2_DAX; |
1892 | |
1893 | ip->i_d.di_flags |= di_flags; |
1894 | - ip->i_d.di_flags2 |= di_flags2; |
1895 | } |
1896 | if (pip && |
1897 | (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY) && |
1898 | pip->i_d.di_version == 3 && |
1899 | ip->i_d.di_version == 3) { |
1900 | + uint64_t di_flags2 = 0; |
1901 | + |
1902 | if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) { |
1903 | - ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; |
1904 | + di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; |
1905 | ip->i_d.di_cowextsize = pip->i_d.di_cowextsize; |
1906 | } |
1907 | + if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) |
1908 | + di_flags2 |= XFS_DIFLAG2_DAX; |
1909 | + |
1910 | + ip->i_d.di_flags2 |= di_flags2; |
1911 | } |
1912 | /* FALLTHROUGH */ |
1913 | case S_IFLNK: |
1914 | @@ -2366,11 +2368,24 @@ xfs_ifree_cluster( |
1915 | * already marked stale. If we can't lock it, back off |
1916 | * and retry. |
1917 | */ |
1918 | - if (ip != free_ip && |
1919 | - !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { |
1920 | - rcu_read_unlock(); |
1921 | - delay(1); |
1922 | - goto retry; |
1923 | + if (ip != free_ip) { |
1924 | + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { |
1925 | + rcu_read_unlock(); |
1926 | + delay(1); |
1927 | + goto retry; |
1928 | + } |
1929 | + |
1930 | + /* |
1931 | + * Check the inode number again in case we're |
1932 | + * racing with freeing in xfs_reclaim_inode(). |
1933 | + * See the comments in that function for more |
1934 | + * information as to why the initial check is |
1935 | + * not sufficient. |
1936 | + */ |
1937 | + if (ip->i_ino != inum + i) { |
1938 | + xfs_iunlock(ip, XFS_ILOCK_EXCL); |
1939 | + continue; |
1940 | + } |
1941 | } |
1942 | rcu_read_unlock(); |
1943 | |
1944 | diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c |
1945 | index d90e7811ccdd..94915747042c 100644 |
1946 | --- a/fs/xfs/xfs_inode_item.c |
1947 | +++ b/fs/xfs/xfs_inode_item.c |
1948 | @@ -27,6 +27,7 @@ |
1949 | #include "xfs_error.h" |
1950 | #include "xfs_trace.h" |
1951 | #include "xfs_trans_priv.h" |
1952 | +#include "xfs_buf_item.h" |
1953 | #include "xfs_log.h" |
1954 | |
1955 | |
1956 | @@ -475,6 +476,23 @@ xfs_inode_item_unpin( |
1957 | wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT); |
1958 | } |
1959 | |
1960 | +/* |
1961 | + * Callback used to mark a buffer with XFS_LI_FAILED when items in the buffer |
1962 | + * have been failed during writeback |
1963 | + * |
1964 | + * This informs the AIL that the inode is already flush locked on the next push, |
1965 | + * and acquires a hold on the buffer to ensure that it isn't reclaimed before |
1966 | + * dirty data makes it to disk. |
1967 | + */ |
1968 | +STATIC void |
1969 | +xfs_inode_item_error( |
1970 | + struct xfs_log_item *lip, |
1971 | + struct xfs_buf *bp) |
1972 | +{ |
1973 | + ASSERT(xfs_isiflocked(INODE_ITEM(lip)->ili_inode)); |
1974 | + xfs_set_li_failed(lip, bp); |
1975 | +} |
1976 | + |
1977 | STATIC uint |
1978 | xfs_inode_item_push( |
1979 | struct xfs_log_item *lip, |
1980 | @@ -484,13 +502,28 @@ xfs_inode_item_push( |
1981 | { |
1982 | struct xfs_inode_log_item *iip = INODE_ITEM(lip); |
1983 | struct xfs_inode *ip = iip->ili_inode; |
1984 | - struct xfs_buf *bp = NULL; |
1985 | + struct xfs_buf *bp = lip->li_buf; |
1986 | uint rval = XFS_ITEM_SUCCESS; |
1987 | int error; |
1988 | |
1989 | if (xfs_ipincount(ip) > 0) |
1990 | return XFS_ITEM_PINNED; |
1991 | |
1992 | + /* |
1993 | + * The buffer containing this item failed to be written back |
1994 | + * previously. Resubmit the buffer for IO. |
1995 | + */ |
1996 | + if (lip->li_flags & XFS_LI_FAILED) { |
1997 | + if (!xfs_buf_trylock(bp)) |
1998 | + return XFS_ITEM_LOCKED; |
1999 | + |
2000 | + if (!xfs_buf_resubmit_failed_buffers(bp, lip, buffer_list)) |
2001 | + rval = XFS_ITEM_FLUSHING; |
2002 | + |
2003 | + xfs_buf_unlock(bp); |
2004 | + return rval; |
2005 | + } |
2006 | + |
2007 | if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) |
2008 | return XFS_ITEM_LOCKED; |
2009 | |
2010 | @@ -622,7 +655,8 @@ static const struct xfs_item_ops xfs_inode_item_ops = { |
2011 | .iop_unlock = xfs_inode_item_unlock, |
2012 | .iop_committed = xfs_inode_item_committed, |
2013 | .iop_push = xfs_inode_item_push, |
2014 | - .iop_committing = xfs_inode_item_committing |
2015 | + .iop_committing = xfs_inode_item_committing, |
2016 | + .iop_error = xfs_inode_item_error |
2017 | }; |
2018 | |
2019 | |
2020 | @@ -710,7 +744,8 @@ xfs_iflush_done( |
2021 | * the AIL lock. |
2022 | */ |
2023 | iip = INODE_ITEM(blip); |
2024 | - if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) |
2025 | + if ((iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) || |
2026 | + lip->li_flags & XFS_LI_FAILED) |
2027 | need_ail++; |
2028 | |
2029 | blip = next; |
2030 | @@ -718,7 +753,8 @@ xfs_iflush_done( |
2031 | |
2032 | /* make sure we capture the state of the initial inode. */ |
2033 | iip = INODE_ITEM(lip); |
2034 | - if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) |
2035 | + if ((iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) || |
2036 | + lip->li_flags & XFS_LI_FAILED) |
2037 | need_ail++; |
2038 | |
2039 | /* |
2040 | @@ -731,22 +767,30 @@ xfs_iflush_done( |
2041 | * holding the lock before removing the inode from the AIL. |
2042 | */ |
2043 | if (need_ail) { |
2044 | - struct xfs_log_item *log_items[need_ail]; |
2045 | - int i = 0; |
2046 | + bool mlip_changed = false; |
2047 | + |
2048 | + /* this is an opencoded batch version of xfs_trans_ail_delete */ |
2049 | spin_lock(&ailp->xa_lock); |
2050 | for (blip = lip; blip; blip = blip->li_bio_list) { |
2051 | - iip = INODE_ITEM(blip); |
2052 | - if (iip->ili_logged && |
2053 | - blip->li_lsn == iip->ili_flush_lsn) { |
2054 | - log_items[i++] = blip; |
2055 | + if (INODE_ITEM(blip)->ili_logged && |
2056 | + blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn) |
2057 | + mlip_changed |= xfs_ail_delete_one(ailp, blip); |
2058 | + else { |
2059 | + xfs_clear_li_failed(blip); |
2060 | } |
2061 | - ASSERT(i <= need_ail); |
2062 | } |
2063 | - /* xfs_trans_ail_delete_bulk() drops the AIL lock. */ |
2064 | - xfs_trans_ail_delete_bulk(ailp, log_items, i, |
2065 | - SHUTDOWN_CORRUPT_INCORE); |
2066 | - } |
2067 | |
2068 | + if (mlip_changed) { |
2069 | + if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount)) |
2070 | + xlog_assign_tail_lsn_locked(ailp->xa_mount); |
2071 | + if (list_empty(&ailp->xa_ail)) |
2072 | + wake_up_all(&ailp->xa_empty); |
2073 | + } |
2074 | + spin_unlock(&ailp->xa_lock); |
2075 | + |
2076 | + if (mlip_changed) |
2077 | + xfs_log_space_wake(ailp->xa_mount); |
2078 | + } |
2079 | |
2080 | /* |
2081 | * clean up and unlock the flush lock now we are done. We can clear the |
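Taken together, the buffer-item and inode-item hunks implement one round trip for a transiently failed metadata write; a condensed view of the control flow they add (all names are from the hunks in this patch):

	xfs_buf_iodone_callback_error()      /* transient write error */
	    -> xfs_buf_do_callbacks_fail()   /* walk the b_fspriv item list */
	        -> lip->li_ops->iop_error()  /* xfs_inode_item_error() here */
	            -> xfs_set_li_failed()   /* flag the item, cache bp in li_buf */

	xfs_inode_item_push()                /* next AIL push sees XFS_LI_FAILED */
	    -> xfs_buf_resubmit_failed_buffers()
	        -> xfs_clear_li_failed()     /* per item, under xa_lock */
	        -> xfs_buf_delwri_queue()    /* requeue bp for writeback */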
2082 | diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c |
2083 | index 73cfc7179124..bce2e260f55e 100644 |
2084 | --- a/fs/xfs/xfs_ioctl.c |
2085 | +++ b/fs/xfs/xfs_ioctl.c |
2086 | @@ -928,16 +928,15 @@ xfs_ioc_fsgetxattr( |
2087 | return 0; |
2088 | } |
2089 | |
2090 | -STATIC void |
2091 | -xfs_set_diflags( |
2092 | +STATIC uint16_t |
2093 | +xfs_flags2diflags( |
2094 | struct xfs_inode *ip, |
2095 | unsigned int xflags) |
2096 | { |
2097 | - unsigned int di_flags; |
2098 | - uint64_t di_flags2; |
2099 | - |
2100 | /* can't set PREALLOC this way, just preserve it */ |
2101 | - di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC); |
2102 | + uint16_t di_flags = |
2103 | + (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC); |
2104 | + |
2105 | if (xflags & FS_XFLAG_IMMUTABLE) |
2106 | di_flags |= XFS_DIFLAG_IMMUTABLE; |
2107 | if (xflags & FS_XFLAG_APPEND) |
2108 | @@ -967,19 +966,24 @@ xfs_set_diflags( |
2109 | if (xflags & FS_XFLAG_EXTSIZE) |
2110 | di_flags |= XFS_DIFLAG_EXTSIZE; |
2111 | } |
2112 | - ip->i_d.di_flags = di_flags; |
2113 | |
2114 | - /* diflags2 only valid for v3 inodes. */ |
2115 | - if (ip->i_d.di_version < 3) |
2116 | - return; |
2117 | + return di_flags; |
2118 | +} |
2119 | + |
2120 | +STATIC uint64_t |
2121 | +xfs_flags2diflags2( |
2122 | + struct xfs_inode *ip, |
2123 | + unsigned int xflags) |
2124 | +{ |
2125 | + uint64_t di_flags2 = |
2126 | + (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK); |
2127 | |
2128 | - di_flags2 = (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK); |
2129 | if (xflags & FS_XFLAG_DAX) |
2130 | di_flags2 |= XFS_DIFLAG2_DAX; |
2131 | if (xflags & FS_XFLAG_COWEXTSIZE) |
2132 | di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; |
2133 | |
2134 | - ip->i_d.di_flags2 = di_flags2; |
2135 | + return di_flags2; |
2136 | } |
2137 | |
2138 | STATIC void |
2139 | @@ -1005,11 +1009,12 @@ xfs_diflags_to_linux( |
2140 | inode->i_flags |= S_NOATIME; |
2141 | else |
2142 | inode->i_flags &= ~S_NOATIME; |
2143 | +#if 0 /* disabled until the flag switching races are sorted out */ |
2144 | if (xflags & FS_XFLAG_DAX) |
2145 | inode->i_flags |= S_DAX; |
2146 | else |
2147 | inode->i_flags &= ~S_DAX; |
2148 | - |
2149 | +#endif |
2150 | } |
2151 | |
2152 | static int |
2153 | @@ -1019,6 +1024,7 @@ xfs_ioctl_setattr_xflags( |
2154 | struct fsxattr *fa) |
2155 | { |
2156 | struct xfs_mount *mp = ip->i_mount; |
2157 | + uint64_t di_flags2; |
2158 | |
2159 | /* Can't change realtime flag if any extents are allocated. */ |
2160 | if ((ip->i_d.di_nextents || ip->i_delayed_blks) && |
2161 | @@ -1049,7 +1055,14 @@ xfs_ioctl_setattr_xflags( |
2162 | !capable(CAP_LINUX_IMMUTABLE)) |
2163 | return -EPERM; |
2164 | |
2165 | - xfs_set_diflags(ip, fa->fsx_xflags); |
2166 | + /* diflags2 only valid for v3 inodes. */ |
2167 | + di_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags); |
2168 | + if (di_flags2 && ip->i_d.di_version < 3) |
2169 | + return -EINVAL; |
2170 | + |
2171 | + ip->i_d.di_flags = xfs_flags2diflags(ip, fa->fsx_xflags); |
2172 | + ip->i_d.di_flags2 = di_flags2; |
2173 | + |
2174 | xfs_diflags_to_linux(ip); |
2175 | xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); |
2176 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); |
2177 | diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c |
2178 | index a1247c3c1efb..5b81f7f41b80 100644 |
2179 | --- a/fs/xfs/xfs_iops.c |
2180 | +++ b/fs/xfs/xfs_iops.c |
2181 | @@ -802,7 +802,7 @@ xfs_vn_setattr_nonsize( |
2182 | * Caution: The caller of this function is responsible for calling |
2183 | * setattr_prepare() or otherwise verifying the change is fine. |
2184 | */ |
2185 | -int |
2186 | +STATIC int |
2187 | xfs_setattr_size( |
2188 | struct xfs_inode *ip, |
2189 | struct iattr *iattr) |
2190 | diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c |
2191 | index b57ab34fbf3c..33c9a3aae948 100644 |
2192 | --- a/fs/xfs/xfs_log.c |
2193 | +++ b/fs/xfs/xfs_log.c |
2194 | @@ -743,15 +743,45 @@ xfs_log_mount_finish( |
2195 | struct xfs_mount *mp) |
2196 | { |
2197 | int error = 0; |
2198 | + bool readonly = (mp->m_flags & XFS_MOUNT_RDONLY); |
2199 | |
2200 | if (mp->m_flags & XFS_MOUNT_NORECOVERY) { |
2201 | ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); |
2202 | return 0; |
2203 | + } else if (readonly) { |
2204 | + /* Allow unlinked processing to proceed */ |
2205 | + mp->m_flags &= ~XFS_MOUNT_RDONLY; |
2206 | } |
2207 | |
2208 | + /* |
2209 | + * During the second phase of log recovery, we need iget and |
2210 | + * iput to behave like they do for an active filesystem. |
2211 | + * xfs_fs_drop_inode needs to be able to prevent the deletion |
2212 | + * of inodes before we're done replaying log items on those |
2213 | + * inodes. Turn it off immediately after recovery finishes |
2214 | + * so that we don't leak the quota inodes if subsequent mount |
2215 | + * activities fail. |
2216 | + * |
2217 | + * We let all inodes involved in redo item processing end up on |
2218 | + * the LRU instead of being evicted immediately so that if we do |
2219 | + * something to an unlinked inode, the irele won't cause |
2220 | + * premature truncation and freeing of the inode, which results |
2221 | + * in log recovery failure. We have to evict the unreferenced |
2222 | + * lru inodes after clearing MS_ACTIVE because we don't |
2223 | + * otherwise clean up the lru if there's a subsequent failure in |
2224 | + * xfs_mountfs, which leads to us leaking the inodes if nothing |
2225 | + * else (e.g. quotacheck) references the inodes before the |
2226 | + * mount failure occurs. |
2227 | + */ |
2228 | + mp->m_super->s_flags |= MS_ACTIVE; |
2229 | error = xlog_recover_finish(mp->m_log); |
2230 | if (!error) |
2231 | xfs_log_work_queue(mp); |
2232 | + mp->m_super->s_flags &= ~MS_ACTIVE; |
2233 | + evict_inodes(mp->m_super); |
2234 | + |
2235 | + if (readonly) |
2236 | + mp->m_flags |= XFS_MOUNT_RDONLY; |
2237 | |
2238 | return error; |
2239 | } |
2240 | @@ -801,11 +831,14 @@ xfs_log_unmount_write(xfs_mount_t *mp) |
2241 | int error; |
2242 | |
2243 | /* |
2244 | - * Don't write out unmount record on read-only mounts. |
2245 | + * Don't write out unmount record on norecovery mounts or ro devices. |
2246 | * Or, if we are doing a forced umount (typically because of IO errors). |
2247 | */ |
2248 | - if (mp->m_flags & XFS_MOUNT_RDONLY) |
2249 | + if (mp->m_flags & XFS_MOUNT_NORECOVERY || |
2250 | + xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) { |
2251 | + ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); |
2252 | return 0; |
2253 | + } |
2254 | |
2255 | error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL); |
2256 | ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log))); |
2257 | @@ -3304,8 +3337,6 @@ _xfs_log_force( |
2258 | */ |
2259 | if (iclog->ic_state & XLOG_STATE_IOERROR) |
2260 | return -EIO; |
2261 | - if (log_flushed) |
2262 | - *log_flushed = 1; |
2263 | } else { |
2264 | |
2265 | no_sleep: |
2266 | @@ -3409,8 +3440,6 @@ _xfs_log_force_lsn( |
2267 | |
2268 | xlog_wait(&iclog->ic_prev->ic_write_wait, |
2269 | &log->l_icloglock); |
2270 | - if (log_flushed) |
2271 | - *log_flushed = 1; |
2272 | already_slept = 1; |
2273 | goto try_again; |
2274 | } |
2275 | @@ -3444,9 +3473,6 @@ _xfs_log_force_lsn( |
2276 | */ |
2277 | if (iclog->ic_state & XLOG_STATE_IOERROR) |
2278 | return -EIO; |
2279 | - |
2280 | - if (log_flushed) |
2281 | - *log_flushed = 1; |
2282 | } else { /* just return */ |
2283 | spin_unlock(&log->l_icloglock); |
2284 | } |
2285 | diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c |
2286 | index 9b3d7c76915d..05909269f973 100644 |
2287 | --- a/fs/xfs/xfs_log_recover.c |
2288 | +++ b/fs/xfs/xfs_log_recover.c |
2289 | @@ -1029,61 +1029,106 @@ xlog_seek_logrec_hdr( |
2290 | } |
2291 | |
2292 | /* |
2293 | - * Check the log tail for torn writes. This is required when torn writes are |
2294 | - * detected at the head and the head had to be walked back to a previous record. |
2295 | - * The tail of the previous record must now be verified to ensure the torn |
2296 | - * writes didn't corrupt the previous tail. |
2297 | + * Calculate distance from head to tail (i.e., unused space in the log). |
2298 | + */ |
2299 | +static inline int |
2300 | +xlog_tail_distance( |
2301 | + struct xlog *log, |
2302 | + xfs_daddr_t head_blk, |
2303 | + xfs_daddr_t tail_blk) |
2304 | +{ |
2305 | + if (head_blk < tail_blk) |
2306 | + return tail_blk - head_blk; |
2307 | + |
2308 | + return tail_blk + (log->l_logBBsize - head_blk); |
2309 | +} |
2310 | + |
2311 | +/* |
2312 | + * Verify the log tail. This is particularly important when torn or incomplete |
2313 | + * writes have been detected near the front of the log and the head has been |
2314 | + * walked back accordingly. |
2315 | * |
2316 | - * Return an error if CRC verification fails as recovery cannot proceed. |
2317 | + * We also have to handle the case where the tail was pinned and the head |
2318 | + * blocked behind the tail right before a crash. If the tail had been pushed |
2319 | + * immediately prior to the crash and the subsequent checkpoint was only |
2320 | + * partially written, it's possible it overwrote the last referenced tail in the |
2321 | + * log with garbage. This is not a coherency problem because the tail must have |
2322 | + * been pushed before it can be overwritten, but appears as log corruption to |
2323 | + * recovery because we have no way to know the tail was updated if the |
2324 | + * subsequent checkpoint didn't write successfully. |
2325 | + * |
2326 | + * Therefore, CRC check the log from tail to head. If a failure occurs and the |
2327 | + * offending record is within max iclog bufs from the head, walk the tail |
2328 | + * forward and retry until a valid tail is found or corruption is detected out |
2329 | + * of the range of a possible overwrite. |
2330 | */ |
2331 | STATIC int |
2332 | xlog_verify_tail( |
2333 | struct xlog *log, |
2334 | xfs_daddr_t head_blk, |
2335 | - xfs_daddr_t tail_blk) |
2336 | + xfs_daddr_t *tail_blk, |
2337 | + int hsize) |
2338 | { |
2339 | struct xlog_rec_header *thead; |
2340 | struct xfs_buf *bp; |
2341 | xfs_daddr_t first_bad; |
2342 | - int count; |
2343 | int error = 0; |
2344 | bool wrapped; |
2345 | - xfs_daddr_t tmp_head; |
2346 | + xfs_daddr_t tmp_tail; |
2347 | + xfs_daddr_t orig_tail = *tail_blk; |
2348 | |
2349 | bp = xlog_get_bp(log, 1); |
2350 | if (!bp) |
2351 | return -ENOMEM; |
2352 | |
2353 | /* |
2354 | - * Seek XLOG_MAX_ICLOGS + 1 records past the current tail record to get |
2355 | - * a temporary head block that points after the last possible |
2356 | - * concurrently written record of the tail. |
2357 | + * Make sure the tail points to a record (returns positive count on |
2358 | + * success). |
2359 | */ |
2360 | - count = xlog_seek_logrec_hdr(log, head_blk, tail_blk, |
2361 | - XLOG_MAX_ICLOGS + 1, bp, &tmp_head, &thead, |
2362 | - &wrapped); |
2363 | - if (count < 0) { |
2364 | - error = count; |
2365 | + error = xlog_seek_logrec_hdr(log, head_blk, *tail_blk, 1, bp, |
2366 | + &tmp_tail, &thead, &wrapped); |
2367 | + if (error < 0) |
2368 | goto out; |
2369 | - } |
2370 | - |
2371 | - /* |
2372 | - * If the call above didn't find XLOG_MAX_ICLOGS + 1 records, we ran |
2373 | - * into the actual log head. tmp_head points to the start of the record |
2374 | - * so update it to the actual head block. |
2375 | - */ |
2376 | - if (count < XLOG_MAX_ICLOGS + 1) |
2377 | - tmp_head = head_blk; |
2378 | + if (*tail_blk != tmp_tail) |
2379 | + *tail_blk = tmp_tail; |
2380 | |
2381 | /* |
2382 | - * We now have a tail and temporary head block that covers at least |
2383 | - * XLOG_MAX_ICLOGS records from the tail. We need to verify that these |
2384 | - * records were completely written. Run a CRC verification pass from |
2385 | - * tail to head and return the result. |
2386 | + * Run a CRC check from the tail to the head. We can't just check |
2387 | + * MAX_ICLOGS records past the tail because the tail may point to stale |
2388 | + * blocks cleared during the search for the head/tail. These blocks are |
2389 | + * overwritten with zero-length records and thus record count is not a |
2390 | + * reliable indicator of the iclog state before a crash. |
2391 | */ |
2392 | - error = xlog_do_recovery_pass(log, tmp_head, tail_blk, |
2393 | + first_bad = 0; |
2394 | + error = xlog_do_recovery_pass(log, head_blk, *tail_blk, |
2395 | XLOG_RECOVER_CRCPASS, &first_bad); |
2396 | + while ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) { |
2397 | + int tail_distance; |
2398 | + |
2399 | + /* |
2400 | + * Is corruption within range of the head? If so, retry from |
2401 | + * the next record. Otherwise return an error. |
2402 | + */ |
2403 | + tail_distance = xlog_tail_distance(log, head_blk, first_bad); |
2404 | + if (tail_distance > BTOBB(XLOG_MAX_ICLOGS * hsize)) |
2405 | + break; |
2406 | + |
2407 | + /* skip to the next record; returns positive count on success */ |
2408 | + error = xlog_seek_logrec_hdr(log, head_blk, first_bad, 2, bp, |
2409 | + &tmp_tail, &thead, &wrapped); |
2410 | + if (error < 0) |
2411 | + goto out; |
2412 | + |
2413 | + *tail_blk = tmp_tail; |
2414 | + first_bad = 0; |
2415 | + error = xlog_do_recovery_pass(log, head_blk, *tail_blk, |
2416 | + XLOG_RECOVER_CRCPASS, &first_bad); |
2417 | + } |
2418 | |
2419 | + if (!error && *tail_blk != orig_tail) |
2420 | + xfs_warn(log->l_mp, |
2421 | + "Tail block (0x%llx) overwrite detected. Updated to 0x%llx", |
2422 | + orig_tail, *tail_blk); |
2423 | out: |
2424 | xlog_put_bp(bp); |
2425 | return error; |
2426 | @@ -1143,7 +1188,7 @@ xlog_verify_head( |
2427 | */ |
2428 | error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk, |
2429 | XLOG_RECOVER_CRCPASS, &first_bad); |
2430 | - if (error == -EFSBADCRC) { |
2431 | + if ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) { |
2432 | /* |
2433 | * We've hit a potential torn write. Reset the error and warn |
2434 | * about it. |
2435 | @@ -1183,31 +1228,12 @@ xlog_verify_head( |
2436 | ASSERT(0); |
2437 | return 0; |
2438 | } |
2439 | - |
2440 | - /* |
2441 | - * Now verify the tail based on the updated head. This is |
2442 | - * required because the torn writes trimmed from the head could |
2443 | - * have been written over the tail of a previous record. Return |
2444 | - * any errors since recovery cannot proceed if the tail is |
2445 | - * corrupt. |
2446 | - * |
2447 | - * XXX: This leaves a gap in truly robust protection from torn |
2448 | - * writes in the log. If the head is behind the tail, the tail |
2449 | - * pushes forward to create some space and then a crash occurs |
2450 | - * causing the writes into the previous record's tail region to |
2451 | - * tear, log recovery isn't able to recover. |
2452 | - * |
2453 | - * How likely is this to occur? If possible, can we do something |
2454 | - * more intelligent here? Is it safe to push the tail forward if |
2455 | - * we can determine that the tail is within the range of the |
2456 | - * torn write (e.g., the kernel can only overwrite the tail if |
2457 | - * it has actually been pushed forward)? Alternatively, could we |
2458 | - * somehow prevent this condition at runtime? |
2459 | - */ |
2460 | - error = xlog_verify_tail(log, *head_blk, *tail_blk); |
2461 | } |
2462 | + if (error) |
2463 | + return error; |
2464 | |
2465 | - return error; |
2466 | + return xlog_verify_tail(log, *head_blk, tail_blk, |
2467 | + be32_to_cpu((*rhead)->h_size)); |
2468 | } |
2469 | |
2470 | /* |
2471 | @@ -4152,7 +4178,7 @@ xlog_recover_commit_trans( |
2472 | |
2473 | #define XLOG_RECOVER_COMMIT_QUEUE_MAX 100 |
2474 | |
2475 | - hlist_del(&trans->r_list); |
2476 | + hlist_del_init(&trans->r_list); |
2477 | |
2478 | error = xlog_recover_reorder_trans(log, trans, pass); |
2479 | if (error) |
2480 | @@ -4354,6 +4380,8 @@ xlog_recover_free_trans( |
2481 | xlog_recover_item_t *item, *n; |
2482 | int i; |
2483 | |
2484 | + hlist_del_init(&trans->r_list); |
2485 | + |
2486 | list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) { |
2487 | /* Free the regions in the item. */ |
2488 | list_del(&item->ri_list); |
2489 | @@ -4799,12 +4827,16 @@ xlog_recover_process_intents( |
2490 | int error = 0; |
2491 | struct xfs_ail_cursor cur; |
2492 | struct xfs_ail *ailp; |
2493 | +#if defined(DEBUG) || defined(XFS_WARN) |
2494 | xfs_lsn_t last_lsn; |
2495 | +#endif |
2496 | |
2497 | ailp = log->l_ailp; |
2498 | spin_lock(&ailp->xa_lock); |
2499 | lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); |
2500 | +#if defined(DEBUG) || defined(XFS_WARN) |
2501 | last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block); |
2502 | +#endif |
2503 | while (lip != NULL) { |
2504 | /* |
2505 | * We're done when we see something other than an intent. |
2506 | @@ -5214,7 +5246,7 @@ xlog_do_recovery_pass( |
2507 | xfs_daddr_t *first_bad) /* out: first bad log rec */ |
2508 | { |
2509 | xlog_rec_header_t *rhead; |
2510 | - xfs_daddr_t blk_no; |
2511 | + xfs_daddr_t blk_no, rblk_no; |
2512 | xfs_daddr_t rhead_blk; |
2513 | char *offset; |
2514 | xfs_buf_t *hbp, *dbp; |
2515 | @@ -5222,11 +5254,15 @@ xlog_do_recovery_pass( |
2516 | int error2 = 0; |
2517 | int bblks, split_bblks; |
2518 | int hblks, split_hblks, wrapped_hblks; |
2519 | + int i; |
2520 | struct hlist_head rhash[XLOG_RHASH_SIZE]; |
2521 | LIST_HEAD (buffer_list); |
2522 | |
2523 | ASSERT(head_blk != tail_blk); |
2524 | - rhead_blk = 0; |
2525 | + blk_no = rhead_blk = tail_blk; |
2526 | + |
2527 | + for (i = 0; i < XLOG_RHASH_SIZE; i++) |
2528 | + INIT_HLIST_HEAD(&rhash[i]); |
2529 | |
2530 | /* |
2531 | * Read the header of the tail block and get the iclog buffer size from |
2532 | @@ -5301,7 +5337,6 @@ xlog_do_recovery_pass( |
2533 | } |
2534 | |
2535 | memset(rhash, 0, sizeof(rhash)); |
2536 | - blk_no = rhead_blk = tail_blk; |
2537 | if (tail_blk > head_blk) { |
2538 | /* |
2539 | * Perform recovery around the end of the physical log. |
2540 | @@ -5363,9 +5398,19 @@ xlog_do_recovery_pass( |
2541 | bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); |
2542 | blk_no += hblks; |
2543 | |
2544 | - /* Read in data for log record */ |
2545 | - if (blk_no + bblks <= log->l_logBBsize) { |
2546 | - error = xlog_bread(log, blk_no, bblks, dbp, |
2547 | + /* |
2548 | + * Read the log record data in multiple reads if it |
2549 | + * wraps around the end of the log. Note that if the |
2550 | + * header already wrapped, blk_no could point past the |
2551 | + * end of the log. The record data is contiguous in |
2552 | + * that case. |
2553 | + */ |
2554 | + if (blk_no + bblks <= log->l_logBBsize || |
2555 | + blk_no >= log->l_logBBsize) { |
2556 | + /* mod blk_no in case the header wrapped and |
2557 | + * pushed it beyond the end of the log */ |
2558 | + rblk_no = do_mod(blk_no, log->l_logBBsize); |
2559 | + error = xlog_bread(log, rblk_no, bblks, dbp, |
2560 | &offset); |
2561 | if (error) |
2562 | goto bread_err2; |
2563 | @@ -5464,6 +5509,19 @@ xlog_do_recovery_pass( |
2564 | if (error && first_bad) |
2565 | *first_bad = rhead_blk; |
2566 | |
2567 | + /* |
2568 | + * Transactions are freed at commit time but transactions without commit |
2569 | + * records on disk are never committed. Free any that may be left in the |
2570 | + * hash table. |
2571 | + */ |
2572 | + for (i = 0; i < XLOG_RHASH_SIZE; i++) { |
2573 | + struct hlist_node *tmp; |
2574 | + struct xlog_recover *trans; |
2575 | + |
2576 | + hlist_for_each_entry_safe(trans, tmp, &rhash[i], r_list) |
2577 | + xlog_recover_free_trans(trans); |
2578 | + } |
2579 | + |
2580 | return error ? error : error2; |
2581 | } |
2582 | |
2583 | @@ -5542,6 +5600,8 @@ xlog_do_recover( |
2584 | xfs_buf_t *bp; |
2585 | xfs_sb_t *sbp; |
2586 | |
2587 | + trace_xfs_log_recover(log, head_blk, tail_blk); |
2588 | + |
2589 | /* |
2590 | * First replay the images in the log. |
2591 | */ |
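The new xlog_tail_distance() helper is plain circular-buffer arithmetic over the l_logBBsize basic blocks of the log. A small self-contained model of it (userspace; names invented for the demo):

	#include <assert.h>

	static int
	tail_distance(int log_size, int head_blk, int tail_blk)
	{
		if (head_blk < tail_blk)
			return tail_blk - head_blk;
		return tail_blk + (log_size - head_blk);
	}

	int
	main(void)
	{
		assert(tail_distance(1000, 100, 600) == 500);	/* head behind tail */
		assert(tail_distance(1000, 600, 100) == 500);	/* log has wrapped */
		return 0;
	}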
2592 | diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c |
2593 | index 13796f212f98..d4ce8d277992 100644 |
2594 | --- a/fs/xfs/xfs_mount.c |
2595 | +++ b/fs/xfs/xfs_mount.c |
2596 | @@ -924,15 +924,6 @@ xfs_mountfs( |
2597 | } |
2598 | } |
2599 | |
2600 | - /* |
2601 | - * During the second phase of log recovery, we need iget and |
2602 | - * iput to behave like they do for an active filesystem. |
2603 | - * xfs_fs_drop_inode needs to be able to prevent the deletion |
2604 | - * of inodes before we're done replaying log items on those |
2605 | - * inodes. |
2606 | - */ |
2607 | - mp->m_super->s_flags |= MS_ACTIVE; |
2608 | - |
2609 | /* |
2610 | * Finish recovering the file system. This part needed to be delayed |
2611 | * until after the root and real-time bitmap inodes were consistently |
2612 | @@ -1008,12 +999,13 @@ xfs_mountfs( |
2613 | out_quota: |
2614 | xfs_qm_unmount_quotas(mp); |
2615 | out_rtunmount: |
2616 | - mp->m_super->s_flags &= ~MS_ACTIVE; |
2617 | xfs_rtunmount_inodes(mp); |
2618 | out_rele_rip: |
2619 | IRELE(rip); |
2620 | cancel_delayed_work_sync(&mp->m_reclaim_work); |
2621 | xfs_reclaim_inodes(mp, SYNC_WAIT); |
2622 | + /* Clean out dquots that might be in memory after quotacheck. */ |
2623 | + xfs_qm_unmount(mp); |
2624 | out_log_dealloc: |
2625 | mp->m_flags |= XFS_MOUNT_UNMOUNTING; |
2626 | xfs_log_mount_cancel(mp); |
2627 | diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c |
2628 | index 8b9a9f15f022..1fdd3face2d9 100644 |
2629 | --- a/fs/xfs/xfs_qm.c |
2630 | +++ b/fs/xfs/xfs_qm.c |
2631 | @@ -111,6 +111,9 @@ xfs_qm_dquot_walk( |
2632 | skipped = 0; |
2633 | break; |
2634 | } |
2635 | + /* we're done if id overflows back to zero */ |
2636 | + if (!next_index) |
2637 | + break; |
2638 | } |
2639 | |
2640 | if (skipped) { |
2641 | @@ -1247,6 +1250,7 @@ xfs_qm_flush_one( |
2642 | struct xfs_dquot *dqp, |
2643 | void *data) |
2644 | { |
2645 | + struct xfs_mount *mp = dqp->q_mount; |
2646 | struct list_head *buffer_list = data; |
2647 | struct xfs_buf *bp = NULL; |
2648 | int error = 0; |
2649 | @@ -1257,7 +1261,32 @@ xfs_qm_flush_one( |
2650 | if (!XFS_DQ_IS_DIRTY(dqp)) |
2651 | goto out_unlock; |
2652 | |
2653 | - xfs_dqflock(dqp); |
2654 | + /* |
2655 | + * The only way the dquot is already flush locked by the time quotacheck |
2656 | + * gets here is if reclaim flushed it before the dqadjust walk dirtied |
2657 | + * it for the final time. Quotacheck collects all dquot bufs in the |
2658 | + * local delwri queue before dquots are dirtied, so reclaim can't have |
2659 | + * possibly queued it for I/O. The only way out is to push the buffer to |
2660 | + * cycle the flush lock. |
2661 | + */ |
2662 | + if (!xfs_dqflock_nowait(dqp)) { |
2663 | + /* buf is pinned in-core by delwri list */ |
2664 | + DEFINE_SINGLE_BUF_MAP(map, dqp->q_blkno, |
2665 | + mp->m_quotainfo->qi_dqchunklen); |
2666 | + bp = _xfs_buf_find(mp->m_ddev_targp, &map, 1, 0, NULL); |
2667 | + if (!bp) { |
2668 | + error = -EINVAL; |
2669 | + goto out_unlock; |
2670 | + } |
2671 | + xfs_buf_unlock(bp); |
2672 | + |
2673 | + xfs_buf_delwri_pushbuf(bp, buffer_list); |
2674 | + xfs_buf_rele(bp); |
2675 | + |
2676 | + error = -EAGAIN; |
2677 | + goto out_unlock; |
2678 | + } |
2679 | + |
2680 | error = xfs_qm_dqflush(dqp, &bp); |
2681 | if (error) |
2682 | goto out_unlock; |
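The one-line xfs_qm_dquot_walk() fix above guards against 32-bit id wraparound: when the last dquot visited has id 0xffffffff, next_index becomes zero and the radix-tree walk would restart from the beginning forever. A standalone demo of the guard (userspace, illustrative only):

	#include <inttypes.h>
	#include <stdint.h>
	#include <stdio.h>

	int
	main(void)
	{
		uint32_t id = UINT32_MAX - 2;	/* pretend these are the last ids */
		uint32_t next_index;

		for (;;) {
			printf("visit id 0x%" PRIx32 "\n", id);
			next_index = id + 1;
			if (!next_index)	/* id was UINT32_MAX: wrapped to 0 */
				break;
			id = next_index;
		}
		return 0;
	}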
2683 | diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c |
2684 | index 29a75ecb2425..0015c19c7455 100644 |
2685 | --- a/fs/xfs/xfs_reflink.c |
2686 | +++ b/fs/xfs/xfs_reflink.c |
2687 | @@ -169,6 +169,8 @@ xfs_reflink_find_shared( |
2688 | error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); |
2689 | if (error) |
2690 | return error; |
2691 | + if (!agbp) |
2692 | + return -ENOMEM; |
2693 | |
2694 | cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL); |
2695 | |
2696 | @@ -333,7 +335,7 @@ xfs_reflink_convert_cow_extent( |
2697 | struct xfs_defer_ops *dfops) |
2698 | { |
2699 | struct xfs_bmbt_irec irec = *imap; |
2700 | - xfs_fsblock_t first_block; |
2701 | + xfs_fsblock_t first_block = NULLFSBLOCK; |
2702 | int nimaps = 1; |
2703 | |
2704 | if (imap->br_state == XFS_EXT_NORM) |
2705 | diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c |
2706 | index 882fb8524fcb..67d589e0a49f 100644 |
2707 | --- a/fs/xfs/xfs_super.c |
2708 | +++ b/fs/xfs/xfs_super.c |
2709 | @@ -1214,7 +1214,7 @@ xfs_test_remount_options( |
2710 | tmp_mp->m_super = sb; |
2711 | error = xfs_parseargs(tmp_mp, options); |
2712 | xfs_free_fsname(tmp_mp); |
2713 | - kfree(tmp_mp); |
2714 | + kmem_free(tmp_mp); |
2715 | |
2716 | return error; |
2717 | } |
2718 | diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h |
2719 | index 828f383df121..bdf69e1c7410 100644 |
2720 | --- a/fs/xfs/xfs_trace.h |
2721 | +++ b/fs/xfs/xfs_trace.h |
2722 | @@ -366,6 +366,7 @@ DEFINE_BUF_EVENT(xfs_buf_iowait_done); |
2723 | DEFINE_BUF_EVENT(xfs_buf_delwri_queue); |
2724 | DEFINE_BUF_EVENT(xfs_buf_delwri_queued); |
2725 | DEFINE_BUF_EVENT(xfs_buf_delwri_split); |
2726 | +DEFINE_BUF_EVENT(xfs_buf_delwri_pushbuf); |
2727 | DEFINE_BUF_EVENT(xfs_buf_get_uncached); |
2728 | DEFINE_BUF_EVENT(xfs_bdstrat_shut); |
2729 | DEFINE_BUF_EVENT(xfs_buf_item_relse); |
2730 | @@ -519,7 +520,6 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size); |
2731 | DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_ordered); |
2732 | DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale); |
2733 | DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format); |
2734 | -DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_ordered); |
2735 | DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale); |
2736 | DEFINE_BUF_ITEM_EVENT(xfs_buf_item_ordered); |
2737 | DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin); |
2738 | @@ -1990,6 +1990,24 @@ DEFINE_EVENT(xfs_swap_extent_class, name, \ |
2739 | DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before); |
2740 | DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after); |
2741 | |
2742 | +TRACE_EVENT(xfs_log_recover, |
2743 | + TP_PROTO(struct xlog *log, xfs_daddr_t headblk, xfs_daddr_t tailblk), |
2744 | + TP_ARGS(log, headblk, tailblk), |
2745 | + TP_STRUCT__entry( |
2746 | + __field(dev_t, dev) |
2747 | + __field(xfs_daddr_t, headblk) |
2748 | + __field(xfs_daddr_t, tailblk) |
2749 | + ), |
2750 | + TP_fast_assign( |
2751 | + __entry->dev = log->l_mp->m_super->s_dev; |
2752 | + __entry->headblk = headblk; |
2753 | + __entry->tailblk = tailblk; |
2754 | + ), |
2755 | + TP_printk("dev %d:%d headblk 0x%llx tailblk 0x%llx", |
2756 | + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->headblk, |
2757 | + __entry->tailblk) |
2758 | +) |
2759 | + |
2760 | TRACE_EVENT(xfs_log_recover_record, |
2761 | TP_PROTO(struct xlog *log, struct xlog_rec_header *rhead, int pass), |
2762 | TP_ARGS(log, rhead, pass), |
2763 | diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h |
2764 | index 98024cb933ef..5669cf00bae0 100644 |
2765 | --- a/fs/xfs/xfs_trans.h |
2766 | +++ b/fs/xfs/xfs_trans.h |
2767 | @@ -50,6 +50,7 @@ typedef struct xfs_log_item { |
2768 | struct xfs_ail *li_ailp; /* ptr to AIL */ |
2769 | uint li_type; /* item type */ |
2770 | uint li_flags; /* misc flags */ |
2771 | + struct xfs_buf *li_buf; /* real buffer pointer */ |
2772 | struct xfs_log_item *li_bio_list; /* buffer item list */ |
2773 | void (*li_cb)(struct xfs_buf *, |
2774 | struct xfs_log_item *); |
2775 | @@ -65,11 +66,13 @@ typedef struct xfs_log_item { |
2776 | } xfs_log_item_t; |
2777 | |
2778 | #define XFS_LI_IN_AIL 0x1 |
2779 | -#define XFS_LI_ABORTED 0x2 |
2780 | +#define XFS_LI_ABORTED 0x2 |
2781 | +#define XFS_LI_FAILED 0x4 |
2782 | |
2783 | #define XFS_LI_FLAGS \ |
2784 | { XFS_LI_IN_AIL, "IN_AIL" }, \ |
2785 | - { XFS_LI_ABORTED, "ABORTED" } |
2786 | + { XFS_LI_ABORTED, "ABORTED" }, \ |
2787 | + { XFS_LI_FAILED, "FAILED" } |
2788 | |
2789 | struct xfs_item_ops { |
2790 | void (*iop_size)(xfs_log_item_t *, int *, int *); |
2791 | @@ -80,6 +83,7 @@ struct xfs_item_ops { |
2792 | void (*iop_unlock)(xfs_log_item_t *); |
2793 | xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t); |
2794 | void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); |
2795 | + void (*iop_error)(xfs_log_item_t *, xfs_buf_t *); |
2796 | }; |
2797 | |
2798 | void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item, |
2799 | @@ -213,12 +217,14 @@ void xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *); |
2800 | void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *); |
2801 | void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *); |
2802 | void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *); |
2803 | -void xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *); |
2804 | +bool xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *); |
2805 | void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); |
2806 | void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); |
2807 | void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int); |
2808 | void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint); |
2809 | -void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint); |
2810 | +void xfs_trans_log_buf(struct xfs_trans *, struct xfs_buf *, uint, |
2811 | + uint); |
2812 | +void xfs_trans_dirty_buf(struct xfs_trans *, struct xfs_buf *); |
2813 | void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint); |
2814 | |
2815 | void xfs_extent_free_init_defer_op(void); |
2816 | @@ -277,6 +283,6 @@ int xfs_trans_log_finish_bmap_update(struct xfs_trans *tp, |
2817 | struct xfs_bud_log_item *rudp, struct xfs_defer_ops *dfops, |
2818 | enum xfs_bmap_intent_type type, struct xfs_inode *ip, |
2819 | int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, |
2820 | - xfs_filblks_t blockcount, xfs_exntst_t state); |
2821 | + xfs_filblks_t *blockcount, xfs_exntst_t state); |
2822 | |
2823 | #endif /* __XFS_TRANS_H__ */ |
2824 | diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c |
2825 | index d6c9c3e9e02b..70f5ab017323 100644 |
2826 | --- a/fs/xfs/xfs_trans_ail.c |
2827 | +++ b/fs/xfs/xfs_trans_ail.c |
2828 | @@ -684,8 +684,24 @@ xfs_trans_ail_update_bulk( |
2829 | } |
2830 | } |
2831 | |
2832 | -/* |
2833 | - * xfs_trans_ail_delete_bulk - remove multiple log items from the AIL |
2834 | +bool |
2835 | +xfs_ail_delete_one( |
2836 | + struct xfs_ail *ailp, |
2837 | + struct xfs_log_item *lip) |
2838 | +{ |
2839 | + struct xfs_log_item *mlip = xfs_ail_min(ailp); |
2840 | + |
2841 | + trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn); |
2842 | + xfs_ail_delete(ailp, lip); |
2843 | + xfs_clear_li_failed(lip); |
2844 | + lip->li_flags &= ~XFS_LI_IN_AIL; |
2845 | + lip->li_lsn = 0; |
2846 | + |
2847 | + return mlip == lip; |
2848 | +} |
2849 | + |
2850 | +/** |
2851 | + * Remove a log item from the AIL |
2852 | * |
2853 | * @xfs_trans_ail_delete_bulk takes an array of log items that all need to |
2854 | * removed from the AIL. The caller is already holding the AIL lock, and done |
2855 | @@ -706,52 +722,36 @@ xfs_trans_ail_update_bulk( |
2856 | * before returning. |
2857 | */ |
2858 | void |
2859 | -xfs_trans_ail_delete_bulk( |
2860 | +xfs_trans_ail_delete( |
2861 | struct xfs_ail *ailp, |
2862 | - struct xfs_log_item **log_items, |
2863 | - int nr_items, |
2864 | + struct xfs_log_item *lip, |
2865 | int shutdown_type) __releases(ailp->xa_lock) |
2866 | { |
2867 | - xfs_log_item_t *mlip; |
2868 | - int mlip_changed = 0; |
2869 | - int i; |
2870 | - |
2871 | - mlip = xfs_ail_min(ailp); |
2872 | + struct xfs_mount *mp = ailp->xa_mount; |
2873 | + bool mlip_changed; |
2874 | |
2875 | - for (i = 0; i < nr_items; i++) { |
2876 | - struct xfs_log_item *lip = log_items[i]; |
2877 | - if (!(lip->li_flags & XFS_LI_IN_AIL)) { |
2878 | - struct xfs_mount *mp = ailp->xa_mount; |
2879 | - |
2880 | - spin_unlock(&ailp->xa_lock); |
2881 | - if (!XFS_FORCED_SHUTDOWN(mp)) { |
2882 | - xfs_alert_tag(mp, XFS_PTAG_AILDELETE, |
2883 | - "%s: attempting to delete a log item that is not in the AIL", |
2884 | - __func__); |
2885 | - xfs_force_shutdown(mp, shutdown_type); |
2886 | - } |
2887 | - return; |
2888 | + if (!(lip->li_flags & XFS_LI_IN_AIL)) { |
2889 | + spin_unlock(&ailp->xa_lock); |
2890 | + if (!XFS_FORCED_SHUTDOWN(mp)) { |
2891 | + xfs_alert_tag(mp, XFS_PTAG_AILDELETE, |
2892 | + "%s: attempting to delete a log item that is not in the AIL", |
2893 | + __func__); |
2894 | + xfs_force_shutdown(mp, shutdown_type); |
2895 | } |
2896 | - |
2897 | - trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn); |
2898 | - xfs_ail_delete(ailp, lip); |
2899 | - lip->li_flags &= ~XFS_LI_IN_AIL; |
2900 | - lip->li_lsn = 0; |
2901 | - if (mlip == lip) |
2902 | - mlip_changed = 1; |
2903 | + return; |
2904 | } |
2905 | |
2906 | + mlip_changed = xfs_ail_delete_one(ailp, lip); |
2907 | if (mlip_changed) { |
2908 | - if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount)) |
2909 | - xlog_assign_tail_lsn_locked(ailp->xa_mount); |
2910 | + if (!XFS_FORCED_SHUTDOWN(mp)) |
2911 | + xlog_assign_tail_lsn_locked(mp); |
2912 | if (list_empty(&ailp->xa_ail)) |
2913 | wake_up_all(&ailp->xa_empty); |
2914 | - spin_unlock(&ailp->xa_lock); |
2915 | + } |
2916 | |
2917 | + spin_unlock(&ailp->xa_lock); |
2918 | + if (mlip_changed) |
2919 | xfs_log_space_wake(ailp->xa_mount); |
2920 | - } else { |
2921 | - spin_unlock(&ailp->xa_lock); |
2922 | - } |
2923 | } |
2924 | |
2925 | int |
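The hunk above splits AIL removal into two pieces: xfs_ail_delete_one(), which unlinks a single item and reports whether it was the AIL minimum, and a locking/shutdown wrapper that recomputes the tail LSN only when the minimum actually changed. A minimal user-space sketch of that "delete and report head change" shape; every name in it is illustrative, not the kernel's:

    #include <stdbool.h>
    #include <stdio.h>

    struct item {
        struct item *next;
        long lsn;                   /* sort key, smallest at the head */
    };

    struct ail {
        struct item *head;          /* minimum-LSN item, NULL if empty */
    };

    /* Unlink @ip and report whether it was the minimum; the caller
     * reassigns the tail LSN only when the head really changed. */
    static bool ail_delete_one(struct ail *ail, struct item *ip)
    {
        bool was_min = (ail->head == ip);
        struct item **pp;

        for (pp = &ail->head; *pp; pp = &(*pp)->next) {
            if (*pp == ip) {
                *pp = ip->next;
                break;
            }
        }
        return was_min;
    }

    int main(void)
    {
        struct item c = { NULL, 30 }, b = { &c, 20 }, a = { &b, 10 };
        struct ail ail = { &a };

        printf("%d\n", ail_delete_one(&ail, &a));   /* 1: head changed */
        printf("%d\n", ail_delete_one(&ail, &c));   /* 0: head kept    */
        return 0;
    }

Note also how the rewritten wrapper drops xa_lock before calling xfs_log_space_wake(), so the wakeup runs outside the spinlock.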
2926 | diff --git a/fs/xfs/xfs_trans_bmap.c b/fs/xfs/xfs_trans_bmap.c |
2927 | index 6408e7d7c08c..14543d93cd4b 100644 |
2928 | --- a/fs/xfs/xfs_trans_bmap.c |
2929 | +++ b/fs/xfs/xfs_trans_bmap.c |
2930 | @@ -63,7 +63,7 @@ xfs_trans_log_finish_bmap_update( |
2931 | int whichfork, |
2932 | xfs_fileoff_t startoff, |
2933 | xfs_fsblock_t startblock, |
2934 | - xfs_filblks_t blockcount, |
2935 | + xfs_filblks_t *blockcount, |
2936 | xfs_exntst_t state) |
2937 | { |
2938 | int error; |
2939 | @@ -196,16 +196,23 @@ xfs_bmap_update_finish_item( |
2940 | void **state) |
2941 | { |
2942 | struct xfs_bmap_intent *bmap; |
2943 | + xfs_filblks_t count; |
2944 | int error; |
2945 | |
2946 | bmap = container_of(item, struct xfs_bmap_intent, bi_list); |
2947 | + count = bmap->bi_bmap.br_blockcount; |
2948 | error = xfs_trans_log_finish_bmap_update(tp, done_item, dop, |
2949 | bmap->bi_type, |
2950 | bmap->bi_owner, bmap->bi_whichfork, |
2951 | bmap->bi_bmap.br_startoff, |
2952 | bmap->bi_bmap.br_startblock, |
2953 | - bmap->bi_bmap.br_blockcount, |
2954 | + &count, |
2955 | bmap->bi_bmap.br_state); |
2956 | + if (!error && count > 0) { |
2957 | + ASSERT(bmap->bi_type == XFS_BMAP_UNMAP); |
2958 | + bmap->bi_bmap.br_blockcount = count; |
2959 | + return -EAGAIN; |
2960 | + } |
2961 | kmem_free(bmap); |
2962 | return error; |
2963 | } |
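The xfs_trans_bmap.c change above turns blockcount into an in/out parameter: if a pass completes without error but blocks remain, the intent item is not freed; the remaining count is stashed back and -EAGAIN asks the deferred-ops machinery to requeue it. A hedged user-space sketch of that partial-completion contract (STEP and the function name are inventions for illustration):

    #include <errno.h>
    #include <stdio.h>

    #define STEP 16   /* illustrative per-pass work limit */

    /* Consume up to STEP units; leave the remainder in *count.
     * Returning -EAGAIN with *count > 0 asks the caller to requeue. */
    static int finish_item(unsigned long *count)
    {
        unsigned long done = *count < STEP ? *count : STEP;

        *count -= done;
        return *count ? -EAGAIN : 0;
    }

    int main(void)
    {
        unsigned long remaining = 40;
        int ret;

        do {
            ret = finish_item(&remaining);
            printf("pass done, %lu left (ret=%d)\n", remaining, ret);
        } while (ret == -EAGAIN);
        return 0;
    }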
2964 | diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c |
2965 | index 8ee29ca132dc..3ba7a96a8abd 100644 |
2966 | --- a/fs/xfs/xfs_trans_buf.c |
2967 | +++ b/fs/xfs/xfs_trans_buf.c |
2968 | @@ -356,6 +356,7 @@ xfs_trans_brelse(xfs_trans_t *tp, |
2969 | xfs_buf_t *bp) |
2970 | { |
2971 | xfs_buf_log_item_t *bip; |
2972 | + int freed; |
2973 | |
2974 | /* |
2975 | * Default to a normal brelse() call if the tp is NULL. |
2976 | @@ -419,16 +420,22 @@ xfs_trans_brelse(xfs_trans_t *tp, |
2977 | /* |
2978 | * Drop our reference to the buf log item. |
2979 | */ |
2980 | - atomic_dec(&bip->bli_refcount); |
2981 | + freed = atomic_dec_and_test(&bip->bli_refcount); |
2982 | |
2983 | /* |
2984 | - * If the buf item is not tracking data in the log, then |
2985 | - * we must free it before releasing the buffer back to the |
2986 | - * free pool. Before releasing the buffer to the free pool, |
2987 | - * clear the transaction pointer in b_fsprivate2 to dissolve |
2988 | - * its relation to this transaction. |
2989 | + * If the buf item is not tracking data in the log, then we must free it |
2990 | + * before releasing the buffer back to the free pool. |
2991 | + * |
2992 | + * If the fs has shutdown and we dropped the last reference, it may fall |
2993 | + * on us to release a (possibly dirty) bli if it never made it to the |
2994 | + * AIL (e.g., the aborted unpin already happened and didn't release it |
2995 | + * due to our reference). Since we're already shutdown and need xa_lock, |
2996 | + * just force remove from the AIL and release the bli here. |
2997 | */ |
2998 | - if (!xfs_buf_item_dirty(bip)) { |
2999 | + if (XFS_FORCED_SHUTDOWN(tp->t_mountp) && freed) { |
3000 | + xfs_trans_ail_remove(&bip->bli_item, SHUTDOWN_LOG_IO_ERROR); |
3001 | + xfs_buf_item_relse(bp); |
3002 | + } else if (!(bip->bli_flags & XFS_BLI_DIRTY)) { |
3003 | /*** |
3004 | ASSERT(bp->b_pincount == 0); |
3005 | ***/ |
3006 | @@ -486,25 +493,17 @@ xfs_trans_bhold_release(xfs_trans_t *tp, |
3007 | } |
3008 | |
3009 | /* |
3010 | - * This is called to mark bytes first through last inclusive of the given |
3011 | - * buffer as needing to be logged when the transaction is committed. |
3012 | - * The buffer must already be associated with the given transaction. |
3013 | - * |
3014 | - * First and last are numbers relative to the beginning of this buffer, |
3015 | - * so the first byte in the buffer is numbered 0 regardless of the |
3016 | - * value of b_blkno. |
3017 | + * Mark a buffer dirty in the transaction. |
3018 | */ |
3019 | void |
3020 | -xfs_trans_log_buf(xfs_trans_t *tp, |
3021 | - xfs_buf_t *bp, |
3022 | - uint first, |
3023 | - uint last) |
3024 | +xfs_trans_dirty_buf( |
3025 | + struct xfs_trans *tp, |
3026 | + struct xfs_buf *bp) |
3027 | { |
3028 | - xfs_buf_log_item_t *bip = bp->b_fspriv; |
3029 | + struct xfs_buf_log_item *bip = bp->b_fspriv; |
3030 | |
3031 | ASSERT(bp->b_transp == tp); |
3032 | ASSERT(bip != NULL); |
3033 | - ASSERT(first <= last && last < BBTOB(bp->b_length)); |
3034 | ASSERT(bp->b_iodone == NULL || |
3035 | bp->b_iodone == xfs_buf_iodone_callbacks); |
3036 | |
3037 | @@ -524,8 +523,6 @@ xfs_trans_log_buf(xfs_trans_t *tp, |
3038 | bp->b_iodone = xfs_buf_iodone_callbacks; |
3039 | bip->bli_item.li_cb = xfs_buf_iodone; |
3040 | |
3041 | - trace_xfs_trans_log_buf(bip); |
3042 | - |
3043 | /* |
3044 | * If we invalidated the buffer within this transaction, then |
3045 | * cancel the invalidation now that we're dirtying the buffer |
3046 | @@ -538,17 +535,37 @@ xfs_trans_log_buf(xfs_trans_t *tp, |
3047 | bp->b_flags &= ~XBF_STALE; |
3048 | bip->__bli_format.blf_flags &= ~XFS_BLF_CANCEL; |
3049 | } |
3050 | + bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED; |
3051 | |
3052 | tp->t_flags |= XFS_TRANS_DIRTY; |
3053 | bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY; |
3054 | +} |
3055 | |
3056 | - /* |
3057 | - * If we have an ordered buffer we are not logging any dirty range but |
3058 | - * it still needs to be marked dirty and that it has been logged. |
3059 | - */ |
3060 | - bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED; |
3061 | - if (!(bip->bli_flags & XFS_BLI_ORDERED)) |
3062 | - xfs_buf_item_log(bip, first, last); |
3063 | +/* |
3064 | + * This is called to mark bytes first through last inclusive of the given |
3065 | + * buffer as needing to be logged when the transaction is committed. |
3066 | + * The buffer must already be associated with the given transaction. |
3067 | + * |
3068 | + * First and last are numbers relative to the beginning of this buffer, |
3069 | + * so the first byte in the buffer is numbered 0 regardless of the |
3070 | + * value of b_blkno. |
3071 | + */ |
3072 | +void |
3073 | +xfs_trans_log_buf( |
3074 | + struct xfs_trans *tp, |
3075 | + struct xfs_buf *bp, |
3076 | + uint first, |
3077 | + uint last) |
3078 | +{ |
3079 | + struct xfs_buf_log_item *bip = bp->b_fspriv; |
3080 | + |
3081 | + ASSERT(first <= last && last < BBTOB(bp->b_length)); |
3082 | + ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED)); |
3083 | + |
3084 | + xfs_trans_dirty_buf(tp, bp); |
3085 | + |
3086 | + trace_xfs_trans_log_buf(bip); |
3087 | + xfs_buf_item_log(bip, first, last); |
3088 | } |
3089 | |
3090 | |
3091 | @@ -701,14 +718,13 @@ xfs_trans_inode_alloc_buf( |
3092 | } |
3093 | |
3094 | /* |
3095 | - * Mark the buffer as ordered for this transaction. This means |
3096 | - * that the contents of the buffer are not recorded in the transaction |
3097 | - * but it is tracked in the AIL as though it was. This allows us |
3098 | - * to record logical changes in transactions rather than the physical |
3099 | - * changes we make to the buffer without changing writeback ordering |
3100 | - * constraints of metadata buffers. |
3101 | + * Mark the buffer as ordered for this transaction. This means that the contents |
3102 | + * of the buffer are not recorded in the transaction but it is tracked in the |
3103 | + * AIL as though it was. This allows us to record logical changes in |
3104 | + * transactions rather than the physical changes we make to the buffer without |
3105 | + * changing writeback ordering constraints of metadata buffers. |
3106 | */ |
3107 | -void |
3108 | +bool |
3109 | xfs_trans_ordered_buf( |
3110 | struct xfs_trans *tp, |
3111 | struct xfs_buf *bp) |
3112 | @@ -719,8 +735,18 @@ xfs_trans_ordered_buf( |
3113 | ASSERT(bip != NULL); |
3114 | ASSERT(atomic_read(&bip->bli_refcount) > 0); |
3115 | |
3116 | + if (xfs_buf_item_dirty_format(bip)) |
3117 | + return false; |
3118 | + |
3119 | bip->bli_flags |= XFS_BLI_ORDERED; |
3120 | trace_xfs_buf_item_ordered(bip); |
3121 | + |
3122 | + /* |
3123 | +	 * We don't log a dirty range of an ordered buffer, but it still needs |
3124 | +	 * to be marked dirty and flagged as logged. |
3125 | + */ |
3126 | + xfs_trans_dirty_buf(tp, bp); |
3127 | + return true; |
3128 | } |
3129 | |
3130 | /* |
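Because xfs_trans_ordered_buf() can now refuse a buffer whose format has already been dirtied, callers are expected to fall back to logging the whole buffer physically. A plausible caller shape, as a kernel-context sketch (the real call sites live elsewhere in this series, so treat this as an assumption):

    /* If ordering is refused, log the entire buffer range instead. */
    if (!xfs_trans_ordered_buf(tp, bp))
        xfs_trans_log_buf(tp, bp, 0, BBTOB(bp->b_length) - 1);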
3131 | diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h |
3132 | index 49931b72da8a..b317a3644c00 100644 |
3133 | --- a/fs/xfs/xfs_trans_priv.h |
3134 | +++ b/fs/xfs/xfs_trans_priv.h |
3135 | @@ -106,18 +106,9 @@ xfs_trans_ail_update( |
3136 | xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn); |
3137 | } |
3138 | |
3139 | -void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp, |
3140 | - struct xfs_log_item **log_items, int nr_items, |
3141 | - int shutdown_type) |
3142 | - __releases(ailp->xa_lock); |
3143 | -static inline void |
3144 | -xfs_trans_ail_delete( |
3145 | - struct xfs_ail *ailp, |
3146 | - xfs_log_item_t *lip, |
3147 | - int shutdown_type) __releases(ailp->xa_lock) |
3148 | -{ |
3149 | - xfs_trans_ail_delete_bulk(ailp, &lip, 1, shutdown_type); |
3150 | -} |
3151 | +bool xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip); |
3152 | +void xfs_trans_ail_delete(struct xfs_ail *ailp, struct xfs_log_item *lip, |
3153 | + int shutdown_type) __releases(ailp->xa_lock); |
3154 | |
3155 | static inline void |
3156 | xfs_trans_ail_remove( |
3157 | @@ -173,4 +164,35 @@ xfs_trans_ail_copy_lsn( |
3158 | *dst = *src; |
3159 | } |
3160 | #endif |
3161 | + |
3162 | +static inline void |
3163 | +xfs_clear_li_failed( |
3164 | + struct xfs_log_item *lip) |
3165 | +{ |
3166 | + struct xfs_buf *bp = lip->li_buf; |
3167 | + |
3168 | + ASSERT(lip->li_flags & XFS_LI_IN_AIL); |
3169 | + lockdep_assert_held(&lip->li_ailp->xa_lock); |
3170 | + |
3171 | + if (lip->li_flags & XFS_LI_FAILED) { |
3172 | + lip->li_flags &= ~XFS_LI_FAILED; |
3173 | + lip->li_buf = NULL; |
3174 | + xfs_buf_rele(bp); |
3175 | + } |
3176 | +} |
3177 | + |
3178 | +static inline void |
3179 | +xfs_set_li_failed( |
3180 | + struct xfs_log_item *lip, |
3181 | + struct xfs_buf *bp) |
3182 | +{ |
3183 | + lockdep_assert_held(&lip->li_ailp->xa_lock); |
3184 | + |
3185 | + if (!(lip->li_flags & XFS_LI_FAILED)) { |
3186 | + xfs_buf_hold(bp); |
3187 | + lip->li_flags |= XFS_LI_FAILED; |
3188 | + lip->li_buf = bp; |
3189 | + } |
3190 | +} |
3191 | + |
3192 | #endif /* __XFS_TRANS_PRIV_H__ */ |
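The two helpers above keep the XFS_LI_FAILED flag and a buffer reference in lockstep: the first set takes a hold, the first clear drops it, and repeats are no-ops, so the reference can never be leaked or double-released. A small user-space analogue of that flag/refcount pairing (all names illustrative):

    #include <assert.h>
    #include <stdbool.h>

    struct buf { int refcount; };

    struct log_item {
        bool failed;
        struct buf *buf;            /* held only while failed is set */
    };

    static void set_failed(struct log_item *lip, struct buf *bp)
    {
        if (!lip->failed) {         /* only the first set takes a hold */
            bp->refcount++;
            lip->failed = true;
            lip->buf = bp;
        }
    }

    static void clear_failed(struct log_item *lip)
    {
        if (lip->failed) {          /* only the first clear drops it */
            lip->failed = false;
            lip->buf->refcount--;
            lip->buf = NULL;
        }
    }

    int main(void)
    {
        struct buf bp = { 1 };
        struct log_item lip = { false, NULL };

        set_failed(&lip, &bp);
        set_failed(&lip, &bp);      /* idempotent: no second hold */
        clear_failed(&lip);
        clear_failed(&lip);         /* idempotent: no double drop */
        assert(bp.refcount == 1);
        return 0;
    }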
3193 | diff --git a/include/linux/fs.h b/include/linux/fs.h |
3194 | index dd88ded27fc8..d705ae084edd 100644 |
3195 | --- a/include/linux/fs.h |
3196 | +++ b/include/linux/fs.h |
3197 | @@ -2760,6 +2760,7 @@ static inline void lockdep_annotate_inode_mutex_key(struct inode *inode) { }; |
3198 | #endif |
3199 | extern void unlock_new_inode(struct inode *); |
3200 | extern unsigned int get_next_ino(void); |
3201 | +extern void evict_inodes(struct super_block *sb); |
3202 | |
3203 | extern void __iget(struct inode * inode); |
3204 | extern void iget_failed(struct inode *); |
3205 | diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h |
3206 | index 780e7171f548..23db1ae37464 100644 |
3207 | --- a/include/linux/netdevice.h |
3208 | +++ b/include/linux/netdevice.h |
3209 | @@ -3901,6 +3901,8 @@ struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev, |
3210 | updev; \ |
3211 | updev = netdev_all_upper_get_next_dev_rcu(dev, &(iter))) |
3212 | |
3213 | +bool netdev_has_any_upper_dev(struct net_device *dev); |
3214 | + |
3215 | void *netdev_lower_get_next_private(struct net_device *dev, |
3216 | struct list_head **iter); |
3217 | void *netdev_lower_get_next_private_rcu(struct net_device *dev, |
3218 | diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h |
3219 | index 909972aa3acd..634d19203e7d 100644 |
3220 | --- a/include/net/inet_frag.h |
3221 | +++ b/include/net/inet_frag.h |
3222 | @@ -1,14 +1,9 @@ |
3223 | #ifndef __NET_FRAG_H__ |
3224 | #define __NET_FRAG_H__ |
3225 | |
3226 | -#include <linux/percpu_counter.h> |
3227 | - |
3228 | struct netns_frags { |
3229 | - /* The percpu_counter "mem" need to be cacheline aligned. |
3230 | - * mem.count must not share cacheline with other writers |
3231 | - */ |
3232 | - struct percpu_counter mem ____cacheline_aligned_in_smp; |
3233 | - |
3234 | + /* Keep atomic mem on separate cachelines in structs that include it */ |
3235 | + atomic_t mem ____cacheline_aligned_in_smp; |
3236 | /* sysctls */ |
3237 | int timeout; |
3238 | int high_thresh; |
3239 | @@ -108,15 +103,10 @@ struct inet_frags { |
3240 | int inet_frags_init(struct inet_frags *); |
3241 | void inet_frags_fini(struct inet_frags *); |
3242 | |
3243 | -static inline int inet_frags_init_net(struct netns_frags *nf) |
3244 | -{ |
3245 | - return percpu_counter_init(&nf->mem, 0, GFP_KERNEL); |
3246 | -} |
3247 | -static inline void inet_frags_uninit_net(struct netns_frags *nf) |
3248 | +static inline void inet_frags_init_net(struct netns_frags *nf) |
3249 | { |
3250 | - percpu_counter_destroy(&nf->mem); |
3251 | + atomic_set(&nf->mem, 0); |
3252 | } |
3253 | - |
3254 | void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f); |
3255 | |
3256 | void inet_frag_kill(struct inet_frag_queue *q, struct inet_frags *f); |
3257 | @@ -140,37 +130,24 @@ static inline bool inet_frag_evicting(struct inet_frag_queue *q) |
3258 | |
3259 | /* Memory Tracking Functions. */ |
3260 | |
3261 | -/* The default percpu_counter batch size is not big enough to scale to |
3262 | - * fragmentation mem acct sizes. |
3263 | - * The mem size of a 64K fragment is approx: |
3264 | - * (44 fragments * 2944 truesize) + frag_queue struct(200) = 129736 bytes |
3265 | - */ |
3266 | -static unsigned int frag_percpu_counter_batch = 130000; |
3267 | - |
3268 | static inline int frag_mem_limit(struct netns_frags *nf) |
3269 | { |
3270 | - return percpu_counter_read(&nf->mem); |
3271 | + return atomic_read(&nf->mem); |
3272 | } |
3273 | |
3274 | static inline void sub_frag_mem_limit(struct netns_frags *nf, int i) |
3275 | { |
3276 | - __percpu_counter_add(&nf->mem, -i, frag_percpu_counter_batch); |
3277 | + atomic_sub(i, &nf->mem); |
3278 | } |
3279 | |
3280 | static inline void add_frag_mem_limit(struct netns_frags *nf, int i) |
3281 | { |
3282 | - __percpu_counter_add(&nf->mem, i, frag_percpu_counter_batch); |
3283 | + atomic_add(i, &nf->mem); |
3284 | } |
3285 | |
3286 | -static inline unsigned int sum_frag_mem_limit(struct netns_frags *nf) |
3287 | +static inline int sum_frag_mem_limit(struct netns_frags *nf) |
3288 | { |
3289 | - unsigned int res; |
3290 | - |
3291 | - local_bh_disable(); |
3292 | - res = percpu_counter_sum_positive(&nf->mem); |
3293 | - local_bh_enable(); |
3294 | - |
3295 | - return res; |
3296 | + return atomic_read(&nf->mem); |
3297 | } |
3298 | |
3299 | /* RFC 3168 support : |
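The inet_frag.h change trades the batched percpu_counter for a plain atomic_t: reads are now exact rather than lagging by up to the old 130000-byte batch, at the cost of cross-CPU contention on one cacheline (hence the ____cacheline_aligned_in_smp annotation). A user-space C11 analogue of the three accessors, with names invented for the sketch:

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int frag_mem;     /* analogue of netns_frags.mem */

    static void add_frag_mem(int i) { atomic_fetch_add(&frag_mem, i); }
    static void sub_frag_mem(int i) { atomic_fetch_sub(&frag_mem, i); }
    static int  frag_mem_read(void) { return atomic_load(&frag_mem); }

    int main(void)
    {
        add_frag_mem(2944);         /* truesize of one fragment, say */
        add_frag_mem(2944);
        sub_frag_mem(2944);
        printf("accounted: %d bytes\n", frag_mem_read());   /* 2944 */
        return 0;
    }

This is also why sum_frag_mem_limit() collapses to a plain atomic_read(): there are no per-CPU deltas left to fold in.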
3300 | diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h |
3301 | index a74e2aa40ef4..a6bcb18ac4c3 100644 |
3302 | --- a/include/net/ip6_fib.h |
3303 | +++ b/include/net/ip6_fib.h |
3304 | @@ -68,6 +68,7 @@ struct fib6_node { |
3305 | __u16 fn_flags; |
3306 | int fn_sernum; |
3307 | struct rt6_info *rr_ptr; |
3308 | + struct rcu_head rcu; |
3309 | }; |
3310 | |
3311 | #ifndef CONFIG_IPV6_SUBTREES |
3312 | @@ -102,7 +103,7 @@ struct rt6_info { |
3313 | * the same cache line. |
3314 | */ |
3315 | struct fib6_table *rt6i_table; |
3316 | - struct fib6_node *rt6i_node; |
3317 | + struct fib6_node __rcu *rt6i_node; |
3318 | |
3319 | struct in6_addr rt6i_gateway; |
3320 | |
3321 | @@ -165,13 +166,40 @@ static inline void rt6_update_expires(struct rt6_info *rt0, int timeout) |
3322 | rt0->rt6i_flags |= RTF_EXPIRES; |
3323 | } |
3324 | |
3325 | +/* Safely read fn->sernum for the passed-in rt |
3326 | + * and store the result in the passed-in cookie. |
3327 | + * Return true if the cookie could be read safely, |
3328 | + * false otherwise. |
3329 | + */ |
3330 | +static inline bool rt6_get_cookie_safe(const struct rt6_info *rt, |
3331 | + u32 *cookie) |
3332 | +{ |
3333 | + struct fib6_node *fn; |
3334 | + bool status = false; |
3335 | + |
3336 | + rcu_read_lock(); |
3337 | + fn = rcu_dereference(rt->rt6i_node); |
3338 | + |
3339 | + if (fn) { |
3340 | + *cookie = fn->fn_sernum; |
3341 | + status = true; |
3342 | + } |
3343 | + |
3344 | + rcu_read_unlock(); |
3345 | + return status; |
3346 | +} |
3347 | + |
3348 | static inline u32 rt6_get_cookie(const struct rt6_info *rt) |
3349 | { |
3350 | + u32 cookie = 0; |
3351 | + |
3352 | if (rt->rt6i_flags & RTF_PCPU || |
3353 | (unlikely(rt->dst.flags & DST_NOCACHE) && rt->dst.from)) |
3354 | rt = (struct rt6_info *)(rt->dst.from); |
3355 | |
3356 | - return rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0; |
3357 | + rt6_get_cookie_safe(rt, &cookie); |
3358 | + |
3359 | + return cookie; |
3360 | } |
3361 | |
3362 | static inline void ip6_rt_put(struct rt6_info *rt) |
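rt6_get_cookie_safe() exists because the pre-patch expression dereferenced rt6i_node twice with no protection against a concurrent fib6 delete. A kernel-context sketch of the difference (not standalone code; it restates the helper's shape outside the diff):

    /* Pre-patch read, racy:
     *
     *     return rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
     *
     * The node can be unlinked and freed between the NULL test and the
     * field load. The safe shape copies the field out while inside the
     * RCU read-side critical section: */
    static u32 read_sernum(struct rt6_info *rt)
    {
        struct fib6_node *fn;
        u32 sernum = 0;

        rcu_read_lock();
        fn = rcu_dereference(rt->rt6i_node);
        if (fn)
            sernum = fn->fn_sernum; /* fn is guaranteed live here */
        rcu_read_unlock();
        return sernum;              /* fn must not be touched past here */
    }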
3363 | diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c |
3364 | index 89a687f3c0a3..5f5e28f210e0 100644 |
3365 | --- a/net/bridge/br_device.c |
3366 | +++ b/net/bridge/br_device.c |
3367 | @@ -53,6 +53,9 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) |
3368 | brstats->tx_bytes += skb->len; |
3369 | u64_stats_update_end(&brstats->syncp); |
3370 | |
3371 | +#ifdef CONFIG_NET_SWITCHDEV |
3372 | + skb->offload_fwd_mark = 0; |
3373 | +#endif |
3374 | BR_INPUT_SKB_CB(skb)->brdev = dev; |
3375 | |
3376 | skb_reset_mac_header(skb); |
3377 | diff --git a/net/core/datagram.c b/net/core/datagram.c |
3378 | index 58dfa23d12ca..4fa4011feec1 100644 |
3379 | --- a/net/core/datagram.c |
3380 | +++ b/net/core/datagram.c |
3381 | @@ -351,7 +351,7 @@ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags) |
3382 | if (flags & MSG_PEEK) { |
3383 | err = -ENOENT; |
3384 | spin_lock_bh(&sk->sk_receive_queue.lock); |
3385 | - if (skb == skb_peek(&sk->sk_receive_queue)) { |
3386 | + if (skb->next) { |
3387 | __skb_unlink(skb, &sk->sk_receive_queue); |
3388 | atomic_dec(&skb->users); |
3389 | err = 0; |
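With MSG_PEEK the peeked skb may no longer be at the head of sk_receive_queue by the time we try to drop it, so comparing against skb_peek() misses it. Since sk_buff lists are circular with a dedicated head node, any queued skb has a non-NULL ->next and __skb_unlink() resets it, which makes the pointer itself a position-independent membership test. A user-space sketch of that idiom (illustrative types, not the kernel's):

    #include <stddef.h>
    #include <stdio.h>

    struct pkt { struct pkt *next, *prev; };

    /* Circular queue with a head node, like sk_buff_head: unlinking
     * clears the pointers, so next != NULL means "still queued". */
    static void unlink(struct pkt *p)
    {
        p->prev->next = p->next;
        p->next->prev = p->prev;
        p->next = p->prev = NULL;
    }

    int main(void)
    {
        struct pkt head, a, b;

        head.next = &a; a.prev = &head;
        a.next = &b;    b.prev = &a;
        b.next = &head; head.prev = &b;

        printf("a queued? %d\n", a.next != NULL);   /* 1 */
        unlink(&a);
        printf("a queued? %d\n", a.next != NULL);   /* 0 */
        return 0;
    }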
3390 | diff --git a/net/core/dev.c b/net/core/dev.c |
3391 | index 1d0a7369d5a2..ba7b8121a414 100644 |
3392 | --- a/net/core/dev.c |
3393 | +++ b/net/core/dev.c |
3394 | @@ -5337,12 +5337,13 @@ EXPORT_SYMBOL(netdev_has_upper_dev); |
3395 | * Find out if a device is linked to an upper device and return true in case |
3396 | * it is. The caller must hold the RTNL lock. |
3397 | */ |
3398 | -static bool netdev_has_any_upper_dev(struct net_device *dev) |
3399 | +bool netdev_has_any_upper_dev(struct net_device *dev) |
3400 | { |
3401 | ASSERT_RTNL(); |
3402 | |
3403 | return !list_empty(&dev->all_adj_list.upper); |
3404 | } |
3405 | +EXPORT_SYMBOL(netdev_has_any_upper_dev); |
3406 | |
3407 | /** |
3408 | * netdev_master_upper_dev_get - Get master upper device |
3409 | diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c |
3410 | index 30d875dff6b5..f85b08baff16 100644 |
3411 | --- a/net/ieee802154/6lowpan/reassembly.c |
3412 | +++ b/net/ieee802154/6lowpan/reassembly.c |
3413 | @@ -580,19 +580,14 @@ static int __net_init lowpan_frags_init_net(struct net *net) |
3414 | { |
3415 | struct netns_ieee802154_lowpan *ieee802154_lowpan = |
3416 | net_ieee802154_lowpan(net); |
3417 | - int res; |
3418 | |
3419 | ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH; |
3420 | ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH; |
3421 | ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT; |
3422 | |
3423 | - res = inet_frags_init_net(&ieee802154_lowpan->frags); |
3424 | - if (res) |
3425 | - return res; |
3426 | - res = lowpan_frags_ns_sysctl_register(net); |
3427 | - if (res) |
3428 | - inet_frags_uninit_net(&ieee802154_lowpan->frags); |
3429 | - return res; |
3430 | + inet_frags_init_net(&ieee802154_lowpan->frags); |
3431 | + |
3432 | + return lowpan_frags_ns_sysctl_register(net); |
3433 | } |
3434 | |
3435 | static void __net_exit lowpan_frags_exit_net(struct net *net) |
3436 | diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c |
3437 | index b5e9317eaf9e..631c0d0d7cf8 100644 |
3438 | --- a/net/ipv4/inet_fragment.c |
3439 | +++ b/net/ipv4/inet_fragment.c |
3440 | @@ -234,10 +234,8 @@ void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f) |
3441 | cond_resched(); |
3442 | |
3443 | if (read_seqretry(&f->rnd_seqlock, seq) || |
3444 | - percpu_counter_sum(&nf->mem)) |
3445 | + sum_frag_mem_limit(nf)) |
3446 | goto evict_again; |
3447 | - |
3448 | - percpu_counter_destroy(&nf->mem); |
3449 | } |
3450 | EXPORT_SYMBOL(inet_frags_exit_net); |
3451 | |
3452 | diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c |
3453 | index bbe7f72db9c1..453db950dc9f 100644 |
3454 | --- a/net/ipv4/ip_fragment.c |
3455 | +++ b/net/ipv4/ip_fragment.c |
3456 | @@ -835,8 +835,6 @@ static void __init ip4_frags_ctl_register(void) |
3457 | |
3458 | static int __net_init ipv4_frags_init_net(struct net *net) |
3459 | { |
3460 | - int res; |
3461 | - |
3462 | /* Fragment cache limits. |
3463 | * |
3464 | * The fragment memory accounting code, (tries to) account for |
3465 | @@ -862,13 +860,9 @@ static int __net_init ipv4_frags_init_net(struct net *net) |
3466 | |
3467 | net->ipv4.frags.max_dist = 64; |
3468 | |
3469 | - res = inet_frags_init_net(&net->ipv4.frags); |
3470 | - if (res) |
3471 | - return res; |
3472 | - res = ip4_frags_ns_ctl_register(net); |
3473 | - if (res) |
3474 | - inet_frags_uninit_net(&net->ipv4.frags); |
3475 | - return res; |
3476 | + inet_frags_init_net(&net->ipv4.frags); |
3477 | + |
3478 | + return ip4_frags_ns_ctl_register(net); |
3479 | } |
3480 | |
3481 | static void __net_exit ipv4_frags_exit_net(struct net *net) |
3482 | diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c |
3483 | index 5719d6ba0824..bd7f1836bb70 100644 |
3484 | --- a/net/ipv4/ip_tunnel.c |
3485 | +++ b/net/ipv4/ip_tunnel.c |
3486 | @@ -609,8 +609,8 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto) |
3487 | ip_rt_put(rt); |
3488 | goto tx_dropped; |
3489 | } |
3490 | - iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, key->tos, |
3491 | - key->ttl, df, !net_eq(tunnel->net, dev_net(dev))); |
3492 | + iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl, |
3493 | + df, !net_eq(tunnel->net, dev_net(dev))); |
3494 | return; |
3495 | tx_error: |
3496 | dev->stats.tx_errors++; |
3497 | diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c |
3498 | index 1a4db27f5833..6b3d27e50317 100644 |
3499 | --- a/net/ipv4/tcp.c |
3500 | +++ b/net/ipv4/tcp.c |
3501 | @@ -2297,6 +2297,10 @@ int tcp_disconnect(struct sock *sk, int flags) |
3502 | tcp_set_ca_state(sk, TCP_CA_Open); |
3503 | tcp_clear_retrans(tp); |
3504 | inet_csk_delack_init(sk); |
3505 | + /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0 |
3506 | + * issue in __tcp_select_window() |
3507 | + */ |
3508 | + icsk->icsk_ack.rcv_mss = TCP_MIN_MSS; |
3509 | tcp_init_send_head(sk); |
3510 | memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); |
3511 | __sk_dst_reset(sk); |
3512 | diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c |
3513 | index b2cabda72320..cc101b1be903 100644 |
3514 | --- a/net/ipv6/addrconf.c |
3515 | +++ b/net/ipv6/addrconf.c |
3516 | @@ -5443,7 +5443,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) |
3517 | * our DAD process, so we don't need |
3518 | * to do it again |
3519 | */ |
3520 | - if (!(ifp->rt->rt6i_node)) |
3521 | + if (!rcu_access_pointer(ifp->rt->rt6i_node)) |
3522 | ip6_ins_rt(ifp->rt); |
3523 | if (ifp->idev->cnf.forwarding) |
3524 | addrconf_join_anycast(ifp); |
3525 | diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c |
3526 | index ff389591a340..5da864997495 100644 |
3527 | --- a/net/ipv6/ip6_fib.c |
3528 | +++ b/net/ipv6/ip6_fib.c |
3529 | @@ -148,11 +148,23 @@ static struct fib6_node *node_alloc(void) |
3530 | return fn; |
3531 | } |
3532 | |
3533 | -static void node_free(struct fib6_node *fn) |
3534 | +static void node_free_immediate(struct fib6_node *fn) |
3535 | +{ |
3536 | + kmem_cache_free(fib6_node_kmem, fn); |
3537 | +} |
3538 | + |
3539 | +static void node_free_rcu(struct rcu_head *head) |
3540 | { |
3541 | + struct fib6_node *fn = container_of(head, struct fib6_node, rcu); |
3542 | + |
3543 | kmem_cache_free(fib6_node_kmem, fn); |
3544 | } |
3545 | |
3546 | +static void node_free(struct fib6_node *fn) |
3547 | +{ |
3548 | + call_rcu(&fn->rcu, node_free_rcu); |
3549 | +} |
3550 | + |
3551 | static void rt6_rcu_free(struct rt6_info *rt) |
3552 | { |
3553 | call_rcu(&rt->dst.rcu_head, dst_rcu_free); |
3554 | @@ -189,6 +201,12 @@ static void rt6_release(struct rt6_info *rt) |
3555 | } |
3556 | } |
3557 | |
3558 | +static void fib6_free_table(struct fib6_table *table) |
3559 | +{ |
3560 | + inetpeer_invalidate_tree(&table->tb6_peers); |
3561 | + kfree(table); |
3562 | +} |
3563 | + |
3564 | static void fib6_link_table(struct net *net, struct fib6_table *tb) |
3565 | { |
3566 | unsigned int h; |
3567 | @@ -589,9 +607,9 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, |
3568 | |
3569 | if (!in || !ln) { |
3570 | if (in) |
3571 | - node_free(in); |
3572 | + node_free_immediate(in); |
3573 | if (ln) |
3574 | - node_free(ln); |
3575 | + node_free_immediate(ln); |
3576 | return ERR_PTR(-ENOMEM); |
3577 | } |
3578 | |
3579 | @@ -862,7 +880,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, |
3580 | |
3581 | rt->dst.rt6_next = iter; |
3582 | *ins = rt; |
3583 | - rt->rt6i_node = fn; |
3584 | + rcu_assign_pointer(rt->rt6i_node, fn); |
3585 | atomic_inc(&rt->rt6i_ref); |
3586 | inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); |
3587 | info->nl_net->ipv6.rt6_stats->fib_rt_entries++; |
3588 | @@ -887,7 +905,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, |
3589 | return err; |
3590 | |
3591 | *ins = rt; |
3592 | - rt->rt6i_node = fn; |
3593 | + rcu_assign_pointer(rt->rt6i_node, fn); |
3594 | rt->dst.rt6_next = iter->dst.rt6_next; |
3595 | atomic_inc(&rt->rt6i_ref); |
3596 | inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); |
3597 | @@ -1020,7 +1038,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, |
3598 | root, and then (in failure) stale node |
3599 | in main tree. |
3600 | */ |
3601 | - node_free(sfn); |
3602 | + node_free_immediate(sfn); |
3603 | err = PTR_ERR(sn); |
3604 | goto failure; |
3605 | } |
3606 | @@ -1447,8 +1465,9 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, |
3607 | |
3608 | int fib6_del(struct rt6_info *rt, struct nl_info *info) |
3609 | { |
3610 | + struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node, |
3611 | + lockdep_is_held(&rt->rt6i_table->tb6_lock)); |
3612 | struct net *net = info->nl_net; |
3613 | - struct fib6_node *fn = rt->rt6i_node; |
3614 | struct rt6_info **rtp; |
3615 | |
3616 | #if RT6_DEBUG >= 2 |
3617 | @@ -1637,7 +1656,9 @@ static int fib6_clean_node(struct fib6_walker *w) |
3618 | if (res) { |
3619 | #if RT6_DEBUG >= 2 |
3620 | pr_debug("%s: del failed: rt=%p@%p err=%d\n", |
3621 | - __func__, rt, rt->rt6i_node, res); |
3622 | + __func__, rt, |
3623 | + rcu_access_pointer(rt->rt6i_node), |
3624 | + res); |
3625 | #endif |
3626 | continue; |
3627 | } |
3628 | @@ -1878,15 +1899,22 @@ static int __net_init fib6_net_init(struct net *net) |
3629 | |
3630 | static void fib6_net_exit(struct net *net) |
3631 | { |
3632 | + unsigned int i; |
3633 | + |
3634 | rt6_ifdown(net, NULL); |
3635 | del_timer_sync(&net->ipv6.ip6_fib_timer); |
3636 | |
3637 | -#ifdef CONFIG_IPV6_MULTIPLE_TABLES |
3638 | - inetpeer_invalidate_tree(&net->ipv6.fib6_local_tbl->tb6_peers); |
3639 | - kfree(net->ipv6.fib6_local_tbl); |
3640 | -#endif |
3641 | - inetpeer_invalidate_tree(&net->ipv6.fib6_main_tbl->tb6_peers); |
3642 | - kfree(net->ipv6.fib6_main_tbl); |
3643 | + for (i = 0; i < FIB6_TABLE_HASHSZ; i++) { |
3644 | + struct hlist_head *head = &net->ipv6.fib_table_hash[i]; |
3645 | + struct hlist_node *tmp; |
3646 | + struct fib6_table *tb; |
3647 | + |
3648 | + hlist_for_each_entry_safe(tb, tmp, head, tb6_hlist) { |
3649 | + hlist_del(&tb->tb6_hlist); |
3650 | + fib6_free_table(tb); |
3651 | + } |
3652 | + } |
3653 | + |
3654 | kfree(net->ipv6.fib_table_hash); |
3655 | kfree(net->ipv6.rt6_stats); |
3656 | } |
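The ip6_fib.c changes follow the standard RCU publish rule: once a node has been linked into the tree (published), readers holding rcu_read_lock() may still see it, so node_free() must defer the kmem_cache_free() through call_rcu(); nodes that failed mid-insert were never visible and may be freed synchronously via node_free_immediate(). A kernel-context sketch of the rule, reusing names from the hunk above (the helper itself is an invention):

    /* Published nodes wait out a grace period; unpublished ones don't. */
    static void put_node(struct fib6_node *fn, bool was_published)
    {
        if (was_published)
            call_rcu(&fn->rcu, node_free_rcu);      /* readers may hold it */
        else
            kmem_cache_free(fib6_node_kmem, fn);    /* no reader ever saw it */
    }

The fib6_net_exit() hunk is related housekeeping: instead of freeing only the main (and optional local) table, it walks the whole fib_table_hash so no table is leaked on netns teardown.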
3657 | diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c |
3658 | index d2844ee469cb..f78afe43bdff 100644 |
3659 | --- a/net/ipv6/ip6_gre.c |
3660 | +++ b/net/ipv6/ip6_gre.c |
3661 | @@ -432,7 +432,9 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, |
3662 | } |
3663 | break; |
3664 | case ICMPV6_PKT_TOOBIG: |
3665 | - mtu = be32_to_cpu(info) - offset; |
3666 | + mtu = be32_to_cpu(info) - offset - t->tun_hlen; |
3667 | + if (t->dev->type == ARPHRD_ETHER) |
3668 | + mtu -= ETH_HLEN; |
3669 | if (mtu < IPV6_MIN_MTU) |
3670 | mtu = IPV6_MIN_MTU; |
3671 | t->dev->mtu = mtu; |
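The ICMPV6_PKT_TOOBIG handler above previously set the tunnel MTU without accounting for the tunnel's own encapsulation, so full-sized payloads could still overrun the path MTU. The fix subtracts the GRE header (t->tun_hlen) and, for Ethernet-type tunnels, the inner Ethernet header, then clamps at IPV6_MIN_MTU. The arithmetic, as a runnable demo (the constants are the conventional values; the function name is an invention):

    #include <stdio.h>

    #define IPV6_MIN_MTU 1280
    #define ETH_HLEN     14

    static int tunnel_mtu(int reported, int offset, int tun_hlen, int is_ether)
    {
        int mtu = reported - offset - tun_hlen;

        if (is_ether)
            mtu -= ETH_HLEN;        /* inner L2 header also rides inside */
        if (mtu < IPV6_MIN_MTU)
            mtu = IPV6_MIN_MTU;     /* IPv6 links may not go lower */
        return mtu;
    }

    int main(void)
    {
        printf("l3 tunnel: %d\n", tunnel_mtu(1500, 40, 8, 0));  /* 1452 */
        printf("l2 tunnel: %d\n", tunnel_mtu(1500, 40, 8, 1));  /* 1438 */
        return 0;
    }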
3672 | diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c |
3673 | index 986d4ca38832..b263bf3a19f7 100644 |
3674 | --- a/net/ipv6/netfilter/nf_conntrack_reasm.c |
3675 | +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c |
3676 | @@ -622,18 +622,12 @@ EXPORT_SYMBOL_GPL(nf_ct_frag6_gather); |
3677 | |
3678 | static int nf_ct_net_init(struct net *net) |
3679 | { |
3680 | - int res; |
3681 | - |
3682 | net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; |
3683 | net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH; |
3684 | net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT; |
3685 | - res = inet_frags_init_net(&net->nf_frag.frags); |
3686 | - if (res) |
3687 | - return res; |
3688 | - res = nf_ct_frag6_sysctl_register(net); |
3689 | - if (res) |
3690 | - inet_frags_uninit_net(&net->nf_frag.frags); |
3691 | - return res; |
3692 | + inet_frags_init_net(&net->nf_frag.frags); |
3693 | + |
3694 | + return nf_ct_frag6_sysctl_register(net); |
3695 | } |
3696 | |
3697 | static void nf_ct_net_exit(struct net *net) |
3698 | diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c |
3699 | index abb2c307fbe8..a338bbc33cf3 100644 |
3700 | --- a/net/ipv6/output_core.c |
3701 | +++ b/net/ipv6/output_core.c |
3702 | @@ -86,7 +86,6 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) |
3703 | |
3704 | while (offset <= packet_len) { |
3705 | struct ipv6_opt_hdr *exthdr; |
3706 | - unsigned int len; |
3707 | |
3708 | switch (**nexthdr) { |
3709 | |
3710 | @@ -112,10 +111,9 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) |
3711 | |
3712 | exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) + |
3713 | offset); |
3714 | - len = ipv6_optlen(exthdr); |
3715 | - if (len + offset >= IPV6_MAXPLEN) |
3716 | + offset += ipv6_optlen(exthdr); |
3717 | + if (offset > IPV6_MAXPLEN) |
3718 | return -EINVAL; |
3719 | - offset += len; |
3720 | *nexthdr = &exthdr->nexthdr; |
3721 | } |
3722 | |
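The ip6_find_1stfragopt loop above now advances the running offset first and bounds-checks the accumulated value (offset > IPV6_MAXPLEN) instead of pre-checking len + offset for each header, which keeps a single invariant: offset always refers to a validated boundary. The same walk-and-check shape as a runnable demo (MAXPLEN and the record sizes are made up for illustration):

    #include <stdio.h>

    #define MAXPLEN 64u             /* stand-in for IPV6_MAXPLEN */

    /* Advance over one variable-length header, then validate the total. */
    static int advance(unsigned int *offset, unsigned int hdrlen)
    {
        *offset += hdrlen;
        return (*offset > MAXPLEN) ? -1 : 0;
    }

    int main(void)
    {
        unsigned int hdrs[] = { 8, 16, 48 };    /* 72 total: overruns */
        unsigned int off = 0;

        for (int i = 0; i < 3; i++) {
            int ret = advance(&off, hdrs[i]);
            printf("hdr %u -> off %u (%s)\n",
                   hdrs[i], off, ret ? "reject" : "ok");
            if (ret)
                break;
        }
        return 0;
    }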
3723 | diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c |
3724 | index 3815e8505ed2..e585c0a2591c 100644 |
3725 | --- a/net/ipv6/reassembly.c |
3726 | +++ b/net/ipv6/reassembly.c |
3727 | @@ -709,19 +709,13 @@ static void ip6_frags_sysctl_unregister(void) |
3728 | |
3729 | static int __net_init ipv6_frags_init_net(struct net *net) |
3730 | { |
3731 | - int res; |
3732 | - |
3733 | net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; |
3734 | net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH; |
3735 | net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT; |
3736 | |
3737 | - res = inet_frags_init_net(&net->ipv6.frags); |
3738 | - if (res) |
3739 | - return res; |
3740 | - res = ip6_frags_ns_sysctl_register(net); |
3741 | - if (res) |
3742 | - inet_frags_uninit_net(&net->ipv6.frags); |
3743 | - return res; |
3744 | + inet_frags_init_net(&net->ipv6.frags); |
3745 | + |
3746 | + return ip6_frags_ns_sysctl_register(net); |
3747 | } |
3748 | |
3749 | static void __net_exit ipv6_frags_exit_net(struct net *net) |
3750 | diff --git a/net/ipv6/route.c b/net/ipv6/route.c |
3751 | index 5764a84465f8..61729641e027 100644 |
3752 | --- a/net/ipv6/route.c |
3753 | +++ b/net/ipv6/route.c |
3754 | @@ -1267,7 +1267,9 @@ static void rt6_dst_from_metrics_check(struct rt6_info *rt) |
3755 | |
3756 | static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie) |
3757 | { |
3758 | - if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie)) |
3759 | + u32 rt_cookie = 0; |
3760 | + |
3761 | + if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie) |
3762 | return NULL; |
3763 | |
3764 | if (rt6_check_expired(rt)) |
3765 | @@ -1335,8 +1337,14 @@ static void ip6_link_failure(struct sk_buff *skb) |
3766 | if (rt->rt6i_flags & RTF_CACHE) { |
3767 | dst_hold(&rt->dst); |
3768 | ip6_del_rt(rt); |
3769 | - } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) { |
3770 | - rt->rt6i_node->fn_sernum = -1; |
3771 | + } else { |
3772 | + struct fib6_node *fn; |
3773 | + |
3774 | + rcu_read_lock(); |
3775 | + fn = rcu_dereference(rt->rt6i_node); |
3776 | + if (fn && (rt->rt6i_flags & RTF_DEFAULT)) |
3777 | + fn->fn_sernum = -1; |
3778 | + rcu_read_unlock(); |
3779 | } |
3780 | } |
3781 | } |
3782 | @@ -1353,7 +1361,8 @@ static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) |
3783 | static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) |
3784 | { |
3785 | return !(rt->rt6i_flags & RTF_CACHE) && |
3786 | - (rt->rt6i_flags & RTF_PCPU || rt->rt6i_node); |
3787 | + (rt->rt6i_flags & RTF_PCPU || |
3788 | + rcu_access_pointer(rt->rt6i_node)); |
3789 | } |
3790 | |
3791 | static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, |
3792 | diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c |
3793 | index fecad1098cf8..7eb0e8fe3ca8 100644 |
3794 | --- a/net/kcm/kcmsock.c |
3795 | +++ b/net/kcm/kcmsock.c |
3796 | @@ -1381,6 +1381,10 @@ static int kcm_attach(struct socket *sock, struct socket *csock, |
3797 | if (!csk) |
3798 | return -EINVAL; |
3799 | |
3800 | +	/* We must prevent loops or risk deadlock! */ |
3801 | + if (csk->sk_family == PF_KCM) |
3802 | + return -EOPNOTSUPP; |
3803 | + |
3804 | psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL); |
3805 | if (!psock) |
3806 | return -ENOMEM; |
3807 | diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c |
3808 | index ae7bfd26cd91..35ba4b60d927 100644 |
3809 | --- a/net/packet/af_packet.c |
3810 | +++ b/net/packet/af_packet.c |
3811 | @@ -2151,6 +2151,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, |
3812 | struct timespec ts; |
3813 | __u32 ts_status; |
3814 | bool is_drop_n_account = false; |
3815 | + bool do_vnet = false; |
3816 | |
3817 | /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT. |
3818 | * We may add members to them until current aligned size without forcing |
3819 | @@ -2201,8 +2202,10 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, |
3820 | netoff = TPACKET_ALIGN(po->tp_hdrlen + |
3821 | (maclen < 16 ? 16 : maclen)) + |
3822 | po->tp_reserve; |
3823 | - if (po->has_vnet_hdr) |
3824 | + if (po->has_vnet_hdr) { |
3825 | netoff += sizeof(struct virtio_net_hdr); |
3826 | + do_vnet = true; |
3827 | + } |
3828 | macoff = netoff - maclen; |
3829 | } |
3830 | if (po->tp_version <= TPACKET_V2) { |
3831 | @@ -2219,8 +2222,10 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, |
3832 | skb_set_owner_r(copy_skb, sk); |
3833 | } |
3834 | snaplen = po->rx_ring.frame_size - macoff; |
3835 | - if ((int)snaplen < 0) |
3836 | + if ((int)snaplen < 0) { |
3837 | snaplen = 0; |
3838 | + do_vnet = false; |
3839 | + } |
3840 | } |
3841 | } else if (unlikely(macoff + snaplen > |
3842 | GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) { |
3843 | @@ -2233,6 +2238,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, |
3844 | if (unlikely((int)snaplen < 0)) { |
3845 | snaplen = 0; |
3846 | macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len; |
3847 | + do_vnet = false; |
3848 | } |
3849 | } |
3850 | spin_lock(&sk->sk_receive_queue.lock); |
3851 | @@ -2258,7 +2264,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, |
3852 | } |
3853 | spin_unlock(&sk->sk_receive_queue.lock); |
3854 | |
3855 | - if (po->has_vnet_hdr) { |
3856 | + if (do_vnet) { |
3857 | if (__packet_rcv_vnet(skb, h.raw + macoff - |
3858 | sizeof(struct virtio_net_hdr))) { |
3859 | spin_lock(&sk->sk_receive_queue.lock); |
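Rather than re-testing po->has_vnet_hdr at delivery time, tpacket_rcv() now latches a local do_vnet when the virtio_net header room is reserved in netoff and clears it on every path that invalidates that decision (snaplen clamped to zero), so the header is written only when its room was actually accounted for. A generic sketch of that latch-and-invalidate pattern (all names and values invented):

    #include <stdbool.h>
    #include <stdio.h>

    int main(void)
    {
        bool has_feature = true;    /* analogue of po->has_vnet_hdr */
        bool do_feature = false;    /* the latch */
        int room = 0, snaplen = -3; /* forces the clamp path below */

        if (has_feature) {
            room += 10;             /* reserve header room up front */
            do_feature = true;
        }
        if (snaplen < 0) {
            snaplen = 0;
            do_feature = false;     /* precondition invalidated */
        }
        if (do_feature)
            printf("write header into %d reserved bytes\n", room);
        else
            printf("skip header write\n");
        return 0;
    }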
3860 | diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c |
3861 | index 048954eee984..e8f56b7c5afb 100644 |
3862 | --- a/net/sctp/sctp_diag.c |
3863 | +++ b/net/sctp/sctp_diag.c |
3864 | @@ -70,7 +70,8 @@ static int inet_diag_msg_sctpladdrs_fill(struct sk_buff *skb, |
3865 | |
3866 | info = nla_data(attr); |
3867 | list_for_each_entry_rcu(laddr, address_list, list) { |
3868 | - memcpy(info, &laddr->a, addrlen); |
3869 | + memcpy(info, &laddr->a, sizeof(laddr->a)); |
3870 | + memset(info + sizeof(laddr->a), 0, addrlen - sizeof(laddr->a)); |
3871 | info += addrlen; |
3872 | } |
3873 | |
3874 | @@ -93,7 +94,9 @@ static int inet_diag_msg_sctpaddrs_fill(struct sk_buff *skb, |
3875 | info = nla_data(attr); |
3876 | list_for_each_entry(from, &asoc->peer.transport_addr_list, |
3877 | transports) { |
3878 | - memcpy(info, &from->ipaddr, addrlen); |
3879 | + memcpy(info, &from->ipaddr, sizeof(from->ipaddr)); |
3880 | + memset(info + sizeof(from->ipaddr), 0, |
3881 | + addrlen - sizeof(from->ipaddr)); |
3882 | info += addrlen; |
3883 | } |
3884 | |
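Both sctp_diag hunks above fix a kernel-memory infoleak: each netlink record is addrlen bytes, but the address being copied can be smaller, so copying addrlen bytes exported whatever followed the address in kernel memory. The fix copies exactly sizeof(addr) and zeroes the rest of the record. The copy-then-zero idiom as a runnable demo (struct and sizes invented):

    #include <stdio.h>
    #include <string.h>

    struct addr { unsigned char bytes[8]; };

    #define RECLEN 16   /* fixed record size, larger than struct addr */

    static void emit_record(unsigned char *out, const struct addr *a)
    {
        memcpy(out, a, sizeof(*a));                       /* real data     */
        memset(out + sizeof(*a), 0, RECLEN - sizeof(*a)); /* no stale tail */
    }

    int main(void)
    {
        unsigned char rec[RECLEN];
        struct addr a = { { 1, 2, 3, 4, 5, 6, 7, 8 } };

        memset(rec, 0xAA, sizeof(rec));  /* simulate stale buffer bytes */
        emit_record(rec, &a);
        for (size_t i = 0; i < sizeof(rec); i++)
            printf("%02x ", rec[i]);     /* tail prints 00, not aa */
        printf("\n");
        return 0;
    }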
3885 | diff --git a/net/sctp/socket.c b/net/sctp/socket.c |
3886 | index 9647e314d4fc..3ef725229449 100644 |
3887 | --- a/net/sctp/socket.c |
3888 | +++ b/net/sctp/socket.c |
3889 | @@ -4373,8 +4373,7 @@ int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc, |
3890 | info->sctpi_ictrlchunks = asoc->stats.ictrlchunks; |
3891 | |
3892 | prim = asoc->peer.primary_path; |
3893 | - memcpy(&info->sctpi_p_address, &prim->ipaddr, |
3894 | - sizeof(struct sockaddr_storage)); |
3895 | + memcpy(&info->sctpi_p_address, &prim->ipaddr, sizeof(prim->ipaddr)); |
3896 | info->sctpi_p_state = prim->state; |
3897 | info->sctpi_p_cwnd = prim->cwnd; |
3898 | info->sctpi_p_srtt = prim->srtt; |
3899 | diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c |
3900 | index 84d0fdaf7de9..d3cfbf2f407d 100644 |
3901 | --- a/net/sctp/ulpqueue.c |
3902 | +++ b/net/sctp/ulpqueue.c |
3903 | @@ -265,7 +265,8 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) |
3904 | sctp_ulpq_clear_pd(ulpq); |
3905 | |
3906 | if (queue == &sk->sk_receive_queue && !sp->data_ready_signalled) { |
3907 | - sp->data_ready_signalled = 1; |
3908 | + if (!sock_owned_by_user(sk)) |
3909 | + sp->data_ready_signalled = 1; |
3910 | sk->sk_data_ready(sk); |
3911 | } |
3912 | return 1; |