Contents of /trunk/kernel26-magellan/patches-2.6.31-r4/0107-2.6.31.8-all-fixes.patch
Parent Directory | Revision Log
Revision 968 -
(show annotations)
(download)
Fri Jan 1 14:52:51 2010 UTC (14 years, 8 months ago) by niro
File size: 165293 byte(s)
-2.6.31-magellan-r4: -updated to linux-2.6.31.9
1 | diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt |
2 | index 7be02ac..32c3da4 100644 |
3 | --- a/Documentation/filesystems/ext4.txt |
4 | +++ b/Documentation/filesystems/ext4.txt |
5 | @@ -153,8 +153,8 @@ journal_dev=devnum When the external journal device's major/minor numbers |
6 | identified through its new major/minor numbers encoded |
7 | in devnum. |
8 | |
9 | -noload Don't load the journal on mounting. Note that |
10 | - if the filesystem was not unmounted cleanly, |
11 | +norecovery Don't load the journal on mounting. Note that |
12 | +noload if the filesystem was not unmounted cleanly, |
13 | skipping the journal replay will lead to the |
14 | filesystem containing inconsistencies that can |
15 | lead to any number of problems. |
16 | @@ -338,6 +338,12 @@ noauto_da_alloc replacing existing files via patterns such as |
17 | system crashes before the delayed allocation |
18 | blocks are forced to disk. |
19 | |
20 | +discard Controls whether ext4 should issue discard/TRIM |
21 | +nodiscard(*) commands to the underlying block device when |
22 | + blocks are freed. This is useful for SSD devices |
23 | + and sparse/thinly-provisioned LUNs, but it is off |
24 | + by default until sufficient testing has been done. |
25 | + |
26 | Data Mode |
27 | ========= |
28 | There are 3 different data modes: |
29 | diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c |
30 | index 5fd2da4..28a753d 100644 |
31 | --- a/drivers/scsi/hosts.c |
32 | +++ b/drivers/scsi/hosts.c |
33 | @@ -180,14 +180,20 @@ void scsi_remove_host(struct Scsi_Host *shost) |
34 | EXPORT_SYMBOL(scsi_remove_host); |
35 | |
36 | /** |
37 | - * scsi_add_host - add a scsi host |
38 | + * scsi_add_host_with_dma - add a scsi host with dma device |
39 | * @shost: scsi host pointer to add |
40 | * @dev: a struct device of type scsi class |
41 | + * @dma_dev: dma device for the host |
42 | + * |
43 | + * Note: You rarely need to worry about this unless you're in a |
44 | + * virtualised host environments, so use the simpler scsi_add_host() |
45 | + * function instead. |
46 | * |
47 | * Return value: |
48 | * 0 on success / != 0 for error |
49 | **/ |
50 | -int scsi_add_host(struct Scsi_Host *shost, struct device *dev) |
51 | +int scsi_add_host_with_dma(struct Scsi_Host *shost, struct device *dev, |
52 | + struct device *dma_dev) |
53 | { |
54 | struct scsi_host_template *sht = shost->hostt; |
55 | int error = -EINVAL; |
56 | @@ -207,6 +213,7 @@ int scsi_add_host(struct Scsi_Host *shost, struct device *dev) |
57 | |
58 | if (!shost->shost_gendev.parent) |
59 | shost->shost_gendev.parent = dev ? dev : &platform_bus; |
60 | + shost->dma_dev = dma_dev; |
61 | |
62 | error = device_add(&shost->shost_gendev); |
63 | if (error) |
64 | @@ -262,7 +269,7 @@ int scsi_add_host(struct Scsi_Host *shost, struct device *dev) |
65 | fail: |
66 | return error; |
67 | } |
68 | -EXPORT_SYMBOL(scsi_add_host); |
69 | +EXPORT_SYMBOL(scsi_add_host_with_dma); |
70 | |
71 | static void scsi_host_dev_release(struct device *dev) |
72 | { |
73 | diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c |
74 | index fc67cc6..cf13ff2 100644 |
75 | --- a/drivers/scsi/lpfc/lpfc_init.c |
76 | +++ b/drivers/scsi/lpfc/lpfc_init.c |
77 | @@ -2384,7 +2384,7 @@ lpfc_create_port(struct lpfc_hba *phba, int instance, struct device *dev) |
78 | vport->els_tmofunc.function = lpfc_els_timeout; |
79 | vport->els_tmofunc.data = (unsigned long)vport; |
80 | |
81 | - error = scsi_add_host(shost, dev); |
82 | + error = scsi_add_host_with_dma(shost, dev, &phba->pcidev->dev); |
83 | if (error) |
84 | goto out_put_shost; |
85 | |
86 | diff --git a/drivers/scsi/megaraid/megaraid_sas.c b/drivers/scsi/megaraid/megaraid_sas.c |
87 | index 7dc3d18..7a838c8 100644 |
88 | --- a/drivers/scsi/megaraid/megaraid_sas.c |
89 | +++ b/drivers/scsi/megaraid/megaraid_sas.c |
90 | @@ -3032,7 +3032,7 @@ megasas_mgmt_fw_ioctl(struct megasas_instance *instance, |
91 | int error = 0, i; |
92 | void *sense = NULL; |
93 | dma_addr_t sense_handle; |
94 | - u32 *sense_ptr; |
95 | + unsigned long *sense_ptr; |
96 | |
97 | memset(kbuff_arr, 0, sizeof(kbuff_arr)); |
98 | |
99 | @@ -3109,7 +3109,7 @@ megasas_mgmt_fw_ioctl(struct megasas_instance *instance, |
100 | } |
101 | |
102 | sense_ptr = |
103 | - (u32 *) ((unsigned long)cmd->frame + ioc->sense_off); |
104 | + (unsigned long *) ((unsigned long)cmd->frame + ioc->sense_off); |
105 | *sense_ptr = sense_handle; |
106 | } |
107 | |
108 | @@ -3140,8 +3140,8 @@ megasas_mgmt_fw_ioctl(struct megasas_instance *instance, |
109 | * sense_ptr points to the location that has the user |
110 | * sense buffer address |
111 | */ |
112 | - sense_ptr = (u32 *) ((unsigned long)ioc->frame.raw + |
113 | - ioc->sense_off); |
114 | + sense_ptr = (unsigned long *) ((unsigned long)ioc->frame.raw + |
115 | + ioc->sense_off); |
116 | |
117 | if (copy_to_user((void __user *)((unsigned long)(*sense_ptr)), |
118 | sense, ioc->sense_len)) { |
119 | diff --git a/drivers/scsi/qla2xxx/qla_attr.c b/drivers/scsi/qla2xxx/qla_attr.c |
120 | index 0f87962..67e016d 100644 |
121 | --- a/drivers/scsi/qla2xxx/qla_attr.c |
122 | +++ b/drivers/scsi/qla2xxx/qla_attr.c |
123 | @@ -1654,7 +1654,8 @@ qla24xx_vport_create(struct fc_vport *fc_vport, bool disable) |
124 | fc_vport_set_state(fc_vport, FC_VPORT_LINKDOWN); |
125 | } |
126 | |
127 | - if (scsi_add_host(vha->host, &fc_vport->dev)) { |
128 | + if (scsi_add_host_with_dma(vha->host, &fc_vport->dev, |
129 | + &ha->pdev->dev)) { |
130 | DEBUG15(printk("scsi(%ld): scsi_add_host failure for VP[%d].\n", |
131 | vha->host_no, vha->vp_idx)); |
132 | goto vport_create_failed_2; |
133 | diff --git a/drivers/scsi/scsi_lib_dma.c b/drivers/scsi/scsi_lib_dma.c |
134 | index ac6855c..dcd1285 100644 |
135 | --- a/drivers/scsi/scsi_lib_dma.c |
136 | +++ b/drivers/scsi/scsi_lib_dma.c |
137 | @@ -23,7 +23,7 @@ int scsi_dma_map(struct scsi_cmnd *cmd) |
138 | int nseg = 0; |
139 | |
140 | if (scsi_sg_count(cmd)) { |
141 | - struct device *dev = cmd->device->host->shost_gendev.parent; |
142 | + struct device *dev = cmd->device->host->dma_dev; |
143 | |
144 | nseg = dma_map_sg(dev, scsi_sglist(cmd), scsi_sg_count(cmd), |
145 | cmd->sc_data_direction); |
146 | @@ -41,7 +41,7 @@ EXPORT_SYMBOL(scsi_dma_map); |
147 | void scsi_dma_unmap(struct scsi_cmnd *cmd) |
148 | { |
149 | if (scsi_sg_count(cmd)) { |
150 | - struct device *dev = cmd->device->host->shost_gendev.parent; |
151 | + struct device *dev = cmd->device->host->dma_dev; |
152 | |
153 | dma_unmap_sg(dev, scsi_sglist(cmd), scsi_sg_count(cmd), |
154 | cmd->sc_data_direction); |
155 | diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c |
156 | index e2126d7..34bb797 100644 |
157 | --- a/fs/ext4/balloc.c |
158 | +++ b/fs/ext4/balloc.c |
159 | @@ -761,7 +761,13 @@ static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb, |
160 | static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, |
161 | ext4_group_t group) |
162 | { |
163 | - return ext4_bg_has_super(sb, group) ? EXT4_SB(sb)->s_gdb_count : 0; |
164 | + if (!ext4_bg_has_super(sb, group)) |
165 | + return 0; |
166 | + |
167 | + if (EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG)) |
168 | + return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); |
169 | + else |
170 | + return EXT4_SB(sb)->s_gdb_count; |
171 | } |
172 | |
173 | /** |
174 | diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c |
175 | index 50784ef..dc79b75 100644 |
176 | --- a/fs/ext4/block_validity.c |
177 | +++ b/fs/ext4/block_validity.c |
178 | @@ -160,7 +160,7 @@ int ext4_setup_system_zone(struct super_block *sb) |
179 | if (ext4_bg_has_super(sb, i) && |
180 | ((i < 5) || ((i % flex_size) == 0))) |
181 | add_system_zone(sbi, ext4_group_first_block_no(sb, i), |
182 | - sbi->s_gdb_count + 1); |
183 | + ext4_bg_num_gdb(sb, i) + 1); |
184 | gdp = ext4_get_group_desc(sb, i, NULL); |
185 | ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1); |
186 | if (ret) |
187 | diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h |
188 | index 9714db3..3b8321b 100644 |
189 | --- a/fs/ext4/ext4.h |
190 | +++ b/fs/ext4/ext4.h |
191 | @@ -88,6 +88,8 @@ typedef unsigned int ext4_group_t; |
192 | #define EXT4_MB_HINT_TRY_GOAL 512 |
193 | /* blocks already pre-reserved by delayed allocation */ |
194 | #define EXT4_MB_DELALLOC_RESERVED 1024 |
195 | +/* We are doing stream allocation */ |
196 | +#define EXT4_MB_STREAM_ALLOC 2048 |
197 | |
198 | |
199 | struct ext4_allocation_request { |
200 | @@ -111,6 +113,33 @@ struct ext4_allocation_request { |
201 | unsigned int flags; |
202 | }; |
203 | |
204 | +#define DIO_AIO_UNWRITTEN 0x1 |
205 | +typedef struct ext4_io_end { |
206 | + struct list_head list; /* per-file finished AIO list */ |
207 | + struct inode *inode; /* file being written to */ |
208 | + unsigned int flag; /* sync IO or AIO */ |
209 | + int error; /* I/O error code */ |
210 | + ext4_lblk_t offset; /* offset in the file */ |
211 | + size_t size; /* size of the extent */ |
212 | + struct work_struct work; /* data work queue */ |
213 | +} ext4_io_end_t; |
214 | + |
215 | +/* |
216 | + * Delayed allocation stuff |
217 | + */ |
218 | + |
219 | +struct mpage_da_data { |
220 | + struct inode *inode; |
221 | + sector_t b_blocknr; /* start block number of extent */ |
222 | + size_t b_size; /* size of extent */ |
223 | + unsigned long b_state; /* state of the extent */ |
224 | + unsigned long first_page, next_page; /* extent of pages */ |
225 | + struct writeback_control *wbc; |
226 | + int io_done; |
227 | + int pages_written; |
228 | + int retval; |
229 | +}; |
230 | + |
231 | /* |
232 | * Special inodes numbers |
233 | */ |
234 | @@ -251,7 +280,6 @@ struct flex_groups { |
235 | #define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ |
236 | #define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ |
237 | #define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ |
238 | -#define EXT4_EXT_MIGRATE 0x00100000 /* Inode is migrating */ |
239 | #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ |
240 | |
241 | #define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ |
242 | @@ -289,6 +317,8 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) |
243 | #define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */ |
244 | #define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */ |
245 | #define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */ |
246 | +#define EXT4_STATE_EXT_MIGRATE 0x00000020 /* Inode is migrating */ |
247 | +#define EXT4_STATE_DIO_UNWRITTEN 0x00000040 /* need convert on dio done*/ |
248 | |
249 | /* Used to pass group descriptor data when online resize is done */ |
250 | struct ext4_new_group_input { |
251 | @@ -330,7 +360,16 @@ struct ext4_new_group_data { |
252 | /* Call ext4_da_update_reserve_space() after successfully |
253 | allocating the blocks */ |
254 | #define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008 |
255 | - |
256 | + /* caller is from the direct IO path, request to creation of an |
257 | + unitialized extents if not allocated, split the uninitialized |
258 | + extent if blocks has been preallocated already*/ |
259 | +#define EXT4_GET_BLOCKS_DIO 0x0010 |
260 | +#define EXT4_GET_BLOCKS_CONVERT 0x0020 |
261 | +#define EXT4_GET_BLOCKS_DIO_CREATE_EXT (EXT4_GET_BLOCKS_DIO|\ |
262 | + EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) |
263 | + /* Convert extent to initialized after direct IO complete */ |
264 | +#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ |
265 | + EXT4_GET_BLOCKS_DIO_CREATE_EXT) |
266 | |
267 | /* |
268 | * ioctl commands |
269 | @@ -386,6 +425,9 @@ struct ext4_mount_options { |
270 | #endif |
271 | }; |
272 | |
273 | +/* Max physical block we can addres w/o extents */ |
274 | +#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF |
275 | + |
276 | /* |
277 | * Structure of an inode on the disk |
278 | */ |
279 | @@ -481,8 +523,8 @@ struct move_extent { |
280 | static inline __le32 ext4_encode_extra_time(struct timespec *time) |
281 | { |
282 | return cpu_to_le32((sizeof(time->tv_sec) > 4 ? |
283 | - time->tv_sec >> 32 : 0) | |
284 | - ((time->tv_nsec << 2) & EXT4_NSEC_MASK)); |
285 | + (time->tv_sec >> 32) & EXT4_EPOCH_MASK : 0) | |
286 | + ((time->tv_nsec << EXT4_EPOCH_BITS) & EXT4_NSEC_MASK)); |
287 | } |
288 | |
289 | static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra) |
290 | @@ -490,7 +532,7 @@ static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra) |
291 | if (sizeof(time->tv_sec) > 4) |
292 | time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) |
293 | << 32; |
294 | - time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> 2; |
295 | + time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; |
296 | } |
297 | |
298 | #define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ |
299 | @@ -653,6 +695,18 @@ struct ext4_inode_info { |
300 | __u16 i_extra_isize; |
301 | |
302 | spinlock_t i_block_reservation_lock; |
303 | + |
304 | + /* completed async DIOs that might need unwritten extents handling */ |
305 | + struct list_head i_aio_dio_complete_list; |
306 | + /* current io_end structure for async DIO write*/ |
307 | + ext4_io_end_t *cur_aio_dio; |
308 | + |
309 | + /* |
310 | + * Transactions that contain inode's metadata needed to complete |
311 | + * fsync and fdatasync, respectively. |
312 | + */ |
313 | + tid_t i_sync_tid; |
314 | + tid_t i_datasync_tid; |
315 | }; |
316 | |
317 | /* |
318 | @@ -700,6 +754,7 @@ struct ext4_inode_info { |
319 | #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ |
320 | #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ |
321 | #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ |
322 | +#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ |
323 | |
324 | #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt |
325 | #define set_opt(o, opt) o |= EXT4_MOUNT_##opt |
326 | @@ -841,6 +896,7 @@ struct ext4_sb_info { |
327 | unsigned long s_gdb_count; /* Number of group descriptor blocks */ |
328 | unsigned long s_desc_per_block; /* Number of group descriptors per block */ |
329 | ext4_group_t s_groups_count; /* Number of groups in the fs */ |
330 | + ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */ |
331 | unsigned long s_overhead_last; /* Last calculated overhead */ |
332 | unsigned long s_blocks_last; /* Last seen block count */ |
333 | loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ |
334 | @@ -923,6 +979,7 @@ struct ext4_sb_info { |
335 | unsigned int s_mb_stats; |
336 | unsigned int s_mb_order2_reqs; |
337 | unsigned int s_mb_group_prealloc; |
338 | + unsigned int s_max_writeback_mb_bump; |
339 | /* where last allocation was done - for stream allocation */ |
340 | unsigned long s_mb_last_group; |
341 | unsigned long s_mb_last_start; |
342 | @@ -950,6 +1007,7 @@ struct ext4_sb_info { |
343 | atomic_t s_mb_lost_chunks; |
344 | atomic_t s_mb_preallocated; |
345 | atomic_t s_mb_discarded; |
346 | + atomic_t s_lock_busy; |
347 | |
348 | /* locality groups */ |
349 | struct ext4_locality_group *s_locality_groups; |
350 | @@ -960,6 +1018,9 @@ struct ext4_sb_info { |
351 | |
352 | unsigned int s_log_groups_per_flex; |
353 | struct flex_groups *s_flex_groups; |
354 | + |
355 | + /* workqueue for dio unwritten */ |
356 | + struct workqueue_struct *dio_unwritten_wq; |
357 | }; |
358 | |
359 | static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) |
360 | @@ -1367,6 +1428,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int); |
361 | extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); |
362 | extern int ext4_can_truncate(struct inode *inode); |
363 | extern void ext4_truncate(struct inode *); |
364 | +extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); |
365 | extern void ext4_set_inode_flags(struct inode *); |
366 | extern void ext4_get_inode_flags(struct ext4_inode_info *); |
367 | extern int ext4_alloc_da_blocks(struct inode *inode); |
368 | @@ -1378,7 +1440,7 @@ extern int ext4_block_truncate_page(handle_t *handle, |
369 | struct address_space *mapping, loff_t from); |
370 | extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); |
371 | extern qsize_t ext4_get_reserved_space(struct inode *inode); |
372 | - |
373 | +extern int flush_aio_dio_completed_IO(struct inode *inode); |
374 | /* ioctl.c */ |
375 | extern long ext4_ioctl(struct file *, unsigned int, unsigned long); |
376 | extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); |
377 | @@ -1591,15 +1653,42 @@ struct ext4_group_info { |
378 | #define EXT4_MB_GRP_NEED_INIT(grp) \ |
379 | (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) |
380 | |
381 | +#define EXT4_MAX_CONTENTION 8 |
382 | +#define EXT4_CONTENTION_THRESHOLD 2 |
383 | + |
384 | static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb, |
385 | ext4_group_t group) |
386 | { |
387 | return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group); |
388 | } |
389 | |
390 | +/* |
391 | + * Returns true if the filesystem is busy enough that attempts to |
392 | + * access the block group locks has run into contention. |
393 | + */ |
394 | +static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi) |
395 | +{ |
396 | + return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD); |
397 | +} |
398 | + |
399 | static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) |
400 | { |
401 | - spin_lock(ext4_group_lock_ptr(sb, group)); |
402 | + spinlock_t *lock = ext4_group_lock_ptr(sb, group); |
403 | + if (spin_trylock(lock)) |
404 | + /* |
405 | + * We're able to grab the lock right away, so drop the |
406 | + * lock contention counter. |
407 | + */ |
408 | + atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0); |
409 | + else { |
410 | + /* |
411 | + * The lock is busy, so bump the contention counter, |
412 | + * and then wait on the spin lock. |
413 | + */ |
414 | + atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1, |
415 | + EXT4_MAX_CONTENTION); |
416 | + spin_lock(lock); |
417 | + } |
418 | } |
419 | |
420 | static inline void ext4_unlock_group(struct super_block *sb, |
421 | @@ -1650,6 +1739,8 @@ extern void ext4_ext_init(struct super_block *); |
422 | extern void ext4_ext_release(struct super_block *); |
423 | extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, |
424 | loff_t len); |
425 | +extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, |
426 | + loff_t len); |
427 | extern int ext4_get_blocks(handle_t *handle, struct inode *inode, |
428 | sector_t block, unsigned int max_blocks, |
429 | struct buffer_head *bh, int flags); |
430 | diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h |
431 | index 20a8410..1c2db3f 100644 |
432 | --- a/fs/ext4/ext4_extents.h |
433 | +++ b/fs/ext4/ext4_extents.h |
434 | @@ -220,6 +220,11 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext) |
435 | (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN)); |
436 | } |
437 | |
438 | +static inline void ext4_ext_mark_initialized(struct ext4_extent *ext) |
439 | +{ |
440 | + ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext)); |
441 | +} |
442 | + |
443 | extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks); |
444 | extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex); |
445 | extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); |
446 | @@ -235,7 +240,7 @@ extern int ext4_ext_try_to_merge(struct inode *inode, |
447 | struct ext4_ext_path *path, |
448 | struct ext4_extent *); |
449 | extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *); |
450 | -extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *); |
451 | +extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int); |
452 | extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t, |
453 | ext_prepare_callback, void *); |
454 | extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, |
455 | diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c |
456 | index eb27fd0..6a94099 100644 |
457 | --- a/fs/ext4/ext4_jbd2.c |
458 | +++ b/fs/ext4/ext4_jbd2.c |
459 | @@ -44,7 +44,7 @@ int __ext4_journal_forget(const char *where, handle_t *handle, |
460 | handle, err); |
461 | } |
462 | else |
463 | - brelse(bh); |
464 | + bforget(bh); |
465 | return err; |
466 | } |
467 | |
468 | @@ -60,7 +60,7 @@ int __ext4_journal_revoke(const char *where, handle_t *handle, |
469 | handle, err); |
470 | } |
471 | else |
472 | - brelse(bh); |
473 | + bforget(bh); |
474 | return err; |
475 | } |
476 | |
477 | @@ -89,7 +89,10 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, |
478 | ext4_journal_abort_handle(where, __func__, bh, |
479 | handle, err); |
480 | } else { |
481 | - mark_buffer_dirty(bh); |
482 | + if (inode && bh) |
483 | + mark_buffer_dirty_inode(bh, inode); |
484 | + else |
485 | + mark_buffer_dirty(bh); |
486 | if (inode && inode_needs_sync(inode)) { |
487 | sync_dirty_buffer(bh); |
488 | if (buffer_req(bh) && !buffer_uptodate(bh)) { |
489 | diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h |
490 | index 139fb8c..1892a77 100644 |
491 | --- a/fs/ext4/ext4_jbd2.h |
492 | +++ b/fs/ext4/ext4_jbd2.h |
493 | @@ -49,7 +49,7 @@ |
494 | |
495 | #define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \ |
496 | EXT4_XATTR_TRANS_BLOCKS - 2 + \ |
497 | - 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) |
498 | + EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) |
499 | |
500 | /* |
501 | * Define the number of metadata blocks we need to account to modify data. |
502 | @@ -57,7 +57,7 @@ |
503 | * This include super block, inode block, quota blocks and xattr blocks |
504 | */ |
505 | #define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \ |
506 | - 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) |
507 | + EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) |
508 | |
509 | /* Delete operations potentially hit one directory's namespace plus an |
510 | * entire inode, plus arbitrary amounts of bitmap/indirection data. Be |
511 | @@ -92,6 +92,7 @@ |
512 | * but inode, sb and group updates are done only once */ |
513 | #define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\ |
514 | (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0) |
515 | + |
516 | #define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\ |
517 | (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0) |
518 | #else |
519 | @@ -99,6 +100,9 @@ |
520 | #define EXT4_QUOTA_INIT_BLOCKS(sb) 0 |
521 | #define EXT4_QUOTA_DEL_BLOCKS(sb) 0 |
522 | #endif |
523 | +#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb)) |
524 | +#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) |
525 | +#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) |
526 | |
527 | int |
528 | ext4_mark_iloc_dirty(handle_t *handle, |
529 | @@ -161,11 +165,13 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, |
530 | handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); |
531 | int __ext4_journal_stop(const char *where, handle_t *handle); |
532 | |
533 | -#define EXT4_NOJOURNAL_HANDLE ((handle_t *) 0x1) |
534 | +#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096) |
535 | |
536 | +/* Note: Do not use this for NULL handles. This is only to determine if |
537 | + * a properly allocated handle is using a journal or not. */ |
538 | static inline int ext4_handle_valid(handle_t *handle) |
539 | { |
540 | - if (handle == EXT4_NOJOURNAL_HANDLE) |
541 | + if ((unsigned long)handle < EXT4_NOJOURNAL_MAX_REF_COUNT) |
542 | return 0; |
543 | return 1; |
544 | } |
545 | @@ -252,6 +258,19 @@ static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) |
546 | return 0; |
547 | } |
548 | |
549 | +static inline void ext4_update_inode_fsync_trans(handle_t *handle, |
550 | + struct inode *inode, |
551 | + int datasync) |
552 | +{ |
553 | + struct ext4_inode_info *ei = EXT4_I(inode); |
554 | + |
555 | + if (ext4_handle_valid(handle)) { |
556 | + ei->i_sync_tid = handle->h_transaction->t_tid; |
557 | + if (datasync) |
558 | + ei->i_datasync_tid = handle->h_transaction->t_tid; |
559 | + } |
560 | +} |
561 | + |
562 | /* super.c */ |
563 | int ext4_force_commit(struct super_block *sb); |
564 | |
565 | diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c |
566 | index 73ebfb4..24fb20b 100644 |
567 | --- a/fs/ext4/extents.c |
568 | +++ b/fs/ext4/extents.c |
569 | @@ -93,7 +93,9 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb) |
570 | ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); |
571 | } |
572 | |
573 | -static int ext4_ext_journal_restart(handle_t *handle, int needed) |
574 | +static int ext4_ext_truncate_extend_restart(handle_t *handle, |
575 | + struct inode *inode, |
576 | + int needed) |
577 | { |
578 | int err; |
579 | |
580 | @@ -104,7 +106,14 @@ static int ext4_ext_journal_restart(handle_t *handle, int needed) |
581 | err = ext4_journal_extend(handle, needed); |
582 | if (err <= 0) |
583 | return err; |
584 | - return ext4_journal_restart(handle, needed); |
585 | + err = ext4_truncate_restart_trans(handle, inode, needed); |
586 | + /* |
587 | + * We have dropped i_data_sem so someone might have cached again |
588 | + * an extent we are going to truncate. |
589 | + */ |
590 | + ext4_ext_invalidate_cache(inode); |
591 | + |
592 | + return err; |
593 | } |
594 | |
595 | /* |
596 | @@ -701,7 +710,7 @@ err: |
597 | * insert new index [@logical;@ptr] into the block at @curp; |
598 | * check where to insert: before @curp or after @curp |
599 | */ |
600 | -static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, |
601 | +int ext4_ext_insert_index(handle_t *handle, struct inode *inode, |
602 | struct ext4_ext_path *curp, |
603 | int logical, ext4_fsblk_t ptr) |
604 | { |
605 | @@ -1563,7 +1572,7 @@ out: |
606 | */ |
607 | int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, |
608 | struct ext4_ext_path *path, |
609 | - struct ext4_extent *newext) |
610 | + struct ext4_extent *newext, int flag) |
611 | { |
612 | struct ext4_extent_header *eh; |
613 | struct ext4_extent *ex, *fex; |
614 | @@ -1579,7 +1588,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, |
615 | BUG_ON(path[depth].p_hdr == NULL); |
616 | |
617 | /* try to insert block into found extent and return */ |
618 | - if (ex && ext4_can_extents_be_merged(inode, ex, newext)) { |
619 | + if (ex && (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT) |
620 | + && ext4_can_extents_be_merged(inode, ex, newext)) { |
621 | ext_debug("append %d block to %d:%d (from %llu)\n", |
622 | ext4_ext_get_actual_len(newext), |
623 | le32_to_cpu(ex->ee_block), |
624 | @@ -1694,7 +1704,8 @@ has_space: |
625 | |
626 | merge: |
627 | /* try to merge extents to the right */ |
628 | - ext4_ext_try_to_merge(inode, path, nearex); |
629 | + if (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT) |
630 | + ext4_ext_try_to_merge(inode, path, nearex); |
631 | |
632 | /* try to merge extents to the left */ |
633 | |
634 | @@ -1731,7 +1742,9 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, |
635 | while (block < last && block != EXT_MAX_BLOCK) { |
636 | num = last - block; |
637 | /* find extent for this block */ |
638 | + down_read(&EXT4_I(inode)->i_data_sem); |
639 | path = ext4_ext_find_extent(inode, block, path); |
640 | + up_read(&EXT4_I(inode)->i_data_sem); |
641 | if (IS_ERR(path)) { |
642 | err = PTR_ERR(path); |
643 | path = NULL; |
644 | @@ -2044,7 +2057,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, |
645 | ext_debug("free last %u blocks starting %llu\n", num, start); |
646 | for (i = 0; i < num; i++) { |
647 | bh = sb_find_get_block(inode->i_sb, start + i); |
648 | - ext4_forget(handle, 0, inode, bh, start + i); |
649 | + ext4_forget(handle, metadata, inode, bh, start + i); |
650 | } |
651 | ext4_free_blocks(handle, inode, start, num, metadata); |
652 | } else if (from == le32_to_cpu(ex->ee_block) |
653 | @@ -2136,9 +2149,9 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, |
654 | correct_index = 1; |
655 | credits += (ext_depth(inode)) + 1; |
656 | } |
657 | - credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); |
658 | + credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); |
659 | |
660 | - err = ext4_ext_journal_restart(handle, credits); |
661 | + err = ext4_ext_truncate_extend_restart(handle, inode, credits); |
662 | if (err) |
663 | goto out; |
664 | |
665 | @@ -2461,7 +2474,6 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) |
666 | } |
667 | |
668 | #define EXT4_EXT_ZERO_LEN 7 |
669 | - |
670 | /* |
671 | * This function is called by ext4_ext_get_blocks() if someone tries to write |
672 | * to an uninitialized extent. It may result in splitting the uninitialized |
673 | @@ -2554,7 +2566,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, |
674 | ex3->ee_block = cpu_to_le32(iblock); |
675 | ext4_ext_store_pblock(ex3, newblock); |
676 | ex3->ee_len = cpu_to_le16(allocated); |
677 | - err = ext4_ext_insert_extent(handle, inode, path, ex3); |
678 | + err = ext4_ext_insert_extent(handle, inode, path, |
679 | + ex3, 0); |
680 | if (err == -ENOSPC) { |
681 | err = ext4_ext_zeroout(inode, &orig_ex); |
682 | if (err) |
683 | @@ -2610,7 +2623,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, |
684 | ext4_ext_store_pblock(ex3, newblock + max_blocks); |
685 | ex3->ee_len = cpu_to_le16(allocated - max_blocks); |
686 | ext4_ext_mark_uninitialized(ex3); |
687 | - err = ext4_ext_insert_extent(handle, inode, path, ex3); |
688 | + err = ext4_ext_insert_extent(handle, inode, path, ex3, 0); |
689 | if (err == -ENOSPC) { |
690 | err = ext4_ext_zeroout(inode, &orig_ex); |
691 | if (err) |
692 | @@ -2728,7 +2741,191 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, |
693 | err = ext4_ext_dirty(handle, inode, path + depth); |
694 | goto out; |
695 | insert: |
696 | - err = ext4_ext_insert_extent(handle, inode, path, &newex); |
697 | + err = ext4_ext_insert_extent(handle, inode, path, &newex, 0); |
698 | + if (err == -ENOSPC) { |
699 | + err = ext4_ext_zeroout(inode, &orig_ex); |
700 | + if (err) |
701 | + goto fix_extent_len; |
702 | + /* update the extent length and mark as initialized */ |
703 | + ex->ee_block = orig_ex.ee_block; |
704 | + ex->ee_len = orig_ex.ee_len; |
705 | + ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); |
706 | + ext4_ext_dirty(handle, inode, path + depth); |
707 | + /* zero out the first half */ |
708 | + return allocated; |
709 | + } else if (err) |
710 | + goto fix_extent_len; |
711 | +out: |
712 | + return err ? err : allocated; |
713 | + |
714 | +fix_extent_len: |
715 | + ex->ee_block = orig_ex.ee_block; |
716 | + ex->ee_len = orig_ex.ee_len; |
717 | + ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); |
718 | + ext4_ext_mark_uninitialized(ex); |
719 | + ext4_ext_dirty(handle, inode, path + depth); |
720 | + return err; |
721 | +} |
722 | + |
723 | +/* |
724 | + * This function is called by ext4_ext_get_blocks() from |
725 | + * ext4_get_blocks_dio_write() when DIO to write |
726 | + * to an uninitialized extent. |
727 | + * |
728 | + * Writing to an uninitized extent may result in splitting the uninitialized |
729 | + * extent into multiple /intialized unintialized extents (up to three) |
730 | + * There are three possibilities: |
731 | + * a> There is no split required: Entire extent should be uninitialized |
732 | + * b> Splits in two extents: Write is happening at either end of the extent |
733 | + * c> Splits in three extents: Someone is writing in the middle of the extent |
734 | + * |
735 | + * One or more index blocks may be needed if the extent tree grows after |
736 | + * the uninitialized extent split. To prevent ENOSPC occurring when the IO |
737 | + * is completed, we need to split the uninitialized extent before DIO submits |
738 | + * the IO. The uninitialized extent handled at this time will be split |
739 | + * into three uninitialized extents (at most). After IO completes, the part |
740 | + * being filled will be converted to initialized by the end_io callback function |
741 | + * via ext4_convert_unwritten_extents(). |
742 | + * |
743 | + * Returns the size of uninitialized extent to be written on success. |
744 | + */ |
745 | +static int ext4_split_unwritten_extents(handle_t *handle, |
746 | + struct inode *inode, |
747 | + struct ext4_ext_path *path, |
748 | + ext4_lblk_t iblock, |
749 | + unsigned int max_blocks, |
750 | + int flags) |
751 | +{ |
752 | + struct ext4_extent *ex, newex, orig_ex; |
753 | + struct ext4_extent *ex1 = NULL; |
754 | + struct ext4_extent *ex2 = NULL; |
755 | + struct ext4_extent *ex3 = NULL; |
756 | + struct ext4_extent_header *eh; |
757 | + ext4_lblk_t ee_block; |
758 | + unsigned int allocated, ee_len, depth; |
759 | + ext4_fsblk_t newblock; |
760 | + int err = 0; |
761 | + |
762 | + ext_debug("ext4_split_unwritten_extents: inode %lu," |
763 | + "iblock %llu, max_blocks %u\n", inode->i_ino, |
764 | + (unsigned long long)iblock, max_blocks); |
765 | + depth = ext_depth(inode); |
766 | + eh = path[depth].p_hdr; |
767 | + ex = path[depth].p_ext; |
768 | + ee_block = le32_to_cpu(ex->ee_block); |
769 | + ee_len = ext4_ext_get_actual_len(ex); |
770 | + allocated = ee_len - (iblock - ee_block); |
771 | + newblock = iblock - ee_block + ext_pblock(ex); |
772 | + ex2 = ex; |
773 | + orig_ex.ee_block = ex->ee_block; |
774 | + orig_ex.ee_len = cpu_to_le16(ee_len); |
775 | + ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); |
776 | + |
777 | + /* |
778 | + * If the uninitialized extent begins at the same logical |
779 | + * block where the write begins, and the write completely |
780 | + * covers the extent, then we don't need to split it. |
781 | + */ |
782 | + if ((iblock == ee_block) && (allocated <= max_blocks)) |
783 | + return allocated; |
784 | + |
785 | + err = ext4_ext_get_access(handle, inode, path + depth); |
786 | + if (err) |
787 | + goto out; |
788 | + /* ex1: ee_block to iblock - 1 : uninitialized */ |
789 | + if (iblock > ee_block) { |
790 | + ex1 = ex; |
791 | + ex1->ee_len = cpu_to_le16(iblock - ee_block); |
792 | + ext4_ext_mark_uninitialized(ex1); |
793 | + ex2 = &newex; |
794 | + } |
795 | + /* |
796 | + * for sanity, update the length of the ex2 extent before |
797 | + * we insert ex3, if ex1 is NULL. This is to avoid temporary |
798 | + * overlap of blocks. |
799 | + */ |
800 | + if (!ex1 && allocated > max_blocks) |
801 | + ex2->ee_len = cpu_to_le16(max_blocks); |
802 | + /* ex3: to ee_block + ee_len : uninitialised */ |
803 | + if (allocated > max_blocks) { |
804 | + unsigned int newdepth; |
805 | + ex3 = &newex; |
806 | + ex3->ee_block = cpu_to_le32(iblock + max_blocks); |
807 | + ext4_ext_store_pblock(ex3, newblock + max_blocks); |
808 | + ex3->ee_len = cpu_to_le16(allocated - max_blocks); |
809 | + ext4_ext_mark_uninitialized(ex3); |
810 | + err = ext4_ext_insert_extent(handle, inode, path, ex3, flags); |
811 | + if (err == -ENOSPC) { |
812 | + err = ext4_ext_zeroout(inode, &orig_ex); |
813 | + if (err) |
814 | + goto fix_extent_len; |
815 | + /* update the extent length and mark as initialized */ |
816 | + ex->ee_block = orig_ex.ee_block; |
817 | + ex->ee_len = orig_ex.ee_len; |
818 | + ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); |
819 | + ext4_ext_dirty(handle, inode, path + depth); |
820 | + /* zeroed the full extent */ |
821 | + /* blocks available from iblock */ |
822 | + return allocated; |
823 | + |
824 | + } else if (err) |
825 | + goto fix_extent_len; |
826 | + /* |
827 | + * The depth, and hence eh & ex might change |
828 | + * as part of the insert above. |
829 | + */ |
830 | + newdepth = ext_depth(inode); |
831 | + /* |
832 | + * update the extent length after successful insert of the |
833 | + * split extent |
834 | + */ |
835 | + orig_ex.ee_len = cpu_to_le16(ee_len - |
836 | + ext4_ext_get_actual_len(ex3)); |
837 | + depth = newdepth; |
838 | + ext4_ext_drop_refs(path); |
839 | + path = ext4_ext_find_extent(inode, iblock, path); |
840 | + if (IS_ERR(path)) { |
841 | + err = PTR_ERR(path); |
842 | + goto out; |
843 | + } |
844 | + eh = path[depth].p_hdr; |
845 | + ex = path[depth].p_ext; |
846 | + if (ex2 != &newex) |
847 | + ex2 = ex; |
848 | + |
849 | + err = ext4_ext_get_access(handle, inode, path + depth); |
850 | + if (err) |
851 | + goto out; |
852 | + |
853 | + allocated = max_blocks; |
854 | + } |
855 | + /* |
856 | + * If there was a change of depth as part of the |
857 | + * insertion of ex3 above, we need to update the length |
858 | + * of the ex1 extent again here |
859 | + */ |
860 | + if (ex1 && ex1 != ex) { |
861 | + ex1 = ex; |
862 | + ex1->ee_len = cpu_to_le16(iblock - ee_block); |
863 | + ext4_ext_mark_uninitialized(ex1); |
864 | + ex2 = &newex; |
865 | + } |
866 | + /* |
867 | + * ex2: iblock to iblock + maxblocks-1 : to be direct IO written, |
868 | + * uninitialised still. |
869 | + */ |
870 | + ex2->ee_block = cpu_to_le32(iblock); |
871 | + ext4_ext_store_pblock(ex2, newblock); |
872 | + ex2->ee_len = cpu_to_le16(allocated); |
873 | + ext4_ext_mark_uninitialized(ex2); |
874 | + if (ex2 != ex) |
875 | + goto insert; |
876 | + /* Mark modified extent as dirty */ |
877 | + err = ext4_ext_dirty(handle, inode, path + depth); |
878 | + ext_debug("out here\n"); |
879 | + goto out; |
880 | +insert: |
881 | + err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); |
882 | if (err == -ENOSPC) { |
883 | err = ext4_ext_zeroout(inode, &orig_ex); |
884 | if (err) |
885 | @@ -2743,6 +2940,7 @@ insert: |
886 | } else if (err) |
887 | goto fix_extent_len; |
888 | out: |
889 | + ext4_ext_show_leaf(inode, path); |
890 | return err ? err : allocated; |
891 | |
892 | fix_extent_len: |
893 | @@ -2753,7 +2951,151 @@ fix_extent_len: |
894 | ext4_ext_dirty(handle, inode, path + depth); |
895 | return err; |
896 | } |
897 | +static int ext4_convert_unwritten_extents_dio(handle_t *handle, |
898 | + struct inode *inode, |
899 | + struct ext4_ext_path *path) |
900 | +{ |
901 | + struct ext4_extent *ex; |
902 | + struct ext4_extent_header *eh; |
903 | + int depth; |
904 | + int err = 0; |
905 | + int ret = 0; |
906 | + |
907 | + depth = ext_depth(inode); |
908 | + eh = path[depth].p_hdr; |
909 | + ex = path[depth].p_ext; |
910 | + |
911 | + err = ext4_ext_get_access(handle, inode, path + depth); |
912 | + if (err) |
913 | + goto out; |
914 | + /* first mark the extent as initialized */ |
915 | + ext4_ext_mark_initialized(ex); |
916 | + |
917 | + /* |
918 | + * We have to see if it can be merged with the extent |
919 | + * on the left. |
920 | + */ |
921 | + if (ex > EXT_FIRST_EXTENT(eh)) { |
922 | + /* |
923 | + * To merge left, pass "ex - 1" to try_to_merge(), |
924 | + * since it merges towards right _only_. |
925 | + */ |
926 | + ret = ext4_ext_try_to_merge(inode, path, ex - 1); |
927 | + if (ret) { |
928 | + err = ext4_ext_correct_indexes(handle, inode, path); |
929 | + if (err) |
930 | + goto out; |
931 | + depth = ext_depth(inode); |
932 | + ex--; |
933 | + } |
934 | + } |
935 | + /* |
936 | + * Try to Merge towards right. |
937 | + */ |
938 | + ret = ext4_ext_try_to_merge(inode, path, ex); |
939 | + if (ret) { |
940 | + err = ext4_ext_correct_indexes(handle, inode, path); |
941 | + if (err) |
942 | + goto out; |
943 | + depth = ext_depth(inode); |
944 | + } |
945 | + /* Mark modified extent as dirty */ |
946 | + err = ext4_ext_dirty(handle, inode, path + depth); |
947 | +out: |
948 | + ext4_ext_show_leaf(inode, path); |
949 | + return err; |
950 | +} |
951 | + |
952 | +static int |
953 | +ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, |
954 | + ext4_lblk_t iblock, unsigned int max_blocks, |
955 | + struct ext4_ext_path *path, int flags, |
956 | + unsigned int allocated, struct buffer_head *bh_result, |
957 | + ext4_fsblk_t newblock) |
958 | +{ |
959 | + int ret = 0; |
960 | + int err = 0; |
961 | + ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; |
962 | + |
963 | + ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical" |
964 | + "block %llu, max_blocks %u, flags %d, allocated %u", |
965 | + inode->i_ino, (unsigned long long)iblock, max_blocks, |
966 | + flags, allocated); |
967 | + ext4_ext_show_leaf(inode, path); |
968 | |
969 | + /* DIO get_block() before submit the IO, split the extent */ |
970 | + if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) { |
971 | + ret = ext4_split_unwritten_extents(handle, |
972 | + inode, path, iblock, |
973 | + max_blocks, flags); |
974 | + /* |
975 | + * Flag the inode (non-aio case) or end_io struct (aio case) |
976 | + * that this IO needs conversion to written when the IO is |
977 | + * completed |
978 | + */ |
979 | + if (io) |
980 | + io->flag = DIO_AIO_UNWRITTEN; |
981 | + else |
982 | + EXT4_I(inode)->i_state |= EXT4_STATE_DIO_UNWRITTEN; |
983 | + goto out; |
984 | + } |
985 | + /* async DIO end_io complete, convert the filled extent to written */ |
986 | + if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) { |
987 | + ret = ext4_convert_unwritten_extents_dio(handle, inode, |
988 | + path); |
989 | + if (ret >= 0) |
990 | + ext4_update_inode_fsync_trans(handle, inode, 1); |
991 | + goto out2; |
992 | + } |
993 | + /* buffered IO case */ |
994 | + /* |
995 | + * repeat fallocate creation request |
996 | + * we already have an unwritten extent |
997 | + */ |
998 | + if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) |
999 | + goto map_out; |
1000 | + |
1001 | + /* buffered READ or buffered write_begin() lookup */ |
1002 | + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { |
1003 | + /* |
1004 | + * We have blocks reserved already. We |
1005 | + * return allocated blocks so that delalloc |
1006 | + * won't do block reservation for us. But |
1007 | + * the buffer head will be unmapped so that |
1008 | + * a read from the block returns 0s. |
1009 | + */ |
1010 | + set_buffer_unwritten(bh_result); |
1011 | + goto out1; |
1012 | + } |
1013 | + |
1014 | + /* buffered write, writepage time, convert*/ |
1015 | + ret = ext4_ext_convert_to_initialized(handle, inode, |
1016 | + path, iblock, |
1017 | + max_blocks); |
1018 | + if (ret >= 0) |
1019 | + ext4_update_inode_fsync_trans(handle, inode, 1); |
1020 | +out: |
1021 | + if (ret <= 0) { |
1022 | + err = ret; |
1023 | + goto out2; |
1024 | + } else |
1025 | + allocated = ret; |
1026 | + set_buffer_new(bh_result); |
1027 | +map_out: |
1028 | + set_buffer_mapped(bh_result); |
1029 | +out1: |
1030 | + if (allocated > max_blocks) |
1031 | + allocated = max_blocks; |
1032 | + ext4_ext_show_leaf(inode, path); |
1033 | + bh_result->b_bdev = inode->i_sb->s_bdev; |
1034 | + bh_result->b_blocknr = newblock; |
1035 | +out2: |
1036 | + if (path) { |
1037 | + ext4_ext_drop_refs(path); |
1038 | + kfree(path); |
1039 | + } |
1040 | + return err ? err : allocated; |
1041 | +} |
1042 | /* |
1043 | * Block allocation/map/preallocation routine for extents based files |
1044 | * |
1045 | @@ -2784,6 +3126,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, |
1046 | int err = 0, depth, ret, cache_type; |
1047 | unsigned int allocated = 0; |
1048 | struct ext4_allocation_request ar; |
1049 | + ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; |
1050 | |
1051 | __clear_bit(BH_New, &bh_result->b_state); |
1052 | ext_debug("blocks %u/%u requested for inode %u\n", |
1053 | @@ -2859,33 +3202,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, |
1054 | EXT4_EXT_CACHE_EXTENT); |
1055 | goto out; |
1056 | } |
1057 | - if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) |
1058 | - goto out; |
1059 | - if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { |
1060 | - if (allocated > max_blocks) |
1061 | - allocated = max_blocks; |
1062 | - /* |
1063 | - * We have blocks reserved already. We |
1064 | - * return allocated blocks so that delalloc |
1065 | - * won't do block reservation for us. But |
1066 | - * the buffer head will be unmapped so that |
1067 | - * a read from the block returns 0s. |
1068 | - */ |
1069 | - set_buffer_unwritten(bh_result); |
1070 | - bh_result->b_bdev = inode->i_sb->s_bdev; |
1071 | - bh_result->b_blocknr = newblock; |
1072 | - goto out2; |
1073 | - } |
1074 | - |
1075 | - ret = ext4_ext_convert_to_initialized(handle, inode, |
1076 | - path, iblock, |
1077 | - max_blocks); |
1078 | - if (ret <= 0) { |
1079 | - err = ret; |
1080 | - goto out2; |
1081 | - } else |
1082 | - allocated = ret; |
1083 | - goto outnew; |
1084 | + ret = ext4_ext_handle_uninitialized_extents(handle, |
1085 | + inode, iblock, max_blocks, path, |
1086 | + flags, allocated, bh_result, newblock); |
1087 | + return ret; |
1088 | } |
1089 | } |
1090 | |
1091 | @@ -2956,9 +3276,27 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, |
1092 | /* try to insert new extent into found leaf and return */ |
1093 | ext4_ext_store_pblock(&newex, newblock); |
1094 | newex.ee_len = cpu_to_le16(ar.len); |
1095 | - if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) /* Mark uninitialized */ |
1096 | + /* Mark uninitialized */ |
1097 | + if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){ |
1098 | ext4_ext_mark_uninitialized(&newex); |
1099 | - err = ext4_ext_insert_extent(handle, inode, path, &newex); |
1100 | + /* |
1101 | + * io_end structure was created for every async |
1102 | + * direct IO write to the middle of the file. |
1103 | + * To avoid unnecessary conversion for every aio dio rewrite |
1104 | + * to the middle of the file, here we flag only the IO that |
1105 | + * really needs the conversion. |
1106 | + * For the non-async direct IO case, flag the inode state |
1107 | + * that we need to perform conversion when IO is done. |
1108 | + */ |
1109 | + if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) { |
1110 | + if (io) |
1111 | + io->flag = DIO_AIO_UNWRITTEN; |
1112 | + else |
1113 | + EXT4_I(inode)->i_state |= |
1114 | + EXT4_STATE_DIO_UNWRITTEN;; |
1115 | + } |
1116 | + } |
1117 | + err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); |
1118 | if (err) { |
1119 | /* free data blocks we just allocated */ |
1120 | /* not a good idea to call discard here directly, |
1121 | @@ -2972,13 +3310,18 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, |
1122 | /* previous routine could use block we allocated */ |
1123 | newblock = ext_pblock(&newex); |
1124 | allocated = ext4_ext_get_actual_len(&newex); |
1125 | -outnew: |
1126 | set_buffer_new(bh_result); |
1127 | |
1128 | - /* Cache only when it is _not_ an uninitialized extent */ |
1129 | - if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) |
1130 | + /* |
1131 | + * Cache the extent and update transaction to commit on fdatasync only |
1132 | + * when it is _not_ an uninitialized extent. |
1133 | + */ |
1134 | + if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) { |
1135 | ext4_ext_put_in_cache(inode, iblock, allocated, newblock, |
1136 | EXT4_EXT_CACHE_EXTENT); |
1137 | + ext4_update_inode_fsync_trans(handle, inode, 1); |
1138 | + } else |
1139 | + ext4_update_inode_fsync_trans(handle, inode, 0); |
1140 | out: |
1141 | if (allocated > max_blocks) |
1142 | allocated = max_blocks; |
1143 | @@ -3171,6 +3514,64 @@ retry: |
1144 | } |
1145 | |
1146 | /* |
1147 | + * This function convert a range of blocks to written extents |
1148 | + * The caller of this function will pass the start offset and the size. |
1149 | + * all unwritten extents within this range will be converted to |
1150 | + * written extents. |
1151 | + * |
1152 | + * This function is called from the direct IO end io call back |
1153 | + * function, to convert the fallocated extents after IO is completed. |
1154 | + * Returns 0 on success. |
1155 | + */ |
1156 | +int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, |
1157 | + loff_t len) |
1158 | +{ |
1159 | + handle_t *handle; |
1160 | + ext4_lblk_t block; |
1161 | + unsigned int max_blocks; |
1162 | + int ret = 0; |
1163 | + int ret2 = 0; |
1164 | + struct buffer_head map_bh; |
1165 | + unsigned int credits, blkbits = inode->i_blkbits; |
1166 | + |
1167 | + block = offset >> blkbits; |
1168 | + /* |
1169 | + * We can't just convert len to max_blocks because |
1170 | + * If blocksize = 4096 offset = 3072 and len = 2048 |
1171 | + */ |
1172 | + max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) |
1173 | + - block; |
1174 | + /* |
1175 | + * credits to insert 1 extent into extent tree |
1176 | + */ |
1177 | + credits = ext4_chunk_trans_blocks(inode, max_blocks); |
1178 | + while (ret >= 0 && ret < max_blocks) { |
1179 | + block = block + ret; |
1180 | + max_blocks = max_blocks - ret; |
1181 | + handle = ext4_journal_start(inode, credits); |
1182 | + if (IS_ERR(handle)) { |
1183 | + ret = PTR_ERR(handle); |
1184 | + break; |
1185 | + } |
1186 | + map_bh.b_state = 0; |
1187 | + ret = ext4_get_blocks(handle, inode, block, |
1188 | + max_blocks, &map_bh, |
1189 | + EXT4_GET_BLOCKS_DIO_CONVERT_EXT); |
1190 | + if (ret <= 0) { |
1191 | + WARN_ON(ret <= 0); |
1192 | + printk(KERN_ERR "%s: ext4_ext_get_blocks " |
1193 | + "returned error inode#%lu, block=%u, " |
1194 | + "max_blocks=%u", __func__, |
1195 | + inode->i_ino, block, max_blocks); |
1196 | + } |
1197 | + ext4_mark_inode_dirty(handle, inode); |
1198 | + ret2 = ext4_journal_stop(handle); |
1199 | + if (ret <= 0 || ret2 ) |
1200 | + break; |
1201 | + } |
1202 | + return ret > 0 ? ret2 : ret; |
1203 | +} |
1204 | +/* |
1205 | * Callback function called for each extent to gather FIEMAP information. |
1206 | */ |
1207 | static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, |
1208 | @@ -3308,10 +3709,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, |
1209 | * Walk the extent tree gathering extent information. |
1210 | * ext4_ext_fiemap_cb will push extents back to user. |
1211 | */ |
1212 | - down_read(&EXT4_I(inode)->i_data_sem); |
1213 | error = ext4_ext_walk_space(inode, start_blk, len_blks, |
1214 | ext4_ext_fiemap_cb, fieinfo); |
1215 | - up_read(&EXT4_I(inode)->i_data_sem); |
1216 | } |
1217 | |
1218 | return error; |
1219 | diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c |
1220 | index 83cf641..d6049e4 100644 |
1221 | --- a/fs/ext4/fsync.c |
1222 | +++ b/fs/ext4/fsync.c |
1223 | @@ -44,27 +44,37 @@ |
1224 | * |
1225 | * What we do is just kick off a commit and wait on it. This will snapshot the |
1226 | * inode to disk. |
1227 | + * |
1228 | + * i_mutex lock is held when entering and exiting this function |
1229 | */ |
1230 | |
1231 | int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) |
1232 | { |
1233 | struct inode *inode = dentry->d_inode; |
1234 | + struct ext4_inode_info *ei = EXT4_I(inode); |
1235 | journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; |
1236 | - int ret = 0; |
1237 | + int ret; |
1238 | + tid_t commit_tid; |
1239 | |
1240 | J_ASSERT(ext4_journal_current_handle() == NULL); |
1241 | |
1242 | trace_ext4_sync_file(file, dentry, datasync); |
1243 | |
1244 | + if (inode->i_sb->s_flags & MS_RDONLY) |
1245 | + return 0; |
1246 | + |
1247 | + ret = flush_aio_dio_completed_IO(inode); |
1248 | + if (ret < 0) |
1249 | + return ret; |
1250 | + |
1251 | + if (!journal) |
1252 | + return simple_fsync(file, dentry, datasync); |
1253 | + |
1254 | /* |
1255 | - * data=writeback: |
1256 | + * data=writeback,ordered: |
1257 | * The caller's filemap_fdatawrite()/wait will sync the data. |
1258 | - * sync_inode() will sync the metadata |
1259 | - * |
1260 | - * data=ordered: |
1261 | - * The caller's filemap_fdatawrite() will write the data and |
1262 | - * sync_inode() will write the inode if it is dirty. Then the caller's |
1263 | - * filemap_fdatawait() will wait on the pages. |
1264 | + * Metadata is in the journal, we wait for proper transaction to |
1265 | + * commit here. |
1266 | * |
1267 | * data=journal: |
1268 | * filemap_fdatawrite won't do anything (the buffers are clean). |
1269 | @@ -74,27 +84,13 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) |
1270 | * (they were dirtied by commit). But that's OK - the blocks are |
1271 | * safe in-journal, which is all fsync() needs to ensure. |
1272 | */ |
1273 | - if (ext4_should_journal_data(inode)) { |
1274 | - ret = ext4_force_commit(inode->i_sb); |
1275 | - goto out; |
1276 | - } |
1277 | - |
1278 | - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) |
1279 | - goto out; |
1280 | + if (ext4_should_journal_data(inode)) |
1281 | + return ext4_force_commit(inode->i_sb); |
1282 | |
1283 | - /* |
1284 | - * The VFS has written the file data. If the inode is unaltered |
1285 | - * then we need not start a commit. |
1286 | - */ |
1287 | - if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) { |
1288 | - struct writeback_control wbc = { |
1289 | - .sync_mode = WB_SYNC_ALL, |
1290 | - .nr_to_write = 0, /* sys_fsync did this */ |
1291 | - }; |
1292 | - ret = sync_inode(inode, &wbc); |
1293 | - if (journal && (journal->j_flags & JBD2_BARRIER)) |
1294 | - blkdev_issue_flush(inode->i_sb->s_bdev, NULL); |
1295 | - } |
1296 | -out: |
1297 | + commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; |
1298 | + if (jbd2_log_start_commit(journal, commit_tid)) |
1299 | + jbd2_log_wait_commit(journal, commit_tid); |
1300 | + else if (journal->j_flags & JBD2_BARRIER) |
1301 | + blkdev_issue_flush(inode->i_sb->s_bdev, NULL); |
1302 | return ret; |
1303 | } |
1304 | diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c |
1305 | index f9c642b..38b2154 100644 |
1306 | --- a/fs/ext4/inode.c |
1307 | +++ b/fs/ext4/inode.c |
1308 | @@ -37,6 +37,7 @@ |
1309 | #include <linux/namei.h> |
1310 | #include <linux/uio.h> |
1311 | #include <linux/bio.h> |
1312 | +#include <linux/workqueue.h> |
1313 | |
1314 | #include "ext4_jbd2.h" |
1315 | #include "xattr.h" |
1316 | @@ -192,11 +193,25 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode) |
1317 | * so before we call here everything must be consistently dirtied against |
1318 | * this transaction. |
1319 | */ |
1320 | -static int ext4_journal_test_restart(handle_t *handle, struct inode *inode) |
1321 | +int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode, |
1322 | + int nblocks) |
1323 | { |
1324 | + int ret; |
1325 | + |
1326 | + /* |
1327 | + * Drop i_data_sem to avoid deadlock with ext4_get_blocks At this |
1328 | + * moment, get_block can be called only for blocks inside i_size since |
1329 | + * page cache has been already dropped and writes are blocked by |
1330 | + * i_mutex. So we can safely drop the i_data_sem here. |
1331 | + */ |
1332 | BUG_ON(EXT4_JOURNAL(inode) == NULL); |
1333 | jbd_debug(2, "restarting handle %p\n", handle); |
1334 | - return ext4_journal_restart(handle, blocks_for_truncate(inode)); |
1335 | + up_write(&EXT4_I(inode)->i_data_sem); |
1336 | + ret = ext4_journal_restart(handle, blocks_for_truncate(inode)); |
1337 | + down_write(&EXT4_I(inode)->i_data_sem); |
1338 | + ext4_discard_preallocations(inode); |
1339 | + |
1340 | + return ret; |
1341 | } |
1342 | |
1343 | /* |
1344 | @@ -551,15 +566,21 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) |
1345 | * |
1346 | * Normally this function find the preferred place for block allocation, |
1347 | * returns it. |
1348 | + * Because this is only used for non-extent files, we limit the block nr |
1349 | + * to 32 bits. |
1350 | */ |
1351 | static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, |
1352 | Indirect *partial) |
1353 | { |
1354 | + ext4_fsblk_t goal; |
1355 | + |
1356 | /* |
1357 | * XXX need to get goal block from mballoc's data structures |
1358 | */ |
1359 | |
1360 | - return ext4_find_near(inode, partial); |
1361 | + goal = ext4_find_near(inode, partial); |
1362 | + goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; |
1363 | + return goal; |
1364 | } |
1365 | |
1366 | /** |
1367 | @@ -640,6 +661,8 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, |
1368 | if (*err) |
1369 | goto failed_out; |
1370 | |
1371 | + BUG_ON(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS); |
1372 | + |
1373 | target -= count; |
1374 | /* allocate blocks for indirect blocks */ |
1375 | while (index < indirect_blks && count) { |
1376 | @@ -674,6 +697,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, |
1377 | ar.flags = EXT4_MB_HINT_DATA; |
1378 | |
1379 | current_block = ext4_mb_new_blocks(handle, &ar, err); |
1380 | + BUG_ON(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS); |
1381 | |
1382 | if (*err && (target == blks)) { |
1383 | /* |
1384 | @@ -998,10 +1022,12 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, |
1385 | if (!err) |
1386 | err = ext4_splice_branch(handle, inode, iblock, |
1387 | partial, indirect_blks, count); |
1388 | - else |
1389 | + if (err) |
1390 | goto cleanup; |
1391 | |
1392 | set_buffer_new(bh_result); |
1393 | + |
1394 | + ext4_update_inode_fsync_trans(handle, inode, 1); |
1395 | got_it: |
1396 | map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); |
1397 | if (count > blocks_to_boundary) |
1398 | @@ -1029,7 +1055,7 @@ qsize_t ext4_get_reserved_space(struct inode *inode) |
1399 | EXT4_I(inode)->i_reserved_meta_blocks; |
1400 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); |
1401 | |
1402 | - return total; |
1403 | + return (total << inode->i_blkbits); |
1404 | } |
1405 | /* |
1406 | * Calculate the number of metadata blocks need to reserve |
1407 | @@ -1109,22 +1135,79 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used) |
1408 | ext4_discard_preallocations(inode); |
1409 | } |
1410 | |
1411 | -static int check_block_validity(struct inode *inode, sector_t logical, |
1412 | - sector_t phys, int len) |
1413 | +static int check_block_validity(struct inode *inode, const char *msg, |
1414 | + sector_t logical, sector_t phys, int len) |
1415 | { |
1416 | if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) { |
1417 | - ext4_error(inode->i_sb, "check_block_validity", |
1418 | + ext4_error(inode->i_sb, msg, |
1419 | "inode #%lu logical block %llu mapped to %llu " |
1420 | "(size %d)", inode->i_ino, |
1421 | (unsigned long long) logical, |
1422 | (unsigned long long) phys, len); |
1423 | - WARN_ON(1); |
1424 | return -EIO; |
1425 | } |
1426 | return 0; |
1427 | } |
1428 | |
1429 | /* |
1430 | + * Return the number of contiguous dirty pages in a given inode |
1431 | + * starting at page frame idx. |
1432 | + */ |
1433 | +static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, |
1434 | + unsigned int max_pages) |
1435 | +{ |
1436 | + struct address_space *mapping = inode->i_mapping; |
1437 | + pgoff_t index; |
1438 | + struct pagevec pvec; |
1439 | + pgoff_t num = 0; |
1440 | + int i, nr_pages, done = 0; |
1441 | + |
1442 | + if (max_pages == 0) |
1443 | + return 0; |
1444 | + pagevec_init(&pvec, 0); |
1445 | + while (!done) { |
1446 | + index = idx; |
1447 | + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, |
1448 | + PAGECACHE_TAG_DIRTY, |
1449 | + (pgoff_t)PAGEVEC_SIZE); |
1450 | + if (nr_pages == 0) |
1451 | + break; |
1452 | + for (i = 0; i < nr_pages; i++) { |
1453 | + struct page *page = pvec.pages[i]; |
1454 | + struct buffer_head *bh, *head; |
1455 | + |
1456 | + lock_page(page); |
1457 | + if (unlikely(page->mapping != mapping) || |
1458 | + !PageDirty(page) || |
1459 | + PageWriteback(page) || |
1460 | + page->index != idx) { |
1461 | + done = 1; |
1462 | + unlock_page(page); |
1463 | + break; |
1464 | + } |
1465 | + if (page_has_buffers(page)) { |
1466 | + bh = head = page_buffers(page); |
1467 | + do { |
1468 | + if (!buffer_delay(bh) && |
1469 | + !buffer_unwritten(bh)) |
1470 | + done = 1; |
1471 | + bh = bh->b_this_page; |
1472 | + } while (!done && (bh != head)); |
1473 | + } |
1474 | + unlock_page(page); |
1475 | + if (done) |
1476 | + break; |
1477 | + idx++; |
1478 | + num++; |
1479 | + if (num >= max_pages) |
1480 | + break; |
1481 | + } |
1482 | + pagevec_release(&pvec); |
1483 | + } |
1484 | + return num; |
1485 | +} |
1486 | + |
1487 | +/* |
1488 | * The ext4_get_blocks() function tries to look up the requested blocks, |
1489 | * and returns if the blocks are already mapped. |
1490 | * |
1491 | @@ -1155,6 +1238,9 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, |
1492 | clear_buffer_mapped(bh); |
1493 | clear_buffer_unwritten(bh); |
1494 | |
1495 | + ext_debug("ext4_get_blocks(): inode %lu, flag %d, max_blocks %u," |
1496 | + "logical block %lu\n", inode->i_ino, flags, max_blocks, |
1497 | + (unsigned long)block); |
1498 | /* |
1499 | * Try to see if we can get the block without requesting a new |
1500 | * file system block. |
1501 | @@ -1170,8 +1256,8 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, |
1502 | up_read((&EXT4_I(inode)->i_data_sem)); |
1503 | |
1504 | if (retval > 0 && buffer_mapped(bh)) { |
1505 | - int ret = check_block_validity(inode, block, |
1506 | - bh->b_blocknr, retval); |
1507 | + int ret = check_block_validity(inode, "file system corruption", |
1508 | + block, bh->b_blocknr, retval); |
1509 | if (ret != 0) |
1510 | return ret; |
1511 | } |
1512 | @@ -1235,8 +1321,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, |
1513 | * i_data's format changing. Force the migrate |
1514 | * to fail by clearing migrate flags |
1515 | */ |
1516 | - EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags & |
1517 | - ~EXT4_EXT_MIGRATE; |
1518 | + EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE; |
1519 | } |
1520 | } |
1521 | |
1522 | @@ -1252,8 +1337,9 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, |
1523 | |
1524 | up_write((&EXT4_I(inode)->i_data_sem)); |
1525 | if (retval > 0 && buffer_mapped(bh)) { |
1526 | - int ret = check_block_validity(inode, block, |
1527 | - bh->b_blocknr, retval); |
1528 | + int ret = check_block_validity(inode, "file system " |
1529 | + "corruption after allocation", |
1530 | + block, bh->b_blocknr, retval); |
1531 | if (ret != 0) |
1532 | return ret; |
1533 | } |
1534 | @@ -1451,6 +1537,16 @@ static int do_journal_get_write_access(handle_t *handle, |
1535 | return ext4_journal_get_write_access(handle, bh); |
1536 | } |
1537 | |
1538 | +/* |
1539 | + * Truncate blocks that were not used by write. We have to truncate the |
1540 | + * pagecache as well so that corresponding buffers get properly unmapped. |
1541 | + */ |
1542 | +static void ext4_truncate_failed_write(struct inode *inode) |
1543 | +{ |
1544 | + truncate_inode_pages(inode->i_mapping, inode->i_size); |
1545 | + ext4_truncate(inode); |
1546 | +} |
1547 | + |
1548 | static int ext4_write_begin(struct file *file, struct address_space *mapping, |
1549 | loff_t pos, unsigned len, unsigned flags, |
1550 | struct page **pagep, void **fsdata) |
1551 | @@ -1516,7 +1612,7 @@ retry: |
1552 | |
1553 | ext4_journal_stop(handle); |
1554 | if (pos + len > inode->i_size) { |
1555 | - ext4_truncate(inode); |
1556 | + ext4_truncate_failed_write(inode); |
1557 | /* |
1558 | * If truncate failed early the inode might |
1559 | * still be on the orphan list; we need to |
1560 | @@ -1626,7 +1722,7 @@ static int ext4_ordered_write_end(struct file *file, |
1561 | ret = ret2; |
1562 | |
1563 | if (pos + len > inode->i_size) { |
1564 | - ext4_truncate(inode); |
1565 | + ext4_truncate_failed_write(inode); |
1566 | /* |
1567 | * If truncate failed early the inode might still be |
1568 | * on the orphan list; we need to make sure the inode |
1569 | @@ -1668,7 +1764,7 @@ static int ext4_writeback_write_end(struct file *file, |
1570 | ret = ret2; |
1571 | |
1572 | if (pos + len > inode->i_size) { |
1573 | - ext4_truncate(inode); |
1574 | + ext4_truncate_failed_write(inode); |
1575 | /* |
1576 | * If truncate failed early the inode might still be |
1577 | * on the orphan list; we need to make sure the inode |
1578 | @@ -1731,7 +1827,7 @@ static int ext4_journalled_write_end(struct file *file, |
1579 | if (!ret) |
1580 | ret = ret2; |
1581 | if (pos + len > inode->i_size) { |
1582 | - ext4_truncate(inode); |
1583 | + ext4_truncate_failed_write(inode); |
1584 | /* |
1585 | * If truncate failed early the inode might still be |
1586 | * on the orphan list; we need to make sure the inode |
1587 | @@ -1776,11 +1872,11 @@ repeat: |
1588 | |
1589 | if (ext4_claim_free_blocks(sbi, total)) { |
1590 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); |
1591 | + vfs_dq_release_reservation_block(inode, total); |
1592 | if (ext4_should_retry_alloc(inode->i_sb, &retries)) { |
1593 | yield(); |
1594 | goto repeat; |
1595 | } |
1596 | - vfs_dq_release_reservation_block(inode, total); |
1597 | return -ENOSPC; |
1598 | } |
1599 | EXT4_I(inode)->i_reserved_data_blocks += nrblocks; |
1600 | @@ -1860,22 +1956,6 @@ static void ext4_da_page_release_reservation(struct page *page, |
1601 | } |
1602 | |
1603 | /* |
1604 | - * Delayed allocation stuff |
1605 | - */ |
1606 | - |
1607 | -struct mpage_da_data { |
1608 | - struct inode *inode; |
1609 | - sector_t b_blocknr; /* start block number of extent */ |
1610 | - size_t b_size; /* size of extent */ |
1611 | - unsigned long b_state; /* state of the extent */ |
1612 | - unsigned long first_page, next_page; /* extent of pages */ |
1613 | - struct writeback_control *wbc; |
1614 | - int io_done; |
1615 | - int pages_written; |
1616 | - int retval; |
1617 | -}; |
1618 | - |
1619 | -/* |
1620 | * mpage_da_submit_io - walks through extent of pages and try to write |
1621 | * them with writepage() call back |
1622 | * |
1623 | @@ -2717,7 +2797,7 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode) |
1624 | * number of contiguous block. So we will limit |
1625 | * number of contiguous block to a sane value |
1626 | */ |
1627 | - if (!(inode->i_flags & EXT4_EXTENTS_FL) && |
1628 | + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) && |
1629 | (max_blocks > EXT4_MAX_TRANS_DATA)) |
1630 | max_blocks = EXT4_MAX_TRANS_DATA; |
1631 | |
1632 | @@ -2735,8 +2815,11 @@ static int ext4_da_writepages(struct address_space *mapping, |
1633 | int no_nrwrite_index_update; |
1634 | int pages_written = 0; |
1635 | long pages_skipped; |
1636 | + unsigned int max_pages; |
1637 | int range_cyclic, cycled = 1, io_done = 0; |
1638 | - int needed_blocks, ret = 0, nr_to_writebump = 0; |
1639 | + int needed_blocks, ret = 0; |
1640 | + long desired_nr_to_write, nr_to_writebump = 0; |
1641 | + loff_t range_start = wbc->range_start; |
1642 | struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); |
1643 | |
1644 | trace_ext4_da_writepages(inode, wbc); |
1645 | @@ -2762,16 +2845,6 @@ static int ext4_da_writepages(struct address_space *mapping, |
1646 | if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) |
1647 | return -EROFS; |
1648 | |
1649 | - /* |
1650 | - * Make sure nr_to_write is >= sbi->s_mb_stream_request |
1651 | - * This make sure small files blocks are allocated in |
1652 | - * single attempt. This ensure that small files |
1653 | - * get less fragmented. |
1654 | - */ |
1655 | - if (wbc->nr_to_write < sbi->s_mb_stream_request) { |
1656 | - nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write; |
1657 | - wbc->nr_to_write = sbi->s_mb_stream_request; |
1658 | - } |
1659 | if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) |
1660 | range_whole = 1; |
1661 | |
1662 | @@ -2786,6 +2859,36 @@ static int ext4_da_writepages(struct address_space *mapping, |
1663 | } else |
1664 | index = wbc->range_start >> PAGE_CACHE_SHIFT; |
1665 | |
1666 | + /* |
1667 | + * This works around two forms of stupidity. The first is in |
1668 | + * the writeback code, which caps the maximum number of pages |
1669 | + * written to be 1024 pages. This is wrong on multiple |
1670 | + * levels; different architectues have a different page size, |
1671 | + * which changes the maximum amount of data which gets |
1672 | + * written. Secondly, 4 megabytes is way too small. XFS |
1673 | + * forces this value to be 16 megabytes by multiplying |
1674 | + * nr_to_write parameter by four, and then relies on its |
1675 | + * allocator to allocate larger extents to make them |
1676 | + * contiguous. Unfortunately this brings us to the second |
1677 | + * stupidity, which is that ext4's mballoc code only allocates |
1678 | + * at most 2048 blocks. So we force contiguous writes up to |
1679 | + * the number of dirty blocks in the inode, or |
1680 | + * sbi->max_writeback_mb_bump whichever is smaller. |
1681 | + */ |
1682 | + max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT); |
1683 | + if (!range_cyclic && range_whole) |
1684 | + desired_nr_to_write = wbc->nr_to_write * 8; |
1685 | + else |
1686 | + desired_nr_to_write = ext4_num_dirty_pages(inode, index, |
1687 | + max_pages); |
1688 | + if (desired_nr_to_write > max_pages) |
1689 | + desired_nr_to_write = max_pages; |
1690 | + |
1691 | + if (wbc->nr_to_write < desired_nr_to_write) { |
1692 | + nr_to_writebump = desired_nr_to_write - wbc->nr_to_write; |
1693 | + wbc->nr_to_write = desired_nr_to_write; |
1694 | + } |
1695 | + |
1696 | mpd.wbc = wbc; |
1697 | mpd.inode = mapping->host; |
1698 | |
1699 | @@ -2904,7 +3007,9 @@ retry: |
1700 | out_writepages: |
1701 | if (!no_nrwrite_index_update) |
1702 | wbc->no_nrwrite_index_update = 0; |
1703 | - wbc->nr_to_write -= nr_to_writebump; |
1704 | + if (wbc->nr_to_write > nr_to_writebump) |
1705 | + wbc->nr_to_write -= nr_to_writebump; |
1706 | + wbc->range_start = range_start; |
1707 | trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); |
1708 | return ret; |
1709 | } |
1710 | @@ -2994,7 +3099,7 @@ retry: |
1711 | * i_size_read because we hold i_mutex. |
1712 | */ |
1713 | if (pos + len > inode->i_size) |
1714 | - ext4_truncate(inode); |
1715 | + ext4_truncate_failed_write(inode); |
1716 | } |
1717 | |
1718 | if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) |
1719 | @@ -3259,6 +3364,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait) |
1720 | } |
1721 | |
1722 | /* |
1723 | + * O_DIRECT for ext3 (or indirect map) based files |
1724 | + * |
1725 | * If the O_DIRECT write will extend the file then add this inode to the |
1726 | * orphan list. So recovery will truncate it back to the original size |
1727 | * if the machine crashes during the write. |
1728 | @@ -3267,7 +3374,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait) |
1729 | * crashes then stale disk data _may_ be exposed inside the file. But current |
1730 | * VFS code falls back into buffered path in that case so we are safe. |
1731 | */ |
1732 | -static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, |
1733 | +static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, |
1734 | const struct iovec *iov, loff_t offset, |
1735 | unsigned long nr_segs) |
1736 | { |
1737 | @@ -3278,6 +3385,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, |
1738 | ssize_t ret; |
1739 | int orphan = 0; |
1740 | size_t count = iov_length(iov, nr_segs); |
1741 | + int retries = 0; |
1742 | |
1743 | if (rw == WRITE) { |
1744 | loff_t final_size = offset + count; |
1745 | @@ -3300,9 +3408,12 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, |
1746 | } |
1747 | } |
1748 | |
1749 | +retry: |
1750 | ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, |
1751 | offset, nr_segs, |
1752 | ext4_get_block, NULL); |
1753 | + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) |
1754 | + goto retry; |
1755 | |
1756 | if (orphan) { |
1757 | int err; |
1758 | @@ -3341,6 +3452,364 @@ out: |
1759 | return ret; |
1760 | } |
1761 | |
1762 | +static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock, |
1763 | + struct buffer_head *bh_result, int create) |
1764 | +{ |
1765 | + handle_t *handle = NULL; |
1766 | + int ret = 0; |
1767 | + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; |
1768 | + int dio_credits; |
1769 | + |
1770 | + ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n", |
1771 | + inode->i_ino, create); |
1772 | + /* |
1773 | + * DIO VFS code passes create = 0 flag for write to |
1774 | + * the middle of file. It does this to avoid block |
1775 | + * allocation for holes, to prevent expose stale data |
1776 | + * out when there is parallel buffered read (which does |
1777 | + * not hold the i_mutex lock) while direct IO write has |
1778 | + * not completed. DIO request on holes finally falls back |
1779 | + * to buffered IO for this reason. |
1780 | + * |
1781 | + * For ext4 extent based file, since we support fallocate, |
1782 | + * new allocated extent as uninitialized, for holes, we |
1783 | + * could fallocate blocks for holes, thus parallel |
1784 | + * buffered IO read will zero out the page when read on |
1785 | + * a hole while parallel DIO write to the hole has not completed. |
1786 | + * |
1787 | + * when we come here, we know it's a direct IO write to |
1788 | + * to the middle of file (<i_size) |
1789 | + * so it's safe to override the create flag from VFS. |
1790 | + */ |
1791 | + create = EXT4_GET_BLOCKS_DIO_CREATE_EXT; |
1792 | + |
1793 | + if (max_blocks > DIO_MAX_BLOCKS) |
1794 | + max_blocks = DIO_MAX_BLOCKS; |
1795 | + dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); |
1796 | + handle = ext4_journal_start(inode, dio_credits); |
1797 | + if (IS_ERR(handle)) { |
1798 | + ret = PTR_ERR(handle); |
1799 | + goto out; |
1800 | + } |
1801 | + ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result, |
1802 | + create); |
1803 | + if (ret > 0) { |
1804 | + bh_result->b_size = (ret << inode->i_blkbits); |
1805 | + ret = 0; |
1806 | + } |
1807 | + ext4_journal_stop(handle); |
1808 | +out: |
1809 | + return ret; |
1810 | +} |
1811 | + |
1812 | +static void ext4_free_io_end(ext4_io_end_t *io) |
1813 | +{ |
1814 | + BUG_ON(!io); |
1815 | + iput(io->inode); |
1816 | + kfree(io); |
1817 | +} |
1818 | +static void dump_aio_dio_list(struct inode * inode) |
1819 | +{ |
1820 | +#ifdef EXT4_DEBUG |
1821 | + struct list_head *cur, *before, *after; |
1822 | + ext4_io_end_t *io, *io0, *io1; |
1823 | + |
1824 | + if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){ |
1825 | + ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino); |
1826 | + return; |
1827 | + } |
1828 | + |
1829 | + ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino); |
1830 | + list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){ |
1831 | + cur = &io->list; |
1832 | + before = cur->prev; |
1833 | + io0 = container_of(before, ext4_io_end_t, list); |
1834 | + after = cur->next; |
1835 | + io1 = container_of(after, ext4_io_end_t, list); |
1836 | + |
1837 | + ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", |
1838 | + io, inode->i_ino, io0, io1); |
1839 | + } |
1840 | +#endif |
1841 | +} |
1842 | + |
1843 | +/* |
1844 | + * check a range of space and convert unwritten extents to written. |
1845 | + */ |
1846 | +static int ext4_end_aio_dio_nolock(ext4_io_end_t *io) |
1847 | +{ |
1848 | + struct inode *inode = io->inode; |
1849 | + loff_t offset = io->offset; |
1850 | + size_t size = io->size; |
1851 | + int ret = 0; |
1852 | + |
1853 | + ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p," |
1854 | + "list->prev 0x%p\n", |
1855 | + io, inode->i_ino, io->list.next, io->list.prev); |
1856 | + |
1857 | + if (list_empty(&io->list)) |
1858 | + return ret; |
1859 | + |
1860 | + if (io->flag != DIO_AIO_UNWRITTEN) |
1861 | + return ret; |
1862 | + |
1863 | + if (offset + size <= i_size_read(inode)) |
1864 | + ret = ext4_convert_unwritten_extents(inode, offset, size); |
1865 | + |
1866 | + if (ret < 0) { |
1867 | + printk(KERN_EMERG "%s: failed to convert unwritten" |
1868 | + "extents to written extents, error is %d" |
1869 | + " io is still on inode %lu aio dio list\n", |
1870 | + __func__, ret, inode->i_ino); |
1871 | + return ret; |
1872 | + } |
1873 | + |
1874 | + /* clear the DIO AIO unwritten flag */ |
1875 | + io->flag = 0; |
1876 | + return ret; |
1877 | +} |
1878 | +/* |
1879 | + * work on completed aio dio IO, to convert unwritten extents to extents |
1880 | + */ |
1881 | +static void ext4_end_aio_dio_work(struct work_struct *work) |
1882 | +{ |
1883 | + ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); |
1884 | + struct inode *inode = io->inode; |
1885 | + int ret = 0; |
1886 | + |
1887 | + mutex_lock(&inode->i_mutex); |
1888 | + ret = ext4_end_aio_dio_nolock(io); |
1889 | + if (ret >= 0) { |
1890 | + if (!list_empty(&io->list)) |
1891 | + list_del_init(&io->list); |
1892 | + ext4_free_io_end(io); |
1893 | + } |
1894 | + mutex_unlock(&inode->i_mutex); |
1895 | +} |
1896 | +/* |
1897 | + * This function is called from ext4_sync_file(). |
1898 | + * |
1899 | + * When AIO DIO IO is completed, the work to convert unwritten |
1900 | + * extents to written is queued on workqueue but may not get immediately |
1901 | + * scheduled. When fsync is called, we need to ensure the |
1902 | + * conversion is complete before fsync returns. |
1903 | + * The inode keeps track of a list of completed AIO from DIO path |
1904 | + * that might needs to do the conversion. This function walks through |
1905 | + * the list and convert the related unwritten extents to written. |
1906 | + */ |
1907 | +int flush_aio_dio_completed_IO(struct inode *inode) |
1908 | +{ |
1909 | + ext4_io_end_t *io; |
1910 | + int ret = 0; |
1911 | + int ret2 = 0; |
1912 | + |
1913 | + if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)) |
1914 | + return ret; |
1915 | + |
1916 | + dump_aio_dio_list(inode); |
1917 | + while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){ |
1918 | + io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next, |
1919 | + ext4_io_end_t, list); |
1920 | + /* |
1921 | + * Calling ext4_end_aio_dio_nolock() to convert completed |
1922 | + * IO to written. |
1923 | + * |
1924 | + * When ext4_sync_file() is called, run_queue() may already |
1925 | + * about to flush the work corresponding to this io structure. |
1926 | + * It will be upset if it founds the io structure related |
1927 | + * to the work-to-be schedule is freed. |
1928 | + * |
1929 | + * Thus we need to keep the io structure still valid here after |
1930 | + * convertion finished. The io structure has a flag to |
1931 | + * avoid double converting from both fsync and background work |
1932 | + * queue work. |
1933 | + */ |
1934 | + ret = ext4_end_aio_dio_nolock(io); |
1935 | + if (ret < 0) |
1936 | + ret2 = ret; |
1937 | + else |
1938 | + list_del_init(&io->list); |
1939 | + } |
1940 | + return (ret2 < 0) ? ret2 : 0; |
1941 | +} |
1942 | + |
1943 | +static ext4_io_end_t *ext4_init_io_end (struct inode *inode) |
1944 | +{ |
1945 | + ext4_io_end_t *io = NULL; |
1946 | + |
1947 | + io = kmalloc(sizeof(*io), GFP_NOFS); |
1948 | + |
1949 | + if (io) { |
1950 | + igrab(inode); |
1951 | + io->inode = inode; |
1952 | + io->flag = 0; |
1953 | + io->offset = 0; |
1954 | + io->size = 0; |
1955 | + io->error = 0; |
1956 | + INIT_WORK(&io->work, ext4_end_aio_dio_work); |
1957 | + INIT_LIST_HEAD(&io->list); |
1958 | + } |
1959 | + |
1960 | + return io; |
1961 | +} |
1962 | + |
1963 | +static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, |
1964 | + ssize_t size, void *private) |
1965 | +{ |
1966 | + ext4_io_end_t *io_end = iocb->private; |
1967 | + struct workqueue_struct *wq; |
1968 | + |
1969 | + /* if not async direct IO or dio with 0 bytes write, just return */ |
1970 | + if (!io_end || !size) |
1971 | + return; |
1972 | + |
1973 | + ext_debug("ext4_end_io_dio(): io_end 0x%p" |
1974 | + "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", |
1975 | + iocb->private, io_end->inode->i_ino, iocb, offset, |
1976 | + size); |
1977 | + |
1978 | + /* if not aio dio with unwritten extents, just free io and return */ |
1979 | + if (io_end->flag != DIO_AIO_UNWRITTEN){ |
1980 | + ext4_free_io_end(io_end); |
1981 | + iocb->private = NULL; |
1982 | + return; |
1983 | + } |
1984 | + |
1985 | + io_end->offset = offset; |
1986 | + io_end->size = size; |
1987 | + wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; |
1988 | + |
1989 | + /* queue the work to convert unwritten extents to written */ |
1990 | + queue_work(wq, &io_end->work); |
1991 | + |
1992 | + /* Add the io_end to per-inode completed aio dio list*/ |
1993 | + list_add_tail(&io_end->list, |
1994 | + &EXT4_I(io_end->inode)->i_aio_dio_complete_list); |
1995 | + iocb->private = NULL; |
1996 | +} |
1997 | +/* |
1998 | + * For ext4 extent files, ext4 will do direct-io write to holes, |
1999 | + * preallocated extents, and those write extend the file, no need to |
2000 | + * fall back to buffered IO. |
2001 | + * |
2002 | + * For holes, we fallocate those blocks, mark them as unintialized |
2003 | + * If those blocks were preallocated, we mark sure they are splited, but |
2004 | + * still keep the range to write as unintialized. |
2005 | + * |
2006 | + * The unwrritten extents will be converted to written when DIO is completed. |
2007 | + * For async direct IO, since the IO may still pending when return, we |
2008 | + * set up an end_io call back function, which will do the convertion |
2009 | + * when async direct IO completed. |
2010 | + * |
2011 | + * If the O_DIRECT write will extend the file then add this inode to the |
2012 | + * orphan list. So recovery will truncate it back to the original size |
2013 | + * if the machine crashes during the write. |
2014 | + * |
2015 | + */ |
2016 | +static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, |
2017 | + const struct iovec *iov, loff_t offset, |
2018 | + unsigned long nr_segs) |
2019 | +{ |
2020 | + struct file *file = iocb->ki_filp; |
2021 | + struct inode *inode = file->f_mapping->host; |
2022 | + ssize_t ret; |
2023 | + size_t count = iov_length(iov, nr_segs); |
2024 | + |
2025 | + loff_t final_size = offset + count; |
2026 | + if (rw == WRITE && final_size <= inode->i_size) { |
2027 | + /* |
2028 | + * We could direct write to holes and fallocate. |
2029 | + * |
2030 | + * Allocated blocks to fill the hole are marked as uninitialized |
2031 | + * to prevent paralel buffered read to expose the stale data |
2032 | + * before DIO complete the data IO. |
2033 | + * |
2034 | + * As to previously fallocated extents, ext4 get_block |
2035 | + * will just simply mark the buffer mapped but still |
2036 | + * keep the extents uninitialized. |
2037 | + * |
2038 | + * for non AIO case, we will convert those unwritten extents |
2039 | + * to written after return back from blockdev_direct_IO. |
2040 | + * |
2041 | + * for async DIO, the conversion needs to be defered when |
2042 | + * the IO is completed. The ext4 end_io callback function |
2043 | + * will be called to take care of the conversion work. |
2044 | + * Here for async case, we allocate an io_end structure to |
2045 | + * hook to the iocb. |
2046 | + */ |
2047 | + iocb->private = NULL; |
2048 | + EXT4_I(inode)->cur_aio_dio = NULL; |
2049 | + if (!is_sync_kiocb(iocb)) { |
2050 | + iocb->private = ext4_init_io_end(inode); |
2051 | + if (!iocb->private) |
2052 | + return -ENOMEM; |
2053 | + /* |
2054 | + * we save the io structure for current async |
2055 | + * direct IO, so that later ext4_get_blocks() |
2056 | + * could flag the io structure whether there |
2057 | + * is a unwritten extents needs to be converted |
2058 | + * when IO is completed. |
2059 | + */ |
2060 | + EXT4_I(inode)->cur_aio_dio = iocb->private; |
2061 | + } |
2062 | + |
2063 | + ret = blockdev_direct_IO(rw, iocb, inode, |
2064 | + inode->i_sb->s_bdev, iov, |
2065 | + offset, nr_segs, |
2066 | + ext4_get_block_dio_write, |
2067 | + ext4_end_io_dio); |
2068 | + if (iocb->private) |
2069 | + EXT4_I(inode)->cur_aio_dio = NULL; |
2070 | + /* |
2071 | + * The io_end structure takes a reference to the inode, |
2072 | + * that structure needs to be destroyed and the |
2073 | + * reference to the inode need to be dropped, when IO is |
2074 | + * complete, even with 0 byte write, or failed. |
2075 | + * |
2076 | + * In the successful AIO DIO case, the io_end structure will be |
2077 | + * desctroyed and the reference to the inode will be dropped |
2078 | + * after the end_io call back function is called. |
2079 | + * |
2080 | + * In the case there is 0 byte write, or error case, since |
2081 | + * VFS direct IO won't invoke the end_io call back function, |
2082 | + * we need to free the end_io structure here. |
2083 | + */ |
2084 | + if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { |
2085 | + ext4_free_io_end(iocb->private); |
2086 | + iocb->private = NULL; |
2087 | + } else if (ret > 0 && (EXT4_I(inode)->i_state & |
2088 | + EXT4_STATE_DIO_UNWRITTEN)) { |
2089 | + int err; |
2090 | + /* |
2091 | + * for non AIO case, since the IO is already |
2092 | + * completed, we could do the convertion right here |
2093 | + */ |
2094 | + err = ext4_convert_unwritten_extents(inode, |
2095 | + offset, ret); |
2096 | + if (err < 0) |
2097 | + ret = err; |
2098 | + EXT4_I(inode)->i_state &= ~EXT4_STATE_DIO_UNWRITTEN; |
2099 | + } |
2100 | + return ret; |
2101 | + } |
2102 | + |
2103 | + /* for write the the end of file case, we fall back to old way */ |
2104 | + return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); |
2105 | +} |
2106 | + |
2107 | +static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, |
2108 | + const struct iovec *iov, loff_t offset, |
2109 | + unsigned long nr_segs) |
2110 | +{ |
2111 | + struct file *file = iocb->ki_filp; |
2112 | + struct inode *inode = file->f_mapping->host; |
2113 | + |
2114 | + if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) |
2115 | + return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); |
2116 | + |
2117 | + return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); |
2118 | +} |
2119 | + |
2120 | /* |
2121 | * Pages can be marked dirty completely asynchronously from ext4's journalling |
2122 | * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do |
2123 | @@ -3653,13 +4122,16 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode, |
2124 | __le32 *last) |
2125 | { |
2126 | __le32 *p; |
2127 | + int is_metadata = S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode); |
2128 | + |
2129 | if (try_to_extend_transaction(handle, inode)) { |
2130 | if (bh) { |
2131 | BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); |
2132 | ext4_handle_dirty_metadata(handle, inode, bh); |
2133 | } |
2134 | ext4_mark_inode_dirty(handle, inode); |
2135 | - ext4_journal_test_restart(handle, inode); |
2136 | + ext4_truncate_restart_trans(handle, inode, |
2137 | + blocks_for_truncate(inode)); |
2138 | if (bh) { |
2139 | BUFFER_TRACE(bh, "retaking write access"); |
2140 | ext4_journal_get_write_access(handle, bh); |
2141 | @@ -3682,11 +4154,11 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode, |
2142 | |
2143 | *p = 0; |
2144 | tbh = sb_find_get_block(inode->i_sb, nr); |
2145 | - ext4_forget(handle, 0, inode, tbh, nr); |
2146 | + ext4_forget(handle, is_metadata, inode, tbh, nr); |
2147 | } |
2148 | } |
2149 | |
2150 | - ext4_free_blocks(handle, inode, block_to_free, count, 0); |
2151 | + ext4_free_blocks(handle, inode, block_to_free, count, is_metadata); |
2152 | } |
2153 | |
2154 | /** |
2155 | @@ -3870,7 +4342,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, |
2156 | return; |
2157 | if (try_to_extend_transaction(handle, inode)) { |
2158 | ext4_mark_inode_dirty(handle, inode); |
2159 | - ext4_journal_test_restart(handle, inode); |
2160 | + ext4_truncate_restart_trans(handle, inode, |
2161 | + blocks_for_truncate(inode)); |
2162 | } |
2163 | |
2164 | ext4_free_blocks(handle, inode, nr, 1, 1); |
2165 | @@ -3958,8 +4431,7 @@ void ext4_truncate(struct inode *inode) |
2166 | if (!ext4_can_truncate(inode)) |
2167 | return; |
2168 | |
2169 | - if (ei->i_disksize && inode->i_size == 0 && |
2170 | - !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) |
2171 | + if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) |
2172 | ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE; |
2173 | |
2174 | if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { |
2175 | @@ -4313,8 +4785,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) |
2176 | struct ext4_iloc iloc; |
2177 | struct ext4_inode *raw_inode; |
2178 | struct ext4_inode_info *ei; |
2179 | - struct buffer_head *bh; |
2180 | struct inode *inode; |
2181 | + journal_t *journal = EXT4_SB(sb)->s_journal; |
2182 | long ret; |
2183 | int block; |
2184 | |
2185 | @@ -4325,11 +4797,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) |
2186 | return inode; |
2187 | |
2188 | ei = EXT4_I(inode); |
2189 | + iloc.bh = 0; |
2190 | |
2191 | ret = __ext4_get_inode_loc(inode, &iloc, 0); |
2192 | if (ret < 0) |
2193 | goto bad_inode; |
2194 | - bh = iloc.bh; |
2195 | raw_inode = ext4_raw_inode(&iloc); |
2196 | inode->i_mode = le16_to_cpu(raw_inode->i_mode); |
2197 | inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); |
2198 | @@ -4352,7 +4824,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) |
2199 | if (inode->i_mode == 0 || |
2200 | !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { |
2201 | /* this inode is deleted */ |
2202 | - brelse(bh); |
2203 | ret = -ESTALE; |
2204 | goto bad_inode; |
2205 | } |
2206 | @@ -4380,11 +4851,35 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) |
2207 | ei->i_data[block] = raw_inode->i_block[block]; |
2208 | INIT_LIST_HEAD(&ei->i_orphan); |
2209 | |
2210 | + /* |
2211 | + * Set transaction id's of transactions that have to be committed |
2212 | + * to finish f[data]sync. We set them to currently running transaction |
2213 | + * as we cannot be sure that the inode or some of its metadata isn't |
2214 | + * part of the transaction - the inode could have been reclaimed and |
2215 | + * now it is reread from disk. |
2216 | + */ |
2217 | + if (journal) { |
2218 | + transaction_t *transaction; |
2219 | + tid_t tid; |
2220 | + |
2221 | + spin_lock(&journal->j_state_lock); |
2222 | + if (journal->j_running_transaction) |
2223 | + transaction = journal->j_running_transaction; |
2224 | + else |
2225 | + transaction = journal->j_committing_transaction; |
2226 | + if (transaction) |
2227 | + tid = transaction->t_tid; |
2228 | + else |
2229 | + tid = journal->j_commit_sequence; |
2230 | + spin_unlock(&journal->j_state_lock); |
2231 | + ei->i_sync_tid = tid; |
2232 | + ei->i_datasync_tid = tid; |
2233 | + } |
2234 | + |
2235 | if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { |
2236 | ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); |
2237 | if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > |
2238 | EXT4_INODE_SIZE(inode->i_sb)) { |
2239 | - brelse(bh); |
2240 | ret = -EIO; |
2241 | goto bad_inode; |
2242 | } |
2243 | @@ -4416,10 +4911,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) |
2244 | |
2245 | ret = 0; |
2246 | if (ei->i_file_acl && |
2247 | - ((ei->i_file_acl < |
2248 | - (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) + |
2249 | - EXT4_SB(sb)->s_gdb_count)) || |
2250 | - (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) { |
2251 | + !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { |
2252 | ext4_error(sb, __func__, |
2253 | "bad extended attribute block %llu in inode #%lu", |
2254 | ei->i_file_acl, inode->i_ino); |
2255 | @@ -4437,10 +4929,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) |
2256 | /* Validate block references which are part of inode */ |
2257 | ret = ext4_check_inode_blockref(inode); |
2258 | } |
2259 | - if (ret) { |
2260 | - brelse(bh); |
2261 | + if (ret) |
2262 | goto bad_inode; |
2263 | - } |
2264 | |
2265 | if (S_ISREG(inode->i_mode)) { |
2266 | inode->i_op = &ext4_file_inode_operations; |
2267 | @@ -4468,7 +4958,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) |
2268 | init_special_inode(inode, inode->i_mode, |
2269 | new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); |
2270 | } else { |
2271 | - brelse(bh); |
2272 | ret = -EIO; |
2273 | ext4_error(inode->i_sb, __func__, |
2274 | "bogus i_mode (%o) for inode=%lu", |
2275 | @@ -4481,6 +4970,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) |
2276 | return inode; |
2277 | |
2278 | bad_inode: |
2279 | + brelse(iloc.bh); |
2280 | iget_failed(inode); |
2281 | return ERR_PTR(ret); |
2282 | } |
2283 | @@ -4581,8 +5071,7 @@ static int ext4_do_update_inode(handle_t *handle, |
2284 | if (ext4_inode_blocks_set(handle, raw_inode, ei)) |
2285 | goto out_brelse; |
2286 | raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); |
2287 | - /* clear the migrate flag in the raw_inode */ |
2288 | - raw_inode->i_flags = cpu_to_le32(ei->i_flags & ~EXT4_EXT_MIGRATE); |
2289 | + raw_inode->i_flags = cpu_to_le32(ei->i_flags); |
2290 | if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != |
2291 | cpu_to_le32(EXT4_OS_HURD)) |
2292 | raw_inode->i_file_acl_high = |
2293 | @@ -4641,6 +5130,7 @@ static int ext4_do_update_inode(handle_t *handle, |
2294 | err = rc; |
2295 | ei->i_state &= ~EXT4_STATE_NEW; |
2296 | |
2297 | + ext4_update_inode_fsync_trans(handle, inode, 0); |
2298 | out_brelse: |
2299 | brelse(bh); |
2300 | ext4_std_error(inode->i_sb, err); |
2301 | @@ -4684,19 +5174,40 @@ out_brelse: |
2302 | */ |
2303 | int ext4_write_inode(struct inode *inode, int wait) |
2304 | { |
2305 | + int err; |
2306 | + |
2307 | if (current->flags & PF_MEMALLOC) |
2308 | return 0; |
2309 | |
2310 | - if (ext4_journal_current_handle()) { |
2311 | - jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); |
2312 | - dump_stack(); |
2313 | - return -EIO; |
2314 | - } |
2315 | + if (EXT4_SB(inode->i_sb)->s_journal) { |
2316 | + if (ext4_journal_current_handle()) { |
2317 | + jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); |
2318 | + dump_stack(); |
2319 | + return -EIO; |
2320 | + } |
2321 | |
2322 | - if (!wait) |
2323 | - return 0; |
2324 | + if (!wait) |
2325 | + return 0; |
2326 | + |
2327 | + err = ext4_force_commit(inode->i_sb); |
2328 | + } else { |
2329 | + struct ext4_iloc iloc; |
2330 | |
2331 | - return ext4_force_commit(inode->i_sb); |
2332 | + err = ext4_get_inode_loc(inode, &iloc); |
2333 | + if (err) |
2334 | + return err; |
2335 | + if (wait) |
2336 | + sync_dirty_buffer(iloc.bh); |
2337 | + if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { |
2338 | + ext4_error(inode->i_sb, __func__, |
2339 | + "IO error syncing inode, " |
2340 | + "inode=%lu, block=%llu", |
2341 | + inode->i_ino, |
2342 | + (unsigned long long)iloc.bh->b_blocknr); |
2343 | + err = -EIO; |
2344 | + } |
2345 | + } |
2346 | + return err; |
2347 | } |
2348 | |
2349 | /* |
2350 | @@ -4739,8 +5250,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) |
2351 | |
2352 | /* (user+group)*(old+new) structure, inode write (sb, |
2353 | * inode block, ? - but truncate inode update has it) */ |
2354 | - handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+ |
2355 | - EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3); |
2356 | + handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+ |
2357 | + EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3); |
2358 | if (IS_ERR(handle)) { |
2359 | error = PTR_ERR(handle); |
2360 | goto err_out; |
2361 | @@ -5137,24 +5648,13 @@ void ext4_dirty_inode(struct inode *inode) |
2362 | handle_t *current_handle = ext4_journal_current_handle(); |
2363 | handle_t *handle; |
2364 | |
2365 | - if (!ext4_handle_valid(current_handle)) { |
2366 | - ext4_mark_inode_dirty(current_handle, inode); |
2367 | - return; |
2368 | - } |
2369 | - |
2370 | handle = ext4_journal_start(inode, 2); |
2371 | if (IS_ERR(handle)) |
2372 | goto out; |
2373 | - if (current_handle && |
2374 | - current_handle->h_transaction != handle->h_transaction) { |
2375 | - /* This task has a transaction open against a different fs */ |
2376 | - printk(KERN_EMERG "%s: transactions do not match!\n", |
2377 | - __func__); |
2378 | - } else { |
2379 | - jbd_debug(5, "marking dirty. outer handle=%p\n", |
2380 | - current_handle); |
2381 | - ext4_mark_inode_dirty(handle, inode); |
2382 | - } |
2383 | + |
2384 | + jbd_debug(5, "marking dirty. outer handle=%p\n", current_handle); |
2385 | + ext4_mark_inode_dirty(handle, inode); |
2386 | + |
2387 | ext4_journal_stop(handle); |
2388 | out: |
2389 | return; |
2390 | @@ -5281,12 +5781,21 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) |
2391 | else |
2392 | len = PAGE_CACHE_SIZE; |
2393 | |
2394 | + lock_page(page); |
2395 | + /* |
2396 | + * return if we have all the buffers mapped. This avoid |
2397 | + * the need to call write_begin/write_end which does a |
2398 | + * journal_start/journal_stop which can block and take |
2399 | + * long time |
2400 | + */ |
2401 | if (page_has_buffers(page)) { |
2402 | - /* return if we have all the buffers mapped */ |
2403 | if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, |
2404 | - ext4_bh_unmapped)) |
2405 | + ext4_bh_unmapped)) { |
2406 | + unlock_page(page); |
2407 | goto out_unlock; |
2408 | + } |
2409 | } |
2410 | + unlock_page(page); |
2411 | /* |
2412 | * OK, we need to fill the hole... Do write_begin write_end |
2413 | * to do block allocation/reservation.We are not holding |
2414 | diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c |
2415 | index 7050a9c..b63d193 100644 |
2416 | --- a/fs/ext4/ioctl.c |
2417 | +++ b/fs/ext4/ioctl.c |
2418 | @@ -221,32 +221,38 @@ setversion_out: |
2419 | struct file *donor_filp; |
2420 | int err; |
2421 | |
2422 | + if (!(filp->f_mode & FMODE_READ) || |
2423 | + !(filp->f_mode & FMODE_WRITE)) |
2424 | + return -EBADF; |
2425 | + |
2426 | if (copy_from_user(&me, |
2427 | (struct move_extent __user *)arg, sizeof(me))) |
2428 | return -EFAULT; |
2429 | + me.moved_len = 0; |
2430 | |
2431 | donor_filp = fget(me.donor_fd); |
2432 | if (!donor_filp) |
2433 | return -EBADF; |
2434 | |
2435 | - if (!capable(CAP_DAC_OVERRIDE)) { |
2436 | - if ((current->real_cred->fsuid != inode->i_uid) || |
2437 | - !(inode->i_mode & S_IRUSR) || |
2438 | - !(donor_filp->f_dentry->d_inode->i_mode & |
2439 | - S_IRUSR)) { |
2440 | - fput(donor_filp); |
2441 | - return -EACCES; |
2442 | - } |
2443 | + if (!(donor_filp->f_mode & FMODE_WRITE)) { |
2444 | + err = -EBADF; |
2445 | + goto mext_out; |
2446 | } |
2447 | |
2448 | + err = mnt_want_write(filp->f_path.mnt); |
2449 | + if (err) |
2450 | + goto mext_out; |
2451 | + |
2452 | err = ext4_move_extents(filp, donor_filp, me.orig_start, |
2453 | me.donor_start, me.len, &me.moved_len); |
2454 | - fput(donor_filp); |
2455 | + mnt_drop_write(filp->f_path.mnt); |
2456 | + if (me.moved_len > 0) |
2457 | + file_remove_suid(donor_filp); |
2458 | |
2459 | - if (!err) |
2460 | - if (copy_to_user((struct move_extent *)arg, |
2461 | - &me, sizeof(me))) |
2462 | - return -EFAULT; |
2463 | + if (copy_to_user((struct move_extent *)arg, &me, sizeof(me))) |
2464 | + err = -EFAULT; |
2465 | +mext_out: |
2466 | + fput(donor_filp); |
2467 | return err; |
2468 | } |
2469 | |
2470 | diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c |
2471 | index cd25846..099fd47 100644 |
2472 | --- a/fs/ext4/mballoc.c |
2473 | +++ b/fs/ext4/mballoc.c |
2474 | @@ -908,6 +908,97 @@ out: |
2475 | return err; |
2476 | } |
2477 | |
2478 | +static noinline_for_stack |
2479 | +int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) |
2480 | +{ |
2481 | + |
2482 | + int ret = 0; |
2483 | + void *bitmap; |
2484 | + int blocks_per_page; |
2485 | + int block, pnum, poff; |
2486 | + int num_grp_locked = 0; |
2487 | + struct ext4_group_info *this_grp; |
2488 | + struct ext4_sb_info *sbi = EXT4_SB(sb); |
2489 | + struct inode *inode = sbi->s_buddy_cache; |
2490 | + struct page *page = NULL, *bitmap_page = NULL; |
2491 | + |
2492 | + mb_debug("init group %lu\n", group); |
2493 | + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; |
2494 | + this_grp = ext4_get_group_info(sb, group); |
2495 | + /* |
2496 | + * This ensures we don't add group |
2497 | + * to this buddy cache via resize |
2498 | + */ |
2499 | + num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group); |
2500 | + if (!EXT4_MB_GRP_NEED_INIT(this_grp)) { |
2501 | + /* |
2502 | + * somebody initialized the group |
2503 | + * return without doing anything |
2504 | + */ |
2505 | + ret = 0; |
2506 | + goto err; |
2507 | + } |
2508 | + /* |
2509 | + * the buddy cache inode stores the block bitmap |
2510 | + * and buddy information in consecutive blocks. |
2511 | + * So for each group we need two blocks. |
2512 | + */ |
2513 | + block = group * 2; |
2514 | + pnum = block / blocks_per_page; |
2515 | + poff = block % blocks_per_page; |
2516 | + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); |
2517 | + if (page) { |
2518 | + BUG_ON(page->mapping != inode->i_mapping); |
2519 | + ret = ext4_mb_init_cache(page, NULL); |
2520 | + if (ret) { |
2521 | + unlock_page(page); |
2522 | + goto err; |
2523 | + } |
2524 | + unlock_page(page); |
2525 | + } |
2526 | + if (page == NULL || !PageUptodate(page)) { |
2527 | + ret = -EIO; |
2528 | + goto err; |
2529 | + } |
2530 | + mark_page_accessed(page); |
2531 | + bitmap_page = page; |
2532 | + bitmap = page_address(page) + (poff * sb->s_blocksize); |
2533 | + |
2534 | + /* init buddy cache */ |
2535 | + block++; |
2536 | + pnum = block / blocks_per_page; |
2537 | + poff = block % blocks_per_page; |
2538 | + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); |
2539 | + if (page == bitmap_page) { |
2540 | + /* |
2541 | + * If both the bitmap and buddy are in |
2542 | + * the same page we don't need to force |
2543 | + * init the buddy |
2544 | + */ |
2545 | + unlock_page(page); |
2546 | + } else if (page) { |
2547 | + BUG_ON(page->mapping != inode->i_mapping); |
2548 | + ret = ext4_mb_init_cache(page, bitmap); |
2549 | + if (ret) { |
2550 | + unlock_page(page); |
2551 | + goto err; |
2552 | + } |
2553 | + unlock_page(page); |
2554 | + } |
2555 | + if (page == NULL || !PageUptodate(page)) { |
2556 | + ret = -EIO; |
2557 | + goto err; |
2558 | + } |
2559 | + mark_page_accessed(page); |
2560 | +err: |
2561 | + ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked); |
2562 | + if (bitmap_page) |
2563 | + page_cache_release(bitmap_page); |
2564 | + if (page) |
2565 | + page_cache_release(page); |
2566 | + return ret; |
2567 | +} |
2568 | + |
2569 | static noinline_for_stack int |
2570 | ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, |
2571 | struct ext4_buddy *e4b) |
2572 | @@ -941,8 +1032,26 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, |
2573 | * groups mapped by the page is blocked |
2574 | * till we are done with allocation |
2575 | */ |
2576 | +repeat_load_buddy: |
2577 | down_read(e4b->alloc_semp); |
2578 | |
2579 | + if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { |
2580 | + /* we need to check for group need init flag |
2581 | + * with alloc_semp held so that we can be sure |
2582 | + * that new blocks didn't get added to the group |
2583 | + * when we are loading the buddy cache |
2584 | + */ |
2585 | + up_read(e4b->alloc_semp); |
2586 | + /* |
2587 | + * we need full data about the group |
2588 | + * to make a good selection |
2589 | + */ |
2590 | + ret = ext4_mb_init_group(sb, group); |
2591 | + if (ret) |
2592 | + return ret; |
2593 | + goto repeat_load_buddy; |
2594 | + } |
2595 | + |
2596 | /* |
2597 | * the buddy cache inode stores the block bitmap |
2598 | * and buddy information in consecutive blocks. |
2599 | @@ -1360,7 +1469,7 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, |
2600 | ac->alloc_semp = e4b->alloc_semp; |
2601 | e4b->alloc_semp = NULL; |
2602 | /* store last allocated for subsequent stream allocation */ |
2603 | - if ((ac->ac_flags & EXT4_MB_HINT_DATA)) { |
2604 | + if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { |
2605 | spin_lock(&sbi->s_md_lock); |
2606 | sbi->s_mb_last_group = ac->ac_f_ex.fe_group; |
2607 | sbi->s_mb_last_start = ac->ac_f_ex.fe_start; |
2608 | @@ -1837,97 +1946,6 @@ void ext4_mb_put_buddy_cache_lock(struct super_block *sb, |
2609 | |
2610 | } |
2611 | |
2612 | -static noinline_for_stack |
2613 | -int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) |
2614 | -{ |
2615 | - |
2616 | - int ret; |
2617 | - void *bitmap; |
2618 | - int blocks_per_page; |
2619 | - int block, pnum, poff; |
2620 | - int num_grp_locked = 0; |
2621 | - struct ext4_group_info *this_grp; |
2622 | - struct ext4_sb_info *sbi = EXT4_SB(sb); |
2623 | - struct inode *inode = sbi->s_buddy_cache; |
2624 | - struct page *page = NULL, *bitmap_page = NULL; |
2625 | - |
2626 | - mb_debug("init group %lu\n", group); |
2627 | - blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; |
2628 | - this_grp = ext4_get_group_info(sb, group); |
2629 | - /* |
2630 | - * This ensures we don't add group |
2631 | - * to this buddy cache via resize |
2632 | - */ |
2633 | - num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group); |
2634 | - if (!EXT4_MB_GRP_NEED_INIT(this_grp)) { |
2635 | - /* |
2636 | - * somebody initialized the group |
2637 | - * return without doing anything |
2638 | - */ |
2639 | - ret = 0; |
2640 | - goto err; |
2641 | - } |
2642 | - /* |
2643 | - * the buddy cache inode stores the block bitmap |
2644 | - * and buddy information in consecutive blocks. |
2645 | - * So for each group we need two blocks. |
2646 | - */ |
2647 | - block = group * 2; |
2648 | - pnum = block / blocks_per_page; |
2649 | - poff = block % blocks_per_page; |
2650 | - page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); |
2651 | - if (page) { |
2652 | - BUG_ON(page->mapping != inode->i_mapping); |
2653 | - ret = ext4_mb_init_cache(page, NULL); |
2654 | - if (ret) { |
2655 | - unlock_page(page); |
2656 | - goto err; |
2657 | - } |
2658 | - unlock_page(page); |
2659 | - } |
2660 | - if (page == NULL || !PageUptodate(page)) { |
2661 | - ret = -EIO; |
2662 | - goto err; |
2663 | - } |
2664 | - mark_page_accessed(page); |
2665 | - bitmap_page = page; |
2666 | - bitmap = page_address(page) + (poff * sb->s_blocksize); |
2667 | - |
2668 | - /* init buddy cache */ |
2669 | - block++; |
2670 | - pnum = block / blocks_per_page; |
2671 | - poff = block % blocks_per_page; |
2672 | - page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); |
2673 | - if (page == bitmap_page) { |
2674 | - /* |
2675 | - * If both the bitmap and buddy are in |
2676 | - * the same page we don't need to force |
2677 | - * init the buddy |
2678 | - */ |
2679 | - unlock_page(page); |
2680 | - } else if (page) { |
2681 | - BUG_ON(page->mapping != inode->i_mapping); |
2682 | - ret = ext4_mb_init_cache(page, bitmap); |
2683 | - if (ret) { |
2684 | - unlock_page(page); |
2685 | - goto err; |
2686 | - } |
2687 | - unlock_page(page); |
2688 | - } |
2689 | - if (page == NULL || !PageUptodate(page)) { |
2690 | - ret = -EIO; |
2691 | - goto err; |
2692 | - } |
2693 | - mark_page_accessed(page); |
2694 | -err: |
2695 | - ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked); |
2696 | - if (bitmap_page) |
2697 | - page_cache_release(bitmap_page); |
2698 | - if (page) |
2699 | - page_cache_release(page); |
2700 | - return ret; |
2701 | -} |
2702 | - |
2703 | static noinline_for_stack int |
2704 | ext4_mb_regular_allocator(struct ext4_allocation_context *ac) |
2705 | { |
2706 | @@ -1938,11 +1956,14 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) |
2707 | struct ext4_sb_info *sbi; |
2708 | struct super_block *sb; |
2709 | struct ext4_buddy e4b; |
2710 | - loff_t size, isize; |
2711 | |
2712 | sb = ac->ac_sb; |
2713 | sbi = EXT4_SB(sb); |
2714 | ngroups = ext4_get_groups_count(sb); |
2715 | + /* non-extent files are limited to low blocks/groups */ |
2716 | + if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL)) |
2717 | + ngroups = sbi->s_blockfile_groups; |
2718 | + |
2719 | BUG_ON(ac->ac_status == AC_STATUS_FOUND); |
2720 | |
2721 | /* first, try the goal */ |
2722 | @@ -1974,20 +1995,16 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) |
2723 | } |
2724 | |
2725 | bsbits = ac->ac_sb->s_blocksize_bits; |
2726 | - /* if stream allocation is enabled, use global goal */ |
2727 | - size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; |
2728 | - isize = i_size_read(ac->ac_inode) >> bsbits; |
2729 | - if (size < isize) |
2730 | - size = isize; |
2731 | |
2732 | - if (size < sbi->s_mb_stream_request && |
2733 | - (ac->ac_flags & EXT4_MB_HINT_DATA)) { |
2734 | + /* if stream allocation is enabled, use global goal */ |
2735 | + if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { |
2736 | /* TBD: may be hot point */ |
2737 | spin_lock(&sbi->s_md_lock); |
2738 | ac->ac_g_ex.fe_group = sbi->s_mb_last_group; |
2739 | ac->ac_g_ex.fe_start = sbi->s_mb_last_start; |
2740 | spin_unlock(&sbi->s_md_lock); |
2741 | } |
2742 | + |
2743 | /* Let's just scan groups to find more-less suitable blocks */ |
2744 | cr = ac->ac_2order ? 0 : 1; |
2745 | /* |
2746 | @@ -2015,27 +2032,6 @@ repeat: |
2747 | if (grp->bb_free == 0) |
2748 | continue; |
2749 | |
2750 | - /* |
2751 | - * if the group is already init we check whether it is |
2752 | - * a good group and if not we don't load the buddy |
2753 | - */ |
2754 | - if (EXT4_MB_GRP_NEED_INIT(grp)) { |
2755 | - /* |
2756 | - * we need full data about the group |
2757 | - * to make a good selection |
2758 | - */ |
2759 | - err = ext4_mb_init_group(sb, group); |
2760 | - if (err) |
2761 | - goto out; |
2762 | - } |
2763 | - |
2764 | - /* |
2765 | - * If the particular group doesn't satisfy our |
2766 | - * criteria we continue with the next group |
2767 | - */ |
2768 | - if (!ext4_mb_good_group(ac, group, cr)) |
2769 | - continue; |
2770 | - |
2771 | err = ext4_mb_load_buddy(sb, group, &e4b); |
2772 | if (err) |
2773 | goto out; |
2774 | @@ -2571,13 +2567,11 @@ static int ext4_mb_init_backend(struct super_block *sb) |
2775 | { |
2776 | ext4_group_t ngroups = ext4_get_groups_count(sb); |
2777 | ext4_group_t i; |
2778 | - int metalen; |
2779 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
2780 | struct ext4_super_block *es = sbi->s_es; |
2781 | int num_meta_group_infos; |
2782 | int num_meta_group_infos_max; |
2783 | int array_size; |
2784 | - struct ext4_group_info **meta_group_info; |
2785 | struct ext4_group_desc *desc; |
2786 | |
2787 | /* This is the number of blocks used by GDT */ |
2788 | @@ -2622,22 +2616,6 @@ static int ext4_mb_init_backend(struct super_block *sb) |
2789 | goto err_freesgi; |
2790 | } |
2791 | EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; |
2792 | - |
2793 | - metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb); |
2794 | - for (i = 0; i < num_meta_group_infos; i++) { |
2795 | - if ((i + 1) == num_meta_group_infos) |
2796 | - metalen = sizeof(*meta_group_info) * |
2797 | - (ngroups - |
2798 | - (i << EXT4_DESC_PER_BLOCK_BITS(sb))); |
2799 | - meta_group_info = kmalloc(metalen, GFP_KERNEL); |
2800 | - if (meta_group_info == NULL) { |
2801 | - printk(KERN_ERR "EXT4-fs: can't allocate mem for a " |
2802 | - "buddy group\n"); |
2803 | - goto err_freemeta; |
2804 | - } |
2805 | - sbi->s_group_info[i] = meta_group_info; |
2806 | - } |
2807 | - |
2808 | for (i = 0; i < ngroups; i++) { |
2809 | desc = ext4_get_group_desc(sb, i, NULL); |
2810 | if (desc == NULL) { |
2811 | @@ -2655,7 +2633,6 @@ err_freebuddy: |
2812 | while (i-- > 0) |
2813 | kfree(ext4_get_group_info(sb, i)); |
2814 | i = num_meta_group_infos; |
2815 | -err_freemeta: |
2816 | while (i-- > 0) |
2817 | kfree(sbi->s_group_info[i]); |
2818 | iput(sbi->s_buddy_cache); |
2819 | @@ -2833,7 +2810,6 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) |
2820 | struct ext4_group_info *db; |
2821 | int err, count = 0, count2 = 0; |
2822 | struct ext4_free_data *entry; |
2823 | - ext4_fsblk_t discard_block; |
2824 | struct list_head *l, *ltmp; |
2825 | |
2826 | list_for_each_safe(l, ltmp, &txn->t_private_list) { |
2827 | @@ -2863,13 +2839,19 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) |
2828 | page_cache_release(e4b.bd_bitmap_page); |
2829 | } |
2830 | ext4_unlock_group(sb, entry->group); |
2831 | - discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb) |
2832 | - + entry->start_blk |
2833 | - + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); |
2834 | - trace_ext4_discard_blocks(sb, (unsigned long long)discard_block, |
2835 | - entry->count); |
2836 | - sb_issue_discard(sb, discard_block, entry->count); |
2837 | - |
2838 | + if (test_opt(sb, DISCARD)) { |
2839 | + ext4_fsblk_t discard_block; |
2840 | + struct ext4_super_block *es = EXT4_SB(sb)->s_es; |
2841 | + |
2842 | + discard_block = (ext4_fsblk_t)entry->group * |
2843 | + EXT4_BLOCKS_PER_GROUP(sb) |
2844 | + + entry->start_blk |
2845 | + + le32_to_cpu(es->s_first_data_block); |
2846 | + trace_ext4_discard_blocks(sb, |
2847 | + (unsigned long long)discard_block, |
2848 | + entry->count); |
2849 | + sb_issue_discard(sb, discard_block, entry->count); |
2850 | + } |
2851 | kmem_cache_free(ext4_free_ext_cachep, entry); |
2852 | ext4_mb_release_desc(&e4b); |
2853 | } |
2854 | @@ -3276,6 +3258,24 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) |
2855 | } |
2856 | |
2857 | /* |
2858 | + * Called on failure; free up any blocks from the inode PA for this |
2859 | + * context. We don't need this for MB_GROUP_PA because we only change |
2860 | + * pa_free in ext4_mb_release_context(), but on failure, we've already |
2861 | + * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed. |
2862 | + */ |
2863 | +static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) |
2864 | +{ |
2865 | + struct ext4_prealloc_space *pa = ac->ac_pa; |
2866 | + int len; |
2867 | + |
2868 | + if (pa && pa->pa_type == MB_INODE_PA) { |
2869 | + len = ac->ac_b_ex.fe_len; |
2870 | + pa->pa_free += len; |
2871 | + } |
2872 | + |
2873 | +} |
2874 | + |
2875 | +/* |
2876 | * use blocks preallocated to inode |
2877 | */ |
2878 | static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, |
2879 | @@ -3382,6 +3382,11 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) |
2880 | ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len) |
2881 | continue; |
2882 | |
2883 | + /* non-extent files can't have physical blocks past 2^32 */ |
2884 | + if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL) && |
2885 | + pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS) |
2886 | + continue; |
2887 | + |
2888 | /* found preallocated blocks, use them */ |
2889 | spin_lock(&pa->pa_lock); |
2890 | if (pa->pa_deleted == 0 && pa->pa_free) { |
2891 | @@ -4174,16 +4179,26 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) |
2892 | if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) |
2893 | return; |
2894 | |
2895 | + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) |
2896 | + return; |
2897 | + |
2898 | size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; |
2899 | - isize = i_size_read(ac->ac_inode) >> bsbits; |
2900 | - size = max(size, isize); |
2901 | + isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) |
2902 | + >> bsbits; |
2903 | |
2904 | - /* don't use group allocation for large files */ |
2905 | - if (size >= sbi->s_mb_stream_request) |
2906 | + if ((size == isize) && |
2907 | + !ext4_fs_is_busy(sbi) && |
2908 | + (atomic_read(&ac->ac_inode->i_writecount) == 0)) { |
2909 | + ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; |
2910 | return; |
2911 | + } |
2912 | |
2913 | - if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) |
2914 | + /* don't use group allocation for large files */ |
2915 | + size = max(size, isize); |
2916 | + if (size >= sbi->s_mb_stream_request) { |
2917 | + ac->ac_flags |= EXT4_MB_STREAM_ALLOC; |
2918 | return; |
2919 | + } |
2920 | |
2921 | BUG_ON(ac->ac_lg != NULL); |
2922 | /* |
2923 | @@ -4549,6 +4564,7 @@ repeat: |
2924 | ac->ac_status = AC_STATUS_CONTINUE; |
2925 | goto repeat; |
2926 | } else if (*errp) { |
2927 | + ext4_discard_allocated_blocks(ac); |
2928 | ac->ac_b_ex.fe_len = 0; |
2929 | ar->len = 0; |
2930 | ext4_mb_show_ac(ac); |
2931 | diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c |
2932 | index 313a50b..8646149 100644 |
2933 | --- a/fs/ext4/migrate.c |
2934 | +++ b/fs/ext4/migrate.c |
2935 | @@ -75,7 +75,7 @@ static int finish_range(handle_t *handle, struct inode *inode, |
2936 | goto err_out; |
2937 | } |
2938 | } |
2939 | - retval = ext4_ext_insert_extent(handle, inode, path, &newext); |
2940 | + retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0); |
2941 | err_out: |
2942 | if (path) { |
2943 | ext4_ext_drop_refs(path); |
2944 | @@ -238,7 +238,7 @@ static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode) |
2945 | * So allocate a credit of 3. We may update |
2946 | * quota (user and group). |
2947 | */ |
2948 | - needed = 3 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); |
2949 | + needed = 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); |
2950 | |
2951 | if (ext4_journal_extend(handle, needed) != 0) |
2952 | retval = ext4_journal_restart(handle, needed); |
2953 | @@ -353,17 +353,16 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, |
2954 | |
2955 | down_write(&EXT4_I(inode)->i_data_sem); |
2956 | /* |
2957 | - * if EXT4_EXT_MIGRATE is cleared a block allocation |
2958 | + * if EXT4_STATE_EXT_MIGRATE is cleared a block allocation |
2959 | * happened after we started the migrate. We need to |
2960 | * fail the migrate |
2961 | */ |
2962 | - if (!(EXT4_I(inode)->i_flags & EXT4_EXT_MIGRATE)) { |
2963 | + if (!(EXT4_I(inode)->i_state & EXT4_STATE_EXT_MIGRATE)) { |
2964 | retval = -EAGAIN; |
2965 | up_write(&EXT4_I(inode)->i_data_sem); |
2966 | goto err_out; |
2967 | } else |
2968 | - EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags & |
2969 | - ~EXT4_EXT_MIGRATE; |
2970 | + EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE; |
2971 | /* |
2972 | * We have the extent map build with the tmp inode. |
2973 | * Now copy the i_data across |
2974 | @@ -478,7 +477,7 @@ int ext4_ext_migrate(struct inode *inode) |
2975 | handle = ext4_journal_start(inode, |
2976 | EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + |
2977 | EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + |
2978 | - 2 * EXT4_QUOTA_INIT_BLOCKS(inode->i_sb) |
2979 | + EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) |
2980 | + 1); |
2981 | if (IS_ERR(handle)) { |
2982 | retval = PTR_ERR(handle); |
2983 | @@ -517,14 +516,15 @@ int ext4_ext_migrate(struct inode *inode) |
2984 | * when we add extents we extent the journal |
2985 | */ |
2986 | /* |
2987 | - * Even though we take i_mutex we can still cause block allocation |
2988 | - * via mmap write to holes. If we have allocated new blocks we fail |
2989 | - * migrate. New block allocation will clear EXT4_EXT_MIGRATE flag. |
2990 | - * The flag is updated with i_data_sem held to prevent racing with |
2991 | - * block allocation. |
2992 | + * Even though we take i_mutex we can still cause block |
2993 | + * allocation via mmap write to holes. If we have allocated |
2994 | + * new blocks we fail migrate. New block allocation will |
2995 | + * clear EXT4_STATE_EXT_MIGRATE flag. The flag is updated |
2996 | + * with i_data_sem held to prevent racing with block |
2997 | + * allocation. |
2998 | */ |
2999 | down_read((&EXT4_I(inode)->i_data_sem)); |
3000 | - EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags | EXT4_EXT_MIGRATE; |
3001 | + EXT4_I(inode)->i_state |= EXT4_STATE_EXT_MIGRATE; |
3002 | up_read((&EXT4_I(inode)->i_data_sem)); |
3003 | |
3004 | handle = ext4_journal_start(inode, 1); |
3005 | @@ -618,7 +618,7 @@ err_out: |
3006 | tmp_inode->i_nlink = 0; |
3007 | |
3008 | ext4_journal_stop(handle); |
3009 | - |
3010 | + unlock_new_inode(tmp_inode); |
3011 | iput(tmp_inode); |
3012 | |
3013 | return retval; |
3014 | diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c |
3015 | index bbf2dd9..9a573a6 100644 |
3016 | --- a/fs/ext4/move_extent.c |
3017 | +++ b/fs/ext4/move_extent.c |
3018 | @@ -19,14 +19,31 @@ |
3019 | #include "ext4_extents.h" |
3020 | #include "ext4.h" |
3021 | |
3022 | -#define get_ext_path(path, inode, block, ret) \ |
3023 | - do { \ |
3024 | - path = ext4_ext_find_extent(inode, block, path); \ |
3025 | - if (IS_ERR(path)) { \ |
3026 | - ret = PTR_ERR(path); \ |
3027 | - path = NULL; \ |
3028 | - } \ |
3029 | - } while (0) |
3030 | +/** |
3031 | + * get_ext_path - Find an extent path for designated logical block number. |
3032 | + * |
3033 | + * @inode: an inode which is searched |
3034 | + * @lblock: logical block number to find an extent path |
3035 | + * @path: pointer to an extent path pointer (for output) |
3036 | + * |
3037 | + * ext4_ext_find_extent wrapper. Return 0 on success, or a negative error value |
3038 | + * on failure. |
3039 | + */ |
3040 | +static inline int |
3041 | +get_ext_path(struct inode *inode, ext4_lblk_t lblock, |
3042 | + struct ext4_ext_path **path) |
3043 | +{ |
3044 | + int ret = 0; |
3045 | + |
3046 | + *path = ext4_ext_find_extent(inode, lblock, *path); |
3047 | + if (IS_ERR(*path)) { |
3048 | + ret = PTR_ERR(*path); |
3049 | + *path = NULL; |
3050 | + } else if ((*path)[ext_depth(inode)].p_ext == NULL) |
3051 | + ret = -ENODATA; |
3052 | + |
3053 | + return ret; |
3054 | +} |
3055 | |
3056 | /** |
3057 | * copy_extent_status - Copy the extent's initialization status |
3058 | @@ -60,12 +77,14 @@ static int |
3059 | mext_next_extent(struct inode *inode, struct ext4_ext_path *path, |
3060 | struct ext4_extent **extent) |
3061 | { |
3062 | + struct ext4_extent_header *eh; |
3063 | int ppos, leaf_ppos = path->p_depth; |
3064 | |
3065 | ppos = leaf_ppos; |
3066 | if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) { |
3067 | /* leaf block */ |
3068 | *extent = ++path[ppos].p_ext; |
3069 | + path[ppos].p_block = ext_pblock(path[ppos].p_ext); |
3070 | return 0; |
3071 | } |
3072 | |
3073 | @@ -102,9 +121,18 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path, |
3074 | ext_block_hdr(path[cur_ppos+1].p_bh); |
3075 | } |
3076 | |
3077 | + path[leaf_ppos].p_ext = *extent = NULL; |
3078 | + |
3079 | + eh = path[leaf_ppos].p_hdr; |
3080 | + if (le16_to_cpu(eh->eh_entries) == 0) |
3081 | + /* empty leaf is found */ |
3082 | + return -ENODATA; |
3083 | + |
3084 | /* leaf block */ |
3085 | path[leaf_ppos].p_ext = *extent = |
3086 | EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr); |
3087 | + path[leaf_ppos].p_block = |
3088 | + ext_pblock(path[leaf_ppos].p_ext); |
3089 | return 0; |
3090 | } |
3091 | } |
3092 | @@ -113,47 +141,43 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path, |
3093 | } |
3094 | |
3095 | /** |
3096 | - * mext_double_down_read - Acquire two inodes' read semaphore |
3097 | + * mext_check_null_inode - NULL check for two inodes |
3098 | * |
3099 | - * @orig_inode: original inode structure |
3100 | - * @donor_inode: donor inode structure |
3101 | - * Acquire read semaphore of the two inodes (orig and donor) by i_ino order. |
3102 | + * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. |
3103 | */ |
3104 | -static void |
3105 | -mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode) |
3106 | +static int |
3107 | +mext_check_null_inode(struct inode *inode1, struct inode *inode2, |
3108 | + const char *function) |
3109 | { |
3110 | - struct inode *first = orig_inode, *second = donor_inode; |
3111 | - |
3112 | - BUG_ON(orig_inode == NULL || donor_inode == NULL); |
3113 | - |
3114 | - /* |
3115 | - * Use the inode number to provide the stable locking order instead |
3116 | - * of its address, because the C language doesn't guarantee you can |
3117 | - * compare pointers that don't come from the same array. |
3118 | - */ |
3119 | - if (donor_inode->i_ino < orig_inode->i_ino) { |
3120 | - first = donor_inode; |
3121 | - second = orig_inode; |
3122 | + int ret = 0; |
3123 | + |
3124 | + if (inode1 == NULL) { |
3125 | + ext4_error(inode2->i_sb, function, |
3126 | + "Both inodes should not be NULL: " |
3127 | + "inode1 NULL inode2 %lu", inode2->i_ino); |
3128 | + ret = -EIO; |
3129 | + } else if (inode2 == NULL) { |
3130 | + ext4_error(inode1->i_sb, function, |
3131 | + "Both inodes should not be NULL: " |
3132 | + "inode1 %lu inode2 NULL", inode1->i_ino); |
3133 | + ret = -EIO; |
3134 | } |
3135 | - |
3136 | - down_read(&EXT4_I(first)->i_data_sem); |
3137 | - down_read(&EXT4_I(second)->i_data_sem); |
3138 | + return ret; |
3139 | } |
3140 | |
3141 | /** |
3142 | - * mext_double_down_write - Acquire two inodes' write semaphore |
3143 | + * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem |
3144 | * |
3145 | * @orig_inode: original inode structure |
3146 | * @donor_inode: donor inode structure |
3147 | - * Acquire write semaphore of the two inodes (orig and donor) by i_ino order. |
3148 | + * Acquire write lock of i_data_sem of the two inodes (orig and donor) by |
3149 | + * i_ino order. |
3150 | */ |
3151 | static void |
3152 | -mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode) |
3153 | +double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) |
3154 | { |
3155 | struct inode *first = orig_inode, *second = donor_inode; |
3156 | |
3157 | - BUG_ON(orig_inode == NULL || donor_inode == NULL); |
3158 | - |
3159 | /* |
3160 | * Use the inode number to provide the stable locking order instead |
3161 | * of its address, because the C language doesn't guarantee you can |
3162 | @@ -165,37 +189,19 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode) |
3163 | } |
3164 | |
3165 | down_write(&EXT4_I(first)->i_data_sem); |
3166 | - down_write(&EXT4_I(second)->i_data_sem); |
3167 | -} |
3168 | - |
3169 | -/** |
3170 | - * mext_double_up_read - Release two inodes' read semaphore |
3171 | - * |
3172 | - * @orig_inode: original inode structure to be released its lock first |
3173 | - * @donor_inode: donor inode structure to be released its lock second |
3174 | - * Release read semaphore of two inodes (orig and donor). |
3175 | - */ |
3176 | -static void |
3177 | -mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode) |
3178 | -{ |
3179 | - BUG_ON(orig_inode == NULL || donor_inode == NULL); |
3180 | - |
3181 | - up_read(&EXT4_I(orig_inode)->i_data_sem); |
3182 | - up_read(&EXT4_I(donor_inode)->i_data_sem); |
3183 | + down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING); |
3184 | } |
3185 | |
3186 | /** |
3187 | - * mext_double_up_write - Release two inodes' write semaphore |
3188 | + * double_up_write_data_sem - Release two inodes' write lock of i_data_sem |
3189 | * |
3190 | * @orig_inode: original inode structure to be released its lock first |
3191 | * @donor_inode: donor inode structure to be released its lock second |
3192 | - * Release write semaphore of two inodes (orig and donor). |
3193 | + * Release write lock of i_data_sem of two inodes (orig and donor). |
3194 | */ |
3195 | static void |
3196 | -mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode) |
3197 | +double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) |
3198 | { |
3199 | - BUG_ON(orig_inode == NULL || donor_inode == NULL); |
3200 | - |
3201 | up_write(&EXT4_I(orig_inode)->i_data_sem); |
3202 | up_write(&EXT4_I(donor_inode)->i_data_sem); |
3203 | } |
3204 | @@ -283,23 +289,23 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode, |
3205 | } |
3206 | |
3207 | if (new_flag) { |
3208 | - get_ext_path(orig_path, orig_inode, eblock, err); |
3209 | - if (orig_path == NULL) |
3210 | + err = get_ext_path(orig_inode, eblock, &orig_path); |
3211 | + if (err) |
3212 | goto out; |
3213 | |
3214 | if (ext4_ext_insert_extent(handle, orig_inode, |
3215 | - orig_path, new_ext)) |
3216 | + orig_path, new_ext, 0)) |
3217 | goto out; |
3218 | } |
3219 | |
3220 | if (end_flag) { |
3221 | - get_ext_path(orig_path, orig_inode, |
3222 | - le32_to_cpu(end_ext->ee_block) - 1, err); |
3223 | - if (orig_path == NULL) |
3224 | + err = get_ext_path(orig_inode, |
3225 | + le32_to_cpu(end_ext->ee_block) - 1, &orig_path); |
3226 | + if (err) |
3227 | goto out; |
3228 | |
3229 | if (ext4_ext_insert_extent(handle, orig_inode, |
3230 | - orig_path, end_ext)) |
3231 | + orig_path, end_ext, 0)) |
3232 | goto out; |
3233 | } |
3234 | out: |
3235 | @@ -519,7 +525,15 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode, |
3236 | * oext |-----------| |
3237 | * new_ext |-------| |
3238 | */ |
3239 | - BUG_ON(le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end); |
3240 | + if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) { |
3241 | + ext4_error(orig_inode->i_sb, __func__, |
3242 | + "new_ext_end(%u) should be less than or equal to " |
3243 | + "oext->ee_block(%u) + oext_alen(%d) - 1", |
3244 | + new_ext_end, le32_to_cpu(oext->ee_block), |
3245 | + oext_alen); |
3246 | + ret = -EIO; |
3247 | + goto out; |
3248 | + } |
3249 | |
3250 | /* |
3251 | * Case: new_ext is smaller than original extent |
3252 | @@ -543,6 +557,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode, |
3253 | |
3254 | ret = mext_insert_extents(handle, orig_inode, orig_path, o_start, |
3255 | o_end, &start_ext, &new_ext, &end_ext); |
3256 | +out: |
3257 | return ret; |
3258 | } |
3259 | |
3260 | @@ -554,8 +569,10 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode, |
3261 | * @orig_off: block offset of original inode |
3262 | * @donor_off: block offset of donor inode |
3263 | * @max_count: the maximun length of extents |
3264 | + * |
3265 | + * Return 0 on success, or a negative error value on failure. |
3266 | */ |
3267 | -static void |
3268 | +static int |
3269 | mext_calc_swap_extents(struct ext4_extent *tmp_dext, |
3270 | struct ext4_extent *tmp_oext, |
3271 | ext4_lblk_t orig_off, ext4_lblk_t donor_off, |
3272 | @@ -564,6 +581,19 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext, |
3273 | ext4_lblk_t diff, orig_diff; |
3274 | struct ext4_extent dext_old, oext_old; |
3275 | |
3276 | + BUG_ON(orig_off != donor_off); |
3277 | + |
3278 | + /* original and donor extents have to cover the same block offset */ |
3279 | + if (orig_off < le32_to_cpu(tmp_oext->ee_block) || |
3280 | + le32_to_cpu(tmp_oext->ee_block) + |
3281 | + ext4_ext_get_actual_len(tmp_oext) - 1 < orig_off) |
3282 | + return -ENODATA; |
3283 | + |
3284 | + if (orig_off < le32_to_cpu(tmp_dext->ee_block) || |
3285 | + le32_to_cpu(tmp_dext->ee_block) + |
3286 | + ext4_ext_get_actual_len(tmp_dext) - 1 < orig_off) |
3287 | + return -ENODATA; |
3288 | + |
3289 | dext_old = *tmp_dext; |
3290 | oext_old = *tmp_oext; |
3291 | |
3292 | @@ -591,6 +621,8 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext, |
3293 | |
3294 | copy_extent_status(&oext_old, tmp_dext); |
3295 | copy_extent_status(&dext_old, tmp_oext); |
3296 | + |
3297 | + return 0; |
3298 | } |
3299 | |
3300 | /** |
3301 | @@ -601,6 +633,7 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext, |
3302 | * @donor_inode: donor inode |
3303 | * @from: block offset of orig_inode |
3304 | * @count: block count to be replaced |
3305 | + * @err: pointer to save return value |
3306 | * |
3307 | * Replace original inode extents and donor inode extents page by page. |
3308 | * We implement this replacement in the following three steps: |
3309 | @@ -611,33 +644,33 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext, |
3310 | * 3. Change the block information of donor inode to point at the saved |
3311 | * original inode blocks in the dummy extents. |
3312 | * |
3313 | - * Return 0 on success, or a negative error value on failure. |
3314 | + * Return replaced block count. |
3315 | */ |
3316 | static int |
3317 | mext_replace_branches(handle_t *handle, struct inode *orig_inode, |
3318 | struct inode *donor_inode, ext4_lblk_t from, |
3319 | - ext4_lblk_t count) |
3320 | + ext4_lblk_t count, int *err) |
3321 | { |
3322 | struct ext4_ext_path *orig_path = NULL; |
3323 | struct ext4_ext_path *donor_path = NULL; |
3324 | struct ext4_extent *oext, *dext; |
3325 | struct ext4_extent tmp_dext, tmp_oext; |
3326 | ext4_lblk_t orig_off = from, donor_off = from; |
3327 | - int err = 0; |
3328 | int depth; |
3329 | int replaced_count = 0; |
3330 | int dext_alen; |
3331 | |
3332 | - mext_double_down_write(orig_inode, donor_inode); |
3333 | + /* Protect extent trees against block allocations via delalloc */ |
3334 | + double_down_write_data_sem(orig_inode, donor_inode); |
3335 | |
3336 | /* Get the original extent for the block "orig_off" */ |
3337 | - get_ext_path(orig_path, orig_inode, orig_off, err); |
3338 | - if (orig_path == NULL) |
3339 | + *err = get_ext_path(orig_inode, orig_off, &orig_path); |
3340 | + if (*err) |
3341 | goto out; |
3342 | |
3343 | /* Get the donor extent for the head */ |
3344 | - get_ext_path(donor_path, donor_inode, donor_off, err); |
3345 | - if (donor_path == NULL) |
3346 | + *err = get_ext_path(donor_inode, donor_off, &donor_path); |
3347 | + if (*err) |
3348 | goto out; |
3349 | depth = ext_depth(orig_inode); |
3350 | oext = orig_path[depth].p_ext; |
3351 | @@ -647,24 +680,39 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, |
3352 | dext = donor_path[depth].p_ext; |
3353 | tmp_dext = *dext; |
3354 | |
3355 | - mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, |
3356 | + *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, |
3357 | donor_off, count); |
3358 | + if (*err) |
3359 | + goto out; |
3360 | |
3361 | /* Loop for the donor extents */ |
3362 | while (1) { |
3363 | /* The extent for donor must be found. */ |
3364 | - BUG_ON(!dext || donor_off != le32_to_cpu(tmp_dext.ee_block)); |
3365 | + if (!dext) { |
3366 | + ext4_error(donor_inode->i_sb, __func__, |
3367 | + "The extent for donor must be found"); |
3368 | + *err = -EIO; |
3369 | + goto out; |
3370 | + } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) { |
3371 | + ext4_error(donor_inode->i_sb, __func__, |
3372 | + "Donor offset(%u) and the first block of donor " |
3373 | + "extent(%u) should be equal", |
3374 | + donor_off, |
3375 | + le32_to_cpu(tmp_dext.ee_block)); |
3376 | + *err = -EIO; |
3377 | + goto out; |
3378 | + } |
3379 | |
3380 | /* Set donor extent to orig extent */ |
3381 | - err = mext_leaf_block(handle, orig_inode, |
3382 | + *err = mext_leaf_block(handle, orig_inode, |
3383 | orig_path, &tmp_dext, &orig_off); |
3384 | - if (err < 0) |
3385 | + if (*err) |
3386 | goto out; |
3387 | |
3388 | /* Set orig extent to donor extent */ |
3389 | - err = mext_leaf_block(handle, donor_inode, |
3390 | + *err = mext_leaf_block(handle, donor_inode, |
3391 | donor_path, &tmp_oext, &donor_off); |
3392 | - if (err < 0) |
3393 | + if (*err) |
3394 | goto out; |
3395 | |
3396 | dext_alen = ext4_ext_get_actual_len(&tmp_dext); |
3397 | @@ -678,36 +726,26 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, |
3398 | |
3399 | if (orig_path) |
3400 | ext4_ext_drop_refs(orig_path); |
3401 | - get_ext_path(orig_path, orig_inode, orig_off, err); |
3402 | - if (orig_path == NULL) |
3403 | + *err = get_ext_path(orig_inode, orig_off, &orig_path); |
3404 | + if (*err) |
3405 | goto out; |
3406 | depth = ext_depth(orig_inode); |
3407 | oext = orig_path[depth].p_ext; |
3408 | - if (le32_to_cpu(oext->ee_block) + |
3409 | - ext4_ext_get_actual_len(oext) <= orig_off) { |
3410 | - err = 0; |
3411 | - goto out; |
3412 | - } |
3413 | tmp_oext = *oext; |
3414 | |
3415 | if (donor_path) |
3416 | ext4_ext_drop_refs(donor_path); |
3417 | - get_ext_path(donor_path, donor_inode, |
3418 | - donor_off, err); |
3419 | - if (donor_path == NULL) |
3420 | + *err = get_ext_path(donor_inode, donor_off, &donor_path); |
3421 | + if (*err) |
3422 | goto out; |
3423 | depth = ext_depth(donor_inode); |
3424 | dext = donor_path[depth].p_ext; |
3425 | - if (le32_to_cpu(dext->ee_block) + |
3426 | - ext4_ext_get_actual_len(dext) <= donor_off) { |
3427 | - err = 0; |
3428 | - goto out; |
3429 | - } |
3430 | tmp_dext = *dext; |
3431 | |
3432 | - mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, |
3433 | - donor_off, |
3434 | - count - replaced_count); |
3435 | + *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, |
3436 | + donor_off, count - replaced_count); |
3437 | + if (*err) |
3438 | + goto out; |
3439 | } |
3440 | |
3441 | out: |
3442 | @@ -720,8 +758,12 @@ out: |
3443 | kfree(donor_path); |
3444 | } |
3445 | |
3446 | - mext_double_up_write(orig_inode, donor_inode); |
3447 | - return err; |
3448 | + ext4_ext_invalidate_cache(orig_inode); |
3449 | + ext4_ext_invalidate_cache(donor_inode); |
3450 | + |
3451 | + double_up_write_data_sem(orig_inode, donor_inode); |
3452 | + |
3453 | + return replaced_count; |
3454 | } |
3455 | |
3456 | /** |
3457 | @@ -733,16 +775,17 @@ out: |
3458 | * @data_offset_in_page: block index where data swapping starts |
3459 | * @block_len_in_page: the number of blocks to be swapped |
3460 | * @uninit: orig extent is uninitialized or not |
3461 | + * @err: pointer to save return value |
3462 | * |
3463 | * Save the data in original inode blocks and replace original inode extents |
3464 | * with donor inode extents by calling mext_replace_branches(). |
3465 | - * Finally, write out the saved data in new original inode blocks. Return 0 |
3466 | - * on success, or a negative error value on failure. |
3467 | + * Finally, write out the saved data in new original inode blocks. Return |
3468 | + * replaced block count. |
3469 | */ |
3470 | static int |
3471 | -move_extent_par_page(struct file *o_filp, struct inode *donor_inode, |
3472 | +move_extent_per_page(struct file *o_filp, struct inode *donor_inode, |
3473 | pgoff_t orig_page_offset, int data_offset_in_page, |
3474 | - int block_len_in_page, int uninit) |
3475 | + int block_len_in_page, int uninit, int *err) |
3476 | { |
3477 | struct inode *orig_inode = o_filp->f_dentry->d_inode; |
3478 | struct address_space *mapping = orig_inode->i_mapping; |
3479 | @@ -754,9 +797,11 @@ move_extent_par_page(struct file *o_filp, struct inode *donor_inode, |
3480 | long long offs = orig_page_offset << PAGE_CACHE_SHIFT; |
3481 | unsigned long blocksize = orig_inode->i_sb->s_blocksize; |
3482 | unsigned int w_flags = 0; |
3483 | - unsigned int tmp_data_len, data_len; |
3484 | + unsigned int tmp_data_size, data_size, replaced_size; |
3485 | void *fsdata; |
3486 | - int ret, i, jblocks; |
3487 | + int i, jblocks; |
3488 | + int err2 = 0; |
3489 | + int replaced_count = 0; |
3490 | int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; |
3491 | |
3492 | /* |
3493 | @@ -766,8 +811,8 @@ move_extent_par_page(struct file *o_filp, struct inode *donor_inode, |
3494 | jblocks = ext4_writepage_trans_blocks(orig_inode) * 2; |
3495 | handle = ext4_journal_start(orig_inode, jblocks); |
3496 | if (IS_ERR(handle)) { |
3497 | - ret = PTR_ERR(handle); |
3498 | - return ret; |
3499 | + *err = PTR_ERR(handle); |
3500 | + return 0; |
3501 | } |
3502 | |
3503 | if (segment_eq(get_fs(), KERNEL_DS)) |
3504 | @@ -783,39 +828,36 @@ move_extent_par_page(struct file *o_filp, struct inode *donor_inode, |
3505 | * Just swap data blocks between orig and donor. |
3506 | */ |
3507 | if (uninit) { |
3508 | - ret = mext_replace_branches(handle, orig_inode, |
3509 | - donor_inode, orig_blk_offset, |
3510 | - block_len_in_page); |
3511 | - |
3512 | - /* Clear the inode cache not to refer to the old data */ |
3513 | - ext4_ext_invalidate_cache(orig_inode); |
3514 | - ext4_ext_invalidate_cache(donor_inode); |
3515 | + replaced_count = mext_replace_branches(handle, orig_inode, |
3516 | + donor_inode, orig_blk_offset, |
3517 | + block_len_in_page, err); |
3518 | goto out2; |
3519 | } |
3520 | |
3521 | offs = (long long)orig_blk_offset << orig_inode->i_blkbits; |
3522 | |
3523 | - /* Calculate data_len */ |
3524 | + /* Calculate data_size */ |
3525 | if ((orig_blk_offset + block_len_in_page - 1) == |
3526 | ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { |
3527 | /* Replace the last block */ |
3528 | - tmp_data_len = orig_inode->i_size & (blocksize - 1); |
3529 | + tmp_data_size = orig_inode->i_size & (blocksize - 1); |
3530 | /* |
3531 | - * If data_len equal zero, it shows data_len is multiples of |
3532 | + * If data_size equal zero, it shows data_size is multiples of |
3533 | * blocksize. So we set appropriate value. |
3534 | */ |
3535 | - if (tmp_data_len == 0) |
3536 | - tmp_data_len = blocksize; |
3537 | + if (tmp_data_size == 0) |
3538 | + tmp_data_size = blocksize; |
3539 | |
3540 | - data_len = tmp_data_len + |
3541 | + data_size = tmp_data_size + |
3542 | ((block_len_in_page - 1) << orig_inode->i_blkbits); |
3543 | - } else { |
3544 | - data_len = block_len_in_page << orig_inode->i_blkbits; |
3545 | - } |
3546 | + } else |
3547 | + data_size = block_len_in_page << orig_inode->i_blkbits; |
3548 | + |
3549 | + replaced_size = data_size; |
3550 | |
3551 | - ret = a_ops->write_begin(o_filp, mapping, offs, data_len, w_flags, |
3552 | + *err = a_ops->write_begin(o_filp, mapping, offs, data_size, w_flags, |
3553 | &page, &fsdata); |
3554 | - if (unlikely(ret < 0)) |
3555 | + if (unlikely(*err < 0)) |
3556 | goto out; |
3557 | |
3558 | if (!PageUptodate(page)) { |
3559 | @@ -836,14 +878,17 @@ move_extent_par_page(struct file *o_filp, struct inode *donor_inode, |
3560 | /* Release old bh and drop refs */ |
3561 | try_to_release_page(page, 0); |
3562 | |
3563 | - ret = mext_replace_branches(handle, orig_inode, donor_inode, |
3564 | - orig_blk_offset, block_len_in_page); |
3565 | - if (ret < 0) |
3566 | - goto out; |
3567 | - |
3568 | - /* Clear the inode cache not to refer to the old data */ |
3569 | - ext4_ext_invalidate_cache(orig_inode); |
3570 | - ext4_ext_invalidate_cache(donor_inode); |
3571 | + replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, |
3572 | + orig_blk_offset, block_len_in_page, |
3573 | + &err2); |
3574 | + if (err2) { |
3575 | + if (replaced_count) { |
3576 | + block_len_in_page = replaced_count; |
3577 | + replaced_size = |
3578 | + block_len_in_page << orig_inode->i_blkbits; |
3579 | + } else |
3580 | + goto out; |
3581 | + } |
3582 | |
3583 | if (!page_has_buffers(page)) |
3584 | create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0); |
3585 | @@ -853,16 +898,16 @@ move_extent_par_page(struct file *o_filp, struct inode *donor_inode, |
3586 | bh = bh->b_this_page; |
3587 | |
3588 | for (i = 0; i < block_len_in_page; i++) { |
3589 | - ret = ext4_get_block(orig_inode, |
3590 | + *err = ext4_get_block(orig_inode, |
3591 | (sector_t)(orig_blk_offset + i), bh, 0); |
3592 | - if (ret < 0) |
3593 | + if (*err < 0) |
3594 | goto out; |
3595 | |
3596 | if (bh->b_this_page != NULL) |
3597 | bh = bh->b_this_page; |
3598 | } |
3599 | |
3600 | - ret = a_ops->write_end(o_filp, mapping, offs, data_len, data_len, |
3601 | + *err = a_ops->write_end(o_filp, mapping, offs, data_size, replaced_size, |
3602 | page, fsdata); |
3603 | page = NULL; |
3604 | |
3605 | @@ -871,11 +916,15 @@ out: |
3606 | if (PageLocked(page)) |
3607 | unlock_page(page); |
3608 | page_cache_release(page); |
3609 | + ext4_journal_stop(handle); |
3610 | } |
3611 | out2: |
3612 | ext4_journal_stop(handle); |
3613 | |
3614 | - return ret < 0 ? ret : 0; |
3615 | + if (err2) |
3616 | + *err = err2; |
3617 | + |
3618 | + return replaced_count; |
3619 | } |
3620 | |
3621 | /** |
3622 | @@ -886,7 +935,6 @@ out2: |
3623 | * @orig_start: logical start offset in block for orig |
3624 | * @donor_start: logical start offset in block for donor |
3625 | * @len: the number of blocks to be moved |
3626 | - * @moved_len: moved block length |
3627 | * |
3628 | * Check the arguments of ext4_move_extents() whether the files can be |
3629 | * exchanged with each other. |
3630 | @@ -894,9 +942,13 @@ out2: |
3631 | */ |
3632 | static int |
3633 | mext_check_arguments(struct inode *orig_inode, |
3634 | - struct inode *donor_inode, __u64 orig_start, |
3635 | - __u64 donor_start, __u64 *len, __u64 moved_len) |
3636 | + struct inode *donor_inode, __u64 orig_start, |
3637 | + __u64 donor_start, __u64 *len) |
3638 | { |
3639 | + ext4_lblk_t orig_blocks, donor_blocks; |
3640 | + unsigned int blkbits = orig_inode->i_blkbits; |
3641 | + unsigned int blocksize = 1 << blkbits; |
3642 | + |
3643 | /* Regular file check */ |
3644 | if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) { |
3645 | ext4_debug("ext4 move extent: The argument files should be " |
3646 | @@ -905,6 +957,13 @@ mext_check_arguments(struct inode *orig_inode, |
3647 | return -EINVAL; |
3648 | } |
3649 | |
3650 | + if (donor_inode->i_mode & (S_ISUID|S_ISGID)) { |
3651 | + ext4_debug("ext4 move extent: suid or sgid is set" |
3652 | + " to donor file [ino:orig %lu, donor %lu]\n", |
3653 | + orig_inode->i_ino, donor_inode->i_ino); |
3654 | + return -EINVAL; |
3655 | + } |
3656 | + |
3657 | /* Ext4 move extent does not support swapfile */ |
3658 | if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) { |
3659 | ext4_debug("ext4 move extent: The argument files should " |
3660 | @@ -921,14 +980,6 @@ mext_check_arguments(struct inode *orig_inode, |
3661 | return -EINVAL; |
3662 | } |
3663 | |
3664 | - /* orig and donor should be different file */ |
3665 | - if (orig_inode->i_ino == donor_inode->i_ino) { |
3666 | - ext4_debug("ext4 move extent: The argument files should not " |
3667 | - "be same file [ino:orig %lu, donor %lu]\n", |
3668 | - orig_inode->i_ino, donor_inode->i_ino); |
3669 | - return -EINVAL; |
3670 | - } |
3671 | - |
3672 | /* Ext4 move extent supports only extent based file */ |
3673 | if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) { |
3674 | ext4_debug("ext4 move extent: orig file is not extents " |
3675 | @@ -953,13 +1004,6 @@ mext_check_arguments(struct inode *orig_inode, |
3676 | return -EINVAL; |
3677 | } |
3678 | |
3679 | - if (moved_len) { |
3680 | - ext4_debug("ext4 move extent: moved_len should be 0 " |
3681 | - "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino, |
3682 | - donor_inode->i_ino); |
3683 | - return -EINVAL; |
3684 | - } |
3685 | - |
3686 | if ((orig_start > MAX_DEFRAG_SIZE) || |
3687 | (donor_start > MAX_DEFRAG_SIZE) || |
3688 | (*len > MAX_DEFRAG_SIZE) || |
3689 | @@ -971,43 +1015,47 @@ mext_check_arguments(struct inode *orig_inode, |
3690 | } |
3691 | |
3692 | if (orig_inode->i_size > donor_inode->i_size) { |
3693 | - if (orig_start >= donor_inode->i_size) { |
3694 | + donor_blocks = (donor_inode->i_size + blocksize - 1) >> blkbits; |
3695 | + /* TODO: eliminate this artificial restriction */ |
3696 | + if (orig_start >= donor_blocks) { |
3697 | ext4_debug("ext4 move extent: orig start offset " |
3698 | - "[%llu] should be less than donor file size " |
3699 | - "[%lld] [ino:orig %lu, donor_inode %lu]\n", |
3700 | - orig_start, donor_inode->i_size, |
3701 | + "[%llu] should be less than donor file blocks " |
3702 | + "[%u] [ino:orig %lu, donor %lu]\n", |
3703 | + orig_start, donor_blocks, |
3704 | orig_inode->i_ino, donor_inode->i_ino); |
3705 | return -EINVAL; |
3706 | } |
3707 | |
3708 | - if (orig_start + *len > donor_inode->i_size) { |
3709 | + /* TODO: eliminate this artificial restriction */ |
3710 | + if (orig_start + *len > donor_blocks) { |
3711 | ext4_debug("ext4 move extent: End offset [%llu] should " |
3712 | - "be less than donor file size [%lld]." |
3713 | - "So adjust length from %llu to %lld " |
3714 | + "be less than donor file blocks [%u]." |
3715 | + "So adjust length from %llu to %llu " |
3716 | "[ino:orig %lu, donor %lu]\n", |
3717 | - orig_start + *len, donor_inode->i_size, |
3718 | - *len, donor_inode->i_size - orig_start, |
3719 | + orig_start + *len, donor_blocks, |
3720 | + *len, donor_blocks - orig_start, |
3721 | orig_inode->i_ino, donor_inode->i_ino); |
3722 | - *len = donor_inode->i_size - orig_start; |
3723 | + *len = donor_blocks - orig_start; |
3724 | } |
3725 | } else { |
3726 | - if (orig_start >= orig_inode->i_size) { |
3727 | + orig_blocks = (orig_inode->i_size + blocksize - 1) >> blkbits; |
3728 | + if (orig_start >= orig_blocks) { |
3729 | ext4_debug("ext4 move extent: start offset [%llu] " |
3730 | - "should be less than original file size " |
3731 | - "[%lld] [inode:orig %lu, donor %lu]\n", |
3732 | - orig_start, orig_inode->i_size, |
3733 | + "should be less than original file blocks " |
3734 | + "[%u] [ino:orig %lu, donor %lu]\n", |
3735 | + orig_start, orig_blocks, |
3736 | orig_inode->i_ino, donor_inode->i_ino); |
3737 | return -EINVAL; |
3738 | } |
3739 | |
3740 | - if (orig_start + *len > orig_inode->i_size) { |
3741 | + if (orig_start + *len > orig_blocks) { |
3742 | ext4_debug("ext4 move extent: Adjust length " |
3743 | - "from %llu to %lld. Because it should be " |
3744 | - "less than original file size " |
3745 | + "from %llu to %llu. Because it should be " |
3746 | + "less than original file blocks " |
3747 | "[ino:orig %lu, donor %lu]\n", |
3748 | - *len, orig_inode->i_size - orig_start, |
3749 | + *len, orig_blocks - orig_start, |
3750 | orig_inode->i_ino, donor_inode->i_ino); |
3751 | - *len = orig_inode->i_size - orig_start; |
3752 | + *len = orig_blocks - orig_start; |
3753 | } |
3754 | } |
3755 | |
3756 | @@ -1027,18 +1075,23 @@ mext_check_arguments(struct inode *orig_inode, |
3757 | * @inode1: the inode structure |
3758 | * @inode2: the inode structure |
3759 | * |
3760 | - * Lock two inodes' i_mutex by i_ino order. This function is moved from |
3761 | - * fs/inode.c. |
3762 | + * Lock two inodes' i_mutex by i_ino order. |
3763 | + * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. |
3764 | */ |
3765 | -static void |
3766 | +static int |
3767 | mext_inode_double_lock(struct inode *inode1, struct inode *inode2) |
3768 | { |
3769 | - if (inode1 == NULL || inode2 == NULL || inode1 == inode2) { |
3770 | - if (inode1) |
3771 | - mutex_lock(&inode1->i_mutex); |
3772 | - else if (inode2) |
3773 | - mutex_lock(&inode2->i_mutex); |
3774 | - return; |
3775 | + int ret = 0; |
3776 | + |
3777 | + BUG_ON(inode1 == NULL && inode2 == NULL); |
3778 | + |
3779 | + ret = mext_check_null_inode(inode1, inode2, __func__); |
3780 | + if (ret < 0) |
3781 | + goto out; |
3782 | + |
3783 | + if (inode1 == inode2) { |
3784 | + mutex_lock(&inode1->i_mutex); |
3785 | + goto out; |
3786 | } |
3787 | |
3788 | if (inode1->i_ino < inode2->i_ino) { |
3789 | @@ -1048,6 +1101,9 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2) |
3790 | mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT); |
3791 | mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD); |
3792 | } |
3793 | + |
3794 | +out: |
3795 | + return ret; |
3796 | } |
3797 | |
3798 | /** |
3799 | @@ -1056,17 +1112,28 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2) |
3800 | * @inode1: the inode that is released first |
3801 | * @inode2: the inode that is released second |
3802 | * |
3803 | - * This function is moved from fs/inode.c. |
3804 | + * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. |
3805 | */ |
3806 | |
3807 | -static void |
3808 | +static int |
3809 | mext_inode_double_unlock(struct inode *inode1, struct inode *inode2) |
3810 | { |
3811 | + int ret = 0; |
3812 | + |
3813 | + BUG_ON(inode1 == NULL && inode2 == NULL); |
3814 | + |
3815 | + ret = mext_check_null_inode(inode1, inode2, __func__); |
3816 | + if (ret < 0) |
3817 | + goto out; |
3818 | + |
3819 | if (inode1) |
3820 | mutex_unlock(&inode1->i_mutex); |
3821 | |
3822 | if (inode2 && inode2 != inode1) |
3823 | mutex_unlock(&inode2->i_mutex); |
3824 | + |
3825 | +out: |
3826 | + return ret; |
3827 | } |
3828 | |
3829 | /** |
3830 | @@ -1123,70 +1190,84 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, |
3831 | ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0; |
3832 | ext4_lblk_t rest_blocks; |
3833 | pgoff_t orig_page_offset = 0, seq_end_page; |
3834 | - int ret, depth, last_extent = 0; |
3835 | + int ret1, ret2, depth, last_extent = 0; |
3836 | int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; |
3837 | int data_offset_in_page; |
3838 | int block_len_in_page; |
3839 | int uninit; |
3840 | |
3841 | - /* protect orig and donor against a truncate */ |
3842 | - mext_inode_double_lock(orig_inode, donor_inode); |
3843 | + /* orig and donor should be different file */ |
3844 | + if (orig_inode->i_ino == donor_inode->i_ino) { |
3845 | + ext4_debug("ext4 move extent: The argument files should not " |
3846 | + "be same file [ino:orig %lu, donor %lu]\n", |
3847 | + orig_inode->i_ino, donor_inode->i_ino); |
3848 | + return -EINVAL; |
3849 | + } |
3850 | + |
3851 | + /* Protect orig and donor inodes against a truncate */ |
3852 | + ret1 = mext_inode_double_lock(orig_inode, donor_inode); |
3853 | + if (ret1 < 0) |
3854 | + return ret1; |
3855 | |
3856 | - mext_double_down_read(orig_inode, donor_inode); |
3857 | + /* Protect extent tree against block allocations via delalloc */ |
3858 | + double_down_write_data_sem(orig_inode, donor_inode); |
3859 | /* Check the filesystem environment whether move_extent can be done */ |
3860 | - ret = mext_check_arguments(orig_inode, donor_inode, orig_start, |
3861 | - donor_start, &len, *moved_len); |
3862 | - mext_double_up_read(orig_inode, donor_inode); |
3863 | - if (ret) |
3864 | - goto out2; |
3865 | + ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start, |
3866 | + donor_start, &len); |
3867 | + if (ret1) |
3868 | + goto out; |
3869 | |
3870 | file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits; |
3871 | block_end = block_start + len - 1; |
3872 | if (file_end < block_end) |
3873 | len -= block_end - file_end; |
3874 | |
3875 | - get_ext_path(orig_path, orig_inode, block_start, ret); |
3876 | - if (orig_path == NULL) |
3877 | - goto out2; |
3878 | + ret1 = get_ext_path(orig_inode, block_start, &orig_path); |
3879 | + if (ret1) |
3880 | + goto out; |
3881 | |
3882 | /* Get path structure to check the hole */ |
3883 | - get_ext_path(holecheck_path, orig_inode, block_start, ret); |
3884 | - if (holecheck_path == NULL) |
3885 | + ret1 = get_ext_path(orig_inode, block_start, &holecheck_path); |
3886 | + if (ret1) |
3887 | goto out; |
3888 | |
3889 | depth = ext_depth(orig_inode); |
3890 | ext_cur = holecheck_path[depth].p_ext; |
3891 | - if (ext_cur == NULL) { |
3892 | - ret = -EINVAL; |
3893 | - goto out; |
3894 | - } |
3895 | |
3896 | /* |
3897 | - * Get proper extent whose ee_block is beyond block_start |
3898 | - * if block_start was within the hole. |
3899 | + * Get proper starting location of block replacement if block_start was |
3900 | + * within the hole. |
3901 | */ |
3902 | if (le32_to_cpu(ext_cur->ee_block) + |
3903 | ext4_ext_get_actual_len(ext_cur) - 1 < block_start) { |
3904 | + /* |
3905 | + * The hole exists between extents or the tail of |
3906 | + * original file. |
3907 | + */ |
3908 | last_extent = mext_next_extent(orig_inode, |
3909 | holecheck_path, &ext_cur); |
3910 | if (last_extent < 0) { |
3911 | - ret = last_extent; |
3912 | + ret1 = last_extent; |
3913 | goto out; |
3914 | } |
3915 | last_extent = mext_next_extent(orig_inode, orig_path, |
3916 | &ext_dummy); |
3917 | if (last_extent < 0) { |
3918 | - ret = last_extent; |
3919 | + ret1 = last_extent; |
3920 | goto out; |
3921 | } |
3922 | - } |
3923 | - seq_start = block_start; |
3924 | + seq_start = le32_to_cpu(ext_cur->ee_block); |
3925 | + } else if (le32_to_cpu(ext_cur->ee_block) > block_start) |
3926 | + /* The hole exists at the beginning of original file. */ |
3927 | + seq_start = le32_to_cpu(ext_cur->ee_block); |
3928 | + else |
3929 | + seq_start = block_start; |
3930 | |
3931 | /* No blocks within the specified range. */ |
3932 | if (le32_to_cpu(ext_cur->ee_block) > block_end) { |
3933 | ext4_debug("ext4 move extent: The specified range of file " |
3934 | "may be the hole\n"); |
3935 | - ret = -EINVAL; |
3936 | + ret1 = -EINVAL; |
3937 | goto out; |
3938 | } |
3939 | |
3940 | @@ -1206,7 +1287,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, |
3941 | last_extent = mext_next_extent(orig_inode, holecheck_path, |
3942 | &ext_cur); |
3943 | if (last_extent < 0) { |
3944 | - ret = last_extent; |
3945 | + ret1 = last_extent; |
3946 | break; |
3947 | } |
3948 | add_blocks = ext4_ext_get_actual_len(ext_cur); |
3949 | @@ -1246,29 +1327,39 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, |
3950 | seq_start = le32_to_cpu(ext_cur->ee_block); |
3951 | rest_blocks = seq_blocks; |
3952 | |
3953 | - /* Discard preallocations of two inodes */ |
3954 | - down_write(&EXT4_I(orig_inode)->i_data_sem); |
3955 | - ext4_discard_preallocations(orig_inode); |
3956 | - up_write(&EXT4_I(orig_inode)->i_data_sem); |
3957 | - |
3958 | - down_write(&EXT4_I(donor_inode)->i_data_sem); |
3959 | - ext4_discard_preallocations(donor_inode); |
3960 | - up_write(&EXT4_I(donor_inode)->i_data_sem); |
3961 | + /* |
3962 | + * Up semaphore to avoid following problems: |
3963 | + * a. transaction deadlock among ext4_journal_start, |
3964 | + * ->write_begin via pagefault, and jbd2_journal_commit |
3965 | + * b. racing with ->readpage, ->write_begin, and ext4_get_block |
3966 | + * in move_extent_per_page |
3967 | + */ |
3968 | + double_up_write_data_sem(orig_inode, donor_inode); |
3969 | |
3970 | while (orig_page_offset <= seq_end_page) { |
3971 | |
3972 | /* Swap original branches with new branches */ |
3973 | - ret = move_extent_par_page(o_filp, donor_inode, |
3974 | + block_len_in_page = move_extent_per_page( |
3975 | + o_filp, donor_inode, |
3976 | orig_page_offset, |
3977 | data_offset_in_page, |
3978 | - block_len_in_page, uninit); |
3979 | - if (ret < 0) |
3980 | - goto out; |
3981 | - orig_page_offset++; |
3982 | + block_len_in_page, uninit, |
3983 | + &ret1); |
3984 | + |
3985 | /* Count how many blocks we have exchanged */ |
3986 | *moved_len += block_len_in_page; |
3987 | - BUG_ON(*moved_len > len); |
3988 | + if (ret1 < 0) |
3989 | + break; |
3990 | + if (*moved_len > len) { |
3991 | + ext4_error(orig_inode->i_sb, __func__, |
3992 | + "We replaced blocks too much! " |
3993 | + "sum of replaced: %llu requested: %llu", |
3994 | + *moved_len, len); |
3995 | + ret1 = -EIO; |
3996 | + break; |
3997 | + } |
3998 | |
3999 | + orig_page_offset++; |
4000 | data_offset_in_page = 0; |
4001 | rest_blocks -= block_len_in_page; |
4002 | if (rest_blocks > blocks_per_page) |
4003 | @@ -1277,20 +1368,23 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, |
4004 | block_len_in_page = rest_blocks; |
4005 | } |
4006 | |
4007 | + double_down_write_data_sem(orig_inode, donor_inode); |
4008 | + if (ret1 < 0) |
4009 | + break; |
4010 | + |
4011 | /* Decrease buffer counter */ |
4012 | if (holecheck_path) |
4013 | ext4_ext_drop_refs(holecheck_path); |
4014 | - get_ext_path(holecheck_path, orig_inode, |
4015 | - seq_start, ret); |
4016 | - if (holecheck_path == NULL) |
4017 | + ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path); |
4018 | + if (ret1) |
4019 | break; |
4020 | depth = holecheck_path->p_depth; |
4021 | |
4022 | /* Decrease buffer counter */ |
4023 | if (orig_path) |
4024 | ext4_ext_drop_refs(orig_path); |
4025 | - get_ext_path(orig_path, orig_inode, seq_start, ret); |
4026 | - if (orig_path == NULL) |
4027 | + ret1 = get_ext_path(orig_inode, seq_start, &orig_path); |
4028 | + if (ret1) |
4029 | break; |
4030 | |
4031 | ext_cur = holecheck_path[depth].p_ext; |
4032 | @@ -1299,6 +1393,11 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, |
4033 | |
4034 | } |
4035 | out: |
4036 | + if (*moved_len) { |
4037 | + ext4_discard_preallocations(orig_inode); |
4038 | + ext4_discard_preallocations(donor_inode); |
4039 | + } |
4040 | + |
4041 | if (orig_path) { |
4042 | ext4_ext_drop_refs(orig_path); |
4043 | kfree(orig_path); |
4044 | @@ -1307,14 +1406,13 @@ out: |
4045 | ext4_ext_drop_refs(holecheck_path); |
4046 | kfree(holecheck_path); |
4047 | } |
4048 | -out2: |
4049 | - mext_inode_double_unlock(orig_inode, donor_inode); |
4050 | - |
4051 | - if (ret) |
4052 | - return ret; |
4053 | + double_up_write_data_sem(orig_inode, donor_inode); |
4054 | + ret2 = mext_inode_double_unlock(orig_inode, donor_inode); |
4055 | |
4056 | - /* All of the specified blocks must be exchanged in succeed */ |
4057 | - BUG_ON(*moved_len != len); |
4058 | + if (ret1) |
4059 | + return ret1; |
4060 | + else if (ret2) |
4061 | + return ret2; |
4062 | |
4063 | return 0; |
4064 | } |
4065 | diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c |
4066 | index de04013..9dcd686 100644 |
4067 | --- a/fs/ext4/namei.c |
4068 | +++ b/fs/ext4/namei.c |
4069 | @@ -1292,9 +1292,6 @@ errout: |
4070 | * add_dirent_to_buf will attempt search the directory block for |
4071 | * space. It will return -ENOSPC if no space is available, and -EIO |
4072 | * and -EEXIST if directory entry already exists. |
4073 | - * |
4074 | - * NOTE! bh is NOT released in the case where ENOSPC is returned. In |
4075 | - * all other cases bh is released. |
4076 | */ |
4077 | static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, |
4078 | struct inode *inode, struct ext4_dir_entry_2 *de, |
4079 | @@ -1315,14 +1312,10 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, |
4080 | top = bh->b_data + blocksize - reclen; |
4081 | while ((char *) de <= top) { |
4082 | if (!ext4_check_dir_entry("ext4_add_entry", dir, de, |
4083 | - bh, offset)) { |
4084 | - brelse(bh); |
4085 | + bh, offset)) |
4086 | return -EIO; |
4087 | - } |
4088 | - if (ext4_match(namelen, name, de)) { |
4089 | - brelse(bh); |
4090 | + if (ext4_match(namelen, name, de)) |
4091 | return -EEXIST; |
4092 | - } |
4093 | nlen = EXT4_DIR_REC_LEN(de->name_len); |
4094 | rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); |
4095 | if ((de->inode? rlen - nlen: rlen) >= reclen) |
4096 | @@ -1337,7 +1330,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, |
4097 | err = ext4_journal_get_write_access(handle, bh); |
4098 | if (err) { |
4099 | ext4_std_error(dir->i_sb, err); |
4100 | - brelse(bh); |
4101 | return err; |
4102 | } |
4103 | |
4104 | @@ -1377,7 +1369,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, |
4105 | err = ext4_handle_dirty_metadata(handle, dir, bh); |
4106 | if (err) |
4107 | ext4_std_error(dir->i_sb, err); |
4108 | - brelse(bh); |
4109 | return 0; |
4110 | } |
4111 | |
4112 | @@ -1471,7 +1462,9 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, |
4113 | if (!(de)) |
4114 | return retval; |
4115 | |
4116 | - return add_dirent_to_buf(handle, dentry, inode, de, bh); |
4117 | + retval = add_dirent_to_buf(handle, dentry, inode, de, bh); |
4118 | + brelse(bh); |
4119 | + return retval; |
4120 | } |
4121 | |
4122 | /* |
4123 | @@ -1514,8 +1507,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, |
4124 | if(!bh) |
4125 | return retval; |
4126 | retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); |
4127 | - if (retval != -ENOSPC) |
4128 | + if (retval != -ENOSPC) { |
4129 | + brelse(bh); |
4130 | return retval; |
4131 | + } |
4132 | |
4133 | if (blocks == 1 && !dx_fallback && |
4134 | EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) |
4135 | @@ -1528,7 +1523,9 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, |
4136 | de = (struct ext4_dir_entry_2 *) bh->b_data; |
4137 | de->inode = 0; |
4138 | de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); |
4139 | - return add_dirent_to_buf(handle, dentry, inode, de, bh); |
4140 | + retval = add_dirent_to_buf(handle, dentry, inode, de, bh); |
4141 | + brelse(bh); |
4142 | + return retval; |
4143 | } |
4144 | |
4145 | /* |
4146 | @@ -1561,10 +1558,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, |
4147 | goto journal_error; |
4148 | |
4149 | err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); |
4150 | - if (err != -ENOSPC) { |
4151 | - bh = NULL; |
4152 | + if (err != -ENOSPC) |
4153 | goto cleanup; |
4154 | - } |
4155 | |
4156 | /* Block full, should compress but for now just split */ |
4157 | dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", |
4158 | @@ -1590,9 +1585,9 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, |
4159 | goto cleanup; |
4160 | node2 = (struct dx_node *)(bh2->b_data); |
4161 | entries2 = node2->entries; |
4162 | + memset(&node2->fake, 0, sizeof(struct fake_dirent)); |
4163 | node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize, |
4164 | sb->s_blocksize); |
4165 | - node2->fake.inode = 0; |
4166 | BUFFER_TRACE(frame->bh, "get_write_access"); |
4167 | err = ext4_journal_get_write_access(handle, frame->bh); |
4168 | if (err) |
4169 | @@ -1657,7 +1652,6 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, |
4170 | if (!de) |
4171 | goto cleanup; |
4172 | err = add_dirent_to_buf(handle, dentry, inode, de, bh); |
4173 | - bh = NULL; |
4174 | goto cleanup; |
4175 | |
4176 | journal_error: |
4177 | @@ -1775,7 +1769,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, int mode, |
4178 | retry: |
4179 | handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + |
4180 | EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + |
4181 | - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); |
4182 | + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); |
4183 | if (IS_ERR(handle)) |
4184 | return PTR_ERR(handle); |
4185 | |
4186 | @@ -1809,7 +1803,7 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry, |
4187 | retry: |
4188 | handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + |
4189 | EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + |
4190 | - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); |
4191 | + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); |
4192 | if (IS_ERR(handle)) |
4193 | return PTR_ERR(handle); |
4194 | |
4195 | @@ -1846,7 +1840,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode) |
4196 | retry: |
4197 | handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + |
4198 | EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + |
4199 | - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); |
4200 | + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); |
4201 | if (IS_ERR(handle)) |
4202 | return PTR_ERR(handle); |
4203 | |
4204 | @@ -2068,7 +2062,8 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) |
4205 | struct ext4_iloc iloc; |
4206 | int err = 0; |
4207 | |
4208 | - if (!ext4_handle_valid(handle)) |
4209 | + /* ext4_handle_valid() assumes a valid handle_t pointer */ |
4210 | + if (handle && !ext4_handle_valid(handle)) |
4211 | return 0; |
4212 | |
4213 | mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock); |
4214 | @@ -2258,7 +2253,7 @@ static int ext4_symlink(struct inode *dir, |
4215 | retry: |
4216 | handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + |
4217 | EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 + |
4218 | - 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); |
4219 | + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); |
4220 | if (IS_ERR(handle)) |
4221 | return PTR_ERR(handle); |
4222 | |
4223 | @@ -2310,7 +2305,7 @@ static int ext4_link(struct dentry *old_dentry, |
4224 | struct inode *inode = old_dentry->d_inode; |
4225 | int err, retries = 0; |
4226 | |
4227 | - if (EXT4_DIR_LINK_MAX(inode)) |
4228 | + if (inode->i_nlink >= EXT4_LINK_MAX) |
4229 | return -EMLINK; |
4230 | |
4231 | /* |
4232 | @@ -2413,7 +2408,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, |
4233 | goto end_rename; |
4234 | retval = -EMLINK; |
4235 | if (!new_inode && new_dir != old_dir && |
4236 | - new_dir->i_nlink >= EXT4_LINK_MAX) |
4237 | + EXT4_DIR_LINK_MAX(new_dir)) |
4238 | goto end_rename; |
4239 | } |
4240 | if (!new_bh) { |
4241 | diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c |
4242 | index 68b0351..96302cd 100644 |
4243 | --- a/fs/ext4/resize.c |
4244 | +++ b/fs/ext4/resize.c |
4245 | @@ -247,7 +247,7 @@ static int setup_new_group_blocks(struct super_block *sb, |
4246 | goto exit_bh; |
4247 | |
4248 | if (IS_ERR(gdb = bclean(handle, sb, block))) { |
4249 | - err = PTR_ERR(bh); |
4250 | + err = PTR_ERR(gdb); |
4251 | goto exit_bh; |
4252 | } |
4253 | ext4_handle_dirty_metadata(handle, NULL, gdb); |
4254 | diff --git a/fs/ext4/super.c b/fs/ext4/super.c |
4255 | index 8f4f079..ed38f25 100644 |
4256 | --- a/fs/ext4/super.c |
4257 | +++ b/fs/ext4/super.c |
4258 | @@ -45,6 +45,7 @@ |
4259 | #include "ext4_jbd2.h" |
4260 | #include "xattr.h" |
4261 | #include "acl.h" |
4262 | +#include "mballoc.h" |
4263 | |
4264 | #define CREATE_TRACE_POINTS |
4265 | #include <trace/events/ext4.h> |
4266 | @@ -188,6 +189,36 @@ void ext4_itable_unused_set(struct super_block *sb, |
4267 | bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); |
4268 | } |
4269 | |
4270 | + |
4271 | +/* Just increment the non-pointer handle value */ |
4272 | +static handle_t *ext4_get_nojournal(void) |
4273 | +{ |
4274 | + handle_t *handle = current->journal_info; |
4275 | + unsigned long ref_cnt = (unsigned long)handle; |
4276 | + |
4277 | + BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT); |
4278 | + |
4279 | + ref_cnt++; |
4280 | + handle = (handle_t *)ref_cnt; |
4281 | + |
4282 | + current->journal_info = handle; |
4283 | + return handle; |
4284 | +} |
4285 | + |
4286 | + |
4287 | +/* Decrement the non-pointer handle value */ |
4288 | +static void ext4_put_nojournal(handle_t *handle) |
4289 | +{ |
4290 | + unsigned long ref_cnt = (unsigned long)handle; |
4291 | + |
4292 | + BUG_ON(ref_cnt == 0); |
4293 | + |
4294 | + ref_cnt--; |
4295 | + handle = (handle_t *)ref_cnt; |
4296 | + |
4297 | + current->journal_info = handle; |
4298 | +} |
4299 | + |
4300 | /* |
4301 | * Wrappers for jbd2_journal_start/end. |
4302 | * |
4303 | @@ -214,11 +245,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) |
4304 | } |
4305 | return jbd2_journal_start(journal, nblocks); |
4306 | } |
4307 | - /* |
4308 | - * We're not journaling, return the appropriate indication. |
4309 | - */ |
4310 | - current->journal_info = EXT4_NOJOURNAL_HANDLE; |
4311 | - return current->journal_info; |
4312 | + return ext4_get_nojournal(); |
4313 | } |
4314 | |
4315 | /* |
4316 | @@ -234,11 +261,7 @@ int __ext4_journal_stop(const char *where, handle_t *handle) |
4317 | int rc; |
4318 | |
4319 | if (!ext4_handle_valid(handle)) { |
4320 | - /* |
4321 | - * Do this here since we don't call jbd2_journal_stop() in |
4322 | - * no-journal mode. |
4323 | - */ |
4324 | - current->journal_info = NULL; |
4325 | + ext4_put_nojournal(handle); |
4326 | return 0; |
4327 | } |
4328 | sb = handle->h_transaction->t_journal->j_private; |
4329 | @@ -344,7 +367,8 @@ static const char *ext4_decode_error(struct super_block *sb, int errno, |
4330 | errstr = "Out of memory"; |
4331 | break; |
4332 | case -EROFS: |
4333 | - if (!sb || EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT) |
4334 | + if (!sb || (EXT4_SB(sb)->s_journal && |
4335 | + EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT)) |
4336 | errstr = "Journal has aborted"; |
4337 | else |
4338 | errstr = "Readonly filesystem"; |
4339 | @@ -578,15 +602,14 @@ static void ext4_put_super(struct super_block *sb) |
4340 | struct ext4_super_block *es = sbi->s_es; |
4341 | int i, err; |
4342 | |
4343 | + flush_workqueue(sbi->dio_unwritten_wq); |
4344 | + destroy_workqueue(sbi->dio_unwritten_wq); |
4345 | + |
4346 | lock_super(sb); |
4347 | lock_kernel(); |
4348 | if (sb->s_dirt) |
4349 | ext4_commit_super(sb, 1); |
4350 | |
4351 | - ext4_release_system_zone(sb); |
4352 | - ext4_mb_release(sb); |
4353 | - ext4_ext_release(sb); |
4354 | - ext4_xattr_put_super(sb); |
4355 | if (sbi->s_journal) { |
4356 | err = jbd2_journal_destroy(sbi->s_journal); |
4357 | sbi->s_journal = NULL; |
4358 | @@ -594,6 +617,12 @@ static void ext4_put_super(struct super_block *sb) |
4359 | ext4_abort(sb, __func__, |
4360 | "Couldn't clean up the journal"); |
4361 | } |
4362 | + |
4363 | + ext4_release_system_zone(sb); |
4364 | + ext4_mb_release(sb); |
4365 | + ext4_ext_release(sb); |
4366 | + ext4_xattr_put_super(sb); |
4367 | + |
4368 | if (!(sb->s_flags & MS_RDONLY)) { |
4369 | EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); |
4370 | es->s_state = cpu_to_le16(sbi->s_mount_state); |
4371 | @@ -682,6 +711,10 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) |
4372 | ei->i_allocated_meta_blocks = 0; |
4373 | ei->i_delalloc_reserved_flag = 0; |
4374 | spin_lock_init(&(ei->i_block_reservation_lock)); |
4375 | + INIT_LIST_HEAD(&ei->i_aio_dio_complete_list); |
4376 | + ei->cur_aio_dio = NULL; |
4377 | + ei->i_sync_tid = 0; |
4378 | + ei->i_datasync_tid = 0; |
4379 | |
4380 | return &ei->vfs_inode; |
4381 | } |
4382 | @@ -877,6 +910,12 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) |
4383 | if (test_opt(sb, NO_AUTO_DA_ALLOC)) |
4384 | seq_puts(seq, ",noauto_da_alloc"); |
4385 | |
4386 | + if (test_opt(sb, DISCARD)) |
4387 | + seq_puts(seq, ",discard"); |
4388 | + |
4389 | + if (test_opt(sb, NOLOAD)) |
4390 | + seq_puts(seq, ",norecovery"); |
4391 | + |
4392 | ext4_show_quota_options(seq, sb); |
4393 | |
4394 | return 0; |
4395 | @@ -1057,7 +1096,8 @@ enum { |
4396 | Opt_usrquota, Opt_grpquota, Opt_i_version, |
4397 | Opt_stripe, Opt_delalloc, Opt_nodelalloc, |
4398 | Opt_block_validity, Opt_noblock_validity, |
4399 | - Opt_inode_readahead_blks, Opt_journal_ioprio |
4400 | + Opt_inode_readahead_blks, Opt_journal_ioprio, |
4401 | + Opt_discard, Opt_nodiscard, |
4402 | }; |
4403 | |
4404 | static const match_table_t tokens = { |
4405 | @@ -1082,6 +1122,7 @@ static const match_table_t tokens = { |
4406 | {Opt_acl, "acl"}, |
4407 | {Opt_noacl, "noacl"}, |
4408 | {Opt_noload, "noload"}, |
4409 | + {Opt_noload, "norecovery"}, |
4410 | {Opt_nobh, "nobh"}, |
4411 | {Opt_bh, "bh"}, |
4412 | {Opt_commit, "commit=%u"}, |
4413 | @@ -1123,6 +1164,8 @@ static const match_table_t tokens = { |
4414 | {Opt_auto_da_alloc, "auto_da_alloc=%u"}, |
4415 | {Opt_auto_da_alloc, "auto_da_alloc"}, |
4416 | {Opt_noauto_da_alloc, "noauto_da_alloc"}, |
4417 | + {Opt_discard, "discard"}, |
4418 | + {Opt_nodiscard, "nodiscard"}, |
4419 | {Opt_err, NULL}, |
4420 | }; |
4421 | |
4422 | @@ -1551,6 +1594,12 @@ set_qf_format: |
4423 | else |
4424 | set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); |
4425 | break; |
4426 | + case Opt_discard: |
4427 | + set_opt(sbi->s_mount_opt, DISCARD); |
4428 | + break; |
4429 | + case Opt_nodiscard: |
4430 | + clear_opt(sbi->s_mount_opt, DISCARD); |
4431 | + break; |
4432 | default: |
4433 | ext4_msg(sb, KERN_ERR, |
4434 | "Unrecognized mount option \"%s\" " |
4435 | @@ -1666,14 +1715,14 @@ static int ext4_fill_flex_info(struct super_block *sb) |
4436 | size_t size; |
4437 | int i; |
4438 | |
4439 | - if (!sbi->s_es->s_log_groups_per_flex) { |
4440 | + sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; |
4441 | + groups_per_flex = 1 << sbi->s_log_groups_per_flex; |
4442 | + |
4443 | + if (groups_per_flex < 2) { |
4444 | sbi->s_log_groups_per_flex = 0; |
4445 | return 1; |
4446 | } |
4447 | |
4448 | - sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; |
4449 | - groups_per_flex = 1 << sbi->s_log_groups_per_flex; |
4450 | - |
4451 | /* We allocate both existing and potentially added groups */ |
4452 | flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + |
4453 | ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) << |
4454 | @@ -1695,12 +1744,12 @@ static int ext4_fill_flex_info(struct super_block *sb) |
4455 | gdp = ext4_get_group_desc(sb, i, NULL); |
4456 | |
4457 | flex_group = ext4_flex_group(sbi, i); |
4458 | - atomic_set(&sbi->s_flex_groups[flex_group].free_inodes, |
4459 | - ext4_free_inodes_count(sb, gdp)); |
4460 | - atomic_set(&sbi->s_flex_groups[flex_group].free_blocks, |
4461 | - ext4_free_blks_count(sb, gdp)); |
4462 | - atomic_set(&sbi->s_flex_groups[flex_group].used_dirs, |
4463 | - ext4_used_dirs_count(sb, gdp)); |
4464 | + atomic_add(ext4_free_inodes_count(sb, gdp), |
4465 | + &sbi->s_flex_groups[flex_group].free_inodes); |
4466 | + atomic_add(ext4_free_blks_count(sb, gdp), |
4467 | + &sbi->s_flex_groups[flex_group].free_blocks); |
4468 | + atomic_add(ext4_used_dirs_count(sb, gdp), |
4469 | + &sbi->s_flex_groups[flex_group].used_dirs); |
4470 | } |
4471 | |
4472 | return 1; |
4473 | @@ -2197,6 +2246,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); |
4474 | EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); |
4475 | EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); |
4476 | EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); |
4477 | +EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump); |
4478 | |
4479 | static struct attribute *ext4_attrs[] = { |
4480 | ATTR_LIST(delayed_allocation_blocks), |
4481 | @@ -2210,6 +2260,7 @@ static struct attribute *ext4_attrs[] = { |
4482 | ATTR_LIST(mb_order2_req), |
4483 | ATTR_LIST(mb_stream_req), |
4484 | ATTR_LIST(mb_group_prealloc), |
4485 | + ATTR_LIST(max_writeback_mb_bump), |
4486 | NULL, |
4487 | }; |
4488 | |
4489 | @@ -2253,6 +2304,49 @@ static struct kobj_type ext4_ktype = { |
4490 | .release = ext4_sb_release, |
4491 | }; |
4492 | |
4493 | +/* |
4494 | + * Check whether this filesystem can be mounted based on |
4495 | + * the features present and the RDONLY/RDWR mount requested. |
4496 | + * Returns 1 if this filesystem can be mounted as requested, |
4497 | + * 0 if it cannot be. |
4498 | + */ |
4499 | +static int ext4_feature_set_ok(struct super_block *sb, int readonly) |
4500 | +{ |
4501 | + if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) { |
4502 | + ext4_msg(sb, KERN_ERR, |
4503 | + "Couldn't mount because of " |
4504 | + "unsupported optional features (%x)", |
4505 | + (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) & |
4506 | + ~EXT4_FEATURE_INCOMPAT_SUPP)); |
4507 | + return 0; |
4508 | + } |
4509 | + |
4510 | + if (readonly) |
4511 | + return 1; |
4512 | + |
4513 | + /* Check that feature set is OK for a read-write mount */ |
4514 | + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) { |
4515 | + ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of " |
4516 | + "unsupported optional features (%x)", |
4517 | + (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & |
4518 | + ~EXT4_FEATURE_RO_COMPAT_SUPP)); |
4519 | + return 0; |
4520 | + } |
4521 | + /* |
4522 | + * Large file size enabled file system can only be mounted |
4523 | + * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF |
4524 | + */ |
4525 | + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { |
4526 | + if (sizeof(blkcnt_t) < sizeof(u64)) { |
4527 | + ext4_msg(sb, KERN_ERR, "Filesystem with huge files " |
4528 | + "cannot be mounted RDWR without " |
4529 | + "CONFIG_LBDAF"); |
4530 | + return 0; |
4531 | + } |
4532 | + } |
4533 | + return 1; |
4534 | +} |
4535 | + |
4536 | static int ext4_fill_super(struct super_block *sb, void *data, int silent) |
4537 | __releases(kernel_lock) |
4538 | __acquires(kernel_lock) |
4539 | @@ -2274,7 +2368,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) |
4540 | unsigned int db_count; |
4541 | unsigned int i; |
4542 | int needs_recovery, has_huge_files; |
4543 | - int features; |
4544 | __u64 blocks_count; |
4545 | int err; |
4546 | unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; |
4547 | @@ -2401,39 +2494,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) |
4548 | * previously didn't change the revision level when setting the flags, |
4549 | * so there is a chance incompat flags are set on a rev 0 filesystem. |
4550 | */ |
4551 | - features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP); |
4552 | - if (features) { |
4553 | - ext4_msg(sb, KERN_ERR, |
4554 | - "Couldn't mount because of " |
4555 | - "unsupported optional features (%x)", |
4556 | - (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) & |
4557 | - ~EXT4_FEATURE_INCOMPAT_SUPP)); |
4558 | - goto failed_mount; |
4559 | - } |
4560 | - features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP); |
4561 | - if (!(sb->s_flags & MS_RDONLY) && features) { |
4562 | - ext4_msg(sb, KERN_ERR, |
4563 | - "Couldn't mount RDWR because of " |
4564 | - "unsupported optional features (%x)", |
4565 | - (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & |
4566 | - ~EXT4_FEATURE_RO_COMPAT_SUPP)); |
4567 | + if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY))) |
4568 | goto failed_mount; |
4569 | - } |
4570 | - has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb, |
4571 | - EXT4_FEATURE_RO_COMPAT_HUGE_FILE); |
4572 | - if (has_huge_files) { |
4573 | - /* |
4574 | - * Large file size enabled file system can only be |
4575 | - * mount if kernel is build with CONFIG_LBDAF |
4576 | - */ |
4577 | - if (sizeof(root->i_blocks) < sizeof(u64) && |
4578 | - !(sb->s_flags & MS_RDONLY)) { |
4579 | - ext4_msg(sb, KERN_ERR, "Filesystem with huge " |
4580 | - "files cannot be mounted read-write " |
4581 | - "without CONFIG_LBDAF"); |
4582 | - goto failed_mount; |
4583 | - } |
4584 | - } |
4585 | + |
4586 | blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); |
4587 | |
4588 | if (blocksize < EXT4_MIN_BLOCK_SIZE || |
4589 | @@ -2469,6 +2532,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) |
4590 | } |
4591 | } |
4592 | |
4593 | + has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb, |
4594 | + EXT4_FEATURE_RO_COMPAT_HUGE_FILE); |
4595 | sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, |
4596 | has_huge_files); |
4597 | sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); |
4598 | @@ -2549,12 +2614,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) |
4599 | goto failed_mount; |
4600 | } |
4601 | |
4602 | - if (ext4_blocks_count(es) > |
4603 | - (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { |
4604 | + /* |
4605 | + * Test whether we have more sectors than will fit in sector_t, |
4606 | + * and whether the max offset is addressable by the page cache. |
4607 | + */ |
4608 | + if ((ext4_blocks_count(es) > |
4609 | + (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) || |
4610 | + (ext4_blocks_count(es) > |
4611 | + (pgoff_t)(~0ULL) >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits))) { |
4612 | ext4_msg(sb, KERN_ERR, "filesystem" |
4613 | - " too large to mount safely"); |
4614 | + " too large to mount safely on this system"); |
4615 | if (sizeof(sector_t) < 8) |
4616 | ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); |
4617 | + ret = -EFBIG; |
4618 | goto failed_mount; |
4619 | } |
4620 | |
4621 | @@ -2595,6 +2667,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) |
4622 | goto failed_mount; |
4623 | } |
4624 | sbi->s_groups_count = blocks_count; |
4625 | + sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, |
4626 | + (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); |
4627 | db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / |
4628 | EXT4_DESC_PER_BLOCK(sb); |
4629 | sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *), |
4630 | @@ -2656,6 +2730,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) |
4631 | } |
4632 | |
4633 | sbi->s_stripe = ext4_get_stripe_size(sbi); |
4634 | + sbi->s_max_writeback_mb_bump = 128; |
4635 | |
4636 | /* |
4637 | * set up enough so that it can read an inode |
4638 | @@ -2781,6 +2856,12 @@ no_journal: |
4639 | clear_opt(sbi->s_mount_opt, NOBH); |
4640 | } |
4641 | } |
4642 | + EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten"); |
4643 | + if (!EXT4_SB(sb)->dio_unwritten_wq) { |
4644 | + printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); |
4645 | + goto failed_mount_wq; |
4646 | + } |
4647 | + |
4648 | /* |
4649 | * The jbd2_journal_load will have done any necessary log recovery, |
4650 | * so we can safely mount the rest of the filesystem now. |
4651 | @@ -2893,6 +2974,8 @@ cantfind_ext4: |
4652 | |
4653 | failed_mount4: |
4654 | ext4_msg(sb, KERN_ERR, "mount failed"); |
4655 | + destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); |
4656 | +failed_mount_wq: |
4657 | ext4_release_system_zone(sb); |
4658 | if (sbi->s_journal) { |
4659 | jbd2_journal_destroy(sbi->s_journal); |
4660 | @@ -3208,7 +3291,18 @@ static int ext4_commit_super(struct super_block *sb, int sync) |
4661 | clear_buffer_write_io_error(sbh); |
4662 | set_buffer_uptodate(sbh); |
4663 | } |
4664 | - es->s_wtime = cpu_to_le32(get_seconds()); |
4665 | + /* |
4666 | + * If the file system is mounted read-only, don't update the |
4667 | + * superblock write time. This avoids updating the superblock |
4668 | + * write time when we are mounting the root file system |
4669 | + * read/only but we need to replay the journal; at that point, |
4670 | + * for people who are east of GMT and who make their clock |
4671 | + * tick in localtime for Windows bug-for-bug compatibility, |
4672 | + * the clock is set in the future, and this will cause e2fsck |
4673 | + * to complain and force a full file system check. |
4674 | + */ |
4675 | + if (!(sb->s_flags & MS_RDONLY)) |
4676 | + es->s_wtime = cpu_to_le32(get_seconds()); |
4677 | es->s_kbytes_written = |
4678 | cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + |
4679 | ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - |
4680 | @@ -3333,11 +3427,13 @@ static int ext4_sync_fs(struct super_block *sb, int wait) |
4681 | { |
4682 | int ret = 0; |
4683 | tid_t target; |
4684 | + struct ext4_sb_info *sbi = EXT4_SB(sb); |
4685 | |
4686 | trace_ext4_sync_fs(sb, wait); |
4687 | - if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) { |
4688 | + flush_workqueue(sbi->dio_unwritten_wq); |
4689 | + if (jbd2_journal_start_commit(sbi->s_journal, &target)) { |
4690 | if (wait) |
4691 | - jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target); |
4692 | + jbd2_log_wait_commit(sbi->s_journal, target); |
4693 | } |
4694 | return ret; |
4695 | } |
4696 | @@ -3477,18 +3573,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) |
4697 | if (sbi->s_journal) |
4698 | ext4_mark_recovery_complete(sb, es); |
4699 | } else { |
4700 | - int ret; |
4701 | - if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb, |
4702 | - ~EXT4_FEATURE_RO_COMPAT_SUPP))) { |
4703 | - ext4_msg(sb, KERN_WARNING, "couldn't " |
4704 | - "remount RDWR because of unsupported " |
4705 | - "optional features (%x)", |
4706 | - (le32_to_cpu(sbi->s_es->s_feature_ro_compat) & |
4707 | - ~EXT4_FEATURE_RO_COMPAT_SUPP)); |
4708 | + /* Make sure we can mount this feature set readwrite */ |
4709 | + if (!ext4_feature_set_ok(sb, 0)) { |
4710 | err = -EROFS; |
4711 | goto restore_opts; |
4712 | } |
4713 | - |
4714 | /* |
4715 | * Make sure the group descriptor checksums |
4716 | * are sane. If they aren't, refuse to remount r/w. |
4717 | @@ -3624,13 +3713,11 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) |
4718 | buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; |
4719 | buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) - |
4720 | percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter); |
4721 | - ext4_free_blocks_count_set(es, buf->f_bfree); |
4722 | buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); |
4723 | if (buf->f_bfree < ext4_r_blocks_count(es)) |
4724 | buf->f_bavail = 0; |
4725 | buf->f_files = le32_to_cpu(es->s_inodes_count); |
4726 | buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); |
4727 | - es->s_free_inodes_count = cpu_to_le32(buf->f_ffree); |
4728 | buf->f_namelen = EXT4_NAME_LEN; |
4729 | fsid = le64_to_cpup((void *)es->s_uuid) ^ |
4730 | le64_to_cpup((void *)es->s_uuid + sizeof(u64)); |
4731 | diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c |
4732 | index 62b31c2..0257019 100644 |
4733 | --- a/fs/ext4/xattr.c |
4734 | +++ b/fs/ext4/xattr.c |
4735 | @@ -810,12 +810,23 @@ inserted: |
4736 | get_bh(new_bh); |
4737 | } else { |
4738 | /* We need to allocate a new block */ |
4739 | - ext4_fsblk_t goal = ext4_group_first_block_no(sb, |
4740 | + ext4_fsblk_t goal, block; |
4741 | + |
4742 | + goal = ext4_group_first_block_no(sb, |
4743 | EXT4_I(inode)->i_block_group); |
4744 | - ext4_fsblk_t block = ext4_new_meta_blocks(handle, inode, |
4745 | + |
4746 | + /* non-extent files can't have physical blocks past 2^32 */ |
4747 | + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) |
4748 | + goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; |
4749 | + |
4750 | + block = ext4_new_meta_blocks(handle, inode, |
4751 | goal, NULL, &error); |
4752 | if (error) |
4753 | goto cleanup; |
4754 | + |
4755 | + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) |
4756 | + BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS); |
4757 | + |
4758 | ea_idebug(inode, "creating block %d", block); |
4759 | |
4760 | new_bh = sb_getblk(sb, block); |
4761 | @@ -977,6 +988,10 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, |
4762 | if (error) |
4763 | goto cleanup; |
4764 | |
4765 | + error = ext4_journal_get_write_access(handle, is.iloc.bh); |
4766 | + if (error) |
4767 | + goto cleanup; |
4768 | + |
4769 | if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) { |
4770 | struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc); |
4771 | memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); |
4772 | @@ -1002,9 +1017,6 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, |
4773 | if (flags & XATTR_CREATE) |
4774 | goto cleanup; |
4775 | } |
4776 | - error = ext4_journal_get_write_access(handle, is.iloc.bh); |
4777 | - if (error) |
4778 | - goto cleanup; |
4779 | if (!value) { |
4780 | if (!is.s.not_found) |
4781 | error = ext4_xattr_ibody_set(handle, inode, &i, &is); |
4782 | diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c |
4783 | index 7b4088b..8cf902a 100644 |
4784 | --- a/fs/jbd2/commit.c |
4785 | +++ b/fs/jbd2/commit.c |
4786 | @@ -636,6 +636,10 @@ void jbd2_journal_commit_transaction(journal_t *journal) |
4787 | JBUFFER_TRACE(jh, "ph3: write metadata"); |
4788 | flags = jbd2_journal_write_metadata_buffer(commit_transaction, |
4789 | jh, &new_jh, blocknr); |
4790 | + if (flags < 0) { |
4791 | + jbd2_journal_abort(journal, flags); |
4792 | + continue; |
4793 | + } |
4794 | set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); |
4795 | wbuf[bufs++] = jh2bh(new_jh); |
4796 | |
4797 | diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c |
4798 | index e378cb3..4b74149 100644 |
4799 | --- a/fs/jbd2/journal.c |
4800 | +++ b/fs/jbd2/journal.c |
4801 | @@ -78,6 +78,7 @@ EXPORT_SYMBOL(jbd2_journal_errno); |
4802 | EXPORT_SYMBOL(jbd2_journal_ack_err); |
4803 | EXPORT_SYMBOL(jbd2_journal_clear_err); |
4804 | EXPORT_SYMBOL(jbd2_log_wait_commit); |
4805 | +EXPORT_SYMBOL(jbd2_log_start_commit); |
4806 | EXPORT_SYMBOL(jbd2_journal_start_commit); |
4807 | EXPORT_SYMBOL(jbd2_journal_force_commit_nested); |
4808 | EXPORT_SYMBOL(jbd2_journal_wipe); |
4809 | @@ -361,6 +362,10 @@ repeat: |
4810 | |
4811 | jbd_unlock_bh_state(bh_in); |
4812 | tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS); |
4813 | + if (!tmp) { |
4814 | + jbd2_journal_put_journal_head(new_jh); |
4815 | + return -ENOMEM; |
4816 | + } |
4817 | jbd_lock_bh_state(bh_in); |
4818 | if (jh_in->b_frozen_data) { |
4819 | jbd2_free(tmp, bh_in->b_size); |
4820 | @@ -1187,6 +1192,12 @@ static int journal_reset(journal_t *journal) |
4821 | |
4822 | first = be32_to_cpu(sb->s_first); |
4823 | last = be32_to_cpu(sb->s_maxlen); |
4824 | + if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) { |
4825 | + printk(KERN_ERR "JBD: Journal too short (blocks %llu-%llu).\n", |
4826 | + first, last); |
4827 | + journal_fail_superblock(journal); |
4828 | + return -EINVAL; |
4829 | + } |
4830 | |
4831 | journal->j_first = first; |
4832 | journal->j_last = last; |
4833 | diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c |
4834 | index 6213ac7..a051270 100644 |
4835 | --- a/fs/jbd2/transaction.c |
4836 | +++ b/fs/jbd2/transaction.c |
4837 | @@ -57,7 +57,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) |
4838 | INIT_LIST_HEAD(&transaction->t_private_list); |
4839 | |
4840 | /* Set up the commit timer for the new transaction. */ |
4841 | - journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); |
4842 | + journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires); |
4843 | add_timer(&journal->j_commit_timer); |
4844 | |
4845 | J_ASSERT(journal->j_running_transaction == NULL); |
4846 | @@ -238,6 +238,8 @@ repeat_locked: |
4847 | __jbd2_log_space_left(journal)); |
4848 | spin_unlock(&transaction->t_handle_lock); |
4849 | spin_unlock(&journal->j_state_lock); |
4850 | + |
4851 | + lock_map_acquire(&handle->h_lockdep_map); |
4852 | out: |
4853 | if (unlikely(new_transaction)) /* It's usually NULL */ |
4854 | kfree(new_transaction); |
4855 | @@ -303,8 +305,6 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks) |
4856 | handle = ERR_PTR(err); |
4857 | goto out; |
4858 | } |
4859 | - |
4860 | - lock_map_acquire(&handle->h_lockdep_map); |
4861 | out: |
4862 | return handle; |
4863 | } |
4864 | @@ -426,6 +426,7 @@ int jbd2_journal_restart(handle_t *handle, int nblocks) |
4865 | __jbd2_log_start_commit(journal, transaction->t_tid); |
4866 | spin_unlock(&journal->j_state_lock); |
4867 | |
4868 | + lock_map_release(&handle->h_lockdep_map); |
4869 | handle->h_buffer_credits = nblocks; |
4870 | ret = start_this_handle(journal, handle); |
4871 | return ret; |
4872 | diff --git a/include/linux/sched.h b/include/linux/sched.h |
4873 | index 0f1ea4a..d3e910b 100644 |
4874 | --- a/include/linux/sched.h |
4875 | +++ b/include/linux/sched.h |
4876 | @@ -1999,11 +1999,18 @@ static inline int is_si_special(const struct siginfo *info) |
4877 | return info <= SEND_SIG_FORCED; |
4878 | } |
4879 | |
4880 | -/* True if we are on the alternate signal stack. */ |
4881 | - |
4882 | +/* |
4883 | + * True if we are on the alternate signal stack. |
4884 | + */ |
4885 | static inline int on_sig_stack(unsigned long sp) |
4886 | { |
4887 | - return (sp - current->sas_ss_sp < current->sas_ss_size); |
4888 | +#ifdef CONFIG_STACK_GROWSUP |
4889 | + return sp >= current->sas_ss_sp && |
4890 | + sp - current->sas_ss_sp < current->sas_ss_size; |
4891 | +#else |
4892 | + return sp > current->sas_ss_sp && |
4893 | + sp - current->sas_ss_sp <= current->sas_ss_size; |
4894 | +#endif |
4895 | } |
4896 | |
4897 | static inline int sas_ss_flags(unsigned long sp) |
4898 | diff --git a/include/scsi/osd_protocol.h b/include/scsi/osd_protocol.h |
4899 | index 2cc8e8b..6856612 100644 |
4900 | --- a/include/scsi/osd_protocol.h |
4901 | +++ b/include/scsi/osd_protocol.h |
4902 | @@ -17,6 +17,7 @@ |
4903 | #define __OSD_PROTOCOL_H__ |
4904 | |
4905 | #include <linux/types.h> |
4906 | +#include <linux/kernel.h> |
4907 | #include <asm/unaligned.h> |
4908 | #include <scsi/scsi.h> |
4909 | |
4910 | diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h |
4911 | index b62a097..6cc72e2 100644 |
4912 | --- a/include/scsi/scsi_host.h |
4913 | +++ b/include/scsi/scsi_host.h |
4914 | @@ -677,6 +677,12 @@ struct Scsi_Host { |
4915 | void *shost_data; |
4916 | |
4917 | /* |
4918 | + * Points to the physical bus device we'd use to do DMA |
4919 | + * Needed just in case we have virtual hosts. |
4920 | + */ |
4921 | + struct device *dma_dev; |
4922 | + |
4923 | + /* |
4924 | * We should ensure that this is aligned, both for better performance |
4925 | * and also because some compilers (m68k) don't automatically force |
4926 | * alignment to a long boundary. |
4927 | @@ -720,7 +726,9 @@ extern int scsi_queue_work(struct Scsi_Host *, struct work_struct *); |
4928 | extern void scsi_flush_work(struct Scsi_Host *); |
4929 | |
4930 | extern struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *, int); |
4931 | -extern int __must_check scsi_add_host(struct Scsi_Host *, struct device *); |
4932 | +extern int __must_check scsi_add_host_with_dma(struct Scsi_Host *, |
4933 | + struct device *, |
4934 | + struct device *); |
4935 | extern void scsi_scan_host(struct Scsi_Host *); |
4936 | extern void scsi_rescan_device(struct device *); |
4937 | extern void scsi_remove_host(struct Scsi_Host *); |
4938 | @@ -731,6 +739,12 @@ extern const char *scsi_host_state_name(enum scsi_host_state); |
4939 | |
4940 | extern u64 scsi_calculate_bounce_limit(struct Scsi_Host *); |
4941 | |
4942 | +static inline int __must_check scsi_add_host(struct Scsi_Host *host, |
4943 | + struct device *dev) |
4944 | +{ |
4945 | + return scsi_add_host_with_dma(host, dev, dev); |
4946 | +} |
4947 | + |
4948 | static inline struct device *scsi_get_device(struct Scsi_Host *shost) |
4949 | { |
4950 | return shost->shost_gendev.parent; |
4951 | diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h |
4952 | index 7d8b5bc..824979e 100644 |
4953 | --- a/include/trace/events/ext4.h |
4954 | +++ b/include/trace/events/ext4.h |
4955 | @@ -5,10 +5,12 @@ |
4956 | #define _TRACE_EXT4_H |
4957 | |
4958 | #include <linux/writeback.h> |
4959 | -#include "../../../fs/ext4/ext4.h" |
4960 | -#include "../../../fs/ext4/mballoc.h" |
4961 | #include <linux/tracepoint.h> |
4962 | |
4963 | +struct ext4_allocation_context; |
4964 | +struct ext4_allocation_request; |
4965 | +struct ext4_prealloc_space; |
4966 | + |
4967 | TRACE_EVENT(ext4_free_inode, |
4968 | TP_PROTO(struct inode *inode), |
4969 | |
4970 | @@ -229,6 +231,7 @@ TRACE_EVENT(ext4_da_writepages, |
4971 | __field( char, for_reclaim ) |
4972 | __field( char, for_writepages ) |
4973 | __field( char, range_cyclic ) |
4974 | + __field( pgoff_t, writeback_index ) |
4975 | ), |
4976 | |
4977 | TP_fast_assign( |
4978 | @@ -243,14 +246,51 @@ TRACE_EVENT(ext4_da_writepages, |
4979 | __entry->for_reclaim = wbc->for_reclaim; |
4980 | __entry->for_writepages = wbc->for_writepages; |
4981 | __entry->range_cyclic = wbc->range_cyclic; |
4982 | + __entry->writeback_index = inode->i_mapping->writeback_index; |
4983 | ), |
4984 | |
4985 | - TP_printk("dev %s ino %lu nr_t_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d for_writepages %d range_cyclic %d", |
4986 | - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->nr_to_write, |
4987 | + TP_printk("dev %s ino %lu nr_to_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d for_writepages %d range_cyclic %d writeback_index %lu", |
4988 | + jbd2_dev_to_name(__entry->dev), |
4989 | + (unsigned long) __entry->ino, __entry->nr_to_write, |
4990 | __entry->pages_skipped, __entry->range_start, |
4991 | __entry->range_end, __entry->nonblocking, |
4992 | __entry->for_kupdate, __entry->for_reclaim, |
4993 | - __entry->for_writepages, __entry->range_cyclic) |
4994 | + __entry->for_writepages, __entry->range_cyclic, |
4995 | + (unsigned long) __entry->writeback_index) |
4996 | +); |
4997 | + |
4998 | +TRACE_EVENT(ext4_da_write_pages, |
4999 | + TP_PROTO(struct inode *inode, struct mpage_da_data *mpd), |
5000 | + |
5001 | + TP_ARGS(inode, mpd), |
5002 | + |
5003 | + TP_STRUCT__entry( |
5004 | + __field( dev_t, dev ) |
5005 | + __field( ino_t, ino ) |
5006 | + __field( __u64, b_blocknr ) |
5007 | + __field( __u32, b_size ) |
5008 | + __field( __u32, b_state ) |
5009 | + __field( unsigned long, first_page ) |
5010 | + __field( int, io_done ) |
5011 | + __field( int, pages_written ) |
5012 | + ), |
5013 | + |
5014 | + TP_fast_assign( |
5015 | + __entry->dev = inode->i_sb->s_dev; |
5016 | + __entry->ino = inode->i_ino; |
5017 | + __entry->b_blocknr = mpd->b_blocknr; |
5018 | + __entry->b_size = mpd->b_size; |
5019 | + __entry->b_state = mpd->b_state; |
5020 | + __entry->first_page = mpd->first_page; |
5021 | + __entry->io_done = mpd->io_done; |
5022 | + __entry->pages_written = mpd->pages_written; |
5023 | + ), |
5024 | + |
5025 | + TP_printk("dev %s ino %lu b_blocknr %llu b_size %u b_state 0x%04x first_page %lu io_done %d pages_written %d", |
5026 | + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, |
5027 | + __entry->b_blocknr, __entry->b_size, |
5028 | + __entry->b_state, __entry->first_page, |
5029 | + __entry->io_done, __entry->pages_written) |
5030 | ); |
5031 | |
5032 | TRACE_EVENT(ext4_da_writepages_result, |
5033 | @@ -268,6 +308,7 @@ TRACE_EVENT(ext4_da_writepages_result, |
5034 | __field( char, encountered_congestion ) |
5035 | __field( char, more_io ) |
5036 | __field( char, no_nrwrite_index_update ) |
5037 | + __field( pgoff_t, writeback_index ) |
5038 | ), |
5039 | |
5040 | TP_fast_assign( |
5041 | @@ -279,13 +320,16 @@ TRACE_EVENT(ext4_da_writepages_result, |
5042 | __entry->encountered_congestion = wbc->encountered_congestion; |
5043 | __entry->more_io = wbc->more_io; |
5044 | __entry->no_nrwrite_index_update = wbc->no_nrwrite_index_update; |
5045 | + __entry->writeback_index = inode->i_mapping->writeback_index; |
5046 | ), |
5047 | |
5048 | - TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld congestion %d more_io %d no_nrwrite_index_update %d", |
5049 | - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->ret, |
5050 | + TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld congestion %d more_io %d no_nrwrite_index_update %d writeback_index %lu", |
5051 | + jbd2_dev_to_name(__entry->dev), |
5052 | + (unsigned long) __entry->ino, __entry->ret, |
5053 | __entry->pages_written, __entry->pages_skipped, |
5054 | __entry->encountered_congestion, __entry->more_io, |
5055 | - __entry->no_nrwrite_index_update) |
5056 | + __entry->no_nrwrite_index_update, |
5057 | + (unsigned long) __entry->writeback_index) |
5058 | ); |
5059 | |
5060 | TRACE_EVENT(ext4_da_write_begin, |